diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/added_tokens.json b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/config.json b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/config.json new file mode 100644 index 0000000000000000000000000000000000000000..374e3fe25bd186c707e2c209262e18d97ef2223c --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/config.json @@ -0,0 +1,195 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "image_aspect_ratio": "pad", + "init_weight": false, + "initializer_range": 0.02, + "intermediate_size": 8192, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev30", + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.001, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/generation_config.json b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/latest b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/latest new file mode 100644 index 0000000000000000000000000000000000000000..7487e9ce9b3fafd51a933b147ce761f1d3c7fd06 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/latest @@ -0,0 +1 @@ +global_step1664 \ No newline at end of file diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model-00001-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..109a4e6adfb72b86557e6cbed765d39a6cbc877a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bf06e300710da107cefdf173f5dba55b9932f6fb7e7a1fcc94584a041e17540 +size 4972489328 diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model-00002-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e48cfe924605301ad5e75c6b9607adafe881d14d --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15586e0cf0a14c49d029b480b3eb97a242b1352d44c74b99897dbb04777facef +size 4985754844 diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model-00003-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..883d9d8b45767b98cc34ca09259ac37a163cbb44 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:937144bd9bfa39f775a52e5982d5c82a27573a661081da2310f49d2dcba01f05 +size 248943552 diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model.safetensors.index.json b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..a900cb68b39c8fe6eedc011196340060a750c9c8 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/model.safetensors.index.json @@ -0,0 +1,1033 @@ +{ + "metadata": { + "total_size": 10207040684 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_0.pth b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b3594867c86978428e5e5bbdcae43068ca572a4 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4fbc0c3ab23cb6a2bc8d12881f8f60b88c9d707345d9e650d367af127e750b9 +size 15024 diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_1.pth b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d2963e51043b85c2837399b5ae8212b62ea2cf9 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca439560b72fdfdeca80538c1d7fd13a79cb40a4113abdd40bed2ee18c276f6e +size 15024 diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_2.pth b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..51d113c2fd99f3ab9ae0a827bc55e4424d99d271 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37d8a08a60f1e45bbe669ccc291b732178afde092185a2275107087813030b6c +size 15024 diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_3.pth b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..2e37d90e8d2dbd6c0377326df7ded780972f9ced --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5990ef8a1c2a5a5daffd1a6f0a3bfedabc1eacf2b1a98ac77694877c0faf73e4 +size 15024 diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/special_tokens_map.json b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/tokenizer.model b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/tokenizer_config.json b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/trainer_state.json b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..487db48bad8059a9aeaaeeecb8ba3f9db1c54259 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/trainer_state.json @@ -0,0 +1,24993 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.20008417002344736, + "eval_steps": 500, + "global_step": 1664, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.04592296, + "auxiliary_loss_mlp": 0.0257779, + "balance_loss_clip": 2.47145319, + "balance_loss_mlp": 2.09008121, + "epoch": 0.00012024289063909097, + "flos": 24932483919360.0, + "grad_norm": 40.31813729776111, + "language_loss": 2.5798173, + "learning_rate": 0.0, + "loss": 1.90189219, + "num_input_tokens_seen": 20375, + "step": 1, + "time_per_iteration": 13.679930686950684 + }, + { + "auxiliary_loss_clip": 0.03096922, + "auxiliary_loss_mlp": 0.01644309, + "balance_loss_clip": 1.6545341, + "balance_loss_mlp": 1.32025051, + "epoch": 0.00024048578127818193, + "flos": 30664624377600.0, + "grad_norm": 55.02365207892332, + "language_loss": 1.89100933, + "learning_rate": 5.021476677069823e-07, + "loss": 1.93842173, + "num_input_tokens_seen": 39035, + "step": 2, + "time_per_iteration": 2.474804162979126 + }, + { + "auxiliary_loss_clip": 0.03069635, + "auxiliary_loss_mlp": 0.01670008, + "balance_loss_clip": 1.6526866, + "balance_loss_mlp": 1.34823871, + "epoch": 0.0003607286719172729, + "flos": 19026227969280.0, + "grad_norm": 40.55690044198473, + "language_loss": 1.61390448, + "learning_rate": 7.958852231401551e-07, + "loss": 1.6613009, + "num_input_tokens_seen": 57600, + "step": 3, + "time_per_iteration": 2.318293333053589 + }, + { + "auxiliary_loss_clip": 0.03077831, + "auxiliary_loss_mlp": 0.01736865, + "balance_loss_clip": 1.65183318, + "balance_loss_mlp": 1.41356897, + "epoch": 0.00048097156255636386, + "flos": 19316314206720.0, + "grad_norm": 36.535469138224435, + "language_loss": 1.64225054, + "learning_rate": 1.0042953354139647e-06, + "loss": 1.69039762, + "num_input_tokens_seen": 76465, + "step": 4, + "time_per_iteration": 2.3873391151428223 + }, + { + "auxiliary_loss_clip": 0.03073988, + "auxiliary_loss_mlp": 0.0166799, + "balance_loss_clip": 1.65372992, + "balance_loss_mlp": 1.35098875, + "epoch": 0.0006012144531954548, + "flos": 13991264893440.0, + "grad_norm": 55.26641045100814, + "language_loss": 1.93399584, + "learning_rate": 1.1659507774310057e-06, + "loss": 1.98141563, + "num_input_tokens_seen": 94350, + "step": 5, + "time_per_iteration": 2.5851845741271973 + }, + { + "auxiliary_loss_clip": 0.03085224, + "auxiliary_loss_mlp": 0.01684223, + "balance_loss_clip": 1.65821815, + "balance_loss_mlp": 1.36188066, + "epoch": 0.0007214573438345458, + "flos": 23148988225920.0, + "grad_norm": 47.392088372963684, + "language_loss": 1.60929465, + "learning_rate": 1.2980328908471373e-06, + "loss": 1.6569891, + "num_input_tokens_seen": 114595, + "step": 6, + "time_per_iteration": 2.6140356063842773 + }, + { + "auxiliary_loss_clip": 0.03138453, + "auxiliary_loss_mlp": 0.01611064, + "balance_loss_clip": 1.79509974, + "balance_loss_mlp": 1.40964794, + "epoch": 0.0008417002344736367, + "flos": 67663246170240.0, + "grad_norm": 4.608918149065674, + "language_loss": 0.81463742, + "learning_rate": 1.4097067265369432e-06, + "loss": 0.86213261, + "num_input_tokens_seen": 179590, + "step": 7, + "time_per_iteration": 3.0851175785064697 + }, + { + "auxiliary_loss_clip": 0.03050037, + "auxiliary_loss_mlp": 0.01707597, + "balance_loss_clip": 1.64213991, + "balance_loss_mlp": 1.39479208, + "epoch": 0.0009619431251127277, + "flos": 21281381504640.0, + "grad_norm": 40.91424104955932, + "language_loss": 1.58874691, + "learning_rate": 1.506443003120947e-06, + "loss": 1.63632321, + "num_input_tokens_seen": 195090, + "step": 8, + "time_per_iteration": 2.5903615951538086 + }, + { + "auxiliary_loss_clip": 0.03056585, + "auxiliary_loss_mlp": 0.01697593, + "balance_loss_clip": 1.65001082, + "balance_loss_mlp": 1.37925625, + "epoch": 0.0010821860157518186, + "flos": 23331342597120.0, + "grad_norm": 20.25020460704316, + "language_loss": 1.47886133, + "learning_rate": 1.5917704462803102e-06, + "loss": 1.52640295, + "num_input_tokens_seen": 211635, + "step": 9, + "time_per_iteration": 2.6365909576416016 + }, + { + "auxiliary_loss_clip": 0.03042249, + "auxiliary_loss_mlp": 0.01656965, + "balance_loss_clip": 1.64865518, + "balance_loss_mlp": 1.34015429, + "epoch": 0.0012024289063909096, + "flos": 17010166337280.0, + "grad_norm": 13.271652495451278, + "language_loss": 1.52986228, + "learning_rate": 1.6680984451379884e-06, + "loss": 1.57685447, + "num_input_tokens_seen": 224705, + "step": 10, + "time_per_iteration": 2.590702533721924 + }, + { + "auxiliary_loss_clip": 0.03050442, + "auxiliary_loss_mlp": 0.01681698, + "balance_loss_clip": 1.64481628, + "balance_loss_mlp": 1.37690389, + "epoch": 0.0013226717970300007, + "flos": 21288133261440.0, + "grad_norm": 14.775333800858181, + "language_loss": 1.32634711, + "learning_rate": 1.7371455188905097e-06, + "loss": 1.37366843, + "num_input_tokens_seen": 244635, + "step": 11, + "time_per_iteration": 2.6825337409973145 + }, + { + "auxiliary_loss_clip": 0.03064075, + "auxiliary_loss_mlp": 0.01704508, + "balance_loss_clip": 1.64960694, + "balance_loss_mlp": 1.37663448, + "epoch": 0.0014429146876690916, + "flos": 27237884935680.0, + "grad_norm": 10.930196651173963, + "language_loss": 1.25496101, + "learning_rate": 1.8001805585541196e-06, + "loss": 1.30264676, + "num_input_tokens_seen": 265765, + "step": 12, + "time_per_iteration": 2.7420575618743896 + }, + { + "auxiliary_loss_clip": 0.03044598, + "auxiliary_loss_mlp": 0.01665624, + "balance_loss_clip": 1.64174843, + "balance_loss_mlp": 1.3581593, + "epoch": 0.0015631575783081825, + "flos": 19062174504960.0, + "grad_norm": 6.734022188218813, + "language_loss": 1.29245877, + "learning_rate": 1.8581671739548328e-06, + "loss": 1.33956099, + "num_input_tokens_seen": 283500, + "step": 13, + "time_per_iteration": 2.756082773208618 + }, + { + "auxiliary_loss_clip": 0.03039485, + "auxiliary_loss_mlp": 0.0161854, + "balance_loss_clip": 1.63809609, + "balance_loss_mlp": 1.30573452, + "epoch": 0.0016834004689472734, + "flos": 48139473985920.0, + "grad_norm": 6.218154608776597, + "language_loss": 1.13518643, + "learning_rate": 1.9118543942439254e-06, + "loss": 1.18176675, + "num_input_tokens_seen": 305685, + "step": 14, + "time_per_iteration": 2.8718252182006836 + }, + { + "auxiliary_loss_clip": 0.03015634, + "auxiliary_loss_mlp": 0.0167643, + "balance_loss_clip": 1.6327852, + "balance_loss_mlp": 1.36000097, + "epoch": 0.0018036433595863645, + "flos": 34970026314240.0, + "grad_norm": 5.585264676798799, + "language_loss": 1.12792444, + "learning_rate": 1.961836000571161e-06, + "loss": 1.1748451, + "num_input_tokens_seen": 327340, + "step": 15, + "time_per_iteration": 2.7548837661743164 + }, + { + "auxiliary_loss_clip": 0.03024889, + "auxiliary_loss_mlp": 0.01440848, + "balance_loss_clip": 1.76858163, + "balance_loss_mlp": 1.25087607, + "epoch": 0.0019238862502254555, + "flos": 59768284440960.0, + "grad_norm": 3.7812827511611795, + "language_loss": 0.64721709, + "learning_rate": 2.0085906708279293e-06, + "loss": 0.69187444, + "num_input_tokens_seen": 382710, + "step": 16, + "time_per_iteration": 5.555383920669556 + }, + { + "auxiliary_loss_clip": 0.03002207, + "auxiliary_loss_mlp": 0.01636336, + "balance_loss_clip": 1.63715792, + "balance_loss_mlp": 1.325629, + "epoch": 0.0020441291408645466, + "flos": 20814543417600.0, + "grad_norm": 5.149524066240761, + "language_loss": 1.16015792, + "learning_rate": 2.0525099325728135e-06, + "loss": 1.20654321, + "num_input_tokens_seen": 400890, + "step": 17, + "time_per_iteration": 2.6887638568878174 + }, + { + "auxiliary_loss_clip": 0.029913, + "auxiliary_loss_mlp": 0.01411446, + "balance_loss_clip": 1.76141357, + "balance_loss_mlp": 1.22376311, + "epoch": 0.0021643720315036373, + "flos": 63857001582720.0, + "grad_norm": 3.5249476400751183, + "language_loss": 0.72181082, + "learning_rate": 2.0939181139872922e-06, + "loss": 0.76583827, + "num_input_tokens_seen": 462605, + "step": 18, + "time_per_iteration": 3.1064062118530273 + }, + { + "auxiliary_loss_clip": 0.02970689, + "auxiliary_loss_mlp": 0.01583766, + "balance_loss_clip": 1.63021719, + "balance_loss_mlp": 1.28278613, + "epoch": 0.0022846149221427284, + "flos": 31284981192960.0, + "grad_norm": 5.068147463859064, + "language_loss": 1.01411319, + "learning_rate": 2.1330868934640175e-06, + "loss": 1.05965781, + "num_input_tokens_seen": 483280, + "step": 19, + "time_per_iteration": 2.712682008743286 + }, + { + "auxiliary_loss_clip": 0.02944429, + "auxiliary_loss_mlp": 0.01372943, + "balance_loss_clip": 1.75237823, + "balance_loss_mlp": 1.19060075, + "epoch": 0.002404857812781819, + "flos": 51083648161920.0, + "grad_norm": 3.5544246215179043, + "language_loss": 0.76397097, + "learning_rate": 2.170246112844971e-06, + "loss": 0.8071447, + "num_input_tokens_seen": 537620, + "step": 20, + "time_per_iteration": 2.890746593475342 + }, + { + "auxiliary_loss_clip": 0.02915263, + "auxiliary_loss_mlp": 0.01538439, + "balance_loss_clip": 1.61829948, + "balance_loss_mlp": 1.2389853, + "epoch": 0.0025251007034209102, + "flos": 15815347309440.0, + "grad_norm": 4.421502737527982, + "language_loss": 1.01509464, + "learning_rate": 2.2055919496770983e-06, + "loss": 1.05963159, + "num_input_tokens_seen": 555760, + "step": 21, + "time_per_iteration": 2.681652069091797 + }, + { + "auxiliary_loss_clip": 0.0290131, + "auxiliary_loss_mlp": 0.01522155, + "balance_loss_clip": 1.61400509, + "balance_loss_mlp": 1.22479916, + "epoch": 0.0026453435940600014, + "flos": 37851857458560.0, + "grad_norm": 7.382501605944554, + "language_loss": 0.89493495, + "learning_rate": 2.2392931865974923e-06, + "loss": 0.93916965, + "num_input_tokens_seen": 578450, + "step": 22, + "time_per_iteration": 2.757664203643799 + }, + { + "auxiliary_loss_clip": 0.02862635, + "auxiliary_loss_mlp": 0.01506351, + "balance_loss_clip": 1.60505772, + "balance_loss_mlp": 1.21052122, + "epoch": 0.002765586484699092, + "flos": 21141976821120.0, + "grad_norm": 4.682417881099616, + "language_loss": 1.02103889, + "learning_rate": 2.271496085962064e-06, + "loss": 1.06472874, + "num_input_tokens_seen": 596145, + "step": 23, + "time_per_iteration": 2.6176741123199463 + }, + { + "auxiliary_loss_clip": 0.02835416, + "auxiliary_loss_mlp": 0.01482041, + "balance_loss_clip": 1.59226811, + "balance_loss_mlp": 1.18926311, + "epoch": 0.002885829375338183, + "flos": 20667381396480.0, + "grad_norm": 3.43666376788469, + "language_loss": 1.02808046, + "learning_rate": 2.3023282262611022e-06, + "loss": 1.07125497, + "num_input_tokens_seen": 614920, + "step": 24, + "time_per_iteration": 2.6299045085906982 + }, + { + "auxiliary_loss_clip": 0.02845925, + "auxiliary_loss_mlp": 0.01483435, + "balance_loss_clip": 1.6000222, + "balance_loss_mlp": 1.19904912, + "epoch": 0.003006072265977274, + "flos": 34823869873920.0, + "grad_norm": 3.0001014876747876, + "language_loss": 0.92529213, + "learning_rate": 2.3319015548620114e-06, + "loss": 0.96858567, + "num_input_tokens_seen": 636060, + "step": 25, + "time_per_iteration": 2.7235043048858643 + }, + { + "auxiliary_loss_clip": 0.02803271, + "auxiliary_loss_mlp": 0.01451435, + "balance_loss_clip": 1.5875833, + "balance_loss_mlp": 1.1754415, + "epoch": 0.003126315156616365, + "flos": 24422021118720.0, + "grad_norm": 2.3569558820490357, + "language_loss": 0.93086529, + "learning_rate": 2.3603148416618152e-06, + "loss": 0.97341239, + "num_input_tokens_seen": 655575, + "step": 26, + "time_per_iteration": 2.6431405544281006 + }, + { + "auxiliary_loss_clip": 0.02809498, + "auxiliary_loss_mlp": 0.01435626, + "balance_loss_clip": 1.58839178, + "balance_loss_mlp": 1.16268444, + "epoch": 0.003246558047255456, + "flos": 23622326674560.0, + "grad_norm": 2.250051670986053, + "language_loss": 1.00897861, + "learning_rate": 2.3876556694204647e-06, + "loss": 1.05142975, + "num_input_tokens_seen": 675730, + "step": 27, + "time_per_iteration": 2.643726348876953 + }, + { + "auxiliary_loss_clip": 0.0277008, + "auxiliary_loss_mlp": 0.01438905, + "balance_loss_clip": 1.58151054, + "balance_loss_mlp": 1.15299392, + "epoch": 0.003366800937894547, + "flos": 17820275725440.0, + "grad_norm": 3.5125502491615017, + "language_loss": 0.90615028, + "learning_rate": 2.414002061950908e-06, + "loss": 0.94824016, + "num_input_tokens_seen": 694605, + "step": 28, + "time_per_iteration": 2.6073849201202393 + }, + { + "auxiliary_loss_clip": 0.02753506, + "auxiliary_loss_mlp": 0.01412083, + "balance_loss_clip": 1.5741179, + "balance_loss_mlp": 1.14429164, + "epoch": 0.003487043828533638, + "flos": 24426115269120.0, + "grad_norm": 2.229407356657713, + "language_loss": 0.99840701, + "learning_rate": 2.4394238264681557e-06, + "loss": 1.04006302, + "num_input_tokens_seen": 714340, + "step": 29, + "time_per_iteration": 2.657010555267334 + }, + { + "auxiliary_loss_clip": 0.02728734, + "auxiliary_loss_mlp": 0.01410666, + "balance_loss_clip": 1.56707942, + "balance_loss_mlp": 1.13734317, + "epoch": 0.003607286719172729, + "flos": 26140311002880.0, + "grad_norm": 2.1013063073254483, + "language_loss": 0.99750847, + "learning_rate": 2.4639836682781433e-06, + "loss": 1.03890252, + "num_input_tokens_seen": 734470, + "step": 30, + "time_per_iteration": 2.657283067703247 + }, + { + "auxiliary_loss_clip": 0.0274252, + "auxiliary_loss_mlp": 0.01399216, + "balance_loss_clip": 1.58084083, + "balance_loss_mlp": 1.11883581, + "epoch": 0.00372752960981182, + "flos": 20593082113920.0, + "grad_norm": 2.5290194477593095, + "language_loss": 1.00179088, + "learning_rate": 2.487738122623307e-06, + "loss": 1.04320836, + "num_input_tokens_seen": 753380, + "step": 31, + "time_per_iteration": 2.677032470703125 + }, + { + "auxiliary_loss_clip": 0.02700365, + "auxiliary_loss_mlp": 0.01377165, + "balance_loss_clip": 1.5633738, + "balance_loss_mlp": 1.10460484, + "epoch": 0.003847772500450911, + "flos": 22674608282880.0, + "grad_norm": 2.6587190587413474, + "language_loss": 0.98849338, + "learning_rate": 2.510738338534912e-06, + "loss": 1.02926874, + "num_input_tokens_seen": 772105, + "step": 32, + "time_per_iteration": 2.690436363220215 + }, + { + "auxiliary_loss_clip": 0.02563486, + "auxiliary_loss_mlp": 0.01361417, + "balance_loss_clip": 1.52081466, + "balance_loss_mlp": 1.09419751, + "epoch": 0.003968015391090002, + "flos": 17967796882560.0, + "grad_norm": 2.3748496192044164, + "language_loss": 1.02413309, + "learning_rate": 2.5330307420306648e-06, + "loss": 1.06338215, + "num_input_tokens_seen": 788955, + "step": 33, + "time_per_iteration": 2.726891279220581 + }, + { + "auxiliary_loss_clip": 0.02522708, + "auxiliary_loss_mlp": 0.01344607, + "balance_loss_clip": 1.51539016, + "balance_loss_mlp": 1.10046613, + "epoch": 0.004088258281729093, + "flos": 27304103658240.0, + "grad_norm": 2.5944786148321604, + "language_loss": 0.88078773, + "learning_rate": 2.554657600279796e-06, + "loss": 0.91946083, + "num_input_tokens_seen": 810230, + "step": 34, + "time_per_iteration": 2.8231966495513916 + }, + { + "auxiliary_loss_clip": 0.02503904, + "auxiliary_loss_mlp": 0.01325456, + "balance_loss_clip": 1.5071764, + "balance_loss_mlp": 1.07349491, + "epoch": 0.004208501172368184, + "flos": 23258587599360.0, + "grad_norm": 2.6932436317700867, + "language_loss": 1.03471887, + "learning_rate": 2.5756575039679493e-06, + "loss": 1.07301247, + "num_input_tokens_seen": 829780, + "step": 35, + "time_per_iteration": 2.6862430572509766 + }, + { + "auxiliary_loss_clip": 0.0246641, + "auxiliary_loss_mlp": 0.01352755, + "balance_loss_clip": 1.49664128, + "balance_loss_mlp": 1.1013664, + "epoch": 0.0043287440630072746, + "flos": 17312104062720.0, + "grad_norm": 3.806337713054072, + "language_loss": 0.94963145, + "learning_rate": 2.5960657816942747e-06, + "loss": 0.98782313, + "num_input_tokens_seen": 848695, + "step": 36, + "time_per_iteration": 2.656785011291504 + }, + { + "auxiliary_loss_clip": 0.0230379, + "auxiliary_loss_mlp": 0.01399004, + "balance_loss_clip": 1.57838988, + "balance_loss_mlp": 1.25709724, + "epoch": 0.004448986953646365, + "flos": 53092491160320.0, + "grad_norm": 1.3897414030459383, + "language_loss": 0.60942662, + "learning_rate": 2.6159148575788668e-06, + "loss": 0.64645457, + "num_input_tokens_seen": 906730, + "step": 37, + "time_per_iteration": 3.103044271469116 + }, + { + "auxiliary_loss_clip": 0.02417033, + "auxiliary_loss_mlp": 0.01358563, + "balance_loss_clip": 1.48632324, + "balance_loss_mlp": 1.10889125, + "epoch": 0.004569229844285457, + "flos": 13444165866240.0, + "grad_norm": 3.4692995901598414, + "language_loss": 0.98698151, + "learning_rate": 2.635234561171e-06, + "loss": 1.02473736, + "num_input_tokens_seen": 925125, + "step": 38, + "time_per_iteration": 2.6615142822265625 + }, + { + "auxiliary_loss_clip": 0.02394631, + "auxiliary_loss_mlp": 0.01325958, + "balance_loss_clip": 1.47936511, + "balance_loss_mlp": 1.09135365, + "epoch": 0.0046894727349245475, + "flos": 16209609966720.0, + "grad_norm": 2.6569661678519076, + "language_loss": 0.94213772, + "learning_rate": 2.6540523970949877e-06, + "loss": 0.97934365, + "num_input_tokens_seen": 939970, + "step": 39, + "time_per_iteration": 2.8100712299346924 + }, + { + "auxiliary_loss_clip": 0.02364583, + "auxiliary_loss_mlp": 0.01335111, + "balance_loss_clip": 1.47830486, + "balance_loss_mlp": 1.09879065, + "epoch": 0.004809715625563638, + "flos": 23914244505600.0, + "grad_norm": 2.8245373459000502, + "language_loss": 0.92652309, + "learning_rate": 2.6723937805519533e-06, + "loss": 0.96352005, + "num_input_tokens_seen": 957470, + "step": 40, + "time_per_iteration": 2.6364290714263916 + }, + { + "auxiliary_loss_clip": 0.02356107, + "auxiliary_loss_mlp": 0.01305705, + "balance_loss_clip": 1.46764278, + "balance_loss_mlp": 1.07949352, + "epoch": 0.00492995851620273, + "flos": 20773030273920.0, + "grad_norm": 2.201506207378197, + "language_loss": 0.9308815, + "learning_rate": 2.690282243737839e-06, + "loss": 0.96749961, + "num_input_tokens_seen": 976405, + "step": 41, + "time_per_iteration": 2.6587958335876465 + }, + { + "auxiliary_loss_clip": 0.02321, + "auxiliary_loss_mlp": 0.01327649, + "balance_loss_clip": 1.45625973, + "balance_loss_mlp": 1.09685957, + "epoch": 0.0050502014068418205, + "flos": 20338655103360.0, + "grad_norm": 4.313716182702165, + "language_loss": 0.99323666, + "learning_rate": 2.7077396173840807e-06, + "loss": 1.02972317, + "num_input_tokens_seen": 994690, + "step": 42, + "time_per_iteration": 3.44149112701416 + }, + { + "auxiliary_loss_clip": 0.02296594, + "auxiliary_loss_mlp": 0.01315151, + "balance_loss_clip": 1.44871891, + "balance_loss_mlp": 1.09408879, + "epoch": 0.005170444297480911, + "flos": 25994872834560.0, + "grad_norm": 2.804216307882528, + "language_loss": 0.92762637, + "learning_rate": 2.7247861909342594e-06, + "loss": 0.96374375, + "num_input_tokens_seen": 1015615, + "step": 43, + "time_per_iteration": 3.580928325653076 + }, + { + "auxiliary_loss_clip": 0.02294591, + "auxiliary_loss_mlp": 0.01312627, + "balance_loss_clip": 1.44867611, + "balance_loss_mlp": 1.09509444, + "epoch": 0.005290687188120003, + "flos": 20954055841920.0, + "grad_norm": 2.3722573958427398, + "language_loss": 0.83172256, + "learning_rate": 2.7414408543044743e-06, + "loss": 0.86779475, + "num_input_tokens_seen": 1031255, + "step": 44, + "time_per_iteration": 2.664051055908203 + }, + { + "auxiliary_loss_clip": 0.02243993, + "auxiliary_loss_mlp": 0.01334302, + "balance_loss_clip": 1.43369818, + "balance_loss_mlp": 1.11343145, + "epoch": 0.005410930078759093, + "flos": 15851401585920.0, + "grad_norm": 4.30684794071737, + "language_loss": 0.79317051, + "learning_rate": 2.7577212237113157e-06, + "loss": 0.82895339, + "num_input_tokens_seen": 1048295, + "step": 45, + "time_per_iteration": 2.611945629119873 + }, + { + "auxiliary_loss_clip": 0.02232494, + "auxiliary_loss_mlp": 0.01310957, + "balance_loss_clip": 1.42827296, + "balance_loss_mlp": 1.09523582, + "epoch": 0.005531172969398184, + "flos": 21104988791040.0, + "grad_norm": 2.2497812929330743, + "language_loss": 1.04301131, + "learning_rate": 2.7736437536690466e-06, + "loss": 1.07844567, + "num_input_tokens_seen": 1067925, + "step": 46, + "time_per_iteration": 2.6387572288513184 + }, + { + "auxiliary_loss_clip": 0.02222316, + "auxiliary_loss_mlp": 0.01278661, + "balance_loss_clip": 1.42864168, + "balance_loss_mlp": 1.0682801, + "epoch": 0.005651415860037276, + "flos": 20844887431680.0, + "grad_norm": 6.149980709588049, + "language_loss": 1.07959795, + "learning_rate": 2.789223836941131e-06, + "loss": 1.11460769, + "num_input_tokens_seen": 1088060, + "step": 47, + "time_per_iteration": 2.645055055618286 + }, + { + "auxiliary_loss_clip": 0.02187384, + "auxiliary_loss_mlp": 0.01287044, + "balance_loss_clip": 1.41626072, + "balance_loss_mlp": 1.08066869, + "epoch": 0.005771658750676366, + "flos": 13260195383040.0, + "grad_norm": 2.367606107257419, + "language_loss": 1.08816361, + "learning_rate": 2.8044758939680847e-06, + "loss": 1.12290788, + "num_input_tokens_seen": 1104130, + "step": 48, + "time_per_iteration": 2.6173622608184814 + }, + { + "auxiliary_loss_clip": 0.02163339, + "auxiliary_loss_mlp": 0.01285028, + "balance_loss_clip": 1.41632617, + "balance_loss_mlp": 1.07665014, + "epoch": 0.005891901641315457, + "flos": 24425396997120.0, + "grad_norm": 3.6202072581208973, + "language_loss": 1.02078879, + "learning_rate": 2.8194134530738863e-06, + "loss": 1.05527246, + "num_input_tokens_seen": 1122900, + "step": 49, + "time_per_iteration": 2.625859260559082 + }, + { + "auxiliary_loss_clip": 0.02155209, + "auxiliary_loss_mlp": 0.01293916, + "balance_loss_clip": 1.41042113, + "balance_loss_mlp": 1.09574211, + "epoch": 0.006012144531954548, + "flos": 23076197314560.0, + "grad_norm": 2.701049678593534, + "language_loss": 0.90105623, + "learning_rate": 2.834049222568994e-06, + "loss": 0.93554747, + "num_input_tokens_seen": 1140250, + "step": 50, + "time_per_iteration": 2.6387081146240234 + }, + { + "auxiliary_loss_clip": 0.02150442, + "auxiliary_loss_mlp": 0.01256909, + "balance_loss_clip": 1.40696573, + "balance_loss_mlp": 1.06388521, + "epoch": 0.006132387422593639, + "flos": 22528775064960.0, + "grad_norm": 1.9179468433546132, + "language_loss": 0.92596257, + "learning_rate": 2.848395155712969e-06, + "loss": 0.9600361, + "num_input_tokens_seen": 1160470, + "step": 51, + "time_per_iteration": 2.6163644790649414 + }, + { + "auxiliary_loss_clip": 0.0213526, + "auxiliary_loss_mlp": 0.01296169, + "balance_loss_clip": 1.40875244, + "balance_loss_mlp": 1.09894979, + "epoch": 0.00625263031323273, + "flos": 27628340751360.0, + "grad_norm": 2.163799800457259, + "language_loss": 0.97907317, + "learning_rate": 2.8624625093687977e-06, + "loss": 1.01338744, + "num_input_tokens_seen": 1177605, + "step": 52, + "time_per_iteration": 2.652637481689453 + }, + { + "auxiliary_loss_clip": 0.02117692, + "auxiliary_loss_mlp": 0.01264114, + "balance_loss_clip": 1.39884543, + "balance_loss_mlp": 1.07900584, + "epoch": 0.006372873203871821, + "flos": 23110671392640.0, + "grad_norm": 2.16527575809615, + "language_loss": 0.89054537, + "learning_rate": 2.876261897070029e-06, + "loss": 0.92436349, + "num_input_tokens_seen": 1197735, + "step": 53, + "time_per_iteration": 2.641178846359253 + }, + { + "auxiliary_loss_clip": 0.02114474, + "auxiliary_loss_mlp": 0.01279849, + "balance_loss_clip": 1.40211296, + "balance_loss_mlp": 1.09207094, + "epoch": 0.006493116094510912, + "flos": 22856028900480.0, + "grad_norm": 2.5986627826536113, + "language_loss": 0.92797983, + "learning_rate": 2.889803337127447e-06, + "loss": 0.96192306, + "num_input_tokens_seen": 1216335, + "step": 54, + "time_per_iteration": 2.638310194015503 + }, + { + "auxiliary_loss_clip": 0.02083782, + "auxiliary_loss_mlp": 0.01299407, + "balance_loss_clip": 1.39024699, + "balance_loss_mlp": 1.10190082, + "epoch": 0.006613358985150003, + "flos": 23071708114560.0, + "grad_norm": 3.4989901410270967, + "language_loss": 0.84574056, + "learning_rate": 2.903096296321516e-06, + "loss": 0.87957239, + "num_input_tokens_seen": 1234480, + "step": 55, + "time_per_iteration": 2.6131131649017334 + }, + { + "auxiliary_loss_clip": 0.02085264, + "auxiliary_loss_mlp": 0.01252839, + "balance_loss_clip": 1.39213204, + "balance_loss_mlp": 1.07555068, + "epoch": 0.006733601875789094, + "flos": 26537662229760.0, + "grad_norm": 2.6028676877163686, + "language_loss": 0.91590989, + "learning_rate": 2.9161497296578907e-06, + "loss": 0.94929099, + "num_input_tokens_seen": 1253870, + "step": 56, + "time_per_iteration": 2.671555280685425 + }, + { + "auxiliary_loss_clip": 0.02064449, + "auxiliary_loss_mlp": 0.01256639, + "balance_loss_clip": 1.38560629, + "balance_loss_mlp": 1.07725322, + "epoch": 0.006853844766428185, + "flos": 15523178083200.0, + "grad_norm": 2.6322986336177636, + "language_loss": 0.85943907, + "learning_rate": 2.928972116604173e-06, + "loss": 0.89264989, + "num_input_tokens_seen": 1270145, + "step": 57, + "time_per_iteration": 2.568746566772461 + }, + { + "auxiliary_loss_clip": 0.02035534, + "auxiliary_loss_mlp": 0.01236726, + "balance_loss_clip": 1.37630868, + "balance_loss_mlp": 1.06830692, + "epoch": 0.006974087657067276, + "flos": 24243760897920.0, + "grad_norm": 2.0468603200895665, + "language_loss": 1.02089727, + "learning_rate": 2.9415714941751377e-06, + "loss": 1.05361998, + "num_input_tokens_seen": 1291365, + "step": 58, + "time_per_iteration": 2.641627788543701 + }, + { + "auxiliary_loss_clip": 0.02051284, + "auxiliary_loss_mlp": 0.0126025, + "balance_loss_clip": 1.3777374, + "balance_loss_mlp": 1.09097242, + "epoch": 0.007094330547706367, + "flos": 25772513690880.0, + "grad_norm": 2.20240722833817, + "language_loss": 0.93476552, + "learning_rate": 2.9539554871897396e-06, + "loss": 0.96788085, + "num_input_tokens_seen": 1311535, + "step": 59, + "time_per_iteration": 2.663645029067993 + }, + { + "auxiliary_loss_clip": 0.02019501, + "auxiliary_loss_mlp": 0.01241901, + "balance_loss_clip": 1.36984706, + "balance_loss_mlp": 1.07691538, + "epoch": 0.007214573438345458, + "flos": 21319015979520.0, + "grad_norm": 2.0312074881518942, + "language_loss": 0.97501868, + "learning_rate": 2.9661313359851253e-06, + "loss": 1.00763273, + "num_input_tokens_seen": 1329420, + "step": 60, + "time_per_iteration": 2.6119375228881836 + }, + { + "auxiliary_loss_clip": 0.01995552, + "auxiliary_loss_mlp": 0.01240838, + "balance_loss_clip": 1.36693799, + "balance_loss_mlp": 1.08071625, + "epoch": 0.007334816328984549, + "flos": 24937088192640.0, + "grad_norm": 2.140590273233077, + "language_loss": 0.94071126, + "learning_rate": 2.978105921839922e-06, + "loss": 0.97307515, + "num_input_tokens_seen": 1349965, + "step": 61, + "time_per_iteration": 2.657789707183838 + }, + { + "auxiliary_loss_clip": 0.01982442, + "auxiliary_loss_mlp": 0.01252701, + "balance_loss_clip": 1.36371708, + "balance_loss_mlp": 1.09191155, + "epoch": 0.00745505921962364, + "flos": 18510586277760.0, + "grad_norm": 3.438169528042135, + "language_loss": 0.72305679, + "learning_rate": 2.9898857903302893e-06, + "loss": 0.75540817, + "num_input_tokens_seen": 1368915, + "step": 62, + "time_per_iteration": 2.67333722114563 + }, + { + "auxiliary_loss_clip": 0.0198204, + "auxiliary_loss_mlp": 0.01254439, + "balance_loss_clip": 1.36086059, + "balance_loss_mlp": 1.08983517, + "epoch": 0.007575302110262731, + "flos": 18477656484480.0, + "grad_norm": 2.7402681555698054, + "language_loss": 0.88157225, + "learning_rate": 3.001477172817253e-06, + "loss": 0.91393697, + "num_input_tokens_seen": 1386805, + "step": 63, + "time_per_iteration": 2.5777153968811035 + }, + { + "auxiliary_loss_clip": 0.0195827, + "auxiliary_loss_mlp": 0.01224069, + "balance_loss_clip": 1.35301375, + "balance_loss_mlp": 1.07548654, + "epoch": 0.007695545000901822, + "flos": 24973178382720.0, + "grad_norm": 2.299960072665208, + "language_loss": 0.96365809, + "learning_rate": 3.012886006241894e-06, + "loss": 0.99548143, + "num_input_tokens_seen": 1406190, + "step": 64, + "time_per_iteration": 2.6290977001190186 + }, + { + "auxiliary_loss_clip": 0.0196218, + "auxiliary_loss_mlp": 0.0122708, + "balance_loss_clip": 1.35420418, + "balance_loss_mlp": 1.07134509, + "epoch": 0.007815787891540913, + "flos": 21324223451520.0, + "grad_norm": 4.865728604069954, + "language_loss": 0.88050032, + "learning_rate": 3.0241179513858383e-06, + "loss": 0.91239297, + "num_input_tokens_seen": 1425500, + "step": 65, + "time_per_iteration": 2.589324474334717 + }, + { + "auxiliary_loss_clip": 0.01943283, + "auxiliary_loss_mlp": 0.01252511, + "balance_loss_clip": 1.34309196, + "balance_loss_mlp": 1.09000516, + "epoch": 0.007936030782180003, + "flos": 21575777374080.0, + "grad_norm": 2.1515512015928997, + "language_loss": 0.87840772, + "learning_rate": 3.035178409737647e-06, + "loss": 0.91036558, + "num_input_tokens_seen": 1442950, + "step": 66, + "time_per_iteration": 2.63244366645813 + }, + { + "auxiliary_loss_clip": 0.01922874, + "auxiliary_loss_mlp": 0.01216119, + "balance_loss_clip": 1.3370477, + "balance_loss_mlp": 1.08117437, + "epoch": 0.008056273672819095, + "flos": 20120785159680.0, + "grad_norm": 2.3534754068808015, + "language_loss": 0.88637495, + "learning_rate": 3.046072539090907e-06, + "loss": 0.9177649, + "num_input_tokens_seen": 1460915, + "step": 67, + "time_per_iteration": 2.5793426036834717 + }, + { + "auxiliary_loss_clip": 0.0191524, + "auxiliary_loss_mlp": 0.01212686, + "balance_loss_clip": 1.33555996, + "balance_loss_mlp": 1.07192361, + "epoch": 0.008176516563458186, + "flos": 18333116156160.0, + "grad_norm": 2.32333096803499, + "language_loss": 1.04707313, + "learning_rate": 3.056805267986779e-06, + "loss": 1.07835233, + "num_input_tokens_seen": 1478385, + "step": 68, + "time_per_iteration": 2.589597225189209 + }, + { + "auxiliary_loss_clip": 0.0189913, + "auxiliary_loss_mlp": 0.01217712, + "balance_loss_clip": 1.32971203, + "balance_loss_mlp": 1.08000183, + "epoch": 0.008296759454097276, + "flos": 21872076664320.0, + "grad_norm": 2.1988244051355648, + "language_loss": 0.95325446, + "learning_rate": 3.0673813091022194e-06, + "loss": 0.98442286, + "num_input_tokens_seen": 1497605, + "step": 69, + "time_per_iteration": 4.116564989089966 + }, + { + "auxiliary_loss_clip": 0.01747201, + "auxiliary_loss_mlp": 0.01204221, + "balance_loss_clip": 1.35013294, + "balance_loss_mlp": 1.1332674, + "epoch": 0.008417002344736368, + "flos": 63408228036480.0, + "grad_norm": 1.2879149664159901, + "language_loss": 0.62019843, + "learning_rate": 3.0778051716749317e-06, + "loss": 0.64971262, + "num_input_tokens_seen": 1561150, + "step": 70, + "time_per_iteration": 4.691477060317993 + }, + { + "auxiliary_loss_clip": 0.01870178, + "auxiliary_loss_mlp": 0.01209066, + "balance_loss_clip": 1.3114171, + "balance_loss_mlp": 1.07373953, + "epoch": 0.008537245235375458, + "flos": 22966454286720.0, + "grad_norm": 3.9371301275607133, + "language_loss": 0.90500462, + "learning_rate": 3.0880811730470094e-06, + "loss": 0.9357971, + "num_input_tokens_seen": 1580605, + "step": 71, + "time_per_iteration": 2.6047046184539795 + }, + { + "auxiliary_loss_clip": 0.01713358, + "auxiliary_loss_mlp": 0.01173053, + "balance_loss_clip": 1.32955694, + "balance_loss_mlp": 1.10782194, + "epoch": 0.008657488126014549, + "flos": 61984046712960.0, + "grad_norm": 1.1469052470377348, + "language_loss": 0.58562702, + "learning_rate": 3.098213449401257e-06, + "loss": 0.61449116, + "num_input_tokens_seen": 1647535, + "step": 72, + "time_per_iteration": 3.0961344242095947 + }, + { + "auxiliary_loss_clip": 0.01859456, + "auxiliary_loss_mlp": 0.01207954, + "balance_loss_clip": 1.30954051, + "balance_loss_mlp": 1.07949436, + "epoch": 0.00877773101665364, + "flos": 30296791152000.0, + "grad_norm": 3.0923662648706105, + "language_loss": 0.98872817, + "learning_rate": 3.1082059657570015e-06, + "loss": 1.01940227, + "num_input_tokens_seen": 1666770, + "step": 73, + "time_per_iteration": 2.663571357727051 + }, + { + "auxiliary_loss_clip": 0.01828815, + "auxiliary_loss_mlp": 0.01197022, + "balance_loss_clip": 1.29955816, + "balance_loss_mlp": 1.06617796, + "epoch": 0.00889797390729273, + "flos": 23514056104320.0, + "grad_norm": 2.64196348040785, + "language_loss": 0.96697056, + "learning_rate": 3.1180625252858496e-06, + "loss": 0.99722892, + "num_input_tokens_seen": 1685200, + "step": 74, + "time_per_iteration": 2.6235556602478027 + }, + { + "auxiliary_loss_clip": 0.01813272, + "auxiliary_loss_mlp": 0.01206079, + "balance_loss_clip": 1.29128432, + "balance_loss_mlp": 1.08267403, + "epoch": 0.009018216797931822, + "flos": 23075838178560.0, + "grad_norm": 2.7178307255778194, + "language_loss": 0.800735, + "learning_rate": 3.1277867780021663e-06, + "loss": 0.83092856, + "num_input_tokens_seen": 1701835, + "step": 75, + "time_per_iteration": 2.5933990478515625 + }, + { + "auxiliary_loss_clip": 0.01792765, + "auxiliary_loss_mlp": 0.01176446, + "balance_loss_clip": 1.28396213, + "balance_loss_mlp": 1.06362629, + "epoch": 0.009138459688570914, + "flos": 15918877284480.0, + "grad_norm": 2.1518130708773873, + "language_loss": 0.95679742, + "learning_rate": 3.1373822288779824e-06, + "loss": 0.98648953, + "num_input_tokens_seen": 1718415, + "step": 76, + "time_per_iteration": 2.572849988937378 + }, + { + "auxiliary_loss_clip": 0.01792212, + "auxiliary_loss_mlp": 0.01208224, + "balance_loss_clip": 1.28557599, + "balance_loss_mlp": 1.08748853, + "epoch": 0.009258702579210003, + "flos": 27016531372800.0, + "grad_norm": 1.8671540037119525, + "language_loss": 0.79442704, + "learning_rate": 3.1468522454274533e-06, + "loss": 0.8244313, + "num_input_tokens_seen": 1738770, + "step": 77, + "time_per_iteration": 2.8733348846435547 + }, + { + "auxiliary_loss_clip": 0.01783026, + "auxiliary_loss_mlp": 0.01189995, + "balance_loss_clip": 1.2798388, + "balance_loss_mlp": 1.07145357, + "epoch": 0.009378945469849095, + "flos": 26903196984960.0, + "grad_norm": 2.7797983621049163, + "language_loss": 0.91761243, + "learning_rate": 3.15620006480197e-06, + "loss": 0.94734263, + "num_input_tokens_seen": 1758040, + "step": 78, + "time_per_iteration": 2.6800432205200195 + }, + { + "auxiliary_loss_clip": 0.01780022, + "auxiliary_loss_mlp": 0.01185347, + "balance_loss_clip": 1.27723634, + "balance_loss_mlp": 1.06785405, + "epoch": 0.009499188360488187, + "flos": 35694236327040.0, + "grad_norm": 3.2870263479744204, + "language_loss": 0.75202656, + "learning_rate": 3.1654288004333087e-06, + "loss": 0.78168023, + "num_input_tokens_seen": 1776705, + "step": 79, + "time_per_iteration": 2.6883795261383057 + }, + { + "auxiliary_loss_clip": 0.01760491, + "auxiliary_loss_mlp": 0.01177035, + "balance_loss_clip": 1.27155077, + "balance_loss_mlp": 1.06941271, + "epoch": 0.009619431251127276, + "flos": 21503201944320.0, + "grad_norm": 2.9457083423954935, + "language_loss": 0.75987071, + "learning_rate": 3.1745414482589353e-06, + "loss": 0.78924596, + "num_input_tokens_seen": 1795915, + "step": 80, + "time_per_iteration": 2.6029884815216064 + }, + { + "auxiliary_loss_clip": 0.01749425, + "auxiliary_loss_mlp": 0.01173265, + "balance_loss_clip": 1.26705146, + "balance_loss_mlp": 1.06354523, + "epoch": 0.009739674141766368, + "flos": 17421056991360.0, + "grad_norm": 3.8187317974969033, + "language_loss": 0.87147331, + "learning_rate": 3.1835408925606204e-06, + "loss": 0.90070021, + "num_input_tokens_seen": 1814055, + "step": 81, + "time_per_iteration": 2.5797977447509766 + }, + { + "auxiliary_loss_clip": 0.01728687, + "auxiliary_loss_mlp": 0.01184645, + "balance_loss_clip": 1.26019275, + "balance_loss_mlp": 1.07683158, + "epoch": 0.00985991703240546, + "flos": 27527109246720.0, + "grad_norm": 2.68649661508314, + "language_loss": 0.89399135, + "learning_rate": 3.1924299114448214e-06, + "loss": 0.92312467, + "num_input_tokens_seen": 1834535, + "step": 82, + "time_per_iteration": 2.7107739448547363 + }, + { + "auxiliary_loss_clip": 0.01737396, + "auxiliary_loss_mlp": 0.01183328, + "balance_loss_clip": 1.2640872, + "balance_loss_mlp": 1.07623029, + "epoch": 0.00998015992304455, + "flos": 13808084509440.0, + "grad_norm": 6.8096691916540575, + "language_loss": 0.83485675, + "learning_rate": 3.2012111819909055e-06, + "loss": 0.86406404, + "num_input_tokens_seen": 1851865, + "step": 83, + "time_per_iteration": 2.651357650756836 + }, + { + "auxiliary_loss_clip": 0.01725781, + "auxiliary_loss_mlp": 0.01177782, + "balance_loss_clip": 1.25757849, + "balance_loss_mlp": 1.07302141, + "epoch": 0.010100402813683641, + "flos": 20191385341440.0, + "grad_norm": 4.131325453323916, + "language_loss": 0.95071375, + "learning_rate": 3.2098872850910627e-06, + "loss": 0.97974938, + "num_input_tokens_seen": 1868540, + "step": 84, + "time_per_iteration": 2.62860107421875 + }, + { + "auxiliary_loss_clip": 0.0172459, + "auxiliary_loss_mlp": 0.01178415, + "balance_loss_clip": 1.25969911, + "balance_loss_mlp": 1.07804024, + "epoch": 0.010220645704322733, + "flos": 17201642762880.0, + "grad_norm": 2.1646046732958397, + "language_loss": 0.89259446, + "learning_rate": 3.2184607100038194e-06, + "loss": 0.92162454, + "num_input_tokens_seen": 1887180, + "step": 85, + "time_per_iteration": 2.581496477127075 + }, + { + "auxiliary_loss_clip": 0.0172302, + "auxiliary_loss_mlp": 0.01180579, + "balance_loss_clip": 1.25998425, + "balance_loss_mlp": 1.08096766, + "epoch": 0.010340888594961822, + "flos": 21470415805440.0, + "grad_norm": 2.2043277630126408, + "language_loss": 0.93303382, + "learning_rate": 3.2269338586412414e-06, + "loss": 0.96206981, + "num_input_tokens_seen": 1904765, + "step": 86, + "time_per_iteration": 2.608494758605957 + }, + { + "auxiliary_loss_clip": 0.01708152, + "auxiliary_loss_mlp": 0.011696, + "balance_loss_clip": 1.25247729, + "balance_loss_mlp": 1.07637846, + "epoch": 0.010461131485600914, + "flos": 23002831785600.0, + "grad_norm": 6.519713521576016, + "language_loss": 0.96458882, + "learning_rate": 3.2353090496083106e-06, + "loss": 0.99336636, + "num_input_tokens_seen": 1922600, + "step": 87, + "time_per_iteration": 2.6073639392852783 + }, + { + "auxiliary_loss_clip": 0.016869, + "auxiliary_loss_mlp": 0.01166923, + "balance_loss_clip": 1.2434237, + "balance_loss_mlp": 1.07970929, + "epoch": 0.010581374376240005, + "flos": 33546850571520.0, + "grad_norm": 2.0912135427214995, + "language_loss": 0.81207597, + "learning_rate": 3.2435885220114572e-06, + "loss": 0.8406142, + "num_input_tokens_seen": 1943950, + "step": 88, + "time_per_iteration": 2.6951162815093994 + }, + { + "auxiliary_loss_clip": 0.01694567, + "auxiliary_loss_mlp": 0.01154509, + "balance_loss_clip": 1.24924529, + "balance_loss_mlp": 1.06309986, + "epoch": 0.010701617266879095, + "flos": 21763087822080.0, + "grad_norm": 2.3079840278445602, + "language_loss": 0.94123149, + "learning_rate": 3.2517744390519113e-06, + "loss": 0.96972227, + "num_input_tokens_seen": 1962815, + "step": 89, + "time_per_iteration": 2.5949528217315674 + }, + { + "auxiliary_loss_clip": 0.0168119, + "auxiliary_loss_mlp": 0.0115566, + "balance_loss_clip": 1.23525751, + "balance_loss_mlp": 1.06901884, + "epoch": 0.010821860157518187, + "flos": 19060199256960.0, + "grad_norm": 1.9752752942284109, + "language_loss": 0.75446093, + "learning_rate": 3.259868891418298e-06, + "loss": 0.78282946, + "num_input_tokens_seen": 1980580, + "step": 90, + "time_per_iteration": 2.5764966011047363 + }, + { + "auxiliary_loss_clip": 0.01688902, + "auxiliary_loss_mlp": 0.01188954, + "balance_loss_clip": 1.24499321, + "balance_loss_mlp": 1.098593, + "epoch": 0.010942103048157278, + "flos": 25447378757760.0, + "grad_norm": 5.7842798133925895, + "language_loss": 0.85219717, + "learning_rate": 3.2678739004917757e-06, + "loss": 0.88097572, + "num_input_tokens_seen": 2000315, + "step": 91, + "time_per_iteration": 2.628726005554199 + }, + { + "auxiliary_loss_clip": 0.01671089, + "auxiliary_loss_mlp": 0.01169394, + "balance_loss_clip": 1.23823953, + "balance_loss_mlp": 1.0846597, + "epoch": 0.011062345938796368, + "flos": 27493928058240.0, + "grad_norm": 4.34082175441669, + "language_loss": 0.92130125, + "learning_rate": 3.275791421376029e-06, + "loss": 0.94970608, + "num_input_tokens_seen": 2023760, + "step": 92, + "time_per_iteration": 2.6944386959075928 + }, + { + "auxiliary_loss_clip": 0.01660834, + "auxiliary_loss_mlp": 0.01148617, + "balance_loss_clip": 1.23072314, + "balance_loss_mlp": 1.07084429, + "epoch": 0.01118258882943546, + "flos": 16071210864000.0, + "grad_norm": 2.3885592719217086, + "language_loss": 0.96017957, + "learning_rate": 3.2836233457634622e-06, + "loss": 0.98827404, + "num_input_tokens_seen": 2041895, + "step": 93, + "time_per_iteration": 2.5680718421936035 + }, + { + "auxiliary_loss_clip": 0.01658419, + "auxiliary_loss_mlp": 0.01182249, + "balance_loss_clip": 1.22944057, + "balance_loss_mlp": 1.08855033, + "epoch": 0.011302831720074551, + "flos": 20668602458880.0, + "grad_norm": 2.036036117434603, + "language_loss": 0.85517466, + "learning_rate": 3.2913715046481135e-06, + "loss": 0.8835814, + "num_input_tokens_seen": 2061640, + "step": 94, + "time_per_iteration": 2.587519407272339 + }, + { + "auxiliary_loss_clip": 0.01654984, + "auxiliary_loss_mlp": 0.01160084, + "balance_loss_clip": 1.22769237, + "balance_loss_mlp": 1.08054793, + "epoch": 0.011423074610713641, + "flos": 13072238490240.0, + "grad_norm": 2.206713127464866, + "language_loss": 0.88959599, + "learning_rate": 3.299037670895023e-06, + "loss": 0.91774666, + "num_input_tokens_seen": 2078255, + "step": 95, + "time_per_iteration": 3.3570058345794678 + }, + { + "auxiliary_loss_clip": 0.01656564, + "auxiliary_loss_mlp": 0.0114764, + "balance_loss_clip": 1.2332828, + "balance_loss_mlp": 1.0658145, + "epoch": 0.011543317501352733, + "flos": 30335646689280.0, + "grad_norm": 2.0368373245882996, + "language_loss": 0.80340135, + "learning_rate": 3.3066235616750667e-06, + "loss": 0.83144331, + "num_input_tokens_seen": 2099490, + "step": 96, + "time_per_iteration": 2.668557643890381 + }, + { + "auxiliary_loss_clip": 0.0163604, + "auxiliary_loss_mlp": 0.01141963, + "balance_loss_clip": 1.22135949, + "balance_loss_mlp": 1.06419086, + "epoch": 0.011663560391991824, + "flos": 15522962601600.0, + "grad_norm": 2.248558158994509, + "language_loss": 0.92334008, + "learning_rate": 3.3141308407736276e-06, + "loss": 0.95112014, + "num_input_tokens_seen": 2116125, + "step": 97, + "time_per_iteration": 4.238301515579224 + }, + { + "auxiliary_loss_clip": 0.01642414, + "auxiliary_loss_mlp": 0.01147753, + "balance_loss_clip": 1.21891952, + "balance_loss_mlp": 1.07088661, + "epoch": 0.011783803282630914, + "flos": 19902125116800.0, + "grad_norm": 2.8025273818912493, + "language_loss": 0.86707199, + "learning_rate": 3.321561120780869e-06, + "loss": 0.8949737, + "num_input_tokens_seen": 2134835, + "step": 98, + "time_per_iteration": 2.583487033843994 + }, + { + "auxiliary_loss_clip": 0.01631622, + "auxiliary_loss_mlp": 0.01141896, + "balance_loss_clip": 1.22142577, + "balance_loss_mlp": 1.07308865, + "epoch": 0.011904046173270006, + "flos": 22340674517760.0, + "grad_norm": 2.317475111446279, + "language_loss": 1.01683676, + "learning_rate": 3.3289159651708192e-06, + "loss": 1.044572, + "num_input_tokens_seen": 2152410, + "step": 99, + "time_per_iteration": 2.5845963954925537 + }, + { + "auxiliary_loss_clip": 0.01629017, + "auxiliary_loss_mlp": 0.01141992, + "balance_loss_clip": 1.21788228, + "balance_loss_mlp": 1.06641304, + "epoch": 0.012024289063909096, + "flos": 19100060375040.0, + "grad_norm": 1.8984900891722585, + "language_loss": 0.97687185, + "learning_rate": 3.3361968902759768e-06, + "loss": 1.00458193, + "num_input_tokens_seen": 2172090, + "step": 100, + "time_per_iteration": 2.581552267074585 + }, + { + "auxiliary_loss_clip": 0.01622033, + "auxiliary_loss_mlp": 0.01129354, + "balance_loss_clip": 1.21535504, + "balance_loss_mlp": 1.06269217, + "epoch": 0.012144531954548187, + "flos": 15012205159680.0, + "grad_norm": 2.3553081798729405, + "language_loss": 0.93901598, + "learning_rate": 3.343405367163663e-06, + "loss": 0.96652985, + "num_input_tokens_seen": 2189020, + "step": 101, + "time_per_iteration": 2.5688493251800537 + }, + { + "auxiliary_loss_clip": 0.01623505, + "auxiliary_loss_mlp": 0.01137665, + "balance_loss_clip": 1.21441579, + "balance_loss_mlp": 1.06909537, + "epoch": 0.012264774845187279, + "flos": 15122020014720.0, + "grad_norm": 2.4947859643671224, + "language_loss": 0.81225544, + "learning_rate": 3.350542823419951e-06, + "loss": 0.83986712, + "num_input_tokens_seen": 2205620, + "step": 102, + "time_per_iteration": 2.5664992332458496 + }, + { + "auxiliary_loss_clip": 0.01617765, + "auxiliary_loss_mlp": 0.01151773, + "balance_loss_clip": 1.20887589, + "balance_loss_mlp": 1.08358526, + "epoch": 0.012385017735826368, + "flos": 13949248959360.0, + "grad_norm": 4.354580360466932, + "language_loss": 0.87259769, + "learning_rate": 3.3576106448465615e-06, + "loss": 0.90029311, + "num_input_tokens_seen": 2219000, + "step": 103, + "time_per_iteration": 2.5770623683929443 + }, + { + "auxiliary_loss_clip": 0.01607287, + "auxiliary_loss_mlp": 0.01137285, + "balance_loss_clip": 1.20722914, + "balance_loss_mlp": 1.0683825, + "epoch": 0.01250526062646546, + "flos": 23623260428160.0, + "grad_norm": 4.776494348644612, + "language_loss": 0.88241792, + "learning_rate": 3.3646101770757797e-06, + "loss": 0.90986365, + "num_input_tokens_seen": 2237790, + "step": 104, + "time_per_iteration": 2.6096255779266357 + }, + { + "auxiliary_loss_clip": 0.01599114, + "auxiliary_loss_mlp": 0.01141021, + "balance_loss_clip": 1.20317793, + "balance_loss_mlp": 1.06825602, + "epoch": 0.012625503517104552, + "flos": 34640078958720.0, + "grad_norm": 1.7997822864099609, + "language_loss": 0.85644877, + "learning_rate": 3.371542727108104e-06, + "loss": 0.88385016, + "num_input_tokens_seen": 2259965, + "step": 105, + "time_per_iteration": 2.7218105792999268 + }, + { + "auxiliary_loss_clip": 0.01601414, + "auxiliary_loss_mlp": 0.01177821, + "balance_loss_clip": 1.20554364, + "balance_loss_mlp": 1.10739207, + "epoch": 0.012745746407743641, + "flos": 17821891837440.0, + "grad_norm": 2.9543229572457315, + "language_loss": 0.89857131, + "learning_rate": 3.3784095647770114e-06, + "loss": 0.92636371, + "num_input_tokens_seen": 2278610, + "step": 106, + "time_per_iteration": 2.590902328491211 + }, + { + "auxiliary_loss_clip": 0.01591407, + "auxiliary_loss_mlp": 0.01139702, + "balance_loss_clip": 1.19527221, + "balance_loss_mlp": 1.07027435, + "epoch": 0.012865989298382733, + "flos": 20595057361920.0, + "grad_norm": 3.5612755663292512, + "language_loss": 0.88621837, + "learning_rate": 3.3852119241449547e-06, + "loss": 0.91352946, + "num_input_tokens_seen": 2297730, + "step": 107, + "time_per_iteration": 2.592797040939331 + }, + { + "auxiliary_loss_clip": 0.01587599, + "auxiliary_loss_mlp": 0.01133174, + "balance_loss_clip": 1.19526589, + "balance_loss_mlp": 1.06627393, + "epoch": 0.012986232189021825, + "flos": 23948969978880.0, + "grad_norm": 3.271836694059246, + "language_loss": 0.96381533, + "learning_rate": 3.3919510048344295e-06, + "loss": 0.99102306, + "num_input_tokens_seen": 2315740, + "step": 108, + "time_per_iteration": 2.604032278060913 + }, + { + "auxiliary_loss_clip": 0.01576114, + "auxiliary_loss_mlp": 0.01132026, + "balance_loss_clip": 1.19042826, + "balance_loss_mlp": 1.07060909, + "epoch": 0.013106475079660914, + "flos": 23725425686400.0, + "grad_norm": 2.154975618963108, + "language_loss": 0.86655784, + "learning_rate": 3.3986279732976907e-06, + "loss": 0.89363921, + "num_input_tokens_seen": 2334215, + "step": 109, + "time_per_iteration": 2.597432851791382 + }, + { + "auxiliary_loss_clip": 0.01571114, + "auxiliary_loss_mlp": 0.01110737, + "balance_loss_clip": 1.18780768, + "balance_loss_mlp": 1.05060816, + "epoch": 0.013226717970300006, + "flos": 21102438925440.0, + "grad_norm": 1.9122955654924654, + "language_loss": 0.95534748, + "learning_rate": 3.4052439640284983e-06, + "loss": 0.98216605, + "num_input_tokens_seen": 2353130, + "step": 110, + "time_per_iteration": 2.5930137634277344 + }, + { + "auxiliary_loss_clip": 0.01572513, + "auxiliary_loss_mlp": 0.01130658, + "balance_loss_clip": 1.19114435, + "balance_loss_mlp": 1.06790638, + "epoch": 0.013346960860939098, + "flos": 24863902231680.0, + "grad_norm": 1.995696459837898, + "language_loss": 0.81182849, + "learning_rate": 3.4118000807190217e-06, + "loss": 0.83886015, + "num_input_tokens_seen": 2374010, + "step": 111, + "time_per_iteration": 2.62113356590271 + }, + { + "auxiliary_loss_clip": 0.0157428, + "auxiliary_loss_mlp": 0.01128069, + "balance_loss_clip": 1.18879116, + "balance_loss_mlp": 1.0675106, + "epoch": 0.013467203751578187, + "flos": 28181940140160.0, + "grad_norm": 1.726319810236208, + "language_loss": 0.76064789, + "learning_rate": 3.4182973973648723e-06, + "loss": 0.78767133, + "num_input_tokens_seen": 2395220, + "step": 112, + "time_per_iteration": 2.6450231075286865 + }, + { + "auxiliary_loss_clip": 0.01561745, + "auxiliary_loss_mlp": 0.01147629, + "balance_loss_clip": 1.18498135, + "balance_loss_mlp": 1.0878818, + "epoch": 0.013587446642217279, + "flos": 18916233546240.0, + "grad_norm": 6.381097018553025, + "language_loss": 0.95047188, + "learning_rate": 3.424736959321014e-06, + "loss": 0.97756553, + "num_input_tokens_seen": 2413025, + "step": 113, + "time_per_iteration": 2.6307506561279297 + }, + { + "auxiliary_loss_clip": 0.01564419, + "auxiliary_loss_mlp": 0.01141227, + "balance_loss_clip": 1.18456137, + "balance_loss_mlp": 1.07954824, + "epoch": 0.01370768953285637, + "flos": 23988615615360.0, + "grad_norm": 3.8997413783036055, + "language_loss": 0.88928545, + "learning_rate": 3.431119784311155e-06, + "loss": 0.9163419, + "num_input_tokens_seen": 2432700, + "step": 114, + "time_per_iteration": 2.617321252822876 + }, + { + "auxiliary_loss_clip": 0.01549075, + "auxiliary_loss_mlp": 0.01127632, + "balance_loss_clip": 1.17896593, + "balance_loss_mlp": 1.07072115, + "epoch": 0.01382793242349546, + "flos": 39202565512320.0, + "grad_norm": 3.0172273045183085, + "language_loss": 0.7746948, + "learning_rate": 3.43744686339307e-06, + "loss": 0.80146188, + "num_input_tokens_seen": 2455020, + "step": 115, + "time_per_iteration": 2.725250482559204 + }, + { + "auxiliary_loss_clip": 0.0154605, + "auxiliary_loss_mlp": 0.01093096, + "balance_loss_clip": 1.17413783, + "balance_loss_mlp": 1.03978562, + "epoch": 0.013948175314134552, + "flos": 41353506714240.0, + "grad_norm": 2.2588089588709486, + "language_loss": 0.91154039, + "learning_rate": 3.44371916188212e-06, + "loss": 0.93793184, + "num_input_tokens_seen": 2475775, + "step": 116, + "time_per_iteration": 2.7487893104553223 + }, + { + "auxiliary_loss_clip": 0.01541149, + "auxiliary_loss_mlp": 0.01108299, + "balance_loss_clip": 1.17378497, + "balance_loss_mlp": 1.05749214, + "epoch": 0.014068418204773643, + "flos": 22453542028800.0, + "grad_norm": 2.7632665614576686, + "language_loss": 0.86486822, + "learning_rate": 3.449937620235143e-06, + "loss": 0.89136273, + "num_input_tokens_seen": 2496370, + "step": 117, + "time_per_iteration": 2.623762369155884 + }, + { + "auxiliary_loss_clip": 0.01544147, + "auxiliary_loss_mlp": 0.01115375, + "balance_loss_clip": 1.1755408, + "balance_loss_mlp": 1.06161189, + "epoch": 0.014188661095412733, + "flos": 23805147922560.0, + "grad_norm": 1.8586461572518316, + "language_loss": 0.89305121, + "learning_rate": 3.456103154896722e-06, + "loss": 0.91964638, + "num_input_tokens_seen": 2517645, + "step": 118, + "time_per_iteration": 2.6543915271759033 + }, + { + "auxiliary_loss_clip": 0.01531473, + "auxiliary_loss_mlp": 0.01122257, + "balance_loss_clip": 1.16844773, + "balance_loss_mlp": 1.0705682, + "epoch": 0.014308903986051825, + "flos": 23660248458240.0, + "grad_norm": 1.990442269860647, + "language_loss": 0.92346728, + "learning_rate": 3.462216659109757e-06, + "loss": 0.95000458, + "num_input_tokens_seen": 2537825, + "step": 119, + "time_per_iteration": 2.6098403930664062 + }, + { + "auxiliary_loss_clip": 0.01550517, + "auxiliary_loss_mlp": 0.01131573, + "balance_loss_clip": 1.17779422, + "balance_loss_mlp": 1.07959819, + "epoch": 0.014429146876690916, + "flos": 20667991927680.0, + "grad_norm": 2.264497773176269, + "language_loss": 0.85299343, + "learning_rate": 3.4682790036921077e-06, + "loss": 0.87981433, + "num_input_tokens_seen": 2556485, + "step": 120, + "time_per_iteration": 2.594604015350342 + }, + { + "auxiliary_loss_clip": 0.01525533, + "auxiliary_loss_mlp": 0.01108963, + "balance_loss_clip": 1.1692847, + "balance_loss_mlp": 1.06552291, + "epoch": 0.014549389767330006, + "flos": 20229199384320.0, + "grad_norm": 1.9851201241183256, + "language_loss": 0.83216095, + "learning_rate": 3.4742910377810193e-06, + "loss": 0.85850596, + "num_input_tokens_seen": 2573945, + "step": 121, + "time_per_iteration": 2.642075777053833 + }, + { + "auxiliary_loss_clip": 0.01525128, + "auxiliary_loss_mlp": 0.01118339, + "balance_loss_clip": 1.16808343, + "balance_loss_mlp": 1.07199001, + "epoch": 0.014669632657969098, + "flos": 18004174381440.0, + "grad_norm": 2.2887279017947977, + "language_loss": 0.88897741, + "learning_rate": 3.4802535895469042e-06, + "loss": 0.91541207, + "num_input_tokens_seen": 2592695, + "step": 122, + "time_per_iteration": 3.374725580215454 + }, + { + "auxiliary_loss_clip": 0.01526636, + "auxiliary_loss_mlp": 0.01107706, + "balance_loss_clip": 1.16711879, + "balance_loss_mlp": 1.0610956, + "epoch": 0.01478987554860819, + "flos": 22741796672640.0, + "grad_norm": 2.267521836430062, + "language_loss": 0.89663643, + "learning_rate": 3.4861674668779934e-06, + "loss": 0.92297983, + "num_input_tokens_seen": 2610925, + "step": 123, + "time_per_iteration": 2.5955233573913574 + }, + { + "auxiliary_loss_clip": 0.01518221, + "auxiliary_loss_mlp": 0.01105952, + "balance_loss_clip": 1.16208756, + "balance_loss_mlp": 1.05810189, + "epoch": 0.01491011843924728, + "flos": 17198590106880.0, + "grad_norm": 2.6286553866275217, + "language_loss": 0.84037566, + "learning_rate": 3.492033458037272e-06, + "loss": 0.86661744, + "num_input_tokens_seen": 2629495, + "step": 124, + "time_per_iteration": 5.0272181034088135 + }, + { + "auxiliary_loss_clip": 0.01514433, + "auxiliary_loss_mlp": 0.01110268, + "balance_loss_clip": 1.15931463, + "balance_loss_mlp": 1.06709063, + "epoch": 0.01503036132988637, + "flos": 17673867889920.0, + "grad_norm": 2.3706329892284135, + "language_loss": 0.87002653, + "learning_rate": 3.497852332293018e-06, + "loss": 0.89627355, + "num_input_tokens_seen": 2645070, + "step": 125, + "time_per_iteration": 2.516751766204834 + }, + { + "auxiliary_loss_clip": 0.01512182, + "auxiliary_loss_mlp": 0.01113349, + "balance_loss_clip": 1.16069603, + "balance_loss_mlp": 1.07129157, + "epoch": 0.015150604220525462, + "flos": 18878239935360.0, + "grad_norm": 9.927683912181434, + "language_loss": 0.96701115, + "learning_rate": 3.5036248405242356e-06, + "loss": 0.9932664, + "num_input_tokens_seen": 2663825, + "step": 126, + "time_per_iteration": 2.5550808906555176 + }, + { + "auxiliary_loss_clip": 0.01512431, + "auxiliary_loss_mlp": 0.0110902, + "balance_loss_clip": 1.1588161, + "balance_loss_mlp": 1.06336308, + "epoch": 0.015270847111164552, + "flos": 39420184060800.0, + "grad_norm": 2.4890633873311288, + "language_loss": 0.82861608, + "learning_rate": 3.509351715802146e-06, + "loss": 0.85483056, + "num_input_tokens_seen": 2684710, + "step": 127, + "time_per_iteration": 2.7203259468078613 + }, + { + "auxiliary_loss_clip": 0.01510152, + "auxiliary_loss_mlp": 0.01122115, + "balance_loss_clip": 1.15730858, + "balance_loss_mlp": 1.07474113, + "epoch": 0.015391090001803644, + "flos": 43762466286720.0, + "grad_norm": 2.231390561886709, + "language_loss": 0.7863512, + "learning_rate": 3.5150336739488763e-06, + "loss": 0.81267393, + "num_input_tokens_seen": 2706995, + "step": 128, + "time_per_iteration": 2.7562105655670166 + }, + { + "auxiliary_loss_clip": 0.01506023, + "auxiliary_loss_mlp": 0.0108955, + "balance_loss_clip": 1.15751505, + "balance_loss_mlp": 1.04997277, + "epoch": 0.015511332892442733, + "flos": 18916341287040.0, + "grad_norm": 2.438631534235271, + "language_loss": 0.84215492, + "learning_rate": 3.5206714140744143e-06, + "loss": 0.86811066, + "num_input_tokens_seen": 2727050, + "step": 129, + "time_per_iteration": 2.5441577434539795 + }, + { + "auxiliary_loss_clip": 0.01508841, + "auxiliary_loss_mlp": 0.01115343, + "balance_loss_clip": 1.16043437, + "balance_loss_mlp": 1.07271326, + "epoch": 0.015631575783081827, + "flos": 24535283679360.0, + "grad_norm": 3.020002872777889, + "language_loss": 0.87800133, + "learning_rate": 3.5262656190928208e-06, + "loss": 0.90424311, + "num_input_tokens_seen": 2745350, + "step": 130, + "time_per_iteration": 2.560955047607422 + }, + { + "auxiliary_loss_clip": 0.01458089, + "auxiliary_loss_mlp": 0.01068653, + "balance_loss_clip": 1.16608071, + "balance_loss_mlp": 1.0485301, + "epoch": 0.015751818673720917, + "flos": 62328536098560.0, + "grad_norm": 1.0294830305896912, + "language_loss": 0.7150535, + "learning_rate": 3.5318169562186737e-06, + "loss": 0.74032092, + "num_input_tokens_seen": 2814195, + "step": 131, + "time_per_iteration": 3.1654999256134033 + }, + { + "auxiliary_loss_clip": 0.01495531, + "auxiliary_loss_mlp": 0.01119257, + "balance_loss_clip": 1.15350318, + "balance_loss_mlp": 1.07948852, + "epoch": 0.015872061564360006, + "flos": 23878549365120.0, + "grad_norm": 2.1784417595977352, + "language_loss": 0.82420343, + "learning_rate": 3.5373260774446292e-06, + "loss": 0.85035133, + "num_input_tokens_seen": 2834645, + "step": 132, + "time_per_iteration": 2.5698916912078857 + }, + { + "auxiliary_loss_clip": 0.01493349, + "auxiliary_loss_mlp": 0.01113717, + "balance_loss_clip": 1.1525147, + "balance_loss_mlp": 1.0737344, + "epoch": 0.0159923044549991, + "flos": 23367899664000.0, + "grad_norm": 5.032845195464264, + "language_loss": 0.90439296, + "learning_rate": 3.542793620000961e-06, + "loss": 0.93046361, + "num_input_tokens_seen": 2854120, + "step": 133, + "time_per_iteration": 2.5637803077697754 + }, + { + "auxiliary_loss_clip": 0.01490488, + "auxiliary_loss_mlp": 0.01108952, + "balance_loss_clip": 1.15090585, + "balance_loss_mlp": 1.06799197, + "epoch": 0.01611254734563819, + "flos": 17858305249920.0, + "grad_norm": 2.8239559178069595, + "language_loss": 0.87485325, + "learning_rate": 3.5482202067978894e-06, + "loss": 0.90084767, + "num_input_tokens_seen": 2871330, + "step": 134, + "time_per_iteration": 2.526554822921753 + }, + { + "auxiliary_loss_clip": 0.01489062, + "auxiliary_loss_mlp": 0.01103539, + "balance_loss_clip": 1.15231252, + "balance_loss_mlp": 1.06348515, + "epoch": 0.01623279023627728, + "flos": 20954774113920.0, + "grad_norm": 3.2803857990800886, + "language_loss": 0.75892198, + "learning_rate": 3.553606446851471e-06, + "loss": 0.78484803, + "num_input_tokens_seen": 2888070, + "step": 135, + "time_per_iteration": 2.561326026916504 + }, + { + "auxiliary_loss_clip": 0.01476671, + "auxiliary_loss_mlp": 0.01099344, + "balance_loss_clip": 1.14409137, + "balance_loss_mlp": 1.05976653, + "epoch": 0.016353033126916373, + "flos": 15742412743680.0, + "grad_norm": 1.7595075629792851, + "language_loss": 0.83433354, + "learning_rate": 3.5589529356937613e-06, + "loss": 0.86009371, + "num_input_tokens_seen": 2906465, + "step": 136, + "time_per_iteration": 2.5772318840026855 + }, + { + "auxiliary_loss_clip": 0.01485606, + "auxiliary_loss_mlp": 0.01100837, + "balance_loss_clip": 1.14757085, + "balance_loss_mlp": 1.06164122, + "epoch": 0.016473276017555463, + "flos": 18807280617600.0, + "grad_norm": 1.732950356528454, + "language_loss": 0.77090061, + "learning_rate": 3.5642602557679627e-06, + "loss": 0.79676503, + "num_input_tokens_seen": 2924915, + "step": 137, + "time_per_iteration": 2.5599591732025146 + }, + { + "auxiliary_loss_clip": 0.01480111, + "auxiliary_loss_mlp": 0.01088064, + "balance_loss_clip": 1.15349746, + "balance_loss_mlp": 1.05449414, + "epoch": 0.016593518908194552, + "flos": 24352641999360.0, + "grad_norm": 5.200749370730247, + "language_loss": 0.84199989, + "learning_rate": 3.569528976809202e-06, + "loss": 0.86768168, + "num_input_tokens_seen": 2942130, + "step": 138, + "time_per_iteration": 2.5954983234405518 + }, + { + "auxiliary_loss_clip": 0.01482042, + "auxiliary_loss_mlp": 0.01112796, + "balance_loss_clip": 1.14896917, + "balance_loss_mlp": 1.07233655, + "epoch": 0.016713761798833646, + "flos": 22346133384960.0, + "grad_norm": 1.7796812801667605, + "language_loss": 0.90045643, + "learning_rate": 3.5747596562115522e-06, + "loss": 0.92640483, + "num_input_tokens_seen": 2962745, + "step": 139, + "time_per_iteration": 2.6077613830566406 + }, + { + "auxiliary_loss_clip": 0.01485226, + "auxiliary_loss_mlp": 0.01108709, + "balance_loss_clip": 1.14959764, + "balance_loss_mlp": 1.07049036, + "epoch": 0.016834004689472735, + "flos": 17821820010240.0, + "grad_norm": 2.614065059124414, + "language_loss": 0.9090777, + "learning_rate": 3.5799528393819138e-06, + "loss": 0.93501699, + "num_input_tokens_seen": 2981825, + "step": 140, + "time_per_iteration": 2.554339647293091 + }, + { + "auxiliary_loss_clip": 0.01467501, + "auxiliary_loss_mlp": 0.01103847, + "balance_loss_clip": 1.14048219, + "balance_loss_mlp": 1.06803644, + "epoch": 0.016954247580111825, + "flos": 20519501103360.0, + "grad_norm": 2.6163422582780864, + "language_loss": 0.88117659, + "learning_rate": 3.585109060081286e-06, + "loss": 0.90688998, + "num_input_tokens_seen": 3001625, + "step": 141, + "time_per_iteration": 2.567394971847534 + }, + { + "auxiliary_loss_clip": 0.01474074, + "auxiliary_loss_mlp": 0.01102709, + "balance_loss_clip": 1.14409161, + "balance_loss_mlp": 1.06625509, + "epoch": 0.017074490470750915, + "flos": 22088869200000.0, + "grad_norm": 1.8880994173388934, + "language_loss": 0.78752673, + "learning_rate": 3.590228840753992e-06, + "loss": 0.81329459, + "num_input_tokens_seen": 3022055, + "step": 142, + "time_per_iteration": 2.593890905380249 + }, + { + "auxiliary_loss_clip": 0.01465329, + "auxiliary_loss_mlp": 0.01105886, + "balance_loss_clip": 1.14217997, + "balance_loss_mlp": 1.07188785, + "epoch": 0.01719473336139001, + "flos": 15997270717440.0, + "grad_norm": 2.2049118852316965, + "language_loss": 0.87486386, + "learning_rate": 3.5953126928453423e-06, + "loss": 0.90057606, + "num_input_tokens_seen": 3039605, + "step": 143, + "time_per_iteration": 2.5317704677581787 + }, + { + "auxiliary_loss_clip": 0.01461108, + "auxiliary_loss_mlp": 0.01086412, + "balance_loss_clip": 1.13764167, + "balance_loss_mlp": 1.05360556, + "epoch": 0.017314976252029098, + "flos": 22492038430080.0, + "grad_norm": 2.082234841580534, + "language_loss": 0.80529904, + "learning_rate": 3.600361117108239e-06, + "loss": 0.83077431, + "num_input_tokens_seen": 3059405, + "step": 144, + "time_per_iteration": 2.5645899772644043 + }, + { + "auxiliary_loss_clip": 0.01466605, + "auxiliary_loss_mlp": 0.0109213, + "balance_loss_clip": 1.13959026, + "balance_loss_mlp": 1.05751157, + "epoch": 0.017435219142668188, + "flos": 22018053536640.0, + "grad_norm": 2.3836164053262348, + "language_loss": 0.97259957, + "learning_rate": 3.6053746038991616e-06, + "loss": 0.99818695, + "num_input_tokens_seen": 3078490, + "step": 145, + "time_per_iteration": 2.5696916580200195 + }, + { + "auxiliary_loss_clip": 0.01410764, + "auxiliary_loss_mlp": 0.01022467, + "balance_loss_clip": 1.1469425, + "balance_loss_mlp": 1.00596893, + "epoch": 0.01755546203330728, + "flos": 72240526149120.0, + "grad_norm": 1.0576759065620442, + "language_loss": 0.58469176, + "learning_rate": 3.6103536334639843e-06, + "loss": 0.60902405, + "num_input_tokens_seen": 3131755, + "step": 146, + "time_per_iteration": 3.097276449203491 + }, + { + "auxiliary_loss_clip": 0.01456415, + "auxiliary_loss_mlp": 0.01087688, + "balance_loss_clip": 1.13664842, + "balance_loss_mlp": 1.05554962, + "epoch": 0.01767570492394637, + "flos": 25337061112320.0, + "grad_norm": 1.9429705516477995, + "language_loss": 0.85672379, + "learning_rate": 3.615298676214041e-06, + "loss": 0.8821649, + "num_input_tokens_seen": 3152035, + "step": 147, + "time_per_iteration": 2.582542657852173 + }, + { + "auxiliary_loss_clip": 0.01452954, + "auxiliary_loss_mlp": 0.01099399, + "balance_loss_clip": 1.13405442, + "balance_loss_mlp": 1.06752241, + "epoch": 0.01779594781458546, + "flos": 20449188230400.0, + "grad_norm": 2.8225290636597853, + "language_loss": 0.88834655, + "learning_rate": 3.6202101929928317e-06, + "loss": 0.91387004, + "num_input_tokens_seen": 3170625, + "step": 148, + "time_per_iteration": 3.3058583736419678 + }, + { + "auxiliary_loss_clip": 0.01447732, + "auxiliary_loss_mlp": 0.01097344, + "balance_loss_clip": 1.13204956, + "balance_loss_mlp": 1.06685042, + "epoch": 0.017916190705224554, + "flos": 16253601148800.0, + "grad_norm": 2.6652532460221545, + "language_loss": 0.88369137, + "learning_rate": 3.6250886353337413e-06, + "loss": 0.90914214, + "num_input_tokens_seen": 3188155, + "step": 149, + "time_per_iteration": 2.531362771987915 + }, + { + "auxiliary_loss_clip": 0.01461479, + "auxiliary_loss_mlp": 0.01096526, + "balance_loss_clip": 1.13990402, + "balance_loss_mlp": 1.06598485, + "epoch": 0.018036433595863644, + "flos": 23330588411520.0, + "grad_norm": 2.0088655917252174, + "language_loss": 0.8646332, + "learning_rate": 3.6299344457091488e-06, + "loss": 0.89021325, + "num_input_tokens_seen": 3209015, + "step": 150, + "time_per_iteration": 2.6037371158599854 + }, + { + "auxiliary_loss_clip": 0.01453401, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_clip": 1.13643658, + "balance_loss_mlp": 1.0588901, + "epoch": 0.018156676486502734, + "flos": 18588010043520.0, + "grad_norm": 2.160399540885904, + "language_loss": 0.93835902, + "learning_rate": 3.634748057771256e-06, + "loss": 0.96377134, + "num_input_tokens_seen": 3224955, + "step": 151, + "time_per_iteration": 4.976336717605591 + }, + { + "auxiliary_loss_clip": 0.01446179, + "auxiliary_loss_mlp": 0.01089351, + "balance_loss_clip": 1.13402367, + "balance_loss_mlp": 1.06001365, + "epoch": 0.018276919377141827, + "flos": 25448707560960.0, + "grad_norm": 1.6771776838518953, + "language_loss": 0.85689265, + "learning_rate": 3.639529896584965e-06, + "loss": 0.88224798, + "num_input_tokens_seen": 3246330, + "step": 152, + "time_per_iteration": 2.680593729019165 + }, + { + "auxiliary_loss_clip": 0.01446642, + "auxiliary_loss_mlp": 0.01081037, + "balance_loss_clip": 1.13246155, + "balance_loss_mlp": 1.05059123, + "epoch": 0.018397162267780917, + "flos": 20047311889920.0, + "grad_norm": 3.2533214905985424, + "language_loss": 0.88949311, + "learning_rate": 3.6442803788531233e-06, + "loss": 0.91476989, + "num_input_tokens_seen": 3264290, + "step": 153, + "time_per_iteration": 2.542591094970703 + }, + { + "auxiliary_loss_clip": 0.01450881, + "auxiliary_loss_mlp": 0.0109803, + "balance_loss_clip": 1.13325989, + "balance_loss_mlp": 1.06608152, + "epoch": 0.018517405158420007, + "flos": 27565282425600.0, + "grad_norm": 2.1667251712673212, + "language_loss": 0.9614765, + "learning_rate": 3.6489999131344357e-06, + "loss": 0.9869656, + "num_input_tokens_seen": 3287065, + "step": 154, + "time_per_iteration": 2.7482194900512695 + }, + { + "auxiliary_loss_clip": 0.01437658, + "auxiliary_loss_mlp": 0.0108781, + "balance_loss_clip": 1.12920237, + "balance_loss_mlp": 1.06080914, + "epoch": 0.0186376480490591, + "flos": 19354056422400.0, + "grad_norm": 2.2744787991518627, + "language_loss": 0.90718651, + "learning_rate": 3.653688900054313e-06, + "loss": 0.93244112, + "num_input_tokens_seen": 3305595, + "step": 155, + "time_per_iteration": 2.5362863540649414 + }, + { + "auxiliary_loss_clip": 0.01440703, + "auxiliary_loss_mlp": 0.0107426, + "balance_loss_clip": 1.1271894, + "balance_loss_mlp": 1.04522014, + "epoch": 0.01875789093969819, + "flos": 26687840993280.0, + "grad_norm": 2.135603545706687, + "language_loss": 0.76078254, + "learning_rate": 3.6583477325089526e-06, + "loss": 0.78593218, + "num_input_tokens_seen": 3326135, + "step": 156, + "time_per_iteration": 2.5923924446105957 + }, + { + "auxiliary_loss_clip": 0.01434352, + "auxiliary_loss_mlp": 0.01080347, + "balance_loss_clip": 1.1254282, + "balance_loss_mlp": 1.05240452, + "epoch": 0.01887813383033728, + "flos": 24353001135360.0, + "grad_norm": 2.179739141230665, + "language_loss": 1.04206955, + "learning_rate": 3.6629767958628916e-06, + "loss": 1.06721663, + "num_input_tokens_seen": 3343510, + "step": 157, + "time_per_iteration": 2.5653772354125977 + }, + { + "auxiliary_loss_clip": 0.01432091, + "auxiliary_loss_mlp": 0.0108005, + "balance_loss_clip": 1.12781787, + "balance_loss_mlp": 1.05160689, + "epoch": 0.018998376720976373, + "flos": 14647532330880.0, + "grad_norm": 3.6148855800531408, + "language_loss": 0.85596704, + "learning_rate": 3.667576468140291e-06, + "loss": 0.88108844, + "num_input_tokens_seen": 3361325, + "step": 158, + "time_per_iteration": 2.5195393562316895 + }, + { + "auxiliary_loss_clip": 0.01425593, + "auxiliary_loss_mlp": 0.01068401, + "balance_loss_clip": 1.12153316, + "balance_loss_mlp": 1.04234171, + "epoch": 0.019118619611615463, + "flos": 29305261146240.0, + "grad_norm": 2.3857912529684104, + "language_loss": 0.88951373, + "learning_rate": 3.672147120210184e-06, + "loss": 0.91445374, + "num_input_tokens_seen": 3377925, + "step": 159, + "time_per_iteration": 2.598142623901367 + }, + { + "auxiliary_loss_clip": 0.01433606, + "auxiliary_loss_mlp": 0.01080392, + "balance_loss_clip": 1.12984526, + "balance_loss_mlp": 1.05475032, + "epoch": 0.019238862502254553, + "flos": 20886723797760.0, + "grad_norm": 2.62153111620146, + "language_loss": 0.86480957, + "learning_rate": 3.6766891159659177e-06, + "loss": 0.88994956, + "num_input_tokens_seen": 3396335, + "step": 160, + "time_per_iteration": 2.5487401485443115 + }, + { + "auxiliary_loss_clip": 0.0143265, + "auxiliary_loss_mlp": 0.01081581, + "balance_loss_clip": 1.1298672, + "balance_loss_mlp": 1.05590367, + "epoch": 0.019359105392893646, + "flos": 21360672777600.0, + "grad_norm": 5.922963878579769, + "language_loss": 0.88024551, + "learning_rate": 3.6812028124990075e-06, + "loss": 0.90538782, + "num_input_tokens_seen": 3413605, + "step": 161, + "time_per_iteration": 2.6768038272857666 + }, + { + "auxiliary_loss_clip": 0.01426323, + "auxiliary_loss_mlp": 0.01083246, + "balance_loss_clip": 1.12519312, + "balance_loss_mlp": 1.05830765, + "epoch": 0.019479348283532736, + "flos": 16283729681280.0, + "grad_norm": 7.439420692857519, + "language_loss": 0.81577343, + "learning_rate": 3.6856885602676016e-06, + "loss": 0.84086913, + "num_input_tokens_seen": 3429640, + "step": 162, + "time_per_iteration": 2.5244598388671875 + }, + { + "auxiliary_loss_clip": 0.01426714, + "auxiliary_loss_mlp": 0.01085323, + "balance_loss_clip": 1.12677789, + "balance_loss_mlp": 1.0607183, + "epoch": 0.019599591174171826, + "flos": 22091239497600.0, + "grad_norm": 2.9364770665322233, + "language_loss": 0.94292581, + "learning_rate": 3.6901467032597733e-06, + "loss": 0.96804619, + "num_input_tokens_seen": 3448125, + "step": 163, + "time_per_iteration": 2.5787899494171143 + }, + { + "auxiliary_loss_clip": 0.01430259, + "auxiliary_loss_mlp": 0.01070254, + "balance_loss_clip": 1.12589908, + "balance_loss_mlp": 1.04305077, + "epoch": 0.01971983406481092, + "flos": 19609668581760.0, + "grad_norm": 2.8664032329917517, + "language_loss": 0.87311208, + "learning_rate": 3.694577579151804e-06, + "loss": 0.89811718, + "num_input_tokens_seen": 3466535, + "step": 164, + "time_per_iteration": 2.556314468383789 + }, + { + "auxiliary_loss_clip": 0.01428854, + "auxiliary_loss_mlp": 0.01081942, + "balance_loss_clip": 1.12716293, + "balance_loss_mlp": 1.05597794, + "epoch": 0.01984007695545001, + "flos": 19099342103040.0, + "grad_norm": 2.3282177795771317, + "language_loss": 0.73900092, + "learning_rate": 3.6989815194616703e-06, + "loss": 0.76410884, + "num_input_tokens_seen": 3483730, + "step": 165, + "time_per_iteration": 2.56429123878479 + }, + { + "auxiliary_loss_clip": 0.01428013, + "auxiliary_loss_mlp": 0.0108403, + "balance_loss_clip": 1.1232996, + "balance_loss_mlp": 1.05643296, + "epoch": 0.0199603198460891, + "flos": 20848406964480.0, + "grad_norm": 3.169647780258612, + "language_loss": 0.79861784, + "learning_rate": 3.703358849697888e-06, + "loss": 0.82373822, + "num_input_tokens_seen": 3503640, + "step": 166, + "time_per_iteration": 2.5600948333740234 + }, + { + "auxiliary_loss_clip": 0.01424382, + "auxiliary_loss_mlp": 0.01094567, + "balance_loss_clip": 1.12610042, + "balance_loss_mlp": 1.07039142, + "epoch": 0.020080562736728192, + "flos": 21870747861120.0, + "grad_norm": 1.751451113158652, + "language_loss": 0.82654071, + "learning_rate": 3.7077098895038803e-06, + "loss": 0.85173011, + "num_input_tokens_seen": 3523010, + "step": 167, + "time_per_iteration": 2.562889575958252 + }, + { + "auxiliary_loss_clip": 0.0142328, + "auxiliary_loss_mlp": 0.01077316, + "balance_loss_clip": 1.12402582, + "balance_loss_mlp": 1.05284238, + "epoch": 0.020200805627367282, + "flos": 21688788539520.0, + "grad_norm": 3.5201702542785553, + "language_loss": 0.97259778, + "learning_rate": 3.712034952798045e-06, + "loss": 0.99760371, + "num_input_tokens_seen": 3541125, + "step": 168, + "time_per_iteration": 2.5528852939605713 + }, + { + "auxiliary_loss_clip": 0.01419908, + "auxiliary_loss_mlp": 0.01085295, + "balance_loss_clip": 1.11919832, + "balance_loss_mlp": 1.05982018, + "epoch": 0.02032104851800637, + "flos": 33543043729920.0, + "grad_norm": 2.0825055805261092, + "language_loss": 0.84713686, + "learning_rate": 3.7163343479096656e-06, + "loss": 0.87218887, + "num_input_tokens_seen": 3562700, + "step": 169, + "time_per_iteration": 2.685830593109131 + }, + { + "auxiliary_loss_clip": 0.01418172, + "auxiliary_loss_mlp": 0.01080178, + "balance_loss_clip": 1.12273097, + "balance_loss_mlp": 1.05794525, + "epoch": 0.020441291408645465, + "flos": 31686965274240.0, + "grad_norm": 10.034419740486332, + "language_loss": 0.83108127, + "learning_rate": 3.720608377710802e-06, + "loss": 0.8560648, + "num_input_tokens_seen": 3582790, + "step": 170, + "time_per_iteration": 2.6113367080688477 + }, + { + "auxiliary_loss_clip": 0.01411267, + "auxiliary_loss_mlp": 0.01091782, + "balance_loss_clip": 1.11607432, + "balance_loss_mlp": 1.06647348, + "epoch": 0.020561534299284555, + "flos": 20886687884160.0, + "grad_norm": 2.718985769532778, + "language_loss": 0.86390066, + "learning_rate": 3.7248573397443277e-06, + "loss": 0.8889311, + "num_input_tokens_seen": 3601715, + "step": 171, + "time_per_iteration": 2.5358664989471436 + }, + { + "auxiliary_loss_clip": 0.01416104, + "auxiliary_loss_mlp": 0.01092375, + "balance_loss_clip": 1.12181735, + "balance_loss_mlp": 1.06645858, + "epoch": 0.020681777189923645, + "flos": 20996610480000.0, + "grad_norm": 2.718383006326251, + "language_loss": 0.97611809, + "learning_rate": 3.729081526348224e-06, + "loss": 1.00120282, + "num_input_tokens_seen": 3620245, + "step": 172, + "time_per_iteration": 2.5303955078125 + }, + { + "auxiliary_loss_clip": 0.0141646, + "auxiliary_loss_mlp": 0.01071845, + "balance_loss_clip": 1.12051523, + "balance_loss_mlp": 1.04931462, + "epoch": 0.020802020080562738, + "flos": 28257532312320.0, + "grad_norm": 1.841258987290032, + "language_loss": 0.84781528, + "learning_rate": 3.7332812247762777e-06, + "loss": 0.87269831, + "num_input_tokens_seen": 3641545, + "step": 173, + "time_per_iteration": 2.6008615493774414 + }, + { + "auxiliary_loss_clip": 0.01417335, + "auxiliary_loss_mlp": 0.01069538, + "balance_loss_clip": 1.12345195, + "balance_loss_mlp": 1.0462451, + "epoch": 0.020922262971201828, + "flos": 19681274344320.0, + "grad_norm": 6.847690326791993, + "language_loss": 0.95587969, + "learning_rate": 3.737456717315293e-06, + "loss": 0.98074836, + "num_input_tokens_seen": 3660510, + "step": 174, + "time_per_iteration": 2.5305421352386475 + }, + { + "auxiliary_loss_clip": 0.01405171, + "auxiliary_loss_mlp": 0.01091433, + "balance_loss_clip": 1.11834955, + "balance_loss_mlp": 1.06803203, + "epoch": 0.021042505861840918, + "flos": 15666353694720.0, + "grad_norm": 1.815648506805503, + "language_loss": 0.90628594, + "learning_rate": 3.7416082813989552e-06, + "loss": 0.93125194, + "num_input_tokens_seen": 3677505, + "step": 175, + "time_per_iteration": 2.5492212772369385 + }, + { + "auxiliary_loss_clip": 0.01413574, + "auxiliary_loss_mlp": 0.01082537, + "balance_loss_clip": 1.1203258, + "balance_loss_mlp": 1.05855238, + "epoch": 0.02116274875248001, + "flos": 21142012734720.0, + "grad_norm": 2.1916805418685303, + "language_loss": 0.89451814, + "learning_rate": 3.745736189718439e-06, + "loss": 0.91947931, + "num_input_tokens_seen": 3696760, + "step": 176, + "time_per_iteration": 3.322657346725464 + }, + { + "auxiliary_loss_clip": 0.0140357, + "auxiliary_loss_mlp": 0.01068503, + "balance_loss_clip": 1.11489654, + "balance_loss_mlp": 1.04553139, + "epoch": 0.0212829916431191, + "flos": 24715770543360.0, + "grad_norm": 2.8373053680019047, + "language_loss": 0.7239967, + "learning_rate": 3.749840710329894e-06, + "loss": 0.74871743, + "num_input_tokens_seen": 3717465, + "step": 177, + "time_per_iteration": 2.5626890659332275 + }, + { + "auxiliary_loss_clip": 0.01416611, + "auxiliary_loss_mlp": 0.01086137, + "balance_loss_clip": 1.12006068, + "balance_loss_mlp": 1.06063795, + "epoch": 0.02140323453375819, + "flos": 16645493508480.0, + "grad_norm": 2.634715335817087, + "language_loss": 0.97967482, + "learning_rate": 3.7539221067588938e-06, + "loss": 1.00470233, + "num_input_tokens_seen": 3731440, + "step": 178, + "time_per_iteration": 3.2883355617523193 + }, + { + "auxiliary_loss_clip": 0.01409219, + "auxiliary_loss_mlp": 0.0108742, + "balance_loss_clip": 1.11634207, + "balance_loss_mlp": 1.06264794, + "epoch": 0.021523477424397284, + "flos": 20299332689280.0, + "grad_norm": 3.6613487270137393, + "language_loss": 0.94068992, + "learning_rate": 3.757980638101964e-06, + "loss": 0.96565634, + "num_input_tokens_seen": 3744935, + "step": 179, + "time_per_iteration": 3.287532329559326 + }, + { + "auxiliary_loss_clip": 0.01411468, + "auxiliary_loss_mlp": 0.01080334, + "balance_loss_clip": 1.11844802, + "balance_loss_mlp": 1.05425084, + "epoch": 0.021643720315036374, + "flos": 26104005331200.0, + "grad_norm": 2.14929302770171, + "language_loss": 0.89462525, + "learning_rate": 3.7620165591252806e-06, + "loss": 0.91954327, + "num_input_tokens_seen": 3763035, + "step": 180, + "time_per_iteration": 2.555919885635376 + }, + { + "auxiliary_loss_clip": 0.01401508, + "auxiliary_loss_mlp": 0.01076324, + "balance_loss_clip": 1.11743677, + "balance_loss_mlp": 1.05399609, + "epoch": 0.021763963205675464, + "flos": 24787663614720.0, + "grad_norm": 1.9176995488909003, + "language_loss": 0.94510317, + "learning_rate": 3.766030120360636e-06, + "loss": 0.96988147, + "num_input_tokens_seen": 3782665, + "step": 181, + "time_per_iteration": 2.580328941345215 + }, + { + "auxiliary_loss_clip": 0.01408209, + "auxiliary_loss_mlp": 0.01074559, + "balance_loss_clip": 1.11761904, + "balance_loss_mlp": 1.05233824, + "epoch": 0.021884206096314557, + "flos": 25813559957760.0, + "grad_norm": 2.313197753299018, + "language_loss": 0.90308601, + "learning_rate": 3.7700215681987578e-06, + "loss": 0.92791367, + "num_input_tokens_seen": 3802435, + "step": 182, + "time_per_iteration": 2.5972445011138916 + }, + { + "auxiliary_loss_clip": 0.01401456, + "auxiliary_loss_mlp": 0.01087562, + "balance_loss_clip": 1.11515021, + "balance_loss_mlp": 1.06369662, + "epoch": 0.022004448986953647, + "flos": 20082719721600.0, + "grad_norm": 1.788301507256928, + "language_loss": 0.82512677, + "learning_rate": 3.7739911449800767e-06, + "loss": 0.85001695, + "num_input_tokens_seen": 3822490, + "step": 183, + "time_per_iteration": 2.541797399520874 + }, + { + "auxiliary_loss_clip": 0.01401479, + "auxiliary_loss_mlp": 0.01087832, + "balance_loss_clip": 1.1139307, + "balance_loss_mlp": 1.06656528, + "epoch": 0.022124691877592736, + "flos": 20480609652480.0, + "grad_norm": 1.854011193243605, + "language_loss": 0.80808711, + "learning_rate": 3.7779390890830114e-06, + "loss": 0.83298028, + "num_input_tokens_seen": 3841140, + "step": 184, + "time_per_iteration": 2.5442826747894287 + }, + { + "auxiliary_loss_clip": 0.01401675, + "auxiliary_loss_mlp": 0.01081471, + "balance_loss_clip": 1.11399424, + "balance_loss_mlp": 1.05789113, + "epoch": 0.02224493476823183, + "flos": 23586847015680.0, + "grad_norm": 1.9192203708603637, + "language_loss": 0.86151493, + "learning_rate": 3.7818656350098723e-06, + "loss": 0.88634634, + "num_input_tokens_seen": 3862090, + "step": 185, + "time_per_iteration": 2.5849714279174805 + }, + { + "auxiliary_loss_clip": 0.01395894, + "auxiliary_loss_mlp": 0.01078642, + "balance_loss_clip": 1.10983944, + "balance_loss_mlp": 1.05431151, + "epoch": 0.02236517765887092, + "flos": 16909940413440.0, + "grad_norm": 3.5046287375721863, + "language_loss": 0.77585697, + "learning_rate": 3.7857710134704447e-06, + "loss": 0.80060244, + "num_input_tokens_seen": 3881025, + "step": 186, + "time_per_iteration": 2.5469424724578857 + }, + { + "auxiliary_loss_clip": 0.01396919, + "auxiliary_loss_mlp": 0.01059504, + "balance_loss_clip": 1.11540174, + "balance_loss_mlp": 1.03810585, + "epoch": 0.02248542054951001, + "flos": 43508182930560.0, + "grad_norm": 2.265521615882135, + "language_loss": 0.7927044, + "learning_rate": 3.7896554514633234e-06, + "loss": 0.81726867, + "num_input_tokens_seen": 3905310, + "step": 187, + "time_per_iteration": 2.7198057174682617 + }, + { + "auxiliary_loss_clip": 0.0139423, + "auxiliary_loss_mlp": 0.01069756, + "balance_loss_clip": 1.11241436, + "balance_loss_mlp": 1.04804802, + "epoch": 0.022605663440149103, + "flos": 23367648268800.0, + "grad_norm": 1.9529095567211885, + "language_loss": 0.84416461, + "learning_rate": 3.7935191723550955e-06, + "loss": 0.86880457, + "num_input_tokens_seen": 3924265, + "step": 188, + "time_per_iteration": 2.575063705444336 + }, + { + "auxiliary_loss_clip": 0.01392785, + "auxiliary_loss_mlp": 0.01072341, + "balance_loss_clip": 1.11107707, + "balance_loss_mlp": 1.05184913, + "epoch": 0.022725906330788193, + "flos": 29019915504000.0, + "grad_norm": 1.9325483060481101, + "language_loss": 0.88300174, + "learning_rate": 3.797362395957408e-06, + "loss": 0.90765303, + "num_input_tokens_seen": 3944830, + "step": 189, + "time_per_iteration": 2.59494686126709 + }, + { + "auxiliary_loss_clip": 0.01401822, + "auxiliary_loss_mlp": 0.01071747, + "balance_loss_clip": 1.11703348, + "balance_loss_mlp": 1.05001497, + "epoch": 0.022846149221427282, + "flos": 24496176746880.0, + "grad_norm": 3.852247199769434, + "language_loss": 0.78356338, + "learning_rate": 3.8011853386020055e-06, + "loss": 0.80829901, + "num_input_tokens_seen": 3965735, + "step": 190, + "time_per_iteration": 2.55912446975708 + }, + { + "auxiliary_loss_clip": 0.01397736, + "auxiliary_loss_mlp": 0.01084314, + "balance_loss_clip": 1.11481428, + "balance_loss_mlp": 1.061831, + "epoch": 0.022966392112066376, + "flos": 15523537219200.0, + "grad_norm": 3.1609162586887063, + "language_loss": 0.89613628, + "learning_rate": 3.804988213213804e-06, + "loss": 0.92095685, + "num_input_tokens_seen": 3983975, + "step": 191, + "time_per_iteration": 2.4898557662963867 + }, + { + "auxiliary_loss_clip": 0.01402283, + "auxiliary_loss_mlp": 0.01022892, + "balance_loss_clip": 1.1672411, + "balance_loss_mlp": 1.0118767, + "epoch": 0.023086635002705466, + "flos": 55650408433920.0, + "grad_norm": 1.0380405782245725, + "language_loss": 0.63190162, + "learning_rate": 3.808771229382049e-06, + "loss": 0.65615338, + "num_input_tokens_seen": 4043440, + "step": 192, + "time_per_iteration": 3.0167601108551025 + }, + { + "auxiliary_loss_clip": 0.01389308, + "auxiliary_loss_mlp": 0.01079339, + "balance_loss_clip": 1.11214149, + "balance_loss_mlp": 1.05882287, + "epoch": 0.023206877893344555, + "flos": 19313441118720.0, + "grad_norm": 2.930288162729014, + "language_loss": 0.84674656, + "learning_rate": 3.8125345934296324e-06, + "loss": 0.87143302, + "num_input_tokens_seen": 4061750, + "step": 193, + "time_per_iteration": 2.518778085708618 + }, + { + "auxiliary_loss_clip": 0.01390244, + "auxiliary_loss_mlp": 0.01077654, + "balance_loss_clip": 1.11097419, + "balance_loss_mlp": 1.05493283, + "epoch": 0.02332712078398365, + "flos": 23072965090560.0, + "grad_norm": 9.057110792122597, + "language_loss": 0.87965274, + "learning_rate": 3.81627850848061e-06, + "loss": 0.90433168, + "num_input_tokens_seen": 4082345, + "step": 194, + "time_per_iteration": 2.5747063159942627 + }, + { + "auxiliary_loss_clip": 0.01385222, + "auxiliary_loss_mlp": 0.01066258, + "balance_loss_clip": 1.10750687, + "balance_loss_mlp": 1.04568243, + "epoch": 0.02344736367462274, + "flos": 24425971614720.0, + "grad_norm": 2.2292398006617615, + "language_loss": 0.8652823, + "learning_rate": 3.820003174525994e-06, + "loss": 0.88979709, + "num_input_tokens_seen": 4101770, + "step": 195, + "time_per_iteration": 2.6050658226013184 + }, + { + "auxiliary_loss_clip": 0.01389548, + "auxiliary_loss_mlp": 0.0108103, + "balance_loss_clip": 1.11222684, + "balance_loss_mlp": 1.06010866, + "epoch": 0.02356760656526183, + "flos": 21579799697280.0, + "grad_norm": 2.4436298903508784, + "language_loss": 0.82977521, + "learning_rate": 3.823708788487851e-06, + "loss": 0.8544811, + "num_input_tokens_seen": 4118770, + "step": 196, + "time_per_iteration": 2.5470685958862305 + }, + { + "auxiliary_loss_clip": 0.01385902, + "auxiliary_loss_mlp": 0.01082816, + "balance_loss_clip": 1.10985732, + "balance_loss_mlp": 1.06326544, + "epoch": 0.02368784945590092, + "flos": 25193598192000.0, + "grad_norm": 1.9017040349659848, + "language_loss": 0.84712446, + "learning_rate": 3.827395544281781e-06, + "loss": 0.87181169, + "num_input_tokens_seen": 4141110, + "step": 197, + "time_per_iteration": 2.5880842208862305 + }, + { + "auxiliary_loss_clip": 0.01392244, + "auxiliary_loss_mlp": 0.01085233, + "balance_loss_clip": 1.11289191, + "balance_loss_mlp": 1.06408548, + "epoch": 0.02380809234654001, + "flos": 27562481164800.0, + "grad_norm": 1.8497562683026902, + "language_loss": 0.78934038, + "learning_rate": 3.831063632877802e-06, + "loss": 0.81411517, + "num_input_tokens_seen": 4161430, + "step": 198, + "time_per_iteration": 2.644836902618408 + }, + { + "auxiliary_loss_clip": 0.01388296, + "auxiliary_loss_mlp": 0.0107582, + "balance_loss_clip": 1.11625671, + "balance_loss_mlp": 1.05685329, + "epoch": 0.0239283352371791, + "flos": 18259786540800.0, + "grad_norm": 3.2409744469602355, + "language_loss": 0.7590993, + "learning_rate": 3.834713242359712e-06, + "loss": 0.78374046, + "num_input_tokens_seen": 4179260, + "step": 199, + "time_per_iteration": 2.5400943756103516 + }, + { + "auxiliary_loss_clip": 0.01390228, + "auxiliary_loss_mlp": 0.01077628, + "balance_loss_clip": 1.10933185, + "balance_loss_mlp": 1.05537176, + "epoch": 0.02404857812781819, + "flos": 21395110942080.0, + "grad_norm": 2.12032787396583, + "language_loss": 0.87314248, + "learning_rate": 3.838344557982959e-06, + "loss": 0.89782107, + "num_input_tokens_seen": 4200640, + "step": 200, + "time_per_iteration": 2.5728847980499268 + }, + { + "auxiliary_loss_clip": 0.01383727, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_clip": 1.10831118, + "balance_loss_mlp": 1.05407667, + "epoch": 0.024168821018457284, + "flos": 16654256426880.0, + "grad_norm": 3.125736507087999, + "language_loss": 0.84964693, + "learning_rate": 3.841957762231063e-06, + "loss": 0.87424338, + "num_input_tokens_seen": 4218170, + "step": 201, + "time_per_iteration": 2.5489203929901123 + }, + { + "auxiliary_loss_clip": 0.01380106, + "auxiliary_loss_mlp": 0.01066765, + "balance_loss_clip": 1.1055572, + "balance_loss_mlp": 1.04645181, + "epoch": 0.024289063909096374, + "flos": 22820872464000.0, + "grad_norm": 2.5792905742882968, + "language_loss": 0.88006091, + "learning_rate": 3.8455530348706454e-06, + "loss": 0.90452957, + "num_input_tokens_seen": 4237770, + "step": 202, + "time_per_iteration": 2.5241541862487793 + }, + { + "auxiliary_loss_clip": 0.01380503, + "auxiliary_loss_mlp": 0.01077916, + "balance_loss_clip": 1.10754681, + "balance_loss_mlp": 1.05890226, + "epoch": 0.024409306799735464, + "flos": 17748598135680.0, + "grad_norm": 2.438729885780499, + "language_loss": 0.77295017, + "learning_rate": 3.849130553005099e-06, + "loss": 0.79753435, + "num_input_tokens_seen": 4255985, + "step": 203, + "time_per_iteration": 2.517843246459961 + }, + { + "auxiliary_loss_clip": 0.01381676, + "auxiliary_loss_mlp": 0.01070859, + "balance_loss_clip": 1.10553384, + "balance_loss_mlp": 1.05188107, + "epoch": 0.024529549690374557, + "flos": 21616213109760.0, + "grad_norm": 1.9688360905024407, + "language_loss": 0.83921212, + "learning_rate": 3.852690491126933e-06, + "loss": 0.86373746, + "num_input_tokens_seen": 4276035, + "step": 204, + "time_per_iteration": 3.3108887672424316 + }, + { + "auxiliary_loss_clip": 0.01377017, + "auxiliary_loss_mlp": 0.01058524, + "balance_loss_clip": 1.10279655, + "balance_loss_mlp": 1.03819847, + "epoch": 0.024649792581013647, + "flos": 25551662918400.0, + "grad_norm": 2.573431844425089, + "language_loss": 0.91292673, + "learning_rate": 3.856233021168845e-06, + "loss": 0.93728215, + "num_input_tokens_seen": 4295730, + "step": 205, + "time_per_iteration": 4.158140420913696 + }, + { + "auxiliary_loss_clip": 0.01371777, + "auxiliary_loss_mlp": 0.01052953, + "balance_loss_clip": 1.10372138, + "balance_loss_mlp": 1.03515458, + "epoch": 0.024770035471652737, + "flos": 34495574544000.0, + "grad_norm": 3.7263828342652685, + "language_loss": 0.91193497, + "learning_rate": 3.859758312553544e-06, + "loss": 0.9361822, + "num_input_tokens_seen": 4317950, + "step": 206, + "time_per_iteration": 3.3903987407684326 + }, + { + "auxiliary_loss_clip": 0.01379205, + "auxiliary_loss_mlp": 0.01072392, + "balance_loss_clip": 1.10869694, + "balance_loss_mlp": 1.05366445, + "epoch": 0.02489027836229183, + "flos": 21505428587520.0, + "grad_norm": 11.065519307272993, + "language_loss": 0.91687691, + "learning_rate": 3.8632665322423735e-06, + "loss": 0.9413929, + "num_input_tokens_seen": 4337605, + "step": 207, + "time_per_iteration": 2.531292676925659 + }, + { + "auxiliary_loss_clip": 0.01377938, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_clip": 1.10608745, + "balance_loss_mlp": 1.04583359, + "epoch": 0.02501052125293092, + "flos": 23219013790080.0, + "grad_norm": 1.7802798643640798, + "language_loss": 0.85950285, + "learning_rate": 3.866757844782762e-06, + "loss": 0.88393259, + "num_input_tokens_seen": 4358110, + "step": 208, + "time_per_iteration": 2.5524511337280273 + }, + { + "auxiliary_loss_clip": 0.01376961, + "auxiliary_loss_mlp": 0.01065747, + "balance_loss_clip": 1.10692191, + "balance_loss_mlp": 1.0463872, + "epoch": 0.02513076414357001, + "flos": 26388920010240.0, + "grad_norm": 4.521592518536236, + "language_loss": 0.91567683, + "learning_rate": 3.870232412354527e-06, + "loss": 0.94010395, + "num_input_tokens_seen": 4374955, + "step": 209, + "time_per_iteration": 2.543482780456543 + }, + { + "auxiliary_loss_clip": 0.01373997, + "auxiliary_loss_mlp": 0.01063983, + "balance_loss_clip": 1.10460401, + "balance_loss_mlp": 1.04483831, + "epoch": 0.025251007034209103, + "flos": 13590430047360.0, + "grad_norm": 1.914701900603714, + "language_loss": 0.92761451, + "learning_rate": 3.873690394815086e-06, + "loss": 0.9519943, + "num_input_tokens_seen": 4391535, + "step": 210, + "time_per_iteration": 2.496335983276367 + }, + { + "auxiliary_loss_clip": 0.01371273, + "auxiliary_loss_mlp": 0.01059487, + "balance_loss_clip": 1.10121536, + "balance_loss_mlp": 1.04058027, + "epoch": 0.025371249924848193, + "flos": 15049229103360.0, + "grad_norm": 2.6583762682006062, + "language_loss": 0.91372192, + "learning_rate": 3.877131949743587e-06, + "loss": 0.93802941, + "num_input_tokens_seen": 4408400, + "step": 211, + "time_per_iteration": 2.4998562335968018 + }, + { + "auxiliary_loss_clip": 0.01372666, + "auxiliary_loss_mlp": 0.0108012, + "balance_loss_clip": 1.1043458, + "balance_loss_mlp": 1.06096339, + "epoch": 0.025491492815487283, + "flos": 25553853648000.0, + "grad_norm": 2.027955702331032, + "language_loss": 0.7822789, + "learning_rate": 3.880557232483993e-06, + "loss": 0.80680668, + "num_input_tokens_seen": 4427840, + "step": 212, + "time_per_iteration": 2.5699496269226074 + }, + { + "auxiliary_loss_clip": 0.01370906, + "auxiliary_loss_mlp": 0.01062825, + "balance_loss_clip": 1.100631, + "balance_loss_mlp": 1.04363227, + "epoch": 0.025611735706126376, + "flos": 20630752502400.0, + "grad_norm": 2.092042044481166, + "language_loss": 0.86853957, + "learning_rate": 3.883966396187164e-06, + "loss": 0.89287686, + "num_input_tokens_seen": 4447110, + "step": 213, + "time_per_iteration": 2.514939308166504 + }, + { + "auxiliary_loss_clip": 0.01373712, + "auxiliary_loss_mlp": 0.01062067, + "balance_loss_clip": 1.10519934, + "balance_loss_mlp": 1.04379237, + "epoch": 0.025731978596765466, + "flos": 19062282245760.0, + "grad_norm": 2.1047223452143533, + "language_loss": 0.9013173, + "learning_rate": 3.887359591851937e-06, + "loss": 0.92567509, + "num_input_tokens_seen": 4464715, + "step": 214, + "time_per_iteration": 2.521629571914673 + }, + { + "auxiliary_loss_clip": 0.01369111, + "auxiliary_loss_mlp": 0.01058783, + "balance_loss_clip": 1.10313976, + "balance_loss_mlp": 1.03964972, + "epoch": 0.025852221487404556, + "flos": 22163814927360.0, + "grad_norm": 1.7361939760560665, + "language_loss": 0.92229509, + "learning_rate": 3.890736968365265e-06, + "loss": 0.94657397, + "num_input_tokens_seen": 4485030, + "step": 215, + "time_per_iteration": 2.5245988368988037 + }, + { + "auxiliary_loss_clip": 0.01370425, + "auxiliary_loss_mlp": 0.01061352, + "balance_loss_clip": 1.10214257, + "balance_loss_mlp": 1.04095566, + "epoch": 0.02597246437804365, + "flos": 26541971861760.0, + "grad_norm": 2.3657616820463945, + "language_loss": 0.8502574, + "learning_rate": 3.894098672541412e-06, + "loss": 0.8745752, + "num_input_tokens_seen": 4505935, + "step": 216, + "time_per_iteration": 2.5550570487976074 + }, + { + "auxiliary_loss_clip": 0.01370472, + "auxiliary_loss_mlp": 0.01068833, + "balance_loss_clip": 1.10191047, + "balance_loss_mlp": 1.04841232, + "epoch": 0.02609270726868274, + "flos": 32671671696000.0, + "grad_norm": 3.3730990437831085, + "language_loss": 0.75483173, + "learning_rate": 3.89744484916025e-06, + "loss": 0.77922475, + "num_input_tokens_seen": 4527045, + "step": 217, + "time_per_iteration": 2.5992558002471924 + }, + { + "auxiliary_loss_clip": 0.0137079, + "auxiliary_loss_mlp": 0.01067659, + "balance_loss_clip": 1.10230899, + "balance_loss_mlp": 1.04740524, + "epoch": 0.02621295015932183, + "flos": 26243553669120.0, + "grad_norm": 2.0405233171400115, + "language_loss": 0.87248671, + "learning_rate": 3.900775641004673e-06, + "loss": 0.89687121, + "num_input_tokens_seen": 4546360, + "step": 218, + "time_per_iteration": 2.532458543777466 + }, + { + "auxiliary_loss_clip": 0.01377343, + "auxiliary_loss_mlp": 0.01074584, + "balance_loss_clip": 1.10563469, + "balance_loss_mlp": 1.0518626, + "epoch": 0.026333193049960922, + "flos": 42921402353280.0, + "grad_norm": 2.767793871035358, + "language_loss": 0.7434454, + "learning_rate": 3.904091188897156e-06, + "loss": 0.76796472, + "num_input_tokens_seen": 4565495, + "step": 219, + "time_per_iteration": 2.7515268325805664 + }, + { + "auxiliary_loss_clip": 0.01368197, + "auxiliary_loss_mlp": 0.01075617, + "balance_loss_clip": 1.10076344, + "balance_loss_mlp": 1.05481505, + "epoch": 0.026453435940600012, + "flos": 17963846386560.0, + "grad_norm": 2.302433214367086, + "language_loss": 0.81957054, + "learning_rate": 3.90739163173548e-06, + "loss": 0.84400868, + "num_input_tokens_seen": 4583330, + "step": 220, + "time_per_iteration": 2.5082905292510986 + }, + { + "auxiliary_loss_clip": 0.01366445, + "auxiliary_loss_mlp": 0.01070764, + "balance_loss_clip": 1.10053861, + "balance_loss_mlp": 1.051548, + "epoch": 0.026573678831239102, + "flos": 18984319776000.0, + "grad_norm": 2.4686040300066248, + "language_loss": 0.88337493, + "learning_rate": 3.910677106527646e-06, + "loss": 0.90774703, + "num_input_tokens_seen": 4600520, + "step": 221, + "time_per_iteration": 2.51016902923584 + }, + { + "auxiliary_loss_clip": 0.01363827, + "auxiliary_loss_mlp": 0.01068368, + "balance_loss_clip": 1.10002804, + "balance_loss_mlp": 1.04991436, + "epoch": 0.026693921721878195, + "flos": 29241448634880.0, + "grad_norm": 4.887649570779633, + "language_loss": 0.84137398, + "learning_rate": 3.913947748426004e-06, + "loss": 0.86569595, + "num_input_tokens_seen": 4617340, + "step": 222, + "time_per_iteration": 2.5692780017852783 + }, + { + "auxiliary_loss_clip": 0.01368926, + "auxiliary_loss_mlp": 0.01066482, + "balance_loss_clip": 1.10241842, + "balance_loss_mlp": 1.04767025, + "epoch": 0.026814164612517285, + "flos": 14128083797760.0, + "grad_norm": 5.4855746413733995, + "language_loss": 0.76465851, + "learning_rate": 3.9172036907606136e-06, + "loss": 0.78901261, + "num_input_tokens_seen": 4630820, + "step": 223, + "time_per_iteration": 2.454571008682251 + }, + { + "auxiliary_loss_clip": 0.01367086, + "auxiliary_loss_mlp": 0.01064258, + "balance_loss_clip": 1.09898829, + "balance_loss_mlp": 1.04481435, + "epoch": 0.026934407503156375, + "flos": 23511973115520.0, + "grad_norm": 1.9817330814430627, + "language_loss": 0.95113736, + "learning_rate": 3.920445065071855e-06, + "loss": 0.97545075, + "num_input_tokens_seen": 4651985, + "step": 224, + "time_per_iteration": 2.564382553100586 + }, + { + "auxiliary_loss_clip": 0.01364004, + "auxiliary_loss_mlp": 0.01072121, + "balance_loss_clip": 1.09905529, + "balance_loss_mlp": 1.05251098, + "epoch": 0.027054650393795468, + "flos": 28950356816640.0, + "grad_norm": 5.645860463749995, + "language_loss": 0.80066538, + "learning_rate": 3.923672001142322e-06, + "loss": 0.82502663, + "num_input_tokens_seen": 4672295, + "step": 225, + "time_per_iteration": 2.574038028717041 + }, + { + "auxiliary_loss_clip": 0.01359479, + "auxiliary_loss_mlp": 0.01079039, + "balance_loss_clip": 1.09643006, + "balance_loss_mlp": 1.05920255, + "epoch": 0.027174893284434558, + "flos": 31431568596480.0, + "grad_norm": 2.397447962906849, + "language_loss": 0.84523952, + "learning_rate": 3.926884627027996e-06, + "loss": 0.86962467, + "num_input_tokens_seen": 4696065, + "step": 226, + "time_per_iteration": 2.5938305854797363 + }, + { + "auxiliary_loss_clip": 0.01361368, + "auxiliary_loss_mlp": 0.01070239, + "balance_loss_clip": 1.0956924, + "balance_loss_mlp": 1.05153537, + "epoch": 0.027295136175073648, + "flos": 22054466949120.0, + "grad_norm": 2.0278945649438453, + "language_loss": 0.77424455, + "learning_rate": 3.930083069088744e-06, + "loss": 0.79856062, + "num_input_tokens_seen": 4716065, + "step": 227, + "time_per_iteration": 2.5249006748199463 + }, + { + "auxiliary_loss_clip": 0.01334356, + "auxiliary_loss_mlp": 0.01077997, + "balance_loss_clip": 1.11732769, + "balance_loss_mlp": 1.06321526, + "epoch": 0.02741537906571274, + "flos": 60800752972800.0, + "grad_norm": 0.988272673829982, + "language_loss": 0.5919171, + "learning_rate": 3.933267452018137e-06, + "loss": 0.61604059, + "num_input_tokens_seen": 4775860, + "step": 228, + "time_per_iteration": 3.090804100036621 + }, + { + "auxiliary_loss_clip": 0.01360834, + "auxiliary_loss_mlp": 0.0106248, + "balance_loss_clip": 1.09876895, + "balance_loss_mlp": 1.04325175, + "epoch": 0.02753562195635183, + "flos": 24606278910720.0, + "grad_norm": 2.0474886718029697, + "language_loss": 0.84290993, + "learning_rate": 3.936437898872622e-06, + "loss": 0.86714315, + "num_input_tokens_seen": 4795835, + "step": 229, + "time_per_iteration": 2.592053174972534 + }, + { + "auxiliary_loss_clip": 0.01360707, + "auxiliary_loss_mlp": 0.01057845, + "balance_loss_clip": 1.0975877, + "balance_loss_mlp": 1.03921294, + "epoch": 0.02765586484699092, + "flos": 34094236907520.0, + "grad_norm": 2.42105056624181, + "language_loss": 0.79662192, + "learning_rate": 3.9395945311000525e-06, + "loss": 0.82080746, + "num_input_tokens_seen": 4817460, + "step": 230, + "time_per_iteration": 2.634876012802124 + }, + { + "auxiliary_loss_clip": 0.01362803, + "auxiliary_loss_mlp": 0.01072821, + "balance_loss_clip": 1.09878993, + "balance_loss_mlp": 1.05350888, + "epoch": 0.027776107737630014, + "flos": 14829922615680.0, + "grad_norm": 2.092932739472263, + "language_loss": 0.90712649, + "learning_rate": 3.942737468567608e-06, + "loss": 0.93148279, + "num_input_tokens_seen": 4835475, + "step": 231, + "time_per_iteration": 3.263613700866699 + }, + { + "auxiliary_loss_clip": 0.01361028, + "auxiliary_loss_mlp": 0.01072752, + "balance_loss_clip": 1.09874249, + "balance_loss_mlp": 1.05371428, + "epoch": 0.027896350628269104, + "flos": 47920347066240.0, + "grad_norm": 1.9826093397407576, + "language_loss": 0.86328733, + "learning_rate": 3.9458668295891026e-06, + "loss": 0.88762516, + "num_input_tokens_seen": 4857760, + "step": 232, + "time_per_iteration": 3.459953784942627 + }, + { + "auxiliary_loss_clip": 0.01355582, + "auxiliary_loss_mlp": 0.01061804, + "balance_loss_clip": 1.0930438, + "balance_loss_mlp": 1.04147863, + "epoch": 0.028016593518908194, + "flos": 21684550734720.0, + "grad_norm": 2.540725644261303, + "language_loss": 0.86893064, + "learning_rate": 3.948982730951712e-06, + "loss": 0.89310449, + "num_input_tokens_seen": 4875855, + "step": 233, + "time_per_iteration": 3.378103494644165 + }, + { + "auxiliary_loss_clip": 0.01360022, + "auxiliary_loss_mlp": 0.01065256, + "balance_loss_clip": 1.09621572, + "balance_loss_mlp": 1.04535949, + "epoch": 0.028136836409547287, + "flos": 18439483305600.0, + "grad_norm": 2.3403275770508767, + "language_loss": 0.81747532, + "learning_rate": 3.9520852879421254e-06, + "loss": 0.84172809, + "num_input_tokens_seen": 4893200, + "step": 234, + "time_per_iteration": 3.238353967666626 + }, + { + "auxiliary_loss_clip": 0.01354508, + "auxiliary_loss_mlp": 0.01072417, + "balance_loss_clip": 1.09505105, + "balance_loss_mlp": 1.05423784, + "epoch": 0.028257079300186377, + "flos": 31576934937600.0, + "grad_norm": 2.399611303228527, + "language_loss": 0.81857347, + "learning_rate": 3.955174614372137e-06, + "loss": 0.8428427, + "num_input_tokens_seen": 4912965, + "step": 235, + "time_per_iteration": 2.5765774250030518 + }, + { + "auxiliary_loss_clip": 0.01357386, + "auxiliary_loss_mlp": 0.01069566, + "balance_loss_clip": 1.09624577, + "balance_loss_mlp": 1.05009902, + "epoch": 0.028377322190825467, + "flos": 23513337832320.0, + "grad_norm": 3.4646144960759004, + "language_loss": 0.84361625, + "learning_rate": 3.9582508226037045e-06, + "loss": 0.86788571, + "num_input_tokens_seen": 4933105, + "step": 236, + "time_per_iteration": 2.593547821044922 + }, + { + "auxiliary_loss_clip": 0.01364442, + "auxiliary_loss_mlp": 0.01080945, + "balance_loss_clip": 1.09747219, + "balance_loss_mlp": 1.06050038, + "epoch": 0.02849756508146456, + "flos": 20479604071680.0, + "grad_norm": 2.626401673026312, + "language_loss": 0.94296819, + "learning_rate": 3.9613140235734636e-06, + "loss": 0.96742207, + "num_input_tokens_seen": 4950085, + "step": 237, + "time_per_iteration": 2.504316568374634 + }, + { + "auxiliary_loss_clip": 0.01354819, + "auxiliary_loss_mlp": 0.01068594, + "balance_loss_clip": 1.09395039, + "balance_loss_mlp": 1.04832888, + "epoch": 0.02861780797210365, + "flos": 14283362292480.0, + "grad_norm": 1.987726433843816, + "language_loss": 0.81226784, + "learning_rate": 3.96436432681674e-06, + "loss": 0.83650202, + "num_input_tokens_seen": 4968075, + "step": 238, + "time_per_iteration": 2.4844777584075928 + }, + { + "auxiliary_loss_clip": 0.0135512, + "auxiliary_loss_mlp": 0.01071437, + "balance_loss_clip": 1.09406614, + "balance_loss_mlp": 1.05204177, + "epoch": 0.02873805086274274, + "flos": 25808532053760.0, + "grad_norm": 2.4155043749967127, + "language_loss": 0.89347875, + "learning_rate": 3.967401840491044e-06, + "loss": 0.91774434, + "num_input_tokens_seen": 4987355, + "step": 239, + "time_per_iteration": 2.549912452697754 + }, + { + "auxiliary_loss_clip": 0.01352498, + "auxiliary_loss_mlp": 0.01070063, + "balance_loss_clip": 1.09601831, + "balance_loss_mlp": 1.05265832, + "epoch": 0.028858293753381833, + "flos": 17304238984320.0, + "grad_norm": 2.676311652762329, + "language_loss": 0.87879658, + "learning_rate": 3.97042667139909e-06, + "loss": 0.90302217, + "num_input_tokens_seen": 5004680, + "step": 240, + "time_per_iteration": 2.491797685623169 + }, + { + "auxiliary_loss_clip": 0.01354218, + "auxiliary_loss_mlp": 0.0106491, + "balance_loss_clip": 1.09551525, + "balance_loss_mlp": 1.04615855, + "epoch": 0.028978536644020923, + "flos": 23038347358080.0, + "grad_norm": 2.1018041484895296, + "language_loss": 0.87803209, + "learning_rate": 3.973438925011327e-06, + "loss": 0.90222335, + "num_input_tokens_seen": 5022965, + "step": 241, + "time_per_iteration": 2.570103883743286 + }, + { + "auxiliary_loss_clip": 0.01353684, + "auxiliary_loss_mlp": 0.01051098, + "balance_loss_clip": 1.09301424, + "balance_loss_mlp": 1.03190553, + "epoch": 0.029098779534660012, + "flos": 28329712692480.0, + "grad_norm": 2.5576929287314347, + "language_loss": 0.91379148, + "learning_rate": 3.976438705488002e-06, + "loss": 0.93783927, + "num_input_tokens_seen": 5042625, + "step": 242, + "time_per_iteration": 2.592794418334961 + }, + { + "auxiliary_loss_clip": 0.0135092, + "auxiliary_loss_mlp": 0.0106363, + "balance_loss_clip": 1.09478378, + "balance_loss_mlp": 1.0461061, + "epoch": 0.029219022425299106, + "flos": 13881665520000.0, + "grad_norm": 2.5280614664977072, + "language_loss": 0.93100679, + "learning_rate": 3.9794261157007744e-06, + "loss": 0.95515227, + "num_input_tokens_seen": 5060380, + "step": 243, + "time_per_iteration": 2.5896942615509033 + }, + { + "auxiliary_loss_clip": 0.01356432, + "auxiliary_loss_mlp": 0.01061906, + "balance_loss_clip": 1.09618497, + "balance_loss_mlp": 1.04195046, + "epoch": 0.029339265315938196, + "flos": 19422501788160.0, + "grad_norm": 2.109807418241014, + "language_loss": 0.84701347, + "learning_rate": 3.982401257253887e-06, + "loss": 0.87119687, + "num_input_tokens_seen": 5078720, + "step": 244, + "time_per_iteration": 2.5891332626342773 + }, + { + "auxiliary_loss_clip": 0.01352592, + "auxiliary_loss_mlp": 0.01059747, + "balance_loss_clip": 1.09394896, + "balance_loss_mlp": 1.04203284, + "epoch": 0.029459508206577285, + "flos": 15669550005120.0, + "grad_norm": 2.8971911256440963, + "language_loss": 0.89753604, + "learning_rate": 3.985364230504893e-06, + "loss": 0.92165941, + "num_input_tokens_seen": 5096605, + "step": 245, + "time_per_iteration": 2.607832908630371 + }, + { + "auxiliary_loss_clip": 0.01359355, + "auxiliary_loss_mlp": 0.01067658, + "balance_loss_clip": 1.09924781, + "balance_loss_mlp": 1.05038452, + "epoch": 0.02957975109721638, + "flos": 28220975245440.0, + "grad_norm": 2.2527161371439908, + "language_loss": 0.84525084, + "learning_rate": 3.988315134584976e-06, + "loss": 0.86952102, + "num_input_tokens_seen": 5116285, + "step": 246, + "time_per_iteration": 2.6695690155029297 + }, + { + "auxiliary_loss_clip": 0.01355058, + "auxiliary_loss_mlp": 0.01067614, + "balance_loss_clip": 1.09607279, + "balance_loss_mlp": 1.04880238, + "epoch": 0.02969999398785547, + "flos": 24315869450880.0, + "grad_norm": 1.8485859146399621, + "language_loss": 0.80376291, + "learning_rate": 3.991254067418851e-06, + "loss": 0.82798964, + "num_input_tokens_seen": 5136825, + "step": 247, + "time_per_iteration": 2.643104076385498 + }, + { + "auxiliary_loss_clip": 0.01345538, + "auxiliary_loss_mlp": 0.01068566, + "balance_loss_clip": 1.0934912, + "balance_loss_mlp": 1.05101871, + "epoch": 0.02982023687849456, + "flos": 35078584193280.0, + "grad_norm": 2.057889283917218, + "language_loss": 0.82960624, + "learning_rate": 3.994181125744254e-06, + "loss": 0.85374725, + "num_input_tokens_seen": 5158630, + "step": 248, + "time_per_iteration": 2.7324092388153076 + }, + { + "auxiliary_loss_clip": 0.01349721, + "auxiliary_loss_mlp": 0.01061026, + "balance_loss_clip": 1.09452152, + "balance_loss_mlp": 1.04359722, + "epoch": 0.02994047976913365, + "flos": 26177155378560.0, + "grad_norm": 1.8365260418773277, + "language_loss": 0.7406131, + "learning_rate": 3.99709640513106e-06, + "loss": 0.7647205, + "num_input_tokens_seen": 5179510, + "step": 249, + "time_per_iteration": 2.570517063140869 + }, + { + "auxiliary_loss_clip": 0.01349676, + "auxiliary_loss_mlp": 0.01066141, + "balance_loss_clip": 1.09127975, + "balance_loss_mlp": 1.04675794, + "epoch": 0.03006072265977274, + "flos": 25625028447360.0, + "grad_norm": 2.1312757339695203, + "language_loss": 0.85661417, + "learning_rate": 4e-06, + "loss": 0.88077235, + "num_input_tokens_seen": 5199345, + "step": 250, + "time_per_iteration": 2.5370521545410156 + }, + { + "auxiliary_loss_clip": 0.01350725, + "auxiliary_loss_mlp": 0.01060236, + "balance_loss_clip": 1.09559369, + "balance_loss_mlp": 1.04311752, + "epoch": 0.03018096555041183, + "flos": 22127078292480.0, + "grad_norm": 7.031118081795415, + "language_loss": 0.88818896, + "learning_rate": 3.999999848300794e-06, + "loss": 0.91229868, + "num_input_tokens_seen": 5218330, + "step": 251, + "time_per_iteration": 2.539456844329834 + }, + { + "auxiliary_loss_clip": 0.01342835, + "auxiliary_loss_mlp": 0.01056742, + "balance_loss_clip": 1.08909047, + "balance_loss_mlp": 1.03903937, + "epoch": 0.030301208441050925, + "flos": 30188197359360.0, + "grad_norm": 1.6473734173783947, + "language_loss": 0.88951945, + "learning_rate": 3.999999393203203e-06, + "loss": 0.91351521, + "num_input_tokens_seen": 5240740, + "step": 252, + "time_per_iteration": 2.56982684135437 + }, + { + "auxiliary_loss_clip": 0.01341757, + "auxiliary_loss_mlp": 0.01059889, + "balance_loss_clip": 1.08728623, + "balance_loss_mlp": 1.04277015, + "epoch": 0.030421451331690014, + "flos": 23621392920960.0, + "grad_norm": 2.431972451369968, + "language_loss": 0.85291088, + "learning_rate": 3.999998634707293e-06, + "loss": 0.87692732, + "num_input_tokens_seen": 5260290, + "step": 253, + "time_per_iteration": 2.575003147125244 + }, + { + "auxiliary_loss_clip": 0.01352043, + "auxiliary_loss_mlp": 0.01063324, + "balance_loss_clip": 1.09604883, + "balance_loss_mlp": 1.0453589, + "epoch": 0.030541694222329104, + "flos": 27928446883200.0, + "grad_norm": 2.6863918847428763, + "language_loss": 0.96575165, + "learning_rate": 3.999997572813182e-06, + "loss": 0.98990536, + "num_input_tokens_seen": 5278100, + "step": 254, + "time_per_iteration": 2.5469977855682373 + }, + { + "auxiliary_loss_clip": 0.01345333, + "auxiliary_loss_mlp": 0.0106962, + "balance_loss_clip": 1.09076774, + "balance_loss_mlp": 1.05213213, + "epoch": 0.030661937112968194, + "flos": 18588441006720.0, + "grad_norm": 4.722009759145655, + "language_loss": 0.87619752, + "learning_rate": 3.999996207521028e-06, + "loss": 0.90034711, + "num_input_tokens_seen": 5296810, + "step": 255, + "time_per_iteration": 2.5415143966674805 + }, + { + "auxiliary_loss_clip": 0.01348768, + "auxiliary_loss_mlp": 0.01058025, + "balance_loss_clip": 1.09013677, + "balance_loss_mlp": 1.039011, + "epoch": 0.030782180003607287, + "flos": 12969139478400.0, + "grad_norm": 2.3994916801587833, + "language_loss": 0.82472408, + "learning_rate": 3.999994538831039e-06, + "loss": 0.84879202, + "num_input_tokens_seen": 5313395, + "step": 256, + "time_per_iteration": 2.4928524494171143 + }, + { + "auxiliary_loss_clip": 0.01345532, + "auxiliary_loss_mlp": 0.01059168, + "balance_loss_clip": 1.09062111, + "balance_loss_mlp": 1.04005921, + "epoch": 0.030902422894246377, + "flos": 23335364920320.0, + "grad_norm": 2.6655494813262575, + "language_loss": 0.8581593, + "learning_rate": 3.99999256674347e-06, + "loss": 0.88220626, + "num_input_tokens_seen": 5333545, + "step": 257, + "time_per_iteration": 3.3387093544006348 + }, + { + "auxiliary_loss_clip": 0.01304656, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_clip": 1.10229373, + "balance_loss_mlp": 1.02955675, + "epoch": 0.031022665784885467, + "flos": 55094151438720.0, + "grad_norm": 1.0099928657566553, + "language_loss": 0.53480977, + "learning_rate": 3.999990291258618e-06, + "loss": 0.55829304, + "num_input_tokens_seen": 5392235, + "step": 258, + "time_per_iteration": 3.051557779312134 + }, + { + "auxiliary_loss_clip": 0.01344231, + "auxiliary_loss_mlp": 0.01061055, + "balance_loss_clip": 1.0903585, + "balance_loss_mlp": 1.04322112, + "epoch": 0.03114290867552456, + "flos": 19317786664320.0, + "grad_norm": 3.8969201173723005, + "language_loss": 0.86756158, + "learning_rate": 3.999987712376829e-06, + "loss": 0.89161444, + "num_input_tokens_seen": 5410555, + "step": 259, + "time_per_iteration": 3.413259983062744 + }, + { + "auxiliary_loss_clip": 0.01343181, + "auxiliary_loss_mlp": 0.01059481, + "balance_loss_clip": 1.0915283, + "balance_loss_mlp": 1.04175413, + "epoch": 0.031263151566163654, + "flos": 20959442881920.0, + "grad_norm": 2.211860651607376, + "language_loss": 0.82125264, + "learning_rate": 3.999984830098494e-06, + "loss": 0.84527922, + "num_input_tokens_seen": 5430135, + "step": 260, + "time_per_iteration": 2.5220329761505127 + }, + { + "auxiliary_loss_clip": 0.01339961, + "auxiliary_loss_mlp": 0.01063405, + "balance_loss_clip": 1.08798361, + "balance_loss_mlp": 1.04521334, + "epoch": 0.03138339445680274, + "flos": 14793006412800.0, + "grad_norm": 5.903850409695519, + "language_loss": 0.98065579, + "learning_rate": 3.999981644424051e-06, + "loss": 1.00468946, + "num_input_tokens_seen": 5444935, + "step": 261, + "time_per_iteration": 3.396998643875122 + }, + { + "auxiliary_loss_clip": 0.01342885, + "auxiliary_loss_mlp": 0.01072589, + "balance_loss_clip": 1.09211588, + "balance_loss_mlp": 1.05332541, + "epoch": 0.03150363734744183, + "flos": 11655599022720.0, + "grad_norm": 2.630978039724578, + "language_loss": 0.86223084, + "learning_rate": 3.999978155353982e-06, + "loss": 0.88638556, + "num_input_tokens_seen": 5462080, + "step": 262, + "time_per_iteration": 2.4938228130340576 + }, + { + "auxiliary_loss_clip": 0.01340191, + "auxiliary_loss_mlp": 0.01070084, + "balance_loss_clip": 1.0884409, + "balance_loss_mlp": 1.05116534, + "epoch": 0.03162388023808092, + "flos": 33727732485120.0, + "grad_norm": 2.649327536096108, + "language_loss": 0.80542767, + "learning_rate": 3.9999743628888186e-06, + "loss": 0.82953036, + "num_input_tokens_seen": 5483870, + "step": 263, + "time_per_iteration": 2.6233174800872803 + }, + { + "auxiliary_loss_clip": 0.0133352, + "auxiliary_loss_mlp": 0.0105853, + "balance_loss_clip": 1.08486366, + "balance_loss_mlp": 1.04041016, + "epoch": 0.03174412312872001, + "flos": 20810952057600.0, + "grad_norm": 2.2379722597908263, + "language_loss": 0.89522231, + "learning_rate": 3.999970267029133e-06, + "loss": 0.91914284, + "num_input_tokens_seen": 5502830, + "step": 264, + "time_per_iteration": 2.5259580612182617 + }, + { + "auxiliary_loss_clip": 0.01335858, + "auxiliary_loss_mlp": 0.01059879, + "balance_loss_clip": 1.0876497, + "balance_loss_mlp": 1.04192591, + "epoch": 0.0318643660193591, + "flos": 23727939638400.0, + "grad_norm": 1.737513959245869, + "language_loss": 0.80131078, + "learning_rate": 3.999965867775548e-06, + "loss": 0.82526809, + "num_input_tokens_seen": 5523225, + "step": 265, + "time_per_iteration": 2.531316041946411 + }, + { + "auxiliary_loss_clip": 0.01337675, + "auxiliary_loss_mlp": 0.01068127, + "balance_loss_clip": 1.0875113, + "balance_loss_mlp": 1.05037618, + "epoch": 0.0319846089099982, + "flos": 13917863450880.0, + "grad_norm": 2.2992564104272875, + "language_loss": 0.86757058, + "learning_rate": 3.9999611651287315e-06, + "loss": 0.89162862, + "num_input_tokens_seen": 5541380, + "step": 266, + "time_per_iteration": 2.490023612976074 + }, + { + "auxiliary_loss_clip": 0.01341798, + "auxiliary_loss_mlp": 0.01063917, + "balance_loss_clip": 1.09049046, + "balance_loss_mlp": 1.0460242, + "epoch": 0.03210485180063729, + "flos": 14753253035520.0, + "grad_norm": 2.8891847246666607, + "language_loss": 0.7864753, + "learning_rate": 3.999956159089396e-06, + "loss": 0.81053245, + "num_input_tokens_seen": 5558830, + "step": 267, + "time_per_iteration": 2.4564454555511475 + }, + { + "auxiliary_loss_clip": 0.01338667, + "auxiliary_loss_mlp": 0.01071203, + "balance_loss_clip": 1.08925581, + "balance_loss_mlp": 1.05288005, + "epoch": 0.03222509469127638, + "flos": 28913153304960.0, + "grad_norm": 2.300808642043789, + "language_loss": 0.79762089, + "learning_rate": 3.999950849658302e-06, + "loss": 0.82171959, + "num_input_tokens_seen": 5577750, + "step": 268, + "time_per_iteration": 2.5569043159484863 + }, + { + "auxiliary_loss_clip": 0.01345131, + "auxiliary_loss_mlp": 0.01067944, + "balance_loss_clip": 1.09130991, + "balance_loss_mlp": 1.05003905, + "epoch": 0.03234533758191547, + "flos": 16946389739520.0, + "grad_norm": 2.924318003872924, + "language_loss": 0.84219098, + "learning_rate": 3.999945236836254e-06, + "loss": 0.86632174, + "num_input_tokens_seen": 5596715, + "step": 269, + "time_per_iteration": 2.4971113204956055 + }, + { + "auxiliary_loss_clip": 0.01344112, + "auxiliary_loss_mlp": 0.0106993, + "balance_loss_clip": 1.09242821, + "balance_loss_mlp": 1.05034423, + "epoch": 0.03246558047255456, + "flos": 18989096284800.0, + "grad_norm": 3.1800056935421757, + "language_loss": 0.95129263, + "learning_rate": 3.999939320624103e-06, + "loss": 0.97543311, + "num_input_tokens_seen": 5611865, + "step": 270, + "time_per_iteration": 2.485318422317505 + }, + { + "auxiliary_loss_clip": 0.01341205, + "auxiliary_loss_mlp": 0.01071208, + "balance_loss_clip": 1.09086657, + "balance_loss_mlp": 1.05295742, + "epoch": 0.03258582336319365, + "flos": 23728334688000.0, + "grad_norm": 1.9764671367909006, + "language_loss": 0.89835531, + "learning_rate": 3.999933101022749e-06, + "loss": 0.92247939, + "num_input_tokens_seen": 5632270, + "step": 271, + "time_per_iteration": 2.564082622528076 + }, + { + "auxiliary_loss_clip": 0.0133869, + "auxiliary_loss_mlp": 0.01068345, + "balance_loss_clip": 1.08979499, + "balance_loss_mlp": 1.04996312, + "epoch": 0.032706066253832745, + "flos": 27670823562240.0, + "grad_norm": 1.9796735763299091, + "language_loss": 0.86787832, + "learning_rate": 3.999926578033132e-06, + "loss": 0.8919487, + "num_input_tokens_seen": 5652085, + "step": 272, + "time_per_iteration": 2.599562883377075 + }, + { + "auxiliary_loss_clip": 0.01336984, + "auxiliary_loss_mlp": 0.01067388, + "balance_loss_clip": 1.08594632, + "balance_loss_mlp": 1.04904151, + "epoch": 0.032826309144471835, + "flos": 45624685968000.0, + "grad_norm": 2.2889200841507304, + "language_loss": 0.63257587, + "learning_rate": 3.999919751656244e-06, + "loss": 0.65661955, + "num_input_tokens_seen": 5678985, + "step": 273, + "time_per_iteration": 2.7432477474212646 + }, + { + "auxiliary_loss_clip": 0.01334162, + "auxiliary_loss_mlp": 0.01061414, + "balance_loss_clip": 1.08565664, + "balance_loss_mlp": 1.04210186, + "epoch": 0.032946552035110925, + "flos": 25812374808960.0, + "grad_norm": 3.045135988792467, + "language_loss": 0.75878394, + "learning_rate": 3.9999126218931195e-06, + "loss": 0.7827397, + "num_input_tokens_seen": 5697020, + "step": 274, + "time_per_iteration": 2.575251817703247 + }, + { + "auxiliary_loss_clip": 0.01339846, + "auxiliary_loss_mlp": 0.01053298, + "balance_loss_clip": 1.09053946, + "balance_loss_mlp": 1.03511882, + "epoch": 0.033066794925750015, + "flos": 15121984101120.0, + "grad_norm": 3.434165482442146, + "language_loss": 0.89831567, + "learning_rate": 3.99990518874484e-06, + "loss": 0.92224705, + "num_input_tokens_seen": 5713460, + "step": 275, + "time_per_iteration": 2.5150575637817383 + }, + { + "auxiliary_loss_clip": 0.01338367, + "auxiliary_loss_mlp": 0.01067756, + "balance_loss_clip": 1.09014893, + "balance_loss_mlp": 1.04998183, + "epoch": 0.033187037816389105, + "flos": 22776593973120.0, + "grad_norm": 2.7461679448867544, + "language_loss": 0.92698133, + "learning_rate": 3.999897452212534e-06, + "loss": 0.95104253, + "num_input_tokens_seen": 5730790, + "step": 276, + "time_per_iteration": 2.527181625366211 + }, + { + "auxiliary_loss_clip": 0.01331969, + "auxiliary_loss_mlp": 0.01066118, + "balance_loss_clip": 1.08577979, + "balance_loss_mlp": 1.04740238, + "epoch": 0.033307280707028195, + "flos": 23331414424320.0, + "grad_norm": 2.179013422042405, + "language_loss": 1.00495708, + "learning_rate": 3.999889412297374e-06, + "loss": 1.02893794, + "num_input_tokens_seen": 5750215, + "step": 277, + "time_per_iteration": 2.518770694732666 + }, + { + "auxiliary_loss_clip": 0.0133281, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.08493567, + "balance_loss_mlp": 1.0303396, + "epoch": 0.03342752359766729, + "flos": 28840290566400.0, + "grad_norm": 2.8570186209186796, + "language_loss": 0.7881788, + "learning_rate": 3.999881069000581e-06, + "loss": 0.81198013, + "num_input_tokens_seen": 5769945, + "step": 278, + "time_per_iteration": 2.568476915359497 + }, + { + "auxiliary_loss_clip": 0.01333882, + "auxiliary_loss_mlp": 0.01057542, + "balance_loss_clip": 1.08555245, + "balance_loss_mlp": 1.038445, + "epoch": 0.03354776648830638, + "flos": 19384544090880.0, + "grad_norm": 2.9167783875535207, + "language_loss": 0.87105525, + "learning_rate": 3.99987242232342e-06, + "loss": 0.89496952, + "num_input_tokens_seen": 5784950, + "step": 279, + "time_per_iteration": 2.4600019454956055 + }, + { + "auxiliary_loss_clip": 0.01336555, + "auxiliary_loss_mlp": 0.01064321, + "balance_loss_clip": 1.08977258, + "balance_loss_mlp": 1.04556894, + "epoch": 0.03366800937894547, + "flos": 17858628472320.0, + "grad_norm": 2.1558087775206602, + "language_loss": 0.79768413, + "learning_rate": 3.9998634722672026e-06, + "loss": 0.82169294, + "num_input_tokens_seen": 5805005, + "step": 280, + "time_per_iteration": 2.4988930225372314 + }, + { + "auxiliary_loss_clip": 0.01335839, + "auxiliary_loss_mlp": 0.01056894, + "balance_loss_clip": 1.08912122, + "balance_loss_mlp": 1.03941774, + "epoch": 0.03378825226958456, + "flos": 35951033635200.0, + "grad_norm": 4.708304669967321, + "language_loss": 0.78568453, + "learning_rate": 3.999854218833286e-06, + "loss": 0.80961186, + "num_input_tokens_seen": 5825825, + "step": 281, + "time_per_iteration": 2.6269450187683105 + }, + { + "auxiliary_loss_clip": 0.01335172, + "auxiliary_loss_mlp": 0.01061528, + "balance_loss_clip": 1.08924055, + "balance_loss_mlp": 1.04349184, + "epoch": 0.03390849516022365, + "flos": 25702488126720.0, + "grad_norm": 2.046197934265432, + "language_loss": 0.82014465, + "learning_rate": 3.999844662023075e-06, + "loss": 0.84411168, + "num_input_tokens_seen": 5845700, + "step": 282, + "time_per_iteration": 2.624467611312866 + }, + { + "auxiliary_loss_clip": 0.0132729, + "auxiliary_loss_mlp": 0.01057091, + "balance_loss_clip": 1.08505869, + "balance_loss_mlp": 1.03918576, + "epoch": 0.03402873805086274, + "flos": 21284505987840.0, + "grad_norm": 2.38817751911425, + "language_loss": 0.9241792, + "learning_rate": 3.999834801838018e-06, + "loss": 0.94802296, + "num_input_tokens_seen": 5864680, + "step": 283, + "time_per_iteration": 2.5722880363464355 + }, + { + "auxiliary_loss_clip": 0.01328388, + "auxiliary_loss_mlp": 0.01055549, + "balance_loss_clip": 1.08540606, + "balance_loss_mlp": 1.03790617, + "epoch": 0.03414898094150183, + "flos": 22710913954560.0, + "grad_norm": 2.690237992939667, + "language_loss": 0.74035811, + "learning_rate": 3.9998246382796115e-06, + "loss": 0.76419753, + "num_input_tokens_seen": 5884260, + "step": 284, + "time_per_iteration": 2.5084428787231445 + }, + { + "auxiliary_loss_clip": 0.01331572, + "auxiliary_loss_mlp": 0.01051912, + "balance_loss_clip": 1.08360291, + "balance_loss_mlp": 1.03293395, + "epoch": 0.03426922383214093, + "flos": 18879927874560.0, + "grad_norm": 2.3554148044201404, + "language_loss": 0.91031516, + "learning_rate": 3.999814171349399e-06, + "loss": 0.93414998, + "num_input_tokens_seen": 5902120, + "step": 285, + "time_per_iteration": 3.2675235271453857 + }, + { + "auxiliary_loss_clip": 0.01328037, + "auxiliary_loss_mlp": 0.01057979, + "balance_loss_clip": 1.08513415, + "balance_loss_mlp": 1.04095626, + "epoch": 0.03438946672278002, + "flos": 34752012716160.0, + "grad_norm": 1.9871252320276223, + "language_loss": 0.73578537, + "learning_rate": 3.9998034010489655e-06, + "loss": 0.75964558, + "num_input_tokens_seen": 5925810, + "step": 286, + "time_per_iteration": 2.642399787902832 + }, + { + "auxiliary_loss_clip": 0.01328379, + "auxiliary_loss_mlp": 0.01065283, + "balance_loss_clip": 1.08706367, + "balance_loss_mlp": 1.04810476, + "epoch": 0.03450970961341911, + "flos": 22164102236160.0, + "grad_norm": 2.249825948352589, + "language_loss": 0.76129889, + "learning_rate": 3.999792327379946e-06, + "loss": 0.78523546, + "num_input_tokens_seen": 5945185, + "step": 287, + "time_per_iteration": 4.911510229110718 + }, + { + "auxiliary_loss_clip": 0.01333937, + "auxiliary_loss_mlp": 0.01062381, + "balance_loss_clip": 1.09100664, + "balance_loss_mlp": 1.04513097, + "epoch": 0.034629952504058197, + "flos": 21725740656000.0, + "grad_norm": 2.546643900846086, + "language_loss": 0.96285677, + "learning_rate": 3.999780950344021e-06, + "loss": 0.98681998, + "num_input_tokens_seen": 5963375, + "step": 288, + "time_per_iteration": 2.5098876953125 + }, + { + "auxiliary_loss_clip": 0.01333357, + "auxiliary_loss_mlp": 0.0106592, + "balance_loss_clip": 1.08696699, + "balance_loss_mlp": 1.04733539, + "epoch": 0.034750195394697286, + "flos": 20047994248320.0, + "grad_norm": 1.8878023902374277, + "language_loss": 0.82841927, + "learning_rate": 3.999769269942916e-06, + "loss": 0.85241205, + "num_input_tokens_seen": 5983415, + "step": 289, + "time_per_iteration": 2.5221378803253174 + }, + { + "auxiliary_loss_clip": 0.01328378, + "auxiliary_loss_mlp": 0.01054838, + "balance_loss_clip": 1.08521628, + "balance_loss_mlp": 1.03693295, + "epoch": 0.034870438285336376, + "flos": 27965865876480.0, + "grad_norm": 1.8017181204797872, + "language_loss": 0.8110379, + "learning_rate": 3.999757286178402e-06, + "loss": 0.8348701, + "num_input_tokens_seen": 6005850, + "step": 290, + "time_per_iteration": 2.5938706398010254 + }, + { + "auxiliary_loss_clip": 0.01331033, + "auxiliary_loss_mlp": 0.01053461, + "balance_loss_clip": 1.08730459, + "balance_loss_mlp": 1.0352931, + "epoch": 0.03499068117597547, + "flos": 22017514832640.0, + "grad_norm": 2.2751823159358224, + "language_loss": 0.90712178, + "learning_rate": 3.999744999052299e-06, + "loss": 0.93096673, + "num_input_tokens_seen": 6027240, + "step": 291, + "time_per_iteration": 2.580021381378174 + }, + { + "auxiliary_loss_clip": 0.01296254, + "auxiliary_loss_mlp": 0.0101107, + "balance_loss_clip": 1.11012912, + "balance_loss_mlp": 0.99953097, + "epoch": 0.03511092406661456, + "flos": 57242147725440.0, + "grad_norm": 0.9575470453671735, + "language_loss": 0.61195493, + "learning_rate": 3.9997324085664675e-06, + "loss": 0.63502812, + "num_input_tokens_seen": 6087470, + "step": 292, + "time_per_iteration": 3.0615618228912354 + }, + { + "auxiliary_loss_clip": 0.01326155, + "auxiliary_loss_mlp": 0.01059632, + "balance_loss_clip": 1.08412242, + "balance_loss_mlp": 1.04129732, + "epoch": 0.03523116695725365, + "flos": 22928065626240.0, + "grad_norm": 2.192033855550171, + "language_loss": 0.92056733, + "learning_rate": 3.999719514722821e-06, + "loss": 0.94442517, + "num_input_tokens_seen": 6107600, + "step": 293, + "time_per_iteration": 2.5036370754241943 + }, + { + "auxiliary_loss_clip": 0.01322728, + "auxiliary_loss_mlp": 0.01056308, + "balance_loss_clip": 1.08326459, + "balance_loss_mlp": 1.0395596, + "epoch": 0.03535140984789274, + "flos": 36903241226880.0, + "grad_norm": 2.1360042386923976, + "language_loss": 0.74641192, + "learning_rate": 3.999706317523314e-06, + "loss": 0.77020228, + "num_input_tokens_seen": 6126160, + "step": 294, + "time_per_iteration": 2.6289920806884766 + }, + { + "auxiliary_loss_clip": 0.01324414, + "auxiliary_loss_mlp": 0.01056242, + "balance_loss_clip": 1.08429885, + "balance_loss_mlp": 1.03957653, + "epoch": 0.03547165273853183, + "flos": 20449152316800.0, + "grad_norm": 3.143754370236453, + "language_loss": 0.85977232, + "learning_rate": 3.999692816969948e-06, + "loss": 0.88357884, + "num_input_tokens_seen": 6145695, + "step": 295, + "time_per_iteration": 2.5379278659820557 + }, + { + "auxiliary_loss_clip": 0.0128319, + "auxiliary_loss_mlp": 0.01010781, + "balance_loss_clip": 1.10108447, + "balance_loss_mlp": 0.99967021, + "epoch": 0.03559189562917092, + "flos": 69850564871040.0, + "grad_norm": 1.0287677883225497, + "language_loss": 0.69377422, + "learning_rate": 3.999679013064772e-06, + "loss": 0.71671391, + "num_input_tokens_seen": 6212440, + "step": 296, + "time_per_iteration": 3.141838312149048 + }, + { + "auxiliary_loss_clip": 0.01328582, + "auxiliary_loss_mlp": 0.01061071, + "balance_loss_clip": 1.08670473, + "balance_loss_mlp": 1.04359508, + "epoch": 0.03571213851981002, + "flos": 21651944163840.0, + "grad_norm": 2.4278313957226962, + "language_loss": 0.85654485, + "learning_rate": 3.99966490580988e-06, + "loss": 0.88044143, + "num_input_tokens_seen": 6229800, + "step": 297, + "time_per_iteration": 2.5418715476989746 + }, + { + "auxiliary_loss_clip": 0.01329879, + "auxiliary_loss_mlp": 0.01059272, + "balance_loss_clip": 1.08607018, + "balance_loss_mlp": 1.04155755, + "epoch": 0.03583238141044911, + "flos": 43945610757120.0, + "grad_norm": 2.2834553406901335, + "language_loss": 0.65798414, + "learning_rate": 3.999650495207411e-06, + "loss": 0.68187559, + "num_input_tokens_seen": 6255825, + "step": 298, + "time_per_iteration": 2.7448337078094482 + }, + { + "auxiliary_loss_clip": 0.01322379, + "auxiliary_loss_mlp": 0.01061102, + "balance_loss_clip": 1.08434451, + "balance_loss_mlp": 1.04314899, + "epoch": 0.0359526243010882, + "flos": 18910810592640.0, + "grad_norm": 2.699118205142571, + "language_loss": 0.90292114, + "learning_rate": 3.999635781259553e-06, + "loss": 0.92675602, + "num_input_tokens_seen": 6271090, + "step": 299, + "time_per_iteration": 2.511906385421753 + }, + { + "auxiliary_loss_clip": 0.01265766, + "auxiliary_loss_mlp": 0.01010648, + "balance_loss_clip": 1.08814216, + "balance_loss_mlp": 0.99968082, + "epoch": 0.03607286719172729, + "flos": 61668892782720.0, + "grad_norm": 0.9161323993730158, + "language_loss": 0.52298796, + "learning_rate": 3.999620763968535e-06, + "loss": 0.54575211, + "num_input_tokens_seen": 6329965, + "step": 300, + "time_per_iteration": 2.9446473121643066 + }, + { + "auxiliary_loss_clip": 0.01322788, + "auxiliary_loss_mlp": 0.0105647, + "balance_loss_clip": 1.08577251, + "balance_loss_mlp": 1.03857672, + "epoch": 0.03619311008236638, + "flos": 27819062991360.0, + "grad_norm": 1.6555977356602007, + "language_loss": 0.86348355, + "learning_rate": 3.999605443336638e-06, + "loss": 0.88727611, + "num_input_tokens_seen": 6352095, + "step": 301, + "time_per_iteration": 2.584638833999634 + }, + { + "auxiliary_loss_clip": 0.0132967, + "auxiliary_loss_mlp": 0.01062383, + "balance_loss_clip": 1.0882982, + "balance_loss_mlp": 1.04447746, + "epoch": 0.03631335297300547, + "flos": 13621133197440.0, + "grad_norm": 2.5957059082305594, + "language_loss": 0.89140511, + "learning_rate": 3.999589819366185e-06, + "loss": 0.91532558, + "num_input_tokens_seen": 6365885, + "step": 302, + "time_per_iteration": 2.509063959121704 + }, + { + "auxiliary_loss_clip": 0.01328681, + "auxiliary_loss_mlp": 0.01055599, + "balance_loss_clip": 1.08693552, + "balance_loss_mlp": 1.03703856, + "epoch": 0.036433595863644565, + "flos": 27631788456960.0, + "grad_norm": 2.0264428370210426, + "language_loss": 0.84895611, + "learning_rate": 3.999573892059547e-06, + "loss": 0.87279892, + "num_input_tokens_seen": 6385015, + "step": 303, + "time_per_iteration": 2.649162769317627 + }, + { + "auxiliary_loss_clip": 0.01332284, + "auxiliary_loss_mlp": 0.01062724, + "balance_loss_clip": 1.08786666, + "balance_loss_mlp": 1.04318523, + "epoch": 0.036553838754283655, + "flos": 24572020314240.0, + "grad_norm": 4.729620865465135, + "language_loss": 0.81245923, + "learning_rate": 3.999557661419138e-06, + "loss": 0.83640927, + "num_input_tokens_seen": 6405165, + "step": 304, + "time_per_iteration": 2.6427083015441895 + }, + { + "auxiliary_loss_clip": 0.01331544, + "auxiliary_loss_mlp": 0.01056599, + "balance_loss_clip": 1.09016681, + "balance_loss_mlp": 1.03994584, + "epoch": 0.036674081644922744, + "flos": 23404313076480.0, + "grad_norm": 1.980264391209871, + "language_loss": 0.81468773, + "learning_rate": 3.9995411274474225e-06, + "loss": 0.83856916, + "num_input_tokens_seen": 6424445, + "step": 305, + "time_per_iteration": 2.6122148036956787 + }, + { + "auxiliary_loss_clip": 0.01327508, + "auxiliary_loss_mlp": 0.01064969, + "balance_loss_clip": 1.08598101, + "balance_loss_mlp": 1.04646766, + "epoch": 0.036794324535561834, + "flos": 27489690253440.0, + "grad_norm": 2.537369263596587, + "language_loss": 0.81363618, + "learning_rate": 3.999524290146908e-06, + "loss": 0.83756095, + "num_input_tokens_seen": 6444650, + "step": 306, + "time_per_iteration": 2.6239359378814697 + }, + { + "auxiliary_loss_clip": 0.01327072, + "auxiliary_loss_mlp": 0.01064032, + "balance_loss_clip": 1.08922243, + "balance_loss_mlp": 1.0458169, + "epoch": 0.036914567426200924, + "flos": 19463476227840.0, + "grad_norm": 2.2926271255330035, + "language_loss": 0.92621684, + "learning_rate": 3.9995071495201485e-06, + "loss": 0.95012784, + "num_input_tokens_seen": 6461755, + "step": 307, + "time_per_iteration": 2.5787405967712402 + }, + { + "auxiliary_loss_clip": 0.01325617, + "auxiliary_loss_mlp": 0.01057346, + "balance_loss_clip": 1.08691669, + "balance_loss_mlp": 1.03859472, + "epoch": 0.037034810316840014, + "flos": 22309324922880.0, + "grad_norm": 2.457564993699083, + "language_loss": 0.97762251, + "learning_rate": 3.999489705569744e-06, + "loss": 1.00145221, + "num_input_tokens_seen": 6479455, + "step": 308, + "time_per_iteration": 2.5974178314208984 + }, + { + "auxiliary_loss_clip": 0.01322516, + "auxiliary_loss_mlp": 0.01063312, + "balance_loss_clip": 1.08328605, + "balance_loss_mlp": 1.045645, + "epoch": 0.03715505320747911, + "flos": 18588333265920.0, + "grad_norm": 3.272398825942866, + "language_loss": 0.86725533, + "learning_rate": 3.999471958298341e-06, + "loss": 0.89111364, + "num_input_tokens_seen": 6498365, + "step": 309, + "time_per_iteration": 2.5538294315338135 + }, + { + "auxiliary_loss_clip": 0.01331067, + "auxiliary_loss_mlp": 0.01069612, + "balance_loss_clip": 1.08935523, + "balance_loss_mlp": 1.05031228, + "epoch": 0.0372752960981182, + "flos": 35955343267200.0, + "grad_norm": 2.286976248839816, + "language_loss": 0.7611773, + "learning_rate": 3.999453907708631e-06, + "loss": 0.78518409, + "num_input_tokens_seen": 6520770, + "step": 310, + "time_per_iteration": 2.708791732788086 + }, + { + "auxiliary_loss_clip": 0.01327323, + "auxiliary_loss_mlp": 0.01051579, + "balance_loss_clip": 1.08778858, + "balance_loss_mlp": 1.03472304, + "epoch": 0.03739553898875729, + "flos": 20814040627200.0, + "grad_norm": 2.1290641195766087, + "language_loss": 0.8118763, + "learning_rate": 3.999435553803353e-06, + "loss": 0.83566535, + "num_input_tokens_seen": 6540170, + "step": 311, + "time_per_iteration": 2.6099495887756348 + }, + { + "auxiliary_loss_clip": 0.0132369, + "auxiliary_loss_mlp": 0.01064064, + "balance_loss_clip": 1.08587921, + "balance_loss_mlp": 1.04633772, + "epoch": 0.03751578187939638, + "flos": 20264140339200.0, + "grad_norm": 2.66932750298438, + "language_loss": 0.8341676, + "learning_rate": 3.999416896585292e-06, + "loss": 0.8580451, + "num_input_tokens_seen": 6557200, + "step": 312, + "time_per_iteration": 3.437772750854492 + }, + { + "auxiliary_loss_clip": 0.0132523, + "auxiliary_loss_mlp": 0.01057606, + "balance_loss_clip": 1.08547044, + "balance_loss_mlp": 1.03953362, + "epoch": 0.03763602477003547, + "flos": 20668063754880.0, + "grad_norm": 2.6752869306198748, + "language_loss": 0.85858917, + "learning_rate": 3.9993979360572775e-06, + "loss": 0.8824175, + "num_input_tokens_seen": 6577340, + "step": 313, + "time_per_iteration": 3.3978238105773926 + }, + { + "auxiliary_loss_clip": 0.01332563, + "auxiliary_loss_mlp": 0.0106021, + "balance_loss_clip": 1.09025753, + "balance_loss_mlp": 1.04224527, + "epoch": 0.03775626766067456, + "flos": 16691352197760.0, + "grad_norm": 2.581312795847191, + "language_loss": 0.82825291, + "learning_rate": 3.999378672222185e-06, + "loss": 0.85218066, + "num_input_tokens_seen": 6595125, + "step": 314, + "time_per_iteration": 3.9229483604431152 + }, + { + "auxiliary_loss_clip": 0.01326893, + "auxiliary_loss_mlp": 0.01056831, + "balance_loss_clip": 1.08860445, + "balance_loss_mlp": 1.03741145, + "epoch": 0.03787651055131366, + "flos": 21141797253120.0, + "grad_norm": 2.0342078606871747, + "language_loss": 0.82576346, + "learning_rate": 3.9993591050829385e-06, + "loss": 0.84960079, + "num_input_tokens_seen": 6612990, + "step": 315, + "time_per_iteration": 2.5938942432403564 + }, + { + "auxiliary_loss_clip": 0.01328284, + "auxiliary_loss_mlp": 0.01066695, + "balance_loss_clip": 1.08934808, + "balance_loss_mlp": 1.04812193, + "epoch": 0.037996753441952746, + "flos": 22018089450240.0, + "grad_norm": 2.4057036791192656, + "language_loss": 0.79293507, + "learning_rate": 3.999339234642506e-06, + "loss": 0.81688488, + "num_input_tokens_seen": 6632740, + "step": 316, + "time_per_iteration": 2.6008734703063965 + }, + { + "auxiliary_loss_clip": 0.01326737, + "auxiliary_loss_mlp": 0.01052005, + "balance_loss_clip": 1.08912241, + "balance_loss_mlp": 1.03288436, + "epoch": 0.038116996332591836, + "flos": 27709391790720.0, + "grad_norm": 2.025643207695263, + "language_loss": 0.83883595, + "learning_rate": 3.9993190609038994e-06, + "loss": 0.86262333, + "num_input_tokens_seen": 6651505, + "step": 317, + "time_per_iteration": 2.6177659034729004 + }, + { + "auxiliary_loss_clip": 0.01318055, + "auxiliary_loss_mlp": 0.01053225, + "balance_loss_clip": 1.08364534, + "balance_loss_mlp": 1.03500974, + "epoch": 0.038237239223230926, + "flos": 21178067011200.0, + "grad_norm": 2.672723726459501, + "language_loss": 0.83313006, + "learning_rate": 3.999298583870182e-06, + "loss": 0.85684288, + "num_input_tokens_seen": 6671090, + "step": 318, + "time_per_iteration": 2.5851194858551025 + }, + { + "auxiliary_loss_clip": 0.01320377, + "auxiliary_loss_mlp": 0.01060886, + "balance_loss_clip": 1.08458877, + "balance_loss_mlp": 1.04276586, + "epoch": 0.038357482113870016, + "flos": 25556618995200.0, + "grad_norm": 1.777345067792429, + "language_loss": 0.77511567, + "learning_rate": 3.999277803544458e-06, + "loss": 0.79892826, + "num_input_tokens_seen": 6691245, + "step": 319, + "time_per_iteration": 2.6483538150787354 + }, + { + "auxiliary_loss_clip": 0.01231492, + "auxiliary_loss_mlp": 0.01019452, + "balance_loss_clip": 1.06167054, + "balance_loss_mlp": 1.00953352, + "epoch": 0.038477725004509106, + "flos": 59227578034560.0, + "grad_norm": 1.4865256816277614, + "language_loss": 0.62394178, + "learning_rate": 3.999256719929882e-06, + "loss": 0.64645123, + "num_input_tokens_seen": 6752520, + "step": 320, + "time_per_iteration": 3.094872236251831 + }, + { + "auxiliary_loss_clip": 0.01230377, + "auxiliary_loss_mlp": 0.01015774, + "balance_loss_clip": 1.06042171, + "balance_loss_mlp": 1.00580812, + "epoch": 0.0385979678951482, + "flos": 67317676398720.0, + "grad_norm": 1.2101276577943494, + "language_loss": 0.670331, + "learning_rate": 3.999235333029651e-06, + "loss": 0.69279253, + "num_input_tokens_seen": 6806460, + "step": 321, + "time_per_iteration": 3.001925230026245 + }, + { + "auxiliary_loss_clip": 0.01319312, + "auxiliary_loss_mlp": 0.01056926, + "balance_loss_clip": 1.08648396, + "balance_loss_mlp": 1.0400219, + "epoch": 0.03871821078578729, + "flos": 22746752749440.0, + "grad_norm": 1.9237780889146128, + "language_loss": 0.81981397, + "learning_rate": 3.999213642847009e-06, + "loss": 0.84357643, + "num_input_tokens_seen": 6827045, + "step": 322, + "time_per_iteration": 2.6191458702087402 + }, + { + "auxiliary_loss_clip": 0.01320242, + "auxiliary_loss_mlp": 0.01056025, + "balance_loss_clip": 1.08479762, + "balance_loss_mlp": 1.03871632, + "epoch": 0.03883845367642638, + "flos": 26280613526400.0, + "grad_norm": 1.8329261185335868, + "language_loss": 0.91452098, + "learning_rate": 3.999191649385247e-06, + "loss": 0.93828368, + "num_input_tokens_seen": 6848220, + "step": 323, + "time_per_iteration": 2.6254429817199707 + }, + { + "auxiliary_loss_clip": 0.01226636, + "auxiliary_loss_mlp": 0.01007457, + "balance_loss_clip": 1.05850315, + "balance_loss_mlp": 0.99792022, + "epoch": 0.03895869656706547, + "flos": 56962835568000.0, + "grad_norm": 0.9128708985822461, + "language_loss": 0.59792387, + "learning_rate": 3.999169352647702e-06, + "loss": 0.62026477, + "num_input_tokens_seen": 6909400, + "step": 324, + "time_per_iteration": 3.0429677963256836 + }, + { + "auxiliary_loss_clip": 0.01322206, + "auxiliary_loss_mlp": 0.0107795, + "balance_loss_clip": 1.08558095, + "balance_loss_mlp": 1.05882883, + "epoch": 0.03907893945770456, + "flos": 24863363527680.0, + "grad_norm": 1.8247516543324356, + "language_loss": 0.83096236, + "learning_rate": 3.999146752637755e-06, + "loss": 0.8549639, + "num_input_tokens_seen": 6930445, + "step": 325, + "time_per_iteration": 2.6552157402038574 + }, + { + "auxiliary_loss_clip": 0.01319275, + "auxiliary_loss_mlp": 0.01059028, + "balance_loss_clip": 1.08412313, + "balance_loss_mlp": 1.0406698, + "epoch": 0.03919918234834365, + "flos": 18368595815040.0, + "grad_norm": 2.7615801444642702, + "language_loss": 0.89452446, + "learning_rate": 3.999123849358836e-06, + "loss": 0.91830754, + "num_input_tokens_seen": 6948110, + "step": 326, + "time_per_iteration": 2.5374817848205566 + }, + { + "auxiliary_loss_clip": 0.01320353, + "auxiliary_loss_mlp": 0.01061401, + "balance_loss_clip": 1.0855751, + "balance_loss_mlp": 1.04260147, + "epoch": 0.03931942523898275, + "flos": 25225414663680.0, + "grad_norm": 2.5502410578561068, + "language_loss": 0.74760568, + "learning_rate": 3.999100642814418e-06, + "loss": 0.77142322, + "num_input_tokens_seen": 6968550, + "step": 327, + "time_per_iteration": 2.615022659301758 + }, + { + "auxiliary_loss_clip": 0.01320131, + "auxiliary_loss_mlp": 0.01061968, + "balance_loss_clip": 1.08636844, + "balance_loss_mlp": 1.04343057, + "epoch": 0.03943966812962184, + "flos": 23257905240960.0, + "grad_norm": 3.2534658503126574, + "language_loss": 0.88650692, + "learning_rate": 3.999077133008022e-06, + "loss": 0.91032791, + "num_input_tokens_seen": 6987135, + "step": 328, + "time_per_iteration": 2.557554244995117 + }, + { + "auxiliary_loss_clip": 0.01321379, + "auxiliary_loss_mlp": 0.01065223, + "balance_loss_clip": 1.08625197, + "balance_loss_mlp": 1.04481435, + "epoch": 0.03955991102026093, + "flos": 29168837291520.0, + "grad_norm": 1.8830009201283515, + "language_loss": 0.9026013, + "learning_rate": 3.9990533199432145e-06, + "loss": 0.9264673, + "num_input_tokens_seen": 7008630, + "step": 329, + "time_per_iteration": 2.749346971511841 + }, + { + "auxiliary_loss_clip": 0.01319137, + "auxiliary_loss_mlp": 0.01056034, + "balance_loss_clip": 1.084324, + "balance_loss_mlp": 1.03769994, + "epoch": 0.03968015391090002, + "flos": 17602441695360.0, + "grad_norm": 2.260068834338427, + "language_loss": 0.75783455, + "learning_rate": 3.999029203623608e-06, + "loss": 0.78158623, + "num_input_tokens_seen": 7026350, + "step": 330, + "time_per_iteration": 2.5551459789276123 + }, + { + "auxiliary_loss_clip": 0.01314413, + "auxiliary_loss_mlp": 0.01051914, + "balance_loss_clip": 1.08268523, + "balance_loss_mlp": 1.0334003, + "epoch": 0.03980039680153911, + "flos": 21799285752960.0, + "grad_norm": 2.06365449955655, + "language_loss": 0.86942321, + "learning_rate": 3.99900478405286e-06, + "loss": 0.89308643, + "num_input_tokens_seen": 7045660, + "step": 331, + "time_per_iteration": 2.5484280586242676 + }, + { + "auxiliary_loss_clip": 0.01318587, + "auxiliary_loss_mlp": 0.01059819, + "balance_loss_clip": 1.08791137, + "balance_loss_mlp": 1.04371357, + "epoch": 0.0399206396921782, + "flos": 15195134148480.0, + "grad_norm": 2.76715508324197, + "language_loss": 0.82306921, + "learning_rate": 3.998980061234676e-06, + "loss": 0.84685326, + "num_input_tokens_seen": 7063575, + "step": 332, + "time_per_iteration": 2.6399519443511963 + }, + { + "auxiliary_loss_clip": 0.01322188, + "auxiliary_loss_mlp": 0.01058873, + "balance_loss_clip": 1.08416867, + "balance_loss_mlp": 1.03963268, + "epoch": 0.040040882582817294, + "flos": 14422910630400.0, + "grad_norm": 3.6203703624551644, + "language_loss": 0.75698757, + "learning_rate": 3.9989550351728055e-06, + "loss": 0.78079814, + "num_input_tokens_seen": 7080505, + "step": 333, + "time_per_iteration": 2.613090753555298 + }, + { + "auxiliary_loss_clip": 0.01318246, + "auxiliary_loss_mlp": 0.01058101, + "balance_loss_clip": 1.08566666, + "balance_loss_mlp": 1.0406009, + "epoch": 0.040161125473456384, + "flos": 19280906375040.0, + "grad_norm": 2.251075568033566, + "language_loss": 0.84644306, + "learning_rate": 3.998929705871046e-06, + "loss": 0.87020648, + "num_input_tokens_seen": 7097860, + "step": 334, + "time_per_iteration": 2.616380214691162 + }, + { + "auxiliary_loss_clip": 0.01316935, + "auxiliary_loss_mlp": 0.01057557, + "balance_loss_clip": 1.08613563, + "balance_loss_mlp": 1.0400331, + "epoch": 0.040281368364095474, + "flos": 17821101738240.0, + "grad_norm": 2.7471022746200613, + "language_loss": 0.89322639, + "learning_rate": 3.99890407333324e-06, + "loss": 0.91697133, + "num_input_tokens_seen": 7116390, + "step": 335, + "time_per_iteration": 2.543278932571411 + }, + { + "auxiliary_loss_clip": 0.0131353, + "auxiliary_loss_mlp": 0.01058075, + "balance_loss_clip": 1.08022022, + "balance_loss_mlp": 1.03991914, + "epoch": 0.040401611254734564, + "flos": 19573757959680.0, + "grad_norm": 1.9418844287797874, + "language_loss": 0.87220156, + "learning_rate": 3.998878137563275e-06, + "loss": 0.89591759, + "num_input_tokens_seen": 7135940, + "step": 336, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.01316435, + "auxiliary_loss_mlp": 0.01053083, + "balance_loss_clip": 1.08287001, + "balance_loss_mlp": 1.03467703, + "epoch": 0.040521854145373654, + "flos": 22054466949120.0, + "grad_norm": 5.718891235477159, + "language_loss": 0.85277283, + "learning_rate": 3.998851898565085e-06, + "loss": 0.87646806, + "num_input_tokens_seen": 7155745, + "step": 337, + "time_per_iteration": 2.5689029693603516 + }, + { + "auxiliary_loss_clip": 0.01312839, + "auxiliary_loss_mlp": 0.01047776, + "balance_loss_clip": 1.08150196, + "balance_loss_mlp": 1.03039503, + "epoch": 0.04064209703601274, + "flos": 22674644196480.0, + "grad_norm": 1.9341216700014996, + "language_loss": 0.82864231, + "learning_rate": 3.998825356342653e-06, + "loss": 0.85224849, + "num_input_tokens_seen": 7175920, + "step": 338, + "time_per_iteration": 2.583622455596924 + }, + { + "auxiliary_loss_clip": 0.01315403, + "auxiliary_loss_mlp": 0.01062177, + "balance_loss_clip": 1.08178306, + "balance_loss_mlp": 1.04439068, + "epoch": 0.04076233992665183, + "flos": 38582172783360.0, + "grad_norm": 3.6254904757981175, + "language_loss": 0.73175031, + "learning_rate": 3.998798510900003e-06, + "loss": 0.75552607, + "num_input_tokens_seen": 7198720, + "step": 339, + "time_per_iteration": 3.454833745956421 + }, + { + "auxiliary_loss_clip": 0.01314767, + "auxiliary_loss_mlp": 0.01056337, + "balance_loss_clip": 1.08162415, + "balance_loss_mlp": 1.03825259, + "epoch": 0.04088258281729093, + "flos": 25885309374720.0, + "grad_norm": 2.07244703697863, + "language_loss": 0.83756411, + "learning_rate": 3.998771362241207e-06, + "loss": 0.8612752, + "num_input_tokens_seen": 7219125, + "step": 340, + "time_per_iteration": 2.5592620372772217 + }, + { + "auxiliary_loss_clip": 0.01310137, + "auxiliary_loss_mlp": 0.01056328, + "balance_loss_clip": 1.08057475, + "balance_loss_mlp": 1.03873301, + "epoch": 0.04100282570793002, + "flos": 19789832223360.0, + "grad_norm": 2.119833154011509, + "language_loss": 0.87653935, + "learning_rate": 3.998743910370385e-06, + "loss": 0.90020394, + "num_input_tokens_seen": 7237985, + "step": 341, + "time_per_iteration": 4.925775527954102 + }, + { + "auxiliary_loss_clip": 0.01320088, + "auxiliary_loss_mlp": 0.01050648, + "balance_loss_clip": 1.09003818, + "balance_loss_mlp": 1.0314436, + "epoch": 0.04112306859856911, + "flos": 22565152563840.0, + "grad_norm": 2.296056977647958, + "language_loss": 0.73469889, + "learning_rate": 3.998716155291702e-06, + "loss": 0.75840628, + "num_input_tokens_seen": 7255825, + "step": 342, + "time_per_iteration": 2.551989793777466 + }, + { + "auxiliary_loss_clip": 0.01316045, + "auxiliary_loss_mlp": 0.01062469, + "balance_loss_clip": 1.08642399, + "balance_loss_mlp": 1.0442183, + "epoch": 0.0412433114892082, + "flos": 25040654081280.0, + "grad_norm": 1.7882314255406557, + "language_loss": 0.90727872, + "learning_rate": 3.998688097009366e-06, + "loss": 0.93106383, + "num_input_tokens_seen": 7276590, + "step": 343, + "time_per_iteration": 2.5596227645874023 + }, + { + "auxiliary_loss_clip": 0.01315506, + "auxiliary_loss_mlp": 0.01056384, + "balance_loss_clip": 1.08385062, + "balance_loss_mlp": 1.03988528, + "epoch": 0.04136355437984729, + "flos": 25191371548800.0, + "grad_norm": 2.074152362839757, + "language_loss": 0.80049992, + "learning_rate": 3.998659735527636e-06, + "loss": 0.82421881, + "num_input_tokens_seen": 7295680, + "step": 344, + "time_per_iteration": 2.60099196434021 + }, + { + "auxiliary_loss_clip": 0.01314049, + "auxiliary_loss_mlp": 0.01055973, + "balance_loss_clip": 1.08284283, + "balance_loss_mlp": 1.03832984, + "epoch": 0.04148379727048638, + "flos": 22966777509120.0, + "grad_norm": 2.436670389002975, + "language_loss": 0.77607787, + "learning_rate": 3.998631070850813e-06, + "loss": 0.7997781, + "num_input_tokens_seen": 7316300, + "step": 345, + "time_per_iteration": 2.584638833999634 + }, + { + "auxiliary_loss_clip": 0.0131222, + "auxiliary_loss_mlp": 0.01063781, + "balance_loss_clip": 1.08453548, + "balance_loss_mlp": 1.04760444, + "epoch": 0.041604040161125476, + "flos": 14063481187200.0, + "grad_norm": 2.349898811934156, + "language_loss": 0.83583879, + "learning_rate": 3.9986021029832455e-06, + "loss": 0.85959882, + "num_input_tokens_seen": 7333615, + "step": 346, + "time_per_iteration": 2.557032346725464 + }, + { + "auxiliary_loss_clip": 0.01312313, + "auxiliary_loss_mlp": 0.01058894, + "balance_loss_clip": 1.081195, + "balance_loss_mlp": 1.03964174, + "epoch": 0.041724283051764566, + "flos": 12091877614080.0, + "grad_norm": 2.89297312464698, + "language_loss": 0.91637683, + "learning_rate": 3.9985728319293285e-06, + "loss": 0.94008887, + "num_input_tokens_seen": 7347590, + "step": 347, + "time_per_iteration": 2.5359947681427 + }, + { + "auxiliary_loss_clip": 0.01316481, + "auxiliary_loss_mlp": 0.01058424, + "balance_loss_clip": 1.0815773, + "balance_loss_mlp": 1.03981566, + "epoch": 0.041844525942403656, + "flos": 12385303816320.0, + "grad_norm": 2.3403564571514828, + "language_loss": 0.85245478, + "learning_rate": 3.998543257693501e-06, + "loss": 0.87620389, + "num_input_tokens_seen": 7364345, + "step": 348, + "time_per_iteration": 2.509395122528076 + }, + { + "auxiliary_loss_clip": 0.01313547, + "auxiliary_loss_mlp": 0.01065876, + "balance_loss_clip": 1.08412433, + "balance_loss_mlp": 1.04869819, + "epoch": 0.041964768833042745, + "flos": 23769345041280.0, + "grad_norm": 2.7583644666562357, + "language_loss": 0.87927425, + "learning_rate": 3.998513380280251e-06, + "loss": 0.90306854, + "num_input_tokens_seen": 7384625, + "step": 349, + "time_per_iteration": 2.5917603969573975 + }, + { + "auxiliary_loss_clip": 0.01318061, + "auxiliary_loss_mlp": 0.01078741, + "balance_loss_clip": 1.08475423, + "balance_loss_mlp": 1.05898833, + "epoch": 0.042085011723681835, + "flos": 11875336473600.0, + "grad_norm": 2.2270792561184067, + "language_loss": 0.9500795, + "learning_rate": 3.99848319969411e-06, + "loss": 0.97404748, + "num_input_tokens_seen": 7402225, + "step": 350, + "time_per_iteration": 2.5401008129119873 + }, + { + "auxiliary_loss_clip": 0.01319733, + "auxiliary_loss_mlp": 0.01063017, + "balance_loss_clip": 1.08729088, + "balance_loss_mlp": 1.04414606, + "epoch": 0.042205254614320925, + "flos": 16873957964160.0, + "grad_norm": 2.673247242733447, + "language_loss": 0.79569805, + "learning_rate": 3.9984527159396564e-06, + "loss": 0.8195256, + "num_input_tokens_seen": 7420865, + "step": 351, + "time_per_iteration": 2.5301592350006104 + }, + { + "auxiliary_loss_clip": 0.0131309, + "auxiliary_loss_mlp": 0.01056537, + "balance_loss_clip": 1.08083868, + "balance_loss_mlp": 1.03927588, + "epoch": 0.04232549750496002, + "flos": 25118508810240.0, + "grad_norm": 2.7951503142474805, + "language_loss": 0.84749132, + "learning_rate": 3.9984219290215154e-06, + "loss": 0.87118757, + "num_input_tokens_seen": 7441040, + "step": 352, + "time_per_iteration": 2.567413330078125 + }, + { + "auxiliary_loss_clip": 0.01311128, + "auxiliary_loss_mlp": 0.01051462, + "balance_loss_clip": 1.08428621, + "balance_loss_mlp": 1.03561926, + "epoch": 0.04244574039559911, + "flos": 26724541714560.0, + "grad_norm": 2.6082997342973955, + "language_loss": 0.89066052, + "learning_rate": 3.998390838944356e-06, + "loss": 0.91428649, + "num_input_tokens_seen": 7462545, + "step": 353, + "time_per_iteration": 2.611095666885376 + }, + { + "auxiliary_loss_clip": 0.01313103, + "auxiliary_loss_mlp": 0.01062641, + "balance_loss_clip": 1.0838933, + "balance_loss_mlp": 1.04573762, + "epoch": 0.0425659832862382, + "flos": 20923244951040.0, + "grad_norm": 3.384944358026906, + "language_loss": 0.90222442, + "learning_rate": 3.998359445712895e-06, + "loss": 0.92598188, + "num_input_tokens_seen": 7481650, + "step": 354, + "time_per_iteration": 2.5344746112823486 + }, + { + "auxiliary_loss_clip": 0.01310081, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.08028007, + "balance_loss_mlp": 1.03470719, + "epoch": 0.04268622617687729, + "flos": 23331127115520.0, + "grad_norm": 2.2889450889327683, + "language_loss": 0.81318724, + "learning_rate": 3.9983277493318955e-06, + "loss": 0.83679819, + "num_input_tokens_seen": 7500945, + "step": 355, + "time_per_iteration": 2.5524790287017822 + }, + { + "auxiliary_loss_clip": 0.01313179, + "auxiliary_loss_mlp": 0.01053356, + "balance_loss_clip": 1.08010328, + "balance_loss_mlp": 1.03665495, + "epoch": 0.04280646906751638, + "flos": 25994010908160.0, + "grad_norm": 1.660632890208909, + "language_loss": 0.81371999, + "learning_rate": 3.998295749806165e-06, + "loss": 0.8373853, + "num_input_tokens_seen": 7522170, + "step": 356, + "time_per_iteration": 2.5783164501190186 + }, + { + "auxiliary_loss_clip": 0.01314946, + "auxiliary_loss_mlp": 0.0106787, + "balance_loss_clip": 1.08672929, + "balance_loss_mlp": 1.05009556, + "epoch": 0.04292671195815547, + "flos": 26906824258560.0, + "grad_norm": 1.8955914360242396, + "language_loss": 0.83225167, + "learning_rate": 3.998263447140558e-06, + "loss": 0.85607982, + "num_input_tokens_seen": 7542370, + "step": 357, + "time_per_iteration": 2.620328426361084 + }, + { + "auxiliary_loss_clip": 0.01310746, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_clip": 1.08078313, + "balance_loss_mlp": 1.03341079, + "epoch": 0.04304695484879457, + "flos": 39457315745280.0, + "grad_norm": 2.5358378294633477, + "language_loss": 0.81583101, + "learning_rate": 3.998230841339976e-06, + "loss": 0.83943766, + "num_input_tokens_seen": 7564380, + "step": 358, + "time_per_iteration": 2.673593521118164 + }, + { + "auxiliary_loss_clip": 0.01312746, + "auxiliary_loss_mlp": 0.01052242, + "balance_loss_clip": 1.08640409, + "balance_loss_mlp": 1.03592217, + "epoch": 0.04316719773943366, + "flos": 19646297475840.0, + "grad_norm": 2.269258688655431, + "language_loss": 0.84676296, + "learning_rate": 3.998197932409363e-06, + "loss": 0.87041289, + "num_input_tokens_seen": 7582390, + "step": 359, + "time_per_iteration": 2.5551185607910156 + }, + { + "auxiliary_loss_clip": 0.0130442, + "auxiliary_loss_mlp": 0.01058652, + "balance_loss_clip": 1.08019042, + "balance_loss_mlp": 1.04223728, + "epoch": 0.04328744063007275, + "flos": 22452320966400.0, + "grad_norm": 2.2045675827180204, + "language_loss": 0.86123645, + "learning_rate": 3.9981647203537125e-06, + "loss": 0.88486719, + "num_input_tokens_seen": 7599890, + "step": 360, + "time_per_iteration": 2.5333738327026367 + }, + { + "auxiliary_loss_clip": 0.01310202, + "auxiliary_loss_mlp": 0.01061271, + "balance_loss_clip": 1.08114123, + "balance_loss_mlp": 1.04514194, + "epoch": 0.04340768352071184, + "flos": 21283033530240.0, + "grad_norm": 2.0324766846157165, + "language_loss": 0.96320349, + "learning_rate": 3.998131205178063e-06, + "loss": 0.98691821, + "num_input_tokens_seen": 7618360, + "step": 361, + "time_per_iteration": 2.5629286766052246 + }, + { + "auxiliary_loss_clip": 0.01309097, + "auxiliary_loss_mlp": 0.01059348, + "balance_loss_clip": 1.08135867, + "balance_loss_mlp": 1.04294515, + "epoch": 0.04352792641135093, + "flos": 11583705951360.0, + "grad_norm": 3.092901942717306, + "language_loss": 0.77099061, + "learning_rate": 3.998097386887498e-06, + "loss": 0.79467499, + "num_input_tokens_seen": 7635435, + "step": 362, + "time_per_iteration": 2.504873275756836 + }, + { + "auxiliary_loss_clip": 0.01306816, + "auxiliary_loss_mlp": 0.01067141, + "balance_loss_clip": 1.08170485, + "balance_loss_mlp": 1.05034399, + "epoch": 0.04364816930199002, + "flos": 23623547736960.0, + "grad_norm": 1.6967285013272546, + "language_loss": 0.84878534, + "learning_rate": 3.998063265487148e-06, + "loss": 0.87252486, + "num_input_tokens_seen": 7656485, + "step": 363, + "time_per_iteration": 2.5713717937469482 + }, + { + "auxiliary_loss_clip": 0.01310254, + "auxiliary_loss_mlp": 0.01060625, + "balance_loss_clip": 1.08434379, + "balance_loss_mlp": 1.04462743, + "epoch": 0.043768412192629114, + "flos": 14429734214400.0, + "grad_norm": 2.7728417639716967, + "language_loss": 0.80900252, + "learning_rate": 3.99802884098219e-06, + "loss": 0.83271134, + "num_input_tokens_seen": 7674595, + "step": 364, + "time_per_iteration": 2.5306150913238525 + }, + { + "auxiliary_loss_clip": 0.01308174, + "auxiliary_loss_mlp": 0.01047759, + "balance_loss_clip": 1.08051109, + "balance_loss_mlp": 1.03090274, + "epoch": 0.043888655083268203, + "flos": 26468893641600.0, + "grad_norm": 2.422027029514062, + "language_loss": 0.82300222, + "learning_rate": 3.997994113377845e-06, + "loss": 0.84656155, + "num_input_tokens_seen": 7693495, + "step": 365, + "time_per_iteration": 2.63159441947937 + }, + { + "auxiliary_loss_clip": 0.01309765, + "auxiliary_loss_mlp": 0.01050509, + "balance_loss_clip": 1.08256102, + "balance_loss_mlp": 1.03378367, + "epoch": 0.04400889797390729, + "flos": 27235263242880.0, + "grad_norm": 2.185841314784067, + "language_loss": 0.83136374, + "learning_rate": 3.9979590826793815e-06, + "loss": 0.85496652, + "num_input_tokens_seen": 7714685, + "step": 366, + "time_per_iteration": 3.391929864883423 + }, + { + "auxiliary_loss_clip": 0.01312973, + "auxiliary_loss_mlp": 0.01055774, + "balance_loss_clip": 1.08536506, + "balance_loss_mlp": 1.03888178, + "epoch": 0.04412914086454638, + "flos": 20119528183680.0, + "grad_norm": 1.914676242305401, + "language_loss": 0.81048787, + "learning_rate": 3.997923748892113e-06, + "loss": 0.83417535, + "num_input_tokens_seen": 7734005, + "step": 367, + "time_per_iteration": 2.540332317352295 + }, + { + "auxiliary_loss_clip": 0.01307573, + "auxiliary_loss_mlp": 0.01053266, + "balance_loss_clip": 1.08420587, + "balance_loss_mlp": 1.03797174, + "epoch": 0.04424938375518547, + "flos": 22604618632320.0, + "grad_norm": 1.667628748875126, + "language_loss": 0.88783115, + "learning_rate": 3.9978881120214015e-06, + "loss": 0.91143954, + "num_input_tokens_seen": 7755525, + "step": 368, + "time_per_iteration": 3.3355276584625244 + }, + { + "auxiliary_loss_clip": 0.0130812, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.08121467, + "balance_loss_mlp": 1.03435814, + "epoch": 0.04436962664582456, + "flos": 24132365844480.0, + "grad_norm": 2.231466016085589, + "language_loss": 0.79410022, + "learning_rate": 3.997852172072652e-06, + "loss": 0.81769204, + "num_input_tokens_seen": 7776740, + "step": 369, + "time_per_iteration": 3.401426315307617 + }, + { + "auxiliary_loss_clip": 0.01309043, + "auxiliary_loss_mlp": 0.01065794, + "balance_loss_clip": 1.08202302, + "balance_loss_mlp": 1.04922366, + "epoch": 0.04448986953646366, + "flos": 18222906251520.0, + "grad_norm": 3.1122340375059725, + "language_loss": 0.88918149, + "learning_rate": 3.9978159290513155e-06, + "loss": 0.91292989, + "num_input_tokens_seen": 7794820, + "step": 370, + "time_per_iteration": 3.426555633544922 + }, + { + "auxiliary_loss_clip": 0.01309655, + "auxiliary_loss_mlp": 0.010682, + "balance_loss_clip": 1.0830617, + "balance_loss_mlp": 1.05130768, + "epoch": 0.04461011242710275, + "flos": 30117920400000.0, + "grad_norm": 3.225703998838781, + "language_loss": 0.80211729, + "learning_rate": 3.997779382962892e-06, + "loss": 0.82589579, + "num_input_tokens_seen": 7817705, + "step": 371, + "time_per_iteration": 2.6029775142669678 + }, + { + "auxiliary_loss_clip": 0.01304917, + "auxiliary_loss_mlp": 0.01055243, + "balance_loss_clip": 1.08141255, + "balance_loss_mlp": 1.03839874, + "epoch": 0.04473035531774184, + "flos": 29752529299200.0, + "grad_norm": 2.0281820427455246, + "language_loss": 0.73647076, + "learning_rate": 3.997742533812924e-06, + "loss": 0.76007235, + "num_input_tokens_seen": 7840970, + "step": 372, + "time_per_iteration": 2.6166629791259766 + }, + { + "auxiliary_loss_clip": 0.01309586, + "auxiliary_loss_mlp": 0.01062425, + "balance_loss_clip": 1.08520854, + "balance_loss_mlp": 1.04614103, + "epoch": 0.04485059820838093, + "flos": 13151565676800.0, + "grad_norm": 2.844710917064601, + "language_loss": 0.92341948, + "learning_rate": 3.997705381607001e-06, + "loss": 0.9471395, + "num_input_tokens_seen": 7857785, + "step": 373, + "time_per_iteration": 2.529916524887085 + }, + { + "auxiliary_loss_clip": 0.01217096, + "auxiliary_loss_mlp": 0.01018393, + "balance_loss_clip": 1.05960727, + "balance_loss_mlp": 1.0100956, + "epoch": 0.04497084109902002, + "flos": 68094209548800.0, + "grad_norm": 0.9694272218985324, + "language_loss": 0.60210967, + "learning_rate": 3.997667926350761e-06, + "loss": 0.62446457, + "num_input_tokens_seen": 7916115, + "step": 374, + "time_per_iteration": 3.0246238708496094 + }, + { + "auxiliary_loss_clip": 0.01215497, + "auxiliary_loss_mlp": 0.01013825, + "balance_loss_clip": 1.05885804, + "balance_loss_mlp": 1.00567079, + "epoch": 0.04509108398965911, + "flos": 64342263346560.0, + "grad_norm": 0.8994556925364138, + "language_loss": 0.57714069, + "learning_rate": 3.997630168049886e-06, + "loss": 0.59943396, + "num_input_tokens_seen": 7974480, + "step": 375, + "time_per_iteration": 3.1211094856262207 + }, + { + "auxiliary_loss_clip": 0.01310887, + "auxiliary_loss_mlp": 0.0105971, + "balance_loss_clip": 1.08335674, + "balance_loss_mlp": 1.04299712, + "epoch": 0.045211326880298205, + "flos": 22271115830400.0, + "grad_norm": 4.315548341348546, + "language_loss": 0.77534771, + "learning_rate": 3.997592106710101e-06, + "loss": 0.79905367, + "num_input_tokens_seen": 7993940, + "step": 376, + "time_per_iteration": 2.5396900177001953 + }, + { + "auxiliary_loss_clip": 0.013042, + "auxiliary_loss_mlp": 0.01048382, + "balance_loss_clip": 1.08173037, + "balance_loss_mlp": 1.03206229, + "epoch": 0.045331569770937295, + "flos": 32159441796480.0, + "grad_norm": 2.435785205452096, + "language_loss": 0.65642887, + "learning_rate": 3.997553742337182e-06, + "loss": 0.67995465, + "num_input_tokens_seen": 8013365, + "step": 377, + "time_per_iteration": 2.6946990489959717 + }, + { + "auxiliary_loss_clip": 0.01304953, + "auxiliary_loss_mlp": 0.0105248, + "balance_loss_clip": 1.08177328, + "balance_loss_mlp": 1.0359695, + "epoch": 0.045451812661576385, + "flos": 22163455791360.0, + "grad_norm": 2.1790354145978394, + "language_loss": 0.91461468, + "learning_rate": 3.997515074936949e-06, + "loss": 0.93818903, + "num_input_tokens_seen": 8034240, + "step": 378, + "time_per_iteration": 2.5630507469177246 + }, + { + "auxiliary_loss_clip": 0.01306336, + "auxiliary_loss_mlp": 0.01053791, + "balance_loss_clip": 1.08244073, + "balance_loss_mlp": 1.03776932, + "epoch": 0.045572055552215475, + "flos": 16581968305920.0, + "grad_norm": 2.3545578761050177, + "language_loss": 0.87036151, + "learning_rate": 3.997476104515268e-06, + "loss": 0.89396274, + "num_input_tokens_seen": 8052430, + "step": 379, + "time_per_iteration": 2.6333203315734863 + }, + { + "auxiliary_loss_clip": 0.01306404, + "auxiliary_loss_mlp": 0.01052337, + "balance_loss_clip": 1.0852288, + "balance_loss_mlp": 1.036852, + "epoch": 0.045692298442854565, + "flos": 17603375448960.0, + "grad_norm": 2.0843757832637855, + "language_loss": 0.77621472, + "learning_rate": 3.9974368310780485e-06, + "loss": 0.79980212, + "num_input_tokens_seen": 8069605, + "step": 380, + "time_per_iteration": 2.5254719257354736 + }, + { + "auxiliary_loss_clip": 0.01313584, + "auxiliary_loss_mlp": 0.01057691, + "balance_loss_clip": 1.0852927, + "balance_loss_mlp": 1.03963089, + "epoch": 0.045812541333493655, + "flos": 26761098781440.0, + "grad_norm": 2.337614857095147, + "language_loss": 0.74208629, + "learning_rate": 3.997397254631251e-06, + "loss": 0.76579905, + "num_input_tokens_seen": 8090225, + "step": 381, + "time_per_iteration": 2.5758590698242188 + }, + { + "auxiliary_loss_clip": 0.01205805, + "auxiliary_loss_mlp": 0.01017371, + "balance_loss_clip": 1.05264294, + "balance_loss_mlp": 1.00935984, + "epoch": 0.04593278422413275, + "flos": 60250349894400.0, + "grad_norm": 0.8282427057370774, + "language_loss": 0.60037583, + "learning_rate": 3.997357375180878e-06, + "loss": 0.62260753, + "num_input_tokens_seen": 8154505, + "step": 382, + "time_per_iteration": 3.167154550552368 + }, + { + "auxiliary_loss_clip": 0.01307683, + "auxiliary_loss_mlp": 0.01046185, + "balance_loss_clip": 1.08278894, + "balance_loss_mlp": 1.02872074, + "epoch": 0.04605302711477184, + "flos": 21799249839360.0, + "grad_norm": 1.7449019801179229, + "language_loss": 0.75533605, + "learning_rate": 3.997317192732979e-06, + "loss": 0.7788747, + "num_input_tokens_seen": 8173285, + "step": 383, + "time_per_iteration": 2.5524566173553467 + }, + { + "auxiliary_loss_clip": 0.01307884, + "auxiliary_loss_mlp": 0.01062889, + "balance_loss_clip": 1.08329308, + "balance_loss_mlp": 1.04590178, + "epoch": 0.04617327000541093, + "flos": 19459705299840.0, + "grad_norm": 1.8813961376697292, + "language_loss": 0.82538718, + "learning_rate": 3.99727670729365e-06, + "loss": 0.84909487, + "num_input_tokens_seen": 8191845, + "step": 384, + "time_per_iteration": 2.523893117904663 + }, + { + "auxiliary_loss_clip": 0.01307429, + "auxiliary_loss_mlp": 0.01056581, + "balance_loss_clip": 1.08743572, + "balance_loss_mlp": 1.04079795, + "epoch": 0.04629351289605002, + "flos": 25411468135680.0, + "grad_norm": 1.9694999885385984, + "language_loss": 0.78162694, + "learning_rate": 3.997235918869033e-06, + "loss": 0.80526704, + "num_input_tokens_seen": 8212880, + "step": 385, + "time_per_iteration": 2.595957040786743 + }, + { + "auxiliary_loss_clip": 0.01307364, + "auxiliary_loss_mlp": 0.01049257, + "balance_loss_clip": 1.08660781, + "balance_loss_mlp": 1.03395081, + "epoch": 0.04641375578668911, + "flos": 20558284813440.0, + "grad_norm": 2.0374736518985364, + "language_loss": 0.82465196, + "learning_rate": 3.997194827465315e-06, + "loss": 0.8482182, + "num_input_tokens_seen": 8231475, + "step": 386, + "time_per_iteration": 2.5485081672668457 + }, + { + "auxiliary_loss_clip": 0.01304222, + "auxiliary_loss_mlp": 0.01049649, + "balance_loss_clip": 1.08043289, + "balance_loss_mlp": 1.03363967, + "epoch": 0.0465339986773282, + "flos": 13188661447680.0, + "grad_norm": 2.9474275535594074, + "language_loss": 0.91001689, + "learning_rate": 3.997153433088728e-06, + "loss": 0.9335556, + "num_input_tokens_seen": 8248600, + "step": 387, + "time_per_iteration": 2.5060527324676514 + }, + { + "auxiliary_loss_clip": 0.01306059, + "auxiliary_loss_mlp": 0.01054173, + "balance_loss_clip": 1.08352542, + "balance_loss_mlp": 1.03689981, + "epoch": 0.0466542415679673, + "flos": 25556547168000.0, + "grad_norm": 2.0653077563617113, + "language_loss": 0.81239474, + "learning_rate": 3.997111735745554e-06, + "loss": 0.83599705, + "num_input_tokens_seen": 8271570, + "step": 388, + "time_per_iteration": 2.5766990184783936 + }, + { + "auxiliary_loss_clip": 0.01301181, + "auxiliary_loss_mlp": 0.01059511, + "balance_loss_clip": 1.08093476, + "balance_loss_mlp": 1.04160607, + "epoch": 0.04677448445860639, + "flos": 22236749493120.0, + "grad_norm": 1.8129346591158655, + "language_loss": 0.82515168, + "learning_rate": 3.997069735442118e-06, + "loss": 0.84875858, + "num_input_tokens_seen": 8291265, + "step": 389, + "time_per_iteration": 2.5257515907287598 + }, + { + "auxiliary_loss_clip": 0.01302037, + "auxiliary_loss_mlp": 0.01054311, + "balance_loss_clip": 1.08147573, + "balance_loss_mlp": 1.03821778, + "epoch": 0.04689472734924548, + "flos": 28147825198080.0, + "grad_norm": 1.5524138645133032, + "language_loss": 0.80315757, + "learning_rate": 3.997027432184792e-06, + "loss": 0.82672107, + "num_input_tokens_seen": 8315925, + "step": 390, + "time_per_iteration": 2.6208670139312744 + }, + { + "auxiliary_loss_clip": 0.01303666, + "auxiliary_loss_mlp": 0.01054164, + "balance_loss_clip": 1.0827539, + "balance_loss_mlp": 1.03846419, + "epoch": 0.04701497023988457, + "flos": 23148952312320.0, + "grad_norm": 1.9399800262865, + "language_loss": 0.89414722, + "learning_rate": 3.99698482597999e-06, + "loss": 0.91772556, + "num_input_tokens_seen": 8333605, + "step": 391, + "time_per_iteration": 2.6397433280944824 + }, + { + "auxiliary_loss_clip": 0.01192702, + "auxiliary_loss_mlp": 0.0101032, + "balance_loss_clip": 1.04421163, + "balance_loss_mlp": 1.0029763, + "epoch": 0.04713521313052366, + "flos": 64827668764800.0, + "grad_norm": 0.8699911529385073, + "language_loss": 0.63914961, + "learning_rate": 3.99694191683418e-06, + "loss": 0.66117978, + "num_input_tokens_seen": 8394405, + "step": 392, + "time_per_iteration": 3.1264443397521973 + }, + { + "auxiliary_loss_clip": 0.01307198, + "auxiliary_loss_mlp": 0.01053445, + "balance_loss_clip": 1.08628488, + "balance_loss_mlp": 1.03686297, + "epoch": 0.047255456021162746, + "flos": 18771585477120.0, + "grad_norm": 1.9576946882674373, + "language_loss": 0.81689608, + "learning_rate": 3.996898704753867e-06, + "loss": 0.84050256, + "num_input_tokens_seen": 8412355, + "step": 393, + "time_per_iteration": 3.438933849334717 + }, + { + "auxiliary_loss_clip": 0.01300407, + "auxiliary_loss_mlp": 0.01048388, + "balance_loss_clip": 1.08005714, + "balance_loss_mlp": 1.03278351, + "epoch": 0.04737569891180184, + "flos": 22053820504320.0, + "grad_norm": 2.2784115046639326, + "language_loss": 0.87820518, + "learning_rate": 3.996855189745609e-06, + "loss": 0.90169311, + "num_input_tokens_seen": 8431620, + "step": 394, + "time_per_iteration": 2.5775296688079834 + }, + { + "auxiliary_loss_clip": 0.01300394, + "auxiliary_loss_mlp": 0.01056059, + "balance_loss_clip": 1.0802176, + "balance_loss_mlp": 1.03952503, + "epoch": 0.04749594180244093, + "flos": 29057370410880.0, + "grad_norm": 1.87911489869821, + "language_loss": 0.92737824, + "learning_rate": 3.996811371816007e-06, + "loss": 0.95094275, + "num_input_tokens_seen": 8454045, + "step": 395, + "time_per_iteration": 3.4575424194335938 + }, + { + "auxiliary_loss_clip": 0.01302111, + "auxiliary_loss_mlp": 0.01060594, + "balance_loss_clip": 1.08385897, + "balance_loss_mlp": 1.04558539, + "epoch": 0.04761618469308002, + "flos": 35112268172160.0, + "grad_norm": 1.908314134252623, + "language_loss": 0.77723658, + "learning_rate": 3.996767250971707e-06, + "loss": 0.80086362, + "num_input_tokens_seen": 8476785, + "step": 396, + "time_per_iteration": 3.5062286853790283 + }, + { + "auxiliary_loss_clip": 0.01307277, + "auxiliary_loss_mlp": 0.01048865, + "balance_loss_clip": 1.08641338, + "balance_loss_mlp": 1.03285468, + "epoch": 0.04773642758371911, + "flos": 25630702796160.0, + "grad_norm": 1.9715060621461424, + "language_loss": 0.8691026, + "learning_rate": 3.996722827219403e-06, + "loss": 0.89266402, + "num_input_tokens_seen": 8498400, + "step": 397, + "time_per_iteration": 3.5262444019317627 + }, + { + "auxiliary_loss_clip": 0.01307321, + "auxiliary_loss_mlp": 0.01061006, + "balance_loss_clip": 1.08609676, + "balance_loss_mlp": 1.04476964, + "epoch": 0.0478566704743582, + "flos": 20631506688000.0, + "grad_norm": 6.080034601161963, + "language_loss": 0.82688367, + "learning_rate": 3.996678100565833e-06, + "loss": 0.85056698, + "num_input_tokens_seen": 8517455, + "step": 398, + "time_per_iteration": 2.5732080936431885 + }, + { + "auxiliary_loss_clip": 0.01297821, + "auxiliary_loss_mlp": 0.01057667, + "balance_loss_clip": 1.08022106, + "balance_loss_mlp": 1.04022646, + "epoch": 0.04797691336499729, + "flos": 18835721210880.0, + "grad_norm": 2.4931397108219895, + "language_loss": 0.88834172, + "learning_rate": 3.996633071017783e-06, + "loss": 0.91189659, + "num_input_tokens_seen": 8534085, + "step": 399, + "time_per_iteration": 2.49765682220459 + }, + { + "auxiliary_loss_clip": 0.01297952, + "auxiliary_loss_mlp": 0.01054015, + "balance_loss_clip": 1.08073699, + "balance_loss_mlp": 1.03779054, + "epoch": 0.04809715625563638, + "flos": 21099673578240.0, + "grad_norm": 2.2491143389063573, + "language_loss": 0.81976545, + "learning_rate": 3.996587738582084e-06, + "loss": 0.84328514, + "num_input_tokens_seen": 8550885, + "step": 400, + "time_per_iteration": 2.5223872661590576 + }, + { + "auxiliary_loss_clip": 0.01297426, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.07867622, + "balance_loss_mlp": 1.02980947, + "epoch": 0.04821739914627548, + "flos": 23805650712960.0, + "grad_norm": 2.702677681447213, + "language_loss": 0.86355561, + "learning_rate": 3.9965421032656115e-06, + "loss": 0.88698113, + "num_input_tokens_seen": 8570815, + "step": 401, + "time_per_iteration": 2.6047074794769287 + }, + { + "auxiliary_loss_clip": 0.01298558, + "auxiliary_loss_mlp": 0.01051842, + "balance_loss_clip": 1.07910204, + "balance_loss_mlp": 1.03465223, + "epoch": 0.04833764203691457, + "flos": 22200587475840.0, + "grad_norm": 3.6134515528823994, + "language_loss": 0.93923134, + "learning_rate": 3.99649616507529e-06, + "loss": 0.96273541, + "num_input_tokens_seen": 8589910, + "step": 402, + "time_per_iteration": 2.5562562942504883 + }, + { + "auxiliary_loss_clip": 0.01187902, + "auxiliary_loss_mlp": 0.01006467, + "balance_loss_clip": 1.0447197, + "balance_loss_mlp": 0.99945784, + "epoch": 0.04845788492755366, + "flos": 65904376896000.0, + "grad_norm": 0.8952224617761081, + "language_loss": 0.63144195, + "learning_rate": 3.996449924018088e-06, + "loss": 0.65338564, + "num_input_tokens_seen": 8650370, + "step": 403, + "time_per_iteration": 3.043468713760376 + }, + { + "auxiliary_loss_clip": 0.01296963, + "auxiliary_loss_mlp": 0.01055744, + "balance_loss_clip": 1.08101249, + "balance_loss_mlp": 1.0408783, + "epoch": 0.04857812781819275, + "flos": 19281301424640.0, + "grad_norm": 1.8410139877245097, + "language_loss": 0.79444671, + "learning_rate": 3.99640338010102e-06, + "loss": 0.81797373, + "num_input_tokens_seen": 8669475, + "step": 404, + "time_per_iteration": 2.526526927947998 + }, + { + "auxiliary_loss_clip": 0.01296844, + "auxiliary_loss_mlp": 0.01046258, + "balance_loss_clip": 1.07914042, + "balance_loss_mlp": 1.02952099, + "epoch": 0.04869837070883184, + "flos": 24062376193920.0, + "grad_norm": 1.92652693165795, + "language_loss": 0.78709567, + "learning_rate": 3.996356533331146e-06, + "loss": 0.81052673, + "num_input_tokens_seen": 8691345, + "step": 405, + "time_per_iteration": 2.6189985275268555 + }, + { + "auxiliary_loss_clip": 0.01307176, + "auxiliary_loss_mlp": 0.01047535, + "balance_loss_clip": 1.08082664, + "balance_loss_mlp": 1.03173995, + "epoch": 0.04881861359947093, + "flos": 25187169657600.0, + "grad_norm": 3.7221953294549186, + "language_loss": 0.61992085, + "learning_rate": 3.996309383715573e-06, + "loss": 0.64346796, + "num_input_tokens_seen": 8710125, + "step": 406, + "time_per_iteration": 2.5847864151000977 + }, + { + "auxiliary_loss_clip": 0.01304322, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.08302617, + "balance_loss_mlp": 1.02882171, + "epoch": 0.048938856490110025, + "flos": 16362913213440.0, + "grad_norm": 5.904436297257136, + "language_loss": 0.73949993, + "learning_rate": 3.996261931261454e-06, + "loss": 0.76298565, + "num_input_tokens_seen": 8728705, + "step": 407, + "time_per_iteration": 2.579486131668091 + }, + { + "auxiliary_loss_clip": 0.01302115, + "auxiliary_loss_mlp": 0.01052331, + "balance_loss_clip": 1.08327043, + "balance_loss_mlp": 1.03652358, + "epoch": 0.049059099380749115, + "flos": 29895094379520.0, + "grad_norm": 1.8776330847058453, + "language_loss": 0.86580873, + "learning_rate": 3.996214175975987e-06, + "loss": 0.88935316, + "num_input_tokens_seen": 8749225, + "step": 408, + "time_per_iteration": 2.602163553237915 + }, + { + "auxiliary_loss_clip": 0.01305403, + "auxiliary_loss_mlp": 0.01055051, + "balance_loss_clip": 1.08456445, + "balance_loss_mlp": 1.0389694, + "epoch": 0.049179342271388204, + "flos": 35918858027520.0, + "grad_norm": 4.187718692754341, + "language_loss": 0.79392195, + "learning_rate": 3.996166117866417e-06, + "loss": 0.81752646, + "num_input_tokens_seen": 8771160, + "step": 409, + "time_per_iteration": 2.6792855262756348 + }, + { + "auxiliary_loss_clip": 0.01297202, + "auxiliary_loss_mlp": 0.01047271, + "balance_loss_clip": 1.07991517, + "balance_loss_mlp": 1.03204191, + "epoch": 0.049299585162027294, + "flos": 14611226659200.0, + "grad_norm": 1.9274947872232246, + "language_loss": 0.86921704, + "learning_rate": 3.996117756940035e-06, + "loss": 0.89266169, + "num_input_tokens_seen": 8787845, + "step": 410, + "time_per_iteration": 2.541041612625122 + }, + { + "auxiliary_loss_clip": 0.01301537, + "auxiliary_loss_mlp": 0.01047727, + "balance_loss_clip": 1.08328938, + "balance_loss_mlp": 1.03246856, + "epoch": 0.049419828052666384, + "flos": 19567939956480.0, + "grad_norm": 2.058687031277856, + "language_loss": 0.97840512, + "learning_rate": 3.996069093204175e-06, + "loss": 1.00189781, + "num_input_tokens_seen": 8803805, + "step": 411, + "time_per_iteration": 2.568758249282837 + }, + { + "auxiliary_loss_clip": 0.0130675, + "auxiliary_loss_mlp": 0.01055081, + "balance_loss_clip": 1.08518767, + "balance_loss_mlp": 1.03857028, + "epoch": 0.049540070943305474, + "flos": 13659916907520.0, + "grad_norm": 2.3945827972595057, + "language_loss": 0.87980765, + "learning_rate": 3.996020126666221e-06, + "loss": 0.90342593, + "num_input_tokens_seen": 8820785, + "step": 412, + "time_per_iteration": 2.4951584339141846 + }, + { + "auxiliary_loss_clip": 0.0130177, + "auxiliary_loss_mlp": 0.0104881, + "balance_loss_clip": 1.08342624, + "balance_loss_mlp": 1.03400445, + "epoch": 0.04966031383394457, + "flos": 21832035978240.0, + "grad_norm": 2.467844257976011, + "language_loss": 0.82006478, + "learning_rate": 3.995970857333601e-06, + "loss": 0.84357059, + "num_input_tokens_seen": 8841195, + "step": 413, + "time_per_iteration": 2.6237852573394775 + }, + { + "auxiliary_loss_clip": 0.01302275, + "auxiliary_loss_mlp": 0.0105131, + "balance_loss_clip": 1.08128595, + "balance_loss_mlp": 1.03565764, + "epoch": 0.04978055672458366, + "flos": 28618793349120.0, + "grad_norm": 1.792407698710041, + "language_loss": 0.79696965, + "learning_rate": 3.995921285213789e-06, + "loss": 0.8205055, + "num_input_tokens_seen": 8861455, + "step": 414, + "time_per_iteration": 2.5889523029327393 + }, + { + "auxiliary_loss_clip": 0.0129773, + "auxiliary_loss_mlp": 0.01047265, + "balance_loss_clip": 1.08041418, + "balance_loss_mlp": 1.03271556, + "epoch": 0.04990079961522275, + "flos": 19828220883840.0, + "grad_norm": 2.3154784060640865, + "language_loss": 0.80597448, + "learning_rate": 3.995871410314305e-06, + "loss": 0.82942438, + "num_input_tokens_seen": 8880015, + "step": 415, + "time_per_iteration": 2.5525991916656494 + }, + { + "auxiliary_loss_clip": 0.01167935, + "auxiliary_loss_mlp": 0.01007743, + "balance_loss_clip": 1.04132557, + "balance_loss_mlp": 1.00073385, + "epoch": 0.05002104250586184, + "flos": 62735045293440.0, + "grad_norm": 3.2881720058211226, + "language_loss": 0.59632242, + "learning_rate": 3.995821232642714e-06, + "loss": 0.61807919, + "num_input_tokens_seen": 8938420, + "step": 416, + "time_per_iteration": 3.241363286972046 + }, + { + "auxiliary_loss_clip": 0.01283934, + "auxiliary_loss_mlp": 0.01049445, + "balance_loss_clip": 1.08361006, + "balance_loss_mlp": 1.0343647, + "epoch": 0.05014128539650093, + "flos": 27928518710400.0, + "grad_norm": 3.1907768298465933, + "language_loss": 0.82474566, + "learning_rate": 3.995770752206629e-06, + "loss": 0.84807944, + "num_input_tokens_seen": 8959495, + "step": 417, + "time_per_iteration": 2.621290922164917 + }, + { + "auxiliary_loss_clip": 0.01302323, + "auxiliary_loss_mlp": 0.01048867, + "balance_loss_clip": 1.08387649, + "balance_loss_mlp": 1.03242755, + "epoch": 0.05026152828714002, + "flos": 17705576620800.0, + "grad_norm": 1.9869218969289917, + "language_loss": 0.9748978, + "learning_rate": 3.995719969013709e-06, + "loss": 0.99840963, + "num_input_tokens_seen": 8976675, + "step": 418, + "time_per_iteration": 2.539304494857788 + }, + { + "auxiliary_loss_clip": 0.01265562, + "auxiliary_loss_mlp": 0.01052296, + "balance_loss_clip": 1.07860887, + "balance_loss_mlp": 1.03638124, + "epoch": 0.05038177117777912, + "flos": 19133277477120.0, + "grad_norm": 2.7706364881300347, + "language_loss": 0.8593576, + "learning_rate": 3.995668883071655e-06, + "loss": 0.88253617, + "num_input_tokens_seen": 8992900, + "step": 419, + "time_per_iteration": 2.6271185874938965 + }, + { + "auxiliary_loss_clip": 0.01301142, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.08269358, + "balance_loss_mlp": 1.03655696, + "epoch": 0.050502014068418206, + "flos": 20667704618880.0, + "grad_norm": 2.7540808018243266, + "language_loss": 0.91182673, + "learning_rate": 3.995617494388219e-06, + "loss": 0.93535864, + "num_input_tokens_seen": 9011020, + "step": 420, + "time_per_iteration": 3.3287360668182373 + }, + { + "auxiliary_loss_clip": 0.01260532, + "auxiliary_loss_mlp": 0.01043775, + "balance_loss_clip": 1.07389832, + "balance_loss_mlp": 1.02776575, + "epoch": 0.050622256959057296, + "flos": 21361103740800.0, + "grad_norm": 1.9516582237082214, + "language_loss": 0.80540347, + "learning_rate": 3.995565802971196e-06, + "loss": 0.82844651, + "num_input_tokens_seen": 9030995, + "step": 421, + "time_per_iteration": 2.643930196762085 + }, + { + "auxiliary_loss_clip": 0.01260912, + "auxiliary_loss_mlp": 0.01051425, + "balance_loss_clip": 1.07638383, + "balance_loss_mlp": 1.03685749, + "epoch": 0.050742499849696386, + "flos": 27673588909440.0, + "grad_norm": 1.849183054338753, + "language_loss": 0.67442358, + "learning_rate": 3.995513808828427e-06, + "loss": 0.69754696, + "num_input_tokens_seen": 9053790, + "step": 422, + "time_per_iteration": 3.4457900524139404 + }, + { + "auxiliary_loss_clip": 0.01264124, + "auxiliary_loss_mlp": 0.01045527, + "balance_loss_clip": 1.07738662, + "balance_loss_mlp": 1.03021431, + "epoch": 0.050862742740335476, + "flos": 19865999013120.0, + "grad_norm": 1.9483182901866805, + "language_loss": 0.76573622, + "learning_rate": 3.9954615119678e-06, + "loss": 0.78883266, + "num_input_tokens_seen": 9072345, + "step": 423, + "time_per_iteration": 3.3843607902526855 + }, + { + "auxiliary_loss_clip": 0.0127205, + "auxiliary_loss_mlp": 0.01054079, + "balance_loss_clip": 1.07656598, + "balance_loss_mlp": 1.03812838, + "epoch": 0.050982985630974566, + "flos": 22085098272000.0, + "grad_norm": 2.1671354713306274, + "language_loss": 0.80606043, + "learning_rate": 3.995408912397248e-06, + "loss": 0.82932162, + "num_input_tokens_seen": 9090240, + "step": 424, + "time_per_iteration": 3.5066134929656982 + }, + { + "auxiliary_loss_clip": 0.01268556, + "auxiliary_loss_mlp": 0.01055006, + "balance_loss_clip": 1.08021164, + "balance_loss_mlp": 1.03832817, + "epoch": 0.05110322852161366, + "flos": 20740962407040.0, + "grad_norm": 2.159098803946794, + "language_loss": 0.93172324, + "learning_rate": 3.99535601012475e-06, + "loss": 0.9549588, + "num_input_tokens_seen": 9105570, + "step": 425, + "time_per_iteration": 2.6888365745544434 + }, + { + "auxiliary_loss_clip": 0.01249192, + "auxiliary_loss_mlp": 0.00766922, + "balance_loss_clip": 1.07906306, + "balance_loss_mlp": 1.0003432, + "epoch": 0.05122347141225275, + "flos": 28547295327360.0, + "grad_norm": 1.7810073919051372, + "language_loss": 0.75522244, + "learning_rate": 3.995302805158333e-06, + "loss": 0.77538365, + "num_input_tokens_seen": 9128225, + "step": 426, + "time_per_iteration": 2.7587318420410156 + }, + { + "auxiliary_loss_clip": 0.01258693, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.0759424, + "balance_loss_mlp": 1.03640521, + "epoch": 0.05134371430289184, + "flos": 19722679747200.0, + "grad_norm": 2.364402475415502, + "language_loss": 0.83420956, + "learning_rate": 3.9952492975060665e-06, + "loss": 0.85733831, + "num_input_tokens_seen": 9148295, + "step": 427, + "time_per_iteration": 2.6693570613861084 + }, + { + "auxiliary_loss_clip": 0.01279715, + "auxiliary_loss_mlp": 0.01042628, + "balance_loss_clip": 1.07896721, + "balance_loss_mlp": 1.02736926, + "epoch": 0.05146395719353093, + "flos": 34458945649920.0, + "grad_norm": 2.5247760303231637, + "language_loss": 0.84815639, + "learning_rate": 3.995195487176067e-06, + "loss": 0.87137979, + "num_input_tokens_seen": 9168525, + "step": 428, + "time_per_iteration": 2.753730535507202 + }, + { + "auxiliary_loss_clip": 0.01298381, + "auxiliary_loss_mlp": 0.01052319, + "balance_loss_clip": 1.08070576, + "balance_loss_mlp": 1.03627288, + "epoch": 0.05158420008417002, + "flos": 21760286561280.0, + "grad_norm": 1.7510064686302105, + "language_loss": 0.85443282, + "learning_rate": 3.995141374176499e-06, + "loss": 0.87793982, + "num_input_tokens_seen": 9186920, + "step": 429, + "time_per_iteration": 2.5584375858306885 + }, + { + "auxiliary_loss_clip": 0.01137308, + "auxiliary_loss_mlp": 0.00757144, + "balance_loss_clip": 1.03608894, + "balance_loss_mlp": 1.00049198, + "epoch": 0.05170444297480911, + "flos": 72553956226560.0, + "grad_norm": 0.928468515580307, + "language_loss": 0.63148701, + "learning_rate": 3.995086958515572e-06, + "loss": 0.65043151, + "num_input_tokens_seen": 9244940, + "step": 430, + "time_per_iteration": 3.252453327178955 + }, + { + "auxiliary_loss_clip": 0.01176628, + "auxiliary_loss_mlp": 0.00757032, + "balance_loss_clip": 1.03753281, + "balance_loss_mlp": 1.00046813, + "epoch": 0.05182468586544821, + "flos": 62416159326720.0, + "grad_norm": 0.863801819190144, + "language_loss": 0.59991491, + "learning_rate": 3.995032240201538e-06, + "loss": 0.61925149, + "num_input_tokens_seen": 9307335, + "step": 431, + "time_per_iteration": 3.058312177658081 + }, + { + "auxiliary_loss_clip": 0.01153889, + "auxiliary_loss_mlp": 0.01005996, + "balance_loss_clip": 1.03366351, + "balance_loss_mlp": 0.99841464, + "epoch": 0.0519449287560873, + "flos": 41225989432320.0, + "grad_norm": 0.9432823728371184, + "language_loss": 0.63217014, + "learning_rate": 3.9949772192427e-06, + "loss": 0.65376896, + "num_input_tokens_seen": 9353960, + "step": 432, + "time_per_iteration": 2.8581018447875977 + }, + { + "auxiliary_loss_clip": 0.01261973, + "auxiliary_loss_mlp": 0.01050686, + "balance_loss_clip": 1.07384753, + "balance_loss_mlp": 1.0345099, + "epoch": 0.05206517164672639, + "flos": 17494530261120.0, + "grad_norm": 2.174423268662747, + "language_loss": 0.79734135, + "learning_rate": 3.994921895647405e-06, + "loss": 0.82046801, + "num_input_tokens_seen": 9372130, + "step": 433, + "time_per_iteration": 2.637779712677002 + }, + { + "auxiliary_loss_clip": 0.01173331, + "auxiliary_loss_mlp": 0.010063, + "balance_loss_clip": 1.03515387, + "balance_loss_mlp": 0.99914706, + "epoch": 0.05218541453736548, + "flos": 64002762973440.0, + "grad_norm": 0.8427963775507344, + "language_loss": 0.55441427, + "learning_rate": 3.994866269424043e-06, + "loss": 0.5762105, + "num_input_tokens_seen": 9428500, + "step": 434, + "time_per_iteration": 2.998046398162842 + }, + { + "auxiliary_loss_clip": 0.01205172, + "auxiliary_loss_mlp": 0.01058122, + "balance_loss_clip": 1.06135535, + "balance_loss_mlp": 1.04157615, + "epoch": 0.05230565742800457, + "flos": 19317319787520.0, + "grad_norm": 2.5202974300516336, + "language_loss": 0.78479213, + "learning_rate": 3.9948103405810545e-06, + "loss": 0.80742508, + "num_input_tokens_seen": 9447450, + "step": 435, + "time_per_iteration": 2.826554775238037 + }, + { + "auxiliary_loss_clip": 0.01232683, + "auxiliary_loss_mlp": 0.01054905, + "balance_loss_clip": 1.07014525, + "balance_loss_mlp": 1.03992069, + "epoch": 0.05242590031864366, + "flos": 25298636538240.0, + "grad_norm": 2.4909273650765233, + "language_loss": 0.86032963, + "learning_rate": 3.994754109126923e-06, + "loss": 0.88320553, + "num_input_tokens_seen": 9468945, + "step": 436, + "time_per_iteration": 2.77929949760437 + }, + { + "auxiliary_loss_clip": 0.01207393, + "auxiliary_loss_mlp": 0.01044973, + "balance_loss_clip": 1.06949985, + "balance_loss_mlp": 1.02970243, + "epoch": 0.052546143209282754, + "flos": 26211629456640.0, + "grad_norm": 1.6691748388454406, + "language_loss": 0.93482065, + "learning_rate": 3.994697575070181e-06, + "loss": 0.95734429, + "num_input_tokens_seen": 9488405, + "step": 437, + "time_per_iteration": 2.809767007827759 + }, + { + "auxiliary_loss_clip": 0.01265962, + "auxiliary_loss_mlp": 0.01055077, + "balance_loss_clip": 1.08160222, + "balance_loss_mlp": 1.03899574, + "epoch": 0.052666386099921844, + "flos": 22158140578560.0, + "grad_norm": 1.8392610337997999, + "language_loss": 0.9144181, + "learning_rate": 3.994640738419402e-06, + "loss": 0.93762851, + "num_input_tokens_seen": 9507780, + "step": 438, + "time_per_iteration": 2.664480686187744 + }, + { + "auxiliary_loss_clip": 0.01277535, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.07866287, + "balance_loss_mlp": 1.02911329, + "epoch": 0.052786628990560934, + "flos": 23881817502720.0, + "grad_norm": 1.8723803384654591, + "language_loss": 0.81028652, + "learning_rate": 3.9945835991832075e-06, + "loss": 0.83350384, + "num_input_tokens_seen": 9529665, + "step": 439, + "time_per_iteration": 2.6422131061553955 + }, + { + "auxiliary_loss_clip": 0.01299052, + "auxiliary_loss_mlp": 0.01060016, + "balance_loss_clip": 1.08483291, + "balance_loss_mlp": 1.04494786, + "epoch": 0.052906871881200024, + "flos": 24605021934720.0, + "grad_norm": 1.9564672463054191, + "language_loss": 0.92908603, + "learning_rate": 3.994526157370268e-06, + "loss": 0.95267677, + "num_input_tokens_seen": 9548280, + "step": 440, + "time_per_iteration": 2.6024482250213623 + }, + { + "auxiliary_loss_clip": 0.01151736, + "auxiliary_loss_mlp": 0.01016686, + "balance_loss_clip": 1.03298402, + "balance_loss_mlp": 1.01005769, + "epoch": 0.053027114771839114, + "flos": 56461631143680.0, + "grad_norm": 0.8971811150894674, + "language_loss": 0.59247661, + "learning_rate": 3.994468412989296e-06, + "loss": 0.6141609, + "num_input_tokens_seen": 9609690, + "step": 441, + "time_per_iteration": 3.2654948234558105 + }, + { + "auxiliary_loss_clip": 0.01237446, + "auxiliary_loss_mlp": 0.01056138, + "balance_loss_clip": 1.07102358, + "balance_loss_mlp": 1.04031849, + "epoch": 0.053147357662478203, + "flos": 17311098481920.0, + "grad_norm": 2.0407459472038854, + "language_loss": 0.92766589, + "learning_rate": 3.994410366049052e-06, + "loss": 0.9506017, + "num_input_tokens_seen": 9627550, + "step": 442, + "time_per_iteration": 2.6080989837646484 + }, + { + "auxiliary_loss_clip": 0.01278215, + "auxiliary_loss_mlp": 0.01047478, + "balance_loss_clip": 1.07735777, + "balance_loss_mlp": 1.03152776, + "epoch": 0.0532676005531173, + "flos": 17164977955200.0, + "grad_norm": 2.247340446770413, + "language_loss": 0.83198243, + "learning_rate": 3.994352016558341e-06, + "loss": 0.85523939, + "num_input_tokens_seen": 9644855, + "step": 443, + "time_per_iteration": 2.5477967262268066 + }, + { + "auxiliary_loss_clip": 0.01279192, + "auxiliary_loss_mlp": 0.01055364, + "balance_loss_clip": 1.07988501, + "balance_loss_mlp": 1.03970003, + "epoch": 0.05338784344375639, + "flos": 27819960831360.0, + "grad_norm": 1.984281850293955, + "language_loss": 0.73881489, + "learning_rate": 3.994293364526014e-06, + "loss": 0.76216042, + "num_input_tokens_seen": 9665740, + "step": 444, + "time_per_iteration": 2.736412525177002 + }, + { + "auxiliary_loss_clip": 0.01253984, + "auxiliary_loss_mlp": 0.01046676, + "balance_loss_clip": 1.07694125, + "balance_loss_mlp": 1.0297718, + "epoch": 0.05350808633439548, + "flos": 21507691144320.0, + "grad_norm": 3.2264900324897337, + "language_loss": 0.84782857, + "learning_rate": 3.99423440996097e-06, + "loss": 0.87083513, + "num_input_tokens_seen": 9685280, + "step": 445, + "time_per_iteration": 2.7217965126037598 + }, + { + "auxiliary_loss_clip": 0.01262648, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.0802635, + "balance_loss_mlp": 1.03875327, + "epoch": 0.05362832922503457, + "flos": 20084299920000.0, + "grad_norm": 2.4223229496780636, + "language_loss": 0.81441277, + "learning_rate": 3.994175152872152e-06, + "loss": 0.83758903, + "num_input_tokens_seen": 9704365, + "step": 446, + "time_per_iteration": 2.725703716278076 + }, + { + "auxiliary_loss_clip": 0.0128054, + "auxiliary_loss_mlp": 0.01045238, + "balance_loss_clip": 1.07781053, + "balance_loss_mlp": 1.03049147, + "epoch": 0.05374857211567366, + "flos": 26137222433280.0, + "grad_norm": 1.9260703314619048, + "language_loss": 0.78617531, + "learning_rate": 3.994115593268548e-06, + "loss": 0.8094331, + "num_input_tokens_seen": 9724145, + "step": 447, + "time_per_iteration": 3.495028257369995 + }, + { + "auxiliary_loss_clip": 0.01296754, + "auxiliary_loss_mlp": 0.01054711, + "balance_loss_clip": 1.08153582, + "balance_loss_mlp": 1.03949988, + "epoch": 0.05386881500631275, + "flos": 27486817165440.0, + "grad_norm": 2.1122299652795093, + "language_loss": 0.82147062, + "learning_rate": 3.994055731159195e-06, + "loss": 0.84498531, + "num_input_tokens_seen": 9741615, + "step": 448, + "time_per_iteration": 3.464294672012329 + }, + { + "auxiliary_loss_clip": 0.01282017, + "auxiliary_loss_mlp": 0.0106031, + "balance_loss_clip": 1.08250535, + "balance_loss_mlp": 1.04518247, + "epoch": 0.053989057896951846, + "flos": 23585087249280.0, + "grad_norm": 1.9351903759452422, + "language_loss": 0.86826122, + "learning_rate": 3.993995566553172e-06, + "loss": 0.89168441, + "num_input_tokens_seen": 9760580, + "step": 449, + "time_per_iteration": 2.6384971141815186 + }, + { + "auxiliary_loss_clip": 0.01240323, + "auxiliary_loss_mlp": 0.01051548, + "balance_loss_clip": 1.06784356, + "balance_loss_mlp": 1.03614604, + "epoch": 0.054109300787590936, + "flos": 25228862369280.0, + "grad_norm": 1.6159627040757005, + "language_loss": 0.77008229, + "learning_rate": 3.993935099459607e-06, + "loss": 0.793001, + "num_input_tokens_seen": 9782195, + "step": 450, + "time_per_iteration": 3.452735662460327 + }, + { + "auxiliary_loss_clip": 0.01289542, + "auxiliary_loss_mlp": 0.01052454, + "balance_loss_clip": 1.08023953, + "balance_loss_mlp": 1.03785062, + "epoch": 0.054229543678230026, + "flos": 23841525421440.0, + "grad_norm": 1.9365977819459719, + "language_loss": 0.7397663, + "learning_rate": 3.993874329887673e-06, + "loss": 0.76318622, + "num_input_tokens_seen": 9800850, + "step": 451, + "time_per_iteration": 3.8032331466674805 + }, + { + "auxiliary_loss_clip": 0.01278475, + "auxiliary_loss_mlp": 0.01060308, + "balance_loss_clip": 1.07885563, + "balance_loss_mlp": 1.04450107, + "epoch": 0.054349786568869116, + "flos": 16320933192960.0, + "grad_norm": 2.262755922106927, + "language_loss": 0.86274034, + "learning_rate": 3.993813257846589e-06, + "loss": 0.88612813, + "num_input_tokens_seen": 9817605, + "step": 452, + "time_per_iteration": 2.6025373935699463 + }, + { + "auxiliary_loss_clip": 0.01277432, + "auxiliary_loss_mlp": 0.01048695, + "balance_loss_clip": 1.0800885, + "balance_loss_mlp": 1.03323388, + "epoch": 0.054470029459508205, + "flos": 18660729127680.0, + "grad_norm": 2.194280422597141, + "language_loss": 0.93389201, + "learning_rate": 3.993751883345619e-06, + "loss": 0.95715332, + "num_input_tokens_seen": 9835965, + "step": 453, + "time_per_iteration": 2.579185724258423 + }, + { + "auxiliary_loss_clip": 0.01255293, + "auxiliary_loss_mlp": 0.01051611, + "balance_loss_clip": 1.07677543, + "balance_loss_mlp": 1.03583384, + "epoch": 0.054590272350147295, + "flos": 17785298856960.0, + "grad_norm": 2.8824854357742957, + "language_loss": 0.8772558, + "learning_rate": 3.993690206394073e-06, + "loss": 0.90032482, + "num_input_tokens_seen": 9852265, + "step": 454, + "time_per_iteration": 2.6367220878601074 + }, + { + "auxiliary_loss_clip": 0.01262541, + "auxiliary_loss_mlp": 0.01054284, + "balance_loss_clip": 1.07537532, + "balance_loss_mlp": 1.03914475, + "epoch": 0.054710515240786385, + "flos": 17785945301760.0, + "grad_norm": 4.250461845371748, + "language_loss": 0.87786257, + "learning_rate": 3.993628227001307e-06, + "loss": 0.90103078, + "num_input_tokens_seen": 9870465, + "step": 455, + "time_per_iteration": 2.7315773963928223 + }, + { + "auxiliary_loss_clip": 0.01257737, + "auxiliary_loss_mlp": 0.01053784, + "balance_loss_clip": 1.07525134, + "balance_loss_mlp": 1.03912151, + "epoch": 0.05483075813142548, + "flos": 48210900180480.0, + "grad_norm": 1.8598645814736685, + "language_loss": 0.71243703, + "learning_rate": 3.993565945176726e-06, + "loss": 0.73555219, + "num_input_tokens_seen": 9891490, + "step": 456, + "time_per_iteration": 2.8869247436523438 + }, + { + "auxiliary_loss_clip": 0.01252099, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.07586932, + "balance_loss_mlp": 1.04067314, + "epoch": 0.05495100102206457, + "flos": 19682244011520.0, + "grad_norm": 2.3720106741769733, + "language_loss": 0.84223944, + "learning_rate": 3.993503360929776e-06, + "loss": 0.86531949, + "num_input_tokens_seen": 9910375, + "step": 457, + "time_per_iteration": 2.6046829223632812 + }, + { + "auxiliary_loss_clip": 0.0118876, + "auxiliary_loss_mlp": 0.01048865, + "balance_loss_clip": 1.06438732, + "balance_loss_mlp": 1.03308189, + "epoch": 0.05507124391270366, + "flos": 26360048453760.0, + "grad_norm": 1.6505415536320192, + "language_loss": 0.8125149, + "learning_rate": 3.99344047426995e-06, + "loss": 0.8348912, + "num_input_tokens_seen": 9931635, + "step": 458, + "time_per_iteration": 2.9724433422088623 + }, + { + "auxiliary_loss_clip": 0.0123024, + "auxiliary_loss_mlp": 0.0105358, + "balance_loss_clip": 1.071172, + "balance_loss_mlp": 1.03764129, + "epoch": 0.05519148680334275, + "flos": 22601314581120.0, + "grad_norm": 2.475066243843825, + "language_loss": 0.93373871, + "learning_rate": 3.993377285206789e-06, + "loss": 0.95657694, + "num_input_tokens_seen": 9951420, + "step": 459, + "time_per_iteration": 3.034868001937866 + }, + { + "auxiliary_loss_clip": 0.01218326, + "auxiliary_loss_mlp": 0.01058978, + "balance_loss_clip": 1.06891084, + "balance_loss_mlp": 1.04375541, + "epoch": 0.05531172969398184, + "flos": 40552519380480.0, + "grad_norm": 1.8113930562494502, + "language_loss": 0.86769176, + "learning_rate": 3.99331379374988e-06, + "loss": 0.89046478, + "num_input_tokens_seen": 9975025, + "step": 460, + "time_per_iteration": 2.829263925552368 + }, + { + "auxiliary_loss_clip": 0.01262227, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_clip": 1.07059193, + "balance_loss_mlp": 1.03015804, + "epoch": 0.05543197258462093, + "flos": 23477894087040.0, + "grad_norm": 2.017019789076434, + "language_loss": 0.8003341, + "learning_rate": 3.993249999908852e-06, + "loss": 0.82339811, + "num_input_tokens_seen": 9995175, + "step": 461, + "time_per_iteration": 2.6703124046325684 + }, + { + "auxiliary_loss_clip": 0.01290503, + "auxiliary_loss_mlp": 0.01049367, + "balance_loss_clip": 1.0779928, + "balance_loss_mlp": 1.03480554, + "epoch": 0.05555221547526003, + "flos": 18624603024000.0, + "grad_norm": 1.8937970038404397, + "language_loss": 0.86934984, + "learning_rate": 3.993185903693384e-06, + "loss": 0.89274859, + "num_input_tokens_seen": 10011975, + "step": 462, + "time_per_iteration": 2.5178093910217285 + }, + { + "auxiliary_loss_clip": 0.01254068, + "auxiliary_loss_mlp": 0.01041201, + "balance_loss_clip": 1.07357454, + "balance_loss_mlp": 1.02727699, + "epoch": 0.05567245836589912, + "flos": 23587098410880.0, + "grad_norm": 3.566233413138947, + "language_loss": 0.8240279, + "learning_rate": 3.9931215051131995e-06, + "loss": 0.84698057, + "num_input_tokens_seen": 10032620, + "step": 463, + "time_per_iteration": 2.6224758625030518 + }, + { + "auxiliary_loss_clip": 0.01259662, + "auxiliary_loss_mlp": 0.01047481, + "balance_loss_clip": 1.07127881, + "balance_loss_mlp": 1.0328896, + "epoch": 0.05579270125653821, + "flos": 27746667129600.0, + "grad_norm": 1.8451244320381792, + "language_loss": 0.80100644, + "learning_rate": 3.993056804178068e-06, + "loss": 0.82407784, + "num_input_tokens_seen": 10054165, + "step": 464, + "time_per_iteration": 2.6490097045898438 + }, + { + "auxiliary_loss_clip": 0.01219214, + "auxiliary_loss_mlp": 0.01047534, + "balance_loss_clip": 1.07070839, + "balance_loss_mlp": 1.03198957, + "epoch": 0.0559129441471773, + "flos": 27014161075200.0, + "grad_norm": 2.038459399122024, + "language_loss": 0.8459264, + "learning_rate": 3.992991800897803e-06, + "loss": 0.86859381, + "num_input_tokens_seen": 10073970, + "step": 465, + "time_per_iteration": 2.719805955886841 + }, + { + "auxiliary_loss_clip": 0.01288578, + "auxiliary_loss_mlp": 0.01049555, + "balance_loss_clip": 1.07780969, + "balance_loss_mlp": 1.0343318, + "epoch": 0.05603318703781639, + "flos": 15229787794560.0, + "grad_norm": 2.211867426676443, + "language_loss": 0.90184343, + "learning_rate": 3.9929264952822665e-06, + "loss": 0.92522466, + "num_input_tokens_seen": 10091505, + "step": 466, + "time_per_iteration": 2.5047214031219482 + }, + { + "auxiliary_loss_clip": 0.01273936, + "auxiliary_loss_mlp": 0.0105097, + "balance_loss_clip": 1.0741744, + "balance_loss_mlp": 1.03656948, + "epoch": 0.05615342992845548, + "flos": 22266482976000.0, + "grad_norm": 2.009741192149142, + "language_loss": 0.88441217, + "learning_rate": 3.992860887341366e-06, + "loss": 0.9076612, + "num_input_tokens_seen": 10109675, + "step": 467, + "time_per_iteration": 2.5594053268432617 + }, + { + "auxiliary_loss_clip": 0.01226967, + "auxiliary_loss_mlp": 0.01044072, + "balance_loss_clip": 1.07008815, + "balance_loss_mlp": 1.02867031, + "epoch": 0.056273672819094574, + "flos": 23584979508480.0, + "grad_norm": 1.9229166855364772, + "language_loss": 0.81169248, + "learning_rate": 3.992794977085052e-06, + "loss": 0.83440286, + "num_input_tokens_seen": 10127675, + "step": 468, + "time_per_iteration": 2.68764328956604 + }, + { + "auxiliary_loss_clip": 0.01244237, + "auxiliary_loss_mlp": 0.01056532, + "balance_loss_clip": 1.07536268, + "balance_loss_mlp": 1.04220319, + "epoch": 0.056393915709733664, + "flos": 19858708552320.0, + "grad_norm": 2.218454955542911, + "language_loss": 0.85029125, + "learning_rate": 3.992728764523326e-06, + "loss": 0.87329888, + "num_input_tokens_seen": 10146620, + "step": 469, + "time_per_iteration": 2.6565005779266357 + }, + { + "auxiliary_loss_clip": 0.01254263, + "auxiliary_loss_mlp": 0.01047278, + "balance_loss_clip": 1.07303381, + "balance_loss_mlp": 1.03268671, + "epoch": 0.05651415860037275, + "flos": 22163779013760.0, + "grad_norm": 1.8081435341304666, + "language_loss": 0.80726612, + "learning_rate": 3.99266224966623e-06, + "loss": 0.8302815, + "num_input_tokens_seen": 10167535, + "step": 470, + "time_per_iteration": 2.6627657413482666 + }, + { + "auxiliary_loss_clip": 0.01245734, + "auxiliary_loss_mlp": 0.01047182, + "balance_loss_clip": 1.07389557, + "balance_loss_mlp": 1.03180361, + "epoch": 0.05663440149101184, + "flos": 19463548055040.0, + "grad_norm": 1.9043129631139961, + "language_loss": 0.88234097, + "learning_rate": 3.992595432523855e-06, + "loss": 0.9052701, + "num_input_tokens_seen": 10184825, + "step": 471, + "time_per_iteration": 2.705014228820801 + }, + { + "auxiliary_loss_clip": 0.01226715, + "auxiliary_loss_mlp": 0.01050286, + "balance_loss_clip": 1.06801248, + "balance_loss_mlp": 1.03602219, + "epoch": 0.05675464438165093, + "flos": 22670226823680.0, + "grad_norm": 2.0690069347771787, + "language_loss": 0.86075461, + "learning_rate": 3.992528313106338e-06, + "loss": 0.88352466, + "num_input_tokens_seen": 10203025, + "step": 472, + "time_per_iteration": 2.660782814025879 + }, + { + "auxiliary_loss_clip": 0.0128766, + "auxiliary_loss_mlp": 0.00766525, + "balance_loss_clip": 1.08046675, + "balance_loss_mlp": 1.00002384, + "epoch": 0.05687488727229002, + "flos": 16901177495040.0, + "grad_norm": 2.2097478742268613, + "language_loss": 0.81996894, + "learning_rate": 3.9924608914238595e-06, + "loss": 0.84051085, + "num_input_tokens_seen": 10218020, + "step": 473, + "time_per_iteration": 3.3181445598602295 + }, + { + "auxiliary_loss_clip": 0.01272255, + "auxiliary_loss_mlp": 0.01050804, + "balance_loss_clip": 1.07769299, + "balance_loss_mlp": 1.0366478, + "epoch": 0.05699513016292912, + "flos": 29168980945920.0, + "grad_norm": 3.031408946721835, + "language_loss": 0.83984435, + "learning_rate": 3.992393167486648e-06, + "loss": 0.86307496, + "num_input_tokens_seen": 10237170, + "step": 474, + "time_per_iteration": 2.622290849685669 + }, + { + "auxiliary_loss_clip": 0.01290903, + "auxiliary_loss_mlp": 0.01057092, + "balance_loss_clip": 1.07971704, + "balance_loss_mlp": 1.04147577, + "epoch": 0.05711537305356821, + "flos": 18916197632640.0, + "grad_norm": 3.290204429831137, + "language_loss": 0.80766809, + "learning_rate": 3.992325141304977e-06, + "loss": 0.83114803, + "num_input_tokens_seen": 10255125, + "step": 475, + "time_per_iteration": 3.3650882244110107 + }, + { + "auxiliary_loss_clip": 0.01226086, + "auxiliary_loss_mlp": 0.01050663, + "balance_loss_clip": 1.07066691, + "balance_loss_mlp": 1.03628623, + "epoch": 0.0572356159442073, + "flos": 26758979879040.0, + "grad_norm": 2.2544307179786056, + "language_loss": 0.86566478, + "learning_rate": 3.992256812889166e-06, + "loss": 0.88843226, + "num_input_tokens_seen": 10271230, + "step": 476, + "time_per_iteration": 3.426182270050049 + }, + { + "auxiliary_loss_clip": 0.01289125, + "auxiliary_loss_mlp": 0.01052572, + "balance_loss_clip": 1.08027315, + "balance_loss_mlp": 1.03814816, + "epoch": 0.05735585883484639, + "flos": 35116146840960.0, + "grad_norm": 2.1605781040426204, + "language_loss": 0.76751554, + "learning_rate": 3.992188182249582e-06, + "loss": 0.79093248, + "num_input_tokens_seen": 10293125, + "step": 477, + "time_per_iteration": 3.5208561420440674 + }, + { + "auxiliary_loss_clip": 0.01256266, + "auxiliary_loss_mlp": 0.01055845, + "balance_loss_clip": 1.07830632, + "balance_loss_mlp": 1.04097939, + "epoch": 0.05747610172548548, + "flos": 18734381965440.0, + "grad_norm": 3.501635547208748, + "language_loss": 0.90491593, + "learning_rate": 3.992119249396633e-06, + "loss": 0.92803705, + "num_input_tokens_seen": 10311810, + "step": 478, + "time_per_iteration": 2.5823044776916504 + }, + { + "auxiliary_loss_clip": 0.012468, + "auxiliary_loss_mlp": 0.00766559, + "balance_loss_clip": 1.07085705, + "balance_loss_mlp": 1.00006104, + "epoch": 0.05759634461612457, + "flos": 27964752554880.0, + "grad_norm": 1.8355539627398967, + "language_loss": 0.82166243, + "learning_rate": 3.992050014340778e-06, + "loss": 0.84179604, + "num_input_tokens_seen": 10332165, + "step": 479, + "time_per_iteration": 2.650386095046997 + }, + { + "auxiliary_loss_clip": 0.01169883, + "auxiliary_loss_mlp": 0.01011543, + "balance_loss_clip": 1.04753792, + "balance_loss_mlp": 1.00591588, + "epoch": 0.057716587506763666, + "flos": 69292009405440.0, + "grad_norm": 0.8343792343489639, + "language_loss": 0.55071431, + "learning_rate": 3.99198047709252e-06, + "loss": 0.5725286, + "num_input_tokens_seen": 10393685, + "step": 480, + "time_per_iteration": 3.1941888332366943 + }, + { + "auxiliary_loss_clip": 0.01234985, + "auxiliary_loss_mlp": 0.01049277, + "balance_loss_clip": 1.06653214, + "balance_loss_mlp": 1.03418541, + "epoch": 0.057836830397402755, + "flos": 25009196745600.0, + "grad_norm": 1.7587170365530287, + "language_loss": 0.7850107, + "learning_rate": 3.991910637662408e-06, + "loss": 0.80785334, + "num_input_tokens_seen": 10413975, + "step": 481, + "time_per_iteration": 2.668564558029175 + }, + { + "auxiliary_loss_clip": 0.01288245, + "auxiliary_loss_mlp": 0.01045164, + "balance_loss_clip": 1.08067203, + "balance_loss_mlp": 1.03016722, + "epoch": 0.057957073288041845, + "flos": 25593894334080.0, + "grad_norm": 1.7872837820325018, + "language_loss": 0.80465949, + "learning_rate": 3.9918404960610355e-06, + "loss": 0.82799363, + "num_input_tokens_seen": 10433005, + "step": 482, + "time_per_iteration": 2.5619750022888184 + }, + { + "auxiliary_loss_clip": 0.01279854, + "auxiliary_loss_mlp": 0.01051277, + "balance_loss_clip": 1.08008456, + "balance_loss_mlp": 1.03692997, + "epoch": 0.058077316178680935, + "flos": 20777411733120.0, + "grad_norm": 2.216841478132205, + "language_loss": 0.77322996, + "learning_rate": 3.991770052299043e-06, + "loss": 0.79654121, + "num_input_tokens_seen": 10451235, + "step": 483, + "time_per_iteration": 2.5454251766204834 + }, + { + "auxiliary_loss_clip": 0.01253004, + "auxiliary_loss_mlp": 0.01043261, + "balance_loss_clip": 1.07139385, + "balance_loss_mlp": 1.0297606, + "epoch": 0.058197559069320025, + "flos": 18916484941440.0, + "grad_norm": 2.599245728792679, + "language_loss": 0.87562007, + "learning_rate": 3.991699306387118e-06, + "loss": 0.89858276, + "num_input_tokens_seen": 10469705, + "step": 484, + "time_per_iteration": 2.5623483657836914 + }, + { + "auxiliary_loss_clip": 0.01274664, + "auxiliary_loss_mlp": 0.01056003, + "balance_loss_clip": 1.07816565, + "balance_loss_mlp": 1.0417155, + "epoch": 0.058317801959959115, + "flos": 24863327614080.0, + "grad_norm": 4.756759524574157, + "language_loss": 0.78046644, + "learning_rate": 3.991628258335991e-06, + "loss": 0.80377305, + "num_input_tokens_seen": 10491910, + "step": 485, + "time_per_iteration": 2.6181812286376953 + }, + { + "auxiliary_loss_clip": 0.01231366, + "auxiliary_loss_mlp": 0.01047169, + "balance_loss_clip": 1.0695045, + "balance_loss_mlp": 1.03252387, + "epoch": 0.05843804485059821, + "flos": 23257977068160.0, + "grad_norm": 3.7899103076848055, + "language_loss": 0.88073432, + "learning_rate": 3.991556908156442e-06, + "loss": 0.90351963, + "num_input_tokens_seen": 10508435, + "step": 486, + "time_per_iteration": 2.6892738342285156 + }, + { + "auxiliary_loss_clip": 0.01261704, + "auxiliary_loss_mlp": 0.01054489, + "balance_loss_clip": 1.07563674, + "balance_loss_mlp": 1.04030287, + "epoch": 0.0585582877412373, + "flos": 23150532510720.0, + "grad_norm": 1.7752129421907592, + "language_loss": 0.87635565, + "learning_rate": 3.9914852558592914e-06, + "loss": 0.8995176, + "num_input_tokens_seen": 10529485, + "step": 487, + "time_per_iteration": 2.653871536254883 + }, + { + "auxiliary_loss_clip": 0.01273441, + "auxiliary_loss_mlp": 0.01047641, + "balance_loss_clip": 1.07855678, + "balance_loss_mlp": 1.03254914, + "epoch": 0.05867853063187639, + "flos": 23506406507520.0, + "grad_norm": 2.960473161929745, + "language_loss": 0.80842394, + "learning_rate": 3.991413301455413e-06, + "loss": 0.83163476, + "num_input_tokens_seen": 10545935, + "step": 488, + "time_per_iteration": 2.5692968368530273 + }, + { + "auxiliary_loss_clip": 0.01241758, + "auxiliary_loss_mlp": 0.01046335, + "balance_loss_clip": 1.07231295, + "balance_loss_mlp": 1.03290021, + "epoch": 0.05879877352251548, + "flos": 29495803818240.0, + "grad_norm": 7.956721259878436, + "language_loss": 0.77901775, + "learning_rate": 3.991341044955719e-06, + "loss": 0.80189872, + "num_input_tokens_seen": 10565690, + "step": 489, + "time_per_iteration": 2.678300142288208 + }, + { + "auxiliary_loss_clip": 0.0127033, + "auxiliary_loss_mlp": 0.00767439, + "balance_loss_clip": 1.07561994, + "balance_loss_mlp": 1.00017357, + "epoch": 0.05891901641315457, + "flos": 20157485880960.0, + "grad_norm": 2.0433155359883575, + "language_loss": 0.81396073, + "learning_rate": 3.991268486371172e-06, + "loss": 0.83433843, + "num_input_tokens_seen": 10584245, + "step": 490, + "time_per_iteration": 2.5893120765686035 + }, + { + "auxiliary_loss_clip": 0.01255923, + "auxiliary_loss_mlp": 0.01055099, + "balance_loss_clip": 1.0732888, + "balance_loss_mlp": 1.03798044, + "epoch": 0.05903925930379366, + "flos": 24644200694400.0, + "grad_norm": 2.4600426917720326, + "language_loss": 0.88209903, + "learning_rate": 3.991195625712779e-06, + "loss": 0.90520924, + "num_input_tokens_seen": 10601210, + "step": 491, + "time_per_iteration": 2.6413207054138184 + }, + { + "auxiliary_loss_clip": 0.0128831, + "auxiliary_loss_mlp": 0.01047628, + "balance_loss_clip": 1.08083904, + "balance_loss_mlp": 1.03242874, + "epoch": 0.05915950219443276, + "flos": 21250391045760.0, + "grad_norm": 2.3515861784330307, + "language_loss": 0.81626749, + "learning_rate": 3.991122462991592e-06, + "loss": 0.83962691, + "num_input_tokens_seen": 10620730, + "step": 492, + "time_per_iteration": 2.5646188259124756 + }, + { + "auxiliary_loss_clip": 0.01293225, + "auxiliary_loss_mlp": 0.01051029, + "balance_loss_clip": 1.07885432, + "balance_loss_mlp": 1.03628266, + "epoch": 0.05927974508507185, + "flos": 9902727319680.0, + "grad_norm": 4.981811019288449, + "language_loss": 0.81147444, + "learning_rate": 3.991048998218712e-06, + "loss": 0.83491695, + "num_input_tokens_seen": 10634035, + "step": 493, + "time_per_iteration": 2.529045581817627 + }, + { + "auxiliary_loss_clip": 0.01269645, + "auxiliary_loss_mlp": 0.01047773, + "balance_loss_clip": 1.07322073, + "balance_loss_mlp": 1.03356338, + "epoch": 0.05939998797571094, + "flos": 18259499232000.0, + "grad_norm": 2.81974827787177, + "language_loss": 0.76362193, + "learning_rate": 3.990975231405281e-06, + "loss": 0.78679609, + "num_input_tokens_seen": 10652485, + "step": 494, + "time_per_iteration": 2.587747573852539 + }, + { + "auxiliary_loss_clip": 0.01269592, + "auxiliary_loss_mlp": 0.01048776, + "balance_loss_clip": 1.07819152, + "balance_loss_mlp": 1.03400612, + "epoch": 0.05952023086635003, + "flos": 28256598558720.0, + "grad_norm": 3.2131980282900305, + "language_loss": 0.78632241, + "learning_rate": 3.990901162562491e-06, + "loss": 0.80950606, + "num_input_tokens_seen": 10673175, + "step": 495, + "time_per_iteration": 2.647360324859619 + }, + { + "auxiliary_loss_clip": 0.01232594, + "auxiliary_loss_mlp": 0.00767752, + "balance_loss_clip": 1.06736565, + "balance_loss_mlp": 1.00015175, + "epoch": 0.05964047375698912, + "flos": 14902498045440.0, + "grad_norm": 15.71552339439477, + "language_loss": 0.91038048, + "learning_rate": 3.9908267917015765e-06, + "loss": 0.93038392, + "num_input_tokens_seen": 10691235, + "step": 496, + "time_per_iteration": 2.6359856128692627 + }, + { + "auxiliary_loss_clip": 0.0125895, + "auxiliary_loss_mlp": 0.01062804, + "balance_loss_clip": 1.07245481, + "balance_loss_mlp": 1.04734266, + "epoch": 0.059760716647628206, + "flos": 23185581206400.0, + "grad_norm": 3.9717311310739993, + "language_loss": 0.93061179, + "learning_rate": 3.990752118833821e-06, + "loss": 0.95382935, + "num_input_tokens_seen": 10708675, + "step": 497, + "time_per_iteration": 2.6624879837036133 + }, + { + "auxiliary_loss_clip": 0.01289684, + "auxiliary_loss_mlp": 0.01045345, + "balance_loss_clip": 1.08069634, + "balance_loss_mlp": 1.03059888, + "epoch": 0.0598809595382673, + "flos": 22746968231040.0, + "grad_norm": 1.7802356239607748, + "language_loss": 0.77967262, + "learning_rate": 3.990677143970553e-06, + "loss": 0.80302298, + "num_input_tokens_seen": 10729485, + "step": 498, + "time_per_iteration": 2.642883777618408 + }, + { + "auxiliary_loss_clip": 0.01238815, + "auxiliary_loss_mlp": 0.01057459, + "balance_loss_clip": 1.07719672, + "balance_loss_mlp": 1.04166365, + "epoch": 0.06000120242890639, + "flos": 22127221946880.0, + "grad_norm": 1.8255575427019457, + "language_loss": 0.81348979, + "learning_rate": 3.990601867123144e-06, + "loss": 0.8364526, + "num_input_tokens_seen": 10749210, + "step": 499, + "time_per_iteration": 2.703958034515381 + }, + { + "auxiliary_loss_clip": 0.0122229, + "auxiliary_loss_mlp": 0.01052333, + "balance_loss_clip": 1.07176709, + "balance_loss_mlp": 1.03739583, + "epoch": 0.06012144531954548, + "flos": 19171773878400.0, + "grad_norm": 1.9568498726337706, + "language_loss": 0.85040468, + "learning_rate": 3.990526288303014e-06, + "loss": 0.87315089, + "num_input_tokens_seen": 10768000, + "step": 500, + "time_per_iteration": 3.4640142917633057 + }, + { + "auxiliary_loss_clip": 0.01252048, + "auxiliary_loss_mlp": 0.00766383, + "balance_loss_clip": 1.07381582, + "balance_loss_mlp": 1.00007629, + "epoch": 0.06024168821018457, + "flos": 22783345729920.0, + "grad_norm": 1.7015245871129292, + "language_loss": 0.90888071, + "learning_rate": 3.9904504075216295e-06, + "loss": 0.92906505, + "num_input_tokens_seen": 10788760, + "step": 501, + "time_per_iteration": 2.6595582962036133 + }, + { + "auxiliary_loss_clip": 0.01237916, + "auxiliary_loss_mlp": 0.01059116, + "balance_loss_clip": 1.06935573, + "balance_loss_mlp": 1.04332042, + "epoch": 0.06036193110082366, + "flos": 18770687637120.0, + "grad_norm": 2.3385765970873096, + "language_loss": 0.94025385, + "learning_rate": 3.990374224790501e-06, + "loss": 0.96322411, + "num_input_tokens_seen": 10806965, + "step": 502, + "time_per_iteration": 3.4137332439422607 + }, + { + "auxiliary_loss_clip": 0.01253027, + "auxiliary_loss_mlp": 0.01056263, + "balance_loss_clip": 1.07572412, + "balance_loss_mlp": 1.0409925, + "epoch": 0.06048217399146275, + "flos": 17201570935680.0, + "grad_norm": 2.273210061593724, + "language_loss": 0.70804536, + "learning_rate": 3.990297740121185e-06, + "loss": 0.73113823, + "num_input_tokens_seen": 10824900, + "step": 503, + "time_per_iteration": 2.5982022285461426 + }, + { + "auxiliary_loss_clip": 0.01269168, + "auxiliary_loss_mlp": 0.00767179, + "balance_loss_clip": 1.07669306, + "balance_loss_mlp": 1.00008011, + "epoch": 0.06060241688210185, + "flos": 24024131187840.0, + "grad_norm": 1.9152883845288027, + "language_loss": 0.78320456, + "learning_rate": 3.990220953525284e-06, + "loss": 0.80356801, + "num_input_tokens_seen": 10842010, + "step": 504, + "time_per_iteration": 4.2173755168914795 + }, + { + "auxiliary_loss_clip": 0.01242398, + "auxiliary_loss_mlp": 0.01048692, + "balance_loss_clip": 1.07039809, + "balance_loss_mlp": 1.03442311, + "epoch": 0.06072265977274094, + "flos": 14611190745600.0, + "grad_norm": 2.5373459279840938, + "language_loss": 0.74258208, + "learning_rate": 3.9901438650144465e-06, + "loss": 0.76549304, + "num_input_tokens_seen": 10858260, + "step": 505, + "time_per_iteration": 2.6630074977874756 + }, + { + "auxiliary_loss_clip": 0.01260956, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.07419014, + "balance_loss_mlp": 1.03293169, + "epoch": 0.06084290266338003, + "flos": 20558284813440.0, + "grad_norm": 2.3332867513378646, + "language_loss": 0.91983247, + "learning_rate": 3.990066474600367e-06, + "loss": 0.94290793, + "num_input_tokens_seen": 10876230, + "step": 506, + "time_per_iteration": 2.556824207305908 + }, + { + "auxiliary_loss_clip": 0.01255115, + "auxiliary_loss_mlp": 0.01054062, + "balance_loss_clip": 1.06948829, + "balance_loss_mlp": 1.03885102, + "epoch": 0.06096314555401912, + "flos": 22309217182080.0, + "grad_norm": 2.030504540763671, + "language_loss": 0.67750573, + "learning_rate": 3.989988782294786e-06, + "loss": 0.70059752, + "num_input_tokens_seen": 10896320, + "step": 507, + "time_per_iteration": 2.596639633178711 + }, + { + "auxiliary_loss_clip": 0.01220896, + "auxiliary_loss_mlp": 0.0105564, + "balance_loss_clip": 1.06857038, + "balance_loss_mlp": 1.04070342, + "epoch": 0.06108338844465821, + "flos": 19131374056320.0, + "grad_norm": 1.6628456841236994, + "language_loss": 0.94930637, + "learning_rate": 3.989910788109489e-06, + "loss": 0.97207171, + "num_input_tokens_seen": 10912970, + "step": 508, + "time_per_iteration": 2.6833114624023438 + }, + { + "auxiliary_loss_clip": 0.01231883, + "auxiliary_loss_mlp": 0.01047665, + "balance_loss_clip": 1.06817305, + "balance_loss_mlp": 1.03357434, + "epoch": 0.0612036313352973, + "flos": 33584018169600.0, + "grad_norm": 2.1991078278036134, + "language_loss": 0.75011522, + "learning_rate": 3.989832492056307e-06, + "loss": 0.77291071, + "num_input_tokens_seen": 10933995, + "step": 509, + "time_per_iteration": 2.778923749923706 + }, + { + "auxiliary_loss_clip": 0.012692, + "auxiliary_loss_mlp": 0.01050809, + "balance_loss_clip": 1.07625592, + "balance_loss_mlp": 1.0359199, + "epoch": 0.06132387422593639, + "flos": 27490552179840.0, + "grad_norm": 5.918767852311743, + "language_loss": 0.80982929, + "learning_rate": 3.989753894147119e-06, + "loss": 0.83302945, + "num_input_tokens_seen": 10954120, + "step": 510, + "time_per_iteration": 2.6411149501800537 + }, + { + "auxiliary_loss_clip": 0.01266358, + "auxiliary_loss_mlp": 0.01045372, + "balance_loss_clip": 1.08098495, + "balance_loss_mlp": 1.0314368, + "epoch": 0.061444117116575485, + "flos": 25885057979520.0, + "grad_norm": 1.6869982553297518, + "language_loss": 0.79724312, + "learning_rate": 3.989674994393846e-06, + "loss": 0.82036042, + "num_input_tokens_seen": 10973595, + "step": 511, + "time_per_iteration": 2.676708698272705 + }, + { + "auxiliary_loss_clip": 0.01266848, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.07807994, + "balance_loss_mlp": 1.0283246, + "epoch": 0.061564360007214575, + "flos": 28512031150080.0, + "grad_norm": 2.1719317071115216, + "language_loss": 0.93731785, + "learning_rate": 3.98959579280846e-06, + "loss": 0.9604128, + "num_input_tokens_seen": 10991995, + "step": 512, + "time_per_iteration": 2.6956677436828613 + }, + { + "auxiliary_loss_clip": 0.01201074, + "auxiliary_loss_mlp": 0.01046305, + "balance_loss_clip": 1.07095885, + "balance_loss_mlp": 1.0316062, + "epoch": 0.061684602897853665, + "flos": 12094355652480.0, + "grad_norm": 2.0394130631990457, + "language_loss": 0.83402348, + "learning_rate": 3.989516289402973e-06, + "loss": 0.85649729, + "num_input_tokens_seen": 11007625, + "step": 513, + "time_per_iteration": 2.652188777923584 + }, + { + "auxiliary_loss_clip": 0.01182799, + "auxiliary_loss_mlp": 0.01047524, + "balance_loss_clip": 1.06124592, + "balance_loss_mlp": 1.03264666, + "epoch": 0.061804845788492754, + "flos": 19532639865600.0, + "grad_norm": 2.080239407443858, + "language_loss": 0.80217361, + "learning_rate": 3.989436484189447e-06, + "loss": 0.82447684, + "num_input_tokens_seen": 11025570, + "step": 514, + "time_per_iteration": 2.7185964584350586 + }, + { + "auxiliary_loss_clip": 0.0127007, + "auxiliary_loss_mlp": 0.01044668, + "balance_loss_clip": 1.07449269, + "balance_loss_mlp": 1.03000486, + "epoch": 0.061925088679131844, + "flos": 15341111020800.0, + "grad_norm": 2.5846469946538546, + "language_loss": 0.80778503, + "learning_rate": 3.9893563771799885e-06, + "loss": 0.8309325, + "num_input_tokens_seen": 11042045, + "step": 515, + "time_per_iteration": 2.5329296588897705 + }, + { + "auxiliary_loss_clip": 0.01284952, + "auxiliary_loss_mlp": 0.01052369, + "balance_loss_clip": 1.07874906, + "balance_loss_mlp": 1.03726482, + "epoch": 0.062045331569770934, + "flos": 25919927107200.0, + "grad_norm": 2.1163998825912307, + "language_loss": 0.86399448, + "learning_rate": 3.989275968386749e-06, + "loss": 0.88736761, + "num_input_tokens_seen": 11059955, + "step": 516, + "time_per_iteration": 2.5696825981140137 + }, + { + "auxiliary_loss_clip": 0.01244653, + "auxiliary_loss_mlp": 0.01053602, + "balance_loss_clip": 1.07081389, + "balance_loss_mlp": 1.03796196, + "epoch": 0.06216557446041003, + "flos": 28110621686400.0, + "grad_norm": 2.937226241389715, + "language_loss": 0.76394022, + "learning_rate": 3.989195257821926e-06, + "loss": 0.78692281, + "num_input_tokens_seen": 11078440, + "step": 517, + "time_per_iteration": 2.6372673511505127 + }, + { + "auxiliary_loss_clip": 0.01249354, + "auxiliary_loss_mlp": 0.0105218, + "balance_loss_clip": 1.07677817, + "balance_loss_mlp": 1.03693271, + "epoch": 0.06228581735104912, + "flos": 23478181395840.0, + "grad_norm": 2.174227028428246, + "language_loss": 0.84479153, + "learning_rate": 3.989114245497765e-06, + "loss": 0.86780685, + "num_input_tokens_seen": 11098240, + "step": 518, + "time_per_iteration": 2.6220695972442627 + }, + { + "auxiliary_loss_clip": 0.01266368, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.07011247, + "balance_loss_mlp": 1.03116083, + "epoch": 0.06240606024168821, + "flos": 15195205975680.0, + "grad_norm": 1.999850191138751, + "language_loss": 0.9485203, + "learning_rate": 3.989032931426554e-06, + "loss": 0.97163522, + "num_input_tokens_seen": 11115395, + "step": 519, + "time_per_iteration": 2.5469882488250732 + }, + { + "auxiliary_loss_clip": 0.01242359, + "auxiliary_loss_mlp": 0.01048599, + "balance_loss_clip": 1.07131505, + "balance_loss_mlp": 1.03395426, + "epoch": 0.06252630313232731, + "flos": 20631829910400.0, + "grad_norm": 2.480408347412726, + "language_loss": 0.87056077, + "learning_rate": 3.9889513156206295e-06, + "loss": 0.89347041, + "num_input_tokens_seen": 11134835, + "step": 520, + "time_per_iteration": 2.588503122329712 + }, + { + "auxiliary_loss_clip": 0.0123943, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_clip": 1.0725863, + "balance_loss_mlp": 1.03511035, + "epoch": 0.06264654602296639, + "flos": 20778058177920.0, + "grad_norm": 2.691123443069599, + "language_loss": 0.7333951, + "learning_rate": 3.988869398092371e-06, + "loss": 0.75629532, + "num_input_tokens_seen": 11154745, + "step": 521, + "time_per_iteration": 2.6066815853118896 + }, + { + "auxiliary_loss_clip": 0.01250896, + "auxiliary_loss_mlp": 0.01048677, + "balance_loss_clip": 1.07427764, + "balance_loss_mlp": 1.03376961, + "epoch": 0.06276678891360549, + "flos": 29605798241280.0, + "grad_norm": 2.132968272031907, + "language_loss": 0.79316318, + "learning_rate": 3.988787178854206e-06, + "loss": 0.81615889, + "num_input_tokens_seen": 11174280, + "step": 522, + "time_per_iteration": 2.6430771350860596 + }, + { + "auxiliary_loss_clip": 0.01281796, + "auxiliary_loss_mlp": 0.01047678, + "balance_loss_clip": 1.07795227, + "balance_loss_mlp": 1.03304529, + "epoch": 0.06288703180424457, + "flos": 22126288193280.0, + "grad_norm": 2.1655606786726693, + "language_loss": 0.87630653, + "learning_rate": 3.988704657918608e-06, + "loss": 0.89960122, + "num_input_tokens_seen": 11193340, + "step": 523, + "time_per_iteration": 2.523374080657959 + }, + { + "auxiliary_loss_clip": 0.01264429, + "auxiliary_loss_mlp": 0.0105499, + "balance_loss_clip": 1.0777669, + "balance_loss_mlp": 1.04097688, + "epoch": 0.06300727469488367, + "flos": 14976689587200.0, + "grad_norm": 3.3428038352648763, + "language_loss": 0.79416084, + "learning_rate": 3.988621835298094e-06, + "loss": 0.81735498, + "num_input_tokens_seen": 11210555, + "step": 524, + "time_per_iteration": 2.637953042984009 + }, + { + "auxiliary_loss_clip": 0.01279006, + "auxiliary_loss_mlp": 0.01048648, + "balance_loss_clip": 1.07856989, + "balance_loss_mlp": 1.03454566, + "epoch": 0.06312751758552275, + "flos": 24535391420160.0, + "grad_norm": 1.9277079631097673, + "language_loss": 0.91820008, + "learning_rate": 3.988538711005229e-06, + "loss": 0.94147658, + "num_input_tokens_seen": 11230010, + "step": 525, + "time_per_iteration": 2.531520128250122 + }, + { + "auxiliary_loss_clip": 0.01258024, + "auxiliary_loss_mlp": 0.01045121, + "balance_loss_clip": 1.07437611, + "balance_loss_mlp": 1.03191888, + "epoch": 0.06324776047616185, + "flos": 21507008785920.0, + "grad_norm": 2.3449037616007558, + "language_loss": 0.88559568, + "learning_rate": 3.988455285052622e-06, + "loss": 0.90862715, + "num_input_tokens_seen": 11246190, + "step": 526, + "time_per_iteration": 2.549417734146118 + }, + { + "auxiliary_loss_clip": 0.01259674, + "auxiliary_loss_mlp": 0.0105059, + "balance_loss_clip": 1.07579994, + "balance_loss_mlp": 1.03631437, + "epoch": 0.06336800336680094, + "flos": 21688034353920.0, + "grad_norm": 2.016569022935123, + "language_loss": 0.84146541, + "learning_rate": 3.98837155745293e-06, + "loss": 0.864568, + "num_input_tokens_seen": 11264230, + "step": 527, + "time_per_iteration": 3.276167154312134 + }, + { + "auxiliary_loss_clip": 0.01264863, + "auxiliary_loss_mlp": 0.01044397, + "balance_loss_clip": 1.07788789, + "balance_loss_mlp": 1.02978158, + "epoch": 0.06348824625744003, + "flos": 19500895221120.0, + "grad_norm": 2.27613616477299, + "language_loss": 0.75995594, + "learning_rate": 3.988287528218854e-06, + "loss": 0.78304857, + "num_input_tokens_seen": 11283015, + "step": 528, + "time_per_iteration": 3.3132426738739014 + }, + { + "auxiliary_loss_clip": 0.01260404, + "auxiliary_loss_mlp": 0.01045809, + "balance_loss_clip": 1.07694769, + "balance_loss_mlp": 1.03286314, + "epoch": 0.06360848914807912, + "flos": 15481233976320.0, + "grad_norm": 2.072176348938091, + "language_loss": 0.90792084, + "learning_rate": 3.98820319736314e-06, + "loss": 0.93098295, + "num_input_tokens_seen": 11299630, + "step": 529, + "time_per_iteration": 3.3120808601379395 + }, + { + "auxiliary_loss_clip": 0.01230096, + "auxiliary_loss_mlp": 0.01048167, + "balance_loss_clip": 1.06697452, + "balance_loss_mlp": 1.03413606, + "epoch": 0.0637287320387182, + "flos": 20593369422720.0, + "grad_norm": 1.7508227475370186, + "language_loss": 0.85564196, + "learning_rate": 3.988118564898582e-06, + "loss": 0.87842453, + "num_input_tokens_seen": 11319170, + "step": 530, + "time_per_iteration": 3.522829294204712 + }, + { + "auxiliary_loss_clip": 0.01223561, + "auxiliary_loss_mlp": 0.00766983, + "balance_loss_clip": 1.07106614, + "balance_loss_mlp": 1.00005126, + "epoch": 0.0638489749293573, + "flos": 17412222245760.0, + "grad_norm": 2.6019235698645597, + "language_loss": 0.89125049, + "learning_rate": 3.988033630838019e-06, + "loss": 0.91115594, + "num_input_tokens_seen": 11333210, + "step": 531, + "time_per_iteration": 2.596247911453247 + }, + { + "auxiliary_loss_clip": 0.01262959, + "auxiliary_loss_mlp": 0.01056195, + "balance_loss_clip": 1.0753665, + "balance_loss_mlp": 1.04242027, + "epoch": 0.0639692178199964, + "flos": 23807661874560.0, + "grad_norm": 1.7259890662811201, + "language_loss": 0.88038683, + "learning_rate": 3.987948395194334e-06, + "loss": 0.9035784, + "num_input_tokens_seen": 11355590, + "step": 532, + "time_per_iteration": 2.60113787651062 + }, + { + "auxiliary_loss_clip": 0.01253628, + "auxiliary_loss_mlp": 0.01055858, + "balance_loss_clip": 1.07104838, + "balance_loss_mlp": 1.04213154, + "epoch": 0.06408946071063548, + "flos": 18477225521280.0, + "grad_norm": 2.181430518949391, + "language_loss": 0.76723611, + "learning_rate": 3.987862857980458e-06, + "loss": 0.79033101, + "num_input_tokens_seen": 11371535, + "step": 533, + "time_per_iteration": 2.5269105434417725 + }, + { + "auxiliary_loss_clip": 0.0122978, + "auxiliary_loss_mlp": 0.01047241, + "balance_loss_clip": 1.06969643, + "balance_loss_mlp": 1.03280461, + "epoch": 0.06420970360127458, + "flos": 27162220936320.0, + "grad_norm": 3.4444429775789223, + "language_loss": 0.7680499, + "learning_rate": 3.987777019209368e-06, + "loss": 0.79082006, + "num_input_tokens_seen": 11392050, + "step": 534, + "time_per_iteration": 2.676609516143799 + }, + { + "auxiliary_loss_clip": 0.01281578, + "auxiliary_loss_mlp": 0.01040451, + "balance_loss_clip": 1.07916582, + "balance_loss_mlp": 1.02680707, + "epoch": 0.06432994649191366, + "flos": 23659673840640.0, + "grad_norm": 1.885192362499125, + "language_loss": 0.81273139, + "learning_rate": 3.987690878894084e-06, + "loss": 0.83595169, + "num_input_tokens_seen": 11411765, + "step": 535, + "time_per_iteration": 2.5245308876037598 + }, + { + "auxiliary_loss_clip": 0.01250636, + "auxiliary_loss_mlp": 0.01037353, + "balance_loss_clip": 1.07367468, + "balance_loss_mlp": 1.023, + "epoch": 0.06445018938255276, + "flos": 23403953940480.0, + "grad_norm": 3.3719943401864523, + "language_loss": 0.85190439, + "learning_rate": 3.987604437047673e-06, + "loss": 0.87478423, + "num_input_tokens_seen": 11431565, + "step": 536, + "time_per_iteration": 2.5835015773773193 + }, + { + "auxiliary_loss_clip": 0.01258563, + "auxiliary_loss_mlp": 0.01044374, + "balance_loss_clip": 1.07361615, + "balance_loss_mlp": 1.03076005, + "epoch": 0.06457043227319184, + "flos": 19646692525440.0, + "grad_norm": 2.5851956248597916, + "language_loss": 0.77534115, + "learning_rate": 3.987517693683251e-06, + "loss": 0.79837048, + "num_input_tokens_seen": 11450140, + "step": 537, + "time_per_iteration": 2.545957088470459 + }, + { + "auxiliary_loss_clip": 0.01246268, + "auxiliary_loss_mlp": 0.01057002, + "balance_loss_clip": 1.07612455, + "balance_loss_mlp": 1.04266143, + "epoch": 0.06469067516383094, + "flos": 16978744915200.0, + "grad_norm": 4.200448366512321, + "language_loss": 0.96094352, + "learning_rate": 3.9874306488139745e-06, + "loss": 0.98397613, + "num_input_tokens_seen": 11465400, + "step": 538, + "time_per_iteration": 2.542083263397217 + }, + { + "auxiliary_loss_clip": 0.01225929, + "auxiliary_loss_mlp": 0.01047455, + "balance_loss_clip": 1.07089329, + "balance_loss_mlp": 1.03334045, + "epoch": 0.06481091805447003, + "flos": 23296401642240.0, + "grad_norm": 2.0707413921384408, + "language_loss": 0.87717485, + "learning_rate": 3.987343302453049e-06, + "loss": 0.89990866, + "num_input_tokens_seen": 11486675, + "step": 539, + "time_per_iteration": 2.6370670795440674 + }, + { + "auxiliary_loss_clip": 0.01244343, + "auxiliary_loss_mlp": 0.01049903, + "balance_loss_clip": 1.07429671, + "balance_loss_mlp": 1.03569365, + "epoch": 0.06493116094510912, + "flos": 29172356824320.0, + "grad_norm": 1.86592847795136, + "language_loss": 0.82603812, + "learning_rate": 3.987255654613724e-06, + "loss": 0.84898055, + "num_input_tokens_seen": 11510440, + "step": 540, + "time_per_iteration": 2.668349027633667 + }, + { + "auxiliary_loss_clip": 0.01223454, + "auxiliary_loss_mlp": 0.01046393, + "balance_loss_clip": 1.06828475, + "balance_loss_mlp": 1.03256488, + "epoch": 0.06505140383574821, + "flos": 19865065259520.0, + "grad_norm": 2.1510318073473313, + "language_loss": 0.70003158, + "learning_rate": 3.987167705309296e-06, + "loss": 0.72273004, + "num_input_tokens_seen": 11529715, + "step": 541, + "time_per_iteration": 2.6080195903778076 + }, + { + "auxiliary_loss_clip": 0.01262767, + "auxiliary_loss_mlp": 0.00766071, + "balance_loss_clip": 1.07512736, + "balance_loss_mlp": 1.00018382, + "epoch": 0.0651716467263873, + "flos": 17924703540480.0, + "grad_norm": 2.0855670003236133, + "language_loss": 0.95461667, + "learning_rate": 3.987079454553108e-06, + "loss": 0.97490507, + "num_input_tokens_seen": 11547665, + "step": 542, + "time_per_iteration": 2.5343332290649414 + }, + { + "auxiliary_loss_clip": 0.01226471, + "auxiliary_loss_mlp": 0.01044329, + "balance_loss_clip": 1.07240462, + "balance_loss_mlp": 1.03072691, + "epoch": 0.0652918896170264, + "flos": 20842840356480.0, + "grad_norm": 1.8339175151330724, + "language_loss": 0.91140485, + "learning_rate": 3.986990902358546e-06, + "loss": 0.93411279, + "num_input_tokens_seen": 11564605, + "step": 543, + "time_per_iteration": 2.616152763366699 + }, + { + "auxiliary_loss_clip": 0.01262039, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.07387853, + "balance_loss_mlp": 1.0361712, + "epoch": 0.06541213250766549, + "flos": 21872507627520.0, + "grad_norm": 2.3364183511650864, + "language_loss": 0.93024445, + "learning_rate": 3.986902048739045e-06, + "loss": 0.9533658, + "num_input_tokens_seen": 11584550, + "step": 544, + "time_per_iteration": 2.553133726119995 + }, + { + "auxiliary_loss_clip": 0.01247142, + "auxiliary_loss_mlp": 0.01052495, + "balance_loss_clip": 1.07392597, + "balance_loss_mlp": 1.03751016, + "epoch": 0.06553237539830457, + "flos": 23110743219840.0, + "grad_norm": 2.7575017319100983, + "language_loss": 0.80062878, + "learning_rate": 3.986812893708082e-06, + "loss": 0.82362509, + "num_input_tokens_seen": 11600740, + "step": 545, + "time_per_iteration": 2.612067699432373 + }, + { + "auxiliary_loss_clip": 0.01244858, + "auxiliary_loss_mlp": 0.01051557, + "balance_loss_clip": 1.07021284, + "balance_loss_mlp": 1.03672695, + "epoch": 0.06565261828894367, + "flos": 17923769786880.0, + "grad_norm": 2.2202509878486736, + "language_loss": 0.81673646, + "learning_rate": 3.9867234372791826e-06, + "loss": 0.83970064, + "num_input_tokens_seen": 11618695, + "step": 546, + "time_per_iteration": 2.5827808380126953 + }, + { + "auxiliary_loss_clip": 0.01257309, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.07342386, + "balance_loss_mlp": 1.03148067, + "epoch": 0.06577286117958275, + "flos": 22783058421120.0, + "grad_norm": 1.8780021614191618, + "language_loss": 0.87131679, + "learning_rate": 3.986633679465918e-06, + "loss": 0.89434421, + "num_input_tokens_seen": 11638850, + "step": 547, + "time_per_iteration": 2.585137367248535 + }, + { + "auxiliary_loss_clip": 0.01214552, + "auxiliary_loss_mlp": 0.01051663, + "balance_loss_clip": 1.07103539, + "balance_loss_mlp": 1.03785896, + "epoch": 0.06589310407022185, + "flos": 23696194993920.0, + "grad_norm": 2.930574689007103, + "language_loss": 0.80597746, + "learning_rate": 3.986543620281904e-06, + "loss": 0.82863963, + "num_input_tokens_seen": 11658500, + "step": 548, + "time_per_iteration": 2.6810150146484375 + }, + { + "auxiliary_loss_clip": 0.01225404, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.06738496, + "balance_loss_mlp": 1.02147758, + "epoch": 0.06601334696086093, + "flos": 26864772410880.0, + "grad_norm": 1.8110082584409353, + "language_loss": 0.91071099, + "learning_rate": 3.986453259740802e-06, + "loss": 0.93331927, + "num_input_tokens_seen": 11676670, + "step": 549, + "time_per_iteration": 2.633110284805298 + }, + { + "auxiliary_loss_clip": 0.01243045, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.07658756, + "balance_loss_mlp": 1.03288758, + "epoch": 0.06613358985150003, + "flos": 12567694101120.0, + "grad_norm": 2.9496046330633394, + "language_loss": 0.7910217, + "learning_rate": 3.986362597856319e-06, + "loss": 0.81392086, + "num_input_tokens_seen": 11693170, + "step": 550, + "time_per_iteration": 2.5648274421691895 + }, + { + "auxiliary_loss_clip": 0.01240508, + "auxiliary_loss_mlp": 0.00767502, + "balance_loss_clip": 1.07033682, + "balance_loss_mlp": 1.00026834, + "epoch": 0.06625383274213913, + "flos": 18332505624960.0, + "grad_norm": 2.3328763525652074, + "language_loss": 0.81396592, + "learning_rate": 3.986271634642211e-06, + "loss": 0.83404601, + "num_input_tokens_seen": 11710150, + "step": 551, + "time_per_iteration": 2.592482566833496 + }, + { + "auxiliary_loss_clip": 0.01273292, + "auxiliary_loss_mlp": 0.0104707, + "balance_loss_clip": 1.07646179, + "balance_loss_mlp": 1.03255057, + "epoch": 0.06637407563277821, + "flos": 15375585098880.0, + "grad_norm": 2.1826992901561875, + "language_loss": 0.81742251, + "learning_rate": 3.986180370112274e-06, + "loss": 0.84062612, + "num_input_tokens_seen": 11726670, + "step": 552, + "time_per_iteration": 2.4661827087402344 + }, + { + "auxiliary_loss_clip": 0.01260618, + "auxiliary_loss_mlp": 0.007673, + "balance_loss_clip": 1.07538533, + "balance_loss_mlp": 1.0002594, + "epoch": 0.0664943185234173, + "flos": 24025244509440.0, + "grad_norm": 1.9926933822402197, + "language_loss": 0.74451935, + "learning_rate": 3.986088804280354e-06, + "loss": 0.76479852, + "num_input_tokens_seen": 11746400, + "step": 553, + "time_per_iteration": 3.360436201095581 + }, + { + "auxiliary_loss_clip": 0.01245805, + "auxiliary_loss_mlp": 0.0104812, + "balance_loss_clip": 1.07392716, + "balance_loss_mlp": 1.03342152, + "epoch": 0.06661456141405639, + "flos": 20957503547520.0, + "grad_norm": 2.1738004473362333, + "language_loss": 0.93925273, + "learning_rate": 3.985996937160342e-06, + "loss": 0.96219194, + "num_input_tokens_seen": 11765590, + "step": 554, + "time_per_iteration": 2.581338882446289 + }, + { + "auxiliary_loss_clip": 0.0125303, + "auxiliary_loss_mlp": 0.01050195, + "balance_loss_clip": 1.07229364, + "balance_loss_mlp": 1.03631866, + "epoch": 0.06673480430469549, + "flos": 52223953322880.0, + "grad_norm": 2.418364541997693, + "language_loss": 0.68797129, + "learning_rate": 3.985904768766173e-06, + "loss": 0.71100354, + "num_input_tokens_seen": 11788365, + "step": 555, + "time_per_iteration": 3.618574380874634 + }, + { + "auxiliary_loss_clip": 0.01229765, + "auxiliary_loss_mlp": 0.01046927, + "balance_loss_clip": 1.07037377, + "balance_loss_mlp": 1.03229427, + "epoch": 0.06685504719533458, + "flos": 16217079995520.0, + "grad_norm": 2.336085897758626, + "language_loss": 0.76281738, + "learning_rate": 3.98581229911183e-06, + "loss": 0.78558433, + "num_input_tokens_seen": 11807285, + "step": 556, + "time_per_iteration": 3.4054975509643555 + }, + { + "auxiliary_loss_clip": 0.0126, + "auxiliary_loss_mlp": 0.01043449, + "balance_loss_clip": 1.07183361, + "balance_loss_mlp": 1.02906013, + "epoch": 0.06697529008597367, + "flos": 22491535639680.0, + "grad_norm": 1.6715079088386338, + "language_loss": 0.92027056, + "learning_rate": 3.985719528211341e-06, + "loss": 0.94330502, + "num_input_tokens_seen": 11826655, + "step": 557, + "time_per_iteration": 3.425886631011963 + }, + { + "auxiliary_loss_clip": 0.01164098, + "auxiliary_loss_mlp": 0.01005779, + "balance_loss_clip": 1.05518031, + "balance_loss_mlp": 1.00043797, + "epoch": 0.06709553297661276, + "flos": 62688216936960.0, + "grad_norm": 0.8426575856637201, + "language_loss": 0.6298762, + "learning_rate": 3.985626456078777e-06, + "loss": 0.65157503, + "num_input_tokens_seen": 11891310, + "step": 558, + "time_per_iteration": 3.209629774093628 + }, + { + "auxiliary_loss_clip": 0.01231266, + "auxiliary_loss_mlp": 0.01045569, + "balance_loss_clip": 1.0716784, + "balance_loss_mlp": 1.03147233, + "epoch": 0.06721577586725185, + "flos": 11216590997760.0, + "grad_norm": 2.2691223932057274, + "language_loss": 0.86120021, + "learning_rate": 3.985533082728259e-06, + "loss": 0.88396859, + "num_input_tokens_seen": 11906965, + "step": 559, + "time_per_iteration": 2.5809545516967773 + }, + { + "auxiliary_loss_clip": 0.01280003, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.07721162, + "balance_loss_mlp": 1.02418017, + "epoch": 0.06733601875789094, + "flos": 25922189664000.0, + "grad_norm": 1.7248760863800816, + "language_loss": 0.7448777, + "learning_rate": 3.985439408173951e-06, + "loss": 0.76806247, + "num_input_tokens_seen": 11927190, + "step": 560, + "time_per_iteration": 2.5479140281677246 + }, + { + "auxiliary_loss_clip": 0.01278976, + "auxiliary_loss_mlp": 0.01056547, + "balance_loss_clip": 1.07902932, + "balance_loss_mlp": 1.04196763, + "epoch": 0.06745626164853002, + "flos": 20813645577600.0, + "grad_norm": 2.0444860398414146, + "language_loss": 0.7063567, + "learning_rate": 3.9853454324300634e-06, + "loss": 0.72971195, + "num_input_tokens_seen": 11946400, + "step": 561, + "time_per_iteration": 2.501147747039795 + }, + { + "auxiliary_loss_clip": 0.01202719, + "auxiliary_loss_mlp": 0.01040146, + "balance_loss_clip": 1.06616867, + "balance_loss_mlp": 1.02492917, + "epoch": 0.06757650453916912, + "flos": 19829262378240.0, + "grad_norm": 1.8461667113087885, + "language_loss": 0.77804077, + "learning_rate": 3.985251155510852e-06, + "loss": 0.80046934, + "num_input_tokens_seen": 11965430, + "step": 562, + "time_per_iteration": 2.693840742111206 + }, + { + "auxiliary_loss_clip": 0.01213208, + "auxiliary_loss_mlp": 0.0104203, + "balance_loss_clip": 1.07302928, + "balance_loss_mlp": 1.02730775, + "epoch": 0.06769674742980822, + "flos": 25739224761600.0, + "grad_norm": 1.8001761945618686, + "language_loss": 0.80315232, + "learning_rate": 3.98515657743062e-06, + "loss": 0.82570469, + "num_input_tokens_seen": 11984895, + "step": 563, + "time_per_iteration": 2.7494289875030518 + }, + { + "auxiliary_loss_clip": 0.01237123, + "auxiliary_loss_mlp": 0.01049516, + "balance_loss_clip": 1.06792533, + "balance_loss_mlp": 1.03565788, + "epoch": 0.0678169903204473, + "flos": 13074788355840.0, + "grad_norm": 1.856358641793355, + "language_loss": 0.77271307, + "learning_rate": 3.985061698203711e-06, + "loss": 0.79557943, + "num_input_tokens_seen": 12002010, + "step": 564, + "time_per_iteration": 2.550980567932129 + }, + { + "auxiliary_loss_clip": 0.01178668, + "auxiliary_loss_mlp": 0.0100528, + "balance_loss_clip": 1.05027533, + "balance_loss_mlp": 1.00027299, + "epoch": 0.0679372332110864, + "flos": 70865830788480.0, + "grad_norm": 0.8851225780516029, + "language_loss": 0.63848746, + "learning_rate": 3.984966517844523e-06, + "loss": 0.66032696, + "num_input_tokens_seen": 12057255, + "step": 565, + "time_per_iteration": 3.0447895526885986 + }, + { + "auxiliary_loss_clip": 0.01275864, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_clip": 1.07559872, + "balance_loss_mlp": 1.0380249, + "epoch": 0.06805747610172548, + "flos": 28256418990720.0, + "grad_norm": 2.253362606550451, + "language_loss": 0.80515397, + "learning_rate": 3.984871036367492e-06, + "loss": 0.8284362, + "num_input_tokens_seen": 12077280, + "step": 566, + "time_per_iteration": 2.766516923904419 + }, + { + "auxiliary_loss_clip": 0.01257085, + "auxiliary_loss_mlp": 0.00766463, + "balance_loss_clip": 1.0752207, + "balance_loss_mlp": 1.00041521, + "epoch": 0.06817771899236458, + "flos": 20120533764480.0, + "grad_norm": 1.8954592620839301, + "language_loss": 0.83065045, + "learning_rate": 3.984775253787102e-06, + "loss": 0.85088593, + "num_input_tokens_seen": 12095570, + "step": 567, + "time_per_iteration": 2.5330657958984375 + }, + { + "auxiliary_loss_clip": 0.01261805, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.0724442, + "balance_loss_mlp": 1.02826858, + "epoch": 0.06829796188300366, + "flos": 17930629284480.0, + "grad_norm": 2.7510284895791455, + "language_loss": 0.8819592, + "learning_rate": 3.984679170117885e-06, + "loss": 0.90499967, + "num_input_tokens_seen": 12111775, + "step": 568, + "time_per_iteration": 2.542421340942383 + }, + { + "auxiliary_loss_clip": 0.01253011, + "auxiliary_loss_mlp": 0.0104087, + "balance_loss_clip": 1.07031894, + "balance_loss_mlp": 1.02540815, + "epoch": 0.06841820477364276, + "flos": 14501627285760.0, + "grad_norm": 2.423820637423019, + "language_loss": 0.78290331, + "learning_rate": 3.984582785374415e-06, + "loss": 0.8058421, + "num_input_tokens_seen": 12129215, + "step": 569, + "time_per_iteration": 2.527053117752075 + }, + { + "auxiliary_loss_clip": 0.01239482, + "auxiliary_loss_mlp": 0.00766792, + "balance_loss_clip": 1.07177222, + "balance_loss_mlp": 1.00019956, + "epoch": 0.06853844766428185, + "flos": 21938474954880.0, + "grad_norm": 5.7044739646428875, + "language_loss": 0.80579215, + "learning_rate": 3.9844860995713155e-06, + "loss": 0.8258549, + "num_input_tokens_seen": 12148755, + "step": 570, + "time_per_iteration": 2.64744234085083 + }, + { + "auxiliary_loss_clip": 0.01256853, + "auxiliary_loss_mlp": 0.01039939, + "balance_loss_clip": 1.07793951, + "balance_loss_mlp": 1.02626574, + "epoch": 0.06865869055492094, + "flos": 16800628348800.0, + "grad_norm": 2.464584010030269, + "language_loss": 0.82536882, + "learning_rate": 3.9843891127232524e-06, + "loss": 0.8483367, + "num_input_tokens_seen": 12166290, + "step": 571, + "time_per_iteration": 2.5278453826904297 + }, + { + "auxiliary_loss_clip": 0.01193337, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.06144094, + "balance_loss_mlp": 1.03116071, + "epoch": 0.06877893344556003, + "flos": 19937281553280.0, + "grad_norm": 2.4210927831000277, + "language_loss": 0.67257863, + "learning_rate": 3.984291824844938e-06, + "loss": 0.6949681, + "num_input_tokens_seen": 12181385, + "step": 572, + "time_per_iteration": 2.6662511825561523 + }, + { + "auxiliary_loss_clip": 0.0127183, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.0736835, + "balance_loss_mlp": 1.03384399, + "epoch": 0.06889917633619912, + "flos": 23039388852480.0, + "grad_norm": 3.206335233540511, + "language_loss": 0.8472259, + "learning_rate": 3.984194235951132e-06, + "loss": 0.8704288, + "num_input_tokens_seen": 12197530, + "step": 573, + "time_per_iteration": 2.506444215774536 + }, + { + "auxiliary_loss_clip": 0.01277753, + "auxiliary_loss_mlp": 0.01057301, + "balance_loss_clip": 1.07986724, + "balance_loss_mlp": 1.043872, + "epoch": 0.06901941922683821, + "flos": 20960556203520.0, + "grad_norm": 2.372201970160832, + "language_loss": 0.8445226, + "learning_rate": 3.9840963460566375e-06, + "loss": 0.86787319, + "num_input_tokens_seen": 12216310, + "step": 574, + "time_per_iteration": 2.5472681522369385 + }, + { + "auxiliary_loss_clip": 0.01180612, + "auxiliary_loss_mlp": 0.0104347, + "balance_loss_clip": 1.06326449, + "balance_loss_mlp": 1.02939749, + "epoch": 0.06913966211747731, + "flos": 24821850384000.0, + "grad_norm": 2.3633861283945534, + "language_loss": 0.89548922, + "learning_rate": 3.983998155176305e-06, + "loss": 0.91773003, + "num_input_tokens_seen": 12236670, + "step": 575, + "time_per_iteration": 2.768602132797241 + }, + { + "auxiliary_loss_clip": 0.01172061, + "auxiliary_loss_mlp": 0.010099, + "balance_loss_clip": 1.04613316, + "balance_loss_mlp": 1.00467837, + "epoch": 0.06925990500811639, + "flos": 58367446957440.0, + "grad_norm": 0.8204492604379559, + "language_loss": 0.5702244, + "learning_rate": 3.9838996633250305e-06, + "loss": 0.592044, + "num_input_tokens_seen": 12297185, + "step": 576, + "time_per_iteration": 3.042463779449463 + }, + { + "auxiliary_loss_clip": 0.01256716, + "auxiliary_loss_mlp": 0.01047223, + "balance_loss_clip": 1.07180572, + "balance_loss_mlp": 1.0342052, + "epoch": 0.06938014789875549, + "flos": 12749940731520.0, + "grad_norm": 2.155547208318768, + "language_loss": 0.88177574, + "learning_rate": 3.983800870517753e-06, + "loss": 0.90481508, + "num_input_tokens_seen": 12313975, + "step": 577, + "time_per_iteration": 2.525132656097412 + }, + { + "auxiliary_loss_clip": 0.01254257, + "auxiliary_loss_mlp": 0.0104924, + "balance_loss_clip": 1.07680428, + "balance_loss_mlp": 1.03659844, + "epoch": 0.06950039078939457, + "flos": 22820226019200.0, + "grad_norm": 3.592766453103535, + "language_loss": 0.78249228, + "learning_rate": 3.983701776769463e-06, + "loss": 0.80552727, + "num_input_tokens_seen": 12331385, + "step": 578, + "time_per_iteration": 2.5824599266052246 + }, + { + "auxiliary_loss_clip": 0.01249524, + "auxiliary_loss_mlp": 0.01045758, + "balance_loss_clip": 1.07382369, + "balance_loss_mlp": 1.03166199, + "epoch": 0.06962063368003367, + "flos": 21941348042880.0, + "grad_norm": 1.971366564308807, + "language_loss": 0.85804611, + "learning_rate": 3.9836023820951885e-06, + "loss": 0.88099897, + "num_input_tokens_seen": 12350600, + "step": 579, + "time_per_iteration": 2.550511121749878 + }, + { + "auxiliary_loss_clip": 0.0121792, + "auxiliary_loss_mlp": 0.01049451, + "balance_loss_clip": 1.0646385, + "balance_loss_mlp": 1.03656483, + "epoch": 0.06974087657067275, + "flos": 20706021452160.0, + "grad_norm": 2.3745944518949065, + "language_loss": 0.68461537, + "learning_rate": 3.983502686510011e-06, + "loss": 0.70728904, + "num_input_tokens_seen": 12371430, + "step": 580, + "time_per_iteration": 3.4523305892944336 + }, + { + "auxiliary_loss_clip": 0.01257042, + "auxiliary_loss_mlp": 0.00766245, + "balance_loss_clip": 1.0699327, + "balance_loss_mlp": 1.00032485, + "epoch": 0.06986111946131185, + "flos": 22638230784000.0, + "grad_norm": 2.0588409033940973, + "language_loss": 0.7346034, + "learning_rate": 3.9834026900290525e-06, + "loss": 0.75483632, + "num_input_tokens_seen": 12390825, + "step": 581, + "time_per_iteration": 2.5729331970214844 + }, + { + "auxiliary_loss_clip": 0.01271513, + "auxiliary_loss_mlp": 0.01045611, + "balance_loss_clip": 1.07424843, + "balance_loss_mlp": 1.03228903, + "epoch": 0.06998136235195095, + "flos": 26943453152640.0, + "grad_norm": 2.375267702689081, + "language_loss": 1.00228882, + "learning_rate": 3.983302392667482e-06, + "loss": 1.02546, + "num_input_tokens_seen": 12411670, + "step": 582, + "time_per_iteration": 3.3593664169311523 + }, + { + "auxiliary_loss_clip": 0.01254701, + "auxiliary_loss_mlp": 0.01045346, + "balance_loss_clip": 1.07530975, + "balance_loss_mlp": 1.03180957, + "epoch": 0.07010160524259003, + "flos": 22492505306880.0, + "grad_norm": 2.0487303018912497, + "language_loss": 0.93556327, + "learning_rate": 3.983201794440517e-06, + "loss": 0.95856369, + "num_input_tokens_seen": 12431245, + "step": 583, + "time_per_iteration": 3.46026873588562 + }, + { + "auxiliary_loss_clip": 0.01230422, + "auxiliary_loss_mlp": 0.0104053, + "balance_loss_clip": 1.07131767, + "balance_loss_mlp": 1.02731586, + "epoch": 0.07022184813322913, + "flos": 18332541538560.0, + "grad_norm": 1.7581052408777442, + "language_loss": 0.67342103, + "learning_rate": 3.9831008953634165e-06, + "loss": 0.69613057, + "num_input_tokens_seen": 12450535, + "step": 584, + "time_per_iteration": 2.5757226943969727 + }, + { + "auxiliary_loss_clip": 0.01188892, + "auxiliary_loss_mlp": 0.01046018, + "balance_loss_clip": 1.06086266, + "balance_loss_mlp": 1.03171897, + "epoch": 0.07034209102386821, + "flos": 24675550289280.0, + "grad_norm": 4.298322309658396, + "language_loss": 0.8120147, + "learning_rate": 3.9829996954514864e-06, + "loss": 0.83436382, + "num_input_tokens_seen": 12469675, + "step": 585, + "time_per_iteration": 2.682492256164551 + }, + { + "auxiliary_loss_clip": 0.01244815, + "auxiliary_loss_mlp": 0.01044506, + "balance_loss_clip": 1.0708642, + "balance_loss_mlp": 1.02999866, + "epoch": 0.0704623339145073, + "flos": 25995878415360.0, + "grad_norm": 2.064938116724065, + "language_loss": 0.84089768, + "learning_rate": 3.982898194720079e-06, + "loss": 0.86379087, + "num_input_tokens_seen": 12490405, + "step": 586, + "time_per_iteration": 2.60019850730896 + }, + { + "auxiliary_loss_clip": 0.01235518, + "auxiliary_loss_mlp": 0.00766772, + "balance_loss_clip": 1.07304692, + "balance_loss_mlp": 1.00034738, + "epoch": 0.0705825768051464, + "flos": 25338318088320.0, + "grad_norm": 2.2840348070558725, + "language_loss": 0.82299864, + "learning_rate": 3.982796393184592e-06, + "loss": 0.84302145, + "num_input_tokens_seen": 12509485, + "step": 587, + "time_per_iteration": 2.6180150508880615 + }, + { + "auxiliary_loss_clip": 0.01150874, + "auxiliary_loss_mlp": 0.01008601, + "balance_loss_clip": 1.03844261, + "balance_loss_mlp": 1.00304556, + "epoch": 0.07070281969578548, + "flos": 66047552507520.0, + "grad_norm": 0.7957432733644473, + "language_loss": 0.62667608, + "learning_rate": 3.98269429086047e-06, + "loss": 0.64827085, + "num_input_tokens_seen": 12567325, + "step": 588, + "time_per_iteration": 3.0329325199127197 + }, + { + "auxiliary_loss_clip": 0.01227253, + "auxiliary_loss_mlp": 0.01049449, + "balance_loss_clip": 1.07003927, + "balance_loss_mlp": 1.03529942, + "epoch": 0.07082306258642458, + "flos": 23653568528640.0, + "grad_norm": 2.6908160301059123, + "language_loss": 0.86417818, + "learning_rate": 3.982591887763199e-06, + "loss": 0.88694525, + "num_input_tokens_seen": 12584785, + "step": 589, + "time_per_iteration": 2.585711717605591 + }, + { + "auxiliary_loss_clip": 0.01200766, + "auxiliary_loss_mlp": 0.0104405, + "balance_loss_clip": 1.05997133, + "balance_loss_mlp": 1.02946448, + "epoch": 0.07094330547706366, + "flos": 13880049408000.0, + "grad_norm": 4.654211697095614, + "language_loss": 0.81554866, + "learning_rate": 3.982489183908316e-06, + "loss": 0.83799684, + "num_input_tokens_seen": 12601205, + "step": 590, + "time_per_iteration": 2.5951669216156006 + }, + { + "auxiliary_loss_clip": 0.01162197, + "auxiliary_loss_mlp": 0.01041494, + "balance_loss_clip": 1.05370998, + "balance_loss_mlp": 1.02886987, + "epoch": 0.07106354836770276, + "flos": 24645098534400.0, + "grad_norm": 1.7255467996131657, + "language_loss": 0.84671474, + "learning_rate": 3.982386179311399e-06, + "loss": 0.8687517, + "num_input_tokens_seen": 12621725, + "step": 591, + "time_per_iteration": 2.726693868637085 + }, + { + "auxiliary_loss_clip": 0.01259019, + "auxiliary_loss_mlp": 0.01048724, + "balance_loss_clip": 1.07411575, + "balance_loss_mlp": 1.03328681, + "epoch": 0.07118379125834184, + "flos": 16217223649920.0, + "grad_norm": 2.789612028291633, + "language_loss": 0.87713683, + "learning_rate": 3.982282873988075e-06, + "loss": 0.90021431, + "num_input_tokens_seen": 12639600, + "step": 592, + "time_per_iteration": 2.553016424179077 + }, + { + "auxiliary_loss_clip": 0.01238849, + "auxiliary_loss_mlp": 0.01040145, + "balance_loss_clip": 1.07179165, + "balance_loss_mlp": 1.02747273, + "epoch": 0.07130403414898094, + "flos": 19719986227200.0, + "grad_norm": 1.7454674101722318, + "language_loss": 0.87045741, + "learning_rate": 3.982179267954016e-06, + "loss": 0.89324737, + "num_input_tokens_seen": 12660030, + "step": 593, + "time_per_iteration": 2.598154306411743 + }, + { + "auxiliary_loss_clip": 0.01270743, + "auxiliary_loss_mlp": 0.01041494, + "balance_loss_clip": 1.07442689, + "balance_loss_mlp": 1.02739203, + "epoch": 0.07142427703962004, + "flos": 21871933009920.0, + "grad_norm": 2.188863459073556, + "language_loss": 0.96033943, + "learning_rate": 3.982075361224937e-06, + "loss": 0.98346174, + "num_input_tokens_seen": 12678395, + "step": 594, + "time_per_iteration": 2.5527126789093018 + }, + { + "auxiliary_loss_clip": 0.01253365, + "auxiliary_loss_mlp": 0.00766229, + "balance_loss_clip": 1.07601118, + "balance_loss_mlp": 1.00040317, + "epoch": 0.07154451993025912, + "flos": 18296595002880.0, + "grad_norm": 2.3148194789952257, + "language_loss": 0.88131595, + "learning_rate": 3.981971153816602e-06, + "loss": 0.90151191, + "num_input_tokens_seen": 12696000, + "step": 595, + "time_per_iteration": 2.5114622116088867 + }, + { + "auxiliary_loss_clip": 0.01272342, + "auxiliary_loss_mlp": 0.01044701, + "balance_loss_clip": 1.07900381, + "balance_loss_mlp": 1.03174865, + "epoch": 0.07166476282089822, + "flos": 22160690444160.0, + "grad_norm": 1.653076774494414, + "language_loss": 0.96452427, + "learning_rate": 3.981866645744819e-06, + "loss": 0.98769468, + "num_input_tokens_seen": 12716715, + "step": 596, + "time_per_iteration": 2.5672714710235596 + }, + { + "auxiliary_loss_clip": 0.01273867, + "auxiliary_loss_mlp": 0.00767258, + "balance_loss_clip": 1.07689142, + "balance_loss_mlp": 1.00028276, + "epoch": 0.0717850057115373, + "flos": 14136343925760.0, + "grad_norm": 2.3843404252442464, + "language_loss": 0.81421471, + "learning_rate": 3.9817618370254416e-06, + "loss": 0.83462596, + "num_input_tokens_seen": 12733370, + "step": 597, + "time_per_iteration": 2.479390859603882 + }, + { + "auxiliary_loss_clip": 0.01275025, + "auxiliary_loss_mlp": 0.01050572, + "balance_loss_clip": 1.07826328, + "balance_loss_mlp": 1.03656459, + "epoch": 0.0719052486021764, + "flos": 30917794412160.0, + "grad_norm": 2.286638011846903, + "language_loss": 0.87202072, + "learning_rate": 3.9816567276743684e-06, + "loss": 0.89527667, + "num_input_tokens_seen": 12753235, + "step": 598, + "time_per_iteration": 2.6221888065338135 + }, + { + "auxiliary_loss_clip": 0.01232682, + "auxiliary_loss_mlp": 0.01040887, + "balance_loss_clip": 1.06970978, + "balance_loss_mlp": 1.02699959, + "epoch": 0.0720254914928155, + "flos": 21287019939840.0, + "grad_norm": 2.246112861764881, + "language_loss": 0.77386433, + "learning_rate": 3.9815513177075466e-06, + "loss": 0.79659998, + "num_input_tokens_seen": 12772020, + "step": 599, + "time_per_iteration": 2.585167169570923 + }, + { + "auxiliary_loss_clip": 0.0124455, + "auxiliary_loss_mlp": 0.01045941, + "balance_loss_clip": 1.07124329, + "balance_loss_mlp": 1.03385925, + "epoch": 0.07214573438345458, + "flos": 27819170732160.0, + "grad_norm": 1.5202453659500388, + "language_loss": 0.70445299, + "learning_rate": 3.9814456071409646e-06, + "loss": 0.72735798, + "num_input_tokens_seen": 12792555, + "step": 600, + "time_per_iteration": 2.6290860176086426 + }, + { + "auxiliary_loss_clip": 0.01210813, + "auxiliary_loss_mlp": 0.01056093, + "balance_loss_clip": 1.06683993, + "balance_loss_mlp": 1.0412631, + "epoch": 0.07226597727409367, + "flos": 25483576688640.0, + "grad_norm": 2.3492968198432282, + "language_loss": 0.85013497, + "learning_rate": 3.981339595990659e-06, + "loss": 0.87280399, + "num_input_tokens_seen": 12811085, + "step": 601, + "time_per_iteration": 2.6802878379821777 + }, + { + "auxiliary_loss_clip": 0.01253983, + "auxiliary_loss_mlp": 0.01049952, + "balance_loss_clip": 1.07370162, + "balance_loss_mlp": 1.03508711, + "epoch": 0.07238622016473276, + "flos": 23513840622720.0, + "grad_norm": 1.9363483719261925, + "language_loss": 0.81177491, + "learning_rate": 3.981233284272713e-06, + "loss": 0.83481425, + "num_input_tokens_seen": 12830830, + "step": 602, + "time_per_iteration": 2.6028285026550293 + }, + { + "auxiliary_loss_clip": 0.01222594, + "auxiliary_loss_mlp": 0.01043282, + "balance_loss_clip": 1.06792784, + "balance_loss_mlp": 1.03080046, + "epoch": 0.07250646305537185, + "flos": 25453519983360.0, + "grad_norm": 1.6138326590643064, + "language_loss": 0.90151894, + "learning_rate": 3.981126672003253e-06, + "loss": 0.92417777, + "num_input_tokens_seen": 12853505, + "step": 603, + "time_per_iteration": 2.7627761363983154 + }, + { + "auxiliary_loss_clip": 0.01241234, + "auxiliary_loss_mlp": 0.01051368, + "balance_loss_clip": 1.06634784, + "balance_loss_mlp": 1.03775978, + "epoch": 0.07262670594601094, + "flos": 27155038216320.0, + "grad_norm": 2.565746168301205, + "language_loss": 0.7766583, + "learning_rate": 3.981019759198451e-06, + "loss": 0.79958427, + "num_input_tokens_seen": 12872455, + "step": 604, + "time_per_iteration": 2.6418650150299072 + }, + { + "auxiliary_loss_clip": 0.01237425, + "auxiliary_loss_mlp": 0.01048442, + "balance_loss_clip": 1.06926513, + "balance_loss_mlp": 1.03459573, + "epoch": 0.07274694883665003, + "flos": 26651607148800.0, + "grad_norm": 2.261919794147609, + "language_loss": 0.84469163, + "learning_rate": 3.980912545874528e-06, + "loss": 0.86755037, + "num_input_tokens_seen": 12892620, + "step": 605, + "time_per_iteration": 2.596210479736328 + }, + { + "auxiliary_loss_clip": 0.01248403, + "auxiliary_loss_mlp": 0.00766615, + "balance_loss_clip": 1.07128811, + "balance_loss_mlp": 1.00039661, + "epoch": 0.07286719172728913, + "flos": 29862344154240.0, + "grad_norm": 2.602761267271394, + "language_loss": 0.85806286, + "learning_rate": 3.980805032047746e-06, + "loss": 0.87821305, + "num_input_tokens_seen": 12914090, + "step": 606, + "time_per_iteration": 2.6312897205352783 + }, + { + "auxiliary_loss_clip": 0.01232565, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_clip": 1.06720662, + "balance_loss_mlp": 1.02705014, + "epoch": 0.07298743461792821, + "flos": 17382057799680.0, + "grad_norm": 2.086891865456296, + "language_loss": 0.80811268, + "learning_rate": 3.980697217734415e-06, + "loss": 0.83086622, + "num_input_tokens_seen": 12931830, + "step": 607, + "time_per_iteration": 3.3344218730926514 + }, + { + "auxiliary_loss_clip": 0.0121129, + "auxiliary_loss_mlp": 0.00766422, + "balance_loss_clip": 1.06782436, + "balance_loss_mlp": 1.00045025, + "epoch": 0.07310767750856731, + "flos": 19498201701120.0, + "grad_norm": 1.8070699747287209, + "language_loss": 0.91731501, + "learning_rate": 3.980589102950891e-06, + "loss": 0.93709219, + "num_input_tokens_seen": 12949995, + "step": 608, + "time_per_iteration": 2.708122491836548 + }, + { + "auxiliary_loss_clip": 0.01241656, + "auxiliary_loss_mlp": 0.01043554, + "balance_loss_clip": 1.07543826, + "balance_loss_mlp": 1.02928495, + "epoch": 0.07322792039920639, + "flos": 29168693637120.0, + "grad_norm": 2.3774159488455617, + "language_loss": 0.76165074, + "learning_rate": 3.9804806877135755e-06, + "loss": 0.78450286, + "num_input_tokens_seen": 12968040, + "step": 609, + "time_per_iteration": 4.253344535827637 + }, + { + "auxiliary_loss_clip": 0.01260246, + "auxiliary_loss_mlp": 0.0076738, + "balance_loss_clip": 1.07201946, + "balance_loss_mlp": 1.00047135, + "epoch": 0.07334816328984549, + "flos": 23477822259840.0, + "grad_norm": 2.2127813009434285, + "language_loss": 0.86091042, + "learning_rate": 3.980371972038915e-06, + "loss": 0.88118672, + "num_input_tokens_seen": 12988530, + "step": 610, + "time_per_iteration": 3.43818736076355 + }, + { + "auxiliary_loss_clip": 0.01273207, + "auxiliary_loss_mlp": 0.01048447, + "balance_loss_clip": 1.07609749, + "balance_loss_mlp": 1.03435612, + "epoch": 0.07346840618048459, + "flos": 22962467877120.0, + "grad_norm": 1.7077774245216952, + "language_loss": 0.84310943, + "learning_rate": 3.980262955943399e-06, + "loss": 0.86632597, + "num_input_tokens_seen": 13008195, + "step": 611, + "time_per_iteration": 2.5046634674072266 + }, + { + "auxiliary_loss_clip": 0.01233818, + "auxiliary_loss_mlp": 0.01048035, + "balance_loss_clip": 1.07343566, + "balance_loss_mlp": 1.03542876, + "epoch": 0.07358864907112367, + "flos": 17673903803520.0, + "grad_norm": 2.604637302776454, + "language_loss": 0.87090552, + "learning_rate": 3.980153639443569e-06, + "loss": 0.89372402, + "num_input_tokens_seen": 13024180, + "step": 612, + "time_per_iteration": 2.548511266708374 + }, + { + "auxiliary_loss_clip": 0.01246114, + "auxiliary_loss_mlp": 0.01050388, + "balance_loss_clip": 1.07363272, + "balance_loss_mlp": 1.03652978, + "epoch": 0.07370889196176277, + "flos": 24097029840000.0, + "grad_norm": 2.491076690174547, + "language_loss": 0.80031502, + "learning_rate": 3.980044022556005e-06, + "loss": 0.8232801, + "num_input_tokens_seen": 13043865, + "step": 613, + "time_per_iteration": 2.647897481918335 + }, + { + "auxiliary_loss_clip": 0.01257371, + "auxiliary_loss_mlp": 0.01053629, + "balance_loss_clip": 1.07460582, + "balance_loss_mlp": 1.04018223, + "epoch": 0.07382913485240185, + "flos": 25885919905920.0, + "grad_norm": 2.2994542165772205, + "language_loss": 0.73143929, + "learning_rate": 3.9799341052973375e-06, + "loss": 0.75454926, + "num_input_tokens_seen": 13063700, + "step": 614, + "time_per_iteration": 2.7614216804504395 + }, + { + "auxiliary_loss_clip": 0.01240344, + "auxiliary_loss_mlp": 0.01045341, + "balance_loss_clip": 1.07494617, + "balance_loss_mlp": 1.03039193, + "epoch": 0.07394937774304094, + "flos": 16873850223360.0, + "grad_norm": 2.751488011903687, + "language_loss": 0.75050294, + "learning_rate": 3.979823887684241e-06, + "loss": 0.77335978, + "num_input_tokens_seen": 13082640, + "step": 615, + "time_per_iteration": 2.5896170139312744 + }, + { + "auxiliary_loss_clip": 0.01273815, + "auxiliary_loss_mlp": 0.01053363, + "balance_loss_clip": 1.07832313, + "balance_loss_mlp": 1.03937984, + "epoch": 0.07406962063368003, + "flos": 20703471586560.0, + "grad_norm": 2.3353929609217414, + "language_loss": 0.84628868, + "learning_rate": 3.979713369733434e-06, + "loss": 0.86956048, + "num_input_tokens_seen": 13100505, + "step": 616, + "time_per_iteration": 2.5221545696258545 + }, + { + "auxiliary_loss_clip": 0.01251822, + "auxiliary_loss_mlp": 0.01055892, + "balance_loss_clip": 1.07414913, + "balance_loss_mlp": 1.04194474, + "epoch": 0.07418986352431912, + "flos": 21430985650560.0, + "grad_norm": 2.2763015337680943, + "language_loss": 0.85020459, + "learning_rate": 3.979602551461683e-06, + "loss": 0.87328172, + "num_input_tokens_seen": 13121285, + "step": 617, + "time_per_iteration": 2.5392236709594727 + }, + { + "auxiliary_loss_clip": 0.01236995, + "auxiliary_loss_mlp": 0.01047082, + "balance_loss_clip": 1.07236981, + "balance_loss_mlp": 1.03304553, + "epoch": 0.07431010641495822, + "flos": 12021133777920.0, + "grad_norm": 2.3343254155216577, + "language_loss": 0.91489494, + "learning_rate": 3.979491432885799e-06, + "loss": 0.9377358, + "num_input_tokens_seen": 13137550, + "step": 618, + "time_per_iteration": 2.5775418281555176 + }, + { + "auxiliary_loss_clip": 0.01202656, + "auxiliary_loss_mlp": 0.00766353, + "balance_loss_clip": 1.06419575, + "balance_loss_mlp": 1.0003686, + "epoch": 0.0744303493055973, + "flos": 20957575374720.0, + "grad_norm": 1.970362593502088, + "language_loss": 0.82915056, + "learning_rate": 3.97938001402264e-06, + "loss": 0.84884059, + "num_input_tokens_seen": 13156675, + "step": 619, + "time_per_iteration": 2.634760856628418 + }, + { + "auxiliary_loss_clip": 0.01217916, + "auxiliary_loss_mlp": 0.01043305, + "balance_loss_clip": 1.07066441, + "balance_loss_mlp": 1.02982211, + "epoch": 0.0745505921962364, + "flos": 16253134272000.0, + "grad_norm": 4.93537684287489, + "language_loss": 0.79844701, + "learning_rate": 3.979268294889105e-06, + "loss": 0.82105923, + "num_input_tokens_seen": 13172225, + "step": 620, + "time_per_iteration": 2.582827568054199 + }, + { + "auxiliary_loss_clip": 0.01271556, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_clip": 1.07613254, + "balance_loss_mlp": 1.03383529, + "epoch": 0.07467083508687548, + "flos": 50944635550080.0, + "grad_norm": 1.8989280219398388, + "language_loss": 0.73871648, + "learning_rate": 3.979156275502143e-06, + "loss": 0.7619065, + "num_input_tokens_seen": 13195885, + "step": 621, + "time_per_iteration": 2.739715099334717 + }, + { + "auxiliary_loss_clip": 0.01224676, + "auxiliary_loss_mlp": 0.01056151, + "balance_loss_clip": 1.06816077, + "balance_loss_mlp": 1.04149985, + "epoch": 0.07479107797751458, + "flos": 17529686697600.0, + "grad_norm": 2.7077953215659045, + "language_loss": 0.91158986, + "learning_rate": 3.979043955878749e-06, + "loss": 0.93439817, + "num_input_tokens_seen": 13213730, + "step": 622, + "time_per_iteration": 2.5992958545684814 + }, + { + "auxiliary_loss_clip": 0.01237749, + "auxiliary_loss_mlp": 0.01044519, + "balance_loss_clip": 1.07109678, + "balance_loss_mlp": 1.03151298, + "epoch": 0.07491132086815366, + "flos": 23473943591040.0, + "grad_norm": 2.90858125808394, + "language_loss": 0.83216572, + "learning_rate": 3.978931336035959e-06, + "loss": 0.85498834, + "num_input_tokens_seen": 13232540, + "step": 623, + "time_per_iteration": 2.5871009826660156 + }, + { + "auxiliary_loss_clip": 0.01256497, + "auxiliary_loss_mlp": 0.01055419, + "balance_loss_clip": 1.07534719, + "balance_loss_mlp": 1.04123878, + "epoch": 0.07503156375879276, + "flos": 20157557708160.0, + "grad_norm": 2.650308159148101, + "language_loss": 0.82484996, + "learning_rate": 3.9788184159908595e-06, + "loss": 0.84796917, + "num_input_tokens_seen": 13249670, + "step": 624, + "time_per_iteration": 2.524965763092041 + }, + { + "auxiliary_loss_clip": 0.01233643, + "auxiliary_loss_mlp": 0.0105312, + "balance_loss_clip": 1.07061183, + "balance_loss_mlp": 1.04005444, + "epoch": 0.07515180664943186, + "flos": 15115519653120.0, + "grad_norm": 3.3886365148321707, + "language_loss": 0.82399142, + "learning_rate": 3.97870519576058e-06, + "loss": 0.8468591, + "num_input_tokens_seen": 13266095, + "step": 625, + "time_per_iteration": 2.5881028175354004 + }, + { + "auxiliary_loss_clip": 0.01221747, + "auxiliary_loss_mlp": 0.00766907, + "balance_loss_clip": 1.06842601, + "balance_loss_mlp": 1.00032115, + "epoch": 0.07527204954007094, + "flos": 21287702298240.0, + "grad_norm": 2.5528294325248093, + "language_loss": 0.81024474, + "learning_rate": 3.978591675362295e-06, + "loss": 0.83013129, + "num_input_tokens_seen": 13284810, + "step": 626, + "time_per_iteration": 2.6359786987304688 + }, + { + "auxiliary_loss_clip": 0.01204157, + "auxiliary_loss_mlp": 0.0104277, + "balance_loss_clip": 1.07034647, + "balance_loss_mlp": 1.02897191, + "epoch": 0.07539229243071004, + "flos": 21324187537920.0, + "grad_norm": 2.0107095435506754, + "language_loss": 0.87499791, + "learning_rate": 3.978477854813226e-06, + "loss": 0.89746714, + "num_input_tokens_seen": 13304150, + "step": 627, + "time_per_iteration": 2.6630687713623047 + }, + { + "auxiliary_loss_clip": 0.01256691, + "auxiliary_loss_mlp": 0.01049118, + "balance_loss_clip": 1.07308722, + "balance_loss_mlp": 1.03630841, + "epoch": 0.07551253532134912, + "flos": 13042540920960.0, + "grad_norm": 1.80119150614823, + "language_loss": 0.82395399, + "learning_rate": 3.97836373413064e-06, + "loss": 0.8470121, + "num_input_tokens_seen": 13322205, + "step": 628, + "time_per_iteration": 2.5787763595581055 + }, + { + "auxiliary_loss_clip": 0.01268707, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_clip": 1.07220101, + "balance_loss_mlp": 1.03181124, + "epoch": 0.07563277821198822, + "flos": 19208761908480.0, + "grad_norm": 2.264499327765329, + "language_loss": 0.74581814, + "learning_rate": 3.978249313331848e-06, + "loss": 0.76896417, + "num_input_tokens_seen": 13340435, + "step": 629, + "time_per_iteration": 2.5443100929260254 + }, + { + "auxiliary_loss_clip": 0.01261396, + "auxiliary_loss_mlp": 0.007667, + "balance_loss_clip": 1.07214129, + "balance_loss_mlp": 1.00022268, + "epoch": 0.07575302110262731, + "flos": 19537200892800.0, + "grad_norm": 3.2654222884131174, + "language_loss": 0.62165952, + "learning_rate": 3.978134592434208e-06, + "loss": 0.64194047, + "num_input_tokens_seen": 13358185, + "step": 630, + "time_per_iteration": 2.5344574451446533 + }, + { + "auxiliary_loss_clip": 0.01099463, + "auxiliary_loss_mlp": 0.0101255, + "balance_loss_clip": 1.02980602, + "balance_loss_mlp": 1.00713825, + "epoch": 0.0758732639932664, + "flos": 67961808017280.0, + "grad_norm": 1.0268439926791282, + "language_loss": 0.59428561, + "learning_rate": 3.978019571455123e-06, + "loss": 0.61540574, + "num_input_tokens_seen": 13410130, + "step": 631, + "time_per_iteration": 3.2429943084716797 + }, + { + "auxiliary_loss_clip": 0.01270214, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.07667637, + "balance_loss_mlp": 1.03226674, + "epoch": 0.07599350688390549, + "flos": 18989204025600.0, + "grad_norm": 2.1725712703120363, + "language_loss": 0.84007347, + "learning_rate": 3.977904250412042e-06, + "loss": 0.8632201, + "num_input_tokens_seen": 13429085, + "step": 632, + "time_per_iteration": 2.575817584991455 + }, + { + "auxiliary_loss_clip": 0.0124385, + "auxiliary_loss_mlp": 0.01047161, + "balance_loss_clip": 1.07307816, + "balance_loss_mlp": 1.03417909, + "epoch": 0.07611374977454458, + "flos": 21069006341760.0, + "grad_norm": 2.066555761199429, + "language_loss": 0.85641992, + "learning_rate": 3.97778862932246e-06, + "loss": 0.87933004, + "num_input_tokens_seen": 13446250, + "step": 633, + "time_per_iteration": 2.5982720851898193 + }, + { + "auxiliary_loss_clip": 0.01135136, + "auxiliary_loss_mlp": 0.01041462, + "balance_loss_clip": 1.05011618, + "balance_loss_mlp": 1.02785432, + "epoch": 0.07623399266518367, + "flos": 18514536773760.0, + "grad_norm": 2.6235959902478294, + "language_loss": 0.94056976, + "learning_rate": 3.9776727082039144e-06, + "loss": 0.96233571, + "num_input_tokens_seen": 13463220, + "step": 634, + "time_per_iteration": 3.6866254806518555 + }, + { + "auxiliary_loss_clip": 0.01154173, + "auxiliary_loss_mlp": 0.01007967, + "balance_loss_clip": 1.02914929, + "balance_loss_mlp": 1.00303161, + "epoch": 0.07635423555582276, + "flos": 44663036077440.0, + "grad_norm": 0.8045631510774766, + "language_loss": 0.55544055, + "learning_rate": 3.977556487073991e-06, + "loss": 0.57706189, + "num_input_tokens_seen": 13517775, + "step": 635, + "time_per_iteration": 3.8925201892852783 + }, + { + "auxiliary_loss_clip": 0.01228285, + "auxiliary_loss_mlp": 0.01047079, + "balance_loss_clip": 1.06510353, + "balance_loss_mlp": 1.03481805, + "epoch": 0.07647447844646185, + "flos": 21761148487680.0, + "grad_norm": 1.8610320615176026, + "language_loss": 0.81302428, + "learning_rate": 3.97743996595032e-06, + "loss": 0.835778, + "num_input_tokens_seen": 13537815, + "step": 636, + "time_per_iteration": 2.664645195007324 + }, + { + "auxiliary_loss_clip": 0.01268218, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_clip": 1.07407188, + "balance_loss_mlp": 1.03430486, + "epoch": 0.07659472133710095, + "flos": 23806799948160.0, + "grad_norm": 1.5866606944481814, + "language_loss": 0.81801939, + "learning_rate": 3.9773231448505804e-06, + "loss": 0.84118903, + "num_input_tokens_seen": 13559605, + "step": 637, + "time_per_iteration": 3.4268534183502197 + }, + { + "auxiliary_loss_clip": 0.01235485, + "auxiliary_loss_mlp": 0.00766878, + "balance_loss_clip": 1.0714488, + "balance_loss_mlp": 1.00018728, + "epoch": 0.07671496422774003, + "flos": 21469984842240.0, + "grad_norm": 1.9988915185474208, + "language_loss": 0.78065717, + "learning_rate": 3.977206023792491e-06, + "loss": 0.80068076, + "num_input_tokens_seen": 13579495, + "step": 638, + "time_per_iteration": 2.6180267333984375 + }, + { + "auxiliary_loss_clip": 0.01255923, + "auxiliary_loss_mlp": 0.01058055, + "balance_loss_clip": 1.07708037, + "balance_loss_mlp": 1.04485273, + "epoch": 0.07683520711837913, + "flos": 16980971558400.0, + "grad_norm": 2.5524578370673314, + "language_loss": 0.81249166, + "learning_rate": 3.97708860279382e-06, + "loss": 0.83563137, + "num_input_tokens_seen": 13597605, + "step": 639, + "time_per_iteration": 2.5424745082855225 + }, + { + "auxiliary_loss_clip": 0.01217375, + "auxiliary_loss_mlp": 0.01049626, + "balance_loss_clip": 1.06552052, + "balance_loss_mlp": 1.03578544, + "epoch": 0.07695545000901821, + "flos": 23476744851840.0, + "grad_norm": 1.8063188235171888, + "language_loss": 0.78034335, + "learning_rate": 3.97697088187238e-06, + "loss": 0.80301338, + "num_input_tokens_seen": 13618120, + "step": 640, + "time_per_iteration": 2.670809030532837 + }, + { + "auxiliary_loss_clip": 0.01234773, + "auxiliary_loss_mlp": 0.01056573, + "balance_loss_clip": 1.07297528, + "balance_loss_mlp": 1.04381728, + "epoch": 0.07707569289965731, + "flos": 17634258167040.0, + "grad_norm": 2.539290318590636, + "language_loss": 0.91485828, + "learning_rate": 3.976852861046029e-06, + "loss": 0.93777174, + "num_input_tokens_seen": 13634735, + "step": 641, + "time_per_iteration": 2.572610378265381 + }, + { + "auxiliary_loss_clip": 0.01204854, + "auxiliary_loss_mlp": 0.01043888, + "balance_loss_clip": 1.06658614, + "balance_loss_mlp": 1.03076315, + "epoch": 0.0771959357902964, + "flos": 25775674087680.0, + "grad_norm": 1.7084786055823329, + "language_loss": 0.80460715, + "learning_rate": 3.97673454033267e-06, + "loss": 0.82709455, + "num_input_tokens_seen": 13656835, + "step": 642, + "time_per_iteration": 2.7294981479644775 + }, + { + "auxiliary_loss_clip": 0.01233509, + "auxiliary_loss_mlp": 0.01051373, + "balance_loss_clip": 1.06730175, + "balance_loss_mlp": 1.03765178, + "epoch": 0.07731617868093549, + "flos": 19828651847040.0, + "grad_norm": 2.0098962506058387, + "language_loss": 0.82719755, + "learning_rate": 3.976615919750254e-06, + "loss": 0.8500464, + "num_input_tokens_seen": 13674535, + "step": 643, + "time_per_iteration": 2.558629035949707 + }, + { + "auxiliary_loss_clip": 0.01249063, + "auxiliary_loss_mlp": 0.01050615, + "balance_loss_clip": 1.07101142, + "balance_loss_mlp": 1.03591073, + "epoch": 0.07743642157157458, + "flos": 21324654414720.0, + "grad_norm": 2.3173926489491463, + "language_loss": 0.8683899, + "learning_rate": 3.976496999316775e-06, + "loss": 0.89138663, + "num_input_tokens_seen": 13693290, + "step": 644, + "time_per_iteration": 2.5122554302215576 + }, + { + "auxiliary_loss_clip": 0.01234987, + "auxiliary_loss_mlp": 0.01047256, + "balance_loss_clip": 1.07507193, + "balance_loss_mlp": 1.03336251, + "epoch": 0.07755666446221367, + "flos": 19969133938560.0, + "grad_norm": 1.9617658721406876, + "language_loss": 0.84044337, + "learning_rate": 3.976377779050271e-06, + "loss": 0.86326587, + "num_input_tokens_seen": 13711420, + "step": 645, + "time_per_iteration": 2.5363337993621826 + }, + { + "auxiliary_loss_clip": 0.01241592, + "auxiliary_loss_mlp": 0.01053574, + "balance_loss_clip": 1.0693059, + "balance_loss_mlp": 1.04029989, + "epoch": 0.07767690735285276, + "flos": 23623224514560.0, + "grad_norm": 3.6506466365488217, + "language_loss": 0.8459599, + "learning_rate": 3.976258258968831e-06, + "loss": 0.86891162, + "num_input_tokens_seen": 13729965, + "step": 646, + "time_per_iteration": 2.5662784576416016 + }, + { + "auxiliary_loss_clip": 0.01217257, + "auxiliary_loss_mlp": 0.01052297, + "balance_loss_clip": 1.06879711, + "balance_loss_mlp": 1.03939867, + "epoch": 0.07779715024349185, + "flos": 22236246702720.0, + "grad_norm": 2.23380659202524, + "language_loss": 0.73969424, + "learning_rate": 3.976138439090583e-06, + "loss": 0.76238978, + "num_input_tokens_seen": 13748045, + "step": 647, + "time_per_iteration": 2.625836133956909 + }, + { + "auxiliary_loss_clip": 0.01223497, + "auxiliary_loss_mlp": 0.01042366, + "balance_loss_clip": 1.07156491, + "balance_loss_mlp": 1.02836502, + "epoch": 0.07791739313413094, + "flos": 20955097336320.0, + "grad_norm": 4.073572188980601, + "language_loss": 0.85310626, + "learning_rate": 3.976018319433706e-06, + "loss": 0.87576485, + "num_input_tokens_seen": 13765590, + "step": 648, + "time_per_iteration": 2.6301052570343018 + }, + { + "auxiliary_loss_clip": 0.01248494, + "auxiliary_loss_mlp": 0.01043505, + "balance_loss_clip": 1.07144868, + "balance_loss_mlp": 1.03049326, + "epoch": 0.07803763602477004, + "flos": 19312327797120.0, + "grad_norm": 2.777328584820729, + "language_loss": 0.91331643, + "learning_rate": 3.9758979000164205e-06, + "loss": 0.93623644, + "num_input_tokens_seen": 13782410, + "step": 649, + "time_per_iteration": 2.5231499671936035 + }, + { + "auxiliary_loss_clip": 0.0122411, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.06871665, + "balance_loss_mlp": 1.02353132, + "epoch": 0.07815787891540912, + "flos": 22710806213760.0, + "grad_norm": 1.7485844960492705, + "language_loss": 0.71820235, + "learning_rate": 3.975777180856995e-06, + "loss": 0.74082267, + "num_input_tokens_seen": 13801530, + "step": 650, + "time_per_iteration": 2.654094696044922 + }, + { + "auxiliary_loss_clip": 0.01271057, + "auxiliary_loss_mlp": 0.01052669, + "balance_loss_clip": 1.07314622, + "balance_loss_mlp": 1.03825092, + "epoch": 0.07827812180604822, + "flos": 22711129436160.0, + "grad_norm": 4.376378786053499, + "language_loss": 0.86301017, + "learning_rate": 3.975656161973742e-06, + "loss": 0.88624752, + "num_input_tokens_seen": 13820615, + "step": 651, + "time_per_iteration": 2.498030662536621 + }, + { + "auxiliary_loss_clip": 0.01267623, + "auxiliary_loss_mlp": 0.01049946, + "balance_loss_clip": 1.07106912, + "balance_loss_mlp": 1.03566408, + "epoch": 0.0783983646966873, + "flos": 21725597001600.0, + "grad_norm": 2.9897507566000834, + "language_loss": 0.88849723, + "learning_rate": 3.9755348433850194e-06, + "loss": 0.91167283, + "num_input_tokens_seen": 13835955, + "step": 652, + "time_per_iteration": 2.5535647869110107 + }, + { + "auxiliary_loss_clip": 0.01117621, + "auxiliary_loss_mlp": 0.01015142, + "balance_loss_clip": 1.02487493, + "balance_loss_mlp": 1.01061201, + "epoch": 0.0785186075873264, + "flos": 60640877537280.0, + "grad_norm": 0.9465377015555395, + "language_loss": 0.63570237, + "learning_rate": 3.975413225109232e-06, + "loss": 0.65703011, + "num_input_tokens_seen": 13896505, + "step": 653, + "time_per_iteration": 3.187976837158203 + }, + { + "auxiliary_loss_clip": 0.01248183, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_clip": 1.0697726, + "balance_loss_mlp": 1.02981722, + "epoch": 0.0786388504779655, + "flos": 23877902920320.0, + "grad_norm": 7.159268064526338, + "language_loss": 0.93540776, + "learning_rate": 3.975291307164829e-06, + "loss": 0.95832968, + "num_input_tokens_seen": 13915150, + "step": 654, + "time_per_iteration": 2.576275110244751 + }, + { + "auxiliary_loss_clip": 0.01203713, + "auxiliary_loss_mlp": 0.01045718, + "balance_loss_clip": 1.06223583, + "balance_loss_mlp": 1.03309369, + "epoch": 0.07875909336860458, + "flos": 15158684822400.0, + "grad_norm": 2.04949160182938, + "language_loss": 0.85302484, + "learning_rate": 3.975169089570306e-06, + "loss": 0.8755191, + "num_input_tokens_seen": 13933525, + "step": 655, + "time_per_iteration": 2.587508201599121 + }, + { + "auxiliary_loss_clip": 0.01232567, + "auxiliary_loss_mlp": 0.01043577, + "balance_loss_clip": 1.0667057, + "balance_loss_mlp": 1.0298202, + "epoch": 0.07887933625924368, + "flos": 22236857233920.0, + "grad_norm": 2.212160114325958, + "language_loss": 0.91596723, + "learning_rate": 3.975046572344202e-06, + "loss": 0.93872869, + "num_input_tokens_seen": 13949985, + "step": 656, + "time_per_iteration": 2.56308650970459 + }, + { + "auxiliary_loss_clip": 0.01209946, + "auxiliary_loss_mlp": 0.01053017, + "balance_loss_clip": 1.06169248, + "balance_loss_mlp": 1.03931963, + "epoch": 0.07899957914988276, + "flos": 20777734955520.0, + "grad_norm": 2.5234406845751423, + "language_loss": 0.71202993, + "learning_rate": 3.974923755505103e-06, + "loss": 0.73465955, + "num_input_tokens_seen": 13969215, + "step": 657, + "time_per_iteration": 2.5943844318389893 + }, + { + "auxiliary_loss_clip": 0.01207687, + "auxiliary_loss_mlp": 0.01045968, + "balance_loss_clip": 1.06508684, + "balance_loss_mlp": 1.03240824, + "epoch": 0.07911982204052186, + "flos": 23003047267200.0, + "grad_norm": 1.7012657024578905, + "language_loss": 0.91192645, + "learning_rate": 3.974800639071641e-06, + "loss": 0.93446302, + "num_input_tokens_seen": 13989935, + "step": 658, + "time_per_iteration": 2.6454665660858154 + }, + { + "auxiliary_loss_clip": 0.01172807, + "auxiliary_loss_mlp": 0.00766469, + "balance_loss_clip": 1.05971825, + "balance_loss_mlp": 1.00021553, + "epoch": 0.07924006493116094, + "flos": 23111389664640.0, + "grad_norm": 2.1626012923916362, + "language_loss": 1.00612378, + "learning_rate": 3.974677223062492e-06, + "loss": 1.02551651, + "num_input_tokens_seen": 14007150, + "step": 659, + "time_per_iteration": 2.6960089206695557 + }, + { + "auxiliary_loss_clip": 0.01230821, + "auxiliary_loss_mlp": 0.01044173, + "balance_loss_clip": 1.06819999, + "balance_loss_mlp": 1.03111959, + "epoch": 0.07936030782180004, + "flos": 16472153450880.0, + "grad_norm": 2.564382592213036, + "language_loss": 0.74513698, + "learning_rate": 3.974553507496378e-06, + "loss": 0.76788688, + "num_input_tokens_seen": 14025725, + "step": 660, + "time_per_iteration": 3.3615169525146484 + }, + { + "auxiliary_loss_clip": 0.01222469, + "auxiliary_loss_mlp": 0.01041283, + "balance_loss_clip": 1.06761706, + "balance_loss_mlp": 1.02579725, + "epoch": 0.07948055071243913, + "flos": 23733290764800.0, + "grad_norm": 2.5211777176405388, + "language_loss": 0.89121854, + "learning_rate": 3.974429492392068e-06, + "loss": 0.91385603, + "num_input_tokens_seen": 14045750, + "step": 661, + "time_per_iteration": 2.6135473251342773 + }, + { + "auxiliary_loss_clip": 0.01262965, + "auxiliary_loss_mlp": 0.00766247, + "balance_loss_clip": 1.07298768, + "balance_loss_mlp": 1.00017083, + "epoch": 0.07960079360307822, + "flos": 19573326996480.0, + "grad_norm": 2.0887812885924495, + "language_loss": 0.91341269, + "learning_rate": 3.974305177768373e-06, + "loss": 0.93370485, + "num_input_tokens_seen": 14063960, + "step": 662, + "time_per_iteration": 4.059848308563232 + }, + { + "auxiliary_loss_clip": 0.01206313, + "auxiliary_loss_mlp": 0.01046351, + "balance_loss_clip": 1.06520462, + "balance_loss_mlp": 1.03267789, + "epoch": 0.07972103649371731, + "flos": 23513409659520.0, + "grad_norm": 2.0747917438149046, + "language_loss": 0.86506605, + "learning_rate": 3.974180563644152e-06, + "loss": 0.88759273, + "num_input_tokens_seen": 14082525, + "step": 663, + "time_per_iteration": 3.4509592056274414 + }, + { + "auxiliary_loss_clip": 0.01235393, + "auxiliary_loss_mlp": 0.01050149, + "balance_loss_clip": 1.06816483, + "balance_loss_mlp": 1.03666687, + "epoch": 0.0798412793843564, + "flos": 16726867770240.0, + "grad_norm": 2.268142801777866, + "language_loss": 0.89240664, + "learning_rate": 3.97405565003831e-06, + "loss": 0.91526204, + "num_input_tokens_seen": 14098610, + "step": 664, + "time_per_iteration": 2.5687992572784424 + }, + { + "auxiliary_loss_clip": 0.01216845, + "auxiliary_loss_mlp": 0.01041378, + "balance_loss_clip": 1.06549239, + "balance_loss_mlp": 1.02763951, + "epoch": 0.07996152227499549, + "flos": 18223337214720.0, + "grad_norm": 2.2910203263823203, + "language_loss": 0.78163797, + "learning_rate": 3.973930436969794e-06, + "loss": 0.8042202, + "num_input_tokens_seen": 14117065, + "step": 665, + "time_per_iteration": 2.6311750411987305 + }, + { + "auxiliary_loss_clip": 0.01222692, + "auxiliary_loss_mlp": 0.01048739, + "balance_loss_clip": 1.06507707, + "balance_loss_mlp": 1.03555441, + "epoch": 0.08008176516563459, + "flos": 20594877793920.0, + "grad_norm": 2.2504136057765725, + "language_loss": 0.85818565, + "learning_rate": 3.973804924457602e-06, + "loss": 0.88089991, + "num_input_tokens_seen": 14135145, + "step": 666, + "time_per_iteration": 2.5804810523986816 + }, + { + "auxiliary_loss_clip": 0.0122403, + "auxiliary_loss_mlp": 0.01055178, + "balance_loss_clip": 1.06750441, + "balance_loss_mlp": 1.04197526, + "epoch": 0.08020200805627367, + "flos": 31834306863360.0, + "grad_norm": 1.7130246482230882, + "language_loss": 0.85604417, + "learning_rate": 3.973679112520771e-06, + "loss": 0.87883621, + "num_input_tokens_seen": 14156860, + "step": 667, + "time_per_iteration": 2.6611900329589844 + }, + { + "auxiliary_loss_clip": 0.01207124, + "auxiliary_loss_mlp": 0.01042649, + "balance_loss_clip": 1.0632143, + "balance_loss_mlp": 1.02938139, + "epoch": 0.08032225094691277, + "flos": 17783503176960.0, + "grad_norm": 1.97690153763219, + "language_loss": 0.99219173, + "learning_rate": 3.973553001178389e-06, + "loss": 1.01468945, + "num_input_tokens_seen": 14174365, + "step": 668, + "time_per_iteration": 2.579007387161255 + }, + { + "auxiliary_loss_clip": 0.01219535, + "auxiliary_loss_mlp": 0.01041743, + "balance_loss_clip": 1.06808639, + "balance_loss_mlp": 1.02851725, + "epoch": 0.08044249383755185, + "flos": 24061693835520.0, + "grad_norm": 1.8761999803577978, + "language_loss": 0.75674611, + "learning_rate": 3.973426590449585e-06, + "loss": 0.77935886, + "num_input_tokens_seen": 14192320, + "step": 669, + "time_per_iteration": 2.61918568611145 + }, + { + "auxiliary_loss_clip": 0.01201926, + "auxiliary_loss_mlp": 0.0104085, + "balance_loss_clip": 1.06554627, + "balance_loss_mlp": 1.02780843, + "epoch": 0.08056273672819095, + "flos": 18223624523520.0, + "grad_norm": 2.0474885735201616, + "language_loss": 0.75297213, + "learning_rate": 3.9732998803535364e-06, + "loss": 0.77539986, + "num_input_tokens_seen": 14210380, + "step": 670, + "time_per_iteration": 2.6618103981018066 + }, + { + "auxiliary_loss_clip": 0.01265683, + "auxiliary_loss_mlp": 0.01051384, + "balance_loss_clip": 1.07192338, + "balance_loss_mlp": 1.03775239, + "epoch": 0.08068297961883003, + "flos": 19676856971520.0, + "grad_norm": 2.1589471347559606, + "language_loss": 0.85204059, + "learning_rate": 3.973172870909465e-06, + "loss": 0.87521124, + "num_input_tokens_seen": 14225145, + "step": 671, + "time_per_iteration": 2.481862783432007 + }, + { + "auxiliary_loss_clip": 0.01238742, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.06785262, + "balance_loss_mlp": 1.02745414, + "epoch": 0.08080322250946913, + "flos": 23148736830720.0, + "grad_norm": 2.8826257936842983, + "language_loss": 0.80388939, + "learning_rate": 3.973045562136638e-06, + "loss": 0.82669008, + "num_input_tokens_seen": 14241960, + "step": 672, + "time_per_iteration": 2.623370885848999 + }, + { + "auxiliary_loss_clip": 0.0125487, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_clip": 1.07288671, + "balance_loss_mlp": 1.02993369, + "epoch": 0.08092346540010822, + "flos": 21763626526080.0, + "grad_norm": 2.0198513083665777, + "language_loss": 0.91353452, + "learning_rate": 3.972917954054368e-06, + "loss": 0.93651509, + "num_input_tokens_seen": 14260515, + "step": 673, + "time_per_iteration": 2.562635898590088 + }, + { + "auxiliary_loss_clip": 0.01231296, + "auxiliary_loss_mlp": 0.01051337, + "balance_loss_clip": 1.07149172, + "balance_loss_mlp": 1.03601289, + "epoch": 0.08104370829074731, + "flos": 21032485188480.0, + "grad_norm": 2.1571431054807166, + "language_loss": 0.81664163, + "learning_rate": 3.972790046682013e-06, + "loss": 0.83946794, + "num_input_tokens_seen": 14279190, + "step": 674, + "time_per_iteration": 2.582496404647827 + }, + { + "auxiliary_loss_clip": 0.01216842, + "auxiliary_loss_mlp": 0.01042762, + "balance_loss_clip": 1.06341565, + "balance_loss_mlp": 1.02934539, + "epoch": 0.0811639511813864, + "flos": 20083186598400.0, + "grad_norm": 3.662303568123341, + "language_loss": 0.79071438, + "learning_rate": 3.972661840038977e-06, + "loss": 0.8133105, + "num_input_tokens_seen": 14299480, + "step": 675, + "time_per_iteration": 2.669689893722534 + }, + { + "auxiliary_loss_clip": 0.0125291, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.07428443, + "balance_loss_mlp": 1.0294044, + "epoch": 0.08128419407202549, + "flos": 16836718538880.0, + "grad_norm": 2.246666381171423, + "language_loss": 0.83539516, + "learning_rate": 3.972533334144707e-06, + "loss": 0.85834992, + "num_input_tokens_seen": 14316405, + "step": 676, + "time_per_iteration": 2.5439813137054443 + }, + { + "auxiliary_loss_clip": 0.01254613, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_clip": 1.06984031, + "balance_loss_mlp": 1.03111482, + "epoch": 0.08140443696266458, + "flos": 23769273214080.0, + "grad_norm": 2.1670658987671323, + "language_loss": 0.78512132, + "learning_rate": 3.972404529018699e-06, + "loss": 0.80811608, + "num_input_tokens_seen": 14336265, + "step": 677, + "time_per_iteration": 2.5575778484344482 + }, + { + "auxiliary_loss_clip": 0.01227477, + "auxiliary_loss_mlp": 0.01034898, + "balance_loss_clip": 1.06399465, + "balance_loss_mlp": 1.02180314, + "epoch": 0.08152467985330367, + "flos": 24390132819840.0, + "grad_norm": 1.957099904429329, + "language_loss": 0.85374784, + "learning_rate": 3.972275424680493e-06, + "loss": 0.87637162, + "num_input_tokens_seen": 14356375, + "step": 678, + "time_per_iteration": 2.582249641418457 + }, + { + "auxiliary_loss_clip": 0.01263622, + "auxiliary_loss_mlp": 0.0104169, + "balance_loss_clip": 1.07245564, + "balance_loss_mlp": 1.0287261, + "epoch": 0.08164492274394276, + "flos": 19317750750720.0, + "grad_norm": 2.0340387961029567, + "language_loss": 0.92155188, + "learning_rate": 3.972146021149673e-06, + "loss": 0.94460499, + "num_input_tokens_seen": 14374650, + "step": 679, + "time_per_iteration": 2.4950051307678223 + }, + { + "auxiliary_loss_clip": 0.01216779, + "auxiliary_loss_mlp": 0.01043142, + "balance_loss_clip": 1.06692076, + "balance_loss_mlp": 1.03067291, + "epoch": 0.08176516563458186, + "flos": 14830461319680.0, + "grad_norm": 5.164625520573512, + "language_loss": 0.78768885, + "learning_rate": 3.972016318445868e-06, + "loss": 0.81028807, + "num_input_tokens_seen": 14392650, + "step": 680, + "time_per_iteration": 2.6619937419891357 + }, + { + "auxiliary_loss_clip": 0.0124705, + "auxiliary_loss_mlp": 0.0104781, + "balance_loss_clip": 1.07044649, + "balance_loss_mlp": 1.03495944, + "epoch": 0.08188540852522094, + "flos": 22602320161920.0, + "grad_norm": 2.2282848670471953, + "language_loss": 0.92229015, + "learning_rate": 3.971886316588757e-06, + "loss": 0.94523871, + "num_input_tokens_seen": 14413155, + "step": 681, + "time_per_iteration": 2.536166191101074 + }, + { + "auxiliary_loss_clip": 0.01208506, + "auxiliary_loss_mlp": 0.01050109, + "balance_loss_clip": 1.06868422, + "balance_loss_mlp": 1.03601825, + "epoch": 0.08200565141586004, + "flos": 19463727623040.0, + "grad_norm": 4.707442841963019, + "language_loss": 0.73622108, + "learning_rate": 3.9717560155980595e-06, + "loss": 0.75880718, + "num_input_tokens_seen": 14428805, + "step": 682, + "time_per_iteration": 2.6403868198394775 + }, + { + "auxiliary_loss_clip": 0.01248006, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_clip": 1.07045472, + "balance_loss_mlp": 1.03535938, + "epoch": 0.08212589430649912, + "flos": 20594662312320.0, + "grad_norm": 1.9116792069076491, + "language_loss": 0.92087519, + "learning_rate": 3.971625415493542e-06, + "loss": 0.94384253, + "num_input_tokens_seen": 14447125, + "step": 683, + "time_per_iteration": 2.6121182441711426 + }, + { + "auxiliary_loss_clip": 0.01210501, + "auxiliary_loss_mlp": 0.01047393, + "balance_loss_clip": 1.06558764, + "balance_loss_mlp": 1.033553, + "epoch": 0.08224613719713822, + "flos": 25953611086080.0, + "grad_norm": 1.9384775272057482, + "language_loss": 0.87691903, + "learning_rate": 3.971494516295017e-06, + "loss": 0.89949799, + "num_input_tokens_seen": 14466575, + "step": 684, + "time_per_iteration": 2.727145195007324 + }, + { + "auxiliary_loss_clip": 0.0122007, + "auxiliary_loss_mlp": 0.01047052, + "balance_loss_clip": 1.06555665, + "balance_loss_mlp": 1.0328424, + "epoch": 0.08236638008777732, + "flos": 23768734510080.0, + "grad_norm": 1.887635928815399, + "language_loss": 0.85498619, + "learning_rate": 3.971363318022341e-06, + "loss": 0.87765747, + "num_input_tokens_seen": 14487915, + "step": 685, + "time_per_iteration": 2.673487901687622 + }, + { + "auxiliary_loss_clip": 0.01234104, + "auxiliary_loss_mlp": 0.01050061, + "balance_loss_clip": 1.06528044, + "balance_loss_mlp": 1.03617883, + "epoch": 0.0824866229784164, + "flos": 38799144887040.0, + "grad_norm": 1.746277163184343, + "language_loss": 0.68389058, + "learning_rate": 3.971231820695417e-06, + "loss": 0.70673215, + "num_input_tokens_seen": 14511530, + "step": 686, + "time_per_iteration": 2.7478880882263184 + }, + { + "auxiliary_loss_clip": 0.0124029, + "auxiliary_loss_mlp": 0.01045728, + "balance_loss_clip": 1.07125902, + "balance_loss_mlp": 1.03164911, + "epoch": 0.0826068658690555, + "flos": 23107762391040.0, + "grad_norm": 1.8424476455481815, + "language_loss": 0.81038511, + "learning_rate": 3.971100024334193e-06, + "loss": 0.83324528, + "num_input_tokens_seen": 14529050, + "step": 687, + "time_per_iteration": 3.2825686931610107 + }, + { + "auxiliary_loss_clip": 0.01200309, + "auxiliary_loss_mlp": 0.01044051, + "balance_loss_clip": 1.06213737, + "balance_loss_mlp": 1.03119409, + "epoch": 0.08272710875969458, + "flos": 21136374299520.0, + "grad_norm": 2.748884337128322, + "language_loss": 0.8663221, + "learning_rate": 3.970967928958663e-06, + "loss": 0.88876569, + "num_input_tokens_seen": 14546165, + "step": 688, + "time_per_iteration": 3.3621938228607178 + }, + { + "auxiliary_loss_clip": 0.01204571, + "auxiliary_loss_mlp": 0.01049291, + "balance_loss_clip": 1.06573462, + "balance_loss_mlp": 1.03613639, + "epoch": 0.08284735165033368, + "flos": 19063000517760.0, + "grad_norm": 1.6524581487965286, + "language_loss": 0.8351633, + "learning_rate": 3.970835534588865e-06, + "loss": 0.85770196, + "num_input_tokens_seen": 14563660, + "step": 689, + "time_per_iteration": 4.261405944824219 + }, + { + "auxiliary_loss_clip": 0.01236123, + "auxiliary_loss_mlp": 0.01051895, + "balance_loss_clip": 1.07306111, + "balance_loss_mlp": 1.03899097, + "epoch": 0.08296759454097276, + "flos": 16727442387840.0, + "grad_norm": 1.8841435604667611, + "language_loss": 0.85682249, + "learning_rate": 3.970702841244883e-06, + "loss": 0.87970269, + "num_input_tokens_seen": 14581980, + "step": 690, + "time_per_iteration": 2.568441867828369 + }, + { + "auxiliary_loss_clip": 0.01252539, + "auxiliary_loss_mlp": 0.01047643, + "balance_loss_clip": 1.07278788, + "balance_loss_mlp": 1.03439856, + "epoch": 0.08308783743161186, + "flos": 18004928567040.0, + "grad_norm": 1.852523786510735, + "language_loss": 0.82465774, + "learning_rate": 3.970569848946847e-06, + "loss": 0.84765959, + "num_input_tokens_seen": 14601795, + "step": 691, + "time_per_iteration": 2.5279581546783447 + }, + { + "auxiliary_loss_clip": 0.01232734, + "auxiliary_loss_mlp": 0.01038368, + "balance_loss_clip": 1.06868792, + "balance_loss_mlp": 1.02557087, + "epoch": 0.08320808032225095, + "flos": 15079788599040.0, + "grad_norm": 2.691041968052477, + "language_loss": 0.83029163, + "learning_rate": 3.970436557714932e-06, + "loss": 0.85300261, + "num_input_tokens_seen": 14618315, + "step": 692, + "time_per_iteration": 2.5089499950408936 + }, + { + "auxiliary_loss_clip": 0.01225984, + "auxiliary_loss_mlp": 0.01042021, + "balance_loss_clip": 1.0665493, + "balance_loss_mlp": 1.02837205, + "epoch": 0.08332832321289003, + "flos": 22383085501440.0, + "grad_norm": 2.2310222853365187, + "language_loss": 0.86425436, + "learning_rate": 3.970302967569358e-06, + "loss": 0.8869344, + "num_input_tokens_seen": 14636905, + "step": 693, + "time_per_iteration": 2.6006155014038086 + }, + { + "auxiliary_loss_clip": 0.01251, + "auxiliary_loss_mlp": 0.01046105, + "balance_loss_clip": 1.07529497, + "balance_loss_mlp": 1.03268254, + "epoch": 0.08344856610352913, + "flos": 24717386655360.0, + "grad_norm": 1.9624476696521875, + "language_loss": 0.68304384, + "learning_rate": 3.9701690785303896e-06, + "loss": 0.70601487, + "num_input_tokens_seen": 14656100, + "step": 694, + "time_per_iteration": 2.571890354156494 + }, + { + "auxiliary_loss_clip": 0.01252999, + "auxiliary_loss_mlp": 0.01046219, + "balance_loss_clip": 1.07151222, + "balance_loss_mlp": 1.03272498, + "epoch": 0.08356880899416821, + "flos": 25370206387200.0, + "grad_norm": 2.8349356725764996, + "language_loss": 0.88184774, + "learning_rate": 3.970034890618339e-06, + "loss": 0.90483999, + "num_input_tokens_seen": 14675790, + "step": 695, + "time_per_iteration": 2.5660157203674316 + }, + { + "auxiliary_loss_clip": 0.01232196, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.06832385, + "balance_loss_mlp": 1.03209949, + "epoch": 0.08368905188480731, + "flos": 24353072962560.0, + "grad_norm": 2.198210046186097, + "language_loss": 0.88546288, + "learning_rate": 3.969900403853562e-06, + "loss": 0.90822947, + "num_input_tokens_seen": 14694830, + "step": 696, + "time_per_iteration": 2.555873394012451 + }, + { + "auxiliary_loss_clip": 0.01267271, + "auxiliary_loss_mlp": 0.01058438, + "balance_loss_clip": 1.07504046, + "balance_loss_mlp": 1.04466891, + "epoch": 0.08380929477544641, + "flos": 18037319656320.0, + "grad_norm": 1.7968567354819687, + "language_loss": 0.7801978, + "learning_rate": 3.96976561825646e-06, + "loss": 0.80345488, + "num_input_tokens_seen": 14711920, + "step": 697, + "time_per_iteration": 2.473745822906494 + }, + { + "auxiliary_loss_clip": 0.01204472, + "auxiliary_loss_mlp": 0.01039934, + "balance_loss_clip": 1.06750631, + "balance_loss_mlp": 1.0271728, + "epoch": 0.08392953766608549, + "flos": 26286287875200.0, + "grad_norm": 1.9076661484904223, + "language_loss": 0.86911577, + "learning_rate": 3.969630533847479e-06, + "loss": 0.89155984, + "num_input_tokens_seen": 14730880, + "step": 698, + "time_per_iteration": 2.726006031036377 + }, + { + "auxiliary_loss_clip": 0.01248766, + "auxiliary_loss_mlp": 0.01039637, + "balance_loss_clip": 1.07070589, + "balance_loss_mlp": 1.02666128, + "epoch": 0.08404978055672459, + "flos": 22492146170880.0, + "grad_norm": 1.912668150610432, + "language_loss": 0.84301937, + "learning_rate": 3.969495150647113e-06, + "loss": 0.86590338, + "num_input_tokens_seen": 14749050, + "step": 699, + "time_per_iteration": 2.554166555404663 + }, + { + "auxiliary_loss_clip": 0.01214875, + "auxiliary_loss_mlp": 0.01039841, + "balance_loss_clip": 1.06978536, + "balance_loss_mlp": 1.02725279, + "epoch": 0.08417002344736367, + "flos": 24826878288000.0, + "grad_norm": 2.3979910193493934, + "language_loss": 0.76438642, + "learning_rate": 3.969359468675899e-06, + "loss": 0.78693354, + "num_input_tokens_seen": 14769180, + "step": 700, + "time_per_iteration": 2.696536064147949 + }, + { + "auxiliary_loss_clip": 0.01244028, + "auxiliary_loss_mlp": 0.01041014, + "balance_loss_clip": 1.07024026, + "balance_loss_mlp": 1.02781212, + "epoch": 0.08429026633800277, + "flos": 16945922862720.0, + "grad_norm": 2.0039685239047134, + "language_loss": 0.89212424, + "learning_rate": 3.969223487954418e-06, + "loss": 0.91497457, + "num_input_tokens_seen": 14786640, + "step": 701, + "time_per_iteration": 2.5321505069732666 + }, + { + "auxiliary_loss_clip": 0.01202975, + "auxiliary_loss_mlp": 0.01044621, + "balance_loss_clip": 1.07088447, + "balance_loss_mlp": 1.03233027, + "epoch": 0.08441050922864185, + "flos": 23841920471040.0, + "grad_norm": 4.084768347851272, + "language_loss": 0.82598758, + "learning_rate": 3.969087208503301e-06, + "loss": 0.84846354, + "num_input_tokens_seen": 14806720, + "step": 702, + "time_per_iteration": 2.6615612506866455 + }, + { + "auxiliary_loss_clip": 0.01201978, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_clip": 1.0665524, + "balance_loss_mlp": 1.03251791, + "epoch": 0.08453075211928095, + "flos": 25520205582720.0, + "grad_norm": 2.3926587597809243, + "language_loss": 0.84800619, + "learning_rate": 3.968950630343219e-06, + "loss": 0.87047756, + "num_input_tokens_seen": 14823705, + "step": 703, + "time_per_iteration": 2.6553988456726074 + }, + { + "auxiliary_loss_clip": 0.01228692, + "auxiliary_loss_mlp": 0.01041891, + "balance_loss_clip": 1.06679034, + "balance_loss_mlp": 1.02942145, + "epoch": 0.08465099500992004, + "flos": 19532496211200.0, + "grad_norm": 2.1413374696871763, + "language_loss": 0.93690538, + "learning_rate": 3.968813753494892e-06, + "loss": 0.95961124, + "num_input_tokens_seen": 14841865, + "step": 704, + "time_per_iteration": 2.629457473754883 + }, + { + "auxiliary_loss_clip": 0.01200602, + "auxiliary_loss_mlp": 0.00766838, + "balance_loss_clip": 1.05919659, + "balance_loss_mlp": 1.00026965, + "epoch": 0.08477123790055913, + "flos": 29351299403520.0, + "grad_norm": 2.247485423183913, + "language_loss": 0.75486815, + "learning_rate": 3.968676577979084e-06, + "loss": 0.77454257, + "num_input_tokens_seen": 14861415, + "step": 705, + "time_per_iteration": 2.6891446113586426 + }, + { + "auxiliary_loss_clip": 0.01190528, + "auxiliary_loss_mlp": 0.0104928, + "balance_loss_clip": 1.06026244, + "balance_loss_mlp": 1.0360117, + "epoch": 0.08489148079119822, + "flos": 18624495283200.0, + "grad_norm": 2.092828863937814, + "language_loss": 0.78101873, + "learning_rate": 3.968539103816605e-06, + "loss": 0.80341685, + "num_input_tokens_seen": 14879215, + "step": 706, + "time_per_iteration": 2.6363978385925293 + }, + { + "auxiliary_loss_clip": 0.01229073, + "auxiliary_loss_mlp": 0.00766382, + "balance_loss_clip": 1.07036948, + "balance_loss_mlp": 1.00033689, + "epoch": 0.0850117236818373, + "flos": 23471393725440.0, + "grad_norm": 1.8581787943709995, + "language_loss": 0.89284164, + "learning_rate": 3.9684013310283085e-06, + "loss": 0.91279614, + "num_input_tokens_seen": 14897900, + "step": 707, + "time_per_iteration": 2.6086738109588623 + }, + { + "auxiliary_loss_clip": 0.0122622, + "auxiliary_loss_mlp": 0.01047846, + "balance_loss_clip": 1.07080543, + "balance_loss_mlp": 1.0347805, + "epoch": 0.0851319665724764, + "flos": 40625058896640.0, + "grad_norm": 5.442632421449348, + "language_loss": 0.64219964, + "learning_rate": 3.9682632596350956e-06, + "loss": 0.6649403, + "num_input_tokens_seen": 14919065, + "step": 708, + "time_per_iteration": 2.753957509994507 + }, + { + "auxiliary_loss_clip": 0.01242668, + "auxiliary_loss_mlp": 0.01040624, + "balance_loss_clip": 1.07111847, + "balance_loss_mlp": 1.02765393, + "epoch": 0.0852522094631155, + "flos": 15879554870400.0, + "grad_norm": 1.9563547219313315, + "language_loss": 0.78531468, + "learning_rate": 3.968124889657911e-06, + "loss": 0.80814767, + "num_input_tokens_seen": 14934165, + "step": 709, + "time_per_iteration": 2.500439405441284 + }, + { + "auxiliary_loss_clip": 0.01193587, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.06246459, + "balance_loss_mlp": 1.03033948, + "epoch": 0.08537245235375458, + "flos": 14567091822720.0, + "grad_norm": 3.646994956941074, + "language_loss": 0.9064312, + "learning_rate": 3.967986221117746e-06, + "loss": 0.92878908, + "num_input_tokens_seen": 14950105, + "step": 710, + "time_per_iteration": 2.6542534828186035 + }, + { + "auxiliary_loss_clip": 0.01172055, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.06207466, + "balance_loss_mlp": 1.02709675, + "epoch": 0.08549269524439368, + "flos": 26468929555200.0, + "grad_norm": 2.533747217679113, + "language_loss": 0.86604768, + "learning_rate": 3.967847254035635e-06, + "loss": 0.88815999, + "num_input_tokens_seen": 14969490, + "step": 711, + "time_per_iteration": 2.9470837116241455 + }, + { + "auxiliary_loss_clip": 0.0121092, + "auxiliary_loss_mlp": 0.01043403, + "balance_loss_clip": 1.06390035, + "balance_loss_mlp": 1.03052831, + "epoch": 0.08561293813503276, + "flos": 13590214565760.0, + "grad_norm": 2.2386750145744556, + "language_loss": 0.85938454, + "learning_rate": 3.967707988432661e-06, + "loss": 0.88192779, + "num_input_tokens_seen": 14987195, + "step": 712, + "time_per_iteration": 2.9067230224609375 + }, + { + "auxiliary_loss_clip": 0.01262015, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.07074261, + "balance_loss_mlp": 1.02769971, + "epoch": 0.08573318102567186, + "flos": 26943524979840.0, + "grad_norm": 2.608793268359326, + "language_loss": 0.8768183, + "learning_rate": 3.967568424329949e-06, + "loss": 0.89985186, + "num_input_tokens_seen": 15007620, + "step": 713, + "time_per_iteration": 2.557586669921875 + }, + { + "auxiliary_loss_clip": 0.01125643, + "auxiliary_loss_mlp": 0.01008535, + "balance_loss_clip": 1.0274452, + "balance_loss_mlp": 1.00424349, + "epoch": 0.08585342391631094, + "flos": 67302739319040.0, + "grad_norm": 0.8339682721802405, + "language_loss": 0.55495805, + "learning_rate": 3.967428561748671e-06, + "loss": 0.57629985, + "num_input_tokens_seen": 15075590, + "step": 714, + "time_per_iteration": 3.9590275287628174 + }, + { + "auxiliary_loss_clip": 0.01189338, + "auxiliary_loss_mlp": 0.01046337, + "balance_loss_clip": 1.05928683, + "balance_loss_mlp": 1.03279519, + "epoch": 0.08597366680695004, + "flos": 22456594684800.0, + "grad_norm": 2.406491906210773, + "language_loss": 0.87914634, + "learning_rate": 3.967288400710045e-06, + "loss": 0.90150309, + "num_input_tokens_seen": 15095055, + "step": 715, + "time_per_iteration": 4.252544641494751 + }, + { + "auxiliary_loss_clip": 0.01206506, + "auxiliary_loss_mlp": 0.01039473, + "balance_loss_clip": 1.067276, + "balance_loss_mlp": 1.02590132, + "epoch": 0.08609390969758914, + "flos": 23550505430400.0, + "grad_norm": 1.8113188108545195, + "language_loss": 0.88191593, + "learning_rate": 3.9671479412353335e-06, + "loss": 0.90437567, + "num_input_tokens_seen": 15113520, + "step": 716, + "time_per_iteration": 3.5018017292022705 + }, + { + "auxiliary_loss_clip": 0.0124658, + "auxiliary_loss_mlp": 0.01043667, + "balance_loss_clip": 1.07306862, + "balance_loss_mlp": 1.03064299, + "epoch": 0.08621415258822822, + "flos": 25885848078720.0, + "grad_norm": 2.2771922178571247, + "language_loss": 0.74392021, + "learning_rate": 3.967007183345843e-06, + "loss": 0.7668227, + "num_input_tokens_seen": 15133375, + "step": 717, + "time_per_iteration": 2.5887489318847656 + }, + { + "auxiliary_loss_clip": 0.01237505, + "auxiliary_loss_mlp": 0.01039424, + "balance_loss_clip": 1.06799555, + "balance_loss_mlp": 1.02659655, + "epoch": 0.08633439547886732, + "flos": 13589568120960.0, + "grad_norm": 2.1681758363575456, + "language_loss": 0.89685607, + "learning_rate": 3.966866127062927e-06, + "loss": 0.91962534, + "num_input_tokens_seen": 15150500, + "step": 718, + "time_per_iteration": 2.5089569091796875 + }, + { + "auxiliary_loss_clip": 0.01126422, + "auxiliary_loss_mlp": 0.01005766, + "balance_loss_clip": 1.02597737, + "balance_loss_mlp": 1.00164115, + "epoch": 0.0864546383695064, + "flos": 57767342434560.0, + "grad_norm": 0.8797506243913611, + "language_loss": 0.62673581, + "learning_rate": 3.966724772407982e-06, + "loss": 0.6480577, + "num_input_tokens_seen": 15208015, + "step": 719, + "time_per_iteration": 2.972865581512451 + }, + { + "auxiliary_loss_clip": 0.01204569, + "auxiliary_loss_mlp": 0.01040238, + "balance_loss_clip": 1.06506062, + "balance_loss_mlp": 1.02784646, + "epoch": 0.0865748812601455, + "flos": 20046952753920.0, + "grad_norm": 3.197730513244888, + "language_loss": 0.89148206, + "learning_rate": 3.966583119402454e-06, + "loss": 0.91393006, + "num_input_tokens_seen": 15224780, + "step": 720, + "time_per_iteration": 2.590679168701172 + }, + { + "auxiliary_loss_clip": 0.01238102, + "auxiliary_loss_mlp": 0.00765865, + "balance_loss_clip": 1.06846118, + "balance_loss_mlp": 1.00017905, + "epoch": 0.08669512415078459, + "flos": 35262446935680.0, + "grad_norm": 1.835475061832961, + "language_loss": 0.82331693, + "learning_rate": 3.9664411680678305e-06, + "loss": 0.84335661, + "num_input_tokens_seen": 15246535, + "step": 721, + "time_per_iteration": 2.6514365673065186 + }, + { + "auxiliary_loss_clip": 0.01100283, + "auxiliary_loss_mlp": 0.01006894, + "balance_loss_clip": 1.02276468, + "balance_loss_mlp": 1.00286484, + "epoch": 0.08681536704142367, + "flos": 65654870048640.0, + "grad_norm": 0.8504548309843537, + "language_loss": 0.61479133, + "learning_rate": 3.966298918425644e-06, + "loss": 0.63586313, + "num_input_tokens_seen": 15304025, + "step": 722, + "time_per_iteration": 3.040348768234253 + }, + { + "auxiliary_loss_clip": 0.01243942, + "auxiliary_loss_mlp": 0.01042133, + "balance_loss_clip": 1.06751406, + "balance_loss_mlp": 1.02879941, + "epoch": 0.08693560993206277, + "flos": 34529940881280.0, + "grad_norm": 1.8892694256120992, + "language_loss": 0.8262831, + "learning_rate": 3.966156370497476e-06, + "loss": 0.8491438, + "num_input_tokens_seen": 15327635, + "step": 723, + "time_per_iteration": 2.630070924758911 + }, + { + "auxiliary_loss_clip": 0.01244589, + "auxiliary_loss_mlp": 0.01039352, + "balance_loss_clip": 1.06830597, + "balance_loss_mlp": 1.02691817, + "epoch": 0.08705585282270185, + "flos": 23149419189120.0, + "grad_norm": 1.9765714121339535, + "language_loss": 0.88450456, + "learning_rate": 3.96601352430495e-06, + "loss": 0.90734398, + "num_input_tokens_seen": 15347405, + "step": 724, + "time_per_iteration": 2.54315185546875 + }, + { + "auxiliary_loss_clip": 0.0122603, + "auxiliary_loss_mlp": 0.01052406, + "balance_loss_clip": 1.06951308, + "balance_loss_mlp": 1.03925085, + "epoch": 0.08717609571334095, + "flos": 29497599498240.0, + "grad_norm": 1.7606593758580305, + "language_loss": 0.83099043, + "learning_rate": 3.965870379869735e-06, + "loss": 0.85377479, + "num_input_tokens_seen": 15369450, + "step": 725, + "time_per_iteration": 2.62967586517334 + }, + { + "auxiliary_loss_clip": 0.01239474, + "auxiliary_loss_mlp": 0.01043134, + "balance_loss_clip": 1.06500781, + "balance_loss_mlp": 1.03061712, + "epoch": 0.08729633860398003, + "flos": 20667489137280.0, + "grad_norm": 2.22584673923551, + "language_loss": 0.86881793, + "learning_rate": 3.965726937213547e-06, + "loss": 0.891644, + "num_input_tokens_seen": 15388085, + "step": 726, + "time_per_iteration": 2.55401611328125 + }, + { + "auxiliary_loss_clip": 0.01238424, + "auxiliary_loss_mlp": 0.01049697, + "balance_loss_clip": 1.06470847, + "balance_loss_mlp": 1.03665543, + "epoch": 0.08741658149461913, + "flos": 18369493655040.0, + "grad_norm": 2.151648204066083, + "language_loss": 0.81186914, + "learning_rate": 3.965583196358144e-06, + "loss": 0.83475041, + "num_input_tokens_seen": 15407120, + "step": 727, + "time_per_iteration": 2.509429454803467 + }, + { + "auxiliary_loss_clip": 0.01259538, + "auxiliary_loss_mlp": 0.0104367, + "balance_loss_clip": 1.07008934, + "balance_loss_mlp": 1.0293293, + "epoch": 0.08753682438525823, + "flos": 18729677283840.0, + "grad_norm": 2.35394729686958, + "language_loss": 0.74322057, + "learning_rate": 3.965439157325335e-06, + "loss": 0.7662527, + "num_input_tokens_seen": 15424485, + "step": 728, + "time_per_iteration": 2.441293716430664 + }, + { + "auxiliary_loss_clip": 0.01218303, + "auxiliary_loss_mlp": 0.0103781, + "balance_loss_clip": 1.06103265, + "balance_loss_mlp": 1.02334952, + "epoch": 0.08765706727589731, + "flos": 27776113303680.0, + "grad_norm": 1.9535928059670373, + "language_loss": 0.76105404, + "learning_rate": 3.965294820136968e-06, + "loss": 0.78361511, + "num_input_tokens_seen": 15446285, + "step": 729, + "time_per_iteration": 2.6831133365631104 + }, + { + "auxiliary_loss_clip": 0.01229427, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.06814563, + "balance_loss_mlp": 1.025334, + "epoch": 0.08777731016653641, + "flos": 24389127239040.0, + "grad_norm": 2.1511631392446726, + "language_loss": 0.86837113, + "learning_rate": 3.965150184814938e-06, + "loss": 0.89104259, + "num_input_tokens_seen": 15465770, + "step": 730, + "time_per_iteration": 2.593564510345459 + }, + { + "auxiliary_loss_clip": 0.01215966, + "auxiliary_loss_mlp": 0.01041069, + "balance_loss_clip": 1.06531978, + "balance_loss_mlp": 1.02778959, + "epoch": 0.08789755305717549, + "flos": 21981855605760.0, + "grad_norm": 2.021302841036452, + "language_loss": 0.76748025, + "learning_rate": 3.965005251381189e-06, + "loss": 0.79005063, + "num_input_tokens_seen": 15483705, + "step": 731, + "time_per_iteration": 2.6122143268585205 + }, + { + "auxiliary_loss_clip": 0.01127584, + "auxiliary_loss_mlp": 0.01011785, + "balance_loss_clip": 1.02510381, + "balance_loss_mlp": 1.00768435, + "epoch": 0.08801779594781459, + "flos": 58360120583040.0, + "grad_norm": 0.9717997599186701, + "language_loss": 0.64608657, + "learning_rate": 3.964860019857705e-06, + "loss": 0.66748023, + "num_input_tokens_seen": 15548620, + "step": 732, + "time_per_iteration": 3.10711669921875 + }, + { + "auxiliary_loss_clip": 0.01260301, + "auxiliary_loss_mlp": 0.01043154, + "balance_loss_clip": 1.07399726, + "balance_loss_mlp": 1.03102434, + "epoch": 0.08813803883845367, + "flos": 23294785530240.0, + "grad_norm": 1.8206133020948418, + "language_loss": 0.83789277, + "learning_rate": 3.964714490266518e-06, + "loss": 0.86092728, + "num_input_tokens_seen": 15569265, + "step": 733, + "time_per_iteration": 2.5126473903656006 + }, + { + "auxiliary_loss_clip": 0.01125597, + "auxiliary_loss_mlp": 0.01005217, + "balance_loss_clip": 1.02625155, + "balance_loss_mlp": 1.00104451, + "epoch": 0.08825828172909277, + "flos": 63424924882560.0, + "grad_norm": 0.8861968915982156, + "language_loss": 0.64556694, + "learning_rate": 3.964568662629706e-06, + "loss": 0.66687512, + "num_input_tokens_seen": 15630570, + "step": 734, + "time_per_iteration": 2.9916768074035645 + }, + { + "auxiliary_loss_clip": 0.01234664, + "auxiliary_loss_mlp": 0.01044749, + "balance_loss_clip": 1.06426501, + "balance_loss_mlp": 1.03273296, + "epoch": 0.08837852461973186, + "flos": 26720986268160.0, + "grad_norm": 2.143013551686155, + "language_loss": 0.84600925, + "learning_rate": 3.9644225369693895e-06, + "loss": 0.86880338, + "num_input_tokens_seen": 15650870, + "step": 735, + "time_per_iteration": 2.564491033554077 + }, + { + "auxiliary_loss_clip": 0.01256849, + "auxiliary_loss_mlp": 0.01038289, + "balance_loss_clip": 1.07175684, + "balance_loss_mlp": 1.02608156, + "epoch": 0.08849876751037095, + "flos": 27265427688960.0, + "grad_norm": 2.568594686377238, + "language_loss": 0.86595887, + "learning_rate": 3.964276113307735e-06, + "loss": 0.88891017, + "num_input_tokens_seen": 15670835, + "step": 736, + "time_per_iteration": 2.548471689224243 + }, + { + "auxiliary_loss_clip": 0.01207633, + "auxiliary_loss_mlp": 0.01055019, + "balance_loss_clip": 1.064996, + "balance_loss_mlp": 1.04199481, + "epoch": 0.08861901040101004, + "flos": 19828759587840.0, + "grad_norm": 2.200777328594122, + "language_loss": 0.80789328, + "learning_rate": 3.9641293916669574e-06, + "loss": 0.8305198, + "num_input_tokens_seen": 15689795, + "step": 737, + "time_per_iteration": 2.604114532470703 + }, + { + "auxiliary_loss_clip": 0.01203642, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.06393814, + "balance_loss_mlp": 1.02305853, + "epoch": 0.08873925329164913, + "flos": 23658704173440.0, + "grad_norm": 1.8158915380501024, + "language_loss": 0.8280381, + "learning_rate": 3.9639823720693115e-06, + "loss": 0.85043848, + "num_input_tokens_seen": 15711650, + "step": 738, + "time_per_iteration": 2.696847915649414 + }, + { + "auxiliary_loss_clip": 0.01105914, + "auxiliary_loss_mlp": 0.01016754, + "balance_loss_clip": 1.03346586, + "balance_loss_mlp": 1.01177144, + "epoch": 0.08885949618228822, + "flos": 71831541893760.0, + "grad_norm": 0.8392848151604831, + "language_loss": 0.60047221, + "learning_rate": 3.963835054537102e-06, + "loss": 0.62169886, + "num_input_tokens_seen": 15780615, + "step": 739, + "time_per_iteration": 3.23791241645813 + }, + { + "auxiliary_loss_clip": 0.01219907, + "auxiliary_loss_mlp": 0.0105225, + "balance_loss_clip": 1.06260693, + "balance_loss_mlp": 1.03995955, + "epoch": 0.08897973907292732, + "flos": 22346169298560.0, + "grad_norm": 2.190365928104189, + "language_loss": 0.6099714, + "learning_rate": 3.963687439092676e-06, + "loss": 0.63269299, + "num_input_tokens_seen": 15801300, + "step": 740, + "time_per_iteration": 2.605628728866577 + }, + { + "auxiliary_loss_clip": 0.01239106, + "auxiliary_loss_mlp": 0.01048687, + "balance_loss_clip": 1.0677743, + "balance_loss_mlp": 1.03613985, + "epoch": 0.0890999819635664, + "flos": 21251827589760.0, + "grad_norm": 2.0502454626781392, + "language_loss": 0.80261332, + "learning_rate": 3.963539525758427e-06, + "loss": 0.82549131, + "num_input_tokens_seen": 15820860, + "step": 741, + "time_per_iteration": 4.148313522338867 + }, + { + "auxiliary_loss_clip": 0.01225402, + "auxiliary_loss_mlp": 0.0103841, + "balance_loss_clip": 1.06687486, + "balance_loss_mlp": 1.02483201, + "epoch": 0.0892202248542055, + "flos": 25370888745600.0, + "grad_norm": 2.11347652141186, + "language_loss": 0.67678612, + "learning_rate": 3.9633913145567925e-06, + "loss": 0.69942427, + "num_input_tokens_seen": 15841350, + "step": 742, + "time_per_iteration": 3.463514566421509 + }, + { + "auxiliary_loss_clip": 0.01223112, + "auxiliary_loss_mlp": 0.01036298, + "balance_loss_clip": 1.06784868, + "balance_loss_mlp": 1.02449679, + "epoch": 0.08934046774484458, + "flos": 24457895827200.0, + "grad_norm": 2.191582810812382, + "language_loss": 0.81525695, + "learning_rate": 3.9632428055102575e-06, + "loss": 0.83785105, + "num_input_tokens_seen": 15861360, + "step": 743, + "time_per_iteration": 3.4594414234161377 + }, + { + "auxiliary_loss_clip": 0.01243546, + "auxiliary_loss_mlp": 0.01043813, + "balance_loss_clip": 1.07199073, + "balance_loss_mlp": 1.03010964, + "epoch": 0.08946071063548368, + "flos": 35772773414400.0, + "grad_norm": 1.9741278613046018, + "language_loss": 0.67003, + "learning_rate": 3.9630939986413495e-06, + "loss": 0.69290364, + "num_input_tokens_seen": 15883160, + "step": 744, + "time_per_iteration": 2.667379856109619 + }, + { + "auxiliary_loss_clip": 0.01196295, + "auxiliary_loss_mlp": 0.01044414, + "balance_loss_clip": 1.06346834, + "balance_loss_mlp": 1.03128278, + "epoch": 0.08958095352612276, + "flos": 14356584167040.0, + "grad_norm": 1.7406608405953365, + "language_loss": 0.78252316, + "learning_rate": 3.962944893972643e-06, + "loss": 0.80493021, + "num_input_tokens_seen": 15901610, + "step": 745, + "time_per_iteration": 2.580564260482788 + }, + { + "auxiliary_loss_clip": 0.01220473, + "auxiliary_loss_mlp": 0.01042055, + "balance_loss_clip": 1.06515944, + "balance_loss_mlp": 1.02955008, + "epoch": 0.08970119641676186, + "flos": 17853277345920.0, + "grad_norm": 2.7840713399751613, + "language_loss": 0.90480894, + "learning_rate": 3.962795491526756e-06, + "loss": 0.92743421, + "num_input_tokens_seen": 15918770, + "step": 746, + "time_per_iteration": 2.5759356021881104 + }, + { + "auxiliary_loss_clip": 0.01259934, + "auxiliary_loss_mlp": 0.01054159, + "balance_loss_clip": 1.07319033, + "balance_loss_mlp": 1.04038465, + "epoch": 0.08982143930740095, + "flos": 20811670329600.0, + "grad_norm": 2.4108909395120532, + "language_loss": 0.89553893, + "learning_rate": 3.962645791326354e-06, + "loss": 0.91867995, + "num_input_tokens_seen": 15938025, + "step": 747, + "time_per_iteration": 2.4914937019348145 + }, + { + "auxiliary_loss_clip": 0.01236492, + "auxiliary_loss_mlp": 0.01041316, + "balance_loss_clip": 1.06902027, + "balance_loss_mlp": 1.0299201, + "epoch": 0.08994168219804004, + "flos": 24097712198400.0, + "grad_norm": 1.9429655143654245, + "language_loss": 0.83066165, + "learning_rate": 3.962495793394146e-06, + "loss": 0.85343969, + "num_input_tokens_seen": 15957215, + "step": 748, + "time_per_iteration": 2.61409330368042 + }, + { + "auxiliary_loss_clip": 0.01135261, + "auxiliary_loss_mlp": 0.01012196, + "balance_loss_clip": 1.02588487, + "balance_loss_mlp": 1.00828588, + "epoch": 0.09006192508867913, + "flos": 57188893812480.0, + "grad_norm": 0.7487012947921607, + "language_loss": 0.61236072, + "learning_rate": 3.9623454977528864e-06, + "loss": 0.63383532, + "num_input_tokens_seen": 16015870, + "step": 749, + "time_per_iteration": 2.8964195251464844 + }, + { + "auxiliary_loss_clip": 0.01209774, + "auxiliary_loss_mlp": 0.01048011, + "balance_loss_clip": 1.06379175, + "balance_loss_mlp": 1.03567839, + "epoch": 0.09018216797931822, + "flos": 20487505063680.0, + "grad_norm": 1.8252040716289808, + "language_loss": 0.85107136, + "learning_rate": 3.962194904425375e-06, + "loss": 0.87364924, + "num_input_tokens_seen": 16036500, + "step": 750, + "time_per_iteration": 2.6538734436035156 + }, + { + "auxiliary_loss_clip": 0.01232799, + "auxiliary_loss_mlp": 0.01042511, + "balance_loss_clip": 1.06668937, + "balance_loss_mlp": 1.03003013, + "epoch": 0.09030241086995731, + "flos": 22638123043200.0, + "grad_norm": 3.116746254542993, + "language_loss": 0.68255734, + "learning_rate": 3.9620440134344566e-06, + "loss": 0.70531046, + "num_input_tokens_seen": 16054655, + "step": 751, + "time_per_iteration": 2.5317018032073975 + }, + { + "auxiliary_loss_clip": 0.01204148, + "auxiliary_loss_mlp": 0.01044945, + "balance_loss_clip": 1.0643996, + "balance_loss_mlp": 1.03201699, + "epoch": 0.09042265376059641, + "flos": 21871502046720.0, + "grad_norm": 2.997105049436566, + "language_loss": 0.82299364, + "learning_rate": 3.9618928248030215e-06, + "loss": 0.84548461, + "num_input_tokens_seen": 16074165, + "step": 752, + "time_per_iteration": 2.638350486755371 + }, + { + "auxiliary_loss_clip": 0.01236608, + "auxiliary_loss_mlp": 0.01044808, + "balance_loss_clip": 1.06934679, + "balance_loss_mlp": 1.03266621, + "epoch": 0.0905428966512355, + "flos": 24316192673280.0, + "grad_norm": 1.9660484946703065, + "language_loss": 0.82874864, + "learning_rate": 3.961741338554005e-06, + "loss": 0.85156286, + "num_input_tokens_seen": 16092505, + "step": 753, + "time_per_iteration": 2.534867763519287 + }, + { + "auxiliary_loss_clip": 0.01231074, + "auxiliary_loss_mlp": 0.01052697, + "balance_loss_clip": 1.06870103, + "balance_loss_mlp": 1.03960752, + "epoch": 0.09066313954187459, + "flos": 35845061535360.0, + "grad_norm": 1.7435346595427537, + "language_loss": 0.75742501, + "learning_rate": 3.9615895547103865e-06, + "loss": 0.78026271, + "num_input_tokens_seen": 16116150, + "step": 754, + "time_per_iteration": 2.6791906356811523 + }, + { + "auxiliary_loss_clip": 0.01220575, + "auxiliary_loss_mlp": 0.01050618, + "balance_loss_clip": 1.06407571, + "balance_loss_mlp": 1.0376358, + "epoch": 0.09078338243251367, + "flos": 29168729550720.0, + "grad_norm": 1.91167513211398, + "language_loss": 0.77737838, + "learning_rate": 3.961437473295193e-06, + "loss": 0.80009031, + "num_input_tokens_seen": 16136295, + "step": 755, + "time_per_iteration": 2.620534658432007 + }, + { + "auxiliary_loss_clip": 0.01176728, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.05563712, + "balance_loss_mlp": 1.02905452, + "epoch": 0.09090362532315277, + "flos": 21907699977600.0, + "grad_norm": 2.55666270523674, + "language_loss": 0.72194338, + "learning_rate": 3.961285094331495e-06, + "loss": 0.74412465, + "num_input_tokens_seen": 16154210, + "step": 756, + "time_per_iteration": 2.6476550102233887 + }, + { + "auxiliary_loss_clip": 0.01249015, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.06674623, + "balance_loss_mlp": 1.02382684, + "epoch": 0.09102386821379185, + "flos": 27344503480320.0, + "grad_norm": 1.8837184445602146, + "language_loss": 0.86164725, + "learning_rate": 3.961132417842406e-06, + "loss": 0.88449234, + "num_input_tokens_seen": 16173995, + "step": 757, + "time_per_iteration": 2.633317232131958 + }, + { + "auxiliary_loss_clip": 0.01231315, + "auxiliary_loss_mlp": 0.0105152, + "balance_loss_clip": 1.06706858, + "balance_loss_mlp": 1.0398438, + "epoch": 0.09114411110443095, + "flos": 20813501923200.0, + "grad_norm": 2.438111727915082, + "language_loss": 0.75167799, + "learning_rate": 3.960979443851089e-06, + "loss": 0.77450633, + "num_input_tokens_seen": 16191020, + "step": 758, + "time_per_iteration": 2.5230281352996826 + }, + { + "auxiliary_loss_clip": 0.01220444, + "auxiliary_loss_mlp": 0.0104145, + "balance_loss_clip": 1.06604314, + "balance_loss_mlp": 1.02834296, + "epoch": 0.09126435399507005, + "flos": 26145949438080.0, + "grad_norm": 1.7835830370434884, + "language_loss": 0.79153293, + "learning_rate": 3.96082617238075e-06, + "loss": 0.81415188, + "num_input_tokens_seen": 16213645, + "step": 759, + "time_per_iteration": 2.6373579502105713 + }, + { + "auxiliary_loss_clip": 0.01219924, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.06432903, + "balance_loss_mlp": 1.02681303, + "epoch": 0.09138459688570913, + "flos": 24388911757440.0, + "grad_norm": 2.4030029515871356, + "language_loss": 0.79990709, + "learning_rate": 3.960672603454639e-06, + "loss": 0.82249081, + "num_input_tokens_seen": 16233625, + "step": 760, + "time_per_iteration": 2.5876359939575195 + }, + { + "auxiliary_loss_clip": 0.01232584, + "auxiliary_loss_mlp": 0.0104431, + "balance_loss_clip": 1.06770086, + "balance_loss_mlp": 1.03119111, + "epoch": 0.09150483977634823, + "flos": 21032664756480.0, + "grad_norm": 3.1256636034472383, + "language_loss": 0.76926249, + "learning_rate": 3.960518737096054e-06, + "loss": 0.79203141, + "num_input_tokens_seen": 16253255, + "step": 761, + "time_per_iteration": 2.5378503799438477 + }, + { + "auxiliary_loss_clip": 0.01237537, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.0688293, + "balance_loss_mlp": 1.02685297, + "epoch": 0.09162508266698731, + "flos": 22856998567680.0, + "grad_norm": 2.2192148737818016, + "language_loss": 0.72188818, + "learning_rate": 3.960364573328334e-06, + "loss": 0.74465662, + "num_input_tokens_seen": 16272580, + "step": 762, + "time_per_iteration": 2.5311474800109863 + }, + { + "auxiliary_loss_clip": 0.01205773, + "auxiliary_loss_mlp": 0.01038027, + "balance_loss_clip": 1.06161964, + "balance_loss_mlp": 1.02507496, + "epoch": 0.0917453255576264, + "flos": 21724411852800.0, + "grad_norm": 1.9593625852296, + "language_loss": 0.88944232, + "learning_rate": 3.9602101121748675e-06, + "loss": 0.91188037, + "num_input_tokens_seen": 16293075, + "step": 763, + "time_per_iteration": 2.6149067878723145 + }, + { + "auxiliary_loss_clip": 0.01222695, + "auxiliary_loss_mlp": 0.01040248, + "balance_loss_clip": 1.06933141, + "balance_loss_mlp": 1.02872014, + "epoch": 0.0918655684482655, + "flos": 14609215497600.0, + "grad_norm": 1.923559539220672, + "language_loss": 0.7213006, + "learning_rate": 3.960055353659085e-06, + "loss": 0.74393004, + "num_input_tokens_seen": 16310185, + "step": 764, + "time_per_iteration": 2.5277061462402344 + }, + { + "auxiliary_loss_clip": 0.01209373, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.06591654, + "balance_loss_mlp": 1.02087116, + "epoch": 0.09198581133890459, + "flos": 23435016226560.0, + "grad_norm": 1.8471511245631569, + "language_loss": 0.83871853, + "learning_rate": 3.959900297804465e-06, + "loss": 0.8611415, + "num_input_tokens_seen": 16330355, + "step": 765, + "time_per_iteration": 2.6588802337646484 + }, + { + "auxiliary_loss_clip": 0.01210054, + "auxiliary_loss_mlp": 0.01037334, + "balance_loss_clip": 1.06390882, + "balance_loss_mlp": 1.02531743, + "epoch": 0.09210605422954368, + "flos": 16795887753600.0, + "grad_norm": 1.9564219188643026, + "language_loss": 0.77339691, + "learning_rate": 3.9597449446345276e-06, + "loss": 0.79587078, + "num_input_tokens_seen": 16347600, + "step": 766, + "time_per_iteration": 2.54259991645813 + }, + { + "auxiliary_loss_clip": 0.01206682, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.05965638, + "balance_loss_mlp": 1.02658296, + "epoch": 0.09222629712018277, + "flos": 22674249146880.0, + "grad_norm": 2.1298118106402057, + "language_loss": 0.83415002, + "learning_rate": 3.95958929417284e-06, + "loss": 0.85659611, + "num_input_tokens_seen": 16365755, + "step": 767, + "time_per_iteration": 2.593198776245117 + }, + { + "auxiliary_loss_clip": 0.0112605, + "auxiliary_loss_mlp": 0.01007895, + "balance_loss_clip": 1.02580655, + "balance_loss_mlp": 1.00388992, + "epoch": 0.09234654001082186, + "flos": 69976756327680.0, + "grad_norm": 0.7312384833474007, + "language_loss": 0.5879674, + "learning_rate": 3.9594333464430145e-06, + "loss": 0.60930693, + "num_input_tokens_seen": 16435245, + "step": 768, + "time_per_iteration": 4.689736843109131 + }, + { + "auxiliary_loss_clip": 0.01145614, + "auxiliary_loss_mlp": 0.01043815, + "balance_loss_clip": 1.05238318, + "balance_loss_mlp": 1.03241229, + "epoch": 0.09246678290146094, + "flos": 20011437181440.0, + "grad_norm": 1.849293474416031, + "language_loss": 0.88107038, + "learning_rate": 3.959277101468709e-06, + "loss": 0.90296465, + "num_input_tokens_seen": 16454795, + "step": 769, + "time_per_iteration": 3.564208745956421 + }, + { + "auxiliary_loss_clip": 0.01205198, + "auxiliary_loss_mlp": 0.01046547, + "balance_loss_clip": 1.0625, + "balance_loss_mlp": 1.03451908, + "epoch": 0.09258702579210004, + "flos": 17747448900480.0, + "grad_norm": 3.006295516896853, + "language_loss": 0.78764904, + "learning_rate": 3.959120559273624e-06, + "loss": 0.81016648, + "num_input_tokens_seen": 16472580, + "step": 770, + "time_per_iteration": 2.573676109313965 + }, + { + "auxiliary_loss_clip": 0.012046, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.06371641, + "balance_loss_mlp": 1.03057551, + "epoch": 0.09270726868273914, + "flos": 20886544229760.0, + "grad_norm": 1.7780905701429013, + "language_loss": 0.83329165, + "learning_rate": 3.958963719881509e-06, + "loss": 0.85576117, + "num_input_tokens_seen": 16490670, + "step": 771, + "time_per_iteration": 2.634565830230713 + }, + { + "auxiliary_loss_clip": 0.012375, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.070665, + "balance_loss_mlp": 1.02521074, + "epoch": 0.09282751157337822, + "flos": 17015697031680.0, + "grad_norm": 2.1995602992394407, + "language_loss": 0.93782759, + "learning_rate": 3.958806583316154e-06, + "loss": 0.96058249, + "num_input_tokens_seen": 16508640, + "step": 772, + "time_per_iteration": 2.5302464962005615 + }, + { + "auxiliary_loss_clip": 0.01253086, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.07188642, + "balance_loss_mlp": 1.02264607, + "epoch": 0.09294775446401732, + "flos": 32523647748480.0, + "grad_norm": 1.8650159343366985, + "language_loss": 0.7872268, + "learning_rate": 3.9586491496013985e-06, + "loss": 0.81009686, + "num_input_tokens_seen": 16531035, + "step": 773, + "time_per_iteration": 2.5678648948669434 + }, + { + "auxiliary_loss_clip": 0.01241467, + "auxiliary_loss_mlp": 0.01046493, + "balance_loss_clip": 1.06949806, + "balance_loss_mlp": 1.03448272, + "epoch": 0.0930679973546564, + "flos": 18259750627200.0, + "grad_norm": 2.1694090712863936, + "language_loss": 0.83244175, + "learning_rate": 3.958491418761124e-06, + "loss": 0.85532135, + "num_input_tokens_seen": 16548605, + "step": 774, + "time_per_iteration": 2.498048782348633 + }, + { + "auxiliary_loss_clip": 0.01220382, + "auxiliary_loss_mlp": 0.01036623, + "balance_loss_clip": 1.0627861, + "balance_loss_mlp": 1.02492869, + "epoch": 0.0931882402452955, + "flos": 21099745405440.0, + "grad_norm": 2.4946998974856034, + "language_loss": 0.73005903, + "learning_rate": 3.958333390819258e-06, + "loss": 0.7526291, + "num_input_tokens_seen": 16565535, + "step": 775, + "time_per_iteration": 2.564757823944092 + }, + { + "auxiliary_loss_clip": 0.01252142, + "auxiliary_loss_mlp": 0.01040642, + "balance_loss_clip": 1.07084692, + "balance_loss_mlp": 1.02937102, + "epoch": 0.0933084831359346, + "flos": 24207275658240.0, + "grad_norm": 1.9814847555112147, + "language_loss": 0.80175126, + "learning_rate": 3.9581750657997754e-06, + "loss": 0.82467914, + "num_input_tokens_seen": 16584900, + "step": 776, + "time_per_iteration": 2.527513265609741 + }, + { + "auxiliary_loss_clip": 0.01217284, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.06165981, + "balance_loss_mlp": 1.02664614, + "epoch": 0.09342872602657368, + "flos": 25480272637440.0, + "grad_norm": 1.753109203356896, + "language_loss": 0.89739782, + "learning_rate": 3.95801644372669e-06, + "loss": 0.91995025, + "num_input_tokens_seen": 16604805, + "step": 777, + "time_per_iteration": 2.5987749099731445 + }, + { + "auxiliary_loss_clip": 0.01225729, + "auxiliary_loss_mlp": 0.01042471, + "balance_loss_clip": 1.06281602, + "balance_loss_mlp": 1.0308187, + "epoch": 0.09354896891721277, + "flos": 23149060053120.0, + "grad_norm": 2.126772034068981, + "language_loss": 0.84682322, + "learning_rate": 3.957857524624068e-06, + "loss": 0.86950517, + "num_input_tokens_seen": 16623685, + "step": 778, + "time_per_iteration": 2.567216634750366 + }, + { + "auxiliary_loss_clip": 0.01221143, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.06686115, + "balance_loss_mlp": 1.02899218, + "epoch": 0.09366921180785186, + "flos": 24279563779200.0, + "grad_norm": 1.6495817965267818, + "language_loss": 0.89487755, + "learning_rate": 3.957698308516016e-06, + "loss": 0.9174943, + "num_input_tokens_seen": 16644985, + "step": 779, + "time_per_iteration": 2.5927236080169678 + }, + { + "auxiliary_loss_clip": 0.01232512, + "auxiliary_loss_mlp": 0.00765155, + "balance_loss_clip": 1.06897628, + "balance_loss_mlp": 1.00041103, + "epoch": 0.09378945469849095, + "flos": 18730036419840.0, + "grad_norm": 1.9629140592359302, + "language_loss": 0.82340711, + "learning_rate": 3.957538795426688e-06, + "loss": 0.84338385, + "num_input_tokens_seen": 16662410, + "step": 780, + "time_per_iteration": 2.512120485305786 + }, + { + "auxiliary_loss_clip": 0.01221532, + "auxiliary_loss_mlp": 0.01044925, + "balance_loss_clip": 1.06474781, + "balance_loss_mlp": 1.03170431, + "epoch": 0.09390969758913004, + "flos": 23218834222080.0, + "grad_norm": 2.3787498835355376, + "language_loss": 0.77122021, + "learning_rate": 3.9573789853802804e-06, + "loss": 0.79388475, + "num_input_tokens_seen": 16680885, + "step": 781, + "time_per_iteration": 2.5790092945098877 + }, + { + "auxiliary_loss_clip": 0.01221178, + "auxiliary_loss_mlp": 0.00764982, + "balance_loss_clip": 1.06831777, + "balance_loss_mlp": 1.00035882, + "epoch": 0.09402994047976913, + "flos": 19646728439040.0, + "grad_norm": 2.554537162341992, + "language_loss": 0.74640894, + "learning_rate": 3.957218878401037e-06, + "loss": 0.76627052, + "num_input_tokens_seen": 16699375, + "step": 782, + "time_per_iteration": 2.5763988494873047 + }, + { + "auxiliary_loss_clip": 0.01252666, + "auxiliary_loss_mlp": 0.01047058, + "balance_loss_clip": 1.07043099, + "balance_loss_mlp": 1.03419495, + "epoch": 0.09415018337040823, + "flos": 29420463041280.0, + "grad_norm": 1.9805424188329135, + "language_loss": 0.88926888, + "learning_rate": 3.957058474513246e-06, + "loss": 0.91226614, + "num_input_tokens_seen": 16719230, + "step": 783, + "time_per_iteration": 2.544337034225464 + }, + { + "auxiliary_loss_clip": 0.01232638, + "auxiliary_loss_mlp": 0.0104423, + "balance_loss_clip": 1.06809676, + "balance_loss_mlp": 1.03310168, + "epoch": 0.09427042626104731, + "flos": 24572092141440.0, + "grad_norm": 1.7049045798645293, + "language_loss": 0.78703558, + "learning_rate": 3.956897773741241e-06, + "loss": 0.80980426, + "num_input_tokens_seen": 16738220, + "step": 784, + "time_per_iteration": 2.584751605987549 + }, + { + "auxiliary_loss_clip": 0.01209785, + "auxiliary_loss_mlp": 0.01044663, + "balance_loss_clip": 1.06302822, + "balance_loss_mlp": 1.03232455, + "epoch": 0.09439066915168641, + "flos": 26359581576960.0, + "grad_norm": 1.7954940158014103, + "language_loss": 0.71830308, + "learning_rate": 3.956736776109398e-06, + "loss": 0.74084759, + "num_input_tokens_seen": 16759395, + "step": 785, + "time_per_iteration": 2.652832269668579 + }, + { + "auxiliary_loss_clip": 0.01227276, + "auxiliary_loss_mlp": 0.00766183, + "balance_loss_clip": 1.06390095, + "balance_loss_mlp": 1.00047815, + "epoch": 0.09451091204232549, + "flos": 19427278296960.0, + "grad_norm": 2.0545432496868, + "language_loss": 0.83776742, + "learning_rate": 3.956575481642143e-06, + "loss": 0.85770202, + "num_input_tokens_seen": 16778285, + "step": 786, + "time_per_iteration": 2.5896732807159424 + }, + { + "auxiliary_loss_clip": 0.01178079, + "auxiliary_loss_mlp": 0.01037219, + "balance_loss_clip": 1.05468822, + "balance_loss_mlp": 1.02488136, + "epoch": 0.09463115493296459, + "flos": 25368051571200.0, + "grad_norm": 2.4041934698281278, + "language_loss": 0.75313181, + "learning_rate": 3.956413890363943e-06, + "loss": 0.77528477, + "num_input_tokens_seen": 16795265, + "step": 787, + "time_per_iteration": 2.734797954559326 + }, + { + "auxiliary_loss_clip": 0.01233069, + "auxiliary_loss_mlp": 0.01040861, + "balance_loss_clip": 1.06687641, + "balance_loss_mlp": 1.02934527, + "epoch": 0.09475139782360369, + "flos": 10123254869760.0, + "grad_norm": 1.9613012834700416, + "language_loss": 0.82224941, + "learning_rate": 3.956252002299312e-06, + "loss": 0.8449887, + "num_input_tokens_seen": 16811165, + "step": 788, + "time_per_iteration": 2.520799160003662 + }, + { + "auxiliary_loss_clip": 0.01250428, + "auxiliary_loss_mlp": 0.01032885, + "balance_loss_clip": 1.06852376, + "balance_loss_mlp": 1.02076781, + "epoch": 0.09487164071424277, + "flos": 17231088936960.0, + "grad_norm": 2.4889905694389407, + "language_loss": 0.90756106, + "learning_rate": 3.956089817472807e-06, + "loss": 0.93039417, + "num_input_tokens_seen": 16828470, + "step": 789, + "time_per_iteration": 2.477947235107422 + }, + { + "auxiliary_loss_clip": 0.01218534, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.06705117, + "balance_loss_mlp": 1.02735901, + "epoch": 0.09499188360488187, + "flos": 30849564528000.0, + "grad_norm": 3.5754611308378745, + "language_loss": 0.854056, + "learning_rate": 3.955927335909032e-06, + "loss": 0.8766278, + "num_input_tokens_seen": 16851680, + "step": 790, + "time_per_iteration": 2.6805055141448975 + }, + { + "auxiliary_loss_clip": 0.01188017, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.06602859, + "balance_loss_mlp": 1.03167486, + "epoch": 0.09511212649552095, + "flos": 29351694453120.0, + "grad_norm": 2.2423361345725246, + "language_loss": 0.75613439, + "learning_rate": 3.955764557632634e-06, + "loss": 0.77844727, + "num_input_tokens_seen": 16871490, + "step": 791, + "time_per_iteration": 2.805894136428833 + }, + { + "auxiliary_loss_clip": 0.01216081, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.06478167, + "balance_loss_mlp": 1.02773428, + "epoch": 0.09523236938616005, + "flos": 10378687461120.0, + "grad_norm": 2.5277447534627866, + "language_loss": 0.94500089, + "learning_rate": 3.955601482668309e-06, + "loss": 0.96756172, + "num_input_tokens_seen": 16889350, + "step": 792, + "time_per_iteration": 2.6129648685455322 + }, + { + "auxiliary_loss_clip": 0.01184638, + "auxiliary_loss_mlp": 0.01039332, + "balance_loss_clip": 1.05713415, + "balance_loss_mlp": 1.02569437, + "epoch": 0.09535261227679913, + "flos": 19061815368960.0, + "grad_norm": 2.11208830724053, + "language_loss": 0.88537276, + "learning_rate": 3.955438111040794e-06, + "loss": 0.90761244, + "num_input_tokens_seen": 16907625, + "step": 793, + "time_per_iteration": 2.6418042182922363 + }, + { + "auxiliary_loss_clip": 0.01181344, + "auxiliary_loss_mlp": 0.01045143, + "balance_loss_clip": 1.05783463, + "balance_loss_mlp": 1.03381264, + "epoch": 0.09547285516743823, + "flos": 20922993555840.0, + "grad_norm": 1.8521875212611885, + "language_loss": 0.80311835, + "learning_rate": 3.955274442774873e-06, + "loss": 0.82538325, + "num_input_tokens_seen": 16926205, + "step": 794, + "time_per_iteration": 3.458961009979248 + }, + { + "auxiliary_loss_clip": 0.0123539, + "auxiliary_loss_mlp": 0.01042397, + "balance_loss_clip": 1.06711102, + "balance_loss_mlp": 1.02995157, + "epoch": 0.09559309805807732, + "flos": 30154405639680.0, + "grad_norm": 2.1484158698403344, + "language_loss": 0.70436639, + "learning_rate": 3.9551104778953725e-06, + "loss": 0.72714424, + "num_input_tokens_seen": 16946500, + "step": 795, + "time_per_iteration": 3.2802071571350098 + }, + { + "auxiliary_loss_clip": 0.01204002, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.06148553, + "balance_loss_mlp": 1.02247226, + "epoch": 0.0957133409487164, + "flos": 21066743784960.0, + "grad_norm": 1.751888853735935, + "language_loss": 0.85512233, + "learning_rate": 3.954946216427167e-06, + "loss": 0.87750375, + "num_input_tokens_seen": 16966960, + "step": 796, + "time_per_iteration": 4.184223175048828 + }, + { + "auxiliary_loss_clip": 0.01097705, + "auxiliary_loss_mlp": 0.01015301, + "balance_loss_clip": 1.02569258, + "balance_loss_mlp": 1.01067543, + "epoch": 0.0958335838393555, + "flos": 71297979315840.0, + "grad_norm": 1.043455537692087, + "language_loss": 0.61547267, + "learning_rate": 3.954781658395176e-06, + "loss": 0.63660276, + "num_input_tokens_seen": 17023215, + "step": 797, + "time_per_iteration": 3.103978157043457 + }, + { + "auxiliary_loss_clip": 0.01225749, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.06647742, + "balance_loss_mlp": 1.02834713, + "epoch": 0.09595382672999458, + "flos": 21872974504320.0, + "grad_norm": 2.1448238675332574, + "language_loss": 0.92258275, + "learning_rate": 3.95461680382436e-06, + "loss": 0.94524777, + "num_input_tokens_seen": 17042140, + "step": 798, + "time_per_iteration": 2.5930378437042236 + }, + { + "auxiliary_loss_clip": 0.01241229, + "auxiliary_loss_mlp": 0.01042045, + "balance_loss_clip": 1.07103777, + "balance_loss_mlp": 1.02931356, + "epoch": 0.09607406962063368, + "flos": 18695562341760.0, + "grad_norm": 2.625548575508764, + "language_loss": 0.86233532, + "learning_rate": 3.9544516527397295e-06, + "loss": 0.88516808, + "num_input_tokens_seen": 17058490, + "step": 799, + "time_per_iteration": 2.5209574699401855 + }, + { + "auxiliary_loss_clip": 0.01205779, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.06416452, + "balance_loss_mlp": 1.02169561, + "epoch": 0.09619431251127276, + "flos": 22568456615040.0, + "grad_norm": 2.037867888772389, + "language_loss": 0.80602717, + "learning_rate": 3.954286205166338e-06, + "loss": 0.82841957, + "num_input_tokens_seen": 17079655, + "step": 800, + "time_per_iteration": 2.619941473007202 + }, + { + "auxiliary_loss_clip": 0.0124198, + "auxiliary_loss_mlp": 0.01040445, + "balance_loss_clip": 1.07351577, + "balance_loss_mlp": 1.0277437, + "epoch": 0.09631455540191186, + "flos": 14246230608000.0, + "grad_norm": 1.987252059219273, + "language_loss": 0.83819532, + "learning_rate": 3.954120461129282e-06, + "loss": 0.86101961, + "num_input_tokens_seen": 17097065, + "step": 801, + "time_per_iteration": 2.5039467811584473 + }, + { + "auxiliary_loss_clip": 0.01257025, + "auxiliary_loss_mlp": 0.01049418, + "balance_loss_clip": 1.07445312, + "balance_loss_mlp": 1.0374434, + "epoch": 0.09643479829255096, + "flos": 20740387789440.0, + "grad_norm": 2.158717564384931, + "language_loss": 0.83928901, + "learning_rate": 3.953954420653706e-06, + "loss": 0.86235344, + "num_input_tokens_seen": 17114090, + "step": 802, + "time_per_iteration": 2.4912257194519043 + }, + { + "auxiliary_loss_clip": 0.01233177, + "auxiliary_loss_mlp": 0.01040714, + "balance_loss_clip": 1.06845307, + "balance_loss_mlp": 1.02921653, + "epoch": 0.09655504118319004, + "flos": 24420476833920.0, + "grad_norm": 1.9094827960957284, + "language_loss": 0.88406926, + "learning_rate": 3.953788083764798e-06, + "loss": 0.90680814, + "num_input_tokens_seen": 17133325, + "step": 803, + "time_per_iteration": 2.5425305366516113 + }, + { + "auxiliary_loss_clip": 0.01188666, + "auxiliary_loss_mlp": 0.01049378, + "balance_loss_clip": 1.06193912, + "balance_loss_mlp": 1.03744483, + "epoch": 0.09667528407382914, + "flos": 18441961344000.0, + "grad_norm": 1.8920396608018544, + "language_loss": 0.91943133, + "learning_rate": 3.953621450487792e-06, + "loss": 0.9418118, + "num_input_tokens_seen": 17151945, + "step": 804, + "time_per_iteration": 2.6172900199890137 + }, + { + "auxiliary_loss_clip": 0.0113234, + "auxiliary_loss_mlp": 0.01006809, + "balance_loss_clip": 1.02213097, + "balance_loss_mlp": 1.0024941, + "epoch": 0.09679552696446822, + "flos": 70816455544320.0, + "grad_norm": 0.8560121161214167, + "language_loss": 0.61250603, + "learning_rate": 3.953454520847964e-06, + "loss": 0.63389754, + "num_input_tokens_seen": 17216790, + "step": 805, + "time_per_iteration": 3.2859981060028076 + }, + { + "auxiliary_loss_clip": 0.01216551, + "auxiliary_loss_mlp": 0.01045763, + "balance_loss_clip": 1.0649724, + "balance_loss_mlp": 1.0312252, + "epoch": 0.09691576985510732, + "flos": 21945514020480.0, + "grad_norm": 2.745606855863707, + "language_loss": 0.73961413, + "learning_rate": 3.9532872948706395e-06, + "loss": 0.76223731, + "num_input_tokens_seen": 17236285, + "step": 806, + "time_per_iteration": 2.6289658546447754 + }, + { + "auxiliary_loss_clip": 0.01222346, + "auxiliary_loss_mlp": 0.0104887, + "balance_loss_clip": 1.06678629, + "balance_loss_mlp": 1.03603733, + "epoch": 0.09703601274574641, + "flos": 17965211103360.0, + "grad_norm": 2.2985129823923782, + "language_loss": 0.82587349, + "learning_rate": 3.9531197725811845e-06, + "loss": 0.84858567, + "num_input_tokens_seen": 17251670, + "step": 807, + "time_per_iteration": 2.5764806270599365 + }, + { + "auxiliary_loss_clip": 0.01250957, + "auxiliary_loss_mlp": 0.01047415, + "balance_loss_clip": 1.07257998, + "balance_loss_mlp": 1.0351131, + "epoch": 0.0971562556363855, + "flos": 22162162901760.0, + "grad_norm": 1.8946313736482847, + "language_loss": 0.87765265, + "learning_rate": 3.952951954005013e-06, + "loss": 0.90063637, + "num_input_tokens_seen": 17271355, + "step": 808, + "time_per_iteration": 2.5147461891174316 + }, + { + "auxiliary_loss_clip": 0.01214121, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.05962515, + "balance_loss_mlp": 1.02374566, + "epoch": 0.0972764985270246, + "flos": 25848716394240.0, + "grad_norm": 2.181473469442269, + "language_loss": 0.84720832, + "learning_rate": 3.952783839167584e-06, + "loss": 0.86969805, + "num_input_tokens_seen": 17291400, + "step": 809, + "time_per_iteration": 2.6008763313293457 + }, + { + "auxiliary_loss_clip": 0.01233857, + "auxiliary_loss_mlp": 0.01050148, + "balance_loss_clip": 1.0668273, + "balance_loss_mlp": 1.03783417, + "epoch": 0.09739674141766368, + "flos": 20339373375360.0, + "grad_norm": 2.6480001370345616, + "language_loss": 0.74355948, + "learning_rate": 3.952615428094398e-06, + "loss": 0.7663995, + "num_input_tokens_seen": 17310920, + "step": 810, + "time_per_iteration": 2.5583999156951904 + }, + { + "auxiliary_loss_clip": 0.01179707, + "auxiliary_loss_mlp": 0.01042783, + "balance_loss_clip": 1.05722129, + "balance_loss_mlp": 1.03071284, + "epoch": 0.09751698430830277, + "flos": 15743059188480.0, + "grad_norm": 1.662049886234894, + "language_loss": 0.73639357, + "learning_rate": 3.952446720811004e-06, + "loss": 0.75861847, + "num_input_tokens_seen": 17329245, + "step": 811, + "time_per_iteration": 2.6206955909729004 + }, + { + "auxiliary_loss_clip": 0.01091271, + "auxiliary_loss_mlp": 0.01006574, + "balance_loss_clip": 1.01903558, + "balance_loss_mlp": 1.0020678, + "epoch": 0.09763722719894186, + "flos": 63716806800000.0, + "grad_norm": 0.866989383649995, + "language_loss": 0.63636887, + "learning_rate": 3.952277717342995e-06, + "loss": 0.65734732, + "num_input_tokens_seen": 17395680, + "step": 812, + "time_per_iteration": 3.269688367843628 + }, + { + "auxiliary_loss_clip": 0.01222663, + "auxiliary_loss_mlp": 0.01046048, + "balance_loss_clip": 1.0659492, + "balance_loss_mlp": 1.03375769, + "epoch": 0.09775747008958095, + "flos": 22090916275200.0, + "grad_norm": 2.3231239091977716, + "language_loss": 0.85610962, + "learning_rate": 3.952108417716009e-06, + "loss": 0.8787967, + "num_input_tokens_seen": 17415135, + "step": 813, + "time_per_iteration": 2.5688862800598145 + }, + { + "auxiliary_loss_clip": 0.01237455, + "auxiliary_loss_mlp": 0.01038248, + "balance_loss_clip": 1.07102036, + "balance_loss_mlp": 1.02589238, + "epoch": 0.09787771298022005, + "flos": 21286050272640.0, + "grad_norm": 2.0426173596828785, + "language_loss": 0.85128343, + "learning_rate": 3.951938821955727e-06, + "loss": 0.87404048, + "num_input_tokens_seen": 17434535, + "step": 814, + "time_per_iteration": 2.5478286743164062 + }, + { + "auxiliary_loss_clip": 0.0121911, + "auxiliary_loss_mlp": 0.01048725, + "balance_loss_clip": 1.06782997, + "balance_loss_mlp": 1.03530788, + "epoch": 0.09799795587085913, + "flos": 22054574689920.0, + "grad_norm": 1.6040175122193243, + "language_loss": 0.76598686, + "learning_rate": 3.9517689300878786e-06, + "loss": 0.78866524, + "num_input_tokens_seen": 17454270, + "step": 815, + "time_per_iteration": 2.5704588890075684 + }, + { + "auxiliary_loss_clip": 0.01248235, + "auxiliary_loss_mlp": 0.01045035, + "balance_loss_clip": 1.06708574, + "balance_loss_mlp": 1.03263688, + "epoch": 0.09811819876149823, + "flos": 22163743100160.0, + "grad_norm": 1.9686863874398621, + "language_loss": 0.78712517, + "learning_rate": 3.951598742138236e-06, + "loss": 0.81005788, + "num_input_tokens_seen": 17472995, + "step": 816, + "time_per_iteration": 2.547961473464966 + }, + { + "auxiliary_loss_clip": 0.01220068, + "auxiliary_loss_mlp": 0.01041197, + "balance_loss_clip": 1.05931056, + "balance_loss_mlp": 1.0293653, + "epoch": 0.09823844165213731, + "flos": 22231111057920.0, + "grad_norm": 2.056434800186804, + "language_loss": 0.80068362, + "learning_rate": 3.951428258132615e-06, + "loss": 0.82329625, + "num_input_tokens_seen": 17491115, + "step": 817, + "time_per_iteration": 2.5729565620422363 + }, + { + "auxiliary_loss_clip": 0.01220572, + "auxiliary_loss_mlp": 0.01039989, + "balance_loss_clip": 1.06710804, + "balance_loss_mlp": 1.02782965, + "epoch": 0.09835868454277641, + "flos": 22487728798080.0, + "grad_norm": 2.1250495623144636, + "language_loss": 0.84772849, + "learning_rate": 3.951257478096879e-06, + "loss": 0.87033409, + "num_input_tokens_seen": 17509480, + "step": 818, + "time_per_iteration": 2.6155011653900146 + }, + { + "auxiliary_loss_clip": 0.01223267, + "auxiliary_loss_mlp": 0.00766923, + "balance_loss_clip": 1.06884956, + "balance_loss_mlp": 1.00047767, + "epoch": 0.0984789274334155, + "flos": 16362554077440.0, + "grad_norm": 2.58954467149033, + "language_loss": 0.68580061, + "learning_rate": 3.951086402056936e-06, + "loss": 0.70570254, + "num_input_tokens_seen": 17524080, + "step": 819, + "time_per_iteration": 2.5347323417663574 + }, + { + "auxiliary_loss_clip": 0.01152041, + "auxiliary_loss_mlp": 0.00766332, + "balance_loss_clip": 1.06044173, + "balance_loss_mlp": 1.00057626, + "epoch": 0.09859917032405459, + "flos": 24243545416320.0, + "grad_norm": 1.6130678718603515, + "language_loss": 0.83779049, + "learning_rate": 3.950915030038735e-06, + "loss": 0.85697424, + "num_input_tokens_seen": 17543875, + "step": 820, + "time_per_iteration": 3.0314464569091797 + }, + { + "auxiliary_loss_clip": 0.01232972, + "auxiliary_loss_mlp": 0.01040051, + "balance_loss_clip": 1.0690906, + "balance_loss_mlp": 1.02820146, + "epoch": 0.09871941321469369, + "flos": 17420195064960.0, + "grad_norm": 2.1895587298853814, + "language_loss": 0.83750367, + "learning_rate": 3.9507433620682765e-06, + "loss": 0.8602339, + "num_input_tokens_seen": 17560810, + "step": 821, + "time_per_iteration": 4.323476552963257 + }, + { + "auxiliary_loss_clip": 0.0120202, + "auxiliary_loss_mlp": 0.01040168, + "balance_loss_clip": 1.06205177, + "balance_loss_mlp": 1.02788901, + "epoch": 0.09883965610533277, + "flos": 28477341590400.0, + "grad_norm": 1.9703173077232572, + "language_loss": 0.88010406, + "learning_rate": 3.9505713981716e-06, + "loss": 0.9025259, + "num_input_tokens_seen": 17583640, + "step": 822, + "time_per_iteration": 2.723100185394287 + }, + { + "auxiliary_loss_clip": 0.01217141, + "auxiliary_loss_mlp": 0.01041525, + "balance_loss_clip": 1.06774211, + "balance_loss_mlp": 1.03029525, + "epoch": 0.09895989899597187, + "flos": 23693932437120.0, + "grad_norm": 2.094724305878368, + "language_loss": 0.80917007, + "learning_rate": 3.950399138374795e-06, + "loss": 0.83175671, + "num_input_tokens_seen": 17602720, + "step": 823, + "time_per_iteration": 4.2245564460754395 + }, + { + "auxiliary_loss_clip": 0.01233647, + "auxiliary_loss_mlp": 0.01048655, + "balance_loss_clip": 1.06643486, + "balance_loss_mlp": 1.03561902, + "epoch": 0.09908014188661095, + "flos": 24679608526080.0, + "grad_norm": 1.726291384423809, + "language_loss": 0.74182308, + "learning_rate": 3.95022658270399e-06, + "loss": 0.76464605, + "num_input_tokens_seen": 17623085, + "step": 824, + "time_per_iteration": 2.733344078063965 + }, + { + "auxiliary_loss_clip": 0.01218245, + "auxiliary_loss_mlp": 0.01041906, + "balance_loss_clip": 1.06836236, + "balance_loss_mlp": 1.03024757, + "epoch": 0.09920038477725004, + "flos": 14064307200000.0, + "grad_norm": 2.2325463561183105, + "language_loss": 0.78133821, + "learning_rate": 3.9500537311853635e-06, + "loss": 0.80393964, + "num_input_tokens_seen": 17641040, + "step": 825, + "time_per_iteration": 2.604297161102295 + }, + { + "auxiliary_loss_clip": 0.01232331, + "auxiliary_loss_mlp": 0.01040491, + "balance_loss_clip": 1.06342971, + "balance_loss_mlp": 1.02736568, + "epoch": 0.09932062766788914, + "flos": 13407070095360.0, + "grad_norm": 6.202923520042933, + "language_loss": 0.83450109, + "learning_rate": 3.949880583845136e-06, + "loss": 0.85722929, + "num_input_tokens_seen": 17659115, + "step": 826, + "time_per_iteration": 2.5487418174743652 + }, + { + "auxiliary_loss_clip": 0.01216699, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.0641526, + "balance_loss_mlp": 1.02412653, + "epoch": 0.09944087055852822, + "flos": 19500751566720.0, + "grad_norm": 1.7238259042339856, + "language_loss": 0.80951703, + "learning_rate": 3.949707140709575e-06, + "loss": 0.83204657, + "num_input_tokens_seen": 17678845, + "step": 827, + "time_per_iteration": 2.648170232772827 + }, + { + "auxiliary_loss_clip": 0.01234301, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.06372094, + "balance_loss_mlp": 1.02771854, + "epoch": 0.09956111344916732, + "flos": 17749100926080.0, + "grad_norm": 2.1725824632685855, + "language_loss": 0.83733821, + "learning_rate": 3.949533401804991e-06, + "loss": 0.86008918, + "num_input_tokens_seen": 17695750, + "step": 828, + "time_per_iteration": 2.6375529766082764 + }, + { + "auxiliary_loss_clip": 0.01233757, + "auxiliary_loss_mlp": 0.00766981, + "balance_loss_clip": 1.06815481, + "balance_loss_mlp": 1.00049829, + "epoch": 0.0996813563398064, + "flos": 17967581400960.0, + "grad_norm": 2.896347376383928, + "language_loss": 0.90642709, + "learning_rate": 3.949359367157739e-06, + "loss": 0.92643452, + "num_input_tokens_seen": 17714445, + "step": 829, + "time_per_iteration": 2.5230071544647217 + }, + { + "auxiliary_loss_clip": 0.01239032, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.06852829, + "balance_loss_mlp": 1.0327785, + "epoch": 0.0998015992304455, + "flos": 17457039440640.0, + "grad_norm": 1.9299140150184406, + "language_loss": 0.75379717, + "learning_rate": 3.949185036794222e-06, + "loss": 0.77664632, + "num_input_tokens_seen": 17732455, + "step": 830, + "time_per_iteration": 2.586301326751709 + }, + { + "auxiliary_loss_clip": 0.0124868, + "auxiliary_loss_mlp": 0.01043373, + "balance_loss_clip": 1.06947541, + "balance_loss_mlp": 1.03192306, + "epoch": 0.0999218421210846, + "flos": 25888757080320.0, + "grad_norm": 1.6450499525737834, + "language_loss": 0.78710586, + "learning_rate": 3.949010410740884e-06, + "loss": 0.81002641, + "num_input_tokens_seen": 17755280, + "step": 831, + "time_per_iteration": 2.5553503036499023 + }, + { + "auxiliary_loss_clip": 0.01209549, + "auxiliary_loss_mlp": 0.00766433, + "balance_loss_clip": 1.06246579, + "balance_loss_mlp": 1.00039613, + "epoch": 0.10004208501172368, + "flos": 21215916967680.0, + "grad_norm": 1.6487171161933056, + "language_loss": 0.86424005, + "learning_rate": 3.948835489024216e-06, + "loss": 0.88399982, + "num_input_tokens_seen": 17775015, + "step": 832, + "time_per_iteration": 2.5942766666412354 + }, + { + "auxiliary_loss_clip": 0.01236645, + "auxiliary_loss_mlp": 0.01041362, + "balance_loss_clip": 1.06694722, + "balance_loss_mlp": 1.02902985, + "epoch": 0.10016232790236278, + "flos": 17348409734400.0, + "grad_norm": 2.2027483556178336, + "language_loss": 0.90658712, + "learning_rate": 3.948660271670755e-06, + "loss": 0.92936718, + "num_input_tokens_seen": 17792165, + "step": 833, + "time_per_iteration": 2.516378879547119 + }, + { + "auxiliary_loss_clip": 0.01214437, + "auxiliary_loss_mlp": 0.01040804, + "balance_loss_clip": 1.06405771, + "balance_loss_mlp": 1.02912724, + "epoch": 0.10028257079300186, + "flos": 25666541591040.0, + "grad_norm": 2.2327335683673706, + "language_loss": 0.83782685, + "learning_rate": 3.948484758707079e-06, + "loss": 0.86037922, + "num_input_tokens_seen": 17811765, + "step": 834, + "time_per_iteration": 2.6521735191345215 + }, + { + "auxiliary_loss_clip": 0.01192811, + "auxiliary_loss_mlp": 0.0104037, + "balance_loss_clip": 1.05869985, + "balance_loss_mlp": 1.02750111, + "epoch": 0.10040281368364096, + "flos": 25156035544320.0, + "grad_norm": 2.0375519850253565, + "language_loss": 0.83572602, + "learning_rate": 3.948308950159815e-06, + "loss": 0.8580578, + "num_input_tokens_seen": 17830445, + "step": 835, + "time_per_iteration": 2.625239849090576 + }, + { + "auxiliary_loss_clip": 0.01196617, + "auxiliary_loss_mlp": 0.01045953, + "balance_loss_clip": 1.05860639, + "balance_loss_mlp": 1.03212476, + "epoch": 0.10052305657428004, + "flos": 17603303621760.0, + "grad_norm": 3.2033163156772235, + "language_loss": 0.75790626, + "learning_rate": 3.9481328460556326e-06, + "loss": 0.78033197, + "num_input_tokens_seen": 17847665, + "step": 836, + "time_per_iteration": 2.573728561401367 + }, + { + "auxiliary_loss_clip": 0.01208961, + "auxiliary_loss_mlp": 0.01038309, + "balance_loss_clip": 1.06185448, + "balance_loss_mlp": 1.02616191, + "epoch": 0.10064329946491914, + "flos": 18660154510080.0, + "grad_norm": 1.9499357192424849, + "language_loss": 0.89519417, + "learning_rate": 3.9479564464212455e-06, + "loss": 0.91766691, + "num_input_tokens_seen": 17866825, + "step": 837, + "time_per_iteration": 2.542353630065918 + }, + { + "auxiliary_loss_clip": 0.01254383, + "auxiliary_loss_mlp": 0.0104063, + "balance_loss_clip": 1.06939816, + "balance_loss_mlp": 1.02862549, + "epoch": 0.10076354235555823, + "flos": 17199056983680.0, + "grad_norm": 2.3194186221426243, + "language_loss": 0.76208752, + "learning_rate": 3.947779751283414e-06, + "loss": 0.78503764, + "num_input_tokens_seen": 17883995, + "step": 838, + "time_per_iteration": 2.461181640625 + }, + { + "auxiliary_loss_clip": 0.01236945, + "auxiliary_loss_mlp": 0.00767306, + "balance_loss_clip": 1.07302868, + "balance_loss_mlp": 1.00035167, + "epoch": 0.10088378524619732, + "flos": 22962252395520.0, + "grad_norm": 1.81270753187586, + "language_loss": 0.76041019, + "learning_rate": 3.947602760668944e-06, + "loss": 0.78045273, + "num_input_tokens_seen": 17903785, + "step": 839, + "time_per_iteration": 2.5881736278533936 + }, + { + "auxiliary_loss_clip": 0.01234257, + "auxiliary_loss_mlp": 0.01046328, + "balance_loss_clip": 1.07080507, + "balance_loss_mlp": 1.03336382, + "epoch": 0.10100402813683641, + "flos": 37885828746240.0, + "grad_norm": 1.828716741790657, + "language_loss": 0.7145431, + "learning_rate": 3.947425474604684e-06, + "loss": 0.73734891, + "num_input_tokens_seen": 17927720, + "step": 840, + "time_per_iteration": 2.668940305709839 + }, + { + "auxiliary_loss_clip": 0.01215727, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_clip": 1.0638833, + "balance_loss_mlp": 1.03671575, + "epoch": 0.1011242710274755, + "flos": 21543458112000.0, + "grad_norm": 1.9320913072199315, + "language_loss": 0.92268705, + "learning_rate": 3.947247893117528e-06, + "loss": 0.94533396, + "num_input_tokens_seen": 17946225, + "step": 841, + "time_per_iteration": 2.589635133743286 + }, + { + "auxiliary_loss_clip": 0.01228495, + "auxiliary_loss_mlp": 0.01049871, + "balance_loss_clip": 1.06425011, + "balance_loss_mlp": 1.03740728, + "epoch": 0.10124451391811459, + "flos": 13621456419840.0, + "grad_norm": 3.146387800368606, + "language_loss": 0.69823033, + "learning_rate": 3.947070016234413e-06, + "loss": 0.72101396, + "num_input_tokens_seen": 17962015, + "step": 842, + "time_per_iteration": 2.482999563217163 + }, + { + "auxiliary_loss_clip": 0.01228636, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_clip": 1.06934345, + "balance_loss_mlp": 1.02932835, + "epoch": 0.10136475680875369, + "flos": 16649228522880.0, + "grad_norm": 2.3831317146790845, + "language_loss": 0.7445336, + "learning_rate": 3.946891843982326e-06, + "loss": 0.76724529, + "num_input_tokens_seen": 17979680, + "step": 843, + "time_per_iteration": 2.5367839336395264 + }, + { + "auxiliary_loss_clip": 0.01234247, + "auxiliary_loss_mlp": 0.01042693, + "balance_loss_clip": 1.06872821, + "balance_loss_mlp": 1.03000939, + "epoch": 0.10148499969939277, + "flos": 19461034103040.0, + "grad_norm": 2.5785518212365295, + "language_loss": 0.74559534, + "learning_rate": 3.9467133763882935e-06, + "loss": 0.76836467, + "num_input_tokens_seen": 17998145, + "step": 844, + "time_per_iteration": 2.4988763332366943 + }, + { + "auxiliary_loss_clip": 0.01222586, + "auxiliary_loss_mlp": 0.01043499, + "balance_loss_clip": 1.06560552, + "balance_loss_mlp": 1.03074408, + "epoch": 0.10160524259003187, + "flos": 21104988791040.0, + "grad_norm": 1.9213692221373082, + "language_loss": 0.86357558, + "learning_rate": 3.9465346134793905e-06, + "loss": 0.88623643, + "num_input_tokens_seen": 18017955, + "step": 845, + "time_per_iteration": 2.5335724353790283 + }, + { + "auxiliary_loss_clip": 0.01202609, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.06500959, + "balance_loss_mlp": 1.02902722, + "epoch": 0.10172548548067095, + "flos": 17712687513600.0, + "grad_norm": 2.1625420854176403, + "language_loss": 0.79372048, + "learning_rate": 3.9463555552827335e-06, + "loss": 0.81615186, + "num_input_tokens_seen": 18035125, + "step": 846, + "time_per_iteration": 2.6125457286834717 + }, + { + "auxiliary_loss_clip": 0.0122046, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_clip": 1.06376243, + "balance_loss_mlp": 1.03532577, + "epoch": 0.10184572837131005, + "flos": 21104845136640.0, + "grad_norm": 2.786357293765913, + "language_loss": 0.86559433, + "learning_rate": 3.946176201825487e-06, + "loss": 0.88827276, + "num_input_tokens_seen": 18053160, + "step": 847, + "time_per_iteration": 3.294499635696411 + }, + { + "auxiliary_loss_clip": 0.0121938, + "auxiliary_loss_mlp": 0.01045845, + "balance_loss_clip": 1.06780005, + "balance_loss_mlp": 1.03363168, + "epoch": 0.10196597126194913, + "flos": 26067591918720.0, + "grad_norm": 1.838448177399513, + "language_loss": 0.8337425, + "learning_rate": 3.9459965531348575e-06, + "loss": 0.85639477, + "num_input_tokens_seen": 18072815, + "step": 848, + "time_per_iteration": 3.4428482055664062 + }, + { + "auxiliary_loss_clip": 0.01217897, + "auxiliary_loss_mlp": 0.00766771, + "balance_loss_clip": 1.06582034, + "balance_loss_mlp": 1.00060952, + "epoch": 0.10208621415258823, + "flos": 29314634595840.0, + "grad_norm": 2.327723543085567, + "language_loss": 0.86028624, + "learning_rate": 3.945816609238098e-06, + "loss": 0.88013291, + "num_input_tokens_seen": 18092225, + "step": 849, + "time_per_iteration": 4.28177809715271 + }, + { + "auxiliary_loss_clip": 0.01179999, + "auxiliary_loss_mlp": 0.01044646, + "balance_loss_clip": 1.06286883, + "balance_loss_mlp": 1.03231406, + "epoch": 0.10220645704322733, + "flos": 23805794367360.0, + "grad_norm": 1.7641124833085795, + "language_loss": 0.85317433, + "learning_rate": 3.945636370162507e-06, + "loss": 0.87542081, + "num_input_tokens_seen": 18112335, + "step": 850, + "time_per_iteration": 2.672788619995117 + }, + { + "auxiliary_loss_clip": 0.01230503, + "auxiliary_loss_mlp": 0.01046976, + "balance_loss_clip": 1.06743395, + "balance_loss_mlp": 1.03539467, + "epoch": 0.10232669993386641, + "flos": 23218546913280.0, + "grad_norm": 1.680173808653884, + "language_loss": 0.78901947, + "learning_rate": 3.945455835935425e-06, + "loss": 0.81179428, + "num_input_tokens_seen": 18131520, + "step": 851, + "time_per_iteration": 2.5438425540924072 + }, + { + "auxiliary_loss_clip": 0.01217433, + "auxiliary_loss_mlp": 0.01046708, + "balance_loss_clip": 1.06411123, + "balance_loss_mlp": 1.03497195, + "epoch": 0.1024469428245055, + "flos": 22922929981440.0, + "grad_norm": 2.0256263103329744, + "language_loss": 0.75105548, + "learning_rate": 3.94527500658424e-06, + "loss": 0.7736969, + "num_input_tokens_seen": 18149185, + "step": 852, + "time_per_iteration": 2.590784788131714 + }, + { + "auxiliary_loss_clip": 0.01187726, + "auxiliary_loss_mlp": 0.01041579, + "balance_loss_clip": 1.06331968, + "balance_loss_mlp": 1.0299499, + "epoch": 0.10256718571514459, + "flos": 31359495957120.0, + "grad_norm": 2.320799585131205, + "language_loss": 0.81047755, + "learning_rate": 3.945093882136382e-06, + "loss": 0.83277059, + "num_input_tokens_seen": 18172960, + "step": 853, + "time_per_iteration": 2.7449111938476562 + }, + { + "auxiliary_loss_clip": 0.0121421, + "auxiliary_loss_mlp": 0.00765368, + "balance_loss_clip": 1.06659508, + "balance_loss_mlp": 1.00053501, + "epoch": 0.10268742860578368, + "flos": 23474877344640.0, + "grad_norm": 2.0992575748363684, + "language_loss": 0.84571511, + "learning_rate": 3.944912462619329e-06, + "loss": 0.86551088, + "num_input_tokens_seen": 18191925, + "step": 854, + "time_per_iteration": 2.5720298290252686 + }, + { + "auxiliary_loss_clip": 0.01219448, + "auxiliary_loss_mlp": 0.01046146, + "balance_loss_clip": 1.06466556, + "balance_loss_mlp": 1.03274715, + "epoch": 0.10280767149642277, + "flos": 25520313323520.0, + "grad_norm": 1.8539797956474198, + "language_loss": 0.80976975, + "learning_rate": 3.9447307480606025e-06, + "loss": 0.83242571, + "num_input_tokens_seen": 18212010, + "step": 855, + "time_per_iteration": 2.5866432189941406 + }, + { + "auxiliary_loss_clip": 0.01211257, + "auxiliary_loss_mlp": 0.01044826, + "balance_loss_clip": 1.06413543, + "balance_loss_mlp": 1.03186774, + "epoch": 0.10292791438706186, + "flos": 17347691462400.0, + "grad_norm": 1.9278357693770747, + "language_loss": 0.90202832, + "learning_rate": 3.944548738487767e-06, + "loss": 0.92458916, + "num_input_tokens_seen": 18229525, + "step": 856, + "time_per_iteration": 2.535675048828125 + }, + { + "auxiliary_loss_clip": 0.01255305, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.07315385, + "balance_loss_mlp": 1.03060937, + "epoch": 0.10304815727770096, + "flos": 27052693390080.0, + "grad_norm": 2.6168574174165804, + "language_loss": 0.90706313, + "learning_rate": 3.944366433928434e-06, + "loss": 0.93003726, + "num_input_tokens_seen": 18249505, + "step": 857, + "time_per_iteration": 2.5394139289855957 + }, + { + "auxiliary_loss_clip": 0.01210405, + "auxiliary_loss_mlp": 0.01043504, + "balance_loss_clip": 1.06148088, + "balance_loss_mlp": 1.03136861, + "epoch": 0.10316840016834004, + "flos": 22782591544320.0, + "grad_norm": 1.4583354935052693, + "language_loss": 0.83477235, + "learning_rate": 3.9441838344102594e-06, + "loss": 0.85731143, + "num_input_tokens_seen": 18269230, + "step": 858, + "time_per_iteration": 2.5818631649017334 + }, + { + "auxiliary_loss_clip": 0.01223255, + "auxiliary_loss_mlp": 0.01041664, + "balance_loss_clip": 1.06766701, + "balance_loss_mlp": 1.02996337, + "epoch": 0.10328864305897914, + "flos": 20704584908160.0, + "grad_norm": 2.8417679349850156, + "language_loss": 0.67158765, + "learning_rate": 3.944000939960943e-06, + "loss": 0.69423676, + "num_input_tokens_seen": 18287955, + "step": 859, + "time_per_iteration": 2.5712931156158447 + }, + { + "auxiliary_loss_clip": 0.01234923, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.06502998, + "balance_loss_mlp": 1.02883863, + "epoch": 0.10340888594961822, + "flos": 28478814048000.0, + "grad_norm": 1.6018506392566247, + "language_loss": 0.79961228, + "learning_rate": 3.943817750608229e-06, + "loss": 0.82236242, + "num_input_tokens_seen": 18310505, + "step": 860, + "time_per_iteration": 2.6067240238189697 + }, + { + "auxiliary_loss_clip": 0.0123711, + "auxiliary_loss_mlp": 0.0103624, + "balance_loss_clip": 1.07013845, + "balance_loss_mlp": 1.02498674, + "epoch": 0.10352912884025732, + "flos": 13370333460480.0, + "grad_norm": 6.098781885593588, + "language_loss": 0.82040596, + "learning_rate": 3.943634266379908e-06, + "loss": 0.84313941, + "num_input_tokens_seen": 18327400, + "step": 861, + "time_per_iteration": 2.4991538524627686 + }, + { + "auxiliary_loss_clip": 0.01234318, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.06568325, + "balance_loss_mlp": 1.02628326, + "epoch": 0.10364937173089642, + "flos": 25558558329600.0, + "grad_norm": 1.9103511504565425, + "language_loss": 0.84885699, + "learning_rate": 3.943450487303815e-06, + "loss": 0.87157905, + "num_input_tokens_seen": 18347895, + "step": 862, + "time_per_iteration": 2.6033434867858887 + }, + { + "auxiliary_loss_clip": 0.01230767, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_clip": 1.0675416, + "balance_loss_mlp": 1.02514505, + "epoch": 0.1037696146215355, + "flos": 21215486004480.0, + "grad_norm": 1.7345335874492325, + "language_loss": 0.8581754, + "learning_rate": 3.943266413407827e-06, + "loss": 0.8808527, + "num_input_tokens_seen": 18367170, + "step": 863, + "time_per_iteration": 2.5189144611358643 + }, + { + "auxiliary_loss_clip": 0.01235176, + "auxiliary_loss_mlp": 0.01039885, + "balance_loss_clip": 1.06821549, + "balance_loss_mlp": 1.02763009, + "epoch": 0.1038898575121746, + "flos": 25807382818560.0, + "grad_norm": 1.8273700818729341, + "language_loss": 0.84795642, + "learning_rate": 3.94308204471987e-06, + "loss": 0.87070704, + "num_input_tokens_seen": 18386185, + "step": 864, + "time_per_iteration": 2.548856019973755 + }, + { + "auxiliary_loss_clip": 0.01203318, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.06267774, + "balance_loss_mlp": 1.02370524, + "epoch": 0.10401010040281368, + "flos": 19062425900160.0, + "grad_norm": 2.2821449157478852, + "language_loss": 0.75035113, + "learning_rate": 3.942897381267912e-06, + "loss": 0.7727443, + "num_input_tokens_seen": 18402550, + "step": 865, + "time_per_iteration": 2.577869176864624 + }, + { + "auxiliary_loss_clip": 0.01237717, + "auxiliary_loss_mlp": 0.01037513, + "balance_loss_clip": 1.06897235, + "balance_loss_mlp": 1.02587211, + "epoch": 0.10413034329345278, + "flos": 16355119962240.0, + "grad_norm": 2.337327302317508, + "language_loss": 0.66133368, + "learning_rate": 3.942712423079965e-06, + "loss": 0.68408602, + "num_input_tokens_seen": 18418940, + "step": 866, + "time_per_iteration": 2.4838130474090576 + }, + { + "auxiliary_loss_clip": 0.01182365, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.05389929, + "balance_loss_mlp": 1.02735972, + "epoch": 0.10425058618409186, + "flos": 17236511890560.0, + "grad_norm": 2.226723933930378, + "language_loss": 0.90111232, + "learning_rate": 3.942527170184088e-06, + "loss": 0.92331278, + "num_input_tokens_seen": 18435560, + "step": 867, + "time_per_iteration": 2.577340841293335 + }, + { + "auxiliary_loss_clip": 0.01249542, + "auxiliary_loss_mlp": 0.01043536, + "balance_loss_clip": 1.07011175, + "balance_loss_mlp": 1.03148949, + "epoch": 0.10437082907473096, + "flos": 17967365919360.0, + "grad_norm": 2.741930552202432, + "language_loss": 0.77555239, + "learning_rate": 3.942341622608385e-06, + "loss": 0.79848325, + "num_input_tokens_seen": 18452590, + "step": 868, + "time_per_iteration": 2.497769832611084 + }, + { + "auxiliary_loss_clip": 0.01220772, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_clip": 1.06989682, + "balance_loss_mlp": 1.03093922, + "epoch": 0.10449107196537005, + "flos": 36283315374720.0, + "grad_norm": 1.5602295245729714, + "language_loss": 0.77826494, + "learning_rate": 3.942155780381001e-06, + "loss": 0.80089909, + "num_input_tokens_seen": 18476325, + "step": 869, + "time_per_iteration": 2.6921284198760986 + }, + { + "auxiliary_loss_clip": 0.01218625, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.06372213, + "balance_loss_mlp": 1.02944434, + "epoch": 0.10461131485600914, + "flos": 23802095266560.0, + "grad_norm": 5.696177323381225, + "language_loss": 0.76074743, + "learning_rate": 3.94196964353013e-06, + "loss": 0.78334987, + "num_input_tokens_seen": 18495775, + "step": 870, + "time_per_iteration": 2.6000494956970215 + }, + { + "auxiliary_loss_clip": 0.0121267, + "auxiliary_loss_mlp": 0.007657, + "balance_loss_clip": 1.0617739, + "balance_loss_mlp": 1.0006566, + "epoch": 0.10473155774664823, + "flos": 18405476104320.0, + "grad_norm": 1.8979359831931355, + "language_loss": 0.8077811, + "learning_rate": 3.941783212084008e-06, + "loss": 0.82756478, + "num_input_tokens_seen": 18513530, + "step": 871, + "time_per_iteration": 2.5612242221832275 + }, + { + "auxiliary_loss_clip": 0.01202145, + "auxiliary_loss_mlp": 0.0104362, + "balance_loss_clip": 1.06394315, + "balance_loss_mlp": 1.03139472, + "epoch": 0.10485180063728732, + "flos": 25592637358080.0, + "grad_norm": 2.4871196560135664, + "language_loss": 0.7871505, + "learning_rate": 3.941596486070916e-06, + "loss": 0.8096081, + "num_input_tokens_seen": 18531575, + "step": 872, + "time_per_iteration": 2.619663953781128 + }, + { + "auxiliary_loss_clip": 0.01181727, + "auxiliary_loss_mlp": 0.01035242, + "balance_loss_clip": 1.06187224, + "balance_loss_mlp": 1.02239108, + "epoch": 0.10497204352792641, + "flos": 27088747666560.0, + "grad_norm": 3.1576578296201885, + "language_loss": 0.5921123, + "learning_rate": 3.941409465519182e-06, + "loss": 0.61428201, + "num_input_tokens_seen": 18552100, + "step": 873, + "time_per_iteration": 2.7454960346221924 + }, + { + "auxiliary_loss_clip": 0.01222224, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_clip": 1.06283784, + "balance_loss_mlp": 1.03130531, + "epoch": 0.10509228641856551, + "flos": 32858479353600.0, + "grad_norm": 1.7446605499271863, + "language_loss": 0.85294521, + "learning_rate": 3.941222150457176e-06, + "loss": 0.87560987, + "num_input_tokens_seen": 18575355, + "step": 874, + "time_per_iteration": 3.3950982093811035 + }, + { + "auxiliary_loss_clip": 0.01235029, + "auxiliary_loss_mlp": 0.01037702, + "balance_loss_clip": 1.06584203, + "balance_loss_mlp": 1.0256083, + "epoch": 0.10521252930920459, + "flos": 14319165173760.0, + "grad_norm": 20.774339608696174, + "language_loss": 0.71904916, + "learning_rate": 3.941034540913311e-06, + "loss": 0.74177647, + "num_input_tokens_seen": 18592885, + "step": 875, + "time_per_iteration": 3.4726932048797607 + }, + { + "auxiliary_loss_clip": 0.01233965, + "auxiliary_loss_mlp": 0.00766339, + "balance_loss_clip": 1.06971025, + "balance_loss_mlp": 1.00055301, + "epoch": 0.10533277219984369, + "flos": 21687028773120.0, + "grad_norm": 1.6457120243152337, + "language_loss": 0.82627094, + "learning_rate": 3.940846636916051e-06, + "loss": 0.8462739, + "num_input_tokens_seen": 18612920, + "step": 876, + "time_per_iteration": 4.157577276229858 + }, + { + "auxiliary_loss_clip": 0.01214884, + "auxiliary_loss_mlp": 0.01041657, + "balance_loss_clip": 1.0681076, + "balance_loss_mlp": 1.02910423, + "epoch": 0.10545301509048277, + "flos": 22269787027200.0, + "grad_norm": 1.955388234737653, + "language_loss": 0.86795479, + "learning_rate": 3.940658438493899e-06, + "loss": 0.89052027, + "num_input_tokens_seen": 18630765, + "step": 877, + "time_per_iteration": 2.554295539855957 + }, + { + "auxiliary_loss_clip": 0.01249511, + "auxiliary_loss_mlp": 0.01043171, + "balance_loss_clip": 1.0646199, + "balance_loss_mlp": 1.03019547, + "epoch": 0.10557325798112187, + "flos": 22199725549440.0, + "grad_norm": 2.183281152458787, + "language_loss": 0.76064271, + "learning_rate": 3.940469945675405e-06, + "loss": 0.78356957, + "num_input_tokens_seen": 18649150, + "step": 878, + "time_per_iteration": 2.472249984741211 + }, + { + "auxiliary_loss_clip": 0.01164619, + "auxiliary_loss_mlp": 0.01045432, + "balance_loss_clip": 1.05782974, + "balance_loss_mlp": 1.03381503, + "epoch": 0.10569350087176095, + "flos": 25775889569280.0, + "grad_norm": 1.9937433771615651, + "language_loss": 0.9149425, + "learning_rate": 3.940281158489163e-06, + "loss": 0.93704307, + "num_input_tokens_seen": 18668380, + "step": 879, + "time_per_iteration": 2.6702818870544434 + }, + { + "auxiliary_loss_clip": 0.01165015, + "auxiliary_loss_mlp": 0.01043014, + "balance_loss_clip": 1.05501246, + "balance_loss_mlp": 1.03153992, + "epoch": 0.10581374376240005, + "flos": 17311385790720.0, + "grad_norm": 1.713106144795998, + "language_loss": 0.82694852, + "learning_rate": 3.940092076963812e-06, + "loss": 0.84902883, + "num_input_tokens_seen": 18685875, + "step": 880, + "time_per_iteration": 2.665902853012085 + }, + { + "auxiliary_loss_clip": 0.01212001, + "auxiliary_loss_mlp": 0.01045204, + "balance_loss_clip": 1.06160581, + "balance_loss_mlp": 1.03305042, + "epoch": 0.10593398665303914, + "flos": 34349454017280.0, + "grad_norm": 2.288447147084918, + "language_loss": 0.78771412, + "learning_rate": 3.9399027011280355e-06, + "loss": 0.81028622, + "num_input_tokens_seen": 18707970, + "step": 881, + "time_per_iteration": 2.676102638244629 + }, + { + "auxiliary_loss_clip": 0.01215846, + "auxiliary_loss_mlp": 0.01037767, + "balance_loss_clip": 1.06829095, + "balance_loss_mlp": 1.0258522, + "epoch": 0.10605422954367823, + "flos": 23257977068160.0, + "grad_norm": 2.0712205587525623, + "language_loss": 0.7715838, + "learning_rate": 3.939713031010561e-06, + "loss": 0.79411995, + "num_input_tokens_seen": 18726335, + "step": 882, + "time_per_iteration": 2.601681709289551 + }, + { + "auxiliary_loss_clip": 0.01197479, + "auxiliary_loss_mlp": 0.01038438, + "balance_loss_clip": 1.0652554, + "balance_loss_mlp": 1.02549839, + "epoch": 0.10617447243431732, + "flos": 22820118278400.0, + "grad_norm": 2.0119370464286814, + "language_loss": 0.77705848, + "learning_rate": 3.939523066640163e-06, + "loss": 0.79941767, + "num_input_tokens_seen": 18745230, + "step": 883, + "time_per_iteration": 2.611258029937744 + }, + { + "auxiliary_loss_clip": 0.01234587, + "auxiliary_loss_mlp": 0.01038854, + "balance_loss_clip": 1.06795311, + "balance_loss_mlp": 1.02690899, + "epoch": 0.10629471532495641, + "flos": 24386577373440.0, + "grad_norm": 1.7257535437533693, + "language_loss": 0.81320012, + "learning_rate": 3.939332808045657e-06, + "loss": 0.83593452, + "num_input_tokens_seen": 18764880, + "step": 884, + "time_per_iteration": 2.578336715698242 + }, + { + "auxiliary_loss_clip": 0.01200021, + "auxiliary_loss_mlp": 0.01043607, + "balance_loss_clip": 1.06350231, + "balance_loss_mlp": 1.03220499, + "epoch": 0.1064149582155955, + "flos": 21105491581440.0, + "grad_norm": 1.7297978875876656, + "language_loss": 0.84645975, + "learning_rate": 3.939142255255906e-06, + "loss": 0.86889601, + "num_input_tokens_seen": 18785765, + "step": 885, + "time_per_iteration": 2.6485509872436523 + }, + { + "auxiliary_loss_clip": 0.01231295, + "auxiliary_loss_mlp": 0.01034317, + "balance_loss_clip": 1.06758714, + "balance_loss_mlp": 1.02184808, + "epoch": 0.1065352011062346, + "flos": 20702035042560.0, + "grad_norm": 2.184481076466831, + "language_loss": 0.87081289, + "learning_rate": 3.938951408299817e-06, + "loss": 0.8934691, + "num_input_tokens_seen": 18804605, + "step": 886, + "time_per_iteration": 2.5448787212371826 + }, + { + "auxiliary_loss_clip": 0.01097863, + "auxiliary_loss_mlp": 0.01014511, + "balance_loss_clip": 1.04692638, + "balance_loss_mlp": 1.01081586, + "epoch": 0.10665544399687368, + "flos": 62659632689280.0, + "grad_norm": 0.8069423179480928, + "language_loss": 0.5449568, + "learning_rate": 3.938760267206342e-06, + "loss": 0.56608057, + "num_input_tokens_seen": 18866425, + "step": 887, + "time_per_iteration": 3.235097885131836 + }, + { + "auxiliary_loss_clip": 0.01246683, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.06923103, + "balance_loss_mlp": 1.02474415, + "epoch": 0.10677568688751278, + "flos": 26140382830080.0, + "grad_norm": 2.9651813870548587, + "language_loss": 0.78733134, + "learning_rate": 3.938568832004475e-06, + "loss": 0.81016719, + "num_input_tokens_seen": 18885130, + "step": 888, + "time_per_iteration": 2.696911096572876 + }, + { + "auxiliary_loss_clip": 0.01205601, + "auxiliary_loss_mlp": 0.01047404, + "balance_loss_clip": 1.06130052, + "balance_loss_mlp": 1.03506613, + "epoch": 0.10689592977815186, + "flos": 12786533712000.0, + "grad_norm": 2.044086220378799, + "language_loss": 0.75175261, + "learning_rate": 3.938377102723257e-06, + "loss": 0.77428269, + "num_input_tokens_seen": 18902265, + "step": 889, + "time_per_iteration": 2.6156020164489746 + }, + { + "auxiliary_loss_clip": 0.01168326, + "auxiliary_loss_mlp": 0.01048899, + "balance_loss_clip": 1.05677462, + "balance_loss_mlp": 1.03620934, + "epoch": 0.10701617266879096, + "flos": 22126683242880.0, + "grad_norm": 2.0879707643650782, + "language_loss": 0.83190566, + "learning_rate": 3.938185079391774e-06, + "loss": 0.85407794, + "num_input_tokens_seen": 18919310, + "step": 890, + "time_per_iteration": 2.697831869125366 + }, + { + "auxiliary_loss_clip": 0.01246035, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.06751299, + "balance_loss_mlp": 1.02373052, + "epoch": 0.10713641555943004, + "flos": 19745625559680.0, + "grad_norm": 2.95096546335341, + "language_loss": 1.06028533, + "learning_rate": 3.937992762039157e-06, + "loss": 1.0831008, + "num_input_tokens_seen": 18932635, + "step": 891, + "time_per_iteration": 2.5040862560272217 + }, + { + "auxiliary_loss_clip": 0.01229997, + "auxiliary_loss_mlp": 0.01046718, + "balance_loss_clip": 1.06821299, + "balance_loss_mlp": 1.03519642, + "epoch": 0.10725665845006914, + "flos": 23952992302080.0, + "grad_norm": 1.6904696360626592, + "language_loss": 0.80624121, + "learning_rate": 3.937800150694577e-06, + "loss": 0.8290084, + "num_input_tokens_seen": 18953810, + "step": 892, + "time_per_iteration": 2.57356858253479 + }, + { + "auxiliary_loss_clip": 0.01184032, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.06292629, + "balance_loss_mlp": 1.02580535, + "epoch": 0.10737690134070824, + "flos": 18551704371840.0, + "grad_norm": 3.8283504428316184, + "language_loss": 0.75914514, + "learning_rate": 3.937607245387255e-06, + "loss": 0.78136718, + "num_input_tokens_seen": 18973175, + "step": 893, + "time_per_iteration": 2.66790509223938 + }, + { + "auxiliary_loss_clip": 0.01220911, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.06344509, + "balance_loss_mlp": 1.03141284, + "epoch": 0.10749714423134732, + "flos": 22707609903360.0, + "grad_norm": 1.9953284310374422, + "language_loss": 0.72016245, + "learning_rate": 3.937414046146455e-06, + "loss": 0.74279869, + "num_input_tokens_seen": 18991130, + "step": 894, + "time_per_iteration": 2.6203842163085938 + }, + { + "auxiliary_loss_clip": 0.01248196, + "auxiliary_loss_mlp": 0.01048583, + "balance_loss_clip": 1.07028544, + "balance_loss_mlp": 1.03570247, + "epoch": 0.10761738712198642, + "flos": 21106066199040.0, + "grad_norm": 2.1009456808382505, + "language_loss": 0.75195003, + "learning_rate": 3.9372205530014845e-06, + "loss": 0.77491784, + "num_input_tokens_seen": 19009610, + "step": 895, + "time_per_iteration": 2.5062379837036133 + }, + { + "auxiliary_loss_clip": 0.01244361, + "auxiliary_loss_mlp": 0.01050656, + "balance_loss_clip": 1.0667882, + "balance_loss_mlp": 1.03931367, + "epoch": 0.1077376300126255, + "flos": 23766723348480.0, + "grad_norm": 1.9725829713817538, + "language_loss": 0.71280301, + "learning_rate": 3.937026765981696e-06, + "loss": 0.73575318, + "num_input_tokens_seen": 19029680, + "step": 896, + "time_per_iteration": 2.524160861968994 + }, + { + "auxiliary_loss_clip": 0.01204608, + "auxiliary_loss_mlp": 0.01047806, + "balance_loss_clip": 1.06574655, + "balance_loss_mlp": 1.03562832, + "epoch": 0.1078578729032646, + "flos": 20919581763840.0, + "grad_norm": 1.8177511129370527, + "language_loss": 0.79174101, + "learning_rate": 3.936832685116488e-06, + "loss": 0.81426519, + "num_input_tokens_seen": 19047775, + "step": 897, + "time_per_iteration": 2.6213831901550293 + }, + { + "auxiliary_loss_clip": 0.01244984, + "auxiliary_loss_mlp": 0.01047706, + "balance_loss_clip": 1.06768215, + "balance_loss_mlp": 1.03632188, + "epoch": 0.10797811579390369, + "flos": 14829886702080.0, + "grad_norm": 3.9006019505813434, + "language_loss": 0.89869893, + "learning_rate": 3.936638310435301e-06, + "loss": 0.92162585, + "num_input_tokens_seen": 19065640, + "step": 898, + "time_per_iteration": 2.48211407661438 + }, + { + "auxiliary_loss_clip": 0.01233377, + "auxiliary_loss_mlp": 0.01039753, + "balance_loss_clip": 1.0671159, + "balance_loss_mlp": 1.027385, + "epoch": 0.10809835868454278, + "flos": 19536985411200.0, + "grad_norm": 1.8870482773908106, + "language_loss": 0.81717026, + "learning_rate": 3.936443641967623e-06, + "loss": 0.83990157, + "num_input_tokens_seen": 19084470, + "step": 899, + "time_per_iteration": 2.6297972202301025 + }, + { + "auxiliary_loss_clip": 0.01215288, + "auxiliary_loss_mlp": 0.0104974, + "balance_loss_clip": 1.06584024, + "balance_loss_mlp": 1.03720522, + "epoch": 0.10821860157518187, + "flos": 18442320480000.0, + "grad_norm": 1.9752072870018094, + "language_loss": 0.83099663, + "learning_rate": 3.936248679742983e-06, + "loss": 0.85364693, + "num_input_tokens_seen": 19102965, + "step": 900, + "time_per_iteration": 2.7230026721954346 + }, + { + "auxiliary_loss_clip": 0.0108885, + "auxiliary_loss_mlp": 0.01022038, + "balance_loss_clip": 1.02573144, + "balance_loss_mlp": 1.01846206, + "epoch": 0.10833884446582095, + "flos": 49359468447360.0, + "grad_norm": 1.0561499190279335, + "language_loss": 0.70156032, + "learning_rate": 3.936053423790959e-06, + "loss": 0.72266924, + "num_input_tokens_seen": 19151285, + "step": 901, + "time_per_iteration": 4.565507650375366 + }, + { + "auxiliary_loss_clip": 0.01245785, + "auxiliary_loss_mlp": 0.0105394, + "balance_loss_clip": 1.07003534, + "balance_loss_mlp": 1.04275846, + "epoch": 0.10845908735646005, + "flos": 20411912891520.0, + "grad_norm": 1.7809148851027565, + "language_loss": 0.77264971, + "learning_rate": 3.935857874141168e-06, + "loss": 0.79564697, + "num_input_tokens_seen": 19170120, + "step": 902, + "time_per_iteration": 4.17949366569519 + }, + { + "auxiliary_loss_clip": 0.01206677, + "auxiliary_loss_mlp": 0.01036762, + "balance_loss_clip": 1.06274319, + "balance_loss_mlp": 1.023929, + "epoch": 0.10857933024709913, + "flos": 14027750133120.0, + "grad_norm": 2.457791106671611, + "language_loss": 0.83552498, + "learning_rate": 3.935662030823279e-06, + "loss": 0.85795939, + "num_input_tokens_seen": 19186305, + "step": 903, + "time_per_iteration": 2.5696496963500977 + }, + { + "auxiliary_loss_clip": 0.0122893, + "auxiliary_loss_mlp": 0.01047801, + "balance_loss_clip": 1.06380689, + "balance_loss_mlp": 1.03623164, + "epoch": 0.10869957313773823, + "flos": 13369004657280.0, + "grad_norm": 2.144541793372915, + "language_loss": 0.72460008, + "learning_rate": 3.935465893866998e-06, + "loss": 0.74736738, + "num_input_tokens_seen": 19204530, + "step": 904, + "time_per_iteration": 2.5013344287872314 + }, + { + "auxiliary_loss_clip": 0.01215252, + "auxiliary_loss_mlp": 0.01045518, + "balance_loss_clip": 1.0651046, + "balance_loss_mlp": 1.03353155, + "epoch": 0.10881981602837733, + "flos": 25807095509760.0, + "grad_norm": 1.8631507561449274, + "language_loss": 0.80162418, + "learning_rate": 3.935269463302079e-06, + "loss": 0.82423186, + "num_input_tokens_seen": 19222735, + "step": 905, + "time_per_iteration": 2.5993716716766357 + }, + { + "auxiliary_loss_clip": 0.01232574, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_clip": 1.06604266, + "balance_loss_mlp": 1.03403258, + "epoch": 0.10894005891901641, + "flos": 20777555387520.0, + "grad_norm": 1.752147798494373, + "language_loss": 0.7660498, + "learning_rate": 3.935072739158322e-06, + "loss": 0.78884304, + "num_input_tokens_seen": 19242445, + "step": 906, + "time_per_iteration": 2.603715419769287 + }, + { + "auxiliary_loss_clip": 0.01216285, + "auxiliary_loss_mlp": 0.01045409, + "balance_loss_clip": 1.06351054, + "balance_loss_mlp": 1.03328514, + "epoch": 0.10906030180965551, + "flos": 26649883296000.0, + "grad_norm": 1.6051087849701524, + "language_loss": 0.79892141, + "learning_rate": 3.934875721465569e-06, + "loss": 0.82153833, + "num_input_tokens_seen": 19262865, + "step": 907, + "time_per_iteration": 2.660031318664551 + }, + { + "auxiliary_loss_clip": 0.01206826, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.05740714, + "balance_loss_mlp": 1.0243628, + "epoch": 0.10918054470029459, + "flos": 36534402420480.0, + "grad_norm": 2.7031024681674816, + "language_loss": 0.71307731, + "learning_rate": 3.9346784102537076e-06, + "loss": 0.73551643, + "num_input_tokens_seen": 19285000, + "step": 908, + "time_per_iteration": 2.726249933242798 + }, + { + "auxiliary_loss_clip": 0.01241946, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.06549275, + "balance_loss_mlp": 1.02191401, + "epoch": 0.10930078759093369, + "flos": 21762549118080.0, + "grad_norm": 2.1602413612824467, + "language_loss": 0.78048885, + "learning_rate": 3.934480805552669e-06, + "loss": 0.80324185, + "num_input_tokens_seen": 19306010, + "step": 909, + "time_per_iteration": 2.538923501968384 + }, + { + "auxiliary_loss_clip": 0.01243027, + "auxiliary_loss_mlp": 0.00766099, + "balance_loss_clip": 1.0671109, + "balance_loss_mlp": 1.00043392, + "epoch": 0.10942103048157277, + "flos": 22601781457920.0, + "grad_norm": 2.683040007716835, + "language_loss": 0.8823545, + "learning_rate": 3.93428290739243e-06, + "loss": 0.90244579, + "num_input_tokens_seen": 19325380, + "step": 910, + "time_per_iteration": 2.5804762840270996 + }, + { + "auxiliary_loss_clip": 0.01212609, + "auxiliary_loss_mlp": 0.01043356, + "balance_loss_clip": 1.06255555, + "balance_loss_mlp": 1.0314765, + "epoch": 0.10954127337221187, + "flos": 15045781397760.0, + "grad_norm": 2.5509704323512024, + "language_loss": 0.80226022, + "learning_rate": 3.9340847158030125e-06, + "loss": 0.82481986, + "num_input_tokens_seen": 19338960, + "step": 911, + "time_per_iteration": 2.591444492340088 + }, + { + "auxiliary_loss_clip": 0.01227843, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_clip": 1.06210423, + "balance_loss_mlp": 1.03505921, + "epoch": 0.10966151626285096, + "flos": 21650974496640.0, + "grad_norm": 1.8008669266167887, + "language_loss": 0.75721812, + "learning_rate": 3.9338862308144814e-06, + "loss": 0.77996147, + "num_input_tokens_seen": 19357780, + "step": 912, + "time_per_iteration": 2.5491652488708496 + }, + { + "auxiliary_loss_clip": 0.01241929, + "auxiliary_loss_mlp": 0.01038504, + "balance_loss_clip": 1.06527805, + "balance_loss_mlp": 1.02644598, + "epoch": 0.10978175915349005, + "flos": 20121359777280.0, + "grad_norm": 1.6891769587223253, + "language_loss": 0.84569609, + "learning_rate": 3.933687452456946e-06, + "loss": 0.86850047, + "num_input_tokens_seen": 19377680, + "step": 913, + "time_per_iteration": 2.482377290725708 + }, + { + "auxiliary_loss_clip": 0.01194459, + "auxiliary_loss_mlp": 0.01038507, + "balance_loss_clip": 1.05782187, + "balance_loss_mlp": 1.02563262, + "epoch": 0.10990200204412914, + "flos": 20412667077120.0, + "grad_norm": 2.357984621539144, + "language_loss": 0.8658424, + "learning_rate": 3.933488380760562e-06, + "loss": 0.88817203, + "num_input_tokens_seen": 19397040, + "step": 914, + "time_per_iteration": 2.5733702182769775 + }, + { + "auxiliary_loss_clip": 0.01241666, + "auxiliary_loss_mlp": 0.00766404, + "balance_loss_clip": 1.06553578, + "balance_loss_mlp": 1.00029194, + "epoch": 0.11002224493476823, + "flos": 17530117660800.0, + "grad_norm": 1.8858425190326942, + "language_loss": 0.87028563, + "learning_rate": 3.9332890157555286e-06, + "loss": 0.89036632, + "num_input_tokens_seen": 19413975, + "step": 915, + "time_per_iteration": 2.469669818878174 + }, + { + "auxiliary_loss_clip": 0.01216891, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.0642904, + "balance_loss_mlp": 1.0310775, + "epoch": 0.11014248782540732, + "flos": 12203093099520.0, + "grad_norm": 2.0779879065913467, + "language_loss": 0.76569939, + "learning_rate": 3.933089357472088e-06, + "loss": 0.78829557, + "num_input_tokens_seen": 19432005, + "step": 916, + "time_per_iteration": 2.5350403785705566 + }, + { + "auxiliary_loss_clip": 0.01240491, + "auxiliary_loss_mlp": 0.01037486, + "balance_loss_clip": 1.06668663, + "balance_loss_mlp": 1.02588117, + "epoch": 0.11026273071604642, + "flos": 22382977760640.0, + "grad_norm": 1.9537814144608443, + "language_loss": 0.85895693, + "learning_rate": 3.932889405940529e-06, + "loss": 0.88173664, + "num_input_tokens_seen": 19450100, + "step": 917, + "time_per_iteration": 2.488548994064331 + }, + { + "auxiliary_loss_clip": 0.01215023, + "auxiliary_loss_mlp": 0.01039902, + "balance_loss_clip": 1.06833053, + "balance_loss_mlp": 1.02882719, + "epoch": 0.1103829736066855, + "flos": 19829046896640.0, + "grad_norm": 2.3573227942372132, + "language_loss": 0.8032921, + "learning_rate": 3.932689161191184e-06, + "loss": 0.82584137, + "num_input_tokens_seen": 19467805, + "step": 918, + "time_per_iteration": 2.5376923084259033 + }, + { + "auxiliary_loss_clip": 0.01225372, + "auxiliary_loss_mlp": 0.01042683, + "balance_loss_clip": 1.06312788, + "balance_loss_mlp": 1.03035116, + "epoch": 0.1105032164973246, + "flos": 22669616292480.0, + "grad_norm": 2.1540245448021165, + "language_loss": 0.88042688, + "learning_rate": 3.93248862325443e-06, + "loss": 0.9031074, + "num_input_tokens_seen": 19486710, + "step": 919, + "time_per_iteration": 2.5131280422210693 + }, + { + "auxiliary_loss_clip": 0.01119518, + "auxiliary_loss_mlp": 0.01006093, + "balance_loss_clip": 1.02441168, + "balance_loss_mlp": 1.00220704, + "epoch": 0.11062345938796368, + "flos": 66483507876480.0, + "grad_norm": 0.9628648553891785, + "language_loss": 0.6446929, + "learning_rate": 3.932287792160688e-06, + "loss": 0.66594899, + "num_input_tokens_seen": 19545170, + "step": 920, + "time_per_iteration": 2.9984629154205322 + }, + { + "auxiliary_loss_clip": 0.01230006, + "auxiliary_loss_mlp": 0.01039158, + "balance_loss_clip": 1.0635066, + "balance_loss_mlp": 1.02614594, + "epoch": 0.11074370227860278, + "flos": 21907771804800.0, + "grad_norm": 2.286659396698755, + "language_loss": 0.80967748, + "learning_rate": 3.932086667940424e-06, + "loss": 0.83236909, + "num_input_tokens_seen": 19561875, + "step": 921, + "time_per_iteration": 2.5438828468322754 + }, + { + "auxiliary_loss_clip": 0.01225527, + "auxiliary_loss_mlp": 0.0076628, + "balance_loss_clip": 1.06544352, + "balance_loss_mlp": 1.00044894, + "epoch": 0.11086394516924186, + "flos": 28658115763200.0, + "grad_norm": 1.8702677092526565, + "language_loss": 0.81422657, + "learning_rate": 3.93188525062415e-06, + "loss": 0.83414465, + "num_input_tokens_seen": 19582340, + "step": 922, + "time_per_iteration": 2.6303744316101074 + }, + { + "auxiliary_loss_clip": 0.01228102, + "auxiliary_loss_mlp": 0.0105, + "balance_loss_clip": 1.06463611, + "balance_loss_mlp": 1.03803086, + "epoch": 0.11098418805988096, + "flos": 24535247765760.0, + "grad_norm": 2.059490623517715, + "language_loss": 0.85826564, + "learning_rate": 3.931683540242418e-06, + "loss": 0.88104665, + "num_input_tokens_seen": 19603405, + "step": 923, + "time_per_iteration": 2.568347215652466 + }, + { + "auxiliary_loss_clip": 0.01220552, + "auxiliary_loss_mlp": 0.01041222, + "balance_loss_clip": 1.06238747, + "balance_loss_mlp": 1.02862191, + "epoch": 0.11110443095052006, + "flos": 22960384888320.0, + "grad_norm": 3.345995857824467, + "language_loss": 0.91359913, + "learning_rate": 3.9314815368258295e-06, + "loss": 0.93621689, + "num_input_tokens_seen": 19619885, + "step": 924, + "time_per_iteration": 2.527614116668701 + }, + { + "auxiliary_loss_clip": 0.01231046, + "auxiliary_loss_mlp": 0.01038573, + "balance_loss_clip": 1.06968033, + "balance_loss_mlp": 1.02677083, + "epoch": 0.11122467384115914, + "flos": 18950025265920.0, + "grad_norm": 1.648144738620991, + "language_loss": 0.79199755, + "learning_rate": 3.9312792404050275e-06, + "loss": 0.81469381, + "num_input_tokens_seen": 19637940, + "step": 925, + "time_per_iteration": 2.529869794845581 + }, + { + "auxiliary_loss_clip": 0.01241211, + "auxiliary_loss_mlp": 0.01042217, + "balance_loss_clip": 1.06772447, + "balance_loss_mlp": 1.03167939, + "epoch": 0.11134491673179824, + "flos": 25082957324160.0, + "grad_norm": 2.0111942537795175, + "language_loss": 0.77109653, + "learning_rate": 3.9310766510107e-06, + "loss": 0.79393083, + "num_input_tokens_seen": 19657115, + "step": 926, + "time_per_iteration": 2.5497233867645264 + }, + { + "auxiliary_loss_clip": 0.01196749, + "auxiliary_loss_mlp": 0.01045138, + "balance_loss_clip": 1.05819678, + "balance_loss_mlp": 1.03240061, + "epoch": 0.11146515962243732, + "flos": 24499121662080.0, + "grad_norm": 1.7701309045128373, + "language_loss": 0.92141306, + "learning_rate": 3.9308737686735806e-06, + "loss": 0.94383192, + "num_input_tokens_seen": 19677075, + "step": 927, + "time_per_iteration": 3.4489996433258057 + }, + { + "auxiliary_loss_clip": 0.01245008, + "auxiliary_loss_mlp": 0.01049925, + "balance_loss_clip": 1.0675478, + "balance_loss_mlp": 1.03852224, + "epoch": 0.11158540251307641, + "flos": 22343763087360.0, + "grad_norm": 1.9489123122390157, + "language_loss": 0.82494092, + "learning_rate": 3.9306705934244455e-06, + "loss": 0.84789026, + "num_input_tokens_seen": 19697155, + "step": 928, + "time_per_iteration": 3.345170259475708 + }, + { + "auxiliary_loss_clip": 0.01201012, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.06058645, + "balance_loss_mlp": 1.02271295, + "epoch": 0.11170564540371551, + "flos": 19902304684800.0, + "grad_norm": 3.8309276274066555, + "language_loss": 0.88328558, + "learning_rate": 3.930467125294116e-06, + "loss": 0.90563965, + "num_input_tokens_seen": 19716705, + "step": 929, + "time_per_iteration": 4.264178514480591 + }, + { + "auxiliary_loss_clip": 0.01063279, + "auxiliary_loss_mlp": 0.01015038, + "balance_loss_clip": 1.02122641, + "balance_loss_mlp": 1.01181936, + "epoch": 0.1118258882943546, + "flos": 64586239499520.0, + "grad_norm": 0.9267581895326711, + "language_loss": 0.60511625, + "learning_rate": 3.930263364313458e-06, + "loss": 0.62589943, + "num_input_tokens_seen": 19767275, + "step": 930, + "time_per_iteration": 3.0412485599517822 + }, + { + "auxiliary_loss_clip": 0.01192551, + "auxiliary_loss_mlp": 0.01047519, + "balance_loss_clip": 1.05895185, + "balance_loss_mlp": 1.03503776, + "epoch": 0.11194613118499369, + "flos": 17201965985280.0, + "grad_norm": 1.9034218675531067, + "language_loss": 0.8319875, + "learning_rate": 3.930059310513384e-06, + "loss": 0.85438824, + "num_input_tokens_seen": 19786315, + "step": 931, + "time_per_iteration": 2.606708526611328 + }, + { + "auxiliary_loss_clip": 0.01179275, + "auxiliary_loss_mlp": 0.00765751, + "balance_loss_clip": 1.05716705, + "balance_loss_mlp": 1.00044394, + "epoch": 0.11206637407563277, + "flos": 31863465728640.0, + "grad_norm": 1.7644289908324133, + "language_loss": 0.84314966, + "learning_rate": 3.929854963924846e-06, + "loss": 0.86259997, + "num_input_tokens_seen": 19806580, + "step": 932, + "time_per_iteration": 2.692108631134033 + }, + { + "auxiliary_loss_clip": 0.01198069, + "auxiliary_loss_mlp": 0.01036356, + "balance_loss_clip": 1.06000352, + "balance_loss_mlp": 1.02489352, + "epoch": 0.11218661696627187, + "flos": 21945621761280.0, + "grad_norm": 1.8666517588301932, + "language_loss": 0.77612174, + "learning_rate": 3.929650324578845e-06, + "loss": 0.79846597, + "num_input_tokens_seen": 19826045, + "step": 933, + "time_per_iteration": 2.60599684715271 + }, + { + "auxiliary_loss_clip": 0.01214759, + "auxiliary_loss_mlp": 0.01044471, + "balance_loss_clip": 1.06194365, + "balance_loss_mlp": 1.03188276, + "epoch": 0.11230685985691095, + "flos": 25878198481920.0, + "grad_norm": 2.6979353762670946, + "language_loss": 0.82038438, + "learning_rate": 3.929445392506423e-06, + "loss": 0.84297669, + "num_input_tokens_seen": 19843985, + "step": 934, + "time_per_iteration": 2.6032726764678955 + }, + { + "auxiliary_loss_clip": 0.01225622, + "auxiliary_loss_mlp": 0.01044342, + "balance_loss_clip": 1.0667181, + "balance_loss_mlp": 1.03335059, + "epoch": 0.11242710274755005, + "flos": 22231506107520.0, + "grad_norm": 2.2004972779220524, + "language_loss": 0.7586152, + "learning_rate": 3.92924016773867e-06, + "loss": 0.78131485, + "num_input_tokens_seen": 19860480, + "step": 935, + "time_per_iteration": 2.5110716819763184 + }, + { + "auxiliary_loss_clip": 0.01211213, + "auxiliary_loss_mlp": 0.00765341, + "balance_loss_clip": 1.05966306, + "balance_loss_mlp": 1.00027537, + "epoch": 0.11254734563818915, + "flos": 17712184723200.0, + "grad_norm": 2.1775783808745315, + "language_loss": 0.73724914, + "learning_rate": 3.9290346503067175e-06, + "loss": 0.75701463, + "num_input_tokens_seen": 19877145, + "step": 936, + "time_per_iteration": 2.6096205711364746 + }, + { + "auxiliary_loss_clip": 0.01228612, + "auxiliary_loss_mlp": 0.01044612, + "balance_loss_clip": 1.06202245, + "balance_loss_mlp": 1.03279829, + "epoch": 0.11266758852882823, + "flos": 54930397334400.0, + "grad_norm": 1.6875503765399, + "language_loss": 0.79074746, + "learning_rate": 3.9288288402417415e-06, + "loss": 0.81347978, + "num_input_tokens_seen": 19903405, + "step": 937, + "time_per_iteration": 2.8562569618225098 + }, + { + "auxiliary_loss_clip": 0.01229723, + "auxiliary_loss_mlp": 0.01038737, + "balance_loss_clip": 1.06522214, + "balance_loss_mlp": 1.02647066, + "epoch": 0.11278783141946733, + "flos": 18878132194560.0, + "grad_norm": 3.0473630401126446, + "language_loss": 0.70422733, + "learning_rate": 3.928622737574964e-06, + "loss": 0.7269119, + "num_input_tokens_seen": 19918740, + "step": 938, + "time_per_iteration": 2.5113043785095215 + }, + { + "auxiliary_loss_clip": 0.01209721, + "auxiliary_loss_mlp": 0.01040816, + "balance_loss_clip": 1.05999959, + "balance_loss_mlp": 1.02902031, + "epoch": 0.11290807431010641, + "flos": 26469252777600.0, + "grad_norm": 1.7079211075878025, + "language_loss": 0.91000837, + "learning_rate": 3.928416342337652e-06, + "loss": 0.93251377, + "num_input_tokens_seen": 19938475, + "step": 939, + "time_per_iteration": 2.5781867504119873 + }, + { + "auxiliary_loss_clip": 0.01214029, + "auxiliary_loss_mlp": 0.01042584, + "balance_loss_clip": 1.06338894, + "balance_loss_mlp": 1.03046083, + "epoch": 0.1130283172007455, + "flos": 22710590732160.0, + "grad_norm": 1.6486694650201297, + "language_loss": 0.82921183, + "learning_rate": 3.928209654561113e-06, + "loss": 0.85177797, + "num_input_tokens_seen": 19959310, + "step": 940, + "time_per_iteration": 2.60115385055542 + }, + { + "auxiliary_loss_clip": 0.01204416, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.06230795, + "balance_loss_mlp": 1.03075671, + "epoch": 0.1131485600913846, + "flos": 23219911630080.0, + "grad_norm": 2.2595700219827264, + "language_loss": 0.81772578, + "learning_rate": 3.928002674276703e-06, + "loss": 0.8401947, + "num_input_tokens_seen": 19978700, + "step": 941, + "time_per_iteration": 2.550711154937744 + }, + { + "auxiliary_loss_clip": 0.01163359, + "auxiliary_loss_mlp": 0.010395, + "balance_loss_clip": 1.05377817, + "balance_loss_mlp": 1.02747726, + "epoch": 0.11326880298202369, + "flos": 14064271286400.0, + "grad_norm": 2.6591037702523717, + "language_loss": 0.75724494, + "learning_rate": 3.92779540151582e-06, + "loss": 0.77927351, + "num_input_tokens_seen": 19995785, + "step": 942, + "time_per_iteration": 2.6208701133728027 + }, + { + "auxiliary_loss_clip": 0.01209148, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.06147647, + "balance_loss_mlp": 1.0226357, + "epoch": 0.11338904587266278, + "flos": 16325386479360.0, + "grad_norm": 1.7345091594213833, + "language_loss": 0.85486722, + "learning_rate": 3.927587836309907e-06, + "loss": 0.87729418, + "num_input_tokens_seen": 20013615, + "step": 943, + "time_per_iteration": 2.5255768299102783 + }, + { + "auxiliary_loss_clip": 0.01204764, + "auxiliary_loss_mlp": 0.01041192, + "balance_loss_clip": 1.05869961, + "balance_loss_mlp": 1.02967, + "epoch": 0.11350928876330187, + "flos": 24426258923520.0, + "grad_norm": 1.9533308449286588, + "language_loss": 0.78329992, + "learning_rate": 3.927379978690452e-06, + "loss": 0.80575955, + "num_input_tokens_seen": 20032880, + "step": 944, + "time_per_iteration": 2.5818123817443848 + }, + { + "auxiliary_loss_clip": 0.01182674, + "auxiliary_loss_mlp": 0.01044749, + "balance_loss_clip": 1.05213881, + "balance_loss_mlp": 1.03279221, + "epoch": 0.11362953165394096, + "flos": 24497074586880.0, + "grad_norm": 2.327988389960311, + "language_loss": 0.87347078, + "learning_rate": 3.927171828688987e-06, + "loss": 0.89574504, + "num_input_tokens_seen": 20052405, + "step": 945, + "time_per_iteration": 2.626222848892212 + }, + { + "auxiliary_loss_clip": 0.01242082, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.06754446, + "balance_loss_mlp": 1.02386856, + "epoch": 0.11374977454458005, + "flos": 24060831909120.0, + "grad_norm": 2.0206231734861544, + "language_loss": 0.82261312, + "learning_rate": 3.926963386337088e-06, + "loss": 0.84538722, + "num_input_tokens_seen": 20070635, + "step": 946, + "time_per_iteration": 2.5365524291992188 + }, + { + "auxiliary_loss_clip": 0.01245299, + "auxiliary_loss_mlp": 0.01040961, + "balance_loss_clip": 1.06629467, + "balance_loss_mlp": 1.02802038, + "epoch": 0.11387001743521914, + "flos": 39457638967680.0, + "grad_norm": 2.159456717824878, + "language_loss": 0.69928288, + "learning_rate": 3.926754651666375e-06, + "loss": 0.72214556, + "num_input_tokens_seen": 20091195, + "step": 947, + "time_per_iteration": 2.6271743774414062 + }, + { + "auxiliary_loss_clip": 0.0119752, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_clip": 1.06280625, + "balance_loss_mlp": 1.03165853, + "epoch": 0.11399026032585824, + "flos": 25082454533760.0, + "grad_norm": 2.8213346841396163, + "language_loss": 0.7815634, + "learning_rate": 3.926545624708513e-06, + "loss": 0.80397063, + "num_input_tokens_seen": 20110435, + "step": 948, + "time_per_iteration": 2.637596845626831 + }, + { + "auxiliary_loss_clip": 0.01190582, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_clip": 1.05721092, + "balance_loss_mlp": 1.03418267, + "epoch": 0.11411050321649732, + "flos": 17961835224960.0, + "grad_norm": 1.8600363110307376, + "language_loss": 0.85566014, + "learning_rate": 3.926336305495213e-06, + "loss": 0.87802893, + "num_input_tokens_seen": 20128995, + "step": 949, + "time_per_iteration": 2.590027093887329 + }, + { + "auxiliary_loss_clip": 0.01183176, + "auxiliary_loss_mlp": 0.01044449, + "balance_loss_clip": 1.05839086, + "balance_loss_mlp": 1.03125882, + "epoch": 0.11423074610713642, + "flos": 22455409536000.0, + "grad_norm": 2.843676516676487, + "language_loss": 0.89210212, + "learning_rate": 3.926126694058226e-06, + "loss": 0.9143784, + "num_input_tokens_seen": 20148145, + "step": 950, + "time_per_iteration": 2.6399731636047363 + }, + { + "auxiliary_loss_clip": 0.01176519, + "auxiliary_loss_mlp": 0.0104315, + "balance_loss_clip": 1.06042659, + "balance_loss_mlp": 1.03243279, + "epoch": 0.1143509889977755, + "flos": 19717687756800.0, + "grad_norm": 1.4023500310889876, + "language_loss": 0.82142472, + "learning_rate": 3.92591679042935e-06, + "loss": 0.84362149, + "num_input_tokens_seen": 20168035, + "step": 951, + "time_per_iteration": 2.654221296310425 + }, + { + "auxiliary_loss_clip": 0.01225639, + "auxiliary_loss_mlp": 0.01042733, + "balance_loss_clip": 1.06502831, + "balance_loss_mlp": 1.02973866, + "epoch": 0.1144712318884146, + "flos": 19822869757440.0, + "grad_norm": 1.5792850120107134, + "language_loss": 0.82031012, + "learning_rate": 3.92570659464043e-06, + "loss": 0.8429938, + "num_input_tokens_seen": 20186095, + "step": 952, + "time_per_iteration": 2.570864200592041 + }, + { + "auxiliary_loss_clip": 0.01221346, + "auxiliary_loss_mlp": 0.00766276, + "balance_loss_clip": 1.0645318, + "balance_loss_mlp": 1.000525, + "epoch": 0.1145914747790537, + "flos": 14939198766720.0, + "grad_norm": 2.044037121102799, + "language_loss": 0.79848111, + "learning_rate": 3.925496106723349e-06, + "loss": 0.81835735, + "num_input_tokens_seen": 20203535, + "step": 953, + "time_per_iteration": 2.530999183654785 + }, + { + "auxiliary_loss_clip": 0.01226673, + "auxiliary_loss_mlp": 0.01040127, + "balance_loss_clip": 1.06345189, + "balance_loss_mlp": 1.02891517, + "epoch": 0.11471171766969278, + "flos": 19865029345920.0, + "grad_norm": 2.2719813310197168, + "language_loss": 0.83986515, + "learning_rate": 3.9252853267100405e-06, + "loss": 0.86253309, + "num_input_tokens_seen": 20222780, + "step": 954, + "time_per_iteration": 3.2981762886047363 + }, + { + "auxiliary_loss_clip": 0.0118559, + "auxiliary_loss_mlp": 0.01041318, + "balance_loss_clip": 1.05945182, + "balance_loss_mlp": 1.0291822, + "epoch": 0.11483196056033187, + "flos": 22526476594560.0, + "grad_norm": 1.875616324270152, + "language_loss": 0.83536255, + "learning_rate": 3.9250742546324786e-06, + "loss": 0.85763162, + "num_input_tokens_seen": 20243015, + "step": 955, + "time_per_iteration": 3.4052798748016357 + }, + { + "auxiliary_loss_clip": 0.01206633, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.05893803, + "balance_loss_mlp": 1.02925706, + "epoch": 0.11495220345097096, + "flos": 28220292887040.0, + "grad_norm": 1.618262806097162, + "language_loss": 0.86693269, + "learning_rate": 3.924862890522683e-06, + "loss": 0.88939995, + "num_input_tokens_seen": 20263025, + "step": 956, + "time_per_iteration": 4.212643146514893 + }, + { + "auxiliary_loss_clip": 0.01223084, + "auxiliary_loss_mlp": 0.01037441, + "balance_loss_clip": 1.06112909, + "balance_loss_mlp": 1.02590799, + "epoch": 0.11507244634161005, + "flos": 17492267704320.0, + "grad_norm": 2.0530214525973847, + "language_loss": 0.86499703, + "learning_rate": 3.9246512344127174e-06, + "loss": 0.88760233, + "num_input_tokens_seen": 20280685, + "step": 957, + "time_per_iteration": 2.4893124103546143 + }, + { + "auxiliary_loss_clip": 0.01146318, + "auxiliary_loss_mlp": 0.01037975, + "balance_loss_clip": 1.05446625, + "balance_loss_mlp": 1.02659059, + "epoch": 0.11519268923224914, + "flos": 22564937082240.0, + "grad_norm": 1.9017705451146496, + "language_loss": 0.8199634, + "learning_rate": 3.9244392863346895e-06, + "loss": 0.84180629, + "num_input_tokens_seen": 20300090, + "step": 958, + "time_per_iteration": 2.791994571685791 + }, + { + "auxiliary_loss_clip": 0.01212446, + "auxiliary_loss_mlp": 0.01043849, + "balance_loss_clip": 1.0650481, + "balance_loss_mlp": 1.03126693, + "epoch": 0.11531293212288823, + "flos": 16982839065600.0, + "grad_norm": 1.8182902874197975, + "language_loss": 0.92402172, + "learning_rate": 3.9242270463207524e-06, + "loss": 0.94658464, + "num_input_tokens_seen": 20318480, + "step": 959, + "time_per_iteration": 2.7012460231781006 + }, + { + "auxiliary_loss_clip": 0.01164607, + "auxiliary_loss_mlp": 0.01039761, + "balance_loss_clip": 1.0555253, + "balance_loss_mlp": 1.02744722, + "epoch": 0.11543317501352733, + "flos": 12422004537600.0, + "grad_norm": 3.064022707127171, + "language_loss": 0.85613179, + "learning_rate": 3.924014514403102e-06, + "loss": 0.87817556, + "num_input_tokens_seen": 20334635, + "step": 960, + "time_per_iteration": 2.6200289726257324 + }, + { + "auxiliary_loss_clip": 0.01167287, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.0555023, + "balance_loss_mlp": 1.03145111, + "epoch": 0.11555341790416641, + "flos": 19821648695040.0, + "grad_norm": 1.8819418745391039, + "language_loss": 0.91113877, + "learning_rate": 3.92380169061398e-06, + "loss": 0.93325341, + "num_input_tokens_seen": 20352415, + "step": 961, + "time_per_iteration": 2.6419949531555176 + }, + { + "auxiliary_loss_clip": 0.0118503, + "auxiliary_loss_mlp": 0.00766054, + "balance_loss_clip": 1.05472338, + "balance_loss_mlp": 1.0004729, + "epoch": 0.11567366079480551, + "flos": 25738865625600.0, + "grad_norm": 2.2012459478465116, + "language_loss": 0.83903944, + "learning_rate": 3.9235885749856705e-06, + "loss": 0.85855031, + "num_input_tokens_seen": 20371095, + "step": 962, + "time_per_iteration": 2.633448839187622 + }, + { + "auxiliary_loss_clip": 0.01213416, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.0677799, + "balance_loss_mlp": 1.030334, + "epoch": 0.1157939036854446, + "flos": 18223301301120.0, + "grad_norm": 1.7903948176500721, + "language_loss": 0.82646728, + "learning_rate": 3.9233751675505035e-06, + "loss": 0.8490262, + "num_input_tokens_seen": 20389805, + "step": 963, + "time_per_iteration": 2.5845773220062256 + }, + { + "auxiliary_loss_clip": 0.01204736, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.06273198, + "balance_loss_mlp": 1.02586961, + "epoch": 0.11591414657608369, + "flos": 23073755189760.0, + "grad_norm": 1.908807837973274, + "language_loss": 0.847812, + "learning_rate": 3.923161468340853e-06, + "loss": 0.87024498, + "num_input_tokens_seen": 20409640, + "step": 964, + "time_per_iteration": 2.580963373184204 + }, + { + "auxiliary_loss_clip": 0.01165628, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.05430007, + "balance_loss_mlp": 1.02783251, + "epoch": 0.11603438946672277, + "flos": 19461716461440.0, + "grad_norm": 1.6381109348892826, + "language_loss": 0.81631535, + "learning_rate": 3.9229474773891374e-06, + "loss": 0.83836353, + "num_input_tokens_seen": 20428180, + "step": 965, + "time_per_iteration": 2.6528360843658447 + }, + { + "auxiliary_loss_clip": 0.0119866, + "auxiliary_loss_mlp": 0.0104774, + "balance_loss_clip": 1.05532169, + "balance_loss_mlp": 1.03533053, + "epoch": 0.11615463235736187, + "flos": 26831986272000.0, + "grad_norm": 2.812368858162306, + "language_loss": 0.84072852, + "learning_rate": 3.922733194727818e-06, + "loss": 0.86319256, + "num_input_tokens_seen": 20447975, + "step": 966, + "time_per_iteration": 2.665864944458008 + }, + { + "auxiliary_loss_clip": 0.01228559, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.06445479, + "balance_loss_mlp": 1.02529144, + "epoch": 0.11627487524800097, + "flos": 18580324533120.0, + "grad_norm": 2.0308015262553276, + "language_loss": 0.87523764, + "learning_rate": 3.922518620389402e-06, + "loss": 0.89789462, + "num_input_tokens_seen": 20464840, + "step": 967, + "time_per_iteration": 2.491875410079956 + }, + { + "auxiliary_loss_clip": 0.01120636, + "auxiliary_loss_mlp": 0.01040881, + "balance_loss_clip": 1.0513438, + "balance_loss_mlp": 1.02859068, + "epoch": 0.11639511813864005, + "flos": 18150474476160.0, + "grad_norm": 1.7719591020828176, + "language_loss": 0.89476585, + "learning_rate": 3.922303754406439e-06, + "loss": 0.916381, + "num_input_tokens_seen": 20482680, + "step": 968, + "time_per_iteration": 2.739344596862793 + }, + { + "auxiliary_loss_clip": 0.01177221, + "auxiliary_loss_mlp": 0.01046323, + "balance_loss_clip": 1.05591905, + "balance_loss_mlp": 1.03357947, + "epoch": 0.11651536102927915, + "flos": 20922023888640.0, + "grad_norm": 1.8180349908740716, + "language_loss": 0.79353881, + "learning_rate": 3.922088596811526e-06, + "loss": 0.81577432, + "num_input_tokens_seen": 20501810, + "step": 969, + "time_per_iteration": 2.6056923866271973 + }, + { + "auxiliary_loss_clip": 0.01214366, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_clip": 1.06295323, + "balance_loss_mlp": 1.03114057, + "epoch": 0.11663560391991823, + "flos": 16508602776960.0, + "grad_norm": 2.265541965597569, + "language_loss": 0.8693738, + "learning_rate": 3.9218731476373e-06, + "loss": 0.89193857, + "num_input_tokens_seen": 20517995, + "step": 970, + "time_per_iteration": 2.5185046195983887 + }, + { + "auxiliary_loss_clip": 0.01231996, + "auxiliary_loss_mlp": 0.01042893, + "balance_loss_clip": 1.06838155, + "balance_loss_mlp": 1.03031063, + "epoch": 0.11675584681055733, + "flos": 19865029345920.0, + "grad_norm": 2.3135076779749513, + "language_loss": 0.84299803, + "learning_rate": 3.9216574069164455e-06, + "loss": 0.86574686, + "num_input_tokens_seen": 20536970, + "step": 971, + "time_per_iteration": 2.524675130844116 + }, + { + "auxiliary_loss_clip": 0.01237055, + "auxiliary_loss_mlp": 0.01038163, + "balance_loss_clip": 1.06457245, + "balance_loss_mlp": 1.02689755, + "epoch": 0.11687608970119642, + "flos": 21944364785280.0, + "grad_norm": 1.4670532794642632, + "language_loss": 0.80037451, + "learning_rate": 3.921441374681691e-06, + "loss": 0.82312667, + "num_input_tokens_seen": 20557030, + "step": 972, + "time_per_iteration": 2.514143466949463 + }, + { + "auxiliary_loss_clip": 0.01205833, + "auxiliary_loss_mlp": 0.01037075, + "balance_loss_clip": 1.06262183, + "balance_loss_mlp": 1.0254221, + "epoch": 0.1169963325918355, + "flos": 24061155131520.0, + "grad_norm": 1.8665235028513762, + "language_loss": 0.65436268, + "learning_rate": 3.921225050965808e-06, + "loss": 0.67679173, + "num_input_tokens_seen": 20576915, + "step": 973, + "time_per_iteration": 2.61875319480896 + }, + { + "auxiliary_loss_clip": 0.01191129, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.0590806, + "balance_loss_mlp": 1.02712512, + "epoch": 0.1171165754824746, + "flos": 23368151059200.0, + "grad_norm": 2.6880391044166725, + "language_loss": 0.75302392, + "learning_rate": 3.921008435801612e-06, + "loss": 0.77532887, + "num_input_tokens_seen": 20596000, + "step": 974, + "time_per_iteration": 2.6088783740997314 + }, + { + "auxiliary_loss_clip": 0.01213633, + "auxiliary_loss_mlp": 0.01038234, + "balance_loss_clip": 1.06299937, + "balance_loss_mlp": 1.02554417, + "epoch": 0.11723681837311369, + "flos": 18552243075840.0, + "grad_norm": 4.285114135166893, + "language_loss": 0.76008409, + "learning_rate": 3.920791529221963e-06, + "loss": 0.78260279, + "num_input_tokens_seen": 20614675, + "step": 975, + "time_per_iteration": 2.529035806655884 + }, + { + "auxiliary_loss_clip": 0.01210362, + "auxiliary_loss_mlp": 0.00766387, + "balance_loss_clip": 1.06128442, + "balance_loss_mlp": 1.00066805, + "epoch": 0.11735706126375278, + "flos": 23550541344000.0, + "grad_norm": 1.8499289217872699, + "language_loss": 0.76417959, + "learning_rate": 3.920574331259768e-06, + "loss": 0.78394699, + "num_input_tokens_seen": 20635875, + "step": 976, + "time_per_iteration": 2.612368106842041 + }, + { + "auxiliary_loss_clip": 0.01199021, + "auxiliary_loss_mlp": 0.01038131, + "balance_loss_clip": 1.06041372, + "balance_loss_mlp": 1.02742648, + "epoch": 0.11747730415439187, + "flos": 22381541216640.0, + "grad_norm": 2.3332826823854207, + "language_loss": 0.79438019, + "learning_rate": 3.9203568419479716e-06, + "loss": 0.81675172, + "num_input_tokens_seen": 20656430, + "step": 977, + "time_per_iteration": 2.5652849674224854 + }, + { + "auxiliary_loss_clip": 0.0120874, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.06257844, + "balance_loss_mlp": 1.0224371, + "epoch": 0.11759754704503096, + "flos": 22200731130240.0, + "grad_norm": 1.718837618217562, + "language_loss": 0.75087053, + "learning_rate": 3.92013906131957e-06, + "loss": 0.7732901, + "num_input_tokens_seen": 20675360, + "step": 978, + "time_per_iteration": 2.553297519683838 + }, + { + "auxiliary_loss_clip": 0.01191975, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_clip": 1.06099439, + "balance_loss_mlp": 1.03808641, + "epoch": 0.11771778993567006, + "flos": 22309755886080.0, + "grad_norm": 2.0680710964172424, + "language_loss": 0.82466137, + "learning_rate": 3.9199209894076e-06, + "loss": 0.84707075, + "num_input_tokens_seen": 20695675, + "step": 979, + "time_per_iteration": 2.595244884490967 + }, + { + "auxiliary_loss_clip": 0.01242083, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.0656836, + "balance_loss_mlp": 1.02462983, + "epoch": 0.11783803282630914, + "flos": 21288169175040.0, + "grad_norm": 2.4707052789212613, + "language_loss": 0.89629817, + "learning_rate": 3.919702626245142e-06, + "loss": 0.91909659, + "num_input_tokens_seen": 20715330, + "step": 980, + "time_per_iteration": 3.4049274921417236 + }, + { + "auxiliary_loss_clip": 0.01195917, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.060094, + "balance_loss_mlp": 1.02756143, + "epoch": 0.11795827571694824, + "flos": 25371535190400.0, + "grad_norm": 1.9601695462580284, + "language_loss": 0.65970743, + "learning_rate": 3.919483971865322e-06, + "loss": 0.68205845, + "num_input_tokens_seen": 20735325, + "step": 981, + "time_per_iteration": 2.6587114334106445 + }, + { + "auxiliary_loss_clip": 0.01206829, + "auxiliary_loss_mlp": 0.0103895, + "balance_loss_clip": 1.06445777, + "balance_loss_mlp": 1.02741003, + "epoch": 0.11807851860758732, + "flos": 23622218933760.0, + "grad_norm": 1.7745202475073376, + "language_loss": 0.88066924, + "learning_rate": 3.91926502630131e-06, + "loss": 0.90312701, + "num_input_tokens_seen": 20755940, + "step": 982, + "time_per_iteration": 4.167108535766602 + }, + { + "auxiliary_loss_clip": 0.01228528, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.0684576, + "balance_loss_mlp": 1.0300355, + "epoch": 0.11819876149822642, + "flos": 24972496024320.0, + "grad_norm": 1.698647139236579, + "language_loss": 0.72159225, + "learning_rate": 3.91904578958632e-06, + "loss": 0.74428999, + "num_input_tokens_seen": 20775355, + "step": 983, + "time_per_iteration": 3.359976053237915 + }, + { + "auxiliary_loss_clip": 0.01241512, + "auxiliary_loss_mlp": 0.01042995, + "balance_loss_clip": 1.06652236, + "balance_loss_mlp": 1.03113961, + "epoch": 0.11831900438886551, + "flos": 23003226835200.0, + "grad_norm": 2.048071901779721, + "language_loss": 0.84382492, + "learning_rate": 3.918826261753608e-06, + "loss": 0.86667001, + "num_input_tokens_seen": 20794935, + "step": 984, + "time_per_iteration": 2.4908268451690674 + }, + { + "auxiliary_loss_clip": 0.01206934, + "auxiliary_loss_mlp": 0.01038343, + "balance_loss_clip": 1.06103861, + "balance_loss_mlp": 1.02800798, + "epoch": 0.1184392472795046, + "flos": 27965147604480.0, + "grad_norm": 2.394395146675226, + "language_loss": 0.71289194, + "learning_rate": 3.918606442836478e-06, + "loss": 0.73534471, + "num_input_tokens_seen": 20817155, + "step": 985, + "time_per_iteration": 2.6179757118225098 + }, + { + "auxiliary_loss_clip": 0.01222628, + "auxiliary_loss_mlp": 0.01040492, + "balance_loss_clip": 1.06738067, + "balance_loss_mlp": 1.02956688, + "epoch": 0.1185594901701437, + "flos": 19898497843200.0, + "grad_norm": 1.7222729810647157, + "language_loss": 0.77240282, + "learning_rate": 3.918386332868277e-06, + "loss": 0.79503405, + "num_input_tokens_seen": 20835125, + "step": 986, + "time_per_iteration": 2.533193826675415 + }, + { + "auxiliary_loss_clip": 0.01213343, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.0623523, + "balance_loss_mlp": 1.03275156, + "epoch": 0.11867973306078278, + "flos": 18912354877440.0, + "grad_norm": 1.7972402145953454, + "language_loss": 0.94646931, + "learning_rate": 3.918165931882394e-06, + "loss": 0.9690454, + "num_input_tokens_seen": 20853525, + "step": 987, + "time_per_iteration": 2.493241548538208 + }, + { + "auxiliary_loss_clip": 0.01150922, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.05180478, + "balance_loss_mlp": 1.02449632, + "epoch": 0.11879997595142187, + "flos": 16982803152000.0, + "grad_norm": 2.8701367565148863, + "language_loss": 0.75633311, + "learning_rate": 3.917945239912264e-06, + "loss": 0.77820742, + "num_input_tokens_seen": 20871000, + "step": 988, + "time_per_iteration": 2.65818190574646 + }, + { + "auxiliary_loss_clip": 0.01174285, + "auxiliary_loss_mlp": 0.01040062, + "balance_loss_clip": 1.05648839, + "balance_loss_mlp": 1.02960157, + "epoch": 0.11892021884206096, + "flos": 17530369056000.0, + "grad_norm": 2.2683693811168117, + "language_loss": 0.76048595, + "learning_rate": 3.917724256991367e-06, + "loss": 0.78262937, + "num_input_tokens_seen": 20889745, + "step": 989, + "time_per_iteration": 2.6192023754119873 + }, + { + "auxiliary_loss_clip": 0.01197633, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.06089699, + "balance_loss_mlp": 1.04003155, + "epoch": 0.11904046173270005, + "flos": 30955895763840.0, + "grad_norm": 2.1900327150327934, + "language_loss": 0.81445205, + "learning_rate": 3.9175029831532245e-06, + "loss": 0.83694255, + "num_input_tokens_seen": 20909260, + "step": 990, + "time_per_iteration": 2.642359733581543 + }, + { + "auxiliary_loss_clip": 0.01196171, + "auxiliary_loss_mlp": 0.01037197, + "balance_loss_clip": 1.06488705, + "balance_loss_mlp": 1.02659905, + "epoch": 0.11916070462333915, + "flos": 20157234485760.0, + "grad_norm": 1.9819851623689972, + "language_loss": 0.88759965, + "learning_rate": 3.917281418431404e-06, + "loss": 0.90993321, + "num_input_tokens_seen": 20928305, + "step": 991, + "time_per_iteration": 2.5863351821899414 + }, + { + "auxiliary_loss_clip": 0.01205542, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.06330383, + "balance_loss_mlp": 1.02783322, + "epoch": 0.11928094751397823, + "flos": 23551115961600.0, + "grad_norm": 1.8347798245064726, + "language_loss": 0.76992983, + "learning_rate": 3.917059562859516e-06, + "loss": 0.79237938, + "num_input_tokens_seen": 20947630, + "step": 992, + "time_per_iteration": 2.5786662101745605 + }, + { + "auxiliary_loss_clip": 0.01200186, + "auxiliary_loss_mlp": 0.0104575, + "balance_loss_clip": 1.06419182, + "balance_loss_mlp": 1.0331502, + "epoch": 0.11940119040461733, + "flos": 23908426502400.0, + "grad_norm": 2.052912838766127, + "language_loss": 0.888677, + "learning_rate": 3.916837416471218e-06, + "loss": 0.91113639, + "num_input_tokens_seen": 20964250, + "step": 993, + "time_per_iteration": 2.566814661026001 + }, + { + "auxiliary_loss_clip": 0.01216891, + "auxiliary_loss_mlp": 0.01042858, + "balance_loss_clip": 1.060673, + "balance_loss_mlp": 1.03130603, + "epoch": 0.11952143329525641, + "flos": 13844533835520.0, + "grad_norm": 2.1291378616171404, + "language_loss": 0.72395205, + "learning_rate": 3.916614979300207e-06, + "loss": 0.74654961, + "num_input_tokens_seen": 20979095, + "step": 994, + "time_per_iteration": 2.525491237640381 + }, + { + "auxiliary_loss_clip": 0.01166868, + "auxiliary_loss_mlp": 0.01043405, + "balance_loss_clip": 1.05849338, + "balance_loss_mlp": 1.03258657, + "epoch": 0.11964167618589551, + "flos": 27015525792000.0, + "grad_norm": 1.5246351963244797, + "language_loss": 0.78624648, + "learning_rate": 3.9163922513802274e-06, + "loss": 0.80834913, + "num_input_tokens_seen": 21001430, + "step": 995, + "time_per_iteration": 2.684483528137207 + }, + { + "auxiliary_loss_clip": 0.01240758, + "auxiliary_loss_mlp": 0.01039976, + "balance_loss_clip": 1.0657649, + "balance_loss_mlp": 1.028723, + "epoch": 0.1197619190765346, + "flos": 12567622273920.0, + "grad_norm": 2.6258873565575565, + "language_loss": 0.82800126, + "learning_rate": 3.916169232745067e-06, + "loss": 0.85080862, + "num_input_tokens_seen": 21019105, + "step": 996, + "time_per_iteration": 2.4364235401153564 + }, + { + "auxiliary_loss_clip": 0.01198229, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.06201482, + "balance_loss_mlp": 1.03333426, + "epoch": 0.11988216196717369, + "flos": 16909437623040.0, + "grad_norm": 2.324224738622359, + "language_loss": 0.92010516, + "learning_rate": 3.915945923428559e-06, + "loss": 0.94253719, + "num_input_tokens_seen": 21035630, + "step": 997, + "time_per_iteration": 2.552797794342041 + }, + { + "auxiliary_loss_clip": 0.01218022, + "auxiliary_loss_mlp": 0.01039291, + "balance_loss_clip": 1.06150365, + "balance_loss_mlp": 1.02751279, + "epoch": 0.12000240485781279, + "flos": 16216577205120.0, + "grad_norm": 2.2702745622218456, + "language_loss": 0.82913798, + "learning_rate": 3.915722323464577e-06, + "loss": 0.85171103, + "num_input_tokens_seen": 21054235, + "step": 998, + "time_per_iteration": 2.5011770725250244 + }, + { + "auxiliary_loss_clip": 0.01224254, + "auxiliary_loss_mlp": 0.01040576, + "balance_loss_clip": 1.06482995, + "balance_loss_mlp": 1.02895951, + "epoch": 0.12012264774845187, + "flos": 49344887525760.0, + "grad_norm": 9.58896316961832, + "language_loss": 0.70649987, + "learning_rate": 3.91549843288704e-06, + "loss": 0.72914821, + "num_input_tokens_seen": 21077915, + "step": 999, + "time_per_iteration": 2.7434935569763184 + }, + { + "auxiliary_loss_clip": 0.01189199, + "auxiliary_loss_mlp": 0.007659, + "balance_loss_clip": 1.05567646, + "balance_loss_mlp": 1.00062478, + "epoch": 0.12024289063909097, + "flos": 26979435601920.0, + "grad_norm": 2.1096753748710935, + "language_loss": 0.79038888, + "learning_rate": 3.915274251729916e-06, + "loss": 0.80993986, + "num_input_tokens_seen": 21099205, + "step": 1000, + "time_per_iteration": 2.643472671508789 + }, + { + "auxiliary_loss_clip": 0.01196208, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.06424034, + "balance_loss_mlp": 1.02140212, + "epoch": 0.12036313352973005, + "flos": 19537308633600.0, + "grad_norm": 1.8544035027011023, + "language_loss": 0.90385783, + "learning_rate": 3.91504978002721e-06, + "loss": 0.92615628, + "num_input_tokens_seen": 21118260, + "step": 1001, + "time_per_iteration": 2.5690293312072754 + }, + { + "auxiliary_loss_clip": 0.01210551, + "auxiliary_loss_mlp": 0.0076578, + "balance_loss_clip": 1.06141877, + "balance_loss_mlp": 1.00054801, + "epoch": 0.12048337642036915, + "flos": 17268256535040.0, + "grad_norm": 2.0506963233042637, + "language_loss": 0.76309454, + "learning_rate": 3.914825017812974e-06, + "loss": 0.78285784, + "num_input_tokens_seen": 21134910, + "step": 1002, + "time_per_iteration": 2.5428874492645264 + }, + { + "auxiliary_loss_clip": 0.01209486, + "auxiliary_loss_mlp": 0.01042833, + "balance_loss_clip": 1.06509304, + "balance_loss_mlp": 1.03087592, + "epoch": 0.12060361931100824, + "flos": 22856962654080.0, + "grad_norm": 2.9471277677973005, + "language_loss": 0.73059678, + "learning_rate": 3.9145999651213065e-06, + "loss": 0.75311995, + "num_input_tokens_seen": 21154150, + "step": 1003, + "time_per_iteration": 2.5889201164245605 + }, + { + "auxiliary_loss_clip": 0.01225387, + "auxiliary_loss_mlp": 0.01042682, + "balance_loss_clip": 1.06508482, + "balance_loss_mlp": 1.03049278, + "epoch": 0.12072386220164733, + "flos": 16726795943040.0, + "grad_norm": 2.704158983103104, + "language_loss": 0.88778567, + "learning_rate": 3.9143746219863465e-06, + "loss": 0.91046631, + "num_input_tokens_seen": 21171255, + "step": 1004, + "time_per_iteration": 2.502582311630249 + }, + { + "auxiliary_loss_clip": 0.01112758, + "auxiliary_loss_mlp": 0.01018366, + "balance_loss_clip": 1.0276258, + "balance_loss_mlp": 1.01469445, + "epoch": 0.12084410509228642, + "flos": 55144176105600.0, + "grad_norm": 0.9598125803528457, + "language_loss": 0.6477887, + "learning_rate": 3.914148988442278e-06, + "loss": 0.66909993, + "num_input_tokens_seen": 21227045, + "step": 1005, + "time_per_iteration": 3.0781404972076416 + }, + { + "auxiliary_loss_clip": 0.01194883, + "auxiliary_loss_mlp": 0.01037258, + "balance_loss_clip": 1.06053185, + "balance_loss_mlp": 1.02556324, + "epoch": 0.1209643479829255, + "flos": 26760236855040.0, + "grad_norm": 2.646323940227183, + "language_loss": 0.95544195, + "learning_rate": 3.91392306452333e-06, + "loss": 0.9777633, + "num_input_tokens_seen": 21244120, + "step": 1006, + "time_per_iteration": 2.596857786178589 + }, + { + "auxiliary_loss_clip": 0.01243703, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.06813216, + "balance_loss_mlp": 1.02602589, + "epoch": 0.1210845908735646, + "flos": 11035026725760.0, + "grad_norm": 4.04010142298813, + "language_loss": 0.66413605, + "learning_rate": 3.913696850263774e-06, + "loss": 0.68694717, + "num_input_tokens_seen": 21258485, + "step": 1007, + "time_per_iteration": 3.2026400566101074 + }, + { + "auxiliary_loss_clip": 0.01221538, + "auxiliary_loss_mlp": 0.01036002, + "balance_loss_clip": 1.06370449, + "balance_loss_mlp": 1.02464151, + "epoch": 0.1212048337642037, + "flos": 20484631975680.0, + "grad_norm": 1.8975304121341388, + "language_loss": 0.79360056, + "learning_rate": 3.913470345697929e-06, + "loss": 0.81617594, + "num_input_tokens_seen": 21277115, + "step": 1008, + "time_per_iteration": 3.331591844558716 + }, + { + "auxiliary_loss_clip": 0.01179099, + "auxiliary_loss_mlp": 0.0104205, + "balance_loss_clip": 1.05980539, + "balance_loss_mlp": 1.03079629, + "epoch": 0.12132507665484278, + "flos": 22346061557760.0, + "grad_norm": 1.9201007677734316, + "language_loss": 0.85409951, + "learning_rate": 3.913243550860153e-06, + "loss": 0.87631106, + "num_input_tokens_seen": 21294880, + "step": 1009, + "time_per_iteration": 3.6106107234954834 + }, + { + "auxiliary_loss_clip": 0.0122874, + "auxiliary_loss_mlp": 0.01045083, + "balance_loss_clip": 1.06980348, + "balance_loss_mlp": 1.03287625, + "epoch": 0.12144531954548188, + "flos": 29314957818240.0, + "grad_norm": 2.306781336844081, + "language_loss": 0.76189792, + "learning_rate": 3.913016465784852e-06, + "loss": 0.78463614, + "num_input_tokens_seen": 21315555, + "step": 1010, + "time_per_iteration": 2.567572593688965 + }, + { + "auxiliary_loss_clip": 0.01176601, + "auxiliary_loss_mlp": 0.01040594, + "balance_loss_clip": 1.05686808, + "balance_loss_mlp": 1.02845216, + "epoch": 0.12156556243612096, + "flos": 20485242506880.0, + "grad_norm": 2.4686687411030626, + "language_loss": 0.71863604, + "learning_rate": 3.912789090506474e-06, + "loss": 0.74080801, + "num_input_tokens_seen": 21334815, + "step": 1011, + "time_per_iteration": 2.6196062564849854 + }, + { + "auxiliary_loss_clip": 0.01198316, + "auxiliary_loss_mlp": 0.01044277, + "balance_loss_clip": 1.05880427, + "balance_loss_mlp": 1.03208733, + "epoch": 0.12168580532676006, + "flos": 16472009796480.0, + "grad_norm": 2.255577668767007, + "language_loss": 0.72386634, + "learning_rate": 3.9125614250595114e-06, + "loss": 0.74629223, + "num_input_tokens_seen": 21351025, + "step": 1012, + "time_per_iteration": 2.5531017780303955 + }, + { + "auxiliary_loss_clip": 0.01223764, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.06297159, + "balance_loss_mlp": 1.02596307, + "epoch": 0.12180604821739914, + "flos": 15341290588800.0, + "grad_norm": 3.0441915624316063, + "language_loss": 0.88928324, + "learning_rate": 3.912333469478502e-06, + "loss": 0.91190279, + "num_input_tokens_seen": 21368990, + "step": 1013, + "time_per_iteration": 2.495077133178711 + }, + { + "auxiliary_loss_clip": 0.01206488, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.06057394, + "balance_loss_mlp": 1.02187455, + "epoch": 0.12192629110803824, + "flos": 19318038059520.0, + "grad_norm": 2.0427297407854597, + "language_loss": 0.77962685, + "learning_rate": 3.912105223798025e-06, + "loss": 0.80202174, + "num_input_tokens_seen": 21388410, + "step": 1014, + "time_per_iteration": 2.6215145587921143 + }, + { + "auxiliary_loss_clip": 0.01099655, + "auxiliary_loss_mlp": 0.01005045, + "balance_loss_clip": 1.02547741, + "balance_loss_mlp": 1.00158834, + "epoch": 0.12204653399867733, + "flos": 47725354085760.0, + "grad_norm": 0.990520873334885, + "language_loss": 0.67684889, + "learning_rate": 3.9118766880527065e-06, + "loss": 0.69789588, + "num_input_tokens_seen": 21442845, + "step": 1015, + "time_per_iteration": 3.044400930404663 + }, + { + "auxiliary_loss_clip": 0.01168023, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.05710983, + "balance_loss_mlp": 1.02113688, + "epoch": 0.12216677688931642, + "flos": 18221936584320.0, + "grad_norm": 2.52403638356895, + "language_loss": 0.73751146, + "learning_rate": 3.9116478622772145e-06, + "loss": 0.75951493, + "num_input_tokens_seen": 21461420, + "step": 1016, + "time_per_iteration": 2.62351131439209 + }, + { + "auxiliary_loss_clip": 0.01222966, + "auxiliary_loss_mlp": 0.01048376, + "balance_loss_clip": 1.06604719, + "balance_loss_mlp": 1.03663373, + "epoch": 0.12228701977995551, + "flos": 27525636789120.0, + "grad_norm": 1.7199422193388714, + "language_loss": 0.88077521, + "learning_rate": 3.911418746506261e-06, + "loss": 0.90348864, + "num_input_tokens_seen": 21481550, + "step": 1017, + "time_per_iteration": 2.5631885528564453 + }, + { + "auxiliary_loss_clip": 0.01229994, + "auxiliary_loss_mlp": 0.01044416, + "balance_loss_clip": 1.07086349, + "balance_loss_mlp": 1.03259039, + "epoch": 0.1224072626705946, + "flos": 21798136517760.0, + "grad_norm": 1.8260706506822462, + "language_loss": 0.78364635, + "learning_rate": 3.911189340774604e-06, + "loss": 0.8063904, + "num_input_tokens_seen": 21501680, + "step": 1018, + "time_per_iteration": 2.5708253383636475 + }, + { + "auxiliary_loss_clip": 0.01217812, + "auxiliary_loss_mlp": 0.01039771, + "balance_loss_clip": 1.06492114, + "balance_loss_mlp": 1.02800465, + "epoch": 0.1225275055612337, + "flos": 20703758895360.0, + "grad_norm": 1.727340606254441, + "language_loss": 0.79367739, + "learning_rate": 3.910959645117043e-06, + "loss": 0.81625319, + "num_input_tokens_seen": 21521015, + "step": 1019, + "time_per_iteration": 2.5758423805236816 + }, + { + "auxiliary_loss_clip": 0.01105632, + "auxiliary_loss_mlp": 0.007565, + "balance_loss_clip": 1.02706218, + "balance_loss_mlp": 1.00058162, + "epoch": 0.12264774845187278, + "flos": 57745294462080.0, + "grad_norm": 0.8125432820521249, + "language_loss": 0.56720543, + "learning_rate": 3.910729659568423e-06, + "loss": 0.58582675, + "num_input_tokens_seen": 21578200, + "step": 1020, + "time_per_iteration": 3.1648995876312256 + }, + { + "auxiliary_loss_clip": 0.01209931, + "auxiliary_loss_mlp": 0.01040801, + "balance_loss_clip": 1.06600249, + "balance_loss_mlp": 1.03006673, + "epoch": 0.12276799134251187, + "flos": 26396282298240.0, + "grad_norm": 1.7057670937916907, + "language_loss": 0.82469827, + "learning_rate": 3.9104993841636344e-06, + "loss": 0.84720558, + "num_input_tokens_seen": 21598770, + "step": 1021, + "time_per_iteration": 2.609281063079834 + }, + { + "auxiliary_loss_clip": 0.01207678, + "auxiliary_loss_mlp": 0.00765196, + "balance_loss_clip": 1.06627536, + "balance_loss_mlp": 1.00060439, + "epoch": 0.12288823423315097, + "flos": 21064193919360.0, + "grad_norm": 1.8577112068971815, + "language_loss": 0.80922139, + "learning_rate": 3.910268818937608e-06, + "loss": 0.82895011, + "num_input_tokens_seen": 21616925, + "step": 1022, + "time_per_iteration": 2.5876307487487793 + }, + { + "auxiliary_loss_clip": 0.01176879, + "auxiliary_loss_mlp": 0.01039945, + "balance_loss_clip": 1.06090665, + "balance_loss_mlp": 1.0285604, + "epoch": 0.12300847712379005, + "flos": 12312441077760.0, + "grad_norm": 2.6475338074114765, + "language_loss": 0.87307084, + "learning_rate": 3.9100379639253196e-06, + "loss": 0.89523911, + "num_input_tokens_seen": 21633645, + "step": 1023, + "time_per_iteration": 2.6753451824188232 + }, + { + "auxiliary_loss_clip": 0.01206551, + "auxiliary_loss_mlp": 0.01037859, + "balance_loss_clip": 1.05974627, + "balance_loss_mlp": 1.02591443, + "epoch": 0.12312872001442915, + "flos": 16762239688320.0, + "grad_norm": 2.417687254467852, + "language_loss": 0.86479461, + "learning_rate": 3.909806819161791e-06, + "loss": 0.88723874, + "num_input_tokens_seen": 21649120, + "step": 1024, + "time_per_iteration": 2.513341188430786 + }, + { + "auxiliary_loss_clip": 0.01196211, + "auxiliary_loss_mlp": 0.01036433, + "balance_loss_clip": 1.06084919, + "balance_loss_mlp": 1.02440524, + "epoch": 0.12324896290506823, + "flos": 18404937400320.0, + "grad_norm": 1.9589692322372398, + "language_loss": 0.86263263, + "learning_rate": 3.909575384682086e-06, + "loss": 0.88495904, + "num_input_tokens_seen": 21668000, + "step": 1025, + "time_per_iteration": 2.5905184745788574 + }, + { + "auxiliary_loss_clip": 0.01226497, + "auxiliary_loss_mlp": 0.01056194, + "balance_loss_clip": 1.06430888, + "balance_loss_mlp": 1.04410565, + "epoch": 0.12336920579570733, + "flos": 18915407533440.0, + "grad_norm": 1.8655590766453618, + "language_loss": 0.69025713, + "learning_rate": 3.9093436605213144e-06, + "loss": 0.7130841, + "num_input_tokens_seen": 21688500, + "step": 1026, + "time_per_iteration": 2.5556986331939697 + }, + { + "auxiliary_loss_clip": 0.01208768, + "auxiliary_loss_mlp": 0.01044812, + "balance_loss_clip": 1.06253195, + "balance_loss_mlp": 1.03368413, + "epoch": 0.12348944868634643, + "flos": 23878369797120.0, + "grad_norm": 1.7260957482982664, + "language_loss": 0.79429078, + "learning_rate": 3.909111646714627e-06, + "loss": 0.81682652, + "num_input_tokens_seen": 21709345, + "step": 1027, + "time_per_iteration": 2.596848249435425 + }, + { + "auxiliary_loss_clip": 0.01234456, + "auxiliary_loss_mlp": 0.01032956, + "balance_loss_clip": 1.06441426, + "balance_loss_mlp": 1.02237034, + "epoch": 0.12360969157698551, + "flos": 19026084314880.0, + "grad_norm": 2.0780050132534744, + "language_loss": 0.72134864, + "learning_rate": 3.9088793432972206e-06, + "loss": 0.74402279, + "num_input_tokens_seen": 21728165, + "step": 1028, + "time_per_iteration": 2.5289793014526367 + }, + { + "auxiliary_loss_clip": 0.01174556, + "auxiliary_loss_mlp": 0.01041233, + "balance_loss_clip": 1.05893672, + "balance_loss_mlp": 1.02988458, + "epoch": 0.1237299344676246, + "flos": 13224607983360.0, + "grad_norm": 2.2036766739061706, + "language_loss": 0.81887329, + "learning_rate": 3.908646750304336e-06, + "loss": 0.84103113, + "num_input_tokens_seen": 21745850, + "step": 1029, + "time_per_iteration": 2.61556077003479 + }, + { + "auxiliary_loss_clip": 0.01212655, + "auxiliary_loss_mlp": 0.01038181, + "balance_loss_clip": 1.06578064, + "balance_loss_mlp": 1.02691007, + "epoch": 0.12385017735826369, + "flos": 20485673470080.0, + "grad_norm": 1.6563065779352846, + "language_loss": 0.87232924, + "learning_rate": 3.908413867771257e-06, + "loss": 0.89483762, + "num_input_tokens_seen": 21764760, + "step": 1030, + "time_per_iteration": 2.5537776947021484 + }, + { + "auxiliary_loss_clip": 0.01222885, + "auxiliary_loss_mlp": 0.01042772, + "balance_loss_clip": 1.06665456, + "balance_loss_mlp": 1.03029037, + "epoch": 0.12397042024890279, + "flos": 17347835116800.0, + "grad_norm": 2.80765701690499, + "language_loss": 0.80836433, + "learning_rate": 3.908180695733311e-06, + "loss": 0.83102089, + "num_input_tokens_seen": 21784250, + "step": 1031, + "time_per_iteration": 2.5283615589141846 + }, + { + "auxiliary_loss_clip": 0.01156142, + "auxiliary_loss_mlp": 0.0105061, + "balance_loss_clip": 1.05398762, + "balance_loss_mlp": 1.03899932, + "epoch": 0.12409066313954187, + "flos": 20412343854720.0, + "grad_norm": 1.7243856998166662, + "language_loss": 0.82587904, + "learning_rate": 3.907947234225871e-06, + "loss": 0.84794652, + "num_input_tokens_seen": 21803260, + "step": 1032, + "time_per_iteration": 2.6609599590301514 + }, + { + "auxiliary_loss_clip": 0.0115523, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.05652881, + "balance_loss_mlp": 1.02433991, + "epoch": 0.12421090603018096, + "flos": 20736688688640.0, + "grad_norm": 2.032795667262735, + "language_loss": 0.86986744, + "learning_rate": 3.907713483284352e-06, + "loss": 0.89177221, + "num_input_tokens_seen": 21822735, + "step": 1033, + "time_per_iteration": 3.414198398590088 + }, + { + "auxiliary_loss_clip": 0.01136252, + "auxiliary_loss_mlp": 0.01045928, + "balance_loss_clip": 1.05161452, + "balance_loss_mlp": 1.0328629, + "epoch": 0.12433114892082006, + "flos": 24498834353280.0, + "grad_norm": 2.1182155335371875, + "language_loss": 0.96906853, + "learning_rate": 3.907479442944216e-06, + "loss": 0.99089032, + "num_input_tokens_seen": 21841140, + "step": 1034, + "time_per_iteration": 2.730423927307129 + }, + { + "auxiliary_loss_clip": 0.01222586, + "auxiliary_loss_mlp": 0.01037676, + "balance_loss_clip": 1.0656755, + "balance_loss_mlp": 1.02713192, + "epoch": 0.12445139181145914, + "flos": 19682315838720.0, + "grad_norm": 2.3747896242790194, + "language_loss": 0.9254849, + "learning_rate": 3.907245113240963e-06, + "loss": 0.94808751, + "num_input_tokens_seen": 21859260, + "step": 1035, + "time_per_iteration": 4.330326795578003 + }, + { + "auxiliary_loss_clip": 0.0118913, + "auxiliary_loss_mlp": 0.01039028, + "balance_loss_clip": 1.05605388, + "balance_loss_mlp": 1.02721477, + "epoch": 0.12457163470209824, + "flos": 46423087522560.0, + "grad_norm": 1.8092644991493878, + "language_loss": 0.74021316, + "learning_rate": 3.907010494210144e-06, + "loss": 0.76249468, + "num_input_tokens_seen": 21881920, + "step": 1036, + "time_per_iteration": 2.788540840148926 + }, + { + "auxiliary_loss_clip": 0.01227619, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.06821394, + "balance_loss_mlp": 1.02813482, + "epoch": 0.12469187759273732, + "flos": 20376289578240.0, + "grad_norm": 2.2501040124787703, + "language_loss": 0.92053378, + "learning_rate": 3.9067755858873495e-06, + "loss": 0.94322038, + "num_input_tokens_seen": 21898720, + "step": 1037, + "time_per_iteration": 2.520719289779663 + }, + { + "auxiliary_loss_clip": 0.01084568, + "auxiliary_loss_mlp": 0.01010269, + "balance_loss_clip": 1.0198729, + "balance_loss_mlp": 1.00716913, + "epoch": 0.12481212048337642, + "flos": 69224641447680.0, + "grad_norm": 0.8677857729503843, + "language_loss": 0.62883776, + "learning_rate": 3.906540388308214e-06, + "loss": 0.64978611, + "num_input_tokens_seen": 21958305, + "step": 1038, + "time_per_iteration": 3.130140542984009 + }, + { + "auxiliary_loss_clip": 0.01163258, + "auxiliary_loss_mlp": 0.01045288, + "balance_loss_clip": 1.0589267, + "balance_loss_mlp": 1.03314686, + "epoch": 0.12493236337401552, + "flos": 18223696350720.0, + "grad_norm": 1.8073709706382857, + "language_loss": 0.81376076, + "learning_rate": 3.906304901508417e-06, + "loss": 0.83584619, + "num_input_tokens_seen": 21977205, + "step": 1039, + "time_per_iteration": 2.6253833770751953 + }, + { + "auxiliary_loss_clip": 0.01225959, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_clip": 1.06938267, + "balance_loss_mlp": 1.03438282, + "epoch": 0.12505260626465461, + "flos": 30044375303040.0, + "grad_norm": 2.165768411202503, + "language_loss": 0.75856638, + "learning_rate": 3.9060691255236835e-06, + "loss": 0.78127682, + "num_input_tokens_seen": 21997770, + "step": 1040, + "time_per_iteration": 2.6141459941864014 + }, + { + "auxiliary_loss_clip": 0.01214779, + "auxiliary_loss_mlp": 0.01043084, + "balance_loss_clip": 1.05893683, + "balance_loss_mlp": 1.03100181, + "epoch": 0.1251728491552937, + "flos": 24433980347520.0, + "grad_norm": 2.0825170316626966, + "language_loss": 0.80774593, + "learning_rate": 3.905833060389778e-06, + "loss": 0.83032459, + "num_input_tokens_seen": 22021890, + "step": 1041, + "time_per_iteration": 2.617892026901245 + }, + { + "auxiliary_loss_clip": 0.01239973, + "auxiliary_loss_mlp": 0.00765896, + "balance_loss_clip": 1.06771278, + "balance_loss_mlp": 1.00050235, + "epoch": 0.12529309204593278, + "flos": 27119809952640.0, + "grad_norm": 2.033416425227112, + "language_loss": 0.78584045, + "learning_rate": 3.905596706142513e-06, + "loss": 0.80589914, + "num_input_tokens_seen": 22043300, + "step": 1042, + "time_per_iteration": 2.544154644012451 + }, + { + "auxiliary_loss_clip": 0.01186887, + "auxiliary_loss_mlp": 0.01042991, + "balance_loss_clip": 1.05792046, + "balance_loss_mlp": 1.03113532, + "epoch": 0.12541333493657186, + "flos": 30774151923840.0, + "grad_norm": 1.9825665135880344, + "language_loss": 0.86204076, + "learning_rate": 3.9053600628177435e-06, + "loss": 0.88433945, + "num_input_tokens_seen": 22062910, + "step": 1043, + "time_per_iteration": 2.659789800643921 + }, + { + "auxiliary_loss_clip": 0.01235809, + "auxiliary_loss_mlp": 0.0103769, + "balance_loss_clip": 1.06527948, + "balance_loss_mlp": 1.02650785, + "epoch": 0.12553357782721097, + "flos": 23659566099840.0, + "grad_norm": 2.02360247026911, + "language_loss": 0.84805894, + "learning_rate": 3.905123130451367e-06, + "loss": 0.87079394, + "num_input_tokens_seen": 22084010, + "step": 1044, + "time_per_iteration": 2.4836843013763428 + }, + { + "auxiliary_loss_clip": 0.01238001, + "auxiliary_loss_mlp": 0.01038885, + "balance_loss_clip": 1.06661189, + "balance_loss_mlp": 1.02651715, + "epoch": 0.12565382071785006, + "flos": 24863758577280.0, + "grad_norm": 1.834208084065533, + "language_loss": 0.79542667, + "learning_rate": 3.904885909079326e-06, + "loss": 0.81819552, + "num_input_tokens_seen": 22102795, + "step": 1045, + "time_per_iteration": 2.4970035552978516 + }, + { + "auxiliary_loss_clip": 0.01223554, + "auxiliary_loss_mlp": 0.0104162, + "balance_loss_clip": 1.06280184, + "balance_loss_mlp": 1.02990174, + "epoch": 0.12577406360848914, + "flos": 21360780518400.0, + "grad_norm": 2.301228865342368, + "language_loss": 0.77948004, + "learning_rate": 3.904648398737607e-06, + "loss": 0.80213177, + "num_input_tokens_seen": 22121360, + "step": 1046, + "time_per_iteration": 2.5358169078826904 + }, + { + "auxiliary_loss_clip": 0.01237161, + "auxiliary_loss_mlp": 0.01044589, + "balance_loss_clip": 1.06548643, + "balance_loss_mlp": 1.03325796, + "epoch": 0.12589430649912825, + "flos": 36138056774400.0, + "grad_norm": 1.8387368266389037, + "language_loss": 0.77851701, + "learning_rate": 3.9044105994622406e-06, + "loss": 0.8013345, + "num_input_tokens_seen": 22142505, + "step": 1047, + "time_per_iteration": 2.6019949913024902 + }, + { + "auxiliary_loss_clip": 0.01209834, + "auxiliary_loss_mlp": 0.0076652, + "balance_loss_clip": 1.06142414, + "balance_loss_mlp": 1.00049973, + "epoch": 0.12601454938976733, + "flos": 25337671643520.0, + "grad_norm": 1.7879344355647262, + "language_loss": 0.82132846, + "learning_rate": 3.9041725112893005e-06, + "loss": 0.84109193, + "num_input_tokens_seen": 22163730, + "step": 1048, + "time_per_iteration": 2.623356342315674 + }, + { + "auxiliary_loss_clip": 0.0118795, + "auxiliary_loss_mlp": 0.01044945, + "balance_loss_clip": 1.06150615, + "balance_loss_mlp": 1.03337622, + "epoch": 0.12613479228040642, + "flos": 15560094286080.0, + "grad_norm": 2.8337247214013024, + "language_loss": 0.751284, + "learning_rate": 3.903934134254904e-06, + "loss": 0.77361298, + "num_input_tokens_seen": 22181520, + "step": 1049, + "time_per_iteration": 2.5933494567871094 + }, + { + "auxiliary_loss_clip": 0.01227356, + "auxiliary_loss_mlp": 0.01042194, + "balance_loss_clip": 1.06444979, + "balance_loss_mlp": 1.02993989, + "epoch": 0.1262550351710455, + "flos": 21470595373440.0, + "grad_norm": 2.1939216617211996, + "language_loss": 0.85168588, + "learning_rate": 3.903695468395213e-06, + "loss": 0.87438142, + "num_input_tokens_seen": 22199390, + "step": 1050, + "time_per_iteration": 2.591585636138916 + }, + { + "auxiliary_loss_clip": 0.01209933, + "auxiliary_loss_mlp": 0.01042422, + "balance_loss_clip": 1.05910277, + "balance_loss_mlp": 1.03159213, + "epoch": 0.1263752780616846, + "flos": 31576719456000.0, + "grad_norm": 2.4365251497791647, + "language_loss": 0.56180906, + "learning_rate": 3.903456513746434e-06, + "loss": 0.58433264, + "num_input_tokens_seen": 22220365, + "step": 1051, + "time_per_iteration": 2.624966621398926 + }, + { + "auxiliary_loss_clip": 0.01233572, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.06357551, + "balance_loss_mlp": 1.02632856, + "epoch": 0.1264955209523237, + "flos": 28768217927040.0, + "grad_norm": 1.7156146986624585, + "language_loss": 0.87875628, + "learning_rate": 3.903217270344815e-06, + "loss": 0.90146673, + "num_input_tokens_seen": 22240615, + "step": 1052, + "time_per_iteration": 2.5525124073028564 + }, + { + "auxiliary_loss_clip": 0.01182613, + "auxiliary_loss_mlp": 0.01041455, + "balance_loss_clip": 1.05629277, + "balance_loss_mlp": 1.02985048, + "epoch": 0.12661576384296278, + "flos": 29241125412480.0, + "grad_norm": 1.9010482147436805, + "language_loss": 0.82078099, + "learning_rate": 3.902977738226648e-06, + "loss": 0.84302169, + "num_input_tokens_seen": 22261350, + "step": 1053, + "time_per_iteration": 2.6897804737091064 + }, + { + "auxiliary_loss_clip": 0.01226416, + "auxiliary_loss_mlp": 0.01048902, + "balance_loss_clip": 1.06522703, + "balance_loss_mlp": 1.03630781, + "epoch": 0.12673600673360189, + "flos": 20850346298880.0, + "grad_norm": 2.2315762732263758, + "language_loss": 0.91388261, + "learning_rate": 3.902737917428273e-06, + "loss": 0.93663585, + "num_input_tokens_seen": 22279515, + "step": 1054, + "time_per_iteration": 2.5261991024017334 + }, + { + "auxiliary_loss_clip": 0.01235811, + "auxiliary_loss_mlp": 0.01038744, + "balance_loss_clip": 1.06448388, + "balance_loss_mlp": 1.02759171, + "epoch": 0.12685624962424097, + "flos": 25263695583360.0, + "grad_norm": 2.5699811173321736, + "language_loss": 0.84103638, + "learning_rate": 3.902497807986068e-06, + "loss": 0.86378193, + "num_input_tokens_seen": 22299535, + "step": 1055, + "time_per_iteration": 2.5223405361175537 + }, + { + "auxiliary_loss_clip": 0.01191034, + "auxiliary_loss_mlp": 0.01039989, + "balance_loss_clip": 1.05673623, + "balance_loss_mlp": 1.02774, + "epoch": 0.12697649251488005, + "flos": 27527109246720.0, + "grad_norm": 1.875138620495883, + "language_loss": 0.84047484, + "learning_rate": 3.902257409936458e-06, + "loss": 0.8627851, + "num_input_tokens_seen": 22320300, + "step": 1056, + "time_per_iteration": 2.703920602798462 + }, + { + "auxiliary_loss_clip": 0.01206425, + "auxiliary_loss_mlp": 0.01043132, + "balance_loss_clip": 1.06520307, + "balance_loss_mlp": 1.03168225, + "epoch": 0.12709673540551916, + "flos": 21251863503360.0, + "grad_norm": 1.7813530729522795, + "language_loss": 0.83898681, + "learning_rate": 3.902016723315912e-06, + "loss": 0.86148238, + "num_input_tokens_seen": 22338240, + "step": 1057, + "time_per_iteration": 2.5500500202178955 + }, + { + "auxiliary_loss_clip": 0.01218692, + "auxiliary_loss_mlp": 0.01038266, + "balance_loss_clip": 1.06144941, + "balance_loss_mlp": 1.0269469, + "epoch": 0.12721697829615825, + "flos": 25337707557120.0, + "grad_norm": 2.6080745494358983, + "language_loss": 0.69487011, + "learning_rate": 3.901775748160941e-06, + "loss": 0.71743965, + "num_input_tokens_seen": 22357420, + "step": 1058, + "time_per_iteration": 2.5631563663482666 + }, + { + "auxiliary_loss_clip": 0.01097698, + "auxiliary_loss_mlp": 0.01003875, + "balance_loss_clip": 1.02415848, + "balance_loss_mlp": 1.00041831, + "epoch": 0.12733722118679733, + "flos": 61943287754880.0, + "grad_norm": 0.794994901578526, + "language_loss": 0.6091494, + "learning_rate": 3.901534484508101e-06, + "loss": 0.6301651, + "num_input_tokens_seen": 22420095, + "step": 1059, + "time_per_iteration": 3.1312367916107178 + }, + { + "auxiliary_loss_clip": 0.01198224, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.0592581, + "balance_loss_mlp": 1.02780747, + "epoch": 0.1274574640774364, + "flos": 26976742081920.0, + "grad_norm": 1.757718367762464, + "language_loss": 0.74566364, + "learning_rate": 3.901292932393991e-06, + "loss": 0.76804471, + "num_input_tokens_seen": 22438975, + "step": 1060, + "time_per_iteration": 3.37105655670166 + }, + { + "auxiliary_loss_clip": 0.0123693, + "auxiliary_loss_mlp": 0.01044546, + "balance_loss_clip": 1.06642056, + "balance_loss_mlp": 1.032727, + "epoch": 0.12757770696807552, + "flos": 22236318529920.0, + "grad_norm": 2.683069096975892, + "language_loss": 0.85380304, + "learning_rate": 3.9010510918552555e-06, + "loss": 0.87661779, + "num_input_tokens_seen": 22458050, + "step": 1061, + "time_per_iteration": 3.207925796508789 + }, + { + "auxiliary_loss_clip": 0.01205701, + "auxiliary_loss_mlp": 0.01047746, + "balance_loss_clip": 1.06172013, + "balance_loss_mlp": 1.03438258, + "epoch": 0.1276979498587146, + "flos": 28547905858560.0, + "grad_norm": 2.2181327617303332, + "language_loss": 0.74518663, + "learning_rate": 3.900808962928581e-06, + "loss": 0.76772106, + "num_input_tokens_seen": 22475665, + "step": 1062, + "time_per_iteration": 4.192541122436523 + }, + { + "auxiliary_loss_clip": 0.01237608, + "auxiliary_loss_mlp": 0.01044268, + "balance_loss_clip": 1.06756854, + "balance_loss_mlp": 1.03203726, + "epoch": 0.1278181927493537, + "flos": 17420338719360.0, + "grad_norm": 2.1493599543259476, + "language_loss": 0.89597332, + "learning_rate": 3.900566545650698e-06, + "loss": 0.91879213, + "num_input_tokens_seen": 22493335, + "step": 1063, + "time_per_iteration": 2.4348583221435547 + }, + { + "auxiliary_loss_clip": 0.01221776, + "auxiliary_loss_mlp": 0.01037193, + "balance_loss_clip": 1.06514704, + "balance_loss_mlp": 1.02459216, + "epoch": 0.1279384356399928, + "flos": 21138636856320.0, + "grad_norm": 2.4317959370763806, + "language_loss": 0.81991482, + "learning_rate": 3.900323840058381e-06, + "loss": 0.8425045, + "num_input_tokens_seen": 22511045, + "step": 1064, + "time_per_iteration": 2.4917423725128174 + }, + { + "auxiliary_loss_clip": 0.01221352, + "auxiliary_loss_mlp": 0.01037427, + "balance_loss_clip": 1.06205368, + "balance_loss_mlp": 1.02652502, + "epoch": 0.12805867853063188, + "flos": 26576733248640.0, + "grad_norm": 1.8150866672807773, + "language_loss": 0.81860203, + "learning_rate": 3.900080846188449e-06, + "loss": 0.8411898, + "num_input_tokens_seen": 22529635, + "step": 1065, + "time_per_iteration": 2.579023838043213 + }, + { + "auxiliary_loss_clip": 0.01237094, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.06564808, + "balance_loss_mlp": 1.02511668, + "epoch": 0.12817892142127096, + "flos": 16436206915200.0, + "grad_norm": 2.1840617257935677, + "language_loss": 0.81221342, + "learning_rate": 3.8998375640777625e-06, + "loss": 0.83495647, + "num_input_tokens_seen": 22547505, + "step": 1066, + "time_per_iteration": 2.452153444290161 + }, + { + "auxiliary_loss_clip": 0.01097635, + "auxiliary_loss_mlp": 0.01006575, + "balance_loss_clip": 1.02421284, + "balance_loss_mlp": 1.00311816, + "epoch": 0.12829916431191005, + "flos": 60757049099520.0, + "grad_norm": 0.7008882319576063, + "language_loss": 0.52651429, + "learning_rate": 3.899593993763229e-06, + "loss": 0.5475564, + "num_input_tokens_seen": 22608465, + "step": 1067, + "time_per_iteration": 3.0849146842956543 + }, + { + "auxiliary_loss_clip": 0.01187537, + "auxiliary_loss_mlp": 0.01040392, + "balance_loss_clip": 1.06102085, + "balance_loss_mlp": 1.02678406, + "epoch": 0.12841940720254916, + "flos": 29786895636480.0, + "grad_norm": 1.9326467634356748, + "language_loss": 0.81173527, + "learning_rate": 3.899350135281796e-06, + "loss": 0.83401453, + "num_input_tokens_seen": 22629465, + "step": 1068, + "time_per_iteration": 2.7014524936676025 + }, + { + "auxiliary_loss_clip": 0.01195867, + "auxiliary_loss_mlp": 0.01040886, + "balance_loss_clip": 1.06296778, + "balance_loss_mlp": 1.0292275, + "epoch": 0.12853965009318824, + "flos": 25951851319680.0, + "grad_norm": 1.8942979586135964, + "language_loss": 0.79509997, + "learning_rate": 3.8991059886704585e-06, + "loss": 0.81746751, + "num_input_tokens_seen": 22648970, + "step": 1069, + "time_per_iteration": 2.7527835369110107 + }, + { + "auxiliary_loss_clip": 0.0118406, + "auxiliary_loss_mlp": 0.01043911, + "balance_loss_clip": 1.05845666, + "balance_loss_mlp": 1.0321269, + "epoch": 0.12865989298382732, + "flos": 30846871008000.0, + "grad_norm": 3.0375864042754754, + "language_loss": 0.83218378, + "learning_rate": 3.898861553966252e-06, + "loss": 0.85446352, + "num_input_tokens_seen": 22668620, + "step": 1070, + "time_per_iteration": 2.6544060707092285 + }, + { + "auxiliary_loss_clip": 0.01147119, + "auxiliary_loss_mlp": 0.01040795, + "balance_loss_clip": 1.05284178, + "balance_loss_mlp": 1.02928531, + "epoch": 0.12878013587446643, + "flos": 25885776251520.0, + "grad_norm": 1.6059021970267606, + "language_loss": 0.88276815, + "learning_rate": 3.898616831206257e-06, + "loss": 0.90464735, + "num_input_tokens_seen": 22689045, + "step": 1071, + "time_per_iteration": 2.8169755935668945 + }, + { + "auxiliary_loss_clip": 0.01187432, + "auxiliary_loss_mlp": 0.01039514, + "balance_loss_clip": 1.0563271, + "balance_loss_mlp": 1.02572703, + "epoch": 0.12890037876510552, + "flos": 23333138277120.0, + "grad_norm": 2.0507138467152575, + "language_loss": 0.77051663, + "learning_rate": 3.8983718204276e-06, + "loss": 0.79278612, + "num_input_tokens_seen": 22711265, + "step": 1072, + "time_per_iteration": 2.6494476795196533 + }, + { + "auxiliary_loss_clip": 0.01202952, + "auxiliary_loss_mlp": 0.01043137, + "balance_loss_clip": 1.06074369, + "balance_loss_mlp": 1.03225291, + "epoch": 0.1290206216557446, + "flos": 23587242065280.0, + "grad_norm": 1.6210355732540829, + "language_loss": 0.82544482, + "learning_rate": 3.898126521667446e-06, + "loss": 0.84790564, + "num_input_tokens_seen": 22731420, + "step": 1073, + "time_per_iteration": 2.606952428817749 + }, + { + "auxiliary_loss_clip": 0.01217308, + "auxiliary_loss_mlp": 0.01051592, + "balance_loss_clip": 1.06015873, + "balance_loss_mlp": 1.03913474, + "epoch": 0.12914086454638368, + "flos": 24170610850560.0, + "grad_norm": 2.0889349962772408, + "language_loss": 0.83094406, + "learning_rate": 3.897880934963007e-06, + "loss": 0.85363305, + "num_input_tokens_seen": 22750970, + "step": 1074, + "time_per_iteration": 2.5464606285095215 + }, + { + "auxiliary_loss_clip": 0.01200189, + "auxiliary_loss_mlp": 0.01035144, + "balance_loss_clip": 1.05767119, + "balance_loss_mlp": 1.02312768, + "epoch": 0.1292611074370228, + "flos": 20267157081600.0, + "grad_norm": 2.3825652314996315, + "language_loss": 0.7839219, + "learning_rate": 3.89763506035154e-06, + "loss": 0.80627525, + "num_input_tokens_seen": 22768820, + "step": 1075, + "time_per_iteration": 2.5488884449005127 + }, + { + "auxiliary_loss_clip": 0.01207455, + "auxiliary_loss_mlp": 0.01038959, + "balance_loss_clip": 1.06099534, + "balance_loss_mlp": 1.02745569, + "epoch": 0.12938135032766188, + "flos": 27377684668800.0, + "grad_norm": 1.818841941166332, + "language_loss": 0.80930656, + "learning_rate": 3.897388897870343e-06, + "loss": 0.83177066, + "num_input_tokens_seen": 22789460, + "step": 1076, + "time_per_iteration": 2.6027488708496094 + }, + { + "auxiliary_loss_clip": 0.01216497, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.06091404, + "balance_loss_mlp": 1.02390647, + "epoch": 0.12950159321830096, + "flos": 29277107861760.0, + "grad_norm": 2.267886828067961, + "language_loss": 0.74844557, + "learning_rate": 3.89714244755676e-06, + "loss": 0.77097303, + "num_input_tokens_seen": 22810820, + "step": 1077, + "time_per_iteration": 2.6856064796447754 + }, + { + "auxiliary_loss_clip": 0.01160281, + "auxiliary_loss_mlp": 0.01048739, + "balance_loss_clip": 1.0522207, + "balance_loss_mlp": 1.03680611, + "epoch": 0.12962183610894007, + "flos": 24534888629760.0, + "grad_norm": 2.6921088966252413, + "language_loss": 0.86473924, + "learning_rate": 3.896895709448175e-06, + "loss": 0.8868295, + "num_input_tokens_seen": 22830570, + "step": 1078, + "time_per_iteration": 2.6484367847442627 + }, + { + "auxiliary_loss_clip": 0.0115089, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_clip": 1.05036759, + "balance_loss_mlp": 1.0327301, + "epoch": 0.12974207899957915, + "flos": 11215944552960.0, + "grad_norm": 2.677033917966243, + "language_loss": 0.76961946, + "learning_rate": 3.896648683582019e-06, + "loss": 0.79156888, + "num_input_tokens_seen": 22845905, + "step": 1079, + "time_per_iteration": 2.630662202835083 + }, + { + "auxiliary_loss_clip": 0.0117227, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.05983758, + "balance_loss_mlp": 1.03013206, + "epoch": 0.12986232189021824, + "flos": 24717889445760.0, + "grad_norm": 3.2649643005184514, + "language_loss": 0.80984527, + "learning_rate": 3.896401369995766e-06, + "loss": 0.83197886, + "num_input_tokens_seen": 22865710, + "step": 1080, + "time_per_iteration": 2.6650562286376953 + }, + { + "auxiliary_loss_clip": 0.01238362, + "auxiliary_loss_mlp": 0.01051007, + "balance_loss_clip": 1.06849241, + "balance_loss_mlp": 1.03957534, + "epoch": 0.12998256478085732, + "flos": 23915357827200.0, + "grad_norm": 1.76989814963174, + "language_loss": 0.79493952, + "learning_rate": 3.896153768726932e-06, + "loss": 0.81783319, + "num_input_tokens_seen": 22886020, + "step": 1081, + "time_per_iteration": 2.4893932342529297 + }, + { + "auxiliary_loss_clip": 0.01220664, + "auxiliary_loss_mlp": 0.01043063, + "balance_loss_clip": 1.06560612, + "balance_loss_mlp": 1.03135657, + "epoch": 0.13010280767149643, + "flos": 18624207974400.0, + "grad_norm": 3.0296820114714005, + "language_loss": 0.88073671, + "learning_rate": 3.8959058798130806e-06, + "loss": 0.90337396, + "num_input_tokens_seen": 22903995, + "step": 1082, + "time_per_iteration": 2.4932053089141846 + }, + { + "auxiliary_loss_clip": 0.01207132, + "auxiliary_loss_mlp": 0.00766376, + "balance_loss_clip": 1.06225491, + "balance_loss_mlp": 1.00053179, + "epoch": 0.1302230505621355, + "flos": 22783992174720.0, + "grad_norm": 1.7686071487870745, + "language_loss": 0.74923426, + "learning_rate": 3.895657703291814e-06, + "loss": 0.7689693, + "num_input_tokens_seen": 22924100, + "step": 1083, + "time_per_iteration": 2.5632290840148926 + }, + { + "auxiliary_loss_clip": 0.01214209, + "auxiliary_loss_mlp": 0.01036018, + "balance_loss_clip": 1.06086493, + "balance_loss_mlp": 1.02413881, + "epoch": 0.1303432934527746, + "flos": 21323612920320.0, + "grad_norm": 2.786514891613513, + "language_loss": 0.7979756, + "learning_rate": 3.895409239200781e-06, + "loss": 0.8204779, + "num_input_tokens_seen": 22939985, + "step": 1084, + "time_per_iteration": 2.544332504272461 + }, + { + "auxiliary_loss_clip": 0.01213417, + "auxiliary_loss_mlp": 0.01042394, + "balance_loss_clip": 1.06044936, + "balance_loss_mlp": 1.02955484, + "epoch": 0.1304635363434137, + "flos": 20922490765440.0, + "grad_norm": 5.791234820138607, + "language_loss": 0.91468573, + "learning_rate": 3.895160487577673e-06, + "loss": 0.93724382, + "num_input_tokens_seen": 22957555, + "step": 1085, + "time_per_iteration": 2.494330406188965 + }, + { + "auxiliary_loss_clip": 0.01108961, + "auxiliary_loss_mlp": 0.01007128, + "balance_loss_clip": 1.02338457, + "balance_loss_mlp": 1.00421894, + "epoch": 0.1305837792340528, + "flos": 63245659080960.0, + "grad_norm": 0.7926481764659873, + "language_loss": 0.60941195, + "learning_rate": 3.894911448460226e-06, + "loss": 0.6305728, + "num_input_tokens_seen": 23016870, + "step": 1086, + "time_per_iteration": 2.9574227333068848 + }, + { + "auxiliary_loss_clip": 0.01123452, + "auxiliary_loss_mlp": 0.01042338, + "balance_loss_clip": 1.05006611, + "balance_loss_mlp": 1.03051257, + "epoch": 0.13070402212469187, + "flos": 26428852955520.0, + "grad_norm": 1.8395267344183746, + "language_loss": 0.72356761, + "learning_rate": 3.8946621218862195e-06, + "loss": 0.74522561, + "num_input_tokens_seen": 23037870, + "step": 1087, + "time_per_iteration": 3.5511553287506104 + }, + { + "auxiliary_loss_clip": 0.01185378, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_clip": 1.05946052, + "balance_loss_mlp": 1.03088737, + "epoch": 0.13082426501533098, + "flos": 27673409341440.0, + "grad_norm": 1.9371779198412662, + "language_loss": 0.88693434, + "learning_rate": 3.894412507893475e-06, + "loss": 0.90920585, + "num_input_tokens_seen": 23058150, + "step": 1088, + "time_per_iteration": 4.178976774215698 + }, + { + "auxiliary_loss_clip": 0.01180233, + "auxiliary_loss_mlp": 0.01043061, + "balance_loss_clip": 1.05770159, + "balance_loss_mlp": 1.03132463, + "epoch": 0.13094450790597006, + "flos": 24826770547200.0, + "grad_norm": 1.9488178767877817, + "language_loss": 0.71934056, + "learning_rate": 3.894162606519859e-06, + "loss": 0.74157357, + "num_input_tokens_seen": 23077100, + "step": 1089, + "time_per_iteration": 3.4846904277801514 + }, + { + "auxiliary_loss_clip": 0.01171111, + "auxiliary_loss_mlp": 0.01043088, + "balance_loss_clip": 1.05747199, + "balance_loss_mlp": 1.03246045, + "epoch": 0.13106475079660915, + "flos": 19062605468160.0, + "grad_norm": 2.3813549649653862, + "language_loss": 0.77142668, + "learning_rate": 3.893912417803282e-06, + "loss": 0.79356867, + "num_input_tokens_seen": 23096815, + "step": 1090, + "time_per_iteration": 2.6153159141540527 + }, + { + "auxiliary_loss_clip": 0.01173876, + "auxiliary_loss_mlp": 0.01043124, + "balance_loss_clip": 1.05355191, + "balance_loss_mlp": 1.03092289, + "epoch": 0.13118499368724823, + "flos": 28913189218560.0, + "grad_norm": 1.9246357570134993, + "language_loss": 0.76847947, + "learning_rate": 3.8936619417816975e-06, + "loss": 0.79064941, + "num_input_tokens_seen": 23117145, + "step": 1091, + "time_per_iteration": 2.695366859436035 + }, + { + "auxiliary_loss_clip": 0.01189844, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.06271899, + "balance_loss_mlp": 1.02102029, + "epoch": 0.13130523657788734, + "flos": 14283398206080.0, + "grad_norm": 2.2137688150948014, + "language_loss": 0.71569204, + "learning_rate": 3.8934111784931015e-06, + "loss": 0.73791045, + "num_input_tokens_seen": 23134595, + "step": 1092, + "time_per_iteration": 2.5972609519958496 + }, + { + "auxiliary_loss_clip": 0.01107045, + "auxiliary_loss_mlp": 0.01006468, + "balance_loss_clip": 1.0292387, + "balance_loss_mlp": 1.00354791, + "epoch": 0.13142547946852642, + "flos": 70174155519360.0, + "grad_norm": 0.9096421325731003, + "language_loss": 0.59038675, + "learning_rate": 3.893160127975535e-06, + "loss": 0.61152184, + "num_input_tokens_seen": 23195285, + "step": 1093, + "time_per_iteration": 3.2276577949523926 + }, + { + "auxiliary_loss_clip": 0.0117617, + "auxiliary_loss_mlp": 0.01038652, + "balance_loss_clip": 1.0552814, + "balance_loss_mlp": 1.02735662, + "epoch": 0.1315457223591655, + "flos": 45805998844800.0, + "grad_norm": 2.422349245228072, + "language_loss": 0.81867808, + "learning_rate": 3.8929087902670826e-06, + "loss": 0.84082627, + "num_input_tokens_seen": 23216915, + "step": 1094, + "time_per_iteration": 2.856400489807129 + }, + { + "auxiliary_loss_clip": 0.01116557, + "auxiliary_loss_mlp": 0.01003447, + "balance_loss_clip": 1.02710652, + "balance_loss_mlp": 1.00062132, + "epoch": 0.13166596524980462, + "flos": 62881165820160.0, + "grad_norm": 0.9342568828266821, + "language_loss": 0.60764849, + "learning_rate": 3.8926571654058715e-06, + "loss": 0.62884849, + "num_input_tokens_seen": 23273560, + "step": 1095, + "time_per_iteration": 3.0328946113586426 + }, + { + "auxiliary_loss_clip": 0.0118573, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.06013823, + "balance_loss_mlp": 1.02650654, + "epoch": 0.1317862081404437, + "flos": 23586523793280.0, + "grad_norm": 2.321033013523136, + "language_loss": 0.77013534, + "learning_rate": 3.892405253430074e-06, + "loss": 0.79236841, + "num_input_tokens_seen": 23291080, + "step": 1096, + "time_per_iteration": 2.6079819202423096 + }, + { + "auxiliary_loss_clip": 0.01206235, + "auxiliary_loss_mlp": 0.007664, + "balance_loss_clip": 1.06304145, + "balance_loss_mlp": 1.0005393, + "epoch": 0.13190645103108278, + "flos": 20260764460800.0, + "grad_norm": 1.8122958788187349, + "language_loss": 0.82487422, + "learning_rate": 3.892153054377904e-06, + "loss": 0.84460056, + "num_input_tokens_seen": 23308485, + "step": 1097, + "time_per_iteration": 2.5407423973083496 + }, + { + "auxiliary_loss_clip": 0.01055211, + "auxiliary_loss_mlp": 0.01002931, + "balance_loss_clip": 1.02085662, + "balance_loss_mlp": 0.99989074, + "epoch": 0.13202669392172187, + "flos": 53455440136320.0, + "grad_norm": 0.9901935740418132, + "language_loss": 0.59512466, + "learning_rate": 3.891900568287619e-06, + "loss": 0.61570609, + "num_input_tokens_seen": 23360870, + "step": 1098, + "time_per_iteration": 3.0702266693115234 + }, + { + "auxiliary_loss_clip": 0.0119182, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.0601213, + "balance_loss_mlp": 1.0256021, + "epoch": 0.13214693681236098, + "flos": 15851293845120.0, + "grad_norm": 2.3851610740302185, + "language_loss": 0.72198993, + "learning_rate": 3.891647795197523e-06, + "loss": 0.74428201, + "num_input_tokens_seen": 23376910, + "step": 1099, + "time_per_iteration": 2.5461232662200928 + }, + { + "auxiliary_loss_clip": 0.01194649, + "auxiliary_loss_mlp": 0.01042719, + "balance_loss_clip": 1.05772066, + "balance_loss_mlp": 1.03036296, + "epoch": 0.13226717970300006, + "flos": 19353840940800.0, + "grad_norm": 2.7518598170523583, + "language_loss": 0.68658239, + "learning_rate": 3.8913947351459605e-06, + "loss": 0.708956, + "num_input_tokens_seen": 23394450, + "step": 1100, + "time_per_iteration": 2.567164182662964 + }, + { + "auxiliary_loss_clip": 0.01237684, + "auxiliary_loss_mlp": 0.01040247, + "balance_loss_clip": 1.06825185, + "balance_loss_mlp": 1.02952456, + "epoch": 0.13238742259363914, + "flos": 20698084546560.0, + "grad_norm": 2.13996784084023, + "language_loss": 0.67788929, + "learning_rate": 3.89114138817132e-06, + "loss": 0.70066857, + "num_input_tokens_seen": 23411115, + "step": 1101, + "time_per_iteration": 2.4586291313171387 + }, + { + "auxiliary_loss_clip": 0.01221332, + "auxiliary_loss_mlp": 0.01037498, + "balance_loss_clip": 1.06773162, + "balance_loss_mlp": 1.02622724, + "epoch": 0.13250766548427825, + "flos": 21032449274880.0, + "grad_norm": 1.7722122691886268, + "language_loss": 0.84196448, + "learning_rate": 3.890887754312035e-06, + "loss": 0.86455286, + "num_input_tokens_seen": 23429360, + "step": 1102, + "time_per_iteration": 2.518289089202881 + }, + { + "auxiliary_loss_clip": 0.01194219, + "auxiliary_loss_mlp": 0.01044622, + "balance_loss_clip": 1.05565023, + "balance_loss_mlp": 1.03307652, + "epoch": 0.13262790837491734, + "flos": 22637871648000.0, + "grad_norm": 1.7355110561617315, + "language_loss": 0.87834692, + "learning_rate": 3.890633833606581e-06, + "loss": 0.90073538, + "num_input_tokens_seen": 23449050, + "step": 1103, + "time_per_iteration": 2.554748296737671 + }, + { + "auxiliary_loss_clip": 0.0121992, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.0680151, + "balance_loss_mlp": 1.02447009, + "epoch": 0.13274815126555642, + "flos": 19683141851520.0, + "grad_norm": 1.8128540616048867, + "language_loss": 0.69349158, + "learning_rate": 3.890379626093477e-06, + "loss": 0.71604407, + "num_input_tokens_seen": 23468800, + "step": 1104, + "time_per_iteration": 2.4978950023651123 + }, + { + "auxiliary_loss_clip": 0.01159758, + "auxiliary_loss_mlp": 0.0103782, + "balance_loss_clip": 1.05570912, + "balance_loss_mlp": 1.02616715, + "epoch": 0.1328683941561955, + "flos": 21317687176320.0, + "grad_norm": 3.111749095110473, + "language_loss": 0.91930997, + "learning_rate": 3.890125131811287e-06, + "loss": 0.94128579, + "num_input_tokens_seen": 23486850, + "step": 1105, + "time_per_iteration": 2.637204647064209 + }, + { + "auxiliary_loss_clip": 0.0119076, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.05631971, + "balance_loss_mlp": 1.03045964, + "epoch": 0.1329886370468346, + "flos": 13699131580800.0, + "grad_norm": 2.3441860526908522, + "language_loss": 0.75298822, + "learning_rate": 3.889870350798618e-06, + "loss": 0.77530456, + "num_input_tokens_seen": 23504195, + "step": 1106, + "time_per_iteration": 2.53411602973938 + }, + { + "auxiliary_loss_clip": 0.01236405, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.06561232, + "balance_loss_mlp": 1.02557981, + "epoch": 0.1331088799374737, + "flos": 21032413361280.0, + "grad_norm": 3.5334631757630603, + "language_loss": 0.78767294, + "learning_rate": 3.889615283094119e-06, + "loss": 0.81039953, + "num_input_tokens_seen": 23523385, + "step": 1107, + "time_per_iteration": 2.488326072692871 + }, + { + "auxiliary_loss_clip": 0.01238651, + "auxiliary_loss_mlp": 0.01038065, + "balance_loss_clip": 1.06507444, + "balance_loss_mlp": 1.02660942, + "epoch": 0.13322912282811278, + "flos": 18260432985600.0, + "grad_norm": 12.28798199617565, + "language_loss": 0.84551746, + "learning_rate": 3.889359928736485e-06, + "loss": 0.86828458, + "num_input_tokens_seen": 23541330, + "step": 1108, + "time_per_iteration": 2.4427828788757324 + }, + { + "auxiliary_loss_clip": 0.0119505, + "auxiliary_loss_mlp": 0.00765848, + "balance_loss_clip": 1.06080747, + "balance_loss_mlp": 1.00055742, + "epoch": 0.1333493657187519, + "flos": 24460876656000.0, + "grad_norm": 2.2124904287473877, + "language_loss": 0.91258901, + "learning_rate": 3.889104287764451e-06, + "loss": 0.93219805, + "num_input_tokens_seen": 23561705, + "step": 1109, + "time_per_iteration": 2.5819194316864014 + }, + { + "auxiliary_loss_clip": 0.01204228, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.06469357, + "balance_loss_mlp": 1.02904487, + "epoch": 0.13346960860939097, + "flos": 22158930677760.0, + "grad_norm": 1.9545266980181715, + "language_loss": 0.90322125, + "learning_rate": 3.888848360216798e-06, + "loss": 0.92566359, + "num_input_tokens_seen": 23579350, + "step": 1110, + "time_per_iteration": 2.5954713821411133 + }, + { + "auxiliary_loss_clip": 0.01102317, + "auxiliary_loss_mlp": 0.01006111, + "balance_loss_clip": 1.02258015, + "balance_loss_mlp": 1.00318992, + "epoch": 0.13358985150003005, + "flos": 67931212608000.0, + "grad_norm": 0.7886129773184753, + "language_loss": 0.56661195, + "learning_rate": 3.888592146132351e-06, + "loss": 0.58769619, + "num_input_tokens_seen": 23640620, + "step": 1111, + "time_per_iteration": 3.2357943058013916 + }, + { + "auxiliary_loss_clip": 0.01221541, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_clip": 1.06723166, + "balance_loss_mlp": 1.0322473, + "epoch": 0.13371009439066917, + "flos": 26834284742400.0, + "grad_norm": 1.7642046227068535, + "language_loss": 0.78301346, + "learning_rate": 3.888335645549978e-06, + "loss": 0.8056618, + "num_input_tokens_seen": 23661040, + "step": 1112, + "time_per_iteration": 2.5538580417633057 + }, + { + "auxiliary_loss_clip": 0.01236833, + "auxiliary_loss_mlp": 0.0104202, + "balance_loss_clip": 1.06859541, + "balance_loss_mlp": 1.03116035, + "epoch": 0.13383033728130825, + "flos": 26322844942080.0, + "grad_norm": 2.4308147951373145, + "language_loss": 0.81584638, + "learning_rate": 3.888078858508588e-06, + "loss": 0.83863491, + "num_input_tokens_seen": 23680900, + "step": 1113, + "time_per_iteration": 3.3245110511779785 + }, + { + "auxiliary_loss_clip": 0.01205432, + "auxiliary_loss_mlp": 0.01034992, + "balance_loss_clip": 1.06717896, + "balance_loss_mlp": 1.02373874, + "epoch": 0.13395058017194733, + "flos": 22563931501440.0, + "grad_norm": 2.0642328049777525, + "language_loss": 0.84394753, + "learning_rate": 3.8878217850471365e-06, + "loss": 0.86635178, + "num_input_tokens_seen": 23700815, + "step": 1114, + "time_per_iteration": 3.3945209980010986 + }, + { + "auxiliary_loss_clip": 0.01241144, + "auxiliary_loss_mlp": 0.01044657, + "balance_loss_clip": 1.070629, + "balance_loss_mlp": 1.03250325, + "epoch": 0.13407082306258641, + "flos": 25810938264960.0, + "grad_norm": 3.351111796965735, + "language_loss": 0.74081403, + "learning_rate": 3.887564425204621e-06, + "loss": 0.76367199, + "num_input_tokens_seen": 23722500, + "step": 1115, + "time_per_iteration": 3.3902275562286377 + }, + { + "auxiliary_loss_clip": 0.01077486, + "auxiliary_loss_mlp": 0.01003229, + "balance_loss_clip": 1.0217731, + "balance_loss_mlp": 1.00012922, + "epoch": 0.13419106595322552, + "flos": 68338365269760.0, + "grad_norm": 0.8366427288338363, + "language_loss": 0.54693097, + "learning_rate": 3.887306779020083e-06, + "loss": 0.56773812, + "num_input_tokens_seen": 23777155, + "step": 1116, + "time_per_iteration": 3.9230473041534424 + }, + { + "auxiliary_loss_clip": 0.01225442, + "auxiliary_loss_mlp": 0.0104043, + "balance_loss_clip": 1.06822824, + "balance_loss_mlp": 1.02902722, + "epoch": 0.1343113088438646, + "flos": 20449080489600.0, + "grad_norm": 2.3119638949165635, + "language_loss": 0.70564878, + "learning_rate": 3.887048846532608e-06, + "loss": 0.72830749, + "num_input_tokens_seen": 23794130, + "step": 1117, + "time_per_iteration": 2.5116889476776123 + }, + { + "auxiliary_loss_clip": 0.01086603, + "auxiliary_loss_mlp": 0.0100373, + "balance_loss_clip": 1.02205515, + "balance_loss_mlp": 1.00065422, + "epoch": 0.1344315517345037, + "flos": 67389784951680.0, + "grad_norm": 0.7552521832323286, + "language_loss": 0.58130139, + "learning_rate": 3.8867906277813224e-06, + "loss": 0.60220474, + "num_input_tokens_seen": 23852285, + "step": 1118, + "time_per_iteration": 3.0470433235168457 + }, + { + "auxiliary_loss_clip": 0.01222635, + "auxiliary_loss_mlp": 0.00765608, + "balance_loss_clip": 1.06407583, + "balance_loss_mlp": 1.00059748, + "epoch": 0.1345517946251428, + "flos": 40734442788480.0, + "grad_norm": 2.082290779517302, + "language_loss": 0.73726213, + "learning_rate": 3.886532122805399e-06, + "loss": 0.75714457, + "num_input_tokens_seen": 23874765, + "step": 1119, + "time_per_iteration": 2.6942477226257324 + }, + { + "auxiliary_loss_clip": 0.0114572, + "auxiliary_loss_mlp": 0.01036621, + "balance_loss_clip": 1.05271935, + "balance_loss_mlp": 1.02539206, + "epoch": 0.13467203751578188, + "flos": 22816850140800.0, + "grad_norm": 2.643121458873509, + "language_loss": 0.89980185, + "learning_rate": 3.886273331644053e-06, + "loss": 0.92162526, + "num_input_tokens_seen": 23893635, + "step": 1120, + "time_per_iteration": 2.7172718048095703 + }, + { + "auxiliary_loss_clip": 0.01171285, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.05976129, + "balance_loss_mlp": 1.02422523, + "epoch": 0.13479228040642097, + "flos": 17091576512640.0, + "grad_norm": 1.9054408442747877, + "language_loss": 0.82221293, + "learning_rate": 3.886014254336542e-06, + "loss": 0.84427953, + "num_input_tokens_seen": 23910110, + "step": 1121, + "time_per_iteration": 2.6267788410186768 + }, + { + "auxiliary_loss_clip": 0.01218932, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.0642643, + "balance_loss_mlp": 1.02212644, + "epoch": 0.13491252329706005, + "flos": 23730525417600.0, + "grad_norm": 1.6913692861283482, + "language_loss": 0.92491448, + "learning_rate": 3.885754890922168e-06, + "loss": 0.94743061, + "num_input_tokens_seen": 23930440, + "step": 1122, + "time_per_iteration": 2.5488367080688477 + }, + { + "auxiliary_loss_clip": 0.01130335, + "auxiliary_loss_mlp": 0.01044297, + "balance_loss_clip": 1.05392122, + "balance_loss_mlp": 1.03265631, + "epoch": 0.13503276618769916, + "flos": 34127058960000.0, + "grad_norm": 1.7944821455380489, + "language_loss": 0.78555918, + "learning_rate": 3.885495241440277e-06, + "loss": 0.80730546, + "num_input_tokens_seen": 23954535, + "step": 1123, + "time_per_iteration": 2.8178391456604004 + }, + { + "auxiliary_loss_clip": 0.01237196, + "auxiliary_loss_mlp": 0.01043684, + "balance_loss_clip": 1.06707311, + "balance_loss_mlp": 1.03272295, + "epoch": 0.13515300907833824, + "flos": 17712328377600.0, + "grad_norm": 1.7401416164413983, + "language_loss": 0.74297965, + "learning_rate": 3.885235305930257e-06, + "loss": 0.76578844, + "num_input_tokens_seen": 23972735, + "step": 1124, + "time_per_iteration": 2.449970245361328 + }, + { + "auxiliary_loss_clip": 0.0118879, + "auxiliary_loss_mlp": 0.01045192, + "balance_loss_clip": 1.06520057, + "balance_loss_mlp": 1.03297901, + "epoch": 0.13527325196897733, + "flos": 20260872201600.0, + "grad_norm": 2.10017959199904, + "language_loss": 0.85496354, + "learning_rate": 3.884975084431539e-06, + "loss": 0.87730336, + "num_input_tokens_seen": 23987685, + "step": 1125, + "time_per_iteration": 2.552387237548828 + }, + { + "auxiliary_loss_clip": 0.01213222, + "auxiliary_loss_mlp": 0.00765804, + "balance_loss_clip": 1.06406534, + "balance_loss_mlp": 1.00056016, + "epoch": 0.13539349485961644, + "flos": 18186492839040.0, + "grad_norm": 2.1944087825723373, + "language_loss": 0.91458923, + "learning_rate": 3.8847145769836e-06, + "loss": 0.93437946, + "num_input_tokens_seen": 24004105, + "step": 1126, + "time_per_iteration": 2.526672601699829 + }, + { + "auxiliary_loss_clip": 0.0123826, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.06669855, + "balance_loss_mlp": 1.0256846, + "epoch": 0.13551373775025552, + "flos": 19317463441920.0, + "grad_norm": 2.7581016787270194, + "language_loss": 0.6676085, + "learning_rate": 3.884453783625959e-06, + "loss": 0.69036174, + "num_input_tokens_seen": 24021715, + "step": 1127, + "time_per_iteration": 2.466416120529175 + }, + { + "auxiliary_loss_clip": 0.0119893, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.06273007, + "balance_loss_mlp": 1.02103591, + "epoch": 0.1356339806408946, + "flos": 20850813175680.0, + "grad_norm": 3.9049875103349447, + "language_loss": 0.84585714, + "learning_rate": 3.884192704398176e-06, + "loss": 0.86815774, + "num_input_tokens_seen": 24038915, + "step": 1128, + "time_per_iteration": 2.5350582599639893 + }, + { + "auxiliary_loss_clip": 0.01222331, + "auxiliary_loss_mlp": 0.01049818, + "balance_loss_clip": 1.06490159, + "balance_loss_mlp": 1.03941703, + "epoch": 0.13575422353153369, + "flos": 50476037696640.0, + "grad_norm": 1.636927351293132, + "language_loss": 0.74530131, + "learning_rate": 3.883931339339858e-06, + "loss": 0.76802278, + "num_input_tokens_seen": 24063300, + "step": 1129, + "time_per_iteration": 2.7794485092163086 + }, + { + "auxiliary_loss_clip": 0.01223432, + "auxiliary_loss_mlp": 0.01038911, + "balance_loss_clip": 1.06343007, + "balance_loss_mlp": 1.02725267, + "epoch": 0.1358744664221728, + "flos": 18150797698560.0, + "grad_norm": 1.7534117859437652, + "language_loss": 0.78962457, + "learning_rate": 3.883669688490654e-06, + "loss": 0.81224799, + "num_input_tokens_seen": 24081070, + "step": 1130, + "time_per_iteration": 2.4853930473327637 + }, + { + "auxiliary_loss_clip": 0.01191718, + "auxiliary_loss_mlp": 0.00765238, + "balance_loss_clip": 1.05795455, + "balance_loss_mlp": 1.00061095, + "epoch": 0.13599470931281188, + "flos": 18442966924800.0, + "grad_norm": 1.8747709922818752, + "language_loss": 0.85364306, + "learning_rate": 3.883407751890256e-06, + "loss": 0.87321258, + "num_input_tokens_seen": 24099675, + "step": 1131, + "time_per_iteration": 2.5318706035614014 + }, + { + "auxiliary_loss_clip": 0.01188661, + "auxiliary_loss_mlp": 0.01042532, + "balance_loss_clip": 1.05958772, + "balance_loss_mlp": 1.0305748, + "epoch": 0.13611495220345096, + "flos": 26680766014080.0, + "grad_norm": 1.9282243511849169, + "language_loss": 0.85753089, + "learning_rate": 3.8831455295783994e-06, + "loss": 0.87984282, + "num_input_tokens_seen": 24118925, + "step": 1132, + "time_per_iteration": 2.6496376991271973 + }, + { + "auxiliary_loss_clip": 0.01200811, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.06350863, + "balance_loss_mlp": 1.02615154, + "epoch": 0.13623519509409007, + "flos": 21686238673920.0, + "grad_norm": 1.7584708506403586, + "language_loss": 0.74239892, + "learning_rate": 3.882883021594864e-06, + "loss": 0.76477563, + "num_input_tokens_seen": 24137065, + "step": 1133, + "time_per_iteration": 2.557145595550537 + }, + { + "auxiliary_loss_clip": 0.01180803, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.06258726, + "balance_loss_mlp": 1.02361631, + "epoch": 0.13635543798472916, + "flos": 14830389492480.0, + "grad_norm": 2.237147280963237, + "language_loss": 0.86813086, + "learning_rate": 3.8826202279794705e-06, + "loss": 0.89028084, + "num_input_tokens_seen": 24154125, + "step": 1134, + "time_per_iteration": 2.5350422859191895 + }, + { + "auxiliary_loss_clip": 0.01238455, + "auxiliary_loss_mlp": 0.01034934, + "balance_loss_clip": 1.0698688, + "balance_loss_mlp": 1.02466452, + "epoch": 0.13647568087536824, + "flos": 22890323410560.0, + "grad_norm": 2.0003563310924566, + "language_loss": 0.70224953, + "learning_rate": 3.882357148772085e-06, + "loss": 0.72498345, + "num_input_tokens_seen": 24171550, + "step": 1135, + "time_per_iteration": 2.4790890216827393 + }, + { + "auxiliary_loss_clip": 0.01173569, + "auxiliary_loss_mlp": 0.01043031, + "balance_loss_clip": 1.05925655, + "balance_loss_mlp": 1.03178394, + "epoch": 0.13659592376600732, + "flos": 19937927998080.0, + "grad_norm": 2.343597310573232, + "language_loss": 0.84076083, + "learning_rate": 3.882093784012617e-06, + "loss": 0.86292684, + "num_input_tokens_seen": 24190190, + "step": 1136, + "time_per_iteration": 2.659261703491211 + }, + { + "auxiliary_loss_clip": 0.0120315, + "auxiliary_loss_mlp": 0.01035891, + "balance_loss_clip": 1.0641706, + "balance_loss_mlp": 1.02485764, + "epoch": 0.13671616665664643, + "flos": 21428579439360.0, + "grad_norm": 1.9172636174945736, + "language_loss": 0.84037709, + "learning_rate": 3.881830133741019e-06, + "loss": 0.86276758, + "num_input_tokens_seen": 24209055, + "step": 1137, + "time_per_iteration": 2.5476598739624023 + }, + { + "auxiliary_loss_clip": 0.01188257, + "auxiliary_loss_mlp": 0.01041543, + "balance_loss_clip": 1.06451476, + "balance_loss_mlp": 1.03027129, + "epoch": 0.13683640954728551, + "flos": 22778138257920.0, + "grad_norm": 2.169140424810057, + "language_loss": 0.7636587, + "learning_rate": 3.881566197997285e-06, + "loss": 0.78595662, + "num_input_tokens_seen": 24225490, + "step": 1138, + "time_per_iteration": 2.5732421875 + }, + { + "auxiliary_loss_clip": 0.01200883, + "auxiliary_loss_mlp": 0.01038425, + "balance_loss_clip": 1.06601942, + "balance_loss_mlp": 1.02788734, + "epoch": 0.1369566524379246, + "flos": 21725884310400.0, + "grad_norm": 1.4834736784490028, + "language_loss": 0.74882936, + "learning_rate": 3.881301976821456e-06, + "loss": 0.77122247, + "num_input_tokens_seen": 24245520, + "step": 1139, + "time_per_iteration": 2.5371994972229004 + }, + { + "auxiliary_loss_clip": 0.01216479, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.06543183, + "balance_loss_mlp": 1.02989233, + "epoch": 0.1370768953285637, + "flos": 18624459369600.0, + "grad_norm": 2.3394429841805437, + "language_loss": 0.90481675, + "learning_rate": 3.881037470253612e-06, + "loss": 0.92738569, + "num_input_tokens_seen": 24265035, + "step": 1140, + "time_per_iteration": 3.250687599182129 + }, + { + "auxiliary_loss_clip": 0.01173904, + "auxiliary_loss_mlp": 0.01038238, + "balance_loss_clip": 1.06142616, + "balance_loss_mlp": 1.027897, + "epoch": 0.1371971382192028, + "flos": 14939521989120.0, + "grad_norm": 2.8360532551834132, + "language_loss": 0.79317904, + "learning_rate": 3.88077267833388e-06, + "loss": 0.81530046, + "num_input_tokens_seen": 24281550, + "step": 1141, + "time_per_iteration": 3.4392950534820557 + }, + { + "auxiliary_loss_clip": 0.01167623, + "auxiliary_loss_mlp": 0.01044383, + "balance_loss_clip": 1.0572654, + "balance_loss_mlp": 1.03332603, + "epoch": 0.13731738110984187, + "flos": 19023785844480.0, + "grad_norm": 2.120510237121425, + "language_loss": 0.8374365, + "learning_rate": 3.880507601102427e-06, + "loss": 0.85955656, + "num_input_tokens_seen": 24299485, + "step": 1142, + "time_per_iteration": 4.235506057739258 + }, + { + "auxiliary_loss_clip": 0.01236295, + "auxiliary_loss_mlp": 0.01041628, + "balance_loss_clip": 1.0697273, + "balance_loss_mlp": 1.0310719, + "epoch": 0.13743762400048098, + "flos": 18187462506240.0, + "grad_norm": 1.735808045086037, + "language_loss": 0.82088411, + "learning_rate": 3.880242238599467e-06, + "loss": 0.84366333, + "num_input_tokens_seen": 24316010, + "step": 1143, + "time_per_iteration": 2.5226643085479736 + }, + { + "auxiliary_loss_clip": 0.01234991, + "auxiliary_loss_mlp": 0.01042684, + "balance_loss_clip": 1.06862211, + "balance_loss_mlp": 1.03195536, + "epoch": 0.13755786689112007, + "flos": 21031982398080.0, + "grad_norm": 1.5920762803453283, + "language_loss": 0.83119774, + "learning_rate": 3.879976590865254e-06, + "loss": 0.85397446, + "num_input_tokens_seen": 24335465, + "step": 1144, + "time_per_iteration": 2.4575960636138916 + }, + { + "auxiliary_loss_clip": 0.01205712, + "auxiliary_loss_mlp": 0.01045409, + "balance_loss_clip": 1.06664252, + "balance_loss_mlp": 1.03446603, + "epoch": 0.13767810978175915, + "flos": 21360636864000.0, + "grad_norm": 3.0504350383902974, + "language_loss": 0.86995775, + "learning_rate": 3.879710657940087e-06, + "loss": 0.89246893, + "num_input_tokens_seen": 24354415, + "step": 1145, + "time_per_iteration": 2.5459773540496826 + }, + { + "auxiliary_loss_clip": 0.01224107, + "auxiliary_loss_mlp": 0.01052231, + "balance_loss_clip": 1.06667805, + "balance_loss_mlp": 1.04044104, + "epoch": 0.13779835267239823, + "flos": 30592084861440.0, + "grad_norm": 1.7427681978460918, + "language_loss": 0.70536286, + "learning_rate": 3.879444439864308e-06, + "loss": 0.72812617, + "num_input_tokens_seen": 24373990, + "step": 1146, + "time_per_iteration": 2.5718469619750977 + }, + { + "auxiliary_loss_clip": 0.01220846, + "auxiliary_loss_mlp": 0.00765265, + "balance_loss_clip": 1.06564152, + "balance_loss_mlp": 1.00068808, + "epoch": 0.13791859556303734, + "flos": 22669867687680.0, + "grad_norm": 1.9564029846730402, + "language_loss": 0.86140037, + "learning_rate": 3.879177936678301e-06, + "loss": 0.88126147, + "num_input_tokens_seen": 24392995, + "step": 1147, + "time_per_iteration": 2.5227766036987305 + }, + { + "auxiliary_loss_clip": 0.01210328, + "auxiliary_loss_mlp": 0.01040911, + "balance_loss_clip": 1.06656027, + "balance_loss_mlp": 1.02938318, + "epoch": 0.13803883845367643, + "flos": 35224166016000.0, + "grad_norm": 3.1895566408514906, + "language_loss": 0.77125525, + "learning_rate": 3.878911148422496e-06, + "loss": 0.79376769, + "num_input_tokens_seen": 24414470, + "step": 1148, + "time_per_iteration": 2.6622581481933594 + }, + { + "auxiliary_loss_clip": 0.01221472, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.06628394, + "balance_loss_mlp": 1.02588248, + "epoch": 0.1381590813443155, + "flos": 32014542332160.0, + "grad_norm": 2.492172622950934, + "language_loss": 0.70582134, + "learning_rate": 3.878644075137364e-06, + "loss": 0.72840846, + "num_input_tokens_seen": 24435120, + "step": 1149, + "time_per_iteration": 2.6068007946014404 + }, + { + "auxiliary_loss_clip": 0.01168041, + "auxiliary_loss_mlp": 0.01037787, + "balance_loss_clip": 1.05654144, + "balance_loss_mlp": 1.02707553, + "epoch": 0.13827932423495462, + "flos": 17821855923840.0, + "grad_norm": 2.2891147006175614, + "language_loss": 0.79340219, + "learning_rate": 3.878376716863418e-06, + "loss": 0.8154605, + "num_input_tokens_seen": 24451420, + "step": 1150, + "time_per_iteration": 2.565126657485962 + }, + { + "auxiliary_loss_clip": 0.01202179, + "auxiliary_loss_mlp": 0.0104199, + "balance_loss_clip": 1.06189728, + "balance_loss_mlp": 1.03022361, + "epoch": 0.1383995671255937, + "flos": 19427098728960.0, + "grad_norm": 3.552917109639181, + "language_loss": 0.71636283, + "learning_rate": 3.878109073641219e-06, + "loss": 0.73880458, + "num_input_tokens_seen": 24470450, + "step": 1151, + "time_per_iteration": 2.5587689876556396 + }, + { + "auxiliary_loss_clip": 0.01169742, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.05990624, + "balance_loss_mlp": 1.02935886, + "epoch": 0.13851981001623279, + "flos": 28296603331200.0, + "grad_norm": 1.5395638558780678, + "language_loss": 0.81073576, + "learning_rate": 3.877841145511366e-06, + "loss": 0.83283669, + "num_input_tokens_seen": 24493190, + "step": 1152, + "time_per_iteration": 2.698010206222534 + }, + { + "auxiliary_loss_clip": 0.01225111, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.06673193, + "balance_loss_mlp": 1.03022492, + "epoch": 0.13864005290687187, + "flos": 21213079793280.0, + "grad_norm": 1.6676772457962357, + "language_loss": 0.8263678, + "learning_rate": 3.8775729325145035e-06, + "loss": 0.84903193, + "num_input_tokens_seen": 24512425, + "step": 1153, + "time_per_iteration": 2.533019781112671 + }, + { + "auxiliary_loss_clip": 0.01072597, + "auxiliary_loss_mlp": 0.01007158, + "balance_loss_clip": 1.0213964, + "balance_loss_mlp": 1.00436902, + "epoch": 0.13876029579751098, + "flos": 71653389413760.0, + "grad_norm": 0.7940597706851396, + "language_loss": 0.64798701, + "learning_rate": 3.877304434691321e-06, + "loss": 0.6687845, + "num_input_tokens_seen": 24579275, + "step": 1154, + "time_per_iteration": 3.280683994293213 + }, + { + "auxiliary_loss_clip": 0.01188129, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.06388402, + "balance_loss_mlp": 1.02328396, + "epoch": 0.13888053868815006, + "flos": 21941348042880.0, + "grad_norm": 1.7006248442540297, + "language_loss": 0.79752934, + "learning_rate": 3.877035652082548e-06, + "loss": 0.81974268, + "num_input_tokens_seen": 24598720, + "step": 1155, + "time_per_iteration": 2.6515135765075684 + }, + { + "auxiliary_loss_clip": 0.01197033, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.06401873, + "balance_loss_mlp": 1.02310228, + "epoch": 0.13900078157878915, + "flos": 19608627087360.0, + "grad_norm": 1.7928477875007955, + "language_loss": 0.85042077, + "learning_rate": 3.87676658472896e-06, + "loss": 0.87273258, + "num_input_tokens_seen": 24617530, + "step": 1156, + "time_per_iteration": 2.618703603744507 + }, + { + "auxiliary_loss_clip": 0.01220603, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.06409955, + "balance_loss_mlp": 1.03636241, + "epoch": 0.13912102446942826, + "flos": 22638051216000.0, + "grad_norm": 1.8248561797151621, + "language_loss": 0.85099757, + "learning_rate": 3.876497232671372e-06, + "loss": 0.87367523, + "num_input_tokens_seen": 24637485, + "step": 1157, + "time_per_iteration": 2.561492443084717 + }, + { + "auxiliary_loss_clip": 0.01178535, + "auxiliary_loss_mlp": 0.01038085, + "balance_loss_clip": 1.06011271, + "balance_loss_mlp": 1.02744603, + "epoch": 0.13924126736006734, + "flos": 29643324975360.0, + "grad_norm": 2.57702666429055, + "language_loss": 0.83174038, + "learning_rate": 3.876227595950647e-06, + "loss": 0.85390663, + "num_input_tokens_seen": 24656915, + "step": 1158, + "time_per_iteration": 2.730687141418457 + }, + { + "auxiliary_loss_clip": 0.01238132, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.06957912, + "balance_loss_mlp": 1.02918506, + "epoch": 0.13936151025070642, + "flos": 27417653527680.0, + "grad_norm": 2.0098098912034796, + "language_loss": 0.7897464, + "learning_rate": 3.875957674607686e-06, + "loss": 0.81253988, + "num_input_tokens_seen": 24679190, + "step": 1159, + "time_per_iteration": 2.5239720344543457 + }, + { + "auxiliary_loss_clip": 0.01212106, + "auxiliary_loss_mlp": 0.00766214, + "balance_loss_clip": 1.06094408, + "balance_loss_mlp": 1.00072503, + "epoch": 0.1394817531413455, + "flos": 16399326625920.0, + "grad_norm": 1.8681486450314568, + "language_loss": 0.88116348, + "learning_rate": 3.8756874686834386e-06, + "loss": 0.90094674, + "num_input_tokens_seen": 24697405, + "step": 1160, + "time_per_iteration": 2.5002243518829346 + }, + { + "auxiliary_loss_clip": 0.01224077, + "auxiliary_loss_mlp": 0.00766156, + "balance_loss_clip": 1.06396878, + "balance_loss_mlp": 1.0007, + "epoch": 0.13960199603198462, + "flos": 30922319525760.0, + "grad_norm": 1.6754811315738596, + "language_loss": 0.80351496, + "learning_rate": 3.875416978218893e-06, + "loss": 0.82341725, + "num_input_tokens_seen": 24720600, + "step": 1161, + "time_per_iteration": 2.607428789138794 + }, + { + "auxiliary_loss_clip": 0.0119776, + "auxiliary_loss_mlp": 0.01043113, + "balance_loss_clip": 1.05825782, + "balance_loss_mlp": 1.0313828, + "epoch": 0.1397222389226237, + "flos": 18113773754880.0, + "grad_norm": 2.3900928746731562, + "language_loss": 0.82941085, + "learning_rate": 3.8751462032550835e-06, + "loss": 0.85181957, + "num_input_tokens_seen": 24737605, + "step": 1162, + "time_per_iteration": 2.5867013931274414 + }, + { + "auxiliary_loss_clip": 0.01203861, + "auxiliary_loss_mlp": 0.01026343, + "balance_loss_clip": 1.06762207, + "balance_loss_mlp": 1.01578701, + "epoch": 0.13984248181326278, + "flos": 16872772815360.0, + "grad_norm": 2.419568095398299, + "language_loss": 0.82782382, + "learning_rate": 3.874875143833085e-06, + "loss": 0.85012591, + "num_input_tokens_seen": 24755845, + "step": 1163, + "time_per_iteration": 2.5956952571868896 + }, + { + "auxiliary_loss_clip": 0.01221303, + "auxiliary_loss_mlp": 0.01047611, + "balance_loss_clip": 1.0652492, + "balance_loss_mlp": 1.03524864, + "epoch": 0.1399627247039019, + "flos": 54121401267840.0, + "grad_norm": 1.7957526657167664, + "language_loss": 0.68963742, + "learning_rate": 3.874603799994019e-06, + "loss": 0.71232653, + "num_input_tokens_seen": 24779380, + "step": 1164, + "time_per_iteration": 2.8152964115142822 + }, + { + "auxiliary_loss_clip": 0.01183555, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.06073642, + "balance_loss_mlp": 1.02495253, + "epoch": 0.14008296759454097, + "flos": 11765521618560.0, + "grad_norm": 1.9232594407264896, + "language_loss": 0.87037867, + "learning_rate": 3.874332171779046e-06, + "loss": 0.8925696, + "num_input_tokens_seen": 24794260, + "step": 1165, + "time_per_iteration": 2.549051523208618 + }, + { + "auxiliary_loss_clip": 0.01182838, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.05728924, + "balance_loss_mlp": 1.02045274, + "epoch": 0.14020321048518006, + "flos": 22017514832640.0, + "grad_norm": 1.6293578988964441, + "language_loss": 0.75747234, + "learning_rate": 3.874060259229373e-06, + "loss": 0.77961642, + "num_input_tokens_seen": 24815835, + "step": 1166, + "time_per_iteration": 2.6416914463043213 + }, + { + "auxiliary_loss_clip": 0.01224573, + "auxiliary_loss_mlp": 0.01045061, + "balance_loss_clip": 1.06813753, + "balance_loss_mlp": 1.0336709, + "epoch": 0.14032345337581917, + "flos": 23404313076480.0, + "grad_norm": 2.3030493116747954, + "language_loss": 0.94009602, + "learning_rate": 3.873788062386249e-06, + "loss": 0.96279234, + "num_input_tokens_seen": 24834095, + "step": 1167, + "time_per_iteration": 3.3002519607543945 + }, + { + "auxiliary_loss_clip": 0.01194636, + "auxiliary_loss_mlp": 0.0104129, + "balance_loss_clip": 1.06490302, + "balance_loss_mlp": 1.03018558, + "epoch": 0.14044369626645825, + "flos": 29645767100160.0, + "grad_norm": 2.21392767241256, + "language_loss": 0.82136649, + "learning_rate": 3.873515581290965e-06, + "loss": 0.84372568, + "num_input_tokens_seen": 24858900, + "step": 1168, + "time_per_iteration": 5.074455738067627 + }, + { + "auxiliary_loss_clip": 0.01191142, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.06443453, + "balance_loss_mlp": 1.02772212, + "epoch": 0.14056393915709733, + "flos": 18332972501760.0, + "grad_norm": 1.9987301038753993, + "language_loss": 0.75276655, + "learning_rate": 3.8732428159848575e-06, + "loss": 0.77507007, + "num_input_tokens_seen": 24877875, + "step": 1169, + "time_per_iteration": 2.5681302547454834 + }, + { + "auxiliary_loss_clip": 0.01223066, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.07039797, + "balance_loss_mlp": 1.02499175, + "epoch": 0.14068418204773642, + "flos": 26687517770880.0, + "grad_norm": 2.668535904164936, + "language_loss": 0.7796973, + "learning_rate": 3.872969766509304e-06, + "loss": 0.80229259, + "num_input_tokens_seen": 24898430, + "step": 1170, + "time_per_iteration": 2.5395376682281494 + }, + { + "auxiliary_loss_clip": 0.01083324, + "auxiliary_loss_mlp": 0.0100841, + "balance_loss_clip": 1.02474809, + "balance_loss_mlp": 1.00531089, + "epoch": 0.14080442493837553, + "flos": 65259314501760.0, + "grad_norm": 0.7692529560972263, + "language_loss": 0.55633152, + "learning_rate": 3.872696432905726e-06, + "loss": 0.57724887, + "num_input_tokens_seen": 24959250, + "step": 1171, + "time_per_iteration": 3.123422145843506 + }, + { + "auxiliary_loss_clip": 0.01224567, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.06395519, + "balance_loss_mlp": 1.03609908, + "epoch": 0.1409246678290146, + "flos": 25776715582080.0, + "grad_norm": 2.680063613099516, + "language_loss": 0.72040093, + "learning_rate": 3.872422815215589e-06, + "loss": 0.74312449, + "num_input_tokens_seen": 24978330, + "step": 1172, + "time_per_iteration": 2.536289930343628 + }, + { + "auxiliary_loss_clip": 0.01215429, + "auxiliary_loss_mlp": 0.01043698, + "balance_loss_clip": 1.06078637, + "balance_loss_mlp": 1.03079987, + "epoch": 0.1410449107196537, + "flos": 21868521217920.0, + "grad_norm": 2.1264540056980152, + "language_loss": 0.74402428, + "learning_rate": 3.8721489134803994e-06, + "loss": 0.76661551, + "num_input_tokens_seen": 24997120, + "step": 1173, + "time_per_iteration": 2.4878859519958496 + }, + { + "auxiliary_loss_clip": 0.01219223, + "auxiliary_loss_mlp": 0.01048034, + "balance_loss_clip": 1.06575596, + "balance_loss_mlp": 1.0358212, + "epoch": 0.1411651536102928, + "flos": 16684133564160.0, + "grad_norm": 2.822580385762408, + "language_loss": 0.72143579, + "learning_rate": 3.871874727741707e-06, + "loss": 0.74410844, + "num_input_tokens_seen": 25014350, + "step": 1174, + "time_per_iteration": 2.485135555267334 + }, + { + "auxiliary_loss_clip": 0.01220545, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.06949711, + "balance_loss_mlp": 1.03071427, + "epoch": 0.1412853965009319, + "flos": 20992264934400.0, + "grad_norm": 1.7682710463969376, + "language_loss": 0.96732414, + "learning_rate": 3.871600258041108e-06, + "loss": 0.98994291, + "num_input_tokens_seen": 25033875, + "step": 1175, + "time_per_iteration": 2.5142874717712402 + }, + { + "auxiliary_loss_clip": 0.01200905, + "auxiliary_loss_mlp": 0.01042535, + "balance_loss_clip": 1.06056917, + "balance_loss_mlp": 1.03027463, + "epoch": 0.14140563939157097, + "flos": 20335279224960.0, + "grad_norm": 2.1530729372880177, + "language_loss": 0.8553369, + "learning_rate": 3.871325504420238e-06, + "loss": 0.87777126, + "num_input_tokens_seen": 25052865, + "step": 1176, + "time_per_iteration": 2.5280869007110596 + }, + { + "auxiliary_loss_clip": 0.01237256, + "auxiliary_loss_mlp": 0.01034424, + "balance_loss_clip": 1.06881845, + "balance_loss_mlp": 1.02368927, + "epoch": 0.14152588228221005, + "flos": 21068826773760.0, + "grad_norm": 1.777803124425818, + "language_loss": 0.81958461, + "learning_rate": 3.871050466920776e-06, + "loss": 0.84230149, + "num_input_tokens_seen": 25072770, + "step": 1177, + "time_per_iteration": 2.5474722385406494 + }, + { + "auxiliary_loss_clip": 0.01182558, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.06059182, + "balance_loss_mlp": 1.01952696, + "epoch": 0.14164612517284916, + "flos": 18223157646720.0, + "grad_norm": 2.114834645084853, + "language_loss": 0.80007577, + "learning_rate": 3.870775145584447e-06, + "loss": 0.82220423, + "num_input_tokens_seen": 25090550, + "step": 1178, + "time_per_iteration": 2.564645767211914 + }, + { + "auxiliary_loss_clip": 0.01211538, + "auxiliary_loss_mlp": 0.01044165, + "balance_loss_clip": 1.06327605, + "balance_loss_mlp": 1.03226209, + "epoch": 0.14176636806348825, + "flos": 22744454279040.0, + "grad_norm": 2.34870463438838, + "language_loss": 0.64602727, + "learning_rate": 3.8704995404530145e-06, + "loss": 0.66858429, + "num_input_tokens_seen": 25106175, + "step": 1179, + "time_per_iteration": 2.5334203243255615 + }, + { + "auxiliary_loss_clip": 0.01236054, + "auxiliary_loss_mlp": 0.01041291, + "balance_loss_clip": 1.0688889, + "balance_loss_mlp": 1.03072309, + "epoch": 0.14188661095412733, + "flos": 22091095843200.0, + "grad_norm": 1.8369064641049186, + "language_loss": 0.848939, + "learning_rate": 3.87022365156829e-06, + "loss": 0.87171245, + "num_input_tokens_seen": 25126890, + "step": 1180, + "time_per_iteration": 2.4762914180755615 + }, + { + "auxiliary_loss_clip": 0.01144146, + "auxiliary_loss_mlp": 0.01040179, + "balance_loss_clip": 1.05697668, + "balance_loss_mlp": 1.02902699, + "epoch": 0.14200685384476644, + "flos": 24352390604160.0, + "grad_norm": 2.0319561021373054, + "language_loss": 0.81166542, + "learning_rate": 3.869947478972123e-06, + "loss": 0.83350867, + "num_input_tokens_seen": 25147915, + "step": 1181, + "time_per_iteration": 2.713775873184204 + }, + { + "auxiliary_loss_clip": 0.01216657, + "auxiliary_loss_mlp": 0.01040142, + "balance_loss_clip": 1.06529903, + "balance_loss_mlp": 1.02811968, + "epoch": 0.14212709673540552, + "flos": 24022048199040.0, + "grad_norm": 1.8303548135535197, + "language_loss": 0.82471538, + "learning_rate": 3.869671022706412e-06, + "loss": 0.84728342, + "num_input_tokens_seen": 25166645, + "step": 1182, + "time_per_iteration": 2.5352203845977783 + }, + { + "auxiliary_loss_clip": 0.01160737, + "auxiliary_loss_mlp": 0.01043103, + "balance_loss_clip": 1.05628657, + "balance_loss_mlp": 1.0320766, + "epoch": 0.1422473396260446, + "flos": 26431797870720.0, + "grad_norm": 2.534389211596621, + "language_loss": 0.64966995, + "learning_rate": 3.869394282813092e-06, + "loss": 0.67170835, + "num_input_tokens_seen": 25185845, + "step": 1183, + "time_per_iteration": 2.649069309234619 + }, + { + "auxiliary_loss_clip": 0.01196131, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.06055903, + "balance_loss_mlp": 1.03297305, + "epoch": 0.1423675825166837, + "flos": 17055306754560.0, + "grad_norm": 2.3156726046805107, + "language_loss": 0.89207506, + "learning_rate": 3.869117259334147e-06, + "loss": 0.91447854, + "num_input_tokens_seen": 25203770, + "step": 1184, + "time_per_iteration": 2.5571448802948 + }, + { + "auxiliary_loss_clip": 0.01215838, + "auxiliary_loss_mlp": 0.0104153, + "balance_loss_clip": 1.06422067, + "balance_loss_mlp": 1.03066969, + "epoch": 0.1424878254073228, + "flos": 17929480049280.0, + "grad_norm": 5.243402136095736, + "language_loss": 0.81980872, + "learning_rate": 3.868839952311599e-06, + "loss": 0.84238237, + "num_input_tokens_seen": 25221725, + "step": 1185, + "time_per_iteration": 2.4902172088623047 + }, + { + "auxiliary_loss_clip": 0.01199618, + "auxiliary_loss_mlp": 0.01039169, + "balance_loss_clip": 1.06361175, + "balance_loss_mlp": 1.02756989, + "epoch": 0.14260806829796188, + "flos": 20303606407680.0, + "grad_norm": 2.1555209458345383, + "language_loss": 0.80527294, + "learning_rate": 3.868562361787516e-06, + "loss": 0.8276608, + "num_input_tokens_seen": 25240855, + "step": 1186, + "time_per_iteration": 2.527496576309204 + }, + { + "auxiliary_loss_clip": 0.01136735, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.05275941, + "balance_loss_mlp": 1.02049327, + "epoch": 0.14272831118860096, + "flos": 23185724860800.0, + "grad_norm": 1.9851710034564212, + "language_loss": 0.68652701, + "learning_rate": 3.868284487804009e-06, + "loss": 0.70820713, + "num_input_tokens_seen": 25260085, + "step": 1187, + "time_per_iteration": 2.688676118850708 + }, + { + "auxiliary_loss_clip": 0.01209878, + "auxiliary_loss_mlp": 0.01045394, + "balance_loss_clip": 1.0623908, + "balance_loss_mlp": 1.03433716, + "epoch": 0.14284855407924008, + "flos": 27232210586880.0, + "grad_norm": 2.1814130483521486, + "language_loss": 0.77922791, + "learning_rate": 3.86800633040323e-06, + "loss": 0.80178064, + "num_input_tokens_seen": 25280675, + "step": 1188, + "time_per_iteration": 2.5793399810791016 + }, + { + "auxiliary_loss_clip": 0.01204259, + "auxiliary_loss_mlp": 0.0076566, + "balance_loss_clip": 1.06607389, + "balance_loss_mlp": 1.00074816, + "epoch": 0.14296879696987916, + "flos": 28184202696960.0, + "grad_norm": 2.185564566697635, + "language_loss": 0.78287804, + "learning_rate": 3.867727889627376e-06, + "loss": 0.80257726, + "num_input_tokens_seen": 25300290, + "step": 1189, + "time_per_iteration": 2.608096122741699 + }, + { + "auxiliary_loss_clip": 0.01179034, + "auxiliary_loss_mlp": 0.0104272, + "balance_loss_clip": 1.06012619, + "balance_loss_mlp": 1.03060222, + "epoch": 0.14308903986051824, + "flos": 19390290266880.0, + "grad_norm": 2.0995517138331863, + "language_loss": 0.78328741, + "learning_rate": 3.867449165518687e-06, + "loss": 0.80550492, + "num_input_tokens_seen": 25316760, + "step": 1190, + "time_per_iteration": 2.5540404319763184 + }, + { + "auxiliary_loss_clip": 0.0124005, + "auxiliary_loss_mlp": 0.0076634, + "balance_loss_clip": 1.06960392, + "balance_loss_mlp": 1.00069129, + "epoch": 0.14320928275115732, + "flos": 17457506317440.0, + "grad_norm": 3.0322452802989575, + "language_loss": 0.71030748, + "learning_rate": 3.867170158119444e-06, + "loss": 0.73037136, + "num_input_tokens_seen": 25335760, + "step": 1191, + "time_per_iteration": 2.440479278564453 + }, + { + "auxiliary_loss_clip": 0.01238486, + "auxiliary_loss_mlp": 0.01039159, + "balance_loss_clip": 1.0683471, + "balance_loss_mlp": 1.02831078, + "epoch": 0.14332952564179643, + "flos": 21466070259840.0, + "grad_norm": 1.8885692454618934, + "language_loss": 0.7556231, + "learning_rate": 3.866890867471972e-06, + "loss": 0.77839953, + "num_input_tokens_seen": 25354230, + "step": 1192, + "time_per_iteration": 2.452528238296509 + }, + { + "auxiliary_loss_clip": 0.01199918, + "auxiliary_loss_mlp": 0.01046284, + "balance_loss_clip": 1.05783772, + "balance_loss_mlp": 1.03444672, + "epoch": 0.14344976853243552, + "flos": 16396992241920.0, + "grad_norm": 3.1640017634757673, + "language_loss": 0.89643347, + "learning_rate": 3.86661129361864e-06, + "loss": 0.91889554, + "num_input_tokens_seen": 25368720, + "step": 1193, + "time_per_iteration": 3.1725125312805176 + }, + { + "auxiliary_loss_clip": 0.01202547, + "auxiliary_loss_mlp": 0.01046682, + "balance_loss_clip": 1.0641861, + "balance_loss_mlp": 1.03494573, + "epoch": 0.1435700114230746, + "flos": 18916736336640.0, + "grad_norm": 2.2298320029315963, + "language_loss": 0.85916913, + "learning_rate": 3.866331436601859e-06, + "loss": 0.88166142, + "num_input_tokens_seen": 25386715, + "step": 1194, + "time_per_iteration": 2.5122265815734863 + }, + { + "auxiliary_loss_clip": 0.01236349, + "auxiliary_loss_mlp": 0.01039862, + "balance_loss_clip": 1.06757605, + "balance_loss_mlp": 1.0288825, + "epoch": 0.1436902543137137, + "flos": 19755394058880.0, + "grad_norm": 2.747894320506151, + "language_loss": 0.73746866, + "learning_rate": 3.866051296464083e-06, + "loss": 0.76023078, + "num_input_tokens_seen": 25405550, + "step": 1195, + "time_per_iteration": 4.796572923660278 + }, + { + "auxiliary_loss_clip": 0.0123576, + "auxiliary_loss_mlp": 0.00765587, + "balance_loss_clip": 1.06525993, + "balance_loss_mlp": 1.00069296, + "epoch": 0.1438104972043528, + "flos": 14684807669760.0, + "grad_norm": 3.4899051471487352, + "language_loss": 0.85142255, + "learning_rate": 3.86577087324781e-06, + "loss": 0.871436, + "num_input_tokens_seen": 25422040, + "step": 1196, + "time_per_iteration": 2.4358937740325928 + }, + { + "auxiliary_loss_clip": 0.01218682, + "auxiliary_loss_mlp": 0.01034443, + "balance_loss_clip": 1.06715035, + "balance_loss_mlp": 1.02408373, + "epoch": 0.14393074009499188, + "flos": 17092330698240.0, + "grad_norm": 2.000821536628753, + "language_loss": 0.77450216, + "learning_rate": 3.865490166995578e-06, + "loss": 0.79703349, + "num_input_tokens_seen": 25440270, + "step": 1197, + "time_per_iteration": 2.488290309906006 + }, + { + "auxiliary_loss_clip": 0.01219646, + "auxiliary_loss_mlp": 0.01043317, + "balance_loss_clip": 1.06545663, + "balance_loss_mlp": 1.03170621, + "epoch": 0.144050982985631, + "flos": 30476200608000.0, + "grad_norm": 5.912182645450796, + "language_loss": 0.83969343, + "learning_rate": 3.86520917774997e-06, + "loss": 0.86232305, + "num_input_tokens_seen": 25459705, + "step": 1198, + "time_per_iteration": 2.577462673187256 + }, + { + "auxiliary_loss_clip": 0.01217465, + "auxiliary_loss_mlp": 0.01045483, + "balance_loss_clip": 1.06616902, + "balance_loss_mlp": 1.03486705, + "epoch": 0.14417122587627007, + "flos": 17858484817920.0, + "grad_norm": 2.121738109178696, + "language_loss": 0.75049132, + "learning_rate": 3.864927905553614e-06, + "loss": 0.77312076, + "num_input_tokens_seen": 25477615, + "step": 1199, + "time_per_iteration": 2.4995386600494385 + }, + { + "auxiliary_loss_clip": 0.0118451, + "auxiliary_loss_mlp": 0.01041167, + "balance_loss_clip": 1.06020021, + "balance_loss_mlp": 1.03049195, + "epoch": 0.14429146876690915, + "flos": 21613914639360.0, + "grad_norm": 1.5594926221011305, + "language_loss": 0.88717782, + "learning_rate": 3.8646463504491765e-06, + "loss": 0.90943456, + "num_input_tokens_seen": 25497750, + "step": 1200, + "time_per_iteration": 2.6007468700408936 + }, + { + "auxiliary_loss_clip": 0.01223934, + "auxiliary_loss_mlp": 0.01041403, + "balance_loss_clip": 1.06885743, + "balance_loss_mlp": 1.02953005, + "epoch": 0.14441171165754824, + "flos": 23258120722560.0, + "grad_norm": 1.684494696861512, + "language_loss": 0.83516902, + "learning_rate": 3.8643645124793705e-06, + "loss": 0.85782242, + "num_input_tokens_seen": 25516650, + "step": 1201, + "time_per_iteration": 2.524254560470581 + }, + { + "auxiliary_loss_clip": 0.01216796, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.06313396, + "balance_loss_mlp": 1.02512884, + "epoch": 0.14453195454818735, + "flos": 42854213963520.0, + "grad_norm": 1.6386836242952605, + "language_loss": 0.7474972, + "learning_rate": 3.8640823916869515e-06, + "loss": 0.77002525, + "num_input_tokens_seen": 25540960, + "step": 1202, + "time_per_iteration": 2.71562123298645 + }, + { + "auxiliary_loss_clip": 0.01234516, + "auxiliary_loss_mlp": 0.01037562, + "balance_loss_clip": 1.06678104, + "balance_loss_mlp": 1.02663064, + "epoch": 0.14465219743882643, + "flos": 27235873774080.0, + "grad_norm": 1.4632616301313475, + "language_loss": 0.78389978, + "learning_rate": 3.863799988114714e-06, + "loss": 0.8066206, + "num_input_tokens_seen": 25562990, + "step": 1203, + "time_per_iteration": 2.4873383045196533 + }, + { + "auxiliary_loss_clip": 0.01238803, + "auxiliary_loss_mlp": 0.01037359, + "balance_loss_clip": 1.06743598, + "balance_loss_mlp": 1.0256232, + "epoch": 0.1447724403294655, + "flos": 16690705752960.0, + "grad_norm": 2.119686766688216, + "language_loss": 0.70934367, + "learning_rate": 3.863517301805502e-06, + "loss": 0.73210531, + "num_input_tokens_seen": 25581380, + "step": 1204, + "time_per_iteration": 2.4140000343322754 + }, + { + "auxiliary_loss_clip": 0.01191431, + "auxiliary_loss_mlp": 0.01042191, + "balance_loss_clip": 1.06454289, + "balance_loss_mlp": 1.03108716, + "epoch": 0.14489268322010462, + "flos": 20073741321600.0, + "grad_norm": 2.244363131655473, + "language_loss": 0.9700129, + "learning_rate": 3.863234332802196e-06, + "loss": 0.99234909, + "num_input_tokens_seen": 25593585, + "step": 1205, + "time_per_iteration": 2.527106761932373 + }, + { + "auxiliary_loss_clip": 0.01199505, + "auxiliary_loss_mlp": 0.01043824, + "balance_loss_clip": 1.06077623, + "balance_loss_mlp": 1.03361988, + "epoch": 0.1450129261107437, + "flos": 27125627955840.0, + "grad_norm": 2.072256926202117, + "language_loss": 0.74194157, + "learning_rate": 3.862951081147723e-06, + "loss": 0.76437485, + "num_input_tokens_seen": 25613750, + "step": 1206, + "time_per_iteration": 2.5793206691741943 + }, + { + "auxiliary_loss_clip": 0.01222402, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.06910169, + "balance_loss_mlp": 1.02767849, + "epoch": 0.1451331690013828, + "flos": 25702344472320.0, + "grad_norm": 2.166097566992728, + "language_loss": 0.78196311, + "learning_rate": 3.862667546885053e-06, + "loss": 0.80456865, + "num_input_tokens_seen": 25632300, + "step": 1207, + "time_per_iteration": 2.5124764442443848 + }, + { + "auxiliary_loss_clip": 0.01208844, + "auxiliary_loss_mlp": 0.01042076, + "balance_loss_clip": 1.0621624, + "balance_loss_mlp": 1.03098404, + "epoch": 0.14525341189202187, + "flos": 25737393168000.0, + "grad_norm": 2.8927668863296434, + "language_loss": 0.73480141, + "learning_rate": 3.8623837300571965e-06, + "loss": 0.75731063, + "num_input_tokens_seen": 25651285, + "step": 1208, + "time_per_iteration": 2.558812141418457 + }, + { + "auxiliary_loss_clip": 0.01239395, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.06847858, + "balance_loss_mlp": 1.02670574, + "epoch": 0.14537365478266098, + "flos": 23073898844160.0, + "grad_norm": 2.3159825453378104, + "language_loss": 0.84027344, + "learning_rate": 3.8620996307072085e-06, + "loss": 0.86305237, + "num_input_tokens_seen": 25671990, + "step": 1209, + "time_per_iteration": 2.4595062732696533 + }, + { + "auxiliary_loss_clip": 0.01190349, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.05816972, + "balance_loss_mlp": 1.02690423, + "epoch": 0.14549389767330007, + "flos": 20595021448320.0, + "grad_norm": 2.0180492916964163, + "language_loss": 0.64805627, + "learning_rate": 3.861815248878188e-06, + "loss": 0.67034364, + "num_input_tokens_seen": 25689475, + "step": 1210, + "time_per_iteration": 2.6118834018707275 + }, + { + "auxiliary_loss_clip": 0.01202955, + "auxiliary_loss_mlp": 0.0104213, + "balance_loss_clip": 1.06558061, + "balance_loss_mlp": 1.03143072, + "epoch": 0.14561414056393915, + "flos": 15121804533120.0, + "grad_norm": 7.872042613220004, + "language_loss": 0.79568207, + "learning_rate": 3.861530584613274e-06, + "loss": 0.818133, + "num_input_tokens_seen": 25707475, + "step": 1211, + "time_per_iteration": 2.500601291656494 + }, + { + "auxiliary_loss_clip": 0.01223761, + "auxiliary_loss_mlp": 0.00765748, + "balance_loss_clip": 1.06743288, + "balance_loss_mlp": 1.00066257, + "epoch": 0.14573438345457826, + "flos": 19427493778560.0, + "grad_norm": 2.537843080003902, + "language_loss": 0.82257831, + "learning_rate": 3.86124563795565e-06, + "loss": 0.84247339, + "num_input_tokens_seen": 25726290, + "step": 1212, + "time_per_iteration": 2.5543155670166016 + }, + { + "auxiliary_loss_clip": 0.01235791, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.06805205, + "balance_loss_mlp": 1.02629113, + "epoch": 0.14585462634521734, + "flos": 24828422572800.0, + "grad_norm": 1.5638722416958215, + "language_loss": 0.69697559, + "learning_rate": 3.860960408948543e-06, + "loss": 0.71970487, + "num_input_tokens_seen": 25748040, + "step": 1213, + "time_per_iteration": 2.54801082611084 + }, + { + "auxiliary_loss_clip": 0.01211347, + "auxiliary_loss_mlp": 0.01042753, + "balance_loss_clip": 1.06682181, + "balance_loss_mlp": 1.03235173, + "epoch": 0.14597486923585642, + "flos": 15448627405440.0, + "grad_norm": 2.259079129281128, + "language_loss": 0.89562082, + "learning_rate": 3.860674897635222e-06, + "loss": 0.91816187, + "num_input_tokens_seen": 25764525, + "step": 1214, + "time_per_iteration": 2.4703872203826904 + }, + { + "auxiliary_loss_clip": 0.01219655, + "auxiliary_loss_mlp": 0.01044721, + "balance_loss_clip": 1.0675081, + "balance_loss_mlp": 1.03336, + "epoch": 0.1460951121264955, + "flos": 16655154266880.0, + "grad_norm": 2.297203183708753, + "language_loss": 0.83523953, + "learning_rate": 3.860389104058998e-06, + "loss": 0.85788333, + "num_input_tokens_seen": 25782755, + "step": 1215, + "time_per_iteration": 2.4570188522338867 + }, + { + "auxiliary_loss_clip": 0.01201195, + "auxiliary_loss_mlp": 0.01036423, + "balance_loss_clip": 1.06182504, + "balance_loss_mlp": 1.02564597, + "epoch": 0.14621535501713462, + "flos": 24863291700480.0, + "grad_norm": 2.3335926360724994, + "language_loss": 0.72447252, + "learning_rate": 3.860103028263227e-06, + "loss": 0.7468487, + "num_input_tokens_seen": 25805860, + "step": 1216, + "time_per_iteration": 2.672964334487915 + }, + { + "auxiliary_loss_clip": 0.01165322, + "auxiliary_loss_mlp": 0.01037131, + "balance_loss_clip": 1.05382276, + "balance_loss_mlp": 1.02648544, + "epoch": 0.1463355979077737, + "flos": 25228000442880.0, + "grad_norm": 2.534208305261599, + "language_loss": 0.70284522, + "learning_rate": 3.859816670291304e-06, + "loss": 0.72486973, + "num_input_tokens_seen": 25824955, + "step": 1217, + "time_per_iteration": 2.6387124061584473 + }, + { + "auxiliary_loss_clip": 0.01151416, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.05742192, + "balance_loss_mlp": 1.02283192, + "epoch": 0.14645584079841278, + "flos": 22054143726720.0, + "grad_norm": 1.9536479916081164, + "language_loss": 0.89766669, + "learning_rate": 3.859530030186672e-06, + "loss": 0.91952968, + "num_input_tokens_seen": 25841965, + "step": 1218, + "time_per_iteration": 2.6553542613983154 + }, + { + "auxiliary_loss_clip": 0.01208614, + "auxiliary_loss_mlp": 0.01037363, + "balance_loss_clip": 1.06678593, + "balance_loss_mlp": 1.02628243, + "epoch": 0.1465760836890519, + "flos": 23623870959360.0, + "grad_norm": 2.468397128433994, + "language_loss": 0.82533026, + "learning_rate": 3.859243107992813e-06, + "loss": 0.84779006, + "num_input_tokens_seen": 25860770, + "step": 1219, + "time_per_iteration": 2.5393550395965576 + }, + { + "auxiliary_loss_clip": 0.01189126, + "auxiliary_loss_mlp": 0.01041412, + "balance_loss_clip": 1.05678833, + "balance_loss_mlp": 1.02960467, + "epoch": 0.14669632657969098, + "flos": 37407893356800.0, + "grad_norm": 5.754460645409684, + "language_loss": 0.78293109, + "learning_rate": 3.858955903753252e-06, + "loss": 0.80523646, + "num_input_tokens_seen": 25879410, + "step": 1220, + "time_per_iteration": 3.4318995475769043 + }, + { + "auxiliary_loss_clip": 0.0121894, + "auxiliary_loss_mlp": 0.01040368, + "balance_loss_clip": 1.06270063, + "balance_loss_mlp": 1.03040838, + "epoch": 0.14681656947033006, + "flos": 28365910623360.0, + "grad_norm": 1.5867826537853151, + "language_loss": 0.83572161, + "learning_rate": 3.858668417511559e-06, + "loss": 0.85831463, + "num_input_tokens_seen": 25902160, + "step": 1221, + "time_per_iteration": 4.116295099258423 + }, + { + "auxiliary_loss_clip": 0.0120978, + "auxiliary_loss_mlp": 0.01032418, + "balance_loss_clip": 1.06734848, + "balance_loss_mlp": 1.02097368, + "epoch": 0.14693681236096917, + "flos": 18479488078080.0, + "grad_norm": 3.213443327373654, + "language_loss": 0.76555955, + "learning_rate": 3.8583806493113445e-06, + "loss": 0.78798151, + "num_input_tokens_seen": 25920505, + "step": 1222, + "time_per_iteration": 3.300384044647217 + }, + { + "auxiliary_loss_clip": 0.01217569, + "auxiliary_loss_mlp": 0.01046093, + "balance_loss_clip": 1.06664085, + "balance_loss_mlp": 1.03537059, + "epoch": 0.14705705525160825, + "flos": 20777806782720.0, + "grad_norm": 2.3996111244555127, + "language_loss": 0.82174957, + "learning_rate": 3.858092599196263e-06, + "loss": 0.84438616, + "num_input_tokens_seen": 25938460, + "step": 1223, + "time_per_iteration": 2.4978649616241455 + }, + { + "auxiliary_loss_clip": 0.01220656, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.06657898, + "balance_loss_mlp": 1.02106345, + "epoch": 0.14717729814224734, + "flos": 29932944336000.0, + "grad_norm": 2.3575179055520663, + "language_loss": 0.82215071, + "learning_rate": 3.857804267210012e-06, + "loss": 0.84467417, + "num_input_tokens_seen": 25957760, + "step": 1224, + "time_per_iteration": 2.5538618564605713 + }, + { + "auxiliary_loss_clip": 0.01173398, + "auxiliary_loss_mlp": 0.01044887, + "balance_loss_clip": 1.05502439, + "balance_loss_mlp": 1.03392625, + "epoch": 0.14729754103288642, + "flos": 20047491457920.0, + "grad_norm": 2.127987623321911, + "language_loss": 0.88153243, + "learning_rate": 3.857515653396331e-06, + "loss": 0.90371525, + "num_input_tokens_seen": 25974970, + "step": 1225, + "time_per_iteration": 2.542593240737915 + }, + { + "auxiliary_loss_clip": 0.01175298, + "auxiliary_loss_mlp": 0.01041048, + "balance_loss_clip": 1.06003523, + "balance_loss_mlp": 1.03079581, + "epoch": 0.14741778392352553, + "flos": 19281516906240.0, + "grad_norm": 2.399221489198223, + "language_loss": 0.86707854, + "learning_rate": 3.857226757799002e-06, + "loss": 0.88924205, + "num_input_tokens_seen": 25992525, + "step": 1226, + "time_per_iteration": 2.5893795490264893 + }, + { + "auxiliary_loss_clip": 0.01202618, + "auxiliary_loss_mlp": 0.01037672, + "balance_loss_clip": 1.06158543, + "balance_loss_mlp": 1.02660966, + "epoch": 0.1475380268141646, + "flos": 25411108999680.0, + "grad_norm": 2.289080394163355, + "language_loss": 0.73733699, + "learning_rate": 3.85693758046185e-06, + "loss": 0.75973988, + "num_input_tokens_seen": 26010815, + "step": 1227, + "time_per_iteration": 2.564051389694214 + }, + { + "auxiliary_loss_clip": 0.01236473, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_clip": 1.06965494, + "balance_loss_mlp": 1.03528833, + "epoch": 0.1476582697048037, + "flos": 20847652778880.0, + "grad_norm": 3.0312883156680557, + "language_loss": 0.82702559, + "learning_rate": 3.8566481214287435e-06, + "loss": 0.8498469, + "num_input_tokens_seen": 26028935, + "step": 1228, + "time_per_iteration": 2.4664509296417236 + }, + { + "auxiliary_loss_clip": 0.01179986, + "auxiliary_loss_mlp": 0.01040099, + "balance_loss_clip": 1.05766511, + "balance_loss_mlp": 1.02864277, + "epoch": 0.1477785125954428, + "flos": 14028109269120.0, + "grad_norm": 2.374617301996417, + "language_loss": 0.90799755, + "learning_rate": 3.8563583807435935e-06, + "loss": 0.93019837, + "num_input_tokens_seen": 26045080, + "step": 1229, + "time_per_iteration": 2.5458686351776123 + }, + { + "auxiliary_loss_clip": 0.01220189, + "auxiliary_loss_mlp": 0.00765419, + "balance_loss_clip": 1.06354225, + "balance_loss_mlp": 1.00070977, + "epoch": 0.1478987554860819, + "flos": 20516699842560.0, + "grad_norm": 2.740657182590516, + "language_loss": 0.77919376, + "learning_rate": 3.856068358450353e-06, + "loss": 0.79904985, + "num_input_tokens_seen": 26065030, + "step": 1230, + "time_per_iteration": 2.500091791152954 + }, + { + "auxiliary_loss_clip": 0.01201557, + "auxiliary_loss_mlp": 0.01045983, + "balance_loss_clip": 1.06754053, + "balance_loss_mlp": 1.034688, + "epoch": 0.14801899837672097, + "flos": 17857012360320.0, + "grad_norm": 1.735062418170167, + "language_loss": 0.85593832, + "learning_rate": 3.8557780545930186e-06, + "loss": 0.87841374, + "num_input_tokens_seen": 26083445, + "step": 1231, + "time_per_iteration": 2.517578601837158 + }, + { + "auxiliary_loss_clip": 0.01204214, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.06543541, + "balance_loss_mlp": 1.0296495, + "epoch": 0.14813924126736006, + "flos": 20881408584960.0, + "grad_norm": 2.207366704336268, + "language_loss": 0.79299265, + "learning_rate": 3.855487469215628e-06, + "loss": 0.81543493, + "num_input_tokens_seen": 26102375, + "step": 1232, + "time_per_iteration": 2.5178680419921875 + }, + { + "auxiliary_loss_clip": 0.01188882, + "auxiliary_loss_mlp": 0.01035959, + "balance_loss_clip": 1.06163979, + "balance_loss_mlp": 1.02528405, + "epoch": 0.14825948415799917, + "flos": 37414070496000.0, + "grad_norm": 6.117207444812654, + "language_loss": 0.72365403, + "learning_rate": 3.855196602362264e-06, + "loss": 0.74590242, + "num_input_tokens_seen": 26125295, + "step": 1233, + "time_per_iteration": 2.711454153060913 + }, + { + "auxiliary_loss_clip": 0.01217993, + "auxiliary_loss_mlp": 0.01032984, + "balance_loss_clip": 1.06373179, + "balance_loss_mlp": 1.02233255, + "epoch": 0.14837972704863825, + "flos": 22014641744640.0, + "grad_norm": 2.035656546237182, + "language_loss": 0.94081235, + "learning_rate": 3.854905454077051e-06, + "loss": 0.9633221, + "num_input_tokens_seen": 26142905, + "step": 1234, + "time_per_iteration": 2.501849412918091 + }, + { + "auxiliary_loss_clip": 0.01138744, + "auxiliary_loss_mlp": 0.01038297, + "balance_loss_clip": 1.0541079, + "balance_loss_mlp": 1.02766395, + "epoch": 0.14849996993927733, + "flos": 20996323171200.0, + "grad_norm": 1.9499128414117712, + "language_loss": 0.88363028, + "learning_rate": 3.854614024404155e-06, + "loss": 0.90540069, + "num_input_tokens_seen": 26161215, + "step": 1235, + "time_per_iteration": 2.691936731338501 + }, + { + "auxiliary_loss_clip": 0.01190726, + "auxiliary_loss_mlp": 0.01035184, + "balance_loss_clip": 1.05957198, + "balance_loss_mlp": 1.02452648, + "epoch": 0.14862021282991644, + "flos": 20047994248320.0, + "grad_norm": 2.4365216887882224, + "language_loss": 0.8939836, + "learning_rate": 3.8543223133877865e-06, + "loss": 0.91624266, + "num_input_tokens_seen": 26179810, + "step": 1236, + "time_per_iteration": 2.6440999507904053 + }, + { + "auxiliary_loss_clip": 0.01186278, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.05915368, + "balance_loss_mlp": 1.02882433, + "epoch": 0.14874045572055553, + "flos": 22712027276160.0, + "grad_norm": 1.7377833207157234, + "language_loss": 0.88396543, + "learning_rate": 3.854030321072198e-06, + "loss": 0.90623784, + "num_input_tokens_seen": 26199715, + "step": 1237, + "time_per_iteration": 2.5508151054382324 + }, + { + "auxiliary_loss_clip": 0.01193914, + "auxiliary_loss_mlp": 0.01027019, + "balance_loss_clip": 1.06162, + "balance_loss_mlp": 1.0164032, + "epoch": 0.1488606986111946, + "flos": 25411288567680.0, + "grad_norm": 1.9696497991738047, + "language_loss": 0.73253918, + "learning_rate": 3.853738047501682e-06, + "loss": 0.75474846, + "num_input_tokens_seen": 26220275, + "step": 1238, + "time_per_iteration": 2.6250457763671875 + }, + { + "auxiliary_loss_clip": 0.01220851, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.06719077, + "balance_loss_mlp": 1.02817428, + "epoch": 0.1489809415018337, + "flos": 17018749687680.0, + "grad_norm": 22.57294979078124, + "language_loss": 0.77503192, + "learning_rate": 3.85344549272058e-06, + "loss": 0.79763675, + "num_input_tokens_seen": 26238255, + "step": 1239, + "time_per_iteration": 2.459886074066162 + }, + { + "auxiliary_loss_clip": 0.01214757, + "auxiliary_loss_mlp": 0.01040001, + "balance_loss_clip": 1.06312275, + "balance_loss_mlp": 1.02869976, + "epoch": 0.1491011843924728, + "flos": 33659394860160.0, + "grad_norm": 2.4925955923008183, + "language_loss": 0.82429826, + "learning_rate": 3.853152656773269e-06, + "loss": 0.84684581, + "num_input_tokens_seen": 26259690, + "step": 1240, + "time_per_iteration": 2.626439094543457 + }, + { + "auxiliary_loss_clip": 0.01202056, + "auxiliary_loss_mlp": 0.01033918, + "balance_loss_clip": 1.06382978, + "balance_loss_mlp": 1.02275443, + "epoch": 0.14922142728311188, + "flos": 21179000764800.0, + "grad_norm": 1.8288826513734726, + "language_loss": 0.84746116, + "learning_rate": 3.852859539704174e-06, + "loss": 0.86982089, + "num_input_tokens_seen": 26278990, + "step": 1241, + "time_per_iteration": 2.541192054748535 + }, + { + "auxiliary_loss_clip": 0.01169482, + "auxiliary_loss_mlp": 0.01042111, + "balance_loss_clip": 1.05681932, + "balance_loss_mlp": 1.03097141, + "epoch": 0.14934167017375097, + "flos": 29860548474240.0, + "grad_norm": 1.8852357278436713, + "language_loss": 0.76285517, + "learning_rate": 3.85256614155776e-06, + "loss": 0.78497112, + "num_input_tokens_seen": 26299120, + "step": 1242, + "time_per_iteration": 2.7033166885375977 + }, + { + "auxiliary_loss_clip": 0.01213781, + "auxiliary_loss_mlp": 0.01037122, + "balance_loss_clip": 1.06063926, + "balance_loss_mlp": 1.02639365, + "epoch": 0.14946191306439008, + "flos": 17019216564480.0, + "grad_norm": 2.2565078955150524, + "language_loss": 0.7471053, + "learning_rate": 3.852272462378535e-06, + "loss": 0.76961434, + "num_input_tokens_seen": 26316995, + "step": 1243, + "time_per_iteration": 2.4723238945007324 + }, + { + "auxiliary_loss_clip": 0.01204736, + "auxiliary_loss_mlp": 0.01043782, + "balance_loss_clip": 1.06474566, + "balance_loss_mlp": 1.0335542, + "epoch": 0.14958215595502916, + "flos": 15669047214720.0, + "grad_norm": 2.3506711488276353, + "language_loss": 0.77756834, + "learning_rate": 3.85197850221105e-06, + "loss": 0.80005354, + "num_input_tokens_seen": 26333295, + "step": 1244, + "time_per_iteration": 2.5056076049804688 + }, + { + "auxiliary_loss_clip": 0.01219034, + "auxiliary_loss_mlp": 0.01039441, + "balance_loss_clip": 1.06846011, + "balance_loss_mlp": 1.02905822, + "epoch": 0.14970239884566824, + "flos": 33108560818560.0, + "grad_norm": 1.6670245623123094, + "language_loss": 0.75336462, + "learning_rate": 3.851684261099899e-06, + "loss": 0.77594936, + "num_input_tokens_seen": 26355035, + "step": 1245, + "time_per_iteration": 2.6425015926361084 + }, + { + "auxiliary_loss_clip": 0.01197047, + "auxiliary_loss_mlp": 0.01036873, + "balance_loss_clip": 1.06015539, + "balance_loss_mlp": 1.02485025, + "epoch": 0.14982264173630733, + "flos": 17821245392640.0, + "grad_norm": 2.327570070238734, + "language_loss": 0.86490738, + "learning_rate": 3.851389739089718e-06, + "loss": 0.88724661, + "num_input_tokens_seen": 26371655, + "step": 1246, + "time_per_iteration": 2.5956642627716064 + }, + { + "auxiliary_loss_clip": 0.01224693, + "auxiliary_loss_mlp": 0.01041411, + "balance_loss_clip": 1.07121348, + "balance_loss_mlp": 1.03024745, + "epoch": 0.14994288462694644, + "flos": 32409559175040.0, + "grad_norm": 1.979671113030095, + "language_loss": 0.8022095, + "learning_rate": 3.851094936225186e-06, + "loss": 0.82487053, + "num_input_tokens_seen": 26392540, + "step": 1247, + "time_per_iteration": 3.466738700866699 + }, + { + "auxiliary_loss_clip": 0.01200887, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.06709743, + "balance_loss_mlp": 1.02206969, + "epoch": 0.15006312751758552, + "flos": 31794661226880.0, + "grad_norm": 2.215330494963057, + "language_loss": 0.76325321, + "learning_rate": 3.850799852551024e-06, + "loss": 0.78559029, + "num_input_tokens_seen": 26414960, + "step": 1248, + "time_per_iteration": 3.512132406234741 + }, + { + "auxiliary_loss_clip": 0.01209231, + "auxiliary_loss_mlp": 0.01043849, + "balance_loss_clip": 1.06097078, + "balance_loss_mlp": 1.03259528, + "epoch": 0.1501833704082246, + "flos": 16618022582400.0, + "grad_norm": 2.9584849637410193, + "language_loss": 0.85904229, + "learning_rate": 3.850504488111995e-06, + "loss": 0.88157308, + "num_input_tokens_seen": 26431635, + "step": 1249, + "time_per_iteration": 2.4717419147491455 + }, + { + "auxiliary_loss_clip": 0.01193588, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.05991459, + "balance_loss_mlp": 1.02255213, + "epoch": 0.15030361329886371, + "flos": 23471178243840.0, + "grad_norm": 1.6700542194375354, + "language_loss": 0.82501912, + "learning_rate": 3.850208842952907e-06, + "loss": 0.84727943, + "num_input_tokens_seen": 26450440, + "step": 1250, + "time_per_iteration": 2.558462142944336 + }, + { + "auxiliary_loss_clip": 0.011788, + "auxiliary_loss_mlp": 0.01041616, + "balance_loss_clip": 1.05841625, + "balance_loss_mlp": 1.03091693, + "epoch": 0.1504238561895028, + "flos": 25629409906560.0, + "grad_norm": 1.6410566484160543, + "language_loss": 0.79135585, + "learning_rate": 3.849912917118608e-06, + "loss": 0.81356001, + "num_input_tokens_seen": 26471480, + "step": 1251, + "time_per_iteration": 2.6641218662261963 + }, + { + "auxiliary_loss_clip": 0.0112563, + "auxiliary_loss_mlp": 0.01010932, + "balance_loss_clip": 1.03287554, + "balance_loss_mlp": 1.00811827, + "epoch": 0.15054409908014188, + "flos": 52095146129280.0, + "grad_norm": 0.8757859917833876, + "language_loss": 0.59238815, + "learning_rate": 3.849616710653992e-06, + "loss": 0.6137538, + "num_input_tokens_seen": 26532950, + "step": 1252, + "time_per_iteration": 3.0317647457122803 + }, + { + "auxiliary_loss_clip": 0.01215752, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.06459975, + "balance_loss_mlp": 1.02678704, + "epoch": 0.150664341970781, + "flos": 18880251096960.0, + "grad_norm": 1.6742087337379226, + "language_loss": 0.74903297, + "learning_rate": 3.84932022360399e-06, + "loss": 0.77156729, + "num_input_tokens_seen": 26551615, + "step": 1253, + "time_per_iteration": 2.484659194946289 + }, + { + "auxiliary_loss_clip": 0.01201493, + "auxiliary_loss_mlp": 0.01043322, + "balance_loss_clip": 1.0662961, + "balance_loss_mlp": 1.0322175, + "epoch": 0.15078458486142007, + "flos": 22163240309760.0, + "grad_norm": 2.7932583912364186, + "language_loss": 0.84420663, + "learning_rate": 3.849023456013581e-06, + "loss": 0.86665475, + "num_input_tokens_seen": 26569175, + "step": 1254, + "time_per_iteration": 2.5319113731384277 + }, + { + "auxiliary_loss_clip": 0.01223362, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.06672752, + "balance_loss_mlp": 1.03370786, + "epoch": 0.15090482775205916, + "flos": 26651894457600.0, + "grad_norm": 2.058642246093293, + "language_loss": 0.62145573, + "learning_rate": 3.848726407927784e-06, + "loss": 0.6441406, + "num_input_tokens_seen": 26589560, + "step": 1255, + "time_per_iteration": 2.5579514503479004 + }, + { + "auxiliary_loss_clip": 0.0120456, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.06594622, + "balance_loss_mlp": 1.02943778, + "epoch": 0.15102507064269824, + "flos": 21798998444160.0, + "grad_norm": 2.295405469042203, + "language_loss": 0.86855304, + "learning_rate": 3.84842907939166e-06, + "loss": 0.89099455, + "num_input_tokens_seen": 26608785, + "step": 1256, + "time_per_iteration": 2.564642906188965 + }, + { + "auxiliary_loss_clip": 0.01179528, + "auxiliary_loss_mlp": 0.0104029, + "balance_loss_clip": 1.06016922, + "balance_loss_mlp": 1.0300796, + "epoch": 0.15114531353333735, + "flos": 22820908377600.0, + "grad_norm": 2.229748197790081, + "language_loss": 0.71351624, + "learning_rate": 3.8481314704503146e-06, + "loss": 0.73571444, + "num_input_tokens_seen": 26628615, + "step": 1257, + "time_per_iteration": 2.577796459197998 + }, + { + "auxiliary_loss_clip": 0.01218454, + "auxiliary_loss_mlp": 0.01041803, + "balance_loss_clip": 1.06842375, + "balance_loss_mlp": 1.03189039, + "epoch": 0.15126555642397643, + "flos": 19682674974720.0, + "grad_norm": 2.278305409794669, + "language_loss": 0.88016742, + "learning_rate": 3.847833581148895e-06, + "loss": 0.90276992, + "num_input_tokens_seen": 26647525, + "step": 1258, + "time_per_iteration": 2.4734416007995605 + }, + { + "auxiliary_loss_clip": 0.01229762, + "auxiliary_loss_mlp": 0.0103404, + "balance_loss_clip": 1.06314969, + "balance_loss_mlp": 1.02321529, + "epoch": 0.15138579931461552, + "flos": 28726022424960.0, + "grad_norm": 2.0881392532740013, + "language_loss": 0.81280458, + "learning_rate": 3.84753541153259e-06, + "loss": 0.8354426, + "num_input_tokens_seen": 26667095, + "step": 1259, + "time_per_iteration": 2.5433666706085205 + }, + { + "auxiliary_loss_clip": 0.01217648, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.06647992, + "balance_loss_mlp": 1.02664804, + "epoch": 0.15150604220525463, + "flos": 22127006465280.0, + "grad_norm": 1.8237547759941755, + "language_loss": 0.83238631, + "learning_rate": 3.847236961646633e-06, + "loss": 0.85492796, + "num_input_tokens_seen": 26686075, + "step": 1260, + "time_per_iteration": 2.543534994125366 + }, + { + "auxiliary_loss_clip": 0.01194191, + "auxiliary_loss_mlp": 0.01040437, + "balance_loss_clip": 1.06105709, + "balance_loss_mlp": 1.02946436, + "epoch": 0.1516262850958937, + "flos": 12968708515200.0, + "grad_norm": 2.40103123343685, + "language_loss": 0.78011417, + "learning_rate": 3.846938231536296e-06, + "loss": 0.80246043, + "num_input_tokens_seen": 26701695, + "step": 1261, + "time_per_iteration": 2.5631840229034424 + }, + { + "auxiliary_loss_clip": 0.01220942, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.06770289, + "balance_loss_mlp": 1.02509809, + "epoch": 0.1517465279865328, + "flos": 21797130936960.0, + "grad_norm": 1.705542543994183, + "language_loss": 0.8102051, + "learning_rate": 3.8466392212468995e-06, + "loss": 0.832766, + "num_input_tokens_seen": 26721885, + "step": 1262, + "time_per_iteration": 2.5768113136291504 + }, + { + "auxiliary_loss_clip": 0.0110058, + "auxiliary_loss_mlp": 0.01001648, + "balance_loss_clip": 1.02718544, + "balance_loss_mlp": 0.99921656, + "epoch": 0.15186677087717187, + "flos": 58174569901440.0, + "grad_norm": 0.8174117389947326, + "language_loss": 0.61960268, + "learning_rate": 3.8463399308238e-06, + "loss": 0.640625, + "num_input_tokens_seen": 26780990, + "step": 1263, + "time_per_iteration": 3.103006601333618 + }, + { + "auxiliary_loss_clip": 0.01215129, + "auxiliary_loss_mlp": 0.0103954, + "balance_loss_clip": 1.06538594, + "balance_loss_mlp": 1.0275414, + "epoch": 0.15198701376781099, + "flos": 32669696448000.0, + "grad_norm": 1.8481913642710386, + "language_loss": 0.63841534, + "learning_rate": 3.846040360312402e-06, + "loss": 0.66096205, + "num_input_tokens_seen": 26804250, + "step": 1264, + "time_per_iteration": 2.5894432067871094 + }, + { + "auxiliary_loss_clip": 0.01229618, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.0640949, + "balance_loss_mlp": 1.02774239, + "epoch": 0.15210725665845007, + "flos": 28402575431040.0, + "grad_norm": 1.9757479822592416, + "language_loss": 0.81437731, + "learning_rate": 3.8457405097581485e-06, + "loss": 0.83705437, + "num_input_tokens_seen": 26823240, + "step": 1265, + "time_per_iteration": 2.5016438961029053 + }, + { + "auxiliary_loss_clip": 0.01170414, + "auxiliary_loss_mlp": 0.01037958, + "balance_loss_clip": 1.05418897, + "balance_loss_mlp": 1.02729511, + "epoch": 0.15222749954908915, + "flos": 19938179393280.0, + "grad_norm": 1.8953545176578155, + "language_loss": 0.77769017, + "learning_rate": 3.8454403792065275e-06, + "loss": 0.79977387, + "num_input_tokens_seen": 26842060, + "step": 1266, + "time_per_iteration": 2.605502128601074 + }, + { + "auxiliary_loss_clip": 0.01172776, + "auxiliary_loss_mlp": 0.01048803, + "balance_loss_clip": 1.0570122, + "balance_loss_mlp": 1.03846788, + "epoch": 0.15234774243972826, + "flos": 21324223451520.0, + "grad_norm": 1.9386052689444924, + "language_loss": 0.85332459, + "learning_rate": 3.845139968703068e-06, + "loss": 0.87554038, + "num_input_tokens_seen": 26859580, + "step": 1267, + "time_per_iteration": 2.5770022869110107 + }, + { + "auxiliary_loss_clip": 0.01168808, + "auxiliary_loss_mlp": 0.01040107, + "balance_loss_clip": 1.05648184, + "balance_loss_mlp": 1.0289427, + "epoch": 0.15246798533036734, + "flos": 25957812977280.0, + "grad_norm": 2.165114892741538, + "language_loss": 0.83059311, + "learning_rate": 3.844839278293342e-06, + "loss": 0.85268223, + "num_input_tokens_seen": 26880430, + "step": 1268, + "time_per_iteration": 2.6602795124053955 + }, + { + "auxiliary_loss_clip": 0.01230705, + "auxiliary_loss_mlp": 0.01036483, + "balance_loss_clip": 1.06612921, + "balance_loss_mlp": 1.02573586, + "epoch": 0.15258822822100643, + "flos": 25811907932160.0, + "grad_norm": 2.0434079301914747, + "language_loss": 0.76872313, + "learning_rate": 3.8445383080229654e-06, + "loss": 0.79139495, + "num_input_tokens_seen": 26896445, + "step": 1269, + "time_per_iteration": 2.4803309440612793 + }, + { + "auxiliary_loss_clip": 0.0119075, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.05619669, + "balance_loss_mlp": 1.02533925, + "epoch": 0.1527084711116455, + "flos": 25265455349760.0, + "grad_norm": 2.2251747720577484, + "language_loss": 0.73606932, + "learning_rate": 3.844237057937593e-06, + "loss": 0.75834095, + "num_input_tokens_seen": 26915450, + "step": 1270, + "time_per_iteration": 2.559722661972046 + }, + { + "auxiliary_loss_clip": 0.01219219, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.06273639, + "balance_loss_mlp": 1.02143323, + "epoch": 0.15282871400228462, + "flos": 29240227572480.0, + "grad_norm": 2.172091760565259, + "language_loss": 0.77928597, + "learning_rate": 3.843935528082926e-06, + "loss": 0.80180025, + "num_input_tokens_seen": 26936475, + "step": 1271, + "time_per_iteration": 2.5708961486816406 + }, + { + "auxiliary_loss_clip": 0.01216165, + "auxiliary_loss_mlp": 0.01034206, + "balance_loss_clip": 1.06392264, + "balance_loss_mlp": 1.02400208, + "epoch": 0.1529489568929237, + "flos": 20882952869760.0, + "grad_norm": 1.667115883925367, + "language_loss": 0.84888828, + "learning_rate": 3.843633718504704e-06, + "loss": 0.87139201, + "num_input_tokens_seen": 26954920, + "step": 1272, + "time_per_iteration": 2.4923460483551025 + }, + { + "auxiliary_loss_clip": 0.01183543, + "auxiliary_loss_mlp": 0.01035151, + "balance_loss_clip": 1.0599308, + "balance_loss_mlp": 1.02466059, + "epoch": 0.1530691997835628, + "flos": 20083833043200.0, + "grad_norm": 2.2787542222092627, + "language_loss": 0.89980912, + "learning_rate": 3.843331629248715e-06, + "loss": 0.92199612, + "num_input_tokens_seen": 26972520, + "step": 1273, + "time_per_iteration": 3.328202486038208 + }, + { + "auxiliary_loss_clip": 0.01231674, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.06689644, + "balance_loss_mlp": 1.02464235, + "epoch": 0.1531894426742019, + "flos": 28759814144640.0, + "grad_norm": 2.3539370117094265, + "language_loss": 0.76839221, + "learning_rate": 3.843029260360782e-06, + "loss": 0.79105687, + "num_input_tokens_seen": 26990890, + "step": 1274, + "time_per_iteration": 2.5125367641448975 + }, + { + "auxiliary_loss_clip": 0.0121289, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.06278896, + "balance_loss_mlp": 1.03141248, + "epoch": 0.15330968556484098, + "flos": 22236282616320.0, + "grad_norm": 1.7190514774518968, + "language_loss": 0.78939134, + "learning_rate": 3.8427266118867755e-06, + "loss": 0.81193292, + "num_input_tokens_seen": 27010640, + "step": 1275, + "time_per_iteration": 4.02425479888916 + }, + { + "auxiliary_loss_clip": 0.01196539, + "auxiliary_loss_mlp": 0.0103463, + "balance_loss_clip": 1.0607028, + "balance_loss_mlp": 1.02368653, + "epoch": 0.15342992845548006, + "flos": 27527504296320.0, + "grad_norm": 1.9525856819748024, + "language_loss": 0.82238668, + "learning_rate": 3.842423683872608e-06, + "loss": 0.84469843, + "num_input_tokens_seen": 27031215, + "step": 1276, + "time_per_iteration": 2.623506546020508 + }, + { + "auxiliary_loss_clip": 0.0121254, + "auxiliary_loss_mlp": 0.01040615, + "balance_loss_clip": 1.06200647, + "balance_loss_mlp": 1.03011227, + "epoch": 0.15355017134611917, + "flos": 19609596754560.0, + "grad_norm": 2.7228128119240744, + "language_loss": 0.77289081, + "learning_rate": 3.842120476364232e-06, + "loss": 0.79542232, + "num_input_tokens_seen": 27049665, + "step": 1277, + "time_per_iteration": 2.465909719467163 + }, + { + "auxiliary_loss_clip": 0.01218251, + "auxiliary_loss_mlp": 0.01034845, + "balance_loss_clip": 1.06155288, + "balance_loss_mlp": 1.02451527, + "epoch": 0.15367041423675826, + "flos": 18478590238080.0, + "grad_norm": 2.091335753296588, + "language_loss": 0.83655369, + "learning_rate": 3.841816989407644e-06, + "loss": 0.85908461, + "num_input_tokens_seen": 27065155, + "step": 1278, + "time_per_iteration": 2.466064214706421 + }, + { + "auxiliary_loss_clip": 0.01180019, + "auxiliary_loss_mlp": 0.01042958, + "balance_loss_clip": 1.05907667, + "balance_loss_mlp": 1.03254533, + "epoch": 0.15379065712739734, + "flos": 41427662342400.0, + "grad_norm": 2.388470389651613, + "language_loss": 0.77073717, + "learning_rate": 3.841513223048884e-06, + "loss": 0.79296696, + "num_input_tokens_seen": 27085840, + "step": 1279, + "time_per_iteration": 2.736969470977783 + }, + { + "auxiliary_loss_clip": 0.01177999, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.05575037, + "balance_loss_mlp": 1.02593565, + "epoch": 0.15391090001803642, + "flos": 22054215553920.0, + "grad_norm": 2.550091819960324, + "language_loss": 0.78711331, + "learning_rate": 3.841209177334031e-06, + "loss": 0.80925596, + "num_input_tokens_seen": 27104200, + "step": 1280, + "time_per_iteration": 2.5748941898345947 + }, + { + "auxiliary_loss_clip": 0.01209123, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.06281662, + "balance_loss_mlp": 1.02184713, + "epoch": 0.15403114290867553, + "flos": 15450351258240.0, + "grad_norm": 1.8283972369749335, + "language_loss": 0.74490082, + "learning_rate": 3.84090485230921e-06, + "loss": 0.76731288, + "num_input_tokens_seen": 27122440, + "step": 1281, + "time_per_iteration": 2.4572155475616455 + }, + { + "auxiliary_loss_clip": 0.01228627, + "auxiliary_loss_mlp": 0.01031547, + "balance_loss_clip": 1.06498003, + "balance_loss_mlp": 1.02086592, + "epoch": 0.15415138579931462, + "flos": 17929156826880.0, + "grad_norm": 2.496201862457987, + "language_loss": 0.76530743, + "learning_rate": 3.840600248020588e-06, + "loss": 0.78790921, + "num_input_tokens_seen": 27139380, + "step": 1282, + "time_per_iteration": 2.480783700942993 + }, + { + "auxiliary_loss_clip": 0.01202641, + "auxiliary_loss_mlp": 0.01046252, + "balance_loss_clip": 1.05807996, + "balance_loss_mlp": 1.03518295, + "epoch": 0.1542716286899537, + "flos": 11429325296640.0, + "grad_norm": 2.0299029012663206, + "language_loss": 0.80025911, + "learning_rate": 3.840295364514371e-06, + "loss": 0.82274806, + "num_input_tokens_seen": 27156760, + "step": 1283, + "time_per_iteration": 2.551541566848755 + }, + { + "auxiliary_loss_clip": 0.01198649, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.06134689, + "balance_loss_mlp": 1.0280745, + "epoch": 0.1543918715805928, + "flos": 17420338719360.0, + "grad_norm": 2.103221972728767, + "language_loss": 0.78847402, + "learning_rate": 3.83999020183681e-06, + "loss": 0.81084585, + "num_input_tokens_seen": 27175455, + "step": 1284, + "time_per_iteration": 2.5084450244903564 + }, + { + "auxiliary_loss_clip": 0.01142487, + "auxiliary_loss_mlp": 0.01037941, + "balance_loss_clip": 1.0520885, + "balance_loss_mlp": 1.02779627, + "epoch": 0.1545121144712319, + "flos": 17786376264960.0, + "grad_norm": 2.3727457676288837, + "language_loss": 0.7853688, + "learning_rate": 3.839684760034199e-06, + "loss": 0.80717313, + "num_input_tokens_seen": 27193660, + "step": 1285, + "time_per_iteration": 2.6260266304016113 + }, + { + "auxiliary_loss_clip": 0.011766, + "auxiliary_loss_mlp": 0.01037572, + "balance_loss_clip": 1.05800176, + "balance_loss_mlp": 1.02689099, + "epoch": 0.15463235736187098, + "flos": 28220185146240.0, + "grad_norm": 2.256174849242525, + "language_loss": 0.65442836, + "learning_rate": 3.8393790391528716e-06, + "loss": 0.67657012, + "num_input_tokens_seen": 27214355, + "step": 1286, + "time_per_iteration": 2.6271653175354004 + }, + { + "auxiliary_loss_clip": 0.0119515, + "auxiliary_loss_mlp": 0.01035891, + "balance_loss_clip": 1.05848718, + "balance_loss_mlp": 1.02580035, + "epoch": 0.15475260025251006, + "flos": 22856890826880.0, + "grad_norm": 4.639402290407119, + "language_loss": 0.89022899, + "learning_rate": 3.8390730392392075e-06, + "loss": 0.91253936, + "num_input_tokens_seen": 27234335, + "step": 1287, + "time_per_iteration": 2.5311336517333984 + }, + { + "auxiliary_loss_clip": 0.0122874, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.06548917, + "balance_loss_mlp": 1.02246106, + "epoch": 0.15487284314314917, + "flos": 17602872658560.0, + "grad_norm": 2.1439012257708985, + "language_loss": 0.79422951, + "learning_rate": 3.838766760339626e-06, + "loss": 0.81684119, + "num_input_tokens_seen": 27252860, + "step": 1288, + "time_per_iteration": 2.469806432723999 + }, + { + "auxiliary_loss_clip": 0.01162433, + "auxiliary_loss_mlp": 0.01037682, + "balance_loss_clip": 1.05339646, + "balance_loss_mlp": 1.02689385, + "epoch": 0.15499308603378825, + "flos": 20082037363200.0, + "grad_norm": 2.55375058181178, + "language_loss": 0.79727298, + "learning_rate": 3.838460202500587e-06, + "loss": 0.81927419, + "num_input_tokens_seen": 27268650, + "step": 1289, + "time_per_iteration": 2.5451347827911377 + }, + { + "auxiliary_loss_clip": 0.01172915, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.05908799, + "balance_loss_mlp": 1.02064121, + "epoch": 0.15511332892442733, + "flos": 15918051271680.0, + "grad_norm": 2.514434211397431, + "language_loss": 0.7368477, + "learning_rate": 3.838153365768599e-06, + "loss": 0.7588954, + "num_input_tokens_seen": 27285160, + "step": 1290, + "time_per_iteration": 2.537978410720825 + }, + { + "auxiliary_loss_clip": 0.01177165, + "auxiliary_loss_mlp": 0.01045165, + "balance_loss_clip": 1.06086576, + "balance_loss_mlp": 1.03458524, + "epoch": 0.15523357181506645, + "flos": 41282475569280.0, + "grad_norm": 2.3457987998059933, + "language_loss": 0.75342691, + "learning_rate": 3.837846250190206e-06, + "loss": 0.77565014, + "num_input_tokens_seen": 27308025, + "step": 1291, + "time_per_iteration": 2.7193427085876465 + }, + { + "auxiliary_loss_clip": 0.01160543, + "auxiliary_loss_mlp": 0.00764763, + "balance_loss_clip": 1.05506492, + "balance_loss_mlp": 1.00065756, + "epoch": 0.15535381470570553, + "flos": 18478769806080.0, + "grad_norm": 1.9605141559527388, + "language_loss": 0.7660625, + "learning_rate": 3.837538855811998e-06, + "loss": 0.78531557, + "num_input_tokens_seen": 27326200, + "step": 1292, + "time_per_iteration": 2.6013104915618896 + }, + { + "auxiliary_loss_clip": 0.01202275, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.06137252, + "balance_loss_mlp": 1.02531445, + "epoch": 0.1554740575963446, + "flos": 13918150759680.0, + "grad_norm": 2.0968326068762204, + "language_loss": 0.70497745, + "learning_rate": 3.837231182680606e-06, + "loss": 0.72735226, + "num_input_tokens_seen": 27344165, + "step": 1293, + "time_per_iteration": 2.5024681091308594 + }, + { + "auxiliary_loss_clip": 0.0121507, + "auxiliary_loss_mlp": 0.01037322, + "balance_loss_clip": 1.06237364, + "balance_loss_mlp": 1.02676058, + "epoch": 0.1555943004869837, + "flos": 20847078161280.0, + "grad_norm": 1.633788707630094, + "language_loss": 0.75934231, + "learning_rate": 3.836923230842706e-06, + "loss": 0.78186619, + "num_input_tokens_seen": 27363280, + "step": 1294, + "time_per_iteration": 2.526648759841919 + }, + { + "auxiliary_loss_clip": 0.01164609, + "auxiliary_loss_mlp": 0.01038463, + "balance_loss_clip": 1.05125523, + "balance_loss_mlp": 1.0276444, + "epoch": 0.1557145433776228, + "flos": 22085888371200.0, + "grad_norm": 2.8419685473349983, + "language_loss": 0.80353981, + "learning_rate": 3.836615000345011e-06, + "loss": 0.82557058, + "num_input_tokens_seen": 27381460, + "step": 1295, + "time_per_iteration": 2.7120113372802734 + }, + { + "auxiliary_loss_clip": 0.01219735, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.059762, + "balance_loss_mlp": 1.02257848, + "epoch": 0.1558347862682619, + "flos": 19791987039360.0, + "grad_norm": 7.669778052156303, + "language_loss": 0.77984911, + "learning_rate": 3.836306491234282e-06, + "loss": 0.80236614, + "num_input_tokens_seen": 27399310, + "step": 1296, + "time_per_iteration": 2.4618020057678223 + }, + { + "auxiliary_loss_clip": 0.01187971, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.0616256, + "balance_loss_mlp": 1.02493358, + "epoch": 0.15595502915890097, + "flos": 17237086508160.0, + "grad_norm": 2.433756285009526, + "language_loss": 0.75741076, + "learning_rate": 3.835997703557317e-06, + "loss": 0.77963179, + "num_input_tokens_seen": 27416050, + "step": 1297, + "time_per_iteration": 2.5011565685272217 + }, + { + "auxiliary_loss_clip": 0.01161279, + "auxiliary_loss_mlp": 0.01036967, + "balance_loss_clip": 1.05007625, + "balance_loss_mlp": 1.02688742, + "epoch": 0.15607527204954008, + "flos": 19719519350400.0, + "grad_norm": 1.7993894176959195, + "language_loss": 0.80152738, + "learning_rate": 3.83568863736096e-06, + "loss": 0.82350975, + "num_input_tokens_seen": 27434920, + "step": 1298, + "time_per_iteration": 2.6158533096313477 + }, + { + "auxiliary_loss_clip": 0.01178533, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.05360675, + "balance_loss_mlp": 1.02589571, + "epoch": 0.15619551494017916, + "flos": 18515650095360.0, + "grad_norm": 2.282349525301286, + "language_loss": 0.89294279, + "learning_rate": 3.8353792926920975e-06, + "loss": 0.91508389, + "num_input_tokens_seen": 27453570, + "step": 1299, + "time_per_iteration": 3.3293581008911133 + }, + { + "auxiliary_loss_clip": 0.01214213, + "auxiliary_loss_mlp": 0.0104075, + "balance_loss_clip": 1.06258059, + "balance_loss_mlp": 1.03033686, + "epoch": 0.15631575783081825, + "flos": 19902125116800.0, + "grad_norm": 2.3440251546347204, + "language_loss": 0.81618595, + "learning_rate": 3.835069669597655e-06, + "loss": 0.83873558, + "num_input_tokens_seen": 27471960, + "step": 1300, + "time_per_iteration": 2.501718282699585 + }, + { + "auxiliary_loss_clip": 0.01212774, + "auxiliary_loss_mlp": 0.00764715, + "balance_loss_clip": 1.06091404, + "balance_loss_mlp": 1.00049949, + "epoch": 0.15643600072145733, + "flos": 20777663128320.0, + "grad_norm": 1.9318008519431493, + "language_loss": 0.79987836, + "learning_rate": 3.834759768124603e-06, + "loss": 0.81965327, + "num_input_tokens_seen": 27490835, + "step": 1301, + "time_per_iteration": 3.3974266052246094 + }, + { + "auxiliary_loss_clip": 0.01183551, + "auxiliary_loss_mlp": 0.01034481, + "balance_loss_clip": 1.06029367, + "balance_loss_mlp": 1.02453327, + "epoch": 0.15655624361209644, + "flos": 18546389159040.0, + "grad_norm": 2.336356999400374, + "language_loss": 0.75858814, + "learning_rate": 3.834449588319953e-06, + "loss": 0.78076839, + "num_input_tokens_seen": 27508870, + "step": 1302, + "time_per_iteration": 2.5800061225891113 + }, + { + "auxiliary_loss_clip": 0.01209347, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.06571388, + "balance_loss_mlp": 1.02910352, + "epoch": 0.15667648650273552, + "flos": 25229544727680.0, + "grad_norm": 3.313064908063855, + "language_loss": 0.85123301, + "learning_rate": 3.834139130230758e-06, + "loss": 0.87371492, + "num_input_tokens_seen": 27528175, + "step": 1303, + "time_per_iteration": 2.5800437927246094 + }, + { + "auxiliary_loss_clip": 0.0119529, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.05623448, + "balance_loss_mlp": 1.02498937, + "epoch": 0.1567967293933746, + "flos": 24827093769600.0, + "grad_norm": 1.6273872003179994, + "language_loss": 0.80929792, + "learning_rate": 3.833828393904117e-06, + "loss": 0.8316009, + "num_input_tokens_seen": 27548455, + "step": 1304, + "time_per_iteration": 2.5812339782714844 + }, + { + "auxiliary_loss_clip": 0.0116005, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.05275607, + "balance_loss_mlp": 1.01951444, + "epoch": 0.15691697228401372, + "flos": 19164555244800.0, + "grad_norm": 2.309751870935284, + "language_loss": 0.7731868, + "learning_rate": 3.833517379387165e-06, + "loss": 0.79509032, + "num_input_tokens_seen": 27564910, + "step": 1305, + "time_per_iteration": 2.5753188133239746 + }, + { + "auxiliary_loss_clip": 0.01213346, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.06294906, + "balance_loss_mlp": 1.02978826, + "epoch": 0.1570372151746528, + "flos": 24790931752320.0, + "grad_norm": 2.0621841902578786, + "language_loss": 0.88590968, + "learning_rate": 3.833206086727085e-06, + "loss": 0.90844595, + "num_input_tokens_seen": 27584260, + "step": 1306, + "time_per_iteration": 2.526747941970825 + }, + { + "auxiliary_loss_clip": 0.0117892, + "auxiliary_loss_mlp": 0.01034173, + "balance_loss_clip": 1.05220103, + "balance_loss_mlp": 1.02443933, + "epoch": 0.15715745806529188, + "flos": 24863650836480.0, + "grad_norm": 2.1343996355604458, + "language_loss": 0.70313334, + "learning_rate": 3.8328945159710994e-06, + "loss": 0.72526431, + "num_input_tokens_seen": 27604440, + "step": 1307, + "time_per_iteration": 2.5987956523895264 + }, + { + "auxiliary_loss_clip": 0.0121497, + "auxiliary_loss_mlp": 0.00763744, + "balance_loss_clip": 1.06358027, + "balance_loss_mlp": 1.0004406, + "epoch": 0.157277700955931, + "flos": 21872148491520.0, + "grad_norm": 2.341522550058672, + "language_loss": 0.89072382, + "learning_rate": 3.832582667166473e-06, + "loss": 0.91051102, + "num_input_tokens_seen": 27624250, + "step": 1308, + "time_per_iteration": 2.5144288539886475 + }, + { + "auxiliary_loss_clip": 0.01195026, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.0589782, + "balance_loss_mlp": 1.0252564, + "epoch": 0.15739794384657008, + "flos": 24533344344960.0, + "grad_norm": 1.7700640016707985, + "language_loss": 0.81501579, + "learning_rate": 3.8322705403605125e-06, + "loss": 0.83732951, + "num_input_tokens_seen": 27644595, + "step": 1309, + "time_per_iteration": 2.5510826110839844 + }, + { + "auxiliary_loss_clip": 0.01187414, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.05924654, + "balance_loss_mlp": 1.02458429, + "epoch": 0.15751818673720916, + "flos": 17745329998080.0, + "grad_norm": 2.2456233312011262, + "language_loss": 0.81173086, + "learning_rate": 3.831958135600568e-06, + "loss": 0.83394372, + "num_input_tokens_seen": 27662145, + "step": 1310, + "time_per_iteration": 2.497732639312744 + }, + { + "auxiliary_loss_clip": 0.01212691, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.06430221, + "balance_loss_mlp": 1.02324677, + "epoch": 0.15763842962784824, + "flos": 17858520731520.0, + "grad_norm": 1.8918419326669342, + "language_loss": 0.80073035, + "learning_rate": 3.831645452934032e-06, + "loss": 0.82318264, + "num_input_tokens_seen": 27680575, + "step": 1311, + "time_per_iteration": 2.5057053565979004 + }, + { + "auxiliary_loss_clip": 0.01227381, + "auxiliary_loss_mlp": 0.01041484, + "balance_loss_clip": 1.06665266, + "balance_loss_mlp": 1.03156531, + "epoch": 0.15775867251848735, + "flos": 26980908059520.0, + "grad_norm": 1.78290182480353, + "language_loss": 0.80065745, + "learning_rate": 3.831332492408336e-06, + "loss": 0.82334614, + "num_input_tokens_seen": 27701985, + "step": 1312, + "time_per_iteration": 2.4975099563598633 + }, + { + "auxiliary_loss_clip": 0.01189786, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.05762804, + "balance_loss_mlp": 1.01986527, + "epoch": 0.15787891540912644, + "flos": 19240398812160.0, + "grad_norm": 2.005240405053555, + "language_loss": 0.69316411, + "learning_rate": 3.831019254070957e-06, + "loss": 0.71535993, + "num_input_tokens_seen": 27719770, + "step": 1313, + "time_per_iteration": 2.5198729038238525 + }, + { + "auxiliary_loss_clip": 0.0116661, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.0547303, + "balance_loss_mlp": 1.02134657, + "epoch": 0.15799915829976552, + "flos": 27271102037760.0, + "grad_norm": 2.557108418101719, + "language_loss": 0.95168757, + "learning_rate": 3.8307057379694135e-06, + "loss": 0.97366399, + "num_input_tokens_seen": 27739105, + "step": 1314, + "time_per_iteration": 2.657810688018799 + }, + { + "auxiliary_loss_clip": 0.01224239, + "auxiliary_loss_mlp": 0.01041146, + "balance_loss_clip": 1.06193137, + "balance_loss_mlp": 1.03134143, + "epoch": 0.15811940119040463, + "flos": 20405520270720.0, + "grad_norm": 1.9991418730393549, + "language_loss": 0.82394934, + "learning_rate": 3.830391944151264e-06, + "loss": 0.84660316, + "num_input_tokens_seen": 27754985, + "step": 1315, + "time_per_iteration": 2.454183340072632 + }, + { + "auxiliary_loss_clip": 0.0119466, + "auxiliary_loss_mlp": 0.01039034, + "balance_loss_clip": 1.05725694, + "balance_loss_mlp": 1.02928853, + "epoch": 0.1582396440810437, + "flos": 32599347661440.0, + "grad_norm": 1.9614717238796355, + "language_loss": 0.67386281, + "learning_rate": 3.830077872664114e-06, + "loss": 0.69619977, + "num_input_tokens_seen": 27776110, + "step": 1316, + "time_per_iteration": 2.6227524280548096 + }, + { + "auxiliary_loss_clip": 0.01146819, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.04997826, + "balance_loss_mlp": 1.02536464, + "epoch": 0.1583598869716828, + "flos": 33800559310080.0, + "grad_norm": 1.9802251833922864, + "language_loss": 0.72691953, + "learning_rate": 3.829763523555604e-06, + "loss": 0.74873805, + "num_input_tokens_seen": 27796510, + "step": 1317, + "time_per_iteration": 2.7862308025360107 + }, + { + "auxiliary_loss_clip": 0.01204265, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.06556225, + "balance_loss_mlp": 1.02021468, + "epoch": 0.15848012986232188, + "flos": 24681332378880.0, + "grad_norm": 9.91569244263891, + "language_loss": 0.78073388, + "learning_rate": 3.829448896873423e-06, + "loss": 0.80306935, + "num_input_tokens_seen": 27815610, + "step": 1318, + "time_per_iteration": 2.520897388458252 + }, + { + "auxiliary_loss_clip": 0.01153211, + "auxiliary_loss_mlp": 0.00763887, + "balance_loss_clip": 1.05752802, + "balance_loss_mlp": 1.00050032, + "epoch": 0.158600372752961, + "flos": 22602068766720.0, + "grad_norm": 1.8313682381823166, + "language_loss": 0.79023135, + "learning_rate": 3.829133992665299e-06, + "loss": 0.80940229, + "num_input_tokens_seen": 27834735, + "step": 1319, + "time_per_iteration": 2.628401756286621 + }, + { + "auxiliary_loss_clip": 0.01199653, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.06160581, + "balance_loss_mlp": 1.02663791, + "epoch": 0.15872061564360007, + "flos": 27927944092800.0, + "grad_norm": 5.432334685263146, + "language_loss": 0.8900013, + "learning_rate": 3.828818810979002e-06, + "loss": 0.91235918, + "num_input_tokens_seen": 27853065, + "step": 1320, + "time_per_iteration": 2.5703372955322266 + }, + { + "auxiliary_loss_clip": 0.01225993, + "auxiliary_loss_mlp": 0.01037154, + "balance_loss_clip": 1.06876779, + "balance_loss_mlp": 1.02759933, + "epoch": 0.15884085853423915, + "flos": 23696805525120.0, + "grad_norm": 2.0168273655309434, + "language_loss": 0.80530798, + "learning_rate": 3.8285033518623454e-06, + "loss": 0.82793939, + "num_input_tokens_seen": 27873315, + "step": 1321, + "time_per_iteration": 2.4816431999206543 + }, + { + "auxiliary_loss_clip": 0.0121416, + "auxiliary_loss_mlp": 0.010376, + "balance_loss_clip": 1.06515753, + "balance_loss_mlp": 1.02695501, + "epoch": 0.15896110142487826, + "flos": 23112359331840.0, + "grad_norm": 2.359860960237877, + "language_loss": 0.81500769, + "learning_rate": 3.8281876153631845e-06, + "loss": 0.83752537, + "num_input_tokens_seen": 27890070, + "step": 1322, + "time_per_iteration": 2.4986140727996826 + }, + { + "auxiliary_loss_clip": 0.01161262, + "auxiliary_loss_mlp": 0.010407, + "balance_loss_clip": 1.0552026, + "balance_loss_mlp": 1.02981067, + "epoch": 0.15908134431551735, + "flos": 14685238632960.0, + "grad_norm": 1.8726158154364538, + "language_loss": 0.651407, + "learning_rate": 3.827871601529416e-06, + "loss": 0.67342663, + "num_input_tokens_seen": 27908590, + "step": 1323, + "time_per_iteration": 2.6046030521392822 + }, + { + "auxiliary_loss_clip": 0.01174865, + "auxiliary_loss_mlp": 0.01037439, + "balance_loss_clip": 1.05798221, + "balance_loss_mlp": 1.027825, + "epoch": 0.15920158720615643, + "flos": 20193611984640.0, + "grad_norm": 1.8547480790452227, + "language_loss": 0.80550086, + "learning_rate": 3.827555310408979e-06, + "loss": 0.8276239, + "num_input_tokens_seen": 27927985, + "step": 1324, + "time_per_iteration": 2.580556631088257 + }, + { + "auxiliary_loss_clip": 0.01174497, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.06125152, + "balance_loss_mlp": 1.02529526, + "epoch": 0.1593218300967955, + "flos": 24826626892800.0, + "grad_norm": 2.0793112360043566, + "language_loss": 0.8306675, + "learning_rate": 3.827238742049854e-06, + "loss": 0.85276461, + "num_input_tokens_seen": 27948280, + "step": 1325, + "time_per_iteration": 2.6305932998657227 + }, + { + "auxiliary_loss_clip": 0.01223503, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.06359005, + "balance_loss_mlp": 1.02490616, + "epoch": 0.15944207298743462, + "flos": 28328707111680.0, + "grad_norm": 1.799737846111947, + "language_loss": 0.51575291, + "learning_rate": 3.826921896500066e-06, + "loss": 0.53833675, + "num_input_tokens_seen": 27969565, + "step": 1326, + "time_per_iteration": 3.226177930831909 + }, + { + "auxiliary_loss_clip": 0.01188607, + "auxiliary_loss_mlp": 0.01035448, + "balance_loss_clip": 1.06392097, + "balance_loss_mlp": 1.02458215, + "epoch": 0.1595623158780737, + "flos": 22964838174720.0, + "grad_norm": 2.1610572539194393, + "language_loss": 0.78380418, + "learning_rate": 3.826604773807678e-06, + "loss": 0.8060447, + "num_input_tokens_seen": 27987540, + "step": 1327, + "time_per_iteration": 2.5862345695495605 + }, + { + "auxiliary_loss_clip": 0.01191205, + "auxiliary_loss_mlp": 0.01032641, + "balance_loss_clip": 1.05529261, + "balance_loss_mlp": 1.0219959, + "epoch": 0.1596825587687128, + "flos": 19710540950400.0, + "grad_norm": 2.472821289841304, + "language_loss": 0.73020577, + "learning_rate": 3.826287374020798e-06, + "loss": 0.75244427, + "num_input_tokens_seen": 28002345, + "step": 1328, + "time_per_iteration": 4.088956356048584 + }, + { + "auxiliary_loss_clip": 0.01227198, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.06717992, + "balance_loss_mlp": 1.02785516, + "epoch": 0.1598028016593519, + "flos": 22637727993600.0, + "grad_norm": 2.007578630221329, + "language_loss": 0.82054067, + "learning_rate": 3.825969697187575e-06, + "loss": 0.84318793, + "num_input_tokens_seen": 28021675, + "step": 1329, + "time_per_iteration": 2.462203025817871 + }, + { + "auxiliary_loss_clip": 0.01179989, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.0587945, + "balance_loss_mlp": 1.02220654, + "epoch": 0.15992304454999098, + "flos": 20482908122880.0, + "grad_norm": 1.9164782912483058, + "language_loss": 0.69449097, + "learning_rate": 3.8256517433562015e-06, + "loss": 0.71661508, + "num_input_tokens_seen": 28039615, + "step": 1330, + "time_per_iteration": 2.5632376670837402 + }, + { + "auxiliary_loss_clip": 0.01224626, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.0654887, + "balance_loss_mlp": 1.02373195, + "epoch": 0.16004328744063007, + "flos": 17676094533120.0, + "grad_norm": 2.0762855494684276, + "language_loss": 0.91381013, + "learning_rate": 3.82533351257491e-06, + "loss": 0.93638062, + "num_input_tokens_seen": 28057565, + "step": 1331, + "time_per_iteration": 2.4493091106414795 + }, + { + "auxiliary_loss_clip": 0.01212278, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.06723177, + "balance_loss_mlp": 1.02709198, + "epoch": 0.16016353033126918, + "flos": 24098717779200.0, + "grad_norm": 2.0073001645685435, + "language_loss": 0.88499123, + "learning_rate": 3.825015004891975e-06, + "loss": 0.90747803, + "num_input_tokens_seen": 28076305, + "step": 1332, + "time_per_iteration": 2.531169891357422 + }, + { + "auxiliary_loss_clip": 0.01206855, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.06267405, + "balance_loss_mlp": 1.01893711, + "epoch": 0.16028377322190826, + "flos": 27634841112960.0, + "grad_norm": 1.785855847878467, + "language_loss": 0.758919, + "learning_rate": 3.824696220355716e-06, + "loss": 0.78127593, + "num_input_tokens_seen": 28097895, + "step": 1333, + "time_per_iteration": 2.534996747970581 + }, + { + "auxiliary_loss_clip": 0.01193305, + "auxiliary_loss_mlp": 0.01040673, + "balance_loss_clip": 1.06129003, + "balance_loss_mlp": 1.03060031, + "epoch": 0.16040401611254734, + "flos": 20961202648320.0, + "grad_norm": 1.7754761759412359, + "language_loss": 0.78967017, + "learning_rate": 3.824377159014491e-06, + "loss": 0.81201005, + "num_input_tokens_seen": 28118790, + "step": 1334, + "time_per_iteration": 2.5615174770355225 + }, + { + "auxiliary_loss_clip": 0.01208707, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.06491351, + "balance_loss_mlp": 1.02496958, + "epoch": 0.16052425900318643, + "flos": 21247051080960.0, + "grad_norm": 2.0578507310339247, + "language_loss": 0.84987265, + "learning_rate": 3.824057820916702e-06, + "loss": 0.87230492, + "num_input_tokens_seen": 28135995, + "step": 1335, + "time_per_iteration": 2.4963088035583496 + }, + { + "auxiliary_loss_clip": 0.01196937, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.06197906, + "balance_loss_mlp": 1.01890492, + "epoch": 0.16064450189382554, + "flos": 15524004096000.0, + "grad_norm": 2.3888494902624005, + "language_loss": 0.71945196, + "learning_rate": 3.8237382061107904e-06, + "loss": 0.74171811, + "num_input_tokens_seen": 28152715, + "step": 1336, + "time_per_iteration": 2.5089569091796875 + }, + { + "auxiliary_loss_clip": 0.01121454, + "auxiliary_loss_mlp": 0.01038368, + "balance_loss_clip": 1.04673147, + "balance_loss_mlp": 1.02877116, + "epoch": 0.16076474478446462, + "flos": 21178497974400.0, + "grad_norm": 1.875125041634392, + "language_loss": 0.78658986, + "learning_rate": 3.823418314645243e-06, + "loss": 0.80818808, + "num_input_tokens_seen": 28171590, + "step": 1337, + "time_per_iteration": 2.6861073970794678 + }, + { + "auxiliary_loss_clip": 0.01147433, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.05545485, + "balance_loss_mlp": 1.03307867, + "epoch": 0.1608849876751037, + "flos": 18366476912640.0, + "grad_norm": 1.9569511973521676, + "language_loss": 0.75269008, + "learning_rate": 3.823098146568588e-06, + "loss": 0.77458596, + "num_input_tokens_seen": 28191295, + "step": 1338, + "time_per_iteration": 2.6157186031341553 + }, + { + "auxiliary_loss_clip": 0.01208133, + "auxiliary_loss_mlp": 0.0103592, + "balance_loss_clip": 1.06245291, + "balance_loss_mlp": 1.02695525, + "epoch": 0.1610052305657428, + "flos": 29497024880640.0, + "grad_norm": 1.8668313873123592, + "language_loss": 0.7134254, + "learning_rate": 3.822777701929394e-06, + "loss": 0.73586589, + "num_input_tokens_seen": 28213120, + "step": 1339, + "time_per_iteration": 2.5748205184936523 + }, + { + "auxiliary_loss_clip": 0.01199887, + "auxiliary_loss_mlp": 0.01039938, + "balance_loss_clip": 1.06076169, + "balance_loss_mlp": 1.02953744, + "epoch": 0.1611254734563819, + "flos": 26797871329920.0, + "grad_norm": 1.9702218751790614, + "language_loss": 0.73600662, + "learning_rate": 3.8224569807762714e-06, + "loss": 0.75840485, + "num_input_tokens_seen": 28232440, + "step": 1340, + "time_per_iteration": 2.6296627521514893 + }, + { + "auxiliary_loss_clip": 0.01146447, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.05135703, + "balance_loss_mlp": 1.03007483, + "epoch": 0.16124571634702098, + "flos": 22419570741120.0, + "grad_norm": 1.7563346891858747, + "language_loss": 0.75987625, + "learning_rate": 3.822135983157873e-06, + "loss": 0.78174412, + "num_input_tokens_seen": 28251715, + "step": 1341, + "time_per_iteration": 2.689222574234009 + }, + { + "auxiliary_loss_clip": 0.01221889, + "auxiliary_loss_mlp": 0.0076443, + "balance_loss_clip": 1.06435013, + "balance_loss_mlp": 1.00050724, + "epoch": 0.16136595923766006, + "flos": 10999116103680.0, + "grad_norm": 2.187559308874884, + "language_loss": 0.84295648, + "learning_rate": 3.821814709122896e-06, + "loss": 0.86281973, + "num_input_tokens_seen": 28269765, + "step": 1342, + "time_per_iteration": 2.4441170692443848 + }, + { + "auxiliary_loss_clip": 0.01194041, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.06063771, + "balance_loss_mlp": 1.02471542, + "epoch": 0.16148620212829917, + "flos": 21214983214080.0, + "grad_norm": 4.076975269891227, + "language_loss": 0.84916818, + "learning_rate": 3.821493158720076e-06, + "loss": 0.87144971, + "num_input_tokens_seen": 28288870, + "step": 1343, + "time_per_iteration": 2.552111864089966 + }, + { + "auxiliary_loss_clip": 0.0117864, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.0552609, + "balance_loss_mlp": 1.02222347, + "epoch": 0.16160644501893826, + "flos": 16758468760320.0, + "grad_norm": 3.231563289887907, + "language_loss": 0.73530149, + "learning_rate": 3.821171331998191e-06, + "loss": 0.75741541, + "num_input_tokens_seen": 28305400, + "step": 1344, + "time_per_iteration": 2.552997350692749 + }, + { + "auxiliary_loss_clip": 0.01098594, + "auxiliary_loss_mlp": 0.01009321, + "balance_loss_clip": 1.03317237, + "balance_loss_mlp": 1.00668657, + "epoch": 0.16172668790957734, + "flos": 64444967308800.0, + "grad_norm": 1.3969835454386994, + "language_loss": 0.54536045, + "learning_rate": 3.820849229006064e-06, + "loss": 0.56643963, + "num_input_tokens_seen": 28373150, + "step": 1345, + "time_per_iteration": 3.293813467025757 + }, + { + "auxiliary_loss_clip": 0.01226567, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.06425977, + "balance_loss_mlp": 1.02398729, + "epoch": 0.16184693080021645, + "flos": 23257689759360.0, + "grad_norm": 1.918448843624431, + "language_loss": 0.70869899, + "learning_rate": 3.8205268497925564e-06, + "loss": 0.73130488, + "num_input_tokens_seen": 28393620, + "step": 1346, + "time_per_iteration": 2.4860801696777344 + }, + { + "auxiliary_loss_clip": 0.01226786, + "auxiliary_loss_mlp": 0.01036145, + "balance_loss_clip": 1.06617475, + "balance_loss_mlp": 1.0262748, + "epoch": 0.16196717369085553, + "flos": 17451113696640.0, + "grad_norm": 2.15423371735896, + "language_loss": 0.78211731, + "learning_rate": 3.8202041944065725e-06, + "loss": 0.80474663, + "num_input_tokens_seen": 28409440, + "step": 1347, + "time_per_iteration": 2.4260001182556152 + }, + { + "auxiliary_loss_clip": 0.01225846, + "auxiliary_loss_mlp": 0.01036755, + "balance_loss_clip": 1.06711555, + "balance_loss_mlp": 1.02649164, + "epoch": 0.16208741658149461, + "flos": 23873377806720.0, + "grad_norm": 2.0851337425273937, + "language_loss": 0.74045503, + "learning_rate": 3.819881262897061e-06, + "loss": 0.76308107, + "num_input_tokens_seen": 28427575, + "step": 1348, + "time_per_iteration": 2.491335868835449 + }, + { + "auxiliary_loss_clip": 0.01182309, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.06335545, + "balance_loss_mlp": 1.02585101, + "epoch": 0.1622076594721337, + "flos": 25884806584320.0, + "grad_norm": 1.835683136935265, + "language_loss": 0.73075068, + "learning_rate": 3.819558055313008e-06, + "loss": 0.75293863, + "num_input_tokens_seen": 28448260, + "step": 1349, + "time_per_iteration": 2.616306781768799 + }, + { + "auxiliary_loss_clip": 0.01216356, + "auxiliary_loss_mlp": 0.01040462, + "balance_loss_clip": 1.06494987, + "balance_loss_mlp": 1.03042459, + "epoch": 0.1623279023627728, + "flos": 21539759011200.0, + "grad_norm": 1.797506903199932, + "language_loss": 0.77079397, + "learning_rate": 3.819234571703444e-06, + "loss": 0.79336214, + "num_input_tokens_seen": 28467085, + "step": 1350, + "time_per_iteration": 2.499417543411255 + }, + { + "auxiliary_loss_clip": 0.012016, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.05926263, + "balance_loss_mlp": 1.03083467, + "epoch": 0.1624481452534119, + "flos": 22085421494400.0, + "grad_norm": 1.7844943686610308, + "language_loss": 0.85863227, + "learning_rate": 3.8189108121174435e-06, + "loss": 0.88106167, + "num_input_tokens_seen": 28486850, + "step": 1351, + "time_per_iteration": 2.507632255554199 + }, + { + "auxiliary_loss_clip": 0.01177752, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.06459439, + "balance_loss_mlp": 1.02458096, + "epoch": 0.16256838814405097, + "flos": 27087490690560.0, + "grad_norm": 1.6313832187241437, + "language_loss": 0.83410311, + "learning_rate": 3.818586776604118e-06, + "loss": 0.856224, + "num_input_tokens_seen": 28507490, + "step": 1352, + "time_per_iteration": 2.6343817710876465 + }, + { + "auxiliary_loss_clip": 0.0119183, + "auxiliary_loss_mlp": 0.01041702, + "balance_loss_clip": 1.06079519, + "balance_loss_mlp": 1.03203964, + "epoch": 0.16268863103469008, + "flos": 20120354196480.0, + "grad_norm": 2.2803663206063, + "language_loss": 0.61432743, + "learning_rate": 3.818262465212625e-06, + "loss": 0.63666272, + "num_input_tokens_seen": 28527615, + "step": 1353, + "time_per_iteration": 3.2858052253723145 + }, + { + "auxiliary_loss_clip": 0.01202438, + "auxiliary_loss_mlp": 0.01046264, + "balance_loss_clip": 1.06416035, + "balance_loss_mlp": 1.03543353, + "epoch": 0.16280887392532917, + "flos": 18332792933760.0, + "grad_norm": 2.0768902500365503, + "language_loss": 0.77650636, + "learning_rate": 3.817937877992161e-06, + "loss": 0.79899335, + "num_input_tokens_seen": 28544910, + "step": 1354, + "time_per_iteration": 3.270533561706543 + }, + { + "auxiliary_loss_clip": 0.011789, + "auxiliary_loss_mlp": 0.00764782, + "balance_loss_clip": 1.05610704, + "balance_loss_mlp": 1.00046253, + "epoch": 0.16292911681596825, + "flos": 11874330892800.0, + "grad_norm": 2.5010445523911566, + "language_loss": 0.85880184, + "learning_rate": 3.817613014991967e-06, + "loss": 0.87823862, + "num_input_tokens_seen": 28561050, + "step": 1355, + "time_per_iteration": 4.077270269393921 + }, + { + "auxiliary_loss_clip": 0.01169206, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.05472994, + "balance_loss_mlp": 1.02305019, + "epoch": 0.16304935970660733, + "flos": 26103466627200.0, + "grad_norm": 1.9478090656635858, + "language_loss": 0.76697963, + "learning_rate": 3.817287876261323e-06, + "loss": 0.78900218, + "num_input_tokens_seen": 28581385, + "step": 1356, + "time_per_iteration": 2.6120150089263916 + }, + { + "auxiliary_loss_clip": 0.0118877, + "auxiliary_loss_mlp": 0.01034105, + "balance_loss_clip": 1.06159985, + "balance_loss_mlp": 1.02346539, + "epoch": 0.16316960259724644, + "flos": 29351945848320.0, + "grad_norm": 1.871061110277696, + "language_loss": 0.79932272, + "learning_rate": 3.816962461849553e-06, + "loss": 0.82155156, + "num_input_tokens_seen": 28603255, + "step": 1357, + "time_per_iteration": 2.6012156009674072 + }, + { + "auxiliary_loss_clip": 0.01187924, + "auxiliary_loss_mlp": 0.01037503, + "balance_loss_clip": 1.06222415, + "balance_loss_mlp": 1.02725148, + "epoch": 0.16328984548788553, + "flos": 20886759711360.0, + "grad_norm": 1.7426990927685915, + "language_loss": 0.8457582, + "learning_rate": 3.8166367718060235e-06, + "loss": 0.86801255, + "num_input_tokens_seen": 28623145, + "step": 1358, + "time_per_iteration": 2.5438132286071777 + }, + { + "auxiliary_loss_clip": 0.01203639, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.05972362, + "balance_loss_mlp": 1.02297187, + "epoch": 0.1634100883785246, + "flos": 18041090584320.0, + "grad_norm": 3.1185824001626976, + "language_loss": 0.76397336, + "learning_rate": 3.816310806180139e-06, + "loss": 0.78633714, + "num_input_tokens_seen": 28641555, + "step": 1359, + "time_per_iteration": 2.496049404144287 + }, + { + "auxiliary_loss_clip": 0.01189172, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_clip": 1.06118107, + "balance_loss_mlp": 1.02895868, + "epoch": 0.16353033126916372, + "flos": 24572128055040.0, + "grad_norm": 1.6938807425525473, + "language_loss": 0.81027412, + "learning_rate": 3.81598456502135e-06, + "loss": 0.83255297, + "num_input_tokens_seen": 28661575, + "step": 1360, + "time_per_iteration": 2.588503837585449 + }, + { + "auxiliary_loss_clip": 0.01189601, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.06221497, + "balance_loss_mlp": 1.02552438, + "epoch": 0.1636505741598028, + "flos": 19892895321600.0, + "grad_norm": 3.5683886721238114, + "language_loss": 0.87137961, + "learning_rate": 3.8156580483791455e-06, + "loss": 0.89363509, + "num_input_tokens_seen": 28676765, + "step": 1361, + "time_per_iteration": 2.515977144241333 + }, + { + "auxiliary_loss_clip": 0.01225321, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.06463301, + "balance_loss_mlp": 1.02256441, + "epoch": 0.16377081705044189, + "flos": 28402611344640.0, + "grad_norm": 2.919620684675346, + "language_loss": 0.76937973, + "learning_rate": 3.815331256303059e-06, + "loss": 0.79195744, + "num_input_tokens_seen": 28696795, + "step": 1362, + "time_per_iteration": 2.511462926864624 + }, + { + "auxiliary_loss_clip": 0.01173473, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.06006396, + "balance_loss_mlp": 1.02406597, + "epoch": 0.163891059941081, + "flos": 21908059113600.0, + "grad_norm": 2.6557787567395743, + "language_loss": 0.77219272, + "learning_rate": 3.815004188842665e-06, + "loss": 0.79426652, + "num_input_tokens_seen": 28714835, + "step": 1363, + "time_per_iteration": 2.593045234680176 + }, + { + "auxiliary_loss_clip": 0.01187172, + "auxiliary_loss_mlp": 0.01034448, + "balance_loss_clip": 1.05577254, + "balance_loss_mlp": 1.02408314, + "epoch": 0.16401130283172008, + "flos": 26797619934720.0, + "grad_norm": 1.7471984337753381, + "language_loss": 0.79697716, + "learning_rate": 3.814676846047578e-06, + "loss": 0.81919336, + "num_input_tokens_seen": 28735710, + "step": 1364, + "time_per_iteration": 2.581636667251587 + }, + { + "auxiliary_loss_clip": 0.01205845, + "auxiliary_loss_mlp": 0.01039203, + "balance_loss_clip": 1.06133938, + "balance_loss_mlp": 1.02921367, + "epoch": 0.16413154572235916, + "flos": 32997417160320.0, + "grad_norm": 2.5994412986233844, + "language_loss": 0.69815218, + "learning_rate": 3.8143492279674565e-06, + "loss": 0.72060263, + "num_input_tokens_seen": 28758405, + "step": 1365, + "time_per_iteration": 2.610006093978882 + }, + { + "auxiliary_loss_clip": 0.01095083, + "auxiliary_loss_mlp": 0.01002154, + "balance_loss_clip": 1.03120255, + "balance_loss_mlp": 0.99953097, + "epoch": 0.16425178861299825, + "flos": 40113622074240.0, + "grad_norm": 0.8428216164868166, + "language_loss": 0.58460873, + "learning_rate": 3.8140213346519997e-06, + "loss": 0.60558105, + "num_input_tokens_seen": 28809000, + "step": 1366, + "time_per_iteration": 2.914249897003174 + }, + { + "auxiliary_loss_clip": 0.01164719, + "auxiliary_loss_mlp": 0.01033258, + "balance_loss_clip": 1.05557001, + "balance_loss_mlp": 1.02345884, + "epoch": 0.16437203150363736, + "flos": 25447486498560.0, + "grad_norm": 1.5836320978519902, + "language_loss": 0.76509351, + "learning_rate": 3.813693166150948e-06, + "loss": 0.78707325, + "num_input_tokens_seen": 28829210, + "step": 1367, + "time_per_iteration": 2.610389232635498 + }, + { + "auxiliary_loss_clip": 0.01172398, + "auxiliary_loss_mlp": 0.01036086, + "balance_loss_clip": 1.05880749, + "balance_loss_mlp": 1.02561975, + "epoch": 0.16449227439427644, + "flos": 23476888506240.0, + "grad_norm": 2.1086818139222774, + "language_loss": 0.85540181, + "learning_rate": 3.813364722514086e-06, + "loss": 0.87748671, + "num_input_tokens_seen": 28847545, + "step": 1368, + "time_per_iteration": 2.580911874771118 + }, + { + "auxiliary_loss_clip": 0.01207583, + "auxiliary_loss_mlp": 0.01034166, + "balance_loss_clip": 1.06135571, + "balance_loss_mlp": 1.02405071, + "epoch": 0.16461251728491552, + "flos": 13545217802880.0, + "grad_norm": 3.91648215679575, + "language_loss": 0.8039155, + "learning_rate": 3.8130360037912368e-06, + "loss": 0.82633305, + "num_input_tokens_seen": 28863990, + "step": 1369, + "time_per_iteration": 2.4585461616516113 + }, + { + "auxiliary_loss_clip": 0.01207558, + "auxiliary_loss_mlp": 0.01035342, + "balance_loss_clip": 1.06130576, + "balance_loss_mlp": 1.02429676, + "epoch": 0.16473276017555463, + "flos": 23003298662400.0, + "grad_norm": 2.2119448327558655, + "language_loss": 0.81890279, + "learning_rate": 3.812707010032268e-06, + "loss": 0.84133184, + "num_input_tokens_seen": 28883045, + "step": 1370, + "time_per_iteration": 2.507314682006836 + }, + { + "auxiliary_loss_clip": 0.01216203, + "auxiliary_loss_mlp": 0.01040443, + "balance_loss_clip": 1.06716299, + "balance_loss_mlp": 1.03043556, + "epoch": 0.16485300306619372, + "flos": 24790680357120.0, + "grad_norm": 1.7971200438045372, + "language_loss": 0.79190683, + "learning_rate": 3.8123777412870863e-06, + "loss": 0.81447327, + "num_input_tokens_seen": 28902545, + "step": 1371, + "time_per_iteration": 2.5218865871429443 + }, + { + "auxiliary_loss_clip": 0.01198251, + "auxiliary_loss_mlp": 0.01039496, + "balance_loss_clip": 1.05979252, + "balance_loss_mlp": 1.0294168, + "epoch": 0.1649732459568328, + "flos": 21106497162240.0, + "grad_norm": 1.9307052711403918, + "language_loss": 0.78238189, + "learning_rate": 3.812048197605643e-06, + "loss": 0.80475932, + "num_input_tokens_seen": 28921440, + "step": 1372, + "time_per_iteration": 2.526406764984131 + }, + { + "auxiliary_loss_clip": 0.01210183, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.06323767, + "balance_loss_mlp": 1.01728189, + "epoch": 0.16509348884747188, + "flos": 20266726118400.0, + "grad_norm": 4.599223303586745, + "language_loss": 0.81482291, + "learning_rate": 3.8117183790379277e-06, + "loss": 0.83719844, + "num_input_tokens_seen": 28939890, + "step": 1373, + "time_per_iteration": 2.4718616008758545 + }, + { + "auxiliary_loss_clip": 0.01224165, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.06348324, + "balance_loss_mlp": 1.02400029, + "epoch": 0.165213731738111, + "flos": 11035493602560.0, + "grad_norm": 2.712307841141203, + "language_loss": 0.93377584, + "learning_rate": 3.811388285633976e-06, + "loss": 0.9563635, + "num_input_tokens_seen": 28955875, + "step": 1374, + "time_per_iteration": 2.420050621032715 + }, + { + "auxiliary_loss_clip": 0.01167125, + "auxiliary_loss_mlp": 0.01043097, + "balance_loss_clip": 1.0560081, + "balance_loss_mlp": 1.03254676, + "epoch": 0.16533397462875007, + "flos": 29972051268480.0, + "grad_norm": 7.377736856131934, + "language_loss": 0.61929584, + "learning_rate": 3.811057917443861e-06, + "loss": 0.64139801, + "num_input_tokens_seen": 28975140, + "step": 1375, + "time_per_iteration": 2.6759655475616455 + }, + { + "auxiliary_loss_clip": 0.01112557, + "auxiliary_loss_mlp": 0.01005935, + "balance_loss_clip": 1.03602195, + "balance_loss_mlp": 1.00333679, + "epoch": 0.16545421751938916, + "flos": 65556763027200.0, + "grad_norm": 0.8529389713249875, + "language_loss": 0.68290961, + "learning_rate": 3.8107272745177e-06, + "loss": 0.70409453, + "num_input_tokens_seen": 29047470, + "step": 1376, + "time_per_iteration": 3.247767925262451 + }, + { + "auxiliary_loss_clip": 0.01180476, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.06005144, + "balance_loss_mlp": 1.02470851, + "epoch": 0.16557446041002827, + "flos": 22492361652480.0, + "grad_norm": 1.759541042669598, + "language_loss": 0.78638875, + "learning_rate": 3.8103963569056513e-06, + "loss": 0.80853838, + "num_input_tokens_seen": 29066605, + "step": 1377, + "time_per_iteration": 2.5731685161590576 + }, + { + "auxiliary_loss_clip": 0.0118662, + "auxiliary_loss_mlp": 0.01041566, + "balance_loss_clip": 1.05589736, + "balance_loss_mlp": 1.03105211, + "epoch": 0.16569470330066735, + "flos": 24602723464320.0, + "grad_norm": 2.3802638800669627, + "language_loss": 0.88227355, + "learning_rate": 3.8100651646579146e-06, + "loss": 0.90455544, + "num_input_tokens_seen": 29085815, + "step": 1378, + "time_per_iteration": 2.5783185958862305 + }, + { + "auxiliary_loss_clip": 0.01188865, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.05668211, + "balance_loss_mlp": 1.02801442, + "epoch": 0.16581494619130643, + "flos": 15006207588480.0, + "grad_norm": 2.214406838350005, + "language_loss": 0.92828226, + "learning_rate": 3.8097336978247317e-06, + "loss": 0.95055521, + "num_input_tokens_seen": 29102520, + "step": 1379, + "time_per_iteration": 3.283736228942871 + }, + { + "auxiliary_loss_clip": 0.01179767, + "auxiliary_loss_mlp": 0.01027872, + "balance_loss_clip": 1.05785859, + "balance_loss_mlp": 1.01710749, + "epoch": 0.16593518908194552, + "flos": 17420338719360.0, + "grad_norm": 2.0604353467233056, + "language_loss": 0.88910067, + "learning_rate": 3.8094019564563854e-06, + "loss": 0.91117716, + "num_input_tokens_seen": 29119450, + "step": 1380, + "time_per_iteration": 2.5433366298675537 + }, + { + "auxiliary_loss_clip": 0.01223156, + "auxiliary_loss_mlp": 0.00765034, + "balance_loss_clip": 1.06303835, + "balance_loss_mlp": 1.00059438, + "epoch": 0.16605543197258463, + "flos": 20412631163520.0, + "grad_norm": 2.0028924355525444, + "language_loss": 0.75193429, + "learning_rate": 3.809069940603201e-06, + "loss": 0.77181625, + "num_input_tokens_seen": 29137405, + "step": 1381, + "time_per_iteration": 3.2355122566223145 + }, + { + "auxiliary_loss_clip": 0.01184832, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.05921209, + "balance_loss_mlp": 1.02397871, + "epoch": 0.1661756748632237, + "flos": 14209745368320.0, + "grad_norm": 2.076204762509888, + "language_loss": 0.77428639, + "learning_rate": 3.8087376503155452e-06, + "loss": 0.79647362, + "num_input_tokens_seen": 29154890, + "step": 1382, + "time_per_iteration": 4.041480302810669 + }, + { + "auxiliary_loss_clip": 0.01102265, + "auxiliary_loss_mlp": 0.01005447, + "balance_loss_clip": 1.0304296, + "balance_loss_mlp": 1.00274086, + "epoch": 0.1662959177538628, + "flos": 66080877350400.0, + "grad_norm": 0.8963520930016445, + "language_loss": 0.56252158, + "learning_rate": 3.808405085643826e-06, + "loss": 0.58359873, + "num_input_tokens_seen": 29219770, + "step": 1383, + "time_per_iteration": 3.1481740474700928 + }, + { + "auxiliary_loss_clip": 0.01226553, + "auxiliary_loss_mlp": 0.00764362, + "balance_loss_clip": 1.06528974, + "balance_loss_mlp": 1.00051641, + "epoch": 0.1664161606445019, + "flos": 20740567357440.0, + "grad_norm": 2.3485987463886913, + "language_loss": 0.89096606, + "learning_rate": 3.8080722466384925e-06, + "loss": 0.9108752, + "num_input_tokens_seen": 29237620, + "step": 1384, + "time_per_iteration": 2.4573898315429688 + }, + { + "auxiliary_loss_clip": 0.01224645, + "auxiliary_loss_mlp": 0.01037113, + "balance_loss_clip": 1.06077862, + "balance_loss_mlp": 1.02610993, + "epoch": 0.166536403535141, + "flos": 25260930236160.0, + "grad_norm": 2.335350248159454, + "language_loss": 0.71112055, + "learning_rate": 3.8077391333500376e-06, + "loss": 0.73373806, + "num_input_tokens_seen": 29256760, + "step": 1385, + "time_per_iteration": 2.5026612281799316 + }, + { + "auxiliary_loss_clip": 0.01197471, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.06398523, + "balance_loss_mlp": 1.02360916, + "epoch": 0.16665664642578007, + "flos": 25447450584960.0, + "grad_norm": 2.005732295717281, + "language_loss": 0.76565665, + "learning_rate": 3.8074057458289934e-06, + "loss": 0.78796518, + "num_input_tokens_seen": 29277450, + "step": 1386, + "time_per_iteration": 2.5824437141418457 + }, + { + "auxiliary_loss_clip": 0.0119544, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.05969191, + "balance_loss_mlp": 1.02244937, + "epoch": 0.16677688931641918, + "flos": 22200767043840.0, + "grad_norm": 2.181325667350636, + "language_loss": 0.82003486, + "learning_rate": 3.807072084125934e-06, + "loss": 0.84231508, + "num_input_tokens_seen": 29299300, + "step": 1387, + "time_per_iteration": 2.575983762741089 + }, + { + "auxiliary_loss_clip": 0.01191315, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.06166041, + "balance_loss_mlp": 1.0251193, + "epoch": 0.16689713220705826, + "flos": 16945958776320.0, + "grad_norm": 3.8222041281486874, + "language_loss": 0.80659556, + "learning_rate": 3.806738148291477e-06, + "loss": 0.828866, + "num_input_tokens_seen": 29316125, + "step": 1388, + "time_per_iteration": 2.4923431873321533 + }, + { + "auxiliary_loss_clip": 0.01150643, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.0527122, + "balance_loss_mlp": 1.02562261, + "epoch": 0.16701737509769735, + "flos": 36244423923840.0, + "grad_norm": 3.8104864490270476, + "language_loss": 0.71120501, + "learning_rate": 3.8064039383762793e-06, + "loss": 0.73307931, + "num_input_tokens_seen": 29338490, + "step": 1389, + "time_per_iteration": 2.768720865249634 + }, + { + "auxiliary_loss_clip": 0.01208582, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.06357467, + "balance_loss_mlp": 1.02520132, + "epoch": 0.16713761798833643, + "flos": 23258659426560.0, + "grad_norm": 2.021810927952577, + "language_loss": 0.77485609, + "learning_rate": 3.8060694544310396e-06, + "loss": 0.79729921, + "num_input_tokens_seen": 29357000, + "step": 1390, + "time_per_iteration": 2.4946322441101074 + }, + { + "auxiliary_loss_clip": 0.01226122, + "auxiliary_loss_mlp": 0.01046131, + "balance_loss_clip": 1.06326342, + "balance_loss_mlp": 1.03470504, + "epoch": 0.16725786087897554, + "flos": 25302515207040.0, + "grad_norm": 1.9975067090798053, + "language_loss": 0.7863189, + "learning_rate": 3.8057346965065006e-06, + "loss": 0.80904138, + "num_input_tokens_seen": 29378230, + "step": 1391, + "time_per_iteration": 2.496281147003174 + }, + { + "auxiliary_loss_clip": 0.01193548, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.06362617, + "balance_loss_mlp": 1.02526474, + "epoch": 0.16737810376961462, + "flos": 31831541516160.0, + "grad_norm": 1.6842691825867708, + "language_loss": 0.8448813, + "learning_rate": 3.805399664653443e-06, + "loss": 0.8671701, + "num_input_tokens_seen": 29400370, + "step": 1392, + "time_per_iteration": 2.62697434425354 + }, + { + "auxiliary_loss_clip": 0.01228149, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.06456399, + "balance_loss_mlp": 1.02234197, + "epoch": 0.1674983466602537, + "flos": 27961843553280.0, + "grad_norm": 2.4034802650129925, + "language_loss": 0.74458784, + "learning_rate": 3.805064358922692e-06, + "loss": 0.76720095, + "num_input_tokens_seen": 29418660, + "step": 1393, + "time_per_iteration": 2.499715566635132 + }, + { + "auxiliary_loss_clip": 0.01215958, + "auxiliary_loss_mlp": 0.0103837, + "balance_loss_clip": 1.06358862, + "balance_loss_mlp": 1.0271107, + "epoch": 0.16761858955089282, + "flos": 21762656858880.0, + "grad_norm": 1.7187898287251089, + "language_loss": 0.8098222, + "learning_rate": 3.8047287793651136e-06, + "loss": 0.83236545, + "num_input_tokens_seen": 29440105, + "step": 1394, + "time_per_iteration": 2.5094149112701416 + }, + { + "auxiliary_loss_clip": 0.01182022, + "auxiliary_loss_mlp": 0.01041018, + "balance_loss_clip": 1.06002033, + "balance_loss_mlp": 1.03087354, + "epoch": 0.1677388324415319, + "flos": 23805507058560.0, + "grad_norm": 1.9337886535825628, + "language_loss": 0.88205254, + "learning_rate": 3.8043929260316137e-06, + "loss": 0.90428293, + "num_input_tokens_seen": 29458260, + "step": 1395, + "time_per_iteration": 2.5847034454345703 + }, + { + "auxiliary_loss_clip": 0.01198586, + "auxiliary_loss_mlp": 0.01040779, + "balance_loss_clip": 1.06624198, + "balance_loss_mlp": 1.03015745, + "epoch": 0.16785907533217098, + "flos": 20558859431040.0, + "grad_norm": 3.310969275348815, + "language_loss": 0.83731234, + "learning_rate": 3.8040567989731417e-06, + "loss": 0.85970604, + "num_input_tokens_seen": 29476205, + "step": 1396, + "time_per_iteration": 2.5296103954315186 + }, + { + "auxiliary_loss_clip": 0.01204389, + "auxiliary_loss_mlp": 0.01032854, + "balance_loss_clip": 1.06185269, + "balance_loss_mlp": 1.02265573, + "epoch": 0.16797931822281006, + "flos": 15669657745920.0, + "grad_norm": 2.17058404758993, + "language_loss": 0.79513204, + "learning_rate": 3.8037203982406876e-06, + "loss": 0.81750447, + "num_input_tokens_seen": 29494370, + "step": 1397, + "time_per_iteration": 2.465550661087036 + }, + { + "auxiliary_loss_clip": 0.01224918, + "auxiliary_loss_mlp": 0.01038266, + "balance_loss_clip": 1.06518948, + "balance_loss_mlp": 1.02723932, + "epoch": 0.16809956111344918, + "flos": 16541101607040.0, + "grad_norm": 2.1252485113717574, + "language_loss": 0.73327422, + "learning_rate": 3.8033837238852835e-06, + "loss": 0.75590605, + "num_input_tokens_seen": 29511070, + "step": 1398, + "time_per_iteration": 2.488380193710327 + }, + { + "auxiliary_loss_clip": 0.01184896, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.0572505, + "balance_loss_mlp": 1.02542877, + "epoch": 0.16821980400408826, + "flos": 23258084808960.0, + "grad_norm": 1.7657820990462274, + "language_loss": 0.69678098, + "learning_rate": 3.8030467759580017e-06, + "loss": 0.71898317, + "num_input_tokens_seen": 29531990, + "step": 1399, + "time_per_iteration": 2.55505108833313 + }, + { + "auxiliary_loss_clip": 0.01212398, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.06157327, + "balance_loss_mlp": 1.02416086, + "epoch": 0.16834004689472734, + "flos": 20774754126720.0, + "grad_norm": 2.3889381095758706, + "language_loss": 0.86886036, + "learning_rate": 3.802709554509958e-06, + "loss": 0.89133763, + "num_input_tokens_seen": 29549790, + "step": 1400, + "time_per_iteration": 2.4851348400115967 + }, + { + "auxiliary_loss_clip": 0.01192067, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.0574584, + "balance_loss_mlp": 1.02406836, + "epoch": 0.16846028978536645, + "flos": 26687302289280.0, + "grad_norm": 1.7267304812958526, + "language_loss": 0.78817779, + "learning_rate": 3.8023720595923083e-06, + "loss": 0.81043422, + "num_input_tokens_seen": 29569045, + "step": 1401, + "time_per_iteration": 2.576931953430176 + }, + { + "auxiliary_loss_clip": 0.01160238, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.05482781, + "balance_loss_mlp": 1.0219779, + "epoch": 0.16858053267600553, + "flos": 18843298980480.0, + "grad_norm": 2.4698412360301707, + "language_loss": 0.87011552, + "learning_rate": 3.80203429125625e-06, + "loss": 0.89204144, + "num_input_tokens_seen": 29587220, + "step": 1402, + "time_per_iteration": 2.6015048027038574 + }, + { + "auxiliary_loss_clip": 0.0114029, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.05414248, + "balance_loss_mlp": 1.02202094, + "epoch": 0.16870077556664462, + "flos": 27744548227200.0, + "grad_norm": 1.6488892825147572, + "language_loss": 0.70128137, + "learning_rate": 3.8016962495530225e-06, + "loss": 0.72300947, + "num_input_tokens_seen": 29606410, + "step": 1403, + "time_per_iteration": 2.702686071395874 + }, + { + "auxiliary_loss_clip": 0.01226673, + "auxiliary_loss_mlp": 0.01043014, + "balance_loss_clip": 1.0645076, + "balance_loss_mlp": 1.03315556, + "epoch": 0.1688210184572837, + "flos": 13730768484480.0, + "grad_norm": 2.3577595131959783, + "language_loss": 0.77081883, + "learning_rate": 3.8013579345339063e-06, + "loss": 0.7935158, + "num_input_tokens_seen": 29621275, + "step": 1404, + "time_per_iteration": 2.419342279434204 + }, + { + "auxiliary_loss_clip": 0.01184181, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.05816448, + "balance_loss_mlp": 1.02037573, + "epoch": 0.1689412613479228, + "flos": 26468785900800.0, + "grad_norm": 2.2956984148995514, + "language_loss": 0.69146401, + "learning_rate": 3.801019346250224e-06, + "loss": 0.71361804, + "num_input_tokens_seen": 29641420, + "step": 1405, + "time_per_iteration": 2.607166051864624 + }, + { + "auxiliary_loss_clip": 0.01206484, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.06205797, + "balance_loss_mlp": 1.02277446, + "epoch": 0.1690615042385619, + "flos": 21138852337920.0, + "grad_norm": 2.3720557323403475, + "language_loss": 0.83504641, + "learning_rate": 3.8006804847533395e-06, + "loss": 0.85744417, + "num_input_tokens_seen": 29660935, + "step": 1406, + "time_per_iteration": 3.306201219558716 + }, + { + "auxiliary_loss_clip": 0.01228646, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.06559658, + "balance_loss_mlp": 1.02663636, + "epoch": 0.16918174712920098, + "flos": 20849340718080.0, + "grad_norm": 1.8232609368888006, + "language_loss": 0.85527074, + "learning_rate": 3.8003413500946556e-06, + "loss": 0.87791932, + "num_input_tokens_seen": 29681045, + "step": 1407, + "time_per_iteration": 3.2586536407470703 + }, + { + "auxiliary_loss_clip": 0.01198583, + "auxiliary_loss_mlp": 0.01038096, + "balance_loss_clip": 1.06367433, + "balance_loss_mlp": 1.02709949, + "epoch": 0.1693019900198401, + "flos": 16983270028800.0, + "grad_norm": 3.434718627766128, + "language_loss": 0.83473808, + "learning_rate": 3.8000019423256216e-06, + "loss": 0.8571049, + "num_input_tokens_seen": 29698810, + "step": 1408, + "time_per_iteration": 3.353588581085205 + }, + { + "auxiliary_loss_clip": 0.01185551, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.06079853, + "balance_loss_mlp": 1.03650463, + "epoch": 0.16942223291047917, + "flos": 26796901662720.0, + "grad_norm": 1.68801523113345, + "language_loss": 0.88064611, + "learning_rate": 3.7996622614977234e-06, + "loss": 0.90296966, + "num_input_tokens_seen": 29720000, + "step": 1409, + "time_per_iteration": 3.2960622310638428 + }, + { + "auxiliary_loss_clip": 0.01193065, + "auxiliary_loss_mlp": 0.01039369, + "balance_loss_clip": 1.06181884, + "balance_loss_mlp": 1.02943254, + "epoch": 0.16954247580111825, + "flos": 18583700411520.0, + "grad_norm": 1.9465367487819412, + "language_loss": 0.79095072, + "learning_rate": 3.799322307662492e-06, + "loss": 0.8132751, + "num_input_tokens_seen": 29737820, + "step": 1410, + "time_per_iteration": 2.524172306060791 + }, + { + "auxiliary_loss_clip": 0.0116735, + "auxiliary_loss_mlp": 0.01033519, + "balance_loss_clip": 1.05540252, + "balance_loss_mlp": 1.02267718, + "epoch": 0.16966271869175734, + "flos": 13983651210240.0, + "grad_norm": 2.8604588564529094, + "language_loss": 0.8373735, + "learning_rate": 3.798982080871496e-06, + "loss": 0.85938215, + "num_input_tokens_seen": 29752960, + "step": 1411, + "time_per_iteration": 2.591909408569336 + }, + { + "auxiliary_loss_clip": 0.01228744, + "auxiliary_loss_mlp": 0.01040706, + "balance_loss_clip": 1.06593573, + "balance_loss_mlp": 1.02954197, + "epoch": 0.16978296158239645, + "flos": 37487328284160.0, + "grad_norm": 2.188402092697048, + "language_loss": 0.68072933, + "learning_rate": 3.798641581176349e-06, + "loss": 0.70342386, + "num_input_tokens_seen": 29775240, + "step": 1412, + "time_per_iteration": 2.6231300830841064 + }, + { + "auxiliary_loss_clip": 0.01195794, + "auxiliary_loss_mlp": 0.01039874, + "balance_loss_clip": 1.058851, + "balance_loss_mlp": 1.0288831, + "epoch": 0.16990320447303553, + "flos": 28328958506880.0, + "grad_norm": 1.9139721755174348, + "language_loss": 0.74497199, + "learning_rate": 3.7983008086287044e-06, + "loss": 0.76732868, + "num_input_tokens_seen": 29796560, + "step": 1413, + "time_per_iteration": 2.5754544734954834 + }, + { + "auxiliary_loss_clip": 0.01191026, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.05770648, + "balance_loss_mlp": 1.02860308, + "epoch": 0.1700234473636746, + "flos": 20188189031040.0, + "grad_norm": 2.283137727718297, + "language_loss": 0.79121506, + "learning_rate": 3.797959763280257e-06, + "loss": 0.81352353, + "num_input_tokens_seen": 29815245, + "step": 1414, + "time_per_iteration": 2.5447089672088623 + }, + { + "auxiliary_loss_clip": 0.01216073, + "auxiliary_loss_mlp": 0.01044054, + "balance_loss_clip": 1.06451952, + "balance_loss_mlp": 1.03427327, + "epoch": 0.17014369025431372, + "flos": 24858658846080.0, + "grad_norm": 1.906355883601286, + "language_loss": 0.78867745, + "learning_rate": 3.797618445182743e-06, + "loss": 0.8112787, + "num_input_tokens_seen": 29836640, + "step": 1415, + "time_per_iteration": 2.5434820652008057 + }, + { + "auxiliary_loss_clip": 0.01160504, + "auxiliary_loss_mlp": 0.0103328, + "balance_loss_clip": 1.05456102, + "balance_loss_mlp": 1.02271199, + "epoch": 0.1702639331449528, + "flos": 16467233287680.0, + "grad_norm": 2.2320150978227518, + "language_loss": 0.85243052, + "learning_rate": 3.79727685438794e-06, + "loss": 0.87436837, + "num_input_tokens_seen": 29850830, + "step": 1416, + "time_per_iteration": 2.5672619342803955 + }, + { + "auxiliary_loss_clip": 0.01118, + "auxiliary_loss_mlp": 0.01007181, + "balance_loss_clip": 1.03104424, + "balance_loss_mlp": 1.00445127, + "epoch": 0.1703841760355919, + "flos": 52508870979840.0, + "grad_norm": 0.8336451301977318, + "language_loss": 0.61624217, + "learning_rate": 3.796934990947667e-06, + "loss": 0.63749397, + "num_input_tokens_seen": 29912515, + "step": 1417, + "time_per_iteration": 3.077319860458374 + }, + { + "auxiliary_loss_clip": 0.01117041, + "auxiliary_loss_mlp": 0.01005388, + "balance_loss_clip": 1.03104115, + "balance_loss_mlp": 1.00265777, + "epoch": 0.170504418926231, + "flos": 49370637576960.0, + "grad_norm": 0.881594066576247, + "language_loss": 0.62499452, + "learning_rate": 3.7965928549137854e-06, + "loss": 0.64621878, + "num_input_tokens_seen": 29969330, + "step": 1418, + "time_per_iteration": 3.0179309844970703 + }, + { + "auxiliary_loss_clip": 0.01182007, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.05396652, + "balance_loss_mlp": 1.02725434, + "epoch": 0.17062466181687008, + "flos": 25849219184640.0, + "grad_norm": 2.096672348183791, + "language_loss": 0.77523905, + "learning_rate": 3.7962504463381953e-06, + "loss": 0.79744262, + "num_input_tokens_seen": 29990820, + "step": 1419, + "time_per_iteration": 2.6280577182769775 + }, + { + "auxiliary_loss_clip": 0.01192252, + "auxiliary_loss_mlp": 0.00766104, + "balance_loss_clip": 1.06371772, + "balance_loss_mlp": 1.00060987, + "epoch": 0.17074490470750917, + "flos": 20960412549120.0, + "grad_norm": 1.6494449167845604, + "language_loss": 0.78927428, + "learning_rate": 3.7959077652728412e-06, + "loss": 0.8088578, + "num_input_tokens_seen": 30009275, + "step": 1420, + "time_per_iteration": 2.5778114795684814 + }, + { + "auxiliary_loss_clip": 0.01195663, + "auxiliary_loss_mlp": 0.01039254, + "balance_loss_clip": 1.05968595, + "balance_loss_mlp": 1.02867997, + "epoch": 0.17086514759814825, + "flos": 20959766104320.0, + "grad_norm": 1.9857297655945323, + "language_loss": 0.77366209, + "learning_rate": 3.795564811769707e-06, + "loss": 0.79601121, + "num_input_tokens_seen": 30027630, + "step": 1421, + "time_per_iteration": 2.5435338020324707 + }, + { + "auxiliary_loss_clip": 0.01197233, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.06500125, + "balance_loss_mlp": 1.02606058, + "epoch": 0.17098539048878736, + "flos": 28474073452800.0, + "grad_norm": 2.4993112167606597, + "language_loss": 0.78084207, + "learning_rate": 3.795221585880818e-06, + "loss": 0.80318916, + "num_input_tokens_seen": 30048310, + "step": 1422, + "time_per_iteration": 2.599140167236328 + }, + { + "auxiliary_loss_clip": 0.01184928, + "auxiliary_loss_mlp": 0.01039663, + "balance_loss_clip": 1.06560183, + "balance_loss_mlp": 1.02994108, + "epoch": 0.17110563337942644, + "flos": 16290014561280.0, + "grad_norm": 1.7457784534936918, + "language_loss": 0.91469115, + "learning_rate": 3.794878087658242e-06, + "loss": 0.93693709, + "num_input_tokens_seen": 30066080, + "step": 1423, + "time_per_iteration": 2.5454890727996826 + }, + { + "auxiliary_loss_clip": 0.01212012, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.06183374, + "balance_loss_mlp": 1.02796888, + "epoch": 0.17122587627006552, + "flos": 29674207693440.0, + "grad_norm": 1.8498834937914956, + "language_loss": 0.78529358, + "learning_rate": 3.7945343171540873e-06, + "loss": 0.80779338, + "num_input_tokens_seen": 30086955, + "step": 1424, + "time_per_iteration": 2.5708813667297363 + }, + { + "auxiliary_loss_clip": 0.01228227, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.06506896, + "balance_loss_mlp": 1.02514625, + "epoch": 0.17134611916070464, + "flos": 25338389915520.0, + "grad_norm": 1.8643309186169357, + "language_loss": 0.79023123, + "learning_rate": 3.7941902744205033e-06, + "loss": 0.81287736, + "num_input_tokens_seen": 30107990, + "step": 1425, + "time_per_iteration": 2.5008764266967773 + }, + { + "auxiliary_loss_clip": 0.01200171, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.06061625, + "balance_loss_mlp": 1.02152705, + "epoch": 0.17146636205134372, + "flos": 13953845900160.0, + "grad_norm": 3.143546746355629, + "language_loss": 0.82729995, + "learning_rate": 3.7938459595096817e-06, + "loss": 0.84963024, + "num_input_tokens_seen": 30126535, + "step": 1426, + "time_per_iteration": 2.510077714920044 + }, + { + "auxiliary_loss_clip": 0.01216975, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.06277645, + "balance_loss_mlp": 1.02559233, + "epoch": 0.1715866049419828, + "flos": 23915214172800.0, + "grad_norm": 1.8511959478063484, + "language_loss": 0.86044669, + "learning_rate": 3.7935013724738545e-06, + "loss": 0.88298082, + "num_input_tokens_seen": 30147035, + "step": 1427, + "time_per_iteration": 2.5357894897460938 + }, + { + "auxiliary_loss_clip": 0.01206901, + "auxiliary_loss_mlp": 0.01038518, + "balance_loss_clip": 1.06322181, + "balance_loss_mlp": 1.02852225, + "epoch": 0.17170684783262188, + "flos": 22709369669760.0, + "grad_norm": 1.6362207341903618, + "language_loss": 0.77506936, + "learning_rate": 3.7931565133652945e-06, + "loss": 0.79752356, + "num_input_tokens_seen": 30167110, + "step": 1428, + "time_per_iteration": 2.4974308013916016 + }, + { + "auxiliary_loss_clip": 0.01227372, + "auxiliary_loss_mlp": 0.01037467, + "balance_loss_clip": 1.06541145, + "balance_loss_mlp": 1.02668405, + "epoch": 0.171827090723261, + "flos": 26613290315520.0, + "grad_norm": 2.249521984508326, + "language_loss": 0.67606544, + "learning_rate": 3.792811382236317e-06, + "loss": 0.6987139, + "num_input_tokens_seen": 30185620, + "step": 1429, + "time_per_iteration": 2.484384059906006 + }, + { + "auxiliary_loss_clip": 0.0121456, + "auxiliary_loss_mlp": 0.01038224, + "balance_loss_clip": 1.0618825, + "balance_loss_mlp": 1.02741754, + "epoch": 0.17194733361390008, + "flos": 28148507556480.0, + "grad_norm": 2.2981030569969936, + "language_loss": 0.78034455, + "learning_rate": 3.792465979139279e-06, + "loss": 0.80287242, + "num_input_tokens_seen": 30208225, + "step": 1430, + "time_per_iteration": 2.544196844100952 + }, + { + "auxiliary_loss_clip": 0.01088619, + "auxiliary_loss_mlp": 0.0100217, + "balance_loss_clip": 1.02739406, + "balance_loss_mlp": 0.99957085, + "epoch": 0.17206757650453916, + "flos": 65530689753600.0, + "grad_norm": 0.9279798936186392, + "language_loss": 0.65666109, + "learning_rate": 3.792120304126576e-06, + "loss": 0.67756897, + "num_input_tokens_seen": 30271600, + "step": 1431, + "time_per_iteration": 3.166260242462158 + }, + { + "auxiliary_loss_clip": 0.01139796, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.05505204, + "balance_loss_mlp": 1.02069092, + "epoch": 0.17218781939517827, + "flos": 22273486128000.0, + "grad_norm": 1.8358729386930044, + "language_loss": 0.83912158, + "learning_rate": 3.791774357250649e-06, + "loss": 0.8608247, + "num_input_tokens_seen": 30290430, + "step": 1432, + "time_per_iteration": 2.707911968231201 + }, + { + "auxiliary_loss_clip": 0.01193923, + "auxiliary_loss_mlp": 0.01048838, + "balance_loss_clip": 1.06092787, + "balance_loss_mlp": 1.0372982, + "epoch": 0.17230806228581735, + "flos": 14137313592960.0, + "grad_norm": 2.0587882137302733, + "language_loss": 0.78973794, + "learning_rate": 3.7914281385639757e-06, + "loss": 0.81216556, + "num_input_tokens_seen": 30308305, + "step": 1433, + "time_per_iteration": 3.4616024494171143 + }, + { + "auxiliary_loss_clip": 0.01210843, + "auxiliary_loss_mlp": 0.01033467, + "balance_loss_clip": 1.06041777, + "balance_loss_mlp": 1.02270281, + "epoch": 0.17242830517645644, + "flos": 20704836303360.0, + "grad_norm": 4.233556433299362, + "language_loss": 0.7962361, + "learning_rate": 3.7910816481190784e-06, + "loss": 0.81867921, + "num_input_tokens_seen": 30328120, + "step": 1434, + "time_per_iteration": 3.34006404876709 + }, + { + "auxiliary_loss_clip": 0.01182956, + "auxiliary_loss_mlp": 0.01036683, + "balance_loss_clip": 1.0570662, + "balance_loss_mlp": 1.0258944, + "epoch": 0.17254854806709552, + "flos": 30774582887040.0, + "grad_norm": 2.145619516560582, + "language_loss": 0.74447173, + "learning_rate": 3.7907348859685193e-06, + "loss": 0.76666814, + "num_input_tokens_seen": 30349825, + "step": 1435, + "time_per_iteration": 3.405677318572998 + }, + { + "auxiliary_loss_clip": 0.01203538, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.062289, + "balance_loss_mlp": 1.01992941, + "epoch": 0.17266879095773463, + "flos": 26614726859520.0, + "grad_norm": 2.161074656611333, + "language_loss": 0.80808026, + "learning_rate": 3.790387852164902e-06, + "loss": 0.83042681, + "num_input_tokens_seen": 30370555, + "step": 1436, + "time_per_iteration": 2.5254557132720947 + }, + { + "auxiliary_loss_clip": 0.01211553, + "auxiliary_loss_mlp": 0.0103842, + "balance_loss_clip": 1.06281877, + "balance_loss_mlp": 1.02773309, + "epoch": 0.1727890338483737, + "flos": 20266295155200.0, + "grad_norm": 2.5863800315929235, + "language_loss": 0.76724088, + "learning_rate": 3.7900405467608707e-06, + "loss": 0.78974062, + "num_input_tokens_seen": 30390100, + "step": 1437, + "time_per_iteration": 2.479651927947998 + }, + { + "auxiliary_loss_clip": 0.01149498, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.05099809, + "balance_loss_mlp": 1.02399421, + "epoch": 0.1729092767390128, + "flos": 18179812909440.0, + "grad_norm": 3.4025696289962997, + "language_loss": 0.79207021, + "learning_rate": 3.7896929698091114e-06, + "loss": 0.81391537, + "num_input_tokens_seen": 30402915, + "step": 1438, + "time_per_iteration": 2.5597352981567383 + }, + { + "auxiliary_loss_clip": 0.01230965, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.06907105, + "balance_loss_mlp": 1.03000474, + "epoch": 0.1730295196296519, + "flos": 26759518583040.0, + "grad_norm": 3.0006139261390086, + "language_loss": 0.68545109, + "learning_rate": 3.7893451213623518e-06, + "loss": 0.70816827, + "num_input_tokens_seen": 30420145, + "step": 1439, + "time_per_iteration": 2.4732155799865723 + }, + { + "auxiliary_loss_clip": 0.01210535, + "auxiliary_loss_mlp": 0.00765427, + "balance_loss_clip": 1.06360769, + "balance_loss_mlp": 1.00066972, + "epoch": 0.173149762520291, + "flos": 23842531002240.0, + "grad_norm": 1.9022165458559455, + "language_loss": 0.81882918, + "learning_rate": 3.7889970014733606e-06, + "loss": 0.83858883, + "num_input_tokens_seen": 30439250, + "step": 1440, + "time_per_iteration": 2.5082085132598877 + }, + { + "auxiliary_loss_clip": 0.01147552, + "auxiliary_loss_mlp": 0.0103918, + "balance_loss_clip": 1.05243218, + "balance_loss_mlp": 1.02782547, + "epoch": 0.17327000541093007, + "flos": 23368186972800.0, + "grad_norm": 1.9065539275533965, + "language_loss": 0.78074944, + "learning_rate": 3.7886486101949463e-06, + "loss": 0.80261678, + "num_input_tokens_seen": 30460430, + "step": 1441, + "time_per_iteration": 2.6349501609802246 + }, + { + "auxiliary_loss_clip": 0.01154264, + "auxiliary_loss_mlp": 0.01043244, + "balance_loss_clip": 1.05427611, + "balance_loss_mlp": 1.03255677, + "epoch": 0.17339024830156918, + "flos": 18221290139520.0, + "grad_norm": 2.63682380102624, + "language_loss": 0.88038957, + "learning_rate": 3.7882999475799594e-06, + "loss": 0.90236461, + "num_input_tokens_seen": 30478465, + "step": 1442, + "time_per_iteration": 2.5917351245880127 + }, + { + "auxiliary_loss_clip": 0.01149411, + "auxiliary_loss_mlp": 0.01040105, + "balance_loss_clip": 1.05676925, + "balance_loss_mlp": 1.0295074, + "epoch": 0.17351049119220827, + "flos": 23332024955520.0, + "grad_norm": 1.8329884883973253, + "language_loss": 0.81288004, + "learning_rate": 3.787951013681293e-06, + "loss": 0.83477521, + "num_input_tokens_seen": 30496510, + "step": 1443, + "time_per_iteration": 2.6428513526916504 + }, + { + "auxiliary_loss_clip": 0.01206621, + "auxiliary_loss_mlp": 0.01039181, + "balance_loss_clip": 1.05895209, + "balance_loss_mlp": 1.02800465, + "epoch": 0.17363073408284735, + "flos": 23803495896960.0, + "grad_norm": 2.0442226543637005, + "language_loss": 0.77555406, + "learning_rate": 3.787601808551879e-06, + "loss": 0.79801214, + "num_input_tokens_seen": 30516325, + "step": 1444, + "time_per_iteration": 2.5136702060699463 + }, + { + "auxiliary_loss_clip": 0.01181296, + "auxiliary_loss_mlp": 0.01041236, + "balance_loss_clip": 1.05852294, + "balance_loss_mlp": 1.03085291, + "epoch": 0.17375097697348643, + "flos": 18515290959360.0, + "grad_norm": 2.4064384632580302, + "language_loss": 0.83627594, + "learning_rate": 3.7872523322446926e-06, + "loss": 0.8585012, + "num_input_tokens_seen": 30535210, + "step": 1445, + "time_per_iteration": 2.550147771835327 + }, + { + "auxiliary_loss_clip": 0.01167113, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.0513531, + "balance_loss_mlp": 1.02149796, + "epoch": 0.17387121986412554, + "flos": 38877897456000.0, + "grad_norm": 1.72386267231857, + "language_loss": 0.60042548, + "learning_rate": 3.7869025848127478e-06, + "loss": 0.62241268, + "num_input_tokens_seen": 30559405, + "step": 1446, + "time_per_iteration": 2.782686471939087 + }, + { + "auxiliary_loss_clip": 0.0120769, + "auxiliary_loss_mlp": 0.01034114, + "balance_loss_clip": 1.05832767, + "balance_loss_mlp": 1.02386761, + "epoch": 0.17399146275476463, + "flos": 20375714960640.0, + "grad_norm": 2.8291614811553076, + "language_loss": 0.8070277, + "learning_rate": 3.786552566309102e-06, + "loss": 0.82944572, + "num_input_tokens_seen": 30577615, + "step": 1447, + "time_per_iteration": 2.483642816543579 + }, + { + "auxiliary_loss_clip": 0.01190969, + "auxiliary_loss_mlp": 0.00765131, + "balance_loss_clip": 1.06150627, + "balance_loss_mlp": 1.00063086, + "epoch": 0.1741117056454037, + "flos": 19164339763200.0, + "grad_norm": 2.0693040707876404, + "language_loss": 0.85846364, + "learning_rate": 3.7862022767868517e-06, + "loss": 0.8780247, + "num_input_tokens_seen": 30595205, + "step": 1448, + "time_per_iteration": 2.5522799491882324 + }, + { + "auxiliary_loss_clip": 0.01176616, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.06248236, + "balance_loss_mlp": 1.03406608, + "epoch": 0.17423194853604282, + "flos": 25374300537600.0, + "grad_norm": 1.9867395954899814, + "language_loss": 0.84593821, + "learning_rate": 3.7858517162991367e-06, + "loss": 0.8681463, + "num_input_tokens_seen": 30615280, + "step": 1449, + "time_per_iteration": 2.6081628799438477 + }, + { + "auxiliary_loss_clip": 0.01178484, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.05638707, + "balance_loss_mlp": 1.02585089, + "epoch": 0.1743521914266819, + "flos": 25191874339200.0, + "grad_norm": 2.494721505444849, + "language_loss": 0.6059674, + "learning_rate": 3.7855008848991363e-06, + "loss": 0.62811995, + "num_input_tokens_seen": 30633485, + "step": 1450, + "time_per_iteration": 2.5990772247314453 + }, + { + "auxiliary_loss_clip": 0.01192592, + "auxiliary_loss_mlp": 0.01038911, + "balance_loss_clip": 1.06184793, + "balance_loss_mlp": 1.02884364, + "epoch": 0.17447243431732098, + "flos": 25666577504640.0, + "grad_norm": 1.7781127061026205, + "language_loss": 0.78021622, + "learning_rate": 3.7851497826400714e-06, + "loss": 0.8025313, + "num_input_tokens_seen": 30653625, + "step": 1451, + "time_per_iteration": 2.5590765476226807 + }, + { + "auxiliary_loss_clip": 0.01226339, + "auxiliary_loss_mlp": 0.01038077, + "balance_loss_clip": 1.06467664, + "balance_loss_mlp": 1.02715766, + "epoch": 0.17459267720796007, + "flos": 36281950657920.0, + "grad_norm": 1.7849897814051958, + "language_loss": 0.75923574, + "learning_rate": 3.7847984095752034e-06, + "loss": 0.7818799, + "num_input_tokens_seen": 30677080, + "step": 1452, + "time_per_iteration": 2.5738842487335205 + }, + { + "auxiliary_loss_clip": 0.01224325, + "auxiliary_loss_mlp": 0.01028979, + "balance_loss_clip": 1.06320775, + "balance_loss_mlp": 1.01915634, + "epoch": 0.17471292009859918, + "flos": 20011113959040.0, + "grad_norm": 2.0430657857382504, + "language_loss": 0.80162507, + "learning_rate": 3.784446765757836e-06, + "loss": 0.82415813, + "num_input_tokens_seen": 30695725, + "step": 1453, + "time_per_iteration": 2.450652837753296 + }, + { + "auxiliary_loss_clip": 0.01162633, + "auxiliary_loss_mlp": 0.0103199, + "balance_loss_clip": 1.0559299, + "balance_loss_mlp": 1.02169609, + "epoch": 0.17483316298923826, + "flos": 27819242559360.0, + "grad_norm": 3.0963963289635834, + "language_loss": 0.77826655, + "learning_rate": 3.7840948512413133e-06, + "loss": 0.8002128, + "num_input_tokens_seen": 30713310, + "step": 1454, + "time_per_iteration": 2.6025259494781494 + }, + { + "auxiliary_loss_clip": 0.01175864, + "auxiliary_loss_mlp": 0.01034145, + "balance_loss_clip": 1.06008387, + "balance_loss_mlp": 1.02282584, + "epoch": 0.17495340587987734, + "flos": 44017934791680.0, + "grad_norm": 1.7789399033800681, + "language_loss": 0.78885221, + "learning_rate": 3.7837426660790196e-06, + "loss": 0.81095231, + "num_input_tokens_seen": 30734725, + "step": 1455, + "time_per_iteration": 2.7791130542755127 + }, + { + "auxiliary_loss_clip": 0.01221806, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.06212354, + "balance_loss_mlp": 1.03565979, + "epoch": 0.17507364877051645, + "flos": 20885825957760.0, + "grad_norm": 2.376969109534207, + "language_loss": 0.81676722, + "learning_rate": 3.783390210324382e-06, + "loss": 0.83944142, + "num_input_tokens_seen": 30754450, + "step": 1456, + "time_per_iteration": 2.456920623779297 + }, + { + "auxiliary_loss_clip": 0.01180343, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.06059313, + "balance_loss_mlp": 1.02301264, + "epoch": 0.17519389166115554, + "flos": 24717602136960.0, + "grad_norm": 2.022458378049005, + "language_loss": 0.7275545, + "learning_rate": 3.7830374840308676e-06, + "loss": 0.74968815, + "num_input_tokens_seen": 30774605, + "step": 1457, + "time_per_iteration": 2.6140689849853516 + }, + { + "auxiliary_loss_clip": 0.01211744, + "auxiliary_loss_mlp": 0.01037022, + "balance_loss_clip": 1.06380475, + "balance_loss_mlp": 1.02617431, + "epoch": 0.17531413455179462, + "flos": 23798144770560.0, + "grad_norm": 3.1598799236048825, + "language_loss": 0.82531446, + "learning_rate": 3.7826844872519842e-06, + "loss": 0.84780216, + "num_input_tokens_seen": 30792460, + "step": 1458, + "time_per_iteration": 2.5333659648895264 + }, + { + "auxiliary_loss_clip": 0.01191871, + "auxiliary_loss_mlp": 0.01033854, + "balance_loss_clip": 1.06165719, + "balance_loss_mlp": 1.02426922, + "epoch": 0.1754343774424337, + "flos": 24572379450240.0, + "grad_norm": 2.381027226382764, + "language_loss": 0.72748619, + "learning_rate": 3.782331220041282e-06, + "loss": 0.74974346, + "num_input_tokens_seen": 30812525, + "step": 1459, + "time_per_iteration": 3.309271812438965 + }, + { + "auxiliary_loss_clip": 0.01186065, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.05777156, + "balance_loss_mlp": 1.02570069, + "epoch": 0.17555462033307281, + "flos": 18114599767680.0, + "grad_norm": 1.9042758649715843, + "language_loss": 0.82965976, + "learning_rate": 3.7819776824523504e-06, + "loss": 0.85188007, + "num_input_tokens_seen": 30830390, + "step": 1460, + "time_per_iteration": 2.572591781616211 + }, + { + "auxiliary_loss_clip": 0.0120061, + "auxiliary_loss_mlp": 0.01039311, + "balance_loss_clip": 1.05961335, + "balance_loss_mlp": 1.02898133, + "epoch": 0.1756748632237119, + "flos": 28366018364160.0, + "grad_norm": 2.198166873301516, + "language_loss": 0.83659261, + "learning_rate": 3.7816238745388213e-06, + "loss": 0.8589918, + "num_input_tokens_seen": 30849935, + "step": 1461, + "time_per_iteration": 4.213312864303589 + }, + { + "auxiliary_loss_clip": 0.01198532, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.05743587, + "balance_loss_mlp": 1.0239141, + "epoch": 0.17579510611435098, + "flos": 25732939881600.0, + "grad_norm": 1.756647916091758, + "language_loss": 0.8687942, + "learning_rate": 3.781269796354367e-06, + "loss": 0.89111578, + "num_input_tokens_seen": 30869555, + "step": 1462, + "time_per_iteration": 2.567169427871704 + }, + { + "auxiliary_loss_clip": 0.01195015, + "auxiliary_loss_mlp": 0.01043719, + "balance_loss_clip": 1.0611248, + "balance_loss_mlp": 1.03360391, + "epoch": 0.1759153490049901, + "flos": 18588081870720.0, + "grad_norm": 1.743932079297776, + "language_loss": 0.86128128, + "learning_rate": 3.7809154479527006e-06, + "loss": 0.8836686, + "num_input_tokens_seen": 30888760, + "step": 1463, + "time_per_iteration": 2.5167081356048584 + }, + { + "auxiliary_loss_clip": 0.0116719, + "auxiliary_loss_mlp": 0.01025484, + "balance_loss_clip": 1.05505121, + "balance_loss_mlp": 1.01604307, + "epoch": 0.17603559189562917, + "flos": 18619323724800.0, + "grad_norm": 2.5465718729354476, + "language_loss": 0.84011418, + "learning_rate": 3.780560829387577e-06, + "loss": 0.86204094, + "num_input_tokens_seen": 30907260, + "step": 1464, + "time_per_iteration": 2.543729782104492 + }, + { + "auxiliary_loss_clip": 0.01113537, + "auxiliary_loss_mlp": 0.0100232, + "balance_loss_clip": 1.03067732, + "balance_loss_mlp": 0.99980497, + "epoch": 0.17615583478626826, + "flos": 60530775373440.0, + "grad_norm": 0.8626439082443704, + "language_loss": 0.57980108, + "learning_rate": 3.7802059407127915e-06, + "loss": 0.60095966, + "num_input_tokens_seen": 30965810, + "step": 1465, + "time_per_iteration": 3.0473673343658447 + }, + { + "auxiliary_loss_clip": 0.01184294, + "auxiliary_loss_mlp": 0.0104215, + "balance_loss_clip": 1.05471969, + "balance_loss_mlp": 1.0320766, + "epoch": 0.17627607767690734, + "flos": 23616221362560.0, + "grad_norm": 2.159682153434583, + "language_loss": 0.85854584, + "learning_rate": 3.7798507819821797e-06, + "loss": 0.88081026, + "num_input_tokens_seen": 30982935, + "step": 1466, + "time_per_iteration": 2.590353012084961 + }, + { + "auxiliary_loss_clip": 0.01170971, + "auxiliary_loss_mlp": 0.0104163, + "balance_loss_clip": 1.05704474, + "balance_loss_mlp": 1.0310322, + "epoch": 0.17639632056754645, + "flos": 17639070589440.0, + "grad_norm": 2.1354885044894054, + "language_loss": 0.78709352, + "learning_rate": 3.7794953532496197e-06, + "loss": 0.80921948, + "num_input_tokens_seen": 30998840, + "step": 1467, + "time_per_iteration": 2.546147346496582 + }, + { + "auxiliary_loss_clip": 0.01069033, + "auxiliary_loss_mlp": 0.00755545, + "balance_loss_clip": 1.02976227, + "balance_loss_mlp": 1.00034833, + "epoch": 0.17651656345818553, + "flos": 57932604910080.0, + "grad_norm": 0.8493282396277126, + "language_loss": 0.57942247, + "learning_rate": 3.7791396545690295e-06, + "loss": 0.59766829, + "num_input_tokens_seen": 31060075, + "step": 1468, + "time_per_iteration": 3.166887044906616 + }, + { + "auxiliary_loss_clip": 0.01208736, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.06445813, + "balance_loss_mlp": 1.026335, + "epoch": 0.17663680634882462, + "flos": 22929502170240.0, + "grad_norm": 2.189290891096044, + "language_loss": 0.80462074, + "learning_rate": 3.7787836859943685e-06, + "loss": 0.82707042, + "num_input_tokens_seen": 31078800, + "step": 1469, + "time_per_iteration": 2.507932424545288 + }, + { + "auxiliary_loss_clip": 0.012066, + "auxiliary_loss_mlp": 0.01036777, + "balance_loss_clip": 1.06049204, + "balance_loss_mlp": 1.02632797, + "epoch": 0.17675704923946373, + "flos": 22637979388800.0, + "grad_norm": 2.6248840649357086, + "language_loss": 0.78971058, + "learning_rate": 3.7784274475796363e-06, + "loss": 0.8121444, + "num_input_tokens_seen": 31097430, + "step": 1470, + "time_per_iteration": 2.4954752922058105 + }, + { + "auxiliary_loss_clip": 0.01176419, + "auxiliary_loss_mlp": 0.01039677, + "balance_loss_clip": 1.05474377, + "balance_loss_mlp": 1.02950859, + "epoch": 0.1768772921301028, + "flos": 27126525795840.0, + "grad_norm": 2.0655219239594844, + "language_loss": 0.75571489, + "learning_rate": 3.7780709393788745e-06, + "loss": 0.77787584, + "num_input_tokens_seen": 31117905, + "step": 1471, + "time_per_iteration": 2.610905647277832 + }, + { + "auxiliary_loss_clip": 0.01221698, + "auxiliary_loss_mlp": 0.01031434, + "balance_loss_clip": 1.06311321, + "balance_loss_mlp": 1.02097952, + "epoch": 0.1769975350207419, + "flos": 19172133014400.0, + "grad_norm": 2.884363247435443, + "language_loss": 0.7520656, + "learning_rate": 3.777714161446165e-06, + "loss": 0.77459693, + "num_input_tokens_seen": 31137610, + "step": 1472, + "time_per_iteration": 2.447904586791992 + }, + { + "auxiliary_loss_clip": 0.01206992, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.06159437, + "balance_loss_mlp": 1.02285075, + "epoch": 0.177117777911381, + "flos": 36134932291200.0, + "grad_norm": 2.9148213501154174, + "language_loss": 0.69548571, + "learning_rate": 3.7773571138356304e-06, + "loss": 0.71788275, + "num_input_tokens_seen": 31157780, + "step": 1473, + "time_per_iteration": 2.5981268882751465 + }, + { + "auxiliary_loss_clip": 0.01146851, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.05399108, + "balance_loss_mlp": 1.02002215, + "epoch": 0.17723802080202009, + "flos": 22090593052800.0, + "grad_norm": 2.0383332423634206, + "language_loss": 0.88999462, + "learning_rate": 3.776999796601435e-06, + "loss": 0.9117558, + "num_input_tokens_seen": 31176540, + "step": 1474, + "time_per_iteration": 2.5900957584381104 + }, + { + "auxiliary_loss_clip": 0.01210223, + "auxiliary_loss_mlp": 0.01035731, + "balance_loss_clip": 1.06096435, + "balance_loss_mlp": 1.02557421, + "epoch": 0.17735826369265917, + "flos": 30222671437440.0, + "grad_norm": 1.8354347144661434, + "language_loss": 0.72568023, + "learning_rate": 3.776642209797783e-06, + "loss": 0.74813986, + "num_input_tokens_seen": 31198370, + "step": 1475, + "time_per_iteration": 2.556887149810791 + }, + { + "auxiliary_loss_clip": 0.01203098, + "auxiliary_loss_mlp": 0.01029824, + "balance_loss_clip": 1.05940342, + "balance_loss_mlp": 1.018821, + "epoch": 0.17747850658329825, + "flos": 21397588980480.0, + "grad_norm": 2.019886082250426, + "language_loss": 0.77519006, + "learning_rate": 3.7762843534789205e-06, + "loss": 0.79751933, + "num_input_tokens_seen": 31217120, + "step": 1476, + "time_per_iteration": 2.5025811195373535 + }, + { + "auxiliary_loss_clip": 0.01196678, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_clip": 1.05919778, + "balance_loss_mlp": 1.02565408, + "epoch": 0.17759874947393736, + "flos": 16983341856000.0, + "grad_norm": 2.0088385497203918, + "language_loss": 0.8816241, + "learning_rate": 3.7759262276991343e-06, + "loss": 0.90394592, + "num_input_tokens_seen": 31234730, + "step": 1477, + "time_per_iteration": 2.497840166091919 + }, + { + "auxiliary_loss_clip": 0.01197201, + "auxiliary_loss_mlp": 0.01030055, + "balance_loss_clip": 1.06067526, + "balance_loss_mlp": 1.01962447, + "epoch": 0.17771899236457644, + "flos": 11546107390080.0, + "grad_norm": 2.632432372290125, + "language_loss": 0.80604672, + "learning_rate": 3.7755678325127506e-06, + "loss": 0.82831931, + "num_input_tokens_seen": 31252410, + "step": 1478, + "time_per_iteration": 2.509660482406616 + }, + { + "auxiliary_loss_clip": 0.01156106, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.05680299, + "balance_loss_mlp": 1.02363873, + "epoch": 0.17783923525521553, + "flos": 18807747494400.0, + "grad_norm": 1.8840903209120397, + "language_loss": 0.75711358, + "learning_rate": 3.7752091679741393e-06, + "loss": 0.77901459, + "num_input_tokens_seen": 31270200, + "step": 1479, + "time_per_iteration": 2.586625337600708 + }, + { + "auxiliary_loss_clip": 0.01203582, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.0599637, + "balance_loss_mlp": 1.02043843, + "epoch": 0.17795947814585464, + "flos": 30408365773440.0, + "grad_norm": 4.23865399780757, + "language_loss": 0.77757812, + "learning_rate": 3.774850234137708e-06, + "loss": 0.79991764, + "num_input_tokens_seen": 31287495, + "step": 1480, + "time_per_iteration": 2.5544660091400146 + }, + { + "auxiliary_loss_clip": 0.01202524, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.05948234, + "balance_loss_mlp": 1.02373159, + "epoch": 0.17807972103649372, + "flos": 24389055411840.0, + "grad_norm": 2.6979201148921956, + "language_loss": 0.82400393, + "learning_rate": 3.7744910310579076e-06, + "loss": 0.84636831, + "num_input_tokens_seen": 31306420, + "step": 1481, + "time_per_iteration": 2.525739908218384 + }, + { + "auxiliary_loss_clip": 0.01220732, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.06488657, + "balance_loss_mlp": 1.02386522, + "epoch": 0.1781999639271328, + "flos": 20301559332480.0, + "grad_norm": 1.9158112753776986, + "language_loss": 0.85416377, + "learning_rate": 3.774131558789229e-06, + "loss": 0.8766979, + "num_input_tokens_seen": 31325750, + "step": 1482, + "time_per_iteration": 2.4597702026367188 + }, + { + "auxiliary_loss_clip": 0.01221306, + "auxiliary_loss_mlp": 0.00764189, + "balance_loss_clip": 1.06387067, + "balance_loss_mlp": 1.00052857, + "epoch": 0.1783202068177719, + "flos": 15924479806080.0, + "grad_norm": 4.557694886489774, + "language_loss": 0.69853115, + "learning_rate": 3.773771817386203e-06, + "loss": 0.71838611, + "num_input_tokens_seen": 31343080, + "step": 1483, + "time_per_iteration": 2.4490532875061035 + }, + { + "auxiliary_loss_clip": 0.01191827, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.06080842, + "balance_loss_mlp": 1.02216303, + "epoch": 0.178440449708411, + "flos": 20631758083200.0, + "grad_norm": 1.663044072051087, + "language_loss": 0.7915715, + "learning_rate": 3.773411806903403e-06, + "loss": 0.8138054, + "num_input_tokens_seen": 31362160, + "step": 1484, + "time_per_iteration": 2.5441019535064697 + }, + { + "auxiliary_loss_clip": 0.01149809, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.05318284, + "balance_loss_mlp": 1.02647853, + "epoch": 0.17856069259905008, + "flos": 21686059105920.0, + "grad_norm": 3.113216332208616, + "language_loss": 0.94630474, + "learning_rate": 3.7730515273954415e-06, + "loss": 0.96817398, + "num_input_tokens_seen": 31380770, + "step": 1485, + "time_per_iteration": 2.6597001552581787 + }, + { + "auxiliary_loss_clip": 0.01220687, + "auxiliary_loss_mlp": 0.01034756, + "balance_loss_clip": 1.06448615, + "balance_loss_mlp": 1.02551711, + "epoch": 0.17868093548968916, + "flos": 26572962320640.0, + "grad_norm": 2.348309574640164, + "language_loss": 0.85363674, + "learning_rate": 3.772690978916973e-06, + "loss": 0.87619114, + "num_input_tokens_seen": 31400525, + "step": 1486, + "time_per_iteration": 3.213793992996216 + }, + { + "auxiliary_loss_clip": 0.01207926, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.06426656, + "balance_loss_mlp": 1.02403975, + "epoch": 0.17880117838032827, + "flos": 18581006891520.0, + "grad_norm": 2.135606479223312, + "language_loss": 0.86897022, + "learning_rate": 3.772330161522693e-06, + "loss": 0.89138985, + "num_input_tokens_seen": 31418435, + "step": 1487, + "time_per_iteration": 2.477466106414795 + }, + { + "auxiliary_loss_clip": 0.01190906, + "auxiliary_loss_mlp": 0.01031865, + "balance_loss_clip": 1.06396556, + "balance_loss_mlp": 1.0218755, + "epoch": 0.17892142127096736, + "flos": 26541217676160.0, + "grad_norm": 1.9667399321019736, + "language_loss": 0.79490423, + "learning_rate": 3.7719690752673365e-06, + "loss": 0.81713188, + "num_input_tokens_seen": 31439230, + "step": 1488, + "time_per_iteration": 4.748527526855469 + }, + { + "auxiliary_loss_clip": 0.01180291, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.06193924, + "balance_loss_mlp": 1.02363694, + "epoch": 0.17904166416160644, + "flos": 23872623621120.0, + "grad_norm": 2.0532543935746066, + "language_loss": 0.77913719, + "learning_rate": 3.7716077202056796e-06, + "loss": 0.80127335, + "num_input_tokens_seen": 31457705, + "step": 1489, + "time_per_iteration": 2.6053662300109863 + }, + { + "auxiliary_loss_clip": 0.01179178, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.05898881, + "balance_loss_mlp": 1.02426386, + "epoch": 0.17916190705224552, + "flos": 19134426712320.0, + "grad_norm": 2.777075445996867, + "language_loss": 0.9368149, + "learning_rate": 3.7712460963925404e-06, + "loss": 0.95894575, + "num_input_tokens_seen": 31473645, + "step": 1490, + "time_per_iteration": 2.510571241378784 + }, + { + "auxiliary_loss_clip": 0.01182279, + "auxiliary_loss_mlp": 0.01033484, + "balance_loss_clip": 1.05682516, + "balance_loss_mlp": 1.02364922, + "epoch": 0.17928214994288463, + "flos": 25152120961920.0, + "grad_norm": 2.9975842512507205, + "language_loss": 0.75470787, + "learning_rate": 3.7708842038827775e-06, + "loss": 0.77686548, + "num_input_tokens_seen": 31492605, + "step": 1491, + "time_per_iteration": 2.5920705795288086 + }, + { + "auxiliary_loss_clip": 0.01204698, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.05950236, + "balance_loss_mlp": 1.02516794, + "epoch": 0.17940239283352372, + "flos": 22384629786240.0, + "grad_norm": 1.6416110927524108, + "language_loss": 0.85884148, + "learning_rate": 3.770522042731288e-06, + "loss": 0.88123226, + "num_input_tokens_seen": 31514500, + "step": 1492, + "time_per_iteration": 2.5195505619049072 + }, + { + "auxiliary_loss_clip": 0.01154453, + "auxiliary_loss_mlp": 0.01040935, + "balance_loss_clip": 1.05757678, + "balance_loss_mlp": 1.03116608, + "epoch": 0.1795226357241628, + "flos": 23178685795200.0, + "grad_norm": 2.161096731410261, + "language_loss": 0.87634653, + "learning_rate": 3.7701596129930122e-06, + "loss": 0.89830041, + "num_input_tokens_seen": 31533225, + "step": 1493, + "time_per_iteration": 2.6023898124694824 + }, + { + "auxiliary_loss_clip": 0.0118545, + "auxiliary_loss_mlp": 0.01027951, + "balance_loss_clip": 1.06008065, + "balance_loss_mlp": 1.01744854, + "epoch": 0.1796428786148019, + "flos": 22090413484800.0, + "grad_norm": 2.034837794727175, + "language_loss": 0.7340591, + "learning_rate": 3.7697969147229315e-06, + "loss": 0.75619316, + "num_input_tokens_seen": 31551385, + "step": 1494, + "time_per_iteration": 2.573099374771118 + }, + { + "auxiliary_loss_clip": 0.01202941, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.0608778, + "balance_loss_mlp": 1.02450728, + "epoch": 0.179763121505441, + "flos": 21324618501120.0, + "grad_norm": 1.9119513929350358, + "language_loss": 0.85520071, + "learning_rate": 3.7694339479760647e-06, + "loss": 0.87757063, + "num_input_tokens_seen": 31570415, + "step": 1495, + "time_per_iteration": 2.494835615158081 + }, + { + "auxiliary_loss_clip": 0.01097037, + "auxiliary_loss_mlp": 0.01002217, + "balance_loss_clip": 1.02870297, + "balance_loss_mlp": 0.99999982, + "epoch": 0.17988336439608008, + "flos": 68161864815360.0, + "grad_norm": 0.8043142506521495, + "language_loss": 0.57378113, + "learning_rate": 3.769070712807476e-06, + "loss": 0.59477365, + "num_input_tokens_seen": 31632445, + "step": 1496, + "time_per_iteration": 3.166612386703491 + }, + { + "auxiliary_loss_clip": 0.01135806, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.05656028, + "balance_loss_mlp": 1.02447271, + "epoch": 0.18000360728671919, + "flos": 21945047143680.0, + "grad_norm": 1.8920960818279546, + "language_loss": 0.78881741, + "learning_rate": 3.768707209272266e-06, + "loss": 0.81051904, + "num_input_tokens_seen": 31652575, + "step": 1497, + "time_per_iteration": 2.63382887840271 + }, + { + "auxiliary_loss_clip": 0.01186111, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.05747318, + "balance_loss_mlp": 1.02401853, + "epoch": 0.18012385017735827, + "flos": 18986330937600.0, + "grad_norm": 3.5916437821551024, + "language_loss": 0.76598775, + "learning_rate": 3.768343437425579e-06, + "loss": 0.78818864, + "num_input_tokens_seen": 31671145, + "step": 1498, + "time_per_iteration": 2.5113699436187744 + }, + { + "auxiliary_loss_clip": 0.01124702, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.05288827, + "balance_loss_mlp": 1.02244711, + "epoch": 0.18024409306799735, + "flos": 19748103598080.0, + "grad_norm": 3.082484135683912, + "language_loss": 0.86178529, + "learning_rate": 3.7679793973225987e-06, + "loss": 0.88335311, + "num_input_tokens_seen": 31686955, + "step": 1499, + "time_per_iteration": 2.6698496341705322 + }, + { + "auxiliary_loss_clip": 0.01065659, + "auxiliary_loss_mlp": 0.01001859, + "balance_loss_clip": 1.02533233, + "balance_loss_mlp": 0.99966508, + "epoch": 0.18036433595863643, + "flos": 67227183060480.0, + "grad_norm": 0.8467228289448543, + "language_loss": 0.61599892, + "learning_rate": 3.767615089018549e-06, + "loss": 0.63667411, + "num_input_tokens_seen": 31749300, + "step": 1500, + "time_per_iteration": 3.142704486846924 + }, + { + "auxiliary_loss_clip": 0.01190067, + "auxiliary_loss_mlp": 0.01039008, + "balance_loss_clip": 1.06230676, + "balance_loss_mlp": 1.02882767, + "epoch": 0.18048457884927555, + "flos": 18181464935040.0, + "grad_norm": 2.7168561016596713, + "language_loss": 0.86306643, + "learning_rate": 3.7672505125686966e-06, + "loss": 0.88535726, + "num_input_tokens_seen": 31765665, + "step": 1501, + "time_per_iteration": 2.5316851139068604 + }, + { + "auxiliary_loss_clip": 0.01162156, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.05409706, + "balance_loss_mlp": 1.02574325, + "epoch": 0.18060482173991463, + "flos": 15813767111040.0, + "grad_norm": 3.9811257518782948, + "language_loss": 0.84464329, + "learning_rate": 3.7668856680283455e-06, + "loss": 0.86661917, + "num_input_tokens_seen": 31782690, + "step": 1502, + "time_per_iteration": 2.5909206867218018 + }, + { + "auxiliary_loss_clip": 0.01198633, + "auxiliary_loss_mlp": 0.01038552, + "balance_loss_clip": 1.0623548, + "balance_loss_mlp": 1.02878928, + "epoch": 0.1807250646305537, + "flos": 18587399512320.0, + "grad_norm": 2.0723239242589337, + "language_loss": 0.8230598, + "learning_rate": 3.7665205554528437e-06, + "loss": 0.84543163, + "num_input_tokens_seen": 31802045, + "step": 1503, + "time_per_iteration": 2.5402672290802 + }, + { + "auxiliary_loss_clip": 0.0119703, + "auxiliary_loss_mlp": 0.01028613, + "balance_loss_clip": 1.06598234, + "balance_loss_mlp": 1.01889098, + "epoch": 0.18084530752119282, + "flos": 23149131880320.0, + "grad_norm": 2.2472992806793766, + "language_loss": 0.74079037, + "learning_rate": 3.7661551748975782e-06, + "loss": 0.76304674, + "num_input_tokens_seen": 31820220, + "step": 1504, + "time_per_iteration": 2.548757553100586 + }, + { + "auxiliary_loss_clip": 0.01098954, + "auxiliary_loss_mlp": 0.01006058, + "balance_loss_clip": 1.02842176, + "balance_loss_mlp": 1.00388873, + "epoch": 0.1809655504118319, + "flos": 59803153568640.0, + "grad_norm": 0.8151950273048644, + "language_loss": 0.60489517, + "learning_rate": 3.7657895264179772e-06, + "loss": 0.62594533, + "num_input_tokens_seen": 31876195, + "step": 1505, + "time_per_iteration": 3.0946011543273926 + }, + { + "auxiliary_loss_clip": 0.01183983, + "auxiliary_loss_mlp": 0.01032143, + "balance_loss_clip": 1.05749536, + "balance_loss_mlp": 1.02307749, + "epoch": 0.181085793302471, + "flos": 44201941188480.0, + "grad_norm": 1.7128033801754456, + "language_loss": 0.74537265, + "learning_rate": 3.765423610069509e-06, + "loss": 0.7675339, + "num_input_tokens_seen": 31901585, + "step": 1506, + "time_per_iteration": 2.7293856143951416 + }, + { + "auxiliary_loss_clip": 0.01193125, + "auxiliary_loss_mlp": 0.01034879, + "balance_loss_clip": 1.06225181, + "balance_loss_mlp": 1.02513933, + "epoch": 0.18120603619311007, + "flos": 34898384638080.0, + "grad_norm": 2.1563756484350294, + "language_loss": 0.72380459, + "learning_rate": 3.765057425907683e-06, + "loss": 0.74608463, + "num_input_tokens_seen": 31923045, + "step": 1507, + "time_per_iteration": 2.6435627937316895 + }, + { + "auxiliary_loss_clip": 0.01208504, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.06095719, + "balance_loss_mlp": 1.022668, + "epoch": 0.18132627908374918, + "flos": 21506757390720.0, + "grad_norm": 1.8680801481610392, + "language_loss": 0.78048265, + "learning_rate": 3.764690973988048e-06, + "loss": 0.80289716, + "num_input_tokens_seen": 31943385, + "step": 1508, + "time_per_iteration": 2.5210845470428467 + }, + { + "auxiliary_loss_clip": 0.01180216, + "auxiliary_loss_mlp": 0.01029828, + "balance_loss_clip": 1.05831242, + "balance_loss_mlp": 1.02035141, + "epoch": 0.18144652197438826, + "flos": 29057693633280.0, + "grad_norm": 1.860476750718251, + "language_loss": 0.73893833, + "learning_rate": 3.7643242543661967e-06, + "loss": 0.76103878, + "num_input_tokens_seen": 31966045, + "step": 1509, + "time_per_iteration": 2.6721463203430176 + }, + { + "auxiliary_loss_clip": 0.01084399, + "auxiliary_loss_mlp": 0.0100702, + "balance_loss_clip": 1.02275157, + "balance_loss_mlp": 1.00483894, + "epoch": 0.18156676486502735, + "flos": 68675064382080.0, + "grad_norm": 1.0140462444195548, + "language_loss": 0.60482228, + "learning_rate": 3.7639572670977573e-06, + "loss": 0.62573647, + "num_input_tokens_seen": 32021540, + "step": 1510, + "time_per_iteration": 3.013326644897461 + }, + { + "auxiliary_loss_clip": 0.01178305, + "auxiliary_loss_mlp": 0.01035434, + "balance_loss_clip": 1.05647409, + "balance_loss_mlp": 1.02550983, + "epoch": 0.18168700775566646, + "flos": 26471515334400.0, + "grad_norm": 1.6010038141171061, + "language_loss": 0.76647502, + "learning_rate": 3.7635900122384042e-06, + "loss": 0.78861237, + "num_input_tokens_seen": 32044535, + "step": 1511, + "time_per_iteration": 2.6274631023406982 + }, + { + "auxiliary_loss_clip": 0.01194001, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.05874395, + "balance_loss_mlp": 1.02500701, + "epoch": 0.18180725064630554, + "flos": 15005668884480.0, + "grad_norm": 3.250608705809074, + "language_loss": 0.86155397, + "learning_rate": 3.7632224898438477e-06, + "loss": 0.88385361, + "num_input_tokens_seen": 32061010, + "step": 1512, + "time_per_iteration": 3.273439645767212 + }, + { + "auxiliary_loss_clip": 0.01181503, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.05661368, + "balance_loss_mlp": 1.02537799, + "epoch": 0.18192749353694462, + "flos": 19682387665920.0, + "grad_norm": 1.6942831506585407, + "language_loss": 0.78898609, + "learning_rate": 3.762854699969842e-06, + "loss": 0.81115246, + "num_input_tokens_seen": 32081520, + "step": 1513, + "time_per_iteration": 2.587843894958496 + }, + { + "auxiliary_loss_clip": 0.01204897, + "auxiliary_loss_mlp": 0.01040096, + "balance_loss_clip": 1.06321013, + "balance_loss_mlp": 1.02893245, + "epoch": 0.1820477364275837, + "flos": 20702717400960.0, + "grad_norm": 1.843684225576671, + "language_loss": 0.73179913, + "learning_rate": 3.762486642672179e-06, + "loss": 0.7542491, + "num_input_tokens_seen": 32098460, + "step": 1514, + "time_per_iteration": 3.995154619216919 + }, + { + "auxiliary_loss_clip": 0.01190051, + "auxiliary_loss_mlp": 0.01036621, + "balance_loss_clip": 1.05828667, + "balance_loss_mlp": 1.02651167, + "epoch": 0.18216797931822282, + "flos": 17128708197120.0, + "grad_norm": 1.9992363907895776, + "language_loss": 0.86703825, + "learning_rate": 3.7621183180066946e-06, + "loss": 0.88930494, + "num_input_tokens_seen": 32116420, + "step": 1515, + "time_per_iteration": 3.2694735527038574 + }, + { + "auxiliary_loss_clip": 0.01191011, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.05794668, + "balance_loss_mlp": 1.02329934, + "epoch": 0.1822882222088619, + "flos": 29242561956480.0, + "grad_norm": 1.7233647621152608, + "language_loss": 0.73540294, + "learning_rate": 3.7617497260292625e-06, + "loss": 0.75764787, + "num_input_tokens_seen": 32138475, + "step": 1516, + "time_per_iteration": 2.6051995754241943 + }, + { + "auxiliary_loss_clip": 0.0118447, + "auxiliary_loss_mlp": 0.010319, + "balance_loss_clip": 1.0594039, + "balance_loss_mlp": 1.02111697, + "epoch": 0.18240846509950098, + "flos": 17702739446400.0, + "grad_norm": 2.7588323652151563, + "language_loss": 0.78927094, + "learning_rate": 3.7613808667957967e-06, + "loss": 0.81143463, + "num_input_tokens_seen": 32151165, + "step": 1517, + "time_per_iteration": 2.476970911026001 + }, + { + "auxiliary_loss_clip": 0.01193867, + "auxiliary_loss_mlp": 0.0104085, + "balance_loss_clip": 1.05965662, + "balance_loss_mlp": 1.03053808, + "epoch": 0.1825287079901401, + "flos": 14790025584000.0, + "grad_norm": 2.655432086620798, + "language_loss": 0.90780365, + "learning_rate": 3.7610117403622547e-06, + "loss": 0.93015075, + "num_input_tokens_seen": 32167725, + "step": 1518, + "time_per_iteration": 2.4960196018218994 + }, + { + "auxiliary_loss_clip": 0.01168332, + "auxiliary_loss_mlp": 0.01039402, + "balance_loss_clip": 1.05380726, + "balance_loss_mlp": 1.02905464, + "epoch": 0.18264895088077918, + "flos": 21946232292480.0, + "grad_norm": 1.7844730475998043, + "language_loss": 0.89717704, + "learning_rate": 3.7606423467846313e-06, + "loss": 0.91925442, + "num_input_tokens_seen": 32187330, + "step": 1519, + "time_per_iteration": 2.5872974395751953 + }, + { + "auxiliary_loss_clip": 0.01185085, + "auxiliary_loss_mlp": 0.01040615, + "balance_loss_clip": 1.06174278, + "balance_loss_mlp": 1.03036904, + "epoch": 0.18276919377141826, + "flos": 20886759711360.0, + "grad_norm": 1.523319724513717, + "language_loss": 0.79693818, + "learning_rate": 3.760272686118964e-06, + "loss": 0.81919515, + "num_input_tokens_seen": 32205550, + "step": 1520, + "time_per_iteration": 2.5665860176086426 + }, + { + "auxiliary_loss_clip": 0.01192698, + "auxiliary_loss_mlp": 0.01036453, + "balance_loss_clip": 1.05847943, + "balance_loss_mlp": 1.0265764, + "epoch": 0.18288943666205737, + "flos": 21469877101440.0, + "grad_norm": 2.0392116591179414, + "language_loss": 0.92427909, + "learning_rate": 3.7599027584213297e-06, + "loss": 0.94657058, + "num_input_tokens_seen": 32224430, + "step": 1521, + "time_per_iteration": 2.550670623779297 + }, + { + "auxiliary_loss_clip": 0.01211864, + "auxiliary_loss_mlp": 0.01037695, + "balance_loss_clip": 1.06073928, + "balance_loss_mlp": 1.02718639, + "epoch": 0.18300967955269645, + "flos": 21539363961600.0, + "grad_norm": 1.921490187140876, + "language_loss": 0.78499776, + "learning_rate": 3.7595325637478465e-06, + "loss": 0.80749333, + "num_input_tokens_seen": 32242455, + "step": 1522, + "time_per_iteration": 2.49302339553833 + }, + { + "auxiliary_loss_clip": 0.01181531, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.05859268, + "balance_loss_mlp": 1.02857471, + "epoch": 0.18312992244333554, + "flos": 28876237102080.0, + "grad_norm": 2.1436204187089394, + "language_loss": 0.81630552, + "learning_rate": 3.7591621021546723e-06, + "loss": 0.83851969, + "num_input_tokens_seen": 32264450, + "step": 1523, + "time_per_iteration": 2.6707770824432373 + }, + { + "auxiliary_loss_clip": 0.01199791, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.05883396, + "balance_loss_mlp": 1.02181172, + "epoch": 0.18325016533397462, + "flos": 20120102801280.0, + "grad_norm": 2.054582317976788, + "language_loss": 0.81301403, + "learning_rate": 3.7587913736980062e-06, + "loss": 0.83534491, + "num_input_tokens_seen": 32284090, + "step": 1524, + "time_per_iteration": 2.512779474258423 + }, + { + "auxiliary_loss_clip": 0.0112991, + "auxiliary_loss_mlp": 0.01034073, + "balance_loss_clip": 1.04937124, + "balance_loss_mlp": 1.02387488, + "epoch": 0.18337040822461373, + "flos": 23329187781120.0, + "grad_norm": 1.6646510178698466, + "language_loss": 0.84540337, + "learning_rate": 3.7584203784340865e-06, + "loss": 0.86704326, + "num_input_tokens_seen": 32303260, + "step": 1525, + "time_per_iteration": 2.644603967666626 + }, + { + "auxiliary_loss_clip": 0.0118665, + "auxiliary_loss_mlp": 0.01037431, + "balance_loss_clip": 1.05578923, + "balance_loss_mlp": 1.02710724, + "epoch": 0.1834906511152528, + "flos": 25009555881600.0, + "grad_norm": 2.0517193779853233, + "language_loss": 0.85431719, + "learning_rate": 3.7580491164191938e-06, + "loss": 0.87655795, + "num_input_tokens_seen": 32321570, + "step": 1526, + "time_per_iteration": 2.560609817504883 + }, + { + "auxiliary_loss_clip": 0.01111074, + "auxiliary_loss_mlp": 0.01009516, + "balance_loss_clip": 1.02890611, + "balance_loss_mlp": 1.00723934, + "epoch": 0.1836108940058919, + "flos": 67251493589760.0, + "grad_norm": 0.7408731567461769, + "language_loss": 0.61230671, + "learning_rate": 3.757677587709648e-06, + "loss": 0.63351262, + "num_input_tokens_seen": 32384835, + "step": 1527, + "time_per_iteration": 3.184701681137085 + }, + { + "auxiliary_loss_clip": 0.01173861, + "auxiliary_loss_mlp": 0.01035719, + "balance_loss_clip": 1.05965459, + "balance_loss_mlp": 1.02570558, + "epoch": 0.183731136896531, + "flos": 25738721971200.0, + "grad_norm": 1.8020496759229, + "language_loss": 0.75663722, + "learning_rate": 3.7573057923618095e-06, + "loss": 0.77873302, + "num_input_tokens_seen": 32404930, + "step": 1528, + "time_per_iteration": 2.6806342601776123 + }, + { + "auxiliary_loss_clip": 0.01156507, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.05078459, + "balance_loss_mlp": 1.02379227, + "epoch": 0.1838513797871701, + "flos": 20449403712000.0, + "grad_norm": 2.688199568882139, + "language_loss": 0.73994422, + "learning_rate": 3.7569337304320793e-06, + "loss": 0.76185131, + "num_input_tokens_seen": 32424515, + "step": 1529, + "time_per_iteration": 2.6626768112182617 + }, + { + "auxiliary_loss_clip": 0.01091932, + "auxiliary_loss_mlp": 0.01004881, + "balance_loss_clip": 1.02454042, + "balance_loss_mlp": 1.00263977, + "epoch": 0.18397162267780917, + "flos": 68565141786240.0, + "grad_norm": 0.8343523891892647, + "language_loss": 0.64455348, + "learning_rate": 3.756561401976899e-06, + "loss": 0.66552162, + "num_input_tokens_seen": 32484220, + "step": 1530, + "time_per_iteration": 2.9774651527404785 + }, + { + "auxiliary_loss_clip": 0.01223253, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.06390262, + "balance_loss_mlp": 1.02502453, + "epoch": 0.18409186556844825, + "flos": 31941104976000.0, + "grad_norm": 1.8758117518604756, + "language_loss": 0.82780576, + "learning_rate": 3.7561888070527514e-06, + "loss": 0.85038757, + "num_input_tokens_seen": 32506260, + "step": 1531, + "time_per_iteration": 2.68009352684021 + }, + { + "auxiliary_loss_clip": 0.01160319, + "auxiliary_loss_mlp": 0.00764075, + "balance_loss_clip": 1.05540657, + "balance_loss_mlp": 1.00068426, + "epoch": 0.18421210845908736, + "flos": 20120533764480.0, + "grad_norm": 2.2714518181490093, + "language_loss": 0.79666388, + "learning_rate": 3.7558159457161577e-06, + "loss": 0.81590772, + "num_input_tokens_seen": 32524225, + "step": 1532, + "time_per_iteration": 2.7096025943756104 + }, + { + "auxiliary_loss_clip": 0.01194356, + "auxiliary_loss_mlp": 0.00764592, + "balance_loss_clip": 1.06234753, + "balance_loss_mlp": 1.00071347, + "epoch": 0.18433235134972645, + "flos": 23110491824640.0, + "grad_norm": 2.3693903837284527, + "language_loss": 0.78082556, + "learning_rate": 3.755442818023681e-06, + "loss": 0.80041504, + "num_input_tokens_seen": 32543850, + "step": 1533, + "time_per_iteration": 2.6788508892059326 + }, + { + "auxiliary_loss_clip": 0.01179965, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.06033993, + "balance_loss_mlp": 1.02357268, + "epoch": 0.18445259424036553, + "flos": 18291351617280.0, + "grad_norm": 7.243018380366637, + "language_loss": 0.75884104, + "learning_rate": 3.7550694240319246e-06, + "loss": 0.7809689, + "num_input_tokens_seen": 32561725, + "step": 1534, + "time_per_iteration": 2.701127052307129 + }, + { + "auxiliary_loss_clip": 0.01209862, + "auxiliary_loss_mlp": 0.01028746, + "balance_loss_clip": 1.06147552, + "balance_loss_mlp": 1.01919734, + "epoch": 0.18457283713100464, + "flos": 21324079797120.0, + "grad_norm": 2.148654524494574, + "language_loss": 0.76543683, + "learning_rate": 3.7546957637975326e-06, + "loss": 0.78782296, + "num_input_tokens_seen": 32579135, + "step": 1535, + "time_per_iteration": 2.6282596588134766 + }, + { + "auxiliary_loss_clip": 0.01134158, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.0469507, + "balance_loss_mlp": 1.02248025, + "epoch": 0.18469308002164372, + "flos": 20375679047040.0, + "grad_norm": 1.5818530476985238, + "language_loss": 0.74027634, + "learning_rate": 3.7543218373771873e-06, + "loss": 0.76193899, + "num_input_tokens_seen": 32598460, + "step": 1536, + "time_per_iteration": 2.7320454120635986 + }, + { + "auxiliary_loss_clip": 0.01139817, + "auxiliary_loss_mlp": 0.00763936, + "balance_loss_clip": 1.05401158, + "balance_loss_mlp": 1.00070679, + "epoch": 0.1848133229122828, + "flos": 26435892021120.0, + "grad_norm": 1.4732932349375003, + "language_loss": 0.78230953, + "learning_rate": 3.753947644827615e-06, + "loss": 0.80134714, + "num_input_tokens_seen": 32621920, + "step": 1537, + "time_per_iteration": 2.8139865398406982 + }, + { + "auxiliary_loss_clip": 0.01098142, + "auxiliary_loss_mlp": 0.01016022, + "balance_loss_clip": 1.02683163, + "balance_loss_mlp": 1.01376939, + "epoch": 0.1849335658029219, + "flos": 70547447612160.0, + "grad_norm": 0.9146108589512199, + "language_loss": 0.57235128, + "learning_rate": 3.753573186205579e-06, + "loss": 0.59349293, + "num_input_tokens_seen": 32690040, + "step": 1538, + "time_per_iteration": 3.2632219791412354 + }, + { + "auxiliary_loss_clip": 0.01179731, + "auxiliary_loss_mlp": 0.00764146, + "balance_loss_clip": 1.05598855, + "balance_loss_mlp": 1.00062943, + "epoch": 0.185053808693561, + "flos": 17384140788480.0, + "grad_norm": 2.3424513786564147, + "language_loss": 0.77830988, + "learning_rate": 3.753198461567885e-06, + "loss": 0.79774863, + "num_input_tokens_seen": 32707285, + "step": 1539, + "time_per_iteration": 3.2775049209594727 + }, + { + "auxiliary_loss_clip": 0.01171413, + "auxiliary_loss_mlp": 0.01038525, + "balance_loss_clip": 1.06024003, + "balance_loss_mlp": 1.02901196, + "epoch": 0.18517405158420008, + "flos": 28986159697920.0, + "grad_norm": 1.7355574878687965, + "language_loss": 0.92044675, + "learning_rate": 3.7528234709713783e-06, + "loss": 0.94254619, + "num_input_tokens_seen": 32730030, + "step": 1540, + "time_per_iteration": 3.4106578826904297 + }, + { + "auxiliary_loss_clip": 0.01208888, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.06407118, + "balance_loss_mlp": 1.02542436, + "epoch": 0.18529429447483917, + "flos": 26794962328320.0, + "grad_norm": 2.034108533288698, + "language_loss": 0.84353328, + "learning_rate": 3.7524482144729447e-06, + "loss": 0.86597282, + "num_input_tokens_seen": 32749485, + "step": 1541, + "time_per_iteration": 4.0467376708984375 + }, + { + "auxiliary_loss_clip": 0.01170519, + "auxiliary_loss_mlp": 0.01044657, + "balance_loss_clip": 1.0548315, + "balance_loss_mlp": 1.03438115, + "epoch": 0.18541453736547828, + "flos": 13581595301760.0, + "grad_norm": 2.7230329615418367, + "language_loss": 0.83670223, + "learning_rate": 3.7520726921295106e-06, + "loss": 0.858854, + "num_input_tokens_seen": 32766205, + "step": 1542, + "time_per_iteration": 2.5492055416107178 + }, + { + "auxiliary_loss_clip": 0.01199338, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.05573952, + "balance_loss_mlp": 1.02517676, + "epoch": 0.18553478025611736, + "flos": 24025424077440.0, + "grad_norm": 1.7350773441609353, + "language_loss": 0.72017539, + "learning_rate": 3.751696903998042e-06, + "loss": 0.74251747, + "num_input_tokens_seen": 32784840, + "step": 1543, + "time_per_iteration": 2.5165188312530518 + }, + { + "auxiliary_loss_clip": 0.01203178, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.06166673, + "balance_loss_mlp": 1.02359188, + "epoch": 0.18565502314675644, + "flos": 25885165720320.0, + "grad_norm": 2.161927469521582, + "language_loss": 0.69962597, + "learning_rate": 3.7513208501355456e-06, + "loss": 0.72199297, + "num_input_tokens_seen": 32805945, + "step": 1544, + "time_per_iteration": 2.5203258991241455 + }, + { + "auxiliary_loss_clip": 0.01185924, + "auxiliary_loss_mlp": 0.01034672, + "balance_loss_clip": 1.05723393, + "balance_loss_mlp": 1.02527857, + "epoch": 0.18577526603739553, + "flos": 19610063631360.0, + "grad_norm": 1.7789276228578468, + "language_loss": 0.83895981, + "learning_rate": 3.750944530599069e-06, + "loss": 0.86116576, + "num_input_tokens_seen": 32825515, + "step": 1545, + "time_per_iteration": 2.534999370574951 + }, + { + "auxiliary_loss_clip": 0.01211582, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.06288302, + "balance_loss_mlp": 1.02401626, + "epoch": 0.18589550892803464, + "flos": 18474891137280.0, + "grad_norm": 2.4406393180127415, + "language_loss": 0.80830604, + "learning_rate": 3.7505679454456992e-06, + "loss": 0.83076346, + "num_input_tokens_seen": 32842125, + "step": 1546, + "time_per_iteration": 2.4547977447509766 + }, + { + "auxiliary_loss_clip": 0.01123908, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.04932976, + "balance_loss_mlp": 1.02117395, + "epoch": 0.18601575181867372, + "flos": 23549966726400.0, + "grad_norm": 2.137828224311936, + "language_loss": 0.70841169, + "learning_rate": 3.750191094732564e-06, + "loss": 0.72996426, + "num_input_tokens_seen": 32862990, + "step": 1547, + "time_per_iteration": 2.7168071269989014 + }, + { + "auxiliary_loss_clip": 0.01125324, + "auxiliary_loss_mlp": 0.00764462, + "balance_loss_clip": 1.04982042, + "balance_loss_mlp": 1.00060499, + "epoch": 0.1861359947093128, + "flos": 26360192108160.0, + "grad_norm": 1.867496719703348, + "language_loss": 0.75466788, + "learning_rate": 3.7498139785168313e-06, + "loss": 0.77356571, + "num_input_tokens_seen": 32883595, + "step": 1548, + "time_per_iteration": 2.7315075397491455 + }, + { + "auxiliary_loss_clip": 0.01203864, + "auxiliary_loss_mlp": 0.0103957, + "balance_loss_clip": 1.06324744, + "balance_loss_mlp": 1.02916312, + "epoch": 0.1862562375999519, + "flos": 23331198942720.0, + "grad_norm": 1.9752161702777165, + "language_loss": 0.77308905, + "learning_rate": 3.749436596855709e-06, + "loss": 0.79552341, + "num_input_tokens_seen": 32902895, + "step": 1549, + "time_per_iteration": 2.4969980716705322 + }, + { + "auxiliary_loss_clip": 0.01197772, + "auxiliary_loss_mlp": 0.01032119, + "balance_loss_clip": 1.05735612, + "balance_loss_mlp": 1.02168262, + "epoch": 0.186376480490591, + "flos": 16648222942080.0, + "grad_norm": 1.9914307259612396, + "language_loss": 0.90828818, + "learning_rate": 3.749058949806446e-06, + "loss": 0.93058711, + "num_input_tokens_seen": 32919620, + "step": 1550, + "time_per_iteration": 2.4717626571655273 + }, + { + "auxiliary_loss_clip": 0.01203516, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.05853784, + "balance_loss_mlp": 1.02253008, + "epoch": 0.18649672338123008, + "flos": 21468656039040.0, + "grad_norm": 1.8086731494923844, + "language_loss": 0.84112746, + "learning_rate": 3.748681037426331e-06, + "loss": 0.86348152, + "num_input_tokens_seen": 32938830, + "step": 1551, + "time_per_iteration": 2.494406223297119 + }, + { + "auxiliary_loss_clip": 0.01220437, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.06319332, + "balance_loss_mlp": 1.02804184, + "epoch": 0.1866169662718692, + "flos": 12312728386560.0, + "grad_norm": 2.2326123203823456, + "language_loss": 0.91934258, + "learning_rate": 3.7483028597726936e-06, + "loss": 0.94191933, + "num_input_tokens_seen": 32955600, + "step": 1552, + "time_per_iteration": 2.413168430328369 + }, + { + "auxiliary_loss_clip": 0.01170861, + "auxiliary_loss_mlp": 0.01038282, + "balance_loss_clip": 1.05592728, + "balance_loss_mlp": 1.02816081, + "epoch": 0.18673720916250827, + "flos": 23581280407680.0, + "grad_norm": 2.3162772200575787, + "language_loss": 0.62384784, + "learning_rate": 3.7479244169029017e-06, + "loss": 0.64593935, + "num_input_tokens_seen": 32975390, + "step": 1553, + "time_per_iteration": 2.589843273162842 + }, + { + "auxiliary_loss_clip": 0.01205094, + "auxiliary_loss_mlp": 0.01027774, + "balance_loss_clip": 1.05670881, + "balance_loss_mlp": 1.01836276, + "epoch": 0.18685745205314735, + "flos": 19718370115200.0, + "grad_norm": 3.820829610079235, + "language_loss": 0.7325651, + "learning_rate": 3.7475457088743658e-06, + "loss": 0.75489378, + "num_input_tokens_seen": 32992640, + "step": 1554, + "time_per_iteration": 2.477201461791992 + }, + { + "auxiliary_loss_clip": 0.01181741, + "auxiliary_loss_mlp": 0.0104001, + "balance_loss_clip": 1.05854392, + "balance_loss_mlp": 1.02921557, + "epoch": 0.18697769494378644, + "flos": 34204123589760.0, + "grad_norm": 2.4575739915819206, + "language_loss": 0.74935174, + "learning_rate": 3.7471667357445348e-06, + "loss": 0.77156925, + "num_input_tokens_seen": 33012470, + "step": 1555, + "time_per_iteration": 2.6245012283325195 + }, + { + "auxiliary_loss_clip": 0.01140792, + "auxiliary_loss_mlp": 0.01025924, + "balance_loss_clip": 1.05194426, + "balance_loss_mlp": 1.01692975, + "epoch": 0.18709793783442555, + "flos": 34241327101440.0, + "grad_norm": 1.7429355029210511, + "language_loss": 0.72416109, + "learning_rate": 3.7467874975709e-06, + "loss": 0.74582827, + "num_input_tokens_seen": 33033275, + "step": 1556, + "time_per_iteration": 2.760094165802002 + }, + { + "auxiliary_loss_clip": 0.01211097, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.06342816, + "balance_loss_mlp": 1.03103709, + "epoch": 0.18721818072506463, + "flos": 40734550529280.0, + "grad_norm": 2.217971135789645, + "language_loss": 0.78057998, + "learning_rate": 3.7464079944109904e-06, + "loss": 0.80310321, + "num_input_tokens_seen": 33055135, + "step": 1557, + "time_per_iteration": 2.704882860183716 + }, + { + "auxiliary_loss_clip": 0.01175581, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.05579972, + "balance_loss_mlp": 1.02408242, + "epoch": 0.18733842361570371, + "flos": 22157386392960.0, + "grad_norm": 1.8354229278334275, + "language_loss": 0.77411747, + "learning_rate": 3.746028226322376e-06, + "loss": 0.79620922, + "num_input_tokens_seen": 33071015, + "step": 1558, + "time_per_iteration": 2.6040945053100586 + }, + { + "auxiliary_loss_clip": 0.01185415, + "auxiliary_loss_mlp": 0.01031371, + "balance_loss_clip": 1.057657, + "balance_loss_mlp": 1.02214479, + "epoch": 0.18745866650634282, + "flos": 18914940656640.0, + "grad_norm": 2.054861979023608, + "language_loss": 0.75861967, + "learning_rate": 3.745648193362669e-06, + "loss": 0.78078759, + "num_input_tokens_seen": 33090370, + "step": 1559, + "time_per_iteration": 2.5442733764648438 + }, + { + "auxiliary_loss_clip": 0.01191, + "auxiliary_loss_mlp": 0.01035031, + "balance_loss_clip": 1.05874324, + "balance_loss_mlp": 1.02609634, + "epoch": 0.1875789093969819, + "flos": 19314626267520.0, + "grad_norm": 2.9034152997110683, + "language_loss": 0.72469854, + "learning_rate": 3.745267895589518e-06, + "loss": 0.74695891, + "num_input_tokens_seen": 33108910, + "step": 1560, + "time_per_iteration": 2.5363070964813232 + }, + { + "auxiliary_loss_clip": 0.01188493, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.0587635, + "balance_loss_mlp": 1.02212548, + "epoch": 0.187699152287621, + "flos": 17018965169280.0, + "grad_norm": 1.8673466887939256, + "language_loss": 0.82019901, + "learning_rate": 3.7448873330606154e-06, + "loss": 0.84240186, + "num_input_tokens_seen": 33126680, + "step": 1561, + "time_per_iteration": 2.5212693214416504 + }, + { + "auxiliary_loss_clip": 0.01169767, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_clip": 1.05980086, + "balance_loss_mlp": 1.0227071, + "epoch": 0.18781939517826007, + "flos": 22346384780160.0, + "grad_norm": 2.4248381783606088, + "language_loss": 0.87134302, + "learning_rate": 3.7445065058336914e-06, + "loss": 0.8933686, + "num_input_tokens_seen": 33145550, + "step": 1562, + "time_per_iteration": 2.5724403858184814 + }, + { + "auxiliary_loss_clip": 0.01143669, + "auxiliary_loss_mlp": 0.01027258, + "balance_loss_clip": 1.04834282, + "balance_loss_mlp": 1.01788259, + "epoch": 0.18793963806889918, + "flos": 14611478054400.0, + "grad_norm": 2.318196189999806, + "language_loss": 0.86344582, + "learning_rate": 3.7441254139665176e-06, + "loss": 0.88515508, + "num_input_tokens_seen": 33161735, + "step": 1563, + "time_per_iteration": 2.591139078140259 + }, + { + "auxiliary_loss_clip": 0.01220122, + "auxiliary_loss_mlp": 0.0103298, + "balance_loss_clip": 1.0650934, + "balance_loss_mlp": 1.02385473, + "epoch": 0.18805988095953827, + "flos": 17457075354240.0, + "grad_norm": 1.7188244603770093, + "language_loss": 0.82373667, + "learning_rate": 3.743744057516905e-06, + "loss": 0.8462677, + "num_input_tokens_seen": 33179795, + "step": 1564, + "time_per_iteration": 2.6258203983306885 + }, + { + "auxiliary_loss_clip": 0.01158134, + "auxiliary_loss_mlp": 0.01037779, + "balance_loss_clip": 1.05416679, + "balance_loss_mlp": 1.02776599, + "epoch": 0.18818012385017735, + "flos": 15043877976960.0, + "grad_norm": 2.8755136388468125, + "language_loss": 0.87065095, + "learning_rate": 3.743362436542706e-06, + "loss": 0.89261007, + "num_input_tokens_seen": 33194485, + "step": 1565, + "time_per_iteration": 3.371915817260742 + }, + { + "auxiliary_loss_clip": 0.01217025, + "auxiliary_loss_mlp": 0.01030888, + "balance_loss_clip": 1.06147885, + "balance_loss_mlp": 1.02185214, + "epoch": 0.18830036674081646, + "flos": 47551975136640.0, + "grad_norm": 1.9044862914593303, + "language_loss": 0.76655054, + "learning_rate": 3.7429805511018115e-06, + "loss": 0.78902966, + "num_input_tokens_seen": 33216145, + "step": 1566, + "time_per_iteration": 2.6935555934906006 + }, + { + "auxiliary_loss_clip": 0.01171469, + "auxiliary_loss_mlp": 0.00764277, + "balance_loss_clip": 1.05890167, + "balance_loss_mlp": 1.00050879, + "epoch": 0.18842060963145554, + "flos": 30044626698240.0, + "grad_norm": 1.8828565242481399, + "language_loss": 0.77739727, + "learning_rate": 3.7425984012521524e-06, + "loss": 0.79675472, + "num_input_tokens_seen": 33236345, + "step": 1567, + "time_per_iteration": 3.4368321895599365 + }, + { + "auxiliary_loss_clip": 0.0108338, + "auxiliary_loss_mlp": 0.00755062, + "balance_loss_clip": 1.02790868, + "balance_loss_mlp": 1.00028598, + "epoch": 0.18854085252209463, + "flos": 70318372625280.0, + "grad_norm": 0.7445474067958946, + "language_loss": 0.60416055, + "learning_rate": 3.7422159870517025e-06, + "loss": 0.622545, + "num_input_tokens_seen": 33301600, + "step": 1568, + "time_per_iteration": 4.045297622680664 + }, + { + "auxiliary_loss_clip": 0.01182228, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.05602539, + "balance_loss_mlp": 1.01966512, + "epoch": 0.1886610954127337, + "flos": 21289318410240.0, + "grad_norm": 1.684302857204295, + "language_loss": 0.78573459, + "learning_rate": 3.7418333085584717e-06, + "loss": 0.80784768, + "num_input_tokens_seen": 33322785, + "step": 1569, + "time_per_iteration": 2.5888333320617676 + }, + { + "auxiliary_loss_clip": 0.01176512, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.06014562, + "balance_loss_mlp": 1.02373505, + "epoch": 0.18878133830337282, + "flos": 17266819991040.0, + "grad_norm": 2.181057457373363, + "language_loss": 0.90758812, + "learning_rate": 3.7414503658305128e-06, + "loss": 0.92968893, + "num_input_tokens_seen": 33340020, + "step": 1570, + "time_per_iteration": 2.5358362197875977 + }, + { + "auxiliary_loss_clip": 0.01162678, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.05128622, + "balance_loss_mlp": 1.02044129, + "epoch": 0.1889015811940119, + "flos": 25775207210880.0, + "grad_norm": 3.0043060209827077, + "language_loss": 0.77649057, + "learning_rate": 3.7410671589259185e-06, + "loss": 0.79841673, + "num_input_tokens_seen": 33358620, + "step": 1571, + "time_per_iteration": 2.6351916790008545 + }, + { + "auxiliary_loss_clip": 0.01221153, + "auxiliary_loss_mlp": 0.01036976, + "balance_loss_clip": 1.06404877, + "balance_loss_mlp": 1.02705204, + "epoch": 0.18902182408465099, + "flos": 21032197879680.0, + "grad_norm": 2.127268444482011, + "language_loss": 0.80048168, + "learning_rate": 3.7406836879028205e-06, + "loss": 0.82306296, + "num_input_tokens_seen": 33378845, + "step": 1572, + "time_per_iteration": 2.4830033779144287 + }, + { + "auxiliary_loss_clip": 0.01202852, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.06203222, + "balance_loss_mlp": 1.02395523, + "epoch": 0.1891420669752901, + "flos": 22272121411200.0, + "grad_norm": 2.0305732344449283, + "language_loss": 0.76503539, + "learning_rate": 3.7402999528193907e-06, + "loss": 0.7873987, + "num_input_tokens_seen": 33398345, + "step": 1573, + "time_per_iteration": 2.4889743328094482 + }, + { + "auxiliary_loss_clip": 0.01159032, + "auxiliary_loss_mlp": 0.00764043, + "balance_loss_clip": 1.05501628, + "balance_loss_mlp": 1.00056911, + "epoch": 0.18926230986592918, + "flos": 22017802141440.0, + "grad_norm": 2.920781578411857, + "language_loss": 0.85219938, + "learning_rate": 3.739915953733842e-06, + "loss": 0.87143016, + "num_input_tokens_seen": 33416390, + "step": 1574, + "time_per_iteration": 2.553729295730591 + }, + { + "auxiliary_loss_clip": 0.01214363, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.05977714, + "balance_loss_mlp": 1.02161026, + "epoch": 0.18938255275656826, + "flos": 24462672336000.0, + "grad_norm": 1.5983694674565347, + "language_loss": 0.81995702, + "learning_rate": 3.7395316907044264e-06, + "loss": 0.84240794, + "num_input_tokens_seen": 33437175, + "step": 1575, + "time_per_iteration": 2.4772682189941406 + }, + { + "auxiliary_loss_clip": 0.01201851, + "auxiliary_loss_mlp": 0.01034455, + "balance_loss_clip": 1.05857491, + "balance_loss_mlp": 1.02513301, + "epoch": 0.18950279564720737, + "flos": 24427049022720.0, + "grad_norm": 1.6558883137432678, + "language_loss": 0.79406571, + "learning_rate": 3.7391471637894364e-06, + "loss": 0.81642878, + "num_input_tokens_seen": 33459440, + "step": 1576, + "time_per_iteration": 2.6126320362091064 + }, + { + "auxiliary_loss_clip": 0.01174386, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.05459809, + "balance_loss_mlp": 1.02594805, + "epoch": 0.18962303853784646, + "flos": 19756291898880.0, + "grad_norm": 1.8260032136448086, + "language_loss": 0.84719336, + "learning_rate": 3.738762373047205e-06, + "loss": 0.86928749, + "num_input_tokens_seen": 33479360, + "step": 1577, + "time_per_iteration": 2.6067140102386475 + }, + { + "auxiliary_loss_clip": 0.01173961, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.05822206, + "balance_loss_mlp": 1.02257216, + "epoch": 0.18974328142848554, + "flos": 21032054225280.0, + "grad_norm": 1.9742703304708609, + "language_loss": 0.83235896, + "learning_rate": 3.738377318536103e-06, + "loss": 0.85441458, + "num_input_tokens_seen": 33499245, + "step": 1578, + "time_per_iteration": 2.591622829437256 + }, + { + "auxiliary_loss_clip": 0.01213849, + "auxiliary_loss_mlp": 0.0103218, + "balance_loss_clip": 1.06275678, + "balance_loss_mlp": 1.02374029, + "epoch": 0.18986352431912462, + "flos": 12966122736000.0, + "grad_norm": 4.52040763710509, + "language_loss": 0.71276993, + "learning_rate": 3.7379920003145447e-06, + "loss": 0.73523021, + "num_input_tokens_seen": 33513520, + "step": 1579, + "time_per_iteration": 2.4451022148132324 + }, + { + "auxiliary_loss_clip": 0.01181322, + "auxiliary_loss_mlp": 0.01035712, + "balance_loss_clip": 1.05965924, + "balance_loss_mlp": 1.02580571, + "epoch": 0.18998376720976373, + "flos": 23767908497280.0, + "grad_norm": 2.9501289515228915, + "language_loss": 0.83798349, + "learning_rate": 3.7376064184409817e-06, + "loss": 0.86015379, + "num_input_tokens_seen": 33533100, + "step": 1580, + "time_per_iteration": 2.5877296924591064 + }, + { + "auxiliary_loss_clip": 0.01184691, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.05882859, + "balance_loss_mlp": 1.02078104, + "epoch": 0.19010401010040281, + "flos": 22966023323520.0, + "grad_norm": 1.63066257724897, + "language_loss": 0.87088537, + "learning_rate": 3.7372205729739063e-06, + "loss": 0.89303529, + "num_input_tokens_seen": 33554915, + "step": 1581, + "time_per_iteration": 2.580444812774658 + }, + { + "auxiliary_loss_clip": 0.01204687, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.05977666, + "balance_loss_mlp": 1.02123928, + "epoch": 0.1902242529910419, + "flos": 19135647774720.0, + "grad_norm": 2.171098491451308, + "language_loss": 0.71977597, + "learning_rate": 3.7368344639718514e-06, + "loss": 0.74213588, + "num_input_tokens_seen": 33572850, + "step": 1582, + "time_per_iteration": 2.485507011413574 + }, + { + "auxiliary_loss_clip": 0.01201868, + "auxiliary_loss_mlp": 0.01039513, + "balance_loss_clip": 1.05908895, + "balance_loss_mlp": 1.03102541, + "epoch": 0.190344495881681, + "flos": 25483935824640.0, + "grad_norm": 1.5175456357153028, + "language_loss": 0.80501449, + "learning_rate": 3.7364480914933895e-06, + "loss": 0.82742828, + "num_input_tokens_seen": 33593090, + "step": 1583, + "time_per_iteration": 2.5432379245758057 + }, + { + "auxiliary_loss_clip": 0.01154568, + "auxiliary_loss_mlp": 0.00763974, + "balance_loss_clip": 1.05485749, + "balance_loss_mlp": 1.00058591, + "epoch": 0.1904647387723201, + "flos": 26792843425920.0, + "grad_norm": 1.9794035993800763, + "language_loss": 0.81057316, + "learning_rate": 3.7360614555971325e-06, + "loss": 0.82975858, + "num_input_tokens_seen": 33612745, + "step": 1584, + "time_per_iteration": 2.6578099727630615 + }, + { + "auxiliary_loss_clip": 0.01199801, + "auxiliary_loss_mlp": 0.00763515, + "balance_loss_clip": 1.0592885, + "balance_loss_mlp": 1.00054049, + "epoch": 0.19058498166295917, + "flos": 23987753688960.0, + "grad_norm": 2.55029490077185, + "language_loss": 0.85114825, + "learning_rate": 3.735674556341733e-06, + "loss": 0.87078136, + "num_input_tokens_seen": 33632360, + "step": 1585, + "time_per_iteration": 2.532957077026367 + }, + { + "auxiliary_loss_clip": 0.01185098, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.06107903, + "balance_loss_mlp": 1.02586007, + "epoch": 0.19070522455359826, + "flos": 28293299280000.0, + "grad_norm": 3.005054909054161, + "language_loss": 0.82440722, + "learning_rate": 3.7352873937858835e-06, + "loss": 0.846609, + "num_input_tokens_seen": 33653895, + "step": 1586, + "time_per_iteration": 2.6022372245788574 + }, + { + "auxiliary_loss_clip": 0.01162444, + "auxiliary_loss_mlp": 0.00763974, + "balance_loss_clip": 1.05437827, + "balance_loss_mlp": 1.00054216, + "epoch": 0.19082546744423737, + "flos": 25660220797440.0, + "grad_norm": 1.9970406731060535, + "language_loss": 0.72234195, + "learning_rate": 3.734899967988316e-06, + "loss": 0.74160612, + "num_input_tokens_seen": 33672075, + "step": 1587, + "time_per_iteration": 2.6210989952087402 + }, + { + "auxiliary_loss_clip": 0.01161132, + "auxiliary_loss_mlp": 0.01029319, + "balance_loss_clip": 1.05246758, + "balance_loss_mlp": 1.0201633, + "epoch": 0.19094571033487645, + "flos": 19719483436800.0, + "grad_norm": 1.964453762381807, + "language_loss": 0.83706295, + "learning_rate": 3.7345122790078026e-06, + "loss": 0.85896748, + "num_input_tokens_seen": 33689640, + "step": 1588, + "time_per_iteration": 2.5807878971099854 + }, + { + "auxiliary_loss_clip": 0.01200905, + "auxiliary_loss_mlp": 0.01032037, + "balance_loss_clip": 1.06012559, + "balance_loss_mlp": 1.02204084, + "epoch": 0.19106595322551553, + "flos": 21616320850560.0, + "grad_norm": 4.9932016064105715, + "language_loss": 0.92868888, + "learning_rate": 3.7341243269031556e-06, + "loss": 0.95101839, + "num_input_tokens_seen": 33708630, + "step": 1589, + "time_per_iteration": 2.519620656967163 + }, + { + "auxiliary_loss_clip": 0.01176379, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.05703294, + "balance_loss_mlp": 1.02066255, + "epoch": 0.19118619611615464, + "flos": 29896890059520.0, + "grad_norm": 1.5907835102081458, + "language_loss": 0.77460718, + "learning_rate": 3.7337361117332275e-06, + "loss": 0.79666567, + "num_input_tokens_seen": 33730370, + "step": 1590, + "time_per_iteration": 2.610269069671631 + }, + { + "auxiliary_loss_clip": 0.01172143, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.0555135, + "balance_loss_mlp": 1.02288389, + "epoch": 0.19130643900679373, + "flos": 17273428093440.0, + "grad_norm": 1.9978850246252027, + "language_loss": 0.77002162, + "learning_rate": 3.7333476335569087e-06, + "loss": 0.7920599, + "num_input_tokens_seen": 33748370, + "step": 1591, + "time_per_iteration": 2.5594940185546875 + }, + { + "auxiliary_loss_clip": 0.01186856, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.06013024, + "balance_loss_mlp": 1.02106237, + "epoch": 0.1914266818974328, + "flos": 24826339584000.0, + "grad_norm": 3.1506834362039067, + "language_loss": 0.66854191, + "learning_rate": 3.7329588924331325e-06, + "loss": 0.69071764, + "num_input_tokens_seen": 33769575, + "step": 1592, + "time_per_iteration": 3.384814500808716 + }, + { + "auxiliary_loss_clip": 0.01161874, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.05242205, + "balance_loss_mlp": 1.02191377, + "epoch": 0.1915469247880719, + "flos": 18952467390720.0, + "grad_norm": 2.1724687647038285, + "language_loss": 0.82728612, + "learning_rate": 3.732569888420871e-06, + "loss": 0.84921783, + "num_input_tokens_seen": 33789110, + "step": 1593, + "time_per_iteration": 3.40179705619812 + }, + { + "auxiliary_loss_clip": 0.01217956, + "auxiliary_loss_mlp": 0.01032921, + "balance_loss_clip": 1.06150675, + "balance_loss_mlp": 1.02287161, + "epoch": 0.191667167678711, + "flos": 21032952065280.0, + "grad_norm": 1.8994210585745117, + "language_loss": 0.82789946, + "learning_rate": 3.732180621579134e-06, + "loss": 0.8504082, + "num_input_tokens_seen": 33808325, + "step": 1594, + "time_per_iteration": 2.4586501121520996 + }, + { + "auxiliary_loss_clip": 0.01181674, + "auxiliary_loss_mlp": 0.01033806, + "balance_loss_clip": 1.06018031, + "balance_loss_mlp": 1.02422178, + "epoch": 0.1917874105693501, + "flos": 34237663914240.0, + "grad_norm": 2.020832137515422, + "language_loss": 0.81080198, + "learning_rate": 3.7317910919669745e-06, + "loss": 0.83295679, + "num_input_tokens_seen": 33829520, + "step": 1595, + "time_per_iteration": 4.2925872802734375 + }, + { + "auxiliary_loss_clip": 0.01201219, + "auxiliary_loss_mlp": 0.01040272, + "balance_loss_clip": 1.06245661, + "balance_loss_mlp": 1.03063989, + "epoch": 0.19190765345998917, + "flos": 23550613171200.0, + "grad_norm": 2.3812233846871464, + "language_loss": 0.76375008, + "learning_rate": 3.7314012996434826e-06, + "loss": 0.786165, + "num_input_tokens_seen": 33848250, + "step": 1596, + "time_per_iteration": 2.490527629852295 + }, + { + "auxiliary_loss_clip": 0.01187699, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.06024396, + "balance_loss_mlp": 1.02060914, + "epoch": 0.19202789635062828, + "flos": 19861330245120.0, + "grad_norm": 2.9626655760912057, + "language_loss": 0.81292772, + "learning_rate": 3.7310112446677907e-06, + "loss": 0.83510458, + "num_input_tokens_seen": 33866160, + "step": 1597, + "time_per_iteration": 2.517540693283081 + }, + { + "auxiliary_loss_clip": 0.01219607, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.06476212, + "balance_loss_mlp": 1.02027369, + "epoch": 0.19214813924126736, + "flos": 20922957642240.0, + "grad_norm": 2.1312530830822642, + "language_loss": 0.68997359, + "learning_rate": 3.7306209270990695e-06, + "loss": 0.7124629, + "num_input_tokens_seen": 33884165, + "step": 1598, + "time_per_iteration": 2.4536960124969482 + }, + { + "auxiliary_loss_clip": 0.01185941, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.0593574, + "balance_loss_mlp": 1.02700603, + "epoch": 0.19226838213190645, + "flos": 26359725231360.0, + "grad_norm": 1.9123019699334654, + "language_loss": 0.86977851, + "learning_rate": 3.7302303469965292e-06, + "loss": 0.89199936, + "num_input_tokens_seen": 33903705, + "step": 1599, + "time_per_iteration": 2.5646965503692627 + }, + { + "auxiliary_loss_clip": 0.01201019, + "auxiliary_loss_mlp": 0.01036745, + "balance_loss_clip": 1.06149292, + "balance_loss_mlp": 1.02749455, + "epoch": 0.19238862502254553, + "flos": 20850525866880.0, + "grad_norm": 1.9100886665471861, + "language_loss": 0.7047652, + "learning_rate": 3.7298395044194206e-06, + "loss": 0.72714281, + "num_input_tokens_seen": 33922515, + "step": 1600, + "time_per_iteration": 2.4912052154541016 + }, + { + "auxiliary_loss_clip": 0.01221136, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.06645036, + "balance_loss_mlp": 1.0225668, + "epoch": 0.19250886791318464, + "flos": 21726063878400.0, + "grad_norm": 2.132101868079456, + "language_loss": 0.94403148, + "learning_rate": 3.7294483994270356e-06, + "loss": 0.96655929, + "num_input_tokens_seen": 33940840, + "step": 1601, + "time_per_iteration": 2.456775426864624 + }, + { + "auxiliary_loss_clip": 0.01144613, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.05309975, + "balance_loss_mlp": 1.02033293, + "epoch": 0.19262911080382372, + "flos": 23367827836800.0, + "grad_norm": 2.511493908268752, + "language_loss": 0.78138691, + "learning_rate": 3.7290570320787033e-06, + "loss": 0.8031165, + "num_input_tokens_seen": 33960420, + "step": 1602, + "time_per_iteration": 2.6451306343078613 + }, + { + "auxiliary_loss_clip": 0.01198717, + "auxiliary_loss_mlp": 0.01028592, + "balance_loss_clip": 1.06105781, + "balance_loss_mlp": 1.01948404, + "epoch": 0.1927493536944628, + "flos": 21943502858880.0, + "grad_norm": 1.8781472083369846, + "language_loss": 0.71293819, + "learning_rate": 3.728665402433793e-06, + "loss": 0.73521125, + "num_input_tokens_seen": 33978990, + "step": 1603, + "time_per_iteration": 2.53981876373291 + }, + { + "auxiliary_loss_clip": 0.011872, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.06172132, + "balance_loss_mlp": 1.02151155, + "epoch": 0.19286959658510192, + "flos": 16545590807040.0, + "grad_norm": 2.308360106896946, + "language_loss": 0.86256802, + "learning_rate": 3.7282735105517164e-06, + "loss": 0.8847422, + "num_input_tokens_seen": 33997115, + "step": 1604, + "time_per_iteration": 2.538268804550171 + }, + { + "auxiliary_loss_clip": 0.01163238, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.0533483, + "balance_loss_mlp": 1.02168679, + "epoch": 0.192989839475741, + "flos": 21616967295360.0, + "grad_norm": 1.920707076308608, + "language_loss": 0.67526257, + "learning_rate": 3.727881356491922e-06, + "loss": 0.6972062, + "num_input_tokens_seen": 34015525, + "step": 1605, + "time_per_iteration": 2.628882884979248 + }, + { + "auxiliary_loss_clip": 0.01217232, + "auxiliary_loss_mlp": 0.0103146, + "balance_loss_clip": 1.06507313, + "balance_loss_mlp": 1.023211, + "epoch": 0.19311008236638008, + "flos": 19281516906240.0, + "grad_norm": 2.252444080900672, + "language_loss": 0.75657284, + "learning_rate": 3.7274889403139002e-06, + "loss": 0.77905977, + "num_input_tokens_seen": 34033150, + "step": 1606, + "time_per_iteration": 2.488024950027466 + }, + { + "auxiliary_loss_clip": 0.01155068, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.05722642, + "balance_loss_mlp": 1.02144814, + "epoch": 0.1932303252570192, + "flos": 28652369587200.0, + "grad_norm": 2.1680497554511153, + "language_loss": 0.78321207, + "learning_rate": 3.727096262077179e-06, + "loss": 0.80506504, + "num_input_tokens_seen": 34052145, + "step": 1607, + "time_per_iteration": 2.6650302410125732 + }, + { + "auxiliary_loss_clip": 0.01202044, + "auxiliary_loss_mlp": 0.01029118, + "balance_loss_clip": 1.06120729, + "balance_loss_mlp": 1.02027869, + "epoch": 0.19335056814765827, + "flos": 18368990864640.0, + "grad_norm": 1.7635162005931657, + "language_loss": 0.85652536, + "learning_rate": 3.7267033218413285e-06, + "loss": 0.87883699, + "num_input_tokens_seen": 34069940, + "step": 1608, + "time_per_iteration": 2.5023279190063477 + }, + { + "auxiliary_loss_clip": 0.01143285, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.05090976, + "balance_loss_mlp": 1.02491248, + "epoch": 0.19347081103829736, + "flos": 13260877741440.0, + "grad_norm": 2.024657477418995, + "language_loss": 0.81175876, + "learning_rate": 3.726310119665957e-06, + "loss": 0.83354032, + "num_input_tokens_seen": 34086275, + "step": 1609, + "time_per_iteration": 2.64359974861145 + }, + { + "auxiliary_loss_clip": 0.01200634, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.0600071, + "balance_loss_mlp": 1.02203918, + "epoch": 0.19359105392893644, + "flos": 20300122788480.0, + "grad_norm": 1.7987723461752214, + "language_loss": 0.85547906, + "learning_rate": 3.725916655610713e-06, + "loss": 0.87779599, + "num_input_tokens_seen": 34105605, + "step": 1610, + "time_per_iteration": 2.559394121170044 + }, + { + "auxiliary_loss_clip": 0.01179786, + "auxiliary_loss_mlp": 0.0102948, + "balance_loss_clip": 1.05733716, + "balance_loss_mlp": 1.01996064, + "epoch": 0.19371129681957555, + "flos": 20484596062080.0, + "grad_norm": 2.4617495211709852, + "language_loss": 0.75207686, + "learning_rate": 3.725522929735284e-06, + "loss": 0.77416956, + "num_input_tokens_seen": 34122540, + "step": 1611, + "time_per_iteration": 2.5398318767547607 + }, + { + "auxiliary_loss_clip": 0.01191877, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.057235, + "balance_loss_mlp": 1.0201844, + "epoch": 0.19383153971021463, + "flos": 30445497457920.0, + "grad_norm": 2.100022521320054, + "language_loss": 0.74468136, + "learning_rate": 3.725128942099399e-06, + "loss": 0.766895, + "num_input_tokens_seen": 34142940, + "step": 1612, + "time_per_iteration": 2.6182045936584473 + }, + { + "auxiliary_loss_clip": 0.01177225, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.05694437, + "balance_loss_mlp": 1.02173877, + "epoch": 0.19395178260085372, + "flos": 24569937325440.0, + "grad_norm": 2.073351243441883, + "language_loss": 0.80170512, + "learning_rate": 3.7247346927628245e-06, + "loss": 0.82378614, + "num_input_tokens_seen": 34162875, + "step": 1613, + "time_per_iteration": 2.5784356594085693 + }, + { + "auxiliary_loss_clip": 0.01183131, + "auxiliary_loss_mlp": 0.00763912, + "balance_loss_clip": 1.05795002, + "balance_loss_mlp": 1.00071096, + "epoch": 0.19407202549149283, + "flos": 28950608211840.0, + "grad_norm": 1.767575810981389, + "language_loss": 0.79476094, + "learning_rate": 3.7243401817853694e-06, + "loss": 0.8142314, + "num_input_tokens_seen": 34183565, + "step": 1614, + "time_per_iteration": 2.6136717796325684 + }, + { + "auxiliary_loss_clip": 0.01195716, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.05950117, + "balance_loss_mlp": 1.02235353, + "epoch": 0.1941922683821319, + "flos": 18004497603840.0, + "grad_norm": 9.134276765810682, + "language_loss": 0.72079706, + "learning_rate": 3.723945409226879e-06, + "loss": 0.74306595, + "num_input_tokens_seen": 34202055, + "step": 1615, + "time_per_iteration": 2.492888927459717 + }, + { + "auxiliary_loss_clip": 0.01200159, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.06082368, + "balance_loss_mlp": 1.02605605, + "epoch": 0.194312511272771, + "flos": 9720337034880.0, + "grad_norm": 2.362612981519678, + "language_loss": 0.79882747, + "learning_rate": 3.723550375147241e-06, + "loss": 0.82118332, + "num_input_tokens_seen": 34216830, + "step": 1616, + "time_per_iteration": 2.4657514095306396 + }, + { + "auxiliary_loss_clip": 0.01156881, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.05163598, + "balance_loss_mlp": 1.02038026, + "epoch": 0.19443275416341008, + "flos": 27016208150400.0, + "grad_norm": 2.595897181932498, + "language_loss": 0.79870993, + "learning_rate": 3.7231550796063816e-06, + "loss": 0.8205772, + "num_input_tokens_seen": 34236840, + "step": 1617, + "time_per_iteration": 2.62611985206604 + }, + { + "auxiliary_loss_clip": 0.0119178, + "auxiliary_loss_mlp": 0.01035838, + "balance_loss_clip": 1.05996287, + "balance_loss_mlp": 1.02599144, + "epoch": 0.1945529970540492, + "flos": 15846625077120.0, + "grad_norm": 2.2725014390207385, + "language_loss": 0.64923573, + "learning_rate": 3.722759522664266e-06, + "loss": 0.67151195, + "num_input_tokens_seen": 34254140, + "step": 1618, + "time_per_iteration": 2.512683153152466 + }, + { + "auxiliary_loss_clip": 0.01157021, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.05480623, + "balance_loss_mlp": 1.02097976, + "epoch": 0.19467323994468827, + "flos": 19314985403520.0, + "grad_norm": 7.576578183489544, + "language_loss": 0.81772202, + "learning_rate": 3.7223637043809016e-06, + "loss": 0.83959514, + "num_input_tokens_seen": 34273120, + "step": 1619, + "time_per_iteration": 3.386240243911743 + }, + { + "auxiliary_loss_clip": 0.01174153, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.05873704, + "balance_loss_mlp": 1.02512431, + "epoch": 0.19479348283532735, + "flos": 24133227770880.0, + "grad_norm": 2.576918285327141, + "language_loss": 0.86616814, + "learning_rate": 3.7219676248163322e-06, + "loss": 0.88824546, + "num_input_tokens_seen": 34290285, + "step": 1620, + "time_per_iteration": 3.320338249206543 + }, + { + "auxiliary_loss_clip": 0.01205465, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.0626781, + "balance_loss_mlp": 1.02500129, + "epoch": 0.19491372572596646, + "flos": 25775638174080.0, + "grad_norm": 1.6798327102013508, + "language_loss": 0.93438828, + "learning_rate": 3.721571284030643e-06, + "loss": 0.95678926, + "num_input_tokens_seen": 34310095, + "step": 1621, + "time_per_iteration": 3.277984142303467 + }, + { + "auxiliary_loss_clip": 0.01204864, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.0614115, + "balance_loss_mlp": 1.0190444, + "epoch": 0.19503396861660555, + "flos": 19645220067840.0, + "grad_norm": 2.018932439039993, + "language_loss": 0.79146361, + "learning_rate": 3.7211746820839587e-06, + "loss": 0.81379825, + "num_input_tokens_seen": 34327190, + "step": 1622, + "time_per_iteration": 2.5066163539886475 + }, + { + "auxiliary_loss_clip": 0.01108498, + "auxiliary_loss_mlp": 0.01030511, + "balance_loss_clip": 1.04741955, + "balance_loss_mlp": 1.02113521, + "epoch": 0.19515421150724463, + "flos": 21033023892480.0, + "grad_norm": 1.8251053738665033, + "language_loss": 0.80745173, + "learning_rate": 3.7207778190364437e-06, + "loss": 0.82884175, + "num_input_tokens_seen": 34345615, + "step": 1623, + "time_per_iteration": 2.807015895843506 + }, + { + "auxiliary_loss_clip": 0.01129934, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.05109107, + "balance_loss_mlp": 1.02342415, + "epoch": 0.1952744543978837, + "flos": 32961255143040.0, + "grad_norm": 1.7108006226053047, + "language_loss": 0.73803741, + "learning_rate": 3.720380694948302e-06, + "loss": 0.75966465, + "num_input_tokens_seen": 34368500, + "step": 1624, + "time_per_iteration": 2.814282178878784 + }, + { + "auxiliary_loss_clip": 0.01080772, + "auxiliary_loss_mlp": 0.01004738, + "balance_loss_clip": 1.02909112, + "balance_loss_mlp": 1.00246131, + "epoch": 0.19539469728852282, + "flos": 64044312030720.0, + "grad_norm": 1.0392072075850456, + "language_loss": 0.71286041, + "learning_rate": 3.719983309879777e-06, + "loss": 0.73371553, + "num_input_tokens_seen": 34428280, + "step": 1625, + "time_per_iteration": 3.241161823272705 + }, + { + "auxiliary_loss_clip": 0.01161795, + "auxiliary_loss_mlp": 0.01037694, + "balance_loss_clip": 1.05409312, + "balance_loss_mlp": 1.02912903, + "epoch": 0.1955149401791619, + "flos": 13370908078080.0, + "grad_norm": 2.031876362300098, + "language_loss": 0.7731564, + "learning_rate": 3.719585663891151e-06, + "loss": 0.79515123, + "num_input_tokens_seen": 34445815, + "step": 1626, + "time_per_iteration": 2.5924386978149414 + }, + { + "auxiliary_loss_clip": 0.01147319, + "auxiliary_loss_mlp": 0.01037848, + "balance_loss_clip": 1.0542922, + "balance_loss_mlp": 1.02816272, + "epoch": 0.195635183069801, + "flos": 18728887184640.0, + "grad_norm": 2.345902450076477, + "language_loss": 0.79166961, + "learning_rate": 3.719187757042747e-06, + "loss": 0.81352127, + "num_input_tokens_seen": 34463635, + "step": 1627, + "time_per_iteration": 2.623539686203003 + }, + { + "auxiliary_loss_clip": 0.01093024, + "auxiliary_loss_mlp": 0.01001687, + "balance_loss_clip": 1.02561188, + "balance_loss_mlp": 0.9993149, + "epoch": 0.1957554259604401, + "flos": 69313952615040.0, + "grad_norm": 0.7351178613826388, + "language_loss": 0.54936302, + "learning_rate": 3.7187895893949275e-06, + "loss": 0.57031012, + "num_input_tokens_seen": 34530105, + "step": 1628, + "time_per_iteration": 3.215097665786743 + }, + { + "auxiliary_loss_clip": 0.01143787, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.05224967, + "balance_loss_mlp": 1.01702499, + "epoch": 0.19587566885107918, + "flos": 21069257736960.0, + "grad_norm": 6.655961930542399, + "language_loss": 0.75953436, + "learning_rate": 3.7183911610080937e-06, + "loss": 0.78124493, + "num_input_tokens_seen": 34546970, + "step": 1629, + "time_per_iteration": 2.638279676437378 + }, + { + "auxiliary_loss_clip": 0.01174416, + "auxiliary_loss_mlp": 0.01040257, + "balance_loss_clip": 1.05736256, + "balance_loss_mlp": 1.03027964, + "epoch": 0.19599591174171827, + "flos": 22194661731840.0, + "grad_norm": 2.9201306078273968, + "language_loss": 0.74799502, + "learning_rate": 3.7179924719426872e-06, + "loss": 0.77014172, + "num_input_tokens_seen": 34564865, + "step": 1630, + "time_per_iteration": 2.673372745513916 + }, + { + "auxiliary_loss_clip": 0.01204833, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.06267309, + "balance_loss_mlp": 1.02546954, + "epoch": 0.19611615463235738, + "flos": 23768375374080.0, + "grad_norm": 2.3130911236879803, + "language_loss": 0.75638539, + "learning_rate": 3.7175935222591885e-06, + "loss": 0.77878439, + "num_input_tokens_seen": 34584165, + "step": 1631, + "time_per_iteration": 2.5578906536102295 + }, + { + "auxiliary_loss_clip": 0.01190544, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.0631845, + "balance_loss_mlp": 1.02460885, + "epoch": 0.19623639752299646, + "flos": 28618218731520.0, + "grad_norm": 2.650796169643814, + "language_loss": 0.74154568, + "learning_rate": 3.717194312018118e-06, + "loss": 0.76379406, + "num_input_tokens_seen": 34603150, + "step": 1632, + "time_per_iteration": 2.619107484817505 + }, + { + "auxiliary_loss_clip": 0.01199109, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.05841351, + "balance_loss_mlp": 1.02090943, + "epoch": 0.19635664041363554, + "flos": 21032700670080.0, + "grad_norm": 2.0150693374957447, + "language_loss": 0.76289994, + "learning_rate": 3.716794841280036e-06, + "loss": 0.78519797, + "num_input_tokens_seen": 34621855, + "step": 1633, + "time_per_iteration": 2.512486219406128 + }, + { + "auxiliary_loss_clip": 0.01206862, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.06075537, + "balance_loss_mlp": 1.02178454, + "epoch": 0.19647688330427462, + "flos": 18879748306560.0, + "grad_norm": 1.9464611922317994, + "language_loss": 0.77214748, + "learning_rate": 3.7163951101055407e-06, + "loss": 0.7945298, + "num_input_tokens_seen": 34639915, + "step": 1634, + "time_per_iteration": 2.4814627170562744 + }, + { + "auxiliary_loss_clip": 0.01184354, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.06039715, + "balance_loss_mlp": 1.0254873, + "epoch": 0.19659712619491373, + "flos": 24242503921920.0, + "grad_norm": 1.7917313282607938, + "language_loss": 0.79137659, + "learning_rate": 3.715995118555273e-06, + "loss": 0.81357473, + "num_input_tokens_seen": 34659890, + "step": 1635, + "time_per_iteration": 2.564697504043579 + }, + { + "auxiliary_loss_clip": 0.01153534, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.05413699, + "balance_loss_mlp": 1.02844286, + "epoch": 0.19671736908555282, + "flos": 24717422568960.0, + "grad_norm": 2.30089066139468, + "language_loss": 0.85887462, + "learning_rate": 3.71559486668991e-06, + "loss": 0.88079834, + "num_input_tokens_seen": 34678750, + "step": 1636, + "time_per_iteration": 2.6649746894836426 + }, + { + "auxiliary_loss_clip": 0.0120762, + "auxiliary_loss_mlp": 0.00763624, + "balance_loss_clip": 1.06258154, + "balance_loss_mlp": 1.00069642, + "epoch": 0.1968376119761919, + "flos": 23842279607040.0, + "grad_norm": 1.6309884233273793, + "language_loss": 0.77462298, + "learning_rate": 3.715194354570169e-06, + "loss": 0.79433542, + "num_input_tokens_seen": 34698755, + "step": 1637, + "time_per_iteration": 2.5540716648101807 + }, + { + "auxiliary_loss_clip": 0.01203671, + "auxiliary_loss_mlp": 0.01035739, + "balance_loss_clip": 1.06475186, + "balance_loss_mlp": 1.02637494, + "epoch": 0.196957854866831, + "flos": 18113917409280.0, + "grad_norm": 2.215750686378058, + "language_loss": 0.83215821, + "learning_rate": 3.714793582256809e-06, + "loss": 0.85455227, + "num_input_tokens_seen": 34715820, + "step": 1638, + "time_per_iteration": 2.520659923553467 + }, + { + "auxiliary_loss_clip": 0.01214935, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.06231475, + "balance_loss_mlp": 1.02164245, + "epoch": 0.1970780977574701, + "flos": 21653129312640.0, + "grad_norm": 2.209767960014902, + "language_loss": 0.85586149, + "learning_rate": 3.7143925498106253e-06, + "loss": 0.87832308, + "num_input_tokens_seen": 34734360, + "step": 1639, + "time_per_iteration": 2.4703242778778076 + }, + { + "auxiliary_loss_clip": 0.01184925, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.0549655, + "balance_loss_mlp": 1.02338839, + "epoch": 0.19719834064810918, + "flos": 20811813984000.0, + "grad_norm": 2.3293392088667955, + "language_loss": 0.79011273, + "learning_rate": 3.7139912572924558e-06, + "loss": 0.81229573, + "num_input_tokens_seen": 34753390, + "step": 1640, + "time_per_iteration": 2.5630505084991455 + }, + { + "auxiliary_loss_clip": 0.01196784, + "auxiliary_loss_mlp": 0.01035174, + "balance_loss_clip": 1.05728757, + "balance_loss_mlp": 1.02588212, + "epoch": 0.19731858353874826, + "flos": 23434800744960.0, + "grad_norm": 2.475490396081664, + "language_loss": 0.80335522, + "learning_rate": 3.7135897047631744e-06, + "loss": 0.82567477, + "num_input_tokens_seen": 34771275, + "step": 1641, + "time_per_iteration": 2.511073589324951 + }, + { + "auxiliary_loss_clip": 0.01188162, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.06020761, + "balance_loss_mlp": 1.02238107, + "epoch": 0.19743882642938737, + "flos": 23988184652160.0, + "grad_norm": 1.917571712040667, + "language_loss": 0.76192307, + "learning_rate": 3.713187892283698e-06, + "loss": 0.78412616, + "num_input_tokens_seen": 34790885, + "step": 1642, + "time_per_iteration": 2.5702242851257324 + }, + { + "auxiliary_loss_clip": 0.0115502, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.05231488, + "balance_loss_mlp": 1.02548826, + "epoch": 0.19755906932002645, + "flos": 15004340081280.0, + "grad_norm": 2.4930014914657757, + "language_loss": 0.8743273, + "learning_rate": 3.71278581991498e-06, + "loss": 0.89623225, + "num_input_tokens_seen": 34806745, + "step": 1643, + "time_per_iteration": 2.598146677017212 + }, + { + "auxiliary_loss_clip": 0.01173944, + "auxiliary_loss_mlp": 0.00764785, + "balance_loss_clip": 1.0618633, + "balance_loss_mlp": 1.00063062, + "epoch": 0.19767931221066554, + "flos": 19494466686720.0, + "grad_norm": 2.1930940954781093, + "language_loss": 0.78857815, + "learning_rate": 3.712383487718015e-06, + "loss": 0.80796546, + "num_input_tokens_seen": 34824985, + "step": 1644, + "time_per_iteration": 2.582275152206421 + }, + { + "auxiliary_loss_clip": 0.01137844, + "auxiliary_loss_mlp": 0.01032746, + "balance_loss_clip": 1.05331945, + "balance_loss_mlp": 1.02369165, + "epoch": 0.19779955510130465, + "flos": 25737895958400.0, + "grad_norm": 1.854883992846687, + "language_loss": 0.86384141, + "learning_rate": 3.7119808957538365e-06, + "loss": 0.88554728, + "num_input_tokens_seen": 34843980, + "step": 1645, + "time_per_iteration": 3.430915355682373 + }, + { + "auxiliary_loss_clip": 0.01181996, + "auxiliary_loss_mlp": 0.01032785, + "balance_loss_clip": 1.05610895, + "balance_loss_mlp": 1.02267003, + "epoch": 0.19791979799194373, + "flos": 20777699041920.0, + "grad_norm": 2.0855465760219043, + "language_loss": 0.80220008, + "learning_rate": 3.711578044083517e-06, + "loss": 0.82434785, + "num_input_tokens_seen": 34860780, + "step": 1646, + "time_per_iteration": 3.3153223991394043 + }, + { + "auxiliary_loss_clip": 0.01191029, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.06083393, + "balance_loss_mlp": 1.02863729, + "epoch": 0.1980400408825828, + "flos": 25589010084480.0, + "grad_norm": 2.7434806600810795, + "language_loss": 0.7446847, + "learning_rate": 3.7111749327681698e-06, + "loss": 0.76698196, + "num_input_tokens_seen": 34880815, + "step": 1647, + "time_per_iteration": 3.408291816711426 + }, + { + "auxiliary_loss_clip": 0.0120915, + "auxiliary_loss_mlp": 0.01029304, + "balance_loss_clip": 1.06599391, + "balance_loss_mlp": 1.02032137, + "epoch": 0.1981602837732219, + "flos": 23513840622720.0, + "grad_norm": 3.4229397966358186, + "language_loss": 0.86264986, + "learning_rate": 3.7107715618689455e-06, + "loss": 0.88503438, + "num_input_tokens_seen": 34899790, + "step": 1648, + "time_per_iteration": 3.374119997024536 + }, + { + "auxiliary_loss_clip": 0.01200214, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.06261194, + "balance_loss_mlp": 1.0235734, + "epoch": 0.198280526663861, + "flos": 23185365724800.0, + "grad_norm": 1.450984357212388, + "language_loss": 0.83592135, + "learning_rate": 3.710367931447035e-06, + "loss": 0.85825908, + "num_input_tokens_seen": 34921570, + "step": 1649, + "time_per_iteration": 2.528393268585205 + }, + { + "auxiliary_loss_clip": 0.01209914, + "auxiliary_loss_mlp": 0.01039756, + "balance_loss_clip": 1.06288791, + "balance_loss_mlp": 1.0295639, + "epoch": 0.1984007695545001, + "flos": 21689470897920.0, + "grad_norm": 2.7347453648240827, + "language_loss": 0.8647542, + "learning_rate": 3.70996404156367e-06, + "loss": 0.8872509, + "num_input_tokens_seen": 34941205, + "step": 1650, + "time_per_iteration": 2.540945529937744 + }, + { + "auxiliary_loss_clip": 0.01147445, + "auxiliary_loss_mlp": 0.01039699, + "balance_loss_clip": 1.05281973, + "balance_loss_mlp": 1.03030479, + "epoch": 0.19852101244513917, + "flos": 36064008887040.0, + "grad_norm": 1.8527972490504971, + "language_loss": 0.72665894, + "learning_rate": 3.7095598922801187e-06, + "loss": 0.74853039, + "num_input_tokens_seen": 34963280, + "step": 1651, + "time_per_iteration": 2.7873172760009766 + }, + { + "auxiliary_loss_clip": 0.01217241, + "auxiliary_loss_mlp": 0.0103579, + "balance_loss_clip": 1.06391823, + "balance_loss_mlp": 1.02583039, + "epoch": 0.19864125533577828, + "flos": 23105894883840.0, + "grad_norm": 2.1924588804981693, + "language_loss": 0.76374602, + "learning_rate": 3.7091554836576914e-06, + "loss": 0.78627634, + "num_input_tokens_seen": 34979955, + "step": 1652, + "time_per_iteration": 2.4848053455352783 + }, + { + "auxiliary_loss_clip": 0.01202493, + "auxiliary_loss_mlp": 0.00764069, + "balance_loss_clip": 1.06498075, + "balance_loss_mlp": 1.00075865, + "epoch": 0.19876149822641737, + "flos": 24608505553920.0, + "grad_norm": 1.8638956708705425, + "language_loss": 0.82869571, + "learning_rate": 3.708750815757736e-06, + "loss": 0.84836137, + "num_input_tokens_seen": 35000725, + "step": 1653, + "time_per_iteration": 2.569766044616699 + }, + { + "auxiliary_loss_clip": 0.01202824, + "auxiliary_loss_mlp": 0.01040793, + "balance_loss_clip": 1.06204367, + "balance_loss_mlp": 1.03054094, + "epoch": 0.19888174111705645, + "flos": 32196645308160.0, + "grad_norm": 2.2522156210592725, + "language_loss": 0.73177385, + "learning_rate": 3.7083458886416407e-06, + "loss": 0.75421, + "num_input_tokens_seen": 35019920, + "step": 1654, + "time_per_iteration": 2.6149959564208984 + }, + { + "auxiliary_loss_clip": 0.01145933, + "auxiliary_loss_mlp": 0.0103381, + "balance_loss_clip": 1.05591226, + "balance_loss_mlp": 1.02418971, + "epoch": 0.19900198400769553, + "flos": 24608469640320.0, + "grad_norm": 2.1429928499892585, + "language_loss": 0.88118869, + "learning_rate": 3.707940702370832e-06, + "loss": 0.90298617, + "num_input_tokens_seen": 35040765, + "step": 1655, + "time_per_iteration": 2.6911890506744385 + }, + { + "auxiliary_loss_clip": 0.01097806, + "auxiliary_loss_mlp": 0.01004222, + "balance_loss_clip": 1.02478635, + "balance_loss_mlp": 1.00174224, + "epoch": 0.19912222689833464, + "flos": 67915805673600.0, + "grad_norm": 0.7626308736515407, + "language_loss": 0.58272457, + "learning_rate": 3.707535257006777e-06, + "loss": 0.60374486, + "num_input_tokens_seen": 35106390, + "step": 1656, + "time_per_iteration": 3.1586320400238037 + }, + { + "auxiliary_loss_clip": 0.01188512, + "auxiliary_loss_mlp": 0.01037025, + "balance_loss_clip": 1.05914247, + "balance_loss_mlp": 1.02689791, + "epoch": 0.19924246978897373, + "flos": 15742340916480.0, + "grad_norm": 2.5243694401453514, + "language_loss": 0.88647616, + "learning_rate": 3.707129552610981e-06, + "loss": 0.90873158, + "num_input_tokens_seen": 35125040, + "step": 1657, + "time_per_iteration": 2.5297036170959473 + }, + { + "auxiliary_loss_clip": 0.0118527, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.06357455, + "balance_loss_mlp": 1.02049518, + "epoch": 0.1993627126796128, + "flos": 17566566986880.0, + "grad_norm": 1.9926973970078439, + "language_loss": 0.73759419, + "learning_rate": 3.70672358924499e-06, + "loss": 0.75974786, + "num_input_tokens_seen": 35144280, + "step": 1658, + "time_per_iteration": 2.534457206726074 + }, + { + "auxiliary_loss_clip": 0.01173562, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.06265664, + "balance_loss_mlp": 1.02613115, + "epoch": 0.19948295557025192, + "flos": 40843826680320.0, + "grad_norm": 1.9165487816967923, + "language_loss": 0.78560036, + "learning_rate": 3.706317366970386e-06, + "loss": 0.80769682, + "num_input_tokens_seen": 35165280, + "step": 1659, + "time_per_iteration": 2.751410961151123 + }, + { + "auxiliary_loss_clip": 0.0121937, + "auxiliary_loss_mlp": 0.00764781, + "balance_loss_clip": 1.06237698, + "balance_loss_mlp": 1.00070548, + "epoch": 0.199603198460891, + "flos": 25082418620160.0, + "grad_norm": 2.5505432415791978, + "language_loss": 0.8371917, + "learning_rate": 3.705910885848795e-06, + "loss": 0.85703325, + "num_input_tokens_seen": 35183655, + "step": 1660, + "time_per_iteration": 2.5266826152801514 + }, + { + "auxiliary_loss_clip": 0.01201069, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.06157231, + "balance_loss_mlp": 1.02041078, + "epoch": 0.19972344135153008, + "flos": 20084120352000.0, + "grad_norm": 2.4063129528896203, + "language_loss": 0.84522533, + "learning_rate": 3.705504145941879e-06, + "loss": 0.86753464, + "num_input_tokens_seen": 35201825, + "step": 1661, + "time_per_iteration": 2.501857042312622 + }, + { + "auxiliary_loss_clip": 0.01214135, + "auxiliary_loss_mlp": 0.01028572, + "balance_loss_clip": 1.06156421, + "balance_loss_mlp": 1.01909518, + "epoch": 0.1998436842421692, + "flos": 23727472761600.0, + "grad_norm": 1.756778685082368, + "language_loss": 0.78959465, + "learning_rate": 3.7050971473113403e-06, + "loss": 0.81202173, + "num_input_tokens_seen": 35221600, + "step": 1662, + "time_per_iteration": 2.5155251026153564 + }, + { + "auxiliary_loss_clip": 0.01197483, + "auxiliary_loss_mlp": 0.00764127, + "balance_loss_clip": 1.05937016, + "balance_loss_mlp": 1.00070465, + "epoch": 0.19996392713280828, + "flos": 36102361633920.0, + "grad_norm": 1.7191903188815245, + "language_loss": 0.79904807, + "learning_rate": 3.7046898900189196e-06, + "loss": 0.81866419, + "num_input_tokens_seen": 35245935, + "step": 1663, + "time_per_iteration": 2.661000967025757 + }, + { + "auxiliary_loss_clip": 0.01175928, + "auxiliary_loss_mlp": 0.01037312, + "balance_loss_clip": 1.05993319, + "balance_loss_mlp": 1.02756047, + "epoch": 0.20008417002344736, + "flos": 23657662679040.0, + "grad_norm": 1.824706206666673, + "language_loss": 0.82733774, + "learning_rate": 3.704282374126398e-06, + "loss": 0.84947014, + "num_input_tokens_seen": 35265615, + "step": 1664, + "time_per_iteration": 2.6038355827331543 + } + ], + "logging_steps": 1.0, + "max_steps": 8316, + "num_input_tokens_seen": 35265615, + "num_train_epochs": 1, + "save_steps": 1664, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3752378227215565e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/training_args.bin b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..06c1a7fdcdce0a60eb934659e6ff98b068eb9b91 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb55e82347cbf0afcb6a120e8276013d73acc6c9d648bb0ea22d444f6dec41a +size 7928 diff --git a/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/zero_to_fp32.py b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/downtheta0.001_competesmoev30/checkpoint-1664/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters)