diff --git a/checkpoint-14544/config.json b/checkpoint-14544/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fda0153f8ee396146a87c398da9234b3dce005be --- /dev/null +++ b/checkpoint-14544/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "./meta-llama_Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 128259 +} diff --git a/checkpoint-14544/generation_config.json b/checkpoint-14544/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eab5082496e8b01f9c606a306676cbfabe0cce9d --- /dev/null +++ b/checkpoint-14544/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.46.1" +} diff --git a/checkpoint-14544/global_step14544/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ff034903e280c1cd296a05290987817cb1a6871 --- /dev/null +++ b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:584c9b461249d8db510d34cddb31b43a426689a3bed728d617ff2b0106f25261 +size 12045435328 diff --git a/checkpoint-14544/global_step14544/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4c75365a5449449df3a90493943ee2d92efd82d --- /dev/null +++ b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ead59c3cc9ebfb44de9c3c9207571beee5e3c2234f34776152c7d3499634b274 +size 12045436096 diff --git a/checkpoint-14544/global_step14544/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a95889153ec165b047a9a352f3162ad97c426bfd --- /dev/null +++ b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6ea0c91ba5e49416e5523ca72acaf03b56d638d5783acff9951f6bf25d5253f +size 12045436352 diff --git a/checkpoint-14544/global_step14544/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ce8d181d5c8307686e9e24c57267f0bb4b027d3 --- /dev/null +++ b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d845e213083c5cf1657e363bde310c538db7c3c44e5de64352edb41fb9d0124 +size 12045436096 diff --git a/checkpoint-14544/global_step14544/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f90f992e7e080947112456ab65edc23437f3df2 --- /dev/null +++ b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92f54c4235d211016d1486b46f79adffbaed728bd906249b03c342372f8b436a +size 12045436352 diff --git a/checkpoint-14544/global_step14544/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7362a129aff82265244d08d42caf94bc6298e0e7 --- /dev/null +++ b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9af168848f51906a80e4826e46ab3cc5a2798d59851b20316bd900ced3486bf2 +size 12045436416 diff --git a/checkpoint-14544/global_step14544/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..535c70270c73fff964b546d8a1f0d1d414f13751 --- /dev/null +++ b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efe3b4bef6aea14aab08f8507f20a34493c8ced10ad51fb9dd13cae5e8324c9b +size 12045436096 diff --git a/checkpoint-14544/global_step14544/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..643876f9330ef5b18f13e6fd84032b96b83f470b --- /dev/null +++ b/checkpoint-14544/global_step14544/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4085f318a4791259dad09cafe3198c635346cc22a909321e635f21ac05f1870 +size 12045435008 diff --git a/checkpoint-14544/global_step14544/mp_rank_00_model_states.pt b/checkpoint-14544/global_step14544/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..252cb27712288b54a8e0d6793b6b359891751e87 --- /dev/null +++ b/checkpoint-14544/global_step14544/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b623de99d0eaadc92c4f66562e5e1fddfd9ba8955eea50dcb79dab5246c0b9a4 +size 16060659704 diff --git a/checkpoint-14544/latest b/checkpoint-14544/latest new file mode 100644 index 0000000000000000000000000000000000000000..d72ba190639e3341dbc1b0916ec1d7808be086bd --- /dev/null +++ b/checkpoint-14544/latest @@ -0,0 +1 @@ +global_step14544 \ No newline at end of file diff --git a/checkpoint-14544/model-00001-of-00004.safetensors b/checkpoint-14544/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a0646e77412d6c7d32e71137ae681233c9906555 --- /dev/null +++ b/checkpoint-14544/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee0fc4087d221db1ce5aa4fb39a0309a2cb12c315fc5b94ef25c7d5696b74dc5 +size 4976723248 diff --git a/checkpoint-14544/model-00002-of-00004.safetensors b/checkpoint-14544/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..db1d885e3740073d14f32820a80485a955550233 --- /dev/null +++ b/checkpoint-14544/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff91f1724b4a3e57caa08f8f3a2085b91d675c4fefdf71890efd4e265a9bb8a4 +size 4999802720 diff --git a/checkpoint-14544/model-00003-of-00004.safetensors b/checkpoint-14544/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cbcd2da9427cdf747324cff58aed0ae3f17c6a7b --- /dev/null +++ b/checkpoint-14544/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:804296ec661795d7ae80f36c5ed3021849539c3ddb7e636e9dae574003d0991b +size 4915916176 diff --git a/checkpoint-14544/model-00004-of-00004.safetensors b/checkpoint-14544/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..44d7a59fb3d2e90b41775b8763f3c0e62a464c94 --- /dev/null +++ b/checkpoint-14544/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c531757f300e38c107e3807042f85acccefb9ea22e2c885d779107e3961e07 +size 1168163384 diff --git a/checkpoint-14544/model.safetensors.index.json b/checkpoint-14544/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..e734f8f9bcabe95e936a11f19b77148f54640122 --- /dev/null +++ b/checkpoint-14544/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060571648 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/checkpoint-14544/rng_state_0.pth b/checkpoint-14544/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6473612e41c5cfd6973c2e71fa5f3ad2b2bcad1 --- /dev/null +++ b/checkpoint-14544/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2 +size 15984 diff --git a/checkpoint-14544/rng_state_1.pth b/checkpoint-14544/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8506e00431b6ac7067699c0ea4f59adb6fa0ba20 --- /dev/null +++ b/checkpoint-14544/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0728b56dab7abb5ef8a0d4bae3519c5767c97467bdd886d26bf19cc8599d0312 +size 15984 diff --git a/checkpoint-14544/rng_state_2.pth b/checkpoint-14544/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea499e285c97cca07fedd34662c3d4ab44ff6f47 --- /dev/null +++ b/checkpoint-14544/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e481d4ef1546694da7337f6bb6c658b866dcb79b85deeb477da0d27ebe851e +size 15984 diff --git a/checkpoint-14544/rng_state_3.pth b/checkpoint-14544/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..aeb38f92f106ac3f08bae4f82179a8a12243bccb --- /dev/null +++ b/checkpoint-14544/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353c60be37ea56fc992fca446598ceca5d1fd002aa3bd6dbb9ad740e6f47ebb3 +size 15984 diff --git a/checkpoint-14544/rng_state_4.pth b/checkpoint-14544/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d5856cb7a3f15092fa5593507022316916f648e --- /dev/null +++ b/checkpoint-14544/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9107fe964ba7205e354084b85210e5a5ea1c98cfd4d38adb9cd3926945dcae4 +size 15984 diff --git a/checkpoint-14544/rng_state_5.pth b/checkpoint-14544/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b824ee24d256695aad4a69a62d8e7125f51a17f2 --- /dev/null +++ b/checkpoint-14544/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d1bb1abee38b92e53f3f23549b642ce0f1edcdccf7b6129847ac61636e96d5 +size 15984 diff --git a/checkpoint-14544/rng_state_6.pth b/checkpoint-14544/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9fd0364bb8f1a8e91eca45be5e1b6672b4d9afd --- /dev/null +++ b/checkpoint-14544/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd5516048e20f36959601574e29e40106085a7d3cdc7bf425ce5e84633490e6 +size 15984 diff --git a/checkpoint-14544/rng_state_7.pth b/checkpoint-14544/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e80125fd18efcb1097384319888b699f4dce7e7 --- /dev/null +++ b/checkpoint-14544/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2c46927fc06939b4c976a01e4b95dec1f8b98ceaea86d31a5d756fc30ff006 +size 15984 diff --git a/checkpoint-14544/scheduler.pt b/checkpoint-14544/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cd98901e7d1c5dd7dd7b9851516b39f996a1998 --- /dev/null +++ b/checkpoint-14544/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb7926b667af4a843102788971a1be35782a74deeb4cd2a914dd89f1814993ce +size 1064 diff --git a/checkpoint-14544/special_tokens_map.json b/checkpoint-14544/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-14544/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-14544/tokenizer.json b/checkpoint-14544/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9d29771c68b37af9541b4c450532cb095b564ca5 --- /dev/null +++ b/checkpoint-14544/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91a36f231bc2882e8c2e1859bc27098f73c95ea211ccb73ad0cdb441a16f49c6 +size 17210280 diff --git a/checkpoint-14544/tokenizer_config.json b/checkpoint-14544/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a695c457b54a00f10768564f6c25b0142ccc840 --- /dev/null +++ b/checkpoint-14544/tokenizer_config.json @@ -0,0 +1,2087 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|im_title|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|end_title|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|im_op|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|end_op|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|im_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|end_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|begin_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|end_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|im_khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|end_khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|im_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|end_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|autheur|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|sujet|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|><|khey|><|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-14544/trainer_state.json b/checkpoint-14544/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fd7465cf132a0945e72ffe6782e147cdd947e2f2 --- /dev/null +++ b/checkpoint-14544/trainer_state.json @@ -0,0 +1,101841 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8004843414607298, + "eval_steps": 500, + "global_step": 14544, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.503880235566074e-05, + "grad_norm": 459.8753356933594, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.303, + "step": 1 + }, + { + "epoch": 0.00011007760471132149, + "grad_norm": 314.2561950683594, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.8226, + "step": 2 + }, + { + "epoch": 0.0001651164070669822, + "grad_norm": 314.1292419433594, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.8517, + "step": 3 + }, + { + "epoch": 0.00022015520942264297, + "grad_norm": 312.4049072265625, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.6248, + "step": 4 + }, + { + "epoch": 0.0002751940117783037, + "grad_norm": 353.7213134765625, + "learning_rate": 5.000000000000001e-07, + "loss": 2.7883, + "step": 5 + }, + { + "epoch": 0.0003302328141339644, + "grad_norm": 278.41668701171875, + "learning_rate": 6.000000000000001e-07, + "loss": 2.5468, + "step": 6 + }, + { + "epoch": 0.0003852716164896252, + "grad_norm": 336.14532470703125, + "learning_rate": 7.000000000000001e-07, + "loss": 2.7721, + "step": 7 + }, + { + "epoch": 0.00044031041884528595, + "grad_norm": 201.19374084472656, + "learning_rate": 8.000000000000001e-07, + "loss": 2.4873, + "step": 8 + }, + { + "epoch": 0.0004953492212009466, + "grad_norm": 184.7027587890625, + "learning_rate": 9.000000000000001e-07, + "loss": 2.6647, + "step": 9 + }, + { + "epoch": 0.0005503880235566074, + "grad_norm": 154.597412109375, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.602, + "step": 10 + }, + { + "epoch": 0.0006054268259122681, + "grad_norm": 40.47785568237305, + "learning_rate": 1.1e-06, + "loss": 2.6716, + "step": 11 + }, + { + "epoch": 0.0006604656282679288, + "grad_norm": 25.338607788085938, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.2631, + "step": 12 + }, + { + "epoch": 0.0007155044306235897, + "grad_norm": 24.976919174194336, + "learning_rate": 1.3e-06, + "loss": 2.3564, + "step": 13 + }, + { + "epoch": 0.0007705432329792504, + "grad_norm": 15.239912033081055, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.3295, + "step": 14 + }, + { + "epoch": 0.0008255820353349112, + "grad_norm": 14.125042915344238, + "learning_rate": 1.5e-06, + "loss": 2.307, + "step": 15 + }, + { + "epoch": 0.0008806208376905719, + "grad_norm": 13.163726806640625, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.1493, + "step": 16 + }, + { + "epoch": 0.0009356596400462326, + "grad_norm": 8.726515769958496, + "learning_rate": 1.7000000000000002e-06, + "loss": 2.0333, + "step": 17 + }, + { + "epoch": 0.0009906984424018933, + "grad_norm": 9.072502136230469, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.2046, + "step": 18 + }, + { + "epoch": 0.001045737244757554, + "grad_norm": 9.412588119506836, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.2001, + "step": 19 + }, + { + "epoch": 0.0011007760471132147, + "grad_norm": 8.67534065246582, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.7679, + "step": 20 + }, + { + "epoch": 0.0011558148494688755, + "grad_norm": 14.015918731689453, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.9566, + "step": 21 + }, + { + "epoch": 0.0012108536518245362, + "grad_norm": 7.9474687576293945, + "learning_rate": 2.2e-06, + "loss": 1.9085, + "step": 22 + }, + { + "epoch": 0.001265892454180197, + "grad_norm": 6.806368350982666, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.7918, + "step": 23 + }, + { + "epoch": 0.0013209312565358577, + "grad_norm": 5.3452582359313965, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.8321, + "step": 24 + }, + { + "epoch": 0.0013759700588915184, + "grad_norm": 8.744244575500488, + "learning_rate": 2.5e-06, + "loss": 1.6317, + "step": 25 + }, + { + "epoch": 0.0014310088612471794, + "grad_norm": 5.304683685302734, + "learning_rate": 2.6e-06, + "loss": 1.6846, + "step": 26 + }, + { + "epoch": 0.00148604766360284, + "grad_norm": 5.650127410888672, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7449, + "step": 27 + }, + { + "epoch": 0.0015410864659585008, + "grad_norm": 5.479269504547119, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.8158, + "step": 28 + }, + { + "epoch": 0.0015961252683141616, + "grad_norm": 4.873537063598633, + "learning_rate": 2.9e-06, + "loss": 1.8015, + "step": 29 + }, + { + "epoch": 0.0016511640706698223, + "grad_norm": 4.971101760864258, + "learning_rate": 3e-06, + "loss": 1.9034, + "step": 30 + }, + { + "epoch": 0.001706202873025483, + "grad_norm": 4.407571315765381, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.9037, + "step": 31 + }, + { + "epoch": 0.0017612416753811438, + "grad_norm": 4.429073810577393, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6812, + "step": 32 + }, + { + "epoch": 0.0018162804777368045, + "grad_norm": 5.16085147857666, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.7627, + "step": 33 + }, + { + "epoch": 0.0018713192800924653, + "grad_norm": 4.0805768966674805, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.6799, + "step": 34 + }, + { + "epoch": 0.001926358082448126, + "grad_norm": 4.548702239990234, + "learning_rate": 3.5e-06, + "loss": 1.7799, + "step": 35 + }, + { + "epoch": 0.0019813968848037865, + "grad_norm": 5.181888580322266, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.8235, + "step": 36 + }, + { + "epoch": 0.0020364356871594475, + "grad_norm": 3.9876129627227783, + "learning_rate": 3.7e-06, + "loss": 1.5999, + "step": 37 + }, + { + "epoch": 0.002091474489515108, + "grad_norm": 6.325051307678223, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.7499, + "step": 38 + }, + { + "epoch": 0.002146513291870769, + "grad_norm": 6.199049949645996, + "learning_rate": 3.900000000000001e-06, + "loss": 1.784, + "step": 39 + }, + { + "epoch": 0.0022015520942264295, + "grad_norm": 4.83912992477417, + "learning_rate": 4.000000000000001e-06, + "loss": 1.8895, + "step": 40 + }, + { + "epoch": 0.0022565908965820904, + "grad_norm": 4.515626907348633, + "learning_rate": 4.1e-06, + "loss": 1.4887, + "step": 41 + }, + { + "epoch": 0.002311629698937751, + "grad_norm": 5.032265663146973, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.7324, + "step": 42 + }, + { + "epoch": 0.002366668501293412, + "grad_norm": 4.1879048347473145, + "learning_rate": 4.3e-06, + "loss": 1.4912, + "step": 43 + }, + { + "epoch": 0.0024217073036490724, + "grad_norm": 4.128026485443115, + "learning_rate": 4.4e-06, + "loss": 1.554, + "step": 44 + }, + { + "epoch": 0.0024767461060047334, + "grad_norm": 4.527958393096924, + "learning_rate": 4.5e-06, + "loss": 1.652, + "step": 45 + }, + { + "epoch": 0.002531784908360394, + "grad_norm": 4.8388190269470215, + "learning_rate": 4.600000000000001e-06, + "loss": 1.6696, + "step": 46 + }, + { + "epoch": 0.002586823710716055, + "grad_norm": 4.2088541984558105, + "learning_rate": 4.7e-06, + "loss": 1.568, + "step": 47 + }, + { + "epoch": 0.0026418625130717154, + "grad_norm": 4.789997577667236, + "learning_rate": 4.800000000000001e-06, + "loss": 1.642, + "step": 48 + }, + { + "epoch": 0.0026969013154273763, + "grad_norm": 4.408346652984619, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.5181, + "step": 49 + }, + { + "epoch": 0.002751940117783037, + "grad_norm": 4.572340488433838, + "learning_rate": 5e-06, + "loss": 1.6698, + "step": 50 + }, + { + "epoch": 0.0028069789201386978, + "grad_norm": 4.728564739227295, + "learning_rate": 5.1e-06, + "loss": 1.5785, + "step": 51 + }, + { + "epoch": 0.0028620177224943587, + "grad_norm": 4.449855327606201, + "learning_rate": 5.2e-06, + "loss": 1.4624, + "step": 52 + }, + { + "epoch": 0.0029170565248500193, + "grad_norm": 4.127189636230469, + "learning_rate": 5.300000000000001e-06, + "loss": 1.6061, + "step": 53 + }, + { + "epoch": 0.00297209532720568, + "grad_norm": 4.244532108306885, + "learning_rate": 5.400000000000001e-06, + "loss": 1.491, + "step": 54 + }, + { + "epoch": 0.0030271341295613407, + "grad_norm": 3.437682628631592, + "learning_rate": 5.500000000000001e-06, + "loss": 1.1967, + "step": 55 + }, + { + "epoch": 0.0030821729319170017, + "grad_norm": 3.83516788482666, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4731, + "step": 56 + }, + { + "epoch": 0.003137211734272662, + "grad_norm": 3.9108972549438477, + "learning_rate": 5.7e-06, + "loss": 1.4393, + "step": 57 + }, + { + "epoch": 0.003192250536628323, + "grad_norm": 3.5258419513702393, + "learning_rate": 5.8e-06, + "loss": 1.4206, + "step": 58 + }, + { + "epoch": 0.0032472893389839837, + "grad_norm": 4.124903678894043, + "learning_rate": 5.9e-06, + "loss": 1.4747, + "step": 59 + }, + { + "epoch": 0.0033023281413396446, + "grad_norm": 4.055769920349121, + "learning_rate": 6e-06, + "loss": 1.4655, + "step": 60 + }, + { + "epoch": 0.003357366943695305, + "grad_norm": 3.904837131500244, + "learning_rate": 6.1e-06, + "loss": 1.5125, + "step": 61 + }, + { + "epoch": 0.003412405746050966, + "grad_norm": 3.2904794216156006, + "learning_rate": 6.200000000000001e-06, + "loss": 1.4596, + "step": 62 + }, + { + "epoch": 0.0034674445484066266, + "grad_norm": 3.24053692817688, + "learning_rate": 6.300000000000001e-06, + "loss": 1.3851, + "step": 63 + }, + { + "epoch": 0.0035224833507622876, + "grad_norm": 3.457639217376709, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.4019, + "step": 64 + }, + { + "epoch": 0.003577522153117948, + "grad_norm": 3.073054790496826, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.2872, + "step": 65 + }, + { + "epoch": 0.003632560955473609, + "grad_norm": 2.6726694107055664, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2361, + "step": 66 + }, + { + "epoch": 0.0036875997578292696, + "grad_norm": 2.9378459453582764, + "learning_rate": 6.700000000000001e-06, + "loss": 1.4452, + "step": 67 + }, + { + "epoch": 0.0037426385601849305, + "grad_norm": 2.81107234954834, + "learning_rate": 6.800000000000001e-06, + "loss": 1.4804, + "step": 68 + }, + { + "epoch": 0.003797677362540591, + "grad_norm": 2.60062313079834, + "learning_rate": 6.9e-06, + "loss": 1.3263, + "step": 69 + }, + { + "epoch": 0.003852716164896252, + "grad_norm": 2.5642921924591064, + "learning_rate": 7e-06, + "loss": 1.2751, + "step": 70 + }, + { + "epoch": 0.0039077549672519125, + "grad_norm": 2.3608031272888184, + "learning_rate": 7.100000000000001e-06, + "loss": 1.2614, + "step": 71 + }, + { + "epoch": 0.003962793769607573, + "grad_norm": 2.7201738357543945, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.5018, + "step": 72 + }, + { + "epoch": 0.004017832571963234, + "grad_norm": 2.584726095199585, + "learning_rate": 7.3e-06, + "loss": 1.3519, + "step": 73 + }, + { + "epoch": 0.004072871374318895, + "grad_norm": 1.9693044424057007, + "learning_rate": 7.4e-06, + "loss": 1.0934, + "step": 74 + }, + { + "epoch": 0.0041279101766745555, + "grad_norm": 2.220736503601074, + "learning_rate": 7.500000000000001e-06, + "loss": 1.4687, + "step": 75 + }, + { + "epoch": 0.004182948979030216, + "grad_norm": 2.2629456520080566, + "learning_rate": 7.600000000000001e-06, + "loss": 1.3328, + "step": 76 + }, + { + "epoch": 0.004237987781385877, + "grad_norm": 2.051820993423462, + "learning_rate": 7.7e-06, + "loss": 1.3058, + "step": 77 + }, + { + "epoch": 0.004293026583741538, + "grad_norm": 2.2451820373535156, + "learning_rate": 7.800000000000002e-06, + "loss": 1.3556, + "step": 78 + }, + { + "epoch": 0.004348065386097198, + "grad_norm": 3.13584303855896, + "learning_rate": 7.9e-06, + "loss": 1.3262, + "step": 79 + }, + { + "epoch": 0.004403104188452859, + "grad_norm": 5.024479866027832, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2103, + "step": 80 + }, + { + "epoch": 0.00445814299080852, + "grad_norm": 2.070889711380005, + "learning_rate": 8.1e-06, + "loss": 1.1994, + "step": 81 + }, + { + "epoch": 0.004513181793164181, + "grad_norm": 2.797286033630371, + "learning_rate": 8.2e-06, + "loss": 1.3075, + "step": 82 + }, + { + "epoch": 0.004568220595519841, + "grad_norm": 2.11370849609375, + "learning_rate": 8.3e-06, + "loss": 1.36, + "step": 83 + }, + { + "epoch": 0.004623259397875502, + "grad_norm": 2.5416152477264404, + "learning_rate": 8.400000000000001e-06, + "loss": 1.3484, + "step": 84 + }, + { + "epoch": 0.004678298200231163, + "grad_norm": 2.4702343940734863, + "learning_rate": 8.5e-06, + "loss": 1.3677, + "step": 85 + }, + { + "epoch": 0.004733337002586824, + "grad_norm": 3.670365333557129, + "learning_rate": 8.6e-06, + "loss": 1.2192, + "step": 86 + }, + { + "epoch": 0.004788375804942484, + "grad_norm": 2.282954692840576, + "learning_rate": 8.700000000000001e-06, + "loss": 1.2982, + "step": 87 + }, + { + "epoch": 0.004843414607298145, + "grad_norm": 2.3659238815307617, + "learning_rate": 8.8e-06, + "loss": 1.3206, + "step": 88 + }, + { + "epoch": 0.004898453409653806, + "grad_norm": 4.939981460571289, + "learning_rate": 8.900000000000001e-06, + "loss": 1.4328, + "step": 89 + }, + { + "epoch": 0.004953492212009467, + "grad_norm": 2.335858106613159, + "learning_rate": 9e-06, + "loss": 1.2603, + "step": 90 + }, + { + "epoch": 0.005008531014365127, + "grad_norm": 2.2165043354034424, + "learning_rate": 9.100000000000001e-06, + "loss": 1.3141, + "step": 91 + }, + { + "epoch": 0.005063569816720788, + "grad_norm": 2.7872185707092285, + "learning_rate": 9.200000000000002e-06, + "loss": 1.3314, + "step": 92 + }, + { + "epoch": 0.005118608619076449, + "grad_norm": 2.6353912353515625, + "learning_rate": 9.3e-06, + "loss": 1.2027, + "step": 93 + }, + { + "epoch": 0.00517364742143211, + "grad_norm": 3.2509102821350098, + "learning_rate": 9.4e-06, + "loss": 1.2316, + "step": 94 + }, + { + "epoch": 0.00522868622378777, + "grad_norm": 2.4560611248016357, + "learning_rate": 9.5e-06, + "loss": 1.1848, + "step": 95 + }, + { + "epoch": 0.005283725026143431, + "grad_norm": 2.338151216506958, + "learning_rate": 9.600000000000001e-06, + "loss": 1.2392, + "step": 96 + }, + { + "epoch": 0.005338763828499092, + "grad_norm": 2.231065034866333, + "learning_rate": 9.7e-06, + "loss": 1.2089, + "step": 97 + }, + { + "epoch": 0.005393802630854753, + "grad_norm": 2.278428077697754, + "learning_rate": 9.800000000000001e-06, + "loss": 1.2267, + "step": 98 + }, + { + "epoch": 0.005448841433210413, + "grad_norm": 2.4422810077667236, + "learning_rate": 9.9e-06, + "loss": 1.2041, + "step": 99 + }, + { + "epoch": 0.005503880235566074, + "grad_norm": 2.216248035430908, + "learning_rate": 1e-05, + "loss": 1.0798, + "step": 100 + }, + { + "epoch": 0.005558919037921735, + "grad_norm": 2.3301615715026855, + "learning_rate": 9.99999998121067e-06, + "loss": 1.3069, + "step": 101 + }, + { + "epoch": 0.0056139578402773956, + "grad_norm": 2.315436363220215, + "learning_rate": 9.999999924842678e-06, + "loss": 1.1589, + "step": 102 + }, + { + "epoch": 0.005668996642633056, + "grad_norm": 2.3522140979766846, + "learning_rate": 9.999999830896024e-06, + "loss": 1.0978, + "step": 103 + }, + { + "epoch": 0.0057240354449887175, + "grad_norm": 2.5798308849334717, + "learning_rate": 9.99999969937071e-06, + "loss": 1.0599, + "step": 104 + }, + { + "epoch": 0.005779074247344378, + "grad_norm": 2.456644058227539, + "learning_rate": 9.999999530266738e-06, + "loss": 1.1682, + "step": 105 + }, + { + "epoch": 0.0058341130497000385, + "grad_norm": 2.1559031009674072, + "learning_rate": 9.999999323584106e-06, + "loss": 1.0631, + "step": 106 + }, + { + "epoch": 0.005889151852055699, + "grad_norm": 2.2985048294067383, + "learning_rate": 9.99999907932282e-06, + "loss": 1.1455, + "step": 107 + }, + { + "epoch": 0.00594419065441136, + "grad_norm": 2.596167802810669, + "learning_rate": 9.999998797482877e-06, + "loss": 1.1686, + "step": 108 + }, + { + "epoch": 0.005999229456767021, + "grad_norm": 2.378618001937866, + "learning_rate": 9.999998478064283e-06, + "loss": 1.2226, + "step": 109 + }, + { + "epoch": 0.0060542682591226814, + "grad_norm": 2.228116750717163, + "learning_rate": 9.999998121067038e-06, + "loss": 1.1396, + "step": 110 + }, + { + "epoch": 0.006109307061478342, + "grad_norm": 2.4419472217559814, + "learning_rate": 9.999997726491146e-06, + "loss": 1.1401, + "step": 111 + }, + { + "epoch": 0.006164345863834003, + "grad_norm": 2.0695526599884033, + "learning_rate": 9.999997294336608e-06, + "loss": 1.1868, + "step": 112 + }, + { + "epoch": 0.006219384666189664, + "grad_norm": 2.3170363903045654, + "learning_rate": 9.99999682460343e-06, + "loss": 1.1172, + "step": 113 + }, + { + "epoch": 0.006274423468545324, + "grad_norm": 2.670466184616089, + "learning_rate": 9.999996317291615e-06, + "loss": 1.2481, + "step": 114 + }, + { + "epoch": 0.006329462270900985, + "grad_norm": 2.1214540004730225, + "learning_rate": 9.999995772401166e-06, + "loss": 0.9994, + "step": 115 + }, + { + "epoch": 0.006384501073256646, + "grad_norm": 1.9283969402313232, + "learning_rate": 9.999995189932085e-06, + "loss": 1.0692, + "step": 116 + }, + { + "epoch": 0.006439539875612307, + "grad_norm": 2.2620882987976074, + "learning_rate": 9.99999456988438e-06, + "loss": 1.0725, + "step": 117 + }, + { + "epoch": 0.006494578677967967, + "grad_norm": 2.2121341228485107, + "learning_rate": 9.999993912258055e-06, + "loss": 1.1328, + "step": 118 + }, + { + "epoch": 0.006549617480323628, + "grad_norm": 2.298126220703125, + "learning_rate": 9.999993217053113e-06, + "loss": 1.1272, + "step": 119 + }, + { + "epoch": 0.006604656282679289, + "grad_norm": 1.81593656539917, + "learning_rate": 9.99999248426956e-06, + "loss": 1.017, + "step": 120 + }, + { + "epoch": 0.00665969508503495, + "grad_norm": 2.1174378395080566, + "learning_rate": 9.999991713907403e-06, + "loss": 1.0557, + "step": 121 + }, + { + "epoch": 0.00671473388739061, + "grad_norm": 1.9061017036437988, + "learning_rate": 9.999990905966647e-06, + "loss": 1.0379, + "step": 122 + }, + { + "epoch": 0.006769772689746271, + "grad_norm": 1.912500023841858, + "learning_rate": 9.999990060447297e-06, + "loss": 1.104, + "step": 123 + }, + { + "epoch": 0.006824811492101932, + "grad_norm": 1.9249529838562012, + "learning_rate": 9.99998917734936e-06, + "loss": 1.0136, + "step": 124 + }, + { + "epoch": 0.006879850294457593, + "grad_norm": 1.8504948616027832, + "learning_rate": 9.999988256672843e-06, + "loss": 0.99, + "step": 125 + }, + { + "epoch": 0.006934889096813253, + "grad_norm": 1.720042109489441, + "learning_rate": 9.999987298417753e-06, + "loss": 1.0666, + "step": 126 + }, + { + "epoch": 0.006989927899168914, + "grad_norm": 1.778251051902771, + "learning_rate": 9.999986302584097e-06, + "loss": 1.0424, + "step": 127 + }, + { + "epoch": 0.007044966701524575, + "grad_norm": 1.9485961198806763, + "learning_rate": 9.999985269171881e-06, + "loss": 1.105, + "step": 128 + }, + { + "epoch": 0.007100005503880236, + "grad_norm": 3.0802104473114014, + "learning_rate": 9.999984198181114e-06, + "loss": 1.1081, + "step": 129 + }, + { + "epoch": 0.007155044306235896, + "grad_norm": 1.7476954460144043, + "learning_rate": 9.999983089611806e-06, + "loss": 0.9677, + "step": 130 + }, + { + "epoch": 0.007210083108591557, + "grad_norm": 1.6127299070358276, + "learning_rate": 9.999981943463963e-06, + "loss": 0.9937, + "step": 131 + }, + { + "epoch": 0.007265121910947218, + "grad_norm": 2.1477208137512207, + "learning_rate": 9.999980759737594e-06, + "loss": 1.0319, + "step": 132 + }, + { + "epoch": 0.007320160713302879, + "grad_norm": 1.531163215637207, + "learning_rate": 9.999979538432707e-06, + "loss": 0.8696, + "step": 133 + }, + { + "epoch": 0.007375199515658539, + "grad_norm": 1.8226820230484009, + "learning_rate": 9.999978279549313e-06, + "loss": 1.2061, + "step": 134 + }, + { + "epoch": 0.0074302383180142, + "grad_norm": 1.481895923614502, + "learning_rate": 9.99997698308742e-06, + "loss": 0.949, + "step": 135 + }, + { + "epoch": 0.007485277120369861, + "grad_norm": 1.6715927124023438, + "learning_rate": 9.99997564904704e-06, + "loss": 1.1579, + "step": 136 + }, + { + "epoch": 0.0075403159227255215, + "grad_norm": 1.4235272407531738, + "learning_rate": 9.999974277428179e-06, + "loss": 1.064, + "step": 137 + }, + { + "epoch": 0.007595354725081182, + "grad_norm": 1.3524872064590454, + "learning_rate": 9.999972868230852e-06, + "loss": 0.9141, + "step": 138 + }, + { + "epoch": 0.007650393527436843, + "grad_norm": 1.3741765022277832, + "learning_rate": 9.999971421455066e-06, + "loss": 1.0256, + "step": 139 + }, + { + "epoch": 0.007705432329792504, + "grad_norm": 1.9869598150253296, + "learning_rate": 9.999969937100835e-06, + "loss": 0.9489, + "step": 140 + }, + { + "epoch": 0.0077604711321481645, + "grad_norm": 1.4785465002059937, + "learning_rate": 9.999968415168166e-06, + "loss": 0.9243, + "step": 141 + }, + { + "epoch": 0.007815509934503825, + "grad_norm": 1.5476176738739014, + "learning_rate": 9.999966855657074e-06, + "loss": 1.178, + "step": 142 + }, + { + "epoch": 0.007870548736859486, + "grad_norm": 1.500401258468628, + "learning_rate": 9.99996525856757e-06, + "loss": 0.9837, + "step": 143 + }, + { + "epoch": 0.007925587539215146, + "grad_norm": 1.3777157068252563, + "learning_rate": 9.999963623899664e-06, + "loss": 1.0732, + "step": 144 + }, + { + "epoch": 0.007980626341570807, + "grad_norm": 1.4466841220855713, + "learning_rate": 9.99996195165337e-06, + "loss": 0.9779, + "step": 145 + }, + { + "epoch": 0.008035665143926469, + "grad_norm": 1.5304051637649536, + "learning_rate": 9.9999602418287e-06, + "loss": 1.196, + "step": 146 + }, + { + "epoch": 0.008090703946282128, + "grad_norm": 1.9012362957000732, + "learning_rate": 9.99995849442567e-06, + "loss": 0.9797, + "step": 147 + }, + { + "epoch": 0.00814574274863779, + "grad_norm": 1.430679202079773, + "learning_rate": 9.999956709444289e-06, + "loss": 0.9869, + "step": 148 + }, + { + "epoch": 0.00820078155099345, + "grad_norm": 1.3489817380905151, + "learning_rate": 9.99995488688457e-06, + "loss": 1.0137, + "step": 149 + }, + { + "epoch": 0.008255820353349111, + "grad_norm": 1.1878125667572021, + "learning_rate": 9.999953026746531e-06, + "loss": 0.9355, + "step": 150 + }, + { + "epoch": 0.008310859155704772, + "grad_norm": 1.3481942415237427, + "learning_rate": 9.999951129030182e-06, + "loss": 1.1235, + "step": 151 + }, + { + "epoch": 0.008365897958060432, + "grad_norm": 1.7335314750671387, + "learning_rate": 9.999949193735539e-06, + "loss": 0.9382, + "step": 152 + }, + { + "epoch": 0.008420936760416093, + "grad_norm": 1.2029480934143066, + "learning_rate": 9.999947220862615e-06, + "loss": 0.9419, + "step": 153 + }, + { + "epoch": 0.008475975562771755, + "grad_norm": 1.2104203701019287, + "learning_rate": 9.999945210411428e-06, + "loss": 0.9196, + "step": 154 + }, + { + "epoch": 0.008531014365127414, + "grad_norm": 1.1857126951217651, + "learning_rate": 9.999943162381991e-06, + "loss": 0.9421, + "step": 155 + }, + { + "epoch": 0.008586053167483076, + "grad_norm": 1.115027904510498, + "learning_rate": 9.999941076774319e-06, + "loss": 0.9634, + "step": 156 + }, + { + "epoch": 0.008641091969838737, + "grad_norm": 1.4227553606033325, + "learning_rate": 9.999938953588428e-06, + "loss": 1.0036, + "step": 157 + }, + { + "epoch": 0.008696130772194397, + "grad_norm": 1.2913776636123657, + "learning_rate": 9.999936792824334e-06, + "loss": 0.9232, + "step": 158 + }, + { + "epoch": 0.008751169574550058, + "grad_norm": 1.2817318439483643, + "learning_rate": 9.999934594482055e-06, + "loss": 0.9691, + "step": 159 + }, + { + "epoch": 0.008806208376905718, + "grad_norm": 1.5647841691970825, + "learning_rate": 9.999932358561604e-06, + "loss": 1.1842, + "step": 160 + }, + { + "epoch": 0.00886124717926138, + "grad_norm": 1.368135929107666, + "learning_rate": 9.999930085063002e-06, + "loss": 1.0873, + "step": 161 + }, + { + "epoch": 0.00891628598161704, + "grad_norm": 1.2297240495681763, + "learning_rate": 9.999927773986262e-06, + "loss": 1.0778, + "step": 162 + }, + { + "epoch": 0.0089713247839727, + "grad_norm": 1.0658279657363892, + "learning_rate": 9.999925425331405e-06, + "loss": 0.9008, + "step": 163 + }, + { + "epoch": 0.009026363586328362, + "grad_norm": 1.3484326601028442, + "learning_rate": 9.999923039098445e-06, + "loss": 1.0664, + "step": 164 + }, + { + "epoch": 0.009081402388684023, + "grad_norm": 1.1839075088500977, + "learning_rate": 9.999920615287401e-06, + "loss": 0.9257, + "step": 165 + }, + { + "epoch": 0.009136441191039683, + "grad_norm": 1.2757254838943481, + "learning_rate": 9.999918153898295e-06, + "loss": 0.9473, + "step": 166 + }, + { + "epoch": 0.009191479993395344, + "grad_norm": 1.2414579391479492, + "learning_rate": 9.99991565493114e-06, + "loss": 1.1091, + "step": 167 + }, + { + "epoch": 0.009246518795751004, + "grad_norm": 1.2802611589431763, + "learning_rate": 9.999913118385959e-06, + "loss": 1.063, + "step": 168 + }, + { + "epoch": 0.009301557598106665, + "grad_norm": 1.2055327892303467, + "learning_rate": 9.99991054426277e-06, + "loss": 0.8, + "step": 169 + }, + { + "epoch": 0.009356596400462327, + "grad_norm": 1.0391098260879517, + "learning_rate": 9.99990793256159e-06, + "loss": 0.8672, + "step": 170 + }, + { + "epoch": 0.009411635202817986, + "grad_norm": 1.131536602973938, + "learning_rate": 9.99990528328244e-06, + "loss": 0.9569, + "step": 171 + }, + { + "epoch": 0.009466674005173648, + "grad_norm": 1.164307951927185, + "learning_rate": 9.999902596425342e-06, + "loss": 0.9999, + "step": 172 + }, + { + "epoch": 0.009521712807529309, + "grad_norm": 1.2099504470825195, + "learning_rate": 9.999899871990313e-06, + "loss": 0.9994, + "step": 173 + }, + { + "epoch": 0.009576751609884969, + "grad_norm": 1.7294539213180542, + "learning_rate": 9.999897109977376e-06, + "loss": 1.0265, + "step": 174 + }, + { + "epoch": 0.00963179041224063, + "grad_norm": 1.3009883165359497, + "learning_rate": 9.99989431038655e-06, + "loss": 0.9022, + "step": 175 + }, + { + "epoch": 0.00968682921459629, + "grad_norm": 1.1014611721038818, + "learning_rate": 9.999891473217857e-06, + "loss": 0.8476, + "step": 176 + }, + { + "epoch": 0.009741868016951951, + "grad_norm": 1.2410900592803955, + "learning_rate": 9.99988859847132e-06, + "loss": 1.0272, + "step": 177 + }, + { + "epoch": 0.009796906819307612, + "grad_norm": 1.336348295211792, + "learning_rate": 9.999885686146957e-06, + "loss": 0.9456, + "step": 178 + }, + { + "epoch": 0.009851945621663272, + "grad_norm": 1.2931095361709595, + "learning_rate": 9.99988273624479e-06, + "loss": 0.9554, + "step": 179 + }, + { + "epoch": 0.009906984424018933, + "grad_norm": 1.2647838592529297, + "learning_rate": 9.999879748764845e-06, + "loss": 1.0394, + "step": 180 + }, + { + "epoch": 0.009962023226374595, + "grad_norm": 1.3485127687454224, + "learning_rate": 9.99987672370714e-06, + "loss": 1.1016, + "step": 181 + }, + { + "epoch": 0.010017062028730254, + "grad_norm": 1.110187292098999, + "learning_rate": 9.999873661071702e-06, + "loss": 0.946, + "step": 182 + }, + { + "epoch": 0.010072100831085916, + "grad_norm": 1.0991623401641846, + "learning_rate": 9.999870560858551e-06, + "loss": 1.0084, + "step": 183 + }, + { + "epoch": 0.010127139633441576, + "grad_norm": 1.049804449081421, + "learning_rate": 9.999867423067713e-06, + "loss": 0.8264, + "step": 184 + }, + { + "epoch": 0.010182178435797237, + "grad_norm": 1.0947058200836182, + "learning_rate": 9.999864247699207e-06, + "loss": 0.8884, + "step": 185 + }, + { + "epoch": 0.010237217238152898, + "grad_norm": 1.1147902011871338, + "learning_rate": 9.999861034753061e-06, + "loss": 0.9657, + "step": 186 + }, + { + "epoch": 0.010292256040508558, + "grad_norm": 1.260027527809143, + "learning_rate": 9.999857784229298e-06, + "loss": 1.0102, + "step": 187 + }, + { + "epoch": 0.01034729484286422, + "grad_norm": 1.1275582313537598, + "learning_rate": 9.999854496127942e-06, + "loss": 1.028, + "step": 188 + }, + { + "epoch": 0.01040233364521988, + "grad_norm": 1.1377174854278564, + "learning_rate": 9.999851170449018e-06, + "loss": 1.032, + "step": 189 + }, + { + "epoch": 0.01045737244757554, + "grad_norm": 1.1734225749969482, + "learning_rate": 9.999847807192552e-06, + "loss": 1.0009, + "step": 190 + }, + { + "epoch": 0.010512411249931202, + "grad_norm": 1.1934596300125122, + "learning_rate": 9.999844406358565e-06, + "loss": 1.0432, + "step": 191 + }, + { + "epoch": 0.010567450052286861, + "grad_norm": 1.0638024806976318, + "learning_rate": 9.99984096794709e-06, + "loss": 0.8651, + "step": 192 + }, + { + "epoch": 0.010622488854642523, + "grad_norm": 1.2381829023361206, + "learning_rate": 9.999837491958147e-06, + "loss": 1.0088, + "step": 193 + }, + { + "epoch": 0.010677527656998184, + "grad_norm": 1.030246615409851, + "learning_rate": 9.999833978391763e-06, + "loss": 0.9488, + "step": 194 + }, + { + "epoch": 0.010732566459353844, + "grad_norm": 1.1640657186508179, + "learning_rate": 9.999830427247965e-06, + "loss": 1.0588, + "step": 195 + }, + { + "epoch": 0.010787605261709505, + "grad_norm": 1.0431616306304932, + "learning_rate": 9.99982683852678e-06, + "loss": 0.8728, + "step": 196 + }, + { + "epoch": 0.010842644064065167, + "grad_norm": 1.032263159751892, + "learning_rate": 9.999823212228235e-06, + "loss": 0.9498, + "step": 197 + }, + { + "epoch": 0.010897682866420826, + "grad_norm": 1.1383745670318604, + "learning_rate": 9.999819548352358e-06, + "loss": 0.9498, + "step": 198 + }, + { + "epoch": 0.010952721668776488, + "grad_norm": 1.1324639320373535, + "learning_rate": 9.999815846899175e-06, + "loss": 1.0432, + "step": 199 + }, + { + "epoch": 0.011007760471132147, + "grad_norm": 1.188672661781311, + "learning_rate": 9.999812107868714e-06, + "loss": 0.982, + "step": 200 + }, + { + "epoch": 0.011062799273487809, + "grad_norm": 1.1011098623275757, + "learning_rate": 9.999808331261005e-06, + "loss": 0.9587, + "step": 201 + }, + { + "epoch": 0.01111783807584347, + "grad_norm": 1.1782938241958618, + "learning_rate": 9.999804517076073e-06, + "loss": 1.0659, + "step": 202 + }, + { + "epoch": 0.01117287687819913, + "grad_norm": 1.0520117282867432, + "learning_rate": 9.99980066531395e-06, + "loss": 1.0056, + "step": 203 + }, + { + "epoch": 0.011227915680554791, + "grad_norm": 1.1584919691085815, + "learning_rate": 9.999796775974663e-06, + "loss": 0.9435, + "step": 204 + }, + { + "epoch": 0.011282954482910452, + "grad_norm": 1.2201849222183228, + "learning_rate": 9.999792849058242e-06, + "loss": 1.0562, + "step": 205 + }, + { + "epoch": 0.011337993285266112, + "grad_norm": 1.2985976934432983, + "learning_rate": 9.999788884564715e-06, + "loss": 1.0126, + "step": 206 + }, + { + "epoch": 0.011393032087621774, + "grad_norm": 0.9926307201385498, + "learning_rate": 9.999784882494115e-06, + "loss": 0.7875, + "step": 207 + }, + { + "epoch": 0.011448070889977435, + "grad_norm": 1.103365182876587, + "learning_rate": 9.99978084284647e-06, + "loss": 0.9833, + "step": 208 + }, + { + "epoch": 0.011503109692333095, + "grad_norm": 1.1798462867736816, + "learning_rate": 9.99977676562181e-06, + "loss": 0.8479, + "step": 209 + }, + { + "epoch": 0.011558148494688756, + "grad_norm": 1.2887194156646729, + "learning_rate": 9.999772650820168e-06, + "loss": 0.9606, + "step": 210 + }, + { + "epoch": 0.011613187297044416, + "grad_norm": 1.1120634078979492, + "learning_rate": 9.99976849844157e-06, + "loss": 0.9604, + "step": 211 + }, + { + "epoch": 0.011668226099400077, + "grad_norm": 1.1248979568481445, + "learning_rate": 9.999764308486052e-06, + "loss": 0.9428, + "step": 212 + }, + { + "epoch": 0.011723264901755738, + "grad_norm": 1.274610161781311, + "learning_rate": 9.999760080953643e-06, + "loss": 0.9044, + "step": 213 + }, + { + "epoch": 0.011778303704111398, + "grad_norm": 1.1746865510940552, + "learning_rate": 9.999755815844377e-06, + "loss": 0.9114, + "step": 214 + }, + { + "epoch": 0.01183334250646706, + "grad_norm": 1.2531086206436157, + "learning_rate": 9.999751513158282e-06, + "loss": 1.0785, + "step": 215 + }, + { + "epoch": 0.01188838130882272, + "grad_norm": 1.0789539813995361, + "learning_rate": 9.999747172895395e-06, + "loss": 0.9794, + "step": 216 + }, + { + "epoch": 0.01194342011117838, + "grad_norm": 1.1805329322814941, + "learning_rate": 9.999742795055746e-06, + "loss": 0.9602, + "step": 217 + }, + { + "epoch": 0.011998458913534042, + "grad_norm": 2.309329032897949, + "learning_rate": 9.99973837963937e-06, + "loss": 0.9482, + "step": 218 + }, + { + "epoch": 0.012053497715889702, + "grad_norm": 1.2379088401794434, + "learning_rate": 9.999733926646296e-06, + "loss": 1.0237, + "step": 219 + }, + { + "epoch": 0.012108536518245363, + "grad_norm": 1.1581377983093262, + "learning_rate": 9.999729436076562e-06, + "loss": 1.0583, + "step": 220 + }, + { + "epoch": 0.012163575320601024, + "grad_norm": 1.3006727695465088, + "learning_rate": 9.999724907930199e-06, + "loss": 0.9581, + "step": 221 + }, + { + "epoch": 0.012218614122956684, + "grad_norm": 1.3215982913970947, + "learning_rate": 9.999720342207243e-06, + "loss": 0.9438, + "step": 222 + }, + { + "epoch": 0.012273652925312345, + "grad_norm": 1.1107337474822998, + "learning_rate": 9.999715738907727e-06, + "loss": 0.9987, + "step": 223 + }, + { + "epoch": 0.012328691727668007, + "grad_norm": 1.0745457410812378, + "learning_rate": 9.999711098031685e-06, + "loss": 0.9637, + "step": 224 + }, + { + "epoch": 0.012383730530023666, + "grad_norm": 1.110861897468567, + "learning_rate": 9.999706419579154e-06, + "loss": 1.0225, + "step": 225 + }, + { + "epoch": 0.012438769332379328, + "grad_norm": 1.0755527019500732, + "learning_rate": 9.999701703550167e-06, + "loss": 1.0204, + "step": 226 + }, + { + "epoch": 0.012493808134734987, + "grad_norm": 1.1694976091384888, + "learning_rate": 9.99969694994476e-06, + "loss": 1.0566, + "step": 227 + }, + { + "epoch": 0.012548846937090649, + "grad_norm": 1.455856442451477, + "learning_rate": 9.99969215876297e-06, + "loss": 0.9397, + "step": 228 + }, + { + "epoch": 0.01260388573944631, + "grad_norm": 1.0707073211669922, + "learning_rate": 9.99968733000483e-06, + "loss": 0.8286, + "step": 229 + }, + { + "epoch": 0.01265892454180197, + "grad_norm": 1.189548134803772, + "learning_rate": 9.99968246367038e-06, + "loss": 0.8762, + "step": 230 + }, + { + "epoch": 0.012713963344157631, + "grad_norm": 1.1439214944839478, + "learning_rate": 9.999677559759655e-06, + "loss": 0.9187, + "step": 231 + }, + { + "epoch": 0.012769002146513293, + "grad_norm": 1.2329761981964111, + "learning_rate": 9.999672618272691e-06, + "loss": 1.0374, + "step": 232 + }, + { + "epoch": 0.012824040948868952, + "grad_norm": 1.1545134782791138, + "learning_rate": 9.999667639209527e-06, + "loss": 0.9343, + "step": 233 + }, + { + "epoch": 0.012879079751224614, + "grad_norm": 1.0946775674819946, + "learning_rate": 9.999662622570198e-06, + "loss": 0.9568, + "step": 234 + }, + { + "epoch": 0.012934118553580273, + "grad_norm": 1.2099589109420776, + "learning_rate": 9.999657568354743e-06, + "loss": 1.0364, + "step": 235 + }, + { + "epoch": 0.012989157355935935, + "grad_norm": 1.09062922000885, + "learning_rate": 9.999652476563202e-06, + "loss": 1.0289, + "step": 236 + }, + { + "epoch": 0.013044196158291596, + "grad_norm": 1.154557228088379, + "learning_rate": 9.999647347195612e-06, + "loss": 0.9925, + "step": 237 + }, + { + "epoch": 0.013099234960647256, + "grad_norm": 1.025374174118042, + "learning_rate": 9.999642180252008e-06, + "loss": 0.9346, + "step": 238 + }, + { + "epoch": 0.013154273763002917, + "grad_norm": 1.1473641395568848, + "learning_rate": 9.999636975732433e-06, + "loss": 1.0244, + "step": 239 + }, + { + "epoch": 0.013209312565358578, + "grad_norm": 1.0421240329742432, + "learning_rate": 9.999631733636923e-06, + "loss": 0.9368, + "step": 240 + }, + { + "epoch": 0.013264351367714238, + "grad_norm": 1.1076610088348389, + "learning_rate": 9.99962645396552e-06, + "loss": 1.0276, + "step": 241 + }, + { + "epoch": 0.0133193901700699, + "grad_norm": 1.143559455871582, + "learning_rate": 9.999621136718266e-06, + "loss": 0.9626, + "step": 242 + }, + { + "epoch": 0.01337442897242556, + "grad_norm": 1.0958378314971924, + "learning_rate": 9.999615781895195e-06, + "loss": 1.0254, + "step": 243 + }, + { + "epoch": 0.01342946777478122, + "grad_norm": 1.117688536643982, + "learning_rate": 9.99961038949635e-06, + "loss": 0.9685, + "step": 244 + }, + { + "epoch": 0.013484506577136882, + "grad_norm": 1.1645647287368774, + "learning_rate": 9.999604959521771e-06, + "loss": 1.0666, + "step": 245 + }, + { + "epoch": 0.013539545379492542, + "grad_norm": 1.1238516569137573, + "learning_rate": 9.999599491971502e-06, + "loss": 1.0252, + "step": 246 + }, + { + "epoch": 0.013594584181848203, + "grad_norm": 1.0196914672851562, + "learning_rate": 9.999593986845579e-06, + "loss": 0.9389, + "step": 247 + }, + { + "epoch": 0.013649622984203864, + "grad_norm": 1.0231372117996216, + "learning_rate": 9.999588444144049e-06, + "loss": 0.8786, + "step": 248 + }, + { + "epoch": 0.013704661786559524, + "grad_norm": 1.2504147291183472, + "learning_rate": 9.999582863866947e-06, + "loss": 1.0969, + "step": 249 + }, + { + "epoch": 0.013759700588915185, + "grad_norm": 1.1123549938201904, + "learning_rate": 9.99957724601432e-06, + "loss": 0.8833, + "step": 250 + }, + { + "epoch": 0.013814739391270847, + "grad_norm": 1.1068202257156372, + "learning_rate": 9.999571590586208e-06, + "loss": 0.9709, + "step": 251 + }, + { + "epoch": 0.013869778193626506, + "grad_norm": 0.9891651272773743, + "learning_rate": 9.999565897582655e-06, + "loss": 0.8598, + "step": 252 + }, + { + "epoch": 0.013924816995982168, + "grad_norm": 0.9866491556167603, + "learning_rate": 9.999560167003703e-06, + "loss": 0.8101, + "step": 253 + }, + { + "epoch": 0.013979855798337828, + "grad_norm": 1.0862594842910767, + "learning_rate": 9.999554398849396e-06, + "loss": 0.9411, + "step": 254 + }, + { + "epoch": 0.014034894600693489, + "grad_norm": 1.1898949146270752, + "learning_rate": 9.999548593119774e-06, + "loss": 0.9548, + "step": 255 + }, + { + "epoch": 0.01408993340304915, + "grad_norm": 1.2167880535125732, + "learning_rate": 9.999542749814886e-06, + "loss": 1.0302, + "step": 256 + }, + { + "epoch": 0.01414497220540481, + "grad_norm": 1.0784146785736084, + "learning_rate": 9.999536868934771e-06, + "loss": 0.8875, + "step": 257 + }, + { + "epoch": 0.014200011007760471, + "grad_norm": 1.1128027439117432, + "learning_rate": 9.999530950479475e-06, + "loss": 0.9498, + "step": 258 + }, + { + "epoch": 0.014255049810116133, + "grad_norm": 1.1311595439910889, + "learning_rate": 9.999524994449044e-06, + "loss": 0.9035, + "step": 259 + }, + { + "epoch": 0.014310088612471792, + "grad_norm": 1.225615382194519, + "learning_rate": 9.999519000843521e-06, + "loss": 1.0104, + "step": 260 + }, + { + "epoch": 0.014365127414827454, + "grad_norm": 1.2347793579101562, + "learning_rate": 9.99951296966295e-06, + "loss": 1.0288, + "step": 261 + }, + { + "epoch": 0.014420166217183113, + "grad_norm": 1.1837103366851807, + "learning_rate": 9.99950690090738e-06, + "loss": 0.9553, + "step": 262 + }, + { + "epoch": 0.014475205019538775, + "grad_norm": 1.1985397338867188, + "learning_rate": 9.999500794576852e-06, + "loss": 0.9561, + "step": 263 + }, + { + "epoch": 0.014530243821894436, + "grad_norm": 1.036928415298462, + "learning_rate": 9.999494650671418e-06, + "loss": 0.8906, + "step": 264 + }, + { + "epoch": 0.014585282624250096, + "grad_norm": 1.0797842741012573, + "learning_rate": 9.999488469191116e-06, + "loss": 0.8975, + "step": 265 + }, + { + "epoch": 0.014640321426605757, + "grad_norm": 1.0571156740188599, + "learning_rate": 9.999482250136e-06, + "loss": 0.9334, + "step": 266 + }, + { + "epoch": 0.014695360228961419, + "grad_norm": 1.2065023183822632, + "learning_rate": 9.999475993506114e-06, + "loss": 0.8986, + "step": 267 + }, + { + "epoch": 0.014750399031317078, + "grad_norm": 1.201586127281189, + "learning_rate": 9.999469699301502e-06, + "loss": 0.9192, + "step": 268 + }, + { + "epoch": 0.01480543783367274, + "grad_norm": 1.0470168590545654, + "learning_rate": 9.999463367522216e-06, + "loss": 0.8604, + "step": 269 + }, + { + "epoch": 0.0148604766360284, + "grad_norm": 1.1142147779464722, + "learning_rate": 9.9994569981683e-06, + "loss": 0.9847, + "step": 270 + }, + { + "epoch": 0.01491551543838406, + "grad_norm": 1.0352061986923218, + "learning_rate": 9.999450591239805e-06, + "loss": 0.8927, + "step": 271 + }, + { + "epoch": 0.014970554240739722, + "grad_norm": 1.0353184938430786, + "learning_rate": 9.999444146736779e-06, + "loss": 0.8435, + "step": 272 + }, + { + "epoch": 0.015025593043095382, + "grad_norm": 1.2091951370239258, + "learning_rate": 9.999437664659267e-06, + "loss": 0.8959, + "step": 273 + }, + { + "epoch": 0.015080631845451043, + "grad_norm": 1.006361722946167, + "learning_rate": 9.999431145007319e-06, + "loss": 0.8579, + "step": 274 + }, + { + "epoch": 0.015135670647806704, + "grad_norm": 1.1265509128570557, + "learning_rate": 9.999424587780985e-06, + "loss": 0.8808, + "step": 275 + }, + { + "epoch": 0.015190709450162364, + "grad_norm": 1.060882568359375, + "learning_rate": 9.999417992980317e-06, + "loss": 1.044, + "step": 276 + }, + { + "epoch": 0.015245748252518026, + "grad_norm": 1.0216747522354126, + "learning_rate": 9.999411360605358e-06, + "loss": 0.7773, + "step": 277 + }, + { + "epoch": 0.015300787054873685, + "grad_norm": 1.1382462978363037, + "learning_rate": 9.999404690656163e-06, + "loss": 0.8954, + "step": 278 + }, + { + "epoch": 0.015355825857229347, + "grad_norm": 1.113815188407898, + "learning_rate": 9.99939798313278e-06, + "loss": 0.8143, + "step": 279 + }, + { + "epoch": 0.015410864659585008, + "grad_norm": 1.123530387878418, + "learning_rate": 9.99939123803526e-06, + "loss": 0.8872, + "step": 280 + }, + { + "epoch": 0.015465903461940668, + "grad_norm": 1.0873669385910034, + "learning_rate": 9.999384455363656e-06, + "loss": 1.008, + "step": 281 + }, + { + "epoch": 0.015520942264296329, + "grad_norm": 1.5956637859344482, + "learning_rate": 9.999377635118014e-06, + "loss": 0.9456, + "step": 282 + }, + { + "epoch": 0.01557598106665199, + "grad_norm": 1.1471425294876099, + "learning_rate": 9.999370777298389e-06, + "loss": 0.9897, + "step": 283 + }, + { + "epoch": 0.01563101986900765, + "grad_norm": 0.9960193634033203, + "learning_rate": 9.999363881904831e-06, + "loss": 0.8196, + "step": 284 + }, + { + "epoch": 0.01568605867136331, + "grad_norm": 1.1033951044082642, + "learning_rate": 9.999356948937393e-06, + "loss": 0.879, + "step": 285 + }, + { + "epoch": 0.015741097473718973, + "grad_norm": 1.157765507698059, + "learning_rate": 9.999349978396126e-06, + "loss": 1.0116, + "step": 286 + }, + { + "epoch": 0.015796136276074634, + "grad_norm": 1.0472352504730225, + "learning_rate": 9.999342970281084e-06, + "loss": 0.8657, + "step": 287 + }, + { + "epoch": 0.015851175078430292, + "grad_norm": 1.1346659660339355, + "learning_rate": 9.999335924592315e-06, + "loss": 0.8482, + "step": 288 + }, + { + "epoch": 0.015906213880785953, + "grad_norm": 1.1164487600326538, + "learning_rate": 9.999328841329879e-06, + "loss": 1.0542, + "step": 289 + }, + { + "epoch": 0.015961252683141615, + "grad_norm": 1.1890591382980347, + "learning_rate": 9.999321720493825e-06, + "loss": 0.9598, + "step": 290 + }, + { + "epoch": 0.016016291485497276, + "grad_norm": 1.0419867038726807, + "learning_rate": 9.999314562084205e-06, + "loss": 0.9548, + "step": 291 + }, + { + "epoch": 0.016071330287852938, + "grad_norm": 1.0652042627334595, + "learning_rate": 9.999307366101077e-06, + "loss": 0.9359, + "step": 292 + }, + { + "epoch": 0.016126369090208596, + "grad_norm": 1.0166404247283936, + "learning_rate": 9.999300132544492e-06, + "loss": 0.9276, + "step": 293 + }, + { + "epoch": 0.016181407892564257, + "grad_norm": 1.1638866662979126, + "learning_rate": 9.999292861414507e-06, + "loss": 0.957, + "step": 294 + }, + { + "epoch": 0.01623644669491992, + "grad_norm": 1.5505993366241455, + "learning_rate": 9.999285552711173e-06, + "loss": 0.9878, + "step": 295 + }, + { + "epoch": 0.01629148549727558, + "grad_norm": 1.177262783050537, + "learning_rate": 9.999278206434549e-06, + "loss": 0.8631, + "step": 296 + }, + { + "epoch": 0.01634652429963124, + "grad_norm": 1.8578168153762817, + "learning_rate": 9.999270822584687e-06, + "loss": 0.9684, + "step": 297 + }, + { + "epoch": 0.0164015631019869, + "grad_norm": 1.2617360353469849, + "learning_rate": 9.999263401161643e-06, + "loss": 1.014, + "step": 298 + }, + { + "epoch": 0.01645660190434256, + "grad_norm": 0.9740132689476013, + "learning_rate": 9.999255942165475e-06, + "loss": 0.8606, + "step": 299 + }, + { + "epoch": 0.016511640706698222, + "grad_norm": 0.9821745753288269, + "learning_rate": 9.999248445596238e-06, + "loss": 0.8241, + "step": 300 + }, + { + "epoch": 0.016566679509053883, + "grad_norm": 1.0200445652008057, + "learning_rate": 9.999240911453986e-06, + "loss": 0.8256, + "step": 301 + }, + { + "epoch": 0.016621718311409545, + "grad_norm": 1.4100390672683716, + "learning_rate": 9.999233339738779e-06, + "loss": 0.9057, + "step": 302 + }, + { + "epoch": 0.016676757113765206, + "grad_norm": 1.056544303894043, + "learning_rate": 9.99922573045067e-06, + "loss": 1.0808, + "step": 303 + }, + { + "epoch": 0.016731795916120864, + "grad_norm": 0.9271026253700256, + "learning_rate": 9.99921808358972e-06, + "loss": 0.878, + "step": 304 + }, + { + "epoch": 0.016786834718476525, + "grad_norm": 0.9864157438278198, + "learning_rate": 9.999210399155987e-06, + "loss": 0.9198, + "step": 305 + }, + { + "epoch": 0.016841873520832187, + "grad_norm": 1.093995451927185, + "learning_rate": 9.999202677149525e-06, + "loss": 0.9794, + "step": 306 + }, + { + "epoch": 0.016896912323187848, + "grad_norm": 0.9717912077903748, + "learning_rate": 9.999194917570395e-06, + "loss": 0.8764, + "step": 307 + }, + { + "epoch": 0.01695195112554351, + "grad_norm": 1.0026428699493408, + "learning_rate": 9.999187120418653e-06, + "loss": 0.8526, + "step": 308 + }, + { + "epoch": 0.017006989927899167, + "grad_norm": 1.122870922088623, + "learning_rate": 9.999179285694359e-06, + "loss": 0.9773, + "step": 309 + }, + { + "epoch": 0.01706202873025483, + "grad_norm": 1.0522836446762085, + "learning_rate": 9.999171413397572e-06, + "loss": 1.0183, + "step": 310 + }, + { + "epoch": 0.01711706753261049, + "grad_norm": 0.9303658604621887, + "learning_rate": 9.99916350352835e-06, + "loss": 0.8402, + "step": 311 + }, + { + "epoch": 0.01717210633496615, + "grad_norm": 0.9606096148490906, + "learning_rate": 9.999155556086755e-06, + "loss": 0.9692, + "step": 312 + }, + { + "epoch": 0.017227145137321813, + "grad_norm": 1.176992416381836, + "learning_rate": 9.999147571072844e-06, + "loss": 0.8172, + "step": 313 + }, + { + "epoch": 0.017282183939677474, + "grad_norm": 1.1948801279067993, + "learning_rate": 9.999139548486678e-06, + "loss": 1.0205, + "step": 314 + }, + { + "epoch": 0.017337222742033132, + "grad_norm": 1.0064897537231445, + "learning_rate": 9.999131488328318e-06, + "loss": 0.9479, + "step": 315 + }, + { + "epoch": 0.017392261544388794, + "grad_norm": 1.048242449760437, + "learning_rate": 9.999123390597822e-06, + "loss": 0.9862, + "step": 316 + }, + { + "epoch": 0.017447300346744455, + "grad_norm": 1.12875497341156, + "learning_rate": 9.999115255295256e-06, + "loss": 0.9743, + "step": 317 + }, + { + "epoch": 0.017502339149100116, + "grad_norm": 1.0607460737228394, + "learning_rate": 9.999107082420674e-06, + "loss": 0.8878, + "step": 318 + }, + { + "epoch": 0.017557377951455778, + "grad_norm": 1.1480191946029663, + "learning_rate": 9.999098871974144e-06, + "loss": 0.8769, + "step": 319 + }, + { + "epoch": 0.017612416753811436, + "grad_norm": 1.1150004863739014, + "learning_rate": 9.999090623955724e-06, + "loss": 0.8615, + "step": 320 + }, + { + "epoch": 0.017667455556167097, + "grad_norm": 1.137839913368225, + "learning_rate": 9.999082338365478e-06, + "loss": 0.9703, + "step": 321 + }, + { + "epoch": 0.01772249435852276, + "grad_norm": 1.0883489847183228, + "learning_rate": 9.999074015203467e-06, + "loss": 0.9273, + "step": 322 + }, + { + "epoch": 0.01777753316087842, + "grad_norm": 1.0999557971954346, + "learning_rate": 9.999065654469752e-06, + "loss": 0.9605, + "step": 323 + }, + { + "epoch": 0.01783257196323408, + "grad_norm": 0.9911689758300781, + "learning_rate": 9.999057256164401e-06, + "loss": 0.9117, + "step": 324 + }, + { + "epoch": 0.01788761076558974, + "grad_norm": 1.040933609008789, + "learning_rate": 9.999048820287472e-06, + "loss": 0.9229, + "step": 325 + }, + { + "epoch": 0.0179426495679454, + "grad_norm": 1.4341392517089844, + "learning_rate": 9.999040346839031e-06, + "loss": 1.0718, + "step": 326 + }, + { + "epoch": 0.017997688370301062, + "grad_norm": 1.0246332883834839, + "learning_rate": 9.99903183581914e-06, + "loss": 0.9617, + "step": 327 + }, + { + "epoch": 0.018052727172656723, + "grad_norm": 10.162322998046875, + "learning_rate": 9.999023287227863e-06, + "loss": 1.0391, + "step": 328 + }, + { + "epoch": 0.018107765975012385, + "grad_norm": 1.3370027542114258, + "learning_rate": 9.999014701065266e-06, + "loss": 1.0211, + "step": 329 + }, + { + "epoch": 0.018162804777368046, + "grad_norm": 1.0146219730377197, + "learning_rate": 9.999006077331413e-06, + "loss": 0.8611, + "step": 330 + }, + { + "epoch": 0.018217843579723704, + "grad_norm": 1.0899269580841064, + "learning_rate": 9.998997416026368e-06, + "loss": 0.9209, + "step": 331 + }, + { + "epoch": 0.018272882382079365, + "grad_norm": 1.1343204975128174, + "learning_rate": 9.998988717150198e-06, + "loss": 0.9405, + "step": 332 + }, + { + "epoch": 0.018327921184435027, + "grad_norm": 1.2308380603790283, + "learning_rate": 9.998979980702965e-06, + "loss": 0.9579, + "step": 333 + }, + { + "epoch": 0.018382959986790688, + "grad_norm": 1.1433519124984741, + "learning_rate": 9.998971206684737e-06, + "loss": 1.0045, + "step": 334 + }, + { + "epoch": 0.01843799878914635, + "grad_norm": 1.0585781335830688, + "learning_rate": 9.99896239509558e-06, + "loss": 0.9171, + "step": 335 + }, + { + "epoch": 0.018493037591502007, + "grad_norm": 1.2735164165496826, + "learning_rate": 9.99895354593556e-06, + "loss": 1.1001, + "step": 336 + }, + { + "epoch": 0.01854807639385767, + "grad_norm": 1.2905755043029785, + "learning_rate": 9.998944659204744e-06, + "loss": 1.0294, + "step": 337 + }, + { + "epoch": 0.01860311519621333, + "grad_norm": 1.1442075967788696, + "learning_rate": 9.998935734903198e-06, + "loss": 0.9385, + "step": 338 + }, + { + "epoch": 0.01865815399856899, + "grad_norm": 1.1005232334136963, + "learning_rate": 9.998926773030987e-06, + "loss": 1.026, + "step": 339 + }, + { + "epoch": 0.018713192800924653, + "grad_norm": 1.2770785093307495, + "learning_rate": 9.998917773588182e-06, + "loss": 1.0015, + "step": 340 + }, + { + "epoch": 0.01876823160328031, + "grad_norm": 1.0963070392608643, + "learning_rate": 9.998908736574849e-06, + "loss": 0.9347, + "step": 341 + }, + { + "epoch": 0.018823270405635972, + "grad_norm": 1.10364830493927, + "learning_rate": 9.998899661991055e-06, + "loss": 0.869, + "step": 342 + }, + { + "epoch": 0.018878309207991634, + "grad_norm": 1.0364975929260254, + "learning_rate": 9.99889054983687e-06, + "loss": 0.9855, + "step": 343 + }, + { + "epoch": 0.018933348010347295, + "grad_norm": 1.104702115058899, + "learning_rate": 9.998881400112362e-06, + "loss": 0.9555, + "step": 344 + }, + { + "epoch": 0.018988386812702956, + "grad_norm": 0.9957441687583923, + "learning_rate": 9.998872212817599e-06, + "loss": 0.9634, + "step": 345 + }, + { + "epoch": 0.019043425615058618, + "grad_norm": 1.262271523475647, + "learning_rate": 9.998862987952651e-06, + "loss": 1.0133, + "step": 346 + }, + { + "epoch": 0.019098464417414276, + "grad_norm": 1.2075226306915283, + "learning_rate": 9.998853725517587e-06, + "loss": 1.0588, + "step": 347 + }, + { + "epoch": 0.019153503219769937, + "grad_norm": 1.0609898567199707, + "learning_rate": 9.998844425512477e-06, + "loss": 0.9952, + "step": 348 + }, + { + "epoch": 0.0192085420221256, + "grad_norm": 1.1930195093154907, + "learning_rate": 9.998835087937389e-06, + "loss": 0.9617, + "step": 349 + }, + { + "epoch": 0.01926358082448126, + "grad_norm": 1.2359932661056519, + "learning_rate": 9.998825712792396e-06, + "loss": 0.8768, + "step": 350 + }, + { + "epoch": 0.01931861962683692, + "grad_norm": 0.9984115362167358, + "learning_rate": 9.998816300077566e-06, + "loss": 0.8205, + "step": 351 + }, + { + "epoch": 0.01937365842919258, + "grad_norm": 1.6853677034378052, + "learning_rate": 9.998806849792972e-06, + "loss": 0.9066, + "step": 352 + }, + { + "epoch": 0.01942869723154824, + "grad_norm": 1.2869856357574463, + "learning_rate": 9.998797361938683e-06, + "loss": 1.0054, + "step": 353 + }, + { + "epoch": 0.019483736033903902, + "grad_norm": 1.2791584730148315, + "learning_rate": 9.99878783651477e-06, + "loss": 0.7627, + "step": 354 + }, + { + "epoch": 0.019538774836259563, + "grad_norm": 1.0795867443084717, + "learning_rate": 9.998778273521307e-06, + "loss": 0.9343, + "step": 355 + }, + { + "epoch": 0.019593813638615225, + "grad_norm": 1.0926088094711304, + "learning_rate": 9.998768672958365e-06, + "loss": 0.943, + "step": 356 + }, + { + "epoch": 0.019648852440970886, + "grad_norm": 1.0530847311019897, + "learning_rate": 9.998759034826015e-06, + "loss": 0.9656, + "step": 357 + }, + { + "epoch": 0.019703891243326544, + "grad_norm": 1.1793400049209595, + "learning_rate": 9.99874935912433e-06, + "loss": 0.9799, + "step": 358 + }, + { + "epoch": 0.019758930045682205, + "grad_norm": 1.0726191997528076, + "learning_rate": 9.998739645853383e-06, + "loss": 0.8739, + "step": 359 + }, + { + "epoch": 0.019813968848037867, + "grad_norm": 1.0488981008529663, + "learning_rate": 9.998729895013246e-06, + "loss": 0.8986, + "step": 360 + }, + { + "epoch": 0.019869007650393528, + "grad_norm": 1.8267477750778198, + "learning_rate": 9.998720106603993e-06, + "loss": 0.9175, + "step": 361 + }, + { + "epoch": 0.01992404645274919, + "grad_norm": 0.9868306517601013, + "learning_rate": 9.9987102806257e-06, + "loss": 0.9609, + "step": 362 + }, + { + "epoch": 0.019979085255104848, + "grad_norm": 1.0171183347702026, + "learning_rate": 9.998700417078438e-06, + "loss": 0.8904, + "step": 363 + }, + { + "epoch": 0.02003412405746051, + "grad_norm": 0.9800812602043152, + "learning_rate": 9.998690515962282e-06, + "loss": 0.8344, + "step": 364 + }, + { + "epoch": 0.02008916285981617, + "grad_norm": 1.024707317352295, + "learning_rate": 9.998680577277304e-06, + "loss": 0.9026, + "step": 365 + }, + { + "epoch": 0.02014420166217183, + "grad_norm": 1.1056619882583618, + "learning_rate": 9.998670601023584e-06, + "loss": 1.017, + "step": 366 + }, + { + "epoch": 0.020199240464527493, + "grad_norm": 1.0555908679962158, + "learning_rate": 9.998660587201191e-06, + "loss": 0.9627, + "step": 367 + }, + { + "epoch": 0.02025427926688315, + "grad_norm": 0.9502031803131104, + "learning_rate": 9.998650535810204e-06, + "loss": 0.935, + "step": 368 + }, + { + "epoch": 0.020309318069238812, + "grad_norm": 1.0355613231658936, + "learning_rate": 9.998640446850699e-06, + "loss": 0.9946, + "step": 369 + }, + { + "epoch": 0.020364356871594474, + "grad_norm": 0.9906355142593384, + "learning_rate": 9.99863032032275e-06, + "loss": 0.9389, + "step": 370 + }, + { + "epoch": 0.020419395673950135, + "grad_norm": 0.9483911395072937, + "learning_rate": 9.99862015622643e-06, + "loss": 0.979, + "step": 371 + }, + { + "epoch": 0.020474434476305797, + "grad_norm": 0.9769986271858215, + "learning_rate": 9.998609954561822e-06, + "loss": 0.8972, + "step": 372 + }, + { + "epoch": 0.020529473278661458, + "grad_norm": 1.1682699918746948, + "learning_rate": 9.998599715329e-06, + "loss": 0.943, + "step": 373 + }, + { + "epoch": 0.020584512081017116, + "grad_norm": 1.007912516593933, + "learning_rate": 9.99858943852804e-06, + "loss": 0.8825, + "step": 374 + }, + { + "epoch": 0.020639550883372777, + "grad_norm": 0.9788785576820374, + "learning_rate": 9.99857912415902e-06, + "loss": 0.9667, + "step": 375 + }, + { + "epoch": 0.02069458968572844, + "grad_norm": 1.0804275274276733, + "learning_rate": 9.998568772222017e-06, + "loss": 1.0026, + "step": 376 + }, + { + "epoch": 0.0207496284880841, + "grad_norm": 1.0859237909317017, + "learning_rate": 9.998558382717109e-06, + "loss": 0.9592, + "step": 377 + }, + { + "epoch": 0.02080466729043976, + "grad_norm": 1.2925337553024292, + "learning_rate": 9.998547955644373e-06, + "loss": 0.9067, + "step": 378 + }, + { + "epoch": 0.02085970609279542, + "grad_norm": 0.9853373765945435, + "learning_rate": 9.99853749100389e-06, + "loss": 0.9538, + "step": 379 + }, + { + "epoch": 0.02091474489515108, + "grad_norm": 1.0461076498031616, + "learning_rate": 9.998526988795738e-06, + "loss": 0.9261, + "step": 380 + }, + { + "epoch": 0.020969783697506742, + "grad_norm": 1.024559497833252, + "learning_rate": 9.998516449019995e-06, + "loss": 0.9117, + "step": 381 + }, + { + "epoch": 0.021024822499862404, + "grad_norm": 1.1474825143814087, + "learning_rate": 9.998505871676739e-06, + "loss": 1.0177, + "step": 382 + }, + { + "epoch": 0.021079861302218065, + "grad_norm": 0.9587596654891968, + "learning_rate": 9.998495256766051e-06, + "loss": 0.8809, + "step": 383 + }, + { + "epoch": 0.021134900104573723, + "grad_norm": 0.9505122303962708, + "learning_rate": 9.998484604288013e-06, + "loss": 0.9266, + "step": 384 + }, + { + "epoch": 0.021189938906929384, + "grad_norm": 0.9625647664070129, + "learning_rate": 9.9984739142427e-06, + "loss": 0.9073, + "step": 385 + }, + { + "epoch": 0.021244977709285046, + "grad_norm": 0.9650934338569641, + "learning_rate": 9.998463186630196e-06, + "loss": 0.9042, + "step": 386 + }, + { + "epoch": 0.021300016511640707, + "grad_norm": 1.0289491415023804, + "learning_rate": 9.99845242145058e-06, + "loss": 0.929, + "step": 387 + }, + { + "epoch": 0.02135505531399637, + "grad_norm": 0.9543869495391846, + "learning_rate": 9.998441618703935e-06, + "loss": 0.9406, + "step": 388 + }, + { + "epoch": 0.02141009411635203, + "grad_norm": 0.9276942610740662, + "learning_rate": 9.99843077839034e-06, + "loss": 0.8982, + "step": 389 + }, + { + "epoch": 0.021465132918707688, + "grad_norm": 0.9264664053916931, + "learning_rate": 9.998419900509877e-06, + "loss": 0.7255, + "step": 390 + }, + { + "epoch": 0.02152017172106335, + "grad_norm": 0.9961187243461609, + "learning_rate": 9.998408985062628e-06, + "loss": 0.9826, + "step": 391 + }, + { + "epoch": 0.02157521052341901, + "grad_norm": 0.966596245765686, + "learning_rate": 9.998398032048676e-06, + "loss": 0.8159, + "step": 392 + }, + { + "epoch": 0.021630249325774672, + "grad_norm": 1.1336095333099365, + "learning_rate": 9.998387041468102e-06, + "loss": 0.9289, + "step": 393 + }, + { + "epoch": 0.021685288128130333, + "grad_norm": 1.0453619956970215, + "learning_rate": 9.998376013320989e-06, + "loss": 0.8816, + "step": 394 + }, + { + "epoch": 0.02174032693048599, + "grad_norm": 0.8961821794509888, + "learning_rate": 9.998364947607419e-06, + "loss": 0.871, + "step": 395 + }, + { + "epoch": 0.021795365732841653, + "grad_norm": 1.3420332670211792, + "learning_rate": 9.998353844327477e-06, + "loss": 0.9338, + "step": 396 + }, + { + "epoch": 0.021850404535197314, + "grad_norm": 0.9635335206985474, + "learning_rate": 9.998342703481246e-06, + "loss": 0.9592, + "step": 397 + }, + { + "epoch": 0.021905443337552975, + "grad_norm": 1.3322341442108154, + "learning_rate": 9.998331525068807e-06, + "loss": 1.0974, + "step": 398 + }, + { + "epoch": 0.021960482139908637, + "grad_norm": 1.017220377922058, + "learning_rate": 9.998320309090247e-06, + "loss": 0.9827, + "step": 399 + }, + { + "epoch": 0.022015520942264295, + "grad_norm": 1.0080329179763794, + "learning_rate": 9.99830905554565e-06, + "loss": 0.877, + "step": 400 + }, + { + "epoch": 0.022070559744619956, + "grad_norm": 0.9883211255073547, + "learning_rate": 9.998297764435101e-06, + "loss": 0.9625, + "step": 401 + }, + { + "epoch": 0.022125598546975617, + "grad_norm": 1.0948412418365479, + "learning_rate": 9.998286435758684e-06, + "loss": 0.9058, + "step": 402 + }, + { + "epoch": 0.02218063734933128, + "grad_norm": 0.9402000308036804, + "learning_rate": 9.998275069516482e-06, + "loss": 0.8882, + "step": 403 + }, + { + "epoch": 0.02223567615168694, + "grad_norm": 0.9858806133270264, + "learning_rate": 9.998263665708583e-06, + "loss": 0.9086, + "step": 404 + }, + { + "epoch": 0.0222907149540426, + "grad_norm": 1.0556131601333618, + "learning_rate": 9.998252224335073e-06, + "loss": 0.9583, + "step": 405 + }, + { + "epoch": 0.02234575375639826, + "grad_norm": 1.092766284942627, + "learning_rate": 9.998240745396037e-06, + "loss": 0.9124, + "step": 406 + }, + { + "epoch": 0.02240079255875392, + "grad_norm": 1.1902250051498413, + "learning_rate": 9.998229228891563e-06, + "loss": 1.0566, + "step": 407 + }, + { + "epoch": 0.022455831361109582, + "grad_norm": 1.067906141281128, + "learning_rate": 9.998217674821734e-06, + "loss": 0.9823, + "step": 408 + }, + { + "epoch": 0.022510870163465244, + "grad_norm": 1.0051710605621338, + "learning_rate": 9.998206083186638e-06, + "loss": 0.9141, + "step": 409 + }, + { + "epoch": 0.022565908965820905, + "grad_norm": 1.046412467956543, + "learning_rate": 9.998194453986367e-06, + "loss": 0.9439, + "step": 410 + }, + { + "epoch": 0.022620947768176563, + "grad_norm": 1.1103553771972656, + "learning_rate": 9.998182787221e-06, + "loss": 0.9494, + "step": 411 + }, + { + "epoch": 0.022675986570532224, + "grad_norm": 1.0508466958999634, + "learning_rate": 9.998171082890632e-06, + "loss": 0.9202, + "step": 412 + }, + { + "epoch": 0.022731025372887886, + "grad_norm": 1.1364226341247559, + "learning_rate": 9.998159340995347e-06, + "loss": 0.9859, + "step": 413 + }, + { + "epoch": 0.022786064175243547, + "grad_norm": 1.2073607444763184, + "learning_rate": 9.998147561535234e-06, + "loss": 0.8883, + "step": 414 + }, + { + "epoch": 0.02284110297759921, + "grad_norm": 1.0657012462615967, + "learning_rate": 9.998135744510384e-06, + "loss": 0.8321, + "step": 415 + }, + { + "epoch": 0.02289614177995487, + "grad_norm": 1.0101548433303833, + "learning_rate": 9.998123889920881e-06, + "loss": 0.9374, + "step": 416 + }, + { + "epoch": 0.022951180582310528, + "grad_norm": 1.057455062866211, + "learning_rate": 9.998111997766817e-06, + "loss": 0.8831, + "step": 417 + }, + { + "epoch": 0.02300621938466619, + "grad_norm": 1.206092357635498, + "learning_rate": 9.998100068048282e-06, + "loss": 0.8812, + "step": 418 + }, + { + "epoch": 0.02306125818702185, + "grad_norm": 1.0709773302078247, + "learning_rate": 9.998088100765366e-06, + "loss": 0.9486, + "step": 419 + }, + { + "epoch": 0.023116296989377512, + "grad_norm": 1.066469669342041, + "learning_rate": 9.998076095918156e-06, + "loss": 1.0229, + "step": 420 + }, + { + "epoch": 0.023171335791733173, + "grad_norm": 1.0443583726882935, + "learning_rate": 9.998064053506744e-06, + "loss": 0.8615, + "step": 421 + }, + { + "epoch": 0.02322637459408883, + "grad_norm": 1.103096842765808, + "learning_rate": 9.99805197353122e-06, + "loss": 0.9909, + "step": 422 + }, + { + "epoch": 0.023281413396444493, + "grad_norm": 0.9804643392562866, + "learning_rate": 9.998039855991677e-06, + "loss": 0.9214, + "step": 423 + }, + { + "epoch": 0.023336452198800154, + "grad_norm": 0.9880676865577698, + "learning_rate": 9.998027700888202e-06, + "loss": 0.9345, + "step": 424 + }, + { + "epoch": 0.023391491001155815, + "grad_norm": 0.9633826017379761, + "learning_rate": 9.99801550822089e-06, + "loss": 0.9897, + "step": 425 + }, + { + "epoch": 0.023446529803511477, + "grad_norm": 1.0159331560134888, + "learning_rate": 9.998003277989831e-06, + "loss": 0.9385, + "step": 426 + }, + { + "epoch": 0.023501568605867135, + "grad_norm": 1.009667158126831, + "learning_rate": 9.99799101019512e-06, + "loss": 0.9013, + "step": 427 + }, + { + "epoch": 0.023556607408222796, + "grad_norm": 0.9478578567504883, + "learning_rate": 9.997978704836842e-06, + "loss": 0.8775, + "step": 428 + }, + { + "epoch": 0.023611646210578457, + "grad_norm": 1.013181447982788, + "learning_rate": 9.997966361915096e-06, + "loss": 0.8797, + "step": 429 + }, + { + "epoch": 0.02366668501293412, + "grad_norm": 1.0337481498718262, + "learning_rate": 9.997953981429974e-06, + "loss": 1.0047, + "step": 430 + }, + { + "epoch": 0.02372172381528978, + "grad_norm": 0.9423721432685852, + "learning_rate": 9.997941563381566e-06, + "loss": 0.8639, + "step": 431 + }, + { + "epoch": 0.02377676261764544, + "grad_norm": 1.100492000579834, + "learning_rate": 9.997929107769968e-06, + "loss": 1.0022, + "step": 432 + }, + { + "epoch": 0.0238318014200011, + "grad_norm": 1.1232364177703857, + "learning_rate": 9.997916614595272e-06, + "loss": 0.9145, + "step": 433 + }, + { + "epoch": 0.02388684022235676, + "grad_norm": 0.9466833472251892, + "learning_rate": 9.997904083857572e-06, + "loss": 0.9397, + "step": 434 + }, + { + "epoch": 0.023941879024712422, + "grad_norm": 0.9514566659927368, + "learning_rate": 9.997891515556963e-06, + "loss": 0.8025, + "step": 435 + }, + { + "epoch": 0.023996917827068084, + "grad_norm": 0.9292222261428833, + "learning_rate": 9.997878909693539e-06, + "loss": 0.7739, + "step": 436 + }, + { + "epoch": 0.024051956629423745, + "grad_norm": 1.1049963235855103, + "learning_rate": 9.997866266267397e-06, + "loss": 0.9439, + "step": 437 + }, + { + "epoch": 0.024106995431779403, + "grad_norm": 1.0938019752502441, + "learning_rate": 9.997853585278627e-06, + "loss": 0.9479, + "step": 438 + }, + { + "epoch": 0.024162034234135064, + "grad_norm": 1.0423611402511597, + "learning_rate": 9.997840866727331e-06, + "loss": 0.9309, + "step": 439 + }, + { + "epoch": 0.024217073036490726, + "grad_norm": 1.0584756135940552, + "learning_rate": 9.997828110613598e-06, + "loss": 1.0218, + "step": 440 + }, + { + "epoch": 0.024272111838846387, + "grad_norm": 0.9986408948898315, + "learning_rate": 9.997815316937527e-06, + "loss": 0.9734, + "step": 441 + }, + { + "epoch": 0.02432715064120205, + "grad_norm": 0.9680983424186707, + "learning_rate": 9.997802485699215e-06, + "loss": 0.9286, + "step": 442 + }, + { + "epoch": 0.024382189443557706, + "grad_norm": 1.2231700420379639, + "learning_rate": 9.997789616898757e-06, + "loss": 0.8083, + "step": 443 + }, + { + "epoch": 0.024437228245913368, + "grad_norm": 1.0064021348953247, + "learning_rate": 9.99777671053625e-06, + "loss": 0.9161, + "step": 444 + }, + { + "epoch": 0.02449226704826903, + "grad_norm": 0.9658541679382324, + "learning_rate": 9.99776376661179e-06, + "loss": 0.8027, + "step": 445 + }, + { + "epoch": 0.02454730585062469, + "grad_norm": 0.9440343379974365, + "learning_rate": 9.997750785125477e-06, + "loss": 0.9124, + "step": 446 + }, + { + "epoch": 0.024602344652980352, + "grad_norm": 0.998792827129364, + "learning_rate": 9.997737766077404e-06, + "loss": 0.8699, + "step": 447 + }, + { + "epoch": 0.024657383455336013, + "grad_norm": 1.430880069732666, + "learning_rate": 9.997724709467676e-06, + "loss": 0.9158, + "step": 448 + }, + { + "epoch": 0.02471242225769167, + "grad_norm": 0.9737820029258728, + "learning_rate": 9.997711615296384e-06, + "loss": 0.9496, + "step": 449 + }, + { + "epoch": 0.024767461060047333, + "grad_norm": 0.9710075855255127, + "learning_rate": 9.997698483563629e-06, + "loss": 0.8714, + "step": 450 + }, + { + "epoch": 0.024822499862402994, + "grad_norm": 1.5286253690719604, + "learning_rate": 9.997685314269511e-06, + "loss": 0.8421, + "step": 451 + }, + { + "epoch": 0.024877538664758655, + "grad_norm": 1.0269445180892944, + "learning_rate": 9.99767210741413e-06, + "loss": 1.0131, + "step": 452 + }, + { + "epoch": 0.024932577467114317, + "grad_norm": 0.9780508279800415, + "learning_rate": 9.99765886299758e-06, + "loss": 0.9897, + "step": 453 + }, + { + "epoch": 0.024987616269469975, + "grad_norm": 0.998332679271698, + "learning_rate": 9.997645581019965e-06, + "loss": 0.9647, + "step": 454 + }, + { + "epoch": 0.025042655071825636, + "grad_norm": 1.7062602043151855, + "learning_rate": 9.997632261481383e-06, + "loss": 1.0729, + "step": 455 + }, + { + "epoch": 0.025097693874181298, + "grad_norm": 0.9793694615364075, + "learning_rate": 9.997618904381936e-06, + "loss": 0.9556, + "step": 456 + }, + { + "epoch": 0.02515273267653696, + "grad_norm": 1.0183895826339722, + "learning_rate": 9.997605509721721e-06, + "loss": 0.9194, + "step": 457 + }, + { + "epoch": 0.02520777147889262, + "grad_norm": 1.0288400650024414, + "learning_rate": 9.997592077500844e-06, + "loss": 0.955, + "step": 458 + }, + { + "epoch": 0.025262810281248282, + "grad_norm": 0.9551253914833069, + "learning_rate": 9.997578607719401e-06, + "loss": 0.8498, + "step": 459 + }, + { + "epoch": 0.02531784908360394, + "grad_norm": 0.9648008942604065, + "learning_rate": 9.997565100377494e-06, + "loss": 0.9306, + "step": 460 + }, + { + "epoch": 0.0253728878859596, + "grad_norm": 0.9206677675247192, + "learning_rate": 9.997551555475225e-06, + "loss": 0.7874, + "step": 461 + }, + { + "epoch": 0.025427926688315262, + "grad_norm": 1.0479545593261719, + "learning_rate": 9.997537973012698e-06, + "loss": 0.9201, + "step": 462 + }, + { + "epoch": 0.025482965490670924, + "grad_norm": 1.0329946279525757, + "learning_rate": 9.997524352990013e-06, + "loss": 0.9577, + "step": 463 + }, + { + "epoch": 0.025538004293026585, + "grad_norm": 1.1177828311920166, + "learning_rate": 9.997510695407273e-06, + "loss": 1.0041, + "step": 464 + }, + { + "epoch": 0.025593043095382243, + "grad_norm": 1.0351577997207642, + "learning_rate": 9.99749700026458e-06, + "loss": 0.9952, + "step": 465 + }, + { + "epoch": 0.025648081897737905, + "grad_norm": 0.905274510383606, + "learning_rate": 9.997483267562035e-06, + "loss": 0.8185, + "step": 466 + }, + { + "epoch": 0.025703120700093566, + "grad_norm": 1.0749776363372803, + "learning_rate": 9.997469497299747e-06, + "loss": 1.0611, + "step": 467 + }, + { + "epoch": 0.025758159502449227, + "grad_norm": 0.8972223401069641, + "learning_rate": 9.997455689477815e-06, + "loss": 0.8994, + "step": 468 + }, + { + "epoch": 0.02581319830480489, + "grad_norm": 1.0669914484024048, + "learning_rate": 9.997441844096342e-06, + "loss": 1.06, + "step": 469 + }, + { + "epoch": 0.025868237107160547, + "grad_norm": 1.0431914329528809, + "learning_rate": 9.997427961155435e-06, + "loss": 0.8657, + "step": 470 + }, + { + "epoch": 0.025923275909516208, + "grad_norm": 0.9609962701797485, + "learning_rate": 9.997414040655198e-06, + "loss": 0.8864, + "step": 471 + }, + { + "epoch": 0.02597831471187187, + "grad_norm": 1.0829721689224243, + "learning_rate": 9.997400082595735e-06, + "loss": 0.9221, + "step": 472 + }, + { + "epoch": 0.02603335351422753, + "grad_norm": 0.992082953453064, + "learning_rate": 9.99738608697715e-06, + "loss": 0.8455, + "step": 473 + }, + { + "epoch": 0.026088392316583192, + "grad_norm": 1.0486301183700562, + "learning_rate": 9.997372053799547e-06, + "loss": 0.8729, + "step": 474 + }, + { + "epoch": 0.026143431118938854, + "grad_norm": 1.0328491926193237, + "learning_rate": 9.997357983063036e-06, + "loss": 0.8788, + "step": 475 + }, + { + "epoch": 0.02619846992129451, + "grad_norm": 0.963333249092102, + "learning_rate": 9.997343874767719e-06, + "loss": 0.892, + "step": 476 + }, + { + "epoch": 0.026253508723650173, + "grad_norm": 1.1606497764587402, + "learning_rate": 9.997329728913704e-06, + "loss": 0.9984, + "step": 477 + }, + { + "epoch": 0.026308547526005834, + "grad_norm": 1.241650104522705, + "learning_rate": 9.997315545501096e-06, + "loss": 0.946, + "step": 478 + }, + { + "epoch": 0.026363586328361496, + "grad_norm": 1.008004069328308, + "learning_rate": 9.99730132453e-06, + "loss": 0.849, + "step": 479 + }, + { + "epoch": 0.026418625130717157, + "grad_norm": 0.9883478879928589, + "learning_rate": 9.997287066000527e-06, + "loss": 0.9478, + "step": 480 + }, + { + "epoch": 0.026473663933072815, + "grad_norm": 1.0224446058273315, + "learning_rate": 9.997272769912783e-06, + "loss": 1.0318, + "step": 481 + }, + { + "epoch": 0.026528702735428476, + "grad_norm": 0.9412569403648376, + "learning_rate": 9.997258436266874e-06, + "loss": 0.9119, + "step": 482 + }, + { + "epoch": 0.026583741537784138, + "grad_norm": 0.9214537739753723, + "learning_rate": 9.997244065062906e-06, + "loss": 0.8785, + "step": 483 + }, + { + "epoch": 0.0266387803401398, + "grad_norm": 1.0015628337860107, + "learning_rate": 9.997229656300991e-06, + "loss": 0.8869, + "step": 484 + }, + { + "epoch": 0.02669381914249546, + "grad_norm": 0.8965190052986145, + "learning_rate": 9.997215209981237e-06, + "loss": 0.7009, + "step": 485 + }, + { + "epoch": 0.02674885794485112, + "grad_norm": 1.1976135969161987, + "learning_rate": 9.997200726103749e-06, + "loss": 0.9795, + "step": 486 + }, + { + "epoch": 0.02680389674720678, + "grad_norm": 0.864780843257904, + "learning_rate": 9.997186204668639e-06, + "loss": 0.7687, + "step": 487 + }, + { + "epoch": 0.02685893554956244, + "grad_norm": 0.9946566820144653, + "learning_rate": 9.997171645676013e-06, + "loss": 0.9672, + "step": 488 + }, + { + "epoch": 0.026913974351918103, + "grad_norm": 1.043835997581482, + "learning_rate": 9.997157049125985e-06, + "loss": 0.862, + "step": 489 + }, + { + "epoch": 0.026969013154273764, + "grad_norm": 0.9697456955909729, + "learning_rate": 9.99714241501866e-06, + "loss": 0.8368, + "step": 490 + }, + { + "epoch": 0.027024051956629425, + "grad_norm": 0.9975618124008179, + "learning_rate": 9.997127743354153e-06, + "loss": 0.8739, + "step": 491 + }, + { + "epoch": 0.027079090758985083, + "grad_norm": 1.0055313110351562, + "learning_rate": 9.99711303413257e-06, + "loss": 0.9227, + "step": 492 + }, + { + "epoch": 0.027134129561340745, + "grad_norm": 1.0418384075164795, + "learning_rate": 9.997098287354024e-06, + "loss": 0.9978, + "step": 493 + }, + { + "epoch": 0.027189168363696406, + "grad_norm": 0.8648970723152161, + "learning_rate": 9.997083503018625e-06, + "loss": 0.8363, + "step": 494 + }, + { + "epoch": 0.027244207166052067, + "grad_norm": 1.13506019115448, + "learning_rate": 9.997068681126483e-06, + "loss": 0.8851, + "step": 495 + }, + { + "epoch": 0.02729924596840773, + "grad_norm": 0.974400520324707, + "learning_rate": 9.997053821677712e-06, + "loss": 0.8533, + "step": 496 + }, + { + "epoch": 0.027354284770763387, + "grad_norm": 1.226507544517517, + "learning_rate": 9.997038924672419e-06, + "loss": 0.8586, + "step": 497 + }, + { + "epoch": 0.027409323573119048, + "grad_norm": 1.004753589630127, + "learning_rate": 9.997023990110721e-06, + "loss": 0.8974, + "step": 498 + }, + { + "epoch": 0.02746436237547471, + "grad_norm": 1.0492571592330933, + "learning_rate": 9.997009017992729e-06, + "loss": 0.8457, + "step": 499 + }, + { + "epoch": 0.02751940117783037, + "grad_norm": 1.0068167448043823, + "learning_rate": 9.996994008318554e-06, + "loss": 0.9608, + "step": 500 + }, + { + "epoch": 0.027574439980186032, + "grad_norm": 0.9686044454574585, + "learning_rate": 9.996978961088311e-06, + "loss": 0.9041, + "step": 501 + }, + { + "epoch": 0.027629478782541694, + "grad_norm": 1.281728744506836, + "learning_rate": 9.99696387630211e-06, + "loss": 0.9739, + "step": 502 + }, + { + "epoch": 0.02768451758489735, + "grad_norm": 0.9069758653640747, + "learning_rate": 9.996948753960065e-06, + "loss": 0.8467, + "step": 503 + }, + { + "epoch": 0.027739556387253013, + "grad_norm": 1.0337222814559937, + "learning_rate": 9.996933594062293e-06, + "loss": 0.9638, + "step": 504 + }, + { + "epoch": 0.027794595189608674, + "grad_norm": 0.9695359468460083, + "learning_rate": 9.996918396608905e-06, + "loss": 0.8986, + "step": 505 + }, + { + "epoch": 0.027849633991964336, + "grad_norm": 0.9120615124702454, + "learning_rate": 9.996903161600016e-06, + "loss": 0.9103, + "step": 506 + }, + { + "epoch": 0.027904672794319997, + "grad_norm": 0.9736546874046326, + "learning_rate": 9.996887889035741e-06, + "loss": 0.9308, + "step": 507 + }, + { + "epoch": 0.027959711596675655, + "grad_norm": 1.0184897184371948, + "learning_rate": 9.996872578916192e-06, + "loss": 0.8978, + "step": 508 + }, + { + "epoch": 0.028014750399031316, + "grad_norm": 0.9791838526725769, + "learning_rate": 9.996857231241489e-06, + "loss": 0.8639, + "step": 509 + }, + { + "epoch": 0.028069789201386978, + "grad_norm": 1.2985681295394897, + "learning_rate": 9.996841846011742e-06, + "loss": 0.9581, + "step": 510 + }, + { + "epoch": 0.02812482800374264, + "grad_norm": 1.0647368431091309, + "learning_rate": 9.996826423227071e-06, + "loss": 1.0565, + "step": 511 + }, + { + "epoch": 0.0281798668060983, + "grad_norm": 1.0336421728134155, + "learning_rate": 9.996810962887591e-06, + "loss": 1.008, + "step": 512 + }, + { + "epoch": 0.02823490560845396, + "grad_norm": 1.1838933229446411, + "learning_rate": 9.996795464993416e-06, + "loss": 0.8359, + "step": 513 + }, + { + "epoch": 0.02828994441080962, + "grad_norm": 0.9898360371589661, + "learning_rate": 9.996779929544663e-06, + "loss": 0.8501, + "step": 514 + }, + { + "epoch": 0.02834498321316528, + "grad_norm": 0.9836066365242004, + "learning_rate": 9.99676435654145e-06, + "loss": 0.8795, + "step": 515 + }, + { + "epoch": 0.028400022015520943, + "grad_norm": 1.0621601343154907, + "learning_rate": 9.996748745983895e-06, + "loss": 0.8746, + "step": 516 + }, + { + "epoch": 0.028455060817876604, + "grad_norm": 1.0082437992095947, + "learning_rate": 9.996733097872113e-06, + "loss": 0.9278, + "step": 517 + }, + { + "epoch": 0.028510099620232265, + "grad_norm": 0.9903931617736816, + "learning_rate": 9.996717412206222e-06, + "loss": 0.8264, + "step": 518 + }, + { + "epoch": 0.028565138422587923, + "grad_norm": 1.0797243118286133, + "learning_rate": 9.996701688986342e-06, + "loss": 1.0077, + "step": 519 + }, + { + "epoch": 0.028620177224943585, + "grad_norm": 1.147133231163025, + "learning_rate": 9.99668592821259e-06, + "loss": 0.9374, + "step": 520 + }, + { + "epoch": 0.028675216027299246, + "grad_norm": 0.9993947744369507, + "learning_rate": 9.996670129885082e-06, + "loss": 0.9562, + "step": 521 + }, + { + "epoch": 0.028730254829654907, + "grad_norm": 0.8580895066261292, + "learning_rate": 9.99665429400394e-06, + "loss": 0.7985, + "step": 522 + }, + { + "epoch": 0.02878529363201057, + "grad_norm": 0.9251388907432556, + "learning_rate": 9.996638420569281e-06, + "loss": 0.7323, + "step": 523 + }, + { + "epoch": 0.028840332434366227, + "grad_norm": 1.0010193586349487, + "learning_rate": 9.996622509581227e-06, + "loss": 0.9316, + "step": 524 + }, + { + "epoch": 0.028895371236721888, + "grad_norm": 0.9822579026222229, + "learning_rate": 9.996606561039894e-06, + "loss": 0.8978, + "step": 525 + }, + { + "epoch": 0.02895041003907755, + "grad_norm": 1.0760595798492432, + "learning_rate": 9.996590574945403e-06, + "loss": 0.9125, + "step": 526 + }, + { + "epoch": 0.02900544884143321, + "grad_norm": 1.138869285583496, + "learning_rate": 9.996574551297876e-06, + "loss": 0.8185, + "step": 527 + }, + { + "epoch": 0.029060487643788872, + "grad_norm": 1.002994179725647, + "learning_rate": 9.996558490097433e-06, + "loss": 0.9404, + "step": 528 + }, + { + "epoch": 0.02911552644614453, + "grad_norm": 0.9550611972808838, + "learning_rate": 9.996542391344194e-06, + "loss": 0.859, + "step": 529 + }, + { + "epoch": 0.02917056524850019, + "grad_norm": 0.9236055612564087, + "learning_rate": 9.996526255038277e-06, + "loss": 0.7758, + "step": 530 + }, + { + "epoch": 0.029225604050855853, + "grad_norm": 1.103966474533081, + "learning_rate": 9.996510081179808e-06, + "loss": 1.0147, + "step": 531 + }, + { + "epoch": 0.029280642853211514, + "grad_norm": 0.9884665012359619, + "learning_rate": 9.996493869768906e-06, + "loss": 0.8784, + "step": 532 + }, + { + "epoch": 0.029335681655567176, + "grad_norm": 0.9173223376274109, + "learning_rate": 9.996477620805694e-06, + "loss": 0.8741, + "step": 533 + }, + { + "epoch": 0.029390720457922837, + "grad_norm": 0.965548574924469, + "learning_rate": 9.996461334290294e-06, + "loss": 0.8989, + "step": 534 + }, + { + "epoch": 0.029445759260278495, + "grad_norm": 0.9939296245574951, + "learning_rate": 9.996445010222828e-06, + "loss": 0.8552, + "step": 535 + }, + { + "epoch": 0.029500798062634156, + "grad_norm": 1.0081578493118286, + "learning_rate": 9.996428648603417e-06, + "loss": 0.9138, + "step": 536 + }, + { + "epoch": 0.029555836864989818, + "grad_norm": 1.0139487981796265, + "learning_rate": 9.996412249432188e-06, + "loss": 0.9452, + "step": 537 + }, + { + "epoch": 0.02961087566734548, + "grad_norm": 0.9463647603988647, + "learning_rate": 9.996395812709262e-06, + "loss": 0.8721, + "step": 538 + }, + { + "epoch": 0.02966591446970114, + "grad_norm": 0.9981473684310913, + "learning_rate": 9.99637933843476e-06, + "loss": 0.7791, + "step": 539 + }, + { + "epoch": 0.0297209532720568, + "grad_norm": 1.1637190580368042, + "learning_rate": 9.996362826608812e-06, + "loss": 0.8798, + "step": 540 + }, + { + "epoch": 0.02977599207441246, + "grad_norm": 2.2887051105499268, + "learning_rate": 9.996346277231536e-06, + "loss": 0.9303, + "step": 541 + }, + { + "epoch": 0.02983103087676812, + "grad_norm": 0.9173391461372375, + "learning_rate": 9.99632969030306e-06, + "loss": 0.8627, + "step": 542 + }, + { + "epoch": 0.029886069679123783, + "grad_norm": 1.033355474472046, + "learning_rate": 9.996313065823506e-06, + "loss": 0.9906, + "step": 543 + }, + { + "epoch": 0.029941108481479444, + "grad_norm": 0.9286639094352722, + "learning_rate": 9.996296403793002e-06, + "loss": 0.7043, + "step": 544 + }, + { + "epoch": 0.029996147283835102, + "grad_norm": 0.963238000869751, + "learning_rate": 9.996279704211671e-06, + "loss": 1.0236, + "step": 545 + }, + { + "epoch": 0.030051186086190763, + "grad_norm": 1.0275089740753174, + "learning_rate": 9.99626296707964e-06, + "loss": 0.976, + "step": 546 + }, + { + "epoch": 0.030106224888546425, + "grad_norm": 1.0944674015045166, + "learning_rate": 9.996246192397032e-06, + "loss": 0.9209, + "step": 547 + }, + { + "epoch": 0.030161263690902086, + "grad_norm": 0.9620945453643799, + "learning_rate": 9.996229380163976e-06, + "loss": 0.8973, + "step": 548 + }, + { + "epoch": 0.030216302493257748, + "grad_norm": 1.032549500465393, + "learning_rate": 9.996212530380597e-06, + "loss": 0.892, + "step": 549 + }, + { + "epoch": 0.03027134129561341, + "grad_norm": 1.0433719158172607, + "learning_rate": 9.996195643047023e-06, + "loss": 0.8428, + "step": 550 + }, + { + "epoch": 0.030326380097969067, + "grad_norm": 1.1541085243225098, + "learning_rate": 9.996178718163378e-06, + "loss": 0.9084, + "step": 551 + }, + { + "epoch": 0.03038141890032473, + "grad_norm": 0.9386873245239258, + "learning_rate": 9.996161755729793e-06, + "loss": 0.9246, + "step": 552 + }, + { + "epoch": 0.03043645770268039, + "grad_norm": 1.092236042022705, + "learning_rate": 9.996144755746393e-06, + "loss": 0.8419, + "step": 553 + }, + { + "epoch": 0.03049149650503605, + "grad_norm": 0.9517606496810913, + "learning_rate": 9.996127718213306e-06, + "loss": 0.9002, + "step": 554 + }, + { + "epoch": 0.030546535307391712, + "grad_norm": 0.965972900390625, + "learning_rate": 9.996110643130661e-06, + "loss": 0.9197, + "step": 555 + }, + { + "epoch": 0.03060157410974737, + "grad_norm": 0.9396095275878906, + "learning_rate": 9.996093530498586e-06, + "loss": 0.8686, + "step": 556 + }, + { + "epoch": 0.030656612912103032, + "grad_norm": 1.0154120922088623, + "learning_rate": 9.99607638031721e-06, + "loss": 0.9773, + "step": 557 + }, + { + "epoch": 0.030711651714458693, + "grad_norm": 1.3572301864624023, + "learning_rate": 9.99605919258666e-06, + "loss": 0.911, + "step": 558 + }, + { + "epoch": 0.030766690516814355, + "grad_norm": 0.968278169631958, + "learning_rate": 9.996041967307066e-06, + "loss": 0.7704, + "step": 559 + }, + { + "epoch": 0.030821729319170016, + "grad_norm": 0.9867869019508362, + "learning_rate": 9.99602470447856e-06, + "loss": 0.873, + "step": 560 + }, + { + "epoch": 0.030876768121525677, + "grad_norm": 1.056450605392456, + "learning_rate": 9.996007404101269e-06, + "loss": 0.941, + "step": 561 + }, + { + "epoch": 0.030931806923881335, + "grad_norm": 1.0419799089431763, + "learning_rate": 9.995990066175321e-06, + "loss": 0.957, + "step": 562 + }, + { + "epoch": 0.030986845726236997, + "grad_norm": 0.9789314866065979, + "learning_rate": 9.995972690700852e-06, + "loss": 0.9229, + "step": 563 + }, + { + "epoch": 0.031041884528592658, + "grad_norm": 0.917783796787262, + "learning_rate": 9.995955277677989e-06, + "loss": 0.8186, + "step": 564 + }, + { + "epoch": 0.03109692333094832, + "grad_norm": 1.0231432914733887, + "learning_rate": 9.995937827106863e-06, + "loss": 0.8624, + "step": 565 + }, + { + "epoch": 0.03115196213330398, + "grad_norm": 0.9552083015441895, + "learning_rate": 9.995920338987605e-06, + "loss": 0.7967, + "step": 566 + }, + { + "epoch": 0.03120700093565964, + "grad_norm": 0.9441083669662476, + "learning_rate": 9.995902813320349e-06, + "loss": 0.8471, + "step": 567 + }, + { + "epoch": 0.0312620397380153, + "grad_norm": 1.0025299787521362, + "learning_rate": 9.995885250105223e-06, + "loss": 0.8646, + "step": 568 + }, + { + "epoch": 0.03131707854037096, + "grad_norm": 0.8997280597686768, + "learning_rate": 9.99586764934236e-06, + "loss": 0.8736, + "step": 569 + }, + { + "epoch": 0.03137211734272662, + "grad_norm": 0.9090663194656372, + "learning_rate": 9.995850011031896e-06, + "loss": 0.8548, + "step": 570 + }, + { + "epoch": 0.031427156145082284, + "grad_norm": 0.9641294479370117, + "learning_rate": 9.995832335173959e-06, + "loss": 0.8667, + "step": 571 + }, + { + "epoch": 0.031482194947437946, + "grad_norm": 0.9165804982185364, + "learning_rate": 9.995814621768682e-06, + "loss": 0.803, + "step": 572 + }, + { + "epoch": 0.03153723374979361, + "grad_norm": 0.9672492742538452, + "learning_rate": 9.995796870816202e-06, + "loss": 0.8335, + "step": 573 + }, + { + "epoch": 0.03159227255214927, + "grad_norm": 0.9359404444694519, + "learning_rate": 9.995779082316648e-06, + "loss": 0.8294, + "step": 574 + }, + { + "epoch": 0.03164731135450492, + "grad_norm": 0.926925003528595, + "learning_rate": 9.995761256270157e-06, + "loss": 0.7714, + "step": 575 + }, + { + "epoch": 0.031702350156860584, + "grad_norm": 1.1848629713058472, + "learning_rate": 9.995743392676862e-06, + "loss": 0.8925, + "step": 576 + }, + { + "epoch": 0.031757388959216246, + "grad_norm": 0.9624786972999573, + "learning_rate": 9.995725491536897e-06, + "loss": 0.9292, + "step": 577 + }, + { + "epoch": 0.03181242776157191, + "grad_norm": 0.9479736089706421, + "learning_rate": 9.995707552850396e-06, + "loss": 0.8797, + "step": 578 + }, + { + "epoch": 0.03186746656392757, + "grad_norm": 0.9551546573638916, + "learning_rate": 9.995689576617494e-06, + "loss": 0.8793, + "step": 579 + }, + { + "epoch": 0.03192250536628323, + "grad_norm": 0.9210056662559509, + "learning_rate": 9.995671562838325e-06, + "loss": 0.9714, + "step": 580 + }, + { + "epoch": 0.03197754416863889, + "grad_norm": 1.063117504119873, + "learning_rate": 9.995653511513029e-06, + "loss": 0.9608, + "step": 581 + }, + { + "epoch": 0.03203258297099455, + "grad_norm": 0.9426459670066833, + "learning_rate": 9.995635422641736e-06, + "loss": 0.9102, + "step": 582 + }, + { + "epoch": 0.032087621773350214, + "grad_norm": 1.0176693201065063, + "learning_rate": 9.995617296224584e-06, + "loss": 0.9109, + "step": 583 + }, + { + "epoch": 0.032142660575705875, + "grad_norm": 0.9457042217254639, + "learning_rate": 9.995599132261711e-06, + "loss": 0.9017, + "step": 584 + }, + { + "epoch": 0.03219769937806154, + "grad_norm": 1.5851638317108154, + "learning_rate": 9.995580930753252e-06, + "loss": 0.967, + "step": 585 + }, + { + "epoch": 0.03225273818041719, + "grad_norm": 0.9961487054824829, + "learning_rate": 9.995562691699345e-06, + "loss": 0.9396, + "step": 586 + }, + { + "epoch": 0.03230777698277285, + "grad_norm": 0.9892112016677856, + "learning_rate": 9.995544415100125e-06, + "loss": 0.9058, + "step": 587 + }, + { + "epoch": 0.032362815785128514, + "grad_norm": 0.9052272439002991, + "learning_rate": 9.99552610095573e-06, + "loss": 0.9194, + "step": 588 + }, + { + "epoch": 0.032417854587484175, + "grad_norm": 0.8381399512290955, + "learning_rate": 9.995507749266297e-06, + "loss": 0.7465, + "step": 589 + }, + { + "epoch": 0.03247289338983984, + "grad_norm": 1.018964171409607, + "learning_rate": 9.995489360031969e-06, + "loss": 0.841, + "step": 590 + }, + { + "epoch": 0.0325279321921955, + "grad_norm": 0.908311128616333, + "learning_rate": 9.995470933252876e-06, + "loss": 0.8592, + "step": 591 + }, + { + "epoch": 0.03258297099455116, + "grad_norm": 1.2986040115356445, + "learning_rate": 9.995452468929162e-06, + "loss": 0.8341, + "step": 592 + }, + { + "epoch": 0.03263800979690682, + "grad_norm": 1.6565190553665161, + "learning_rate": 9.995433967060966e-06, + "loss": 0.8681, + "step": 593 + }, + { + "epoch": 0.03269304859926248, + "grad_norm": 0.9725674390792847, + "learning_rate": 9.995415427648423e-06, + "loss": 0.8449, + "step": 594 + }, + { + "epoch": 0.032748087401618144, + "grad_norm": 0.8683852553367615, + "learning_rate": 9.995396850691677e-06, + "loss": 0.8478, + "step": 595 + }, + { + "epoch": 0.0328031262039738, + "grad_norm": 0.9912856817245483, + "learning_rate": 9.995378236190862e-06, + "loss": 0.8912, + "step": 596 + }, + { + "epoch": 0.03285816500632946, + "grad_norm": 0.9396800398826599, + "learning_rate": 9.995359584146125e-06, + "loss": 0.856, + "step": 597 + }, + { + "epoch": 0.03291320380868512, + "grad_norm": 1.385006308555603, + "learning_rate": 9.995340894557601e-06, + "loss": 0.9633, + "step": 598 + }, + { + "epoch": 0.03296824261104078, + "grad_norm": 0.8982875943183899, + "learning_rate": 9.995322167425433e-06, + "loss": 0.9244, + "step": 599 + }, + { + "epoch": 0.033023281413396444, + "grad_norm": 0.8981022834777832, + "learning_rate": 9.995303402749759e-06, + "loss": 0.8854, + "step": 600 + }, + { + "epoch": 0.033078320215752105, + "grad_norm": 0.9917197227478027, + "learning_rate": 9.995284600530724e-06, + "loss": 1.0086, + "step": 601 + }, + { + "epoch": 0.033133359018107766, + "grad_norm": 1.0540626049041748, + "learning_rate": 9.995265760768464e-06, + "loss": 1.0022, + "step": 602 + }, + { + "epoch": 0.03318839782046343, + "grad_norm": 0.9523479342460632, + "learning_rate": 9.995246883463126e-06, + "loss": 0.9893, + "step": 603 + }, + { + "epoch": 0.03324343662281909, + "grad_norm": 0.9824770092964172, + "learning_rate": 9.99522796861485e-06, + "loss": 0.8385, + "step": 604 + }, + { + "epoch": 0.03329847542517475, + "grad_norm": 1.0968893766403198, + "learning_rate": 9.995209016223776e-06, + "loss": 1.0109, + "step": 605 + }, + { + "epoch": 0.03335351422753041, + "grad_norm": 0.9115625023841858, + "learning_rate": 9.995190026290049e-06, + "loss": 0.8656, + "step": 606 + }, + { + "epoch": 0.033408553029886066, + "grad_norm": 0.9795814156532288, + "learning_rate": 9.99517099881381e-06, + "loss": 0.8941, + "step": 607 + }, + { + "epoch": 0.03346359183224173, + "grad_norm": 0.9317291378974915, + "learning_rate": 9.995151933795204e-06, + "loss": 0.7819, + "step": 608 + }, + { + "epoch": 0.03351863063459739, + "grad_norm": 0.9936283230781555, + "learning_rate": 9.995132831234373e-06, + "loss": 0.8674, + "step": 609 + }, + { + "epoch": 0.03357366943695305, + "grad_norm": 0.9872812032699585, + "learning_rate": 9.995113691131462e-06, + "loss": 0.9038, + "step": 610 + }, + { + "epoch": 0.03362870823930871, + "grad_norm": 0.9516895413398743, + "learning_rate": 9.995094513486611e-06, + "loss": 0.9038, + "step": 611 + }, + { + "epoch": 0.03368374704166437, + "grad_norm": 1.090579867362976, + "learning_rate": 9.995075298299968e-06, + "loss": 0.9587, + "step": 612 + }, + { + "epoch": 0.033738785844020035, + "grad_norm": 1.021398663520813, + "learning_rate": 9.995056045571677e-06, + "loss": 0.9569, + "step": 613 + }, + { + "epoch": 0.033793824646375696, + "grad_norm": 1.009657382965088, + "learning_rate": 9.99503675530188e-06, + "loss": 0.8346, + "step": 614 + }, + { + "epoch": 0.03384886344873136, + "grad_norm": 1.0478712320327759, + "learning_rate": 9.995017427490725e-06, + "loss": 1.0566, + "step": 615 + }, + { + "epoch": 0.03390390225108702, + "grad_norm": 1.1391830444335938, + "learning_rate": 9.994998062138355e-06, + "loss": 1.0727, + "step": 616 + }, + { + "epoch": 0.03395894105344268, + "grad_norm": 1.0172302722930908, + "learning_rate": 9.994978659244918e-06, + "loss": 0.7869, + "step": 617 + }, + { + "epoch": 0.034013979855798335, + "grad_norm": 1.0532630681991577, + "learning_rate": 9.994959218810558e-06, + "loss": 0.8626, + "step": 618 + }, + { + "epoch": 0.034069018658153996, + "grad_norm": 0.8300478458404541, + "learning_rate": 9.99493974083542e-06, + "loss": 0.8166, + "step": 619 + }, + { + "epoch": 0.03412405746050966, + "grad_norm": 1.0613664388656616, + "learning_rate": 9.994920225319656e-06, + "loss": 0.8899, + "step": 620 + }, + { + "epoch": 0.03417909626286532, + "grad_norm": 0.9827042818069458, + "learning_rate": 9.994900672263406e-06, + "loss": 0.8243, + "step": 621 + }, + { + "epoch": 0.03423413506522098, + "grad_norm": 0.8790082931518555, + "learning_rate": 9.994881081666818e-06, + "loss": 0.8153, + "step": 622 + }, + { + "epoch": 0.03428917386757664, + "grad_norm": 1.033378005027771, + "learning_rate": 9.994861453530044e-06, + "loss": 0.8916, + "step": 623 + }, + { + "epoch": 0.0343442126699323, + "grad_norm": 0.9547238349914551, + "learning_rate": 9.994841787853227e-06, + "loss": 0.9141, + "step": 624 + }, + { + "epoch": 0.034399251472287964, + "grad_norm": 0.9606438279151917, + "learning_rate": 9.994822084636514e-06, + "loss": 0.9435, + "step": 625 + }, + { + "epoch": 0.034454290274643626, + "grad_norm": 0.8461503982543945, + "learning_rate": 9.994802343880059e-06, + "loss": 0.7914, + "step": 626 + }, + { + "epoch": 0.03450932907699929, + "grad_norm": 1.144538402557373, + "learning_rate": 9.994782565584004e-06, + "loss": 0.8025, + "step": 627 + }, + { + "epoch": 0.03456436787935495, + "grad_norm": 1.0099962949752808, + "learning_rate": 9.994762749748502e-06, + "loss": 0.9607, + "step": 628 + }, + { + "epoch": 0.0346194066817106, + "grad_norm": 0.9822041988372803, + "learning_rate": 9.9947428963737e-06, + "loss": 0.9216, + "step": 629 + }, + { + "epoch": 0.034674445484066264, + "grad_norm": 0.9056866765022278, + "learning_rate": 9.994723005459746e-06, + "loss": 0.7913, + "step": 630 + }, + { + "epoch": 0.034729484286421926, + "grad_norm": 1.0099287033081055, + "learning_rate": 9.994703077006792e-06, + "loss": 0.9937, + "step": 631 + }, + { + "epoch": 0.03478452308877759, + "grad_norm": 0.9559167623519897, + "learning_rate": 9.994683111014984e-06, + "loss": 0.9774, + "step": 632 + }, + { + "epoch": 0.03483956189113325, + "grad_norm": 1.0359059572219849, + "learning_rate": 9.994663107484478e-06, + "loss": 0.9062, + "step": 633 + }, + { + "epoch": 0.03489460069348891, + "grad_norm": 0.8803057074546814, + "learning_rate": 9.99464306641542e-06, + "loss": 0.9638, + "step": 634 + }, + { + "epoch": 0.03494963949584457, + "grad_norm": 1.0926579236984253, + "learning_rate": 9.994622987807962e-06, + "loss": 1.0467, + "step": 635 + }, + { + "epoch": 0.03500467829820023, + "grad_norm": 1.0051401853561401, + "learning_rate": 9.994602871662253e-06, + "loss": 0.8717, + "step": 636 + }, + { + "epoch": 0.035059717100555894, + "grad_norm": 1.2007508277893066, + "learning_rate": 9.994582717978448e-06, + "loss": 0.8004, + "step": 637 + }, + { + "epoch": 0.035114755902911556, + "grad_norm": 0.8826266527175903, + "learning_rate": 9.994562526756695e-06, + "loss": 0.8888, + "step": 638 + }, + { + "epoch": 0.03516979470526721, + "grad_norm": 0.9953717589378357, + "learning_rate": 9.994542297997147e-06, + "loss": 0.8999, + "step": 639 + }, + { + "epoch": 0.03522483350762287, + "grad_norm": 1.0203614234924316, + "learning_rate": 9.994522031699958e-06, + "loss": 0.8241, + "step": 640 + }, + { + "epoch": 0.03527987230997853, + "grad_norm": 0.8760203719139099, + "learning_rate": 9.994501727865276e-06, + "loss": 0.7893, + "step": 641 + }, + { + "epoch": 0.035334911112334194, + "grad_norm": 1.024888277053833, + "learning_rate": 9.994481386493257e-06, + "loss": 0.9865, + "step": 642 + }, + { + "epoch": 0.035389949914689856, + "grad_norm": 0.907454788684845, + "learning_rate": 9.994461007584052e-06, + "loss": 0.891, + "step": 643 + }, + { + "epoch": 0.03544498871704552, + "grad_norm": 1.0400965213775635, + "learning_rate": 9.994440591137816e-06, + "loss": 0.9345, + "step": 644 + }, + { + "epoch": 0.03550002751940118, + "grad_norm": 0.9816616177558899, + "learning_rate": 9.9944201371547e-06, + "loss": 0.91, + "step": 645 + }, + { + "epoch": 0.03555506632175684, + "grad_norm": 1.0528117418289185, + "learning_rate": 9.99439964563486e-06, + "loss": 0.952, + "step": 646 + }, + { + "epoch": 0.0356101051241125, + "grad_norm": 0.9802080988883972, + "learning_rate": 9.99437911657845e-06, + "loss": 0.9392, + "step": 647 + }, + { + "epoch": 0.03566514392646816, + "grad_norm": 0.9580393433570862, + "learning_rate": 9.994358549985623e-06, + "loss": 0.874, + "step": 648 + }, + { + "epoch": 0.035720182728823824, + "grad_norm": 0.8935576677322388, + "learning_rate": 9.994337945856533e-06, + "loss": 0.8435, + "step": 649 + }, + { + "epoch": 0.03577522153117948, + "grad_norm": 1.009699821472168, + "learning_rate": 9.994317304191337e-06, + "loss": 0.9436, + "step": 650 + }, + { + "epoch": 0.03583026033353514, + "grad_norm": 0.9126121401786804, + "learning_rate": 9.994296624990188e-06, + "loss": 0.8424, + "step": 651 + }, + { + "epoch": 0.0358852991358908, + "grad_norm": 0.9555553197860718, + "learning_rate": 9.994275908253243e-06, + "loss": 0.93, + "step": 652 + }, + { + "epoch": 0.03594033793824646, + "grad_norm": 0.8359857797622681, + "learning_rate": 9.994255153980658e-06, + "loss": 0.6326, + "step": 653 + }, + { + "epoch": 0.035995376740602124, + "grad_norm": 0.8918783664703369, + "learning_rate": 9.994234362172587e-06, + "loss": 0.8287, + "step": 654 + }, + { + "epoch": 0.036050415542957785, + "grad_norm": 0.9878549575805664, + "learning_rate": 9.994213532829188e-06, + "loss": 0.8841, + "step": 655 + }, + { + "epoch": 0.03610545434531345, + "grad_norm": 0.9504040479660034, + "learning_rate": 9.994192665950617e-06, + "loss": 1.0182, + "step": 656 + }, + { + "epoch": 0.03616049314766911, + "grad_norm": 0.9531422257423401, + "learning_rate": 9.99417176153703e-06, + "loss": 0.8504, + "step": 657 + }, + { + "epoch": 0.03621553195002477, + "grad_norm": 0.9580292105674744, + "learning_rate": 9.994150819588587e-06, + "loss": 0.8048, + "step": 658 + }, + { + "epoch": 0.03627057075238043, + "grad_norm": 0.9786819815635681, + "learning_rate": 9.99412984010544e-06, + "loss": 0.9124, + "step": 659 + }, + { + "epoch": 0.03632560955473609, + "grad_norm": 0.9733422994613647, + "learning_rate": 9.994108823087751e-06, + "loss": 0.8868, + "step": 660 + }, + { + "epoch": 0.03638064835709175, + "grad_norm": 1.093173623085022, + "learning_rate": 9.994087768535679e-06, + "loss": 0.9428, + "step": 661 + }, + { + "epoch": 0.03643568715944741, + "grad_norm": 0.9067148566246033, + "learning_rate": 9.994066676449378e-06, + "loss": 0.8838, + "step": 662 + }, + { + "epoch": 0.03649072596180307, + "grad_norm": 0.9509521722793579, + "learning_rate": 9.99404554682901e-06, + "loss": 0.9034, + "step": 663 + }, + { + "epoch": 0.03654576476415873, + "grad_norm": 0.9523824453353882, + "learning_rate": 9.994024379674731e-06, + "loss": 0.9623, + "step": 664 + }, + { + "epoch": 0.03660080356651439, + "grad_norm": 0.987276554107666, + "learning_rate": 9.994003174986703e-06, + "loss": 0.8817, + "step": 665 + }, + { + "epoch": 0.036655842368870054, + "grad_norm": 0.9500744342803955, + "learning_rate": 9.993981932765083e-06, + "loss": 0.9742, + "step": 666 + }, + { + "epoch": 0.036710881171225715, + "grad_norm": 0.9420705437660217, + "learning_rate": 9.993960653010034e-06, + "loss": 0.9657, + "step": 667 + }, + { + "epoch": 0.036765919973581376, + "grad_norm": 0.9443248510360718, + "learning_rate": 9.99393933572171e-06, + "loss": 0.8468, + "step": 668 + }, + { + "epoch": 0.03682095877593704, + "grad_norm": 0.9666558504104614, + "learning_rate": 9.993917980900276e-06, + "loss": 0.9871, + "step": 669 + }, + { + "epoch": 0.0368759975782927, + "grad_norm": 1.0236201286315918, + "learning_rate": 9.993896588545892e-06, + "loss": 0.9814, + "step": 670 + }, + { + "epoch": 0.03693103638064836, + "grad_norm": 1.016190528869629, + "learning_rate": 9.993875158658716e-06, + "loss": 1.0156, + "step": 671 + }, + { + "epoch": 0.036986075183004015, + "grad_norm": 0.9296661019325256, + "learning_rate": 9.993853691238913e-06, + "loss": 0.7956, + "step": 672 + }, + { + "epoch": 0.037041113985359676, + "grad_norm": 0.9276684522628784, + "learning_rate": 9.993832186286643e-06, + "loss": 0.9253, + "step": 673 + }, + { + "epoch": 0.03709615278771534, + "grad_norm": 0.8588787913322449, + "learning_rate": 9.993810643802065e-06, + "loss": 0.7878, + "step": 674 + }, + { + "epoch": 0.037151191590071, + "grad_norm": 0.9955212473869324, + "learning_rate": 9.993789063785344e-06, + "loss": 0.8711, + "step": 675 + }, + { + "epoch": 0.03720623039242666, + "grad_norm": 0.925578236579895, + "learning_rate": 9.993767446236642e-06, + "loss": 0.9431, + "step": 676 + }, + { + "epoch": 0.03726126919478232, + "grad_norm": 0.9610552787780762, + "learning_rate": 9.99374579115612e-06, + "loss": 0.887, + "step": 677 + }, + { + "epoch": 0.03731630799713798, + "grad_norm": 1.0052428245544434, + "learning_rate": 9.99372409854394e-06, + "loss": 0.8751, + "step": 678 + }, + { + "epoch": 0.037371346799493645, + "grad_norm": 0.9503066539764404, + "learning_rate": 9.99370236840027e-06, + "loss": 0.8556, + "step": 679 + }, + { + "epoch": 0.037426385601849306, + "grad_norm": 2.426232099533081, + "learning_rate": 9.993680600725266e-06, + "loss": 0.9077, + "step": 680 + }, + { + "epoch": 0.03748142440420497, + "grad_norm": 0.9119723439216614, + "learning_rate": 9.993658795519096e-06, + "loss": 0.8575, + "step": 681 + }, + { + "epoch": 0.03753646320656062, + "grad_norm": 0.9688286781311035, + "learning_rate": 9.993636952781923e-06, + "loss": 0.8921, + "step": 682 + }, + { + "epoch": 0.03759150200891628, + "grad_norm": 1.030013084411621, + "learning_rate": 9.993615072513913e-06, + "loss": 0.8622, + "step": 683 + }, + { + "epoch": 0.037646540811271945, + "grad_norm": 1.055187463760376, + "learning_rate": 9.993593154715228e-06, + "loss": 0.9251, + "step": 684 + }, + { + "epoch": 0.037701579613627606, + "grad_norm": 1.0518591403961182, + "learning_rate": 9.993571199386032e-06, + "loss": 0.9575, + "step": 685 + }, + { + "epoch": 0.03775661841598327, + "grad_norm": 0.9232666492462158, + "learning_rate": 9.993549206526495e-06, + "loss": 0.8522, + "step": 686 + }, + { + "epoch": 0.03781165721833893, + "grad_norm": 1.0212332010269165, + "learning_rate": 9.993527176136775e-06, + "loss": 0.9358, + "step": 687 + }, + { + "epoch": 0.03786669602069459, + "grad_norm": 0.9137141108512878, + "learning_rate": 9.993505108217045e-06, + "loss": 0.8561, + "step": 688 + }, + { + "epoch": 0.03792173482305025, + "grad_norm": 1.0069375038146973, + "learning_rate": 9.993483002767465e-06, + "loss": 0.8274, + "step": 689 + }, + { + "epoch": 0.03797677362540591, + "grad_norm": 0.9820672869682312, + "learning_rate": 9.993460859788204e-06, + "loss": 0.907, + "step": 690 + }, + { + "epoch": 0.038031812427761574, + "grad_norm": 1.0042002201080322, + "learning_rate": 9.993438679279428e-06, + "loss": 0.9263, + "step": 691 + }, + { + "epoch": 0.038086851230117236, + "grad_norm": 0.9733695983886719, + "learning_rate": 9.993416461241304e-06, + "loss": 0.8455, + "step": 692 + }, + { + "epoch": 0.03814189003247289, + "grad_norm": 0.9106015563011169, + "learning_rate": 9.993394205673996e-06, + "loss": 0.8469, + "step": 693 + }, + { + "epoch": 0.03819692883482855, + "grad_norm": 0.9802660346031189, + "learning_rate": 9.993371912577677e-06, + "loss": 0.8662, + "step": 694 + }, + { + "epoch": 0.03825196763718421, + "grad_norm": 0.9183964729309082, + "learning_rate": 9.99334958195251e-06, + "loss": 0.8968, + "step": 695 + }, + { + "epoch": 0.038307006439539874, + "grad_norm": 0.9572185277938843, + "learning_rate": 9.993327213798663e-06, + "loss": 0.953, + "step": 696 + }, + { + "epoch": 0.038362045241895536, + "grad_norm": 1.4480071067810059, + "learning_rate": 9.993304808116307e-06, + "loss": 1.1131, + "step": 697 + }, + { + "epoch": 0.0384170840442512, + "grad_norm": 0.9297361969947815, + "learning_rate": 9.993282364905607e-06, + "loss": 0.884, + "step": 698 + }, + { + "epoch": 0.03847212284660686, + "grad_norm": 0.9400073885917664, + "learning_rate": 9.993259884166735e-06, + "loss": 0.932, + "step": 699 + }, + { + "epoch": 0.03852716164896252, + "grad_norm": 0.9231798052787781, + "learning_rate": 9.993237365899858e-06, + "loss": 0.8981, + "step": 700 + }, + { + "epoch": 0.03858220045131818, + "grad_norm": 0.8233712911605835, + "learning_rate": 9.993214810105144e-06, + "loss": 0.8218, + "step": 701 + }, + { + "epoch": 0.03863723925367384, + "grad_norm": 1.0997854471206665, + "learning_rate": 9.993192216782768e-06, + "loss": 0.9298, + "step": 702 + }, + { + "epoch": 0.038692278056029504, + "grad_norm": 0.9570802450180054, + "learning_rate": 9.993169585932893e-06, + "loss": 0.7815, + "step": 703 + }, + { + "epoch": 0.03874731685838516, + "grad_norm": 0.9913730025291443, + "learning_rate": 9.993146917555692e-06, + "loss": 0.9621, + "step": 704 + }, + { + "epoch": 0.03880235566074082, + "grad_norm": 1.088767409324646, + "learning_rate": 9.993124211651334e-06, + "loss": 0.9295, + "step": 705 + }, + { + "epoch": 0.03885739446309648, + "grad_norm": 0.8199124336242676, + "learning_rate": 9.993101468219995e-06, + "loss": 0.7613, + "step": 706 + }, + { + "epoch": 0.03891243326545214, + "grad_norm": 1.112566351890564, + "learning_rate": 9.99307868726184e-06, + "loss": 0.791, + "step": 707 + }, + { + "epoch": 0.038967472067807804, + "grad_norm": 0.9372578859329224, + "learning_rate": 9.99305586877704e-06, + "loss": 0.8567, + "step": 708 + }, + { + "epoch": 0.039022510870163465, + "grad_norm": 1.0167721509933472, + "learning_rate": 9.99303301276577e-06, + "loss": 0.9787, + "step": 709 + }, + { + "epoch": 0.03907754967251913, + "grad_norm": 1.3526856899261475, + "learning_rate": 9.993010119228202e-06, + "loss": 1.2215, + "step": 710 + }, + { + "epoch": 0.03913258847487479, + "grad_norm": 0.8819016814231873, + "learning_rate": 9.992987188164505e-06, + "loss": 0.7736, + "step": 711 + }, + { + "epoch": 0.03918762727723045, + "grad_norm": 1.0033677816390991, + "learning_rate": 9.992964219574852e-06, + "loss": 0.9919, + "step": 712 + }, + { + "epoch": 0.03924266607958611, + "grad_norm": 0.894926130771637, + "learning_rate": 9.992941213459417e-06, + "loss": 0.9058, + "step": 713 + }, + { + "epoch": 0.03929770488194177, + "grad_norm": 0.9481377005577087, + "learning_rate": 9.992918169818373e-06, + "loss": 0.8436, + "step": 714 + }, + { + "epoch": 0.03935274368429743, + "grad_norm": 0.9312933087348938, + "learning_rate": 9.992895088651893e-06, + "loss": 0.8869, + "step": 715 + }, + { + "epoch": 0.03940778248665309, + "grad_norm": 0.9765705466270447, + "learning_rate": 9.99287196996015e-06, + "loss": 0.9512, + "step": 716 + }, + { + "epoch": 0.03946282128900875, + "grad_norm": 0.9610235691070557, + "learning_rate": 9.992848813743317e-06, + "loss": 0.8005, + "step": 717 + }, + { + "epoch": 0.03951786009136441, + "grad_norm": 1.102995753288269, + "learning_rate": 9.99282562000157e-06, + "loss": 0.8017, + "step": 718 + }, + { + "epoch": 0.03957289889372007, + "grad_norm": 1.023317575454712, + "learning_rate": 9.99280238873508e-06, + "loss": 0.911, + "step": 719 + }, + { + "epoch": 0.039627937696075734, + "grad_norm": 1.0531049966812134, + "learning_rate": 9.992779119944025e-06, + "loss": 0.8562, + "step": 720 + }, + { + "epoch": 0.039682976498431395, + "grad_norm": 0.918250322341919, + "learning_rate": 9.992755813628579e-06, + "loss": 0.92, + "step": 721 + }, + { + "epoch": 0.039738015300787057, + "grad_norm": 0.8508251309394836, + "learning_rate": 9.992732469788915e-06, + "loss": 0.7347, + "step": 722 + }, + { + "epoch": 0.03979305410314272, + "grad_norm": 0.9184926152229309, + "learning_rate": 9.992709088425211e-06, + "loss": 0.8732, + "step": 723 + }, + { + "epoch": 0.03984809290549838, + "grad_norm": 1.1613929271697998, + "learning_rate": 9.992685669537643e-06, + "loss": 0.9522, + "step": 724 + }, + { + "epoch": 0.039903131707854034, + "grad_norm": 1.091513752937317, + "learning_rate": 9.992662213126386e-06, + "loss": 0.9646, + "step": 725 + }, + { + "epoch": 0.039958170510209695, + "grad_norm": 1.057803750038147, + "learning_rate": 9.992638719191615e-06, + "loss": 0.7032, + "step": 726 + }, + { + "epoch": 0.040013209312565357, + "grad_norm": 0.8771823644638062, + "learning_rate": 9.992615187733508e-06, + "loss": 0.8577, + "step": 727 + }, + { + "epoch": 0.04006824811492102, + "grad_norm": 0.9471028447151184, + "learning_rate": 9.992591618752244e-06, + "loss": 0.9057, + "step": 728 + }, + { + "epoch": 0.04012328691727668, + "grad_norm": 0.9547705054283142, + "learning_rate": 9.992568012247995e-06, + "loss": 0.9549, + "step": 729 + }, + { + "epoch": 0.04017832571963234, + "grad_norm": 0.8862974047660828, + "learning_rate": 9.992544368220941e-06, + "loss": 0.8593, + "step": 730 + }, + { + "epoch": 0.040233364521988, + "grad_norm": 0.906334400177002, + "learning_rate": 9.992520686671261e-06, + "loss": 0.8832, + "step": 731 + }, + { + "epoch": 0.04028840332434366, + "grad_norm": 1.07270085811615, + "learning_rate": 9.992496967599133e-06, + "loss": 0.9409, + "step": 732 + }, + { + "epoch": 0.040343442126699325, + "grad_norm": 0.9026005268096924, + "learning_rate": 9.992473211004734e-06, + "loss": 0.8326, + "step": 733 + }, + { + "epoch": 0.040398480929054986, + "grad_norm": 0.9762942790985107, + "learning_rate": 9.992449416888241e-06, + "loss": 0.9048, + "step": 734 + }, + { + "epoch": 0.04045351973141065, + "grad_norm": 0.9658033847808838, + "learning_rate": 9.992425585249837e-06, + "loss": 0.9219, + "step": 735 + }, + { + "epoch": 0.0405085585337663, + "grad_norm": 0.8909044861793518, + "learning_rate": 9.992401716089698e-06, + "loss": 0.8564, + "step": 736 + }, + { + "epoch": 0.04056359733612196, + "grad_norm": 1.0387929677963257, + "learning_rate": 9.992377809408001e-06, + "loss": 0.9533, + "step": 737 + }, + { + "epoch": 0.040618636138477625, + "grad_norm": 0.9044275879859924, + "learning_rate": 9.99235386520493e-06, + "loss": 0.8508, + "step": 738 + }, + { + "epoch": 0.040673674940833286, + "grad_norm": 1.019377589225769, + "learning_rate": 9.992329883480667e-06, + "loss": 0.8684, + "step": 739 + }, + { + "epoch": 0.04072871374318895, + "grad_norm": 0.9394627213478088, + "learning_rate": 9.992305864235385e-06, + "loss": 0.7665, + "step": 740 + }, + { + "epoch": 0.04078375254554461, + "grad_norm": 0.8652323484420776, + "learning_rate": 9.99228180746927e-06, + "loss": 0.8576, + "step": 741 + }, + { + "epoch": 0.04083879134790027, + "grad_norm": 0.9347619414329529, + "learning_rate": 9.992257713182502e-06, + "loss": 0.9586, + "step": 742 + }, + { + "epoch": 0.04089383015025593, + "grad_norm": 0.9510203003883362, + "learning_rate": 9.99223358137526e-06, + "loss": 0.9092, + "step": 743 + }, + { + "epoch": 0.04094886895261159, + "grad_norm": 0.8242866396903992, + "learning_rate": 9.992209412047729e-06, + "loss": 0.6997, + "step": 744 + }, + { + "epoch": 0.041003907754967255, + "grad_norm": 0.8842730522155762, + "learning_rate": 9.992185205200087e-06, + "loss": 0.8873, + "step": 745 + }, + { + "epoch": 0.041058946557322916, + "grad_norm": 1.0813730955123901, + "learning_rate": 9.992160960832518e-06, + "loss": 1.0162, + "step": 746 + }, + { + "epoch": 0.04111398535967857, + "grad_norm": 1.1276283264160156, + "learning_rate": 9.9921366789452e-06, + "loss": 1.0004, + "step": 747 + }, + { + "epoch": 0.04116902416203423, + "grad_norm": 0.8810326457023621, + "learning_rate": 9.992112359538323e-06, + "loss": 0.7823, + "step": 748 + }, + { + "epoch": 0.04122406296438989, + "grad_norm": 0.9939407110214233, + "learning_rate": 9.992088002612066e-06, + "loss": 1.0016, + "step": 749 + }, + { + "epoch": 0.041279101766745555, + "grad_norm": 1.0963523387908936, + "learning_rate": 9.99206360816661e-06, + "loss": 0.9252, + "step": 750 + }, + { + "epoch": 0.041334140569101216, + "grad_norm": 1.1346478462219238, + "learning_rate": 9.99203917620214e-06, + "loss": 0.9608, + "step": 751 + }, + { + "epoch": 0.04138917937145688, + "grad_norm": 1.0108580589294434, + "learning_rate": 9.992014706718841e-06, + "loss": 0.9179, + "step": 752 + }, + { + "epoch": 0.04144421817381254, + "grad_norm": 0.897293210029602, + "learning_rate": 9.991990199716894e-06, + "loss": 0.9295, + "step": 753 + }, + { + "epoch": 0.0414992569761682, + "grad_norm": 1.0152363777160645, + "learning_rate": 9.991965655196488e-06, + "loss": 0.8467, + "step": 754 + }, + { + "epoch": 0.04155429577852386, + "grad_norm": 0.8655388355255127, + "learning_rate": 9.9919410731578e-06, + "loss": 0.796, + "step": 755 + }, + { + "epoch": 0.04160933458087952, + "grad_norm": 1.0140331983566284, + "learning_rate": 9.991916453601023e-06, + "loss": 0.8444, + "step": 756 + }, + { + "epoch": 0.041664373383235184, + "grad_norm": 0.9387341141700745, + "learning_rate": 9.991891796526338e-06, + "loss": 0.8669, + "step": 757 + }, + { + "epoch": 0.04171941218559084, + "grad_norm": 0.9395696520805359, + "learning_rate": 9.991867101933928e-06, + "loss": 0.8376, + "step": 758 + }, + { + "epoch": 0.0417744509879465, + "grad_norm": 1.0856634378433228, + "learning_rate": 9.991842369823983e-06, + "loss": 0.9271, + "step": 759 + }, + { + "epoch": 0.04182948979030216, + "grad_norm": 0.8777190446853638, + "learning_rate": 9.991817600196687e-06, + "loss": 0.9197, + "step": 760 + }, + { + "epoch": 0.04188452859265782, + "grad_norm": 0.9639917016029358, + "learning_rate": 9.991792793052225e-06, + "loss": 0.8835, + "step": 761 + }, + { + "epoch": 0.041939567395013484, + "grad_norm": 0.9384773969650269, + "learning_rate": 9.991767948390785e-06, + "loss": 0.8403, + "step": 762 + }, + { + "epoch": 0.041994606197369146, + "grad_norm": 0.8987650275230408, + "learning_rate": 9.991743066212554e-06, + "loss": 0.7948, + "step": 763 + }, + { + "epoch": 0.04204964499972481, + "grad_norm": 1.0545049905776978, + "learning_rate": 9.991718146517717e-06, + "loss": 0.9359, + "step": 764 + }, + { + "epoch": 0.04210468380208047, + "grad_norm": 0.9840022325515747, + "learning_rate": 9.991693189306463e-06, + "loss": 0.9188, + "step": 765 + }, + { + "epoch": 0.04215972260443613, + "grad_norm": 0.8769927620887756, + "learning_rate": 9.991668194578981e-06, + "loss": 0.8647, + "step": 766 + }, + { + "epoch": 0.04221476140679179, + "grad_norm": 0.9268791675567627, + "learning_rate": 9.991643162335455e-06, + "loss": 0.897, + "step": 767 + }, + { + "epoch": 0.042269800209147446, + "grad_norm": 0.9316747784614563, + "learning_rate": 9.991618092576075e-06, + "loss": 0.9341, + "step": 768 + }, + { + "epoch": 0.04232483901150311, + "grad_norm": 0.8348364233970642, + "learning_rate": 9.991592985301031e-06, + "loss": 0.7528, + "step": 769 + }, + { + "epoch": 0.04237987781385877, + "grad_norm": 0.9139068126678467, + "learning_rate": 9.99156784051051e-06, + "loss": 0.8596, + "step": 770 + }, + { + "epoch": 0.04243491661621443, + "grad_norm": 0.9403928518295288, + "learning_rate": 9.991542658204701e-06, + "loss": 0.974, + "step": 771 + }, + { + "epoch": 0.04248995541857009, + "grad_norm": 0.993549108505249, + "learning_rate": 9.991517438383793e-06, + "loss": 0.9479, + "step": 772 + }, + { + "epoch": 0.04254499422092575, + "grad_norm": 0.8494916558265686, + "learning_rate": 9.991492181047975e-06, + "loss": 0.9149, + "step": 773 + }, + { + "epoch": 0.042600033023281414, + "grad_norm": 1.0351910591125488, + "learning_rate": 9.991466886197441e-06, + "loss": 0.9552, + "step": 774 + }, + { + "epoch": 0.042655071825637075, + "grad_norm": 0.916829526424408, + "learning_rate": 9.991441553832375e-06, + "loss": 0.8781, + "step": 775 + }, + { + "epoch": 0.04271011062799274, + "grad_norm": 1.113476276397705, + "learning_rate": 9.991416183952972e-06, + "loss": 0.8137, + "step": 776 + }, + { + "epoch": 0.0427651494303484, + "grad_norm": 1.1608171463012695, + "learning_rate": 9.991390776559421e-06, + "loss": 1.0045, + "step": 777 + }, + { + "epoch": 0.04282018823270406, + "grad_norm": 1.0045493841171265, + "learning_rate": 9.991365331651913e-06, + "loss": 0.8813, + "step": 778 + }, + { + "epoch": 0.042875227035059714, + "grad_norm": 0.918820858001709, + "learning_rate": 9.991339849230639e-06, + "loss": 0.9198, + "step": 779 + }, + { + "epoch": 0.042930265837415375, + "grad_norm": 0.9875735640525818, + "learning_rate": 9.991314329295792e-06, + "loss": 0.8665, + "step": 780 + }, + { + "epoch": 0.04298530463977104, + "grad_norm": 0.873768150806427, + "learning_rate": 9.991288771847561e-06, + "loss": 0.8606, + "step": 781 + }, + { + "epoch": 0.0430403434421267, + "grad_norm": 0.8892746567726135, + "learning_rate": 9.991263176886139e-06, + "loss": 0.9011, + "step": 782 + }, + { + "epoch": 0.04309538224448236, + "grad_norm": 1.097734808921814, + "learning_rate": 9.99123754441172e-06, + "loss": 1.009, + "step": 783 + }, + { + "epoch": 0.04315042104683802, + "grad_norm": 1.0065964460372925, + "learning_rate": 9.991211874424497e-06, + "loss": 0.9492, + "step": 784 + }, + { + "epoch": 0.04320545984919368, + "grad_norm": 1.0791678428649902, + "learning_rate": 9.99118616692466e-06, + "loss": 1.0142, + "step": 785 + }, + { + "epoch": 0.043260498651549344, + "grad_norm": 0.9454777836799622, + "learning_rate": 9.991160421912404e-06, + "loss": 0.8058, + "step": 786 + }, + { + "epoch": 0.043315537453905005, + "grad_norm": 0.9448156952857971, + "learning_rate": 9.991134639387922e-06, + "loss": 0.8184, + "step": 787 + }, + { + "epoch": 0.043370576256260666, + "grad_norm": 0.9636550545692444, + "learning_rate": 9.99110881935141e-06, + "loss": 0.8606, + "step": 788 + }, + { + "epoch": 0.04342561505861633, + "grad_norm": 0.9933613538742065, + "learning_rate": 9.991082961803058e-06, + "loss": 0.9449, + "step": 789 + }, + { + "epoch": 0.04348065386097198, + "grad_norm": 0.8906797170639038, + "learning_rate": 9.991057066743065e-06, + "loss": 0.8053, + "step": 790 + }, + { + "epoch": 0.043535692663327644, + "grad_norm": 1.0393906831741333, + "learning_rate": 9.991031134171621e-06, + "loss": 0.8487, + "step": 791 + }, + { + "epoch": 0.043590731465683305, + "grad_norm": 1.0618231296539307, + "learning_rate": 9.991005164088923e-06, + "loss": 0.9847, + "step": 792 + }, + { + "epoch": 0.043645770268038966, + "grad_norm": 0.9525149464607239, + "learning_rate": 9.990979156495167e-06, + "loss": 0.9318, + "step": 793 + }, + { + "epoch": 0.04370080907039463, + "grad_norm": 0.9430851936340332, + "learning_rate": 9.990953111390546e-06, + "loss": 0.8483, + "step": 794 + }, + { + "epoch": 0.04375584787275029, + "grad_norm": 0.9259672164916992, + "learning_rate": 9.99092702877526e-06, + "loss": 0.9365, + "step": 795 + }, + { + "epoch": 0.04381088667510595, + "grad_norm": 0.942609965801239, + "learning_rate": 9.9909009086495e-06, + "loss": 0.8408, + "step": 796 + }, + { + "epoch": 0.04386592547746161, + "grad_norm": 0.939255952835083, + "learning_rate": 9.990874751013467e-06, + "loss": 0.8749, + "step": 797 + }, + { + "epoch": 0.04392096427981727, + "grad_norm": 1.1701711416244507, + "learning_rate": 9.990848555867353e-06, + "loss": 0.9312, + "step": 798 + }, + { + "epoch": 0.043976003082172935, + "grad_norm": 1.0441124439239502, + "learning_rate": 9.990822323211358e-06, + "loss": 0.8618, + "step": 799 + }, + { + "epoch": 0.04403104188452859, + "grad_norm": 0.9601489305496216, + "learning_rate": 9.990796053045679e-06, + "loss": 0.9569, + "step": 800 + }, + { + "epoch": 0.04408608068688425, + "grad_norm": 0.9394032955169678, + "learning_rate": 9.990769745370513e-06, + "loss": 0.846, + "step": 801 + }, + { + "epoch": 0.04414111948923991, + "grad_norm": 0.9631348252296448, + "learning_rate": 9.990743400186056e-06, + "loss": 0.8754, + "step": 802 + }, + { + "epoch": 0.04419615829159557, + "grad_norm": 0.9234963059425354, + "learning_rate": 9.990717017492508e-06, + "loss": 0.8613, + "step": 803 + }, + { + "epoch": 0.044251197093951235, + "grad_norm": 0.9169090390205383, + "learning_rate": 9.990690597290069e-06, + "loss": 0.8867, + "step": 804 + }, + { + "epoch": 0.044306235896306896, + "grad_norm": 1.0194867849349976, + "learning_rate": 9.990664139578933e-06, + "loss": 0.8675, + "step": 805 + }, + { + "epoch": 0.04436127469866256, + "grad_norm": 1.3226114511489868, + "learning_rate": 9.990637644359302e-06, + "loss": 0.997, + "step": 806 + }, + { + "epoch": 0.04441631350101822, + "grad_norm": 0.8904317617416382, + "learning_rate": 9.990611111631374e-06, + "loss": 0.7274, + "step": 807 + }, + { + "epoch": 0.04447135230337388, + "grad_norm": 0.8909007906913757, + "learning_rate": 9.99058454139535e-06, + "loss": 0.8141, + "step": 808 + }, + { + "epoch": 0.04452639110572954, + "grad_norm": 1.004015564918518, + "learning_rate": 9.990557933651429e-06, + "loss": 0.9883, + "step": 809 + }, + { + "epoch": 0.0445814299080852, + "grad_norm": 1.1215732097625732, + "learning_rate": 9.990531288399807e-06, + "loss": 0.9355, + "step": 810 + }, + { + "epoch": 0.04463646871044086, + "grad_norm": 1.0545012950897217, + "learning_rate": 9.99050460564069e-06, + "loss": 0.9532, + "step": 811 + }, + { + "epoch": 0.04469150751279652, + "grad_norm": 0.9608867168426514, + "learning_rate": 9.990477885374277e-06, + "loss": 0.9363, + "step": 812 + }, + { + "epoch": 0.04474654631515218, + "grad_norm": 0.8750461935997009, + "learning_rate": 9.990451127600766e-06, + "loss": 0.7343, + "step": 813 + }, + { + "epoch": 0.04480158511750784, + "grad_norm": 0.891740620136261, + "learning_rate": 9.99042433232036e-06, + "loss": 0.8541, + "step": 814 + }, + { + "epoch": 0.0448566239198635, + "grad_norm": 1.1520029306411743, + "learning_rate": 9.990397499533264e-06, + "loss": 0.7696, + "step": 815 + }, + { + "epoch": 0.044911662722219164, + "grad_norm": 0.9526278972625732, + "learning_rate": 9.990370629239673e-06, + "loss": 0.8953, + "step": 816 + }, + { + "epoch": 0.044966701524574826, + "grad_norm": 0.9218434691429138, + "learning_rate": 9.990343721439795e-06, + "loss": 0.8198, + "step": 817 + }, + { + "epoch": 0.04502174032693049, + "grad_norm": 0.8502745628356934, + "learning_rate": 9.990316776133827e-06, + "loss": 0.8035, + "step": 818 + }, + { + "epoch": 0.04507677912928615, + "grad_norm": 0.8861565589904785, + "learning_rate": 9.990289793321975e-06, + "loss": 0.8626, + "step": 819 + }, + { + "epoch": 0.04513181793164181, + "grad_norm": 1.1113256216049194, + "learning_rate": 9.99026277300444e-06, + "loss": 0.9363, + "step": 820 + }, + { + "epoch": 0.04518685673399747, + "grad_norm": 0.9984708428382874, + "learning_rate": 9.990235715181426e-06, + "loss": 1.0376, + "step": 821 + }, + { + "epoch": 0.045241895536353126, + "grad_norm": 0.9026711583137512, + "learning_rate": 9.990208619853137e-06, + "loss": 0.9079, + "step": 822 + }, + { + "epoch": 0.04529693433870879, + "grad_norm": 0.8724965453147888, + "learning_rate": 9.990181487019775e-06, + "loss": 0.8665, + "step": 823 + }, + { + "epoch": 0.04535197314106445, + "grad_norm": 0.8923047780990601, + "learning_rate": 9.990154316681543e-06, + "loss": 0.7779, + "step": 824 + }, + { + "epoch": 0.04540701194342011, + "grad_norm": 0.9024640321731567, + "learning_rate": 9.99012710883865e-06, + "loss": 0.8859, + "step": 825 + }, + { + "epoch": 0.04546205074577577, + "grad_norm": 0.9245888590812683, + "learning_rate": 9.990099863491296e-06, + "loss": 0.8501, + "step": 826 + }, + { + "epoch": 0.04551708954813143, + "grad_norm": 0.9257050156593323, + "learning_rate": 9.990072580639687e-06, + "loss": 0.9561, + "step": 827 + }, + { + "epoch": 0.045572128350487094, + "grad_norm": 0.995610773563385, + "learning_rate": 9.99004526028403e-06, + "loss": 0.917, + "step": 828 + }, + { + "epoch": 0.045627167152842756, + "grad_norm": 0.9524009823799133, + "learning_rate": 9.990017902424525e-06, + "loss": 0.9184, + "step": 829 + }, + { + "epoch": 0.04568220595519842, + "grad_norm": 0.9264503121376038, + "learning_rate": 9.989990507061385e-06, + "loss": 0.8615, + "step": 830 + }, + { + "epoch": 0.04573724475755408, + "grad_norm": 1.0068570375442505, + "learning_rate": 9.989963074194809e-06, + "loss": 0.8331, + "step": 831 + }, + { + "epoch": 0.04579228355990974, + "grad_norm": 0.9295952320098877, + "learning_rate": 9.989935603825009e-06, + "loss": 0.8387, + "step": 832 + }, + { + "epoch": 0.045847322362265394, + "grad_norm": 1.0408827066421509, + "learning_rate": 9.989908095952186e-06, + "loss": 0.9686, + "step": 833 + }, + { + "epoch": 0.045902361164621056, + "grad_norm": 0.8874136209487915, + "learning_rate": 9.989880550576551e-06, + "loss": 0.815, + "step": 834 + }, + { + "epoch": 0.04595739996697672, + "grad_norm": 0.9898836016654968, + "learning_rate": 9.989852967698311e-06, + "loss": 0.9458, + "step": 835 + }, + { + "epoch": 0.04601243876933238, + "grad_norm": 0.9828970432281494, + "learning_rate": 9.989825347317668e-06, + "loss": 0.7922, + "step": 836 + }, + { + "epoch": 0.04606747757168804, + "grad_norm": 1.025447964668274, + "learning_rate": 9.989797689434836e-06, + "loss": 0.9349, + "step": 837 + }, + { + "epoch": 0.0461225163740437, + "grad_norm": 0.8623831272125244, + "learning_rate": 9.98976999405002e-06, + "loss": 0.8786, + "step": 838 + }, + { + "epoch": 0.04617755517639936, + "grad_norm": 0.9614997506141663, + "learning_rate": 9.98974226116343e-06, + "loss": 0.7885, + "step": 839 + }, + { + "epoch": 0.046232593978755024, + "grad_norm": 1.0207616090774536, + "learning_rate": 9.989714490775269e-06, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.046287632781110685, + "grad_norm": 0.8509595990180969, + "learning_rate": 9.98968668288575e-06, + "loss": 0.7312, + "step": 841 + }, + { + "epoch": 0.04634267158346635, + "grad_norm": 0.9822607040405273, + "learning_rate": 9.989658837495084e-06, + "loss": 0.952, + "step": 842 + }, + { + "epoch": 0.046397710385822, + "grad_norm": 1.0058252811431885, + "learning_rate": 9.989630954603477e-06, + "loss": 0.8811, + "step": 843 + }, + { + "epoch": 0.04645274918817766, + "grad_norm": 1.0146985054016113, + "learning_rate": 9.989603034211139e-06, + "loss": 0.9051, + "step": 844 + }, + { + "epoch": 0.046507787990533324, + "grad_norm": 0.8976503610610962, + "learning_rate": 9.98957507631828e-06, + "loss": 0.879, + "step": 845 + }, + { + "epoch": 0.046562826792888985, + "grad_norm": 0.8791939616203308, + "learning_rate": 9.989547080925111e-06, + "loss": 0.8944, + "step": 846 + }, + { + "epoch": 0.04661786559524465, + "grad_norm": 0.8530884981155396, + "learning_rate": 9.989519048031842e-06, + "loss": 0.9029, + "step": 847 + }, + { + "epoch": 0.04667290439760031, + "grad_norm": 0.9621617197990417, + "learning_rate": 9.989490977638683e-06, + "loss": 0.8374, + "step": 848 + }, + { + "epoch": 0.04672794319995597, + "grad_norm": 0.9629075527191162, + "learning_rate": 9.989462869745845e-06, + "loss": 0.9032, + "step": 849 + }, + { + "epoch": 0.04678298200231163, + "grad_norm": 1.3256126642227173, + "learning_rate": 9.989434724353541e-06, + "loss": 0.9748, + "step": 850 + }, + { + "epoch": 0.04683802080466729, + "grad_norm": 1.0230494737625122, + "learning_rate": 9.989406541461979e-06, + "loss": 0.9752, + "step": 851 + }, + { + "epoch": 0.046893059607022954, + "grad_norm": 0.8454533219337463, + "learning_rate": 9.989378321071375e-06, + "loss": 0.8426, + "step": 852 + }, + { + "epoch": 0.046948098409378615, + "grad_norm": 0.9995863437652588, + "learning_rate": 9.989350063181939e-06, + "loss": 0.9955, + "step": 853 + }, + { + "epoch": 0.04700313721173427, + "grad_norm": 0.8956604599952698, + "learning_rate": 9.989321767793883e-06, + "loss": 0.9024, + "step": 854 + }, + { + "epoch": 0.04705817601408993, + "grad_norm": 1.0123292207717896, + "learning_rate": 9.989293434907419e-06, + "loss": 0.7856, + "step": 855 + }, + { + "epoch": 0.04711321481644559, + "grad_norm": 0.814577043056488, + "learning_rate": 9.989265064522762e-06, + "loss": 0.8377, + "step": 856 + }, + { + "epoch": 0.047168253618801254, + "grad_norm": 1.1571552753448486, + "learning_rate": 9.989236656640125e-06, + "loss": 0.8562, + "step": 857 + }, + { + "epoch": 0.047223292421156915, + "grad_norm": 0.9681577682495117, + "learning_rate": 9.98920821125972e-06, + "loss": 0.8473, + "step": 858 + }, + { + "epoch": 0.047278331223512576, + "grad_norm": 0.9680121541023254, + "learning_rate": 9.989179728381761e-06, + "loss": 0.9811, + "step": 859 + }, + { + "epoch": 0.04733337002586824, + "grad_norm": 0.985477089881897, + "learning_rate": 9.989151208006464e-06, + "loss": 0.6994, + "step": 860 + }, + { + "epoch": 0.0473884088282239, + "grad_norm": 0.8612962365150452, + "learning_rate": 9.98912265013404e-06, + "loss": 0.7667, + "step": 861 + }, + { + "epoch": 0.04744344763057956, + "grad_norm": 0.8884604573249817, + "learning_rate": 9.989094054764708e-06, + "loss": 0.8382, + "step": 862 + }, + { + "epoch": 0.04749848643293522, + "grad_norm": 1.036881923675537, + "learning_rate": 9.989065421898681e-06, + "loss": 0.8748, + "step": 863 + }, + { + "epoch": 0.04755352523529088, + "grad_norm": 0.9954493045806885, + "learning_rate": 9.989036751536171e-06, + "loss": 0.9174, + "step": 864 + }, + { + "epoch": 0.04760856403764654, + "grad_norm": 0.9984694123268127, + "learning_rate": 9.989008043677399e-06, + "loss": 0.7636, + "step": 865 + }, + { + "epoch": 0.0476636028400022, + "grad_norm": 1.0412588119506836, + "learning_rate": 9.988979298322576e-06, + "loss": 0.773, + "step": 866 + }, + { + "epoch": 0.04771864164235786, + "grad_norm": 0.8034874796867371, + "learning_rate": 9.98895051547192e-06, + "loss": 0.7914, + "step": 867 + }, + { + "epoch": 0.04777368044471352, + "grad_norm": 0.8983979225158691, + "learning_rate": 9.988921695125648e-06, + "loss": 0.7292, + "step": 868 + }, + { + "epoch": 0.04782871924706918, + "grad_norm": 0.9445077776908875, + "learning_rate": 9.988892837283976e-06, + "loss": 0.8263, + "step": 869 + }, + { + "epoch": 0.047883758049424845, + "grad_norm": 1.0753306150436401, + "learning_rate": 9.988863941947121e-06, + "loss": 1.1122, + "step": 870 + }, + { + "epoch": 0.047938796851780506, + "grad_norm": 1.0091484785079956, + "learning_rate": 9.9888350091153e-06, + "loss": 0.9276, + "step": 871 + }, + { + "epoch": 0.04799383565413617, + "grad_norm": 1.0977306365966797, + "learning_rate": 9.988806038788732e-06, + "loss": 0.854, + "step": 872 + }, + { + "epoch": 0.04804887445649183, + "grad_norm": 1.0285007953643799, + "learning_rate": 9.988777030967632e-06, + "loss": 0.9441, + "step": 873 + }, + { + "epoch": 0.04810391325884749, + "grad_norm": 0.8973976373672485, + "learning_rate": 9.988747985652218e-06, + "loss": 0.786, + "step": 874 + }, + { + "epoch": 0.04815895206120315, + "grad_norm": 0.9809553623199463, + "learning_rate": 9.98871890284271e-06, + "loss": 0.9042, + "step": 875 + }, + { + "epoch": 0.048213990863558806, + "grad_norm": 0.8514279723167419, + "learning_rate": 9.988689782539326e-06, + "loss": 0.7874, + "step": 876 + }, + { + "epoch": 0.04826902966591447, + "grad_norm": 0.8299674391746521, + "learning_rate": 9.988660624742286e-06, + "loss": 0.8704, + "step": 877 + }, + { + "epoch": 0.04832406846827013, + "grad_norm": 0.9862462282180786, + "learning_rate": 9.988631429451809e-06, + "loss": 0.9963, + "step": 878 + }, + { + "epoch": 0.04837910727062579, + "grad_norm": 0.9041131734848022, + "learning_rate": 9.988602196668111e-06, + "loss": 0.9207, + "step": 879 + }, + { + "epoch": 0.04843414607298145, + "grad_norm": 0.8597276210784912, + "learning_rate": 9.988572926391416e-06, + "loss": 0.8226, + "step": 880 + }, + { + "epoch": 0.04848918487533711, + "grad_norm": 0.9494329690933228, + "learning_rate": 9.988543618621941e-06, + "loss": 0.8834, + "step": 881 + }, + { + "epoch": 0.048544223677692774, + "grad_norm": 0.9129118323326111, + "learning_rate": 9.98851427335991e-06, + "loss": 0.7819, + "step": 882 + }, + { + "epoch": 0.048599262480048436, + "grad_norm": 0.9145999550819397, + "learning_rate": 9.988484890605539e-06, + "loss": 0.885, + "step": 883 + }, + { + "epoch": 0.0486543012824041, + "grad_norm": 1.0115307569503784, + "learning_rate": 9.98845547035905e-06, + "loss": 0.8347, + "step": 884 + }, + { + "epoch": 0.04870934008475976, + "grad_norm": 1.1372706890106201, + "learning_rate": 9.988426012620667e-06, + "loss": 0.944, + "step": 885 + }, + { + "epoch": 0.04876437888711541, + "grad_norm": 0.9502811431884766, + "learning_rate": 9.98839651739061e-06, + "loss": 0.9054, + "step": 886 + }, + { + "epoch": 0.048819417689471074, + "grad_norm": 0.9612823128700256, + "learning_rate": 9.988366984669097e-06, + "loss": 0.8796, + "step": 887 + }, + { + "epoch": 0.048874456491826736, + "grad_norm": 0.9551461935043335, + "learning_rate": 9.988337414456355e-06, + "loss": 0.8769, + "step": 888 + }, + { + "epoch": 0.0489294952941824, + "grad_norm": 0.8554086089134216, + "learning_rate": 9.988307806752603e-06, + "loss": 0.892, + "step": 889 + }, + { + "epoch": 0.04898453409653806, + "grad_norm": 0.8418886661529541, + "learning_rate": 9.988278161558067e-06, + "loss": 0.7568, + "step": 890 + }, + { + "epoch": 0.04903957289889372, + "grad_norm": 1.4780360460281372, + "learning_rate": 9.988248478872967e-06, + "loss": 0.9126, + "step": 891 + }, + { + "epoch": 0.04909461170124938, + "grad_norm": 0.8236714005470276, + "learning_rate": 9.988218758697526e-06, + "loss": 0.7317, + "step": 892 + }, + { + "epoch": 0.04914965050360504, + "grad_norm": 0.8777141571044922, + "learning_rate": 9.988189001031968e-06, + "loss": 0.7989, + "step": 893 + }, + { + "epoch": 0.049204689305960704, + "grad_norm": 1.0235031843185425, + "learning_rate": 9.988159205876516e-06, + "loss": 0.8335, + "step": 894 + }, + { + "epoch": 0.049259728108316365, + "grad_norm": 0.9340357184410095, + "learning_rate": 9.988129373231395e-06, + "loss": 0.8129, + "step": 895 + }, + { + "epoch": 0.04931476691067203, + "grad_norm": 1.7686667442321777, + "learning_rate": 9.98809950309683e-06, + "loss": 0.9792, + "step": 896 + }, + { + "epoch": 0.04936980571302768, + "grad_norm": 0.9252369403839111, + "learning_rate": 9.988069595473044e-06, + "loss": 0.8671, + "step": 897 + }, + { + "epoch": 0.04942484451538334, + "grad_norm": 0.9989960789680481, + "learning_rate": 9.988039650360262e-06, + "loss": 0.9245, + "step": 898 + }, + { + "epoch": 0.049479883317739004, + "grad_norm": 1.062912106513977, + "learning_rate": 9.98800966775871e-06, + "loss": 0.9146, + "step": 899 + }, + { + "epoch": 0.049534922120094665, + "grad_norm": 0.8698169589042664, + "learning_rate": 9.98797964766861e-06, + "loss": 0.8606, + "step": 900 + }, + { + "epoch": 0.04958996092245033, + "grad_norm": 1.6754224300384521, + "learning_rate": 9.98794959009019e-06, + "loss": 0.9236, + "step": 901 + }, + { + "epoch": 0.04964499972480599, + "grad_norm": 1.084174394607544, + "learning_rate": 9.98791949502368e-06, + "loss": 0.9252, + "step": 902 + }, + { + "epoch": 0.04970003852716165, + "grad_norm": 0.9866724610328674, + "learning_rate": 9.987889362469301e-06, + "loss": 0.9096, + "step": 903 + }, + { + "epoch": 0.04975507732951731, + "grad_norm": 0.8814040422439575, + "learning_rate": 9.987859192427279e-06, + "loss": 0.8475, + "step": 904 + }, + { + "epoch": 0.04981011613187297, + "grad_norm": 0.8796457052230835, + "learning_rate": 9.987828984897843e-06, + "loss": 0.8478, + "step": 905 + }, + { + "epoch": 0.049865154934228634, + "grad_norm": 1.0541884899139404, + "learning_rate": 9.98779873988122e-06, + "loss": 0.9799, + "step": 906 + }, + { + "epoch": 0.049920193736584295, + "grad_norm": 0.91409832239151, + "learning_rate": 9.987768457377636e-06, + "loss": 0.8701, + "step": 907 + }, + { + "epoch": 0.04997523253893995, + "grad_norm": 1.0120370388031006, + "learning_rate": 9.98773813738732e-06, + "loss": 0.8417, + "step": 908 + }, + { + "epoch": 0.05003027134129561, + "grad_norm": 1.7744206190109253, + "learning_rate": 9.987707779910499e-06, + "loss": 0.9263, + "step": 909 + }, + { + "epoch": 0.05008531014365127, + "grad_norm": 0.9423969388008118, + "learning_rate": 9.987677384947402e-06, + "loss": 0.9667, + "step": 910 + }, + { + "epoch": 0.050140348946006934, + "grad_norm": 1.5940319299697876, + "learning_rate": 9.987646952498256e-06, + "loss": 0.9223, + "step": 911 + }, + { + "epoch": 0.050195387748362595, + "grad_norm": 0.941792368888855, + "learning_rate": 9.987616482563292e-06, + "loss": 0.895, + "step": 912 + }, + { + "epoch": 0.05025042655071826, + "grad_norm": 3.1945221424102783, + "learning_rate": 9.987585975142738e-06, + "loss": 0.837, + "step": 913 + }, + { + "epoch": 0.05030546535307392, + "grad_norm": 2.0819199085235596, + "learning_rate": 9.98755543023682e-06, + "loss": 0.918, + "step": 914 + }, + { + "epoch": 0.05036050415542958, + "grad_norm": 0.984282910823822, + "learning_rate": 9.987524847845773e-06, + "loss": 0.8589, + "step": 915 + }, + { + "epoch": 0.05041554295778524, + "grad_norm": 0.9021026492118835, + "learning_rate": 9.987494227969823e-06, + "loss": 0.9053, + "step": 916 + }, + { + "epoch": 0.0504705817601409, + "grad_norm": 2.6515462398529053, + "learning_rate": 9.9874635706092e-06, + "loss": 0.8874, + "step": 917 + }, + { + "epoch": 0.050525620562496563, + "grad_norm": 0.8893095254898071, + "learning_rate": 9.98743287576414e-06, + "loss": 0.8259, + "step": 918 + }, + { + "epoch": 0.05058065936485222, + "grad_norm": 0.9897775650024414, + "learning_rate": 9.987402143434868e-06, + "loss": 0.877, + "step": 919 + }, + { + "epoch": 0.05063569816720788, + "grad_norm": 0.9391944408416748, + "learning_rate": 9.987371373621614e-06, + "loss": 0.9363, + "step": 920 + }, + { + "epoch": 0.05069073696956354, + "grad_norm": 0.9585913419723511, + "learning_rate": 9.987340566324615e-06, + "loss": 0.8704, + "step": 921 + }, + { + "epoch": 0.0507457757719192, + "grad_norm": 0.9210980534553528, + "learning_rate": 9.987309721544098e-06, + "loss": 0.9321, + "step": 922 + }, + { + "epoch": 0.05080081457427486, + "grad_norm": 1.0713307857513428, + "learning_rate": 9.987278839280295e-06, + "loss": 0.9489, + "step": 923 + }, + { + "epoch": 0.050855853376630525, + "grad_norm": 1.0178636312484741, + "learning_rate": 9.98724791953344e-06, + "loss": 0.853, + "step": 924 + }, + { + "epoch": 0.050910892178986186, + "grad_norm": 0.9782636761665344, + "learning_rate": 9.987216962303766e-06, + "loss": 0.924, + "step": 925 + }, + { + "epoch": 0.05096593098134185, + "grad_norm": 0.9474522471427917, + "learning_rate": 9.987185967591503e-06, + "loss": 0.8619, + "step": 926 + }, + { + "epoch": 0.05102096978369751, + "grad_norm": 1.1875778436660767, + "learning_rate": 9.987154935396885e-06, + "loss": 1.012, + "step": 927 + }, + { + "epoch": 0.05107600858605317, + "grad_norm": 1.0585243701934814, + "learning_rate": 9.987123865720147e-06, + "loss": 0.9019, + "step": 928 + }, + { + "epoch": 0.051131047388408825, + "grad_norm": 0.9848800897598267, + "learning_rate": 9.98709275856152e-06, + "loss": 0.9665, + "step": 929 + }, + { + "epoch": 0.051186086190764486, + "grad_norm": 1.04201078414917, + "learning_rate": 9.987061613921238e-06, + "loss": 0.9269, + "step": 930 + }, + { + "epoch": 0.05124112499312015, + "grad_norm": 1.1748600006103516, + "learning_rate": 9.987030431799537e-06, + "loss": 0.8565, + "step": 931 + }, + { + "epoch": 0.05129616379547581, + "grad_norm": 1.879232406616211, + "learning_rate": 9.98699921219665e-06, + "loss": 0.8535, + "step": 932 + }, + { + "epoch": 0.05135120259783147, + "grad_norm": 0.9837847948074341, + "learning_rate": 9.986967955112812e-06, + "loss": 0.927, + "step": 933 + }, + { + "epoch": 0.05140624140018713, + "grad_norm": 0.8637211918830872, + "learning_rate": 9.986936660548257e-06, + "loss": 0.7903, + "step": 934 + }, + { + "epoch": 0.05146128020254279, + "grad_norm": 0.9078792929649353, + "learning_rate": 9.986905328503222e-06, + "loss": 0.9135, + "step": 935 + }, + { + "epoch": 0.051516319004898455, + "grad_norm": 0.9763005971908569, + "learning_rate": 9.98687395897794e-06, + "loss": 0.9006, + "step": 936 + }, + { + "epoch": 0.051571357807254116, + "grad_norm": 1.0174345970153809, + "learning_rate": 9.98684255197265e-06, + "loss": 0.9294, + "step": 937 + }, + { + "epoch": 0.05162639660960978, + "grad_norm": 0.8709769248962402, + "learning_rate": 9.986811107487584e-06, + "loss": 0.7986, + "step": 938 + }, + { + "epoch": 0.05168143541196544, + "grad_norm": 0.8717525601387024, + "learning_rate": 9.986779625522983e-06, + "loss": 0.8705, + "step": 939 + }, + { + "epoch": 0.05173647421432109, + "grad_norm": 0.9682945013046265, + "learning_rate": 9.98674810607908e-06, + "loss": 0.8127, + "step": 940 + }, + { + "epoch": 0.051791513016676755, + "grad_norm": 1.0248037576675415, + "learning_rate": 9.986716549156113e-06, + "loss": 0.9217, + "step": 941 + }, + { + "epoch": 0.051846551819032416, + "grad_norm": 0.9883397221565247, + "learning_rate": 9.98668495475432e-06, + "loss": 0.853, + "step": 942 + }, + { + "epoch": 0.05190159062138808, + "grad_norm": 0.9271108508110046, + "learning_rate": 9.986653322873937e-06, + "loss": 0.8807, + "step": 943 + }, + { + "epoch": 0.05195662942374374, + "grad_norm": 0.9027101397514343, + "learning_rate": 9.986621653515203e-06, + "loss": 0.88, + "step": 944 + }, + { + "epoch": 0.0520116682260994, + "grad_norm": 0.9807021617889404, + "learning_rate": 9.986589946678354e-06, + "loss": 0.8922, + "step": 945 + }, + { + "epoch": 0.05206670702845506, + "grad_norm": 0.8779157400131226, + "learning_rate": 9.98655820236363e-06, + "loss": 0.8988, + "step": 946 + }, + { + "epoch": 0.05212174583081072, + "grad_norm": 0.8182910680770874, + "learning_rate": 9.986526420571272e-06, + "loss": 0.7534, + "step": 947 + }, + { + "epoch": 0.052176784633166384, + "grad_norm": 0.9205981492996216, + "learning_rate": 9.986494601301513e-06, + "loss": 0.7516, + "step": 948 + }, + { + "epoch": 0.052231823435522046, + "grad_norm": 0.9829681515693665, + "learning_rate": 9.986462744554598e-06, + "loss": 0.9358, + "step": 949 + }, + { + "epoch": 0.05228686223787771, + "grad_norm": 0.8869890570640564, + "learning_rate": 9.986430850330762e-06, + "loss": 0.7933, + "step": 950 + }, + { + "epoch": 0.05234190104023336, + "grad_norm": 1.0226716995239258, + "learning_rate": 9.986398918630248e-06, + "loss": 0.9523, + "step": 951 + }, + { + "epoch": 0.05239693984258902, + "grad_norm": 0.9549778699874878, + "learning_rate": 9.986366949453293e-06, + "loss": 0.9368, + "step": 952 + }, + { + "epoch": 0.052451978644944684, + "grad_norm": 0.860454797744751, + "learning_rate": 9.98633494280014e-06, + "loss": 0.7618, + "step": 953 + }, + { + "epoch": 0.052507017447300346, + "grad_norm": 0.9623841643333435, + "learning_rate": 9.986302898671027e-06, + "loss": 0.8356, + "step": 954 + }, + { + "epoch": 0.05256205624965601, + "grad_norm": 0.9236606359481812, + "learning_rate": 9.986270817066196e-06, + "loss": 0.921, + "step": 955 + }, + { + "epoch": 0.05261709505201167, + "grad_norm": 1.0599812269210815, + "learning_rate": 9.98623869798589e-06, + "loss": 0.8082, + "step": 956 + }, + { + "epoch": 0.05267213385436733, + "grad_norm": 1.0321687459945679, + "learning_rate": 9.986206541430347e-06, + "loss": 0.9001, + "step": 957 + }, + { + "epoch": 0.05272717265672299, + "grad_norm": 0.8884543776512146, + "learning_rate": 9.986174347399813e-06, + "loss": 0.8317, + "step": 958 + }, + { + "epoch": 0.05278221145907865, + "grad_norm": 0.9592668414115906, + "learning_rate": 9.986142115894526e-06, + "loss": 0.9955, + "step": 959 + }, + { + "epoch": 0.052837250261434314, + "grad_norm": 0.9604032039642334, + "learning_rate": 9.986109846914729e-06, + "loss": 0.876, + "step": 960 + }, + { + "epoch": 0.052892289063789975, + "grad_norm": 0.9837536811828613, + "learning_rate": 9.986077540460664e-06, + "loss": 0.8247, + "step": 961 + }, + { + "epoch": 0.05294732786614563, + "grad_norm": 0.8570861220359802, + "learning_rate": 9.986045196532576e-06, + "loss": 0.879, + "step": 962 + }, + { + "epoch": 0.05300236666850129, + "grad_norm": 0.8441471457481384, + "learning_rate": 9.986012815130708e-06, + "loss": 0.8979, + "step": 963 + }, + { + "epoch": 0.05305740547085695, + "grad_norm": 0.8976197838783264, + "learning_rate": 9.985980396255302e-06, + "loss": 0.9382, + "step": 964 + }, + { + "epoch": 0.053112444273212614, + "grad_norm": 0.9685307741165161, + "learning_rate": 9.985947939906599e-06, + "loss": 0.8627, + "step": 965 + }, + { + "epoch": 0.053167483075568275, + "grad_norm": 0.8939018249511719, + "learning_rate": 9.98591544608485e-06, + "loss": 0.9221, + "step": 966 + }, + { + "epoch": 0.05322252187792394, + "grad_norm": 0.9218310713768005, + "learning_rate": 9.985882914790292e-06, + "loss": 0.8356, + "step": 967 + }, + { + "epoch": 0.0532775606802796, + "grad_norm": 0.9342261552810669, + "learning_rate": 9.985850346023174e-06, + "loss": 0.971, + "step": 968 + }, + { + "epoch": 0.05333259948263526, + "grad_norm": 1.0860705375671387, + "learning_rate": 9.985817739783741e-06, + "loss": 0.906, + "step": 969 + }, + { + "epoch": 0.05338763828499092, + "grad_norm": 0.8675006031990051, + "learning_rate": 9.985785096072234e-06, + "loss": 0.906, + "step": 970 + }, + { + "epoch": 0.05344267708734658, + "grad_norm": 0.8170626163482666, + "learning_rate": 9.985752414888903e-06, + "loss": 0.8109, + "step": 971 + }, + { + "epoch": 0.05349771588970224, + "grad_norm": 0.936434805393219, + "learning_rate": 9.98571969623399e-06, + "loss": 0.9219, + "step": 972 + }, + { + "epoch": 0.0535527546920579, + "grad_norm": 0.9316715002059937, + "learning_rate": 9.985686940107741e-06, + "loss": 0.8569, + "step": 973 + }, + { + "epoch": 0.05360779349441356, + "grad_norm": 1.183008074760437, + "learning_rate": 9.985654146510405e-06, + "loss": 0.837, + "step": 974 + }, + { + "epoch": 0.05366283229676922, + "grad_norm": 1.0788745880126953, + "learning_rate": 9.98562131544223e-06, + "loss": 0.8822, + "step": 975 + }, + { + "epoch": 0.05371787109912488, + "grad_norm": 0.9285461902618408, + "learning_rate": 9.985588446903455e-06, + "loss": 0.9279, + "step": 976 + }, + { + "epoch": 0.053772909901480544, + "grad_norm": 0.9389022588729858, + "learning_rate": 9.985555540894334e-06, + "loss": 0.9839, + "step": 977 + }, + { + "epoch": 0.053827948703836205, + "grad_norm": 0.8920616507530212, + "learning_rate": 9.985522597415112e-06, + "loss": 0.9205, + "step": 978 + }, + { + "epoch": 0.053882987506191866, + "grad_norm": 0.9755093455314636, + "learning_rate": 9.985489616466035e-06, + "loss": 0.8987, + "step": 979 + }, + { + "epoch": 0.05393802630854753, + "grad_norm": 0.96027010679245, + "learning_rate": 9.985456598047356e-06, + "loss": 0.8543, + "step": 980 + }, + { + "epoch": 0.05399306511090319, + "grad_norm": 1.0489718914031982, + "learning_rate": 9.985423542159317e-06, + "loss": 0.9179, + "step": 981 + }, + { + "epoch": 0.05404810391325885, + "grad_norm": 0.8665526509284973, + "learning_rate": 9.985390448802171e-06, + "loss": 0.9047, + "step": 982 + }, + { + "epoch": 0.054103142715614505, + "grad_norm": 0.8849464654922485, + "learning_rate": 9.985357317976163e-06, + "loss": 0.8892, + "step": 983 + }, + { + "epoch": 0.054158181517970166, + "grad_norm": 1.0083115100860596, + "learning_rate": 9.985324149681545e-06, + "loss": 0.7713, + "step": 984 + }, + { + "epoch": 0.05421322032032583, + "grad_norm": 0.8233863711357117, + "learning_rate": 9.985290943918565e-06, + "loss": 0.7967, + "step": 985 + }, + { + "epoch": 0.05426825912268149, + "grad_norm": 0.9615303874015808, + "learning_rate": 9.985257700687472e-06, + "loss": 0.8576, + "step": 986 + }, + { + "epoch": 0.05432329792503715, + "grad_norm": 0.8856416344642639, + "learning_rate": 9.985224419988517e-06, + "loss": 0.8614, + "step": 987 + }, + { + "epoch": 0.05437833672739281, + "grad_norm": 0.968325674533844, + "learning_rate": 9.98519110182195e-06, + "loss": 0.8247, + "step": 988 + }, + { + "epoch": 0.05443337552974847, + "grad_norm": 0.878402054309845, + "learning_rate": 9.985157746188021e-06, + "loss": 0.8661, + "step": 989 + }, + { + "epoch": 0.054488414332104135, + "grad_norm": 0.8376438021659851, + "learning_rate": 9.985124353086981e-06, + "loss": 0.8554, + "step": 990 + }, + { + "epoch": 0.054543453134459796, + "grad_norm": 1.0293036699295044, + "learning_rate": 9.98509092251908e-06, + "loss": 0.8049, + "step": 991 + }, + { + "epoch": 0.05459849193681546, + "grad_norm": 1.2345234155654907, + "learning_rate": 9.98505745448457e-06, + "loss": 1.0358, + "step": 992 + }, + { + "epoch": 0.05465353073917112, + "grad_norm": 0.9974482655525208, + "learning_rate": 9.985023948983703e-06, + "loss": 0.9329, + "step": 993 + }, + { + "epoch": 0.05470856954152677, + "grad_norm": 1.383955478668213, + "learning_rate": 9.984990406016732e-06, + "loss": 0.8688, + "step": 994 + }, + { + "epoch": 0.054763608343882435, + "grad_norm": 0.9369306564331055, + "learning_rate": 9.984956825583906e-06, + "loss": 0.8308, + "step": 995 + }, + { + "epoch": 0.054818647146238096, + "grad_norm": 0.8676120042800903, + "learning_rate": 9.984923207685478e-06, + "loss": 0.8283, + "step": 996 + }, + { + "epoch": 0.05487368594859376, + "grad_norm": 0.9218453168869019, + "learning_rate": 9.984889552321704e-06, + "loss": 0.7247, + "step": 997 + }, + { + "epoch": 0.05492872475094942, + "grad_norm": 0.8575478196144104, + "learning_rate": 9.984855859492833e-06, + "loss": 0.8462, + "step": 998 + }, + { + "epoch": 0.05498376355330508, + "grad_norm": 1.0042616128921509, + "learning_rate": 9.98482212919912e-06, + "loss": 0.9383, + "step": 999 + }, + { + "epoch": 0.05503880235566074, + "grad_norm": 0.8642181158065796, + "learning_rate": 9.984788361440817e-06, + "loss": 0.8805, + "step": 1000 + }, + { + "epoch": 0.0550938411580164, + "grad_norm": 0.8413823843002319, + "learning_rate": 9.984754556218178e-06, + "loss": 0.8161, + "step": 1001 + }, + { + "epoch": 0.055148879960372064, + "grad_norm": 0.9473856091499329, + "learning_rate": 9.984720713531462e-06, + "loss": 0.8425, + "step": 1002 + }, + { + "epoch": 0.055203918762727726, + "grad_norm": 0.7854379415512085, + "learning_rate": 9.984686833380917e-06, + "loss": 0.7506, + "step": 1003 + }, + { + "epoch": 0.05525895756508339, + "grad_norm": 0.9481745958328247, + "learning_rate": 9.984652915766801e-06, + "loss": 0.954, + "step": 1004 + }, + { + "epoch": 0.05531399636743904, + "grad_norm": 0.767803966999054, + "learning_rate": 9.984618960689366e-06, + "loss": 0.8113, + "step": 1005 + }, + { + "epoch": 0.0553690351697947, + "grad_norm": 0.8957781195640564, + "learning_rate": 9.984584968148871e-06, + "loss": 0.9042, + "step": 1006 + }, + { + "epoch": 0.055424073972150364, + "grad_norm": 1.116646409034729, + "learning_rate": 9.98455093814557e-06, + "loss": 0.8648, + "step": 1007 + }, + { + "epoch": 0.055479112774506026, + "grad_norm": 0.9567018151283264, + "learning_rate": 9.98451687067972e-06, + "loss": 0.9446, + "step": 1008 + }, + { + "epoch": 0.05553415157686169, + "grad_norm": 0.8470665812492371, + "learning_rate": 9.98448276575157e-06, + "loss": 0.8186, + "step": 1009 + }, + { + "epoch": 0.05558919037921735, + "grad_norm": 0.9595193862915039, + "learning_rate": 9.984448623361387e-06, + "loss": 0.8406, + "step": 1010 + }, + { + "epoch": 0.05564422918157301, + "grad_norm": 1.0579735040664673, + "learning_rate": 9.98441444350942e-06, + "loss": 0.9676, + "step": 1011 + }, + { + "epoch": 0.05569926798392867, + "grad_norm": 0.8693701028823853, + "learning_rate": 9.98438022619593e-06, + "loss": 0.9451, + "step": 1012 + }, + { + "epoch": 0.05575430678628433, + "grad_norm": 0.9251859784126282, + "learning_rate": 9.98434597142117e-06, + "loss": 0.7858, + "step": 1013 + }, + { + "epoch": 0.055809345588639994, + "grad_norm": 0.8584280014038086, + "learning_rate": 9.984311679185402e-06, + "loss": 0.8481, + "step": 1014 + }, + { + "epoch": 0.05586438439099565, + "grad_norm": 0.8903968334197998, + "learning_rate": 9.98427734948888e-06, + "loss": 0.7832, + "step": 1015 + }, + { + "epoch": 0.05591942319335131, + "grad_norm": 0.905581533908844, + "learning_rate": 9.984242982331864e-06, + "loss": 0.8088, + "step": 1016 + }, + { + "epoch": 0.05597446199570697, + "grad_norm": 0.9866476655006409, + "learning_rate": 9.984208577714612e-06, + "loss": 0.8366, + "step": 1017 + }, + { + "epoch": 0.05602950079806263, + "grad_norm": 0.8843809962272644, + "learning_rate": 9.984174135637384e-06, + "loss": 0.8961, + "step": 1018 + }, + { + "epoch": 0.056084539600418294, + "grad_norm": 0.9071753621101379, + "learning_rate": 9.984139656100435e-06, + "loss": 0.8671, + "step": 1019 + }, + { + "epoch": 0.056139578402773956, + "grad_norm": 0.9894018173217773, + "learning_rate": 9.984105139104028e-06, + "loss": 0.9099, + "step": 1020 + }, + { + "epoch": 0.05619461720512962, + "grad_norm": 0.8432741165161133, + "learning_rate": 9.98407058464842e-06, + "loss": 0.7817, + "step": 1021 + }, + { + "epoch": 0.05624965600748528, + "grad_norm": 0.9538390040397644, + "learning_rate": 9.984035992733873e-06, + "loss": 0.8689, + "step": 1022 + }, + { + "epoch": 0.05630469480984094, + "grad_norm": 0.9263421297073364, + "learning_rate": 9.984001363360645e-06, + "loss": 0.9066, + "step": 1023 + }, + { + "epoch": 0.0563597336121966, + "grad_norm": 0.8921047449111938, + "learning_rate": 9.983966696528996e-06, + "loss": 0.8304, + "step": 1024 + }, + { + "epoch": 0.05641477241455226, + "grad_norm": 0.8379812240600586, + "learning_rate": 9.983931992239188e-06, + "loss": 0.866, + "step": 1025 + }, + { + "epoch": 0.05646981121690792, + "grad_norm": 0.9444219470024109, + "learning_rate": 9.983897250491481e-06, + "loss": 0.9456, + "step": 1026 + }, + { + "epoch": 0.05652485001926358, + "grad_norm": 1.0268759727478027, + "learning_rate": 9.983862471286137e-06, + "loss": 0.8277, + "step": 1027 + }, + { + "epoch": 0.05657988882161924, + "grad_norm": 1.3949217796325684, + "learning_rate": 9.983827654623418e-06, + "loss": 0.9721, + "step": 1028 + }, + { + "epoch": 0.0566349276239749, + "grad_norm": 0.8899377584457397, + "learning_rate": 9.983792800503582e-06, + "loss": 0.8794, + "step": 1029 + }, + { + "epoch": 0.05668996642633056, + "grad_norm": 0.989072322845459, + "learning_rate": 9.983757908926895e-06, + "loss": 0.8852, + "step": 1030 + }, + { + "epoch": 0.056745005228686224, + "grad_norm": 0.9797759056091309, + "learning_rate": 9.983722979893615e-06, + "loss": 1.0405, + "step": 1031 + }, + { + "epoch": 0.056800044031041885, + "grad_norm": 0.9044767618179321, + "learning_rate": 9.98368801340401e-06, + "loss": 0.7243, + "step": 1032 + }, + { + "epoch": 0.05685508283339755, + "grad_norm": 1.116324782371521, + "learning_rate": 9.983653009458338e-06, + "loss": 0.9183, + "step": 1033 + }, + { + "epoch": 0.05691012163575321, + "grad_norm": 0.9373337030410767, + "learning_rate": 9.983617968056866e-06, + "loss": 0.9417, + "step": 1034 + }, + { + "epoch": 0.05696516043810887, + "grad_norm": 1.0587197542190552, + "learning_rate": 9.983582889199855e-06, + "loss": 0.896, + "step": 1035 + }, + { + "epoch": 0.05702019924046453, + "grad_norm": 1.0080119371414185, + "learning_rate": 9.983547772887568e-06, + "loss": 0.924, + "step": 1036 + }, + { + "epoch": 0.057075238042820185, + "grad_norm": 0.847091019153595, + "learning_rate": 9.98351261912027e-06, + "loss": 0.7443, + "step": 1037 + }, + { + "epoch": 0.05713027684517585, + "grad_norm": 0.9876272082328796, + "learning_rate": 9.983477427898225e-06, + "loss": 0.9365, + "step": 1038 + }, + { + "epoch": 0.05718531564753151, + "grad_norm": 0.9188169240951538, + "learning_rate": 9.983442199221698e-06, + "loss": 0.9213, + "step": 1039 + }, + { + "epoch": 0.05724035444988717, + "grad_norm": 0.932399332523346, + "learning_rate": 9.983406933090954e-06, + "loss": 0.958, + "step": 1040 + }, + { + "epoch": 0.05729539325224283, + "grad_norm": 0.9126465320587158, + "learning_rate": 9.983371629506258e-06, + "loss": 0.8913, + "step": 1041 + }, + { + "epoch": 0.05735043205459849, + "grad_norm": 0.80904620885849, + "learning_rate": 9.983336288467873e-06, + "loss": 0.7719, + "step": 1042 + }, + { + "epoch": 0.057405470856954154, + "grad_norm": 0.873833417892456, + "learning_rate": 9.983300909976067e-06, + "loss": 0.9201, + "step": 1043 + }, + { + "epoch": 0.057460509659309815, + "grad_norm": 0.8331829309463501, + "learning_rate": 9.983265494031107e-06, + "loss": 0.8605, + "step": 1044 + }, + { + "epoch": 0.057515548461665476, + "grad_norm": 0.8364768624305725, + "learning_rate": 9.983230040633255e-06, + "loss": 0.8627, + "step": 1045 + }, + { + "epoch": 0.05757058726402114, + "grad_norm": 0.9226736426353455, + "learning_rate": 9.98319454978278e-06, + "loss": 0.9759, + "step": 1046 + }, + { + "epoch": 0.05762562606637679, + "grad_norm": 0.8174427151679993, + "learning_rate": 9.98315902147995e-06, + "loss": 0.8066, + "step": 1047 + }, + { + "epoch": 0.057680664868732454, + "grad_norm": 0.9154924750328064, + "learning_rate": 9.98312345572503e-06, + "loss": 0.9112, + "step": 1048 + }, + { + "epoch": 0.057735703671088115, + "grad_norm": 0.8884655237197876, + "learning_rate": 9.983087852518289e-06, + "loss": 0.8699, + "step": 1049 + }, + { + "epoch": 0.057790742473443776, + "grad_norm": 0.8849230408668518, + "learning_rate": 9.983052211859992e-06, + "loss": 0.8999, + "step": 1050 + }, + { + "epoch": 0.05784578127579944, + "grad_norm": 1.025843858718872, + "learning_rate": 9.98301653375041e-06, + "loss": 0.7764, + "step": 1051 + }, + { + "epoch": 0.0579008200781551, + "grad_norm": 0.900505006313324, + "learning_rate": 9.98298081818981e-06, + "loss": 0.9196, + "step": 1052 + }, + { + "epoch": 0.05795585888051076, + "grad_norm": 0.9506704211235046, + "learning_rate": 9.982945065178457e-06, + "loss": 0.8319, + "step": 1053 + }, + { + "epoch": 0.05801089768286642, + "grad_norm": 0.9439849853515625, + "learning_rate": 9.982909274716626e-06, + "loss": 0.8561, + "step": 1054 + }, + { + "epoch": 0.05806593648522208, + "grad_norm": 0.8761240243911743, + "learning_rate": 9.982873446804579e-06, + "loss": 0.9681, + "step": 1055 + }, + { + "epoch": 0.058120975287577745, + "grad_norm": 0.8756145238876343, + "learning_rate": 9.982837581442592e-06, + "loss": 0.8452, + "step": 1056 + }, + { + "epoch": 0.058176014089933406, + "grad_norm": 0.8732383847236633, + "learning_rate": 9.982801678630932e-06, + "loss": 0.9018, + "step": 1057 + }, + { + "epoch": 0.05823105289228906, + "grad_norm": 0.8338272571563721, + "learning_rate": 9.982765738369867e-06, + "loss": 0.9308, + "step": 1058 + }, + { + "epoch": 0.05828609169464472, + "grad_norm": 0.843163013458252, + "learning_rate": 9.982729760659669e-06, + "loss": 0.7802, + "step": 1059 + }, + { + "epoch": 0.05834113049700038, + "grad_norm": 1.2007580995559692, + "learning_rate": 9.982693745500606e-06, + "loss": 0.8406, + "step": 1060 + }, + { + "epoch": 0.058396169299356045, + "grad_norm": 0.8760073781013489, + "learning_rate": 9.982657692892954e-06, + "loss": 0.8528, + "step": 1061 + }, + { + "epoch": 0.058451208101711706, + "grad_norm": 0.925309419631958, + "learning_rate": 9.982621602836978e-06, + "loss": 0.9601, + "step": 1062 + }, + { + "epoch": 0.05850624690406737, + "grad_norm": 0.9277135133743286, + "learning_rate": 9.982585475332952e-06, + "loss": 0.8405, + "step": 1063 + }, + { + "epoch": 0.05856128570642303, + "grad_norm": 0.928044319152832, + "learning_rate": 9.98254931038115e-06, + "loss": 0.8259, + "step": 1064 + }, + { + "epoch": 0.05861632450877869, + "grad_norm": 0.8363838195800781, + "learning_rate": 9.982513107981837e-06, + "loss": 0.8655, + "step": 1065 + }, + { + "epoch": 0.05867136331113435, + "grad_norm": 0.9800984859466553, + "learning_rate": 9.982476868135292e-06, + "loss": 0.9285, + "step": 1066 + }, + { + "epoch": 0.05872640211349001, + "grad_norm": 0.8062636256217957, + "learning_rate": 9.982440590841785e-06, + "loss": 0.754, + "step": 1067 + }, + { + "epoch": 0.058781440915845674, + "grad_norm": 1.2010705471038818, + "learning_rate": 9.982404276101586e-06, + "loss": 0.9872, + "step": 1068 + }, + { + "epoch": 0.05883647971820133, + "grad_norm": 1.0036406517028809, + "learning_rate": 9.982367923914971e-06, + "loss": 0.8724, + "step": 1069 + }, + { + "epoch": 0.05889151852055699, + "grad_norm": 0.8768866658210754, + "learning_rate": 9.982331534282212e-06, + "loss": 0.838, + "step": 1070 + }, + { + "epoch": 0.05894655732291265, + "grad_norm": 0.7892739176750183, + "learning_rate": 9.982295107203584e-06, + "loss": 0.6974, + "step": 1071 + }, + { + "epoch": 0.05900159612526831, + "grad_norm": 0.863315999507904, + "learning_rate": 9.982258642679358e-06, + "loss": 0.9282, + "step": 1072 + }, + { + "epoch": 0.059056634927623974, + "grad_norm": 0.8645132780075073, + "learning_rate": 9.982222140709812e-06, + "loss": 0.8504, + "step": 1073 + }, + { + "epoch": 0.059111673729979636, + "grad_norm": 1.0003199577331543, + "learning_rate": 9.982185601295216e-06, + "loss": 1.0293, + "step": 1074 + }, + { + "epoch": 0.0591667125323353, + "grad_norm": 0.8391831517219543, + "learning_rate": 9.982149024435848e-06, + "loss": 0.8609, + "step": 1075 + }, + { + "epoch": 0.05922175133469096, + "grad_norm": 0.9940230846405029, + "learning_rate": 9.982112410131981e-06, + "loss": 0.9623, + "step": 1076 + }, + { + "epoch": 0.05927679013704662, + "grad_norm": 1.0670262575149536, + "learning_rate": 9.98207575838389e-06, + "loss": 0.9952, + "step": 1077 + }, + { + "epoch": 0.05933182893940228, + "grad_norm": 0.8506165742874146, + "learning_rate": 9.982039069191853e-06, + "loss": 0.8401, + "step": 1078 + }, + { + "epoch": 0.05938686774175794, + "grad_norm": 0.8956409096717834, + "learning_rate": 9.982002342556144e-06, + "loss": 0.8779, + "step": 1079 + }, + { + "epoch": 0.0594419065441136, + "grad_norm": 0.8955749273300171, + "learning_rate": 9.981965578477038e-06, + "loss": 0.8946, + "step": 1080 + }, + { + "epoch": 0.05949694534646926, + "grad_norm": 0.9035234451293945, + "learning_rate": 9.981928776954811e-06, + "loss": 0.9352, + "step": 1081 + }, + { + "epoch": 0.05955198414882492, + "grad_norm": 0.8748759627342224, + "learning_rate": 9.981891937989743e-06, + "loss": 0.8803, + "step": 1082 + }, + { + "epoch": 0.05960702295118058, + "grad_norm": 0.9966281056404114, + "learning_rate": 9.981855061582108e-06, + "loss": 0.9304, + "step": 1083 + }, + { + "epoch": 0.05966206175353624, + "grad_norm": 0.8696668148040771, + "learning_rate": 9.981818147732183e-06, + "loss": 0.8706, + "step": 1084 + }, + { + "epoch": 0.059717100555891904, + "grad_norm": 0.9823188185691833, + "learning_rate": 9.981781196440249e-06, + "loss": 0.9431, + "step": 1085 + }, + { + "epoch": 0.059772139358247565, + "grad_norm": 0.8401583433151245, + "learning_rate": 9.981744207706577e-06, + "loss": 0.8369, + "step": 1086 + }, + { + "epoch": 0.05982717816060323, + "grad_norm": 0.8775757551193237, + "learning_rate": 9.981707181531452e-06, + "loss": 0.9516, + "step": 1087 + }, + { + "epoch": 0.05988221696295889, + "grad_norm": 0.9153465628623962, + "learning_rate": 9.981670117915148e-06, + "loss": 0.8997, + "step": 1088 + }, + { + "epoch": 0.05993725576531455, + "grad_norm": 0.9053078889846802, + "learning_rate": 9.981633016857946e-06, + "loss": 0.9452, + "step": 1089 + }, + { + "epoch": 0.059992294567670204, + "grad_norm": 0.9154480695724487, + "learning_rate": 9.981595878360123e-06, + "loss": 0.8293, + "step": 1090 + }, + { + "epoch": 0.060047333370025865, + "grad_norm": 0.85718834400177, + "learning_rate": 9.981558702421958e-06, + "loss": 0.876, + "step": 1091 + }, + { + "epoch": 0.06010237217238153, + "grad_norm": 0.9437130689620972, + "learning_rate": 9.981521489043734e-06, + "loss": 0.9731, + "step": 1092 + }, + { + "epoch": 0.06015741097473719, + "grad_norm": 0.9014891386032104, + "learning_rate": 9.981484238225725e-06, + "loss": 0.811, + "step": 1093 + }, + { + "epoch": 0.06021244977709285, + "grad_norm": 0.8942846655845642, + "learning_rate": 9.981446949968216e-06, + "loss": 0.808, + "step": 1094 + }, + { + "epoch": 0.06026748857944851, + "grad_norm": 0.855297863483429, + "learning_rate": 9.981409624271483e-06, + "loss": 0.8319, + "step": 1095 + }, + { + "epoch": 0.06032252738180417, + "grad_norm": 0.9310913681983948, + "learning_rate": 9.981372261135811e-06, + "loss": 0.899, + "step": 1096 + }, + { + "epoch": 0.060377566184159834, + "grad_norm": 0.8472979664802551, + "learning_rate": 9.981334860561478e-06, + "loss": 0.8818, + "step": 1097 + }, + { + "epoch": 0.060432604986515495, + "grad_norm": 0.896617591381073, + "learning_rate": 9.981297422548764e-06, + "loss": 0.8991, + "step": 1098 + }, + { + "epoch": 0.06048764378887116, + "grad_norm": 0.8543037176132202, + "learning_rate": 9.981259947097954e-06, + "loss": 0.8595, + "step": 1099 + }, + { + "epoch": 0.06054268259122682, + "grad_norm": 0.8794904947280884, + "learning_rate": 9.981222434209327e-06, + "loss": 0.8561, + "step": 1100 + }, + { + "epoch": 0.06059772139358247, + "grad_norm": 0.8882116675376892, + "learning_rate": 9.981184883883165e-06, + "loss": 0.8099, + "step": 1101 + }, + { + "epoch": 0.060652760195938134, + "grad_norm": 1.0068262815475464, + "learning_rate": 9.98114729611975e-06, + "loss": 0.8104, + "step": 1102 + }, + { + "epoch": 0.060707798998293795, + "grad_norm": 1.072316288948059, + "learning_rate": 9.981109670919366e-06, + "loss": 0.9877, + "step": 1103 + }, + { + "epoch": 0.06076283780064946, + "grad_norm": 0.9959045052528381, + "learning_rate": 9.981072008282298e-06, + "loss": 0.906, + "step": 1104 + }, + { + "epoch": 0.06081787660300512, + "grad_norm": 0.8712790608406067, + "learning_rate": 9.981034308208823e-06, + "loss": 0.8725, + "step": 1105 + }, + { + "epoch": 0.06087291540536078, + "grad_norm": 0.9114679098129272, + "learning_rate": 9.980996570699228e-06, + "loss": 0.8385, + "step": 1106 + }, + { + "epoch": 0.06092795420771644, + "grad_norm": 1.0024466514587402, + "learning_rate": 9.980958795753796e-06, + "loss": 0.8661, + "step": 1107 + }, + { + "epoch": 0.0609829930100721, + "grad_norm": 0.9578461050987244, + "learning_rate": 9.98092098337281e-06, + "loss": 0.9358, + "step": 1108 + }, + { + "epoch": 0.061038031812427763, + "grad_norm": 0.8677787780761719, + "learning_rate": 9.980883133556557e-06, + "loss": 0.8146, + "step": 1109 + }, + { + "epoch": 0.061093070614783425, + "grad_norm": 0.9072276949882507, + "learning_rate": 9.98084524630532e-06, + "loss": 0.91, + "step": 1110 + }, + { + "epoch": 0.061148109417139086, + "grad_norm": 0.8827292919158936, + "learning_rate": 9.980807321619381e-06, + "loss": 0.8854, + "step": 1111 + }, + { + "epoch": 0.06120314821949474, + "grad_norm": 1.0012744665145874, + "learning_rate": 9.98076935949903e-06, + "loss": 0.8242, + "step": 1112 + }, + { + "epoch": 0.0612581870218504, + "grad_norm": 0.9152620434761047, + "learning_rate": 9.980731359944548e-06, + "loss": 0.8832, + "step": 1113 + }, + { + "epoch": 0.061313225824206063, + "grad_norm": 0.8986824750900269, + "learning_rate": 9.980693322956222e-06, + "loss": 0.7975, + "step": 1114 + }, + { + "epoch": 0.061368264626561725, + "grad_norm": 0.9373019933700562, + "learning_rate": 9.98065524853434e-06, + "loss": 0.9541, + "step": 1115 + }, + { + "epoch": 0.061423303428917386, + "grad_norm": 0.9875593781471252, + "learning_rate": 9.980617136679185e-06, + "loss": 1.0052, + "step": 1116 + }, + { + "epoch": 0.06147834223127305, + "grad_norm": 1.0664819478988647, + "learning_rate": 9.980578987391045e-06, + "loss": 0.8584, + "step": 1117 + }, + { + "epoch": 0.06153338103362871, + "grad_norm": 0.9149377942085266, + "learning_rate": 9.980540800670207e-06, + "loss": 0.8467, + "step": 1118 + }, + { + "epoch": 0.06158841983598437, + "grad_norm": 0.9303194284439087, + "learning_rate": 9.980502576516959e-06, + "loss": 0.8219, + "step": 1119 + }, + { + "epoch": 0.06164345863834003, + "grad_norm": 0.9059457778930664, + "learning_rate": 9.980464314931583e-06, + "loss": 0.8459, + "step": 1120 + }, + { + "epoch": 0.06169849744069569, + "grad_norm": 0.9368849396705627, + "learning_rate": 9.980426015914375e-06, + "loss": 0.8933, + "step": 1121 + }, + { + "epoch": 0.061753536243051355, + "grad_norm": 0.8188626766204834, + "learning_rate": 9.980387679465615e-06, + "loss": 0.807, + "step": 1122 + }, + { + "epoch": 0.06180857504540701, + "grad_norm": 1.027171015739441, + "learning_rate": 9.980349305585595e-06, + "loss": 0.8919, + "step": 1123 + }, + { + "epoch": 0.06186361384776267, + "grad_norm": 0.831649899482727, + "learning_rate": 9.980310894274603e-06, + "loss": 0.8109, + "step": 1124 + }, + { + "epoch": 0.06191865265011833, + "grad_norm": 1.0170252323150635, + "learning_rate": 9.980272445532928e-06, + "loss": 0.9537, + "step": 1125 + }, + { + "epoch": 0.06197369145247399, + "grad_norm": 0.97837233543396, + "learning_rate": 9.980233959360858e-06, + "loss": 0.9104, + "step": 1126 + }, + { + "epoch": 0.062028730254829655, + "grad_norm": 0.9548324942588806, + "learning_rate": 9.980195435758681e-06, + "loss": 0.9473, + "step": 1127 + }, + { + "epoch": 0.062083769057185316, + "grad_norm": 0.8675842881202698, + "learning_rate": 9.980156874726692e-06, + "loss": 0.8313, + "step": 1128 + }, + { + "epoch": 0.06213880785954098, + "grad_norm": 0.8948968052864075, + "learning_rate": 9.980118276265173e-06, + "loss": 0.8008, + "step": 1129 + }, + { + "epoch": 0.06219384666189664, + "grad_norm": 0.8914239406585693, + "learning_rate": 9.98007964037442e-06, + "loss": 0.7642, + "step": 1130 + }, + { + "epoch": 0.0622488854642523, + "grad_norm": 0.9499951004981995, + "learning_rate": 9.980040967054723e-06, + "loss": 0.8669, + "step": 1131 + }, + { + "epoch": 0.06230392426660796, + "grad_norm": 0.8959251642227173, + "learning_rate": 9.980002256306369e-06, + "loss": 0.9177, + "step": 1132 + }, + { + "epoch": 0.062358963068963616, + "grad_norm": 0.8634380102157593, + "learning_rate": 9.97996350812965e-06, + "loss": 0.8252, + "step": 1133 + }, + { + "epoch": 0.06241400187131928, + "grad_norm": 0.9380598068237305, + "learning_rate": 9.97992472252486e-06, + "loss": 0.9335, + "step": 1134 + }, + { + "epoch": 0.06246904067367494, + "grad_norm": 0.8373183608055115, + "learning_rate": 9.97988589949229e-06, + "loss": 0.848, + "step": 1135 + }, + { + "epoch": 0.0625240794760306, + "grad_norm": 0.9649023413658142, + "learning_rate": 9.97984703903223e-06, + "loss": 0.9648, + "step": 1136 + }, + { + "epoch": 0.06257911827838626, + "grad_norm": 0.9972373843193054, + "learning_rate": 9.979808141144972e-06, + "loss": 0.9104, + "step": 1137 + }, + { + "epoch": 0.06263415708074192, + "grad_norm": 0.8230985403060913, + "learning_rate": 9.97976920583081e-06, + "loss": 0.8393, + "step": 1138 + }, + { + "epoch": 0.06268919588309758, + "grad_norm": 0.9775324463844299, + "learning_rate": 9.979730233090034e-06, + "loss": 0.8385, + "step": 1139 + }, + { + "epoch": 0.06274423468545325, + "grad_norm": 0.8288110494613647, + "learning_rate": 9.97969122292294e-06, + "loss": 0.7308, + "step": 1140 + }, + { + "epoch": 0.06279927348780891, + "grad_norm": 0.8980758786201477, + "learning_rate": 9.979652175329819e-06, + "loss": 0.863, + "step": 1141 + }, + { + "epoch": 0.06285431229016457, + "grad_norm": 7.43889045715332, + "learning_rate": 9.979613090310965e-06, + "loss": 0.9412, + "step": 1142 + }, + { + "epoch": 0.06290935109252023, + "grad_norm": 0.9758191704750061, + "learning_rate": 9.97957396786667e-06, + "loss": 0.8896, + "step": 1143 + }, + { + "epoch": 0.06296438989487589, + "grad_norm": 0.8211693167686462, + "learning_rate": 9.979534807997234e-06, + "loss": 0.7352, + "step": 1144 + }, + { + "epoch": 0.06301942869723155, + "grad_norm": 0.8643441796302795, + "learning_rate": 9.979495610702945e-06, + "loss": 0.8701, + "step": 1145 + }, + { + "epoch": 0.06307446749958721, + "grad_norm": 1.0199437141418457, + "learning_rate": 9.9794563759841e-06, + "loss": 0.9025, + "step": 1146 + }, + { + "epoch": 0.06312950630194288, + "grad_norm": 0.8367893695831299, + "learning_rate": 9.979417103840994e-06, + "loss": 0.8491, + "step": 1147 + }, + { + "epoch": 0.06318454510429854, + "grad_norm": 0.9411819577217102, + "learning_rate": 9.979377794273923e-06, + "loss": 0.8501, + "step": 1148 + }, + { + "epoch": 0.06323958390665418, + "grad_norm": 1.1497365236282349, + "learning_rate": 9.97933844728318e-06, + "loss": 1.0227, + "step": 1149 + }, + { + "epoch": 0.06329462270900985, + "grad_norm": 0.9892984628677368, + "learning_rate": 9.979299062869064e-06, + "loss": 0.8942, + "step": 1150 + }, + { + "epoch": 0.06334966151136551, + "grad_norm": 0.947952926158905, + "learning_rate": 9.979259641031867e-06, + "loss": 1.0149, + "step": 1151 + }, + { + "epoch": 0.06340470031372117, + "grad_norm": 0.9060251712799072, + "learning_rate": 9.979220181771889e-06, + "loss": 0.8607, + "step": 1152 + }, + { + "epoch": 0.06345973911607683, + "grad_norm": 0.8331984281539917, + "learning_rate": 9.979180685089424e-06, + "loss": 0.8777, + "step": 1153 + }, + { + "epoch": 0.06351477791843249, + "grad_norm": 0.9133188724517822, + "learning_rate": 9.97914115098477e-06, + "loss": 0.7409, + "step": 1154 + }, + { + "epoch": 0.06356981672078815, + "grad_norm": 0.9095513820648193, + "learning_rate": 9.979101579458224e-06, + "loss": 0.8938, + "step": 1155 + }, + { + "epoch": 0.06362485552314381, + "grad_norm": 0.9584553241729736, + "learning_rate": 9.979061970510082e-06, + "loss": 0.8765, + "step": 1156 + }, + { + "epoch": 0.06367989432549948, + "grad_norm": 0.8742124438285828, + "learning_rate": 9.979022324140644e-06, + "loss": 0.8564, + "step": 1157 + }, + { + "epoch": 0.06373493312785514, + "grad_norm": 0.8776904344558716, + "learning_rate": 9.978982640350208e-06, + "loss": 0.8713, + "step": 1158 + }, + { + "epoch": 0.0637899719302108, + "grad_norm": 0.8667464852333069, + "learning_rate": 9.97894291913907e-06, + "loss": 0.8705, + "step": 1159 + }, + { + "epoch": 0.06384501073256646, + "grad_norm": 0.9028087854385376, + "learning_rate": 9.978903160507531e-06, + "loss": 0.8297, + "step": 1160 + }, + { + "epoch": 0.06390004953492212, + "grad_norm": 0.900812029838562, + "learning_rate": 9.978863364455887e-06, + "loss": 0.8456, + "step": 1161 + }, + { + "epoch": 0.06395508833727778, + "grad_norm": 0.9667207598686218, + "learning_rate": 9.97882353098444e-06, + "loss": 0.8081, + "step": 1162 + }, + { + "epoch": 0.06401012713963344, + "grad_norm": 0.8959711194038391, + "learning_rate": 9.978783660093488e-06, + "loss": 0.8455, + "step": 1163 + }, + { + "epoch": 0.0640651659419891, + "grad_norm": 0.8519117832183838, + "learning_rate": 9.97874375178333e-06, + "loss": 0.849, + "step": 1164 + }, + { + "epoch": 0.06412020474434477, + "grad_norm": 1.0532654523849487, + "learning_rate": 9.978703806054267e-06, + "loss": 0.7356, + "step": 1165 + }, + { + "epoch": 0.06417524354670043, + "grad_norm": 1.0313252210617065, + "learning_rate": 9.9786638229066e-06, + "loss": 1.024, + "step": 1166 + }, + { + "epoch": 0.06423028234905609, + "grad_norm": 1.0567537546157837, + "learning_rate": 9.978623802340627e-06, + "loss": 0.9423, + "step": 1167 + }, + { + "epoch": 0.06428532115141175, + "grad_norm": 0.8198097348213196, + "learning_rate": 9.97858374435665e-06, + "loss": 0.829, + "step": 1168 + }, + { + "epoch": 0.06434035995376741, + "grad_norm": 0.8718193173408508, + "learning_rate": 9.97854364895497e-06, + "loss": 0.7184, + "step": 1169 + }, + { + "epoch": 0.06439539875612307, + "grad_norm": 0.8037594556808472, + "learning_rate": 9.978503516135892e-06, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.06445043755847872, + "grad_norm": 0.9052229523658752, + "learning_rate": 9.978463345899709e-06, + "loss": 0.8016, + "step": 1171 + }, + { + "epoch": 0.06450547636083438, + "grad_norm": 1.0194638967514038, + "learning_rate": 9.978423138246731e-06, + "loss": 0.9045, + "step": 1172 + }, + { + "epoch": 0.06456051516319004, + "grad_norm": 0.953078031539917, + "learning_rate": 9.978382893177259e-06, + "loss": 0.9661, + "step": 1173 + }, + { + "epoch": 0.0646155539655457, + "grad_norm": 0.8777341842651367, + "learning_rate": 9.978342610691592e-06, + "loss": 0.8685, + "step": 1174 + }, + { + "epoch": 0.06467059276790137, + "grad_norm": 1.0178394317626953, + "learning_rate": 9.978302290790034e-06, + "loss": 0.9075, + "step": 1175 + }, + { + "epoch": 0.06472563157025703, + "grad_norm": 0.935694694519043, + "learning_rate": 9.978261933472889e-06, + "loss": 0.8438, + "step": 1176 + }, + { + "epoch": 0.06478067037261269, + "grad_norm": 1.0022411346435547, + "learning_rate": 9.97822153874046e-06, + "loss": 0.8701, + "step": 1177 + }, + { + "epoch": 0.06483570917496835, + "grad_norm": 1.0371203422546387, + "learning_rate": 9.97818110659305e-06, + "loss": 0.9111, + "step": 1178 + }, + { + "epoch": 0.06489074797732401, + "grad_norm": 0.7972478866577148, + "learning_rate": 9.978140637030963e-06, + "loss": 0.8602, + "step": 1179 + }, + { + "epoch": 0.06494578677967967, + "grad_norm": 0.8556679487228394, + "learning_rate": 9.978100130054505e-06, + "loss": 0.9149, + "step": 1180 + }, + { + "epoch": 0.06500082558203533, + "grad_norm": 0.92474365234375, + "learning_rate": 9.978059585663979e-06, + "loss": 0.8608, + "step": 1181 + }, + { + "epoch": 0.065055864384391, + "grad_norm": 1.0170830488204956, + "learning_rate": 9.978019003859687e-06, + "loss": 0.9986, + "step": 1182 + }, + { + "epoch": 0.06511090318674666, + "grad_norm": 0.9405049681663513, + "learning_rate": 9.97797838464194e-06, + "loss": 0.9023, + "step": 1183 + }, + { + "epoch": 0.06516594198910232, + "grad_norm": 0.9351203441619873, + "learning_rate": 9.977937728011038e-06, + "loss": 0.8698, + "step": 1184 + }, + { + "epoch": 0.06522098079145798, + "grad_norm": 0.8620241284370422, + "learning_rate": 9.97789703396729e-06, + "loss": 0.9393, + "step": 1185 + }, + { + "epoch": 0.06527601959381364, + "grad_norm": 0.9440441131591797, + "learning_rate": 9.977856302511e-06, + "loss": 0.8249, + "step": 1186 + }, + { + "epoch": 0.0653310583961693, + "grad_norm": 0.8311079144477844, + "learning_rate": 9.977815533642474e-06, + "loss": 0.8614, + "step": 1187 + }, + { + "epoch": 0.06538609719852496, + "grad_norm": 0.8911672830581665, + "learning_rate": 9.977774727362018e-06, + "loss": 0.7909, + "step": 1188 + }, + { + "epoch": 0.06544113600088063, + "grad_norm": 0.9237088561058044, + "learning_rate": 9.97773388366994e-06, + "loss": 0.7116, + "step": 1189 + }, + { + "epoch": 0.06549617480323629, + "grad_norm": 1.1155747175216675, + "learning_rate": 9.977693002566549e-06, + "loss": 0.9248, + "step": 1190 + }, + { + "epoch": 0.06555121360559195, + "grad_norm": 0.9386736750602722, + "learning_rate": 9.977652084052148e-06, + "loss": 0.8307, + "step": 1191 + }, + { + "epoch": 0.0656062524079476, + "grad_norm": 1.1666040420532227, + "learning_rate": 9.977611128127044e-06, + "loss": 0.9723, + "step": 1192 + }, + { + "epoch": 0.06566129121030326, + "grad_norm": 1.2366368770599365, + "learning_rate": 9.977570134791552e-06, + "loss": 0.8253, + "step": 1193 + }, + { + "epoch": 0.06571633001265892, + "grad_norm": 0.823443591594696, + "learning_rate": 9.977529104045971e-06, + "loss": 0.7472, + "step": 1194 + }, + { + "epoch": 0.06577136881501458, + "grad_norm": 0.9481683969497681, + "learning_rate": 9.977488035890617e-06, + "loss": 0.887, + "step": 1195 + }, + { + "epoch": 0.06582640761737024, + "grad_norm": 0.9120422005653381, + "learning_rate": 9.977446930325794e-06, + "loss": 0.867, + "step": 1196 + }, + { + "epoch": 0.0658814464197259, + "grad_norm": 0.8595587015151978, + "learning_rate": 9.977405787351811e-06, + "loss": 0.8532, + "step": 1197 + }, + { + "epoch": 0.06593648522208156, + "grad_norm": 0.8590419888496399, + "learning_rate": 9.97736460696898e-06, + "loss": 0.8998, + "step": 1198 + }, + { + "epoch": 0.06599152402443723, + "grad_norm": 0.9670939445495605, + "learning_rate": 9.977323389177609e-06, + "loss": 0.8964, + "step": 1199 + }, + { + "epoch": 0.06604656282679289, + "grad_norm": 0.8870261907577515, + "learning_rate": 9.977282133978006e-06, + "loss": 0.9542, + "step": 1200 + }, + { + "epoch": 0.06610160162914855, + "grad_norm": 0.942294180393219, + "learning_rate": 9.977240841370484e-06, + "loss": 0.8681, + "step": 1201 + }, + { + "epoch": 0.06615664043150421, + "grad_norm": 0.9632517099380493, + "learning_rate": 9.977199511355353e-06, + "loss": 0.7327, + "step": 1202 + }, + { + "epoch": 0.06621167923385987, + "grad_norm": 4.8085479736328125, + "learning_rate": 9.97715814393292e-06, + "loss": 0.8528, + "step": 1203 + }, + { + "epoch": 0.06626671803621553, + "grad_norm": 0.9084093570709229, + "learning_rate": 9.977116739103503e-06, + "loss": 0.7836, + "step": 1204 + }, + { + "epoch": 0.0663217568385712, + "grad_norm": 0.8961902260780334, + "learning_rate": 9.977075296867406e-06, + "loss": 0.854, + "step": 1205 + }, + { + "epoch": 0.06637679564092686, + "grad_norm": 0.8727987408638, + "learning_rate": 9.977033817224945e-06, + "loss": 0.7931, + "step": 1206 + }, + { + "epoch": 0.06643183444328252, + "grad_norm": 0.8263267874717712, + "learning_rate": 9.976992300176428e-06, + "loss": 0.852, + "step": 1207 + }, + { + "epoch": 0.06648687324563818, + "grad_norm": 1.0499639511108398, + "learning_rate": 9.97695074572217e-06, + "loss": 1.0427, + "step": 1208 + }, + { + "epoch": 0.06654191204799384, + "grad_norm": 0.9337313771247864, + "learning_rate": 9.976909153862482e-06, + "loss": 0.8035, + "step": 1209 + }, + { + "epoch": 0.0665969508503495, + "grad_norm": 0.8795992732048035, + "learning_rate": 9.976867524597678e-06, + "loss": 0.9022, + "step": 1210 + }, + { + "epoch": 0.06665198965270516, + "grad_norm": 0.9787294268608093, + "learning_rate": 9.976825857928069e-06, + "loss": 0.8259, + "step": 1211 + }, + { + "epoch": 0.06670702845506082, + "grad_norm": 0.8570082187652588, + "learning_rate": 9.976784153853969e-06, + "loss": 0.8567, + "step": 1212 + }, + { + "epoch": 0.06676206725741649, + "grad_norm": 1.0620380640029907, + "learning_rate": 9.976742412375694e-06, + "loss": 0.851, + "step": 1213 + }, + { + "epoch": 0.06681710605977213, + "grad_norm": 0.8545439839363098, + "learning_rate": 9.976700633493551e-06, + "loss": 0.8827, + "step": 1214 + }, + { + "epoch": 0.0668721448621278, + "grad_norm": 0.8543682098388672, + "learning_rate": 9.97665881720786e-06, + "loss": 0.8524, + "step": 1215 + }, + { + "epoch": 0.06692718366448346, + "grad_norm": 0.7748527526855469, + "learning_rate": 9.976616963518935e-06, + "loss": 0.7459, + "step": 1216 + }, + { + "epoch": 0.06698222246683912, + "grad_norm": 0.9876659512519836, + "learning_rate": 9.976575072427087e-06, + "loss": 0.8426, + "step": 1217 + }, + { + "epoch": 0.06703726126919478, + "grad_norm": 0.8763901591300964, + "learning_rate": 9.976533143932635e-06, + "loss": 0.8561, + "step": 1218 + }, + { + "epoch": 0.06709230007155044, + "grad_norm": 0.7816654443740845, + "learning_rate": 9.97649117803589e-06, + "loss": 0.8361, + "step": 1219 + }, + { + "epoch": 0.0671473388739061, + "grad_norm": 0.8659802675247192, + "learning_rate": 9.97644917473717e-06, + "loss": 0.897, + "step": 1220 + }, + { + "epoch": 0.06720237767626176, + "grad_norm": 0.9180877208709717, + "learning_rate": 9.97640713403679e-06, + "loss": 0.9516, + "step": 1221 + }, + { + "epoch": 0.06725741647861742, + "grad_norm": 0.9624410271644592, + "learning_rate": 9.976365055935067e-06, + "loss": 0.9119, + "step": 1222 + }, + { + "epoch": 0.06731245528097309, + "grad_norm": 0.8291105031967163, + "learning_rate": 9.976322940432314e-06, + "loss": 0.788, + "step": 1223 + }, + { + "epoch": 0.06736749408332875, + "grad_norm": 0.9858983755111694, + "learning_rate": 9.976280787528854e-06, + "loss": 0.8794, + "step": 1224 + }, + { + "epoch": 0.06742253288568441, + "grad_norm": 0.8283948302268982, + "learning_rate": 9.976238597224996e-06, + "loss": 0.8571, + "step": 1225 + }, + { + "epoch": 0.06747757168804007, + "grad_norm": 0.8585363626480103, + "learning_rate": 9.976196369521063e-06, + "loss": 0.9005, + "step": 1226 + }, + { + "epoch": 0.06753261049039573, + "grad_norm": 0.847882091999054, + "learning_rate": 9.976154104417369e-06, + "loss": 0.8058, + "step": 1227 + }, + { + "epoch": 0.06758764929275139, + "grad_norm": 0.9045611023902893, + "learning_rate": 9.976111801914232e-06, + "loss": 0.7864, + "step": 1228 + }, + { + "epoch": 0.06764268809510705, + "grad_norm": 0.805932879447937, + "learning_rate": 9.976069462011972e-06, + "loss": 0.8436, + "step": 1229 + }, + { + "epoch": 0.06769772689746271, + "grad_norm": 0.8809003233909607, + "learning_rate": 9.976027084710906e-06, + "loss": 0.7876, + "step": 1230 + }, + { + "epoch": 0.06775276569981838, + "grad_norm": 0.8681740760803223, + "learning_rate": 9.975984670011352e-06, + "loss": 0.877, + "step": 1231 + }, + { + "epoch": 0.06780780450217404, + "grad_norm": 0.9909854531288147, + "learning_rate": 9.975942217913627e-06, + "loss": 0.8957, + "step": 1232 + }, + { + "epoch": 0.0678628433045297, + "grad_norm": 0.9213934540748596, + "learning_rate": 9.975899728418056e-06, + "loss": 0.8344, + "step": 1233 + }, + { + "epoch": 0.06791788210688536, + "grad_norm": 0.8289967179298401, + "learning_rate": 9.975857201524952e-06, + "loss": 0.876, + "step": 1234 + }, + { + "epoch": 0.06797292090924101, + "grad_norm": 0.891812264919281, + "learning_rate": 9.97581463723464e-06, + "loss": 0.8611, + "step": 1235 + }, + { + "epoch": 0.06802795971159667, + "grad_norm": 1.0301382541656494, + "learning_rate": 9.975772035547435e-06, + "loss": 0.8177, + "step": 1236 + }, + { + "epoch": 0.06808299851395233, + "grad_norm": 0.8380662798881531, + "learning_rate": 9.975729396463659e-06, + "loss": 0.8631, + "step": 1237 + }, + { + "epoch": 0.06813803731630799, + "grad_norm": 0.9226046204566956, + "learning_rate": 9.975686719983633e-06, + "loss": 0.8927, + "step": 1238 + }, + { + "epoch": 0.06819307611866365, + "grad_norm": 0.8917136192321777, + "learning_rate": 9.975644006107679e-06, + "loss": 0.9048, + "step": 1239 + }, + { + "epoch": 0.06824811492101931, + "grad_norm": 0.8559191226959229, + "learning_rate": 9.975601254836114e-06, + "loss": 0.8169, + "step": 1240 + }, + { + "epoch": 0.06830315372337498, + "grad_norm": 0.9345341920852661, + "learning_rate": 9.975558466169263e-06, + "loss": 0.7929, + "step": 1241 + }, + { + "epoch": 0.06835819252573064, + "grad_norm": 0.9155850410461426, + "learning_rate": 9.975515640107447e-06, + "loss": 0.8825, + "step": 1242 + }, + { + "epoch": 0.0684132313280863, + "grad_norm": 0.899712860584259, + "learning_rate": 9.975472776650987e-06, + "loss": 0.825, + "step": 1243 + }, + { + "epoch": 0.06846827013044196, + "grad_norm": 0.8280880451202393, + "learning_rate": 9.975429875800206e-06, + "loss": 0.8539, + "step": 1244 + }, + { + "epoch": 0.06852330893279762, + "grad_norm": 0.9589636325836182, + "learning_rate": 9.975386937555426e-06, + "loss": 0.9465, + "step": 1245 + }, + { + "epoch": 0.06857834773515328, + "grad_norm": 1.1027253866195679, + "learning_rate": 9.97534396191697e-06, + "loss": 0.87, + "step": 1246 + }, + { + "epoch": 0.06863338653750894, + "grad_norm": 1.0510318279266357, + "learning_rate": 9.975300948885158e-06, + "loss": 0.8569, + "step": 1247 + }, + { + "epoch": 0.0686884253398646, + "grad_norm": 0.8897958397865295, + "learning_rate": 9.975257898460317e-06, + "loss": 0.8431, + "step": 1248 + }, + { + "epoch": 0.06874346414222027, + "grad_norm": 0.8827036619186401, + "learning_rate": 9.975214810642771e-06, + "loss": 0.922, + "step": 1249 + }, + { + "epoch": 0.06879850294457593, + "grad_norm": 0.8798324465751648, + "learning_rate": 9.97517168543284e-06, + "loss": 0.7837, + "step": 1250 + }, + { + "epoch": 0.06885354174693159, + "grad_norm": 0.9053803086280823, + "learning_rate": 9.975128522830853e-06, + "loss": 0.82, + "step": 1251 + }, + { + "epoch": 0.06890858054928725, + "grad_norm": 0.8362607359886169, + "learning_rate": 9.975085322837129e-06, + "loss": 0.7684, + "step": 1252 + }, + { + "epoch": 0.06896361935164291, + "grad_norm": 0.8898602724075317, + "learning_rate": 9.975042085451997e-06, + "loss": 0.8205, + "step": 1253 + }, + { + "epoch": 0.06901865815399857, + "grad_norm": 0.9210274815559387, + "learning_rate": 9.97499881067578e-06, + "loss": 0.8364, + "step": 1254 + }, + { + "epoch": 0.06907369695635424, + "grad_norm": 1.0881952047348022, + "learning_rate": 9.974955498508804e-06, + "loss": 0.8234, + "step": 1255 + }, + { + "epoch": 0.0691287357587099, + "grad_norm": 0.8875024914741516, + "learning_rate": 9.974912148951394e-06, + "loss": 0.7974, + "step": 1256 + }, + { + "epoch": 0.06918377456106554, + "grad_norm": 0.9065666794776917, + "learning_rate": 9.974868762003876e-06, + "loss": 0.7721, + "step": 1257 + }, + { + "epoch": 0.0692388133634212, + "grad_norm": 0.8904553651809692, + "learning_rate": 9.974825337666576e-06, + "loss": 0.8551, + "step": 1258 + }, + { + "epoch": 0.06929385216577687, + "grad_norm": 0.8586102724075317, + "learning_rate": 9.974781875939821e-06, + "loss": 0.8666, + "step": 1259 + }, + { + "epoch": 0.06934889096813253, + "grad_norm": 0.9103402495384216, + "learning_rate": 9.974738376823935e-06, + "loss": 0.8361, + "step": 1260 + }, + { + "epoch": 0.06940392977048819, + "grad_norm": 0.8657701015472412, + "learning_rate": 9.974694840319249e-06, + "loss": 0.8217, + "step": 1261 + }, + { + "epoch": 0.06945896857284385, + "grad_norm": 0.865703821182251, + "learning_rate": 9.974651266426088e-06, + "loss": 0.8751, + "step": 1262 + }, + { + "epoch": 0.06951400737519951, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.974607655144779e-06, + "loss": 0.8709, + "step": 1263 + }, + { + "epoch": 0.06956904617755517, + "grad_norm": 0.8417405486106873, + "learning_rate": 9.97456400647565e-06, + "loss": 0.8104, + "step": 1264 + }, + { + "epoch": 0.06962408497991084, + "grad_norm": 0.8578035235404968, + "learning_rate": 9.974520320419032e-06, + "loss": 0.9173, + "step": 1265 + }, + { + "epoch": 0.0696791237822665, + "grad_norm": 0.957539439201355, + "learning_rate": 9.974476596975249e-06, + "loss": 0.8955, + "step": 1266 + }, + { + "epoch": 0.06973416258462216, + "grad_norm": 0.851222038269043, + "learning_rate": 9.974432836144632e-06, + "loss": 0.8696, + "step": 1267 + }, + { + "epoch": 0.06978920138697782, + "grad_norm": 0.8178789615631104, + "learning_rate": 9.974389037927508e-06, + "loss": 0.7921, + "step": 1268 + }, + { + "epoch": 0.06984424018933348, + "grad_norm": 0.954091489315033, + "learning_rate": 9.97434520232421e-06, + "loss": 0.9362, + "step": 1269 + }, + { + "epoch": 0.06989927899168914, + "grad_norm": 0.8525053858757019, + "learning_rate": 9.974301329335063e-06, + "loss": 0.7996, + "step": 1270 + }, + { + "epoch": 0.0699543177940448, + "grad_norm": 0.9340476393699646, + "learning_rate": 9.9742574189604e-06, + "loss": 0.9091, + "step": 1271 + }, + { + "epoch": 0.07000935659640047, + "grad_norm": 0.7946187257766724, + "learning_rate": 9.974213471200548e-06, + "loss": 0.874, + "step": 1272 + }, + { + "epoch": 0.07006439539875613, + "grad_norm": 0.8048381209373474, + "learning_rate": 9.97416948605584e-06, + "loss": 0.8557, + "step": 1273 + }, + { + "epoch": 0.07011943420111179, + "grad_norm": 0.9849064946174622, + "learning_rate": 9.974125463526607e-06, + "loss": 0.8154, + "step": 1274 + }, + { + "epoch": 0.07017447300346745, + "grad_norm": 0.9030239582061768, + "learning_rate": 9.974081403613178e-06, + "loss": 0.9411, + "step": 1275 + }, + { + "epoch": 0.07022951180582311, + "grad_norm": 0.8869300484657288, + "learning_rate": 9.974037306315882e-06, + "loss": 0.8978, + "step": 1276 + }, + { + "epoch": 0.07028455060817877, + "grad_norm": 0.8558536767959595, + "learning_rate": 9.973993171635057e-06, + "loss": 0.8937, + "step": 1277 + }, + { + "epoch": 0.07033958941053442, + "grad_norm": 0.9005453586578369, + "learning_rate": 9.973948999571029e-06, + "loss": 0.9336, + "step": 1278 + }, + { + "epoch": 0.07039462821289008, + "grad_norm": 0.8489978909492493, + "learning_rate": 9.973904790124131e-06, + "loss": 0.8267, + "step": 1279 + }, + { + "epoch": 0.07044966701524574, + "grad_norm": 0.8295948505401611, + "learning_rate": 9.973860543294696e-06, + "loss": 0.8478, + "step": 1280 + }, + { + "epoch": 0.0705047058176014, + "grad_norm": 0.8111379742622375, + "learning_rate": 9.973816259083058e-06, + "loss": 0.8333, + "step": 1281 + }, + { + "epoch": 0.07055974461995707, + "grad_norm": 0.9380189776420593, + "learning_rate": 9.973771937489547e-06, + "loss": 0.9718, + "step": 1282 + }, + { + "epoch": 0.07061478342231273, + "grad_norm": 1.251194953918457, + "learning_rate": 9.973727578514499e-06, + "loss": 0.9531, + "step": 1283 + }, + { + "epoch": 0.07066982222466839, + "grad_norm": 0.9897224307060242, + "learning_rate": 9.973683182158243e-06, + "loss": 0.7853, + "step": 1284 + }, + { + "epoch": 0.07072486102702405, + "grad_norm": 0.8409335017204285, + "learning_rate": 9.973638748421119e-06, + "loss": 0.7692, + "step": 1285 + }, + { + "epoch": 0.07077989982937971, + "grad_norm": 0.9019681215286255, + "learning_rate": 9.973594277303456e-06, + "loss": 0.8135, + "step": 1286 + }, + { + "epoch": 0.07083493863173537, + "grad_norm": 0.9236096739768982, + "learning_rate": 9.973549768805588e-06, + "loss": 0.9304, + "step": 1287 + }, + { + "epoch": 0.07088997743409103, + "grad_norm": 0.9244743585586548, + "learning_rate": 9.973505222927854e-06, + "loss": 0.9056, + "step": 1288 + }, + { + "epoch": 0.0709450162364467, + "grad_norm": 1.3418753147125244, + "learning_rate": 9.973460639670585e-06, + "loss": 0.8419, + "step": 1289 + }, + { + "epoch": 0.07100005503880236, + "grad_norm": 0.8715767860412598, + "learning_rate": 9.973416019034117e-06, + "loss": 0.9704, + "step": 1290 + }, + { + "epoch": 0.07105509384115802, + "grad_norm": 0.9609012007713318, + "learning_rate": 9.973371361018787e-06, + "loss": 0.8807, + "step": 1291 + }, + { + "epoch": 0.07111013264351368, + "grad_norm": 0.8085873126983643, + "learning_rate": 9.973326665624927e-06, + "loss": 0.7947, + "step": 1292 + }, + { + "epoch": 0.07116517144586934, + "grad_norm": 0.919280469417572, + "learning_rate": 9.973281932852877e-06, + "loss": 0.9743, + "step": 1293 + }, + { + "epoch": 0.071220210248225, + "grad_norm": 1.0651074647903442, + "learning_rate": 9.973237162702968e-06, + "loss": 0.7164, + "step": 1294 + }, + { + "epoch": 0.07127524905058066, + "grad_norm": 0.987251341342926, + "learning_rate": 9.973192355175542e-06, + "loss": 0.9286, + "step": 1295 + }, + { + "epoch": 0.07133028785293632, + "grad_norm": 1.5507274866104126, + "learning_rate": 9.973147510270935e-06, + "loss": 0.9733, + "step": 1296 + }, + { + "epoch": 0.07138532665529199, + "grad_norm": 0.8439416885375977, + "learning_rate": 9.97310262798948e-06, + "loss": 0.7462, + "step": 1297 + }, + { + "epoch": 0.07144036545764765, + "grad_norm": 0.9604889750480652, + "learning_rate": 9.973057708331519e-06, + "loss": 1.0006, + "step": 1298 + }, + { + "epoch": 0.07149540426000331, + "grad_norm": 0.8568960428237915, + "learning_rate": 9.973012751297386e-06, + "loss": 0.878, + "step": 1299 + }, + { + "epoch": 0.07155044306235896, + "grad_norm": 0.8169522285461426, + "learning_rate": 9.972967756887419e-06, + "loss": 0.8241, + "step": 1300 + }, + { + "epoch": 0.07160548186471462, + "grad_norm": 0.875738799571991, + "learning_rate": 9.97292272510196e-06, + "loss": 0.854, + "step": 1301 + }, + { + "epoch": 0.07166052066707028, + "grad_norm": 0.7877739071846008, + "learning_rate": 9.972877655941345e-06, + "loss": 0.779, + "step": 1302 + }, + { + "epoch": 0.07171555946942594, + "grad_norm": 0.8148574829101562, + "learning_rate": 9.972832549405912e-06, + "loss": 0.6965, + "step": 1303 + }, + { + "epoch": 0.0717705982717816, + "grad_norm": 0.936720609664917, + "learning_rate": 9.972787405495998e-06, + "loss": 0.798, + "step": 1304 + }, + { + "epoch": 0.07182563707413726, + "grad_norm": 0.8932886123657227, + "learning_rate": 9.972742224211949e-06, + "loss": 0.9196, + "step": 1305 + }, + { + "epoch": 0.07188067587649292, + "grad_norm": 0.899246871471405, + "learning_rate": 9.972697005554099e-06, + "loss": 0.8081, + "step": 1306 + }, + { + "epoch": 0.07193571467884859, + "grad_norm": 0.8789899349212646, + "learning_rate": 9.972651749522788e-06, + "loss": 0.89, + "step": 1307 + }, + { + "epoch": 0.07199075348120425, + "grad_norm": 1.2412173748016357, + "learning_rate": 9.97260645611836e-06, + "loss": 0.9866, + "step": 1308 + }, + { + "epoch": 0.07204579228355991, + "grad_norm": 0.8655833005905151, + "learning_rate": 9.972561125341152e-06, + "loss": 0.8144, + "step": 1309 + }, + { + "epoch": 0.07210083108591557, + "grad_norm": 0.8705299496650696, + "learning_rate": 9.972515757191506e-06, + "loss": 0.8431, + "step": 1310 + }, + { + "epoch": 0.07215586988827123, + "grad_norm": 0.8813188672065735, + "learning_rate": 9.972470351669761e-06, + "loss": 0.859, + "step": 1311 + }, + { + "epoch": 0.0722109086906269, + "grad_norm": 2.043627977371216, + "learning_rate": 9.972424908776262e-06, + "loss": 0.9886, + "step": 1312 + }, + { + "epoch": 0.07226594749298255, + "grad_norm": 0.9167500734329224, + "learning_rate": 9.972379428511348e-06, + "loss": 0.7203, + "step": 1313 + }, + { + "epoch": 0.07232098629533822, + "grad_norm": 1.3145136833190918, + "learning_rate": 9.972333910875358e-06, + "loss": 0.9325, + "step": 1314 + }, + { + "epoch": 0.07237602509769388, + "grad_norm": 0.834710419178009, + "learning_rate": 9.972288355868641e-06, + "loss": 0.9361, + "step": 1315 + }, + { + "epoch": 0.07243106390004954, + "grad_norm": 0.9039230942726135, + "learning_rate": 9.972242763491535e-06, + "loss": 0.8027, + "step": 1316 + }, + { + "epoch": 0.0724861027024052, + "grad_norm": 0.8911495208740234, + "learning_rate": 9.972197133744384e-06, + "loss": 0.951, + "step": 1317 + }, + { + "epoch": 0.07254114150476086, + "grad_norm": 1.0752439498901367, + "learning_rate": 9.972151466627529e-06, + "loss": 0.8421, + "step": 1318 + }, + { + "epoch": 0.07259618030711652, + "grad_norm": 0.926135778427124, + "learning_rate": 9.972105762141314e-06, + "loss": 0.8901, + "step": 1319 + }, + { + "epoch": 0.07265121910947218, + "grad_norm": 0.8166295289993286, + "learning_rate": 9.972060020286085e-06, + "loss": 0.7845, + "step": 1320 + }, + { + "epoch": 0.07270625791182783, + "grad_norm": 1.0000934600830078, + "learning_rate": 9.972014241062182e-06, + "loss": 0.8383, + "step": 1321 + }, + { + "epoch": 0.0727612967141835, + "grad_norm": 1.2617899179458618, + "learning_rate": 9.971968424469951e-06, + "loss": 0.9826, + "step": 1322 + }, + { + "epoch": 0.07281633551653915, + "grad_norm": 0.8451040983200073, + "learning_rate": 9.971922570509738e-06, + "loss": 0.8262, + "step": 1323 + }, + { + "epoch": 0.07287137431889482, + "grad_norm": 0.8101939558982849, + "learning_rate": 9.971876679181884e-06, + "loss": 0.6904, + "step": 1324 + }, + { + "epoch": 0.07292641312125048, + "grad_norm": 0.8805514574050903, + "learning_rate": 9.971830750486736e-06, + "loss": 0.8491, + "step": 1325 + }, + { + "epoch": 0.07298145192360614, + "grad_norm": 0.8236901164054871, + "learning_rate": 9.97178478442464e-06, + "loss": 0.8462, + "step": 1326 + }, + { + "epoch": 0.0730364907259618, + "grad_norm": 0.9183042645454407, + "learning_rate": 9.971738780995938e-06, + "loss": 0.7577, + "step": 1327 + }, + { + "epoch": 0.07309152952831746, + "grad_norm": 0.8425934314727783, + "learning_rate": 9.971692740200982e-06, + "loss": 0.8462, + "step": 1328 + }, + { + "epoch": 0.07314656833067312, + "grad_norm": 0.9114993214607239, + "learning_rate": 9.971646662040112e-06, + "loss": 0.9132, + "step": 1329 + }, + { + "epoch": 0.07320160713302878, + "grad_norm": 0.8516649603843689, + "learning_rate": 9.971600546513675e-06, + "loss": 0.8819, + "step": 1330 + }, + { + "epoch": 0.07325664593538445, + "grad_norm": 1.0859558582305908, + "learning_rate": 9.971554393622023e-06, + "loss": 0.9929, + "step": 1331 + }, + { + "epoch": 0.07331168473774011, + "grad_norm": 0.8906900882720947, + "learning_rate": 9.971508203365497e-06, + "loss": 0.9166, + "step": 1332 + }, + { + "epoch": 0.07336672354009577, + "grad_norm": 0.8931803703308105, + "learning_rate": 9.971461975744445e-06, + "loss": 0.864, + "step": 1333 + }, + { + "epoch": 0.07342176234245143, + "grad_norm": 0.8404982686042786, + "learning_rate": 9.971415710759216e-06, + "loss": 0.8609, + "step": 1334 + }, + { + "epoch": 0.07347680114480709, + "grad_norm": 0.8016490340232849, + "learning_rate": 9.971369408410157e-06, + "loss": 0.7694, + "step": 1335 + }, + { + "epoch": 0.07353183994716275, + "grad_norm": 0.7700600028038025, + "learning_rate": 9.971323068697618e-06, + "loss": 0.7875, + "step": 1336 + }, + { + "epoch": 0.07358687874951841, + "grad_norm": 0.8679799437522888, + "learning_rate": 9.971276691621946e-06, + "loss": 0.8409, + "step": 1337 + }, + { + "epoch": 0.07364191755187408, + "grad_norm": 0.8329173922538757, + "learning_rate": 9.971230277183486e-06, + "loss": 0.8707, + "step": 1338 + }, + { + "epoch": 0.07369695635422974, + "grad_norm": 0.8790140151977539, + "learning_rate": 9.97118382538259e-06, + "loss": 0.7631, + "step": 1339 + }, + { + "epoch": 0.0737519951565854, + "grad_norm": 1.1895341873168945, + "learning_rate": 9.97113733621961e-06, + "loss": 0.8555, + "step": 1340 + }, + { + "epoch": 0.07380703395894106, + "grad_norm": 0.8531593680381775, + "learning_rate": 9.97109080969489e-06, + "loss": 0.7192, + "step": 1341 + }, + { + "epoch": 0.07386207276129672, + "grad_norm": 1.0388946533203125, + "learning_rate": 9.971044245808784e-06, + "loss": 0.8182, + "step": 1342 + }, + { + "epoch": 0.07391711156365237, + "grad_norm": 0.8858556747436523, + "learning_rate": 9.970997644561639e-06, + "loss": 0.7981, + "step": 1343 + }, + { + "epoch": 0.07397215036600803, + "grad_norm": 0.8710204362869263, + "learning_rate": 9.970951005953807e-06, + "loss": 0.7667, + "step": 1344 + }, + { + "epoch": 0.07402718916836369, + "grad_norm": 0.9788708090782166, + "learning_rate": 9.970904329985638e-06, + "loss": 0.9693, + "step": 1345 + }, + { + "epoch": 0.07408222797071935, + "grad_norm": 0.7805914878845215, + "learning_rate": 9.970857616657482e-06, + "loss": 0.6683, + "step": 1346 + }, + { + "epoch": 0.07413726677307501, + "grad_norm": 0.9977933168411255, + "learning_rate": 9.97081086596969e-06, + "loss": 0.8288, + "step": 1347 + }, + { + "epoch": 0.07419230557543068, + "grad_norm": 0.829115629196167, + "learning_rate": 9.970764077922617e-06, + "loss": 0.8361, + "step": 1348 + }, + { + "epoch": 0.07424734437778634, + "grad_norm": 1.226120114326477, + "learning_rate": 9.97071725251661e-06, + "loss": 1.0008, + "step": 1349 + }, + { + "epoch": 0.074302383180142, + "grad_norm": 0.8997750878334045, + "learning_rate": 9.970670389752021e-06, + "loss": 0.8048, + "step": 1350 + }, + { + "epoch": 0.07435742198249766, + "grad_norm": 1.0885238647460938, + "learning_rate": 9.970623489629205e-06, + "loss": 0.9202, + "step": 1351 + }, + { + "epoch": 0.07441246078485332, + "grad_norm": 0.8736100792884827, + "learning_rate": 9.970576552148515e-06, + "loss": 0.8515, + "step": 1352 + }, + { + "epoch": 0.07446749958720898, + "grad_norm": 0.9211294651031494, + "learning_rate": 9.970529577310301e-06, + "loss": 0.9389, + "step": 1353 + }, + { + "epoch": 0.07452253838956464, + "grad_norm": 0.9334765672683716, + "learning_rate": 9.970482565114917e-06, + "loss": 0.8165, + "step": 1354 + }, + { + "epoch": 0.0745775771919203, + "grad_norm": 0.8307162523269653, + "learning_rate": 9.970435515562717e-06, + "loss": 0.7829, + "step": 1355 + }, + { + "epoch": 0.07463261599427597, + "grad_norm": 0.987634003162384, + "learning_rate": 9.970388428654055e-06, + "loss": 0.848, + "step": 1356 + }, + { + "epoch": 0.07468765479663163, + "grad_norm": 1.094752311706543, + "learning_rate": 9.970341304389281e-06, + "loss": 1.003, + "step": 1357 + }, + { + "epoch": 0.07474269359898729, + "grad_norm": 0.9865909814834595, + "learning_rate": 9.970294142768755e-06, + "loss": 0.9116, + "step": 1358 + }, + { + "epoch": 0.07479773240134295, + "grad_norm": 0.8404149413108826, + "learning_rate": 9.970246943792828e-06, + "loss": 0.8699, + "step": 1359 + }, + { + "epoch": 0.07485277120369861, + "grad_norm": 0.9602416753768921, + "learning_rate": 9.970199707461855e-06, + "loss": 0.8166, + "step": 1360 + }, + { + "epoch": 0.07490781000605427, + "grad_norm": 0.9748693704605103, + "learning_rate": 9.970152433776193e-06, + "loss": 0.8767, + "step": 1361 + }, + { + "epoch": 0.07496284880840993, + "grad_norm": 0.8721657991409302, + "learning_rate": 9.970105122736194e-06, + "loss": 0.8825, + "step": 1362 + }, + { + "epoch": 0.0750178876107656, + "grad_norm": 0.8683610558509827, + "learning_rate": 9.970057774342215e-06, + "loss": 0.7873, + "step": 1363 + }, + { + "epoch": 0.07507292641312124, + "grad_norm": 0.856396496295929, + "learning_rate": 9.970010388594613e-06, + "loss": 0.8505, + "step": 1364 + }, + { + "epoch": 0.0751279652154769, + "grad_norm": 1.0709880590438843, + "learning_rate": 9.969962965493744e-06, + "loss": 0.9519, + "step": 1365 + }, + { + "epoch": 0.07518300401783257, + "grad_norm": 0.8839450478553772, + "learning_rate": 9.969915505039963e-06, + "loss": 0.8041, + "step": 1366 + }, + { + "epoch": 0.07523804282018823, + "grad_norm": 0.89545738697052, + "learning_rate": 9.969868007233627e-06, + "loss": 0.8713, + "step": 1367 + }, + { + "epoch": 0.07529308162254389, + "grad_norm": 0.9870849251747131, + "learning_rate": 9.969820472075094e-06, + "loss": 0.8655, + "step": 1368 + }, + { + "epoch": 0.07534812042489955, + "grad_norm": 1.3123797178268433, + "learning_rate": 9.96977289956472e-06, + "loss": 1.0425, + "step": 1369 + }, + { + "epoch": 0.07540315922725521, + "grad_norm": 0.8538400530815125, + "learning_rate": 9.969725289702865e-06, + "loss": 0.7052, + "step": 1370 + }, + { + "epoch": 0.07545819802961087, + "grad_norm": 0.933397114276886, + "learning_rate": 9.969677642489884e-06, + "loss": 0.9819, + "step": 1371 + }, + { + "epoch": 0.07551323683196653, + "grad_norm": 0.8428112268447876, + "learning_rate": 9.969629957926134e-06, + "loss": 0.7313, + "step": 1372 + }, + { + "epoch": 0.0755682756343222, + "grad_norm": 0.9023239612579346, + "learning_rate": 9.96958223601198e-06, + "loss": 0.8297, + "step": 1373 + }, + { + "epoch": 0.07562331443667786, + "grad_norm": 0.8971324563026428, + "learning_rate": 9.969534476747771e-06, + "loss": 0.8832, + "step": 1374 + }, + { + "epoch": 0.07567835323903352, + "grad_norm": 0.8709388375282288, + "learning_rate": 9.969486680133874e-06, + "loss": 0.743, + "step": 1375 + }, + { + "epoch": 0.07573339204138918, + "grad_norm": 0.9094591736793518, + "learning_rate": 9.969438846170644e-06, + "loss": 0.8294, + "step": 1376 + }, + { + "epoch": 0.07578843084374484, + "grad_norm": 1.0753988027572632, + "learning_rate": 9.969390974858444e-06, + "loss": 0.7479, + "step": 1377 + }, + { + "epoch": 0.0758434696461005, + "grad_norm": 0.933775007724762, + "learning_rate": 9.96934306619763e-06, + "loss": 0.8235, + "step": 1378 + }, + { + "epoch": 0.07589850844845616, + "grad_norm": 0.8419735431671143, + "learning_rate": 9.969295120188565e-06, + "loss": 0.8103, + "step": 1379 + }, + { + "epoch": 0.07595354725081183, + "grad_norm": 0.8912790417671204, + "learning_rate": 9.969247136831606e-06, + "loss": 0.911, + "step": 1380 + }, + { + "epoch": 0.07600858605316749, + "grad_norm": 0.8780983090400696, + "learning_rate": 9.969199116127118e-06, + "loss": 0.8619, + "step": 1381 + }, + { + "epoch": 0.07606362485552315, + "grad_norm": 0.8503809571266174, + "learning_rate": 9.969151058075459e-06, + "loss": 0.8093, + "step": 1382 + }, + { + "epoch": 0.07611866365787881, + "grad_norm": 0.8633087277412415, + "learning_rate": 9.96910296267699e-06, + "loss": 0.7524, + "step": 1383 + }, + { + "epoch": 0.07617370246023447, + "grad_norm": 1.1203595399856567, + "learning_rate": 9.969054829932074e-06, + "loss": 0.945, + "step": 1384 + }, + { + "epoch": 0.07622874126259013, + "grad_norm": 0.8766878843307495, + "learning_rate": 9.969006659841072e-06, + "loss": 0.7537, + "step": 1385 + }, + { + "epoch": 0.07628378006494578, + "grad_norm": 0.9795958399772644, + "learning_rate": 9.968958452404345e-06, + "loss": 0.7963, + "step": 1386 + }, + { + "epoch": 0.07633881886730144, + "grad_norm": 0.9117506146430969, + "learning_rate": 9.968910207622257e-06, + "loss": 0.9469, + "step": 1387 + }, + { + "epoch": 0.0763938576696571, + "grad_norm": 0.9731466770172119, + "learning_rate": 9.96886192549517e-06, + "loss": 0.9536, + "step": 1388 + }, + { + "epoch": 0.07644889647201276, + "grad_norm": 0.8923571109771729, + "learning_rate": 9.968813606023446e-06, + "loss": 0.8362, + "step": 1389 + }, + { + "epoch": 0.07650393527436843, + "grad_norm": 0.8819600343704224, + "learning_rate": 9.96876524920745e-06, + "loss": 0.6938, + "step": 1390 + }, + { + "epoch": 0.07655897407672409, + "grad_norm": 0.9629887342453003, + "learning_rate": 9.968716855047545e-06, + "loss": 0.9104, + "step": 1391 + }, + { + "epoch": 0.07661401287907975, + "grad_norm": 0.992770254611969, + "learning_rate": 9.968668423544093e-06, + "loss": 0.944, + "step": 1392 + }, + { + "epoch": 0.07666905168143541, + "grad_norm": 0.8578491806983948, + "learning_rate": 9.96861995469746e-06, + "loss": 0.898, + "step": 1393 + }, + { + "epoch": 0.07672409048379107, + "grad_norm": 1.1169229745864868, + "learning_rate": 9.968571448508008e-06, + "loss": 0.8324, + "step": 1394 + }, + { + "epoch": 0.07677912928614673, + "grad_norm": 0.9600160121917725, + "learning_rate": 9.968522904976106e-06, + "loss": 0.9519, + "step": 1395 + }, + { + "epoch": 0.0768341680885024, + "grad_norm": 0.8271373510360718, + "learning_rate": 9.968474324102112e-06, + "loss": 0.8576, + "step": 1396 + }, + { + "epoch": 0.07688920689085806, + "grad_norm": 0.9437325596809387, + "learning_rate": 9.968425705886397e-06, + "loss": 0.9201, + "step": 1397 + }, + { + "epoch": 0.07694424569321372, + "grad_norm": 0.8679039478302002, + "learning_rate": 9.968377050329325e-06, + "loss": 0.8893, + "step": 1398 + }, + { + "epoch": 0.07699928449556938, + "grad_norm": 1.0178717374801636, + "learning_rate": 9.96832835743126e-06, + "loss": 0.9718, + "step": 1399 + }, + { + "epoch": 0.07705432329792504, + "grad_norm": 0.8354432582855225, + "learning_rate": 9.96827962719257e-06, + "loss": 0.83, + "step": 1400 + }, + { + "epoch": 0.0771093621002807, + "grad_norm": 1.2244631052017212, + "learning_rate": 9.968230859613619e-06, + "loss": 0.907, + "step": 1401 + }, + { + "epoch": 0.07716440090263636, + "grad_norm": 0.9099625945091248, + "learning_rate": 9.968182054694775e-06, + "loss": 0.809, + "step": 1402 + }, + { + "epoch": 0.07721943970499202, + "grad_norm": 0.8591424226760864, + "learning_rate": 9.968133212436404e-06, + "loss": 0.8869, + "step": 1403 + }, + { + "epoch": 0.07727447850734769, + "grad_norm": 1.068003535270691, + "learning_rate": 9.968084332838876e-06, + "loss": 0.8747, + "step": 1404 + }, + { + "epoch": 0.07732951730970335, + "grad_norm": 0.8503691554069519, + "learning_rate": 9.968035415902555e-06, + "loss": 0.7478, + "step": 1405 + }, + { + "epoch": 0.07738455611205901, + "grad_norm": 0.9209537506103516, + "learning_rate": 9.967986461627808e-06, + "loss": 0.9052, + "step": 1406 + }, + { + "epoch": 0.07743959491441466, + "grad_norm": 0.8447962999343872, + "learning_rate": 9.967937470015006e-06, + "loss": 0.7897, + "step": 1407 + }, + { + "epoch": 0.07749463371677032, + "grad_norm": 0.8731846809387207, + "learning_rate": 9.967888441064515e-06, + "loss": 0.837, + "step": 1408 + }, + { + "epoch": 0.07754967251912598, + "grad_norm": 0.9810444712638855, + "learning_rate": 9.967839374776705e-06, + "loss": 0.8236, + "step": 1409 + }, + { + "epoch": 0.07760471132148164, + "grad_norm": 0.8283190131187439, + "learning_rate": 9.967790271151944e-06, + "loss": 0.8443, + "step": 1410 + }, + { + "epoch": 0.0776597501238373, + "grad_norm": 0.7999932765960693, + "learning_rate": 9.9677411301906e-06, + "loss": 0.7945, + "step": 1411 + }, + { + "epoch": 0.07771478892619296, + "grad_norm": 0.9435983300209045, + "learning_rate": 9.967691951893044e-06, + "loss": 0.9745, + "step": 1412 + }, + { + "epoch": 0.07776982772854862, + "grad_norm": 0.8885984420776367, + "learning_rate": 9.967642736259646e-06, + "loss": 0.9163, + "step": 1413 + }, + { + "epoch": 0.07782486653090429, + "grad_norm": 0.993928074836731, + "learning_rate": 9.967593483290776e-06, + "loss": 0.7797, + "step": 1414 + }, + { + "epoch": 0.07787990533325995, + "grad_norm": 1.058830976486206, + "learning_rate": 9.9675441929868e-06, + "loss": 0.8671, + "step": 1415 + }, + { + "epoch": 0.07793494413561561, + "grad_norm": 1.0469766855239868, + "learning_rate": 9.967494865348093e-06, + "loss": 0.8671, + "step": 1416 + }, + { + "epoch": 0.07798998293797127, + "grad_norm": 0.902729868888855, + "learning_rate": 9.967445500375025e-06, + "loss": 0.8748, + "step": 1417 + }, + { + "epoch": 0.07804502174032693, + "grad_norm": 0.90755295753479, + "learning_rate": 9.967396098067965e-06, + "loss": 0.8279, + "step": 1418 + }, + { + "epoch": 0.07810006054268259, + "grad_norm": 0.8822374939918518, + "learning_rate": 9.967346658427287e-06, + "loss": 0.9386, + "step": 1419 + }, + { + "epoch": 0.07815509934503825, + "grad_norm": 0.9201469421386719, + "learning_rate": 9.96729718145336e-06, + "loss": 0.8684, + "step": 1420 + }, + { + "epoch": 0.07821013814739392, + "grad_norm": 0.9451109766960144, + "learning_rate": 9.967247667146558e-06, + "loss": 0.7854, + "step": 1421 + }, + { + "epoch": 0.07826517694974958, + "grad_norm": 0.9146197438240051, + "learning_rate": 9.96719811550725e-06, + "loss": 0.8496, + "step": 1422 + }, + { + "epoch": 0.07832021575210524, + "grad_norm": 0.9771224856376648, + "learning_rate": 9.967148526535813e-06, + "loss": 0.9657, + "step": 1423 + }, + { + "epoch": 0.0783752545544609, + "grad_norm": 0.8437683582305908, + "learning_rate": 9.967098900232616e-06, + "loss": 0.8336, + "step": 1424 + }, + { + "epoch": 0.07843029335681656, + "grad_norm": 0.8232185244560242, + "learning_rate": 9.967049236598034e-06, + "loss": 0.8878, + "step": 1425 + }, + { + "epoch": 0.07848533215917222, + "grad_norm": 1.0200369358062744, + "learning_rate": 9.96699953563244e-06, + "loss": 0.8135, + "step": 1426 + }, + { + "epoch": 0.07854037096152788, + "grad_norm": 0.8779187202453613, + "learning_rate": 9.966949797336208e-06, + "loss": 0.9124, + "step": 1427 + }, + { + "epoch": 0.07859540976388354, + "grad_norm": 0.9557466506958008, + "learning_rate": 9.966900021709708e-06, + "loss": 0.9118, + "step": 1428 + }, + { + "epoch": 0.07865044856623919, + "grad_norm": 0.8431050777435303, + "learning_rate": 9.966850208753317e-06, + "loss": 0.8361, + "step": 1429 + }, + { + "epoch": 0.07870548736859485, + "grad_norm": 0.9269648194313049, + "learning_rate": 9.966800358467412e-06, + "loss": 0.9194, + "step": 1430 + }, + { + "epoch": 0.07876052617095052, + "grad_norm": 0.818681538105011, + "learning_rate": 9.966750470852363e-06, + "loss": 0.7483, + "step": 1431 + }, + { + "epoch": 0.07881556497330618, + "grad_norm": 0.8788284659385681, + "learning_rate": 9.966700545908547e-06, + "loss": 0.858, + "step": 1432 + }, + { + "epoch": 0.07887060377566184, + "grad_norm": 0.7734160423278809, + "learning_rate": 9.966650583636342e-06, + "loss": 0.694, + "step": 1433 + }, + { + "epoch": 0.0789256425780175, + "grad_norm": 0.8846608996391296, + "learning_rate": 9.966600584036117e-06, + "loss": 0.8144, + "step": 1434 + }, + { + "epoch": 0.07898068138037316, + "grad_norm": 0.9740058183670044, + "learning_rate": 9.966550547108254e-06, + "loss": 0.9314, + "step": 1435 + }, + { + "epoch": 0.07903572018272882, + "grad_norm": 0.8731759786605835, + "learning_rate": 9.966500472853124e-06, + "loss": 0.8475, + "step": 1436 + }, + { + "epoch": 0.07909075898508448, + "grad_norm": 0.8984843492507935, + "learning_rate": 9.966450361271109e-06, + "loss": 0.7803, + "step": 1437 + }, + { + "epoch": 0.07914579778744014, + "grad_norm": 0.8897966742515564, + "learning_rate": 9.96640021236258e-06, + "loss": 0.8879, + "step": 1438 + }, + { + "epoch": 0.0792008365897958, + "grad_norm": 0.80704265832901, + "learning_rate": 9.966350026127917e-06, + "loss": 0.7585, + "step": 1439 + }, + { + "epoch": 0.07925587539215147, + "grad_norm": 1.0807467699050903, + "learning_rate": 9.966299802567499e-06, + "loss": 1.078, + "step": 1440 + }, + { + "epoch": 0.07931091419450713, + "grad_norm": 0.7994028925895691, + "learning_rate": 9.966249541681697e-06, + "loss": 0.8074, + "step": 1441 + }, + { + "epoch": 0.07936595299686279, + "grad_norm": 0.877592921257019, + "learning_rate": 9.966199243470895e-06, + "loss": 0.8084, + "step": 1442 + }, + { + "epoch": 0.07942099179921845, + "grad_norm": 0.7704572081565857, + "learning_rate": 9.966148907935469e-06, + "loss": 0.7206, + "step": 1443 + }, + { + "epoch": 0.07947603060157411, + "grad_norm": 0.8222140669822693, + "learning_rate": 9.966098535075797e-06, + "loss": 0.7768, + "step": 1444 + }, + { + "epoch": 0.07953106940392977, + "grad_norm": 1.389320731163025, + "learning_rate": 9.966048124892257e-06, + "loss": 1.0356, + "step": 1445 + }, + { + "epoch": 0.07958610820628544, + "grad_norm": 0.9082457423210144, + "learning_rate": 9.965997677385229e-06, + "loss": 0.7379, + "step": 1446 + }, + { + "epoch": 0.0796411470086411, + "grad_norm": 0.8029153943061829, + "learning_rate": 9.965947192555093e-06, + "loss": 0.7826, + "step": 1447 + }, + { + "epoch": 0.07969618581099676, + "grad_norm": 0.8752758502960205, + "learning_rate": 9.965896670402227e-06, + "loss": 0.8526, + "step": 1448 + }, + { + "epoch": 0.07975122461335242, + "grad_norm": 1.0665404796600342, + "learning_rate": 9.965846110927009e-06, + "loss": 0.858, + "step": 1449 + }, + { + "epoch": 0.07980626341570807, + "grad_norm": 0.9468502402305603, + "learning_rate": 9.96579551412982e-06, + "loss": 0.9658, + "step": 1450 + }, + { + "epoch": 0.07986130221806373, + "grad_norm": 1.0239403247833252, + "learning_rate": 9.965744880011046e-06, + "loss": 0.7995, + "step": 1451 + }, + { + "epoch": 0.07991634102041939, + "grad_norm": 0.9808099865913391, + "learning_rate": 9.965694208571059e-06, + "loss": 1.0173, + "step": 1452 + }, + { + "epoch": 0.07997137982277505, + "grad_norm": 0.9338780641555786, + "learning_rate": 9.965643499810245e-06, + "loss": 0.7917, + "step": 1453 + }, + { + "epoch": 0.08002641862513071, + "grad_norm": 0.9294295310974121, + "learning_rate": 9.965592753728981e-06, + "loss": 0.88, + "step": 1454 + }, + { + "epoch": 0.08008145742748637, + "grad_norm": 1.0261508226394653, + "learning_rate": 9.965541970327654e-06, + "loss": 0.8825, + "step": 1455 + }, + { + "epoch": 0.08013649622984204, + "grad_norm": 0.8964946269989014, + "learning_rate": 9.965491149606642e-06, + "loss": 0.81, + "step": 1456 + }, + { + "epoch": 0.0801915350321977, + "grad_norm": 0.9468267560005188, + "learning_rate": 9.965440291566329e-06, + "loss": 0.9453, + "step": 1457 + }, + { + "epoch": 0.08024657383455336, + "grad_norm": 0.8289040327072144, + "learning_rate": 9.965389396207092e-06, + "loss": 0.7373, + "step": 1458 + }, + { + "epoch": 0.08030161263690902, + "grad_norm": 0.8782384991645813, + "learning_rate": 9.965338463529322e-06, + "loss": 0.9199, + "step": 1459 + }, + { + "epoch": 0.08035665143926468, + "grad_norm": 0.8613787293434143, + "learning_rate": 9.965287493533395e-06, + "loss": 0.8719, + "step": 1460 + }, + { + "epoch": 0.08041169024162034, + "grad_norm": 0.8474903106689453, + "learning_rate": 9.965236486219696e-06, + "loss": 0.8033, + "step": 1461 + }, + { + "epoch": 0.080466729043976, + "grad_norm": 1.1442681550979614, + "learning_rate": 9.965185441588609e-06, + "loss": 0.8996, + "step": 1462 + }, + { + "epoch": 0.08052176784633167, + "grad_norm": 1.564138412475586, + "learning_rate": 9.965134359640518e-06, + "loss": 0.7451, + "step": 1463 + }, + { + "epoch": 0.08057680664868733, + "grad_norm": 0.9211083054542542, + "learning_rate": 9.965083240375806e-06, + "loss": 0.8939, + "step": 1464 + }, + { + "epoch": 0.08063184545104299, + "grad_norm": 0.9503418207168579, + "learning_rate": 9.965032083794856e-06, + "loss": 0.8544, + "step": 1465 + }, + { + "epoch": 0.08068688425339865, + "grad_norm": 0.9304021596908569, + "learning_rate": 9.964980889898055e-06, + "loss": 0.9192, + "step": 1466 + }, + { + "epoch": 0.08074192305575431, + "grad_norm": 0.8430425524711609, + "learning_rate": 9.964929658685787e-06, + "loss": 0.8586, + "step": 1467 + }, + { + "epoch": 0.08079696185810997, + "grad_norm": 0.8671759366989136, + "learning_rate": 9.964878390158437e-06, + "loss": 0.8807, + "step": 1468 + }, + { + "epoch": 0.08085200066046563, + "grad_norm": 0.9548830986022949, + "learning_rate": 9.964827084316389e-06, + "loss": 0.9033, + "step": 1469 + }, + { + "epoch": 0.0809070394628213, + "grad_norm": 0.8736767768859863, + "learning_rate": 9.964775741160029e-06, + "loss": 0.8509, + "step": 1470 + }, + { + "epoch": 0.08096207826517696, + "grad_norm": 0.8827025890350342, + "learning_rate": 9.964724360689745e-06, + "loss": 0.897, + "step": 1471 + }, + { + "epoch": 0.0810171170675326, + "grad_norm": 1.02822744846344, + "learning_rate": 9.964672942905921e-06, + "loss": 1.0371, + "step": 1472 + }, + { + "epoch": 0.08107215586988827, + "grad_norm": 0.8619557619094849, + "learning_rate": 9.964621487808946e-06, + "loss": 0.7654, + "step": 1473 + }, + { + "epoch": 0.08112719467224393, + "grad_norm": 0.7855951189994812, + "learning_rate": 9.9645699953992e-06, + "loss": 0.7767, + "step": 1474 + }, + { + "epoch": 0.08118223347459959, + "grad_norm": 0.8139809370040894, + "learning_rate": 9.96451846567708e-06, + "loss": 0.7535, + "step": 1475 + }, + { + "epoch": 0.08123727227695525, + "grad_norm": 0.8491657376289368, + "learning_rate": 9.964466898642966e-06, + "loss": 0.854, + "step": 1476 + }, + { + "epoch": 0.08129231107931091, + "grad_norm": 0.8968605399131775, + "learning_rate": 9.964415294297247e-06, + "loss": 0.8914, + "step": 1477 + }, + { + "epoch": 0.08134734988166657, + "grad_norm": 0.8692505359649658, + "learning_rate": 9.964363652640313e-06, + "loss": 0.9245, + "step": 1478 + }, + { + "epoch": 0.08140238868402223, + "grad_norm": 0.8916530013084412, + "learning_rate": 9.964311973672549e-06, + "loss": 0.7662, + "step": 1479 + }, + { + "epoch": 0.0814574274863779, + "grad_norm": 0.8239215612411499, + "learning_rate": 9.964260257394347e-06, + "loss": 0.9191, + "step": 1480 + }, + { + "epoch": 0.08151246628873356, + "grad_norm": 0.8672100901603699, + "learning_rate": 9.964208503806092e-06, + "loss": 0.7656, + "step": 1481 + }, + { + "epoch": 0.08156750509108922, + "grad_norm": 0.9195712208747864, + "learning_rate": 9.964156712908177e-06, + "loss": 0.8656, + "step": 1482 + }, + { + "epoch": 0.08162254389344488, + "grad_norm": 0.8282535672187805, + "learning_rate": 9.964104884700986e-06, + "loss": 0.8264, + "step": 1483 + }, + { + "epoch": 0.08167758269580054, + "grad_norm": 0.8492032289505005, + "learning_rate": 9.964053019184913e-06, + "loss": 0.7816, + "step": 1484 + }, + { + "epoch": 0.0817326214981562, + "grad_norm": 0.8491117358207703, + "learning_rate": 9.964001116360347e-06, + "loss": 0.7885, + "step": 1485 + }, + { + "epoch": 0.08178766030051186, + "grad_norm": 0.9415153861045837, + "learning_rate": 9.963949176227677e-06, + "loss": 0.8165, + "step": 1486 + }, + { + "epoch": 0.08184269910286752, + "grad_norm": 0.8462526202201843, + "learning_rate": 9.963897198787294e-06, + "loss": 0.8498, + "step": 1487 + }, + { + "epoch": 0.08189773790522319, + "grad_norm": 0.8591959476470947, + "learning_rate": 9.963845184039586e-06, + "loss": 0.8906, + "step": 1488 + }, + { + "epoch": 0.08195277670757885, + "grad_norm": 0.840761661529541, + "learning_rate": 9.963793131984949e-06, + "loss": 0.7831, + "step": 1489 + }, + { + "epoch": 0.08200781550993451, + "grad_norm": 0.931404173374176, + "learning_rate": 9.96374104262377e-06, + "loss": 0.889, + "step": 1490 + }, + { + "epoch": 0.08206285431229017, + "grad_norm": 0.9048783779144287, + "learning_rate": 9.963688915956443e-06, + "loss": 0.8321, + "step": 1491 + }, + { + "epoch": 0.08211789311464583, + "grad_norm": 0.9145931601524353, + "learning_rate": 9.96363675198336e-06, + "loss": 0.9918, + "step": 1492 + }, + { + "epoch": 0.08217293191700148, + "grad_norm": 0.9256643652915955, + "learning_rate": 9.963584550704908e-06, + "loss": 0.8731, + "step": 1493 + }, + { + "epoch": 0.08222797071935714, + "grad_norm": 1.0212007761001587, + "learning_rate": 9.963532312121486e-06, + "loss": 0.9077, + "step": 1494 + }, + { + "epoch": 0.0822830095217128, + "grad_norm": 0.9206242561340332, + "learning_rate": 9.963480036233483e-06, + "loss": 0.9076, + "step": 1495 + }, + { + "epoch": 0.08233804832406846, + "grad_norm": 0.8846865296363831, + "learning_rate": 9.963427723041294e-06, + "loss": 0.6826, + "step": 1496 + }, + { + "epoch": 0.08239308712642412, + "grad_norm": 0.8745351433753967, + "learning_rate": 9.963375372545309e-06, + "loss": 0.7935, + "step": 1497 + }, + { + "epoch": 0.08244812592877979, + "grad_norm": 0.9019666314125061, + "learning_rate": 9.963322984745924e-06, + "loss": 0.8435, + "step": 1498 + }, + { + "epoch": 0.08250316473113545, + "grad_norm": 0.8586859703063965, + "learning_rate": 9.963270559643531e-06, + "loss": 0.8118, + "step": 1499 + }, + { + "epoch": 0.08255820353349111, + "grad_norm": 0.9192817807197571, + "learning_rate": 9.963218097238528e-06, + "loss": 0.824, + "step": 1500 + }, + { + "epoch": 0.08261324233584677, + "grad_norm": 0.8972243070602417, + "learning_rate": 9.963165597531304e-06, + "loss": 0.8404, + "step": 1501 + }, + { + "epoch": 0.08266828113820243, + "grad_norm": 0.8953961133956909, + "learning_rate": 9.963113060522256e-06, + "loss": 0.9031, + "step": 1502 + }, + { + "epoch": 0.0827233199405581, + "grad_norm": 0.9551270604133606, + "learning_rate": 9.963060486211779e-06, + "loss": 0.9177, + "step": 1503 + }, + { + "epoch": 0.08277835874291375, + "grad_norm": 0.8524616956710815, + "learning_rate": 9.963007874600268e-06, + "loss": 0.8582, + "step": 1504 + }, + { + "epoch": 0.08283339754526942, + "grad_norm": 0.8148764371871948, + "learning_rate": 9.962955225688118e-06, + "loss": 0.6859, + "step": 1505 + }, + { + "epoch": 0.08288843634762508, + "grad_norm": 0.9110590219497681, + "learning_rate": 9.962902539475728e-06, + "loss": 0.7189, + "step": 1506 + }, + { + "epoch": 0.08294347514998074, + "grad_norm": 0.8700116872787476, + "learning_rate": 9.962849815963487e-06, + "loss": 0.9462, + "step": 1507 + }, + { + "epoch": 0.0829985139523364, + "grad_norm": 0.877109706401825, + "learning_rate": 9.962797055151797e-06, + "loss": 0.8138, + "step": 1508 + }, + { + "epoch": 0.08305355275469206, + "grad_norm": 0.7818365097045898, + "learning_rate": 9.962744257041053e-06, + "loss": 0.8474, + "step": 1509 + }, + { + "epoch": 0.08310859155704772, + "grad_norm": 0.88360196352005, + "learning_rate": 9.96269142163165e-06, + "loss": 0.8724, + "step": 1510 + }, + { + "epoch": 0.08316363035940338, + "grad_norm": 0.8982682228088379, + "learning_rate": 9.962638548923988e-06, + "loss": 0.9687, + "step": 1511 + }, + { + "epoch": 0.08321866916175905, + "grad_norm": 0.7362002730369568, + "learning_rate": 9.962585638918462e-06, + "loss": 0.7666, + "step": 1512 + }, + { + "epoch": 0.08327370796411471, + "grad_norm": 1.0993375778198242, + "learning_rate": 9.962532691615472e-06, + "loss": 0.8869, + "step": 1513 + }, + { + "epoch": 0.08332874676647037, + "grad_norm": 0.8684842586517334, + "learning_rate": 9.962479707015415e-06, + "loss": 0.872, + "step": 1514 + }, + { + "epoch": 0.08338378556882602, + "grad_norm": 1.0598478317260742, + "learning_rate": 9.962426685118689e-06, + "loss": 0.9102, + "step": 1515 + }, + { + "epoch": 0.08343882437118168, + "grad_norm": 0.8492125272750854, + "learning_rate": 9.96237362592569e-06, + "loss": 0.7554, + "step": 1516 + }, + { + "epoch": 0.08349386317353734, + "grad_norm": 0.8489052653312683, + "learning_rate": 9.962320529436821e-06, + "loss": 0.9139, + "step": 1517 + }, + { + "epoch": 0.083548901975893, + "grad_norm": 0.8650774359703064, + "learning_rate": 9.962267395652479e-06, + "loss": 0.8717, + "step": 1518 + }, + { + "epoch": 0.08360394077824866, + "grad_norm": 0.8393206596374512, + "learning_rate": 9.962214224573064e-06, + "loss": 0.8256, + "step": 1519 + }, + { + "epoch": 0.08365897958060432, + "grad_norm": 0.8304896354675293, + "learning_rate": 9.962161016198974e-06, + "loss": 0.8232, + "step": 1520 + }, + { + "epoch": 0.08371401838295998, + "grad_norm": 0.8718386292457581, + "learning_rate": 9.962107770530612e-06, + "loss": 0.8206, + "step": 1521 + }, + { + "epoch": 0.08376905718531565, + "grad_norm": 0.9109341502189636, + "learning_rate": 9.962054487568373e-06, + "loss": 0.9576, + "step": 1522 + }, + { + "epoch": 0.08382409598767131, + "grad_norm": 0.9543303847312927, + "learning_rate": 9.962001167312663e-06, + "loss": 0.8816, + "step": 1523 + }, + { + "epoch": 0.08387913479002697, + "grad_norm": 0.9992844462394714, + "learning_rate": 9.961947809763881e-06, + "loss": 0.8682, + "step": 1524 + }, + { + "epoch": 0.08393417359238263, + "grad_norm": 0.8092770576477051, + "learning_rate": 9.961894414922425e-06, + "loss": 0.6352, + "step": 1525 + }, + { + "epoch": 0.08398921239473829, + "grad_norm": 0.9888653755187988, + "learning_rate": 9.961840982788703e-06, + "loss": 0.8721, + "step": 1526 + }, + { + "epoch": 0.08404425119709395, + "grad_norm": 1.0092703104019165, + "learning_rate": 9.961787513363108e-06, + "loss": 0.7776, + "step": 1527 + }, + { + "epoch": 0.08409928999944961, + "grad_norm": 0.8654646277427673, + "learning_rate": 9.961734006646049e-06, + "loss": 0.8835, + "step": 1528 + }, + { + "epoch": 0.08415432880180528, + "grad_norm": 0.7630153298377991, + "learning_rate": 9.961680462637924e-06, + "loss": 0.7501, + "step": 1529 + }, + { + "epoch": 0.08420936760416094, + "grad_norm": 1.1883158683776855, + "learning_rate": 9.961626881339138e-06, + "loss": 0.9476, + "step": 1530 + }, + { + "epoch": 0.0842644064065166, + "grad_norm": 0.8710927963256836, + "learning_rate": 9.96157326275009e-06, + "loss": 0.749, + "step": 1531 + }, + { + "epoch": 0.08431944520887226, + "grad_norm": 0.9500633478164673, + "learning_rate": 9.961519606871188e-06, + "loss": 0.8994, + "step": 1532 + }, + { + "epoch": 0.08437448401122792, + "grad_norm": 0.873257577419281, + "learning_rate": 9.961465913702833e-06, + "loss": 0.816, + "step": 1533 + }, + { + "epoch": 0.08442952281358358, + "grad_norm": 0.8007022142410278, + "learning_rate": 9.961412183245426e-06, + "loss": 0.787, + "step": 1534 + }, + { + "epoch": 0.08448456161593924, + "grad_norm": 0.8998435139656067, + "learning_rate": 9.961358415499374e-06, + "loss": 0.8741, + "step": 1535 + }, + { + "epoch": 0.08453960041829489, + "grad_norm": 0.9152502417564392, + "learning_rate": 9.961304610465081e-06, + "loss": 0.9749, + "step": 1536 + }, + { + "epoch": 0.08459463922065055, + "grad_norm": 0.8961958289146423, + "learning_rate": 9.961250768142949e-06, + "loss": 0.8683, + "step": 1537 + }, + { + "epoch": 0.08464967802300621, + "grad_norm": 0.8683995008468628, + "learning_rate": 9.961196888533387e-06, + "loss": 0.8347, + "step": 1538 + }, + { + "epoch": 0.08470471682536188, + "grad_norm": 0.835221529006958, + "learning_rate": 9.961142971636795e-06, + "loss": 0.8936, + "step": 1539 + }, + { + "epoch": 0.08475975562771754, + "grad_norm": 0.8666725158691406, + "learning_rate": 9.96108901745358e-06, + "loss": 0.7344, + "step": 1540 + }, + { + "epoch": 0.0848147944300732, + "grad_norm": 0.9509082436561584, + "learning_rate": 9.96103502598415e-06, + "loss": 0.8965, + "step": 1541 + }, + { + "epoch": 0.08486983323242886, + "grad_norm": 0.8134233951568604, + "learning_rate": 9.960980997228908e-06, + "loss": 0.797, + "step": 1542 + }, + { + "epoch": 0.08492487203478452, + "grad_norm": 1.0432242155075073, + "learning_rate": 9.96092693118826e-06, + "loss": 0.8754, + "step": 1543 + }, + { + "epoch": 0.08497991083714018, + "grad_norm": 0.9560218453407288, + "learning_rate": 9.960872827862613e-06, + "loss": 0.9238, + "step": 1544 + }, + { + "epoch": 0.08503494963949584, + "grad_norm": 0.8471649885177612, + "learning_rate": 9.960818687252374e-06, + "loss": 0.8622, + "step": 1545 + }, + { + "epoch": 0.0850899884418515, + "grad_norm": 1.2584747076034546, + "learning_rate": 9.960764509357951e-06, + "loss": 0.8007, + "step": 1546 + }, + { + "epoch": 0.08514502724420717, + "grad_norm": 0.8730618953704834, + "learning_rate": 9.960710294179748e-06, + "loss": 0.7412, + "step": 1547 + }, + { + "epoch": 0.08520006604656283, + "grad_norm": 0.8361592292785645, + "learning_rate": 9.960656041718176e-06, + "loss": 0.7018, + "step": 1548 + }, + { + "epoch": 0.08525510484891849, + "grad_norm": 0.8351722359657288, + "learning_rate": 9.96060175197364e-06, + "loss": 0.843, + "step": 1549 + }, + { + "epoch": 0.08531014365127415, + "grad_norm": 0.8665090203285217, + "learning_rate": 9.960547424946549e-06, + "loss": 0.8235, + "step": 1550 + }, + { + "epoch": 0.08536518245362981, + "grad_norm": 0.9254478812217712, + "learning_rate": 9.960493060637313e-06, + "loss": 0.8122, + "step": 1551 + }, + { + "epoch": 0.08542022125598547, + "grad_norm": 0.8712261319160461, + "learning_rate": 9.960438659046337e-06, + "loss": 0.823, + "step": 1552 + }, + { + "epoch": 0.08547526005834113, + "grad_norm": 0.9027207493782043, + "learning_rate": 9.960384220174033e-06, + "loss": 0.7964, + "step": 1553 + }, + { + "epoch": 0.0855302988606968, + "grad_norm": 0.854626476764679, + "learning_rate": 9.960329744020808e-06, + "loss": 0.755, + "step": 1554 + }, + { + "epoch": 0.08558533766305246, + "grad_norm": 0.9398048520088196, + "learning_rate": 9.960275230587073e-06, + "loss": 0.8607, + "step": 1555 + }, + { + "epoch": 0.08564037646540812, + "grad_norm": 1.008002758026123, + "learning_rate": 9.960220679873238e-06, + "loss": 0.9711, + "step": 1556 + }, + { + "epoch": 0.08569541526776378, + "grad_norm": 0.8999453783035278, + "learning_rate": 9.96016609187971e-06, + "loss": 0.8233, + "step": 1557 + }, + { + "epoch": 0.08575045407011943, + "grad_norm": 0.8912106156349182, + "learning_rate": 9.960111466606903e-06, + "loss": 0.8271, + "step": 1558 + }, + { + "epoch": 0.08580549287247509, + "grad_norm": 0.9269998073577881, + "learning_rate": 9.960056804055227e-06, + "loss": 0.7959, + "step": 1559 + }, + { + "epoch": 0.08586053167483075, + "grad_norm": 1.083815336227417, + "learning_rate": 9.96000210422509e-06, + "loss": 0.9436, + "step": 1560 + }, + { + "epoch": 0.08591557047718641, + "grad_norm": 0.8906280398368835, + "learning_rate": 9.959947367116905e-06, + "loss": 0.9317, + "step": 1561 + }, + { + "epoch": 0.08597060927954207, + "grad_norm": 1.211696743965149, + "learning_rate": 9.959892592731084e-06, + "loss": 0.9076, + "step": 1562 + }, + { + "epoch": 0.08602564808189773, + "grad_norm": 0.9050534963607788, + "learning_rate": 9.959837781068038e-06, + "loss": 0.8728, + "step": 1563 + }, + { + "epoch": 0.0860806868842534, + "grad_norm": 0.9384796619415283, + "learning_rate": 9.959782932128178e-06, + "loss": 0.9277, + "step": 1564 + }, + { + "epoch": 0.08613572568660906, + "grad_norm": 0.795844316482544, + "learning_rate": 9.959728045911915e-06, + "loss": 0.7666, + "step": 1565 + }, + { + "epoch": 0.08619076448896472, + "grad_norm": 0.925956666469574, + "learning_rate": 9.959673122419668e-06, + "loss": 0.815, + "step": 1566 + }, + { + "epoch": 0.08624580329132038, + "grad_norm": 0.898047924041748, + "learning_rate": 9.959618161651843e-06, + "loss": 0.8131, + "step": 1567 + }, + { + "epoch": 0.08630084209367604, + "grad_norm": 0.8656220436096191, + "learning_rate": 9.959563163608856e-06, + "loss": 0.9336, + "step": 1568 + }, + { + "epoch": 0.0863558808960317, + "grad_norm": 0.9184645414352417, + "learning_rate": 9.95950812829112e-06, + "loss": 0.9557, + "step": 1569 + }, + { + "epoch": 0.08641091969838736, + "grad_norm": 0.8607667684555054, + "learning_rate": 9.959453055699048e-06, + "loss": 0.8272, + "step": 1570 + }, + { + "epoch": 0.08646595850074303, + "grad_norm": 0.9561272263526917, + "learning_rate": 9.959397945833056e-06, + "loss": 0.8876, + "step": 1571 + }, + { + "epoch": 0.08652099730309869, + "grad_norm": 0.8562412261962891, + "learning_rate": 9.959342798693556e-06, + "loss": 0.8404, + "step": 1572 + }, + { + "epoch": 0.08657603610545435, + "grad_norm": 0.8924610614776611, + "learning_rate": 9.95928761428096e-06, + "loss": 0.8779, + "step": 1573 + }, + { + "epoch": 0.08663107490781001, + "grad_norm": 0.8343208432197571, + "learning_rate": 9.95923239259569e-06, + "loss": 0.8992, + "step": 1574 + }, + { + "epoch": 0.08668611371016567, + "grad_norm": 0.8835015296936035, + "learning_rate": 9.959177133638155e-06, + "loss": 1.0026, + "step": 1575 + }, + { + "epoch": 0.08674115251252133, + "grad_norm": 0.9540221095085144, + "learning_rate": 9.959121837408771e-06, + "loss": 0.8507, + "step": 1576 + }, + { + "epoch": 0.086796191314877, + "grad_norm": 1.087817668914795, + "learning_rate": 9.959066503907957e-06, + "loss": 0.8607, + "step": 1577 + }, + { + "epoch": 0.08685123011723266, + "grad_norm": 0.8072447180747986, + "learning_rate": 9.959011133136124e-06, + "loss": 0.882, + "step": 1578 + }, + { + "epoch": 0.0869062689195883, + "grad_norm": 0.7646876573562622, + "learning_rate": 9.958955725093694e-06, + "loss": 0.7653, + "step": 1579 + }, + { + "epoch": 0.08696130772194396, + "grad_norm": 0.8979537487030029, + "learning_rate": 9.958900279781078e-06, + "loss": 0.9033, + "step": 1580 + }, + { + "epoch": 0.08701634652429963, + "grad_norm": 0.9445611834526062, + "learning_rate": 9.958844797198696e-06, + "loss": 0.9423, + "step": 1581 + }, + { + "epoch": 0.08707138532665529, + "grad_norm": 0.8836671113967896, + "learning_rate": 9.958789277346963e-06, + "loss": 0.839, + "step": 1582 + }, + { + "epoch": 0.08712642412901095, + "grad_norm": 1.0333542823791504, + "learning_rate": 9.958733720226296e-06, + "loss": 0.9211, + "step": 1583 + }, + { + "epoch": 0.08718146293136661, + "grad_norm": 0.8084085583686829, + "learning_rate": 9.958678125837117e-06, + "loss": 0.8387, + "step": 1584 + }, + { + "epoch": 0.08723650173372227, + "grad_norm": 0.7769419550895691, + "learning_rate": 9.958622494179838e-06, + "loss": 0.8307, + "step": 1585 + }, + { + "epoch": 0.08729154053607793, + "grad_norm": 0.8387578129768372, + "learning_rate": 9.95856682525488e-06, + "loss": 0.8001, + "step": 1586 + }, + { + "epoch": 0.0873465793384336, + "grad_norm": 0.8989812731742859, + "learning_rate": 9.95851111906266e-06, + "loss": 0.7752, + "step": 1587 + }, + { + "epoch": 0.08740161814078926, + "grad_norm": 0.8558734655380249, + "learning_rate": 9.958455375603602e-06, + "loss": 0.8149, + "step": 1588 + }, + { + "epoch": 0.08745665694314492, + "grad_norm": 0.8890896439552307, + "learning_rate": 9.958399594878117e-06, + "loss": 0.8232, + "step": 1589 + }, + { + "epoch": 0.08751169574550058, + "grad_norm": 0.875912070274353, + "learning_rate": 9.95834377688663e-06, + "loss": 0.7458, + "step": 1590 + }, + { + "epoch": 0.08756673454785624, + "grad_norm": 0.808355987071991, + "learning_rate": 9.958287921629557e-06, + "loss": 0.8296, + "step": 1591 + }, + { + "epoch": 0.0876217733502119, + "grad_norm": 0.9637090563774109, + "learning_rate": 9.958232029107318e-06, + "loss": 0.8769, + "step": 1592 + }, + { + "epoch": 0.08767681215256756, + "grad_norm": 0.8980715870857239, + "learning_rate": 9.958176099320336e-06, + "loss": 0.7995, + "step": 1593 + }, + { + "epoch": 0.08773185095492322, + "grad_norm": 0.9369860291481018, + "learning_rate": 9.95812013226903e-06, + "loss": 0.8545, + "step": 1594 + }, + { + "epoch": 0.08778688975727889, + "grad_norm": 0.8589349389076233, + "learning_rate": 9.958064127953819e-06, + "loss": 0.8693, + "step": 1595 + }, + { + "epoch": 0.08784192855963455, + "grad_norm": 0.929207444190979, + "learning_rate": 9.958008086375126e-06, + "loss": 0.811, + "step": 1596 + }, + { + "epoch": 0.08789696736199021, + "grad_norm": 1.0825661420822144, + "learning_rate": 9.957952007533371e-06, + "loss": 1.0145, + "step": 1597 + }, + { + "epoch": 0.08795200616434587, + "grad_norm": 0.8818382024765015, + "learning_rate": 9.957895891428978e-06, + "loss": 0.7771, + "step": 1598 + }, + { + "epoch": 0.08800704496670153, + "grad_norm": 0.882780909538269, + "learning_rate": 9.957839738062363e-06, + "loss": 0.8857, + "step": 1599 + }, + { + "epoch": 0.08806208376905718, + "grad_norm": 0.9136924743652344, + "learning_rate": 9.957783547433955e-06, + "loss": 0.8873, + "step": 1600 + }, + { + "epoch": 0.08811712257141284, + "grad_norm": 0.8896858096122742, + "learning_rate": 9.95772731954417e-06, + "loss": 0.8463, + "step": 1601 + }, + { + "epoch": 0.0881721613737685, + "grad_norm": 0.8671631813049316, + "learning_rate": 9.957671054393436e-06, + "loss": 0.8333, + "step": 1602 + }, + { + "epoch": 0.08822720017612416, + "grad_norm": 0.9442896246910095, + "learning_rate": 9.957614751982172e-06, + "loss": 0.9676, + "step": 1603 + }, + { + "epoch": 0.08828223897847982, + "grad_norm": 0.8249240517616272, + "learning_rate": 9.957558412310803e-06, + "loss": 0.7746, + "step": 1604 + }, + { + "epoch": 0.08833727778083549, + "grad_norm": 0.8125253319740295, + "learning_rate": 9.957502035379751e-06, + "loss": 0.7816, + "step": 1605 + }, + { + "epoch": 0.08839231658319115, + "grad_norm": 0.8467233777046204, + "learning_rate": 9.957445621189442e-06, + "loss": 0.7697, + "step": 1606 + }, + { + "epoch": 0.08844735538554681, + "grad_norm": 0.8322175145149231, + "learning_rate": 9.957389169740299e-06, + "loss": 0.7561, + "step": 1607 + }, + { + "epoch": 0.08850239418790247, + "grad_norm": 0.869163453578949, + "learning_rate": 9.957332681032746e-06, + "loss": 0.8984, + "step": 1608 + }, + { + "epoch": 0.08855743299025813, + "grad_norm": 0.8755944967269897, + "learning_rate": 9.957276155067206e-06, + "loss": 0.8016, + "step": 1609 + }, + { + "epoch": 0.08861247179261379, + "grad_norm": 0.8152669668197632, + "learning_rate": 9.957219591844108e-06, + "loss": 0.7763, + "step": 1610 + }, + { + "epoch": 0.08866751059496945, + "grad_norm": 0.979752779006958, + "learning_rate": 9.957162991363871e-06, + "loss": 0.7755, + "step": 1611 + }, + { + "epoch": 0.08872254939732512, + "grad_norm": 1.0481054782867432, + "learning_rate": 9.957106353626926e-06, + "loss": 0.9395, + "step": 1612 + }, + { + "epoch": 0.08877758819968078, + "grad_norm": 0.7773686647415161, + "learning_rate": 9.957049678633697e-06, + "loss": 0.7713, + "step": 1613 + }, + { + "epoch": 0.08883262700203644, + "grad_norm": 0.838979959487915, + "learning_rate": 9.956992966384609e-06, + "loss": 0.7909, + "step": 1614 + }, + { + "epoch": 0.0888876658043921, + "grad_norm": 0.9527049660682678, + "learning_rate": 9.956936216880089e-06, + "loss": 0.7944, + "step": 1615 + }, + { + "epoch": 0.08894270460674776, + "grad_norm": 0.7967305183410645, + "learning_rate": 9.956879430120561e-06, + "loss": 0.7703, + "step": 1616 + }, + { + "epoch": 0.08899774340910342, + "grad_norm": 0.9065802097320557, + "learning_rate": 9.956822606106456e-06, + "loss": 0.8188, + "step": 1617 + }, + { + "epoch": 0.08905278221145908, + "grad_norm": 0.7329322099685669, + "learning_rate": 9.956765744838199e-06, + "loss": 0.8043, + "step": 1618 + }, + { + "epoch": 0.08910782101381474, + "grad_norm": 0.864973247051239, + "learning_rate": 9.95670884631622e-06, + "loss": 0.8334, + "step": 1619 + }, + { + "epoch": 0.0891628598161704, + "grad_norm": 1.073559045791626, + "learning_rate": 9.95665191054094e-06, + "loss": 0.7755, + "step": 1620 + }, + { + "epoch": 0.08921789861852607, + "grad_norm": 0.7347918748855591, + "learning_rate": 9.956594937512794e-06, + "loss": 0.7556, + "step": 1621 + }, + { + "epoch": 0.08927293742088172, + "grad_norm": 0.8756610751152039, + "learning_rate": 9.956537927232205e-06, + "loss": 0.8129, + "step": 1622 + }, + { + "epoch": 0.08932797622323738, + "grad_norm": 0.9132435917854309, + "learning_rate": 9.956480879699605e-06, + "loss": 0.8221, + "step": 1623 + }, + { + "epoch": 0.08938301502559304, + "grad_norm": 1.1978256702423096, + "learning_rate": 9.956423794915421e-06, + "loss": 0.8651, + "step": 1624 + }, + { + "epoch": 0.0894380538279487, + "grad_norm": 0.8493894934654236, + "learning_rate": 9.956366672880082e-06, + "loss": 0.7267, + "step": 1625 + }, + { + "epoch": 0.08949309263030436, + "grad_norm": 1.0971951484680176, + "learning_rate": 9.956309513594019e-06, + "loss": 0.7852, + "step": 1626 + }, + { + "epoch": 0.08954813143266002, + "grad_norm": 0.899974524974823, + "learning_rate": 9.95625231705766e-06, + "loss": 0.8868, + "step": 1627 + }, + { + "epoch": 0.08960317023501568, + "grad_norm": 0.8995566368103027, + "learning_rate": 9.956195083271436e-06, + "loss": 0.87, + "step": 1628 + }, + { + "epoch": 0.08965820903737134, + "grad_norm": 0.8924218416213989, + "learning_rate": 9.956137812235776e-06, + "loss": 0.7885, + "step": 1629 + }, + { + "epoch": 0.089713247839727, + "grad_norm": 0.9232820868492126, + "learning_rate": 9.956080503951108e-06, + "loss": 0.7923, + "step": 1630 + }, + { + "epoch": 0.08976828664208267, + "grad_norm": 0.9298982620239258, + "learning_rate": 9.956023158417869e-06, + "loss": 0.8625, + "step": 1631 + }, + { + "epoch": 0.08982332544443833, + "grad_norm": 0.86515212059021, + "learning_rate": 9.955965775636488e-06, + "loss": 0.7683, + "step": 1632 + }, + { + "epoch": 0.08987836424679399, + "grad_norm": 0.8016952276229858, + "learning_rate": 9.955908355607392e-06, + "loss": 0.8122, + "step": 1633 + }, + { + "epoch": 0.08993340304914965, + "grad_norm": 0.842703640460968, + "learning_rate": 9.955850898331015e-06, + "loss": 0.8487, + "step": 1634 + }, + { + "epoch": 0.08998844185150531, + "grad_norm": 0.8239083886146545, + "learning_rate": 9.95579340380779e-06, + "loss": 0.8701, + "step": 1635 + }, + { + "epoch": 0.09004348065386097, + "grad_norm": 0.8575418591499329, + "learning_rate": 9.955735872038149e-06, + "loss": 0.8263, + "step": 1636 + }, + { + "epoch": 0.09009851945621664, + "grad_norm": 0.8884586095809937, + "learning_rate": 9.955678303022522e-06, + "loss": 0.8112, + "step": 1637 + }, + { + "epoch": 0.0901535582585723, + "grad_norm": 0.9024681448936462, + "learning_rate": 9.955620696761345e-06, + "loss": 0.9174, + "step": 1638 + }, + { + "epoch": 0.09020859706092796, + "grad_norm": 0.8151944875717163, + "learning_rate": 9.955563053255049e-06, + "loss": 0.806, + "step": 1639 + }, + { + "epoch": 0.09026363586328362, + "grad_norm": 0.8292184472084045, + "learning_rate": 9.955505372504069e-06, + "loss": 0.8007, + "step": 1640 + }, + { + "epoch": 0.09031867466563928, + "grad_norm": 0.9445936679840088, + "learning_rate": 9.955447654508835e-06, + "loss": 0.7089, + "step": 1641 + }, + { + "epoch": 0.09037371346799494, + "grad_norm": 0.781579315662384, + "learning_rate": 9.955389899269782e-06, + "loss": 0.8224, + "step": 1642 + }, + { + "epoch": 0.09042875227035059, + "grad_norm": 0.9028880596160889, + "learning_rate": 9.955332106787348e-06, + "loss": 0.7976, + "step": 1643 + }, + { + "epoch": 0.09048379107270625, + "grad_norm": 1.0336887836456299, + "learning_rate": 9.955274277061963e-06, + "loss": 0.9296, + "step": 1644 + }, + { + "epoch": 0.09053882987506191, + "grad_norm": 0.8894197940826416, + "learning_rate": 9.955216410094062e-06, + "loss": 0.815, + "step": 1645 + }, + { + "epoch": 0.09059386867741757, + "grad_norm": 0.8955528140068054, + "learning_rate": 9.955158505884083e-06, + "loss": 0.8707, + "step": 1646 + }, + { + "epoch": 0.09064890747977324, + "grad_norm": 0.8012683987617493, + "learning_rate": 9.955100564432458e-06, + "loss": 0.7467, + "step": 1647 + }, + { + "epoch": 0.0907039462821289, + "grad_norm": 0.917969286441803, + "learning_rate": 9.955042585739623e-06, + "loss": 0.8835, + "step": 1648 + }, + { + "epoch": 0.09075898508448456, + "grad_norm": 0.8066666722297668, + "learning_rate": 9.954984569806014e-06, + "loss": 0.8338, + "step": 1649 + }, + { + "epoch": 0.09081402388684022, + "grad_norm": 1.1324070692062378, + "learning_rate": 9.954926516632069e-06, + "loss": 0.8245, + "step": 1650 + }, + { + "epoch": 0.09086906268919588, + "grad_norm": 0.8196014761924744, + "learning_rate": 9.954868426218222e-06, + "loss": 0.7897, + "step": 1651 + }, + { + "epoch": 0.09092410149155154, + "grad_norm": 0.8713478446006775, + "learning_rate": 9.95481029856491e-06, + "loss": 0.891, + "step": 1652 + }, + { + "epoch": 0.0909791402939072, + "grad_norm": 0.8489059805870056, + "learning_rate": 9.954752133672569e-06, + "loss": 0.7748, + "step": 1653 + }, + { + "epoch": 0.09103417909626287, + "grad_norm": 0.8914602994918823, + "learning_rate": 9.954693931541638e-06, + "loss": 0.8657, + "step": 1654 + }, + { + "epoch": 0.09108921789861853, + "grad_norm": 0.9031614661216736, + "learning_rate": 9.954635692172555e-06, + "loss": 0.7409, + "step": 1655 + }, + { + "epoch": 0.09114425670097419, + "grad_norm": 0.8680000305175781, + "learning_rate": 9.954577415565756e-06, + "loss": 0.8535, + "step": 1656 + }, + { + "epoch": 0.09119929550332985, + "grad_norm": 0.830596923828125, + "learning_rate": 9.954519101721679e-06, + "loss": 0.8601, + "step": 1657 + }, + { + "epoch": 0.09125433430568551, + "grad_norm": 0.9041332602500916, + "learning_rate": 9.954460750640762e-06, + "loss": 0.9104, + "step": 1658 + }, + { + "epoch": 0.09130937310804117, + "grad_norm": 0.7786296606063843, + "learning_rate": 9.954402362323445e-06, + "loss": 0.7671, + "step": 1659 + }, + { + "epoch": 0.09136441191039683, + "grad_norm": 1.0363564491271973, + "learning_rate": 9.954343936770165e-06, + "loss": 0.9339, + "step": 1660 + }, + { + "epoch": 0.0914194507127525, + "grad_norm": 0.8049986958503723, + "learning_rate": 9.954285473981363e-06, + "loss": 0.8125, + "step": 1661 + }, + { + "epoch": 0.09147448951510816, + "grad_norm": 0.7842011451721191, + "learning_rate": 9.954226973957477e-06, + "loss": 0.7153, + "step": 1662 + }, + { + "epoch": 0.09152952831746382, + "grad_norm": 0.8929729461669922, + "learning_rate": 9.954168436698948e-06, + "loss": 0.9563, + "step": 1663 + }, + { + "epoch": 0.09158456711981948, + "grad_norm": 0.8850226402282715, + "learning_rate": 9.954109862206216e-06, + "loss": 0.8257, + "step": 1664 + }, + { + "epoch": 0.09163960592217513, + "grad_norm": 0.8673348426818848, + "learning_rate": 9.954051250479719e-06, + "loss": 0.9489, + "step": 1665 + }, + { + "epoch": 0.09169464472453079, + "grad_norm": 0.8726119995117188, + "learning_rate": 9.9539926015199e-06, + "loss": 0.8222, + "step": 1666 + }, + { + "epoch": 0.09174968352688645, + "grad_norm": 0.7609312534332275, + "learning_rate": 9.953933915327196e-06, + "loss": 0.7749, + "step": 1667 + }, + { + "epoch": 0.09180472232924211, + "grad_norm": 0.857404887676239, + "learning_rate": 9.953875191902055e-06, + "loss": 0.8496, + "step": 1668 + }, + { + "epoch": 0.09185976113159777, + "grad_norm": 0.7835526466369629, + "learning_rate": 9.953816431244909e-06, + "loss": 0.7258, + "step": 1669 + }, + { + "epoch": 0.09191479993395343, + "grad_norm": 0.944984495639801, + "learning_rate": 9.95375763335621e-06, + "loss": 0.902, + "step": 1670 + }, + { + "epoch": 0.0919698387363091, + "grad_norm": 0.9038936495780945, + "learning_rate": 9.953698798236391e-06, + "loss": 0.7559, + "step": 1671 + }, + { + "epoch": 0.09202487753866476, + "grad_norm": 0.8450848460197449, + "learning_rate": 9.953639925885898e-06, + "loss": 0.8338, + "step": 1672 + }, + { + "epoch": 0.09207991634102042, + "grad_norm": 0.827419102191925, + "learning_rate": 9.953581016305175e-06, + "loss": 0.8167, + "step": 1673 + }, + { + "epoch": 0.09213495514337608, + "grad_norm": 0.8517075777053833, + "learning_rate": 9.953522069494663e-06, + "loss": 0.8681, + "step": 1674 + }, + { + "epoch": 0.09218999394573174, + "grad_norm": 0.9504323601722717, + "learning_rate": 9.953463085454804e-06, + "loss": 0.8688, + "step": 1675 + }, + { + "epoch": 0.0922450327480874, + "grad_norm": 0.8905719518661499, + "learning_rate": 9.953404064186044e-06, + "loss": 0.8818, + "step": 1676 + }, + { + "epoch": 0.09230007155044306, + "grad_norm": 0.9223340153694153, + "learning_rate": 9.953345005688822e-06, + "loss": 0.8752, + "step": 1677 + }, + { + "epoch": 0.09235511035279872, + "grad_norm": 1.0500547885894775, + "learning_rate": 9.953285909963588e-06, + "loss": 0.7816, + "step": 1678 + }, + { + "epoch": 0.09241014915515439, + "grad_norm": 0.8407441973686218, + "learning_rate": 9.953226777010781e-06, + "loss": 0.745, + "step": 1679 + }, + { + "epoch": 0.09246518795751005, + "grad_norm": 0.7997288107872009, + "learning_rate": 9.953167606830847e-06, + "loss": 0.8171, + "step": 1680 + }, + { + "epoch": 0.09252022675986571, + "grad_norm": 0.9752318859100342, + "learning_rate": 9.953108399424234e-06, + "loss": 0.8719, + "step": 1681 + }, + { + "epoch": 0.09257526556222137, + "grad_norm": 0.8524298667907715, + "learning_rate": 9.953049154791382e-06, + "loss": 0.8257, + "step": 1682 + }, + { + "epoch": 0.09263030436457703, + "grad_norm": 0.9460529088973999, + "learning_rate": 9.952989872932739e-06, + "loss": 0.7278, + "step": 1683 + }, + { + "epoch": 0.0926853431669327, + "grad_norm": 0.8959575891494751, + "learning_rate": 9.95293055384875e-06, + "loss": 0.903, + "step": 1684 + }, + { + "epoch": 0.09274038196928835, + "grad_norm": 0.8764386177062988, + "learning_rate": 9.95287119753986e-06, + "loss": 0.7958, + "step": 1685 + }, + { + "epoch": 0.092795420771644, + "grad_norm": 0.9611337184906006, + "learning_rate": 9.952811804006517e-06, + "loss": 0.8726, + "step": 1686 + }, + { + "epoch": 0.09285045957399966, + "grad_norm": 0.8155574202537537, + "learning_rate": 9.952752373249165e-06, + "loss": 0.7882, + "step": 1687 + }, + { + "epoch": 0.09290549837635532, + "grad_norm": 0.8789697289466858, + "learning_rate": 9.952692905268253e-06, + "loss": 0.8642, + "step": 1688 + }, + { + "epoch": 0.09296053717871099, + "grad_norm": 0.7910027503967285, + "learning_rate": 9.952633400064227e-06, + "loss": 0.7852, + "step": 1689 + }, + { + "epoch": 0.09301557598106665, + "grad_norm": 0.815819501876831, + "learning_rate": 9.952573857637533e-06, + "loss": 0.8606, + "step": 1690 + }, + { + "epoch": 0.09307061478342231, + "grad_norm": 0.9840701818466187, + "learning_rate": 9.95251427798862e-06, + "loss": 0.9349, + "step": 1691 + }, + { + "epoch": 0.09312565358577797, + "grad_norm": 0.8715788722038269, + "learning_rate": 9.952454661117936e-06, + "loss": 0.813, + "step": 1692 + }, + { + "epoch": 0.09318069238813363, + "grad_norm": 0.8287779092788696, + "learning_rate": 9.952395007025926e-06, + "loss": 0.8346, + "step": 1693 + }, + { + "epoch": 0.0932357311904893, + "grad_norm": 0.9375059008598328, + "learning_rate": 9.952335315713044e-06, + "loss": 0.8868, + "step": 1694 + }, + { + "epoch": 0.09329076999284495, + "grad_norm": 0.9063667058944702, + "learning_rate": 9.952275587179734e-06, + "loss": 0.9562, + "step": 1695 + }, + { + "epoch": 0.09334580879520062, + "grad_norm": 0.816643476486206, + "learning_rate": 9.952215821426447e-06, + "loss": 0.7456, + "step": 1696 + }, + { + "epoch": 0.09340084759755628, + "grad_norm": 0.9004347324371338, + "learning_rate": 9.95215601845363e-06, + "loss": 0.8545, + "step": 1697 + }, + { + "epoch": 0.09345588639991194, + "grad_norm": 0.919195830821991, + "learning_rate": 9.952096178261736e-06, + "loss": 0.9347, + "step": 1698 + }, + { + "epoch": 0.0935109252022676, + "grad_norm": 0.8313261866569519, + "learning_rate": 9.952036300851211e-06, + "loss": 0.9169, + "step": 1699 + }, + { + "epoch": 0.09356596400462326, + "grad_norm": 0.8674910664558411, + "learning_rate": 9.951976386222507e-06, + "loss": 0.7621, + "step": 1700 + }, + { + "epoch": 0.09362100280697892, + "grad_norm": 0.8931052684783936, + "learning_rate": 9.951916434376074e-06, + "loss": 0.8702, + "step": 1701 + }, + { + "epoch": 0.09367604160933458, + "grad_norm": 0.8748393058776855, + "learning_rate": 9.951856445312364e-06, + "loss": 0.7446, + "step": 1702 + }, + { + "epoch": 0.09373108041169025, + "grad_norm": 1.005459189414978, + "learning_rate": 9.951796419031825e-06, + "loss": 0.9843, + "step": 1703 + }, + { + "epoch": 0.09378611921404591, + "grad_norm": 1.0155184268951416, + "learning_rate": 9.95173635553491e-06, + "loss": 0.8868, + "step": 1704 + }, + { + "epoch": 0.09384115801640157, + "grad_norm": 2.1387271881103516, + "learning_rate": 9.951676254822072e-06, + "loss": 0.8691, + "step": 1705 + }, + { + "epoch": 0.09389619681875723, + "grad_norm": 0.9768403768539429, + "learning_rate": 9.951616116893757e-06, + "loss": 0.8409, + "step": 1706 + }, + { + "epoch": 0.09395123562111289, + "grad_norm": 0.7994607090950012, + "learning_rate": 9.951555941750424e-06, + "loss": 0.7836, + "step": 1707 + }, + { + "epoch": 0.09400627442346854, + "grad_norm": 0.8460201025009155, + "learning_rate": 9.95149572939252e-06, + "loss": 0.8216, + "step": 1708 + }, + { + "epoch": 0.0940613132258242, + "grad_norm": 0.8904135227203369, + "learning_rate": 9.951435479820499e-06, + "loss": 0.9053, + "step": 1709 + }, + { + "epoch": 0.09411635202817986, + "grad_norm": 0.9084494113922119, + "learning_rate": 9.951375193034815e-06, + "loss": 0.9308, + "step": 1710 + }, + { + "epoch": 0.09417139083053552, + "grad_norm": 1.0826482772827148, + "learning_rate": 9.951314869035921e-06, + "loss": 0.8468, + "step": 1711 + }, + { + "epoch": 0.09422642963289118, + "grad_norm": 0.8068915009498596, + "learning_rate": 9.95125450782427e-06, + "loss": 0.8253, + "step": 1712 + }, + { + "epoch": 0.09428146843524685, + "grad_norm": 0.8445400595664978, + "learning_rate": 9.951194109400316e-06, + "loss": 0.8386, + "step": 1713 + }, + { + "epoch": 0.09433650723760251, + "grad_norm": 0.8180645704269409, + "learning_rate": 9.951133673764513e-06, + "loss": 0.7907, + "step": 1714 + }, + { + "epoch": 0.09439154603995817, + "grad_norm": 0.8111036419868469, + "learning_rate": 9.951073200917311e-06, + "loss": 0.7918, + "step": 1715 + }, + { + "epoch": 0.09444658484231383, + "grad_norm": 0.862042248249054, + "learning_rate": 9.951012690859172e-06, + "loss": 0.783, + "step": 1716 + }, + { + "epoch": 0.09450162364466949, + "grad_norm": 0.8189615607261658, + "learning_rate": 9.950952143590544e-06, + "loss": 0.8192, + "step": 1717 + }, + { + "epoch": 0.09455666244702515, + "grad_norm": 0.9714062809944153, + "learning_rate": 9.950891559111887e-06, + "loss": 0.774, + "step": 1718 + }, + { + "epoch": 0.09461170124938081, + "grad_norm": 0.9691846370697021, + "learning_rate": 9.950830937423655e-06, + "loss": 0.8347, + "step": 1719 + }, + { + "epoch": 0.09466674005173648, + "grad_norm": 0.8488250970840454, + "learning_rate": 9.950770278526301e-06, + "loss": 0.8228, + "step": 1720 + }, + { + "epoch": 0.09472177885409214, + "grad_norm": 0.8638359904289246, + "learning_rate": 9.950709582420282e-06, + "loss": 0.8973, + "step": 1721 + }, + { + "epoch": 0.0947768176564478, + "grad_norm": 1.0148643255233765, + "learning_rate": 9.950648849106058e-06, + "loss": 0.9638, + "step": 1722 + }, + { + "epoch": 0.09483185645880346, + "grad_norm": 0.8870131969451904, + "learning_rate": 9.95058807858408e-06, + "loss": 0.8259, + "step": 1723 + }, + { + "epoch": 0.09488689526115912, + "grad_norm": 0.9134769439697266, + "learning_rate": 9.950527270854807e-06, + "loss": 0.865, + "step": 1724 + }, + { + "epoch": 0.09494193406351478, + "grad_norm": 0.7221654653549194, + "learning_rate": 9.950466425918697e-06, + "loss": 0.7593, + "step": 1725 + }, + { + "epoch": 0.09499697286587044, + "grad_norm": 0.9386674165725708, + "learning_rate": 9.950405543776207e-06, + "loss": 0.9508, + "step": 1726 + }, + { + "epoch": 0.0950520116682261, + "grad_norm": 0.7850627899169922, + "learning_rate": 9.950344624427795e-06, + "loss": 0.7999, + "step": 1727 + }, + { + "epoch": 0.09510705047058177, + "grad_norm": 0.921198308467865, + "learning_rate": 9.950283667873916e-06, + "loss": 0.8249, + "step": 1728 + }, + { + "epoch": 0.09516208927293741, + "grad_norm": 0.9503389000892639, + "learning_rate": 9.95022267411503e-06, + "loss": 0.901, + "step": 1729 + }, + { + "epoch": 0.09521712807529308, + "grad_norm": 0.7977343201637268, + "learning_rate": 9.950161643151597e-06, + "loss": 0.838, + "step": 1730 + }, + { + "epoch": 0.09527216687764874, + "grad_norm": 0.9056238532066345, + "learning_rate": 9.950100574984072e-06, + "loss": 0.9756, + "step": 1731 + }, + { + "epoch": 0.0953272056800044, + "grad_norm": 0.8092935681343079, + "learning_rate": 9.950039469612918e-06, + "loss": 0.8812, + "step": 1732 + }, + { + "epoch": 0.09538224448236006, + "grad_norm": 0.823693573474884, + "learning_rate": 9.949978327038592e-06, + "loss": 0.7914, + "step": 1733 + }, + { + "epoch": 0.09543728328471572, + "grad_norm": 0.9114876389503479, + "learning_rate": 9.949917147261554e-06, + "loss": 0.7944, + "step": 1734 + }, + { + "epoch": 0.09549232208707138, + "grad_norm": 1.0084123611450195, + "learning_rate": 9.949855930282262e-06, + "loss": 0.8544, + "step": 1735 + }, + { + "epoch": 0.09554736088942704, + "grad_norm": 0.842462956905365, + "learning_rate": 9.949794676101181e-06, + "loss": 0.7056, + "step": 1736 + }, + { + "epoch": 0.0956023996917827, + "grad_norm": 1.00497305393219, + "learning_rate": 9.949733384718766e-06, + "loss": 0.8372, + "step": 1737 + }, + { + "epoch": 0.09565743849413837, + "grad_norm": 1.0166410207748413, + "learning_rate": 9.94967205613548e-06, + "loss": 0.9316, + "step": 1738 + }, + { + "epoch": 0.09571247729649403, + "grad_norm": 0.8520192503929138, + "learning_rate": 9.949610690351784e-06, + "loss": 0.786, + "step": 1739 + }, + { + "epoch": 0.09576751609884969, + "grad_norm": 0.8003227114677429, + "learning_rate": 9.949549287368139e-06, + "loss": 0.8003, + "step": 1740 + }, + { + "epoch": 0.09582255490120535, + "grad_norm": 0.8657151460647583, + "learning_rate": 9.949487847185006e-06, + "loss": 0.8407, + "step": 1741 + }, + { + "epoch": 0.09587759370356101, + "grad_norm": 1.1119858026504517, + "learning_rate": 9.949426369802848e-06, + "loss": 0.8594, + "step": 1742 + }, + { + "epoch": 0.09593263250591667, + "grad_norm": 0.8968474864959717, + "learning_rate": 9.949364855222126e-06, + "loss": 0.8254, + "step": 1743 + }, + { + "epoch": 0.09598767130827233, + "grad_norm": 0.8740531206130981, + "learning_rate": 9.949303303443304e-06, + "loss": 0.8748, + "step": 1744 + }, + { + "epoch": 0.096042710110628, + "grad_norm": 0.8833459615707397, + "learning_rate": 9.94924171446684e-06, + "loss": 0.838, + "step": 1745 + }, + { + "epoch": 0.09609774891298366, + "grad_norm": 0.8783486485481262, + "learning_rate": 9.949180088293201e-06, + "loss": 0.7972, + "step": 1746 + }, + { + "epoch": 0.09615278771533932, + "grad_norm": 0.9197877049446106, + "learning_rate": 9.949118424922852e-06, + "loss": 0.8669, + "step": 1747 + }, + { + "epoch": 0.09620782651769498, + "grad_norm": 0.9771283864974976, + "learning_rate": 9.949056724356251e-06, + "loss": 0.8461, + "step": 1748 + }, + { + "epoch": 0.09626286532005064, + "grad_norm": 0.8325022459030151, + "learning_rate": 9.948994986593864e-06, + "loss": 0.8482, + "step": 1749 + }, + { + "epoch": 0.0963179041224063, + "grad_norm": 0.9732363224029541, + "learning_rate": 9.948933211636158e-06, + "loss": 0.8825, + "step": 1750 + }, + { + "epoch": 0.09637294292476195, + "grad_norm": 0.8229798078536987, + "learning_rate": 9.948871399483592e-06, + "loss": 0.8079, + "step": 1751 + }, + { + "epoch": 0.09642798172711761, + "grad_norm": 0.8861554265022278, + "learning_rate": 9.948809550136635e-06, + "loss": 0.8323, + "step": 1752 + }, + { + "epoch": 0.09648302052947327, + "grad_norm": 1.0618904829025269, + "learning_rate": 9.94874766359575e-06, + "loss": 0.8519, + "step": 1753 + }, + { + "epoch": 0.09653805933182893, + "grad_norm": 0.8494864702224731, + "learning_rate": 9.948685739861403e-06, + "loss": 0.961, + "step": 1754 + }, + { + "epoch": 0.0965930981341846, + "grad_norm": 0.8872213959693909, + "learning_rate": 9.948623778934058e-06, + "loss": 0.9367, + "step": 1755 + }, + { + "epoch": 0.09664813693654026, + "grad_norm": 0.8441230058670044, + "learning_rate": 9.948561780814181e-06, + "loss": 0.7654, + "step": 1756 + }, + { + "epoch": 0.09670317573889592, + "grad_norm": 0.8072223663330078, + "learning_rate": 9.948499745502239e-06, + "loss": 0.7894, + "step": 1757 + }, + { + "epoch": 0.09675821454125158, + "grad_norm": 0.8285261392593384, + "learning_rate": 9.948437672998696e-06, + "loss": 0.8351, + "step": 1758 + }, + { + "epoch": 0.09681325334360724, + "grad_norm": 0.9272124767303467, + "learning_rate": 9.94837556330402e-06, + "loss": 0.8708, + "step": 1759 + }, + { + "epoch": 0.0968682921459629, + "grad_norm": 0.8689375519752502, + "learning_rate": 9.94831341641868e-06, + "loss": 0.8478, + "step": 1760 + }, + { + "epoch": 0.09692333094831856, + "grad_norm": 1.040784239768982, + "learning_rate": 9.94825123234314e-06, + "loss": 0.8915, + "step": 1761 + }, + { + "epoch": 0.09697836975067423, + "grad_norm": 0.7819718718528748, + "learning_rate": 9.948189011077867e-06, + "loss": 0.7728, + "step": 1762 + }, + { + "epoch": 0.09703340855302989, + "grad_norm": 0.7959379553794861, + "learning_rate": 9.948126752623331e-06, + "loss": 0.8248, + "step": 1763 + }, + { + "epoch": 0.09708844735538555, + "grad_norm": 0.8844753503799438, + "learning_rate": 9.94806445698e-06, + "loss": 0.7742, + "step": 1764 + }, + { + "epoch": 0.09714348615774121, + "grad_norm": 0.9168505668640137, + "learning_rate": 9.948002124148339e-06, + "loss": 0.9145, + "step": 1765 + }, + { + "epoch": 0.09719852496009687, + "grad_norm": 0.7199662923812866, + "learning_rate": 9.947939754128819e-06, + "loss": 0.6652, + "step": 1766 + }, + { + "epoch": 0.09725356376245253, + "grad_norm": 0.866470992565155, + "learning_rate": 9.947877346921909e-06, + "loss": 0.8293, + "step": 1767 + }, + { + "epoch": 0.0973086025648082, + "grad_norm": 0.9124754667282104, + "learning_rate": 9.947814902528078e-06, + "loss": 0.8599, + "step": 1768 + }, + { + "epoch": 0.09736364136716386, + "grad_norm": 0.9169870615005493, + "learning_rate": 9.947752420947792e-06, + "loss": 0.8382, + "step": 1769 + }, + { + "epoch": 0.09741868016951952, + "grad_norm": 1.0147640705108643, + "learning_rate": 9.947689902181526e-06, + "loss": 0.8425, + "step": 1770 + }, + { + "epoch": 0.09747371897187518, + "grad_norm": 0.778575599193573, + "learning_rate": 9.947627346229745e-06, + "loss": 0.6979, + "step": 1771 + }, + { + "epoch": 0.09752875777423083, + "grad_norm": 0.815101146697998, + "learning_rate": 9.947564753092922e-06, + "loss": 0.8617, + "step": 1772 + }, + { + "epoch": 0.09758379657658649, + "grad_norm": 0.9556358456611633, + "learning_rate": 9.947502122771527e-06, + "loss": 0.9009, + "step": 1773 + }, + { + "epoch": 0.09763883537894215, + "grad_norm": 0.8603761196136475, + "learning_rate": 9.94743945526603e-06, + "loss": 0.9443, + "step": 1774 + }, + { + "epoch": 0.09769387418129781, + "grad_norm": 0.8621761798858643, + "learning_rate": 9.947376750576903e-06, + "loss": 0.7537, + "step": 1775 + }, + { + "epoch": 0.09774891298365347, + "grad_norm": 0.7399948835372925, + "learning_rate": 9.947314008704616e-06, + "loss": 0.7477, + "step": 1776 + }, + { + "epoch": 0.09780395178600913, + "grad_norm": 0.8855582475662231, + "learning_rate": 9.947251229649641e-06, + "loss": 0.8745, + "step": 1777 + }, + { + "epoch": 0.0978589905883648, + "grad_norm": 0.8718472719192505, + "learning_rate": 9.947188413412452e-06, + "loss": 0.9672, + "step": 1778 + }, + { + "epoch": 0.09791402939072046, + "grad_norm": 0.8598514795303345, + "learning_rate": 9.947125559993517e-06, + "loss": 0.8278, + "step": 1779 + }, + { + "epoch": 0.09796906819307612, + "grad_norm": 1.0373798608779907, + "learning_rate": 9.947062669393312e-06, + "loss": 0.8123, + "step": 1780 + }, + { + "epoch": 0.09802410699543178, + "grad_norm": 1.0198705196380615, + "learning_rate": 9.946999741612306e-06, + "loss": 0.9039, + "step": 1781 + }, + { + "epoch": 0.09807914579778744, + "grad_norm": 0.8770025968551636, + "learning_rate": 9.946936776650977e-06, + "loss": 0.8326, + "step": 1782 + }, + { + "epoch": 0.0981341846001431, + "grad_norm": 0.7970215678215027, + "learning_rate": 9.946873774509794e-06, + "loss": 0.848, + "step": 1783 + }, + { + "epoch": 0.09818922340249876, + "grad_norm": 0.90342777967453, + "learning_rate": 9.946810735189231e-06, + "loss": 0.7993, + "step": 1784 + }, + { + "epoch": 0.09824426220485442, + "grad_norm": 1.2095681428909302, + "learning_rate": 9.946747658689763e-06, + "loss": 0.8544, + "step": 1785 + }, + { + "epoch": 0.09829930100721009, + "grad_norm": 0.8500953316688538, + "learning_rate": 9.946684545011866e-06, + "loss": 0.8398, + "step": 1786 + }, + { + "epoch": 0.09835433980956575, + "grad_norm": 0.8570724725723267, + "learning_rate": 9.946621394156011e-06, + "loss": 0.9255, + "step": 1787 + }, + { + "epoch": 0.09840937861192141, + "grad_norm": 0.8314846158027649, + "learning_rate": 9.946558206122672e-06, + "loss": 0.8398, + "step": 1788 + }, + { + "epoch": 0.09846441741427707, + "grad_norm": 0.8894716501235962, + "learning_rate": 9.946494980912326e-06, + "loss": 0.8612, + "step": 1789 + }, + { + "epoch": 0.09851945621663273, + "grad_norm": 0.9555756449699402, + "learning_rate": 9.94643171852545e-06, + "loss": 0.9551, + "step": 1790 + }, + { + "epoch": 0.09857449501898839, + "grad_norm": 0.9556692838668823, + "learning_rate": 9.946368418962515e-06, + "loss": 0.8175, + "step": 1791 + }, + { + "epoch": 0.09862953382134405, + "grad_norm": 0.7288535833358765, + "learning_rate": 9.946305082224e-06, + "loss": 0.6162, + "step": 1792 + }, + { + "epoch": 0.09868457262369972, + "grad_norm": 0.95478355884552, + "learning_rate": 9.94624170831038e-06, + "loss": 0.9089, + "step": 1793 + }, + { + "epoch": 0.09873961142605536, + "grad_norm": 0.9080137610435486, + "learning_rate": 9.946178297222133e-06, + "loss": 0.9443, + "step": 1794 + }, + { + "epoch": 0.09879465022841102, + "grad_norm": 0.8060124516487122, + "learning_rate": 9.946114848959732e-06, + "loss": 0.7412, + "step": 1795 + }, + { + "epoch": 0.09884968903076669, + "grad_norm": 0.8487932085990906, + "learning_rate": 9.946051363523655e-06, + "loss": 0.7098, + "step": 1796 + }, + { + "epoch": 0.09890472783312235, + "grad_norm": 0.8982037901878357, + "learning_rate": 9.945987840914381e-06, + "loss": 0.8304, + "step": 1797 + }, + { + "epoch": 0.09895976663547801, + "grad_norm": 0.8124602437019348, + "learning_rate": 9.945924281132386e-06, + "loss": 0.8441, + "step": 1798 + }, + { + "epoch": 0.09901480543783367, + "grad_norm": 0.8081663250923157, + "learning_rate": 9.945860684178147e-06, + "loss": 0.732, + "step": 1799 + }, + { + "epoch": 0.09906984424018933, + "grad_norm": 0.7662907242774963, + "learning_rate": 9.945797050052147e-06, + "loss": 0.7538, + "step": 1800 + }, + { + "epoch": 0.09912488304254499, + "grad_norm": 0.8418399095535278, + "learning_rate": 9.945733378754856e-06, + "loss": 0.8488, + "step": 1801 + }, + { + "epoch": 0.09917992184490065, + "grad_norm": 0.7298988699913025, + "learning_rate": 9.94566967028676e-06, + "loss": 0.7822, + "step": 1802 + }, + { + "epoch": 0.09923496064725632, + "grad_norm": 0.7788695693016052, + "learning_rate": 9.945605924648332e-06, + "loss": 0.8037, + "step": 1803 + }, + { + "epoch": 0.09928999944961198, + "grad_norm": 0.939297080039978, + "learning_rate": 9.945542141840054e-06, + "loss": 0.8654, + "step": 1804 + }, + { + "epoch": 0.09934503825196764, + "grad_norm": 0.9274358749389648, + "learning_rate": 9.945478321862406e-06, + "loss": 0.7712, + "step": 1805 + }, + { + "epoch": 0.0994000770543233, + "grad_norm": 0.816561222076416, + "learning_rate": 9.945414464715866e-06, + "loss": 0.7676, + "step": 1806 + }, + { + "epoch": 0.09945511585667896, + "grad_norm": 0.867915153503418, + "learning_rate": 9.945350570400916e-06, + "loss": 0.8343, + "step": 1807 + }, + { + "epoch": 0.09951015465903462, + "grad_norm": 0.8446162939071655, + "learning_rate": 9.945286638918034e-06, + "loss": 0.8128, + "step": 1808 + }, + { + "epoch": 0.09956519346139028, + "grad_norm": 0.8372986316680908, + "learning_rate": 9.945222670267703e-06, + "loss": 0.8611, + "step": 1809 + }, + { + "epoch": 0.09962023226374594, + "grad_norm": 0.787836492061615, + "learning_rate": 9.945158664450399e-06, + "loss": 0.7286, + "step": 1810 + }, + { + "epoch": 0.0996752710661016, + "grad_norm": 0.9293436408042908, + "learning_rate": 9.945094621466609e-06, + "loss": 0.8699, + "step": 1811 + }, + { + "epoch": 0.09973030986845727, + "grad_norm": 0.8336932063102722, + "learning_rate": 9.94503054131681e-06, + "loss": 0.8222, + "step": 1812 + }, + { + "epoch": 0.09978534867081293, + "grad_norm": 0.8310953378677368, + "learning_rate": 9.944966424001486e-06, + "loss": 0.8131, + "step": 1813 + }, + { + "epoch": 0.09984038747316859, + "grad_norm": 0.7703443169593811, + "learning_rate": 9.944902269521117e-06, + "loss": 0.8135, + "step": 1814 + }, + { + "epoch": 0.09989542627552424, + "grad_norm": 0.750990092754364, + "learning_rate": 9.944838077876186e-06, + "loss": 0.8137, + "step": 1815 + }, + { + "epoch": 0.0999504650778799, + "grad_norm": 0.8502481579780579, + "learning_rate": 9.944773849067178e-06, + "loss": 0.8973, + "step": 1816 + }, + { + "epoch": 0.10000550388023556, + "grad_norm": 0.8299791812896729, + "learning_rate": 9.94470958309457e-06, + "loss": 0.8341, + "step": 1817 + }, + { + "epoch": 0.10006054268259122, + "grad_norm": 0.8519022464752197, + "learning_rate": 9.94464527995885e-06, + "loss": 0.8529, + "step": 1818 + }, + { + "epoch": 0.10011558148494688, + "grad_norm": 0.9318063259124756, + "learning_rate": 9.944580939660501e-06, + "loss": 0.8978, + "step": 1819 + }, + { + "epoch": 0.10017062028730254, + "grad_norm": 0.847023069858551, + "learning_rate": 9.944516562200004e-06, + "loss": 0.8007, + "step": 1820 + }, + { + "epoch": 0.1002256590896582, + "grad_norm": 0.8817011117935181, + "learning_rate": 9.944452147577844e-06, + "loss": 0.8819, + "step": 1821 + }, + { + "epoch": 0.10028069789201387, + "grad_norm": 0.8560144901275635, + "learning_rate": 9.944387695794505e-06, + "loss": 0.8219, + "step": 1822 + }, + { + "epoch": 0.10033573669436953, + "grad_norm": 0.9358342885971069, + "learning_rate": 9.944323206850472e-06, + "loss": 0.8533, + "step": 1823 + }, + { + "epoch": 0.10039077549672519, + "grad_norm": 0.8327087163925171, + "learning_rate": 9.94425868074623e-06, + "loss": 0.8359, + "step": 1824 + }, + { + "epoch": 0.10044581429908085, + "grad_norm": 1.0590367317199707, + "learning_rate": 9.944194117482263e-06, + "loss": 0.9659, + "step": 1825 + }, + { + "epoch": 0.10050085310143651, + "grad_norm": 0.8739829063415527, + "learning_rate": 9.944129517059055e-06, + "loss": 0.7868, + "step": 1826 + }, + { + "epoch": 0.10055589190379217, + "grad_norm": 0.8465235233306885, + "learning_rate": 9.944064879477093e-06, + "loss": 0.8554, + "step": 1827 + }, + { + "epoch": 0.10061093070614784, + "grad_norm": 0.9068321585655212, + "learning_rate": 9.944000204736864e-06, + "loss": 0.8648, + "step": 1828 + }, + { + "epoch": 0.1006659695085035, + "grad_norm": 0.8308066725730896, + "learning_rate": 9.943935492838853e-06, + "loss": 0.8471, + "step": 1829 + }, + { + "epoch": 0.10072100831085916, + "grad_norm": 0.9973901510238647, + "learning_rate": 9.943870743783545e-06, + "loss": 0.9398, + "step": 1830 + }, + { + "epoch": 0.10077604711321482, + "grad_norm": 0.8532593250274658, + "learning_rate": 9.94380595757143e-06, + "loss": 0.9001, + "step": 1831 + }, + { + "epoch": 0.10083108591557048, + "grad_norm": 0.8571139574050903, + "learning_rate": 9.94374113420299e-06, + "loss": 0.85, + "step": 1832 + }, + { + "epoch": 0.10088612471792614, + "grad_norm": 0.905624508857727, + "learning_rate": 9.943676273678717e-06, + "loss": 0.9587, + "step": 1833 + }, + { + "epoch": 0.1009411635202818, + "grad_norm": 1.0224663019180298, + "learning_rate": 9.943611375999097e-06, + "loss": 0.8236, + "step": 1834 + }, + { + "epoch": 0.10099620232263747, + "grad_norm": 0.8900588154792786, + "learning_rate": 9.943546441164615e-06, + "loss": 0.877, + "step": 1835 + }, + { + "epoch": 0.10105124112499313, + "grad_norm": 0.8852938413619995, + "learning_rate": 9.943481469175765e-06, + "loss": 0.9521, + "step": 1836 + }, + { + "epoch": 0.10110627992734877, + "grad_norm": 0.9249371290206909, + "learning_rate": 9.943416460033027e-06, + "loss": 0.8541, + "step": 1837 + }, + { + "epoch": 0.10116131872970444, + "grad_norm": 0.8533583283424377, + "learning_rate": 9.943351413736897e-06, + "loss": 0.8571, + "step": 1838 + }, + { + "epoch": 0.1012163575320601, + "grad_norm": 0.743800699710846, + "learning_rate": 9.94328633028786e-06, + "loss": 0.749, + "step": 1839 + }, + { + "epoch": 0.10127139633441576, + "grad_norm": 0.7836641669273376, + "learning_rate": 9.943221209686407e-06, + "loss": 0.8237, + "step": 1840 + }, + { + "epoch": 0.10132643513677142, + "grad_norm": 0.800782322883606, + "learning_rate": 9.943156051933024e-06, + "loss": 0.8323, + "step": 1841 + }, + { + "epoch": 0.10138147393912708, + "grad_norm": 0.7531478404998779, + "learning_rate": 9.943090857028206e-06, + "loss": 0.8041, + "step": 1842 + }, + { + "epoch": 0.10143651274148274, + "grad_norm": 0.9837996959686279, + "learning_rate": 9.94302562497244e-06, + "loss": 0.8084, + "step": 1843 + }, + { + "epoch": 0.1014915515438384, + "grad_norm": 0.8038331866264343, + "learning_rate": 9.942960355766216e-06, + "loss": 0.8454, + "step": 1844 + }, + { + "epoch": 0.10154659034619407, + "grad_norm": 0.7822145819664001, + "learning_rate": 9.942895049410024e-06, + "loss": 0.8137, + "step": 1845 + }, + { + "epoch": 0.10160162914854973, + "grad_norm": 0.8222663998603821, + "learning_rate": 9.942829705904358e-06, + "loss": 0.8981, + "step": 1846 + }, + { + "epoch": 0.10165666795090539, + "grad_norm": 1.0095717906951904, + "learning_rate": 9.942764325249707e-06, + "loss": 0.9159, + "step": 1847 + }, + { + "epoch": 0.10171170675326105, + "grad_norm": 0.8264054656028748, + "learning_rate": 9.942698907446561e-06, + "loss": 0.9233, + "step": 1848 + }, + { + "epoch": 0.10176674555561671, + "grad_norm": 0.8244288563728333, + "learning_rate": 9.942633452495414e-06, + "loss": 0.8507, + "step": 1849 + }, + { + "epoch": 0.10182178435797237, + "grad_norm": 0.8457715511322021, + "learning_rate": 9.942567960396755e-06, + "loss": 0.7897, + "step": 1850 + }, + { + "epoch": 0.10187682316032803, + "grad_norm": 0.8356698155403137, + "learning_rate": 9.94250243115108e-06, + "loss": 0.7927, + "step": 1851 + }, + { + "epoch": 0.1019318619626837, + "grad_norm": 0.8251230716705322, + "learning_rate": 9.94243686475888e-06, + "loss": 0.8977, + "step": 1852 + }, + { + "epoch": 0.10198690076503936, + "grad_norm": 0.8370125889778137, + "learning_rate": 9.942371261220647e-06, + "loss": 0.8204, + "step": 1853 + }, + { + "epoch": 0.10204193956739502, + "grad_norm": 1.6722066402435303, + "learning_rate": 9.942305620536876e-06, + "loss": 0.9284, + "step": 1854 + }, + { + "epoch": 0.10209697836975068, + "grad_norm": 0.8424906730651855, + "learning_rate": 9.942239942708057e-06, + "loss": 0.833, + "step": 1855 + }, + { + "epoch": 0.10215201717210634, + "grad_norm": 0.7475115656852722, + "learning_rate": 9.942174227734686e-06, + "loss": 0.6158, + "step": 1856 + }, + { + "epoch": 0.102207055974462, + "grad_norm": 0.8652095198631287, + "learning_rate": 9.942108475617256e-06, + "loss": 0.8781, + "step": 1857 + }, + { + "epoch": 0.10226209477681765, + "grad_norm": 1.0621691942214966, + "learning_rate": 9.942042686356263e-06, + "loss": 1.0276, + "step": 1858 + }, + { + "epoch": 0.10231713357917331, + "grad_norm": 1.113357424736023, + "learning_rate": 9.941976859952199e-06, + "loss": 0.8799, + "step": 1859 + }, + { + "epoch": 0.10237217238152897, + "grad_norm": 0.9153568148612976, + "learning_rate": 9.94191099640556e-06, + "loss": 0.7988, + "step": 1860 + }, + { + "epoch": 0.10242721118388463, + "grad_norm": 0.9217341542243958, + "learning_rate": 9.941845095716842e-06, + "loss": 0.7785, + "step": 1861 + }, + { + "epoch": 0.1024822499862403, + "grad_norm": 0.8702190518379211, + "learning_rate": 9.941779157886538e-06, + "loss": 0.7648, + "step": 1862 + }, + { + "epoch": 0.10253728878859596, + "grad_norm": 0.8609822988510132, + "learning_rate": 9.941713182915144e-06, + "loss": 0.9095, + "step": 1863 + }, + { + "epoch": 0.10259232759095162, + "grad_norm": 0.7766719460487366, + "learning_rate": 9.941647170803157e-06, + "loss": 0.6984, + "step": 1864 + }, + { + "epoch": 0.10264736639330728, + "grad_norm": 0.8497375249862671, + "learning_rate": 9.941581121551074e-06, + "loss": 0.9161, + "step": 1865 + }, + { + "epoch": 0.10270240519566294, + "grad_norm": 0.8007600903511047, + "learning_rate": 9.941515035159388e-06, + "loss": 0.8099, + "step": 1866 + }, + { + "epoch": 0.1027574439980186, + "grad_norm": 0.7932959794998169, + "learning_rate": 9.941448911628599e-06, + "loss": 0.8049, + "step": 1867 + }, + { + "epoch": 0.10281248280037426, + "grad_norm": 1.3169244527816772, + "learning_rate": 9.941382750959203e-06, + "loss": 0.8601, + "step": 1868 + }, + { + "epoch": 0.10286752160272992, + "grad_norm": 0.8011140823364258, + "learning_rate": 9.941316553151696e-06, + "loss": 0.8397, + "step": 1869 + }, + { + "epoch": 0.10292256040508559, + "grad_norm": 0.811210572719574, + "learning_rate": 9.941250318206577e-06, + "loss": 0.7863, + "step": 1870 + }, + { + "epoch": 0.10297759920744125, + "grad_norm": 0.8172751665115356, + "learning_rate": 9.941184046124342e-06, + "loss": 0.8114, + "step": 1871 + }, + { + "epoch": 0.10303263800979691, + "grad_norm": 0.8072887063026428, + "learning_rate": 9.941117736905493e-06, + "loss": 0.8928, + "step": 1872 + }, + { + "epoch": 0.10308767681215257, + "grad_norm": 0.9111380577087402, + "learning_rate": 9.941051390550524e-06, + "loss": 0.866, + "step": 1873 + }, + { + "epoch": 0.10314271561450823, + "grad_norm": 0.8158383369445801, + "learning_rate": 9.940985007059936e-06, + "loss": 0.7805, + "step": 1874 + }, + { + "epoch": 0.1031977544168639, + "grad_norm": 0.8858961462974548, + "learning_rate": 9.940918586434226e-06, + "loss": 0.8424, + "step": 1875 + }, + { + "epoch": 0.10325279321921955, + "grad_norm": 0.8835182189941406, + "learning_rate": 9.940852128673895e-06, + "loss": 0.7816, + "step": 1876 + }, + { + "epoch": 0.10330783202157522, + "grad_norm": 1.044227123260498, + "learning_rate": 9.940785633779444e-06, + "loss": 0.8952, + "step": 1877 + }, + { + "epoch": 0.10336287082393088, + "grad_norm": 0.8255050778388977, + "learning_rate": 9.940719101751367e-06, + "loss": 0.8215, + "step": 1878 + }, + { + "epoch": 0.10341790962628654, + "grad_norm": 0.8561689257621765, + "learning_rate": 9.940652532590172e-06, + "loss": 0.9686, + "step": 1879 + }, + { + "epoch": 0.10347294842864219, + "grad_norm": 0.8798959255218506, + "learning_rate": 9.94058592629635e-06, + "loss": 0.8993, + "step": 1880 + }, + { + "epoch": 0.10352798723099785, + "grad_norm": 0.9292098879814148, + "learning_rate": 9.940519282870411e-06, + "loss": 0.8536, + "step": 1881 + }, + { + "epoch": 0.10358302603335351, + "grad_norm": 0.8865400552749634, + "learning_rate": 9.940452602312851e-06, + "loss": 0.8024, + "step": 1882 + }, + { + "epoch": 0.10363806483570917, + "grad_norm": 0.8985510468482971, + "learning_rate": 9.94038588462417e-06, + "loss": 0.7748, + "step": 1883 + }, + { + "epoch": 0.10369310363806483, + "grad_norm": 0.9973617196083069, + "learning_rate": 9.940319129804872e-06, + "loss": 0.875, + "step": 1884 + }, + { + "epoch": 0.1037481424404205, + "grad_norm": 0.8615350723266602, + "learning_rate": 9.940252337855458e-06, + "loss": 0.904, + "step": 1885 + }, + { + "epoch": 0.10380318124277615, + "grad_norm": 0.8752412796020508, + "learning_rate": 9.940185508776429e-06, + "loss": 0.8735, + "step": 1886 + }, + { + "epoch": 0.10385822004513182, + "grad_norm": 0.8639446496963501, + "learning_rate": 9.94011864256829e-06, + "loss": 0.7952, + "step": 1887 + }, + { + "epoch": 0.10391325884748748, + "grad_norm": 0.7932116389274597, + "learning_rate": 9.94005173923154e-06, + "loss": 0.8721, + "step": 1888 + }, + { + "epoch": 0.10396829764984314, + "grad_norm": 0.8573791980743408, + "learning_rate": 9.939984798766685e-06, + "loss": 0.9271, + "step": 1889 + }, + { + "epoch": 0.1040233364521988, + "grad_norm": 0.9080122113227844, + "learning_rate": 9.939917821174225e-06, + "loss": 0.8991, + "step": 1890 + }, + { + "epoch": 0.10407837525455446, + "grad_norm": 0.7883808612823486, + "learning_rate": 9.939850806454664e-06, + "loss": 0.6895, + "step": 1891 + }, + { + "epoch": 0.10413341405691012, + "grad_norm": 0.8067768216133118, + "learning_rate": 9.93978375460851e-06, + "loss": 0.835, + "step": 1892 + }, + { + "epoch": 0.10418845285926578, + "grad_norm": 0.8756459951400757, + "learning_rate": 9.939716665636262e-06, + "loss": 0.8144, + "step": 1893 + }, + { + "epoch": 0.10424349166162145, + "grad_norm": 0.8056700825691223, + "learning_rate": 9.939649539538425e-06, + "loss": 0.7454, + "step": 1894 + }, + { + "epoch": 0.10429853046397711, + "grad_norm": 1.0756300687789917, + "learning_rate": 9.939582376315505e-06, + "loss": 0.8096, + "step": 1895 + }, + { + "epoch": 0.10435356926633277, + "grad_norm": 0.8938102126121521, + "learning_rate": 9.939515175968006e-06, + "loss": 0.7496, + "step": 1896 + }, + { + "epoch": 0.10440860806868843, + "grad_norm": 0.9371656775474548, + "learning_rate": 9.939447938496434e-06, + "loss": 0.9817, + "step": 1897 + }, + { + "epoch": 0.10446364687104409, + "grad_norm": 1.0216082334518433, + "learning_rate": 9.939380663901292e-06, + "loss": 0.8804, + "step": 1898 + }, + { + "epoch": 0.10451868567339975, + "grad_norm": 0.8791126012802124, + "learning_rate": 9.939313352183088e-06, + "loss": 0.7811, + "step": 1899 + }, + { + "epoch": 0.10457372447575541, + "grad_norm": 0.9925445914268494, + "learning_rate": 9.939246003342326e-06, + "loss": 0.8892, + "step": 1900 + }, + { + "epoch": 0.10462876327811106, + "grad_norm": 1.0459916591644287, + "learning_rate": 9.939178617379514e-06, + "loss": 0.7938, + "step": 1901 + }, + { + "epoch": 0.10468380208046672, + "grad_norm": 0.9103816747665405, + "learning_rate": 9.93911119429516e-06, + "loss": 0.8282, + "step": 1902 + }, + { + "epoch": 0.10473884088282238, + "grad_norm": 0.9602296352386475, + "learning_rate": 9.939043734089764e-06, + "loss": 0.919, + "step": 1903 + }, + { + "epoch": 0.10479387968517805, + "grad_norm": 0.9529246687889099, + "learning_rate": 9.93897623676384e-06, + "loss": 0.9469, + "step": 1904 + }, + { + "epoch": 0.10484891848753371, + "grad_norm": 0.9619705080986023, + "learning_rate": 9.938908702317893e-06, + "loss": 0.9371, + "step": 1905 + }, + { + "epoch": 0.10490395728988937, + "grad_norm": 1.0106935501098633, + "learning_rate": 9.938841130752428e-06, + "loss": 0.7502, + "step": 1906 + }, + { + "epoch": 0.10495899609224503, + "grad_norm": 0.913985013961792, + "learning_rate": 9.938773522067957e-06, + "loss": 0.8172, + "step": 1907 + }, + { + "epoch": 0.10501403489460069, + "grad_norm": 0.9474983215332031, + "learning_rate": 9.938705876264985e-06, + "loss": 0.8999, + "step": 1908 + }, + { + "epoch": 0.10506907369695635, + "grad_norm": 0.9185097813606262, + "learning_rate": 9.938638193344024e-06, + "loss": 0.8976, + "step": 1909 + }, + { + "epoch": 0.10512411249931201, + "grad_norm": 0.7633675932884216, + "learning_rate": 9.938570473305578e-06, + "loss": 0.7777, + "step": 1910 + }, + { + "epoch": 0.10517915130166768, + "grad_norm": 0.9547691345214844, + "learning_rate": 9.938502716150159e-06, + "loss": 0.8154, + "step": 1911 + }, + { + "epoch": 0.10523419010402334, + "grad_norm": 0.8556191921234131, + "learning_rate": 9.938434921878275e-06, + "loss": 0.828, + "step": 1912 + }, + { + "epoch": 0.105289228906379, + "grad_norm": 0.9826140999794006, + "learning_rate": 9.938367090490437e-06, + "loss": 0.8085, + "step": 1913 + }, + { + "epoch": 0.10534426770873466, + "grad_norm": 0.8610432744026184, + "learning_rate": 9.938299221987154e-06, + "loss": 0.9103, + "step": 1914 + }, + { + "epoch": 0.10539930651109032, + "grad_norm": 0.8383543491363525, + "learning_rate": 9.938231316368934e-06, + "loss": 0.8182, + "step": 1915 + }, + { + "epoch": 0.10545434531344598, + "grad_norm": 0.8552964925765991, + "learning_rate": 9.93816337363629e-06, + "loss": 0.8024, + "step": 1916 + }, + { + "epoch": 0.10550938411580164, + "grad_norm": 0.9255730509757996, + "learning_rate": 9.938095393789732e-06, + "loss": 0.8566, + "step": 1917 + }, + { + "epoch": 0.1055644229181573, + "grad_norm": 0.9882987141609192, + "learning_rate": 9.938027376829774e-06, + "loss": 0.7119, + "step": 1918 + }, + { + "epoch": 0.10561946172051297, + "grad_norm": 1.139404535293579, + "learning_rate": 9.93795932275692e-06, + "loss": 0.8839, + "step": 1919 + }, + { + "epoch": 0.10567450052286863, + "grad_norm": 1.004782795906067, + "learning_rate": 9.937891231571686e-06, + "loss": 0.904, + "step": 1920 + }, + { + "epoch": 0.10572953932522429, + "grad_norm": 0.8437260389328003, + "learning_rate": 9.937823103274585e-06, + "loss": 0.7942, + "step": 1921 + }, + { + "epoch": 0.10578457812757995, + "grad_norm": 1.1388722658157349, + "learning_rate": 9.937754937866127e-06, + "loss": 0.9491, + "step": 1922 + }, + { + "epoch": 0.1058396169299356, + "grad_norm": 0.9266740083694458, + "learning_rate": 9.937686735346823e-06, + "loss": 0.9067, + "step": 1923 + }, + { + "epoch": 0.10589465573229126, + "grad_norm": 0.7536123991012573, + "learning_rate": 9.93761849571719e-06, + "loss": 0.6533, + "step": 1924 + }, + { + "epoch": 0.10594969453464692, + "grad_norm": 0.8781737089157104, + "learning_rate": 9.937550218977737e-06, + "loss": 0.8319, + "step": 1925 + }, + { + "epoch": 0.10600473333700258, + "grad_norm": 0.8577924966812134, + "learning_rate": 9.937481905128976e-06, + "loss": 0.8604, + "step": 1926 + }, + { + "epoch": 0.10605977213935824, + "grad_norm": 0.8351713418960571, + "learning_rate": 9.937413554171424e-06, + "loss": 0.946, + "step": 1927 + }, + { + "epoch": 0.1061148109417139, + "grad_norm": 0.971491813659668, + "learning_rate": 9.937345166105594e-06, + "loss": 0.7383, + "step": 1928 + }, + { + "epoch": 0.10616984974406957, + "grad_norm": 0.8020079731941223, + "learning_rate": 9.937276740932001e-06, + "loss": 0.7468, + "step": 1929 + }, + { + "epoch": 0.10622488854642523, + "grad_norm": 0.9057347178459167, + "learning_rate": 9.937208278651153e-06, + "loss": 0.8223, + "step": 1930 + }, + { + "epoch": 0.10627992734878089, + "grad_norm": 0.8384734392166138, + "learning_rate": 9.937139779263574e-06, + "loss": 0.8773, + "step": 1931 + }, + { + "epoch": 0.10633496615113655, + "grad_norm": 0.8732065558433533, + "learning_rate": 9.93707124276977e-06, + "loss": 0.8265, + "step": 1932 + }, + { + "epoch": 0.10639000495349221, + "grad_norm": 0.8744868040084839, + "learning_rate": 9.937002669170264e-06, + "loss": 0.8497, + "step": 1933 + }, + { + "epoch": 0.10644504375584787, + "grad_norm": 0.8589879870414734, + "learning_rate": 9.936934058465564e-06, + "loss": 0.8116, + "step": 1934 + }, + { + "epoch": 0.10650008255820353, + "grad_norm": 0.8614563941955566, + "learning_rate": 9.936865410656192e-06, + "loss": 0.7823, + "step": 1935 + }, + { + "epoch": 0.1065551213605592, + "grad_norm": 0.8381434082984924, + "learning_rate": 9.93679672574266e-06, + "loss": 0.7889, + "step": 1936 + }, + { + "epoch": 0.10661016016291486, + "grad_norm": 0.9834293127059937, + "learning_rate": 9.936728003725484e-06, + "loss": 0.8358, + "step": 1937 + }, + { + "epoch": 0.10666519896527052, + "grad_norm": 0.8461851477622986, + "learning_rate": 9.936659244605184e-06, + "loss": 0.8408, + "step": 1938 + }, + { + "epoch": 0.10672023776762618, + "grad_norm": 1.0186371803283691, + "learning_rate": 9.936590448382273e-06, + "loss": 0.8118, + "step": 1939 + }, + { + "epoch": 0.10677527656998184, + "grad_norm": 0.866321325302124, + "learning_rate": 9.93652161505727e-06, + "loss": 0.8696, + "step": 1940 + }, + { + "epoch": 0.1068303153723375, + "grad_norm": 0.9179622530937195, + "learning_rate": 9.936452744630692e-06, + "loss": 0.8419, + "step": 1941 + }, + { + "epoch": 0.10688535417469316, + "grad_norm": 0.8250496983528137, + "learning_rate": 9.936383837103057e-06, + "loss": 0.8511, + "step": 1942 + }, + { + "epoch": 0.10694039297704883, + "grad_norm": 0.8475700616836548, + "learning_rate": 9.936314892474883e-06, + "loss": 0.8404, + "step": 1943 + }, + { + "epoch": 0.10699543177940447, + "grad_norm": 0.774334192276001, + "learning_rate": 9.936245910746684e-06, + "loss": 0.7461, + "step": 1944 + }, + { + "epoch": 0.10705047058176013, + "grad_norm": 0.9313948154449463, + "learning_rate": 9.936176891918986e-06, + "loss": 0.8486, + "step": 1945 + }, + { + "epoch": 0.1071055093841158, + "grad_norm": 0.8784124255180359, + "learning_rate": 9.936107835992304e-06, + "loss": 0.84, + "step": 1946 + }, + { + "epoch": 0.10716054818647146, + "grad_norm": 0.9087465405464172, + "learning_rate": 9.936038742967154e-06, + "loss": 0.9012, + "step": 1947 + }, + { + "epoch": 0.10721558698882712, + "grad_norm": 0.8462012410163879, + "learning_rate": 9.93596961284406e-06, + "loss": 0.9193, + "step": 1948 + }, + { + "epoch": 0.10727062579118278, + "grad_norm": 0.8984553813934326, + "learning_rate": 9.935900445623538e-06, + "loss": 0.781, + "step": 1949 + }, + { + "epoch": 0.10732566459353844, + "grad_norm": 0.9197295308113098, + "learning_rate": 9.935831241306111e-06, + "loss": 0.8861, + "step": 1950 + }, + { + "epoch": 0.1073807033958941, + "grad_norm": 0.8452801704406738, + "learning_rate": 9.935761999892296e-06, + "loss": 0.8649, + "step": 1951 + }, + { + "epoch": 0.10743574219824976, + "grad_norm": 0.8047192096710205, + "learning_rate": 9.935692721382618e-06, + "loss": 0.8704, + "step": 1952 + }, + { + "epoch": 0.10749078100060543, + "grad_norm": 0.9536359906196594, + "learning_rate": 9.935623405777593e-06, + "loss": 0.7803, + "step": 1953 + }, + { + "epoch": 0.10754581980296109, + "grad_norm": 0.8215291500091553, + "learning_rate": 9.935554053077744e-06, + "loss": 0.8247, + "step": 1954 + }, + { + "epoch": 0.10760085860531675, + "grad_norm": 0.9261930584907532, + "learning_rate": 9.93548466328359e-06, + "loss": 0.8594, + "step": 1955 + }, + { + "epoch": 0.10765589740767241, + "grad_norm": 0.7973492741584778, + "learning_rate": 9.935415236395656e-06, + "loss": 0.7464, + "step": 1956 + }, + { + "epoch": 0.10771093621002807, + "grad_norm": 0.9328988790512085, + "learning_rate": 9.935345772414463e-06, + "loss": 0.8472, + "step": 1957 + }, + { + "epoch": 0.10776597501238373, + "grad_norm": 0.9490759968757629, + "learning_rate": 9.935276271340532e-06, + "loss": 0.806, + "step": 1958 + }, + { + "epoch": 0.1078210138147394, + "grad_norm": 0.9149925112724304, + "learning_rate": 9.935206733174385e-06, + "loss": 0.8741, + "step": 1959 + }, + { + "epoch": 0.10787605261709506, + "grad_norm": 1.0074039697647095, + "learning_rate": 9.935137157916546e-06, + "loss": 0.8493, + "step": 1960 + }, + { + "epoch": 0.10793109141945072, + "grad_norm": 0.8783678412437439, + "learning_rate": 9.935067545567535e-06, + "loss": 0.8132, + "step": 1961 + }, + { + "epoch": 0.10798613022180638, + "grad_norm": 0.8273885250091553, + "learning_rate": 9.934997896127879e-06, + "loss": 0.7448, + "step": 1962 + }, + { + "epoch": 0.10804116902416204, + "grad_norm": 0.761947512626648, + "learning_rate": 9.9349282095981e-06, + "loss": 0.7933, + "step": 1963 + }, + { + "epoch": 0.1080962078265177, + "grad_norm": 0.814809262752533, + "learning_rate": 9.934858485978722e-06, + "loss": 0.7551, + "step": 1964 + }, + { + "epoch": 0.10815124662887336, + "grad_norm": 0.8108895421028137, + "learning_rate": 9.934788725270266e-06, + "loss": 0.6787, + "step": 1965 + }, + { + "epoch": 0.10820628543122901, + "grad_norm": 0.8669139742851257, + "learning_rate": 9.934718927473262e-06, + "loss": 0.8395, + "step": 1966 + }, + { + "epoch": 0.10826132423358467, + "grad_norm": 0.9093756079673767, + "learning_rate": 9.93464909258823e-06, + "loss": 0.8341, + "step": 1967 + }, + { + "epoch": 0.10831636303594033, + "grad_norm": 0.8923841714859009, + "learning_rate": 9.934579220615697e-06, + "loss": 0.9422, + "step": 1968 + }, + { + "epoch": 0.108371401838296, + "grad_norm": 0.850429117679596, + "learning_rate": 9.934509311556186e-06, + "loss": 0.8446, + "step": 1969 + }, + { + "epoch": 0.10842644064065166, + "grad_norm": 0.8762460350990295, + "learning_rate": 9.934439365410224e-06, + "loss": 0.7788, + "step": 1970 + }, + { + "epoch": 0.10848147944300732, + "grad_norm": 0.9700387716293335, + "learning_rate": 9.934369382178338e-06, + "loss": 0.8455, + "step": 1971 + }, + { + "epoch": 0.10853651824536298, + "grad_norm": 0.8003185987472534, + "learning_rate": 9.934299361861053e-06, + "loss": 0.8026, + "step": 1972 + }, + { + "epoch": 0.10859155704771864, + "grad_norm": 0.9626984596252441, + "learning_rate": 9.934229304458893e-06, + "loss": 0.8219, + "step": 1973 + }, + { + "epoch": 0.1086465958500743, + "grad_norm": 0.8722280859947205, + "learning_rate": 9.934159209972386e-06, + "loss": 0.8866, + "step": 1974 + }, + { + "epoch": 0.10870163465242996, + "grad_norm": 0.838736355304718, + "learning_rate": 9.934089078402061e-06, + "loss": 0.7723, + "step": 1975 + }, + { + "epoch": 0.10875667345478562, + "grad_norm": 0.8373032808303833, + "learning_rate": 9.934018909748443e-06, + "loss": 0.9003, + "step": 1976 + }, + { + "epoch": 0.10881171225714129, + "grad_norm": 0.8704653978347778, + "learning_rate": 9.93394870401206e-06, + "loss": 0.8926, + "step": 1977 + }, + { + "epoch": 0.10886675105949695, + "grad_norm": 0.8088163733482361, + "learning_rate": 9.933878461193437e-06, + "loss": 0.8059, + "step": 1978 + }, + { + "epoch": 0.10892178986185261, + "grad_norm": 0.856421947479248, + "learning_rate": 9.933808181293108e-06, + "loss": 0.8447, + "step": 1979 + }, + { + "epoch": 0.10897682866420827, + "grad_norm": 0.9676237106323242, + "learning_rate": 9.933737864311595e-06, + "loss": 0.9009, + "step": 1980 + }, + { + "epoch": 0.10903186746656393, + "grad_norm": 0.7955103516578674, + "learning_rate": 9.933667510249428e-06, + "loss": 0.881, + "step": 1981 + }, + { + "epoch": 0.10908690626891959, + "grad_norm": 0.7935854196548462, + "learning_rate": 9.933597119107136e-06, + "loss": 0.8773, + "step": 1982 + }, + { + "epoch": 0.10914194507127525, + "grad_norm": 0.7726008296012878, + "learning_rate": 9.933526690885251e-06, + "loss": 0.8133, + "step": 1983 + }, + { + "epoch": 0.10919698387363092, + "grad_norm": 0.8577712178230286, + "learning_rate": 9.9334562255843e-06, + "loss": 0.7455, + "step": 1984 + }, + { + "epoch": 0.10925202267598658, + "grad_norm": 0.9996447563171387, + "learning_rate": 9.933385723204812e-06, + "loss": 0.7312, + "step": 1985 + }, + { + "epoch": 0.10930706147834224, + "grad_norm": 0.9600629806518555, + "learning_rate": 9.933315183747318e-06, + "loss": 0.8792, + "step": 1986 + }, + { + "epoch": 0.10936210028069789, + "grad_norm": 0.9126206636428833, + "learning_rate": 9.933244607212347e-06, + "loss": 1.0023, + "step": 1987 + }, + { + "epoch": 0.10941713908305355, + "grad_norm": 0.774153470993042, + "learning_rate": 9.93317399360043e-06, + "loss": 0.7877, + "step": 1988 + }, + { + "epoch": 0.10947217788540921, + "grad_norm": 0.848495364189148, + "learning_rate": 9.933103342912096e-06, + "loss": 0.8825, + "step": 1989 + }, + { + "epoch": 0.10952721668776487, + "grad_norm": 0.806408166885376, + "learning_rate": 9.933032655147881e-06, + "loss": 0.7389, + "step": 1990 + }, + { + "epoch": 0.10958225549012053, + "grad_norm": 0.8579222559928894, + "learning_rate": 9.932961930308312e-06, + "loss": 0.8283, + "step": 1991 + }, + { + "epoch": 0.10963729429247619, + "grad_norm": 0.7548109292984009, + "learning_rate": 9.93289116839392e-06, + "loss": 0.7971, + "step": 1992 + }, + { + "epoch": 0.10969233309483185, + "grad_norm": 0.7954711318016052, + "learning_rate": 9.93282036940524e-06, + "loss": 0.849, + "step": 1993 + }, + { + "epoch": 0.10974737189718752, + "grad_norm": 0.7911425232887268, + "learning_rate": 9.932749533342802e-06, + "loss": 0.86, + "step": 1994 + }, + { + "epoch": 0.10980241069954318, + "grad_norm": 0.8505094051361084, + "learning_rate": 9.932678660207141e-06, + "loss": 0.7871, + "step": 1995 + }, + { + "epoch": 0.10985744950189884, + "grad_norm": 0.809612512588501, + "learning_rate": 9.932607749998784e-06, + "loss": 0.8337, + "step": 1996 + }, + { + "epoch": 0.1099124883042545, + "grad_norm": 0.738523006439209, + "learning_rate": 9.93253680271827e-06, + "loss": 0.7634, + "step": 1997 + }, + { + "epoch": 0.10996752710661016, + "grad_norm": 0.8434372544288635, + "learning_rate": 9.932465818366128e-06, + "loss": 0.7987, + "step": 1998 + }, + { + "epoch": 0.11002256590896582, + "grad_norm": 0.8068081140518188, + "learning_rate": 9.932394796942895e-06, + "loss": 0.9496, + "step": 1999 + }, + { + "epoch": 0.11007760471132148, + "grad_norm": 0.754342794418335, + "learning_rate": 9.932323738449103e-06, + "loss": 0.7355, + "step": 2000 + }, + { + "epoch": 0.11013264351367714, + "grad_norm": 0.8830806612968445, + "learning_rate": 9.932252642885285e-06, + "loss": 0.8458, + "step": 2001 + }, + { + "epoch": 0.1101876823160328, + "grad_norm": 0.9915485978126526, + "learning_rate": 9.932181510251977e-06, + "loss": 0.8116, + "step": 2002 + }, + { + "epoch": 0.11024272111838847, + "grad_norm": 0.858368992805481, + "learning_rate": 9.932110340549712e-06, + "loss": 0.8354, + "step": 2003 + }, + { + "epoch": 0.11029775992074413, + "grad_norm": 0.8591521382331848, + "learning_rate": 9.932039133779028e-06, + "loss": 0.8316, + "step": 2004 + }, + { + "epoch": 0.11035279872309979, + "grad_norm": 0.8714838624000549, + "learning_rate": 9.931967889940455e-06, + "loss": 0.8106, + "step": 2005 + }, + { + "epoch": 0.11040783752545545, + "grad_norm": 0.8082797527313232, + "learning_rate": 9.931896609034534e-06, + "loss": 0.7762, + "step": 2006 + }, + { + "epoch": 0.11046287632781111, + "grad_norm": 0.9226199984550476, + "learning_rate": 9.931825291061797e-06, + "loss": 0.8641, + "step": 2007 + }, + { + "epoch": 0.11051791513016677, + "grad_norm": 0.8883050680160522, + "learning_rate": 9.931753936022783e-06, + "loss": 0.9014, + "step": 2008 + }, + { + "epoch": 0.11057295393252242, + "grad_norm": 0.9024807810783386, + "learning_rate": 9.931682543918024e-06, + "loss": 0.9085, + "step": 2009 + }, + { + "epoch": 0.11062799273487808, + "grad_norm": 0.8381460905075073, + "learning_rate": 9.931611114748062e-06, + "loss": 0.8043, + "step": 2010 + }, + { + "epoch": 0.11068303153723374, + "grad_norm": 1.1222339868545532, + "learning_rate": 9.931539648513429e-06, + "loss": 0.8388, + "step": 2011 + }, + { + "epoch": 0.1107380703395894, + "grad_norm": 0.9710868000984192, + "learning_rate": 9.931468145214665e-06, + "loss": 0.8934, + "step": 2012 + }, + { + "epoch": 0.11079310914194507, + "grad_norm": 0.9821141958236694, + "learning_rate": 9.931396604852304e-06, + "loss": 0.931, + "step": 2013 + }, + { + "epoch": 0.11084814794430073, + "grad_norm": 1.0658717155456543, + "learning_rate": 9.931325027426889e-06, + "loss": 0.9032, + "step": 2014 + }, + { + "epoch": 0.11090318674665639, + "grad_norm": 0.8836946487426758, + "learning_rate": 9.931253412938956e-06, + "loss": 0.9131, + "step": 2015 + }, + { + "epoch": 0.11095822554901205, + "grad_norm": 0.8438361883163452, + "learning_rate": 9.93118176138904e-06, + "loss": 0.8674, + "step": 2016 + }, + { + "epoch": 0.11101326435136771, + "grad_norm": 0.928142786026001, + "learning_rate": 9.93111007277768e-06, + "loss": 0.8882, + "step": 2017 + }, + { + "epoch": 0.11106830315372337, + "grad_norm": 0.9176276922225952, + "learning_rate": 9.93103834710542e-06, + "loss": 0.8904, + "step": 2018 + }, + { + "epoch": 0.11112334195607904, + "grad_norm": 1.0462889671325684, + "learning_rate": 9.930966584372795e-06, + "loss": 0.8029, + "step": 2019 + }, + { + "epoch": 0.1111783807584347, + "grad_norm": 0.7627375721931458, + "learning_rate": 9.930894784580344e-06, + "loss": 0.8474, + "step": 2020 + }, + { + "epoch": 0.11123341956079036, + "grad_norm": 1.0545588731765747, + "learning_rate": 9.93082294772861e-06, + "loss": 0.7985, + "step": 2021 + }, + { + "epoch": 0.11128845836314602, + "grad_norm": 0.9752298593521118, + "learning_rate": 9.93075107381813e-06, + "loss": 0.8725, + "step": 2022 + }, + { + "epoch": 0.11134349716550168, + "grad_norm": 0.8403159379959106, + "learning_rate": 9.930679162849444e-06, + "loss": 0.8854, + "step": 2023 + }, + { + "epoch": 0.11139853596785734, + "grad_norm": 0.8879380226135254, + "learning_rate": 9.930607214823094e-06, + "loss": 0.7269, + "step": 2024 + }, + { + "epoch": 0.111453574770213, + "grad_norm": 0.907256543636322, + "learning_rate": 9.930535229739618e-06, + "loss": 0.8145, + "step": 2025 + }, + { + "epoch": 0.11150861357256867, + "grad_norm": 1.1066968441009521, + "learning_rate": 9.93046320759956e-06, + "loss": 0.9281, + "step": 2026 + }, + { + "epoch": 0.11156365237492433, + "grad_norm": 0.9226258397102356, + "learning_rate": 9.930391148403462e-06, + "loss": 0.9048, + "step": 2027 + }, + { + "epoch": 0.11161869117727999, + "grad_norm": 0.9652156829833984, + "learning_rate": 9.930319052151862e-06, + "loss": 0.9321, + "step": 2028 + }, + { + "epoch": 0.11167372997963565, + "grad_norm": 0.9102638363838196, + "learning_rate": 9.930246918845305e-06, + "loss": 0.8169, + "step": 2029 + }, + { + "epoch": 0.1117287687819913, + "grad_norm": 0.7765716314315796, + "learning_rate": 9.93017474848433e-06, + "loss": 0.7691, + "step": 2030 + }, + { + "epoch": 0.11178380758434696, + "grad_norm": 0.9053775072097778, + "learning_rate": 9.930102541069484e-06, + "loss": 0.782, + "step": 2031 + }, + { + "epoch": 0.11183884638670262, + "grad_norm": 0.8892827033996582, + "learning_rate": 9.930030296601306e-06, + "loss": 0.8575, + "step": 2032 + }, + { + "epoch": 0.11189388518905828, + "grad_norm": 0.8947604894638062, + "learning_rate": 9.929958015080339e-06, + "loss": 0.8607, + "step": 2033 + }, + { + "epoch": 0.11194892399141394, + "grad_norm": 0.8936871290206909, + "learning_rate": 9.929885696507127e-06, + "loss": 0.8111, + "step": 2034 + }, + { + "epoch": 0.1120039627937696, + "grad_norm": 0.9579165577888489, + "learning_rate": 9.929813340882214e-06, + "loss": 0.911, + "step": 2035 + }, + { + "epoch": 0.11205900159612527, + "grad_norm": 0.7885386347770691, + "learning_rate": 9.929740948206146e-06, + "loss": 0.8074, + "step": 2036 + }, + { + "epoch": 0.11211404039848093, + "grad_norm": 0.817939281463623, + "learning_rate": 9.929668518479462e-06, + "loss": 0.8451, + "step": 2037 + }, + { + "epoch": 0.11216907920083659, + "grad_norm": 0.8695761561393738, + "learning_rate": 9.92959605170271e-06, + "loss": 0.7158, + "step": 2038 + }, + { + "epoch": 0.11222411800319225, + "grad_norm": 0.8569639325141907, + "learning_rate": 9.929523547876433e-06, + "loss": 0.8568, + "step": 2039 + }, + { + "epoch": 0.11227915680554791, + "grad_norm": 0.8569897413253784, + "learning_rate": 9.929451007001176e-06, + "loss": 0.8971, + "step": 2040 + }, + { + "epoch": 0.11233419560790357, + "grad_norm": 0.8520069718360901, + "learning_rate": 9.929378429077487e-06, + "loss": 0.9027, + "step": 2041 + }, + { + "epoch": 0.11238923441025923, + "grad_norm": 0.9338961839675903, + "learning_rate": 9.929305814105907e-06, + "loss": 0.8646, + "step": 2042 + }, + { + "epoch": 0.1124442732126149, + "grad_norm": 0.8497192859649658, + "learning_rate": 9.929233162086985e-06, + "loss": 0.9068, + "step": 2043 + }, + { + "epoch": 0.11249931201497056, + "grad_norm": 0.8570863008499146, + "learning_rate": 9.929160473021267e-06, + "loss": 0.962, + "step": 2044 + }, + { + "epoch": 0.11255435081732622, + "grad_norm": 0.9072359800338745, + "learning_rate": 9.929087746909296e-06, + "loss": 0.8454, + "step": 2045 + }, + { + "epoch": 0.11260938961968188, + "grad_norm": 0.7920698523521423, + "learning_rate": 9.929014983751623e-06, + "loss": 0.8031, + "step": 2046 + }, + { + "epoch": 0.11266442842203754, + "grad_norm": 1.0180169343948364, + "learning_rate": 9.928942183548791e-06, + "loss": 0.7759, + "step": 2047 + }, + { + "epoch": 0.1127194672243932, + "grad_norm": 0.8746892809867859, + "learning_rate": 9.928869346301351e-06, + "loss": 0.9038, + "step": 2048 + }, + { + "epoch": 0.11277450602674886, + "grad_norm": 0.8283438086509705, + "learning_rate": 9.928796472009846e-06, + "loss": 0.8883, + "step": 2049 + }, + { + "epoch": 0.11282954482910452, + "grad_norm": 1.321917176246643, + "learning_rate": 9.928723560674828e-06, + "loss": 0.835, + "step": 2050 + }, + { + "epoch": 0.11288458363146017, + "grad_norm": 0.9356202483177185, + "learning_rate": 9.928650612296841e-06, + "loss": 0.8077, + "step": 2051 + }, + { + "epoch": 0.11293962243381583, + "grad_norm": 0.8493767380714417, + "learning_rate": 9.928577626876439e-06, + "loss": 0.8295, + "step": 2052 + }, + { + "epoch": 0.1129946612361715, + "grad_norm": 0.784818708896637, + "learning_rate": 9.928504604414164e-06, + "loss": 0.8322, + "step": 2053 + }, + { + "epoch": 0.11304970003852716, + "grad_norm": 0.9095364809036255, + "learning_rate": 9.928431544910567e-06, + "loss": 0.8757, + "step": 2054 + }, + { + "epoch": 0.11310473884088282, + "grad_norm": 0.8889689445495605, + "learning_rate": 9.9283584483662e-06, + "loss": 0.8583, + "step": 2055 + }, + { + "epoch": 0.11315977764323848, + "grad_norm": 0.8702652454376221, + "learning_rate": 9.928285314781607e-06, + "loss": 0.8414, + "step": 2056 + }, + { + "epoch": 0.11321481644559414, + "grad_norm": 0.8531168699264526, + "learning_rate": 9.928212144157342e-06, + "loss": 0.7844, + "step": 2057 + }, + { + "epoch": 0.1132698552479498, + "grad_norm": 1.0250271558761597, + "learning_rate": 9.928138936493956e-06, + "loss": 0.8766, + "step": 2058 + }, + { + "epoch": 0.11332489405030546, + "grad_norm": 0.7963449358940125, + "learning_rate": 9.928065691791996e-06, + "loss": 0.8166, + "step": 2059 + }, + { + "epoch": 0.11337993285266112, + "grad_norm": 1.1033011674880981, + "learning_rate": 9.927992410052013e-06, + "loss": 0.8748, + "step": 2060 + }, + { + "epoch": 0.11343497165501679, + "grad_norm": 0.8760959506034851, + "learning_rate": 9.927919091274558e-06, + "loss": 0.8623, + "step": 2061 + }, + { + "epoch": 0.11349001045737245, + "grad_norm": 1.1783028841018677, + "learning_rate": 9.927845735460182e-06, + "loss": 0.9144, + "step": 2062 + }, + { + "epoch": 0.11354504925972811, + "grad_norm": 0.8868625164031982, + "learning_rate": 9.927772342609437e-06, + "loss": 0.8614, + "step": 2063 + }, + { + "epoch": 0.11360008806208377, + "grad_norm": 0.8784704804420471, + "learning_rate": 9.927698912722874e-06, + "loss": 0.7802, + "step": 2064 + }, + { + "epoch": 0.11365512686443943, + "grad_norm": 1.0090643167495728, + "learning_rate": 9.927625445801046e-06, + "loss": 0.8876, + "step": 2065 + }, + { + "epoch": 0.1137101656667951, + "grad_norm": 0.7624390721321106, + "learning_rate": 9.927551941844502e-06, + "loss": 0.794, + "step": 2066 + }, + { + "epoch": 0.11376520446915075, + "grad_norm": 0.7814189791679382, + "learning_rate": 9.927478400853798e-06, + "loss": 0.8176, + "step": 2067 + }, + { + "epoch": 0.11382024327150642, + "grad_norm": 0.876338541507721, + "learning_rate": 9.927404822829486e-06, + "loss": 0.8634, + "step": 2068 + }, + { + "epoch": 0.11387528207386208, + "grad_norm": 0.7931430339813232, + "learning_rate": 9.927331207772117e-06, + "loss": 0.8012, + "step": 2069 + }, + { + "epoch": 0.11393032087621774, + "grad_norm": 1.0064504146575928, + "learning_rate": 9.927257555682246e-06, + "loss": 0.8321, + "step": 2070 + }, + { + "epoch": 0.1139853596785734, + "grad_norm": 0.8233053684234619, + "learning_rate": 9.927183866560425e-06, + "loss": 0.8004, + "step": 2071 + }, + { + "epoch": 0.11404039848092906, + "grad_norm": 1.0106632709503174, + "learning_rate": 9.927110140407211e-06, + "loss": 0.8627, + "step": 2072 + }, + { + "epoch": 0.11409543728328471, + "grad_norm": 0.8262843489646912, + "learning_rate": 9.927036377223155e-06, + "loss": 0.737, + "step": 2073 + }, + { + "epoch": 0.11415047608564037, + "grad_norm": 0.9349029660224915, + "learning_rate": 9.926962577008813e-06, + "loss": 0.9049, + "step": 2074 + }, + { + "epoch": 0.11420551488799603, + "grad_norm": 0.8689929842948914, + "learning_rate": 9.926888739764739e-06, + "loss": 0.7858, + "step": 2075 + }, + { + "epoch": 0.1142605536903517, + "grad_norm": 0.8442347645759583, + "learning_rate": 9.926814865491487e-06, + "loss": 0.8145, + "step": 2076 + }, + { + "epoch": 0.11431559249270735, + "grad_norm": 0.9143397212028503, + "learning_rate": 9.926740954189615e-06, + "loss": 0.8025, + "step": 2077 + }, + { + "epoch": 0.11437063129506302, + "grad_norm": 1.293251395225525, + "learning_rate": 9.926667005859676e-06, + "loss": 1.0256, + "step": 2078 + }, + { + "epoch": 0.11442567009741868, + "grad_norm": 0.9661351442337036, + "learning_rate": 9.926593020502226e-06, + "loss": 0.991, + "step": 2079 + }, + { + "epoch": 0.11448070889977434, + "grad_norm": 0.8110861778259277, + "learning_rate": 9.926518998117823e-06, + "loss": 0.7129, + "step": 2080 + }, + { + "epoch": 0.11453574770213, + "grad_norm": 0.8351119160652161, + "learning_rate": 9.92644493870702e-06, + "loss": 0.8894, + "step": 2081 + }, + { + "epoch": 0.11459078650448566, + "grad_norm": 0.8492733240127563, + "learning_rate": 9.926370842270377e-06, + "loss": 0.8039, + "step": 2082 + }, + { + "epoch": 0.11464582530684132, + "grad_norm": 0.895353376865387, + "learning_rate": 9.92629670880845e-06, + "loss": 0.8743, + "step": 2083 + }, + { + "epoch": 0.11470086410919698, + "grad_norm": 0.7871271967887878, + "learning_rate": 9.926222538321795e-06, + "loss": 0.8426, + "step": 2084 + }, + { + "epoch": 0.11475590291155265, + "grad_norm": 0.8904643058776855, + "learning_rate": 9.92614833081097e-06, + "loss": 0.8454, + "step": 2085 + }, + { + "epoch": 0.11481094171390831, + "grad_norm": 0.9166308641433716, + "learning_rate": 9.926074086276532e-06, + "loss": 0.9162, + "step": 2086 + }, + { + "epoch": 0.11486598051626397, + "grad_norm": 0.8730728626251221, + "learning_rate": 9.92599980471904e-06, + "loss": 0.8524, + "step": 2087 + }, + { + "epoch": 0.11492101931861963, + "grad_norm": 0.7932829260826111, + "learning_rate": 9.925925486139052e-06, + "loss": 0.7838, + "step": 2088 + }, + { + "epoch": 0.11497605812097529, + "grad_norm": 1.0033760070800781, + "learning_rate": 9.925851130537127e-06, + "loss": 0.8746, + "step": 2089 + }, + { + "epoch": 0.11503109692333095, + "grad_norm": 0.7783192992210388, + "learning_rate": 9.925776737913823e-06, + "loss": 0.7308, + "step": 2090 + }, + { + "epoch": 0.11508613572568661, + "grad_norm": 0.8441587686538696, + "learning_rate": 9.925702308269702e-06, + "loss": 0.7933, + "step": 2091 + }, + { + "epoch": 0.11514117452804228, + "grad_norm": 0.9433023929595947, + "learning_rate": 9.925627841605319e-06, + "loss": 0.7857, + "step": 2092 + }, + { + "epoch": 0.11519621333039794, + "grad_norm": 0.8958256244659424, + "learning_rate": 9.925553337921235e-06, + "loss": 0.9116, + "step": 2093 + }, + { + "epoch": 0.11525125213275358, + "grad_norm": 0.7610845565795898, + "learning_rate": 9.925478797218011e-06, + "loss": 0.8006, + "step": 2094 + }, + { + "epoch": 0.11530629093510925, + "grad_norm": 0.7977023720741272, + "learning_rate": 9.925404219496207e-06, + "loss": 0.8068, + "step": 2095 + }, + { + "epoch": 0.11536132973746491, + "grad_norm": 0.8087283372879028, + "learning_rate": 9.925329604756383e-06, + "loss": 0.7968, + "step": 2096 + }, + { + "epoch": 0.11541636853982057, + "grad_norm": 1.1066477298736572, + "learning_rate": 9.925254952999102e-06, + "loss": 0.8167, + "step": 2097 + }, + { + "epoch": 0.11547140734217623, + "grad_norm": 0.7806832194328308, + "learning_rate": 9.925180264224921e-06, + "loss": 0.8069, + "step": 2098 + }, + { + "epoch": 0.11552644614453189, + "grad_norm": 0.7745190858840942, + "learning_rate": 9.925105538434406e-06, + "loss": 0.7968, + "step": 2099 + }, + { + "epoch": 0.11558148494688755, + "grad_norm": 0.9045543074607849, + "learning_rate": 9.925030775628113e-06, + "loss": 0.8417, + "step": 2100 + }, + { + "epoch": 0.11563652374924321, + "grad_norm": 1.2962623834609985, + "learning_rate": 9.924955975806608e-06, + "loss": 0.8162, + "step": 2101 + }, + { + "epoch": 0.11569156255159888, + "grad_norm": 0.8571485877037048, + "learning_rate": 9.924881138970453e-06, + "loss": 0.8581, + "step": 2102 + }, + { + "epoch": 0.11574660135395454, + "grad_norm": 0.8326650857925415, + "learning_rate": 9.92480626512021e-06, + "loss": 0.8438, + "step": 2103 + }, + { + "epoch": 0.1158016401563102, + "grad_norm": 0.7973701357841492, + "learning_rate": 9.924731354256441e-06, + "loss": 0.8337, + "step": 2104 + }, + { + "epoch": 0.11585667895866586, + "grad_norm": 0.8614075779914856, + "learning_rate": 9.924656406379708e-06, + "loss": 0.8275, + "step": 2105 + }, + { + "epoch": 0.11591171776102152, + "grad_norm": 0.7911350131034851, + "learning_rate": 9.924581421490577e-06, + "loss": 0.8032, + "step": 2106 + }, + { + "epoch": 0.11596675656337718, + "grad_norm": 0.8763116598129272, + "learning_rate": 9.92450639958961e-06, + "loss": 0.8725, + "step": 2107 + }, + { + "epoch": 0.11602179536573284, + "grad_norm": 0.9754133224487305, + "learning_rate": 9.92443134067737e-06, + "loss": 0.9115, + "step": 2108 + }, + { + "epoch": 0.1160768341680885, + "grad_norm": 0.7783731818199158, + "learning_rate": 9.924356244754425e-06, + "loss": 0.8223, + "step": 2109 + }, + { + "epoch": 0.11613187297044417, + "grad_norm": 0.865301787853241, + "learning_rate": 9.924281111821335e-06, + "loss": 0.8053, + "step": 2110 + }, + { + "epoch": 0.11618691177279983, + "grad_norm": 0.8654297590255737, + "learning_rate": 9.924205941878666e-06, + "loss": 0.716, + "step": 2111 + }, + { + "epoch": 0.11624195057515549, + "grad_norm": 0.7646550536155701, + "learning_rate": 9.924130734926982e-06, + "loss": 0.8027, + "step": 2112 + }, + { + "epoch": 0.11629698937751115, + "grad_norm": 0.810587465763092, + "learning_rate": 9.924055490966851e-06, + "loss": 0.7416, + "step": 2113 + }, + { + "epoch": 0.11635202817986681, + "grad_norm": 0.8610082268714905, + "learning_rate": 9.923980209998838e-06, + "loss": 0.8527, + "step": 2114 + }, + { + "epoch": 0.11640706698222247, + "grad_norm": 0.8409233689308167, + "learning_rate": 9.923904892023506e-06, + "loss": 0.8169, + "step": 2115 + }, + { + "epoch": 0.11646210578457812, + "grad_norm": 0.7786587476730347, + "learning_rate": 9.923829537041425e-06, + "loss": 0.6897, + "step": 2116 + }, + { + "epoch": 0.11651714458693378, + "grad_norm": 0.852908730506897, + "learning_rate": 9.923754145053158e-06, + "loss": 0.7821, + "step": 2117 + }, + { + "epoch": 0.11657218338928944, + "grad_norm": 0.9130391478538513, + "learning_rate": 9.923678716059273e-06, + "loss": 1.0377, + "step": 2118 + }, + { + "epoch": 0.1166272221916451, + "grad_norm": 0.8371701240539551, + "learning_rate": 9.923603250060336e-06, + "loss": 0.8312, + "step": 2119 + }, + { + "epoch": 0.11668226099400077, + "grad_norm": 0.8045756220817566, + "learning_rate": 9.923527747056916e-06, + "loss": 0.7971, + "step": 2120 + }, + { + "epoch": 0.11673729979635643, + "grad_norm": 0.8832160234451294, + "learning_rate": 9.923452207049577e-06, + "loss": 0.7362, + "step": 2121 + }, + { + "epoch": 0.11679233859871209, + "grad_norm": 0.8253088593482971, + "learning_rate": 9.923376630038893e-06, + "loss": 0.8177, + "step": 2122 + }, + { + "epoch": 0.11684737740106775, + "grad_norm": 0.7953168749809265, + "learning_rate": 9.923301016025424e-06, + "loss": 0.7053, + "step": 2123 + }, + { + "epoch": 0.11690241620342341, + "grad_norm": 0.7256457805633545, + "learning_rate": 9.923225365009745e-06, + "loss": 0.7554, + "step": 2124 + }, + { + "epoch": 0.11695745500577907, + "grad_norm": 0.9896693229675293, + "learning_rate": 9.923149676992424e-06, + "loss": 0.8285, + "step": 2125 + }, + { + "epoch": 0.11701249380813473, + "grad_norm": 0.7846312522888184, + "learning_rate": 9.923073951974023e-06, + "loss": 0.7527, + "step": 2126 + }, + { + "epoch": 0.1170675326104904, + "grad_norm": 0.8949825167655945, + "learning_rate": 9.92299818995512e-06, + "loss": 0.8545, + "step": 2127 + }, + { + "epoch": 0.11712257141284606, + "grad_norm": 1.0023548603057861, + "learning_rate": 9.922922390936278e-06, + "loss": 0.7668, + "step": 2128 + }, + { + "epoch": 0.11717761021520172, + "grad_norm": 0.8663881421089172, + "learning_rate": 9.92284655491807e-06, + "loss": 0.8073, + "step": 2129 + }, + { + "epoch": 0.11723264901755738, + "grad_norm": 0.8274385929107666, + "learning_rate": 9.922770681901064e-06, + "loss": 0.9002, + "step": 2130 + }, + { + "epoch": 0.11728768781991304, + "grad_norm": 0.8508959412574768, + "learning_rate": 9.922694771885832e-06, + "loss": 0.9325, + "step": 2131 + }, + { + "epoch": 0.1173427266222687, + "grad_norm": 0.8176792860031128, + "learning_rate": 9.922618824872946e-06, + "loss": 0.8415, + "step": 2132 + }, + { + "epoch": 0.11739776542462436, + "grad_norm": 0.770951509475708, + "learning_rate": 9.922542840862971e-06, + "loss": 0.8051, + "step": 2133 + }, + { + "epoch": 0.11745280422698003, + "grad_norm": 0.8558167219161987, + "learning_rate": 9.922466819856484e-06, + "loss": 0.85, + "step": 2134 + }, + { + "epoch": 0.11750784302933569, + "grad_norm": 0.8288151025772095, + "learning_rate": 9.922390761854053e-06, + "loss": 0.8141, + "step": 2135 + }, + { + "epoch": 0.11756288183169135, + "grad_norm": 0.8220882415771484, + "learning_rate": 9.922314666856252e-06, + "loss": 0.8109, + "step": 2136 + }, + { + "epoch": 0.117617920634047, + "grad_norm": 0.7875000238418579, + "learning_rate": 9.92223853486365e-06, + "loss": 0.9085, + "step": 2137 + }, + { + "epoch": 0.11767295943640266, + "grad_norm": 0.8052374124526978, + "learning_rate": 9.922162365876822e-06, + "loss": 0.8785, + "step": 2138 + }, + { + "epoch": 0.11772799823875832, + "grad_norm": 1.0311180353164673, + "learning_rate": 9.922086159896338e-06, + "loss": 0.9112, + "step": 2139 + }, + { + "epoch": 0.11778303704111398, + "grad_norm": 0.943911075592041, + "learning_rate": 9.922009916922773e-06, + "loss": 0.8332, + "step": 2140 + }, + { + "epoch": 0.11783807584346964, + "grad_norm": 0.8156648278236389, + "learning_rate": 9.921933636956697e-06, + "loss": 0.8837, + "step": 2141 + }, + { + "epoch": 0.1178931146458253, + "grad_norm": 0.860292375087738, + "learning_rate": 9.921857319998688e-06, + "loss": 0.7963, + "step": 2142 + }, + { + "epoch": 0.11794815344818096, + "grad_norm": 0.8861456513404846, + "learning_rate": 9.921780966049315e-06, + "loss": 0.8335, + "step": 2143 + }, + { + "epoch": 0.11800319225053663, + "grad_norm": 0.793533205986023, + "learning_rate": 9.921704575109155e-06, + "loss": 0.7881, + "step": 2144 + }, + { + "epoch": 0.11805823105289229, + "grad_norm": 0.8039320111274719, + "learning_rate": 9.921628147178781e-06, + "loss": 0.8369, + "step": 2145 + }, + { + "epoch": 0.11811326985524795, + "grad_norm": 0.8785450458526611, + "learning_rate": 9.921551682258765e-06, + "loss": 0.7981, + "step": 2146 + }, + { + "epoch": 0.11816830865760361, + "grad_norm": 0.810251772403717, + "learning_rate": 9.921475180349687e-06, + "loss": 0.7926, + "step": 2147 + }, + { + "epoch": 0.11822334745995927, + "grad_norm": 0.8470801115036011, + "learning_rate": 9.921398641452117e-06, + "loss": 0.8061, + "step": 2148 + }, + { + "epoch": 0.11827838626231493, + "grad_norm": 0.8147469162940979, + "learning_rate": 9.921322065566633e-06, + "loss": 0.7906, + "step": 2149 + }, + { + "epoch": 0.1183334250646706, + "grad_norm": 0.8792327046394348, + "learning_rate": 9.92124545269381e-06, + "loss": 0.9025, + "step": 2150 + }, + { + "epoch": 0.11838846386702626, + "grad_norm": 0.794607400894165, + "learning_rate": 9.921168802834223e-06, + "loss": 0.8284, + "step": 2151 + }, + { + "epoch": 0.11844350266938192, + "grad_norm": 0.8601556420326233, + "learning_rate": 9.921092115988447e-06, + "loss": 0.8196, + "step": 2152 + }, + { + "epoch": 0.11849854147173758, + "grad_norm": 0.786967933177948, + "learning_rate": 9.921015392157062e-06, + "loss": 0.8744, + "step": 2153 + }, + { + "epoch": 0.11855358027409324, + "grad_norm": 0.8481432199478149, + "learning_rate": 9.920938631340641e-06, + "loss": 0.7206, + "step": 2154 + }, + { + "epoch": 0.1186086190764489, + "grad_norm": 0.8025142550468445, + "learning_rate": 9.920861833539765e-06, + "loss": 0.8126, + "step": 2155 + }, + { + "epoch": 0.11866365787880456, + "grad_norm": 0.9853057265281677, + "learning_rate": 9.920784998755006e-06, + "loss": 0.8883, + "step": 2156 + }, + { + "epoch": 0.11871869668116022, + "grad_norm": 1.0008476972579956, + "learning_rate": 9.920708126986947e-06, + "loss": 0.9326, + "step": 2157 + }, + { + "epoch": 0.11877373548351589, + "grad_norm": 0.837347686290741, + "learning_rate": 9.920631218236161e-06, + "loss": 0.9002, + "step": 2158 + }, + { + "epoch": 0.11882877428587153, + "grad_norm": 0.7866735458374023, + "learning_rate": 9.920554272503227e-06, + "loss": 0.765, + "step": 2159 + }, + { + "epoch": 0.1188838130882272, + "grad_norm": 0.8714935779571533, + "learning_rate": 9.920477289788726e-06, + "loss": 1.0294, + "step": 2160 + }, + { + "epoch": 0.11893885189058286, + "grad_norm": 1.0671826601028442, + "learning_rate": 9.920400270093234e-06, + "loss": 0.8341, + "step": 2161 + }, + { + "epoch": 0.11899389069293852, + "grad_norm": 0.8594604134559631, + "learning_rate": 9.92032321341733e-06, + "loss": 0.8731, + "step": 2162 + }, + { + "epoch": 0.11904892949529418, + "grad_norm": 0.8387738466262817, + "learning_rate": 9.920246119761597e-06, + "loss": 0.7898, + "step": 2163 + }, + { + "epoch": 0.11910396829764984, + "grad_norm": 0.8957195281982422, + "learning_rate": 9.920168989126608e-06, + "loss": 0.8475, + "step": 2164 + }, + { + "epoch": 0.1191590071000055, + "grad_norm": 0.8224207162857056, + "learning_rate": 9.920091821512948e-06, + "loss": 0.7944, + "step": 2165 + }, + { + "epoch": 0.11921404590236116, + "grad_norm": 1.0309031009674072, + "learning_rate": 9.920014616921192e-06, + "loss": 0.8992, + "step": 2166 + }, + { + "epoch": 0.11926908470471682, + "grad_norm": 0.7300832271575928, + "learning_rate": 9.919937375351925e-06, + "loss": 0.7016, + "step": 2167 + }, + { + "epoch": 0.11932412350707249, + "grad_norm": 0.7565537691116333, + "learning_rate": 9.919860096805724e-06, + "loss": 0.8113, + "step": 2168 + }, + { + "epoch": 0.11937916230942815, + "grad_norm": 1.0101505517959595, + "learning_rate": 9.919782781283174e-06, + "loss": 0.8765, + "step": 2169 + }, + { + "epoch": 0.11943420111178381, + "grad_norm": 0.8369461894035339, + "learning_rate": 9.919705428784852e-06, + "loss": 0.8248, + "step": 2170 + }, + { + "epoch": 0.11948923991413947, + "grad_norm": 0.8106105327606201, + "learning_rate": 9.919628039311342e-06, + "loss": 0.8585, + "step": 2171 + }, + { + "epoch": 0.11954427871649513, + "grad_norm": 0.7863745093345642, + "learning_rate": 9.919550612863224e-06, + "loss": 0.8393, + "step": 2172 + }, + { + "epoch": 0.11959931751885079, + "grad_norm": 0.8664719462394714, + "learning_rate": 9.919473149441081e-06, + "loss": 0.8882, + "step": 2173 + }, + { + "epoch": 0.11965435632120645, + "grad_norm": 0.6977574825286865, + "learning_rate": 9.919395649045494e-06, + "loss": 0.7264, + "step": 2174 + }, + { + "epoch": 0.11970939512356212, + "grad_norm": 0.8000102639198303, + "learning_rate": 9.919318111677045e-06, + "loss": 0.7828, + "step": 2175 + }, + { + "epoch": 0.11976443392591778, + "grad_norm": 0.868228018283844, + "learning_rate": 9.91924053733632e-06, + "loss": 0.7904, + "step": 2176 + }, + { + "epoch": 0.11981947272827344, + "grad_norm": 0.839080274105072, + "learning_rate": 9.9191629260239e-06, + "loss": 0.7663, + "step": 2177 + }, + { + "epoch": 0.1198745115306291, + "grad_norm": 0.8222747445106506, + "learning_rate": 9.919085277740366e-06, + "loss": 0.7208, + "step": 2178 + }, + { + "epoch": 0.11992955033298476, + "grad_norm": 1.4550986289978027, + "learning_rate": 9.919007592486304e-06, + "loss": 0.8154, + "step": 2179 + }, + { + "epoch": 0.11998458913534041, + "grad_norm": 0.9110257625579834, + "learning_rate": 9.9189298702623e-06, + "loss": 0.8134, + "step": 2180 + }, + { + "epoch": 0.12003962793769607, + "grad_norm": 0.84796142578125, + "learning_rate": 9.918852111068935e-06, + "loss": 0.8074, + "step": 2181 + }, + { + "epoch": 0.12009466674005173, + "grad_norm": 0.8134179711341858, + "learning_rate": 9.918774314906793e-06, + "loss": 0.6335, + "step": 2182 + }, + { + "epoch": 0.12014970554240739, + "grad_norm": 0.8481448888778687, + "learning_rate": 9.918696481776461e-06, + "loss": 0.8804, + "step": 2183 + }, + { + "epoch": 0.12020474434476305, + "grad_norm": 0.88057941198349, + "learning_rate": 9.918618611678523e-06, + "loss": 0.9326, + "step": 2184 + }, + { + "epoch": 0.12025978314711872, + "grad_norm": 0.8435977697372437, + "learning_rate": 9.918540704613564e-06, + "loss": 0.8141, + "step": 2185 + }, + { + "epoch": 0.12031482194947438, + "grad_norm": 0.8186982870101929, + "learning_rate": 9.918462760582169e-06, + "loss": 0.837, + "step": 2186 + }, + { + "epoch": 0.12036986075183004, + "grad_norm": 0.887783944606781, + "learning_rate": 9.918384779584924e-06, + "loss": 0.8062, + "step": 2187 + }, + { + "epoch": 0.1204248995541857, + "grad_norm": 0.9368415474891663, + "learning_rate": 9.918306761622417e-06, + "loss": 1.0098, + "step": 2188 + }, + { + "epoch": 0.12047993835654136, + "grad_norm": 0.8443986773490906, + "learning_rate": 9.918228706695232e-06, + "loss": 0.8178, + "step": 2189 + }, + { + "epoch": 0.12053497715889702, + "grad_norm": 0.7897284626960754, + "learning_rate": 9.918150614803956e-06, + "loss": 0.8013, + "step": 2190 + }, + { + "epoch": 0.12059001596125268, + "grad_norm": 0.886012077331543, + "learning_rate": 9.91807248594918e-06, + "loss": 0.8141, + "step": 2191 + }, + { + "epoch": 0.12064505476360834, + "grad_norm": 0.8585757613182068, + "learning_rate": 9.917994320131484e-06, + "loss": 0.8381, + "step": 2192 + }, + { + "epoch": 0.120700093565964, + "grad_norm": 1.6192269325256348, + "learning_rate": 9.917916117351459e-06, + "loss": 0.9082, + "step": 2193 + }, + { + "epoch": 0.12075513236831967, + "grad_norm": 1.160414457321167, + "learning_rate": 9.917837877609695e-06, + "loss": 0.8673, + "step": 2194 + }, + { + "epoch": 0.12081017117067533, + "grad_norm": 0.8363412022590637, + "learning_rate": 9.917759600906775e-06, + "loss": 0.816, + "step": 2195 + }, + { + "epoch": 0.12086520997303099, + "grad_norm": 0.8344097137451172, + "learning_rate": 9.917681287243292e-06, + "loss": 0.8629, + "step": 2196 + }, + { + "epoch": 0.12092024877538665, + "grad_norm": 0.9817582368850708, + "learning_rate": 9.917602936619834e-06, + "loss": 0.8106, + "step": 2197 + }, + { + "epoch": 0.12097528757774231, + "grad_norm": 0.8828088641166687, + "learning_rate": 9.917524549036987e-06, + "loss": 0.8465, + "step": 2198 + }, + { + "epoch": 0.12103032638009797, + "grad_norm": 0.8428277969360352, + "learning_rate": 9.917446124495344e-06, + "loss": 0.7721, + "step": 2199 + }, + { + "epoch": 0.12108536518245364, + "grad_norm": 0.8748664855957031, + "learning_rate": 9.917367662995489e-06, + "loss": 0.8679, + "step": 2200 + }, + { + "epoch": 0.1211404039848093, + "grad_norm": 0.8652347922325134, + "learning_rate": 9.917289164538018e-06, + "loss": 0.8906, + "step": 2201 + }, + { + "epoch": 0.12119544278716494, + "grad_norm": 1.157142162322998, + "learning_rate": 9.917210629123518e-06, + "loss": 0.9046, + "step": 2202 + }, + { + "epoch": 0.1212504815895206, + "grad_norm": 0.8186333179473877, + "learning_rate": 9.917132056752576e-06, + "loss": 0.8494, + "step": 2203 + }, + { + "epoch": 0.12130552039187627, + "grad_norm": 0.7769078612327576, + "learning_rate": 9.917053447425788e-06, + "loss": 0.8018, + "step": 2204 + }, + { + "epoch": 0.12136055919423193, + "grad_norm": 0.9190469980239868, + "learning_rate": 9.916974801143742e-06, + "loss": 0.8206, + "step": 2205 + }, + { + "epoch": 0.12141559799658759, + "grad_norm": 1.2200725078582764, + "learning_rate": 9.91689611790703e-06, + "loss": 0.9109, + "step": 2206 + }, + { + "epoch": 0.12147063679894325, + "grad_norm": 0.7902093529701233, + "learning_rate": 9.916817397716243e-06, + "loss": 0.8314, + "step": 2207 + }, + { + "epoch": 0.12152567560129891, + "grad_norm": 0.8160610198974609, + "learning_rate": 9.91673864057197e-06, + "loss": 0.8605, + "step": 2208 + }, + { + "epoch": 0.12158071440365457, + "grad_norm": 0.833163857460022, + "learning_rate": 9.916659846474807e-06, + "loss": 0.8125, + "step": 2209 + }, + { + "epoch": 0.12163575320601024, + "grad_norm": 0.776314377784729, + "learning_rate": 9.916581015425346e-06, + "loss": 0.8137, + "step": 2210 + }, + { + "epoch": 0.1216907920083659, + "grad_norm": 0.8525915145874023, + "learning_rate": 9.916502147424178e-06, + "loss": 0.8703, + "step": 2211 + }, + { + "epoch": 0.12174583081072156, + "grad_norm": 0.8268684148788452, + "learning_rate": 9.916423242471895e-06, + "loss": 0.7775, + "step": 2212 + }, + { + "epoch": 0.12180086961307722, + "grad_norm": 0.8717706799507141, + "learning_rate": 9.916344300569091e-06, + "loss": 0.8002, + "step": 2213 + }, + { + "epoch": 0.12185590841543288, + "grad_norm": 0.9499961137771606, + "learning_rate": 9.91626532171636e-06, + "loss": 0.8861, + "step": 2214 + }, + { + "epoch": 0.12191094721778854, + "grad_norm": 0.9521885514259338, + "learning_rate": 9.916186305914296e-06, + "loss": 0.7602, + "step": 2215 + }, + { + "epoch": 0.1219659860201442, + "grad_norm": 0.8945447206497192, + "learning_rate": 9.916107253163488e-06, + "loss": 0.8603, + "step": 2216 + }, + { + "epoch": 0.12202102482249987, + "grad_norm": 0.8232392072677612, + "learning_rate": 9.916028163464536e-06, + "loss": 0.8419, + "step": 2217 + }, + { + "epoch": 0.12207606362485553, + "grad_norm": 0.8183467984199524, + "learning_rate": 9.915949036818032e-06, + "loss": 0.9038, + "step": 2218 + }, + { + "epoch": 0.12213110242721119, + "grad_norm": 0.7805467247962952, + "learning_rate": 9.915869873224571e-06, + "loss": 0.7313, + "step": 2219 + }, + { + "epoch": 0.12218614122956685, + "grad_norm": 0.838101327419281, + "learning_rate": 9.915790672684749e-06, + "loss": 0.7973, + "step": 2220 + }, + { + "epoch": 0.12224118003192251, + "grad_norm": 0.7795171141624451, + "learning_rate": 9.915711435199158e-06, + "loss": 0.7796, + "step": 2221 + }, + { + "epoch": 0.12229621883427817, + "grad_norm": 0.7971234917640686, + "learning_rate": 9.915632160768398e-06, + "loss": 0.8309, + "step": 2222 + }, + { + "epoch": 0.12235125763663382, + "grad_norm": 0.8543851375579834, + "learning_rate": 9.915552849393061e-06, + "loss": 0.7826, + "step": 2223 + }, + { + "epoch": 0.12240629643898948, + "grad_norm": 0.9315086007118225, + "learning_rate": 9.915473501073744e-06, + "loss": 0.9294, + "step": 2224 + }, + { + "epoch": 0.12246133524134514, + "grad_norm": 0.8794427514076233, + "learning_rate": 9.915394115811046e-06, + "loss": 0.8968, + "step": 2225 + }, + { + "epoch": 0.1225163740437008, + "grad_norm": 0.9499204754829407, + "learning_rate": 9.91531469360556e-06, + "loss": 0.9841, + "step": 2226 + }, + { + "epoch": 0.12257141284605647, + "grad_norm": 0.9233788251876831, + "learning_rate": 9.915235234457885e-06, + "loss": 0.7794, + "step": 2227 + }, + { + "epoch": 0.12262645164841213, + "grad_norm": 0.8971870541572571, + "learning_rate": 9.915155738368618e-06, + "loss": 0.919, + "step": 2228 + }, + { + "epoch": 0.12268149045076779, + "grad_norm": 0.8122105002403259, + "learning_rate": 9.915076205338356e-06, + "loss": 0.8227, + "step": 2229 + }, + { + "epoch": 0.12273652925312345, + "grad_norm": 0.7878004908561707, + "learning_rate": 9.914996635367696e-06, + "loss": 0.7622, + "step": 2230 + }, + { + "epoch": 0.12279156805547911, + "grad_norm": 0.8229606747627258, + "learning_rate": 9.914917028457238e-06, + "loss": 0.8265, + "step": 2231 + }, + { + "epoch": 0.12284660685783477, + "grad_norm": 0.8972312808036804, + "learning_rate": 9.914837384607578e-06, + "loss": 0.8914, + "step": 2232 + }, + { + "epoch": 0.12290164566019043, + "grad_norm": 0.762922465801239, + "learning_rate": 9.914757703819318e-06, + "loss": 0.6853, + "step": 2233 + }, + { + "epoch": 0.1229566844625461, + "grad_norm": 0.8949442505836487, + "learning_rate": 9.914677986093054e-06, + "loss": 0.8303, + "step": 2234 + }, + { + "epoch": 0.12301172326490176, + "grad_norm": 1.0220820903778076, + "learning_rate": 9.914598231429384e-06, + "loss": 1.0027, + "step": 2235 + }, + { + "epoch": 0.12306676206725742, + "grad_norm": 0.8265436887741089, + "learning_rate": 9.914518439828911e-06, + "loss": 0.8317, + "step": 2236 + }, + { + "epoch": 0.12312180086961308, + "grad_norm": 0.780444324016571, + "learning_rate": 9.914438611292231e-06, + "loss": 0.756, + "step": 2237 + }, + { + "epoch": 0.12317683967196874, + "grad_norm": 0.8569482564926147, + "learning_rate": 9.914358745819948e-06, + "loss": 0.8126, + "step": 2238 + }, + { + "epoch": 0.1232318784743244, + "grad_norm": 0.8167145848274231, + "learning_rate": 9.91427884341266e-06, + "loss": 0.8345, + "step": 2239 + }, + { + "epoch": 0.12328691727668006, + "grad_norm": 0.7915990948677063, + "learning_rate": 9.914198904070967e-06, + "loss": 0.7416, + "step": 2240 + }, + { + "epoch": 0.12334195607903573, + "grad_norm": 0.8568083047866821, + "learning_rate": 9.91411892779547e-06, + "loss": 0.8329, + "step": 2241 + }, + { + "epoch": 0.12339699488139139, + "grad_norm": 1.1727303266525269, + "learning_rate": 9.914038914586772e-06, + "loss": 0.8421, + "step": 2242 + }, + { + "epoch": 0.12345203368374705, + "grad_norm": 0.8706398010253906, + "learning_rate": 9.913958864445472e-06, + "loss": 0.9013, + "step": 2243 + }, + { + "epoch": 0.12350707248610271, + "grad_norm": 0.8376144170761108, + "learning_rate": 9.913878777372173e-06, + "loss": 0.8456, + "step": 2244 + }, + { + "epoch": 0.12356211128845836, + "grad_norm": 0.8388974070549011, + "learning_rate": 9.913798653367478e-06, + "loss": 0.787, + "step": 2245 + }, + { + "epoch": 0.12361715009081402, + "grad_norm": 0.8625446557998657, + "learning_rate": 9.913718492431984e-06, + "loss": 0.7758, + "step": 2246 + }, + { + "epoch": 0.12367218889316968, + "grad_norm": 0.8805570006370544, + "learning_rate": 9.913638294566299e-06, + "loss": 0.8755, + "step": 2247 + }, + { + "epoch": 0.12372722769552534, + "grad_norm": 0.8102611899375916, + "learning_rate": 9.913558059771025e-06, + "loss": 0.8495, + "step": 2248 + }, + { + "epoch": 0.123782266497881, + "grad_norm": 0.8506311774253845, + "learning_rate": 9.913477788046762e-06, + "loss": 0.7413, + "step": 2249 + }, + { + "epoch": 0.12383730530023666, + "grad_norm": 1.0789196491241455, + "learning_rate": 9.913397479394116e-06, + "loss": 0.8993, + "step": 2250 + }, + { + "epoch": 0.12389234410259232, + "grad_norm": 1.5664849281311035, + "learning_rate": 9.91331713381369e-06, + "loss": 0.8322, + "step": 2251 + }, + { + "epoch": 0.12394738290494799, + "grad_norm": 1.1347390413284302, + "learning_rate": 9.913236751306085e-06, + "loss": 0.8756, + "step": 2252 + }, + { + "epoch": 0.12400242170730365, + "grad_norm": 0.8111063241958618, + "learning_rate": 9.913156331871911e-06, + "loss": 0.831, + "step": 2253 + }, + { + "epoch": 0.12405746050965931, + "grad_norm": 0.817812979221344, + "learning_rate": 9.913075875511769e-06, + "loss": 0.8531, + "step": 2254 + }, + { + "epoch": 0.12411249931201497, + "grad_norm": 0.7678318619728088, + "learning_rate": 9.912995382226263e-06, + "loss": 0.8028, + "step": 2255 + }, + { + "epoch": 0.12416753811437063, + "grad_norm": 0.8207805156707764, + "learning_rate": 9.912914852015998e-06, + "loss": 0.8856, + "step": 2256 + }, + { + "epoch": 0.1242225769167263, + "grad_norm": 0.978484570980072, + "learning_rate": 9.912834284881582e-06, + "loss": 0.933, + "step": 2257 + }, + { + "epoch": 0.12427761571908195, + "grad_norm": 0.9215858578681946, + "learning_rate": 9.912753680823617e-06, + "loss": 0.7771, + "step": 2258 + }, + { + "epoch": 0.12433265452143762, + "grad_norm": 0.8542179465293884, + "learning_rate": 9.91267303984271e-06, + "loss": 0.8652, + "step": 2259 + }, + { + "epoch": 0.12438769332379328, + "grad_norm": 0.7985575199127197, + "learning_rate": 9.912592361939469e-06, + "loss": 0.7011, + "step": 2260 + }, + { + "epoch": 0.12444273212614894, + "grad_norm": 0.8868670463562012, + "learning_rate": 9.912511647114498e-06, + "loss": 0.8222, + "step": 2261 + }, + { + "epoch": 0.1244977709285046, + "grad_norm": 0.7966209650039673, + "learning_rate": 9.912430895368405e-06, + "loss": 0.776, + "step": 2262 + }, + { + "epoch": 0.12455280973086026, + "grad_norm": 0.7844830751419067, + "learning_rate": 9.912350106701796e-06, + "loss": 0.7513, + "step": 2263 + }, + { + "epoch": 0.12460784853321592, + "grad_norm": 0.7788559794425964, + "learning_rate": 9.912269281115278e-06, + "loss": 0.8517, + "step": 2264 + }, + { + "epoch": 0.12466288733557158, + "grad_norm": 0.778225839138031, + "learning_rate": 9.912188418609461e-06, + "loss": 0.7504, + "step": 2265 + }, + { + "epoch": 0.12471792613792723, + "grad_norm": 0.7955968976020813, + "learning_rate": 9.912107519184947e-06, + "loss": 0.8152, + "step": 2266 + }, + { + "epoch": 0.1247729649402829, + "grad_norm": 1.1202566623687744, + "learning_rate": 9.912026582842352e-06, + "loss": 0.9325, + "step": 2267 + }, + { + "epoch": 0.12482800374263855, + "grad_norm": 0.9762749671936035, + "learning_rate": 9.911945609582279e-06, + "loss": 0.9027, + "step": 2268 + }, + { + "epoch": 0.12488304254499422, + "grad_norm": 0.8311051726341248, + "learning_rate": 9.911864599405336e-06, + "loss": 0.838, + "step": 2269 + }, + { + "epoch": 0.12493808134734988, + "grad_norm": 1.0136815309524536, + "learning_rate": 9.911783552312134e-06, + "loss": 0.9288, + "step": 2270 + }, + { + "epoch": 0.12499312014970554, + "grad_norm": 0.7960494160652161, + "learning_rate": 9.911702468303282e-06, + "loss": 0.8007, + "step": 2271 + }, + { + "epoch": 0.1250481589520612, + "grad_norm": 0.9980880618095398, + "learning_rate": 9.911621347379388e-06, + "loss": 0.8613, + "step": 2272 + }, + { + "epoch": 0.12510319775441686, + "grad_norm": 0.8916807770729065, + "learning_rate": 9.911540189541065e-06, + "loss": 0.8783, + "step": 2273 + }, + { + "epoch": 0.12515823655677252, + "grad_norm": 0.9455892443656921, + "learning_rate": 9.911458994788919e-06, + "loss": 0.8676, + "step": 2274 + }, + { + "epoch": 0.12521327535912818, + "grad_norm": 0.7649906277656555, + "learning_rate": 9.911377763123561e-06, + "loss": 0.7763, + "step": 2275 + }, + { + "epoch": 0.12526831416148385, + "grad_norm": 0.8971202373504639, + "learning_rate": 9.911296494545604e-06, + "loss": 0.9022, + "step": 2276 + }, + { + "epoch": 0.1253233529638395, + "grad_norm": 0.833678126335144, + "learning_rate": 9.911215189055657e-06, + "loss": 0.8401, + "step": 2277 + }, + { + "epoch": 0.12537839176619517, + "grad_norm": 0.8967958688735962, + "learning_rate": 9.911133846654331e-06, + "loss": 0.8678, + "step": 2278 + }, + { + "epoch": 0.12543343056855083, + "grad_norm": 0.8195546865463257, + "learning_rate": 9.911052467342239e-06, + "loss": 0.842, + "step": 2279 + }, + { + "epoch": 0.1254884693709065, + "grad_norm": 1.095815896987915, + "learning_rate": 9.910971051119988e-06, + "loss": 0.845, + "step": 2280 + }, + { + "epoch": 0.12554350817326215, + "grad_norm": 0.9452629685401917, + "learning_rate": 9.910889597988197e-06, + "loss": 0.8971, + "step": 2281 + }, + { + "epoch": 0.12559854697561781, + "grad_norm": 0.9872332215309143, + "learning_rate": 9.910808107947471e-06, + "loss": 0.7994, + "step": 2282 + }, + { + "epoch": 0.12565358577797348, + "grad_norm": 0.7761966586112976, + "learning_rate": 9.910726580998427e-06, + "loss": 0.7791, + "step": 2283 + }, + { + "epoch": 0.12570862458032914, + "grad_norm": 0.8950315713882446, + "learning_rate": 9.910645017141678e-06, + "loss": 0.8499, + "step": 2284 + }, + { + "epoch": 0.1257636633826848, + "grad_norm": 0.8796371221542358, + "learning_rate": 9.910563416377834e-06, + "loss": 0.8587, + "step": 2285 + }, + { + "epoch": 0.12581870218504046, + "grad_norm": 0.8291982412338257, + "learning_rate": 9.91048177870751e-06, + "loss": 0.9166, + "step": 2286 + }, + { + "epoch": 0.12587374098739612, + "grad_norm": 0.758369505405426, + "learning_rate": 9.91040010413132e-06, + "loss": 0.8305, + "step": 2287 + }, + { + "epoch": 0.12592877978975178, + "grad_norm": 0.8775640726089478, + "learning_rate": 9.910318392649876e-06, + "loss": 0.8513, + "step": 2288 + }, + { + "epoch": 0.12598381859210744, + "grad_norm": 0.8581671118736267, + "learning_rate": 9.910236644263796e-06, + "loss": 0.8134, + "step": 2289 + }, + { + "epoch": 0.1260388573944631, + "grad_norm": 0.8570736050605774, + "learning_rate": 9.910154858973689e-06, + "loss": 0.826, + "step": 2290 + }, + { + "epoch": 0.12609389619681877, + "grad_norm": 0.8712487816810608, + "learning_rate": 9.910073036780173e-06, + "loss": 0.8042, + "step": 2291 + }, + { + "epoch": 0.12614893499917443, + "grad_norm": 0.7584837675094604, + "learning_rate": 9.909991177683862e-06, + "loss": 0.7715, + "step": 2292 + }, + { + "epoch": 0.1262039738015301, + "grad_norm": 0.8618917465209961, + "learning_rate": 9.909909281685373e-06, + "loss": 0.8755, + "step": 2293 + }, + { + "epoch": 0.12625901260388575, + "grad_norm": 0.9530277848243713, + "learning_rate": 9.90982734878532e-06, + "loss": 0.8538, + "step": 2294 + }, + { + "epoch": 0.1263140514062414, + "grad_norm": 0.8394436836242676, + "learning_rate": 9.909745378984319e-06, + "loss": 0.8401, + "step": 2295 + }, + { + "epoch": 0.12636909020859707, + "grad_norm": 0.8224034309387207, + "learning_rate": 9.909663372282984e-06, + "loss": 0.7201, + "step": 2296 + }, + { + "epoch": 0.12642412901095273, + "grad_norm": 0.8215349912643433, + "learning_rate": 9.909581328681934e-06, + "loss": 0.8824, + "step": 2297 + }, + { + "epoch": 0.12647916781330837, + "grad_norm": 0.839389443397522, + "learning_rate": 9.909499248181786e-06, + "loss": 0.8056, + "step": 2298 + }, + { + "epoch": 0.12653420661566403, + "grad_norm": 0.9440048933029175, + "learning_rate": 9.909417130783156e-06, + "loss": 0.908, + "step": 2299 + }, + { + "epoch": 0.1265892454180197, + "grad_norm": 0.8336486220359802, + "learning_rate": 9.90933497648666e-06, + "loss": 0.8382, + "step": 2300 + }, + { + "epoch": 0.12664428422037535, + "grad_norm": 1.1541366577148438, + "learning_rate": 9.909252785292918e-06, + "loss": 0.8782, + "step": 2301 + }, + { + "epoch": 0.12669932302273101, + "grad_norm": 0.8730320334434509, + "learning_rate": 9.909170557202545e-06, + "loss": 0.7687, + "step": 2302 + }, + { + "epoch": 0.12675436182508668, + "grad_norm": 0.9927527904510498, + "learning_rate": 9.90908829221616e-06, + "loss": 0.8134, + "step": 2303 + }, + { + "epoch": 0.12680940062744234, + "grad_norm": 0.9521791338920593, + "learning_rate": 9.909005990334381e-06, + "loss": 0.9187, + "step": 2304 + }, + { + "epoch": 0.126864439429798, + "grad_norm": 0.8012455701828003, + "learning_rate": 9.908923651557828e-06, + "loss": 0.8581, + "step": 2305 + }, + { + "epoch": 0.12691947823215366, + "grad_norm": 0.8882689476013184, + "learning_rate": 9.90884127588712e-06, + "loss": 0.9317, + "step": 2306 + }, + { + "epoch": 0.12697451703450932, + "grad_norm": 0.8408340215682983, + "learning_rate": 9.908758863322872e-06, + "loss": 0.8444, + "step": 2307 + }, + { + "epoch": 0.12702955583686498, + "grad_norm": 0.7856307029724121, + "learning_rate": 9.908676413865709e-06, + "loss": 0.8457, + "step": 2308 + }, + { + "epoch": 0.12708459463922064, + "grad_norm": 0.9459167718887329, + "learning_rate": 9.908593927516247e-06, + "loss": 0.8153, + "step": 2309 + }, + { + "epoch": 0.1271396334415763, + "grad_norm": 0.8629655838012695, + "learning_rate": 9.908511404275107e-06, + "loss": 0.8279, + "step": 2310 + }, + { + "epoch": 0.12719467224393197, + "grad_norm": 1.2012875080108643, + "learning_rate": 9.90842884414291e-06, + "loss": 1.4388, + "step": 2311 + }, + { + "epoch": 0.12724971104628763, + "grad_norm": 1.20725417137146, + "learning_rate": 9.908346247120274e-06, + "loss": 0.8704, + "step": 2312 + }, + { + "epoch": 0.1273047498486433, + "grad_norm": 0.8152929544448853, + "learning_rate": 9.908263613207822e-06, + "loss": 0.8618, + "step": 2313 + }, + { + "epoch": 0.12735978865099895, + "grad_norm": 0.8400965332984924, + "learning_rate": 9.908180942406175e-06, + "loss": 0.7881, + "step": 2314 + }, + { + "epoch": 0.1274148274533546, + "grad_norm": 0.8856974840164185, + "learning_rate": 9.908098234715956e-06, + "loss": 0.9073, + "step": 2315 + }, + { + "epoch": 0.12746986625571027, + "grad_norm": 0.8708439469337463, + "learning_rate": 9.908015490137782e-06, + "loss": 0.8099, + "step": 2316 + }, + { + "epoch": 0.12752490505806593, + "grad_norm": 0.8632444143295288, + "learning_rate": 9.907932708672277e-06, + "loss": 0.8472, + "step": 2317 + }, + { + "epoch": 0.1275799438604216, + "grad_norm": 0.8977149128913879, + "learning_rate": 9.907849890320062e-06, + "loss": 0.8878, + "step": 2318 + }, + { + "epoch": 0.12763498266277726, + "grad_norm": 0.8589425086975098, + "learning_rate": 9.907767035081765e-06, + "loss": 0.7905, + "step": 2319 + }, + { + "epoch": 0.12769002146513292, + "grad_norm": 0.9873501062393188, + "learning_rate": 9.907684142958002e-06, + "loss": 0.9002, + "step": 2320 + }, + { + "epoch": 0.12774506026748858, + "grad_norm": 0.8963840007781982, + "learning_rate": 9.9076012139494e-06, + "loss": 0.92, + "step": 2321 + }, + { + "epoch": 0.12780009906984424, + "grad_norm": 0.7933574318885803, + "learning_rate": 9.90751824805658e-06, + "loss": 0.7664, + "step": 2322 + }, + { + "epoch": 0.1278551378721999, + "grad_norm": 0.9660933017730713, + "learning_rate": 9.907435245280167e-06, + "loss": 0.9162, + "step": 2323 + }, + { + "epoch": 0.12791017667455556, + "grad_norm": 0.8698949217796326, + "learning_rate": 9.907352205620783e-06, + "loss": 0.7988, + "step": 2324 + }, + { + "epoch": 0.12796521547691123, + "grad_norm": 0.9077615141868591, + "learning_rate": 9.907269129079055e-06, + "loss": 0.8581, + "step": 2325 + }, + { + "epoch": 0.1280202542792669, + "grad_norm": 0.9128179550170898, + "learning_rate": 9.907186015655607e-06, + "loss": 0.8552, + "step": 2326 + }, + { + "epoch": 0.12807529308162255, + "grad_norm": 0.9321265816688538, + "learning_rate": 9.907102865351062e-06, + "loss": 0.889, + "step": 2327 + }, + { + "epoch": 0.1281303318839782, + "grad_norm": 0.9687464833259583, + "learning_rate": 9.907019678166044e-06, + "loss": 0.7944, + "step": 2328 + }, + { + "epoch": 0.12818537068633387, + "grad_norm": 0.862223207950592, + "learning_rate": 9.90693645410118e-06, + "loss": 0.7699, + "step": 2329 + }, + { + "epoch": 0.12824040948868953, + "grad_norm": 0.9662127494812012, + "learning_rate": 9.906853193157095e-06, + "loss": 0.7818, + "step": 2330 + }, + { + "epoch": 0.1282954482910452, + "grad_norm": 0.8008295297622681, + "learning_rate": 9.906769895334413e-06, + "loss": 0.8443, + "step": 2331 + }, + { + "epoch": 0.12835048709340086, + "grad_norm": 0.8638464212417603, + "learning_rate": 9.906686560633765e-06, + "loss": 0.8438, + "step": 2332 + }, + { + "epoch": 0.12840552589575652, + "grad_norm": 0.9215866327285767, + "learning_rate": 9.906603189055773e-06, + "loss": 0.7481, + "step": 2333 + }, + { + "epoch": 0.12846056469811218, + "grad_norm": 0.7926739454269409, + "learning_rate": 9.906519780601066e-06, + "loss": 0.7404, + "step": 2334 + }, + { + "epoch": 0.12851560350046784, + "grad_norm": 0.9590242505073547, + "learning_rate": 9.906436335270268e-06, + "loss": 0.8319, + "step": 2335 + }, + { + "epoch": 0.1285706423028235, + "grad_norm": 1.0300076007843018, + "learning_rate": 9.906352853064009e-06, + "loss": 0.8635, + "step": 2336 + }, + { + "epoch": 0.12862568110517916, + "grad_norm": 0.8401443958282471, + "learning_rate": 9.906269333982915e-06, + "loss": 0.9584, + "step": 2337 + }, + { + "epoch": 0.12868071990753482, + "grad_norm": 0.8144069910049438, + "learning_rate": 9.906185778027613e-06, + "loss": 0.7375, + "step": 2338 + }, + { + "epoch": 0.12873575870989049, + "grad_norm": 0.8513948917388916, + "learning_rate": 9.906102185198733e-06, + "loss": 0.8353, + "step": 2339 + }, + { + "epoch": 0.12879079751224615, + "grad_norm": 0.8243077397346497, + "learning_rate": 9.906018555496903e-06, + "loss": 0.8665, + "step": 2340 + }, + { + "epoch": 0.12884583631460178, + "grad_norm": 0.8699066042900085, + "learning_rate": 9.905934888922749e-06, + "loss": 0.8537, + "step": 2341 + }, + { + "epoch": 0.12890087511695744, + "grad_norm": 1.0980210304260254, + "learning_rate": 9.905851185476902e-06, + "loss": 0.8887, + "step": 2342 + }, + { + "epoch": 0.1289559139193131, + "grad_norm": 0.8189190030097961, + "learning_rate": 9.905767445159992e-06, + "loss": 0.8467, + "step": 2343 + }, + { + "epoch": 0.12901095272166876, + "grad_norm": 0.8273541331291199, + "learning_rate": 9.905683667972645e-06, + "loss": 0.8701, + "step": 2344 + }, + { + "epoch": 0.12906599152402443, + "grad_norm": 0.8987969160079956, + "learning_rate": 9.905599853915496e-06, + "loss": 0.909, + "step": 2345 + }, + { + "epoch": 0.1291210303263801, + "grad_norm": 0.818268895149231, + "learning_rate": 9.905516002989168e-06, + "loss": 0.7946, + "step": 2346 + }, + { + "epoch": 0.12917606912873575, + "grad_norm": 0.7401725053787231, + "learning_rate": 9.905432115194296e-06, + "loss": 0.7006, + "step": 2347 + }, + { + "epoch": 0.1292311079310914, + "grad_norm": 0.8263179659843445, + "learning_rate": 9.905348190531511e-06, + "loss": 0.7768, + "step": 2348 + }, + { + "epoch": 0.12928614673344707, + "grad_norm": 0.9241918921470642, + "learning_rate": 9.90526422900144e-06, + "loss": 0.8593, + "step": 2349 + }, + { + "epoch": 0.12934118553580273, + "grad_norm": 0.7804501056671143, + "learning_rate": 9.905180230604718e-06, + "loss": 0.7607, + "step": 2350 + }, + { + "epoch": 0.1293962243381584, + "grad_norm": 0.9408491253852844, + "learning_rate": 9.905096195341973e-06, + "loss": 0.8906, + "step": 2351 + }, + { + "epoch": 0.12945126314051406, + "grad_norm": 1.0356301069259644, + "learning_rate": 9.905012123213838e-06, + "loss": 0.8051, + "step": 2352 + }, + { + "epoch": 0.12950630194286972, + "grad_norm": 0.8546886444091797, + "learning_rate": 9.904928014220945e-06, + "loss": 0.7543, + "step": 2353 + }, + { + "epoch": 0.12956134074522538, + "grad_norm": 0.9229897856712341, + "learning_rate": 9.904843868363927e-06, + "loss": 0.8823, + "step": 2354 + }, + { + "epoch": 0.12961637954758104, + "grad_norm": 0.8364199995994568, + "learning_rate": 9.904759685643414e-06, + "loss": 0.8825, + "step": 2355 + }, + { + "epoch": 0.1296714183499367, + "grad_norm": 0.9092077016830444, + "learning_rate": 9.90467546606004e-06, + "loss": 0.8721, + "step": 2356 + }, + { + "epoch": 0.12972645715229236, + "grad_norm": 1.042973518371582, + "learning_rate": 9.904591209614441e-06, + "loss": 0.7984, + "step": 2357 + }, + { + "epoch": 0.12978149595464802, + "grad_norm": 0.7262618541717529, + "learning_rate": 9.904506916307243e-06, + "loss": 0.6721, + "step": 2358 + }, + { + "epoch": 0.12983653475700369, + "grad_norm": 0.7562826871871948, + "learning_rate": 9.904422586139086e-06, + "loss": 0.7702, + "step": 2359 + }, + { + "epoch": 0.12989157355935935, + "grad_norm": 0.8821595907211304, + "learning_rate": 9.904338219110603e-06, + "loss": 0.8555, + "step": 2360 + }, + { + "epoch": 0.129946612361715, + "grad_norm": 1.0340098142623901, + "learning_rate": 9.904253815222424e-06, + "loss": 0.9004, + "step": 2361 + }, + { + "epoch": 0.13000165116407067, + "grad_norm": 0.8533693552017212, + "learning_rate": 9.904169374475188e-06, + "loss": 0.836, + "step": 2362 + }, + { + "epoch": 0.13005668996642633, + "grad_norm": 0.8564199805259705, + "learning_rate": 9.904084896869528e-06, + "loss": 0.9281, + "step": 2363 + }, + { + "epoch": 0.130111728768782, + "grad_norm": 0.7817538976669312, + "learning_rate": 9.904000382406079e-06, + "loss": 0.7444, + "step": 2364 + }, + { + "epoch": 0.13016676757113765, + "grad_norm": 1.1420893669128418, + "learning_rate": 9.903915831085473e-06, + "loss": 0.9116, + "step": 2365 + }, + { + "epoch": 0.13022180637349332, + "grad_norm": 0.9671920537948608, + "learning_rate": 9.903831242908351e-06, + "loss": 0.899, + "step": 2366 + }, + { + "epoch": 0.13027684517584898, + "grad_norm": 0.8528717756271362, + "learning_rate": 9.903746617875345e-06, + "loss": 0.7231, + "step": 2367 + }, + { + "epoch": 0.13033188397820464, + "grad_norm": 0.786960244178772, + "learning_rate": 9.903661955987091e-06, + "loss": 0.7997, + "step": 2368 + }, + { + "epoch": 0.1303869227805603, + "grad_norm": 0.941683292388916, + "learning_rate": 9.903577257244228e-06, + "loss": 0.9127, + "step": 2369 + }, + { + "epoch": 0.13044196158291596, + "grad_norm": 0.886900007724762, + "learning_rate": 9.903492521647391e-06, + "loss": 0.9086, + "step": 2370 + }, + { + "epoch": 0.13049700038527162, + "grad_norm": 0.9924801588058472, + "learning_rate": 9.903407749197216e-06, + "loss": 0.9055, + "step": 2371 + }, + { + "epoch": 0.13055203918762728, + "grad_norm": 0.6998724341392517, + "learning_rate": 9.903322939894342e-06, + "loss": 0.6972, + "step": 2372 + }, + { + "epoch": 0.13060707798998294, + "grad_norm": 0.8448702096939087, + "learning_rate": 9.903238093739404e-06, + "loss": 0.7862, + "step": 2373 + }, + { + "epoch": 0.1306621167923386, + "grad_norm": 0.8557441830635071, + "learning_rate": 9.90315321073304e-06, + "loss": 0.8364, + "step": 2374 + }, + { + "epoch": 0.13071715559469427, + "grad_norm": 0.7978441119194031, + "learning_rate": 9.903068290875892e-06, + "loss": 0.7671, + "step": 2375 + }, + { + "epoch": 0.13077219439704993, + "grad_norm": 0.781315803527832, + "learning_rate": 9.902983334168594e-06, + "loss": 0.7963, + "step": 2376 + }, + { + "epoch": 0.1308272331994056, + "grad_norm": 0.7326155304908752, + "learning_rate": 9.902898340611785e-06, + "loss": 0.8, + "step": 2377 + }, + { + "epoch": 0.13088227200176125, + "grad_norm": 0.7693139314651489, + "learning_rate": 9.902813310206105e-06, + "loss": 0.8459, + "step": 2378 + }, + { + "epoch": 0.1309373108041169, + "grad_norm": 0.9441308975219727, + "learning_rate": 9.902728242952191e-06, + "loss": 0.8519, + "step": 2379 + }, + { + "epoch": 0.13099234960647257, + "grad_norm": 0.8350616693496704, + "learning_rate": 9.902643138850686e-06, + "loss": 0.876, + "step": 2380 + }, + { + "epoch": 0.13104738840882824, + "grad_norm": 0.8675554394721985, + "learning_rate": 9.902557997902227e-06, + "loss": 0.8172, + "step": 2381 + }, + { + "epoch": 0.1311024272111839, + "grad_norm": 0.9618930220603943, + "learning_rate": 9.902472820107454e-06, + "loss": 0.8852, + "step": 2382 + }, + { + "epoch": 0.13115746601353956, + "grad_norm": 0.862341046333313, + "learning_rate": 9.902387605467007e-06, + "loss": 0.9256, + "step": 2383 + }, + { + "epoch": 0.1312125048158952, + "grad_norm": 0.8749859929084778, + "learning_rate": 9.902302353981527e-06, + "loss": 0.8809, + "step": 2384 + }, + { + "epoch": 0.13126754361825085, + "grad_norm": 0.9061958193778992, + "learning_rate": 9.902217065651657e-06, + "loss": 0.779, + "step": 2385 + }, + { + "epoch": 0.13132258242060652, + "grad_norm": 0.8909298777580261, + "learning_rate": 9.902131740478033e-06, + "loss": 0.8203, + "step": 2386 + }, + { + "epoch": 0.13137762122296218, + "grad_norm": 0.8507269024848938, + "learning_rate": 9.902046378461302e-06, + "loss": 0.776, + "step": 2387 + }, + { + "epoch": 0.13143266002531784, + "grad_norm": 0.9577299356460571, + "learning_rate": 9.901960979602101e-06, + "loss": 0.8104, + "step": 2388 + }, + { + "epoch": 0.1314876988276735, + "grad_norm": 0.9244948625564575, + "learning_rate": 9.901875543901074e-06, + "loss": 0.9035, + "step": 2389 + }, + { + "epoch": 0.13154273763002916, + "grad_norm": 0.7534334063529968, + "learning_rate": 9.901790071358861e-06, + "loss": 0.7262, + "step": 2390 + }, + { + "epoch": 0.13159777643238482, + "grad_norm": 0.8920090198516846, + "learning_rate": 9.901704561976106e-06, + "loss": 0.932, + "step": 2391 + }, + { + "epoch": 0.13165281523474048, + "grad_norm": 0.8524243235588074, + "learning_rate": 9.901619015753455e-06, + "loss": 0.8107, + "step": 2392 + }, + { + "epoch": 0.13170785403709614, + "grad_norm": 0.8170381784439087, + "learning_rate": 9.901533432691543e-06, + "loss": 0.8814, + "step": 2393 + }, + { + "epoch": 0.1317628928394518, + "grad_norm": 0.8281697034835815, + "learning_rate": 9.90144781279102e-06, + "loss": 0.8221, + "step": 2394 + }, + { + "epoch": 0.13181793164180747, + "grad_norm": 0.9283351302146912, + "learning_rate": 9.901362156052528e-06, + "loss": 0.8346, + "step": 2395 + }, + { + "epoch": 0.13187297044416313, + "grad_norm": 0.8331275582313538, + "learning_rate": 9.901276462476708e-06, + "loss": 0.7498, + "step": 2396 + }, + { + "epoch": 0.1319280092465188, + "grad_norm": 0.8427191972732544, + "learning_rate": 9.901190732064207e-06, + "loss": 0.8265, + "step": 2397 + }, + { + "epoch": 0.13198304804887445, + "grad_norm": 0.8510351777076721, + "learning_rate": 9.901104964815669e-06, + "loss": 0.8369, + "step": 2398 + }, + { + "epoch": 0.1320380868512301, + "grad_norm": 0.8468914031982422, + "learning_rate": 9.901019160731738e-06, + "loss": 0.8585, + "step": 2399 + }, + { + "epoch": 0.13209312565358577, + "grad_norm": 0.8302182555198669, + "learning_rate": 9.900933319813058e-06, + "loss": 0.8611, + "step": 2400 + }, + { + "epoch": 0.13214816445594144, + "grad_norm": 0.8527448773384094, + "learning_rate": 9.900847442060277e-06, + "loss": 0.899, + "step": 2401 + }, + { + "epoch": 0.1322032032582971, + "grad_norm": 0.8354688286781311, + "learning_rate": 9.900761527474037e-06, + "loss": 0.8083, + "step": 2402 + }, + { + "epoch": 0.13225824206065276, + "grad_norm": 0.8612173795700073, + "learning_rate": 9.900675576054986e-06, + "loss": 0.8124, + "step": 2403 + }, + { + "epoch": 0.13231328086300842, + "grad_norm": 0.7424876689910889, + "learning_rate": 9.900589587803767e-06, + "loss": 0.6884, + "step": 2404 + }, + { + "epoch": 0.13236831966536408, + "grad_norm": 0.8431115746498108, + "learning_rate": 9.90050356272103e-06, + "loss": 0.9575, + "step": 2405 + }, + { + "epoch": 0.13242335846771974, + "grad_norm": 0.7958092093467712, + "learning_rate": 9.90041750080742e-06, + "loss": 0.7608, + "step": 2406 + }, + { + "epoch": 0.1324783972700754, + "grad_norm": 0.926258385181427, + "learning_rate": 9.900331402063583e-06, + "loss": 0.9072, + "step": 2407 + }, + { + "epoch": 0.13253343607243107, + "grad_norm": 0.7952526807785034, + "learning_rate": 9.900245266490169e-06, + "loss": 0.8001, + "step": 2408 + }, + { + "epoch": 0.13258847487478673, + "grad_norm": 0.8309933543205261, + "learning_rate": 9.900159094087822e-06, + "loss": 0.9154, + "step": 2409 + }, + { + "epoch": 0.1326435136771424, + "grad_norm": 0.858007550239563, + "learning_rate": 9.90007288485719e-06, + "loss": 0.855, + "step": 2410 + }, + { + "epoch": 0.13269855247949805, + "grad_norm": 0.9513822197914124, + "learning_rate": 9.899986638798923e-06, + "loss": 0.8162, + "step": 2411 + }, + { + "epoch": 0.1327535912818537, + "grad_norm": 0.8387427926063538, + "learning_rate": 9.899900355913668e-06, + "loss": 0.8955, + "step": 2412 + }, + { + "epoch": 0.13280863008420937, + "grad_norm": 0.7727940678596497, + "learning_rate": 9.899814036202073e-06, + "loss": 0.6765, + "step": 2413 + }, + { + "epoch": 0.13286366888656503, + "grad_norm": 0.7760928869247437, + "learning_rate": 9.899727679664788e-06, + "loss": 0.7179, + "step": 2414 + }, + { + "epoch": 0.1329187076889207, + "grad_norm": 0.7798073887825012, + "learning_rate": 9.899641286302462e-06, + "loss": 0.8541, + "step": 2415 + }, + { + "epoch": 0.13297374649127636, + "grad_norm": 0.8302769660949707, + "learning_rate": 9.899554856115743e-06, + "loss": 0.8925, + "step": 2416 + }, + { + "epoch": 0.13302878529363202, + "grad_norm": 0.8300751447677612, + "learning_rate": 9.89946838910528e-06, + "loss": 0.7489, + "step": 2417 + }, + { + "epoch": 0.13308382409598768, + "grad_norm": 0.8032094240188599, + "learning_rate": 9.899381885271725e-06, + "loss": 0.811, + "step": 2418 + }, + { + "epoch": 0.13313886289834334, + "grad_norm": 5.237870216369629, + "learning_rate": 9.899295344615727e-06, + "loss": 0.7609, + "step": 2419 + }, + { + "epoch": 0.133193901700699, + "grad_norm": 0.8145740628242493, + "learning_rate": 9.899208767137935e-06, + "loss": 0.8435, + "step": 2420 + }, + { + "epoch": 0.13324894050305466, + "grad_norm": 0.9716018438339233, + "learning_rate": 9.899122152839004e-06, + "loss": 0.7924, + "step": 2421 + }, + { + "epoch": 0.13330397930541033, + "grad_norm": 0.7846183776855469, + "learning_rate": 9.899035501719582e-06, + "loss": 0.8941, + "step": 2422 + }, + { + "epoch": 0.133359018107766, + "grad_norm": 0.7653689980506897, + "learning_rate": 9.89894881378032e-06, + "loss": 0.811, + "step": 2423 + }, + { + "epoch": 0.13341405691012165, + "grad_norm": 0.8221875429153442, + "learning_rate": 9.89886208902187e-06, + "loss": 0.8131, + "step": 2424 + }, + { + "epoch": 0.1334690957124773, + "grad_norm": 0.7422335147857666, + "learning_rate": 9.898775327444885e-06, + "loss": 0.6366, + "step": 2425 + }, + { + "epoch": 0.13352413451483297, + "grad_norm": 0.8072695136070251, + "learning_rate": 9.898688529050014e-06, + "loss": 0.7989, + "step": 2426 + }, + { + "epoch": 0.1335791733171886, + "grad_norm": 0.7717600464820862, + "learning_rate": 9.898601693837911e-06, + "loss": 0.7524, + "step": 2427 + }, + { + "epoch": 0.13363421211954427, + "grad_norm": 0.8070919513702393, + "learning_rate": 9.898514821809231e-06, + "loss": 0.7724, + "step": 2428 + }, + { + "epoch": 0.13368925092189993, + "grad_norm": 0.8184726238250732, + "learning_rate": 9.898427912964624e-06, + "loss": 0.845, + "step": 2429 + }, + { + "epoch": 0.1337442897242556, + "grad_norm": 0.8168759346008301, + "learning_rate": 9.898340967304744e-06, + "loss": 0.8377, + "step": 2430 + }, + { + "epoch": 0.13379932852661125, + "grad_norm": 0.8701872825622559, + "learning_rate": 9.898253984830244e-06, + "loss": 0.908, + "step": 2431 + }, + { + "epoch": 0.1338543673289669, + "grad_norm": 0.8092133402824402, + "learning_rate": 9.898166965541779e-06, + "loss": 0.866, + "step": 2432 + }, + { + "epoch": 0.13390940613132257, + "grad_norm": 0.8337095975875854, + "learning_rate": 9.898079909440002e-06, + "loss": 0.8622, + "step": 2433 + }, + { + "epoch": 0.13396444493367823, + "grad_norm": 1.1016209125518799, + "learning_rate": 9.897992816525567e-06, + "loss": 0.8486, + "step": 2434 + }, + { + "epoch": 0.1340194837360339, + "grad_norm": 0.8136518597602844, + "learning_rate": 9.89790568679913e-06, + "loss": 0.8681, + "step": 2435 + }, + { + "epoch": 0.13407452253838956, + "grad_norm": 0.8202341794967651, + "learning_rate": 9.897818520261344e-06, + "loss": 0.9144, + "step": 2436 + }, + { + "epoch": 0.13412956134074522, + "grad_norm": 0.8836861848831177, + "learning_rate": 9.897731316912866e-06, + "loss": 0.8643, + "step": 2437 + }, + { + "epoch": 0.13418460014310088, + "grad_norm": 0.9040210247039795, + "learning_rate": 9.89764407675435e-06, + "loss": 0.7681, + "step": 2438 + }, + { + "epoch": 0.13423963894545654, + "grad_norm": 0.8762359619140625, + "learning_rate": 9.897556799786452e-06, + "loss": 0.8765, + "step": 2439 + }, + { + "epoch": 0.1342946777478122, + "grad_norm": 0.8859462738037109, + "learning_rate": 9.897469486009827e-06, + "loss": 0.9051, + "step": 2440 + }, + { + "epoch": 0.13434971655016786, + "grad_norm": 0.7727539539337158, + "learning_rate": 9.897382135425134e-06, + "loss": 0.7397, + "step": 2441 + }, + { + "epoch": 0.13440475535252353, + "grad_norm": 0.9018967151641846, + "learning_rate": 9.897294748033028e-06, + "loss": 0.8542, + "step": 2442 + }, + { + "epoch": 0.1344597941548792, + "grad_norm": 0.8228337168693542, + "learning_rate": 9.897207323834165e-06, + "loss": 0.7585, + "step": 2443 + }, + { + "epoch": 0.13451483295723485, + "grad_norm": 0.7509974241256714, + "learning_rate": 9.897119862829203e-06, + "loss": 0.7285, + "step": 2444 + }, + { + "epoch": 0.1345698717595905, + "grad_norm": 0.9225835800170898, + "learning_rate": 9.897032365018797e-06, + "loss": 0.8352, + "step": 2445 + }, + { + "epoch": 0.13462491056194617, + "grad_norm": 0.800981879234314, + "learning_rate": 9.896944830403609e-06, + "loss": 0.7352, + "step": 2446 + }, + { + "epoch": 0.13467994936430183, + "grad_norm": 0.8263673186302185, + "learning_rate": 9.896857258984294e-06, + "loss": 0.8426, + "step": 2447 + }, + { + "epoch": 0.1347349881666575, + "grad_norm": 0.8857110738754272, + "learning_rate": 9.89676965076151e-06, + "loss": 0.8078, + "step": 2448 + }, + { + "epoch": 0.13479002696901315, + "grad_norm": 0.8637158274650574, + "learning_rate": 9.896682005735916e-06, + "loss": 0.8688, + "step": 2449 + }, + { + "epoch": 0.13484506577136882, + "grad_norm": 0.9050095081329346, + "learning_rate": 9.89659432390817e-06, + "loss": 0.831, + "step": 2450 + }, + { + "epoch": 0.13490010457372448, + "grad_norm": 0.829757034778595, + "learning_rate": 9.896506605278933e-06, + "loss": 0.8095, + "step": 2451 + }, + { + "epoch": 0.13495514337608014, + "grad_norm": 0.8910449743270874, + "learning_rate": 9.896418849848864e-06, + "loss": 0.9134, + "step": 2452 + }, + { + "epoch": 0.1350101821784358, + "grad_norm": 0.8856307864189148, + "learning_rate": 9.89633105761862e-06, + "loss": 0.8171, + "step": 2453 + }, + { + "epoch": 0.13506522098079146, + "grad_norm": 0.8159938454627991, + "learning_rate": 9.896243228588864e-06, + "loss": 0.8205, + "step": 2454 + }, + { + "epoch": 0.13512025978314712, + "grad_norm": 0.8200929760932922, + "learning_rate": 9.896155362760254e-06, + "loss": 0.7529, + "step": 2455 + }, + { + "epoch": 0.13517529858550278, + "grad_norm": 0.7591279149055481, + "learning_rate": 9.89606746013345e-06, + "loss": 0.8205, + "step": 2456 + }, + { + "epoch": 0.13523033738785845, + "grad_norm": 0.8598676323890686, + "learning_rate": 9.895979520709114e-06, + "loss": 0.8212, + "step": 2457 + }, + { + "epoch": 0.1352853761902141, + "grad_norm": 0.7290365099906921, + "learning_rate": 9.895891544487905e-06, + "loss": 0.7893, + "step": 2458 + }, + { + "epoch": 0.13534041499256977, + "grad_norm": 0.8040594458580017, + "learning_rate": 9.895803531470487e-06, + "loss": 0.8358, + "step": 2459 + }, + { + "epoch": 0.13539545379492543, + "grad_norm": 0.9286525249481201, + "learning_rate": 9.895715481657522e-06, + "loss": 0.8104, + "step": 2460 + }, + { + "epoch": 0.1354504925972811, + "grad_norm": 0.843054473400116, + "learning_rate": 9.895627395049668e-06, + "loss": 0.7872, + "step": 2461 + }, + { + "epoch": 0.13550553139963675, + "grad_norm": 0.7894387245178223, + "learning_rate": 9.895539271647588e-06, + "loss": 0.8615, + "step": 2462 + }, + { + "epoch": 0.13556057020199241, + "grad_norm": 0.9185294508934021, + "learning_rate": 9.895451111451948e-06, + "loss": 0.8732, + "step": 2463 + }, + { + "epoch": 0.13561560900434808, + "grad_norm": 0.8586474657058716, + "learning_rate": 9.895362914463405e-06, + "loss": 0.9658, + "step": 2464 + }, + { + "epoch": 0.13567064780670374, + "grad_norm": 0.8810474276542664, + "learning_rate": 9.895274680682628e-06, + "loss": 0.8622, + "step": 2465 + }, + { + "epoch": 0.1357256866090594, + "grad_norm": 0.8862990736961365, + "learning_rate": 9.895186410110273e-06, + "loss": 0.916, + "step": 2466 + }, + { + "epoch": 0.13578072541141506, + "grad_norm": 0.7916743159294128, + "learning_rate": 9.89509810274701e-06, + "loss": 0.837, + "step": 2467 + }, + { + "epoch": 0.13583576421377072, + "grad_norm": 0.9063515663146973, + "learning_rate": 9.8950097585935e-06, + "loss": 0.8065, + "step": 2468 + }, + { + "epoch": 0.13589080301612638, + "grad_norm": 0.7656043767929077, + "learning_rate": 9.894921377650405e-06, + "loss": 0.7064, + "step": 2469 + }, + { + "epoch": 0.13594584181848202, + "grad_norm": 1.0630278587341309, + "learning_rate": 9.894832959918392e-06, + "loss": 0.8168, + "step": 2470 + }, + { + "epoch": 0.13600088062083768, + "grad_norm": 0.9118956923484802, + "learning_rate": 9.894744505398126e-06, + "loss": 0.8972, + "step": 2471 + }, + { + "epoch": 0.13605591942319334, + "grad_norm": 0.8989213705062866, + "learning_rate": 9.89465601409027e-06, + "loss": 0.8374, + "step": 2472 + }, + { + "epoch": 0.136110958225549, + "grad_norm": 0.9398229718208313, + "learning_rate": 9.894567485995489e-06, + "loss": 0.8956, + "step": 2473 + }, + { + "epoch": 0.13616599702790466, + "grad_norm": 0.7980280518531799, + "learning_rate": 9.894478921114449e-06, + "loss": 0.8055, + "step": 2474 + }, + { + "epoch": 0.13622103583026032, + "grad_norm": 0.8910034894943237, + "learning_rate": 9.894390319447816e-06, + "loss": 0.8371, + "step": 2475 + }, + { + "epoch": 0.13627607463261598, + "grad_norm": 0.7848070859909058, + "learning_rate": 9.894301680996255e-06, + "loss": 0.8024, + "step": 2476 + }, + { + "epoch": 0.13633111343497165, + "grad_norm": 0.8538175821304321, + "learning_rate": 9.894213005760434e-06, + "loss": 0.8819, + "step": 2477 + }, + { + "epoch": 0.1363861522373273, + "grad_norm": 0.7885367274284363, + "learning_rate": 9.894124293741017e-06, + "loss": 0.7916, + "step": 2478 + }, + { + "epoch": 0.13644119103968297, + "grad_norm": 0.8555673956871033, + "learning_rate": 9.894035544938672e-06, + "loss": 0.8521, + "step": 2479 + }, + { + "epoch": 0.13649622984203863, + "grad_norm": 0.8104771971702576, + "learning_rate": 9.893946759354066e-06, + "loss": 0.8437, + "step": 2480 + }, + { + "epoch": 0.1365512686443943, + "grad_norm": 0.9131864309310913, + "learning_rate": 9.893857936987866e-06, + "loss": 0.8123, + "step": 2481 + }, + { + "epoch": 0.13660630744674995, + "grad_norm": 0.9414293766021729, + "learning_rate": 9.893769077840739e-06, + "loss": 0.7897, + "step": 2482 + }, + { + "epoch": 0.13666134624910561, + "grad_norm": 0.823265016078949, + "learning_rate": 9.893680181913355e-06, + "loss": 0.847, + "step": 2483 + }, + { + "epoch": 0.13671638505146128, + "grad_norm": 0.82098788022995, + "learning_rate": 9.89359124920638e-06, + "loss": 0.7823, + "step": 2484 + }, + { + "epoch": 0.13677142385381694, + "grad_norm": 0.817551851272583, + "learning_rate": 9.893502279720483e-06, + "loss": 0.8084, + "step": 2485 + }, + { + "epoch": 0.1368264626561726, + "grad_norm": 1.0722150802612305, + "learning_rate": 9.893413273456333e-06, + "loss": 0.7394, + "step": 2486 + }, + { + "epoch": 0.13688150145852826, + "grad_norm": 0.8045433759689331, + "learning_rate": 9.893324230414598e-06, + "loss": 0.7528, + "step": 2487 + }, + { + "epoch": 0.13693654026088392, + "grad_norm": 0.8694071173667908, + "learning_rate": 9.893235150595949e-06, + "loss": 0.803, + "step": 2488 + }, + { + "epoch": 0.13699157906323958, + "grad_norm": 0.8238615989685059, + "learning_rate": 9.893146034001054e-06, + "loss": 0.7909, + "step": 2489 + }, + { + "epoch": 0.13704661786559524, + "grad_norm": 0.7782405018806458, + "learning_rate": 9.893056880630583e-06, + "loss": 0.6859, + "step": 2490 + }, + { + "epoch": 0.1371016566679509, + "grad_norm": 0.7865599989891052, + "learning_rate": 9.892967690485207e-06, + "loss": 0.7982, + "step": 2491 + }, + { + "epoch": 0.13715669547030657, + "grad_norm": 0.768120288848877, + "learning_rate": 9.892878463565595e-06, + "loss": 0.8234, + "step": 2492 + }, + { + "epoch": 0.13721173427266223, + "grad_norm": 0.812493085861206, + "learning_rate": 9.89278919987242e-06, + "loss": 0.9152, + "step": 2493 + }, + { + "epoch": 0.1372667730750179, + "grad_norm": 0.7256335616111755, + "learning_rate": 9.892699899406348e-06, + "loss": 0.6703, + "step": 2494 + }, + { + "epoch": 0.13732181187737355, + "grad_norm": 0.8022804260253906, + "learning_rate": 9.892610562168054e-06, + "loss": 0.7918, + "step": 2495 + }, + { + "epoch": 0.1373768506797292, + "grad_norm": 0.8204907774925232, + "learning_rate": 9.89252118815821e-06, + "loss": 0.9094, + "step": 2496 + }, + { + "epoch": 0.13743188948208487, + "grad_norm": 0.9986788630485535, + "learning_rate": 9.892431777377484e-06, + "loss": 0.8921, + "step": 2497 + }, + { + "epoch": 0.13748692828444053, + "grad_norm": 0.7937983870506287, + "learning_rate": 9.892342329826554e-06, + "loss": 0.8048, + "step": 2498 + }, + { + "epoch": 0.1375419670867962, + "grad_norm": 0.9295744895935059, + "learning_rate": 9.892252845506086e-06, + "loss": 0.755, + "step": 2499 + }, + { + "epoch": 0.13759700588915186, + "grad_norm": 0.7920984625816345, + "learning_rate": 9.892163324416757e-06, + "loss": 0.7603, + "step": 2500 + }, + { + "epoch": 0.13765204469150752, + "grad_norm": 0.9229464530944824, + "learning_rate": 9.892073766559236e-06, + "loss": 0.8115, + "step": 2501 + }, + { + "epoch": 0.13770708349386318, + "grad_norm": 0.8205353021621704, + "learning_rate": 9.8919841719342e-06, + "loss": 0.8357, + "step": 2502 + }, + { + "epoch": 0.13776212229621884, + "grad_norm": 0.86461341381073, + "learning_rate": 9.891894540542318e-06, + "loss": 0.748, + "step": 2503 + }, + { + "epoch": 0.1378171610985745, + "grad_norm": 0.767145574092865, + "learning_rate": 9.891804872384267e-06, + "loss": 0.7404, + "step": 2504 + }, + { + "epoch": 0.13787219990093016, + "grad_norm": 0.7492040991783142, + "learning_rate": 9.891715167460721e-06, + "loss": 0.6958, + "step": 2505 + }, + { + "epoch": 0.13792723870328583, + "grad_norm": 0.8643150329589844, + "learning_rate": 9.891625425772353e-06, + "loss": 0.8408, + "step": 2506 + }, + { + "epoch": 0.1379822775056415, + "grad_norm": 0.8026981353759766, + "learning_rate": 9.891535647319838e-06, + "loss": 0.7895, + "step": 2507 + }, + { + "epoch": 0.13803731630799715, + "grad_norm": 1.2780394554138184, + "learning_rate": 9.89144583210385e-06, + "loss": 0.9113, + "step": 2508 + }, + { + "epoch": 0.1380923551103528, + "grad_norm": 0.8476191163063049, + "learning_rate": 9.891355980125064e-06, + "loss": 0.8224, + "step": 2509 + }, + { + "epoch": 0.13814739391270847, + "grad_norm": 1.048682689666748, + "learning_rate": 9.891266091384157e-06, + "loss": 0.8913, + "step": 2510 + }, + { + "epoch": 0.13820243271506413, + "grad_norm": 1.0314993858337402, + "learning_rate": 9.891176165881801e-06, + "loss": 0.8315, + "step": 2511 + }, + { + "epoch": 0.1382574715174198, + "grad_norm": 0.9500058889389038, + "learning_rate": 9.891086203618676e-06, + "loss": 0.9185, + "step": 2512 + }, + { + "epoch": 0.13831251031977543, + "grad_norm": 0.7860653400421143, + "learning_rate": 9.890996204595457e-06, + "loss": 0.804, + "step": 2513 + }, + { + "epoch": 0.1383675491221311, + "grad_norm": 0.8354741930961609, + "learning_rate": 9.89090616881282e-06, + "loss": 0.8214, + "step": 2514 + }, + { + "epoch": 0.13842258792448675, + "grad_norm": 0.9115905165672302, + "learning_rate": 9.890816096271438e-06, + "loss": 0.8801, + "step": 2515 + }, + { + "epoch": 0.1384776267268424, + "grad_norm": 0.8852075338363647, + "learning_rate": 9.890725986971994e-06, + "loss": 0.8821, + "step": 2516 + }, + { + "epoch": 0.13853266552919807, + "grad_norm": 0.804314374923706, + "learning_rate": 9.890635840915164e-06, + "loss": 0.8412, + "step": 2517 + }, + { + "epoch": 0.13858770433155373, + "grad_norm": 0.8242805600166321, + "learning_rate": 9.890545658101623e-06, + "loss": 0.8447, + "step": 2518 + }, + { + "epoch": 0.1386427431339094, + "grad_norm": 0.8385655879974365, + "learning_rate": 9.890455438532048e-06, + "loss": 0.8161, + "step": 2519 + }, + { + "epoch": 0.13869778193626506, + "grad_norm": 0.7950524687767029, + "learning_rate": 9.89036518220712e-06, + "loss": 0.8024, + "step": 2520 + }, + { + "epoch": 0.13875282073862072, + "grad_norm": 1.0031861066818237, + "learning_rate": 9.890274889127518e-06, + "loss": 0.8399, + "step": 2521 + }, + { + "epoch": 0.13880785954097638, + "grad_norm": 0.8403242230415344, + "learning_rate": 9.890184559293917e-06, + "loss": 0.8115, + "step": 2522 + }, + { + "epoch": 0.13886289834333204, + "grad_norm": 0.8389976024627686, + "learning_rate": 9.890094192706998e-06, + "loss": 0.9573, + "step": 2523 + }, + { + "epoch": 0.1389179371456877, + "grad_norm": 0.8408516645431519, + "learning_rate": 9.890003789367442e-06, + "loss": 0.8572, + "step": 2524 + }, + { + "epoch": 0.13897297594804336, + "grad_norm": 0.7607787251472473, + "learning_rate": 9.889913349275925e-06, + "loss": 0.8119, + "step": 2525 + }, + { + "epoch": 0.13902801475039903, + "grad_norm": 0.7696373462677002, + "learning_rate": 9.889822872433127e-06, + "loss": 0.8287, + "step": 2526 + }, + { + "epoch": 0.1390830535527547, + "grad_norm": 0.8518380522727966, + "learning_rate": 9.889732358839732e-06, + "loss": 0.9008, + "step": 2527 + }, + { + "epoch": 0.13913809235511035, + "grad_norm": 0.8851314783096313, + "learning_rate": 9.889641808496416e-06, + "loss": 0.8148, + "step": 2528 + }, + { + "epoch": 0.139193131157466, + "grad_norm": 0.9245797395706177, + "learning_rate": 9.889551221403862e-06, + "loss": 0.846, + "step": 2529 + }, + { + "epoch": 0.13924816995982167, + "grad_norm": 0.8445762991905212, + "learning_rate": 9.889460597562748e-06, + "loss": 0.8306, + "step": 2530 + }, + { + "epoch": 0.13930320876217733, + "grad_norm": 0.9149277806282043, + "learning_rate": 9.88936993697376e-06, + "loss": 0.8033, + "step": 2531 + }, + { + "epoch": 0.139358247564533, + "grad_norm": 0.894666850566864, + "learning_rate": 9.889279239637572e-06, + "loss": 0.8299, + "step": 2532 + }, + { + "epoch": 0.13941328636688866, + "grad_norm": 1.2897371053695679, + "learning_rate": 9.889188505554871e-06, + "loss": 0.7776, + "step": 2533 + }, + { + "epoch": 0.13946832516924432, + "grad_norm": 0.8927022218704224, + "learning_rate": 9.889097734726341e-06, + "loss": 0.8706, + "step": 2534 + }, + { + "epoch": 0.13952336397159998, + "grad_norm": 0.7688571214675903, + "learning_rate": 9.889006927152658e-06, + "loss": 0.8191, + "step": 2535 + }, + { + "epoch": 0.13957840277395564, + "grad_norm": 0.926671028137207, + "learning_rate": 9.88891608283451e-06, + "loss": 0.7489, + "step": 2536 + }, + { + "epoch": 0.1396334415763113, + "grad_norm": 0.8316965699195862, + "learning_rate": 9.888825201772577e-06, + "loss": 0.7783, + "step": 2537 + }, + { + "epoch": 0.13968848037866696, + "grad_norm": 0.8619750738143921, + "learning_rate": 9.88873428396754e-06, + "loss": 0.8269, + "step": 2538 + }, + { + "epoch": 0.13974351918102262, + "grad_norm": 0.8588540554046631, + "learning_rate": 9.888643329420086e-06, + "loss": 0.8133, + "step": 2539 + }, + { + "epoch": 0.13979855798337829, + "grad_norm": 0.7947841882705688, + "learning_rate": 9.8885523381309e-06, + "loss": 0.8041, + "step": 2540 + }, + { + "epoch": 0.13985359678573395, + "grad_norm": 0.8440257906913757, + "learning_rate": 9.888461310100661e-06, + "loss": 0.8324, + "step": 2541 + }, + { + "epoch": 0.1399086355880896, + "grad_norm": 0.7842260003089905, + "learning_rate": 9.888370245330055e-06, + "loss": 0.8031, + "step": 2542 + }, + { + "epoch": 0.13996367439044527, + "grad_norm": 0.8108223080635071, + "learning_rate": 9.888279143819768e-06, + "loss": 0.7998, + "step": 2543 + }, + { + "epoch": 0.14001871319280093, + "grad_norm": 0.9748625159263611, + "learning_rate": 9.888188005570482e-06, + "loss": 0.9553, + "step": 2544 + }, + { + "epoch": 0.1400737519951566, + "grad_norm": 0.8465562462806702, + "learning_rate": 9.888096830582883e-06, + "loss": 0.7884, + "step": 2545 + }, + { + "epoch": 0.14012879079751225, + "grad_norm": 0.9339833855628967, + "learning_rate": 9.88800561885766e-06, + "loss": 0.8135, + "step": 2546 + }, + { + "epoch": 0.14018382959986792, + "grad_norm": 0.7749297022819519, + "learning_rate": 9.887914370395492e-06, + "loss": 0.8411, + "step": 2547 + }, + { + "epoch": 0.14023886840222358, + "grad_norm": 0.862606942653656, + "learning_rate": 9.887823085197068e-06, + "loss": 0.7631, + "step": 2548 + }, + { + "epoch": 0.14029390720457924, + "grad_norm": 1.3383793830871582, + "learning_rate": 9.887731763263076e-06, + "loss": 0.7979, + "step": 2549 + }, + { + "epoch": 0.1403489460069349, + "grad_norm": 0.8092008233070374, + "learning_rate": 9.887640404594199e-06, + "loss": 0.7566, + "step": 2550 + }, + { + "epoch": 0.14040398480929056, + "grad_norm": 0.9233745336532593, + "learning_rate": 9.887549009191126e-06, + "loss": 0.8954, + "step": 2551 + }, + { + "epoch": 0.14045902361164622, + "grad_norm": 0.8533664345741272, + "learning_rate": 9.887457577054542e-06, + "loss": 0.8311, + "step": 2552 + }, + { + "epoch": 0.14051406241400188, + "grad_norm": 0.7679287791252136, + "learning_rate": 9.887366108185135e-06, + "loss": 0.7641, + "step": 2553 + }, + { + "epoch": 0.14056910121635754, + "grad_norm": 0.7998354434967041, + "learning_rate": 9.887274602583594e-06, + "loss": 0.7759, + "step": 2554 + }, + { + "epoch": 0.1406241400187132, + "grad_norm": 0.8877138495445251, + "learning_rate": 9.887183060250605e-06, + "loss": 0.8928, + "step": 2555 + }, + { + "epoch": 0.14067917882106884, + "grad_norm": 0.8022066354751587, + "learning_rate": 9.887091481186855e-06, + "loss": 0.8233, + "step": 2556 + }, + { + "epoch": 0.1407342176234245, + "grad_norm": 0.8419097065925598, + "learning_rate": 9.886999865393035e-06, + "loss": 0.8044, + "step": 2557 + }, + { + "epoch": 0.14078925642578016, + "grad_norm": 0.9581286311149597, + "learning_rate": 9.88690821286983e-06, + "loss": 0.8531, + "step": 2558 + }, + { + "epoch": 0.14084429522813582, + "grad_norm": 0.894851803779602, + "learning_rate": 9.886816523617933e-06, + "loss": 0.8594, + "step": 2559 + }, + { + "epoch": 0.14089933403049149, + "grad_norm": 0.7813432812690735, + "learning_rate": 9.886724797638032e-06, + "loss": 0.7311, + "step": 2560 + }, + { + "epoch": 0.14095437283284715, + "grad_norm": 0.8194118142127991, + "learning_rate": 9.886633034930814e-06, + "loss": 0.8067, + "step": 2561 + }, + { + "epoch": 0.1410094116352028, + "grad_norm": 0.8091121912002563, + "learning_rate": 9.88654123549697e-06, + "loss": 0.7558, + "step": 2562 + }, + { + "epoch": 0.14106445043755847, + "grad_norm": 0.8334764242172241, + "learning_rate": 9.88644939933719e-06, + "loss": 0.8375, + "step": 2563 + }, + { + "epoch": 0.14111948923991413, + "grad_norm": 0.8283817768096924, + "learning_rate": 9.886357526452166e-06, + "loss": 0.7839, + "step": 2564 + }, + { + "epoch": 0.1411745280422698, + "grad_norm": 0.8708772659301758, + "learning_rate": 9.886265616842585e-06, + "loss": 0.8193, + "step": 2565 + }, + { + "epoch": 0.14122956684462545, + "grad_norm": 0.9883641600608826, + "learning_rate": 9.886173670509141e-06, + "loss": 0.9409, + "step": 2566 + }, + { + "epoch": 0.14128460564698112, + "grad_norm": 0.8601766228675842, + "learning_rate": 9.886081687452523e-06, + "loss": 0.9391, + "step": 2567 + }, + { + "epoch": 0.14133964444933678, + "grad_norm": 0.8729620575904846, + "learning_rate": 9.885989667673422e-06, + "loss": 0.8372, + "step": 2568 + }, + { + "epoch": 0.14139468325169244, + "grad_norm": 0.7899564504623413, + "learning_rate": 9.885897611172532e-06, + "loss": 0.7773, + "step": 2569 + }, + { + "epoch": 0.1414497220540481, + "grad_norm": 0.8120512962341309, + "learning_rate": 9.885805517950542e-06, + "loss": 0.887, + "step": 2570 + }, + { + "epoch": 0.14150476085640376, + "grad_norm": 0.8475256562232971, + "learning_rate": 9.885713388008148e-06, + "loss": 0.7935, + "step": 2571 + }, + { + "epoch": 0.14155979965875942, + "grad_norm": 0.7669919729232788, + "learning_rate": 9.885621221346038e-06, + "loss": 0.7728, + "step": 2572 + }, + { + "epoch": 0.14161483846111508, + "grad_norm": 0.8298916220664978, + "learning_rate": 9.885529017964906e-06, + "loss": 0.7723, + "step": 2573 + }, + { + "epoch": 0.14166987726347074, + "grad_norm": 0.8630721569061279, + "learning_rate": 9.885436777865447e-06, + "loss": 0.8395, + "step": 2574 + }, + { + "epoch": 0.1417249160658264, + "grad_norm": 0.7566008567810059, + "learning_rate": 9.885344501048352e-06, + "loss": 0.806, + "step": 2575 + }, + { + "epoch": 0.14177995486818207, + "grad_norm": 0.7870769500732422, + "learning_rate": 9.885252187514316e-06, + "loss": 0.7683, + "step": 2576 + }, + { + "epoch": 0.14183499367053773, + "grad_norm": 0.879648745059967, + "learning_rate": 9.885159837264033e-06, + "loss": 0.8472, + "step": 2577 + }, + { + "epoch": 0.1418900324728934, + "grad_norm": 0.76839280128479, + "learning_rate": 9.885067450298196e-06, + "loss": 0.8534, + "step": 2578 + }, + { + "epoch": 0.14194507127524905, + "grad_norm": 0.8268701434135437, + "learning_rate": 9.884975026617498e-06, + "loss": 0.7799, + "step": 2579 + }, + { + "epoch": 0.1420001100776047, + "grad_norm": 0.8226090669631958, + "learning_rate": 9.884882566222638e-06, + "loss": 0.6756, + "step": 2580 + }, + { + "epoch": 0.14205514887996037, + "grad_norm": 0.8299756050109863, + "learning_rate": 9.884790069114307e-06, + "loss": 0.734, + "step": 2581 + }, + { + "epoch": 0.14211018768231604, + "grad_norm": 0.8241812586784363, + "learning_rate": 9.8846975352932e-06, + "loss": 0.8335, + "step": 2582 + }, + { + "epoch": 0.1421652264846717, + "grad_norm": 0.8458926677703857, + "learning_rate": 9.884604964760016e-06, + "loss": 0.7376, + "step": 2583 + }, + { + "epoch": 0.14222026528702736, + "grad_norm": 0.876966655254364, + "learning_rate": 9.884512357515447e-06, + "loss": 0.9414, + "step": 2584 + }, + { + "epoch": 0.14227530408938302, + "grad_norm": 0.770252525806427, + "learning_rate": 9.88441971356019e-06, + "loss": 0.8312, + "step": 2585 + }, + { + "epoch": 0.14233034289173868, + "grad_norm": 0.7883023023605347, + "learning_rate": 9.884327032894945e-06, + "loss": 0.8568, + "step": 2586 + }, + { + "epoch": 0.14238538169409434, + "grad_norm": 0.9092289209365845, + "learning_rate": 9.884234315520405e-06, + "loss": 0.9078, + "step": 2587 + }, + { + "epoch": 0.14244042049645, + "grad_norm": 0.7946531176567078, + "learning_rate": 9.884141561437266e-06, + "loss": 0.6895, + "step": 2588 + }, + { + "epoch": 0.14249545929880567, + "grad_norm": 0.7791070342063904, + "learning_rate": 9.884048770646227e-06, + "loss": 0.6984, + "step": 2589 + }, + { + "epoch": 0.14255049810116133, + "grad_norm": 0.7775537371635437, + "learning_rate": 9.883955943147982e-06, + "loss": 0.7568, + "step": 2590 + }, + { + "epoch": 0.142605536903517, + "grad_norm": 0.7735158801078796, + "learning_rate": 9.883863078943234e-06, + "loss": 0.8215, + "step": 2591 + }, + { + "epoch": 0.14266057570587265, + "grad_norm": 0.881365180015564, + "learning_rate": 9.88377017803268e-06, + "loss": 0.8817, + "step": 2592 + }, + { + "epoch": 0.1427156145082283, + "grad_norm": 0.8643443584442139, + "learning_rate": 9.883677240417014e-06, + "loss": 0.8024, + "step": 2593 + }, + { + "epoch": 0.14277065331058397, + "grad_norm": 0.885713517665863, + "learning_rate": 9.883584266096938e-06, + "loss": 0.7612, + "step": 2594 + }, + { + "epoch": 0.14282569211293963, + "grad_norm": 0.771340012550354, + "learning_rate": 9.88349125507315e-06, + "loss": 0.8293, + "step": 2595 + }, + { + "epoch": 0.1428807309152953, + "grad_norm": 0.8284093737602234, + "learning_rate": 9.88339820734635e-06, + "loss": 0.8539, + "step": 2596 + }, + { + "epoch": 0.14293576971765096, + "grad_norm": 0.9597725868225098, + "learning_rate": 9.883305122917233e-06, + "loss": 0.9054, + "step": 2597 + }, + { + "epoch": 0.14299080852000662, + "grad_norm": 0.7552937269210815, + "learning_rate": 9.883212001786504e-06, + "loss": 0.8047, + "step": 2598 + }, + { + "epoch": 0.14304584732236225, + "grad_norm": 0.8008492588996887, + "learning_rate": 9.883118843954861e-06, + "loss": 0.802, + "step": 2599 + }, + { + "epoch": 0.1431008861247179, + "grad_norm": 0.8169753551483154, + "learning_rate": 9.883025649423003e-06, + "loss": 0.8837, + "step": 2600 + }, + { + "epoch": 0.14315592492707357, + "grad_norm": 0.8521036505699158, + "learning_rate": 9.882932418191632e-06, + "loss": 0.8266, + "step": 2601 + }, + { + "epoch": 0.14321096372942924, + "grad_norm": 0.8647341728210449, + "learning_rate": 9.882839150261449e-06, + "loss": 0.8949, + "step": 2602 + }, + { + "epoch": 0.1432660025317849, + "grad_norm": 0.9236162304878235, + "learning_rate": 9.882745845633153e-06, + "loss": 0.8474, + "step": 2603 + }, + { + "epoch": 0.14332104133414056, + "grad_norm": 0.8422677516937256, + "learning_rate": 9.882652504307445e-06, + "loss": 0.8396, + "step": 2604 + }, + { + "epoch": 0.14337608013649622, + "grad_norm": 0.902036190032959, + "learning_rate": 9.88255912628503e-06, + "loss": 0.8075, + "step": 2605 + }, + { + "epoch": 0.14343111893885188, + "grad_norm": 0.8972339630126953, + "learning_rate": 9.882465711566605e-06, + "loss": 0.8143, + "step": 2606 + }, + { + "epoch": 0.14348615774120754, + "grad_norm": 0.8025243282318115, + "learning_rate": 9.882372260152877e-06, + "loss": 0.771, + "step": 2607 + }, + { + "epoch": 0.1435411965435632, + "grad_norm": 0.8260911107063293, + "learning_rate": 9.882278772044545e-06, + "loss": 0.7679, + "step": 2608 + }, + { + "epoch": 0.14359623534591887, + "grad_norm": 0.8069774508476257, + "learning_rate": 9.882185247242313e-06, + "loss": 0.8489, + "step": 2609 + }, + { + "epoch": 0.14365127414827453, + "grad_norm": 0.8702567219734192, + "learning_rate": 9.882091685746883e-06, + "loss": 0.9258, + "step": 2610 + }, + { + "epoch": 0.1437063129506302, + "grad_norm": 0.8841683268547058, + "learning_rate": 9.881998087558959e-06, + "loss": 0.7858, + "step": 2611 + }, + { + "epoch": 0.14376135175298585, + "grad_norm": 0.7302986979484558, + "learning_rate": 9.881904452679246e-06, + "loss": 0.7339, + "step": 2612 + }, + { + "epoch": 0.1438163905553415, + "grad_norm": 0.7852466106414795, + "learning_rate": 9.881810781108442e-06, + "loss": 0.8397, + "step": 2613 + }, + { + "epoch": 0.14387142935769717, + "grad_norm": 0.7986249327659607, + "learning_rate": 9.881717072847258e-06, + "loss": 0.7573, + "step": 2614 + }, + { + "epoch": 0.14392646816005283, + "grad_norm": 0.750000536441803, + "learning_rate": 9.881623327896395e-06, + "loss": 0.7128, + "step": 2615 + }, + { + "epoch": 0.1439815069624085, + "grad_norm": 0.8796436786651611, + "learning_rate": 9.881529546256557e-06, + "loss": 0.9364, + "step": 2616 + }, + { + "epoch": 0.14403654576476416, + "grad_norm": 0.8621297478675842, + "learning_rate": 9.881435727928449e-06, + "loss": 0.9323, + "step": 2617 + }, + { + "epoch": 0.14409158456711982, + "grad_norm": 0.8213173151016235, + "learning_rate": 9.881341872912777e-06, + "loss": 0.7746, + "step": 2618 + }, + { + "epoch": 0.14414662336947548, + "grad_norm": 0.7761938571929932, + "learning_rate": 9.881247981210247e-06, + "loss": 0.8065, + "step": 2619 + }, + { + "epoch": 0.14420166217183114, + "grad_norm": 0.8333988785743713, + "learning_rate": 9.881154052821564e-06, + "loss": 0.8727, + "step": 2620 + }, + { + "epoch": 0.1442567009741868, + "grad_norm": 0.7263909578323364, + "learning_rate": 9.881060087747433e-06, + "loss": 0.8194, + "step": 2621 + }, + { + "epoch": 0.14431173977654246, + "grad_norm": 0.7472667098045349, + "learning_rate": 9.880966085988562e-06, + "loss": 0.77, + "step": 2622 + }, + { + "epoch": 0.14436677857889813, + "grad_norm": 0.7999943494796753, + "learning_rate": 9.880872047545656e-06, + "loss": 0.7936, + "step": 2623 + }, + { + "epoch": 0.1444218173812538, + "grad_norm": 0.8359610438346863, + "learning_rate": 9.88077797241942e-06, + "loss": 0.7946, + "step": 2624 + }, + { + "epoch": 0.14447685618360945, + "grad_norm": 0.8666403889656067, + "learning_rate": 9.880683860610566e-06, + "loss": 0.8152, + "step": 2625 + }, + { + "epoch": 0.1445318949859651, + "grad_norm": 0.7883741855621338, + "learning_rate": 9.880589712119797e-06, + "loss": 0.7972, + "step": 2626 + }, + { + "epoch": 0.14458693378832077, + "grad_norm": 0.8048827648162842, + "learning_rate": 9.880495526947824e-06, + "loss": 0.8221, + "step": 2627 + }, + { + "epoch": 0.14464197259067643, + "grad_norm": 0.718292236328125, + "learning_rate": 9.88040130509535e-06, + "loss": 0.7648, + "step": 2628 + }, + { + "epoch": 0.1446970113930321, + "grad_norm": 0.7748421430587769, + "learning_rate": 9.880307046563088e-06, + "loss": 0.8146, + "step": 2629 + }, + { + "epoch": 0.14475205019538775, + "grad_norm": 0.8015987873077393, + "learning_rate": 9.880212751351745e-06, + "loss": 0.7935, + "step": 2630 + }, + { + "epoch": 0.14480708899774342, + "grad_norm": 0.7628459930419922, + "learning_rate": 9.88011841946203e-06, + "loss": 0.7469, + "step": 2631 + }, + { + "epoch": 0.14486212780009908, + "grad_norm": 0.7152888774871826, + "learning_rate": 9.88002405089465e-06, + "loss": 0.7721, + "step": 2632 + }, + { + "epoch": 0.14491716660245474, + "grad_norm": 0.8075545430183411, + "learning_rate": 9.879929645650315e-06, + "loss": 0.8799, + "step": 2633 + }, + { + "epoch": 0.1449722054048104, + "grad_norm": 0.7981964945793152, + "learning_rate": 9.879835203729736e-06, + "loss": 0.8265, + "step": 2634 + }, + { + "epoch": 0.14502724420716606, + "grad_norm": 0.7699866890907288, + "learning_rate": 9.879740725133623e-06, + "loss": 0.8489, + "step": 2635 + }, + { + "epoch": 0.14508228300952172, + "grad_norm": 0.7991634011268616, + "learning_rate": 9.879646209862682e-06, + "loss": 0.8754, + "step": 2636 + }, + { + "epoch": 0.14513732181187738, + "grad_norm": 0.8284991383552551, + "learning_rate": 9.879551657917628e-06, + "loss": 0.811, + "step": 2637 + }, + { + "epoch": 0.14519236061423305, + "grad_norm": 0.9189227819442749, + "learning_rate": 9.87945706929917e-06, + "loss": 0.8486, + "step": 2638 + }, + { + "epoch": 0.1452473994165887, + "grad_norm": 0.8599026799201965, + "learning_rate": 9.879362444008018e-06, + "loss": 0.8383, + "step": 2639 + }, + { + "epoch": 0.14530243821894437, + "grad_norm": 0.8764603137969971, + "learning_rate": 9.879267782044885e-06, + "loss": 0.7918, + "step": 2640 + }, + { + "epoch": 0.14535747702130003, + "grad_norm": 0.8061341047286987, + "learning_rate": 9.87917308341048e-06, + "loss": 0.8292, + "step": 2641 + }, + { + "epoch": 0.14541251582365566, + "grad_norm": 1.031220555305481, + "learning_rate": 9.879078348105518e-06, + "loss": 0.6612, + "step": 2642 + }, + { + "epoch": 0.14546755462601133, + "grad_norm": 1.014491319656372, + "learning_rate": 9.878983576130708e-06, + "loss": 0.8512, + "step": 2643 + }, + { + "epoch": 0.145522593428367, + "grad_norm": 0.8365896940231323, + "learning_rate": 9.878888767486764e-06, + "loss": 0.7995, + "step": 2644 + }, + { + "epoch": 0.14557763223072265, + "grad_norm": 0.8086197972297668, + "learning_rate": 9.878793922174397e-06, + "loss": 0.8069, + "step": 2645 + }, + { + "epoch": 0.1456326710330783, + "grad_norm": 0.8075234889984131, + "learning_rate": 9.878699040194322e-06, + "loss": 0.8415, + "step": 2646 + }, + { + "epoch": 0.14568770983543397, + "grad_norm": 0.9413748979568481, + "learning_rate": 9.87860412154725e-06, + "loss": 0.7811, + "step": 2647 + }, + { + "epoch": 0.14574274863778963, + "grad_norm": 0.7744552493095398, + "learning_rate": 9.878509166233895e-06, + "loss": 0.7983, + "step": 2648 + }, + { + "epoch": 0.1457977874401453, + "grad_norm": 0.8184664845466614, + "learning_rate": 9.878414174254974e-06, + "loss": 0.8052, + "step": 2649 + }, + { + "epoch": 0.14585282624250095, + "grad_norm": 0.928814172744751, + "learning_rate": 9.878319145611195e-06, + "loss": 0.7695, + "step": 2650 + }, + { + "epoch": 0.14590786504485662, + "grad_norm": 0.9623318314552307, + "learning_rate": 9.878224080303276e-06, + "loss": 0.9025, + "step": 2651 + }, + { + "epoch": 0.14596290384721228, + "grad_norm": 0.866538405418396, + "learning_rate": 9.87812897833193e-06, + "loss": 0.7895, + "step": 2652 + }, + { + "epoch": 0.14601794264956794, + "grad_norm": 0.9248599410057068, + "learning_rate": 9.878033839697874e-06, + "loss": 0.8532, + "step": 2653 + }, + { + "epoch": 0.1460729814519236, + "grad_norm": 0.7866301536560059, + "learning_rate": 9.87793866440182e-06, + "loss": 0.8724, + "step": 2654 + }, + { + "epoch": 0.14612802025427926, + "grad_norm": 0.8471634387969971, + "learning_rate": 9.877843452444485e-06, + "loss": 0.9184, + "step": 2655 + }, + { + "epoch": 0.14618305905663492, + "grad_norm": 0.7367103695869446, + "learning_rate": 9.877748203826585e-06, + "loss": 0.7328, + "step": 2656 + }, + { + "epoch": 0.14623809785899058, + "grad_norm": 0.95980304479599, + "learning_rate": 9.877652918548834e-06, + "loss": 0.9274, + "step": 2657 + }, + { + "epoch": 0.14629313666134625, + "grad_norm": 1.0511064529418945, + "learning_rate": 9.87755759661195e-06, + "loss": 0.8223, + "step": 2658 + }, + { + "epoch": 0.1463481754637019, + "grad_norm": 0.7616510391235352, + "learning_rate": 9.877462238016649e-06, + "loss": 0.7473, + "step": 2659 + }, + { + "epoch": 0.14640321426605757, + "grad_norm": 0.7814056873321533, + "learning_rate": 9.877366842763647e-06, + "loss": 0.8898, + "step": 2660 + }, + { + "epoch": 0.14645825306841323, + "grad_norm": 0.8707298636436462, + "learning_rate": 9.877271410853662e-06, + "loss": 0.8792, + "step": 2661 + }, + { + "epoch": 0.1465132918707689, + "grad_norm": 0.8618701696395874, + "learning_rate": 9.877175942287409e-06, + "loss": 0.8761, + "step": 2662 + }, + { + "epoch": 0.14656833067312455, + "grad_norm": 0.9437732100486755, + "learning_rate": 9.877080437065609e-06, + "loss": 0.7922, + "step": 2663 + }, + { + "epoch": 0.14662336947548021, + "grad_norm": 0.9465780258178711, + "learning_rate": 9.876984895188976e-06, + "loss": 0.8449, + "step": 2664 + }, + { + "epoch": 0.14667840827783588, + "grad_norm": 0.7149911522865295, + "learning_rate": 9.876889316658233e-06, + "loss": 0.6408, + "step": 2665 + }, + { + "epoch": 0.14673344708019154, + "grad_norm": 0.9996811151504517, + "learning_rate": 9.876793701474092e-06, + "loss": 0.9324, + "step": 2666 + }, + { + "epoch": 0.1467884858825472, + "grad_norm": 0.7941329479217529, + "learning_rate": 9.876698049637277e-06, + "loss": 0.8115, + "step": 2667 + }, + { + "epoch": 0.14684352468490286, + "grad_norm": 0.754175066947937, + "learning_rate": 9.876602361148504e-06, + "loss": 0.7709, + "step": 2668 + }, + { + "epoch": 0.14689856348725852, + "grad_norm": 0.7867946624755859, + "learning_rate": 9.876506636008494e-06, + "loss": 0.8578, + "step": 2669 + }, + { + "epoch": 0.14695360228961418, + "grad_norm": 0.7441185116767883, + "learning_rate": 9.876410874217965e-06, + "loss": 0.8491, + "step": 2670 + }, + { + "epoch": 0.14700864109196984, + "grad_norm": 0.8414027690887451, + "learning_rate": 9.876315075777638e-06, + "loss": 0.8404, + "step": 2671 + }, + { + "epoch": 0.1470636798943255, + "grad_norm": 0.7911489009857178, + "learning_rate": 9.876219240688231e-06, + "loss": 0.8606, + "step": 2672 + }, + { + "epoch": 0.14711871869668117, + "grad_norm": 0.8601381778717041, + "learning_rate": 9.876123368950465e-06, + "loss": 0.7753, + "step": 2673 + }, + { + "epoch": 0.14717375749903683, + "grad_norm": 0.8672378659248352, + "learning_rate": 9.876027460565062e-06, + "loss": 0.7763, + "step": 2674 + }, + { + "epoch": 0.1472287963013925, + "grad_norm": 0.7192933559417725, + "learning_rate": 9.875931515532742e-06, + "loss": 0.7681, + "step": 2675 + }, + { + "epoch": 0.14728383510374815, + "grad_norm": 0.7483426332473755, + "learning_rate": 9.875835533854226e-06, + "loss": 0.8129, + "step": 2676 + }, + { + "epoch": 0.1473388739061038, + "grad_norm": 0.8883694410324097, + "learning_rate": 9.875739515530235e-06, + "loss": 0.8912, + "step": 2677 + }, + { + "epoch": 0.14739391270845947, + "grad_norm": 0.8440148234367371, + "learning_rate": 9.87564346056149e-06, + "loss": 0.8411, + "step": 2678 + }, + { + "epoch": 0.14744895151081513, + "grad_norm": 0.8916668891906738, + "learning_rate": 9.875547368948715e-06, + "loss": 0.8484, + "step": 2679 + }, + { + "epoch": 0.1475039903131708, + "grad_norm": 0.805258572101593, + "learning_rate": 9.875451240692631e-06, + "loss": 0.8172, + "step": 2680 + }, + { + "epoch": 0.14755902911552646, + "grad_norm": 0.8322305679321289, + "learning_rate": 9.87535507579396e-06, + "loss": 0.809, + "step": 2681 + }, + { + "epoch": 0.14761406791788212, + "grad_norm": 0.7320597767829895, + "learning_rate": 9.875258874253424e-06, + "loss": 0.7346, + "step": 2682 + }, + { + "epoch": 0.14766910672023778, + "grad_norm": 1.018036127090454, + "learning_rate": 9.875162636071749e-06, + "loss": 0.931, + "step": 2683 + }, + { + "epoch": 0.14772414552259344, + "grad_norm": 0.8601503968238831, + "learning_rate": 9.875066361249657e-06, + "loss": 0.7689, + "step": 2684 + }, + { + "epoch": 0.14777918432494908, + "grad_norm": 0.8478472232818604, + "learning_rate": 9.87497004978787e-06, + "loss": 0.9545, + "step": 2685 + }, + { + "epoch": 0.14783422312730474, + "grad_norm": 0.7510890364646912, + "learning_rate": 9.874873701687115e-06, + "loss": 0.7794, + "step": 2686 + }, + { + "epoch": 0.1478892619296604, + "grad_norm": 0.8226999044418335, + "learning_rate": 9.874777316948112e-06, + "loss": 0.8477, + "step": 2687 + }, + { + "epoch": 0.14794430073201606, + "grad_norm": 0.8284991979598999, + "learning_rate": 9.874680895571588e-06, + "loss": 0.8498, + "step": 2688 + }, + { + "epoch": 0.14799933953437172, + "grad_norm": 0.9007356762886047, + "learning_rate": 9.874584437558267e-06, + "loss": 0.8526, + "step": 2689 + }, + { + "epoch": 0.14805437833672738, + "grad_norm": 0.8770126104354858, + "learning_rate": 9.874487942908877e-06, + "loss": 0.844, + "step": 2690 + }, + { + "epoch": 0.14810941713908304, + "grad_norm": 1.1561466455459595, + "learning_rate": 9.874391411624138e-06, + "loss": 0.976, + "step": 2691 + }, + { + "epoch": 0.1481644559414387, + "grad_norm": 0.8162640929222107, + "learning_rate": 9.874294843704777e-06, + "loss": 0.8581, + "step": 2692 + }, + { + "epoch": 0.14821949474379437, + "grad_norm": 0.8308132290840149, + "learning_rate": 9.874198239151522e-06, + "loss": 0.8303, + "step": 2693 + }, + { + "epoch": 0.14827453354615003, + "grad_norm": 0.771024227142334, + "learning_rate": 9.874101597965098e-06, + "loss": 0.8351, + "step": 2694 + }, + { + "epoch": 0.1483295723485057, + "grad_norm": 0.7588162422180176, + "learning_rate": 9.874004920146232e-06, + "loss": 0.7858, + "step": 2695 + }, + { + "epoch": 0.14838461115086135, + "grad_norm": 0.8282446265220642, + "learning_rate": 9.873908205695648e-06, + "loss": 0.8465, + "step": 2696 + }, + { + "epoch": 0.148439649953217, + "grad_norm": 0.8342786431312561, + "learning_rate": 9.873811454614076e-06, + "loss": 0.8688, + "step": 2697 + }, + { + "epoch": 0.14849468875557267, + "grad_norm": 0.7957108020782471, + "learning_rate": 9.87371466690224e-06, + "loss": 0.8381, + "step": 2698 + }, + { + "epoch": 0.14854972755792833, + "grad_norm": 0.8763726353645325, + "learning_rate": 9.87361784256087e-06, + "loss": 0.8922, + "step": 2699 + }, + { + "epoch": 0.148604766360284, + "grad_norm": 0.7760055661201477, + "learning_rate": 9.873520981590693e-06, + "loss": 0.8384, + "step": 2700 + }, + { + "epoch": 0.14865980516263966, + "grad_norm": 0.9691097736358643, + "learning_rate": 9.873424083992436e-06, + "loss": 0.8581, + "step": 2701 + }, + { + "epoch": 0.14871484396499532, + "grad_norm": 0.9072558879852295, + "learning_rate": 9.87332714976683e-06, + "loss": 0.8942, + "step": 2702 + }, + { + "epoch": 0.14876988276735098, + "grad_norm": 0.8961714506149292, + "learning_rate": 9.8732301789146e-06, + "loss": 0.8062, + "step": 2703 + }, + { + "epoch": 0.14882492156970664, + "grad_norm": 1.4835050106048584, + "learning_rate": 9.873133171436477e-06, + "loss": 0.886, + "step": 2704 + }, + { + "epoch": 0.1488799603720623, + "grad_norm": 0.8153702616691589, + "learning_rate": 9.87303612733319e-06, + "loss": 0.8369, + "step": 2705 + }, + { + "epoch": 0.14893499917441796, + "grad_norm": 0.8755800724029541, + "learning_rate": 9.872939046605467e-06, + "loss": 0.7591, + "step": 2706 + }, + { + "epoch": 0.14899003797677363, + "grad_norm": 0.8173243403434753, + "learning_rate": 9.872841929254038e-06, + "loss": 0.8626, + "step": 2707 + }, + { + "epoch": 0.1490450767791293, + "grad_norm": 0.7891639471054077, + "learning_rate": 9.872744775279634e-06, + "loss": 0.737, + "step": 2708 + }, + { + "epoch": 0.14910011558148495, + "grad_norm": 1.0270631313323975, + "learning_rate": 9.872647584682985e-06, + "loss": 0.9202, + "step": 2709 + }, + { + "epoch": 0.1491551543838406, + "grad_norm": 0.7736123204231262, + "learning_rate": 9.872550357464822e-06, + "loss": 0.7835, + "step": 2710 + }, + { + "epoch": 0.14921019318619627, + "grad_norm": 0.7791550159454346, + "learning_rate": 9.872453093625873e-06, + "loss": 0.8375, + "step": 2711 + }, + { + "epoch": 0.14926523198855193, + "grad_norm": 0.8410583734512329, + "learning_rate": 9.872355793166872e-06, + "loss": 0.877, + "step": 2712 + }, + { + "epoch": 0.1493202707909076, + "grad_norm": 0.8277738094329834, + "learning_rate": 9.87225845608855e-06, + "loss": 0.7255, + "step": 2713 + }, + { + "epoch": 0.14937530959326326, + "grad_norm": 0.8617290258407593, + "learning_rate": 9.872161082391635e-06, + "loss": 0.7885, + "step": 2714 + }, + { + "epoch": 0.14943034839561892, + "grad_norm": 0.8866406679153442, + "learning_rate": 9.872063672076864e-06, + "loss": 0.8621, + "step": 2715 + }, + { + "epoch": 0.14948538719797458, + "grad_norm": 0.7581049799919128, + "learning_rate": 9.871966225144964e-06, + "loss": 0.8177, + "step": 2716 + }, + { + "epoch": 0.14954042600033024, + "grad_norm": 0.833696722984314, + "learning_rate": 9.871868741596673e-06, + "loss": 0.8382, + "step": 2717 + }, + { + "epoch": 0.1495954648026859, + "grad_norm": 1.0857365131378174, + "learning_rate": 9.871771221432718e-06, + "loss": 0.9254, + "step": 2718 + }, + { + "epoch": 0.14965050360504156, + "grad_norm": 0.7622446417808533, + "learning_rate": 9.871673664653837e-06, + "loss": 0.832, + "step": 2719 + }, + { + "epoch": 0.14970554240739722, + "grad_norm": 0.7436832785606384, + "learning_rate": 9.871576071260758e-06, + "loss": 0.7642, + "step": 2720 + }, + { + "epoch": 0.14976058120975289, + "grad_norm": 0.8547641634941101, + "learning_rate": 9.87147844125422e-06, + "loss": 0.7584, + "step": 2721 + }, + { + "epoch": 0.14981562001210855, + "grad_norm": 0.7634096145629883, + "learning_rate": 9.871380774634953e-06, + "loss": 0.8332, + "step": 2722 + }, + { + "epoch": 0.1498706588144642, + "grad_norm": 0.7949081063270569, + "learning_rate": 9.871283071403692e-06, + "loss": 0.7812, + "step": 2723 + }, + { + "epoch": 0.14992569761681987, + "grad_norm": 0.8089914321899414, + "learning_rate": 9.871185331561171e-06, + "loss": 0.8503, + "step": 2724 + }, + { + "epoch": 0.14998073641917553, + "grad_norm": 0.8451627492904663, + "learning_rate": 9.871087555108125e-06, + "loss": 0.9101, + "step": 2725 + }, + { + "epoch": 0.1500357752215312, + "grad_norm": 0.8399865627288818, + "learning_rate": 9.87098974204529e-06, + "loss": 0.8222, + "step": 2726 + }, + { + "epoch": 0.15009081402388685, + "grad_norm": 0.7786773443222046, + "learning_rate": 9.870891892373397e-06, + "loss": 0.8069, + "step": 2727 + }, + { + "epoch": 0.1501458528262425, + "grad_norm": 0.8530564308166504, + "learning_rate": 9.870794006093188e-06, + "loss": 0.9229, + "step": 2728 + }, + { + "epoch": 0.15020089162859815, + "grad_norm": 0.7640067934989929, + "learning_rate": 9.870696083205394e-06, + "loss": 0.761, + "step": 2729 + }, + { + "epoch": 0.1502559304309538, + "grad_norm": 0.8953121900558472, + "learning_rate": 9.87059812371075e-06, + "loss": 0.8537, + "step": 2730 + }, + { + "epoch": 0.15031096923330947, + "grad_norm": 0.7779926657676697, + "learning_rate": 9.870500127609996e-06, + "loss": 0.8184, + "step": 2731 + }, + { + "epoch": 0.15036600803566513, + "grad_norm": 0.9181544184684753, + "learning_rate": 9.870402094903865e-06, + "loss": 0.8583, + "step": 2732 + }, + { + "epoch": 0.1504210468380208, + "grad_norm": 0.7629374861717224, + "learning_rate": 9.870304025593097e-06, + "loss": 0.6741, + "step": 2733 + }, + { + "epoch": 0.15047608564037646, + "grad_norm": 1.1455601453781128, + "learning_rate": 9.87020591967843e-06, + "loss": 0.8602, + "step": 2734 + }, + { + "epoch": 0.15053112444273212, + "grad_norm": 0.83924800157547, + "learning_rate": 9.870107777160596e-06, + "loss": 0.8847, + "step": 2735 + }, + { + "epoch": 0.15058616324508778, + "grad_norm": 0.9293402433395386, + "learning_rate": 9.870009598040336e-06, + "loss": 0.9008, + "step": 2736 + }, + { + "epoch": 0.15064120204744344, + "grad_norm": 0.8198057413101196, + "learning_rate": 9.869911382318389e-06, + "loss": 0.8004, + "step": 2737 + }, + { + "epoch": 0.1506962408497991, + "grad_norm": 0.8139753341674805, + "learning_rate": 9.86981312999549e-06, + "loss": 0.7316, + "step": 2738 + }, + { + "epoch": 0.15075127965215476, + "grad_norm": 0.854184091091156, + "learning_rate": 9.86971484107238e-06, + "loss": 0.9424, + "step": 2739 + }, + { + "epoch": 0.15080631845451042, + "grad_norm": 0.8626797199249268, + "learning_rate": 9.869616515549797e-06, + "loss": 0.8882, + "step": 2740 + }, + { + "epoch": 0.15086135725686609, + "grad_norm": 0.8447514176368713, + "learning_rate": 9.869518153428479e-06, + "loss": 0.7762, + "step": 2741 + }, + { + "epoch": 0.15091639605922175, + "grad_norm": 1.1359349489212036, + "learning_rate": 9.869419754709166e-06, + "loss": 0.9233, + "step": 2742 + }, + { + "epoch": 0.1509714348615774, + "grad_norm": 0.8095758557319641, + "learning_rate": 9.869321319392597e-06, + "loss": 0.8833, + "step": 2743 + }, + { + "epoch": 0.15102647366393307, + "grad_norm": 0.8364169001579285, + "learning_rate": 9.869222847479514e-06, + "loss": 0.833, + "step": 2744 + }, + { + "epoch": 0.15108151246628873, + "grad_norm": 0.7664803266525269, + "learning_rate": 9.869124338970653e-06, + "loss": 0.8125, + "step": 2745 + }, + { + "epoch": 0.1511365512686444, + "grad_norm": 0.8129634857177734, + "learning_rate": 9.86902579386676e-06, + "loss": 0.8277, + "step": 2746 + }, + { + "epoch": 0.15119159007100005, + "grad_norm": 0.8195592164993286, + "learning_rate": 9.86892721216857e-06, + "loss": 0.8489, + "step": 2747 + }, + { + "epoch": 0.15124662887335572, + "grad_norm": 0.8116651177406311, + "learning_rate": 9.868828593876827e-06, + "loss": 0.7831, + "step": 2748 + }, + { + "epoch": 0.15130166767571138, + "grad_norm": 0.8200114369392395, + "learning_rate": 9.868729938992272e-06, + "loss": 0.8956, + "step": 2749 + }, + { + "epoch": 0.15135670647806704, + "grad_norm": 0.8521816730499268, + "learning_rate": 9.868631247515645e-06, + "loss": 0.804, + "step": 2750 + }, + { + "epoch": 0.1514117452804227, + "grad_norm": 1.0386497974395752, + "learning_rate": 9.868532519447691e-06, + "loss": 0.8563, + "step": 2751 + }, + { + "epoch": 0.15146678408277836, + "grad_norm": 0.8345486521720886, + "learning_rate": 9.868433754789149e-06, + "loss": 0.9838, + "step": 2752 + }, + { + "epoch": 0.15152182288513402, + "grad_norm": 0.7207526564598083, + "learning_rate": 9.868334953540762e-06, + "loss": 0.6711, + "step": 2753 + }, + { + "epoch": 0.15157686168748968, + "grad_norm": 0.8159164786338806, + "learning_rate": 9.86823611570327e-06, + "loss": 0.7591, + "step": 2754 + }, + { + "epoch": 0.15163190048984534, + "grad_norm": 0.9062225818634033, + "learning_rate": 9.868137241277422e-06, + "loss": 0.8009, + "step": 2755 + }, + { + "epoch": 0.151686939292201, + "grad_norm": 0.8136696219444275, + "learning_rate": 9.868038330263957e-06, + "loss": 0.7014, + "step": 2756 + }, + { + "epoch": 0.15174197809455667, + "grad_norm": 0.7237691283226013, + "learning_rate": 9.867939382663618e-06, + "loss": 0.7766, + "step": 2757 + }, + { + "epoch": 0.15179701689691233, + "grad_norm": 0.8913742303848267, + "learning_rate": 9.86784039847715e-06, + "loss": 0.9362, + "step": 2758 + }, + { + "epoch": 0.151852055699268, + "grad_norm": 0.7763763070106506, + "learning_rate": 9.867741377705296e-06, + "loss": 0.7843, + "step": 2759 + }, + { + "epoch": 0.15190709450162365, + "grad_norm": 0.8973854780197144, + "learning_rate": 9.867642320348803e-06, + "loss": 0.911, + "step": 2760 + }, + { + "epoch": 0.1519621333039793, + "grad_norm": 0.7979685664176941, + "learning_rate": 9.86754322640841e-06, + "loss": 0.81, + "step": 2761 + }, + { + "epoch": 0.15201717210633497, + "grad_norm": 0.7740911841392517, + "learning_rate": 9.867444095884867e-06, + "loss": 0.8197, + "step": 2762 + }, + { + "epoch": 0.15207221090869064, + "grad_norm": 0.8400475978851318, + "learning_rate": 9.867344928778916e-06, + "loss": 0.8809, + "step": 2763 + }, + { + "epoch": 0.1521272497110463, + "grad_norm": 0.8995040655136108, + "learning_rate": 9.867245725091305e-06, + "loss": 0.8382, + "step": 2764 + }, + { + "epoch": 0.15218228851340196, + "grad_norm": 0.8162381052970886, + "learning_rate": 9.867146484822779e-06, + "loss": 0.9238, + "step": 2765 + }, + { + "epoch": 0.15223732731575762, + "grad_norm": 0.7668827176094055, + "learning_rate": 9.867047207974079e-06, + "loss": 0.8345, + "step": 2766 + }, + { + "epoch": 0.15229236611811328, + "grad_norm": 0.8719204664230347, + "learning_rate": 9.866947894545957e-06, + "loss": 0.7899, + "step": 2767 + }, + { + "epoch": 0.15234740492046894, + "grad_norm": 0.9043570756912231, + "learning_rate": 9.866848544539159e-06, + "loss": 0.8783, + "step": 2768 + }, + { + "epoch": 0.1524024437228246, + "grad_norm": 0.8859694004058838, + "learning_rate": 9.866749157954428e-06, + "loss": 0.862, + "step": 2769 + }, + { + "epoch": 0.15245748252518027, + "grad_norm": 1.022719144821167, + "learning_rate": 9.866649734792514e-06, + "loss": 0.8943, + "step": 2770 + }, + { + "epoch": 0.1525125213275359, + "grad_norm": 0.8710635900497437, + "learning_rate": 9.866550275054163e-06, + "loss": 0.7002, + "step": 2771 + }, + { + "epoch": 0.15256756012989156, + "grad_norm": 0.8482942581176758, + "learning_rate": 9.866450778740122e-06, + "loss": 0.7529, + "step": 2772 + }, + { + "epoch": 0.15262259893224722, + "grad_norm": 0.9637784361839294, + "learning_rate": 9.866351245851142e-06, + "loss": 0.8147, + "step": 2773 + }, + { + "epoch": 0.15267763773460288, + "grad_norm": 1.0472246408462524, + "learning_rate": 9.866251676387967e-06, + "loss": 0.8019, + "step": 2774 + }, + { + "epoch": 0.15273267653695854, + "grad_norm": 0.7916847467422485, + "learning_rate": 9.866152070351347e-06, + "loss": 0.7698, + "step": 2775 + }, + { + "epoch": 0.1527877153393142, + "grad_norm": 0.8421853184700012, + "learning_rate": 9.86605242774203e-06, + "loss": 0.8085, + "step": 2776 + }, + { + "epoch": 0.15284275414166987, + "grad_norm": 0.7990233898162842, + "learning_rate": 9.865952748560768e-06, + "loss": 0.8878, + "step": 2777 + }, + { + "epoch": 0.15289779294402553, + "grad_norm": 0.8017451167106628, + "learning_rate": 9.865853032808305e-06, + "loss": 0.8707, + "step": 2778 + }, + { + "epoch": 0.1529528317463812, + "grad_norm": 0.739850640296936, + "learning_rate": 9.865753280485393e-06, + "loss": 0.7884, + "step": 2779 + }, + { + "epoch": 0.15300787054873685, + "grad_norm": 1.0682430267333984, + "learning_rate": 9.865653491592784e-06, + "loss": 0.8548, + "step": 2780 + }, + { + "epoch": 0.1530629093510925, + "grad_norm": 0.7766296863555908, + "learning_rate": 9.865553666131225e-06, + "loss": 0.7786, + "step": 2781 + }, + { + "epoch": 0.15311794815344817, + "grad_norm": 0.8903290629386902, + "learning_rate": 9.865453804101466e-06, + "loss": 0.8978, + "step": 2782 + }, + { + "epoch": 0.15317298695580384, + "grad_norm": 0.8624514937400818, + "learning_rate": 9.86535390550426e-06, + "loss": 0.8472, + "step": 2783 + }, + { + "epoch": 0.1532280257581595, + "grad_norm": 0.7765294909477234, + "learning_rate": 9.865253970340356e-06, + "loss": 0.7702, + "step": 2784 + }, + { + "epoch": 0.15328306456051516, + "grad_norm": 0.9349095225334167, + "learning_rate": 9.865153998610504e-06, + "loss": 0.9154, + "step": 2785 + }, + { + "epoch": 0.15333810336287082, + "grad_norm": 0.8435478210449219, + "learning_rate": 9.865053990315458e-06, + "loss": 0.8986, + "step": 2786 + }, + { + "epoch": 0.15339314216522648, + "grad_norm": 0.8003486394882202, + "learning_rate": 9.864953945455968e-06, + "loss": 0.767, + "step": 2787 + }, + { + "epoch": 0.15344818096758214, + "grad_norm": 0.8060823678970337, + "learning_rate": 9.86485386403279e-06, + "loss": 0.8332, + "step": 2788 + }, + { + "epoch": 0.1535032197699378, + "grad_norm": 0.7914995551109314, + "learning_rate": 9.864753746046668e-06, + "loss": 0.6706, + "step": 2789 + }, + { + "epoch": 0.15355825857229347, + "grad_norm": 0.7792215943336487, + "learning_rate": 9.86465359149836e-06, + "loss": 0.8721, + "step": 2790 + }, + { + "epoch": 0.15361329737464913, + "grad_norm": 0.8572536110877991, + "learning_rate": 9.864553400388619e-06, + "loss": 0.8378, + "step": 2791 + }, + { + "epoch": 0.1536683361770048, + "grad_norm": 0.7645615339279175, + "learning_rate": 9.864453172718195e-06, + "loss": 0.6909, + "step": 2792 + }, + { + "epoch": 0.15372337497936045, + "grad_norm": 0.7627308964729309, + "learning_rate": 9.864352908487846e-06, + "loss": 0.7918, + "step": 2793 + }, + { + "epoch": 0.1537784137817161, + "grad_norm": 1.0830100774765015, + "learning_rate": 9.86425260769832e-06, + "loss": 0.9007, + "step": 2794 + }, + { + "epoch": 0.15383345258407177, + "grad_norm": 0.7667998671531677, + "learning_rate": 9.864152270350374e-06, + "loss": 0.832, + "step": 2795 + }, + { + "epoch": 0.15388849138642743, + "grad_norm": 0.9967591762542725, + "learning_rate": 9.864051896444764e-06, + "loss": 0.8917, + "step": 2796 + }, + { + "epoch": 0.1539435301887831, + "grad_norm": 0.8948462605476379, + "learning_rate": 9.86395148598224e-06, + "loss": 0.983, + "step": 2797 + }, + { + "epoch": 0.15399856899113876, + "grad_norm": 0.7857423424720764, + "learning_rate": 9.863851038963556e-06, + "loss": 0.7826, + "step": 2798 + }, + { + "epoch": 0.15405360779349442, + "grad_norm": 0.8821337223052979, + "learning_rate": 9.863750555389473e-06, + "loss": 0.8918, + "step": 2799 + }, + { + "epoch": 0.15410864659585008, + "grad_norm": 0.7896875143051147, + "learning_rate": 9.863650035260742e-06, + "loss": 0.8199, + "step": 2800 + }, + { + "epoch": 0.15416368539820574, + "grad_norm": 0.8046941161155701, + "learning_rate": 9.86354947857812e-06, + "loss": 0.8572, + "step": 2801 + }, + { + "epoch": 0.1542187242005614, + "grad_norm": 0.7266830205917358, + "learning_rate": 9.863448885342361e-06, + "loss": 0.8315, + "step": 2802 + }, + { + "epoch": 0.15427376300291706, + "grad_norm": 0.9009475708007812, + "learning_rate": 9.863348255554222e-06, + "loss": 0.7928, + "step": 2803 + }, + { + "epoch": 0.15432880180527273, + "grad_norm": 0.963364839553833, + "learning_rate": 9.863247589214459e-06, + "loss": 0.918, + "step": 2804 + }, + { + "epoch": 0.1543838406076284, + "grad_norm": 0.8278035521507263, + "learning_rate": 9.863146886323829e-06, + "loss": 0.8497, + "step": 2805 + }, + { + "epoch": 0.15443887940998405, + "grad_norm": 0.7360561490058899, + "learning_rate": 9.86304614688309e-06, + "loss": 0.676, + "step": 2806 + }, + { + "epoch": 0.1544939182123397, + "grad_norm": 0.7679837346076965, + "learning_rate": 9.862945370892996e-06, + "loss": 0.8114, + "step": 2807 + }, + { + "epoch": 0.15454895701469537, + "grad_norm": 0.8550567030906677, + "learning_rate": 9.862844558354309e-06, + "loss": 0.8222, + "step": 2808 + }, + { + "epoch": 0.15460399581705103, + "grad_norm": 0.7852397561073303, + "learning_rate": 9.86274370926778e-06, + "loss": 0.7449, + "step": 2809 + }, + { + "epoch": 0.1546590346194067, + "grad_norm": 0.9120833277702332, + "learning_rate": 9.862642823634175e-06, + "loss": 0.8702, + "step": 2810 + }, + { + "epoch": 0.15471407342176235, + "grad_norm": 0.8729703426361084, + "learning_rate": 9.862541901454246e-06, + "loss": 0.8064, + "step": 2811 + }, + { + "epoch": 0.15476911222411802, + "grad_norm": 0.7935470342636108, + "learning_rate": 9.862440942728754e-06, + "loss": 0.8502, + "step": 2812 + }, + { + "epoch": 0.15482415102647368, + "grad_norm": 0.8640689849853516, + "learning_rate": 9.86233994745846e-06, + "loss": 0.8159, + "step": 2813 + }, + { + "epoch": 0.1548791898288293, + "grad_norm": 0.9959222078323364, + "learning_rate": 9.862238915644116e-06, + "loss": 0.7767, + "step": 2814 + }, + { + "epoch": 0.15493422863118497, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.862137847286487e-06, + "loss": 0.8293, + "step": 2815 + }, + { + "epoch": 0.15498926743354063, + "grad_norm": 0.8764606714248657, + "learning_rate": 9.862036742386335e-06, + "loss": 0.856, + "step": 2816 + }, + { + "epoch": 0.1550443062358963, + "grad_norm": 0.743727445602417, + "learning_rate": 9.861935600944413e-06, + "loss": 0.7099, + "step": 2817 + }, + { + "epoch": 0.15509934503825196, + "grad_norm": 0.7866224050521851, + "learning_rate": 9.861834422961485e-06, + "loss": 0.8805, + "step": 2818 + }, + { + "epoch": 0.15515438384060762, + "grad_norm": 0.8333723545074463, + "learning_rate": 9.861733208438311e-06, + "loss": 0.8486, + "step": 2819 + }, + { + "epoch": 0.15520942264296328, + "grad_norm": 0.8261659741401672, + "learning_rate": 9.861631957375652e-06, + "loss": 0.8896, + "step": 2820 + }, + { + "epoch": 0.15526446144531894, + "grad_norm": 0.8381538987159729, + "learning_rate": 9.861530669774268e-06, + "loss": 0.8686, + "step": 2821 + }, + { + "epoch": 0.1553195002476746, + "grad_norm": 0.9184440970420837, + "learning_rate": 9.861429345634923e-06, + "loss": 0.9702, + "step": 2822 + }, + { + "epoch": 0.15537453905003026, + "grad_norm": 0.8170294165611267, + "learning_rate": 9.861327984958374e-06, + "loss": 0.8298, + "step": 2823 + }, + { + "epoch": 0.15542957785238593, + "grad_norm": 0.8361968398094177, + "learning_rate": 9.861226587745385e-06, + "loss": 0.8232, + "step": 2824 + }, + { + "epoch": 0.1554846166547416, + "grad_norm": 0.7437820434570312, + "learning_rate": 9.861125153996718e-06, + "loss": 0.8271, + "step": 2825 + }, + { + "epoch": 0.15553965545709725, + "grad_norm": 0.715887188911438, + "learning_rate": 9.861023683713137e-06, + "loss": 0.7726, + "step": 2826 + }, + { + "epoch": 0.1555946942594529, + "grad_norm": 0.8358462452888489, + "learning_rate": 9.860922176895403e-06, + "loss": 0.8247, + "step": 2827 + }, + { + "epoch": 0.15564973306180857, + "grad_norm": 0.8620158433914185, + "learning_rate": 9.860820633544278e-06, + "loss": 0.8804, + "step": 2828 + }, + { + "epoch": 0.15570477186416423, + "grad_norm": 0.9035346508026123, + "learning_rate": 9.860719053660527e-06, + "loss": 0.7973, + "step": 2829 + }, + { + "epoch": 0.1557598106665199, + "grad_norm": 0.8014782071113586, + "learning_rate": 9.860617437244914e-06, + "loss": 0.7914, + "step": 2830 + }, + { + "epoch": 0.15581484946887555, + "grad_norm": 0.7788864970207214, + "learning_rate": 9.8605157842982e-06, + "loss": 0.7377, + "step": 2831 + }, + { + "epoch": 0.15586988827123122, + "grad_norm": 0.7475222945213318, + "learning_rate": 9.860414094821152e-06, + "loss": 0.7173, + "step": 2832 + }, + { + "epoch": 0.15592492707358688, + "grad_norm": 0.8866652846336365, + "learning_rate": 9.86031236881453e-06, + "loss": 0.8231, + "step": 2833 + }, + { + "epoch": 0.15597996587594254, + "grad_norm": 0.8725677728652954, + "learning_rate": 9.860210606279102e-06, + "loss": 0.9025, + "step": 2834 + }, + { + "epoch": 0.1560350046782982, + "grad_norm": 0.7608423233032227, + "learning_rate": 9.860108807215634e-06, + "loss": 0.8385, + "step": 2835 + }, + { + "epoch": 0.15609004348065386, + "grad_norm": 0.8237566351890564, + "learning_rate": 9.860006971624887e-06, + "loss": 0.8635, + "step": 2836 + }, + { + "epoch": 0.15614508228300952, + "grad_norm": 0.8078347444534302, + "learning_rate": 9.859905099507629e-06, + "loss": 0.7916, + "step": 2837 + }, + { + "epoch": 0.15620012108536518, + "grad_norm": 0.8282070755958557, + "learning_rate": 9.859803190864626e-06, + "loss": 0.8141, + "step": 2838 + }, + { + "epoch": 0.15625515988772085, + "grad_norm": 0.7639191150665283, + "learning_rate": 9.859701245696642e-06, + "loss": 0.7457, + "step": 2839 + }, + { + "epoch": 0.1563101986900765, + "grad_norm": 0.8429144620895386, + "learning_rate": 9.859599264004446e-06, + "loss": 0.9176, + "step": 2840 + }, + { + "epoch": 0.15636523749243217, + "grad_norm": 0.7792791724205017, + "learning_rate": 9.859497245788801e-06, + "loss": 0.8738, + "step": 2841 + }, + { + "epoch": 0.15642027629478783, + "grad_norm": 0.9018417596817017, + "learning_rate": 9.859395191050476e-06, + "loss": 0.841, + "step": 2842 + }, + { + "epoch": 0.1564753150971435, + "grad_norm": 0.7113705277442932, + "learning_rate": 9.859293099790239e-06, + "loss": 0.6576, + "step": 2843 + }, + { + "epoch": 0.15653035389949915, + "grad_norm": 0.8376311659812927, + "learning_rate": 9.859190972008853e-06, + "loss": 0.8559, + "step": 2844 + }, + { + "epoch": 0.15658539270185481, + "grad_norm": 0.7689141035079956, + "learning_rate": 9.859088807707092e-06, + "loss": 0.7844, + "step": 2845 + }, + { + "epoch": 0.15664043150421048, + "grad_norm": 0.7559483647346497, + "learning_rate": 9.858986606885717e-06, + "loss": 0.8676, + "step": 2846 + }, + { + "epoch": 0.15669547030656614, + "grad_norm": 0.7743827700614929, + "learning_rate": 9.8588843695455e-06, + "loss": 0.7995, + "step": 2847 + }, + { + "epoch": 0.1567505091089218, + "grad_norm": 0.8631327152252197, + "learning_rate": 9.85878209568721e-06, + "loss": 0.801, + "step": 2848 + }, + { + "epoch": 0.15680554791127746, + "grad_norm": 0.7454009056091309, + "learning_rate": 9.858679785311613e-06, + "loss": 0.8172, + "step": 2849 + }, + { + "epoch": 0.15686058671363312, + "grad_norm": 0.7915313839912415, + "learning_rate": 9.858577438419479e-06, + "loss": 0.833, + "step": 2850 + }, + { + "epoch": 0.15691562551598878, + "grad_norm": 0.8472526669502258, + "learning_rate": 9.858475055011578e-06, + "loss": 0.8249, + "step": 2851 + }, + { + "epoch": 0.15697066431834444, + "grad_norm": 0.7967580556869507, + "learning_rate": 9.85837263508868e-06, + "loss": 0.7533, + "step": 2852 + }, + { + "epoch": 0.1570257031207001, + "grad_norm": 0.7476257085800171, + "learning_rate": 9.858270178651554e-06, + "loss": 0.7918, + "step": 2853 + }, + { + "epoch": 0.15708074192305577, + "grad_norm": 0.8736184239387512, + "learning_rate": 9.858167685700968e-06, + "loss": 0.8254, + "step": 2854 + }, + { + "epoch": 0.15713578072541143, + "grad_norm": 0.8734819889068604, + "learning_rate": 9.858065156237694e-06, + "loss": 0.749, + "step": 2855 + }, + { + "epoch": 0.1571908195277671, + "grad_norm": 1.0344874858856201, + "learning_rate": 9.857962590262506e-06, + "loss": 0.9578, + "step": 2856 + }, + { + "epoch": 0.15724585833012272, + "grad_norm": 0.81183922290802, + "learning_rate": 9.857859987776168e-06, + "loss": 0.8845, + "step": 2857 + }, + { + "epoch": 0.15730089713247838, + "grad_norm": 0.8252540230751038, + "learning_rate": 9.857757348779456e-06, + "loss": 0.7862, + "step": 2858 + }, + { + "epoch": 0.15735593593483405, + "grad_norm": 0.7468119859695435, + "learning_rate": 9.85765467327314e-06, + "loss": 0.7587, + "step": 2859 + }, + { + "epoch": 0.1574109747371897, + "grad_norm": 0.8095998167991638, + "learning_rate": 9.857551961257993e-06, + "loss": 0.7467, + "step": 2860 + }, + { + "epoch": 0.15746601353954537, + "grad_norm": 0.8908564448356628, + "learning_rate": 9.857449212734785e-06, + "loss": 0.8199, + "step": 2861 + }, + { + "epoch": 0.15752105234190103, + "grad_norm": 0.7605593204498291, + "learning_rate": 9.857346427704288e-06, + "loss": 0.7196, + "step": 2862 + }, + { + "epoch": 0.1575760911442567, + "grad_norm": 0.9250784516334534, + "learning_rate": 9.857243606167276e-06, + "loss": 0.7366, + "step": 2863 + }, + { + "epoch": 0.15763112994661235, + "grad_norm": 0.882796585559845, + "learning_rate": 9.85714074812452e-06, + "loss": 0.8422, + "step": 2864 + }, + { + "epoch": 0.15768616874896801, + "grad_norm": 1.0014574527740479, + "learning_rate": 9.857037853576797e-06, + "loss": 0.8762, + "step": 2865 + }, + { + "epoch": 0.15774120755132368, + "grad_norm": 0.86713045835495, + "learning_rate": 9.856934922524877e-06, + "loss": 0.9282, + "step": 2866 + }, + { + "epoch": 0.15779624635367934, + "grad_norm": 1.1457390785217285, + "learning_rate": 9.856831954969532e-06, + "loss": 0.7947, + "step": 2867 + }, + { + "epoch": 0.157851285156035, + "grad_norm": 0.8902556896209717, + "learning_rate": 9.85672895091154e-06, + "loss": 0.928, + "step": 2868 + }, + { + "epoch": 0.15790632395839066, + "grad_norm": 0.7978467345237732, + "learning_rate": 9.856625910351674e-06, + "loss": 0.7382, + "step": 2869 + }, + { + "epoch": 0.15796136276074632, + "grad_norm": 0.741457462310791, + "learning_rate": 9.856522833290705e-06, + "loss": 0.7736, + "step": 2870 + }, + { + "epoch": 0.15801640156310198, + "grad_norm": 0.8330628871917725, + "learning_rate": 9.856419719729413e-06, + "loss": 0.8396, + "step": 2871 + }, + { + "epoch": 0.15807144036545764, + "grad_norm": 0.8771876692771912, + "learning_rate": 9.85631656966857e-06, + "loss": 0.6669, + "step": 2872 + }, + { + "epoch": 0.1581264791678133, + "grad_norm": 0.8073394298553467, + "learning_rate": 9.85621338310895e-06, + "loss": 0.8206, + "step": 2873 + }, + { + "epoch": 0.15818151797016897, + "grad_norm": 1.1058349609375, + "learning_rate": 9.85611016005133e-06, + "loss": 0.9526, + "step": 2874 + }, + { + "epoch": 0.15823655677252463, + "grad_norm": 0.7734992504119873, + "learning_rate": 9.856006900496488e-06, + "loss": 0.7477, + "step": 2875 + }, + { + "epoch": 0.1582915955748803, + "grad_norm": 0.9053219556808472, + "learning_rate": 9.855903604445196e-06, + "loss": 0.8009, + "step": 2876 + }, + { + "epoch": 0.15834663437723595, + "grad_norm": 0.8774041533470154, + "learning_rate": 9.855800271898233e-06, + "loss": 0.854, + "step": 2877 + }, + { + "epoch": 0.1584016731795916, + "grad_norm": 0.8346550464630127, + "learning_rate": 9.855696902856376e-06, + "loss": 0.7976, + "step": 2878 + }, + { + "epoch": 0.15845671198194727, + "grad_norm": 0.7781139016151428, + "learning_rate": 9.855593497320401e-06, + "loss": 0.7693, + "step": 2879 + }, + { + "epoch": 0.15851175078430293, + "grad_norm": 0.8707864880561829, + "learning_rate": 9.855490055291084e-06, + "loss": 0.882, + "step": 2880 + }, + { + "epoch": 0.1585667895866586, + "grad_norm": 0.7982275485992432, + "learning_rate": 9.855386576769203e-06, + "loss": 0.8457, + "step": 2881 + }, + { + "epoch": 0.15862182838901426, + "grad_norm": 0.7577090263366699, + "learning_rate": 9.855283061755536e-06, + "loss": 0.754, + "step": 2882 + }, + { + "epoch": 0.15867686719136992, + "grad_norm": 0.7422069311141968, + "learning_rate": 9.855179510250863e-06, + "loss": 0.673, + "step": 2883 + }, + { + "epoch": 0.15873190599372558, + "grad_norm": 0.7730041742324829, + "learning_rate": 9.85507592225596e-06, + "loss": 0.7888, + "step": 2884 + }, + { + "epoch": 0.15878694479608124, + "grad_norm": 0.7370560169219971, + "learning_rate": 9.854972297771605e-06, + "loss": 0.7762, + "step": 2885 + }, + { + "epoch": 0.1588419835984369, + "grad_norm": 0.725074291229248, + "learning_rate": 9.854868636798577e-06, + "loss": 0.7951, + "step": 2886 + }, + { + "epoch": 0.15889702240079256, + "grad_norm": 0.8088375926017761, + "learning_rate": 9.854764939337657e-06, + "loss": 0.8557, + "step": 2887 + }, + { + "epoch": 0.15895206120314823, + "grad_norm": 0.8268256187438965, + "learning_rate": 9.854661205389624e-06, + "loss": 0.7641, + "step": 2888 + }, + { + "epoch": 0.1590071000055039, + "grad_norm": 0.8079462051391602, + "learning_rate": 9.854557434955257e-06, + "loss": 0.7947, + "step": 2889 + }, + { + "epoch": 0.15906213880785955, + "grad_norm": 0.8257912993431091, + "learning_rate": 9.854453628035335e-06, + "loss": 0.771, + "step": 2890 + }, + { + "epoch": 0.1591171776102152, + "grad_norm": 0.8901774287223816, + "learning_rate": 9.85434978463064e-06, + "loss": 0.9415, + "step": 2891 + }, + { + "epoch": 0.15917221641257087, + "grad_norm": 0.8283013105392456, + "learning_rate": 9.854245904741948e-06, + "loss": 0.7267, + "step": 2892 + }, + { + "epoch": 0.15922725521492653, + "grad_norm": 0.8665382266044617, + "learning_rate": 9.854141988370045e-06, + "loss": 0.8681, + "step": 2893 + }, + { + "epoch": 0.1592822940172822, + "grad_norm": 0.786494255065918, + "learning_rate": 9.854038035515712e-06, + "loss": 0.7614, + "step": 2894 + }, + { + "epoch": 0.15933733281963786, + "grad_norm": 1.0548759698867798, + "learning_rate": 9.853934046179727e-06, + "loss": 0.861, + "step": 2895 + }, + { + "epoch": 0.15939237162199352, + "grad_norm": 0.8565425276756287, + "learning_rate": 9.853830020362873e-06, + "loss": 0.7858, + "step": 2896 + }, + { + "epoch": 0.15944741042434918, + "grad_norm": 0.7982691526412964, + "learning_rate": 9.853725958065933e-06, + "loss": 0.8797, + "step": 2897 + }, + { + "epoch": 0.15950244922670484, + "grad_norm": 0.8613169193267822, + "learning_rate": 9.853621859289686e-06, + "loss": 0.9217, + "step": 2898 + }, + { + "epoch": 0.1595574880290605, + "grad_norm": 0.950639009475708, + "learning_rate": 9.853517724034918e-06, + "loss": 0.8315, + "step": 2899 + }, + { + "epoch": 0.15961252683141613, + "grad_norm": 0.7940176129341125, + "learning_rate": 9.853413552302409e-06, + "loss": 0.7713, + "step": 2900 + }, + { + "epoch": 0.1596675656337718, + "grad_norm": 0.7716153264045715, + "learning_rate": 9.853309344092944e-06, + "loss": 0.7922, + "step": 2901 + }, + { + "epoch": 0.15972260443612746, + "grad_norm": 0.7626190781593323, + "learning_rate": 9.853205099407303e-06, + "loss": 0.7278, + "step": 2902 + }, + { + "epoch": 0.15977764323848312, + "grad_norm": 0.8523116707801819, + "learning_rate": 9.853100818246274e-06, + "loss": 0.8136, + "step": 2903 + }, + { + "epoch": 0.15983268204083878, + "grad_norm": 0.7636643052101135, + "learning_rate": 9.852996500610637e-06, + "loss": 0.6984, + "step": 2904 + }, + { + "epoch": 0.15988772084319444, + "grad_norm": 0.799201250076294, + "learning_rate": 9.852892146501179e-06, + "loss": 0.8319, + "step": 2905 + }, + { + "epoch": 0.1599427596455501, + "grad_norm": 0.7743694186210632, + "learning_rate": 9.85278775591868e-06, + "loss": 0.81, + "step": 2906 + }, + { + "epoch": 0.15999779844790576, + "grad_norm": 0.8964856863021851, + "learning_rate": 9.85268332886393e-06, + "loss": 0.9227, + "step": 2907 + }, + { + "epoch": 0.16005283725026143, + "grad_norm": 0.8809369802474976, + "learning_rate": 9.852578865337708e-06, + "loss": 0.9285, + "step": 2908 + }, + { + "epoch": 0.1601078760526171, + "grad_norm": 0.8960002064704895, + "learning_rate": 9.852474365340806e-06, + "loss": 0.8611, + "step": 2909 + }, + { + "epoch": 0.16016291485497275, + "grad_norm": 0.7539754509925842, + "learning_rate": 9.852369828874002e-06, + "loss": 0.7455, + "step": 2910 + }, + { + "epoch": 0.1602179536573284, + "grad_norm": 0.8189692497253418, + "learning_rate": 9.852265255938088e-06, + "loss": 0.8321, + "step": 2911 + }, + { + "epoch": 0.16027299245968407, + "grad_norm": 0.8708549737930298, + "learning_rate": 9.852160646533844e-06, + "loss": 0.8373, + "step": 2912 + }, + { + "epoch": 0.16032803126203973, + "grad_norm": 0.7701451778411865, + "learning_rate": 9.852056000662063e-06, + "loss": 0.805, + "step": 2913 + }, + { + "epoch": 0.1603830700643954, + "grad_norm": 0.9111948609352112, + "learning_rate": 9.851951318323526e-06, + "loss": 0.8513, + "step": 2914 + }, + { + "epoch": 0.16043810886675106, + "grad_norm": 0.7863909602165222, + "learning_rate": 9.85184659951902e-06, + "loss": 0.7856, + "step": 2915 + }, + { + "epoch": 0.16049314766910672, + "grad_norm": 0.9000817537307739, + "learning_rate": 9.851741844249336e-06, + "loss": 0.9172, + "step": 2916 + }, + { + "epoch": 0.16054818647146238, + "grad_norm": 1.0953118801116943, + "learning_rate": 9.851637052515259e-06, + "loss": 0.8564, + "step": 2917 + }, + { + "epoch": 0.16060322527381804, + "grad_norm": 0.8405389785766602, + "learning_rate": 9.851532224317575e-06, + "loss": 0.8317, + "step": 2918 + }, + { + "epoch": 0.1606582640761737, + "grad_norm": 0.8524565100669861, + "learning_rate": 9.851427359657075e-06, + "loss": 0.8765, + "step": 2919 + }, + { + "epoch": 0.16071330287852936, + "grad_norm": 0.8234089016914368, + "learning_rate": 9.851322458534546e-06, + "loss": 0.7873, + "step": 2920 + }, + { + "epoch": 0.16076834168088502, + "grad_norm": 0.7879638671875, + "learning_rate": 9.851217520950775e-06, + "loss": 0.8394, + "step": 2921 + }, + { + "epoch": 0.16082338048324069, + "grad_norm": 0.8168820738792419, + "learning_rate": 9.851112546906552e-06, + "loss": 0.8223, + "step": 2922 + }, + { + "epoch": 0.16087841928559635, + "grad_norm": 0.9423845410346985, + "learning_rate": 9.851007536402666e-06, + "loss": 0.9256, + "step": 2923 + }, + { + "epoch": 0.160933458087952, + "grad_norm": 0.7875099778175354, + "learning_rate": 9.850902489439906e-06, + "loss": 0.8199, + "step": 2924 + }, + { + "epoch": 0.16098849689030767, + "grad_norm": 0.6934793591499329, + "learning_rate": 9.85079740601906e-06, + "loss": 0.671, + "step": 2925 + }, + { + "epoch": 0.16104353569266333, + "grad_norm": 0.8172206282615662, + "learning_rate": 9.85069228614092e-06, + "loss": 0.7633, + "step": 2926 + }, + { + "epoch": 0.161098574495019, + "grad_norm": 0.72749263048172, + "learning_rate": 9.850587129806274e-06, + "loss": 0.8719, + "step": 2927 + }, + { + "epoch": 0.16115361329737465, + "grad_norm": 0.8416743874549866, + "learning_rate": 9.850481937015917e-06, + "loss": 0.8438, + "step": 2928 + }, + { + "epoch": 0.16120865209973032, + "grad_norm": 0.7415444850921631, + "learning_rate": 9.850376707770633e-06, + "loss": 0.7673, + "step": 2929 + }, + { + "epoch": 0.16126369090208598, + "grad_norm": 0.9364289045333862, + "learning_rate": 9.850271442071217e-06, + "loss": 0.7224, + "step": 2930 + }, + { + "epoch": 0.16131872970444164, + "grad_norm": 0.7314212918281555, + "learning_rate": 9.85016613991846e-06, + "loss": 0.7759, + "step": 2931 + }, + { + "epoch": 0.1613737685067973, + "grad_norm": 0.8940219283103943, + "learning_rate": 9.850060801313151e-06, + "loss": 0.8432, + "step": 2932 + }, + { + "epoch": 0.16142880730915296, + "grad_norm": 0.7499691843986511, + "learning_rate": 9.849955426256084e-06, + "loss": 0.8171, + "step": 2933 + }, + { + "epoch": 0.16148384611150862, + "grad_norm": 0.8123053312301636, + "learning_rate": 9.84985001474805e-06, + "loss": 0.7839, + "step": 2934 + }, + { + "epoch": 0.16153888491386428, + "grad_norm": 0.819618821144104, + "learning_rate": 9.849744566789842e-06, + "loss": 0.9123, + "step": 2935 + }, + { + "epoch": 0.16159392371621994, + "grad_norm": 0.791088342666626, + "learning_rate": 9.849639082382251e-06, + "loss": 0.8347, + "step": 2936 + }, + { + "epoch": 0.1616489625185756, + "grad_norm": 0.8166706562042236, + "learning_rate": 9.849533561526072e-06, + "loss": 0.8309, + "step": 2937 + }, + { + "epoch": 0.16170400132093127, + "grad_norm": 0.7944774031639099, + "learning_rate": 9.849428004222098e-06, + "loss": 0.8387, + "step": 2938 + }, + { + "epoch": 0.16175904012328693, + "grad_norm": 0.7414719462394714, + "learning_rate": 9.849322410471119e-06, + "loss": 0.71, + "step": 2939 + }, + { + "epoch": 0.1618140789256426, + "grad_norm": 0.8983511924743652, + "learning_rate": 9.849216780273931e-06, + "loss": 0.8902, + "step": 2940 + }, + { + "epoch": 0.16186911772799825, + "grad_norm": 0.9058687686920166, + "learning_rate": 9.849111113631329e-06, + "loss": 0.8804, + "step": 2941 + }, + { + "epoch": 0.1619241565303539, + "grad_norm": 0.948871374130249, + "learning_rate": 9.849005410544105e-06, + "loss": 0.9871, + "step": 2942 + }, + { + "epoch": 0.16197919533270955, + "grad_norm": 0.8240115642547607, + "learning_rate": 9.848899671013055e-06, + "loss": 0.8708, + "step": 2943 + }, + { + "epoch": 0.1620342341350652, + "grad_norm": 0.879953145980835, + "learning_rate": 9.848793895038972e-06, + "loss": 0.9279, + "step": 2944 + }, + { + "epoch": 0.16208927293742087, + "grad_norm": 0.8464690446853638, + "learning_rate": 9.848688082622653e-06, + "loss": 0.8418, + "step": 2945 + }, + { + "epoch": 0.16214431173977653, + "grad_norm": 0.8990732431411743, + "learning_rate": 9.848582233764891e-06, + "loss": 0.8622, + "step": 2946 + }, + { + "epoch": 0.1621993505421322, + "grad_norm": 0.8054911494255066, + "learning_rate": 9.848476348466483e-06, + "loss": 0.8295, + "step": 2947 + }, + { + "epoch": 0.16225438934448785, + "grad_norm": 0.7904845476150513, + "learning_rate": 9.848370426728226e-06, + "loss": 0.7777, + "step": 2948 + }, + { + "epoch": 0.16230942814684352, + "grad_norm": 1.0143954753875732, + "learning_rate": 9.848264468550915e-06, + "loss": 0.8556, + "step": 2949 + }, + { + "epoch": 0.16236446694919918, + "grad_norm": 0.7201125621795654, + "learning_rate": 9.848158473935344e-06, + "loss": 0.7981, + "step": 2950 + }, + { + "epoch": 0.16241950575155484, + "grad_norm": 0.8322157263755798, + "learning_rate": 9.848052442882312e-06, + "loss": 0.8323, + "step": 2951 + }, + { + "epoch": 0.1624745445539105, + "grad_norm": 0.7740346193313599, + "learning_rate": 9.847946375392617e-06, + "loss": 0.8355, + "step": 2952 + }, + { + "epoch": 0.16252958335626616, + "grad_norm": 0.8955645561218262, + "learning_rate": 9.847840271467053e-06, + "loss": 0.7161, + "step": 2953 + }, + { + "epoch": 0.16258462215862182, + "grad_norm": 0.800364077091217, + "learning_rate": 9.847734131106421e-06, + "loss": 0.8165, + "step": 2954 + }, + { + "epoch": 0.16263966096097748, + "grad_norm": 0.8305484056472778, + "learning_rate": 9.847627954311516e-06, + "loss": 0.7846, + "step": 2955 + }, + { + "epoch": 0.16269469976333314, + "grad_norm": 0.7354590892791748, + "learning_rate": 9.847521741083136e-06, + "loss": 0.7743, + "step": 2956 + }, + { + "epoch": 0.1627497385656888, + "grad_norm": 0.8173812627792358, + "learning_rate": 9.847415491422083e-06, + "loss": 0.8626, + "step": 2957 + }, + { + "epoch": 0.16280477736804447, + "grad_norm": 0.7959356307983398, + "learning_rate": 9.84730920532915e-06, + "loss": 0.8016, + "step": 2958 + }, + { + "epoch": 0.16285981617040013, + "grad_norm": 0.8256500363349915, + "learning_rate": 9.84720288280514e-06, + "loss": 0.7407, + "step": 2959 + }, + { + "epoch": 0.1629148549727558, + "grad_norm": 0.8522148728370667, + "learning_rate": 9.84709652385085e-06, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.16296989377511145, + "grad_norm": 0.7791039943695068, + "learning_rate": 9.84699012846708e-06, + "loss": 0.7239, + "step": 2961 + }, + { + "epoch": 0.1630249325774671, + "grad_norm": 0.84294193983078, + "learning_rate": 9.84688369665463e-06, + "loss": 0.7498, + "step": 2962 + }, + { + "epoch": 0.16307997137982277, + "grad_norm": 0.7948899865150452, + "learning_rate": 9.846777228414299e-06, + "loss": 0.7713, + "step": 2963 + }, + { + "epoch": 0.16313501018217844, + "grad_norm": 0.6673180460929871, + "learning_rate": 9.846670723746888e-06, + "loss": 0.6759, + "step": 2964 + }, + { + "epoch": 0.1631900489845341, + "grad_norm": 0.8141015768051147, + "learning_rate": 9.846564182653199e-06, + "loss": 0.7928, + "step": 2965 + }, + { + "epoch": 0.16324508778688976, + "grad_norm": 0.967830240726471, + "learning_rate": 9.846457605134028e-06, + "loss": 0.823, + "step": 2966 + }, + { + "epoch": 0.16330012658924542, + "grad_norm": 0.8099361658096313, + "learning_rate": 9.84635099119018e-06, + "loss": 0.8724, + "step": 2967 + }, + { + "epoch": 0.16335516539160108, + "grad_norm": 0.7913978099822998, + "learning_rate": 9.846244340822456e-06, + "loss": 0.7106, + "step": 2968 + }, + { + "epoch": 0.16341020419395674, + "grad_norm": 0.8308563828468323, + "learning_rate": 9.846137654031655e-06, + "loss": 0.7631, + "step": 2969 + }, + { + "epoch": 0.1634652429963124, + "grad_norm": 0.8634191751480103, + "learning_rate": 9.846030930818582e-06, + "loss": 0.7363, + "step": 2970 + }, + { + "epoch": 0.16352028179866807, + "grad_norm": 0.8936432600021362, + "learning_rate": 9.845924171184038e-06, + "loss": 0.8714, + "step": 2971 + }, + { + "epoch": 0.16357532060102373, + "grad_norm": 0.8776300549507141, + "learning_rate": 9.845817375128825e-06, + "loss": 0.914, + "step": 2972 + }, + { + "epoch": 0.1636303594033794, + "grad_norm": 0.8793039321899414, + "learning_rate": 9.845710542653745e-06, + "loss": 0.7999, + "step": 2973 + }, + { + "epoch": 0.16368539820573505, + "grad_norm": 0.8391743302345276, + "learning_rate": 9.845603673759603e-06, + "loss": 0.8124, + "step": 2974 + }, + { + "epoch": 0.1637404370080907, + "grad_norm": 0.8487186431884766, + "learning_rate": 9.845496768447199e-06, + "loss": 0.8551, + "step": 2975 + }, + { + "epoch": 0.16379547581044637, + "grad_norm": 0.7780638933181763, + "learning_rate": 9.845389826717339e-06, + "loss": 0.7281, + "step": 2976 + }, + { + "epoch": 0.16385051461280203, + "grad_norm": 0.7209637761116028, + "learning_rate": 9.845282848570825e-06, + "loss": 0.6737, + "step": 2977 + }, + { + "epoch": 0.1639055534151577, + "grad_norm": 0.8414756059646606, + "learning_rate": 9.845175834008464e-06, + "loss": 0.8003, + "step": 2978 + }, + { + "epoch": 0.16396059221751336, + "grad_norm": 1.2730679512023926, + "learning_rate": 9.845068783031057e-06, + "loss": 0.8243, + "step": 2979 + }, + { + "epoch": 0.16401563101986902, + "grad_norm": 0.8573475480079651, + "learning_rate": 9.844961695639413e-06, + "loss": 0.7844, + "step": 2980 + }, + { + "epoch": 0.16407066982222468, + "grad_norm": 0.8029958605766296, + "learning_rate": 9.84485457183433e-06, + "loss": 0.7722, + "step": 2981 + }, + { + "epoch": 0.16412570862458034, + "grad_norm": 0.7839805483818054, + "learning_rate": 9.844747411616619e-06, + "loss": 0.8146, + "step": 2982 + }, + { + "epoch": 0.164180747426936, + "grad_norm": 0.7563499212265015, + "learning_rate": 9.844640214987082e-06, + "loss": 0.6909, + "step": 2983 + }, + { + "epoch": 0.16423578622929166, + "grad_norm": 0.7199193239212036, + "learning_rate": 9.844532981946527e-06, + "loss": 0.702, + "step": 2984 + }, + { + "epoch": 0.16429082503164733, + "grad_norm": 0.7519383430480957, + "learning_rate": 9.844425712495758e-06, + "loss": 0.6493, + "step": 2985 + }, + { + "epoch": 0.16434586383400296, + "grad_norm": 0.7493193745613098, + "learning_rate": 9.844318406635584e-06, + "loss": 0.8318, + "step": 2986 + }, + { + "epoch": 0.16440090263635862, + "grad_norm": 0.7951106429100037, + "learning_rate": 9.84421106436681e-06, + "loss": 0.923, + "step": 2987 + }, + { + "epoch": 0.16445594143871428, + "grad_norm": 0.8350820541381836, + "learning_rate": 9.844103685690238e-06, + "loss": 0.8091, + "step": 2988 + }, + { + "epoch": 0.16451098024106994, + "grad_norm": 0.773932695388794, + "learning_rate": 9.843996270606683e-06, + "loss": 0.8016, + "step": 2989 + }, + { + "epoch": 0.1645660190434256, + "grad_norm": 0.8208432793617249, + "learning_rate": 9.843888819116947e-06, + "loss": 0.7704, + "step": 2990 + }, + { + "epoch": 0.16462105784578127, + "grad_norm": 0.8552223443984985, + "learning_rate": 9.84378133122184e-06, + "loss": 0.8519, + "step": 2991 + }, + { + "epoch": 0.16467609664813693, + "grad_norm": 0.8015661835670471, + "learning_rate": 9.84367380692217e-06, + "loss": 0.8389, + "step": 2992 + }, + { + "epoch": 0.1647311354504926, + "grad_norm": 0.7828749418258667, + "learning_rate": 9.843566246218743e-06, + "loss": 0.7385, + "step": 2993 + }, + { + "epoch": 0.16478617425284825, + "grad_norm": 0.7761647701263428, + "learning_rate": 9.84345864911237e-06, + "loss": 0.8419, + "step": 2994 + }, + { + "epoch": 0.1648412130552039, + "grad_norm": 0.8839839100837708, + "learning_rate": 9.843351015603857e-06, + "loss": 0.8069, + "step": 2995 + }, + { + "epoch": 0.16489625185755957, + "grad_norm": 0.8611735105514526, + "learning_rate": 9.843243345694014e-06, + "loss": 0.9406, + "step": 2996 + }, + { + "epoch": 0.16495129065991523, + "grad_norm": 0.9042683839797974, + "learning_rate": 9.84313563938365e-06, + "loss": 0.821, + "step": 2997 + }, + { + "epoch": 0.1650063294622709, + "grad_norm": 0.8333690762519836, + "learning_rate": 9.843027896673577e-06, + "loss": 0.781, + "step": 2998 + }, + { + "epoch": 0.16506136826462656, + "grad_norm": 0.819922924041748, + "learning_rate": 9.8429201175646e-06, + "loss": 0.869, + "step": 2999 + }, + { + "epoch": 0.16511640706698222, + "grad_norm": 0.8349948525428772, + "learning_rate": 9.842812302057534e-06, + "loss": 0.9271, + "step": 3000 + }, + { + "epoch": 0.16517144586933788, + "grad_norm": 0.8981684446334839, + "learning_rate": 9.842704450153187e-06, + "loss": 0.7384, + "step": 3001 + }, + { + "epoch": 0.16522648467169354, + "grad_norm": 0.839133083820343, + "learning_rate": 9.842596561852369e-06, + "loss": 0.9016, + "step": 3002 + }, + { + "epoch": 0.1652815234740492, + "grad_norm": 0.8303349614143372, + "learning_rate": 9.842488637155891e-06, + "loss": 0.7488, + "step": 3003 + }, + { + "epoch": 0.16533656227640486, + "grad_norm": 0.8748323917388916, + "learning_rate": 9.842380676064566e-06, + "loss": 0.8163, + "step": 3004 + }, + { + "epoch": 0.16539160107876053, + "grad_norm": 0.782844603061676, + "learning_rate": 9.842272678579203e-06, + "loss": 0.8465, + "step": 3005 + }, + { + "epoch": 0.1654466398811162, + "grad_norm": 0.8068844676017761, + "learning_rate": 9.842164644700615e-06, + "loss": 0.8856, + "step": 3006 + }, + { + "epoch": 0.16550167868347185, + "grad_norm": 0.9174006581306458, + "learning_rate": 9.842056574429615e-06, + "loss": 0.7748, + "step": 3007 + }, + { + "epoch": 0.1655567174858275, + "grad_norm": 0.7453809380531311, + "learning_rate": 9.841948467767012e-06, + "loss": 0.7565, + "step": 3008 + }, + { + "epoch": 0.16561175628818317, + "grad_norm": 0.8408182859420776, + "learning_rate": 9.841840324713622e-06, + "loss": 0.7345, + "step": 3009 + }, + { + "epoch": 0.16566679509053883, + "grad_norm": 0.8599638938903809, + "learning_rate": 9.841732145270254e-06, + "loss": 0.8163, + "step": 3010 + }, + { + "epoch": 0.1657218338928945, + "grad_norm": 0.877616822719574, + "learning_rate": 9.841623929437725e-06, + "loss": 0.8685, + "step": 3011 + }, + { + "epoch": 0.16577687269525015, + "grad_norm": 0.7765643000602722, + "learning_rate": 9.841515677216846e-06, + "loss": 0.7281, + "step": 3012 + }, + { + "epoch": 0.16583191149760582, + "grad_norm": 0.7891712784767151, + "learning_rate": 9.841407388608431e-06, + "loss": 0.8618, + "step": 3013 + }, + { + "epoch": 0.16588695029996148, + "grad_norm": 0.9215571284294128, + "learning_rate": 9.841299063613295e-06, + "loss": 0.8709, + "step": 3014 + }, + { + "epoch": 0.16594198910231714, + "grad_norm": 0.8428288698196411, + "learning_rate": 9.841190702232249e-06, + "loss": 0.8227, + "step": 3015 + }, + { + "epoch": 0.1659970279046728, + "grad_norm": 0.9294042587280273, + "learning_rate": 9.841082304466112e-06, + "loss": 0.8203, + "step": 3016 + }, + { + "epoch": 0.16605206670702846, + "grad_norm": 0.7530880570411682, + "learning_rate": 9.840973870315695e-06, + "loss": 0.7681, + "step": 3017 + }, + { + "epoch": 0.16610710550938412, + "grad_norm": 1.0149626731872559, + "learning_rate": 9.840865399781814e-06, + "loss": 0.9212, + "step": 3018 + }, + { + "epoch": 0.16616214431173978, + "grad_norm": 0.7595353722572327, + "learning_rate": 9.840756892865285e-06, + "loss": 0.795, + "step": 3019 + }, + { + "epoch": 0.16621718311409545, + "grad_norm": 0.7893253564834595, + "learning_rate": 9.840648349566924e-06, + "loss": 0.8147, + "step": 3020 + }, + { + "epoch": 0.1662722219164511, + "grad_norm": 0.8190789222717285, + "learning_rate": 9.840539769887543e-06, + "loss": 0.7233, + "step": 3021 + }, + { + "epoch": 0.16632726071880677, + "grad_norm": 0.7771229147911072, + "learning_rate": 9.840431153827963e-06, + "loss": 0.7172, + "step": 3022 + }, + { + "epoch": 0.16638229952116243, + "grad_norm": 0.7379328012466431, + "learning_rate": 9.840322501388998e-06, + "loss": 0.7603, + "step": 3023 + }, + { + "epoch": 0.1664373383235181, + "grad_norm": 0.9488499760627747, + "learning_rate": 9.840213812571464e-06, + "loss": 0.8025, + "step": 3024 + }, + { + "epoch": 0.16649237712587375, + "grad_norm": 0.7135865092277527, + "learning_rate": 9.84010508737618e-06, + "loss": 0.7412, + "step": 3025 + }, + { + "epoch": 0.16654741592822941, + "grad_norm": 1.6780112981796265, + "learning_rate": 9.83999632580396e-06, + "loss": 0.9231, + "step": 3026 + }, + { + "epoch": 0.16660245473058508, + "grad_norm": 0.8815935850143433, + "learning_rate": 9.839887527855623e-06, + "loss": 0.7903, + "step": 3027 + }, + { + "epoch": 0.16665749353294074, + "grad_norm": 0.8942261338233948, + "learning_rate": 9.83977869353199e-06, + "loss": 0.8328, + "step": 3028 + }, + { + "epoch": 0.16671253233529637, + "grad_norm": 0.7866815328598022, + "learning_rate": 9.839669822833873e-06, + "loss": 0.8483, + "step": 3029 + }, + { + "epoch": 0.16676757113765203, + "grad_norm": 0.8133070468902588, + "learning_rate": 9.839560915762094e-06, + "loss": 0.8665, + "step": 3030 + }, + { + "epoch": 0.1668226099400077, + "grad_norm": 0.7442927360534668, + "learning_rate": 9.839451972317469e-06, + "loss": 0.6296, + "step": 3031 + }, + { + "epoch": 0.16687764874236335, + "grad_norm": 0.7505021691322327, + "learning_rate": 9.83934299250082e-06, + "loss": 0.7976, + "step": 3032 + }, + { + "epoch": 0.16693268754471902, + "grad_norm": 0.8310422897338867, + "learning_rate": 9.839233976312964e-06, + "loss": 0.9022, + "step": 3033 + }, + { + "epoch": 0.16698772634707468, + "grad_norm": 0.8175413012504578, + "learning_rate": 9.839124923754721e-06, + "loss": 0.8653, + "step": 3034 + }, + { + "epoch": 0.16704276514943034, + "grad_norm": 0.7963089346885681, + "learning_rate": 9.839015834826912e-06, + "loss": 0.7888, + "step": 3035 + }, + { + "epoch": 0.167097803951786, + "grad_norm": 0.8923391699790955, + "learning_rate": 9.838906709530353e-06, + "loss": 0.9396, + "step": 3036 + }, + { + "epoch": 0.16715284275414166, + "grad_norm": 0.7851678133010864, + "learning_rate": 9.838797547865869e-06, + "loss": 0.8163, + "step": 3037 + }, + { + "epoch": 0.16720788155649732, + "grad_norm": 0.817877471446991, + "learning_rate": 9.838688349834275e-06, + "loss": 0.8928, + "step": 3038 + }, + { + "epoch": 0.16726292035885298, + "grad_norm": 0.7603926062583923, + "learning_rate": 9.838579115436395e-06, + "loss": 0.7418, + "step": 3039 + }, + { + "epoch": 0.16731795916120865, + "grad_norm": 0.8086446523666382, + "learning_rate": 9.83846984467305e-06, + "loss": 0.8017, + "step": 3040 + }, + { + "epoch": 0.1673729979635643, + "grad_norm": 1.4745439291000366, + "learning_rate": 9.838360537545061e-06, + "loss": 0.7964, + "step": 3041 + }, + { + "epoch": 0.16742803676591997, + "grad_norm": 0.778404176235199, + "learning_rate": 9.83825119405325e-06, + "loss": 0.7395, + "step": 3042 + }, + { + "epoch": 0.16748307556827563, + "grad_norm": 0.8245886564254761, + "learning_rate": 9.838141814198439e-06, + "loss": 0.8697, + "step": 3043 + }, + { + "epoch": 0.1675381143706313, + "grad_norm": 0.8395472764968872, + "learning_rate": 9.838032397981448e-06, + "loss": 0.8545, + "step": 3044 + }, + { + "epoch": 0.16759315317298695, + "grad_norm": 0.8973744511604309, + "learning_rate": 9.8379229454031e-06, + "loss": 0.8999, + "step": 3045 + }, + { + "epoch": 0.16764819197534261, + "grad_norm": 1.2034368515014648, + "learning_rate": 9.837813456464219e-06, + "loss": 0.9039, + "step": 3046 + }, + { + "epoch": 0.16770323077769828, + "grad_norm": 0.862167477607727, + "learning_rate": 9.837703931165625e-06, + "loss": 0.889, + "step": 3047 + }, + { + "epoch": 0.16775826958005394, + "grad_norm": 0.7624714970588684, + "learning_rate": 9.837594369508146e-06, + "loss": 0.7072, + "step": 3048 + }, + { + "epoch": 0.1678133083824096, + "grad_norm": 0.7771621346473694, + "learning_rate": 9.8374847714926e-06, + "loss": 0.8769, + "step": 3049 + }, + { + "epoch": 0.16786834718476526, + "grad_norm": 0.7834492921829224, + "learning_rate": 9.837375137119816e-06, + "loss": 0.841, + "step": 3050 + }, + { + "epoch": 0.16792338598712092, + "grad_norm": 0.8175067901611328, + "learning_rate": 9.837265466390612e-06, + "loss": 0.8149, + "step": 3051 + }, + { + "epoch": 0.16797842478947658, + "grad_norm": 0.7474493384361267, + "learning_rate": 9.83715575930582e-06, + "loss": 0.7716, + "step": 3052 + }, + { + "epoch": 0.16803346359183224, + "grad_norm": 1.1263303756713867, + "learning_rate": 9.837046015866257e-06, + "loss": 0.8026, + "step": 3053 + }, + { + "epoch": 0.1680885023941879, + "grad_norm": 0.8741740584373474, + "learning_rate": 9.836936236072752e-06, + "loss": 0.8795, + "step": 3054 + }, + { + "epoch": 0.16814354119654357, + "grad_norm": 0.8108506798744202, + "learning_rate": 9.83682641992613e-06, + "loss": 0.7682, + "step": 3055 + }, + { + "epoch": 0.16819857999889923, + "grad_norm": 0.9380543231964111, + "learning_rate": 9.836716567427213e-06, + "loss": 0.8739, + "step": 3056 + }, + { + "epoch": 0.1682536188012549, + "grad_norm": 0.7755940556526184, + "learning_rate": 9.83660667857683e-06, + "loss": 0.7287, + "step": 3057 + }, + { + "epoch": 0.16830865760361055, + "grad_norm": 0.808907151222229, + "learning_rate": 9.836496753375807e-06, + "loss": 0.7988, + "step": 3058 + }, + { + "epoch": 0.1683636964059662, + "grad_norm": 1.1496524810791016, + "learning_rate": 9.836386791824967e-06, + "loss": 0.8621, + "step": 3059 + }, + { + "epoch": 0.16841873520832187, + "grad_norm": 0.8550384640693665, + "learning_rate": 9.83627679392514e-06, + "loss": 0.913, + "step": 3060 + }, + { + "epoch": 0.16847377401067753, + "grad_norm": 0.761142909526825, + "learning_rate": 9.83616675967715e-06, + "loss": 0.7271, + "step": 3061 + }, + { + "epoch": 0.1685288128130332, + "grad_norm": 0.8496200442314148, + "learning_rate": 9.836056689081828e-06, + "loss": 0.7885, + "step": 3062 + }, + { + "epoch": 0.16858385161538886, + "grad_norm": 0.8310382962226868, + "learning_rate": 9.835946582139996e-06, + "loss": 0.858, + "step": 3063 + }, + { + "epoch": 0.16863889041774452, + "grad_norm": 0.7870821952819824, + "learning_rate": 9.835836438852485e-06, + "loss": 0.7791, + "step": 3064 + }, + { + "epoch": 0.16869392922010018, + "grad_norm": 0.7170534729957581, + "learning_rate": 9.83572625922012e-06, + "loss": 0.6666, + "step": 3065 + }, + { + "epoch": 0.16874896802245584, + "grad_norm": 0.9764187932014465, + "learning_rate": 9.835616043243732e-06, + "loss": 0.8341, + "step": 3066 + }, + { + "epoch": 0.1688040068248115, + "grad_norm": 0.7453315258026123, + "learning_rate": 9.83550579092415e-06, + "loss": 0.7032, + "step": 3067 + }, + { + "epoch": 0.16885904562716716, + "grad_norm": 0.9205759763717651, + "learning_rate": 9.835395502262196e-06, + "loss": 0.813, + "step": 3068 + }, + { + "epoch": 0.16891408442952283, + "grad_norm": 0.8152205944061279, + "learning_rate": 9.835285177258708e-06, + "loss": 0.8275, + "step": 3069 + }, + { + "epoch": 0.1689691232318785, + "grad_norm": 0.8065707087516785, + "learning_rate": 9.83517481591451e-06, + "loss": 0.8648, + "step": 3070 + }, + { + "epoch": 0.16902416203423415, + "grad_norm": 0.7774410247802734, + "learning_rate": 9.835064418230432e-06, + "loss": 0.7818, + "step": 3071 + }, + { + "epoch": 0.16907920083658978, + "grad_norm": 0.8591069579124451, + "learning_rate": 9.834953984207305e-06, + "loss": 0.8055, + "step": 3072 + }, + { + "epoch": 0.16913423963894544, + "grad_norm": 0.7421612739562988, + "learning_rate": 9.834843513845958e-06, + "loss": 0.7543, + "step": 3073 + }, + { + "epoch": 0.1691892784413011, + "grad_norm": 0.7855183482170105, + "learning_rate": 9.83473300714722e-06, + "loss": 0.7011, + "step": 3074 + }, + { + "epoch": 0.16924431724365677, + "grad_norm": 0.8061636686325073, + "learning_rate": 9.834622464111924e-06, + "loss": 0.8096, + "step": 3075 + }, + { + "epoch": 0.16929935604601243, + "grad_norm": 0.8048406839370728, + "learning_rate": 9.834511884740898e-06, + "loss": 0.8166, + "step": 3076 + }, + { + "epoch": 0.1693543948483681, + "grad_norm": 0.8776549696922302, + "learning_rate": 9.834401269034977e-06, + "loss": 0.8169, + "step": 3077 + }, + { + "epoch": 0.16940943365072375, + "grad_norm": 1.0208356380462646, + "learning_rate": 9.83429061699499e-06, + "loss": 0.6976, + "step": 3078 + }, + { + "epoch": 0.1694644724530794, + "grad_norm": 0.7641016840934753, + "learning_rate": 9.834179928621767e-06, + "loss": 0.7109, + "step": 3079 + }, + { + "epoch": 0.16951951125543507, + "grad_norm": 0.7648905515670776, + "learning_rate": 9.834069203916143e-06, + "loss": 0.7927, + "step": 3080 + }, + { + "epoch": 0.16957455005779073, + "grad_norm": 0.7898744344711304, + "learning_rate": 9.833958442878948e-06, + "loss": 0.7911, + "step": 3081 + }, + { + "epoch": 0.1696295888601464, + "grad_norm": 0.8812462687492371, + "learning_rate": 9.833847645511016e-06, + "loss": 0.8381, + "step": 3082 + }, + { + "epoch": 0.16968462766250206, + "grad_norm": 0.8141197562217712, + "learning_rate": 9.833736811813179e-06, + "loss": 0.7422, + "step": 3083 + }, + { + "epoch": 0.16973966646485772, + "grad_norm": 0.7860949635505676, + "learning_rate": 9.83362594178627e-06, + "loss": 0.7568, + "step": 3084 + }, + { + "epoch": 0.16979470526721338, + "grad_norm": 0.6688396334648132, + "learning_rate": 9.833515035431123e-06, + "loss": 0.7143, + "step": 3085 + }, + { + "epoch": 0.16984974406956904, + "grad_norm": 0.7525103092193604, + "learning_rate": 9.833404092748569e-06, + "loss": 0.8026, + "step": 3086 + }, + { + "epoch": 0.1699047828719247, + "grad_norm": 0.8505181670188904, + "learning_rate": 9.833293113739444e-06, + "loss": 0.8894, + "step": 3087 + }, + { + "epoch": 0.16995982167428036, + "grad_norm": 0.8432300090789795, + "learning_rate": 9.833182098404583e-06, + "loss": 0.7801, + "step": 3088 + }, + { + "epoch": 0.17001486047663603, + "grad_norm": 0.7655903100967407, + "learning_rate": 9.833071046744819e-06, + "loss": 0.7838, + "step": 3089 + }, + { + "epoch": 0.1700698992789917, + "grad_norm": 0.8436369895935059, + "learning_rate": 9.832959958760986e-06, + "loss": 0.8636, + "step": 3090 + }, + { + "epoch": 0.17012493808134735, + "grad_norm": 0.7880234122276306, + "learning_rate": 9.83284883445392e-06, + "loss": 0.7701, + "step": 3091 + }, + { + "epoch": 0.170179976883703, + "grad_norm": 0.7713757753372192, + "learning_rate": 9.832737673824455e-06, + "loss": 0.8652, + "step": 3092 + }, + { + "epoch": 0.17023501568605867, + "grad_norm": 0.7905295491218567, + "learning_rate": 9.832626476873428e-06, + "loss": 0.8666, + "step": 3093 + }, + { + "epoch": 0.17029005448841433, + "grad_norm": 0.7589883804321289, + "learning_rate": 9.832515243601675e-06, + "loss": 0.8051, + "step": 3094 + }, + { + "epoch": 0.17034509329077, + "grad_norm": 0.9068838953971863, + "learning_rate": 9.83240397401003e-06, + "loss": 0.9037, + "step": 3095 + }, + { + "epoch": 0.17040013209312566, + "grad_norm": 0.7465278506278992, + "learning_rate": 9.83229266809933e-06, + "loss": 0.7425, + "step": 3096 + }, + { + "epoch": 0.17045517089548132, + "grad_norm": 0.8111177086830139, + "learning_rate": 9.83218132587041e-06, + "loss": 0.8034, + "step": 3097 + }, + { + "epoch": 0.17051020969783698, + "grad_norm": 1.1007672548294067, + "learning_rate": 9.832069947324112e-06, + "loss": 0.9139, + "step": 3098 + }, + { + "epoch": 0.17056524850019264, + "grad_norm": 0.881179690361023, + "learning_rate": 9.831958532461269e-06, + "loss": 0.9062, + "step": 3099 + }, + { + "epoch": 0.1706202873025483, + "grad_norm": 0.8012413382530212, + "learning_rate": 9.831847081282718e-06, + "loss": 0.7956, + "step": 3100 + }, + { + "epoch": 0.17067532610490396, + "grad_norm": 0.741731584072113, + "learning_rate": 9.831735593789298e-06, + "loss": 0.8754, + "step": 3101 + }, + { + "epoch": 0.17073036490725962, + "grad_norm": 0.8945604562759399, + "learning_rate": 9.831624069981848e-06, + "loss": 0.8293, + "step": 3102 + }, + { + "epoch": 0.17078540370961529, + "grad_norm": 0.7865545749664307, + "learning_rate": 9.831512509861203e-06, + "loss": 0.7812, + "step": 3103 + }, + { + "epoch": 0.17084044251197095, + "grad_norm": 0.832847535610199, + "learning_rate": 9.831400913428205e-06, + "loss": 0.8925, + "step": 3104 + }, + { + "epoch": 0.1708954813143266, + "grad_norm": 0.7374216914176941, + "learning_rate": 9.83128928068369e-06, + "loss": 0.8275, + "step": 3105 + }, + { + "epoch": 0.17095052011668227, + "grad_norm": 0.748725414276123, + "learning_rate": 9.831177611628497e-06, + "loss": 0.8364, + "step": 3106 + }, + { + "epoch": 0.17100555891903793, + "grad_norm": 0.810276448726654, + "learning_rate": 9.831065906263468e-06, + "loss": 0.861, + "step": 3107 + }, + { + "epoch": 0.1710605977213936, + "grad_norm": 0.7607758641242981, + "learning_rate": 9.83095416458944e-06, + "loss": 0.7989, + "step": 3108 + }, + { + "epoch": 0.17111563652374925, + "grad_norm": 0.7206127047538757, + "learning_rate": 9.830842386607253e-06, + "loss": 0.7187, + "step": 3109 + }, + { + "epoch": 0.17117067532610492, + "grad_norm": 0.7775895595550537, + "learning_rate": 9.83073057231775e-06, + "loss": 0.8008, + "step": 3110 + }, + { + "epoch": 0.17122571412846058, + "grad_norm": 0.8351094722747803, + "learning_rate": 9.830618721721768e-06, + "loss": 0.8025, + "step": 3111 + }, + { + "epoch": 0.17128075293081624, + "grad_norm": 0.8090646266937256, + "learning_rate": 9.830506834820148e-06, + "loss": 0.8012, + "step": 3112 + }, + { + "epoch": 0.1713357917331719, + "grad_norm": 0.7762801051139832, + "learning_rate": 9.830394911613733e-06, + "loss": 0.8428, + "step": 3113 + }, + { + "epoch": 0.17139083053552756, + "grad_norm": 0.8117541074752808, + "learning_rate": 9.83028295210336e-06, + "loss": 0.8566, + "step": 3114 + }, + { + "epoch": 0.1714458693378832, + "grad_norm": 0.8786184787750244, + "learning_rate": 9.830170956289876e-06, + "loss": 0.8386, + "step": 3115 + }, + { + "epoch": 0.17150090814023886, + "grad_norm": 1.0181046724319458, + "learning_rate": 9.83005892417412e-06, + "loss": 0.8555, + "step": 3116 + }, + { + "epoch": 0.17155594694259452, + "grad_norm": 0.8236173391342163, + "learning_rate": 9.829946855756934e-06, + "loss": 0.7933, + "step": 3117 + }, + { + "epoch": 0.17161098574495018, + "grad_norm": 0.8058149814605713, + "learning_rate": 9.829834751039157e-06, + "loss": 0.842, + "step": 3118 + }, + { + "epoch": 0.17166602454730584, + "grad_norm": 0.7419908046722412, + "learning_rate": 9.82972261002164e-06, + "loss": 0.8397, + "step": 3119 + }, + { + "epoch": 0.1717210633496615, + "grad_norm": 0.7528164982795715, + "learning_rate": 9.829610432705216e-06, + "loss": 0.7931, + "step": 3120 + }, + { + "epoch": 0.17177610215201716, + "grad_norm": 0.7357296943664551, + "learning_rate": 9.829498219090736e-06, + "loss": 0.8089, + "step": 3121 + }, + { + "epoch": 0.17183114095437282, + "grad_norm": 0.7635773420333862, + "learning_rate": 9.829385969179039e-06, + "loss": 0.7442, + "step": 3122 + }, + { + "epoch": 0.17188617975672849, + "grad_norm": 0.8200171589851379, + "learning_rate": 9.82927368297097e-06, + "loss": 0.757, + "step": 3123 + }, + { + "epoch": 0.17194121855908415, + "grad_norm": 0.8367171287536621, + "learning_rate": 9.829161360467374e-06, + "loss": 0.915, + "step": 3124 + }, + { + "epoch": 0.1719962573614398, + "grad_norm": 0.8460778594017029, + "learning_rate": 9.829049001669091e-06, + "loss": 0.8568, + "step": 3125 + }, + { + "epoch": 0.17205129616379547, + "grad_norm": 0.7301799058914185, + "learning_rate": 9.82893660657697e-06, + "loss": 0.8041, + "step": 3126 + }, + { + "epoch": 0.17210633496615113, + "grad_norm": 0.7858132123947144, + "learning_rate": 9.828824175191854e-06, + "loss": 0.8367, + "step": 3127 + }, + { + "epoch": 0.1721613737685068, + "grad_norm": 0.8118360042572021, + "learning_rate": 9.82871170751459e-06, + "loss": 0.85, + "step": 3128 + }, + { + "epoch": 0.17221641257086245, + "grad_norm": 0.9020261764526367, + "learning_rate": 9.828599203546019e-06, + "loss": 0.789, + "step": 3129 + }, + { + "epoch": 0.17227145137321812, + "grad_norm": 0.8194546699523926, + "learning_rate": 9.828486663286989e-06, + "loss": 0.8644, + "step": 3130 + }, + { + "epoch": 0.17232649017557378, + "grad_norm": 0.7764905095100403, + "learning_rate": 9.828374086738345e-06, + "loss": 0.7961, + "step": 3131 + }, + { + "epoch": 0.17238152897792944, + "grad_norm": 0.7712632417678833, + "learning_rate": 9.828261473900935e-06, + "loss": 0.8082, + "step": 3132 + }, + { + "epoch": 0.1724365677802851, + "grad_norm": 0.7100280523300171, + "learning_rate": 9.828148824775604e-06, + "loss": 0.7514, + "step": 3133 + }, + { + "epoch": 0.17249160658264076, + "grad_norm": 0.7812890410423279, + "learning_rate": 9.8280361393632e-06, + "loss": 0.7125, + "step": 3134 + }, + { + "epoch": 0.17254664538499642, + "grad_norm": 0.8772642612457275, + "learning_rate": 9.827923417664568e-06, + "loss": 0.8355, + "step": 3135 + }, + { + "epoch": 0.17260168418735208, + "grad_norm": 0.9161205291748047, + "learning_rate": 9.827810659680555e-06, + "loss": 0.7511, + "step": 3136 + }, + { + "epoch": 0.17265672298970774, + "grad_norm": 0.7628560662269592, + "learning_rate": 9.82769786541201e-06, + "loss": 0.882, + "step": 3137 + }, + { + "epoch": 0.1727117617920634, + "grad_norm": 0.8203405737876892, + "learning_rate": 9.827585034859781e-06, + "loss": 0.8172, + "step": 3138 + }, + { + "epoch": 0.17276680059441907, + "grad_norm": 0.8318095207214355, + "learning_rate": 9.827472168024715e-06, + "loss": 0.7784, + "step": 3139 + }, + { + "epoch": 0.17282183939677473, + "grad_norm": 0.9137747287750244, + "learning_rate": 9.827359264907658e-06, + "loss": 0.8643, + "step": 3140 + }, + { + "epoch": 0.1728768781991304, + "grad_norm": 0.9441068768501282, + "learning_rate": 9.827246325509463e-06, + "loss": 0.7936, + "step": 3141 + }, + { + "epoch": 0.17293191700148605, + "grad_norm": 0.7402390837669373, + "learning_rate": 9.827133349830977e-06, + "loss": 0.7813, + "step": 3142 + }, + { + "epoch": 0.1729869558038417, + "grad_norm": 0.8328836560249329, + "learning_rate": 9.827020337873048e-06, + "loss": 0.7676, + "step": 3143 + }, + { + "epoch": 0.17304199460619737, + "grad_norm": 0.8106881380081177, + "learning_rate": 9.826907289636526e-06, + "loss": 0.9037, + "step": 3144 + }, + { + "epoch": 0.17309703340855304, + "grad_norm": 0.8457425236701965, + "learning_rate": 9.826794205122263e-06, + "loss": 0.78, + "step": 3145 + }, + { + "epoch": 0.1731520722109087, + "grad_norm": 0.9335517883300781, + "learning_rate": 9.826681084331105e-06, + "loss": 0.9197, + "step": 3146 + }, + { + "epoch": 0.17320711101326436, + "grad_norm": 0.9098715782165527, + "learning_rate": 9.826567927263904e-06, + "loss": 0.932, + "step": 3147 + }, + { + "epoch": 0.17326214981562002, + "grad_norm": 0.767234206199646, + "learning_rate": 9.826454733921512e-06, + "loss": 0.8717, + "step": 3148 + }, + { + "epoch": 0.17331718861797568, + "grad_norm": 0.8114444017410278, + "learning_rate": 9.826341504304775e-06, + "loss": 0.8744, + "step": 3149 + }, + { + "epoch": 0.17337222742033134, + "grad_norm": 0.7948976755142212, + "learning_rate": 9.82622823841455e-06, + "loss": 0.7947, + "step": 3150 + }, + { + "epoch": 0.173427266222687, + "grad_norm": 0.7808204889297485, + "learning_rate": 9.826114936251684e-06, + "loss": 0.8151, + "step": 3151 + }, + { + "epoch": 0.17348230502504267, + "grad_norm": 0.733860969543457, + "learning_rate": 9.82600159781703e-06, + "loss": 0.8018, + "step": 3152 + }, + { + "epoch": 0.17353734382739833, + "grad_norm": 0.7630699276924133, + "learning_rate": 9.825888223111442e-06, + "loss": 0.7937, + "step": 3153 + }, + { + "epoch": 0.173592382629754, + "grad_norm": 0.7892931699752808, + "learning_rate": 9.825774812135766e-06, + "loss": 0.782, + "step": 3154 + }, + { + "epoch": 0.17364742143210965, + "grad_norm": 0.6642436385154724, + "learning_rate": 9.825661364890862e-06, + "loss": 0.6611, + "step": 3155 + }, + { + "epoch": 0.1737024602344653, + "grad_norm": 0.7755968570709229, + "learning_rate": 9.825547881377577e-06, + "loss": 0.7835, + "step": 3156 + }, + { + "epoch": 0.17375749903682097, + "grad_norm": 0.8406579494476318, + "learning_rate": 9.825434361596766e-06, + "loss": 0.9178, + "step": 3157 + }, + { + "epoch": 0.1738125378391766, + "grad_norm": 0.8887308835983276, + "learning_rate": 9.825320805549284e-06, + "loss": 0.7951, + "step": 3158 + }, + { + "epoch": 0.17386757664153227, + "grad_norm": 0.85418701171875, + "learning_rate": 9.825207213235978e-06, + "loss": 0.8671, + "step": 3159 + }, + { + "epoch": 0.17392261544388793, + "grad_norm": 0.8831202983856201, + "learning_rate": 9.82509358465771e-06, + "loss": 0.8708, + "step": 3160 + }, + { + "epoch": 0.1739776542462436, + "grad_norm": 0.9041616320610046, + "learning_rate": 9.82497991981533e-06, + "loss": 0.8981, + "step": 3161 + }, + { + "epoch": 0.17403269304859925, + "grad_norm": 0.8169258832931519, + "learning_rate": 9.824866218709692e-06, + "loss": 0.8857, + "step": 3162 + }, + { + "epoch": 0.1740877318509549, + "grad_norm": 0.8714475631713867, + "learning_rate": 9.824752481341651e-06, + "loss": 0.8552, + "step": 3163 + }, + { + "epoch": 0.17414277065331057, + "grad_norm": 0.8261111378669739, + "learning_rate": 9.824638707712061e-06, + "loss": 0.808, + "step": 3164 + }, + { + "epoch": 0.17419780945566624, + "grad_norm": 0.7542527914047241, + "learning_rate": 9.82452489782178e-06, + "loss": 0.8078, + "step": 3165 + }, + { + "epoch": 0.1742528482580219, + "grad_norm": 1.309218168258667, + "learning_rate": 9.824411051671658e-06, + "loss": 0.9325, + "step": 3166 + }, + { + "epoch": 0.17430788706037756, + "grad_norm": 0.8528563380241394, + "learning_rate": 9.824297169262555e-06, + "loss": 0.8493, + "step": 3167 + }, + { + "epoch": 0.17436292586273322, + "grad_norm": 0.7777062058448792, + "learning_rate": 9.824183250595328e-06, + "loss": 0.7002, + "step": 3168 + }, + { + "epoch": 0.17441796466508888, + "grad_norm": 0.7385506629943848, + "learning_rate": 9.824069295670828e-06, + "loss": 0.8396, + "step": 3169 + }, + { + "epoch": 0.17447300346744454, + "grad_norm": 0.8316949605941772, + "learning_rate": 9.823955304489918e-06, + "loss": 0.8769, + "step": 3170 + }, + { + "epoch": 0.1745280422698002, + "grad_norm": 0.8149139285087585, + "learning_rate": 9.823841277053448e-06, + "loss": 0.8009, + "step": 3171 + }, + { + "epoch": 0.17458308107215587, + "grad_norm": 0.8761584162712097, + "learning_rate": 9.82372721336228e-06, + "loss": 0.7366, + "step": 3172 + }, + { + "epoch": 0.17463811987451153, + "grad_norm": 0.7104084491729736, + "learning_rate": 9.82361311341727e-06, + "loss": 0.6704, + "step": 3173 + }, + { + "epoch": 0.1746931586768672, + "grad_norm": 0.791806697845459, + "learning_rate": 9.823498977219273e-06, + "loss": 0.9054, + "step": 3174 + }, + { + "epoch": 0.17474819747922285, + "grad_norm": 0.7675086855888367, + "learning_rate": 9.82338480476915e-06, + "loss": 0.751, + "step": 3175 + }, + { + "epoch": 0.1748032362815785, + "grad_norm": 0.7380725145339966, + "learning_rate": 9.823270596067759e-06, + "loss": 0.7618, + "step": 3176 + }, + { + "epoch": 0.17485827508393417, + "grad_norm": 0.7311519384384155, + "learning_rate": 9.823156351115954e-06, + "loss": 0.7424, + "step": 3177 + }, + { + "epoch": 0.17491331388628983, + "grad_norm": 0.7888365387916565, + "learning_rate": 9.8230420699146e-06, + "loss": 0.7717, + "step": 3178 + }, + { + "epoch": 0.1749683526886455, + "grad_norm": 0.9329265356063843, + "learning_rate": 9.822927752464552e-06, + "loss": 0.8256, + "step": 3179 + }, + { + "epoch": 0.17502339149100116, + "grad_norm": 0.711794912815094, + "learning_rate": 9.822813398766671e-06, + "loss": 0.7373, + "step": 3180 + }, + { + "epoch": 0.17507843029335682, + "grad_norm": 0.8713497519493103, + "learning_rate": 9.822699008821813e-06, + "loss": 0.8135, + "step": 3181 + }, + { + "epoch": 0.17513346909571248, + "grad_norm": 0.6923471689224243, + "learning_rate": 9.822584582630841e-06, + "loss": 0.7589, + "step": 3182 + }, + { + "epoch": 0.17518850789806814, + "grad_norm": 0.8648017048835754, + "learning_rate": 9.822470120194616e-06, + "loss": 0.7828, + "step": 3183 + }, + { + "epoch": 0.1752435467004238, + "grad_norm": 0.8407077789306641, + "learning_rate": 9.822355621513994e-06, + "loss": 0.8537, + "step": 3184 + }, + { + "epoch": 0.17529858550277946, + "grad_norm": 0.8076738119125366, + "learning_rate": 9.822241086589841e-06, + "loss": 0.7827, + "step": 3185 + }, + { + "epoch": 0.17535362430513513, + "grad_norm": 0.8402661085128784, + "learning_rate": 9.822126515423011e-06, + "loss": 0.8247, + "step": 3186 + }, + { + "epoch": 0.1754086631074908, + "grad_norm": 0.8911813497543335, + "learning_rate": 9.822011908014373e-06, + "loss": 0.8996, + "step": 3187 + }, + { + "epoch": 0.17546370190984645, + "grad_norm": 0.8060111999511719, + "learning_rate": 9.821897264364782e-06, + "loss": 0.796, + "step": 3188 + }, + { + "epoch": 0.1755187407122021, + "grad_norm": 0.8476423621177673, + "learning_rate": 9.8217825844751e-06, + "loss": 0.8657, + "step": 3189 + }, + { + "epoch": 0.17557377951455777, + "grad_norm": 0.7614054083824158, + "learning_rate": 9.821667868346194e-06, + "loss": 0.8583, + "step": 3190 + }, + { + "epoch": 0.17562881831691343, + "grad_norm": 0.8312287330627441, + "learning_rate": 9.821553115978923e-06, + "loss": 0.7718, + "step": 3191 + }, + { + "epoch": 0.1756838571192691, + "grad_norm": 0.8199487328529358, + "learning_rate": 9.82143832737415e-06, + "loss": 0.7617, + "step": 3192 + }, + { + "epoch": 0.17573889592162475, + "grad_norm": 0.7529115080833435, + "learning_rate": 9.821323502532733e-06, + "loss": 0.7587, + "step": 3193 + }, + { + "epoch": 0.17579393472398042, + "grad_norm": 0.9205463528633118, + "learning_rate": 9.821208641455542e-06, + "loss": 0.7871, + "step": 3194 + }, + { + "epoch": 0.17584897352633608, + "grad_norm": 0.8055161833763123, + "learning_rate": 9.821093744143437e-06, + "loss": 0.8133, + "step": 3195 + }, + { + "epoch": 0.17590401232869174, + "grad_norm": 0.7322981953620911, + "learning_rate": 9.82097881059728e-06, + "loss": 0.7442, + "step": 3196 + }, + { + "epoch": 0.1759590511310474, + "grad_norm": 1.0465941429138184, + "learning_rate": 9.82086384081794e-06, + "loss": 1.0073, + "step": 3197 + }, + { + "epoch": 0.17601408993340306, + "grad_norm": 0.7607331275939941, + "learning_rate": 9.820748834806278e-06, + "loss": 0.8128, + "step": 3198 + }, + { + "epoch": 0.17606912873575872, + "grad_norm": 0.7901879549026489, + "learning_rate": 9.820633792563156e-06, + "loss": 0.7928, + "step": 3199 + }, + { + "epoch": 0.17612416753811436, + "grad_norm": 0.8010839223861694, + "learning_rate": 9.820518714089442e-06, + "loss": 0.7025, + "step": 3200 + }, + { + "epoch": 0.17617920634047002, + "grad_norm": 0.8511317372322083, + "learning_rate": 9.820403599385999e-06, + "loss": 0.7947, + "step": 3201 + }, + { + "epoch": 0.17623424514282568, + "grad_norm": 0.7978847026824951, + "learning_rate": 9.820288448453693e-06, + "loss": 0.7395, + "step": 3202 + }, + { + "epoch": 0.17628928394518134, + "grad_norm": 0.6991232633590698, + "learning_rate": 9.820173261293388e-06, + "loss": 0.7113, + "step": 3203 + }, + { + "epoch": 0.176344322747537, + "grad_norm": 0.8966444730758667, + "learning_rate": 9.820058037905954e-06, + "loss": 0.7399, + "step": 3204 + }, + { + "epoch": 0.17639936154989266, + "grad_norm": 0.8042632341384888, + "learning_rate": 9.819942778292253e-06, + "loss": 0.8183, + "step": 3205 + }, + { + "epoch": 0.17645440035224833, + "grad_norm": 0.8047537803649902, + "learning_rate": 9.81982748245315e-06, + "loss": 0.852, + "step": 3206 + }, + { + "epoch": 0.176509439154604, + "grad_norm": 0.8277122378349304, + "learning_rate": 9.819712150389517e-06, + "loss": 0.8828, + "step": 3207 + }, + { + "epoch": 0.17656447795695965, + "grad_norm": 0.8677185773849487, + "learning_rate": 9.819596782102216e-06, + "loss": 0.8416, + "step": 3208 + }, + { + "epoch": 0.1766195167593153, + "grad_norm": 0.8750975728034973, + "learning_rate": 9.819481377592115e-06, + "loss": 0.9289, + "step": 3209 + }, + { + "epoch": 0.17667455556167097, + "grad_norm": 0.7665122151374817, + "learning_rate": 9.819365936860084e-06, + "loss": 0.8653, + "step": 3210 + }, + { + "epoch": 0.17672959436402663, + "grad_norm": 0.9341353178024292, + "learning_rate": 9.819250459906989e-06, + "loss": 0.7225, + "step": 3211 + }, + { + "epoch": 0.1767846331663823, + "grad_norm": 0.7007241249084473, + "learning_rate": 9.819134946733696e-06, + "loss": 0.7429, + "step": 3212 + }, + { + "epoch": 0.17683967196873795, + "grad_norm": 0.8001461029052734, + "learning_rate": 9.819019397341074e-06, + "loss": 0.759, + "step": 3213 + }, + { + "epoch": 0.17689471077109362, + "grad_norm": 0.8936446905136108, + "learning_rate": 9.818903811729993e-06, + "loss": 0.8248, + "step": 3214 + }, + { + "epoch": 0.17694974957344928, + "grad_norm": 0.805570125579834, + "learning_rate": 9.818788189901321e-06, + "loss": 0.9214, + "step": 3215 + }, + { + "epoch": 0.17700478837580494, + "grad_norm": 0.7762455940246582, + "learning_rate": 9.818672531855926e-06, + "loss": 0.7848, + "step": 3216 + }, + { + "epoch": 0.1770598271781606, + "grad_norm": 0.8391497731208801, + "learning_rate": 9.81855683759468e-06, + "loss": 0.7543, + "step": 3217 + }, + { + "epoch": 0.17711486598051626, + "grad_norm": 0.8489046692848206, + "learning_rate": 9.818441107118449e-06, + "loss": 0.7908, + "step": 3218 + }, + { + "epoch": 0.17716990478287192, + "grad_norm": 1.0949461460113525, + "learning_rate": 9.818325340428105e-06, + "loss": 0.8255, + "step": 3219 + }, + { + "epoch": 0.17722494358522758, + "grad_norm": 0.8710842132568359, + "learning_rate": 9.81820953752452e-06, + "loss": 0.859, + "step": 3220 + }, + { + "epoch": 0.17727998238758325, + "grad_norm": 0.7936064600944519, + "learning_rate": 9.818093698408558e-06, + "loss": 0.8475, + "step": 3221 + }, + { + "epoch": 0.1773350211899389, + "grad_norm": 0.790341854095459, + "learning_rate": 9.817977823081095e-06, + "loss": 0.8137, + "step": 3222 + }, + { + "epoch": 0.17739005999229457, + "grad_norm": 0.8154531717300415, + "learning_rate": 9.817861911543002e-06, + "loss": 0.8687, + "step": 3223 + }, + { + "epoch": 0.17744509879465023, + "grad_norm": 0.8346067070960999, + "learning_rate": 9.817745963795144e-06, + "loss": 0.8905, + "step": 3224 + }, + { + "epoch": 0.1775001375970059, + "grad_norm": 0.7137764096260071, + "learning_rate": 9.817629979838401e-06, + "loss": 0.7715, + "step": 3225 + }, + { + "epoch": 0.17755517639936155, + "grad_norm": 0.7237628102302551, + "learning_rate": 9.81751395967364e-06, + "loss": 0.7824, + "step": 3226 + }, + { + "epoch": 0.17761021520171721, + "grad_norm": 0.9481163024902344, + "learning_rate": 9.817397903301733e-06, + "loss": 0.7451, + "step": 3227 + }, + { + "epoch": 0.17766525400407288, + "grad_norm": 0.9472424387931824, + "learning_rate": 9.817281810723552e-06, + "loss": 0.8774, + "step": 3228 + }, + { + "epoch": 0.17772029280642854, + "grad_norm": 0.9295538663864136, + "learning_rate": 9.81716568193997e-06, + "loss": 0.8507, + "step": 3229 + }, + { + "epoch": 0.1777753316087842, + "grad_norm": 0.7668172717094421, + "learning_rate": 9.817049516951863e-06, + "loss": 0.8547, + "step": 3230 + }, + { + "epoch": 0.17783037041113986, + "grad_norm": 0.8640413880348206, + "learning_rate": 9.8169333157601e-06, + "loss": 0.8485, + "step": 3231 + }, + { + "epoch": 0.17788540921349552, + "grad_norm": 0.9901431798934937, + "learning_rate": 9.816817078365554e-06, + "loss": 0.9236, + "step": 3232 + }, + { + "epoch": 0.17794044801585118, + "grad_norm": 1.0242371559143066, + "learning_rate": 9.816700804769104e-06, + "loss": 0.8096, + "step": 3233 + }, + { + "epoch": 0.17799548681820684, + "grad_norm": 0.910498857498169, + "learning_rate": 9.816584494971617e-06, + "loss": 0.829, + "step": 3234 + }, + { + "epoch": 0.1780505256205625, + "grad_norm": 0.8254473805427551, + "learning_rate": 9.816468148973972e-06, + "loss": 0.7828, + "step": 3235 + }, + { + "epoch": 0.17810556442291817, + "grad_norm": 0.7971221804618835, + "learning_rate": 9.816351766777039e-06, + "loss": 0.8057, + "step": 3236 + }, + { + "epoch": 0.17816060322527383, + "grad_norm": 0.8151674270629883, + "learning_rate": 9.816235348381697e-06, + "loss": 0.7801, + "step": 3237 + }, + { + "epoch": 0.1782156420276295, + "grad_norm": 0.7587556838989258, + "learning_rate": 9.81611889378882e-06, + "loss": 0.7814, + "step": 3238 + }, + { + "epoch": 0.17827068082998515, + "grad_norm": 0.8843516111373901, + "learning_rate": 9.816002402999283e-06, + "loss": 0.8873, + "step": 3239 + }, + { + "epoch": 0.1783257196323408, + "grad_norm": 0.917859673500061, + "learning_rate": 9.81588587601396e-06, + "loss": 0.8963, + "step": 3240 + }, + { + "epoch": 0.17838075843469647, + "grad_norm": 0.8256439566612244, + "learning_rate": 9.815769312833727e-06, + "loss": 0.9157, + "step": 3241 + }, + { + "epoch": 0.17843579723705214, + "grad_norm": 0.8364603519439697, + "learning_rate": 9.815652713459462e-06, + "loss": 0.8253, + "step": 3242 + }, + { + "epoch": 0.17849083603940777, + "grad_norm": 0.7717131972312927, + "learning_rate": 9.81553607789204e-06, + "loss": 0.7211, + "step": 3243 + }, + { + "epoch": 0.17854587484176343, + "grad_norm": 0.8069111704826355, + "learning_rate": 9.815419406132338e-06, + "loss": 0.8986, + "step": 3244 + }, + { + "epoch": 0.1786009136441191, + "grad_norm": 0.9176943302154541, + "learning_rate": 9.815302698181233e-06, + "loss": 0.8084, + "step": 3245 + }, + { + "epoch": 0.17865595244647475, + "grad_norm": 0.769183874130249, + "learning_rate": 9.815185954039601e-06, + "loss": 0.8084, + "step": 3246 + }, + { + "epoch": 0.17871099124883041, + "grad_norm": 0.8070697784423828, + "learning_rate": 9.815069173708321e-06, + "loss": 0.8371, + "step": 3247 + }, + { + "epoch": 0.17876603005118608, + "grad_norm": 0.7837347388267517, + "learning_rate": 9.81495235718827e-06, + "loss": 0.8015, + "step": 3248 + }, + { + "epoch": 0.17882106885354174, + "grad_norm": 0.9248430728912354, + "learning_rate": 9.814835504480327e-06, + "loss": 0.8396, + "step": 3249 + }, + { + "epoch": 0.1788761076558974, + "grad_norm": 0.7914367914199829, + "learning_rate": 9.814718615585367e-06, + "loss": 0.8068, + "step": 3250 + }, + { + "epoch": 0.17893114645825306, + "grad_norm": 0.8612570762634277, + "learning_rate": 9.814601690504273e-06, + "loss": 0.8227, + "step": 3251 + }, + { + "epoch": 0.17898618526060872, + "grad_norm": 0.7476248741149902, + "learning_rate": 9.81448472923792e-06, + "loss": 0.8609, + "step": 3252 + }, + { + "epoch": 0.17904122406296438, + "grad_norm": 0.7455218434333801, + "learning_rate": 9.81436773178719e-06, + "loss": 0.7992, + "step": 3253 + }, + { + "epoch": 0.17909626286532004, + "grad_norm": 0.7917896509170532, + "learning_rate": 9.814250698152958e-06, + "loss": 0.8383, + "step": 3254 + }, + { + "epoch": 0.1791513016676757, + "grad_norm": 0.6926130652427673, + "learning_rate": 9.81413362833611e-06, + "loss": 0.709, + "step": 3255 + }, + { + "epoch": 0.17920634047003137, + "grad_norm": 0.8219630718231201, + "learning_rate": 9.814016522337519e-06, + "loss": 0.9387, + "step": 3256 + }, + { + "epoch": 0.17926137927238703, + "grad_norm": 0.8588619828224182, + "learning_rate": 9.81389938015807e-06, + "loss": 0.8354, + "step": 3257 + }, + { + "epoch": 0.1793164180747427, + "grad_norm": 0.7868718504905701, + "learning_rate": 9.81378220179864e-06, + "loss": 0.8464, + "step": 3258 + }, + { + "epoch": 0.17937145687709835, + "grad_norm": 0.789479672908783, + "learning_rate": 9.813664987260114e-06, + "loss": 0.8577, + "step": 3259 + }, + { + "epoch": 0.179426495679454, + "grad_norm": 0.8280717730522156, + "learning_rate": 9.81354773654337e-06, + "loss": 0.765, + "step": 3260 + }, + { + "epoch": 0.17948153448180967, + "grad_norm": 0.7660181522369385, + "learning_rate": 9.813430449649289e-06, + "loss": 0.7116, + "step": 3261 + }, + { + "epoch": 0.17953657328416534, + "grad_norm": 0.8043892979621887, + "learning_rate": 9.813313126578754e-06, + "loss": 0.8398, + "step": 3262 + }, + { + "epoch": 0.179591612086521, + "grad_norm": 0.8708420991897583, + "learning_rate": 9.813195767332647e-06, + "loss": 0.8246, + "step": 3263 + }, + { + "epoch": 0.17964665088887666, + "grad_norm": 1.1456964015960693, + "learning_rate": 9.813078371911846e-06, + "loss": 0.8798, + "step": 3264 + }, + { + "epoch": 0.17970168969123232, + "grad_norm": 0.9668154716491699, + "learning_rate": 9.812960940317238e-06, + "loss": 0.9645, + "step": 3265 + }, + { + "epoch": 0.17975672849358798, + "grad_norm": 0.862050473690033, + "learning_rate": 9.812843472549705e-06, + "loss": 0.8675, + "step": 3266 + }, + { + "epoch": 0.17981176729594364, + "grad_norm": 0.7776491641998291, + "learning_rate": 9.812725968610126e-06, + "loss": 0.7727, + "step": 3267 + }, + { + "epoch": 0.1798668060982993, + "grad_norm": 0.7197048664093018, + "learning_rate": 9.812608428499389e-06, + "loss": 0.6877, + "step": 3268 + }, + { + "epoch": 0.17992184490065496, + "grad_norm": 0.7995713353157043, + "learning_rate": 9.812490852218375e-06, + "loss": 0.8576, + "step": 3269 + }, + { + "epoch": 0.17997688370301063, + "grad_norm": 0.8300820589065552, + "learning_rate": 9.812373239767967e-06, + "loss": 0.8119, + "step": 3270 + }, + { + "epoch": 0.1800319225053663, + "grad_norm": 0.8625856041908264, + "learning_rate": 9.812255591149052e-06, + "loss": 0.7547, + "step": 3271 + }, + { + "epoch": 0.18008696130772195, + "grad_norm": 1.016419768333435, + "learning_rate": 9.812137906362511e-06, + "loss": 0.8457, + "step": 3272 + }, + { + "epoch": 0.1801420001100776, + "grad_norm": 0.7303110361099243, + "learning_rate": 9.812020185409229e-06, + "loss": 0.7954, + "step": 3273 + }, + { + "epoch": 0.18019703891243327, + "grad_norm": 0.8632498383522034, + "learning_rate": 9.811902428290093e-06, + "loss": 0.8952, + "step": 3274 + }, + { + "epoch": 0.18025207771478893, + "grad_norm": 0.7666932940483093, + "learning_rate": 9.811784635005984e-06, + "loss": 0.746, + "step": 3275 + }, + { + "epoch": 0.1803071165171446, + "grad_norm": 0.8962032198905945, + "learning_rate": 9.811666805557791e-06, + "loss": 0.8654, + "step": 3276 + }, + { + "epoch": 0.18036215531950026, + "grad_norm": 0.9399656057357788, + "learning_rate": 9.811548939946397e-06, + "loss": 0.8062, + "step": 3277 + }, + { + "epoch": 0.18041719412185592, + "grad_norm": 0.7469807863235474, + "learning_rate": 9.811431038172692e-06, + "loss": 0.79, + "step": 3278 + }, + { + "epoch": 0.18047223292421158, + "grad_norm": 0.7661105394363403, + "learning_rate": 9.811313100237556e-06, + "loss": 0.7768, + "step": 3279 + }, + { + "epoch": 0.18052727172656724, + "grad_norm": 0.7567458748817444, + "learning_rate": 9.811195126141881e-06, + "loss": 0.7329, + "step": 3280 + }, + { + "epoch": 0.1805823105289229, + "grad_norm": 0.7187278866767883, + "learning_rate": 9.811077115886552e-06, + "loss": 0.6511, + "step": 3281 + }, + { + "epoch": 0.18063734933127856, + "grad_norm": 0.7641230821609497, + "learning_rate": 9.810959069472452e-06, + "loss": 0.7704, + "step": 3282 + }, + { + "epoch": 0.18069238813363422, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.810840986900474e-06, + "loss": 0.8142, + "step": 3283 + }, + { + "epoch": 0.18074742693598989, + "grad_norm": 0.8102816343307495, + "learning_rate": 9.810722868171502e-06, + "loss": 0.765, + "step": 3284 + }, + { + "epoch": 0.18080246573834555, + "grad_norm": 0.7251957058906555, + "learning_rate": 9.810604713286424e-06, + "loss": 0.7836, + "step": 3285 + }, + { + "epoch": 0.18085750454070118, + "grad_norm": 0.845348060131073, + "learning_rate": 9.81048652224613e-06, + "loss": 0.8386, + "step": 3286 + }, + { + "epoch": 0.18091254334305684, + "grad_norm": 0.8397864103317261, + "learning_rate": 9.810368295051507e-06, + "loss": 0.805, + "step": 3287 + }, + { + "epoch": 0.1809675821454125, + "grad_norm": 1.0739909410476685, + "learning_rate": 9.810250031703444e-06, + "loss": 0.8735, + "step": 3288 + }, + { + "epoch": 0.18102262094776816, + "grad_norm": 0.752091646194458, + "learning_rate": 9.810131732202826e-06, + "loss": 0.7814, + "step": 3289 + }, + { + "epoch": 0.18107765975012383, + "grad_norm": 0.7826841473579407, + "learning_rate": 9.810013396550548e-06, + "loss": 0.7761, + "step": 3290 + }, + { + "epoch": 0.1811326985524795, + "grad_norm": 0.6979131102561951, + "learning_rate": 9.809895024747498e-06, + "loss": 0.672, + "step": 3291 + }, + { + "epoch": 0.18118773735483515, + "grad_norm": 0.8571394085884094, + "learning_rate": 9.809776616794562e-06, + "loss": 0.8795, + "step": 3292 + }, + { + "epoch": 0.1812427761571908, + "grad_norm": 0.8287902474403381, + "learning_rate": 9.809658172692634e-06, + "loss": 0.9032, + "step": 3293 + }, + { + "epoch": 0.18129781495954647, + "grad_norm": 0.7884420156478882, + "learning_rate": 9.809539692442602e-06, + "loss": 0.87, + "step": 3294 + }, + { + "epoch": 0.18135285376190213, + "grad_norm": 0.8955305218696594, + "learning_rate": 9.809421176045358e-06, + "loss": 0.7982, + "step": 3295 + }, + { + "epoch": 0.1814078925642578, + "grad_norm": 0.7893335819244385, + "learning_rate": 9.809302623501791e-06, + "loss": 0.7792, + "step": 3296 + }, + { + "epoch": 0.18146293136661346, + "grad_norm": 0.8077870011329651, + "learning_rate": 9.809184034812794e-06, + "loss": 0.829, + "step": 3297 + }, + { + "epoch": 0.18151797016896912, + "grad_norm": 0.8282631635665894, + "learning_rate": 9.809065409979256e-06, + "loss": 0.8502, + "step": 3298 + }, + { + "epoch": 0.18157300897132478, + "grad_norm": 0.7988418936729431, + "learning_rate": 9.808946749002068e-06, + "loss": 0.7853, + "step": 3299 + }, + { + "epoch": 0.18162804777368044, + "grad_norm": 0.7776056528091431, + "learning_rate": 9.808828051882127e-06, + "loss": 0.7843, + "step": 3300 + }, + { + "epoch": 0.1816830865760361, + "grad_norm": 0.8772258758544922, + "learning_rate": 9.80870931862032e-06, + "loss": 0.896, + "step": 3301 + }, + { + "epoch": 0.18173812537839176, + "grad_norm": 0.8080328702926636, + "learning_rate": 9.80859054921754e-06, + "loss": 0.8464, + "step": 3302 + }, + { + "epoch": 0.18179316418074742, + "grad_norm": 0.862707257270813, + "learning_rate": 9.808471743674682e-06, + "loss": 0.8732, + "step": 3303 + }, + { + "epoch": 0.18184820298310309, + "grad_norm": 1.1964820623397827, + "learning_rate": 9.808352901992637e-06, + "loss": 0.9911, + "step": 3304 + }, + { + "epoch": 0.18190324178545875, + "grad_norm": 0.8597685694694519, + "learning_rate": 9.808234024172298e-06, + "loss": 0.8724, + "step": 3305 + }, + { + "epoch": 0.1819582805878144, + "grad_norm": 0.8068556189537048, + "learning_rate": 9.80811511021456e-06, + "loss": 0.8116, + "step": 3306 + }, + { + "epoch": 0.18201331939017007, + "grad_norm": 1.0014268159866333, + "learning_rate": 9.807996160120317e-06, + "loss": 0.8585, + "step": 3307 + }, + { + "epoch": 0.18206835819252573, + "grad_norm": 0.8541132807731628, + "learning_rate": 9.80787717389046e-06, + "loss": 0.8505, + "step": 3308 + }, + { + "epoch": 0.1821233969948814, + "grad_norm": 0.7973629832267761, + "learning_rate": 9.807758151525886e-06, + "loss": 0.8312, + "step": 3309 + }, + { + "epoch": 0.18217843579723705, + "grad_norm": 0.82973712682724, + "learning_rate": 9.807639093027488e-06, + "loss": 0.894, + "step": 3310 + }, + { + "epoch": 0.18223347459959272, + "grad_norm": 0.7729674577713013, + "learning_rate": 9.807519998396162e-06, + "loss": 0.7459, + "step": 3311 + }, + { + "epoch": 0.18228851340194838, + "grad_norm": 0.8106189370155334, + "learning_rate": 9.807400867632804e-06, + "loss": 0.914, + "step": 3312 + }, + { + "epoch": 0.18234355220430404, + "grad_norm": 0.7672377228736877, + "learning_rate": 9.807281700738305e-06, + "loss": 0.8475, + "step": 3313 + }, + { + "epoch": 0.1823985910066597, + "grad_norm": 0.8776688575744629, + "learning_rate": 9.807162497713566e-06, + "loss": 0.7641, + "step": 3314 + }, + { + "epoch": 0.18245362980901536, + "grad_norm": 0.8781917691230774, + "learning_rate": 9.807043258559479e-06, + "loss": 0.86, + "step": 3315 + }, + { + "epoch": 0.18250866861137102, + "grad_norm": 0.819362998008728, + "learning_rate": 9.806923983276942e-06, + "loss": 0.8829, + "step": 3316 + }, + { + "epoch": 0.18256370741372668, + "grad_norm": 0.8065270185470581, + "learning_rate": 9.80680467186685e-06, + "loss": 0.7335, + "step": 3317 + }, + { + "epoch": 0.18261874621608234, + "grad_norm": 0.8692485690116882, + "learning_rate": 9.806685324330102e-06, + "loss": 0.8582, + "step": 3318 + }, + { + "epoch": 0.182673785018438, + "grad_norm": 0.7910160422325134, + "learning_rate": 9.806565940667594e-06, + "loss": 0.8569, + "step": 3319 + }, + { + "epoch": 0.18272882382079367, + "grad_norm": 0.8282253742218018, + "learning_rate": 9.806446520880225e-06, + "loss": 0.7791, + "step": 3320 + }, + { + "epoch": 0.18278386262314933, + "grad_norm": 0.7513861060142517, + "learning_rate": 9.806327064968887e-06, + "loss": 0.7287, + "step": 3321 + }, + { + "epoch": 0.182838901425505, + "grad_norm": 0.8141188621520996, + "learning_rate": 9.806207572934483e-06, + "loss": 0.7772, + "step": 3322 + }, + { + "epoch": 0.18289394022786065, + "grad_norm": 0.7963125705718994, + "learning_rate": 9.806088044777909e-06, + "loss": 0.7993, + "step": 3323 + }, + { + "epoch": 0.1829489790302163, + "grad_norm": 0.8527218103408813, + "learning_rate": 9.805968480500063e-06, + "loss": 0.822, + "step": 3324 + }, + { + "epoch": 0.18300401783257197, + "grad_norm": 0.822467565536499, + "learning_rate": 9.805848880101845e-06, + "loss": 0.8606, + "step": 3325 + }, + { + "epoch": 0.18305905663492764, + "grad_norm": 0.8197154402732849, + "learning_rate": 9.805729243584154e-06, + "loss": 0.9004, + "step": 3326 + }, + { + "epoch": 0.1831140954372833, + "grad_norm": 0.8379594683647156, + "learning_rate": 9.805609570947887e-06, + "loss": 0.8467, + "step": 3327 + }, + { + "epoch": 0.18316913423963896, + "grad_norm": 0.7787355184555054, + "learning_rate": 9.805489862193947e-06, + "loss": 0.8221, + "step": 3328 + }, + { + "epoch": 0.1832241730419946, + "grad_norm": 0.8464100956916809, + "learning_rate": 9.80537011732323e-06, + "loss": 0.7722, + "step": 3329 + }, + { + "epoch": 0.18327921184435025, + "grad_norm": 0.8351306319236755, + "learning_rate": 9.805250336336637e-06, + "loss": 0.7638, + "step": 3330 + }, + { + "epoch": 0.18333425064670592, + "grad_norm": 0.8098864555358887, + "learning_rate": 9.805130519235068e-06, + "loss": 0.8448, + "step": 3331 + }, + { + "epoch": 0.18338928944906158, + "grad_norm": 0.8290563821792603, + "learning_rate": 9.805010666019427e-06, + "loss": 0.6574, + "step": 3332 + }, + { + "epoch": 0.18344432825141724, + "grad_norm": 0.7748262882232666, + "learning_rate": 9.804890776690611e-06, + "loss": 0.8002, + "step": 3333 + }, + { + "epoch": 0.1834993670537729, + "grad_norm": 0.8422787189483643, + "learning_rate": 9.80477085124952e-06, + "loss": 0.8452, + "step": 3334 + }, + { + "epoch": 0.18355440585612856, + "grad_norm": 0.7776510119438171, + "learning_rate": 9.804650889697061e-06, + "loss": 0.8774, + "step": 3335 + }, + { + "epoch": 0.18360944465848422, + "grad_norm": 0.8449370861053467, + "learning_rate": 9.80453089203413e-06, + "loss": 0.8233, + "step": 3336 + }, + { + "epoch": 0.18366448346083988, + "grad_norm": 0.8254217505455017, + "learning_rate": 9.804410858261632e-06, + "loss": 0.8778, + "step": 3337 + }, + { + "epoch": 0.18371952226319554, + "grad_norm": 0.8673515915870667, + "learning_rate": 9.804290788380466e-06, + "loss": 0.8005, + "step": 3338 + }, + { + "epoch": 0.1837745610655512, + "grad_norm": 0.8106067776679993, + "learning_rate": 9.804170682391538e-06, + "loss": 0.86, + "step": 3339 + }, + { + "epoch": 0.18382959986790687, + "grad_norm": 0.8211669325828552, + "learning_rate": 9.804050540295749e-06, + "loss": 0.8013, + "step": 3340 + }, + { + "epoch": 0.18388463867026253, + "grad_norm": 0.7866180539131165, + "learning_rate": 9.803930362094003e-06, + "loss": 0.8108, + "step": 3341 + }, + { + "epoch": 0.1839396774726182, + "grad_norm": 0.8192055225372314, + "learning_rate": 9.8038101477872e-06, + "loss": 0.7586, + "step": 3342 + }, + { + "epoch": 0.18399471627497385, + "grad_norm": 0.940910279750824, + "learning_rate": 9.803689897376248e-06, + "loss": 0.8174, + "step": 3343 + }, + { + "epoch": 0.1840497550773295, + "grad_norm": 0.7979292869567871, + "learning_rate": 9.803569610862048e-06, + "loss": 0.8341, + "step": 3344 + }, + { + "epoch": 0.18410479387968517, + "grad_norm": 0.7577546238899231, + "learning_rate": 9.803449288245504e-06, + "loss": 0.7775, + "step": 3345 + }, + { + "epoch": 0.18415983268204084, + "grad_norm": 0.7255160212516785, + "learning_rate": 9.80332892952752e-06, + "loss": 0.7648, + "step": 3346 + }, + { + "epoch": 0.1842148714843965, + "grad_norm": 0.8269388675689697, + "learning_rate": 9.803208534709004e-06, + "loss": 0.8902, + "step": 3347 + }, + { + "epoch": 0.18426991028675216, + "grad_norm": 0.783867359161377, + "learning_rate": 9.803088103790857e-06, + "loss": 0.8191, + "step": 3348 + }, + { + "epoch": 0.18432494908910782, + "grad_norm": 0.7658863663673401, + "learning_rate": 9.802967636773986e-06, + "loss": 0.7505, + "step": 3349 + }, + { + "epoch": 0.18437998789146348, + "grad_norm": 0.701225757598877, + "learning_rate": 9.802847133659294e-06, + "loss": 0.7159, + "step": 3350 + }, + { + "epoch": 0.18443502669381914, + "grad_norm": 0.9224311709403992, + "learning_rate": 9.802726594447692e-06, + "loss": 0.7766, + "step": 3351 + }, + { + "epoch": 0.1844900654961748, + "grad_norm": 0.8835979700088501, + "learning_rate": 9.80260601914008e-06, + "loss": 0.9304, + "step": 3352 + }, + { + "epoch": 0.18454510429853047, + "grad_norm": 0.7918481826782227, + "learning_rate": 9.802485407737368e-06, + "loss": 0.7691, + "step": 3353 + }, + { + "epoch": 0.18460014310088613, + "grad_norm": 0.8855286240577698, + "learning_rate": 9.80236476024046e-06, + "loss": 0.9213, + "step": 3354 + }, + { + "epoch": 0.1846551819032418, + "grad_norm": 0.7863314747810364, + "learning_rate": 9.802244076650264e-06, + "loss": 0.7675, + "step": 3355 + }, + { + "epoch": 0.18471022070559745, + "grad_norm": 0.8230198621749878, + "learning_rate": 9.802123356967687e-06, + "loss": 0.7243, + "step": 3356 + }, + { + "epoch": 0.1847652595079531, + "grad_norm": 0.8038737773895264, + "learning_rate": 9.80200260119364e-06, + "loss": 0.8094, + "step": 3357 + }, + { + "epoch": 0.18482029831030877, + "grad_norm": 0.7656993269920349, + "learning_rate": 9.801881809329022e-06, + "loss": 0.7736, + "step": 3358 + }, + { + "epoch": 0.18487533711266443, + "grad_norm": 0.8222082853317261, + "learning_rate": 9.801760981374747e-06, + "loss": 0.844, + "step": 3359 + }, + { + "epoch": 0.1849303759150201, + "grad_norm": 0.7632889747619629, + "learning_rate": 9.801640117331723e-06, + "loss": 0.8354, + "step": 3360 + }, + { + "epoch": 0.18498541471737576, + "grad_norm": 0.8308513760566711, + "learning_rate": 9.801519217200857e-06, + "loss": 0.8277, + "step": 3361 + }, + { + "epoch": 0.18504045351973142, + "grad_norm": 0.7865434885025024, + "learning_rate": 9.801398280983057e-06, + "loss": 0.8614, + "step": 3362 + }, + { + "epoch": 0.18509549232208708, + "grad_norm": 0.7249410152435303, + "learning_rate": 9.801277308679232e-06, + "loss": 0.7259, + "step": 3363 + }, + { + "epoch": 0.18515053112444274, + "grad_norm": 0.7604461908340454, + "learning_rate": 9.801156300290293e-06, + "loss": 0.8507, + "step": 3364 + }, + { + "epoch": 0.1852055699267984, + "grad_norm": 0.8725959062576294, + "learning_rate": 9.801035255817149e-06, + "loss": 0.7688, + "step": 3365 + }, + { + "epoch": 0.18526060872915406, + "grad_norm": 0.7798827290534973, + "learning_rate": 9.800914175260708e-06, + "loss": 0.8788, + "step": 3366 + }, + { + "epoch": 0.18531564753150973, + "grad_norm": 0.7060996890068054, + "learning_rate": 9.800793058621882e-06, + "loss": 0.8183, + "step": 3367 + }, + { + "epoch": 0.1853706863338654, + "grad_norm": 0.7558063268661499, + "learning_rate": 9.80067190590158e-06, + "loss": 0.7834, + "step": 3368 + }, + { + "epoch": 0.18542572513622105, + "grad_norm": 0.7411057353019714, + "learning_rate": 9.800550717100714e-06, + "loss": 0.8298, + "step": 3369 + }, + { + "epoch": 0.1854807639385767, + "grad_norm": 0.8466144800186157, + "learning_rate": 9.800429492220193e-06, + "loss": 0.8297, + "step": 3370 + }, + { + "epoch": 0.18553580274093237, + "grad_norm": 0.7302330136299133, + "learning_rate": 9.800308231260928e-06, + "loss": 0.72, + "step": 3371 + }, + { + "epoch": 0.185590841543288, + "grad_norm": 0.8140530586242676, + "learning_rate": 9.800186934223832e-06, + "loss": 0.9287, + "step": 3372 + }, + { + "epoch": 0.18564588034564367, + "grad_norm": 0.8246129751205444, + "learning_rate": 9.800065601109817e-06, + "loss": 0.7891, + "step": 3373 + }, + { + "epoch": 0.18570091914799933, + "grad_norm": 0.8746623396873474, + "learning_rate": 9.799944231919794e-06, + "loss": 0.8549, + "step": 3374 + }, + { + "epoch": 0.185755957950355, + "grad_norm": 0.9977195858955383, + "learning_rate": 9.799822826654672e-06, + "loss": 0.821, + "step": 3375 + }, + { + "epoch": 0.18581099675271065, + "grad_norm": 0.8937395811080933, + "learning_rate": 9.79970138531537e-06, + "loss": 0.8639, + "step": 3376 + }, + { + "epoch": 0.1858660355550663, + "grad_norm": 1.039695143699646, + "learning_rate": 9.799579907902794e-06, + "loss": 1.0425, + "step": 3377 + }, + { + "epoch": 0.18592107435742197, + "grad_norm": 0.7847749590873718, + "learning_rate": 9.799458394417863e-06, + "loss": 0.8505, + "step": 3378 + }, + { + "epoch": 0.18597611315977763, + "grad_norm": 0.760334312915802, + "learning_rate": 9.799336844861486e-06, + "loss": 0.7418, + "step": 3379 + }, + { + "epoch": 0.1860311519621333, + "grad_norm": 0.7599604725837708, + "learning_rate": 9.799215259234578e-06, + "loss": 0.8305, + "step": 3380 + }, + { + "epoch": 0.18608619076448896, + "grad_norm": 0.846767246723175, + "learning_rate": 9.799093637538054e-06, + "loss": 0.7526, + "step": 3381 + }, + { + "epoch": 0.18614122956684462, + "grad_norm": 0.7840956449508667, + "learning_rate": 9.798971979772825e-06, + "loss": 0.8009, + "step": 3382 + }, + { + "epoch": 0.18619626836920028, + "grad_norm": 0.7826499342918396, + "learning_rate": 9.798850285939809e-06, + "loss": 0.821, + "step": 3383 + }, + { + "epoch": 0.18625130717155594, + "grad_norm": 0.7829813361167908, + "learning_rate": 9.798728556039918e-06, + "loss": 0.8053, + "step": 3384 + }, + { + "epoch": 0.1863063459739116, + "grad_norm": 0.7267470359802246, + "learning_rate": 9.798606790074067e-06, + "loss": 0.6797, + "step": 3385 + }, + { + "epoch": 0.18636138477626726, + "grad_norm": 0.8560196757316589, + "learning_rate": 9.798484988043173e-06, + "loss": 0.8476, + "step": 3386 + }, + { + "epoch": 0.18641642357862293, + "grad_norm": 0.7920921444892883, + "learning_rate": 9.798363149948148e-06, + "loss": 0.8832, + "step": 3387 + }, + { + "epoch": 0.1864714623809786, + "grad_norm": 0.8414384126663208, + "learning_rate": 9.798241275789912e-06, + "loss": 0.8607, + "step": 3388 + }, + { + "epoch": 0.18652650118333425, + "grad_norm": 0.7255431413650513, + "learning_rate": 9.798119365569378e-06, + "loss": 0.6426, + "step": 3389 + }, + { + "epoch": 0.1865815399856899, + "grad_norm": 0.8842852711677551, + "learning_rate": 9.797997419287465e-06, + "loss": 0.9058, + "step": 3390 + }, + { + "epoch": 0.18663657878804557, + "grad_norm": 0.7178265452384949, + "learning_rate": 9.797875436945086e-06, + "loss": 0.8134, + "step": 3391 + }, + { + "epoch": 0.18669161759040123, + "grad_norm": 0.7275096774101257, + "learning_rate": 9.797753418543161e-06, + "loss": 0.6858, + "step": 3392 + }, + { + "epoch": 0.1867466563927569, + "grad_norm": 0.7587800025939941, + "learning_rate": 9.797631364082605e-06, + "loss": 0.7437, + "step": 3393 + }, + { + "epoch": 0.18680169519511255, + "grad_norm": 0.9769744873046875, + "learning_rate": 9.797509273564336e-06, + "loss": 0.8024, + "step": 3394 + }, + { + "epoch": 0.18685673399746822, + "grad_norm": 0.7662433385848999, + "learning_rate": 9.79738714698927e-06, + "loss": 0.8122, + "step": 3395 + }, + { + "epoch": 0.18691177279982388, + "grad_norm": 0.8620306849479675, + "learning_rate": 9.797264984358328e-06, + "loss": 0.7952, + "step": 3396 + }, + { + "epoch": 0.18696681160217954, + "grad_norm": 0.7542591094970703, + "learning_rate": 9.797142785672427e-06, + "loss": 0.8315, + "step": 3397 + }, + { + "epoch": 0.1870218504045352, + "grad_norm": 0.7273713946342468, + "learning_rate": 9.797020550932483e-06, + "loss": 0.7316, + "step": 3398 + }, + { + "epoch": 0.18707688920689086, + "grad_norm": 1.031592845916748, + "learning_rate": 9.796898280139417e-06, + "loss": 0.7478, + "step": 3399 + }, + { + "epoch": 0.18713192800924652, + "grad_norm": 0.791407585144043, + "learning_rate": 9.796775973294147e-06, + "loss": 0.7742, + "step": 3400 + }, + { + "epoch": 0.18718696681160218, + "grad_norm": 0.8311418294906616, + "learning_rate": 9.796653630397595e-06, + "loss": 0.8182, + "step": 3401 + }, + { + "epoch": 0.18724200561395785, + "grad_norm": 0.7960993051528931, + "learning_rate": 9.796531251450678e-06, + "loss": 0.7606, + "step": 3402 + }, + { + "epoch": 0.1872970444163135, + "grad_norm": 0.8671618103981018, + "learning_rate": 9.796408836454316e-06, + "loss": 0.7136, + "step": 3403 + }, + { + "epoch": 0.18735208321866917, + "grad_norm": 1.1071348190307617, + "learning_rate": 9.796286385409428e-06, + "loss": 0.7729, + "step": 3404 + }, + { + "epoch": 0.18740712202102483, + "grad_norm": 0.738217294216156, + "learning_rate": 9.796163898316935e-06, + "loss": 0.7425, + "step": 3405 + }, + { + "epoch": 0.1874621608233805, + "grad_norm": 0.7567199468612671, + "learning_rate": 9.796041375177758e-06, + "loss": 0.8442, + "step": 3406 + }, + { + "epoch": 0.18751719962573615, + "grad_norm": 0.7942413091659546, + "learning_rate": 9.79591881599282e-06, + "loss": 0.852, + "step": 3407 + }, + { + "epoch": 0.18757223842809181, + "grad_norm": 0.7529355883598328, + "learning_rate": 9.795796220763038e-06, + "loss": 0.8086, + "step": 3408 + }, + { + "epoch": 0.18762727723044748, + "grad_norm": 0.7645192742347717, + "learning_rate": 9.795673589489337e-06, + "loss": 0.831, + "step": 3409 + }, + { + "epoch": 0.18768231603280314, + "grad_norm": 0.694791853427887, + "learning_rate": 9.795550922172635e-06, + "loss": 0.6919, + "step": 3410 + }, + { + "epoch": 0.1877373548351588, + "grad_norm": 0.7041944265365601, + "learning_rate": 9.795428218813858e-06, + "loss": 0.7284, + "step": 3411 + }, + { + "epoch": 0.18779239363751446, + "grad_norm": 0.8972276449203491, + "learning_rate": 9.795305479413924e-06, + "loss": 0.7156, + "step": 3412 + }, + { + "epoch": 0.18784743243987012, + "grad_norm": 0.9730873107910156, + "learning_rate": 9.795182703973758e-06, + "loss": 0.8739, + "step": 3413 + }, + { + "epoch": 0.18790247124222578, + "grad_norm": 0.8137956261634827, + "learning_rate": 9.795059892494283e-06, + "loss": 0.8189, + "step": 3414 + }, + { + "epoch": 0.18795751004458142, + "grad_norm": 0.8171416521072388, + "learning_rate": 9.794937044976422e-06, + "loss": 0.9449, + "step": 3415 + }, + { + "epoch": 0.18801254884693708, + "grad_norm": 0.7929911017417908, + "learning_rate": 9.794814161421098e-06, + "loss": 0.8034, + "step": 3416 + }, + { + "epoch": 0.18806758764929274, + "grad_norm": 1.1045749187469482, + "learning_rate": 9.794691241829233e-06, + "loss": 0.875, + "step": 3417 + }, + { + "epoch": 0.1881226264516484, + "grad_norm": 0.8141040205955505, + "learning_rate": 9.794568286201752e-06, + "loss": 0.787, + "step": 3418 + }, + { + "epoch": 0.18817766525400406, + "grad_norm": 0.7615541815757751, + "learning_rate": 9.79444529453958e-06, + "loss": 0.8491, + "step": 3419 + }, + { + "epoch": 0.18823270405635972, + "grad_norm": 0.848419189453125, + "learning_rate": 9.79432226684364e-06, + "loss": 0.7445, + "step": 3420 + }, + { + "epoch": 0.18828774285871538, + "grad_norm": 0.8075067400932312, + "learning_rate": 9.794199203114858e-06, + "loss": 0.6581, + "step": 3421 + }, + { + "epoch": 0.18834278166107105, + "grad_norm": 0.8473401069641113, + "learning_rate": 9.794076103354158e-06, + "loss": 0.839, + "step": 3422 + }, + { + "epoch": 0.1883978204634267, + "grad_norm": 0.8211609721183777, + "learning_rate": 9.793952967562463e-06, + "loss": 0.7709, + "step": 3423 + }, + { + "epoch": 0.18845285926578237, + "grad_norm": 0.7527804374694824, + "learning_rate": 9.793829795740703e-06, + "loss": 0.7315, + "step": 3424 + }, + { + "epoch": 0.18850789806813803, + "grad_norm": 0.7971188426017761, + "learning_rate": 9.793706587889802e-06, + "loss": 0.7507, + "step": 3425 + }, + { + "epoch": 0.1885629368704937, + "grad_norm": 1.024066686630249, + "learning_rate": 9.793583344010684e-06, + "loss": 0.9043, + "step": 3426 + }, + { + "epoch": 0.18861797567284935, + "grad_norm": 0.7428625226020813, + "learning_rate": 9.793460064104276e-06, + "loss": 0.7435, + "step": 3427 + }, + { + "epoch": 0.18867301447520501, + "grad_norm": 0.8438264727592468, + "learning_rate": 9.793336748171507e-06, + "loss": 0.8618, + "step": 3428 + }, + { + "epoch": 0.18872805327756068, + "grad_norm": 0.7846877574920654, + "learning_rate": 9.793213396213302e-06, + "loss": 0.8064, + "step": 3429 + }, + { + "epoch": 0.18878309207991634, + "grad_norm": 0.7527204751968384, + "learning_rate": 9.793090008230587e-06, + "loss": 0.7596, + "step": 3430 + }, + { + "epoch": 0.188838130882272, + "grad_norm": 1.1236757040023804, + "learning_rate": 9.792966584224292e-06, + "loss": 0.8292, + "step": 3431 + }, + { + "epoch": 0.18889316968462766, + "grad_norm": 0.8128102421760559, + "learning_rate": 9.792843124195343e-06, + "loss": 0.8073, + "step": 3432 + }, + { + "epoch": 0.18894820848698332, + "grad_norm": 0.7668742537498474, + "learning_rate": 9.792719628144667e-06, + "loss": 0.7848, + "step": 3433 + }, + { + "epoch": 0.18900324728933898, + "grad_norm": 1.8663485050201416, + "learning_rate": 9.792596096073193e-06, + "loss": 0.9388, + "step": 3434 + }, + { + "epoch": 0.18905828609169464, + "grad_norm": 0.8066239356994629, + "learning_rate": 9.792472527981852e-06, + "loss": 0.6647, + "step": 3435 + }, + { + "epoch": 0.1891133248940503, + "grad_norm": 0.8268817067146301, + "learning_rate": 9.792348923871567e-06, + "loss": 0.9676, + "step": 3436 + }, + { + "epoch": 0.18916836369640597, + "grad_norm": 0.7165037393569946, + "learning_rate": 9.792225283743272e-06, + "loss": 0.6937, + "step": 3437 + }, + { + "epoch": 0.18922340249876163, + "grad_norm": 0.7850403785705566, + "learning_rate": 9.792101607597895e-06, + "loss": 0.7782, + "step": 3438 + }, + { + "epoch": 0.1892784413011173, + "grad_norm": 0.8839808702468872, + "learning_rate": 9.791977895436365e-06, + "loss": 0.7639, + "step": 3439 + }, + { + "epoch": 0.18933348010347295, + "grad_norm": 0.8260362148284912, + "learning_rate": 9.791854147259611e-06, + "loss": 0.8201, + "step": 3440 + }, + { + "epoch": 0.1893885189058286, + "grad_norm": 0.8792916536331177, + "learning_rate": 9.791730363068564e-06, + "loss": 0.8251, + "step": 3441 + }, + { + "epoch": 0.18944355770818427, + "grad_norm": 0.8192774653434753, + "learning_rate": 9.791606542864154e-06, + "loss": 0.7944, + "step": 3442 + }, + { + "epoch": 0.18949859651053994, + "grad_norm": 0.751470685005188, + "learning_rate": 9.791482686647313e-06, + "loss": 0.7563, + "step": 3443 + }, + { + "epoch": 0.1895536353128956, + "grad_norm": 0.8902072906494141, + "learning_rate": 9.79135879441897e-06, + "loss": 0.7719, + "step": 3444 + }, + { + "epoch": 0.18960867411525126, + "grad_norm": 0.7166435122489929, + "learning_rate": 9.791234866180058e-06, + "loss": 0.7871, + "step": 3445 + }, + { + "epoch": 0.18966371291760692, + "grad_norm": 0.763416588306427, + "learning_rate": 9.791110901931505e-06, + "loss": 0.8226, + "step": 3446 + }, + { + "epoch": 0.18971875171996258, + "grad_norm": 0.806633472442627, + "learning_rate": 9.790986901674246e-06, + "loss": 0.7828, + "step": 3447 + }, + { + "epoch": 0.18977379052231824, + "grad_norm": 0.8139312863349915, + "learning_rate": 9.790862865409213e-06, + "loss": 0.8441, + "step": 3448 + }, + { + "epoch": 0.1898288293246739, + "grad_norm": 0.8362452387809753, + "learning_rate": 9.790738793137335e-06, + "loss": 0.8765, + "step": 3449 + }, + { + "epoch": 0.18988386812702956, + "grad_norm": 0.7736263871192932, + "learning_rate": 9.790614684859549e-06, + "loss": 0.8373, + "step": 3450 + }, + { + "epoch": 0.18993890692938523, + "grad_norm": 0.8742800354957581, + "learning_rate": 9.790490540576784e-06, + "loss": 0.8976, + "step": 3451 + }, + { + "epoch": 0.1899939457317409, + "grad_norm": 0.701505720615387, + "learning_rate": 9.790366360289974e-06, + "loss": 0.7799, + "step": 3452 + }, + { + "epoch": 0.19004898453409655, + "grad_norm": 0.7771356701850891, + "learning_rate": 9.790242144000055e-06, + "loss": 0.7617, + "step": 3453 + }, + { + "epoch": 0.1901040233364522, + "grad_norm": 0.897576093673706, + "learning_rate": 9.790117891707955e-06, + "loss": 0.7817, + "step": 3454 + }, + { + "epoch": 0.19015906213880787, + "grad_norm": 0.7296561002731323, + "learning_rate": 9.789993603414613e-06, + "loss": 0.8344, + "step": 3455 + }, + { + "epoch": 0.19021410094116353, + "grad_norm": 0.8099396228790283, + "learning_rate": 9.789869279120962e-06, + "loss": 0.7369, + "step": 3456 + }, + { + "epoch": 0.1902691397435192, + "grad_norm": 0.7802554368972778, + "learning_rate": 9.789744918827935e-06, + "loss": 0.8383, + "step": 3457 + }, + { + "epoch": 0.19032417854587483, + "grad_norm": 0.7508029341697693, + "learning_rate": 9.789620522536467e-06, + "loss": 0.825, + "step": 3458 + }, + { + "epoch": 0.1903792173482305, + "grad_norm": 0.7782164216041565, + "learning_rate": 9.789496090247494e-06, + "loss": 0.7737, + "step": 3459 + }, + { + "epoch": 0.19043425615058615, + "grad_norm": 0.7711489796638489, + "learning_rate": 9.78937162196195e-06, + "loss": 0.7694, + "step": 3460 + }, + { + "epoch": 0.1904892949529418, + "grad_norm": 0.821579098701477, + "learning_rate": 9.789247117680769e-06, + "loss": 0.7493, + "step": 3461 + }, + { + "epoch": 0.19054433375529747, + "grad_norm": 0.6700833439826965, + "learning_rate": 9.789122577404892e-06, + "loss": 0.7696, + "step": 3462 + }, + { + "epoch": 0.19059937255765314, + "grad_norm": 0.854340136051178, + "learning_rate": 9.78899800113525e-06, + "loss": 0.9503, + "step": 3463 + }, + { + "epoch": 0.1906544113600088, + "grad_norm": 0.8095537424087524, + "learning_rate": 9.78887338887278e-06, + "loss": 0.8435, + "step": 3464 + }, + { + "epoch": 0.19070945016236446, + "grad_norm": 0.8156480193138123, + "learning_rate": 9.78874874061842e-06, + "loss": 0.8561, + "step": 3465 + }, + { + "epoch": 0.19076448896472012, + "grad_norm": 0.8065482378005981, + "learning_rate": 9.788624056373108e-06, + "loss": 0.7793, + "step": 3466 + }, + { + "epoch": 0.19081952776707578, + "grad_norm": 0.789601743221283, + "learning_rate": 9.788499336137778e-06, + "loss": 0.7523, + "step": 3467 + }, + { + "epoch": 0.19087456656943144, + "grad_norm": 0.8322301506996155, + "learning_rate": 9.788374579913369e-06, + "loss": 0.9034, + "step": 3468 + }, + { + "epoch": 0.1909296053717871, + "grad_norm": 0.8194506764411926, + "learning_rate": 9.788249787700818e-06, + "loss": 0.8601, + "step": 3469 + }, + { + "epoch": 0.19098464417414276, + "grad_norm": 0.8419962525367737, + "learning_rate": 9.788124959501065e-06, + "loss": 0.869, + "step": 3470 + }, + { + "epoch": 0.19103968297649843, + "grad_norm": 0.760637104511261, + "learning_rate": 9.788000095315044e-06, + "loss": 0.7293, + "step": 3471 + }, + { + "epoch": 0.1910947217788541, + "grad_norm": 1.3964574337005615, + "learning_rate": 9.787875195143697e-06, + "loss": 0.8032, + "step": 3472 + }, + { + "epoch": 0.19114976058120975, + "grad_norm": 0.8205012679100037, + "learning_rate": 9.787750258987962e-06, + "loss": 0.8868, + "step": 3473 + }, + { + "epoch": 0.1912047993835654, + "grad_norm": 0.8183104991912842, + "learning_rate": 9.78762528684878e-06, + "loss": 0.7531, + "step": 3474 + }, + { + "epoch": 0.19125983818592107, + "grad_norm": 0.7659775018692017, + "learning_rate": 9.787500278727083e-06, + "loss": 0.8081, + "step": 3475 + }, + { + "epoch": 0.19131487698827673, + "grad_norm": 0.8262091279029846, + "learning_rate": 9.787375234623819e-06, + "loss": 0.82, + "step": 3476 + }, + { + "epoch": 0.1913699157906324, + "grad_norm": 0.857761025428772, + "learning_rate": 9.787250154539923e-06, + "loss": 0.9133, + "step": 3477 + }, + { + "epoch": 0.19142495459298806, + "grad_norm": 0.7551915645599365, + "learning_rate": 9.787125038476334e-06, + "loss": 0.7822, + "step": 3478 + }, + { + "epoch": 0.19147999339534372, + "grad_norm": 0.7777357697486877, + "learning_rate": 9.786999886433998e-06, + "loss": 0.7676, + "step": 3479 + }, + { + "epoch": 0.19153503219769938, + "grad_norm": 0.8389080166816711, + "learning_rate": 9.786874698413852e-06, + "loss": 0.7901, + "step": 3480 + }, + { + "epoch": 0.19159007100005504, + "grad_norm": 0.7894837856292725, + "learning_rate": 9.786749474416836e-06, + "loss": 0.8393, + "step": 3481 + }, + { + "epoch": 0.1916451098024107, + "grad_norm": 1.9752860069274902, + "learning_rate": 9.786624214443893e-06, + "loss": 0.7611, + "step": 3482 + }, + { + "epoch": 0.19170014860476636, + "grad_norm": 0.8023802042007446, + "learning_rate": 9.786498918495963e-06, + "loss": 0.8426, + "step": 3483 + }, + { + "epoch": 0.19175518740712202, + "grad_norm": 0.7232086658477783, + "learning_rate": 9.78637358657399e-06, + "loss": 0.6611, + "step": 3484 + }, + { + "epoch": 0.19181022620947769, + "grad_norm": 0.8198665380477905, + "learning_rate": 9.786248218678912e-06, + "loss": 0.8795, + "step": 3485 + }, + { + "epoch": 0.19186526501183335, + "grad_norm": 0.942404568195343, + "learning_rate": 9.786122814811675e-06, + "loss": 0.9146, + "step": 3486 + }, + { + "epoch": 0.191920303814189, + "grad_norm": 0.7602691054344177, + "learning_rate": 9.78599737497322e-06, + "loss": 0.7514, + "step": 3487 + }, + { + "epoch": 0.19197534261654467, + "grad_norm": 0.7981933951377869, + "learning_rate": 9.785871899164489e-06, + "loss": 0.7722, + "step": 3488 + }, + { + "epoch": 0.19203038141890033, + "grad_norm": 0.8617631793022156, + "learning_rate": 9.785746387386427e-06, + "loss": 0.8989, + "step": 3489 + }, + { + "epoch": 0.192085420221256, + "grad_norm": 0.7691803574562073, + "learning_rate": 9.785620839639976e-06, + "loss": 0.7929, + "step": 3490 + }, + { + "epoch": 0.19214045902361165, + "grad_norm": 1.3053189516067505, + "learning_rate": 9.785495255926078e-06, + "loss": 0.8478, + "step": 3491 + }, + { + "epoch": 0.19219549782596732, + "grad_norm": 0.807064950466156, + "learning_rate": 9.785369636245681e-06, + "loss": 0.7452, + "step": 3492 + }, + { + "epoch": 0.19225053662832298, + "grad_norm": 0.8182778358459473, + "learning_rate": 9.785243980599726e-06, + "loss": 0.8371, + "step": 3493 + }, + { + "epoch": 0.19230557543067864, + "grad_norm": 0.7654449343681335, + "learning_rate": 9.785118288989157e-06, + "loss": 0.8321, + "step": 3494 + }, + { + "epoch": 0.1923606142330343, + "grad_norm": 0.7192448973655701, + "learning_rate": 9.784992561414922e-06, + "loss": 0.7451, + "step": 3495 + }, + { + "epoch": 0.19241565303538996, + "grad_norm": 0.8639407753944397, + "learning_rate": 9.784866797877964e-06, + "loss": 0.9272, + "step": 3496 + }, + { + "epoch": 0.19247069183774562, + "grad_norm": 0.8329927921295166, + "learning_rate": 9.784740998379225e-06, + "loss": 0.8034, + "step": 3497 + }, + { + "epoch": 0.19252573064010128, + "grad_norm": 0.7975476980209351, + "learning_rate": 9.784615162919656e-06, + "loss": 0.6885, + "step": 3498 + }, + { + "epoch": 0.19258076944245694, + "grad_norm": 0.8077559471130371, + "learning_rate": 9.7844892915002e-06, + "loss": 0.8745, + "step": 3499 + }, + { + "epoch": 0.1926358082448126, + "grad_norm": 0.7957825660705566, + "learning_rate": 9.7843633841218e-06, + "loss": 0.7612, + "step": 3500 + }, + { + "epoch": 0.19269084704716824, + "grad_norm": 0.8478250503540039, + "learning_rate": 9.784237440785408e-06, + "loss": 0.8675, + "step": 3501 + }, + { + "epoch": 0.1927458858495239, + "grad_norm": 0.7289726138114929, + "learning_rate": 9.78411146149197e-06, + "loss": 0.7126, + "step": 3502 + }, + { + "epoch": 0.19280092465187956, + "grad_norm": 0.7608509063720703, + "learning_rate": 9.783985446242427e-06, + "loss": 0.7049, + "step": 3503 + }, + { + "epoch": 0.19285596345423522, + "grad_norm": 0.8985201120376587, + "learning_rate": 9.783859395037733e-06, + "loss": 0.8067, + "step": 3504 + }, + { + "epoch": 0.19291100225659089, + "grad_norm": 0.7563273906707764, + "learning_rate": 9.78373330787883e-06, + "loss": 0.7018, + "step": 3505 + }, + { + "epoch": 0.19296604105894655, + "grad_norm": 0.8022900223731995, + "learning_rate": 9.78360718476667e-06, + "loss": 0.8346, + "step": 3506 + }, + { + "epoch": 0.1930210798613022, + "grad_norm": 0.897566020488739, + "learning_rate": 9.783481025702197e-06, + "loss": 0.9465, + "step": 3507 + }, + { + "epoch": 0.19307611866365787, + "grad_norm": 0.9550303220748901, + "learning_rate": 9.783354830686363e-06, + "loss": 0.8904, + "step": 3508 + }, + { + "epoch": 0.19313115746601353, + "grad_norm": 0.8152582049369812, + "learning_rate": 9.783228599720114e-06, + "loss": 0.7776, + "step": 3509 + }, + { + "epoch": 0.1931861962683692, + "grad_norm": 0.7421940565109253, + "learning_rate": 9.783102332804398e-06, + "loss": 0.6847, + "step": 3510 + }, + { + "epoch": 0.19324123507072485, + "grad_norm": 0.7414368391036987, + "learning_rate": 9.782976029940167e-06, + "loss": 0.8435, + "step": 3511 + }, + { + "epoch": 0.19329627387308052, + "grad_norm": 0.7845529317855835, + "learning_rate": 9.782849691128366e-06, + "loss": 0.8255, + "step": 3512 + }, + { + "epoch": 0.19335131267543618, + "grad_norm": 0.7779788970947266, + "learning_rate": 9.78272331636995e-06, + "loss": 0.7801, + "step": 3513 + }, + { + "epoch": 0.19340635147779184, + "grad_norm": 0.7537885904312134, + "learning_rate": 9.782596905665865e-06, + "loss": 0.7501, + "step": 3514 + }, + { + "epoch": 0.1934613902801475, + "grad_norm": 0.7585812211036682, + "learning_rate": 9.782470459017059e-06, + "loss": 0.8425, + "step": 3515 + }, + { + "epoch": 0.19351642908250316, + "grad_norm": 0.7923589944839478, + "learning_rate": 9.78234397642449e-06, + "loss": 0.8412, + "step": 3516 + }, + { + "epoch": 0.19357146788485882, + "grad_norm": 0.8710628151893616, + "learning_rate": 9.7822174578891e-06, + "loss": 0.8014, + "step": 3517 + }, + { + "epoch": 0.19362650668721448, + "grad_norm": 0.7646920084953308, + "learning_rate": 9.782090903411845e-06, + "loss": 0.8256, + "step": 3518 + }, + { + "epoch": 0.19368154548957014, + "grad_norm": 0.7560480833053589, + "learning_rate": 9.781964312993675e-06, + "loss": 0.7816, + "step": 3519 + }, + { + "epoch": 0.1937365842919258, + "grad_norm": 0.7438123226165771, + "learning_rate": 9.78183768663554e-06, + "loss": 0.8319, + "step": 3520 + }, + { + "epoch": 0.19379162309428147, + "grad_norm": 0.7239874601364136, + "learning_rate": 9.781711024338394e-06, + "loss": 0.6968, + "step": 3521 + }, + { + "epoch": 0.19384666189663713, + "grad_norm": 0.881197988986969, + "learning_rate": 9.781584326103188e-06, + "loss": 0.9493, + "step": 3522 + }, + { + "epoch": 0.1939017006989928, + "grad_norm": 0.7903854846954346, + "learning_rate": 9.781457591930874e-06, + "loss": 0.8312, + "step": 3523 + }, + { + "epoch": 0.19395673950134845, + "grad_norm": 0.7375456094741821, + "learning_rate": 9.781330821822405e-06, + "loss": 0.7434, + "step": 3524 + }, + { + "epoch": 0.1940117783037041, + "grad_norm": 0.7101724743843079, + "learning_rate": 9.781204015778733e-06, + "loss": 0.75, + "step": 3525 + }, + { + "epoch": 0.19406681710605977, + "grad_norm": 0.8267471194267273, + "learning_rate": 9.781077173800812e-06, + "loss": 0.8807, + "step": 3526 + }, + { + "epoch": 0.19412185590841544, + "grad_norm": 0.9014178514480591, + "learning_rate": 9.780950295889594e-06, + "loss": 0.7836, + "step": 3527 + }, + { + "epoch": 0.1941768947107711, + "grad_norm": 0.7579739689826965, + "learning_rate": 9.780823382046034e-06, + "loss": 0.8331, + "step": 3528 + }, + { + "epoch": 0.19423193351312676, + "grad_norm": 0.8308925032615662, + "learning_rate": 9.780696432271084e-06, + "loss": 0.794, + "step": 3529 + }, + { + "epoch": 0.19428697231548242, + "grad_norm": 0.7461574673652649, + "learning_rate": 9.780569446565701e-06, + "loss": 0.8155, + "step": 3530 + }, + { + "epoch": 0.19434201111783808, + "grad_norm": 0.8658885359764099, + "learning_rate": 9.780442424930836e-06, + "loss": 0.7907, + "step": 3531 + }, + { + "epoch": 0.19439704992019374, + "grad_norm": 0.7243279218673706, + "learning_rate": 9.780315367367449e-06, + "loss": 0.7985, + "step": 3532 + }, + { + "epoch": 0.1944520887225494, + "grad_norm": 0.8482224345207214, + "learning_rate": 9.780188273876486e-06, + "loss": 0.9095, + "step": 3533 + }, + { + "epoch": 0.19450712752490507, + "grad_norm": 0.8675364255905151, + "learning_rate": 9.78006114445891e-06, + "loss": 0.759, + "step": 3534 + }, + { + "epoch": 0.19456216632726073, + "grad_norm": 0.8388474583625793, + "learning_rate": 9.779933979115675e-06, + "loss": 0.8331, + "step": 3535 + }, + { + "epoch": 0.1946172051296164, + "grad_norm": 0.8050872683525085, + "learning_rate": 9.779806777847735e-06, + "loss": 0.861, + "step": 3536 + }, + { + "epoch": 0.19467224393197205, + "grad_norm": 0.8401390910148621, + "learning_rate": 9.779679540656046e-06, + "loss": 0.755, + "step": 3537 + }, + { + "epoch": 0.1947272827343277, + "grad_norm": 0.865160346031189, + "learning_rate": 9.779552267541566e-06, + "loss": 0.7515, + "step": 3538 + }, + { + "epoch": 0.19478232153668337, + "grad_norm": 0.923086941242218, + "learning_rate": 9.77942495850525e-06, + "loss": 0.8032, + "step": 3539 + }, + { + "epoch": 0.19483736033903903, + "grad_norm": 0.8402467966079712, + "learning_rate": 9.779297613548056e-06, + "loss": 0.9198, + "step": 3540 + }, + { + "epoch": 0.1948923991413947, + "grad_norm": 0.7875306606292725, + "learning_rate": 9.779170232670939e-06, + "loss": 0.712, + "step": 3541 + }, + { + "epoch": 0.19494743794375036, + "grad_norm": 0.7996379137039185, + "learning_rate": 9.779042815874858e-06, + "loss": 0.8126, + "step": 3542 + }, + { + "epoch": 0.19500247674610602, + "grad_norm": 0.7644525766372681, + "learning_rate": 9.778915363160773e-06, + "loss": 0.8602, + "step": 3543 + }, + { + "epoch": 0.19505751554846165, + "grad_norm": 0.8068630695343018, + "learning_rate": 9.778787874529635e-06, + "loss": 0.736, + "step": 3544 + }, + { + "epoch": 0.1951125543508173, + "grad_norm": 0.7889519929885864, + "learning_rate": 9.77866034998241e-06, + "loss": 0.755, + "step": 3545 + }, + { + "epoch": 0.19516759315317297, + "grad_norm": 0.7895978689193726, + "learning_rate": 9.778532789520053e-06, + "loss": 0.8213, + "step": 3546 + }, + { + "epoch": 0.19522263195552864, + "grad_norm": 0.8571796417236328, + "learning_rate": 9.77840519314352e-06, + "loss": 0.8193, + "step": 3547 + }, + { + "epoch": 0.1952776707578843, + "grad_norm": 0.6880007982254028, + "learning_rate": 9.778277560853775e-06, + "loss": 0.6354, + "step": 3548 + }, + { + "epoch": 0.19533270956023996, + "grad_norm": 0.8155353665351868, + "learning_rate": 9.778149892651775e-06, + "loss": 0.8518, + "step": 3549 + }, + { + "epoch": 0.19538774836259562, + "grad_norm": 0.851021945476532, + "learning_rate": 9.778022188538479e-06, + "loss": 0.8506, + "step": 3550 + }, + { + "epoch": 0.19544278716495128, + "grad_norm": 0.8910510540008545, + "learning_rate": 9.777894448514847e-06, + "loss": 0.8825, + "step": 3551 + }, + { + "epoch": 0.19549782596730694, + "grad_norm": 0.8156018853187561, + "learning_rate": 9.777766672581838e-06, + "loss": 0.8262, + "step": 3552 + }, + { + "epoch": 0.1955528647696626, + "grad_norm": 0.756340503692627, + "learning_rate": 9.777638860740415e-06, + "loss": 0.7094, + "step": 3553 + }, + { + "epoch": 0.19560790357201827, + "grad_norm": 0.7604243159294128, + "learning_rate": 9.777511012991538e-06, + "loss": 0.8089, + "step": 3554 + }, + { + "epoch": 0.19566294237437393, + "grad_norm": 0.7609277963638306, + "learning_rate": 9.777383129336167e-06, + "loss": 0.7853, + "step": 3555 + }, + { + "epoch": 0.1957179811767296, + "grad_norm": 1.3562177419662476, + "learning_rate": 9.77725520977526e-06, + "loss": 0.7051, + "step": 3556 + }, + { + "epoch": 0.19577301997908525, + "grad_norm": 0.7428582310676575, + "learning_rate": 9.777127254309784e-06, + "loss": 0.734, + "step": 3557 + }, + { + "epoch": 0.1958280587814409, + "grad_norm": 0.6941032409667969, + "learning_rate": 9.776999262940698e-06, + "loss": 0.7862, + "step": 3558 + }, + { + "epoch": 0.19588309758379657, + "grad_norm": 0.8249906301498413, + "learning_rate": 9.776871235668966e-06, + "loss": 0.8324, + "step": 3559 + }, + { + "epoch": 0.19593813638615223, + "grad_norm": 0.6778795719146729, + "learning_rate": 9.776743172495546e-06, + "loss": 0.743, + "step": 3560 + }, + { + "epoch": 0.1959931751885079, + "grad_norm": 0.8454411625862122, + "learning_rate": 9.776615073421405e-06, + "loss": 0.8625, + "step": 3561 + }, + { + "epoch": 0.19604821399086356, + "grad_norm": 0.8303809762001038, + "learning_rate": 9.776486938447503e-06, + "loss": 0.8806, + "step": 3562 + }, + { + "epoch": 0.19610325279321922, + "grad_norm": 0.8814080357551575, + "learning_rate": 9.776358767574803e-06, + "loss": 0.9096, + "step": 3563 + }, + { + "epoch": 0.19615829159557488, + "grad_norm": 0.7860022187232971, + "learning_rate": 9.77623056080427e-06, + "loss": 0.8101, + "step": 3564 + }, + { + "epoch": 0.19621333039793054, + "grad_norm": 0.7604898810386658, + "learning_rate": 9.776102318136866e-06, + "loss": 0.8121, + "step": 3565 + }, + { + "epoch": 0.1962683692002862, + "grad_norm": 0.810708224773407, + "learning_rate": 9.775974039573555e-06, + "loss": 0.8334, + "step": 3566 + }, + { + "epoch": 0.19632340800264186, + "grad_norm": 1.0174707174301147, + "learning_rate": 9.775845725115301e-06, + "loss": 0.8147, + "step": 3567 + }, + { + "epoch": 0.19637844680499753, + "grad_norm": 0.825137734413147, + "learning_rate": 9.77571737476307e-06, + "loss": 0.816, + "step": 3568 + }, + { + "epoch": 0.1964334856073532, + "grad_norm": 0.9023691415786743, + "learning_rate": 9.775588988517826e-06, + "loss": 0.9157, + "step": 3569 + }, + { + "epoch": 0.19648852440970885, + "grad_norm": 0.7287655472755432, + "learning_rate": 9.775460566380534e-06, + "loss": 0.7414, + "step": 3570 + }, + { + "epoch": 0.1965435632120645, + "grad_norm": 0.8675361275672913, + "learning_rate": 9.775332108352158e-06, + "loss": 0.7212, + "step": 3571 + }, + { + "epoch": 0.19659860201442017, + "grad_norm": 0.8633139729499817, + "learning_rate": 9.775203614433664e-06, + "loss": 0.7254, + "step": 3572 + }, + { + "epoch": 0.19665364081677583, + "grad_norm": 0.8628275394439697, + "learning_rate": 9.775075084626017e-06, + "loss": 0.7403, + "step": 3573 + }, + { + "epoch": 0.1967086796191315, + "grad_norm": 0.86918044090271, + "learning_rate": 9.774946518930184e-06, + "loss": 0.8208, + "step": 3574 + }, + { + "epoch": 0.19676371842148715, + "grad_norm": 1.3616218566894531, + "learning_rate": 9.774817917347132e-06, + "loss": 0.7432, + "step": 3575 + }, + { + "epoch": 0.19681875722384282, + "grad_norm": 0.929084062576294, + "learning_rate": 9.774689279877827e-06, + "loss": 0.9567, + "step": 3576 + }, + { + "epoch": 0.19687379602619848, + "grad_norm": 0.7732542753219604, + "learning_rate": 9.774560606523234e-06, + "loss": 0.8682, + "step": 3577 + }, + { + "epoch": 0.19692883482855414, + "grad_norm": 0.7933471202850342, + "learning_rate": 9.774431897284323e-06, + "loss": 0.7112, + "step": 3578 + }, + { + "epoch": 0.1969838736309098, + "grad_norm": 0.8229583501815796, + "learning_rate": 9.77430315216206e-06, + "loss": 0.762, + "step": 3579 + }, + { + "epoch": 0.19703891243326546, + "grad_norm": 0.7571341395378113, + "learning_rate": 9.774174371157412e-06, + "loss": 0.7627, + "step": 3580 + }, + { + "epoch": 0.19709395123562112, + "grad_norm": 1.1551839113235474, + "learning_rate": 9.774045554271347e-06, + "loss": 0.8621, + "step": 3581 + }, + { + "epoch": 0.19714899003797678, + "grad_norm": 0.8546237349510193, + "learning_rate": 9.773916701504833e-06, + "loss": 0.8183, + "step": 3582 + }, + { + "epoch": 0.19720402884033245, + "grad_norm": 0.7297555804252625, + "learning_rate": 9.773787812858841e-06, + "loss": 0.8098, + "step": 3583 + }, + { + "epoch": 0.1972590676426881, + "grad_norm": 0.7846053838729858, + "learning_rate": 9.773658888334336e-06, + "loss": 0.7874, + "step": 3584 + }, + { + "epoch": 0.19731410644504377, + "grad_norm": 0.8949562907218933, + "learning_rate": 9.773529927932288e-06, + "loss": 0.8651, + "step": 3585 + }, + { + "epoch": 0.19736914524739943, + "grad_norm": 0.8041829466819763, + "learning_rate": 9.773400931653668e-06, + "loss": 0.7519, + "step": 3586 + }, + { + "epoch": 0.19742418404975506, + "grad_norm": 0.8090983033180237, + "learning_rate": 9.773271899499444e-06, + "loss": 0.8606, + "step": 3587 + }, + { + "epoch": 0.19747922285211073, + "grad_norm": 0.7954100966453552, + "learning_rate": 9.773142831470587e-06, + "loss": 0.9028, + "step": 3588 + }, + { + "epoch": 0.1975342616544664, + "grad_norm": 0.6865562796592712, + "learning_rate": 9.773013727568066e-06, + "loss": 0.7323, + "step": 3589 + }, + { + "epoch": 0.19758930045682205, + "grad_norm": 0.9144858717918396, + "learning_rate": 9.772884587792851e-06, + "loss": 0.8178, + "step": 3590 + }, + { + "epoch": 0.1976443392591777, + "grad_norm": 0.8096563220024109, + "learning_rate": 9.772755412145913e-06, + "loss": 0.7749, + "step": 3591 + }, + { + "epoch": 0.19769937806153337, + "grad_norm": 1.4496957063674927, + "learning_rate": 9.772626200628222e-06, + "loss": 0.7981, + "step": 3592 + }, + { + "epoch": 0.19775441686388903, + "grad_norm": 0.7699438333511353, + "learning_rate": 9.77249695324075e-06, + "loss": 0.7683, + "step": 3593 + }, + { + "epoch": 0.1978094556662447, + "grad_norm": 0.7883017063140869, + "learning_rate": 9.77236766998447e-06, + "loss": 0.7668, + "step": 3594 + }, + { + "epoch": 0.19786449446860035, + "grad_norm": 0.7552568912506104, + "learning_rate": 9.772238350860352e-06, + "loss": 0.7914, + "step": 3595 + }, + { + "epoch": 0.19791953327095602, + "grad_norm": 0.8585009574890137, + "learning_rate": 9.772108995869366e-06, + "loss": 0.9888, + "step": 3596 + }, + { + "epoch": 0.19797457207331168, + "grad_norm": 0.9459839463233948, + "learning_rate": 9.77197960501249e-06, + "loss": 0.9923, + "step": 3597 + }, + { + "epoch": 0.19802961087566734, + "grad_norm": 0.844771683216095, + "learning_rate": 9.77185017829069e-06, + "loss": 0.8427, + "step": 3598 + }, + { + "epoch": 0.198084649678023, + "grad_norm": 0.749700665473938, + "learning_rate": 9.77172071570494e-06, + "loss": 0.8111, + "step": 3599 + }, + { + "epoch": 0.19813968848037866, + "grad_norm": 0.7297450304031372, + "learning_rate": 9.771591217256216e-06, + "loss": 0.7783, + "step": 3600 + }, + { + "epoch": 0.19819472728273432, + "grad_norm": 0.7928450703620911, + "learning_rate": 9.77146168294549e-06, + "loss": 0.8755, + "step": 3601 + }, + { + "epoch": 0.19824976608508998, + "grad_norm": 0.7236143946647644, + "learning_rate": 9.771332112773734e-06, + "loss": 0.7159, + "step": 3602 + }, + { + "epoch": 0.19830480488744565, + "grad_norm": 0.8170965313911438, + "learning_rate": 9.771202506741926e-06, + "loss": 0.9093, + "step": 3603 + }, + { + "epoch": 0.1983598436898013, + "grad_norm": 0.8834578990936279, + "learning_rate": 9.771072864851035e-06, + "loss": 0.8961, + "step": 3604 + }, + { + "epoch": 0.19841488249215697, + "grad_norm": 1.3750289678573608, + "learning_rate": 9.770943187102037e-06, + "loss": 0.8175, + "step": 3605 + }, + { + "epoch": 0.19846992129451263, + "grad_norm": 0.7016286253929138, + "learning_rate": 9.770813473495909e-06, + "loss": 0.7171, + "step": 3606 + }, + { + "epoch": 0.1985249600968683, + "grad_norm": 0.7792307734489441, + "learning_rate": 9.770683724033622e-06, + "loss": 0.6892, + "step": 3607 + }, + { + "epoch": 0.19857999889922395, + "grad_norm": 0.789820671081543, + "learning_rate": 9.770553938716153e-06, + "loss": 0.8531, + "step": 3608 + }, + { + "epoch": 0.19863503770157961, + "grad_norm": 0.7585997581481934, + "learning_rate": 9.77042411754448e-06, + "loss": 0.8195, + "step": 3609 + }, + { + "epoch": 0.19869007650393528, + "grad_norm": 0.8989273905754089, + "learning_rate": 9.770294260519573e-06, + "loss": 0.891, + "step": 3610 + }, + { + "epoch": 0.19874511530629094, + "grad_norm": 0.8044012188911438, + "learning_rate": 9.770164367642414e-06, + "loss": 0.8428, + "step": 3611 + }, + { + "epoch": 0.1988001541086466, + "grad_norm": 0.7847021222114563, + "learning_rate": 9.770034438913975e-06, + "loss": 0.8302, + "step": 3612 + }, + { + "epoch": 0.19885519291100226, + "grad_norm": 0.9260531663894653, + "learning_rate": 9.769904474335234e-06, + "loss": 0.8187, + "step": 3613 + }, + { + "epoch": 0.19891023171335792, + "grad_norm": 0.7491805553436279, + "learning_rate": 9.769774473907168e-06, + "loss": 0.8374, + "step": 3614 + }, + { + "epoch": 0.19896527051571358, + "grad_norm": 1.1665992736816406, + "learning_rate": 9.769644437630754e-06, + "loss": 0.8154, + "step": 3615 + }, + { + "epoch": 0.19902030931806924, + "grad_norm": 0.9162279963493347, + "learning_rate": 9.769514365506968e-06, + "loss": 0.8883, + "step": 3616 + }, + { + "epoch": 0.1990753481204249, + "grad_norm": 0.8980437517166138, + "learning_rate": 9.769384257536791e-06, + "loss": 0.8948, + "step": 3617 + }, + { + "epoch": 0.19913038692278057, + "grad_norm": 0.7544137835502625, + "learning_rate": 9.769254113721197e-06, + "loss": 0.7763, + "step": 3618 + }, + { + "epoch": 0.19918542572513623, + "grad_norm": 0.8393334746360779, + "learning_rate": 9.769123934061168e-06, + "loss": 0.8361, + "step": 3619 + }, + { + "epoch": 0.1992404645274919, + "grad_norm": 0.8184031248092651, + "learning_rate": 9.768993718557678e-06, + "loss": 0.8104, + "step": 3620 + }, + { + "epoch": 0.19929550332984755, + "grad_norm": 0.8023706674575806, + "learning_rate": 9.76886346721171e-06, + "loss": 0.7824, + "step": 3621 + }, + { + "epoch": 0.1993505421322032, + "grad_norm": 0.9354264736175537, + "learning_rate": 9.768733180024238e-06, + "loss": 0.7782, + "step": 3622 + }, + { + "epoch": 0.19940558093455887, + "grad_norm": 0.7037177681922913, + "learning_rate": 9.768602856996244e-06, + "loss": 0.8054, + "step": 3623 + }, + { + "epoch": 0.19946061973691454, + "grad_norm": 0.7926928997039795, + "learning_rate": 9.768472498128709e-06, + "loss": 0.8864, + "step": 3624 + }, + { + "epoch": 0.1995156585392702, + "grad_norm": 0.7963769435882568, + "learning_rate": 9.76834210342261e-06, + "loss": 0.8505, + "step": 3625 + }, + { + "epoch": 0.19957069734162586, + "grad_norm": 0.8553926944732666, + "learning_rate": 9.768211672878929e-06, + "loss": 0.8519, + "step": 3626 + }, + { + "epoch": 0.19962573614398152, + "grad_norm": 0.8147156834602356, + "learning_rate": 9.768081206498644e-06, + "loss": 0.8091, + "step": 3627 + }, + { + "epoch": 0.19968077494633718, + "grad_norm": 0.8226443529129028, + "learning_rate": 9.767950704282739e-06, + "loss": 0.8561, + "step": 3628 + }, + { + "epoch": 0.19973581374869284, + "grad_norm": 0.7246909141540527, + "learning_rate": 9.76782016623219e-06, + "loss": 0.7318, + "step": 3629 + }, + { + "epoch": 0.19979085255104848, + "grad_norm": 1.0527293682098389, + "learning_rate": 9.767689592347983e-06, + "loss": 0.7699, + "step": 3630 + }, + { + "epoch": 0.19984589135340414, + "grad_norm": 0.7433847188949585, + "learning_rate": 9.767558982631097e-06, + "loss": 0.8619, + "step": 3631 + }, + { + "epoch": 0.1999009301557598, + "grad_norm": 0.7901468873023987, + "learning_rate": 9.767428337082513e-06, + "loss": 0.8365, + "step": 3632 + }, + { + "epoch": 0.19995596895811546, + "grad_norm": 0.7766845226287842, + "learning_rate": 9.767297655703215e-06, + "loss": 0.7767, + "step": 3633 + }, + { + "epoch": 0.20001100776047112, + "grad_norm": 0.7785109281539917, + "learning_rate": 9.767166938494183e-06, + "loss": 0.7114, + "step": 3634 + }, + { + "epoch": 0.20006604656282678, + "grad_norm": 0.8068187832832336, + "learning_rate": 9.767036185456402e-06, + "loss": 0.8142, + "step": 3635 + }, + { + "epoch": 0.20012108536518244, + "grad_norm": 0.7893292307853699, + "learning_rate": 9.766905396590851e-06, + "loss": 0.8658, + "step": 3636 + }, + { + "epoch": 0.2001761241675381, + "grad_norm": 0.8647506237030029, + "learning_rate": 9.766774571898516e-06, + "loss": 0.84, + "step": 3637 + }, + { + "epoch": 0.20023116296989377, + "grad_norm": 0.8545078635215759, + "learning_rate": 9.766643711380378e-06, + "loss": 0.8455, + "step": 3638 + }, + { + "epoch": 0.20028620177224943, + "grad_norm": 0.924404501914978, + "learning_rate": 9.766512815037424e-06, + "loss": 0.6954, + "step": 3639 + }, + { + "epoch": 0.2003412405746051, + "grad_norm": 0.8077614903450012, + "learning_rate": 9.766381882870635e-06, + "loss": 0.7724, + "step": 3640 + }, + { + "epoch": 0.20039627937696075, + "grad_norm": 0.8886739015579224, + "learning_rate": 9.766250914880994e-06, + "loss": 0.8318, + "step": 3641 + }, + { + "epoch": 0.2004513181793164, + "grad_norm": 0.8086267113685608, + "learning_rate": 9.76611991106949e-06, + "loss": 0.8494, + "step": 3642 + }, + { + "epoch": 0.20050635698167207, + "grad_norm": 0.8606873750686646, + "learning_rate": 9.765988871437101e-06, + "loss": 0.8488, + "step": 3643 + }, + { + "epoch": 0.20056139578402774, + "grad_norm": 0.6966355443000793, + "learning_rate": 9.76585779598482e-06, + "loss": 0.7361, + "step": 3644 + }, + { + "epoch": 0.2006164345863834, + "grad_norm": 0.8474385738372803, + "learning_rate": 9.765726684713623e-06, + "loss": 0.8354, + "step": 3645 + }, + { + "epoch": 0.20067147338873906, + "grad_norm": 0.7609736919403076, + "learning_rate": 9.765595537624502e-06, + "loss": 0.7297, + "step": 3646 + }, + { + "epoch": 0.20072651219109472, + "grad_norm": 1.08648681640625, + "learning_rate": 9.76546435471844e-06, + "loss": 0.7534, + "step": 3647 + }, + { + "epoch": 0.20078155099345038, + "grad_norm": 0.7437332272529602, + "learning_rate": 9.765333135996425e-06, + "loss": 0.8532, + "step": 3648 + }, + { + "epoch": 0.20083658979580604, + "grad_norm": 0.9016552567481995, + "learning_rate": 9.76520188145944e-06, + "loss": 0.7968, + "step": 3649 + }, + { + "epoch": 0.2008916285981617, + "grad_norm": 0.8916428089141846, + "learning_rate": 9.765070591108473e-06, + "loss": 0.9601, + "step": 3650 + }, + { + "epoch": 0.20094666740051736, + "grad_norm": 0.7679058313369751, + "learning_rate": 9.764939264944512e-06, + "loss": 0.816, + "step": 3651 + }, + { + "epoch": 0.20100170620287303, + "grad_norm": 0.7716549634933472, + "learning_rate": 9.764807902968543e-06, + "loss": 0.876, + "step": 3652 + }, + { + "epoch": 0.2010567450052287, + "grad_norm": 0.8288074731826782, + "learning_rate": 9.764676505181554e-06, + "loss": 0.8054, + "step": 3653 + }, + { + "epoch": 0.20111178380758435, + "grad_norm": 0.7906842827796936, + "learning_rate": 9.76454507158453e-06, + "loss": 0.8026, + "step": 3654 + }, + { + "epoch": 0.20116682260994, + "grad_norm": 0.8093311190605164, + "learning_rate": 9.764413602178461e-06, + "loss": 0.8093, + "step": 3655 + }, + { + "epoch": 0.20122186141229567, + "grad_norm": 0.7234730124473572, + "learning_rate": 9.764282096964335e-06, + "loss": 0.7194, + "step": 3656 + }, + { + "epoch": 0.20127690021465133, + "grad_norm": 0.9048555493354797, + "learning_rate": 9.76415055594314e-06, + "loss": 0.8996, + "step": 3657 + }, + { + "epoch": 0.201331939017007, + "grad_norm": 0.7630691528320312, + "learning_rate": 9.764018979115864e-06, + "loss": 0.7876, + "step": 3658 + }, + { + "epoch": 0.20138697781936266, + "grad_norm": 0.9551032781600952, + "learning_rate": 9.763887366483498e-06, + "loss": 0.8249, + "step": 3659 + }, + { + "epoch": 0.20144201662171832, + "grad_norm": 0.6988314986228943, + "learning_rate": 9.76375571804703e-06, + "loss": 0.8011, + "step": 3660 + }, + { + "epoch": 0.20149705542407398, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.763624033807448e-06, + "loss": 0.8287, + "step": 3661 + }, + { + "epoch": 0.20155209422642964, + "grad_norm": 0.7201293706893921, + "learning_rate": 9.763492313765743e-06, + "loss": 0.7854, + "step": 3662 + }, + { + "epoch": 0.2016071330287853, + "grad_norm": 0.8691730499267578, + "learning_rate": 9.763360557922905e-06, + "loss": 0.8348, + "step": 3663 + }, + { + "epoch": 0.20166217183114096, + "grad_norm": 0.7660881876945496, + "learning_rate": 9.763228766279924e-06, + "loss": 0.7686, + "step": 3664 + }, + { + "epoch": 0.20171721063349662, + "grad_norm": 1.083796501159668, + "learning_rate": 9.76309693883779e-06, + "loss": 0.8848, + "step": 3665 + }, + { + "epoch": 0.20177224943585229, + "grad_norm": 0.7892678380012512, + "learning_rate": 9.762965075597496e-06, + "loss": 0.7804, + "step": 3666 + }, + { + "epoch": 0.20182728823820795, + "grad_norm": 0.7166122198104858, + "learning_rate": 9.762833176560031e-06, + "loss": 0.761, + "step": 3667 + }, + { + "epoch": 0.2018823270405636, + "grad_norm": 0.8187084794044495, + "learning_rate": 9.762701241726386e-06, + "loss": 0.8251, + "step": 3668 + }, + { + "epoch": 0.20193736584291927, + "grad_norm": 0.6930577158927917, + "learning_rate": 9.762569271097556e-06, + "loss": 0.6795, + "step": 3669 + }, + { + "epoch": 0.20199240464527493, + "grad_norm": 0.8085465431213379, + "learning_rate": 9.762437264674527e-06, + "loss": 0.8415, + "step": 3670 + }, + { + "epoch": 0.2020474434476306, + "grad_norm": 0.8111084699630737, + "learning_rate": 9.762305222458294e-06, + "loss": 0.792, + "step": 3671 + }, + { + "epoch": 0.20210248224998625, + "grad_norm": 0.8200401067733765, + "learning_rate": 9.762173144449852e-06, + "loss": 0.8224, + "step": 3672 + }, + { + "epoch": 0.2021575210523419, + "grad_norm": 0.8460109233856201, + "learning_rate": 9.762041030650192e-06, + "loss": 0.9025, + "step": 3673 + }, + { + "epoch": 0.20221255985469755, + "grad_norm": 0.8152671456336975, + "learning_rate": 9.761908881060303e-06, + "loss": 0.9002, + "step": 3674 + }, + { + "epoch": 0.2022675986570532, + "grad_norm": 0.8204773664474487, + "learning_rate": 9.761776695681185e-06, + "loss": 0.8324, + "step": 3675 + }, + { + "epoch": 0.20232263745940887, + "grad_norm": 0.8121044039726257, + "learning_rate": 9.761644474513825e-06, + "loss": 0.855, + "step": 3676 + }, + { + "epoch": 0.20237767626176453, + "grad_norm": 0.79920494556427, + "learning_rate": 9.76151221755922e-06, + "loss": 0.7837, + "step": 3677 + }, + { + "epoch": 0.2024327150641202, + "grad_norm": 0.862808346748352, + "learning_rate": 9.761379924818367e-06, + "loss": 0.8714, + "step": 3678 + }, + { + "epoch": 0.20248775386647586, + "grad_norm": 0.7135004997253418, + "learning_rate": 9.761247596292254e-06, + "loss": 0.774, + "step": 3679 + }, + { + "epoch": 0.20254279266883152, + "grad_norm": 0.7967603802680969, + "learning_rate": 9.761115231981878e-06, + "loss": 0.919, + "step": 3680 + }, + { + "epoch": 0.20259783147118718, + "grad_norm": 0.7425099611282349, + "learning_rate": 9.760982831888236e-06, + "loss": 0.819, + "step": 3681 + }, + { + "epoch": 0.20265287027354284, + "grad_norm": 0.7631763815879822, + "learning_rate": 9.760850396012323e-06, + "loss": 0.816, + "step": 3682 + }, + { + "epoch": 0.2027079090758985, + "grad_norm": 0.7931755185127258, + "learning_rate": 9.76071792435513e-06, + "loss": 0.8299, + "step": 3683 + }, + { + "epoch": 0.20276294787825416, + "grad_norm": 0.8409438729286194, + "learning_rate": 9.760585416917657e-06, + "loss": 0.8503, + "step": 3684 + }, + { + "epoch": 0.20281798668060982, + "grad_norm": 0.7632728815078735, + "learning_rate": 9.760452873700898e-06, + "loss": 0.8394, + "step": 3685 + }, + { + "epoch": 0.20287302548296549, + "grad_norm": 0.7765083312988281, + "learning_rate": 9.76032029470585e-06, + "loss": 0.8879, + "step": 3686 + }, + { + "epoch": 0.20292806428532115, + "grad_norm": 0.7736936807632446, + "learning_rate": 9.760187679933507e-06, + "loss": 0.7987, + "step": 3687 + }, + { + "epoch": 0.2029831030876768, + "grad_norm": 0.8270270824432373, + "learning_rate": 9.760055029384869e-06, + "loss": 0.8267, + "step": 3688 + }, + { + "epoch": 0.20303814189003247, + "grad_norm": 0.7742369174957275, + "learning_rate": 9.759922343060932e-06, + "loss": 0.8447, + "step": 3689 + }, + { + "epoch": 0.20309318069238813, + "grad_norm": 0.7543869018554688, + "learning_rate": 9.759789620962692e-06, + "loss": 0.7325, + "step": 3690 + }, + { + "epoch": 0.2031482194947438, + "grad_norm": 0.7913174033164978, + "learning_rate": 9.759656863091147e-06, + "loss": 0.8622, + "step": 3691 + }, + { + "epoch": 0.20320325829709945, + "grad_norm": 0.7445376515388489, + "learning_rate": 9.759524069447296e-06, + "loss": 0.7115, + "step": 3692 + }, + { + "epoch": 0.20325829709945512, + "grad_norm": 0.7744696140289307, + "learning_rate": 9.759391240032136e-06, + "loss": 0.8437, + "step": 3693 + }, + { + "epoch": 0.20331333590181078, + "grad_norm": 0.6984724998474121, + "learning_rate": 9.759258374846665e-06, + "loss": 0.7415, + "step": 3694 + }, + { + "epoch": 0.20336837470416644, + "grad_norm": 0.7453249096870422, + "learning_rate": 9.759125473891882e-06, + "loss": 0.7708, + "step": 3695 + }, + { + "epoch": 0.2034234135065221, + "grad_norm": 0.7459438443183899, + "learning_rate": 9.758992537168787e-06, + "loss": 0.7961, + "step": 3696 + }, + { + "epoch": 0.20347845230887776, + "grad_norm": 0.808944582939148, + "learning_rate": 9.758859564678377e-06, + "loss": 0.8875, + "step": 3697 + }, + { + "epoch": 0.20353349111123342, + "grad_norm": 0.7202889323234558, + "learning_rate": 9.758726556421652e-06, + "loss": 0.8064, + "step": 3698 + }, + { + "epoch": 0.20358852991358908, + "grad_norm": 0.7874952554702759, + "learning_rate": 9.758593512399613e-06, + "loss": 0.7881, + "step": 3699 + }, + { + "epoch": 0.20364356871594474, + "grad_norm": 0.771300733089447, + "learning_rate": 9.758460432613259e-06, + "loss": 0.8938, + "step": 3700 + }, + { + "epoch": 0.2036986075183004, + "grad_norm": 0.7332000136375427, + "learning_rate": 9.758327317063589e-06, + "loss": 0.7369, + "step": 3701 + }, + { + "epoch": 0.20375364632065607, + "grad_norm": 0.8206236958503723, + "learning_rate": 9.758194165751604e-06, + "loss": 0.8727, + "step": 3702 + }, + { + "epoch": 0.20380868512301173, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.758060978678308e-06, + "loss": 0.8013, + "step": 3703 + }, + { + "epoch": 0.2038637239253674, + "grad_norm": 0.7213704586029053, + "learning_rate": 9.757927755844698e-06, + "loss": 0.7413, + "step": 3704 + }, + { + "epoch": 0.20391876272772305, + "grad_norm": 0.8982640504837036, + "learning_rate": 9.757794497251776e-06, + "loss": 0.9077, + "step": 3705 + }, + { + "epoch": 0.2039738015300787, + "grad_norm": 0.8439363241195679, + "learning_rate": 9.757661202900544e-06, + "loss": 0.7887, + "step": 3706 + }, + { + "epoch": 0.20402884033243437, + "grad_norm": 0.7700560688972473, + "learning_rate": 9.757527872792005e-06, + "loss": 0.8677, + "step": 3707 + }, + { + "epoch": 0.20408387913479004, + "grad_norm": 0.7462438941001892, + "learning_rate": 9.75739450692716e-06, + "loss": 0.7937, + "step": 3708 + }, + { + "epoch": 0.2041389179371457, + "grad_norm": 0.9125999808311462, + "learning_rate": 9.75726110530701e-06, + "loss": 0.9374, + "step": 3709 + }, + { + "epoch": 0.20419395673950136, + "grad_norm": 0.8949875831604004, + "learning_rate": 9.75712766793256e-06, + "loss": 0.8586, + "step": 3710 + }, + { + "epoch": 0.20424899554185702, + "grad_norm": 0.9042442440986633, + "learning_rate": 9.756994194804812e-06, + "loss": 0.9411, + "step": 3711 + }, + { + "epoch": 0.20430403434421268, + "grad_norm": 0.7646238207817078, + "learning_rate": 9.756860685924769e-06, + "loss": 0.8353, + "step": 3712 + }, + { + "epoch": 0.20435907314656834, + "grad_norm": 0.7551934123039246, + "learning_rate": 9.756727141293434e-06, + "loss": 0.8109, + "step": 3713 + }, + { + "epoch": 0.204414111948924, + "grad_norm": 0.7526532411575317, + "learning_rate": 9.756593560911811e-06, + "loss": 0.8509, + "step": 3714 + }, + { + "epoch": 0.20446915075127967, + "grad_norm": 0.8423319458961487, + "learning_rate": 9.756459944780903e-06, + "loss": 0.9003, + "step": 3715 + }, + { + "epoch": 0.2045241895536353, + "grad_norm": 0.7966015934944153, + "learning_rate": 9.756326292901716e-06, + "loss": 0.7606, + "step": 3716 + }, + { + "epoch": 0.20457922835599096, + "grad_norm": 0.7642805576324463, + "learning_rate": 9.756192605275256e-06, + "loss": 0.8321, + "step": 3717 + }, + { + "epoch": 0.20463426715834662, + "grad_norm": 0.7285729646682739, + "learning_rate": 9.756058881902524e-06, + "loss": 0.7375, + "step": 3718 + }, + { + "epoch": 0.20468930596070228, + "grad_norm": 0.852020263671875, + "learning_rate": 9.755925122784525e-06, + "loss": 0.8207, + "step": 3719 + }, + { + "epoch": 0.20474434476305794, + "grad_norm": 0.8227072358131409, + "learning_rate": 9.755791327922268e-06, + "loss": 0.872, + "step": 3720 + }, + { + "epoch": 0.2047993835654136, + "grad_norm": 1.0128127336502075, + "learning_rate": 9.755657497316755e-06, + "loss": 0.9186, + "step": 3721 + }, + { + "epoch": 0.20485442236776927, + "grad_norm": 0.8208017349243164, + "learning_rate": 9.755523630968994e-06, + "loss": 0.6968, + "step": 3722 + }, + { + "epoch": 0.20490946117012493, + "grad_norm": 0.7716407179832458, + "learning_rate": 9.75538972887999e-06, + "loss": 0.8068, + "step": 3723 + }, + { + "epoch": 0.2049644999724806, + "grad_norm": 0.779608964920044, + "learning_rate": 9.75525579105075e-06, + "loss": 0.6968, + "step": 3724 + }, + { + "epoch": 0.20501953877483625, + "grad_norm": 0.7463479042053223, + "learning_rate": 9.75512181748228e-06, + "loss": 0.7581, + "step": 3725 + }, + { + "epoch": 0.2050745775771919, + "grad_norm": 0.8104956150054932, + "learning_rate": 9.754987808175587e-06, + "loss": 0.7838, + "step": 3726 + }, + { + "epoch": 0.20512961637954757, + "grad_norm": 0.7911564707756042, + "learning_rate": 9.75485376313168e-06, + "loss": 0.848, + "step": 3727 + }, + { + "epoch": 0.20518465518190324, + "grad_norm": 0.8340871334075928, + "learning_rate": 9.754719682351564e-06, + "loss": 0.7879, + "step": 3728 + }, + { + "epoch": 0.2052396939842589, + "grad_norm": 1.5543067455291748, + "learning_rate": 9.754585565836247e-06, + "loss": 0.8091, + "step": 3729 + }, + { + "epoch": 0.20529473278661456, + "grad_norm": 0.8262580633163452, + "learning_rate": 9.754451413586739e-06, + "loss": 0.9076, + "step": 3730 + }, + { + "epoch": 0.20534977158897022, + "grad_norm": 0.7558280825614929, + "learning_rate": 9.754317225604045e-06, + "loss": 0.7781, + "step": 3731 + }, + { + "epoch": 0.20540481039132588, + "grad_norm": 0.7197710275650024, + "learning_rate": 9.754183001889177e-06, + "loss": 0.765, + "step": 3732 + }, + { + "epoch": 0.20545984919368154, + "grad_norm": 0.8053440451622009, + "learning_rate": 9.754048742443141e-06, + "loss": 0.7986, + "step": 3733 + }, + { + "epoch": 0.2055148879960372, + "grad_norm": 0.9183983206748962, + "learning_rate": 9.753914447266947e-06, + "loss": 0.8522, + "step": 3734 + }, + { + "epoch": 0.20556992679839287, + "grad_norm": 0.8095504641532898, + "learning_rate": 9.753780116361607e-06, + "loss": 0.7243, + "step": 3735 + }, + { + "epoch": 0.20562496560074853, + "grad_norm": 0.816818356513977, + "learning_rate": 9.753645749728127e-06, + "loss": 0.8262, + "step": 3736 + }, + { + "epoch": 0.2056800044031042, + "grad_norm": 0.8425988554954529, + "learning_rate": 9.753511347367516e-06, + "loss": 0.8142, + "step": 3737 + }, + { + "epoch": 0.20573504320545985, + "grad_norm": 0.7719724178314209, + "learning_rate": 9.753376909280789e-06, + "loss": 0.8444, + "step": 3738 + }, + { + "epoch": 0.2057900820078155, + "grad_norm": 0.877646803855896, + "learning_rate": 9.753242435468952e-06, + "loss": 0.8515, + "step": 3739 + }, + { + "epoch": 0.20584512081017117, + "grad_norm": 0.9261211156845093, + "learning_rate": 9.753107925933017e-06, + "loss": 0.7605, + "step": 3740 + }, + { + "epoch": 0.20590015961252683, + "grad_norm": 0.7790889739990234, + "learning_rate": 9.752973380673995e-06, + "loss": 0.7911, + "step": 3741 + }, + { + "epoch": 0.2059551984148825, + "grad_norm": 0.7112367153167725, + "learning_rate": 9.752838799692899e-06, + "loss": 0.8212, + "step": 3742 + }, + { + "epoch": 0.20601023721723816, + "grad_norm": 0.7568365335464478, + "learning_rate": 9.752704182990736e-06, + "loss": 0.8505, + "step": 3743 + }, + { + "epoch": 0.20606527601959382, + "grad_norm": 0.7501981258392334, + "learning_rate": 9.752569530568523e-06, + "loss": 0.8191, + "step": 3744 + }, + { + "epoch": 0.20612031482194948, + "grad_norm": 0.7822220325469971, + "learning_rate": 9.752434842427268e-06, + "loss": 0.8032, + "step": 3745 + }, + { + "epoch": 0.20617535362430514, + "grad_norm": 0.810197114944458, + "learning_rate": 9.752300118567987e-06, + "loss": 0.7789, + "step": 3746 + }, + { + "epoch": 0.2062303924266608, + "grad_norm": 0.7386943101882935, + "learning_rate": 9.752165358991688e-06, + "loss": 0.7733, + "step": 3747 + }, + { + "epoch": 0.20628543122901646, + "grad_norm": 0.7086807489395142, + "learning_rate": 9.75203056369939e-06, + "loss": 0.6328, + "step": 3748 + }, + { + "epoch": 0.20634047003137213, + "grad_norm": 0.9881154894828796, + "learning_rate": 9.751895732692099e-06, + "loss": 0.8515, + "step": 3749 + }, + { + "epoch": 0.2063955088337278, + "grad_norm": 0.813521683216095, + "learning_rate": 9.751760865970831e-06, + "loss": 0.8438, + "step": 3750 + }, + { + "epoch": 0.20645054763608345, + "grad_norm": 0.8357470631599426, + "learning_rate": 9.751625963536602e-06, + "loss": 0.7635, + "step": 3751 + }, + { + "epoch": 0.2065055864384391, + "grad_norm": 0.8629693388938904, + "learning_rate": 9.751491025390423e-06, + "loss": 0.888, + "step": 3752 + }, + { + "epoch": 0.20656062524079477, + "grad_norm": 0.8844664096832275, + "learning_rate": 9.751356051533311e-06, + "loss": 0.7654, + "step": 3753 + }, + { + "epoch": 0.20661566404315043, + "grad_norm": 0.7006319165229797, + "learning_rate": 9.751221041966276e-06, + "loss": 0.7618, + "step": 3754 + }, + { + "epoch": 0.2066707028455061, + "grad_norm": 0.9291046261787415, + "learning_rate": 9.75108599669034e-06, + "loss": 0.8485, + "step": 3755 + }, + { + "epoch": 0.20672574164786175, + "grad_norm": 0.7670828700065613, + "learning_rate": 9.75095091570651e-06, + "loss": 0.7856, + "step": 3756 + }, + { + "epoch": 0.20678078045021742, + "grad_norm": 0.8709883689880371, + "learning_rate": 9.750815799015804e-06, + "loss": 0.7983, + "step": 3757 + }, + { + "epoch": 0.20683581925257308, + "grad_norm": 0.7688055634498596, + "learning_rate": 9.750680646619241e-06, + "loss": 0.8064, + "step": 3758 + }, + { + "epoch": 0.2068908580549287, + "grad_norm": 0.9492738246917725, + "learning_rate": 9.750545458517832e-06, + "loss": 0.8256, + "step": 3759 + }, + { + "epoch": 0.20694589685728437, + "grad_norm": 0.9685352444648743, + "learning_rate": 9.750410234712596e-06, + "loss": 0.839, + "step": 3760 + }, + { + "epoch": 0.20700093565964003, + "grad_norm": 0.788577139377594, + "learning_rate": 9.750274975204547e-06, + "loss": 0.8743, + "step": 3761 + }, + { + "epoch": 0.2070559744619957, + "grad_norm": 0.8496370315551758, + "learning_rate": 9.750139679994703e-06, + "loss": 0.9286, + "step": 3762 + }, + { + "epoch": 0.20711101326435136, + "grad_norm": 0.9539788961410522, + "learning_rate": 9.750004349084083e-06, + "loss": 0.7568, + "step": 3763 + }, + { + "epoch": 0.20716605206670702, + "grad_norm": 0.8825643062591553, + "learning_rate": 9.7498689824737e-06, + "loss": 0.9339, + "step": 3764 + }, + { + "epoch": 0.20722109086906268, + "grad_norm": 0.7771373391151428, + "learning_rate": 9.749733580164573e-06, + "loss": 0.851, + "step": 3765 + }, + { + "epoch": 0.20727612967141834, + "grad_norm": 0.7460281252861023, + "learning_rate": 9.749598142157721e-06, + "loss": 0.8208, + "step": 3766 + }, + { + "epoch": 0.207331168473774, + "grad_norm": 0.8370739817619324, + "learning_rate": 9.74946266845416e-06, + "loss": 0.8634, + "step": 3767 + }, + { + "epoch": 0.20738620727612966, + "grad_norm": 0.7770463228225708, + "learning_rate": 9.749327159054907e-06, + "loss": 0.7955, + "step": 3768 + }, + { + "epoch": 0.20744124607848533, + "grad_norm": 0.8048208355903625, + "learning_rate": 9.749191613960985e-06, + "loss": 0.7736, + "step": 3769 + }, + { + "epoch": 0.207496284880841, + "grad_norm": 0.9187547564506531, + "learning_rate": 9.74905603317341e-06, + "loss": 0.8534, + "step": 3770 + }, + { + "epoch": 0.20755132368319665, + "grad_norm": 0.7304024696350098, + "learning_rate": 9.7489204166932e-06, + "loss": 0.72, + "step": 3771 + }, + { + "epoch": 0.2076063624855523, + "grad_norm": 0.86177659034729, + "learning_rate": 9.748784764521376e-06, + "loss": 0.7838, + "step": 3772 + }, + { + "epoch": 0.20766140128790797, + "grad_norm": 0.7988011837005615, + "learning_rate": 9.748649076658956e-06, + "loss": 0.7776, + "step": 3773 + }, + { + "epoch": 0.20771644009026363, + "grad_norm": 0.706099808216095, + "learning_rate": 9.74851335310696e-06, + "loss": 0.759, + "step": 3774 + }, + { + "epoch": 0.2077714788926193, + "grad_norm": 0.8125914931297302, + "learning_rate": 9.748377593866412e-06, + "loss": 0.8155, + "step": 3775 + }, + { + "epoch": 0.20782651769497495, + "grad_norm": 0.8603429794311523, + "learning_rate": 9.748241798938326e-06, + "loss": 0.8018, + "step": 3776 + }, + { + "epoch": 0.20788155649733062, + "grad_norm": 0.7735254764556885, + "learning_rate": 9.748105968323726e-06, + "loss": 0.7788, + "step": 3777 + }, + { + "epoch": 0.20793659529968628, + "grad_norm": 0.9037501811981201, + "learning_rate": 9.747970102023635e-06, + "loss": 0.8907, + "step": 3778 + }, + { + "epoch": 0.20799163410204194, + "grad_norm": 0.8781846761703491, + "learning_rate": 9.74783420003907e-06, + "loss": 0.867, + "step": 3779 + }, + { + "epoch": 0.2080466729043976, + "grad_norm": 0.8486423492431641, + "learning_rate": 9.747698262371052e-06, + "loss": 0.817, + "step": 3780 + }, + { + "epoch": 0.20810171170675326, + "grad_norm": 0.8242751359939575, + "learning_rate": 9.747562289020607e-06, + "loss": 0.7385, + "step": 3781 + }, + { + "epoch": 0.20815675050910892, + "grad_norm": 0.8776529431343079, + "learning_rate": 9.747426279988754e-06, + "loss": 0.8222, + "step": 3782 + }, + { + "epoch": 0.20821178931146458, + "grad_norm": 0.7428975105285645, + "learning_rate": 9.747290235276517e-06, + "loss": 0.6954, + "step": 3783 + }, + { + "epoch": 0.20826682811382025, + "grad_norm": 0.8631997108459473, + "learning_rate": 9.747154154884917e-06, + "loss": 0.7956, + "step": 3784 + }, + { + "epoch": 0.2083218669161759, + "grad_norm": 0.7819229364395142, + "learning_rate": 9.747018038814976e-06, + "loss": 0.778, + "step": 3785 + }, + { + "epoch": 0.20837690571853157, + "grad_norm": 0.7770963311195374, + "learning_rate": 9.746881887067718e-06, + "loss": 0.8055, + "step": 3786 + }, + { + "epoch": 0.20843194452088723, + "grad_norm": 0.7168729305267334, + "learning_rate": 9.746745699644169e-06, + "loss": 0.7476, + "step": 3787 + }, + { + "epoch": 0.2084869833232429, + "grad_norm": 0.7963632941246033, + "learning_rate": 9.746609476545348e-06, + "loss": 0.8083, + "step": 3788 + }, + { + "epoch": 0.20854202212559855, + "grad_norm": 0.6689679026603699, + "learning_rate": 9.746473217772281e-06, + "loss": 0.6687, + "step": 3789 + }, + { + "epoch": 0.20859706092795421, + "grad_norm": 0.8085560202598572, + "learning_rate": 9.746336923325991e-06, + "loss": 0.8221, + "step": 3790 + }, + { + "epoch": 0.20865209973030988, + "grad_norm": 0.7215744256973267, + "learning_rate": 9.746200593207505e-06, + "loss": 0.7261, + "step": 3791 + }, + { + "epoch": 0.20870713853266554, + "grad_norm": 0.7821729779243469, + "learning_rate": 9.746064227417844e-06, + "loss": 0.7683, + "step": 3792 + }, + { + "epoch": 0.2087621773350212, + "grad_norm": 1.0014925003051758, + "learning_rate": 9.745927825958036e-06, + "loss": 0.7485, + "step": 3793 + }, + { + "epoch": 0.20881721613737686, + "grad_norm": 0.9447367787361145, + "learning_rate": 9.745791388829102e-06, + "loss": 0.835, + "step": 3794 + }, + { + "epoch": 0.20887225493973252, + "grad_norm": 0.7333751916885376, + "learning_rate": 9.745654916032073e-06, + "loss": 0.811, + "step": 3795 + }, + { + "epoch": 0.20892729374208818, + "grad_norm": 0.7516912221908569, + "learning_rate": 9.745518407567973e-06, + "loss": 0.7669, + "step": 3796 + }, + { + "epoch": 0.20898233254444384, + "grad_norm": 0.7826053500175476, + "learning_rate": 9.745381863437824e-06, + "loss": 0.7963, + "step": 3797 + }, + { + "epoch": 0.2090373713467995, + "grad_norm": 0.8258751630783081, + "learning_rate": 9.745245283642658e-06, + "loss": 0.7929, + "step": 3798 + }, + { + "epoch": 0.20909241014915517, + "grad_norm": 0.7990522980690002, + "learning_rate": 9.745108668183497e-06, + "loss": 0.8518, + "step": 3799 + }, + { + "epoch": 0.20914744895151083, + "grad_norm": 1.3855403661727905, + "learning_rate": 9.744972017061369e-06, + "loss": 0.7768, + "step": 3800 + }, + { + "epoch": 0.2092024877538665, + "grad_norm": 0.8456707000732422, + "learning_rate": 9.744835330277302e-06, + "loss": 0.7629, + "step": 3801 + }, + { + "epoch": 0.20925752655622212, + "grad_norm": 0.8992564678192139, + "learning_rate": 9.744698607832323e-06, + "loss": 0.8991, + "step": 3802 + }, + { + "epoch": 0.20931256535857778, + "grad_norm": 0.8533509969711304, + "learning_rate": 9.744561849727459e-06, + "loss": 0.8883, + "step": 3803 + }, + { + "epoch": 0.20936760416093345, + "grad_norm": 0.8363122940063477, + "learning_rate": 9.744425055963739e-06, + "loss": 0.8537, + "step": 3804 + }, + { + "epoch": 0.2094226429632891, + "grad_norm": 0.7462213039398193, + "learning_rate": 9.744288226542189e-06, + "loss": 0.7713, + "step": 3805 + }, + { + "epoch": 0.20947768176564477, + "grad_norm": 0.8148539066314697, + "learning_rate": 9.744151361463841e-06, + "loss": 0.7887, + "step": 3806 + }, + { + "epoch": 0.20953272056800043, + "grad_norm": 0.7504319548606873, + "learning_rate": 9.744014460729718e-06, + "loss": 0.7385, + "step": 3807 + }, + { + "epoch": 0.2095877593703561, + "grad_norm": 0.9291114807128906, + "learning_rate": 9.743877524340854e-06, + "loss": 0.9886, + "step": 3808 + }, + { + "epoch": 0.20964279817271175, + "grad_norm": 0.7747925519943237, + "learning_rate": 9.743740552298276e-06, + "loss": 0.8772, + "step": 3809 + }, + { + "epoch": 0.20969783697506741, + "grad_norm": 0.7283097505569458, + "learning_rate": 9.743603544603016e-06, + "loss": 0.7403, + "step": 3810 + }, + { + "epoch": 0.20975287577742308, + "grad_norm": 0.8403457999229431, + "learning_rate": 9.743466501256098e-06, + "loss": 0.7998, + "step": 3811 + }, + { + "epoch": 0.20980791457977874, + "grad_norm": 0.8218665719032288, + "learning_rate": 9.743329422258557e-06, + "loss": 0.8019, + "step": 3812 + }, + { + "epoch": 0.2098629533821344, + "grad_norm": 0.6991317868232727, + "learning_rate": 9.743192307611423e-06, + "loss": 0.743, + "step": 3813 + }, + { + "epoch": 0.20991799218449006, + "grad_norm": 0.767295241355896, + "learning_rate": 9.743055157315725e-06, + "loss": 0.8003, + "step": 3814 + }, + { + "epoch": 0.20997303098684572, + "grad_norm": 0.9457303285598755, + "learning_rate": 9.742917971372492e-06, + "loss": 0.8448, + "step": 3815 + }, + { + "epoch": 0.21002806978920138, + "grad_norm": 0.7839058637619019, + "learning_rate": 9.742780749782758e-06, + "loss": 0.8828, + "step": 3816 + }, + { + "epoch": 0.21008310859155704, + "grad_norm": 0.7831344604492188, + "learning_rate": 9.742643492547553e-06, + "loss": 0.7714, + "step": 3817 + }, + { + "epoch": 0.2101381473939127, + "grad_norm": 0.7637175917625427, + "learning_rate": 9.74250619966791e-06, + "loss": 0.7508, + "step": 3818 + }, + { + "epoch": 0.21019318619626837, + "grad_norm": 0.8830221891403198, + "learning_rate": 9.74236887114486e-06, + "loss": 0.8508, + "step": 3819 + }, + { + "epoch": 0.21024822499862403, + "grad_norm": 0.7803365588188171, + "learning_rate": 9.742231506979434e-06, + "loss": 0.8094, + "step": 3820 + }, + { + "epoch": 0.2103032638009797, + "grad_norm": 0.7701493501663208, + "learning_rate": 9.742094107172666e-06, + "loss": 0.8851, + "step": 3821 + }, + { + "epoch": 0.21035830260333535, + "grad_norm": 0.6434544324874878, + "learning_rate": 9.741956671725588e-06, + "loss": 0.7015, + "step": 3822 + }, + { + "epoch": 0.210413341405691, + "grad_norm": 0.7294684052467346, + "learning_rate": 9.741819200639233e-06, + "loss": 0.7357, + "step": 3823 + }, + { + "epoch": 0.21046838020804667, + "grad_norm": 0.702367901802063, + "learning_rate": 9.741681693914635e-06, + "loss": 0.7518, + "step": 3824 + }, + { + "epoch": 0.21052341901040234, + "grad_norm": 0.7567246556282043, + "learning_rate": 9.741544151552826e-06, + "loss": 0.8259, + "step": 3825 + }, + { + "epoch": 0.210578457812758, + "grad_norm": 0.7478607892990112, + "learning_rate": 9.741406573554841e-06, + "loss": 0.81, + "step": 3826 + }, + { + "epoch": 0.21063349661511366, + "grad_norm": 0.7270129323005676, + "learning_rate": 9.741268959921712e-06, + "loss": 0.8201, + "step": 3827 + }, + { + "epoch": 0.21068853541746932, + "grad_norm": 0.8108176589012146, + "learning_rate": 9.741131310654475e-06, + "loss": 0.8425, + "step": 3828 + }, + { + "epoch": 0.21074357421982498, + "grad_norm": 0.7773691415786743, + "learning_rate": 9.740993625754165e-06, + "loss": 0.8372, + "step": 3829 + }, + { + "epoch": 0.21079861302218064, + "grad_norm": 0.8988421559333801, + "learning_rate": 9.740855905221816e-06, + "loss": 0.8285, + "step": 3830 + }, + { + "epoch": 0.2108536518245363, + "grad_norm": 0.7339534759521484, + "learning_rate": 9.740718149058462e-06, + "loss": 0.7567, + "step": 3831 + }, + { + "epoch": 0.21090869062689196, + "grad_norm": 0.8465108275413513, + "learning_rate": 9.740580357265141e-06, + "loss": 0.8747, + "step": 3832 + }, + { + "epoch": 0.21096372942924763, + "grad_norm": 0.7956714034080505, + "learning_rate": 9.740442529842885e-06, + "loss": 0.7665, + "step": 3833 + }, + { + "epoch": 0.2110187682316033, + "grad_norm": 0.96270751953125, + "learning_rate": 9.740304666792733e-06, + "loss": 0.8338, + "step": 3834 + }, + { + "epoch": 0.21107380703395895, + "grad_norm": 0.812329113483429, + "learning_rate": 9.74016676811572e-06, + "loss": 0.8407, + "step": 3835 + }, + { + "epoch": 0.2111288458363146, + "grad_norm": 0.7975192070007324, + "learning_rate": 9.740028833812882e-06, + "loss": 0.7836, + "step": 3836 + }, + { + "epoch": 0.21118388463867027, + "grad_norm": 0.826621949672699, + "learning_rate": 9.739890863885258e-06, + "loss": 0.732, + "step": 3837 + }, + { + "epoch": 0.21123892344102593, + "grad_norm": 0.9015662670135498, + "learning_rate": 9.73975285833388e-06, + "loss": 0.8837, + "step": 3838 + }, + { + "epoch": 0.2112939622433816, + "grad_norm": 0.7641518712043762, + "learning_rate": 9.73961481715979e-06, + "loss": 0.7334, + "step": 3839 + }, + { + "epoch": 0.21134900104573726, + "grad_norm": 0.8062206506729126, + "learning_rate": 9.739476740364023e-06, + "loss": 0.8381, + "step": 3840 + }, + { + "epoch": 0.21140403984809292, + "grad_norm": 0.7301875352859497, + "learning_rate": 9.739338627947619e-06, + "loss": 0.7389, + "step": 3841 + }, + { + "epoch": 0.21145907865044858, + "grad_norm": 0.8995181322097778, + "learning_rate": 9.739200479911612e-06, + "loss": 0.8111, + "step": 3842 + }, + { + "epoch": 0.21151411745280424, + "grad_norm": 0.7154433131217957, + "learning_rate": 9.739062296257045e-06, + "loss": 0.7501, + "step": 3843 + }, + { + "epoch": 0.2115691562551599, + "grad_norm": 0.8403087854385376, + "learning_rate": 9.738924076984954e-06, + "loss": 0.8212, + "step": 3844 + }, + { + "epoch": 0.21162419505751554, + "grad_norm": 0.7616639137268066, + "learning_rate": 9.738785822096377e-06, + "loss": 0.82, + "step": 3845 + }, + { + "epoch": 0.2116792338598712, + "grad_norm": 0.7897970080375671, + "learning_rate": 9.738647531592356e-06, + "loss": 0.7972, + "step": 3846 + }, + { + "epoch": 0.21173427266222686, + "grad_norm": 0.7909015417098999, + "learning_rate": 9.738509205473928e-06, + "loss": 0.7939, + "step": 3847 + }, + { + "epoch": 0.21178931146458252, + "grad_norm": 0.9553212523460388, + "learning_rate": 9.73837084374213e-06, + "loss": 0.8672, + "step": 3848 + }, + { + "epoch": 0.21184435026693818, + "grad_norm": 0.9558283686637878, + "learning_rate": 9.73823244639801e-06, + "loss": 0.897, + "step": 3849 + }, + { + "epoch": 0.21189938906929384, + "grad_norm": 0.819530725479126, + "learning_rate": 9.7380940134426e-06, + "loss": 0.86, + "step": 3850 + }, + { + "epoch": 0.2119544278716495, + "grad_norm": 0.7301751971244812, + "learning_rate": 9.737955544876945e-06, + "loss": 0.8265, + "step": 3851 + }, + { + "epoch": 0.21200946667400516, + "grad_norm": 0.8564972281455994, + "learning_rate": 9.737817040702085e-06, + "loss": 0.8253, + "step": 3852 + }, + { + "epoch": 0.21206450547636083, + "grad_norm": 0.7715204358100891, + "learning_rate": 9.737678500919059e-06, + "loss": 0.7779, + "step": 3853 + }, + { + "epoch": 0.2121195442787165, + "grad_norm": 0.7296929955482483, + "learning_rate": 9.73753992552891e-06, + "loss": 0.787, + "step": 3854 + }, + { + "epoch": 0.21217458308107215, + "grad_norm": 0.8574217557907104, + "learning_rate": 9.73740131453268e-06, + "loss": 0.797, + "step": 3855 + }, + { + "epoch": 0.2122296218834278, + "grad_norm": 0.8320643901824951, + "learning_rate": 9.737262667931409e-06, + "loss": 0.876, + "step": 3856 + }, + { + "epoch": 0.21228466068578347, + "grad_norm": 0.7313587069511414, + "learning_rate": 9.73712398572614e-06, + "loss": 0.7151, + "step": 3857 + }, + { + "epoch": 0.21233969948813913, + "grad_norm": 0.7039312720298767, + "learning_rate": 9.736985267917916e-06, + "loss": 0.7353, + "step": 3858 + }, + { + "epoch": 0.2123947382904948, + "grad_norm": 0.7893409132957458, + "learning_rate": 9.736846514507776e-06, + "loss": 0.8383, + "step": 3859 + }, + { + "epoch": 0.21244977709285046, + "grad_norm": 0.8771371245384216, + "learning_rate": 9.736707725496767e-06, + "loss": 0.7543, + "step": 3860 + }, + { + "epoch": 0.21250481589520612, + "grad_norm": 1.0067707300186157, + "learning_rate": 9.736568900885932e-06, + "loss": 0.796, + "step": 3861 + }, + { + "epoch": 0.21255985469756178, + "grad_norm": 0.9171931743621826, + "learning_rate": 9.736430040676312e-06, + "loss": 0.8174, + "step": 3862 + }, + { + "epoch": 0.21261489349991744, + "grad_norm": 0.7616068720817566, + "learning_rate": 9.736291144868952e-06, + "loss": 0.7762, + "step": 3863 + }, + { + "epoch": 0.2126699323022731, + "grad_norm": 0.789010226726532, + "learning_rate": 9.736152213464895e-06, + "loss": 0.7749, + "step": 3864 + }, + { + "epoch": 0.21272497110462876, + "grad_norm": 0.7943348288536072, + "learning_rate": 9.736013246465187e-06, + "loss": 0.6687, + "step": 3865 + }, + { + "epoch": 0.21278000990698442, + "grad_norm": 0.8351758718490601, + "learning_rate": 9.73587424387087e-06, + "loss": 0.9201, + "step": 3866 + }, + { + "epoch": 0.21283504870934009, + "grad_norm": 0.7710975408554077, + "learning_rate": 9.735735205682991e-06, + "loss": 0.8357, + "step": 3867 + }, + { + "epoch": 0.21289008751169575, + "grad_norm": 0.8955768942832947, + "learning_rate": 9.73559613190259e-06, + "loss": 0.8396, + "step": 3868 + }, + { + "epoch": 0.2129451263140514, + "grad_norm": 0.8664666414260864, + "learning_rate": 9.735457022530722e-06, + "loss": 0.8176, + "step": 3869 + }, + { + "epoch": 0.21300016511640707, + "grad_norm": 0.7955949902534485, + "learning_rate": 9.735317877568424e-06, + "loss": 0.8421, + "step": 3870 + }, + { + "epoch": 0.21305520391876273, + "grad_norm": 0.849866509437561, + "learning_rate": 9.735178697016742e-06, + "loss": 0.7677, + "step": 3871 + }, + { + "epoch": 0.2131102427211184, + "grad_norm": 0.7782625555992126, + "learning_rate": 9.735039480876727e-06, + "loss": 0.7838, + "step": 3872 + }, + { + "epoch": 0.21316528152347405, + "grad_norm": 0.7734919190406799, + "learning_rate": 9.734900229149423e-06, + "loss": 0.757, + "step": 3873 + }, + { + "epoch": 0.21322032032582972, + "grad_norm": 0.8462040424346924, + "learning_rate": 9.734760941835876e-06, + "loss": 0.8841, + "step": 3874 + }, + { + "epoch": 0.21327535912818538, + "grad_norm": 0.7219869494438171, + "learning_rate": 9.734621618937133e-06, + "loss": 0.7651, + "step": 3875 + }, + { + "epoch": 0.21333039793054104, + "grad_norm": 0.7550874352455139, + "learning_rate": 9.734482260454241e-06, + "loss": 0.8032, + "step": 3876 + }, + { + "epoch": 0.2133854367328967, + "grad_norm": 0.7504588961601257, + "learning_rate": 9.734342866388247e-06, + "loss": 0.7923, + "step": 3877 + }, + { + "epoch": 0.21344047553525236, + "grad_norm": 0.7407390475273132, + "learning_rate": 9.7342034367402e-06, + "loss": 0.7569, + "step": 3878 + }, + { + "epoch": 0.21349551433760802, + "grad_norm": 0.7911562323570251, + "learning_rate": 9.734063971511147e-06, + "loss": 0.8726, + "step": 3879 + }, + { + "epoch": 0.21355055313996368, + "grad_norm": 0.9132450819015503, + "learning_rate": 9.733924470702139e-06, + "loss": 0.9445, + "step": 3880 + }, + { + "epoch": 0.21360559194231934, + "grad_norm": 0.9639442563056946, + "learning_rate": 9.733784934314218e-06, + "loss": 0.7307, + "step": 3881 + }, + { + "epoch": 0.213660630744675, + "grad_norm": 0.7724352478981018, + "learning_rate": 9.73364536234844e-06, + "loss": 0.8337, + "step": 3882 + }, + { + "epoch": 0.21371566954703067, + "grad_norm": 0.9643296599388123, + "learning_rate": 9.733505754805848e-06, + "loss": 0.8277, + "step": 3883 + }, + { + "epoch": 0.21377070834938633, + "grad_norm": 0.8135218620300293, + "learning_rate": 9.733366111687494e-06, + "loss": 0.7933, + "step": 3884 + }, + { + "epoch": 0.213825747151742, + "grad_norm": 0.7527105212211609, + "learning_rate": 9.733226432994426e-06, + "loss": 0.7302, + "step": 3885 + }, + { + "epoch": 0.21388078595409765, + "grad_norm": 1.090550184249878, + "learning_rate": 9.733086718727698e-06, + "loss": 0.8646, + "step": 3886 + }, + { + "epoch": 0.2139358247564533, + "grad_norm": 0.9227491617202759, + "learning_rate": 9.732946968888358e-06, + "loss": 0.8525, + "step": 3887 + }, + { + "epoch": 0.21399086355880895, + "grad_norm": 0.7781830430030823, + "learning_rate": 9.732807183477454e-06, + "loss": 0.8757, + "step": 3888 + }, + { + "epoch": 0.2140459023611646, + "grad_norm": 0.7740090489387512, + "learning_rate": 9.732667362496036e-06, + "loss": 0.7557, + "step": 3889 + }, + { + "epoch": 0.21410094116352027, + "grad_norm": 0.7341694831848145, + "learning_rate": 9.732527505945159e-06, + "loss": 0.7481, + "step": 3890 + }, + { + "epoch": 0.21415597996587593, + "grad_norm": 0.8691402673721313, + "learning_rate": 9.732387613825872e-06, + "loss": 0.8395, + "step": 3891 + }, + { + "epoch": 0.2142110187682316, + "grad_norm": 0.7845497131347656, + "learning_rate": 9.732247686139227e-06, + "loss": 0.6999, + "step": 3892 + }, + { + "epoch": 0.21426605757058725, + "grad_norm": 0.7944281697273254, + "learning_rate": 9.732107722886275e-06, + "loss": 0.7677, + "step": 3893 + }, + { + "epoch": 0.21432109637294292, + "grad_norm": 0.904195249080658, + "learning_rate": 9.731967724068065e-06, + "loss": 0.8429, + "step": 3894 + }, + { + "epoch": 0.21437613517529858, + "grad_norm": 0.7968988418579102, + "learning_rate": 9.731827689685655e-06, + "loss": 0.8224, + "step": 3895 + }, + { + "epoch": 0.21443117397765424, + "grad_norm": 0.773674726486206, + "learning_rate": 9.731687619740095e-06, + "loss": 0.7743, + "step": 3896 + }, + { + "epoch": 0.2144862127800099, + "grad_norm": 0.7873631715774536, + "learning_rate": 9.731547514232439e-06, + "loss": 0.8581, + "step": 3897 + }, + { + "epoch": 0.21454125158236556, + "grad_norm": 0.7989653944969177, + "learning_rate": 9.731407373163735e-06, + "loss": 0.8447, + "step": 3898 + }, + { + "epoch": 0.21459629038472122, + "grad_norm": 0.74820876121521, + "learning_rate": 9.73126719653504e-06, + "loss": 0.8745, + "step": 3899 + }, + { + "epoch": 0.21465132918707688, + "grad_norm": 0.7191246747970581, + "learning_rate": 9.731126984347408e-06, + "loss": 0.7533, + "step": 3900 + }, + { + "epoch": 0.21470636798943254, + "grad_norm": 0.7718465328216553, + "learning_rate": 9.730986736601893e-06, + "loss": 0.8184, + "step": 3901 + }, + { + "epoch": 0.2147614067917882, + "grad_norm": 0.7055066823959351, + "learning_rate": 9.730846453299547e-06, + "loss": 0.7352, + "step": 3902 + }, + { + "epoch": 0.21481644559414387, + "grad_norm": 0.7500855326652527, + "learning_rate": 9.730706134441425e-06, + "loss": 0.8111, + "step": 3903 + }, + { + "epoch": 0.21487148439649953, + "grad_norm": 0.7568232417106628, + "learning_rate": 9.730565780028583e-06, + "loss": 0.8126, + "step": 3904 + }, + { + "epoch": 0.2149265231988552, + "grad_norm": 0.7418738007545471, + "learning_rate": 9.730425390062075e-06, + "loss": 0.8014, + "step": 3905 + }, + { + "epoch": 0.21498156200121085, + "grad_norm": 0.7967441082000732, + "learning_rate": 9.730284964542955e-06, + "loss": 0.7965, + "step": 3906 + }, + { + "epoch": 0.2150366008035665, + "grad_norm": 0.7444791197776794, + "learning_rate": 9.730144503472281e-06, + "loss": 0.7113, + "step": 3907 + }, + { + "epoch": 0.21509163960592217, + "grad_norm": 0.8372869491577148, + "learning_rate": 9.730004006851107e-06, + "loss": 0.838, + "step": 3908 + }, + { + "epoch": 0.21514667840827784, + "grad_norm": 0.7984300851821899, + "learning_rate": 9.729863474680488e-06, + "loss": 0.856, + "step": 3909 + }, + { + "epoch": 0.2152017172106335, + "grad_norm": 0.7508612871170044, + "learning_rate": 9.72972290696148e-06, + "loss": 0.7947, + "step": 3910 + }, + { + "epoch": 0.21525675601298916, + "grad_norm": 0.7559992074966431, + "learning_rate": 9.729582303695142e-06, + "loss": 0.785, + "step": 3911 + }, + { + "epoch": 0.21531179481534482, + "grad_norm": 0.7764164209365845, + "learning_rate": 9.729441664882531e-06, + "loss": 0.8297, + "step": 3912 + }, + { + "epoch": 0.21536683361770048, + "grad_norm": 0.8112726211547852, + "learning_rate": 9.7293009905247e-06, + "loss": 0.8073, + "step": 3913 + }, + { + "epoch": 0.21542187242005614, + "grad_norm": 0.9748952388763428, + "learning_rate": 9.729160280622709e-06, + "loss": 0.7584, + "step": 3914 + }, + { + "epoch": 0.2154769112224118, + "grad_norm": 0.789191484451294, + "learning_rate": 9.729019535177617e-06, + "loss": 0.7568, + "step": 3915 + }, + { + "epoch": 0.21553195002476747, + "grad_norm": 0.7300963401794434, + "learning_rate": 9.728878754190478e-06, + "loss": 0.8029, + "step": 3916 + }, + { + "epoch": 0.21558698882712313, + "grad_norm": 0.9201067686080933, + "learning_rate": 9.728737937662354e-06, + "loss": 0.8665, + "step": 3917 + }, + { + "epoch": 0.2156420276294788, + "grad_norm": 0.8820425271987915, + "learning_rate": 9.728597085594301e-06, + "loss": 0.8378, + "step": 3918 + }, + { + "epoch": 0.21569706643183445, + "grad_norm": 0.7762684226036072, + "learning_rate": 9.728456197987376e-06, + "loss": 0.8005, + "step": 3919 + }, + { + "epoch": 0.2157521052341901, + "grad_norm": 0.8429732918739319, + "learning_rate": 9.728315274842641e-06, + "loss": 0.8337, + "step": 3920 + }, + { + "epoch": 0.21580714403654577, + "grad_norm": 0.7820748090744019, + "learning_rate": 9.728174316161156e-06, + "loss": 0.8085, + "step": 3921 + }, + { + "epoch": 0.21586218283890143, + "grad_norm": 0.8748064637184143, + "learning_rate": 9.728033321943977e-06, + "loss": 0.7734, + "step": 3922 + }, + { + "epoch": 0.2159172216412571, + "grad_norm": 0.8878050446510315, + "learning_rate": 9.727892292192166e-06, + "loss": 0.9226, + "step": 3923 + }, + { + "epoch": 0.21597226044361276, + "grad_norm": 0.8156047463417053, + "learning_rate": 9.72775122690678e-06, + "loss": 0.8111, + "step": 3924 + }, + { + "epoch": 0.21602729924596842, + "grad_norm": 0.7392945885658264, + "learning_rate": 9.727610126088883e-06, + "loss": 0.758, + "step": 3925 + }, + { + "epoch": 0.21608233804832408, + "grad_norm": 0.7573148608207703, + "learning_rate": 9.727468989739532e-06, + "loss": 0.8142, + "step": 3926 + }, + { + "epoch": 0.21613737685067974, + "grad_norm": 0.831847608089447, + "learning_rate": 9.727327817859792e-06, + "loss": 0.7337, + "step": 3927 + }, + { + "epoch": 0.2161924156530354, + "grad_norm": 0.8012371063232422, + "learning_rate": 9.72718661045072e-06, + "loss": 0.8128, + "step": 3928 + }, + { + "epoch": 0.21624745445539106, + "grad_norm": 0.7985890507698059, + "learning_rate": 9.72704536751338e-06, + "loss": 0.8549, + "step": 3929 + }, + { + "epoch": 0.21630249325774673, + "grad_norm": 0.7194695472717285, + "learning_rate": 9.726904089048832e-06, + "loss": 0.775, + "step": 3930 + }, + { + "epoch": 0.21635753206010236, + "grad_norm": 0.8029330968856812, + "learning_rate": 9.726762775058138e-06, + "loss": 0.9167, + "step": 3931 + }, + { + "epoch": 0.21641257086245802, + "grad_norm": 0.7388954162597656, + "learning_rate": 9.72662142554236e-06, + "loss": 0.7295, + "step": 3932 + }, + { + "epoch": 0.21646760966481368, + "grad_norm": 0.798796534538269, + "learning_rate": 9.726480040502559e-06, + "loss": 0.8686, + "step": 3933 + }, + { + "epoch": 0.21652264846716934, + "grad_norm": 0.9977202415466309, + "learning_rate": 9.726338619939802e-06, + "loss": 0.8387, + "step": 3934 + }, + { + "epoch": 0.216577687269525, + "grad_norm": 0.8173295855522156, + "learning_rate": 9.726197163855148e-06, + "loss": 0.7773, + "step": 3935 + }, + { + "epoch": 0.21663272607188067, + "grad_norm": 0.6519538760185242, + "learning_rate": 9.72605567224966e-06, + "loss": 0.6319, + "step": 3936 + }, + { + "epoch": 0.21668776487423633, + "grad_norm": 0.8004894852638245, + "learning_rate": 9.725914145124404e-06, + "loss": 0.8281, + "step": 3937 + }, + { + "epoch": 0.216742803676592, + "grad_norm": 0.7327558398246765, + "learning_rate": 9.725772582480442e-06, + "loss": 0.7105, + "step": 3938 + }, + { + "epoch": 0.21679784247894765, + "grad_norm": 0.7624199986457825, + "learning_rate": 9.725630984318839e-06, + "loss": 0.7823, + "step": 3939 + }, + { + "epoch": 0.2168528812813033, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.725489350640658e-06, + "loss": 0.8147, + "step": 3940 + }, + { + "epoch": 0.21690792008365897, + "grad_norm": 0.6886566877365112, + "learning_rate": 9.725347681446964e-06, + "loss": 0.7263, + "step": 3941 + }, + { + "epoch": 0.21696295888601463, + "grad_norm": 0.882060170173645, + "learning_rate": 9.725205976738821e-06, + "loss": 0.8931, + "step": 3942 + }, + { + "epoch": 0.2170179976883703, + "grad_norm": 0.7946881055831909, + "learning_rate": 9.725064236517297e-06, + "loss": 0.8036, + "step": 3943 + }, + { + "epoch": 0.21707303649072596, + "grad_norm": 0.7062187194824219, + "learning_rate": 9.724922460783453e-06, + "loss": 0.6915, + "step": 3944 + }, + { + "epoch": 0.21712807529308162, + "grad_norm": 0.7978640794754028, + "learning_rate": 9.724780649538356e-06, + "loss": 0.8873, + "step": 3945 + }, + { + "epoch": 0.21718311409543728, + "grad_norm": 0.8828096389770508, + "learning_rate": 9.724638802783073e-06, + "loss": 0.7114, + "step": 3946 + }, + { + "epoch": 0.21723815289779294, + "grad_norm": 0.7301073670387268, + "learning_rate": 9.724496920518672e-06, + "loss": 0.8107, + "step": 3947 + }, + { + "epoch": 0.2172931917001486, + "grad_norm": 0.7944212555885315, + "learning_rate": 9.724355002746213e-06, + "loss": 0.8135, + "step": 3948 + }, + { + "epoch": 0.21734823050250426, + "grad_norm": 0.7988898754119873, + "learning_rate": 9.724213049466768e-06, + "loss": 0.7173, + "step": 3949 + }, + { + "epoch": 0.21740326930485993, + "grad_norm": 0.7734915018081665, + "learning_rate": 9.724071060681401e-06, + "loss": 0.8131, + "step": 3950 + }, + { + "epoch": 0.2174583081072156, + "grad_norm": 0.6856646537780762, + "learning_rate": 9.723929036391183e-06, + "loss": 0.6873, + "step": 3951 + }, + { + "epoch": 0.21751334690957125, + "grad_norm": 0.8652976751327515, + "learning_rate": 9.723786976597179e-06, + "loss": 0.7908, + "step": 3952 + }, + { + "epoch": 0.2175683857119269, + "grad_norm": 0.7325445413589478, + "learning_rate": 9.723644881300453e-06, + "loss": 0.7389, + "step": 3953 + }, + { + "epoch": 0.21762342451428257, + "grad_norm": 0.8596270084381104, + "learning_rate": 9.723502750502079e-06, + "loss": 0.7785, + "step": 3954 + }, + { + "epoch": 0.21767846331663823, + "grad_norm": 0.739248514175415, + "learning_rate": 9.723360584203123e-06, + "loss": 0.8125, + "step": 3955 + }, + { + "epoch": 0.2177335021189939, + "grad_norm": 0.815617561340332, + "learning_rate": 9.723218382404652e-06, + "loss": 0.8682, + "step": 3956 + }, + { + "epoch": 0.21778854092134955, + "grad_norm": 0.758756160736084, + "learning_rate": 9.723076145107738e-06, + "loss": 0.7717, + "step": 3957 + }, + { + "epoch": 0.21784357972370522, + "grad_norm": 0.9007643461227417, + "learning_rate": 9.722933872313445e-06, + "loss": 0.7901, + "step": 3958 + }, + { + "epoch": 0.21789861852606088, + "grad_norm": 0.781548023223877, + "learning_rate": 9.722791564022846e-06, + "loss": 0.8338, + "step": 3959 + }, + { + "epoch": 0.21795365732841654, + "grad_norm": 0.7730190753936768, + "learning_rate": 9.722649220237011e-06, + "loss": 0.8032, + "step": 3960 + }, + { + "epoch": 0.2180086961307722, + "grad_norm": 0.8737791776657104, + "learning_rate": 9.722506840957009e-06, + "loss": 0.8436, + "step": 3961 + }, + { + "epoch": 0.21806373493312786, + "grad_norm": 0.8151329159736633, + "learning_rate": 9.722364426183908e-06, + "loss": 0.8115, + "step": 3962 + }, + { + "epoch": 0.21811877373548352, + "grad_norm": 0.7852860689163208, + "learning_rate": 9.722221975918782e-06, + "loss": 0.7977, + "step": 3963 + }, + { + "epoch": 0.21817381253783918, + "grad_norm": 0.9064140319824219, + "learning_rate": 9.722079490162698e-06, + "loss": 0.8799, + "step": 3964 + }, + { + "epoch": 0.21822885134019485, + "grad_norm": 0.8579906821250916, + "learning_rate": 9.72193696891673e-06, + "loss": 0.7825, + "step": 3965 + }, + { + "epoch": 0.2182838901425505, + "grad_norm": 0.8005900382995605, + "learning_rate": 9.721794412181946e-06, + "loss": 0.8601, + "step": 3966 + }, + { + "epoch": 0.21833892894490617, + "grad_norm": 0.7661529183387756, + "learning_rate": 9.721651819959421e-06, + "loss": 0.7446, + "step": 3967 + }, + { + "epoch": 0.21839396774726183, + "grad_norm": 0.7558436989784241, + "learning_rate": 9.721509192250224e-06, + "loss": 0.7484, + "step": 3968 + }, + { + "epoch": 0.2184490065496175, + "grad_norm": 0.765446126461029, + "learning_rate": 9.721366529055427e-06, + "loss": 0.7727, + "step": 3969 + }, + { + "epoch": 0.21850404535197315, + "grad_norm": 0.7329973578453064, + "learning_rate": 9.721223830376103e-06, + "loss": 0.797, + "step": 3970 + }, + { + "epoch": 0.21855908415432881, + "grad_norm": 0.8881974220275879, + "learning_rate": 9.721081096213324e-06, + "loss": 0.9199, + "step": 3971 + }, + { + "epoch": 0.21861412295668448, + "grad_norm": 0.8246786594390869, + "learning_rate": 9.720938326568165e-06, + "loss": 0.9108, + "step": 3972 + }, + { + "epoch": 0.21866916175904014, + "grad_norm": 0.7187291979789734, + "learning_rate": 9.720795521441697e-06, + "loss": 0.7756, + "step": 3973 + }, + { + "epoch": 0.21872420056139577, + "grad_norm": 0.7880695462226868, + "learning_rate": 9.720652680834995e-06, + "loss": 0.8548, + "step": 3974 + }, + { + "epoch": 0.21877923936375143, + "grad_norm": 0.8841108679771423, + "learning_rate": 9.720509804749128e-06, + "loss": 0.8477, + "step": 3975 + }, + { + "epoch": 0.2188342781661071, + "grad_norm": 0.9061402678489685, + "learning_rate": 9.720366893185173e-06, + "loss": 0.8235, + "step": 3976 + }, + { + "epoch": 0.21888931696846275, + "grad_norm": 0.8342392444610596, + "learning_rate": 9.720223946144206e-06, + "loss": 0.7777, + "step": 3977 + }, + { + "epoch": 0.21894435577081842, + "grad_norm": 0.7933762073516846, + "learning_rate": 9.720080963627299e-06, + "loss": 0.7943, + "step": 3978 + }, + { + "epoch": 0.21899939457317408, + "grad_norm": 0.8358896374702454, + "learning_rate": 9.719937945635527e-06, + "loss": 0.8932, + "step": 3979 + }, + { + "epoch": 0.21905443337552974, + "grad_norm": 0.7479808926582336, + "learning_rate": 9.719794892169964e-06, + "loss": 0.7446, + "step": 3980 + }, + { + "epoch": 0.2191094721778854, + "grad_norm": 0.7920958399772644, + "learning_rate": 9.719651803231685e-06, + "loss": 0.7489, + "step": 3981 + }, + { + "epoch": 0.21916451098024106, + "grad_norm": 0.7098824977874756, + "learning_rate": 9.719508678821768e-06, + "loss": 0.7763, + "step": 3982 + }, + { + "epoch": 0.21921954978259672, + "grad_norm": 0.8733491897583008, + "learning_rate": 9.719365518941288e-06, + "loss": 0.7325, + "step": 3983 + }, + { + "epoch": 0.21927458858495238, + "grad_norm": 0.8328796029090881, + "learning_rate": 9.719222323591318e-06, + "loss": 0.9097, + "step": 3984 + }, + { + "epoch": 0.21932962738730805, + "grad_norm": 0.7869352698326111, + "learning_rate": 9.719079092772936e-06, + "loss": 0.759, + "step": 3985 + }, + { + "epoch": 0.2193846661896637, + "grad_norm": 0.8278539180755615, + "learning_rate": 9.718935826487221e-06, + "loss": 0.8545, + "step": 3986 + }, + { + "epoch": 0.21943970499201937, + "grad_norm": 0.8122449517250061, + "learning_rate": 9.718792524735246e-06, + "loss": 0.7646, + "step": 3987 + }, + { + "epoch": 0.21949474379437503, + "grad_norm": 1.072253942489624, + "learning_rate": 9.71864918751809e-06, + "loss": 0.915, + "step": 3988 + }, + { + "epoch": 0.2195497825967307, + "grad_norm": 0.7770013213157654, + "learning_rate": 9.718505814836829e-06, + "loss": 0.7561, + "step": 3989 + }, + { + "epoch": 0.21960482139908635, + "grad_norm": 0.9011678695678711, + "learning_rate": 9.718362406692544e-06, + "loss": 0.7532, + "step": 3990 + }, + { + "epoch": 0.21965986020144201, + "grad_norm": 0.8867584466934204, + "learning_rate": 9.718218963086307e-06, + "loss": 0.8732, + "step": 3991 + }, + { + "epoch": 0.21971489900379768, + "grad_norm": 0.8884773850440979, + "learning_rate": 9.718075484019201e-06, + "loss": 0.7403, + "step": 3992 + }, + { + "epoch": 0.21976993780615334, + "grad_norm": 0.8995673060417175, + "learning_rate": 9.7179319694923e-06, + "loss": 0.9283, + "step": 3993 + }, + { + "epoch": 0.219824976608509, + "grad_norm": 0.7875818014144897, + "learning_rate": 9.717788419506688e-06, + "loss": 0.8633, + "step": 3994 + }, + { + "epoch": 0.21988001541086466, + "grad_norm": 0.7693219184875488, + "learning_rate": 9.71764483406344e-06, + "loss": 0.8073, + "step": 3995 + }, + { + "epoch": 0.21993505421322032, + "grad_norm": 0.7932817339897156, + "learning_rate": 9.717501213163636e-06, + "loss": 0.7537, + "step": 3996 + }, + { + "epoch": 0.21999009301557598, + "grad_norm": 0.8274912238121033, + "learning_rate": 9.717357556808358e-06, + "loss": 0.7715, + "step": 3997 + }, + { + "epoch": 0.22004513181793164, + "grad_norm": 0.7533993124961853, + "learning_rate": 9.71721386499868e-06, + "loss": 0.7482, + "step": 3998 + }, + { + "epoch": 0.2201001706202873, + "grad_norm": 1.028228759765625, + "learning_rate": 9.717070137735687e-06, + "loss": 0.9897, + "step": 3999 + }, + { + "epoch": 0.22015520942264297, + "grad_norm": 1.1093978881835938, + "learning_rate": 9.716926375020457e-06, + "loss": 0.8701, + "step": 4000 + }, + { + "epoch": 0.22021024822499863, + "grad_norm": 0.7891124486923218, + "learning_rate": 9.716782576854073e-06, + "loss": 0.8533, + "step": 4001 + }, + { + "epoch": 0.2202652870273543, + "grad_norm": 1.1783788204193115, + "learning_rate": 9.716638743237611e-06, + "loss": 0.8088, + "step": 4002 + }, + { + "epoch": 0.22032032582970995, + "grad_norm": 0.8713383078575134, + "learning_rate": 9.716494874172157e-06, + "loss": 0.8382, + "step": 4003 + }, + { + "epoch": 0.2203753646320656, + "grad_norm": 0.7821565270423889, + "learning_rate": 9.716350969658787e-06, + "loss": 0.8168, + "step": 4004 + }, + { + "epoch": 0.22043040343442127, + "grad_norm": 0.7642589211463928, + "learning_rate": 9.716207029698589e-06, + "loss": 0.7209, + "step": 4005 + }, + { + "epoch": 0.22048544223677694, + "grad_norm": 0.935625433921814, + "learning_rate": 9.716063054292639e-06, + "loss": 0.8436, + "step": 4006 + }, + { + "epoch": 0.2205404810391326, + "grad_norm": 0.7064627408981323, + "learning_rate": 9.715919043442024e-06, + "loss": 0.7651, + "step": 4007 + }, + { + "epoch": 0.22059551984148826, + "grad_norm": 0.6980876326560974, + "learning_rate": 9.715774997147823e-06, + "loss": 0.7842, + "step": 4008 + }, + { + "epoch": 0.22065055864384392, + "grad_norm": 0.7691119313240051, + "learning_rate": 9.715630915411118e-06, + "loss": 0.7345, + "step": 4009 + }, + { + "epoch": 0.22070559744619958, + "grad_norm": 0.8870186805725098, + "learning_rate": 9.715486798232994e-06, + "loss": 0.7531, + "step": 4010 + }, + { + "epoch": 0.22076063624855524, + "grad_norm": 0.7225383520126343, + "learning_rate": 9.715342645614533e-06, + "loss": 0.8543, + "step": 4011 + }, + { + "epoch": 0.2208156750509109, + "grad_norm": 0.7517428994178772, + "learning_rate": 9.71519845755682e-06, + "loss": 0.84, + "step": 4012 + }, + { + "epoch": 0.22087071385326656, + "grad_norm": 0.8115549087524414, + "learning_rate": 9.715054234060937e-06, + "loss": 0.7823, + "step": 4013 + }, + { + "epoch": 0.22092575265562223, + "grad_norm": 1.6656148433685303, + "learning_rate": 9.714909975127968e-06, + "loss": 0.8951, + "step": 4014 + }, + { + "epoch": 0.2209807914579779, + "grad_norm": 0.906508207321167, + "learning_rate": 9.714765680758997e-06, + "loss": 0.8599, + "step": 4015 + }, + { + "epoch": 0.22103583026033355, + "grad_norm": 0.8274093866348267, + "learning_rate": 9.71462135095511e-06, + "loss": 0.9568, + "step": 4016 + }, + { + "epoch": 0.22109086906268918, + "grad_norm": 0.7745386958122253, + "learning_rate": 9.714476985717393e-06, + "loss": 0.8641, + "step": 4017 + }, + { + "epoch": 0.22114590786504484, + "grad_norm": 0.8112689256668091, + "learning_rate": 9.714332585046928e-06, + "loss": 0.834, + "step": 4018 + }, + { + "epoch": 0.2212009466674005, + "grad_norm": 0.916847825050354, + "learning_rate": 9.714188148944799e-06, + "loss": 0.8546, + "step": 4019 + }, + { + "epoch": 0.22125598546975617, + "grad_norm": 0.8595414161682129, + "learning_rate": 9.714043677412096e-06, + "loss": 0.9388, + "step": 4020 + }, + { + "epoch": 0.22131102427211183, + "grad_norm": 0.8672438263893127, + "learning_rate": 9.713899170449901e-06, + "loss": 0.8151, + "step": 4021 + }, + { + "epoch": 0.2213660630744675, + "grad_norm": 0.699749767780304, + "learning_rate": 9.713754628059304e-06, + "loss": 0.7433, + "step": 4022 + }, + { + "epoch": 0.22142110187682315, + "grad_norm": 0.8071898818016052, + "learning_rate": 9.713610050241387e-06, + "loss": 0.7663, + "step": 4023 + }, + { + "epoch": 0.2214761406791788, + "grad_norm": 0.745030403137207, + "learning_rate": 9.713465436997239e-06, + "loss": 0.7733, + "step": 4024 + }, + { + "epoch": 0.22153117948153447, + "grad_norm": 0.8034930229187012, + "learning_rate": 9.713320788327947e-06, + "loss": 0.9015, + "step": 4025 + }, + { + "epoch": 0.22158621828389014, + "grad_norm": 0.8549708724021912, + "learning_rate": 9.713176104234597e-06, + "loss": 0.7127, + "step": 4026 + }, + { + "epoch": 0.2216412570862458, + "grad_norm": 0.8432256579399109, + "learning_rate": 9.713031384718277e-06, + "loss": 0.8163, + "step": 4027 + }, + { + "epoch": 0.22169629588860146, + "grad_norm": 0.7623703479766846, + "learning_rate": 9.712886629780075e-06, + "loss": 0.8272, + "step": 4028 + }, + { + "epoch": 0.22175133469095712, + "grad_norm": 0.8425806760787964, + "learning_rate": 9.712741839421079e-06, + "loss": 0.7907, + "step": 4029 + }, + { + "epoch": 0.22180637349331278, + "grad_norm": 0.7477750778198242, + "learning_rate": 9.712597013642376e-06, + "loss": 0.7662, + "step": 4030 + }, + { + "epoch": 0.22186141229566844, + "grad_norm": 0.7761805057525635, + "learning_rate": 9.712452152445056e-06, + "loss": 0.7999, + "step": 4031 + }, + { + "epoch": 0.2219164510980241, + "grad_norm": 0.8604531288146973, + "learning_rate": 9.712307255830207e-06, + "loss": 0.812, + "step": 4032 + }, + { + "epoch": 0.22197148990037976, + "grad_norm": 0.8113332986831665, + "learning_rate": 9.712162323798918e-06, + "loss": 0.8092, + "step": 4033 + }, + { + "epoch": 0.22202652870273543, + "grad_norm": 0.7980128526687622, + "learning_rate": 9.71201735635228e-06, + "loss": 0.6934, + "step": 4034 + }, + { + "epoch": 0.2220815675050911, + "grad_norm": 0.7819470763206482, + "learning_rate": 9.711872353491377e-06, + "loss": 0.8531, + "step": 4035 + }, + { + "epoch": 0.22213660630744675, + "grad_norm": 0.8283445835113525, + "learning_rate": 9.711727315217305e-06, + "loss": 0.8594, + "step": 4036 + }, + { + "epoch": 0.2221916451098024, + "grad_norm": 0.7282612919807434, + "learning_rate": 9.711582241531153e-06, + "loss": 0.7374, + "step": 4037 + }, + { + "epoch": 0.22224668391215807, + "grad_norm": 0.9564353823661804, + "learning_rate": 9.711437132434007e-06, + "loss": 0.7996, + "step": 4038 + }, + { + "epoch": 0.22230172271451373, + "grad_norm": 0.8559701442718506, + "learning_rate": 9.711291987926963e-06, + "loss": 0.949, + "step": 4039 + }, + { + "epoch": 0.2223567615168694, + "grad_norm": 0.7515334486961365, + "learning_rate": 9.71114680801111e-06, + "loss": 0.7188, + "step": 4040 + }, + { + "epoch": 0.22241180031922506, + "grad_norm": 0.7685608863830566, + "learning_rate": 9.711001592687537e-06, + "loss": 0.7679, + "step": 4041 + }, + { + "epoch": 0.22246683912158072, + "grad_norm": 0.6848913431167603, + "learning_rate": 9.710856341957337e-06, + "loss": 0.7666, + "step": 4042 + }, + { + "epoch": 0.22252187792393638, + "grad_norm": 0.7270542979240417, + "learning_rate": 9.710711055821602e-06, + "loss": 0.7563, + "step": 4043 + }, + { + "epoch": 0.22257691672629204, + "grad_norm": 0.7965164184570312, + "learning_rate": 9.710565734281424e-06, + "loss": 0.7586, + "step": 4044 + }, + { + "epoch": 0.2226319555286477, + "grad_norm": 0.7872949242591858, + "learning_rate": 9.710420377337895e-06, + "loss": 0.8423, + "step": 4045 + }, + { + "epoch": 0.22268699433100336, + "grad_norm": 0.7466526627540588, + "learning_rate": 9.710274984992107e-06, + "loss": 0.7578, + "step": 4046 + }, + { + "epoch": 0.22274203313335902, + "grad_norm": 0.7208731770515442, + "learning_rate": 9.710129557245154e-06, + "loss": 0.7019, + "step": 4047 + }, + { + "epoch": 0.22279707193571469, + "grad_norm": 0.6953400373458862, + "learning_rate": 9.709984094098127e-06, + "loss": 0.7234, + "step": 4048 + }, + { + "epoch": 0.22285211073807035, + "grad_norm": 0.7866283059120178, + "learning_rate": 9.709838595552122e-06, + "loss": 0.785, + "step": 4049 + }, + { + "epoch": 0.222907149540426, + "grad_norm": 0.7404114007949829, + "learning_rate": 9.709693061608227e-06, + "loss": 0.7706, + "step": 4050 + }, + { + "epoch": 0.22296218834278167, + "grad_norm": 0.8788254857063293, + "learning_rate": 9.709547492267544e-06, + "loss": 0.8392, + "step": 4051 + }, + { + "epoch": 0.22301722714513733, + "grad_norm": 0.7493161559104919, + "learning_rate": 9.70940188753116e-06, + "loss": 0.8346, + "step": 4052 + }, + { + "epoch": 0.223072265947493, + "grad_norm": 0.7340379357337952, + "learning_rate": 9.709256247400174e-06, + "loss": 0.7715, + "step": 4053 + }, + { + "epoch": 0.22312730474984865, + "grad_norm": 0.7291178107261658, + "learning_rate": 9.709110571875677e-06, + "loss": 0.866, + "step": 4054 + }, + { + "epoch": 0.22318234355220432, + "grad_norm": 0.8046013712882996, + "learning_rate": 9.708964860958765e-06, + "loss": 0.7885, + "step": 4055 + }, + { + "epoch": 0.22323738235455998, + "grad_norm": 0.832941472530365, + "learning_rate": 9.708819114650535e-06, + "loss": 0.873, + "step": 4056 + }, + { + "epoch": 0.22329242115691564, + "grad_norm": 0.6933377981185913, + "learning_rate": 9.70867333295208e-06, + "loss": 0.7944, + "step": 4057 + }, + { + "epoch": 0.2233474599592713, + "grad_norm": 0.7976044416427612, + "learning_rate": 9.708527515864499e-06, + "loss": 0.72, + "step": 4058 + }, + { + "epoch": 0.22340249876162696, + "grad_norm": 0.7698904871940613, + "learning_rate": 9.708381663388884e-06, + "loss": 0.7603, + "step": 4059 + }, + { + "epoch": 0.2234575375639826, + "grad_norm": 0.7554401159286499, + "learning_rate": 9.708235775526331e-06, + "loss": 0.7488, + "step": 4060 + }, + { + "epoch": 0.22351257636633826, + "grad_norm": 0.7382954359054565, + "learning_rate": 9.70808985227794e-06, + "loss": 0.7418, + "step": 4061 + }, + { + "epoch": 0.22356761516869392, + "grad_norm": 0.7220499515533447, + "learning_rate": 9.707943893644806e-06, + "loss": 0.7691, + "step": 4062 + }, + { + "epoch": 0.22362265397104958, + "grad_norm": 0.727542519569397, + "learning_rate": 9.707797899628027e-06, + "loss": 0.7603, + "step": 4063 + }, + { + "epoch": 0.22367769277340524, + "grad_norm": 0.7857500910758972, + "learning_rate": 9.707651870228697e-06, + "loss": 0.8633, + "step": 4064 + }, + { + "epoch": 0.2237327315757609, + "grad_norm": 0.7975600361824036, + "learning_rate": 9.707505805447917e-06, + "loss": 0.8591, + "step": 4065 + }, + { + "epoch": 0.22378777037811656, + "grad_norm": 1.0063475370407104, + "learning_rate": 9.707359705286784e-06, + "loss": 0.7935, + "step": 4066 + }, + { + "epoch": 0.22384280918047222, + "grad_norm": 0.7307062745094299, + "learning_rate": 9.707213569746393e-06, + "loss": 0.797, + "step": 4067 + }, + { + "epoch": 0.22389784798282789, + "grad_norm": 0.7891914248466492, + "learning_rate": 9.707067398827847e-06, + "loss": 0.853, + "step": 4068 + }, + { + "epoch": 0.22395288678518355, + "grad_norm": 0.7479422092437744, + "learning_rate": 9.706921192532242e-06, + "loss": 0.7359, + "step": 4069 + }, + { + "epoch": 0.2240079255875392, + "grad_norm": 0.8436065912246704, + "learning_rate": 9.706774950860676e-06, + "loss": 0.7916, + "step": 4070 + }, + { + "epoch": 0.22406296438989487, + "grad_norm": 0.7586960196495056, + "learning_rate": 9.706628673814252e-06, + "loss": 0.7871, + "step": 4071 + }, + { + "epoch": 0.22411800319225053, + "grad_norm": 0.8181111812591553, + "learning_rate": 9.706482361394064e-06, + "loss": 0.7782, + "step": 4072 + }, + { + "epoch": 0.2241730419946062, + "grad_norm": 0.7205253839492798, + "learning_rate": 9.706336013601217e-06, + "loss": 0.7912, + "step": 4073 + }, + { + "epoch": 0.22422808079696185, + "grad_norm": 0.9823397397994995, + "learning_rate": 9.706189630436806e-06, + "loss": 0.8393, + "step": 4074 + }, + { + "epoch": 0.22428311959931752, + "grad_norm": 0.7360854148864746, + "learning_rate": 9.706043211901935e-06, + "loss": 0.8239, + "step": 4075 + }, + { + "epoch": 0.22433815840167318, + "grad_norm": 0.7590144872665405, + "learning_rate": 9.705896757997701e-06, + "loss": 0.7177, + "step": 4076 + }, + { + "epoch": 0.22439319720402884, + "grad_norm": 0.7691343426704407, + "learning_rate": 9.70575026872521e-06, + "loss": 0.7731, + "step": 4077 + }, + { + "epoch": 0.2244482360063845, + "grad_norm": 0.7057286500930786, + "learning_rate": 9.705603744085556e-06, + "loss": 0.7746, + "step": 4078 + }, + { + "epoch": 0.22450327480874016, + "grad_norm": 0.7954769134521484, + "learning_rate": 9.705457184079847e-06, + "loss": 0.8215, + "step": 4079 + }, + { + "epoch": 0.22455831361109582, + "grad_norm": 0.7089072465896606, + "learning_rate": 9.70531058870918e-06, + "loss": 0.7263, + "step": 4080 + }, + { + "epoch": 0.22461335241345148, + "grad_norm": 0.9847552180290222, + "learning_rate": 9.705163957974657e-06, + "loss": 0.8948, + "step": 4081 + }, + { + "epoch": 0.22466839121580715, + "grad_norm": 0.7977012395858765, + "learning_rate": 9.705017291877383e-06, + "loss": 0.7518, + "step": 4082 + }, + { + "epoch": 0.2247234300181628, + "grad_norm": 0.8084518909454346, + "learning_rate": 9.704870590418458e-06, + "loss": 0.8711, + "step": 4083 + }, + { + "epoch": 0.22477846882051847, + "grad_norm": 0.9151536822319031, + "learning_rate": 9.704723853598986e-06, + "loss": 0.8217, + "step": 4084 + }, + { + "epoch": 0.22483350762287413, + "grad_norm": 0.908136248588562, + "learning_rate": 9.704577081420065e-06, + "loss": 0.6961, + "step": 4085 + }, + { + "epoch": 0.2248885464252298, + "grad_norm": 0.8569996953010559, + "learning_rate": 9.704430273882806e-06, + "loss": 0.8405, + "step": 4086 + }, + { + "epoch": 0.22494358522758545, + "grad_norm": 0.7687774300575256, + "learning_rate": 9.704283430988307e-06, + "loss": 0.6903, + "step": 4087 + }, + { + "epoch": 0.2249986240299411, + "grad_norm": 0.863203763961792, + "learning_rate": 9.704136552737673e-06, + "loss": 0.8927, + "step": 4088 + }, + { + "epoch": 0.22505366283229677, + "grad_norm": 1.252581238746643, + "learning_rate": 9.703989639132008e-06, + "loss": 0.8792, + "step": 4089 + }, + { + "epoch": 0.22510870163465244, + "grad_norm": 0.7844160795211792, + "learning_rate": 9.703842690172415e-06, + "loss": 0.844, + "step": 4090 + }, + { + "epoch": 0.2251637404370081, + "grad_norm": 0.8669766187667847, + "learning_rate": 9.703695705860002e-06, + "loss": 0.7008, + "step": 4091 + }, + { + "epoch": 0.22521877923936376, + "grad_norm": 0.7180137634277344, + "learning_rate": 9.703548686195869e-06, + "loss": 0.8242, + "step": 4092 + }, + { + "epoch": 0.22527381804171942, + "grad_norm": 0.7225000858306885, + "learning_rate": 9.703401631181124e-06, + "loss": 0.724, + "step": 4093 + }, + { + "epoch": 0.22532885684407508, + "grad_norm": 0.8348065614700317, + "learning_rate": 9.70325454081687e-06, + "loss": 0.7996, + "step": 4094 + }, + { + "epoch": 0.22538389564643074, + "grad_norm": 0.8099488019943237, + "learning_rate": 9.703107415104216e-06, + "loss": 0.7498, + "step": 4095 + }, + { + "epoch": 0.2254389344487864, + "grad_norm": 0.7051188945770264, + "learning_rate": 9.702960254044264e-06, + "loss": 0.7322, + "step": 4096 + }, + { + "epoch": 0.22549397325114207, + "grad_norm": 0.742859423160553, + "learning_rate": 9.702813057638122e-06, + "loss": 0.746, + "step": 4097 + }, + { + "epoch": 0.22554901205349773, + "grad_norm": 0.7981536984443665, + "learning_rate": 9.702665825886897e-06, + "loss": 0.8705, + "step": 4098 + }, + { + "epoch": 0.2256040508558534, + "grad_norm": 1.0317178964614868, + "learning_rate": 9.702518558791693e-06, + "loss": 0.8261, + "step": 4099 + }, + { + "epoch": 0.22565908965820905, + "grad_norm": 0.7811983823776245, + "learning_rate": 9.702371256353618e-06, + "loss": 0.7633, + "step": 4100 + }, + { + "epoch": 0.2257141284605647, + "grad_norm": 0.8288078308105469, + "learning_rate": 9.702223918573782e-06, + "loss": 0.7974, + "step": 4101 + }, + { + "epoch": 0.22576916726292034, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.702076545453286e-06, + "loss": 0.7517, + "step": 4102 + }, + { + "epoch": 0.225824206065276, + "grad_norm": 0.8342248201370239, + "learning_rate": 9.701929136993243e-06, + "loss": 0.8634, + "step": 4103 + }, + { + "epoch": 0.22587924486763167, + "grad_norm": 0.790392279624939, + "learning_rate": 9.701781693194761e-06, + "loss": 0.7705, + "step": 4104 + }, + { + "epoch": 0.22593428366998733, + "grad_norm": 0.824691891670227, + "learning_rate": 9.701634214058944e-06, + "loss": 0.877, + "step": 4105 + }, + { + "epoch": 0.225989322472343, + "grad_norm": 0.9237051010131836, + "learning_rate": 9.701486699586904e-06, + "loss": 0.842, + "step": 4106 + }, + { + "epoch": 0.22604436127469865, + "grad_norm": 0.7453535199165344, + "learning_rate": 9.701339149779747e-06, + "loss": 0.8217, + "step": 4107 + }, + { + "epoch": 0.2260994000770543, + "grad_norm": 0.727872371673584, + "learning_rate": 9.701191564638586e-06, + "loss": 0.849, + "step": 4108 + }, + { + "epoch": 0.22615443887940997, + "grad_norm": 0.966585636138916, + "learning_rate": 9.701043944164526e-06, + "loss": 0.7742, + "step": 4109 + }, + { + "epoch": 0.22620947768176564, + "grad_norm": 0.7556117177009583, + "learning_rate": 9.700896288358678e-06, + "loss": 0.7498, + "step": 4110 + }, + { + "epoch": 0.2262645164841213, + "grad_norm": 0.848143458366394, + "learning_rate": 9.700748597222151e-06, + "loss": 0.7237, + "step": 4111 + }, + { + "epoch": 0.22631955528647696, + "grad_norm": 0.9046787619590759, + "learning_rate": 9.700600870756056e-06, + "loss": 0.8066, + "step": 4112 + }, + { + "epoch": 0.22637459408883262, + "grad_norm": 0.923159658908844, + "learning_rate": 9.700453108961505e-06, + "loss": 0.8404, + "step": 4113 + }, + { + "epoch": 0.22642963289118828, + "grad_norm": 0.8697664737701416, + "learning_rate": 9.700305311839606e-06, + "loss": 0.7269, + "step": 4114 + }, + { + "epoch": 0.22648467169354394, + "grad_norm": 0.8179994821548462, + "learning_rate": 9.70015747939147e-06, + "loss": 0.8083, + "step": 4115 + }, + { + "epoch": 0.2265397104958996, + "grad_norm": 0.7961694002151489, + "learning_rate": 9.700009611618208e-06, + "loss": 0.7327, + "step": 4116 + }, + { + "epoch": 0.22659474929825527, + "grad_norm": 0.7317802309989929, + "learning_rate": 9.699861708520934e-06, + "loss": 0.8273, + "step": 4117 + }, + { + "epoch": 0.22664978810061093, + "grad_norm": 0.9190557599067688, + "learning_rate": 9.699713770100757e-06, + "loss": 0.8027, + "step": 4118 + }, + { + "epoch": 0.2267048269029666, + "grad_norm": 0.7618072628974915, + "learning_rate": 9.699565796358788e-06, + "loss": 0.7669, + "step": 4119 + }, + { + "epoch": 0.22675986570532225, + "grad_norm": 1.0236154794692993, + "learning_rate": 9.699417787296139e-06, + "loss": 0.7511, + "step": 4120 + }, + { + "epoch": 0.2268149045076779, + "grad_norm": 0.8011670708656311, + "learning_rate": 9.699269742913927e-06, + "loss": 0.7644, + "step": 4121 + }, + { + "epoch": 0.22686994331003357, + "grad_norm": 0.7808024287223816, + "learning_rate": 9.69912166321326e-06, + "loss": 0.7894, + "step": 4122 + }, + { + "epoch": 0.22692498211238923, + "grad_norm": 0.8645655512809753, + "learning_rate": 9.698973548195252e-06, + "loss": 0.7989, + "step": 4123 + }, + { + "epoch": 0.2269800209147449, + "grad_norm": 0.7478770613670349, + "learning_rate": 9.698825397861017e-06, + "loss": 0.7758, + "step": 4124 + }, + { + "epoch": 0.22703505971710056, + "grad_norm": 0.8988361954689026, + "learning_rate": 9.698677212211668e-06, + "loss": 0.8312, + "step": 4125 + }, + { + "epoch": 0.22709009851945622, + "grad_norm": 0.773028552532196, + "learning_rate": 9.69852899124832e-06, + "loss": 0.7415, + "step": 4126 + }, + { + "epoch": 0.22714513732181188, + "grad_norm": 0.8173778653144836, + "learning_rate": 9.698380734972085e-06, + "loss": 0.8241, + "step": 4127 + }, + { + "epoch": 0.22720017612416754, + "grad_norm": 0.7868672013282776, + "learning_rate": 9.698232443384078e-06, + "loss": 0.7294, + "step": 4128 + }, + { + "epoch": 0.2272552149265232, + "grad_norm": 0.8662189841270447, + "learning_rate": 9.698084116485413e-06, + "loss": 0.9307, + "step": 4129 + }, + { + "epoch": 0.22731025372887886, + "grad_norm": 0.7571321129798889, + "learning_rate": 9.697935754277207e-06, + "loss": 0.7756, + "step": 4130 + }, + { + "epoch": 0.22736529253123453, + "grad_norm": 0.8222649097442627, + "learning_rate": 9.697787356760574e-06, + "loss": 0.8689, + "step": 4131 + }, + { + "epoch": 0.2274203313335902, + "grad_norm": 0.8302241563796997, + "learning_rate": 9.697638923936626e-06, + "loss": 0.8139, + "step": 4132 + }, + { + "epoch": 0.22747537013594585, + "grad_norm": 0.779951274394989, + "learning_rate": 9.697490455806482e-06, + "loss": 0.7493, + "step": 4133 + }, + { + "epoch": 0.2275304089383015, + "grad_norm": 0.8409813046455383, + "learning_rate": 9.697341952371257e-06, + "loss": 0.777, + "step": 4134 + }, + { + "epoch": 0.22758544774065717, + "grad_norm": 0.8599729537963867, + "learning_rate": 9.697193413632068e-06, + "loss": 0.7678, + "step": 4135 + }, + { + "epoch": 0.22764048654301283, + "grad_norm": 0.7505115270614624, + "learning_rate": 9.69704483959003e-06, + "loss": 0.787, + "step": 4136 + }, + { + "epoch": 0.2276955253453685, + "grad_norm": 0.7326868176460266, + "learning_rate": 9.696896230246262e-06, + "loss": 0.7066, + "step": 4137 + }, + { + "epoch": 0.22775056414772415, + "grad_norm": 0.8269753456115723, + "learning_rate": 9.696747585601878e-06, + "loss": 0.7379, + "step": 4138 + }, + { + "epoch": 0.22780560295007982, + "grad_norm": 0.7841970324516296, + "learning_rate": 9.696598905657997e-06, + "loss": 0.764, + "step": 4139 + }, + { + "epoch": 0.22786064175243548, + "grad_norm": 0.7131417989730835, + "learning_rate": 9.696450190415735e-06, + "loss": 0.7629, + "step": 4140 + }, + { + "epoch": 0.22791568055479114, + "grad_norm": 0.7922703623771667, + "learning_rate": 9.69630143987621e-06, + "loss": 0.8354, + "step": 4141 + }, + { + "epoch": 0.2279707193571468, + "grad_norm": 0.9628629684448242, + "learning_rate": 9.696152654040543e-06, + "loss": 0.8077, + "step": 4142 + }, + { + "epoch": 0.22802575815950246, + "grad_norm": 0.8566663265228271, + "learning_rate": 9.696003832909847e-06, + "loss": 0.685, + "step": 4143 + }, + { + "epoch": 0.22808079696185812, + "grad_norm": 0.7181339859962463, + "learning_rate": 9.695854976485244e-06, + "loss": 0.8135, + "step": 4144 + }, + { + "epoch": 0.22813583576421376, + "grad_norm": 0.9119813442230225, + "learning_rate": 9.695706084767853e-06, + "loss": 0.7276, + "step": 4145 + }, + { + "epoch": 0.22819087456656942, + "grad_norm": 0.8547400832176208, + "learning_rate": 9.69555715775879e-06, + "loss": 0.8656, + "step": 4146 + }, + { + "epoch": 0.22824591336892508, + "grad_norm": 0.77585768699646, + "learning_rate": 9.695408195459179e-06, + "loss": 0.8218, + "step": 4147 + }, + { + "epoch": 0.22830095217128074, + "grad_norm": 0.7832447290420532, + "learning_rate": 9.695259197870135e-06, + "loss": 0.8002, + "step": 4148 + }, + { + "epoch": 0.2283559909736364, + "grad_norm": 0.9184865355491638, + "learning_rate": 9.69511016499278e-06, + "loss": 0.8651, + "step": 4149 + }, + { + "epoch": 0.22841102977599206, + "grad_norm": 0.8663797974586487, + "learning_rate": 9.694961096828235e-06, + "loss": 0.7381, + "step": 4150 + }, + { + "epoch": 0.22846606857834773, + "grad_norm": 0.843265950679779, + "learning_rate": 9.694811993377617e-06, + "loss": 0.8546, + "step": 4151 + }, + { + "epoch": 0.2285211073807034, + "grad_norm": 0.8021818399429321, + "learning_rate": 9.694662854642049e-06, + "loss": 0.9166, + "step": 4152 + }, + { + "epoch": 0.22857614618305905, + "grad_norm": 0.7762879729270935, + "learning_rate": 9.694513680622653e-06, + "loss": 0.7055, + "step": 4153 + }, + { + "epoch": 0.2286311849854147, + "grad_norm": 0.809352457523346, + "learning_rate": 9.694364471320548e-06, + "loss": 0.7988, + "step": 4154 + }, + { + "epoch": 0.22868622378777037, + "grad_norm": 0.7239902019500732, + "learning_rate": 9.694215226736858e-06, + "loss": 0.7783, + "step": 4155 + }, + { + "epoch": 0.22874126259012603, + "grad_norm": 0.7072625160217285, + "learning_rate": 9.694065946872702e-06, + "loss": 0.7607, + "step": 4156 + }, + { + "epoch": 0.2287963013924817, + "grad_norm": 0.7696169018745422, + "learning_rate": 9.693916631729201e-06, + "loss": 0.7519, + "step": 4157 + }, + { + "epoch": 0.22885134019483735, + "grad_norm": 0.9198557734489441, + "learning_rate": 9.69376728130748e-06, + "loss": 0.7754, + "step": 4158 + }, + { + "epoch": 0.22890637899719302, + "grad_norm": 0.7589097619056702, + "learning_rate": 9.693617895608662e-06, + "loss": 0.7258, + "step": 4159 + }, + { + "epoch": 0.22896141779954868, + "grad_norm": 0.8351333141326904, + "learning_rate": 9.693468474633867e-06, + "loss": 0.8633, + "step": 4160 + }, + { + "epoch": 0.22901645660190434, + "grad_norm": 0.8331828713417053, + "learning_rate": 9.69331901838422e-06, + "loss": 0.7361, + "step": 4161 + }, + { + "epoch": 0.22907149540426, + "grad_norm": 0.8810774087905884, + "learning_rate": 9.693169526860843e-06, + "loss": 0.7651, + "step": 4162 + }, + { + "epoch": 0.22912653420661566, + "grad_norm": 0.8151684999465942, + "learning_rate": 9.69302000006486e-06, + "loss": 0.8533, + "step": 4163 + }, + { + "epoch": 0.22918157300897132, + "grad_norm": 0.8683320879936218, + "learning_rate": 9.692870437997394e-06, + "loss": 0.8323, + "step": 4164 + }, + { + "epoch": 0.22923661181132698, + "grad_norm": 0.7488875389099121, + "learning_rate": 9.692720840659572e-06, + "loss": 0.8414, + "step": 4165 + }, + { + "epoch": 0.22929165061368265, + "grad_norm": 0.7916452288627625, + "learning_rate": 9.692571208052515e-06, + "loss": 0.7058, + "step": 4166 + }, + { + "epoch": 0.2293466894160383, + "grad_norm": 0.8228384256362915, + "learning_rate": 9.69242154017735e-06, + "loss": 0.7667, + "step": 4167 + }, + { + "epoch": 0.22940172821839397, + "grad_norm": 0.7395613789558411, + "learning_rate": 9.692271837035202e-06, + "loss": 0.7649, + "step": 4168 + }, + { + "epoch": 0.22945676702074963, + "grad_norm": 0.7187666893005371, + "learning_rate": 9.692122098627192e-06, + "loss": 0.7575, + "step": 4169 + }, + { + "epoch": 0.2295118058231053, + "grad_norm": 0.7060030102729797, + "learning_rate": 9.691972324954449e-06, + "loss": 0.8309, + "step": 4170 + }, + { + "epoch": 0.22956684462546095, + "grad_norm": 0.7807210087776184, + "learning_rate": 9.691822516018099e-06, + "loss": 0.8185, + "step": 4171 + }, + { + "epoch": 0.22962188342781661, + "grad_norm": 0.6918593645095825, + "learning_rate": 9.691672671819265e-06, + "loss": 0.6983, + "step": 4172 + }, + { + "epoch": 0.22967692223017228, + "grad_norm": 0.7947858572006226, + "learning_rate": 9.691522792359077e-06, + "loss": 0.8098, + "step": 4173 + }, + { + "epoch": 0.22973196103252794, + "grad_norm": 0.7907306551933289, + "learning_rate": 9.691372877638658e-06, + "loss": 0.8, + "step": 4174 + }, + { + "epoch": 0.2297869998348836, + "grad_norm": 0.7669435739517212, + "learning_rate": 9.691222927659137e-06, + "loss": 0.8121, + "step": 4175 + }, + { + "epoch": 0.22984203863723926, + "grad_norm": 0.8128299117088318, + "learning_rate": 9.691072942421642e-06, + "loss": 0.7554, + "step": 4176 + }, + { + "epoch": 0.22989707743959492, + "grad_norm": 0.9043960571289062, + "learning_rate": 9.690922921927295e-06, + "loss": 0.8601, + "step": 4177 + }, + { + "epoch": 0.22995211624195058, + "grad_norm": 0.835445761680603, + "learning_rate": 9.690772866177229e-06, + "loss": 0.8185, + "step": 4178 + }, + { + "epoch": 0.23000715504430624, + "grad_norm": 0.734601616859436, + "learning_rate": 9.69062277517257e-06, + "loss": 0.6486, + "step": 4179 + }, + { + "epoch": 0.2300621938466619, + "grad_norm": 0.8252671957015991, + "learning_rate": 9.690472648914445e-06, + "loss": 0.8455, + "step": 4180 + }, + { + "epoch": 0.23011723264901757, + "grad_norm": 0.8266329169273376, + "learning_rate": 9.690322487403984e-06, + "loss": 0.7348, + "step": 4181 + }, + { + "epoch": 0.23017227145137323, + "grad_norm": 0.8280256390571594, + "learning_rate": 9.690172290642314e-06, + "loss": 0.8191, + "step": 4182 + }, + { + "epoch": 0.2302273102537289, + "grad_norm": 0.8854276537895203, + "learning_rate": 9.690022058630564e-06, + "loss": 0.9327, + "step": 4183 + }, + { + "epoch": 0.23028234905608455, + "grad_norm": 0.7308807969093323, + "learning_rate": 9.689871791369865e-06, + "loss": 0.8144, + "step": 4184 + }, + { + "epoch": 0.2303373878584402, + "grad_norm": 0.7171719670295715, + "learning_rate": 9.689721488861344e-06, + "loss": 0.8265, + "step": 4185 + }, + { + "epoch": 0.23039242666079587, + "grad_norm": 0.7955548763275146, + "learning_rate": 9.689571151106131e-06, + "loss": 0.7313, + "step": 4186 + }, + { + "epoch": 0.23044746546315154, + "grad_norm": 0.8218876123428345, + "learning_rate": 9.689420778105359e-06, + "loss": 0.883, + "step": 4187 + }, + { + "epoch": 0.23050250426550717, + "grad_norm": 0.79570072889328, + "learning_rate": 9.689270369860154e-06, + "loss": 0.8898, + "step": 4188 + }, + { + "epoch": 0.23055754306786283, + "grad_norm": 0.8163344264030457, + "learning_rate": 9.689119926371649e-06, + "loss": 0.8638, + "step": 4189 + }, + { + "epoch": 0.2306125818702185, + "grad_norm": 0.7767764329910278, + "learning_rate": 9.688969447640972e-06, + "loss": 0.7822, + "step": 4190 + }, + { + "epoch": 0.23066762067257415, + "grad_norm": 0.9357114434242249, + "learning_rate": 9.688818933669258e-06, + "loss": 0.8031, + "step": 4191 + }, + { + "epoch": 0.23072265947492981, + "grad_norm": 0.8340080380439758, + "learning_rate": 9.688668384457635e-06, + "loss": 0.8947, + "step": 4192 + }, + { + "epoch": 0.23077769827728548, + "grad_norm": 0.8187471628189087, + "learning_rate": 9.688517800007235e-06, + "loss": 0.7989, + "step": 4193 + }, + { + "epoch": 0.23083273707964114, + "grad_norm": 0.8131871819496155, + "learning_rate": 9.688367180319191e-06, + "loss": 0.8377, + "step": 4194 + }, + { + "epoch": 0.2308877758819968, + "grad_norm": 0.7933448553085327, + "learning_rate": 9.688216525394634e-06, + "loss": 0.8723, + "step": 4195 + }, + { + "epoch": 0.23094281468435246, + "grad_norm": 0.7262325286865234, + "learning_rate": 9.688065835234695e-06, + "loss": 0.7802, + "step": 4196 + }, + { + "epoch": 0.23099785348670812, + "grad_norm": 0.8289293050765991, + "learning_rate": 9.68791510984051e-06, + "loss": 0.642, + "step": 4197 + }, + { + "epoch": 0.23105289228906378, + "grad_norm": 0.8835988640785217, + "learning_rate": 9.687764349213211e-06, + "loss": 0.9002, + "step": 4198 + }, + { + "epoch": 0.23110793109141944, + "grad_norm": 0.9478649497032166, + "learning_rate": 9.687613553353927e-06, + "loss": 0.8668, + "step": 4199 + }, + { + "epoch": 0.2311629698937751, + "grad_norm": 0.872936487197876, + "learning_rate": 9.687462722263796e-06, + "loss": 0.8312, + "step": 4200 + }, + { + "epoch": 0.23121800869613077, + "grad_norm": 0.7073879241943359, + "learning_rate": 9.68731185594395e-06, + "loss": 0.776, + "step": 4201 + }, + { + "epoch": 0.23127304749848643, + "grad_norm": 0.8265218734741211, + "learning_rate": 9.687160954395522e-06, + "loss": 0.8152, + "step": 4202 + }, + { + "epoch": 0.2313280863008421, + "grad_norm": 0.8027207255363464, + "learning_rate": 9.687010017619649e-06, + "loss": 0.9514, + "step": 4203 + }, + { + "epoch": 0.23138312510319775, + "grad_norm": 0.7416790127754211, + "learning_rate": 9.68685904561746e-06, + "loss": 0.7708, + "step": 4204 + }, + { + "epoch": 0.2314381639055534, + "grad_norm": 0.7916150689125061, + "learning_rate": 9.686708038390096e-06, + "loss": 0.7753, + "step": 4205 + }, + { + "epoch": 0.23149320270790907, + "grad_norm": 0.7213300466537476, + "learning_rate": 9.686556995938688e-06, + "loss": 0.83, + "step": 4206 + }, + { + "epoch": 0.23154824151026474, + "grad_norm": 0.7595892548561096, + "learning_rate": 9.68640591826437e-06, + "loss": 0.8186, + "step": 4207 + }, + { + "epoch": 0.2316032803126204, + "grad_norm": 0.7042104601860046, + "learning_rate": 9.686254805368282e-06, + "loss": 0.7126, + "step": 4208 + }, + { + "epoch": 0.23165831911497606, + "grad_norm": 0.7416805028915405, + "learning_rate": 9.686103657251558e-06, + "loss": 0.7791, + "step": 4209 + }, + { + "epoch": 0.23171335791733172, + "grad_norm": 0.9868568181991577, + "learning_rate": 9.685952473915333e-06, + "loss": 0.8453, + "step": 4210 + }, + { + "epoch": 0.23176839671968738, + "grad_norm": 0.7133191823959351, + "learning_rate": 9.68580125536074e-06, + "loss": 0.6061, + "step": 4211 + }, + { + "epoch": 0.23182343552204304, + "grad_norm": 0.8307366967201233, + "learning_rate": 9.685650001588921e-06, + "loss": 0.8403, + "step": 4212 + }, + { + "epoch": 0.2318784743243987, + "grad_norm": 0.8395226001739502, + "learning_rate": 9.685498712601014e-06, + "loss": 0.7945, + "step": 4213 + }, + { + "epoch": 0.23193351312675436, + "grad_norm": 0.7557219862937927, + "learning_rate": 9.68534738839815e-06, + "loss": 0.7765, + "step": 4214 + }, + { + "epoch": 0.23198855192911003, + "grad_norm": 0.7003554105758667, + "learning_rate": 9.68519602898147e-06, + "loss": 0.7228, + "step": 4215 + }, + { + "epoch": 0.2320435907314657, + "grad_norm": 0.8422999382019043, + "learning_rate": 9.68504463435211e-06, + "loss": 0.8524, + "step": 4216 + }, + { + "epoch": 0.23209862953382135, + "grad_norm": 0.9369016289710999, + "learning_rate": 9.68489320451121e-06, + "loss": 0.7646, + "step": 4217 + }, + { + "epoch": 0.232153668336177, + "grad_norm": 0.8456607460975647, + "learning_rate": 9.684741739459905e-06, + "loss": 0.7481, + "step": 4218 + }, + { + "epoch": 0.23220870713853267, + "grad_norm": 0.9284812211990356, + "learning_rate": 9.684590239199336e-06, + "loss": 0.8192, + "step": 4219 + }, + { + "epoch": 0.23226374594088833, + "grad_norm": 0.8474242687225342, + "learning_rate": 9.68443870373064e-06, + "loss": 0.7143, + "step": 4220 + }, + { + "epoch": 0.232318784743244, + "grad_norm": 0.8259334564208984, + "learning_rate": 9.684287133054957e-06, + "loss": 0.8667, + "step": 4221 + }, + { + "epoch": 0.23237382354559966, + "grad_norm": 0.8016416430473328, + "learning_rate": 9.684135527173427e-06, + "loss": 0.8694, + "step": 4222 + }, + { + "epoch": 0.23242886234795532, + "grad_norm": 0.7575937509536743, + "learning_rate": 9.683983886087186e-06, + "loss": 0.7591, + "step": 4223 + }, + { + "epoch": 0.23248390115031098, + "grad_norm": 0.7004683613777161, + "learning_rate": 9.683832209797377e-06, + "loss": 0.739, + "step": 4224 + }, + { + "epoch": 0.23253893995266664, + "grad_norm": 0.8265832662582397, + "learning_rate": 9.68368049830514e-06, + "loss": 0.7705, + "step": 4225 + }, + { + "epoch": 0.2325939787550223, + "grad_norm": 0.7705711722373962, + "learning_rate": 9.683528751611612e-06, + "loss": 0.7896, + "step": 4226 + }, + { + "epoch": 0.23264901755737796, + "grad_norm": 0.7426978349685669, + "learning_rate": 9.683376969717937e-06, + "loss": 0.8217, + "step": 4227 + }, + { + "epoch": 0.23270405635973362, + "grad_norm": 0.7425839304924011, + "learning_rate": 9.683225152625255e-06, + "loss": 0.7426, + "step": 4228 + }, + { + "epoch": 0.23275909516208929, + "grad_norm": 1.0415440797805786, + "learning_rate": 9.683073300334705e-06, + "loss": 0.8585, + "step": 4229 + }, + { + "epoch": 0.23281413396444495, + "grad_norm": 0.7706055045127869, + "learning_rate": 9.68292141284743e-06, + "loss": 0.8349, + "step": 4230 + }, + { + "epoch": 0.23286917276680058, + "grad_norm": 0.8407607674598694, + "learning_rate": 9.682769490164572e-06, + "loss": 0.8592, + "step": 4231 + }, + { + "epoch": 0.23292421156915624, + "grad_norm": 0.6830767393112183, + "learning_rate": 9.68261753228727e-06, + "loss": 0.6773, + "step": 4232 + }, + { + "epoch": 0.2329792503715119, + "grad_norm": 1.6661429405212402, + "learning_rate": 9.68246553921667e-06, + "loss": 1.005, + "step": 4233 + }, + { + "epoch": 0.23303428917386756, + "grad_norm": 0.7677092552185059, + "learning_rate": 9.682313510953912e-06, + "loss": 0.7689, + "step": 4234 + }, + { + "epoch": 0.23308932797622323, + "grad_norm": 0.7232248187065125, + "learning_rate": 9.682161447500139e-06, + "loss": 0.7765, + "step": 4235 + }, + { + "epoch": 0.2331443667785789, + "grad_norm": 0.8667388558387756, + "learning_rate": 9.682009348856494e-06, + "loss": 0.8099, + "step": 4236 + }, + { + "epoch": 0.23319940558093455, + "grad_norm": 0.8220446705818176, + "learning_rate": 9.68185721502412e-06, + "loss": 0.8078, + "step": 4237 + }, + { + "epoch": 0.2332544443832902, + "grad_norm": 0.9670323133468628, + "learning_rate": 9.68170504600416e-06, + "loss": 0.8912, + "step": 4238 + }, + { + "epoch": 0.23330948318564587, + "grad_norm": 0.7950771450996399, + "learning_rate": 9.68155284179776e-06, + "loss": 0.8165, + "step": 4239 + }, + { + "epoch": 0.23336452198800153, + "grad_norm": 0.7606233358383179, + "learning_rate": 9.68140060240606e-06, + "loss": 0.7795, + "step": 4240 + }, + { + "epoch": 0.2334195607903572, + "grad_norm": 0.9580656886100769, + "learning_rate": 9.681248327830205e-06, + "loss": 0.7949, + "step": 4241 + }, + { + "epoch": 0.23347459959271286, + "grad_norm": 0.6878347992897034, + "learning_rate": 9.681096018071341e-06, + "loss": 0.7776, + "step": 4242 + }, + { + "epoch": 0.23352963839506852, + "grad_norm": 0.8449816107749939, + "learning_rate": 9.680943673130614e-06, + "loss": 0.8456, + "step": 4243 + }, + { + "epoch": 0.23358467719742418, + "grad_norm": 0.77314692735672, + "learning_rate": 9.680791293009167e-06, + "loss": 0.7915, + "step": 4244 + }, + { + "epoch": 0.23363971599977984, + "grad_norm": 0.8034142255783081, + "learning_rate": 9.680638877708146e-06, + "loss": 0.7377, + "step": 4245 + }, + { + "epoch": 0.2336947548021355, + "grad_norm": 0.8754952549934387, + "learning_rate": 9.680486427228695e-06, + "loss": 0.8072, + "step": 4246 + }, + { + "epoch": 0.23374979360449116, + "grad_norm": 0.8169820308685303, + "learning_rate": 9.680333941571963e-06, + "loss": 0.8253, + "step": 4247 + }, + { + "epoch": 0.23380483240684682, + "grad_norm": 0.7848341464996338, + "learning_rate": 9.680181420739092e-06, + "loss": 0.8243, + "step": 4248 + }, + { + "epoch": 0.23385987120920249, + "grad_norm": 0.7599799036979675, + "learning_rate": 9.68002886473123e-06, + "loss": 0.781, + "step": 4249 + }, + { + "epoch": 0.23391491001155815, + "grad_norm": 0.8920254707336426, + "learning_rate": 9.679876273549524e-06, + "loss": 0.8199, + "step": 4250 + }, + { + "epoch": 0.2339699488139138, + "grad_norm": 0.7813586592674255, + "learning_rate": 9.679723647195121e-06, + "loss": 0.7758, + "step": 4251 + }, + { + "epoch": 0.23402498761626947, + "grad_norm": 0.735282838344574, + "learning_rate": 9.679570985669168e-06, + "loss": 0.7651, + "step": 4252 + }, + { + "epoch": 0.23408002641862513, + "grad_norm": 0.7305853962898254, + "learning_rate": 9.679418288972813e-06, + "loss": 0.8202, + "step": 4253 + }, + { + "epoch": 0.2341350652209808, + "grad_norm": 0.8331005573272705, + "learning_rate": 9.6792655571072e-06, + "loss": 0.8784, + "step": 4254 + }, + { + "epoch": 0.23419010402333645, + "grad_norm": 0.8526305556297302, + "learning_rate": 9.679112790073481e-06, + "loss": 0.8116, + "step": 4255 + }, + { + "epoch": 0.23424514282569212, + "grad_norm": 0.741073489189148, + "learning_rate": 9.678959987872805e-06, + "loss": 0.6928, + "step": 4256 + }, + { + "epoch": 0.23430018162804778, + "grad_norm": 0.727859616279602, + "learning_rate": 9.678807150506315e-06, + "loss": 0.7571, + "step": 4257 + }, + { + "epoch": 0.23435522043040344, + "grad_norm": 0.8890698552131653, + "learning_rate": 9.678654277975165e-06, + "loss": 0.8145, + "step": 4258 + }, + { + "epoch": 0.2344102592327591, + "grad_norm": 0.7372937798500061, + "learning_rate": 9.6785013702805e-06, + "loss": 0.7104, + "step": 4259 + }, + { + "epoch": 0.23446529803511476, + "grad_norm": 0.7205008268356323, + "learning_rate": 9.678348427423472e-06, + "loss": 0.7498, + "step": 4260 + }, + { + "epoch": 0.23452033683747042, + "grad_norm": 0.7766392230987549, + "learning_rate": 9.67819544940523e-06, + "loss": 0.7814, + "step": 4261 + }, + { + "epoch": 0.23457537563982608, + "grad_norm": 0.7441498637199402, + "learning_rate": 9.678042436226922e-06, + "loss": 0.7429, + "step": 4262 + }, + { + "epoch": 0.23463041444218175, + "grad_norm": 0.8838522434234619, + "learning_rate": 9.677889387889701e-06, + "loss": 0.8719, + "step": 4263 + }, + { + "epoch": 0.2346854532445374, + "grad_norm": 1.2349655628204346, + "learning_rate": 9.677736304394716e-06, + "loss": 0.8491, + "step": 4264 + }, + { + "epoch": 0.23474049204689307, + "grad_norm": 0.8050087690353394, + "learning_rate": 9.677583185743116e-06, + "loss": 0.795, + "step": 4265 + }, + { + "epoch": 0.23479553084924873, + "grad_norm": 0.7885709404945374, + "learning_rate": 9.677430031936051e-06, + "loss": 0.8594, + "step": 4266 + }, + { + "epoch": 0.2348505696516044, + "grad_norm": 0.7753557562828064, + "learning_rate": 9.677276842974676e-06, + "loss": 0.8196, + "step": 4267 + }, + { + "epoch": 0.23490560845396005, + "grad_norm": 0.7325392961502075, + "learning_rate": 9.67712361886014e-06, + "loss": 0.7905, + "step": 4268 + }, + { + "epoch": 0.2349606472563157, + "grad_norm": 0.7925617694854736, + "learning_rate": 9.676970359593594e-06, + "loss": 0.7416, + "step": 4269 + }, + { + "epoch": 0.23501568605867137, + "grad_norm": 0.7981371283531189, + "learning_rate": 9.676817065176192e-06, + "loss": 0.81, + "step": 4270 + }, + { + "epoch": 0.23507072486102704, + "grad_norm": 0.7490524053573608, + "learning_rate": 9.676663735609084e-06, + "loss": 0.8347, + "step": 4271 + }, + { + "epoch": 0.2351257636633827, + "grad_norm": 1.000349521636963, + "learning_rate": 9.676510370893424e-06, + "loss": 0.7469, + "step": 4272 + }, + { + "epoch": 0.23518080246573836, + "grad_norm": 0.9310774207115173, + "learning_rate": 9.676356971030364e-06, + "loss": 0.8088, + "step": 4273 + }, + { + "epoch": 0.235235841268094, + "grad_norm": 0.8868544101715088, + "learning_rate": 9.676203536021055e-06, + "loss": 0.7472, + "step": 4274 + }, + { + "epoch": 0.23529088007044965, + "grad_norm": 0.7702255845069885, + "learning_rate": 9.676050065866653e-06, + "loss": 0.8395, + "step": 4275 + }, + { + "epoch": 0.23534591887280532, + "grad_norm": 0.7138833999633789, + "learning_rate": 9.675896560568311e-06, + "loss": 0.8529, + "step": 4276 + }, + { + "epoch": 0.23540095767516098, + "grad_norm": 0.8399729132652283, + "learning_rate": 9.675743020127182e-06, + "loss": 0.7844, + "step": 4277 + }, + { + "epoch": 0.23545599647751664, + "grad_norm": 0.8500726819038391, + "learning_rate": 9.67558944454442e-06, + "loss": 0.8209, + "step": 4278 + }, + { + "epoch": 0.2355110352798723, + "grad_norm": 0.766638994216919, + "learning_rate": 9.675435833821178e-06, + "loss": 0.7834, + "step": 4279 + }, + { + "epoch": 0.23556607408222796, + "grad_norm": 0.9121370315551758, + "learning_rate": 9.675282187958613e-06, + "loss": 0.8697, + "step": 4280 + }, + { + "epoch": 0.23562111288458362, + "grad_norm": 0.7862319946289062, + "learning_rate": 9.675128506957879e-06, + "loss": 0.8262, + "step": 4281 + }, + { + "epoch": 0.23567615168693928, + "grad_norm": 1.072777509689331, + "learning_rate": 9.67497479082013e-06, + "loss": 0.7963, + "step": 4282 + }, + { + "epoch": 0.23573119048929495, + "grad_norm": 0.7574695944786072, + "learning_rate": 9.67482103954652e-06, + "loss": 0.8178, + "step": 4283 + }, + { + "epoch": 0.2357862292916506, + "grad_norm": 0.7996877431869507, + "learning_rate": 9.674667253138209e-06, + "loss": 0.8465, + "step": 4284 + }, + { + "epoch": 0.23584126809400627, + "grad_norm": 0.711513340473175, + "learning_rate": 9.674513431596349e-06, + "loss": 0.7445, + "step": 4285 + }, + { + "epoch": 0.23589630689636193, + "grad_norm": 0.7431296706199646, + "learning_rate": 9.674359574922098e-06, + "loss": 0.8102, + "step": 4286 + }, + { + "epoch": 0.2359513456987176, + "grad_norm": 0.7745676040649414, + "learning_rate": 9.674205683116612e-06, + "loss": 0.8733, + "step": 4287 + }, + { + "epoch": 0.23600638450107325, + "grad_norm": 1.0117937326431274, + "learning_rate": 9.674051756181046e-06, + "loss": 0.9035, + "step": 4288 + }, + { + "epoch": 0.2360614233034289, + "grad_norm": 0.7848078608512878, + "learning_rate": 9.67389779411656e-06, + "loss": 0.8486, + "step": 4289 + }, + { + "epoch": 0.23611646210578457, + "grad_norm": 0.8439378142356873, + "learning_rate": 9.673743796924307e-06, + "loss": 0.8032, + "step": 4290 + }, + { + "epoch": 0.23617150090814024, + "grad_norm": 0.8268104791641235, + "learning_rate": 9.673589764605449e-06, + "loss": 0.8182, + "step": 4291 + }, + { + "epoch": 0.2362265397104959, + "grad_norm": 0.8896234631538391, + "learning_rate": 9.67343569716114e-06, + "loss": 0.8081, + "step": 4292 + }, + { + "epoch": 0.23628157851285156, + "grad_norm": 0.8515019416809082, + "learning_rate": 9.67328159459254e-06, + "loss": 0.8239, + "step": 4293 + }, + { + "epoch": 0.23633661731520722, + "grad_norm": 0.7779792547225952, + "learning_rate": 9.673127456900806e-06, + "loss": 0.8437, + "step": 4294 + }, + { + "epoch": 0.23639165611756288, + "grad_norm": 0.7782402634620667, + "learning_rate": 9.672973284087097e-06, + "loss": 0.8498, + "step": 4295 + }, + { + "epoch": 0.23644669491991854, + "grad_norm": 0.7588973641395569, + "learning_rate": 9.67281907615257e-06, + "loss": 0.7034, + "step": 4296 + }, + { + "epoch": 0.2365017337222742, + "grad_norm": 0.8426640629768372, + "learning_rate": 9.67266483309839e-06, + "loss": 0.803, + "step": 4297 + }, + { + "epoch": 0.23655677252462987, + "grad_norm": 0.8945889472961426, + "learning_rate": 9.672510554925707e-06, + "loss": 0.8971, + "step": 4298 + }, + { + "epoch": 0.23661181132698553, + "grad_norm": 0.8604227304458618, + "learning_rate": 9.672356241635688e-06, + "loss": 0.7548, + "step": 4299 + }, + { + "epoch": 0.2366668501293412, + "grad_norm": 0.7277490496635437, + "learning_rate": 9.672201893229489e-06, + "loss": 0.8083, + "step": 4300 + }, + { + "epoch": 0.23672188893169685, + "grad_norm": 0.9089379906654358, + "learning_rate": 9.672047509708273e-06, + "loss": 0.9717, + "step": 4301 + }, + { + "epoch": 0.2367769277340525, + "grad_norm": 0.7207155823707581, + "learning_rate": 9.671893091073198e-06, + "loss": 0.6794, + "step": 4302 + }, + { + "epoch": 0.23683196653640817, + "grad_norm": 0.7319806814193726, + "learning_rate": 9.671738637325425e-06, + "loss": 0.6821, + "step": 4303 + }, + { + "epoch": 0.23688700533876383, + "grad_norm": 0.7339589595794678, + "learning_rate": 9.671584148466112e-06, + "loss": 0.7895, + "step": 4304 + }, + { + "epoch": 0.2369420441411195, + "grad_norm": 0.7725476622581482, + "learning_rate": 9.671429624496428e-06, + "loss": 0.7414, + "step": 4305 + }, + { + "epoch": 0.23699708294347516, + "grad_norm": 0.7040137648582458, + "learning_rate": 9.671275065417527e-06, + "loss": 0.696, + "step": 4306 + }, + { + "epoch": 0.23705212174583082, + "grad_norm": 0.8804189562797546, + "learning_rate": 9.671120471230572e-06, + "loss": 0.8184, + "step": 4307 + }, + { + "epoch": 0.23710716054818648, + "grad_norm": 0.8062872886657715, + "learning_rate": 9.670965841936728e-06, + "loss": 0.7856, + "step": 4308 + }, + { + "epoch": 0.23716219935054214, + "grad_norm": 0.7537097930908203, + "learning_rate": 9.670811177537154e-06, + "loss": 0.7562, + "step": 4309 + }, + { + "epoch": 0.2372172381528978, + "grad_norm": 0.8168618083000183, + "learning_rate": 9.670656478033013e-06, + "loss": 0.7416, + "step": 4310 + }, + { + "epoch": 0.23727227695525346, + "grad_norm": 0.8367040157318115, + "learning_rate": 9.670501743425469e-06, + "loss": 0.7759, + "step": 4311 + }, + { + "epoch": 0.23732731575760913, + "grad_norm": 0.860418975353241, + "learning_rate": 9.670346973715683e-06, + "loss": 0.9013, + "step": 4312 + }, + { + "epoch": 0.2373823545599648, + "grad_norm": 0.8736678957939148, + "learning_rate": 9.67019216890482e-06, + "loss": 0.8677, + "step": 4313 + }, + { + "epoch": 0.23743739336232045, + "grad_norm": 0.8258964419364929, + "learning_rate": 9.670037328994044e-06, + "loss": 0.8208, + "step": 4314 + }, + { + "epoch": 0.2374924321646761, + "grad_norm": 0.7936292886734009, + "learning_rate": 9.669882453984516e-06, + "loss": 0.8643, + "step": 4315 + }, + { + "epoch": 0.23754747096703177, + "grad_norm": 0.805500864982605, + "learning_rate": 9.669727543877401e-06, + "loss": 0.779, + "step": 4316 + }, + { + "epoch": 0.2376025097693874, + "grad_norm": 0.8072311282157898, + "learning_rate": 9.669572598673866e-06, + "loss": 0.8258, + "step": 4317 + }, + { + "epoch": 0.23765754857174307, + "grad_norm": 0.8917607665061951, + "learning_rate": 9.669417618375072e-06, + "loss": 0.7528, + "step": 4318 + }, + { + "epoch": 0.23771258737409873, + "grad_norm": 0.7054246068000793, + "learning_rate": 9.669262602982186e-06, + "loss": 0.86, + "step": 4319 + }, + { + "epoch": 0.2377676261764544, + "grad_norm": 0.8600299954414368, + "learning_rate": 9.66910755249637e-06, + "loss": 0.8165, + "step": 4320 + }, + { + "epoch": 0.23782266497881005, + "grad_norm": 0.8685561418533325, + "learning_rate": 9.668952466918793e-06, + "loss": 0.8129, + "step": 4321 + }, + { + "epoch": 0.2378777037811657, + "grad_norm": 0.7859770655632019, + "learning_rate": 9.668797346250618e-06, + "loss": 0.8703, + "step": 4322 + }, + { + "epoch": 0.23793274258352137, + "grad_norm": 0.8128730058670044, + "learning_rate": 9.668642190493015e-06, + "loss": 0.7595, + "step": 4323 + }, + { + "epoch": 0.23798778138587703, + "grad_norm": 0.8223204612731934, + "learning_rate": 9.668486999647143e-06, + "loss": 0.825, + "step": 4324 + }, + { + "epoch": 0.2380428201882327, + "grad_norm": 0.859619677066803, + "learning_rate": 9.668331773714175e-06, + "loss": 0.8239, + "step": 4325 + }, + { + "epoch": 0.23809785899058836, + "grad_norm": 0.9861679673194885, + "learning_rate": 9.668176512695273e-06, + "loss": 0.8409, + "step": 4326 + }, + { + "epoch": 0.23815289779294402, + "grad_norm": 0.7178627252578735, + "learning_rate": 9.668021216591607e-06, + "loss": 0.818, + "step": 4327 + }, + { + "epoch": 0.23820793659529968, + "grad_norm": 0.9160923957824707, + "learning_rate": 9.667865885404343e-06, + "loss": 0.8703, + "step": 4328 + }, + { + "epoch": 0.23826297539765534, + "grad_norm": 0.7043942213058472, + "learning_rate": 9.667710519134648e-06, + "loss": 0.6884, + "step": 4329 + }, + { + "epoch": 0.238318014200011, + "grad_norm": 1.213121771812439, + "learning_rate": 9.667555117783691e-06, + "loss": 0.7843, + "step": 4330 + }, + { + "epoch": 0.23837305300236666, + "grad_norm": 0.8008033037185669, + "learning_rate": 9.66739968135264e-06, + "loss": 0.9312, + "step": 4331 + }, + { + "epoch": 0.23842809180472233, + "grad_norm": 0.7862009406089783, + "learning_rate": 9.667244209842662e-06, + "loss": 0.6965, + "step": 4332 + }, + { + "epoch": 0.238483130607078, + "grad_norm": 1.081398844718933, + "learning_rate": 9.667088703254923e-06, + "loss": 0.9793, + "step": 4333 + }, + { + "epoch": 0.23853816940943365, + "grad_norm": 0.7672395706176758, + "learning_rate": 9.666933161590597e-06, + "loss": 0.813, + "step": 4334 + }, + { + "epoch": 0.2385932082117893, + "grad_norm": 0.6955493092536926, + "learning_rate": 9.66677758485085e-06, + "loss": 0.7778, + "step": 4335 + }, + { + "epoch": 0.23864824701414497, + "grad_norm": 0.8609682321548462, + "learning_rate": 9.666621973036854e-06, + "loss": 0.7817, + "step": 4336 + }, + { + "epoch": 0.23870328581650063, + "grad_norm": 0.7312196493148804, + "learning_rate": 9.666466326149774e-06, + "loss": 0.7368, + "step": 4337 + }, + { + "epoch": 0.2387583246188563, + "grad_norm": 0.7964538931846619, + "learning_rate": 9.666310644190782e-06, + "loss": 0.8124, + "step": 4338 + }, + { + "epoch": 0.23881336342121195, + "grad_norm": 1.1138910055160522, + "learning_rate": 9.66615492716105e-06, + "loss": 0.8886, + "step": 4339 + }, + { + "epoch": 0.23886840222356762, + "grad_norm": 0.8789949417114258, + "learning_rate": 9.665999175061747e-06, + "loss": 0.7854, + "step": 4340 + }, + { + "epoch": 0.23892344102592328, + "grad_norm": 0.7761380076408386, + "learning_rate": 9.665843387894041e-06, + "loss": 0.7915, + "step": 4341 + }, + { + "epoch": 0.23897847982827894, + "grad_norm": 0.888482928276062, + "learning_rate": 9.665687565659106e-06, + "loss": 0.8799, + "step": 4342 + }, + { + "epoch": 0.2390335186306346, + "grad_norm": 0.7799200415611267, + "learning_rate": 9.665531708358111e-06, + "loss": 0.8519, + "step": 4343 + }, + { + "epoch": 0.23908855743299026, + "grad_norm": 0.7407697439193726, + "learning_rate": 9.665375815992231e-06, + "loss": 0.7637, + "step": 4344 + }, + { + "epoch": 0.23914359623534592, + "grad_norm": 0.8098278045654297, + "learning_rate": 9.665219888562634e-06, + "loss": 0.7991, + "step": 4345 + }, + { + "epoch": 0.23919863503770158, + "grad_norm": 0.7585136294364929, + "learning_rate": 9.665063926070493e-06, + "loss": 0.8478, + "step": 4346 + }, + { + "epoch": 0.23925367384005725, + "grad_norm": 0.7294817566871643, + "learning_rate": 9.66490792851698e-06, + "loss": 0.8312, + "step": 4347 + }, + { + "epoch": 0.2393087126424129, + "grad_norm": 0.8325762748718262, + "learning_rate": 9.664751895903269e-06, + "loss": 0.9365, + "step": 4348 + }, + { + "epoch": 0.23936375144476857, + "grad_norm": 0.9992470741271973, + "learning_rate": 9.66459582823053e-06, + "loss": 0.8649, + "step": 4349 + }, + { + "epoch": 0.23941879024712423, + "grad_norm": 0.7206875681877136, + "learning_rate": 9.664439725499938e-06, + "loss": 0.7013, + "step": 4350 + }, + { + "epoch": 0.2394738290494799, + "grad_norm": 0.946657657623291, + "learning_rate": 9.664283587712665e-06, + "loss": 0.7953, + "step": 4351 + }, + { + "epoch": 0.23952886785183555, + "grad_norm": 0.7684911489486694, + "learning_rate": 9.664127414869887e-06, + "loss": 0.8403, + "step": 4352 + }, + { + "epoch": 0.23958390665419121, + "grad_norm": 0.7875770926475525, + "learning_rate": 9.663971206972773e-06, + "loss": 0.7961, + "step": 4353 + }, + { + "epoch": 0.23963894545654688, + "grad_norm": 0.7387273907661438, + "learning_rate": 9.663814964022502e-06, + "loss": 0.8265, + "step": 4354 + }, + { + "epoch": 0.23969398425890254, + "grad_norm": 0.7413492202758789, + "learning_rate": 9.663658686020245e-06, + "loss": 0.8458, + "step": 4355 + }, + { + "epoch": 0.2397490230612582, + "grad_norm": 0.7563235759735107, + "learning_rate": 9.663502372967177e-06, + "loss": 0.8498, + "step": 4356 + }, + { + "epoch": 0.23980406186361386, + "grad_norm": 0.7529472708702087, + "learning_rate": 9.663346024864475e-06, + "loss": 0.7597, + "step": 4357 + }, + { + "epoch": 0.23985910066596952, + "grad_norm": 0.7582191824913025, + "learning_rate": 9.663189641713314e-06, + "loss": 0.804, + "step": 4358 + }, + { + "epoch": 0.23991413946832518, + "grad_norm": 0.8394485712051392, + "learning_rate": 9.663033223514865e-06, + "loss": 0.8329, + "step": 4359 + }, + { + "epoch": 0.23996917827068082, + "grad_norm": 0.7088292241096497, + "learning_rate": 9.662876770270308e-06, + "loss": 0.7131, + "step": 4360 + }, + { + "epoch": 0.24002421707303648, + "grad_norm": 0.8548080325126648, + "learning_rate": 9.662720281980817e-06, + "loss": 0.8925, + "step": 4361 + }, + { + "epoch": 0.24007925587539214, + "grad_norm": 0.8027567267417908, + "learning_rate": 9.662563758647568e-06, + "loss": 0.8652, + "step": 4362 + }, + { + "epoch": 0.2401342946777478, + "grad_norm": 0.7471736669540405, + "learning_rate": 9.662407200271738e-06, + "loss": 0.7722, + "step": 4363 + }, + { + "epoch": 0.24018933348010346, + "grad_norm": 0.7358804941177368, + "learning_rate": 9.662250606854504e-06, + "loss": 0.767, + "step": 4364 + }, + { + "epoch": 0.24024437228245912, + "grad_norm": 0.7948476672172546, + "learning_rate": 9.662093978397042e-06, + "loss": 0.961, + "step": 4365 + }, + { + "epoch": 0.24029941108481478, + "grad_norm": 0.7030961513519287, + "learning_rate": 9.66193731490053e-06, + "loss": 0.7826, + "step": 4366 + }, + { + "epoch": 0.24035444988717045, + "grad_norm": 0.8376098871231079, + "learning_rate": 9.661780616366145e-06, + "loss": 0.7697, + "step": 4367 + }, + { + "epoch": 0.2404094886895261, + "grad_norm": 0.7449594140052795, + "learning_rate": 9.661623882795065e-06, + "loss": 0.7944, + "step": 4368 + }, + { + "epoch": 0.24046452749188177, + "grad_norm": 0.7317184805870056, + "learning_rate": 9.661467114188468e-06, + "loss": 0.7059, + "step": 4369 + }, + { + "epoch": 0.24051956629423743, + "grad_norm": 0.843912661075592, + "learning_rate": 9.661310310547531e-06, + "loss": 0.7889, + "step": 4370 + }, + { + "epoch": 0.2405746050965931, + "grad_norm": 0.8673211336135864, + "learning_rate": 9.661153471873435e-06, + "loss": 0.7234, + "step": 4371 + }, + { + "epoch": 0.24062964389894875, + "grad_norm": 0.8179688453674316, + "learning_rate": 9.660996598167354e-06, + "loss": 0.8937, + "step": 4372 + }, + { + "epoch": 0.24068468270130441, + "grad_norm": 0.7800211906433105, + "learning_rate": 9.660839689430473e-06, + "loss": 0.8596, + "step": 4373 + }, + { + "epoch": 0.24073972150366008, + "grad_norm": 0.8781671524047852, + "learning_rate": 9.660682745663967e-06, + "loss": 0.8507, + "step": 4374 + }, + { + "epoch": 0.24079476030601574, + "grad_norm": 0.7701708674430847, + "learning_rate": 9.660525766869019e-06, + "loss": 0.8212, + "step": 4375 + }, + { + "epoch": 0.2408497991083714, + "grad_norm": 0.7721084356307983, + "learning_rate": 9.660368753046806e-06, + "loss": 0.7493, + "step": 4376 + }, + { + "epoch": 0.24090483791072706, + "grad_norm": 0.8126489520072937, + "learning_rate": 9.660211704198508e-06, + "loss": 0.8527, + "step": 4377 + }, + { + "epoch": 0.24095987671308272, + "grad_norm": 0.8172717690467834, + "learning_rate": 9.660054620325307e-06, + "loss": 0.8448, + "step": 4378 + }, + { + "epoch": 0.24101491551543838, + "grad_norm": 0.8293611407279968, + "learning_rate": 9.659897501428384e-06, + "loss": 0.9318, + "step": 4379 + }, + { + "epoch": 0.24106995431779404, + "grad_norm": 0.7445098161697388, + "learning_rate": 9.659740347508917e-06, + "loss": 0.7358, + "step": 4380 + }, + { + "epoch": 0.2411249931201497, + "grad_norm": 0.7778907418251038, + "learning_rate": 9.659583158568088e-06, + "loss": 0.7671, + "step": 4381 + }, + { + "epoch": 0.24118003192250537, + "grad_norm": 0.7828608751296997, + "learning_rate": 9.659425934607082e-06, + "loss": 0.8141, + "step": 4382 + }, + { + "epoch": 0.24123507072486103, + "grad_norm": 0.9433113932609558, + "learning_rate": 9.659268675627075e-06, + "loss": 0.7904, + "step": 4383 + }, + { + "epoch": 0.2412901095272167, + "grad_norm": 0.7097491025924683, + "learning_rate": 9.659111381629255e-06, + "loss": 0.7445, + "step": 4384 + }, + { + "epoch": 0.24134514832957235, + "grad_norm": 0.7450230717658997, + "learning_rate": 9.6589540526148e-06, + "loss": 0.6869, + "step": 4385 + }, + { + "epoch": 0.241400187131928, + "grad_norm": 0.7429760694503784, + "learning_rate": 9.658796688584893e-06, + "loss": 0.7367, + "step": 4386 + }, + { + "epoch": 0.24145522593428367, + "grad_norm": 0.7250030040740967, + "learning_rate": 9.658639289540716e-06, + "loss": 0.7502, + "step": 4387 + }, + { + "epoch": 0.24151026473663934, + "grad_norm": 0.6577159762382507, + "learning_rate": 9.658481855483455e-06, + "loss": 0.5785, + "step": 4388 + }, + { + "epoch": 0.241565303538995, + "grad_norm": 0.7846524119377136, + "learning_rate": 9.65832438641429e-06, + "loss": 0.7435, + "step": 4389 + }, + { + "epoch": 0.24162034234135066, + "grad_norm": 0.8370404839515686, + "learning_rate": 9.658166882334408e-06, + "loss": 0.8536, + "step": 4390 + }, + { + "epoch": 0.24167538114370632, + "grad_norm": 0.7451018691062927, + "learning_rate": 9.658009343244987e-06, + "loss": 0.8443, + "step": 4391 + }, + { + "epoch": 0.24173041994606198, + "grad_norm": 0.7629074454307556, + "learning_rate": 9.657851769147218e-06, + "loss": 0.7394, + "step": 4392 + }, + { + "epoch": 0.24178545874841764, + "grad_norm": 0.7767705321311951, + "learning_rate": 9.657694160042282e-06, + "loss": 0.8497, + "step": 4393 + }, + { + "epoch": 0.2418404975507733, + "grad_norm": 0.8635357022285461, + "learning_rate": 9.65753651593136e-06, + "loss": 0.8495, + "step": 4394 + }, + { + "epoch": 0.24189553635312896, + "grad_norm": 0.7652365565299988, + "learning_rate": 9.657378836815643e-06, + "loss": 0.7967, + "step": 4395 + }, + { + "epoch": 0.24195057515548463, + "grad_norm": 0.7721680402755737, + "learning_rate": 9.657221122696313e-06, + "loss": 0.8227, + "step": 4396 + }, + { + "epoch": 0.2420056139578403, + "grad_norm": 1.016366720199585, + "learning_rate": 9.657063373574555e-06, + "loss": 0.8291, + "step": 4397 + }, + { + "epoch": 0.24206065276019595, + "grad_norm": 0.7770145535469055, + "learning_rate": 9.656905589451555e-06, + "loss": 0.8335, + "step": 4398 + }, + { + "epoch": 0.2421156915625516, + "grad_norm": 0.812882125377655, + "learning_rate": 9.6567477703285e-06, + "loss": 0.8189, + "step": 4399 + }, + { + "epoch": 0.24217073036490727, + "grad_norm": 0.7253247499465942, + "learning_rate": 9.656589916206576e-06, + "loss": 0.8418, + "step": 4400 + }, + { + "epoch": 0.24222576916726293, + "grad_norm": 0.7784958481788635, + "learning_rate": 9.656432027086969e-06, + "loss": 0.8541, + "step": 4401 + }, + { + "epoch": 0.2422808079696186, + "grad_norm": 0.8001978397369385, + "learning_rate": 9.656274102970865e-06, + "loss": 0.8888, + "step": 4402 + }, + { + "epoch": 0.24233584677197423, + "grad_norm": 0.7535765767097473, + "learning_rate": 9.656116143859448e-06, + "loss": 0.7691, + "step": 4403 + }, + { + "epoch": 0.2423908855743299, + "grad_norm": 0.6554346680641174, + "learning_rate": 9.655958149753913e-06, + "loss": 0.7592, + "step": 4404 + }, + { + "epoch": 0.24244592437668555, + "grad_norm": 0.8599995374679565, + "learning_rate": 9.655800120655439e-06, + "loss": 0.8396, + "step": 4405 + }, + { + "epoch": 0.2425009631790412, + "grad_norm": 0.8172232508659363, + "learning_rate": 9.65564205656522e-06, + "loss": 0.6931, + "step": 4406 + }, + { + "epoch": 0.24255600198139687, + "grad_norm": 0.8005852699279785, + "learning_rate": 9.65548395748444e-06, + "loss": 0.8344, + "step": 4407 + }, + { + "epoch": 0.24261104078375254, + "grad_norm": 0.7823762893676758, + "learning_rate": 9.65532582341429e-06, + "loss": 0.7991, + "step": 4408 + }, + { + "epoch": 0.2426660795861082, + "grad_norm": 0.7743250727653503, + "learning_rate": 9.655167654355957e-06, + "loss": 0.9048, + "step": 4409 + }, + { + "epoch": 0.24272111838846386, + "grad_norm": 0.9825221300125122, + "learning_rate": 9.655009450310629e-06, + "loss": 0.7491, + "step": 4410 + }, + { + "epoch": 0.24277615719081952, + "grad_norm": 1.2921068668365479, + "learning_rate": 9.654851211279496e-06, + "loss": 0.8175, + "step": 4411 + }, + { + "epoch": 0.24283119599317518, + "grad_norm": 0.8267684578895569, + "learning_rate": 9.65469293726375e-06, + "loss": 0.8896, + "step": 4412 + }, + { + "epoch": 0.24288623479553084, + "grad_norm": 0.8020186424255371, + "learning_rate": 9.654534628264576e-06, + "loss": 0.7145, + "step": 4413 + }, + { + "epoch": 0.2429412735978865, + "grad_norm": 0.8192574977874756, + "learning_rate": 9.654376284283166e-06, + "loss": 0.7451, + "step": 4414 + }, + { + "epoch": 0.24299631240024216, + "grad_norm": 0.7733662128448486, + "learning_rate": 9.65421790532071e-06, + "loss": 0.768, + "step": 4415 + }, + { + "epoch": 0.24305135120259783, + "grad_norm": 0.8342406153678894, + "learning_rate": 9.654059491378396e-06, + "loss": 0.8137, + "step": 4416 + }, + { + "epoch": 0.2431063900049535, + "grad_norm": 1.014755368232727, + "learning_rate": 9.653901042457418e-06, + "loss": 0.8922, + "step": 4417 + }, + { + "epoch": 0.24316142880730915, + "grad_norm": 0.864608645439148, + "learning_rate": 9.653742558558967e-06, + "loss": 0.9412, + "step": 4418 + }, + { + "epoch": 0.2432164676096648, + "grad_norm": 0.7383908033370972, + "learning_rate": 9.65358403968423e-06, + "loss": 0.8261, + "step": 4419 + }, + { + "epoch": 0.24327150641202047, + "grad_norm": 0.7464672923088074, + "learning_rate": 9.653425485834403e-06, + "loss": 0.7074, + "step": 4420 + }, + { + "epoch": 0.24332654521437613, + "grad_norm": 0.7010141611099243, + "learning_rate": 9.653266897010676e-06, + "loss": 0.6849, + "step": 4421 + }, + { + "epoch": 0.2433815840167318, + "grad_norm": 0.7135268449783325, + "learning_rate": 9.653108273214239e-06, + "loss": 0.8228, + "step": 4422 + }, + { + "epoch": 0.24343662281908746, + "grad_norm": 0.8061006665229797, + "learning_rate": 9.652949614446287e-06, + "loss": 0.8345, + "step": 4423 + }, + { + "epoch": 0.24349166162144312, + "grad_norm": 0.6954759955406189, + "learning_rate": 9.652790920708011e-06, + "loss": 0.7189, + "step": 4424 + }, + { + "epoch": 0.24354670042379878, + "grad_norm": 0.8669333457946777, + "learning_rate": 9.652632192000603e-06, + "loss": 0.8872, + "step": 4425 + }, + { + "epoch": 0.24360173922615444, + "grad_norm": 0.7445051670074463, + "learning_rate": 9.652473428325258e-06, + "loss": 0.826, + "step": 4426 + }, + { + "epoch": 0.2436567780285101, + "grad_norm": 0.7444632649421692, + "learning_rate": 9.652314629683165e-06, + "loss": 0.8568, + "step": 4427 + }, + { + "epoch": 0.24371181683086576, + "grad_norm": 0.7160165309906006, + "learning_rate": 9.652155796075524e-06, + "loss": 0.799, + "step": 4428 + }, + { + "epoch": 0.24376685563322142, + "grad_norm": 0.7098904252052307, + "learning_rate": 9.651996927503526e-06, + "loss": 0.8148, + "step": 4429 + }, + { + "epoch": 0.24382189443557709, + "grad_norm": 0.7911115288734436, + "learning_rate": 9.651838023968363e-06, + "loss": 0.8279, + "step": 4430 + }, + { + "epoch": 0.24387693323793275, + "grad_norm": 0.8887501955032349, + "learning_rate": 9.651679085471229e-06, + "loss": 0.8464, + "step": 4431 + }, + { + "epoch": 0.2439319720402884, + "grad_norm": 0.8343196511268616, + "learning_rate": 9.651520112013321e-06, + "loss": 0.7364, + "step": 4432 + }, + { + "epoch": 0.24398701084264407, + "grad_norm": 0.7279361486434937, + "learning_rate": 9.651361103595835e-06, + "loss": 0.7958, + "step": 4433 + }, + { + "epoch": 0.24404204964499973, + "grad_norm": 0.8221089243888855, + "learning_rate": 9.651202060219962e-06, + "loss": 0.7753, + "step": 4434 + }, + { + "epoch": 0.2440970884473554, + "grad_norm": 0.7205086350440979, + "learning_rate": 9.6510429818869e-06, + "loss": 0.7411, + "step": 4435 + }, + { + "epoch": 0.24415212724971105, + "grad_norm": 0.854967474937439, + "learning_rate": 9.650883868597845e-06, + "loss": 0.8192, + "step": 4436 + }, + { + "epoch": 0.24420716605206672, + "grad_norm": 0.7622473835945129, + "learning_rate": 9.65072472035399e-06, + "loss": 0.7645, + "step": 4437 + }, + { + "epoch": 0.24426220485442238, + "grad_norm": 0.7430302500724792, + "learning_rate": 9.650565537156533e-06, + "loss": 0.7817, + "step": 4438 + }, + { + "epoch": 0.24431724365677804, + "grad_norm": 0.8022677898406982, + "learning_rate": 9.650406319006672e-06, + "loss": 0.8035, + "step": 4439 + }, + { + "epoch": 0.2443722824591337, + "grad_norm": 0.7346476912498474, + "learning_rate": 9.6502470659056e-06, + "loss": 0.826, + "step": 4440 + }, + { + "epoch": 0.24442732126148936, + "grad_norm": 0.8393376469612122, + "learning_rate": 9.650087777854517e-06, + "loss": 0.8073, + "step": 4441 + }, + { + "epoch": 0.24448236006384502, + "grad_norm": 0.7920215129852295, + "learning_rate": 9.649928454854618e-06, + "loss": 0.7774, + "step": 4442 + }, + { + "epoch": 0.24453739886620068, + "grad_norm": 0.8192804455757141, + "learning_rate": 9.649769096907102e-06, + "loss": 0.7817, + "step": 4443 + }, + { + "epoch": 0.24459243766855635, + "grad_norm": 0.7727654576301575, + "learning_rate": 9.649609704013167e-06, + "loss": 0.8201, + "step": 4444 + }, + { + "epoch": 0.244647476470912, + "grad_norm": 0.8005746603012085, + "learning_rate": 9.649450276174008e-06, + "loss": 0.8893, + "step": 4445 + }, + { + "epoch": 0.24470251527326764, + "grad_norm": 0.9029125571250916, + "learning_rate": 9.649290813390828e-06, + "loss": 0.7735, + "step": 4446 + }, + { + "epoch": 0.2447575540756233, + "grad_norm": 0.8336170315742493, + "learning_rate": 9.64913131566482e-06, + "loss": 0.7505, + "step": 4447 + }, + { + "epoch": 0.24481259287797896, + "grad_norm": 1.0272265672683716, + "learning_rate": 9.648971782997188e-06, + "loss": 0.8371, + "step": 4448 + }, + { + "epoch": 0.24486763168033462, + "grad_norm": 0.8095843195915222, + "learning_rate": 9.648812215389128e-06, + "loss": 0.7599, + "step": 4449 + }, + { + "epoch": 0.24492267048269029, + "grad_norm": 0.7690166234970093, + "learning_rate": 9.648652612841837e-06, + "loss": 0.8172, + "step": 4450 + }, + { + "epoch": 0.24497770928504595, + "grad_norm": 0.8282617926597595, + "learning_rate": 9.64849297535652e-06, + "loss": 0.8477, + "step": 4451 + }, + { + "epoch": 0.2450327480874016, + "grad_norm": 0.8307822346687317, + "learning_rate": 9.648333302934373e-06, + "loss": 0.7744, + "step": 4452 + }, + { + "epoch": 0.24508778688975727, + "grad_norm": 0.7619080543518066, + "learning_rate": 9.6481735955766e-06, + "loss": 0.8417, + "step": 4453 + }, + { + "epoch": 0.24514282569211293, + "grad_norm": 0.7879447937011719, + "learning_rate": 9.648013853284396e-06, + "loss": 0.7799, + "step": 4454 + }, + { + "epoch": 0.2451978644944686, + "grad_norm": 0.7352256774902344, + "learning_rate": 9.647854076058965e-06, + "loss": 0.8386, + "step": 4455 + }, + { + "epoch": 0.24525290329682425, + "grad_norm": 0.8318933248519897, + "learning_rate": 9.647694263901507e-06, + "loss": 0.7631, + "step": 4456 + }, + { + "epoch": 0.24530794209917992, + "grad_norm": 0.8609912395477295, + "learning_rate": 9.647534416813221e-06, + "loss": 0.7479, + "step": 4457 + }, + { + "epoch": 0.24536298090153558, + "grad_norm": 0.9590480327606201, + "learning_rate": 9.647374534795311e-06, + "loss": 0.8543, + "step": 4458 + }, + { + "epoch": 0.24541801970389124, + "grad_norm": 0.7902723550796509, + "learning_rate": 9.647214617848979e-06, + "loss": 0.6796, + "step": 4459 + }, + { + "epoch": 0.2454730585062469, + "grad_norm": 0.7725642919540405, + "learning_rate": 9.647054665975427e-06, + "loss": 0.7563, + "step": 4460 + }, + { + "epoch": 0.24552809730860256, + "grad_norm": 0.8387014269828796, + "learning_rate": 9.646894679175853e-06, + "loss": 0.8184, + "step": 4461 + }, + { + "epoch": 0.24558313611095822, + "grad_norm": 0.9200852513313293, + "learning_rate": 9.646734657451464e-06, + "loss": 0.8436, + "step": 4462 + }, + { + "epoch": 0.24563817491331388, + "grad_norm": 0.7565840482711792, + "learning_rate": 9.646574600803462e-06, + "loss": 0.7393, + "step": 4463 + }, + { + "epoch": 0.24569321371566955, + "grad_norm": 0.7685559988021851, + "learning_rate": 9.646414509233048e-06, + "loss": 0.7836, + "step": 4464 + }, + { + "epoch": 0.2457482525180252, + "grad_norm": 0.8172003030776978, + "learning_rate": 9.646254382741428e-06, + "loss": 0.787, + "step": 4465 + }, + { + "epoch": 0.24580329132038087, + "grad_norm": 0.902632474899292, + "learning_rate": 9.646094221329802e-06, + "loss": 0.7139, + "step": 4466 + }, + { + "epoch": 0.24585833012273653, + "grad_norm": 0.7810692191123962, + "learning_rate": 9.645934024999374e-06, + "loss": 0.6904, + "step": 4467 + }, + { + "epoch": 0.2459133689250922, + "grad_norm": 0.7242134213447571, + "learning_rate": 9.645773793751352e-06, + "loss": 0.7035, + "step": 4468 + }, + { + "epoch": 0.24596840772744785, + "grad_norm": 0.7192920446395874, + "learning_rate": 9.645613527586938e-06, + "loss": 0.7081, + "step": 4469 + }, + { + "epoch": 0.2460234465298035, + "grad_norm": 0.7613840103149414, + "learning_rate": 9.645453226507336e-06, + "loss": 0.8066, + "step": 4470 + }, + { + "epoch": 0.24607848533215917, + "grad_norm": 0.8154922127723694, + "learning_rate": 9.64529289051375e-06, + "loss": 0.812, + "step": 4471 + }, + { + "epoch": 0.24613352413451484, + "grad_norm": 0.9521573185920715, + "learning_rate": 9.645132519607387e-06, + "loss": 0.7456, + "step": 4472 + }, + { + "epoch": 0.2461885629368705, + "grad_norm": 0.785943329334259, + "learning_rate": 9.64497211378945e-06, + "loss": 0.832, + "step": 4473 + }, + { + "epoch": 0.24624360173922616, + "grad_norm": 0.7675127983093262, + "learning_rate": 9.644811673061148e-06, + "loss": 0.7984, + "step": 4474 + }, + { + "epoch": 0.24629864054158182, + "grad_norm": 0.7317580580711365, + "learning_rate": 9.644651197423683e-06, + "loss": 0.7634, + "step": 4475 + }, + { + "epoch": 0.24635367934393748, + "grad_norm": 0.744937539100647, + "learning_rate": 9.644490686878265e-06, + "loss": 0.729, + "step": 4476 + }, + { + "epoch": 0.24640871814629314, + "grad_norm": 0.7472458481788635, + "learning_rate": 9.644330141426097e-06, + "loss": 0.7517, + "step": 4477 + }, + { + "epoch": 0.2464637569486488, + "grad_norm": 0.8379414677619934, + "learning_rate": 9.644169561068387e-06, + "loss": 0.8008, + "step": 4478 + }, + { + "epoch": 0.24651879575100447, + "grad_norm": 0.8845154047012329, + "learning_rate": 9.64400894580634e-06, + "loss": 0.8135, + "step": 4479 + }, + { + "epoch": 0.24657383455336013, + "grad_norm": 0.7394443154335022, + "learning_rate": 9.643848295641167e-06, + "loss": 0.7697, + "step": 4480 + }, + { + "epoch": 0.2466288733557158, + "grad_norm": 0.8840840458869934, + "learning_rate": 9.643687610574073e-06, + "loss": 0.825, + "step": 4481 + }, + { + "epoch": 0.24668391215807145, + "grad_norm": 0.7924874424934387, + "learning_rate": 9.643526890606265e-06, + "loss": 0.793, + "step": 4482 + }, + { + "epoch": 0.2467389509604271, + "grad_norm": 0.7966769933700562, + "learning_rate": 9.643366135738951e-06, + "loss": 0.8042, + "step": 4483 + }, + { + "epoch": 0.24679398976278277, + "grad_norm": 0.911756694316864, + "learning_rate": 9.643205345973343e-06, + "loss": 0.7801, + "step": 4484 + }, + { + "epoch": 0.24684902856513843, + "grad_norm": 0.903378963470459, + "learning_rate": 9.643044521310645e-06, + "loss": 0.7863, + "step": 4485 + }, + { + "epoch": 0.2469040673674941, + "grad_norm": 0.9021226167678833, + "learning_rate": 9.642883661752067e-06, + "loss": 0.8005, + "step": 4486 + }, + { + "epoch": 0.24695910616984976, + "grad_norm": 0.8853413462638855, + "learning_rate": 9.64272276729882e-06, + "loss": 0.8371, + "step": 4487 + }, + { + "epoch": 0.24701414497220542, + "grad_norm": 1.0654630661010742, + "learning_rate": 9.642561837952108e-06, + "loss": 0.92, + "step": 4488 + }, + { + "epoch": 0.24706918377456105, + "grad_norm": 0.8663573265075684, + "learning_rate": 9.642400873713146e-06, + "loss": 0.8066, + "step": 4489 + }, + { + "epoch": 0.2471242225769167, + "grad_norm": 0.7483134269714355, + "learning_rate": 9.642239874583143e-06, + "loss": 0.9013, + "step": 4490 + }, + { + "epoch": 0.24717926137927237, + "grad_norm": 0.7582293748855591, + "learning_rate": 9.642078840563306e-06, + "loss": 0.7795, + "step": 4491 + }, + { + "epoch": 0.24723430018162804, + "grad_norm": 0.8276637196540833, + "learning_rate": 9.641917771654848e-06, + "loss": 0.7756, + "step": 4492 + }, + { + "epoch": 0.2472893389839837, + "grad_norm": 0.697088360786438, + "learning_rate": 9.641756667858976e-06, + "loss": 0.7092, + "step": 4493 + }, + { + "epoch": 0.24734437778633936, + "grad_norm": 0.8960816860198975, + "learning_rate": 9.641595529176907e-06, + "loss": 0.8835, + "step": 4494 + }, + { + "epoch": 0.24739941658869502, + "grad_norm": 0.9210898280143738, + "learning_rate": 9.641434355609846e-06, + "loss": 0.7881, + "step": 4495 + }, + { + "epoch": 0.24745445539105068, + "grad_norm": 0.7205467820167542, + "learning_rate": 9.64127314715901e-06, + "loss": 0.7204, + "step": 4496 + }, + { + "epoch": 0.24750949419340634, + "grad_norm": 0.7313701510429382, + "learning_rate": 9.641111903825603e-06, + "loss": 0.8296, + "step": 4497 + }, + { + "epoch": 0.247564532995762, + "grad_norm": 0.771159827709198, + "learning_rate": 9.640950625610845e-06, + "loss": 0.7974, + "step": 4498 + }, + { + "epoch": 0.24761957179811767, + "grad_norm": 0.9227705597877502, + "learning_rate": 9.64078931251594e-06, + "loss": 0.9215, + "step": 4499 + }, + { + "epoch": 0.24767461060047333, + "grad_norm": 0.7569915652275085, + "learning_rate": 9.64062796454211e-06, + "loss": 0.83, + "step": 4500 + }, + { + "epoch": 0.247729649402829, + "grad_norm": 0.7453131675720215, + "learning_rate": 9.64046658169056e-06, + "loss": 0.6747, + "step": 4501 + }, + { + "epoch": 0.24778468820518465, + "grad_norm": 0.7228132486343384, + "learning_rate": 9.640305163962504e-06, + "loss": 0.7535, + "step": 4502 + }, + { + "epoch": 0.2478397270075403, + "grad_norm": 0.8160690069198608, + "learning_rate": 9.640143711359159e-06, + "loss": 0.8655, + "step": 4503 + }, + { + "epoch": 0.24789476580989597, + "grad_norm": 0.7641691565513611, + "learning_rate": 9.639982223881735e-06, + "loss": 0.8353, + "step": 4504 + }, + { + "epoch": 0.24794980461225163, + "grad_norm": 0.8669107556343079, + "learning_rate": 9.639820701531445e-06, + "loss": 0.8614, + "step": 4505 + }, + { + "epoch": 0.2480048434146073, + "grad_norm": 0.7433111667633057, + "learning_rate": 9.639659144309508e-06, + "loss": 0.6891, + "step": 4506 + }, + { + "epoch": 0.24805988221696296, + "grad_norm": 1.4303346872329712, + "learning_rate": 9.639497552217131e-06, + "loss": 0.8016, + "step": 4507 + }, + { + "epoch": 0.24811492101931862, + "grad_norm": 0.8684772253036499, + "learning_rate": 9.639335925255535e-06, + "loss": 0.8324, + "step": 4508 + }, + { + "epoch": 0.24816995982167428, + "grad_norm": 0.9222162365913391, + "learning_rate": 9.639174263425932e-06, + "loss": 0.8715, + "step": 4509 + }, + { + "epoch": 0.24822499862402994, + "grad_norm": 0.9789180755615234, + "learning_rate": 9.639012566729535e-06, + "loss": 0.823, + "step": 4510 + }, + { + "epoch": 0.2482800374263856, + "grad_norm": 0.8475140333175659, + "learning_rate": 9.638850835167564e-06, + "loss": 0.768, + "step": 4511 + }, + { + "epoch": 0.24833507622874126, + "grad_norm": 0.7943722605705261, + "learning_rate": 9.63868906874123e-06, + "loss": 0.788, + "step": 4512 + }, + { + "epoch": 0.24839011503109693, + "grad_norm": 0.8723915815353394, + "learning_rate": 9.63852726745175e-06, + "loss": 0.7865, + "step": 4513 + }, + { + "epoch": 0.2484451538334526, + "grad_norm": 0.837001383304596, + "learning_rate": 9.638365431300342e-06, + "loss": 0.7799, + "step": 4514 + }, + { + "epoch": 0.24850019263580825, + "grad_norm": 0.7992665767669678, + "learning_rate": 9.638203560288222e-06, + "loss": 0.8951, + "step": 4515 + }, + { + "epoch": 0.2485552314381639, + "grad_norm": 0.8712993264198303, + "learning_rate": 9.638041654416603e-06, + "loss": 0.8157, + "step": 4516 + }, + { + "epoch": 0.24861027024051957, + "grad_norm": 0.7176356911659241, + "learning_rate": 9.637879713686706e-06, + "loss": 0.8197, + "step": 4517 + }, + { + "epoch": 0.24866530904287523, + "grad_norm": 0.7624368071556091, + "learning_rate": 9.637717738099747e-06, + "loss": 0.7545, + "step": 4518 + }, + { + "epoch": 0.2487203478452309, + "grad_norm": 0.857222318649292, + "learning_rate": 9.637555727656943e-06, + "loss": 0.8146, + "step": 4519 + }, + { + "epoch": 0.24877538664758655, + "grad_norm": 0.7461313605308533, + "learning_rate": 9.637393682359511e-06, + "loss": 0.8569, + "step": 4520 + }, + { + "epoch": 0.24883042544994222, + "grad_norm": 0.8491896986961365, + "learning_rate": 9.637231602208668e-06, + "loss": 0.863, + "step": 4521 + }, + { + "epoch": 0.24888546425229788, + "grad_norm": 0.8139386177062988, + "learning_rate": 9.637069487205635e-06, + "loss": 0.7105, + "step": 4522 + }, + { + "epoch": 0.24894050305465354, + "grad_norm": 0.7782894968986511, + "learning_rate": 9.636907337351629e-06, + "loss": 0.8044, + "step": 4523 + }, + { + "epoch": 0.2489955418570092, + "grad_norm": 0.8225486874580383, + "learning_rate": 9.636745152647868e-06, + "loss": 0.7877, + "step": 4524 + }, + { + "epoch": 0.24905058065936486, + "grad_norm": 0.9087927341461182, + "learning_rate": 9.636582933095573e-06, + "loss": 0.8017, + "step": 4525 + }, + { + "epoch": 0.24910561946172052, + "grad_norm": 0.7392508387565613, + "learning_rate": 9.636420678695962e-06, + "loss": 0.7953, + "step": 4526 + }, + { + "epoch": 0.24916065826407618, + "grad_norm": 0.7906273007392883, + "learning_rate": 9.636258389450253e-06, + "loss": 0.9491, + "step": 4527 + }, + { + "epoch": 0.24921569706643185, + "grad_norm": 0.840394139289856, + "learning_rate": 9.636096065359666e-06, + "loss": 0.8621, + "step": 4528 + }, + { + "epoch": 0.2492707358687875, + "grad_norm": 0.7923862934112549, + "learning_rate": 9.635933706425424e-06, + "loss": 0.8215, + "step": 4529 + }, + { + "epoch": 0.24932577467114317, + "grad_norm": 0.8372805714607239, + "learning_rate": 9.635771312648744e-06, + "loss": 0.8845, + "step": 4530 + }, + { + "epoch": 0.24938081347349883, + "grad_norm": 0.7569165229797363, + "learning_rate": 9.635608884030848e-06, + "loss": 0.8406, + "step": 4531 + }, + { + "epoch": 0.24943585227585446, + "grad_norm": 0.8260865807533264, + "learning_rate": 9.635446420572956e-06, + "loss": 0.8418, + "step": 4532 + }, + { + "epoch": 0.24949089107821013, + "grad_norm": 0.6841318607330322, + "learning_rate": 9.635283922276291e-06, + "loss": 0.6732, + "step": 4533 + }, + { + "epoch": 0.2495459298805658, + "grad_norm": 0.7055326104164124, + "learning_rate": 9.635121389142072e-06, + "loss": 0.7702, + "step": 4534 + }, + { + "epoch": 0.24960096868292145, + "grad_norm": 0.7293457388877869, + "learning_rate": 9.63495882117152e-06, + "loss": 0.6836, + "step": 4535 + }, + { + "epoch": 0.2496560074852771, + "grad_norm": 0.7411924004554749, + "learning_rate": 9.63479621836586e-06, + "loss": 0.8686, + "step": 4536 + }, + { + "epoch": 0.24971104628763277, + "grad_norm": 0.7864643931388855, + "learning_rate": 9.634633580726313e-06, + "loss": 0.7801, + "step": 4537 + }, + { + "epoch": 0.24976608508998843, + "grad_norm": 0.9730797410011292, + "learning_rate": 9.634470908254099e-06, + "loss": 0.8362, + "step": 4538 + }, + { + "epoch": 0.2498211238923441, + "grad_norm": 0.8390370011329651, + "learning_rate": 9.634308200950442e-06, + "loss": 0.8079, + "step": 4539 + }, + { + "epoch": 0.24987616269469975, + "grad_norm": 0.8951246738433838, + "learning_rate": 9.634145458816566e-06, + "loss": 0.7662, + "step": 4540 + }, + { + "epoch": 0.24993120149705542, + "grad_norm": 0.7654157280921936, + "learning_rate": 9.633982681853693e-06, + "loss": 0.8699, + "step": 4541 + }, + { + "epoch": 0.24998624029941108, + "grad_norm": 0.8152109980583191, + "learning_rate": 9.633819870063046e-06, + "loss": 0.7875, + "step": 4542 + }, + { + "epoch": 0.25004127910176677, + "grad_norm": 0.9407321214675903, + "learning_rate": 9.63365702344585e-06, + "loss": 0.7708, + "step": 4543 + }, + { + "epoch": 0.2500963179041224, + "grad_norm": 0.8169927597045898, + "learning_rate": 9.633494142003327e-06, + "loss": 0.8078, + "step": 4544 + }, + { + "epoch": 0.2501513567064781, + "grad_norm": 0.7380755543708801, + "learning_rate": 9.633331225736704e-06, + "loss": 0.7818, + "step": 4545 + }, + { + "epoch": 0.2502063955088337, + "grad_norm": 0.8124812841415405, + "learning_rate": 9.633168274647203e-06, + "loss": 0.8133, + "step": 4546 + }, + { + "epoch": 0.2502614343111894, + "grad_norm": 0.8511367440223694, + "learning_rate": 9.63300528873605e-06, + "loss": 0.7747, + "step": 4547 + }, + { + "epoch": 0.25031647311354505, + "grad_norm": 0.7305121421813965, + "learning_rate": 9.632842268004469e-06, + "loss": 0.8479, + "step": 4548 + }, + { + "epoch": 0.25037151191590074, + "grad_norm": 0.7127692103385925, + "learning_rate": 9.632679212453686e-06, + "loss": 0.8514, + "step": 4549 + }, + { + "epoch": 0.25042655071825637, + "grad_norm": 0.8251872062683105, + "learning_rate": 9.632516122084926e-06, + "loss": 0.7686, + "step": 4550 + }, + { + "epoch": 0.25048158952061206, + "grad_norm": 0.6756613850593567, + "learning_rate": 9.632352996899413e-06, + "loss": 0.5959, + "step": 4551 + }, + { + "epoch": 0.2505366283229677, + "grad_norm": 0.9266120791435242, + "learning_rate": 9.632189836898377e-06, + "loss": 0.7889, + "step": 4552 + }, + { + "epoch": 0.2505916671253233, + "grad_norm": 0.769890546798706, + "learning_rate": 9.63202664208304e-06, + "loss": 0.7864, + "step": 4553 + }, + { + "epoch": 0.250646705927679, + "grad_norm": 0.7314025163650513, + "learning_rate": 9.631863412454634e-06, + "loss": 0.8088, + "step": 4554 + }, + { + "epoch": 0.25070174473003465, + "grad_norm": 0.818317711353302, + "learning_rate": 9.63170014801438e-06, + "loss": 0.7096, + "step": 4555 + }, + { + "epoch": 0.25075678353239034, + "grad_norm": 0.7538807392120361, + "learning_rate": 9.631536848763508e-06, + "loss": 0.7779, + "step": 4556 + }, + { + "epoch": 0.25081182233474597, + "grad_norm": 0.7658100128173828, + "learning_rate": 9.631373514703247e-06, + "loss": 0.8535, + "step": 4557 + }, + { + "epoch": 0.25086686113710166, + "grad_norm": 0.8019290566444397, + "learning_rate": 9.631210145834819e-06, + "loss": 0.8141, + "step": 4558 + }, + { + "epoch": 0.2509218999394573, + "grad_norm": 0.7257653474807739, + "learning_rate": 9.631046742159456e-06, + "loss": 0.7451, + "step": 4559 + }, + { + "epoch": 0.250976938741813, + "grad_norm": 0.7546024918556213, + "learning_rate": 9.630883303678386e-06, + "loss": 0.7707, + "step": 4560 + }, + { + "epoch": 0.2510319775441686, + "grad_norm": 0.7288938760757446, + "learning_rate": 9.630719830392835e-06, + "loss": 0.7362, + "step": 4561 + }, + { + "epoch": 0.2510870163465243, + "grad_norm": 0.7814223170280457, + "learning_rate": 9.630556322304036e-06, + "loss": 0.8514, + "step": 4562 + }, + { + "epoch": 0.25114205514887994, + "grad_norm": 0.7561381459236145, + "learning_rate": 9.630392779413214e-06, + "loss": 0.7659, + "step": 4563 + }, + { + "epoch": 0.25119709395123563, + "grad_norm": 0.750641942024231, + "learning_rate": 9.6302292017216e-06, + "loss": 0.8496, + "step": 4564 + }, + { + "epoch": 0.25125213275359126, + "grad_norm": 0.832155704498291, + "learning_rate": 9.630065589230422e-06, + "loss": 0.7778, + "step": 4565 + }, + { + "epoch": 0.25130717155594695, + "grad_norm": 0.8202440142631531, + "learning_rate": 9.62990194194091e-06, + "loss": 0.8962, + "step": 4566 + }, + { + "epoch": 0.2513622103583026, + "grad_norm": 0.8777977824211121, + "learning_rate": 9.629738259854295e-06, + "loss": 0.7215, + "step": 4567 + }, + { + "epoch": 0.2514172491606583, + "grad_norm": 1.1868599653244019, + "learning_rate": 9.629574542971806e-06, + "loss": 0.8238, + "step": 4568 + }, + { + "epoch": 0.2514722879630139, + "grad_norm": 0.9128753542900085, + "learning_rate": 9.629410791294675e-06, + "loss": 0.7638, + "step": 4569 + }, + { + "epoch": 0.2515273267653696, + "grad_norm": 0.7350082993507385, + "learning_rate": 9.629247004824132e-06, + "loss": 0.8041, + "step": 4570 + }, + { + "epoch": 0.25158236556772523, + "grad_norm": 0.7279660701751709, + "learning_rate": 9.629083183561407e-06, + "loss": 0.7377, + "step": 4571 + }, + { + "epoch": 0.2516374043700809, + "grad_norm": 0.8570461273193359, + "learning_rate": 9.628919327507732e-06, + "loss": 0.8106, + "step": 4572 + }, + { + "epoch": 0.25169244317243655, + "grad_norm": 0.8998312950134277, + "learning_rate": 9.62875543666434e-06, + "loss": 0.8171, + "step": 4573 + }, + { + "epoch": 0.25174748197479224, + "grad_norm": 0.7631624937057495, + "learning_rate": 9.628591511032456e-06, + "loss": 0.7871, + "step": 4574 + }, + { + "epoch": 0.2518025207771479, + "grad_norm": 0.7752320766448975, + "learning_rate": 9.628427550613322e-06, + "loss": 0.8241, + "step": 4575 + }, + { + "epoch": 0.25185755957950356, + "grad_norm": 0.8741563558578491, + "learning_rate": 9.628263555408163e-06, + "loss": 0.7312, + "step": 4576 + }, + { + "epoch": 0.2519125983818592, + "grad_norm": 0.8615008592605591, + "learning_rate": 9.628099525418216e-06, + "loss": 0.8586, + "step": 4577 + }, + { + "epoch": 0.2519676371842149, + "grad_norm": 0.8273662328720093, + "learning_rate": 9.62793546064471e-06, + "loss": 0.7838, + "step": 4578 + }, + { + "epoch": 0.2520226759865705, + "grad_norm": 0.7454090118408203, + "learning_rate": 9.627771361088882e-06, + "loss": 0.8461, + "step": 4579 + }, + { + "epoch": 0.2520777147889262, + "grad_norm": 0.8225379586219788, + "learning_rate": 9.627607226751962e-06, + "loss": 0.7792, + "step": 4580 + }, + { + "epoch": 0.25213275359128184, + "grad_norm": 0.8655416369438171, + "learning_rate": 9.627443057635184e-06, + "loss": 0.8165, + "step": 4581 + }, + { + "epoch": 0.25218779239363753, + "grad_norm": 0.7735984921455383, + "learning_rate": 9.627278853739783e-06, + "loss": 0.8208, + "step": 4582 + }, + { + "epoch": 0.25224283119599317, + "grad_norm": 0.8293350338935852, + "learning_rate": 9.627114615066994e-06, + "loss": 0.7394, + "step": 4583 + }, + { + "epoch": 0.25229786999834886, + "grad_norm": 0.7840214371681213, + "learning_rate": 9.626950341618048e-06, + "loss": 0.8522, + "step": 4584 + }, + { + "epoch": 0.2523529088007045, + "grad_norm": 0.7724186182022095, + "learning_rate": 9.626786033394185e-06, + "loss": 0.8175, + "step": 4585 + }, + { + "epoch": 0.2524079476030602, + "grad_norm": 1.0751588344573975, + "learning_rate": 9.626621690396634e-06, + "loss": 0.9229, + "step": 4586 + }, + { + "epoch": 0.2524629864054158, + "grad_norm": 0.7016913294792175, + "learning_rate": 9.626457312626634e-06, + "loss": 0.6883, + "step": 4587 + }, + { + "epoch": 0.2525180252077715, + "grad_norm": 0.918377697467804, + "learning_rate": 9.626292900085419e-06, + "loss": 0.7889, + "step": 4588 + }, + { + "epoch": 0.25257306401012714, + "grad_norm": 1.006564736366272, + "learning_rate": 9.626128452774226e-06, + "loss": 0.7888, + "step": 4589 + }, + { + "epoch": 0.2526281028124828, + "grad_norm": 1.0214998722076416, + "learning_rate": 9.625963970694287e-06, + "loss": 0.768, + "step": 4590 + }, + { + "epoch": 0.25268314161483846, + "grad_norm": 0.7980843186378479, + "learning_rate": 9.625799453846844e-06, + "loss": 0.8662, + "step": 4591 + }, + { + "epoch": 0.25273818041719415, + "grad_norm": 0.734582245349884, + "learning_rate": 9.625634902233128e-06, + "loss": 0.759, + "step": 4592 + }, + { + "epoch": 0.2527932192195498, + "grad_norm": 0.7185904383659363, + "learning_rate": 9.62547031585438e-06, + "loss": 0.774, + "step": 4593 + }, + { + "epoch": 0.25284825802190547, + "grad_norm": 0.7356622219085693, + "learning_rate": 9.625305694711835e-06, + "loss": 0.7435, + "step": 4594 + }, + { + "epoch": 0.2529032968242611, + "grad_norm": 0.7589355707168579, + "learning_rate": 9.62514103880673e-06, + "loss": 0.807, + "step": 4595 + }, + { + "epoch": 0.25295833562661674, + "grad_norm": 0.889228880405426, + "learning_rate": 9.624976348140305e-06, + "loss": 0.8609, + "step": 4596 + }, + { + "epoch": 0.2530133744289724, + "grad_norm": 0.7546125650405884, + "learning_rate": 9.624811622713793e-06, + "loss": 0.8379, + "step": 4597 + }, + { + "epoch": 0.25306841323132806, + "grad_norm": 0.8262770175933838, + "learning_rate": 9.624646862528436e-06, + "loss": 0.7611, + "step": 4598 + }, + { + "epoch": 0.25312345203368375, + "grad_norm": 0.8876076936721802, + "learning_rate": 9.624482067585472e-06, + "loss": 0.8106, + "step": 4599 + }, + { + "epoch": 0.2531784908360394, + "grad_norm": 0.7045544981956482, + "learning_rate": 9.624317237886137e-06, + "loss": 0.7121, + "step": 4600 + }, + { + "epoch": 0.25323352963839507, + "grad_norm": 0.7693355083465576, + "learning_rate": 9.624152373431672e-06, + "loss": 0.8052, + "step": 4601 + }, + { + "epoch": 0.2532885684407507, + "grad_norm": 0.8072683811187744, + "learning_rate": 9.623987474223316e-06, + "loss": 0.8543, + "step": 4602 + }, + { + "epoch": 0.2533436072431064, + "grad_norm": 0.8158687949180603, + "learning_rate": 9.62382254026231e-06, + "loss": 0.6922, + "step": 4603 + }, + { + "epoch": 0.25339864604546203, + "grad_norm": 0.7688641548156738, + "learning_rate": 9.623657571549887e-06, + "loss": 0.7198, + "step": 4604 + }, + { + "epoch": 0.2534536848478177, + "grad_norm": 0.7806578278541565, + "learning_rate": 9.623492568087293e-06, + "loss": 0.8539, + "step": 4605 + }, + { + "epoch": 0.25350872365017335, + "grad_norm": 0.9557347893714905, + "learning_rate": 9.623327529875769e-06, + "loss": 0.6996, + "step": 4606 + }, + { + "epoch": 0.25356376245252904, + "grad_norm": 0.9465067386627197, + "learning_rate": 9.62316245691655e-06, + "loss": 0.8756, + "step": 4607 + }, + { + "epoch": 0.2536188012548847, + "grad_norm": 0.8029165863990784, + "learning_rate": 9.62299734921088e-06, + "loss": 0.8573, + "step": 4608 + }, + { + "epoch": 0.25367384005724036, + "grad_norm": 0.7530128955841064, + "learning_rate": 9.62283220676e-06, + "loss": 0.7466, + "step": 4609 + }, + { + "epoch": 0.253728878859596, + "grad_norm": 0.6704453825950623, + "learning_rate": 9.622667029565151e-06, + "loss": 0.6512, + "step": 4610 + }, + { + "epoch": 0.2537839176619517, + "grad_norm": 0.7162728309631348, + "learning_rate": 9.622501817627574e-06, + "loss": 0.7615, + "step": 4611 + }, + { + "epoch": 0.2538389564643073, + "grad_norm": 0.7599188089370728, + "learning_rate": 9.622336570948509e-06, + "loss": 0.8463, + "step": 4612 + }, + { + "epoch": 0.253893995266663, + "grad_norm": 0.7922326922416687, + "learning_rate": 9.6221712895292e-06, + "loss": 0.9221, + "step": 4613 + }, + { + "epoch": 0.25394903406901864, + "grad_norm": 1.4635218381881714, + "learning_rate": 9.622005973370892e-06, + "loss": 0.9159, + "step": 4614 + }, + { + "epoch": 0.25400407287137433, + "grad_norm": 0.8695057034492493, + "learning_rate": 9.62184062247482e-06, + "loss": 0.6792, + "step": 4615 + }, + { + "epoch": 0.25405911167372996, + "grad_norm": 0.8070930242538452, + "learning_rate": 9.621675236842235e-06, + "loss": 0.8257, + "step": 4616 + }, + { + "epoch": 0.25411415047608565, + "grad_norm": 0.8642075061798096, + "learning_rate": 9.621509816474372e-06, + "loss": 0.8223, + "step": 4617 + }, + { + "epoch": 0.2541691892784413, + "grad_norm": 0.7131080031394958, + "learning_rate": 9.621344361372483e-06, + "loss": 0.6831, + "step": 4618 + }, + { + "epoch": 0.254224228080797, + "grad_norm": 0.7582216262817383, + "learning_rate": 9.621178871537804e-06, + "loss": 0.8091, + "step": 4619 + }, + { + "epoch": 0.2542792668831526, + "grad_norm": 0.7705016732215881, + "learning_rate": 9.62101334697158e-06, + "loss": 0.7537, + "step": 4620 + }, + { + "epoch": 0.2543343056855083, + "grad_norm": 0.7638342976570129, + "learning_rate": 9.62084778767506e-06, + "loss": 0.7661, + "step": 4621 + }, + { + "epoch": 0.25438934448786393, + "grad_norm": 0.9296607971191406, + "learning_rate": 9.620682193649482e-06, + "loss": 0.8875, + "step": 4622 + }, + { + "epoch": 0.2544443832902196, + "grad_norm": 0.795394778251648, + "learning_rate": 9.620516564896096e-06, + "loss": 0.6884, + "step": 4623 + }, + { + "epoch": 0.25449942209257526, + "grad_norm": 0.9164957404136658, + "learning_rate": 9.620350901416142e-06, + "loss": 0.8693, + "step": 4624 + }, + { + "epoch": 0.25455446089493095, + "grad_norm": 0.8306281566619873, + "learning_rate": 9.62018520321087e-06, + "loss": 0.8972, + "step": 4625 + }, + { + "epoch": 0.2546094996972866, + "grad_norm": 0.778831422328949, + "learning_rate": 9.620019470281521e-06, + "loss": 0.7574, + "step": 4626 + }, + { + "epoch": 0.25466453849964227, + "grad_norm": 0.9326225519180298, + "learning_rate": 9.619853702629343e-06, + "loss": 0.7712, + "step": 4627 + }, + { + "epoch": 0.2547195773019979, + "grad_norm": 0.8772255182266235, + "learning_rate": 9.619687900255581e-06, + "loss": 0.8241, + "step": 4628 + }, + { + "epoch": 0.2547746161043536, + "grad_norm": 0.8777550458908081, + "learning_rate": 9.619522063161482e-06, + "loss": 0.8724, + "step": 4629 + }, + { + "epoch": 0.2548296549067092, + "grad_norm": 0.8332602381706238, + "learning_rate": 9.61935619134829e-06, + "loss": 0.8716, + "step": 4630 + }, + { + "epoch": 0.2548846937090649, + "grad_norm": 0.8246355056762695, + "learning_rate": 9.619190284817255e-06, + "loss": 0.7789, + "step": 4631 + }, + { + "epoch": 0.25493973251142055, + "grad_norm": 0.7200644612312317, + "learning_rate": 9.61902434356962e-06, + "loss": 0.7956, + "step": 4632 + }, + { + "epoch": 0.25499477131377624, + "grad_norm": 0.827756404876709, + "learning_rate": 9.618858367606638e-06, + "loss": 0.7925, + "step": 4633 + }, + { + "epoch": 0.25504981011613187, + "grad_norm": 0.7749341726303101, + "learning_rate": 9.618692356929551e-06, + "loss": 0.8706, + "step": 4634 + }, + { + "epoch": 0.25510484891848756, + "grad_norm": 0.7233432531356812, + "learning_rate": 9.618526311539608e-06, + "loss": 0.7725, + "step": 4635 + }, + { + "epoch": 0.2551598877208432, + "grad_norm": 0.846340537071228, + "learning_rate": 9.618360231438058e-06, + "loss": 0.8758, + "step": 4636 + }, + { + "epoch": 0.2552149265231989, + "grad_norm": 0.8262908458709717, + "learning_rate": 9.61819411662615e-06, + "loss": 0.7758, + "step": 4637 + }, + { + "epoch": 0.2552699653255545, + "grad_norm": 0.7829110026359558, + "learning_rate": 9.61802796710513e-06, + "loss": 0.8494, + "step": 4638 + }, + { + "epoch": 0.25532500412791015, + "grad_norm": 0.7480815649032593, + "learning_rate": 9.617861782876247e-06, + "loss": 0.7639, + "step": 4639 + }, + { + "epoch": 0.25538004293026584, + "grad_norm": 0.8782994747161865, + "learning_rate": 9.617695563940752e-06, + "loss": 0.9651, + "step": 4640 + }, + { + "epoch": 0.25543508173262147, + "grad_norm": 0.7215868234634399, + "learning_rate": 9.617529310299895e-06, + "loss": 0.7833, + "step": 4641 + }, + { + "epoch": 0.25549012053497716, + "grad_norm": 0.8287535905838013, + "learning_rate": 9.617363021954922e-06, + "loss": 0.901, + "step": 4642 + }, + { + "epoch": 0.2555451593373328, + "grad_norm": 0.7679935097694397, + "learning_rate": 9.617196698907084e-06, + "loss": 0.761, + "step": 4643 + }, + { + "epoch": 0.2556001981396885, + "grad_norm": 0.7765942811965942, + "learning_rate": 9.617030341157632e-06, + "loss": 0.7356, + "step": 4644 + }, + { + "epoch": 0.2556552369420441, + "grad_norm": 0.6964583396911621, + "learning_rate": 9.616863948707816e-06, + "loss": 0.7683, + "step": 4645 + }, + { + "epoch": 0.2557102757443998, + "grad_norm": 0.8031953573226929, + "learning_rate": 9.616697521558886e-06, + "loss": 0.7875, + "step": 4646 + }, + { + "epoch": 0.25576531454675544, + "grad_norm": 0.7155965566635132, + "learning_rate": 9.616531059712094e-06, + "loss": 0.6516, + "step": 4647 + }, + { + "epoch": 0.25582035334911113, + "grad_norm": 0.6870070099830627, + "learning_rate": 9.61636456316869e-06, + "loss": 0.7217, + "step": 4648 + }, + { + "epoch": 0.25587539215146676, + "grad_norm": 0.7686315774917603, + "learning_rate": 9.616198031929926e-06, + "loss": 0.8136, + "step": 4649 + }, + { + "epoch": 0.25593043095382245, + "grad_norm": 0.7532772421836853, + "learning_rate": 9.616031465997054e-06, + "loss": 0.696, + "step": 4650 + }, + { + "epoch": 0.2559854697561781, + "grad_norm": 0.8111574053764343, + "learning_rate": 9.615864865371323e-06, + "loss": 0.8501, + "step": 4651 + }, + { + "epoch": 0.2560405085585338, + "grad_norm": 0.771065890789032, + "learning_rate": 9.615698230053989e-06, + "loss": 0.7417, + "step": 4652 + }, + { + "epoch": 0.2560955473608894, + "grad_norm": 0.7468003034591675, + "learning_rate": 9.6155315600463e-06, + "loss": 0.7303, + "step": 4653 + }, + { + "epoch": 0.2561505861632451, + "grad_norm": 0.8041057586669922, + "learning_rate": 9.615364855349514e-06, + "loss": 0.8689, + "step": 4654 + }, + { + "epoch": 0.25620562496560073, + "grad_norm": 0.8439033627510071, + "learning_rate": 9.61519811596488e-06, + "loss": 0.8654, + "step": 4655 + }, + { + "epoch": 0.2562606637679564, + "grad_norm": 0.7768430113792419, + "learning_rate": 9.615031341893653e-06, + "loss": 0.8789, + "step": 4656 + }, + { + "epoch": 0.25631570257031205, + "grad_norm": 0.712876558303833, + "learning_rate": 9.614864533137086e-06, + "loss": 0.7497, + "step": 4657 + }, + { + "epoch": 0.25637074137266774, + "grad_norm": 0.7586949467658997, + "learning_rate": 9.614697689696431e-06, + "loss": 0.81, + "step": 4658 + }, + { + "epoch": 0.2564257801750234, + "grad_norm": 0.717078447341919, + "learning_rate": 9.614530811572946e-06, + "loss": 0.8023, + "step": 4659 + }, + { + "epoch": 0.25648081897737907, + "grad_norm": 0.7369407415390015, + "learning_rate": 9.61436389876788e-06, + "loss": 0.784, + "step": 4660 + }, + { + "epoch": 0.2565358577797347, + "grad_norm": 0.7536265850067139, + "learning_rate": 9.61419695128249e-06, + "loss": 0.7687, + "step": 4661 + }, + { + "epoch": 0.2565908965820904, + "grad_norm": 0.9718124866485596, + "learning_rate": 9.614029969118033e-06, + "loss": 0.8495, + "step": 4662 + }, + { + "epoch": 0.256645935384446, + "grad_norm": 1.1578630208969116, + "learning_rate": 9.613862952275762e-06, + "loss": 0.9189, + "step": 4663 + }, + { + "epoch": 0.2567009741868017, + "grad_norm": 0.7752498984336853, + "learning_rate": 9.613695900756929e-06, + "loss": 0.7677, + "step": 4664 + }, + { + "epoch": 0.25675601298915735, + "grad_norm": 0.9640393257141113, + "learning_rate": 9.613528814562795e-06, + "loss": 0.719, + "step": 4665 + }, + { + "epoch": 0.25681105179151303, + "grad_norm": 0.7690972089767456, + "learning_rate": 9.613361693694614e-06, + "loss": 0.7977, + "step": 4666 + }, + { + "epoch": 0.25686609059386867, + "grad_norm": 0.8390190601348877, + "learning_rate": 9.61319453815364e-06, + "loss": 0.8032, + "step": 4667 + }, + { + "epoch": 0.25692112939622436, + "grad_norm": 0.8293220400810242, + "learning_rate": 9.613027347941131e-06, + "loss": 0.8645, + "step": 4668 + }, + { + "epoch": 0.25697616819858, + "grad_norm": 0.8020731210708618, + "learning_rate": 9.612860123058344e-06, + "loss": 0.8374, + "step": 4669 + }, + { + "epoch": 0.2570312070009357, + "grad_norm": 0.7756736278533936, + "learning_rate": 9.612692863506534e-06, + "loss": 0.7318, + "step": 4670 + }, + { + "epoch": 0.2570862458032913, + "grad_norm": 0.895416259765625, + "learning_rate": 9.61252556928696e-06, + "loss": 0.9654, + "step": 4671 + }, + { + "epoch": 0.257141284605647, + "grad_norm": 0.8647375106811523, + "learning_rate": 9.61235824040088e-06, + "loss": 0.7411, + "step": 4672 + }, + { + "epoch": 0.25719632340800264, + "grad_norm": 0.6927250623703003, + "learning_rate": 9.612190876849546e-06, + "loss": 0.7558, + "step": 4673 + }, + { + "epoch": 0.2572513622103583, + "grad_norm": 0.7614898085594177, + "learning_rate": 9.612023478634222e-06, + "loss": 0.7696, + "step": 4674 + }, + { + "epoch": 0.25730640101271396, + "grad_norm": 0.7910586595535278, + "learning_rate": 9.611856045756166e-06, + "loss": 0.8207, + "step": 4675 + }, + { + "epoch": 0.25736143981506965, + "grad_norm": 0.7330125570297241, + "learning_rate": 9.611688578216632e-06, + "loss": 0.8615, + "step": 4676 + }, + { + "epoch": 0.2574164786174253, + "grad_norm": 0.7703417539596558, + "learning_rate": 9.611521076016882e-06, + "loss": 0.8321, + "step": 4677 + }, + { + "epoch": 0.25747151741978097, + "grad_norm": 0.7121796607971191, + "learning_rate": 9.611353539158174e-06, + "loss": 0.8228, + "step": 4678 + }, + { + "epoch": 0.2575265562221366, + "grad_norm": 0.8313117027282715, + "learning_rate": 9.611185967641768e-06, + "loss": 0.9012, + "step": 4679 + }, + { + "epoch": 0.2575815950244923, + "grad_norm": 0.806776225566864, + "learning_rate": 9.61101836146892e-06, + "loss": 0.769, + "step": 4680 + }, + { + "epoch": 0.2576366338268479, + "grad_norm": 0.7049515843391418, + "learning_rate": 9.610850720640894e-06, + "loss": 0.7938, + "step": 4681 + }, + { + "epoch": 0.25769167262920356, + "grad_norm": 0.7286638021469116, + "learning_rate": 9.610683045158948e-06, + "loss": 0.8168, + "step": 4682 + }, + { + "epoch": 0.25774671143155925, + "grad_norm": 0.7916898727416992, + "learning_rate": 9.610515335024345e-06, + "loss": 0.7681, + "step": 4683 + }, + { + "epoch": 0.2578017502339149, + "grad_norm": 0.7649673819541931, + "learning_rate": 9.61034759023834e-06, + "loss": 0.7273, + "step": 4684 + }, + { + "epoch": 0.2578567890362706, + "grad_norm": 0.8280686736106873, + "learning_rate": 9.610179810802196e-06, + "loss": 0.7968, + "step": 4685 + }, + { + "epoch": 0.2579118278386262, + "grad_norm": 0.7206569910049438, + "learning_rate": 9.610011996717175e-06, + "loss": 0.7359, + "step": 4686 + }, + { + "epoch": 0.2579668666409819, + "grad_norm": 0.7365424036979675, + "learning_rate": 9.60984414798454e-06, + "loss": 0.7962, + "step": 4687 + }, + { + "epoch": 0.25802190544333753, + "grad_norm": 0.8030344247817993, + "learning_rate": 9.609676264605549e-06, + "loss": 0.7931, + "step": 4688 + }, + { + "epoch": 0.2580769442456932, + "grad_norm": 0.8812693357467651, + "learning_rate": 9.609508346581464e-06, + "loss": 0.8493, + "step": 4689 + }, + { + "epoch": 0.25813198304804885, + "grad_norm": 0.8026734590530396, + "learning_rate": 9.60934039391355e-06, + "loss": 0.8368, + "step": 4690 + }, + { + "epoch": 0.25818702185040454, + "grad_norm": 0.8270768523216248, + "learning_rate": 9.609172406603067e-06, + "loss": 0.9077, + "step": 4691 + }, + { + "epoch": 0.2582420606527602, + "grad_norm": 0.7362856864929199, + "learning_rate": 9.609004384651276e-06, + "loss": 0.7384, + "step": 4692 + }, + { + "epoch": 0.25829709945511586, + "grad_norm": 0.7195929288864136, + "learning_rate": 9.608836328059444e-06, + "loss": 0.8475, + "step": 4693 + }, + { + "epoch": 0.2583521382574715, + "grad_norm": 0.7653167843818665, + "learning_rate": 9.60866823682883e-06, + "loss": 0.7704, + "step": 4694 + }, + { + "epoch": 0.2584071770598272, + "grad_norm": 0.7056792974472046, + "learning_rate": 9.6085001109607e-06, + "loss": 0.7835, + "step": 4695 + }, + { + "epoch": 0.2584622158621828, + "grad_norm": 0.7299804091453552, + "learning_rate": 9.60833195045632e-06, + "loss": 0.7894, + "step": 4696 + }, + { + "epoch": 0.2585172546645385, + "grad_norm": 0.7235645055770874, + "learning_rate": 9.608163755316948e-06, + "loss": 0.8113, + "step": 4697 + }, + { + "epoch": 0.25857229346689414, + "grad_norm": 0.7066782116889954, + "learning_rate": 9.60799552554385e-06, + "loss": 0.739, + "step": 4698 + }, + { + "epoch": 0.25862733226924983, + "grad_norm": 0.769930362701416, + "learning_rate": 9.607827261138291e-06, + "loss": 0.7565, + "step": 4699 + }, + { + "epoch": 0.25868237107160547, + "grad_norm": 0.8875935077667236, + "learning_rate": 9.607658962101538e-06, + "loss": 0.849, + "step": 4700 + }, + { + "epoch": 0.25873740987396115, + "grad_norm": 0.7887380123138428, + "learning_rate": 9.60749062843485e-06, + "loss": 0.8795, + "step": 4701 + }, + { + "epoch": 0.2587924486763168, + "grad_norm": 0.7600420117378235, + "learning_rate": 9.607322260139499e-06, + "loss": 0.7581, + "step": 4702 + }, + { + "epoch": 0.2588474874786725, + "grad_norm": 0.7431491017341614, + "learning_rate": 9.607153857216746e-06, + "loss": 0.7119, + "step": 4703 + }, + { + "epoch": 0.2589025262810281, + "grad_norm": 0.7444193363189697, + "learning_rate": 9.606985419667858e-06, + "loss": 0.7492, + "step": 4704 + }, + { + "epoch": 0.2589575650833838, + "grad_norm": 0.8348917365074158, + "learning_rate": 9.6068169474941e-06, + "loss": 0.7656, + "step": 4705 + }, + { + "epoch": 0.25901260388573943, + "grad_norm": 0.6790240406990051, + "learning_rate": 9.60664844069674e-06, + "loss": 0.6354, + "step": 4706 + }, + { + "epoch": 0.2590676426880951, + "grad_norm": 0.8425769805908203, + "learning_rate": 9.606479899277044e-06, + "loss": 0.7927, + "step": 4707 + }, + { + "epoch": 0.25912268149045076, + "grad_norm": 0.7234740853309631, + "learning_rate": 9.606311323236277e-06, + "loss": 0.8122, + "step": 4708 + }, + { + "epoch": 0.25917772029280645, + "grad_norm": 0.839507520198822, + "learning_rate": 9.606142712575707e-06, + "loss": 0.8807, + "step": 4709 + }, + { + "epoch": 0.2592327590951621, + "grad_norm": 0.7155291438102722, + "learning_rate": 9.605974067296601e-06, + "loss": 0.7852, + "step": 4710 + }, + { + "epoch": 0.25928779789751777, + "grad_norm": 0.7222152352333069, + "learning_rate": 9.605805387400228e-06, + "loss": 0.7362, + "step": 4711 + }, + { + "epoch": 0.2593428366998734, + "grad_norm": 0.8350114226341248, + "learning_rate": 9.605636672887854e-06, + "loss": 0.7201, + "step": 4712 + }, + { + "epoch": 0.2593978755022291, + "grad_norm": 0.6805943250656128, + "learning_rate": 9.605467923760747e-06, + "loss": 0.6936, + "step": 4713 + }, + { + "epoch": 0.2594529143045847, + "grad_norm": 0.7863980531692505, + "learning_rate": 9.605299140020177e-06, + "loss": 0.9079, + "step": 4714 + }, + { + "epoch": 0.2595079531069404, + "grad_norm": 0.838843584060669, + "learning_rate": 9.60513032166741e-06, + "loss": 0.839, + "step": 4715 + }, + { + "epoch": 0.25956299190929605, + "grad_norm": 0.7872797250747681, + "learning_rate": 9.60496146870372e-06, + "loss": 0.9164, + "step": 4716 + }, + { + "epoch": 0.25961803071165174, + "grad_norm": 0.7300794720649719, + "learning_rate": 9.604792581130369e-06, + "loss": 0.8227, + "step": 4717 + }, + { + "epoch": 0.25967306951400737, + "grad_norm": 0.8420879244804382, + "learning_rate": 9.60462365894863e-06, + "loss": 0.7865, + "step": 4718 + }, + { + "epoch": 0.25972810831636306, + "grad_norm": 0.807697057723999, + "learning_rate": 9.604454702159771e-06, + "loss": 0.9081, + "step": 4719 + }, + { + "epoch": 0.2597831471187187, + "grad_norm": 0.9041245579719543, + "learning_rate": 9.604285710765064e-06, + "loss": 0.8102, + "step": 4720 + }, + { + "epoch": 0.2598381859210744, + "grad_norm": 0.7061690092086792, + "learning_rate": 9.604116684765779e-06, + "loss": 0.762, + "step": 4721 + }, + { + "epoch": 0.25989322472343, + "grad_norm": 0.7790346741676331, + "learning_rate": 9.603947624163186e-06, + "loss": 0.8038, + "step": 4722 + }, + { + "epoch": 0.2599482635257857, + "grad_norm": 0.8109704256057739, + "learning_rate": 9.603778528958553e-06, + "loss": 0.9105, + "step": 4723 + }, + { + "epoch": 0.26000330232814134, + "grad_norm": 0.7396997213363647, + "learning_rate": 9.603609399153153e-06, + "loss": 0.8384, + "step": 4724 + }, + { + "epoch": 0.260058341130497, + "grad_norm": 0.8594317436218262, + "learning_rate": 9.603440234748257e-06, + "loss": 0.8301, + "step": 4725 + }, + { + "epoch": 0.26011337993285266, + "grad_norm": 0.7087241411209106, + "learning_rate": 9.603271035745138e-06, + "loss": 0.6652, + "step": 4726 + }, + { + "epoch": 0.2601684187352083, + "grad_norm": 0.7405440211296082, + "learning_rate": 9.603101802145065e-06, + "loss": 0.7804, + "step": 4727 + }, + { + "epoch": 0.260223457537564, + "grad_norm": 0.8637508749961853, + "learning_rate": 9.602932533949312e-06, + "loss": 0.8509, + "step": 4728 + }, + { + "epoch": 0.2602784963399196, + "grad_norm": 0.7040451765060425, + "learning_rate": 9.60276323115915e-06, + "loss": 0.7842, + "step": 4729 + }, + { + "epoch": 0.2603335351422753, + "grad_norm": 0.7743955254554749, + "learning_rate": 9.602593893775852e-06, + "loss": 0.8492, + "step": 4730 + }, + { + "epoch": 0.26038857394463094, + "grad_norm": 0.7110480070114136, + "learning_rate": 9.602424521800688e-06, + "loss": 0.7227, + "step": 4731 + }, + { + "epoch": 0.26044361274698663, + "grad_norm": 1.0066583156585693, + "learning_rate": 9.602255115234936e-06, + "loss": 0.8825, + "step": 4732 + }, + { + "epoch": 0.26049865154934226, + "grad_norm": 0.7746492624282837, + "learning_rate": 9.602085674079864e-06, + "loss": 0.8316, + "step": 4733 + }, + { + "epoch": 0.26055369035169795, + "grad_norm": 0.7394356727600098, + "learning_rate": 9.60191619833675e-06, + "loss": 0.746, + "step": 4734 + }, + { + "epoch": 0.2606087291540536, + "grad_norm": 0.7140582203865051, + "learning_rate": 9.601746688006866e-06, + "loss": 0.7204, + "step": 4735 + }, + { + "epoch": 0.2606637679564093, + "grad_norm": 0.753399133682251, + "learning_rate": 9.601577143091483e-06, + "loss": 0.8157, + "step": 4736 + }, + { + "epoch": 0.2607188067587649, + "grad_norm": 0.674320638179779, + "learning_rate": 9.601407563591881e-06, + "loss": 0.7279, + "step": 4737 + }, + { + "epoch": 0.2607738455611206, + "grad_norm": 0.855944037437439, + "learning_rate": 9.60123794950933e-06, + "loss": 0.804, + "step": 4738 + }, + { + "epoch": 0.26082888436347623, + "grad_norm": 0.6833948493003845, + "learning_rate": 9.601068300845106e-06, + "loss": 0.701, + "step": 4739 + }, + { + "epoch": 0.2608839231658319, + "grad_norm": 0.8085536360740662, + "learning_rate": 9.600898617600485e-06, + "loss": 0.8435, + "step": 4740 + }, + { + "epoch": 0.26093896196818755, + "grad_norm": 0.752849817276001, + "learning_rate": 9.600728899776741e-06, + "loss": 0.7205, + "step": 4741 + }, + { + "epoch": 0.26099400077054324, + "grad_norm": 0.7320554852485657, + "learning_rate": 9.600559147375151e-06, + "loss": 0.7556, + "step": 4742 + }, + { + "epoch": 0.2610490395728989, + "grad_norm": 0.7789202928543091, + "learning_rate": 9.600389360396988e-06, + "loss": 0.8467, + "step": 4743 + }, + { + "epoch": 0.26110407837525457, + "grad_norm": 0.8480898141860962, + "learning_rate": 9.600219538843532e-06, + "loss": 0.7762, + "step": 4744 + }, + { + "epoch": 0.2611591171776102, + "grad_norm": 0.8382542133331299, + "learning_rate": 9.600049682716055e-06, + "loss": 0.9051, + "step": 4745 + }, + { + "epoch": 0.2612141559799659, + "grad_norm": 0.8319274187088013, + "learning_rate": 9.599879792015838e-06, + "loss": 0.8221, + "step": 4746 + }, + { + "epoch": 0.2612691947823215, + "grad_norm": 0.7325875163078308, + "learning_rate": 9.599709866744156e-06, + "loss": 0.7968, + "step": 4747 + }, + { + "epoch": 0.2613242335846772, + "grad_norm": 0.7053360342979431, + "learning_rate": 9.599539906902285e-06, + "loss": 0.7073, + "step": 4748 + }, + { + "epoch": 0.26137927238703285, + "grad_norm": 0.763017475605011, + "learning_rate": 9.599369912491503e-06, + "loss": 0.7031, + "step": 4749 + }, + { + "epoch": 0.26143431118938854, + "grad_norm": 0.6816151738166809, + "learning_rate": 9.599199883513088e-06, + "loss": 0.7295, + "step": 4750 + }, + { + "epoch": 0.26148934999174417, + "grad_norm": 0.8143941164016724, + "learning_rate": 9.599029819968319e-06, + "loss": 0.8449, + "step": 4751 + }, + { + "epoch": 0.26154438879409986, + "grad_norm": 0.8093858361244202, + "learning_rate": 9.598859721858471e-06, + "loss": 0.8397, + "step": 4752 + }, + { + "epoch": 0.2615994275964555, + "grad_norm": 0.7431835532188416, + "learning_rate": 9.598689589184827e-06, + "loss": 0.7299, + "step": 4753 + }, + { + "epoch": 0.2616544663988112, + "grad_norm": 0.9871510863304138, + "learning_rate": 9.59851942194866e-06, + "loss": 0.7992, + "step": 4754 + }, + { + "epoch": 0.2617095052011668, + "grad_norm": 0.9304273724555969, + "learning_rate": 9.598349220151254e-06, + "loss": 0.7519, + "step": 4755 + }, + { + "epoch": 0.2617645440035225, + "grad_norm": 0.9361812472343445, + "learning_rate": 9.598178983793886e-06, + "loss": 0.8131, + "step": 4756 + }, + { + "epoch": 0.26181958280587814, + "grad_norm": 0.7783429622650146, + "learning_rate": 9.598008712877835e-06, + "loss": 0.7351, + "step": 4757 + }, + { + "epoch": 0.2618746216082338, + "grad_norm": 0.8739376068115234, + "learning_rate": 9.597838407404381e-06, + "loss": 0.9458, + "step": 4758 + }, + { + "epoch": 0.26192966041058946, + "grad_norm": 0.7076277732849121, + "learning_rate": 9.597668067374805e-06, + "loss": 0.7632, + "step": 4759 + }, + { + "epoch": 0.26198469921294515, + "grad_norm": 0.7652345299720764, + "learning_rate": 9.597497692790386e-06, + "loss": 0.8018, + "step": 4760 + }, + { + "epoch": 0.2620397380153008, + "grad_norm": 0.7332149147987366, + "learning_rate": 9.597327283652405e-06, + "loss": 0.8223, + "step": 4761 + }, + { + "epoch": 0.26209477681765647, + "grad_norm": 0.8361638784408569, + "learning_rate": 9.597156839962145e-06, + "loss": 0.8784, + "step": 4762 + }, + { + "epoch": 0.2621498156200121, + "grad_norm": 1.183772325515747, + "learning_rate": 9.596986361720882e-06, + "loss": 0.8768, + "step": 4763 + }, + { + "epoch": 0.2622048544223678, + "grad_norm": 0.9895418882369995, + "learning_rate": 9.596815848929902e-06, + "loss": 0.714, + "step": 4764 + }, + { + "epoch": 0.26225989322472343, + "grad_norm": 0.8210558295249939, + "learning_rate": 9.59664530159048e-06, + "loss": 0.7246, + "step": 4765 + }, + { + "epoch": 0.2623149320270791, + "grad_norm": 0.8003455996513367, + "learning_rate": 9.596474719703908e-06, + "loss": 0.8385, + "step": 4766 + }, + { + "epoch": 0.26236997082943475, + "grad_norm": 0.7555826306343079, + "learning_rate": 9.59630410327146e-06, + "loss": 0.7243, + "step": 4767 + }, + { + "epoch": 0.2624250096317904, + "grad_norm": 0.7746273279190063, + "learning_rate": 9.596133452294421e-06, + "loss": 0.8763, + "step": 4768 + }, + { + "epoch": 0.2624800484341461, + "grad_norm": 0.7238507866859436, + "learning_rate": 9.595962766774074e-06, + "loss": 0.8302, + "step": 4769 + }, + { + "epoch": 0.2625350872365017, + "grad_norm": 0.7874132394790649, + "learning_rate": 9.595792046711699e-06, + "loss": 0.7979, + "step": 4770 + }, + { + "epoch": 0.2625901260388574, + "grad_norm": 0.8792033791542053, + "learning_rate": 9.595621292108583e-06, + "loss": 0.8555, + "step": 4771 + }, + { + "epoch": 0.26264516484121303, + "grad_norm": 0.7026945948600769, + "learning_rate": 9.595450502966006e-06, + "loss": 0.718, + "step": 4772 + }, + { + "epoch": 0.2627002036435687, + "grad_norm": 0.7747959494590759, + "learning_rate": 9.595279679285254e-06, + "loss": 0.8329, + "step": 4773 + }, + { + "epoch": 0.26275524244592435, + "grad_norm": 0.697979748249054, + "learning_rate": 9.59510882106761e-06, + "loss": 0.7456, + "step": 4774 + }, + { + "epoch": 0.26281028124828004, + "grad_norm": 0.7600447535514832, + "learning_rate": 9.594937928314359e-06, + "loss": 0.875, + "step": 4775 + }, + { + "epoch": 0.2628653200506357, + "grad_norm": 0.7591384649276733, + "learning_rate": 9.594767001026783e-06, + "loss": 0.7607, + "step": 4776 + }, + { + "epoch": 0.26292035885299136, + "grad_norm": 0.9267380833625793, + "learning_rate": 9.59459603920617e-06, + "loss": 0.8926, + "step": 4777 + }, + { + "epoch": 0.262975397655347, + "grad_norm": 0.7751328349113464, + "learning_rate": 9.594425042853802e-06, + "loss": 0.7449, + "step": 4778 + }, + { + "epoch": 0.2630304364577027, + "grad_norm": 0.7066012620925903, + "learning_rate": 9.594254011970966e-06, + "loss": 0.8374, + "step": 4779 + }, + { + "epoch": 0.2630854752600583, + "grad_norm": 0.7564317584037781, + "learning_rate": 9.594082946558945e-06, + "loss": 0.735, + "step": 4780 + }, + { + "epoch": 0.263140514062414, + "grad_norm": 0.8151416182518005, + "learning_rate": 9.593911846619027e-06, + "loss": 0.8575, + "step": 4781 + }, + { + "epoch": 0.26319555286476964, + "grad_norm": 0.719261646270752, + "learning_rate": 9.593740712152497e-06, + "loss": 0.7981, + "step": 4782 + }, + { + "epoch": 0.26325059166712533, + "grad_norm": 0.8627344369888306, + "learning_rate": 9.593569543160642e-06, + "loss": 0.895, + "step": 4783 + }, + { + "epoch": 0.26330563046948097, + "grad_norm": 1.293272614479065, + "learning_rate": 9.593398339644748e-06, + "loss": 0.7531, + "step": 4784 + }, + { + "epoch": 0.26336066927183666, + "grad_norm": 0.8475207686424255, + "learning_rate": 9.593227101606102e-06, + "loss": 0.9091, + "step": 4785 + }, + { + "epoch": 0.2634157080741923, + "grad_norm": 0.78054279088974, + "learning_rate": 9.593055829045989e-06, + "loss": 0.7692, + "step": 4786 + }, + { + "epoch": 0.263470746876548, + "grad_norm": 0.7677399516105652, + "learning_rate": 9.592884521965699e-06, + "loss": 0.6232, + "step": 4787 + }, + { + "epoch": 0.2635257856789036, + "grad_norm": 0.7232677340507507, + "learning_rate": 9.59271318036652e-06, + "loss": 0.8087, + "step": 4788 + }, + { + "epoch": 0.2635808244812593, + "grad_norm": 0.8728463649749756, + "learning_rate": 9.592541804249735e-06, + "loss": 0.7824, + "step": 4789 + }, + { + "epoch": 0.26363586328361494, + "grad_norm": 0.7569910883903503, + "learning_rate": 9.592370393616637e-06, + "loss": 0.7418, + "step": 4790 + }, + { + "epoch": 0.2636909020859706, + "grad_norm": 0.7631934285163879, + "learning_rate": 9.592198948468511e-06, + "loss": 0.7929, + "step": 4791 + }, + { + "epoch": 0.26374594088832626, + "grad_norm": 0.8021631240844727, + "learning_rate": 9.592027468806649e-06, + "loss": 0.8111, + "step": 4792 + }, + { + "epoch": 0.26380097969068195, + "grad_norm": 0.9454651474952698, + "learning_rate": 9.591855954632336e-06, + "loss": 0.8239, + "step": 4793 + }, + { + "epoch": 0.2638560184930376, + "grad_norm": 0.672924280166626, + "learning_rate": 9.591684405946863e-06, + "loss": 0.6877, + "step": 4794 + }, + { + "epoch": 0.26391105729539327, + "grad_norm": 0.7942802906036377, + "learning_rate": 9.59151282275152e-06, + "loss": 0.9002, + "step": 4795 + }, + { + "epoch": 0.2639660960977489, + "grad_norm": 0.7131155133247375, + "learning_rate": 9.591341205047596e-06, + "loss": 0.7692, + "step": 4796 + }, + { + "epoch": 0.2640211349001046, + "grad_norm": 1.0395869016647339, + "learning_rate": 9.59116955283638e-06, + "loss": 0.8352, + "step": 4797 + }, + { + "epoch": 0.2640761737024602, + "grad_norm": 0.9503256678581238, + "learning_rate": 9.590997866119163e-06, + "loss": 1.0287, + "step": 4798 + }, + { + "epoch": 0.2641312125048159, + "grad_norm": 0.7539612054824829, + "learning_rate": 9.590826144897235e-06, + "loss": 0.872, + "step": 4799 + }, + { + "epoch": 0.26418625130717155, + "grad_norm": 0.7067893743515015, + "learning_rate": 9.590654389171885e-06, + "loss": 0.7636, + "step": 4800 + }, + { + "epoch": 0.26424129010952724, + "grad_norm": 0.7355281710624695, + "learning_rate": 9.590482598944407e-06, + "loss": 0.7715, + "step": 4801 + }, + { + "epoch": 0.26429632891188287, + "grad_norm": 0.7589674592018127, + "learning_rate": 9.590310774216089e-06, + "loss": 0.7451, + "step": 4802 + }, + { + "epoch": 0.26435136771423856, + "grad_norm": 0.701386034488678, + "learning_rate": 9.590138914988226e-06, + "loss": 0.7317, + "step": 4803 + }, + { + "epoch": 0.2644064065165942, + "grad_norm": 0.7663118243217468, + "learning_rate": 9.589967021262105e-06, + "loss": 0.8227, + "step": 4804 + }, + { + "epoch": 0.2644614453189499, + "grad_norm": 0.7059655785560608, + "learning_rate": 9.589795093039023e-06, + "loss": 0.7829, + "step": 4805 + }, + { + "epoch": 0.2645164841213055, + "grad_norm": 0.7377020120620728, + "learning_rate": 9.58962313032027e-06, + "loss": 0.8308, + "step": 4806 + }, + { + "epoch": 0.2645715229236612, + "grad_norm": 0.8635388612747192, + "learning_rate": 9.589451133107134e-06, + "loss": 0.7882, + "step": 4807 + }, + { + "epoch": 0.26462656172601684, + "grad_norm": 0.8282824754714966, + "learning_rate": 9.589279101400915e-06, + "loss": 0.8055, + "step": 4808 + }, + { + "epoch": 0.26468160052837253, + "grad_norm": 0.7026814818382263, + "learning_rate": 9.589107035202903e-06, + "loss": 0.7567, + "step": 4809 + }, + { + "epoch": 0.26473663933072816, + "grad_norm": 0.7575708031654358, + "learning_rate": 9.588934934514392e-06, + "loss": 0.7456, + "step": 4810 + }, + { + "epoch": 0.2647916781330838, + "grad_norm": 0.9732069969177246, + "learning_rate": 9.588762799336671e-06, + "loss": 0.8217, + "step": 4811 + }, + { + "epoch": 0.2648467169354395, + "grad_norm": 0.786803126335144, + "learning_rate": 9.58859062967104e-06, + "loss": 0.729, + "step": 4812 + }, + { + "epoch": 0.2649017557377951, + "grad_norm": 0.8068973422050476, + "learning_rate": 9.588418425518789e-06, + "loss": 0.8204, + "step": 4813 + }, + { + "epoch": 0.2649567945401508, + "grad_norm": 0.8222702145576477, + "learning_rate": 9.588246186881213e-06, + "loss": 0.8349, + "step": 4814 + }, + { + "epoch": 0.26501183334250644, + "grad_norm": 0.7560802698135376, + "learning_rate": 9.588073913759608e-06, + "loss": 0.7601, + "step": 4815 + }, + { + "epoch": 0.26506687214486213, + "grad_norm": 0.9221365451812744, + "learning_rate": 9.587901606155266e-06, + "loss": 0.7725, + "step": 4816 + }, + { + "epoch": 0.26512191094721776, + "grad_norm": 0.8092262744903564, + "learning_rate": 9.587729264069485e-06, + "loss": 0.9074, + "step": 4817 + }, + { + "epoch": 0.26517694974957345, + "grad_norm": 0.8183920979499817, + "learning_rate": 9.587556887503557e-06, + "loss": 0.8321, + "step": 4818 + }, + { + "epoch": 0.2652319885519291, + "grad_norm": 0.7023420929908752, + "learning_rate": 9.587384476458781e-06, + "loss": 0.7842, + "step": 4819 + }, + { + "epoch": 0.2652870273542848, + "grad_norm": 1.2864880561828613, + "learning_rate": 9.58721203093645e-06, + "loss": 0.7519, + "step": 4820 + }, + { + "epoch": 0.2653420661566404, + "grad_norm": 0.8133784532546997, + "learning_rate": 9.587039550937864e-06, + "loss": 0.8208, + "step": 4821 + }, + { + "epoch": 0.2653971049589961, + "grad_norm": 0.739732027053833, + "learning_rate": 9.586867036464314e-06, + "loss": 0.8553, + "step": 4822 + }, + { + "epoch": 0.26545214376135173, + "grad_norm": 0.7539162635803223, + "learning_rate": 9.5866944875171e-06, + "loss": 0.7385, + "step": 4823 + }, + { + "epoch": 0.2655071825637074, + "grad_norm": 0.8012336492538452, + "learning_rate": 9.58652190409752e-06, + "loss": 0.8343, + "step": 4824 + }, + { + "epoch": 0.26556222136606306, + "grad_norm": 0.7972521185874939, + "learning_rate": 9.586349286206865e-06, + "loss": 0.8481, + "step": 4825 + }, + { + "epoch": 0.26561726016841875, + "grad_norm": 0.7772900462150574, + "learning_rate": 9.58617663384644e-06, + "loss": 0.7655, + "step": 4826 + }, + { + "epoch": 0.2656722989707744, + "grad_norm": 0.677916944026947, + "learning_rate": 9.586003947017537e-06, + "loss": 0.696, + "step": 4827 + }, + { + "epoch": 0.26572733777313007, + "grad_norm": 0.8254117369651794, + "learning_rate": 9.585831225721455e-06, + "loss": 0.7841, + "step": 4828 + }, + { + "epoch": 0.2657823765754857, + "grad_norm": 0.7256904244422913, + "learning_rate": 9.585658469959496e-06, + "loss": 0.8057, + "step": 4829 + }, + { + "epoch": 0.2658374153778414, + "grad_norm": 0.7651757001876831, + "learning_rate": 9.585485679732953e-06, + "loss": 0.7918, + "step": 4830 + }, + { + "epoch": 0.265892454180197, + "grad_norm": 0.7581052184104919, + "learning_rate": 9.58531285504313e-06, + "loss": 0.759, + "step": 4831 + }, + { + "epoch": 0.2659474929825527, + "grad_norm": 0.7190486192703247, + "learning_rate": 9.58513999589132e-06, + "loss": 0.7403, + "step": 4832 + }, + { + "epoch": 0.26600253178490835, + "grad_norm": 0.8603141903877258, + "learning_rate": 9.584967102278825e-06, + "loss": 0.8944, + "step": 4833 + }, + { + "epoch": 0.26605757058726404, + "grad_norm": 0.806297779083252, + "learning_rate": 9.584794174206947e-06, + "loss": 0.7039, + "step": 4834 + }, + { + "epoch": 0.26611260938961967, + "grad_norm": 0.7604451775550842, + "learning_rate": 9.584621211676981e-06, + "loss": 0.8076, + "step": 4835 + }, + { + "epoch": 0.26616764819197536, + "grad_norm": 0.7276773452758789, + "learning_rate": 9.584448214690232e-06, + "loss": 0.786, + "step": 4836 + }, + { + "epoch": 0.266222686994331, + "grad_norm": 0.8737080693244934, + "learning_rate": 9.584275183247994e-06, + "loss": 0.8071, + "step": 4837 + }, + { + "epoch": 0.2662777257966867, + "grad_norm": 0.8447219133377075, + "learning_rate": 9.584102117351574e-06, + "loss": 0.7682, + "step": 4838 + }, + { + "epoch": 0.2663327645990423, + "grad_norm": 0.7001703381538391, + "learning_rate": 9.583929017002268e-06, + "loss": 0.7077, + "step": 4839 + }, + { + "epoch": 0.266387803401398, + "grad_norm": 0.7935730218887329, + "learning_rate": 9.583755882201377e-06, + "loss": 0.8122, + "step": 4840 + }, + { + "epoch": 0.26644284220375364, + "grad_norm": 0.8763312697410583, + "learning_rate": 9.583582712950207e-06, + "loss": 0.8241, + "step": 4841 + }, + { + "epoch": 0.2664978810061093, + "grad_norm": 0.7910245656967163, + "learning_rate": 9.583409509250055e-06, + "loss": 0.7717, + "step": 4842 + }, + { + "epoch": 0.26655291980846496, + "grad_norm": 0.7975226640701294, + "learning_rate": 9.583236271102222e-06, + "loss": 0.7165, + "step": 4843 + }, + { + "epoch": 0.26660795861082065, + "grad_norm": 0.8060342073440552, + "learning_rate": 9.583062998508014e-06, + "loss": 0.7659, + "step": 4844 + }, + { + "epoch": 0.2666629974131763, + "grad_norm": 0.8779375553131104, + "learning_rate": 9.582889691468732e-06, + "loss": 0.8207, + "step": 4845 + }, + { + "epoch": 0.266718036215532, + "grad_norm": 0.7409310936927795, + "learning_rate": 9.582716349985677e-06, + "loss": 0.8439, + "step": 4846 + }, + { + "epoch": 0.2667730750178876, + "grad_norm": 0.8871899843215942, + "learning_rate": 9.582542974060152e-06, + "loss": 0.8305, + "step": 4847 + }, + { + "epoch": 0.2668281138202433, + "grad_norm": 0.9003115296363831, + "learning_rate": 9.58236956369346e-06, + "loss": 0.8334, + "step": 4848 + }, + { + "epoch": 0.26688315262259893, + "grad_norm": 1.0149577856063843, + "learning_rate": 9.582196118886909e-06, + "loss": 0.7962, + "step": 4849 + }, + { + "epoch": 0.2669381914249546, + "grad_norm": 0.785214900970459, + "learning_rate": 9.582022639641795e-06, + "loss": 0.7806, + "step": 4850 + }, + { + "epoch": 0.26699323022731025, + "grad_norm": 0.9833952188491821, + "learning_rate": 9.581849125959426e-06, + "loss": 0.7607, + "step": 4851 + }, + { + "epoch": 0.26704826902966594, + "grad_norm": 1.404751181602478, + "learning_rate": 9.581675577841104e-06, + "loss": 0.9046, + "step": 4852 + }, + { + "epoch": 0.2671033078320216, + "grad_norm": 0.791159451007843, + "learning_rate": 9.581501995288137e-06, + "loss": 0.6582, + "step": 4853 + }, + { + "epoch": 0.2671583466343772, + "grad_norm": 0.8507272005081177, + "learning_rate": 9.581328378301827e-06, + "loss": 0.8946, + "step": 4854 + }, + { + "epoch": 0.2672133854367329, + "grad_norm": 0.7372786998748779, + "learning_rate": 9.58115472688348e-06, + "loss": 0.7865, + "step": 4855 + }, + { + "epoch": 0.26726842423908853, + "grad_norm": 0.8293853998184204, + "learning_rate": 9.580981041034398e-06, + "loss": 0.9113, + "step": 4856 + }, + { + "epoch": 0.2673234630414442, + "grad_norm": 0.7212402820587158, + "learning_rate": 9.580807320755889e-06, + "loss": 0.7149, + "step": 4857 + }, + { + "epoch": 0.26737850184379985, + "grad_norm": 0.7885197401046753, + "learning_rate": 9.58063356604926e-06, + "loss": 0.8651, + "step": 4858 + }, + { + "epoch": 0.26743354064615554, + "grad_norm": 0.8444308042526245, + "learning_rate": 9.580459776915814e-06, + "loss": 0.7968, + "step": 4859 + }, + { + "epoch": 0.2674885794485112, + "grad_norm": 0.7974254488945007, + "learning_rate": 9.58028595335686e-06, + "loss": 0.8499, + "step": 4860 + }, + { + "epoch": 0.26754361825086687, + "grad_norm": 0.7491242289543152, + "learning_rate": 9.580112095373702e-06, + "loss": 0.8278, + "step": 4861 + }, + { + "epoch": 0.2675986570532225, + "grad_norm": 0.6856499314308167, + "learning_rate": 9.579938202967646e-06, + "loss": 0.7466, + "step": 4862 + }, + { + "epoch": 0.2676536958555782, + "grad_norm": 0.7347447872161865, + "learning_rate": 9.579764276140002e-06, + "loss": 0.8046, + "step": 4863 + }, + { + "epoch": 0.2677087346579338, + "grad_norm": 0.6797083020210266, + "learning_rate": 9.579590314892077e-06, + "loss": 0.7012, + "step": 4864 + }, + { + "epoch": 0.2677637734602895, + "grad_norm": 0.8219562768936157, + "learning_rate": 9.579416319225175e-06, + "loss": 0.7592, + "step": 4865 + }, + { + "epoch": 0.26781881226264515, + "grad_norm": 0.7388357520103455, + "learning_rate": 9.579242289140607e-06, + "loss": 0.8179, + "step": 4866 + }, + { + "epoch": 0.26787385106500083, + "grad_norm": 0.7394490838050842, + "learning_rate": 9.579068224639679e-06, + "loss": 0.694, + "step": 4867 + }, + { + "epoch": 0.26792888986735647, + "grad_norm": 0.7309017181396484, + "learning_rate": 9.578894125723699e-06, + "loss": 0.7882, + "step": 4868 + }, + { + "epoch": 0.26798392866971216, + "grad_norm": 0.7785035967826843, + "learning_rate": 9.578719992393978e-06, + "loss": 0.8142, + "step": 4869 + }, + { + "epoch": 0.2680389674720678, + "grad_norm": 0.8983079195022583, + "learning_rate": 9.57854582465182e-06, + "loss": 0.7809, + "step": 4870 + }, + { + "epoch": 0.2680940062744235, + "grad_norm": 0.7433765530586243, + "learning_rate": 9.578371622498542e-06, + "loss": 0.8937, + "step": 4871 + }, + { + "epoch": 0.2681490450767791, + "grad_norm": 0.8808667659759521, + "learning_rate": 9.578197385935446e-06, + "loss": 0.7821, + "step": 4872 + }, + { + "epoch": 0.2682040838791348, + "grad_norm": 0.825794517993927, + "learning_rate": 9.578023114963843e-06, + "loss": 0.8228, + "step": 4873 + }, + { + "epoch": 0.26825912268149044, + "grad_norm": 1.0165129899978638, + "learning_rate": 9.577848809585046e-06, + "loss": 0.7964, + "step": 4874 + }, + { + "epoch": 0.2683141614838461, + "grad_norm": 0.742028534412384, + "learning_rate": 9.577674469800362e-06, + "loss": 0.9126, + "step": 4875 + }, + { + "epoch": 0.26836920028620176, + "grad_norm": 0.7571890354156494, + "learning_rate": 9.577500095611101e-06, + "loss": 0.879, + "step": 4876 + }, + { + "epoch": 0.26842423908855745, + "grad_norm": 0.7577160596847534, + "learning_rate": 9.577325687018575e-06, + "loss": 0.8048, + "step": 4877 + }, + { + "epoch": 0.2684792778909131, + "grad_norm": 0.7704411745071411, + "learning_rate": 9.577151244024095e-06, + "loss": 0.7451, + "step": 4878 + }, + { + "epoch": 0.26853431669326877, + "grad_norm": 0.8323166966438293, + "learning_rate": 9.57697676662897e-06, + "loss": 0.7591, + "step": 4879 + }, + { + "epoch": 0.2685893554956244, + "grad_norm": 0.7257028222084045, + "learning_rate": 9.576802254834516e-06, + "loss": 0.7941, + "step": 4880 + }, + { + "epoch": 0.2686443942979801, + "grad_norm": 0.8170442581176758, + "learning_rate": 9.57662770864204e-06, + "loss": 0.8617, + "step": 4881 + }, + { + "epoch": 0.2686994331003357, + "grad_norm": 0.7435339689254761, + "learning_rate": 9.576453128052852e-06, + "loss": 0.7683, + "step": 4882 + }, + { + "epoch": 0.2687544719026914, + "grad_norm": 0.7932955026626587, + "learning_rate": 9.576278513068271e-06, + "loss": 0.7103, + "step": 4883 + }, + { + "epoch": 0.26880951070504705, + "grad_norm": 0.8008469939231873, + "learning_rate": 9.576103863689604e-06, + "loss": 0.8144, + "step": 4884 + }, + { + "epoch": 0.26886454950740274, + "grad_norm": 0.8573774695396423, + "learning_rate": 9.575929179918167e-06, + "loss": 0.8992, + "step": 4885 + }, + { + "epoch": 0.2689195883097584, + "grad_norm": 0.7326993942260742, + "learning_rate": 9.57575446175527e-06, + "loss": 0.699, + "step": 4886 + }, + { + "epoch": 0.26897462711211406, + "grad_norm": 0.8249791264533997, + "learning_rate": 9.575579709202228e-06, + "loss": 0.7445, + "step": 4887 + }, + { + "epoch": 0.2690296659144697, + "grad_norm": 0.7136644124984741, + "learning_rate": 9.575404922260351e-06, + "loss": 0.779, + "step": 4888 + }, + { + "epoch": 0.2690847047168254, + "grad_norm": 1.0130438804626465, + "learning_rate": 9.575230100930958e-06, + "loss": 0.8535, + "step": 4889 + }, + { + "epoch": 0.269139743519181, + "grad_norm": 0.6784926652908325, + "learning_rate": 9.575055245215358e-06, + "loss": 0.6745, + "step": 4890 + }, + { + "epoch": 0.2691947823215367, + "grad_norm": 0.7492508888244629, + "learning_rate": 9.57488035511487e-06, + "loss": 0.6748, + "step": 4891 + }, + { + "epoch": 0.26924982112389234, + "grad_norm": 0.7951217889785767, + "learning_rate": 9.574705430630807e-06, + "loss": 0.8119, + "step": 4892 + }, + { + "epoch": 0.26930485992624803, + "grad_norm": 0.9756677746772766, + "learning_rate": 9.574530471764478e-06, + "loss": 0.855, + "step": 4893 + }, + { + "epoch": 0.26935989872860366, + "grad_norm": 0.7806811928749084, + "learning_rate": 9.574355478517206e-06, + "loss": 0.8432, + "step": 4894 + }, + { + "epoch": 0.26941493753095935, + "grad_norm": 0.7814774513244629, + "learning_rate": 9.574180450890301e-06, + "loss": 0.8226, + "step": 4895 + }, + { + "epoch": 0.269469976333315, + "grad_norm": 0.7745325565338135, + "learning_rate": 9.574005388885081e-06, + "loss": 0.7722, + "step": 4896 + }, + { + "epoch": 0.2695250151356706, + "grad_norm": 0.7805666327476501, + "learning_rate": 9.573830292502862e-06, + "loss": 0.8357, + "step": 4897 + }, + { + "epoch": 0.2695800539380263, + "grad_norm": 0.8428031802177429, + "learning_rate": 9.573655161744958e-06, + "loss": 0.8056, + "step": 4898 + }, + { + "epoch": 0.26963509274038194, + "grad_norm": 0.7896600961685181, + "learning_rate": 9.573479996612684e-06, + "loss": 0.7984, + "step": 4899 + }, + { + "epoch": 0.26969013154273763, + "grad_norm": 0.7718683481216431, + "learning_rate": 9.57330479710736e-06, + "loss": 0.7527, + "step": 4900 + }, + { + "epoch": 0.26974517034509327, + "grad_norm": 0.7868129014968872, + "learning_rate": 9.573129563230302e-06, + "loss": 0.7876, + "step": 4901 + }, + { + "epoch": 0.26980020914744895, + "grad_norm": 0.8493777513504028, + "learning_rate": 9.572954294982826e-06, + "loss": 0.864, + "step": 4902 + }, + { + "epoch": 0.2698552479498046, + "grad_norm": 0.7492502331733704, + "learning_rate": 9.57277899236625e-06, + "loss": 0.8236, + "step": 4903 + }, + { + "epoch": 0.2699102867521603, + "grad_norm": 1.0534250736236572, + "learning_rate": 9.57260365538189e-06, + "loss": 0.8012, + "step": 4904 + }, + { + "epoch": 0.2699653255545159, + "grad_norm": 0.7557470202445984, + "learning_rate": 9.572428284031065e-06, + "loss": 0.9084, + "step": 4905 + }, + { + "epoch": 0.2700203643568716, + "grad_norm": 0.8055123686790466, + "learning_rate": 9.572252878315094e-06, + "loss": 0.7468, + "step": 4906 + }, + { + "epoch": 0.27007540315922723, + "grad_norm": 0.8399039506912231, + "learning_rate": 9.572077438235294e-06, + "loss": 0.9293, + "step": 4907 + }, + { + "epoch": 0.2701304419615829, + "grad_norm": 0.9800041317939758, + "learning_rate": 9.571901963792983e-06, + "loss": 0.8664, + "step": 4908 + }, + { + "epoch": 0.27018548076393856, + "grad_norm": 0.7732129096984863, + "learning_rate": 9.571726454989482e-06, + "loss": 0.7227, + "step": 4909 + }, + { + "epoch": 0.27024051956629425, + "grad_norm": 0.730754017829895, + "learning_rate": 9.571550911826109e-06, + "loss": 0.6467, + "step": 4910 + }, + { + "epoch": 0.2702955583686499, + "grad_norm": 0.8245325684547424, + "learning_rate": 9.57137533430418e-06, + "loss": 0.7847, + "step": 4911 + }, + { + "epoch": 0.27035059717100557, + "grad_norm": 0.8606786131858826, + "learning_rate": 9.57119972242502e-06, + "loss": 0.9556, + "step": 4912 + }, + { + "epoch": 0.2704056359733612, + "grad_norm": 0.7480195164680481, + "learning_rate": 9.571024076189947e-06, + "loss": 0.8504, + "step": 4913 + }, + { + "epoch": 0.2704606747757169, + "grad_norm": 0.718913197517395, + "learning_rate": 9.57084839560028e-06, + "loss": 0.7869, + "step": 4914 + }, + { + "epoch": 0.2705157135780725, + "grad_norm": 0.9778180122375488, + "learning_rate": 9.57067268065734e-06, + "loss": 0.8514, + "step": 4915 + }, + { + "epoch": 0.2705707523804282, + "grad_norm": 0.7394844889640808, + "learning_rate": 9.570496931362448e-06, + "loss": 0.7906, + "step": 4916 + }, + { + "epoch": 0.27062579118278385, + "grad_norm": 0.7648600339889526, + "learning_rate": 9.570321147716923e-06, + "loss": 0.8194, + "step": 4917 + }, + { + "epoch": 0.27068082998513954, + "grad_norm": 0.8002632260322571, + "learning_rate": 9.57014532972209e-06, + "loss": 0.8079, + "step": 4918 + }, + { + "epoch": 0.27073586878749517, + "grad_norm": 0.8668341040611267, + "learning_rate": 9.569969477379267e-06, + "loss": 0.8954, + "step": 4919 + }, + { + "epoch": 0.27079090758985086, + "grad_norm": 0.7403327226638794, + "learning_rate": 9.569793590689775e-06, + "loss": 0.7755, + "step": 4920 + }, + { + "epoch": 0.2708459463922065, + "grad_norm": 0.7399682998657227, + "learning_rate": 9.569617669654938e-06, + "loss": 0.8203, + "step": 4921 + }, + { + "epoch": 0.2709009851945622, + "grad_norm": 0.788600504398346, + "learning_rate": 9.56944171427608e-06, + "loss": 0.7565, + "step": 4922 + }, + { + "epoch": 0.2709560239969178, + "grad_norm": 0.7044861912727356, + "learning_rate": 9.56926572455452e-06, + "loss": 0.7073, + "step": 4923 + }, + { + "epoch": 0.2710110627992735, + "grad_norm": 0.8195114135742188, + "learning_rate": 9.569089700491581e-06, + "loss": 0.8658, + "step": 4924 + }, + { + "epoch": 0.27106610160162914, + "grad_norm": 0.7792258858680725, + "learning_rate": 9.568913642088589e-06, + "loss": 0.8628, + "step": 4925 + }, + { + "epoch": 0.27112114040398483, + "grad_norm": 0.764930248260498, + "learning_rate": 9.568737549346862e-06, + "loss": 0.7761, + "step": 4926 + }, + { + "epoch": 0.27117617920634046, + "grad_norm": 0.7226328253746033, + "learning_rate": 9.56856142226773e-06, + "loss": 0.7208, + "step": 4927 + }, + { + "epoch": 0.27123121800869615, + "grad_norm": 0.8726598620414734, + "learning_rate": 9.568385260852512e-06, + "loss": 0.8599, + "step": 4928 + }, + { + "epoch": 0.2712862568110518, + "grad_norm": 1.0126571655273438, + "learning_rate": 9.568209065102533e-06, + "loss": 0.8145, + "step": 4929 + }, + { + "epoch": 0.2713412956134075, + "grad_norm": 0.7764692306518555, + "learning_rate": 9.568032835019116e-06, + "loss": 0.6758, + "step": 4930 + }, + { + "epoch": 0.2713963344157631, + "grad_norm": 0.6955474019050598, + "learning_rate": 9.567856570603589e-06, + "loss": 0.7461, + "step": 4931 + }, + { + "epoch": 0.2714513732181188, + "grad_norm": 0.7136832475662231, + "learning_rate": 9.567680271857274e-06, + "loss": 0.7692, + "step": 4932 + }, + { + "epoch": 0.27150641202047443, + "grad_norm": 1.2288198471069336, + "learning_rate": 9.567503938781497e-06, + "loss": 0.7815, + "step": 4933 + }, + { + "epoch": 0.2715614508228301, + "grad_norm": 0.9182234406471252, + "learning_rate": 9.567327571377584e-06, + "loss": 0.8822, + "step": 4934 + }, + { + "epoch": 0.27161648962518575, + "grad_norm": 0.7684763669967651, + "learning_rate": 9.567151169646859e-06, + "loss": 0.7618, + "step": 4935 + }, + { + "epoch": 0.27167152842754144, + "grad_norm": 0.872360348701477, + "learning_rate": 9.566974733590647e-06, + "loss": 0.7975, + "step": 4936 + }, + { + "epoch": 0.2717265672298971, + "grad_norm": 0.9010463356971741, + "learning_rate": 9.566798263210277e-06, + "loss": 0.7159, + "step": 4937 + }, + { + "epoch": 0.27178160603225276, + "grad_norm": 0.7254281044006348, + "learning_rate": 9.566621758507072e-06, + "loss": 0.6724, + "step": 4938 + }, + { + "epoch": 0.2718366448346084, + "grad_norm": 0.8478212356567383, + "learning_rate": 9.566445219482363e-06, + "loss": 0.659, + "step": 4939 + }, + { + "epoch": 0.27189168363696403, + "grad_norm": 0.9038714170455933, + "learning_rate": 9.56626864613747e-06, + "loss": 0.8766, + "step": 4940 + }, + { + "epoch": 0.2719467224393197, + "grad_norm": 0.9704582691192627, + "learning_rate": 9.566092038473728e-06, + "loss": 0.8972, + "step": 4941 + }, + { + "epoch": 0.27200176124167535, + "grad_norm": 0.7069430947303772, + "learning_rate": 9.565915396492459e-06, + "loss": 0.8116, + "step": 4942 + }, + { + "epoch": 0.27205680004403104, + "grad_norm": 0.7432642579078674, + "learning_rate": 9.565738720194993e-06, + "loss": 0.847, + "step": 4943 + }, + { + "epoch": 0.2721118388463867, + "grad_norm": 0.6813814043998718, + "learning_rate": 9.565562009582655e-06, + "loss": 0.7146, + "step": 4944 + }, + { + "epoch": 0.27216687764874237, + "grad_norm": 0.7447707056999207, + "learning_rate": 9.565385264656776e-06, + "loss": 0.7696, + "step": 4945 + }, + { + "epoch": 0.272221916451098, + "grad_norm": 0.875073254108429, + "learning_rate": 9.565208485418685e-06, + "loss": 0.8714, + "step": 4946 + }, + { + "epoch": 0.2722769552534537, + "grad_norm": 0.7753880620002747, + "learning_rate": 9.565031671869707e-06, + "loss": 0.739, + "step": 4947 + }, + { + "epoch": 0.2723319940558093, + "grad_norm": 0.749264121055603, + "learning_rate": 9.564854824011172e-06, + "loss": 0.7957, + "step": 4948 + }, + { + "epoch": 0.272387032858165, + "grad_norm": 0.6733991503715515, + "learning_rate": 9.564677941844412e-06, + "loss": 0.7402, + "step": 4949 + }, + { + "epoch": 0.27244207166052065, + "grad_norm": 0.7426447868347168, + "learning_rate": 9.564501025370753e-06, + "loss": 0.7977, + "step": 4950 + }, + { + "epoch": 0.27249711046287634, + "grad_norm": 0.7930514812469482, + "learning_rate": 9.564324074591529e-06, + "loss": 0.8485, + "step": 4951 + }, + { + "epoch": 0.27255214926523197, + "grad_norm": 0.8087072968482971, + "learning_rate": 9.564147089508064e-06, + "loss": 0.9215, + "step": 4952 + }, + { + "epoch": 0.27260718806758766, + "grad_norm": 0.7560327053070068, + "learning_rate": 9.563970070121694e-06, + "loss": 0.7966, + "step": 4953 + }, + { + "epoch": 0.2726622268699433, + "grad_norm": 0.735573947429657, + "learning_rate": 9.563793016433744e-06, + "loss": 0.7737, + "step": 4954 + }, + { + "epoch": 0.272717265672299, + "grad_norm": 0.7603545784950256, + "learning_rate": 9.563615928445548e-06, + "loss": 0.7717, + "step": 4955 + }, + { + "epoch": 0.2727723044746546, + "grad_norm": 0.7185375094413757, + "learning_rate": 9.563438806158437e-06, + "loss": 0.8057, + "step": 4956 + }, + { + "epoch": 0.2728273432770103, + "grad_norm": 0.7619272470474243, + "learning_rate": 9.56326164957374e-06, + "loss": 0.8173, + "step": 4957 + }, + { + "epoch": 0.27288238207936594, + "grad_norm": 0.7868000864982605, + "learning_rate": 9.563084458692793e-06, + "loss": 0.6855, + "step": 4958 + }, + { + "epoch": 0.2729374208817216, + "grad_norm": 0.7949535846710205, + "learning_rate": 9.562907233516923e-06, + "loss": 0.7754, + "step": 4959 + }, + { + "epoch": 0.27299245968407726, + "grad_norm": 0.7037919163703918, + "learning_rate": 9.562729974047462e-06, + "loss": 0.7419, + "step": 4960 + }, + { + "epoch": 0.27304749848643295, + "grad_norm": 0.7236568927764893, + "learning_rate": 9.562552680285746e-06, + "loss": 0.7135, + "step": 4961 + }, + { + "epoch": 0.2731025372887886, + "grad_norm": 0.8410467505455017, + "learning_rate": 9.562375352233105e-06, + "loss": 0.8507, + "step": 4962 + }, + { + "epoch": 0.27315757609114427, + "grad_norm": 0.8043560981750488, + "learning_rate": 9.562197989890871e-06, + "loss": 0.8484, + "step": 4963 + }, + { + "epoch": 0.2732126148934999, + "grad_norm": 0.6926127672195435, + "learning_rate": 9.56202059326038e-06, + "loss": 0.8087, + "step": 4964 + }, + { + "epoch": 0.2732676536958556, + "grad_norm": 0.7149024605751038, + "learning_rate": 9.561843162342961e-06, + "loss": 0.7349, + "step": 4965 + }, + { + "epoch": 0.27332269249821123, + "grad_norm": 0.7165781855583191, + "learning_rate": 9.561665697139952e-06, + "loss": 0.8139, + "step": 4966 + }, + { + "epoch": 0.2733777313005669, + "grad_norm": 0.7481133341789246, + "learning_rate": 9.561488197652684e-06, + "loss": 0.7712, + "step": 4967 + }, + { + "epoch": 0.27343277010292255, + "grad_norm": 0.6928209066390991, + "learning_rate": 9.561310663882491e-06, + "loss": 0.7524, + "step": 4968 + }, + { + "epoch": 0.27348780890527824, + "grad_norm": 0.7397856116294861, + "learning_rate": 9.561133095830708e-06, + "loss": 0.718, + "step": 4969 + }, + { + "epoch": 0.2735428477076339, + "grad_norm": 0.7712383270263672, + "learning_rate": 9.560955493498672e-06, + "loss": 0.8201, + "step": 4970 + }, + { + "epoch": 0.27359788650998956, + "grad_norm": 0.96076899766922, + "learning_rate": 9.560777856887714e-06, + "loss": 0.8555, + "step": 4971 + }, + { + "epoch": 0.2736529253123452, + "grad_norm": 0.7331019639968872, + "learning_rate": 9.56060018599917e-06, + "loss": 0.8315, + "step": 4972 + }, + { + "epoch": 0.2737079641147009, + "grad_norm": 0.7157140970230103, + "learning_rate": 9.560422480834374e-06, + "loss": 0.7177, + "step": 4973 + }, + { + "epoch": 0.2737630029170565, + "grad_norm": 0.807614266872406, + "learning_rate": 9.560244741394666e-06, + "loss": 0.8413, + "step": 4974 + }, + { + "epoch": 0.2738180417194122, + "grad_norm": 0.7618574500083923, + "learning_rate": 9.560066967681378e-06, + "loss": 0.8248, + "step": 4975 + }, + { + "epoch": 0.27387308052176784, + "grad_norm": 0.7886885404586792, + "learning_rate": 9.559889159695848e-06, + "loss": 0.8793, + "step": 4976 + }, + { + "epoch": 0.27392811932412353, + "grad_norm": 1.0090755224227905, + "learning_rate": 9.559711317439411e-06, + "loss": 0.9255, + "step": 4977 + }, + { + "epoch": 0.27398315812647916, + "grad_norm": 0.7855443358421326, + "learning_rate": 9.559533440913405e-06, + "loss": 0.8001, + "step": 4978 + }, + { + "epoch": 0.27403819692883485, + "grad_norm": 0.768741250038147, + "learning_rate": 9.559355530119165e-06, + "loss": 0.8109, + "step": 4979 + }, + { + "epoch": 0.2740932357311905, + "grad_norm": 0.759589672088623, + "learning_rate": 9.55917758505803e-06, + "loss": 0.8001, + "step": 4980 + }, + { + "epoch": 0.2741482745335462, + "grad_norm": 0.7937445640563965, + "learning_rate": 9.558999605731338e-06, + "loss": 0.8924, + "step": 4981 + }, + { + "epoch": 0.2742033133359018, + "grad_norm": 0.9041592478752136, + "learning_rate": 9.558821592140423e-06, + "loss": 0.9167, + "step": 4982 + }, + { + "epoch": 0.27425835213825744, + "grad_norm": 0.6971380710601807, + "learning_rate": 9.558643544286627e-06, + "loss": 0.7589, + "step": 4983 + }, + { + "epoch": 0.27431339094061313, + "grad_norm": 0.9292929172515869, + "learning_rate": 9.558465462171287e-06, + "loss": 0.9566, + "step": 4984 + }, + { + "epoch": 0.27436842974296877, + "grad_norm": 0.8320629000663757, + "learning_rate": 9.558287345795738e-06, + "loss": 0.8854, + "step": 4985 + }, + { + "epoch": 0.27442346854532446, + "grad_norm": 0.797272801399231, + "learning_rate": 9.558109195161325e-06, + "loss": 0.7838, + "step": 4986 + }, + { + "epoch": 0.2744785073476801, + "grad_norm": 0.9702700972557068, + "learning_rate": 9.557931010269382e-06, + "loss": 0.8593, + "step": 4987 + }, + { + "epoch": 0.2745335461500358, + "grad_norm": 0.8309103846549988, + "learning_rate": 9.557752791121248e-06, + "loss": 0.8902, + "step": 4988 + }, + { + "epoch": 0.2745885849523914, + "grad_norm": 0.706667959690094, + "learning_rate": 9.557574537718265e-06, + "loss": 0.7259, + "step": 4989 + }, + { + "epoch": 0.2746436237547471, + "grad_norm": 0.770239531993866, + "learning_rate": 9.557396250061771e-06, + "loss": 0.8644, + "step": 4990 + }, + { + "epoch": 0.27469866255710274, + "grad_norm": 0.8695803880691528, + "learning_rate": 9.557217928153108e-06, + "loss": 0.895, + "step": 4991 + }, + { + "epoch": 0.2747537013594584, + "grad_norm": 0.7525948286056519, + "learning_rate": 9.557039571993614e-06, + "loss": 0.7029, + "step": 4992 + }, + { + "epoch": 0.27480874016181406, + "grad_norm": 0.7616680264472961, + "learning_rate": 9.556861181584631e-06, + "loss": 0.8025, + "step": 4993 + }, + { + "epoch": 0.27486377896416975, + "grad_norm": 0.7216167449951172, + "learning_rate": 9.5566827569275e-06, + "loss": 0.8314, + "step": 4994 + }, + { + "epoch": 0.2749188177665254, + "grad_norm": 0.7412614226341248, + "learning_rate": 9.55650429802356e-06, + "loss": 0.7877, + "step": 4995 + }, + { + "epoch": 0.27497385656888107, + "grad_norm": 0.7176525592803955, + "learning_rate": 9.556325804874154e-06, + "loss": 0.7615, + "step": 4996 + }, + { + "epoch": 0.2750288953712367, + "grad_norm": 0.7544515132904053, + "learning_rate": 9.556147277480623e-06, + "loss": 0.8352, + "step": 4997 + }, + { + "epoch": 0.2750839341735924, + "grad_norm": 0.7318205833435059, + "learning_rate": 9.555968715844309e-06, + "loss": 0.7403, + "step": 4998 + }, + { + "epoch": 0.275138972975948, + "grad_norm": 0.7495027780532837, + "learning_rate": 9.555790119966552e-06, + "loss": 0.7611, + "step": 4999 + }, + { + "epoch": 0.2751940117783037, + "grad_norm": 0.7544401288032532, + "learning_rate": 9.555611489848697e-06, + "loss": 0.8594, + "step": 5000 + }, + { + "epoch": 0.27524905058065935, + "grad_norm": 0.7698250412940979, + "learning_rate": 9.555432825492084e-06, + "loss": 0.8438, + "step": 5001 + }, + { + "epoch": 0.27530408938301504, + "grad_norm": 0.7668892741203308, + "learning_rate": 9.555254126898059e-06, + "loss": 0.8082, + "step": 5002 + }, + { + "epoch": 0.27535912818537067, + "grad_norm": 0.9170669317245483, + "learning_rate": 9.555075394067963e-06, + "loss": 0.7443, + "step": 5003 + }, + { + "epoch": 0.27541416698772636, + "grad_norm": 0.7890255451202393, + "learning_rate": 9.55489662700314e-06, + "loss": 0.8269, + "step": 5004 + }, + { + "epoch": 0.275469205790082, + "grad_norm": 0.6740512847900391, + "learning_rate": 9.554717825704932e-06, + "loss": 0.6906, + "step": 5005 + }, + { + "epoch": 0.2755242445924377, + "grad_norm": 0.8032376170158386, + "learning_rate": 9.554538990174685e-06, + "loss": 0.812, + "step": 5006 + }, + { + "epoch": 0.2755792833947933, + "grad_norm": 0.6932135224342346, + "learning_rate": 9.554360120413741e-06, + "loss": 0.7823, + "step": 5007 + }, + { + "epoch": 0.275634322197149, + "grad_norm": 0.7447643876075745, + "learning_rate": 9.554181216423447e-06, + "loss": 0.8753, + "step": 5008 + }, + { + "epoch": 0.27568936099950464, + "grad_norm": 0.8035081624984741, + "learning_rate": 9.554002278205145e-06, + "loss": 0.7135, + "step": 5009 + }, + { + "epoch": 0.27574439980186033, + "grad_norm": 0.7544171214103699, + "learning_rate": 9.553823305760182e-06, + "loss": 0.7574, + "step": 5010 + }, + { + "epoch": 0.27579943860421596, + "grad_norm": 0.6648419499397278, + "learning_rate": 9.553644299089902e-06, + "loss": 0.7566, + "step": 5011 + }, + { + "epoch": 0.27585447740657165, + "grad_norm": 0.7481752038002014, + "learning_rate": 9.55346525819565e-06, + "loss": 0.7862, + "step": 5012 + }, + { + "epoch": 0.2759095162089273, + "grad_norm": 0.7000668048858643, + "learning_rate": 9.55328618307877e-06, + "loss": 0.7767, + "step": 5013 + }, + { + "epoch": 0.275964555011283, + "grad_norm": 0.7435166239738464, + "learning_rate": 9.553107073740612e-06, + "loss": 0.6888, + "step": 5014 + }, + { + "epoch": 0.2760195938136386, + "grad_norm": 0.7593170404434204, + "learning_rate": 9.552927930182521e-06, + "loss": 0.7272, + "step": 5015 + }, + { + "epoch": 0.2760746326159943, + "grad_norm": 0.870079755783081, + "learning_rate": 9.55274875240584e-06, + "loss": 0.8692, + "step": 5016 + }, + { + "epoch": 0.27612967141834993, + "grad_norm": 0.8550307750701904, + "learning_rate": 9.55256954041192e-06, + "loss": 0.8729, + "step": 5017 + }, + { + "epoch": 0.2761847102207056, + "grad_norm": 0.888830304145813, + "learning_rate": 9.552390294202105e-06, + "loss": 0.8607, + "step": 5018 + }, + { + "epoch": 0.27623974902306125, + "grad_norm": 0.8295729160308838, + "learning_rate": 9.552211013777743e-06, + "loss": 0.8722, + "step": 5019 + }, + { + "epoch": 0.27629478782541694, + "grad_norm": 0.7732356190681458, + "learning_rate": 9.552031699140182e-06, + "loss": 0.8332, + "step": 5020 + }, + { + "epoch": 0.2763498266277726, + "grad_norm": 0.9132987856864929, + "learning_rate": 9.55185235029077e-06, + "loss": 0.769, + "step": 5021 + }, + { + "epoch": 0.27640486543012827, + "grad_norm": 0.7221076488494873, + "learning_rate": 9.551672967230851e-06, + "loss": 0.8505, + "step": 5022 + }, + { + "epoch": 0.2764599042324839, + "grad_norm": 0.8526949882507324, + "learning_rate": 9.551493549961778e-06, + "loss": 0.8002, + "step": 5023 + }, + { + "epoch": 0.2765149430348396, + "grad_norm": 0.9513188004493713, + "learning_rate": 9.551314098484901e-06, + "loss": 0.8558, + "step": 5024 + }, + { + "epoch": 0.2765699818371952, + "grad_norm": 0.7543003559112549, + "learning_rate": 9.551134612801563e-06, + "loss": 0.8292, + "step": 5025 + }, + { + "epoch": 0.27662502063955086, + "grad_norm": 0.7531017065048218, + "learning_rate": 9.550955092913115e-06, + "loss": 0.7837, + "step": 5026 + }, + { + "epoch": 0.27668005944190655, + "grad_norm": 0.8725717663764954, + "learning_rate": 9.550775538820907e-06, + "loss": 0.8362, + "step": 5027 + }, + { + "epoch": 0.2767350982442622, + "grad_norm": 0.8122721910476685, + "learning_rate": 9.550595950526288e-06, + "loss": 0.8539, + "step": 5028 + }, + { + "epoch": 0.27679013704661787, + "grad_norm": 0.7756829261779785, + "learning_rate": 9.550416328030608e-06, + "loss": 0.787, + "step": 5029 + }, + { + "epoch": 0.2768451758489735, + "grad_norm": 0.9086001515388489, + "learning_rate": 9.550236671335218e-06, + "loss": 0.7972, + "step": 5030 + }, + { + "epoch": 0.2769002146513292, + "grad_norm": 0.7857060432434082, + "learning_rate": 9.550056980441466e-06, + "loss": 0.7577, + "step": 5031 + }, + { + "epoch": 0.2769552534536848, + "grad_norm": 0.8190392851829529, + "learning_rate": 9.549877255350703e-06, + "loss": 0.81, + "step": 5032 + }, + { + "epoch": 0.2770102922560405, + "grad_norm": 0.7714588642120361, + "learning_rate": 9.549697496064283e-06, + "loss": 0.7916, + "step": 5033 + }, + { + "epoch": 0.27706533105839615, + "grad_norm": 0.7178533673286438, + "learning_rate": 9.549517702583552e-06, + "loss": 0.8001, + "step": 5034 + }, + { + "epoch": 0.27712036986075184, + "grad_norm": 0.7552955150604248, + "learning_rate": 9.549337874909865e-06, + "loss": 0.8361, + "step": 5035 + }, + { + "epoch": 0.27717540866310747, + "grad_norm": 0.7823992371559143, + "learning_rate": 9.549158013044573e-06, + "loss": 0.7033, + "step": 5036 + }, + { + "epoch": 0.27723044746546316, + "grad_norm": 0.731504499912262, + "learning_rate": 9.548978116989026e-06, + "loss": 0.73, + "step": 5037 + }, + { + "epoch": 0.2772854862678188, + "grad_norm": 0.7455994486808777, + "learning_rate": 9.548798186744578e-06, + "loss": 0.8005, + "step": 5038 + }, + { + "epoch": 0.2773405250701745, + "grad_norm": 0.7020164728164673, + "learning_rate": 9.54861822231258e-06, + "loss": 0.6707, + "step": 5039 + }, + { + "epoch": 0.2773955638725301, + "grad_norm": 0.7526360750198364, + "learning_rate": 9.548438223694385e-06, + "loss": 0.7686, + "step": 5040 + }, + { + "epoch": 0.2774506026748858, + "grad_norm": 0.7268579006195068, + "learning_rate": 9.548258190891344e-06, + "loss": 0.7039, + "step": 5041 + }, + { + "epoch": 0.27750564147724144, + "grad_norm": 0.9361631274223328, + "learning_rate": 9.548078123904815e-06, + "loss": 0.8023, + "step": 5042 + }, + { + "epoch": 0.2775606802795971, + "grad_norm": 0.7786710262298584, + "learning_rate": 9.547898022736147e-06, + "loss": 0.6866, + "step": 5043 + }, + { + "epoch": 0.27761571908195276, + "grad_norm": 0.7175624370574951, + "learning_rate": 9.547717887386695e-06, + "loss": 0.7554, + "step": 5044 + }, + { + "epoch": 0.27767075788430845, + "grad_norm": 0.9157657623291016, + "learning_rate": 9.547537717857813e-06, + "loss": 0.7936, + "step": 5045 + }, + { + "epoch": 0.2777257966866641, + "grad_norm": 0.7881377935409546, + "learning_rate": 9.547357514150854e-06, + "loss": 0.8198, + "step": 5046 + }, + { + "epoch": 0.2777808354890198, + "grad_norm": 1.0444039106369019, + "learning_rate": 9.547177276267173e-06, + "loss": 0.7954, + "step": 5047 + }, + { + "epoch": 0.2778358742913754, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.546997004208124e-06, + "loss": 0.7697, + "step": 5048 + }, + { + "epoch": 0.2778909130937311, + "grad_norm": 0.7304134368896484, + "learning_rate": 9.546816697975066e-06, + "loss": 0.7034, + "step": 5049 + }, + { + "epoch": 0.27794595189608673, + "grad_norm": 0.7783082723617554, + "learning_rate": 9.546636357569347e-06, + "loss": 0.8185, + "step": 5050 + }, + { + "epoch": 0.2780009906984424, + "grad_norm": 0.750712513923645, + "learning_rate": 9.54645598299233e-06, + "loss": 0.7336, + "step": 5051 + }, + { + "epoch": 0.27805602950079805, + "grad_norm": 0.7849590182304382, + "learning_rate": 9.546275574245364e-06, + "loss": 0.8088, + "step": 5052 + }, + { + "epoch": 0.27811106830315374, + "grad_norm": 0.8490208983421326, + "learning_rate": 9.546095131329809e-06, + "loss": 0.8507, + "step": 5053 + }, + { + "epoch": 0.2781661071055094, + "grad_norm": 0.8107250928878784, + "learning_rate": 9.54591465424702e-06, + "loss": 0.7787, + "step": 5054 + }, + { + "epoch": 0.27822114590786506, + "grad_norm": 0.8278594613075256, + "learning_rate": 9.54573414299835e-06, + "loss": 0.7836, + "step": 5055 + }, + { + "epoch": 0.2782761847102207, + "grad_norm": 0.7982015013694763, + "learning_rate": 9.545553597585163e-06, + "loss": 0.7672, + "step": 5056 + }, + { + "epoch": 0.2783312235125764, + "grad_norm": 0.7311522364616394, + "learning_rate": 9.54537301800881e-06, + "loss": 0.7571, + "step": 5057 + }, + { + "epoch": 0.278386262314932, + "grad_norm": 0.8039999604225159, + "learning_rate": 9.545192404270651e-06, + "loss": 0.764, + "step": 5058 + }, + { + "epoch": 0.2784413011172877, + "grad_norm": 0.7810946702957153, + "learning_rate": 9.545011756372042e-06, + "loss": 0.9217, + "step": 5059 + }, + { + "epoch": 0.27849633991964334, + "grad_norm": 0.7092248797416687, + "learning_rate": 9.544831074314343e-06, + "loss": 0.7599, + "step": 5060 + }, + { + "epoch": 0.27855137872199903, + "grad_norm": 0.831550657749176, + "learning_rate": 9.544650358098908e-06, + "loss": 0.7278, + "step": 5061 + }, + { + "epoch": 0.27860641752435467, + "grad_norm": 0.7645474076271057, + "learning_rate": 9.544469607727098e-06, + "loss": 0.7945, + "step": 5062 + }, + { + "epoch": 0.27866145632671036, + "grad_norm": 0.6956788301467896, + "learning_rate": 9.544288823200273e-06, + "loss": 0.749, + "step": 5063 + }, + { + "epoch": 0.278716495129066, + "grad_norm": 0.7262974381446838, + "learning_rate": 9.544108004519786e-06, + "loss": 0.8074, + "step": 5064 + }, + { + "epoch": 0.2787715339314217, + "grad_norm": 0.7439202666282654, + "learning_rate": 9.543927151687001e-06, + "loss": 0.9403, + "step": 5065 + }, + { + "epoch": 0.2788265727337773, + "grad_norm": 0.8468778133392334, + "learning_rate": 9.543746264703277e-06, + "loss": 0.8182, + "step": 5066 + }, + { + "epoch": 0.278881611536133, + "grad_norm": 0.8396204113960266, + "learning_rate": 9.54356534356997e-06, + "loss": 0.8067, + "step": 5067 + }, + { + "epoch": 0.27893665033848863, + "grad_norm": 0.718758225440979, + "learning_rate": 9.543384388288445e-06, + "loss": 0.8172, + "step": 5068 + }, + { + "epoch": 0.27899168914084427, + "grad_norm": 0.7562685012817383, + "learning_rate": 9.543203398860056e-06, + "loss": 0.9053, + "step": 5069 + }, + { + "epoch": 0.27904672794319996, + "grad_norm": 0.9592792987823486, + "learning_rate": 9.543022375286169e-06, + "loss": 0.9375, + "step": 5070 + }, + { + "epoch": 0.2791017667455556, + "grad_norm": 0.7162739634513855, + "learning_rate": 9.54284131756814e-06, + "loss": 0.7297, + "step": 5071 + }, + { + "epoch": 0.2791568055479113, + "grad_norm": 0.7703517079353333, + "learning_rate": 9.542660225707335e-06, + "loss": 0.8863, + "step": 5072 + }, + { + "epoch": 0.2792118443502669, + "grad_norm": 0.7860418558120728, + "learning_rate": 9.542479099705109e-06, + "loss": 0.8335, + "step": 5073 + }, + { + "epoch": 0.2792668831526226, + "grad_norm": 0.8880825042724609, + "learning_rate": 9.542297939562825e-06, + "loss": 0.8344, + "step": 5074 + }, + { + "epoch": 0.27932192195497824, + "grad_norm": 0.7900505661964417, + "learning_rate": 9.542116745281849e-06, + "loss": 0.7613, + "step": 5075 + }, + { + "epoch": 0.2793769607573339, + "grad_norm": 0.7446081042289734, + "learning_rate": 9.541935516863536e-06, + "loss": 0.6615, + "step": 5076 + }, + { + "epoch": 0.27943199955968956, + "grad_norm": 0.7831308245658875, + "learning_rate": 9.541754254309254e-06, + "loss": 0.779, + "step": 5077 + }, + { + "epoch": 0.27948703836204525, + "grad_norm": 0.9007606506347656, + "learning_rate": 9.541572957620361e-06, + "loss": 0.8883, + "step": 5078 + }, + { + "epoch": 0.2795420771644009, + "grad_norm": 0.8033407330513, + "learning_rate": 9.541391626798222e-06, + "loss": 0.7354, + "step": 5079 + }, + { + "epoch": 0.27959711596675657, + "grad_norm": 0.9259470105171204, + "learning_rate": 9.5412102618442e-06, + "loss": 0.7602, + "step": 5080 + }, + { + "epoch": 0.2796521547691122, + "grad_norm": 0.786523163318634, + "learning_rate": 9.541028862759656e-06, + "loss": 0.7402, + "step": 5081 + }, + { + "epoch": 0.2797071935714679, + "grad_norm": 0.8053372502326965, + "learning_rate": 9.540847429545954e-06, + "loss": 0.825, + "step": 5082 + }, + { + "epoch": 0.2797622323738235, + "grad_norm": 0.8578022122383118, + "learning_rate": 9.54066596220446e-06, + "loss": 0.7866, + "step": 5083 + }, + { + "epoch": 0.2798172711761792, + "grad_norm": 0.916161835193634, + "learning_rate": 9.540484460736535e-06, + "loss": 0.5961, + "step": 5084 + }, + { + "epoch": 0.27987230997853485, + "grad_norm": 0.7843562960624695, + "learning_rate": 9.540302925143545e-06, + "loss": 0.764, + "step": 5085 + }, + { + "epoch": 0.27992734878089054, + "grad_norm": 0.7392510771751404, + "learning_rate": 9.540121355426852e-06, + "loss": 0.8038, + "step": 5086 + }, + { + "epoch": 0.2799823875832462, + "grad_norm": 0.7406296133995056, + "learning_rate": 9.539939751587825e-06, + "loss": 0.8202, + "step": 5087 + }, + { + "epoch": 0.28003742638560186, + "grad_norm": 0.7274924516677856, + "learning_rate": 9.539758113627823e-06, + "loss": 0.7691, + "step": 5088 + }, + { + "epoch": 0.2800924651879575, + "grad_norm": 0.8563184142112732, + "learning_rate": 9.539576441548218e-06, + "loss": 0.8341, + "step": 5089 + }, + { + "epoch": 0.2801475039903132, + "grad_norm": 0.7708351016044617, + "learning_rate": 9.539394735350366e-06, + "loss": 0.7126, + "step": 5090 + }, + { + "epoch": 0.2802025427926688, + "grad_norm": 0.7314836382865906, + "learning_rate": 9.539212995035642e-06, + "loss": 0.7465, + "step": 5091 + }, + { + "epoch": 0.2802575815950245, + "grad_norm": 0.7594754695892334, + "learning_rate": 9.539031220605409e-06, + "loss": 0.7563, + "step": 5092 + }, + { + "epoch": 0.28031262039738014, + "grad_norm": 0.699414074420929, + "learning_rate": 9.53884941206103e-06, + "loss": 0.7847, + "step": 5093 + }, + { + "epoch": 0.28036765919973583, + "grad_norm": 0.8013063073158264, + "learning_rate": 9.538667569403877e-06, + "loss": 0.7769, + "step": 5094 + }, + { + "epoch": 0.28042269800209146, + "grad_norm": 0.7778805494308472, + "learning_rate": 9.538485692635312e-06, + "loss": 0.7646, + "step": 5095 + }, + { + "epoch": 0.28047773680444715, + "grad_norm": 0.785649299621582, + "learning_rate": 9.538303781756702e-06, + "loss": 0.8162, + "step": 5096 + }, + { + "epoch": 0.2805327756068028, + "grad_norm": 0.7073212265968323, + "learning_rate": 9.538121836769417e-06, + "loss": 0.7208, + "step": 5097 + }, + { + "epoch": 0.2805878144091585, + "grad_norm": 0.7545642852783203, + "learning_rate": 9.53793985767482e-06, + "loss": 0.8673, + "step": 5098 + }, + { + "epoch": 0.2806428532115141, + "grad_norm": 0.6818416118621826, + "learning_rate": 9.537757844474285e-06, + "loss": 0.7576, + "step": 5099 + }, + { + "epoch": 0.2806978920138698, + "grad_norm": 0.6718038320541382, + "learning_rate": 9.537575797169176e-06, + "loss": 0.6683, + "step": 5100 + }, + { + "epoch": 0.28075293081622543, + "grad_norm": 0.7851004600524902, + "learning_rate": 9.53739371576086e-06, + "loss": 0.8871, + "step": 5101 + }, + { + "epoch": 0.2808079696185811, + "grad_norm": 0.7565650343894958, + "learning_rate": 9.53721160025071e-06, + "loss": 0.8799, + "step": 5102 + }, + { + "epoch": 0.28086300842093676, + "grad_norm": 0.7522932887077332, + "learning_rate": 9.537029450640091e-06, + "loss": 0.838, + "step": 5103 + }, + { + "epoch": 0.28091804722329244, + "grad_norm": 0.929634690284729, + "learning_rate": 9.536847266930375e-06, + "loss": 0.7997, + "step": 5104 + }, + { + "epoch": 0.2809730860256481, + "grad_norm": 0.8050084710121155, + "learning_rate": 9.536665049122928e-06, + "loss": 0.7652, + "step": 5105 + }, + { + "epoch": 0.28102812482800377, + "grad_norm": 0.7401233315467834, + "learning_rate": 9.53648279721912e-06, + "loss": 0.7904, + "step": 5106 + }, + { + "epoch": 0.2810831636303594, + "grad_norm": 0.7125453948974609, + "learning_rate": 9.536300511220322e-06, + "loss": 0.7349, + "step": 5107 + }, + { + "epoch": 0.2811382024327151, + "grad_norm": 0.7165758609771729, + "learning_rate": 9.536118191127905e-06, + "loss": 0.7314, + "step": 5108 + }, + { + "epoch": 0.2811932412350707, + "grad_norm": 0.7507439851760864, + "learning_rate": 9.535935836943237e-06, + "loss": 0.7603, + "step": 5109 + }, + { + "epoch": 0.2812482800374264, + "grad_norm": 0.7832109332084656, + "learning_rate": 9.535753448667688e-06, + "loss": 0.7279, + "step": 5110 + }, + { + "epoch": 0.28130331883978205, + "grad_norm": 0.7346609234809875, + "learning_rate": 9.535571026302633e-06, + "loss": 0.6882, + "step": 5111 + }, + { + "epoch": 0.2813583576421377, + "grad_norm": 0.7569608688354492, + "learning_rate": 9.535388569849437e-06, + "loss": 0.8451, + "step": 5112 + }, + { + "epoch": 0.28141339644449337, + "grad_norm": 0.7319865822792053, + "learning_rate": 9.535206079309478e-06, + "loss": 0.8161, + "step": 5113 + }, + { + "epoch": 0.281468435246849, + "grad_norm": 0.7744631171226501, + "learning_rate": 9.535023554684122e-06, + "loss": 0.8025, + "step": 5114 + }, + { + "epoch": 0.2815234740492047, + "grad_norm": 0.6867525577545166, + "learning_rate": 9.534840995974743e-06, + "loss": 0.7693, + "step": 5115 + }, + { + "epoch": 0.2815785128515603, + "grad_norm": 0.7625848054885864, + "learning_rate": 9.534658403182715e-06, + "loss": 0.8034, + "step": 5116 + }, + { + "epoch": 0.281633551653916, + "grad_norm": 0.7369832992553711, + "learning_rate": 9.534475776309406e-06, + "loss": 0.873, + "step": 5117 + }, + { + "epoch": 0.28168859045627165, + "grad_norm": 0.7267127633094788, + "learning_rate": 9.534293115356191e-06, + "loss": 0.7954, + "step": 5118 + }, + { + "epoch": 0.28174362925862734, + "grad_norm": 0.7244247794151306, + "learning_rate": 9.534110420324443e-06, + "loss": 0.7784, + "step": 5119 + }, + { + "epoch": 0.28179866806098297, + "grad_norm": 0.8207812905311584, + "learning_rate": 9.533927691215534e-06, + "loss": 0.8696, + "step": 5120 + }, + { + "epoch": 0.28185370686333866, + "grad_norm": 0.8669891357421875, + "learning_rate": 9.53374492803084e-06, + "loss": 0.8203, + "step": 5121 + }, + { + "epoch": 0.2819087456656943, + "grad_norm": 0.7650816440582275, + "learning_rate": 9.533562130771732e-06, + "loss": 0.77, + "step": 5122 + }, + { + "epoch": 0.28196378446805, + "grad_norm": 0.7664972543716431, + "learning_rate": 9.533379299439584e-06, + "loss": 0.7187, + "step": 5123 + }, + { + "epoch": 0.2820188232704056, + "grad_norm": 0.7921896576881409, + "learning_rate": 9.533196434035772e-06, + "loss": 0.8669, + "step": 5124 + }, + { + "epoch": 0.2820738620727613, + "grad_norm": 0.7714456915855408, + "learning_rate": 9.533013534561669e-06, + "loss": 0.8783, + "step": 5125 + }, + { + "epoch": 0.28212890087511694, + "grad_norm": 0.7222065329551697, + "learning_rate": 9.532830601018648e-06, + "loss": 0.7449, + "step": 5126 + }, + { + "epoch": 0.28218393967747263, + "grad_norm": 0.718142569065094, + "learning_rate": 9.532647633408085e-06, + "loss": 0.8226, + "step": 5127 + }, + { + "epoch": 0.28223897847982826, + "grad_norm": 0.730592668056488, + "learning_rate": 9.532464631731357e-06, + "loss": 0.7878, + "step": 5128 + }, + { + "epoch": 0.28229401728218395, + "grad_norm": 0.7841802835464478, + "learning_rate": 9.532281595989839e-06, + "loss": 0.8262, + "step": 5129 + }, + { + "epoch": 0.2823490560845396, + "grad_norm": 0.8617212772369385, + "learning_rate": 9.532098526184904e-06, + "loss": 0.8368, + "step": 5130 + }, + { + "epoch": 0.2824040948868953, + "grad_norm": 0.6968556642532349, + "learning_rate": 9.53191542231793e-06, + "loss": 0.6848, + "step": 5131 + }, + { + "epoch": 0.2824591336892509, + "grad_norm": 0.7872157096862793, + "learning_rate": 9.531732284390294e-06, + "loss": 0.7898, + "step": 5132 + }, + { + "epoch": 0.2825141724916066, + "grad_norm": 0.7727276086807251, + "learning_rate": 9.53154911240337e-06, + "loss": 0.8506, + "step": 5133 + }, + { + "epoch": 0.28256921129396223, + "grad_norm": 0.7279896140098572, + "learning_rate": 9.531365906358536e-06, + "loss": 0.7415, + "step": 5134 + }, + { + "epoch": 0.2826242500963179, + "grad_norm": 0.7457457780838013, + "learning_rate": 9.53118266625717e-06, + "loss": 0.7652, + "step": 5135 + }, + { + "epoch": 0.28267928889867355, + "grad_norm": 0.8989270329475403, + "learning_rate": 9.530999392100646e-06, + "loss": 0.9085, + "step": 5136 + }, + { + "epoch": 0.28273432770102924, + "grad_norm": 0.9622626304626465, + "learning_rate": 9.530816083890347e-06, + "loss": 0.8726, + "step": 5137 + }, + { + "epoch": 0.2827893665033849, + "grad_norm": 0.7712846994400024, + "learning_rate": 9.530632741627643e-06, + "loss": 0.765, + "step": 5138 + }, + { + "epoch": 0.28284440530574056, + "grad_norm": 0.8320727348327637, + "learning_rate": 9.530449365313918e-06, + "loss": 0.7828, + "step": 5139 + }, + { + "epoch": 0.2828994441080962, + "grad_norm": 0.9310963153839111, + "learning_rate": 9.530265954950549e-06, + "loss": 0.8482, + "step": 5140 + }, + { + "epoch": 0.2829544829104519, + "grad_norm": 0.9984502792358398, + "learning_rate": 9.530082510538914e-06, + "loss": 0.8673, + "step": 5141 + }, + { + "epoch": 0.2830095217128075, + "grad_norm": 0.8300992250442505, + "learning_rate": 9.52989903208039e-06, + "loss": 0.8232, + "step": 5142 + }, + { + "epoch": 0.2830645605151632, + "grad_norm": 0.930052638053894, + "learning_rate": 9.529715519576356e-06, + "loss": 0.7766, + "step": 5143 + }, + { + "epoch": 0.28311959931751884, + "grad_norm": 0.8038359880447388, + "learning_rate": 9.529531973028194e-06, + "loss": 0.712, + "step": 5144 + }, + { + "epoch": 0.28317463811987453, + "grad_norm": 0.856250524520874, + "learning_rate": 9.529348392437283e-06, + "loss": 0.8578, + "step": 5145 + }, + { + "epoch": 0.28322967692223017, + "grad_norm": 0.7602483630180359, + "learning_rate": 9.529164777805002e-06, + "loss": 0.749, + "step": 5146 + }, + { + "epoch": 0.28328471572458586, + "grad_norm": 0.8946549892425537, + "learning_rate": 9.52898112913273e-06, + "loss": 0.8101, + "step": 5147 + }, + { + "epoch": 0.2833397545269415, + "grad_norm": 0.8015615344047546, + "learning_rate": 9.52879744642185e-06, + "loss": 0.8203, + "step": 5148 + }, + { + "epoch": 0.2833947933292972, + "grad_norm": 0.7767183780670166, + "learning_rate": 9.528613729673738e-06, + "loss": 0.8409, + "step": 5149 + }, + { + "epoch": 0.2834498321316528, + "grad_norm": 0.7604000568389893, + "learning_rate": 9.52842997888978e-06, + "loss": 0.8853, + "step": 5150 + }, + { + "epoch": 0.2835048709340085, + "grad_norm": 0.7079401016235352, + "learning_rate": 9.528246194071353e-06, + "loss": 0.6855, + "step": 5151 + }, + { + "epoch": 0.28355990973636414, + "grad_norm": 0.7616782188415527, + "learning_rate": 9.52806237521984e-06, + "loss": 0.785, + "step": 5152 + }, + { + "epoch": 0.2836149485387198, + "grad_norm": 0.7408583760261536, + "learning_rate": 9.527878522336622e-06, + "loss": 0.7105, + "step": 5153 + }, + { + "epoch": 0.28366998734107546, + "grad_norm": 0.694821834564209, + "learning_rate": 9.52769463542308e-06, + "loss": 0.6552, + "step": 5154 + }, + { + "epoch": 0.2837250261434311, + "grad_norm": 0.796925961971283, + "learning_rate": 9.5275107144806e-06, + "loss": 0.7122, + "step": 5155 + }, + { + "epoch": 0.2837800649457868, + "grad_norm": 0.8001971244812012, + "learning_rate": 9.527326759510558e-06, + "loss": 0.8528, + "step": 5156 + }, + { + "epoch": 0.2838351037481424, + "grad_norm": 0.8605831265449524, + "learning_rate": 9.527142770514341e-06, + "loss": 0.7948, + "step": 5157 + }, + { + "epoch": 0.2838901425504981, + "grad_norm": 0.8380078077316284, + "learning_rate": 9.526958747493334e-06, + "loss": 0.8184, + "step": 5158 + }, + { + "epoch": 0.28394518135285374, + "grad_norm": 0.8758485317230225, + "learning_rate": 9.526774690448913e-06, + "loss": 0.7625, + "step": 5159 + }, + { + "epoch": 0.2840002201552094, + "grad_norm": 0.7078989744186401, + "learning_rate": 9.526590599382466e-06, + "loss": 0.8179, + "step": 5160 + }, + { + "epoch": 0.28405525895756506, + "grad_norm": 0.6668990850448608, + "learning_rate": 9.526406474295376e-06, + "loss": 0.7169, + "step": 5161 + }, + { + "epoch": 0.28411029775992075, + "grad_norm": 0.7666084170341492, + "learning_rate": 9.526222315189026e-06, + "loss": 0.8511, + "step": 5162 + }, + { + "epoch": 0.2841653365622764, + "grad_norm": 0.7390545606613159, + "learning_rate": 9.526038122064802e-06, + "loss": 0.7926, + "step": 5163 + }, + { + "epoch": 0.28422037536463207, + "grad_norm": 0.7972092032432556, + "learning_rate": 9.525853894924086e-06, + "loss": 0.9166, + "step": 5164 + }, + { + "epoch": 0.2842754141669877, + "grad_norm": 0.8988455533981323, + "learning_rate": 9.525669633768265e-06, + "loss": 0.9497, + "step": 5165 + }, + { + "epoch": 0.2843304529693434, + "grad_norm": 0.7092710137367249, + "learning_rate": 9.525485338598722e-06, + "loss": 0.7241, + "step": 5166 + }, + { + "epoch": 0.28438549177169903, + "grad_norm": 0.8630063533782959, + "learning_rate": 9.525301009416843e-06, + "loss": 0.8318, + "step": 5167 + }, + { + "epoch": 0.2844405305740547, + "grad_norm": 0.7336890697479248, + "learning_rate": 9.52511664622401e-06, + "loss": 0.7077, + "step": 5168 + }, + { + "epoch": 0.28449556937641035, + "grad_norm": 0.8156722784042358, + "learning_rate": 9.524932249021615e-06, + "loss": 0.8573, + "step": 5169 + }, + { + "epoch": 0.28455060817876604, + "grad_norm": 0.7061388492584229, + "learning_rate": 9.524747817811038e-06, + "loss": 0.7432, + "step": 5170 + }, + { + "epoch": 0.2846056469811217, + "grad_norm": 0.7948413491249084, + "learning_rate": 9.52456335259367e-06, + "loss": 0.8082, + "step": 5171 + }, + { + "epoch": 0.28466068578347736, + "grad_norm": 0.7208091020584106, + "learning_rate": 9.524378853370893e-06, + "loss": 0.7027, + "step": 5172 + }, + { + "epoch": 0.284715724585833, + "grad_norm": 0.8377540111541748, + "learning_rate": 9.524194320144096e-06, + "loss": 0.7093, + "step": 5173 + }, + { + "epoch": 0.2847707633881887, + "grad_norm": 0.8734563589096069, + "learning_rate": 9.524009752914666e-06, + "loss": 0.8422, + "step": 5174 + }, + { + "epoch": 0.2848258021905443, + "grad_norm": 0.7303940653800964, + "learning_rate": 9.523825151683989e-06, + "loss": 0.811, + "step": 5175 + }, + { + "epoch": 0.2848808409929, + "grad_norm": 0.7653842568397522, + "learning_rate": 9.523640516453455e-06, + "loss": 0.8595, + "step": 5176 + }, + { + "epoch": 0.28493587979525564, + "grad_norm": 0.7366930246353149, + "learning_rate": 9.523455847224448e-06, + "loss": 0.7832, + "step": 5177 + }, + { + "epoch": 0.28499091859761133, + "grad_norm": 0.7908505797386169, + "learning_rate": 9.523271143998357e-06, + "loss": 0.8115, + "step": 5178 + }, + { + "epoch": 0.28504595739996696, + "grad_norm": 0.8176048398017883, + "learning_rate": 9.523086406776572e-06, + "loss": 0.8377, + "step": 5179 + }, + { + "epoch": 0.28510099620232265, + "grad_norm": 0.724086344242096, + "learning_rate": 9.52290163556048e-06, + "loss": 0.7804, + "step": 5180 + }, + { + "epoch": 0.2851560350046783, + "grad_norm": 0.6461299657821655, + "learning_rate": 9.52271683035147e-06, + "loss": 0.5727, + "step": 5181 + }, + { + "epoch": 0.285211073807034, + "grad_norm": 0.7275353074073792, + "learning_rate": 9.522531991150932e-06, + "loss": 0.8345, + "step": 5182 + }, + { + "epoch": 0.2852661126093896, + "grad_norm": 0.7321951985359192, + "learning_rate": 9.522347117960253e-06, + "loss": 0.8832, + "step": 5183 + }, + { + "epoch": 0.2853211514117453, + "grad_norm": 0.7526552677154541, + "learning_rate": 9.522162210780825e-06, + "loss": 0.831, + "step": 5184 + }, + { + "epoch": 0.28537619021410093, + "grad_norm": 0.7592381238937378, + "learning_rate": 9.521977269614036e-06, + "loss": 0.7293, + "step": 5185 + }, + { + "epoch": 0.2854312290164566, + "grad_norm": 0.8060448169708252, + "learning_rate": 9.521792294461274e-06, + "loss": 0.819, + "step": 5186 + }, + { + "epoch": 0.28548626781881226, + "grad_norm": 0.7178553342819214, + "learning_rate": 9.521607285323932e-06, + "loss": 0.7526, + "step": 5187 + }, + { + "epoch": 0.28554130662116795, + "grad_norm": 0.8186969757080078, + "learning_rate": 9.521422242203401e-06, + "loss": 0.8526, + "step": 5188 + }, + { + "epoch": 0.2855963454235236, + "grad_norm": 0.8480883240699768, + "learning_rate": 9.521237165101071e-06, + "loss": 0.8088, + "step": 5189 + }, + { + "epoch": 0.28565138422587927, + "grad_norm": 0.8053719401359558, + "learning_rate": 9.521052054018333e-06, + "loss": 0.928, + "step": 5190 + }, + { + "epoch": 0.2857064230282349, + "grad_norm": 0.6937163472175598, + "learning_rate": 9.52086690895658e-06, + "loss": 0.7418, + "step": 5191 + }, + { + "epoch": 0.2857614618305906, + "grad_norm": 1.0616179704666138, + "learning_rate": 9.520681729917196e-06, + "loss": 0.8726, + "step": 5192 + }, + { + "epoch": 0.2858165006329462, + "grad_norm": 0.7504106163978577, + "learning_rate": 9.520496516901582e-06, + "loss": 0.844, + "step": 5193 + }, + { + "epoch": 0.2858715394353019, + "grad_norm": 0.7634509205818176, + "learning_rate": 9.520311269911127e-06, + "loss": 0.7595, + "step": 5194 + }, + { + "epoch": 0.28592657823765755, + "grad_norm": 0.7069799900054932, + "learning_rate": 9.52012598894722e-06, + "loss": 0.7566, + "step": 5195 + }, + { + "epoch": 0.28598161704001324, + "grad_norm": 0.695737361907959, + "learning_rate": 9.519940674011256e-06, + "loss": 0.7534, + "step": 5196 + }, + { + "epoch": 0.28603665584236887, + "grad_norm": 0.7212124466896057, + "learning_rate": 9.51975532510463e-06, + "loss": 0.8237, + "step": 5197 + }, + { + "epoch": 0.2860916946447245, + "grad_norm": 0.7274062633514404, + "learning_rate": 9.519569942228732e-06, + "loss": 0.756, + "step": 5198 + }, + { + "epoch": 0.2861467334470802, + "grad_norm": 0.7038697600364685, + "learning_rate": 9.519384525384956e-06, + "loss": 0.7308, + "step": 5199 + }, + { + "epoch": 0.2862017722494358, + "grad_norm": 0.6897109150886536, + "learning_rate": 9.519199074574694e-06, + "loss": 0.7858, + "step": 5200 + }, + { + "epoch": 0.2862568110517915, + "grad_norm": 0.8471527099609375, + "learning_rate": 9.519013589799343e-06, + "loss": 0.8198, + "step": 5201 + }, + { + "epoch": 0.28631184985414715, + "grad_norm": 0.6828129291534424, + "learning_rate": 9.518828071060295e-06, + "loss": 0.7734, + "step": 5202 + }, + { + "epoch": 0.28636688865650284, + "grad_norm": 0.7437755465507507, + "learning_rate": 9.518642518358946e-06, + "loss": 0.7669, + "step": 5203 + }, + { + "epoch": 0.28642192745885847, + "grad_norm": 0.8841923475265503, + "learning_rate": 9.518456931696689e-06, + "loss": 0.8201, + "step": 5204 + }, + { + "epoch": 0.28647696626121416, + "grad_norm": 0.9514154195785522, + "learning_rate": 9.518271311074917e-06, + "loss": 0.7864, + "step": 5205 + }, + { + "epoch": 0.2865320050635698, + "grad_norm": 0.830795407295227, + "learning_rate": 9.51808565649503e-06, + "loss": 0.8024, + "step": 5206 + }, + { + "epoch": 0.2865870438659255, + "grad_norm": 0.7274934649467468, + "learning_rate": 9.51789996795842e-06, + "loss": 0.7631, + "step": 5207 + }, + { + "epoch": 0.2866420826682811, + "grad_norm": 0.7004290223121643, + "learning_rate": 9.517714245466482e-06, + "loss": 0.7344, + "step": 5208 + }, + { + "epoch": 0.2866971214706368, + "grad_norm": 0.8559010624885559, + "learning_rate": 9.517528489020614e-06, + "loss": 0.7502, + "step": 5209 + }, + { + "epoch": 0.28675216027299244, + "grad_norm": 0.8913494348526001, + "learning_rate": 9.517342698622212e-06, + "loss": 0.8908, + "step": 5210 + }, + { + "epoch": 0.28680719907534813, + "grad_norm": 0.8375207781791687, + "learning_rate": 9.51715687427267e-06, + "loss": 0.7701, + "step": 5211 + }, + { + "epoch": 0.28686223787770376, + "grad_norm": 1.1804776191711426, + "learning_rate": 9.516971015973386e-06, + "loss": 0.8449, + "step": 5212 + }, + { + "epoch": 0.28691727668005945, + "grad_norm": 0.7260473370552063, + "learning_rate": 9.516785123725758e-06, + "loss": 0.7978, + "step": 5213 + }, + { + "epoch": 0.2869723154824151, + "grad_norm": 0.8159041404724121, + "learning_rate": 9.516599197531182e-06, + "loss": 0.7454, + "step": 5214 + }, + { + "epoch": 0.2870273542847708, + "grad_norm": 0.7850227952003479, + "learning_rate": 9.516413237391056e-06, + "loss": 0.8082, + "step": 5215 + }, + { + "epoch": 0.2870823930871264, + "grad_norm": 0.7596960067749023, + "learning_rate": 9.516227243306774e-06, + "loss": 0.7286, + "step": 5216 + }, + { + "epoch": 0.2871374318894821, + "grad_norm": 0.8763321042060852, + "learning_rate": 9.516041215279741e-06, + "loss": 0.8685, + "step": 5217 + }, + { + "epoch": 0.28719247069183773, + "grad_norm": 1.2130110263824463, + "learning_rate": 9.515855153311349e-06, + "loss": 0.8374, + "step": 5218 + }, + { + "epoch": 0.2872475094941934, + "grad_norm": 0.7578628063201904, + "learning_rate": 9.515669057402999e-06, + "loss": 0.793, + "step": 5219 + }, + { + "epoch": 0.28730254829654905, + "grad_norm": 0.9085225462913513, + "learning_rate": 9.515482927556088e-06, + "loss": 0.8366, + "step": 5220 + }, + { + "epoch": 0.28735758709890474, + "grad_norm": 0.7107900977134705, + "learning_rate": 9.515296763772017e-06, + "loss": 0.6571, + "step": 5221 + }, + { + "epoch": 0.2874126259012604, + "grad_norm": 0.7742018699645996, + "learning_rate": 9.515110566052183e-06, + "loss": 0.8387, + "step": 5222 + }, + { + "epoch": 0.28746766470361607, + "grad_norm": 0.8934319615364075, + "learning_rate": 9.514924334397987e-06, + "loss": 0.8546, + "step": 5223 + }, + { + "epoch": 0.2875227035059717, + "grad_norm": 0.720245897769928, + "learning_rate": 9.51473806881083e-06, + "loss": 0.7459, + "step": 5224 + }, + { + "epoch": 0.2875777423083274, + "grad_norm": 0.7074370384216309, + "learning_rate": 9.514551769292109e-06, + "loss": 0.8598, + "step": 5225 + }, + { + "epoch": 0.287632781110683, + "grad_norm": 0.7608621120452881, + "learning_rate": 9.514365435843226e-06, + "loss": 0.7263, + "step": 5226 + }, + { + "epoch": 0.2876878199130387, + "grad_norm": 0.7581011652946472, + "learning_rate": 9.51417906846558e-06, + "loss": 0.7498, + "step": 5227 + }, + { + "epoch": 0.28774285871539435, + "grad_norm": 0.8184412121772766, + "learning_rate": 9.513992667160572e-06, + "loss": 0.6889, + "step": 5228 + }, + { + "epoch": 0.28779789751775003, + "grad_norm": 0.6835145354270935, + "learning_rate": 9.513806231929605e-06, + "loss": 0.7399, + "step": 5229 + }, + { + "epoch": 0.28785293632010567, + "grad_norm": 0.7601536512374878, + "learning_rate": 9.513619762774077e-06, + "loss": 0.846, + "step": 5230 + }, + { + "epoch": 0.28790797512246136, + "grad_norm": 0.781491219997406, + "learning_rate": 9.513433259695392e-06, + "loss": 0.8326, + "step": 5231 + }, + { + "epoch": 0.287963013924817, + "grad_norm": 0.7978106141090393, + "learning_rate": 9.513246722694951e-06, + "loss": 0.7917, + "step": 5232 + }, + { + "epoch": 0.2880180527271727, + "grad_norm": 0.8071381449699402, + "learning_rate": 9.513060151774156e-06, + "loss": 0.8054, + "step": 5233 + }, + { + "epoch": 0.2880730915295283, + "grad_norm": 0.815567135810852, + "learning_rate": 9.512873546934406e-06, + "loss": 0.8647, + "step": 5234 + }, + { + "epoch": 0.288128130331884, + "grad_norm": 0.8255048990249634, + "learning_rate": 9.512686908177111e-06, + "loss": 0.9011, + "step": 5235 + }, + { + "epoch": 0.28818316913423964, + "grad_norm": 0.8392062187194824, + "learning_rate": 9.512500235503666e-06, + "loss": 0.8778, + "step": 5236 + }, + { + "epoch": 0.2882382079365953, + "grad_norm": 0.7256191372871399, + "learning_rate": 9.512313528915478e-06, + "loss": 0.7231, + "step": 5237 + }, + { + "epoch": 0.28829324673895096, + "grad_norm": 0.9041032195091248, + "learning_rate": 9.51212678841395e-06, + "loss": 0.8469, + "step": 5238 + }, + { + "epoch": 0.28834828554130665, + "grad_norm": 0.7857525944709778, + "learning_rate": 9.511940014000485e-06, + "loss": 0.7447, + "step": 5239 + }, + { + "epoch": 0.2884033243436623, + "grad_norm": 0.6925225257873535, + "learning_rate": 9.511753205676485e-06, + "loss": 0.8302, + "step": 5240 + }, + { + "epoch": 0.2884583631460179, + "grad_norm": 0.7253623008728027, + "learning_rate": 9.511566363443356e-06, + "loss": 0.8373, + "step": 5241 + }, + { + "epoch": 0.2885134019483736, + "grad_norm": 0.7198607921600342, + "learning_rate": 9.511379487302504e-06, + "loss": 0.79, + "step": 5242 + }, + { + "epoch": 0.28856844075072924, + "grad_norm": 0.7966421246528625, + "learning_rate": 9.511192577255328e-06, + "loss": 0.7933, + "step": 5243 + }, + { + "epoch": 0.2886234795530849, + "grad_norm": 0.9159359931945801, + "learning_rate": 9.511005633303239e-06, + "loss": 0.7254, + "step": 5244 + }, + { + "epoch": 0.28867851835544056, + "grad_norm": 0.9514481425285339, + "learning_rate": 9.510818655447638e-06, + "loss": 0.8916, + "step": 5245 + }, + { + "epoch": 0.28873355715779625, + "grad_norm": 0.7505099773406982, + "learning_rate": 9.510631643689932e-06, + "loss": 0.765, + "step": 5246 + }, + { + "epoch": 0.2887885959601519, + "grad_norm": 0.7824658751487732, + "learning_rate": 9.510444598031526e-06, + "loss": 0.6972, + "step": 5247 + }, + { + "epoch": 0.2888436347625076, + "grad_norm": 0.7778681516647339, + "learning_rate": 9.510257518473824e-06, + "loss": 0.8705, + "step": 5248 + }, + { + "epoch": 0.2888986735648632, + "grad_norm": 0.6785199642181396, + "learning_rate": 9.510070405018235e-06, + "loss": 0.6889, + "step": 5249 + }, + { + "epoch": 0.2889537123672189, + "grad_norm": 0.7045316100120544, + "learning_rate": 9.509883257666164e-06, + "loss": 0.7979, + "step": 5250 + }, + { + "epoch": 0.28900875116957453, + "grad_norm": 1.3174562454223633, + "learning_rate": 9.509696076419018e-06, + "loss": 0.8802, + "step": 5251 + }, + { + "epoch": 0.2890637899719302, + "grad_norm": 1.1800767183303833, + "learning_rate": 9.509508861278205e-06, + "loss": 0.9246, + "step": 5252 + }, + { + "epoch": 0.28911882877428585, + "grad_norm": 0.7057580947875977, + "learning_rate": 9.509321612245128e-06, + "loss": 0.7565, + "step": 5253 + }, + { + "epoch": 0.28917386757664154, + "grad_norm": 0.7681905031204224, + "learning_rate": 9.509134329321197e-06, + "loss": 0.8678, + "step": 5254 + }, + { + "epoch": 0.2892289063789972, + "grad_norm": 0.96025550365448, + "learning_rate": 9.50894701250782e-06, + "loss": 0.9108, + "step": 5255 + }, + { + "epoch": 0.28928394518135286, + "grad_norm": 0.7786841988563538, + "learning_rate": 9.508759661806405e-06, + "loss": 0.7747, + "step": 5256 + }, + { + "epoch": 0.2893389839837085, + "grad_norm": 0.7073540091514587, + "learning_rate": 9.508572277218358e-06, + "loss": 0.7573, + "step": 5257 + }, + { + "epoch": 0.2893940227860642, + "grad_norm": 0.6648856401443481, + "learning_rate": 9.50838485874509e-06, + "loss": 0.7294, + "step": 5258 + }, + { + "epoch": 0.2894490615884198, + "grad_norm": 0.6794270873069763, + "learning_rate": 9.508197406388007e-06, + "loss": 0.7001, + "step": 5259 + }, + { + "epoch": 0.2895041003907755, + "grad_norm": 0.6819350123405457, + "learning_rate": 9.50800992014852e-06, + "loss": 0.7114, + "step": 5260 + }, + { + "epoch": 0.28955913919313114, + "grad_norm": 0.6616997122764587, + "learning_rate": 9.507822400028036e-06, + "loss": 0.7108, + "step": 5261 + }, + { + "epoch": 0.28961417799548683, + "grad_norm": 0.7447230219841003, + "learning_rate": 9.507634846027966e-06, + "loss": 0.7865, + "step": 5262 + }, + { + "epoch": 0.28966921679784247, + "grad_norm": 0.7826278209686279, + "learning_rate": 9.50744725814972e-06, + "loss": 0.7922, + "step": 5263 + }, + { + "epoch": 0.28972425560019816, + "grad_norm": 0.8054459095001221, + "learning_rate": 9.507259636394706e-06, + "loss": 0.795, + "step": 5264 + }, + { + "epoch": 0.2897792944025538, + "grad_norm": 0.9539191722869873, + "learning_rate": 9.507071980764335e-06, + "loss": 0.9495, + "step": 5265 + }, + { + "epoch": 0.2898343332049095, + "grad_norm": 0.8877993226051331, + "learning_rate": 9.506884291260017e-06, + "loss": 0.8418, + "step": 5266 + }, + { + "epoch": 0.2898893720072651, + "grad_norm": 0.6620327234268188, + "learning_rate": 9.506696567883164e-06, + "loss": 0.6285, + "step": 5267 + }, + { + "epoch": 0.2899444108096208, + "grad_norm": 0.7604434490203857, + "learning_rate": 9.506508810635187e-06, + "loss": 0.8562, + "step": 5268 + }, + { + "epoch": 0.28999944961197643, + "grad_norm": 0.8181812763214111, + "learning_rate": 9.506321019517494e-06, + "loss": 0.905, + "step": 5269 + }, + { + "epoch": 0.2900544884143321, + "grad_norm": 0.7776391506195068, + "learning_rate": 9.5061331945315e-06, + "loss": 0.8871, + "step": 5270 + }, + { + "epoch": 0.29010952721668776, + "grad_norm": 0.8125039339065552, + "learning_rate": 9.505945335678613e-06, + "loss": 0.7254, + "step": 5271 + }, + { + "epoch": 0.29016456601904345, + "grad_norm": 0.7229846715927124, + "learning_rate": 9.50575744296025e-06, + "loss": 0.8192, + "step": 5272 + }, + { + "epoch": 0.2902196048213991, + "grad_norm": 0.72443026304245, + "learning_rate": 9.505569516377817e-06, + "loss": 0.7813, + "step": 5273 + }, + { + "epoch": 0.29027464362375477, + "grad_norm": 0.6798073053359985, + "learning_rate": 9.505381555932731e-06, + "loss": 0.7655, + "step": 5274 + }, + { + "epoch": 0.2903296824261104, + "grad_norm": 1.0805624723434448, + "learning_rate": 9.505193561626404e-06, + "loss": 0.9035, + "step": 5275 + }, + { + "epoch": 0.2903847212284661, + "grad_norm": 0.7579694986343384, + "learning_rate": 9.505005533460247e-06, + "loss": 0.8612, + "step": 5276 + }, + { + "epoch": 0.2904397600308217, + "grad_norm": 1.2496099472045898, + "learning_rate": 9.504817471435676e-06, + "loss": 0.813, + "step": 5277 + }, + { + "epoch": 0.2904947988331774, + "grad_norm": 0.6915673017501831, + "learning_rate": 9.504629375554102e-06, + "loss": 0.6891, + "step": 5278 + }, + { + "epoch": 0.29054983763553305, + "grad_norm": 0.8581767082214355, + "learning_rate": 9.504441245816937e-06, + "loss": 0.7137, + "step": 5279 + }, + { + "epoch": 0.29060487643788874, + "grad_norm": 0.7469545006752014, + "learning_rate": 9.504253082225601e-06, + "loss": 0.7621, + "step": 5280 + }, + { + "epoch": 0.29065991524024437, + "grad_norm": 0.7725615501403809, + "learning_rate": 9.504064884781503e-06, + "loss": 0.7988, + "step": 5281 + }, + { + "epoch": 0.29071495404260006, + "grad_norm": 1.0187722444534302, + "learning_rate": 9.503876653486058e-06, + "loss": 0.7772, + "step": 5282 + }, + { + "epoch": 0.2907699928449557, + "grad_norm": 0.675574779510498, + "learning_rate": 9.503688388340683e-06, + "loss": 0.7096, + "step": 5283 + }, + { + "epoch": 0.2908250316473113, + "grad_norm": 0.7980207800865173, + "learning_rate": 9.503500089346792e-06, + "loss": 0.8291, + "step": 5284 + }, + { + "epoch": 0.290880070449667, + "grad_norm": 0.6891655325889587, + "learning_rate": 9.503311756505797e-06, + "loss": 0.7186, + "step": 5285 + }, + { + "epoch": 0.29093510925202265, + "grad_norm": 0.7273408770561218, + "learning_rate": 9.50312338981912e-06, + "loss": 0.7483, + "step": 5286 + }, + { + "epoch": 0.29099014805437834, + "grad_norm": 0.7346869111061096, + "learning_rate": 9.50293498928817e-06, + "loss": 0.766, + "step": 5287 + }, + { + "epoch": 0.291045186856734, + "grad_norm": 0.7627394795417786, + "learning_rate": 9.502746554914368e-06, + "loss": 0.867, + "step": 5288 + }, + { + "epoch": 0.29110022565908966, + "grad_norm": 0.8477200865745544, + "learning_rate": 9.502558086699128e-06, + "loss": 0.8317, + "step": 5289 + }, + { + "epoch": 0.2911552644614453, + "grad_norm": 0.7696006894111633, + "learning_rate": 9.502369584643867e-06, + "loss": 0.7814, + "step": 5290 + }, + { + "epoch": 0.291210303263801, + "grad_norm": 0.7614455819129944, + "learning_rate": 9.502181048749999e-06, + "loss": 0.7398, + "step": 5291 + }, + { + "epoch": 0.2912653420661566, + "grad_norm": 0.7877628207206726, + "learning_rate": 9.501992479018946e-06, + "loss": 0.8731, + "step": 5292 + }, + { + "epoch": 0.2913203808685123, + "grad_norm": 0.7455846667289734, + "learning_rate": 9.50180387545212e-06, + "loss": 0.7059, + "step": 5293 + }, + { + "epoch": 0.29137541967086794, + "grad_norm": 1.145520567893982, + "learning_rate": 9.501615238050944e-06, + "loss": 0.6968, + "step": 5294 + }, + { + "epoch": 0.29143045847322363, + "grad_norm": 0.8100234866142273, + "learning_rate": 9.501426566816831e-06, + "loss": 0.8122, + "step": 5295 + }, + { + "epoch": 0.29148549727557926, + "grad_norm": 0.6813066005706787, + "learning_rate": 9.501237861751203e-06, + "loss": 0.6718, + "step": 5296 + }, + { + "epoch": 0.29154053607793495, + "grad_norm": 0.7400195002555847, + "learning_rate": 9.501049122855473e-06, + "loss": 0.802, + "step": 5297 + }, + { + "epoch": 0.2915955748802906, + "grad_norm": 0.7948681712150574, + "learning_rate": 9.500860350131065e-06, + "loss": 0.8237, + "step": 5298 + }, + { + "epoch": 0.2916506136826463, + "grad_norm": 0.772093653678894, + "learning_rate": 9.500671543579394e-06, + "loss": 0.7687, + "step": 5299 + }, + { + "epoch": 0.2917056524850019, + "grad_norm": 0.7468486428260803, + "learning_rate": 9.500482703201881e-06, + "loss": 0.7827, + "step": 5300 + }, + { + "epoch": 0.2917606912873576, + "grad_norm": 0.7284440398216248, + "learning_rate": 9.500293828999945e-06, + "loss": 0.8086, + "step": 5301 + }, + { + "epoch": 0.29181573008971323, + "grad_norm": 0.8014211654663086, + "learning_rate": 9.500104920975005e-06, + "loss": 0.8409, + "step": 5302 + }, + { + "epoch": 0.2918707688920689, + "grad_norm": 0.7588346004486084, + "learning_rate": 9.49991597912848e-06, + "loss": 0.7149, + "step": 5303 + }, + { + "epoch": 0.29192580769442456, + "grad_norm": 0.8098518252372742, + "learning_rate": 9.499727003461794e-06, + "loss": 0.8375, + "step": 5304 + }, + { + "epoch": 0.29198084649678024, + "grad_norm": 0.8502426743507385, + "learning_rate": 9.499537993976363e-06, + "loss": 0.8177, + "step": 5305 + }, + { + "epoch": 0.2920358852991359, + "grad_norm": 0.8010903596878052, + "learning_rate": 9.499348950673607e-06, + "loss": 0.8457, + "step": 5306 + }, + { + "epoch": 0.29209092410149157, + "grad_norm": 0.6628156304359436, + "learning_rate": 9.49915987355495e-06, + "loss": 0.7327, + "step": 5307 + }, + { + "epoch": 0.2921459629038472, + "grad_norm": 0.7414939999580383, + "learning_rate": 9.49897076262181e-06, + "loss": 0.8271, + "step": 5308 + }, + { + "epoch": 0.2922010017062029, + "grad_norm": 0.7490847706794739, + "learning_rate": 9.498781617875613e-06, + "loss": 0.7689, + "step": 5309 + }, + { + "epoch": 0.2922560405085585, + "grad_norm": 0.7913424968719482, + "learning_rate": 9.498592439317777e-06, + "loss": 0.8571, + "step": 5310 + }, + { + "epoch": 0.2923110793109142, + "grad_norm": 0.6903867125511169, + "learning_rate": 9.498403226949724e-06, + "loss": 0.7325, + "step": 5311 + }, + { + "epoch": 0.29236611811326985, + "grad_norm": 0.8087130188941956, + "learning_rate": 9.498213980772875e-06, + "loss": 0.8167, + "step": 5312 + }, + { + "epoch": 0.29242115691562554, + "grad_norm": 1.1316752433776855, + "learning_rate": 9.498024700788655e-06, + "loss": 0.912, + "step": 5313 + }, + { + "epoch": 0.29247619571798117, + "grad_norm": 0.8701719045639038, + "learning_rate": 9.497835386998486e-06, + "loss": 0.8728, + "step": 5314 + }, + { + "epoch": 0.29253123452033686, + "grad_norm": 0.6688953638076782, + "learning_rate": 9.49764603940379e-06, + "loss": 0.6561, + "step": 5315 + }, + { + "epoch": 0.2925862733226925, + "grad_norm": 0.8067505359649658, + "learning_rate": 9.49745665800599e-06, + "loss": 0.8419, + "step": 5316 + }, + { + "epoch": 0.2926413121250482, + "grad_norm": 0.7157390117645264, + "learning_rate": 9.49726724280651e-06, + "loss": 0.7964, + "step": 5317 + }, + { + "epoch": 0.2926963509274038, + "grad_norm": 0.7038627862930298, + "learning_rate": 9.497077793806772e-06, + "loss": 0.7343, + "step": 5318 + }, + { + "epoch": 0.2927513897297595, + "grad_norm": 0.7674478888511658, + "learning_rate": 9.4968883110082e-06, + "loss": 0.7624, + "step": 5319 + }, + { + "epoch": 0.29280642853211514, + "grad_norm": 0.6708847284317017, + "learning_rate": 9.496698794412223e-06, + "loss": 0.6554, + "step": 5320 + }, + { + "epoch": 0.2928614673344708, + "grad_norm": 0.8332329392433167, + "learning_rate": 9.49650924402026e-06, + "loss": 0.9357, + "step": 5321 + }, + { + "epoch": 0.29291650613682646, + "grad_norm": 0.7601341605186462, + "learning_rate": 9.496319659833737e-06, + "loss": 0.8208, + "step": 5322 + }, + { + "epoch": 0.29297154493918215, + "grad_norm": 0.8320396542549133, + "learning_rate": 9.496130041854077e-06, + "loss": 0.8423, + "step": 5323 + }, + { + "epoch": 0.2930265837415378, + "grad_norm": 0.8242839574813843, + "learning_rate": 9.49594039008271e-06, + "loss": 0.9101, + "step": 5324 + }, + { + "epoch": 0.29308162254389347, + "grad_norm": 0.8906320333480835, + "learning_rate": 9.495750704521058e-06, + "loss": 0.7343, + "step": 5325 + }, + { + "epoch": 0.2931366613462491, + "grad_norm": 0.7964318990707397, + "learning_rate": 9.495560985170546e-06, + "loss": 0.7789, + "step": 5326 + }, + { + "epoch": 0.29319170014860474, + "grad_norm": 0.8267771601676941, + "learning_rate": 9.495371232032602e-06, + "loss": 0.7447, + "step": 5327 + }, + { + "epoch": 0.29324673895096043, + "grad_norm": 0.8120046257972717, + "learning_rate": 9.49518144510865e-06, + "loss": 0.7803, + "step": 5328 + }, + { + "epoch": 0.29330177775331606, + "grad_norm": 0.7314801812171936, + "learning_rate": 9.494991624400119e-06, + "loss": 0.6758, + "step": 5329 + }, + { + "epoch": 0.29335681655567175, + "grad_norm": 0.6989930272102356, + "learning_rate": 9.494801769908433e-06, + "loss": 0.7945, + "step": 5330 + }, + { + "epoch": 0.2934118553580274, + "grad_norm": 0.7804785966873169, + "learning_rate": 9.494611881635021e-06, + "loss": 0.7977, + "step": 5331 + }, + { + "epoch": 0.2934668941603831, + "grad_norm": 0.8377045392990112, + "learning_rate": 9.494421959581308e-06, + "loss": 0.8077, + "step": 5332 + }, + { + "epoch": 0.2935219329627387, + "grad_norm": 0.7463418245315552, + "learning_rate": 9.494232003748724e-06, + "loss": 0.783, + "step": 5333 + }, + { + "epoch": 0.2935769717650944, + "grad_norm": 0.7598912715911865, + "learning_rate": 9.494042014138695e-06, + "loss": 0.7869, + "step": 5334 + }, + { + "epoch": 0.29363201056745003, + "grad_norm": 0.7634113430976868, + "learning_rate": 9.493851990752648e-06, + "loss": 0.8108, + "step": 5335 + }, + { + "epoch": 0.2936870493698057, + "grad_norm": 0.8056474328041077, + "learning_rate": 9.493661933592013e-06, + "loss": 0.7921, + "step": 5336 + }, + { + "epoch": 0.29374208817216135, + "grad_norm": 0.8699371218681335, + "learning_rate": 9.493471842658219e-06, + "loss": 0.8833, + "step": 5337 + }, + { + "epoch": 0.29379712697451704, + "grad_norm": 0.8803261518478394, + "learning_rate": 9.493281717952691e-06, + "loss": 0.7848, + "step": 5338 + }, + { + "epoch": 0.2938521657768727, + "grad_norm": 0.7678453922271729, + "learning_rate": 9.493091559476864e-06, + "loss": 0.836, + "step": 5339 + }, + { + "epoch": 0.29390720457922836, + "grad_norm": 0.7653701305389404, + "learning_rate": 9.49290136723216e-06, + "loss": 0.8215, + "step": 5340 + }, + { + "epoch": 0.293962243381584, + "grad_norm": 0.768120527267456, + "learning_rate": 9.492711141220013e-06, + "loss": 0.7498, + "step": 5341 + }, + { + "epoch": 0.2940172821839397, + "grad_norm": 0.7665749788284302, + "learning_rate": 9.492520881441854e-06, + "loss": 0.7883, + "step": 5342 + }, + { + "epoch": 0.2940723209862953, + "grad_norm": 0.7405015230178833, + "learning_rate": 9.492330587899108e-06, + "loss": 0.8112, + "step": 5343 + }, + { + "epoch": 0.294127359788651, + "grad_norm": 0.7183459997177124, + "learning_rate": 9.492140260593208e-06, + "loss": 0.8227, + "step": 5344 + }, + { + "epoch": 0.29418239859100664, + "grad_norm": 0.7453572154045105, + "learning_rate": 9.491949899525585e-06, + "loss": 0.8148, + "step": 5345 + }, + { + "epoch": 0.29423743739336233, + "grad_norm": 0.8963750600814819, + "learning_rate": 9.491759504697669e-06, + "loss": 0.9261, + "step": 5346 + }, + { + "epoch": 0.29429247619571797, + "grad_norm": 0.7631667256355286, + "learning_rate": 9.49156907611089e-06, + "loss": 0.7708, + "step": 5347 + }, + { + "epoch": 0.29434751499807366, + "grad_norm": 0.6324381232261658, + "learning_rate": 9.49137861376668e-06, + "loss": 0.6688, + "step": 5348 + }, + { + "epoch": 0.2944025538004293, + "grad_norm": 0.6969807147979736, + "learning_rate": 9.491188117666472e-06, + "loss": 0.7516, + "step": 5349 + }, + { + "epoch": 0.294457592602785, + "grad_norm": 1.633340835571289, + "learning_rate": 9.490997587811697e-06, + "loss": 0.8111, + "step": 5350 + }, + { + "epoch": 0.2945126314051406, + "grad_norm": 0.7084371447563171, + "learning_rate": 9.490807024203785e-06, + "loss": 0.8375, + "step": 5351 + }, + { + "epoch": 0.2945676702074963, + "grad_norm": 0.7335958480834961, + "learning_rate": 9.490616426844169e-06, + "loss": 0.7884, + "step": 5352 + }, + { + "epoch": 0.29462270900985194, + "grad_norm": 0.7560276985168457, + "learning_rate": 9.490425795734282e-06, + "loss": 0.8918, + "step": 5353 + }, + { + "epoch": 0.2946777478122076, + "grad_norm": 0.9185894727706909, + "learning_rate": 9.490235130875557e-06, + "loss": 0.7976, + "step": 5354 + }, + { + "epoch": 0.29473278661456326, + "grad_norm": 0.7871553897857666, + "learning_rate": 9.490044432269427e-06, + "loss": 0.8564, + "step": 5355 + }, + { + "epoch": 0.29478782541691895, + "grad_norm": 0.8736812472343445, + "learning_rate": 9.489853699917326e-06, + "loss": 0.8114, + "step": 5356 + }, + { + "epoch": 0.2948428642192746, + "grad_norm": 0.8068968653678894, + "learning_rate": 9.489662933820684e-06, + "loss": 0.9198, + "step": 5357 + }, + { + "epoch": 0.29489790302163027, + "grad_norm": 0.7816325426101685, + "learning_rate": 9.489472133980939e-06, + "loss": 0.8012, + "step": 5358 + }, + { + "epoch": 0.2949529418239859, + "grad_norm": 0.7248200178146362, + "learning_rate": 9.489281300399522e-06, + "loss": 0.8099, + "step": 5359 + }, + { + "epoch": 0.2950079806263416, + "grad_norm": 0.7887724041938782, + "learning_rate": 9.48909043307787e-06, + "loss": 0.884, + "step": 5360 + }, + { + "epoch": 0.2950630194286972, + "grad_norm": 0.765163004398346, + "learning_rate": 9.488899532017415e-06, + "loss": 0.8563, + "step": 5361 + }, + { + "epoch": 0.2951180582310529, + "grad_norm": 0.7658557295799255, + "learning_rate": 9.488708597219592e-06, + "loss": 0.8897, + "step": 5362 + }, + { + "epoch": 0.29517309703340855, + "grad_norm": 0.6653227806091309, + "learning_rate": 9.488517628685838e-06, + "loss": 0.7107, + "step": 5363 + }, + { + "epoch": 0.29522813583576424, + "grad_norm": 0.787739098072052, + "learning_rate": 9.488326626417586e-06, + "loss": 0.8181, + "step": 5364 + }, + { + "epoch": 0.29528317463811987, + "grad_norm": 0.7822532057762146, + "learning_rate": 9.488135590416275e-06, + "loss": 0.8238, + "step": 5365 + }, + { + "epoch": 0.29533821344047556, + "grad_norm": 0.7797419428825378, + "learning_rate": 9.487944520683334e-06, + "loss": 0.8484, + "step": 5366 + }, + { + "epoch": 0.2953932522428312, + "grad_norm": 0.7230222225189209, + "learning_rate": 9.487753417220207e-06, + "loss": 0.8193, + "step": 5367 + }, + { + "epoch": 0.2954482910451869, + "grad_norm": 0.8256810307502747, + "learning_rate": 9.487562280028325e-06, + "loss": 0.7691, + "step": 5368 + }, + { + "epoch": 0.2955033298475425, + "grad_norm": 0.7704648375511169, + "learning_rate": 9.487371109109127e-06, + "loss": 0.8235, + "step": 5369 + }, + { + "epoch": 0.29555836864989815, + "grad_norm": 0.7580391764640808, + "learning_rate": 9.487179904464048e-06, + "loss": 0.7911, + "step": 5370 + }, + { + "epoch": 0.29561340745225384, + "grad_norm": 0.7211806774139404, + "learning_rate": 9.486988666094526e-06, + "loss": 0.7188, + "step": 5371 + }, + { + "epoch": 0.2956684462546095, + "grad_norm": 0.8375828862190247, + "learning_rate": 9.486797394001999e-06, + "loss": 0.881, + "step": 5372 + }, + { + "epoch": 0.29572348505696516, + "grad_norm": 0.8500093221664429, + "learning_rate": 9.486606088187903e-06, + "loss": 0.8632, + "step": 5373 + }, + { + "epoch": 0.2957785238593208, + "grad_norm": 0.7754727005958557, + "learning_rate": 9.486414748653677e-06, + "loss": 0.8124, + "step": 5374 + }, + { + "epoch": 0.2958335626616765, + "grad_norm": 0.9395208954811096, + "learning_rate": 9.486223375400759e-06, + "loss": 0.8046, + "step": 5375 + }, + { + "epoch": 0.2958886014640321, + "grad_norm": 0.7587517499923706, + "learning_rate": 9.486031968430587e-06, + "loss": 0.7852, + "step": 5376 + }, + { + "epoch": 0.2959436402663878, + "grad_norm": 0.6921781301498413, + "learning_rate": 9.485840527744599e-06, + "loss": 0.7392, + "step": 5377 + }, + { + "epoch": 0.29599867906874344, + "grad_norm": 0.8768522143363953, + "learning_rate": 9.485649053344233e-06, + "loss": 0.7819, + "step": 5378 + }, + { + "epoch": 0.29605371787109913, + "grad_norm": 0.7565680146217346, + "learning_rate": 9.485457545230932e-06, + "loss": 0.7489, + "step": 5379 + }, + { + "epoch": 0.29610875667345476, + "grad_norm": 0.7760992050170898, + "learning_rate": 9.485266003406132e-06, + "loss": 0.8129, + "step": 5380 + }, + { + "epoch": 0.29616379547581045, + "grad_norm": 0.7726097106933594, + "learning_rate": 9.485074427871272e-06, + "loss": 0.725, + "step": 5381 + }, + { + "epoch": 0.2962188342781661, + "grad_norm": 0.6885473728179932, + "learning_rate": 9.484882818627796e-06, + "loss": 0.685, + "step": 5382 + }, + { + "epoch": 0.2962738730805218, + "grad_norm": 0.776509702205658, + "learning_rate": 9.484691175677138e-06, + "loss": 0.8077, + "step": 5383 + }, + { + "epoch": 0.2963289118828774, + "grad_norm": 0.7436297535896301, + "learning_rate": 9.484499499020744e-06, + "loss": 0.8161, + "step": 5384 + }, + { + "epoch": 0.2963839506852331, + "grad_norm": 0.7604314088821411, + "learning_rate": 9.484307788660052e-06, + "loss": 0.825, + "step": 5385 + }, + { + "epoch": 0.29643898948758873, + "grad_norm": 0.7230789065361023, + "learning_rate": 9.484116044596501e-06, + "loss": 0.8005, + "step": 5386 + }, + { + "epoch": 0.2964940282899444, + "grad_norm": 0.820442259311676, + "learning_rate": 9.483924266831536e-06, + "loss": 0.789, + "step": 5387 + }, + { + "epoch": 0.29654906709230006, + "grad_norm": 0.7514582276344299, + "learning_rate": 9.483732455366596e-06, + "loss": 0.8531, + "step": 5388 + }, + { + "epoch": 0.29660410589465575, + "grad_norm": 0.6671503782272339, + "learning_rate": 9.483540610203124e-06, + "loss": 0.7627, + "step": 5389 + }, + { + "epoch": 0.2966591446970114, + "grad_norm": 0.6955942511558533, + "learning_rate": 9.483348731342559e-06, + "loss": 0.726, + "step": 5390 + }, + { + "epoch": 0.29671418349936707, + "grad_norm": 0.769781768321991, + "learning_rate": 9.483156818786347e-06, + "loss": 0.8064, + "step": 5391 + }, + { + "epoch": 0.2967692223017227, + "grad_norm": 1.0764707326889038, + "learning_rate": 9.482964872535927e-06, + "loss": 0.8249, + "step": 5392 + }, + { + "epoch": 0.2968242611040784, + "grad_norm": 1.0508921146392822, + "learning_rate": 9.482772892592744e-06, + "loss": 0.706, + "step": 5393 + }, + { + "epoch": 0.296879299906434, + "grad_norm": 0.6442564129829407, + "learning_rate": 9.482580878958239e-06, + "loss": 0.6025, + "step": 5394 + }, + { + "epoch": 0.2969343387087897, + "grad_norm": 0.7622735500335693, + "learning_rate": 9.482388831633856e-06, + "loss": 0.7639, + "step": 5395 + }, + { + "epoch": 0.29698937751114535, + "grad_norm": 0.8179057240486145, + "learning_rate": 9.482196750621038e-06, + "loss": 0.7641, + "step": 5396 + }, + { + "epoch": 0.29704441631350104, + "grad_norm": 0.7955192923545837, + "learning_rate": 9.48200463592123e-06, + "loss": 0.8407, + "step": 5397 + }, + { + "epoch": 0.29709945511585667, + "grad_norm": 0.7909773588180542, + "learning_rate": 9.481812487535875e-06, + "loss": 0.7833, + "step": 5398 + }, + { + "epoch": 0.29715449391821236, + "grad_norm": 0.8409042954444885, + "learning_rate": 9.481620305466417e-06, + "loss": 0.7788, + "step": 5399 + }, + { + "epoch": 0.297209532720568, + "grad_norm": 0.7521414160728455, + "learning_rate": 9.4814280897143e-06, + "loss": 0.7192, + "step": 5400 + }, + { + "epoch": 0.2972645715229237, + "grad_norm": 0.7016280889511108, + "learning_rate": 9.481235840280969e-06, + "loss": 0.7181, + "step": 5401 + }, + { + "epoch": 0.2973196103252793, + "grad_norm": 0.7257362604141235, + "learning_rate": 9.48104355716787e-06, + "loss": 0.7845, + "step": 5402 + }, + { + "epoch": 0.297374649127635, + "grad_norm": 0.8048765659332275, + "learning_rate": 9.480851240376445e-06, + "loss": 0.7921, + "step": 5403 + }, + { + "epoch": 0.29742968792999064, + "grad_norm": 0.8715546131134033, + "learning_rate": 9.480658889908143e-06, + "loss": 0.856, + "step": 5404 + }, + { + "epoch": 0.2974847267323463, + "grad_norm": 0.7211160063743591, + "learning_rate": 9.480466505764408e-06, + "loss": 0.7687, + "step": 5405 + }, + { + "epoch": 0.29753976553470196, + "grad_norm": 0.8749645352363586, + "learning_rate": 9.480274087946686e-06, + "loss": 0.8419, + "step": 5406 + }, + { + "epoch": 0.29759480433705765, + "grad_norm": 0.7986398935317993, + "learning_rate": 9.480081636456424e-06, + "loss": 0.8309, + "step": 5407 + }, + { + "epoch": 0.2976498431394133, + "grad_norm": 0.8435508012771606, + "learning_rate": 9.479889151295067e-06, + "loss": 0.7457, + "step": 5408 + }, + { + "epoch": 0.297704881941769, + "grad_norm": 0.8725010752677917, + "learning_rate": 9.479696632464063e-06, + "loss": 0.8069, + "step": 5409 + }, + { + "epoch": 0.2977599207441246, + "grad_norm": 0.7364320158958435, + "learning_rate": 9.479504079964856e-06, + "loss": 0.8316, + "step": 5410 + }, + { + "epoch": 0.2978149595464803, + "grad_norm": 0.7967824935913086, + "learning_rate": 9.479311493798898e-06, + "loss": 0.7689, + "step": 5411 + }, + { + "epoch": 0.29786999834883593, + "grad_norm": 0.8415414094924927, + "learning_rate": 9.479118873967632e-06, + "loss": 0.8288, + "step": 5412 + }, + { + "epoch": 0.29792503715119156, + "grad_norm": 0.9723265767097473, + "learning_rate": 9.478926220472508e-06, + "loss": 0.7422, + "step": 5413 + }, + { + "epoch": 0.29798007595354725, + "grad_norm": 0.7203155159950256, + "learning_rate": 9.478733533314974e-06, + "loss": 0.707, + "step": 5414 + }, + { + "epoch": 0.2980351147559029, + "grad_norm": 0.7643926739692688, + "learning_rate": 9.478540812496478e-06, + "loss": 0.7793, + "step": 5415 + }, + { + "epoch": 0.2980901535582586, + "grad_norm": 0.9177087545394897, + "learning_rate": 9.478348058018467e-06, + "loss": 0.865, + "step": 5416 + }, + { + "epoch": 0.2981451923606142, + "grad_norm": 0.678931713104248, + "learning_rate": 9.478155269882392e-06, + "loss": 0.7716, + "step": 5417 + }, + { + "epoch": 0.2982002311629699, + "grad_norm": 0.8440513610839844, + "learning_rate": 9.4779624480897e-06, + "loss": 0.8904, + "step": 5418 + }, + { + "epoch": 0.29825526996532553, + "grad_norm": 0.8508756756782532, + "learning_rate": 9.47776959264184e-06, + "loss": 0.7994, + "step": 5419 + }, + { + "epoch": 0.2983103087676812, + "grad_norm": 0.8736951947212219, + "learning_rate": 9.477576703540265e-06, + "loss": 0.8374, + "step": 5420 + }, + { + "epoch": 0.29836534757003685, + "grad_norm": 0.8063240051269531, + "learning_rate": 9.47738378078642e-06, + "loss": 0.7217, + "step": 5421 + }, + { + "epoch": 0.29842038637239254, + "grad_norm": 1.1495088338851929, + "learning_rate": 9.477190824381757e-06, + "loss": 0.8902, + "step": 5422 + }, + { + "epoch": 0.2984754251747482, + "grad_norm": 1.0241554975509644, + "learning_rate": 9.476997834327725e-06, + "loss": 0.9354, + "step": 5423 + }, + { + "epoch": 0.29853046397710387, + "grad_norm": 0.939950168132782, + "learning_rate": 9.476804810625779e-06, + "loss": 0.8714, + "step": 5424 + }, + { + "epoch": 0.2985855027794595, + "grad_norm": 0.7592660188674927, + "learning_rate": 9.476611753277364e-06, + "loss": 0.7513, + "step": 5425 + }, + { + "epoch": 0.2986405415818152, + "grad_norm": 0.776153028011322, + "learning_rate": 9.476418662283935e-06, + "loss": 0.7828, + "step": 5426 + }, + { + "epoch": 0.2986955803841708, + "grad_norm": 0.9317814707756042, + "learning_rate": 9.47622553764694e-06, + "loss": 0.865, + "step": 5427 + }, + { + "epoch": 0.2987506191865265, + "grad_norm": 0.7770501971244812, + "learning_rate": 9.476032379367832e-06, + "loss": 0.7281, + "step": 5428 + }, + { + "epoch": 0.29880565798888215, + "grad_norm": 0.7815201282501221, + "learning_rate": 9.475839187448064e-06, + "loss": 0.7565, + "step": 5429 + }, + { + "epoch": 0.29886069679123783, + "grad_norm": 0.7992607951164246, + "learning_rate": 9.475645961889086e-06, + "loss": 0.8109, + "step": 5430 + }, + { + "epoch": 0.29891573559359347, + "grad_norm": 0.7780614495277405, + "learning_rate": 9.475452702692351e-06, + "loss": 0.7814, + "step": 5431 + }, + { + "epoch": 0.29897077439594916, + "grad_norm": 0.7409062385559082, + "learning_rate": 9.475259409859313e-06, + "loss": 0.7712, + "step": 5432 + }, + { + "epoch": 0.2990258131983048, + "grad_norm": 0.7935584187507629, + "learning_rate": 9.47506608339142e-06, + "loss": 0.8301, + "step": 5433 + }, + { + "epoch": 0.2990808520006605, + "grad_norm": 0.6931030750274658, + "learning_rate": 9.474872723290132e-06, + "loss": 0.7471, + "step": 5434 + }, + { + "epoch": 0.2991358908030161, + "grad_norm": 0.7622918486595154, + "learning_rate": 9.474679329556894e-06, + "loss": 0.7727, + "step": 5435 + }, + { + "epoch": 0.2991909296053718, + "grad_norm": 0.7957701086997986, + "learning_rate": 9.474485902193169e-06, + "loss": 0.7663, + "step": 5436 + }, + { + "epoch": 0.29924596840772744, + "grad_norm": 1.0600612163543701, + "learning_rate": 9.474292441200404e-06, + "loss": 0.7861, + "step": 5437 + }, + { + "epoch": 0.2993010072100831, + "grad_norm": 0.7343600392341614, + "learning_rate": 9.474098946580053e-06, + "loss": 0.8609, + "step": 5438 + }, + { + "epoch": 0.29935604601243876, + "grad_norm": 0.7477726340293884, + "learning_rate": 9.473905418333573e-06, + "loss": 0.7683, + "step": 5439 + }, + { + "epoch": 0.29941108481479445, + "grad_norm": 0.7955546379089355, + "learning_rate": 9.473711856462417e-06, + "loss": 0.8406, + "step": 5440 + }, + { + "epoch": 0.2994661236171501, + "grad_norm": 0.8291183114051819, + "learning_rate": 9.47351826096804e-06, + "loss": 0.6919, + "step": 5441 + }, + { + "epoch": 0.29952116241950577, + "grad_norm": 0.8899849057197571, + "learning_rate": 9.473324631851898e-06, + "loss": 0.9403, + "step": 5442 + }, + { + "epoch": 0.2995762012218614, + "grad_norm": 0.837066650390625, + "learning_rate": 9.473130969115445e-06, + "loss": 0.8676, + "step": 5443 + }, + { + "epoch": 0.2996312400242171, + "grad_norm": 0.8385708928108215, + "learning_rate": 9.472937272760138e-06, + "loss": 0.7588, + "step": 5444 + }, + { + "epoch": 0.2996862788265727, + "grad_norm": 0.6990595459938049, + "learning_rate": 9.472743542787431e-06, + "loss": 0.6769, + "step": 5445 + }, + { + "epoch": 0.2997413176289284, + "grad_norm": 0.789165735244751, + "learning_rate": 9.472549779198781e-06, + "loss": 0.8084, + "step": 5446 + }, + { + "epoch": 0.29979635643128405, + "grad_norm": 0.8820298314094543, + "learning_rate": 9.472355981995643e-06, + "loss": 0.8262, + "step": 5447 + }, + { + "epoch": 0.29985139523363974, + "grad_norm": 0.8928382992744446, + "learning_rate": 9.472162151179475e-06, + "loss": 0.8123, + "step": 5448 + }, + { + "epoch": 0.2999064340359954, + "grad_norm": 0.7688086032867432, + "learning_rate": 9.471968286751735e-06, + "loss": 0.6846, + "step": 5449 + }, + { + "epoch": 0.29996147283835106, + "grad_norm": 0.6962918043136597, + "learning_rate": 9.471774388713877e-06, + "loss": 0.7872, + "step": 5450 + }, + { + "epoch": 0.3000165116407067, + "grad_norm": 0.7467569708824158, + "learning_rate": 9.47158045706736e-06, + "loss": 0.8201, + "step": 5451 + }, + { + "epoch": 0.3000715504430624, + "grad_norm": 0.7651814222335815, + "learning_rate": 9.471386491813642e-06, + "loss": 0.7734, + "step": 5452 + }, + { + "epoch": 0.300126589245418, + "grad_norm": 0.8001144528388977, + "learning_rate": 9.47119249295418e-06, + "loss": 0.8266, + "step": 5453 + }, + { + "epoch": 0.3001816280477737, + "grad_norm": 0.7937704920768738, + "learning_rate": 9.47099846049043e-06, + "loss": 0.8025, + "step": 5454 + }, + { + "epoch": 0.30023666685012934, + "grad_norm": 0.7353448867797852, + "learning_rate": 9.470804394423853e-06, + "loss": 0.7926, + "step": 5455 + }, + { + "epoch": 0.300291705652485, + "grad_norm": 0.9116304516792297, + "learning_rate": 9.470610294755908e-06, + "loss": 0.8295, + "step": 5456 + }, + { + "epoch": 0.30034674445484066, + "grad_norm": 0.7169163823127747, + "learning_rate": 9.470416161488053e-06, + "loss": 0.822, + "step": 5457 + }, + { + "epoch": 0.3004017832571963, + "grad_norm": 1.0421968698501587, + "learning_rate": 9.470221994621747e-06, + "loss": 0.9273, + "step": 5458 + }, + { + "epoch": 0.300456822059552, + "grad_norm": 0.9064405560493469, + "learning_rate": 9.470027794158447e-06, + "loss": 0.7087, + "step": 5459 + }, + { + "epoch": 0.3005118608619076, + "grad_norm": 0.6766010522842407, + "learning_rate": 9.469833560099617e-06, + "loss": 0.7063, + "step": 5460 + }, + { + "epoch": 0.3005668996642633, + "grad_norm": 0.7987816333770752, + "learning_rate": 9.469639292446712e-06, + "loss": 0.8216, + "step": 5461 + }, + { + "epoch": 0.30062193846661894, + "grad_norm": 0.776792049407959, + "learning_rate": 9.469444991201197e-06, + "loss": 0.8598, + "step": 5462 + }, + { + "epoch": 0.30067697726897463, + "grad_norm": 0.8048756718635559, + "learning_rate": 9.469250656364529e-06, + "loss": 0.8645, + "step": 5463 + }, + { + "epoch": 0.30073201607133027, + "grad_norm": 1.0650218725204468, + "learning_rate": 9.46905628793817e-06, + "loss": 0.8918, + "step": 5464 + }, + { + "epoch": 0.30078705487368596, + "grad_norm": 0.7378712296485901, + "learning_rate": 9.468861885923577e-06, + "loss": 0.6866, + "step": 5465 + }, + { + "epoch": 0.3008420936760416, + "grad_norm": 0.7382808327674866, + "learning_rate": 9.468667450322218e-06, + "loss": 0.8413, + "step": 5466 + }, + { + "epoch": 0.3008971324783973, + "grad_norm": 0.8390250205993652, + "learning_rate": 9.468472981135548e-06, + "loss": 0.8275, + "step": 5467 + }, + { + "epoch": 0.3009521712807529, + "grad_norm": 0.9169766902923584, + "learning_rate": 9.468278478365034e-06, + "loss": 0.8274, + "step": 5468 + }, + { + "epoch": 0.3010072100831086, + "grad_norm": 0.7487995028495789, + "learning_rate": 9.468083942012134e-06, + "loss": 0.7729, + "step": 5469 + }, + { + "epoch": 0.30106224888546423, + "grad_norm": 0.7457556128501892, + "learning_rate": 9.467889372078309e-06, + "loss": 0.7435, + "step": 5470 + }, + { + "epoch": 0.3011172876878199, + "grad_norm": 0.7085639834403992, + "learning_rate": 9.467694768565026e-06, + "loss": 0.7686, + "step": 5471 + }, + { + "epoch": 0.30117232649017556, + "grad_norm": 0.7396196722984314, + "learning_rate": 9.467500131473744e-06, + "loss": 0.7496, + "step": 5472 + }, + { + "epoch": 0.30122736529253125, + "grad_norm": 0.7906790971755981, + "learning_rate": 9.467305460805927e-06, + "loss": 0.8341, + "step": 5473 + }, + { + "epoch": 0.3012824040948869, + "grad_norm": 0.673541247844696, + "learning_rate": 9.467110756563039e-06, + "loss": 0.8041, + "step": 5474 + }, + { + "epoch": 0.30133744289724257, + "grad_norm": 0.8247049450874329, + "learning_rate": 9.46691601874654e-06, + "loss": 0.8227, + "step": 5475 + }, + { + "epoch": 0.3013924816995982, + "grad_norm": 0.7564057111740112, + "learning_rate": 9.466721247357898e-06, + "loss": 0.8181, + "step": 5476 + }, + { + "epoch": 0.3014475205019539, + "grad_norm": 0.7533192038536072, + "learning_rate": 9.466526442398574e-06, + "loss": 0.782, + "step": 5477 + }, + { + "epoch": 0.3015025593043095, + "grad_norm": 0.6934120059013367, + "learning_rate": 9.466331603870033e-06, + "loss": 0.7153, + "step": 5478 + }, + { + "epoch": 0.3015575981066652, + "grad_norm": 0.7417232990264893, + "learning_rate": 9.466136731773738e-06, + "loss": 0.753, + "step": 5479 + }, + { + "epoch": 0.30161263690902085, + "grad_norm": 0.7421486973762512, + "learning_rate": 9.465941826111156e-06, + "loss": 0.7668, + "step": 5480 + }, + { + "epoch": 0.30166767571137654, + "grad_norm": 1.0851647853851318, + "learning_rate": 9.465746886883751e-06, + "loss": 0.8019, + "step": 5481 + }, + { + "epoch": 0.30172271451373217, + "grad_norm": 0.9209244847297668, + "learning_rate": 9.465551914092987e-06, + "loss": 0.7912, + "step": 5482 + }, + { + "epoch": 0.30177775331608786, + "grad_norm": 0.6915135383605957, + "learning_rate": 9.465356907740331e-06, + "loss": 0.8112, + "step": 5483 + }, + { + "epoch": 0.3018327921184435, + "grad_norm": 0.824593722820282, + "learning_rate": 9.465161867827247e-06, + "loss": 0.7969, + "step": 5484 + }, + { + "epoch": 0.3018878309207992, + "grad_norm": 0.7985100746154785, + "learning_rate": 9.464966794355201e-06, + "loss": 0.8258, + "step": 5485 + }, + { + "epoch": 0.3019428697231548, + "grad_norm": 0.8471764326095581, + "learning_rate": 9.464771687325663e-06, + "loss": 0.8241, + "step": 5486 + }, + { + "epoch": 0.3019979085255105, + "grad_norm": 0.8133455514907837, + "learning_rate": 9.464576546740093e-06, + "loss": 0.7809, + "step": 5487 + }, + { + "epoch": 0.30205294732786614, + "grad_norm": 0.7684013843536377, + "learning_rate": 9.464381372599961e-06, + "loss": 0.9023, + "step": 5488 + }, + { + "epoch": 0.30210798613022183, + "grad_norm": 0.7818747758865356, + "learning_rate": 9.464186164906735e-06, + "loss": 0.7152, + "step": 5489 + }, + { + "epoch": 0.30216302493257746, + "grad_norm": 0.7524297833442688, + "learning_rate": 9.46399092366188e-06, + "loss": 0.782, + "step": 5490 + }, + { + "epoch": 0.30221806373493315, + "grad_norm": 0.6550590991973877, + "learning_rate": 9.463795648866864e-06, + "loss": 0.7696, + "step": 5491 + }, + { + "epoch": 0.3022731025372888, + "grad_norm": 0.8679335117340088, + "learning_rate": 9.463600340523154e-06, + "loss": 0.8115, + "step": 5492 + }, + { + "epoch": 0.3023281413396445, + "grad_norm": 0.692500114440918, + "learning_rate": 9.46340499863222e-06, + "loss": 0.7692, + "step": 5493 + }, + { + "epoch": 0.3023831801420001, + "grad_norm": 0.8604017496109009, + "learning_rate": 9.463209623195528e-06, + "loss": 0.8547, + "step": 5494 + }, + { + "epoch": 0.3024382189443558, + "grad_norm": 0.6715821623802185, + "learning_rate": 9.463014214214548e-06, + "loss": 0.7638, + "step": 5495 + }, + { + "epoch": 0.30249325774671143, + "grad_norm": 0.7803179025650024, + "learning_rate": 9.462818771690747e-06, + "loss": 0.7795, + "step": 5496 + }, + { + "epoch": 0.3025482965490671, + "grad_norm": 0.787323534488678, + "learning_rate": 9.462623295625596e-06, + "loss": 0.735, + "step": 5497 + }, + { + "epoch": 0.30260333535142275, + "grad_norm": 0.9943159222602844, + "learning_rate": 9.462427786020563e-06, + "loss": 0.7451, + "step": 5498 + }, + { + "epoch": 0.3026583741537784, + "grad_norm": 0.772524893283844, + "learning_rate": 9.462232242877116e-06, + "loss": 0.9167, + "step": 5499 + }, + { + "epoch": 0.3027134129561341, + "grad_norm": 0.7204643487930298, + "learning_rate": 9.462036666196726e-06, + "loss": 0.7442, + "step": 5500 + }, + { + "epoch": 0.3027684517584897, + "grad_norm": 0.7450547218322754, + "learning_rate": 9.461841055980863e-06, + "loss": 0.8002, + "step": 5501 + }, + { + "epoch": 0.3028234905608454, + "grad_norm": 0.8096264004707336, + "learning_rate": 9.461645412230997e-06, + "loss": 0.8601, + "step": 5502 + }, + { + "epoch": 0.30287852936320103, + "grad_norm": 0.684968888759613, + "learning_rate": 9.461449734948597e-06, + "loss": 0.7251, + "step": 5503 + }, + { + "epoch": 0.3029335681655567, + "grad_norm": 0.7727203369140625, + "learning_rate": 9.461254024135138e-06, + "loss": 0.7797, + "step": 5504 + }, + { + "epoch": 0.30298860696791236, + "grad_norm": 0.9292891025543213, + "learning_rate": 9.461058279792086e-06, + "loss": 0.7519, + "step": 5505 + }, + { + "epoch": 0.30304364577026804, + "grad_norm": 0.7836466431617737, + "learning_rate": 9.460862501920915e-06, + "loss": 0.8201, + "step": 5506 + }, + { + "epoch": 0.3030986845726237, + "grad_norm": 0.9043576121330261, + "learning_rate": 9.460666690523094e-06, + "loss": 0.79, + "step": 5507 + }, + { + "epoch": 0.30315372337497937, + "grad_norm": 0.8339952230453491, + "learning_rate": 9.460470845600098e-06, + "loss": 0.8392, + "step": 5508 + }, + { + "epoch": 0.303208762177335, + "grad_norm": 0.7603133320808411, + "learning_rate": 9.460274967153395e-06, + "loss": 0.7168, + "step": 5509 + }, + { + "epoch": 0.3032638009796907, + "grad_norm": 0.7287996411323547, + "learning_rate": 9.460079055184461e-06, + "loss": 0.7452, + "step": 5510 + }, + { + "epoch": 0.3033188397820463, + "grad_norm": 0.707953691482544, + "learning_rate": 9.459883109694767e-06, + "loss": 0.8081, + "step": 5511 + }, + { + "epoch": 0.303373878584402, + "grad_norm": 0.7556451559066772, + "learning_rate": 9.459687130685784e-06, + "loss": 0.8145, + "step": 5512 + }, + { + "epoch": 0.30342891738675765, + "grad_norm": 0.8076426386833191, + "learning_rate": 9.459491118158987e-06, + "loss": 0.8006, + "step": 5513 + }, + { + "epoch": 0.30348395618911334, + "grad_norm": 0.7343682646751404, + "learning_rate": 9.459295072115849e-06, + "loss": 0.7574, + "step": 5514 + }, + { + "epoch": 0.30353899499146897, + "grad_norm": 0.68440181016922, + "learning_rate": 9.459098992557843e-06, + "loss": 0.7432, + "step": 5515 + }, + { + "epoch": 0.30359403379382466, + "grad_norm": 0.8278071880340576, + "learning_rate": 9.458902879486441e-06, + "loss": 0.8357, + "step": 5516 + }, + { + "epoch": 0.3036490725961803, + "grad_norm": 0.8377245664596558, + "learning_rate": 9.458706732903121e-06, + "loss": 0.7552, + "step": 5517 + }, + { + "epoch": 0.303704111398536, + "grad_norm": 0.7354543805122375, + "learning_rate": 9.458510552809353e-06, + "loss": 0.7862, + "step": 5518 + }, + { + "epoch": 0.3037591502008916, + "grad_norm": 0.8071799874305725, + "learning_rate": 9.458314339206611e-06, + "loss": 0.8428, + "step": 5519 + }, + { + "epoch": 0.3038141890032473, + "grad_norm": 0.7452389597892761, + "learning_rate": 9.458118092096376e-06, + "loss": 0.8252, + "step": 5520 + }, + { + "epoch": 0.30386922780560294, + "grad_norm": 0.7370620965957642, + "learning_rate": 9.457921811480115e-06, + "loss": 0.8143, + "step": 5521 + }, + { + "epoch": 0.3039242666079586, + "grad_norm": 0.8816156387329102, + "learning_rate": 9.45772549735931e-06, + "loss": 0.7163, + "step": 5522 + }, + { + "epoch": 0.30397930541031426, + "grad_norm": 0.7208901643753052, + "learning_rate": 9.457529149735432e-06, + "loss": 0.7877, + "step": 5523 + }, + { + "epoch": 0.30403434421266995, + "grad_norm": 0.820792019367218, + "learning_rate": 9.457332768609959e-06, + "loss": 0.8275, + "step": 5524 + }, + { + "epoch": 0.3040893830150256, + "grad_norm": 0.8471686244010925, + "learning_rate": 9.457136353984365e-06, + "loss": 0.8127, + "step": 5525 + }, + { + "epoch": 0.30414442181738127, + "grad_norm": 0.9448342323303223, + "learning_rate": 9.456939905860127e-06, + "loss": 0.8157, + "step": 5526 + }, + { + "epoch": 0.3041994606197369, + "grad_norm": 0.7835188508033752, + "learning_rate": 9.456743424238723e-06, + "loss": 0.7116, + "step": 5527 + }, + { + "epoch": 0.3042544994220926, + "grad_norm": 0.8884950876235962, + "learning_rate": 9.456546909121629e-06, + "loss": 0.8514, + "step": 5528 + }, + { + "epoch": 0.30430953822444823, + "grad_norm": 0.7400928735733032, + "learning_rate": 9.45635036051032e-06, + "loss": 0.8207, + "step": 5529 + }, + { + "epoch": 0.3043645770268039, + "grad_norm": 0.8278732299804688, + "learning_rate": 9.456153778406274e-06, + "loss": 0.8269, + "step": 5530 + }, + { + "epoch": 0.30441961582915955, + "grad_norm": 0.7423332929611206, + "learning_rate": 9.45595716281097e-06, + "loss": 0.7937, + "step": 5531 + }, + { + "epoch": 0.30447465463151524, + "grad_norm": 1.5018088817596436, + "learning_rate": 9.455760513725885e-06, + "loss": 0.7935, + "step": 5532 + }, + { + "epoch": 0.3045296934338709, + "grad_norm": 0.8105388283729553, + "learning_rate": 9.455563831152496e-06, + "loss": 0.8225, + "step": 5533 + }, + { + "epoch": 0.30458473223622656, + "grad_norm": 0.6874535083770752, + "learning_rate": 9.455367115092283e-06, + "loss": 0.7301, + "step": 5534 + }, + { + "epoch": 0.3046397710385822, + "grad_norm": 0.8085837960243225, + "learning_rate": 9.455170365546721e-06, + "loss": 0.83, + "step": 5535 + }, + { + "epoch": 0.3046948098409379, + "grad_norm": 0.810773491859436, + "learning_rate": 9.454973582517293e-06, + "loss": 0.7186, + "step": 5536 + }, + { + "epoch": 0.3047498486432935, + "grad_norm": 0.7290367484092712, + "learning_rate": 9.454776766005476e-06, + "loss": 0.8181, + "step": 5537 + }, + { + "epoch": 0.3048048874456492, + "grad_norm": 0.773728609085083, + "learning_rate": 9.45457991601275e-06, + "loss": 0.8454, + "step": 5538 + }, + { + "epoch": 0.30485992624800484, + "grad_norm": 0.792169451713562, + "learning_rate": 9.454383032540592e-06, + "loss": 0.8797, + "step": 5539 + }, + { + "epoch": 0.30491496505036053, + "grad_norm": 0.7478733658790588, + "learning_rate": 9.454186115590485e-06, + "loss": 0.7544, + "step": 5540 + }, + { + "epoch": 0.30497000385271616, + "grad_norm": 0.8527306318283081, + "learning_rate": 9.453989165163906e-06, + "loss": 0.8379, + "step": 5541 + }, + { + "epoch": 0.3050250426550718, + "grad_norm": 0.8829329013824463, + "learning_rate": 9.453792181262337e-06, + "loss": 0.7643, + "step": 5542 + }, + { + "epoch": 0.3050800814574275, + "grad_norm": 0.9477338790893555, + "learning_rate": 9.453595163887258e-06, + "loss": 0.7414, + "step": 5543 + }, + { + "epoch": 0.3051351202597831, + "grad_norm": 0.8311536312103271, + "learning_rate": 9.453398113040151e-06, + "loss": 0.8133, + "step": 5544 + }, + { + "epoch": 0.3051901590621388, + "grad_norm": 0.8035525679588318, + "learning_rate": 9.453201028722497e-06, + "loss": 0.7841, + "step": 5545 + }, + { + "epoch": 0.30524519786449444, + "grad_norm": 0.7779183983802795, + "learning_rate": 9.453003910935775e-06, + "loss": 0.7696, + "step": 5546 + }, + { + "epoch": 0.30530023666685013, + "grad_norm": 0.7843946218490601, + "learning_rate": 9.452806759681465e-06, + "loss": 0.6018, + "step": 5547 + }, + { + "epoch": 0.30535527546920577, + "grad_norm": 0.7215032577514648, + "learning_rate": 9.452609574961053e-06, + "loss": 0.7457, + "step": 5548 + }, + { + "epoch": 0.30541031427156146, + "grad_norm": 0.9628198742866516, + "learning_rate": 9.452412356776021e-06, + "loss": 0.8061, + "step": 5549 + }, + { + "epoch": 0.3054653530739171, + "grad_norm": 0.9468308687210083, + "learning_rate": 9.452215105127848e-06, + "loss": 0.7909, + "step": 5550 + }, + { + "epoch": 0.3055203918762728, + "grad_norm": 0.876402735710144, + "learning_rate": 9.452017820018017e-06, + "loss": 0.69, + "step": 5551 + }, + { + "epoch": 0.3055754306786284, + "grad_norm": 1.03409743309021, + "learning_rate": 9.451820501448014e-06, + "loss": 0.8375, + "step": 5552 + }, + { + "epoch": 0.3056304694809841, + "grad_norm": 0.8057541847229004, + "learning_rate": 9.45162314941932e-06, + "loss": 0.7704, + "step": 5553 + }, + { + "epoch": 0.30568550828333974, + "grad_norm": 0.7256304025650024, + "learning_rate": 9.451425763933417e-06, + "loss": 0.7819, + "step": 5554 + }, + { + "epoch": 0.3057405470856954, + "grad_norm": 0.7982180118560791, + "learning_rate": 9.451228344991788e-06, + "loss": 0.8094, + "step": 5555 + }, + { + "epoch": 0.30579558588805106, + "grad_norm": 1.0314620733261108, + "learning_rate": 9.45103089259592e-06, + "loss": 0.7777, + "step": 5556 + }, + { + "epoch": 0.30585062469040675, + "grad_norm": 0.6948755383491516, + "learning_rate": 9.450833406747294e-06, + "loss": 0.7189, + "step": 5557 + }, + { + "epoch": 0.3059056634927624, + "grad_norm": 0.7412117719650269, + "learning_rate": 9.450635887447396e-06, + "loss": 0.783, + "step": 5558 + }, + { + "epoch": 0.30596070229511807, + "grad_norm": 0.7394647002220154, + "learning_rate": 9.450438334697711e-06, + "loss": 0.7888, + "step": 5559 + }, + { + "epoch": 0.3060157410974737, + "grad_norm": 0.692701518535614, + "learning_rate": 9.450240748499725e-06, + "loss": 0.7427, + "step": 5560 + }, + { + "epoch": 0.3060707798998294, + "grad_norm": 0.6854925751686096, + "learning_rate": 9.450043128854916e-06, + "loss": 0.7877, + "step": 5561 + }, + { + "epoch": 0.306125818702185, + "grad_norm": 0.8073517680168152, + "learning_rate": 9.449845475764776e-06, + "loss": 0.8715, + "step": 5562 + }, + { + "epoch": 0.3061808575045407, + "grad_norm": 0.9672908186912537, + "learning_rate": 9.449647789230789e-06, + "loss": 0.782, + "step": 5563 + }, + { + "epoch": 0.30623589630689635, + "grad_norm": 0.7409735918045044, + "learning_rate": 9.44945006925444e-06, + "loss": 0.7956, + "step": 5564 + }, + { + "epoch": 0.30629093510925204, + "grad_norm": 0.7839213609695435, + "learning_rate": 9.449252315837215e-06, + "loss": 0.7559, + "step": 5565 + }, + { + "epoch": 0.30634597391160767, + "grad_norm": 0.668393075466156, + "learning_rate": 9.449054528980602e-06, + "loss": 0.717, + "step": 5566 + }, + { + "epoch": 0.30640101271396336, + "grad_norm": 0.8818438053131104, + "learning_rate": 9.448856708686084e-06, + "loss": 0.7801, + "step": 5567 + }, + { + "epoch": 0.306456051516319, + "grad_norm": 0.7331361770629883, + "learning_rate": 9.44865885495515e-06, + "loss": 0.6999, + "step": 5568 + }, + { + "epoch": 0.3065110903186747, + "grad_norm": 0.7818138599395752, + "learning_rate": 9.448460967789288e-06, + "loss": 0.7437, + "step": 5569 + }, + { + "epoch": 0.3065661291210303, + "grad_norm": 0.7713417410850525, + "learning_rate": 9.448263047189985e-06, + "loss": 0.8523, + "step": 5570 + }, + { + "epoch": 0.306621167923386, + "grad_norm": 0.7152866125106812, + "learning_rate": 9.448065093158726e-06, + "loss": 0.7706, + "step": 5571 + }, + { + "epoch": 0.30667620672574164, + "grad_norm": 0.7486638426780701, + "learning_rate": 9.447867105697e-06, + "loss": 0.7738, + "step": 5572 + }, + { + "epoch": 0.30673124552809733, + "grad_norm": 0.7014918923377991, + "learning_rate": 9.447669084806297e-06, + "loss": 0.7013, + "step": 5573 + }, + { + "epoch": 0.30678628433045296, + "grad_norm": 0.8328303694725037, + "learning_rate": 9.447471030488102e-06, + "loss": 0.8113, + "step": 5574 + }, + { + "epoch": 0.30684132313280865, + "grad_norm": 0.6800024509429932, + "learning_rate": 9.447272942743906e-06, + "loss": 0.6786, + "step": 5575 + }, + { + "epoch": 0.3068963619351643, + "grad_norm": 0.6827595829963684, + "learning_rate": 9.447074821575198e-06, + "loss": 0.812, + "step": 5576 + }, + { + "epoch": 0.30695140073752, + "grad_norm": 0.8775614500045776, + "learning_rate": 9.446876666983465e-06, + "loss": 0.7683, + "step": 5577 + }, + { + "epoch": 0.3070064395398756, + "grad_norm": 0.7440332174301147, + "learning_rate": 9.446678478970198e-06, + "loss": 0.7152, + "step": 5578 + }, + { + "epoch": 0.3070614783422313, + "grad_norm": 0.7031408548355103, + "learning_rate": 9.446480257536885e-06, + "loss": 0.7603, + "step": 5579 + }, + { + "epoch": 0.30711651714458693, + "grad_norm": 0.8419817090034485, + "learning_rate": 9.446282002685019e-06, + "loss": 0.9939, + "step": 5580 + }, + { + "epoch": 0.3071715559469426, + "grad_norm": 0.7622908353805542, + "learning_rate": 9.446083714416085e-06, + "loss": 0.8682, + "step": 5581 + }, + { + "epoch": 0.30722659474929825, + "grad_norm": 0.7341362833976746, + "learning_rate": 9.445885392731576e-06, + "loss": 0.848, + "step": 5582 + }, + { + "epoch": 0.30728163355165394, + "grad_norm": 0.7248286604881287, + "learning_rate": 9.445687037632984e-06, + "loss": 0.7699, + "step": 5583 + }, + { + "epoch": 0.3073366723540096, + "grad_norm": 0.9409947991371155, + "learning_rate": 9.445488649121797e-06, + "loss": 1.0051, + "step": 5584 + }, + { + "epoch": 0.3073917111563652, + "grad_norm": 0.7279968857765198, + "learning_rate": 9.445290227199509e-06, + "loss": 0.8001, + "step": 5585 + }, + { + "epoch": 0.3074467499587209, + "grad_norm": 0.7904797196388245, + "learning_rate": 9.445091771867607e-06, + "loss": 0.8892, + "step": 5586 + }, + { + "epoch": 0.30750178876107653, + "grad_norm": 0.7090430855751038, + "learning_rate": 9.444893283127587e-06, + "loss": 0.5983, + "step": 5587 + }, + { + "epoch": 0.3075568275634322, + "grad_norm": 0.8363901376724243, + "learning_rate": 9.444694760980939e-06, + "loss": 0.7688, + "step": 5588 + }, + { + "epoch": 0.30761186636578786, + "grad_norm": 0.7487169504165649, + "learning_rate": 9.444496205429152e-06, + "loss": 0.7585, + "step": 5589 + }, + { + "epoch": 0.30766690516814355, + "grad_norm": 0.750801146030426, + "learning_rate": 9.444297616473724e-06, + "loss": 0.6493, + "step": 5590 + }, + { + "epoch": 0.3077219439704992, + "grad_norm": 0.754846453666687, + "learning_rate": 9.444098994116144e-06, + "loss": 0.8528, + "step": 5591 + }, + { + "epoch": 0.30777698277285487, + "grad_norm": 0.7088152766227722, + "learning_rate": 9.443900338357907e-06, + "loss": 0.7927, + "step": 5592 + }, + { + "epoch": 0.3078320215752105, + "grad_norm": 0.7077113389968872, + "learning_rate": 9.443701649200503e-06, + "loss": 0.7996, + "step": 5593 + }, + { + "epoch": 0.3078870603775662, + "grad_norm": 0.732982873916626, + "learning_rate": 9.443502926645427e-06, + "loss": 0.7473, + "step": 5594 + }, + { + "epoch": 0.3079420991799218, + "grad_norm": 0.7068434357643127, + "learning_rate": 9.443304170694174e-06, + "loss": 0.7575, + "step": 5595 + }, + { + "epoch": 0.3079971379822775, + "grad_norm": 0.7703887224197388, + "learning_rate": 9.443105381348234e-06, + "loss": 0.8157, + "step": 5596 + }, + { + "epoch": 0.30805217678463315, + "grad_norm": 0.806924045085907, + "learning_rate": 9.442906558609103e-06, + "loss": 0.7572, + "step": 5597 + }, + { + "epoch": 0.30810721558698884, + "grad_norm": 0.8364617824554443, + "learning_rate": 9.442707702478278e-06, + "loss": 0.7491, + "step": 5598 + }, + { + "epoch": 0.30816225438934447, + "grad_norm": 0.9269624352455139, + "learning_rate": 9.442508812957249e-06, + "loss": 0.8746, + "step": 5599 + }, + { + "epoch": 0.30821729319170016, + "grad_norm": 0.7308455109596252, + "learning_rate": 9.442309890047515e-06, + "loss": 0.8068, + "step": 5600 + }, + { + "epoch": 0.3082723319940558, + "grad_norm": 0.812622606754303, + "learning_rate": 9.442110933750567e-06, + "loss": 0.9137, + "step": 5601 + }, + { + "epoch": 0.3083273707964115, + "grad_norm": 0.7100754976272583, + "learning_rate": 9.441911944067905e-06, + "loss": 0.7471, + "step": 5602 + }, + { + "epoch": 0.3083824095987671, + "grad_norm": 0.760208010673523, + "learning_rate": 9.44171292100102e-06, + "loss": 0.8243, + "step": 5603 + }, + { + "epoch": 0.3084374484011228, + "grad_norm": 0.6931812763214111, + "learning_rate": 9.44151386455141e-06, + "loss": 0.7523, + "step": 5604 + }, + { + "epoch": 0.30849248720347844, + "grad_norm": 0.6584734916687012, + "learning_rate": 9.44131477472057e-06, + "loss": 0.6929, + "step": 5605 + }, + { + "epoch": 0.3085475260058341, + "grad_norm": 0.977661669254303, + "learning_rate": 9.441115651509997e-06, + "loss": 0.8003, + "step": 5606 + }, + { + "epoch": 0.30860256480818976, + "grad_norm": 0.650434672832489, + "learning_rate": 9.440916494921189e-06, + "loss": 0.6629, + "step": 5607 + }, + { + "epoch": 0.30865760361054545, + "grad_norm": 0.6804447770118713, + "learning_rate": 9.44071730495564e-06, + "loss": 0.7216, + "step": 5608 + }, + { + "epoch": 0.3087126424129011, + "grad_norm": 0.7942929267883301, + "learning_rate": 9.44051808161485e-06, + "loss": 0.7593, + "step": 5609 + }, + { + "epoch": 0.3087676812152568, + "grad_norm": 0.7069621086120605, + "learning_rate": 9.440318824900313e-06, + "loss": 0.7453, + "step": 5610 + }, + { + "epoch": 0.3088227200176124, + "grad_norm": 0.7903168797492981, + "learning_rate": 9.440119534813528e-06, + "loss": 0.8084, + "step": 5611 + }, + { + "epoch": 0.3088777588199681, + "grad_norm": 0.7828298807144165, + "learning_rate": 9.439920211355993e-06, + "loss": 0.7556, + "step": 5612 + }, + { + "epoch": 0.30893279762232373, + "grad_norm": 0.8118648529052734, + "learning_rate": 9.43972085452921e-06, + "loss": 0.8548, + "step": 5613 + }, + { + "epoch": 0.3089878364246794, + "grad_norm": 0.9169642329216003, + "learning_rate": 9.439521464334669e-06, + "loss": 0.833, + "step": 5614 + }, + { + "epoch": 0.30904287522703505, + "grad_norm": 0.7844422459602356, + "learning_rate": 9.439322040773875e-06, + "loss": 0.8363, + "step": 5615 + }, + { + "epoch": 0.30909791402939074, + "grad_norm": 1.4801305532455444, + "learning_rate": 9.439122583848324e-06, + "loss": 0.7617, + "step": 5616 + }, + { + "epoch": 0.3091529528317464, + "grad_norm": 0.7737647891044617, + "learning_rate": 9.438923093559517e-06, + "loss": 0.7224, + "step": 5617 + }, + { + "epoch": 0.30920799163410206, + "grad_norm": 0.7279127836227417, + "learning_rate": 9.438723569908952e-06, + "loss": 0.7783, + "step": 5618 + }, + { + "epoch": 0.3092630304364577, + "grad_norm": 0.7635996341705322, + "learning_rate": 9.438524012898127e-06, + "loss": 0.8408, + "step": 5619 + }, + { + "epoch": 0.3093180692388134, + "grad_norm": 0.818445086479187, + "learning_rate": 9.438324422528547e-06, + "loss": 0.8836, + "step": 5620 + }, + { + "epoch": 0.309373108041169, + "grad_norm": 0.8620640635490417, + "learning_rate": 9.438124798801706e-06, + "loss": 0.925, + "step": 5621 + }, + { + "epoch": 0.3094281468435247, + "grad_norm": 0.7294883728027344, + "learning_rate": 9.437925141719108e-06, + "loss": 0.8387, + "step": 5622 + }, + { + "epoch": 0.30948318564588034, + "grad_norm": 0.6696046590805054, + "learning_rate": 9.437725451282252e-06, + "loss": 0.6712, + "step": 5623 + }, + { + "epoch": 0.30953822444823603, + "grad_norm": 0.8200504779815674, + "learning_rate": 9.43752572749264e-06, + "loss": 0.8191, + "step": 5624 + }, + { + "epoch": 0.30959326325059167, + "grad_norm": 0.8440756797790527, + "learning_rate": 9.437325970351773e-06, + "loss": 0.7412, + "step": 5625 + }, + { + "epoch": 0.30964830205294736, + "grad_norm": 0.8550771474838257, + "learning_rate": 9.43712617986115e-06, + "loss": 0.7842, + "step": 5626 + }, + { + "epoch": 0.309703340855303, + "grad_norm": 0.8203451037406921, + "learning_rate": 9.436926356022275e-06, + "loss": 0.8298, + "step": 5627 + }, + { + "epoch": 0.3097583796576586, + "grad_norm": 1.0105336904525757, + "learning_rate": 9.436726498836651e-06, + "loss": 0.8416, + "step": 5628 + }, + { + "epoch": 0.3098134184600143, + "grad_norm": 0.7684324383735657, + "learning_rate": 9.436526608305777e-06, + "loss": 0.7051, + "step": 5629 + }, + { + "epoch": 0.30986845726236995, + "grad_norm": 0.7284610867500305, + "learning_rate": 9.436326684431157e-06, + "loss": 0.755, + "step": 5630 + }, + { + "epoch": 0.30992349606472563, + "grad_norm": 0.7125874161720276, + "learning_rate": 9.436126727214293e-06, + "loss": 0.7336, + "step": 5631 + }, + { + "epoch": 0.30997853486708127, + "grad_norm": 0.7008525729179382, + "learning_rate": 9.435926736656687e-06, + "loss": 0.7185, + "step": 5632 + }, + { + "epoch": 0.31003357366943696, + "grad_norm": 0.7087175250053406, + "learning_rate": 9.435726712759844e-06, + "loss": 0.717, + "step": 5633 + }, + { + "epoch": 0.3100886124717926, + "grad_norm": 0.7892497777938843, + "learning_rate": 9.435526655525267e-06, + "loss": 0.8308, + "step": 5634 + }, + { + "epoch": 0.3101436512741483, + "grad_norm": 0.733906626701355, + "learning_rate": 9.435326564954457e-06, + "loss": 0.7421, + "step": 5635 + }, + { + "epoch": 0.3101986900765039, + "grad_norm": 0.7874915599822998, + "learning_rate": 9.43512644104892e-06, + "loss": 0.8808, + "step": 5636 + }, + { + "epoch": 0.3102537288788596, + "grad_norm": 0.6849297881126404, + "learning_rate": 9.434926283810162e-06, + "loss": 0.7297, + "step": 5637 + }, + { + "epoch": 0.31030876768121524, + "grad_norm": 0.7847834825515747, + "learning_rate": 9.434726093239685e-06, + "loss": 0.7873, + "step": 5638 + }, + { + "epoch": 0.3103638064835709, + "grad_norm": 0.6999106407165527, + "learning_rate": 9.434525869338992e-06, + "loss": 0.7699, + "step": 5639 + }, + { + "epoch": 0.31041884528592656, + "grad_norm": 0.7662788033485413, + "learning_rate": 9.43432561210959e-06, + "loss": 0.7583, + "step": 5640 + }, + { + "epoch": 0.31047388408828225, + "grad_norm": 0.8336607217788696, + "learning_rate": 9.434125321552985e-06, + "loss": 0.7297, + "step": 5641 + }, + { + "epoch": 0.3105289228906379, + "grad_norm": 0.8038349151611328, + "learning_rate": 9.433924997670681e-06, + "loss": 0.798, + "step": 5642 + }, + { + "epoch": 0.31058396169299357, + "grad_norm": 0.6819794178009033, + "learning_rate": 9.433724640464181e-06, + "loss": 0.7951, + "step": 5643 + }, + { + "epoch": 0.3106390004953492, + "grad_norm": 0.916238009929657, + "learning_rate": 9.433524249934995e-06, + "loss": 0.7371, + "step": 5644 + }, + { + "epoch": 0.3106940392977049, + "grad_norm": 0.8390263915061951, + "learning_rate": 9.433323826084628e-06, + "loss": 0.8211, + "step": 5645 + }, + { + "epoch": 0.3107490781000605, + "grad_norm": 0.7957239747047424, + "learning_rate": 9.433123368914586e-06, + "loss": 0.8406, + "step": 5646 + }, + { + "epoch": 0.3108041169024162, + "grad_norm": 0.6771933436393738, + "learning_rate": 9.432922878426374e-06, + "loss": 0.7664, + "step": 5647 + }, + { + "epoch": 0.31085915570477185, + "grad_norm": 0.7874065041542053, + "learning_rate": 9.432722354621503e-06, + "loss": 0.7445, + "step": 5648 + }, + { + "epoch": 0.31091419450712754, + "grad_norm": 0.674749493598938, + "learning_rate": 9.432521797501475e-06, + "loss": 0.745, + "step": 5649 + }, + { + "epoch": 0.3109692333094832, + "grad_norm": 0.7695828676223755, + "learning_rate": 9.432321207067799e-06, + "loss": 0.7555, + "step": 5650 + }, + { + "epoch": 0.31102427211183886, + "grad_norm": 0.8050221800804138, + "learning_rate": 9.432120583321984e-06, + "loss": 0.8464, + "step": 5651 + }, + { + "epoch": 0.3110793109141945, + "grad_norm": 0.7242713570594788, + "learning_rate": 9.431919926265538e-06, + "loss": 0.7439, + "step": 5652 + }, + { + "epoch": 0.3111343497165502, + "grad_norm": 0.7372434735298157, + "learning_rate": 9.431719235899967e-06, + "loss": 0.7973, + "step": 5653 + }, + { + "epoch": 0.3111893885189058, + "grad_norm": 0.7573439478874207, + "learning_rate": 9.431518512226783e-06, + "loss": 0.8259, + "step": 5654 + }, + { + "epoch": 0.3112444273212615, + "grad_norm": 0.7098552584648132, + "learning_rate": 9.43131775524749e-06, + "loss": 0.8159, + "step": 5655 + }, + { + "epoch": 0.31129946612361714, + "grad_norm": 0.7804632186889648, + "learning_rate": 9.431116964963599e-06, + "loss": 0.7795, + "step": 5656 + }, + { + "epoch": 0.31135450492597283, + "grad_norm": 1.0158027410507202, + "learning_rate": 9.43091614137662e-06, + "loss": 0.7935, + "step": 5657 + }, + { + "epoch": 0.31140954372832846, + "grad_norm": 0.708238422870636, + "learning_rate": 9.430715284488059e-06, + "loss": 0.7592, + "step": 5658 + }, + { + "epoch": 0.31146458253068415, + "grad_norm": 0.7086984515190125, + "learning_rate": 9.43051439429943e-06, + "loss": 0.7303, + "step": 5659 + }, + { + "epoch": 0.3115196213330398, + "grad_norm": 0.7620081305503845, + "learning_rate": 9.43031347081224e-06, + "loss": 0.7429, + "step": 5660 + }, + { + "epoch": 0.3115746601353955, + "grad_norm": 0.746126115322113, + "learning_rate": 9.430112514028e-06, + "loss": 0.8836, + "step": 5661 + }, + { + "epoch": 0.3116296989377511, + "grad_norm": 0.9113686680793762, + "learning_rate": 9.429911523948221e-06, + "loss": 0.6343, + "step": 5662 + }, + { + "epoch": 0.3116847377401068, + "grad_norm": 0.700890839099884, + "learning_rate": 9.429710500574413e-06, + "loss": 0.8201, + "step": 5663 + }, + { + "epoch": 0.31173977654246243, + "grad_norm": 0.7428706288337708, + "learning_rate": 9.429509443908085e-06, + "loss": 0.6838, + "step": 5664 + }, + { + "epoch": 0.3117948153448181, + "grad_norm": 0.851725697517395, + "learning_rate": 9.429308353950752e-06, + "loss": 0.7151, + "step": 5665 + }, + { + "epoch": 0.31184985414717376, + "grad_norm": 0.8555309176445007, + "learning_rate": 9.42910723070392e-06, + "loss": 0.7384, + "step": 5666 + }, + { + "epoch": 0.31190489294952944, + "grad_norm": 0.735927939414978, + "learning_rate": 9.428906074169107e-06, + "loss": 0.6911, + "step": 5667 + }, + { + "epoch": 0.3119599317518851, + "grad_norm": 0.8007609844207764, + "learning_rate": 9.42870488434782e-06, + "loss": 0.869, + "step": 5668 + }, + { + "epoch": 0.31201497055424077, + "grad_norm": 0.7604133486747742, + "learning_rate": 9.42850366124157e-06, + "loss": 0.7633, + "step": 5669 + }, + { + "epoch": 0.3120700093565964, + "grad_norm": 0.8181144595146179, + "learning_rate": 9.428302404851875e-06, + "loss": 0.7631, + "step": 5670 + }, + { + "epoch": 0.31212504815895203, + "grad_norm": 0.7115523219108582, + "learning_rate": 9.428101115180243e-06, + "loss": 0.734, + "step": 5671 + }, + { + "epoch": 0.3121800869613077, + "grad_norm": 0.7165855765342712, + "learning_rate": 9.42789979222819e-06, + "loss": 0.8068, + "step": 5672 + }, + { + "epoch": 0.31223512576366336, + "grad_norm": 0.6515665650367737, + "learning_rate": 9.427698435997225e-06, + "loss": 0.6946, + "step": 5673 + }, + { + "epoch": 0.31229016456601905, + "grad_norm": 0.7692676186561584, + "learning_rate": 9.427497046488867e-06, + "loss": 0.7387, + "step": 5674 + }, + { + "epoch": 0.3123452033683747, + "grad_norm": 0.70064777135849, + "learning_rate": 9.427295623704625e-06, + "loss": 0.7976, + "step": 5675 + }, + { + "epoch": 0.31240024217073037, + "grad_norm": 0.7464852333068848, + "learning_rate": 9.427094167646013e-06, + "loss": 0.7574, + "step": 5676 + }, + { + "epoch": 0.312455280973086, + "grad_norm": 0.7721675634384155, + "learning_rate": 9.426892678314548e-06, + "loss": 0.7405, + "step": 5677 + }, + { + "epoch": 0.3125103197754417, + "grad_norm": 0.6581596732139587, + "learning_rate": 9.42669115571174e-06, + "loss": 0.6972, + "step": 5678 + }, + { + "epoch": 0.3125653585777973, + "grad_norm": 0.8722662329673767, + "learning_rate": 9.426489599839108e-06, + "loss": 0.8073, + "step": 5679 + }, + { + "epoch": 0.312620397380153, + "grad_norm": 0.6800306439399719, + "learning_rate": 9.426288010698165e-06, + "loss": 0.7721, + "step": 5680 + }, + { + "epoch": 0.31267543618250865, + "grad_norm": 0.7443979382514954, + "learning_rate": 9.426086388290428e-06, + "loss": 0.7719, + "step": 5681 + }, + { + "epoch": 0.31273047498486434, + "grad_norm": 0.7818729877471924, + "learning_rate": 9.425884732617407e-06, + "loss": 0.7815, + "step": 5682 + }, + { + "epoch": 0.31278551378721997, + "grad_norm": 0.7640877366065979, + "learning_rate": 9.425683043680624e-06, + "loss": 0.8315, + "step": 5683 + }, + { + "epoch": 0.31284055258957566, + "grad_norm": 0.6871064305305481, + "learning_rate": 9.42548132148159e-06, + "loss": 0.8017, + "step": 5684 + }, + { + "epoch": 0.3128955913919313, + "grad_norm": 0.8394801616668701, + "learning_rate": 9.425279566021824e-06, + "loss": 0.763, + "step": 5685 + }, + { + "epoch": 0.312950630194287, + "grad_norm": 0.7104960083961487, + "learning_rate": 9.42507777730284e-06, + "loss": 0.7991, + "step": 5686 + }, + { + "epoch": 0.3130056689966426, + "grad_norm": 0.7820347547531128, + "learning_rate": 9.424875955326159e-06, + "loss": 0.825, + "step": 5687 + }, + { + "epoch": 0.3130607077989983, + "grad_norm": 0.783343493938446, + "learning_rate": 9.424674100093292e-06, + "loss": 0.8189, + "step": 5688 + }, + { + "epoch": 0.31311574660135394, + "grad_norm": 0.7998474836349487, + "learning_rate": 9.42447221160576e-06, + "loss": 0.7382, + "step": 5689 + }, + { + "epoch": 0.31317078540370963, + "grad_norm": 0.7232120633125305, + "learning_rate": 9.424270289865078e-06, + "loss": 0.8556, + "step": 5690 + }, + { + "epoch": 0.31322582420606526, + "grad_norm": 0.7944191694259644, + "learning_rate": 9.424068334872764e-06, + "loss": 0.8272, + "step": 5691 + }, + { + "epoch": 0.31328086300842095, + "grad_norm": 0.7951859831809998, + "learning_rate": 9.42386634663034e-06, + "loss": 0.7612, + "step": 5692 + }, + { + "epoch": 0.3133359018107766, + "grad_norm": 1.394667387008667, + "learning_rate": 9.423664325139318e-06, + "loss": 0.8108, + "step": 5693 + }, + { + "epoch": 0.3133909406131323, + "grad_norm": 0.868886411190033, + "learning_rate": 9.42346227040122e-06, + "loss": 0.8308, + "step": 5694 + }, + { + "epoch": 0.3134459794154879, + "grad_norm": 0.9442586302757263, + "learning_rate": 9.423260182417563e-06, + "loss": 0.9145, + "step": 5695 + }, + { + "epoch": 0.3135010182178436, + "grad_norm": 0.7432793974876404, + "learning_rate": 9.423058061189868e-06, + "loss": 0.7715, + "step": 5696 + }, + { + "epoch": 0.31355605702019923, + "grad_norm": 0.7221946120262146, + "learning_rate": 9.422855906719652e-06, + "loss": 0.7588, + "step": 5697 + }, + { + "epoch": 0.3136110958225549, + "grad_norm": 0.7459834814071655, + "learning_rate": 9.422653719008434e-06, + "loss": 0.7834, + "step": 5698 + }, + { + "epoch": 0.31366613462491055, + "grad_norm": 0.8562330007553101, + "learning_rate": 9.422451498057737e-06, + "loss": 0.6994, + "step": 5699 + }, + { + "epoch": 0.31372117342726624, + "grad_norm": 0.672696053981781, + "learning_rate": 9.422249243869075e-06, + "loss": 0.7201, + "step": 5700 + }, + { + "epoch": 0.3137762122296219, + "grad_norm": 0.7459990382194519, + "learning_rate": 9.422046956443973e-06, + "loss": 0.7663, + "step": 5701 + }, + { + "epoch": 0.31383125103197757, + "grad_norm": 0.9653169512748718, + "learning_rate": 9.42184463578395e-06, + "loss": 0.8899, + "step": 5702 + }, + { + "epoch": 0.3138862898343332, + "grad_norm": 0.7137778997421265, + "learning_rate": 9.421642281890526e-06, + "loss": 0.74, + "step": 5703 + }, + { + "epoch": 0.3139413286366889, + "grad_norm": 0.6961745619773865, + "learning_rate": 9.421439894765222e-06, + "loss": 0.7309, + "step": 5704 + }, + { + "epoch": 0.3139963674390445, + "grad_norm": 0.7843212485313416, + "learning_rate": 9.421237474409559e-06, + "loss": 0.8654, + "step": 5705 + }, + { + "epoch": 0.3140514062414002, + "grad_norm": 0.7560604810714722, + "learning_rate": 9.42103502082506e-06, + "loss": 0.7949, + "step": 5706 + }, + { + "epoch": 0.31410644504375584, + "grad_norm": 0.756200909614563, + "learning_rate": 9.420832534013245e-06, + "loss": 0.7315, + "step": 5707 + }, + { + "epoch": 0.31416148384611153, + "grad_norm": 0.7857967615127563, + "learning_rate": 9.420630013975635e-06, + "loss": 0.7698, + "step": 5708 + }, + { + "epoch": 0.31421652264846717, + "grad_norm": 0.6943809986114502, + "learning_rate": 9.420427460713754e-06, + "loss": 0.7691, + "step": 5709 + }, + { + "epoch": 0.31427156145082286, + "grad_norm": 0.7460532188415527, + "learning_rate": 9.420224874229123e-06, + "loss": 0.7679, + "step": 5710 + }, + { + "epoch": 0.3143266002531785, + "grad_norm": 0.764406144618988, + "learning_rate": 9.420022254523265e-06, + "loss": 0.9545, + "step": 5711 + }, + { + "epoch": 0.3143816390555342, + "grad_norm": 0.7191083431243896, + "learning_rate": 9.419819601597703e-06, + "loss": 0.728, + "step": 5712 + }, + { + "epoch": 0.3144366778578898, + "grad_norm": 0.8799699544906616, + "learning_rate": 9.419616915453959e-06, + "loss": 0.6911, + "step": 5713 + }, + { + "epoch": 0.31449171666024545, + "grad_norm": 0.7505975365638733, + "learning_rate": 9.419414196093558e-06, + "loss": 0.7953, + "step": 5714 + }, + { + "epoch": 0.31454675546260114, + "grad_norm": 0.7575502395629883, + "learning_rate": 9.419211443518023e-06, + "loss": 0.7752, + "step": 5715 + }, + { + "epoch": 0.31460179426495677, + "grad_norm": 0.7220337986946106, + "learning_rate": 9.419008657728879e-06, + "loss": 0.7894, + "step": 5716 + }, + { + "epoch": 0.31465683306731246, + "grad_norm": 0.7797306776046753, + "learning_rate": 9.418805838727648e-06, + "loss": 0.7582, + "step": 5717 + }, + { + "epoch": 0.3147118718696681, + "grad_norm": 0.9011242985725403, + "learning_rate": 9.418602986515855e-06, + "loss": 0.7379, + "step": 5718 + }, + { + "epoch": 0.3147669106720238, + "grad_norm": 0.7568445801734924, + "learning_rate": 9.418400101095025e-06, + "loss": 0.8003, + "step": 5719 + }, + { + "epoch": 0.3148219494743794, + "grad_norm": 0.6810547709465027, + "learning_rate": 9.418197182466681e-06, + "loss": 0.7186, + "step": 5720 + }, + { + "epoch": 0.3148769882767351, + "grad_norm": 0.7390284538269043, + "learning_rate": 9.417994230632352e-06, + "loss": 0.7478, + "step": 5721 + }, + { + "epoch": 0.31493202707909074, + "grad_norm": 0.695286214351654, + "learning_rate": 9.41779124559356e-06, + "loss": 0.7467, + "step": 5722 + }, + { + "epoch": 0.3149870658814464, + "grad_norm": 0.7783445715904236, + "learning_rate": 9.41758822735183e-06, + "loss": 0.824, + "step": 5723 + }, + { + "epoch": 0.31504210468380206, + "grad_norm": 0.7176268696784973, + "learning_rate": 9.41738517590869e-06, + "loss": 0.7596, + "step": 5724 + }, + { + "epoch": 0.31509714348615775, + "grad_norm": 0.7829678058624268, + "learning_rate": 9.417182091265668e-06, + "loss": 0.8184, + "step": 5725 + }, + { + "epoch": 0.3151521822885134, + "grad_norm": 0.7461703419685364, + "learning_rate": 9.416978973424286e-06, + "loss": 0.8732, + "step": 5726 + }, + { + "epoch": 0.31520722109086907, + "grad_norm": 0.7186999320983887, + "learning_rate": 9.416775822386073e-06, + "loss": 0.6878, + "step": 5727 + }, + { + "epoch": 0.3152622598932247, + "grad_norm": 0.6775033473968506, + "learning_rate": 9.416572638152553e-06, + "loss": 0.7211, + "step": 5728 + }, + { + "epoch": 0.3153172986955804, + "grad_norm": 0.6845641732215881, + "learning_rate": 9.416369420725258e-06, + "loss": 0.7282, + "step": 5729 + }, + { + "epoch": 0.31537233749793603, + "grad_norm": 0.8301281929016113, + "learning_rate": 9.416166170105712e-06, + "loss": 0.7999, + "step": 5730 + }, + { + "epoch": 0.3154273763002917, + "grad_norm": 0.8487183451652527, + "learning_rate": 9.415962886295442e-06, + "loss": 0.8202, + "step": 5731 + }, + { + "epoch": 0.31548241510264735, + "grad_norm": 0.74607914686203, + "learning_rate": 9.415759569295979e-06, + "loss": 0.7552, + "step": 5732 + }, + { + "epoch": 0.31553745390500304, + "grad_norm": 0.7774194478988647, + "learning_rate": 9.415556219108846e-06, + "loss": 0.7847, + "step": 5733 + }, + { + "epoch": 0.3155924927073587, + "grad_norm": 0.7782126069068909, + "learning_rate": 9.415352835735576e-06, + "loss": 0.8001, + "step": 5734 + }, + { + "epoch": 0.31564753150971436, + "grad_norm": 0.7577764987945557, + "learning_rate": 9.415149419177698e-06, + "loss": 0.8262, + "step": 5735 + }, + { + "epoch": 0.31570257031207, + "grad_norm": 0.7949855327606201, + "learning_rate": 9.414945969436737e-06, + "loss": 0.8259, + "step": 5736 + }, + { + "epoch": 0.3157576091144257, + "grad_norm": 0.7670153379440308, + "learning_rate": 9.414742486514224e-06, + "loss": 0.7181, + "step": 5737 + }, + { + "epoch": 0.3158126479167813, + "grad_norm": 0.7852359414100647, + "learning_rate": 9.414538970411687e-06, + "loss": 0.8802, + "step": 5738 + }, + { + "epoch": 0.315867686719137, + "grad_norm": 0.8300517201423645, + "learning_rate": 9.414335421130658e-06, + "loss": 0.7665, + "step": 5739 + }, + { + "epoch": 0.31592272552149264, + "grad_norm": 0.7631614804267883, + "learning_rate": 9.414131838672666e-06, + "loss": 0.8864, + "step": 5740 + }, + { + "epoch": 0.31597776432384833, + "grad_norm": 0.7946471571922302, + "learning_rate": 9.41392822303924e-06, + "loss": 0.7587, + "step": 5741 + }, + { + "epoch": 0.31603280312620396, + "grad_norm": 0.7043818235397339, + "learning_rate": 9.413724574231912e-06, + "loss": 0.7793, + "step": 5742 + }, + { + "epoch": 0.31608784192855965, + "grad_norm": 0.7276063561439514, + "learning_rate": 9.41352089225221e-06, + "loss": 0.8064, + "step": 5743 + }, + { + "epoch": 0.3161428807309153, + "grad_norm": 0.7141419053077698, + "learning_rate": 9.413317177101667e-06, + "loss": 0.7251, + "step": 5744 + }, + { + "epoch": 0.316197919533271, + "grad_norm": 0.7961493730545044, + "learning_rate": 9.413113428781815e-06, + "loss": 0.8438, + "step": 5745 + }, + { + "epoch": 0.3162529583356266, + "grad_norm": 0.7046970129013062, + "learning_rate": 9.412909647294181e-06, + "loss": 0.8319, + "step": 5746 + }, + { + "epoch": 0.3163079971379823, + "grad_norm": 0.8231918215751648, + "learning_rate": 9.412705832640302e-06, + "loss": 0.7707, + "step": 5747 + }, + { + "epoch": 0.31636303594033793, + "grad_norm": 0.769840657711029, + "learning_rate": 9.412501984821705e-06, + "loss": 0.6819, + "step": 5748 + }, + { + "epoch": 0.3164180747426936, + "grad_norm": 0.7526834607124329, + "learning_rate": 9.412298103839925e-06, + "loss": 0.8106, + "step": 5749 + }, + { + "epoch": 0.31647311354504926, + "grad_norm": 0.6763152480125427, + "learning_rate": 9.412094189696494e-06, + "loss": 0.7577, + "step": 5750 + }, + { + "epoch": 0.31652815234740495, + "grad_norm": 0.8460820317268372, + "learning_rate": 9.411890242392945e-06, + "loss": 0.752, + "step": 5751 + }, + { + "epoch": 0.3165831911497606, + "grad_norm": 0.7610277533531189, + "learning_rate": 9.411686261930809e-06, + "loss": 0.7284, + "step": 5752 + }, + { + "epoch": 0.31663822995211627, + "grad_norm": 0.7596566081047058, + "learning_rate": 9.411482248311619e-06, + "loss": 0.8518, + "step": 5753 + }, + { + "epoch": 0.3166932687544719, + "grad_norm": 0.7615048885345459, + "learning_rate": 9.41127820153691e-06, + "loss": 0.8232, + "step": 5754 + }, + { + "epoch": 0.3167483075568276, + "grad_norm": 0.7882834672927856, + "learning_rate": 9.411074121608215e-06, + "loss": 0.8682, + "step": 5755 + }, + { + "epoch": 0.3168033463591832, + "grad_norm": 0.748002827167511, + "learning_rate": 9.410870008527067e-06, + "loss": 0.7934, + "step": 5756 + }, + { + "epoch": 0.31685838516153886, + "grad_norm": 0.7677696943283081, + "learning_rate": 9.410665862295003e-06, + "loss": 0.8114, + "step": 5757 + }, + { + "epoch": 0.31691342396389455, + "grad_norm": 0.8966217041015625, + "learning_rate": 9.410461682913552e-06, + "loss": 0.8005, + "step": 5758 + }, + { + "epoch": 0.3169684627662502, + "grad_norm": 0.8769435286521912, + "learning_rate": 9.410257470384253e-06, + "loss": 0.7935, + "step": 5759 + }, + { + "epoch": 0.31702350156860587, + "grad_norm": 0.9828680753707886, + "learning_rate": 9.41005322470864e-06, + "loss": 0.8182, + "step": 5760 + }, + { + "epoch": 0.3170785403709615, + "grad_norm": 0.7340976595878601, + "learning_rate": 9.409848945888245e-06, + "loss": 0.7832, + "step": 5761 + }, + { + "epoch": 0.3171335791733172, + "grad_norm": 0.7516821622848511, + "learning_rate": 9.409644633924609e-06, + "loss": 0.8223, + "step": 5762 + }, + { + "epoch": 0.3171886179756728, + "grad_norm": 0.7556331157684326, + "learning_rate": 9.409440288819263e-06, + "loss": 0.7631, + "step": 5763 + }, + { + "epoch": 0.3172436567780285, + "grad_norm": 0.6182114481925964, + "learning_rate": 9.409235910573743e-06, + "loss": 0.558, + "step": 5764 + }, + { + "epoch": 0.31729869558038415, + "grad_norm": 0.7854578495025635, + "learning_rate": 9.409031499189586e-06, + "loss": 0.8496, + "step": 5765 + }, + { + "epoch": 0.31735373438273984, + "grad_norm": 0.7246551513671875, + "learning_rate": 9.40882705466833e-06, + "loss": 0.8407, + "step": 5766 + }, + { + "epoch": 0.31740877318509547, + "grad_norm": 1.089107632637024, + "learning_rate": 9.40862257701151e-06, + "loss": 0.8363, + "step": 5767 + }, + { + "epoch": 0.31746381198745116, + "grad_norm": 0.9886558055877686, + "learning_rate": 9.408418066220664e-06, + "loss": 0.6888, + "step": 5768 + }, + { + "epoch": 0.3175188507898068, + "grad_norm": 0.8724960088729858, + "learning_rate": 9.408213522297325e-06, + "loss": 0.7717, + "step": 5769 + }, + { + "epoch": 0.3175738895921625, + "grad_norm": 0.7453228831291199, + "learning_rate": 9.408008945243035e-06, + "loss": 0.7081, + "step": 5770 + }, + { + "epoch": 0.3176289283945181, + "grad_norm": 0.7601909637451172, + "learning_rate": 9.40780433505933e-06, + "loss": 0.7974, + "step": 5771 + }, + { + "epoch": 0.3176839671968738, + "grad_norm": 0.7704907655715942, + "learning_rate": 9.407599691747746e-06, + "loss": 0.7521, + "step": 5772 + }, + { + "epoch": 0.31773900599922944, + "grad_norm": 0.7639214396476746, + "learning_rate": 9.407395015309824e-06, + "loss": 0.7888, + "step": 5773 + }, + { + "epoch": 0.31779404480158513, + "grad_norm": 0.711355984210968, + "learning_rate": 9.4071903057471e-06, + "loss": 0.7482, + "step": 5774 + }, + { + "epoch": 0.31784908360394076, + "grad_norm": 0.6097242832183838, + "learning_rate": 9.406985563061114e-06, + "loss": 0.6533, + "step": 5775 + }, + { + "epoch": 0.31790412240629645, + "grad_norm": 0.807133138179779, + "learning_rate": 9.406780787253402e-06, + "loss": 0.7788, + "step": 5776 + }, + { + "epoch": 0.3179591612086521, + "grad_norm": 0.6938545107841492, + "learning_rate": 9.406575978325508e-06, + "loss": 0.8046, + "step": 5777 + }, + { + "epoch": 0.3180142000110078, + "grad_norm": 0.848858118057251, + "learning_rate": 9.406371136278968e-06, + "loss": 0.8481, + "step": 5778 + }, + { + "epoch": 0.3180692388133634, + "grad_norm": 0.8496920466423035, + "learning_rate": 9.40616626111532e-06, + "loss": 0.8172, + "step": 5779 + }, + { + "epoch": 0.3181242776157191, + "grad_norm": 0.8169928193092346, + "learning_rate": 9.405961352836107e-06, + "loss": 0.792, + "step": 5780 + }, + { + "epoch": 0.31817931641807473, + "grad_norm": 0.9380607604980469, + "learning_rate": 9.405756411442868e-06, + "loss": 0.8371, + "step": 5781 + }, + { + "epoch": 0.3182343552204304, + "grad_norm": 0.6938190460205078, + "learning_rate": 9.405551436937144e-06, + "loss": 0.7825, + "step": 5782 + }, + { + "epoch": 0.31828939402278605, + "grad_norm": 0.7726871371269226, + "learning_rate": 9.405346429320473e-06, + "loss": 0.6481, + "step": 5783 + }, + { + "epoch": 0.31834443282514174, + "grad_norm": 0.77762770652771, + "learning_rate": 9.4051413885944e-06, + "loss": 0.6916, + "step": 5784 + }, + { + "epoch": 0.3183994716274974, + "grad_norm": 0.7580817341804504, + "learning_rate": 9.404936314760459e-06, + "loss": 0.8222, + "step": 5785 + }, + { + "epoch": 0.31845451042985307, + "grad_norm": 0.6984102725982666, + "learning_rate": 9.4047312078202e-06, + "loss": 0.707, + "step": 5786 + }, + { + "epoch": 0.3185095492322087, + "grad_norm": 0.6887965202331543, + "learning_rate": 9.404526067775159e-06, + "loss": 0.7289, + "step": 5787 + }, + { + "epoch": 0.3185645880345644, + "grad_norm": 0.7022155523300171, + "learning_rate": 9.404320894626879e-06, + "loss": 0.741, + "step": 5788 + }, + { + "epoch": 0.31861962683692, + "grad_norm": 0.8007381558418274, + "learning_rate": 9.404115688376903e-06, + "loss": 0.8332, + "step": 5789 + }, + { + "epoch": 0.3186746656392757, + "grad_norm": 0.6985924243927002, + "learning_rate": 9.40391044902677e-06, + "loss": 0.7849, + "step": 5790 + }, + { + "epoch": 0.31872970444163135, + "grad_norm": 0.771060585975647, + "learning_rate": 9.403705176578028e-06, + "loss": 0.8728, + "step": 5791 + }, + { + "epoch": 0.31878474324398703, + "grad_norm": 0.6976794600486755, + "learning_rate": 9.403499871032214e-06, + "loss": 0.7621, + "step": 5792 + }, + { + "epoch": 0.31883978204634267, + "grad_norm": 0.7552126049995422, + "learning_rate": 9.403294532390876e-06, + "loss": 0.7641, + "step": 5793 + }, + { + "epoch": 0.31889482084869836, + "grad_norm": 1.0032007694244385, + "learning_rate": 9.403089160655553e-06, + "loss": 0.8497, + "step": 5794 + }, + { + "epoch": 0.318949859651054, + "grad_norm": 0.7193583250045776, + "learning_rate": 9.402883755827792e-06, + "loss": 0.7991, + "step": 5795 + }, + { + "epoch": 0.3190048984534097, + "grad_norm": 0.7665852308273315, + "learning_rate": 9.402678317909135e-06, + "loss": 0.7692, + "step": 5796 + }, + { + "epoch": 0.3190599372557653, + "grad_norm": 0.7514237761497498, + "learning_rate": 9.402472846901125e-06, + "loss": 0.7388, + "step": 5797 + }, + { + "epoch": 0.319114976058121, + "grad_norm": 0.6817325353622437, + "learning_rate": 9.402267342805309e-06, + "loss": 0.7249, + "step": 5798 + }, + { + "epoch": 0.31917001486047664, + "grad_norm": 0.7659624218940735, + "learning_rate": 9.402061805623229e-06, + "loss": 0.755, + "step": 5799 + }, + { + "epoch": 0.31922505366283227, + "grad_norm": 0.7860668301582336, + "learning_rate": 9.401856235356431e-06, + "loss": 0.8175, + "step": 5800 + }, + { + "epoch": 0.31928009246518796, + "grad_norm": 0.714030921459198, + "learning_rate": 9.401650632006461e-06, + "loss": 0.7359, + "step": 5801 + }, + { + "epoch": 0.3193351312675436, + "grad_norm": 0.6052672266960144, + "learning_rate": 9.401444995574862e-06, + "loss": 0.6167, + "step": 5802 + }, + { + "epoch": 0.3193901700698993, + "grad_norm": 0.7960858941078186, + "learning_rate": 9.40123932606318e-06, + "loss": 0.7542, + "step": 5803 + }, + { + "epoch": 0.3194452088722549, + "grad_norm": 0.7926718592643738, + "learning_rate": 9.401033623472962e-06, + "loss": 0.8292, + "step": 5804 + }, + { + "epoch": 0.3195002476746106, + "grad_norm": 0.7950098514556885, + "learning_rate": 9.400827887805754e-06, + "loss": 0.9332, + "step": 5805 + }, + { + "epoch": 0.31955528647696624, + "grad_norm": 0.7564939260482788, + "learning_rate": 9.400622119063101e-06, + "loss": 0.7217, + "step": 5806 + }, + { + "epoch": 0.3196103252793219, + "grad_norm": 0.7582511901855469, + "learning_rate": 9.40041631724655e-06, + "loss": 0.723, + "step": 5807 + }, + { + "epoch": 0.31966536408167756, + "grad_norm": 0.8826366066932678, + "learning_rate": 9.400210482357648e-06, + "loss": 0.6977, + "step": 5808 + }, + { + "epoch": 0.31972040288403325, + "grad_norm": 0.7029523253440857, + "learning_rate": 9.400004614397941e-06, + "loss": 0.6949, + "step": 5809 + }, + { + "epoch": 0.3197754416863889, + "grad_norm": 0.7651532888412476, + "learning_rate": 9.399798713368979e-06, + "loss": 0.7158, + "step": 5810 + }, + { + "epoch": 0.3198304804887446, + "grad_norm": 0.9379491806030273, + "learning_rate": 9.399592779272307e-06, + "loss": 0.7639, + "step": 5811 + }, + { + "epoch": 0.3198855192911002, + "grad_norm": 0.7945839762687683, + "learning_rate": 9.399386812109474e-06, + "loss": 0.8175, + "step": 5812 + }, + { + "epoch": 0.3199405580934559, + "grad_norm": 0.9462345242500305, + "learning_rate": 9.399180811882025e-06, + "loss": 0.6635, + "step": 5813 + }, + { + "epoch": 0.31999559689581153, + "grad_norm": 1.0449726581573486, + "learning_rate": 9.398974778591513e-06, + "loss": 0.789, + "step": 5814 + }, + { + "epoch": 0.3200506356981672, + "grad_norm": 0.8295683860778809, + "learning_rate": 9.398768712239483e-06, + "loss": 0.7937, + "step": 5815 + }, + { + "epoch": 0.32010567450052285, + "grad_norm": 0.7578030228614807, + "learning_rate": 9.398562612827485e-06, + "loss": 0.8291, + "step": 5816 + }, + { + "epoch": 0.32016071330287854, + "grad_norm": 0.804563581943512, + "learning_rate": 9.398356480357068e-06, + "loss": 0.7604, + "step": 5817 + }, + { + "epoch": 0.3202157521052342, + "grad_norm": 0.8073337078094482, + "learning_rate": 9.39815031482978e-06, + "loss": 0.8288, + "step": 5818 + }, + { + "epoch": 0.32027079090758986, + "grad_norm": 0.8054978251457214, + "learning_rate": 9.397944116247173e-06, + "loss": 0.819, + "step": 5819 + }, + { + "epoch": 0.3203258297099455, + "grad_norm": 0.8304697871208191, + "learning_rate": 9.397737884610794e-06, + "loss": 0.7991, + "step": 5820 + }, + { + "epoch": 0.3203808685123012, + "grad_norm": 0.784662663936615, + "learning_rate": 9.397531619922195e-06, + "loss": 0.763, + "step": 5821 + }, + { + "epoch": 0.3204359073146568, + "grad_norm": 0.726046085357666, + "learning_rate": 9.397325322182926e-06, + "loss": 0.7926, + "step": 5822 + }, + { + "epoch": 0.3204909461170125, + "grad_norm": 0.7291107773780823, + "learning_rate": 9.397118991394535e-06, + "loss": 0.6871, + "step": 5823 + }, + { + "epoch": 0.32054598491936814, + "grad_norm": 0.7870203256607056, + "learning_rate": 9.396912627558577e-06, + "loss": 0.7827, + "step": 5824 + }, + { + "epoch": 0.32060102372172383, + "grad_norm": 0.8665844798088074, + "learning_rate": 9.3967062306766e-06, + "loss": 0.8098, + "step": 5825 + }, + { + "epoch": 0.32065606252407947, + "grad_norm": 0.7743843793869019, + "learning_rate": 9.396499800750157e-06, + "loss": 0.835, + "step": 5826 + }, + { + "epoch": 0.32071110132643516, + "grad_norm": 0.7724023461341858, + "learning_rate": 9.396293337780796e-06, + "loss": 0.8928, + "step": 5827 + }, + { + "epoch": 0.3207661401287908, + "grad_norm": 0.7497217655181885, + "learning_rate": 9.39608684177007e-06, + "loss": 0.8035, + "step": 5828 + }, + { + "epoch": 0.3208211789311465, + "grad_norm": 0.8346971869468689, + "learning_rate": 9.395880312719536e-06, + "loss": 0.8879, + "step": 5829 + }, + { + "epoch": 0.3208762177335021, + "grad_norm": 0.836626410484314, + "learning_rate": 9.39567375063074e-06, + "loss": 0.8523, + "step": 5830 + }, + { + "epoch": 0.3209312565358578, + "grad_norm": 0.734428346157074, + "learning_rate": 9.395467155505237e-06, + "loss": 0.7568, + "step": 5831 + }, + { + "epoch": 0.32098629533821343, + "grad_norm": 0.6620383858680725, + "learning_rate": 9.39526052734458e-06, + "loss": 0.7296, + "step": 5832 + }, + { + "epoch": 0.3210413341405691, + "grad_norm": 0.9356484413146973, + "learning_rate": 9.39505386615032e-06, + "loss": 0.8233, + "step": 5833 + }, + { + "epoch": 0.32109637294292476, + "grad_norm": 0.9238032698631287, + "learning_rate": 9.394847171924013e-06, + "loss": 0.7397, + "step": 5834 + }, + { + "epoch": 0.32115141174528045, + "grad_norm": 0.7161185145378113, + "learning_rate": 9.39464044466721e-06, + "loss": 0.7541, + "step": 5835 + }, + { + "epoch": 0.3212064505476361, + "grad_norm": 0.8381507396697998, + "learning_rate": 9.394433684381467e-06, + "loss": 0.7839, + "step": 5836 + }, + { + "epoch": 0.32126148934999177, + "grad_norm": 0.8299819231033325, + "learning_rate": 9.394226891068337e-06, + "loss": 0.871, + "step": 5837 + }, + { + "epoch": 0.3213165281523474, + "grad_norm": 0.7443987131118774, + "learning_rate": 9.394020064729372e-06, + "loss": 0.7661, + "step": 5838 + }, + { + "epoch": 0.3213715669547031, + "grad_norm": 0.7084206938743591, + "learning_rate": 9.393813205366128e-06, + "loss": 0.7609, + "step": 5839 + }, + { + "epoch": 0.3214266057570587, + "grad_norm": 0.7443114519119263, + "learning_rate": 9.393606312980164e-06, + "loss": 0.8189, + "step": 5840 + }, + { + "epoch": 0.3214816445594144, + "grad_norm": 0.7157652974128723, + "learning_rate": 9.393399387573028e-06, + "loss": 0.8369, + "step": 5841 + }, + { + "epoch": 0.32153668336177005, + "grad_norm": 0.709507942199707, + "learning_rate": 9.393192429146278e-06, + "loss": 0.7314, + "step": 5842 + }, + { + "epoch": 0.3215917221641257, + "grad_norm": 0.7704687714576721, + "learning_rate": 9.39298543770147e-06, + "loss": 0.8793, + "step": 5843 + }, + { + "epoch": 0.32164676096648137, + "grad_norm": 0.8123828172683716, + "learning_rate": 9.39277841324016e-06, + "loss": 0.8748, + "step": 5844 + }, + { + "epoch": 0.321701799768837, + "grad_norm": 0.6951777338981628, + "learning_rate": 9.392571355763903e-06, + "loss": 0.7883, + "step": 5845 + }, + { + "epoch": 0.3217568385711927, + "grad_norm": 0.6753274202346802, + "learning_rate": 9.392364265274256e-06, + "loss": 0.7292, + "step": 5846 + }, + { + "epoch": 0.3218118773735483, + "grad_norm": 0.7940227389335632, + "learning_rate": 9.392157141772775e-06, + "loss": 0.7919, + "step": 5847 + }, + { + "epoch": 0.321866916175904, + "grad_norm": 0.6706317067146301, + "learning_rate": 9.391949985261016e-06, + "loss": 0.6791, + "step": 5848 + }, + { + "epoch": 0.32192195497825965, + "grad_norm": 0.7898741960525513, + "learning_rate": 9.391742795740537e-06, + "loss": 0.7539, + "step": 5849 + }, + { + "epoch": 0.32197699378061534, + "grad_norm": 0.7623887658119202, + "learning_rate": 9.391535573212895e-06, + "loss": 0.7891, + "step": 5850 + }, + { + "epoch": 0.322032032582971, + "grad_norm": 0.6852909326553345, + "learning_rate": 9.391328317679647e-06, + "loss": 0.6587, + "step": 5851 + }, + { + "epoch": 0.32208707138532666, + "grad_norm": 0.7944231033325195, + "learning_rate": 9.39112102914235e-06, + "loss": 0.8316, + "step": 5852 + }, + { + "epoch": 0.3221421101876823, + "grad_norm": 0.6720889806747437, + "learning_rate": 9.390913707602563e-06, + "loss": 0.7791, + "step": 5853 + }, + { + "epoch": 0.322197148990038, + "grad_norm": 0.7482234239578247, + "learning_rate": 9.390706353061845e-06, + "loss": 0.826, + "step": 5854 + }, + { + "epoch": 0.3222521877923936, + "grad_norm": 0.6821579933166504, + "learning_rate": 9.390498965521752e-06, + "loss": 0.7183, + "step": 5855 + }, + { + "epoch": 0.3223072265947493, + "grad_norm": 0.755171537399292, + "learning_rate": 9.390291544983845e-06, + "loss": 0.6887, + "step": 5856 + }, + { + "epoch": 0.32236226539710494, + "grad_norm": 0.748824417591095, + "learning_rate": 9.39008409144968e-06, + "loss": 0.7169, + "step": 5857 + }, + { + "epoch": 0.32241730419946063, + "grad_norm": 0.7479343414306641, + "learning_rate": 9.38987660492082e-06, + "loss": 0.8122, + "step": 5858 + }, + { + "epoch": 0.32247234300181626, + "grad_norm": 0.7459376454353333, + "learning_rate": 9.389669085398823e-06, + "loss": 0.7782, + "step": 5859 + }, + { + "epoch": 0.32252738180417195, + "grad_norm": 0.7016253471374512, + "learning_rate": 9.389461532885246e-06, + "loss": 0.7866, + "step": 5860 + }, + { + "epoch": 0.3225824206065276, + "grad_norm": 0.6711822152137756, + "learning_rate": 9.389253947381654e-06, + "loss": 0.7223, + "step": 5861 + }, + { + "epoch": 0.3226374594088833, + "grad_norm": 0.855045735836029, + "learning_rate": 9.389046328889602e-06, + "loss": 0.7327, + "step": 5862 + }, + { + "epoch": 0.3226924982112389, + "grad_norm": 0.7309823632240295, + "learning_rate": 9.388838677410654e-06, + "loss": 0.7737, + "step": 5863 + }, + { + "epoch": 0.3227475370135946, + "grad_norm": 0.7737841010093689, + "learning_rate": 9.388630992946369e-06, + "loss": 0.7061, + "step": 5864 + }, + { + "epoch": 0.32280257581595023, + "grad_norm": 0.9448195099830627, + "learning_rate": 9.388423275498307e-06, + "loss": 0.8382, + "step": 5865 + }, + { + "epoch": 0.3228576146183059, + "grad_norm": 0.7348229885101318, + "learning_rate": 9.388215525068032e-06, + "loss": 0.8317, + "step": 5866 + }, + { + "epoch": 0.32291265342066156, + "grad_norm": 1.2628185749053955, + "learning_rate": 9.388007741657103e-06, + "loss": 0.7959, + "step": 5867 + }, + { + "epoch": 0.32296769222301724, + "grad_norm": 0.7730327844619751, + "learning_rate": 9.387799925267083e-06, + "loss": 0.7455, + "step": 5868 + }, + { + "epoch": 0.3230227310253729, + "grad_norm": 0.8273047804832458, + "learning_rate": 9.387592075899532e-06, + "loss": 0.877, + "step": 5869 + }, + { + "epoch": 0.32307776982772857, + "grad_norm": 0.7413405776023865, + "learning_rate": 9.387384193556014e-06, + "loss": 0.7734, + "step": 5870 + }, + { + "epoch": 0.3231328086300842, + "grad_norm": 1.0173207521438599, + "learning_rate": 9.387176278238092e-06, + "loss": 0.8674, + "step": 5871 + }, + { + "epoch": 0.3231878474324399, + "grad_norm": 0.7741677761077881, + "learning_rate": 9.386968329947327e-06, + "loss": 0.8226, + "step": 5872 + }, + { + "epoch": 0.3232428862347955, + "grad_norm": 0.8912034034729004, + "learning_rate": 9.38676034868528e-06, + "loss": 0.7977, + "step": 5873 + }, + { + "epoch": 0.3232979250371512, + "grad_norm": 0.7343642711639404, + "learning_rate": 9.386552334453519e-06, + "loss": 0.7639, + "step": 5874 + }, + { + "epoch": 0.32335296383950685, + "grad_norm": 0.697225034236908, + "learning_rate": 9.386344287253603e-06, + "loss": 0.6801, + "step": 5875 + }, + { + "epoch": 0.32340800264186254, + "grad_norm": 0.7082511186599731, + "learning_rate": 9.386136207087099e-06, + "loss": 0.746, + "step": 5876 + }, + { + "epoch": 0.32346304144421817, + "grad_norm": 0.671419620513916, + "learning_rate": 9.38592809395557e-06, + "loss": 0.7023, + "step": 5877 + }, + { + "epoch": 0.32351808024657386, + "grad_norm": 0.775834321975708, + "learning_rate": 9.385719947860579e-06, + "loss": 0.7797, + "step": 5878 + }, + { + "epoch": 0.3235731190489295, + "grad_norm": 0.7867023348808289, + "learning_rate": 9.38551176880369e-06, + "loss": 0.8165, + "step": 5879 + }, + { + "epoch": 0.3236281578512852, + "grad_norm": 0.7099916934967041, + "learning_rate": 9.385303556786469e-06, + "loss": 0.7598, + "step": 5880 + }, + { + "epoch": 0.3236831966536408, + "grad_norm": 0.7362176179885864, + "learning_rate": 9.385095311810479e-06, + "loss": 0.8002, + "step": 5881 + }, + { + "epoch": 0.3237382354559965, + "grad_norm": 0.7310882806777954, + "learning_rate": 9.384887033877288e-06, + "loss": 0.7641, + "step": 5882 + }, + { + "epoch": 0.32379327425835214, + "grad_norm": 0.7769907116889954, + "learning_rate": 9.384678722988458e-06, + "loss": 0.7938, + "step": 5883 + }, + { + "epoch": 0.3238483130607078, + "grad_norm": 0.9913623929023743, + "learning_rate": 9.384470379145558e-06, + "loss": 0.8203, + "step": 5884 + }, + { + "epoch": 0.32390335186306346, + "grad_norm": 0.8765702247619629, + "learning_rate": 9.384262002350153e-06, + "loss": 0.9343, + "step": 5885 + }, + { + "epoch": 0.3239583906654191, + "grad_norm": 0.8122400641441345, + "learning_rate": 9.384053592603808e-06, + "loss": 0.8325, + "step": 5886 + }, + { + "epoch": 0.3240134294677748, + "grad_norm": 0.7600317597389221, + "learning_rate": 9.383845149908089e-06, + "loss": 0.8335, + "step": 5887 + }, + { + "epoch": 0.3240684682701304, + "grad_norm": 0.9472025632858276, + "learning_rate": 9.383636674264563e-06, + "loss": 0.7265, + "step": 5888 + }, + { + "epoch": 0.3241235070724861, + "grad_norm": 0.6961854100227356, + "learning_rate": 9.383428165674797e-06, + "loss": 0.6962, + "step": 5889 + }, + { + "epoch": 0.32417854587484174, + "grad_norm": 0.7032504081726074, + "learning_rate": 9.38321962414036e-06, + "loss": 0.7627, + "step": 5890 + }, + { + "epoch": 0.32423358467719743, + "grad_norm": 0.7727648019790649, + "learning_rate": 9.383011049662816e-06, + "loss": 0.757, + "step": 5891 + }, + { + "epoch": 0.32428862347955306, + "grad_norm": 0.7263824343681335, + "learning_rate": 9.382802442243735e-06, + "loss": 0.8057, + "step": 5892 + }, + { + "epoch": 0.32434366228190875, + "grad_norm": 0.7576926350593567, + "learning_rate": 9.382593801884683e-06, + "loss": 0.763, + "step": 5893 + }, + { + "epoch": 0.3243987010842644, + "grad_norm": 0.7468064427375793, + "learning_rate": 9.38238512858723e-06, + "loss": 0.731, + "step": 5894 + }, + { + "epoch": 0.3244537398866201, + "grad_norm": 0.9570005536079407, + "learning_rate": 9.382176422352944e-06, + "loss": 0.7985, + "step": 5895 + }, + { + "epoch": 0.3245087786889757, + "grad_norm": 0.7296027541160583, + "learning_rate": 9.381967683183393e-06, + "loss": 0.8117, + "step": 5896 + }, + { + "epoch": 0.3245638174913314, + "grad_norm": 0.7330880165100098, + "learning_rate": 9.381758911080145e-06, + "loss": 0.7229, + "step": 5897 + }, + { + "epoch": 0.32461885629368703, + "grad_norm": 0.7247695922851562, + "learning_rate": 9.38155010604477e-06, + "loss": 0.7704, + "step": 5898 + }, + { + "epoch": 0.3246738950960427, + "grad_norm": 0.8011599779129028, + "learning_rate": 9.381341268078836e-06, + "loss": 0.6982, + "step": 5899 + }, + { + "epoch": 0.32472893389839835, + "grad_norm": 0.7931570410728455, + "learning_rate": 9.381132397183917e-06, + "loss": 0.8188, + "step": 5900 + }, + { + "epoch": 0.32478397270075404, + "grad_norm": 0.7469003200531006, + "learning_rate": 9.380923493361577e-06, + "loss": 0.7638, + "step": 5901 + }, + { + "epoch": 0.3248390115031097, + "grad_norm": 0.7442750334739685, + "learning_rate": 9.380714556613391e-06, + "loss": 0.8134, + "step": 5902 + }, + { + "epoch": 0.32489405030546537, + "grad_norm": 0.8014402985572815, + "learning_rate": 9.380505586940925e-06, + "loss": 0.838, + "step": 5903 + }, + { + "epoch": 0.324949089107821, + "grad_norm": 0.7287543416023254, + "learning_rate": 9.380296584345751e-06, + "loss": 0.7317, + "step": 5904 + }, + { + "epoch": 0.3250041279101767, + "grad_norm": 0.7754266262054443, + "learning_rate": 9.380087548829441e-06, + "loss": 0.7205, + "step": 5905 + }, + { + "epoch": 0.3250591667125323, + "grad_norm": 0.7439714074134827, + "learning_rate": 9.379878480393567e-06, + "loss": 0.821, + "step": 5906 + }, + { + "epoch": 0.325114205514888, + "grad_norm": 0.7142870426177979, + "learning_rate": 9.379669379039698e-06, + "loss": 0.7462, + "step": 5907 + }, + { + "epoch": 0.32516924431724364, + "grad_norm": 0.6522948145866394, + "learning_rate": 9.379460244769407e-06, + "loss": 0.739, + "step": 5908 + }, + { + "epoch": 0.32522428311959933, + "grad_norm": 0.7879271507263184, + "learning_rate": 9.379251077584263e-06, + "loss": 0.719, + "step": 5909 + }, + { + "epoch": 0.32527932192195497, + "grad_norm": 0.6969109773635864, + "learning_rate": 9.379041877485842e-06, + "loss": 0.7517, + "step": 5910 + }, + { + "epoch": 0.32533436072431066, + "grad_norm": 0.736890971660614, + "learning_rate": 9.378832644475714e-06, + "loss": 0.7797, + "step": 5911 + }, + { + "epoch": 0.3253893995266663, + "grad_norm": 0.7504066824913025, + "learning_rate": 9.378623378555451e-06, + "loss": 0.7502, + "step": 5912 + }, + { + "epoch": 0.325444438329022, + "grad_norm": 0.9339223504066467, + "learning_rate": 9.378414079726629e-06, + "loss": 0.8842, + "step": 5913 + }, + { + "epoch": 0.3254994771313776, + "grad_norm": 1.08317232131958, + "learning_rate": 9.378204747990818e-06, + "loss": 0.7503, + "step": 5914 + }, + { + "epoch": 0.3255545159337333, + "grad_norm": 0.722665011882782, + "learning_rate": 9.37799538334959e-06, + "loss": 0.7825, + "step": 5915 + }, + { + "epoch": 0.32560955473608894, + "grad_norm": 0.7969509959220886, + "learning_rate": 9.377785985804521e-06, + "loss": 0.8678, + "step": 5916 + }, + { + "epoch": 0.3256645935384446, + "grad_norm": 0.7944697141647339, + "learning_rate": 9.377576555357187e-06, + "loss": 0.8067, + "step": 5917 + }, + { + "epoch": 0.32571963234080026, + "grad_norm": 0.905580461025238, + "learning_rate": 9.377367092009158e-06, + "loss": 0.7689, + "step": 5918 + }, + { + "epoch": 0.32577467114315595, + "grad_norm": 0.7428018450737, + "learning_rate": 9.37715759576201e-06, + "loss": 0.7748, + "step": 5919 + }, + { + "epoch": 0.3258297099455116, + "grad_norm": 0.7746098041534424, + "learning_rate": 9.376948066617316e-06, + "loss": 0.7235, + "step": 5920 + }, + { + "epoch": 0.32588474874786727, + "grad_norm": 0.6842886805534363, + "learning_rate": 9.376738504576653e-06, + "loss": 0.7697, + "step": 5921 + }, + { + "epoch": 0.3259397875502229, + "grad_norm": 0.7858961224555969, + "learning_rate": 9.376528909641595e-06, + "loss": 0.7746, + "step": 5922 + }, + { + "epoch": 0.3259948263525786, + "grad_norm": 0.7534621357917786, + "learning_rate": 9.376319281813717e-06, + "loss": 0.8183, + "step": 5923 + }, + { + "epoch": 0.3260498651549342, + "grad_norm": 1.2406045198440552, + "learning_rate": 9.376109621094594e-06, + "loss": 0.8173, + "step": 5924 + }, + { + "epoch": 0.3261049039572899, + "grad_norm": 0.740075945854187, + "learning_rate": 9.375899927485804e-06, + "loss": 0.725, + "step": 5925 + }, + { + "epoch": 0.32615994275964555, + "grad_norm": 0.8432604074478149, + "learning_rate": 9.375690200988921e-06, + "loss": 0.7805, + "step": 5926 + }, + { + "epoch": 0.32621498156200124, + "grad_norm": 0.7652943134307861, + "learning_rate": 9.37548044160552e-06, + "loss": 0.8609, + "step": 5927 + }, + { + "epoch": 0.32627002036435687, + "grad_norm": 0.7629607915878296, + "learning_rate": 9.37527064933718e-06, + "loss": 0.8776, + "step": 5928 + }, + { + "epoch": 0.3263250591667125, + "grad_norm": 0.8648995757102966, + "learning_rate": 9.375060824185479e-06, + "loss": 0.7543, + "step": 5929 + }, + { + "epoch": 0.3263800979690682, + "grad_norm": 0.8069457411766052, + "learning_rate": 9.374850966151989e-06, + "loss": 0.7995, + "step": 5930 + }, + { + "epoch": 0.32643513677142383, + "grad_norm": 0.7948445677757263, + "learning_rate": 9.374641075238293e-06, + "loss": 0.8312, + "step": 5931 + }, + { + "epoch": 0.3264901755737795, + "grad_norm": 0.7739841341972351, + "learning_rate": 9.374431151445963e-06, + "loss": 0.8442, + "step": 5932 + }, + { + "epoch": 0.32654521437613515, + "grad_norm": 0.7382220029830933, + "learning_rate": 9.374221194776583e-06, + "loss": 0.7519, + "step": 5933 + }, + { + "epoch": 0.32660025317849084, + "grad_norm": 0.7876916527748108, + "learning_rate": 9.374011205231725e-06, + "loss": 0.817, + "step": 5934 + }, + { + "epoch": 0.3266552919808465, + "grad_norm": 0.7175565958023071, + "learning_rate": 9.373801182812969e-06, + "loss": 0.7317, + "step": 5935 + }, + { + "epoch": 0.32671033078320216, + "grad_norm": 0.7739143967628479, + "learning_rate": 9.373591127521894e-06, + "loss": 0.8134, + "step": 5936 + }, + { + "epoch": 0.3267653695855578, + "grad_norm": 0.7388991713523865, + "learning_rate": 9.373381039360082e-06, + "loss": 0.8758, + "step": 5937 + }, + { + "epoch": 0.3268204083879135, + "grad_norm": 0.7393535375595093, + "learning_rate": 9.373170918329105e-06, + "loss": 0.7453, + "step": 5938 + }, + { + "epoch": 0.3268754471902691, + "grad_norm": 0.7168294191360474, + "learning_rate": 9.372960764430547e-06, + "loss": 0.6535, + "step": 5939 + }, + { + "epoch": 0.3269304859926248, + "grad_norm": 0.7472337484359741, + "learning_rate": 9.372750577665988e-06, + "loss": 0.8065, + "step": 5940 + }, + { + "epoch": 0.32698552479498044, + "grad_norm": 0.7211272120475769, + "learning_rate": 9.372540358037005e-06, + "loss": 0.7389, + "step": 5941 + }, + { + "epoch": 0.32704056359733613, + "grad_norm": 0.8097178339958191, + "learning_rate": 9.37233010554518e-06, + "loss": 0.8034, + "step": 5942 + }, + { + "epoch": 0.32709560239969176, + "grad_norm": 0.7929103970527649, + "learning_rate": 9.372119820192091e-06, + "loss": 0.796, + "step": 5943 + }, + { + "epoch": 0.32715064120204745, + "grad_norm": 0.701171875, + "learning_rate": 9.37190950197932e-06, + "loss": 0.7092, + "step": 5944 + }, + { + "epoch": 0.3272056800044031, + "grad_norm": 0.679142951965332, + "learning_rate": 9.371699150908448e-06, + "loss": 0.6995, + "step": 5945 + }, + { + "epoch": 0.3272607188067588, + "grad_norm": 0.7757906913757324, + "learning_rate": 9.371488766981057e-06, + "loss": 0.8662, + "step": 5946 + }, + { + "epoch": 0.3273157576091144, + "grad_norm": 0.8086597323417664, + "learning_rate": 9.371278350198724e-06, + "loss": 0.7455, + "step": 5947 + }, + { + "epoch": 0.3273707964114701, + "grad_norm": 0.6443416476249695, + "learning_rate": 9.371067900563033e-06, + "loss": 0.7262, + "step": 5948 + }, + { + "epoch": 0.32742583521382573, + "grad_norm": 0.8132354021072388, + "learning_rate": 9.370857418075567e-06, + "loss": 0.7841, + "step": 5949 + }, + { + "epoch": 0.3274808740161814, + "grad_norm": 0.6811150908470154, + "learning_rate": 9.370646902737907e-06, + "loss": 0.6955, + "step": 5950 + }, + { + "epoch": 0.32753591281853706, + "grad_norm": 0.8956614136695862, + "learning_rate": 9.370436354551633e-06, + "loss": 0.8218, + "step": 5951 + }, + { + "epoch": 0.32759095162089275, + "grad_norm": 0.6807655692100525, + "learning_rate": 9.370225773518332e-06, + "loss": 0.7869, + "step": 5952 + }, + { + "epoch": 0.3276459904232484, + "grad_norm": 0.7506592869758606, + "learning_rate": 9.37001515963958e-06, + "loss": 0.7975, + "step": 5953 + }, + { + "epoch": 0.32770102922560407, + "grad_norm": 0.7488718032836914, + "learning_rate": 9.369804512916966e-06, + "loss": 0.7611, + "step": 5954 + }, + { + "epoch": 0.3277560680279597, + "grad_norm": 0.734569251537323, + "learning_rate": 9.369593833352073e-06, + "loss": 0.8532, + "step": 5955 + }, + { + "epoch": 0.3278111068303154, + "grad_norm": 0.780170738697052, + "learning_rate": 9.36938312094648e-06, + "loss": 0.7766, + "step": 5956 + }, + { + "epoch": 0.327866145632671, + "grad_norm": 0.6329935193061829, + "learning_rate": 9.369172375701774e-06, + "loss": 0.6789, + "step": 5957 + }, + { + "epoch": 0.3279211844350267, + "grad_norm": 1.0177193880081177, + "learning_rate": 9.368961597619537e-06, + "loss": 0.8362, + "step": 5958 + }, + { + "epoch": 0.32797622323738235, + "grad_norm": 0.730696439743042, + "learning_rate": 9.368750786701354e-06, + "loss": 0.7696, + "step": 5959 + }, + { + "epoch": 0.32803126203973804, + "grad_norm": 0.7946468591690063, + "learning_rate": 9.36853994294881e-06, + "loss": 0.8559, + "step": 5960 + }, + { + "epoch": 0.32808630084209367, + "grad_norm": 0.9353142976760864, + "learning_rate": 9.368329066363489e-06, + "loss": 0.9041, + "step": 5961 + }, + { + "epoch": 0.32814133964444936, + "grad_norm": 0.7256187796592712, + "learning_rate": 9.368118156946977e-06, + "loss": 0.787, + "step": 5962 + }, + { + "epoch": 0.328196378446805, + "grad_norm": 0.7454268336296082, + "learning_rate": 9.367907214700858e-06, + "loss": 0.7255, + "step": 5963 + }, + { + "epoch": 0.3282514172491607, + "grad_norm": 0.7087902426719666, + "learning_rate": 9.367696239626716e-06, + "loss": 0.7166, + "step": 5964 + }, + { + "epoch": 0.3283064560515163, + "grad_norm": 0.8217566609382629, + "learning_rate": 9.36748523172614e-06, + "loss": 0.8351, + "step": 5965 + }, + { + "epoch": 0.328361494853872, + "grad_norm": 0.7712824940681458, + "learning_rate": 9.367274191000713e-06, + "loss": 0.7561, + "step": 5966 + }, + { + "epoch": 0.32841653365622764, + "grad_norm": 0.6798166036605835, + "learning_rate": 9.367063117452024e-06, + "loss": 0.7447, + "step": 5967 + }, + { + "epoch": 0.3284715724585833, + "grad_norm": 0.7139115929603577, + "learning_rate": 9.366852011081655e-06, + "loss": 0.7728, + "step": 5968 + }, + { + "epoch": 0.32852661126093896, + "grad_norm": 1.0488213300704956, + "learning_rate": 9.366640871891196e-06, + "loss": 0.8283, + "step": 5969 + }, + { + "epoch": 0.32858165006329465, + "grad_norm": 0.7939574122428894, + "learning_rate": 9.366429699882233e-06, + "loss": 0.849, + "step": 5970 + }, + { + "epoch": 0.3286366888656503, + "grad_norm": 0.7959052324295044, + "learning_rate": 9.366218495056356e-06, + "loss": 0.7469, + "step": 5971 + }, + { + "epoch": 0.3286917276680059, + "grad_norm": 0.7293235063552856, + "learning_rate": 9.366007257415146e-06, + "loss": 0.8537, + "step": 5972 + }, + { + "epoch": 0.3287467664703616, + "grad_norm": 0.7490390539169312, + "learning_rate": 9.365795986960196e-06, + "loss": 0.8166, + "step": 5973 + }, + { + "epoch": 0.32880180527271724, + "grad_norm": 0.6572316884994507, + "learning_rate": 9.365584683693093e-06, + "loss": 0.6919, + "step": 5974 + }, + { + "epoch": 0.32885684407507293, + "grad_norm": 0.7286609411239624, + "learning_rate": 9.365373347615421e-06, + "loss": 0.768, + "step": 5975 + }, + { + "epoch": 0.32891188287742856, + "grad_norm": 0.7798202037811279, + "learning_rate": 9.365161978728772e-06, + "loss": 0.788, + "step": 5976 + }, + { + "epoch": 0.32896692167978425, + "grad_norm": 0.7224245071411133, + "learning_rate": 9.364950577034737e-06, + "loss": 0.7551, + "step": 5977 + }, + { + "epoch": 0.3290219604821399, + "grad_norm": 0.7238701581954956, + "learning_rate": 9.364739142534898e-06, + "loss": 0.6663, + "step": 5978 + }, + { + "epoch": 0.3290769992844956, + "grad_norm": 0.8947147727012634, + "learning_rate": 9.36452767523085e-06, + "loss": 0.8559, + "step": 5979 + }, + { + "epoch": 0.3291320380868512, + "grad_norm": 0.7346563935279846, + "learning_rate": 9.36431617512418e-06, + "loss": 0.7915, + "step": 5980 + }, + { + "epoch": 0.3291870768892069, + "grad_norm": 0.7674046158790588, + "learning_rate": 9.364104642216479e-06, + "loss": 0.7643, + "step": 5981 + }, + { + "epoch": 0.32924211569156253, + "grad_norm": 0.7288179397583008, + "learning_rate": 9.363893076509335e-06, + "loss": 0.7796, + "step": 5982 + }, + { + "epoch": 0.3292971544939182, + "grad_norm": 0.6603766083717346, + "learning_rate": 9.363681478004339e-06, + "loss": 0.7035, + "step": 5983 + }, + { + "epoch": 0.32935219329627385, + "grad_norm": 0.7523066997528076, + "learning_rate": 9.36346984670308e-06, + "loss": 0.8196, + "step": 5984 + }, + { + "epoch": 0.32940723209862954, + "grad_norm": 0.730312168598175, + "learning_rate": 9.36325818260715e-06, + "loss": 0.7967, + "step": 5985 + }, + { + "epoch": 0.3294622709009852, + "grad_norm": 0.7341319918632507, + "learning_rate": 9.363046485718139e-06, + "loss": 0.8361, + "step": 5986 + }, + { + "epoch": 0.32951730970334087, + "grad_norm": 0.839894711971283, + "learning_rate": 9.36283475603764e-06, + "loss": 0.862, + "step": 5987 + }, + { + "epoch": 0.3295723485056965, + "grad_norm": 0.7794893980026245, + "learning_rate": 9.362622993567243e-06, + "loss": 0.8521, + "step": 5988 + }, + { + "epoch": 0.3296273873080522, + "grad_norm": 0.929410457611084, + "learning_rate": 9.362411198308538e-06, + "loss": 0.7644, + "step": 5989 + }, + { + "epoch": 0.3296824261104078, + "grad_norm": 0.7687333226203918, + "learning_rate": 9.362199370263118e-06, + "loss": 0.8047, + "step": 5990 + }, + { + "epoch": 0.3297374649127635, + "grad_norm": 0.8040616512298584, + "learning_rate": 9.361987509432576e-06, + "loss": 0.7574, + "step": 5991 + }, + { + "epoch": 0.32979250371511915, + "grad_norm": 0.7743237614631653, + "learning_rate": 9.361775615818503e-06, + "loss": 0.8491, + "step": 5992 + }, + { + "epoch": 0.32984754251747483, + "grad_norm": 1.2796664237976074, + "learning_rate": 9.361563689422493e-06, + "loss": 0.7975, + "step": 5993 + }, + { + "epoch": 0.32990258131983047, + "grad_norm": 0.9493466019630432, + "learning_rate": 9.361351730246136e-06, + "loss": 1.0258, + "step": 5994 + }, + { + "epoch": 0.32995762012218616, + "grad_norm": 0.7148050665855408, + "learning_rate": 9.36113973829103e-06, + "loss": 0.805, + "step": 5995 + }, + { + "epoch": 0.3300126589245418, + "grad_norm": 0.723426342010498, + "learning_rate": 9.360927713558762e-06, + "loss": 0.6886, + "step": 5996 + }, + { + "epoch": 0.3300676977268975, + "grad_norm": 0.8274679183959961, + "learning_rate": 9.360715656050929e-06, + "loss": 0.8559, + "step": 5997 + }, + { + "epoch": 0.3301227365292531, + "grad_norm": 0.7493795156478882, + "learning_rate": 9.360503565769126e-06, + "loss": 0.8266, + "step": 5998 + }, + { + "epoch": 0.3301777753316088, + "grad_norm": 0.7690125703811646, + "learning_rate": 9.360291442714944e-06, + "loss": 0.783, + "step": 5999 + }, + { + "epoch": 0.33023281413396444, + "grad_norm": 0.8740219473838806, + "learning_rate": 9.360079286889981e-06, + "loss": 0.8409, + "step": 6000 + }, + { + "epoch": 0.3302878529363201, + "grad_norm": 0.6931017637252808, + "learning_rate": 9.359867098295827e-06, + "loss": 0.7985, + "step": 6001 + }, + { + "epoch": 0.33034289173867576, + "grad_norm": 0.915532112121582, + "learning_rate": 9.35965487693408e-06, + "loss": 0.8718, + "step": 6002 + }, + { + "epoch": 0.33039793054103145, + "grad_norm": 0.7898837924003601, + "learning_rate": 9.359442622806332e-06, + "loss": 0.8571, + "step": 6003 + }, + { + "epoch": 0.3304529693433871, + "grad_norm": 0.8661002516746521, + "learning_rate": 9.359230335914182e-06, + "loss": 0.7963, + "step": 6004 + }, + { + "epoch": 0.33050800814574277, + "grad_norm": 0.7188493013381958, + "learning_rate": 9.359018016259223e-06, + "loss": 0.8188, + "step": 6005 + }, + { + "epoch": 0.3305630469480984, + "grad_norm": 0.8648282289505005, + "learning_rate": 9.358805663843051e-06, + "loss": 0.9136, + "step": 6006 + }, + { + "epoch": 0.3306180857504541, + "grad_norm": 0.8010255098342896, + "learning_rate": 9.358593278667265e-06, + "loss": 0.849, + "step": 6007 + }, + { + "epoch": 0.3306731245528097, + "grad_norm": 0.8128451108932495, + "learning_rate": 9.358380860733456e-06, + "loss": 0.8082, + "step": 6008 + }, + { + "epoch": 0.3307281633551654, + "grad_norm": 1.0003761053085327, + "learning_rate": 9.358168410043224e-06, + "loss": 0.9064, + "step": 6009 + }, + { + "epoch": 0.33078320215752105, + "grad_norm": 0.7412391901016235, + "learning_rate": 9.357955926598163e-06, + "loss": 0.8049, + "step": 6010 + }, + { + "epoch": 0.33083824095987674, + "grad_norm": 0.795615553855896, + "learning_rate": 9.357743410399875e-06, + "loss": 0.7923, + "step": 6011 + }, + { + "epoch": 0.3308932797622324, + "grad_norm": 0.8696123957633972, + "learning_rate": 9.357530861449953e-06, + "loss": 0.8543, + "step": 6012 + }, + { + "epoch": 0.33094831856458806, + "grad_norm": 0.8909900784492493, + "learning_rate": 9.357318279749994e-06, + "loss": 0.6157, + "step": 6013 + }, + { + "epoch": 0.3310033573669437, + "grad_norm": 0.7326250672340393, + "learning_rate": 9.357105665301597e-06, + "loss": 0.7647, + "step": 6014 + }, + { + "epoch": 0.33105839616929933, + "grad_norm": 0.8425576090812683, + "learning_rate": 9.356893018106364e-06, + "loss": 0.7832, + "step": 6015 + }, + { + "epoch": 0.331113434971655, + "grad_norm": 0.7404599785804749, + "learning_rate": 9.356680338165885e-06, + "loss": 0.7759, + "step": 6016 + }, + { + "epoch": 0.33116847377401065, + "grad_norm": 0.6935396790504456, + "learning_rate": 9.356467625481765e-06, + "loss": 0.7488, + "step": 6017 + }, + { + "epoch": 0.33122351257636634, + "grad_norm": 0.7799031138420105, + "learning_rate": 9.3562548800556e-06, + "loss": 0.7617, + "step": 6018 + }, + { + "epoch": 0.331278551378722, + "grad_norm": 0.7824636101722717, + "learning_rate": 9.35604210188899e-06, + "loss": 0.7936, + "step": 6019 + }, + { + "epoch": 0.33133359018107766, + "grad_norm": 0.7051861882209778, + "learning_rate": 9.355829290983531e-06, + "loss": 0.7869, + "step": 6020 + }, + { + "epoch": 0.3313886289834333, + "grad_norm": 0.8172006607055664, + "learning_rate": 9.355616447340826e-06, + "loss": 0.8888, + "step": 6021 + }, + { + "epoch": 0.331443667785789, + "grad_norm": 0.7263272404670715, + "learning_rate": 9.355403570962475e-06, + "loss": 0.8393, + "step": 6022 + }, + { + "epoch": 0.3314987065881446, + "grad_norm": 0.7143926620483398, + "learning_rate": 9.355190661850077e-06, + "loss": 0.6693, + "step": 6023 + }, + { + "epoch": 0.3315537453905003, + "grad_norm": 0.7294363975524902, + "learning_rate": 9.354977720005232e-06, + "loss": 0.8035, + "step": 6024 + }, + { + "epoch": 0.33160878419285594, + "grad_norm": 0.7072308659553528, + "learning_rate": 9.354764745429538e-06, + "loss": 0.761, + "step": 6025 + }, + { + "epoch": 0.33166382299521163, + "grad_norm": 0.6945865154266357, + "learning_rate": 9.3545517381246e-06, + "loss": 0.7212, + "step": 6026 + }, + { + "epoch": 0.33171886179756727, + "grad_norm": 0.7645060420036316, + "learning_rate": 9.354338698092016e-06, + "loss": 0.812, + "step": 6027 + }, + { + "epoch": 0.33177390059992296, + "grad_norm": 0.9494503140449524, + "learning_rate": 9.354125625333387e-06, + "loss": 0.9037, + "step": 6028 + }, + { + "epoch": 0.3318289394022786, + "grad_norm": 0.7311872243881226, + "learning_rate": 9.353912519850317e-06, + "loss": 0.7137, + "step": 6029 + }, + { + "epoch": 0.3318839782046343, + "grad_norm": 0.658562958240509, + "learning_rate": 9.353699381644405e-06, + "loss": 0.7048, + "step": 6030 + }, + { + "epoch": 0.3319390170069899, + "grad_norm": 0.8106339573860168, + "learning_rate": 9.353486210717253e-06, + "loss": 0.8905, + "step": 6031 + }, + { + "epoch": 0.3319940558093456, + "grad_norm": 0.8166239261627197, + "learning_rate": 9.353273007070465e-06, + "loss": 0.7011, + "step": 6032 + }, + { + "epoch": 0.33204909461170123, + "grad_norm": 0.730172872543335, + "learning_rate": 9.353059770705643e-06, + "loss": 0.6934, + "step": 6033 + }, + { + "epoch": 0.3321041334140569, + "grad_norm": 0.7633965611457825, + "learning_rate": 9.352846501624387e-06, + "loss": 0.7379, + "step": 6034 + }, + { + "epoch": 0.33215917221641256, + "grad_norm": 0.7786447405815125, + "learning_rate": 9.352633199828304e-06, + "loss": 0.8533, + "step": 6035 + }, + { + "epoch": 0.33221421101876825, + "grad_norm": 0.7211753726005554, + "learning_rate": 9.352419865318993e-06, + "loss": 0.815, + "step": 6036 + }, + { + "epoch": 0.3322692498211239, + "grad_norm": 0.6861024498939514, + "learning_rate": 9.352206498098062e-06, + "loss": 0.7678, + "step": 6037 + }, + { + "epoch": 0.33232428862347957, + "grad_norm": 0.7702088952064514, + "learning_rate": 9.35199309816711e-06, + "loss": 0.8463, + "step": 6038 + }, + { + "epoch": 0.3323793274258352, + "grad_norm": 0.7179547548294067, + "learning_rate": 9.351779665527742e-06, + "loss": 0.8315, + "step": 6039 + }, + { + "epoch": 0.3324343662281909, + "grad_norm": 0.8686990737915039, + "learning_rate": 9.351566200181565e-06, + "loss": 0.8396, + "step": 6040 + }, + { + "epoch": 0.3324894050305465, + "grad_norm": 0.7269062995910645, + "learning_rate": 9.351352702130181e-06, + "loss": 0.7126, + "step": 6041 + }, + { + "epoch": 0.3325444438329022, + "grad_norm": 0.7759222984313965, + "learning_rate": 9.351139171375195e-06, + "loss": 0.8383, + "step": 6042 + }, + { + "epoch": 0.33259948263525785, + "grad_norm": 0.6882128119468689, + "learning_rate": 9.350925607918212e-06, + "loss": 0.6371, + "step": 6043 + }, + { + "epoch": 0.33265452143761354, + "grad_norm": 0.7552365660667419, + "learning_rate": 9.350712011760834e-06, + "loss": 0.8018, + "step": 6044 + }, + { + "epoch": 0.33270956023996917, + "grad_norm": 0.8320692181587219, + "learning_rate": 9.350498382904672e-06, + "loss": 0.8556, + "step": 6045 + }, + { + "epoch": 0.33276459904232486, + "grad_norm": 0.7542223334312439, + "learning_rate": 9.350284721351326e-06, + "loss": 0.8006, + "step": 6046 + }, + { + "epoch": 0.3328196378446805, + "grad_norm": 1.2724859714508057, + "learning_rate": 9.350071027102406e-06, + "loss": 0.9253, + "step": 6047 + }, + { + "epoch": 0.3328746766470362, + "grad_norm": 0.731383204460144, + "learning_rate": 9.349857300159517e-06, + "loss": 0.83, + "step": 6048 + }, + { + "epoch": 0.3329297154493918, + "grad_norm": 0.731419026851654, + "learning_rate": 9.349643540524265e-06, + "loss": 0.779, + "step": 6049 + }, + { + "epoch": 0.3329847542517475, + "grad_norm": 0.8462278842926025, + "learning_rate": 9.349429748198256e-06, + "loss": 0.84, + "step": 6050 + }, + { + "epoch": 0.33303979305410314, + "grad_norm": 0.8199888467788696, + "learning_rate": 9.349215923183098e-06, + "loss": 0.844, + "step": 6051 + }, + { + "epoch": 0.33309483185645883, + "grad_norm": 0.8696722984313965, + "learning_rate": 9.349002065480397e-06, + "loss": 0.709, + "step": 6052 + }, + { + "epoch": 0.33314987065881446, + "grad_norm": 0.8484870195388794, + "learning_rate": 9.34878817509176e-06, + "loss": 0.7434, + "step": 6053 + }, + { + "epoch": 0.33320490946117015, + "grad_norm": 0.8392589688301086, + "learning_rate": 9.348574252018796e-06, + "loss": 0.8972, + "step": 6054 + }, + { + "epoch": 0.3332599482635258, + "grad_norm": 0.673829972743988, + "learning_rate": 9.34836029626311e-06, + "loss": 0.6789, + "step": 6055 + }, + { + "epoch": 0.3333149870658815, + "grad_norm": 0.6693649888038635, + "learning_rate": 9.348146307826315e-06, + "loss": 0.68, + "step": 6056 + }, + { + "epoch": 0.3333700258682371, + "grad_norm": 0.8516272306442261, + "learning_rate": 9.347932286710014e-06, + "loss": 0.8585, + "step": 6057 + }, + { + "epoch": 0.33342506467059274, + "grad_norm": 0.7431588768959045, + "learning_rate": 9.347718232915818e-06, + "loss": 0.8239, + "step": 6058 + }, + { + "epoch": 0.33348010347294843, + "grad_norm": 0.8823427557945251, + "learning_rate": 9.347504146445336e-06, + "loss": 0.845, + "step": 6059 + }, + { + "epoch": 0.33353514227530406, + "grad_norm": 0.7884035110473633, + "learning_rate": 9.347290027300177e-06, + "loss": 0.8503, + "step": 6060 + }, + { + "epoch": 0.33359018107765975, + "grad_norm": 0.841397225856781, + "learning_rate": 9.34707587548195e-06, + "loss": 0.7551, + "step": 6061 + }, + { + "epoch": 0.3336452198800154, + "grad_norm": 0.7592034935951233, + "learning_rate": 9.346861690992263e-06, + "loss": 0.8516, + "step": 6062 + }, + { + "epoch": 0.3337002586823711, + "grad_norm": 0.6925262212753296, + "learning_rate": 9.346647473832728e-06, + "loss": 0.7351, + "step": 6063 + }, + { + "epoch": 0.3337552974847267, + "grad_norm": 0.8152759075164795, + "learning_rate": 9.346433224004955e-06, + "loss": 0.7673, + "step": 6064 + }, + { + "epoch": 0.3338103362870824, + "grad_norm": 0.7383455038070679, + "learning_rate": 9.346218941510551e-06, + "loss": 0.7312, + "step": 6065 + }, + { + "epoch": 0.33386537508943803, + "grad_norm": 0.7905310392379761, + "learning_rate": 9.346004626351131e-06, + "loss": 0.7891, + "step": 6066 + }, + { + "epoch": 0.3339204138917937, + "grad_norm": 0.7032167315483093, + "learning_rate": 9.345790278528305e-06, + "loss": 0.8358, + "step": 6067 + }, + { + "epoch": 0.33397545269414936, + "grad_norm": 0.6415952444076538, + "learning_rate": 9.34557589804368e-06, + "loss": 0.6716, + "step": 6068 + }, + { + "epoch": 0.33403049149650504, + "grad_norm": 0.7558899521827698, + "learning_rate": 9.34536148489887e-06, + "loss": 0.781, + "step": 6069 + }, + { + "epoch": 0.3340855302988607, + "grad_norm": 0.8913301825523376, + "learning_rate": 9.345147039095485e-06, + "loss": 0.8482, + "step": 6070 + }, + { + "epoch": 0.33414056910121637, + "grad_norm": 0.768984854221344, + "learning_rate": 9.34493256063514e-06, + "loss": 0.7578, + "step": 6071 + }, + { + "epoch": 0.334195607903572, + "grad_norm": 0.7428637742996216, + "learning_rate": 9.344718049519445e-06, + "loss": 0.7812, + "step": 6072 + }, + { + "epoch": 0.3342506467059277, + "grad_norm": 0.7290430665016174, + "learning_rate": 9.344503505750012e-06, + "loss": 0.7536, + "step": 6073 + }, + { + "epoch": 0.3343056855082833, + "grad_norm": 0.7637680172920227, + "learning_rate": 9.344288929328453e-06, + "loss": 0.8576, + "step": 6074 + }, + { + "epoch": 0.334360724310639, + "grad_norm": 0.9568214416503906, + "learning_rate": 9.344074320256379e-06, + "loss": 0.897, + "step": 6075 + }, + { + "epoch": 0.33441576311299465, + "grad_norm": 0.7516217827796936, + "learning_rate": 9.34385967853541e-06, + "loss": 0.7853, + "step": 6076 + }, + { + "epoch": 0.33447080191535034, + "grad_norm": 0.833039402961731, + "learning_rate": 9.34364500416715e-06, + "loss": 0.702, + "step": 6077 + }, + { + "epoch": 0.33452584071770597, + "grad_norm": 0.8080580830574036, + "learning_rate": 9.34343029715322e-06, + "loss": 0.7867, + "step": 6078 + }, + { + "epoch": 0.33458087952006166, + "grad_norm": 0.8039596080780029, + "learning_rate": 9.343215557495229e-06, + "loss": 0.8221, + "step": 6079 + }, + { + "epoch": 0.3346359183224173, + "grad_norm": 0.7003986835479736, + "learning_rate": 9.343000785194794e-06, + "loss": 0.746, + "step": 6080 + }, + { + "epoch": 0.334690957124773, + "grad_norm": 0.6623722314834595, + "learning_rate": 9.342785980253526e-06, + "loss": 0.6998, + "step": 6081 + }, + { + "epoch": 0.3347459959271286, + "grad_norm": 0.8425901532173157, + "learning_rate": 9.342571142673042e-06, + "loss": 0.8789, + "step": 6082 + }, + { + "epoch": 0.3348010347294843, + "grad_norm": 0.7263861894607544, + "learning_rate": 9.342356272454954e-06, + "loss": 0.7299, + "step": 6083 + }, + { + "epoch": 0.33485607353183994, + "grad_norm": 0.8420364260673523, + "learning_rate": 9.34214136960088e-06, + "loss": 0.8073, + "step": 6084 + }, + { + "epoch": 0.3349111123341956, + "grad_norm": 0.950019359588623, + "learning_rate": 9.341926434112435e-06, + "loss": 0.9288, + "step": 6085 + }, + { + "epoch": 0.33496615113655126, + "grad_norm": 0.7583657503128052, + "learning_rate": 9.341711465991231e-06, + "loss": 0.8079, + "step": 6086 + }, + { + "epoch": 0.33502118993890695, + "grad_norm": 0.7623111605644226, + "learning_rate": 9.341496465238887e-06, + "loss": 0.879, + "step": 6087 + }, + { + "epoch": 0.3350762287412626, + "grad_norm": 0.8934749960899353, + "learning_rate": 9.341281431857017e-06, + "loss": 0.9348, + "step": 6088 + }, + { + "epoch": 0.33513126754361827, + "grad_norm": 0.7363337874412537, + "learning_rate": 9.341066365847238e-06, + "loss": 0.8284, + "step": 6089 + }, + { + "epoch": 0.3351863063459739, + "grad_norm": 0.6408932209014893, + "learning_rate": 9.340851267211166e-06, + "loss": 0.6019, + "step": 6090 + }, + { + "epoch": 0.3352413451483296, + "grad_norm": 0.8491614460945129, + "learning_rate": 9.34063613595042e-06, + "loss": 0.7287, + "step": 6091 + }, + { + "epoch": 0.33529638395068523, + "grad_norm": 0.6922628879547119, + "learning_rate": 9.340420972066612e-06, + "loss": 0.6649, + "step": 6092 + }, + { + "epoch": 0.3353514227530409, + "grad_norm": 0.7304210662841797, + "learning_rate": 9.340205775561364e-06, + "loss": 0.7373, + "step": 6093 + }, + { + "epoch": 0.33540646155539655, + "grad_norm": 0.8924282193183899, + "learning_rate": 9.339990546436289e-06, + "loss": 0.8337, + "step": 6094 + }, + { + "epoch": 0.33546150035775224, + "grad_norm": 0.7671791315078735, + "learning_rate": 9.339775284693008e-06, + "loss": 0.856, + "step": 6095 + }, + { + "epoch": 0.3355165391601079, + "grad_norm": 0.830427348613739, + "learning_rate": 9.339559990333138e-06, + "loss": 0.7204, + "step": 6096 + }, + { + "epoch": 0.33557157796246356, + "grad_norm": 0.7064357399940491, + "learning_rate": 9.339344663358297e-06, + "loss": 0.8533, + "step": 6097 + }, + { + "epoch": 0.3356266167648192, + "grad_norm": 0.7828566431999207, + "learning_rate": 9.3391293037701e-06, + "loss": 0.7203, + "step": 6098 + }, + { + "epoch": 0.3356816555671749, + "grad_norm": 0.7686871886253357, + "learning_rate": 9.338913911570172e-06, + "loss": 0.7813, + "step": 6099 + }, + { + "epoch": 0.3357366943695305, + "grad_norm": 0.7536553740501404, + "learning_rate": 9.338698486760126e-06, + "loss": 0.7581, + "step": 6100 + }, + { + "epoch": 0.33579173317188615, + "grad_norm": 0.7240094542503357, + "learning_rate": 9.338483029341586e-06, + "loss": 0.7513, + "step": 6101 + }, + { + "epoch": 0.33584677197424184, + "grad_norm": 0.7519696354866028, + "learning_rate": 9.338267539316169e-06, + "loss": 0.8139, + "step": 6102 + }, + { + "epoch": 0.3359018107765975, + "grad_norm": 0.7267377376556396, + "learning_rate": 9.338052016685492e-06, + "loss": 0.7807, + "step": 6103 + }, + { + "epoch": 0.33595684957895317, + "grad_norm": 0.6925491094589233, + "learning_rate": 9.33783646145118e-06, + "loss": 0.8124, + "step": 6104 + }, + { + "epoch": 0.3360118883813088, + "grad_norm": 0.6896460652351379, + "learning_rate": 9.337620873614848e-06, + "loss": 0.7459, + "step": 6105 + }, + { + "epoch": 0.3360669271836645, + "grad_norm": 0.8631082773208618, + "learning_rate": 9.337405253178121e-06, + "loss": 0.7662, + "step": 6106 + }, + { + "epoch": 0.3361219659860201, + "grad_norm": 0.76750248670578, + "learning_rate": 9.337189600142614e-06, + "loss": 0.9016, + "step": 6107 + }, + { + "epoch": 0.3361770047883758, + "grad_norm": 0.9230479001998901, + "learning_rate": 9.336973914509952e-06, + "loss": 0.7631, + "step": 6108 + }, + { + "epoch": 0.33623204359073144, + "grad_norm": 0.746776282787323, + "learning_rate": 9.336758196281756e-06, + "loss": 0.6934, + "step": 6109 + }, + { + "epoch": 0.33628708239308713, + "grad_norm": 0.7631211280822754, + "learning_rate": 9.336542445459646e-06, + "loss": 0.7957, + "step": 6110 + }, + { + "epoch": 0.33634212119544277, + "grad_norm": 0.7460417151451111, + "learning_rate": 9.336326662045243e-06, + "loss": 0.7979, + "step": 6111 + }, + { + "epoch": 0.33639715999779846, + "grad_norm": 0.7072319388389587, + "learning_rate": 9.336110846040171e-06, + "loss": 0.763, + "step": 6112 + }, + { + "epoch": 0.3364521988001541, + "grad_norm": 0.822266697883606, + "learning_rate": 9.33589499744605e-06, + "loss": 0.7719, + "step": 6113 + }, + { + "epoch": 0.3365072376025098, + "grad_norm": 0.778685986995697, + "learning_rate": 9.335679116264502e-06, + "loss": 0.896, + "step": 6114 + }, + { + "epoch": 0.3365622764048654, + "grad_norm": 0.9335552453994751, + "learning_rate": 9.33546320249715e-06, + "loss": 0.7317, + "step": 6115 + }, + { + "epoch": 0.3366173152072211, + "grad_norm": 0.755109965801239, + "learning_rate": 9.33524725614562e-06, + "loss": 0.8184, + "step": 6116 + }, + { + "epoch": 0.33667235400957674, + "grad_norm": 0.7963696122169495, + "learning_rate": 9.33503127721153e-06, + "loss": 0.7835, + "step": 6117 + }, + { + "epoch": 0.3367273928119324, + "grad_norm": 0.8298614621162415, + "learning_rate": 9.334815265696506e-06, + "loss": 0.7946, + "step": 6118 + }, + { + "epoch": 0.33678243161428806, + "grad_norm": 0.728638768196106, + "learning_rate": 9.33459922160217e-06, + "loss": 0.801, + "step": 6119 + }, + { + "epoch": 0.33683747041664375, + "grad_norm": 0.7275198698043823, + "learning_rate": 9.334383144930146e-06, + "loss": 0.7721, + "step": 6120 + }, + { + "epoch": 0.3368925092189994, + "grad_norm": 0.7146986722946167, + "learning_rate": 9.33416703568206e-06, + "loss": 0.7573, + "step": 6121 + }, + { + "epoch": 0.33694754802135507, + "grad_norm": 0.7875215411186218, + "learning_rate": 9.333950893859533e-06, + "loss": 0.8223, + "step": 6122 + }, + { + "epoch": 0.3370025868237107, + "grad_norm": 0.7636967301368713, + "learning_rate": 9.333734719464193e-06, + "loss": 0.7596, + "step": 6123 + }, + { + "epoch": 0.3370576256260664, + "grad_norm": 0.8068925142288208, + "learning_rate": 9.333518512497663e-06, + "loss": 0.834, + "step": 6124 + }, + { + "epoch": 0.337112664428422, + "grad_norm": 0.7153680920600891, + "learning_rate": 9.333302272961566e-06, + "loss": 0.703, + "step": 6125 + }, + { + "epoch": 0.3371677032307777, + "grad_norm": 0.7429617047309875, + "learning_rate": 9.33308600085753e-06, + "loss": 0.7327, + "step": 6126 + }, + { + "epoch": 0.33722274203313335, + "grad_norm": 0.6937283873558044, + "learning_rate": 9.33286969618718e-06, + "loss": 0.6494, + "step": 6127 + }, + { + "epoch": 0.33727778083548904, + "grad_norm": 0.7775923609733582, + "learning_rate": 9.33265335895214e-06, + "loss": 0.8668, + "step": 6128 + }, + { + "epoch": 0.33733281963784467, + "grad_norm": 0.6911064386367798, + "learning_rate": 9.33243698915404e-06, + "loss": 0.6462, + "step": 6129 + }, + { + "epoch": 0.33738785844020036, + "grad_norm": 0.8951280117034912, + "learning_rate": 9.3322205867945e-06, + "loss": 0.825, + "step": 6130 + }, + { + "epoch": 0.337442897242556, + "grad_norm": 0.9521064758300781, + "learning_rate": 9.332004151875151e-06, + "loss": 0.641, + "step": 6131 + }, + { + "epoch": 0.3374979360449117, + "grad_norm": 0.7036865949630737, + "learning_rate": 9.33178768439762e-06, + "loss": 0.804, + "step": 6132 + }, + { + "epoch": 0.3375529748472673, + "grad_norm": 1.0232574939727783, + "learning_rate": 9.331571184363529e-06, + "loss": 0.8577, + "step": 6133 + }, + { + "epoch": 0.337608013649623, + "grad_norm": 0.9680090546607971, + "learning_rate": 9.33135465177451e-06, + "loss": 0.7725, + "step": 6134 + }, + { + "epoch": 0.33766305245197864, + "grad_norm": 0.7664901614189148, + "learning_rate": 9.33113808663219e-06, + "loss": 0.8406, + "step": 6135 + }, + { + "epoch": 0.33771809125433433, + "grad_norm": 0.6703250408172607, + "learning_rate": 9.330921488938193e-06, + "loss": 0.7311, + "step": 6136 + }, + { + "epoch": 0.33777313005668996, + "grad_norm": 0.7364899516105652, + "learning_rate": 9.330704858694151e-06, + "loss": 0.8571, + "step": 6137 + }, + { + "epoch": 0.33782816885904565, + "grad_norm": 0.7167731523513794, + "learning_rate": 9.33048819590169e-06, + "loss": 0.7597, + "step": 6138 + }, + { + "epoch": 0.3378832076614013, + "grad_norm": 0.7761037945747375, + "learning_rate": 9.33027150056244e-06, + "loss": 0.8112, + "step": 6139 + }, + { + "epoch": 0.337938246463757, + "grad_norm": 0.8143900632858276, + "learning_rate": 9.330054772678028e-06, + "loss": 0.8213, + "step": 6140 + }, + { + "epoch": 0.3379932852661126, + "grad_norm": 0.7181026339530945, + "learning_rate": 9.329838012250083e-06, + "loss": 0.8228, + "step": 6141 + }, + { + "epoch": 0.3380483240684683, + "grad_norm": 0.7229815721511841, + "learning_rate": 9.329621219280235e-06, + "loss": 0.8205, + "step": 6142 + }, + { + "epoch": 0.33810336287082393, + "grad_norm": 0.7120887637138367, + "learning_rate": 9.329404393770113e-06, + "loss": 0.8012, + "step": 6143 + }, + { + "epoch": 0.33815840167317956, + "grad_norm": 0.7859634757041931, + "learning_rate": 9.329187535721346e-06, + "loss": 0.7583, + "step": 6144 + }, + { + "epoch": 0.33821344047553525, + "grad_norm": 0.7630401253700256, + "learning_rate": 9.328970645135564e-06, + "loss": 0.9087, + "step": 6145 + }, + { + "epoch": 0.3382684792778909, + "grad_norm": 0.7028466463088989, + "learning_rate": 9.328753722014399e-06, + "loss": 0.7253, + "step": 6146 + }, + { + "epoch": 0.3383235180802466, + "grad_norm": 0.8910240530967712, + "learning_rate": 9.328536766359477e-06, + "loss": 0.9048, + "step": 6147 + }, + { + "epoch": 0.3383785568826022, + "grad_norm": 0.6695914268493652, + "learning_rate": 9.328319778172435e-06, + "loss": 0.6817, + "step": 6148 + }, + { + "epoch": 0.3384335956849579, + "grad_norm": 0.9667700529098511, + "learning_rate": 9.328102757454898e-06, + "loss": 0.7721, + "step": 6149 + }, + { + "epoch": 0.33848863448731353, + "grad_norm": 0.7267603874206543, + "learning_rate": 9.3278857042085e-06, + "loss": 0.7263, + "step": 6150 + }, + { + "epoch": 0.3385436732896692, + "grad_norm": 0.7603437900543213, + "learning_rate": 9.32766861843487e-06, + "loss": 0.7856, + "step": 6151 + }, + { + "epoch": 0.33859871209202486, + "grad_norm": 0.7355918288230896, + "learning_rate": 9.327451500135641e-06, + "loss": 0.7687, + "step": 6152 + }, + { + "epoch": 0.33865375089438055, + "grad_norm": 0.712210476398468, + "learning_rate": 9.327234349312446e-06, + "loss": 0.7689, + "step": 6153 + }, + { + "epoch": 0.3387087896967362, + "grad_norm": 0.9011964797973633, + "learning_rate": 9.327017165966916e-06, + "loss": 0.888, + "step": 6154 + }, + { + "epoch": 0.33876382849909187, + "grad_norm": 0.7334766387939453, + "learning_rate": 9.326799950100683e-06, + "loss": 0.7577, + "step": 6155 + }, + { + "epoch": 0.3388188673014475, + "grad_norm": 0.711370587348938, + "learning_rate": 9.32658270171538e-06, + "loss": 0.7653, + "step": 6156 + }, + { + "epoch": 0.3388739061038032, + "grad_norm": 0.8465714454650879, + "learning_rate": 9.32636542081264e-06, + "loss": 0.7252, + "step": 6157 + }, + { + "epoch": 0.3389289449061588, + "grad_norm": 0.8105099201202393, + "learning_rate": 9.326148107394094e-06, + "loss": 0.7886, + "step": 6158 + }, + { + "epoch": 0.3389839837085145, + "grad_norm": 0.8082063794136047, + "learning_rate": 9.32593076146138e-06, + "loss": 0.8968, + "step": 6159 + }, + { + "epoch": 0.33903902251087015, + "grad_norm": 0.7451661229133606, + "learning_rate": 9.325713383016125e-06, + "loss": 0.762, + "step": 6160 + }, + { + "epoch": 0.33909406131322584, + "grad_norm": 0.8174484372138977, + "learning_rate": 9.325495972059968e-06, + "loss": 0.8285, + "step": 6161 + }, + { + "epoch": 0.33914910011558147, + "grad_norm": 0.7690935134887695, + "learning_rate": 9.32527852859454e-06, + "loss": 0.8908, + "step": 6162 + }, + { + "epoch": 0.33920413891793716, + "grad_norm": 0.7730095386505127, + "learning_rate": 9.325061052621476e-06, + "loss": 0.8571, + "step": 6163 + }, + { + "epoch": 0.3392591777202928, + "grad_norm": 0.7750043869018555, + "learning_rate": 9.324843544142412e-06, + "loss": 0.8314, + "step": 6164 + }, + { + "epoch": 0.3393142165226485, + "grad_norm": 0.8184822797775269, + "learning_rate": 9.32462600315898e-06, + "loss": 0.8783, + "step": 6165 + }, + { + "epoch": 0.3393692553250041, + "grad_norm": 0.8553629517555237, + "learning_rate": 9.32440842967282e-06, + "loss": 0.7116, + "step": 6166 + }, + { + "epoch": 0.3394242941273598, + "grad_norm": 0.8072115778923035, + "learning_rate": 9.324190823685562e-06, + "loss": 0.7498, + "step": 6167 + }, + { + "epoch": 0.33947933292971544, + "grad_norm": 0.7787594795227051, + "learning_rate": 9.323973185198843e-06, + "loss": 0.7567, + "step": 6168 + }, + { + "epoch": 0.3395343717320711, + "grad_norm": 0.7571421265602112, + "learning_rate": 9.323755514214299e-06, + "loss": 0.8349, + "step": 6169 + }, + { + "epoch": 0.33958941053442676, + "grad_norm": 0.6768494248390198, + "learning_rate": 9.323537810733565e-06, + "loss": 0.7382, + "step": 6170 + }, + { + "epoch": 0.33964444933678245, + "grad_norm": 0.7091678380966187, + "learning_rate": 9.32332007475828e-06, + "loss": 0.8107, + "step": 6171 + }, + { + "epoch": 0.3396994881391381, + "grad_norm": 0.6896559596061707, + "learning_rate": 9.323102306290078e-06, + "loss": 0.7973, + "step": 6172 + }, + { + "epoch": 0.3397545269414938, + "grad_norm": 0.7383756637573242, + "learning_rate": 9.322884505330595e-06, + "loss": 0.7998, + "step": 6173 + }, + { + "epoch": 0.3398095657438494, + "grad_norm": 0.7487883567810059, + "learning_rate": 9.32266667188147e-06, + "loss": 0.7928, + "step": 6174 + }, + { + "epoch": 0.3398646045462051, + "grad_norm": 0.7935298681259155, + "learning_rate": 9.32244880594434e-06, + "loss": 0.8457, + "step": 6175 + }, + { + "epoch": 0.33991964334856073, + "grad_norm": 0.6571856737136841, + "learning_rate": 9.322230907520841e-06, + "loss": 0.7177, + "step": 6176 + }, + { + "epoch": 0.3399746821509164, + "grad_norm": 0.7694165706634521, + "learning_rate": 9.322012976612613e-06, + "loss": 0.7124, + "step": 6177 + }, + { + "epoch": 0.34002972095327205, + "grad_norm": 0.8665503263473511, + "learning_rate": 9.32179501322129e-06, + "loss": 0.8054, + "step": 6178 + }, + { + "epoch": 0.34008475975562774, + "grad_norm": 0.6794337034225464, + "learning_rate": 9.321577017348515e-06, + "loss": 0.6468, + "step": 6179 + }, + { + "epoch": 0.3401397985579834, + "grad_norm": 0.7875672578811646, + "learning_rate": 9.32135898899592e-06, + "loss": 0.8384, + "step": 6180 + }, + { + "epoch": 0.34019483736033906, + "grad_norm": 0.8050880432128906, + "learning_rate": 9.321140928165152e-06, + "loss": 0.7261, + "step": 6181 + }, + { + "epoch": 0.3402498761626947, + "grad_norm": 0.7489742040634155, + "learning_rate": 9.320922834857844e-06, + "loss": 0.8252, + "step": 6182 + }, + { + "epoch": 0.3403049149650504, + "grad_norm": 0.7785589098930359, + "learning_rate": 9.320704709075637e-06, + "loss": 0.7123, + "step": 6183 + }, + { + "epoch": 0.340359953767406, + "grad_norm": 0.7698208689689636, + "learning_rate": 9.320486550820169e-06, + "loss": 0.704, + "step": 6184 + }, + { + "epoch": 0.3404149925697617, + "grad_norm": 0.78490149974823, + "learning_rate": 9.320268360093081e-06, + "loss": 0.8446, + "step": 6185 + }, + { + "epoch": 0.34047003137211734, + "grad_norm": 0.6684672236442566, + "learning_rate": 9.320050136896012e-06, + "loss": 0.6728, + "step": 6186 + }, + { + "epoch": 0.340525070174473, + "grad_norm": 0.818122386932373, + "learning_rate": 9.319831881230603e-06, + "loss": 0.7744, + "step": 6187 + }, + { + "epoch": 0.34058010897682867, + "grad_norm": 0.83867347240448, + "learning_rate": 9.319613593098494e-06, + "loss": 0.7423, + "step": 6188 + }, + { + "epoch": 0.3406351477791843, + "grad_norm": 0.7800338268280029, + "learning_rate": 9.319395272501326e-06, + "loss": 0.8189, + "step": 6189 + }, + { + "epoch": 0.34069018658154, + "grad_norm": 0.7530137300491333, + "learning_rate": 9.319176919440737e-06, + "loss": 0.7978, + "step": 6190 + }, + { + "epoch": 0.3407452253838956, + "grad_norm": 0.8916274309158325, + "learning_rate": 9.318958533918374e-06, + "loss": 0.8828, + "step": 6191 + }, + { + "epoch": 0.3408002641862513, + "grad_norm": 0.76950603723526, + "learning_rate": 9.318740115935873e-06, + "loss": 0.7691, + "step": 6192 + }, + { + "epoch": 0.34085530298860695, + "grad_norm": 0.8348222970962524, + "learning_rate": 9.318521665494877e-06, + "loss": 0.8022, + "step": 6193 + }, + { + "epoch": 0.34091034179096263, + "grad_norm": 0.6879388689994812, + "learning_rate": 9.318303182597029e-06, + "loss": 0.747, + "step": 6194 + }, + { + "epoch": 0.34096538059331827, + "grad_norm": 0.8032572269439697, + "learning_rate": 9.31808466724397e-06, + "loss": 0.7621, + "step": 6195 + }, + { + "epoch": 0.34102041939567396, + "grad_norm": 0.6842368841171265, + "learning_rate": 9.317866119437342e-06, + "loss": 0.6867, + "step": 6196 + }, + { + "epoch": 0.3410754581980296, + "grad_norm": 0.7797672152519226, + "learning_rate": 9.317647539178788e-06, + "loss": 0.8329, + "step": 6197 + }, + { + "epoch": 0.3411304970003853, + "grad_norm": 0.6865420341491699, + "learning_rate": 9.317428926469952e-06, + "loss": 0.7544, + "step": 6198 + }, + { + "epoch": 0.3411855358027409, + "grad_norm": 0.818217396736145, + "learning_rate": 9.317210281312475e-06, + "loss": 0.8853, + "step": 6199 + }, + { + "epoch": 0.3412405746050966, + "grad_norm": 0.7531415224075317, + "learning_rate": 9.316991603708001e-06, + "loss": 0.8225, + "step": 6200 + }, + { + "epoch": 0.34129561340745224, + "grad_norm": 0.7347036600112915, + "learning_rate": 9.316772893658173e-06, + "loss": 0.7817, + "step": 6201 + }, + { + "epoch": 0.3413506522098079, + "grad_norm": 0.7162033915519714, + "learning_rate": 9.316554151164636e-06, + "loss": 0.7836, + "step": 6202 + }, + { + "epoch": 0.34140569101216356, + "grad_norm": 0.7421988248825073, + "learning_rate": 9.316335376229035e-06, + "loss": 0.7782, + "step": 6203 + }, + { + "epoch": 0.34146072981451925, + "grad_norm": 0.7672573328018188, + "learning_rate": 9.31611656885301e-06, + "loss": 0.8585, + "step": 6204 + }, + { + "epoch": 0.3415157686168749, + "grad_norm": 0.6898330450057983, + "learning_rate": 9.31589772903821e-06, + "loss": 0.7719, + "step": 6205 + }, + { + "epoch": 0.34157080741923057, + "grad_norm": 0.7700635194778442, + "learning_rate": 9.315678856786279e-06, + "loss": 0.7345, + "step": 6206 + }, + { + "epoch": 0.3416258462215862, + "grad_norm": 0.6982038617134094, + "learning_rate": 9.315459952098858e-06, + "loss": 0.8332, + "step": 6207 + }, + { + "epoch": 0.3416808850239419, + "grad_norm": 0.8882858753204346, + "learning_rate": 9.315241014977598e-06, + "loss": 0.9029, + "step": 6208 + }, + { + "epoch": 0.3417359238262975, + "grad_norm": 0.7313854098320007, + "learning_rate": 9.31502204542414e-06, + "loss": 0.8061, + "step": 6209 + }, + { + "epoch": 0.3417909626286532, + "grad_norm": 0.7324157953262329, + "learning_rate": 9.314803043440131e-06, + "loss": 0.7889, + "step": 6210 + }, + { + "epoch": 0.34184600143100885, + "grad_norm": 0.7498225569725037, + "learning_rate": 9.314584009027218e-06, + "loss": 0.7937, + "step": 6211 + }, + { + "epoch": 0.34190104023336454, + "grad_norm": 0.7093212008476257, + "learning_rate": 9.314364942187048e-06, + "loss": 0.8404, + "step": 6212 + }, + { + "epoch": 0.3419560790357202, + "grad_norm": 0.7008668780326843, + "learning_rate": 9.314145842921264e-06, + "loss": 0.8175, + "step": 6213 + }, + { + "epoch": 0.34201111783807586, + "grad_norm": 0.8049909472465515, + "learning_rate": 9.313926711231516e-06, + "loss": 0.78, + "step": 6214 + }, + { + "epoch": 0.3420661566404315, + "grad_norm": 0.7777613997459412, + "learning_rate": 9.313707547119448e-06, + "loss": 0.9566, + "step": 6215 + }, + { + "epoch": 0.3421211954427872, + "grad_norm": 0.7787579894065857, + "learning_rate": 9.31348835058671e-06, + "loss": 0.7698, + "step": 6216 + }, + { + "epoch": 0.3421762342451428, + "grad_norm": 0.7779031991958618, + "learning_rate": 9.313269121634947e-06, + "loss": 0.8853, + "step": 6217 + }, + { + "epoch": 0.3422312730474985, + "grad_norm": 0.7194382548332214, + "learning_rate": 9.313049860265809e-06, + "loss": 0.8399, + "step": 6218 + }, + { + "epoch": 0.34228631184985414, + "grad_norm": 0.6513093709945679, + "learning_rate": 9.312830566480943e-06, + "loss": 0.7156, + "step": 6219 + }, + { + "epoch": 0.34234135065220983, + "grad_norm": 0.935325026512146, + "learning_rate": 9.312611240281996e-06, + "loss": 0.7525, + "step": 6220 + }, + { + "epoch": 0.34239638945456546, + "grad_norm": 0.7539558410644531, + "learning_rate": 9.312391881670618e-06, + "loss": 0.7716, + "step": 6221 + }, + { + "epoch": 0.34245142825692115, + "grad_norm": 0.7239616513252258, + "learning_rate": 9.312172490648457e-06, + "loss": 0.7272, + "step": 6222 + }, + { + "epoch": 0.3425064670592768, + "grad_norm": 0.7742316126823425, + "learning_rate": 9.311953067217162e-06, + "loss": 0.7657, + "step": 6223 + }, + { + "epoch": 0.3425615058616325, + "grad_norm": 0.782691240310669, + "learning_rate": 9.311733611378379e-06, + "loss": 0.813, + "step": 6224 + }, + { + "epoch": 0.3426165446639881, + "grad_norm": 0.7448118329048157, + "learning_rate": 9.311514123133765e-06, + "loss": 0.8298, + "step": 6225 + }, + { + "epoch": 0.3426715834663438, + "grad_norm": 0.8201695680618286, + "learning_rate": 9.311294602484961e-06, + "loss": 0.7738, + "step": 6226 + }, + { + "epoch": 0.34272662226869943, + "grad_norm": 0.6928383111953735, + "learning_rate": 9.311075049433625e-06, + "loss": 0.6829, + "step": 6227 + }, + { + "epoch": 0.3427816610710551, + "grad_norm": 0.7509302496910095, + "learning_rate": 9.310855463981399e-06, + "loss": 0.6265, + "step": 6228 + }, + { + "epoch": 0.34283669987341076, + "grad_norm": 0.7012569308280945, + "learning_rate": 9.310635846129938e-06, + "loss": 0.7478, + "step": 6229 + }, + { + "epoch": 0.3428917386757664, + "grad_norm": 0.7428532242774963, + "learning_rate": 9.310416195880894e-06, + "loss": 0.7434, + "step": 6230 + }, + { + "epoch": 0.3429467774781221, + "grad_norm": 0.9089111685752869, + "learning_rate": 9.310196513235915e-06, + "loss": 0.6991, + "step": 6231 + }, + { + "epoch": 0.3430018162804777, + "grad_norm": 0.7633285522460938, + "learning_rate": 9.309976798196651e-06, + "loss": 0.7789, + "step": 6232 + }, + { + "epoch": 0.3430568550828334, + "grad_norm": 0.7035595178604126, + "learning_rate": 9.309757050764756e-06, + "loss": 0.6784, + "step": 6233 + }, + { + "epoch": 0.34311189388518903, + "grad_norm": 0.8782615661621094, + "learning_rate": 9.309537270941881e-06, + "loss": 0.8861, + "step": 6234 + }, + { + "epoch": 0.3431669326875447, + "grad_norm": 0.7690381407737732, + "learning_rate": 9.309317458729677e-06, + "loss": 0.7701, + "step": 6235 + }, + { + "epoch": 0.34322197148990036, + "grad_norm": 0.7730939388275146, + "learning_rate": 9.309097614129797e-06, + "loss": 0.8004, + "step": 6236 + }, + { + "epoch": 0.34327701029225605, + "grad_norm": 0.9295101761817932, + "learning_rate": 9.308877737143894e-06, + "loss": 0.6964, + "step": 6237 + }, + { + "epoch": 0.3433320490946117, + "grad_norm": 0.7496231198310852, + "learning_rate": 9.308657827773617e-06, + "loss": 0.8107, + "step": 6238 + }, + { + "epoch": 0.34338708789696737, + "grad_norm": 0.7656146287918091, + "learning_rate": 9.308437886020622e-06, + "loss": 0.8016, + "step": 6239 + }, + { + "epoch": 0.343442126699323, + "grad_norm": 0.8925992846488953, + "learning_rate": 9.308217911886562e-06, + "loss": 0.7136, + "step": 6240 + }, + { + "epoch": 0.3434971655016787, + "grad_norm": 0.7669470906257629, + "learning_rate": 9.307997905373087e-06, + "loss": 0.8284, + "step": 6241 + }, + { + "epoch": 0.3435522043040343, + "grad_norm": 0.6964572072029114, + "learning_rate": 9.307777866481855e-06, + "loss": 0.7926, + "step": 6242 + }, + { + "epoch": 0.34360724310639, + "grad_norm": 0.8405120968818665, + "learning_rate": 9.307557795214517e-06, + "loss": 0.9398, + "step": 6243 + }, + { + "epoch": 0.34366228190874565, + "grad_norm": 0.7517451643943787, + "learning_rate": 9.30733769157273e-06, + "loss": 0.8315, + "step": 6244 + }, + { + "epoch": 0.34371732071110134, + "grad_norm": 0.7740843892097473, + "learning_rate": 9.307117555558144e-06, + "loss": 0.8287, + "step": 6245 + }, + { + "epoch": 0.34377235951345697, + "grad_norm": 0.7214275598526001, + "learning_rate": 9.306897387172413e-06, + "loss": 0.7416, + "step": 6246 + }, + { + "epoch": 0.34382739831581266, + "grad_norm": 0.8217877745628357, + "learning_rate": 9.306677186417197e-06, + "loss": 0.8365, + "step": 6247 + }, + { + "epoch": 0.3438824371181683, + "grad_norm": 0.7397332191467285, + "learning_rate": 9.306456953294148e-06, + "loss": 0.7284, + "step": 6248 + }, + { + "epoch": 0.343937475920524, + "grad_norm": 0.8141350746154785, + "learning_rate": 9.30623668780492e-06, + "loss": 0.8976, + "step": 6249 + }, + { + "epoch": 0.3439925147228796, + "grad_norm": 0.7078670263290405, + "learning_rate": 9.306016389951171e-06, + "loss": 0.8167, + "step": 6250 + }, + { + "epoch": 0.3440475535252353, + "grad_norm": 0.7136256098747253, + "learning_rate": 9.305796059734553e-06, + "loss": 0.7916, + "step": 6251 + }, + { + "epoch": 0.34410259232759094, + "grad_norm": 1.6186310052871704, + "learning_rate": 9.305575697156726e-06, + "loss": 0.8148, + "step": 6252 + }, + { + "epoch": 0.34415763112994663, + "grad_norm": 0.7567281126976013, + "learning_rate": 9.305355302219346e-06, + "loss": 0.8676, + "step": 6253 + }, + { + "epoch": 0.34421266993230226, + "grad_norm": 0.9036027193069458, + "learning_rate": 9.305134874924067e-06, + "loss": 0.8111, + "step": 6254 + }, + { + "epoch": 0.34426770873465795, + "grad_norm": 0.9375718235969543, + "learning_rate": 9.304914415272547e-06, + "loss": 0.6176, + "step": 6255 + }, + { + "epoch": 0.3443227475370136, + "grad_norm": 0.7309718132019043, + "learning_rate": 9.304693923266441e-06, + "loss": 0.7313, + "step": 6256 + }, + { + "epoch": 0.3443777863393693, + "grad_norm": 0.7499229311943054, + "learning_rate": 9.30447339890741e-06, + "loss": 0.6704, + "step": 6257 + }, + { + "epoch": 0.3444328251417249, + "grad_norm": 0.7553356289863586, + "learning_rate": 9.304252842197108e-06, + "loss": 0.8671, + "step": 6258 + }, + { + "epoch": 0.3444878639440806, + "grad_norm": 0.7144323587417603, + "learning_rate": 9.304032253137194e-06, + "loss": 0.7684, + "step": 6259 + }, + { + "epoch": 0.34454290274643623, + "grad_norm": 0.7566905617713928, + "learning_rate": 9.303811631729324e-06, + "loss": 0.8381, + "step": 6260 + }, + { + "epoch": 0.3445979415487919, + "grad_norm": 0.7300242185592651, + "learning_rate": 9.30359097797516e-06, + "loss": 0.7044, + "step": 6261 + }, + { + "epoch": 0.34465298035114755, + "grad_norm": 0.6504725813865662, + "learning_rate": 9.303370291876359e-06, + "loss": 0.6693, + "step": 6262 + }, + { + "epoch": 0.34470801915350324, + "grad_norm": 0.7010672688484192, + "learning_rate": 9.303149573434576e-06, + "loss": 0.6635, + "step": 6263 + }, + { + "epoch": 0.3447630579558589, + "grad_norm": 0.8416483998298645, + "learning_rate": 9.302928822651473e-06, + "loss": 0.8408, + "step": 6264 + }, + { + "epoch": 0.34481809675821457, + "grad_norm": 0.7011786699295044, + "learning_rate": 9.302708039528712e-06, + "loss": 0.7636, + "step": 6265 + }, + { + "epoch": 0.3448731355605702, + "grad_norm": 0.7361586689949036, + "learning_rate": 9.302487224067947e-06, + "loss": 0.824, + "step": 6266 + }, + { + "epoch": 0.3449281743629259, + "grad_norm": 0.7747073173522949, + "learning_rate": 9.302266376270839e-06, + "loss": 0.8012, + "step": 6267 + }, + { + "epoch": 0.3449832131652815, + "grad_norm": 0.9407958388328552, + "learning_rate": 9.302045496139049e-06, + "loss": 0.8664, + "step": 6268 + }, + { + "epoch": 0.3450382519676372, + "grad_norm": 0.8674719929695129, + "learning_rate": 9.301824583674238e-06, + "loss": 0.8842, + "step": 6269 + }, + { + "epoch": 0.34509329076999284, + "grad_norm": 0.7697336673736572, + "learning_rate": 9.301603638878062e-06, + "loss": 0.7148, + "step": 6270 + }, + { + "epoch": 0.34514832957234853, + "grad_norm": 0.7220168709754944, + "learning_rate": 9.301382661752187e-06, + "loss": 0.7199, + "step": 6271 + }, + { + "epoch": 0.34520336837470417, + "grad_norm": 0.6745235919952393, + "learning_rate": 9.301161652298272e-06, + "loss": 0.708, + "step": 6272 + }, + { + "epoch": 0.3452584071770598, + "grad_norm": 0.7062309980392456, + "learning_rate": 9.300940610517974e-06, + "loss": 0.863, + "step": 6273 + }, + { + "epoch": 0.3453134459794155, + "grad_norm": 0.7499971985816956, + "learning_rate": 9.300719536412961e-06, + "loss": 0.7976, + "step": 6274 + }, + { + "epoch": 0.3453684847817711, + "grad_norm": 0.8304464221000671, + "learning_rate": 9.30049842998489e-06, + "loss": 0.8689, + "step": 6275 + }, + { + "epoch": 0.3454235235841268, + "grad_norm": 0.7460494041442871, + "learning_rate": 9.300277291235423e-06, + "loss": 0.7499, + "step": 6276 + }, + { + "epoch": 0.34547856238648245, + "grad_norm": 0.758788526058197, + "learning_rate": 9.300056120166225e-06, + "loss": 0.7501, + "step": 6277 + }, + { + "epoch": 0.34553360118883814, + "grad_norm": 0.7204456925392151, + "learning_rate": 9.299834916778955e-06, + "loss": 0.8234, + "step": 6278 + }, + { + "epoch": 0.34558863999119377, + "grad_norm": 0.7647501826286316, + "learning_rate": 9.299613681075277e-06, + "loss": 0.8653, + "step": 6279 + }, + { + "epoch": 0.34564367879354946, + "grad_norm": 0.7543594837188721, + "learning_rate": 9.299392413056853e-06, + "loss": 0.7915, + "step": 6280 + }, + { + "epoch": 0.3456987175959051, + "grad_norm": 0.7691700458526611, + "learning_rate": 9.299171112725347e-06, + "loss": 0.7429, + "step": 6281 + }, + { + "epoch": 0.3457537563982608, + "grad_norm": 0.7703940272331238, + "learning_rate": 9.29894978008242e-06, + "loss": 0.7424, + "step": 6282 + }, + { + "epoch": 0.3458087952006164, + "grad_norm": 0.8482547402381897, + "learning_rate": 9.29872841512974e-06, + "loss": 0.8971, + "step": 6283 + }, + { + "epoch": 0.3458638340029721, + "grad_norm": 0.755224883556366, + "learning_rate": 9.298507017868966e-06, + "loss": 0.7984, + "step": 6284 + }, + { + "epoch": 0.34591887280532774, + "grad_norm": 1.079891324043274, + "learning_rate": 9.298285588301766e-06, + "loss": 0.8301, + "step": 6285 + }, + { + "epoch": 0.3459739116076834, + "grad_norm": 0.7357321381568909, + "learning_rate": 9.2980641264298e-06, + "loss": 0.9018, + "step": 6286 + }, + { + "epoch": 0.34602895041003906, + "grad_norm": 0.7541963458061218, + "learning_rate": 9.297842632254734e-06, + "loss": 0.8716, + "step": 6287 + }, + { + "epoch": 0.34608398921239475, + "grad_norm": 1.1570138931274414, + "learning_rate": 9.297621105778235e-06, + "loss": 0.9163, + "step": 6288 + }, + { + "epoch": 0.3461390280147504, + "grad_norm": 0.7626895904541016, + "learning_rate": 9.297399547001965e-06, + "loss": 0.8162, + "step": 6289 + }, + { + "epoch": 0.34619406681710607, + "grad_norm": 0.758469820022583, + "learning_rate": 9.297177955927593e-06, + "loss": 0.8966, + "step": 6290 + }, + { + "epoch": 0.3462491056194617, + "grad_norm": 0.8998799324035645, + "learning_rate": 9.296956332556779e-06, + "loss": 0.8127, + "step": 6291 + }, + { + "epoch": 0.3463041444218174, + "grad_norm": 0.7470666170120239, + "learning_rate": 9.29673467689119e-06, + "loss": 0.7738, + "step": 6292 + }, + { + "epoch": 0.34635918322417303, + "grad_norm": 0.8066977858543396, + "learning_rate": 9.296512988932497e-06, + "loss": 0.8958, + "step": 6293 + }, + { + "epoch": 0.3464142220265287, + "grad_norm": 0.8394894003868103, + "learning_rate": 9.29629126868236e-06, + "loss": 0.8023, + "step": 6294 + }, + { + "epoch": 0.34646926082888435, + "grad_norm": 0.9053472876548767, + "learning_rate": 9.29606951614245e-06, + "loss": 0.8244, + "step": 6295 + }, + { + "epoch": 0.34652429963124004, + "grad_norm": 0.6996710896492004, + "learning_rate": 9.295847731314428e-06, + "loss": 0.8203, + "step": 6296 + }, + { + "epoch": 0.3465793384335957, + "grad_norm": 0.7236999273300171, + "learning_rate": 9.295625914199968e-06, + "loss": 0.6982, + "step": 6297 + }, + { + "epoch": 0.34663437723595136, + "grad_norm": 0.7006070017814636, + "learning_rate": 9.295404064800733e-06, + "loss": 0.7881, + "step": 6298 + }, + { + "epoch": 0.346689416038307, + "grad_norm": 0.8188902735710144, + "learning_rate": 9.29518218311839e-06, + "loss": 0.7472, + "step": 6299 + }, + { + "epoch": 0.3467444548406627, + "grad_norm": 0.7708863019943237, + "learning_rate": 9.294960269154608e-06, + "loss": 0.7572, + "step": 6300 + }, + { + "epoch": 0.3467994936430183, + "grad_norm": 0.7819802761077881, + "learning_rate": 9.294738322911052e-06, + "loss": 0.8486, + "step": 6301 + }, + { + "epoch": 0.346854532445374, + "grad_norm": 0.7160501480102539, + "learning_rate": 9.294516344389394e-06, + "loss": 0.8104, + "step": 6302 + }, + { + "epoch": 0.34690957124772964, + "grad_norm": 0.7426022887229919, + "learning_rate": 9.294294333591302e-06, + "loss": 0.7158, + "step": 6303 + }, + { + "epoch": 0.34696461005008533, + "grad_norm": 0.8397019505500793, + "learning_rate": 9.294072290518441e-06, + "loss": 0.8466, + "step": 6304 + }, + { + "epoch": 0.34701964885244097, + "grad_norm": 0.7220905423164368, + "learning_rate": 9.293850215172483e-06, + "loss": 0.7619, + "step": 6305 + }, + { + "epoch": 0.34707468765479665, + "grad_norm": 0.7401862740516663, + "learning_rate": 9.293628107555097e-06, + "loss": 0.7873, + "step": 6306 + }, + { + "epoch": 0.3471297264571523, + "grad_norm": 0.6764525175094604, + "learning_rate": 9.29340596766795e-06, + "loss": 0.7278, + "step": 6307 + }, + { + "epoch": 0.347184765259508, + "grad_norm": 0.8553194403648376, + "learning_rate": 9.293183795512715e-06, + "loss": 0.9074, + "step": 6308 + }, + { + "epoch": 0.3472398040618636, + "grad_norm": 0.6796454191207886, + "learning_rate": 9.292961591091058e-06, + "loss": 0.7179, + "step": 6309 + }, + { + "epoch": 0.3472948428642193, + "grad_norm": 0.6075254082679749, + "learning_rate": 9.292739354404652e-06, + "loss": 0.7228, + "step": 6310 + }, + { + "epoch": 0.34734988166657493, + "grad_norm": 0.7366840243339539, + "learning_rate": 9.292517085455166e-06, + "loss": 0.7934, + "step": 6311 + }, + { + "epoch": 0.3474049204689306, + "grad_norm": 0.6820569038391113, + "learning_rate": 9.29229478424427e-06, + "loss": 0.7315, + "step": 6312 + }, + { + "epoch": 0.34745995927128626, + "grad_norm": 0.8356956243515015, + "learning_rate": 9.292072450773635e-06, + "loss": 0.7787, + "step": 6313 + }, + { + "epoch": 0.34751499807364195, + "grad_norm": 0.70506352186203, + "learning_rate": 9.291850085044933e-06, + "loss": 0.7411, + "step": 6314 + }, + { + "epoch": 0.3475700368759976, + "grad_norm": 0.9074786901473999, + "learning_rate": 9.291627687059835e-06, + "loss": 0.7352, + "step": 6315 + }, + { + "epoch": 0.3476250756783532, + "grad_norm": 0.7858747839927673, + "learning_rate": 9.291405256820013e-06, + "loss": 0.7816, + "step": 6316 + }, + { + "epoch": 0.3476801144807089, + "grad_norm": 0.8576731085777283, + "learning_rate": 9.291182794327134e-06, + "loss": 0.7861, + "step": 6317 + }, + { + "epoch": 0.34773515328306454, + "grad_norm": 0.7500558495521545, + "learning_rate": 9.290960299582877e-06, + "loss": 0.8028, + "step": 6318 + }, + { + "epoch": 0.3477901920854202, + "grad_norm": 0.6577744483947754, + "learning_rate": 9.29073777258891e-06, + "loss": 0.7458, + "step": 6319 + }, + { + "epoch": 0.34784523088777586, + "grad_norm": 0.742855429649353, + "learning_rate": 9.290515213346906e-06, + "loss": 0.755, + "step": 6320 + }, + { + "epoch": 0.34790026969013155, + "grad_norm": 0.7626619338989258, + "learning_rate": 9.290292621858542e-06, + "loss": 0.6671, + "step": 6321 + }, + { + "epoch": 0.3479553084924872, + "grad_norm": 0.7139305472373962, + "learning_rate": 9.290069998125481e-06, + "loss": 0.7981, + "step": 6322 + }, + { + "epoch": 0.34801034729484287, + "grad_norm": 0.9249686002731323, + "learning_rate": 9.289847342149407e-06, + "loss": 0.7243, + "step": 6323 + }, + { + "epoch": 0.3480653860971985, + "grad_norm": 0.8090649843215942, + "learning_rate": 9.289624653931986e-06, + "loss": 0.7892, + "step": 6324 + }, + { + "epoch": 0.3481204248995542, + "grad_norm": 0.6845510005950928, + "learning_rate": 9.289401933474895e-06, + "loss": 0.7427, + "step": 6325 + }, + { + "epoch": 0.3481754637019098, + "grad_norm": 0.7620648741722107, + "learning_rate": 9.289179180779808e-06, + "loss": 0.7715, + "step": 6326 + }, + { + "epoch": 0.3482305025042655, + "grad_norm": 0.7441076040267944, + "learning_rate": 9.288956395848398e-06, + "loss": 0.7814, + "step": 6327 + }, + { + "epoch": 0.34828554130662115, + "grad_norm": 0.6777048707008362, + "learning_rate": 9.28873357868234e-06, + "loss": 0.759, + "step": 6328 + }, + { + "epoch": 0.34834058010897684, + "grad_norm": 0.6534250974655151, + "learning_rate": 9.288510729283307e-06, + "loss": 0.6777, + "step": 6329 + }, + { + "epoch": 0.34839561891133247, + "grad_norm": 0.8205152153968811, + "learning_rate": 9.288287847652977e-06, + "loss": 0.8027, + "step": 6330 + }, + { + "epoch": 0.34845065771368816, + "grad_norm": 0.7152554392814636, + "learning_rate": 9.288064933793024e-06, + "loss": 0.7956, + "step": 6331 + }, + { + "epoch": 0.3485056965160438, + "grad_norm": 0.9816664457321167, + "learning_rate": 9.287841987705121e-06, + "loss": 0.828, + "step": 6332 + }, + { + "epoch": 0.3485607353183995, + "grad_norm": 0.826554000377655, + "learning_rate": 9.287619009390945e-06, + "loss": 0.8544, + "step": 6333 + }, + { + "epoch": 0.3486157741207551, + "grad_norm": 0.7255695462226868, + "learning_rate": 9.287395998852175e-06, + "loss": 0.7749, + "step": 6334 + }, + { + "epoch": 0.3486708129231108, + "grad_norm": 0.7161709070205688, + "learning_rate": 9.287172956090482e-06, + "loss": 0.7114, + "step": 6335 + }, + { + "epoch": 0.34872585172546644, + "grad_norm": 0.7219997644424438, + "learning_rate": 9.286949881107546e-06, + "loss": 0.8309, + "step": 6336 + }, + { + "epoch": 0.34878089052782213, + "grad_norm": 0.7269770503044128, + "learning_rate": 9.286726773905042e-06, + "loss": 0.8039, + "step": 6337 + }, + { + "epoch": 0.34883592933017776, + "grad_norm": 0.8142165541648865, + "learning_rate": 9.286503634484645e-06, + "loss": 0.7673, + "step": 6338 + }, + { + "epoch": 0.34889096813253345, + "grad_norm": 0.7568639516830444, + "learning_rate": 9.286280462848037e-06, + "loss": 0.8471, + "step": 6339 + }, + { + "epoch": 0.3489460069348891, + "grad_norm": 0.7927737236022949, + "learning_rate": 9.28605725899689e-06, + "loss": 0.8828, + "step": 6340 + }, + { + "epoch": 0.3490010457372448, + "grad_norm": 0.9755893349647522, + "learning_rate": 9.285834022932885e-06, + "loss": 0.837, + "step": 6341 + }, + { + "epoch": 0.3490560845396004, + "grad_norm": 0.6831560730934143, + "learning_rate": 9.2856107546577e-06, + "loss": 0.7169, + "step": 6342 + }, + { + "epoch": 0.3491111233419561, + "grad_norm": 0.728239119052887, + "learning_rate": 9.285387454173009e-06, + "loss": 0.7805, + "step": 6343 + }, + { + "epoch": 0.34916616214431173, + "grad_norm": 0.6979145407676697, + "learning_rate": 9.285164121480495e-06, + "loss": 0.7794, + "step": 6344 + }, + { + "epoch": 0.3492212009466674, + "grad_norm": 0.7206674218177795, + "learning_rate": 9.284940756581834e-06, + "loss": 0.7198, + "step": 6345 + }, + { + "epoch": 0.34927623974902305, + "grad_norm": 0.8156035542488098, + "learning_rate": 9.284717359478705e-06, + "loss": 0.884, + "step": 6346 + }, + { + "epoch": 0.34933127855137874, + "grad_norm": 0.6876983046531677, + "learning_rate": 9.284493930172788e-06, + "loss": 0.7426, + "step": 6347 + }, + { + "epoch": 0.3493863173537344, + "grad_norm": 0.6856677532196045, + "learning_rate": 9.284270468665762e-06, + "loss": 0.7085, + "step": 6348 + }, + { + "epoch": 0.34944135615609007, + "grad_norm": 0.8378047943115234, + "learning_rate": 9.284046974959304e-06, + "loss": 0.725, + "step": 6349 + }, + { + "epoch": 0.3494963949584457, + "grad_norm": 0.7410693764686584, + "learning_rate": 9.283823449055097e-06, + "loss": 0.7953, + "step": 6350 + }, + { + "epoch": 0.3495514337608014, + "grad_norm": 0.7558375000953674, + "learning_rate": 9.28359989095482e-06, + "loss": 0.8052, + "step": 6351 + }, + { + "epoch": 0.349606472563157, + "grad_norm": 0.7176862955093384, + "learning_rate": 9.283376300660151e-06, + "loss": 0.7077, + "step": 6352 + }, + { + "epoch": 0.3496615113655127, + "grad_norm": 0.7443307042121887, + "learning_rate": 9.283152678172774e-06, + "loss": 0.7557, + "step": 6353 + }, + { + "epoch": 0.34971655016786835, + "grad_norm": 0.6653748750686646, + "learning_rate": 9.282929023494368e-06, + "loss": 0.7558, + "step": 6354 + }, + { + "epoch": 0.34977158897022403, + "grad_norm": 0.8139400482177734, + "learning_rate": 9.282705336626615e-06, + "loss": 0.847, + "step": 6355 + }, + { + "epoch": 0.34982662777257967, + "grad_norm": 1.012450933456421, + "learning_rate": 9.282481617571193e-06, + "loss": 0.744, + "step": 6356 + }, + { + "epoch": 0.34988166657493536, + "grad_norm": 0.7877402305603027, + "learning_rate": 9.282257866329784e-06, + "loss": 0.7475, + "step": 6357 + }, + { + "epoch": 0.349936705377291, + "grad_norm": 0.7989935874938965, + "learning_rate": 9.282034082904075e-06, + "loss": 0.7379, + "step": 6358 + }, + { + "epoch": 0.3499917441796466, + "grad_norm": 0.6665796637535095, + "learning_rate": 9.281810267295741e-06, + "loss": 0.7253, + "step": 6359 + }, + { + "epoch": 0.3500467829820023, + "grad_norm": 0.8344665765762329, + "learning_rate": 9.28158641950647e-06, + "loss": 0.8095, + "step": 6360 + }, + { + "epoch": 0.35010182178435795, + "grad_norm": 0.8312307596206665, + "learning_rate": 9.281362539537939e-06, + "loss": 0.8452, + "step": 6361 + }, + { + "epoch": 0.35015686058671364, + "grad_norm": 0.7423825263977051, + "learning_rate": 9.281138627391834e-06, + "loss": 0.8291, + "step": 6362 + }, + { + "epoch": 0.35021189938906927, + "grad_norm": 0.7594212293624878, + "learning_rate": 9.280914683069837e-06, + "loss": 0.8314, + "step": 6363 + }, + { + "epoch": 0.35026693819142496, + "grad_norm": 0.8059762716293335, + "learning_rate": 9.280690706573633e-06, + "loss": 0.7695, + "step": 6364 + }, + { + "epoch": 0.3503219769937806, + "grad_norm": 0.8053386807441711, + "learning_rate": 9.280466697904902e-06, + "loss": 0.8941, + "step": 6365 + }, + { + "epoch": 0.3503770157961363, + "grad_norm": 0.6703817248344421, + "learning_rate": 9.280242657065329e-06, + "loss": 0.5978, + "step": 6366 + }, + { + "epoch": 0.3504320545984919, + "grad_norm": 0.9359784722328186, + "learning_rate": 9.280018584056598e-06, + "loss": 0.8479, + "step": 6367 + }, + { + "epoch": 0.3504870934008476, + "grad_norm": 0.7692418098449707, + "learning_rate": 9.279794478880393e-06, + "loss": 0.7254, + "step": 6368 + }, + { + "epoch": 0.35054213220320324, + "grad_norm": 0.7992031574249268, + "learning_rate": 9.279570341538397e-06, + "loss": 0.6749, + "step": 6369 + }, + { + "epoch": 0.3505971710055589, + "grad_norm": 0.7735288739204407, + "learning_rate": 9.279346172032297e-06, + "loss": 0.8545, + "step": 6370 + }, + { + "epoch": 0.35065220980791456, + "grad_norm": 0.7124339938163757, + "learning_rate": 9.279121970363778e-06, + "loss": 0.8066, + "step": 6371 + }, + { + "epoch": 0.35070724861027025, + "grad_norm": 0.8116535544395447, + "learning_rate": 9.278897736534521e-06, + "loss": 0.8197, + "step": 6372 + }, + { + "epoch": 0.3507622874126259, + "grad_norm": 0.9377869963645935, + "learning_rate": 9.278673470546217e-06, + "loss": 0.74, + "step": 6373 + }, + { + "epoch": 0.3508173262149816, + "grad_norm": 0.6726253628730774, + "learning_rate": 9.278449172400548e-06, + "loss": 0.6389, + "step": 6374 + }, + { + "epoch": 0.3508723650173372, + "grad_norm": 0.8470593094825745, + "learning_rate": 9.278224842099198e-06, + "loss": 0.8059, + "step": 6375 + }, + { + "epoch": 0.3509274038196929, + "grad_norm": 0.7041867971420288, + "learning_rate": 9.278000479643857e-06, + "loss": 0.7409, + "step": 6376 + }, + { + "epoch": 0.35098244262204853, + "grad_norm": 0.7467322945594788, + "learning_rate": 9.27777608503621e-06, + "loss": 0.823, + "step": 6377 + }, + { + "epoch": 0.3510374814244042, + "grad_norm": 0.7211065888404846, + "learning_rate": 9.277551658277942e-06, + "loss": 0.7655, + "step": 6378 + }, + { + "epoch": 0.35109252022675985, + "grad_norm": 0.7709450125694275, + "learning_rate": 9.27732719937074e-06, + "loss": 0.8938, + "step": 6379 + }, + { + "epoch": 0.35114755902911554, + "grad_norm": 0.7672929167747498, + "learning_rate": 9.277102708316293e-06, + "loss": 0.6814, + "step": 6380 + }, + { + "epoch": 0.3512025978314712, + "grad_norm": 0.7334907650947571, + "learning_rate": 9.276878185116287e-06, + "loss": 0.6608, + "step": 6381 + }, + { + "epoch": 0.35125763663382686, + "grad_norm": 0.7011460065841675, + "learning_rate": 9.27665362977241e-06, + "loss": 0.8196, + "step": 6382 + }, + { + "epoch": 0.3513126754361825, + "grad_norm": 0.7388820052146912, + "learning_rate": 9.276429042286349e-06, + "loss": 0.8793, + "step": 6383 + }, + { + "epoch": 0.3513677142385382, + "grad_norm": 0.809725821018219, + "learning_rate": 9.27620442265979e-06, + "loss": 0.6976, + "step": 6384 + }, + { + "epoch": 0.3514227530408938, + "grad_norm": 0.6933012008666992, + "learning_rate": 9.275979770894424e-06, + "loss": 0.759, + "step": 6385 + }, + { + "epoch": 0.3514777918432495, + "grad_norm": 0.7928480505943298, + "learning_rate": 9.27575508699194e-06, + "loss": 0.7462, + "step": 6386 + }, + { + "epoch": 0.35153283064560514, + "grad_norm": 0.8461304903030396, + "learning_rate": 9.275530370954024e-06, + "loss": 0.8184, + "step": 6387 + }, + { + "epoch": 0.35158786944796083, + "grad_norm": 0.7624425292015076, + "learning_rate": 9.275305622782366e-06, + "loss": 0.7913, + "step": 6388 + }, + { + "epoch": 0.35164290825031647, + "grad_norm": 0.7103675007820129, + "learning_rate": 9.275080842478657e-06, + "loss": 0.7633, + "step": 6389 + }, + { + "epoch": 0.35169794705267216, + "grad_norm": 0.9002664089202881, + "learning_rate": 9.274856030044583e-06, + "loss": 0.7643, + "step": 6390 + }, + { + "epoch": 0.3517529858550278, + "grad_norm": 0.7658692002296448, + "learning_rate": 9.274631185481836e-06, + "loss": 0.8028, + "step": 6391 + }, + { + "epoch": 0.3518080246573835, + "grad_norm": 0.6747875809669495, + "learning_rate": 9.274406308792106e-06, + "loss": 0.695, + "step": 6392 + }, + { + "epoch": 0.3518630634597391, + "grad_norm": 0.8197165131568909, + "learning_rate": 9.27418139997708e-06, + "loss": 0.7218, + "step": 6393 + }, + { + "epoch": 0.3519181022620948, + "grad_norm": 0.7597750425338745, + "learning_rate": 9.273956459038453e-06, + "loss": 0.7738, + "step": 6394 + }, + { + "epoch": 0.35197314106445043, + "grad_norm": 0.7365928888320923, + "learning_rate": 9.273731485977912e-06, + "loss": 0.7906, + "step": 6395 + }, + { + "epoch": 0.3520281798668061, + "grad_norm": 0.7313928604125977, + "learning_rate": 9.273506480797151e-06, + "loss": 0.834, + "step": 6396 + }, + { + "epoch": 0.35208321866916176, + "grad_norm": 0.758886456489563, + "learning_rate": 9.273281443497858e-06, + "loss": 0.8883, + "step": 6397 + }, + { + "epoch": 0.35213825747151745, + "grad_norm": 0.7318256497383118, + "learning_rate": 9.273056374081726e-06, + "loss": 0.7463, + "step": 6398 + }, + { + "epoch": 0.3521932962738731, + "grad_norm": 0.778448224067688, + "learning_rate": 9.272831272550446e-06, + "loss": 0.6838, + "step": 6399 + }, + { + "epoch": 0.3522483350762287, + "grad_norm": 0.7392274141311646, + "learning_rate": 9.272606138905709e-06, + "loss": 0.7237, + "step": 6400 + }, + { + "epoch": 0.3523033738785844, + "grad_norm": 0.8803032040596008, + "learning_rate": 9.272380973149209e-06, + "loss": 0.7839, + "step": 6401 + }, + { + "epoch": 0.35235841268094004, + "grad_norm": 0.7506754994392395, + "learning_rate": 9.272155775282636e-06, + "loss": 0.7665, + "step": 6402 + }, + { + "epoch": 0.3524134514832957, + "grad_norm": 0.8136595487594604, + "learning_rate": 9.271930545307686e-06, + "loss": 0.9111, + "step": 6403 + }, + { + "epoch": 0.35246849028565136, + "grad_norm": 0.7976880073547363, + "learning_rate": 9.271705283226047e-06, + "loss": 0.735, + "step": 6404 + }, + { + "epoch": 0.35252352908800705, + "grad_norm": 0.89708411693573, + "learning_rate": 9.271479989039415e-06, + "loss": 0.7698, + "step": 6405 + }, + { + "epoch": 0.3525785678903627, + "grad_norm": 0.8618703484535217, + "learning_rate": 9.271254662749484e-06, + "loss": 0.9001, + "step": 6406 + }, + { + "epoch": 0.35263360669271837, + "grad_norm": 0.7143027186393738, + "learning_rate": 9.271029304357946e-06, + "loss": 0.8188, + "step": 6407 + }, + { + "epoch": 0.352688645495074, + "grad_norm": 0.795365571975708, + "learning_rate": 9.270803913866496e-06, + "loss": 0.7389, + "step": 6408 + }, + { + "epoch": 0.3527436842974297, + "grad_norm": 0.6947643756866455, + "learning_rate": 9.270578491276825e-06, + "loss": 0.7278, + "step": 6409 + }, + { + "epoch": 0.3527987230997853, + "grad_norm": 0.7806137204170227, + "learning_rate": 9.27035303659063e-06, + "loss": 0.808, + "step": 6410 + }, + { + "epoch": 0.352853761902141, + "grad_norm": 0.8908704519271851, + "learning_rate": 9.270127549809606e-06, + "loss": 0.8659, + "step": 6411 + }, + { + "epoch": 0.35290880070449665, + "grad_norm": 0.8171417713165283, + "learning_rate": 9.269902030935445e-06, + "loss": 0.7918, + "step": 6412 + }, + { + "epoch": 0.35296383950685234, + "grad_norm": 0.7556712627410889, + "learning_rate": 9.269676479969842e-06, + "loss": 0.7121, + "step": 6413 + }, + { + "epoch": 0.353018878309208, + "grad_norm": 0.8080483675003052, + "learning_rate": 9.269450896914495e-06, + "loss": 0.8185, + "step": 6414 + }, + { + "epoch": 0.35307391711156366, + "grad_norm": 0.8514583706855774, + "learning_rate": 9.2692252817711e-06, + "loss": 0.8055, + "step": 6415 + }, + { + "epoch": 0.3531289559139193, + "grad_norm": 0.7914162278175354, + "learning_rate": 9.268999634541347e-06, + "loss": 0.759, + "step": 6416 + }, + { + "epoch": 0.353183994716275, + "grad_norm": 0.6452118754386902, + "learning_rate": 9.268773955226937e-06, + "loss": 0.6797, + "step": 6417 + }, + { + "epoch": 0.3532390335186306, + "grad_norm": 0.6876220107078552, + "learning_rate": 9.268548243829565e-06, + "loss": 0.7365, + "step": 6418 + }, + { + "epoch": 0.3532940723209863, + "grad_norm": 0.758550226688385, + "learning_rate": 9.268322500350926e-06, + "loss": 0.7069, + "step": 6419 + }, + { + "epoch": 0.35334911112334194, + "grad_norm": 0.7905879020690918, + "learning_rate": 9.268096724792718e-06, + "loss": 0.8024, + "step": 6420 + }, + { + "epoch": 0.35340414992569763, + "grad_norm": 0.755253255367279, + "learning_rate": 9.267870917156638e-06, + "loss": 0.8018, + "step": 6421 + }, + { + "epoch": 0.35345918872805326, + "grad_norm": 0.6879923343658447, + "learning_rate": 9.267645077444382e-06, + "loss": 0.7267, + "step": 6422 + }, + { + "epoch": 0.35351422753040895, + "grad_norm": 0.766214907169342, + "learning_rate": 9.267419205657649e-06, + "loss": 0.7801, + "step": 6423 + }, + { + "epoch": 0.3535692663327646, + "grad_norm": 0.868776798248291, + "learning_rate": 9.267193301798135e-06, + "loss": 0.9234, + "step": 6424 + }, + { + "epoch": 0.3536243051351203, + "grad_norm": 1.2007492780685425, + "learning_rate": 9.266967365867536e-06, + "loss": 0.7743, + "step": 6425 + }, + { + "epoch": 0.3536793439374759, + "grad_norm": 0.7445551156997681, + "learning_rate": 9.266741397867556e-06, + "loss": 0.6755, + "step": 6426 + }, + { + "epoch": 0.3537343827398316, + "grad_norm": 0.7493785619735718, + "learning_rate": 9.266515397799889e-06, + "loss": 0.7891, + "step": 6427 + }, + { + "epoch": 0.35378942154218723, + "grad_norm": 0.6718230843544006, + "learning_rate": 9.266289365666234e-06, + "loss": 0.6908, + "step": 6428 + }, + { + "epoch": 0.3538444603445429, + "grad_norm": 0.7783547639846802, + "learning_rate": 9.266063301468289e-06, + "loss": 0.7115, + "step": 6429 + }, + { + "epoch": 0.35389949914689856, + "grad_norm": 0.745627224445343, + "learning_rate": 9.265837205207755e-06, + "loss": 0.8421, + "step": 6430 + }, + { + "epoch": 0.35395453794925424, + "grad_norm": 0.7314152717590332, + "learning_rate": 9.26561107688633e-06, + "loss": 0.807, + "step": 6431 + }, + { + "epoch": 0.3540095767516099, + "grad_norm": 0.6975863575935364, + "learning_rate": 9.265384916505714e-06, + "loss": 0.7787, + "step": 6432 + }, + { + "epoch": 0.35406461555396557, + "grad_norm": 0.9758319854736328, + "learning_rate": 9.265158724067608e-06, + "loss": 0.8668, + "step": 6433 + }, + { + "epoch": 0.3541196543563212, + "grad_norm": 0.7686764001846313, + "learning_rate": 9.264932499573711e-06, + "loss": 0.7428, + "step": 6434 + }, + { + "epoch": 0.3541746931586769, + "grad_norm": 0.8761935830116272, + "learning_rate": 9.26470624302572e-06, + "loss": 0.8022, + "step": 6435 + }, + { + "epoch": 0.3542297319610325, + "grad_norm": 0.9145118594169617, + "learning_rate": 9.264479954425341e-06, + "loss": 0.7994, + "step": 6436 + }, + { + "epoch": 0.3542847707633882, + "grad_norm": 0.8217951655387878, + "learning_rate": 9.264253633774271e-06, + "loss": 0.7235, + "step": 6437 + }, + { + "epoch": 0.35433980956574385, + "grad_norm": 0.7624716758728027, + "learning_rate": 9.264027281074214e-06, + "loss": 0.8238, + "step": 6438 + }, + { + "epoch": 0.35439484836809954, + "grad_norm": 0.7772085070610046, + "learning_rate": 9.26380089632687e-06, + "loss": 0.7941, + "step": 6439 + }, + { + "epoch": 0.35444988717045517, + "grad_norm": 1.0462371110916138, + "learning_rate": 9.263574479533937e-06, + "loss": 0.8255, + "step": 6440 + }, + { + "epoch": 0.35450492597281086, + "grad_norm": 0.8523101210594177, + "learning_rate": 9.263348030697119e-06, + "loss": 0.8489, + "step": 6441 + }, + { + "epoch": 0.3545599647751665, + "grad_norm": 1.0292255878448486, + "learning_rate": 9.26312154981812e-06, + "loss": 0.7989, + "step": 6442 + }, + { + "epoch": 0.3546150035775221, + "grad_norm": 0.7621143460273743, + "learning_rate": 9.262895036898641e-06, + "loss": 0.8154, + "step": 6443 + }, + { + "epoch": 0.3546700423798778, + "grad_norm": 0.7158074378967285, + "learning_rate": 9.262668491940382e-06, + "loss": 0.7821, + "step": 6444 + }, + { + "epoch": 0.35472508118223345, + "grad_norm": 0.7969478964805603, + "learning_rate": 9.26244191494505e-06, + "loss": 0.8535, + "step": 6445 + }, + { + "epoch": 0.35478011998458914, + "grad_norm": 0.9244762063026428, + "learning_rate": 9.262215305914345e-06, + "loss": 0.7585, + "step": 6446 + }, + { + "epoch": 0.35483515878694477, + "grad_norm": 0.6862454414367676, + "learning_rate": 9.26198866484997e-06, + "loss": 0.7294, + "step": 6447 + }, + { + "epoch": 0.35489019758930046, + "grad_norm": 0.6816834211349487, + "learning_rate": 9.261761991753629e-06, + "loss": 0.7763, + "step": 6448 + }, + { + "epoch": 0.3549452363916561, + "grad_norm": 0.792539119720459, + "learning_rate": 9.261535286627025e-06, + "loss": 0.7829, + "step": 6449 + }, + { + "epoch": 0.3550002751940118, + "grad_norm": 0.8563211560249329, + "learning_rate": 9.261308549471866e-06, + "loss": 0.8945, + "step": 6450 + }, + { + "epoch": 0.3550553139963674, + "grad_norm": 0.7241078019142151, + "learning_rate": 9.26108178028985e-06, + "loss": 0.6936, + "step": 6451 + }, + { + "epoch": 0.3551103527987231, + "grad_norm": 0.7150034308433533, + "learning_rate": 9.260854979082682e-06, + "loss": 0.7689, + "step": 6452 + }, + { + "epoch": 0.35516539160107874, + "grad_norm": 0.8630193471908569, + "learning_rate": 9.260628145852073e-06, + "loss": 0.8506, + "step": 6453 + }, + { + "epoch": 0.35522043040343443, + "grad_norm": 0.7133893370628357, + "learning_rate": 9.26040128059972e-06, + "loss": 0.7976, + "step": 6454 + }, + { + "epoch": 0.35527546920579006, + "grad_norm": 0.6984630823135376, + "learning_rate": 9.260174383327332e-06, + "loss": 0.7442, + "step": 6455 + }, + { + "epoch": 0.35533050800814575, + "grad_norm": 0.7166933417320251, + "learning_rate": 9.259947454036613e-06, + "loss": 0.813, + "step": 6456 + }, + { + "epoch": 0.3553855468105014, + "grad_norm": 0.7353581190109253, + "learning_rate": 9.259720492729272e-06, + "loss": 0.8157, + "step": 6457 + }, + { + "epoch": 0.3554405856128571, + "grad_norm": 0.6810038089752197, + "learning_rate": 9.259493499407011e-06, + "loss": 0.7423, + "step": 6458 + }, + { + "epoch": 0.3554956244152127, + "grad_norm": 1.1599586009979248, + "learning_rate": 9.259266474071535e-06, + "loss": 0.7159, + "step": 6459 + }, + { + "epoch": 0.3555506632175684, + "grad_norm": 0.7857629060745239, + "learning_rate": 9.259039416724554e-06, + "loss": 0.7846, + "step": 6460 + }, + { + "epoch": 0.35560570201992403, + "grad_norm": 0.705333948135376, + "learning_rate": 9.258812327367773e-06, + "loss": 0.751, + "step": 6461 + }, + { + "epoch": 0.3556607408222797, + "grad_norm": 0.6899998188018799, + "learning_rate": 9.258585206002897e-06, + "loss": 0.7303, + "step": 6462 + }, + { + "epoch": 0.35571577962463535, + "grad_norm": 0.8007912039756775, + "learning_rate": 9.258358052631637e-06, + "loss": 0.7363, + "step": 6463 + }, + { + "epoch": 0.35577081842699104, + "grad_norm": 0.9403146505355835, + "learning_rate": 9.258130867255695e-06, + "loss": 0.9096, + "step": 6464 + }, + { + "epoch": 0.3558258572293467, + "grad_norm": 0.7069174647331238, + "learning_rate": 9.257903649876782e-06, + "loss": 0.7362, + "step": 6465 + }, + { + "epoch": 0.35588089603170237, + "grad_norm": 0.770807683467865, + "learning_rate": 9.257676400496607e-06, + "loss": 0.7904, + "step": 6466 + }, + { + "epoch": 0.355935934834058, + "grad_norm": 0.8586871027946472, + "learning_rate": 9.257449119116874e-06, + "loss": 0.7596, + "step": 6467 + }, + { + "epoch": 0.3559909736364137, + "grad_norm": 0.6934101581573486, + "learning_rate": 9.257221805739294e-06, + "loss": 0.6655, + "step": 6468 + }, + { + "epoch": 0.3560460124387693, + "grad_norm": 0.9494497179985046, + "learning_rate": 9.256994460365573e-06, + "loss": 0.7923, + "step": 6469 + }, + { + "epoch": 0.356101051241125, + "grad_norm": 0.7131130695343018, + "learning_rate": 9.256767082997422e-06, + "loss": 0.819, + "step": 6470 + }, + { + "epoch": 0.35615609004348064, + "grad_norm": 0.8641398549079895, + "learning_rate": 9.25653967363655e-06, + "loss": 0.8275, + "step": 6471 + }, + { + "epoch": 0.35621112884583633, + "grad_norm": 0.7350367307662964, + "learning_rate": 9.256312232284665e-06, + "loss": 0.7991, + "step": 6472 + }, + { + "epoch": 0.35626616764819197, + "grad_norm": 0.8174671530723572, + "learning_rate": 9.256084758943476e-06, + "loss": 0.7147, + "step": 6473 + }, + { + "epoch": 0.35632120645054766, + "grad_norm": 0.7560263872146606, + "learning_rate": 9.255857253614693e-06, + "loss": 0.7435, + "step": 6474 + }, + { + "epoch": 0.3563762452529033, + "grad_norm": 0.7465197443962097, + "learning_rate": 9.255629716300025e-06, + "loss": 0.8228, + "step": 6475 + }, + { + "epoch": 0.356431284055259, + "grad_norm": 0.7130733728408813, + "learning_rate": 9.255402147001184e-06, + "loss": 0.8361, + "step": 6476 + }, + { + "epoch": 0.3564863228576146, + "grad_norm": 0.7200759053230286, + "learning_rate": 9.255174545719882e-06, + "loss": 0.7387, + "step": 6477 + }, + { + "epoch": 0.3565413616599703, + "grad_norm": 0.8387622237205505, + "learning_rate": 9.254946912457826e-06, + "loss": 0.8427, + "step": 6478 + }, + { + "epoch": 0.35659640046232594, + "grad_norm": 0.7263510823249817, + "learning_rate": 9.254719247216725e-06, + "loss": 0.712, + "step": 6479 + }, + { + "epoch": 0.3566514392646816, + "grad_norm": 0.7393862009048462, + "learning_rate": 9.254491549998296e-06, + "loss": 0.6916, + "step": 6480 + }, + { + "epoch": 0.35670647806703726, + "grad_norm": 0.7289569973945618, + "learning_rate": 9.254263820804246e-06, + "loss": 0.7561, + "step": 6481 + }, + { + "epoch": 0.35676151686939295, + "grad_norm": 0.7597448825836182, + "learning_rate": 9.254036059636288e-06, + "loss": 0.853, + "step": 6482 + }, + { + "epoch": 0.3568165556717486, + "grad_norm": 0.7652063369750977, + "learning_rate": 9.253808266496136e-06, + "loss": 0.7652, + "step": 6483 + }, + { + "epoch": 0.35687159447410427, + "grad_norm": 1.193938136100769, + "learning_rate": 9.253580441385497e-06, + "loss": 0.8288, + "step": 6484 + }, + { + "epoch": 0.3569266332764599, + "grad_norm": 0.9258719086647034, + "learning_rate": 9.253352584306087e-06, + "loss": 0.807, + "step": 6485 + }, + { + "epoch": 0.35698167207881554, + "grad_norm": 0.78384929895401, + "learning_rate": 9.253124695259617e-06, + "loss": 0.7785, + "step": 6486 + }, + { + "epoch": 0.3570367108811712, + "grad_norm": 0.801403284072876, + "learning_rate": 9.252896774247802e-06, + "loss": 0.8382, + "step": 6487 + }, + { + "epoch": 0.35709174968352686, + "grad_norm": 0.9472376108169556, + "learning_rate": 9.25266882127235e-06, + "loss": 0.8661, + "step": 6488 + }, + { + "epoch": 0.35714678848588255, + "grad_norm": 0.7575686573982239, + "learning_rate": 9.252440836334981e-06, + "loss": 0.8428, + "step": 6489 + }, + { + "epoch": 0.3572018272882382, + "grad_norm": 0.736282467842102, + "learning_rate": 9.252212819437402e-06, + "loss": 0.801, + "step": 6490 + }, + { + "epoch": 0.35725686609059387, + "grad_norm": 0.7420864701271057, + "learning_rate": 9.251984770581332e-06, + "loss": 0.8849, + "step": 6491 + }, + { + "epoch": 0.3573119048929495, + "grad_norm": 0.7129189372062683, + "learning_rate": 9.251756689768482e-06, + "loss": 0.7716, + "step": 6492 + }, + { + "epoch": 0.3573669436953052, + "grad_norm": 0.7777297496795654, + "learning_rate": 9.251528577000566e-06, + "loss": 0.8183, + "step": 6493 + }, + { + "epoch": 0.35742198249766083, + "grad_norm": 0.7644590139389038, + "learning_rate": 9.2513004322793e-06, + "loss": 0.6319, + "step": 6494 + }, + { + "epoch": 0.3574770213000165, + "grad_norm": 0.7112484574317932, + "learning_rate": 9.251072255606399e-06, + "loss": 0.8012, + "step": 6495 + }, + { + "epoch": 0.35753206010237215, + "grad_norm": 0.7772265076637268, + "learning_rate": 9.250844046983576e-06, + "loss": 0.8372, + "step": 6496 + }, + { + "epoch": 0.35758709890472784, + "grad_norm": 0.9530157446861267, + "learning_rate": 9.250615806412546e-06, + "loss": 0.8683, + "step": 6497 + }, + { + "epoch": 0.3576421377070835, + "grad_norm": 0.7249575257301331, + "learning_rate": 9.250387533895026e-06, + "loss": 0.7091, + "step": 6498 + }, + { + "epoch": 0.35769717650943916, + "grad_norm": 0.8549422025680542, + "learning_rate": 9.25015922943273e-06, + "loss": 0.8376, + "step": 6499 + }, + { + "epoch": 0.3577522153117948, + "grad_norm": 0.74477618932724, + "learning_rate": 9.249930893027376e-06, + "loss": 0.7594, + "step": 6500 + }, + { + "epoch": 0.3578072541141505, + "grad_norm": 0.8269739151000977, + "learning_rate": 9.24970252468068e-06, + "loss": 0.6473, + "step": 6501 + }, + { + "epoch": 0.3578622929165061, + "grad_norm": 0.8375437259674072, + "learning_rate": 9.249474124394358e-06, + "loss": 0.7631, + "step": 6502 + }, + { + "epoch": 0.3579173317188618, + "grad_norm": 0.8680340051651001, + "learning_rate": 9.249245692170123e-06, + "loss": 0.7863, + "step": 6503 + }, + { + "epoch": 0.35797237052121744, + "grad_norm": 0.7179692983627319, + "learning_rate": 9.249017228009696e-06, + "loss": 0.8022, + "step": 6504 + }, + { + "epoch": 0.35802740932357313, + "grad_norm": 0.7797464728355408, + "learning_rate": 9.248788731914794e-06, + "loss": 0.8067, + "step": 6505 + }, + { + "epoch": 0.35808244812592877, + "grad_norm": 0.8032993674278259, + "learning_rate": 9.248560203887133e-06, + "loss": 0.7383, + "step": 6506 + }, + { + "epoch": 0.35813748692828445, + "grad_norm": 0.7714722156524658, + "learning_rate": 9.24833164392843e-06, + "loss": 0.7149, + "step": 6507 + }, + { + "epoch": 0.3581925257306401, + "grad_norm": 0.7492430210113525, + "learning_rate": 9.248103052040404e-06, + "loss": 0.7645, + "step": 6508 + }, + { + "epoch": 0.3582475645329958, + "grad_norm": 0.6843901872634888, + "learning_rate": 9.247874428224773e-06, + "loss": 0.7183, + "step": 6509 + }, + { + "epoch": 0.3583026033353514, + "grad_norm": 0.8370186686515808, + "learning_rate": 9.247645772483254e-06, + "loss": 0.7832, + "step": 6510 + }, + { + "epoch": 0.3583576421377071, + "grad_norm": 0.7907791137695312, + "learning_rate": 9.247417084817567e-06, + "loss": 0.8742, + "step": 6511 + }, + { + "epoch": 0.35841268094006273, + "grad_norm": 0.7950869798660278, + "learning_rate": 9.247188365229428e-06, + "loss": 0.8705, + "step": 6512 + }, + { + "epoch": 0.3584677197424184, + "grad_norm": 0.7276936173439026, + "learning_rate": 9.24695961372056e-06, + "loss": 0.7629, + "step": 6513 + }, + { + "epoch": 0.35852275854477406, + "grad_norm": 0.7761141657829285, + "learning_rate": 9.24673083029268e-06, + "loss": 0.8813, + "step": 6514 + }, + { + "epoch": 0.35857779734712975, + "grad_norm": 0.7528283596038818, + "learning_rate": 9.24650201494751e-06, + "loss": 0.7885, + "step": 6515 + }, + { + "epoch": 0.3586328361494854, + "grad_norm": 0.8972534537315369, + "learning_rate": 9.246273167686765e-06, + "loss": 0.9081, + "step": 6516 + }, + { + "epoch": 0.35868787495184107, + "grad_norm": 0.7658557891845703, + "learning_rate": 9.246044288512168e-06, + "loss": 0.8451, + "step": 6517 + }, + { + "epoch": 0.3587429137541967, + "grad_norm": 0.8013193607330322, + "learning_rate": 9.245815377425438e-06, + "loss": 0.7236, + "step": 6518 + }, + { + "epoch": 0.3587979525565524, + "grad_norm": 0.8134163022041321, + "learning_rate": 9.245586434428298e-06, + "loss": 0.908, + "step": 6519 + }, + { + "epoch": 0.358852991358908, + "grad_norm": 0.6479801535606384, + "learning_rate": 9.245357459522466e-06, + "loss": 0.7397, + "step": 6520 + }, + { + "epoch": 0.3589080301612637, + "grad_norm": 0.70014488697052, + "learning_rate": 9.245128452709665e-06, + "loss": 0.6898, + "step": 6521 + }, + { + "epoch": 0.35896306896361935, + "grad_norm": 0.7645437717437744, + "learning_rate": 9.244899413991613e-06, + "loss": 0.8319, + "step": 6522 + }, + { + "epoch": 0.35901810776597504, + "grad_norm": 0.6812799572944641, + "learning_rate": 9.244670343370033e-06, + "loss": 0.7359, + "step": 6523 + }, + { + "epoch": 0.35907314656833067, + "grad_norm": 0.6573774218559265, + "learning_rate": 9.244441240846647e-06, + "loss": 0.742, + "step": 6524 + }, + { + "epoch": 0.35912818537068636, + "grad_norm": 0.7870661020278931, + "learning_rate": 9.244212106423178e-06, + "loss": 0.7307, + "step": 6525 + }, + { + "epoch": 0.359183224173042, + "grad_norm": 0.9163166284561157, + "learning_rate": 9.243982940101347e-06, + "loss": 0.8584, + "step": 6526 + }, + { + "epoch": 0.3592382629753977, + "grad_norm": 0.766888439655304, + "learning_rate": 9.243753741882874e-06, + "loss": 0.8093, + "step": 6527 + }, + { + "epoch": 0.3592933017777533, + "grad_norm": 0.7831236124038696, + "learning_rate": 9.243524511769486e-06, + "loss": 0.8665, + "step": 6528 + }, + { + "epoch": 0.35934834058010895, + "grad_norm": 0.7485133409500122, + "learning_rate": 9.243295249762904e-06, + "loss": 0.7336, + "step": 6529 + }, + { + "epoch": 0.35940337938246464, + "grad_norm": 0.7231502532958984, + "learning_rate": 9.24306595586485e-06, + "loss": 0.8095, + "step": 6530 + }, + { + "epoch": 0.35945841818482027, + "grad_norm": 0.821898877620697, + "learning_rate": 9.242836630077048e-06, + "loss": 0.831, + "step": 6531 + }, + { + "epoch": 0.35951345698717596, + "grad_norm": 0.6792737245559692, + "learning_rate": 9.242607272401223e-06, + "loss": 0.7183, + "step": 6532 + }, + { + "epoch": 0.3595684957895316, + "grad_norm": 0.7200430631637573, + "learning_rate": 9.242377882839095e-06, + "loss": 0.7256, + "step": 6533 + }, + { + "epoch": 0.3596235345918873, + "grad_norm": 0.6713700890541077, + "learning_rate": 9.242148461392393e-06, + "loss": 0.7416, + "step": 6534 + }, + { + "epoch": 0.3596785733942429, + "grad_norm": 0.7054564356803894, + "learning_rate": 9.241919008062836e-06, + "loss": 0.6856, + "step": 6535 + }, + { + "epoch": 0.3597336121965986, + "grad_norm": 0.7516196966171265, + "learning_rate": 9.241689522852152e-06, + "loss": 0.7149, + "step": 6536 + }, + { + "epoch": 0.35978865099895424, + "grad_norm": 0.8547651767730713, + "learning_rate": 9.241460005762067e-06, + "loss": 0.7075, + "step": 6537 + }, + { + "epoch": 0.35984368980130993, + "grad_norm": 0.6791819334030151, + "learning_rate": 9.241230456794302e-06, + "loss": 0.6449, + "step": 6538 + }, + { + "epoch": 0.35989872860366556, + "grad_norm": 0.8365122079849243, + "learning_rate": 9.241000875950583e-06, + "loss": 0.7619, + "step": 6539 + }, + { + "epoch": 0.35995376740602125, + "grad_norm": 0.763829231262207, + "learning_rate": 9.24077126323264e-06, + "loss": 0.71, + "step": 6540 + }, + { + "epoch": 0.3600088062083769, + "grad_norm": 0.7698483467102051, + "learning_rate": 9.240541618642193e-06, + "loss": 0.7949, + "step": 6541 + }, + { + "epoch": 0.3600638450107326, + "grad_norm": 0.7331508994102478, + "learning_rate": 9.24031194218097e-06, + "loss": 0.8292, + "step": 6542 + }, + { + "epoch": 0.3601188838130882, + "grad_norm": 0.7507451772689819, + "learning_rate": 9.2400822338507e-06, + "loss": 0.8651, + "step": 6543 + }, + { + "epoch": 0.3601739226154439, + "grad_norm": 0.8537001609802246, + "learning_rate": 9.239852493653104e-06, + "loss": 0.848, + "step": 6544 + }, + { + "epoch": 0.36022896141779953, + "grad_norm": 0.683311939239502, + "learning_rate": 9.239622721589913e-06, + "loss": 0.803, + "step": 6545 + }, + { + "epoch": 0.3602840002201552, + "grad_norm": 0.6916974186897278, + "learning_rate": 9.239392917662852e-06, + "loss": 0.8037, + "step": 6546 + }, + { + "epoch": 0.36033903902251085, + "grad_norm": 0.798795223236084, + "learning_rate": 9.23916308187365e-06, + "loss": 0.8037, + "step": 6547 + }, + { + "epoch": 0.36039407782486654, + "grad_norm": 0.7284069657325745, + "learning_rate": 9.238933214224032e-06, + "loss": 0.7365, + "step": 6548 + }, + { + "epoch": 0.3604491166272222, + "grad_norm": 0.7789250016212463, + "learning_rate": 9.238703314715727e-06, + "loss": 0.788, + "step": 6549 + }, + { + "epoch": 0.36050415542957787, + "grad_norm": 0.7029675841331482, + "learning_rate": 9.238473383350462e-06, + "loss": 0.7796, + "step": 6550 + }, + { + "epoch": 0.3605591942319335, + "grad_norm": 0.9094457626342773, + "learning_rate": 9.238243420129965e-06, + "loss": 0.7884, + "step": 6551 + }, + { + "epoch": 0.3606142330342892, + "grad_norm": 0.8253848552703857, + "learning_rate": 9.238013425055965e-06, + "loss": 0.7671, + "step": 6552 + }, + { + "epoch": 0.3606692718366448, + "grad_norm": 0.7052987813949585, + "learning_rate": 9.237783398130193e-06, + "loss": 0.7511, + "step": 6553 + }, + { + "epoch": 0.3607243106390005, + "grad_norm": 0.7506607174873352, + "learning_rate": 9.237553339354373e-06, + "loss": 0.6804, + "step": 6554 + }, + { + "epoch": 0.36077934944135615, + "grad_norm": 0.725106418132782, + "learning_rate": 9.237323248730237e-06, + "loss": 0.7658, + "step": 6555 + }, + { + "epoch": 0.36083438824371183, + "grad_norm": 0.8164945244789124, + "learning_rate": 9.237093126259515e-06, + "loss": 0.7857, + "step": 6556 + }, + { + "epoch": 0.36088942704606747, + "grad_norm": 0.6937377452850342, + "learning_rate": 9.236862971943934e-06, + "loss": 0.6985, + "step": 6557 + }, + { + "epoch": 0.36094446584842316, + "grad_norm": 0.7511105537414551, + "learning_rate": 9.236632785785225e-06, + "loss": 0.7891, + "step": 6558 + }, + { + "epoch": 0.3609995046507788, + "grad_norm": 0.7217637896537781, + "learning_rate": 9.236402567785118e-06, + "loss": 0.7942, + "step": 6559 + }, + { + "epoch": 0.3610545434531345, + "grad_norm": 1.1438478231430054, + "learning_rate": 9.236172317945343e-06, + "loss": 0.8311, + "step": 6560 + }, + { + "epoch": 0.3611095822554901, + "grad_norm": 0.7414245009422302, + "learning_rate": 9.23594203626763e-06, + "loss": 0.7726, + "step": 6561 + }, + { + "epoch": 0.3611646210578458, + "grad_norm": 0.7762154340744019, + "learning_rate": 9.235711722753712e-06, + "loss": 0.7891, + "step": 6562 + }, + { + "epoch": 0.36121965986020144, + "grad_norm": 0.7368801832199097, + "learning_rate": 9.23548137740532e-06, + "loss": 0.7656, + "step": 6563 + }, + { + "epoch": 0.3612746986625571, + "grad_norm": 0.7571502923965454, + "learning_rate": 9.235251000224181e-06, + "loss": 0.7845, + "step": 6564 + }, + { + "epoch": 0.36132973746491276, + "grad_norm": 0.8078309297561646, + "learning_rate": 9.235020591212031e-06, + "loss": 0.7969, + "step": 6565 + }, + { + "epoch": 0.36138477626726845, + "grad_norm": 0.6897913813591003, + "learning_rate": 9.234790150370599e-06, + "loss": 0.6922, + "step": 6566 + }, + { + "epoch": 0.3614398150696241, + "grad_norm": 0.8053449988365173, + "learning_rate": 9.234559677701618e-06, + "loss": 0.9126, + "step": 6567 + }, + { + "epoch": 0.36149485387197977, + "grad_norm": 0.8400903940200806, + "learning_rate": 9.23432917320682e-06, + "loss": 0.8144, + "step": 6568 + }, + { + "epoch": 0.3615498926743354, + "grad_norm": 0.7753110527992249, + "learning_rate": 9.234098636887935e-06, + "loss": 0.7025, + "step": 6569 + }, + { + "epoch": 0.3616049314766911, + "grad_norm": 0.7901243567466736, + "learning_rate": 9.233868068746702e-06, + "loss": 0.783, + "step": 6570 + }, + { + "epoch": 0.3616599702790467, + "grad_norm": 1.2297497987747192, + "learning_rate": 9.233637468784849e-06, + "loss": 0.8541, + "step": 6571 + }, + { + "epoch": 0.36171500908140236, + "grad_norm": 0.7590478658676147, + "learning_rate": 9.233406837004108e-06, + "loss": 0.7856, + "step": 6572 + }, + { + "epoch": 0.36177004788375805, + "grad_norm": 0.6651493310928345, + "learning_rate": 9.233176173406216e-06, + "loss": 0.6822, + "step": 6573 + }, + { + "epoch": 0.3618250866861137, + "grad_norm": 0.7760787010192871, + "learning_rate": 9.232945477992905e-06, + "loss": 0.8017, + "step": 6574 + }, + { + "epoch": 0.3618801254884694, + "grad_norm": 0.8788009285926819, + "learning_rate": 9.232714750765908e-06, + "loss": 0.7812, + "step": 6575 + }, + { + "epoch": 0.361935164290825, + "grad_norm": 0.7014517188072205, + "learning_rate": 9.232483991726961e-06, + "loss": 0.7293, + "step": 6576 + }, + { + "epoch": 0.3619902030931807, + "grad_norm": 0.7586061954498291, + "learning_rate": 9.232253200877797e-06, + "loss": 0.7953, + "step": 6577 + }, + { + "epoch": 0.36204524189553633, + "grad_norm": 0.8202564120292664, + "learning_rate": 9.232022378220151e-06, + "loss": 0.8545, + "step": 6578 + }, + { + "epoch": 0.362100280697892, + "grad_norm": 0.7816846966743469, + "learning_rate": 9.231791523755758e-06, + "loss": 0.8573, + "step": 6579 + }, + { + "epoch": 0.36215531950024765, + "grad_norm": 0.883222222328186, + "learning_rate": 9.23156063748635e-06, + "loss": 0.7733, + "step": 6580 + }, + { + "epoch": 0.36221035830260334, + "grad_norm": 0.8472830057144165, + "learning_rate": 9.231329719413668e-06, + "loss": 0.8931, + "step": 6581 + }, + { + "epoch": 0.362265397104959, + "grad_norm": 0.7916087508201599, + "learning_rate": 9.231098769539443e-06, + "loss": 0.8806, + "step": 6582 + }, + { + "epoch": 0.36232043590731466, + "grad_norm": 0.815339982509613, + "learning_rate": 9.230867787865414e-06, + "loss": 0.9081, + "step": 6583 + }, + { + "epoch": 0.3623754747096703, + "grad_norm": 1.2352560758590698, + "learning_rate": 9.230636774393312e-06, + "loss": 0.726, + "step": 6584 + }, + { + "epoch": 0.362430513512026, + "grad_norm": 0.759308397769928, + "learning_rate": 9.230405729124878e-06, + "loss": 0.7648, + "step": 6585 + }, + { + "epoch": 0.3624855523143816, + "grad_norm": 0.8285754323005676, + "learning_rate": 9.230174652061847e-06, + "loss": 0.7972, + "step": 6586 + }, + { + "epoch": 0.3625405911167373, + "grad_norm": 0.7393043041229248, + "learning_rate": 9.229943543205956e-06, + "loss": 0.7859, + "step": 6587 + }, + { + "epoch": 0.36259562991909294, + "grad_norm": 0.7354594469070435, + "learning_rate": 9.229712402558942e-06, + "loss": 0.6683, + "step": 6588 + }, + { + "epoch": 0.36265066872144863, + "grad_norm": 0.8244406580924988, + "learning_rate": 9.229481230122543e-06, + "loss": 0.6977, + "step": 6589 + }, + { + "epoch": 0.36270570752380427, + "grad_norm": 0.810565173625946, + "learning_rate": 9.229250025898493e-06, + "loss": 0.7278, + "step": 6590 + }, + { + "epoch": 0.36276074632615996, + "grad_norm": 0.7443352937698364, + "learning_rate": 9.229018789888532e-06, + "loss": 0.7821, + "step": 6591 + }, + { + "epoch": 0.3628157851285156, + "grad_norm": 0.9211748838424683, + "learning_rate": 9.228787522094398e-06, + "loss": 0.9174, + "step": 6592 + }, + { + "epoch": 0.3628708239308713, + "grad_norm": 0.7099255919456482, + "learning_rate": 9.22855622251783e-06, + "loss": 0.74, + "step": 6593 + }, + { + "epoch": 0.3629258627332269, + "grad_norm": 0.7373029589653015, + "learning_rate": 9.228324891160564e-06, + "loss": 0.7909, + "step": 6594 + }, + { + "epoch": 0.3629809015355826, + "grad_norm": 0.8774755001068115, + "learning_rate": 9.22809352802434e-06, + "loss": 0.8354, + "step": 6595 + }, + { + "epoch": 0.36303594033793823, + "grad_norm": 0.7547696232795715, + "learning_rate": 9.227862133110899e-06, + "loss": 0.6942, + "step": 6596 + }, + { + "epoch": 0.3630909791402939, + "grad_norm": 0.7868191003799438, + "learning_rate": 9.227630706421975e-06, + "loss": 0.7575, + "step": 6597 + }, + { + "epoch": 0.36314601794264956, + "grad_norm": 0.6753721237182617, + "learning_rate": 9.227399247959312e-06, + "loss": 0.7092, + "step": 6598 + }, + { + "epoch": 0.36320105674500525, + "grad_norm": 0.7317304611206055, + "learning_rate": 9.227167757724646e-06, + "loss": 0.8372, + "step": 6599 + }, + { + "epoch": 0.3632560955473609, + "grad_norm": 0.8928040266036987, + "learning_rate": 9.226936235719721e-06, + "loss": 0.8536, + "step": 6600 + }, + { + "epoch": 0.36331113434971657, + "grad_norm": 0.7178280353546143, + "learning_rate": 9.226704681946275e-06, + "loss": 0.7648, + "step": 6601 + }, + { + "epoch": 0.3633661731520722, + "grad_norm": 0.7439851760864258, + "learning_rate": 9.226473096406046e-06, + "loss": 0.8284, + "step": 6602 + }, + { + "epoch": 0.3634212119544279, + "grad_norm": 0.7000887989997864, + "learning_rate": 9.226241479100777e-06, + "loss": 0.7797, + "step": 6603 + }, + { + "epoch": 0.3634762507567835, + "grad_norm": 0.7882626056671143, + "learning_rate": 9.226009830032209e-06, + "loss": 0.72, + "step": 6604 + }, + { + "epoch": 0.3635312895591392, + "grad_norm": 0.6445927619934082, + "learning_rate": 9.225778149202081e-06, + "loss": 0.6785, + "step": 6605 + }, + { + "epoch": 0.36358632836149485, + "grad_norm": 0.7348469495773315, + "learning_rate": 9.225546436612137e-06, + "loss": 0.8117, + "step": 6606 + }, + { + "epoch": 0.36364136716385054, + "grad_norm": 0.7455001473426819, + "learning_rate": 9.225314692264118e-06, + "loss": 0.8196, + "step": 6607 + }, + { + "epoch": 0.36369640596620617, + "grad_norm": 0.7149390578269958, + "learning_rate": 9.225082916159762e-06, + "loss": 0.8841, + "step": 6608 + }, + { + "epoch": 0.36375144476856186, + "grad_norm": 0.7095748782157898, + "learning_rate": 9.224851108300816e-06, + "loss": 0.7336, + "step": 6609 + }, + { + "epoch": 0.3638064835709175, + "grad_norm": 0.7112231850624084, + "learning_rate": 9.224619268689019e-06, + "loss": 0.8606, + "step": 6610 + }, + { + "epoch": 0.3638615223732732, + "grad_norm": 0.8052846789360046, + "learning_rate": 9.224387397326115e-06, + "loss": 0.7838, + "step": 6611 + }, + { + "epoch": 0.3639165611756288, + "grad_norm": 0.7538836002349854, + "learning_rate": 9.224155494213846e-06, + "loss": 0.8252, + "step": 6612 + }, + { + "epoch": 0.3639715999779845, + "grad_norm": 0.6968722343444824, + "learning_rate": 9.223923559353956e-06, + "loss": 0.759, + "step": 6613 + }, + { + "epoch": 0.36402663878034014, + "grad_norm": 0.7797368168830872, + "learning_rate": 9.223691592748185e-06, + "loss": 0.8452, + "step": 6614 + }, + { + "epoch": 0.3640816775826958, + "grad_norm": 0.7738572955131531, + "learning_rate": 9.223459594398278e-06, + "loss": 0.806, + "step": 6615 + }, + { + "epoch": 0.36413671638505146, + "grad_norm": 0.7998547554016113, + "learning_rate": 9.223227564305983e-06, + "loss": 0.748, + "step": 6616 + }, + { + "epoch": 0.3641917551874071, + "grad_norm": 0.838666558265686, + "learning_rate": 9.222995502473037e-06, + "loss": 0.8252, + "step": 6617 + }, + { + "epoch": 0.3642467939897628, + "grad_norm": 1.1672697067260742, + "learning_rate": 9.222763408901189e-06, + "loss": 0.806, + "step": 6618 + }, + { + "epoch": 0.3643018327921184, + "grad_norm": 0.6721193194389343, + "learning_rate": 9.22253128359218e-06, + "loss": 0.6897, + "step": 6619 + }, + { + "epoch": 0.3643568715944741, + "grad_norm": 0.8152795433998108, + "learning_rate": 9.222299126547758e-06, + "loss": 0.8377, + "step": 6620 + }, + { + "epoch": 0.36441191039682974, + "grad_norm": 0.7959492206573486, + "learning_rate": 9.222066937769664e-06, + "loss": 0.8496, + "step": 6621 + }, + { + "epoch": 0.36446694919918543, + "grad_norm": 0.7759784460067749, + "learning_rate": 9.221834717259646e-06, + "loss": 0.7736, + "step": 6622 + }, + { + "epoch": 0.36452198800154106, + "grad_norm": 0.6929076313972473, + "learning_rate": 9.221602465019449e-06, + "loss": 0.7759, + "step": 6623 + }, + { + "epoch": 0.36457702680389675, + "grad_norm": 0.7323315143585205, + "learning_rate": 9.221370181050817e-06, + "loss": 0.7958, + "step": 6624 + }, + { + "epoch": 0.3646320656062524, + "grad_norm": 0.7177294492721558, + "learning_rate": 9.221137865355496e-06, + "loss": 0.8405, + "step": 6625 + }, + { + "epoch": 0.3646871044086081, + "grad_norm": 0.7425093650817871, + "learning_rate": 9.220905517935235e-06, + "loss": 0.7722, + "step": 6626 + }, + { + "epoch": 0.3647421432109637, + "grad_norm": 0.8761040568351746, + "learning_rate": 9.220673138791775e-06, + "loss": 0.8617, + "step": 6627 + }, + { + "epoch": 0.3647971820133194, + "grad_norm": 0.927509069442749, + "learning_rate": 9.220440727926869e-06, + "loss": 0.7839, + "step": 6628 + }, + { + "epoch": 0.36485222081567503, + "grad_norm": 0.874399721622467, + "learning_rate": 9.220208285342258e-06, + "loss": 0.9697, + "step": 6629 + }, + { + "epoch": 0.3649072596180307, + "grad_norm": 0.931384801864624, + "learning_rate": 9.219975811039691e-06, + "loss": 0.8142, + "step": 6630 + }, + { + "epoch": 0.36496229842038636, + "grad_norm": 0.8567885160446167, + "learning_rate": 9.219743305020916e-06, + "loss": 0.7623, + "step": 6631 + }, + { + "epoch": 0.36501733722274204, + "grad_norm": 0.7287514209747314, + "learning_rate": 9.21951076728768e-06, + "loss": 0.8044, + "step": 6632 + }, + { + "epoch": 0.3650723760250977, + "grad_norm": 0.7234703302383423, + "learning_rate": 9.21927819784173e-06, + "loss": 0.7736, + "step": 6633 + }, + { + "epoch": 0.36512741482745337, + "grad_norm": 0.7174978256225586, + "learning_rate": 9.219045596684815e-06, + "loss": 0.7658, + "step": 6634 + }, + { + "epoch": 0.365182453629809, + "grad_norm": 0.751075804233551, + "learning_rate": 9.218812963818682e-06, + "loss": 0.7586, + "step": 6635 + }, + { + "epoch": 0.3652374924321647, + "grad_norm": 0.755283534526825, + "learning_rate": 9.21858029924508e-06, + "loss": 0.8904, + "step": 6636 + }, + { + "epoch": 0.3652925312345203, + "grad_norm": 0.6439716815948486, + "learning_rate": 9.21834760296576e-06, + "loss": 0.7335, + "step": 6637 + }, + { + "epoch": 0.365347570036876, + "grad_norm": 0.735285758972168, + "learning_rate": 9.218114874982467e-06, + "loss": 0.7193, + "step": 6638 + }, + { + "epoch": 0.36540260883923165, + "grad_norm": 0.7724307775497437, + "learning_rate": 9.217882115296952e-06, + "loss": 0.8322, + "step": 6639 + }, + { + "epoch": 0.36545764764158734, + "grad_norm": 0.7771303653717041, + "learning_rate": 9.217649323910964e-06, + "loss": 0.7952, + "step": 6640 + }, + { + "epoch": 0.36551268644394297, + "grad_norm": 0.7753337621688843, + "learning_rate": 9.217416500826251e-06, + "loss": 0.8501, + "step": 6641 + }, + { + "epoch": 0.36556772524629866, + "grad_norm": 0.8104514479637146, + "learning_rate": 9.217183646044567e-06, + "loss": 0.8503, + "step": 6642 + }, + { + "epoch": 0.3656227640486543, + "grad_norm": 0.7191929221153259, + "learning_rate": 9.21695075956766e-06, + "loss": 0.7578, + "step": 6643 + }, + { + "epoch": 0.36567780285101, + "grad_norm": 0.745837926864624, + "learning_rate": 9.216717841397277e-06, + "loss": 0.819, + "step": 6644 + }, + { + "epoch": 0.3657328416533656, + "grad_norm": 0.7019662261009216, + "learning_rate": 9.216484891535174e-06, + "loss": 0.8024, + "step": 6645 + }, + { + "epoch": 0.3657878804557213, + "grad_norm": 0.9709738492965698, + "learning_rate": 9.216251909983095e-06, + "loss": 0.7653, + "step": 6646 + }, + { + "epoch": 0.36584291925807694, + "grad_norm": 0.7973032593727112, + "learning_rate": 9.2160188967428e-06, + "loss": 0.8071, + "step": 6647 + }, + { + "epoch": 0.3658979580604326, + "grad_norm": 0.6945796012878418, + "learning_rate": 9.215785851816034e-06, + "loss": 0.6831, + "step": 6648 + }, + { + "epoch": 0.36595299686278826, + "grad_norm": 0.8685100674629211, + "learning_rate": 9.21555277520455e-06, + "loss": 0.821, + "step": 6649 + }, + { + "epoch": 0.36600803566514395, + "grad_norm": 1.0164310932159424, + "learning_rate": 9.2153196669101e-06, + "loss": 0.7861, + "step": 6650 + }, + { + "epoch": 0.3660630744674996, + "grad_norm": 0.8572850227355957, + "learning_rate": 9.215086526934435e-06, + "loss": 0.7982, + "step": 6651 + }, + { + "epoch": 0.36611811326985527, + "grad_norm": 0.7481987476348877, + "learning_rate": 9.214853355279307e-06, + "loss": 0.8258, + "step": 6652 + }, + { + "epoch": 0.3661731520722109, + "grad_norm": 0.750344455242157, + "learning_rate": 9.214620151946472e-06, + "loss": 0.7842, + "step": 6653 + }, + { + "epoch": 0.3662281908745666, + "grad_norm": 1.0266414880752563, + "learning_rate": 9.214386916937678e-06, + "loss": 0.7313, + "step": 6654 + }, + { + "epoch": 0.36628322967692223, + "grad_norm": 0.7913589477539062, + "learning_rate": 9.214153650254682e-06, + "loss": 0.8251, + "step": 6655 + }, + { + "epoch": 0.3663382684792779, + "grad_norm": 0.7185465693473816, + "learning_rate": 9.213920351899235e-06, + "loss": 0.7145, + "step": 6656 + }, + { + "epoch": 0.36639330728163355, + "grad_norm": 0.7185063362121582, + "learning_rate": 9.213687021873088e-06, + "loss": 0.8321, + "step": 6657 + }, + { + "epoch": 0.3664483460839892, + "grad_norm": 0.8380091190338135, + "learning_rate": 9.213453660178e-06, + "loss": 0.8293, + "step": 6658 + }, + { + "epoch": 0.3665033848863449, + "grad_norm": 0.7569485306739807, + "learning_rate": 9.21322026681572e-06, + "loss": 0.7201, + "step": 6659 + }, + { + "epoch": 0.3665584236887005, + "grad_norm": 0.7212445735931396, + "learning_rate": 9.212986841788005e-06, + "loss": 0.7869, + "step": 6660 + }, + { + "epoch": 0.3666134624910562, + "grad_norm": 0.9435489773750305, + "learning_rate": 9.212753385096612e-06, + "loss": 0.8469, + "step": 6661 + }, + { + "epoch": 0.36666850129341183, + "grad_norm": 0.6609265208244324, + "learning_rate": 9.212519896743289e-06, + "loss": 0.6446, + "step": 6662 + }, + { + "epoch": 0.3667235400957675, + "grad_norm": 0.7232604026794434, + "learning_rate": 9.212286376729794e-06, + "loss": 0.7138, + "step": 6663 + }, + { + "epoch": 0.36677857889812315, + "grad_norm": 0.7276197075843811, + "learning_rate": 9.212052825057882e-06, + "loss": 0.725, + "step": 6664 + }, + { + "epoch": 0.36683361770047884, + "grad_norm": 0.7029727101325989, + "learning_rate": 9.21181924172931e-06, + "loss": 0.6973, + "step": 6665 + }, + { + "epoch": 0.3668886565028345, + "grad_norm": 0.7292968034744263, + "learning_rate": 9.21158562674583e-06, + "loss": 0.6984, + "step": 6666 + }, + { + "epoch": 0.36694369530519017, + "grad_norm": 0.6977009177207947, + "learning_rate": 9.2113519801092e-06, + "loss": 0.7752, + "step": 6667 + }, + { + "epoch": 0.3669987341075458, + "grad_norm": 0.8019471764564514, + "learning_rate": 9.211118301821176e-06, + "loss": 0.7481, + "step": 6668 + }, + { + "epoch": 0.3670537729099015, + "grad_norm": 0.8097867965698242, + "learning_rate": 9.210884591883516e-06, + "loss": 0.8077, + "step": 6669 + }, + { + "epoch": 0.3671088117122571, + "grad_norm": 1.1622828245162964, + "learning_rate": 9.210650850297973e-06, + "loss": 0.8053, + "step": 6670 + }, + { + "epoch": 0.3671638505146128, + "grad_norm": 0.8188957571983337, + "learning_rate": 9.210417077066304e-06, + "loss": 0.7731, + "step": 6671 + }, + { + "epoch": 0.36721888931696844, + "grad_norm": 0.8531584739685059, + "learning_rate": 9.210183272190269e-06, + "loss": 0.8183, + "step": 6672 + }, + { + "epoch": 0.36727392811932413, + "grad_norm": 0.8007203936576843, + "learning_rate": 9.209949435671624e-06, + "loss": 0.7906, + "step": 6673 + }, + { + "epoch": 0.36732896692167977, + "grad_norm": 0.8284860253334045, + "learning_rate": 9.209715567512126e-06, + "loss": 0.7845, + "step": 6674 + }, + { + "epoch": 0.36738400572403546, + "grad_norm": 0.7735304236412048, + "learning_rate": 9.209481667713533e-06, + "loss": 0.7333, + "step": 6675 + }, + { + "epoch": 0.3674390445263911, + "grad_norm": 0.7390912771224976, + "learning_rate": 9.209247736277601e-06, + "loss": 0.7992, + "step": 6676 + }, + { + "epoch": 0.3674940833287468, + "grad_norm": 0.6871926784515381, + "learning_rate": 9.209013773206091e-06, + "loss": 0.7765, + "step": 6677 + }, + { + "epoch": 0.3675491221311024, + "grad_norm": 0.7241746187210083, + "learning_rate": 9.208779778500758e-06, + "loss": 0.7124, + "step": 6678 + }, + { + "epoch": 0.3676041609334581, + "grad_norm": 0.7362630367279053, + "learning_rate": 9.208545752163365e-06, + "loss": 0.7695, + "step": 6679 + }, + { + "epoch": 0.36765919973581374, + "grad_norm": 0.7577944993972778, + "learning_rate": 9.208311694195669e-06, + "loss": 0.8302, + "step": 6680 + }, + { + "epoch": 0.3677142385381694, + "grad_norm": 0.7182355523109436, + "learning_rate": 9.208077604599427e-06, + "loss": 0.8182, + "step": 6681 + }, + { + "epoch": 0.36776927734052506, + "grad_norm": 0.7636679410934448, + "learning_rate": 9.207843483376402e-06, + "loss": 0.7266, + "step": 6682 + }, + { + "epoch": 0.36782431614288075, + "grad_norm": 0.7325936555862427, + "learning_rate": 9.207609330528349e-06, + "loss": 0.735, + "step": 6683 + }, + { + "epoch": 0.3678793549452364, + "grad_norm": 1.1119143962860107, + "learning_rate": 9.207375146057033e-06, + "loss": 1.0124, + "step": 6684 + }, + { + "epoch": 0.36793439374759207, + "grad_norm": 0.7694228291511536, + "learning_rate": 9.207140929964212e-06, + "loss": 0.7803, + "step": 6685 + }, + { + "epoch": 0.3679894325499477, + "grad_norm": 0.7628658413887024, + "learning_rate": 9.206906682251644e-06, + "loss": 0.8057, + "step": 6686 + }, + { + "epoch": 0.3680444713523034, + "grad_norm": 0.766266942024231, + "learning_rate": 9.206672402921092e-06, + "loss": 0.7827, + "step": 6687 + }, + { + "epoch": 0.368099510154659, + "grad_norm": 0.7355746626853943, + "learning_rate": 9.206438091974316e-06, + "loss": 0.8146, + "step": 6688 + }, + { + "epoch": 0.3681545489570147, + "grad_norm": 0.8464547395706177, + "learning_rate": 9.20620374941308e-06, + "loss": 0.8296, + "step": 6689 + }, + { + "epoch": 0.36820958775937035, + "grad_norm": 0.7113955616950989, + "learning_rate": 9.20596937523914e-06, + "loss": 0.7621, + "step": 6690 + }, + { + "epoch": 0.36826462656172604, + "grad_norm": 0.7141324877738953, + "learning_rate": 9.205734969454259e-06, + "loss": 0.738, + "step": 6691 + }, + { + "epoch": 0.36831966536408167, + "grad_norm": 0.7576237320899963, + "learning_rate": 9.2055005320602e-06, + "loss": 0.7727, + "step": 6692 + }, + { + "epoch": 0.36837470416643736, + "grad_norm": 0.7448444962501526, + "learning_rate": 9.205266063058727e-06, + "loss": 0.8238, + "step": 6693 + }, + { + "epoch": 0.368429742968793, + "grad_norm": 0.7441811561584473, + "learning_rate": 9.205031562451599e-06, + "loss": 0.7518, + "step": 6694 + }, + { + "epoch": 0.3684847817711487, + "grad_norm": 0.9284115433692932, + "learning_rate": 9.20479703024058e-06, + "loss": 0.817, + "step": 6695 + }, + { + "epoch": 0.3685398205735043, + "grad_norm": 0.7019243836402893, + "learning_rate": 9.204562466427431e-06, + "loss": 0.7403, + "step": 6696 + }, + { + "epoch": 0.36859485937586, + "grad_norm": 0.6345306634902954, + "learning_rate": 9.204327871013917e-06, + "loss": 0.7058, + "step": 6697 + }, + { + "epoch": 0.36864989817821564, + "grad_norm": 0.7375063300132751, + "learning_rate": 9.2040932440018e-06, + "loss": 0.831, + "step": 6698 + }, + { + "epoch": 0.36870493698057133, + "grad_norm": 0.8213731050491333, + "learning_rate": 9.203858585392842e-06, + "loss": 0.7677, + "step": 6699 + }, + { + "epoch": 0.36875997578292696, + "grad_norm": 0.7114601731300354, + "learning_rate": 9.203623895188809e-06, + "loss": 0.8015, + "step": 6700 + }, + { + "epoch": 0.3688150145852826, + "grad_norm": 0.7707667350769043, + "learning_rate": 9.203389173391463e-06, + "loss": 0.7758, + "step": 6701 + }, + { + "epoch": 0.3688700533876383, + "grad_norm": 0.7374396920204163, + "learning_rate": 9.203154420002572e-06, + "loss": 0.7583, + "step": 6702 + }, + { + "epoch": 0.3689250921899939, + "grad_norm": 0.7156866192817688, + "learning_rate": 9.202919635023895e-06, + "loss": 0.8173, + "step": 6703 + }, + { + "epoch": 0.3689801309923496, + "grad_norm": 0.6811904311180115, + "learning_rate": 9.2026848184572e-06, + "loss": 0.7441, + "step": 6704 + }, + { + "epoch": 0.36903516979470524, + "grad_norm": 0.7515163421630859, + "learning_rate": 9.20244997030425e-06, + "loss": 0.7927, + "step": 6705 + }, + { + "epoch": 0.36909020859706093, + "grad_norm": 0.761116087436676, + "learning_rate": 9.202215090566813e-06, + "loss": 0.7686, + "step": 6706 + }, + { + "epoch": 0.36914524739941657, + "grad_norm": 0.8726711869239807, + "learning_rate": 9.20198017924665e-06, + "loss": 0.7831, + "step": 6707 + }, + { + "epoch": 0.36920028620177225, + "grad_norm": 0.6868153810501099, + "learning_rate": 9.20174523634553e-06, + "loss": 0.7855, + "step": 6708 + }, + { + "epoch": 0.3692553250041279, + "grad_norm": 0.7140498757362366, + "learning_rate": 9.201510261865218e-06, + "loss": 0.8144, + "step": 6709 + }, + { + "epoch": 0.3693103638064836, + "grad_norm": 0.8745181560516357, + "learning_rate": 9.201275255807478e-06, + "loss": 0.9204, + "step": 6710 + }, + { + "epoch": 0.3693654026088392, + "grad_norm": 0.6535945534706116, + "learning_rate": 9.20104021817408e-06, + "loss": 0.7729, + "step": 6711 + }, + { + "epoch": 0.3694204414111949, + "grad_norm": 0.655857503414154, + "learning_rate": 9.200805148966785e-06, + "loss": 0.8373, + "step": 6712 + }, + { + "epoch": 0.36947548021355053, + "grad_norm": 0.8393271565437317, + "learning_rate": 9.200570048187365e-06, + "loss": 0.8532, + "step": 6713 + }, + { + "epoch": 0.3695305190159062, + "grad_norm": 0.7484574913978577, + "learning_rate": 9.200334915837585e-06, + "loss": 0.8411, + "step": 6714 + }, + { + "epoch": 0.36958555781826186, + "grad_norm": 0.9913665652275085, + "learning_rate": 9.200099751919212e-06, + "loss": 0.9011, + "step": 6715 + }, + { + "epoch": 0.36964059662061755, + "grad_norm": 0.7314063310623169, + "learning_rate": 9.199864556434013e-06, + "loss": 0.7184, + "step": 6716 + }, + { + "epoch": 0.3696956354229732, + "grad_norm": 0.7881553173065186, + "learning_rate": 9.199629329383758e-06, + "loss": 0.796, + "step": 6717 + }, + { + "epoch": 0.36975067422532887, + "grad_norm": 0.7440283298492432, + "learning_rate": 9.199394070770212e-06, + "loss": 0.7472, + "step": 6718 + }, + { + "epoch": 0.3698057130276845, + "grad_norm": 0.6916326880455017, + "learning_rate": 9.199158780595144e-06, + "loss": 0.6808, + "step": 6719 + }, + { + "epoch": 0.3698607518300402, + "grad_norm": 0.8482714295387268, + "learning_rate": 9.198923458860323e-06, + "loss": 0.7795, + "step": 6720 + }, + { + "epoch": 0.3699157906323958, + "grad_norm": 0.7541999816894531, + "learning_rate": 9.198688105567516e-06, + "loss": 0.7917, + "step": 6721 + }, + { + "epoch": 0.3699708294347515, + "grad_norm": 0.794335126876831, + "learning_rate": 9.198452720718494e-06, + "loss": 0.8463, + "step": 6722 + }, + { + "epoch": 0.37002586823710715, + "grad_norm": 0.7866827845573425, + "learning_rate": 9.198217304315025e-06, + "loss": 0.7938, + "step": 6723 + }, + { + "epoch": 0.37008090703946284, + "grad_norm": 0.7393556833267212, + "learning_rate": 9.19798185635888e-06, + "loss": 0.7825, + "step": 6724 + }, + { + "epoch": 0.37013594584181847, + "grad_norm": 0.7131090760231018, + "learning_rate": 9.197746376851825e-06, + "loss": 0.7184, + "step": 6725 + }, + { + "epoch": 0.37019098464417416, + "grad_norm": 0.7054039239883423, + "learning_rate": 9.197510865795634e-06, + "loss": 0.7458, + "step": 6726 + }, + { + "epoch": 0.3702460234465298, + "grad_norm": 0.7437009811401367, + "learning_rate": 9.197275323192073e-06, + "loss": 0.7921, + "step": 6727 + }, + { + "epoch": 0.3703010622488855, + "grad_norm": 1.0703076124191284, + "learning_rate": 9.197039749042916e-06, + "loss": 0.771, + "step": 6728 + }, + { + "epoch": 0.3703561010512411, + "grad_norm": 0.8278045654296875, + "learning_rate": 9.196804143349929e-06, + "loss": 0.8984, + "step": 6729 + }, + { + "epoch": 0.3704111398535968, + "grad_norm": 0.7713067531585693, + "learning_rate": 9.196568506114887e-06, + "loss": 0.7702, + "step": 6730 + }, + { + "epoch": 0.37046617865595244, + "grad_norm": 0.9040505290031433, + "learning_rate": 9.19633283733956e-06, + "loss": 0.7113, + "step": 6731 + }, + { + "epoch": 0.3705212174583081, + "grad_norm": 0.8853700757026672, + "learning_rate": 9.196097137025718e-06, + "loss": 0.8445, + "step": 6732 + }, + { + "epoch": 0.37057625626066376, + "grad_norm": 0.6870817542076111, + "learning_rate": 9.195861405175133e-06, + "loss": 0.7613, + "step": 6733 + }, + { + "epoch": 0.37063129506301945, + "grad_norm": 0.7539152503013611, + "learning_rate": 9.195625641789579e-06, + "loss": 0.7478, + "step": 6734 + }, + { + "epoch": 0.3706863338653751, + "grad_norm": 0.7084356546401978, + "learning_rate": 9.195389846870822e-06, + "loss": 0.7803, + "step": 6735 + }, + { + "epoch": 0.3707413726677308, + "grad_norm": 0.7883948087692261, + "learning_rate": 9.19515402042064e-06, + "loss": 0.8606, + "step": 6736 + }, + { + "epoch": 0.3707964114700864, + "grad_norm": 0.714948296546936, + "learning_rate": 9.194918162440804e-06, + "loss": 0.8066, + "step": 6737 + }, + { + "epoch": 0.3708514502724421, + "grad_norm": 0.7110786437988281, + "learning_rate": 9.194682272933085e-06, + "loss": 0.7439, + "step": 6738 + }, + { + "epoch": 0.37090648907479773, + "grad_norm": 0.7281045317649841, + "learning_rate": 9.194446351899257e-06, + "loss": 0.7772, + "step": 6739 + }, + { + "epoch": 0.3709615278771534, + "grad_norm": 0.7351245880126953, + "learning_rate": 9.194210399341093e-06, + "loss": 0.8777, + "step": 6740 + }, + { + "epoch": 0.37101656667950905, + "grad_norm": 0.8028532266616821, + "learning_rate": 9.193974415260367e-06, + "loss": 0.7461, + "step": 6741 + }, + { + "epoch": 0.37107160548186474, + "grad_norm": 0.8015451431274414, + "learning_rate": 9.19373839965885e-06, + "loss": 0.8006, + "step": 6742 + }, + { + "epoch": 0.3711266442842204, + "grad_norm": 0.9567442536354065, + "learning_rate": 9.193502352538321e-06, + "loss": 0.8636, + "step": 6743 + }, + { + "epoch": 0.371181683086576, + "grad_norm": 1.1413114070892334, + "learning_rate": 9.193266273900547e-06, + "loss": 0.8976, + "step": 6744 + }, + { + "epoch": 0.3712367218889317, + "grad_norm": 0.6971789002418518, + "learning_rate": 9.19303016374731e-06, + "loss": 0.7419, + "step": 6745 + }, + { + "epoch": 0.37129176069128733, + "grad_norm": 0.8117435574531555, + "learning_rate": 9.192794022080378e-06, + "loss": 0.8166, + "step": 6746 + }, + { + "epoch": 0.371346799493643, + "grad_norm": 0.7748119831085205, + "learning_rate": 9.19255784890153e-06, + "loss": 0.8073, + "step": 6747 + }, + { + "epoch": 0.37140183829599865, + "grad_norm": 0.6550068259239197, + "learning_rate": 9.192321644212539e-06, + "loss": 0.6976, + "step": 6748 + }, + { + "epoch": 0.37145687709835434, + "grad_norm": 0.7931404709815979, + "learning_rate": 9.19208540801518e-06, + "loss": 0.7153, + "step": 6749 + }, + { + "epoch": 0.37151191590071, + "grad_norm": 0.7107539176940918, + "learning_rate": 9.19184914031123e-06, + "loss": 0.7616, + "step": 6750 + }, + { + "epoch": 0.37156695470306567, + "grad_norm": 0.6983848810195923, + "learning_rate": 9.191612841102463e-06, + "loss": 0.6507, + "step": 6751 + }, + { + "epoch": 0.3716219935054213, + "grad_norm": 0.7653477787971497, + "learning_rate": 9.191376510390657e-06, + "loss": 0.708, + "step": 6752 + }, + { + "epoch": 0.371677032307777, + "grad_norm": 0.8903954029083252, + "learning_rate": 9.191140148177586e-06, + "loss": 0.8131, + "step": 6753 + }, + { + "epoch": 0.3717320711101326, + "grad_norm": 0.7584933042526245, + "learning_rate": 9.190903754465028e-06, + "loss": 0.8178, + "step": 6754 + }, + { + "epoch": 0.3717871099124883, + "grad_norm": 0.7338405847549438, + "learning_rate": 9.19066732925476e-06, + "loss": 0.7717, + "step": 6755 + }, + { + "epoch": 0.37184214871484395, + "grad_norm": 0.764944851398468, + "learning_rate": 9.190430872548557e-06, + "loss": 0.7762, + "step": 6756 + }, + { + "epoch": 0.37189718751719963, + "grad_norm": 0.7362231612205505, + "learning_rate": 9.190194384348199e-06, + "loss": 0.8277, + "step": 6757 + }, + { + "epoch": 0.37195222631955527, + "grad_norm": 0.7462226748466492, + "learning_rate": 9.18995786465546e-06, + "loss": 0.7362, + "step": 6758 + }, + { + "epoch": 0.37200726512191096, + "grad_norm": 0.7769725322723389, + "learning_rate": 9.18972131347212e-06, + "loss": 0.8217, + "step": 6759 + }, + { + "epoch": 0.3720623039242666, + "grad_norm": 0.7263969779014587, + "learning_rate": 9.189484730799956e-06, + "loss": 0.7719, + "step": 6760 + }, + { + "epoch": 0.3721173427266223, + "grad_norm": 0.7612473964691162, + "learning_rate": 9.189248116640746e-06, + "loss": 0.7149, + "step": 6761 + }, + { + "epoch": 0.3721723815289779, + "grad_norm": 0.6813042759895325, + "learning_rate": 9.189011470996268e-06, + "loss": 0.7119, + "step": 6762 + }, + { + "epoch": 0.3722274203313336, + "grad_norm": 0.7376571297645569, + "learning_rate": 9.188774793868302e-06, + "loss": 0.7998, + "step": 6763 + }, + { + "epoch": 0.37228245913368924, + "grad_norm": 0.8592102527618408, + "learning_rate": 9.188538085258626e-06, + "loss": 0.8026, + "step": 6764 + }, + { + "epoch": 0.3723374979360449, + "grad_norm": 0.7666613459587097, + "learning_rate": 9.188301345169017e-06, + "loss": 0.8571, + "step": 6765 + }, + { + "epoch": 0.37239253673840056, + "grad_norm": 0.7118985652923584, + "learning_rate": 9.188064573601258e-06, + "loss": 0.7637, + "step": 6766 + }, + { + "epoch": 0.37244757554075625, + "grad_norm": 0.8247082233428955, + "learning_rate": 9.187827770557127e-06, + "loss": 0.8209, + "step": 6767 + }, + { + "epoch": 0.3725026143431119, + "grad_norm": 0.7259567975997925, + "learning_rate": 9.187590936038403e-06, + "loss": 0.7918, + "step": 6768 + }, + { + "epoch": 0.37255765314546757, + "grad_norm": 0.7409893274307251, + "learning_rate": 9.187354070046867e-06, + "loss": 0.8004, + "step": 6769 + }, + { + "epoch": 0.3726126919478232, + "grad_norm": 0.8163084387779236, + "learning_rate": 9.187117172584298e-06, + "loss": 0.8452, + "step": 6770 + }, + { + "epoch": 0.3726677307501789, + "grad_norm": 0.9241586923599243, + "learning_rate": 9.186880243652477e-06, + "loss": 0.8939, + "step": 6771 + }, + { + "epoch": 0.3727227695525345, + "grad_norm": 0.710434079170227, + "learning_rate": 9.186643283253185e-06, + "loss": 0.7337, + "step": 6772 + }, + { + "epoch": 0.3727778083548902, + "grad_norm": 0.7850505709648132, + "learning_rate": 9.186406291388203e-06, + "loss": 0.7892, + "step": 6773 + }, + { + "epoch": 0.37283284715724585, + "grad_norm": 0.813979983329773, + "learning_rate": 9.186169268059311e-06, + "loss": 0.7993, + "step": 6774 + }, + { + "epoch": 0.37288788595960154, + "grad_norm": 0.7923213243484497, + "learning_rate": 9.185932213268292e-06, + "loss": 0.7501, + "step": 6775 + }, + { + "epoch": 0.3729429247619572, + "grad_norm": 0.7923155426979065, + "learning_rate": 9.185695127016928e-06, + "loss": 0.8435, + "step": 6776 + }, + { + "epoch": 0.37299796356431286, + "grad_norm": 0.69893479347229, + "learning_rate": 9.185458009306999e-06, + "loss": 0.7155, + "step": 6777 + }, + { + "epoch": 0.3730530023666685, + "grad_norm": 0.7848305106163025, + "learning_rate": 9.185220860140289e-06, + "loss": 0.7971, + "step": 6778 + }, + { + "epoch": 0.3731080411690242, + "grad_norm": 0.6707655787467957, + "learning_rate": 9.184983679518578e-06, + "loss": 0.6939, + "step": 6779 + }, + { + "epoch": 0.3731630799713798, + "grad_norm": 0.6612532734870911, + "learning_rate": 9.18474646744365e-06, + "loss": 0.7361, + "step": 6780 + }, + { + "epoch": 0.3732181187737355, + "grad_norm": 0.7753985524177551, + "learning_rate": 9.184509223917288e-06, + "loss": 0.7263, + "step": 6781 + }, + { + "epoch": 0.37327315757609114, + "grad_norm": 0.6856646537780762, + "learning_rate": 9.184271948941275e-06, + "loss": 0.6923, + "step": 6782 + }, + { + "epoch": 0.37332819637844683, + "grad_norm": 0.7223647832870483, + "learning_rate": 9.184034642517393e-06, + "loss": 0.793, + "step": 6783 + }, + { + "epoch": 0.37338323518080246, + "grad_norm": 0.7428838014602661, + "learning_rate": 9.183797304647428e-06, + "loss": 0.7781, + "step": 6784 + }, + { + "epoch": 0.37343827398315815, + "grad_norm": 0.7301773428916931, + "learning_rate": 9.183559935333161e-06, + "loss": 0.7964, + "step": 6785 + }, + { + "epoch": 0.3734933127855138, + "grad_norm": 0.7883384823799133, + "learning_rate": 9.183322534576378e-06, + "loss": 0.8904, + "step": 6786 + }, + { + "epoch": 0.3735483515878694, + "grad_norm": 0.7943564653396606, + "learning_rate": 9.183085102378864e-06, + "loss": 0.7229, + "step": 6787 + }, + { + "epoch": 0.3736033903902251, + "grad_norm": 0.7385129928588867, + "learning_rate": 9.1828476387424e-06, + "loss": 0.7967, + "step": 6788 + }, + { + "epoch": 0.37365842919258074, + "grad_norm": 0.7968102097511292, + "learning_rate": 9.182610143668775e-06, + "loss": 0.8016, + "step": 6789 + }, + { + "epoch": 0.37371346799493643, + "grad_norm": 0.7810283303260803, + "learning_rate": 9.18237261715977e-06, + "loss": 0.8956, + "step": 6790 + }, + { + "epoch": 0.37376850679729207, + "grad_norm": 0.7110065221786499, + "learning_rate": 9.182135059217172e-06, + "loss": 0.7808, + "step": 6791 + }, + { + "epoch": 0.37382354559964776, + "grad_norm": 0.7513633370399475, + "learning_rate": 9.181897469842767e-06, + "loss": 0.8236, + "step": 6792 + }, + { + "epoch": 0.3738785844020034, + "grad_norm": 0.7850426435470581, + "learning_rate": 9.18165984903834e-06, + "loss": 0.8642, + "step": 6793 + }, + { + "epoch": 0.3739336232043591, + "grad_norm": 1.4948225021362305, + "learning_rate": 9.181422196805676e-06, + "loss": 0.8765, + "step": 6794 + }, + { + "epoch": 0.3739886620067147, + "grad_norm": 0.8242343068122864, + "learning_rate": 9.181184513146563e-06, + "loss": 0.7213, + "step": 6795 + }, + { + "epoch": 0.3740437008090704, + "grad_norm": 0.8017476797103882, + "learning_rate": 9.180946798062786e-06, + "loss": 0.655, + "step": 6796 + }, + { + "epoch": 0.37409873961142603, + "grad_norm": 0.9573387503623962, + "learning_rate": 9.180709051556132e-06, + "loss": 0.8674, + "step": 6797 + }, + { + "epoch": 0.3741537784137817, + "grad_norm": 0.7575511932373047, + "learning_rate": 9.180471273628388e-06, + "loss": 0.8672, + "step": 6798 + }, + { + "epoch": 0.37420881721613736, + "grad_norm": 0.7723323702812195, + "learning_rate": 9.180233464281343e-06, + "loss": 0.7698, + "step": 6799 + }, + { + "epoch": 0.37426385601849305, + "grad_norm": 0.8352731466293335, + "learning_rate": 9.17999562351678e-06, + "loss": 0.9248, + "step": 6800 + }, + { + "epoch": 0.3743188948208487, + "grad_norm": 0.7459322214126587, + "learning_rate": 9.179757751336488e-06, + "loss": 0.7561, + "step": 6801 + }, + { + "epoch": 0.37437393362320437, + "grad_norm": 0.8053051829338074, + "learning_rate": 9.179519847742257e-06, + "loss": 0.8743, + "step": 6802 + }, + { + "epoch": 0.37442897242556, + "grad_norm": 0.7781768441200256, + "learning_rate": 9.179281912735873e-06, + "loss": 0.7426, + "step": 6803 + }, + { + "epoch": 0.3744840112279157, + "grad_norm": 0.6812007427215576, + "learning_rate": 9.179043946319126e-06, + "loss": 0.761, + "step": 6804 + }, + { + "epoch": 0.3745390500302713, + "grad_norm": 0.8327108025550842, + "learning_rate": 9.178805948493803e-06, + "loss": 0.7633, + "step": 6805 + }, + { + "epoch": 0.374594088832627, + "grad_norm": 0.7519007921218872, + "learning_rate": 9.178567919261692e-06, + "loss": 0.8268, + "step": 6806 + }, + { + "epoch": 0.37464912763498265, + "grad_norm": 0.7507897019386292, + "learning_rate": 9.178329858624584e-06, + "loss": 0.8734, + "step": 6807 + }, + { + "epoch": 0.37470416643733834, + "grad_norm": 0.6874666213989258, + "learning_rate": 9.178091766584267e-06, + "loss": 0.6669, + "step": 6808 + }, + { + "epoch": 0.37475920523969397, + "grad_norm": 0.6987403631210327, + "learning_rate": 9.17785364314253e-06, + "loss": 0.7627, + "step": 6809 + }, + { + "epoch": 0.37481424404204966, + "grad_norm": 0.7777343392372131, + "learning_rate": 9.177615488301163e-06, + "loss": 0.7637, + "step": 6810 + }, + { + "epoch": 0.3748692828444053, + "grad_norm": 0.71980881690979, + "learning_rate": 9.177377302061958e-06, + "loss": 0.7964, + "step": 6811 + }, + { + "epoch": 0.374924321646761, + "grad_norm": 0.627328634262085, + "learning_rate": 9.177139084426704e-06, + "loss": 0.6862, + "step": 6812 + }, + { + "epoch": 0.3749793604491166, + "grad_norm": 0.7099852561950684, + "learning_rate": 9.176900835397188e-06, + "loss": 0.7592, + "step": 6813 + }, + { + "epoch": 0.3750343992514723, + "grad_norm": 0.7880212664604187, + "learning_rate": 9.176662554975205e-06, + "loss": 0.756, + "step": 6814 + }, + { + "epoch": 0.37508943805382794, + "grad_norm": 0.7347460389137268, + "learning_rate": 9.176424243162546e-06, + "loss": 0.8537, + "step": 6815 + }, + { + "epoch": 0.37514447685618363, + "grad_norm": 0.7020999789237976, + "learning_rate": 9.176185899960996e-06, + "loss": 0.7844, + "step": 6816 + }, + { + "epoch": 0.37519951565853926, + "grad_norm": 0.6857696175575256, + "learning_rate": 9.175947525372355e-06, + "loss": 0.8491, + "step": 6817 + }, + { + "epoch": 0.37525455446089495, + "grad_norm": 0.6882391571998596, + "learning_rate": 9.175709119398409e-06, + "loss": 0.7797, + "step": 6818 + }, + { + "epoch": 0.3753095932632506, + "grad_norm": 0.7788485288619995, + "learning_rate": 9.17547068204095e-06, + "loss": 0.6898, + "step": 6819 + }, + { + "epoch": 0.3753646320656063, + "grad_norm": 0.8529300093650818, + "learning_rate": 9.17523221330177e-06, + "loss": 0.8113, + "step": 6820 + }, + { + "epoch": 0.3754196708679619, + "grad_norm": 0.6297540068626404, + "learning_rate": 9.174993713182663e-06, + "loss": 0.7133, + "step": 6821 + }, + { + "epoch": 0.3754747096703176, + "grad_norm": 0.8225051760673523, + "learning_rate": 9.174755181685422e-06, + "loss": 0.83, + "step": 6822 + }, + { + "epoch": 0.37552974847267323, + "grad_norm": 0.7445290684700012, + "learning_rate": 9.174516618811838e-06, + "loss": 0.8597, + "step": 6823 + }, + { + "epoch": 0.3755847872750289, + "grad_norm": 0.7890744209289551, + "learning_rate": 9.174278024563706e-06, + "loss": 0.8021, + "step": 6824 + }, + { + "epoch": 0.37563982607738455, + "grad_norm": 0.644434928894043, + "learning_rate": 9.174039398942815e-06, + "loss": 0.7154, + "step": 6825 + }, + { + "epoch": 0.37569486487974024, + "grad_norm": 0.7664980292320251, + "learning_rate": 9.173800741950962e-06, + "loss": 0.8496, + "step": 6826 + }, + { + "epoch": 0.3757499036820959, + "grad_norm": 0.8062339425086975, + "learning_rate": 9.173562053589942e-06, + "loss": 0.7736, + "step": 6827 + }, + { + "epoch": 0.37580494248445157, + "grad_norm": 0.6334213018417358, + "learning_rate": 9.173323333861543e-06, + "loss": 0.6513, + "step": 6828 + }, + { + "epoch": 0.3758599812868072, + "grad_norm": 0.6825501322746277, + "learning_rate": 9.173084582767567e-06, + "loss": 0.755, + "step": 6829 + }, + { + "epoch": 0.37591502008916283, + "grad_norm": 0.7353835105895996, + "learning_rate": 9.172845800309801e-06, + "loss": 0.7783, + "step": 6830 + }, + { + "epoch": 0.3759700588915185, + "grad_norm": 0.7830193638801575, + "learning_rate": 9.172606986490046e-06, + "loss": 0.7352, + "step": 6831 + }, + { + "epoch": 0.37602509769387416, + "grad_norm": 0.7464943528175354, + "learning_rate": 9.172368141310091e-06, + "loss": 0.6454, + "step": 6832 + }, + { + "epoch": 0.37608013649622984, + "grad_norm": 0.7171493172645569, + "learning_rate": 9.172129264771736e-06, + "loss": 0.7978, + "step": 6833 + }, + { + "epoch": 0.3761351752985855, + "grad_norm": 0.6929624676704407, + "learning_rate": 9.171890356876774e-06, + "loss": 0.8026, + "step": 6834 + }, + { + "epoch": 0.37619021410094117, + "grad_norm": 0.7240758538246155, + "learning_rate": 9.171651417627e-06, + "loss": 0.8469, + "step": 6835 + }, + { + "epoch": 0.3762452529032968, + "grad_norm": 0.7713736891746521, + "learning_rate": 9.17141244702421e-06, + "loss": 0.8307, + "step": 6836 + }, + { + "epoch": 0.3763002917056525, + "grad_norm": 0.7417639493942261, + "learning_rate": 9.171173445070203e-06, + "loss": 0.8165, + "step": 6837 + }, + { + "epoch": 0.3763553305080081, + "grad_norm": 0.811005711555481, + "learning_rate": 9.17093441176677e-06, + "loss": 0.8418, + "step": 6838 + }, + { + "epoch": 0.3764103693103638, + "grad_norm": 0.9996818900108337, + "learning_rate": 9.170695347115713e-06, + "loss": 0.851, + "step": 6839 + }, + { + "epoch": 0.37646540811271945, + "grad_norm": 0.7703381776809692, + "learning_rate": 9.170456251118824e-06, + "loss": 0.8308, + "step": 6840 + }, + { + "epoch": 0.37652044691507514, + "grad_norm": 0.7194466590881348, + "learning_rate": 9.170217123777904e-06, + "loss": 0.699, + "step": 6841 + }, + { + "epoch": 0.37657548571743077, + "grad_norm": 0.7146462202072144, + "learning_rate": 9.169977965094748e-06, + "loss": 0.8247, + "step": 6842 + }, + { + "epoch": 0.37663052451978646, + "grad_norm": 0.7490555047988892, + "learning_rate": 9.169738775071153e-06, + "loss": 0.8627, + "step": 6843 + }, + { + "epoch": 0.3766855633221421, + "grad_norm": 0.827996015548706, + "learning_rate": 9.169499553708919e-06, + "loss": 0.7454, + "step": 6844 + }, + { + "epoch": 0.3767406021244978, + "grad_norm": 0.7185913324356079, + "learning_rate": 9.16926030100984e-06, + "loss": 0.7018, + "step": 6845 + }, + { + "epoch": 0.3767956409268534, + "grad_norm": 0.7879654169082642, + "learning_rate": 9.169021016975718e-06, + "loss": 0.8144, + "step": 6846 + }, + { + "epoch": 0.3768506797292091, + "grad_norm": 0.7072417736053467, + "learning_rate": 9.168781701608352e-06, + "loss": 0.7572, + "step": 6847 + }, + { + "epoch": 0.37690571853156474, + "grad_norm": 0.7359803915023804, + "learning_rate": 9.168542354909536e-06, + "loss": 0.7712, + "step": 6848 + }, + { + "epoch": 0.3769607573339204, + "grad_norm": 0.7672479748725891, + "learning_rate": 9.168302976881072e-06, + "loss": 0.7696, + "step": 6849 + }, + { + "epoch": 0.37701579613627606, + "grad_norm": 0.7276006937026978, + "learning_rate": 9.168063567524758e-06, + "loss": 0.8235, + "step": 6850 + }, + { + "epoch": 0.37707083493863175, + "grad_norm": 0.673577606678009, + "learning_rate": 9.167824126842396e-06, + "loss": 0.6515, + "step": 6851 + }, + { + "epoch": 0.3771258737409874, + "grad_norm": 0.7257997989654541, + "learning_rate": 9.167584654835782e-06, + "loss": 0.729, + "step": 6852 + }, + { + "epoch": 0.37718091254334307, + "grad_norm": 0.6655071377754211, + "learning_rate": 9.167345151506717e-06, + "loss": 0.7917, + "step": 6853 + }, + { + "epoch": 0.3772359513456987, + "grad_norm": 0.7603726983070374, + "learning_rate": 9.167105616857002e-06, + "loss": 0.8383, + "step": 6854 + }, + { + "epoch": 0.3772909901480544, + "grad_norm": 0.7066939473152161, + "learning_rate": 9.166866050888437e-06, + "loss": 0.7589, + "step": 6855 + }, + { + "epoch": 0.37734602895041003, + "grad_norm": 0.7002355456352234, + "learning_rate": 9.16662645360282e-06, + "loss": 0.8305, + "step": 6856 + }, + { + "epoch": 0.3774010677527657, + "grad_norm": 0.9499780535697937, + "learning_rate": 9.166386825001957e-06, + "loss": 0.78, + "step": 6857 + }, + { + "epoch": 0.37745610655512135, + "grad_norm": 0.7136938571929932, + "learning_rate": 9.166147165087645e-06, + "loss": 0.7449, + "step": 6858 + }, + { + "epoch": 0.37751114535747704, + "grad_norm": 0.740443766117096, + "learning_rate": 9.165907473861687e-06, + "loss": 0.8228, + "step": 6859 + }, + { + "epoch": 0.3775661841598327, + "grad_norm": 0.7649856209754944, + "learning_rate": 9.165667751325879e-06, + "loss": 0.7762, + "step": 6860 + }, + { + "epoch": 0.37762122296218836, + "grad_norm": 0.743251383304596, + "learning_rate": 9.165427997482032e-06, + "loss": 0.7536, + "step": 6861 + }, + { + "epoch": 0.377676261764544, + "grad_norm": 0.7023851871490479, + "learning_rate": 9.165188212331941e-06, + "loss": 0.7327, + "step": 6862 + }, + { + "epoch": 0.3777313005668997, + "grad_norm": 0.7304333448410034, + "learning_rate": 9.164948395877411e-06, + "loss": 0.8816, + "step": 6863 + }, + { + "epoch": 0.3777863393692553, + "grad_norm": 0.6666659116744995, + "learning_rate": 9.164708548120244e-06, + "loss": 0.7821, + "step": 6864 + }, + { + "epoch": 0.377841378171611, + "grad_norm": 0.6542865037918091, + "learning_rate": 9.164468669062242e-06, + "loss": 0.7044, + "step": 6865 + }, + { + "epoch": 0.37789641697396664, + "grad_norm": 0.7436043620109558, + "learning_rate": 9.16422875870521e-06, + "loss": 0.8492, + "step": 6866 + }, + { + "epoch": 0.37795145577632233, + "grad_norm": 0.7660424709320068, + "learning_rate": 9.163988817050947e-06, + "loss": 0.7236, + "step": 6867 + }, + { + "epoch": 0.37800649457867797, + "grad_norm": 0.7288914918899536, + "learning_rate": 9.16374884410126e-06, + "loss": 0.6361, + "step": 6868 + }, + { + "epoch": 0.37806153338103365, + "grad_norm": 0.884832501411438, + "learning_rate": 9.163508839857948e-06, + "loss": 0.8112, + "step": 6869 + }, + { + "epoch": 0.3781165721833893, + "grad_norm": 0.937660813331604, + "learning_rate": 9.163268804322822e-06, + "loss": 0.6405, + "step": 6870 + }, + { + "epoch": 0.378171610985745, + "grad_norm": 0.8295212388038635, + "learning_rate": 9.16302873749768e-06, + "loss": 0.8107, + "step": 6871 + }, + { + "epoch": 0.3782266497881006, + "grad_norm": 1.0573647022247314, + "learning_rate": 9.16278863938433e-06, + "loss": 0.7792, + "step": 6872 + }, + { + "epoch": 0.37828168859045624, + "grad_norm": 0.8450027108192444, + "learning_rate": 9.162548509984574e-06, + "loss": 0.8103, + "step": 6873 + }, + { + "epoch": 0.37833672739281193, + "grad_norm": 0.7372947931289673, + "learning_rate": 9.162308349300218e-06, + "loss": 0.8232, + "step": 6874 + }, + { + "epoch": 0.37839176619516757, + "grad_norm": 0.7573776841163635, + "learning_rate": 9.162068157333066e-06, + "loss": 0.773, + "step": 6875 + }, + { + "epoch": 0.37844680499752326, + "grad_norm": 0.7883201241493225, + "learning_rate": 9.161827934084924e-06, + "loss": 0.7561, + "step": 6876 + }, + { + "epoch": 0.3785018437998789, + "grad_norm": 0.7195025086402893, + "learning_rate": 9.161587679557598e-06, + "loss": 0.798, + "step": 6877 + }, + { + "epoch": 0.3785568826022346, + "grad_norm": 0.7047843337059021, + "learning_rate": 9.161347393752891e-06, + "loss": 0.8122, + "step": 6878 + }, + { + "epoch": 0.3786119214045902, + "grad_norm": 0.7354363203048706, + "learning_rate": 9.161107076672613e-06, + "loss": 0.7296, + "step": 6879 + }, + { + "epoch": 0.3786669602069459, + "grad_norm": 0.7748313546180725, + "learning_rate": 9.160866728318567e-06, + "loss": 0.9576, + "step": 6880 + }, + { + "epoch": 0.37872199900930154, + "grad_norm": 0.7197638750076294, + "learning_rate": 9.16062634869256e-06, + "loss": 0.8054, + "step": 6881 + }, + { + "epoch": 0.3787770378116572, + "grad_norm": 0.7086492776870728, + "learning_rate": 9.1603859377964e-06, + "loss": 0.8938, + "step": 6882 + }, + { + "epoch": 0.37883207661401286, + "grad_norm": 0.7764425873756409, + "learning_rate": 9.160145495631894e-06, + "loss": 0.7562, + "step": 6883 + }, + { + "epoch": 0.37888711541636855, + "grad_norm": 0.7673479914665222, + "learning_rate": 9.159905022200846e-06, + "loss": 0.6783, + "step": 6884 + }, + { + "epoch": 0.3789421542187242, + "grad_norm": 0.7323669195175171, + "learning_rate": 9.159664517505067e-06, + "loss": 0.8274, + "step": 6885 + }, + { + "epoch": 0.37899719302107987, + "grad_norm": 0.8283136487007141, + "learning_rate": 9.159423981546362e-06, + "loss": 0.7184, + "step": 6886 + }, + { + "epoch": 0.3790522318234355, + "grad_norm": 0.6949145793914795, + "learning_rate": 9.15918341432654e-06, + "loss": 0.7843, + "step": 6887 + }, + { + "epoch": 0.3791072706257912, + "grad_norm": 0.8584639430046082, + "learning_rate": 9.158942815847408e-06, + "loss": 0.71, + "step": 6888 + }, + { + "epoch": 0.3791623094281468, + "grad_norm": 0.7125271558761597, + "learning_rate": 9.158702186110777e-06, + "loss": 0.7432, + "step": 6889 + }, + { + "epoch": 0.3792173482305025, + "grad_norm": 0.6657430529594421, + "learning_rate": 9.158461525118452e-06, + "loss": 0.6715, + "step": 6890 + }, + { + "epoch": 0.37927238703285815, + "grad_norm": 0.770226240158081, + "learning_rate": 9.158220832872243e-06, + "loss": 0.7029, + "step": 6891 + }, + { + "epoch": 0.37932742583521384, + "grad_norm": 0.7697272300720215, + "learning_rate": 9.15798010937396e-06, + "loss": 0.686, + "step": 6892 + }, + { + "epoch": 0.37938246463756947, + "grad_norm": 0.7693290710449219, + "learning_rate": 9.157739354625413e-06, + "loss": 0.7669, + "step": 6893 + }, + { + "epoch": 0.37943750343992516, + "grad_norm": 0.8365996479988098, + "learning_rate": 9.157498568628406e-06, + "loss": 0.8254, + "step": 6894 + }, + { + "epoch": 0.3794925422422808, + "grad_norm": 0.8075883388519287, + "learning_rate": 9.157257751384756e-06, + "loss": 0.8311, + "step": 6895 + }, + { + "epoch": 0.3795475810446365, + "grad_norm": 0.8422812819480896, + "learning_rate": 9.15701690289627e-06, + "loss": 0.9173, + "step": 6896 + }, + { + "epoch": 0.3796026198469921, + "grad_norm": 0.7930355072021484, + "learning_rate": 9.156776023164755e-06, + "loss": 0.9376, + "step": 6897 + }, + { + "epoch": 0.3796576586493478, + "grad_norm": 0.7877563238143921, + "learning_rate": 9.156535112192026e-06, + "loss": 0.8358, + "step": 6898 + }, + { + "epoch": 0.37971269745170344, + "grad_norm": 0.7712885141372681, + "learning_rate": 9.156294169979891e-06, + "loss": 0.8781, + "step": 6899 + }, + { + "epoch": 0.37976773625405913, + "grad_norm": 0.6953728199005127, + "learning_rate": 9.156053196530162e-06, + "loss": 0.7861, + "step": 6900 + }, + { + "epoch": 0.37982277505641476, + "grad_norm": 0.9581564664840698, + "learning_rate": 9.155812191844649e-06, + "loss": 0.8294, + "step": 6901 + }, + { + "epoch": 0.37987781385877045, + "grad_norm": 0.738571286201477, + "learning_rate": 9.155571155925166e-06, + "loss": 0.7998, + "step": 6902 + }, + { + "epoch": 0.3799328526611261, + "grad_norm": 0.7059765458106995, + "learning_rate": 9.155330088773519e-06, + "loss": 0.7877, + "step": 6903 + }, + { + "epoch": 0.3799878914634818, + "grad_norm": 0.8572642207145691, + "learning_rate": 9.155088990391527e-06, + "loss": 0.7333, + "step": 6904 + }, + { + "epoch": 0.3800429302658374, + "grad_norm": 0.7442637085914612, + "learning_rate": 9.154847860780996e-06, + "loss": 0.685, + "step": 6905 + }, + { + "epoch": 0.3800979690681931, + "grad_norm": 0.7787682414054871, + "learning_rate": 9.154606699943741e-06, + "loss": 0.7893, + "step": 6906 + }, + { + "epoch": 0.38015300787054873, + "grad_norm": 0.8973822593688965, + "learning_rate": 9.154365507881574e-06, + "loss": 0.8297, + "step": 6907 + }, + { + "epoch": 0.3802080466729044, + "grad_norm": 0.7759919166564941, + "learning_rate": 9.154124284596311e-06, + "loss": 0.8257, + "step": 6908 + }, + { + "epoch": 0.38026308547526005, + "grad_norm": 0.8042850494384766, + "learning_rate": 9.153883030089759e-06, + "loss": 0.8024, + "step": 6909 + }, + { + "epoch": 0.38031812427761574, + "grad_norm": 0.8285790085792542, + "learning_rate": 9.153641744363733e-06, + "loss": 0.7824, + "step": 6910 + }, + { + "epoch": 0.3803731630799714, + "grad_norm": 0.7225445508956909, + "learning_rate": 9.15340042742005e-06, + "loss": 0.8065, + "step": 6911 + }, + { + "epoch": 0.38042820188232707, + "grad_norm": 0.7685298919677734, + "learning_rate": 9.15315907926052e-06, + "loss": 0.8151, + "step": 6912 + }, + { + "epoch": 0.3804832406846827, + "grad_norm": 0.9005589485168457, + "learning_rate": 9.152917699886958e-06, + "loss": 0.8413, + "step": 6913 + }, + { + "epoch": 0.3805382794870384, + "grad_norm": 0.8715279698371887, + "learning_rate": 9.152676289301178e-06, + "loss": 0.7233, + "step": 6914 + }, + { + "epoch": 0.380593318289394, + "grad_norm": 0.8764133453369141, + "learning_rate": 9.152434847504996e-06, + "loss": 0.783, + "step": 6915 + }, + { + "epoch": 0.38064835709174966, + "grad_norm": 0.6847019195556641, + "learning_rate": 9.152193374500225e-06, + "loss": 0.7133, + "step": 6916 + }, + { + "epoch": 0.38070339589410535, + "grad_norm": 0.7562721371650696, + "learning_rate": 9.151951870288678e-06, + "loss": 0.8155, + "step": 6917 + }, + { + "epoch": 0.380758434696461, + "grad_norm": 0.6888439059257507, + "learning_rate": 9.151710334872173e-06, + "loss": 0.6395, + "step": 6918 + }, + { + "epoch": 0.38081347349881667, + "grad_norm": 1.0951511859893799, + "learning_rate": 9.151468768252525e-06, + "loss": 0.8936, + "step": 6919 + }, + { + "epoch": 0.3808685123011723, + "grad_norm": 0.7261115908622742, + "learning_rate": 9.151227170431549e-06, + "loss": 0.7864, + "step": 6920 + }, + { + "epoch": 0.380923551103528, + "grad_norm": 1.2851859331130981, + "learning_rate": 9.150985541411061e-06, + "loss": 0.9419, + "step": 6921 + }, + { + "epoch": 0.3809785899058836, + "grad_norm": 0.7621721625328064, + "learning_rate": 9.150743881192876e-06, + "loss": 0.7773, + "step": 6922 + }, + { + "epoch": 0.3810336287082393, + "grad_norm": 0.7605605721473694, + "learning_rate": 9.150502189778811e-06, + "loss": 0.8752, + "step": 6923 + }, + { + "epoch": 0.38108866751059495, + "grad_norm": 0.8422327041625977, + "learning_rate": 9.150260467170683e-06, + "loss": 0.8555, + "step": 6924 + }, + { + "epoch": 0.38114370631295064, + "grad_norm": 0.7227829098701477, + "learning_rate": 9.15001871337031e-06, + "loss": 0.7637, + "step": 6925 + }, + { + "epoch": 0.38119874511530627, + "grad_norm": 0.6568942666053772, + "learning_rate": 9.149776928379506e-06, + "loss": 0.6944, + "step": 6926 + }, + { + "epoch": 0.38125378391766196, + "grad_norm": 0.9317567944526672, + "learning_rate": 9.149535112200087e-06, + "loss": 0.8098, + "step": 6927 + }, + { + "epoch": 0.3813088227200176, + "grad_norm": 0.6374759674072266, + "learning_rate": 9.149293264833877e-06, + "loss": 0.6654, + "step": 6928 + }, + { + "epoch": 0.3813638615223733, + "grad_norm": 0.7276837825775146, + "learning_rate": 9.149051386282685e-06, + "loss": 0.7728, + "step": 6929 + }, + { + "epoch": 0.3814189003247289, + "grad_norm": 0.7573683261871338, + "learning_rate": 9.148809476548337e-06, + "loss": 0.7681, + "step": 6930 + }, + { + "epoch": 0.3814739391270846, + "grad_norm": 0.7535703778266907, + "learning_rate": 9.148567535632647e-06, + "loss": 0.8498, + "step": 6931 + }, + { + "epoch": 0.38152897792944024, + "grad_norm": 0.7510126233100891, + "learning_rate": 9.148325563537432e-06, + "loss": 0.7874, + "step": 6932 + }, + { + "epoch": 0.3815840167317959, + "grad_norm": 0.7809224724769592, + "learning_rate": 9.148083560264515e-06, + "loss": 0.7223, + "step": 6933 + }, + { + "epoch": 0.38163905553415156, + "grad_norm": 0.7433155179023743, + "learning_rate": 9.14784152581571e-06, + "loss": 0.7914, + "step": 6934 + }, + { + "epoch": 0.38169409433650725, + "grad_norm": 0.7142858505249023, + "learning_rate": 9.14759946019284e-06, + "loss": 0.781, + "step": 6935 + }, + { + "epoch": 0.3817491331388629, + "grad_norm": 0.7910202741622925, + "learning_rate": 9.147357363397721e-06, + "loss": 0.755, + "step": 6936 + }, + { + "epoch": 0.3818041719412186, + "grad_norm": 1.007727026939392, + "learning_rate": 9.147115235432176e-06, + "loss": 0.7809, + "step": 6937 + }, + { + "epoch": 0.3818592107435742, + "grad_norm": 0.7227005362510681, + "learning_rate": 9.146873076298024e-06, + "loss": 0.7276, + "step": 6938 + }, + { + "epoch": 0.3819142495459299, + "grad_norm": 0.6945967674255371, + "learning_rate": 9.146630885997081e-06, + "loss": 0.825, + "step": 6939 + }, + { + "epoch": 0.38196928834828553, + "grad_norm": 0.6719669103622437, + "learning_rate": 9.146388664531172e-06, + "loss": 0.6486, + "step": 6940 + }, + { + "epoch": 0.3820243271506412, + "grad_norm": 0.7528467178344727, + "learning_rate": 9.146146411902115e-06, + "loss": 0.8143, + "step": 6941 + }, + { + "epoch": 0.38207936595299685, + "grad_norm": 0.6835548877716064, + "learning_rate": 9.145904128111732e-06, + "loss": 0.7742, + "step": 6942 + }, + { + "epoch": 0.38213440475535254, + "grad_norm": 0.7829870581626892, + "learning_rate": 9.145661813161844e-06, + "loss": 0.8147, + "step": 6943 + }, + { + "epoch": 0.3821894435577082, + "grad_norm": 0.6833155155181885, + "learning_rate": 9.145419467054271e-06, + "loss": 0.7615, + "step": 6944 + }, + { + "epoch": 0.38224448236006386, + "grad_norm": 0.7577275037765503, + "learning_rate": 9.145177089790833e-06, + "loss": 0.8611, + "step": 6945 + }, + { + "epoch": 0.3822995211624195, + "grad_norm": 0.7102984189987183, + "learning_rate": 9.144934681373356e-06, + "loss": 0.8373, + "step": 6946 + }, + { + "epoch": 0.3823545599647752, + "grad_norm": 0.6906121373176575, + "learning_rate": 9.144692241803658e-06, + "loss": 0.8314, + "step": 6947 + }, + { + "epoch": 0.3824095987671308, + "grad_norm": 0.7790967226028442, + "learning_rate": 9.144449771083563e-06, + "loss": 0.8285, + "step": 6948 + }, + { + "epoch": 0.3824646375694865, + "grad_norm": 0.8420237898826599, + "learning_rate": 9.144207269214893e-06, + "loss": 0.8159, + "step": 6949 + }, + { + "epoch": 0.38251967637184214, + "grad_norm": 0.7944310307502747, + "learning_rate": 9.143964736199471e-06, + "loss": 0.7981, + "step": 6950 + }, + { + "epoch": 0.38257471517419783, + "grad_norm": 0.7610076069831848, + "learning_rate": 9.14372217203912e-06, + "loss": 0.8011, + "step": 6951 + }, + { + "epoch": 0.38262975397655347, + "grad_norm": 0.7183333039283752, + "learning_rate": 9.143479576735661e-06, + "loss": 0.7504, + "step": 6952 + }, + { + "epoch": 0.38268479277890916, + "grad_norm": 0.7363573312759399, + "learning_rate": 9.14323695029092e-06, + "loss": 0.7561, + "step": 6953 + }, + { + "epoch": 0.3827398315812648, + "grad_norm": 0.7330427765846252, + "learning_rate": 9.142994292706716e-06, + "loss": 0.754, + "step": 6954 + }, + { + "epoch": 0.3827948703836205, + "grad_norm": 0.8307509422302246, + "learning_rate": 9.142751603984879e-06, + "loss": 0.8059, + "step": 6955 + }, + { + "epoch": 0.3828499091859761, + "grad_norm": 0.7340347766876221, + "learning_rate": 9.142508884127228e-06, + "loss": 0.8636, + "step": 6956 + }, + { + "epoch": 0.3829049479883318, + "grad_norm": 0.7032678127288818, + "learning_rate": 9.14226613313559e-06, + "loss": 0.8237, + "step": 6957 + }, + { + "epoch": 0.38295998679068743, + "grad_norm": 0.769809365272522, + "learning_rate": 9.142023351011788e-06, + "loss": 0.7523, + "step": 6958 + }, + { + "epoch": 0.38301502559304307, + "grad_norm": 0.7446833252906799, + "learning_rate": 9.141780537757647e-06, + "loss": 0.8382, + "step": 6959 + }, + { + "epoch": 0.38307006439539876, + "grad_norm": 0.6926285028457642, + "learning_rate": 9.141537693374994e-06, + "loss": 0.7997, + "step": 6960 + }, + { + "epoch": 0.3831251031977544, + "grad_norm": 0.7303034067153931, + "learning_rate": 9.141294817865651e-06, + "loss": 0.794, + "step": 6961 + }, + { + "epoch": 0.3831801420001101, + "grad_norm": 0.7453297972679138, + "learning_rate": 9.141051911231445e-06, + "loss": 0.7031, + "step": 6962 + }, + { + "epoch": 0.3832351808024657, + "grad_norm": 0.8503912091255188, + "learning_rate": 9.140808973474201e-06, + "loss": 0.7855, + "step": 6963 + }, + { + "epoch": 0.3832902196048214, + "grad_norm": 0.7304036617279053, + "learning_rate": 9.140566004595746e-06, + "loss": 0.7062, + "step": 6964 + }, + { + "epoch": 0.38334525840717704, + "grad_norm": 0.7534968852996826, + "learning_rate": 9.140323004597904e-06, + "loss": 0.8138, + "step": 6965 + }, + { + "epoch": 0.3834002972095327, + "grad_norm": 0.8122013807296753, + "learning_rate": 9.140079973482503e-06, + "loss": 0.7769, + "step": 6966 + }, + { + "epoch": 0.38345533601188836, + "grad_norm": 0.7345744967460632, + "learning_rate": 9.13983691125137e-06, + "loss": 0.7588, + "step": 6967 + }, + { + "epoch": 0.38351037481424405, + "grad_norm": 0.7251620292663574, + "learning_rate": 9.13959381790633e-06, + "loss": 0.8027, + "step": 6968 + }, + { + "epoch": 0.3835654136165997, + "grad_norm": 0.7157594561576843, + "learning_rate": 9.139350693449212e-06, + "loss": 0.7233, + "step": 6969 + }, + { + "epoch": 0.38362045241895537, + "grad_norm": 0.8076621890068054, + "learning_rate": 9.139107537881842e-06, + "loss": 0.7256, + "step": 6970 + }, + { + "epoch": 0.383675491221311, + "grad_norm": 0.717182993888855, + "learning_rate": 9.138864351206047e-06, + "loss": 0.7003, + "step": 6971 + }, + { + "epoch": 0.3837305300236667, + "grad_norm": 0.7534194588661194, + "learning_rate": 9.138621133423656e-06, + "loss": 0.7315, + "step": 6972 + }, + { + "epoch": 0.3837855688260223, + "grad_norm": 0.6400160193443298, + "learning_rate": 9.138377884536494e-06, + "loss": 0.6814, + "step": 6973 + }, + { + "epoch": 0.383840607628378, + "grad_norm": 0.7319507002830505, + "learning_rate": 9.138134604546394e-06, + "loss": 0.7942, + "step": 6974 + }, + { + "epoch": 0.38389564643073365, + "grad_norm": 0.7109829783439636, + "learning_rate": 9.137891293455181e-06, + "loss": 0.7528, + "step": 6975 + }, + { + "epoch": 0.38395068523308934, + "grad_norm": 1.006724238395691, + "learning_rate": 9.137647951264685e-06, + "loss": 0.7652, + "step": 6976 + }, + { + "epoch": 0.384005724035445, + "grad_norm": 0.7080540060997009, + "learning_rate": 9.137404577976736e-06, + "loss": 0.7706, + "step": 6977 + }, + { + "epoch": 0.38406076283780066, + "grad_norm": 0.7551368474960327, + "learning_rate": 9.137161173593161e-06, + "loss": 0.8202, + "step": 6978 + }, + { + "epoch": 0.3841158016401563, + "grad_norm": 0.6624314785003662, + "learning_rate": 9.13691773811579e-06, + "loss": 0.7258, + "step": 6979 + }, + { + "epoch": 0.384170840442512, + "grad_norm": 0.9603848457336426, + "learning_rate": 9.136674271546451e-06, + "loss": 0.9415, + "step": 6980 + }, + { + "epoch": 0.3842258792448676, + "grad_norm": 0.6964829564094543, + "learning_rate": 9.136430773886977e-06, + "loss": 0.7604, + "step": 6981 + }, + { + "epoch": 0.3842809180472233, + "grad_norm": 0.6503588557243347, + "learning_rate": 9.136187245139197e-06, + "loss": 0.7141, + "step": 6982 + }, + { + "epoch": 0.38433595684957894, + "grad_norm": 0.9179829359054565, + "learning_rate": 9.13594368530494e-06, + "loss": 0.7619, + "step": 6983 + }, + { + "epoch": 0.38439099565193463, + "grad_norm": 0.7993278503417969, + "learning_rate": 9.135700094386038e-06, + "loss": 0.832, + "step": 6984 + }, + { + "epoch": 0.38444603445429026, + "grad_norm": 0.8136988282203674, + "learning_rate": 9.13545647238432e-06, + "loss": 0.8127, + "step": 6985 + }, + { + "epoch": 0.38450107325664595, + "grad_norm": 0.9918104410171509, + "learning_rate": 9.135212819301619e-06, + "loss": 0.836, + "step": 6986 + }, + { + "epoch": 0.3845561120590016, + "grad_norm": 0.7767511010169983, + "learning_rate": 9.134969135139765e-06, + "loss": 0.8391, + "step": 6987 + }, + { + "epoch": 0.3846111508613573, + "grad_norm": 0.6889285445213318, + "learning_rate": 9.134725419900589e-06, + "loss": 0.7639, + "step": 6988 + }, + { + "epoch": 0.3846661896637129, + "grad_norm": 1.803467035293579, + "learning_rate": 9.134481673585924e-06, + "loss": 0.7629, + "step": 6989 + }, + { + "epoch": 0.3847212284660686, + "grad_norm": 0.721581757068634, + "learning_rate": 9.134237896197603e-06, + "loss": 0.8194, + "step": 6990 + }, + { + "epoch": 0.38477626726842423, + "grad_norm": 0.8163189888000488, + "learning_rate": 9.133994087737456e-06, + "loss": 0.7789, + "step": 6991 + }, + { + "epoch": 0.3848313060707799, + "grad_norm": 0.7518420815467834, + "learning_rate": 9.133750248207315e-06, + "loss": 0.7529, + "step": 6992 + }, + { + "epoch": 0.38488634487313556, + "grad_norm": 0.7318000197410583, + "learning_rate": 9.133506377609015e-06, + "loss": 0.7829, + "step": 6993 + }, + { + "epoch": 0.38494138367549124, + "grad_norm": 0.7765058875083923, + "learning_rate": 9.133262475944386e-06, + "loss": 0.7902, + "step": 6994 + }, + { + "epoch": 0.3849964224778469, + "grad_norm": 0.845567524433136, + "learning_rate": 9.133018543215265e-06, + "loss": 0.8117, + "step": 6995 + }, + { + "epoch": 0.38505146128020257, + "grad_norm": 0.7081887125968933, + "learning_rate": 9.13277457942348e-06, + "loss": 0.8131, + "step": 6996 + }, + { + "epoch": 0.3851065000825582, + "grad_norm": 0.7447869777679443, + "learning_rate": 9.132530584570869e-06, + "loss": 0.7765, + "step": 6997 + }, + { + "epoch": 0.3851615388849139, + "grad_norm": 0.8554795384407043, + "learning_rate": 9.132286558659265e-06, + "loss": 0.8966, + "step": 6998 + }, + { + "epoch": 0.3852165776872695, + "grad_norm": 0.7117023468017578, + "learning_rate": 9.1320425016905e-06, + "loss": 0.7461, + "step": 6999 + }, + { + "epoch": 0.3852716164896252, + "grad_norm": 0.6965934038162231, + "learning_rate": 9.131798413666411e-06, + "loss": 0.6827, + "step": 7000 + }, + { + "epoch": 0.38532665529198085, + "grad_norm": 0.7449018359184265, + "learning_rate": 9.13155429458883e-06, + "loss": 0.7562, + "step": 7001 + }, + { + "epoch": 0.3853816940943365, + "grad_norm": 0.7764221429824829, + "learning_rate": 9.131310144459593e-06, + "loss": 0.7842, + "step": 7002 + }, + { + "epoch": 0.38543673289669217, + "grad_norm": 0.9788658618927002, + "learning_rate": 9.131065963280536e-06, + "loss": 0.7857, + "step": 7003 + }, + { + "epoch": 0.3854917716990478, + "grad_norm": 0.7900908589363098, + "learning_rate": 9.13082175105349e-06, + "loss": 0.8733, + "step": 7004 + }, + { + "epoch": 0.3855468105014035, + "grad_norm": 0.814822793006897, + "learning_rate": 9.130577507780298e-06, + "loss": 0.8032, + "step": 7005 + }, + { + "epoch": 0.3856018493037591, + "grad_norm": 1.0648475885391235, + "learning_rate": 9.130333233462789e-06, + "loss": 0.8078, + "step": 7006 + }, + { + "epoch": 0.3856568881061148, + "grad_norm": 0.7359917163848877, + "learning_rate": 9.130088928102799e-06, + "loss": 0.6491, + "step": 7007 + }, + { + "epoch": 0.38571192690847045, + "grad_norm": 0.7321771383285522, + "learning_rate": 9.129844591702169e-06, + "loss": 0.7663, + "step": 7008 + }, + { + "epoch": 0.38576696571082614, + "grad_norm": 0.6937146186828613, + "learning_rate": 9.129600224262732e-06, + "loss": 0.7835, + "step": 7009 + }, + { + "epoch": 0.38582200451318177, + "grad_norm": 0.7330107688903809, + "learning_rate": 9.129355825786323e-06, + "loss": 0.7626, + "step": 7010 + }, + { + "epoch": 0.38587704331553746, + "grad_norm": 0.7021715044975281, + "learning_rate": 9.129111396274783e-06, + "loss": 0.7115, + "step": 7011 + }, + { + "epoch": 0.3859320821178931, + "grad_norm": 0.6599563360214233, + "learning_rate": 9.128866935729947e-06, + "loss": 0.6554, + "step": 7012 + }, + { + "epoch": 0.3859871209202488, + "grad_norm": 0.7323513031005859, + "learning_rate": 9.128622444153652e-06, + "loss": 0.7392, + "step": 7013 + }, + { + "epoch": 0.3860421597226044, + "grad_norm": 0.681888222694397, + "learning_rate": 9.128377921547736e-06, + "loss": 0.7474, + "step": 7014 + }, + { + "epoch": 0.3860971985249601, + "grad_norm": 0.8454889059066772, + "learning_rate": 9.128133367914036e-06, + "loss": 0.8355, + "step": 7015 + }, + { + "epoch": 0.38615223732731574, + "grad_norm": 0.7514123916625977, + "learning_rate": 9.12788878325439e-06, + "loss": 0.7683, + "step": 7016 + }, + { + "epoch": 0.38620727612967143, + "grad_norm": 0.7317092418670654, + "learning_rate": 9.12764416757064e-06, + "loss": 0.7201, + "step": 7017 + }, + { + "epoch": 0.38626231493202706, + "grad_norm": 0.7626729011535645, + "learning_rate": 9.127399520864619e-06, + "loss": 0.7701, + "step": 7018 + }, + { + "epoch": 0.38631735373438275, + "grad_norm": 0.9790363311767578, + "learning_rate": 9.127154843138168e-06, + "loss": 0.8034, + "step": 7019 + }, + { + "epoch": 0.3863723925367384, + "grad_norm": 0.663593590259552, + "learning_rate": 9.126910134393125e-06, + "loss": 0.661, + "step": 7020 + }, + { + "epoch": 0.3864274313390941, + "grad_norm": 0.6599924564361572, + "learning_rate": 9.126665394631332e-06, + "loss": 0.7395, + "step": 7021 + }, + { + "epoch": 0.3864824701414497, + "grad_norm": 0.8493411540985107, + "learning_rate": 9.126420623854625e-06, + "loss": 0.8008, + "step": 7022 + }, + { + "epoch": 0.3865375089438054, + "grad_norm": 0.7587194442749023, + "learning_rate": 9.126175822064846e-06, + "loss": 0.7533, + "step": 7023 + }, + { + "epoch": 0.38659254774616103, + "grad_norm": 0.773764431476593, + "learning_rate": 9.125930989263835e-06, + "loss": 0.75, + "step": 7024 + }, + { + "epoch": 0.3866475865485167, + "grad_norm": 0.7126749753952026, + "learning_rate": 9.12568612545343e-06, + "loss": 0.7794, + "step": 7025 + }, + { + "epoch": 0.38670262535087235, + "grad_norm": 0.7404584884643555, + "learning_rate": 9.125441230635472e-06, + "loss": 0.7264, + "step": 7026 + }, + { + "epoch": 0.38675766415322804, + "grad_norm": 0.8057644367218018, + "learning_rate": 9.125196304811804e-06, + "loss": 0.8058, + "step": 7027 + }, + { + "epoch": 0.3868127029555837, + "grad_norm": 0.9586995840072632, + "learning_rate": 9.124951347984263e-06, + "loss": 0.7659, + "step": 7028 + }, + { + "epoch": 0.38686774175793937, + "grad_norm": 0.7567793726921082, + "learning_rate": 9.124706360154693e-06, + "loss": 0.8961, + "step": 7029 + }, + { + "epoch": 0.386922780560295, + "grad_norm": 0.8523182272911072, + "learning_rate": 9.124461341324934e-06, + "loss": 0.8815, + "step": 7030 + }, + { + "epoch": 0.3869778193626507, + "grad_norm": 0.7466379404067993, + "learning_rate": 9.124216291496826e-06, + "loss": 0.7817, + "step": 7031 + }, + { + "epoch": 0.3870328581650063, + "grad_norm": 0.6721325516700745, + "learning_rate": 9.123971210672214e-06, + "loss": 0.7637, + "step": 7032 + }, + { + "epoch": 0.387087896967362, + "grad_norm": 0.6620928049087524, + "learning_rate": 9.123726098852936e-06, + "loss": 0.6956, + "step": 7033 + }, + { + "epoch": 0.38714293576971764, + "grad_norm": 0.6784290671348572, + "learning_rate": 9.12348095604084e-06, + "loss": 0.7034, + "step": 7034 + }, + { + "epoch": 0.38719797457207333, + "grad_norm": 0.7138848304748535, + "learning_rate": 9.123235782237763e-06, + "loss": 0.6037, + "step": 7035 + }, + { + "epoch": 0.38725301337442897, + "grad_norm": 0.8473613858222961, + "learning_rate": 9.122990577445548e-06, + "loss": 0.8157, + "step": 7036 + }, + { + "epoch": 0.38730805217678466, + "grad_norm": 0.835381031036377, + "learning_rate": 9.122745341666041e-06, + "loss": 0.8736, + "step": 7037 + }, + { + "epoch": 0.3873630909791403, + "grad_norm": 0.8823271989822388, + "learning_rate": 9.122500074901083e-06, + "loss": 0.7448, + "step": 7038 + }, + { + "epoch": 0.387418129781496, + "grad_norm": 0.6494244933128357, + "learning_rate": 9.122254777152519e-06, + "loss": 0.7423, + "step": 7039 + }, + { + "epoch": 0.3874731685838516, + "grad_norm": 0.7232181429862976, + "learning_rate": 9.122009448422191e-06, + "loss": 0.8489, + "step": 7040 + }, + { + "epoch": 0.3875282073862073, + "grad_norm": 0.7357699275016785, + "learning_rate": 9.121764088711945e-06, + "loss": 0.8799, + "step": 7041 + }, + { + "epoch": 0.38758324618856294, + "grad_norm": 0.7638574838638306, + "learning_rate": 9.121518698023621e-06, + "loss": 0.8539, + "step": 7042 + }, + { + "epoch": 0.3876382849909186, + "grad_norm": 0.7407062649726868, + "learning_rate": 9.121273276359068e-06, + "loss": 0.7152, + "step": 7043 + }, + { + "epoch": 0.38769332379327426, + "grad_norm": 0.6945983171463013, + "learning_rate": 9.121027823720126e-06, + "loss": 0.8224, + "step": 7044 + }, + { + "epoch": 0.3877483625956299, + "grad_norm": 0.7163639068603516, + "learning_rate": 9.120782340108643e-06, + "loss": 0.808, + "step": 7045 + }, + { + "epoch": 0.3878034013979856, + "grad_norm": 0.7062035799026489, + "learning_rate": 9.120536825526463e-06, + "loss": 0.783, + "step": 7046 + }, + { + "epoch": 0.3878584402003412, + "grad_norm": 0.7459971308708191, + "learning_rate": 9.120291279975431e-06, + "loss": 0.8219, + "step": 7047 + }, + { + "epoch": 0.3879134790026969, + "grad_norm": 0.9016150236129761, + "learning_rate": 9.120045703457394e-06, + "loss": 0.8605, + "step": 7048 + }, + { + "epoch": 0.38796851780505254, + "grad_norm": 0.78440922498703, + "learning_rate": 9.119800095974193e-06, + "loss": 0.8424, + "step": 7049 + }, + { + "epoch": 0.3880235566074082, + "grad_norm": 0.751504123210907, + "learning_rate": 9.119554457527681e-06, + "loss": 0.701, + "step": 7050 + }, + { + "epoch": 0.38807859540976386, + "grad_norm": 0.7540284991264343, + "learning_rate": 9.119308788119698e-06, + "loss": 0.7912, + "step": 7051 + }, + { + "epoch": 0.38813363421211955, + "grad_norm": 0.7977007627487183, + "learning_rate": 9.119063087752094e-06, + "loss": 0.9297, + "step": 7052 + }, + { + "epoch": 0.3881886730144752, + "grad_norm": 0.6923508644104004, + "learning_rate": 9.118817356426715e-06, + "loss": 0.7458, + "step": 7053 + }, + { + "epoch": 0.38824371181683087, + "grad_norm": 0.7170272469520569, + "learning_rate": 9.118571594145406e-06, + "loss": 0.733, + "step": 7054 + }, + { + "epoch": 0.3882987506191865, + "grad_norm": 0.7547701001167297, + "learning_rate": 9.118325800910015e-06, + "loss": 0.7758, + "step": 7055 + }, + { + "epoch": 0.3883537894215422, + "grad_norm": 0.7921421527862549, + "learning_rate": 9.118079976722391e-06, + "loss": 0.8262, + "step": 7056 + }, + { + "epoch": 0.38840882822389783, + "grad_norm": 0.734470784664154, + "learning_rate": 9.117834121584379e-06, + "loss": 0.817, + "step": 7057 + }, + { + "epoch": 0.3884638670262535, + "grad_norm": 0.8106420040130615, + "learning_rate": 9.117588235497829e-06, + "loss": 0.8203, + "step": 7058 + }, + { + "epoch": 0.38851890582860915, + "grad_norm": 0.7355543375015259, + "learning_rate": 9.117342318464588e-06, + "loss": 0.8076, + "step": 7059 + }, + { + "epoch": 0.38857394463096484, + "grad_norm": 0.7665252685546875, + "learning_rate": 9.117096370486504e-06, + "loss": 0.7611, + "step": 7060 + }, + { + "epoch": 0.3886289834333205, + "grad_norm": 0.7968598008155823, + "learning_rate": 9.116850391565426e-06, + "loss": 0.6461, + "step": 7061 + }, + { + "epoch": 0.38868402223567616, + "grad_norm": 0.7187741994857788, + "learning_rate": 9.116604381703203e-06, + "loss": 0.7982, + "step": 7062 + }, + { + "epoch": 0.3887390610380318, + "grad_norm": 0.8566913604736328, + "learning_rate": 9.11635834090168e-06, + "loss": 0.9072, + "step": 7063 + }, + { + "epoch": 0.3887940998403875, + "grad_norm": 0.7120797038078308, + "learning_rate": 9.116112269162714e-06, + "loss": 0.7353, + "step": 7064 + }, + { + "epoch": 0.3888491386427431, + "grad_norm": 0.7230019569396973, + "learning_rate": 9.115866166488148e-06, + "loss": 0.7717, + "step": 7065 + }, + { + "epoch": 0.3889041774450988, + "grad_norm": 0.6650584936141968, + "learning_rate": 9.115620032879833e-06, + "loss": 0.7384, + "step": 7066 + }, + { + "epoch": 0.38895921624745444, + "grad_norm": 0.970750629901886, + "learning_rate": 9.115373868339621e-06, + "loss": 0.8478, + "step": 7067 + }, + { + "epoch": 0.38901425504981013, + "grad_norm": 0.7066280245780945, + "learning_rate": 9.115127672869359e-06, + "loss": 0.7638, + "step": 7068 + }, + { + "epoch": 0.38906929385216577, + "grad_norm": 0.6952232718467712, + "learning_rate": 9.1148814464709e-06, + "loss": 0.7869, + "step": 7069 + }, + { + "epoch": 0.38912433265452145, + "grad_norm": 0.804489254951477, + "learning_rate": 9.114635189146094e-06, + "loss": 0.7905, + "step": 7070 + }, + { + "epoch": 0.3891793714568771, + "grad_norm": 0.6988457441329956, + "learning_rate": 9.114388900896791e-06, + "loss": 0.7107, + "step": 7071 + }, + { + "epoch": 0.3892344102592328, + "grad_norm": 0.6379980444908142, + "learning_rate": 9.114142581724842e-06, + "loss": 0.733, + "step": 7072 + }, + { + "epoch": 0.3892894490615884, + "grad_norm": 0.7238649129867554, + "learning_rate": 9.113896231632098e-06, + "loss": 0.8252, + "step": 7073 + }, + { + "epoch": 0.3893444878639441, + "grad_norm": 0.7168585062026978, + "learning_rate": 9.113649850620412e-06, + "loss": 0.6459, + "step": 7074 + }, + { + "epoch": 0.38939952666629973, + "grad_norm": 0.7315915822982788, + "learning_rate": 9.113403438691634e-06, + "loss": 0.7557, + "step": 7075 + }, + { + "epoch": 0.3894545654686554, + "grad_norm": 0.7438754439353943, + "learning_rate": 9.11315699584762e-06, + "loss": 0.7938, + "step": 7076 + }, + { + "epoch": 0.38950960427101106, + "grad_norm": 0.7497848272323608, + "learning_rate": 9.112910522090215e-06, + "loss": 0.8232, + "step": 7077 + }, + { + "epoch": 0.38956464307336675, + "grad_norm": 0.8072896003723145, + "learning_rate": 9.112664017421277e-06, + "loss": 0.7974, + "step": 7078 + }, + { + "epoch": 0.3896196818757224, + "grad_norm": 0.7255920767784119, + "learning_rate": 9.112417481842657e-06, + "loss": 0.7658, + "step": 7079 + }, + { + "epoch": 0.38967472067807807, + "grad_norm": 0.6263132095336914, + "learning_rate": 9.112170915356209e-06, + "loss": 0.7188, + "step": 7080 + }, + { + "epoch": 0.3897297594804337, + "grad_norm": 0.6817660927772522, + "learning_rate": 9.111924317963785e-06, + "loss": 0.7406, + "step": 7081 + }, + { + "epoch": 0.3897847982827894, + "grad_norm": 0.7829134464263916, + "learning_rate": 9.111677689667238e-06, + "loss": 0.8406, + "step": 7082 + }, + { + "epoch": 0.389839837085145, + "grad_norm": 0.7122843861579895, + "learning_rate": 9.111431030468421e-06, + "loss": 0.7722, + "step": 7083 + }, + { + "epoch": 0.3898948758875007, + "grad_norm": 0.7041764259338379, + "learning_rate": 9.11118434036919e-06, + "loss": 0.8307, + "step": 7084 + }, + { + "epoch": 0.38994991468985635, + "grad_norm": 0.7582009434700012, + "learning_rate": 9.110937619371398e-06, + "loss": 0.7461, + "step": 7085 + }, + { + "epoch": 0.39000495349221204, + "grad_norm": 0.7156100273132324, + "learning_rate": 9.110690867476899e-06, + "loss": 0.7294, + "step": 7086 + }, + { + "epoch": 0.39005999229456767, + "grad_norm": 0.79449063539505, + "learning_rate": 9.110444084687549e-06, + "loss": 0.8652, + "step": 7087 + }, + { + "epoch": 0.3901150310969233, + "grad_norm": 0.7692831754684448, + "learning_rate": 9.1101972710052e-06, + "loss": 0.7899, + "step": 7088 + }, + { + "epoch": 0.390170069899279, + "grad_norm": 0.7189639806747437, + "learning_rate": 9.109950426431708e-06, + "loss": 0.726, + "step": 7089 + }, + { + "epoch": 0.3902251087016346, + "grad_norm": 0.7491177916526794, + "learning_rate": 9.10970355096893e-06, + "loss": 0.8881, + "step": 7090 + }, + { + "epoch": 0.3902801475039903, + "grad_norm": 0.783027172088623, + "learning_rate": 9.10945664461872e-06, + "loss": 0.7728, + "step": 7091 + }, + { + "epoch": 0.39033518630634595, + "grad_norm": 1.0871556997299194, + "learning_rate": 9.109209707382934e-06, + "loss": 0.8059, + "step": 7092 + }, + { + "epoch": 0.39039022510870164, + "grad_norm": 0.7287113666534424, + "learning_rate": 9.108962739263429e-06, + "loss": 0.7896, + "step": 7093 + }, + { + "epoch": 0.39044526391105727, + "grad_norm": 0.7801700234413147, + "learning_rate": 9.108715740262058e-06, + "loss": 0.8012, + "step": 7094 + }, + { + "epoch": 0.39050030271341296, + "grad_norm": 0.846709132194519, + "learning_rate": 9.10846871038068e-06, + "loss": 0.8392, + "step": 7095 + }, + { + "epoch": 0.3905553415157686, + "grad_norm": 0.7408092617988586, + "learning_rate": 9.10822164962115e-06, + "loss": 0.8657, + "step": 7096 + }, + { + "epoch": 0.3906103803181243, + "grad_norm": 0.6748743057250977, + "learning_rate": 9.107974557985328e-06, + "loss": 0.7659, + "step": 7097 + }, + { + "epoch": 0.3906654191204799, + "grad_norm": 0.7512170672416687, + "learning_rate": 9.107727435475067e-06, + "loss": 0.7704, + "step": 7098 + }, + { + "epoch": 0.3907204579228356, + "grad_norm": 0.9039596319198608, + "learning_rate": 9.107480282092227e-06, + "loss": 0.8412, + "step": 7099 + }, + { + "epoch": 0.39077549672519124, + "grad_norm": 0.829785943031311, + "learning_rate": 9.107233097838663e-06, + "loss": 0.8229, + "step": 7100 + }, + { + "epoch": 0.39083053552754693, + "grad_norm": 0.7597842812538147, + "learning_rate": 9.106985882716238e-06, + "loss": 0.7798, + "step": 7101 + }, + { + "epoch": 0.39088557432990256, + "grad_norm": 0.7619945406913757, + "learning_rate": 9.106738636726802e-06, + "loss": 0.7504, + "step": 7102 + }, + { + "epoch": 0.39094061313225825, + "grad_norm": 0.6791092157363892, + "learning_rate": 9.10649135987222e-06, + "loss": 0.8167, + "step": 7103 + }, + { + "epoch": 0.3909956519346139, + "grad_norm": 0.7977412343025208, + "learning_rate": 9.10624405215435e-06, + "loss": 0.8252, + "step": 7104 + }, + { + "epoch": 0.3910506907369696, + "grad_norm": 0.7329283356666565, + "learning_rate": 9.105996713575047e-06, + "loss": 0.7084, + "step": 7105 + }, + { + "epoch": 0.3911057295393252, + "grad_norm": 0.7125133872032166, + "learning_rate": 9.105749344136172e-06, + "loss": 0.6672, + "step": 7106 + }, + { + "epoch": 0.3911607683416809, + "grad_norm": 0.6974679827690125, + "learning_rate": 9.105501943839583e-06, + "loss": 0.7354, + "step": 7107 + }, + { + "epoch": 0.39121580714403653, + "grad_norm": 0.7191265225410461, + "learning_rate": 9.10525451268714e-06, + "loss": 0.8133, + "step": 7108 + }, + { + "epoch": 0.3912708459463922, + "grad_norm": 0.7188206911087036, + "learning_rate": 9.105007050680704e-06, + "loss": 0.7947, + "step": 7109 + }, + { + "epoch": 0.39132588474874785, + "grad_norm": 0.9017364382743835, + "learning_rate": 9.104759557822135e-06, + "loss": 0.7848, + "step": 7110 + }, + { + "epoch": 0.39138092355110354, + "grad_norm": 0.7551164031028748, + "learning_rate": 9.104512034113292e-06, + "loss": 0.8266, + "step": 7111 + }, + { + "epoch": 0.3914359623534592, + "grad_norm": 0.7810001969337463, + "learning_rate": 9.104264479556033e-06, + "loss": 0.7731, + "step": 7112 + }, + { + "epoch": 0.39149100115581487, + "grad_norm": 0.787723183631897, + "learning_rate": 9.104016894152223e-06, + "loss": 0.8008, + "step": 7113 + }, + { + "epoch": 0.3915460399581705, + "grad_norm": 0.7303524017333984, + "learning_rate": 9.103769277903718e-06, + "loss": 0.826, + "step": 7114 + }, + { + "epoch": 0.3916010787605262, + "grad_norm": 0.707759439945221, + "learning_rate": 9.103521630812384e-06, + "loss": 0.6303, + "step": 7115 + }, + { + "epoch": 0.3916561175628818, + "grad_norm": 0.6929940581321716, + "learning_rate": 9.10327395288008e-06, + "loss": 0.733, + "step": 7116 + }, + { + "epoch": 0.3917111563652375, + "grad_norm": 0.7133205533027649, + "learning_rate": 9.103026244108667e-06, + "loss": 0.8421, + "step": 7117 + }, + { + "epoch": 0.39176619516759315, + "grad_norm": 1.2049434185028076, + "learning_rate": 9.102778504500005e-06, + "loss": 0.8618, + "step": 7118 + }, + { + "epoch": 0.39182123396994883, + "grad_norm": 0.7792720198631287, + "learning_rate": 9.10253073405596e-06, + "loss": 0.717, + "step": 7119 + }, + { + "epoch": 0.39187627277230447, + "grad_norm": 0.7234412431716919, + "learning_rate": 9.10228293277839e-06, + "loss": 0.7547, + "step": 7120 + }, + { + "epoch": 0.39193131157466016, + "grad_norm": 0.6845420002937317, + "learning_rate": 9.102035100669162e-06, + "loss": 0.7255, + "step": 7121 + }, + { + "epoch": 0.3919863503770158, + "grad_norm": 0.7446799874305725, + "learning_rate": 9.101787237730135e-06, + "loss": 0.7947, + "step": 7122 + }, + { + "epoch": 0.3920413891793715, + "grad_norm": 0.812924325466156, + "learning_rate": 9.101539343963176e-06, + "loss": 0.843, + "step": 7123 + }, + { + "epoch": 0.3920964279817271, + "grad_norm": 0.7373847365379333, + "learning_rate": 9.101291419370141e-06, + "loss": 0.7703, + "step": 7124 + }, + { + "epoch": 0.3921514667840828, + "grad_norm": 0.8305120468139648, + "learning_rate": 9.101043463952899e-06, + "loss": 0.8904, + "step": 7125 + }, + { + "epoch": 0.39220650558643844, + "grad_norm": 0.7263030409812927, + "learning_rate": 9.100795477713313e-06, + "loss": 0.8319, + "step": 7126 + }, + { + "epoch": 0.3922615443887941, + "grad_norm": 0.8358581066131592, + "learning_rate": 9.100547460653245e-06, + "loss": 0.8305, + "step": 7127 + }, + { + "epoch": 0.39231658319114976, + "grad_norm": 0.6608800292015076, + "learning_rate": 9.10029941277456e-06, + "loss": 0.7815, + "step": 7128 + }, + { + "epoch": 0.39237162199350545, + "grad_norm": 0.8590257167816162, + "learning_rate": 9.100051334079122e-06, + "loss": 0.8292, + "step": 7129 + }, + { + "epoch": 0.3924266607958611, + "grad_norm": 0.6241755485534668, + "learning_rate": 9.099803224568797e-06, + "loss": 0.6568, + "step": 7130 + }, + { + "epoch": 0.3924816995982167, + "grad_norm": 0.7298059463500977, + "learning_rate": 9.099555084245447e-06, + "loss": 0.727, + "step": 7131 + }, + { + "epoch": 0.3925367384005724, + "grad_norm": 0.7741055488586426, + "learning_rate": 9.099306913110939e-06, + "loss": 0.8481, + "step": 7132 + }, + { + "epoch": 0.39259177720292804, + "grad_norm": 0.9674170613288879, + "learning_rate": 9.099058711167137e-06, + "loss": 0.8507, + "step": 7133 + }, + { + "epoch": 0.3926468160052837, + "grad_norm": 0.7285159826278687, + "learning_rate": 9.098810478415907e-06, + "loss": 0.766, + "step": 7134 + }, + { + "epoch": 0.39270185480763936, + "grad_norm": 0.7215660810470581, + "learning_rate": 9.098562214859115e-06, + "loss": 0.794, + "step": 7135 + }, + { + "epoch": 0.39275689360999505, + "grad_norm": 0.764437735080719, + "learning_rate": 9.098313920498627e-06, + "loss": 0.8228, + "step": 7136 + }, + { + "epoch": 0.3928119324123507, + "grad_norm": 0.7222796082496643, + "learning_rate": 9.098065595336309e-06, + "loss": 0.8064, + "step": 7137 + }, + { + "epoch": 0.3928669712147064, + "grad_norm": 0.7044625878334045, + "learning_rate": 9.097817239374024e-06, + "loss": 0.8017, + "step": 7138 + }, + { + "epoch": 0.392922010017062, + "grad_norm": 0.7929979562759399, + "learning_rate": 9.097568852613646e-06, + "loss": 0.7527, + "step": 7139 + }, + { + "epoch": 0.3929770488194177, + "grad_norm": 0.7833721041679382, + "learning_rate": 9.097320435057033e-06, + "loss": 0.8335, + "step": 7140 + }, + { + "epoch": 0.39303208762177333, + "grad_norm": 0.8365728259086609, + "learning_rate": 9.097071986706058e-06, + "loss": 0.6439, + "step": 7141 + }, + { + "epoch": 0.393087126424129, + "grad_norm": 0.7547842264175415, + "learning_rate": 9.096823507562588e-06, + "loss": 0.8316, + "step": 7142 + }, + { + "epoch": 0.39314216522648465, + "grad_norm": 0.6598891019821167, + "learning_rate": 9.09657499762849e-06, + "loss": 0.6547, + "step": 7143 + }, + { + "epoch": 0.39319720402884034, + "grad_norm": 0.7913638949394226, + "learning_rate": 9.096326456905627e-06, + "loss": 0.7964, + "step": 7144 + }, + { + "epoch": 0.393252242831196, + "grad_norm": 0.6927905082702637, + "learning_rate": 9.096077885395874e-06, + "loss": 0.7836, + "step": 7145 + }, + { + "epoch": 0.39330728163355166, + "grad_norm": 0.7505417466163635, + "learning_rate": 9.095829283101094e-06, + "loss": 0.7707, + "step": 7146 + }, + { + "epoch": 0.3933623204359073, + "grad_norm": 0.8797083497047424, + "learning_rate": 9.095580650023158e-06, + "loss": 0.866, + "step": 7147 + }, + { + "epoch": 0.393417359238263, + "grad_norm": 0.7023645639419556, + "learning_rate": 9.095331986163935e-06, + "loss": 0.7013, + "step": 7148 + }, + { + "epoch": 0.3934723980406186, + "grad_norm": 0.697354793548584, + "learning_rate": 9.095083291525293e-06, + "loss": 0.7691, + "step": 7149 + }, + { + "epoch": 0.3935274368429743, + "grad_norm": 0.7211105227470398, + "learning_rate": 9.094834566109101e-06, + "loss": 0.6816, + "step": 7150 + }, + { + "epoch": 0.39358247564532994, + "grad_norm": 0.8593278527259827, + "learning_rate": 9.094585809917227e-06, + "loss": 0.915, + "step": 7151 + }, + { + "epoch": 0.39363751444768563, + "grad_norm": 0.7406070828437805, + "learning_rate": 9.094337022951545e-06, + "loss": 0.7825, + "step": 7152 + }, + { + "epoch": 0.39369255325004127, + "grad_norm": 0.7644504308700562, + "learning_rate": 9.09408820521392e-06, + "loss": 0.6796, + "step": 7153 + }, + { + "epoch": 0.39374759205239696, + "grad_norm": 0.8239033222198486, + "learning_rate": 9.093839356706224e-06, + "loss": 0.8396, + "step": 7154 + }, + { + "epoch": 0.3938026308547526, + "grad_norm": 0.6433991193771362, + "learning_rate": 9.093590477430327e-06, + "loss": 0.6941, + "step": 7155 + }, + { + "epoch": 0.3938576696571083, + "grad_norm": 0.6979972124099731, + "learning_rate": 9.093341567388102e-06, + "loss": 0.8142, + "step": 7156 + }, + { + "epoch": 0.3939127084594639, + "grad_norm": 0.7062026262283325, + "learning_rate": 9.093092626581414e-06, + "loss": 0.804, + "step": 7157 + }, + { + "epoch": 0.3939677472618196, + "grad_norm": 0.7070814967155457, + "learning_rate": 9.09284365501214e-06, + "loss": 0.765, + "step": 7158 + }, + { + "epoch": 0.39402278606417523, + "grad_norm": 0.8577908873558044, + "learning_rate": 9.092594652682147e-06, + "loss": 0.7074, + "step": 7159 + }, + { + "epoch": 0.3940778248665309, + "grad_norm": 0.7386197447776794, + "learning_rate": 9.092345619593309e-06, + "loss": 0.7629, + "step": 7160 + }, + { + "epoch": 0.39413286366888656, + "grad_norm": 0.8048123121261597, + "learning_rate": 9.092096555747496e-06, + "loss": 0.9225, + "step": 7161 + }, + { + "epoch": 0.39418790247124225, + "grad_norm": 0.7479888200759888, + "learning_rate": 9.091847461146582e-06, + "loss": 0.7284, + "step": 7162 + }, + { + "epoch": 0.3942429412735979, + "grad_norm": 0.7448734045028687, + "learning_rate": 9.091598335792438e-06, + "loss": 0.8694, + "step": 7163 + }, + { + "epoch": 0.39429798007595357, + "grad_norm": 0.7511261701583862, + "learning_rate": 9.091349179686935e-06, + "loss": 0.7822, + "step": 7164 + }, + { + "epoch": 0.3943530188783092, + "grad_norm": 0.7079344391822815, + "learning_rate": 9.091099992831946e-06, + "loss": 0.7238, + "step": 7165 + }, + { + "epoch": 0.3944080576806649, + "grad_norm": 0.7007229328155518, + "learning_rate": 9.090850775229347e-06, + "loss": 0.7269, + "step": 7166 + }, + { + "epoch": 0.3944630964830205, + "grad_norm": 0.769800066947937, + "learning_rate": 9.090601526881007e-06, + "loss": 0.7894, + "step": 7167 + }, + { + "epoch": 0.3945181352853762, + "grad_norm": 0.7211676836013794, + "learning_rate": 9.090352247788801e-06, + "loss": 0.7998, + "step": 7168 + }, + { + "epoch": 0.39457317408773185, + "grad_norm": 0.6784254312515259, + "learning_rate": 9.090102937954602e-06, + "loss": 0.7576, + "step": 7169 + }, + { + "epoch": 0.39462821289008754, + "grad_norm": 0.7696946859359741, + "learning_rate": 9.089853597380285e-06, + "loss": 0.8395, + "step": 7170 + }, + { + "epoch": 0.39468325169244317, + "grad_norm": 0.8720405697822571, + "learning_rate": 9.089604226067723e-06, + "loss": 0.8971, + "step": 7171 + }, + { + "epoch": 0.39473829049479886, + "grad_norm": 0.8457947373390198, + "learning_rate": 9.08935482401879e-06, + "loss": 0.7002, + "step": 7172 + }, + { + "epoch": 0.3947933292971545, + "grad_norm": 0.8181997537612915, + "learning_rate": 9.089105391235361e-06, + "loss": 0.8949, + "step": 7173 + }, + { + "epoch": 0.3948483680995101, + "grad_norm": 0.7717136144638062, + "learning_rate": 9.08885592771931e-06, + "loss": 0.829, + "step": 7174 + }, + { + "epoch": 0.3949034069018658, + "grad_norm": 0.6941567063331604, + "learning_rate": 9.088606433472514e-06, + "loss": 0.7592, + "step": 7175 + }, + { + "epoch": 0.39495844570422145, + "grad_norm": 0.7358599901199341, + "learning_rate": 9.088356908496845e-06, + "loss": 0.8657, + "step": 7176 + }, + { + "epoch": 0.39501348450657714, + "grad_norm": 1.1329307556152344, + "learning_rate": 9.08810735279418e-06, + "loss": 0.8307, + "step": 7177 + }, + { + "epoch": 0.3950685233089328, + "grad_norm": 0.7011532187461853, + "learning_rate": 9.087857766366395e-06, + "loss": 0.7487, + "step": 7178 + }, + { + "epoch": 0.39512356211128846, + "grad_norm": 0.7390572428703308, + "learning_rate": 9.087608149215366e-06, + "loss": 0.8244, + "step": 7179 + }, + { + "epoch": 0.3951786009136441, + "grad_norm": 0.6907634735107422, + "learning_rate": 9.087358501342966e-06, + "loss": 0.751, + "step": 7180 + }, + { + "epoch": 0.3952336397159998, + "grad_norm": 0.7467379570007324, + "learning_rate": 9.087108822751076e-06, + "loss": 0.8549, + "step": 7181 + }, + { + "epoch": 0.3952886785183554, + "grad_norm": 0.7493302226066589, + "learning_rate": 9.086859113441568e-06, + "loss": 0.8332, + "step": 7182 + }, + { + "epoch": 0.3953437173207111, + "grad_norm": 0.8364959955215454, + "learning_rate": 9.086609373416321e-06, + "loss": 0.7873, + "step": 7183 + }, + { + "epoch": 0.39539875612306674, + "grad_norm": 0.7330418825149536, + "learning_rate": 9.086359602677214e-06, + "loss": 0.7861, + "step": 7184 + }, + { + "epoch": 0.39545379492542243, + "grad_norm": 0.7296311855316162, + "learning_rate": 9.086109801226121e-06, + "loss": 0.7946, + "step": 7185 + }, + { + "epoch": 0.39550883372777806, + "grad_norm": 0.7884660363197327, + "learning_rate": 9.085859969064921e-06, + "loss": 0.7851, + "step": 7186 + }, + { + "epoch": 0.39556387253013375, + "grad_norm": 0.7311955690383911, + "learning_rate": 9.08561010619549e-06, + "loss": 0.7645, + "step": 7187 + }, + { + "epoch": 0.3956189113324894, + "grad_norm": 0.7447296977043152, + "learning_rate": 9.085360212619707e-06, + "loss": 0.7446, + "step": 7188 + }, + { + "epoch": 0.3956739501348451, + "grad_norm": 0.755628228187561, + "learning_rate": 9.08511028833945e-06, + "loss": 0.8107, + "step": 7189 + }, + { + "epoch": 0.3957289889372007, + "grad_norm": 0.6800833940505981, + "learning_rate": 9.0848603333566e-06, + "loss": 0.7471, + "step": 7190 + }, + { + "epoch": 0.3957840277395564, + "grad_norm": 0.6396341919898987, + "learning_rate": 9.08461034767303e-06, + "loss": 0.6797, + "step": 7191 + }, + { + "epoch": 0.39583906654191203, + "grad_norm": 0.729680597782135, + "learning_rate": 9.084360331290625e-06, + "loss": 0.7224, + "step": 7192 + }, + { + "epoch": 0.3958941053442677, + "grad_norm": 0.7630584239959717, + "learning_rate": 9.084110284211259e-06, + "loss": 0.8203, + "step": 7193 + }, + { + "epoch": 0.39594914414662336, + "grad_norm": 0.8799235820770264, + "learning_rate": 9.083860206436813e-06, + "loss": 0.8312, + "step": 7194 + }, + { + "epoch": 0.39600418294897904, + "grad_norm": 0.797081708908081, + "learning_rate": 9.083610097969169e-06, + "loss": 0.7561, + "step": 7195 + }, + { + "epoch": 0.3960592217513347, + "grad_norm": 0.7408759593963623, + "learning_rate": 9.083359958810203e-06, + "loss": 0.7854, + "step": 7196 + }, + { + "epoch": 0.39611426055369037, + "grad_norm": 0.7552130222320557, + "learning_rate": 9.083109788961797e-06, + "loss": 0.8145, + "step": 7197 + }, + { + "epoch": 0.396169299356046, + "grad_norm": 0.7147447466850281, + "learning_rate": 9.08285958842583e-06, + "loss": 0.792, + "step": 7198 + }, + { + "epoch": 0.3962243381584017, + "grad_norm": 0.7416259050369263, + "learning_rate": 9.082609357204183e-06, + "loss": 0.7801, + "step": 7199 + }, + { + "epoch": 0.3962793769607573, + "grad_norm": 0.7551109194755554, + "learning_rate": 9.082359095298741e-06, + "loss": 0.8841, + "step": 7200 + }, + { + "epoch": 0.396334415763113, + "grad_norm": 0.761472225189209, + "learning_rate": 9.082108802711377e-06, + "loss": 0.9061, + "step": 7201 + }, + { + "epoch": 0.39638945456546865, + "grad_norm": 0.7234126329421997, + "learning_rate": 9.081858479443977e-06, + "loss": 0.8308, + "step": 7202 + }, + { + "epoch": 0.39644449336782434, + "grad_norm": 0.7204816341400146, + "learning_rate": 9.08160812549842e-06, + "loss": 0.7481, + "step": 7203 + }, + { + "epoch": 0.39649953217017997, + "grad_norm": 0.7207956910133362, + "learning_rate": 9.081357740876591e-06, + "loss": 0.762, + "step": 7204 + }, + { + "epoch": 0.39655457097253566, + "grad_norm": 0.7967123985290527, + "learning_rate": 9.081107325580367e-06, + "loss": 0.7931, + "step": 7205 + }, + { + "epoch": 0.3966096097748913, + "grad_norm": 0.9839354753494263, + "learning_rate": 9.080856879611635e-06, + "loss": 0.8182, + "step": 7206 + }, + { + "epoch": 0.396664648577247, + "grad_norm": 0.8468357920646667, + "learning_rate": 9.080606402972274e-06, + "loss": 0.7056, + "step": 7207 + }, + { + "epoch": 0.3967196873796026, + "grad_norm": 0.6549574136734009, + "learning_rate": 9.080355895664169e-06, + "loss": 0.7604, + "step": 7208 + }, + { + "epoch": 0.3967747261819583, + "grad_norm": 0.7475417256355286, + "learning_rate": 9.080105357689201e-06, + "loss": 0.7107, + "step": 7209 + }, + { + "epoch": 0.39682976498431394, + "grad_norm": 0.7464179992675781, + "learning_rate": 9.079854789049251e-06, + "loss": 0.793, + "step": 7210 + }, + { + "epoch": 0.3968848037866696, + "grad_norm": 0.8332071900367737, + "learning_rate": 9.079604189746207e-06, + "loss": 0.8383, + "step": 7211 + }, + { + "epoch": 0.39693984258902526, + "grad_norm": 0.722055196762085, + "learning_rate": 9.07935355978195e-06, + "loss": 0.8569, + "step": 7212 + }, + { + "epoch": 0.39699488139138095, + "grad_norm": 0.7442018389701843, + "learning_rate": 9.079102899158363e-06, + "loss": 0.8165, + "step": 7213 + }, + { + "epoch": 0.3970499201937366, + "grad_norm": 0.6986141204833984, + "learning_rate": 9.07885220787733e-06, + "loss": 0.7562, + "step": 7214 + }, + { + "epoch": 0.39710495899609227, + "grad_norm": 0.7718464732170105, + "learning_rate": 9.078601485940736e-06, + "loss": 0.8529, + "step": 7215 + }, + { + "epoch": 0.3971599977984479, + "grad_norm": 0.7583653330802917, + "learning_rate": 9.078350733350464e-06, + "loss": 0.7855, + "step": 7216 + }, + { + "epoch": 0.39721503660080354, + "grad_norm": 0.7699223160743713, + "learning_rate": 9.078099950108401e-06, + "loss": 0.8061, + "step": 7217 + }, + { + "epoch": 0.39727007540315923, + "grad_norm": 0.7374141812324524, + "learning_rate": 9.07784913621643e-06, + "loss": 0.789, + "step": 7218 + }, + { + "epoch": 0.39732511420551486, + "grad_norm": 0.7446104884147644, + "learning_rate": 9.077598291676436e-06, + "loss": 0.8381, + "step": 7219 + }, + { + "epoch": 0.39738015300787055, + "grad_norm": 0.7017301917076111, + "learning_rate": 9.077347416490305e-06, + "loss": 0.7153, + "step": 7220 + }, + { + "epoch": 0.3974351918102262, + "grad_norm": 0.7676172852516174, + "learning_rate": 9.077096510659922e-06, + "loss": 0.8029, + "step": 7221 + }, + { + "epoch": 0.3974902306125819, + "grad_norm": 0.9340602159500122, + "learning_rate": 9.076845574187174e-06, + "loss": 0.7865, + "step": 7222 + }, + { + "epoch": 0.3975452694149375, + "grad_norm": 0.8634235262870789, + "learning_rate": 9.076594607073945e-06, + "loss": 0.7606, + "step": 7223 + }, + { + "epoch": 0.3976003082172932, + "grad_norm": 0.8967369198799133, + "learning_rate": 9.076343609322123e-06, + "loss": 0.7011, + "step": 7224 + }, + { + "epoch": 0.39765534701964883, + "grad_norm": 0.7269352078437805, + "learning_rate": 9.076092580933594e-06, + "loss": 0.8043, + "step": 7225 + }, + { + "epoch": 0.3977103858220045, + "grad_norm": 0.7550628781318665, + "learning_rate": 9.075841521910243e-06, + "loss": 0.7344, + "step": 7226 + }, + { + "epoch": 0.39776542462436015, + "grad_norm": 0.6973844766616821, + "learning_rate": 9.075590432253958e-06, + "loss": 0.6995, + "step": 7227 + }, + { + "epoch": 0.39782046342671584, + "grad_norm": 0.648560643196106, + "learning_rate": 9.075339311966627e-06, + "loss": 0.6997, + "step": 7228 + }, + { + "epoch": 0.3978755022290715, + "grad_norm": 0.8457548022270203, + "learning_rate": 9.075088161050134e-06, + "loss": 0.8548, + "step": 7229 + }, + { + "epoch": 0.39793054103142717, + "grad_norm": 0.7644637823104858, + "learning_rate": 9.074836979506373e-06, + "loss": 0.6966, + "step": 7230 + }, + { + "epoch": 0.3979855798337828, + "grad_norm": 0.7146210670471191, + "learning_rate": 9.074585767337227e-06, + "loss": 0.7673, + "step": 7231 + }, + { + "epoch": 0.3980406186361385, + "grad_norm": 0.8570694327354431, + "learning_rate": 9.074334524544585e-06, + "loss": 0.8233, + "step": 7232 + }, + { + "epoch": 0.3980956574384941, + "grad_norm": 0.7257633805274963, + "learning_rate": 9.074083251130334e-06, + "loss": 0.7464, + "step": 7233 + }, + { + "epoch": 0.3981506962408498, + "grad_norm": 0.9377032518386841, + "learning_rate": 9.073831947096365e-06, + "loss": 0.7814, + "step": 7234 + }, + { + "epoch": 0.39820573504320544, + "grad_norm": 0.8105629086494446, + "learning_rate": 9.073580612444566e-06, + "loss": 0.8069, + "step": 7235 + }, + { + "epoch": 0.39826077384556113, + "grad_norm": 0.7874456644058228, + "learning_rate": 9.073329247176824e-06, + "loss": 0.8414, + "step": 7236 + }, + { + "epoch": 0.39831581264791677, + "grad_norm": 0.6829617023468018, + "learning_rate": 9.07307785129503e-06, + "loss": 0.7633, + "step": 7237 + }, + { + "epoch": 0.39837085145027246, + "grad_norm": 0.6838501691818237, + "learning_rate": 9.072826424801075e-06, + "loss": 0.6972, + "step": 7238 + }, + { + "epoch": 0.3984258902526281, + "grad_norm": 0.7054216861724854, + "learning_rate": 9.072574967696845e-06, + "loss": 0.8049, + "step": 7239 + }, + { + "epoch": 0.3984809290549838, + "grad_norm": 0.9462615847587585, + "learning_rate": 9.072323479984232e-06, + "loss": 0.7988, + "step": 7240 + }, + { + "epoch": 0.3985359678573394, + "grad_norm": 0.7334465980529785, + "learning_rate": 9.072071961665128e-06, + "loss": 0.7538, + "step": 7241 + }, + { + "epoch": 0.3985910066596951, + "grad_norm": 0.7506609559059143, + "learning_rate": 9.071820412741418e-06, + "loss": 0.7991, + "step": 7242 + }, + { + "epoch": 0.39864604546205074, + "grad_norm": 0.6858688592910767, + "learning_rate": 9.071568833214998e-06, + "loss": 0.7258, + "step": 7243 + }, + { + "epoch": 0.3987010842644064, + "grad_norm": 0.8117396235466003, + "learning_rate": 9.071317223087754e-06, + "loss": 0.752, + "step": 7244 + }, + { + "epoch": 0.39875612306676206, + "grad_norm": 0.7772389054298401, + "learning_rate": 9.071065582361582e-06, + "loss": 0.7444, + "step": 7245 + }, + { + "epoch": 0.39881116186911775, + "grad_norm": 0.7221882939338684, + "learning_rate": 9.07081391103837e-06, + "loss": 0.8035, + "step": 7246 + }, + { + "epoch": 0.3988662006714734, + "grad_norm": 0.8113289475440979, + "learning_rate": 9.07056220912001e-06, + "loss": 0.7623, + "step": 7247 + }, + { + "epoch": 0.39892123947382907, + "grad_norm": 0.730823278427124, + "learning_rate": 9.070310476608395e-06, + "loss": 0.7872, + "step": 7248 + }, + { + "epoch": 0.3989762782761847, + "grad_norm": 0.7690893411636353, + "learning_rate": 9.070058713505415e-06, + "loss": 0.7402, + "step": 7249 + }, + { + "epoch": 0.3990313170785404, + "grad_norm": 0.6768597364425659, + "learning_rate": 9.069806919812963e-06, + "loss": 0.7283, + "step": 7250 + }, + { + "epoch": 0.399086355880896, + "grad_norm": 0.6938686370849609, + "learning_rate": 9.069555095532932e-06, + "loss": 0.7209, + "step": 7251 + }, + { + "epoch": 0.3991413946832517, + "grad_norm": 0.7162025570869446, + "learning_rate": 9.069303240667215e-06, + "loss": 0.7915, + "step": 7252 + }, + { + "epoch": 0.39919643348560735, + "grad_norm": 0.9170399308204651, + "learning_rate": 9.069051355217704e-06, + "loss": 0.8399, + "step": 7253 + }, + { + "epoch": 0.39925147228796304, + "grad_norm": 0.7080186009407043, + "learning_rate": 9.068799439186291e-06, + "loss": 0.8678, + "step": 7254 + }, + { + "epoch": 0.39930651109031867, + "grad_norm": 1.013613224029541, + "learning_rate": 9.068547492574872e-06, + "loss": 0.817, + "step": 7255 + }, + { + "epoch": 0.39936154989267436, + "grad_norm": 0.6911013722419739, + "learning_rate": 9.068295515385337e-06, + "loss": 0.7048, + "step": 7256 + }, + { + "epoch": 0.39941658869503, + "grad_norm": 0.748219907283783, + "learning_rate": 9.068043507619584e-06, + "loss": 0.8115, + "step": 7257 + }, + { + "epoch": 0.3994716274973857, + "grad_norm": 0.6763347387313843, + "learning_rate": 9.067791469279504e-06, + "loss": 0.763, + "step": 7258 + }, + { + "epoch": 0.3995266662997413, + "grad_norm": 0.7291030287742615, + "learning_rate": 9.067539400366993e-06, + "loss": 0.7319, + "step": 7259 + }, + { + "epoch": 0.39958170510209695, + "grad_norm": 0.6515628695487976, + "learning_rate": 9.067287300883945e-06, + "loss": 0.7903, + "step": 7260 + }, + { + "epoch": 0.39963674390445264, + "grad_norm": 0.7815985679626465, + "learning_rate": 9.067035170832253e-06, + "loss": 0.8241, + "step": 7261 + }, + { + "epoch": 0.3996917827068083, + "grad_norm": 0.6747417449951172, + "learning_rate": 9.066783010213812e-06, + "loss": 0.7544, + "step": 7262 + }, + { + "epoch": 0.39974682150916396, + "grad_norm": 0.6568340063095093, + "learning_rate": 9.066530819030522e-06, + "loss": 0.7754, + "step": 7263 + }, + { + "epoch": 0.3998018603115196, + "grad_norm": 0.6703339219093323, + "learning_rate": 9.066278597284273e-06, + "loss": 0.7581, + "step": 7264 + }, + { + "epoch": 0.3998568991138753, + "grad_norm": 0.7421279549598694, + "learning_rate": 9.066026344976962e-06, + "loss": 0.7974, + "step": 7265 + }, + { + "epoch": 0.3999119379162309, + "grad_norm": 0.7226015329360962, + "learning_rate": 9.065774062110486e-06, + "loss": 0.7777, + "step": 7266 + }, + { + "epoch": 0.3999669767185866, + "grad_norm": 0.7092894911766052, + "learning_rate": 9.06552174868674e-06, + "loss": 0.7885, + "step": 7267 + }, + { + "epoch": 0.40002201552094224, + "grad_norm": 0.837902307510376, + "learning_rate": 9.065269404707622e-06, + "loss": 0.7425, + "step": 7268 + }, + { + "epoch": 0.40007705432329793, + "grad_norm": 0.803811252117157, + "learning_rate": 9.065017030175027e-06, + "loss": 0.8418, + "step": 7269 + }, + { + "epoch": 0.40013209312565357, + "grad_norm": 0.8110278248786926, + "learning_rate": 9.064764625090854e-06, + "loss": 0.7724, + "step": 7270 + }, + { + "epoch": 0.40018713192800925, + "grad_norm": 0.7305173277854919, + "learning_rate": 9.064512189456995e-06, + "loss": 0.7465, + "step": 7271 + }, + { + "epoch": 0.4002421707303649, + "grad_norm": 0.7312467694282532, + "learning_rate": 9.06425972327535e-06, + "loss": 0.8406, + "step": 7272 + }, + { + "epoch": 0.4002972095327206, + "grad_norm": 0.7348741292953491, + "learning_rate": 9.064007226547819e-06, + "loss": 0.8103, + "step": 7273 + }, + { + "epoch": 0.4003522483350762, + "grad_norm": 0.6561787724494934, + "learning_rate": 9.063754699276297e-06, + "loss": 0.6634, + "step": 7274 + }, + { + "epoch": 0.4004072871374319, + "grad_norm": 0.7924866080284119, + "learning_rate": 9.063502141462682e-06, + "loss": 0.6592, + "step": 7275 + }, + { + "epoch": 0.40046232593978753, + "grad_norm": 0.6873973608016968, + "learning_rate": 9.063249553108873e-06, + "loss": 0.7912, + "step": 7276 + }, + { + "epoch": 0.4005173647421432, + "grad_norm": 0.6872708797454834, + "learning_rate": 9.062996934216768e-06, + "loss": 0.732, + "step": 7277 + }, + { + "epoch": 0.40057240354449886, + "grad_norm": 0.7381585836410522, + "learning_rate": 9.062744284788265e-06, + "loss": 0.84, + "step": 7278 + }, + { + "epoch": 0.40062744234685455, + "grad_norm": 0.7885964512825012, + "learning_rate": 9.062491604825266e-06, + "loss": 0.8229, + "step": 7279 + }, + { + "epoch": 0.4006824811492102, + "grad_norm": 0.9066407680511475, + "learning_rate": 9.062238894329664e-06, + "loss": 0.7299, + "step": 7280 + }, + { + "epoch": 0.40073751995156587, + "grad_norm": 0.7694007754325867, + "learning_rate": 9.061986153303364e-06, + "loss": 0.8033, + "step": 7281 + }, + { + "epoch": 0.4007925587539215, + "grad_norm": 1.021766185760498, + "learning_rate": 9.061733381748263e-06, + "loss": 0.79, + "step": 7282 + }, + { + "epoch": 0.4008475975562772, + "grad_norm": 0.7776662111282349, + "learning_rate": 9.06148057966626e-06, + "loss": 0.8484, + "step": 7283 + }, + { + "epoch": 0.4009026363586328, + "grad_norm": 0.8646043539047241, + "learning_rate": 9.061227747059257e-06, + "loss": 0.8223, + "step": 7284 + }, + { + "epoch": 0.4009576751609885, + "grad_norm": 0.7347257733345032, + "learning_rate": 9.060974883929154e-06, + "loss": 0.8062, + "step": 7285 + }, + { + "epoch": 0.40101271396334415, + "grad_norm": 0.8233902454376221, + "learning_rate": 9.06072199027785e-06, + "loss": 0.8922, + "step": 7286 + }, + { + "epoch": 0.40106775276569984, + "grad_norm": 0.7099601030349731, + "learning_rate": 9.060469066107246e-06, + "loss": 0.7125, + "step": 7287 + }, + { + "epoch": 0.40112279156805547, + "grad_norm": 0.7549998164176941, + "learning_rate": 9.060216111419246e-06, + "loss": 0.7851, + "step": 7288 + }, + { + "epoch": 0.40117783037041116, + "grad_norm": 0.753516435623169, + "learning_rate": 9.059963126215748e-06, + "loss": 0.7831, + "step": 7289 + }, + { + "epoch": 0.4012328691727668, + "grad_norm": 0.6718429327011108, + "learning_rate": 9.059710110498651e-06, + "loss": 0.7305, + "step": 7290 + }, + { + "epoch": 0.4012879079751225, + "grad_norm": 0.6796036958694458, + "learning_rate": 9.05945706426986e-06, + "loss": 0.802, + "step": 7291 + }, + { + "epoch": 0.4013429467774781, + "grad_norm": 0.8046827912330627, + "learning_rate": 9.05920398753128e-06, + "loss": 0.7286, + "step": 7292 + }, + { + "epoch": 0.4013979855798338, + "grad_norm": 0.7518643140792847, + "learning_rate": 9.058950880284807e-06, + "loss": 0.7287, + "step": 7293 + }, + { + "epoch": 0.40145302438218944, + "grad_norm": 0.8386855125427246, + "learning_rate": 9.058697742532345e-06, + "loss": 0.8201, + "step": 7294 + }, + { + "epoch": 0.4015080631845451, + "grad_norm": 0.7780192494392395, + "learning_rate": 9.058444574275797e-06, + "loss": 0.7999, + "step": 7295 + }, + { + "epoch": 0.40156310198690076, + "grad_norm": 0.7715566754341125, + "learning_rate": 9.058191375517068e-06, + "loss": 0.732, + "step": 7296 + }, + { + "epoch": 0.40161814078925645, + "grad_norm": 0.9940280914306641, + "learning_rate": 9.057938146258057e-06, + "loss": 0.8247, + "step": 7297 + }, + { + "epoch": 0.4016731795916121, + "grad_norm": 0.7567923069000244, + "learning_rate": 9.05768488650067e-06, + "loss": 0.8254, + "step": 7298 + }, + { + "epoch": 0.4017282183939678, + "grad_norm": 0.7544496655464172, + "learning_rate": 9.05743159624681e-06, + "loss": 0.811, + "step": 7299 + }, + { + "epoch": 0.4017832571963234, + "grad_norm": 0.63368821144104, + "learning_rate": 9.05717827549838e-06, + "loss": 0.6498, + "step": 7300 + }, + { + "epoch": 0.4018382959986791, + "grad_norm": 0.7077621221542358, + "learning_rate": 9.056924924257284e-06, + "loss": 0.7401, + "step": 7301 + }, + { + "epoch": 0.40189333480103473, + "grad_norm": 0.6782366037368774, + "learning_rate": 9.056671542525426e-06, + "loss": 0.8013, + "step": 7302 + }, + { + "epoch": 0.40194837360339036, + "grad_norm": 0.6605678200721741, + "learning_rate": 9.056418130304709e-06, + "loss": 0.8038, + "step": 7303 + }, + { + "epoch": 0.40200341240574605, + "grad_norm": 0.8716840147972107, + "learning_rate": 9.056164687597041e-06, + "loss": 0.7652, + "step": 7304 + }, + { + "epoch": 0.4020584512081017, + "grad_norm": 0.8464542031288147, + "learning_rate": 9.055911214404325e-06, + "loss": 0.8663, + "step": 7305 + }, + { + "epoch": 0.4021134900104574, + "grad_norm": 0.7165409326553345, + "learning_rate": 9.055657710728466e-06, + "loss": 0.8028, + "step": 7306 + }, + { + "epoch": 0.402168528812813, + "grad_norm": 0.7313430309295654, + "learning_rate": 9.055404176571369e-06, + "loss": 0.7538, + "step": 7307 + }, + { + "epoch": 0.4022235676151687, + "grad_norm": 0.7757230401039124, + "learning_rate": 9.05515061193494e-06, + "loss": 0.9096, + "step": 7308 + }, + { + "epoch": 0.40227860641752433, + "grad_norm": 0.7178354859352112, + "learning_rate": 9.054897016821085e-06, + "loss": 0.7186, + "step": 7309 + }, + { + "epoch": 0.40233364521988, + "grad_norm": 0.8331356048583984, + "learning_rate": 9.054643391231708e-06, + "loss": 0.8724, + "step": 7310 + }, + { + "epoch": 0.40238868402223565, + "grad_norm": 0.7709757685661316, + "learning_rate": 9.054389735168717e-06, + "loss": 0.692, + "step": 7311 + }, + { + "epoch": 0.40244372282459134, + "grad_norm": 0.7393380999565125, + "learning_rate": 9.054136048634018e-06, + "loss": 0.7863, + "step": 7312 + }, + { + "epoch": 0.402498761626947, + "grad_norm": 0.7372385859489441, + "learning_rate": 9.053882331629518e-06, + "loss": 0.781, + "step": 7313 + }, + { + "epoch": 0.40255380042930267, + "grad_norm": 0.7076019048690796, + "learning_rate": 9.053628584157123e-06, + "loss": 0.7598, + "step": 7314 + }, + { + "epoch": 0.4026088392316583, + "grad_norm": 0.7465673685073853, + "learning_rate": 9.053374806218742e-06, + "loss": 0.7454, + "step": 7315 + }, + { + "epoch": 0.402663878034014, + "grad_norm": 0.7414120435714722, + "learning_rate": 9.05312099781628e-06, + "loss": 0.7135, + "step": 7316 + }, + { + "epoch": 0.4027189168363696, + "grad_norm": 0.7490748167037964, + "learning_rate": 9.052867158951646e-06, + "loss": 0.6833, + "step": 7317 + }, + { + "epoch": 0.4027739556387253, + "grad_norm": 0.8027878999710083, + "learning_rate": 9.052613289626747e-06, + "loss": 0.7466, + "step": 7318 + }, + { + "epoch": 0.40282899444108095, + "grad_norm": 0.6777862310409546, + "learning_rate": 9.052359389843493e-06, + "loss": 0.7446, + "step": 7319 + }, + { + "epoch": 0.40288403324343663, + "grad_norm": 0.9240381717681885, + "learning_rate": 9.052105459603787e-06, + "loss": 0.7801, + "step": 7320 + }, + { + "epoch": 0.40293907204579227, + "grad_norm": 0.9592602252960205, + "learning_rate": 9.051851498909543e-06, + "loss": 0.9648, + "step": 7321 + }, + { + "epoch": 0.40299411084814796, + "grad_norm": 0.8469638228416443, + "learning_rate": 9.051597507762669e-06, + "loss": 0.8303, + "step": 7322 + }, + { + "epoch": 0.4030491496505036, + "grad_norm": 0.6981443166732788, + "learning_rate": 9.05134348616507e-06, + "loss": 0.7245, + "step": 7323 + }, + { + "epoch": 0.4031041884528593, + "grad_norm": 0.7133469581604004, + "learning_rate": 9.05108943411866e-06, + "loss": 0.7763, + "step": 7324 + }, + { + "epoch": 0.4031592272552149, + "grad_norm": 0.7043703198432922, + "learning_rate": 9.050835351625344e-06, + "loss": 0.8247, + "step": 7325 + }, + { + "epoch": 0.4032142660575706, + "grad_norm": 0.6662501692771912, + "learning_rate": 9.050581238687036e-06, + "loss": 0.7669, + "step": 7326 + }, + { + "epoch": 0.40326930485992624, + "grad_norm": 0.6482356786727905, + "learning_rate": 9.050327095305643e-06, + "loss": 0.6477, + "step": 7327 + }, + { + "epoch": 0.4033243436622819, + "grad_norm": 0.7465450167655945, + "learning_rate": 9.050072921483076e-06, + "loss": 0.8053, + "step": 7328 + }, + { + "epoch": 0.40337938246463756, + "grad_norm": 0.6765472292900085, + "learning_rate": 9.049818717221245e-06, + "loss": 0.765, + "step": 7329 + }, + { + "epoch": 0.40343442126699325, + "grad_norm": 0.7098689675331116, + "learning_rate": 9.04956448252206e-06, + "loss": 0.8059, + "step": 7330 + }, + { + "epoch": 0.4034894600693489, + "grad_norm": 0.6773823499679565, + "learning_rate": 9.049310217387432e-06, + "loss": 0.6848, + "step": 7331 + }, + { + "epoch": 0.40354449887170457, + "grad_norm": 0.6884829998016357, + "learning_rate": 9.049055921819275e-06, + "loss": 0.696, + "step": 7332 + }, + { + "epoch": 0.4035995376740602, + "grad_norm": 0.662545919418335, + "learning_rate": 9.048801595819494e-06, + "loss": 0.8286, + "step": 7333 + }, + { + "epoch": 0.4036545764764159, + "grad_norm": 0.6863077878952026, + "learning_rate": 9.048547239390007e-06, + "loss": 0.7215, + "step": 7334 + }, + { + "epoch": 0.4037096152787715, + "grad_norm": 0.6982632875442505, + "learning_rate": 9.048292852532721e-06, + "loss": 0.7635, + "step": 7335 + }, + { + "epoch": 0.4037646540811272, + "grad_norm": 0.8512400984764099, + "learning_rate": 9.048038435249548e-06, + "loss": 0.6226, + "step": 7336 + }, + { + "epoch": 0.40381969288348285, + "grad_norm": 0.6952843070030212, + "learning_rate": 9.047783987542405e-06, + "loss": 0.8317, + "step": 7337 + }, + { + "epoch": 0.40387473168583854, + "grad_norm": 0.7802778482437134, + "learning_rate": 9.0475295094132e-06, + "loss": 0.8615, + "step": 7338 + }, + { + "epoch": 0.4039297704881942, + "grad_norm": 0.8783930540084839, + "learning_rate": 9.047275000863844e-06, + "loss": 0.743, + "step": 7339 + }, + { + "epoch": 0.40398480929054986, + "grad_norm": 0.7205806970596313, + "learning_rate": 9.047020461896256e-06, + "loss": 0.7953, + "step": 7340 + }, + { + "epoch": 0.4040398480929055, + "grad_norm": 0.8438451290130615, + "learning_rate": 9.046765892512344e-06, + "loss": 0.7613, + "step": 7341 + }, + { + "epoch": 0.4040948868952612, + "grad_norm": 0.7300973534584045, + "learning_rate": 9.046511292714021e-06, + "loss": 0.7856, + "step": 7342 + }, + { + "epoch": 0.4041499256976168, + "grad_norm": 0.8472041487693787, + "learning_rate": 9.046256662503206e-06, + "loss": 0.8526, + "step": 7343 + }, + { + "epoch": 0.4042049644999725, + "grad_norm": 0.789465606212616, + "learning_rate": 9.046002001881807e-06, + "loss": 0.7792, + "step": 7344 + }, + { + "epoch": 0.40426000330232814, + "grad_norm": 0.7720938920974731, + "learning_rate": 9.04574731085174e-06, + "loss": 0.8065, + "step": 7345 + }, + { + "epoch": 0.4043150421046838, + "grad_norm": 0.6968526840209961, + "learning_rate": 9.04549258941492e-06, + "loss": 0.8135, + "step": 7346 + }, + { + "epoch": 0.40437008090703946, + "grad_norm": 0.746865451335907, + "learning_rate": 9.04523783757326e-06, + "loss": 0.8216, + "step": 7347 + }, + { + "epoch": 0.4044251197093951, + "grad_norm": 0.6750560998916626, + "learning_rate": 9.044983055328676e-06, + "loss": 0.7883, + "step": 7348 + }, + { + "epoch": 0.4044801585117508, + "grad_norm": 0.6791195273399353, + "learning_rate": 9.044728242683081e-06, + "loss": 0.7721, + "step": 7349 + }, + { + "epoch": 0.4045351973141064, + "grad_norm": 0.7238358855247498, + "learning_rate": 9.044473399638392e-06, + "loss": 0.739, + "step": 7350 + }, + { + "epoch": 0.4045902361164621, + "grad_norm": 0.6793557405471802, + "learning_rate": 9.044218526196523e-06, + "loss": 0.7853, + "step": 7351 + }, + { + "epoch": 0.40464527491881774, + "grad_norm": 0.767564058303833, + "learning_rate": 9.043963622359392e-06, + "loss": 0.8158, + "step": 7352 + }, + { + "epoch": 0.40470031372117343, + "grad_norm": 0.6800708770751953, + "learning_rate": 9.043708688128909e-06, + "loss": 0.7493, + "step": 7353 + }, + { + "epoch": 0.40475535252352907, + "grad_norm": 0.75978022813797, + "learning_rate": 9.043453723506996e-06, + "loss": 0.7066, + "step": 7354 + }, + { + "epoch": 0.40481039132588476, + "grad_norm": 1.0194984674453735, + "learning_rate": 9.043198728495568e-06, + "loss": 0.6238, + "step": 7355 + }, + { + "epoch": 0.4048654301282404, + "grad_norm": 0.7102386355400085, + "learning_rate": 9.04294370309654e-06, + "loss": 0.75, + "step": 7356 + }, + { + "epoch": 0.4049204689305961, + "grad_norm": 0.8468191623687744, + "learning_rate": 9.04268864731183e-06, + "loss": 0.8095, + "step": 7357 + }, + { + "epoch": 0.4049755077329517, + "grad_norm": 0.7022871971130371, + "learning_rate": 9.042433561143353e-06, + "loss": 0.8394, + "step": 7358 + }, + { + "epoch": 0.4050305465353074, + "grad_norm": 1.1873482465744019, + "learning_rate": 9.042178444593028e-06, + "loss": 0.7863, + "step": 7359 + }, + { + "epoch": 0.40508558533766303, + "grad_norm": 0.7074940204620361, + "learning_rate": 9.041923297662772e-06, + "loss": 0.7067, + "step": 7360 + }, + { + "epoch": 0.4051406241400187, + "grad_norm": 0.7602211833000183, + "learning_rate": 9.041668120354503e-06, + "loss": 0.6594, + "step": 7361 + }, + { + "epoch": 0.40519566294237436, + "grad_norm": 0.7903324365615845, + "learning_rate": 9.041412912670138e-06, + "loss": 0.7978, + "step": 7362 + }, + { + "epoch": 0.40525070174473005, + "grad_norm": 0.7422891855239868, + "learning_rate": 9.041157674611595e-06, + "loss": 0.8162, + "step": 7363 + }, + { + "epoch": 0.4053057405470857, + "grad_norm": 0.7978767156600952, + "learning_rate": 9.040902406180791e-06, + "loss": 0.762, + "step": 7364 + }, + { + "epoch": 0.40536077934944137, + "grad_norm": 0.7719776630401611, + "learning_rate": 9.04064710737965e-06, + "loss": 0.8098, + "step": 7365 + }, + { + "epoch": 0.405415818151797, + "grad_norm": 0.8646591305732727, + "learning_rate": 9.040391778210083e-06, + "loss": 0.9372, + "step": 7366 + }, + { + "epoch": 0.4054708569541527, + "grad_norm": 0.6616937518119812, + "learning_rate": 9.040136418674015e-06, + "loss": 0.7424, + "step": 7367 + }, + { + "epoch": 0.4055258957565083, + "grad_norm": 0.7676553130149841, + "learning_rate": 9.039881028773363e-06, + "loss": 0.6327, + "step": 7368 + }, + { + "epoch": 0.405580934558864, + "grad_norm": 0.6838239431381226, + "learning_rate": 9.039625608510047e-06, + "loss": 0.7548, + "step": 7369 + }, + { + "epoch": 0.40563597336121965, + "grad_norm": 0.7476304769515991, + "learning_rate": 9.039370157885986e-06, + "loss": 0.7262, + "step": 7370 + }, + { + "epoch": 0.40569101216357534, + "grad_norm": 0.8985139727592468, + "learning_rate": 9.0391146769031e-06, + "loss": 0.7729, + "step": 7371 + }, + { + "epoch": 0.40574605096593097, + "grad_norm": 0.7840422987937927, + "learning_rate": 9.038859165563308e-06, + "loss": 0.7855, + "step": 7372 + }, + { + "epoch": 0.40580108976828666, + "grad_norm": 0.6777672171592712, + "learning_rate": 9.038603623868534e-06, + "loss": 0.7379, + "step": 7373 + }, + { + "epoch": 0.4058561285706423, + "grad_norm": 0.7226746678352356, + "learning_rate": 9.038348051820694e-06, + "loss": 0.7686, + "step": 7374 + }, + { + "epoch": 0.405911167372998, + "grad_norm": 0.7647444605827332, + "learning_rate": 9.038092449421713e-06, + "loss": 0.8859, + "step": 7375 + }, + { + "epoch": 0.4059662061753536, + "grad_norm": 0.6524979472160339, + "learning_rate": 9.037836816673508e-06, + "loss": 0.6982, + "step": 7376 + }, + { + "epoch": 0.4060212449777093, + "grad_norm": 0.7842861413955688, + "learning_rate": 9.037581153578004e-06, + "loss": 0.8099, + "step": 7377 + }, + { + "epoch": 0.40607628378006494, + "grad_norm": 0.6424387693405151, + "learning_rate": 9.03732546013712e-06, + "loss": 0.7387, + "step": 7378 + }, + { + "epoch": 0.40613132258242063, + "grad_norm": 0.8444356918334961, + "learning_rate": 9.037069736352779e-06, + "loss": 0.8813, + "step": 7379 + }, + { + "epoch": 0.40618636138477626, + "grad_norm": 0.6487529277801514, + "learning_rate": 9.036813982226904e-06, + "loss": 0.7609, + "step": 7380 + }, + { + "epoch": 0.40624140018713195, + "grad_norm": 0.7891185879707336, + "learning_rate": 9.036558197761413e-06, + "loss": 0.8589, + "step": 7381 + }, + { + "epoch": 0.4062964389894876, + "grad_norm": 0.7183120250701904, + "learning_rate": 9.036302382958233e-06, + "loss": 0.8429, + "step": 7382 + }, + { + "epoch": 0.4063514777918433, + "grad_norm": 0.6386578679084778, + "learning_rate": 9.036046537819283e-06, + "loss": 0.6955, + "step": 7383 + }, + { + "epoch": 0.4064065165941989, + "grad_norm": 0.7572369575500488, + "learning_rate": 9.035790662346488e-06, + "loss": 0.8018, + "step": 7384 + }, + { + "epoch": 0.4064615553965546, + "grad_norm": 0.7105650305747986, + "learning_rate": 9.035534756541771e-06, + "loss": 0.8527, + "step": 7385 + }, + { + "epoch": 0.40651659419891023, + "grad_norm": 0.7031856179237366, + "learning_rate": 9.035278820407056e-06, + "loss": 0.6991, + "step": 7386 + }, + { + "epoch": 0.4065716330012659, + "grad_norm": 0.7407381534576416, + "learning_rate": 9.035022853944266e-06, + "loss": 0.708, + "step": 7387 + }, + { + "epoch": 0.40662667180362155, + "grad_norm": 0.7078498601913452, + "learning_rate": 9.034766857155322e-06, + "loss": 0.7584, + "step": 7388 + }, + { + "epoch": 0.4066817106059772, + "grad_norm": 0.7643301486968994, + "learning_rate": 9.034510830042151e-06, + "loss": 0.7836, + "step": 7389 + }, + { + "epoch": 0.4067367494083329, + "grad_norm": 0.7165302038192749, + "learning_rate": 9.034254772606676e-06, + "loss": 0.7769, + "step": 7390 + }, + { + "epoch": 0.4067917882106885, + "grad_norm": 0.7442395091056824, + "learning_rate": 9.033998684850824e-06, + "loss": 0.7231, + "step": 7391 + }, + { + "epoch": 0.4068468270130442, + "grad_norm": 0.7425046563148499, + "learning_rate": 9.033742566776517e-06, + "loss": 0.7709, + "step": 7392 + }, + { + "epoch": 0.40690186581539983, + "grad_norm": 0.768419086933136, + "learning_rate": 9.03348641838568e-06, + "loss": 0.7768, + "step": 7393 + }, + { + "epoch": 0.4069569046177555, + "grad_norm": 0.6785634160041809, + "learning_rate": 9.03323023968024e-06, + "loss": 0.7468, + "step": 7394 + }, + { + "epoch": 0.40701194342011116, + "grad_norm": 0.7075444459915161, + "learning_rate": 9.03297403066212e-06, + "loss": 0.7757, + "step": 7395 + }, + { + "epoch": 0.40706698222246684, + "grad_norm": 0.7580223679542542, + "learning_rate": 9.032717791333247e-06, + "loss": 0.7311, + "step": 7396 + }, + { + "epoch": 0.4071220210248225, + "grad_norm": 0.8110041618347168, + "learning_rate": 9.032461521695546e-06, + "loss": 0.7923, + "step": 7397 + }, + { + "epoch": 0.40717705982717817, + "grad_norm": 0.7204881310462952, + "learning_rate": 9.032205221750945e-06, + "loss": 0.759, + "step": 7398 + }, + { + "epoch": 0.4072320986295338, + "grad_norm": 0.8392491340637207, + "learning_rate": 9.031948891501368e-06, + "loss": 0.8292, + "step": 7399 + }, + { + "epoch": 0.4072871374318895, + "grad_norm": 0.7134600281715393, + "learning_rate": 9.031692530948742e-06, + "loss": 0.7, + "step": 7400 + }, + { + "epoch": 0.4073421762342451, + "grad_norm": 0.6324336528778076, + "learning_rate": 9.031436140094995e-06, + "loss": 0.6964, + "step": 7401 + }, + { + "epoch": 0.4073972150366008, + "grad_norm": 0.7281947731971741, + "learning_rate": 9.031179718942052e-06, + "loss": 0.7567, + "step": 7402 + }, + { + "epoch": 0.40745225383895645, + "grad_norm": 0.8828619718551636, + "learning_rate": 9.030923267491842e-06, + "loss": 0.8139, + "step": 7403 + }, + { + "epoch": 0.40750729264131214, + "grad_norm": 0.7039986252784729, + "learning_rate": 9.030666785746292e-06, + "loss": 0.7339, + "step": 7404 + }, + { + "epoch": 0.40756233144366777, + "grad_norm": 0.7049984931945801, + "learning_rate": 9.030410273707331e-06, + "loss": 0.6842, + "step": 7405 + }, + { + "epoch": 0.40761737024602346, + "grad_norm": 0.7149737477302551, + "learning_rate": 9.030153731376883e-06, + "loss": 0.6837, + "step": 7406 + }, + { + "epoch": 0.4076724090483791, + "grad_norm": 1.0804089307785034, + "learning_rate": 9.029897158756878e-06, + "loss": 0.7726, + "step": 7407 + }, + { + "epoch": 0.4077274478507348, + "grad_norm": 0.8354909420013428, + "learning_rate": 9.029640555849244e-06, + "loss": 0.8058, + "step": 7408 + }, + { + "epoch": 0.4077824866530904, + "grad_norm": 0.7091527581214905, + "learning_rate": 9.029383922655914e-06, + "loss": 0.7636, + "step": 7409 + }, + { + "epoch": 0.4078375254554461, + "grad_norm": 0.6720988750457764, + "learning_rate": 9.029127259178809e-06, + "loss": 0.7179, + "step": 7410 + }, + { + "epoch": 0.40789256425780174, + "grad_norm": 0.685858964920044, + "learning_rate": 9.028870565419865e-06, + "loss": 0.7637, + "step": 7411 + }, + { + "epoch": 0.4079476030601574, + "grad_norm": 0.7505033016204834, + "learning_rate": 9.028613841381007e-06, + "loss": 0.7463, + "step": 7412 + }, + { + "epoch": 0.40800264186251306, + "grad_norm": 0.8801671862602234, + "learning_rate": 9.028357087064166e-06, + "loss": 0.8399, + "step": 7413 + }, + { + "epoch": 0.40805768066486875, + "grad_norm": 0.7441918849945068, + "learning_rate": 9.02810030247127e-06, + "loss": 0.7689, + "step": 7414 + }, + { + "epoch": 0.4081127194672244, + "grad_norm": 0.7410128712654114, + "learning_rate": 9.027843487604251e-06, + "loss": 0.8013, + "step": 7415 + }, + { + "epoch": 0.40816775826958007, + "grad_norm": 0.8075226545333862, + "learning_rate": 9.02758664246504e-06, + "loss": 0.7717, + "step": 7416 + }, + { + "epoch": 0.4082227970719357, + "grad_norm": 0.7985545992851257, + "learning_rate": 9.027329767055566e-06, + "loss": 0.8459, + "step": 7417 + }, + { + "epoch": 0.4082778358742914, + "grad_norm": 0.7887235283851624, + "learning_rate": 9.027072861377757e-06, + "loss": 0.8201, + "step": 7418 + }, + { + "epoch": 0.40833287467664703, + "grad_norm": 0.7876266241073608, + "learning_rate": 9.02681592543355e-06, + "loss": 0.8205, + "step": 7419 + }, + { + "epoch": 0.4083879134790027, + "grad_norm": 0.758168637752533, + "learning_rate": 9.02655895922487e-06, + "loss": 0.6619, + "step": 7420 + }, + { + "epoch": 0.40844295228135835, + "grad_norm": 0.7279811501502991, + "learning_rate": 9.02630196275365e-06, + "loss": 0.7634, + "step": 7421 + }, + { + "epoch": 0.40849799108371404, + "grad_norm": 0.7540523409843445, + "learning_rate": 9.026044936021822e-06, + "loss": 0.7819, + "step": 7422 + }, + { + "epoch": 0.4085530298860697, + "grad_norm": 0.8091018795967102, + "learning_rate": 9.02578787903132e-06, + "loss": 0.7749, + "step": 7423 + }, + { + "epoch": 0.40860806868842536, + "grad_norm": 0.7625396847724915, + "learning_rate": 9.025530791784074e-06, + "loss": 0.7635, + "step": 7424 + }, + { + "epoch": 0.408663107490781, + "grad_norm": 0.7663947939872742, + "learning_rate": 9.025273674282015e-06, + "loss": 0.8281, + "step": 7425 + }, + { + "epoch": 0.4087181462931367, + "grad_norm": 0.6672662496566772, + "learning_rate": 9.025016526527077e-06, + "loss": 0.641, + "step": 7426 + }, + { + "epoch": 0.4087731850954923, + "grad_norm": 0.7649143934249878, + "learning_rate": 9.024759348521193e-06, + "loss": 0.7462, + "step": 7427 + }, + { + "epoch": 0.408828223897848, + "grad_norm": 0.7540067434310913, + "learning_rate": 9.024502140266293e-06, + "loss": 0.8756, + "step": 7428 + }, + { + "epoch": 0.40888326270020364, + "grad_norm": 0.721615731716156, + "learning_rate": 9.024244901764314e-06, + "loss": 0.8507, + "step": 7429 + }, + { + "epoch": 0.40893830150255933, + "grad_norm": 0.6949496269226074, + "learning_rate": 9.023987633017186e-06, + "loss": 0.7021, + "step": 7430 + }, + { + "epoch": 0.40899334030491497, + "grad_norm": 0.7108990550041199, + "learning_rate": 9.023730334026845e-06, + "loss": 0.807, + "step": 7431 + }, + { + "epoch": 0.4090483791072706, + "grad_norm": 0.7606124877929688, + "learning_rate": 9.023473004795225e-06, + "loss": 0.7769, + "step": 7432 + }, + { + "epoch": 0.4091034179096263, + "grad_norm": 0.7792031764984131, + "learning_rate": 9.023215645324256e-06, + "loss": 0.728, + "step": 7433 + }, + { + "epoch": 0.4091584567119819, + "grad_norm": 0.728884756565094, + "learning_rate": 9.022958255615877e-06, + "loss": 0.7831, + "step": 7434 + }, + { + "epoch": 0.4092134955143376, + "grad_norm": 0.8196625709533691, + "learning_rate": 9.022700835672022e-06, + "loss": 0.8265, + "step": 7435 + }, + { + "epoch": 0.40926853431669324, + "grad_norm": 0.762734055519104, + "learning_rate": 9.022443385494621e-06, + "loss": 0.8028, + "step": 7436 + }, + { + "epoch": 0.40932357311904893, + "grad_norm": 0.7259558439254761, + "learning_rate": 9.022185905085614e-06, + "loss": 0.789, + "step": 7437 + }, + { + "epoch": 0.40937861192140457, + "grad_norm": 0.7402371764183044, + "learning_rate": 9.021928394446936e-06, + "loss": 0.7667, + "step": 7438 + }, + { + "epoch": 0.40943365072376026, + "grad_norm": 0.8399797677993774, + "learning_rate": 9.021670853580519e-06, + "loss": 0.8451, + "step": 7439 + }, + { + "epoch": 0.4094886895261159, + "grad_norm": 0.6439585089683533, + "learning_rate": 9.0214132824883e-06, + "loss": 0.776, + "step": 7440 + }, + { + "epoch": 0.4095437283284716, + "grad_norm": 0.6956612467765808, + "learning_rate": 9.021155681172215e-06, + "loss": 0.6921, + "step": 7441 + }, + { + "epoch": 0.4095987671308272, + "grad_norm": 0.855413556098938, + "learning_rate": 9.020898049634203e-06, + "loss": 0.8552, + "step": 7442 + }, + { + "epoch": 0.4096538059331829, + "grad_norm": 0.6690535545349121, + "learning_rate": 9.020640387876194e-06, + "loss": 0.7552, + "step": 7443 + }, + { + "epoch": 0.40970884473553854, + "grad_norm": 0.6615462899208069, + "learning_rate": 9.020382695900131e-06, + "loss": 0.8216, + "step": 7444 + }, + { + "epoch": 0.4097638835378942, + "grad_norm": 0.6975858211517334, + "learning_rate": 9.020124973707947e-06, + "loss": 0.7453, + "step": 7445 + }, + { + "epoch": 0.40981892234024986, + "grad_norm": 0.6461964249610901, + "learning_rate": 9.019867221301579e-06, + "loss": 0.656, + "step": 7446 + }, + { + "epoch": 0.40987396114260555, + "grad_norm": 0.7221645712852478, + "learning_rate": 9.019609438682967e-06, + "loss": 0.661, + "step": 7447 + }, + { + "epoch": 0.4099289999449612, + "grad_norm": 0.6785755753517151, + "learning_rate": 9.019351625854044e-06, + "loss": 0.7294, + "step": 7448 + }, + { + "epoch": 0.40998403874731687, + "grad_norm": 0.7040538787841797, + "learning_rate": 9.019093782816751e-06, + "loss": 0.8546, + "step": 7449 + }, + { + "epoch": 0.4100390775496725, + "grad_norm": 0.737922191619873, + "learning_rate": 9.018835909573025e-06, + "loss": 0.8144, + "step": 7450 + }, + { + "epoch": 0.4100941163520282, + "grad_norm": 0.6705496311187744, + "learning_rate": 9.018578006124802e-06, + "loss": 0.6937, + "step": 7451 + }, + { + "epoch": 0.4101491551543838, + "grad_norm": 0.7347431182861328, + "learning_rate": 9.018320072474026e-06, + "loss": 0.7716, + "step": 7452 + }, + { + "epoch": 0.4102041939567395, + "grad_norm": 0.7023493647575378, + "learning_rate": 9.018062108622631e-06, + "loss": 0.7295, + "step": 7453 + }, + { + "epoch": 0.41025923275909515, + "grad_norm": 0.8017870187759399, + "learning_rate": 9.017804114572556e-06, + "loss": 0.7471, + "step": 7454 + }, + { + "epoch": 0.41031427156145084, + "grad_norm": 0.9171211123466492, + "learning_rate": 9.01754609032574e-06, + "loss": 0.8262, + "step": 7455 + }, + { + "epoch": 0.41036931036380647, + "grad_norm": 0.6682952046394348, + "learning_rate": 9.017288035884124e-06, + "loss": 0.7165, + "step": 7456 + }, + { + "epoch": 0.41042434916616216, + "grad_norm": 0.9339122772216797, + "learning_rate": 9.017029951249648e-06, + "loss": 0.8618, + "step": 7457 + }, + { + "epoch": 0.4104793879685178, + "grad_norm": 0.7063136696815491, + "learning_rate": 9.016771836424248e-06, + "loss": 0.8068, + "step": 7458 + }, + { + "epoch": 0.4105344267708735, + "grad_norm": 0.6717063784599304, + "learning_rate": 9.016513691409867e-06, + "loss": 0.738, + "step": 7459 + }, + { + "epoch": 0.4105894655732291, + "grad_norm": 0.6807749271392822, + "learning_rate": 9.016255516208443e-06, + "loss": 0.7842, + "step": 7460 + }, + { + "epoch": 0.4106445043755848, + "grad_norm": 0.6990453600883484, + "learning_rate": 9.01599731082192e-06, + "loss": 0.7726, + "step": 7461 + }, + { + "epoch": 0.41069954317794044, + "grad_norm": 0.6704931259155273, + "learning_rate": 9.015739075252234e-06, + "loss": 0.7006, + "step": 7462 + }, + { + "epoch": 0.41075458198029613, + "grad_norm": 0.7162300944328308, + "learning_rate": 9.01548080950133e-06, + "loss": 0.8462, + "step": 7463 + }, + { + "epoch": 0.41080962078265176, + "grad_norm": 0.6845411658287048, + "learning_rate": 9.015222513571144e-06, + "loss": 0.7466, + "step": 7464 + }, + { + "epoch": 0.41086465958500745, + "grad_norm": 0.7146134376525879, + "learning_rate": 9.014964187463623e-06, + "loss": 0.7594, + "step": 7465 + }, + { + "epoch": 0.4109196983873631, + "grad_norm": 0.7664906978607178, + "learning_rate": 9.014705831180706e-06, + "loss": 0.8376, + "step": 7466 + }, + { + "epoch": 0.4109747371897188, + "grad_norm": 0.7319341897964478, + "learning_rate": 9.014447444724332e-06, + "loss": 0.7748, + "step": 7467 + }, + { + "epoch": 0.4110297759920744, + "grad_norm": 0.7269605398178101, + "learning_rate": 9.014189028096448e-06, + "loss": 0.6941, + "step": 7468 + }, + { + "epoch": 0.4110848147944301, + "grad_norm": 0.72607421875, + "learning_rate": 9.013930581298993e-06, + "loss": 0.7174, + "step": 7469 + }, + { + "epoch": 0.41113985359678573, + "grad_norm": 0.7385421991348267, + "learning_rate": 9.01367210433391e-06, + "loss": 0.7761, + "step": 7470 + }, + { + "epoch": 0.4111948923991414, + "grad_norm": 0.8392042517662048, + "learning_rate": 9.013413597203144e-06, + "loss": 0.7417, + "step": 7471 + }, + { + "epoch": 0.41124993120149705, + "grad_norm": 0.7454584836959839, + "learning_rate": 9.013155059908634e-06, + "loss": 0.8976, + "step": 7472 + }, + { + "epoch": 0.41130497000385274, + "grad_norm": 0.7358037829399109, + "learning_rate": 9.012896492452325e-06, + "loss": 0.7706, + "step": 7473 + }, + { + "epoch": 0.4113600088062084, + "grad_norm": 0.7454121708869934, + "learning_rate": 9.01263789483616e-06, + "loss": 0.7425, + "step": 7474 + }, + { + "epoch": 0.411415047608564, + "grad_norm": 0.7842294573783875, + "learning_rate": 9.012379267062081e-06, + "loss": 0.7739, + "step": 7475 + }, + { + "epoch": 0.4114700864109197, + "grad_norm": 0.7181714773178101, + "learning_rate": 9.012120609132036e-06, + "loss": 0.8466, + "step": 7476 + }, + { + "epoch": 0.41152512521327533, + "grad_norm": 0.7239206433296204, + "learning_rate": 9.011861921047966e-06, + "loss": 0.7493, + "step": 7477 + }, + { + "epoch": 0.411580164015631, + "grad_norm": 0.6773414611816406, + "learning_rate": 9.011603202811816e-06, + "loss": 0.7433, + "step": 7478 + }, + { + "epoch": 0.41163520281798666, + "grad_norm": 0.7770900130271912, + "learning_rate": 9.011344454425527e-06, + "loss": 0.7488, + "step": 7479 + }, + { + "epoch": 0.41169024162034235, + "grad_norm": 0.7305957674980164, + "learning_rate": 9.011085675891051e-06, + "loss": 0.7989, + "step": 7480 + }, + { + "epoch": 0.411745280422698, + "grad_norm": 0.734603762626648, + "learning_rate": 9.010826867210327e-06, + "loss": 0.805, + "step": 7481 + }, + { + "epoch": 0.41180031922505367, + "grad_norm": 0.7438979148864746, + "learning_rate": 9.010568028385303e-06, + "loss": 0.8407, + "step": 7482 + }, + { + "epoch": 0.4118553580274093, + "grad_norm": 0.6718543767929077, + "learning_rate": 9.01030915941792e-06, + "loss": 0.7575, + "step": 7483 + }, + { + "epoch": 0.411910396829765, + "grad_norm": 0.8157614469528198, + "learning_rate": 9.01005026031013e-06, + "loss": 0.8231, + "step": 7484 + }, + { + "epoch": 0.4119654356321206, + "grad_norm": 0.8927714824676514, + "learning_rate": 9.009791331063874e-06, + "loss": 0.808, + "step": 7485 + }, + { + "epoch": 0.4120204744344763, + "grad_norm": 0.7604075074195862, + "learning_rate": 9.009532371681101e-06, + "loss": 0.7505, + "step": 7486 + }, + { + "epoch": 0.41207551323683195, + "grad_norm": 0.6861944794654846, + "learning_rate": 9.009273382163754e-06, + "loss": 0.719, + "step": 7487 + }, + { + "epoch": 0.41213055203918764, + "grad_norm": 0.7043709754943848, + "learning_rate": 9.009014362513784e-06, + "loss": 0.8193, + "step": 7488 + }, + { + "epoch": 0.41218559084154327, + "grad_norm": 0.7459648847579956, + "learning_rate": 9.008755312733136e-06, + "loss": 0.8617, + "step": 7489 + }, + { + "epoch": 0.41224062964389896, + "grad_norm": 0.7272594571113586, + "learning_rate": 9.008496232823754e-06, + "loss": 0.7255, + "step": 7490 + }, + { + "epoch": 0.4122956684462546, + "grad_norm": 0.7486668229103088, + "learning_rate": 9.008237122787586e-06, + "loss": 0.6479, + "step": 7491 + }, + { + "epoch": 0.4123507072486103, + "grad_norm": 0.8149027228355408, + "learning_rate": 9.007977982626582e-06, + "loss": 0.8052, + "step": 7492 + }, + { + "epoch": 0.4124057460509659, + "grad_norm": 0.7054859399795532, + "learning_rate": 9.00771881234269e-06, + "loss": 0.8215, + "step": 7493 + }, + { + "epoch": 0.4124607848533216, + "grad_norm": 0.6840499639511108, + "learning_rate": 9.007459611937854e-06, + "loss": 0.776, + "step": 7494 + }, + { + "epoch": 0.41251582365567724, + "grad_norm": 0.7340932488441467, + "learning_rate": 9.007200381414026e-06, + "loss": 0.713, + "step": 7495 + }, + { + "epoch": 0.4125708624580329, + "grad_norm": 0.8282599449157715, + "learning_rate": 9.00694112077315e-06, + "loss": 0.7037, + "step": 7496 + }, + { + "epoch": 0.41262590126038856, + "grad_norm": 0.849588930606842, + "learning_rate": 9.00668183001718e-06, + "loss": 0.7845, + "step": 7497 + }, + { + "epoch": 0.41268094006274425, + "grad_norm": 0.8330783843994141, + "learning_rate": 9.00642250914806e-06, + "loss": 0.9049, + "step": 7498 + }, + { + "epoch": 0.4127359788650999, + "grad_norm": 0.7020101547241211, + "learning_rate": 9.00616315816774e-06, + "loss": 0.8146, + "step": 7499 + }, + { + "epoch": 0.4127910176674556, + "grad_norm": 0.7632037997245789, + "learning_rate": 9.005903777078173e-06, + "loss": 0.6629, + "step": 7500 + }, + { + "epoch": 0.4128460564698112, + "grad_norm": 0.7286840081214905, + "learning_rate": 9.005644365881304e-06, + "loss": 0.7795, + "step": 7501 + }, + { + "epoch": 0.4129010952721669, + "grad_norm": 0.710451066493988, + "learning_rate": 9.005384924579084e-06, + "loss": 0.7615, + "step": 7502 + }, + { + "epoch": 0.41295613407452253, + "grad_norm": 0.7657510042190552, + "learning_rate": 9.005125453173463e-06, + "loss": 0.8938, + "step": 7503 + }, + { + "epoch": 0.4130111728768782, + "grad_norm": 0.6978467702865601, + "learning_rate": 9.004865951666392e-06, + "loss": 0.7464, + "step": 7504 + }, + { + "epoch": 0.41306621167923385, + "grad_norm": 0.7028319835662842, + "learning_rate": 9.00460642005982e-06, + "loss": 0.7899, + "step": 7505 + }, + { + "epoch": 0.41312125048158954, + "grad_norm": 0.923951268196106, + "learning_rate": 9.004346858355698e-06, + "loss": 0.8851, + "step": 7506 + }, + { + "epoch": 0.4131762892839452, + "grad_norm": 0.7293704748153687, + "learning_rate": 9.004087266555978e-06, + "loss": 0.7594, + "step": 7507 + }, + { + "epoch": 0.41323132808630086, + "grad_norm": 0.7458868622779846, + "learning_rate": 9.003827644662608e-06, + "loss": 0.7538, + "step": 7508 + }, + { + "epoch": 0.4132863668886565, + "grad_norm": 0.6764113306999207, + "learning_rate": 9.003567992677543e-06, + "loss": 0.7303, + "step": 7509 + }, + { + "epoch": 0.4133414056910122, + "grad_norm": 0.7827350497245789, + "learning_rate": 9.003308310602732e-06, + "loss": 0.7708, + "step": 7510 + }, + { + "epoch": 0.4133964444933678, + "grad_norm": 0.7683281302452087, + "learning_rate": 9.003048598440127e-06, + "loss": 0.7971, + "step": 7511 + }, + { + "epoch": 0.4134514832957235, + "grad_norm": 0.8793813586235046, + "learning_rate": 9.002788856191679e-06, + "loss": 0.7434, + "step": 7512 + }, + { + "epoch": 0.41350652209807914, + "grad_norm": 0.6598063111305237, + "learning_rate": 9.002529083859343e-06, + "loss": 0.7082, + "step": 7513 + }, + { + "epoch": 0.41356156090043483, + "grad_norm": 0.8239839673042297, + "learning_rate": 9.002269281445071e-06, + "loss": 0.8457, + "step": 7514 + }, + { + "epoch": 0.41361659970279047, + "grad_norm": 0.7433123588562012, + "learning_rate": 9.002009448950812e-06, + "loss": 0.7399, + "step": 7515 + }, + { + "epoch": 0.41367163850514616, + "grad_norm": 0.8310487866401672, + "learning_rate": 9.001749586378524e-06, + "loss": 0.7482, + "step": 7516 + }, + { + "epoch": 0.4137266773075018, + "grad_norm": 0.7170824408531189, + "learning_rate": 9.001489693730155e-06, + "loss": 0.7856, + "step": 7517 + }, + { + "epoch": 0.4137817161098574, + "grad_norm": 0.9063520431518555, + "learning_rate": 9.00122977100766e-06, + "loss": 0.8623, + "step": 7518 + }, + { + "epoch": 0.4138367549122131, + "grad_norm": 0.8753733038902283, + "learning_rate": 9.000969818212996e-06, + "loss": 0.7875, + "step": 7519 + }, + { + "epoch": 0.41389179371456875, + "grad_norm": 0.7013519406318665, + "learning_rate": 9.000709835348112e-06, + "loss": 0.724, + "step": 7520 + }, + { + "epoch": 0.41394683251692443, + "grad_norm": 0.7385973334312439, + "learning_rate": 9.000449822414963e-06, + "loss": 0.7286, + "step": 7521 + }, + { + "epoch": 0.41400187131928007, + "grad_norm": 0.7605431079864502, + "learning_rate": 9.000189779415505e-06, + "loss": 0.728, + "step": 7522 + }, + { + "epoch": 0.41405691012163576, + "grad_norm": 0.7631710767745972, + "learning_rate": 8.99992970635169e-06, + "loss": 0.8276, + "step": 7523 + }, + { + "epoch": 0.4141119489239914, + "grad_norm": 0.8066657185554504, + "learning_rate": 8.999669603225477e-06, + "loss": 0.8319, + "step": 7524 + }, + { + "epoch": 0.4141669877263471, + "grad_norm": 0.689407229423523, + "learning_rate": 8.999409470038815e-06, + "loss": 0.6675, + "step": 7525 + }, + { + "epoch": 0.4142220265287027, + "grad_norm": 0.7391255497932434, + "learning_rate": 8.999149306793664e-06, + "loss": 0.8228, + "step": 7526 + }, + { + "epoch": 0.4142770653310584, + "grad_norm": 0.7208844423294067, + "learning_rate": 8.998889113491977e-06, + "loss": 0.7689, + "step": 7527 + }, + { + "epoch": 0.41433210413341404, + "grad_norm": 0.8278803825378418, + "learning_rate": 8.99862889013571e-06, + "loss": 0.7964, + "step": 7528 + }, + { + "epoch": 0.4143871429357697, + "grad_norm": 0.7287253141403198, + "learning_rate": 8.998368636726817e-06, + "loss": 0.7689, + "step": 7529 + }, + { + "epoch": 0.41444218173812536, + "grad_norm": 0.7159145474433899, + "learning_rate": 8.998108353267257e-06, + "loss": 0.7537, + "step": 7530 + }, + { + "epoch": 0.41449722054048105, + "grad_norm": 0.7605739235877991, + "learning_rate": 8.997848039758985e-06, + "loss": 0.7327, + "step": 7531 + }, + { + "epoch": 0.4145522593428367, + "grad_norm": 0.7290406227111816, + "learning_rate": 8.997587696203958e-06, + "loss": 0.6804, + "step": 7532 + }, + { + "epoch": 0.41460729814519237, + "grad_norm": 0.7613189816474915, + "learning_rate": 8.997327322604131e-06, + "loss": 0.7465, + "step": 7533 + }, + { + "epoch": 0.414662336947548, + "grad_norm": 0.7796703577041626, + "learning_rate": 8.99706691896146e-06, + "loss": 0.7444, + "step": 7534 + }, + { + "epoch": 0.4147173757499037, + "grad_norm": 0.8758549094200134, + "learning_rate": 8.996806485277904e-06, + "loss": 0.8586, + "step": 7535 + }, + { + "epoch": 0.4147724145522593, + "grad_norm": 0.9599420428276062, + "learning_rate": 8.996546021555423e-06, + "loss": 0.7554, + "step": 7536 + }, + { + "epoch": 0.414827453354615, + "grad_norm": 0.8216326236724854, + "learning_rate": 8.996285527795972e-06, + "loss": 0.7995, + "step": 7537 + }, + { + "epoch": 0.41488249215697065, + "grad_norm": 0.6777452230453491, + "learning_rate": 8.996025004001507e-06, + "loss": 0.7809, + "step": 7538 + }, + { + "epoch": 0.41493753095932634, + "grad_norm": 0.7354100942611694, + "learning_rate": 8.995764450173989e-06, + "loss": 0.6548, + "step": 7539 + }, + { + "epoch": 0.414992569761682, + "grad_norm": 0.7548280358314514, + "learning_rate": 8.995503866315373e-06, + "loss": 0.8308, + "step": 7540 + }, + { + "epoch": 0.41504760856403766, + "grad_norm": 0.6891447901725769, + "learning_rate": 8.995243252427622e-06, + "loss": 0.8386, + "step": 7541 + }, + { + "epoch": 0.4151026473663933, + "grad_norm": 0.6848340034484863, + "learning_rate": 8.99498260851269e-06, + "loss": 0.7587, + "step": 7542 + }, + { + "epoch": 0.415157686168749, + "grad_norm": 0.7109090685844421, + "learning_rate": 8.994721934572538e-06, + "loss": 0.6847, + "step": 7543 + }, + { + "epoch": 0.4152127249711046, + "grad_norm": 0.6708144545555115, + "learning_rate": 8.994461230609128e-06, + "loss": 0.7266, + "step": 7544 + }, + { + "epoch": 0.4152677637734603, + "grad_norm": 0.6985414028167725, + "learning_rate": 8.994200496624415e-06, + "loss": 0.7696, + "step": 7545 + }, + { + "epoch": 0.41532280257581594, + "grad_norm": 0.6989198923110962, + "learning_rate": 8.993939732620359e-06, + "loss": 0.7894, + "step": 7546 + }, + { + "epoch": 0.41537784137817163, + "grad_norm": 0.6667589545249939, + "learning_rate": 8.993678938598921e-06, + "loss": 0.7417, + "step": 7547 + }, + { + "epoch": 0.41543288018052726, + "grad_norm": 1.0692487955093384, + "learning_rate": 8.993418114562064e-06, + "loss": 0.7147, + "step": 7548 + }, + { + "epoch": 0.41548791898288295, + "grad_norm": 0.6709207892417908, + "learning_rate": 8.993157260511742e-06, + "loss": 0.7694, + "step": 7549 + }, + { + "epoch": 0.4155429577852386, + "grad_norm": 0.6714604496955872, + "learning_rate": 8.992896376449923e-06, + "loss": 0.6969, + "step": 7550 + }, + { + "epoch": 0.4155979965875943, + "grad_norm": 0.8266897201538086, + "learning_rate": 8.99263546237856e-06, + "loss": 0.8392, + "step": 7551 + }, + { + "epoch": 0.4156530353899499, + "grad_norm": 0.675188422203064, + "learning_rate": 8.992374518299619e-06, + "loss": 0.7525, + "step": 7552 + }, + { + "epoch": 0.4157080741923056, + "grad_norm": 0.7406265139579773, + "learning_rate": 8.992113544215059e-06, + "loss": 0.7895, + "step": 7553 + }, + { + "epoch": 0.41576311299466123, + "grad_norm": 0.837336003780365, + "learning_rate": 8.991852540126844e-06, + "loss": 0.7376, + "step": 7554 + }, + { + "epoch": 0.4158181517970169, + "grad_norm": 0.6774994730949402, + "learning_rate": 8.991591506036931e-06, + "loss": 0.7231, + "step": 7555 + }, + { + "epoch": 0.41587319059937256, + "grad_norm": 0.6941245794296265, + "learning_rate": 8.991330441947287e-06, + "loss": 0.7213, + "step": 7556 + }, + { + "epoch": 0.41592822940172824, + "grad_norm": 0.7588210105895996, + "learning_rate": 8.991069347859871e-06, + "loss": 0.7829, + "step": 7557 + }, + { + "epoch": 0.4159832682040839, + "grad_norm": 0.7580196857452393, + "learning_rate": 8.990808223776647e-06, + "loss": 0.7782, + "step": 7558 + }, + { + "epoch": 0.41603830700643957, + "grad_norm": 0.7597478032112122, + "learning_rate": 8.990547069699576e-06, + "loss": 0.7764, + "step": 7559 + }, + { + "epoch": 0.4160933458087952, + "grad_norm": 0.7950314283370972, + "learning_rate": 8.990285885630622e-06, + "loss": 0.7263, + "step": 7560 + }, + { + "epoch": 0.41614838461115083, + "grad_norm": 0.6962432265281677, + "learning_rate": 8.990024671571747e-06, + "loss": 0.6616, + "step": 7561 + }, + { + "epoch": 0.4162034234135065, + "grad_norm": 0.682816207408905, + "learning_rate": 8.989763427524915e-06, + "loss": 0.7862, + "step": 7562 + }, + { + "epoch": 0.41625846221586216, + "grad_norm": 0.686673104763031, + "learning_rate": 8.989502153492089e-06, + "loss": 0.8199, + "step": 7563 + }, + { + "epoch": 0.41631350101821785, + "grad_norm": 0.7954965233802795, + "learning_rate": 8.989240849475231e-06, + "loss": 0.8021, + "step": 7564 + }, + { + "epoch": 0.4163685398205735, + "grad_norm": 0.7516284584999084, + "learning_rate": 8.988979515476309e-06, + "loss": 0.7803, + "step": 7565 + }, + { + "epoch": 0.41642357862292917, + "grad_norm": 0.7148317694664001, + "learning_rate": 8.988718151497284e-06, + "loss": 0.7407, + "step": 7566 + }, + { + "epoch": 0.4164786174252848, + "grad_norm": 0.7898986339569092, + "learning_rate": 8.98845675754012e-06, + "loss": 0.8382, + "step": 7567 + }, + { + "epoch": 0.4165336562276405, + "grad_norm": 0.7014235854148865, + "learning_rate": 8.988195333606784e-06, + "loss": 0.7205, + "step": 7568 + }, + { + "epoch": 0.4165886950299961, + "grad_norm": 0.6520957350730896, + "learning_rate": 8.987933879699238e-06, + "loss": 0.7452, + "step": 7569 + }, + { + "epoch": 0.4166437338323518, + "grad_norm": 0.7462863922119141, + "learning_rate": 8.987672395819449e-06, + "loss": 0.7787, + "step": 7570 + }, + { + "epoch": 0.41669877263470745, + "grad_norm": 0.7366049885749817, + "learning_rate": 8.987410881969382e-06, + "loss": 0.7662, + "step": 7571 + }, + { + "epoch": 0.41675381143706314, + "grad_norm": 0.7732293009757996, + "learning_rate": 8.987149338151002e-06, + "loss": 0.8258, + "step": 7572 + }, + { + "epoch": 0.41680885023941877, + "grad_norm": 0.9309358596801758, + "learning_rate": 8.986887764366275e-06, + "loss": 0.6538, + "step": 7573 + }, + { + "epoch": 0.41686388904177446, + "grad_norm": 0.6976680755615234, + "learning_rate": 8.986626160617167e-06, + "loss": 0.7175, + "step": 7574 + }, + { + "epoch": 0.4169189278441301, + "grad_norm": 0.7541783452033997, + "learning_rate": 8.986364526905645e-06, + "loss": 0.8153, + "step": 7575 + }, + { + "epoch": 0.4169739666464858, + "grad_norm": 0.8968943357467651, + "learning_rate": 8.986102863233673e-06, + "loss": 0.7859, + "step": 7576 + }, + { + "epoch": 0.4170290054488414, + "grad_norm": 0.6910044550895691, + "learning_rate": 8.985841169603218e-06, + "loss": 0.8381, + "step": 7577 + }, + { + "epoch": 0.4170840442511971, + "grad_norm": 0.8944257497787476, + "learning_rate": 8.985579446016249e-06, + "loss": 0.7062, + "step": 7578 + }, + { + "epoch": 0.41713908305355274, + "grad_norm": 0.6665629744529724, + "learning_rate": 8.98531769247473e-06, + "loss": 0.7928, + "step": 7579 + }, + { + "epoch": 0.41719412185590843, + "grad_norm": 0.7642979621887207, + "learning_rate": 8.985055908980634e-06, + "loss": 0.8442, + "step": 7580 + }, + { + "epoch": 0.41724916065826406, + "grad_norm": 0.7575559020042419, + "learning_rate": 8.98479409553592e-06, + "loss": 0.795, + "step": 7581 + }, + { + "epoch": 0.41730419946061975, + "grad_norm": 0.6567206978797913, + "learning_rate": 8.984532252142563e-06, + "loss": 0.713, + "step": 7582 + }, + { + "epoch": 0.4173592382629754, + "grad_norm": 0.6677179336547852, + "learning_rate": 8.984270378802527e-06, + "loss": 0.8173, + "step": 7583 + }, + { + "epoch": 0.4174142770653311, + "grad_norm": 0.6846007704734802, + "learning_rate": 8.984008475517782e-06, + "loss": 0.7154, + "step": 7584 + }, + { + "epoch": 0.4174693158676867, + "grad_norm": 0.7758762836456299, + "learning_rate": 8.983746542290294e-06, + "loss": 0.8686, + "step": 7585 + }, + { + "epoch": 0.4175243546700424, + "grad_norm": 0.6850305199623108, + "learning_rate": 8.983484579122036e-06, + "loss": 0.7568, + "step": 7586 + }, + { + "epoch": 0.41757939347239803, + "grad_norm": 0.7165307998657227, + "learning_rate": 8.983222586014973e-06, + "loss": 0.7856, + "step": 7587 + }, + { + "epoch": 0.4176344322747537, + "grad_norm": 0.7747449278831482, + "learning_rate": 8.982960562971074e-06, + "loss": 0.8148, + "step": 7588 + }, + { + "epoch": 0.41768947107710935, + "grad_norm": 0.789235532283783, + "learning_rate": 8.982698509992311e-06, + "loss": 0.8021, + "step": 7589 + }, + { + "epoch": 0.41774450987946504, + "grad_norm": 0.664186954498291, + "learning_rate": 8.982436427080652e-06, + "loss": 0.7394, + "step": 7590 + }, + { + "epoch": 0.4177995486818207, + "grad_norm": 0.7045899033546448, + "learning_rate": 8.982174314238069e-06, + "loss": 0.7029, + "step": 7591 + }, + { + "epoch": 0.41785458748417637, + "grad_norm": 0.7569751739501953, + "learning_rate": 8.981912171466525e-06, + "loss": 0.6106, + "step": 7592 + }, + { + "epoch": 0.417909626286532, + "grad_norm": 0.7383938431739807, + "learning_rate": 8.981649998767998e-06, + "loss": 0.8163, + "step": 7593 + }, + { + "epoch": 0.4179646650888877, + "grad_norm": 0.7314342856407166, + "learning_rate": 8.981387796144456e-06, + "loss": 0.6847, + "step": 7594 + }, + { + "epoch": 0.4180197038912433, + "grad_norm": 0.7249840497970581, + "learning_rate": 8.981125563597867e-06, + "loss": 0.8025, + "step": 7595 + }, + { + "epoch": 0.418074742693599, + "grad_norm": 0.7260022759437561, + "learning_rate": 8.980863301130206e-06, + "loss": 0.7807, + "step": 7596 + }, + { + "epoch": 0.41812978149595464, + "grad_norm": 0.6249421834945679, + "learning_rate": 8.980601008743441e-06, + "loss": 0.6744, + "step": 7597 + }, + { + "epoch": 0.41818482029831033, + "grad_norm": 0.8132835626602173, + "learning_rate": 8.980338686439544e-06, + "loss": 0.7992, + "step": 7598 + }, + { + "epoch": 0.41823985910066597, + "grad_norm": 0.7279506921768188, + "learning_rate": 8.980076334220487e-06, + "loss": 0.8402, + "step": 7599 + }, + { + "epoch": 0.41829489790302166, + "grad_norm": 0.7168325781822205, + "learning_rate": 8.979813952088242e-06, + "loss": 0.9107, + "step": 7600 + }, + { + "epoch": 0.4183499367053773, + "grad_norm": 0.633661150932312, + "learning_rate": 8.97955154004478e-06, + "loss": 0.6328, + "step": 7601 + }, + { + "epoch": 0.418404975507733, + "grad_norm": 0.6770638227462769, + "learning_rate": 8.979289098092074e-06, + "loss": 0.7604, + "step": 7602 + }, + { + "epoch": 0.4184600143100886, + "grad_norm": 0.7589067816734314, + "learning_rate": 8.979026626232098e-06, + "loss": 0.7774, + "step": 7603 + }, + { + "epoch": 0.41851505311244425, + "grad_norm": 0.7116312980651855, + "learning_rate": 8.97876412446682e-06, + "loss": 0.8186, + "step": 7604 + }, + { + "epoch": 0.41857009191479994, + "grad_norm": 0.7369259595870972, + "learning_rate": 8.978501592798219e-06, + "loss": 0.6705, + "step": 7605 + }, + { + "epoch": 0.41862513071715557, + "grad_norm": 0.6201806664466858, + "learning_rate": 8.978239031228265e-06, + "loss": 0.7011, + "step": 7606 + }, + { + "epoch": 0.41868016951951126, + "grad_norm": 0.7652842998504639, + "learning_rate": 8.977976439758929e-06, + "loss": 0.8112, + "step": 7607 + }, + { + "epoch": 0.4187352083218669, + "grad_norm": 0.7214640974998474, + "learning_rate": 8.97771381839219e-06, + "loss": 0.767, + "step": 7608 + }, + { + "epoch": 0.4187902471242226, + "grad_norm": 0.8093706369400024, + "learning_rate": 8.977451167130015e-06, + "loss": 0.8112, + "step": 7609 + }, + { + "epoch": 0.4188452859265782, + "grad_norm": 0.7023005485534668, + "learning_rate": 8.977188485974382e-06, + "loss": 0.7678, + "step": 7610 + }, + { + "epoch": 0.4189003247289339, + "grad_norm": 0.8126183748245239, + "learning_rate": 8.976925774927267e-06, + "loss": 0.8207, + "step": 7611 + }, + { + "epoch": 0.41895536353128954, + "grad_norm": 0.9624595642089844, + "learning_rate": 8.976663033990643e-06, + "loss": 0.7853, + "step": 7612 + }, + { + "epoch": 0.4190104023336452, + "grad_norm": 0.7866421937942505, + "learning_rate": 8.976400263166483e-06, + "loss": 0.6319, + "step": 7613 + }, + { + "epoch": 0.41906544113600086, + "grad_norm": 0.7555810213088989, + "learning_rate": 8.976137462456762e-06, + "loss": 0.7781, + "step": 7614 + }, + { + "epoch": 0.41912047993835655, + "grad_norm": 0.7383303046226501, + "learning_rate": 8.975874631863457e-06, + "loss": 0.8152, + "step": 7615 + }, + { + "epoch": 0.4191755187407122, + "grad_norm": 0.7873355746269226, + "learning_rate": 8.975611771388542e-06, + "loss": 0.723, + "step": 7616 + }, + { + "epoch": 0.41923055754306787, + "grad_norm": 0.7265962362289429, + "learning_rate": 8.975348881033993e-06, + "loss": 0.8016, + "step": 7617 + }, + { + "epoch": 0.4192855963454235, + "grad_norm": 0.7074393033981323, + "learning_rate": 8.975085960801788e-06, + "loss": 0.7453, + "step": 7618 + }, + { + "epoch": 0.4193406351477792, + "grad_norm": 0.6975581049919128, + "learning_rate": 8.9748230106939e-06, + "loss": 0.6516, + "step": 7619 + }, + { + "epoch": 0.41939567395013483, + "grad_norm": 0.7730469107627869, + "learning_rate": 8.974560030712304e-06, + "loss": 0.7297, + "step": 7620 + }, + { + "epoch": 0.4194507127524905, + "grad_norm": 0.7289026379585266, + "learning_rate": 8.974297020858982e-06, + "loss": 0.7087, + "step": 7621 + }, + { + "epoch": 0.41950575155484615, + "grad_norm": 0.8029256463050842, + "learning_rate": 8.974033981135906e-06, + "loss": 0.7923, + "step": 7622 + }, + { + "epoch": 0.41956079035720184, + "grad_norm": 0.765312135219574, + "learning_rate": 8.973770911545055e-06, + "loss": 0.7824, + "step": 7623 + }, + { + "epoch": 0.4196158291595575, + "grad_norm": 0.7903861403465271, + "learning_rate": 8.973507812088404e-06, + "loss": 0.8207, + "step": 7624 + }, + { + "epoch": 0.41967086796191316, + "grad_norm": 0.6875497698783875, + "learning_rate": 8.973244682767934e-06, + "loss": 0.7972, + "step": 7625 + }, + { + "epoch": 0.4197259067642688, + "grad_norm": 0.7781878709793091, + "learning_rate": 8.972981523585617e-06, + "loss": 0.754, + "step": 7626 + }, + { + "epoch": 0.4197809455666245, + "grad_norm": 0.6495640873908997, + "learning_rate": 8.972718334543437e-06, + "loss": 0.6851, + "step": 7627 + }, + { + "epoch": 0.4198359843689801, + "grad_norm": 0.7610780596733093, + "learning_rate": 8.97245511564337e-06, + "loss": 0.8161, + "step": 7628 + }, + { + "epoch": 0.4198910231713358, + "grad_norm": 0.7764771580696106, + "learning_rate": 8.972191866887393e-06, + "loss": 0.8341, + "step": 7629 + }, + { + "epoch": 0.41994606197369144, + "grad_norm": 0.7709774374961853, + "learning_rate": 8.971928588277485e-06, + "loss": 0.765, + "step": 7630 + }, + { + "epoch": 0.42000110077604713, + "grad_norm": 0.8213009238243103, + "learning_rate": 8.971665279815625e-06, + "loss": 0.8971, + "step": 7631 + }, + { + "epoch": 0.42005613957840277, + "grad_norm": 0.7232406735420227, + "learning_rate": 8.971401941503792e-06, + "loss": 0.7919, + "step": 7632 + }, + { + "epoch": 0.42011117838075845, + "grad_norm": 0.7322028279304504, + "learning_rate": 8.971138573343964e-06, + "loss": 0.8167, + "step": 7633 + }, + { + "epoch": 0.4201662171831141, + "grad_norm": 0.7204442024230957, + "learning_rate": 8.970875175338123e-06, + "loss": 0.8152, + "step": 7634 + }, + { + "epoch": 0.4202212559854698, + "grad_norm": 0.7385342121124268, + "learning_rate": 8.970611747488246e-06, + "loss": 0.8204, + "step": 7635 + }, + { + "epoch": 0.4202762947878254, + "grad_norm": 0.758941113948822, + "learning_rate": 8.970348289796316e-06, + "loss": 0.8402, + "step": 7636 + }, + { + "epoch": 0.4203313335901811, + "grad_norm": 0.7331902384757996, + "learning_rate": 8.970084802264309e-06, + "loss": 0.7305, + "step": 7637 + }, + { + "epoch": 0.42038637239253673, + "grad_norm": 0.7822885513305664, + "learning_rate": 8.969821284894208e-06, + "loss": 0.8708, + "step": 7638 + }, + { + "epoch": 0.4204414111948924, + "grad_norm": 0.6625984311103821, + "learning_rate": 8.969557737687992e-06, + "loss": 0.7806, + "step": 7639 + }, + { + "epoch": 0.42049644999724806, + "grad_norm": 1.02848482131958, + "learning_rate": 8.969294160647645e-06, + "loss": 0.7176, + "step": 7640 + }, + { + "epoch": 0.42055148879960375, + "grad_norm": 0.7888724207878113, + "learning_rate": 8.969030553775144e-06, + "loss": 0.8326, + "step": 7641 + }, + { + "epoch": 0.4206065276019594, + "grad_norm": 0.7148883938789368, + "learning_rate": 8.968766917072472e-06, + "loss": 0.7405, + "step": 7642 + }, + { + "epoch": 0.42066156640431507, + "grad_norm": 0.6629698872566223, + "learning_rate": 8.96850325054161e-06, + "loss": 0.845, + "step": 7643 + }, + { + "epoch": 0.4207166052066707, + "grad_norm": 0.8414682149887085, + "learning_rate": 8.96823955418454e-06, + "loss": 1.3631, + "step": 7644 + }, + { + "epoch": 0.4207716440090264, + "grad_norm": 0.7105298638343811, + "learning_rate": 8.967975828003244e-06, + "loss": 0.6808, + "step": 7645 + }, + { + "epoch": 0.420826682811382, + "grad_norm": 0.7324852347373962, + "learning_rate": 8.967712071999703e-06, + "loss": 0.8237, + "step": 7646 + }, + { + "epoch": 0.42088172161373766, + "grad_norm": 0.737324595451355, + "learning_rate": 8.9674482861759e-06, + "loss": 0.8486, + "step": 7647 + }, + { + "epoch": 0.42093676041609335, + "grad_norm": 0.6763800382614136, + "learning_rate": 8.967184470533818e-06, + "loss": 0.72, + "step": 7648 + }, + { + "epoch": 0.420991799218449, + "grad_norm": 0.7560757994651794, + "learning_rate": 8.96692062507544e-06, + "loss": 0.7704, + "step": 7649 + }, + { + "epoch": 0.42104683802080467, + "grad_norm": 0.7289260029792786, + "learning_rate": 8.966656749802748e-06, + "loss": 0.7411, + "step": 7650 + }, + { + "epoch": 0.4211018768231603, + "grad_norm": 0.6935442686080933, + "learning_rate": 8.966392844717726e-06, + "loss": 0.7848, + "step": 7651 + }, + { + "epoch": 0.421156915625516, + "grad_norm": 0.7111918330192566, + "learning_rate": 8.966128909822356e-06, + "loss": 0.8377, + "step": 7652 + }, + { + "epoch": 0.4212119544278716, + "grad_norm": 0.8594884872436523, + "learning_rate": 8.965864945118625e-06, + "loss": 0.8227, + "step": 7653 + }, + { + "epoch": 0.4212669932302273, + "grad_norm": 0.6521008014678955, + "learning_rate": 8.965600950608513e-06, + "loss": 0.7034, + "step": 7654 + }, + { + "epoch": 0.42132203203258295, + "grad_norm": 0.6362404823303223, + "learning_rate": 8.965336926294007e-06, + "loss": 0.6712, + "step": 7655 + }, + { + "epoch": 0.42137707083493864, + "grad_norm": 0.6955040097236633, + "learning_rate": 8.965072872177088e-06, + "loss": 0.7789, + "step": 7656 + }, + { + "epoch": 0.42143210963729427, + "grad_norm": 0.7311720252037048, + "learning_rate": 8.964808788259745e-06, + "loss": 0.7522, + "step": 7657 + }, + { + "epoch": 0.42148714843964996, + "grad_norm": 0.781131386756897, + "learning_rate": 8.96454467454396e-06, + "loss": 0.7831, + "step": 7658 + }, + { + "epoch": 0.4215421872420056, + "grad_norm": 0.6740639805793762, + "learning_rate": 8.964280531031718e-06, + "loss": 0.7102, + "step": 7659 + }, + { + "epoch": 0.4215972260443613, + "grad_norm": 0.7843424677848816, + "learning_rate": 8.964016357725003e-06, + "loss": 0.8325, + "step": 7660 + }, + { + "epoch": 0.4216522648467169, + "grad_norm": 0.7833517789840698, + "learning_rate": 8.963752154625804e-06, + "loss": 0.8603, + "step": 7661 + }, + { + "epoch": 0.4217073036490726, + "grad_norm": 0.7270992994308472, + "learning_rate": 8.963487921736104e-06, + "loss": 0.745, + "step": 7662 + }, + { + "epoch": 0.42176234245142824, + "grad_norm": 0.6517582535743713, + "learning_rate": 8.963223659057892e-06, + "loss": 0.6983, + "step": 7663 + }, + { + "epoch": 0.42181738125378393, + "grad_norm": 0.6974934935569763, + "learning_rate": 8.962959366593149e-06, + "loss": 0.733, + "step": 7664 + }, + { + "epoch": 0.42187242005613956, + "grad_norm": 0.712045431137085, + "learning_rate": 8.962695044343865e-06, + "loss": 0.725, + "step": 7665 + }, + { + "epoch": 0.42192745885849525, + "grad_norm": 0.7311459183692932, + "learning_rate": 8.962430692312028e-06, + "loss": 0.8025, + "step": 7666 + }, + { + "epoch": 0.4219824976608509, + "grad_norm": 0.7439966201782227, + "learning_rate": 8.962166310499621e-06, + "loss": 0.7711, + "step": 7667 + }, + { + "epoch": 0.4220375364632066, + "grad_norm": 0.690832257270813, + "learning_rate": 8.961901898908632e-06, + "loss": 0.8414, + "step": 7668 + }, + { + "epoch": 0.4220925752655622, + "grad_norm": 0.8437964916229248, + "learning_rate": 8.961637457541049e-06, + "loss": 0.8253, + "step": 7669 + }, + { + "epoch": 0.4221476140679179, + "grad_norm": 0.7876344323158264, + "learning_rate": 8.96137298639886e-06, + "loss": 0.754, + "step": 7670 + }, + { + "epoch": 0.42220265287027353, + "grad_norm": 0.7551780343055725, + "learning_rate": 8.961108485484052e-06, + "loss": 0.8555, + "step": 7671 + }, + { + "epoch": 0.4222576916726292, + "grad_norm": 0.6867276430130005, + "learning_rate": 8.96084395479861e-06, + "loss": 0.7216, + "step": 7672 + }, + { + "epoch": 0.42231273047498485, + "grad_norm": 0.9052873849868774, + "learning_rate": 8.960579394344528e-06, + "loss": 0.7945, + "step": 7673 + }, + { + "epoch": 0.42236776927734054, + "grad_norm": 0.6731994152069092, + "learning_rate": 8.96031480412379e-06, + "loss": 0.7691, + "step": 7674 + }, + { + "epoch": 0.4224228080796962, + "grad_norm": 0.7074670195579529, + "learning_rate": 8.960050184138389e-06, + "loss": 0.8008, + "step": 7675 + }, + { + "epoch": 0.42247784688205187, + "grad_norm": 0.9482604265213013, + "learning_rate": 8.959785534390309e-06, + "loss": 0.7095, + "step": 7676 + }, + { + "epoch": 0.4225328856844075, + "grad_norm": 0.6915413737297058, + "learning_rate": 8.95952085488154e-06, + "loss": 0.6717, + "step": 7677 + }, + { + "epoch": 0.4225879244867632, + "grad_norm": 0.7565900087356567, + "learning_rate": 8.959256145614073e-06, + "loss": 0.8311, + "step": 7678 + }, + { + "epoch": 0.4226429632891188, + "grad_norm": 0.8307167887687683, + "learning_rate": 8.958991406589896e-06, + "loss": 0.8585, + "step": 7679 + }, + { + "epoch": 0.4226980020914745, + "grad_norm": 0.7955091595649719, + "learning_rate": 8.958726637811e-06, + "loss": 0.8154, + "step": 7680 + }, + { + "epoch": 0.42275304089383015, + "grad_norm": 0.7692292332649231, + "learning_rate": 8.958461839279376e-06, + "loss": 0.7965, + "step": 7681 + }, + { + "epoch": 0.42280807969618583, + "grad_norm": 0.7355942726135254, + "learning_rate": 8.95819701099701e-06, + "loss": 0.7557, + "step": 7682 + }, + { + "epoch": 0.42286311849854147, + "grad_norm": 0.8781518936157227, + "learning_rate": 8.957932152965895e-06, + "loss": 0.8033, + "step": 7683 + }, + { + "epoch": 0.42291815730089716, + "grad_norm": 0.7180802226066589, + "learning_rate": 8.957667265188022e-06, + "loss": 0.7283, + "step": 7684 + }, + { + "epoch": 0.4229731961032528, + "grad_norm": 0.6967236995697021, + "learning_rate": 8.95740234766538e-06, + "loss": 0.769, + "step": 7685 + }, + { + "epoch": 0.4230282349056085, + "grad_norm": 0.7462503910064697, + "learning_rate": 8.957137400399963e-06, + "loss": 0.8179, + "step": 7686 + }, + { + "epoch": 0.4230832737079641, + "grad_norm": 0.67714524269104, + "learning_rate": 8.956872423393761e-06, + "loss": 0.7976, + "step": 7687 + }, + { + "epoch": 0.4231383125103198, + "grad_norm": 0.8239946365356445, + "learning_rate": 8.956607416648763e-06, + "loss": 0.7946, + "step": 7688 + }, + { + "epoch": 0.42319335131267544, + "grad_norm": 0.6724610924720764, + "learning_rate": 8.956342380166963e-06, + "loss": 0.7633, + "step": 7689 + }, + { + "epoch": 0.42324839011503107, + "grad_norm": 0.744987964630127, + "learning_rate": 8.956077313950354e-06, + "loss": 0.9028, + "step": 7690 + }, + { + "epoch": 0.42330342891738676, + "grad_norm": 0.7700596451759338, + "learning_rate": 8.955812218000925e-06, + "loss": 0.8954, + "step": 7691 + }, + { + "epoch": 0.4233584677197424, + "grad_norm": 0.6952996253967285, + "learning_rate": 8.955547092320673e-06, + "loss": 0.8094, + "step": 7692 + }, + { + "epoch": 0.4234135065220981, + "grad_norm": 0.6410536766052246, + "learning_rate": 8.955281936911586e-06, + "loss": 0.6281, + "step": 7693 + }, + { + "epoch": 0.4234685453244537, + "grad_norm": 1.0939754247665405, + "learning_rate": 8.95501675177566e-06, + "loss": 0.8239, + "step": 7694 + }, + { + "epoch": 0.4235235841268094, + "grad_norm": 0.7419464588165283, + "learning_rate": 8.954751536914885e-06, + "loss": 0.8015, + "step": 7695 + }, + { + "epoch": 0.42357862292916504, + "grad_norm": 0.8171356320381165, + "learning_rate": 8.954486292331257e-06, + "loss": 0.8183, + "step": 7696 + }, + { + "epoch": 0.4236336617315207, + "grad_norm": 0.745884358882904, + "learning_rate": 8.95422101802677e-06, + "loss": 0.7457, + "step": 7697 + }, + { + "epoch": 0.42368870053387636, + "grad_norm": 0.7355740070343018, + "learning_rate": 8.953955714003414e-06, + "loss": 0.7517, + "step": 7698 + }, + { + "epoch": 0.42374373933623205, + "grad_norm": 0.7103458642959595, + "learning_rate": 8.953690380263186e-06, + "loss": 0.7306, + "step": 7699 + }, + { + "epoch": 0.4237987781385877, + "grad_norm": 0.7453970909118652, + "learning_rate": 8.95342501680808e-06, + "loss": 0.8396, + "step": 7700 + }, + { + "epoch": 0.4238538169409434, + "grad_norm": 0.7132760286331177, + "learning_rate": 8.953159623640088e-06, + "loss": 0.7861, + "step": 7701 + }, + { + "epoch": 0.423908855743299, + "grad_norm": 0.785827100276947, + "learning_rate": 8.952894200761209e-06, + "loss": 0.8681, + "step": 7702 + }, + { + "epoch": 0.4239638945456547, + "grad_norm": 0.7075281143188477, + "learning_rate": 8.952628748173433e-06, + "loss": 0.7257, + "step": 7703 + }, + { + "epoch": 0.42401893334801033, + "grad_norm": 0.8205186724662781, + "learning_rate": 8.952363265878758e-06, + "loss": 0.7361, + "step": 7704 + }, + { + "epoch": 0.424073972150366, + "grad_norm": 0.6517061591148376, + "learning_rate": 8.952097753879181e-06, + "loss": 0.7127, + "step": 7705 + }, + { + "epoch": 0.42412901095272165, + "grad_norm": 0.7252761125564575, + "learning_rate": 8.951832212176692e-06, + "loss": 0.796, + "step": 7706 + }, + { + "epoch": 0.42418404975507734, + "grad_norm": 0.6688609719276428, + "learning_rate": 8.951566640773292e-06, + "loss": 0.7698, + "step": 7707 + }, + { + "epoch": 0.424239088557433, + "grad_norm": 0.7163566946983337, + "learning_rate": 8.951301039670974e-06, + "loss": 0.8069, + "step": 7708 + }, + { + "epoch": 0.42429412735978866, + "grad_norm": 0.7027623057365417, + "learning_rate": 8.951035408871735e-06, + "loss": 0.7061, + "step": 7709 + }, + { + "epoch": 0.4243491661621443, + "grad_norm": 0.9558683037757874, + "learning_rate": 8.950769748377572e-06, + "loss": 0.926, + "step": 7710 + }, + { + "epoch": 0.4244042049645, + "grad_norm": 0.7173893451690674, + "learning_rate": 8.950504058190482e-06, + "loss": 0.7519, + "step": 7711 + }, + { + "epoch": 0.4244592437668556, + "grad_norm": 0.8481128811836243, + "learning_rate": 8.950238338312459e-06, + "loss": 0.7804, + "step": 7712 + }, + { + "epoch": 0.4245142825692113, + "grad_norm": 0.6957072615623474, + "learning_rate": 8.949972588745502e-06, + "loss": 0.611, + "step": 7713 + }, + { + "epoch": 0.42456932137156694, + "grad_norm": 0.7910122871398926, + "learning_rate": 8.94970680949161e-06, + "loss": 0.8435, + "step": 7714 + }, + { + "epoch": 0.42462436017392263, + "grad_norm": 0.8068616986274719, + "learning_rate": 8.949441000552777e-06, + "loss": 0.8658, + "step": 7715 + }, + { + "epoch": 0.42467939897627827, + "grad_norm": 0.718110978603363, + "learning_rate": 8.949175161931006e-06, + "loss": 0.7908, + "step": 7716 + }, + { + "epoch": 0.42473443777863396, + "grad_norm": 0.7329656481742859, + "learning_rate": 8.948909293628289e-06, + "loss": 0.7477, + "step": 7717 + }, + { + "epoch": 0.4247894765809896, + "grad_norm": 0.7046940326690674, + "learning_rate": 8.948643395646625e-06, + "loss": 0.7985, + "step": 7718 + }, + { + "epoch": 0.4248445153833453, + "grad_norm": 0.6699581742286682, + "learning_rate": 8.948377467988017e-06, + "loss": 0.6575, + "step": 7719 + }, + { + "epoch": 0.4248995541857009, + "grad_norm": 0.8055217266082764, + "learning_rate": 8.94811151065446e-06, + "loss": 0.7008, + "step": 7720 + }, + { + "epoch": 0.4249545929880566, + "grad_norm": 0.8374543190002441, + "learning_rate": 8.947845523647954e-06, + "loss": 0.8918, + "step": 7721 + }, + { + "epoch": 0.42500963179041223, + "grad_norm": 0.6974833607673645, + "learning_rate": 8.947579506970498e-06, + "loss": 0.8594, + "step": 7722 + }, + { + "epoch": 0.4250646705927679, + "grad_norm": 0.7466567754745483, + "learning_rate": 8.947313460624091e-06, + "loss": 0.6935, + "step": 7723 + }, + { + "epoch": 0.42511970939512356, + "grad_norm": 0.8118101358413696, + "learning_rate": 8.947047384610734e-06, + "loss": 0.8432, + "step": 7724 + }, + { + "epoch": 0.42517474819747925, + "grad_norm": 0.6885644197463989, + "learning_rate": 8.946781278932422e-06, + "loss": 0.8059, + "step": 7725 + }, + { + "epoch": 0.4252297869998349, + "grad_norm": 0.7257012128829956, + "learning_rate": 8.94651514359116e-06, + "loss": 0.8239, + "step": 7726 + }, + { + "epoch": 0.42528482580219057, + "grad_norm": 1.311591386795044, + "learning_rate": 8.946248978588947e-06, + "loss": 0.8207, + "step": 7727 + }, + { + "epoch": 0.4253398646045462, + "grad_norm": 0.7694151997566223, + "learning_rate": 8.945982783927784e-06, + "loss": 0.8948, + "step": 7728 + }, + { + "epoch": 0.4253949034069019, + "grad_norm": 0.6922980546951294, + "learning_rate": 8.945716559609669e-06, + "loss": 0.7883, + "step": 7729 + }, + { + "epoch": 0.4254499422092575, + "grad_norm": 0.7803757786750793, + "learning_rate": 8.945450305636605e-06, + "loss": 0.9166, + "step": 7730 + }, + { + "epoch": 0.4255049810116132, + "grad_norm": 0.6775311827659607, + "learning_rate": 8.945184022010593e-06, + "loss": 0.6976, + "step": 7731 + }, + { + "epoch": 0.42556001981396885, + "grad_norm": 0.7108052968978882, + "learning_rate": 8.944917708733634e-06, + "loss": 0.7763, + "step": 7732 + }, + { + "epoch": 0.4256150586163245, + "grad_norm": 0.7215770483016968, + "learning_rate": 8.94465136580773e-06, + "loss": 0.7907, + "step": 7733 + }, + { + "epoch": 0.42567009741868017, + "grad_norm": 0.6690788865089417, + "learning_rate": 8.944384993234881e-06, + "loss": 0.8403, + "step": 7734 + }, + { + "epoch": 0.4257251362210358, + "grad_norm": 0.7372478246688843, + "learning_rate": 8.94411859101709e-06, + "loss": 0.7618, + "step": 7735 + }, + { + "epoch": 0.4257801750233915, + "grad_norm": 0.9398306608200073, + "learning_rate": 8.94385215915636e-06, + "loss": 0.9043, + "step": 7736 + }, + { + "epoch": 0.4258352138257471, + "grad_norm": 0.8790311217308044, + "learning_rate": 8.943585697654693e-06, + "loss": 0.9378, + "step": 7737 + }, + { + "epoch": 0.4258902526281028, + "grad_norm": 0.7579166889190674, + "learning_rate": 8.943319206514091e-06, + "loss": 0.7913, + "step": 7738 + }, + { + "epoch": 0.42594529143045845, + "grad_norm": 0.6426860690116882, + "learning_rate": 8.943052685736559e-06, + "loss": 0.744, + "step": 7739 + }, + { + "epoch": 0.42600033023281414, + "grad_norm": 0.688117265701294, + "learning_rate": 8.942786135324098e-06, + "loss": 0.8386, + "step": 7740 + }, + { + "epoch": 0.4260553690351698, + "grad_norm": 0.7178692817687988, + "learning_rate": 8.94251955527871e-06, + "loss": 0.7937, + "step": 7741 + }, + { + "epoch": 0.42611040783752546, + "grad_norm": 0.7980415225028992, + "learning_rate": 8.942252945602403e-06, + "loss": 0.76, + "step": 7742 + }, + { + "epoch": 0.4261654466398811, + "grad_norm": 0.6858333349227905, + "learning_rate": 8.941986306297175e-06, + "loss": 0.8155, + "step": 7743 + }, + { + "epoch": 0.4262204854422368, + "grad_norm": 0.763297975063324, + "learning_rate": 8.941719637365037e-06, + "loss": 0.8003, + "step": 7744 + }, + { + "epoch": 0.4262755242445924, + "grad_norm": 0.661016047000885, + "learning_rate": 8.941452938807986e-06, + "loss": 0.6788, + "step": 7745 + }, + { + "epoch": 0.4263305630469481, + "grad_norm": 0.7168089151382446, + "learning_rate": 8.94118621062803e-06, + "loss": 0.7791, + "step": 7746 + }, + { + "epoch": 0.42638560184930374, + "grad_norm": 0.6879743337631226, + "learning_rate": 8.940919452827174e-06, + "loss": 0.7978, + "step": 7747 + }, + { + "epoch": 0.42644064065165943, + "grad_norm": 0.672298014163971, + "learning_rate": 8.940652665407424e-06, + "loss": 0.7569, + "step": 7748 + }, + { + "epoch": 0.42649567945401506, + "grad_norm": 0.7237414717674255, + "learning_rate": 8.940385848370782e-06, + "loss": 0.6788, + "step": 7749 + }, + { + "epoch": 0.42655071825637075, + "grad_norm": 0.6793895363807678, + "learning_rate": 8.940119001719255e-06, + "loss": 0.749, + "step": 7750 + }, + { + "epoch": 0.4266057570587264, + "grad_norm": 1.1172789335250854, + "learning_rate": 8.939852125454847e-06, + "loss": 0.9017, + "step": 7751 + }, + { + "epoch": 0.4266607958610821, + "grad_norm": 0.7138717770576477, + "learning_rate": 8.939585219579567e-06, + "loss": 0.8586, + "step": 7752 + }, + { + "epoch": 0.4267158346634377, + "grad_norm": 0.8678629398345947, + "learning_rate": 8.939318284095417e-06, + "loss": 0.7333, + "step": 7753 + }, + { + "epoch": 0.4267708734657934, + "grad_norm": 0.7274941802024841, + "learning_rate": 8.939051319004407e-06, + "loss": 0.8426, + "step": 7754 + }, + { + "epoch": 0.42682591226814903, + "grad_norm": 0.6845358610153198, + "learning_rate": 8.93878432430854e-06, + "loss": 0.7731, + "step": 7755 + }, + { + "epoch": 0.4268809510705047, + "grad_norm": 0.7042781710624695, + "learning_rate": 8.938517300009826e-06, + "loss": 0.6703, + "step": 7756 + }, + { + "epoch": 0.42693598987286036, + "grad_norm": 0.7147190570831299, + "learning_rate": 8.93825024611027e-06, + "loss": 0.7977, + "step": 7757 + }, + { + "epoch": 0.42699102867521604, + "grad_norm": 0.6584187150001526, + "learning_rate": 8.93798316261188e-06, + "loss": 0.716, + "step": 7758 + }, + { + "epoch": 0.4270460674775717, + "grad_norm": 0.8061439990997314, + "learning_rate": 8.93771604951666e-06, + "loss": 0.9075, + "step": 7759 + }, + { + "epoch": 0.42710110627992737, + "grad_norm": 0.6741406917572021, + "learning_rate": 8.937448906826622e-06, + "loss": 0.7828, + "step": 7760 + }, + { + "epoch": 0.427156145082283, + "grad_norm": 0.8791692852973938, + "learning_rate": 8.937181734543773e-06, + "loss": 0.7685, + "step": 7761 + }, + { + "epoch": 0.4272111838846387, + "grad_norm": 0.6804112195968628, + "learning_rate": 8.936914532670119e-06, + "loss": 0.7672, + "step": 7762 + }, + { + "epoch": 0.4272662226869943, + "grad_norm": 0.6983451843261719, + "learning_rate": 8.936647301207668e-06, + "loss": 0.8228, + "step": 7763 + }, + { + "epoch": 0.42732126148935, + "grad_norm": 0.8248929977416992, + "learning_rate": 8.936380040158432e-06, + "loss": 0.7628, + "step": 7764 + }, + { + "epoch": 0.42737630029170565, + "grad_norm": 0.8324941992759705, + "learning_rate": 8.936112749524415e-06, + "loss": 0.8125, + "step": 7765 + }, + { + "epoch": 0.42743133909406134, + "grad_norm": 0.7489150762557983, + "learning_rate": 8.935845429307631e-06, + "loss": 0.8766, + "step": 7766 + }, + { + "epoch": 0.42748637789641697, + "grad_norm": 0.7323104739189148, + "learning_rate": 8.935578079510083e-06, + "loss": 0.8607, + "step": 7767 + }, + { + "epoch": 0.42754141669877266, + "grad_norm": 0.6825152635574341, + "learning_rate": 8.935310700133786e-06, + "loss": 0.7817, + "step": 7768 + }, + { + "epoch": 0.4275964555011283, + "grad_norm": 0.8928677439689636, + "learning_rate": 8.935043291180748e-06, + "loss": 0.7621, + "step": 7769 + }, + { + "epoch": 0.427651494303484, + "grad_norm": 0.7071405649185181, + "learning_rate": 8.934775852652975e-06, + "loss": 0.7798, + "step": 7770 + }, + { + "epoch": 0.4277065331058396, + "grad_norm": 0.8225427269935608, + "learning_rate": 8.934508384552481e-06, + "loss": 0.7212, + "step": 7771 + }, + { + "epoch": 0.4277615719081953, + "grad_norm": 0.6931234002113342, + "learning_rate": 8.934240886881276e-06, + "loss": 0.7301, + "step": 7772 + }, + { + "epoch": 0.42781661071055094, + "grad_norm": 0.6901859641075134, + "learning_rate": 8.933973359641369e-06, + "loss": 0.6974, + "step": 7773 + }, + { + "epoch": 0.4278716495129066, + "grad_norm": 0.7736960649490356, + "learning_rate": 8.93370580283477e-06, + "loss": 0.6562, + "step": 7774 + }, + { + "epoch": 0.42792668831526226, + "grad_norm": 0.7363499999046326, + "learning_rate": 8.933438216463495e-06, + "loss": 0.8274, + "step": 7775 + }, + { + "epoch": 0.4279817271176179, + "grad_norm": 0.6855602860450745, + "learning_rate": 8.933170600529548e-06, + "loss": 0.7576, + "step": 7776 + }, + { + "epoch": 0.4280367659199736, + "grad_norm": 0.7641676664352417, + "learning_rate": 8.932902955034945e-06, + "loss": 0.7837, + "step": 7777 + }, + { + "epoch": 0.4280918047223292, + "grad_norm": 0.74812251329422, + "learning_rate": 8.932635279981695e-06, + "loss": 0.8402, + "step": 7778 + }, + { + "epoch": 0.4281468435246849, + "grad_norm": 0.7445259094238281, + "learning_rate": 8.932367575371813e-06, + "loss": 0.862, + "step": 7779 + }, + { + "epoch": 0.42820188232704054, + "grad_norm": 0.8977177739143372, + "learning_rate": 8.932099841207306e-06, + "loss": 0.7735, + "step": 7780 + }, + { + "epoch": 0.42825692112939623, + "grad_norm": 0.74172043800354, + "learning_rate": 8.93183207749019e-06, + "loss": 0.7053, + "step": 7781 + }, + { + "epoch": 0.42831195993175186, + "grad_norm": 0.6670083999633789, + "learning_rate": 8.931564284222479e-06, + "loss": 0.6348, + "step": 7782 + }, + { + "epoch": 0.42836699873410755, + "grad_norm": 0.7575422525405884, + "learning_rate": 8.93129646140618e-06, + "loss": 0.9354, + "step": 7783 + }, + { + "epoch": 0.4284220375364632, + "grad_norm": 0.7436977624893188, + "learning_rate": 8.931028609043311e-06, + "loss": 0.7461, + "step": 7784 + }, + { + "epoch": 0.4284770763388189, + "grad_norm": 0.7383070588111877, + "learning_rate": 8.930760727135882e-06, + "loss": 0.7629, + "step": 7785 + }, + { + "epoch": 0.4285321151411745, + "grad_norm": 0.6926067471504211, + "learning_rate": 8.93049281568591e-06, + "loss": 0.6788, + "step": 7786 + }, + { + "epoch": 0.4285871539435302, + "grad_norm": 0.7680530548095703, + "learning_rate": 8.930224874695404e-06, + "loss": 0.722, + "step": 7787 + }, + { + "epoch": 0.42864219274588583, + "grad_norm": 0.9880867004394531, + "learning_rate": 8.92995690416638e-06, + "loss": 0.833, + "step": 7788 + }, + { + "epoch": 0.4286972315482415, + "grad_norm": 0.7915430068969727, + "learning_rate": 8.929688904100853e-06, + "loss": 0.7643, + "step": 7789 + }, + { + "epoch": 0.42875227035059715, + "grad_norm": 0.6972275376319885, + "learning_rate": 8.929420874500836e-06, + "loss": 0.7697, + "step": 7790 + }, + { + "epoch": 0.42880730915295284, + "grad_norm": 0.9583331346511841, + "learning_rate": 8.929152815368343e-06, + "loss": 0.7591, + "step": 7791 + }, + { + "epoch": 0.4288623479553085, + "grad_norm": 0.7254299521446228, + "learning_rate": 8.928884726705388e-06, + "loss": 0.7913, + "step": 7792 + }, + { + "epoch": 0.42891738675766417, + "grad_norm": 0.7925865054130554, + "learning_rate": 8.928616608513989e-06, + "loss": 0.8248, + "step": 7793 + }, + { + "epoch": 0.4289724255600198, + "grad_norm": 0.9367457628250122, + "learning_rate": 8.928348460796157e-06, + "loss": 0.7767, + "step": 7794 + }, + { + "epoch": 0.4290274643623755, + "grad_norm": 0.8511868119239807, + "learning_rate": 8.928080283553912e-06, + "loss": 0.841, + "step": 7795 + }, + { + "epoch": 0.4290825031647311, + "grad_norm": 0.8518061637878418, + "learning_rate": 8.927812076789267e-06, + "loss": 0.7907, + "step": 7796 + }, + { + "epoch": 0.4291375419670868, + "grad_norm": 0.7208365797996521, + "learning_rate": 8.927543840504236e-06, + "loss": 0.7344, + "step": 7797 + }, + { + "epoch": 0.42919258076944244, + "grad_norm": 0.7541850209236145, + "learning_rate": 8.927275574700838e-06, + "loss": 0.7724, + "step": 7798 + }, + { + "epoch": 0.42924761957179813, + "grad_norm": 0.7378629446029663, + "learning_rate": 8.927007279381087e-06, + "loss": 0.7614, + "step": 7799 + }, + { + "epoch": 0.42930265837415377, + "grad_norm": 0.7358561158180237, + "learning_rate": 8.926738954547001e-06, + "loss": 0.7288, + "step": 7800 + }, + { + "epoch": 0.42935769717650946, + "grad_norm": 0.7385967969894409, + "learning_rate": 8.926470600200597e-06, + "loss": 0.7562, + "step": 7801 + }, + { + "epoch": 0.4294127359788651, + "grad_norm": 0.6904877424240112, + "learning_rate": 8.92620221634389e-06, + "loss": 0.6507, + "step": 7802 + }, + { + "epoch": 0.4294677747812208, + "grad_norm": 0.7205148935317993, + "learning_rate": 8.925933802978898e-06, + "loss": 0.7683, + "step": 7803 + }, + { + "epoch": 0.4295228135835764, + "grad_norm": 0.6830344200134277, + "learning_rate": 8.925665360107639e-06, + "loss": 0.6886, + "step": 7804 + }, + { + "epoch": 0.4295778523859321, + "grad_norm": 0.7648812532424927, + "learning_rate": 8.92539688773213e-06, + "loss": 0.7559, + "step": 7805 + }, + { + "epoch": 0.42963289118828774, + "grad_norm": 0.7819112539291382, + "learning_rate": 8.925128385854389e-06, + "loss": 0.7443, + "step": 7806 + }, + { + "epoch": 0.4296879299906434, + "grad_norm": 0.6742433309555054, + "learning_rate": 8.924859854476433e-06, + "loss": 0.7191, + "step": 7807 + }, + { + "epoch": 0.42974296879299906, + "grad_norm": 0.7368177771568298, + "learning_rate": 8.924591293600281e-06, + "loss": 0.6946, + "step": 7808 + }, + { + "epoch": 0.42979800759535475, + "grad_norm": 0.663112998008728, + "learning_rate": 8.924322703227953e-06, + "loss": 0.7405, + "step": 7809 + }, + { + "epoch": 0.4298530463977104, + "grad_norm": 0.6735410690307617, + "learning_rate": 8.924054083361465e-06, + "loss": 0.7982, + "step": 7810 + }, + { + "epoch": 0.42990808520006607, + "grad_norm": 0.7770369648933411, + "learning_rate": 8.923785434002834e-06, + "loss": 0.9179, + "step": 7811 + }, + { + "epoch": 0.4299631240024217, + "grad_norm": 0.7464482188224792, + "learning_rate": 8.923516755154085e-06, + "loss": 0.8514, + "step": 7812 + }, + { + "epoch": 0.4300181628047774, + "grad_norm": 0.9249551892280579, + "learning_rate": 8.923248046817235e-06, + "loss": 0.8287, + "step": 7813 + }, + { + "epoch": 0.430073201607133, + "grad_norm": 0.7071338891983032, + "learning_rate": 8.922979308994302e-06, + "loss": 0.7509, + "step": 7814 + }, + { + "epoch": 0.4301282404094887, + "grad_norm": 0.6910794377326965, + "learning_rate": 8.922710541687305e-06, + "loss": 0.7373, + "step": 7815 + }, + { + "epoch": 0.43018327921184435, + "grad_norm": 0.8424028158187866, + "learning_rate": 8.922441744898267e-06, + "loss": 0.741, + "step": 7816 + }, + { + "epoch": 0.43023831801420004, + "grad_norm": 0.8162125945091248, + "learning_rate": 8.922172918629208e-06, + "loss": 0.8044, + "step": 7817 + }, + { + "epoch": 0.43029335681655567, + "grad_norm": 0.7415170669555664, + "learning_rate": 8.921904062882145e-06, + "loss": 0.7427, + "step": 7818 + }, + { + "epoch": 0.4303483956189113, + "grad_norm": 1.1357808113098145, + "learning_rate": 8.921635177659103e-06, + "loss": 0.7802, + "step": 7819 + }, + { + "epoch": 0.430403434421267, + "grad_norm": 0.7039839625358582, + "learning_rate": 8.9213662629621e-06, + "loss": 0.7368, + "step": 7820 + }, + { + "epoch": 0.43045847322362263, + "grad_norm": 0.721077024936676, + "learning_rate": 8.921097318793157e-06, + "loss": 0.6575, + "step": 7821 + }, + { + "epoch": 0.4305135120259783, + "grad_norm": 0.7823510766029358, + "learning_rate": 8.920828345154297e-06, + "loss": 0.7499, + "step": 7822 + }, + { + "epoch": 0.43056855082833395, + "grad_norm": 0.6400569677352905, + "learning_rate": 8.920559342047539e-06, + "loss": 0.7091, + "step": 7823 + }, + { + "epoch": 0.43062358963068964, + "grad_norm": 0.8974951505661011, + "learning_rate": 8.920290309474908e-06, + "loss": 0.7228, + "step": 7824 + }, + { + "epoch": 0.4306786284330453, + "grad_norm": 0.8176010847091675, + "learning_rate": 8.920021247438426e-06, + "loss": 0.8852, + "step": 7825 + }, + { + "epoch": 0.43073366723540096, + "grad_norm": 0.7591422200202942, + "learning_rate": 8.919752155940112e-06, + "loss": 0.8382, + "step": 7826 + }, + { + "epoch": 0.4307887060377566, + "grad_norm": 0.7089776396751404, + "learning_rate": 8.919483034981988e-06, + "loss": 0.7188, + "step": 7827 + }, + { + "epoch": 0.4308437448401123, + "grad_norm": 0.7328840494155884, + "learning_rate": 8.919213884566081e-06, + "loss": 0.7609, + "step": 7828 + }, + { + "epoch": 0.4308987836424679, + "grad_norm": 0.6473509669303894, + "learning_rate": 8.918944704694411e-06, + "loss": 0.7027, + "step": 7829 + }, + { + "epoch": 0.4309538224448236, + "grad_norm": 0.6585624814033508, + "learning_rate": 8.918675495369003e-06, + "loss": 0.7133, + "step": 7830 + }, + { + "epoch": 0.43100886124717924, + "grad_norm": 0.7232397794723511, + "learning_rate": 8.918406256591876e-06, + "loss": 0.7458, + "step": 7831 + }, + { + "epoch": 0.43106390004953493, + "grad_norm": 0.8752645254135132, + "learning_rate": 8.918136988365059e-06, + "loss": 0.671, + "step": 7832 + }, + { + "epoch": 0.43111893885189057, + "grad_norm": 0.7890885472297668, + "learning_rate": 8.917867690690573e-06, + "loss": 0.7674, + "step": 7833 + }, + { + "epoch": 0.43117397765424625, + "grad_norm": 0.6725128293037415, + "learning_rate": 8.917598363570441e-06, + "loss": 0.7373, + "step": 7834 + }, + { + "epoch": 0.4312290164566019, + "grad_norm": 0.808897852897644, + "learning_rate": 8.917329007006688e-06, + "loss": 0.8397, + "step": 7835 + }, + { + "epoch": 0.4312840552589576, + "grad_norm": 0.7268605828285217, + "learning_rate": 8.91705962100134e-06, + "loss": 0.7957, + "step": 7836 + }, + { + "epoch": 0.4313390940613132, + "grad_norm": 0.7336069345474243, + "learning_rate": 8.916790205556421e-06, + "loss": 0.746, + "step": 7837 + }, + { + "epoch": 0.4313941328636689, + "grad_norm": 0.7380902171134949, + "learning_rate": 8.916520760673955e-06, + "loss": 0.674, + "step": 7838 + }, + { + "epoch": 0.43144917166602453, + "grad_norm": 0.8041831851005554, + "learning_rate": 8.916251286355967e-06, + "loss": 0.8392, + "step": 7839 + }, + { + "epoch": 0.4315042104683802, + "grad_norm": 0.6745681166648865, + "learning_rate": 8.915981782604481e-06, + "loss": 0.7676, + "step": 7840 + }, + { + "epoch": 0.43155924927073586, + "grad_norm": 0.6572039127349854, + "learning_rate": 8.915712249421526e-06, + "loss": 0.7471, + "step": 7841 + }, + { + "epoch": 0.43161428807309155, + "grad_norm": 0.7250062227249146, + "learning_rate": 8.915442686809124e-06, + "loss": 0.8566, + "step": 7842 + }, + { + "epoch": 0.4316693268754472, + "grad_norm": 0.7008941769599915, + "learning_rate": 8.915173094769306e-06, + "loss": 0.7876, + "step": 7843 + }, + { + "epoch": 0.43172436567780287, + "grad_norm": 0.7078337073326111, + "learning_rate": 8.914903473304093e-06, + "loss": 0.756, + "step": 7844 + }, + { + "epoch": 0.4317794044801585, + "grad_norm": 0.7822949886322021, + "learning_rate": 8.914633822415513e-06, + "loss": 0.9423, + "step": 7845 + }, + { + "epoch": 0.4318344432825142, + "grad_norm": 0.6707580089569092, + "learning_rate": 8.914364142105593e-06, + "loss": 0.639, + "step": 7846 + }, + { + "epoch": 0.4318894820848698, + "grad_norm": 0.7868423461914062, + "learning_rate": 8.914094432376362e-06, + "loss": 0.7768, + "step": 7847 + }, + { + "epoch": 0.4319445208872255, + "grad_norm": 0.6147592067718506, + "learning_rate": 8.913824693229845e-06, + "loss": 0.6693, + "step": 7848 + }, + { + "epoch": 0.43199955968958115, + "grad_norm": 0.6901249885559082, + "learning_rate": 8.913554924668067e-06, + "loss": 0.7779, + "step": 7849 + }, + { + "epoch": 0.43205459849193684, + "grad_norm": 0.7062137126922607, + "learning_rate": 8.913285126693058e-06, + "loss": 0.7951, + "step": 7850 + }, + { + "epoch": 0.43210963729429247, + "grad_norm": 0.6363390684127808, + "learning_rate": 8.913015299306846e-06, + "loss": 0.6723, + "step": 7851 + }, + { + "epoch": 0.43216467609664816, + "grad_norm": 0.7168677449226379, + "learning_rate": 8.912745442511459e-06, + "loss": 0.7442, + "step": 7852 + }, + { + "epoch": 0.4322197148990038, + "grad_norm": 0.7347995042800903, + "learning_rate": 8.912475556308925e-06, + "loss": 0.8361, + "step": 7853 + }, + { + "epoch": 0.4322747537013595, + "grad_norm": 0.683777391910553, + "learning_rate": 8.91220564070127e-06, + "loss": 0.7583, + "step": 7854 + }, + { + "epoch": 0.4323297925037151, + "grad_norm": 0.7436330914497375, + "learning_rate": 8.911935695690527e-06, + "loss": 0.8414, + "step": 7855 + }, + { + "epoch": 0.4323848313060708, + "grad_norm": 0.7748109102249146, + "learning_rate": 8.911665721278721e-06, + "loss": 0.7812, + "step": 7856 + }, + { + "epoch": 0.43243987010842644, + "grad_norm": 0.7984411120414734, + "learning_rate": 8.911395717467883e-06, + "loss": 0.6845, + "step": 7857 + }, + { + "epoch": 0.4324949089107821, + "grad_norm": 0.680144727230072, + "learning_rate": 8.911125684260042e-06, + "loss": 0.7156, + "step": 7858 + }, + { + "epoch": 0.43254994771313776, + "grad_norm": 0.7738325595855713, + "learning_rate": 8.910855621657228e-06, + "loss": 0.7295, + "step": 7859 + }, + { + "epoch": 0.43260498651549345, + "grad_norm": 0.7276971340179443, + "learning_rate": 8.910585529661469e-06, + "loss": 0.7982, + "step": 7860 + }, + { + "epoch": 0.4326600253178491, + "grad_norm": 0.7655037641525269, + "learning_rate": 8.910315408274796e-06, + "loss": 0.8416, + "step": 7861 + }, + { + "epoch": 0.4327150641202047, + "grad_norm": 0.7220892906188965, + "learning_rate": 8.910045257499238e-06, + "loss": 0.8002, + "step": 7862 + }, + { + "epoch": 0.4327701029225604, + "grad_norm": 0.6255655884742737, + "learning_rate": 8.90977507733683e-06, + "loss": 0.6477, + "step": 7863 + }, + { + "epoch": 0.43282514172491604, + "grad_norm": 0.649472713470459, + "learning_rate": 8.909504867789594e-06, + "loss": 0.6838, + "step": 7864 + }, + { + "epoch": 0.43288018052727173, + "grad_norm": 0.6915234923362732, + "learning_rate": 8.909234628859568e-06, + "loss": 0.7146, + "step": 7865 + }, + { + "epoch": 0.43293521932962736, + "grad_norm": 0.7120145559310913, + "learning_rate": 8.908964360548783e-06, + "loss": 0.7782, + "step": 7866 + }, + { + "epoch": 0.43299025813198305, + "grad_norm": 0.8125410079956055, + "learning_rate": 8.908694062859267e-06, + "loss": 0.7514, + "step": 7867 + }, + { + "epoch": 0.4330452969343387, + "grad_norm": 0.6821436882019043, + "learning_rate": 8.908423735793053e-06, + "loss": 0.8074, + "step": 7868 + }, + { + "epoch": 0.4331003357366944, + "grad_norm": 0.8079590201377869, + "learning_rate": 8.908153379352171e-06, + "loss": 0.7932, + "step": 7869 + }, + { + "epoch": 0.43315537453905, + "grad_norm": 0.676013708114624, + "learning_rate": 8.907882993538655e-06, + "loss": 0.6611, + "step": 7870 + }, + { + "epoch": 0.4332104133414057, + "grad_norm": 0.706624448299408, + "learning_rate": 8.907612578354537e-06, + "loss": 0.8241, + "step": 7871 + }, + { + "epoch": 0.43326545214376133, + "grad_norm": 0.6533300876617432, + "learning_rate": 8.907342133801848e-06, + "loss": 0.6969, + "step": 7872 + }, + { + "epoch": 0.433320490946117, + "grad_norm": 0.6778282523155212, + "learning_rate": 8.907071659882622e-06, + "loss": 0.6877, + "step": 7873 + }, + { + "epoch": 0.43337552974847265, + "grad_norm": 0.7068879008293152, + "learning_rate": 8.906801156598892e-06, + "loss": 0.7912, + "step": 7874 + }, + { + "epoch": 0.43343056855082834, + "grad_norm": 0.6620263457298279, + "learning_rate": 8.90653062395269e-06, + "loss": 0.7317, + "step": 7875 + }, + { + "epoch": 0.433485607353184, + "grad_norm": 0.7084807753562927, + "learning_rate": 8.906260061946049e-06, + "loss": 0.7268, + "step": 7876 + }, + { + "epoch": 0.43354064615553967, + "grad_norm": 0.7899147272109985, + "learning_rate": 8.905989470581003e-06, + "loss": 0.8258, + "step": 7877 + }, + { + "epoch": 0.4335956849578953, + "grad_norm": 0.6657128930091858, + "learning_rate": 8.905718849859585e-06, + "loss": 0.6564, + "step": 7878 + }, + { + "epoch": 0.433650723760251, + "grad_norm": 0.8737723231315613, + "learning_rate": 8.905448199783831e-06, + "loss": 0.8646, + "step": 7879 + }, + { + "epoch": 0.4337057625626066, + "grad_norm": 0.7517673969268799, + "learning_rate": 8.905177520355775e-06, + "loss": 0.7658, + "step": 7880 + }, + { + "epoch": 0.4337608013649623, + "grad_norm": 0.6724270582199097, + "learning_rate": 8.904906811577447e-06, + "loss": 0.7509, + "step": 7881 + }, + { + "epoch": 0.43381584016731795, + "grad_norm": 0.6490511894226074, + "learning_rate": 8.904636073450885e-06, + "loss": 0.7282, + "step": 7882 + }, + { + "epoch": 0.43387087896967363, + "grad_norm": 0.73885178565979, + "learning_rate": 8.904365305978126e-06, + "loss": 0.7575, + "step": 7883 + }, + { + "epoch": 0.43392591777202927, + "grad_norm": 0.6823462843894958, + "learning_rate": 8.9040945091612e-06, + "loss": 0.7566, + "step": 7884 + }, + { + "epoch": 0.43398095657438496, + "grad_norm": 0.6705971956253052, + "learning_rate": 8.903823683002146e-06, + "loss": 0.7726, + "step": 7885 + }, + { + "epoch": 0.4340359953767406, + "grad_norm": 0.6898428201675415, + "learning_rate": 8.903552827502998e-06, + "loss": 0.7545, + "step": 7886 + }, + { + "epoch": 0.4340910341790963, + "grad_norm": 0.810357928276062, + "learning_rate": 8.90328194266579e-06, + "loss": 0.8883, + "step": 7887 + }, + { + "epoch": 0.4341460729814519, + "grad_norm": 0.6505162119865417, + "learning_rate": 8.903011028492563e-06, + "loss": 0.7205, + "step": 7888 + }, + { + "epoch": 0.4342011117838076, + "grad_norm": 0.8401693105697632, + "learning_rate": 8.902740084985348e-06, + "loss": 0.8105, + "step": 7889 + }, + { + "epoch": 0.43425615058616324, + "grad_norm": 0.7151880860328674, + "learning_rate": 8.902469112146183e-06, + "loss": 0.7748, + "step": 7890 + }, + { + "epoch": 0.4343111893885189, + "grad_norm": 0.7257007956504822, + "learning_rate": 8.902198109977107e-06, + "loss": 0.7818, + "step": 7891 + }, + { + "epoch": 0.43436622819087456, + "grad_norm": 0.786691427230835, + "learning_rate": 8.901927078480153e-06, + "loss": 0.8527, + "step": 7892 + }, + { + "epoch": 0.43442126699323025, + "grad_norm": 0.7420910596847534, + "learning_rate": 8.901656017657358e-06, + "loss": 0.7087, + "step": 7893 + }, + { + "epoch": 0.4344763057955859, + "grad_norm": 0.6713958978652954, + "learning_rate": 8.901384927510763e-06, + "loss": 0.7366, + "step": 7894 + }, + { + "epoch": 0.43453134459794157, + "grad_norm": 1.0276658535003662, + "learning_rate": 8.901113808042402e-06, + "loss": 0.7462, + "step": 7895 + }, + { + "epoch": 0.4345863834002972, + "grad_norm": 0.7207444906234741, + "learning_rate": 8.900842659254314e-06, + "loss": 0.6777, + "step": 7896 + }, + { + "epoch": 0.4346414222026529, + "grad_norm": 0.7581979036331177, + "learning_rate": 8.900571481148538e-06, + "loss": 0.8081, + "step": 7897 + }, + { + "epoch": 0.4346964610050085, + "grad_norm": 0.9224075675010681, + "learning_rate": 8.90030027372711e-06, + "loss": 0.892, + "step": 7898 + }, + { + "epoch": 0.4347514998073642, + "grad_norm": 0.6844260096549988, + "learning_rate": 8.900029036992069e-06, + "loss": 0.8063, + "step": 7899 + }, + { + "epoch": 0.43480653860971985, + "grad_norm": 0.7008691430091858, + "learning_rate": 8.899757770945453e-06, + "loss": 0.6998, + "step": 7900 + }, + { + "epoch": 0.43486157741207554, + "grad_norm": 0.7311949729919434, + "learning_rate": 8.899486475589303e-06, + "loss": 0.7724, + "step": 7901 + }, + { + "epoch": 0.4349166162144312, + "grad_norm": 0.7441468238830566, + "learning_rate": 8.899215150925656e-06, + "loss": 0.7728, + "step": 7902 + }, + { + "epoch": 0.43497165501678686, + "grad_norm": 0.7405179142951965, + "learning_rate": 8.89894379695655e-06, + "loss": 0.8267, + "step": 7903 + }, + { + "epoch": 0.4350266938191425, + "grad_norm": 0.6967620253562927, + "learning_rate": 8.898672413684029e-06, + "loss": 0.7284, + "step": 7904 + }, + { + "epoch": 0.43508173262149813, + "grad_norm": 0.8979219794273376, + "learning_rate": 8.898401001110127e-06, + "loss": 0.8267, + "step": 7905 + }, + { + "epoch": 0.4351367714238538, + "grad_norm": 0.7905356884002686, + "learning_rate": 8.898129559236888e-06, + "loss": 0.8011, + "step": 7906 + }, + { + "epoch": 0.43519181022620945, + "grad_norm": 0.6740859150886536, + "learning_rate": 8.897858088066351e-06, + "loss": 0.6597, + "step": 7907 + }, + { + "epoch": 0.43524684902856514, + "grad_norm": 0.7451572418212891, + "learning_rate": 8.897586587600555e-06, + "loss": 0.7466, + "step": 7908 + }, + { + "epoch": 0.4353018878309208, + "grad_norm": 0.7726565003395081, + "learning_rate": 8.897315057841542e-06, + "loss": 0.7873, + "step": 7909 + }, + { + "epoch": 0.43535692663327646, + "grad_norm": 0.8348171710968018, + "learning_rate": 8.897043498791354e-06, + "loss": 0.7583, + "step": 7910 + }, + { + "epoch": 0.4354119654356321, + "grad_norm": 0.6714087724685669, + "learning_rate": 8.896771910452027e-06, + "loss": 0.7909, + "step": 7911 + }, + { + "epoch": 0.4354670042379878, + "grad_norm": 0.7397969365119934, + "learning_rate": 8.896500292825607e-06, + "loss": 0.7734, + "step": 7912 + }, + { + "epoch": 0.4355220430403434, + "grad_norm": 0.6806391477584839, + "learning_rate": 8.896228645914133e-06, + "loss": 0.7898, + "step": 7913 + }, + { + "epoch": 0.4355770818426991, + "grad_norm": 0.7135224342346191, + "learning_rate": 8.89595696971965e-06, + "loss": 0.7453, + "step": 7914 + }, + { + "epoch": 0.43563212064505474, + "grad_norm": 0.8275992274284363, + "learning_rate": 8.895685264244195e-06, + "loss": 0.7326, + "step": 7915 + }, + { + "epoch": 0.43568715944741043, + "grad_norm": 0.7254159450531006, + "learning_rate": 8.895413529489813e-06, + "loss": 0.7523, + "step": 7916 + }, + { + "epoch": 0.43574219824976607, + "grad_norm": 0.8060647249221802, + "learning_rate": 8.895141765458546e-06, + "loss": 0.7878, + "step": 7917 + }, + { + "epoch": 0.43579723705212176, + "grad_norm": 0.7007316946983337, + "learning_rate": 8.894869972152435e-06, + "loss": 0.7837, + "step": 7918 + }, + { + "epoch": 0.4358522758544774, + "grad_norm": 0.6874841451644897, + "learning_rate": 8.894598149573524e-06, + "loss": 0.7773, + "step": 7919 + }, + { + "epoch": 0.4359073146568331, + "grad_norm": 0.7557696104049683, + "learning_rate": 8.894326297723856e-06, + "loss": 0.6905, + "step": 7920 + }, + { + "epoch": 0.4359623534591887, + "grad_norm": 0.7589512467384338, + "learning_rate": 8.894054416605475e-06, + "loss": 0.8292, + "step": 7921 + }, + { + "epoch": 0.4360173922615444, + "grad_norm": 0.9062818884849548, + "learning_rate": 8.893782506220424e-06, + "loss": 0.9149, + "step": 7922 + }, + { + "epoch": 0.43607243106390003, + "grad_norm": 0.7553420662879944, + "learning_rate": 8.893510566570744e-06, + "loss": 0.7256, + "step": 7923 + }, + { + "epoch": 0.4361274698662557, + "grad_norm": 0.7130489349365234, + "learning_rate": 8.89323859765848e-06, + "loss": 0.7375, + "step": 7924 + }, + { + "epoch": 0.43618250866861136, + "grad_norm": 0.6234793066978455, + "learning_rate": 8.89296659948568e-06, + "loss": 0.716, + "step": 7925 + }, + { + "epoch": 0.43623754747096705, + "grad_norm": 0.7527539134025574, + "learning_rate": 8.892694572054383e-06, + "loss": 0.7884, + "step": 7926 + }, + { + "epoch": 0.4362925862733227, + "grad_norm": 0.7677647471427917, + "learning_rate": 8.892422515366636e-06, + "loss": 0.7136, + "step": 7927 + }, + { + "epoch": 0.43634762507567837, + "grad_norm": 0.7212143540382385, + "learning_rate": 8.892150429424484e-06, + "loss": 0.8113, + "step": 7928 + }, + { + "epoch": 0.436402663878034, + "grad_norm": 0.6735568046569824, + "learning_rate": 8.89187831422997e-06, + "loss": 0.6472, + "step": 7929 + }, + { + "epoch": 0.4364577026803897, + "grad_norm": 0.7120702862739563, + "learning_rate": 8.891606169785141e-06, + "loss": 0.8032, + "step": 7930 + }, + { + "epoch": 0.4365127414827453, + "grad_norm": 0.679499089717865, + "learning_rate": 8.891333996092041e-06, + "loss": 0.7366, + "step": 7931 + }, + { + "epoch": 0.436567780285101, + "grad_norm": 0.7774114012718201, + "learning_rate": 8.891061793152718e-06, + "loss": 0.7917, + "step": 7932 + }, + { + "epoch": 0.43662281908745665, + "grad_norm": 0.6951174139976501, + "learning_rate": 8.890789560969216e-06, + "loss": 0.7518, + "step": 7933 + }, + { + "epoch": 0.43667785788981234, + "grad_norm": 0.7645227909088135, + "learning_rate": 8.89051729954358e-06, + "loss": 0.7787, + "step": 7934 + }, + { + "epoch": 0.43673289669216797, + "grad_norm": 0.7127084732055664, + "learning_rate": 8.890245008877857e-06, + "loss": 0.8137, + "step": 7935 + }, + { + "epoch": 0.43678793549452366, + "grad_norm": 0.7541413903236389, + "learning_rate": 8.889972688974095e-06, + "loss": 0.776, + "step": 7936 + }, + { + "epoch": 0.4368429742968793, + "grad_norm": 0.690963625907898, + "learning_rate": 8.889700339834339e-06, + "loss": 0.7691, + "step": 7937 + }, + { + "epoch": 0.436898013099235, + "grad_norm": 0.750221848487854, + "learning_rate": 8.889427961460636e-06, + "loss": 0.7831, + "step": 7938 + }, + { + "epoch": 0.4369530519015906, + "grad_norm": 0.7255545854568481, + "learning_rate": 8.889155553855035e-06, + "loss": 0.7831, + "step": 7939 + }, + { + "epoch": 0.4370080907039463, + "grad_norm": 0.7187026143074036, + "learning_rate": 8.88888311701958e-06, + "loss": 0.792, + "step": 7940 + }, + { + "epoch": 0.43706312950630194, + "grad_norm": 0.8313350081443787, + "learning_rate": 8.888610650956322e-06, + "loss": 0.706, + "step": 7941 + }, + { + "epoch": 0.43711816830865763, + "grad_norm": 0.8083454370498657, + "learning_rate": 8.888338155667307e-06, + "loss": 0.7857, + "step": 7942 + }, + { + "epoch": 0.43717320711101326, + "grad_norm": 0.8200840353965759, + "learning_rate": 8.888065631154583e-06, + "loss": 0.8601, + "step": 7943 + }, + { + "epoch": 0.43722824591336895, + "grad_norm": 0.7503816485404968, + "learning_rate": 8.887793077420198e-06, + "loss": 0.7744, + "step": 7944 + }, + { + "epoch": 0.4372832847157246, + "grad_norm": 0.7466493248939514, + "learning_rate": 8.887520494466202e-06, + "loss": 0.7818, + "step": 7945 + }, + { + "epoch": 0.4373383235180803, + "grad_norm": 0.728118360042572, + "learning_rate": 8.887247882294641e-06, + "loss": 0.7157, + "step": 7946 + }, + { + "epoch": 0.4373933623204359, + "grad_norm": 0.9199670553207397, + "learning_rate": 8.886975240907568e-06, + "loss": 0.8283, + "step": 7947 + }, + { + "epoch": 0.43744840112279154, + "grad_norm": 0.735584557056427, + "learning_rate": 8.886702570307027e-06, + "loss": 0.6588, + "step": 7948 + }, + { + "epoch": 0.43750343992514723, + "grad_norm": 0.8619036674499512, + "learning_rate": 8.886429870495072e-06, + "loss": 0.7269, + "step": 7949 + }, + { + "epoch": 0.43755847872750286, + "grad_norm": 0.7304830551147461, + "learning_rate": 8.886157141473747e-06, + "loss": 0.6725, + "step": 7950 + }, + { + "epoch": 0.43761351752985855, + "grad_norm": 0.7669086456298828, + "learning_rate": 8.885884383245109e-06, + "loss": 0.6957, + "step": 7951 + }, + { + "epoch": 0.4376685563322142, + "grad_norm": 0.7558299899101257, + "learning_rate": 8.885611595811203e-06, + "loss": 0.8159, + "step": 7952 + }, + { + "epoch": 0.4377235951345699, + "grad_norm": 0.7661786079406738, + "learning_rate": 8.88533877917408e-06, + "loss": 0.764, + "step": 7953 + }, + { + "epoch": 0.4377786339369255, + "grad_norm": 0.7461101412773132, + "learning_rate": 8.88506593333579e-06, + "loss": 0.7544, + "step": 7954 + }, + { + "epoch": 0.4378336727392812, + "grad_norm": 0.7989180088043213, + "learning_rate": 8.884793058298387e-06, + "loss": 0.6913, + "step": 7955 + }, + { + "epoch": 0.43788871154163683, + "grad_norm": 0.7964022755622864, + "learning_rate": 8.884520154063917e-06, + "loss": 0.7339, + "step": 7956 + }, + { + "epoch": 0.4379437503439925, + "grad_norm": 0.7278034687042236, + "learning_rate": 8.884247220634433e-06, + "loss": 0.8477, + "step": 7957 + }, + { + "epoch": 0.43799878914634816, + "grad_norm": 0.7294753789901733, + "learning_rate": 8.883974258011988e-06, + "loss": 0.8412, + "step": 7958 + }, + { + "epoch": 0.43805382794870384, + "grad_norm": 0.665734589099884, + "learning_rate": 8.88370126619863e-06, + "loss": 0.7838, + "step": 7959 + }, + { + "epoch": 0.4381088667510595, + "grad_norm": 0.6984216570854187, + "learning_rate": 8.883428245196414e-06, + "loss": 0.7657, + "step": 7960 + }, + { + "epoch": 0.43816390555341517, + "grad_norm": 0.8048402070999146, + "learning_rate": 8.883155195007393e-06, + "loss": 0.7553, + "step": 7961 + }, + { + "epoch": 0.4382189443557708, + "grad_norm": 0.7145794630050659, + "learning_rate": 8.882882115633616e-06, + "loss": 0.6583, + "step": 7962 + }, + { + "epoch": 0.4382739831581265, + "grad_norm": 0.7073546648025513, + "learning_rate": 8.882609007077135e-06, + "loss": 0.7869, + "step": 7963 + }, + { + "epoch": 0.4383290219604821, + "grad_norm": 0.8300859928131104, + "learning_rate": 8.882335869340004e-06, + "loss": 0.773, + "step": 7964 + }, + { + "epoch": 0.4383840607628378, + "grad_norm": 0.8343188762664795, + "learning_rate": 8.882062702424276e-06, + "loss": 0.6743, + "step": 7965 + }, + { + "epoch": 0.43843909956519345, + "grad_norm": 0.7106530666351318, + "learning_rate": 8.881789506332007e-06, + "loss": 0.7414, + "step": 7966 + }, + { + "epoch": 0.43849413836754914, + "grad_norm": 0.7015630602836609, + "learning_rate": 8.881516281065244e-06, + "loss": 0.7434, + "step": 7967 + }, + { + "epoch": 0.43854917716990477, + "grad_norm": 0.8106673955917358, + "learning_rate": 8.881243026626044e-06, + "loss": 0.7741, + "step": 7968 + }, + { + "epoch": 0.43860421597226046, + "grad_norm": 0.8181495070457458, + "learning_rate": 8.88096974301646e-06, + "loss": 0.8046, + "step": 7969 + }, + { + "epoch": 0.4386592547746161, + "grad_norm": 0.7767857313156128, + "learning_rate": 8.880696430238546e-06, + "loss": 0.8586, + "step": 7970 + }, + { + "epoch": 0.4387142935769718, + "grad_norm": 0.7257522940635681, + "learning_rate": 8.880423088294359e-06, + "loss": 0.7799, + "step": 7971 + }, + { + "epoch": 0.4387693323793274, + "grad_norm": 0.6896021366119385, + "learning_rate": 8.880149717185948e-06, + "loss": 0.8178, + "step": 7972 + }, + { + "epoch": 0.4388243711816831, + "grad_norm": 0.7646406292915344, + "learning_rate": 8.879876316915372e-06, + "loss": 0.8754, + "step": 7973 + }, + { + "epoch": 0.43887940998403874, + "grad_norm": 0.8043848872184753, + "learning_rate": 8.879602887484684e-06, + "loss": 0.8562, + "step": 7974 + }, + { + "epoch": 0.4389344487863944, + "grad_norm": 0.6727305054664612, + "learning_rate": 8.879329428895937e-06, + "loss": 0.6168, + "step": 7975 + }, + { + "epoch": 0.43898948758875006, + "grad_norm": 0.7634731531143188, + "learning_rate": 8.87905594115119e-06, + "loss": 0.857, + "step": 7976 + }, + { + "epoch": 0.43904452639110575, + "grad_norm": 0.6544492244720459, + "learning_rate": 8.878782424252497e-06, + "loss": 0.6302, + "step": 7977 + }, + { + "epoch": 0.4390995651934614, + "grad_norm": 0.8126636743545532, + "learning_rate": 8.878508878201915e-06, + "loss": 0.7823, + "step": 7978 + }, + { + "epoch": 0.43915460399581707, + "grad_norm": 0.7235779166221619, + "learning_rate": 8.878235303001497e-06, + "loss": 0.7527, + "step": 7979 + }, + { + "epoch": 0.4392096427981727, + "grad_norm": 0.6961055397987366, + "learning_rate": 8.8779616986533e-06, + "loss": 0.7383, + "step": 7980 + }, + { + "epoch": 0.4392646816005284, + "grad_norm": 0.7684490084648132, + "learning_rate": 8.877688065159382e-06, + "loss": 0.8009, + "step": 7981 + }, + { + "epoch": 0.43931972040288403, + "grad_norm": 0.7897803783416748, + "learning_rate": 8.877414402521797e-06, + "loss": 0.7561, + "step": 7982 + }, + { + "epoch": 0.4393747592052397, + "grad_norm": 0.7877688407897949, + "learning_rate": 8.877140710742606e-06, + "loss": 0.7949, + "step": 7983 + }, + { + "epoch": 0.43942979800759535, + "grad_norm": 0.8341611623764038, + "learning_rate": 8.876866989823862e-06, + "loss": 0.7585, + "step": 7984 + }, + { + "epoch": 0.43948483680995104, + "grad_norm": 0.7663636207580566, + "learning_rate": 8.876593239767622e-06, + "loss": 0.771, + "step": 7985 + }, + { + "epoch": 0.4395398756123067, + "grad_norm": 0.6824129223823547, + "learning_rate": 8.876319460575946e-06, + "loss": 0.7852, + "step": 7986 + }, + { + "epoch": 0.43959491441466236, + "grad_norm": 0.6533854007720947, + "learning_rate": 8.876045652250891e-06, + "loss": 0.723, + "step": 7987 + }, + { + "epoch": 0.439649953217018, + "grad_norm": 0.7174259424209595, + "learning_rate": 8.875771814794515e-06, + "loss": 0.749, + "step": 7988 + }, + { + "epoch": 0.4397049920193737, + "grad_norm": 0.8585928678512573, + "learning_rate": 8.875497948208875e-06, + "loss": 0.6727, + "step": 7989 + }, + { + "epoch": 0.4397600308217293, + "grad_norm": 0.7558062672615051, + "learning_rate": 8.875224052496029e-06, + "loss": 0.7929, + "step": 7990 + }, + { + "epoch": 0.43981506962408495, + "grad_norm": 0.7063853144645691, + "learning_rate": 8.874950127658037e-06, + "loss": 0.7397, + "step": 7991 + }, + { + "epoch": 0.43987010842644064, + "grad_norm": 0.7165526747703552, + "learning_rate": 8.874676173696956e-06, + "loss": 0.7678, + "step": 7992 + }, + { + "epoch": 0.4399251472287963, + "grad_norm": 0.7657830715179443, + "learning_rate": 8.874402190614847e-06, + "loss": 0.8318, + "step": 7993 + }, + { + "epoch": 0.43998018603115197, + "grad_norm": 0.7776834964752197, + "learning_rate": 8.874128178413769e-06, + "loss": 0.8589, + "step": 7994 + }, + { + "epoch": 0.4400352248335076, + "grad_norm": 0.6805633306503296, + "learning_rate": 8.873854137095778e-06, + "loss": 0.7009, + "step": 7995 + }, + { + "epoch": 0.4400902636358633, + "grad_norm": 0.6962490677833557, + "learning_rate": 8.87358006666294e-06, + "loss": 0.7896, + "step": 7996 + }, + { + "epoch": 0.4401453024382189, + "grad_norm": 0.611610472202301, + "learning_rate": 8.873305967117307e-06, + "loss": 0.5993, + "step": 7997 + }, + { + "epoch": 0.4402003412405746, + "grad_norm": 0.7442964911460876, + "learning_rate": 8.873031838460946e-06, + "loss": 0.8277, + "step": 7998 + }, + { + "epoch": 0.44025538004293024, + "grad_norm": 0.6858734488487244, + "learning_rate": 8.872757680695914e-06, + "loss": 0.8064, + "step": 7999 + }, + { + "epoch": 0.44031041884528593, + "grad_norm": 0.6654849052429199, + "learning_rate": 8.872483493824273e-06, + "loss": 0.7408, + "step": 8000 + }, + { + "epoch": 0.44036545764764157, + "grad_norm": 0.8241575956344604, + "learning_rate": 8.87220927784808e-06, + "loss": 0.8819, + "step": 8001 + }, + { + "epoch": 0.44042049644999726, + "grad_norm": 0.7078573107719421, + "learning_rate": 8.8719350327694e-06, + "loss": 0.7709, + "step": 8002 + }, + { + "epoch": 0.4404755352523529, + "grad_norm": 0.7369210720062256, + "learning_rate": 8.871660758590292e-06, + "loss": 0.7867, + "step": 8003 + }, + { + "epoch": 0.4405305740547086, + "grad_norm": 0.7206673622131348, + "learning_rate": 8.87138645531282e-06, + "loss": 0.8697, + "step": 8004 + }, + { + "epoch": 0.4405856128570642, + "grad_norm": 0.8370183706283569, + "learning_rate": 8.871112122939041e-06, + "loss": 0.7201, + "step": 8005 + }, + { + "epoch": 0.4406406516594199, + "grad_norm": 0.8015196323394775, + "learning_rate": 8.870837761471023e-06, + "loss": 0.774, + "step": 8006 + }, + { + "epoch": 0.44069569046177554, + "grad_norm": 0.730185329914093, + "learning_rate": 8.870563370910821e-06, + "loss": 0.7153, + "step": 8007 + }, + { + "epoch": 0.4407507292641312, + "grad_norm": 0.6719930768013, + "learning_rate": 8.870288951260503e-06, + "loss": 0.7949, + "step": 8008 + }, + { + "epoch": 0.44080576806648686, + "grad_norm": 0.7614291906356812, + "learning_rate": 8.870014502522128e-06, + "loss": 0.7143, + "step": 8009 + }, + { + "epoch": 0.44086080686884255, + "grad_norm": 0.7438056468963623, + "learning_rate": 8.86974002469776e-06, + "loss": 0.6859, + "step": 8010 + }, + { + "epoch": 0.4409158456711982, + "grad_norm": 0.759903073310852, + "learning_rate": 8.869465517789463e-06, + "loss": 0.8095, + "step": 8011 + }, + { + "epoch": 0.44097088447355387, + "grad_norm": 0.7622823119163513, + "learning_rate": 8.869190981799298e-06, + "loss": 0.786, + "step": 8012 + }, + { + "epoch": 0.4410259232759095, + "grad_norm": 0.677003800868988, + "learning_rate": 8.86891641672933e-06, + "loss": 0.7074, + "step": 8013 + }, + { + "epoch": 0.4410809620782652, + "grad_norm": 0.9258451461791992, + "learning_rate": 8.86864182258162e-06, + "loss": 0.7218, + "step": 8014 + }, + { + "epoch": 0.4411360008806208, + "grad_norm": 0.7027828693389893, + "learning_rate": 8.868367199358236e-06, + "loss": 0.7654, + "step": 8015 + }, + { + "epoch": 0.4411910396829765, + "grad_norm": 0.8279967308044434, + "learning_rate": 8.868092547061239e-06, + "loss": 0.8969, + "step": 8016 + }, + { + "epoch": 0.44124607848533215, + "grad_norm": 0.7366079688072205, + "learning_rate": 8.867817865692693e-06, + "loss": 0.8421, + "step": 8017 + }, + { + "epoch": 0.44130111728768784, + "grad_norm": 0.7548787593841553, + "learning_rate": 8.867543155254665e-06, + "loss": 0.79, + "step": 8018 + }, + { + "epoch": 0.44135615609004347, + "grad_norm": 0.7558487057685852, + "learning_rate": 8.867268415749215e-06, + "loss": 0.8461, + "step": 8019 + }, + { + "epoch": 0.44141119489239916, + "grad_norm": 0.6413403153419495, + "learning_rate": 8.866993647178413e-06, + "loss": 0.6811, + "step": 8020 + }, + { + "epoch": 0.4414662336947548, + "grad_norm": 0.9251089692115784, + "learning_rate": 8.86671884954432e-06, + "loss": 0.868, + "step": 8021 + }, + { + "epoch": 0.4415212724971105, + "grad_norm": 0.7920099496841431, + "learning_rate": 8.866444022849006e-06, + "loss": 0.8131, + "step": 8022 + }, + { + "epoch": 0.4415763112994661, + "grad_norm": 0.8738380670547485, + "learning_rate": 8.866169167094532e-06, + "loss": 0.857, + "step": 8023 + }, + { + "epoch": 0.4416313501018218, + "grad_norm": 0.7181336283683777, + "learning_rate": 8.865894282282965e-06, + "loss": 0.7869, + "step": 8024 + }, + { + "epoch": 0.44168638890417744, + "grad_norm": 0.8003776669502258, + "learning_rate": 8.865619368416373e-06, + "loss": 0.8874, + "step": 8025 + }, + { + "epoch": 0.44174142770653313, + "grad_norm": 0.7186623215675354, + "learning_rate": 8.86534442549682e-06, + "loss": 0.7931, + "step": 8026 + }, + { + "epoch": 0.44179646650888876, + "grad_norm": 0.7006831765174866, + "learning_rate": 8.865069453526371e-06, + "loss": 0.7046, + "step": 8027 + }, + { + "epoch": 0.44185150531124445, + "grad_norm": 0.7394786477088928, + "learning_rate": 8.864794452507097e-06, + "loss": 0.685, + "step": 8028 + }, + { + "epoch": 0.4419065441136001, + "grad_norm": 0.7512097358703613, + "learning_rate": 8.864519422441062e-06, + "loss": 0.8047, + "step": 8029 + }, + { + "epoch": 0.4419615829159558, + "grad_norm": 0.6866902709007263, + "learning_rate": 8.864244363330333e-06, + "loss": 0.7099, + "step": 8030 + }, + { + "epoch": 0.4420166217183114, + "grad_norm": 0.7316723465919495, + "learning_rate": 8.863969275176978e-06, + "loss": 0.7767, + "step": 8031 + }, + { + "epoch": 0.4420716605206671, + "grad_norm": 0.7103593349456787, + "learning_rate": 8.863694157983064e-06, + "loss": 0.7832, + "step": 8032 + }, + { + "epoch": 0.44212669932302273, + "grad_norm": 0.6922749876976013, + "learning_rate": 8.863419011750659e-06, + "loss": 0.7833, + "step": 8033 + }, + { + "epoch": 0.44218173812537837, + "grad_norm": 0.7989425659179688, + "learning_rate": 8.863143836481831e-06, + "loss": 0.8651, + "step": 8034 + }, + { + "epoch": 0.44223677692773405, + "grad_norm": 0.6765440702438354, + "learning_rate": 8.862868632178648e-06, + "loss": 0.7858, + "step": 8035 + }, + { + "epoch": 0.4422918157300897, + "grad_norm": 0.670767068862915, + "learning_rate": 8.862593398843178e-06, + "loss": 0.6789, + "step": 8036 + }, + { + "epoch": 0.4423468545324454, + "grad_norm": 0.7556853294372559, + "learning_rate": 8.86231813647749e-06, + "loss": 0.8036, + "step": 8037 + }, + { + "epoch": 0.442401893334801, + "grad_norm": 0.788690984249115, + "learning_rate": 8.862042845083654e-06, + "loss": 0.8355, + "step": 8038 + }, + { + "epoch": 0.4424569321371567, + "grad_norm": 0.8439056873321533, + "learning_rate": 8.861767524663736e-06, + "loss": 0.7327, + "step": 8039 + }, + { + "epoch": 0.44251197093951233, + "grad_norm": 0.7101821899414062, + "learning_rate": 8.861492175219808e-06, + "loss": 0.8303, + "step": 8040 + }, + { + "epoch": 0.442567009741868, + "grad_norm": 0.741680383682251, + "learning_rate": 8.861216796753937e-06, + "loss": 0.7377, + "step": 8041 + }, + { + "epoch": 0.44262204854422366, + "grad_norm": 0.7588099837303162, + "learning_rate": 8.860941389268196e-06, + "loss": 0.8217, + "step": 8042 + }, + { + "epoch": 0.44267708734657935, + "grad_norm": 0.7654829025268555, + "learning_rate": 8.860665952764654e-06, + "loss": 0.8416, + "step": 8043 + }, + { + "epoch": 0.442732126148935, + "grad_norm": 0.7025987505912781, + "learning_rate": 8.860390487245378e-06, + "loss": 0.7312, + "step": 8044 + }, + { + "epoch": 0.44278716495129067, + "grad_norm": 0.7206251621246338, + "learning_rate": 8.860114992712441e-06, + "loss": 0.7522, + "step": 8045 + }, + { + "epoch": 0.4428422037536463, + "grad_norm": 0.7041749954223633, + "learning_rate": 8.859839469167912e-06, + "loss": 0.746, + "step": 8046 + }, + { + "epoch": 0.442897242556002, + "grad_norm": 0.6941862106323242, + "learning_rate": 8.859563916613864e-06, + "loss": 0.7692, + "step": 8047 + }, + { + "epoch": 0.4429522813583576, + "grad_norm": 0.6897740364074707, + "learning_rate": 8.859288335052367e-06, + "loss": 0.7963, + "step": 8048 + }, + { + "epoch": 0.4430073201607133, + "grad_norm": 0.6744545698165894, + "learning_rate": 8.859012724485492e-06, + "loss": 0.7647, + "step": 8049 + }, + { + "epoch": 0.44306235896306895, + "grad_norm": 0.7899364829063416, + "learning_rate": 8.858737084915309e-06, + "loss": 0.8373, + "step": 8050 + }, + { + "epoch": 0.44311739776542464, + "grad_norm": 0.806016743183136, + "learning_rate": 8.85846141634389e-06, + "loss": 0.7871, + "step": 8051 + }, + { + "epoch": 0.44317243656778027, + "grad_norm": 0.7444993257522583, + "learning_rate": 8.85818571877331e-06, + "loss": 0.8099, + "step": 8052 + }, + { + "epoch": 0.44322747537013596, + "grad_norm": 0.772735059261322, + "learning_rate": 8.85790999220564e-06, + "loss": 0.7113, + "step": 8053 + }, + { + "epoch": 0.4432825141724916, + "grad_norm": 0.7743984460830688, + "learning_rate": 8.85763423664295e-06, + "loss": 0.8935, + "step": 8054 + }, + { + "epoch": 0.4433375529748473, + "grad_norm": 0.6751214265823364, + "learning_rate": 8.857358452087313e-06, + "loss": 0.6769, + "step": 8055 + }, + { + "epoch": 0.4433925917772029, + "grad_norm": 0.6921005845069885, + "learning_rate": 8.857082638540803e-06, + "loss": 0.7071, + "step": 8056 + }, + { + "epoch": 0.4434476305795586, + "grad_norm": 0.7884092330932617, + "learning_rate": 8.856806796005491e-06, + "loss": 0.7919, + "step": 8057 + }, + { + "epoch": 0.44350266938191424, + "grad_norm": 0.6522679924964905, + "learning_rate": 8.856530924483452e-06, + "loss": 0.7449, + "step": 8058 + }, + { + "epoch": 0.4435577081842699, + "grad_norm": 0.7172590494155884, + "learning_rate": 8.85625502397676e-06, + "loss": 0.7306, + "step": 8059 + }, + { + "epoch": 0.44361274698662556, + "grad_norm": 0.698658287525177, + "learning_rate": 8.855979094487488e-06, + "loss": 0.803, + "step": 8060 + }, + { + "epoch": 0.44366778578898125, + "grad_norm": 0.685589075088501, + "learning_rate": 8.855703136017708e-06, + "loss": 0.763, + "step": 8061 + }, + { + "epoch": 0.4437228245913369, + "grad_norm": 0.8259774446487427, + "learning_rate": 8.855427148569495e-06, + "loss": 0.811, + "step": 8062 + }, + { + "epoch": 0.4437778633936926, + "grad_norm": 0.6976660490036011, + "learning_rate": 8.855151132144926e-06, + "loss": 0.7345, + "step": 8063 + }, + { + "epoch": 0.4438329021960482, + "grad_norm": 0.7696738243103027, + "learning_rate": 8.854875086746071e-06, + "loss": 0.823, + "step": 8064 + }, + { + "epoch": 0.4438879409984039, + "grad_norm": 0.6627930998802185, + "learning_rate": 8.854599012375006e-06, + "loss": 0.7455, + "step": 8065 + }, + { + "epoch": 0.44394297980075953, + "grad_norm": 0.7492700815200806, + "learning_rate": 8.854322909033809e-06, + "loss": 0.8195, + "step": 8066 + }, + { + "epoch": 0.4439980186031152, + "grad_norm": 0.8335888981819153, + "learning_rate": 8.85404677672455e-06, + "loss": 0.7683, + "step": 8067 + }, + { + "epoch": 0.44405305740547085, + "grad_norm": 0.7448242902755737, + "learning_rate": 8.853770615449309e-06, + "loss": 0.8352, + "step": 8068 + }, + { + "epoch": 0.44410809620782654, + "grad_norm": 0.700616180896759, + "learning_rate": 8.853494425210158e-06, + "loss": 0.7892, + "step": 8069 + }, + { + "epoch": 0.4441631350101822, + "grad_norm": 0.6959284543991089, + "learning_rate": 8.853218206009176e-06, + "loss": 0.6944, + "step": 8070 + }, + { + "epoch": 0.44421817381253786, + "grad_norm": 0.7507375478744507, + "learning_rate": 8.852941957848438e-06, + "loss": 0.8921, + "step": 8071 + }, + { + "epoch": 0.4442732126148935, + "grad_norm": 0.7843918204307556, + "learning_rate": 8.852665680730019e-06, + "loss": 0.816, + "step": 8072 + }, + { + "epoch": 0.4443282514172492, + "grad_norm": 0.8702702522277832, + "learning_rate": 8.852389374655995e-06, + "loss": 0.8191, + "step": 8073 + }, + { + "epoch": 0.4443832902196048, + "grad_norm": 0.6784317493438721, + "learning_rate": 8.852113039628445e-06, + "loss": 0.7726, + "step": 8074 + }, + { + "epoch": 0.4444383290219605, + "grad_norm": 0.724530041217804, + "learning_rate": 8.851836675649443e-06, + "loss": 0.8214, + "step": 8075 + }, + { + "epoch": 0.44449336782431614, + "grad_norm": 0.9814287424087524, + "learning_rate": 8.851560282721067e-06, + "loss": 0.8368, + "step": 8076 + }, + { + "epoch": 0.4445484066266718, + "grad_norm": 0.6606815457344055, + "learning_rate": 8.851283860845398e-06, + "loss": 0.7772, + "step": 8077 + }, + { + "epoch": 0.44460344542902747, + "grad_norm": 0.6910951137542725, + "learning_rate": 8.851007410024507e-06, + "loss": 0.7007, + "step": 8078 + }, + { + "epoch": 0.4446584842313831, + "grad_norm": 0.6764300465583801, + "learning_rate": 8.850730930260479e-06, + "loss": 0.7265, + "step": 8079 + }, + { + "epoch": 0.4447135230337388, + "grad_norm": 0.669622004032135, + "learning_rate": 8.850454421555386e-06, + "loss": 0.7551, + "step": 8080 + }, + { + "epoch": 0.4447685618360944, + "grad_norm": 0.7068240642547607, + "learning_rate": 8.850177883911307e-06, + "loss": 0.8358, + "step": 8081 + }, + { + "epoch": 0.4448236006384501, + "grad_norm": 0.7100360989570618, + "learning_rate": 8.849901317330324e-06, + "loss": 0.7074, + "step": 8082 + }, + { + "epoch": 0.44487863944080575, + "grad_norm": 0.7510328888893127, + "learning_rate": 8.849624721814511e-06, + "loss": 0.6654, + "step": 8083 + }, + { + "epoch": 0.44493367824316143, + "grad_norm": 0.8106432557106018, + "learning_rate": 8.849348097365951e-06, + "loss": 0.6944, + "step": 8084 + }, + { + "epoch": 0.44498871704551707, + "grad_norm": 0.6852346062660217, + "learning_rate": 8.84907144398672e-06, + "loss": 0.7203, + "step": 8085 + }, + { + "epoch": 0.44504375584787276, + "grad_norm": 0.8495593667030334, + "learning_rate": 8.848794761678898e-06, + "loss": 0.7918, + "step": 8086 + }, + { + "epoch": 0.4450987946502284, + "grad_norm": 0.7110981941223145, + "learning_rate": 8.848518050444565e-06, + "loss": 0.8176, + "step": 8087 + }, + { + "epoch": 0.4451538334525841, + "grad_norm": 0.7740922570228577, + "learning_rate": 8.8482413102858e-06, + "loss": 0.7573, + "step": 8088 + }, + { + "epoch": 0.4452088722549397, + "grad_norm": 0.9645134806632996, + "learning_rate": 8.847964541204685e-06, + "loss": 0.7842, + "step": 8089 + }, + { + "epoch": 0.4452639110572954, + "grad_norm": 0.767621636390686, + "learning_rate": 8.847687743203299e-06, + "loss": 0.8182, + "step": 8090 + }, + { + "epoch": 0.44531894985965104, + "grad_norm": 0.6842975616455078, + "learning_rate": 8.84741091628372e-06, + "loss": 0.7795, + "step": 8091 + }, + { + "epoch": 0.4453739886620067, + "grad_norm": 0.768644392490387, + "learning_rate": 8.847134060448032e-06, + "loss": 0.7363, + "step": 8092 + }, + { + "epoch": 0.44542902746436236, + "grad_norm": 0.6813824772834778, + "learning_rate": 8.846857175698314e-06, + "loss": 0.7601, + "step": 8093 + }, + { + "epoch": 0.44548406626671805, + "grad_norm": 0.8608306646347046, + "learning_rate": 8.846580262036645e-06, + "loss": 0.8205, + "step": 8094 + }, + { + "epoch": 0.4455391050690737, + "grad_norm": 0.6917694807052612, + "learning_rate": 8.84630331946511e-06, + "loss": 0.7207, + "step": 8095 + }, + { + "epoch": 0.44559414387142937, + "grad_norm": 0.6777203679084778, + "learning_rate": 8.84602634798579e-06, + "loss": 0.6939, + "step": 8096 + }, + { + "epoch": 0.445649182673785, + "grad_norm": 0.7249894142150879, + "learning_rate": 8.845749347600764e-06, + "loss": 0.7918, + "step": 8097 + }, + { + "epoch": 0.4457042214761407, + "grad_norm": 0.7446995973587036, + "learning_rate": 8.845472318312116e-06, + "loss": 0.7379, + "step": 8098 + }, + { + "epoch": 0.4457592602784963, + "grad_norm": 0.8245479464530945, + "learning_rate": 8.845195260121927e-06, + "loss": 0.8532, + "step": 8099 + }, + { + "epoch": 0.445814299080852, + "grad_norm": 0.7160329818725586, + "learning_rate": 8.84491817303228e-06, + "loss": 0.7042, + "step": 8100 + }, + { + "epoch": 0.44586933788320765, + "grad_norm": 0.8056026101112366, + "learning_rate": 8.844641057045257e-06, + "loss": 0.8581, + "step": 8101 + }, + { + "epoch": 0.44592437668556334, + "grad_norm": 0.7257886528968811, + "learning_rate": 8.84436391216294e-06, + "loss": 0.7297, + "step": 8102 + }, + { + "epoch": 0.445979415487919, + "grad_norm": 0.7400404810905457, + "learning_rate": 8.844086738387415e-06, + "loss": 0.7703, + "step": 8103 + }, + { + "epoch": 0.44603445429027466, + "grad_norm": 0.665271520614624, + "learning_rate": 8.843809535720763e-06, + "loss": 0.7769, + "step": 8104 + }, + { + "epoch": 0.4460894930926303, + "grad_norm": 0.7041043639183044, + "learning_rate": 8.843532304165066e-06, + "loss": 0.7995, + "step": 8105 + }, + { + "epoch": 0.446144531894986, + "grad_norm": 0.8517841100692749, + "learning_rate": 8.84325504372241e-06, + "loss": 0.8239, + "step": 8106 + }, + { + "epoch": 0.4461995706973416, + "grad_norm": 0.7045741677284241, + "learning_rate": 8.842977754394877e-06, + "loss": 0.7982, + "step": 8107 + }, + { + "epoch": 0.4462546094996973, + "grad_norm": 0.7056185007095337, + "learning_rate": 8.842700436184552e-06, + "loss": 0.8003, + "step": 8108 + }, + { + "epoch": 0.44630964830205294, + "grad_norm": 0.9042232632637024, + "learning_rate": 8.842423089093519e-06, + "loss": 0.7534, + "step": 8109 + }, + { + "epoch": 0.44636468710440863, + "grad_norm": 0.8584854602813721, + "learning_rate": 8.842145713123863e-06, + "loss": 0.7759, + "step": 8110 + }, + { + "epoch": 0.44641972590676426, + "grad_norm": 0.7333530187606812, + "learning_rate": 8.841868308277668e-06, + "loss": 0.7218, + "step": 8111 + }, + { + "epoch": 0.44647476470911995, + "grad_norm": 0.7866941094398499, + "learning_rate": 8.84159087455702e-06, + "loss": 0.7016, + "step": 8112 + }, + { + "epoch": 0.4465298035114756, + "grad_norm": 0.7785252928733826, + "learning_rate": 8.841313411964001e-06, + "loss": 0.8232, + "step": 8113 + }, + { + "epoch": 0.4465848423138313, + "grad_norm": 0.7060698866844177, + "learning_rate": 8.841035920500702e-06, + "loss": 0.6987, + "step": 8114 + }, + { + "epoch": 0.4466398811161869, + "grad_norm": 0.7211717963218689, + "learning_rate": 8.840758400169203e-06, + "loss": 0.8604, + "step": 8115 + }, + { + "epoch": 0.4466949199185426, + "grad_norm": 0.979678213596344, + "learning_rate": 8.840480850971593e-06, + "loss": 0.9028, + "step": 8116 + }, + { + "epoch": 0.44674995872089823, + "grad_norm": 0.6595104336738586, + "learning_rate": 8.840203272909957e-06, + "loss": 0.6899, + "step": 8117 + }, + { + "epoch": 0.4468049975232539, + "grad_norm": 0.6392405033111572, + "learning_rate": 8.83992566598638e-06, + "loss": 0.7729, + "step": 8118 + }, + { + "epoch": 0.44686003632560956, + "grad_norm": 1.1084040403366089, + "learning_rate": 8.839648030202949e-06, + "loss": 0.822, + "step": 8119 + }, + { + "epoch": 0.4469150751279652, + "grad_norm": 0.7024106383323669, + "learning_rate": 8.839370365561754e-06, + "loss": 0.7615, + "step": 8120 + }, + { + "epoch": 0.4469701139303209, + "grad_norm": 0.7204060554504395, + "learning_rate": 8.839092672064878e-06, + "loss": 0.7527, + "step": 8121 + }, + { + "epoch": 0.4470251527326765, + "grad_norm": 0.7307723760604858, + "learning_rate": 8.838814949714407e-06, + "loss": 0.8139, + "step": 8122 + }, + { + "epoch": 0.4470801915350322, + "grad_norm": 0.824034571647644, + "learning_rate": 8.838537198512434e-06, + "loss": 0.8299, + "step": 8123 + }, + { + "epoch": 0.44713523033738783, + "grad_norm": 0.6603747606277466, + "learning_rate": 8.83825941846104e-06, + "loss": 0.6762, + "step": 8124 + }, + { + "epoch": 0.4471902691397435, + "grad_norm": 0.7403088808059692, + "learning_rate": 8.837981609562316e-06, + "loss": 0.716, + "step": 8125 + }, + { + "epoch": 0.44724530794209916, + "grad_norm": 0.742173969745636, + "learning_rate": 8.837703771818351e-06, + "loss": 0.7672, + "step": 8126 + }, + { + "epoch": 0.44730034674445485, + "grad_norm": 0.7158839106559753, + "learning_rate": 8.837425905231232e-06, + "loss": 0.6941, + "step": 8127 + }, + { + "epoch": 0.4473553855468105, + "grad_norm": 0.7659464478492737, + "learning_rate": 8.837148009803044e-06, + "loss": 0.7293, + "step": 8128 + }, + { + "epoch": 0.44741042434916617, + "grad_norm": 0.8681113719940186, + "learning_rate": 8.836870085535882e-06, + "loss": 0.8647, + "step": 8129 + }, + { + "epoch": 0.4474654631515218, + "grad_norm": 0.7117272615432739, + "learning_rate": 8.83659213243183e-06, + "loss": 0.8035, + "step": 8130 + }, + { + "epoch": 0.4475205019538775, + "grad_norm": 0.8220957517623901, + "learning_rate": 8.836314150492978e-06, + "loss": 0.6978, + "step": 8131 + }, + { + "epoch": 0.4475755407562331, + "grad_norm": 0.7045003175735474, + "learning_rate": 8.836036139721418e-06, + "loss": 0.747, + "step": 8132 + }, + { + "epoch": 0.4476305795585888, + "grad_norm": 0.6833191514015198, + "learning_rate": 8.835758100119235e-06, + "loss": 0.7604, + "step": 8133 + }, + { + "epoch": 0.44768561836094445, + "grad_norm": 0.7305697798728943, + "learning_rate": 8.835480031688521e-06, + "loss": 0.7301, + "step": 8134 + }, + { + "epoch": 0.44774065716330014, + "grad_norm": 0.7266964912414551, + "learning_rate": 8.835201934431366e-06, + "loss": 0.7675, + "step": 8135 + }, + { + "epoch": 0.44779569596565577, + "grad_norm": 0.6822015047073364, + "learning_rate": 8.834923808349861e-06, + "loss": 0.8226, + "step": 8136 + }, + { + "epoch": 0.44785073476801146, + "grad_norm": 0.7443515062332153, + "learning_rate": 8.834645653446095e-06, + "loss": 0.9289, + "step": 8137 + }, + { + "epoch": 0.4479057735703671, + "grad_norm": 0.7337210178375244, + "learning_rate": 8.834367469722158e-06, + "loss": 0.7758, + "step": 8138 + }, + { + "epoch": 0.4479608123727228, + "grad_norm": 0.6794925332069397, + "learning_rate": 8.83408925718014e-06, + "loss": 0.8426, + "step": 8139 + }, + { + "epoch": 0.4480158511750784, + "grad_norm": 0.7808265089988708, + "learning_rate": 8.833811015822135e-06, + "loss": 0.8464, + "step": 8140 + }, + { + "epoch": 0.4480708899774341, + "grad_norm": 0.7837018370628357, + "learning_rate": 8.833532745650234e-06, + "loss": 0.8722, + "step": 8141 + }, + { + "epoch": 0.44812592877978974, + "grad_norm": 0.9218140840530396, + "learning_rate": 8.833254446666526e-06, + "loss": 0.7981, + "step": 8142 + }, + { + "epoch": 0.44818096758214543, + "grad_norm": 0.7980387806892395, + "learning_rate": 8.832976118873103e-06, + "loss": 0.7705, + "step": 8143 + }, + { + "epoch": 0.44823600638450106, + "grad_norm": 0.7354007363319397, + "learning_rate": 8.832697762272057e-06, + "loss": 0.8286, + "step": 8144 + }, + { + "epoch": 0.44829104518685675, + "grad_norm": 0.7006223201751709, + "learning_rate": 8.832419376865482e-06, + "loss": 0.7107, + "step": 8145 + }, + { + "epoch": 0.4483460839892124, + "grad_norm": 0.7838212847709656, + "learning_rate": 8.83214096265547e-06, + "loss": 0.7676, + "step": 8146 + }, + { + "epoch": 0.4484011227915681, + "grad_norm": 0.7768213748931885, + "learning_rate": 8.83186251964411e-06, + "loss": 0.8689, + "step": 8147 + }, + { + "epoch": 0.4484561615939237, + "grad_norm": 0.7451630234718323, + "learning_rate": 8.831584047833497e-06, + "loss": 0.8625, + "step": 8148 + }, + { + "epoch": 0.4485112003962794, + "grad_norm": 0.7573269605636597, + "learning_rate": 8.831305547225725e-06, + "loss": 0.7357, + "step": 8149 + }, + { + "epoch": 0.44856623919863503, + "grad_norm": 0.6884848475456238, + "learning_rate": 8.831027017822886e-06, + "loss": 0.7306, + "step": 8150 + }, + { + "epoch": 0.4486212780009907, + "grad_norm": 0.7715907096862793, + "learning_rate": 8.830748459627073e-06, + "loss": 0.8311, + "step": 8151 + }, + { + "epoch": 0.44867631680334635, + "grad_norm": 0.6919859647750854, + "learning_rate": 8.83046987264038e-06, + "loss": 0.845, + "step": 8152 + }, + { + "epoch": 0.44873135560570204, + "grad_norm": 0.7066411972045898, + "learning_rate": 8.830191256864902e-06, + "loss": 0.7554, + "step": 8153 + }, + { + "epoch": 0.4487863944080577, + "grad_norm": 0.754196047782898, + "learning_rate": 8.829912612302729e-06, + "loss": 0.7396, + "step": 8154 + }, + { + "epoch": 0.44884143321041337, + "grad_norm": 0.7612286806106567, + "learning_rate": 8.82963393895596e-06, + "loss": 0.8154, + "step": 8155 + }, + { + "epoch": 0.448896472012769, + "grad_norm": 0.8576892614364624, + "learning_rate": 8.829355236826688e-06, + "loss": 0.7395, + "step": 8156 + }, + { + "epoch": 0.4489515108151247, + "grad_norm": 0.6813738346099854, + "learning_rate": 8.829076505917005e-06, + "loss": 0.7661, + "step": 8157 + }, + { + "epoch": 0.4490065496174803, + "grad_norm": 0.7453964948654175, + "learning_rate": 8.828797746229009e-06, + "loss": 0.8221, + "step": 8158 + }, + { + "epoch": 0.449061588419836, + "grad_norm": 0.7546728849411011, + "learning_rate": 8.828518957764795e-06, + "loss": 0.7717, + "step": 8159 + }, + { + "epoch": 0.44911662722219164, + "grad_norm": 0.8270652890205383, + "learning_rate": 8.828240140526456e-06, + "loss": 0.7582, + "step": 8160 + }, + { + "epoch": 0.44917166602454733, + "grad_norm": 0.8188696503639221, + "learning_rate": 8.827961294516089e-06, + "loss": 0.8841, + "step": 8161 + }, + { + "epoch": 0.44922670482690297, + "grad_norm": 0.9101365208625793, + "learning_rate": 8.82768241973579e-06, + "loss": 0.7099, + "step": 8162 + }, + { + "epoch": 0.4492817436292586, + "grad_norm": 0.6749762892723083, + "learning_rate": 8.827403516187656e-06, + "loss": 0.7766, + "step": 8163 + }, + { + "epoch": 0.4493367824316143, + "grad_norm": 1.1351534128189087, + "learning_rate": 8.827124583873781e-06, + "loss": 0.7536, + "step": 8164 + }, + { + "epoch": 0.4493918212339699, + "grad_norm": 0.8729487061500549, + "learning_rate": 8.826845622796261e-06, + "loss": 0.8613, + "step": 8165 + }, + { + "epoch": 0.4494468600363256, + "grad_norm": 0.7495871782302856, + "learning_rate": 8.826566632957193e-06, + "loss": 0.8365, + "step": 8166 + }, + { + "epoch": 0.44950189883868125, + "grad_norm": 0.6414516568183899, + "learning_rate": 8.826287614358677e-06, + "loss": 0.6574, + "step": 8167 + }, + { + "epoch": 0.44955693764103694, + "grad_norm": 0.6954017281532288, + "learning_rate": 8.826008567002805e-06, + "loss": 0.7857, + "step": 8168 + }, + { + "epoch": 0.44961197644339257, + "grad_norm": 0.7199459075927734, + "learning_rate": 8.825729490891678e-06, + "loss": 0.8585, + "step": 8169 + }, + { + "epoch": 0.44966701524574826, + "grad_norm": 0.8245406746864319, + "learning_rate": 8.825450386027392e-06, + "loss": 0.7238, + "step": 8170 + }, + { + "epoch": 0.4497220540481039, + "grad_norm": 0.6348667740821838, + "learning_rate": 8.825171252412044e-06, + "loss": 0.6991, + "step": 8171 + }, + { + "epoch": 0.4497770928504596, + "grad_norm": 0.6304741501808167, + "learning_rate": 8.824892090047734e-06, + "loss": 0.7101, + "step": 8172 + }, + { + "epoch": 0.4498321316528152, + "grad_norm": 0.7088820338249207, + "learning_rate": 8.82461289893656e-06, + "loss": 0.8217, + "step": 8173 + }, + { + "epoch": 0.4498871704551709, + "grad_norm": 0.7570851445198059, + "learning_rate": 8.824333679080617e-06, + "loss": 0.8029, + "step": 8174 + }, + { + "epoch": 0.44994220925752654, + "grad_norm": 0.7544378042221069, + "learning_rate": 8.824054430482007e-06, + "loss": 0.777, + "step": 8175 + }, + { + "epoch": 0.4499972480598822, + "grad_norm": 0.8226260542869568, + "learning_rate": 8.823775153142827e-06, + "loss": 0.8391, + "step": 8176 + }, + { + "epoch": 0.45005228686223786, + "grad_norm": 0.6861422061920166, + "learning_rate": 8.823495847065176e-06, + "loss": 0.7491, + "step": 8177 + }, + { + "epoch": 0.45010732566459355, + "grad_norm": 0.6643275618553162, + "learning_rate": 8.823216512251153e-06, + "loss": 0.6773, + "step": 8178 + }, + { + "epoch": 0.4501623644669492, + "grad_norm": 0.8201391100883484, + "learning_rate": 8.82293714870286e-06, + "loss": 0.8065, + "step": 8179 + }, + { + "epoch": 0.45021740326930487, + "grad_norm": 0.7783405780792236, + "learning_rate": 8.822657756422394e-06, + "loss": 0.7884, + "step": 8180 + }, + { + "epoch": 0.4502724420716605, + "grad_norm": 0.720745861530304, + "learning_rate": 8.822378335411856e-06, + "loss": 0.765, + "step": 8181 + }, + { + "epoch": 0.4503274808740162, + "grad_norm": 0.740364670753479, + "learning_rate": 8.822098885673346e-06, + "loss": 0.6354, + "step": 8182 + }, + { + "epoch": 0.45038251967637183, + "grad_norm": 0.8049225807189941, + "learning_rate": 8.821819407208963e-06, + "loss": 0.7023, + "step": 8183 + }, + { + "epoch": 0.4504375584787275, + "grad_norm": 0.7320911288261414, + "learning_rate": 8.821539900020808e-06, + "loss": 0.8429, + "step": 8184 + }, + { + "epoch": 0.45049259728108315, + "grad_norm": 0.7065376043319702, + "learning_rate": 8.821260364110984e-06, + "loss": 0.7283, + "step": 8185 + }, + { + "epoch": 0.45054763608343884, + "grad_norm": 0.7172972559928894, + "learning_rate": 8.820980799481588e-06, + "loss": 0.7673, + "step": 8186 + }, + { + "epoch": 0.4506026748857945, + "grad_norm": 0.712273895740509, + "learning_rate": 8.820701206134724e-06, + "loss": 0.7317, + "step": 8187 + }, + { + "epoch": 0.45065771368815016, + "grad_norm": 0.6954227685928345, + "learning_rate": 8.820421584072492e-06, + "loss": 0.7037, + "step": 8188 + }, + { + "epoch": 0.4507127524905058, + "grad_norm": 0.6790304780006409, + "learning_rate": 8.820141933296994e-06, + "loss": 0.7544, + "step": 8189 + }, + { + "epoch": 0.4507677912928615, + "grad_norm": 0.7483745813369751, + "learning_rate": 8.819862253810332e-06, + "loss": 0.7894, + "step": 8190 + }, + { + "epoch": 0.4508228300952171, + "grad_norm": 0.7926133871078491, + "learning_rate": 8.819582545614608e-06, + "loss": 0.8085, + "step": 8191 + }, + { + "epoch": 0.4508778688975728, + "grad_norm": 0.8442840576171875, + "learning_rate": 8.819302808711924e-06, + "loss": 0.8252, + "step": 8192 + }, + { + "epoch": 0.45093290769992844, + "grad_norm": 0.8359581232070923, + "learning_rate": 8.819023043104383e-06, + "loss": 0.8187, + "step": 8193 + }, + { + "epoch": 0.45098794650228413, + "grad_norm": 0.7793936133384705, + "learning_rate": 8.818743248794085e-06, + "loss": 0.8425, + "step": 8194 + }, + { + "epoch": 0.45104298530463977, + "grad_norm": 0.735509991645813, + "learning_rate": 8.818463425783136e-06, + "loss": 0.7781, + "step": 8195 + }, + { + "epoch": 0.45109802410699545, + "grad_norm": 0.6735361814498901, + "learning_rate": 8.818183574073639e-06, + "loss": 0.6987, + "step": 8196 + }, + { + "epoch": 0.4511530629093511, + "grad_norm": 0.7780157923698425, + "learning_rate": 8.817903693667695e-06, + "loss": 0.8474, + "step": 8197 + }, + { + "epoch": 0.4512081017117068, + "grad_norm": 0.6714445948600769, + "learning_rate": 8.817623784567411e-06, + "loss": 0.7216, + "step": 8198 + }, + { + "epoch": 0.4512631405140624, + "grad_norm": 0.6311395168304443, + "learning_rate": 8.817343846774886e-06, + "loss": 0.5724, + "step": 8199 + }, + { + "epoch": 0.4513181793164181, + "grad_norm": 0.7446333169937134, + "learning_rate": 8.817063880292227e-06, + "loss": 0.7867, + "step": 8200 + }, + { + "epoch": 0.45137321811877373, + "grad_norm": 0.7684246301651001, + "learning_rate": 8.816783885121539e-06, + "loss": 0.8141, + "step": 8201 + }, + { + "epoch": 0.4514282569211294, + "grad_norm": 0.754781186580658, + "learning_rate": 8.816503861264925e-06, + "loss": 0.8438, + "step": 8202 + }, + { + "epoch": 0.45148329572348506, + "grad_norm": 0.7705762982368469, + "learning_rate": 8.816223808724488e-06, + "loss": 0.8948, + "step": 8203 + }, + { + "epoch": 0.4515383345258407, + "grad_norm": 0.7731552720069885, + "learning_rate": 8.815943727502333e-06, + "loss": 0.7462, + "step": 8204 + }, + { + "epoch": 0.4515933733281964, + "grad_norm": 0.6615393757820129, + "learning_rate": 8.81566361760057e-06, + "loss": 0.7499, + "step": 8205 + }, + { + "epoch": 0.451648412130552, + "grad_norm": 0.724453866481781, + "learning_rate": 8.8153834790213e-06, + "loss": 0.7382, + "step": 8206 + }, + { + "epoch": 0.4517034509329077, + "grad_norm": 0.6369735598564148, + "learning_rate": 8.815103311766629e-06, + "loss": 0.7452, + "step": 8207 + }, + { + "epoch": 0.45175848973526334, + "grad_norm": 0.686000406742096, + "learning_rate": 8.814823115838659e-06, + "loss": 0.6971, + "step": 8208 + }, + { + "epoch": 0.451813528537619, + "grad_norm": 0.7372714281082153, + "learning_rate": 8.814542891239505e-06, + "loss": 0.8553, + "step": 8209 + }, + { + "epoch": 0.45186856733997466, + "grad_norm": 0.8348672986030579, + "learning_rate": 8.814262637971264e-06, + "loss": 0.7135, + "step": 8210 + }, + { + "epoch": 0.45192360614233035, + "grad_norm": 0.7829258441925049, + "learning_rate": 8.813982356036049e-06, + "loss": 0.7974, + "step": 8211 + }, + { + "epoch": 0.451978644944686, + "grad_norm": 0.7013983726501465, + "learning_rate": 8.81370204543596e-06, + "loss": 0.7531, + "step": 8212 + }, + { + "epoch": 0.45203368374704167, + "grad_norm": 0.8424196243286133, + "learning_rate": 8.81342170617311e-06, + "loss": 0.8217, + "step": 8213 + }, + { + "epoch": 0.4520887225493973, + "grad_norm": 0.7113365530967712, + "learning_rate": 8.813141338249603e-06, + "loss": 0.7728, + "step": 8214 + }, + { + "epoch": 0.452143761351753, + "grad_norm": 0.958642303943634, + "learning_rate": 8.812860941667545e-06, + "loss": 0.7234, + "step": 8215 + }, + { + "epoch": 0.4521988001541086, + "grad_norm": 0.6712706685066223, + "learning_rate": 8.812580516429045e-06, + "loss": 0.6998, + "step": 8216 + }, + { + "epoch": 0.4522538389564643, + "grad_norm": 0.7258469462394714, + "learning_rate": 8.812300062536212e-06, + "loss": 0.6758, + "step": 8217 + }, + { + "epoch": 0.45230887775881995, + "grad_norm": 0.735047459602356, + "learning_rate": 8.812019579991152e-06, + "loss": 0.7045, + "step": 8218 + }, + { + "epoch": 0.45236391656117564, + "grad_norm": 0.8339886665344238, + "learning_rate": 8.811739068795971e-06, + "loss": 0.8069, + "step": 8219 + }, + { + "epoch": 0.45241895536353127, + "grad_norm": 0.7170082926750183, + "learning_rate": 8.81145852895278e-06, + "loss": 0.6345, + "step": 8220 + }, + { + "epoch": 0.45247399416588696, + "grad_norm": 0.6892569661140442, + "learning_rate": 8.81117796046369e-06, + "loss": 0.712, + "step": 8221 + }, + { + "epoch": 0.4525290329682426, + "grad_norm": 0.6837140321731567, + "learning_rate": 8.810897363330804e-06, + "loss": 0.7184, + "step": 8222 + }, + { + "epoch": 0.4525840717705983, + "grad_norm": 0.7410069108009338, + "learning_rate": 8.810616737556235e-06, + "loss": 0.8265, + "step": 8223 + }, + { + "epoch": 0.4526391105729539, + "grad_norm": 0.6945875883102417, + "learning_rate": 8.810336083142089e-06, + "loss": 0.7163, + "step": 8224 + }, + { + "epoch": 0.4526941493753096, + "grad_norm": 0.6978884339332581, + "learning_rate": 8.810055400090477e-06, + "loss": 0.795, + "step": 8225 + }, + { + "epoch": 0.45274918817766524, + "grad_norm": 0.7209095358848572, + "learning_rate": 8.809774688403509e-06, + "loss": 0.7317, + "step": 8226 + }, + { + "epoch": 0.45280422698002093, + "grad_norm": 0.7279626727104187, + "learning_rate": 8.809493948083294e-06, + "loss": 0.7699, + "step": 8227 + }, + { + "epoch": 0.45285926578237656, + "grad_norm": 0.7642556428909302, + "learning_rate": 8.809213179131943e-06, + "loss": 0.8518, + "step": 8228 + }, + { + "epoch": 0.45291430458473225, + "grad_norm": 0.6868709325790405, + "learning_rate": 8.808932381551565e-06, + "loss": 0.737, + "step": 8229 + }, + { + "epoch": 0.4529693433870879, + "grad_norm": 0.7012789845466614, + "learning_rate": 8.80865155534427e-06, + "loss": 0.8146, + "step": 8230 + }, + { + "epoch": 0.4530243821894436, + "grad_norm": 0.678683340549469, + "learning_rate": 8.808370700512171e-06, + "loss": 0.7531, + "step": 8231 + }, + { + "epoch": 0.4530794209917992, + "grad_norm": 0.690559983253479, + "learning_rate": 8.808089817057377e-06, + "loss": 0.6779, + "step": 8232 + }, + { + "epoch": 0.4531344597941549, + "grad_norm": 0.7179763317108154, + "learning_rate": 8.807808904981997e-06, + "loss": 0.8815, + "step": 8233 + }, + { + "epoch": 0.45318949859651053, + "grad_norm": 0.7708277702331543, + "learning_rate": 8.807527964288147e-06, + "loss": 0.8084, + "step": 8234 + }, + { + "epoch": 0.4532445373988662, + "grad_norm": 0.6828494071960449, + "learning_rate": 8.807246994977936e-06, + "loss": 0.7587, + "step": 8235 + }, + { + "epoch": 0.45329957620122185, + "grad_norm": 0.7085250616073608, + "learning_rate": 8.806965997053475e-06, + "loss": 0.7894, + "step": 8236 + }, + { + "epoch": 0.45335461500357754, + "grad_norm": 0.7723467946052551, + "learning_rate": 8.806684970516876e-06, + "loss": 0.7408, + "step": 8237 + }, + { + "epoch": 0.4534096538059332, + "grad_norm": 0.8887566328048706, + "learning_rate": 8.806403915370253e-06, + "loss": 0.9022, + "step": 8238 + }, + { + "epoch": 0.45346469260828887, + "grad_norm": 0.7379833459854126, + "learning_rate": 8.806122831615718e-06, + "loss": 0.8264, + "step": 8239 + }, + { + "epoch": 0.4535197314106445, + "grad_norm": 0.903279721736908, + "learning_rate": 8.80584171925538e-06, + "loss": 0.7432, + "step": 8240 + }, + { + "epoch": 0.4535747702130002, + "grad_norm": 0.7671363353729248, + "learning_rate": 8.805560578291356e-06, + "loss": 0.8109, + "step": 8241 + }, + { + "epoch": 0.4536298090153558, + "grad_norm": 0.6047827005386353, + "learning_rate": 8.805279408725755e-06, + "loss": 0.6628, + "step": 8242 + }, + { + "epoch": 0.4536848478177115, + "grad_norm": 1.0570796728134155, + "learning_rate": 8.804998210560696e-06, + "loss": 0.7981, + "step": 8243 + }, + { + "epoch": 0.45373988662006715, + "grad_norm": 0.7116600871086121, + "learning_rate": 8.804716983798288e-06, + "loss": 0.7601, + "step": 8244 + }, + { + "epoch": 0.45379492542242283, + "grad_norm": 0.7162767648696899, + "learning_rate": 8.804435728440644e-06, + "loss": 0.8389, + "step": 8245 + }, + { + "epoch": 0.45384996422477847, + "grad_norm": 0.6715626120567322, + "learning_rate": 8.80415444448988e-06, + "loss": 0.6377, + "step": 8246 + }, + { + "epoch": 0.4539050030271341, + "grad_norm": 0.7168908715248108, + "learning_rate": 8.80387313194811e-06, + "loss": 0.7946, + "step": 8247 + }, + { + "epoch": 0.4539600418294898, + "grad_norm": 0.7497992515563965, + "learning_rate": 8.803591790817448e-06, + "loss": 0.8026, + "step": 8248 + }, + { + "epoch": 0.4540150806318454, + "grad_norm": 0.6665049195289612, + "learning_rate": 8.803310421100009e-06, + "loss": 0.779, + "step": 8249 + }, + { + "epoch": 0.4540701194342011, + "grad_norm": 0.766674280166626, + "learning_rate": 8.803029022797905e-06, + "loss": 0.7467, + "step": 8250 + }, + { + "epoch": 0.45412515823655675, + "grad_norm": 0.7306104302406311, + "learning_rate": 8.802747595913255e-06, + "loss": 0.8323, + "step": 8251 + }, + { + "epoch": 0.45418019703891244, + "grad_norm": 0.6425766944885254, + "learning_rate": 8.802466140448169e-06, + "loss": 0.7226, + "step": 8252 + }, + { + "epoch": 0.45423523584126807, + "grad_norm": 0.7992560267448425, + "learning_rate": 8.802184656404769e-06, + "loss": 0.7285, + "step": 8253 + }, + { + "epoch": 0.45429027464362376, + "grad_norm": 0.6935924887657166, + "learning_rate": 8.801903143785164e-06, + "loss": 0.5757, + "step": 8254 + }, + { + "epoch": 0.4543453134459794, + "grad_norm": 0.7091512084007263, + "learning_rate": 8.801621602591473e-06, + "loss": 0.7719, + "step": 8255 + }, + { + "epoch": 0.4544003522483351, + "grad_norm": 0.851231038570404, + "learning_rate": 8.801340032825814e-06, + "loss": 0.7804, + "step": 8256 + }, + { + "epoch": 0.4544553910506907, + "grad_norm": 0.7443445920944214, + "learning_rate": 8.801058434490298e-06, + "loss": 0.7172, + "step": 8257 + }, + { + "epoch": 0.4545104298530464, + "grad_norm": 0.7156546115875244, + "learning_rate": 8.800776807587046e-06, + "loss": 0.7756, + "step": 8258 + }, + { + "epoch": 0.45456546865540204, + "grad_norm": 0.8027580380439758, + "learning_rate": 8.800495152118172e-06, + "loss": 0.8035, + "step": 8259 + }, + { + "epoch": 0.4546205074577577, + "grad_norm": 0.6868240833282471, + "learning_rate": 8.800213468085794e-06, + "loss": 0.7159, + "step": 8260 + }, + { + "epoch": 0.45467554626011336, + "grad_norm": 0.9127504229545593, + "learning_rate": 8.79993175549203e-06, + "loss": 0.7705, + "step": 8261 + }, + { + "epoch": 0.45473058506246905, + "grad_norm": 0.7074575424194336, + "learning_rate": 8.799650014338994e-06, + "loss": 0.7841, + "step": 8262 + }, + { + "epoch": 0.4547856238648247, + "grad_norm": 0.7462378740310669, + "learning_rate": 8.799368244628807e-06, + "loss": 0.8125, + "step": 8263 + }, + { + "epoch": 0.4548406626671804, + "grad_norm": 0.7510300874710083, + "learning_rate": 8.799086446363585e-06, + "loss": 0.8354, + "step": 8264 + }, + { + "epoch": 0.454895701469536, + "grad_norm": 0.7134591937065125, + "learning_rate": 8.798804619545446e-06, + "loss": 0.7968, + "step": 8265 + }, + { + "epoch": 0.4549507402718917, + "grad_norm": 1.0424071550369263, + "learning_rate": 8.798522764176509e-06, + "loss": 0.8638, + "step": 8266 + }, + { + "epoch": 0.45500577907424733, + "grad_norm": 0.6805267930030823, + "learning_rate": 8.79824088025889e-06, + "loss": 0.757, + "step": 8267 + }, + { + "epoch": 0.455060817876603, + "grad_norm": 0.8145313262939453, + "learning_rate": 8.79795896779471e-06, + "loss": 0.7589, + "step": 8268 + }, + { + "epoch": 0.45511585667895865, + "grad_norm": 0.7611781358718872, + "learning_rate": 8.79767702678609e-06, + "loss": 0.8426, + "step": 8269 + }, + { + "epoch": 0.45517089548131434, + "grad_norm": 0.7639568448066711, + "learning_rate": 8.797395057235142e-06, + "loss": 0.6609, + "step": 8270 + }, + { + "epoch": 0.45522593428367, + "grad_norm": 0.8577544093132019, + "learning_rate": 8.79711305914399e-06, + "loss": 0.8085, + "step": 8271 + }, + { + "epoch": 0.45528097308602566, + "grad_norm": 0.7740383148193359, + "learning_rate": 8.796831032514754e-06, + "loss": 0.8689, + "step": 8272 + }, + { + "epoch": 0.4553360118883813, + "grad_norm": 0.7300885915756226, + "learning_rate": 8.796548977349553e-06, + "loss": 0.8303, + "step": 8273 + }, + { + "epoch": 0.455391050690737, + "grad_norm": 0.6677057147026062, + "learning_rate": 8.796266893650504e-06, + "loss": 0.7449, + "step": 8274 + }, + { + "epoch": 0.4554460894930926, + "grad_norm": 0.7269144058227539, + "learning_rate": 8.79598478141973e-06, + "loss": 0.8744, + "step": 8275 + }, + { + "epoch": 0.4555011282954483, + "grad_norm": 0.7458559274673462, + "learning_rate": 8.795702640659351e-06, + "loss": 0.8036, + "step": 8276 + }, + { + "epoch": 0.45555616709780394, + "grad_norm": 0.7693114280700684, + "learning_rate": 8.795420471371487e-06, + "loss": 0.7617, + "step": 8277 + }, + { + "epoch": 0.45561120590015963, + "grad_norm": 0.7594510316848755, + "learning_rate": 8.79513827355826e-06, + "loss": 0.7049, + "step": 8278 + }, + { + "epoch": 0.45566624470251527, + "grad_norm": 0.7481217980384827, + "learning_rate": 8.794856047221786e-06, + "loss": 0.804, + "step": 8279 + }, + { + "epoch": 0.45572128350487096, + "grad_norm": 0.726859986782074, + "learning_rate": 8.794573792364192e-06, + "loss": 0.7322, + "step": 8280 + }, + { + "epoch": 0.4557763223072266, + "grad_norm": 0.6800149083137512, + "learning_rate": 8.794291508987597e-06, + "loss": 0.8467, + "step": 8281 + }, + { + "epoch": 0.4558313611095823, + "grad_norm": 0.6264217495918274, + "learning_rate": 8.794009197094122e-06, + "loss": 0.6203, + "step": 8282 + }, + { + "epoch": 0.4558863999119379, + "grad_norm": 0.6973850131034851, + "learning_rate": 8.79372685668589e-06, + "loss": 0.8211, + "step": 8283 + }, + { + "epoch": 0.4559414387142936, + "grad_norm": 0.6992879509925842, + "learning_rate": 8.793444487765022e-06, + "loss": 0.7831, + "step": 8284 + }, + { + "epoch": 0.45599647751664923, + "grad_norm": 0.7641519904136658, + "learning_rate": 8.793162090333643e-06, + "loss": 0.7519, + "step": 8285 + }, + { + "epoch": 0.4560515163190049, + "grad_norm": 0.7296152710914612, + "learning_rate": 8.79287966439387e-06, + "loss": 0.8738, + "step": 8286 + }, + { + "epoch": 0.45610655512136056, + "grad_norm": 0.7549383044242859, + "learning_rate": 8.79259720994783e-06, + "loss": 0.7868, + "step": 8287 + }, + { + "epoch": 0.45616159392371625, + "grad_norm": 0.7932083606719971, + "learning_rate": 8.792314726997644e-06, + "loss": 0.8443, + "step": 8288 + }, + { + "epoch": 0.4562166327260719, + "grad_norm": 0.7999894022941589, + "learning_rate": 8.792032215545437e-06, + "loss": 0.852, + "step": 8289 + }, + { + "epoch": 0.4562716715284275, + "grad_norm": 0.8092383742332458, + "learning_rate": 8.79174967559333e-06, + "loss": 0.7922, + "step": 8290 + }, + { + "epoch": 0.4563267103307832, + "grad_norm": 0.7481340169906616, + "learning_rate": 8.791467107143447e-06, + "loss": 0.7086, + "step": 8291 + }, + { + "epoch": 0.45638174913313884, + "grad_norm": 0.8096129298210144, + "learning_rate": 8.791184510197912e-06, + "loss": 0.6645, + "step": 8292 + }, + { + "epoch": 0.4564367879354945, + "grad_norm": 0.7276492118835449, + "learning_rate": 8.79090188475885e-06, + "loss": 0.7174, + "step": 8293 + }, + { + "epoch": 0.45649182673785016, + "grad_norm": 0.815535843372345, + "learning_rate": 8.790619230828385e-06, + "loss": 0.8622, + "step": 8294 + }, + { + "epoch": 0.45654686554020585, + "grad_norm": 0.8191169500350952, + "learning_rate": 8.790336548408637e-06, + "loss": 0.8666, + "step": 8295 + }, + { + "epoch": 0.4566019043425615, + "grad_norm": 0.7449167966842651, + "learning_rate": 8.790053837501737e-06, + "loss": 0.7728, + "step": 8296 + }, + { + "epoch": 0.45665694314491717, + "grad_norm": 0.7311065196990967, + "learning_rate": 8.789771098109808e-06, + "loss": 0.8059, + "step": 8297 + }, + { + "epoch": 0.4567119819472728, + "grad_norm": 0.7381907105445862, + "learning_rate": 8.789488330234971e-06, + "loss": 0.7722, + "step": 8298 + }, + { + "epoch": 0.4567670207496285, + "grad_norm": 0.8180661201477051, + "learning_rate": 8.789205533879355e-06, + "loss": 0.9032, + "step": 8299 + }, + { + "epoch": 0.4568220595519841, + "grad_norm": 0.7993118762969971, + "learning_rate": 8.788922709045087e-06, + "loss": 0.8065, + "step": 8300 + }, + { + "epoch": 0.4568770983543398, + "grad_norm": 0.8449206948280334, + "learning_rate": 8.788639855734287e-06, + "loss": 0.7895, + "step": 8301 + }, + { + "epoch": 0.45693213715669545, + "grad_norm": 0.9224583506584167, + "learning_rate": 8.788356973949084e-06, + "loss": 0.78, + "step": 8302 + }, + { + "epoch": 0.45698717595905114, + "grad_norm": 0.7109915614128113, + "learning_rate": 8.788074063691604e-06, + "loss": 0.8029, + "step": 8303 + }, + { + "epoch": 0.4570422147614068, + "grad_norm": 0.7372310757637024, + "learning_rate": 8.787791124963976e-06, + "loss": 0.8118, + "step": 8304 + }, + { + "epoch": 0.45709725356376246, + "grad_norm": 0.8127168416976929, + "learning_rate": 8.787508157768323e-06, + "loss": 0.8665, + "step": 8305 + }, + { + "epoch": 0.4571522923661181, + "grad_norm": 0.7193050980567932, + "learning_rate": 8.787225162106771e-06, + "loss": 0.749, + "step": 8306 + }, + { + "epoch": 0.4572073311684738, + "grad_norm": 0.8825041651725769, + "learning_rate": 8.786942137981449e-06, + "loss": 0.9651, + "step": 8307 + }, + { + "epoch": 0.4572623699708294, + "grad_norm": 0.6854885816574097, + "learning_rate": 8.786659085394485e-06, + "loss": 0.8259, + "step": 8308 + }, + { + "epoch": 0.4573174087731851, + "grad_norm": 0.6698010563850403, + "learning_rate": 8.786376004348004e-06, + "loss": 0.7212, + "step": 8309 + }, + { + "epoch": 0.45737244757554074, + "grad_norm": 0.7706398963928223, + "learning_rate": 8.786092894844132e-06, + "loss": 0.719, + "step": 8310 + }, + { + "epoch": 0.45742748637789643, + "grad_norm": 0.8905620574951172, + "learning_rate": 8.785809756885002e-06, + "loss": 0.7518, + "step": 8311 + }, + { + "epoch": 0.45748252518025206, + "grad_norm": 0.7537117004394531, + "learning_rate": 8.78552659047274e-06, + "loss": 0.8267, + "step": 8312 + }, + { + "epoch": 0.45753756398260775, + "grad_norm": 0.7840754985809326, + "learning_rate": 8.78524339560947e-06, + "loss": 0.8417, + "step": 8313 + }, + { + "epoch": 0.4575926027849634, + "grad_norm": 0.7373713254928589, + "learning_rate": 8.784960172297327e-06, + "loss": 0.784, + "step": 8314 + }, + { + "epoch": 0.4576476415873191, + "grad_norm": 0.6648432016372681, + "learning_rate": 8.784676920538436e-06, + "loss": 0.7252, + "step": 8315 + }, + { + "epoch": 0.4577026803896747, + "grad_norm": 0.7904912829399109, + "learning_rate": 8.784393640334925e-06, + "loss": 0.7777, + "step": 8316 + }, + { + "epoch": 0.4577577191920304, + "grad_norm": 0.7691501379013062, + "learning_rate": 8.784110331688927e-06, + "loss": 0.733, + "step": 8317 + }, + { + "epoch": 0.45781275799438603, + "grad_norm": 0.6054617762565613, + "learning_rate": 8.783826994602566e-06, + "loss": 0.6367, + "step": 8318 + }, + { + "epoch": 0.4578677967967417, + "grad_norm": 0.7495457530021667, + "learning_rate": 8.783543629077976e-06, + "loss": 0.8672, + "step": 8319 + }, + { + "epoch": 0.45792283559909736, + "grad_norm": 0.6979867815971375, + "learning_rate": 8.783260235117283e-06, + "loss": 0.7338, + "step": 8320 + }, + { + "epoch": 0.45797787440145304, + "grad_norm": 0.6927759647369385, + "learning_rate": 8.78297681272262e-06, + "loss": 0.6925, + "step": 8321 + }, + { + "epoch": 0.4580329132038087, + "grad_norm": 0.9076687097549438, + "learning_rate": 8.782693361896115e-06, + "loss": 0.8225, + "step": 8322 + }, + { + "epoch": 0.45808795200616437, + "grad_norm": 0.7990893721580505, + "learning_rate": 8.782409882639902e-06, + "loss": 0.8144, + "step": 8323 + }, + { + "epoch": 0.45814299080852, + "grad_norm": 0.7958230376243591, + "learning_rate": 8.782126374956107e-06, + "loss": 0.7717, + "step": 8324 + }, + { + "epoch": 0.4581980296108757, + "grad_norm": 0.7694645524024963, + "learning_rate": 8.781842838846861e-06, + "loss": 0.8314, + "step": 8325 + }, + { + "epoch": 0.4582530684132313, + "grad_norm": 0.8653621077537537, + "learning_rate": 8.781559274314297e-06, + "loss": 0.7567, + "step": 8326 + }, + { + "epoch": 0.458308107215587, + "grad_norm": 0.7834668755531311, + "learning_rate": 8.781275681360548e-06, + "loss": 0.7431, + "step": 8327 + }, + { + "epoch": 0.45836314601794265, + "grad_norm": 0.6800104975700378, + "learning_rate": 8.780992059987742e-06, + "loss": 0.8266, + "step": 8328 + }, + { + "epoch": 0.45841818482029834, + "grad_norm": 0.7274910807609558, + "learning_rate": 8.780708410198011e-06, + "loss": 0.7358, + "step": 8329 + }, + { + "epoch": 0.45847322362265397, + "grad_norm": 0.8102344870567322, + "learning_rate": 8.780424731993488e-06, + "loss": 0.7397, + "step": 8330 + }, + { + "epoch": 0.45852826242500966, + "grad_norm": 0.7536956071853638, + "learning_rate": 8.780141025376305e-06, + "loss": 0.7053, + "step": 8331 + }, + { + "epoch": 0.4585833012273653, + "grad_norm": 0.678535521030426, + "learning_rate": 8.779857290348594e-06, + "loss": 0.792, + "step": 8332 + }, + { + "epoch": 0.4586383400297209, + "grad_norm": 0.8847216963768005, + "learning_rate": 8.779573526912487e-06, + "loss": 0.8117, + "step": 8333 + }, + { + "epoch": 0.4586933788320766, + "grad_norm": 0.6997288465499878, + "learning_rate": 8.779289735070117e-06, + "loss": 0.7797, + "step": 8334 + }, + { + "epoch": 0.45874841763443225, + "grad_norm": 0.7445441484451294, + "learning_rate": 8.779005914823617e-06, + "loss": 0.7505, + "step": 8335 + }, + { + "epoch": 0.45880345643678794, + "grad_norm": 0.618844211101532, + "learning_rate": 8.778722066175121e-06, + "loss": 0.661, + "step": 8336 + }, + { + "epoch": 0.45885849523914357, + "grad_norm": 0.6810492873191833, + "learning_rate": 8.778438189126761e-06, + "loss": 0.6819, + "step": 8337 + }, + { + "epoch": 0.45891353404149926, + "grad_norm": 0.6785591244697571, + "learning_rate": 8.778154283680671e-06, + "loss": 0.7808, + "step": 8338 + }, + { + "epoch": 0.4589685728438549, + "grad_norm": 0.7461212873458862, + "learning_rate": 8.777870349838984e-06, + "loss": 0.8566, + "step": 8339 + }, + { + "epoch": 0.4590236116462106, + "grad_norm": 0.6731496453285217, + "learning_rate": 8.777586387603836e-06, + "loss": 0.823, + "step": 8340 + }, + { + "epoch": 0.4590786504485662, + "grad_norm": 0.7295553684234619, + "learning_rate": 8.77730239697736e-06, + "loss": 0.9229, + "step": 8341 + }, + { + "epoch": 0.4591336892509219, + "grad_norm": 0.783275842666626, + "learning_rate": 8.77701837796169e-06, + "loss": 0.782, + "step": 8342 + }, + { + "epoch": 0.45918872805327754, + "grad_norm": 0.6952852606773376, + "learning_rate": 8.77673433055896e-06, + "loss": 0.7977, + "step": 8343 + }, + { + "epoch": 0.45924376685563323, + "grad_norm": 0.7381969094276428, + "learning_rate": 8.776450254771305e-06, + "loss": 0.768, + "step": 8344 + }, + { + "epoch": 0.45929880565798886, + "grad_norm": 0.7911093831062317, + "learning_rate": 8.776166150600862e-06, + "loss": 0.8284, + "step": 8345 + }, + { + "epoch": 0.45935384446034455, + "grad_norm": 0.7319246530532837, + "learning_rate": 8.775882018049765e-06, + "loss": 0.8135, + "step": 8346 + }, + { + "epoch": 0.4594088832627002, + "grad_norm": 0.7888429760932922, + "learning_rate": 8.77559785712015e-06, + "loss": 0.9001, + "step": 8347 + }, + { + "epoch": 0.4594639220650559, + "grad_norm": 0.6983326077461243, + "learning_rate": 8.775313667814151e-06, + "loss": 0.7537, + "step": 8348 + }, + { + "epoch": 0.4595189608674115, + "grad_norm": 0.7532416582107544, + "learning_rate": 8.775029450133905e-06, + "loss": 0.8307, + "step": 8349 + }, + { + "epoch": 0.4595739996697672, + "grad_norm": 0.7159993052482605, + "learning_rate": 8.774745204081549e-06, + "loss": 0.7874, + "step": 8350 + }, + { + "epoch": 0.45962903847212283, + "grad_norm": 0.6898767352104187, + "learning_rate": 8.774460929659218e-06, + "loss": 0.7453, + "step": 8351 + }, + { + "epoch": 0.4596840772744785, + "grad_norm": 0.6833236813545227, + "learning_rate": 8.774176626869051e-06, + "loss": 0.7281, + "step": 8352 + }, + { + "epoch": 0.45973911607683415, + "grad_norm": 0.7840244770050049, + "learning_rate": 8.77389229571318e-06, + "loss": 0.7194, + "step": 8353 + }, + { + "epoch": 0.45979415487918984, + "grad_norm": 0.7920441627502441, + "learning_rate": 8.773607936193747e-06, + "loss": 0.7135, + "step": 8354 + }, + { + "epoch": 0.4598491936815455, + "grad_norm": 0.7395668625831604, + "learning_rate": 8.773323548312884e-06, + "loss": 0.8162, + "step": 8355 + }, + { + "epoch": 0.45990423248390117, + "grad_norm": 0.7854128479957581, + "learning_rate": 8.773039132072734e-06, + "loss": 0.8252, + "step": 8356 + }, + { + "epoch": 0.4599592712862568, + "grad_norm": 0.694997251033783, + "learning_rate": 8.772754687475431e-06, + "loss": 0.6627, + "step": 8357 + }, + { + "epoch": 0.4600143100886125, + "grad_norm": 0.7698866724967957, + "learning_rate": 8.772470214523112e-06, + "loss": 0.8814, + "step": 8358 + }, + { + "epoch": 0.4600693488909681, + "grad_norm": 0.7323407530784607, + "learning_rate": 8.77218571321792e-06, + "loss": 0.7769, + "step": 8359 + }, + { + "epoch": 0.4601243876933238, + "grad_norm": 0.6637027263641357, + "learning_rate": 8.771901183561986e-06, + "loss": 0.6741, + "step": 8360 + }, + { + "epoch": 0.46017942649567944, + "grad_norm": 0.7423702478408813, + "learning_rate": 8.771616625557455e-06, + "loss": 0.7303, + "step": 8361 + }, + { + "epoch": 0.46023446529803513, + "grad_norm": 0.7599568367004395, + "learning_rate": 8.771332039206463e-06, + "loss": 0.8161, + "step": 8362 + }, + { + "epoch": 0.46028950410039077, + "grad_norm": 0.9063183069229126, + "learning_rate": 8.771047424511148e-06, + "loss": 0.8098, + "step": 8363 + }, + { + "epoch": 0.46034454290274646, + "grad_norm": 0.658210813999176, + "learning_rate": 8.770762781473651e-06, + "loss": 0.7097, + "step": 8364 + }, + { + "epoch": 0.4603995817051021, + "grad_norm": 0.8396975994110107, + "learning_rate": 8.770478110096111e-06, + "loss": 0.8731, + "step": 8365 + }, + { + "epoch": 0.4604546205074578, + "grad_norm": 0.7334815263748169, + "learning_rate": 8.770193410380663e-06, + "loss": 0.7689, + "step": 8366 + }, + { + "epoch": 0.4605096593098134, + "grad_norm": 0.8220386505126953, + "learning_rate": 8.769908682329453e-06, + "loss": 0.8139, + "step": 8367 + }, + { + "epoch": 0.4605646981121691, + "grad_norm": 0.8077995181083679, + "learning_rate": 8.76962392594462e-06, + "loss": 0.7379, + "step": 8368 + }, + { + "epoch": 0.46061973691452474, + "grad_norm": 0.8007730841636658, + "learning_rate": 8.7693391412283e-06, + "loss": 0.7835, + "step": 8369 + }, + { + "epoch": 0.4606747757168804, + "grad_norm": 0.7108187079429626, + "learning_rate": 8.769054328182637e-06, + "loss": 0.6787, + "step": 8370 + }, + { + "epoch": 0.46072981451923606, + "grad_norm": 0.7623056173324585, + "learning_rate": 8.768769486809772e-06, + "loss": 0.8056, + "step": 8371 + }, + { + "epoch": 0.46078485332159175, + "grad_norm": 0.6991614103317261, + "learning_rate": 8.768484617111843e-06, + "loss": 0.7404, + "step": 8372 + }, + { + "epoch": 0.4608398921239474, + "grad_norm": 0.7531471848487854, + "learning_rate": 8.768199719090991e-06, + "loss": 0.8104, + "step": 8373 + }, + { + "epoch": 0.46089493092630307, + "grad_norm": 1.0271111726760864, + "learning_rate": 8.76791479274936e-06, + "loss": 0.9028, + "step": 8374 + }, + { + "epoch": 0.4609499697286587, + "grad_norm": 0.7346897125244141, + "learning_rate": 8.76762983808909e-06, + "loss": 0.8179, + "step": 8375 + }, + { + "epoch": 0.46100500853101434, + "grad_norm": 0.6413559913635254, + "learning_rate": 8.767344855112324e-06, + "loss": 0.7995, + "step": 8376 + }, + { + "epoch": 0.46106004733337, + "grad_norm": 0.7187537550926208, + "learning_rate": 8.767059843821199e-06, + "loss": 0.7973, + "step": 8377 + }, + { + "epoch": 0.46111508613572566, + "grad_norm": 0.6819092035293579, + "learning_rate": 8.766774804217864e-06, + "loss": 0.8255, + "step": 8378 + }, + { + "epoch": 0.46117012493808135, + "grad_norm": 0.683318018913269, + "learning_rate": 8.766489736304457e-06, + "loss": 0.6794, + "step": 8379 + }, + { + "epoch": 0.461225163740437, + "grad_norm": 0.7345470786094666, + "learning_rate": 8.76620464008312e-06, + "loss": 0.8741, + "step": 8380 + }, + { + "epoch": 0.46128020254279267, + "grad_norm": 0.7369397282600403, + "learning_rate": 8.765919515556e-06, + "loss": 0.8301, + "step": 8381 + }, + { + "epoch": 0.4613352413451483, + "grad_norm": 0.7304979562759399, + "learning_rate": 8.765634362725233e-06, + "loss": 0.7507, + "step": 8382 + }, + { + "epoch": 0.461390280147504, + "grad_norm": 0.7968454957008362, + "learning_rate": 8.765349181592969e-06, + "loss": 0.7396, + "step": 8383 + }, + { + "epoch": 0.46144531894985963, + "grad_norm": 0.691439151763916, + "learning_rate": 8.765063972161347e-06, + "loss": 0.7199, + "step": 8384 + }, + { + "epoch": 0.4615003577522153, + "grad_norm": 0.8355879187583923, + "learning_rate": 8.764778734432513e-06, + "loss": 0.7369, + "step": 8385 + }, + { + "epoch": 0.46155539655457095, + "grad_norm": 0.908017098903656, + "learning_rate": 8.76449346840861e-06, + "loss": 0.8271, + "step": 8386 + }, + { + "epoch": 0.46161043535692664, + "grad_norm": 0.6426172852516174, + "learning_rate": 8.764208174091781e-06, + "loss": 0.6646, + "step": 8387 + }, + { + "epoch": 0.4616654741592823, + "grad_norm": 0.7003652453422546, + "learning_rate": 8.763922851484171e-06, + "loss": 0.7272, + "step": 8388 + }, + { + "epoch": 0.46172051296163796, + "grad_norm": 0.7470494508743286, + "learning_rate": 8.763637500587925e-06, + "loss": 0.8333, + "step": 8389 + }, + { + "epoch": 0.4617755517639936, + "grad_norm": 0.6974903345108032, + "learning_rate": 8.763352121405187e-06, + "loss": 0.834, + "step": 8390 + }, + { + "epoch": 0.4618305905663493, + "grad_norm": 0.8146659135818481, + "learning_rate": 8.7630667139381e-06, + "loss": 0.724, + "step": 8391 + }, + { + "epoch": 0.4618856293687049, + "grad_norm": 0.6614096164703369, + "learning_rate": 8.762781278188813e-06, + "loss": 0.6822, + "step": 8392 + }, + { + "epoch": 0.4619406681710606, + "grad_norm": 0.712944746017456, + "learning_rate": 8.762495814159469e-06, + "loss": 0.7864, + "step": 8393 + }, + { + "epoch": 0.46199570697341624, + "grad_norm": 0.7531552910804749, + "learning_rate": 8.762210321852213e-06, + "loss": 0.7494, + "step": 8394 + }, + { + "epoch": 0.46205074577577193, + "grad_norm": 0.8150199055671692, + "learning_rate": 8.761924801269191e-06, + "loss": 0.7869, + "step": 8395 + }, + { + "epoch": 0.46210578457812757, + "grad_norm": 0.8586462736129761, + "learning_rate": 8.76163925241255e-06, + "loss": 0.7647, + "step": 8396 + }, + { + "epoch": 0.46216082338048325, + "grad_norm": 0.7258061766624451, + "learning_rate": 8.761353675284434e-06, + "loss": 0.7672, + "step": 8397 + }, + { + "epoch": 0.4622158621828389, + "grad_norm": 0.6592851281166077, + "learning_rate": 8.761068069886992e-06, + "loss": 0.7488, + "step": 8398 + }, + { + "epoch": 0.4622709009851946, + "grad_norm": 0.7410836219787598, + "learning_rate": 8.760782436222368e-06, + "loss": 0.6669, + "step": 8399 + }, + { + "epoch": 0.4623259397875502, + "grad_norm": 0.7121642231941223, + "learning_rate": 8.76049677429271e-06, + "loss": 0.7005, + "step": 8400 + }, + { + "epoch": 0.4623809785899059, + "grad_norm": 0.7170663475990295, + "learning_rate": 8.760211084100166e-06, + "loss": 0.8154, + "step": 8401 + }, + { + "epoch": 0.46243601739226153, + "grad_norm": 0.6851769685745239, + "learning_rate": 8.759925365646882e-06, + "loss": 0.7948, + "step": 8402 + }, + { + "epoch": 0.4624910561946172, + "grad_norm": 0.7728533744812012, + "learning_rate": 8.759639618935006e-06, + "loss": 0.8263, + "step": 8403 + }, + { + "epoch": 0.46254609499697286, + "grad_norm": 0.7276784777641296, + "learning_rate": 8.759353843966682e-06, + "loss": 0.6992, + "step": 8404 + }, + { + "epoch": 0.46260113379932855, + "grad_norm": 0.7533649802207947, + "learning_rate": 8.759068040744063e-06, + "loss": 0.7744, + "step": 8405 + }, + { + "epoch": 0.4626561726016842, + "grad_norm": 0.6911979913711548, + "learning_rate": 8.758782209269294e-06, + "loss": 0.6977, + "step": 8406 + }, + { + "epoch": 0.46271121140403987, + "grad_norm": 0.6723766922950745, + "learning_rate": 8.758496349544526e-06, + "loss": 0.7286, + "step": 8407 + }, + { + "epoch": 0.4627662502063955, + "grad_norm": 0.7327921390533447, + "learning_rate": 8.758210461571903e-06, + "loss": 0.7708, + "step": 8408 + }, + { + "epoch": 0.4628212890087512, + "grad_norm": 0.7498626708984375, + "learning_rate": 8.757924545353578e-06, + "loss": 0.7476, + "step": 8409 + }, + { + "epoch": 0.4628763278111068, + "grad_norm": 0.8944914937019348, + "learning_rate": 8.757638600891696e-06, + "loss": 0.7814, + "step": 8410 + }, + { + "epoch": 0.4629313666134625, + "grad_norm": 0.7242841124534607, + "learning_rate": 8.757352628188411e-06, + "loss": 0.7564, + "step": 8411 + }, + { + "epoch": 0.46298640541581815, + "grad_norm": 0.6706324815750122, + "learning_rate": 8.757066627245866e-06, + "loss": 0.7792, + "step": 8412 + }, + { + "epoch": 0.46304144421817384, + "grad_norm": 0.8044155836105347, + "learning_rate": 8.756780598066218e-06, + "loss": 0.7873, + "step": 8413 + }, + { + "epoch": 0.46309648302052947, + "grad_norm": 0.9265295267105103, + "learning_rate": 8.75649454065161e-06, + "loss": 0.878, + "step": 8414 + }, + { + "epoch": 0.46315152182288516, + "grad_norm": 0.8162378668785095, + "learning_rate": 8.756208455004194e-06, + "loss": 0.8758, + "step": 8415 + }, + { + "epoch": 0.4632065606252408, + "grad_norm": 0.7081401348114014, + "learning_rate": 8.755922341126121e-06, + "loss": 0.8053, + "step": 8416 + }, + { + "epoch": 0.4632615994275965, + "grad_norm": 0.663885235786438, + "learning_rate": 8.755636199019544e-06, + "loss": 0.7456, + "step": 8417 + }, + { + "epoch": 0.4633166382299521, + "grad_norm": 0.6934974193572998, + "learning_rate": 8.755350028686608e-06, + "loss": 0.7316, + "step": 8418 + }, + { + "epoch": 0.46337167703230775, + "grad_norm": 0.7162168025970459, + "learning_rate": 8.755063830129467e-06, + "loss": 0.8566, + "step": 8419 + }, + { + "epoch": 0.46342671583466344, + "grad_norm": 0.7507640719413757, + "learning_rate": 8.75477760335027e-06, + "loss": 0.8141, + "step": 8420 + }, + { + "epoch": 0.46348175463701907, + "grad_norm": 0.6853382587432861, + "learning_rate": 8.754491348351172e-06, + "loss": 0.6995, + "step": 8421 + }, + { + "epoch": 0.46353679343937476, + "grad_norm": 0.6421381831169128, + "learning_rate": 8.75420506513432e-06, + "loss": 0.6344, + "step": 8422 + }, + { + "epoch": 0.4635918322417304, + "grad_norm": 0.8042624592781067, + "learning_rate": 8.753918753701868e-06, + "loss": 0.7506, + "step": 8423 + }, + { + "epoch": 0.4636468710440861, + "grad_norm": 0.7184088230133057, + "learning_rate": 8.753632414055969e-06, + "loss": 0.7997, + "step": 8424 + }, + { + "epoch": 0.4637019098464417, + "grad_norm": 0.749919593334198, + "learning_rate": 8.753346046198773e-06, + "loss": 0.8168, + "step": 8425 + }, + { + "epoch": 0.4637569486487974, + "grad_norm": 0.6583670973777771, + "learning_rate": 8.753059650132433e-06, + "loss": 0.6615, + "step": 8426 + }, + { + "epoch": 0.46381198745115304, + "grad_norm": 0.7560496926307678, + "learning_rate": 8.7527732258591e-06, + "loss": 0.7221, + "step": 8427 + }, + { + "epoch": 0.46386702625350873, + "grad_norm": 0.7031972408294678, + "learning_rate": 8.752486773380928e-06, + "loss": 0.8124, + "step": 8428 + }, + { + "epoch": 0.46392206505586436, + "grad_norm": 0.684124767780304, + "learning_rate": 8.752200292700072e-06, + "loss": 0.6862, + "step": 8429 + }, + { + "epoch": 0.46397710385822005, + "grad_norm": 0.8015589118003845, + "learning_rate": 8.751913783818682e-06, + "loss": 0.7863, + "step": 8430 + }, + { + "epoch": 0.4640321426605757, + "grad_norm": 0.6815705299377441, + "learning_rate": 8.751627246738912e-06, + "loss": 0.8116, + "step": 8431 + }, + { + "epoch": 0.4640871814629314, + "grad_norm": 0.7402058839797974, + "learning_rate": 8.751340681462914e-06, + "loss": 0.7341, + "step": 8432 + }, + { + "epoch": 0.464142220265287, + "grad_norm": 0.7484470009803772, + "learning_rate": 8.751054087992848e-06, + "loss": 0.8103, + "step": 8433 + }, + { + "epoch": 0.4641972590676427, + "grad_norm": 0.8148707151412964, + "learning_rate": 8.75076746633086e-06, + "loss": 0.8995, + "step": 8434 + }, + { + "epoch": 0.46425229786999833, + "grad_norm": 0.6403086185455322, + "learning_rate": 8.750480816479107e-06, + "loss": 0.6705, + "step": 8435 + }, + { + "epoch": 0.464307336672354, + "grad_norm": 0.7787690758705139, + "learning_rate": 8.750194138439748e-06, + "loss": 0.854, + "step": 8436 + }, + { + "epoch": 0.46436237547470965, + "grad_norm": 0.6975393891334534, + "learning_rate": 8.749907432214931e-06, + "loss": 0.7588, + "step": 8437 + }, + { + "epoch": 0.46441741427706534, + "grad_norm": 0.8002430200576782, + "learning_rate": 8.749620697806812e-06, + "loss": 0.8244, + "step": 8438 + }, + { + "epoch": 0.464472453079421, + "grad_norm": 0.8049100637435913, + "learning_rate": 8.74933393521755e-06, + "loss": 0.7686, + "step": 8439 + }, + { + "epoch": 0.46452749188177667, + "grad_norm": 0.6716971397399902, + "learning_rate": 8.749047144449298e-06, + "loss": 0.7823, + "step": 8440 + }, + { + "epoch": 0.4645825306841323, + "grad_norm": 0.7292011380195618, + "learning_rate": 8.748760325504212e-06, + "loss": 0.7643, + "step": 8441 + }, + { + "epoch": 0.464637569486488, + "grad_norm": 0.6823335886001587, + "learning_rate": 8.748473478384444e-06, + "loss": 0.7539, + "step": 8442 + }, + { + "epoch": 0.4646926082888436, + "grad_norm": 0.761730968952179, + "learning_rate": 8.748186603092155e-06, + "loss": 0.7279, + "step": 8443 + }, + { + "epoch": 0.4647476470911993, + "grad_norm": 0.694007933139801, + "learning_rate": 8.747899699629498e-06, + "loss": 0.7907, + "step": 8444 + }, + { + "epoch": 0.46480268589355495, + "grad_norm": 0.7638683319091797, + "learning_rate": 8.74761276799863e-06, + "loss": 0.7278, + "step": 8445 + }, + { + "epoch": 0.46485772469591063, + "grad_norm": 0.6281229853630066, + "learning_rate": 8.747325808201708e-06, + "loss": 0.6609, + "step": 8446 + }, + { + "epoch": 0.46491276349826627, + "grad_norm": 0.7273259162902832, + "learning_rate": 8.747038820240887e-06, + "loss": 0.7553, + "step": 8447 + }, + { + "epoch": 0.46496780230062196, + "grad_norm": 0.807482898235321, + "learning_rate": 8.746751804118326e-06, + "loss": 0.7783, + "step": 8448 + }, + { + "epoch": 0.4650228411029776, + "grad_norm": 0.7088230848312378, + "learning_rate": 8.746464759836182e-06, + "loss": 0.762, + "step": 8449 + }, + { + "epoch": 0.4650778799053333, + "grad_norm": 0.7039850354194641, + "learning_rate": 8.746177687396612e-06, + "loss": 0.7811, + "step": 8450 + }, + { + "epoch": 0.4651329187076889, + "grad_norm": 0.7154161334037781, + "learning_rate": 8.745890586801773e-06, + "loss": 0.76, + "step": 8451 + }, + { + "epoch": 0.4651879575100446, + "grad_norm": 0.6738846302032471, + "learning_rate": 8.745603458053822e-06, + "loss": 0.7119, + "step": 8452 + }, + { + "epoch": 0.46524299631240024, + "grad_norm": 0.6615753173828125, + "learning_rate": 8.745316301154919e-06, + "loss": 0.8061, + "step": 8453 + }, + { + "epoch": 0.4652980351147559, + "grad_norm": 0.7285076379776001, + "learning_rate": 8.74502911610722e-06, + "loss": 0.7522, + "step": 8454 + }, + { + "epoch": 0.46535307391711156, + "grad_norm": 0.7100732922554016, + "learning_rate": 8.744741902912886e-06, + "loss": 0.7665, + "step": 8455 + }, + { + "epoch": 0.46540811271946725, + "grad_norm": 0.6564487814903259, + "learning_rate": 8.744454661574074e-06, + "loss": 0.7352, + "step": 8456 + }, + { + "epoch": 0.4654631515218229, + "grad_norm": 0.689549446105957, + "learning_rate": 8.744167392092944e-06, + "loss": 0.7011, + "step": 8457 + }, + { + "epoch": 0.46551819032417857, + "grad_norm": 0.6660958528518677, + "learning_rate": 8.743880094471651e-06, + "loss": 0.7074, + "step": 8458 + }, + { + "epoch": 0.4655732291265342, + "grad_norm": 0.7470804452896118, + "learning_rate": 8.743592768712361e-06, + "loss": 0.6684, + "step": 8459 + }, + { + "epoch": 0.4656282679288899, + "grad_norm": 0.8058002591133118, + "learning_rate": 8.743305414817227e-06, + "loss": 0.7945, + "step": 8460 + }, + { + "epoch": 0.4656833067312455, + "grad_norm": 0.7756261825561523, + "learning_rate": 8.743018032788413e-06, + "loss": 0.8442, + "step": 8461 + }, + { + "epoch": 0.46573834553360116, + "grad_norm": 0.9267478585243225, + "learning_rate": 8.742730622628077e-06, + "loss": 0.8721, + "step": 8462 + }, + { + "epoch": 0.46579338433595685, + "grad_norm": 0.8684219121932983, + "learning_rate": 8.74244318433838e-06, + "loss": 0.7833, + "step": 8463 + }, + { + "epoch": 0.4658484231383125, + "grad_norm": 0.7060475945472717, + "learning_rate": 8.742155717921481e-06, + "loss": 0.7724, + "step": 8464 + }, + { + "epoch": 0.4659034619406682, + "grad_norm": 0.7316318154335022, + "learning_rate": 8.741868223379543e-06, + "loss": 0.7489, + "step": 8465 + }, + { + "epoch": 0.4659585007430238, + "grad_norm": 0.8131282925605774, + "learning_rate": 8.741580700714724e-06, + "loss": 0.7453, + "step": 8466 + }, + { + "epoch": 0.4660135395453795, + "grad_norm": 0.6985850930213928, + "learning_rate": 8.741293149929187e-06, + "loss": 0.7083, + "step": 8467 + }, + { + "epoch": 0.46606857834773513, + "grad_norm": 0.7512301206588745, + "learning_rate": 8.74100557102509e-06, + "loss": 0.7343, + "step": 8468 + }, + { + "epoch": 0.4661236171500908, + "grad_norm": 0.7547290921211243, + "learning_rate": 8.740717964004596e-06, + "loss": 0.8358, + "step": 8469 + }, + { + "epoch": 0.46617865595244645, + "grad_norm": 0.9091271758079529, + "learning_rate": 8.740430328869868e-06, + "loss": 0.762, + "step": 8470 + }, + { + "epoch": 0.46623369475480214, + "grad_norm": 0.6960130333900452, + "learning_rate": 8.740142665623069e-06, + "loss": 0.7317, + "step": 8471 + }, + { + "epoch": 0.4662887335571578, + "grad_norm": 0.684309184551239, + "learning_rate": 8.739854974266357e-06, + "loss": 0.7653, + "step": 8472 + }, + { + "epoch": 0.46634377235951346, + "grad_norm": 0.7669411301612854, + "learning_rate": 8.739567254801898e-06, + "loss": 0.7152, + "step": 8473 + }, + { + "epoch": 0.4663988111618691, + "grad_norm": 0.7072784900665283, + "learning_rate": 8.73927950723185e-06, + "loss": 0.7508, + "step": 8474 + }, + { + "epoch": 0.4664538499642248, + "grad_norm": 0.7249277234077454, + "learning_rate": 8.73899173155838e-06, + "loss": 0.7469, + "step": 8475 + }, + { + "epoch": 0.4665088887665804, + "grad_norm": 0.7664750218391418, + "learning_rate": 8.738703927783647e-06, + "loss": 0.8692, + "step": 8476 + }, + { + "epoch": 0.4665639275689361, + "grad_norm": 0.7579765319824219, + "learning_rate": 8.738416095909818e-06, + "loss": 0.8283, + "step": 8477 + }, + { + "epoch": 0.46661896637129174, + "grad_norm": 0.7066456079483032, + "learning_rate": 8.738128235939054e-06, + "loss": 0.7125, + "step": 8478 + }, + { + "epoch": 0.46667400517364743, + "grad_norm": 0.766106367111206, + "learning_rate": 8.737840347873518e-06, + "loss": 0.7683, + "step": 8479 + }, + { + "epoch": 0.46672904397600307, + "grad_norm": 0.7599226236343384, + "learning_rate": 8.737552431715374e-06, + "loss": 0.8375, + "step": 8480 + }, + { + "epoch": 0.46678408277835876, + "grad_norm": 0.6955341100692749, + "learning_rate": 8.737264487466789e-06, + "loss": 0.7012, + "step": 8481 + }, + { + "epoch": 0.4668391215807144, + "grad_norm": 0.6096246242523193, + "learning_rate": 8.736976515129923e-06, + "loss": 0.6126, + "step": 8482 + }, + { + "epoch": 0.4668941603830701, + "grad_norm": 0.7469536066055298, + "learning_rate": 8.73668851470694e-06, + "loss": 0.7675, + "step": 8483 + }, + { + "epoch": 0.4669491991854257, + "grad_norm": 0.8018775582313538, + "learning_rate": 8.73640048620001e-06, + "loss": 0.7372, + "step": 8484 + }, + { + "epoch": 0.4670042379877814, + "grad_norm": 0.7446827292442322, + "learning_rate": 8.736112429611293e-06, + "loss": 0.7277, + "step": 8485 + }, + { + "epoch": 0.46705927679013703, + "grad_norm": 0.6292026042938232, + "learning_rate": 8.735824344942954e-06, + "loss": 0.6172, + "step": 8486 + }, + { + "epoch": 0.4671143155924927, + "grad_norm": 0.7207980751991272, + "learning_rate": 8.735536232197159e-06, + "loss": 0.8363, + "step": 8487 + }, + { + "epoch": 0.46716935439484836, + "grad_norm": 0.8585891127586365, + "learning_rate": 8.735248091376073e-06, + "loss": 0.8006, + "step": 8488 + }, + { + "epoch": 0.46722439319720405, + "grad_norm": 0.8149702548980713, + "learning_rate": 8.734959922481863e-06, + "loss": 0.7869, + "step": 8489 + }, + { + "epoch": 0.4672794319995597, + "grad_norm": 0.7113268971443176, + "learning_rate": 8.734671725516695e-06, + "loss": 0.7774, + "step": 8490 + }, + { + "epoch": 0.46733447080191537, + "grad_norm": 0.6940683722496033, + "learning_rate": 8.734383500482733e-06, + "loss": 0.7157, + "step": 8491 + }, + { + "epoch": 0.467389509604271, + "grad_norm": 0.7823536396026611, + "learning_rate": 8.734095247382145e-06, + "loss": 0.8161, + "step": 8492 + }, + { + "epoch": 0.4674445484066267, + "grad_norm": 0.7094922065734863, + "learning_rate": 8.733806966217096e-06, + "loss": 0.7593, + "step": 8493 + }, + { + "epoch": 0.4674995872089823, + "grad_norm": 0.656432569026947, + "learning_rate": 8.733518656989753e-06, + "loss": 0.7853, + "step": 8494 + }, + { + "epoch": 0.467554626011338, + "grad_norm": 0.6715715527534485, + "learning_rate": 8.733230319702284e-06, + "loss": 0.839, + "step": 8495 + }, + { + "epoch": 0.46760966481369365, + "grad_norm": 0.7496705055236816, + "learning_rate": 8.732941954356854e-06, + "loss": 0.8231, + "step": 8496 + }, + { + "epoch": 0.46766470361604934, + "grad_norm": 0.7728047370910645, + "learning_rate": 8.732653560955635e-06, + "loss": 0.7852, + "step": 8497 + }, + { + "epoch": 0.46771974241840497, + "grad_norm": 1.5637458562850952, + "learning_rate": 8.732365139500787e-06, + "loss": 0.7749, + "step": 8498 + }, + { + "epoch": 0.46777478122076066, + "grad_norm": 0.6603190898895264, + "learning_rate": 8.732076689994484e-06, + "loss": 0.6628, + "step": 8499 + }, + { + "epoch": 0.4678298200231163, + "grad_norm": 0.7170974612236023, + "learning_rate": 8.73178821243889e-06, + "loss": 0.7855, + "step": 8500 + }, + { + "epoch": 0.467884858825472, + "grad_norm": 0.7220103740692139, + "learning_rate": 8.731499706836175e-06, + "loss": 0.7035, + "step": 8501 + }, + { + "epoch": 0.4679398976278276, + "grad_norm": 0.6940942406654358, + "learning_rate": 8.731211173188507e-06, + "loss": 0.7857, + "step": 8502 + }, + { + "epoch": 0.4679949364301833, + "grad_norm": 2.441596508026123, + "learning_rate": 8.730922611498057e-06, + "loss": 0.695, + "step": 8503 + }, + { + "epoch": 0.46804997523253894, + "grad_norm": 0.7654910087585449, + "learning_rate": 8.730634021766989e-06, + "loss": 0.788, + "step": 8504 + }, + { + "epoch": 0.4681050140348946, + "grad_norm": 0.791824996471405, + "learning_rate": 8.730345403997475e-06, + "loss": 0.7899, + "step": 8505 + }, + { + "epoch": 0.46816005283725026, + "grad_norm": 0.6863934993743896, + "learning_rate": 8.730056758191682e-06, + "loss": 0.7402, + "step": 8506 + }, + { + "epoch": 0.4682150916396059, + "grad_norm": 0.7920359373092651, + "learning_rate": 8.729768084351783e-06, + "loss": 0.7835, + "step": 8507 + }, + { + "epoch": 0.4682701304419616, + "grad_norm": 0.7077129483222961, + "learning_rate": 8.729479382479944e-06, + "loss": 0.7761, + "step": 8508 + }, + { + "epoch": 0.4683251692443172, + "grad_norm": 0.6870049238204956, + "learning_rate": 8.729190652578337e-06, + "loss": 0.8169, + "step": 8509 + }, + { + "epoch": 0.4683802080466729, + "grad_norm": 0.6802713871002197, + "learning_rate": 8.728901894649131e-06, + "loss": 0.7914, + "step": 8510 + }, + { + "epoch": 0.46843524684902854, + "grad_norm": 0.6645112633705139, + "learning_rate": 8.728613108694497e-06, + "loss": 0.7543, + "step": 8511 + }, + { + "epoch": 0.46849028565138423, + "grad_norm": 0.708292543888092, + "learning_rate": 8.728324294716604e-06, + "loss": 0.7015, + "step": 8512 + }, + { + "epoch": 0.46854532445373986, + "grad_norm": 0.7444465160369873, + "learning_rate": 8.728035452717625e-06, + "loss": 0.7999, + "step": 8513 + }, + { + "epoch": 0.46860036325609555, + "grad_norm": 0.7028616666793823, + "learning_rate": 8.727746582699728e-06, + "loss": 0.8094, + "step": 8514 + }, + { + "epoch": 0.4686554020584512, + "grad_norm": 0.7063208222389221, + "learning_rate": 8.727457684665088e-06, + "loss": 0.8028, + "step": 8515 + }, + { + "epoch": 0.4687104408608069, + "grad_norm": 0.8455138802528381, + "learning_rate": 8.727168758615871e-06, + "loss": 0.7691, + "step": 8516 + }, + { + "epoch": 0.4687654796631625, + "grad_norm": 1.0325778722763062, + "learning_rate": 8.726879804554252e-06, + "loss": 0.7042, + "step": 8517 + }, + { + "epoch": 0.4688205184655182, + "grad_norm": 0.7352754473686218, + "learning_rate": 8.726590822482402e-06, + "loss": 0.8467, + "step": 8518 + }, + { + "epoch": 0.46887555726787383, + "grad_norm": 0.7247193455696106, + "learning_rate": 8.726301812402494e-06, + "loss": 0.8034, + "step": 8519 + }, + { + "epoch": 0.4689305960702295, + "grad_norm": 0.6876820921897888, + "learning_rate": 8.726012774316699e-06, + "loss": 0.7308, + "step": 8520 + }, + { + "epoch": 0.46898563487258516, + "grad_norm": 0.6987231969833374, + "learning_rate": 8.725723708227188e-06, + "loss": 0.7655, + "step": 8521 + }, + { + "epoch": 0.46904067367494084, + "grad_norm": 0.7471843361854553, + "learning_rate": 8.725434614136135e-06, + "loss": 0.7271, + "step": 8522 + }, + { + "epoch": 0.4690957124772965, + "grad_norm": 0.7564642429351807, + "learning_rate": 8.725145492045715e-06, + "loss": 0.7335, + "step": 8523 + }, + { + "epoch": 0.46915075127965217, + "grad_norm": 0.7488992214202881, + "learning_rate": 8.724856341958095e-06, + "loss": 0.8815, + "step": 8524 + }, + { + "epoch": 0.4692057900820078, + "grad_norm": 0.6776759028434753, + "learning_rate": 8.724567163875455e-06, + "loss": 0.7452, + "step": 8525 + }, + { + "epoch": 0.4692608288843635, + "grad_norm": 0.6905981302261353, + "learning_rate": 8.724277957799963e-06, + "loss": 0.6815, + "step": 8526 + }, + { + "epoch": 0.4693158676867191, + "grad_norm": 0.7392297983169556, + "learning_rate": 8.723988723733795e-06, + "loss": 0.7546, + "step": 8527 + }, + { + "epoch": 0.4693709064890748, + "grad_norm": 0.7479110360145569, + "learning_rate": 8.723699461679128e-06, + "loss": 0.7455, + "step": 8528 + }, + { + "epoch": 0.46942594529143045, + "grad_norm": 0.7231360673904419, + "learning_rate": 8.723410171638129e-06, + "loss": 0.7611, + "step": 8529 + }, + { + "epoch": 0.46948098409378614, + "grad_norm": 0.7493714690208435, + "learning_rate": 8.723120853612976e-06, + "loss": 0.6997, + "step": 8530 + }, + { + "epoch": 0.46953602289614177, + "grad_norm": 0.8056793808937073, + "learning_rate": 8.722831507605844e-06, + "loss": 0.7431, + "step": 8531 + }, + { + "epoch": 0.46959106169849746, + "grad_norm": 0.7528547048568726, + "learning_rate": 8.722542133618907e-06, + "loss": 0.8798, + "step": 8532 + }, + { + "epoch": 0.4696461005008531, + "grad_norm": 0.6964863538742065, + "learning_rate": 8.72225273165434e-06, + "loss": 0.8462, + "step": 8533 + }, + { + "epoch": 0.4697011393032088, + "grad_norm": 0.7354302406311035, + "learning_rate": 8.721963301714318e-06, + "loss": 0.7882, + "step": 8534 + }, + { + "epoch": 0.4697561781055644, + "grad_norm": 0.7365205883979797, + "learning_rate": 8.721673843801014e-06, + "loss": 0.7483, + "step": 8535 + }, + { + "epoch": 0.4698112169079201, + "grad_norm": 0.7485378384590149, + "learning_rate": 8.72138435791661e-06, + "loss": 0.8539, + "step": 8536 + }, + { + "epoch": 0.46986625571027574, + "grad_norm": 0.7674353718757629, + "learning_rate": 8.721094844063274e-06, + "loss": 0.834, + "step": 8537 + }, + { + "epoch": 0.4699212945126314, + "grad_norm": 0.7054184079170227, + "learning_rate": 8.720805302243185e-06, + "loss": 0.7938, + "step": 8538 + }, + { + "epoch": 0.46997633331498706, + "grad_norm": 0.7414574027061462, + "learning_rate": 8.72051573245852e-06, + "loss": 0.7932, + "step": 8539 + }, + { + "epoch": 0.47003137211734275, + "grad_norm": 0.6734428405761719, + "learning_rate": 8.720226134711455e-06, + "loss": 0.8775, + "step": 8540 + }, + { + "epoch": 0.4700864109196984, + "grad_norm": 0.6588559150695801, + "learning_rate": 8.719936509004166e-06, + "loss": 0.6985, + "step": 8541 + }, + { + "epoch": 0.4701414497220541, + "grad_norm": 0.6557223200798035, + "learning_rate": 8.71964685533883e-06, + "loss": 0.7243, + "step": 8542 + }, + { + "epoch": 0.4701964885244097, + "grad_norm": 0.7876269221305847, + "learning_rate": 8.719357173717624e-06, + "loss": 0.8075, + "step": 8543 + }, + { + "epoch": 0.4702515273267654, + "grad_norm": 0.8346554040908813, + "learning_rate": 8.719067464142726e-06, + "loss": 0.8427, + "step": 8544 + }, + { + "epoch": 0.47030656612912103, + "grad_norm": 0.7190483808517456, + "learning_rate": 8.718777726616311e-06, + "loss": 0.7689, + "step": 8545 + }, + { + "epoch": 0.4703616049314767, + "grad_norm": 1.303118109703064, + "learning_rate": 8.718487961140558e-06, + "loss": 0.7537, + "step": 8546 + }, + { + "epoch": 0.47041664373383235, + "grad_norm": 0.7733024954795837, + "learning_rate": 8.718198167717647e-06, + "loss": 0.747, + "step": 8547 + }, + { + "epoch": 0.470471682536188, + "grad_norm": 0.6692484617233276, + "learning_rate": 8.717908346349751e-06, + "loss": 0.725, + "step": 8548 + }, + { + "epoch": 0.4705267213385437, + "grad_norm": 0.9639461636543274, + "learning_rate": 8.717618497039054e-06, + "loss": 0.8642, + "step": 8549 + }, + { + "epoch": 0.4705817601408993, + "grad_norm": 0.7584646344184875, + "learning_rate": 8.717328619787728e-06, + "loss": 0.8174, + "step": 8550 + }, + { + "epoch": 0.470636798943255, + "grad_norm": 0.7051709890365601, + "learning_rate": 8.717038714597957e-06, + "loss": 0.7962, + "step": 8551 + }, + { + "epoch": 0.47069183774561063, + "grad_norm": 0.738913893699646, + "learning_rate": 8.716748781471918e-06, + "loss": 0.7367, + "step": 8552 + }, + { + "epoch": 0.4707468765479663, + "grad_norm": 0.7027214169502258, + "learning_rate": 8.716458820411791e-06, + "loss": 0.7613, + "step": 8553 + }, + { + "epoch": 0.47080191535032195, + "grad_norm": 0.6701993346214294, + "learning_rate": 8.716168831419754e-06, + "loss": 0.638, + "step": 8554 + }, + { + "epoch": 0.47085695415267764, + "grad_norm": 0.7422072887420654, + "learning_rate": 8.715878814497984e-06, + "loss": 0.8338, + "step": 8555 + }, + { + "epoch": 0.4709119929550333, + "grad_norm": 0.985992968082428, + "learning_rate": 8.715588769648667e-06, + "loss": 0.7765, + "step": 8556 + }, + { + "epoch": 0.47096703175738897, + "grad_norm": 0.6937553882598877, + "learning_rate": 8.715298696873978e-06, + "loss": 0.7306, + "step": 8557 + }, + { + "epoch": 0.4710220705597446, + "grad_norm": 1.1683214902877808, + "learning_rate": 8.715008596176099e-06, + "loss": 0.7782, + "step": 8558 + }, + { + "epoch": 0.4710771093621003, + "grad_norm": 0.7493681907653809, + "learning_rate": 8.714718467557209e-06, + "loss": 0.9166, + "step": 8559 + }, + { + "epoch": 0.4711321481644559, + "grad_norm": 0.7562084794044495, + "learning_rate": 8.71442831101949e-06, + "loss": 0.7999, + "step": 8560 + }, + { + "epoch": 0.4711871869668116, + "grad_norm": 0.7950266003608704, + "learning_rate": 8.71413812656512e-06, + "loss": 0.8094, + "step": 8561 + }, + { + "epoch": 0.47124222576916724, + "grad_norm": 1.1411044597625732, + "learning_rate": 8.713847914196287e-06, + "loss": 0.7631, + "step": 8562 + }, + { + "epoch": 0.47129726457152293, + "grad_norm": 0.7270122170448303, + "learning_rate": 8.713557673915162e-06, + "loss": 0.7529, + "step": 8563 + }, + { + "epoch": 0.47135230337387857, + "grad_norm": 0.8138573169708252, + "learning_rate": 8.713267405723935e-06, + "loss": 0.8215, + "step": 8564 + }, + { + "epoch": 0.47140734217623426, + "grad_norm": 0.732982873916626, + "learning_rate": 8.712977109624783e-06, + "loss": 0.7099, + "step": 8565 + }, + { + "epoch": 0.4714623809785899, + "grad_norm": 0.7307591438293457, + "learning_rate": 8.712686785619888e-06, + "loss": 0.7035, + "step": 8566 + }, + { + "epoch": 0.4715174197809456, + "grad_norm": 0.8684857487678528, + "learning_rate": 8.712396433711434e-06, + "loss": 0.8605, + "step": 8567 + }, + { + "epoch": 0.4715724585833012, + "grad_norm": 0.7490718364715576, + "learning_rate": 8.712106053901603e-06, + "loss": 0.7439, + "step": 8568 + }, + { + "epoch": 0.4716274973856569, + "grad_norm": 0.8572973012924194, + "learning_rate": 8.711815646192575e-06, + "loss": 0.8187, + "step": 8569 + }, + { + "epoch": 0.47168253618801254, + "grad_norm": 0.785270094871521, + "learning_rate": 8.711525210586536e-06, + "loss": 0.7812, + "step": 8570 + }, + { + "epoch": 0.4717375749903682, + "grad_norm": 0.683651864528656, + "learning_rate": 8.711234747085663e-06, + "loss": 0.7682, + "step": 8571 + }, + { + "epoch": 0.47179261379272386, + "grad_norm": 0.7990714907646179, + "learning_rate": 8.710944255692147e-06, + "loss": 0.8114, + "step": 8572 + }, + { + "epoch": 0.47184765259507955, + "grad_norm": 0.9354856610298157, + "learning_rate": 8.710653736408165e-06, + "loss": 0.7353, + "step": 8573 + }, + { + "epoch": 0.4719026913974352, + "grad_norm": 0.8309356570243835, + "learning_rate": 8.710363189235904e-06, + "loss": 0.8635, + "step": 8574 + }, + { + "epoch": 0.47195773019979087, + "grad_norm": 0.7018463015556335, + "learning_rate": 8.710072614177547e-06, + "loss": 0.6372, + "step": 8575 + }, + { + "epoch": 0.4720127690021465, + "grad_norm": 0.7626469135284424, + "learning_rate": 8.709782011235277e-06, + "loss": 0.7684, + "step": 8576 + }, + { + "epoch": 0.4720678078045022, + "grad_norm": 0.6995826959609985, + "learning_rate": 8.70949138041128e-06, + "loss": 0.7301, + "step": 8577 + }, + { + "epoch": 0.4721228466068578, + "grad_norm": 0.719307541847229, + "learning_rate": 8.709200721707736e-06, + "loss": 0.7437, + "step": 8578 + }, + { + "epoch": 0.4721778854092135, + "grad_norm": 0.7355539202690125, + "learning_rate": 8.708910035126832e-06, + "loss": 0.7926, + "step": 8579 + }, + { + "epoch": 0.47223292421156915, + "grad_norm": 0.7262680530548096, + "learning_rate": 8.708619320670755e-06, + "loss": 0.7641, + "step": 8580 + }, + { + "epoch": 0.47228796301392484, + "grad_norm": 0.844745934009552, + "learning_rate": 8.708328578341687e-06, + "loss": 0.7228, + "step": 8581 + }, + { + "epoch": 0.47234300181628047, + "grad_norm": 0.8169287443161011, + "learning_rate": 8.708037808141814e-06, + "loss": 0.7076, + "step": 8582 + }, + { + "epoch": 0.47239804061863616, + "grad_norm": 0.7342209219932556, + "learning_rate": 8.707747010073322e-06, + "loss": 0.7997, + "step": 8583 + }, + { + "epoch": 0.4724530794209918, + "grad_norm": 0.7138200402259827, + "learning_rate": 8.707456184138394e-06, + "loss": 0.7796, + "step": 8584 + }, + { + "epoch": 0.4725081182233475, + "grad_norm": 0.7168061137199402, + "learning_rate": 8.70716533033922e-06, + "loss": 0.6876, + "step": 8585 + }, + { + "epoch": 0.4725631570257031, + "grad_norm": 0.7256397604942322, + "learning_rate": 8.706874448677982e-06, + "loss": 0.8296, + "step": 8586 + }, + { + "epoch": 0.4726181958280588, + "grad_norm": 0.8232730627059937, + "learning_rate": 8.70658353915687e-06, + "loss": 0.8001, + "step": 8587 + }, + { + "epoch": 0.47267323463041444, + "grad_norm": 0.7110162973403931, + "learning_rate": 8.706292601778067e-06, + "loss": 0.7061, + "step": 8588 + }, + { + "epoch": 0.47272827343277013, + "grad_norm": 0.9466721415519714, + "learning_rate": 8.706001636543761e-06, + "loss": 0.8713, + "step": 8589 + }, + { + "epoch": 0.47278331223512576, + "grad_norm": 0.7017776370048523, + "learning_rate": 8.705710643456138e-06, + "loss": 0.759, + "step": 8590 + }, + { + "epoch": 0.4728383510374814, + "grad_norm": 0.7140772938728333, + "learning_rate": 8.705419622517386e-06, + "loss": 0.6962, + "step": 8591 + }, + { + "epoch": 0.4728933898398371, + "grad_norm": 1.1076452732086182, + "learning_rate": 8.705128573729694e-06, + "loss": 0.8264, + "step": 8592 + }, + { + "epoch": 0.4729484286421927, + "grad_norm": 0.7308200597763062, + "learning_rate": 8.704837497095247e-06, + "loss": 0.6243, + "step": 8593 + }, + { + "epoch": 0.4730034674445484, + "grad_norm": 0.9445781111717224, + "learning_rate": 8.704546392616231e-06, + "loss": 0.6676, + "step": 8594 + }, + { + "epoch": 0.47305850624690404, + "grad_norm": 0.6527873277664185, + "learning_rate": 8.704255260294837e-06, + "loss": 0.6979, + "step": 8595 + }, + { + "epoch": 0.47311354504925973, + "grad_norm": 0.6732963919639587, + "learning_rate": 8.703964100133252e-06, + "loss": 0.7724, + "step": 8596 + }, + { + "epoch": 0.47316858385161537, + "grad_norm": 0.7661726474761963, + "learning_rate": 8.703672912133665e-06, + "loss": 0.7988, + "step": 8597 + }, + { + "epoch": 0.47322362265397105, + "grad_norm": 0.7006877660751343, + "learning_rate": 8.703381696298262e-06, + "loss": 0.6765, + "step": 8598 + }, + { + "epoch": 0.4732786614563267, + "grad_norm": 0.7195086479187012, + "learning_rate": 8.703090452629236e-06, + "loss": 0.6676, + "step": 8599 + }, + { + "epoch": 0.4733337002586824, + "grad_norm": 0.6692042350769043, + "learning_rate": 8.702799181128771e-06, + "loss": 0.7882, + "step": 8600 + }, + { + "epoch": 0.473388739061038, + "grad_norm": 0.7736524343490601, + "learning_rate": 8.70250788179906e-06, + "loss": 0.7977, + "step": 8601 + }, + { + "epoch": 0.4734437778633937, + "grad_norm": 0.8821607828140259, + "learning_rate": 8.70221655464229e-06, + "loss": 0.7465, + "step": 8602 + }, + { + "epoch": 0.47349881666574933, + "grad_norm": 0.7565156817436218, + "learning_rate": 8.701925199660652e-06, + "loss": 0.831, + "step": 8603 + }, + { + "epoch": 0.473553855468105, + "grad_norm": 0.8542304039001465, + "learning_rate": 8.701633816856335e-06, + "loss": 0.7538, + "step": 8604 + }, + { + "epoch": 0.47360889427046066, + "grad_norm": 0.6891050338745117, + "learning_rate": 8.701342406231529e-06, + "loss": 0.7687, + "step": 8605 + }, + { + "epoch": 0.47366393307281635, + "grad_norm": 0.8570719361305237, + "learning_rate": 8.701050967788424e-06, + "loss": 0.7236, + "step": 8606 + }, + { + "epoch": 0.473718971875172, + "grad_norm": 0.7921456098556519, + "learning_rate": 8.700759501529212e-06, + "loss": 0.8214, + "step": 8607 + }, + { + "epoch": 0.47377401067752767, + "grad_norm": 0.7584527730941772, + "learning_rate": 8.70046800745608e-06, + "loss": 0.8204, + "step": 8608 + }, + { + "epoch": 0.4738290494798833, + "grad_norm": 0.8033978343009949, + "learning_rate": 8.700176485571222e-06, + "loss": 0.8278, + "step": 8609 + }, + { + "epoch": 0.473884088282239, + "grad_norm": 0.9950750470161438, + "learning_rate": 8.699884935876828e-06, + "loss": 0.8181, + "step": 8610 + }, + { + "epoch": 0.4739391270845946, + "grad_norm": 0.7213684916496277, + "learning_rate": 8.69959335837509e-06, + "loss": 0.7099, + "step": 8611 + }, + { + "epoch": 0.4739941658869503, + "grad_norm": 0.7847200632095337, + "learning_rate": 8.699301753068199e-06, + "loss": 0.8272, + "step": 8612 + }, + { + "epoch": 0.47404920468930595, + "grad_norm": 0.7075058221817017, + "learning_rate": 8.699010119958344e-06, + "loss": 0.7127, + "step": 8613 + }, + { + "epoch": 0.47410424349166164, + "grad_norm": 0.682741641998291, + "learning_rate": 8.69871845904772e-06, + "loss": 0.8446, + "step": 8614 + }, + { + "epoch": 0.47415928229401727, + "grad_norm": 0.7120605111122131, + "learning_rate": 8.69842677033852e-06, + "loss": 0.7776, + "step": 8615 + }, + { + "epoch": 0.47421432109637296, + "grad_norm": 0.822405219078064, + "learning_rate": 8.698135053832933e-06, + "loss": 0.8018, + "step": 8616 + }, + { + "epoch": 0.4742693598987286, + "grad_norm": 0.6815186738967896, + "learning_rate": 8.697843309533152e-06, + "loss": 0.7413, + "step": 8617 + }, + { + "epoch": 0.4743243987010843, + "grad_norm": 0.7587849497795105, + "learning_rate": 8.69755153744137e-06, + "loss": 0.7809, + "step": 8618 + }, + { + "epoch": 0.4743794375034399, + "grad_norm": 0.7092488408088684, + "learning_rate": 8.697259737559782e-06, + "loss": 0.7921, + "step": 8619 + }, + { + "epoch": 0.4744344763057956, + "grad_norm": 0.7396836280822754, + "learning_rate": 8.69696790989058e-06, + "loss": 0.7946, + "step": 8620 + }, + { + "epoch": 0.47448951510815124, + "grad_norm": 0.6760729551315308, + "learning_rate": 8.696676054435955e-06, + "loss": 0.7389, + "step": 8621 + }, + { + "epoch": 0.4745445539105069, + "grad_norm": 1.1640692949295044, + "learning_rate": 8.696384171198105e-06, + "loss": 0.8291, + "step": 8622 + }, + { + "epoch": 0.47459959271286256, + "grad_norm": 0.7415158152580261, + "learning_rate": 8.696092260179219e-06, + "loss": 0.7534, + "step": 8623 + }, + { + "epoch": 0.47465463151521825, + "grad_norm": 0.7730052471160889, + "learning_rate": 8.695800321381492e-06, + "loss": 0.8447, + "step": 8624 + }, + { + "epoch": 0.4747096703175739, + "grad_norm": 0.811522364616394, + "learning_rate": 8.695508354807121e-06, + "loss": 0.7466, + "step": 8625 + }, + { + "epoch": 0.4747647091199296, + "grad_norm": 0.7908332347869873, + "learning_rate": 8.695216360458298e-06, + "loss": 0.7769, + "step": 8626 + }, + { + "epoch": 0.4748197479222852, + "grad_norm": 0.744971752166748, + "learning_rate": 8.694924338337217e-06, + "loss": 0.7651, + "step": 8627 + }, + { + "epoch": 0.4748747867246409, + "grad_norm": 0.705565869808197, + "learning_rate": 8.694632288446075e-06, + "loss": 0.8258, + "step": 8628 + }, + { + "epoch": 0.47492982552699653, + "grad_norm": 0.8199328780174255, + "learning_rate": 8.694340210787065e-06, + "loss": 0.733, + "step": 8629 + }, + { + "epoch": 0.4749848643293522, + "grad_norm": 0.6965511441230774, + "learning_rate": 8.694048105362382e-06, + "loss": 0.7548, + "step": 8630 + }, + { + "epoch": 0.47503990313170785, + "grad_norm": 0.7943055629730225, + "learning_rate": 8.693755972174225e-06, + "loss": 0.7518, + "step": 8631 + }, + { + "epoch": 0.47509494193406354, + "grad_norm": 0.6277437806129456, + "learning_rate": 8.693463811224785e-06, + "loss": 0.6941, + "step": 8632 + }, + { + "epoch": 0.4751499807364192, + "grad_norm": 1.0745574235916138, + "learning_rate": 8.693171622516259e-06, + "loss": 0.8056, + "step": 8633 + }, + { + "epoch": 0.4752050195387748, + "grad_norm": 0.7005153894424438, + "learning_rate": 8.692879406050844e-06, + "loss": 0.757, + "step": 8634 + }, + { + "epoch": 0.4752600583411305, + "grad_norm": 0.6971127986907959, + "learning_rate": 8.692587161830737e-06, + "loss": 0.7509, + "step": 8635 + }, + { + "epoch": 0.47531509714348613, + "grad_norm": 0.7583497762680054, + "learning_rate": 8.692294889858133e-06, + "loss": 0.7895, + "step": 8636 + }, + { + "epoch": 0.4753701359458418, + "grad_norm": 0.719932496547699, + "learning_rate": 8.692002590135228e-06, + "loss": 0.762, + "step": 8637 + }, + { + "epoch": 0.47542517474819745, + "grad_norm": 0.7041804790496826, + "learning_rate": 8.691710262664222e-06, + "loss": 0.7101, + "step": 8638 + }, + { + "epoch": 0.47548021355055314, + "grad_norm": 0.7395016551017761, + "learning_rate": 8.691417907447309e-06, + "loss": 0.723, + "step": 8639 + }, + { + "epoch": 0.4755352523529088, + "grad_norm": 0.6605637073516846, + "learning_rate": 8.691125524486686e-06, + "loss": 0.644, + "step": 8640 + }, + { + "epoch": 0.47559029115526447, + "grad_norm": 0.694732129573822, + "learning_rate": 8.690833113784552e-06, + "loss": 0.7162, + "step": 8641 + }, + { + "epoch": 0.4756453299576201, + "grad_norm": 0.7622451186180115, + "learning_rate": 8.690540675343105e-06, + "loss": 0.6995, + "step": 8642 + }, + { + "epoch": 0.4757003687599758, + "grad_norm": 0.6961628794670105, + "learning_rate": 8.69024820916454e-06, + "loss": 0.7955, + "step": 8643 + }, + { + "epoch": 0.4757554075623314, + "grad_norm": 0.706266462802887, + "learning_rate": 8.68995571525106e-06, + "loss": 0.7237, + "step": 8644 + }, + { + "epoch": 0.4758104463646871, + "grad_norm": 0.7727495431900024, + "learning_rate": 8.689663193604858e-06, + "loss": 0.7215, + "step": 8645 + }, + { + "epoch": 0.47586548516704275, + "grad_norm": 0.7320648431777954, + "learning_rate": 8.689370644228136e-06, + "loss": 0.7592, + "step": 8646 + }, + { + "epoch": 0.47592052396939843, + "grad_norm": 0.8149487376213074, + "learning_rate": 8.689078067123093e-06, + "loss": 0.7666, + "step": 8647 + }, + { + "epoch": 0.47597556277175407, + "grad_norm": 0.6584552526473999, + "learning_rate": 8.688785462291927e-06, + "loss": 0.7497, + "step": 8648 + }, + { + "epoch": 0.47603060157410976, + "grad_norm": 0.7197825312614441, + "learning_rate": 8.688492829736836e-06, + "loss": 0.7559, + "step": 8649 + }, + { + "epoch": 0.4760856403764654, + "grad_norm": 0.8116913437843323, + "learning_rate": 8.68820016946002e-06, + "loss": 0.7029, + "step": 8650 + }, + { + "epoch": 0.4761406791788211, + "grad_norm": 0.6733378171920776, + "learning_rate": 8.68790748146368e-06, + "loss": 0.7242, + "step": 8651 + }, + { + "epoch": 0.4761957179811767, + "grad_norm": 0.690464437007904, + "learning_rate": 8.687614765750012e-06, + "loss": 0.6668, + "step": 8652 + }, + { + "epoch": 0.4762507567835324, + "grad_norm": 0.7901185154914856, + "learning_rate": 8.687322022321221e-06, + "loss": 0.7436, + "step": 8653 + }, + { + "epoch": 0.47630579558588804, + "grad_norm": 0.7608267068862915, + "learning_rate": 8.687029251179504e-06, + "loss": 0.8292, + "step": 8654 + }, + { + "epoch": 0.4763608343882437, + "grad_norm": 0.6851119995117188, + "learning_rate": 8.686736452327062e-06, + "loss": 0.7974, + "step": 8655 + }, + { + "epoch": 0.47641587319059936, + "grad_norm": 0.6946395635604858, + "learning_rate": 8.686443625766094e-06, + "loss": 0.6745, + "step": 8656 + }, + { + "epoch": 0.47647091199295505, + "grad_norm": 0.7403521537780762, + "learning_rate": 8.686150771498804e-06, + "loss": 0.7759, + "step": 8657 + }, + { + "epoch": 0.4765259507953107, + "grad_norm": 0.8415689468383789, + "learning_rate": 8.685857889527393e-06, + "loss": 0.7911, + "step": 8658 + }, + { + "epoch": 0.47658098959766637, + "grad_norm": 0.6947778463363647, + "learning_rate": 8.68556497985406e-06, + "loss": 0.8026, + "step": 8659 + }, + { + "epoch": 0.476636028400022, + "grad_norm": 0.6807059645652771, + "learning_rate": 8.685272042481006e-06, + "loss": 0.7194, + "step": 8660 + }, + { + "epoch": 0.4766910672023777, + "grad_norm": 0.8948639631271362, + "learning_rate": 8.684979077410434e-06, + "loss": 0.8017, + "step": 8661 + }, + { + "epoch": 0.4767461060047333, + "grad_norm": 0.6697849035263062, + "learning_rate": 8.684686084644546e-06, + "loss": 0.7653, + "step": 8662 + }, + { + "epoch": 0.476801144807089, + "grad_norm": 0.7303311228752136, + "learning_rate": 8.684393064185543e-06, + "loss": 0.8287, + "step": 8663 + }, + { + "epoch": 0.47685618360944465, + "grad_norm": 0.6545100808143616, + "learning_rate": 8.68410001603563e-06, + "loss": 0.7438, + "step": 8664 + }, + { + "epoch": 0.47691122241180034, + "grad_norm": 0.8757766485214233, + "learning_rate": 8.683806940197006e-06, + "loss": 0.8343, + "step": 8665 + }, + { + "epoch": 0.476966261214156, + "grad_norm": 0.6414330005645752, + "learning_rate": 8.683513836671876e-06, + "loss": 0.7201, + "step": 8666 + }, + { + "epoch": 0.47702130001651166, + "grad_norm": 0.6736441850662231, + "learning_rate": 8.68322070546244e-06, + "loss": 0.7365, + "step": 8667 + }, + { + "epoch": 0.4770763388188673, + "grad_norm": 0.780491054058075, + "learning_rate": 8.682927546570905e-06, + "loss": 0.924, + "step": 8668 + }, + { + "epoch": 0.477131377621223, + "grad_norm": 0.6913807988166809, + "learning_rate": 8.68263435999947e-06, + "loss": 0.8269, + "step": 8669 + }, + { + "epoch": 0.4771864164235786, + "grad_norm": 0.7264360189437866, + "learning_rate": 8.682341145750344e-06, + "loss": 0.788, + "step": 8670 + }, + { + "epoch": 0.4772414552259343, + "grad_norm": 0.7777243852615356, + "learning_rate": 8.682047903825725e-06, + "loss": 0.8691, + "step": 8671 + }, + { + "epoch": 0.47729649402828994, + "grad_norm": 0.7590457797050476, + "learning_rate": 8.681754634227821e-06, + "loss": 0.8249, + "step": 8672 + }, + { + "epoch": 0.47735153283064563, + "grad_norm": 0.7672324776649475, + "learning_rate": 8.681461336958836e-06, + "loss": 0.8334, + "step": 8673 + }, + { + "epoch": 0.47740657163300126, + "grad_norm": 0.7181395888328552, + "learning_rate": 8.681168012020971e-06, + "loss": 0.8089, + "step": 8674 + }, + { + "epoch": 0.47746161043535695, + "grad_norm": 0.7671428918838501, + "learning_rate": 8.680874659416433e-06, + "loss": 0.7634, + "step": 8675 + }, + { + "epoch": 0.4775166492377126, + "grad_norm": 0.73219895362854, + "learning_rate": 8.680581279147427e-06, + "loss": 0.7013, + "step": 8676 + }, + { + "epoch": 0.4775716880400682, + "grad_norm": 0.8050867319107056, + "learning_rate": 8.680287871216158e-06, + "loss": 0.7524, + "step": 8677 + }, + { + "epoch": 0.4776267268424239, + "grad_norm": 0.7154340744018555, + "learning_rate": 8.679994435624828e-06, + "loss": 0.802, + "step": 8678 + }, + { + "epoch": 0.47768176564477954, + "grad_norm": 0.7005884051322937, + "learning_rate": 8.679700972375647e-06, + "loss": 0.7633, + "step": 8679 + }, + { + "epoch": 0.47773680444713523, + "grad_norm": 0.8203871846199036, + "learning_rate": 8.679407481470818e-06, + "loss": 0.7782, + "step": 8680 + }, + { + "epoch": 0.47779184324949087, + "grad_norm": 0.6582844853401184, + "learning_rate": 8.679113962912547e-06, + "loss": 0.6799, + "step": 8681 + }, + { + "epoch": 0.47784688205184656, + "grad_norm": 0.7052889466285706, + "learning_rate": 8.67882041670304e-06, + "loss": 0.7814, + "step": 8682 + }, + { + "epoch": 0.4779019208542022, + "grad_norm": 0.7533165812492371, + "learning_rate": 8.678526842844504e-06, + "loss": 0.7983, + "step": 8683 + }, + { + "epoch": 0.4779569596565579, + "grad_norm": 0.7335212230682373, + "learning_rate": 8.678233241339144e-06, + "loss": 0.8023, + "step": 8684 + }, + { + "epoch": 0.4780119984589135, + "grad_norm": 0.7824274897575378, + "learning_rate": 8.67793961218917e-06, + "loss": 0.8219, + "step": 8685 + }, + { + "epoch": 0.4780670372612692, + "grad_norm": 0.6547996401786804, + "learning_rate": 8.677645955396784e-06, + "loss": 0.715, + "step": 8686 + }, + { + "epoch": 0.47812207606362483, + "grad_norm": 0.7507368326187134, + "learning_rate": 8.677352270964196e-06, + "loss": 0.9379, + "step": 8687 + }, + { + "epoch": 0.4781771148659805, + "grad_norm": 0.6403020620346069, + "learning_rate": 8.677058558893613e-06, + "loss": 0.659, + "step": 8688 + }, + { + "epoch": 0.47823215366833616, + "grad_norm": 0.7075803279876709, + "learning_rate": 8.676764819187242e-06, + "loss": 0.7515, + "step": 8689 + }, + { + "epoch": 0.47828719247069185, + "grad_norm": 0.6899601817131042, + "learning_rate": 8.676471051847291e-06, + "loss": 0.8398, + "step": 8690 + }, + { + "epoch": 0.4783422312730475, + "grad_norm": 0.7145645618438721, + "learning_rate": 8.676177256875969e-06, + "loss": 0.7711, + "step": 8691 + }, + { + "epoch": 0.47839727007540317, + "grad_norm": 0.7139655351638794, + "learning_rate": 8.675883434275479e-06, + "loss": 0.8664, + "step": 8692 + }, + { + "epoch": 0.4784523088777588, + "grad_norm": 0.7100433111190796, + "learning_rate": 8.675589584048037e-06, + "loss": 0.7812, + "step": 8693 + }, + { + "epoch": 0.4785073476801145, + "grad_norm": 0.6103882789611816, + "learning_rate": 8.675295706195845e-06, + "loss": 0.6565, + "step": 8694 + }, + { + "epoch": 0.4785623864824701, + "grad_norm": 0.7236714959144592, + "learning_rate": 8.675001800721114e-06, + "loss": 0.6849, + "step": 8695 + }, + { + "epoch": 0.4786174252848258, + "grad_norm": 0.7567160129547119, + "learning_rate": 8.674707867626056e-06, + "loss": 0.8289, + "step": 8696 + }, + { + "epoch": 0.47867246408718145, + "grad_norm": 0.7004136443138123, + "learning_rate": 8.674413906912876e-06, + "loss": 0.7466, + "step": 8697 + }, + { + "epoch": 0.47872750288953714, + "grad_norm": 0.713835597038269, + "learning_rate": 8.674119918583783e-06, + "loss": 0.7875, + "step": 8698 + }, + { + "epoch": 0.47878254169189277, + "grad_norm": 0.8476874232292175, + "learning_rate": 8.67382590264099e-06, + "loss": 0.8028, + "step": 8699 + }, + { + "epoch": 0.47883758049424846, + "grad_norm": 0.720273494720459, + "learning_rate": 8.673531859086706e-06, + "loss": 0.7829, + "step": 8700 + }, + { + "epoch": 0.4788926192966041, + "grad_norm": 0.8042417168617249, + "learning_rate": 8.673237787923137e-06, + "loss": 0.7914, + "step": 8701 + }, + { + "epoch": 0.4789476580989598, + "grad_norm": 0.7779260277748108, + "learning_rate": 8.672943689152498e-06, + "loss": 0.6921, + "step": 8702 + }, + { + "epoch": 0.4790026969013154, + "grad_norm": 0.7957637906074524, + "learning_rate": 8.672649562776997e-06, + "loss": 0.8761, + "step": 8703 + }, + { + "epoch": 0.4790577357036711, + "grad_norm": 0.7467649579048157, + "learning_rate": 8.672355408798845e-06, + "loss": 0.7984, + "step": 8704 + }, + { + "epoch": 0.47911277450602674, + "grad_norm": 0.6746538877487183, + "learning_rate": 8.672061227220252e-06, + "loss": 0.7392, + "step": 8705 + }, + { + "epoch": 0.47916781330838243, + "grad_norm": 0.7331795692443848, + "learning_rate": 8.671767018043432e-06, + "loss": 0.7171, + "step": 8706 + }, + { + "epoch": 0.47922285211073806, + "grad_norm": 0.7879608273506165, + "learning_rate": 8.671472781270592e-06, + "loss": 0.8497, + "step": 8707 + }, + { + "epoch": 0.47927789091309375, + "grad_norm": 0.8659428358078003, + "learning_rate": 8.671178516903946e-06, + "loss": 0.8102, + "step": 8708 + }, + { + "epoch": 0.4793329297154494, + "grad_norm": 0.6489408612251282, + "learning_rate": 8.670884224945704e-06, + "loss": 0.6752, + "step": 8709 + }, + { + "epoch": 0.4793879685178051, + "grad_norm": 0.8182825446128845, + "learning_rate": 8.670589905398079e-06, + "loss": 0.7972, + "step": 8710 + }, + { + "epoch": 0.4794430073201607, + "grad_norm": 0.7759343981742859, + "learning_rate": 8.670295558263285e-06, + "loss": 0.7856, + "step": 8711 + }, + { + "epoch": 0.4794980461225164, + "grad_norm": 0.7421835064888, + "learning_rate": 8.670001183543528e-06, + "loss": 0.8165, + "step": 8712 + }, + { + "epoch": 0.47955308492487203, + "grad_norm": 0.6498512625694275, + "learning_rate": 8.669706781241028e-06, + "loss": 0.7212, + "step": 8713 + }, + { + "epoch": 0.4796081237272277, + "grad_norm": 0.8493219614028931, + "learning_rate": 8.669412351357993e-06, + "loss": 0.8036, + "step": 8714 + }, + { + "epoch": 0.47966316252958335, + "grad_norm": 0.6834331750869751, + "learning_rate": 8.669117893896637e-06, + "loss": 0.8127, + "step": 8715 + }, + { + "epoch": 0.47971820133193904, + "grad_norm": 0.7793670296669006, + "learning_rate": 8.668823408859172e-06, + "loss": 0.7276, + "step": 8716 + }, + { + "epoch": 0.4797732401342947, + "grad_norm": 0.7108075022697449, + "learning_rate": 8.668528896247815e-06, + "loss": 0.8328, + "step": 8717 + }, + { + "epoch": 0.47982827893665037, + "grad_norm": 0.6662433743476868, + "learning_rate": 8.668234356064774e-06, + "loss": 0.6751, + "step": 8718 + }, + { + "epoch": 0.479883317739006, + "grad_norm": 0.6595591902732849, + "learning_rate": 8.667939788312267e-06, + "loss": 0.707, + "step": 8719 + }, + { + "epoch": 0.47993835654136163, + "grad_norm": 0.7435836791992188, + "learning_rate": 8.667645192992506e-06, + "loss": 0.7885, + "step": 8720 + }, + { + "epoch": 0.4799933953437173, + "grad_norm": 0.6999356746673584, + "learning_rate": 8.667350570107706e-06, + "loss": 0.7538, + "step": 8721 + }, + { + "epoch": 0.48004843414607296, + "grad_norm": 0.7111191749572754, + "learning_rate": 8.66705591966008e-06, + "loss": 0.6814, + "step": 8722 + }, + { + "epoch": 0.48010347294842864, + "grad_norm": 0.6752734780311584, + "learning_rate": 8.666761241651844e-06, + "loss": 0.7221, + "step": 8723 + }, + { + "epoch": 0.4801585117507843, + "grad_norm": 0.7432951331138611, + "learning_rate": 8.666466536085212e-06, + "loss": 0.7689, + "step": 8724 + }, + { + "epoch": 0.48021355055313997, + "grad_norm": 0.7384392023086548, + "learning_rate": 8.666171802962398e-06, + "loss": 0.7862, + "step": 8725 + }, + { + "epoch": 0.4802685893554956, + "grad_norm": 0.6878762245178223, + "learning_rate": 8.66587704228562e-06, + "loss": 0.7246, + "step": 8726 + }, + { + "epoch": 0.4803236281578513, + "grad_norm": 0.6640586853027344, + "learning_rate": 8.66558225405709e-06, + "loss": 0.7181, + "step": 8727 + }, + { + "epoch": 0.4803786669602069, + "grad_norm": 0.6808595061302185, + "learning_rate": 8.665287438279024e-06, + "loss": 0.7866, + "step": 8728 + }, + { + "epoch": 0.4804337057625626, + "grad_norm": 0.5966268181800842, + "learning_rate": 8.66499259495364e-06, + "loss": 0.6755, + "step": 8729 + }, + { + "epoch": 0.48048874456491825, + "grad_norm": 0.742016077041626, + "learning_rate": 8.664697724083152e-06, + "loss": 0.8682, + "step": 8730 + }, + { + "epoch": 0.48054378336727394, + "grad_norm": 0.6621154546737671, + "learning_rate": 8.66440282566978e-06, + "loss": 0.7525, + "step": 8731 + }, + { + "epoch": 0.48059882216962957, + "grad_norm": 0.7347434759140015, + "learning_rate": 8.664107899715733e-06, + "loss": 0.7919, + "step": 8732 + }, + { + "epoch": 0.48065386097198526, + "grad_norm": 0.7564681172370911, + "learning_rate": 8.663812946223234e-06, + "loss": 0.9172, + "step": 8733 + }, + { + "epoch": 0.4807088997743409, + "grad_norm": 0.7193084359169006, + "learning_rate": 8.663517965194497e-06, + "loss": 0.7931, + "step": 8734 + }, + { + "epoch": 0.4807639385766966, + "grad_norm": 0.6882064938545227, + "learning_rate": 8.66322295663174e-06, + "loss": 0.7678, + "step": 8735 + }, + { + "epoch": 0.4808189773790522, + "grad_norm": 0.7954713106155396, + "learning_rate": 8.662927920537179e-06, + "loss": 0.6357, + "step": 8736 + }, + { + "epoch": 0.4808740161814079, + "grad_norm": 0.7123041749000549, + "learning_rate": 8.662632856913034e-06, + "loss": 0.7234, + "step": 8737 + }, + { + "epoch": 0.48092905498376354, + "grad_norm": 0.745145320892334, + "learning_rate": 8.66233776576152e-06, + "loss": 0.7516, + "step": 8738 + }, + { + "epoch": 0.4809840937861192, + "grad_norm": 0.6904219388961792, + "learning_rate": 8.662042647084856e-06, + "loss": 0.7995, + "step": 8739 + }, + { + "epoch": 0.48103913258847486, + "grad_norm": 0.71831214427948, + "learning_rate": 8.661747500885258e-06, + "loss": 0.7965, + "step": 8740 + }, + { + "epoch": 0.48109417139083055, + "grad_norm": 0.8514378666877747, + "learning_rate": 8.661452327164948e-06, + "loss": 0.8023, + "step": 8741 + }, + { + "epoch": 0.4811492101931862, + "grad_norm": 0.7411143779754639, + "learning_rate": 8.66115712592614e-06, + "loss": 0.797, + "step": 8742 + }, + { + "epoch": 0.4812042489955419, + "grad_norm": 0.737178385257721, + "learning_rate": 8.660861897171057e-06, + "loss": 0.7286, + "step": 8743 + }, + { + "epoch": 0.4812592877978975, + "grad_norm": 0.6823513507843018, + "learning_rate": 8.660566640901918e-06, + "loss": 0.7482, + "step": 8744 + }, + { + "epoch": 0.4813143266002532, + "grad_norm": 0.7205879092216492, + "learning_rate": 8.660271357120937e-06, + "loss": 0.8294, + "step": 8745 + }, + { + "epoch": 0.48136936540260883, + "grad_norm": 0.6887338757514954, + "learning_rate": 8.659976045830337e-06, + "loss": 0.7711, + "step": 8746 + }, + { + "epoch": 0.4814244042049645, + "grad_norm": 0.7498533129692078, + "learning_rate": 8.659680707032336e-06, + "loss": 0.7296, + "step": 8747 + }, + { + "epoch": 0.48147944300732015, + "grad_norm": 0.8041636943817139, + "learning_rate": 8.659385340729155e-06, + "loss": 0.9213, + "step": 8748 + }, + { + "epoch": 0.48153448180967584, + "grad_norm": 0.8623721599578857, + "learning_rate": 8.659089946923014e-06, + "loss": 0.8024, + "step": 8749 + }, + { + "epoch": 0.4815895206120315, + "grad_norm": 0.7212050557136536, + "learning_rate": 8.658794525616132e-06, + "loss": 0.732, + "step": 8750 + }, + { + "epoch": 0.48164455941438716, + "grad_norm": 0.7141492366790771, + "learning_rate": 8.658499076810729e-06, + "loss": 0.8062, + "step": 8751 + }, + { + "epoch": 0.4816995982167428, + "grad_norm": 0.7191516160964966, + "learning_rate": 8.658203600509027e-06, + "loss": 0.805, + "step": 8752 + }, + { + "epoch": 0.4817546370190985, + "grad_norm": 0.71059650182724, + "learning_rate": 8.657908096713245e-06, + "loss": 0.6755, + "step": 8753 + }, + { + "epoch": 0.4818096758214541, + "grad_norm": 0.6715459823608398, + "learning_rate": 8.657612565425607e-06, + "loss": 0.8093, + "step": 8754 + }, + { + "epoch": 0.4818647146238098, + "grad_norm": 0.7438814640045166, + "learning_rate": 8.65731700664833e-06, + "loss": 0.8059, + "step": 8755 + }, + { + "epoch": 0.48191975342616544, + "grad_norm": 0.7295387387275696, + "learning_rate": 8.657021420383637e-06, + "loss": 0.8437, + "step": 8756 + }, + { + "epoch": 0.48197479222852113, + "grad_norm": 0.7053797245025635, + "learning_rate": 8.656725806633753e-06, + "loss": 0.8424, + "step": 8757 + }, + { + "epoch": 0.48202983103087677, + "grad_norm": 0.6902007460594177, + "learning_rate": 8.656430165400894e-06, + "loss": 0.6967, + "step": 8758 + }, + { + "epoch": 0.48208486983323245, + "grad_norm": 0.66749507188797, + "learning_rate": 8.656134496687286e-06, + "loss": 0.7858, + "step": 8759 + }, + { + "epoch": 0.4821399086355881, + "grad_norm": 0.6755428314208984, + "learning_rate": 8.65583880049515e-06, + "loss": 0.6669, + "step": 8760 + }, + { + "epoch": 0.4821949474379438, + "grad_norm": 0.921096920967102, + "learning_rate": 8.655543076826706e-06, + "loss": 0.8545, + "step": 8761 + }, + { + "epoch": 0.4822499862402994, + "grad_norm": 0.7931553721427917, + "learning_rate": 8.65524732568418e-06, + "loss": 0.8708, + "step": 8762 + }, + { + "epoch": 0.48230502504265504, + "grad_norm": 0.7891780734062195, + "learning_rate": 8.654951547069794e-06, + "loss": 0.687, + "step": 8763 + }, + { + "epoch": 0.48236006384501073, + "grad_norm": 0.747662365436554, + "learning_rate": 8.65465574098577e-06, + "loss": 0.8153, + "step": 8764 + }, + { + "epoch": 0.48241510264736637, + "grad_norm": 0.7758497595787048, + "learning_rate": 8.65435990743433e-06, + "loss": 0.8018, + "step": 8765 + }, + { + "epoch": 0.48247014144972206, + "grad_norm": 0.6997805237770081, + "learning_rate": 8.654064046417703e-06, + "loss": 0.7845, + "step": 8766 + }, + { + "epoch": 0.4825251802520777, + "grad_norm": 0.7188366651535034, + "learning_rate": 8.653768157938106e-06, + "loss": 0.7528, + "step": 8767 + }, + { + "epoch": 0.4825802190544334, + "grad_norm": 0.6848055124282837, + "learning_rate": 8.653472241997767e-06, + "loss": 0.7658, + "step": 8768 + }, + { + "epoch": 0.482635257856789, + "grad_norm": 1.0603824853897095, + "learning_rate": 8.653176298598907e-06, + "loss": 0.7692, + "step": 8769 + }, + { + "epoch": 0.4826902966591447, + "grad_norm": 0.8191514611244202, + "learning_rate": 8.652880327743753e-06, + "loss": 0.7706, + "step": 8770 + }, + { + "epoch": 0.48274533546150034, + "grad_norm": 0.6318503618240356, + "learning_rate": 8.652584329434527e-06, + "loss": 0.6635, + "step": 8771 + }, + { + "epoch": 0.482800374263856, + "grad_norm": 0.6860769391059875, + "learning_rate": 8.652288303673457e-06, + "loss": 0.739, + "step": 8772 + }, + { + "epoch": 0.48285541306621166, + "grad_norm": 0.7414761185646057, + "learning_rate": 8.651992250462765e-06, + "loss": 0.7949, + "step": 8773 + }, + { + "epoch": 0.48291045186856735, + "grad_norm": 0.7255183458328247, + "learning_rate": 8.651696169804676e-06, + "loss": 0.8569, + "step": 8774 + }, + { + "epoch": 0.482965490670923, + "grad_norm": 0.7034135460853577, + "learning_rate": 8.651400061701417e-06, + "loss": 0.7562, + "step": 8775 + }, + { + "epoch": 0.48302052947327867, + "grad_norm": 0.7041038274765015, + "learning_rate": 8.651103926155212e-06, + "loss": 0.7194, + "step": 8776 + }, + { + "epoch": 0.4830755682756343, + "grad_norm": 1.0965619087219238, + "learning_rate": 8.650807763168287e-06, + "loss": 0.9033, + "step": 8777 + }, + { + "epoch": 0.48313060707799, + "grad_norm": 0.7400044798851013, + "learning_rate": 8.650511572742869e-06, + "loss": 0.7626, + "step": 8778 + }, + { + "epoch": 0.4831856458803456, + "grad_norm": 0.6957885026931763, + "learning_rate": 8.650215354881182e-06, + "loss": 0.7283, + "step": 8779 + }, + { + "epoch": 0.4832406846827013, + "grad_norm": 0.7992473840713501, + "learning_rate": 8.649919109585454e-06, + "loss": 0.8376, + "step": 8780 + }, + { + "epoch": 0.48329572348505695, + "grad_norm": 0.8556981086730957, + "learning_rate": 8.649622836857911e-06, + "loss": 0.7737, + "step": 8781 + }, + { + "epoch": 0.48335076228741264, + "grad_norm": 0.8476192355155945, + "learning_rate": 8.64932653670078e-06, + "loss": 0.8926, + "step": 8782 + }, + { + "epoch": 0.48340580108976827, + "grad_norm": 0.6461093425750732, + "learning_rate": 8.649030209116289e-06, + "loss": 0.7452, + "step": 8783 + }, + { + "epoch": 0.48346083989212396, + "grad_norm": 0.6997528076171875, + "learning_rate": 8.648733854106661e-06, + "loss": 0.7962, + "step": 8784 + }, + { + "epoch": 0.4835158786944796, + "grad_norm": 0.7606356739997864, + "learning_rate": 8.648437471674128e-06, + "loss": 0.6517, + "step": 8785 + }, + { + "epoch": 0.4835709174968353, + "grad_norm": 0.8118630051612854, + "learning_rate": 8.648141061820913e-06, + "loss": 0.7539, + "step": 8786 + }, + { + "epoch": 0.4836259562991909, + "grad_norm": 0.8778805136680603, + "learning_rate": 8.64784462454925e-06, + "loss": 0.763, + "step": 8787 + }, + { + "epoch": 0.4836809951015466, + "grad_norm": 0.7741022706031799, + "learning_rate": 8.647548159861361e-06, + "loss": 0.7749, + "step": 8788 + }, + { + "epoch": 0.48373603390390224, + "grad_norm": 0.76578688621521, + "learning_rate": 8.647251667759478e-06, + "loss": 0.6968, + "step": 8789 + }, + { + "epoch": 0.48379107270625793, + "grad_norm": 0.8477250933647156, + "learning_rate": 8.646955148245827e-06, + "loss": 0.8364, + "step": 8790 + }, + { + "epoch": 0.48384611150861356, + "grad_norm": 0.9105041027069092, + "learning_rate": 8.646658601322635e-06, + "loss": 0.823, + "step": 8791 + }, + { + "epoch": 0.48390115031096925, + "grad_norm": 0.7642726898193359, + "learning_rate": 8.646362026992135e-06, + "loss": 0.721, + "step": 8792 + }, + { + "epoch": 0.4839561891133249, + "grad_norm": 0.7567259669303894, + "learning_rate": 8.646065425256555e-06, + "loss": 0.7876, + "step": 8793 + }, + { + "epoch": 0.4840112279156806, + "grad_norm": 0.7691231966018677, + "learning_rate": 8.64576879611812e-06, + "loss": 0.8308, + "step": 8794 + }, + { + "epoch": 0.4840662667180362, + "grad_norm": 1.0769426822662354, + "learning_rate": 8.645472139579067e-06, + "loss": 0.892, + "step": 8795 + }, + { + "epoch": 0.4841213055203919, + "grad_norm": 0.6987955570220947, + "learning_rate": 8.64517545564162e-06, + "loss": 0.8254, + "step": 8796 + }, + { + "epoch": 0.48417634432274753, + "grad_norm": 0.7736005783081055, + "learning_rate": 8.644878744308007e-06, + "loss": 0.7666, + "step": 8797 + }, + { + "epoch": 0.4842313831251032, + "grad_norm": 0.6233380436897278, + "learning_rate": 8.644582005580464e-06, + "loss": 0.6443, + "step": 8798 + }, + { + "epoch": 0.48428642192745885, + "grad_norm": 0.7343530654907227, + "learning_rate": 8.644285239461217e-06, + "loss": 0.724, + "step": 8799 + }, + { + "epoch": 0.48434146072981454, + "grad_norm": 0.725321352481842, + "learning_rate": 8.643988445952499e-06, + "loss": 0.7249, + "step": 8800 + }, + { + "epoch": 0.4843964995321702, + "grad_norm": 0.7256256341934204, + "learning_rate": 8.643691625056539e-06, + "loss": 0.8656, + "step": 8801 + }, + { + "epoch": 0.48445153833452587, + "grad_norm": 0.8559528589248657, + "learning_rate": 8.643394776775567e-06, + "loss": 0.9186, + "step": 8802 + }, + { + "epoch": 0.4845065771368815, + "grad_norm": 0.6735692024230957, + "learning_rate": 8.643097901111815e-06, + "loss": 0.7007, + "step": 8803 + }, + { + "epoch": 0.4845616159392372, + "grad_norm": 0.8373280167579651, + "learning_rate": 8.642800998067515e-06, + "loss": 0.8774, + "step": 8804 + }, + { + "epoch": 0.4846166547415928, + "grad_norm": 0.731311023235321, + "learning_rate": 8.642504067644898e-06, + "loss": 0.7102, + "step": 8805 + }, + { + "epoch": 0.48467169354394846, + "grad_norm": 0.7259742617607117, + "learning_rate": 8.642207109846195e-06, + "loss": 0.7174, + "step": 8806 + }, + { + "epoch": 0.48472673234630415, + "grad_norm": 0.6454386115074158, + "learning_rate": 8.641910124673638e-06, + "loss": 0.7656, + "step": 8807 + }, + { + "epoch": 0.4847817711486598, + "grad_norm": 0.7701624631881714, + "learning_rate": 8.641613112129462e-06, + "loss": 0.7926, + "step": 8808 + }, + { + "epoch": 0.48483680995101547, + "grad_norm": 0.6812854409217834, + "learning_rate": 8.641316072215893e-06, + "loss": 0.7072, + "step": 8809 + }, + { + "epoch": 0.4848918487533711, + "grad_norm": 0.8180119395256042, + "learning_rate": 8.641019004935169e-06, + "loss": 0.8621, + "step": 8810 + }, + { + "epoch": 0.4849468875557268, + "grad_norm": 0.6346331834793091, + "learning_rate": 8.64072191028952e-06, + "loss": 0.6907, + "step": 8811 + }, + { + "epoch": 0.4850019263580824, + "grad_norm": 0.6819741129875183, + "learning_rate": 8.64042478828118e-06, + "loss": 0.77, + "step": 8812 + }, + { + "epoch": 0.4850569651604381, + "grad_norm": 0.9074214100837708, + "learning_rate": 8.640127638912383e-06, + "loss": 0.7799, + "step": 8813 + }, + { + "epoch": 0.48511200396279375, + "grad_norm": 0.8065158724784851, + "learning_rate": 8.63983046218536e-06, + "loss": 0.8033, + "step": 8814 + }, + { + "epoch": 0.48516704276514944, + "grad_norm": 0.6241241097450256, + "learning_rate": 8.639533258102345e-06, + "loss": 0.6936, + "step": 8815 + }, + { + "epoch": 0.48522208156750507, + "grad_norm": 0.6928265690803528, + "learning_rate": 8.639236026665573e-06, + "loss": 0.7526, + "step": 8816 + }, + { + "epoch": 0.48527712036986076, + "grad_norm": 0.8171425461769104, + "learning_rate": 8.638938767877276e-06, + "loss": 0.8227, + "step": 8817 + }, + { + "epoch": 0.4853321591722164, + "grad_norm": 0.7007083296775818, + "learning_rate": 8.638641481739692e-06, + "loss": 0.7439, + "step": 8818 + }, + { + "epoch": 0.4853871979745721, + "grad_norm": 0.8905115127563477, + "learning_rate": 8.63834416825505e-06, + "loss": 0.6873, + "step": 8819 + }, + { + "epoch": 0.4854422367769277, + "grad_norm": 0.702198326587677, + "learning_rate": 8.638046827425588e-06, + "loss": 0.7999, + "step": 8820 + }, + { + "epoch": 0.4854972755792834, + "grad_norm": 0.7280104160308838, + "learning_rate": 8.63774945925354e-06, + "loss": 0.8562, + "step": 8821 + }, + { + "epoch": 0.48555231438163904, + "grad_norm": 0.9803630113601685, + "learning_rate": 8.63745206374114e-06, + "loss": 0.8347, + "step": 8822 + }, + { + "epoch": 0.4856073531839947, + "grad_norm": 0.6781168580055237, + "learning_rate": 8.637154640890625e-06, + "loss": 0.8124, + "step": 8823 + }, + { + "epoch": 0.48566239198635036, + "grad_norm": 0.7219669222831726, + "learning_rate": 8.63685719070423e-06, + "loss": 0.8053, + "step": 8824 + }, + { + "epoch": 0.48571743078870605, + "grad_norm": 0.7077241539955139, + "learning_rate": 8.636559713184187e-06, + "loss": 0.7534, + "step": 8825 + }, + { + "epoch": 0.4857724695910617, + "grad_norm": 0.70063316822052, + "learning_rate": 8.636262208332737e-06, + "loss": 0.7509, + "step": 8826 + }, + { + "epoch": 0.4858275083934174, + "grad_norm": 0.7292184233665466, + "learning_rate": 8.635964676152114e-06, + "loss": 0.7485, + "step": 8827 + }, + { + "epoch": 0.485882547195773, + "grad_norm": 0.7970258593559265, + "learning_rate": 8.635667116644552e-06, + "loss": 0.8874, + "step": 8828 + }, + { + "epoch": 0.4859375859981287, + "grad_norm": 0.7090024352073669, + "learning_rate": 8.63536952981229e-06, + "loss": 0.7665, + "step": 8829 + }, + { + "epoch": 0.48599262480048433, + "grad_norm": 0.761409342288971, + "learning_rate": 8.635071915657565e-06, + "loss": 0.7977, + "step": 8830 + }, + { + "epoch": 0.48604766360284, + "grad_norm": 0.724896252155304, + "learning_rate": 8.634774274182611e-06, + "loss": 0.8591, + "step": 8831 + }, + { + "epoch": 0.48610270240519565, + "grad_norm": 0.737424910068512, + "learning_rate": 8.634476605389666e-06, + "loss": 0.8256, + "step": 8832 + }, + { + "epoch": 0.48615774120755134, + "grad_norm": 0.8261227607727051, + "learning_rate": 8.63417890928097e-06, + "loss": 0.8089, + "step": 8833 + }, + { + "epoch": 0.486212780009907, + "grad_norm": 0.6744595766067505, + "learning_rate": 8.633881185858756e-06, + "loss": 0.7821, + "step": 8834 + }, + { + "epoch": 0.48626781881226266, + "grad_norm": 0.6717672944068909, + "learning_rate": 8.633583435125263e-06, + "loss": 0.7823, + "step": 8835 + }, + { + "epoch": 0.4863228576146183, + "grad_norm": 0.753616213798523, + "learning_rate": 8.633285657082732e-06, + "loss": 0.8044, + "step": 8836 + }, + { + "epoch": 0.486377896416974, + "grad_norm": 0.6910914182662964, + "learning_rate": 8.632987851733397e-06, + "loss": 0.8244, + "step": 8837 + }, + { + "epoch": 0.4864329352193296, + "grad_norm": 0.9127064347267151, + "learning_rate": 8.632690019079499e-06, + "loss": 0.7918, + "step": 8838 + }, + { + "epoch": 0.4864879740216853, + "grad_norm": 0.715918779373169, + "learning_rate": 8.632392159123274e-06, + "loss": 0.744, + "step": 8839 + }, + { + "epoch": 0.48654301282404094, + "grad_norm": 0.8206684589385986, + "learning_rate": 8.632094271866963e-06, + "loss": 0.7852, + "step": 8840 + }, + { + "epoch": 0.48659805162639663, + "grad_norm": 0.6502171158790588, + "learning_rate": 8.631796357312802e-06, + "loss": 0.7653, + "step": 8841 + }, + { + "epoch": 0.48665309042875227, + "grad_norm": 0.6987786889076233, + "learning_rate": 8.631498415463033e-06, + "loss": 0.7669, + "step": 8842 + }, + { + "epoch": 0.48670812923110796, + "grad_norm": 0.7902390360832214, + "learning_rate": 8.631200446319894e-06, + "loss": 0.8438, + "step": 8843 + }, + { + "epoch": 0.4867631680334636, + "grad_norm": 0.7464659810066223, + "learning_rate": 8.630902449885625e-06, + "loss": 0.8276, + "step": 8844 + }, + { + "epoch": 0.4868182068358193, + "grad_norm": 0.7375630736351013, + "learning_rate": 8.630604426162465e-06, + "loss": 0.7921, + "step": 8845 + }, + { + "epoch": 0.4868732456381749, + "grad_norm": 0.7206295728683472, + "learning_rate": 8.630306375152653e-06, + "loss": 0.8424, + "step": 8846 + }, + { + "epoch": 0.4869282844405306, + "grad_norm": 0.7384368181228638, + "learning_rate": 8.63000829685843e-06, + "loss": 0.8702, + "step": 8847 + }, + { + "epoch": 0.48698332324288623, + "grad_norm": 0.7839015126228333, + "learning_rate": 8.629710191282037e-06, + "loss": 0.7064, + "step": 8848 + }, + { + "epoch": 0.48703836204524187, + "grad_norm": 0.6909724473953247, + "learning_rate": 8.629412058425712e-06, + "loss": 0.6924, + "step": 8849 + }, + { + "epoch": 0.48709340084759756, + "grad_norm": 0.6553036570549011, + "learning_rate": 8.6291138982917e-06, + "loss": 0.6526, + "step": 8850 + }, + { + "epoch": 0.4871484396499532, + "grad_norm": 0.7202072143554688, + "learning_rate": 8.628815710882239e-06, + "loss": 0.7272, + "step": 8851 + }, + { + "epoch": 0.4872034784523089, + "grad_norm": 0.6898619532585144, + "learning_rate": 8.62851749619957e-06, + "loss": 0.7687, + "step": 8852 + }, + { + "epoch": 0.4872585172546645, + "grad_norm": 0.7888908386230469, + "learning_rate": 8.628219254245935e-06, + "loss": 0.7654, + "step": 8853 + }, + { + "epoch": 0.4873135560570202, + "grad_norm": 0.7312424778938293, + "learning_rate": 8.627920985023575e-06, + "loss": 0.8053, + "step": 8854 + }, + { + "epoch": 0.48736859485937584, + "grad_norm": 0.6588439345359802, + "learning_rate": 8.627622688534731e-06, + "loss": 0.7229, + "step": 8855 + }, + { + "epoch": 0.4874236336617315, + "grad_norm": 0.8292293548583984, + "learning_rate": 8.627324364781647e-06, + "loss": 0.8482, + "step": 8856 + }, + { + "epoch": 0.48747867246408716, + "grad_norm": 0.7573973536491394, + "learning_rate": 8.627026013766564e-06, + "loss": 0.7282, + "step": 8857 + }, + { + "epoch": 0.48753371126644285, + "grad_norm": 1.2215768098831177, + "learning_rate": 8.626727635491726e-06, + "loss": 0.7771, + "step": 8858 + }, + { + "epoch": 0.4875887500687985, + "grad_norm": 0.7324759364128113, + "learning_rate": 8.626429229959369e-06, + "loss": 0.781, + "step": 8859 + }, + { + "epoch": 0.48764378887115417, + "grad_norm": 0.6995676159858704, + "learning_rate": 8.626130797171745e-06, + "loss": 0.6907, + "step": 8860 + }, + { + "epoch": 0.4876988276735098, + "grad_norm": 0.7400509119033813, + "learning_rate": 8.625832337131092e-06, + "loss": 0.6572, + "step": 8861 + }, + { + "epoch": 0.4877538664758655, + "grad_norm": 0.6634842753410339, + "learning_rate": 8.625533849839653e-06, + "loss": 0.7229, + "step": 8862 + }, + { + "epoch": 0.4878089052782211, + "grad_norm": 0.7357299327850342, + "learning_rate": 8.625235335299673e-06, + "loss": 0.6418, + "step": 8863 + }, + { + "epoch": 0.4878639440805768, + "grad_norm": 0.6473466157913208, + "learning_rate": 8.624936793513394e-06, + "loss": 0.6796, + "step": 8864 + }, + { + "epoch": 0.48791898288293245, + "grad_norm": 0.9110734462738037, + "learning_rate": 8.62463822448306e-06, + "loss": 0.8143, + "step": 8865 + }, + { + "epoch": 0.48797402168528814, + "grad_norm": 0.7932308316230774, + "learning_rate": 8.624339628210916e-06, + "loss": 0.9103, + "step": 8866 + }, + { + "epoch": 0.4880290604876438, + "grad_norm": 0.6677752137184143, + "learning_rate": 8.624041004699205e-06, + "loss": 0.8073, + "step": 8867 + }, + { + "epoch": 0.48808409928999946, + "grad_norm": 0.7379121780395508, + "learning_rate": 8.623742353950171e-06, + "loss": 0.8643, + "step": 8868 + }, + { + "epoch": 0.4881391380923551, + "grad_norm": 0.7479479312896729, + "learning_rate": 8.623443675966062e-06, + "loss": 0.6117, + "step": 8869 + }, + { + "epoch": 0.4881941768947108, + "grad_norm": 0.7822794914245605, + "learning_rate": 8.623144970749118e-06, + "loss": 0.8629, + "step": 8870 + }, + { + "epoch": 0.4882492156970664, + "grad_norm": 0.7040950655937195, + "learning_rate": 8.622846238301587e-06, + "loss": 0.7519, + "step": 8871 + }, + { + "epoch": 0.4883042544994221, + "grad_norm": 0.747368574142456, + "learning_rate": 8.622547478625714e-06, + "loss": 0.7459, + "step": 8872 + }, + { + "epoch": 0.48835929330177774, + "grad_norm": 0.6755948066711426, + "learning_rate": 8.622248691723742e-06, + "loss": 0.7515, + "step": 8873 + }, + { + "epoch": 0.48841433210413343, + "grad_norm": 0.7265586256980896, + "learning_rate": 8.62194987759792e-06, + "loss": 0.7691, + "step": 8874 + }, + { + "epoch": 0.48846937090648906, + "grad_norm": 0.6696380972862244, + "learning_rate": 8.621651036250493e-06, + "loss": 0.778, + "step": 8875 + }, + { + "epoch": 0.48852440970884475, + "grad_norm": 0.7666454911231995, + "learning_rate": 8.621352167683705e-06, + "loss": 0.7396, + "step": 8876 + }, + { + "epoch": 0.4885794485112004, + "grad_norm": 0.7079235315322876, + "learning_rate": 8.621053271899803e-06, + "loss": 0.7917, + "step": 8877 + }, + { + "epoch": 0.4886344873135561, + "grad_norm": 0.6888919472694397, + "learning_rate": 8.620754348901034e-06, + "loss": 0.605, + "step": 8878 + }, + { + "epoch": 0.4886895261159117, + "grad_norm": 0.7177572250366211, + "learning_rate": 8.620455398689645e-06, + "loss": 0.7534, + "step": 8879 + }, + { + "epoch": 0.4887445649182674, + "grad_norm": 0.7268772721290588, + "learning_rate": 8.620156421267883e-06, + "loss": 0.7748, + "step": 8880 + }, + { + "epoch": 0.48879960372062303, + "grad_norm": 0.8015080690383911, + "learning_rate": 8.619857416637993e-06, + "loss": 0.6716, + "step": 8881 + }, + { + "epoch": 0.4888546425229787, + "grad_norm": 0.7464118599891663, + "learning_rate": 8.619558384802226e-06, + "loss": 0.796, + "step": 8882 + }, + { + "epoch": 0.48890968132533436, + "grad_norm": 0.6829718351364136, + "learning_rate": 8.619259325762826e-06, + "loss": 0.788, + "step": 8883 + }, + { + "epoch": 0.48896472012769004, + "grad_norm": 0.6553084850311279, + "learning_rate": 8.618960239522041e-06, + "loss": 0.7215, + "step": 8884 + }, + { + "epoch": 0.4890197589300457, + "grad_norm": 0.8056252598762512, + "learning_rate": 8.618661126082119e-06, + "loss": 0.8588, + "step": 8885 + }, + { + "epoch": 0.48907479773240137, + "grad_norm": 0.8145674467086792, + "learning_rate": 8.618361985445309e-06, + "loss": 0.8095, + "step": 8886 + }, + { + "epoch": 0.489129836534757, + "grad_norm": 0.740031898021698, + "learning_rate": 8.61806281761386e-06, + "loss": 0.7029, + "step": 8887 + }, + { + "epoch": 0.4891848753371127, + "grad_norm": 0.7442640662193298, + "learning_rate": 8.617763622590019e-06, + "loss": 0.782, + "step": 8888 + }, + { + "epoch": 0.4892399141394683, + "grad_norm": 0.6992725133895874, + "learning_rate": 8.617464400376035e-06, + "loss": 0.7877, + "step": 8889 + }, + { + "epoch": 0.489294952941824, + "grad_norm": 1.19756281375885, + "learning_rate": 8.617165150974157e-06, + "loss": 0.6985, + "step": 8890 + }, + { + "epoch": 0.48934999174417965, + "grad_norm": 0.6418262720108032, + "learning_rate": 8.616865874386633e-06, + "loss": 0.7385, + "step": 8891 + }, + { + "epoch": 0.4894050305465353, + "grad_norm": 0.787406325340271, + "learning_rate": 8.616566570615714e-06, + "loss": 0.8686, + "step": 8892 + }, + { + "epoch": 0.48946006934889097, + "grad_norm": 0.6990430951118469, + "learning_rate": 8.616267239663648e-06, + "loss": 0.7683, + "step": 8893 + }, + { + "epoch": 0.4895151081512466, + "grad_norm": 0.7180235981941223, + "learning_rate": 8.615967881532687e-06, + "loss": 0.8337, + "step": 8894 + }, + { + "epoch": 0.4895701469536023, + "grad_norm": 0.7647475600242615, + "learning_rate": 8.615668496225077e-06, + "loss": 0.8668, + "step": 8895 + }, + { + "epoch": 0.4896251857559579, + "grad_norm": 0.843063473701477, + "learning_rate": 8.615369083743072e-06, + "loss": 0.7968, + "step": 8896 + }, + { + "epoch": 0.4896802245583136, + "grad_norm": 0.9526075124740601, + "learning_rate": 8.61506964408892e-06, + "loss": 0.8766, + "step": 8897 + }, + { + "epoch": 0.48973526336066925, + "grad_norm": 0.7850056290626526, + "learning_rate": 8.614770177264874e-06, + "loss": 0.8033, + "step": 8898 + }, + { + "epoch": 0.48979030216302494, + "grad_norm": 0.8658629655838013, + "learning_rate": 8.614470683273182e-06, + "loss": 0.8206, + "step": 8899 + }, + { + "epoch": 0.48984534096538057, + "grad_norm": 0.8060176968574524, + "learning_rate": 8.614171162116096e-06, + "loss": 0.7602, + "step": 8900 + }, + { + "epoch": 0.48990037976773626, + "grad_norm": 0.7398280501365662, + "learning_rate": 8.613871613795865e-06, + "loss": 0.8067, + "step": 8901 + }, + { + "epoch": 0.4899554185700919, + "grad_norm": 0.7341256141662598, + "learning_rate": 8.613572038314744e-06, + "loss": 0.7305, + "step": 8902 + }, + { + "epoch": 0.4900104573724476, + "grad_norm": 0.7832887172698975, + "learning_rate": 8.613272435674984e-06, + "loss": 0.7012, + "step": 8903 + }, + { + "epoch": 0.4900654961748032, + "grad_norm": 0.6536995768547058, + "learning_rate": 8.612972805878834e-06, + "loss": 0.745, + "step": 8904 + }, + { + "epoch": 0.4901205349771589, + "grad_norm": 0.7511856555938721, + "learning_rate": 8.612673148928547e-06, + "loss": 0.7741, + "step": 8905 + }, + { + "epoch": 0.49017557377951454, + "grad_norm": 0.6117261648178101, + "learning_rate": 8.612373464826377e-06, + "loss": 0.5813, + "step": 8906 + }, + { + "epoch": 0.49023061258187023, + "grad_norm": 0.7832254767417908, + "learning_rate": 8.612073753574574e-06, + "loss": 0.7426, + "step": 8907 + }, + { + "epoch": 0.49028565138422586, + "grad_norm": 0.7516622543334961, + "learning_rate": 8.611774015175393e-06, + "loss": 0.8205, + "step": 8908 + }, + { + "epoch": 0.49034069018658155, + "grad_norm": 0.7776936888694763, + "learning_rate": 8.611474249631085e-06, + "loss": 0.8457, + "step": 8909 + }, + { + "epoch": 0.4903957289889372, + "grad_norm": 0.9364853501319885, + "learning_rate": 8.6111744569439e-06, + "loss": 0.9114, + "step": 8910 + }, + { + "epoch": 0.4904507677912929, + "grad_norm": 0.7584181427955627, + "learning_rate": 8.610874637116099e-06, + "loss": 0.6852, + "step": 8911 + }, + { + "epoch": 0.4905058065936485, + "grad_norm": 0.7326254844665527, + "learning_rate": 8.610574790149929e-06, + "loss": 0.7843, + "step": 8912 + }, + { + "epoch": 0.4905608453960042, + "grad_norm": 0.918258547782898, + "learning_rate": 8.610274916047645e-06, + "loss": 0.766, + "step": 8913 + }, + { + "epoch": 0.49061588419835983, + "grad_norm": 1.0083420276641846, + "learning_rate": 8.609975014811502e-06, + "loss": 0.7436, + "step": 8914 + }, + { + "epoch": 0.4906709230007155, + "grad_norm": 0.712664783000946, + "learning_rate": 8.609675086443752e-06, + "loss": 0.7891, + "step": 8915 + }, + { + "epoch": 0.49072596180307115, + "grad_norm": 0.7635206580162048, + "learning_rate": 8.609375130946651e-06, + "loss": 0.7842, + "step": 8916 + }, + { + "epoch": 0.49078100060542684, + "grad_norm": 0.7567723989486694, + "learning_rate": 8.609075148322452e-06, + "loss": 0.8435, + "step": 8917 + }, + { + "epoch": 0.4908360394077825, + "grad_norm": 0.8918718099594116, + "learning_rate": 8.60877513857341e-06, + "loss": 0.8015, + "step": 8918 + }, + { + "epoch": 0.49089107821013817, + "grad_norm": 0.8701914548873901, + "learning_rate": 8.608475101701781e-06, + "loss": 0.7806, + "step": 8919 + }, + { + "epoch": 0.4909461170124938, + "grad_norm": 0.7528215646743774, + "learning_rate": 8.608175037709819e-06, + "loss": 0.7958, + "step": 8920 + }, + { + "epoch": 0.4910011558148495, + "grad_norm": 0.7277387380599976, + "learning_rate": 8.60787494659978e-06, + "loss": 0.7878, + "step": 8921 + }, + { + "epoch": 0.4910561946172051, + "grad_norm": 0.6739892959594727, + "learning_rate": 8.607574828373917e-06, + "loss": 0.7212, + "step": 8922 + }, + { + "epoch": 0.4911112334195608, + "grad_norm": 0.712480366230011, + "learning_rate": 8.607274683034487e-06, + "loss": 0.7966, + "step": 8923 + }, + { + "epoch": 0.49116627222191644, + "grad_norm": 0.7192126512527466, + "learning_rate": 8.606974510583747e-06, + "loss": 0.7032, + "step": 8924 + }, + { + "epoch": 0.49122131102427213, + "grad_norm": 0.7502614855766296, + "learning_rate": 8.606674311023953e-06, + "loss": 0.7465, + "step": 8925 + }, + { + "epoch": 0.49127634982662777, + "grad_norm": 0.8475236892700195, + "learning_rate": 8.606374084357361e-06, + "loss": 0.8083, + "step": 8926 + }, + { + "epoch": 0.49133138862898346, + "grad_norm": 0.6972761750221252, + "learning_rate": 8.606073830586224e-06, + "loss": 0.7206, + "step": 8927 + }, + { + "epoch": 0.4913864274313391, + "grad_norm": 0.6209561824798584, + "learning_rate": 8.605773549712803e-06, + "loss": 0.6664, + "step": 8928 + }, + { + "epoch": 0.4914414662336948, + "grad_norm": 0.7905771732330322, + "learning_rate": 8.605473241739353e-06, + "loss": 0.7243, + "step": 8929 + }, + { + "epoch": 0.4914965050360504, + "grad_norm": 0.762959897518158, + "learning_rate": 8.605172906668131e-06, + "loss": 0.7747, + "step": 8930 + }, + { + "epoch": 0.4915515438384061, + "grad_norm": 0.7297530174255371, + "learning_rate": 8.604872544501394e-06, + "loss": 0.7441, + "step": 8931 + }, + { + "epoch": 0.49160658264076174, + "grad_norm": 0.6732318997383118, + "learning_rate": 8.6045721552414e-06, + "loss": 0.7621, + "step": 8932 + }, + { + "epoch": 0.4916616214431174, + "grad_norm": 0.7010045647621155, + "learning_rate": 8.604271738890407e-06, + "loss": 0.7971, + "step": 8933 + }, + { + "epoch": 0.49171666024547306, + "grad_norm": 0.6996648907661438, + "learning_rate": 8.603971295450672e-06, + "loss": 0.8119, + "step": 8934 + }, + { + "epoch": 0.4917716990478287, + "grad_norm": 0.7679941058158875, + "learning_rate": 8.603670824924456e-06, + "loss": 0.8035, + "step": 8935 + }, + { + "epoch": 0.4918267378501844, + "grad_norm": 0.8009630441665649, + "learning_rate": 8.603370327314011e-06, + "loss": 0.7817, + "step": 8936 + }, + { + "epoch": 0.49188177665254, + "grad_norm": 0.7167709469795227, + "learning_rate": 8.603069802621601e-06, + "loss": 0.7621, + "step": 8937 + }, + { + "epoch": 0.4919368154548957, + "grad_norm": 0.7447960376739502, + "learning_rate": 8.602769250849483e-06, + "loss": 0.7664, + "step": 8938 + }, + { + "epoch": 0.49199185425725134, + "grad_norm": 0.653131365776062, + "learning_rate": 8.602468671999915e-06, + "loss": 0.6927, + "step": 8939 + }, + { + "epoch": 0.492046893059607, + "grad_norm": 0.6758691072463989, + "learning_rate": 8.602168066075158e-06, + "loss": 0.7519, + "step": 8940 + }, + { + "epoch": 0.49210193186196266, + "grad_norm": 0.9186220765113831, + "learning_rate": 8.60186743307747e-06, + "loss": 0.7265, + "step": 8941 + }, + { + "epoch": 0.49215697066431835, + "grad_norm": 0.6781855225563049, + "learning_rate": 8.60156677300911e-06, + "loss": 0.6719, + "step": 8942 + }, + { + "epoch": 0.492212009466674, + "grad_norm": 0.7262865304946899, + "learning_rate": 8.601266085872336e-06, + "loss": 0.6449, + "step": 8943 + }, + { + "epoch": 0.4922670482690297, + "grad_norm": 0.6877585053443909, + "learning_rate": 8.600965371669411e-06, + "loss": 0.6999, + "step": 8944 + }, + { + "epoch": 0.4923220870713853, + "grad_norm": 1.1133443117141724, + "learning_rate": 8.600664630402596e-06, + "loss": 0.7842, + "step": 8945 + }, + { + "epoch": 0.492377125873741, + "grad_norm": 0.643478274345398, + "learning_rate": 8.600363862074149e-06, + "loss": 0.7009, + "step": 8946 + }, + { + "epoch": 0.49243216467609663, + "grad_norm": 0.7692574262619019, + "learning_rate": 8.600063066686331e-06, + "loss": 0.7777, + "step": 8947 + }, + { + "epoch": 0.4924872034784523, + "grad_norm": 0.884963870048523, + "learning_rate": 8.599762244241403e-06, + "loss": 0.7789, + "step": 8948 + }, + { + "epoch": 0.49254224228080795, + "grad_norm": 0.6918813586235046, + "learning_rate": 8.599461394741624e-06, + "loss": 0.7769, + "step": 8949 + }, + { + "epoch": 0.49259728108316364, + "grad_norm": 0.7432044148445129, + "learning_rate": 8.599160518189258e-06, + "loss": 0.7972, + "step": 8950 + }, + { + "epoch": 0.4926523198855193, + "grad_norm": 0.7530491948127747, + "learning_rate": 8.598859614586564e-06, + "loss": 0.8812, + "step": 8951 + }, + { + "epoch": 0.49270735868787496, + "grad_norm": 0.8738592267036438, + "learning_rate": 8.598558683935806e-06, + "loss": 0.6967, + "step": 8952 + }, + { + "epoch": 0.4927623974902306, + "grad_norm": 1.032084584236145, + "learning_rate": 8.598257726239242e-06, + "loss": 0.8513, + "step": 8953 + }, + { + "epoch": 0.4928174362925863, + "grad_norm": 0.8717961311340332, + "learning_rate": 8.597956741499136e-06, + "loss": 0.7703, + "step": 8954 + }, + { + "epoch": 0.4928724750949419, + "grad_norm": 0.6788356900215149, + "learning_rate": 8.597655729717753e-06, + "loss": 0.7649, + "step": 8955 + }, + { + "epoch": 0.4929275138972976, + "grad_norm": 1.0595613718032837, + "learning_rate": 8.59735469089735e-06, + "loss": 0.6967, + "step": 8956 + }, + { + "epoch": 0.49298255269965324, + "grad_norm": 0.7583820819854736, + "learning_rate": 8.597053625040193e-06, + "loss": 0.8384, + "step": 8957 + }, + { + "epoch": 0.49303759150200893, + "grad_norm": 0.7232168912887573, + "learning_rate": 8.596752532148545e-06, + "loss": 0.7643, + "step": 8958 + }, + { + "epoch": 0.49309263030436457, + "grad_norm": 0.727190375328064, + "learning_rate": 8.596451412224666e-06, + "loss": 0.845, + "step": 8959 + }, + { + "epoch": 0.49314766910672025, + "grad_norm": 0.6844252347946167, + "learning_rate": 8.596150265270821e-06, + "loss": 0.7099, + "step": 8960 + }, + { + "epoch": 0.4932027079090759, + "grad_norm": 0.7379910945892334, + "learning_rate": 8.595849091289275e-06, + "loss": 0.8168, + "step": 8961 + }, + { + "epoch": 0.4932577467114316, + "grad_norm": 0.77718186378479, + "learning_rate": 8.595547890282288e-06, + "loss": 0.8457, + "step": 8962 + }, + { + "epoch": 0.4933127855137872, + "grad_norm": 0.686126172542572, + "learning_rate": 8.595246662252127e-06, + "loss": 0.7918, + "step": 8963 + }, + { + "epoch": 0.4933678243161429, + "grad_norm": 0.7406145930290222, + "learning_rate": 8.594945407201051e-06, + "loss": 0.6866, + "step": 8964 + }, + { + "epoch": 0.49342286311849853, + "grad_norm": 0.9543277025222778, + "learning_rate": 8.594644125131331e-06, + "loss": 0.8444, + "step": 8965 + }, + { + "epoch": 0.4934779019208542, + "grad_norm": 0.8659517765045166, + "learning_rate": 8.594342816045228e-06, + "loss": 0.7661, + "step": 8966 + }, + { + "epoch": 0.49353294072320986, + "grad_norm": 0.7289552092552185, + "learning_rate": 8.594041479945005e-06, + "loss": 0.7734, + "step": 8967 + }, + { + "epoch": 0.49358797952556555, + "grad_norm": 0.7232840657234192, + "learning_rate": 8.59374011683293e-06, + "loss": 0.8557, + "step": 8968 + }, + { + "epoch": 0.4936430183279212, + "grad_norm": 0.738684356212616, + "learning_rate": 8.593438726711265e-06, + "loss": 0.7779, + "step": 8969 + }, + { + "epoch": 0.49369805713027687, + "grad_norm": 0.7486668229103088, + "learning_rate": 8.593137309582276e-06, + "loss": 0.7326, + "step": 8970 + }, + { + "epoch": 0.4937530959326325, + "grad_norm": 0.6564297080039978, + "learning_rate": 8.59283586544823e-06, + "loss": 0.6927, + "step": 8971 + }, + { + "epoch": 0.4938081347349882, + "grad_norm": 0.722540557384491, + "learning_rate": 8.592534394311392e-06, + "loss": 0.7254, + "step": 8972 + }, + { + "epoch": 0.4938631735373438, + "grad_norm": 0.7466141581535339, + "learning_rate": 8.592232896174026e-06, + "loss": 0.8551, + "step": 8973 + }, + { + "epoch": 0.4939182123396995, + "grad_norm": 0.7819109559059143, + "learning_rate": 8.591931371038398e-06, + "loss": 0.7271, + "step": 8974 + }, + { + "epoch": 0.49397325114205515, + "grad_norm": 0.7847672700881958, + "learning_rate": 8.591629818906776e-06, + "loss": 0.8404, + "step": 8975 + }, + { + "epoch": 0.49402828994441084, + "grad_norm": 0.8167426586151123, + "learning_rate": 8.591328239781428e-06, + "loss": 0.7375, + "step": 8976 + }, + { + "epoch": 0.49408332874676647, + "grad_norm": 0.7894755005836487, + "learning_rate": 8.591026633664615e-06, + "loss": 0.7872, + "step": 8977 + }, + { + "epoch": 0.4941383675491221, + "grad_norm": 0.726204514503479, + "learning_rate": 8.590725000558609e-06, + "loss": 0.7289, + "step": 8978 + }, + { + "epoch": 0.4941934063514778, + "grad_norm": 0.7116577625274658, + "learning_rate": 8.590423340465675e-06, + "loss": 0.7379, + "step": 8979 + }, + { + "epoch": 0.4942484451538334, + "grad_norm": 0.7302193641662598, + "learning_rate": 8.59012165338808e-06, + "loss": 0.7951, + "step": 8980 + }, + { + "epoch": 0.4943034839561891, + "grad_norm": 0.680555522441864, + "learning_rate": 8.58981993932809e-06, + "loss": 0.7609, + "step": 8981 + }, + { + "epoch": 0.49435852275854475, + "grad_norm": 0.874546229839325, + "learning_rate": 8.589518198287976e-06, + "loss": 0.8025, + "step": 8982 + }, + { + "epoch": 0.49441356156090044, + "grad_norm": 0.7164583206176758, + "learning_rate": 8.589216430270004e-06, + "loss": 0.7466, + "step": 8983 + }, + { + "epoch": 0.49446860036325607, + "grad_norm": 0.9155141115188599, + "learning_rate": 8.588914635276442e-06, + "loss": 0.7896, + "step": 8984 + }, + { + "epoch": 0.49452363916561176, + "grad_norm": 0.6777059435844421, + "learning_rate": 8.588612813309558e-06, + "loss": 0.7468, + "step": 8985 + }, + { + "epoch": 0.4945786779679674, + "grad_norm": 0.7100371718406677, + "learning_rate": 8.58831096437162e-06, + "loss": 0.7216, + "step": 8986 + }, + { + "epoch": 0.4946337167703231, + "grad_norm": 0.6842584609985352, + "learning_rate": 8.5880090884649e-06, + "loss": 0.7103, + "step": 8987 + }, + { + "epoch": 0.4946887555726787, + "grad_norm": 0.6347573399543762, + "learning_rate": 8.587707185591661e-06, + "loss": 0.7103, + "step": 8988 + }, + { + "epoch": 0.4947437943750344, + "grad_norm": 0.7175829410552979, + "learning_rate": 8.587405255754177e-06, + "loss": 0.8375, + "step": 8989 + }, + { + "epoch": 0.49479883317739004, + "grad_norm": 0.8402735590934753, + "learning_rate": 8.587103298954715e-06, + "loss": 0.6841, + "step": 8990 + }, + { + "epoch": 0.49485387197974573, + "grad_norm": 0.6988743543624878, + "learning_rate": 8.586801315195545e-06, + "loss": 0.7637, + "step": 8991 + }, + { + "epoch": 0.49490891078210136, + "grad_norm": 0.6672561168670654, + "learning_rate": 8.586499304478934e-06, + "loss": 0.7103, + "step": 8992 + }, + { + "epoch": 0.49496394958445705, + "grad_norm": 0.6821330189704895, + "learning_rate": 8.586197266807158e-06, + "loss": 0.6881, + "step": 8993 + }, + { + "epoch": 0.4950189883868127, + "grad_norm": 0.7886170744895935, + "learning_rate": 8.585895202182482e-06, + "loss": 0.7892, + "step": 8994 + }, + { + "epoch": 0.4950740271891684, + "grad_norm": 0.7348074913024902, + "learning_rate": 8.585593110607177e-06, + "loss": 0.7835, + "step": 8995 + }, + { + "epoch": 0.495129065991524, + "grad_norm": 0.9375506639480591, + "learning_rate": 8.585290992083514e-06, + "loss": 0.8017, + "step": 8996 + }, + { + "epoch": 0.4951841047938797, + "grad_norm": 0.7442331910133362, + "learning_rate": 8.584988846613765e-06, + "loss": 0.72, + "step": 8997 + }, + { + "epoch": 0.49523914359623533, + "grad_norm": 0.7347918748855591, + "learning_rate": 8.584686674200197e-06, + "loss": 0.8229, + "step": 8998 + }, + { + "epoch": 0.495294182398591, + "grad_norm": 0.7168740630149841, + "learning_rate": 8.584384474845084e-06, + "loss": 0.7288, + "step": 8999 + }, + { + "epoch": 0.49534922120094665, + "grad_norm": 0.7834853529930115, + "learning_rate": 8.584082248550697e-06, + "loss": 0.8521, + "step": 9000 + }, + { + "epoch": 0.49540426000330234, + "grad_norm": 0.6499035358428955, + "learning_rate": 8.58377999531931e-06, + "loss": 0.6887, + "step": 9001 + }, + { + "epoch": 0.495459298805658, + "grad_norm": 0.8000181913375854, + "learning_rate": 8.583477715153189e-06, + "loss": 0.8688, + "step": 9002 + }, + { + "epoch": 0.49551433760801367, + "grad_norm": 0.7539342045783997, + "learning_rate": 8.58317540805461e-06, + "loss": 0.6151, + "step": 9003 + }, + { + "epoch": 0.4955693764103693, + "grad_norm": 0.7677812576293945, + "learning_rate": 8.582873074025841e-06, + "loss": 0.8168, + "step": 9004 + }, + { + "epoch": 0.495624415212725, + "grad_norm": 0.7679157853126526, + "learning_rate": 8.58257071306916e-06, + "loss": 0.7719, + "step": 9005 + }, + { + "epoch": 0.4956794540150806, + "grad_norm": 0.9745703935623169, + "learning_rate": 8.582268325186836e-06, + "loss": 0.8272, + "step": 9006 + }, + { + "epoch": 0.4957344928174363, + "grad_norm": 0.66932612657547, + "learning_rate": 8.581965910381143e-06, + "loss": 0.7256, + "step": 9007 + }, + { + "epoch": 0.49578953161979195, + "grad_norm": 0.7630981206893921, + "learning_rate": 8.581663468654351e-06, + "loss": 0.7594, + "step": 9008 + }, + { + "epoch": 0.49584457042214763, + "grad_norm": 0.7420778870582581, + "learning_rate": 8.581361000008737e-06, + "loss": 0.7834, + "step": 9009 + }, + { + "epoch": 0.49589960922450327, + "grad_norm": 0.6775205731391907, + "learning_rate": 8.58105850444657e-06, + "loss": 0.7609, + "step": 9010 + }, + { + "epoch": 0.49595464802685896, + "grad_norm": 0.6588264107704163, + "learning_rate": 8.580755981970128e-06, + "loss": 0.805, + "step": 9011 + }, + { + "epoch": 0.4960096868292146, + "grad_norm": 0.7325689196586609, + "learning_rate": 8.580453432581681e-06, + "loss": 0.8817, + "step": 9012 + }, + { + "epoch": 0.4960647256315703, + "grad_norm": 0.7319273948669434, + "learning_rate": 8.580150856283505e-06, + "loss": 0.8001, + "step": 9013 + }, + { + "epoch": 0.4961197644339259, + "grad_norm": 0.7841789126396179, + "learning_rate": 8.579848253077875e-06, + "loss": 0.8415, + "step": 9014 + }, + { + "epoch": 0.4961748032362816, + "grad_norm": 0.7593979239463806, + "learning_rate": 8.579545622967062e-06, + "loss": 0.8238, + "step": 9015 + }, + { + "epoch": 0.49622984203863724, + "grad_norm": 0.6938808560371399, + "learning_rate": 8.579242965953343e-06, + "loss": 0.7325, + "step": 9016 + }, + { + "epoch": 0.4962848808409929, + "grad_norm": 0.7907594442367554, + "learning_rate": 8.578940282038993e-06, + "loss": 0.6947, + "step": 9017 + }, + { + "epoch": 0.49633991964334856, + "grad_norm": 0.708703875541687, + "learning_rate": 8.578637571226283e-06, + "loss": 0.6712, + "step": 9018 + }, + { + "epoch": 0.49639495844570425, + "grad_norm": 0.6820377707481384, + "learning_rate": 8.578334833517492e-06, + "loss": 0.7269, + "step": 9019 + }, + { + "epoch": 0.4964499972480599, + "grad_norm": 0.6858653426170349, + "learning_rate": 8.578032068914896e-06, + "loss": 0.7325, + "step": 9020 + }, + { + "epoch": 0.4965050360504155, + "grad_norm": 0.8758736848831177, + "learning_rate": 8.577729277420768e-06, + "loss": 0.6652, + "step": 9021 + }, + { + "epoch": 0.4965600748527712, + "grad_norm": 0.731316328048706, + "learning_rate": 8.577426459037383e-06, + "loss": 0.7835, + "step": 9022 + }, + { + "epoch": 0.49661511365512684, + "grad_norm": 0.813778817653656, + "learning_rate": 8.57712361376702e-06, + "loss": 0.8025, + "step": 9023 + }, + { + "epoch": 0.4966701524574825, + "grad_norm": 0.7167351841926575, + "learning_rate": 8.576820741611952e-06, + "loss": 0.7483, + "step": 9024 + }, + { + "epoch": 0.49672519125983816, + "grad_norm": 0.7243192791938782, + "learning_rate": 8.576517842574457e-06, + "loss": 0.8411, + "step": 9025 + }, + { + "epoch": 0.49678023006219385, + "grad_norm": 0.5869036316871643, + "learning_rate": 8.576214916656814e-06, + "loss": 0.6661, + "step": 9026 + }, + { + "epoch": 0.4968352688645495, + "grad_norm": 0.7502203583717346, + "learning_rate": 8.575911963861293e-06, + "loss": 0.8838, + "step": 9027 + }, + { + "epoch": 0.4968903076669052, + "grad_norm": 0.687562108039856, + "learning_rate": 8.575608984190177e-06, + "loss": 0.7446, + "step": 9028 + }, + { + "epoch": 0.4969453464692608, + "grad_norm": 0.7735342383384705, + "learning_rate": 8.57530597764574e-06, + "loss": 0.8464, + "step": 9029 + }, + { + "epoch": 0.4970003852716165, + "grad_norm": 0.7828487753868103, + "learning_rate": 8.575002944230261e-06, + "loss": 0.7504, + "step": 9030 + }, + { + "epoch": 0.49705542407397213, + "grad_norm": 0.6359286904335022, + "learning_rate": 8.574699883946018e-06, + "loss": 0.6805, + "step": 9031 + }, + { + "epoch": 0.4971104628763278, + "grad_norm": 0.7462830543518066, + "learning_rate": 8.574396796795285e-06, + "loss": 0.8317, + "step": 9032 + }, + { + "epoch": 0.49716550167868345, + "grad_norm": 0.705115795135498, + "learning_rate": 8.574093682780344e-06, + "loss": 0.7401, + "step": 9033 + }, + { + "epoch": 0.49722054048103914, + "grad_norm": 0.6466538310050964, + "learning_rate": 8.573790541903472e-06, + "loss": 0.7761, + "step": 9034 + }, + { + "epoch": 0.4972755792833948, + "grad_norm": 0.7479867339134216, + "learning_rate": 8.573487374166946e-06, + "loss": 0.8394, + "step": 9035 + }, + { + "epoch": 0.49733061808575046, + "grad_norm": 0.7378019094467163, + "learning_rate": 8.573184179573046e-06, + "loss": 0.8215, + "step": 9036 + }, + { + "epoch": 0.4973856568881061, + "grad_norm": 0.6526094675064087, + "learning_rate": 8.57288095812405e-06, + "loss": 0.8055, + "step": 9037 + }, + { + "epoch": 0.4974406956904618, + "grad_norm": 0.679595947265625, + "learning_rate": 8.572577709822238e-06, + "loss": 0.8241, + "step": 9038 + }, + { + "epoch": 0.4974957344928174, + "grad_norm": 0.753466010093689, + "learning_rate": 8.572274434669886e-06, + "loss": 0.896, + "step": 9039 + }, + { + "epoch": 0.4975507732951731, + "grad_norm": 0.7068368792533875, + "learning_rate": 8.571971132669277e-06, + "loss": 0.778, + "step": 9040 + }, + { + "epoch": 0.49760581209752874, + "grad_norm": 0.7397973537445068, + "learning_rate": 8.571667803822689e-06, + "loss": 0.782, + "step": 9041 + }, + { + "epoch": 0.49766085089988443, + "grad_norm": 0.7837033271789551, + "learning_rate": 8.571364448132402e-06, + "loss": 0.7509, + "step": 9042 + }, + { + "epoch": 0.49771588970224007, + "grad_norm": 0.6808765530586243, + "learning_rate": 8.571061065600696e-06, + "loss": 0.672, + "step": 9043 + }, + { + "epoch": 0.49777092850459576, + "grad_norm": 0.6574100255966187, + "learning_rate": 8.570757656229852e-06, + "loss": 0.751, + "step": 9044 + }, + { + "epoch": 0.4978259673069514, + "grad_norm": 0.7357671856880188, + "learning_rate": 8.570454220022146e-06, + "loss": 0.7977, + "step": 9045 + }, + { + "epoch": 0.4978810061093071, + "grad_norm": 0.7937216758728027, + "learning_rate": 8.570150756979865e-06, + "loss": 0.8151, + "step": 9046 + }, + { + "epoch": 0.4979360449116627, + "grad_norm": 0.7050907611846924, + "learning_rate": 8.569847267105285e-06, + "loss": 0.7667, + "step": 9047 + }, + { + "epoch": 0.4979910837140184, + "grad_norm": 0.7105300426483154, + "learning_rate": 8.569543750400688e-06, + "loss": 0.7031, + "step": 9048 + }, + { + "epoch": 0.49804612251637403, + "grad_norm": 0.7174646854400635, + "learning_rate": 8.569240206868358e-06, + "loss": 0.7692, + "step": 9049 + }, + { + "epoch": 0.4981011613187297, + "grad_norm": 0.7525906562805176, + "learning_rate": 8.568936636510573e-06, + "loss": 0.7584, + "step": 9050 + }, + { + "epoch": 0.49815620012108536, + "grad_norm": 1.5518100261688232, + "learning_rate": 8.568633039329615e-06, + "loss": 0.7932, + "step": 9051 + }, + { + "epoch": 0.49821123892344105, + "grad_norm": 0.7037720084190369, + "learning_rate": 8.568329415327766e-06, + "loss": 0.8345, + "step": 9052 + }, + { + "epoch": 0.4982662777257967, + "grad_norm": 0.6422694325447083, + "learning_rate": 8.568025764507308e-06, + "loss": 0.7396, + "step": 9053 + }, + { + "epoch": 0.49832131652815237, + "grad_norm": 0.777306854724884, + "learning_rate": 8.567722086870525e-06, + "loss": 0.8605, + "step": 9054 + }, + { + "epoch": 0.498376355330508, + "grad_norm": 0.6619865298271179, + "learning_rate": 8.567418382419697e-06, + "loss": 0.7395, + "step": 9055 + }, + { + "epoch": 0.4984313941328637, + "grad_norm": 0.7214456796646118, + "learning_rate": 8.567114651157106e-06, + "loss": 0.7932, + "step": 9056 + }, + { + "epoch": 0.4984864329352193, + "grad_norm": 0.75806725025177, + "learning_rate": 8.566810893085037e-06, + "loss": 0.7998, + "step": 9057 + }, + { + "epoch": 0.498541471737575, + "grad_norm": 0.8089895844459534, + "learning_rate": 8.566507108205773e-06, + "loss": 0.7849, + "step": 9058 + }, + { + "epoch": 0.49859651053993065, + "grad_norm": 0.817814290523529, + "learning_rate": 8.566203296521597e-06, + "loss": 0.7261, + "step": 9059 + }, + { + "epoch": 0.49865154934228634, + "grad_norm": 0.7417539954185486, + "learning_rate": 8.56589945803479e-06, + "loss": 0.7087, + "step": 9060 + }, + { + "epoch": 0.49870658814464197, + "grad_norm": 0.7518000602722168, + "learning_rate": 8.565595592747639e-06, + "loss": 0.7245, + "step": 9061 + }, + { + "epoch": 0.49876162694699766, + "grad_norm": 0.9537304043769836, + "learning_rate": 8.565291700662423e-06, + "loss": 0.901, + "step": 9062 + }, + { + "epoch": 0.4988166657493533, + "grad_norm": 0.784545361995697, + "learning_rate": 8.56498778178143e-06, + "loss": 0.7813, + "step": 9063 + }, + { + "epoch": 0.4988717045517089, + "grad_norm": 0.9218429923057556, + "learning_rate": 8.564683836106945e-06, + "loss": 0.8452, + "step": 9064 + }, + { + "epoch": 0.4989267433540646, + "grad_norm": 0.6902065277099609, + "learning_rate": 8.56437986364125e-06, + "loss": 0.7527, + "step": 9065 + }, + { + "epoch": 0.49898178215642025, + "grad_norm": 0.7388677000999451, + "learning_rate": 8.56407586438663e-06, + "loss": 0.82, + "step": 9066 + }, + { + "epoch": 0.49903682095877594, + "grad_norm": 0.6959313154220581, + "learning_rate": 8.563771838345369e-06, + "loss": 0.7274, + "step": 9067 + }, + { + "epoch": 0.4990918597611316, + "grad_norm": 0.6582610607147217, + "learning_rate": 8.563467785519753e-06, + "loss": 0.6518, + "step": 9068 + }, + { + "epoch": 0.49914689856348726, + "grad_norm": 0.6525924801826477, + "learning_rate": 8.563163705912066e-06, + "loss": 0.7006, + "step": 9069 + }, + { + "epoch": 0.4992019373658429, + "grad_norm": 0.8092843890190125, + "learning_rate": 8.562859599524596e-06, + "loss": 0.6915, + "step": 9070 + }, + { + "epoch": 0.4992569761681986, + "grad_norm": 0.6540575623512268, + "learning_rate": 8.562555466359626e-06, + "loss": 0.6729, + "step": 9071 + }, + { + "epoch": 0.4993120149705542, + "grad_norm": 0.8220445513725281, + "learning_rate": 8.562251306419443e-06, + "loss": 0.8172, + "step": 9072 + }, + { + "epoch": 0.4993670537729099, + "grad_norm": 0.7461502552032471, + "learning_rate": 8.561947119706334e-06, + "loss": 0.6902, + "step": 9073 + }, + { + "epoch": 0.49942209257526554, + "grad_norm": 0.8166316151618958, + "learning_rate": 8.56164290622258e-06, + "loss": 0.8238, + "step": 9074 + }, + { + "epoch": 0.49947713137762123, + "grad_norm": 0.8453896641731262, + "learning_rate": 8.561338665970476e-06, + "loss": 0.7697, + "step": 9075 + }, + { + "epoch": 0.49953217017997686, + "grad_norm": 0.7606340050697327, + "learning_rate": 8.5610343989523e-06, + "loss": 0.6951, + "step": 9076 + }, + { + "epoch": 0.49958720898233255, + "grad_norm": 0.7408013343811035, + "learning_rate": 8.560730105170345e-06, + "loss": 0.8298, + "step": 9077 + }, + { + "epoch": 0.4996422477846882, + "grad_norm": 0.7625541090965271, + "learning_rate": 8.560425784626896e-06, + "loss": 0.6738, + "step": 9078 + }, + { + "epoch": 0.4996972865870439, + "grad_norm": 0.6940996646881104, + "learning_rate": 8.560121437324238e-06, + "loss": 0.78, + "step": 9079 + }, + { + "epoch": 0.4997523253893995, + "grad_norm": 0.8087461590766907, + "learning_rate": 8.559817063264661e-06, + "loss": 0.7831, + "step": 9080 + }, + { + "epoch": 0.4998073641917552, + "grad_norm": 0.7418510317802429, + "learning_rate": 8.559512662450452e-06, + "loss": 0.801, + "step": 9081 + }, + { + "epoch": 0.49986240299411083, + "grad_norm": 0.6793946027755737, + "learning_rate": 8.5592082348839e-06, + "loss": 0.7329, + "step": 9082 + }, + { + "epoch": 0.4999174417964665, + "grad_norm": 0.8197429180145264, + "learning_rate": 8.55890378056729e-06, + "loss": 0.804, + "step": 9083 + }, + { + "epoch": 0.49997248059882216, + "grad_norm": 0.7526460886001587, + "learning_rate": 8.558599299502912e-06, + "loss": 0.8378, + "step": 9084 + }, + { + "epoch": 0.5000275194011778, + "grad_norm": 0.8169133067131042, + "learning_rate": 8.558294791693055e-06, + "loss": 0.828, + "step": 9085 + }, + { + "epoch": 0.5000825582035335, + "grad_norm": 0.8386932015419006, + "learning_rate": 8.557990257140007e-06, + "loss": 0.7961, + "step": 9086 + }, + { + "epoch": 0.5001375970058891, + "grad_norm": 0.7183443903923035, + "learning_rate": 8.557685695846057e-06, + "loss": 0.6964, + "step": 9087 + }, + { + "epoch": 0.5001926358082448, + "grad_norm": 0.77079176902771, + "learning_rate": 8.557381107813491e-06, + "loss": 0.8222, + "step": 9088 + }, + { + "epoch": 0.5002476746106005, + "grad_norm": 0.6519342660903931, + "learning_rate": 8.557076493044603e-06, + "loss": 0.772, + "step": 9089 + }, + { + "epoch": 0.5003027134129562, + "grad_norm": 0.7039975523948669, + "learning_rate": 8.556771851541678e-06, + "loss": 0.7491, + "step": 9090 + }, + { + "epoch": 0.5003577522153118, + "grad_norm": 0.6459039449691772, + "learning_rate": 8.556467183307012e-06, + "loss": 0.7104, + "step": 9091 + }, + { + "epoch": 0.5004127910176674, + "grad_norm": 0.7359183430671692, + "learning_rate": 8.556162488342887e-06, + "loss": 0.829, + "step": 9092 + }, + { + "epoch": 0.5004678298200231, + "grad_norm": 0.7029602527618408, + "learning_rate": 8.555857766651599e-06, + "loss": 0.8163, + "step": 9093 + }, + { + "epoch": 0.5005228686223788, + "grad_norm": 0.6687049865722656, + "learning_rate": 8.555553018235435e-06, + "loss": 0.7589, + "step": 9094 + }, + { + "epoch": 0.5005779074247344, + "grad_norm": 0.7277147173881531, + "learning_rate": 8.555248243096686e-06, + "loss": 0.8334, + "step": 9095 + }, + { + "epoch": 0.5006329462270901, + "grad_norm": 0.6512065529823303, + "learning_rate": 8.554943441237642e-06, + "loss": 0.7174, + "step": 9096 + }, + { + "epoch": 0.5006879850294458, + "grad_norm": 0.725351095199585, + "learning_rate": 8.554638612660594e-06, + "loss": 0.6514, + "step": 9097 + }, + { + "epoch": 0.5007430238318015, + "grad_norm": 0.7983208894729614, + "learning_rate": 8.554333757367836e-06, + "loss": 0.8385, + "step": 9098 + }, + { + "epoch": 0.500798062634157, + "grad_norm": 0.6631388068199158, + "learning_rate": 8.554028875361657e-06, + "loss": 0.7103, + "step": 9099 + }, + { + "epoch": 0.5008531014365127, + "grad_norm": 0.730421245098114, + "learning_rate": 8.553723966644347e-06, + "loss": 0.8005, + "step": 9100 + }, + { + "epoch": 0.5009081402388684, + "grad_norm": 0.7385838627815247, + "learning_rate": 8.5534190312182e-06, + "loss": 0.7586, + "step": 9101 + }, + { + "epoch": 0.5009631790412241, + "grad_norm": 0.712458610534668, + "learning_rate": 8.553114069085506e-06, + "loss": 0.7587, + "step": 9102 + }, + { + "epoch": 0.5010182178435797, + "grad_norm": 0.7393542528152466, + "learning_rate": 8.552809080248559e-06, + "loss": 0.746, + "step": 9103 + }, + { + "epoch": 0.5010732566459354, + "grad_norm": 0.6596370935440063, + "learning_rate": 8.552504064709649e-06, + "loss": 0.6968, + "step": 9104 + }, + { + "epoch": 0.5011282954482911, + "grad_norm": 0.7340545654296875, + "learning_rate": 8.552199022471069e-06, + "loss": 0.8326, + "step": 9105 + }, + { + "epoch": 0.5011833342506467, + "grad_norm": 0.6586140990257263, + "learning_rate": 8.55189395353511e-06, + "loss": 0.7144, + "step": 9106 + }, + { + "epoch": 0.5012383730530023, + "grad_norm": 0.6875959038734436, + "learning_rate": 8.551588857904071e-06, + "loss": 0.721, + "step": 9107 + }, + { + "epoch": 0.501293411855358, + "grad_norm": 0.6754499077796936, + "learning_rate": 8.551283735580238e-06, + "loss": 0.6771, + "step": 9108 + }, + { + "epoch": 0.5013484506577137, + "grad_norm": 0.8027325868606567, + "learning_rate": 8.55097858656591e-06, + "loss": 0.8196, + "step": 9109 + }, + { + "epoch": 0.5014034894600693, + "grad_norm": 0.6992260217666626, + "learning_rate": 8.550673410863376e-06, + "loss": 0.7923, + "step": 9110 + }, + { + "epoch": 0.501458528262425, + "grad_norm": 0.741205632686615, + "learning_rate": 8.550368208474928e-06, + "loss": 0.7036, + "step": 9111 + }, + { + "epoch": 0.5015135670647807, + "grad_norm": 0.6485981345176697, + "learning_rate": 8.550062979402866e-06, + "loss": 0.6351, + "step": 9112 + }, + { + "epoch": 0.5015686058671364, + "grad_norm": 0.6984226703643799, + "learning_rate": 8.549757723649481e-06, + "loss": 0.7714, + "step": 9113 + }, + { + "epoch": 0.5016236446694919, + "grad_norm": 0.7773998975753784, + "learning_rate": 8.549452441217067e-06, + "loss": 0.8901, + "step": 9114 + }, + { + "epoch": 0.5016786834718476, + "grad_norm": 0.6912227272987366, + "learning_rate": 8.549147132107918e-06, + "loss": 0.7702, + "step": 9115 + }, + { + "epoch": 0.5017337222742033, + "grad_norm": 0.6742583513259888, + "learning_rate": 8.54884179632433e-06, + "loss": 0.7789, + "step": 9116 + }, + { + "epoch": 0.501788761076559, + "grad_norm": 0.7896195650100708, + "learning_rate": 8.548536433868595e-06, + "loss": 0.7358, + "step": 9117 + }, + { + "epoch": 0.5018437998789146, + "grad_norm": 0.7112523913383484, + "learning_rate": 8.548231044743011e-06, + "loss": 0.7286, + "step": 9118 + }, + { + "epoch": 0.5018988386812703, + "grad_norm": 0.9162774085998535, + "learning_rate": 8.547925628949873e-06, + "loss": 0.935, + "step": 9119 + }, + { + "epoch": 0.501953877483626, + "grad_norm": 0.6319599747657776, + "learning_rate": 8.547620186491477e-06, + "loss": 0.625, + "step": 9120 + }, + { + "epoch": 0.5020089162859817, + "grad_norm": 0.7074719667434692, + "learning_rate": 8.547314717370115e-06, + "loss": 0.6614, + "step": 9121 + }, + { + "epoch": 0.5020639550883372, + "grad_norm": 0.7417262196540833, + "learning_rate": 8.547009221588086e-06, + "loss": 0.8476, + "step": 9122 + }, + { + "epoch": 0.5021189938906929, + "grad_norm": 0.7057339549064636, + "learning_rate": 8.546703699147685e-06, + "loss": 0.805, + "step": 9123 + }, + { + "epoch": 0.5021740326930486, + "grad_norm": 0.7420887351036072, + "learning_rate": 8.546398150051207e-06, + "loss": 0.7331, + "step": 9124 + }, + { + "epoch": 0.5022290714954043, + "grad_norm": 0.9526195526123047, + "learning_rate": 8.546092574300953e-06, + "loss": 0.7803, + "step": 9125 + }, + { + "epoch": 0.5022841102977599, + "grad_norm": 0.748130202293396, + "learning_rate": 8.545786971899214e-06, + "loss": 0.7998, + "step": 9126 + }, + { + "epoch": 0.5023391491001156, + "grad_norm": 0.7266026139259338, + "learning_rate": 8.545481342848289e-06, + "loss": 0.8377, + "step": 9127 + }, + { + "epoch": 0.5023941879024713, + "grad_norm": 0.6762456893920898, + "learning_rate": 8.545175687150478e-06, + "loss": 0.7312, + "step": 9128 + }, + { + "epoch": 0.502449226704827, + "grad_norm": 0.7011429667472839, + "learning_rate": 8.544870004808072e-06, + "loss": 0.7666, + "step": 9129 + }, + { + "epoch": 0.5025042655071825, + "grad_norm": 0.6652229428291321, + "learning_rate": 8.544564295823375e-06, + "loss": 0.6904, + "step": 9130 + }, + { + "epoch": 0.5025593043095382, + "grad_norm": 0.8333765268325806, + "learning_rate": 8.54425856019868e-06, + "loss": 0.7318, + "step": 9131 + }, + { + "epoch": 0.5026143431118939, + "grad_norm": 0.6827245950698853, + "learning_rate": 8.543952797936285e-06, + "loss": 0.7692, + "step": 9132 + }, + { + "epoch": 0.5026693819142496, + "grad_norm": 0.8744323253631592, + "learning_rate": 8.543647009038491e-06, + "loss": 0.7316, + "step": 9133 + }, + { + "epoch": 0.5027244207166052, + "grad_norm": 0.7024276852607727, + "learning_rate": 8.543341193507594e-06, + "loss": 0.7008, + "step": 9134 + }, + { + "epoch": 0.5027794595189609, + "grad_norm": 0.8786055445671082, + "learning_rate": 8.543035351345895e-06, + "loss": 0.7054, + "step": 9135 + }, + { + "epoch": 0.5028344983213165, + "grad_norm": 0.727924108505249, + "learning_rate": 8.54272948255569e-06, + "loss": 0.8049, + "step": 9136 + }, + { + "epoch": 0.5028895371236722, + "grad_norm": 0.8366256356239319, + "learning_rate": 8.542423587139277e-06, + "loss": 0.7926, + "step": 9137 + }, + { + "epoch": 0.5029445759260278, + "grad_norm": 0.7657913565635681, + "learning_rate": 8.542117665098958e-06, + "loss": 0.8152, + "step": 9138 + }, + { + "epoch": 0.5029996147283835, + "grad_norm": 0.7543498277664185, + "learning_rate": 8.54181171643703e-06, + "loss": 0.7566, + "step": 9139 + }, + { + "epoch": 0.5030546535307392, + "grad_norm": 0.7771349549293518, + "learning_rate": 8.541505741155794e-06, + "loss": 0.7907, + "step": 9140 + }, + { + "epoch": 0.5031096923330949, + "grad_norm": 0.6661877632141113, + "learning_rate": 8.541199739257548e-06, + "loss": 0.7481, + "step": 9141 + }, + { + "epoch": 0.5031647311354505, + "grad_norm": 0.7700417637825012, + "learning_rate": 8.540893710744593e-06, + "loss": 0.7544, + "step": 9142 + }, + { + "epoch": 0.5032197699378061, + "grad_norm": 0.6476640105247498, + "learning_rate": 8.54058765561923e-06, + "loss": 0.7221, + "step": 9143 + }, + { + "epoch": 0.5032748087401618, + "grad_norm": 0.7098944187164307, + "learning_rate": 8.540281573883755e-06, + "loss": 0.8083, + "step": 9144 + }, + { + "epoch": 0.5033298475425175, + "grad_norm": 0.9733545184135437, + "learning_rate": 8.539975465540473e-06, + "loss": 0.7381, + "step": 9145 + }, + { + "epoch": 0.5033848863448731, + "grad_norm": 0.641211986541748, + "learning_rate": 8.539669330591685e-06, + "loss": 0.7511, + "step": 9146 + }, + { + "epoch": 0.5034399251472288, + "grad_norm": 0.626027524471283, + "learning_rate": 8.539363169039687e-06, + "loss": 0.7321, + "step": 9147 + }, + { + "epoch": 0.5034949639495845, + "grad_norm": 0.7627241611480713, + "learning_rate": 8.539056980886785e-06, + "loss": 0.7269, + "step": 9148 + }, + { + "epoch": 0.5035500027519401, + "grad_norm": 0.6711145639419556, + "learning_rate": 8.538750766135275e-06, + "loss": 0.8179, + "step": 9149 + }, + { + "epoch": 0.5036050415542958, + "grad_norm": 0.6981950998306274, + "learning_rate": 8.538444524787463e-06, + "loss": 0.8095, + "step": 9150 + }, + { + "epoch": 0.5036600803566514, + "grad_norm": 0.8869871497154236, + "learning_rate": 8.53813825684565e-06, + "loss": 0.8549, + "step": 9151 + }, + { + "epoch": 0.5037151191590071, + "grad_norm": 0.6461544036865234, + "learning_rate": 8.537831962312137e-06, + "loss": 0.7388, + "step": 9152 + }, + { + "epoch": 0.5037701579613627, + "grad_norm": 0.8279222249984741, + "learning_rate": 8.537525641189224e-06, + "loss": 0.8609, + "step": 9153 + }, + { + "epoch": 0.5038251967637184, + "grad_norm": 0.7117578387260437, + "learning_rate": 8.537219293479217e-06, + "loss": 0.802, + "step": 9154 + }, + { + "epoch": 0.5038802355660741, + "grad_norm": 0.6831860542297363, + "learning_rate": 8.536912919184416e-06, + "loss": 0.7821, + "step": 9155 + }, + { + "epoch": 0.5039352743684298, + "grad_norm": 1.1528539657592773, + "learning_rate": 8.536606518307125e-06, + "loss": 0.8578, + "step": 9156 + }, + { + "epoch": 0.5039903131707854, + "grad_norm": 0.6545060873031616, + "learning_rate": 8.536300090849645e-06, + "loss": 0.7744, + "step": 9157 + }, + { + "epoch": 0.504045351973141, + "grad_norm": 0.7176601886749268, + "learning_rate": 8.535993636814281e-06, + "loss": 0.8104, + "step": 9158 + }, + { + "epoch": 0.5041003907754967, + "grad_norm": 0.8458410501480103, + "learning_rate": 8.535687156203334e-06, + "loss": 0.8653, + "step": 9159 + }, + { + "epoch": 0.5041554295778524, + "grad_norm": 0.7500274777412415, + "learning_rate": 8.53538064901911e-06, + "loss": 0.8043, + "step": 9160 + }, + { + "epoch": 0.504210468380208, + "grad_norm": 0.6982965469360352, + "learning_rate": 8.535074115263911e-06, + "loss": 0.7564, + "step": 9161 + }, + { + "epoch": 0.5042655071825637, + "grad_norm": 0.8344218134880066, + "learning_rate": 8.534767554940042e-06, + "loss": 0.7575, + "step": 9162 + }, + { + "epoch": 0.5043205459849194, + "grad_norm": 0.7527137398719788, + "learning_rate": 8.534460968049806e-06, + "loss": 0.7757, + "step": 9163 + }, + { + "epoch": 0.5043755847872751, + "grad_norm": 0.7136969566345215, + "learning_rate": 8.534154354595508e-06, + "loss": 0.826, + "step": 9164 + }, + { + "epoch": 0.5044306235896306, + "grad_norm": 0.8102819919586182, + "learning_rate": 8.533847714579449e-06, + "loss": 0.7247, + "step": 9165 + }, + { + "epoch": 0.5044856623919863, + "grad_norm": 0.7568309903144836, + "learning_rate": 8.53354104800394e-06, + "loss": 0.8509, + "step": 9166 + }, + { + "epoch": 0.504540701194342, + "grad_norm": 0.7719592452049255, + "learning_rate": 8.53323435487128e-06, + "loss": 0.8039, + "step": 9167 + }, + { + "epoch": 0.5045957399966977, + "grad_norm": 0.7514411807060242, + "learning_rate": 8.532927635183778e-06, + "loss": 0.8759, + "step": 9168 + }, + { + "epoch": 0.5046507787990533, + "grad_norm": 0.9781903028488159, + "learning_rate": 8.532620888943736e-06, + "loss": 0.8022, + "step": 9169 + }, + { + "epoch": 0.504705817601409, + "grad_norm": 0.7713304758071899, + "learning_rate": 8.532314116153462e-06, + "loss": 0.8372, + "step": 9170 + }, + { + "epoch": 0.5047608564037647, + "grad_norm": 0.7519709467887878, + "learning_rate": 8.53200731681526e-06, + "loss": 0.7374, + "step": 9171 + }, + { + "epoch": 0.5048158952061204, + "grad_norm": 0.6923980712890625, + "learning_rate": 8.531700490931438e-06, + "loss": 0.7511, + "step": 9172 + }, + { + "epoch": 0.5048709340084759, + "grad_norm": 0.682357907295227, + "learning_rate": 8.5313936385043e-06, + "loss": 0.7647, + "step": 9173 + }, + { + "epoch": 0.5049259728108316, + "grad_norm": 0.8255659341812134, + "learning_rate": 8.531086759536152e-06, + "loss": 0.7533, + "step": 9174 + }, + { + "epoch": 0.5049810116131873, + "grad_norm": 0.6774975061416626, + "learning_rate": 8.530779854029301e-06, + "loss": 0.7019, + "step": 9175 + }, + { + "epoch": 0.505036050415543, + "grad_norm": 0.7973241209983826, + "learning_rate": 8.530472921986053e-06, + "loss": 0.7824, + "step": 9176 + }, + { + "epoch": 0.5050910892178986, + "grad_norm": 0.8216109275817871, + "learning_rate": 8.530165963408716e-06, + "loss": 0.8063, + "step": 9177 + }, + { + "epoch": 0.5051461280202543, + "grad_norm": 0.7277935743331909, + "learning_rate": 8.5298589782996e-06, + "loss": 0.7631, + "step": 9178 + }, + { + "epoch": 0.50520116682261, + "grad_norm": 0.6647855043411255, + "learning_rate": 8.529551966661004e-06, + "loss": 0.7462, + "step": 9179 + }, + { + "epoch": 0.5052562056249656, + "grad_norm": 0.766272783279419, + "learning_rate": 8.529244928495241e-06, + "loss": 0.8075, + "step": 9180 + }, + { + "epoch": 0.5053112444273212, + "grad_norm": 0.7276293635368347, + "learning_rate": 8.52893786380462e-06, + "loss": 0.7908, + "step": 9181 + }, + { + "epoch": 0.5053662832296769, + "grad_norm": 0.7864169478416443, + "learning_rate": 8.528630772591447e-06, + "loss": 0.8082, + "step": 9182 + }, + { + "epoch": 0.5054213220320326, + "grad_norm": 0.9106804132461548, + "learning_rate": 8.528323654858028e-06, + "loss": 0.8989, + "step": 9183 + }, + { + "epoch": 0.5054763608343883, + "grad_norm": 0.7288523316383362, + "learning_rate": 8.52801651060667e-06, + "loss": 0.7972, + "step": 9184 + }, + { + "epoch": 0.5055313996367439, + "grad_norm": 0.7149643301963806, + "learning_rate": 8.527709339839689e-06, + "loss": 0.8191, + "step": 9185 + }, + { + "epoch": 0.5055864384390996, + "grad_norm": 0.6661714911460876, + "learning_rate": 8.527402142559388e-06, + "loss": 0.6596, + "step": 9186 + }, + { + "epoch": 0.5056414772414553, + "grad_norm": 0.7071447372436523, + "learning_rate": 8.527094918768076e-06, + "loss": 0.7633, + "step": 9187 + }, + { + "epoch": 0.5056965160438109, + "grad_norm": 0.7314093112945557, + "learning_rate": 8.526787668468064e-06, + "loss": 0.7815, + "step": 9188 + }, + { + "epoch": 0.5057515548461665, + "grad_norm": 0.8200539946556091, + "learning_rate": 8.526480391661657e-06, + "loss": 0.8376, + "step": 9189 + }, + { + "epoch": 0.5058065936485222, + "grad_norm": 0.7422435283660889, + "learning_rate": 8.52617308835117e-06, + "loss": 0.8783, + "step": 9190 + }, + { + "epoch": 0.5058616324508779, + "grad_norm": 0.7845084071159363, + "learning_rate": 8.525865758538909e-06, + "loss": 0.8005, + "step": 9191 + }, + { + "epoch": 0.5059166712532335, + "grad_norm": 0.6854296922683716, + "learning_rate": 8.525558402227185e-06, + "loss": 0.8118, + "step": 9192 + }, + { + "epoch": 0.5059717100555892, + "grad_norm": 0.6805297136306763, + "learning_rate": 8.525251019418309e-06, + "loss": 0.6765, + "step": 9193 + }, + { + "epoch": 0.5060267488579449, + "grad_norm": 0.7194867134094238, + "learning_rate": 8.524943610114587e-06, + "loss": 0.6752, + "step": 9194 + }, + { + "epoch": 0.5060817876603005, + "grad_norm": 0.6935137510299683, + "learning_rate": 8.524636174318335e-06, + "loss": 0.7122, + "step": 9195 + }, + { + "epoch": 0.5061368264626561, + "grad_norm": 0.8652825951576233, + "learning_rate": 8.52432871203186e-06, + "loss": 0.7725, + "step": 9196 + }, + { + "epoch": 0.5061918652650118, + "grad_norm": 0.9104461669921875, + "learning_rate": 8.524021223257472e-06, + "loss": 0.8589, + "step": 9197 + }, + { + "epoch": 0.5062469040673675, + "grad_norm": 0.7680580019950867, + "learning_rate": 8.523713707997486e-06, + "loss": 0.842, + "step": 9198 + }, + { + "epoch": 0.5063019428697232, + "grad_norm": 0.7324872612953186, + "learning_rate": 8.52340616625421e-06, + "loss": 0.802, + "step": 9199 + }, + { + "epoch": 0.5063569816720788, + "grad_norm": 0.8812359571456909, + "learning_rate": 8.523098598029958e-06, + "loss": 0.8286, + "step": 9200 + }, + { + "epoch": 0.5064120204744345, + "grad_norm": 0.6992496848106384, + "learning_rate": 8.522791003327038e-06, + "loss": 0.811, + "step": 9201 + }, + { + "epoch": 0.5064670592767901, + "grad_norm": 0.8191942572593689, + "learning_rate": 8.522483382147766e-06, + "loss": 0.7192, + "step": 9202 + }, + { + "epoch": 0.5065220980791458, + "grad_norm": 0.9354501366615295, + "learning_rate": 8.522175734494452e-06, + "loss": 0.7424, + "step": 9203 + }, + { + "epoch": 0.5065771368815014, + "grad_norm": 0.6481999754905701, + "learning_rate": 8.521868060369405e-06, + "loss": 0.6385, + "step": 9204 + }, + { + "epoch": 0.5066321756838571, + "grad_norm": 0.7158499360084534, + "learning_rate": 8.521560359774943e-06, + "loss": 0.6116, + "step": 9205 + }, + { + "epoch": 0.5066872144862128, + "grad_norm": 0.8738408088684082, + "learning_rate": 8.521252632713376e-06, + "loss": 0.894, + "step": 9206 + }, + { + "epoch": 0.5067422532885685, + "grad_norm": 0.7037062644958496, + "learning_rate": 8.520944879187015e-06, + "loss": 0.6958, + "step": 9207 + }, + { + "epoch": 0.5067972920909241, + "grad_norm": 0.7205594778060913, + "learning_rate": 8.520637099198175e-06, + "loss": 0.7188, + "step": 9208 + }, + { + "epoch": 0.5068523308932797, + "grad_norm": 0.6761966347694397, + "learning_rate": 8.520329292749169e-06, + "loss": 0.7669, + "step": 9209 + }, + { + "epoch": 0.5069073696956354, + "grad_norm": 0.682556688785553, + "learning_rate": 8.520021459842312e-06, + "loss": 0.7745, + "step": 9210 + }, + { + "epoch": 0.5069624084979911, + "grad_norm": 0.6687794923782349, + "learning_rate": 8.519713600479913e-06, + "loss": 0.7814, + "step": 9211 + }, + { + "epoch": 0.5070174473003467, + "grad_norm": 0.6391967535018921, + "learning_rate": 8.51940571466429e-06, + "loss": 0.7331, + "step": 9212 + }, + { + "epoch": 0.5070724861027024, + "grad_norm": 0.8420151472091675, + "learning_rate": 8.519097802397758e-06, + "loss": 0.8257, + "step": 9213 + }, + { + "epoch": 0.5071275249050581, + "grad_norm": 0.692787230014801, + "learning_rate": 8.518789863682625e-06, + "loss": 0.7179, + "step": 9214 + }, + { + "epoch": 0.5071825637074138, + "grad_norm": 0.6874318718910217, + "learning_rate": 8.518481898521213e-06, + "loss": 0.6847, + "step": 9215 + }, + { + "epoch": 0.5072376025097693, + "grad_norm": 0.8107750415802002, + "learning_rate": 8.518173906915832e-06, + "loss": 0.8459, + "step": 9216 + }, + { + "epoch": 0.507292641312125, + "grad_norm": 0.7952812910079956, + "learning_rate": 8.517865888868797e-06, + "loss": 0.8503, + "step": 9217 + }, + { + "epoch": 0.5073476801144807, + "grad_norm": 0.6926921606063843, + "learning_rate": 8.517557844382424e-06, + "loss": 0.6713, + "step": 9218 + }, + { + "epoch": 0.5074027189168364, + "grad_norm": 0.8203585147857666, + "learning_rate": 8.517249773459026e-06, + "loss": 0.8483, + "step": 9219 + }, + { + "epoch": 0.507457757719192, + "grad_norm": 0.6788125038146973, + "learning_rate": 8.516941676100923e-06, + "loss": 0.7521, + "step": 9220 + }, + { + "epoch": 0.5075127965215477, + "grad_norm": 0.6439838409423828, + "learning_rate": 8.516633552310426e-06, + "loss": 0.7359, + "step": 9221 + }, + { + "epoch": 0.5075678353239034, + "grad_norm": 0.6872217655181885, + "learning_rate": 8.516325402089854e-06, + "loss": 0.73, + "step": 9222 + }, + { + "epoch": 0.5076228741262591, + "grad_norm": 0.6695985794067383, + "learning_rate": 8.51601722544152e-06, + "loss": 0.7519, + "step": 9223 + }, + { + "epoch": 0.5076779129286146, + "grad_norm": 0.7779402136802673, + "learning_rate": 8.515709022367741e-06, + "loss": 0.7325, + "step": 9224 + }, + { + "epoch": 0.5077329517309703, + "grad_norm": 0.9289746284484863, + "learning_rate": 8.515400792870836e-06, + "loss": 0.7839, + "step": 9225 + }, + { + "epoch": 0.507787990533326, + "grad_norm": 0.6949248313903809, + "learning_rate": 8.51509253695312e-06, + "loss": 0.7363, + "step": 9226 + }, + { + "epoch": 0.5078430293356817, + "grad_norm": 0.6463130116462708, + "learning_rate": 8.514784254616908e-06, + "loss": 0.7607, + "step": 9227 + }, + { + "epoch": 0.5078980681380373, + "grad_norm": 0.7332046031951904, + "learning_rate": 8.514475945864519e-06, + "loss": 0.6833, + "step": 9228 + }, + { + "epoch": 0.507953106940393, + "grad_norm": 0.8674100637435913, + "learning_rate": 8.51416761069827e-06, + "loss": 0.669, + "step": 9229 + }, + { + "epoch": 0.5080081457427487, + "grad_norm": 0.8073185682296753, + "learning_rate": 8.513859249120477e-06, + "loss": 0.7215, + "step": 9230 + }, + { + "epoch": 0.5080631845451044, + "grad_norm": 0.674117386341095, + "learning_rate": 8.51355086113346e-06, + "loss": 0.7813, + "step": 9231 + }, + { + "epoch": 0.5081182233474599, + "grad_norm": 0.8564596176147461, + "learning_rate": 8.513242446739534e-06, + "loss": 0.7393, + "step": 9232 + }, + { + "epoch": 0.5081732621498156, + "grad_norm": 0.684637188911438, + "learning_rate": 8.512934005941015e-06, + "loss": 0.781, + "step": 9233 + }, + { + "epoch": 0.5082283009521713, + "grad_norm": 0.816123902797699, + "learning_rate": 8.51262553874023e-06, + "loss": 0.8597, + "step": 9234 + }, + { + "epoch": 0.5082833397545269, + "grad_norm": 0.6582320332527161, + "learning_rate": 8.512317045139488e-06, + "loss": 0.6654, + "step": 9235 + }, + { + "epoch": 0.5083383785568826, + "grad_norm": 1.0153518915176392, + "learning_rate": 8.512008525141113e-06, + "loss": 0.7946, + "step": 9236 + }, + { + "epoch": 0.5083934173592383, + "grad_norm": 0.7455416917800903, + "learning_rate": 8.511699978747422e-06, + "loss": 0.8365, + "step": 9237 + }, + { + "epoch": 0.508448456161594, + "grad_norm": 0.6498221755027771, + "learning_rate": 8.511391405960733e-06, + "loss": 0.7252, + "step": 9238 + }, + { + "epoch": 0.5085034949639495, + "grad_norm": 0.6856792569160461, + "learning_rate": 8.511082806783368e-06, + "loss": 0.7282, + "step": 9239 + }, + { + "epoch": 0.5085585337663052, + "grad_norm": 0.6930065751075745, + "learning_rate": 8.510774181217643e-06, + "loss": 0.7404, + "step": 9240 + }, + { + "epoch": 0.5086135725686609, + "grad_norm": 0.6953150033950806, + "learning_rate": 8.51046552926588e-06, + "loss": 0.7684, + "step": 9241 + }, + { + "epoch": 0.5086686113710166, + "grad_norm": 0.7307711839675903, + "learning_rate": 8.510156850930395e-06, + "loss": 0.7557, + "step": 9242 + }, + { + "epoch": 0.5087236501733722, + "grad_norm": 0.7296478152275085, + "learning_rate": 8.509848146213513e-06, + "loss": 0.7469, + "step": 9243 + }, + { + "epoch": 0.5087786889757279, + "grad_norm": 0.7035672664642334, + "learning_rate": 8.509539415117553e-06, + "loss": 0.7151, + "step": 9244 + }, + { + "epoch": 0.5088337277780836, + "grad_norm": 0.7818698883056641, + "learning_rate": 8.509230657644832e-06, + "loss": 0.7134, + "step": 9245 + }, + { + "epoch": 0.5088887665804392, + "grad_norm": 0.7503119111061096, + "learning_rate": 8.508921873797674e-06, + "loss": 0.7028, + "step": 9246 + }, + { + "epoch": 0.5089438053827948, + "grad_norm": 0.7733498215675354, + "learning_rate": 8.508613063578397e-06, + "loss": 0.8159, + "step": 9247 + }, + { + "epoch": 0.5089988441851505, + "grad_norm": 0.9236353635787964, + "learning_rate": 8.508304226989326e-06, + "loss": 0.8013, + "step": 9248 + }, + { + "epoch": 0.5090538829875062, + "grad_norm": 0.6567198634147644, + "learning_rate": 8.507995364032777e-06, + "loss": 0.8285, + "step": 9249 + }, + { + "epoch": 0.5091089217898619, + "grad_norm": 0.6555445790290833, + "learning_rate": 8.507686474711074e-06, + "loss": 0.6917, + "step": 9250 + }, + { + "epoch": 0.5091639605922175, + "grad_norm": 0.8505375385284424, + "learning_rate": 8.507377559026539e-06, + "loss": 0.824, + "step": 9251 + }, + { + "epoch": 0.5092189993945732, + "grad_norm": 0.703413188457489, + "learning_rate": 8.507068616981493e-06, + "loss": 0.7162, + "step": 9252 + }, + { + "epoch": 0.5092740381969288, + "grad_norm": 0.7257823944091797, + "learning_rate": 8.50675964857826e-06, + "loss": 0.8031, + "step": 9253 + }, + { + "epoch": 0.5093290769992845, + "grad_norm": 0.6861198544502258, + "learning_rate": 8.506450653819159e-06, + "loss": 0.7724, + "step": 9254 + }, + { + "epoch": 0.5093841158016401, + "grad_norm": 0.7733107209205627, + "learning_rate": 8.506141632706512e-06, + "loss": 0.7834, + "step": 9255 + }, + { + "epoch": 0.5094391546039958, + "grad_norm": 0.7472217082977295, + "learning_rate": 8.505832585242644e-06, + "loss": 0.7594, + "step": 9256 + }, + { + "epoch": 0.5094941934063515, + "grad_norm": 0.6273325085639954, + "learning_rate": 8.505523511429876e-06, + "loss": 0.6798, + "step": 9257 + }, + { + "epoch": 0.5095492322087072, + "grad_norm": 0.7366517186164856, + "learning_rate": 8.505214411270533e-06, + "loss": 0.7916, + "step": 9258 + }, + { + "epoch": 0.5096042710110628, + "grad_norm": 0.6654453873634338, + "learning_rate": 8.504905284766936e-06, + "loss": 0.7228, + "step": 9259 + }, + { + "epoch": 0.5096593098134184, + "grad_norm": 0.7926275134086609, + "learning_rate": 8.50459613192141e-06, + "loss": 0.8303, + "step": 9260 + }, + { + "epoch": 0.5097143486157741, + "grad_norm": 0.7256377935409546, + "learning_rate": 8.504286952736277e-06, + "loss": 0.7977, + "step": 9261 + }, + { + "epoch": 0.5097693874181298, + "grad_norm": 0.7333946824073792, + "learning_rate": 8.50397774721386e-06, + "loss": 0.7978, + "step": 9262 + }, + { + "epoch": 0.5098244262204854, + "grad_norm": 0.6102882623672485, + "learning_rate": 8.503668515356485e-06, + "loss": 0.6386, + "step": 9263 + }, + { + "epoch": 0.5098794650228411, + "grad_norm": 0.7939823865890503, + "learning_rate": 8.503359257166477e-06, + "loss": 0.7328, + "step": 9264 + }, + { + "epoch": 0.5099345038251968, + "grad_norm": 0.7245013117790222, + "learning_rate": 8.503049972646157e-06, + "loss": 0.795, + "step": 9265 + }, + { + "epoch": 0.5099895426275525, + "grad_norm": 0.6722108125686646, + "learning_rate": 8.502740661797852e-06, + "loss": 0.7062, + "step": 9266 + }, + { + "epoch": 0.510044581429908, + "grad_norm": 0.6759012341499329, + "learning_rate": 8.502431324623884e-06, + "loss": 0.7427, + "step": 9267 + }, + { + "epoch": 0.5100996202322637, + "grad_norm": 0.6448835730552673, + "learning_rate": 8.502121961126581e-06, + "loss": 0.7381, + "step": 9268 + }, + { + "epoch": 0.5101546590346194, + "grad_norm": 0.6437426209449768, + "learning_rate": 8.501812571308266e-06, + "loss": 0.6733, + "step": 9269 + }, + { + "epoch": 0.5102096978369751, + "grad_norm": 0.6879013776779175, + "learning_rate": 8.501503155171267e-06, + "loss": 0.7227, + "step": 9270 + }, + { + "epoch": 0.5102647366393307, + "grad_norm": 0.6628512740135193, + "learning_rate": 8.501193712717906e-06, + "loss": 0.7151, + "step": 9271 + }, + { + "epoch": 0.5103197754416864, + "grad_norm": 0.7653747797012329, + "learning_rate": 8.500884243950511e-06, + "loss": 0.8189, + "step": 9272 + }, + { + "epoch": 0.5103748142440421, + "grad_norm": 0.7180060148239136, + "learning_rate": 8.500574748871407e-06, + "loss": 0.7633, + "step": 9273 + }, + { + "epoch": 0.5104298530463978, + "grad_norm": 0.7045086622238159, + "learning_rate": 8.50026522748292e-06, + "loss": 0.746, + "step": 9274 + }, + { + "epoch": 0.5104848918487533, + "grad_norm": 0.6224614381790161, + "learning_rate": 8.499955679787376e-06, + "loss": 0.7436, + "step": 9275 + }, + { + "epoch": 0.510539930651109, + "grad_norm": 0.6716495156288147, + "learning_rate": 8.499646105787103e-06, + "loss": 0.8006, + "step": 9276 + }, + { + "epoch": 0.5105949694534647, + "grad_norm": 0.83705735206604, + "learning_rate": 8.499336505484426e-06, + "loss": 0.886, + "step": 9277 + }, + { + "epoch": 0.5106500082558203, + "grad_norm": 0.7942199110984802, + "learning_rate": 8.499026878881673e-06, + "loss": 0.7709, + "step": 9278 + }, + { + "epoch": 0.510705047058176, + "grad_norm": 0.7500330209732056, + "learning_rate": 8.49871722598117e-06, + "loss": 0.7737, + "step": 9279 + }, + { + "epoch": 0.5107600858605317, + "grad_norm": 0.7283433675765991, + "learning_rate": 8.498407546785245e-06, + "loss": 0.8345, + "step": 9280 + }, + { + "epoch": 0.5108151246628874, + "grad_norm": 0.6970989108085632, + "learning_rate": 8.498097841296224e-06, + "loss": 0.7451, + "step": 9281 + }, + { + "epoch": 0.5108701634652429, + "grad_norm": 0.8338573575019836, + "learning_rate": 8.497788109516438e-06, + "loss": 0.8198, + "step": 9282 + }, + { + "epoch": 0.5109252022675986, + "grad_norm": 0.6544861197471619, + "learning_rate": 8.497478351448213e-06, + "loss": 0.7549, + "step": 9283 + }, + { + "epoch": 0.5109802410699543, + "grad_norm": 0.6627360582351685, + "learning_rate": 8.497168567093876e-06, + "loss": 0.7136, + "step": 9284 + }, + { + "epoch": 0.51103527987231, + "grad_norm": 0.7176669239997864, + "learning_rate": 8.496858756455755e-06, + "loss": 0.766, + "step": 9285 + }, + { + "epoch": 0.5110903186746656, + "grad_norm": 0.8260897397994995, + "learning_rate": 8.496548919536183e-06, + "loss": 0.8167, + "step": 9286 + }, + { + "epoch": 0.5111453574770213, + "grad_norm": 0.7077773809432983, + "learning_rate": 8.496239056337483e-06, + "loss": 0.776, + "step": 9287 + }, + { + "epoch": 0.511200396279377, + "grad_norm": 0.7609447836875916, + "learning_rate": 8.495929166861988e-06, + "loss": 0.7339, + "step": 9288 + }, + { + "epoch": 0.5112554350817327, + "grad_norm": 0.6896487474441528, + "learning_rate": 8.495619251112022e-06, + "loss": 0.7639, + "step": 9289 + }, + { + "epoch": 0.5113104738840882, + "grad_norm": 0.6946871280670166, + "learning_rate": 8.495309309089918e-06, + "loss": 0.8242, + "step": 9290 + }, + { + "epoch": 0.5113655126864439, + "grad_norm": 0.79847252368927, + "learning_rate": 8.494999340798007e-06, + "loss": 0.8226, + "step": 9291 + }, + { + "epoch": 0.5114205514887996, + "grad_norm": 0.7845447063446045, + "learning_rate": 8.494689346238615e-06, + "loss": 0.8593, + "step": 9292 + }, + { + "epoch": 0.5114755902911553, + "grad_norm": 1.1577119827270508, + "learning_rate": 8.494379325414074e-06, + "loss": 0.746, + "step": 9293 + }, + { + "epoch": 0.5115306290935109, + "grad_norm": 0.6720938682556152, + "learning_rate": 8.494069278326713e-06, + "loss": 0.6768, + "step": 9294 + }, + { + "epoch": 0.5115856678958666, + "grad_norm": 0.7389395833015442, + "learning_rate": 8.493759204978862e-06, + "loss": 0.8126, + "step": 9295 + }, + { + "epoch": 0.5116407066982223, + "grad_norm": 0.7629536986351013, + "learning_rate": 8.493449105372853e-06, + "loss": 0.7107, + "step": 9296 + }, + { + "epoch": 0.511695745500578, + "grad_norm": 0.7339474558830261, + "learning_rate": 8.493138979511015e-06, + "loss": 0.8144, + "step": 9297 + }, + { + "epoch": 0.5117507843029335, + "grad_norm": 0.7222825288772583, + "learning_rate": 8.49282882739568e-06, + "loss": 0.7512, + "step": 9298 + }, + { + "epoch": 0.5118058231052892, + "grad_norm": 0.676659107208252, + "learning_rate": 8.49251864902918e-06, + "loss": 0.6515, + "step": 9299 + }, + { + "epoch": 0.5118608619076449, + "grad_norm": 0.6336323618888855, + "learning_rate": 8.492208444413844e-06, + "loss": 0.719, + "step": 9300 + }, + { + "epoch": 0.5119159007100006, + "grad_norm": 0.701543927192688, + "learning_rate": 8.491898213552e-06, + "loss": 0.728, + "step": 9301 + }, + { + "epoch": 0.5119709395123562, + "grad_norm": 0.6809069514274597, + "learning_rate": 8.491587956445988e-06, + "loss": 0.8844, + "step": 9302 + }, + { + "epoch": 0.5120259783147119, + "grad_norm": 0.8046489357948303, + "learning_rate": 8.491277673098135e-06, + "loss": 0.817, + "step": 9303 + }, + { + "epoch": 0.5120810171170675, + "grad_norm": 0.8630616068840027, + "learning_rate": 8.490967363510774e-06, + "loss": 0.7745, + "step": 9304 + }, + { + "epoch": 0.5121360559194232, + "grad_norm": 0.7457678914070129, + "learning_rate": 8.490657027686235e-06, + "loss": 0.7956, + "step": 9305 + }, + { + "epoch": 0.5121910947217788, + "grad_norm": 0.6383466124534607, + "learning_rate": 8.490346665626854e-06, + "loss": 0.8046, + "step": 9306 + }, + { + "epoch": 0.5122461335241345, + "grad_norm": 0.7658202052116394, + "learning_rate": 8.49003627733496e-06, + "loss": 0.7905, + "step": 9307 + }, + { + "epoch": 0.5123011723264902, + "grad_norm": 0.6793283224105835, + "learning_rate": 8.48972586281289e-06, + "loss": 0.6646, + "step": 9308 + }, + { + "epoch": 0.5123562111288459, + "grad_norm": 0.7345246076583862, + "learning_rate": 8.489415422062972e-06, + "loss": 0.788, + "step": 9309 + }, + { + "epoch": 0.5124112499312015, + "grad_norm": 0.6665463447570801, + "learning_rate": 8.489104955087542e-06, + "loss": 0.706, + "step": 9310 + }, + { + "epoch": 0.5124662887335572, + "grad_norm": 0.7895458936691284, + "learning_rate": 8.488794461888934e-06, + "loss": 0.7464, + "step": 9311 + }, + { + "epoch": 0.5125213275359128, + "grad_norm": 0.7375221252441406, + "learning_rate": 8.488483942469481e-06, + "loss": 0.8029, + "step": 9312 + }, + { + "epoch": 0.5125763663382685, + "grad_norm": 0.792348325252533, + "learning_rate": 8.488173396831514e-06, + "loss": 0.7324, + "step": 9313 + }, + { + "epoch": 0.5126314051406241, + "grad_norm": 0.6500192880630493, + "learning_rate": 8.487862824977373e-06, + "loss": 0.7331, + "step": 9314 + }, + { + "epoch": 0.5126864439429798, + "grad_norm": 0.6607314348220825, + "learning_rate": 8.487552226909386e-06, + "loss": 0.7782, + "step": 9315 + }, + { + "epoch": 0.5127414827453355, + "grad_norm": 0.8261791467666626, + "learning_rate": 8.487241602629892e-06, + "loss": 0.8036, + "step": 9316 + }, + { + "epoch": 0.5127965215476912, + "grad_norm": 0.8301663994789124, + "learning_rate": 8.486930952141222e-06, + "loss": 0.7928, + "step": 9317 + }, + { + "epoch": 0.5128515603500468, + "grad_norm": 0.6957940459251404, + "learning_rate": 8.486620275445713e-06, + "loss": 0.7359, + "step": 9318 + }, + { + "epoch": 0.5129065991524024, + "grad_norm": 0.7562606334686279, + "learning_rate": 8.4863095725457e-06, + "loss": 0.7546, + "step": 9319 + }, + { + "epoch": 0.5129616379547581, + "grad_norm": 0.795886218547821, + "learning_rate": 8.485998843443517e-06, + "loss": 0.7558, + "step": 9320 + }, + { + "epoch": 0.5130166767571137, + "grad_norm": 0.6558147072792053, + "learning_rate": 8.4856880881415e-06, + "loss": 0.6832, + "step": 9321 + }, + { + "epoch": 0.5130717155594694, + "grad_norm": 0.7300151586532593, + "learning_rate": 8.485377306641984e-06, + "loss": 0.8018, + "step": 9322 + }, + { + "epoch": 0.5131267543618251, + "grad_norm": 0.7114105224609375, + "learning_rate": 8.485066498947305e-06, + "loss": 0.7374, + "step": 9323 + }, + { + "epoch": 0.5131817931641808, + "grad_norm": 0.7061085104942322, + "learning_rate": 8.484755665059798e-06, + "loss": 0.7905, + "step": 9324 + }, + { + "epoch": 0.5132368319665364, + "grad_norm": 0.8481647968292236, + "learning_rate": 8.484444804981802e-06, + "loss": 0.8518, + "step": 9325 + }, + { + "epoch": 0.513291870768892, + "grad_norm": 0.7583557367324829, + "learning_rate": 8.48413391871565e-06, + "loss": 0.8328, + "step": 9326 + }, + { + "epoch": 0.5133469095712477, + "grad_norm": 0.7381925582885742, + "learning_rate": 8.483823006263683e-06, + "loss": 0.76, + "step": 9327 + }, + { + "epoch": 0.5134019483736034, + "grad_norm": 0.8037852644920349, + "learning_rate": 8.483512067628232e-06, + "loss": 0.711, + "step": 9328 + }, + { + "epoch": 0.513456987175959, + "grad_norm": 0.6682618260383606, + "learning_rate": 8.483201102811637e-06, + "loss": 0.7479, + "step": 9329 + }, + { + "epoch": 0.5135120259783147, + "grad_norm": 0.662234365940094, + "learning_rate": 8.482890111816237e-06, + "loss": 0.7701, + "step": 9330 + }, + { + "epoch": 0.5135670647806704, + "grad_norm": 0.7081482410430908, + "learning_rate": 8.482579094644365e-06, + "loss": 0.8255, + "step": 9331 + }, + { + "epoch": 0.5136221035830261, + "grad_norm": 0.9659954905509949, + "learning_rate": 8.482268051298364e-06, + "loss": 0.8742, + "step": 9332 + }, + { + "epoch": 0.5136771423853816, + "grad_norm": 0.7837772369384766, + "learning_rate": 8.481956981780564e-06, + "loss": 0.7692, + "step": 9333 + }, + { + "epoch": 0.5137321811877373, + "grad_norm": 0.681918203830719, + "learning_rate": 8.481645886093311e-06, + "loss": 0.6952, + "step": 9334 + }, + { + "epoch": 0.513787219990093, + "grad_norm": 0.7253187894821167, + "learning_rate": 8.481334764238937e-06, + "loss": 0.7074, + "step": 9335 + }, + { + "epoch": 0.5138422587924487, + "grad_norm": 0.8845877051353455, + "learning_rate": 8.481023616219783e-06, + "loss": 0.675, + "step": 9336 + }, + { + "epoch": 0.5138972975948043, + "grad_norm": 0.6569344401359558, + "learning_rate": 8.480712442038188e-06, + "loss": 0.7181, + "step": 9337 + }, + { + "epoch": 0.51395233639716, + "grad_norm": 0.7372813820838928, + "learning_rate": 8.480401241696491e-06, + "loss": 0.8137, + "step": 9338 + }, + { + "epoch": 0.5140073751995157, + "grad_norm": 0.843099057674408, + "learning_rate": 8.48009001519703e-06, + "loss": 0.7648, + "step": 9339 + }, + { + "epoch": 0.5140624140018714, + "grad_norm": 0.7762032747268677, + "learning_rate": 8.479778762542142e-06, + "loss": 0.7805, + "step": 9340 + }, + { + "epoch": 0.5141174528042269, + "grad_norm": 0.739086925983429, + "learning_rate": 8.479467483734169e-06, + "loss": 0.7125, + "step": 9341 + }, + { + "epoch": 0.5141724916065826, + "grad_norm": 0.7351683974266052, + "learning_rate": 8.479156178775451e-06, + "loss": 0.7855, + "step": 9342 + }, + { + "epoch": 0.5142275304089383, + "grad_norm": 0.7601314187049866, + "learning_rate": 8.478844847668325e-06, + "loss": 0.8349, + "step": 9343 + }, + { + "epoch": 0.514282569211294, + "grad_norm": 0.6841638684272766, + "learning_rate": 8.478533490415133e-06, + "loss": 0.7986, + "step": 9344 + }, + { + "epoch": 0.5143376080136496, + "grad_norm": 0.6734872460365295, + "learning_rate": 8.478222107018213e-06, + "loss": 0.6941, + "step": 9345 + }, + { + "epoch": 0.5143926468160053, + "grad_norm": 0.801930844783783, + "learning_rate": 8.47791069747991e-06, + "loss": 0.8537, + "step": 9346 + }, + { + "epoch": 0.514447685618361, + "grad_norm": 0.6960629224777222, + "learning_rate": 8.477599261802558e-06, + "loss": 0.6629, + "step": 9347 + }, + { + "epoch": 0.5145027244207167, + "grad_norm": 0.7791358232498169, + "learning_rate": 8.477287799988502e-06, + "loss": 0.8777, + "step": 9348 + }, + { + "epoch": 0.5145577632230722, + "grad_norm": 0.7022722959518433, + "learning_rate": 8.476976312040082e-06, + "loss": 0.7116, + "step": 9349 + }, + { + "epoch": 0.5146128020254279, + "grad_norm": 0.7791306376457214, + "learning_rate": 8.476664797959639e-06, + "loss": 0.7262, + "step": 9350 + }, + { + "epoch": 0.5146678408277836, + "grad_norm": 0.7391177415847778, + "learning_rate": 8.476353257749514e-06, + "loss": 0.7308, + "step": 9351 + }, + { + "epoch": 0.5147228796301393, + "grad_norm": 0.6989552974700928, + "learning_rate": 8.476041691412046e-06, + "loss": 0.7754, + "step": 9352 + }, + { + "epoch": 0.5147779184324949, + "grad_norm": 0.7639930844306946, + "learning_rate": 8.475730098949582e-06, + "loss": 0.8385, + "step": 9353 + }, + { + "epoch": 0.5148329572348506, + "grad_norm": 0.7687931060791016, + "learning_rate": 8.47541848036446e-06, + "loss": 0.8118, + "step": 9354 + }, + { + "epoch": 0.5148879960372063, + "grad_norm": 0.8831589221954346, + "learning_rate": 8.475106835659024e-06, + "loss": 0.7705, + "step": 9355 + }, + { + "epoch": 0.5149430348395619, + "grad_norm": 0.7585502862930298, + "learning_rate": 8.474795164835614e-06, + "loss": 0.8167, + "step": 9356 + }, + { + "epoch": 0.5149980736419175, + "grad_norm": 0.7078690528869629, + "learning_rate": 8.474483467896572e-06, + "loss": 0.7412, + "step": 9357 + }, + { + "epoch": 0.5150531124442732, + "grad_norm": 0.8950889706611633, + "learning_rate": 8.474171744844246e-06, + "loss": 0.8132, + "step": 9358 + }, + { + "epoch": 0.5151081512466289, + "grad_norm": 0.7196077704429626, + "learning_rate": 8.473859995680973e-06, + "loss": 0.8041, + "step": 9359 + }, + { + "epoch": 0.5151631900489846, + "grad_norm": 0.7705141305923462, + "learning_rate": 8.473548220409099e-06, + "loss": 0.8437, + "step": 9360 + }, + { + "epoch": 0.5152182288513402, + "grad_norm": 0.6507467031478882, + "learning_rate": 8.473236419030966e-06, + "loss": 0.7713, + "step": 9361 + }, + { + "epoch": 0.5152732676536959, + "grad_norm": 0.7120817303657532, + "learning_rate": 8.472924591548917e-06, + "loss": 0.7688, + "step": 9362 + }, + { + "epoch": 0.5153283064560515, + "grad_norm": 0.7830487489700317, + "learning_rate": 8.472612737965297e-06, + "loss": 0.8875, + "step": 9363 + }, + { + "epoch": 0.5153833452584071, + "grad_norm": 0.8790529370307922, + "learning_rate": 8.47230085828245e-06, + "loss": 0.7648, + "step": 9364 + }, + { + "epoch": 0.5154383840607628, + "grad_norm": 0.8956806659698486, + "learning_rate": 8.471988952502718e-06, + "loss": 0.7891, + "step": 9365 + }, + { + "epoch": 0.5154934228631185, + "grad_norm": 0.7370011210441589, + "learning_rate": 8.471677020628448e-06, + "loss": 0.7609, + "step": 9366 + }, + { + "epoch": 0.5155484616654742, + "grad_norm": 0.6794238090515137, + "learning_rate": 8.471365062661982e-06, + "loss": 0.6679, + "step": 9367 + }, + { + "epoch": 0.5156035004678298, + "grad_norm": 0.7330273985862732, + "learning_rate": 8.471053078605664e-06, + "loss": 0.7276, + "step": 9368 + }, + { + "epoch": 0.5156585392701855, + "grad_norm": 0.7796601057052612, + "learning_rate": 8.470741068461843e-06, + "loss": 0.7897, + "step": 9369 + }, + { + "epoch": 0.5157135780725411, + "grad_norm": 0.6834099888801575, + "learning_rate": 8.470429032232858e-06, + "loss": 0.7924, + "step": 9370 + }, + { + "epoch": 0.5157686168748968, + "grad_norm": 0.6991616487503052, + "learning_rate": 8.47011696992106e-06, + "loss": 0.7901, + "step": 9371 + }, + { + "epoch": 0.5158236556772524, + "grad_norm": 0.7321401834487915, + "learning_rate": 8.469804881528792e-06, + "loss": 0.6718, + "step": 9372 + }, + { + "epoch": 0.5158786944796081, + "grad_norm": 0.7091043591499329, + "learning_rate": 8.469492767058398e-06, + "loss": 0.8204, + "step": 9373 + }, + { + "epoch": 0.5159337332819638, + "grad_norm": 0.8777012825012207, + "learning_rate": 8.469180626512223e-06, + "loss": 0.8045, + "step": 9374 + }, + { + "epoch": 0.5159887720843195, + "grad_norm": 0.6652738451957703, + "learning_rate": 8.468868459892619e-06, + "loss": 0.7248, + "step": 9375 + }, + { + "epoch": 0.5160438108866751, + "grad_norm": 0.7209659218788147, + "learning_rate": 8.468556267201925e-06, + "loss": 0.7508, + "step": 9376 + }, + { + "epoch": 0.5160988496890307, + "grad_norm": 0.7685441970825195, + "learning_rate": 8.468244048442494e-06, + "loss": 0.7501, + "step": 9377 + }, + { + "epoch": 0.5161538884913864, + "grad_norm": 0.6773725152015686, + "learning_rate": 8.467931803616665e-06, + "loss": 0.8036, + "step": 9378 + }, + { + "epoch": 0.5162089272937421, + "grad_norm": 0.7167890071868896, + "learning_rate": 8.467619532726792e-06, + "loss": 0.7229, + "step": 9379 + }, + { + "epoch": 0.5162639660960977, + "grad_norm": 0.7066929340362549, + "learning_rate": 8.467307235775218e-06, + "loss": 0.7433, + "step": 9380 + }, + { + "epoch": 0.5163190048984534, + "grad_norm": 0.7261828780174255, + "learning_rate": 8.46699491276429e-06, + "loss": 0.7873, + "step": 9381 + }, + { + "epoch": 0.5163740437008091, + "grad_norm": 0.7442463636398315, + "learning_rate": 8.466682563696356e-06, + "loss": 0.7953, + "step": 9382 + }, + { + "epoch": 0.5164290825031648, + "grad_norm": 0.5668768286705017, + "learning_rate": 8.466370188573765e-06, + "loss": 0.5602, + "step": 9383 + }, + { + "epoch": 0.5164841213055203, + "grad_norm": 0.7364997267723083, + "learning_rate": 8.466057787398864e-06, + "loss": 0.8274, + "step": 9384 + }, + { + "epoch": 0.516539160107876, + "grad_norm": 0.7793132066726685, + "learning_rate": 8.465745360174e-06, + "loss": 0.7832, + "step": 9385 + }, + { + "epoch": 0.5165941989102317, + "grad_norm": 0.6818128824234009, + "learning_rate": 8.46543290690152e-06, + "loss": 0.8314, + "step": 9386 + }, + { + "epoch": 0.5166492377125874, + "grad_norm": 0.7392195463180542, + "learning_rate": 8.465120427583778e-06, + "loss": 0.8124, + "step": 9387 + }, + { + "epoch": 0.516704276514943, + "grad_norm": 0.8582521677017212, + "learning_rate": 8.464807922223115e-06, + "loss": 0.7417, + "step": 9388 + }, + { + "epoch": 0.5167593153172987, + "grad_norm": 0.7322097420692444, + "learning_rate": 8.464495390821882e-06, + "loss": 0.7408, + "step": 9389 + }, + { + "epoch": 0.5168143541196544, + "grad_norm": 0.8177433013916016, + "learning_rate": 8.464182833382432e-06, + "loss": 0.87, + "step": 9390 + }, + { + "epoch": 0.5168693929220101, + "grad_norm": 0.7088115215301514, + "learning_rate": 8.46387024990711e-06, + "loss": 0.7748, + "step": 9391 + }, + { + "epoch": 0.5169244317243656, + "grad_norm": 0.6648650169372559, + "learning_rate": 8.463557640398268e-06, + "loss": 0.6302, + "step": 9392 + }, + { + "epoch": 0.5169794705267213, + "grad_norm": 0.6688859462738037, + "learning_rate": 8.463245004858251e-06, + "loss": 0.7252, + "step": 9393 + }, + { + "epoch": 0.517034509329077, + "grad_norm": 0.7231030464172363, + "learning_rate": 8.462932343289412e-06, + "loss": 0.8497, + "step": 9394 + }, + { + "epoch": 0.5170895481314327, + "grad_norm": 0.7142065763473511, + "learning_rate": 8.462619655694103e-06, + "loss": 0.7041, + "step": 9395 + }, + { + "epoch": 0.5171445869337883, + "grad_norm": 0.7197136878967285, + "learning_rate": 8.462306942074669e-06, + "loss": 0.7022, + "step": 9396 + }, + { + "epoch": 0.517199625736144, + "grad_norm": 0.7620192766189575, + "learning_rate": 8.461994202433463e-06, + "loss": 0.8243, + "step": 9397 + }, + { + "epoch": 0.5172546645384997, + "grad_norm": 0.7697533965110779, + "learning_rate": 8.461681436772836e-06, + "loss": 0.7861, + "step": 9398 + }, + { + "epoch": 0.5173097033408554, + "grad_norm": 0.7224711179733276, + "learning_rate": 8.461368645095138e-06, + "loss": 0.7588, + "step": 9399 + }, + { + "epoch": 0.5173647421432109, + "grad_norm": 0.9285979270935059, + "learning_rate": 8.46105582740272e-06, + "loss": 0.8113, + "step": 9400 + }, + { + "epoch": 0.5174197809455666, + "grad_norm": 0.7297842502593994, + "learning_rate": 8.460742983697934e-06, + "loss": 0.7115, + "step": 9401 + }, + { + "epoch": 0.5174748197479223, + "grad_norm": 0.6712872982025146, + "learning_rate": 8.460430113983126e-06, + "loss": 0.751, + "step": 9402 + }, + { + "epoch": 0.517529858550278, + "grad_norm": 0.7807186245918274, + "learning_rate": 8.460117218260657e-06, + "loss": 0.8375, + "step": 9403 + }, + { + "epoch": 0.5175848973526336, + "grad_norm": 0.621530294418335, + "learning_rate": 8.45980429653287e-06, + "loss": 0.638, + "step": 9404 + }, + { + "epoch": 0.5176399361549893, + "grad_norm": 0.7086256146430969, + "learning_rate": 8.45949134880212e-06, + "loss": 0.8304, + "step": 9405 + }, + { + "epoch": 0.517694974957345, + "grad_norm": 0.62705397605896, + "learning_rate": 8.45917837507076e-06, + "loss": 0.7008, + "step": 9406 + }, + { + "epoch": 0.5177500137597005, + "grad_norm": 0.9109121561050415, + "learning_rate": 8.458865375341142e-06, + "loss": 0.7529, + "step": 9407 + }, + { + "epoch": 0.5178050525620562, + "grad_norm": 0.6909900903701782, + "learning_rate": 8.458552349615615e-06, + "loss": 0.8453, + "step": 9408 + }, + { + "epoch": 0.5178600913644119, + "grad_norm": 0.7548434138298035, + "learning_rate": 8.458239297896536e-06, + "loss": 0.7516, + "step": 9409 + }, + { + "epoch": 0.5179151301667676, + "grad_norm": 0.7595730423927307, + "learning_rate": 8.457926220186257e-06, + "loss": 0.7599, + "step": 9410 + }, + { + "epoch": 0.5179701689691232, + "grad_norm": 0.7449337840080261, + "learning_rate": 8.45761311648713e-06, + "loss": 0.8236, + "step": 9411 + }, + { + "epoch": 0.5180252077714789, + "grad_norm": 0.7529160976409912, + "learning_rate": 8.457299986801507e-06, + "loss": 0.8655, + "step": 9412 + }, + { + "epoch": 0.5180802465738346, + "grad_norm": 0.6777701377868652, + "learning_rate": 8.456986831131742e-06, + "loss": 0.7737, + "step": 9413 + }, + { + "epoch": 0.5181352853761902, + "grad_norm": 0.9363510012626648, + "learning_rate": 8.456673649480191e-06, + "loss": 0.8227, + "step": 9414 + }, + { + "epoch": 0.5181903241785458, + "grad_norm": 0.798001229763031, + "learning_rate": 8.456360441849206e-06, + "loss": 0.8881, + "step": 9415 + }, + { + "epoch": 0.5182453629809015, + "grad_norm": 0.7212072610855103, + "learning_rate": 8.456047208241141e-06, + "loss": 0.8165, + "step": 9416 + }, + { + "epoch": 0.5183004017832572, + "grad_norm": 0.6918027997016907, + "learning_rate": 8.45573394865835e-06, + "loss": 0.8048, + "step": 9417 + }, + { + "epoch": 0.5183554405856129, + "grad_norm": 0.6474916338920593, + "learning_rate": 8.455420663103187e-06, + "loss": 0.6502, + "step": 9418 + }, + { + "epoch": 0.5184104793879685, + "grad_norm": 0.6592364311218262, + "learning_rate": 8.455107351578008e-06, + "loss": 0.7509, + "step": 9419 + }, + { + "epoch": 0.5184655181903242, + "grad_norm": 0.7658745646476746, + "learning_rate": 8.454794014085168e-06, + "loss": 0.8444, + "step": 9420 + }, + { + "epoch": 0.5185205569926798, + "grad_norm": 0.6814215183258057, + "learning_rate": 8.45448065062702e-06, + "loss": 0.7367, + "step": 9421 + }, + { + "epoch": 0.5185755957950355, + "grad_norm": 0.644740104675293, + "learning_rate": 8.45416726120592e-06, + "loss": 0.7456, + "step": 9422 + }, + { + "epoch": 0.5186306345973911, + "grad_norm": 0.8578751087188721, + "learning_rate": 8.453853845824225e-06, + "loss": 0.8481, + "step": 9423 + }, + { + "epoch": 0.5186856733997468, + "grad_norm": 0.6630389094352722, + "learning_rate": 8.453540404484288e-06, + "loss": 0.7487, + "step": 9424 + }, + { + "epoch": 0.5187407122021025, + "grad_norm": 0.7756431698799133, + "learning_rate": 8.453226937188466e-06, + "loss": 0.798, + "step": 9425 + }, + { + "epoch": 0.5187957510044582, + "grad_norm": 0.7856318354606628, + "learning_rate": 8.452913443939113e-06, + "loss": 0.785, + "step": 9426 + }, + { + "epoch": 0.5188507898068138, + "grad_norm": 0.7563977837562561, + "learning_rate": 8.45259992473859e-06, + "loss": 0.8182, + "step": 9427 + }, + { + "epoch": 0.5189058286091695, + "grad_norm": 0.6945043802261353, + "learning_rate": 8.452286379589247e-06, + "loss": 0.7262, + "step": 9428 + }, + { + "epoch": 0.5189608674115251, + "grad_norm": 0.6607717275619507, + "learning_rate": 8.451972808493444e-06, + "loss": 0.7257, + "step": 9429 + }, + { + "epoch": 0.5190159062138808, + "grad_norm": 0.6682843565940857, + "learning_rate": 8.451659211453539e-06, + "loss": 0.6775, + "step": 9430 + }, + { + "epoch": 0.5190709450162364, + "grad_norm": 0.7175559401512146, + "learning_rate": 8.451345588471886e-06, + "loss": 0.7154, + "step": 9431 + }, + { + "epoch": 0.5191259838185921, + "grad_norm": 0.7499119639396667, + "learning_rate": 8.451031939550845e-06, + "loss": 0.7537, + "step": 9432 + }, + { + "epoch": 0.5191810226209478, + "grad_norm": 0.65048748254776, + "learning_rate": 8.450718264692771e-06, + "loss": 0.7253, + "step": 9433 + }, + { + "epoch": 0.5192360614233035, + "grad_norm": 0.7067640423774719, + "learning_rate": 8.450404563900022e-06, + "loss": 0.7245, + "step": 9434 + }, + { + "epoch": 0.519291100225659, + "grad_norm": 0.7079932689666748, + "learning_rate": 8.450090837174956e-06, + "loss": 0.7776, + "step": 9435 + }, + { + "epoch": 0.5193461390280147, + "grad_norm": 0.8260107636451721, + "learning_rate": 8.44977708451993e-06, + "loss": 0.8529, + "step": 9436 + }, + { + "epoch": 0.5194011778303704, + "grad_norm": 0.6412167549133301, + "learning_rate": 8.449463305937304e-06, + "loss": 0.7371, + "step": 9437 + }, + { + "epoch": 0.5194562166327261, + "grad_norm": 0.7067576050758362, + "learning_rate": 8.449149501429435e-06, + "loss": 0.7161, + "step": 9438 + }, + { + "epoch": 0.5195112554350817, + "grad_norm": 0.6966904997825623, + "learning_rate": 8.448835670998681e-06, + "loss": 0.7285, + "step": 9439 + }, + { + "epoch": 0.5195662942374374, + "grad_norm": 0.8066132664680481, + "learning_rate": 8.448521814647401e-06, + "loss": 0.8265, + "step": 9440 + }, + { + "epoch": 0.5196213330397931, + "grad_norm": 0.7597149610519409, + "learning_rate": 8.448207932377957e-06, + "loss": 0.7721, + "step": 9441 + }, + { + "epoch": 0.5196763718421488, + "grad_norm": 0.6965302228927612, + "learning_rate": 8.447894024192702e-06, + "loss": 0.749, + "step": 9442 + }, + { + "epoch": 0.5197314106445043, + "grad_norm": 0.7032600045204163, + "learning_rate": 8.447580090094e-06, + "loss": 0.7923, + "step": 9443 + }, + { + "epoch": 0.51978644944686, + "grad_norm": 0.7255309820175171, + "learning_rate": 8.447266130084208e-06, + "loss": 0.6739, + "step": 9444 + }, + { + "epoch": 0.5198414882492157, + "grad_norm": 0.6602993011474609, + "learning_rate": 8.446952144165686e-06, + "loss": 0.7886, + "step": 9445 + }, + { + "epoch": 0.5198965270515714, + "grad_norm": 0.7017884850502014, + "learning_rate": 8.446638132340796e-06, + "loss": 0.7554, + "step": 9446 + }, + { + "epoch": 0.519951565853927, + "grad_norm": 0.7234843969345093, + "learning_rate": 8.446324094611894e-06, + "loss": 0.8294, + "step": 9447 + }, + { + "epoch": 0.5200066046562827, + "grad_norm": 0.6859332919120789, + "learning_rate": 8.446010030981347e-06, + "loss": 0.7563, + "step": 9448 + }, + { + "epoch": 0.5200616434586384, + "grad_norm": 0.7759458422660828, + "learning_rate": 8.445695941451507e-06, + "loss": 0.7577, + "step": 9449 + }, + { + "epoch": 0.520116682260994, + "grad_norm": 0.7852263450622559, + "learning_rate": 8.44538182602474e-06, + "loss": 0.7446, + "step": 9450 + }, + { + "epoch": 0.5201717210633496, + "grad_norm": 0.8143053650856018, + "learning_rate": 8.445067684703406e-06, + "loss": 0.7995, + "step": 9451 + }, + { + "epoch": 0.5202267598657053, + "grad_norm": 0.692738950252533, + "learning_rate": 8.444753517489865e-06, + "loss": 0.7185, + "step": 9452 + }, + { + "epoch": 0.520281798668061, + "grad_norm": 0.6615390181541443, + "learning_rate": 8.444439324386478e-06, + "loss": 0.7128, + "step": 9453 + }, + { + "epoch": 0.5203368374704166, + "grad_norm": 0.7360419034957886, + "learning_rate": 8.444125105395608e-06, + "loss": 0.6565, + "step": 9454 + }, + { + "epoch": 0.5203918762727723, + "grad_norm": 0.7280182838439941, + "learning_rate": 8.443810860519615e-06, + "loss": 0.7295, + "step": 9455 + }, + { + "epoch": 0.520446915075128, + "grad_norm": 0.787367582321167, + "learning_rate": 8.44349658976086e-06, + "loss": 0.7342, + "step": 9456 + }, + { + "epoch": 0.5205019538774837, + "grad_norm": 0.7496024966239929, + "learning_rate": 8.44318229312171e-06, + "loss": 0.7499, + "step": 9457 + }, + { + "epoch": 0.5205569926798392, + "grad_norm": 0.9167383909225464, + "learning_rate": 8.44286797060452e-06, + "loss": 0.7797, + "step": 9458 + }, + { + "epoch": 0.5206120314821949, + "grad_norm": 0.7032341957092285, + "learning_rate": 8.442553622211659e-06, + "loss": 0.7627, + "step": 9459 + }, + { + "epoch": 0.5206670702845506, + "grad_norm": 1.2905993461608887, + "learning_rate": 8.442239247945485e-06, + "loss": 0.7841, + "step": 9460 + }, + { + "epoch": 0.5207221090869063, + "grad_norm": 0.6909230351448059, + "learning_rate": 8.441924847808362e-06, + "loss": 0.7234, + "step": 9461 + }, + { + "epoch": 0.5207771478892619, + "grad_norm": 0.6632175445556641, + "learning_rate": 8.441610421802653e-06, + "loss": 0.6733, + "step": 9462 + }, + { + "epoch": 0.5208321866916176, + "grad_norm": 0.7838154435157776, + "learning_rate": 8.441295969930722e-06, + "loss": 0.7583, + "step": 9463 + }, + { + "epoch": 0.5208872254939733, + "grad_norm": 0.6380481123924255, + "learning_rate": 8.440981492194932e-06, + "loss": 0.7109, + "step": 9464 + }, + { + "epoch": 0.520942264296329, + "grad_norm": 0.6859052181243896, + "learning_rate": 8.440666988597646e-06, + "loss": 0.7387, + "step": 9465 + }, + { + "epoch": 0.5209973030986845, + "grad_norm": 0.7411379814147949, + "learning_rate": 8.440352459141226e-06, + "loss": 0.7852, + "step": 9466 + }, + { + "epoch": 0.5210523419010402, + "grad_norm": 0.6925216913223267, + "learning_rate": 8.44003790382804e-06, + "loss": 0.8228, + "step": 9467 + }, + { + "epoch": 0.5211073807033959, + "grad_norm": 0.7136396169662476, + "learning_rate": 8.43972332266045e-06, + "loss": 0.8168, + "step": 9468 + }, + { + "epoch": 0.5211624195057516, + "grad_norm": 0.719639003276825, + "learning_rate": 8.43940871564082e-06, + "loss": 0.6728, + "step": 9469 + }, + { + "epoch": 0.5212174583081072, + "grad_norm": 0.647861897945404, + "learning_rate": 8.439094082771513e-06, + "loss": 0.6986, + "step": 9470 + }, + { + "epoch": 0.5212724971104629, + "grad_norm": 0.6644579172134399, + "learning_rate": 8.438779424054897e-06, + "loss": 0.6263, + "step": 9471 + }, + { + "epoch": 0.5213275359128186, + "grad_norm": 0.7157352566719055, + "learning_rate": 8.438464739493335e-06, + "loss": 0.827, + "step": 9472 + }, + { + "epoch": 0.5213825747151742, + "grad_norm": 0.793765127658844, + "learning_rate": 8.438150029089193e-06, + "loss": 0.741, + "step": 9473 + }, + { + "epoch": 0.5214376135175298, + "grad_norm": 0.7078518867492676, + "learning_rate": 8.437835292844836e-06, + "loss": 0.7618, + "step": 9474 + }, + { + "epoch": 0.5214926523198855, + "grad_norm": 0.7492140531539917, + "learning_rate": 8.437520530762628e-06, + "loss": 0.7894, + "step": 9475 + }, + { + "epoch": 0.5215476911222412, + "grad_norm": 0.6534473299980164, + "learning_rate": 8.437205742844937e-06, + "loss": 0.7567, + "step": 9476 + }, + { + "epoch": 0.5216027299245969, + "grad_norm": 0.8745388984680176, + "learning_rate": 8.436890929094126e-06, + "loss": 0.8758, + "step": 9477 + }, + { + "epoch": 0.5216577687269525, + "grad_norm": 0.6804752349853516, + "learning_rate": 8.436576089512564e-06, + "loss": 0.7841, + "step": 9478 + }, + { + "epoch": 0.5217128075293082, + "grad_norm": 0.712065577507019, + "learning_rate": 8.436261224102615e-06, + "loss": 0.8079, + "step": 9479 + }, + { + "epoch": 0.5217678463316638, + "grad_norm": 0.8733783960342407, + "learning_rate": 8.435946332866648e-06, + "loss": 0.8295, + "step": 9480 + }, + { + "epoch": 0.5218228851340195, + "grad_norm": 0.6871289610862732, + "learning_rate": 8.435631415807028e-06, + "loss": 0.7087, + "step": 9481 + }, + { + "epoch": 0.5218779239363751, + "grad_norm": 0.8363185524940491, + "learning_rate": 8.43531647292612e-06, + "loss": 0.7329, + "step": 9482 + }, + { + "epoch": 0.5219329627387308, + "grad_norm": 0.6845195293426514, + "learning_rate": 8.435001504226295e-06, + "loss": 0.7651, + "step": 9483 + }, + { + "epoch": 0.5219880015410865, + "grad_norm": 0.7527645826339722, + "learning_rate": 8.434686509709917e-06, + "loss": 0.6856, + "step": 9484 + }, + { + "epoch": 0.5220430403434422, + "grad_norm": 0.6945710778236389, + "learning_rate": 8.434371489379356e-06, + "loss": 0.6875, + "step": 9485 + }, + { + "epoch": 0.5220980791457978, + "grad_norm": 0.7668873071670532, + "learning_rate": 8.434056443236977e-06, + "loss": 0.7662, + "step": 9486 + }, + { + "epoch": 0.5221531179481534, + "grad_norm": 0.9873473048210144, + "learning_rate": 8.433741371285148e-06, + "loss": 0.7662, + "step": 9487 + }, + { + "epoch": 0.5222081567505091, + "grad_norm": 0.8635447025299072, + "learning_rate": 8.43342627352624e-06, + "loss": 0.645, + "step": 9488 + }, + { + "epoch": 0.5222631955528648, + "grad_norm": 0.7836978435516357, + "learning_rate": 8.43311114996262e-06, + "loss": 0.7647, + "step": 9489 + }, + { + "epoch": 0.5223182343552204, + "grad_norm": 0.8370835185050964, + "learning_rate": 8.432796000596652e-06, + "loss": 0.8402, + "step": 9490 + }, + { + "epoch": 0.5223732731575761, + "grad_norm": 0.9627843499183655, + "learning_rate": 8.432480825430712e-06, + "loss": 0.6985, + "step": 9491 + }, + { + "epoch": 0.5224283119599318, + "grad_norm": 0.6774263978004456, + "learning_rate": 8.432165624467163e-06, + "loss": 0.7051, + "step": 9492 + }, + { + "epoch": 0.5224833507622874, + "grad_norm": 0.6590597033500671, + "learning_rate": 8.431850397708375e-06, + "loss": 0.7147, + "step": 9493 + }, + { + "epoch": 0.522538389564643, + "grad_norm": 0.8153522610664368, + "learning_rate": 8.43153514515672e-06, + "loss": 0.6759, + "step": 9494 + }, + { + "epoch": 0.5225934283669987, + "grad_norm": 0.7457708716392517, + "learning_rate": 8.431219866814563e-06, + "loss": 0.7168, + "step": 9495 + }, + { + "epoch": 0.5226484671693544, + "grad_norm": 0.6994161009788513, + "learning_rate": 8.430904562684278e-06, + "loss": 0.8393, + "step": 9496 + }, + { + "epoch": 0.52270350597171, + "grad_norm": 0.780337393283844, + "learning_rate": 8.430589232768232e-06, + "loss": 0.6528, + "step": 9497 + }, + { + "epoch": 0.5227585447740657, + "grad_norm": 0.6833232641220093, + "learning_rate": 8.430273877068796e-06, + "loss": 0.7545, + "step": 9498 + }, + { + "epoch": 0.5228135835764214, + "grad_norm": 0.7330057621002197, + "learning_rate": 8.42995849558834e-06, + "loss": 0.7932, + "step": 9499 + }, + { + "epoch": 0.5228686223787771, + "grad_norm": 0.8131541609764099, + "learning_rate": 8.429643088329233e-06, + "loss": 0.7546, + "step": 9500 + }, + { + "epoch": 0.5229236611811326, + "grad_norm": 0.7353833317756653, + "learning_rate": 8.42932765529385e-06, + "loss": 0.7508, + "step": 9501 + }, + { + "epoch": 0.5229786999834883, + "grad_norm": 0.7166246771812439, + "learning_rate": 8.429012196484554e-06, + "loss": 0.728, + "step": 9502 + }, + { + "epoch": 0.523033738785844, + "grad_norm": 0.732064962387085, + "learning_rate": 8.428696711903721e-06, + "loss": 0.8306, + "step": 9503 + }, + { + "epoch": 0.5230887775881997, + "grad_norm": 0.6858934164047241, + "learning_rate": 8.428381201553721e-06, + "loss": 0.7801, + "step": 9504 + }, + { + "epoch": 0.5231438163905553, + "grad_norm": 0.7046478986740112, + "learning_rate": 8.428065665436928e-06, + "loss": 0.7365, + "step": 9505 + }, + { + "epoch": 0.523198855192911, + "grad_norm": 0.6669325828552246, + "learning_rate": 8.42775010355571e-06, + "loss": 0.7764, + "step": 9506 + }, + { + "epoch": 0.5232538939952667, + "grad_norm": 0.655619740486145, + "learning_rate": 8.427434515912438e-06, + "loss": 0.7919, + "step": 9507 + }, + { + "epoch": 0.5233089327976224, + "grad_norm": 0.6236690878868103, + "learning_rate": 8.427118902509487e-06, + "loss": 0.6653, + "step": 9508 + }, + { + "epoch": 0.5233639715999779, + "grad_norm": 0.8233165740966797, + "learning_rate": 8.426803263349228e-06, + "loss": 0.8012, + "step": 9509 + }, + { + "epoch": 0.5234190104023336, + "grad_norm": 0.6626759171485901, + "learning_rate": 8.426487598434035e-06, + "loss": 0.7728, + "step": 9510 + }, + { + "epoch": 0.5234740492046893, + "grad_norm": 0.9209974408149719, + "learning_rate": 8.426171907766275e-06, + "loss": 0.769, + "step": 9511 + }, + { + "epoch": 0.523529088007045, + "grad_norm": 0.6297587156295776, + "learning_rate": 8.425856191348325e-06, + "loss": 0.7333, + "step": 9512 + }, + { + "epoch": 0.5235841268094006, + "grad_norm": 0.6995256543159485, + "learning_rate": 8.425540449182558e-06, + "loss": 0.7486, + "step": 9513 + }, + { + "epoch": 0.5236391656117563, + "grad_norm": 0.8076607584953308, + "learning_rate": 8.425224681271345e-06, + "loss": 0.8533, + "step": 9514 + }, + { + "epoch": 0.523694204414112, + "grad_norm": 1.2198601961135864, + "learning_rate": 8.42490888761706e-06, + "loss": 0.7291, + "step": 9515 + }, + { + "epoch": 0.5237492432164677, + "grad_norm": 0.7047159671783447, + "learning_rate": 8.424593068222076e-06, + "loss": 0.713, + "step": 9516 + }, + { + "epoch": 0.5238042820188232, + "grad_norm": 0.7652333378791809, + "learning_rate": 8.424277223088768e-06, + "loss": 0.8149, + "step": 9517 + }, + { + "epoch": 0.5238593208211789, + "grad_norm": 1.1311010122299194, + "learning_rate": 8.42396135221951e-06, + "loss": 0.8195, + "step": 9518 + }, + { + "epoch": 0.5239143596235346, + "grad_norm": 0.7855533957481384, + "learning_rate": 8.423645455616674e-06, + "loss": 0.7901, + "step": 9519 + }, + { + "epoch": 0.5239693984258903, + "grad_norm": 0.7028971314430237, + "learning_rate": 8.423329533282635e-06, + "loss": 0.8006, + "step": 9520 + }, + { + "epoch": 0.5240244372282459, + "grad_norm": 0.703809916973114, + "learning_rate": 8.423013585219769e-06, + "loss": 0.7581, + "step": 9521 + }, + { + "epoch": 0.5240794760306016, + "grad_norm": 0.94233238697052, + "learning_rate": 8.422697611430448e-06, + "loss": 0.7689, + "step": 9522 + }, + { + "epoch": 0.5241345148329573, + "grad_norm": 0.8164071440696716, + "learning_rate": 8.422381611917047e-06, + "loss": 0.8761, + "step": 9523 + }, + { + "epoch": 0.5241895536353129, + "grad_norm": 0.6242091059684753, + "learning_rate": 8.422065586681944e-06, + "loss": 0.6975, + "step": 9524 + }, + { + "epoch": 0.5242445924376685, + "grad_norm": 0.6607261300086975, + "learning_rate": 8.42174953572751e-06, + "loss": 0.6847, + "step": 9525 + }, + { + "epoch": 0.5242996312400242, + "grad_norm": 0.7174261212348938, + "learning_rate": 8.421433459056123e-06, + "loss": 0.7905, + "step": 9526 + }, + { + "epoch": 0.5243546700423799, + "grad_norm": 0.7414089441299438, + "learning_rate": 8.42111735667016e-06, + "loss": 0.7788, + "step": 9527 + }, + { + "epoch": 0.5244097088447356, + "grad_norm": 0.7347442507743835, + "learning_rate": 8.420801228571992e-06, + "loss": 0.7691, + "step": 9528 + }, + { + "epoch": 0.5244647476470912, + "grad_norm": 0.6947832107543945, + "learning_rate": 8.420485074763999e-06, + "loss": 0.6702, + "step": 9529 + }, + { + "epoch": 0.5245197864494469, + "grad_norm": 0.6865423321723938, + "learning_rate": 8.420168895248557e-06, + "loss": 0.7577, + "step": 9530 + }, + { + "epoch": 0.5245748252518025, + "grad_norm": 0.7023190855979919, + "learning_rate": 8.419852690028039e-06, + "loss": 0.7711, + "step": 9531 + }, + { + "epoch": 0.5246298640541582, + "grad_norm": 0.8312145471572876, + "learning_rate": 8.419536459104824e-06, + "loss": 0.7999, + "step": 9532 + }, + { + "epoch": 0.5246849028565138, + "grad_norm": 0.6700688600540161, + "learning_rate": 8.419220202481288e-06, + "loss": 0.7163, + "step": 9533 + }, + { + "epoch": 0.5247399416588695, + "grad_norm": 0.767062246799469, + "learning_rate": 8.418903920159809e-06, + "loss": 0.7451, + "step": 9534 + }, + { + "epoch": 0.5247949804612252, + "grad_norm": 0.6814010143280029, + "learning_rate": 8.418587612142763e-06, + "loss": 0.771, + "step": 9535 + }, + { + "epoch": 0.5248500192635808, + "grad_norm": 0.6728426218032837, + "learning_rate": 8.418271278432528e-06, + "loss": 0.8336, + "step": 9536 + }, + { + "epoch": 0.5249050580659365, + "grad_norm": 0.7112382650375366, + "learning_rate": 8.417954919031482e-06, + "loss": 0.7392, + "step": 9537 + }, + { + "epoch": 0.5249600968682921, + "grad_norm": 0.7371365427970886, + "learning_rate": 8.417638533942e-06, + "loss": 0.8233, + "step": 9538 + }, + { + "epoch": 0.5250151356706478, + "grad_norm": 0.6593502163887024, + "learning_rate": 8.41732212316646e-06, + "loss": 0.7455, + "step": 9539 + }, + { + "epoch": 0.5250701744730034, + "grad_norm": 0.685553252696991, + "learning_rate": 8.417005686707245e-06, + "loss": 0.7783, + "step": 9540 + }, + { + "epoch": 0.5251252132753591, + "grad_norm": 0.7003353238105774, + "learning_rate": 8.41668922456673e-06, + "loss": 0.7733, + "step": 9541 + }, + { + "epoch": 0.5251802520777148, + "grad_norm": 0.7602891325950623, + "learning_rate": 8.416372736747292e-06, + "loss": 0.7236, + "step": 9542 + }, + { + "epoch": 0.5252352908800705, + "grad_norm": 0.647531270980835, + "learning_rate": 8.41605622325131e-06, + "loss": 0.7388, + "step": 9543 + }, + { + "epoch": 0.5252903296824261, + "grad_norm": 0.7309756875038147, + "learning_rate": 8.415739684081165e-06, + "loss": 0.7178, + "step": 9544 + }, + { + "epoch": 0.5253453684847817, + "grad_norm": 0.6991532444953918, + "learning_rate": 8.415423119239236e-06, + "loss": 0.8078, + "step": 9545 + }, + { + "epoch": 0.5254004072871374, + "grad_norm": 0.7392330765724182, + "learning_rate": 8.4151065287279e-06, + "loss": 0.8452, + "step": 9546 + }, + { + "epoch": 0.5254554460894931, + "grad_norm": 0.7617329955101013, + "learning_rate": 8.414789912549537e-06, + "loss": 0.7885, + "step": 9547 + }, + { + "epoch": 0.5255104848918487, + "grad_norm": 1.160125732421875, + "learning_rate": 8.414473270706527e-06, + "loss": 0.9628, + "step": 9548 + }, + { + "epoch": 0.5255655236942044, + "grad_norm": 0.7578685879707336, + "learning_rate": 8.414156603201252e-06, + "loss": 0.7745, + "step": 9549 + }, + { + "epoch": 0.5256205624965601, + "grad_norm": 0.6963017582893372, + "learning_rate": 8.413839910036089e-06, + "loss": 0.7693, + "step": 9550 + }, + { + "epoch": 0.5256756012989158, + "grad_norm": 0.6631398797035217, + "learning_rate": 8.413523191213415e-06, + "loss": 0.6606, + "step": 9551 + }, + { + "epoch": 0.5257306401012714, + "grad_norm": 0.707343339920044, + "learning_rate": 8.41320644673562e-06, + "loss": 0.7161, + "step": 9552 + }, + { + "epoch": 0.525785678903627, + "grad_norm": 0.833448588848114, + "learning_rate": 8.412889676605075e-06, + "loss": 0.7509, + "step": 9553 + }, + { + "epoch": 0.5258407177059827, + "grad_norm": 0.6214264631271362, + "learning_rate": 8.412572880824168e-06, + "loss": 0.7436, + "step": 9554 + }, + { + "epoch": 0.5258957565083384, + "grad_norm": 0.6479233503341675, + "learning_rate": 8.412256059395274e-06, + "loss": 0.7359, + "step": 9555 + }, + { + "epoch": 0.525950795310694, + "grad_norm": 0.7596501111984253, + "learning_rate": 8.411939212320778e-06, + "loss": 0.7422, + "step": 9556 + }, + { + "epoch": 0.5260058341130497, + "grad_norm": 0.8040934205055237, + "learning_rate": 8.41162233960306e-06, + "loss": 0.7721, + "step": 9557 + }, + { + "epoch": 0.5260608729154054, + "grad_norm": 0.7190027832984924, + "learning_rate": 8.411305441244505e-06, + "loss": 0.8794, + "step": 9558 + }, + { + "epoch": 0.5261159117177611, + "grad_norm": 0.8002649545669556, + "learning_rate": 8.410988517247486e-06, + "loss": 0.7958, + "step": 9559 + }, + { + "epoch": 0.5261709505201166, + "grad_norm": 0.7151750326156616, + "learning_rate": 8.410671567614394e-06, + "loss": 0.7597, + "step": 9560 + }, + { + "epoch": 0.5262259893224723, + "grad_norm": 0.9718102812767029, + "learning_rate": 8.410354592347607e-06, + "loss": 0.8272, + "step": 9561 + }, + { + "epoch": 0.526281028124828, + "grad_norm": 0.701932966709137, + "learning_rate": 8.410037591449506e-06, + "loss": 0.808, + "step": 9562 + }, + { + "epoch": 0.5263360669271837, + "grad_norm": 0.8247585296630859, + "learning_rate": 8.409720564922476e-06, + "loss": 0.7598, + "step": 9563 + }, + { + "epoch": 0.5263911057295393, + "grad_norm": 0.7305104732513428, + "learning_rate": 8.409403512768899e-06, + "loss": 0.8161, + "step": 9564 + }, + { + "epoch": 0.526446144531895, + "grad_norm": 0.8726410865783691, + "learning_rate": 8.409086434991158e-06, + "loss": 0.8598, + "step": 9565 + }, + { + "epoch": 0.5265011833342507, + "grad_norm": 0.7329155802726746, + "learning_rate": 8.408769331591637e-06, + "loss": 0.7355, + "step": 9566 + }, + { + "epoch": 0.5265562221366064, + "grad_norm": 0.8227902054786682, + "learning_rate": 8.408452202572716e-06, + "loss": 0.7888, + "step": 9567 + }, + { + "epoch": 0.5266112609389619, + "grad_norm": 0.7190666794776917, + "learning_rate": 8.408135047936783e-06, + "loss": 0.669, + "step": 9568 + }, + { + "epoch": 0.5266662997413176, + "grad_norm": 0.6529938578605652, + "learning_rate": 8.407817867686217e-06, + "loss": 0.7345, + "step": 9569 + }, + { + "epoch": 0.5267213385436733, + "grad_norm": 0.6985379457473755, + "learning_rate": 8.407500661823407e-06, + "loss": 0.852, + "step": 9570 + }, + { + "epoch": 0.526776377346029, + "grad_norm": 0.7480047345161438, + "learning_rate": 8.407183430350732e-06, + "loss": 0.7422, + "step": 9571 + }, + { + "epoch": 0.5268314161483846, + "grad_norm": 0.7599420547485352, + "learning_rate": 8.406866173270579e-06, + "loss": 0.7499, + "step": 9572 + }, + { + "epoch": 0.5268864549507403, + "grad_norm": 0.813448965549469, + "learning_rate": 8.406548890585331e-06, + "loss": 0.7979, + "step": 9573 + }, + { + "epoch": 0.526941493753096, + "grad_norm": 0.6029278039932251, + "learning_rate": 8.406231582297374e-06, + "loss": 0.7289, + "step": 9574 + }, + { + "epoch": 0.5269965325554516, + "grad_norm": 0.656829297542572, + "learning_rate": 8.40591424840909e-06, + "loss": 0.6778, + "step": 9575 + }, + { + "epoch": 0.5270515713578072, + "grad_norm": 0.7147198915481567, + "learning_rate": 8.405596888922869e-06, + "loss": 0.7212, + "step": 9576 + }, + { + "epoch": 0.5271066101601629, + "grad_norm": 0.7722035050392151, + "learning_rate": 8.405279503841094e-06, + "loss": 0.8008, + "step": 9577 + }, + { + "epoch": 0.5271616489625186, + "grad_norm": 0.6828493475914001, + "learning_rate": 8.40496209316615e-06, + "loss": 0.787, + "step": 9578 + }, + { + "epoch": 0.5272166877648742, + "grad_norm": 0.6965187788009644, + "learning_rate": 8.40464465690042e-06, + "loss": 0.6803, + "step": 9579 + }, + { + "epoch": 0.5272717265672299, + "grad_norm": 0.7300547957420349, + "learning_rate": 8.404327195046293e-06, + "loss": 0.8165, + "step": 9580 + }, + { + "epoch": 0.5273267653695856, + "grad_norm": 0.7367526292800903, + "learning_rate": 8.404009707606153e-06, + "loss": 0.7709, + "step": 9581 + }, + { + "epoch": 0.5273818041719412, + "grad_norm": 0.6694689989089966, + "learning_rate": 8.40369219458239e-06, + "loss": 0.7971, + "step": 9582 + }, + { + "epoch": 0.5274368429742968, + "grad_norm": 0.6723141074180603, + "learning_rate": 8.403374655977384e-06, + "loss": 0.695, + "step": 9583 + }, + { + "epoch": 0.5274918817766525, + "grad_norm": 0.7737089395523071, + "learning_rate": 8.403057091793528e-06, + "loss": 0.7765, + "step": 9584 + }, + { + "epoch": 0.5275469205790082, + "grad_norm": 0.8378487825393677, + "learning_rate": 8.402739502033204e-06, + "loss": 0.7984, + "step": 9585 + }, + { + "epoch": 0.5276019593813639, + "grad_norm": 0.7496509552001953, + "learning_rate": 8.402421886698802e-06, + "loss": 0.7846, + "step": 9586 + }, + { + "epoch": 0.5276569981837195, + "grad_norm": 0.7020435929298401, + "learning_rate": 8.402104245792706e-06, + "loss": 0.8102, + "step": 9587 + }, + { + "epoch": 0.5277120369860752, + "grad_norm": 0.8877277374267578, + "learning_rate": 8.401786579317308e-06, + "loss": 0.6995, + "step": 9588 + }, + { + "epoch": 0.5277670757884309, + "grad_norm": 0.6975196599960327, + "learning_rate": 8.401468887274991e-06, + "loss": 0.7475, + "step": 9589 + }, + { + "epoch": 0.5278221145907865, + "grad_norm": 0.8267357349395752, + "learning_rate": 8.401151169668144e-06, + "loss": 0.7091, + "step": 9590 + }, + { + "epoch": 0.5278771533931421, + "grad_norm": 0.6778179407119751, + "learning_rate": 8.400833426499156e-06, + "loss": 0.8198, + "step": 9591 + }, + { + "epoch": 0.5279321921954978, + "grad_norm": 0.7343330979347229, + "learning_rate": 8.400515657770414e-06, + "loss": 0.7565, + "step": 9592 + }, + { + "epoch": 0.5279872309978535, + "grad_norm": 0.7745271325111389, + "learning_rate": 8.400197863484307e-06, + "loss": 0.7991, + "step": 9593 + }, + { + "epoch": 0.5280422698002092, + "grad_norm": 0.7652345895767212, + "learning_rate": 8.399880043643224e-06, + "loss": 0.7752, + "step": 9594 + }, + { + "epoch": 0.5280973086025648, + "grad_norm": 0.9764432311058044, + "learning_rate": 8.399562198249551e-06, + "loss": 0.784, + "step": 9595 + }, + { + "epoch": 0.5281523474049205, + "grad_norm": 0.6763052940368652, + "learning_rate": 8.399244327305678e-06, + "loss": 0.7695, + "step": 9596 + }, + { + "epoch": 0.5282073862072761, + "grad_norm": 0.7788934111595154, + "learning_rate": 8.398926430813996e-06, + "loss": 0.8152, + "step": 9597 + }, + { + "epoch": 0.5282624250096318, + "grad_norm": 0.8088317513465881, + "learning_rate": 8.398608508776894e-06, + "loss": 0.7751, + "step": 9598 + }, + { + "epoch": 0.5283174638119874, + "grad_norm": 0.6735319495201111, + "learning_rate": 8.398290561196756e-06, + "loss": 0.7305, + "step": 9599 + }, + { + "epoch": 0.5283725026143431, + "grad_norm": 0.7279297113418579, + "learning_rate": 8.39797258807598e-06, + "loss": 0.7381, + "step": 9600 + }, + { + "epoch": 0.5284275414166988, + "grad_norm": 0.74604332447052, + "learning_rate": 8.39765458941695e-06, + "loss": 0.8138, + "step": 9601 + }, + { + "epoch": 0.5284825802190545, + "grad_norm": 0.7735850214958191, + "learning_rate": 8.397336565222057e-06, + "loss": 0.7364, + "step": 9602 + }, + { + "epoch": 0.52853761902141, + "grad_norm": 0.7890003323554993, + "learning_rate": 8.397018515493693e-06, + "loss": 0.8301, + "step": 9603 + }, + { + "epoch": 0.5285926578237657, + "grad_norm": 0.739054262638092, + "learning_rate": 8.396700440234245e-06, + "loss": 0.7503, + "step": 9604 + }, + { + "epoch": 0.5286476966261214, + "grad_norm": 0.7611023783683777, + "learning_rate": 8.396382339446108e-06, + "loss": 0.7225, + "step": 9605 + }, + { + "epoch": 0.5287027354284771, + "grad_norm": 0.770602285861969, + "learning_rate": 8.39606421313167e-06, + "loss": 0.71, + "step": 9606 + }, + { + "epoch": 0.5287577742308327, + "grad_norm": 0.7495261430740356, + "learning_rate": 8.395746061293322e-06, + "loss": 0.7729, + "step": 9607 + }, + { + "epoch": 0.5288128130331884, + "grad_norm": 0.7159668207168579, + "learning_rate": 8.395427883933456e-06, + "loss": 0.8457, + "step": 9608 + }, + { + "epoch": 0.5288678518355441, + "grad_norm": 0.7663426399230957, + "learning_rate": 8.395109681054463e-06, + "loss": 0.784, + "step": 9609 + }, + { + "epoch": 0.5289228906378998, + "grad_norm": 0.7271933555603027, + "learning_rate": 8.394791452658732e-06, + "loss": 0.7981, + "step": 9610 + }, + { + "epoch": 0.5289779294402553, + "grad_norm": 0.7782096266746521, + "learning_rate": 8.394473198748661e-06, + "loss": 0.7953, + "step": 9611 + }, + { + "epoch": 0.529032968242611, + "grad_norm": 0.8318955302238464, + "learning_rate": 8.394154919326636e-06, + "loss": 0.6875, + "step": 9612 + }, + { + "epoch": 0.5290880070449667, + "grad_norm": 0.7402167916297913, + "learning_rate": 8.393836614395051e-06, + "loss": 0.7805, + "step": 9613 + }, + { + "epoch": 0.5291430458473224, + "grad_norm": 0.6314370632171631, + "learning_rate": 8.393518283956299e-06, + "loss": 0.6841, + "step": 9614 + }, + { + "epoch": 0.529198084649678, + "grad_norm": 0.8387365937232971, + "learning_rate": 8.393199928012772e-06, + "loss": 0.8503, + "step": 9615 + }, + { + "epoch": 0.5292531234520337, + "grad_norm": 0.7066243886947632, + "learning_rate": 8.392881546566863e-06, + "loss": 0.8494, + "step": 9616 + }, + { + "epoch": 0.5293081622543894, + "grad_norm": 0.7034226059913635, + "learning_rate": 8.392563139620964e-06, + "loss": 0.7335, + "step": 9617 + }, + { + "epoch": 0.5293632010567451, + "grad_norm": 0.6969622373580933, + "learning_rate": 8.392244707177468e-06, + "loss": 0.7203, + "step": 9618 + }, + { + "epoch": 0.5294182398591006, + "grad_norm": 0.7694050073623657, + "learning_rate": 8.391926249238768e-06, + "loss": 0.7864, + "step": 9619 + }, + { + "epoch": 0.5294732786614563, + "grad_norm": 0.7284281253814697, + "learning_rate": 8.391607765807262e-06, + "loss": 0.6704, + "step": 9620 + }, + { + "epoch": 0.529528317463812, + "grad_norm": 1.0466688871383667, + "learning_rate": 8.391289256885337e-06, + "loss": 0.7807, + "step": 9621 + }, + { + "epoch": 0.5295833562661676, + "grad_norm": 0.7118388414382935, + "learning_rate": 8.39097072247539e-06, + "loss": 0.738, + "step": 9622 + }, + { + "epoch": 0.5296383950685233, + "grad_norm": 0.794377863407135, + "learning_rate": 8.390652162579815e-06, + "loss": 0.6831, + "step": 9623 + }, + { + "epoch": 0.529693433870879, + "grad_norm": 0.6042492389678955, + "learning_rate": 8.390333577201007e-06, + "loss": 0.6773, + "step": 9624 + }, + { + "epoch": 0.5297484726732347, + "grad_norm": 0.6452521681785583, + "learning_rate": 8.390014966341357e-06, + "loss": 0.7168, + "step": 9625 + }, + { + "epoch": 0.5298035114755902, + "grad_norm": 0.7113651633262634, + "learning_rate": 8.389696330003265e-06, + "loss": 0.709, + "step": 9626 + }, + { + "epoch": 0.5298585502779459, + "grad_norm": 0.6469250917434692, + "learning_rate": 8.38937766818912e-06, + "loss": 0.6804, + "step": 9627 + }, + { + "epoch": 0.5299135890803016, + "grad_norm": 0.7529417872428894, + "learning_rate": 8.389058980901322e-06, + "loss": 0.8537, + "step": 9628 + }, + { + "epoch": 0.5299686278826573, + "grad_norm": 0.7681186199188232, + "learning_rate": 8.388740268142262e-06, + "loss": 0.7383, + "step": 9629 + }, + { + "epoch": 0.5300236666850129, + "grad_norm": 0.6585648655891418, + "learning_rate": 8.388421529914337e-06, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 0.5300787054873686, + "grad_norm": 0.7432085871696472, + "learning_rate": 8.388102766219943e-06, + "loss": 0.7391, + "step": 9631 + }, + { + "epoch": 0.5301337442897243, + "grad_norm": 0.6672815084457397, + "learning_rate": 8.387783977061476e-06, + "loss": 0.8056, + "step": 9632 + }, + { + "epoch": 0.53018878309208, + "grad_norm": 0.7566675543785095, + "learning_rate": 8.387465162441332e-06, + "loss": 0.7858, + "step": 9633 + }, + { + "epoch": 0.5302438218944355, + "grad_norm": 0.6522077322006226, + "learning_rate": 8.387146322361907e-06, + "loss": 0.759, + "step": 9634 + }, + { + "epoch": 0.5302988606967912, + "grad_norm": 0.7246397137641907, + "learning_rate": 8.386827456825597e-06, + "loss": 0.8158, + "step": 9635 + }, + { + "epoch": 0.5303538994991469, + "grad_norm": 0.7577807307243347, + "learning_rate": 8.386508565834797e-06, + "loss": 0.7495, + "step": 9636 + }, + { + "epoch": 0.5304089383015026, + "grad_norm": 0.7080703973770142, + "learning_rate": 8.386189649391906e-06, + "loss": 0.8086, + "step": 9637 + }, + { + "epoch": 0.5304639771038582, + "grad_norm": 0.7505277395248413, + "learning_rate": 8.385870707499321e-06, + "loss": 0.7206, + "step": 9638 + }, + { + "epoch": 0.5305190159062139, + "grad_norm": 0.7044165134429932, + "learning_rate": 8.385551740159437e-06, + "loss": 0.7838, + "step": 9639 + }, + { + "epoch": 0.5305740547085696, + "grad_norm": 0.7921645641326904, + "learning_rate": 8.385232747374652e-06, + "loss": 0.7604, + "step": 9640 + }, + { + "epoch": 0.5306290935109252, + "grad_norm": 0.9930111169815063, + "learning_rate": 8.384913729147364e-06, + "loss": 0.7839, + "step": 9641 + }, + { + "epoch": 0.5306841323132808, + "grad_norm": 0.7333244681358337, + "learning_rate": 8.38459468547997e-06, + "loss": 0.7941, + "step": 9642 + }, + { + "epoch": 0.5307391711156365, + "grad_norm": 0.7857590913772583, + "learning_rate": 8.384275616374868e-06, + "loss": 0.8535, + "step": 9643 + }, + { + "epoch": 0.5307942099179922, + "grad_norm": 0.8568746447563171, + "learning_rate": 8.383956521834459e-06, + "loss": 0.6586, + "step": 9644 + }, + { + "epoch": 0.5308492487203479, + "grad_norm": 0.7061276435852051, + "learning_rate": 8.383637401861136e-06, + "loss": 0.7288, + "step": 9645 + }, + { + "epoch": 0.5309042875227035, + "grad_norm": 0.7348940968513489, + "learning_rate": 8.383318256457303e-06, + "loss": 0.8099, + "step": 9646 + }, + { + "epoch": 0.5309593263250592, + "grad_norm": 0.6526725888252258, + "learning_rate": 8.382999085625353e-06, + "loss": 0.6702, + "step": 9647 + }, + { + "epoch": 0.5310143651274148, + "grad_norm": 0.8122747540473938, + "learning_rate": 8.382679889367687e-06, + "loss": 0.67, + "step": 9648 + }, + { + "epoch": 0.5310694039297705, + "grad_norm": 0.9145376682281494, + "learning_rate": 8.382360667686706e-06, + "loss": 0.7719, + "step": 9649 + }, + { + "epoch": 0.5311244427321261, + "grad_norm": 0.6659818887710571, + "learning_rate": 8.382041420584807e-06, + "loss": 0.806, + "step": 9650 + }, + { + "epoch": 0.5311794815344818, + "grad_norm": 0.7088539004325867, + "learning_rate": 8.381722148064391e-06, + "loss": 0.7046, + "step": 9651 + }, + { + "epoch": 0.5312345203368375, + "grad_norm": 0.8610590696334839, + "learning_rate": 8.381402850127854e-06, + "loss": 0.6998, + "step": 9652 + }, + { + "epoch": 0.5312895591391932, + "grad_norm": 0.775830864906311, + "learning_rate": 8.3810835267776e-06, + "loss": 0.8874, + "step": 9653 + }, + { + "epoch": 0.5313445979415488, + "grad_norm": 0.6871606707572937, + "learning_rate": 8.380764178016028e-06, + "loss": 0.7903, + "step": 9654 + }, + { + "epoch": 0.5313996367439044, + "grad_norm": 0.7005272507667542, + "learning_rate": 8.380444803845537e-06, + "loss": 0.6685, + "step": 9655 + }, + { + "epoch": 0.5314546755462601, + "grad_norm": 0.8922042846679688, + "learning_rate": 8.380125404268527e-06, + "loss": 0.7797, + "step": 9656 + }, + { + "epoch": 0.5315097143486158, + "grad_norm": 0.7242267727851868, + "learning_rate": 8.3798059792874e-06, + "loss": 0.863, + "step": 9657 + }, + { + "epoch": 0.5315647531509714, + "grad_norm": 0.6625328660011292, + "learning_rate": 8.379486528904555e-06, + "loss": 0.7, + "step": 9658 + }, + { + "epoch": 0.5316197919533271, + "grad_norm": 0.9882226586341858, + "learning_rate": 8.379167053122394e-06, + "loss": 0.7534, + "step": 9659 + }, + { + "epoch": 0.5316748307556828, + "grad_norm": 0.6894702911376953, + "learning_rate": 8.378847551943318e-06, + "loss": 0.7503, + "step": 9660 + }, + { + "epoch": 0.5317298695580385, + "grad_norm": 0.6820259690284729, + "learning_rate": 8.37852802536973e-06, + "loss": 0.7713, + "step": 9661 + }, + { + "epoch": 0.531784908360394, + "grad_norm": 0.667918860912323, + "learning_rate": 8.378208473404028e-06, + "loss": 0.7524, + "step": 9662 + }, + { + "epoch": 0.5318399471627497, + "grad_norm": 0.7789241075515747, + "learning_rate": 8.377888896048617e-06, + "loss": 0.6906, + "step": 9663 + }, + { + "epoch": 0.5318949859651054, + "grad_norm": 0.7264542579650879, + "learning_rate": 8.377569293305894e-06, + "loss": 0.7836, + "step": 9664 + }, + { + "epoch": 0.531950024767461, + "grad_norm": 0.6979835629463196, + "learning_rate": 8.377249665178267e-06, + "loss": 0.7739, + "step": 9665 + }, + { + "epoch": 0.5320050635698167, + "grad_norm": 0.8008072376251221, + "learning_rate": 8.376930011668136e-06, + "loss": 0.7853, + "step": 9666 + }, + { + "epoch": 0.5320601023721724, + "grad_norm": 0.7185621857643127, + "learning_rate": 8.376610332777901e-06, + "loss": 0.7311, + "step": 9667 + }, + { + "epoch": 0.5321151411745281, + "grad_norm": 0.7644047141075134, + "learning_rate": 8.376290628509969e-06, + "loss": 0.6919, + "step": 9668 + }, + { + "epoch": 0.5321701799768837, + "grad_norm": 0.7387600541114807, + "learning_rate": 8.37597089886674e-06, + "loss": 0.7285, + "step": 9669 + }, + { + "epoch": 0.5322252187792393, + "grad_norm": 0.7344895005226135, + "learning_rate": 8.375651143850614e-06, + "loss": 0.7514, + "step": 9670 + }, + { + "epoch": 0.532280257581595, + "grad_norm": 0.6930707097053528, + "learning_rate": 8.375331363464002e-06, + "loss": 0.8318, + "step": 9671 + }, + { + "epoch": 0.5323352963839507, + "grad_norm": 0.678162693977356, + "learning_rate": 8.3750115577093e-06, + "loss": 0.7123, + "step": 9672 + }, + { + "epoch": 0.5323903351863063, + "grad_norm": 0.7780481576919556, + "learning_rate": 8.374691726588914e-06, + "loss": 0.7672, + "step": 9673 + }, + { + "epoch": 0.532445373988662, + "grad_norm": 0.6664674282073975, + "learning_rate": 8.374371870105252e-06, + "loss": 0.6994, + "step": 9674 + }, + { + "epoch": 0.5325004127910177, + "grad_norm": 0.6952562928199768, + "learning_rate": 8.374051988260712e-06, + "loss": 0.8638, + "step": 9675 + }, + { + "epoch": 0.5325554515933734, + "grad_norm": 0.764005184173584, + "learning_rate": 8.373732081057699e-06, + "loss": 0.756, + "step": 9676 + }, + { + "epoch": 0.5326104903957289, + "grad_norm": 0.9434393048286438, + "learning_rate": 8.373412148498621e-06, + "loss": 0.8668, + "step": 9677 + }, + { + "epoch": 0.5326655291980846, + "grad_norm": 0.752609133720398, + "learning_rate": 8.373092190585878e-06, + "loss": 0.8078, + "step": 9678 + }, + { + "epoch": 0.5327205680004403, + "grad_norm": 0.671940803527832, + "learning_rate": 8.37277220732188e-06, + "loss": 0.7726, + "step": 9679 + }, + { + "epoch": 0.532775606802796, + "grad_norm": 0.7824863791465759, + "learning_rate": 8.372452198709027e-06, + "loss": 0.8246, + "step": 9680 + }, + { + "epoch": 0.5328306456051516, + "grad_norm": 0.7300587892532349, + "learning_rate": 8.372132164749726e-06, + "loss": 0.7953, + "step": 9681 + }, + { + "epoch": 0.5328856844075073, + "grad_norm": 0.7146018743515015, + "learning_rate": 8.371812105446384e-06, + "loss": 0.7409, + "step": 9682 + }, + { + "epoch": 0.532940723209863, + "grad_norm": 0.73857581615448, + "learning_rate": 8.371492020801404e-06, + "loss": 0.8067, + "step": 9683 + }, + { + "epoch": 0.5329957620122187, + "grad_norm": 0.6760877966880798, + "learning_rate": 8.37117191081719e-06, + "loss": 0.7363, + "step": 9684 + }, + { + "epoch": 0.5330508008145742, + "grad_norm": 0.766482412815094, + "learning_rate": 8.370851775496154e-06, + "loss": 0.7358, + "step": 9685 + }, + { + "epoch": 0.5331058396169299, + "grad_norm": 0.7230576276779175, + "learning_rate": 8.370531614840697e-06, + "loss": 0.8154, + "step": 9686 + }, + { + "epoch": 0.5331608784192856, + "grad_norm": 0.7357933521270752, + "learning_rate": 8.370211428853225e-06, + "loss": 0.7187, + "step": 9687 + }, + { + "epoch": 0.5332159172216413, + "grad_norm": 0.8208534121513367, + "learning_rate": 8.369891217536148e-06, + "loss": 0.8037, + "step": 9688 + }, + { + "epoch": 0.5332709560239969, + "grad_norm": 0.6771863698959351, + "learning_rate": 8.36957098089187e-06, + "loss": 0.733, + "step": 9689 + }, + { + "epoch": 0.5333259948263526, + "grad_norm": 0.6382480263710022, + "learning_rate": 8.369250718922798e-06, + "loss": 0.7391, + "step": 9690 + }, + { + "epoch": 0.5333810336287083, + "grad_norm": 0.6638994812965393, + "learning_rate": 8.368930431631342e-06, + "loss": 0.7176, + "step": 9691 + }, + { + "epoch": 0.533436072431064, + "grad_norm": 0.7599604725837708, + "learning_rate": 8.368610119019903e-06, + "loss": 0.8814, + "step": 9692 + }, + { + "epoch": 0.5334911112334195, + "grad_norm": 0.6896547079086304, + "learning_rate": 8.368289781090894e-06, + "loss": 0.7618, + "step": 9693 + }, + { + "epoch": 0.5335461500357752, + "grad_norm": 0.7081224918365479, + "learning_rate": 8.36796941784672e-06, + "loss": 0.656, + "step": 9694 + }, + { + "epoch": 0.5336011888381309, + "grad_norm": 0.8819646835327148, + "learning_rate": 8.367649029289791e-06, + "loss": 0.8946, + "step": 9695 + }, + { + "epoch": 0.5336562276404866, + "grad_norm": 0.6597925424575806, + "learning_rate": 8.367328615422512e-06, + "loss": 0.6891, + "step": 9696 + }, + { + "epoch": 0.5337112664428422, + "grad_norm": 0.6855770945549011, + "learning_rate": 8.367008176247294e-06, + "loss": 0.7158, + "step": 9697 + }, + { + "epoch": 0.5337663052451979, + "grad_norm": 0.6874905228614807, + "learning_rate": 8.366687711766541e-06, + "loss": 0.7445, + "step": 9698 + }, + { + "epoch": 0.5338213440475535, + "grad_norm": 0.6990895867347717, + "learning_rate": 8.366367221982666e-06, + "loss": 0.6189, + "step": 9699 + }, + { + "epoch": 0.5338763828499092, + "grad_norm": 0.7235365509986877, + "learning_rate": 8.366046706898075e-06, + "loss": 0.6406, + "step": 9700 + }, + { + "epoch": 0.5339314216522648, + "grad_norm": 0.7563154697418213, + "learning_rate": 8.36572616651518e-06, + "loss": 0.7798, + "step": 9701 + }, + { + "epoch": 0.5339864604546205, + "grad_norm": 0.6845980286598206, + "learning_rate": 8.365405600836387e-06, + "loss": 0.7665, + "step": 9702 + }, + { + "epoch": 0.5340414992569762, + "grad_norm": 0.6374378204345703, + "learning_rate": 8.365085009864106e-06, + "loss": 0.6935, + "step": 9703 + }, + { + "epoch": 0.5340965380593319, + "grad_norm": 0.726672887802124, + "learning_rate": 8.364764393600747e-06, + "loss": 0.7821, + "step": 9704 + }, + { + "epoch": 0.5341515768616875, + "grad_norm": 0.6784456372261047, + "learning_rate": 8.364443752048719e-06, + "loss": 0.7722, + "step": 9705 + }, + { + "epoch": 0.5342066156640431, + "grad_norm": 0.6344080567359924, + "learning_rate": 8.364123085210433e-06, + "loss": 0.7256, + "step": 9706 + }, + { + "epoch": 0.5342616544663988, + "grad_norm": 0.7913152575492859, + "learning_rate": 8.363802393088299e-06, + "loss": 0.7892, + "step": 9707 + }, + { + "epoch": 0.5343166932687544, + "grad_norm": 0.6792107820510864, + "learning_rate": 8.363481675684726e-06, + "loss": 0.7374, + "step": 9708 + }, + { + "epoch": 0.5343717320711101, + "grad_norm": 1.0153685808181763, + "learning_rate": 8.363160933002126e-06, + "loss": 0.7396, + "step": 9709 + }, + { + "epoch": 0.5344267708734658, + "grad_norm": 0.7655258774757385, + "learning_rate": 8.362840165042906e-06, + "loss": 0.7746, + "step": 9710 + }, + { + "epoch": 0.5344818096758215, + "grad_norm": 0.7830179929733276, + "learning_rate": 8.362519371809483e-06, + "loss": 0.7082, + "step": 9711 + }, + { + "epoch": 0.5345368484781771, + "grad_norm": 0.7410556674003601, + "learning_rate": 8.362198553304261e-06, + "loss": 0.7055, + "step": 9712 + }, + { + "epoch": 0.5345918872805328, + "grad_norm": 0.6542297005653381, + "learning_rate": 8.361877709529658e-06, + "loss": 0.7153, + "step": 9713 + }, + { + "epoch": 0.5346469260828884, + "grad_norm": 0.6752653121948242, + "learning_rate": 8.36155684048808e-06, + "loss": 0.6901, + "step": 9714 + }, + { + "epoch": 0.5347019648852441, + "grad_norm": 0.7158684134483337, + "learning_rate": 8.361235946181943e-06, + "loss": 0.7775, + "step": 9715 + }, + { + "epoch": 0.5347570036875997, + "grad_norm": 0.6174392700195312, + "learning_rate": 8.360915026613652e-06, + "loss": 0.6501, + "step": 9716 + }, + { + "epoch": 0.5348120424899554, + "grad_norm": 0.7110500931739807, + "learning_rate": 8.360594081785627e-06, + "loss": 0.742, + "step": 9717 + }, + { + "epoch": 0.5348670812923111, + "grad_norm": 0.8456488251686096, + "learning_rate": 8.360273111700276e-06, + "loss": 0.8237, + "step": 9718 + }, + { + "epoch": 0.5349221200946668, + "grad_norm": 0.6660711169242859, + "learning_rate": 8.359952116360011e-06, + "loss": 0.7856, + "step": 9719 + }, + { + "epoch": 0.5349771588970224, + "grad_norm": 0.7661204934120178, + "learning_rate": 8.359631095767244e-06, + "loss": 0.8336, + "step": 9720 + }, + { + "epoch": 0.535032197699378, + "grad_norm": 0.7747855186462402, + "learning_rate": 8.359310049924392e-06, + "loss": 0.7302, + "step": 9721 + }, + { + "epoch": 0.5350872365017337, + "grad_norm": 0.8156001567840576, + "learning_rate": 8.358988978833864e-06, + "loss": 0.7878, + "step": 9722 + }, + { + "epoch": 0.5351422753040894, + "grad_norm": 0.7371010780334473, + "learning_rate": 8.358667882498073e-06, + "loss": 0.803, + "step": 9723 + }, + { + "epoch": 0.535197314106445, + "grad_norm": 0.7141744494438171, + "learning_rate": 8.358346760919431e-06, + "loss": 0.687, + "step": 9724 + }, + { + "epoch": 0.5352523529088007, + "grad_norm": 0.6395956873893738, + "learning_rate": 8.358025614100358e-06, + "loss": 0.7052, + "step": 9725 + }, + { + "epoch": 0.5353073917111564, + "grad_norm": 0.7135289311408997, + "learning_rate": 8.35770444204326e-06, + "loss": 0.7882, + "step": 9726 + }, + { + "epoch": 0.5353624305135121, + "grad_norm": 0.702408492565155, + "learning_rate": 8.357383244750557e-06, + "loss": 0.6965, + "step": 9727 + }, + { + "epoch": 0.5354174693158676, + "grad_norm": 0.731193482875824, + "learning_rate": 8.357062022224658e-06, + "loss": 0.7525, + "step": 9728 + }, + { + "epoch": 0.5354725081182233, + "grad_norm": 0.8115057945251465, + "learning_rate": 8.356740774467982e-06, + "loss": 0.7466, + "step": 9729 + }, + { + "epoch": 0.535527546920579, + "grad_norm": 0.8644380569458008, + "learning_rate": 8.356419501482938e-06, + "loss": 0.7989, + "step": 9730 + }, + { + "epoch": 0.5355825857229347, + "grad_norm": 1.414620041847229, + "learning_rate": 8.356098203271945e-06, + "loss": 0.7782, + "step": 9731 + }, + { + "epoch": 0.5356376245252903, + "grad_norm": 0.7355421185493469, + "learning_rate": 8.355776879837417e-06, + "loss": 0.7163, + "step": 9732 + }, + { + "epoch": 0.535692663327646, + "grad_norm": 0.6556879281997681, + "learning_rate": 8.355455531181766e-06, + "loss": 0.7543, + "step": 9733 + }, + { + "epoch": 0.5357477021300017, + "grad_norm": 0.6632516980171204, + "learning_rate": 8.355134157307412e-06, + "loss": 0.7382, + "step": 9734 + }, + { + "epoch": 0.5358027409323574, + "grad_norm": 0.7096145153045654, + "learning_rate": 8.354812758216767e-06, + "loss": 0.7797, + "step": 9735 + }, + { + "epoch": 0.5358577797347129, + "grad_norm": 0.6404649019241333, + "learning_rate": 8.354491333912244e-06, + "loss": 0.6637, + "step": 9736 + }, + { + "epoch": 0.5359128185370686, + "grad_norm": 0.6987022757530212, + "learning_rate": 8.354169884396266e-06, + "loss": 0.7682, + "step": 9737 + }, + { + "epoch": 0.5359678573394243, + "grad_norm": 0.6593581438064575, + "learning_rate": 8.353848409671245e-06, + "loss": 0.6747, + "step": 9738 + }, + { + "epoch": 0.53602289614178, + "grad_norm": 0.6999880075454712, + "learning_rate": 8.353526909739596e-06, + "loss": 0.6659, + "step": 9739 + }, + { + "epoch": 0.5360779349441356, + "grad_norm": 0.6448989510536194, + "learning_rate": 8.353205384603735e-06, + "loss": 0.7297, + "step": 9740 + }, + { + "epoch": 0.5361329737464913, + "grad_norm": 0.6666765213012695, + "learning_rate": 8.352883834266082e-06, + "loss": 0.6459, + "step": 9741 + }, + { + "epoch": 0.536188012548847, + "grad_norm": 0.8020225763320923, + "learning_rate": 8.352562258729051e-06, + "loss": 0.8122, + "step": 9742 + }, + { + "epoch": 0.5362430513512026, + "grad_norm": 0.6883382201194763, + "learning_rate": 8.35224065799506e-06, + "loss": 0.7084, + "step": 9743 + }, + { + "epoch": 0.5362980901535582, + "grad_norm": 0.7366660237312317, + "learning_rate": 8.351919032066525e-06, + "loss": 0.848, + "step": 9744 + }, + { + "epoch": 0.5363531289559139, + "grad_norm": 0.7408311367034912, + "learning_rate": 8.351597380945863e-06, + "loss": 0.798, + "step": 9745 + }, + { + "epoch": 0.5364081677582696, + "grad_norm": 0.6841676235198975, + "learning_rate": 8.351275704635495e-06, + "loss": 0.7372, + "step": 9746 + }, + { + "epoch": 0.5364632065606253, + "grad_norm": 0.6903505325317383, + "learning_rate": 8.350954003137833e-06, + "loss": 0.7371, + "step": 9747 + }, + { + "epoch": 0.5365182453629809, + "grad_norm": 0.6444700956344604, + "learning_rate": 8.350632276455298e-06, + "loss": 0.6685, + "step": 9748 + }, + { + "epoch": 0.5365732841653366, + "grad_norm": 0.6821029186248779, + "learning_rate": 8.350310524590307e-06, + "loss": 0.8796, + "step": 9749 + }, + { + "epoch": 0.5366283229676923, + "grad_norm": 0.6733999848365784, + "learning_rate": 8.349988747545282e-06, + "loss": 0.6833, + "step": 9750 + }, + { + "epoch": 0.5366833617700478, + "grad_norm": 0.8097321391105652, + "learning_rate": 8.349666945322636e-06, + "loss": 0.834, + "step": 9751 + }, + { + "epoch": 0.5367384005724035, + "grad_norm": 0.7692395448684692, + "learning_rate": 8.34934511792479e-06, + "loss": 0.7866, + "step": 9752 + }, + { + "epoch": 0.5367934393747592, + "grad_norm": 0.7551112174987793, + "learning_rate": 8.349023265354164e-06, + "loss": 0.8378, + "step": 9753 + }, + { + "epoch": 0.5368484781771149, + "grad_norm": 0.5796393156051636, + "learning_rate": 8.348701387613176e-06, + "loss": 0.5995, + "step": 9754 + }, + { + "epoch": 0.5369035169794705, + "grad_norm": 0.6839799284934998, + "learning_rate": 8.348379484704244e-06, + "loss": 0.8262, + "step": 9755 + }, + { + "epoch": 0.5369585557818262, + "grad_norm": 0.7710869908332825, + "learning_rate": 8.348057556629786e-06, + "loss": 0.7796, + "step": 9756 + }, + { + "epoch": 0.5370135945841819, + "grad_norm": 0.733096718788147, + "learning_rate": 8.347735603392225e-06, + "loss": 0.8233, + "step": 9757 + }, + { + "epoch": 0.5370686333865375, + "grad_norm": 0.6438466906547546, + "learning_rate": 8.347413624993982e-06, + "loss": 0.7582, + "step": 9758 + }, + { + "epoch": 0.5371236721888931, + "grad_norm": 0.6877560615539551, + "learning_rate": 8.34709162143747e-06, + "loss": 0.7428, + "step": 9759 + }, + { + "epoch": 0.5371787109912488, + "grad_norm": 1.060831069946289, + "learning_rate": 8.346769592725115e-06, + "loss": 0.8636, + "step": 9760 + }, + { + "epoch": 0.5372337497936045, + "grad_norm": 0.6828434467315674, + "learning_rate": 8.346447538859334e-06, + "loss": 0.7801, + "step": 9761 + }, + { + "epoch": 0.5372887885959602, + "grad_norm": 0.6784753203392029, + "learning_rate": 8.346125459842552e-06, + "loss": 0.7356, + "step": 9762 + }, + { + "epoch": 0.5373438273983158, + "grad_norm": 0.6493560075759888, + "learning_rate": 8.345803355677185e-06, + "loss": 0.749, + "step": 9763 + }, + { + "epoch": 0.5373988662006715, + "grad_norm": 0.7109258770942688, + "learning_rate": 8.345481226365657e-06, + "loss": 0.7599, + "step": 9764 + }, + { + "epoch": 0.5374539050030271, + "grad_norm": 0.8526985049247742, + "learning_rate": 8.345159071910387e-06, + "loss": 0.6605, + "step": 9765 + }, + { + "epoch": 0.5375089438053828, + "grad_norm": 0.9194039702415466, + "learning_rate": 8.344836892313797e-06, + "loss": 0.794, + "step": 9766 + }, + { + "epoch": 0.5375639826077384, + "grad_norm": 0.7258954048156738, + "learning_rate": 8.344514687578307e-06, + "loss": 0.871, + "step": 9767 + }, + { + "epoch": 0.5376190214100941, + "grad_norm": 0.7099377512931824, + "learning_rate": 8.34419245770634e-06, + "loss": 0.8098, + "step": 9768 + }, + { + "epoch": 0.5376740602124498, + "grad_norm": 0.7883020639419556, + "learning_rate": 8.34387020270032e-06, + "loss": 0.8383, + "step": 9769 + }, + { + "epoch": 0.5377290990148055, + "grad_norm": 0.7009730339050293, + "learning_rate": 8.343547922562664e-06, + "loss": 0.7794, + "step": 9770 + }, + { + "epoch": 0.5377841378171611, + "grad_norm": 0.6569581031799316, + "learning_rate": 8.343225617295798e-06, + "loss": 0.7574, + "step": 9771 + }, + { + "epoch": 0.5378391766195167, + "grad_norm": 0.6159278154373169, + "learning_rate": 8.342903286902142e-06, + "loss": 0.7136, + "step": 9772 + }, + { + "epoch": 0.5378942154218724, + "grad_norm": 0.6594879627227783, + "learning_rate": 8.342580931384121e-06, + "loss": 0.6906, + "step": 9773 + }, + { + "epoch": 0.5379492542242281, + "grad_norm": 0.7002933025360107, + "learning_rate": 8.342258550744156e-06, + "loss": 0.7272, + "step": 9774 + }, + { + "epoch": 0.5380042930265837, + "grad_norm": 0.8243216276168823, + "learning_rate": 8.341936144984672e-06, + "loss": 0.8105, + "step": 9775 + }, + { + "epoch": 0.5380593318289394, + "grad_norm": 0.8358921408653259, + "learning_rate": 8.34161371410809e-06, + "loss": 0.7118, + "step": 9776 + }, + { + "epoch": 0.5381143706312951, + "grad_norm": 0.6339066028594971, + "learning_rate": 8.34129125811683e-06, + "loss": 0.7035, + "step": 9777 + }, + { + "epoch": 0.5381694094336508, + "grad_norm": 0.7407625317573547, + "learning_rate": 8.340968777013324e-06, + "loss": 0.7447, + "step": 9778 + }, + { + "epoch": 0.5382244482360063, + "grad_norm": 0.6876600384712219, + "learning_rate": 8.340646270799991e-06, + "loss": 0.7298, + "step": 9779 + }, + { + "epoch": 0.538279487038362, + "grad_norm": 0.7021264433860779, + "learning_rate": 8.340323739479251e-06, + "loss": 0.7869, + "step": 9780 + }, + { + "epoch": 0.5383345258407177, + "grad_norm": 0.7341023087501526, + "learning_rate": 8.340001183053535e-06, + "loss": 0.7447, + "step": 9781 + }, + { + "epoch": 0.5383895646430734, + "grad_norm": 0.6829406023025513, + "learning_rate": 8.339678601525263e-06, + "loss": 0.7438, + "step": 9782 + }, + { + "epoch": 0.538444603445429, + "grad_norm": 0.7671583294868469, + "learning_rate": 8.33935599489686e-06, + "loss": 0.8678, + "step": 9783 + }, + { + "epoch": 0.5384996422477847, + "grad_norm": 0.701797366142273, + "learning_rate": 8.339033363170753e-06, + "loss": 0.8431, + "step": 9784 + }, + { + "epoch": 0.5385546810501404, + "grad_norm": 0.748235285282135, + "learning_rate": 8.338710706349363e-06, + "loss": 0.7905, + "step": 9785 + }, + { + "epoch": 0.5386097198524961, + "grad_norm": 0.8202430605888367, + "learning_rate": 8.338388024435119e-06, + "loss": 0.7734, + "step": 9786 + }, + { + "epoch": 0.5386647586548516, + "grad_norm": 0.8218014240264893, + "learning_rate": 8.338065317430442e-06, + "loss": 0.846, + "step": 9787 + }, + { + "epoch": 0.5387197974572073, + "grad_norm": 0.6773214936256409, + "learning_rate": 8.337742585337762e-06, + "loss": 0.7692, + "step": 9788 + }, + { + "epoch": 0.538774836259563, + "grad_norm": 0.7011464834213257, + "learning_rate": 8.337419828159501e-06, + "loss": 0.7534, + "step": 9789 + }, + { + "epoch": 0.5388298750619187, + "grad_norm": 0.8299004435539246, + "learning_rate": 8.337097045898087e-06, + "loss": 0.7997, + "step": 9790 + }, + { + "epoch": 0.5388849138642743, + "grad_norm": 0.8600753545761108, + "learning_rate": 8.336774238555942e-06, + "loss": 0.8307, + "step": 9791 + }, + { + "epoch": 0.53893995266663, + "grad_norm": 0.676490843296051, + "learning_rate": 8.336451406135498e-06, + "loss": 0.7748, + "step": 9792 + }, + { + "epoch": 0.5389949914689857, + "grad_norm": 0.7094627618789673, + "learning_rate": 8.336128548639177e-06, + "loss": 0.7524, + "step": 9793 + }, + { + "epoch": 0.5390500302713412, + "grad_norm": 0.6804066896438599, + "learning_rate": 8.335805666069407e-06, + "loss": 0.8299, + "step": 9794 + }, + { + "epoch": 0.5391050690736969, + "grad_norm": 0.6992025971412659, + "learning_rate": 8.335482758428614e-06, + "loss": 0.7548, + "step": 9795 + }, + { + "epoch": 0.5391601078760526, + "grad_norm": 0.6649640798568726, + "learning_rate": 8.335159825719227e-06, + "loss": 0.6595, + "step": 9796 + }, + { + "epoch": 0.5392151466784083, + "grad_norm": 0.7292002439498901, + "learning_rate": 8.33483686794367e-06, + "loss": 0.7944, + "step": 9797 + }, + { + "epoch": 0.5392701854807639, + "grad_norm": 0.9124587178230286, + "learning_rate": 8.334513885104375e-06, + "loss": 0.8586, + "step": 9798 + }, + { + "epoch": 0.5393252242831196, + "grad_norm": 0.7091020941734314, + "learning_rate": 8.334190877203761e-06, + "loss": 0.7019, + "step": 9799 + }, + { + "epoch": 0.5393802630854753, + "grad_norm": 0.7470952272415161, + "learning_rate": 8.333867844244265e-06, + "loss": 0.7866, + "step": 9800 + }, + { + "epoch": 0.539435301887831, + "grad_norm": 0.7368966341018677, + "learning_rate": 8.333544786228309e-06, + "loss": 0.8135, + "step": 9801 + }, + { + "epoch": 0.5394903406901865, + "grad_norm": 0.668305516242981, + "learning_rate": 8.333221703158322e-06, + "loss": 0.7549, + "step": 9802 + }, + { + "epoch": 0.5395453794925422, + "grad_norm": 0.6788874268531799, + "learning_rate": 8.332898595036735e-06, + "loss": 0.8077, + "step": 9803 + }, + { + "epoch": 0.5396004182948979, + "grad_norm": 0.654863715171814, + "learning_rate": 8.332575461865972e-06, + "loss": 0.7695, + "step": 9804 + }, + { + "epoch": 0.5396554570972536, + "grad_norm": 0.7460314631462097, + "learning_rate": 8.332252303648464e-06, + "loss": 0.7711, + "step": 9805 + }, + { + "epoch": 0.5397104958996092, + "grad_norm": 0.7923582792282104, + "learning_rate": 8.331929120386643e-06, + "loss": 0.7348, + "step": 9806 + }, + { + "epoch": 0.5397655347019649, + "grad_norm": 0.6570843458175659, + "learning_rate": 8.331605912082932e-06, + "loss": 0.7029, + "step": 9807 + }, + { + "epoch": 0.5398205735043206, + "grad_norm": 0.7728865742683411, + "learning_rate": 8.331282678739762e-06, + "loss": 0.8249, + "step": 9808 + }, + { + "epoch": 0.5398756123066762, + "grad_norm": 0.7121468186378479, + "learning_rate": 8.330959420359565e-06, + "loss": 0.8698, + "step": 9809 + }, + { + "epoch": 0.5399306511090318, + "grad_norm": 0.7779444456100464, + "learning_rate": 8.330636136944768e-06, + "loss": 0.7448, + "step": 9810 + }, + { + "epoch": 0.5399856899113875, + "grad_norm": 0.7770833373069763, + "learning_rate": 8.330312828497801e-06, + "loss": 0.8489, + "step": 9811 + }, + { + "epoch": 0.5400407287137432, + "grad_norm": 0.6705769896507263, + "learning_rate": 8.329989495021096e-06, + "loss": 0.7349, + "step": 9812 + }, + { + "epoch": 0.5400957675160989, + "grad_norm": 0.6775381565093994, + "learning_rate": 8.329666136517079e-06, + "loss": 0.8093, + "step": 9813 + }, + { + "epoch": 0.5401508063184545, + "grad_norm": 0.6621832251548767, + "learning_rate": 8.329342752988183e-06, + "loss": 0.7877, + "step": 9814 + }, + { + "epoch": 0.5402058451208102, + "grad_norm": 0.704339861869812, + "learning_rate": 8.329019344436839e-06, + "loss": 0.7708, + "step": 9815 + }, + { + "epoch": 0.5402608839231658, + "grad_norm": 0.789944052696228, + "learning_rate": 8.328695910865476e-06, + "loss": 0.7563, + "step": 9816 + }, + { + "epoch": 0.5403159227255215, + "grad_norm": 0.6997420191764832, + "learning_rate": 8.328372452276525e-06, + "loss": 0.7023, + "step": 9817 + }, + { + "epoch": 0.5403709615278771, + "grad_norm": 0.6453180313110352, + "learning_rate": 8.328048968672418e-06, + "loss": 0.7193, + "step": 9818 + }, + { + "epoch": 0.5404260003302328, + "grad_norm": 0.7059640884399414, + "learning_rate": 8.327725460055586e-06, + "loss": 0.7875, + "step": 9819 + }, + { + "epoch": 0.5404810391325885, + "grad_norm": 0.7725005745887756, + "learning_rate": 8.327401926428461e-06, + "loss": 0.7503, + "step": 9820 + }, + { + "epoch": 0.5405360779349442, + "grad_norm": 0.7710940837860107, + "learning_rate": 8.327078367793473e-06, + "loss": 0.8314, + "step": 9821 + }, + { + "epoch": 0.5405911167372998, + "grad_norm": 0.9090666770935059, + "learning_rate": 8.326754784153055e-06, + "loss": 0.8021, + "step": 9822 + }, + { + "epoch": 0.5406461555396554, + "grad_norm": 0.7135322690010071, + "learning_rate": 8.326431175509638e-06, + "loss": 0.8084, + "step": 9823 + }, + { + "epoch": 0.5407011943420111, + "grad_norm": 0.9126102328300476, + "learning_rate": 8.326107541865656e-06, + "loss": 0.75, + "step": 9824 + }, + { + "epoch": 0.5407562331443668, + "grad_norm": 0.7263361215591431, + "learning_rate": 8.325783883223539e-06, + "loss": 0.6808, + "step": 9825 + }, + { + "epoch": 0.5408112719467224, + "grad_norm": 0.7234700918197632, + "learning_rate": 8.32546019958572e-06, + "loss": 0.7582, + "step": 9826 + }, + { + "epoch": 0.5408663107490781, + "grad_norm": 0.7043294310569763, + "learning_rate": 8.325136490954633e-06, + "loss": 0.8421, + "step": 9827 + }, + { + "epoch": 0.5409213495514338, + "grad_norm": 0.7947664856910706, + "learning_rate": 8.32481275733271e-06, + "loss": 0.8672, + "step": 9828 + }, + { + "epoch": 0.5409763883537895, + "grad_norm": 0.704590916633606, + "learning_rate": 8.324488998722384e-06, + "loss": 0.7356, + "step": 9829 + }, + { + "epoch": 0.541031427156145, + "grad_norm": 0.7630662322044373, + "learning_rate": 8.32416521512609e-06, + "loss": 0.7082, + "step": 9830 + }, + { + "epoch": 0.5410864659585007, + "grad_norm": 0.728721022605896, + "learning_rate": 8.323841406546259e-06, + "loss": 0.7987, + "step": 9831 + }, + { + "epoch": 0.5411415047608564, + "grad_norm": 0.7164294719696045, + "learning_rate": 8.323517572985326e-06, + "loss": 0.721, + "step": 9832 + }, + { + "epoch": 0.5411965435632121, + "grad_norm": 0.7555723190307617, + "learning_rate": 8.323193714445722e-06, + "loss": 0.814, + "step": 9833 + }, + { + "epoch": 0.5412515823655677, + "grad_norm": 0.827485978603363, + "learning_rate": 8.322869830929887e-06, + "loss": 0.8817, + "step": 9834 + }, + { + "epoch": 0.5413066211679234, + "grad_norm": 0.718950092792511, + "learning_rate": 8.322545922440252e-06, + "loss": 0.8648, + "step": 9835 + }, + { + "epoch": 0.5413616599702791, + "grad_norm": 0.7361611723899841, + "learning_rate": 8.32222198897925e-06, + "loss": 0.7392, + "step": 9836 + }, + { + "epoch": 0.5414166987726347, + "grad_norm": 0.6712168455123901, + "learning_rate": 8.321898030549316e-06, + "loss": 0.7505, + "step": 9837 + }, + { + "epoch": 0.5414717375749903, + "grad_norm": 0.7475710511207581, + "learning_rate": 8.321574047152887e-06, + "loss": 0.7969, + "step": 9838 + }, + { + "epoch": 0.541526776377346, + "grad_norm": 0.9751361608505249, + "learning_rate": 8.321250038792397e-06, + "loss": 0.8534, + "step": 9839 + }, + { + "epoch": 0.5415818151797017, + "grad_norm": 0.6858723163604736, + "learning_rate": 8.32092600547028e-06, + "loss": 0.8277, + "step": 9840 + }, + { + "epoch": 0.5416368539820573, + "grad_norm": 0.8899725675582886, + "learning_rate": 8.320601947188971e-06, + "loss": 0.8599, + "step": 9841 + }, + { + "epoch": 0.541691892784413, + "grad_norm": 0.7140665650367737, + "learning_rate": 8.320277863950907e-06, + "loss": 0.7429, + "step": 9842 + }, + { + "epoch": 0.5417469315867687, + "grad_norm": 0.7467615604400635, + "learning_rate": 8.319953755758525e-06, + "loss": 0.7826, + "step": 9843 + }, + { + "epoch": 0.5418019703891244, + "grad_norm": 0.6578202843666077, + "learning_rate": 8.319629622614258e-06, + "loss": 0.6833, + "step": 9844 + }, + { + "epoch": 0.5418570091914799, + "grad_norm": 0.9430698156356812, + "learning_rate": 8.319305464520543e-06, + "loss": 0.8243, + "step": 9845 + }, + { + "epoch": 0.5419120479938356, + "grad_norm": 0.8632097840309143, + "learning_rate": 8.318981281479817e-06, + "loss": 0.7975, + "step": 9846 + }, + { + "epoch": 0.5419670867961913, + "grad_norm": 0.7241839170455933, + "learning_rate": 8.318657073494517e-06, + "loss": 0.7226, + "step": 9847 + }, + { + "epoch": 0.542022125598547, + "grad_norm": 0.6927164196968079, + "learning_rate": 8.318332840567078e-06, + "loss": 0.7125, + "step": 9848 + }, + { + "epoch": 0.5420771644009026, + "grad_norm": 0.6414939761161804, + "learning_rate": 8.318008582699937e-06, + "loss": 0.7366, + "step": 9849 + }, + { + "epoch": 0.5421322032032583, + "grad_norm": 0.7584436535835266, + "learning_rate": 8.317684299895533e-06, + "loss": 0.8601, + "step": 9850 + }, + { + "epoch": 0.542187242005614, + "grad_norm": 0.6045856475830078, + "learning_rate": 8.317359992156302e-06, + "loss": 0.6697, + "step": 9851 + }, + { + "epoch": 0.5422422808079697, + "grad_norm": 0.715048611164093, + "learning_rate": 8.31703565948468e-06, + "loss": 0.7535, + "step": 9852 + }, + { + "epoch": 0.5422973196103252, + "grad_norm": 0.6925113201141357, + "learning_rate": 8.316711301883106e-06, + "loss": 0.8122, + "step": 9853 + }, + { + "epoch": 0.5423523584126809, + "grad_norm": 0.6787780523300171, + "learning_rate": 8.316386919354018e-06, + "loss": 0.7428, + "step": 9854 + }, + { + "epoch": 0.5424073972150366, + "grad_norm": 0.6831366419792175, + "learning_rate": 8.316062511899855e-06, + "loss": 0.767, + "step": 9855 + }, + { + "epoch": 0.5424624360173923, + "grad_norm": 0.6865691542625427, + "learning_rate": 8.315738079523053e-06, + "loss": 0.6549, + "step": 9856 + }, + { + "epoch": 0.5425174748197479, + "grad_norm": 0.7149406671524048, + "learning_rate": 8.31541362222605e-06, + "loss": 0.8127, + "step": 9857 + }, + { + "epoch": 0.5425725136221036, + "grad_norm": 0.6826779842376709, + "learning_rate": 8.315089140011286e-06, + "loss": 0.706, + "step": 9858 + }, + { + "epoch": 0.5426275524244593, + "grad_norm": 0.688204288482666, + "learning_rate": 8.3147646328812e-06, + "loss": 0.8675, + "step": 9859 + }, + { + "epoch": 0.542682591226815, + "grad_norm": 0.6659492254257202, + "learning_rate": 8.31444010083823e-06, + "loss": 0.7851, + "step": 9860 + }, + { + "epoch": 0.5427376300291705, + "grad_norm": 0.8049291372299194, + "learning_rate": 8.314115543884816e-06, + "loss": 0.7442, + "step": 9861 + }, + { + "epoch": 0.5427926688315262, + "grad_norm": 0.7505989670753479, + "learning_rate": 8.313790962023397e-06, + "loss": 0.8391, + "step": 9862 + }, + { + "epoch": 0.5428477076338819, + "grad_norm": 0.6810199618339539, + "learning_rate": 8.31346635525641e-06, + "loss": 0.8131, + "step": 9863 + }, + { + "epoch": 0.5429027464362376, + "grad_norm": 0.6724215745925903, + "learning_rate": 8.313141723586298e-06, + "loss": 0.75, + "step": 9864 + }, + { + "epoch": 0.5429577852385932, + "grad_norm": 0.7804376482963562, + "learning_rate": 8.3128170670155e-06, + "loss": 0.704, + "step": 9865 + }, + { + "epoch": 0.5430128240409489, + "grad_norm": 0.9494230151176453, + "learning_rate": 8.312492385546455e-06, + "loss": 0.8578, + "step": 9866 + }, + { + "epoch": 0.5430678628433045, + "grad_norm": 0.6780333518981934, + "learning_rate": 8.312167679181606e-06, + "loss": 0.701, + "step": 9867 + }, + { + "epoch": 0.5431229016456602, + "grad_norm": 0.7407701015472412, + "learning_rate": 8.31184294792339e-06, + "loss": 0.8505, + "step": 9868 + }, + { + "epoch": 0.5431779404480158, + "grad_norm": 0.680903434753418, + "learning_rate": 8.311518191774249e-06, + "loss": 0.7645, + "step": 9869 + }, + { + "epoch": 0.5432329792503715, + "grad_norm": 0.6695752143859863, + "learning_rate": 8.311193410736622e-06, + "loss": 0.816, + "step": 9870 + }, + { + "epoch": 0.5432880180527272, + "grad_norm": 0.6725142598152161, + "learning_rate": 8.310868604812954e-06, + "loss": 0.7044, + "step": 9871 + }, + { + "epoch": 0.5433430568550829, + "grad_norm": 0.922627866268158, + "learning_rate": 8.310543774005684e-06, + "loss": 0.7589, + "step": 9872 + }, + { + "epoch": 0.5433980956574385, + "grad_norm": 1.0136839151382446, + "learning_rate": 8.310218918317251e-06, + "loss": 0.7573, + "step": 9873 + }, + { + "epoch": 0.5434531344597942, + "grad_norm": 0.9053532481193542, + "learning_rate": 8.309894037750099e-06, + "loss": 0.8269, + "step": 9874 + }, + { + "epoch": 0.5435081732621498, + "grad_norm": 0.6800149083137512, + "learning_rate": 8.309569132306671e-06, + "loss": 0.716, + "step": 9875 + }, + { + "epoch": 0.5435632120645055, + "grad_norm": 0.7157679796218872, + "learning_rate": 8.309244201989408e-06, + "loss": 0.7433, + "step": 9876 + }, + { + "epoch": 0.5436182508668611, + "grad_norm": 0.9316089749336243, + "learning_rate": 8.308919246800748e-06, + "loss": 0.7499, + "step": 9877 + }, + { + "epoch": 0.5436732896692168, + "grad_norm": 0.6682490110397339, + "learning_rate": 8.308594266743139e-06, + "loss": 0.7286, + "step": 9878 + }, + { + "epoch": 0.5437283284715725, + "grad_norm": 0.7241143584251404, + "learning_rate": 8.308269261819022e-06, + "loss": 0.7934, + "step": 9879 + }, + { + "epoch": 0.5437833672739281, + "grad_norm": 0.7402396202087402, + "learning_rate": 8.307944232030838e-06, + "loss": 0.7361, + "step": 9880 + }, + { + "epoch": 0.5438384060762838, + "grad_norm": 0.6839993596076965, + "learning_rate": 8.307619177381029e-06, + "loss": 0.749, + "step": 9881 + }, + { + "epoch": 0.5438934448786394, + "grad_norm": 0.6536363363265991, + "learning_rate": 8.307294097872041e-06, + "loss": 0.706, + "step": 9882 + }, + { + "epoch": 0.5439484836809951, + "grad_norm": 0.602644681930542, + "learning_rate": 8.306968993506317e-06, + "loss": 0.6857, + "step": 9883 + }, + { + "epoch": 0.5440035224833507, + "grad_norm": 0.6567881107330322, + "learning_rate": 8.306643864286297e-06, + "loss": 0.6989, + "step": 9884 + }, + { + "epoch": 0.5440585612857064, + "grad_norm": 1.0013506412506104, + "learning_rate": 8.306318710214427e-06, + "loss": 0.7251, + "step": 9885 + }, + { + "epoch": 0.5441136000880621, + "grad_norm": 0.7016813158988953, + "learning_rate": 8.305993531293153e-06, + "loss": 0.7535, + "step": 9886 + }, + { + "epoch": 0.5441686388904178, + "grad_norm": 0.7345741391181946, + "learning_rate": 8.305668327524915e-06, + "loss": 0.887, + "step": 9887 + }, + { + "epoch": 0.5442236776927734, + "grad_norm": 1.0925754308700562, + "learning_rate": 8.305343098912158e-06, + "loss": 0.7779, + "step": 9888 + }, + { + "epoch": 0.544278716495129, + "grad_norm": 0.79815274477005, + "learning_rate": 8.305017845457328e-06, + "loss": 0.7736, + "step": 9889 + }, + { + "epoch": 0.5443337552974847, + "grad_norm": 0.6324154138565063, + "learning_rate": 8.304692567162868e-06, + "loss": 0.6823, + "step": 9890 + }, + { + "epoch": 0.5443887940998404, + "grad_norm": 0.6990262866020203, + "learning_rate": 8.304367264031223e-06, + "loss": 0.7804, + "step": 9891 + }, + { + "epoch": 0.544443832902196, + "grad_norm": 1.4203195571899414, + "learning_rate": 8.304041936064839e-06, + "loss": 0.8702, + "step": 9892 + }, + { + "epoch": 0.5444988717045517, + "grad_norm": 0.6986544132232666, + "learning_rate": 8.303716583266161e-06, + "loss": 0.7666, + "step": 9893 + }, + { + "epoch": 0.5445539105069074, + "grad_norm": 0.7037138938903809, + "learning_rate": 8.303391205637632e-06, + "loss": 0.7995, + "step": 9894 + }, + { + "epoch": 0.5446089493092631, + "grad_norm": 0.7101728320121765, + "learning_rate": 8.3030658031817e-06, + "loss": 0.8185, + "step": 9895 + }, + { + "epoch": 0.5446639881116186, + "grad_norm": 0.6571425795555115, + "learning_rate": 8.302740375900808e-06, + "loss": 0.6152, + "step": 9896 + }, + { + "epoch": 0.5447190269139743, + "grad_norm": 0.7560263276100159, + "learning_rate": 8.302414923797406e-06, + "loss": 0.9037, + "step": 9897 + }, + { + "epoch": 0.54477406571633, + "grad_norm": 0.8692007064819336, + "learning_rate": 8.302089446873935e-06, + "loss": 0.7689, + "step": 9898 + }, + { + "epoch": 0.5448291045186857, + "grad_norm": 0.7533506751060486, + "learning_rate": 8.301763945132845e-06, + "loss": 0.7671, + "step": 9899 + }, + { + "epoch": 0.5448841433210413, + "grad_norm": 0.6992233991622925, + "learning_rate": 8.301438418576581e-06, + "loss": 0.723, + "step": 9900 + }, + { + "epoch": 0.544939182123397, + "grad_norm": 0.7966120839118958, + "learning_rate": 8.301112867207589e-06, + "loss": 0.7968, + "step": 9901 + }, + { + "epoch": 0.5449942209257527, + "grad_norm": 0.800558865070343, + "learning_rate": 8.300787291028316e-06, + "loss": 0.8583, + "step": 9902 + }, + { + "epoch": 0.5450492597281084, + "grad_norm": 0.7019909024238586, + "learning_rate": 8.30046169004121e-06, + "loss": 0.7045, + "step": 9903 + }, + { + "epoch": 0.5451042985304639, + "grad_norm": 0.7778449654579163, + "learning_rate": 8.300136064248717e-06, + "loss": 0.7964, + "step": 9904 + }, + { + "epoch": 0.5451593373328196, + "grad_norm": 0.6894309520721436, + "learning_rate": 8.299810413653284e-06, + "loss": 0.7382, + "step": 9905 + }, + { + "epoch": 0.5452143761351753, + "grad_norm": 0.6942182183265686, + "learning_rate": 8.299484738257361e-06, + "loss": 0.73, + "step": 9906 + }, + { + "epoch": 0.545269414937531, + "grad_norm": 0.6607787609100342, + "learning_rate": 8.299159038063394e-06, + "loss": 0.6987, + "step": 9907 + }, + { + "epoch": 0.5453244537398866, + "grad_norm": 0.7447709441184998, + "learning_rate": 8.29883331307383e-06, + "loss": 0.7787, + "step": 9908 + }, + { + "epoch": 0.5453794925422423, + "grad_norm": 0.6315301656723022, + "learning_rate": 8.298507563291116e-06, + "loss": 0.7047, + "step": 9909 + }, + { + "epoch": 0.545434531344598, + "grad_norm": 0.8095656633377075, + "learning_rate": 8.298181788717705e-06, + "loss": 0.691, + "step": 9910 + }, + { + "epoch": 0.5454895701469537, + "grad_norm": 0.6419453024864197, + "learning_rate": 8.29785598935604e-06, + "loss": 0.7333, + "step": 9911 + }, + { + "epoch": 0.5455446089493092, + "grad_norm": 0.7209222316741943, + "learning_rate": 8.297530165208574e-06, + "loss": 0.8174, + "step": 9912 + }, + { + "epoch": 0.5455996477516649, + "grad_norm": 0.6778598427772522, + "learning_rate": 8.297204316277754e-06, + "loss": 0.7696, + "step": 9913 + }, + { + "epoch": 0.5456546865540206, + "grad_norm": 0.6573307514190674, + "learning_rate": 8.296878442566028e-06, + "loss": 0.7843, + "step": 9914 + }, + { + "epoch": 0.5457097253563763, + "grad_norm": 0.6987473964691162, + "learning_rate": 8.296552544075847e-06, + "loss": 0.809, + "step": 9915 + }, + { + "epoch": 0.5457647641587319, + "grad_norm": 0.7149204015731812, + "learning_rate": 8.29622662080966e-06, + "loss": 0.848, + "step": 9916 + }, + { + "epoch": 0.5458198029610876, + "grad_norm": 0.6252632141113281, + "learning_rate": 8.295900672769913e-06, + "loss": 0.7029, + "step": 9917 + }, + { + "epoch": 0.5458748417634433, + "grad_norm": 0.713376522064209, + "learning_rate": 8.295574699959062e-06, + "loss": 0.726, + "step": 9918 + }, + { + "epoch": 0.5459298805657989, + "grad_norm": 0.6864717602729797, + "learning_rate": 8.295248702379552e-06, + "loss": 0.7428, + "step": 9919 + }, + { + "epoch": 0.5459849193681545, + "grad_norm": 0.8085678219795227, + "learning_rate": 8.294922680033837e-06, + "loss": 0.8697, + "step": 9920 + }, + { + "epoch": 0.5460399581705102, + "grad_norm": 0.7366700768470764, + "learning_rate": 8.294596632924363e-06, + "loss": 0.7714, + "step": 9921 + }, + { + "epoch": 0.5460949969728659, + "grad_norm": 0.670632541179657, + "learning_rate": 8.294270561053583e-06, + "loss": 0.7032, + "step": 9922 + }, + { + "epoch": 0.5461500357752215, + "grad_norm": 0.7867220640182495, + "learning_rate": 8.293944464423946e-06, + "loss": 0.8903, + "step": 9923 + }, + { + "epoch": 0.5462050745775772, + "grad_norm": 0.8441565632820129, + "learning_rate": 8.293618343037907e-06, + "loss": 0.8694, + "step": 9924 + }, + { + "epoch": 0.5462601133799329, + "grad_norm": 0.7048027515411377, + "learning_rate": 8.293292196897913e-06, + "loss": 0.8226, + "step": 9925 + }, + { + "epoch": 0.5463151521822885, + "grad_norm": 0.6344078779220581, + "learning_rate": 8.292966026006416e-06, + "loss": 0.7615, + "step": 9926 + }, + { + "epoch": 0.5463701909846441, + "grad_norm": 0.6744484901428223, + "learning_rate": 8.292639830365867e-06, + "loss": 0.6944, + "step": 9927 + }, + { + "epoch": 0.5464252297869998, + "grad_norm": 0.8113303780555725, + "learning_rate": 8.292313609978721e-06, + "loss": 0.7558, + "step": 9928 + }, + { + "epoch": 0.5464802685893555, + "grad_norm": 0.640190839767456, + "learning_rate": 8.291987364847425e-06, + "loss": 0.7167, + "step": 9929 + }, + { + "epoch": 0.5465353073917112, + "grad_norm": 0.7714816331863403, + "learning_rate": 8.291661094974434e-06, + "loss": 0.8662, + "step": 9930 + }, + { + "epoch": 0.5465903461940668, + "grad_norm": 0.6785402894020081, + "learning_rate": 8.291334800362199e-06, + "loss": 0.6835, + "step": 9931 + }, + { + "epoch": 0.5466453849964225, + "grad_norm": 0.704868495464325, + "learning_rate": 8.291008481013173e-06, + "loss": 0.7343, + "step": 9932 + }, + { + "epoch": 0.5467004237987781, + "grad_norm": 0.7587466239929199, + "learning_rate": 8.290682136929809e-06, + "loss": 0.7856, + "step": 9933 + }, + { + "epoch": 0.5467554626011338, + "grad_norm": 0.7460505962371826, + "learning_rate": 8.290355768114557e-06, + "loss": 0.7463, + "step": 9934 + }, + { + "epoch": 0.5468105014034894, + "grad_norm": 0.7185021042823792, + "learning_rate": 8.290029374569873e-06, + "loss": 0.8106, + "step": 9935 + }, + { + "epoch": 0.5468655402058451, + "grad_norm": 0.7023874521255493, + "learning_rate": 8.289702956298209e-06, + "loss": 0.6863, + "step": 9936 + }, + { + "epoch": 0.5469205790082008, + "grad_norm": 0.8688495755195618, + "learning_rate": 8.289376513302017e-06, + "loss": 0.8898, + "step": 9937 + }, + { + "epoch": 0.5469756178105565, + "grad_norm": 0.6405122876167297, + "learning_rate": 8.289050045583752e-06, + "loss": 0.6804, + "step": 9938 + }, + { + "epoch": 0.5470306566129121, + "grad_norm": 0.8364881277084351, + "learning_rate": 8.288723553145868e-06, + "loss": 0.8356, + "step": 9939 + }, + { + "epoch": 0.5470856954152677, + "grad_norm": 0.6621617078781128, + "learning_rate": 8.288397035990818e-06, + "loss": 0.7508, + "step": 9940 + }, + { + "epoch": 0.5471407342176234, + "grad_norm": 0.6822347640991211, + "learning_rate": 8.288070494121056e-06, + "loss": 0.7722, + "step": 9941 + }, + { + "epoch": 0.5471957730199791, + "grad_norm": 0.6727223992347717, + "learning_rate": 8.287743927539036e-06, + "loss": 0.743, + "step": 9942 + }, + { + "epoch": 0.5472508118223347, + "grad_norm": 0.7852441668510437, + "learning_rate": 8.287417336247214e-06, + "loss": 0.8321, + "step": 9943 + }, + { + "epoch": 0.5473058506246904, + "grad_norm": 0.6982126235961914, + "learning_rate": 8.287090720248041e-06, + "loss": 0.6669, + "step": 9944 + }, + { + "epoch": 0.5473608894270461, + "grad_norm": 0.7820166945457458, + "learning_rate": 8.286764079543976e-06, + "loss": 0.7592, + "step": 9945 + }, + { + "epoch": 0.5474159282294018, + "grad_norm": 0.6868422627449036, + "learning_rate": 8.28643741413747e-06, + "loss": 0.8308, + "step": 9946 + }, + { + "epoch": 0.5474709670317573, + "grad_norm": 0.8227942585945129, + "learning_rate": 8.286110724030982e-06, + "loss": 0.7982, + "step": 9947 + }, + { + "epoch": 0.547526005834113, + "grad_norm": 0.6838171482086182, + "learning_rate": 8.285784009226964e-06, + "loss": 0.7907, + "step": 9948 + }, + { + "epoch": 0.5475810446364687, + "grad_norm": 0.7200812697410583, + "learning_rate": 8.285457269727875e-06, + "loss": 0.88, + "step": 9949 + }, + { + "epoch": 0.5476360834388244, + "grad_norm": 0.7469412684440613, + "learning_rate": 8.285130505536168e-06, + "loss": 0.8167, + "step": 9950 + }, + { + "epoch": 0.54769112224118, + "grad_norm": 0.6660227179527283, + "learning_rate": 8.284803716654298e-06, + "loss": 0.7685, + "step": 9951 + }, + { + "epoch": 0.5477461610435357, + "grad_norm": 0.7116572260856628, + "learning_rate": 8.284476903084723e-06, + "loss": 0.7415, + "step": 9952 + }, + { + "epoch": 0.5478011998458914, + "grad_norm": 0.6540791988372803, + "learning_rate": 8.284150064829899e-06, + "loss": 0.6571, + "step": 9953 + }, + { + "epoch": 0.5478562386482471, + "grad_norm": 0.7527759075164795, + "learning_rate": 8.283823201892283e-06, + "loss": 0.8678, + "step": 9954 + }, + { + "epoch": 0.5479112774506026, + "grad_norm": 0.7795953750610352, + "learning_rate": 8.283496314274331e-06, + "loss": 0.8086, + "step": 9955 + }, + { + "epoch": 0.5479663162529583, + "grad_norm": 0.862503170967102, + "learning_rate": 8.283169401978498e-06, + "loss": 0.7442, + "step": 9956 + }, + { + "epoch": 0.548021355055314, + "grad_norm": 0.6552054286003113, + "learning_rate": 8.282842465007244e-06, + "loss": 0.6664, + "step": 9957 + }, + { + "epoch": 0.5480763938576697, + "grad_norm": 0.7242427468299866, + "learning_rate": 8.282515503363024e-06, + "loss": 0.8199, + "step": 9958 + }, + { + "epoch": 0.5481314326600253, + "grad_norm": 0.7529763579368591, + "learning_rate": 8.282188517048295e-06, + "loss": 0.761, + "step": 9959 + }, + { + "epoch": 0.548186471462381, + "grad_norm": 0.7909425497055054, + "learning_rate": 8.281861506065519e-06, + "loss": 0.7389, + "step": 9960 + }, + { + "epoch": 0.5482415102647367, + "grad_norm": 0.6594850420951843, + "learning_rate": 8.281534470417147e-06, + "loss": 0.7473, + "step": 9961 + }, + { + "epoch": 0.5482965490670924, + "grad_norm": 0.6900844573974609, + "learning_rate": 8.281207410105642e-06, + "loss": 0.7551, + "step": 9962 + }, + { + "epoch": 0.5483515878694479, + "grad_norm": 0.6922640204429626, + "learning_rate": 8.28088032513346e-06, + "loss": 0.7654, + "step": 9963 + }, + { + "epoch": 0.5484066266718036, + "grad_norm": 0.7758432626724243, + "learning_rate": 8.28055321550306e-06, + "loss": 0.8033, + "step": 9964 + }, + { + "epoch": 0.5484616654741593, + "grad_norm": 0.7074280977249146, + "learning_rate": 8.2802260812169e-06, + "loss": 0.7302, + "step": 9965 + }, + { + "epoch": 0.5485167042765149, + "grad_norm": 0.7724928259849548, + "learning_rate": 8.27989892227744e-06, + "loss": 0.7621, + "step": 9966 + }, + { + "epoch": 0.5485717430788706, + "grad_norm": 0.7364168167114258, + "learning_rate": 8.279571738687137e-06, + "loss": 0.7587, + "step": 9967 + }, + { + "epoch": 0.5486267818812263, + "grad_norm": 0.7298350930213928, + "learning_rate": 8.27924453044845e-06, + "loss": 0.7371, + "step": 9968 + }, + { + "epoch": 0.548681820683582, + "grad_norm": 0.8056737780570984, + "learning_rate": 8.27891729756384e-06, + "loss": 0.9871, + "step": 9969 + }, + { + "epoch": 0.5487368594859375, + "grad_norm": 0.7499688267707825, + "learning_rate": 8.278590040035763e-06, + "loss": 0.8574, + "step": 9970 + }, + { + "epoch": 0.5487918982882932, + "grad_norm": 0.7398175001144409, + "learning_rate": 8.278262757866683e-06, + "loss": 0.744, + "step": 9971 + }, + { + "epoch": 0.5488469370906489, + "grad_norm": 0.7099171876907349, + "learning_rate": 8.277935451059058e-06, + "loss": 0.7108, + "step": 9972 + }, + { + "epoch": 0.5489019758930046, + "grad_norm": 0.6720188856124878, + "learning_rate": 8.277608119615345e-06, + "loss": 0.8565, + "step": 9973 + }, + { + "epoch": 0.5489570146953602, + "grad_norm": 0.7870737910270691, + "learning_rate": 8.27728076353801e-06, + "loss": 0.7429, + "step": 9974 + }, + { + "epoch": 0.5490120534977159, + "grad_norm": 0.7358133792877197, + "learning_rate": 8.276953382829507e-06, + "loss": 0.7549, + "step": 9975 + }, + { + "epoch": 0.5490670923000716, + "grad_norm": 0.8968467116355896, + "learning_rate": 8.276625977492303e-06, + "loss": 0.6983, + "step": 9976 + }, + { + "epoch": 0.5491221311024272, + "grad_norm": 0.7346875071525574, + "learning_rate": 8.276298547528852e-06, + "loss": 0.8541, + "step": 9977 + }, + { + "epoch": 0.5491771699047828, + "grad_norm": 0.7297229170799255, + "learning_rate": 8.27597109294162e-06, + "loss": 0.8378, + "step": 9978 + }, + { + "epoch": 0.5492322087071385, + "grad_norm": 0.6907635927200317, + "learning_rate": 8.275643613733064e-06, + "loss": 0.7058, + "step": 9979 + }, + { + "epoch": 0.5492872475094942, + "grad_norm": 0.7612239718437195, + "learning_rate": 8.27531610990565e-06, + "loss": 0.6827, + "step": 9980 + }, + { + "epoch": 0.5493422863118499, + "grad_norm": 1.3160386085510254, + "learning_rate": 8.274988581461837e-06, + "loss": 0.7357, + "step": 9981 + }, + { + "epoch": 0.5493973251142055, + "grad_norm": 0.6370541453361511, + "learning_rate": 8.274661028404083e-06, + "loss": 0.7323, + "step": 9982 + }, + { + "epoch": 0.5494523639165612, + "grad_norm": 0.7051724195480347, + "learning_rate": 8.274333450734856e-06, + "loss": 0.7714, + "step": 9983 + }, + { + "epoch": 0.5495074027189168, + "grad_norm": 0.7452969551086426, + "learning_rate": 8.274005848456614e-06, + "loss": 0.7516, + "step": 9984 + }, + { + "epoch": 0.5495624415212725, + "grad_norm": 0.7132626175880432, + "learning_rate": 8.273678221571823e-06, + "loss": 0.6417, + "step": 9985 + }, + { + "epoch": 0.5496174803236281, + "grad_norm": 0.7873446345329285, + "learning_rate": 8.273350570082941e-06, + "loss": 0.8457, + "step": 9986 + }, + { + "epoch": 0.5496725191259838, + "grad_norm": 0.691470205783844, + "learning_rate": 8.273022893992432e-06, + "loss": 0.7871, + "step": 9987 + }, + { + "epoch": 0.5497275579283395, + "grad_norm": 0.6671431064605713, + "learning_rate": 8.27269519330276e-06, + "loss": 0.6919, + "step": 9988 + }, + { + "epoch": 0.5497825967306952, + "grad_norm": 0.8026914596557617, + "learning_rate": 8.272367468016387e-06, + "loss": 0.6885, + "step": 9989 + }, + { + "epoch": 0.5498376355330508, + "grad_norm": 0.9003152251243591, + "learning_rate": 8.272039718135774e-06, + "loss": 0.7671, + "step": 9990 + }, + { + "epoch": 0.5498926743354065, + "grad_norm": 0.6515254378318787, + "learning_rate": 8.271711943663388e-06, + "loss": 0.7589, + "step": 9991 + }, + { + "epoch": 0.5499477131377621, + "grad_norm": 0.6495782136917114, + "learning_rate": 8.27138414460169e-06, + "loss": 0.7277, + "step": 9992 + }, + { + "epoch": 0.5500027519401178, + "grad_norm": 0.7564565539360046, + "learning_rate": 8.271056320953146e-06, + "loss": 0.6977, + "step": 9993 + }, + { + "epoch": 0.5500577907424734, + "grad_norm": 0.8551548719406128, + "learning_rate": 8.270728472720218e-06, + "loss": 0.684, + "step": 9994 + }, + { + "epoch": 0.5501128295448291, + "grad_norm": 0.6614843010902405, + "learning_rate": 8.270400599905369e-06, + "loss": 0.6559, + "step": 9995 + }, + { + "epoch": 0.5501678683471848, + "grad_norm": 0.6920068264007568, + "learning_rate": 8.270072702511065e-06, + "loss": 0.7497, + "step": 9996 + }, + { + "epoch": 0.5502229071495405, + "grad_norm": 0.7426198124885559, + "learning_rate": 8.26974478053977e-06, + "loss": 0.7434, + "step": 9997 + }, + { + "epoch": 0.550277945951896, + "grad_norm": 1.2630934715270996, + "learning_rate": 8.269416833993949e-06, + "loss": 0.7306, + "step": 9998 + }, + { + "epoch": 0.5503329847542517, + "grad_norm": 0.7069457769393921, + "learning_rate": 8.269088862876066e-06, + "loss": 0.6735, + "step": 9999 + }, + { + "epoch": 0.5503880235566074, + "grad_norm": 0.8945016264915466, + "learning_rate": 8.268760867188586e-06, + "loss": 0.7575, + "step": 10000 + }, + { + "epoch": 0.5504430623589631, + "grad_norm": 0.7708195447921753, + "learning_rate": 8.268432846933974e-06, + "loss": 0.6988, + "step": 10001 + }, + { + "epoch": 0.5504981011613187, + "grad_norm": 0.7884799838066101, + "learning_rate": 8.268104802114696e-06, + "loss": 0.8085, + "step": 10002 + }, + { + "epoch": 0.5505531399636744, + "grad_norm": 0.7801569104194641, + "learning_rate": 8.267776732733217e-06, + "loss": 0.886, + "step": 10003 + }, + { + "epoch": 0.5506081787660301, + "grad_norm": 0.714645504951477, + "learning_rate": 8.267448638792004e-06, + "loss": 0.7151, + "step": 10004 + }, + { + "epoch": 0.5506632175683858, + "grad_norm": 0.653136134147644, + "learning_rate": 8.267120520293519e-06, + "loss": 0.6347, + "step": 10005 + }, + { + "epoch": 0.5507182563707413, + "grad_norm": 0.8821585774421692, + "learning_rate": 8.266792377240233e-06, + "loss": 0.6457, + "step": 10006 + }, + { + "epoch": 0.550773295173097, + "grad_norm": 0.7056930661201477, + "learning_rate": 8.266464209634608e-06, + "loss": 0.8709, + "step": 10007 + }, + { + "epoch": 0.5508283339754527, + "grad_norm": 0.6505821347236633, + "learning_rate": 8.266136017479113e-06, + "loss": 0.7674, + "step": 10008 + }, + { + "epoch": 0.5508833727778083, + "grad_norm": 0.7947389483451843, + "learning_rate": 8.265807800776216e-06, + "loss": 0.7882, + "step": 10009 + }, + { + "epoch": 0.550938411580164, + "grad_norm": 0.7466071844100952, + "learning_rate": 8.265479559528379e-06, + "loss": 0.7673, + "step": 10010 + }, + { + "epoch": 0.5509934503825197, + "grad_norm": 0.706430971622467, + "learning_rate": 8.265151293738074e-06, + "loss": 0.7796, + "step": 10011 + }, + { + "epoch": 0.5510484891848754, + "grad_norm": 0.7701015472412109, + "learning_rate": 8.264823003407765e-06, + "loss": 0.7631, + "step": 10012 + }, + { + "epoch": 0.551103527987231, + "grad_norm": 0.6923625469207764, + "learning_rate": 8.264494688539922e-06, + "loss": 0.7659, + "step": 10013 + }, + { + "epoch": 0.5511585667895866, + "grad_norm": 0.6585322618484497, + "learning_rate": 8.264166349137008e-06, + "loss": 0.7248, + "step": 10014 + }, + { + "epoch": 0.5512136055919423, + "grad_norm": 0.698451578617096, + "learning_rate": 8.263837985201493e-06, + "loss": 0.7768, + "step": 10015 + }, + { + "epoch": 0.551268644394298, + "grad_norm": 0.7585058808326721, + "learning_rate": 8.263509596735847e-06, + "loss": 0.8535, + "step": 10016 + }, + { + "epoch": 0.5513236831966536, + "grad_norm": 0.6973930597305298, + "learning_rate": 8.263181183742536e-06, + "loss": 0.8253, + "step": 10017 + }, + { + "epoch": 0.5513787219990093, + "grad_norm": 0.6752467751502991, + "learning_rate": 8.26285274622403e-06, + "loss": 0.7402, + "step": 10018 + }, + { + "epoch": 0.551433760801365, + "grad_norm": 0.717555820941925, + "learning_rate": 8.262524284182794e-06, + "loss": 0.8057, + "step": 10019 + }, + { + "epoch": 0.5514887996037207, + "grad_norm": 0.6975438594818115, + "learning_rate": 8.2621957976213e-06, + "loss": 0.803, + "step": 10020 + }, + { + "epoch": 0.5515438384060762, + "grad_norm": 0.667797327041626, + "learning_rate": 8.261867286542016e-06, + "loss": 0.7387, + "step": 10021 + }, + { + "epoch": 0.5515988772084319, + "grad_norm": 0.7330532670021057, + "learning_rate": 8.261538750947411e-06, + "loss": 0.8143, + "step": 10022 + }, + { + "epoch": 0.5516539160107876, + "grad_norm": 0.7034017443656921, + "learning_rate": 8.261210190839952e-06, + "loss": 0.739, + "step": 10023 + }, + { + "epoch": 0.5517089548131433, + "grad_norm": 0.709284245967865, + "learning_rate": 8.260881606222113e-06, + "loss": 0.8021, + "step": 10024 + }, + { + "epoch": 0.5517639936154989, + "grad_norm": 0.7587909698486328, + "learning_rate": 8.260552997096359e-06, + "loss": 0.8346, + "step": 10025 + }, + { + "epoch": 0.5518190324178546, + "grad_norm": 0.7413986325263977, + "learning_rate": 8.26022436346516e-06, + "loss": 0.6777, + "step": 10026 + }, + { + "epoch": 0.5518740712202103, + "grad_norm": 0.7112768292427063, + "learning_rate": 8.25989570533099e-06, + "loss": 0.7017, + "step": 10027 + }, + { + "epoch": 0.551929110022566, + "grad_norm": 0.7097088098526001, + "learning_rate": 8.259567022696315e-06, + "loss": 0.7315, + "step": 10028 + }, + { + "epoch": 0.5519841488249215, + "grad_norm": 0.6544226408004761, + "learning_rate": 8.259238315563606e-06, + "loss": 0.7729, + "step": 10029 + }, + { + "epoch": 0.5520391876272772, + "grad_norm": 0.6892885565757751, + "learning_rate": 8.258909583935335e-06, + "loss": 0.7919, + "step": 10030 + }, + { + "epoch": 0.5520942264296329, + "grad_norm": 0.697424054145813, + "learning_rate": 8.258580827813972e-06, + "loss": 0.7514, + "step": 10031 + }, + { + "epoch": 0.5521492652319886, + "grad_norm": 0.7021437883377075, + "learning_rate": 8.258252047201989e-06, + "loss": 0.747, + "step": 10032 + }, + { + "epoch": 0.5522043040343442, + "grad_norm": 0.6974816918373108, + "learning_rate": 8.257923242101854e-06, + "loss": 0.7245, + "step": 10033 + }, + { + "epoch": 0.5522593428366999, + "grad_norm": 0.6645311117172241, + "learning_rate": 8.25759441251604e-06, + "loss": 0.649, + "step": 10034 + }, + { + "epoch": 0.5523143816390556, + "grad_norm": 0.7223736643791199, + "learning_rate": 8.25726555844702e-06, + "loss": 0.7792, + "step": 10035 + }, + { + "epoch": 0.5523694204414112, + "grad_norm": 0.7253531813621521, + "learning_rate": 8.256936679897262e-06, + "loss": 0.7636, + "step": 10036 + }, + { + "epoch": 0.5524244592437668, + "grad_norm": 0.6979514956474304, + "learning_rate": 8.256607776869241e-06, + "loss": 0.7929, + "step": 10037 + }, + { + "epoch": 0.5524794980461225, + "grad_norm": 0.7442019581794739, + "learning_rate": 8.25627884936543e-06, + "loss": 0.6984, + "step": 10038 + }, + { + "epoch": 0.5525345368484782, + "grad_norm": 0.7519513964653015, + "learning_rate": 8.255949897388294e-06, + "loss": 0.7228, + "step": 10039 + }, + { + "epoch": 0.5525895756508339, + "grad_norm": 0.7302790880203247, + "learning_rate": 8.255620920940313e-06, + "loss": 0.7555, + "step": 10040 + }, + { + "epoch": 0.5526446144531895, + "grad_norm": 0.6521434187889099, + "learning_rate": 8.255291920023956e-06, + "loss": 0.7825, + "step": 10041 + }, + { + "epoch": 0.5526996532555452, + "grad_norm": 0.8270126581192017, + "learning_rate": 8.254962894641695e-06, + "loss": 0.7939, + "step": 10042 + }, + { + "epoch": 0.5527546920579008, + "grad_norm": 0.7209310531616211, + "learning_rate": 8.254633844796007e-06, + "loss": 0.8286, + "step": 10043 + }, + { + "epoch": 0.5528097308602565, + "grad_norm": 0.6506814360618591, + "learning_rate": 8.25430477048936e-06, + "loss": 0.7209, + "step": 10044 + }, + { + "epoch": 0.5528647696626121, + "grad_norm": 0.6914637684822083, + "learning_rate": 8.25397567172423e-06, + "loss": 0.705, + "step": 10045 + }, + { + "epoch": 0.5529198084649678, + "grad_norm": 0.8369725942611694, + "learning_rate": 8.253646548503091e-06, + "loss": 0.8254, + "step": 10046 + }, + { + "epoch": 0.5529748472673235, + "grad_norm": 0.7809324860572815, + "learning_rate": 8.253317400828414e-06, + "loss": 0.8117, + "step": 10047 + }, + { + "epoch": 0.5530298860696792, + "grad_norm": 0.7184550762176514, + "learning_rate": 8.252988228702676e-06, + "loss": 0.738, + "step": 10048 + }, + { + "epoch": 0.5530849248720348, + "grad_norm": 0.7111478447914124, + "learning_rate": 8.252659032128347e-06, + "loss": 0.7143, + "step": 10049 + }, + { + "epoch": 0.5531399636743904, + "grad_norm": 0.7506794333457947, + "learning_rate": 8.252329811107905e-06, + "loss": 0.7721, + "step": 10050 + }, + { + "epoch": 0.5531950024767461, + "grad_norm": 0.7700625658035278, + "learning_rate": 8.252000565643823e-06, + "loss": 0.7993, + "step": 10051 + }, + { + "epoch": 0.5532500412791017, + "grad_norm": 0.6985816955566406, + "learning_rate": 8.251671295738575e-06, + "loss": 0.7461, + "step": 10052 + }, + { + "epoch": 0.5533050800814574, + "grad_norm": 0.6932175755500793, + "learning_rate": 8.251342001394635e-06, + "loss": 0.6804, + "step": 10053 + }, + { + "epoch": 0.5533601188838131, + "grad_norm": 0.8060765266418457, + "learning_rate": 8.25101268261448e-06, + "loss": 0.7137, + "step": 10054 + }, + { + "epoch": 0.5534151576861688, + "grad_norm": 0.6853482127189636, + "learning_rate": 8.250683339400582e-06, + "loss": 0.7229, + "step": 10055 + }, + { + "epoch": 0.5534701964885244, + "grad_norm": 0.7581862211227417, + "learning_rate": 8.25035397175542e-06, + "loss": 0.8091, + "step": 10056 + }, + { + "epoch": 0.55352523529088, + "grad_norm": 0.7375245094299316, + "learning_rate": 8.250024579681466e-06, + "loss": 0.7234, + "step": 10057 + }, + { + "epoch": 0.5535802740932357, + "grad_norm": 0.7904585599899292, + "learning_rate": 8.249695163181198e-06, + "loss": 0.7295, + "step": 10058 + }, + { + "epoch": 0.5536353128955914, + "grad_norm": 0.6593602895736694, + "learning_rate": 8.249365722257092e-06, + "loss": 0.7492, + "step": 10059 + }, + { + "epoch": 0.553690351697947, + "grad_norm": 0.7226922512054443, + "learning_rate": 8.249036256911622e-06, + "loss": 0.8177, + "step": 10060 + }, + { + "epoch": 0.5537453905003027, + "grad_norm": 0.7268722653388977, + "learning_rate": 8.248706767147265e-06, + "loss": 0.8059, + "step": 10061 + }, + { + "epoch": 0.5538004293026584, + "grad_norm": 0.7797269225120544, + "learning_rate": 8.248377252966499e-06, + "loss": 0.8122, + "step": 10062 + }, + { + "epoch": 0.5538554681050141, + "grad_norm": 0.7199145555496216, + "learning_rate": 8.248047714371797e-06, + "loss": 0.7312, + "step": 10063 + }, + { + "epoch": 0.5539105069073696, + "grad_norm": 0.6950703263282776, + "learning_rate": 8.24771815136564e-06, + "loss": 0.757, + "step": 10064 + }, + { + "epoch": 0.5539655457097253, + "grad_norm": 0.6413441896438599, + "learning_rate": 8.247388563950502e-06, + "loss": 0.6955, + "step": 10065 + }, + { + "epoch": 0.554020584512081, + "grad_norm": 0.7650758624076843, + "learning_rate": 8.24705895212886e-06, + "loss": 0.8355, + "step": 10066 + }, + { + "epoch": 0.5540756233144367, + "grad_norm": 0.7067090272903442, + "learning_rate": 8.246729315903192e-06, + "loss": 0.7409, + "step": 10067 + }, + { + "epoch": 0.5541306621167923, + "grad_norm": 0.7763532996177673, + "learning_rate": 8.246399655275976e-06, + "loss": 0.8097, + "step": 10068 + }, + { + "epoch": 0.554185700919148, + "grad_norm": 0.6865057945251465, + "learning_rate": 8.246069970249689e-06, + "loss": 0.7597, + "step": 10069 + }, + { + "epoch": 0.5542407397215037, + "grad_norm": 0.7643107771873474, + "learning_rate": 8.24574026082681e-06, + "loss": 0.7403, + "step": 10070 + }, + { + "epoch": 0.5542957785238594, + "grad_norm": 0.7354087829589844, + "learning_rate": 8.245410527009815e-06, + "loss": 0.8896, + "step": 10071 + }, + { + "epoch": 0.5543508173262149, + "grad_norm": 0.7971135973930359, + "learning_rate": 8.245080768801183e-06, + "loss": 0.7738, + "step": 10072 + }, + { + "epoch": 0.5544058561285706, + "grad_norm": 1.0506731271743774, + "learning_rate": 8.244750986203394e-06, + "loss": 0.7888, + "step": 10073 + }, + { + "epoch": 0.5544608949309263, + "grad_norm": 0.8305885195732117, + "learning_rate": 8.244421179218925e-06, + "loss": 0.8186, + "step": 10074 + }, + { + "epoch": 0.554515933733282, + "grad_norm": 0.9507874250411987, + "learning_rate": 8.244091347850253e-06, + "loss": 0.7975, + "step": 10075 + }, + { + "epoch": 0.5545709725356376, + "grad_norm": 0.7146797776222229, + "learning_rate": 8.243761492099861e-06, + "loss": 0.6895, + "step": 10076 + }, + { + "epoch": 0.5546260113379933, + "grad_norm": 0.734990656375885, + "learning_rate": 8.243431611970225e-06, + "loss": 0.8087, + "step": 10077 + }, + { + "epoch": 0.554681050140349, + "grad_norm": 0.6807795166969299, + "learning_rate": 8.243101707463825e-06, + "loss": 0.7861, + "step": 10078 + }, + { + "epoch": 0.5547360889427047, + "grad_norm": 0.7412874698638916, + "learning_rate": 8.242771778583142e-06, + "loss": 0.7864, + "step": 10079 + }, + { + "epoch": 0.5547911277450602, + "grad_norm": 0.6655074954032898, + "learning_rate": 8.242441825330652e-06, + "loss": 0.6554, + "step": 10080 + }, + { + "epoch": 0.5548461665474159, + "grad_norm": 0.7549700140953064, + "learning_rate": 8.242111847708838e-06, + "loss": 0.8031, + "step": 10081 + }, + { + "epoch": 0.5549012053497716, + "grad_norm": 0.8907766342163086, + "learning_rate": 8.241781845720181e-06, + "loss": 0.8068, + "step": 10082 + }, + { + "epoch": 0.5549562441521273, + "grad_norm": 0.7347774505615234, + "learning_rate": 8.241451819367157e-06, + "loss": 0.7453, + "step": 10083 + }, + { + "epoch": 0.5550112829544829, + "grad_norm": 0.6856632828712463, + "learning_rate": 8.24112176865225e-06, + "loss": 0.6235, + "step": 10084 + }, + { + "epoch": 0.5550663217568386, + "grad_norm": 0.7134507298469543, + "learning_rate": 8.24079169357794e-06, + "loss": 0.7991, + "step": 10085 + }, + { + "epoch": 0.5551213605591943, + "grad_norm": 0.7814854383468628, + "learning_rate": 8.240461594146704e-06, + "loss": 0.7681, + "step": 10086 + }, + { + "epoch": 0.5551763993615499, + "grad_norm": 0.6893261671066284, + "learning_rate": 8.240131470361028e-06, + "loss": 0.7746, + "step": 10087 + }, + { + "epoch": 0.5552314381639055, + "grad_norm": 0.925003170967102, + "learning_rate": 8.239801322223393e-06, + "loss": 0.7621, + "step": 10088 + }, + { + "epoch": 0.5552864769662612, + "grad_norm": 0.6261017918586731, + "learning_rate": 8.239471149736277e-06, + "loss": 0.7673, + "step": 10089 + }, + { + "epoch": 0.5553415157686169, + "grad_norm": 0.7268226146697998, + "learning_rate": 8.239140952902162e-06, + "loss": 0.7375, + "step": 10090 + }, + { + "epoch": 0.5553965545709726, + "grad_norm": 0.8062194585800171, + "learning_rate": 8.238810731723532e-06, + "loss": 0.8002, + "step": 10091 + }, + { + "epoch": 0.5554515933733282, + "grad_norm": 0.892842173576355, + "learning_rate": 8.238480486202867e-06, + "loss": 0.7959, + "step": 10092 + }, + { + "epoch": 0.5555066321756839, + "grad_norm": 0.7530377507209778, + "learning_rate": 8.23815021634265e-06, + "loss": 0.8137, + "step": 10093 + }, + { + "epoch": 0.5555616709780395, + "grad_norm": 0.6994850635528564, + "learning_rate": 8.237819922145364e-06, + "loss": 0.7966, + "step": 10094 + }, + { + "epoch": 0.5556167097803951, + "grad_norm": 0.8502941727638245, + "learning_rate": 8.237489603613488e-06, + "loss": 0.7668, + "step": 10095 + }, + { + "epoch": 0.5556717485827508, + "grad_norm": 0.6583576798439026, + "learning_rate": 8.237159260749507e-06, + "loss": 0.7379, + "step": 10096 + }, + { + "epoch": 0.5557267873851065, + "grad_norm": 0.9539539217948914, + "learning_rate": 8.236828893555904e-06, + "loss": 0.7563, + "step": 10097 + }, + { + "epoch": 0.5557818261874622, + "grad_norm": 0.7446413040161133, + "learning_rate": 8.236498502035162e-06, + "loss": 0.7329, + "step": 10098 + }, + { + "epoch": 0.5558368649898178, + "grad_norm": 0.8950835466384888, + "learning_rate": 8.236168086189761e-06, + "loss": 0.8144, + "step": 10099 + }, + { + "epoch": 0.5558919037921735, + "grad_norm": 0.7255009412765503, + "learning_rate": 8.235837646022191e-06, + "loss": 0.6946, + "step": 10100 + }, + { + "epoch": 0.5559469425945291, + "grad_norm": 0.6983402967453003, + "learning_rate": 8.235507181534929e-06, + "loss": 0.7371, + "step": 10101 + }, + { + "epoch": 0.5560019813968848, + "grad_norm": 1.043593168258667, + "learning_rate": 8.235176692730463e-06, + "loss": 0.6763, + "step": 10102 + }, + { + "epoch": 0.5560570201992404, + "grad_norm": 0.7452800869941711, + "learning_rate": 8.234846179611272e-06, + "loss": 0.8945, + "step": 10103 + }, + { + "epoch": 0.5561120590015961, + "grad_norm": 0.6367164254188538, + "learning_rate": 8.234515642179845e-06, + "loss": 0.6542, + "step": 10104 + }, + { + "epoch": 0.5561670978039518, + "grad_norm": 0.8377598524093628, + "learning_rate": 8.234185080438664e-06, + "loss": 0.787, + "step": 10105 + }, + { + "epoch": 0.5562221366063075, + "grad_norm": 0.7353680729866028, + "learning_rate": 8.233854494390214e-06, + "loss": 0.6391, + "step": 10106 + }, + { + "epoch": 0.5562771754086631, + "grad_norm": 0.7431599497795105, + "learning_rate": 8.233523884036977e-06, + "loss": 0.8221, + "step": 10107 + }, + { + "epoch": 0.5563322142110187, + "grad_norm": 0.7292743921279907, + "learning_rate": 8.233193249381442e-06, + "loss": 0.7791, + "step": 10108 + }, + { + "epoch": 0.5563872530133744, + "grad_norm": 0.7251895666122437, + "learning_rate": 8.232862590426091e-06, + "loss": 0.7993, + "step": 10109 + }, + { + "epoch": 0.5564422918157301, + "grad_norm": 0.7373167276382446, + "learning_rate": 8.23253190717341e-06, + "loss": 0.861, + "step": 10110 + }, + { + "epoch": 0.5564973306180857, + "grad_norm": 0.6689401268959045, + "learning_rate": 8.232201199625887e-06, + "loss": 0.7002, + "step": 10111 + }, + { + "epoch": 0.5565523694204414, + "grad_norm": 0.7405139207839966, + "learning_rate": 8.231870467786003e-06, + "loss": 0.8041, + "step": 10112 + }, + { + "epoch": 0.5566074082227971, + "grad_norm": 0.7561736702919006, + "learning_rate": 8.231539711656246e-06, + "loss": 0.7687, + "step": 10113 + }, + { + "epoch": 0.5566624470251528, + "grad_norm": 0.6857489943504333, + "learning_rate": 8.231208931239103e-06, + "loss": 0.7175, + "step": 10114 + }, + { + "epoch": 0.5567174858275084, + "grad_norm": 0.7410408854484558, + "learning_rate": 8.230878126537057e-06, + "loss": 0.7337, + "step": 10115 + }, + { + "epoch": 0.556772524629864, + "grad_norm": 0.7533249258995056, + "learning_rate": 8.230547297552595e-06, + "loss": 0.7226, + "step": 10116 + }, + { + "epoch": 0.5568275634322197, + "grad_norm": 0.6227561235427856, + "learning_rate": 8.230216444288207e-06, + "loss": 0.711, + "step": 10117 + }, + { + "epoch": 0.5568826022345754, + "grad_norm": 0.6790871024131775, + "learning_rate": 8.229885566746373e-06, + "loss": 0.728, + "step": 10118 + }, + { + "epoch": 0.556937641036931, + "grad_norm": 1.0007857084274292, + "learning_rate": 8.229554664929587e-06, + "loss": 0.9193, + "step": 10119 + }, + { + "epoch": 0.5569926798392867, + "grad_norm": 0.7167220711708069, + "learning_rate": 8.229223738840331e-06, + "loss": 0.8288, + "step": 10120 + }, + { + "epoch": 0.5570477186416424, + "grad_norm": 0.8037107586860657, + "learning_rate": 8.228892788481095e-06, + "loss": 0.8462, + "step": 10121 + }, + { + "epoch": 0.5571027574439981, + "grad_norm": 0.7355597615242004, + "learning_rate": 8.228561813854363e-06, + "loss": 0.7998, + "step": 10122 + }, + { + "epoch": 0.5571577962463536, + "grad_norm": 0.7384124994277954, + "learning_rate": 8.228230814962625e-06, + "loss": 0.7861, + "step": 10123 + }, + { + "epoch": 0.5572128350487093, + "grad_norm": 0.8170364499092102, + "learning_rate": 8.227899791808371e-06, + "loss": 0.8005, + "step": 10124 + }, + { + "epoch": 0.557267873851065, + "grad_norm": 0.678702175617218, + "learning_rate": 8.227568744394084e-06, + "loss": 0.7408, + "step": 10125 + }, + { + "epoch": 0.5573229126534207, + "grad_norm": 0.7212443947792053, + "learning_rate": 8.227237672722255e-06, + "loss": 0.7127, + "step": 10126 + }, + { + "epoch": 0.5573779514557763, + "grad_norm": 0.7035290002822876, + "learning_rate": 8.22690657679537e-06, + "loss": 0.8263, + "step": 10127 + }, + { + "epoch": 0.557432990258132, + "grad_norm": 0.6535285115242004, + "learning_rate": 8.226575456615921e-06, + "loss": 0.6979, + "step": 10128 + }, + { + "epoch": 0.5574880290604877, + "grad_norm": 0.7353794574737549, + "learning_rate": 8.226244312186396e-06, + "loss": 0.6838, + "step": 10129 + }, + { + "epoch": 0.5575430678628434, + "grad_norm": 0.5839618444442749, + "learning_rate": 8.225913143509278e-06, + "loss": 0.5925, + "step": 10130 + }, + { + "epoch": 0.5575981066651989, + "grad_norm": 0.6922228336334229, + "learning_rate": 8.225581950587063e-06, + "loss": 0.6808, + "step": 10131 + }, + { + "epoch": 0.5576531454675546, + "grad_norm": 0.753989040851593, + "learning_rate": 8.225250733422236e-06, + "loss": 0.6567, + "step": 10132 + }, + { + "epoch": 0.5577081842699103, + "grad_norm": 0.7327600717544556, + "learning_rate": 8.22491949201729e-06, + "loss": 0.8311, + "step": 10133 + }, + { + "epoch": 0.557763223072266, + "grad_norm": 0.6435133218765259, + "learning_rate": 8.224588226374712e-06, + "loss": 0.6684, + "step": 10134 + }, + { + "epoch": 0.5578182618746216, + "grad_norm": 0.6402057409286499, + "learning_rate": 8.22425693649699e-06, + "loss": 0.7569, + "step": 10135 + }, + { + "epoch": 0.5578733006769773, + "grad_norm": 0.7454472780227661, + "learning_rate": 8.223925622386617e-06, + "loss": 0.7908, + "step": 10136 + }, + { + "epoch": 0.557928339479333, + "grad_norm": 0.7373154759407043, + "learning_rate": 8.223594284046084e-06, + "loss": 0.8232, + "step": 10137 + }, + { + "epoch": 0.5579833782816885, + "grad_norm": 0.6478374004364014, + "learning_rate": 8.223262921477878e-06, + "loss": 0.7353, + "step": 10138 + }, + { + "epoch": 0.5580384170840442, + "grad_norm": 0.715212881565094, + "learning_rate": 8.222931534684488e-06, + "loss": 0.729, + "step": 10139 + }, + { + "epoch": 0.5580934558863999, + "grad_norm": 0.9226915240287781, + "learning_rate": 8.22260012366841e-06, + "loss": 0.7846, + "step": 10140 + }, + { + "epoch": 0.5581484946887556, + "grad_norm": 0.6481993198394775, + "learning_rate": 8.222268688432132e-06, + "loss": 0.6955, + "step": 10141 + }, + { + "epoch": 0.5582035334911112, + "grad_norm": 0.7240349054336548, + "learning_rate": 8.221937228978145e-06, + "loss": 0.7956, + "step": 10142 + }, + { + "epoch": 0.5582585722934669, + "grad_norm": 0.7089122533798218, + "learning_rate": 8.221605745308939e-06, + "loss": 0.7481, + "step": 10143 + }, + { + "epoch": 0.5583136110958226, + "grad_norm": 0.7292537093162537, + "learning_rate": 8.221274237427009e-06, + "loss": 0.7797, + "step": 10144 + }, + { + "epoch": 0.5583686498981782, + "grad_norm": 0.7104652523994446, + "learning_rate": 8.220942705334841e-06, + "loss": 0.7966, + "step": 10145 + }, + { + "epoch": 0.5584236887005338, + "grad_norm": 0.7656546831130981, + "learning_rate": 8.220611149034931e-06, + "loss": 0.7541, + "step": 10146 + }, + { + "epoch": 0.5584787275028895, + "grad_norm": 0.7618892788887024, + "learning_rate": 8.22027956852977e-06, + "loss": 0.6994, + "step": 10147 + }, + { + "epoch": 0.5585337663052452, + "grad_norm": 0.6445756554603577, + "learning_rate": 8.219947963821851e-06, + "loss": 0.7303, + "step": 10148 + }, + { + "epoch": 0.5585888051076009, + "grad_norm": 0.6529820561408997, + "learning_rate": 8.219616334913663e-06, + "loss": 0.7008, + "step": 10149 + }, + { + "epoch": 0.5586438439099565, + "grad_norm": 0.6890642046928406, + "learning_rate": 8.219284681807703e-06, + "loss": 0.8124, + "step": 10150 + }, + { + "epoch": 0.5586988827123122, + "grad_norm": 0.7273370027542114, + "learning_rate": 8.218953004506458e-06, + "loss": 0.7507, + "step": 10151 + }, + { + "epoch": 0.5587539215146679, + "grad_norm": 0.7239277362823486, + "learning_rate": 8.218621303012425e-06, + "loss": 0.7929, + "step": 10152 + }, + { + "epoch": 0.5588089603170235, + "grad_norm": 0.660275399684906, + "learning_rate": 8.218289577328096e-06, + "loss": 0.7418, + "step": 10153 + }, + { + "epoch": 0.5588639991193791, + "grad_norm": 0.7406648993492126, + "learning_rate": 8.217957827455965e-06, + "loss": 0.8072, + "step": 10154 + }, + { + "epoch": 0.5589190379217348, + "grad_norm": 0.7051703333854675, + "learning_rate": 8.217626053398522e-06, + "loss": 0.6562, + "step": 10155 + }, + { + "epoch": 0.5589740767240905, + "grad_norm": 0.93423992395401, + "learning_rate": 8.217294255158266e-06, + "loss": 0.738, + "step": 10156 + }, + { + "epoch": 0.5590291155264462, + "grad_norm": 0.8362720608711243, + "learning_rate": 8.216962432737685e-06, + "loss": 0.8585, + "step": 10157 + }, + { + "epoch": 0.5590841543288018, + "grad_norm": 0.9195587038993835, + "learning_rate": 8.216630586139277e-06, + "loss": 0.8778, + "step": 10158 + }, + { + "epoch": 0.5591391931311575, + "grad_norm": 0.7181550860404968, + "learning_rate": 8.216298715365534e-06, + "loss": 0.702, + "step": 10159 + }, + { + "epoch": 0.5591942319335131, + "grad_norm": 0.6900259852409363, + "learning_rate": 8.21596682041895e-06, + "loss": 0.7652, + "step": 10160 + }, + { + "epoch": 0.5592492707358688, + "grad_norm": 0.7523833513259888, + "learning_rate": 8.215634901302022e-06, + "loss": 0.7881, + "step": 10161 + }, + { + "epoch": 0.5593043095382244, + "grad_norm": 0.6659645438194275, + "learning_rate": 8.215302958017241e-06, + "loss": 0.694, + "step": 10162 + }, + { + "epoch": 0.5593593483405801, + "grad_norm": 0.8898606300354004, + "learning_rate": 8.214970990567105e-06, + "loss": 0.8534, + "step": 10163 + }, + { + "epoch": 0.5594143871429358, + "grad_norm": 0.6759241819381714, + "learning_rate": 8.214638998954108e-06, + "loss": 0.8241, + "step": 10164 + }, + { + "epoch": 0.5594694259452915, + "grad_norm": 0.7136911749839783, + "learning_rate": 8.214306983180744e-06, + "loss": 0.7846, + "step": 10165 + }, + { + "epoch": 0.559524464747647, + "grad_norm": 0.6781616806983948, + "learning_rate": 8.213974943249509e-06, + "loss": 0.7116, + "step": 10166 + }, + { + "epoch": 0.5595795035500027, + "grad_norm": 0.7134156227111816, + "learning_rate": 8.213642879162898e-06, + "loss": 0.7537, + "step": 10167 + }, + { + "epoch": 0.5596345423523584, + "grad_norm": 1.306710124015808, + "learning_rate": 8.213310790923408e-06, + "loss": 0.8506, + "step": 10168 + }, + { + "epoch": 0.5596895811547141, + "grad_norm": 0.725304901599884, + "learning_rate": 8.212978678533534e-06, + "loss": 0.8115, + "step": 10169 + }, + { + "epoch": 0.5597446199570697, + "grad_norm": 0.7833520174026489, + "learning_rate": 8.212646541995772e-06, + "loss": 0.919, + "step": 10170 + }, + { + "epoch": 0.5597996587594254, + "grad_norm": 0.6938104033470154, + "learning_rate": 8.212314381312621e-06, + "loss": 0.7303, + "step": 10171 + }, + { + "epoch": 0.5598546975617811, + "grad_norm": 0.6860232949256897, + "learning_rate": 8.211982196486573e-06, + "loss": 0.7709, + "step": 10172 + }, + { + "epoch": 0.5599097363641368, + "grad_norm": 0.6611567139625549, + "learning_rate": 8.211649987520126e-06, + "loss": 0.7711, + "step": 10173 + }, + { + "epoch": 0.5599647751664923, + "grad_norm": 0.8603463172912598, + "learning_rate": 8.211317754415778e-06, + "loss": 0.8527, + "step": 10174 + }, + { + "epoch": 0.560019813968848, + "grad_norm": 0.7350558638572693, + "learning_rate": 8.210985497176025e-06, + "loss": 0.8148, + "step": 10175 + }, + { + "epoch": 0.5600748527712037, + "grad_norm": 0.6881470084190369, + "learning_rate": 8.210653215803365e-06, + "loss": 0.7526, + "step": 10176 + }, + { + "epoch": 0.5601298915735594, + "grad_norm": 0.6879626512527466, + "learning_rate": 8.210320910300296e-06, + "loss": 0.7649, + "step": 10177 + }, + { + "epoch": 0.560184930375915, + "grad_norm": 0.6843587160110474, + "learning_rate": 8.209988580669312e-06, + "loss": 0.8131, + "step": 10178 + }, + { + "epoch": 0.5602399691782707, + "grad_norm": 0.6684302687644958, + "learning_rate": 8.209656226912915e-06, + "loss": 0.7256, + "step": 10179 + }, + { + "epoch": 0.5602950079806264, + "grad_norm": 0.7973861694335938, + "learning_rate": 8.209323849033601e-06, + "loss": 0.7924, + "step": 10180 + }, + { + "epoch": 0.560350046782982, + "grad_norm": 0.6850616931915283, + "learning_rate": 8.208991447033867e-06, + "loss": 0.7423, + "step": 10181 + }, + { + "epoch": 0.5604050855853376, + "grad_norm": 0.8284440636634827, + "learning_rate": 8.208659020916213e-06, + "loss": 0.7637, + "step": 10182 + }, + { + "epoch": 0.5604601243876933, + "grad_norm": 0.7671821713447571, + "learning_rate": 8.208326570683136e-06, + "loss": 0.7688, + "step": 10183 + }, + { + "epoch": 0.560515163190049, + "grad_norm": 0.8359144330024719, + "learning_rate": 8.207994096337135e-06, + "loss": 0.8179, + "step": 10184 + }, + { + "epoch": 0.5605702019924046, + "grad_norm": 0.6389699578285217, + "learning_rate": 8.207661597880709e-06, + "loss": 0.6987, + "step": 10185 + }, + { + "epoch": 0.5606252407947603, + "grad_norm": 0.6472755074501038, + "learning_rate": 8.20732907531636e-06, + "loss": 0.6984, + "step": 10186 + }, + { + "epoch": 0.560680279597116, + "grad_norm": 0.8231903314590454, + "learning_rate": 8.20699652864658e-06, + "loss": 0.8212, + "step": 10187 + }, + { + "epoch": 0.5607353183994717, + "grad_norm": 0.7550386190414429, + "learning_rate": 8.206663957873876e-06, + "loss": 0.7446, + "step": 10188 + }, + { + "epoch": 0.5607903572018272, + "grad_norm": 0.6704659461975098, + "learning_rate": 8.206331363000743e-06, + "loss": 0.7035, + "step": 10189 + }, + { + "epoch": 0.5608453960041829, + "grad_norm": 0.7258654236793518, + "learning_rate": 8.20599874402968e-06, + "loss": 0.7032, + "step": 10190 + }, + { + "epoch": 0.5609004348065386, + "grad_norm": 0.674609363079071, + "learning_rate": 8.20566610096319e-06, + "loss": 0.7545, + "step": 10191 + }, + { + "epoch": 0.5609554736088943, + "grad_norm": 0.6978347301483154, + "learning_rate": 8.205333433803773e-06, + "loss": 0.8198, + "step": 10192 + }, + { + "epoch": 0.5610105124112499, + "grad_norm": 0.6252121329307556, + "learning_rate": 8.205000742553925e-06, + "loss": 0.6639, + "step": 10193 + }, + { + "epoch": 0.5610655512136056, + "grad_norm": 0.7288224101066589, + "learning_rate": 8.204668027216152e-06, + "loss": 0.8035, + "step": 10194 + }, + { + "epoch": 0.5611205900159613, + "grad_norm": 0.6591556072235107, + "learning_rate": 8.20433528779295e-06, + "loss": 0.7552, + "step": 10195 + }, + { + "epoch": 0.561175628818317, + "grad_norm": 0.769827127456665, + "learning_rate": 8.204002524286823e-06, + "loss": 0.7279, + "step": 10196 + }, + { + "epoch": 0.5612306676206725, + "grad_norm": 0.74398273229599, + "learning_rate": 8.203669736700271e-06, + "loss": 0.7638, + "step": 10197 + }, + { + "epoch": 0.5612857064230282, + "grad_norm": 0.9343454241752625, + "learning_rate": 8.203336925035795e-06, + "loss": 0.7513, + "step": 10198 + }, + { + "epoch": 0.5613407452253839, + "grad_norm": 0.6667190194129944, + "learning_rate": 8.203004089295894e-06, + "loss": 0.77, + "step": 10199 + }, + { + "epoch": 0.5613957840277396, + "grad_norm": 0.7684557437896729, + "learning_rate": 8.202671229483073e-06, + "loss": 0.803, + "step": 10200 + }, + { + "epoch": 0.5614508228300952, + "grad_norm": 0.6551374793052673, + "learning_rate": 8.202338345599832e-06, + "loss": 0.6914, + "step": 10201 + }, + { + "epoch": 0.5615058616324509, + "grad_norm": 0.717464029788971, + "learning_rate": 8.202005437648674e-06, + "loss": 0.6797, + "step": 10202 + }, + { + "epoch": 0.5615609004348066, + "grad_norm": 0.7053301334381104, + "learning_rate": 8.2016725056321e-06, + "loss": 0.7857, + "step": 10203 + }, + { + "epoch": 0.5616159392371622, + "grad_norm": 0.8392077684402466, + "learning_rate": 8.20133954955261e-06, + "loss": 0.8321, + "step": 10204 + }, + { + "epoch": 0.5616709780395178, + "grad_norm": 0.6630520820617676, + "learning_rate": 8.201006569412711e-06, + "loss": 0.7093, + "step": 10205 + }, + { + "epoch": 0.5617260168418735, + "grad_norm": 0.6835867762565613, + "learning_rate": 8.200673565214905e-06, + "loss": 0.6623, + "step": 10206 + }, + { + "epoch": 0.5617810556442292, + "grad_norm": 0.7635336518287659, + "learning_rate": 8.200340536961691e-06, + "loss": 0.8378, + "step": 10207 + }, + { + "epoch": 0.5618360944465849, + "grad_norm": 0.6500052213668823, + "learning_rate": 8.200007484655575e-06, + "loss": 0.6836, + "step": 10208 + }, + { + "epoch": 0.5618911332489405, + "grad_norm": 0.6549860835075378, + "learning_rate": 8.199674408299058e-06, + "loss": 0.6868, + "step": 10209 + }, + { + "epoch": 0.5619461720512962, + "grad_norm": 0.7995957732200623, + "learning_rate": 8.199341307894647e-06, + "loss": 0.7719, + "step": 10210 + }, + { + "epoch": 0.5620012108536518, + "grad_norm": 0.6869412064552307, + "learning_rate": 8.199008183444843e-06, + "loss": 0.7921, + "step": 10211 + }, + { + "epoch": 0.5620562496560075, + "grad_norm": 0.9125131964683533, + "learning_rate": 8.198675034952149e-06, + "loss": 0.9015, + "step": 10212 + }, + { + "epoch": 0.5621112884583631, + "grad_norm": 0.6851146221160889, + "learning_rate": 8.198341862419068e-06, + "loss": 0.7773, + "step": 10213 + }, + { + "epoch": 0.5621663272607188, + "grad_norm": 0.6808778047561646, + "learning_rate": 8.198008665848108e-06, + "loss": 0.7375, + "step": 10214 + }, + { + "epoch": 0.5622213660630745, + "grad_norm": 0.6419697999954224, + "learning_rate": 8.19767544524177e-06, + "loss": 0.7496, + "step": 10215 + }, + { + "epoch": 0.5622764048654302, + "grad_norm": 0.7325716614723206, + "learning_rate": 8.197342200602559e-06, + "loss": 0.7424, + "step": 10216 + }, + { + "epoch": 0.5623314436677858, + "grad_norm": 0.6165832281112671, + "learning_rate": 8.19700893193298e-06, + "loss": 0.6364, + "step": 10217 + }, + { + "epoch": 0.5623864824701414, + "grad_norm": 0.7632125020027161, + "learning_rate": 8.196675639235539e-06, + "loss": 0.7175, + "step": 10218 + }, + { + "epoch": 0.5624415212724971, + "grad_norm": 0.6789713501930237, + "learning_rate": 8.196342322512738e-06, + "loss": 0.7122, + "step": 10219 + }, + { + "epoch": 0.5624965600748528, + "grad_norm": 0.7341050505638123, + "learning_rate": 8.196008981767084e-06, + "loss": 0.7598, + "step": 10220 + }, + { + "epoch": 0.5625515988772084, + "grad_norm": 0.7318429350852966, + "learning_rate": 8.195675617001083e-06, + "loss": 0.7723, + "step": 10221 + }, + { + "epoch": 0.5626066376795641, + "grad_norm": 0.6940313577651978, + "learning_rate": 8.195342228217238e-06, + "loss": 0.7885, + "step": 10222 + }, + { + "epoch": 0.5626616764819198, + "grad_norm": 0.8792300820350647, + "learning_rate": 8.195008815418058e-06, + "loss": 0.7657, + "step": 10223 + }, + { + "epoch": 0.5627167152842754, + "grad_norm": 0.7234559655189514, + "learning_rate": 8.194675378606044e-06, + "loss": 0.7988, + "step": 10224 + }, + { + "epoch": 0.562771754086631, + "grad_norm": 0.6698254942893982, + "learning_rate": 8.194341917783708e-06, + "loss": 0.6378, + "step": 10225 + }, + { + "epoch": 0.5628267928889867, + "grad_norm": 0.6546483635902405, + "learning_rate": 8.194008432953552e-06, + "loss": 0.7113, + "step": 10226 + }, + { + "epoch": 0.5628818316913424, + "grad_norm": 0.6532583832740784, + "learning_rate": 8.193674924118085e-06, + "loss": 0.6782, + "step": 10227 + }, + { + "epoch": 0.562936870493698, + "grad_norm": 0.770578920841217, + "learning_rate": 8.19334139127981e-06, + "loss": 0.8519, + "step": 10228 + }, + { + "epoch": 0.5629919092960537, + "grad_norm": 0.7255409359931946, + "learning_rate": 8.193007834441235e-06, + "loss": 0.6555, + "step": 10229 + }, + { + "epoch": 0.5630469480984094, + "grad_norm": 0.6659883856773376, + "learning_rate": 8.19267425360487e-06, + "loss": 0.7836, + "step": 10230 + }, + { + "epoch": 0.5631019869007651, + "grad_norm": 0.6596028208732605, + "learning_rate": 8.192340648773221e-06, + "loss": 0.6199, + "step": 10231 + }, + { + "epoch": 0.5631570257031207, + "grad_norm": 0.8226001858711243, + "learning_rate": 8.192007019948793e-06, + "loss": 0.8101, + "step": 10232 + }, + { + "epoch": 0.5632120645054763, + "grad_norm": 0.7465038895606995, + "learning_rate": 8.191673367134094e-06, + "loss": 0.8437, + "step": 10233 + }, + { + "epoch": 0.563267103307832, + "grad_norm": 1.0008004903793335, + "learning_rate": 8.191339690331632e-06, + "loss": 0.8626, + "step": 10234 + }, + { + "epoch": 0.5633221421101877, + "grad_norm": 0.7538222670555115, + "learning_rate": 8.191005989543917e-06, + "loss": 0.7222, + "step": 10235 + }, + { + "epoch": 0.5633771809125433, + "grad_norm": 0.6252872943878174, + "learning_rate": 8.190672264773454e-06, + "loss": 0.8038, + "step": 10236 + }, + { + "epoch": 0.563432219714899, + "grad_norm": 0.7083514928817749, + "learning_rate": 8.190338516022752e-06, + "loss": 0.7863, + "step": 10237 + }, + { + "epoch": 0.5634872585172547, + "grad_norm": 0.6887454390525818, + "learning_rate": 8.19000474329432e-06, + "loss": 0.7034, + "step": 10238 + }, + { + "epoch": 0.5635422973196104, + "grad_norm": 0.7487072348594666, + "learning_rate": 8.189670946590666e-06, + "loss": 0.8618, + "step": 10239 + }, + { + "epoch": 0.5635973361219659, + "grad_norm": 0.6999371647834778, + "learning_rate": 8.189337125914298e-06, + "loss": 0.7613, + "step": 10240 + }, + { + "epoch": 0.5636523749243216, + "grad_norm": 0.8265380263328552, + "learning_rate": 8.18900328126773e-06, + "loss": 0.7576, + "step": 10241 + }, + { + "epoch": 0.5637074137266773, + "grad_norm": 0.6688962578773499, + "learning_rate": 8.188669412653463e-06, + "loss": 0.712, + "step": 10242 + }, + { + "epoch": 0.563762452529033, + "grad_norm": 0.6343923211097717, + "learning_rate": 8.188335520074011e-06, + "loss": 0.7239, + "step": 10243 + }, + { + "epoch": 0.5638174913313886, + "grad_norm": 0.7122388482093811, + "learning_rate": 8.188001603531883e-06, + "loss": 0.7892, + "step": 10244 + }, + { + "epoch": 0.5638725301337443, + "grad_norm": 0.6646286845207214, + "learning_rate": 8.187667663029587e-06, + "loss": 0.7805, + "step": 10245 + }, + { + "epoch": 0.5639275689361, + "grad_norm": 0.742938220500946, + "learning_rate": 8.187333698569638e-06, + "loss": 0.8444, + "step": 10246 + }, + { + "epoch": 0.5639826077384557, + "grad_norm": 0.7260885238647461, + "learning_rate": 8.18699971015454e-06, + "loss": 0.8621, + "step": 10247 + }, + { + "epoch": 0.5640376465408112, + "grad_norm": 0.7920067310333252, + "learning_rate": 8.186665697786804e-06, + "loss": 0.7391, + "step": 10248 + }, + { + "epoch": 0.5640926853431669, + "grad_norm": 0.7472825646400452, + "learning_rate": 8.186331661468943e-06, + "loss": 0.7249, + "step": 10249 + }, + { + "epoch": 0.5641477241455226, + "grad_norm": 0.692643940448761, + "learning_rate": 8.185997601203465e-06, + "loss": 0.7884, + "step": 10250 + }, + { + "epoch": 0.5642027629478783, + "grad_norm": 0.715455174446106, + "learning_rate": 8.185663516992884e-06, + "loss": 0.7369, + "step": 10251 + }, + { + "epoch": 0.5642578017502339, + "grad_norm": 0.7566105723381042, + "learning_rate": 8.185329408839705e-06, + "loss": 0.7378, + "step": 10252 + }, + { + "epoch": 0.5643128405525896, + "grad_norm": 0.8163520693778992, + "learning_rate": 8.184995276746445e-06, + "loss": 0.7326, + "step": 10253 + }, + { + "epoch": 0.5643678793549453, + "grad_norm": 0.6280468106269836, + "learning_rate": 8.184661120715615e-06, + "loss": 0.6858, + "step": 10254 + }, + { + "epoch": 0.564422918157301, + "grad_norm": 0.7246795892715454, + "learning_rate": 8.184326940749723e-06, + "loss": 0.8111, + "step": 10255 + }, + { + "epoch": 0.5644779569596565, + "grad_norm": 0.7429527640342712, + "learning_rate": 8.18399273685128e-06, + "loss": 0.7642, + "step": 10256 + }, + { + "epoch": 0.5645329957620122, + "grad_norm": 0.7308861017227173, + "learning_rate": 8.183658509022802e-06, + "loss": 0.7844, + "step": 10257 + }, + { + "epoch": 0.5645880345643679, + "grad_norm": 0.7549033164978027, + "learning_rate": 8.1833242572668e-06, + "loss": 0.8585, + "step": 10258 + }, + { + "epoch": 0.5646430733667236, + "grad_norm": 0.6779888868331909, + "learning_rate": 8.182989981585782e-06, + "loss": 0.6808, + "step": 10259 + }, + { + "epoch": 0.5646981121690792, + "grad_norm": 0.887113630771637, + "learning_rate": 8.182655681982266e-06, + "loss": 0.8229, + "step": 10260 + }, + { + "epoch": 0.5647531509714349, + "grad_norm": 0.6405711770057678, + "learning_rate": 8.18232135845876e-06, + "loss": 0.6901, + "step": 10261 + }, + { + "epoch": 0.5648081897737905, + "grad_norm": 0.7302486300468445, + "learning_rate": 8.18198701101778e-06, + "loss": 0.6853, + "step": 10262 + }, + { + "epoch": 0.5648632285761462, + "grad_norm": 0.6374662518501282, + "learning_rate": 8.181652639661837e-06, + "loss": 0.7177, + "step": 10263 + }, + { + "epoch": 0.5649182673785018, + "grad_norm": 0.9267570972442627, + "learning_rate": 8.181318244393444e-06, + "loss": 0.7926, + "step": 10264 + }, + { + "epoch": 0.5649733061808575, + "grad_norm": 0.8196623921394348, + "learning_rate": 8.180983825215114e-06, + "loss": 0.7127, + "step": 10265 + }, + { + "epoch": 0.5650283449832132, + "grad_norm": 0.7004575133323669, + "learning_rate": 8.180649382129361e-06, + "loss": 0.7858, + "step": 10266 + }, + { + "epoch": 0.5650833837855688, + "grad_norm": 0.7667824625968933, + "learning_rate": 8.180314915138701e-06, + "loss": 0.7742, + "step": 10267 + }, + { + "epoch": 0.5651384225879245, + "grad_norm": 0.7372623682022095, + "learning_rate": 8.179980424245644e-06, + "loss": 0.7949, + "step": 10268 + }, + { + "epoch": 0.5651934613902801, + "grad_norm": 0.6417940258979797, + "learning_rate": 8.179645909452704e-06, + "loss": 0.6683, + "step": 10269 + }, + { + "epoch": 0.5652485001926358, + "grad_norm": 0.6736140251159668, + "learning_rate": 8.179311370762398e-06, + "loss": 0.6564, + "step": 10270 + }, + { + "epoch": 0.5653035389949914, + "grad_norm": 0.6727200746536255, + "learning_rate": 8.178976808177239e-06, + "loss": 0.8065, + "step": 10271 + }, + { + "epoch": 0.5653585777973471, + "grad_norm": 0.7565415501594543, + "learning_rate": 8.17864222169974e-06, + "loss": 0.9055, + "step": 10272 + }, + { + "epoch": 0.5654136165997028, + "grad_norm": 0.8938627243041992, + "learning_rate": 8.178307611332418e-06, + "loss": 0.8009, + "step": 10273 + }, + { + "epoch": 0.5654686554020585, + "grad_norm": 0.7439131140708923, + "learning_rate": 8.177972977077786e-06, + "loss": 0.7807, + "step": 10274 + }, + { + "epoch": 0.5655236942044141, + "grad_norm": 0.7603998184204102, + "learning_rate": 8.17763831893836e-06, + "loss": 0.818, + "step": 10275 + }, + { + "epoch": 0.5655787330067698, + "grad_norm": 0.7088946104049683, + "learning_rate": 8.177303636916655e-06, + "loss": 0.7741, + "step": 10276 + }, + { + "epoch": 0.5656337718091254, + "grad_norm": 0.6801518201828003, + "learning_rate": 8.176968931015187e-06, + "loss": 0.7633, + "step": 10277 + }, + { + "epoch": 0.5656888106114811, + "grad_norm": 0.6739299297332764, + "learning_rate": 8.17663420123647e-06, + "loss": 0.7772, + "step": 10278 + }, + { + "epoch": 0.5657438494138367, + "grad_norm": 0.7432494759559631, + "learning_rate": 8.176299447583021e-06, + "loss": 0.7368, + "step": 10279 + }, + { + "epoch": 0.5657988882161924, + "grad_norm": 0.7847158908843994, + "learning_rate": 8.175964670057357e-06, + "loss": 0.7824, + "step": 10280 + }, + { + "epoch": 0.5658539270185481, + "grad_norm": 0.8732449412345886, + "learning_rate": 8.17562986866199e-06, + "loss": 0.8035, + "step": 10281 + }, + { + "epoch": 0.5659089658209038, + "grad_norm": 0.7988447546958923, + "learning_rate": 8.17529504339944e-06, + "loss": 0.828, + "step": 10282 + }, + { + "epoch": 0.5659640046232594, + "grad_norm": 0.7063263058662415, + "learning_rate": 8.174960194272224e-06, + "loss": 0.7723, + "step": 10283 + }, + { + "epoch": 0.566019043425615, + "grad_norm": 0.7635022401809692, + "learning_rate": 8.174625321282856e-06, + "loss": 0.7156, + "step": 10284 + }, + { + "epoch": 0.5660740822279707, + "grad_norm": 0.6505927443504333, + "learning_rate": 8.174290424433853e-06, + "loss": 0.7409, + "step": 10285 + }, + { + "epoch": 0.5661291210303264, + "grad_norm": 0.6919816136360168, + "learning_rate": 8.173955503727734e-06, + "loss": 0.7829, + "step": 10286 + }, + { + "epoch": 0.566184159832682, + "grad_norm": 0.7024216651916504, + "learning_rate": 8.173620559167015e-06, + "loss": 0.7378, + "step": 10287 + }, + { + "epoch": 0.5662391986350377, + "grad_norm": 0.7134365439414978, + "learning_rate": 8.173285590754212e-06, + "loss": 0.7737, + "step": 10288 + }, + { + "epoch": 0.5662942374373934, + "grad_norm": 0.6867973804473877, + "learning_rate": 8.172950598491845e-06, + "loss": 0.7169, + "step": 10289 + }, + { + "epoch": 0.5663492762397491, + "grad_norm": 0.6900742650032043, + "learning_rate": 8.172615582382432e-06, + "loss": 0.7888, + "step": 10290 + }, + { + "epoch": 0.5664043150421046, + "grad_norm": 0.7026718854904175, + "learning_rate": 8.172280542428488e-06, + "loss": 0.8179, + "step": 10291 + }, + { + "epoch": 0.5664593538444603, + "grad_norm": 0.6940855979919434, + "learning_rate": 8.171945478632533e-06, + "loss": 0.7686, + "step": 10292 + }, + { + "epoch": 0.566514392646816, + "grad_norm": 0.6717686653137207, + "learning_rate": 8.171610390997085e-06, + "loss": 0.7865, + "step": 10293 + }, + { + "epoch": 0.5665694314491717, + "grad_norm": 0.6947711110115051, + "learning_rate": 8.171275279524661e-06, + "loss": 0.7811, + "step": 10294 + }, + { + "epoch": 0.5666244702515273, + "grad_norm": 0.6907814741134644, + "learning_rate": 8.170940144217782e-06, + "loss": 0.7095, + "step": 10295 + }, + { + "epoch": 0.566679509053883, + "grad_norm": 0.723952054977417, + "learning_rate": 8.170604985078965e-06, + "loss": 0.7814, + "step": 10296 + }, + { + "epoch": 0.5667345478562387, + "grad_norm": 0.7775490880012512, + "learning_rate": 8.17026980211073e-06, + "loss": 0.797, + "step": 10297 + }, + { + "epoch": 0.5667895866585944, + "grad_norm": 0.7557885646820068, + "learning_rate": 8.169934595315597e-06, + "loss": 0.8423, + "step": 10298 + }, + { + "epoch": 0.5668446254609499, + "grad_norm": 0.7838338017463684, + "learning_rate": 8.169599364696083e-06, + "loss": 0.7114, + "step": 10299 + }, + { + "epoch": 0.5668996642633056, + "grad_norm": 0.6632605791091919, + "learning_rate": 8.169264110254707e-06, + "loss": 0.6723, + "step": 10300 + }, + { + "epoch": 0.5669547030656613, + "grad_norm": 0.735756516456604, + "learning_rate": 8.168928831993991e-06, + "loss": 0.7533, + "step": 10301 + }, + { + "epoch": 0.567009741868017, + "grad_norm": 0.6981016993522644, + "learning_rate": 8.168593529916457e-06, + "loss": 0.7882, + "step": 10302 + }, + { + "epoch": 0.5670647806703726, + "grad_norm": 0.6413942575454712, + "learning_rate": 8.168258204024619e-06, + "loss": 0.6593, + "step": 10303 + }, + { + "epoch": 0.5671198194727283, + "grad_norm": 0.7040891051292419, + "learning_rate": 8.167922854321002e-06, + "loss": 0.7295, + "step": 10304 + }, + { + "epoch": 0.567174858275084, + "grad_norm": 0.7132521867752075, + "learning_rate": 8.167587480808126e-06, + "loss": 0.7128, + "step": 10305 + }, + { + "epoch": 0.5672298970774396, + "grad_norm": 0.756529688835144, + "learning_rate": 8.167252083488508e-06, + "loss": 0.7044, + "step": 10306 + }, + { + "epoch": 0.5672849358797952, + "grad_norm": 0.8456888198852539, + "learning_rate": 8.166916662364672e-06, + "loss": 0.8304, + "step": 10307 + }, + { + "epoch": 0.5673399746821509, + "grad_norm": 0.7758522629737854, + "learning_rate": 8.166581217439138e-06, + "loss": 0.7192, + "step": 10308 + }, + { + "epoch": 0.5673950134845066, + "grad_norm": 0.8110343217849731, + "learning_rate": 8.166245748714428e-06, + "loss": 0.8794, + "step": 10309 + }, + { + "epoch": 0.5674500522868622, + "grad_norm": 0.6803586483001709, + "learning_rate": 8.165910256193062e-06, + "loss": 0.7402, + "step": 10310 + }, + { + "epoch": 0.5675050910892179, + "grad_norm": 0.7294176816940308, + "learning_rate": 8.165574739877563e-06, + "loss": 0.7325, + "step": 10311 + }, + { + "epoch": 0.5675601298915736, + "grad_norm": 0.835488498210907, + "learning_rate": 8.165239199770448e-06, + "loss": 0.8317, + "step": 10312 + }, + { + "epoch": 0.5676151686939293, + "grad_norm": 0.6497608423233032, + "learning_rate": 8.164903635874246e-06, + "loss": 0.6902, + "step": 10313 + }, + { + "epoch": 0.5676702074962848, + "grad_norm": 0.6782082915306091, + "learning_rate": 8.164568048191474e-06, + "loss": 0.7941, + "step": 10314 + }, + { + "epoch": 0.5677252462986405, + "grad_norm": 0.6974388957023621, + "learning_rate": 8.164232436724656e-06, + "loss": 0.7899, + "step": 10315 + }, + { + "epoch": 0.5677802851009962, + "grad_norm": 0.7222558259963989, + "learning_rate": 8.163896801476314e-06, + "loss": 0.8034, + "step": 10316 + }, + { + "epoch": 0.5678353239033519, + "grad_norm": 0.6562586426734924, + "learning_rate": 8.16356114244897e-06, + "loss": 0.7864, + "step": 10317 + }, + { + "epoch": 0.5678903627057075, + "grad_norm": 0.6888270378112793, + "learning_rate": 8.16322545964515e-06, + "loss": 0.8455, + "step": 10318 + }, + { + "epoch": 0.5679454015080632, + "grad_norm": 0.642084002494812, + "learning_rate": 8.162889753067372e-06, + "loss": 0.7478, + "step": 10319 + }, + { + "epoch": 0.5680004403104189, + "grad_norm": 0.7077270746231079, + "learning_rate": 8.16255402271816e-06, + "loss": 0.7281, + "step": 10320 + }, + { + "epoch": 0.5680554791127745, + "grad_norm": 0.7202198505401611, + "learning_rate": 8.16221826860004e-06, + "loss": 0.7893, + "step": 10321 + }, + { + "epoch": 0.5681105179151301, + "grad_norm": 0.8950369954109192, + "learning_rate": 8.161882490715534e-06, + "loss": 0.772, + "step": 10322 + }, + { + "epoch": 0.5681655567174858, + "grad_norm": 0.6986666917800903, + "learning_rate": 8.161546689067166e-06, + "loss": 0.7712, + "step": 10323 + }, + { + "epoch": 0.5682205955198415, + "grad_norm": 0.7095959782600403, + "learning_rate": 8.161210863657458e-06, + "loss": 0.8373, + "step": 10324 + }, + { + "epoch": 0.5682756343221972, + "grad_norm": 0.7510485649108887, + "learning_rate": 8.160875014488936e-06, + "loss": 0.9106, + "step": 10325 + }, + { + "epoch": 0.5683306731245528, + "grad_norm": 0.7558283805847168, + "learning_rate": 8.160539141564123e-06, + "loss": 0.8192, + "step": 10326 + }, + { + "epoch": 0.5683857119269085, + "grad_norm": 0.7523400187492371, + "learning_rate": 8.160203244885545e-06, + "loss": 0.8276, + "step": 10327 + }, + { + "epoch": 0.5684407507292641, + "grad_norm": 0.6911195516586304, + "learning_rate": 8.159867324455724e-06, + "loss": 0.6286, + "step": 10328 + }, + { + "epoch": 0.5684957895316198, + "grad_norm": 0.6456325054168701, + "learning_rate": 8.159531380277188e-06, + "loss": 0.7419, + "step": 10329 + }, + { + "epoch": 0.5685508283339754, + "grad_norm": 0.9318492412567139, + "learning_rate": 8.159195412352458e-06, + "loss": 0.8131, + "step": 10330 + }, + { + "epoch": 0.5686058671363311, + "grad_norm": 0.7012938857078552, + "learning_rate": 8.158859420684062e-06, + "loss": 0.7074, + "step": 10331 + }, + { + "epoch": 0.5686609059386868, + "grad_norm": 0.7152053117752075, + "learning_rate": 8.158523405274523e-06, + "loss": 0.7186, + "step": 10332 + }, + { + "epoch": 0.5687159447410425, + "grad_norm": 0.7074982523918152, + "learning_rate": 8.158187366126368e-06, + "loss": 0.8021, + "step": 10333 + }, + { + "epoch": 0.5687709835433981, + "grad_norm": 0.689536452293396, + "learning_rate": 8.157851303242123e-06, + "loss": 0.7493, + "step": 10334 + }, + { + "epoch": 0.5688260223457537, + "grad_norm": 0.7411753535270691, + "learning_rate": 8.157515216624313e-06, + "loss": 0.8012, + "step": 10335 + }, + { + "epoch": 0.5688810611481094, + "grad_norm": 0.6831420063972473, + "learning_rate": 8.157179106275463e-06, + "loss": 0.7114, + "step": 10336 + }, + { + "epoch": 0.5689360999504651, + "grad_norm": 0.6786901950836182, + "learning_rate": 8.1568429721981e-06, + "loss": 0.7638, + "step": 10337 + }, + { + "epoch": 0.5689911387528207, + "grad_norm": 0.7546970844268799, + "learning_rate": 8.15650681439475e-06, + "loss": 0.7711, + "step": 10338 + }, + { + "epoch": 0.5690461775551764, + "grad_norm": 0.8071785569190979, + "learning_rate": 8.156170632867942e-06, + "loss": 0.8105, + "step": 10339 + }, + { + "epoch": 0.5691012163575321, + "grad_norm": 0.7872087359428406, + "learning_rate": 8.155834427620198e-06, + "loss": 0.7657, + "step": 10340 + }, + { + "epoch": 0.5691562551598878, + "grad_norm": 0.724328875541687, + "learning_rate": 8.155498198654047e-06, + "loss": 0.7978, + "step": 10341 + }, + { + "epoch": 0.5692112939622433, + "grad_norm": 0.8559905886650085, + "learning_rate": 8.155161945972016e-06, + "loss": 0.7766, + "step": 10342 + }, + { + "epoch": 0.569266332764599, + "grad_norm": 0.607418417930603, + "learning_rate": 8.154825669576635e-06, + "loss": 0.642, + "step": 10343 + }, + { + "epoch": 0.5693213715669547, + "grad_norm": 0.7403624653816223, + "learning_rate": 8.154489369470426e-06, + "loss": 0.7301, + "step": 10344 + }, + { + "epoch": 0.5693764103693104, + "grad_norm": 0.7388540506362915, + "learning_rate": 8.154153045655922e-06, + "loss": 0.7895, + "step": 10345 + }, + { + "epoch": 0.569431449171666, + "grad_norm": 0.8327579498291016, + "learning_rate": 8.153816698135646e-06, + "loss": 0.7589, + "step": 10346 + }, + { + "epoch": 0.5694864879740217, + "grad_norm": 0.7738710641860962, + "learning_rate": 8.153480326912128e-06, + "loss": 0.7828, + "step": 10347 + }, + { + "epoch": 0.5695415267763774, + "grad_norm": 0.8280724287033081, + "learning_rate": 8.153143931987896e-06, + "loss": 0.8194, + "step": 10348 + }, + { + "epoch": 0.5695965655787331, + "grad_norm": 0.8290724754333496, + "learning_rate": 8.152807513365478e-06, + "loss": 0.5941, + "step": 10349 + }, + { + "epoch": 0.5696516043810886, + "grad_norm": 0.7514322400093079, + "learning_rate": 8.152471071047403e-06, + "loss": 0.676, + "step": 10350 + }, + { + "epoch": 0.5697066431834443, + "grad_norm": 0.6990258693695068, + "learning_rate": 8.1521346050362e-06, + "loss": 0.804, + "step": 10351 + }, + { + "epoch": 0.5697616819858, + "grad_norm": 0.6781288981437683, + "learning_rate": 8.151798115334396e-06, + "loss": 0.7372, + "step": 10352 + }, + { + "epoch": 0.5698167207881556, + "grad_norm": 0.764301061630249, + "learning_rate": 8.151461601944523e-06, + "loss": 0.8242, + "step": 10353 + }, + { + "epoch": 0.5698717595905113, + "grad_norm": 0.7577376961708069, + "learning_rate": 8.151125064869106e-06, + "loss": 0.7354, + "step": 10354 + }, + { + "epoch": 0.569926798392867, + "grad_norm": 0.767764687538147, + "learning_rate": 8.150788504110678e-06, + "loss": 0.7262, + "step": 10355 + }, + { + "epoch": 0.5699818371952227, + "grad_norm": 0.6634765267372131, + "learning_rate": 8.150451919671767e-06, + "loss": 0.7527, + "step": 10356 + }, + { + "epoch": 0.5700368759975782, + "grad_norm": 0.8803308010101318, + "learning_rate": 8.150115311554901e-06, + "loss": 0.8172, + "step": 10357 + }, + { + "epoch": 0.5700919147999339, + "grad_norm": 0.695791482925415, + "learning_rate": 8.149778679762611e-06, + "loss": 0.7538, + "step": 10358 + }, + { + "epoch": 0.5701469536022896, + "grad_norm": 0.7047555446624756, + "learning_rate": 8.149442024297432e-06, + "loss": 0.7533, + "step": 10359 + }, + { + "epoch": 0.5702019924046453, + "grad_norm": 0.7148274183273315, + "learning_rate": 8.149105345161886e-06, + "loss": 0.6736, + "step": 10360 + }, + { + "epoch": 0.5702570312070009, + "grad_norm": 0.673204243183136, + "learning_rate": 8.148768642358508e-06, + "loss": 0.7713, + "step": 10361 + }, + { + "epoch": 0.5703120700093566, + "grad_norm": 0.6258989572525024, + "learning_rate": 8.148431915889827e-06, + "loss": 0.6578, + "step": 10362 + }, + { + "epoch": 0.5703671088117123, + "grad_norm": 0.8411956429481506, + "learning_rate": 8.148095165758377e-06, + "loss": 0.8387, + "step": 10363 + }, + { + "epoch": 0.570422147614068, + "grad_norm": 0.7802130579948425, + "learning_rate": 8.147758391966685e-06, + "loss": 0.8564, + "step": 10364 + }, + { + "epoch": 0.5704771864164235, + "grad_norm": 0.6665176153182983, + "learning_rate": 8.147421594517282e-06, + "loss": 0.688, + "step": 10365 + }, + { + "epoch": 0.5705322252187792, + "grad_norm": 0.7166683673858643, + "learning_rate": 8.147084773412702e-06, + "loss": 0.6704, + "step": 10366 + }, + { + "epoch": 0.5705872640211349, + "grad_norm": 0.6948957443237305, + "learning_rate": 8.146747928655476e-06, + "loss": 0.7116, + "step": 10367 + }, + { + "epoch": 0.5706423028234906, + "grad_norm": 0.588965892791748, + "learning_rate": 8.146411060248134e-06, + "loss": 0.5644, + "step": 10368 + }, + { + "epoch": 0.5706973416258462, + "grad_norm": 0.8020890355110168, + "learning_rate": 8.14607416819321e-06, + "loss": 0.6978, + "step": 10369 + }, + { + "epoch": 0.5707523804282019, + "grad_norm": 0.9900732040405273, + "learning_rate": 8.145737252493234e-06, + "loss": 0.7295, + "step": 10370 + }, + { + "epoch": 0.5708074192305576, + "grad_norm": 0.7236563563346863, + "learning_rate": 8.145400313150737e-06, + "loss": 0.7555, + "step": 10371 + }, + { + "epoch": 0.5708624580329132, + "grad_norm": 0.6784152984619141, + "learning_rate": 8.145063350168257e-06, + "loss": 0.7283, + "step": 10372 + }, + { + "epoch": 0.5709174968352688, + "grad_norm": 0.6255244612693787, + "learning_rate": 8.14472636354832e-06, + "loss": 0.6722, + "step": 10373 + }, + { + "epoch": 0.5709725356376245, + "grad_norm": 0.8250948786735535, + "learning_rate": 8.14438935329346e-06, + "loss": 0.8406, + "step": 10374 + }, + { + "epoch": 0.5710275744399802, + "grad_norm": 0.7308233380317688, + "learning_rate": 8.144052319406215e-06, + "loss": 0.8084, + "step": 10375 + }, + { + "epoch": 0.5710826132423359, + "grad_norm": 0.7850058674812317, + "learning_rate": 8.143715261889112e-06, + "loss": 0.7892, + "step": 10376 + }, + { + "epoch": 0.5711376520446915, + "grad_norm": 0.81241774559021, + "learning_rate": 8.143378180744687e-06, + "loss": 0.7819, + "step": 10377 + }, + { + "epoch": 0.5711926908470472, + "grad_norm": 0.7174570560455322, + "learning_rate": 8.143041075975473e-06, + "loss": 0.7104, + "step": 10378 + }, + { + "epoch": 0.5712477296494028, + "grad_norm": 0.6954129934310913, + "learning_rate": 8.142703947584004e-06, + "loss": 0.7821, + "step": 10379 + }, + { + "epoch": 0.5713027684517585, + "grad_norm": 0.6895242929458618, + "learning_rate": 8.142366795572813e-06, + "loss": 0.7687, + "step": 10380 + }, + { + "epoch": 0.5713578072541141, + "grad_norm": 0.6543757319450378, + "learning_rate": 8.142029619944434e-06, + "loss": 0.7042, + "step": 10381 + }, + { + "epoch": 0.5714128460564698, + "grad_norm": 0.6712427139282227, + "learning_rate": 8.141692420701404e-06, + "loss": 0.6861, + "step": 10382 + }, + { + "epoch": 0.5714678848588255, + "grad_norm": 1.6716055870056152, + "learning_rate": 8.141355197846253e-06, + "loss": 0.8209, + "step": 10383 + }, + { + "epoch": 0.5715229236611812, + "grad_norm": 0.7509854435920715, + "learning_rate": 8.141017951381516e-06, + "loss": 0.8246, + "step": 10384 + }, + { + "epoch": 0.5715779624635368, + "grad_norm": 0.7161786556243896, + "learning_rate": 8.14068068130973e-06, + "loss": 0.835, + "step": 10385 + }, + { + "epoch": 0.5716330012658924, + "grad_norm": 0.7423714995384216, + "learning_rate": 8.140343387633427e-06, + "loss": 0.8004, + "step": 10386 + }, + { + "epoch": 0.5716880400682481, + "grad_norm": 0.6955768465995789, + "learning_rate": 8.140006070355146e-06, + "loss": 0.7299, + "step": 10387 + }, + { + "epoch": 0.5717430788706038, + "grad_norm": 0.6742254495620728, + "learning_rate": 8.13966872947742e-06, + "loss": 0.6549, + "step": 10388 + }, + { + "epoch": 0.5717981176729594, + "grad_norm": 0.7332299947738647, + "learning_rate": 8.139331365002782e-06, + "loss": 0.7945, + "step": 10389 + }, + { + "epoch": 0.5718531564753151, + "grad_norm": 0.6552133560180664, + "learning_rate": 8.138993976933771e-06, + "loss": 0.7193, + "step": 10390 + }, + { + "epoch": 0.5719081952776708, + "grad_norm": 0.6708530187606812, + "learning_rate": 8.138656565272923e-06, + "loss": 0.8053, + "step": 10391 + }, + { + "epoch": 0.5719632340800265, + "grad_norm": 0.7837093472480774, + "learning_rate": 8.138319130022771e-06, + "loss": 0.7752, + "step": 10392 + }, + { + "epoch": 0.572018272882382, + "grad_norm": 0.6910337805747986, + "learning_rate": 8.137981671185853e-06, + "loss": 0.7573, + "step": 10393 + }, + { + "epoch": 0.5720733116847377, + "grad_norm": 0.6758334636688232, + "learning_rate": 8.137644188764704e-06, + "loss": 0.8251, + "step": 10394 + }, + { + "epoch": 0.5721283504870934, + "grad_norm": 0.7513287663459778, + "learning_rate": 8.137306682761862e-06, + "loss": 0.6491, + "step": 10395 + }, + { + "epoch": 0.572183389289449, + "grad_norm": 0.678210973739624, + "learning_rate": 8.136969153179863e-06, + "loss": 0.7761, + "step": 10396 + }, + { + "epoch": 0.5722384280918047, + "grad_norm": 0.8256083726882935, + "learning_rate": 8.13663160002124e-06, + "loss": 0.7813, + "step": 10397 + }, + { + "epoch": 0.5722934668941604, + "grad_norm": 0.8383314609527588, + "learning_rate": 8.136294023288538e-06, + "loss": 0.7669, + "step": 10398 + }, + { + "epoch": 0.5723485056965161, + "grad_norm": 0.7150036692619324, + "learning_rate": 8.135956422984287e-06, + "loss": 0.8322, + "step": 10399 + }, + { + "epoch": 0.5724035444988717, + "grad_norm": 1.3011385202407837, + "learning_rate": 8.13561879911103e-06, + "loss": 0.8044, + "step": 10400 + }, + { + "epoch": 0.5724585833012273, + "grad_norm": 0.6749194860458374, + "learning_rate": 8.135281151671298e-06, + "loss": 0.6426, + "step": 10401 + }, + { + "epoch": 0.572513622103583, + "grad_norm": 0.7370286583900452, + "learning_rate": 8.134943480667635e-06, + "loss": 0.8051, + "step": 10402 + }, + { + "epoch": 0.5725686609059387, + "grad_norm": 0.6827631592750549, + "learning_rate": 8.134605786102574e-06, + "loss": 0.6961, + "step": 10403 + }, + { + "epoch": 0.5726236997082943, + "grad_norm": 0.7593247294425964, + "learning_rate": 8.134268067978655e-06, + "loss": 0.7514, + "step": 10404 + }, + { + "epoch": 0.57267873851065, + "grad_norm": 0.7229800224304199, + "learning_rate": 8.133930326298417e-06, + "loss": 0.8105, + "step": 10405 + }, + { + "epoch": 0.5727337773130057, + "grad_norm": 0.720973551273346, + "learning_rate": 8.133592561064396e-06, + "loss": 0.6866, + "step": 10406 + }, + { + "epoch": 0.5727888161153614, + "grad_norm": 0.7530742883682251, + "learning_rate": 8.133254772279135e-06, + "loss": 0.773, + "step": 10407 + }, + { + "epoch": 0.5728438549177169, + "grad_norm": 0.6897457838058472, + "learning_rate": 8.132916959945167e-06, + "loss": 0.8107, + "step": 10408 + }, + { + "epoch": 0.5728988937200726, + "grad_norm": 0.6659066081047058, + "learning_rate": 8.132579124065034e-06, + "loss": 0.8036, + "step": 10409 + }, + { + "epoch": 0.5729539325224283, + "grad_norm": 0.6925005316734314, + "learning_rate": 8.132241264641276e-06, + "loss": 0.7869, + "step": 10410 + }, + { + "epoch": 0.573008971324784, + "grad_norm": 0.8681634068489075, + "learning_rate": 8.131903381676433e-06, + "loss": 0.7411, + "step": 10411 + }, + { + "epoch": 0.5730640101271396, + "grad_norm": 0.669561505317688, + "learning_rate": 8.13156547517304e-06, + "loss": 0.7398, + "step": 10412 + }, + { + "epoch": 0.5731190489294953, + "grad_norm": 0.6737409234046936, + "learning_rate": 8.131227545133639e-06, + "loss": 0.7319, + "step": 10413 + }, + { + "epoch": 0.573174087731851, + "grad_norm": 0.7111513614654541, + "learning_rate": 8.130889591560772e-06, + "loss": 0.7192, + "step": 10414 + }, + { + "epoch": 0.5732291265342067, + "grad_norm": 0.6618744134902954, + "learning_rate": 8.130551614456974e-06, + "loss": 0.6636, + "step": 10415 + }, + { + "epoch": 0.5732841653365622, + "grad_norm": 0.8150144815444946, + "learning_rate": 8.13021361382479e-06, + "loss": 0.7168, + "step": 10416 + }, + { + "epoch": 0.5733392041389179, + "grad_norm": 0.744898796081543, + "learning_rate": 8.129875589666758e-06, + "loss": 0.8562, + "step": 10417 + }, + { + "epoch": 0.5733942429412736, + "grad_norm": 0.7831705212593079, + "learning_rate": 8.129537541985419e-06, + "loss": 0.8491, + "step": 10418 + }, + { + "epoch": 0.5734492817436293, + "grad_norm": 0.8097667098045349, + "learning_rate": 8.129199470783313e-06, + "loss": 0.7623, + "step": 10419 + }, + { + "epoch": 0.5735043205459849, + "grad_norm": 0.7951840758323669, + "learning_rate": 8.128861376062982e-06, + "loss": 0.8195, + "step": 10420 + }, + { + "epoch": 0.5735593593483406, + "grad_norm": 0.5902833938598633, + "learning_rate": 8.128523257826966e-06, + "loss": 0.6244, + "step": 10421 + }, + { + "epoch": 0.5736143981506963, + "grad_norm": 1.113287329673767, + "learning_rate": 8.128185116077805e-06, + "loss": 0.8382, + "step": 10422 + }, + { + "epoch": 0.573669436953052, + "grad_norm": 0.6899390816688538, + "learning_rate": 8.127846950818046e-06, + "loss": 0.7632, + "step": 10423 + }, + { + "epoch": 0.5737244757554075, + "grad_norm": 0.6905965805053711, + "learning_rate": 8.127508762050225e-06, + "loss": 0.7429, + "step": 10424 + }, + { + "epoch": 0.5737795145577632, + "grad_norm": 0.7036122679710388, + "learning_rate": 8.127170549776882e-06, + "loss": 0.7699, + "step": 10425 + }, + { + "epoch": 0.5738345533601189, + "grad_norm": 0.6599798202514648, + "learning_rate": 8.126832314000566e-06, + "loss": 0.7169, + "step": 10426 + }, + { + "epoch": 0.5738895921624746, + "grad_norm": 0.8682155609130859, + "learning_rate": 8.126494054723815e-06, + "loss": 0.851, + "step": 10427 + }, + { + "epoch": 0.5739446309648302, + "grad_norm": 0.6661516427993774, + "learning_rate": 8.12615577194917e-06, + "loss": 0.7287, + "step": 10428 + }, + { + "epoch": 0.5739996697671859, + "grad_norm": 0.6805256009101868, + "learning_rate": 8.125817465679176e-06, + "loss": 0.7033, + "step": 10429 + }, + { + "epoch": 0.5740547085695415, + "grad_norm": 0.7088646292686462, + "learning_rate": 8.125479135916375e-06, + "loss": 0.7295, + "step": 10430 + }, + { + "epoch": 0.5741097473718972, + "grad_norm": 0.6854971647262573, + "learning_rate": 8.12514078266331e-06, + "loss": 0.8102, + "step": 10431 + }, + { + "epoch": 0.5741647861742528, + "grad_norm": 0.7481474876403809, + "learning_rate": 8.124802405922521e-06, + "loss": 0.7463, + "step": 10432 + }, + { + "epoch": 0.5742198249766085, + "grad_norm": 0.8280898928642273, + "learning_rate": 8.124464005696556e-06, + "loss": 0.8067, + "step": 10433 + }, + { + "epoch": 0.5742748637789642, + "grad_norm": 0.696812629699707, + "learning_rate": 8.124125581987953e-06, + "loss": 0.7041, + "step": 10434 + }, + { + "epoch": 0.5743299025813199, + "grad_norm": 0.791084349155426, + "learning_rate": 8.123787134799262e-06, + "loss": 0.8244, + "step": 10435 + }, + { + "epoch": 0.5743849413836755, + "grad_norm": 0.7422665953636169, + "learning_rate": 8.123448664133022e-06, + "loss": 0.7792, + "step": 10436 + }, + { + "epoch": 0.5744399801860312, + "grad_norm": 0.7302834987640381, + "learning_rate": 8.123110169991777e-06, + "loss": 0.7617, + "step": 10437 + }, + { + "epoch": 0.5744950189883868, + "grad_norm": 0.6640440821647644, + "learning_rate": 8.122771652378071e-06, + "loss": 0.7965, + "step": 10438 + }, + { + "epoch": 0.5745500577907424, + "grad_norm": 0.7704516649246216, + "learning_rate": 8.12243311129445e-06, + "loss": 0.7814, + "step": 10439 + }, + { + "epoch": 0.5746050965930981, + "grad_norm": 0.673254668712616, + "learning_rate": 8.122094546743459e-06, + "loss": 0.7364, + "step": 10440 + }, + { + "epoch": 0.5746601353954538, + "grad_norm": 0.7648451924324036, + "learning_rate": 8.121755958727639e-06, + "loss": 0.8585, + "step": 10441 + }, + { + "epoch": 0.5747151741978095, + "grad_norm": 0.6660173535346985, + "learning_rate": 8.121417347249539e-06, + "loss": 0.6989, + "step": 10442 + }, + { + "epoch": 0.5747702130001651, + "grad_norm": 0.7128653526306152, + "learning_rate": 8.1210787123117e-06, + "loss": 0.8317, + "step": 10443 + }, + { + "epoch": 0.5748252518025208, + "grad_norm": 0.6404966115951538, + "learning_rate": 8.12074005391667e-06, + "loss": 0.6957, + "step": 10444 + }, + { + "epoch": 0.5748802906048764, + "grad_norm": 0.9597657918930054, + "learning_rate": 8.120401372066993e-06, + "loss": 0.9266, + "step": 10445 + }, + { + "epoch": 0.5749353294072321, + "grad_norm": 0.7735045552253723, + "learning_rate": 8.120062666765213e-06, + "loss": 0.8159, + "step": 10446 + }, + { + "epoch": 0.5749903682095877, + "grad_norm": 0.8031814098358154, + "learning_rate": 8.11972393801388e-06, + "loss": 0.7741, + "step": 10447 + }, + { + "epoch": 0.5750454070119434, + "grad_norm": 0.7008558511734009, + "learning_rate": 8.119385185815535e-06, + "loss": 0.6558, + "step": 10448 + }, + { + "epoch": 0.5751004458142991, + "grad_norm": 0.8162875175476074, + "learning_rate": 8.119046410172725e-06, + "loss": 0.7196, + "step": 10449 + }, + { + "epoch": 0.5751554846166548, + "grad_norm": 0.8142701983451843, + "learning_rate": 8.118707611088e-06, + "loss": 0.7709, + "step": 10450 + }, + { + "epoch": 0.5752105234190104, + "grad_norm": 0.7671986818313599, + "learning_rate": 8.118368788563902e-06, + "loss": 0.8725, + "step": 10451 + }, + { + "epoch": 0.575265562221366, + "grad_norm": 0.6604374051094055, + "learning_rate": 8.118029942602979e-06, + "loss": 0.7119, + "step": 10452 + }, + { + "epoch": 0.5753206010237217, + "grad_norm": 0.7119179368019104, + "learning_rate": 8.117691073207776e-06, + "loss": 0.7445, + "step": 10453 + }, + { + "epoch": 0.5753756398260774, + "grad_norm": 0.7572842240333557, + "learning_rate": 8.117352180380843e-06, + "loss": 0.7672, + "step": 10454 + }, + { + "epoch": 0.575430678628433, + "grad_norm": 0.688667356967926, + "learning_rate": 8.117013264124725e-06, + "loss": 0.7733, + "step": 10455 + }, + { + "epoch": 0.5754857174307887, + "grad_norm": 0.6683163046836853, + "learning_rate": 8.116674324441971e-06, + "loss": 0.6381, + "step": 10456 + }, + { + "epoch": 0.5755407562331444, + "grad_norm": 0.7792099714279175, + "learning_rate": 8.116335361335126e-06, + "loss": 0.7781, + "step": 10457 + }, + { + "epoch": 0.5755957950355001, + "grad_norm": 0.702132523059845, + "learning_rate": 8.115996374806738e-06, + "loss": 0.7442, + "step": 10458 + }, + { + "epoch": 0.5756508338378556, + "grad_norm": 0.7021365761756897, + "learning_rate": 8.115657364859356e-06, + "loss": 0.7215, + "step": 10459 + }, + { + "epoch": 0.5757058726402113, + "grad_norm": 0.7032247185707092, + "learning_rate": 8.115318331495527e-06, + "loss": 0.7069, + "step": 10460 + }, + { + "epoch": 0.575760911442567, + "grad_norm": 0.8301237225532532, + "learning_rate": 8.1149792747178e-06, + "loss": 0.789, + "step": 10461 + }, + { + "epoch": 0.5758159502449227, + "grad_norm": 0.7051018476486206, + "learning_rate": 8.11464019452872e-06, + "loss": 0.7511, + "step": 10462 + }, + { + "epoch": 0.5758709890472783, + "grad_norm": 0.8422626256942749, + "learning_rate": 8.114301090930843e-06, + "loss": 0.6507, + "step": 10463 + }, + { + "epoch": 0.575926027849634, + "grad_norm": 0.7751632332801819, + "learning_rate": 8.113961963926708e-06, + "loss": 0.7357, + "step": 10464 + }, + { + "epoch": 0.5759810666519897, + "grad_norm": 0.7158333659172058, + "learning_rate": 8.11362281351887e-06, + "loss": 0.8382, + "step": 10465 + }, + { + "epoch": 0.5760361054543454, + "grad_norm": 0.6926481127738953, + "learning_rate": 8.113283639709878e-06, + "loss": 0.7078, + "step": 10466 + }, + { + "epoch": 0.5760911442567009, + "grad_norm": 0.7091588973999023, + "learning_rate": 8.112944442502277e-06, + "loss": 0.7932, + "step": 10467 + }, + { + "epoch": 0.5761461830590566, + "grad_norm": 0.6979780197143555, + "learning_rate": 8.11260522189862e-06, + "loss": 0.6812, + "step": 10468 + }, + { + "epoch": 0.5762012218614123, + "grad_norm": 0.6735736131668091, + "learning_rate": 8.112265977901455e-06, + "loss": 0.7499, + "step": 10469 + }, + { + "epoch": 0.576256260663768, + "grad_norm": 0.6995692849159241, + "learning_rate": 8.111926710513334e-06, + "loss": 0.7123, + "step": 10470 + }, + { + "epoch": 0.5763112994661236, + "grad_norm": 0.7162681818008423, + "learning_rate": 8.111587419736802e-06, + "loss": 0.7586, + "step": 10471 + }, + { + "epoch": 0.5763663382684793, + "grad_norm": 0.945935070514679, + "learning_rate": 8.111248105574414e-06, + "loss": 0.8474, + "step": 10472 + }, + { + "epoch": 0.576421377070835, + "grad_norm": 0.608730673789978, + "learning_rate": 8.110908768028716e-06, + "loss": 0.6433, + "step": 10473 + }, + { + "epoch": 0.5764764158731907, + "grad_norm": 0.6777853965759277, + "learning_rate": 8.110569407102263e-06, + "loss": 0.7913, + "step": 10474 + }, + { + "epoch": 0.5765314546755462, + "grad_norm": 0.6310930848121643, + "learning_rate": 8.1102300227976e-06, + "loss": 0.719, + "step": 10475 + }, + { + "epoch": 0.5765864934779019, + "grad_norm": 0.7048485279083252, + "learning_rate": 8.109890615117282e-06, + "loss": 0.7341, + "step": 10476 + }, + { + "epoch": 0.5766415322802576, + "grad_norm": 0.672987163066864, + "learning_rate": 8.10955118406386e-06, + "loss": 0.7637, + "step": 10477 + }, + { + "epoch": 0.5766965710826133, + "grad_norm": 0.7018216252326965, + "learning_rate": 8.109211729639882e-06, + "loss": 0.6924, + "step": 10478 + }, + { + "epoch": 0.5767516098849689, + "grad_norm": 0.7183761596679688, + "learning_rate": 8.108872251847901e-06, + "loss": 0.7945, + "step": 10479 + }, + { + "epoch": 0.5768066486873246, + "grad_norm": 0.7332683801651001, + "learning_rate": 8.108532750690469e-06, + "loss": 0.7686, + "step": 10480 + }, + { + "epoch": 0.5768616874896803, + "grad_norm": 0.7118290066719055, + "learning_rate": 8.108193226170139e-06, + "loss": 0.6917, + "step": 10481 + }, + { + "epoch": 0.5769167262920358, + "grad_norm": 0.8242507576942444, + "learning_rate": 8.107853678289456e-06, + "loss": 0.9119, + "step": 10482 + }, + { + "epoch": 0.5769717650943915, + "grad_norm": 0.7138590216636658, + "learning_rate": 8.10751410705098e-06, + "loss": 0.7095, + "step": 10483 + }, + { + "epoch": 0.5770268038967472, + "grad_norm": 0.7541199326515198, + "learning_rate": 8.107174512457259e-06, + "loss": 0.8042, + "step": 10484 + }, + { + "epoch": 0.5770818426991029, + "grad_norm": 0.7776939868927002, + "learning_rate": 8.106834894510846e-06, + "loss": 0.8075, + "step": 10485 + }, + { + "epoch": 0.5771368815014585, + "grad_norm": 0.6466917395591736, + "learning_rate": 8.106495253214293e-06, + "loss": 0.707, + "step": 10486 + }, + { + "epoch": 0.5771919203038142, + "grad_norm": 0.687101423740387, + "learning_rate": 8.106155588570153e-06, + "loss": 0.6945, + "step": 10487 + }, + { + "epoch": 0.5772469591061699, + "grad_norm": 0.8338418006896973, + "learning_rate": 8.10581590058098e-06, + "loss": 0.8044, + "step": 10488 + }, + { + "epoch": 0.5773019979085255, + "grad_norm": 0.7052263617515564, + "learning_rate": 8.105476189249325e-06, + "loss": 0.8216, + "step": 10489 + }, + { + "epoch": 0.5773570367108811, + "grad_norm": 0.7205906510353088, + "learning_rate": 8.105136454577744e-06, + "loss": 0.8853, + "step": 10490 + }, + { + "epoch": 0.5774120755132368, + "grad_norm": 0.7875076532363892, + "learning_rate": 8.10479669656879e-06, + "loss": 0.822, + "step": 10491 + }, + { + "epoch": 0.5774671143155925, + "grad_norm": 0.6858797669410706, + "learning_rate": 8.104456915225012e-06, + "loss": 0.7924, + "step": 10492 + }, + { + "epoch": 0.5775221531179482, + "grad_norm": 0.6991322636604309, + "learning_rate": 8.104117110548968e-06, + "loss": 0.8144, + "step": 10493 + }, + { + "epoch": 0.5775771919203038, + "grad_norm": 0.7768846750259399, + "learning_rate": 8.103777282543209e-06, + "loss": 0.7793, + "step": 10494 + }, + { + "epoch": 0.5776322307226595, + "grad_norm": 0.7055716514587402, + "learning_rate": 8.103437431210293e-06, + "loss": 0.7653, + "step": 10495 + }, + { + "epoch": 0.5776872695250151, + "grad_norm": 1.009839653968811, + "learning_rate": 8.10309755655277e-06, + "loss": 0.7646, + "step": 10496 + }, + { + "epoch": 0.5777423083273708, + "grad_norm": 0.699435293674469, + "learning_rate": 8.102757658573197e-06, + "loss": 0.7806, + "step": 10497 + }, + { + "epoch": 0.5777973471297264, + "grad_norm": 0.8566381931304932, + "learning_rate": 8.102417737274129e-06, + "loss": 0.8302, + "step": 10498 + }, + { + "epoch": 0.5778523859320821, + "grad_norm": 0.745801568031311, + "learning_rate": 8.10207779265812e-06, + "loss": 0.91, + "step": 10499 + }, + { + "epoch": 0.5779074247344378, + "grad_norm": 0.6867349743843079, + "learning_rate": 8.101737824727724e-06, + "loss": 0.771, + "step": 10500 + }, + { + "epoch": 0.5779624635367935, + "grad_norm": 0.6693048477172852, + "learning_rate": 8.101397833485496e-06, + "loss": 0.7967, + "step": 10501 + }, + { + "epoch": 0.5780175023391491, + "grad_norm": 0.7485450506210327, + "learning_rate": 8.101057818933993e-06, + "loss": 0.7132, + "step": 10502 + }, + { + "epoch": 0.5780725411415047, + "grad_norm": 0.7619839906692505, + "learning_rate": 8.100717781075769e-06, + "loss": 0.7379, + "step": 10503 + }, + { + "epoch": 0.5781275799438604, + "grad_norm": 0.7651955485343933, + "learning_rate": 8.100377719913382e-06, + "loss": 0.8437, + "step": 10504 + }, + { + "epoch": 0.5781826187462161, + "grad_norm": 0.692385196685791, + "learning_rate": 8.100037635449384e-06, + "loss": 0.7666, + "step": 10505 + }, + { + "epoch": 0.5782376575485717, + "grad_norm": 0.7332374453544617, + "learning_rate": 8.099697527686334e-06, + "loss": 0.7476, + "step": 10506 + }, + { + "epoch": 0.5782926963509274, + "grad_norm": 0.6934877634048462, + "learning_rate": 8.099357396626786e-06, + "loss": 0.8054, + "step": 10507 + }, + { + "epoch": 0.5783477351532831, + "grad_norm": 0.8393011689186096, + "learning_rate": 8.099017242273298e-06, + "loss": 0.8655, + "step": 10508 + }, + { + "epoch": 0.5784027739556388, + "grad_norm": 0.6850646734237671, + "learning_rate": 8.098677064628425e-06, + "loss": 0.7424, + "step": 10509 + }, + { + "epoch": 0.5784578127579943, + "grad_norm": 0.7302095293998718, + "learning_rate": 8.098336863694728e-06, + "loss": 0.903, + "step": 10510 + }, + { + "epoch": 0.57851285156035, + "grad_norm": 0.7474033236503601, + "learning_rate": 8.097996639474757e-06, + "loss": 0.7509, + "step": 10511 + }, + { + "epoch": 0.5785678903627057, + "grad_norm": 0.6525655388832092, + "learning_rate": 8.097656391971074e-06, + "loss": 0.7097, + "step": 10512 + }, + { + "epoch": 0.5786229291650614, + "grad_norm": 0.8197451829910278, + "learning_rate": 8.097316121186234e-06, + "loss": 0.7401, + "step": 10513 + }, + { + "epoch": 0.578677967967417, + "grad_norm": 0.7048231959342957, + "learning_rate": 8.096975827122795e-06, + "loss": 0.7964, + "step": 10514 + }, + { + "epoch": 0.5787330067697727, + "grad_norm": 0.8417022228240967, + "learning_rate": 8.096635509783315e-06, + "loss": 0.7703, + "step": 10515 + }, + { + "epoch": 0.5787880455721284, + "grad_norm": 0.7313926815986633, + "learning_rate": 8.096295169170352e-06, + "loss": 0.7565, + "step": 10516 + }, + { + "epoch": 0.5788430843744841, + "grad_norm": 0.7156692147254944, + "learning_rate": 8.095954805286464e-06, + "loss": 0.7456, + "step": 10517 + }, + { + "epoch": 0.5788981231768396, + "grad_norm": 0.7366768717765808, + "learning_rate": 8.095614418134205e-06, + "loss": 0.72, + "step": 10518 + }, + { + "epoch": 0.5789531619791953, + "grad_norm": 0.7011533379554749, + "learning_rate": 8.09527400771614e-06, + "loss": 0.7683, + "step": 10519 + }, + { + "epoch": 0.579008200781551, + "grad_norm": 0.6849086284637451, + "learning_rate": 8.094933574034823e-06, + "loss": 0.6938, + "step": 10520 + }, + { + "epoch": 0.5790632395839067, + "grad_norm": 0.7351469397544861, + "learning_rate": 8.094593117092814e-06, + "loss": 0.7364, + "step": 10521 + }, + { + "epoch": 0.5791182783862623, + "grad_norm": 0.7133724689483643, + "learning_rate": 8.09425263689267e-06, + "loss": 0.7328, + "step": 10522 + }, + { + "epoch": 0.579173317188618, + "grad_norm": 0.6713461875915527, + "learning_rate": 8.093912133436954e-06, + "loss": 0.7296, + "step": 10523 + }, + { + "epoch": 0.5792283559909737, + "grad_norm": 0.7057825922966003, + "learning_rate": 8.093571606728222e-06, + "loss": 0.7732, + "step": 10524 + }, + { + "epoch": 0.5792833947933292, + "grad_norm": 0.7378783226013184, + "learning_rate": 8.093231056769033e-06, + "loss": 0.7907, + "step": 10525 + }, + { + "epoch": 0.5793384335956849, + "grad_norm": 0.8796947598457336, + "learning_rate": 8.092890483561947e-06, + "loss": 0.7325, + "step": 10526 + }, + { + "epoch": 0.5793934723980406, + "grad_norm": 0.7326352000236511, + "learning_rate": 8.092549887109525e-06, + "loss": 0.7948, + "step": 10527 + }, + { + "epoch": 0.5794485112003963, + "grad_norm": 0.7131063342094421, + "learning_rate": 8.092209267414325e-06, + "loss": 0.7595, + "step": 10528 + }, + { + "epoch": 0.5795035500027519, + "grad_norm": 0.6993252635002136, + "learning_rate": 8.091868624478908e-06, + "loss": 0.782, + "step": 10529 + }, + { + "epoch": 0.5795585888051076, + "grad_norm": 0.6945857405662537, + "learning_rate": 8.091527958305835e-06, + "loss": 0.7283, + "step": 10530 + }, + { + "epoch": 0.5796136276074633, + "grad_norm": 0.8203904032707214, + "learning_rate": 8.091187268897667e-06, + "loss": 0.7787, + "step": 10531 + }, + { + "epoch": 0.579668666409819, + "grad_norm": 0.6450221538543701, + "learning_rate": 8.09084655625696e-06, + "loss": 0.7092, + "step": 10532 + }, + { + "epoch": 0.5797237052121745, + "grad_norm": 0.6852096915245056, + "learning_rate": 8.090505820386279e-06, + "loss": 0.7916, + "step": 10533 + }, + { + "epoch": 0.5797787440145302, + "grad_norm": 1.0816445350646973, + "learning_rate": 8.090165061288182e-06, + "loss": 0.7545, + "step": 10534 + }, + { + "epoch": 0.5798337828168859, + "grad_norm": 0.7312847375869751, + "learning_rate": 8.089824278965233e-06, + "loss": 0.7395, + "step": 10535 + }, + { + "epoch": 0.5798888216192416, + "grad_norm": 0.7281426191329956, + "learning_rate": 8.089483473419992e-06, + "loss": 0.7677, + "step": 10536 + }, + { + "epoch": 0.5799438604215972, + "grad_norm": 0.7392409443855286, + "learning_rate": 8.08914264465502e-06, + "loss": 0.7674, + "step": 10537 + }, + { + "epoch": 0.5799988992239529, + "grad_norm": 0.7041863799095154, + "learning_rate": 8.088801792672877e-06, + "loss": 0.6156, + "step": 10538 + }, + { + "epoch": 0.5800539380263086, + "grad_norm": 0.7113755345344543, + "learning_rate": 8.088460917476128e-06, + "loss": 0.7677, + "step": 10539 + }, + { + "epoch": 0.5801089768286642, + "grad_norm": 0.673966646194458, + "learning_rate": 8.088120019067334e-06, + "loss": 0.7557, + "step": 10540 + }, + { + "epoch": 0.5801640156310198, + "grad_norm": 0.8165854215621948, + "learning_rate": 8.087779097449055e-06, + "loss": 0.8102, + "step": 10541 + }, + { + "epoch": 0.5802190544333755, + "grad_norm": 0.7010880708694458, + "learning_rate": 8.087438152623857e-06, + "loss": 0.7816, + "step": 10542 + }, + { + "epoch": 0.5802740932357312, + "grad_norm": 0.726177990436554, + "learning_rate": 8.0870971845943e-06, + "loss": 0.7671, + "step": 10543 + }, + { + "epoch": 0.5803291320380869, + "grad_norm": 0.7403919696807861, + "learning_rate": 8.086756193362946e-06, + "loss": 0.8449, + "step": 10544 + }, + { + "epoch": 0.5803841708404425, + "grad_norm": 0.6897104382514954, + "learning_rate": 8.086415178932358e-06, + "loss": 0.7563, + "step": 10545 + }, + { + "epoch": 0.5804392096427982, + "grad_norm": 0.7682604193687439, + "learning_rate": 8.0860741413051e-06, + "loss": 0.8019, + "step": 10546 + }, + { + "epoch": 0.5804942484451538, + "grad_norm": 0.7317522168159485, + "learning_rate": 8.085733080483736e-06, + "loss": 0.7446, + "step": 10547 + }, + { + "epoch": 0.5805492872475095, + "grad_norm": 0.8503430485725403, + "learning_rate": 8.085391996470826e-06, + "loss": 0.7343, + "step": 10548 + }, + { + "epoch": 0.5806043260498651, + "grad_norm": 0.8550657629966736, + "learning_rate": 8.085050889268937e-06, + "loss": 0.9267, + "step": 10549 + }, + { + "epoch": 0.5806593648522208, + "grad_norm": 0.7751224637031555, + "learning_rate": 8.084709758880633e-06, + "loss": 0.7404, + "step": 10550 + }, + { + "epoch": 0.5807144036545765, + "grad_norm": 0.6346186399459839, + "learning_rate": 8.084368605308475e-06, + "loss": 0.66, + "step": 10551 + }, + { + "epoch": 0.5807694424569322, + "grad_norm": 0.7295717597007751, + "learning_rate": 8.084027428555027e-06, + "loss": 0.8313, + "step": 10552 + }, + { + "epoch": 0.5808244812592878, + "grad_norm": 0.6962289810180664, + "learning_rate": 8.083686228622856e-06, + "loss": 0.7871, + "step": 10553 + }, + { + "epoch": 0.5808795200616435, + "grad_norm": 0.6968896389007568, + "learning_rate": 8.083345005514522e-06, + "loss": 0.7261, + "step": 10554 + }, + { + "epoch": 0.5809345588639991, + "grad_norm": 0.8374869227409363, + "learning_rate": 8.083003759232595e-06, + "loss": 0.797, + "step": 10555 + }, + { + "epoch": 0.5809895976663548, + "grad_norm": 0.6511034369468689, + "learning_rate": 8.082662489779637e-06, + "loss": 0.7237, + "step": 10556 + }, + { + "epoch": 0.5810446364687104, + "grad_norm": 0.6644287705421448, + "learning_rate": 8.082321197158212e-06, + "loss": 0.6969, + "step": 10557 + }, + { + "epoch": 0.5810996752710661, + "grad_norm": 0.7681102752685547, + "learning_rate": 8.081979881370884e-06, + "loss": 0.7193, + "step": 10558 + }, + { + "epoch": 0.5811547140734218, + "grad_norm": 0.7930792570114136, + "learning_rate": 8.081638542420224e-06, + "loss": 0.7198, + "step": 10559 + }, + { + "epoch": 0.5812097528757775, + "grad_norm": 0.7227992415428162, + "learning_rate": 8.081297180308791e-06, + "loss": 0.7533, + "step": 10560 + }, + { + "epoch": 0.581264791678133, + "grad_norm": 0.7293071150779724, + "learning_rate": 8.080955795039156e-06, + "loss": 0.6228, + "step": 10561 + }, + { + "epoch": 0.5813198304804887, + "grad_norm": 0.7356483936309814, + "learning_rate": 8.080614386613879e-06, + "loss": 0.7299, + "step": 10562 + }, + { + "epoch": 0.5813748692828444, + "grad_norm": 0.8181473016738892, + "learning_rate": 8.080272955035531e-06, + "loss": 0.6576, + "step": 10563 + }, + { + "epoch": 0.5814299080852001, + "grad_norm": 0.7066958546638489, + "learning_rate": 8.079931500306675e-06, + "loss": 0.7372, + "step": 10564 + }, + { + "epoch": 0.5814849468875557, + "grad_norm": 0.6821097135543823, + "learning_rate": 8.079590022429877e-06, + "loss": 0.7516, + "step": 10565 + }, + { + "epoch": 0.5815399856899114, + "grad_norm": 0.6879069209098816, + "learning_rate": 8.079248521407707e-06, + "loss": 0.7525, + "step": 10566 + }, + { + "epoch": 0.5815950244922671, + "grad_norm": 0.956345796585083, + "learning_rate": 8.078906997242729e-06, + "loss": 0.8175, + "step": 10567 + }, + { + "epoch": 0.5816500632946227, + "grad_norm": 0.6942328214645386, + "learning_rate": 8.078565449937508e-06, + "loss": 0.6264, + "step": 10568 + }, + { + "epoch": 0.5817051020969783, + "grad_norm": 0.7073766589164734, + "learning_rate": 8.078223879494615e-06, + "loss": 0.766, + "step": 10569 + }, + { + "epoch": 0.581760140899334, + "grad_norm": 0.7649571895599365, + "learning_rate": 8.077882285916614e-06, + "loss": 0.8767, + "step": 10570 + }, + { + "epoch": 0.5818151797016897, + "grad_norm": 0.6384355425834656, + "learning_rate": 8.077540669206076e-06, + "loss": 0.7444, + "step": 10571 + }, + { + "epoch": 0.5818702185040453, + "grad_norm": 0.7173928022384644, + "learning_rate": 8.077199029365565e-06, + "loss": 0.8277, + "step": 10572 + }, + { + "epoch": 0.581925257306401, + "grad_norm": 0.7310757637023926, + "learning_rate": 8.076857366397648e-06, + "loss": 0.8425, + "step": 10573 + }, + { + "epoch": 0.5819802961087567, + "grad_norm": 0.6888872385025024, + "learning_rate": 8.076515680304897e-06, + "loss": 0.6961, + "step": 10574 + }, + { + "epoch": 0.5820353349111124, + "grad_norm": 0.7290124297142029, + "learning_rate": 8.076173971089877e-06, + "loss": 0.7865, + "step": 10575 + }, + { + "epoch": 0.582090373713468, + "grad_norm": 0.7402634024620056, + "learning_rate": 8.075832238755156e-06, + "loss": 0.7196, + "step": 10576 + }, + { + "epoch": 0.5821454125158236, + "grad_norm": 0.74916672706604, + "learning_rate": 8.075490483303305e-06, + "loss": 0.8361, + "step": 10577 + }, + { + "epoch": 0.5822004513181793, + "grad_norm": 0.8146494626998901, + "learning_rate": 8.07514870473689e-06, + "loss": 0.7398, + "step": 10578 + }, + { + "epoch": 0.582255490120535, + "grad_norm": 0.6632487177848816, + "learning_rate": 8.07480690305848e-06, + "loss": 0.7239, + "step": 10579 + }, + { + "epoch": 0.5823105289228906, + "grad_norm": 0.6912766695022583, + "learning_rate": 8.074465078270645e-06, + "loss": 0.7488, + "step": 10580 + }, + { + "epoch": 0.5823655677252463, + "grad_norm": 0.7410522699356079, + "learning_rate": 8.074123230375952e-06, + "loss": 0.7413, + "step": 10581 + }, + { + "epoch": 0.582420606527602, + "grad_norm": 0.7932689189910889, + "learning_rate": 8.073781359376972e-06, + "loss": 0.7894, + "step": 10582 + }, + { + "epoch": 0.5824756453299577, + "grad_norm": 0.6710309982299805, + "learning_rate": 8.073439465276277e-06, + "loss": 0.6727, + "step": 10583 + }, + { + "epoch": 0.5825306841323132, + "grad_norm": 0.7457143068313599, + "learning_rate": 8.07309754807643e-06, + "loss": 0.6719, + "step": 10584 + }, + { + "epoch": 0.5825857229346689, + "grad_norm": 0.7340453863143921, + "learning_rate": 8.072755607780008e-06, + "loss": 0.7397, + "step": 10585 + }, + { + "epoch": 0.5826407617370246, + "grad_norm": 0.7532176971435547, + "learning_rate": 8.072413644389574e-06, + "loss": 0.7368, + "step": 10586 + }, + { + "epoch": 0.5826958005393803, + "grad_norm": 0.9317812919616699, + "learning_rate": 8.072071657907703e-06, + "loss": 0.9113, + "step": 10587 + }, + { + "epoch": 0.5827508393417359, + "grad_norm": 0.8535491228103638, + "learning_rate": 8.071729648336963e-06, + "loss": 0.7708, + "step": 10588 + }, + { + "epoch": 0.5828058781440916, + "grad_norm": 0.6720348000526428, + "learning_rate": 8.071387615679926e-06, + "loss": 0.7521, + "step": 10589 + }, + { + "epoch": 0.5828609169464473, + "grad_norm": 0.7113864421844482, + "learning_rate": 8.071045559939162e-06, + "loss": 0.8713, + "step": 10590 + }, + { + "epoch": 0.582915955748803, + "grad_norm": 0.7760024070739746, + "learning_rate": 8.070703481117242e-06, + "loss": 0.7567, + "step": 10591 + }, + { + "epoch": 0.5829709945511585, + "grad_norm": 0.9548617005348206, + "learning_rate": 8.070361379216735e-06, + "loss": 0.7937, + "step": 10592 + }, + { + "epoch": 0.5830260333535142, + "grad_norm": 0.7796840667724609, + "learning_rate": 8.070019254240216e-06, + "loss": 0.7485, + "step": 10593 + }, + { + "epoch": 0.5830810721558699, + "grad_norm": 0.7006514668464661, + "learning_rate": 8.069677106190253e-06, + "loss": 0.7813, + "step": 10594 + }, + { + "epoch": 0.5831361109582256, + "grad_norm": 0.646396279335022, + "learning_rate": 8.069334935069417e-06, + "loss": 0.7437, + "step": 10595 + }, + { + "epoch": 0.5831911497605812, + "grad_norm": 0.8257368206977844, + "learning_rate": 8.068992740880283e-06, + "loss": 0.7351, + "step": 10596 + }, + { + "epoch": 0.5832461885629369, + "grad_norm": 0.6646208763122559, + "learning_rate": 8.068650523625422e-06, + "loss": 0.6554, + "step": 10597 + }, + { + "epoch": 0.5833012273652926, + "grad_norm": 0.8495579957962036, + "learning_rate": 8.068308283307402e-06, + "loss": 0.791, + "step": 10598 + }, + { + "epoch": 0.5833562661676482, + "grad_norm": 0.7283076047897339, + "learning_rate": 8.0679660199288e-06, + "loss": 0.7327, + "step": 10599 + }, + { + "epoch": 0.5834113049700038, + "grad_norm": 0.704572856426239, + "learning_rate": 8.067623733492187e-06, + "loss": 0.6094, + "step": 10600 + }, + { + "epoch": 0.5834663437723595, + "grad_norm": 0.6435144543647766, + "learning_rate": 8.067281424000136e-06, + "loss": 0.6974, + "step": 10601 + }, + { + "epoch": 0.5835213825747152, + "grad_norm": 0.9628346562385559, + "learning_rate": 8.066939091455215e-06, + "loss": 0.8933, + "step": 10602 + }, + { + "epoch": 0.5835764213770709, + "grad_norm": 0.6856930255889893, + "learning_rate": 8.066596735860004e-06, + "loss": 0.7414, + "step": 10603 + }, + { + "epoch": 0.5836314601794265, + "grad_norm": 0.7341175675392151, + "learning_rate": 8.066254357217072e-06, + "loss": 0.7553, + "step": 10604 + }, + { + "epoch": 0.5836864989817822, + "grad_norm": 0.7124871611595154, + "learning_rate": 8.065911955528995e-06, + "loss": 0.663, + "step": 10605 + }, + { + "epoch": 0.5837415377841378, + "grad_norm": 0.816028892993927, + "learning_rate": 8.065569530798341e-06, + "loss": 0.8778, + "step": 10606 + }, + { + "epoch": 0.5837965765864935, + "grad_norm": 0.8735721111297607, + "learning_rate": 8.06522708302769e-06, + "loss": 0.7866, + "step": 10607 + }, + { + "epoch": 0.5838516153888491, + "grad_norm": 0.6780036687850952, + "learning_rate": 8.06488461221961e-06, + "loss": 0.7329, + "step": 10608 + }, + { + "epoch": 0.5839066541912048, + "grad_norm": 0.7624822854995728, + "learning_rate": 8.06454211837668e-06, + "loss": 0.8095, + "step": 10609 + }, + { + "epoch": 0.5839616929935605, + "grad_norm": 0.8269234895706177, + "learning_rate": 8.06419960150147e-06, + "loss": 0.7194, + "step": 10610 + }, + { + "epoch": 0.5840167317959161, + "grad_norm": 0.6748649477958679, + "learning_rate": 8.063857061596558e-06, + "loss": 0.702, + "step": 10611 + }, + { + "epoch": 0.5840717705982718, + "grad_norm": 0.9700273275375366, + "learning_rate": 8.063514498664515e-06, + "loss": 0.7917, + "step": 10612 + }, + { + "epoch": 0.5841268094006274, + "grad_norm": 0.7798827290534973, + "learning_rate": 8.063171912707916e-06, + "loss": 0.798, + "step": 10613 + }, + { + "epoch": 0.5841818482029831, + "grad_norm": 0.6613249778747559, + "learning_rate": 8.06282930372934e-06, + "loss": 0.7216, + "step": 10614 + }, + { + "epoch": 0.5842368870053387, + "grad_norm": 0.727116048336029, + "learning_rate": 8.062486671731357e-06, + "loss": 0.8054, + "step": 10615 + }, + { + "epoch": 0.5842919258076944, + "grad_norm": 0.6704444289207458, + "learning_rate": 8.062144016716543e-06, + "loss": 0.7503, + "step": 10616 + }, + { + "epoch": 0.5843469646100501, + "grad_norm": 0.6867938041687012, + "learning_rate": 8.061801338687477e-06, + "loss": 0.8005, + "step": 10617 + }, + { + "epoch": 0.5844020034124058, + "grad_norm": 0.7097555994987488, + "learning_rate": 8.061458637646729e-06, + "loss": 0.8515, + "step": 10618 + }, + { + "epoch": 0.5844570422147614, + "grad_norm": 0.6624881625175476, + "learning_rate": 8.061115913596878e-06, + "loss": 0.7735, + "step": 10619 + }, + { + "epoch": 0.584512081017117, + "grad_norm": 0.6649004220962524, + "learning_rate": 8.060773166540498e-06, + "loss": 0.7837, + "step": 10620 + }, + { + "epoch": 0.5845671198194727, + "grad_norm": 0.6732968091964722, + "learning_rate": 8.06043039648017e-06, + "loss": 0.7846, + "step": 10621 + }, + { + "epoch": 0.5846221586218284, + "grad_norm": 0.7551947236061096, + "learning_rate": 8.060087603418464e-06, + "loss": 0.6868, + "step": 10622 + }, + { + "epoch": 0.584677197424184, + "grad_norm": 0.7781728506088257, + "learning_rate": 8.059744787357959e-06, + "loss": 0.8088, + "step": 10623 + }, + { + "epoch": 0.5847322362265397, + "grad_norm": 0.6362790465354919, + "learning_rate": 8.05940194830123e-06, + "loss": 0.664, + "step": 10624 + }, + { + "epoch": 0.5847872750288954, + "grad_norm": 0.670386791229248, + "learning_rate": 8.059059086250856e-06, + "loss": 0.6839, + "step": 10625 + }, + { + "epoch": 0.5848423138312511, + "grad_norm": 0.7030045986175537, + "learning_rate": 8.058716201209414e-06, + "loss": 0.7243, + "step": 10626 + }, + { + "epoch": 0.5848973526336066, + "grad_norm": 0.7881805896759033, + "learning_rate": 8.058373293179477e-06, + "loss": 0.7994, + "step": 10627 + }, + { + "epoch": 0.5849523914359623, + "grad_norm": 0.7077344059944153, + "learning_rate": 8.058030362163628e-06, + "loss": 0.822, + "step": 10628 + }, + { + "epoch": 0.585007430238318, + "grad_norm": 0.6787039637565613, + "learning_rate": 8.057687408164439e-06, + "loss": 0.7619, + "step": 10629 + }, + { + "epoch": 0.5850624690406737, + "grad_norm": 1.1377217769622803, + "learning_rate": 8.05734443118449e-06, + "loss": 0.8632, + "step": 10630 + }, + { + "epoch": 0.5851175078430293, + "grad_norm": 0.7002600431442261, + "learning_rate": 8.05700143122636e-06, + "loss": 0.8184, + "step": 10631 + }, + { + "epoch": 0.585172546645385, + "grad_norm": 0.7016324400901794, + "learning_rate": 8.056658408292626e-06, + "loss": 0.658, + "step": 10632 + }, + { + "epoch": 0.5852275854477407, + "grad_norm": 0.6674843430519104, + "learning_rate": 8.056315362385864e-06, + "loss": 0.7281, + "step": 10633 + }, + { + "epoch": 0.5852826242500964, + "grad_norm": 0.6789288520812988, + "learning_rate": 8.055972293508653e-06, + "loss": 0.8192, + "step": 10634 + }, + { + "epoch": 0.5853376630524519, + "grad_norm": 0.6740062236785889, + "learning_rate": 8.055629201663575e-06, + "loss": 0.7343, + "step": 10635 + }, + { + "epoch": 0.5853927018548076, + "grad_norm": 0.7417730689048767, + "learning_rate": 8.055286086853204e-06, + "loss": 0.8161, + "step": 10636 + }, + { + "epoch": 0.5854477406571633, + "grad_norm": 0.6680465340614319, + "learning_rate": 8.054942949080122e-06, + "loss": 0.7589, + "step": 10637 + }, + { + "epoch": 0.585502779459519, + "grad_norm": 0.7205108404159546, + "learning_rate": 8.054599788346904e-06, + "loss": 0.6837, + "step": 10638 + }, + { + "epoch": 0.5855578182618746, + "grad_norm": 0.8694404363632202, + "learning_rate": 8.054256604656134e-06, + "loss": 0.8033, + "step": 10639 + }, + { + "epoch": 0.5856128570642303, + "grad_norm": 0.685471773147583, + "learning_rate": 8.053913398010389e-06, + "loss": 0.7654, + "step": 10640 + }, + { + "epoch": 0.585667895866586, + "grad_norm": 1.3463424444198608, + "learning_rate": 8.053570168412249e-06, + "loss": 0.7743, + "step": 10641 + }, + { + "epoch": 0.5857229346689417, + "grad_norm": 0.9380106329917908, + "learning_rate": 8.05322691586429e-06, + "loss": 0.8984, + "step": 10642 + }, + { + "epoch": 0.5857779734712972, + "grad_norm": 0.7408519387245178, + "learning_rate": 8.052883640369096e-06, + "loss": 0.7716, + "step": 10643 + }, + { + "epoch": 0.5858330122736529, + "grad_norm": 0.7712904214859009, + "learning_rate": 8.052540341929248e-06, + "loss": 0.7767, + "step": 10644 + }, + { + "epoch": 0.5858880510760086, + "grad_norm": 0.8464158177375793, + "learning_rate": 8.052197020547321e-06, + "loss": 0.8333, + "step": 10645 + }, + { + "epoch": 0.5859430898783643, + "grad_norm": 0.6970158219337463, + "learning_rate": 8.0518536762259e-06, + "loss": 0.7354, + "step": 10646 + }, + { + "epoch": 0.5859981286807199, + "grad_norm": 0.7048965096473694, + "learning_rate": 8.051510308967563e-06, + "loss": 0.8333, + "step": 10647 + }, + { + "epoch": 0.5860531674830756, + "grad_norm": 0.6443868279457092, + "learning_rate": 8.05116691877489e-06, + "loss": 0.7386, + "step": 10648 + }, + { + "epoch": 0.5861082062854313, + "grad_norm": 0.6653542518615723, + "learning_rate": 8.050823505650465e-06, + "loss": 0.8116, + "step": 10649 + }, + { + "epoch": 0.5861632450877869, + "grad_norm": 0.7293158769607544, + "learning_rate": 8.050480069596868e-06, + "loss": 0.8231, + "step": 10650 + }, + { + "epoch": 0.5862182838901425, + "grad_norm": 0.6876117587089539, + "learning_rate": 8.050136610616676e-06, + "loss": 0.7856, + "step": 10651 + }, + { + "epoch": 0.5862733226924982, + "grad_norm": 0.6811665296554565, + "learning_rate": 8.049793128712477e-06, + "loss": 0.7667, + "step": 10652 + }, + { + "epoch": 0.5863283614948539, + "grad_norm": 0.701034426689148, + "learning_rate": 8.049449623886849e-06, + "loss": 0.7812, + "step": 10653 + }, + { + "epoch": 0.5863834002972095, + "grad_norm": 0.6872833967208862, + "learning_rate": 8.049106096142372e-06, + "loss": 0.755, + "step": 10654 + }, + { + "epoch": 0.5864384390995652, + "grad_norm": 0.6643580198287964, + "learning_rate": 8.04876254548163e-06, + "loss": 0.7692, + "step": 10655 + }, + { + "epoch": 0.5864934779019209, + "grad_norm": 0.6672106981277466, + "learning_rate": 8.048418971907206e-06, + "loss": 0.7424, + "step": 10656 + }, + { + "epoch": 0.5865485167042765, + "grad_norm": 0.8030515313148499, + "learning_rate": 8.04807537542168e-06, + "loss": 0.8074, + "step": 10657 + }, + { + "epoch": 0.5866035555066321, + "grad_norm": 0.713417112827301, + "learning_rate": 8.047731756027637e-06, + "loss": 0.6974, + "step": 10658 + }, + { + "epoch": 0.5866585943089878, + "grad_norm": 0.7715572118759155, + "learning_rate": 8.047388113727657e-06, + "loss": 0.7353, + "step": 10659 + }, + { + "epoch": 0.5867136331113435, + "grad_norm": 0.7009812593460083, + "learning_rate": 8.047044448524323e-06, + "loss": 0.7992, + "step": 10660 + }, + { + "epoch": 0.5867686719136992, + "grad_norm": 0.6425079107284546, + "learning_rate": 8.046700760420219e-06, + "loss": 0.7394, + "step": 10661 + }, + { + "epoch": 0.5868237107160548, + "grad_norm": 0.7713460922241211, + "learning_rate": 8.046357049417927e-06, + "loss": 0.7759, + "step": 10662 + }, + { + "epoch": 0.5868787495184105, + "grad_norm": 0.7310347557067871, + "learning_rate": 8.046013315520033e-06, + "loss": 0.7278, + "step": 10663 + }, + { + "epoch": 0.5869337883207661, + "grad_norm": 0.7493315935134888, + "learning_rate": 8.045669558729117e-06, + "loss": 0.7808, + "step": 10664 + }, + { + "epoch": 0.5869888271231218, + "grad_norm": 0.7547439336776733, + "learning_rate": 8.045325779047763e-06, + "loss": 0.8245, + "step": 10665 + }, + { + "epoch": 0.5870438659254774, + "grad_norm": 0.7556985020637512, + "learning_rate": 8.044981976478557e-06, + "loss": 0.8, + "step": 10666 + }, + { + "epoch": 0.5870989047278331, + "grad_norm": 0.8330736756324768, + "learning_rate": 8.04463815102408e-06, + "loss": 0.8177, + "step": 10667 + }, + { + "epoch": 0.5871539435301888, + "grad_norm": 0.7823941111564636, + "learning_rate": 8.04429430268692e-06, + "loss": 0.8306, + "step": 10668 + }, + { + "epoch": 0.5872089823325445, + "grad_norm": 0.9141719937324524, + "learning_rate": 8.043950431469657e-06, + "loss": 0.9137, + "step": 10669 + }, + { + "epoch": 0.5872640211349001, + "grad_norm": 0.6967095732688904, + "learning_rate": 8.043606537374878e-06, + "loss": 0.7262, + "step": 10670 + }, + { + "epoch": 0.5873190599372557, + "grad_norm": 0.7909649014472961, + "learning_rate": 8.043262620405166e-06, + "loss": 0.8332, + "step": 10671 + }, + { + "epoch": 0.5873740987396114, + "grad_norm": 0.7967168092727661, + "learning_rate": 8.042918680563107e-06, + "loss": 0.7966, + "step": 10672 + }, + { + "epoch": 0.5874291375419671, + "grad_norm": 0.7637625336647034, + "learning_rate": 8.042574717851287e-06, + "loss": 0.8322, + "step": 10673 + }, + { + "epoch": 0.5874841763443227, + "grad_norm": 0.6968004107475281, + "learning_rate": 8.04223073227229e-06, + "loss": 0.8061, + "step": 10674 + }, + { + "epoch": 0.5875392151466784, + "grad_norm": 0.7325586080551147, + "learning_rate": 8.0418867238287e-06, + "loss": 0.7922, + "step": 10675 + }, + { + "epoch": 0.5875942539490341, + "grad_norm": 0.6784406304359436, + "learning_rate": 8.041542692523103e-06, + "loss": 0.7327, + "step": 10676 + }, + { + "epoch": 0.5876492927513898, + "grad_norm": 0.8297861218452454, + "learning_rate": 8.041198638358088e-06, + "loss": 0.9347, + "step": 10677 + }, + { + "epoch": 0.5877043315537454, + "grad_norm": 0.6227413415908813, + "learning_rate": 8.040854561336236e-06, + "loss": 0.655, + "step": 10678 + }, + { + "epoch": 0.587759370356101, + "grad_norm": 0.752098023891449, + "learning_rate": 8.040510461460134e-06, + "loss": 0.7608, + "step": 10679 + }, + { + "epoch": 0.5878144091584567, + "grad_norm": 0.7008342146873474, + "learning_rate": 8.040166338732372e-06, + "loss": 0.7385, + "step": 10680 + }, + { + "epoch": 0.5878694479608124, + "grad_norm": 0.6768027544021606, + "learning_rate": 8.039822193155532e-06, + "loss": 0.6812, + "step": 10681 + }, + { + "epoch": 0.587924486763168, + "grad_norm": 0.7728545069694519, + "learning_rate": 8.039478024732203e-06, + "loss": 0.7696, + "step": 10682 + }, + { + "epoch": 0.5879795255655237, + "grad_norm": 0.7257505655288696, + "learning_rate": 8.03913383346497e-06, + "loss": 0.6686, + "step": 10683 + }, + { + "epoch": 0.5880345643678794, + "grad_norm": 0.7755837440490723, + "learning_rate": 8.03878961935642e-06, + "loss": 0.8469, + "step": 10684 + }, + { + "epoch": 0.5880896031702351, + "grad_norm": 0.7187668085098267, + "learning_rate": 8.038445382409142e-06, + "loss": 0.8249, + "step": 10685 + }, + { + "epoch": 0.5881446419725906, + "grad_norm": 0.638053834438324, + "learning_rate": 8.038101122625722e-06, + "loss": 0.6876, + "step": 10686 + }, + { + "epoch": 0.5881996807749463, + "grad_norm": 0.7323756217956543, + "learning_rate": 8.037756840008746e-06, + "loss": 0.7489, + "step": 10687 + }, + { + "epoch": 0.588254719577302, + "grad_norm": 0.6795439720153809, + "learning_rate": 8.037412534560804e-06, + "loss": 0.7246, + "step": 10688 + }, + { + "epoch": 0.5883097583796577, + "grad_norm": 0.8136376142501831, + "learning_rate": 8.037068206284482e-06, + "loss": 0.8518, + "step": 10689 + }, + { + "epoch": 0.5883647971820133, + "grad_norm": 0.6484195590019226, + "learning_rate": 8.036723855182367e-06, + "loss": 0.7018, + "step": 10690 + }, + { + "epoch": 0.588419835984369, + "grad_norm": 0.7465028166770935, + "learning_rate": 8.036379481257048e-06, + "loss": 0.8276, + "step": 10691 + }, + { + "epoch": 0.5884748747867247, + "grad_norm": 0.7761173844337463, + "learning_rate": 8.036035084511116e-06, + "loss": 0.6371, + "step": 10692 + }, + { + "epoch": 0.5885299135890804, + "grad_norm": 0.830008864402771, + "learning_rate": 8.035690664947156e-06, + "loss": 0.8199, + "step": 10693 + }, + { + "epoch": 0.5885849523914359, + "grad_norm": 0.6614254117012024, + "learning_rate": 8.03534622256776e-06, + "loss": 0.656, + "step": 10694 + }, + { + "epoch": 0.5886399911937916, + "grad_norm": 0.7229047417640686, + "learning_rate": 8.035001757375509e-06, + "loss": 0.7622, + "step": 10695 + }, + { + "epoch": 0.5886950299961473, + "grad_norm": 0.7044325470924377, + "learning_rate": 8.034657269373001e-06, + "loss": 0.7678, + "step": 10696 + }, + { + "epoch": 0.5887500687985029, + "grad_norm": 0.7109018564224243, + "learning_rate": 8.03431275856282e-06, + "loss": 0.7976, + "step": 10697 + }, + { + "epoch": 0.5888051076008586, + "grad_norm": 0.7812879085540771, + "learning_rate": 8.033968224947557e-06, + "loss": 0.7163, + "step": 10698 + }, + { + "epoch": 0.5888601464032143, + "grad_norm": 0.7408469915390015, + "learning_rate": 8.033623668529802e-06, + "loss": 0.6895, + "step": 10699 + }, + { + "epoch": 0.58891518520557, + "grad_norm": 0.7654302716255188, + "learning_rate": 8.033279089312142e-06, + "loss": 0.8126, + "step": 10700 + }, + { + "epoch": 0.5889702240079255, + "grad_norm": 0.7307846546173096, + "learning_rate": 8.032934487297169e-06, + "loss": 0.7958, + "step": 10701 + }, + { + "epoch": 0.5890252628102812, + "grad_norm": 0.6658591032028198, + "learning_rate": 8.032589862487472e-06, + "loss": 0.717, + "step": 10702 + }, + { + "epoch": 0.5890803016126369, + "grad_norm": 1.4167139530181885, + "learning_rate": 8.03224521488564e-06, + "loss": 0.8599, + "step": 10703 + }, + { + "epoch": 0.5891353404149926, + "grad_norm": 0.6723609566688538, + "learning_rate": 8.031900544494266e-06, + "loss": 0.8167, + "step": 10704 + }, + { + "epoch": 0.5891903792173482, + "grad_norm": 0.6420501470565796, + "learning_rate": 8.03155585131594e-06, + "loss": 0.692, + "step": 10705 + }, + { + "epoch": 0.5892454180197039, + "grad_norm": 0.6973454356193542, + "learning_rate": 8.031211135353251e-06, + "loss": 0.7709, + "step": 10706 + }, + { + "epoch": 0.5893004568220596, + "grad_norm": 0.7752252221107483, + "learning_rate": 8.03086639660879e-06, + "loss": 0.7795, + "step": 10707 + }, + { + "epoch": 0.5893554956244152, + "grad_norm": 0.8193135857582092, + "learning_rate": 8.030521635085149e-06, + "loss": 0.812, + "step": 10708 + }, + { + "epoch": 0.5894105344267708, + "grad_norm": 0.7976878881454468, + "learning_rate": 8.03017685078492e-06, + "loss": 0.8039, + "step": 10709 + }, + { + "epoch": 0.5894655732291265, + "grad_norm": 0.7545839548110962, + "learning_rate": 8.02983204371069e-06, + "loss": 0.8238, + "step": 10710 + }, + { + "epoch": 0.5895206120314822, + "grad_norm": 0.6544732451438904, + "learning_rate": 8.029487213865054e-06, + "loss": 0.7471, + "step": 10711 + }, + { + "epoch": 0.5895756508338379, + "grad_norm": 0.7054508924484253, + "learning_rate": 8.029142361250603e-06, + "loss": 0.8283, + "step": 10712 + }, + { + "epoch": 0.5896306896361935, + "grad_norm": 0.7425236105918884, + "learning_rate": 8.02879748586993e-06, + "loss": 0.8031, + "step": 10713 + }, + { + "epoch": 0.5896857284385492, + "grad_norm": 0.8390052318572998, + "learning_rate": 8.028452587725626e-06, + "loss": 0.7218, + "step": 10714 + }, + { + "epoch": 0.5897407672409049, + "grad_norm": 0.8116903901100159, + "learning_rate": 8.028107666820282e-06, + "loss": 0.8057, + "step": 10715 + }, + { + "epoch": 0.5897958060432605, + "grad_norm": 0.602308452129364, + "learning_rate": 8.027762723156492e-06, + "loss": 0.6428, + "step": 10716 + }, + { + "epoch": 0.5898508448456161, + "grad_norm": 0.7480159401893616, + "learning_rate": 8.027417756736848e-06, + "loss": 0.7566, + "step": 10717 + }, + { + "epoch": 0.5899058836479718, + "grad_norm": 0.6823177933692932, + "learning_rate": 8.027072767563943e-06, + "loss": 0.8337, + "step": 10718 + }, + { + "epoch": 0.5899609224503275, + "grad_norm": 0.6841796040534973, + "learning_rate": 8.026727755640367e-06, + "loss": 0.751, + "step": 10719 + }, + { + "epoch": 0.5900159612526832, + "grad_norm": 0.7257139086723328, + "learning_rate": 8.026382720968718e-06, + "loss": 0.7373, + "step": 10720 + }, + { + "epoch": 0.5900710000550388, + "grad_norm": 0.6318400502204895, + "learning_rate": 8.026037663551584e-06, + "loss": 0.7205, + "step": 10721 + }, + { + "epoch": 0.5901260388573945, + "grad_norm": 0.6612908840179443, + "learning_rate": 8.025692583391564e-06, + "loss": 0.7613, + "step": 10722 + }, + { + "epoch": 0.5901810776597501, + "grad_norm": 0.7555351853370667, + "learning_rate": 8.025347480491246e-06, + "loss": 0.718, + "step": 10723 + }, + { + "epoch": 0.5902361164621058, + "grad_norm": 0.6944366097450256, + "learning_rate": 8.025002354853227e-06, + "loss": 0.7775, + "step": 10724 + }, + { + "epoch": 0.5902911552644614, + "grad_norm": 0.6968230605125427, + "learning_rate": 8.0246572064801e-06, + "loss": 0.7316, + "step": 10725 + }, + { + "epoch": 0.5903461940668171, + "grad_norm": 0.7083567380905151, + "learning_rate": 8.024312035374459e-06, + "loss": 0.7844, + "step": 10726 + }, + { + "epoch": 0.5904012328691728, + "grad_norm": 0.7183080315589905, + "learning_rate": 8.0239668415389e-06, + "loss": 0.8308, + "step": 10727 + }, + { + "epoch": 0.5904562716715285, + "grad_norm": 0.8350495100021362, + "learning_rate": 8.023621624976014e-06, + "loss": 0.9077, + "step": 10728 + }, + { + "epoch": 0.590511310473884, + "grad_norm": 0.6876987218856812, + "learning_rate": 8.023276385688396e-06, + "loss": 0.7483, + "step": 10729 + }, + { + "epoch": 0.5905663492762397, + "grad_norm": 0.8617128133773804, + "learning_rate": 8.022931123678646e-06, + "loss": 0.7058, + "step": 10730 + }, + { + "epoch": 0.5906213880785954, + "grad_norm": 0.6921959519386292, + "learning_rate": 8.02258583894935e-06, + "loss": 0.7542, + "step": 10731 + }, + { + "epoch": 0.5906764268809511, + "grad_norm": 0.7394077181816101, + "learning_rate": 8.02224053150311e-06, + "loss": 0.7761, + "step": 10732 + }, + { + "epoch": 0.5907314656833067, + "grad_norm": 0.6672187447547913, + "learning_rate": 8.02189520134252e-06, + "loss": 0.6904, + "step": 10733 + }, + { + "epoch": 0.5907865044856624, + "grad_norm": 0.7498076558113098, + "learning_rate": 8.021549848470174e-06, + "loss": 0.7994, + "step": 10734 + }, + { + "epoch": 0.5908415432880181, + "grad_norm": 0.699832558631897, + "learning_rate": 8.021204472888669e-06, + "loss": 0.7413, + "step": 10735 + }, + { + "epoch": 0.5908965820903738, + "grad_norm": 0.7628722190856934, + "learning_rate": 8.020859074600598e-06, + "loss": 0.8202, + "step": 10736 + }, + { + "epoch": 0.5909516208927293, + "grad_norm": 0.8023744225502014, + "learning_rate": 8.020513653608558e-06, + "loss": 0.8225, + "step": 10737 + }, + { + "epoch": 0.591006659695085, + "grad_norm": 0.7283689379692078, + "learning_rate": 8.02016820991515e-06, + "loss": 0.6706, + "step": 10738 + }, + { + "epoch": 0.5910616984974407, + "grad_norm": 0.7199996113777161, + "learning_rate": 8.019822743522962e-06, + "loss": 0.8258, + "step": 10739 + }, + { + "epoch": 0.5911167372997963, + "grad_norm": 0.623249888420105, + "learning_rate": 8.019477254434598e-06, + "loss": 0.6188, + "step": 10740 + }, + { + "epoch": 0.591171776102152, + "grad_norm": 0.7331949472427368, + "learning_rate": 8.01913174265265e-06, + "loss": 0.8013, + "step": 10741 + }, + { + "epoch": 0.5912268149045077, + "grad_norm": 0.7003010511398315, + "learning_rate": 8.018786208179716e-06, + "loss": 0.8305, + "step": 10742 + }, + { + "epoch": 0.5912818537068634, + "grad_norm": 0.6879638433456421, + "learning_rate": 8.01844065101839e-06, + "loss": 0.7622, + "step": 10743 + }, + { + "epoch": 0.591336892509219, + "grad_norm": 0.6597324013710022, + "learning_rate": 8.018095071171276e-06, + "loss": 0.7362, + "step": 10744 + }, + { + "epoch": 0.5913919313115746, + "grad_norm": 0.664905846118927, + "learning_rate": 8.017749468640967e-06, + "loss": 0.7629, + "step": 10745 + }, + { + "epoch": 0.5914469701139303, + "grad_norm": 0.7358053922653198, + "learning_rate": 8.017403843430059e-06, + "loss": 0.7798, + "step": 10746 + }, + { + "epoch": 0.591502008916286, + "grad_norm": 0.699603259563446, + "learning_rate": 8.017058195541152e-06, + "loss": 0.6249, + "step": 10747 + }, + { + "epoch": 0.5915570477186416, + "grad_norm": 0.6736140847206116, + "learning_rate": 8.016712524976843e-06, + "loss": 0.6904, + "step": 10748 + }, + { + "epoch": 0.5916120865209973, + "grad_norm": 0.6803401112556458, + "learning_rate": 8.016366831739732e-06, + "loss": 0.6868, + "step": 10749 + }, + { + "epoch": 0.591667125323353, + "grad_norm": 0.7152959704399109, + "learning_rate": 8.016021115832413e-06, + "loss": 0.7747, + "step": 10750 + }, + { + "epoch": 0.5917221641257087, + "grad_norm": 0.6469255685806274, + "learning_rate": 8.015675377257489e-06, + "loss": 0.7309, + "step": 10751 + }, + { + "epoch": 0.5917772029280642, + "grad_norm": 0.7902734875679016, + "learning_rate": 8.015329616017554e-06, + "loss": 0.7575, + "step": 10752 + }, + { + "epoch": 0.5918322417304199, + "grad_norm": 0.7447189688682556, + "learning_rate": 8.014983832115208e-06, + "loss": 0.7759, + "step": 10753 + }, + { + "epoch": 0.5918872805327756, + "grad_norm": 0.6135374903678894, + "learning_rate": 8.014638025553053e-06, + "loss": 0.6681, + "step": 10754 + }, + { + "epoch": 0.5919423193351313, + "grad_norm": 0.8614835739135742, + "learning_rate": 8.014292196333684e-06, + "loss": 0.7203, + "step": 10755 + }, + { + "epoch": 0.5919973581374869, + "grad_norm": 0.7649008631706238, + "learning_rate": 8.013946344459703e-06, + "loss": 0.7966, + "step": 10756 + }, + { + "epoch": 0.5920523969398426, + "grad_norm": 1.0862764120101929, + "learning_rate": 8.013600469933707e-06, + "loss": 0.866, + "step": 10757 + }, + { + "epoch": 0.5921074357421983, + "grad_norm": 0.7304185628890991, + "learning_rate": 8.013254572758296e-06, + "loss": 0.7599, + "step": 10758 + }, + { + "epoch": 0.592162474544554, + "grad_norm": 0.6329634785652161, + "learning_rate": 8.012908652936072e-06, + "loss": 0.6855, + "step": 10759 + }, + { + "epoch": 0.5922175133469095, + "grad_norm": 0.6692202687263489, + "learning_rate": 8.012562710469631e-06, + "loss": 0.817, + "step": 10760 + }, + { + "epoch": 0.5922725521492652, + "grad_norm": 0.6577631235122681, + "learning_rate": 8.012216745361577e-06, + "loss": 0.7813, + "step": 10761 + }, + { + "epoch": 0.5923275909516209, + "grad_norm": 0.6877861022949219, + "learning_rate": 8.011870757614506e-06, + "loss": 0.7142, + "step": 10762 + }, + { + "epoch": 0.5923826297539766, + "grad_norm": 0.7132022380828857, + "learning_rate": 8.011524747231023e-06, + "loss": 0.747, + "step": 10763 + }, + { + "epoch": 0.5924376685563322, + "grad_norm": 0.7841360569000244, + "learning_rate": 8.011178714213726e-06, + "loss": 0.7511, + "step": 10764 + }, + { + "epoch": 0.5924927073586879, + "grad_norm": 0.8572794198989868, + "learning_rate": 8.010832658565215e-06, + "loss": 0.8704, + "step": 10765 + }, + { + "epoch": 0.5925477461610436, + "grad_norm": 0.6825506687164307, + "learning_rate": 8.010486580288092e-06, + "loss": 0.7472, + "step": 10766 + }, + { + "epoch": 0.5926027849633992, + "grad_norm": 0.7484591603279114, + "learning_rate": 8.010140479384957e-06, + "loss": 0.7679, + "step": 10767 + }, + { + "epoch": 0.5926578237657548, + "grad_norm": 0.712602436542511, + "learning_rate": 8.009794355858412e-06, + "loss": 0.7706, + "step": 10768 + }, + { + "epoch": 0.5927128625681105, + "grad_norm": 0.8911493420600891, + "learning_rate": 8.00944820971106e-06, + "loss": 0.8396, + "step": 10769 + }, + { + "epoch": 0.5927679013704662, + "grad_norm": 0.7300251126289368, + "learning_rate": 8.009102040945498e-06, + "loss": 0.7611, + "step": 10770 + }, + { + "epoch": 0.5928229401728219, + "grad_norm": 0.727343738079071, + "learning_rate": 8.008755849564333e-06, + "loss": 0.6785, + "step": 10771 + }, + { + "epoch": 0.5928779789751775, + "grad_norm": 0.8323808908462524, + "learning_rate": 8.008409635570163e-06, + "loss": 0.7429, + "step": 10772 + }, + { + "epoch": 0.5929330177775332, + "grad_norm": 0.6651942133903503, + "learning_rate": 8.00806339896559e-06, + "loss": 0.7683, + "step": 10773 + }, + { + "epoch": 0.5929880565798888, + "grad_norm": 0.7164554595947266, + "learning_rate": 8.007717139753222e-06, + "loss": 0.7742, + "step": 10774 + }, + { + "epoch": 0.5930430953822445, + "grad_norm": 0.6906408667564392, + "learning_rate": 8.007370857935654e-06, + "loss": 0.7322, + "step": 10775 + }, + { + "epoch": 0.5930981341846001, + "grad_norm": 0.6384999752044678, + "learning_rate": 8.007024553515493e-06, + "loss": 0.7011, + "step": 10776 + }, + { + "epoch": 0.5931531729869558, + "grad_norm": 0.6997355222702026, + "learning_rate": 8.006678226495338e-06, + "loss": 0.7303, + "step": 10777 + }, + { + "epoch": 0.5932082117893115, + "grad_norm": 0.6730707287788391, + "learning_rate": 8.006331876877797e-06, + "loss": 0.7461, + "step": 10778 + }, + { + "epoch": 0.5932632505916672, + "grad_norm": 0.7529115080833435, + "learning_rate": 8.00598550466547e-06, + "loss": 0.7487, + "step": 10779 + }, + { + "epoch": 0.5933182893940228, + "grad_norm": 0.7186329960823059, + "learning_rate": 8.00563910986096e-06, + "loss": 0.8025, + "step": 10780 + }, + { + "epoch": 0.5933733281963784, + "grad_norm": 0.7523752450942993, + "learning_rate": 8.005292692466869e-06, + "loss": 0.8291, + "step": 10781 + }, + { + "epoch": 0.5934283669987341, + "grad_norm": 1.182645559310913, + "learning_rate": 8.004946252485806e-06, + "loss": 0.8037, + "step": 10782 + }, + { + "epoch": 0.5934834058010897, + "grad_norm": 0.736570417881012, + "learning_rate": 8.004599789920369e-06, + "loss": 0.8259, + "step": 10783 + }, + { + "epoch": 0.5935384446034454, + "grad_norm": 0.757665753364563, + "learning_rate": 8.004253304773165e-06, + "loss": 0.7773, + "step": 10784 + }, + { + "epoch": 0.5935934834058011, + "grad_norm": 0.6988566517829895, + "learning_rate": 8.003906797046798e-06, + "loss": 0.7895, + "step": 10785 + }, + { + "epoch": 0.5936485222081568, + "grad_norm": 0.6921454071998596, + "learning_rate": 8.00356026674387e-06, + "loss": 0.8068, + "step": 10786 + }, + { + "epoch": 0.5937035610105124, + "grad_norm": 0.7053877115249634, + "learning_rate": 8.003213713866988e-06, + "loss": 0.7632, + "step": 10787 + }, + { + "epoch": 0.593758599812868, + "grad_norm": 0.8193650245666504, + "learning_rate": 8.002867138418757e-06, + "loss": 0.759, + "step": 10788 + }, + { + "epoch": 0.5938136386152237, + "grad_norm": 0.6089804768562317, + "learning_rate": 8.002520540401779e-06, + "loss": 0.7117, + "step": 10789 + }, + { + "epoch": 0.5938686774175794, + "grad_norm": 0.6869456768035889, + "learning_rate": 8.002173919818662e-06, + "loss": 0.7724, + "step": 10790 + }, + { + "epoch": 0.593923716219935, + "grad_norm": 0.7279118895530701, + "learning_rate": 8.001827276672007e-06, + "loss": 0.7578, + "step": 10791 + }, + { + "epoch": 0.5939787550222907, + "grad_norm": 0.6960133910179138, + "learning_rate": 8.00148061096442e-06, + "loss": 0.7887, + "step": 10792 + }, + { + "epoch": 0.5940337938246464, + "grad_norm": 0.6774740815162659, + "learning_rate": 8.001133922698511e-06, + "loss": 0.7146, + "step": 10793 + }, + { + "epoch": 0.5940888326270021, + "grad_norm": 0.6696349382400513, + "learning_rate": 8.000787211876883e-06, + "loss": 0.7829, + "step": 10794 + }, + { + "epoch": 0.5941438714293577, + "grad_norm": 1.5037024021148682, + "learning_rate": 8.000440478502142e-06, + "loss": 0.8198, + "step": 10795 + }, + { + "epoch": 0.5941989102317133, + "grad_norm": 0.7373353838920593, + "learning_rate": 8.000093722576893e-06, + "loss": 0.7864, + "step": 10796 + }, + { + "epoch": 0.594253949034069, + "grad_norm": 0.8120700120925903, + "learning_rate": 7.999746944103743e-06, + "loss": 0.7918, + "step": 10797 + }, + { + "epoch": 0.5943089878364247, + "grad_norm": 0.7669811844825745, + "learning_rate": 7.999400143085296e-06, + "loss": 0.751, + "step": 10798 + }, + { + "epoch": 0.5943640266387803, + "grad_norm": 0.8090860843658447, + "learning_rate": 7.999053319524163e-06, + "loss": 0.8387, + "step": 10799 + }, + { + "epoch": 0.594419065441136, + "grad_norm": 0.6994315385818481, + "learning_rate": 7.998706473422945e-06, + "loss": 0.7084, + "step": 10800 + }, + { + "epoch": 0.5944741042434917, + "grad_norm": 0.7913107872009277, + "learning_rate": 7.998359604784254e-06, + "loss": 0.7454, + "step": 10801 + }, + { + "epoch": 0.5945291430458474, + "grad_norm": 0.6831398010253906, + "learning_rate": 7.998012713610696e-06, + "loss": 0.7422, + "step": 10802 + }, + { + "epoch": 0.5945841818482029, + "grad_norm": 0.7324068546295166, + "learning_rate": 7.997665799904875e-06, + "loss": 0.7622, + "step": 10803 + }, + { + "epoch": 0.5946392206505586, + "grad_norm": 0.8192811012268066, + "learning_rate": 7.997318863669399e-06, + "loss": 0.7783, + "step": 10804 + }, + { + "epoch": 0.5946942594529143, + "grad_norm": 0.8008341789245605, + "learning_rate": 7.996971904906879e-06, + "loss": 0.7673, + "step": 10805 + }, + { + "epoch": 0.59474929825527, + "grad_norm": 0.6899568438529968, + "learning_rate": 7.99662492361992e-06, + "loss": 0.7477, + "step": 10806 + }, + { + "epoch": 0.5948043370576256, + "grad_norm": 0.7322555780410767, + "learning_rate": 7.996277919811132e-06, + "loss": 0.7673, + "step": 10807 + }, + { + "epoch": 0.5948593758599813, + "grad_norm": 1.008300542831421, + "learning_rate": 7.995930893483117e-06, + "loss": 0.7556, + "step": 10808 + }, + { + "epoch": 0.594914414662337, + "grad_norm": 0.7211925387382507, + "learning_rate": 7.99558384463849e-06, + "loss": 0.761, + "step": 10809 + }, + { + "epoch": 0.5949694534646927, + "grad_norm": 0.7143383622169495, + "learning_rate": 7.995236773279855e-06, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.5950244922670482, + "grad_norm": 0.7682802677154541, + "learning_rate": 7.994889679409825e-06, + "loss": 0.8538, + "step": 10811 + }, + { + "epoch": 0.5950795310694039, + "grad_norm": 0.6304698586463928, + "learning_rate": 7.994542563031004e-06, + "loss": 0.7343, + "step": 10812 + }, + { + "epoch": 0.5951345698717596, + "grad_norm": 0.6704440116882324, + "learning_rate": 7.994195424146002e-06, + "loss": 0.6921, + "step": 10813 + }, + { + "epoch": 0.5951896086741153, + "grad_norm": 0.8626209497451782, + "learning_rate": 7.99384826275743e-06, + "loss": 0.7049, + "step": 10814 + }, + { + "epoch": 0.5952446474764709, + "grad_norm": 0.810922384262085, + "learning_rate": 7.993501078867895e-06, + "loss": 0.793, + "step": 10815 + }, + { + "epoch": 0.5952996862788266, + "grad_norm": 0.8495855927467346, + "learning_rate": 7.993153872480009e-06, + "loss": 0.8078, + "step": 10816 + }, + { + "epoch": 0.5953547250811823, + "grad_norm": 0.7430331707000732, + "learning_rate": 7.992806643596378e-06, + "loss": 0.7957, + "step": 10817 + }, + { + "epoch": 0.595409763883538, + "grad_norm": 0.7188051342964172, + "learning_rate": 7.992459392219614e-06, + "loss": 0.725, + "step": 10818 + }, + { + "epoch": 0.5954648026858935, + "grad_norm": 0.7046926021575928, + "learning_rate": 7.992112118352326e-06, + "loss": 0.7438, + "step": 10819 + }, + { + "epoch": 0.5955198414882492, + "grad_norm": 0.7982804775238037, + "learning_rate": 7.991764821997123e-06, + "loss": 0.7046, + "step": 10820 + }, + { + "epoch": 0.5955748802906049, + "grad_norm": 0.6392245292663574, + "learning_rate": 7.991417503156618e-06, + "loss": 0.7413, + "step": 10821 + }, + { + "epoch": 0.5956299190929606, + "grad_norm": 0.7518960237503052, + "learning_rate": 7.99107016183342e-06, + "loss": 0.7661, + "step": 10822 + }, + { + "epoch": 0.5956849578953162, + "grad_norm": 0.7413721680641174, + "learning_rate": 7.99072279803014e-06, + "loss": 0.6538, + "step": 10823 + }, + { + "epoch": 0.5957399966976719, + "grad_norm": 0.7729454636573792, + "learning_rate": 7.990375411749384e-06, + "loss": 0.8056, + "step": 10824 + }, + { + "epoch": 0.5957950355000275, + "grad_norm": 0.8059296607971191, + "learning_rate": 7.99002800299377e-06, + "loss": 0.8699, + "step": 10825 + }, + { + "epoch": 0.5958500743023831, + "grad_norm": 0.5947105288505554, + "learning_rate": 7.989680571765907e-06, + "loss": 0.6481, + "step": 10826 + }, + { + "epoch": 0.5959051131047388, + "grad_norm": 0.7303743362426758, + "learning_rate": 7.989333118068404e-06, + "loss": 0.7401, + "step": 10827 + }, + { + "epoch": 0.5959601519070945, + "grad_norm": 0.7121400237083435, + "learning_rate": 7.988985641903873e-06, + "loss": 0.78, + "step": 10828 + }, + { + "epoch": 0.5960151907094502, + "grad_norm": 0.6921802163124084, + "learning_rate": 7.988638143274926e-06, + "loss": 0.7234, + "step": 10829 + }, + { + "epoch": 0.5960702295118058, + "grad_norm": 0.6715331673622131, + "learning_rate": 7.988290622184174e-06, + "loss": 0.7606, + "step": 10830 + }, + { + "epoch": 0.5961252683141615, + "grad_norm": 0.6315215229988098, + "learning_rate": 7.98794307863423e-06, + "loss": 0.6902, + "step": 10831 + }, + { + "epoch": 0.5961803071165171, + "grad_norm": 0.6884782314300537, + "learning_rate": 7.987595512627707e-06, + "loss": 0.7808, + "step": 10832 + }, + { + "epoch": 0.5962353459188728, + "grad_norm": 0.7050700783729553, + "learning_rate": 7.987247924167215e-06, + "loss": 0.7248, + "step": 10833 + }, + { + "epoch": 0.5962903847212284, + "grad_norm": 0.7232446074485779, + "learning_rate": 7.986900313255367e-06, + "loss": 0.8686, + "step": 10834 + }, + { + "epoch": 0.5963454235235841, + "grad_norm": 0.693631649017334, + "learning_rate": 7.986552679894778e-06, + "loss": 0.7567, + "step": 10835 + }, + { + "epoch": 0.5964004623259398, + "grad_norm": 0.6462356448173523, + "learning_rate": 7.986205024088054e-06, + "loss": 0.7091, + "step": 10836 + }, + { + "epoch": 0.5964555011282955, + "grad_norm": 0.7465559840202332, + "learning_rate": 7.985857345837814e-06, + "loss": 0.8965, + "step": 10837 + }, + { + "epoch": 0.5965105399306511, + "grad_norm": 0.6803271770477295, + "learning_rate": 7.985509645146672e-06, + "loss": 0.7602, + "step": 10838 + }, + { + "epoch": 0.5965655787330068, + "grad_norm": 1.1414798498153687, + "learning_rate": 7.985161922017238e-06, + "loss": 0.7806, + "step": 10839 + }, + { + "epoch": 0.5966206175353624, + "grad_norm": 0.6583230495452881, + "learning_rate": 7.984814176452123e-06, + "loss": 0.6727, + "step": 10840 + }, + { + "epoch": 0.5966756563377181, + "grad_norm": 0.6582550406455994, + "learning_rate": 7.984466408453946e-06, + "loss": 0.6794, + "step": 10841 + }, + { + "epoch": 0.5967306951400737, + "grad_norm": 0.8680793642997742, + "learning_rate": 7.984118618025318e-06, + "loss": 0.7999, + "step": 10842 + }, + { + "epoch": 0.5967857339424294, + "grad_norm": 0.772777795791626, + "learning_rate": 7.983770805168853e-06, + "loss": 0.6278, + "step": 10843 + }, + { + "epoch": 0.5968407727447851, + "grad_norm": 0.8099700808525085, + "learning_rate": 7.983422969887167e-06, + "loss": 0.7631, + "step": 10844 + }, + { + "epoch": 0.5968958115471408, + "grad_norm": 0.660271406173706, + "learning_rate": 7.983075112182871e-06, + "loss": 0.7557, + "step": 10845 + }, + { + "epoch": 0.5969508503494964, + "grad_norm": 0.7205530405044556, + "learning_rate": 7.982727232058582e-06, + "loss": 0.8258, + "step": 10846 + }, + { + "epoch": 0.597005889151852, + "grad_norm": 0.7925810813903809, + "learning_rate": 7.982379329516912e-06, + "loss": 0.7534, + "step": 10847 + }, + { + "epoch": 0.5970609279542077, + "grad_norm": 0.7255545854568481, + "learning_rate": 7.982031404560477e-06, + "loss": 0.8394, + "step": 10848 + }, + { + "epoch": 0.5971159667565634, + "grad_norm": 0.835394561290741, + "learning_rate": 7.981683457191893e-06, + "loss": 0.8384, + "step": 10849 + }, + { + "epoch": 0.597171005558919, + "grad_norm": 0.6781747937202454, + "learning_rate": 7.981335487413775e-06, + "loss": 0.8173, + "step": 10850 + }, + { + "epoch": 0.5972260443612747, + "grad_norm": 0.8602943420410156, + "learning_rate": 7.980987495228737e-06, + "loss": 0.8257, + "step": 10851 + }, + { + "epoch": 0.5972810831636304, + "grad_norm": 0.7157264947891235, + "learning_rate": 7.980639480639394e-06, + "loss": 0.7267, + "step": 10852 + }, + { + "epoch": 0.5973361219659861, + "grad_norm": 0.7695063352584839, + "learning_rate": 7.980291443648364e-06, + "loss": 0.7794, + "step": 10853 + }, + { + "epoch": 0.5973911607683416, + "grad_norm": 0.723971426486969, + "learning_rate": 7.979943384258262e-06, + "loss": 0.7761, + "step": 10854 + }, + { + "epoch": 0.5974461995706973, + "grad_norm": 0.691722571849823, + "learning_rate": 7.979595302471702e-06, + "loss": 0.7276, + "step": 10855 + }, + { + "epoch": 0.597501238373053, + "grad_norm": 0.7019701600074768, + "learning_rate": 7.9792471982913e-06, + "loss": 0.7965, + "step": 10856 + }, + { + "epoch": 0.5975562771754087, + "grad_norm": 0.6626996994018555, + "learning_rate": 7.978899071719675e-06, + "loss": 0.7124, + "step": 10857 + }, + { + "epoch": 0.5976113159777643, + "grad_norm": 0.6871625781059265, + "learning_rate": 7.978550922759443e-06, + "loss": 0.7742, + "step": 10858 + }, + { + "epoch": 0.59766635478012, + "grad_norm": 0.7153579592704773, + "learning_rate": 7.978202751413217e-06, + "loss": 0.7852, + "step": 10859 + }, + { + "epoch": 0.5977213935824757, + "grad_norm": 0.6891841292381287, + "learning_rate": 7.977854557683619e-06, + "loss": 0.7873, + "step": 10860 + }, + { + "epoch": 0.5977764323848314, + "grad_norm": 0.6864004731178284, + "learning_rate": 7.977506341573262e-06, + "loss": 0.7223, + "step": 10861 + }, + { + "epoch": 0.5978314711871869, + "grad_norm": 0.7163059115409851, + "learning_rate": 7.977158103084764e-06, + "loss": 0.679, + "step": 10862 + }, + { + "epoch": 0.5978865099895426, + "grad_norm": 0.6727336049079895, + "learning_rate": 7.976809842220742e-06, + "loss": 0.7148, + "step": 10863 + }, + { + "epoch": 0.5979415487918983, + "grad_norm": 0.672960638999939, + "learning_rate": 7.976461558983814e-06, + "loss": 0.7263, + "step": 10864 + }, + { + "epoch": 0.597996587594254, + "grad_norm": 0.9124444127082825, + "learning_rate": 7.976113253376601e-06, + "loss": 0.6876, + "step": 10865 + }, + { + "epoch": 0.5980516263966096, + "grad_norm": 0.6415041089057922, + "learning_rate": 7.975764925401715e-06, + "loss": 0.6655, + "step": 10866 + }, + { + "epoch": 0.5981066651989653, + "grad_norm": 0.7342595458030701, + "learning_rate": 7.975416575061776e-06, + "loss": 0.7753, + "step": 10867 + }, + { + "epoch": 0.598161704001321, + "grad_norm": 0.7161775231361389, + "learning_rate": 7.975068202359402e-06, + "loss": 0.7525, + "step": 10868 + }, + { + "epoch": 0.5982167428036765, + "grad_norm": 0.7087578773498535, + "learning_rate": 7.974719807297212e-06, + "loss": 0.7196, + "step": 10869 + }, + { + "epoch": 0.5982717816060322, + "grad_norm": 0.6472536325454712, + "learning_rate": 7.974371389877826e-06, + "loss": 0.6837, + "step": 10870 + }, + { + "epoch": 0.5983268204083879, + "grad_norm": 0.6625581383705139, + "learning_rate": 7.97402295010386e-06, + "loss": 0.6379, + "step": 10871 + }, + { + "epoch": 0.5983818592107436, + "grad_norm": 0.7621071934700012, + "learning_rate": 7.973674487977934e-06, + "loss": 0.8291, + "step": 10872 + }, + { + "epoch": 0.5984368980130992, + "grad_norm": 0.693394660949707, + "learning_rate": 7.973326003502666e-06, + "loss": 0.7677, + "step": 10873 + }, + { + "epoch": 0.5984919368154549, + "grad_norm": 0.6393985152244568, + "learning_rate": 7.972977496680674e-06, + "loss": 0.7058, + "step": 10874 + }, + { + "epoch": 0.5985469756178106, + "grad_norm": 0.7101462483406067, + "learning_rate": 7.972628967514582e-06, + "loss": 0.7396, + "step": 10875 + }, + { + "epoch": 0.5986020144201663, + "grad_norm": 0.8131522536277771, + "learning_rate": 7.972280416007003e-06, + "loss": 0.8461, + "step": 10876 + }, + { + "epoch": 0.5986570532225218, + "grad_norm": 0.7186655402183533, + "learning_rate": 7.971931842160564e-06, + "loss": 0.7721, + "step": 10877 + }, + { + "epoch": 0.5987120920248775, + "grad_norm": 0.7520855069160461, + "learning_rate": 7.971583245977877e-06, + "loss": 0.7733, + "step": 10878 + }, + { + "epoch": 0.5987671308272332, + "grad_norm": 0.6548848748207092, + "learning_rate": 7.971234627461569e-06, + "loss": 0.6555, + "step": 10879 + }, + { + "epoch": 0.5988221696295889, + "grad_norm": 0.7341775894165039, + "learning_rate": 7.970885986614254e-06, + "loss": 0.8292, + "step": 10880 + }, + { + "epoch": 0.5988772084319445, + "grad_norm": 0.7126352190971375, + "learning_rate": 7.970537323438556e-06, + "loss": 0.7704, + "step": 10881 + }, + { + "epoch": 0.5989322472343002, + "grad_norm": 0.7291527390480042, + "learning_rate": 7.970188637937097e-06, + "loss": 0.8175, + "step": 10882 + }, + { + "epoch": 0.5989872860366559, + "grad_norm": 0.682767927646637, + "learning_rate": 7.969839930112493e-06, + "loss": 0.8187, + "step": 10883 + }, + { + "epoch": 0.5990423248390115, + "grad_norm": 0.7820014953613281, + "learning_rate": 7.969491199967368e-06, + "loss": 0.7949, + "step": 10884 + }, + { + "epoch": 0.5990973636413671, + "grad_norm": 0.7257336974143982, + "learning_rate": 7.969142447504341e-06, + "loss": 0.8461, + "step": 10885 + }, + { + "epoch": 0.5991524024437228, + "grad_norm": 0.6813532114028931, + "learning_rate": 7.968793672726033e-06, + "loss": 0.7889, + "step": 10886 + }, + { + "epoch": 0.5992074412460785, + "grad_norm": 0.6868439316749573, + "learning_rate": 7.96844487563507e-06, + "loss": 0.7268, + "step": 10887 + }, + { + "epoch": 0.5992624800484342, + "grad_norm": 0.6547278761863708, + "learning_rate": 7.968096056234067e-06, + "loss": 0.7026, + "step": 10888 + }, + { + "epoch": 0.5993175188507898, + "grad_norm": 0.6704558730125427, + "learning_rate": 7.96774721452565e-06, + "loss": 0.6994, + "step": 10889 + }, + { + "epoch": 0.5993725576531455, + "grad_norm": 0.7134065628051758, + "learning_rate": 7.967398350512439e-06, + "loss": 0.7728, + "step": 10890 + }, + { + "epoch": 0.5994275964555011, + "grad_norm": 0.751265823841095, + "learning_rate": 7.967049464197056e-06, + "loss": 0.8421, + "step": 10891 + }, + { + "epoch": 0.5994826352578568, + "grad_norm": 0.8558571934700012, + "learning_rate": 7.966700555582125e-06, + "loss": 0.9144, + "step": 10892 + }, + { + "epoch": 0.5995376740602124, + "grad_norm": 0.8338084816932678, + "learning_rate": 7.966351624670263e-06, + "loss": 0.7502, + "step": 10893 + }, + { + "epoch": 0.5995927128625681, + "grad_norm": 0.7017131447792053, + "learning_rate": 7.9660026714641e-06, + "loss": 0.7778, + "step": 10894 + }, + { + "epoch": 0.5996477516649238, + "grad_norm": 0.7176111340522766, + "learning_rate": 7.965653695966253e-06, + "loss": 0.8478, + "step": 10895 + }, + { + "epoch": 0.5997027904672795, + "grad_norm": 0.7026060819625854, + "learning_rate": 7.965304698179349e-06, + "loss": 0.7111, + "step": 10896 + }, + { + "epoch": 0.5997578292696351, + "grad_norm": 0.6383810639381409, + "learning_rate": 7.964955678106005e-06, + "loss": 0.6429, + "step": 10897 + }, + { + "epoch": 0.5998128680719907, + "grad_norm": 0.8024059534072876, + "learning_rate": 7.96460663574885e-06, + "loss": 0.7308, + "step": 10898 + }, + { + "epoch": 0.5998679068743464, + "grad_norm": 0.7378466725349426, + "learning_rate": 7.964257571110504e-06, + "loss": 0.7593, + "step": 10899 + }, + { + "epoch": 0.5999229456767021, + "grad_norm": 0.7089043855667114, + "learning_rate": 7.963908484193593e-06, + "loss": 0.6862, + "step": 10900 + }, + { + "epoch": 0.5999779844790577, + "grad_norm": 0.765295684337616, + "learning_rate": 7.963559375000738e-06, + "loss": 0.6759, + "step": 10901 + }, + { + "epoch": 0.6000330232814134, + "grad_norm": 0.7040783166885376, + "learning_rate": 7.963210243534565e-06, + "loss": 0.7754, + "step": 10902 + }, + { + "epoch": 0.6000880620837691, + "grad_norm": 0.8593736886978149, + "learning_rate": 7.962861089797698e-06, + "loss": 0.8765, + "step": 10903 + }, + { + "epoch": 0.6001431008861248, + "grad_norm": 0.6613926291465759, + "learning_rate": 7.962511913792758e-06, + "loss": 0.6697, + "step": 10904 + }, + { + "epoch": 0.6001981396884803, + "grad_norm": 0.6369597911834717, + "learning_rate": 7.962162715522372e-06, + "loss": 0.7145, + "step": 10905 + }, + { + "epoch": 0.600253178490836, + "grad_norm": 1.1790162324905396, + "learning_rate": 7.961813494989164e-06, + "loss": 0.8067, + "step": 10906 + }, + { + "epoch": 0.6003082172931917, + "grad_norm": 0.7548268437385559, + "learning_rate": 7.961464252195759e-06, + "loss": 0.7936, + "step": 10907 + }, + { + "epoch": 0.6003632560955474, + "grad_norm": 0.6204384565353394, + "learning_rate": 7.961114987144781e-06, + "loss": 0.6374, + "step": 10908 + }, + { + "epoch": 0.600418294897903, + "grad_norm": 0.7149941921234131, + "learning_rate": 7.960765699838854e-06, + "loss": 0.8422, + "step": 10909 + }, + { + "epoch": 0.6004733337002587, + "grad_norm": 0.7040171027183533, + "learning_rate": 7.960416390280608e-06, + "loss": 0.8261, + "step": 10910 + }, + { + "epoch": 0.6005283725026144, + "grad_norm": 0.713591456413269, + "learning_rate": 7.960067058472663e-06, + "loss": 0.7908, + "step": 10911 + }, + { + "epoch": 0.60058341130497, + "grad_norm": 0.654086172580719, + "learning_rate": 7.959717704417645e-06, + "loss": 0.6971, + "step": 10912 + }, + { + "epoch": 0.6006384501073256, + "grad_norm": 0.7293223738670349, + "learning_rate": 7.959368328118183e-06, + "loss": 0.7032, + "step": 10913 + }, + { + "epoch": 0.6006934889096813, + "grad_norm": 0.705434262752533, + "learning_rate": 7.959018929576898e-06, + "loss": 0.7193, + "step": 10914 + }, + { + "epoch": 0.600748527712037, + "grad_norm": 0.7406907677650452, + "learning_rate": 7.958669508796422e-06, + "loss": 0.8464, + "step": 10915 + }, + { + "epoch": 0.6008035665143926, + "grad_norm": 0.6683858036994934, + "learning_rate": 7.958320065779377e-06, + "loss": 0.699, + "step": 10916 + }, + { + "epoch": 0.6008586053167483, + "grad_norm": 0.7380560636520386, + "learning_rate": 7.95797060052839e-06, + "loss": 0.7409, + "step": 10917 + }, + { + "epoch": 0.600913644119104, + "grad_norm": 0.7729377746582031, + "learning_rate": 7.957621113046088e-06, + "loss": 0.8838, + "step": 10918 + }, + { + "epoch": 0.6009686829214597, + "grad_norm": 0.6842743158340454, + "learning_rate": 7.957271603335097e-06, + "loss": 0.781, + "step": 10919 + }, + { + "epoch": 0.6010237217238152, + "grad_norm": 0.6864648461341858, + "learning_rate": 7.956922071398045e-06, + "loss": 0.6717, + "step": 10920 + }, + { + "epoch": 0.6010787605261709, + "grad_norm": 0.7718262672424316, + "learning_rate": 7.956572517237557e-06, + "loss": 0.8023, + "step": 10921 + }, + { + "epoch": 0.6011337993285266, + "grad_norm": 0.686338484287262, + "learning_rate": 7.956222940856261e-06, + "loss": 0.7139, + "step": 10922 + }, + { + "epoch": 0.6011888381308823, + "grad_norm": 0.7064465284347534, + "learning_rate": 7.955873342256789e-06, + "loss": 0.845, + "step": 10923 + }, + { + "epoch": 0.6012438769332379, + "grad_norm": 0.6847875714302063, + "learning_rate": 7.955523721441761e-06, + "loss": 0.7078, + "step": 10924 + }, + { + "epoch": 0.6012989157355936, + "grad_norm": 0.6879494786262512, + "learning_rate": 7.955174078413806e-06, + "loss": 0.7532, + "step": 10925 + }, + { + "epoch": 0.6013539545379493, + "grad_norm": 0.6569855213165283, + "learning_rate": 7.954824413175554e-06, + "loss": 0.7529, + "step": 10926 + }, + { + "epoch": 0.601408993340305, + "grad_norm": 0.6225974559783936, + "learning_rate": 7.954474725729635e-06, + "loss": 0.6595, + "step": 10927 + }, + { + "epoch": 0.6014640321426605, + "grad_norm": 0.7067761421203613, + "learning_rate": 7.954125016078675e-06, + "loss": 0.7851, + "step": 10928 + }, + { + "epoch": 0.6015190709450162, + "grad_norm": 0.683030903339386, + "learning_rate": 7.9537752842253e-06, + "loss": 0.7461, + "step": 10929 + }, + { + "epoch": 0.6015741097473719, + "grad_norm": 0.6411080956459045, + "learning_rate": 7.953425530172143e-06, + "loss": 0.6945, + "step": 10930 + }, + { + "epoch": 0.6016291485497276, + "grad_norm": 0.6254550814628601, + "learning_rate": 7.953075753921829e-06, + "loss": 0.7143, + "step": 10931 + }, + { + "epoch": 0.6016841873520832, + "grad_norm": 0.684100866317749, + "learning_rate": 7.952725955476987e-06, + "loss": 0.8137, + "step": 10932 + }, + { + "epoch": 0.6017392261544389, + "grad_norm": 0.6341036558151245, + "learning_rate": 7.95237613484025e-06, + "loss": 0.6692, + "step": 10933 + }, + { + "epoch": 0.6017942649567946, + "grad_norm": 0.7311153411865234, + "learning_rate": 7.952026292014242e-06, + "loss": 0.7091, + "step": 10934 + }, + { + "epoch": 0.6018493037591502, + "grad_norm": 0.7265943884849548, + "learning_rate": 7.951676427001596e-06, + "loss": 0.765, + "step": 10935 + }, + { + "epoch": 0.6019043425615058, + "grad_norm": 0.8777397274971008, + "learning_rate": 7.951326539804938e-06, + "loss": 0.7824, + "step": 10936 + }, + { + "epoch": 0.6019593813638615, + "grad_norm": 0.7241179347038269, + "learning_rate": 7.9509766304269e-06, + "loss": 0.7913, + "step": 10937 + }, + { + "epoch": 0.6020144201662172, + "grad_norm": 0.8090667128562927, + "learning_rate": 7.950626698870113e-06, + "loss": 0.8208, + "step": 10938 + }, + { + "epoch": 0.6020694589685729, + "grad_norm": 0.7376043796539307, + "learning_rate": 7.950276745137206e-06, + "loss": 0.7176, + "step": 10939 + }, + { + "epoch": 0.6021244977709285, + "grad_norm": 0.7149157524108887, + "learning_rate": 7.949926769230809e-06, + "loss": 0.7949, + "step": 10940 + }, + { + "epoch": 0.6021795365732842, + "grad_norm": 0.8721579909324646, + "learning_rate": 7.949576771153549e-06, + "loss": 0.8433, + "step": 10941 + }, + { + "epoch": 0.6022345753756398, + "grad_norm": 0.7946182489395142, + "learning_rate": 7.949226750908062e-06, + "loss": 0.7412, + "step": 10942 + }, + { + "epoch": 0.6022896141779955, + "grad_norm": 0.6661237478256226, + "learning_rate": 7.948876708496975e-06, + "loss": 0.725, + "step": 10943 + }, + { + "epoch": 0.6023446529803511, + "grad_norm": 0.8346213698387146, + "learning_rate": 7.948526643922922e-06, + "loss": 0.6817, + "step": 10944 + }, + { + "epoch": 0.6023996917827068, + "grad_norm": 0.7911655306816101, + "learning_rate": 7.94817655718853e-06, + "loss": 0.7398, + "step": 10945 + }, + { + "epoch": 0.6024547305850625, + "grad_norm": 0.6480078101158142, + "learning_rate": 7.947826448296432e-06, + "loss": 0.6822, + "step": 10946 + }, + { + "epoch": 0.6025097693874182, + "grad_norm": 0.6950085759162903, + "learning_rate": 7.94747631724926e-06, + "loss": 0.8073, + "step": 10947 + }, + { + "epoch": 0.6025648081897738, + "grad_norm": 0.7142168879508972, + "learning_rate": 7.947126164049645e-06, + "loss": 0.6159, + "step": 10948 + }, + { + "epoch": 0.6026198469921294, + "grad_norm": 0.7459015846252441, + "learning_rate": 7.946775988700219e-06, + "loss": 0.8377, + "step": 10949 + }, + { + "epoch": 0.6026748857944851, + "grad_norm": 1.050179362297058, + "learning_rate": 7.946425791203614e-06, + "loss": 0.8098, + "step": 10950 + }, + { + "epoch": 0.6027299245968408, + "grad_norm": 0.7473265528678894, + "learning_rate": 7.94607557156246e-06, + "loss": 0.6846, + "step": 10951 + }, + { + "epoch": 0.6027849633991964, + "grad_norm": 0.7990789413452148, + "learning_rate": 7.945725329779392e-06, + "loss": 0.8216, + "step": 10952 + }, + { + "epoch": 0.6028400022015521, + "grad_norm": 0.6461700201034546, + "learning_rate": 7.94537506585704e-06, + "loss": 0.7864, + "step": 10953 + }, + { + "epoch": 0.6028950410039078, + "grad_norm": 0.661123514175415, + "learning_rate": 7.945024779798038e-06, + "loss": 0.7466, + "step": 10954 + }, + { + "epoch": 0.6029500798062634, + "grad_norm": 0.6998088359832764, + "learning_rate": 7.944674471605018e-06, + "loss": 0.7846, + "step": 10955 + }, + { + "epoch": 0.603005118608619, + "grad_norm": 0.6917386651039124, + "learning_rate": 7.944324141280613e-06, + "loss": 0.7699, + "step": 10956 + }, + { + "epoch": 0.6030601574109747, + "grad_norm": 0.7304503321647644, + "learning_rate": 7.943973788827455e-06, + "loss": 0.8015, + "step": 10957 + }, + { + "epoch": 0.6031151962133304, + "grad_norm": 0.7996858358383179, + "learning_rate": 7.94362341424818e-06, + "loss": 0.7093, + "step": 10958 + }, + { + "epoch": 0.603170235015686, + "grad_norm": 0.7445322871208191, + "learning_rate": 7.943273017545419e-06, + "loss": 0.7388, + "step": 10959 + }, + { + "epoch": 0.6032252738180417, + "grad_norm": 0.6672174334526062, + "learning_rate": 7.942922598721805e-06, + "loss": 0.7703, + "step": 10960 + }, + { + "epoch": 0.6032803126203974, + "grad_norm": 0.7313557267189026, + "learning_rate": 7.94257215777997e-06, + "loss": 0.6637, + "step": 10961 + }, + { + "epoch": 0.6033353514227531, + "grad_norm": 0.7248823642730713, + "learning_rate": 7.942221694722553e-06, + "loss": 0.836, + "step": 10962 + }, + { + "epoch": 0.6033903902251087, + "grad_norm": 0.6583372354507446, + "learning_rate": 7.941871209552187e-06, + "loss": 0.7582, + "step": 10963 + }, + { + "epoch": 0.6034454290274643, + "grad_norm": 0.7502591013908386, + "learning_rate": 7.941520702271503e-06, + "loss": 0.7455, + "step": 10964 + }, + { + "epoch": 0.60350046782982, + "grad_norm": 0.6899349689483643, + "learning_rate": 7.941170172883135e-06, + "loss": 0.7677, + "step": 10965 + }, + { + "epoch": 0.6035555066321757, + "grad_norm": 0.693321943283081, + "learning_rate": 7.940819621389722e-06, + "loss": 0.7754, + "step": 10966 + }, + { + "epoch": 0.6036105454345313, + "grad_norm": 0.7376342415809631, + "learning_rate": 7.940469047793893e-06, + "loss": 0.7761, + "step": 10967 + }, + { + "epoch": 0.603665584236887, + "grad_norm": 0.6377952694892883, + "learning_rate": 7.940118452098289e-06, + "loss": 0.6612, + "step": 10968 + }, + { + "epoch": 0.6037206230392427, + "grad_norm": 0.8041388988494873, + "learning_rate": 7.939767834305538e-06, + "loss": 0.8358, + "step": 10969 + }, + { + "epoch": 0.6037756618415984, + "grad_norm": 1.5993521213531494, + "learning_rate": 7.939417194418282e-06, + "loss": 0.8536, + "step": 10970 + }, + { + "epoch": 0.6038307006439539, + "grad_norm": 0.6718295216560364, + "learning_rate": 7.939066532439153e-06, + "loss": 0.717, + "step": 10971 + }, + { + "epoch": 0.6038857394463096, + "grad_norm": 0.7951062917709351, + "learning_rate": 7.938715848370787e-06, + "loss": 0.6919, + "step": 10972 + }, + { + "epoch": 0.6039407782486653, + "grad_norm": 0.707804262638092, + "learning_rate": 7.938365142215816e-06, + "loss": 0.7346, + "step": 10973 + }, + { + "epoch": 0.603995817051021, + "grad_norm": 0.7244500517845154, + "learning_rate": 7.938014413976883e-06, + "loss": 0.708, + "step": 10974 + }, + { + "epoch": 0.6040508558533766, + "grad_norm": 0.7533566951751709, + "learning_rate": 7.937663663656617e-06, + "loss": 0.6761, + "step": 10975 + }, + { + "epoch": 0.6041058946557323, + "grad_norm": 0.8844665288925171, + "learning_rate": 7.93731289125766e-06, + "loss": 0.7833, + "step": 10976 + }, + { + "epoch": 0.604160933458088, + "grad_norm": 0.6413047313690186, + "learning_rate": 7.936962096782643e-06, + "loss": 0.7175, + "step": 10977 + }, + { + "epoch": 0.6042159722604437, + "grad_norm": 0.765943706035614, + "learning_rate": 7.936611280234206e-06, + "loss": 0.7654, + "step": 10978 + }, + { + "epoch": 0.6042710110627992, + "grad_norm": 0.6833398938179016, + "learning_rate": 7.936260441614985e-06, + "loss": 0.7459, + "step": 10979 + }, + { + "epoch": 0.6043260498651549, + "grad_norm": 0.6363481283187866, + "learning_rate": 7.935909580927617e-06, + "loss": 0.7173, + "step": 10980 + }, + { + "epoch": 0.6043810886675106, + "grad_norm": 0.7731046080589294, + "learning_rate": 7.935558698174738e-06, + "loss": 0.8428, + "step": 10981 + }, + { + "epoch": 0.6044361274698663, + "grad_norm": 0.7346602082252502, + "learning_rate": 7.935207793358986e-06, + "loss": 0.832, + "step": 10982 + }, + { + "epoch": 0.6044911662722219, + "grad_norm": 0.6711193919181824, + "learning_rate": 7.934856866482998e-06, + "loss": 0.742, + "step": 10983 + }, + { + "epoch": 0.6045462050745776, + "grad_norm": 0.6931266784667969, + "learning_rate": 7.934505917549411e-06, + "loss": 0.7779, + "step": 10984 + }, + { + "epoch": 0.6046012438769333, + "grad_norm": 0.7624725699424744, + "learning_rate": 7.934154946560862e-06, + "loss": 0.7229, + "step": 10985 + }, + { + "epoch": 0.604656282679289, + "grad_norm": 0.6594272255897522, + "learning_rate": 7.933803953519991e-06, + "loss": 0.7776, + "step": 10986 + }, + { + "epoch": 0.6047113214816445, + "grad_norm": 0.674521803855896, + "learning_rate": 7.933452938429435e-06, + "loss": 0.6904, + "step": 10987 + }, + { + "epoch": 0.6047663602840002, + "grad_norm": 0.7352569699287415, + "learning_rate": 7.933101901291831e-06, + "loss": 0.7655, + "step": 10988 + }, + { + "epoch": 0.6048213990863559, + "grad_norm": 0.8560347557067871, + "learning_rate": 7.932750842109817e-06, + "loss": 0.7894, + "step": 10989 + }, + { + "epoch": 0.6048764378887116, + "grad_norm": 0.769496500492096, + "learning_rate": 7.932399760886037e-06, + "loss": 0.8255, + "step": 10990 + }, + { + "epoch": 0.6049314766910672, + "grad_norm": 0.9399588108062744, + "learning_rate": 7.932048657623122e-06, + "loss": 0.8554, + "step": 10991 + }, + { + "epoch": 0.6049865154934229, + "grad_norm": 0.6662001609802246, + "learning_rate": 7.931697532323716e-06, + "loss": 0.7788, + "step": 10992 + }, + { + "epoch": 0.6050415542957785, + "grad_norm": 0.758263111114502, + "learning_rate": 7.931346384990455e-06, + "loss": 0.7907, + "step": 10993 + }, + { + "epoch": 0.6050965930981342, + "grad_norm": 0.7283937335014343, + "learning_rate": 7.930995215625978e-06, + "loss": 0.8415, + "step": 10994 + }, + { + "epoch": 0.6051516319004898, + "grad_norm": 0.6611599922180176, + "learning_rate": 7.930644024232927e-06, + "loss": 0.7145, + "step": 10995 + }, + { + "epoch": 0.6052066707028455, + "grad_norm": 0.8450857400894165, + "learning_rate": 7.93029281081394e-06, + "loss": 0.7208, + "step": 10996 + }, + { + "epoch": 0.6052617095052012, + "grad_norm": 0.649010181427002, + "learning_rate": 7.929941575371655e-06, + "loss": 0.6928, + "step": 10997 + }, + { + "epoch": 0.6053167483075568, + "grad_norm": 0.7022100687026978, + "learning_rate": 7.929590317908718e-06, + "loss": 0.7329, + "step": 10998 + }, + { + "epoch": 0.6053717871099125, + "grad_norm": 0.768598198890686, + "learning_rate": 7.92923903842776e-06, + "loss": 0.7799, + "step": 10999 + }, + { + "epoch": 0.6054268259122682, + "grad_norm": 0.6648436784744263, + "learning_rate": 7.928887736931428e-06, + "loss": 0.7728, + "step": 11000 + }, + { + "epoch": 0.6054818647146238, + "grad_norm": 0.6946157813072205, + "learning_rate": 7.928536413422357e-06, + "loss": 0.7609, + "step": 11001 + }, + { + "epoch": 0.6055369035169794, + "grad_norm": 0.7779337167739868, + "learning_rate": 7.928185067903191e-06, + "loss": 0.7679, + "step": 11002 + }, + { + "epoch": 0.6055919423193351, + "grad_norm": 0.6520814895629883, + "learning_rate": 7.927833700376573e-06, + "loss": 0.6734, + "step": 11003 + }, + { + "epoch": 0.6056469811216908, + "grad_norm": 0.7724258899688721, + "learning_rate": 7.927482310845138e-06, + "loss": 0.7564, + "step": 11004 + }, + { + "epoch": 0.6057020199240465, + "grad_norm": 0.6649174690246582, + "learning_rate": 7.927130899311529e-06, + "loss": 0.7217, + "step": 11005 + }, + { + "epoch": 0.6057570587264021, + "grad_norm": 0.6807287931442261, + "learning_rate": 7.926779465778389e-06, + "loss": 0.6966, + "step": 11006 + }, + { + "epoch": 0.6058120975287578, + "grad_norm": 0.6644826531410217, + "learning_rate": 7.926428010248357e-06, + "loss": 0.7238, + "step": 11007 + }, + { + "epoch": 0.6058671363311134, + "grad_norm": 0.7533535957336426, + "learning_rate": 7.926076532724077e-06, + "loss": 0.855, + "step": 11008 + }, + { + "epoch": 0.6059221751334691, + "grad_norm": 0.6457169055938721, + "learning_rate": 7.925725033208187e-06, + "loss": 0.6717, + "step": 11009 + }, + { + "epoch": 0.6059772139358247, + "grad_norm": 0.724719762802124, + "learning_rate": 7.925373511703332e-06, + "loss": 0.8701, + "step": 11010 + }, + { + "epoch": 0.6060322527381804, + "grad_norm": 0.746755063533783, + "learning_rate": 7.925021968212153e-06, + "loss": 0.8509, + "step": 11011 + }, + { + "epoch": 0.6060872915405361, + "grad_norm": 0.7377174496650696, + "learning_rate": 7.924670402737292e-06, + "loss": 0.8053, + "step": 11012 + }, + { + "epoch": 0.6061423303428918, + "grad_norm": 0.9791839718818665, + "learning_rate": 7.92431881528139e-06, + "loss": 0.7893, + "step": 11013 + }, + { + "epoch": 0.6061973691452474, + "grad_norm": 0.7472195029258728, + "learning_rate": 7.923967205847089e-06, + "loss": 0.7195, + "step": 11014 + }, + { + "epoch": 0.606252407947603, + "grad_norm": 0.672851026058197, + "learning_rate": 7.923615574437037e-06, + "loss": 0.8234, + "step": 11015 + }, + { + "epoch": 0.6063074467499587, + "grad_norm": 0.739942729473114, + "learning_rate": 7.923263921053872e-06, + "loss": 0.8582, + "step": 11016 + }, + { + "epoch": 0.6063624855523144, + "grad_norm": 0.7337772846221924, + "learning_rate": 7.922912245700236e-06, + "loss": 0.8008, + "step": 11017 + }, + { + "epoch": 0.60641752435467, + "grad_norm": 0.6707174777984619, + "learning_rate": 7.922560548378774e-06, + "loss": 0.8531, + "step": 11018 + }, + { + "epoch": 0.6064725631570257, + "grad_norm": 0.6783839464187622, + "learning_rate": 7.922208829092133e-06, + "loss": 0.7963, + "step": 11019 + }, + { + "epoch": 0.6065276019593814, + "grad_norm": 0.6133253574371338, + "learning_rate": 7.92185708784295e-06, + "loss": 0.7375, + "step": 11020 + }, + { + "epoch": 0.6065826407617371, + "grad_norm": 0.8300097584724426, + "learning_rate": 7.921505324633868e-06, + "loss": 0.7976, + "step": 11021 + }, + { + "epoch": 0.6066376795640926, + "grad_norm": 0.6800658702850342, + "learning_rate": 7.921153539467538e-06, + "loss": 0.7321, + "step": 11022 + }, + { + "epoch": 0.6066927183664483, + "grad_norm": 0.6849787831306458, + "learning_rate": 7.920801732346602e-06, + "loss": 0.7134, + "step": 11023 + }, + { + "epoch": 0.606747757168804, + "grad_norm": 0.7675080895423889, + "learning_rate": 7.920449903273697e-06, + "loss": 0.7402, + "step": 11024 + }, + { + "epoch": 0.6068027959711597, + "grad_norm": 0.7431055903434753, + "learning_rate": 7.920098052251476e-06, + "loss": 0.7872, + "step": 11025 + }, + { + "epoch": 0.6068578347735153, + "grad_norm": 0.6264036297798157, + "learning_rate": 7.919746179282577e-06, + "loss": 0.7496, + "step": 11026 + }, + { + "epoch": 0.606912873575871, + "grad_norm": 0.7800843715667725, + "learning_rate": 7.919394284369648e-06, + "loss": 0.7917, + "step": 11027 + }, + { + "epoch": 0.6069679123782267, + "grad_norm": 0.7665574550628662, + "learning_rate": 7.919042367515336e-06, + "loss": 0.7905, + "step": 11028 + }, + { + "epoch": 0.6070229511805824, + "grad_norm": 0.7473214864730835, + "learning_rate": 7.918690428722279e-06, + "loss": 0.7732, + "step": 11029 + }, + { + "epoch": 0.6070779899829379, + "grad_norm": 0.6717211008071899, + "learning_rate": 7.918338467993127e-06, + "loss": 0.8221, + "step": 11030 + }, + { + "epoch": 0.6071330287852936, + "grad_norm": 0.6745431423187256, + "learning_rate": 7.917986485330525e-06, + "loss": 0.6899, + "step": 11031 + }, + { + "epoch": 0.6071880675876493, + "grad_norm": 0.6838263273239136, + "learning_rate": 7.917634480737117e-06, + "loss": 0.7133, + "step": 11032 + }, + { + "epoch": 0.607243106390005, + "grad_norm": 0.7975682020187378, + "learning_rate": 7.91728245421555e-06, + "loss": 0.8283, + "step": 11033 + }, + { + "epoch": 0.6072981451923606, + "grad_norm": 0.7112031579017639, + "learning_rate": 7.916930405768468e-06, + "loss": 0.7423, + "step": 11034 + }, + { + "epoch": 0.6073531839947163, + "grad_norm": 0.7006776928901672, + "learning_rate": 7.91657833539852e-06, + "loss": 0.716, + "step": 11035 + }, + { + "epoch": 0.607408222797072, + "grad_norm": 0.7523549795150757, + "learning_rate": 7.916226243108348e-06, + "loss": 0.8591, + "step": 11036 + }, + { + "epoch": 0.6074632615994277, + "grad_norm": 0.7257835268974304, + "learning_rate": 7.9158741289006e-06, + "loss": 0.7471, + "step": 11037 + }, + { + "epoch": 0.6075183004017832, + "grad_norm": 0.8100149631500244, + "learning_rate": 7.915521992777922e-06, + "loss": 0.8373, + "step": 11038 + }, + { + "epoch": 0.6075733392041389, + "grad_norm": 0.7781035304069519, + "learning_rate": 7.915169834742964e-06, + "loss": 0.8471, + "step": 11039 + }, + { + "epoch": 0.6076283780064946, + "grad_norm": 0.7426049709320068, + "learning_rate": 7.914817654798368e-06, + "loss": 0.753, + "step": 11040 + }, + { + "epoch": 0.6076834168088502, + "grad_norm": 0.6990010738372803, + "learning_rate": 7.914465452946782e-06, + "loss": 0.7556, + "step": 11041 + }, + { + "epoch": 0.6077384556112059, + "grad_norm": 0.8038754463195801, + "learning_rate": 7.914113229190856e-06, + "loss": 0.7787, + "step": 11042 + }, + { + "epoch": 0.6077934944135616, + "grad_norm": 0.6434115767478943, + "learning_rate": 7.913760983533233e-06, + "loss": 0.7831, + "step": 11043 + }, + { + "epoch": 0.6078485332159173, + "grad_norm": 0.8119033575057983, + "learning_rate": 7.913408715976562e-06, + "loss": 0.7691, + "step": 11044 + }, + { + "epoch": 0.6079035720182728, + "grad_norm": 0.6710149049758911, + "learning_rate": 7.913056426523493e-06, + "loss": 0.7542, + "step": 11045 + }, + { + "epoch": 0.6079586108206285, + "grad_norm": 0.7458183765411377, + "learning_rate": 7.912704115176671e-06, + "loss": 0.7673, + "step": 11046 + }, + { + "epoch": 0.6080136496229842, + "grad_norm": 0.8061705827713013, + "learning_rate": 7.912351781938745e-06, + "loss": 0.9255, + "step": 11047 + }, + { + "epoch": 0.6080686884253399, + "grad_norm": 0.7193130850791931, + "learning_rate": 7.91199942681236e-06, + "loss": 0.8154, + "step": 11048 + }, + { + "epoch": 0.6081237272276955, + "grad_norm": 0.7785167098045349, + "learning_rate": 7.911647049800171e-06, + "loss": 0.7747, + "step": 11049 + }, + { + "epoch": 0.6081787660300512, + "grad_norm": 0.665765106678009, + "learning_rate": 7.911294650904818e-06, + "loss": 0.7573, + "step": 11050 + }, + { + "epoch": 0.6082338048324069, + "grad_norm": 0.7940623760223389, + "learning_rate": 7.910942230128956e-06, + "loss": 0.6628, + "step": 11051 + }, + { + "epoch": 0.6082888436347625, + "grad_norm": 0.8364549875259399, + "learning_rate": 7.910589787475232e-06, + "loss": 0.8103, + "step": 11052 + }, + { + "epoch": 0.6083438824371181, + "grad_norm": 0.6153101325035095, + "learning_rate": 7.910237322946292e-06, + "loss": 0.76, + "step": 11053 + }, + { + "epoch": 0.6083989212394738, + "grad_norm": 0.8381257653236389, + "learning_rate": 7.909884836544789e-06, + "loss": 0.8366, + "step": 11054 + }, + { + "epoch": 0.6084539600418295, + "grad_norm": 0.6602391600608826, + "learning_rate": 7.90953232827337e-06, + "loss": 0.7389, + "step": 11055 + }, + { + "epoch": 0.6085089988441852, + "grad_norm": 0.7329971194267273, + "learning_rate": 7.909179798134685e-06, + "loss": 0.8217, + "step": 11056 + }, + { + "epoch": 0.6085640376465408, + "grad_norm": 0.7319926023483276, + "learning_rate": 7.908827246131383e-06, + "loss": 0.78, + "step": 11057 + }, + { + "epoch": 0.6086190764488965, + "grad_norm": 0.6491387486457825, + "learning_rate": 7.908474672266114e-06, + "loss": 0.7496, + "step": 11058 + }, + { + "epoch": 0.6086741152512521, + "grad_norm": 0.656434953212738, + "learning_rate": 7.908122076541529e-06, + "loss": 0.7462, + "step": 11059 + }, + { + "epoch": 0.6087291540536078, + "grad_norm": 0.6908577680587769, + "learning_rate": 7.907769458960275e-06, + "loss": 0.7505, + "step": 11060 + }, + { + "epoch": 0.6087841928559634, + "grad_norm": 0.774424135684967, + "learning_rate": 7.907416819525007e-06, + "loss": 0.8275, + "step": 11061 + }, + { + "epoch": 0.6088392316583191, + "grad_norm": 0.6796718835830688, + "learning_rate": 7.90706415823837e-06, + "loss": 0.7606, + "step": 11062 + }, + { + "epoch": 0.6088942704606748, + "grad_norm": 0.9576514959335327, + "learning_rate": 7.906711475103016e-06, + "loss": 0.807, + "step": 11063 + }, + { + "epoch": 0.6089493092630305, + "grad_norm": 0.9848490953445435, + "learning_rate": 7.9063587701216e-06, + "loss": 0.7856, + "step": 11064 + }, + { + "epoch": 0.6090043480653861, + "grad_norm": 0.9490165710449219, + "learning_rate": 7.906006043296768e-06, + "loss": 0.8519, + "step": 11065 + }, + { + "epoch": 0.6090593868677417, + "grad_norm": 0.631382942199707, + "learning_rate": 7.905653294631172e-06, + "loss": 0.7041, + "step": 11066 + }, + { + "epoch": 0.6091144256700974, + "grad_norm": 0.6969574093818665, + "learning_rate": 7.905300524127464e-06, + "loss": 0.7556, + "step": 11067 + }, + { + "epoch": 0.6091694644724531, + "grad_norm": 0.6990532279014587, + "learning_rate": 7.904947731788295e-06, + "loss": 0.799, + "step": 11068 + }, + { + "epoch": 0.6092245032748087, + "grad_norm": 0.7216916084289551, + "learning_rate": 7.904594917616315e-06, + "loss": 0.7617, + "step": 11069 + }, + { + "epoch": 0.6092795420771644, + "grad_norm": 0.6874147653579712, + "learning_rate": 7.904242081614179e-06, + "loss": 0.7616, + "step": 11070 + }, + { + "epoch": 0.6093345808795201, + "grad_norm": 0.6909550428390503, + "learning_rate": 7.903889223784535e-06, + "loss": 0.7649, + "step": 11071 + }, + { + "epoch": 0.6093896196818758, + "grad_norm": 0.7796370387077332, + "learning_rate": 7.90353634413004e-06, + "loss": 0.7557, + "step": 11072 + }, + { + "epoch": 0.6094446584842313, + "grad_norm": 0.807448148727417, + "learning_rate": 7.903183442653341e-06, + "loss": 0.7519, + "step": 11073 + }, + { + "epoch": 0.609499697286587, + "grad_norm": 0.846371054649353, + "learning_rate": 7.902830519357092e-06, + "loss": 0.9342, + "step": 11074 + }, + { + "epoch": 0.6095547360889427, + "grad_norm": 1.0386929512023926, + "learning_rate": 7.902477574243947e-06, + "loss": 0.6802, + "step": 11075 + }, + { + "epoch": 0.6096097748912984, + "grad_norm": 0.8011854887008667, + "learning_rate": 7.902124607316558e-06, + "loss": 0.7756, + "step": 11076 + }, + { + "epoch": 0.609664813693654, + "grad_norm": 0.6560170650482178, + "learning_rate": 7.901771618577574e-06, + "loss": 0.7831, + "step": 11077 + }, + { + "epoch": 0.6097198524960097, + "grad_norm": 0.656891942024231, + "learning_rate": 7.901418608029655e-06, + "loss": 0.7239, + "step": 11078 + }, + { + "epoch": 0.6097748912983654, + "grad_norm": 0.7451794743537903, + "learning_rate": 7.901065575675448e-06, + "loss": 0.7426, + "step": 11079 + }, + { + "epoch": 0.6098299301007211, + "grad_norm": 0.6805453300476074, + "learning_rate": 7.90071252151761e-06, + "loss": 0.7257, + "step": 11080 + }, + { + "epoch": 0.6098849689030766, + "grad_norm": 0.7747140526771545, + "learning_rate": 7.900359445558791e-06, + "loss": 0.8554, + "step": 11081 + }, + { + "epoch": 0.6099400077054323, + "grad_norm": 0.7276260256767273, + "learning_rate": 7.900006347801649e-06, + "loss": 0.7608, + "step": 11082 + }, + { + "epoch": 0.609995046507788, + "grad_norm": 0.7496321201324463, + "learning_rate": 7.899653228248836e-06, + "loss": 0.7707, + "step": 11083 + }, + { + "epoch": 0.6100500853101436, + "grad_norm": 0.6810722947120667, + "learning_rate": 7.899300086903006e-06, + "loss": 0.7425, + "step": 11084 + }, + { + "epoch": 0.6101051241124993, + "grad_norm": 0.7245593070983887, + "learning_rate": 7.89894692376681e-06, + "loss": 0.8404, + "step": 11085 + }, + { + "epoch": 0.610160162914855, + "grad_norm": 0.7139402627944946, + "learning_rate": 7.898593738842906e-06, + "loss": 0.7219, + "step": 11086 + }, + { + "epoch": 0.6102152017172107, + "grad_norm": 0.6483772397041321, + "learning_rate": 7.898240532133947e-06, + "loss": 0.7571, + "step": 11087 + }, + { + "epoch": 0.6102702405195662, + "grad_norm": 0.7347467541694641, + "learning_rate": 7.89788730364259e-06, + "loss": 0.7666, + "step": 11088 + }, + { + "epoch": 0.6103252793219219, + "grad_norm": 0.8899261355400085, + "learning_rate": 7.897534053371485e-06, + "loss": 0.6886, + "step": 11089 + }, + { + "epoch": 0.6103803181242776, + "grad_norm": 0.7005650401115417, + "learning_rate": 7.89718078132329e-06, + "loss": 0.6771, + "step": 11090 + }, + { + "epoch": 0.6104353569266333, + "grad_norm": 0.776589035987854, + "learning_rate": 7.896827487500662e-06, + "loss": 0.7731, + "step": 11091 + }, + { + "epoch": 0.6104903957289889, + "grad_norm": 0.7039395570755005, + "learning_rate": 7.896474171906252e-06, + "loss": 0.7415, + "step": 11092 + }, + { + "epoch": 0.6105454345313446, + "grad_norm": 0.7453792095184326, + "learning_rate": 7.896120834542718e-06, + "loss": 0.8507, + "step": 11093 + }, + { + "epoch": 0.6106004733337003, + "grad_norm": 0.7516497373580933, + "learning_rate": 7.895767475412717e-06, + "loss": 0.8271, + "step": 11094 + }, + { + "epoch": 0.610655512136056, + "grad_norm": 0.6751283407211304, + "learning_rate": 7.895414094518901e-06, + "loss": 0.7788, + "step": 11095 + }, + { + "epoch": 0.6107105509384115, + "grad_norm": 0.7240836024284363, + "learning_rate": 7.895060691863927e-06, + "loss": 0.7507, + "step": 11096 + }, + { + "epoch": 0.6107655897407672, + "grad_norm": 0.8286149501800537, + "learning_rate": 7.894707267450451e-06, + "loss": 0.7033, + "step": 11097 + }, + { + "epoch": 0.6108206285431229, + "grad_norm": 0.8814655542373657, + "learning_rate": 7.894353821281131e-06, + "loss": 0.73, + "step": 11098 + }, + { + "epoch": 0.6108756673454786, + "grad_norm": 0.6792872548103333, + "learning_rate": 7.894000353358624e-06, + "loss": 0.7445, + "step": 11099 + }, + { + "epoch": 0.6109307061478342, + "grad_norm": 0.6442595720291138, + "learning_rate": 7.893646863685584e-06, + "loss": 0.7228, + "step": 11100 + }, + { + "epoch": 0.6109857449501899, + "grad_norm": 0.6775944828987122, + "learning_rate": 7.89329335226467e-06, + "loss": 0.7937, + "step": 11101 + }, + { + "epoch": 0.6110407837525456, + "grad_norm": 0.6315211653709412, + "learning_rate": 7.892939819098534e-06, + "loss": 0.7328, + "step": 11102 + }, + { + "epoch": 0.6110958225549012, + "grad_norm": 0.7419382929801941, + "learning_rate": 7.89258626418984e-06, + "loss": 0.8088, + "step": 11103 + }, + { + "epoch": 0.6111508613572568, + "grad_norm": 0.6645117402076721, + "learning_rate": 7.89223268754124e-06, + "loss": 0.7844, + "step": 11104 + }, + { + "epoch": 0.6112059001596125, + "grad_norm": 0.6389926075935364, + "learning_rate": 7.891879089155397e-06, + "loss": 0.6353, + "step": 11105 + }, + { + "epoch": 0.6112609389619682, + "grad_norm": 0.8223785758018494, + "learning_rate": 7.891525469034963e-06, + "loss": 0.7377, + "step": 11106 + }, + { + "epoch": 0.6113159777643239, + "grad_norm": 0.7627747058868408, + "learning_rate": 7.891171827182595e-06, + "loss": 0.8317, + "step": 11107 + }, + { + "epoch": 0.6113710165666795, + "grad_norm": 0.8015971183776855, + "learning_rate": 7.890818163600956e-06, + "loss": 0.8324, + "step": 11108 + }, + { + "epoch": 0.6114260553690352, + "grad_norm": 0.7180280089378357, + "learning_rate": 7.8904644782927e-06, + "loss": 0.8211, + "step": 11109 + }, + { + "epoch": 0.6114810941713908, + "grad_norm": 0.7855646014213562, + "learning_rate": 7.890110771260487e-06, + "loss": 0.8629, + "step": 11110 + }, + { + "epoch": 0.6115361329737465, + "grad_norm": 0.7389342784881592, + "learning_rate": 7.889757042506976e-06, + "loss": 0.6917, + "step": 11111 + }, + { + "epoch": 0.6115911717761021, + "grad_norm": 0.7996030449867249, + "learning_rate": 7.889403292034825e-06, + "loss": 0.7361, + "step": 11112 + }, + { + "epoch": 0.6116462105784578, + "grad_norm": 0.6658353805541992, + "learning_rate": 7.88904951984669e-06, + "loss": 0.7048, + "step": 11113 + }, + { + "epoch": 0.6117012493808135, + "grad_norm": 0.8128555417060852, + "learning_rate": 7.888695725945235e-06, + "loss": 0.7772, + "step": 11114 + }, + { + "epoch": 0.6117562881831692, + "grad_norm": 0.7597428560256958, + "learning_rate": 7.888341910333114e-06, + "loss": 0.7447, + "step": 11115 + }, + { + "epoch": 0.6118113269855248, + "grad_norm": 0.7330088019371033, + "learning_rate": 7.88798807301299e-06, + "loss": 0.849, + "step": 11116 + }, + { + "epoch": 0.6118663657878805, + "grad_norm": 0.8374074101448059, + "learning_rate": 7.88763421398752e-06, + "loss": 0.6149, + "step": 11117 + }, + { + "epoch": 0.6119214045902361, + "grad_norm": 0.7507160305976868, + "learning_rate": 7.887280333259364e-06, + "loss": 0.7737, + "step": 11118 + }, + { + "epoch": 0.6119764433925918, + "grad_norm": 0.7218281626701355, + "learning_rate": 7.886926430831181e-06, + "loss": 0.8151, + "step": 11119 + }, + { + "epoch": 0.6120314821949474, + "grad_norm": 0.6761744618415833, + "learning_rate": 7.886572506705634e-06, + "loss": 0.7429, + "step": 11120 + }, + { + "epoch": 0.6120865209973031, + "grad_norm": 0.8243520259857178, + "learning_rate": 7.886218560885379e-06, + "loss": 0.819, + "step": 11121 + }, + { + "epoch": 0.6121415597996588, + "grad_norm": 0.9675465822219849, + "learning_rate": 7.885864593373078e-06, + "loss": 0.7834, + "step": 11122 + }, + { + "epoch": 0.6121965986020145, + "grad_norm": 0.7220338582992554, + "learning_rate": 7.885510604171391e-06, + "loss": 0.8266, + "step": 11123 + }, + { + "epoch": 0.61225163740437, + "grad_norm": 0.7185316681861877, + "learning_rate": 7.88515659328298e-06, + "loss": 0.7949, + "step": 11124 + }, + { + "epoch": 0.6123066762067257, + "grad_norm": 0.67637038230896, + "learning_rate": 7.884802560710503e-06, + "loss": 0.7456, + "step": 11125 + }, + { + "epoch": 0.6123617150090814, + "grad_norm": 0.7886855602264404, + "learning_rate": 7.884448506456622e-06, + "loss": 0.7181, + "step": 11126 + }, + { + "epoch": 0.612416753811437, + "grad_norm": 0.7250227928161621, + "learning_rate": 7.884094430523999e-06, + "loss": 0.7537, + "step": 11127 + }, + { + "epoch": 0.6124717926137927, + "grad_norm": 0.6771906614303589, + "learning_rate": 7.883740332915295e-06, + "loss": 0.7642, + "step": 11128 + }, + { + "epoch": 0.6125268314161484, + "grad_norm": 0.8375886082649231, + "learning_rate": 7.88338621363317e-06, + "loss": 0.7231, + "step": 11129 + }, + { + "epoch": 0.6125818702185041, + "grad_norm": 0.6782773733139038, + "learning_rate": 7.883032072680285e-06, + "loss": 0.8391, + "step": 11130 + }, + { + "epoch": 0.6126369090208597, + "grad_norm": 0.7103945016860962, + "learning_rate": 7.882677910059304e-06, + "loss": 0.7838, + "step": 11131 + }, + { + "epoch": 0.6126919478232153, + "grad_norm": 0.7037224769592285, + "learning_rate": 7.882323725772887e-06, + "loss": 0.7906, + "step": 11132 + }, + { + "epoch": 0.612746986625571, + "grad_norm": 0.6872009634971619, + "learning_rate": 7.881969519823695e-06, + "loss": 0.7764, + "step": 11133 + }, + { + "epoch": 0.6128020254279267, + "grad_norm": 0.7377448678016663, + "learning_rate": 7.881615292214393e-06, + "loss": 0.8231, + "step": 11134 + }, + { + "epoch": 0.6128570642302823, + "grad_norm": 0.62479168176651, + "learning_rate": 7.881261042947642e-06, + "loss": 0.6522, + "step": 11135 + }, + { + "epoch": 0.612912103032638, + "grad_norm": 0.7989023923873901, + "learning_rate": 7.880906772026105e-06, + "loss": 0.7326, + "step": 11136 + }, + { + "epoch": 0.6129671418349937, + "grad_norm": 0.6322734951972961, + "learning_rate": 7.880552479452441e-06, + "loss": 0.6775, + "step": 11137 + }, + { + "epoch": 0.6130221806373494, + "grad_norm": 0.8628767132759094, + "learning_rate": 7.880198165229318e-06, + "loss": 0.7705, + "step": 11138 + }, + { + "epoch": 0.613077219439705, + "grad_norm": 0.7386173605918884, + "learning_rate": 7.879843829359396e-06, + "loss": 0.7297, + "step": 11139 + }, + { + "epoch": 0.6131322582420606, + "grad_norm": 0.6882045269012451, + "learning_rate": 7.879489471845339e-06, + "loss": 0.6875, + "step": 11140 + }, + { + "epoch": 0.6131872970444163, + "grad_norm": 0.5986032485961914, + "learning_rate": 7.879135092689809e-06, + "loss": 0.6329, + "step": 11141 + }, + { + "epoch": 0.613242335846772, + "grad_norm": 0.7973099946975708, + "learning_rate": 7.878780691895472e-06, + "loss": 0.809, + "step": 11142 + }, + { + "epoch": 0.6132973746491276, + "grad_norm": 0.6828579902648926, + "learning_rate": 7.878426269464989e-06, + "loss": 0.7777, + "step": 11143 + }, + { + "epoch": 0.6133524134514833, + "grad_norm": 0.8179183006286621, + "learning_rate": 7.878071825401024e-06, + "loss": 0.7275, + "step": 11144 + }, + { + "epoch": 0.613407452253839, + "grad_norm": 0.7290762066841125, + "learning_rate": 7.877717359706242e-06, + "loss": 0.7424, + "step": 11145 + }, + { + "epoch": 0.6134624910561947, + "grad_norm": 0.732510507106781, + "learning_rate": 7.877362872383305e-06, + "loss": 0.6157, + "step": 11146 + }, + { + "epoch": 0.6135175298585502, + "grad_norm": 0.9205982685089111, + "learning_rate": 7.877008363434881e-06, + "loss": 0.7723, + "step": 11147 + }, + { + "epoch": 0.6135725686609059, + "grad_norm": 0.7138587832450867, + "learning_rate": 7.876653832863633e-06, + "loss": 0.7773, + "step": 11148 + }, + { + "epoch": 0.6136276074632616, + "grad_norm": 0.7323171496391296, + "learning_rate": 7.876299280672224e-06, + "loss": 0.8265, + "step": 11149 + }, + { + "epoch": 0.6136826462656173, + "grad_norm": 0.6717494130134583, + "learning_rate": 7.875944706863318e-06, + "loss": 0.788, + "step": 11150 + }, + { + "epoch": 0.6137376850679729, + "grad_norm": 0.7779331207275391, + "learning_rate": 7.875590111439582e-06, + "loss": 0.7864, + "step": 11151 + }, + { + "epoch": 0.6137927238703286, + "grad_norm": 0.6706684827804565, + "learning_rate": 7.875235494403683e-06, + "loss": 0.6673, + "step": 11152 + }, + { + "epoch": 0.6138477626726843, + "grad_norm": 0.7142137885093689, + "learning_rate": 7.874880855758281e-06, + "loss": 0.8031, + "step": 11153 + }, + { + "epoch": 0.61390280147504, + "grad_norm": 0.6962595582008362, + "learning_rate": 7.874526195506045e-06, + "loss": 0.692, + "step": 11154 + }, + { + "epoch": 0.6139578402773955, + "grad_norm": 0.7237100601196289, + "learning_rate": 7.874171513649638e-06, + "loss": 0.7504, + "step": 11155 + }, + { + "epoch": 0.6140128790797512, + "grad_norm": 0.8235127925872803, + "learning_rate": 7.87381681019173e-06, + "loss": 0.8132, + "step": 11156 + }, + { + "epoch": 0.6140679178821069, + "grad_norm": 0.7483351826667786, + "learning_rate": 7.873462085134981e-06, + "loss": 0.7589, + "step": 11157 + }, + { + "epoch": 0.6141229566844626, + "grad_norm": 0.7309976816177368, + "learning_rate": 7.873107338482062e-06, + "loss": 0.7722, + "step": 11158 + }, + { + "epoch": 0.6141779954868182, + "grad_norm": 0.8871245384216309, + "learning_rate": 7.872752570235639e-06, + "loss": 0.882, + "step": 11159 + }, + { + "epoch": 0.6142330342891739, + "grad_norm": 0.5987886190414429, + "learning_rate": 7.872397780398374e-06, + "loss": 0.6312, + "step": 11160 + }, + { + "epoch": 0.6142880730915296, + "grad_norm": 0.7320038080215454, + "learning_rate": 7.872042968972937e-06, + "loss": 0.7444, + "step": 11161 + }, + { + "epoch": 0.6143431118938852, + "grad_norm": 0.8111129999160767, + "learning_rate": 7.871688135961995e-06, + "loss": 0.7413, + "step": 11162 + }, + { + "epoch": 0.6143981506962408, + "grad_norm": 0.7497085332870483, + "learning_rate": 7.871333281368211e-06, + "loss": 0.8413, + "step": 11163 + }, + { + "epoch": 0.6144531894985965, + "grad_norm": 0.8341198563575745, + "learning_rate": 7.870978405194256e-06, + "loss": 0.7959, + "step": 11164 + }, + { + "epoch": 0.6145082283009522, + "grad_norm": 0.6293482780456543, + "learning_rate": 7.870623507442797e-06, + "loss": 0.6429, + "step": 11165 + }, + { + "epoch": 0.6145632671033079, + "grad_norm": 1.2423945665359497, + "learning_rate": 7.870268588116499e-06, + "loss": 0.6309, + "step": 11166 + }, + { + "epoch": 0.6146183059056635, + "grad_norm": 0.7811731100082397, + "learning_rate": 7.86991364721803e-06, + "loss": 0.738, + "step": 11167 + }, + { + "epoch": 0.6146733447080192, + "grad_norm": 0.6904361248016357, + "learning_rate": 7.869558684750061e-06, + "loss": 0.7995, + "step": 11168 + }, + { + "epoch": 0.6147283835103748, + "grad_norm": 0.7267210483551025, + "learning_rate": 7.869203700715254e-06, + "loss": 0.6989, + "step": 11169 + }, + { + "epoch": 0.6147834223127304, + "grad_norm": 0.7183068990707397, + "learning_rate": 7.868848695116282e-06, + "loss": 0.7872, + "step": 11170 + }, + { + "epoch": 0.6148384611150861, + "grad_norm": 0.6774286031723022, + "learning_rate": 7.868493667955808e-06, + "loss": 0.7502, + "step": 11171 + }, + { + "epoch": 0.6148934999174418, + "grad_norm": 0.7587934732437134, + "learning_rate": 7.868138619236507e-06, + "loss": 0.8037, + "step": 11172 + }, + { + "epoch": 0.6149485387197975, + "grad_norm": 0.6825854182243347, + "learning_rate": 7.867783548961043e-06, + "loss": 0.7924, + "step": 11173 + }, + { + "epoch": 0.6150035775221531, + "grad_norm": 0.6243380904197693, + "learning_rate": 7.867428457132084e-06, + "loss": 0.5953, + "step": 11174 + }, + { + "epoch": 0.6150586163245088, + "grad_norm": 0.6630006432533264, + "learning_rate": 7.8670733437523e-06, + "loss": 0.7102, + "step": 11175 + }, + { + "epoch": 0.6151136551268644, + "grad_norm": 0.7059652805328369, + "learning_rate": 7.866718208824362e-06, + "loss": 0.6847, + "step": 11176 + }, + { + "epoch": 0.6151686939292201, + "grad_norm": 0.6768305897712708, + "learning_rate": 7.866363052350938e-06, + "loss": 0.7152, + "step": 11177 + }, + { + "epoch": 0.6152237327315757, + "grad_norm": 0.6850628852844238, + "learning_rate": 7.866007874334696e-06, + "loss": 0.767, + "step": 11178 + }, + { + "epoch": 0.6152787715339314, + "grad_norm": 0.6767143607139587, + "learning_rate": 7.865652674778305e-06, + "loss": 0.6826, + "step": 11179 + }, + { + "epoch": 0.6153338103362871, + "grad_norm": 0.8240014314651489, + "learning_rate": 7.865297453684436e-06, + "loss": 0.8493, + "step": 11180 + }, + { + "epoch": 0.6153888491386428, + "grad_norm": 0.7725485563278198, + "learning_rate": 7.864942211055758e-06, + "loss": 0.8704, + "step": 11181 + }, + { + "epoch": 0.6154438879409984, + "grad_norm": 0.9260931015014648, + "learning_rate": 7.864586946894941e-06, + "loss": 0.7926, + "step": 11182 + }, + { + "epoch": 0.615498926743354, + "grad_norm": 0.7558152079582214, + "learning_rate": 7.864231661204655e-06, + "loss": 0.8436, + "step": 11183 + }, + { + "epoch": 0.6155539655457097, + "grad_norm": 0.7899817824363708, + "learning_rate": 7.863876353987571e-06, + "loss": 0.7579, + "step": 11184 + }, + { + "epoch": 0.6156090043480654, + "grad_norm": 0.7757478952407837, + "learning_rate": 7.863521025246362e-06, + "loss": 0.7534, + "step": 11185 + }, + { + "epoch": 0.615664043150421, + "grad_norm": 0.6563131809234619, + "learning_rate": 7.863165674983693e-06, + "loss": 0.728, + "step": 11186 + }, + { + "epoch": 0.6157190819527767, + "grad_norm": 0.6516488790512085, + "learning_rate": 7.862810303202234e-06, + "loss": 0.736, + "step": 11187 + }, + { + "epoch": 0.6157741207551324, + "grad_norm": 0.6867820620536804, + "learning_rate": 7.862454909904665e-06, + "loss": 0.8032, + "step": 11188 + }, + { + "epoch": 0.6158291595574881, + "grad_norm": 0.7399753928184509, + "learning_rate": 7.862099495093647e-06, + "loss": 0.8681, + "step": 11189 + }, + { + "epoch": 0.6158841983598436, + "grad_norm": 0.7249311804771423, + "learning_rate": 7.861744058771857e-06, + "loss": 0.7868, + "step": 11190 + }, + { + "epoch": 0.6159392371621993, + "grad_norm": 0.8579045534133911, + "learning_rate": 7.861388600941964e-06, + "loss": 0.7915, + "step": 11191 + }, + { + "epoch": 0.615994275964555, + "grad_norm": 0.6855454444885254, + "learning_rate": 7.86103312160664e-06, + "loss": 0.8442, + "step": 11192 + }, + { + "epoch": 0.6160493147669107, + "grad_norm": 0.7412910461425781, + "learning_rate": 7.860677620768558e-06, + "loss": 0.7684, + "step": 11193 + }, + { + "epoch": 0.6161043535692663, + "grad_norm": 0.8567430377006531, + "learning_rate": 7.860322098430389e-06, + "loss": 0.8801, + "step": 11194 + }, + { + "epoch": 0.616159392371622, + "grad_norm": 0.7504804134368896, + "learning_rate": 7.859966554594802e-06, + "loss": 0.7359, + "step": 11195 + }, + { + "epoch": 0.6162144311739777, + "grad_norm": 0.7086803317070007, + "learning_rate": 7.859610989264474e-06, + "loss": 0.8498, + "step": 11196 + }, + { + "epoch": 0.6162694699763334, + "grad_norm": 0.7201757431030273, + "learning_rate": 7.859255402442075e-06, + "loss": 0.608, + "step": 11197 + }, + { + "epoch": 0.6163245087786889, + "grad_norm": 0.8968291282653809, + "learning_rate": 7.858899794130279e-06, + "loss": 0.8067, + "step": 11198 + }, + { + "epoch": 0.6163795475810446, + "grad_norm": 0.7474254965782166, + "learning_rate": 7.858544164331756e-06, + "loss": 0.8355, + "step": 11199 + }, + { + "epoch": 0.6164345863834003, + "grad_norm": 0.6907560229301453, + "learning_rate": 7.85818851304918e-06, + "loss": 0.788, + "step": 11200 + }, + { + "epoch": 0.616489625185756, + "grad_norm": 0.725330650806427, + "learning_rate": 7.857832840285224e-06, + "loss": 0.8157, + "step": 11201 + }, + { + "epoch": 0.6165446639881116, + "grad_norm": 0.682722270488739, + "learning_rate": 7.857477146042562e-06, + "loss": 0.7939, + "step": 11202 + }, + { + "epoch": 0.6165997027904673, + "grad_norm": 0.661533534526825, + "learning_rate": 7.857121430323866e-06, + "loss": 0.7173, + "step": 11203 + }, + { + "epoch": 0.616654741592823, + "grad_norm": 0.6922706961631775, + "learning_rate": 7.856765693131811e-06, + "loss": 0.7719, + "step": 11204 + }, + { + "epoch": 0.6167097803951787, + "grad_norm": 0.72809898853302, + "learning_rate": 7.856409934469071e-06, + "loss": 0.7362, + "step": 11205 + }, + { + "epoch": 0.6167648191975342, + "grad_norm": 0.7540956735610962, + "learning_rate": 7.856054154338317e-06, + "loss": 0.7883, + "step": 11206 + }, + { + "epoch": 0.6168198579998899, + "grad_norm": 0.6777094006538391, + "learning_rate": 7.855698352742224e-06, + "loss": 0.6938, + "step": 11207 + }, + { + "epoch": 0.6168748968022456, + "grad_norm": 0.6771852970123291, + "learning_rate": 7.855342529683467e-06, + "loss": 0.697, + "step": 11208 + }, + { + "epoch": 0.6169299356046013, + "grad_norm": 0.7810118198394775, + "learning_rate": 7.854986685164721e-06, + "loss": 0.6875, + "step": 11209 + }, + { + "epoch": 0.6169849744069569, + "grad_norm": 0.6992766261100769, + "learning_rate": 7.854630819188658e-06, + "loss": 0.6553, + "step": 11210 + }, + { + "epoch": 0.6170400132093126, + "grad_norm": 0.7409703135490417, + "learning_rate": 7.854274931757954e-06, + "loss": 0.7685, + "step": 11211 + }, + { + "epoch": 0.6170950520116683, + "grad_norm": 0.7263410687446594, + "learning_rate": 7.853919022875285e-06, + "loss": 0.7939, + "step": 11212 + }, + { + "epoch": 0.6171500908140238, + "grad_norm": 0.8451918959617615, + "learning_rate": 7.853563092543323e-06, + "loss": 0.7522, + "step": 11213 + }, + { + "epoch": 0.6172051296163795, + "grad_norm": 0.672926664352417, + "learning_rate": 7.853207140764745e-06, + "loss": 0.732, + "step": 11214 + }, + { + "epoch": 0.6172601684187352, + "grad_norm": 0.6607885956764221, + "learning_rate": 7.852851167542226e-06, + "loss": 0.7441, + "step": 11215 + }, + { + "epoch": 0.6173152072210909, + "grad_norm": 0.730385422706604, + "learning_rate": 7.85249517287844e-06, + "loss": 0.7925, + "step": 11216 + }, + { + "epoch": 0.6173702460234465, + "grad_norm": 0.7338821887969971, + "learning_rate": 7.852139156776067e-06, + "loss": 0.8106, + "step": 11217 + }, + { + "epoch": 0.6174252848258022, + "grad_norm": 0.7662163376808167, + "learning_rate": 7.851783119237777e-06, + "loss": 0.8166, + "step": 11218 + }, + { + "epoch": 0.6174803236281579, + "grad_norm": 0.7738409042358398, + "learning_rate": 7.85142706026625e-06, + "loss": 0.7898, + "step": 11219 + }, + { + "epoch": 0.6175353624305135, + "grad_norm": 0.8129978775978088, + "learning_rate": 7.851070979864159e-06, + "loss": 0.7618, + "step": 11220 + }, + { + "epoch": 0.6175904012328691, + "grad_norm": 0.7923482060432434, + "learning_rate": 7.850714878034183e-06, + "loss": 0.7341, + "step": 11221 + }, + { + "epoch": 0.6176454400352248, + "grad_norm": 0.7189306020736694, + "learning_rate": 7.850358754778996e-06, + "loss": 0.7775, + "step": 11222 + }, + { + "epoch": 0.6177004788375805, + "grad_norm": 0.9873724579811096, + "learning_rate": 7.850002610101276e-06, + "loss": 0.8521, + "step": 11223 + }, + { + "epoch": 0.6177555176399362, + "grad_norm": 0.6350038051605225, + "learning_rate": 7.8496464440037e-06, + "loss": 0.6356, + "step": 11224 + }, + { + "epoch": 0.6178105564422918, + "grad_norm": 0.8059771060943604, + "learning_rate": 7.849290256488941e-06, + "loss": 0.821, + "step": 11225 + }, + { + "epoch": 0.6178655952446475, + "grad_norm": 0.7469610571861267, + "learning_rate": 7.848934047559684e-06, + "loss": 0.7782, + "step": 11226 + }, + { + "epoch": 0.6179206340470031, + "grad_norm": 0.6423176527023315, + "learning_rate": 7.848577817218597e-06, + "loss": 0.6693, + "step": 11227 + }, + { + "epoch": 0.6179756728493588, + "grad_norm": 0.7298387885093689, + "learning_rate": 7.848221565468363e-06, + "loss": 0.775, + "step": 11228 + }, + { + "epoch": 0.6180307116517144, + "grad_norm": 0.7125145196914673, + "learning_rate": 7.84786529231166e-06, + "loss": 0.7507, + "step": 11229 + }, + { + "epoch": 0.6180857504540701, + "grad_norm": 0.6658627390861511, + "learning_rate": 7.847508997751163e-06, + "loss": 0.7506, + "step": 11230 + }, + { + "epoch": 0.6181407892564258, + "grad_norm": 0.6425275206565857, + "learning_rate": 7.847152681789549e-06, + "loss": 0.657, + "step": 11231 + }, + { + "epoch": 0.6181958280587815, + "grad_norm": 0.8075960278511047, + "learning_rate": 7.846796344429498e-06, + "loss": 0.5434, + "step": 11232 + }, + { + "epoch": 0.6182508668611371, + "grad_norm": 0.8481889367103577, + "learning_rate": 7.846439985673689e-06, + "loss": 0.8303, + "step": 11233 + }, + { + "epoch": 0.6183059056634927, + "grad_norm": 0.7216358184814453, + "learning_rate": 7.846083605524799e-06, + "loss": 0.7589, + "step": 11234 + }, + { + "epoch": 0.6183609444658484, + "grad_norm": 0.8399745225906372, + "learning_rate": 7.845727203985504e-06, + "loss": 0.8096, + "step": 11235 + }, + { + "epoch": 0.6184159832682041, + "grad_norm": 0.6708692908287048, + "learning_rate": 7.845370781058489e-06, + "loss": 0.6858, + "step": 11236 + }, + { + "epoch": 0.6184710220705597, + "grad_norm": 0.6309100389480591, + "learning_rate": 7.845014336746426e-06, + "loss": 0.6093, + "step": 11237 + }, + { + "epoch": 0.6185260608729154, + "grad_norm": 0.8138728141784668, + "learning_rate": 7.844657871051997e-06, + "loss": 0.8259, + "step": 11238 + }, + { + "epoch": 0.6185810996752711, + "grad_norm": 0.6763564348220825, + "learning_rate": 7.844301383977882e-06, + "loss": 0.7056, + "step": 11239 + }, + { + "epoch": 0.6186361384776268, + "grad_norm": 0.792085587978363, + "learning_rate": 7.843944875526758e-06, + "loss": 0.7364, + "step": 11240 + }, + { + "epoch": 0.6186911772799824, + "grad_norm": 0.8738027811050415, + "learning_rate": 7.843588345701306e-06, + "loss": 0.7092, + "step": 11241 + }, + { + "epoch": 0.618746216082338, + "grad_norm": 0.7694413065910339, + "learning_rate": 7.843231794504205e-06, + "loss": 0.852, + "step": 11242 + }, + { + "epoch": 0.6188012548846937, + "grad_norm": 0.8211640119552612, + "learning_rate": 7.842875221938135e-06, + "loss": 0.8218, + "step": 11243 + }, + { + "epoch": 0.6188562936870494, + "grad_norm": 0.620566189289093, + "learning_rate": 7.842518628005776e-06, + "loss": 0.7176, + "step": 11244 + }, + { + "epoch": 0.618911332489405, + "grad_norm": 0.7044099569320679, + "learning_rate": 7.84216201270981e-06, + "loss": 0.8068, + "step": 11245 + }, + { + "epoch": 0.6189663712917607, + "grad_norm": 0.765209436416626, + "learning_rate": 7.841805376052912e-06, + "loss": 0.8002, + "step": 11246 + }, + { + "epoch": 0.6190214100941164, + "grad_norm": 0.7565444707870483, + "learning_rate": 7.841448718037765e-06, + "loss": 0.7997, + "step": 11247 + }, + { + "epoch": 0.6190764488964721, + "grad_norm": 0.9544101357460022, + "learning_rate": 7.841092038667052e-06, + "loss": 0.647, + "step": 11248 + }, + { + "epoch": 0.6191314876988276, + "grad_norm": 0.7319634556770325, + "learning_rate": 7.840735337943452e-06, + "loss": 0.7982, + "step": 11249 + }, + { + "epoch": 0.6191865265011833, + "grad_norm": 0.6017479300498962, + "learning_rate": 7.840378615869645e-06, + "loss": 0.6817, + "step": 11250 + }, + { + "epoch": 0.619241565303539, + "grad_norm": 0.6936477422714233, + "learning_rate": 7.840021872448312e-06, + "loss": 0.7227, + "step": 11251 + }, + { + "epoch": 0.6192966041058947, + "grad_norm": 0.6962631940841675, + "learning_rate": 7.839665107682135e-06, + "loss": 0.779, + "step": 11252 + }, + { + "epoch": 0.6193516429082503, + "grad_norm": 0.9580947160720825, + "learning_rate": 7.839308321573797e-06, + "loss": 0.8821, + "step": 11253 + }, + { + "epoch": 0.619406681710606, + "grad_norm": 0.7721261978149414, + "learning_rate": 7.838951514125977e-06, + "loss": 0.7146, + "step": 11254 + }, + { + "epoch": 0.6194617205129617, + "grad_norm": 0.7349434494972229, + "learning_rate": 7.838594685341354e-06, + "loss": 0.7601, + "step": 11255 + }, + { + "epoch": 0.6195167593153172, + "grad_norm": 0.6787356734275818, + "learning_rate": 7.838237835222618e-06, + "loss": 0.706, + "step": 11256 + }, + { + "epoch": 0.6195717981176729, + "grad_norm": 0.7658288478851318, + "learning_rate": 7.837880963772445e-06, + "loss": 0.7102, + "step": 11257 + }, + { + "epoch": 0.6196268369200286, + "grad_norm": 0.8083927035331726, + "learning_rate": 7.837524070993516e-06, + "loss": 0.8501, + "step": 11258 + }, + { + "epoch": 0.6196818757223843, + "grad_norm": 0.7656283974647522, + "learning_rate": 7.837167156888516e-06, + "loss": 0.7558, + "step": 11259 + }, + { + "epoch": 0.6197369145247399, + "grad_norm": 0.7897886037826538, + "learning_rate": 7.836810221460128e-06, + "loss": 0.7567, + "step": 11260 + }, + { + "epoch": 0.6197919533270956, + "grad_norm": 0.6858190298080444, + "learning_rate": 7.836453264711035e-06, + "loss": 0.717, + "step": 11261 + }, + { + "epoch": 0.6198469921294513, + "grad_norm": 0.7423431873321533, + "learning_rate": 7.836096286643917e-06, + "loss": 0.7047, + "step": 11262 + }, + { + "epoch": 0.619902030931807, + "grad_norm": 0.8277921676635742, + "learning_rate": 7.835739287261458e-06, + "loss": 0.7418, + "step": 11263 + }, + { + "epoch": 0.6199570697341625, + "grad_norm": 0.7102510929107666, + "learning_rate": 7.835382266566343e-06, + "loss": 0.8202, + "step": 11264 + }, + { + "epoch": 0.6200121085365182, + "grad_norm": 0.6705429553985596, + "learning_rate": 7.835025224561252e-06, + "loss": 0.7332, + "step": 11265 + }, + { + "epoch": 0.6200671473388739, + "grad_norm": 0.6529950499534607, + "learning_rate": 7.834668161248873e-06, + "loss": 0.7579, + "step": 11266 + }, + { + "epoch": 0.6201221861412296, + "grad_norm": 0.7189938426017761, + "learning_rate": 7.834311076631885e-06, + "loss": 0.7323, + "step": 11267 + }, + { + "epoch": 0.6201772249435852, + "grad_norm": 0.6559470891952515, + "learning_rate": 7.833953970712973e-06, + "loss": 0.5973, + "step": 11268 + }, + { + "epoch": 0.6202322637459409, + "grad_norm": 0.7971723675727844, + "learning_rate": 7.833596843494824e-06, + "loss": 0.804, + "step": 11269 + }, + { + "epoch": 0.6202873025482966, + "grad_norm": 0.7800958752632141, + "learning_rate": 7.833239694980118e-06, + "loss": 0.772, + "step": 11270 + }, + { + "epoch": 0.6203423413506522, + "grad_norm": 0.6831466555595398, + "learning_rate": 7.83288252517154e-06, + "loss": 0.7341, + "step": 11271 + }, + { + "epoch": 0.6203973801530078, + "grad_norm": 0.6504807472229004, + "learning_rate": 7.832525334071776e-06, + "loss": 0.6462, + "step": 11272 + }, + { + "epoch": 0.6204524189553635, + "grad_norm": 0.6973552703857422, + "learning_rate": 7.832168121683512e-06, + "loss": 0.7504, + "step": 11273 + }, + { + "epoch": 0.6205074577577192, + "grad_norm": 0.6772480607032776, + "learning_rate": 7.831810888009427e-06, + "loss": 0.7273, + "step": 11274 + }, + { + "epoch": 0.6205624965600749, + "grad_norm": 0.7077416777610779, + "learning_rate": 7.831453633052212e-06, + "loss": 0.7365, + "step": 11275 + }, + { + "epoch": 0.6206175353624305, + "grad_norm": 0.7338337898254395, + "learning_rate": 7.831096356814548e-06, + "loss": 0.7959, + "step": 11276 + }, + { + "epoch": 0.6206725741647862, + "grad_norm": 0.6313255429267883, + "learning_rate": 7.830739059299123e-06, + "loss": 0.7027, + "step": 11277 + }, + { + "epoch": 0.6207276129671419, + "grad_norm": 0.7377570867538452, + "learning_rate": 7.830381740508619e-06, + "loss": 0.6903, + "step": 11278 + }, + { + "epoch": 0.6207826517694975, + "grad_norm": 0.6868650317192078, + "learning_rate": 7.830024400445724e-06, + "loss": 0.6882, + "step": 11279 + }, + { + "epoch": 0.6208376905718531, + "grad_norm": 0.7632661461830139, + "learning_rate": 7.829667039113124e-06, + "loss": 0.8437, + "step": 11280 + }, + { + "epoch": 0.6208927293742088, + "grad_norm": 0.9241608381271362, + "learning_rate": 7.829309656513504e-06, + "loss": 0.779, + "step": 11281 + }, + { + "epoch": 0.6209477681765645, + "grad_norm": 0.6857842206954956, + "learning_rate": 7.828952252649551e-06, + "loss": 0.7882, + "step": 11282 + }, + { + "epoch": 0.6210028069789202, + "grad_norm": 0.695659875869751, + "learning_rate": 7.828594827523947e-06, + "loss": 0.7471, + "step": 11283 + }, + { + "epoch": 0.6210578457812758, + "grad_norm": 0.6398521661758423, + "learning_rate": 7.828237381139383e-06, + "loss": 0.7328, + "step": 11284 + }, + { + "epoch": 0.6211128845836315, + "grad_norm": 0.7386063933372498, + "learning_rate": 7.827879913498544e-06, + "loss": 0.748, + "step": 11285 + }, + { + "epoch": 0.6211679233859871, + "grad_norm": 0.6740923523902893, + "learning_rate": 7.827522424604117e-06, + "loss": 0.6866, + "step": 11286 + }, + { + "epoch": 0.6212229621883428, + "grad_norm": 0.6794413924217224, + "learning_rate": 7.82716491445879e-06, + "loss": 0.7299, + "step": 11287 + }, + { + "epoch": 0.6212780009906984, + "grad_norm": 0.6471715569496155, + "learning_rate": 7.826807383065245e-06, + "loss": 0.7071, + "step": 11288 + }, + { + "epoch": 0.6213330397930541, + "grad_norm": 0.9716162085533142, + "learning_rate": 7.826449830426174e-06, + "loss": 0.7417, + "step": 11289 + }, + { + "epoch": 0.6213880785954098, + "grad_norm": 0.6928716897964478, + "learning_rate": 7.826092256544263e-06, + "loss": 0.7757, + "step": 11290 + }, + { + "epoch": 0.6214431173977655, + "grad_norm": 0.6739227175712585, + "learning_rate": 7.825734661422197e-06, + "loss": 0.7576, + "step": 11291 + }, + { + "epoch": 0.621498156200121, + "grad_norm": 1.2619935274124146, + "learning_rate": 7.825377045062668e-06, + "loss": 0.7454, + "step": 11292 + }, + { + "epoch": 0.6215531950024767, + "grad_norm": 0.6713572144508362, + "learning_rate": 7.825019407468361e-06, + "loss": 0.7916, + "step": 11293 + }, + { + "epoch": 0.6216082338048324, + "grad_norm": 0.6143541932106018, + "learning_rate": 7.824661748641964e-06, + "loss": 0.6765, + "step": 11294 + }, + { + "epoch": 0.6216632726071881, + "grad_norm": 0.7141658067703247, + "learning_rate": 7.824304068586163e-06, + "loss": 0.7773, + "step": 11295 + }, + { + "epoch": 0.6217183114095437, + "grad_norm": 0.7320290803909302, + "learning_rate": 7.823946367303653e-06, + "loss": 0.8062, + "step": 11296 + }, + { + "epoch": 0.6217733502118994, + "grad_norm": 0.7523403167724609, + "learning_rate": 7.823588644797115e-06, + "loss": 0.7126, + "step": 11297 + }, + { + "epoch": 0.6218283890142551, + "grad_norm": 0.6512221097946167, + "learning_rate": 7.823230901069242e-06, + "loss": 0.7563, + "step": 11298 + }, + { + "epoch": 0.6218834278166107, + "grad_norm": 0.6512733697891235, + "learning_rate": 7.82287313612272e-06, + "loss": 0.7603, + "step": 11299 + }, + { + "epoch": 0.6219384666189663, + "grad_norm": 1.0590927600860596, + "learning_rate": 7.82251534996024e-06, + "loss": 0.8325, + "step": 11300 + }, + { + "epoch": 0.621993505421322, + "grad_norm": 0.6763397455215454, + "learning_rate": 7.82215754258449e-06, + "loss": 0.7915, + "step": 11301 + }, + { + "epoch": 0.6220485442236777, + "grad_norm": 0.6640639901161194, + "learning_rate": 7.82179971399816e-06, + "loss": 0.6953, + "step": 11302 + }, + { + "epoch": 0.6221035830260333, + "grad_norm": 0.6611515283584595, + "learning_rate": 7.821441864203938e-06, + "loss": 0.8331, + "step": 11303 + }, + { + "epoch": 0.622158621828389, + "grad_norm": 0.8226057887077332, + "learning_rate": 7.821083993204514e-06, + "loss": 0.7448, + "step": 11304 + }, + { + "epoch": 0.6222136606307447, + "grad_norm": 0.6798059940338135, + "learning_rate": 7.820726101002578e-06, + "loss": 0.717, + "step": 11305 + }, + { + "epoch": 0.6222686994331004, + "grad_norm": 0.7623499631881714, + "learning_rate": 7.820368187600821e-06, + "loss": 0.7343, + "step": 11306 + }, + { + "epoch": 0.622323738235456, + "grad_norm": 0.703886866569519, + "learning_rate": 7.82001025300193e-06, + "loss": 0.8008, + "step": 11307 + }, + { + "epoch": 0.6223787770378116, + "grad_norm": 0.6817659735679626, + "learning_rate": 7.819652297208597e-06, + "loss": 0.7534, + "step": 11308 + }, + { + "epoch": 0.6224338158401673, + "grad_norm": 0.8991402983665466, + "learning_rate": 7.819294320223513e-06, + "loss": 0.6236, + "step": 11309 + }, + { + "epoch": 0.622488854642523, + "grad_norm": 0.791199803352356, + "learning_rate": 7.818936322049366e-06, + "loss": 0.772, + "step": 11310 + }, + { + "epoch": 0.6225438934448786, + "grad_norm": 0.6401470303535461, + "learning_rate": 7.81857830268885e-06, + "loss": 0.7749, + "step": 11311 + }, + { + "epoch": 0.6225989322472343, + "grad_norm": 0.6731516122817993, + "learning_rate": 7.818220262144653e-06, + "loss": 0.7506, + "step": 11312 + }, + { + "epoch": 0.62265397104959, + "grad_norm": 0.7391661405563354, + "learning_rate": 7.817862200419467e-06, + "loss": 0.7288, + "step": 11313 + }, + { + "epoch": 0.6227090098519457, + "grad_norm": 0.7363784909248352, + "learning_rate": 7.817504117515984e-06, + "loss": 0.7087, + "step": 11314 + }, + { + "epoch": 0.6227640486543012, + "grad_norm": 0.7609296441078186, + "learning_rate": 7.817146013436893e-06, + "loss": 0.7553, + "step": 11315 + }, + { + "epoch": 0.6228190874566569, + "grad_norm": 0.6818829774856567, + "learning_rate": 7.816787888184886e-06, + "loss": 0.7534, + "step": 11316 + }, + { + "epoch": 0.6228741262590126, + "grad_norm": 0.7434844374656677, + "learning_rate": 7.816429741762657e-06, + "loss": 0.8008, + "step": 11317 + }, + { + "epoch": 0.6229291650613683, + "grad_norm": 0.6881742477416992, + "learning_rate": 7.816071574172895e-06, + "loss": 0.7324, + "step": 11318 + }, + { + "epoch": 0.6229842038637239, + "grad_norm": 0.7109540104866028, + "learning_rate": 7.815713385418293e-06, + "loss": 0.7954, + "step": 11319 + }, + { + "epoch": 0.6230392426660796, + "grad_norm": 0.6868860721588135, + "learning_rate": 7.815355175501542e-06, + "loss": 0.6703, + "step": 11320 + }, + { + "epoch": 0.6230942814684353, + "grad_norm": 0.7851449847221375, + "learning_rate": 7.814996944425337e-06, + "loss": 0.8321, + "step": 11321 + }, + { + "epoch": 0.623149320270791, + "grad_norm": 0.7966809272766113, + "learning_rate": 7.814638692192367e-06, + "loss": 0.7603, + "step": 11322 + }, + { + "epoch": 0.6232043590731465, + "grad_norm": 0.6612964272499084, + "learning_rate": 7.814280418805327e-06, + "loss": 0.8096, + "step": 11323 + }, + { + "epoch": 0.6232593978755022, + "grad_norm": 0.6398881077766418, + "learning_rate": 7.813922124266908e-06, + "loss": 0.7559, + "step": 11324 + }, + { + "epoch": 0.6233144366778579, + "grad_norm": 0.8062521815299988, + "learning_rate": 7.813563808579804e-06, + "loss": 0.7863, + "step": 11325 + }, + { + "epoch": 0.6233694754802136, + "grad_norm": 0.7083317041397095, + "learning_rate": 7.813205471746708e-06, + "loss": 0.7358, + "step": 11326 + }, + { + "epoch": 0.6234245142825692, + "grad_norm": 0.6190419793128967, + "learning_rate": 7.812847113770312e-06, + "loss": 0.637, + "step": 11327 + }, + { + "epoch": 0.6234795530849249, + "grad_norm": 0.7036548256874084, + "learning_rate": 7.812488734653309e-06, + "loss": 0.8049, + "step": 11328 + }, + { + "epoch": 0.6235345918872806, + "grad_norm": 0.7952288389205933, + "learning_rate": 7.812130334398395e-06, + "loss": 0.781, + "step": 11329 + }, + { + "epoch": 0.6235896306896362, + "grad_norm": 0.7925593852996826, + "learning_rate": 7.811771913008262e-06, + "loss": 0.7913, + "step": 11330 + }, + { + "epoch": 0.6236446694919918, + "grad_norm": 0.7190900444984436, + "learning_rate": 7.811413470485604e-06, + "loss": 0.7464, + "step": 11331 + }, + { + "epoch": 0.6236997082943475, + "grad_norm": 0.6476338505744934, + "learning_rate": 7.811055006833114e-06, + "loss": 0.699, + "step": 11332 + }, + { + "epoch": 0.6237547470967032, + "grad_norm": 0.7412729263305664, + "learning_rate": 7.810696522053487e-06, + "loss": 0.7958, + "step": 11333 + }, + { + "epoch": 0.6238097858990589, + "grad_norm": 0.6646767854690552, + "learning_rate": 7.81033801614942e-06, + "loss": 0.6276, + "step": 11334 + }, + { + "epoch": 0.6238648247014145, + "grad_norm": 0.6912583112716675, + "learning_rate": 7.809979489123601e-06, + "loss": 0.7611, + "step": 11335 + }, + { + "epoch": 0.6239198635037702, + "grad_norm": 0.7324331998825073, + "learning_rate": 7.80962094097873e-06, + "loss": 0.7436, + "step": 11336 + }, + { + "epoch": 0.6239749023061258, + "grad_norm": 0.7046643495559692, + "learning_rate": 7.809262371717501e-06, + "loss": 0.7287, + "step": 11337 + }, + { + "epoch": 0.6240299411084815, + "grad_norm": 0.6013771891593933, + "learning_rate": 7.808903781342607e-06, + "loss": 0.6822, + "step": 11338 + }, + { + "epoch": 0.6240849799108371, + "grad_norm": 0.633074164390564, + "learning_rate": 7.808545169856745e-06, + "loss": 0.7758, + "step": 11339 + }, + { + "epoch": 0.6241400187131928, + "grad_norm": 0.6603411436080933, + "learning_rate": 7.808186537262608e-06, + "loss": 0.6797, + "step": 11340 + }, + { + "epoch": 0.6241950575155485, + "grad_norm": 0.8316327929496765, + "learning_rate": 7.807827883562894e-06, + "loss": 0.777, + "step": 11341 + }, + { + "epoch": 0.6242500963179041, + "grad_norm": 0.7954252362251282, + "learning_rate": 7.807469208760295e-06, + "loss": 0.6581, + "step": 11342 + }, + { + "epoch": 0.6243051351202598, + "grad_norm": 0.6108134984970093, + "learning_rate": 7.80711051285751e-06, + "loss": 0.7126, + "step": 11343 + }, + { + "epoch": 0.6243601739226154, + "grad_norm": 0.7224909067153931, + "learning_rate": 7.806751795857235e-06, + "loss": 0.8677, + "step": 11344 + }, + { + "epoch": 0.6244152127249711, + "grad_norm": 0.720923125743866, + "learning_rate": 7.806393057762165e-06, + "loss": 0.7174, + "step": 11345 + }, + { + "epoch": 0.6244702515273267, + "grad_norm": 0.6837444305419922, + "learning_rate": 7.806034298574993e-06, + "loss": 0.7431, + "step": 11346 + }, + { + "epoch": 0.6245252903296824, + "grad_norm": 0.8486534953117371, + "learning_rate": 7.80567551829842e-06, + "loss": 0.7955, + "step": 11347 + }, + { + "epoch": 0.6245803291320381, + "grad_norm": 0.6459395885467529, + "learning_rate": 7.805316716935143e-06, + "loss": 0.7681, + "step": 11348 + }, + { + "epoch": 0.6246353679343938, + "grad_norm": 0.8414636850357056, + "learning_rate": 7.804957894487854e-06, + "loss": 0.8985, + "step": 11349 + }, + { + "epoch": 0.6246904067367494, + "grad_norm": 0.7930828928947449, + "learning_rate": 7.804599050959254e-06, + "loss": 0.7389, + "step": 11350 + }, + { + "epoch": 0.624745445539105, + "grad_norm": 0.7102516889572144, + "learning_rate": 7.804240186352038e-06, + "loss": 0.8072, + "step": 11351 + }, + { + "epoch": 0.6248004843414607, + "grad_norm": 0.773341178894043, + "learning_rate": 7.803881300668901e-06, + "loss": 0.7531, + "step": 11352 + }, + { + "epoch": 0.6248555231438164, + "grad_norm": 0.6354981064796448, + "learning_rate": 7.803522393912544e-06, + "loss": 0.6761, + "step": 11353 + }, + { + "epoch": 0.624910561946172, + "grad_norm": 0.7833859324455261, + "learning_rate": 7.803163466085663e-06, + "loss": 0.7768, + "step": 11354 + }, + { + "epoch": 0.6249656007485277, + "grad_norm": 0.6982376575469971, + "learning_rate": 7.802804517190957e-06, + "loss": 0.7472, + "step": 11355 + }, + { + "epoch": 0.6250206395508834, + "grad_norm": 0.7214694023132324, + "learning_rate": 7.80244554723112e-06, + "loss": 0.7919, + "step": 11356 + }, + { + "epoch": 0.6250756783532391, + "grad_norm": 0.8002933859825134, + "learning_rate": 7.802086556208855e-06, + "loss": 0.8278, + "step": 11357 + }, + { + "epoch": 0.6251307171555947, + "grad_norm": 0.7619680762290955, + "learning_rate": 7.801727544126858e-06, + "loss": 0.7775, + "step": 11358 + }, + { + "epoch": 0.6251857559579503, + "grad_norm": 0.6340392827987671, + "learning_rate": 7.801368510987825e-06, + "loss": 0.7324, + "step": 11359 + }, + { + "epoch": 0.625240794760306, + "grad_norm": 0.6754844784736633, + "learning_rate": 7.801009456794457e-06, + "loss": 0.7296, + "step": 11360 + }, + { + "epoch": 0.6252958335626617, + "grad_norm": 0.6871771216392517, + "learning_rate": 7.80065038154945e-06, + "loss": 0.7398, + "step": 11361 + }, + { + "epoch": 0.6253508723650173, + "grad_norm": 0.6610772013664246, + "learning_rate": 7.800291285255505e-06, + "loss": 0.738, + "step": 11362 + }, + { + "epoch": 0.625405911167373, + "grad_norm": 0.6858081221580505, + "learning_rate": 7.799932167915322e-06, + "loss": 0.7353, + "step": 11363 + }, + { + "epoch": 0.6254609499697287, + "grad_norm": 0.6698840856552124, + "learning_rate": 7.799573029531597e-06, + "loss": 0.7505, + "step": 11364 + }, + { + "epoch": 0.6255159887720844, + "grad_norm": 0.7374000549316406, + "learning_rate": 7.799213870107031e-06, + "loss": 0.7974, + "step": 11365 + }, + { + "epoch": 0.6255710275744399, + "grad_norm": 0.6962621808052063, + "learning_rate": 7.798854689644324e-06, + "loss": 0.8183, + "step": 11366 + }, + { + "epoch": 0.6256260663767956, + "grad_norm": 0.8477681279182434, + "learning_rate": 7.798495488146173e-06, + "loss": 0.7533, + "step": 11367 + }, + { + "epoch": 0.6256811051791513, + "grad_norm": 0.6963459253311157, + "learning_rate": 7.798136265615278e-06, + "loss": 0.6362, + "step": 11368 + }, + { + "epoch": 0.625736143981507, + "grad_norm": 0.7125601172447205, + "learning_rate": 7.79777702205434e-06, + "loss": 0.7296, + "step": 11369 + }, + { + "epoch": 0.6257911827838626, + "grad_norm": 0.6650554537773132, + "learning_rate": 7.79741775746606e-06, + "loss": 0.8231, + "step": 11370 + }, + { + "epoch": 0.6258462215862183, + "grad_norm": 0.6556620597839355, + "learning_rate": 7.797058471853138e-06, + "loss": 0.6952, + "step": 11371 + }, + { + "epoch": 0.625901260388574, + "grad_norm": 0.6350956559181213, + "learning_rate": 7.79669916521827e-06, + "loss": 0.686, + "step": 11372 + }, + { + "epoch": 0.6259562991909297, + "grad_norm": 0.6346702575683594, + "learning_rate": 7.796339837564163e-06, + "loss": 0.7234, + "step": 11373 + }, + { + "epoch": 0.6260113379932852, + "grad_norm": 0.741437554359436, + "learning_rate": 7.795980488893514e-06, + "loss": 0.8096, + "step": 11374 + }, + { + "epoch": 0.6260663767956409, + "grad_norm": 0.7057582139968872, + "learning_rate": 7.795621119209021e-06, + "loss": 0.8022, + "step": 11375 + }, + { + "epoch": 0.6261214155979966, + "grad_norm": 0.658107578754425, + "learning_rate": 7.79526172851339e-06, + "loss": 0.7564, + "step": 11376 + }, + { + "epoch": 0.6261764544003523, + "grad_norm": 0.7974086403846741, + "learning_rate": 7.79490231680932e-06, + "loss": 0.7721, + "step": 11377 + }, + { + "epoch": 0.6262314932027079, + "grad_norm": 0.6669130921363831, + "learning_rate": 7.794542884099513e-06, + "loss": 0.7652, + "step": 11378 + }, + { + "epoch": 0.6262865320050636, + "grad_norm": 0.7364919185638428, + "learning_rate": 7.794183430386669e-06, + "loss": 0.8679, + "step": 11379 + }, + { + "epoch": 0.6263415708074193, + "grad_norm": 0.7383667230606079, + "learning_rate": 7.793823955673489e-06, + "loss": 0.7715, + "step": 11380 + }, + { + "epoch": 0.626396609609775, + "grad_norm": 0.6688774228096008, + "learning_rate": 7.793464459962679e-06, + "loss": 0.7503, + "step": 11381 + }, + { + "epoch": 0.6264516484121305, + "grad_norm": 0.6771709322929382, + "learning_rate": 7.793104943256935e-06, + "loss": 0.7479, + "step": 11382 + }, + { + "epoch": 0.6265066872144862, + "grad_norm": 0.7121349573135376, + "learning_rate": 7.792745405558964e-06, + "loss": 0.7655, + "step": 11383 + }, + { + "epoch": 0.6265617260168419, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.792385846871465e-06, + "loss": 0.7418, + "step": 11384 + }, + { + "epoch": 0.6266167648191975, + "grad_norm": 0.6701569557189941, + "learning_rate": 7.792026267197142e-06, + "loss": 0.7669, + "step": 11385 + }, + { + "epoch": 0.6266718036215532, + "grad_norm": 0.6890652179718018, + "learning_rate": 7.791666666538697e-06, + "loss": 0.7659, + "step": 11386 + }, + { + "epoch": 0.6267268424239089, + "grad_norm": 0.7636297345161438, + "learning_rate": 7.791307044898833e-06, + "loss": 0.7272, + "step": 11387 + }, + { + "epoch": 0.6267818812262645, + "grad_norm": 0.6563602089881897, + "learning_rate": 7.790947402280252e-06, + "loss": 0.7603, + "step": 11388 + }, + { + "epoch": 0.6268369200286201, + "grad_norm": 0.7252678275108337, + "learning_rate": 7.790587738685655e-06, + "loss": 0.7789, + "step": 11389 + }, + { + "epoch": 0.6268919588309758, + "grad_norm": 0.6703618764877319, + "learning_rate": 7.79022805411775e-06, + "loss": 0.6883, + "step": 11390 + }, + { + "epoch": 0.6269469976333315, + "grad_norm": 0.7165848612785339, + "learning_rate": 7.789868348579239e-06, + "loss": 0.7944, + "step": 11391 + }, + { + "epoch": 0.6270020364356872, + "grad_norm": 0.9325329065322876, + "learning_rate": 7.789508622072822e-06, + "loss": 0.9059, + "step": 11392 + }, + { + "epoch": 0.6270570752380428, + "grad_norm": 0.6875555515289307, + "learning_rate": 7.789148874601204e-06, + "loss": 0.7115, + "step": 11393 + }, + { + "epoch": 0.6271121140403985, + "grad_norm": 0.6470181941986084, + "learning_rate": 7.788789106167093e-06, + "loss": 0.7603, + "step": 11394 + }, + { + "epoch": 0.6271671528427541, + "grad_norm": 0.688685417175293, + "learning_rate": 7.788429316773188e-06, + "loss": 0.8397, + "step": 11395 + }, + { + "epoch": 0.6272221916451098, + "grad_norm": 0.6299887895584106, + "learning_rate": 7.788069506422193e-06, + "loss": 0.7026, + "step": 11396 + }, + { + "epoch": 0.6272772304474654, + "grad_norm": 0.8046191930770874, + "learning_rate": 7.787709675116817e-06, + "loss": 0.8573, + "step": 11397 + }, + { + "epoch": 0.6273322692498211, + "grad_norm": 0.6700685620307922, + "learning_rate": 7.78734982285976e-06, + "loss": 0.7225, + "step": 11398 + }, + { + "epoch": 0.6273873080521768, + "grad_norm": 0.6968538761138916, + "learning_rate": 7.786989949653726e-06, + "loss": 0.6571, + "step": 11399 + }, + { + "epoch": 0.6274423468545325, + "grad_norm": 0.6857314705848694, + "learning_rate": 7.786630055501425e-06, + "loss": 0.8131, + "step": 11400 + }, + { + "epoch": 0.6274973856568881, + "grad_norm": 0.702316403388977, + "learning_rate": 7.786270140405557e-06, + "loss": 0.7222, + "step": 11401 + }, + { + "epoch": 0.6275524244592438, + "grad_norm": 0.6987283825874329, + "learning_rate": 7.785910204368827e-06, + "loss": 0.7171, + "step": 11402 + }, + { + "epoch": 0.6276074632615994, + "grad_norm": 0.6835529208183289, + "learning_rate": 7.785550247393943e-06, + "loss": 0.8077, + "step": 11403 + }, + { + "epoch": 0.6276625020639551, + "grad_norm": 0.6423392295837402, + "learning_rate": 7.785190269483609e-06, + "loss": 0.6689, + "step": 11404 + }, + { + "epoch": 0.6277175408663107, + "grad_norm": 0.6995517611503601, + "learning_rate": 7.78483027064053e-06, + "loss": 0.7417, + "step": 11405 + }, + { + "epoch": 0.6277725796686664, + "grad_norm": 0.6639729142189026, + "learning_rate": 7.784470250867413e-06, + "loss": 0.6521, + "step": 11406 + }, + { + "epoch": 0.6278276184710221, + "grad_norm": 0.7280262112617493, + "learning_rate": 7.784110210166961e-06, + "loss": 0.7686, + "step": 11407 + }, + { + "epoch": 0.6278826572733778, + "grad_norm": 0.6741863489151001, + "learning_rate": 7.783750148541884e-06, + "loss": 0.7794, + "step": 11408 + }, + { + "epoch": 0.6279376960757334, + "grad_norm": 0.8160151243209839, + "learning_rate": 7.783390065994885e-06, + "loss": 0.7065, + "step": 11409 + }, + { + "epoch": 0.627992734878089, + "grad_norm": 0.7288973927497864, + "learning_rate": 7.783029962528672e-06, + "loss": 0.8337, + "step": 11410 + }, + { + "epoch": 0.6280477736804447, + "grad_norm": 0.7764643430709839, + "learning_rate": 7.782669838145952e-06, + "loss": 0.8812, + "step": 11411 + }, + { + "epoch": 0.6281028124828004, + "grad_norm": 0.8145303130149841, + "learning_rate": 7.782309692849425e-06, + "loss": 0.9206, + "step": 11412 + }, + { + "epoch": 0.628157851285156, + "grad_norm": 0.6883288621902466, + "learning_rate": 7.781949526641808e-06, + "loss": 0.7779, + "step": 11413 + }, + { + "epoch": 0.6282128900875117, + "grad_norm": 0.7281043529510498, + "learning_rate": 7.781589339525803e-06, + "loss": 0.7933, + "step": 11414 + }, + { + "epoch": 0.6282679288898674, + "grad_norm": 0.7998347878456116, + "learning_rate": 7.781229131504115e-06, + "loss": 0.8772, + "step": 11415 + }, + { + "epoch": 0.6283229676922231, + "grad_norm": 0.7591177225112915, + "learning_rate": 7.780868902579455e-06, + "loss": 0.9054, + "step": 11416 + }, + { + "epoch": 0.6283780064945786, + "grad_norm": 0.7209650278091431, + "learning_rate": 7.780508652754528e-06, + "loss": 0.7781, + "step": 11417 + }, + { + "epoch": 0.6284330452969343, + "grad_norm": 1.2373511791229248, + "learning_rate": 7.780148382032042e-06, + "loss": 0.7501, + "step": 11418 + }, + { + "epoch": 0.62848808409929, + "grad_norm": 0.6281551122665405, + "learning_rate": 7.779788090414704e-06, + "loss": 0.8122, + "step": 11419 + }, + { + "epoch": 0.6285431229016457, + "grad_norm": 0.6954115629196167, + "learning_rate": 7.779427777905224e-06, + "loss": 0.7815, + "step": 11420 + }, + { + "epoch": 0.6285981617040013, + "grad_norm": 0.727043628692627, + "learning_rate": 7.77906744450631e-06, + "loss": 0.7116, + "step": 11421 + }, + { + "epoch": 0.628653200506357, + "grad_norm": 0.6979809403419495, + "learning_rate": 7.778707090220667e-06, + "loss": 0.7707, + "step": 11422 + }, + { + "epoch": 0.6287082393087127, + "grad_norm": 0.6851169466972351, + "learning_rate": 7.778346715051006e-06, + "loss": 0.811, + "step": 11423 + }, + { + "epoch": 0.6287632781110684, + "grad_norm": 0.70259028673172, + "learning_rate": 7.777986319000036e-06, + "loss": 0.7766, + "step": 11424 + }, + { + "epoch": 0.6288183169134239, + "grad_norm": 0.7436364889144897, + "learning_rate": 7.777625902070463e-06, + "loss": 0.8449, + "step": 11425 + }, + { + "epoch": 0.6288733557157796, + "grad_norm": 0.6452080607414246, + "learning_rate": 7.777265464264998e-06, + "loss": 0.7138, + "step": 11426 + }, + { + "epoch": 0.6289283945181353, + "grad_norm": 0.6329460144042969, + "learning_rate": 7.776905005586349e-06, + "loss": 0.6482, + "step": 11427 + }, + { + "epoch": 0.6289834333204909, + "grad_norm": 0.7521186470985413, + "learning_rate": 7.776544526037225e-06, + "loss": 0.751, + "step": 11428 + }, + { + "epoch": 0.6290384721228466, + "grad_norm": 0.7105319499969482, + "learning_rate": 7.776184025620334e-06, + "loss": 0.843, + "step": 11429 + }, + { + "epoch": 0.6290935109252023, + "grad_norm": 0.7329964637756348, + "learning_rate": 7.77582350433839e-06, + "loss": 0.6992, + "step": 11430 + }, + { + "epoch": 0.629148549727558, + "grad_norm": 0.7492092847824097, + "learning_rate": 7.775462962194098e-06, + "loss": 0.7579, + "step": 11431 + }, + { + "epoch": 0.6292035885299135, + "grad_norm": 0.7332866191864014, + "learning_rate": 7.77510239919017e-06, + "loss": 0.7758, + "step": 11432 + }, + { + "epoch": 0.6292586273322692, + "grad_norm": 0.7532867193222046, + "learning_rate": 7.774741815329315e-06, + "loss": 0.8157, + "step": 11433 + }, + { + "epoch": 0.6293136661346249, + "grad_norm": 0.7498316168785095, + "learning_rate": 7.774381210614244e-06, + "loss": 0.7671, + "step": 11434 + }, + { + "epoch": 0.6293687049369806, + "grad_norm": 0.8017444610595703, + "learning_rate": 7.774020585047666e-06, + "loss": 0.6989, + "step": 11435 + }, + { + "epoch": 0.6294237437393362, + "grad_norm": 0.7827737927436829, + "learning_rate": 7.77365993863229e-06, + "loss": 0.852, + "step": 11436 + }, + { + "epoch": 0.6294787825416919, + "grad_norm": 1.1411668062210083, + "learning_rate": 7.77329927137083e-06, + "loss": 0.9303, + "step": 11437 + }, + { + "epoch": 0.6295338213440476, + "grad_norm": 1.2931067943572998, + "learning_rate": 7.772938583265995e-06, + "loss": 0.8913, + "step": 11438 + }, + { + "epoch": 0.6295888601464033, + "grad_norm": 0.7407616376876831, + "learning_rate": 7.772577874320494e-06, + "loss": 0.9247, + "step": 11439 + }, + { + "epoch": 0.6296438989487588, + "grad_norm": 0.6544716954231262, + "learning_rate": 7.772217144537043e-06, + "loss": 0.7879, + "step": 11440 + }, + { + "epoch": 0.6296989377511145, + "grad_norm": 0.7467932105064392, + "learning_rate": 7.77185639391835e-06, + "loss": 0.7624, + "step": 11441 + }, + { + "epoch": 0.6297539765534702, + "grad_norm": 0.6845136880874634, + "learning_rate": 7.771495622467123e-06, + "loss": 0.691, + "step": 11442 + }, + { + "epoch": 0.6298090153558259, + "grad_norm": 0.7881575226783752, + "learning_rate": 7.771134830186079e-06, + "loss": 0.7567, + "step": 11443 + }, + { + "epoch": 0.6298640541581815, + "grad_norm": 0.6910528540611267, + "learning_rate": 7.770774017077928e-06, + "loss": 0.7527, + "step": 11444 + }, + { + "epoch": 0.6299190929605372, + "grad_norm": 0.7395550608634949, + "learning_rate": 7.770413183145379e-06, + "loss": 0.8288, + "step": 11445 + }, + { + "epoch": 0.6299741317628929, + "grad_norm": 0.6876364350318909, + "learning_rate": 7.770052328391147e-06, + "loss": 0.7759, + "step": 11446 + }, + { + "epoch": 0.6300291705652485, + "grad_norm": 0.7936999201774597, + "learning_rate": 7.769691452817945e-06, + "loss": 0.6885, + "step": 11447 + }, + { + "epoch": 0.6300842093676041, + "grad_norm": 0.721479058265686, + "learning_rate": 7.769330556428482e-06, + "loss": 0.7215, + "step": 11448 + }, + { + "epoch": 0.6301392481699598, + "grad_norm": 0.6549312472343445, + "learning_rate": 7.76896963922547e-06, + "loss": 0.7523, + "step": 11449 + }, + { + "epoch": 0.6301942869723155, + "grad_norm": 0.6684648394584656, + "learning_rate": 7.768608701211627e-06, + "loss": 0.768, + "step": 11450 + }, + { + "epoch": 0.6302493257746712, + "grad_norm": 0.7014286518096924, + "learning_rate": 7.76824774238966e-06, + "loss": 0.7534, + "step": 11451 + }, + { + "epoch": 0.6303043645770268, + "grad_norm": 0.9186445474624634, + "learning_rate": 7.767886762762284e-06, + "loss": 0.8398, + "step": 11452 + }, + { + "epoch": 0.6303594033793825, + "grad_norm": 0.787187933921814, + "learning_rate": 7.76752576233221e-06, + "loss": 0.8035, + "step": 11453 + }, + { + "epoch": 0.6304144421817381, + "grad_norm": 0.7471121549606323, + "learning_rate": 7.767164741102157e-06, + "loss": 0.7983, + "step": 11454 + }, + { + "epoch": 0.6304694809840938, + "grad_norm": 0.6810591816902161, + "learning_rate": 7.766803699074834e-06, + "loss": 0.7132, + "step": 11455 + }, + { + "epoch": 0.6305245197864494, + "grad_norm": 0.7154163122177124, + "learning_rate": 7.766442636252953e-06, + "loss": 0.7942, + "step": 11456 + }, + { + "epoch": 0.6305795585888051, + "grad_norm": 0.6990880966186523, + "learning_rate": 7.766081552639231e-06, + "loss": 0.7296, + "step": 11457 + }, + { + "epoch": 0.6306345973911608, + "grad_norm": 0.8848066926002502, + "learning_rate": 7.76572044823638e-06, + "loss": 0.621, + "step": 11458 + }, + { + "epoch": 0.6306896361935165, + "grad_norm": 0.6929910182952881, + "learning_rate": 7.765359323047116e-06, + "loss": 0.5917, + "step": 11459 + }, + { + "epoch": 0.6307446749958721, + "grad_norm": 0.6874505281448364, + "learning_rate": 7.764998177074149e-06, + "loss": 0.7244, + "step": 11460 + }, + { + "epoch": 0.6307997137982277, + "grad_norm": 0.6823066473007202, + "learning_rate": 7.764637010320197e-06, + "loss": 0.7299, + "step": 11461 + }, + { + "epoch": 0.6308547526005834, + "grad_norm": 0.7315061688423157, + "learning_rate": 7.764275822787972e-06, + "loss": 0.7759, + "step": 11462 + }, + { + "epoch": 0.6309097914029391, + "grad_norm": 0.6186662316322327, + "learning_rate": 7.763914614480192e-06, + "loss": 0.6746, + "step": 11463 + }, + { + "epoch": 0.6309648302052947, + "grad_norm": 0.6751530170440674, + "learning_rate": 7.763553385399569e-06, + "loss": 0.8371, + "step": 11464 + }, + { + "epoch": 0.6310198690076504, + "grad_norm": 1.0283396244049072, + "learning_rate": 7.763192135548818e-06, + "loss": 0.7743, + "step": 11465 + }, + { + "epoch": 0.6310749078100061, + "grad_norm": 0.7695029973983765, + "learning_rate": 7.762830864930655e-06, + "loss": 0.7387, + "step": 11466 + }, + { + "epoch": 0.6311299466123618, + "grad_norm": 0.8087024688720703, + "learning_rate": 7.762469573547795e-06, + "loss": 0.8357, + "step": 11467 + }, + { + "epoch": 0.6311849854147173, + "grad_norm": 0.9203382134437561, + "learning_rate": 7.762108261402951e-06, + "loss": 0.8191, + "step": 11468 + }, + { + "epoch": 0.631240024217073, + "grad_norm": 0.6569168567657471, + "learning_rate": 7.761746928498843e-06, + "loss": 0.7035, + "step": 11469 + }, + { + "epoch": 0.6312950630194287, + "grad_norm": 0.7903677225112915, + "learning_rate": 7.761385574838183e-06, + "loss": 0.8295, + "step": 11470 + }, + { + "epoch": 0.6313501018217843, + "grad_norm": 0.6780279278755188, + "learning_rate": 7.76102420042369e-06, + "loss": 0.6497, + "step": 11471 + }, + { + "epoch": 0.63140514062414, + "grad_norm": 0.7150516510009766, + "learning_rate": 7.760662805258076e-06, + "loss": 0.7979, + "step": 11472 + }, + { + "epoch": 0.6314601794264957, + "grad_norm": 0.7278215885162354, + "learning_rate": 7.760301389344061e-06, + "loss": 0.8503, + "step": 11473 + }, + { + "epoch": 0.6315152182288514, + "grad_norm": 0.8695063591003418, + "learning_rate": 7.75993995268436e-06, + "loss": 0.7796, + "step": 11474 + }, + { + "epoch": 0.631570257031207, + "grad_norm": 0.7154332399368286, + "learning_rate": 7.759578495281688e-06, + "loss": 0.725, + "step": 11475 + }, + { + "epoch": 0.6316252958335626, + "grad_norm": 0.7151778936386108, + "learning_rate": 7.759217017138763e-06, + "loss": 0.6932, + "step": 11476 + }, + { + "epoch": 0.6316803346359183, + "grad_norm": 0.6328319311141968, + "learning_rate": 7.758855518258301e-06, + "loss": 0.7382, + "step": 11477 + }, + { + "epoch": 0.631735373438274, + "grad_norm": 0.8377438187599182, + "learning_rate": 7.75849399864302e-06, + "loss": 0.7782, + "step": 11478 + }, + { + "epoch": 0.6317904122406296, + "grad_norm": 0.6654751896858215, + "learning_rate": 7.758132458295637e-06, + "loss": 0.8076, + "step": 11479 + }, + { + "epoch": 0.6318454510429853, + "grad_norm": 0.6841873526573181, + "learning_rate": 7.757770897218869e-06, + "loss": 0.7195, + "step": 11480 + }, + { + "epoch": 0.631900489845341, + "grad_norm": 0.7791223526000977, + "learning_rate": 7.757409315415431e-06, + "loss": 0.7858, + "step": 11481 + }, + { + "epoch": 0.6319555286476967, + "grad_norm": 0.6412019729614258, + "learning_rate": 7.757047712888044e-06, + "loss": 0.6853, + "step": 11482 + }, + { + "epoch": 0.6320105674500522, + "grad_norm": 0.7058777213096619, + "learning_rate": 7.756686089639425e-06, + "loss": 0.8955, + "step": 11483 + }, + { + "epoch": 0.6320656062524079, + "grad_norm": 0.6950271725654602, + "learning_rate": 7.75632444567229e-06, + "loss": 0.7213, + "step": 11484 + }, + { + "epoch": 0.6321206450547636, + "grad_norm": 0.6938642859458923, + "learning_rate": 7.755962780989359e-06, + "loss": 0.749, + "step": 11485 + }, + { + "epoch": 0.6321756838571193, + "grad_norm": 4.447030544281006, + "learning_rate": 7.755601095593348e-06, + "loss": 0.7603, + "step": 11486 + }, + { + "epoch": 0.6322307226594749, + "grad_norm": 0.6693708896636963, + "learning_rate": 7.755239389486979e-06, + "loss": 0.769, + "step": 11487 + }, + { + "epoch": 0.6322857614618306, + "grad_norm": 0.830352246761322, + "learning_rate": 7.754877662672968e-06, + "loss": 0.8069, + "step": 11488 + }, + { + "epoch": 0.6323408002641863, + "grad_norm": 0.7211840748786926, + "learning_rate": 7.754515915154033e-06, + "loss": 0.7972, + "step": 11489 + }, + { + "epoch": 0.632395839066542, + "grad_norm": 0.723101019859314, + "learning_rate": 7.754154146932893e-06, + "loss": 0.7385, + "step": 11490 + }, + { + "epoch": 0.6324508778688975, + "grad_norm": 0.6515377759933472, + "learning_rate": 7.75379235801227e-06, + "loss": 0.7527, + "step": 11491 + }, + { + "epoch": 0.6325059166712532, + "grad_norm": 0.6296554803848267, + "learning_rate": 7.75343054839488e-06, + "loss": 0.7135, + "step": 11492 + }, + { + "epoch": 0.6325609554736089, + "grad_norm": 0.8153911232948303, + "learning_rate": 7.753068718083441e-06, + "loss": 0.7298, + "step": 11493 + }, + { + "epoch": 0.6326159942759646, + "grad_norm": 0.6735014915466309, + "learning_rate": 7.752706867080676e-06, + "loss": 0.6851, + "step": 11494 + }, + { + "epoch": 0.6326710330783202, + "grad_norm": 0.7077293992042542, + "learning_rate": 7.752344995389303e-06, + "loss": 0.7806, + "step": 11495 + }, + { + "epoch": 0.6327260718806759, + "grad_norm": 0.6928272843360901, + "learning_rate": 7.751983103012042e-06, + "loss": 0.7538, + "step": 11496 + }, + { + "epoch": 0.6327811106830316, + "grad_norm": 0.7058837413787842, + "learning_rate": 7.751621189951612e-06, + "loss": 0.7065, + "step": 11497 + }, + { + "epoch": 0.6328361494853872, + "grad_norm": 0.7272600531578064, + "learning_rate": 7.751259256210735e-06, + "loss": 0.7468, + "step": 11498 + }, + { + "epoch": 0.6328911882877428, + "grad_norm": 0.6175968050956726, + "learning_rate": 7.75089730179213e-06, + "loss": 0.7195, + "step": 11499 + }, + { + "epoch": 0.6329462270900985, + "grad_norm": 0.6567386984825134, + "learning_rate": 7.750535326698514e-06, + "loss": 0.8147, + "step": 11500 + }, + { + "epoch": 0.6330012658924542, + "grad_norm": 0.6325315237045288, + "learning_rate": 7.750173330932613e-06, + "loss": 0.7087, + "step": 11501 + }, + { + "epoch": 0.6330563046948099, + "grad_norm": 0.8607509732246399, + "learning_rate": 7.749811314497147e-06, + "loss": 0.8009, + "step": 11502 + }, + { + "epoch": 0.6331113434971655, + "grad_norm": 0.7452824711799622, + "learning_rate": 7.749449277394833e-06, + "loss": 0.7497, + "step": 11503 + }, + { + "epoch": 0.6331663822995212, + "grad_norm": 0.7371357679367065, + "learning_rate": 7.749087219628395e-06, + "loss": 0.8936, + "step": 11504 + }, + { + "epoch": 0.6332214211018768, + "grad_norm": 0.7177306413650513, + "learning_rate": 7.748725141200552e-06, + "loss": 0.8327, + "step": 11505 + }, + { + "epoch": 0.6332764599042325, + "grad_norm": 0.5938527584075928, + "learning_rate": 7.748363042114028e-06, + "loss": 0.6471, + "step": 11506 + }, + { + "epoch": 0.6333314987065881, + "grad_norm": 0.8827341198921204, + "learning_rate": 7.748000922371543e-06, + "loss": 0.7247, + "step": 11507 + }, + { + "epoch": 0.6333865375089438, + "grad_norm": 0.7008641958236694, + "learning_rate": 7.747638781975818e-06, + "loss": 0.684, + "step": 11508 + }, + { + "epoch": 0.6334415763112995, + "grad_norm": 0.7752355337142944, + "learning_rate": 7.747276620929576e-06, + "loss": 0.7993, + "step": 11509 + }, + { + "epoch": 0.6334966151136552, + "grad_norm": 0.6928088068962097, + "learning_rate": 7.74691443923554e-06, + "loss": 0.7213, + "step": 11510 + }, + { + "epoch": 0.6335516539160108, + "grad_norm": 0.8197296261787415, + "learning_rate": 7.746552236896428e-06, + "loss": 0.847, + "step": 11511 + }, + { + "epoch": 0.6336066927183664, + "grad_norm": 0.7912493348121643, + "learning_rate": 7.746190013914966e-06, + "loss": 0.8217, + "step": 11512 + }, + { + "epoch": 0.6336617315207221, + "grad_norm": 0.7726556062698364, + "learning_rate": 7.745827770293871e-06, + "loss": 0.7626, + "step": 11513 + }, + { + "epoch": 0.6337167703230777, + "grad_norm": 0.668569028377533, + "learning_rate": 7.745465506035873e-06, + "loss": 0.7141, + "step": 11514 + }, + { + "epoch": 0.6337718091254334, + "grad_norm": 0.7226139903068542, + "learning_rate": 7.745103221143694e-06, + "loss": 0.7262, + "step": 11515 + }, + { + "epoch": 0.6338268479277891, + "grad_norm": 0.7315354943275452, + "learning_rate": 7.744740915620051e-06, + "loss": 0.7955, + "step": 11516 + }, + { + "epoch": 0.6338818867301448, + "grad_norm": 0.6815279126167297, + "learning_rate": 7.744378589467668e-06, + "loss": 0.7347, + "step": 11517 + }, + { + "epoch": 0.6339369255325004, + "grad_norm": 0.6931445598602295, + "learning_rate": 7.744016242689272e-06, + "loss": 0.7959, + "step": 11518 + }, + { + "epoch": 0.633991964334856, + "grad_norm": 0.7156991362571716, + "learning_rate": 7.743653875287584e-06, + "loss": 0.7793, + "step": 11519 + }, + { + "epoch": 0.6340470031372117, + "grad_norm": 0.8503926396369934, + "learning_rate": 7.74329148726533e-06, + "loss": 0.823, + "step": 11520 + }, + { + "epoch": 0.6341020419395674, + "grad_norm": 0.6280057430267334, + "learning_rate": 7.742929078625228e-06, + "loss": 0.6729, + "step": 11521 + }, + { + "epoch": 0.634157080741923, + "grad_norm": 0.7004517316818237, + "learning_rate": 7.742566649370008e-06, + "loss": 0.7578, + "step": 11522 + }, + { + "epoch": 0.6342121195442787, + "grad_norm": 0.7147908210754395, + "learning_rate": 7.74220419950239e-06, + "loss": 0.7705, + "step": 11523 + }, + { + "epoch": 0.6342671583466344, + "grad_norm": 0.7191137671470642, + "learning_rate": 7.7418417290251e-06, + "loss": 0.789, + "step": 11524 + }, + { + "epoch": 0.6343221971489901, + "grad_norm": 0.7288943529129028, + "learning_rate": 7.741479237940862e-06, + "loss": 0.8204, + "step": 11525 + }, + { + "epoch": 0.6343772359513457, + "grad_norm": 0.714821994304657, + "learning_rate": 7.741116726252398e-06, + "loss": 0.8252, + "step": 11526 + }, + { + "epoch": 0.6344322747537013, + "grad_norm": 0.6869103312492371, + "learning_rate": 7.740754193962435e-06, + "loss": 0.8136, + "step": 11527 + }, + { + "epoch": 0.634487313556057, + "grad_norm": 0.6629248857498169, + "learning_rate": 7.740391641073698e-06, + "loss": 0.7049, + "step": 11528 + }, + { + "epoch": 0.6345423523584127, + "grad_norm": 0.7078685164451599, + "learning_rate": 7.74002906758891e-06, + "loss": 0.7345, + "step": 11529 + }, + { + "epoch": 0.6345973911607683, + "grad_norm": 0.7748367190361023, + "learning_rate": 7.739666473510798e-06, + "loss": 0.7085, + "step": 11530 + }, + { + "epoch": 0.634652429963124, + "grad_norm": 0.6661930084228516, + "learning_rate": 7.739303858842086e-06, + "loss": 0.7795, + "step": 11531 + }, + { + "epoch": 0.6347074687654797, + "grad_norm": 0.6847965121269226, + "learning_rate": 7.738941223585499e-06, + "loss": 0.797, + "step": 11532 + }, + { + "epoch": 0.6347625075678354, + "grad_norm": 0.695184051990509, + "learning_rate": 7.738578567743762e-06, + "loss": 0.8184, + "step": 11533 + }, + { + "epoch": 0.6348175463701909, + "grad_norm": 0.6620088815689087, + "learning_rate": 7.738215891319603e-06, + "loss": 0.721, + "step": 11534 + }, + { + "epoch": 0.6348725851725466, + "grad_norm": 0.6802023649215698, + "learning_rate": 7.737853194315745e-06, + "loss": 0.9207, + "step": 11535 + }, + { + "epoch": 0.6349276239749023, + "grad_norm": 1.0193618535995483, + "learning_rate": 7.737490476734916e-06, + "loss": 0.8495, + "step": 11536 + }, + { + "epoch": 0.634982662777258, + "grad_norm": 0.6578189730644226, + "learning_rate": 7.737127738579841e-06, + "loss": 0.7455, + "step": 11537 + }, + { + "epoch": 0.6350377015796136, + "grad_norm": 0.70018470287323, + "learning_rate": 7.736764979853248e-06, + "loss": 0.7414, + "step": 11538 + }, + { + "epoch": 0.6350927403819693, + "grad_norm": 0.8136304616928101, + "learning_rate": 7.736402200557862e-06, + "loss": 0.7327, + "step": 11539 + }, + { + "epoch": 0.635147779184325, + "grad_norm": 0.7805309295654297, + "learning_rate": 7.736039400696408e-06, + "loss": 0.7659, + "step": 11540 + }, + { + "epoch": 0.6352028179866807, + "grad_norm": 0.675215482711792, + "learning_rate": 7.735676580271615e-06, + "loss": 0.7532, + "step": 11541 + }, + { + "epoch": 0.6352578567890362, + "grad_norm": 0.6873239874839783, + "learning_rate": 7.735313739286208e-06, + "loss": 0.8123, + "step": 11542 + }, + { + "epoch": 0.6353128955913919, + "grad_norm": 0.6624773144721985, + "learning_rate": 7.734950877742917e-06, + "loss": 0.7642, + "step": 11543 + }, + { + "epoch": 0.6353679343937476, + "grad_norm": 0.8047438859939575, + "learning_rate": 7.734587995644468e-06, + "loss": 0.7452, + "step": 11544 + }, + { + "epoch": 0.6354229731961033, + "grad_norm": 0.7449815273284912, + "learning_rate": 7.734225092993585e-06, + "loss": 0.7756, + "step": 11545 + }, + { + "epoch": 0.6354780119984589, + "grad_norm": 0.693081259727478, + "learning_rate": 7.733862169792999e-06, + "loss": 0.7029, + "step": 11546 + }, + { + "epoch": 0.6355330508008146, + "grad_norm": 0.6593700051307678, + "learning_rate": 7.733499226045437e-06, + "loss": 0.6009, + "step": 11547 + }, + { + "epoch": 0.6355880896031703, + "grad_norm": 0.7402041554450989, + "learning_rate": 7.733136261753627e-06, + "loss": 0.6921, + "step": 11548 + }, + { + "epoch": 0.635643128405526, + "grad_norm": 0.7686228156089783, + "learning_rate": 7.732773276920294e-06, + "loss": 0.855, + "step": 11549 + }, + { + "epoch": 0.6356981672078815, + "grad_norm": 0.6776669025421143, + "learning_rate": 7.732410271548171e-06, + "loss": 0.7146, + "step": 11550 + }, + { + "epoch": 0.6357532060102372, + "grad_norm": 0.6055952906608582, + "learning_rate": 7.732047245639983e-06, + "loss": 0.6926, + "step": 11551 + }, + { + "epoch": 0.6358082448125929, + "grad_norm": 0.7452635765075684, + "learning_rate": 7.731684199198461e-06, + "loss": 0.7766, + "step": 11552 + }, + { + "epoch": 0.6358632836149486, + "grad_norm": 0.7482720017433167, + "learning_rate": 7.73132113222633e-06, + "loss": 0.7725, + "step": 11553 + }, + { + "epoch": 0.6359183224173042, + "grad_norm": 0.6534025073051453, + "learning_rate": 7.73095804472632e-06, + "loss": 0.7902, + "step": 11554 + }, + { + "epoch": 0.6359733612196599, + "grad_norm": 0.7364560961723328, + "learning_rate": 7.730594936701162e-06, + "loss": 0.7998, + "step": 11555 + }, + { + "epoch": 0.6360284000220155, + "grad_norm": 0.6881458163261414, + "learning_rate": 7.730231808153582e-06, + "loss": 0.7586, + "step": 11556 + }, + { + "epoch": 0.6360834388243711, + "grad_norm": 0.6574262976646423, + "learning_rate": 7.72986865908631e-06, + "loss": 0.6999, + "step": 11557 + }, + { + "epoch": 0.6361384776267268, + "grad_norm": 0.6976385712623596, + "learning_rate": 7.729505489502078e-06, + "loss": 0.7387, + "step": 11558 + }, + { + "epoch": 0.6361935164290825, + "grad_norm": 0.6482532620429993, + "learning_rate": 7.729142299403613e-06, + "loss": 0.7715, + "step": 11559 + }, + { + "epoch": 0.6362485552314382, + "grad_norm": 0.7140287160873413, + "learning_rate": 7.728779088793643e-06, + "loss": 0.8562, + "step": 11560 + }, + { + "epoch": 0.6363035940337938, + "grad_norm": 0.6579470634460449, + "learning_rate": 7.728415857674901e-06, + "loss": 0.727, + "step": 11561 + }, + { + "epoch": 0.6363586328361495, + "grad_norm": 0.8670933246612549, + "learning_rate": 7.728052606050116e-06, + "loss": 0.7459, + "step": 11562 + }, + { + "epoch": 0.6364136716385052, + "grad_norm": 0.7995489835739136, + "learning_rate": 7.72768933392202e-06, + "loss": 0.8228, + "step": 11563 + }, + { + "epoch": 0.6364687104408608, + "grad_norm": 0.6467362642288208, + "learning_rate": 7.727326041293336e-06, + "loss": 0.7545, + "step": 11564 + }, + { + "epoch": 0.6365237492432164, + "grad_norm": 0.6646577715873718, + "learning_rate": 7.726962728166803e-06, + "loss": 0.7824, + "step": 11565 + }, + { + "epoch": 0.6365787880455721, + "grad_norm": 0.6576912999153137, + "learning_rate": 7.726599394545149e-06, + "loss": 0.7324, + "step": 11566 + }, + { + "epoch": 0.6366338268479278, + "grad_norm": 0.7514963150024414, + "learning_rate": 7.726236040431101e-06, + "loss": 0.7712, + "step": 11567 + }, + { + "epoch": 0.6366888656502835, + "grad_norm": 0.7313328981399536, + "learning_rate": 7.725872665827394e-06, + "loss": 0.7361, + "step": 11568 + }, + { + "epoch": 0.6367439044526391, + "grad_norm": 0.7109994292259216, + "learning_rate": 7.725509270736759e-06, + "loss": 0.812, + "step": 11569 + }, + { + "epoch": 0.6367989432549948, + "grad_norm": 1.128675103187561, + "learning_rate": 7.725145855161924e-06, + "loss": 0.726, + "step": 11570 + }, + { + "epoch": 0.6368539820573504, + "grad_norm": 0.7357437014579773, + "learning_rate": 7.724782419105622e-06, + "loss": 0.7958, + "step": 11571 + }, + { + "epoch": 0.6369090208597061, + "grad_norm": 0.6874725222587585, + "learning_rate": 7.724418962570587e-06, + "loss": 0.751, + "step": 11572 + }, + { + "epoch": 0.6369640596620617, + "grad_norm": 0.7175989747047424, + "learning_rate": 7.724055485559545e-06, + "loss": 0.7191, + "step": 11573 + }, + { + "epoch": 0.6370190984644174, + "grad_norm": 0.6424688100814819, + "learning_rate": 7.723691988075235e-06, + "loss": 0.608, + "step": 11574 + }, + { + "epoch": 0.6370741372667731, + "grad_norm": 0.6845381855964661, + "learning_rate": 7.723328470120383e-06, + "loss": 0.7465, + "step": 11575 + }, + { + "epoch": 0.6371291760691288, + "grad_norm": 0.7955030202865601, + "learning_rate": 7.722964931697723e-06, + "loss": 0.745, + "step": 11576 + }, + { + "epoch": 0.6371842148714844, + "grad_norm": 0.6855689883232117, + "learning_rate": 7.722601372809989e-06, + "loss": 0.7764, + "step": 11577 + }, + { + "epoch": 0.63723925367384, + "grad_norm": 0.7505692839622498, + "learning_rate": 7.722237793459909e-06, + "loss": 0.8324, + "step": 11578 + }, + { + "epoch": 0.6372942924761957, + "grad_norm": 0.6852842569351196, + "learning_rate": 7.721874193650221e-06, + "loss": 0.7599, + "step": 11579 + }, + { + "epoch": 0.6373493312785514, + "grad_norm": 0.698210597038269, + "learning_rate": 7.721510573383654e-06, + "loss": 0.843, + "step": 11580 + }, + { + "epoch": 0.637404370080907, + "grad_norm": 0.8344444632530212, + "learning_rate": 7.721146932662942e-06, + "loss": 0.8602, + "step": 11581 + }, + { + "epoch": 0.6374594088832627, + "grad_norm": 0.6385721564292908, + "learning_rate": 7.72078327149082e-06, + "loss": 0.7449, + "step": 11582 + }, + { + "epoch": 0.6375144476856184, + "grad_norm": 0.6474401354789734, + "learning_rate": 7.720419589870016e-06, + "loss": 0.6328, + "step": 11583 + }, + { + "epoch": 0.6375694864879741, + "grad_norm": 0.6554263234138489, + "learning_rate": 7.720055887803268e-06, + "loss": 0.6672, + "step": 11584 + }, + { + "epoch": 0.6376245252903296, + "grad_norm": 0.6551910638809204, + "learning_rate": 7.719692165293309e-06, + "loss": 0.8024, + "step": 11585 + }, + { + "epoch": 0.6376795640926853, + "grad_norm": 0.693418025970459, + "learning_rate": 7.719328422342871e-06, + "loss": 0.726, + "step": 11586 + }, + { + "epoch": 0.637734602895041, + "grad_norm": 0.8642090559005737, + "learning_rate": 7.718964658954689e-06, + "loss": 0.8274, + "step": 11587 + }, + { + "epoch": 0.6377896416973967, + "grad_norm": 0.8255778551101685, + "learning_rate": 7.718600875131494e-06, + "loss": 0.7259, + "step": 11588 + }, + { + "epoch": 0.6378446804997523, + "grad_norm": 0.7492913007736206, + "learning_rate": 7.718237070876025e-06, + "loss": 0.7093, + "step": 11589 + }, + { + "epoch": 0.637899719302108, + "grad_norm": 0.7154868245124817, + "learning_rate": 7.717873246191013e-06, + "loss": 0.7909, + "step": 11590 + }, + { + "epoch": 0.6379547581044637, + "grad_norm": 0.7751424312591553, + "learning_rate": 7.717509401079194e-06, + "loss": 0.8528, + "step": 11591 + }, + { + "epoch": 0.6380097969068194, + "grad_norm": 0.68199223279953, + "learning_rate": 7.7171455355433e-06, + "loss": 0.7077, + "step": 11592 + }, + { + "epoch": 0.6380648357091749, + "grad_norm": 0.7340414524078369, + "learning_rate": 7.716781649586069e-06, + "loss": 0.693, + "step": 11593 + }, + { + "epoch": 0.6381198745115306, + "grad_norm": 0.6278988122940063, + "learning_rate": 7.716417743210234e-06, + "loss": 0.7049, + "step": 11594 + }, + { + "epoch": 0.6381749133138863, + "grad_norm": 0.9113193154335022, + "learning_rate": 7.716053816418532e-06, + "loss": 0.7757, + "step": 11595 + }, + { + "epoch": 0.638229952116242, + "grad_norm": 0.7059371471405029, + "learning_rate": 7.715689869213694e-06, + "loss": 0.7805, + "step": 11596 + }, + { + "epoch": 0.6382849909185976, + "grad_norm": 0.7508488297462463, + "learning_rate": 7.71532590159846e-06, + "loss": 0.7394, + "step": 11597 + }, + { + "epoch": 0.6383400297209533, + "grad_norm": 0.8222774863243103, + "learning_rate": 7.71496191357556e-06, + "loss": 0.7675, + "step": 11598 + }, + { + "epoch": 0.638395068523309, + "grad_norm": 0.7295246124267578, + "learning_rate": 7.714597905147736e-06, + "loss": 0.7766, + "step": 11599 + }, + { + "epoch": 0.6384501073256645, + "grad_norm": 0.7482065558433533, + "learning_rate": 7.71423387631772e-06, + "loss": 0.7334, + "step": 11600 + }, + { + "epoch": 0.6385051461280202, + "grad_norm": 0.7654659748077393, + "learning_rate": 7.71386982708825e-06, + "loss": 0.8097, + "step": 11601 + }, + { + "epoch": 0.6385601849303759, + "grad_norm": 0.9125531911849976, + "learning_rate": 7.71350575746206e-06, + "loss": 0.7776, + "step": 11602 + }, + { + "epoch": 0.6386152237327316, + "grad_norm": 0.8063878417015076, + "learning_rate": 7.713141667441886e-06, + "loss": 0.7899, + "step": 11603 + }, + { + "epoch": 0.6386702625350872, + "grad_norm": 0.7315171360969543, + "learning_rate": 7.712777557030466e-06, + "loss": 0.7884, + "step": 11604 + }, + { + "epoch": 0.6387253013374429, + "grad_norm": 0.7306345105171204, + "learning_rate": 7.712413426230536e-06, + "loss": 0.8646, + "step": 11605 + }, + { + "epoch": 0.6387803401397986, + "grad_norm": 0.8300313353538513, + "learning_rate": 7.712049275044833e-06, + "loss": 0.8131, + "step": 11606 + }, + { + "epoch": 0.6388353789421543, + "grad_norm": 0.7513623237609863, + "learning_rate": 7.711685103476093e-06, + "loss": 0.8115, + "step": 11607 + }, + { + "epoch": 0.6388904177445098, + "grad_norm": 0.7126060128211975, + "learning_rate": 7.711320911527054e-06, + "loss": 0.8198, + "step": 11608 + }, + { + "epoch": 0.6389454565468655, + "grad_norm": 0.7017398476600647, + "learning_rate": 7.710956699200454e-06, + "loss": 0.8088, + "step": 11609 + }, + { + "epoch": 0.6390004953492212, + "grad_norm": 0.7345026135444641, + "learning_rate": 7.710592466499027e-06, + "loss": 0.8228, + "step": 11610 + }, + { + "epoch": 0.6390555341515769, + "grad_norm": 0.6903058886528015, + "learning_rate": 7.710228213425514e-06, + "loss": 0.7058, + "step": 11611 + }, + { + "epoch": 0.6391105729539325, + "grad_norm": 0.6838604211807251, + "learning_rate": 7.70986393998265e-06, + "loss": 0.7091, + "step": 11612 + }, + { + "epoch": 0.6391656117562882, + "grad_norm": 0.7067943811416626, + "learning_rate": 7.709499646173177e-06, + "loss": 0.7631, + "step": 11613 + }, + { + "epoch": 0.6392206505586439, + "grad_norm": 0.7577057480812073, + "learning_rate": 7.709135331999827e-06, + "loss": 0.7545, + "step": 11614 + }, + { + "epoch": 0.6392756893609995, + "grad_norm": 0.6425572633743286, + "learning_rate": 7.70877099746534e-06, + "loss": 0.7188, + "step": 11615 + }, + { + "epoch": 0.6393307281633551, + "grad_norm": 0.7257497310638428, + "learning_rate": 7.708406642572459e-06, + "loss": 0.7514, + "step": 11616 + }, + { + "epoch": 0.6393857669657108, + "grad_norm": 0.8214251399040222, + "learning_rate": 7.708042267323916e-06, + "loss": 0.7824, + "step": 11617 + }, + { + "epoch": 0.6394408057680665, + "grad_norm": 0.7879108786582947, + "learning_rate": 7.707677871722453e-06, + "loss": 0.6122, + "step": 11618 + }, + { + "epoch": 0.6394958445704222, + "grad_norm": 0.6656795740127563, + "learning_rate": 7.707313455770808e-06, + "loss": 0.754, + "step": 11619 + }, + { + "epoch": 0.6395508833727778, + "grad_norm": 0.7196451425552368, + "learning_rate": 7.70694901947172e-06, + "loss": 0.7662, + "step": 11620 + }, + { + "epoch": 0.6396059221751335, + "grad_norm": 0.8213779926300049, + "learning_rate": 7.706584562827928e-06, + "loss": 0.8732, + "step": 11621 + }, + { + "epoch": 0.6396609609774891, + "grad_norm": 0.7114893794059753, + "learning_rate": 7.70622008584217e-06, + "loss": 0.8493, + "step": 11622 + }, + { + "epoch": 0.6397159997798448, + "grad_norm": 0.7009783983230591, + "learning_rate": 7.705855588517188e-06, + "loss": 0.738, + "step": 11623 + }, + { + "epoch": 0.6397710385822004, + "grad_norm": 0.7576995491981506, + "learning_rate": 7.705491070855717e-06, + "loss": 0.8839, + "step": 11624 + }, + { + "epoch": 0.6398260773845561, + "grad_norm": 0.705784022808075, + "learning_rate": 7.7051265328605e-06, + "loss": 0.7246, + "step": 11625 + }, + { + "epoch": 0.6398811161869118, + "grad_norm": 0.6696903109550476, + "learning_rate": 7.704761974534277e-06, + "loss": 0.7418, + "step": 11626 + }, + { + "epoch": 0.6399361549892675, + "grad_norm": 0.8617024421691895, + "learning_rate": 7.704397395879786e-06, + "loss": 0.8109, + "step": 11627 + }, + { + "epoch": 0.6399911937916231, + "grad_norm": 0.6819054484367371, + "learning_rate": 7.70403279689977e-06, + "loss": 0.6438, + "step": 11628 + }, + { + "epoch": 0.6400462325939787, + "grad_norm": 0.6145044565200806, + "learning_rate": 7.703668177596966e-06, + "loss": 0.6712, + "step": 11629 + }, + { + "epoch": 0.6401012713963344, + "grad_norm": 0.6946390271186829, + "learning_rate": 7.703303537974116e-06, + "loss": 0.8099, + "step": 11630 + }, + { + "epoch": 0.6401563101986901, + "grad_norm": 0.6791605949401855, + "learning_rate": 7.702938878033961e-06, + "loss": 0.7494, + "step": 11631 + }, + { + "epoch": 0.6402113490010457, + "grad_norm": 0.6718626618385315, + "learning_rate": 7.70257419777924e-06, + "loss": 0.7471, + "step": 11632 + }, + { + "epoch": 0.6402663878034014, + "grad_norm": 0.8051798343658447, + "learning_rate": 7.702209497212694e-06, + "loss": 0.8569, + "step": 11633 + }, + { + "epoch": 0.6403214266057571, + "grad_norm": 0.6602774858474731, + "learning_rate": 7.701844776337067e-06, + "loss": 0.7396, + "step": 11634 + }, + { + "epoch": 0.6403764654081128, + "grad_norm": 0.672363817691803, + "learning_rate": 7.701480035155096e-06, + "loss": 0.7584, + "step": 11635 + }, + { + "epoch": 0.6404315042104683, + "grad_norm": 0.7363641262054443, + "learning_rate": 7.701115273669524e-06, + "loss": 0.8149, + "step": 11636 + }, + { + "epoch": 0.640486543012824, + "grad_norm": 0.7238422632217407, + "learning_rate": 7.700750491883094e-06, + "loss": 0.7598, + "step": 11637 + }, + { + "epoch": 0.6405415818151797, + "grad_norm": 1.3627614974975586, + "learning_rate": 7.700385689798544e-06, + "loss": 0.8303, + "step": 11638 + }, + { + "epoch": 0.6405966206175354, + "grad_norm": 0.6339633464813232, + "learning_rate": 7.70002086741862e-06, + "loss": 0.7308, + "step": 11639 + }, + { + "epoch": 0.640651659419891, + "grad_norm": 0.6821589469909668, + "learning_rate": 7.699656024746062e-06, + "loss": 0.6728, + "step": 11640 + }, + { + "epoch": 0.6407066982222467, + "grad_norm": 0.8514766097068787, + "learning_rate": 7.699291161783611e-06, + "loss": 0.8693, + "step": 11641 + }, + { + "epoch": 0.6407617370246024, + "grad_norm": 0.649075984954834, + "learning_rate": 7.698926278534011e-06, + "loss": 0.7482, + "step": 11642 + }, + { + "epoch": 0.640816775826958, + "grad_norm": 0.6507017016410828, + "learning_rate": 7.698561375000001e-06, + "loss": 0.7841, + "step": 11643 + }, + { + "epoch": 0.6408718146293136, + "grad_norm": 0.6736069321632385, + "learning_rate": 7.69819645118433e-06, + "loss": 0.74, + "step": 11644 + }, + { + "epoch": 0.6409268534316693, + "grad_norm": 0.6727941632270813, + "learning_rate": 7.697831507089734e-06, + "loss": 0.806, + "step": 11645 + }, + { + "epoch": 0.640981892234025, + "grad_norm": 0.7089083194732666, + "learning_rate": 7.697466542718959e-06, + "loss": 0.8091, + "step": 11646 + }, + { + "epoch": 0.6410369310363806, + "grad_norm": 0.6355387568473816, + "learning_rate": 7.69710155807475e-06, + "loss": 0.7033, + "step": 11647 + }, + { + "epoch": 0.6410919698387363, + "grad_norm": 0.6327098608016968, + "learning_rate": 7.696736553159846e-06, + "loss": 0.7664, + "step": 11648 + }, + { + "epoch": 0.641147008641092, + "grad_norm": 0.6971945762634277, + "learning_rate": 7.69637152797699e-06, + "loss": 0.7441, + "step": 11649 + }, + { + "epoch": 0.6412020474434477, + "grad_norm": 0.7420539855957031, + "learning_rate": 7.696006482528929e-06, + "loss": 0.7909, + "step": 11650 + }, + { + "epoch": 0.6412570862458032, + "grad_norm": 0.6877853274345398, + "learning_rate": 7.695641416818405e-06, + "loss": 0.7624, + "step": 11651 + }, + { + "epoch": 0.6413121250481589, + "grad_norm": 0.7337075471878052, + "learning_rate": 7.695276330848162e-06, + "loss": 0.7829, + "step": 11652 + }, + { + "epoch": 0.6413671638505146, + "grad_norm": 0.6423582434654236, + "learning_rate": 7.694911224620944e-06, + "loss": 0.6686, + "step": 11653 + }, + { + "epoch": 0.6414222026528703, + "grad_norm": 0.7826602458953857, + "learning_rate": 7.694546098139492e-06, + "loss": 0.774, + "step": 11654 + }, + { + "epoch": 0.6414772414552259, + "grad_norm": 0.7678147554397583, + "learning_rate": 7.694180951406556e-06, + "loss": 0.8067, + "step": 11655 + }, + { + "epoch": 0.6415322802575816, + "grad_norm": 0.6400566101074219, + "learning_rate": 7.693815784424875e-06, + "loss": 0.7796, + "step": 11656 + }, + { + "epoch": 0.6415873190599373, + "grad_norm": 0.6606197357177734, + "learning_rate": 7.693450597197196e-06, + "loss": 0.7381, + "step": 11657 + }, + { + "epoch": 0.641642357862293, + "grad_norm": 0.7953683137893677, + "learning_rate": 7.693085389726262e-06, + "loss": 0.8867, + "step": 11658 + }, + { + "epoch": 0.6416973966646485, + "grad_norm": 0.6763843894004822, + "learning_rate": 7.692720162014822e-06, + "loss": 0.7579, + "step": 11659 + }, + { + "epoch": 0.6417524354670042, + "grad_norm": 0.6456292867660522, + "learning_rate": 7.692354914065617e-06, + "loss": 0.7814, + "step": 11660 + }, + { + "epoch": 0.6418074742693599, + "grad_norm": 0.702803373336792, + "learning_rate": 7.691989645881393e-06, + "loss": 0.7393, + "step": 11661 + }, + { + "epoch": 0.6418625130717156, + "grad_norm": 0.8328298926353455, + "learning_rate": 7.691624357464895e-06, + "loss": 0.6587, + "step": 11662 + }, + { + "epoch": 0.6419175518740712, + "grad_norm": 0.8409613966941833, + "learning_rate": 7.691259048818871e-06, + "loss": 0.8075, + "step": 11663 + }, + { + "epoch": 0.6419725906764269, + "grad_norm": 0.6969256401062012, + "learning_rate": 7.690893719946062e-06, + "loss": 0.8061, + "step": 11664 + }, + { + "epoch": 0.6420276294787826, + "grad_norm": 0.7689732313156128, + "learning_rate": 7.690528370849217e-06, + "loss": 0.7709, + "step": 11665 + }, + { + "epoch": 0.6420826682811382, + "grad_norm": 0.8239523768424988, + "learning_rate": 7.69016300153108e-06, + "loss": 0.7421, + "step": 11666 + }, + { + "epoch": 0.6421377070834938, + "grad_norm": 0.7199227809906006, + "learning_rate": 7.689797611994398e-06, + "loss": 0.7877, + "step": 11667 + }, + { + "epoch": 0.6421927458858495, + "grad_norm": 0.8315985798835754, + "learning_rate": 7.689432202241919e-06, + "loss": 0.8458, + "step": 11668 + }, + { + "epoch": 0.6422477846882052, + "grad_norm": 0.7213512063026428, + "learning_rate": 7.689066772276385e-06, + "loss": 0.7199, + "step": 11669 + }, + { + "epoch": 0.6423028234905609, + "grad_norm": 0.6023604273796082, + "learning_rate": 7.688701322100547e-06, + "loss": 0.6485, + "step": 11670 + }, + { + "epoch": 0.6423578622929165, + "grad_norm": 0.8171319365501404, + "learning_rate": 7.688335851717148e-06, + "loss": 0.7561, + "step": 11671 + }, + { + "epoch": 0.6424129010952722, + "grad_norm": 0.6545816659927368, + "learning_rate": 7.687970361128937e-06, + "loss": 0.6796, + "step": 11672 + }, + { + "epoch": 0.6424679398976278, + "grad_norm": 0.8093686103820801, + "learning_rate": 7.687604850338661e-06, + "loss": 0.8538, + "step": 11673 + }, + { + "epoch": 0.6425229786999835, + "grad_norm": 0.6438135504722595, + "learning_rate": 7.687239319349066e-06, + "loss": 0.7046, + "step": 11674 + }, + { + "epoch": 0.6425780175023391, + "grad_norm": 0.685100257396698, + "learning_rate": 7.6868737681629e-06, + "loss": 0.7568, + "step": 11675 + }, + { + "epoch": 0.6426330563046948, + "grad_norm": 0.6850112676620483, + "learning_rate": 7.68650819678291e-06, + "loss": 0.7082, + "step": 11676 + }, + { + "epoch": 0.6426880951070505, + "grad_norm": 0.7524490356445312, + "learning_rate": 7.686142605211843e-06, + "loss": 0.7285, + "step": 11677 + }, + { + "epoch": 0.6427431339094062, + "grad_norm": 0.7706617116928101, + "learning_rate": 7.685776993452446e-06, + "loss": 0.7934, + "step": 11678 + }, + { + "epoch": 0.6427981727117618, + "grad_norm": 0.6612235307693481, + "learning_rate": 7.68541136150747e-06, + "loss": 0.6538, + "step": 11679 + }, + { + "epoch": 0.6428532115141175, + "grad_norm": 0.6380587816238403, + "learning_rate": 7.68504570937966e-06, + "loss": 0.7, + "step": 11680 + }, + { + "epoch": 0.6429082503164731, + "grad_norm": 0.6563882231712341, + "learning_rate": 7.684680037071765e-06, + "loss": 0.6912, + "step": 11681 + }, + { + "epoch": 0.6429632891188288, + "grad_norm": 0.6579793095588684, + "learning_rate": 7.684314344586534e-06, + "loss": 0.7263, + "step": 11682 + }, + { + "epoch": 0.6430183279211844, + "grad_norm": 0.7029374837875366, + "learning_rate": 7.683948631926713e-06, + "loss": 0.7151, + "step": 11683 + }, + { + "epoch": 0.6430733667235401, + "grad_norm": 0.6683217883110046, + "learning_rate": 7.683582899095056e-06, + "loss": 0.7643, + "step": 11684 + }, + { + "epoch": 0.6431284055258958, + "grad_norm": 1.0482646226882935, + "learning_rate": 7.683217146094308e-06, + "loss": 0.8889, + "step": 11685 + }, + { + "epoch": 0.6431834443282514, + "grad_norm": 0.7101102471351624, + "learning_rate": 7.682851372927216e-06, + "loss": 0.7762, + "step": 11686 + }, + { + "epoch": 0.643238483130607, + "grad_norm": 0.674961268901825, + "learning_rate": 7.682485579596533e-06, + "loss": 0.736, + "step": 11687 + }, + { + "epoch": 0.6432935219329627, + "grad_norm": 0.7071837782859802, + "learning_rate": 7.682119766105005e-06, + "loss": 0.7231, + "step": 11688 + }, + { + "epoch": 0.6433485607353184, + "grad_norm": 0.6982744932174683, + "learning_rate": 7.681753932455383e-06, + "loss": 0.7498, + "step": 11689 + }, + { + "epoch": 0.643403599537674, + "grad_norm": 0.6927201747894287, + "learning_rate": 7.681388078650415e-06, + "loss": 0.803, + "step": 11690 + }, + { + "epoch": 0.6434586383400297, + "grad_norm": 0.7299236059188843, + "learning_rate": 7.681022204692854e-06, + "loss": 0.7386, + "step": 11691 + }, + { + "epoch": 0.6435136771423854, + "grad_norm": 0.8809047937393188, + "learning_rate": 7.680656310585449e-06, + "loss": 0.741, + "step": 11692 + }, + { + "epoch": 0.6435687159447411, + "grad_norm": 0.862843930721283, + "learning_rate": 7.680290396330947e-06, + "loss": 0.8357, + "step": 11693 + }, + { + "epoch": 0.6436237547470967, + "grad_norm": 0.7436664700508118, + "learning_rate": 7.679924461932098e-06, + "loss": 0.8352, + "step": 11694 + }, + { + "epoch": 0.6436787935494523, + "grad_norm": 0.6582232713699341, + "learning_rate": 7.679558507391657e-06, + "loss": 0.7107, + "step": 11695 + }, + { + "epoch": 0.643733832351808, + "grad_norm": 0.6798850297927856, + "learning_rate": 7.67919253271237e-06, + "loss": 0.6968, + "step": 11696 + }, + { + "epoch": 0.6437888711541637, + "grad_norm": 0.7747187614440918, + "learning_rate": 7.67882653789699e-06, + "loss": 0.7611, + "step": 11697 + }, + { + "epoch": 0.6438439099565193, + "grad_norm": 0.7097567915916443, + "learning_rate": 7.678460522948267e-06, + "loss": 0.7275, + "step": 11698 + }, + { + "epoch": 0.643898948758875, + "grad_norm": 0.6958394050598145, + "learning_rate": 7.678094487868952e-06, + "loss": 0.7441, + "step": 11699 + }, + { + "epoch": 0.6439539875612307, + "grad_norm": 0.9129040837287903, + "learning_rate": 7.677728432661794e-06, + "loss": 0.7693, + "step": 11700 + }, + { + "epoch": 0.6440090263635864, + "grad_norm": 1.1396137475967407, + "learning_rate": 7.677362357329548e-06, + "loss": 0.7479, + "step": 11701 + }, + { + "epoch": 0.644064065165942, + "grad_norm": 0.8163042664527893, + "learning_rate": 7.67699626187496e-06, + "loss": 0.835, + "step": 11702 + }, + { + "epoch": 0.6441191039682976, + "grad_norm": 0.9869117736816406, + "learning_rate": 7.676630146300787e-06, + "loss": 0.769, + "step": 11703 + }, + { + "epoch": 0.6441741427706533, + "grad_norm": 0.7439526915550232, + "learning_rate": 7.676264010609777e-06, + "loss": 0.8239, + "step": 11704 + }, + { + "epoch": 0.644229181573009, + "grad_norm": 0.6943735480308533, + "learning_rate": 7.675897854804685e-06, + "loss": 0.7702, + "step": 11705 + }, + { + "epoch": 0.6442842203753646, + "grad_norm": 0.7384238243103027, + "learning_rate": 7.67553167888826e-06, + "loss": 0.6911, + "step": 11706 + }, + { + "epoch": 0.6443392591777203, + "grad_norm": 0.660022497177124, + "learning_rate": 7.675165482863254e-06, + "loss": 0.7359, + "step": 11707 + }, + { + "epoch": 0.644394297980076, + "grad_norm": 0.6956108808517456, + "learning_rate": 7.674799266732422e-06, + "loss": 0.7845, + "step": 11708 + }, + { + "epoch": 0.6444493367824317, + "grad_norm": 0.7361618280410767, + "learning_rate": 7.674433030498513e-06, + "loss": 0.7391, + "step": 11709 + }, + { + "epoch": 0.6445043755847872, + "grad_norm": 0.7655043005943298, + "learning_rate": 7.674066774164284e-06, + "loss": 0.8305, + "step": 11710 + }, + { + "epoch": 0.6445594143871429, + "grad_norm": 0.7160911560058594, + "learning_rate": 7.673700497732483e-06, + "loss": 0.7654, + "step": 11711 + }, + { + "epoch": 0.6446144531894986, + "grad_norm": 0.7812016010284424, + "learning_rate": 7.673334201205866e-06, + "loss": 0.8212, + "step": 11712 + }, + { + "epoch": 0.6446694919918543, + "grad_norm": 0.7457767128944397, + "learning_rate": 7.672967884587184e-06, + "loss": 0.8084, + "step": 11713 + }, + { + "epoch": 0.6447245307942099, + "grad_norm": 0.7524051070213318, + "learning_rate": 7.672601547879189e-06, + "loss": 0.7525, + "step": 11714 + }, + { + "epoch": 0.6447795695965656, + "grad_norm": 0.7271043062210083, + "learning_rate": 7.672235191084638e-06, + "loss": 0.7627, + "step": 11715 + }, + { + "epoch": 0.6448346083989213, + "grad_norm": 0.6893014907836914, + "learning_rate": 7.671868814206283e-06, + "loss": 0.7969, + "step": 11716 + }, + { + "epoch": 0.644889647201277, + "grad_norm": 0.7057414054870605, + "learning_rate": 7.671502417246876e-06, + "loss": 0.7448, + "step": 11717 + }, + { + "epoch": 0.6449446860036325, + "grad_norm": 0.7490910887718201, + "learning_rate": 7.671136000209172e-06, + "loss": 0.8046, + "step": 11718 + }, + { + "epoch": 0.6449997248059882, + "grad_norm": 0.7338950634002686, + "learning_rate": 7.670769563095926e-06, + "loss": 0.8521, + "step": 11719 + }, + { + "epoch": 0.6450547636083439, + "grad_norm": 0.8669398427009583, + "learning_rate": 7.670403105909891e-06, + "loss": 0.7803, + "step": 11720 + }, + { + "epoch": 0.6451098024106996, + "grad_norm": 0.7012562155723572, + "learning_rate": 7.67003662865382e-06, + "loss": 0.8047, + "step": 11721 + }, + { + "epoch": 0.6451648412130552, + "grad_norm": 0.9933050274848938, + "learning_rate": 7.66967013133047e-06, + "loss": 0.7081, + "step": 11722 + }, + { + "epoch": 0.6452198800154109, + "grad_norm": 1.12044358253479, + "learning_rate": 7.669303613942592e-06, + "loss": 0.7315, + "step": 11723 + }, + { + "epoch": 0.6452749188177666, + "grad_norm": 0.8654733300209045, + "learning_rate": 7.668937076492943e-06, + "loss": 0.6849, + "step": 11724 + }, + { + "epoch": 0.6453299576201222, + "grad_norm": 0.7081291675567627, + "learning_rate": 7.668570518984277e-06, + "loss": 0.7584, + "step": 11725 + }, + { + "epoch": 0.6453849964224778, + "grad_norm": 0.7473898530006409, + "learning_rate": 7.66820394141935e-06, + "loss": 0.8364, + "step": 11726 + }, + { + "epoch": 0.6454400352248335, + "grad_norm": 0.7863657474517822, + "learning_rate": 7.667837343800916e-06, + "loss": 0.7235, + "step": 11727 + }, + { + "epoch": 0.6454950740271892, + "grad_norm": 0.6664546728134155, + "learning_rate": 7.667470726131732e-06, + "loss": 0.7203, + "step": 11728 + }, + { + "epoch": 0.6455501128295448, + "grad_norm": 0.7182374596595764, + "learning_rate": 7.667104088414552e-06, + "loss": 0.7376, + "step": 11729 + }, + { + "epoch": 0.6456051516319005, + "grad_norm": 0.6518070697784424, + "learning_rate": 7.666737430652128e-06, + "loss": 0.6804, + "step": 11730 + }, + { + "epoch": 0.6456601904342562, + "grad_norm": 0.7354047894477844, + "learning_rate": 7.666370752847223e-06, + "loss": 0.7648, + "step": 11731 + }, + { + "epoch": 0.6457152292366118, + "grad_norm": 0.7440805435180664, + "learning_rate": 7.666004055002588e-06, + "loss": 0.7674, + "step": 11732 + }, + { + "epoch": 0.6457702680389674, + "grad_norm": 1.6423569917678833, + "learning_rate": 7.665637337120981e-06, + "loss": 0.8957, + "step": 11733 + }, + { + "epoch": 0.6458253068413231, + "grad_norm": 0.6960558295249939, + "learning_rate": 7.665270599205156e-06, + "loss": 0.7278, + "step": 11734 + }, + { + "epoch": 0.6458803456436788, + "grad_norm": 0.6983850002288818, + "learning_rate": 7.664903841257871e-06, + "loss": 0.7351, + "step": 11735 + }, + { + "epoch": 0.6459353844460345, + "grad_norm": 0.6905686855316162, + "learning_rate": 7.664537063281883e-06, + "loss": 0.7558, + "step": 11736 + }, + { + "epoch": 0.6459904232483901, + "grad_norm": 0.7483980655670166, + "learning_rate": 7.664170265279946e-06, + "loss": 0.813, + "step": 11737 + }, + { + "epoch": 0.6460454620507458, + "grad_norm": 0.767756998538971, + "learning_rate": 7.66380344725482e-06, + "loss": 0.8397, + "step": 11738 + }, + { + "epoch": 0.6461005008531014, + "grad_norm": 0.7813250422477722, + "learning_rate": 7.66343660920926e-06, + "loss": 0.8034, + "step": 11739 + }, + { + "epoch": 0.6461555396554571, + "grad_norm": 0.7357046604156494, + "learning_rate": 7.663069751146022e-06, + "loss": 0.7604, + "step": 11740 + }, + { + "epoch": 0.6462105784578127, + "grad_norm": 0.620285153388977, + "learning_rate": 7.662702873067866e-06, + "loss": 0.6191, + "step": 11741 + }, + { + "epoch": 0.6462656172601684, + "grad_norm": 0.6711301803588867, + "learning_rate": 7.662335974977549e-06, + "loss": 0.7674, + "step": 11742 + }, + { + "epoch": 0.6463206560625241, + "grad_norm": 0.756258487701416, + "learning_rate": 7.661969056877824e-06, + "loss": 0.7074, + "step": 11743 + }, + { + "epoch": 0.6463756948648798, + "grad_norm": 0.8121050596237183, + "learning_rate": 7.661602118771456e-06, + "loss": 0.8028, + "step": 11744 + }, + { + "epoch": 0.6464307336672354, + "grad_norm": 0.735906720161438, + "learning_rate": 7.661235160661197e-06, + "loss": 0.7197, + "step": 11745 + }, + { + "epoch": 0.646485772469591, + "grad_norm": 0.644490122795105, + "learning_rate": 7.660868182549807e-06, + "loss": 0.6172, + "step": 11746 + }, + { + "epoch": 0.6465408112719467, + "grad_norm": 0.7228739261627197, + "learning_rate": 7.660501184440045e-06, + "loss": 0.8302, + "step": 11747 + }, + { + "epoch": 0.6465958500743024, + "grad_norm": 0.8292868137359619, + "learning_rate": 7.660134166334668e-06, + "loss": 0.7506, + "step": 11748 + }, + { + "epoch": 0.646650888876658, + "grad_norm": 0.7224695086479187, + "learning_rate": 7.659767128236433e-06, + "loss": 0.8043, + "step": 11749 + }, + { + "epoch": 0.6467059276790137, + "grad_norm": 0.7092188000679016, + "learning_rate": 7.659400070148102e-06, + "loss": 0.7838, + "step": 11750 + }, + { + "epoch": 0.6467609664813694, + "grad_norm": 0.6975178122520447, + "learning_rate": 7.65903299207243e-06, + "loss": 0.7576, + "step": 11751 + }, + { + "epoch": 0.6468160052837251, + "grad_norm": 0.6524471044540405, + "learning_rate": 7.658665894012179e-06, + "loss": 0.7822, + "step": 11752 + }, + { + "epoch": 0.6468710440860806, + "grad_norm": 0.8134269118309021, + "learning_rate": 7.658298775970107e-06, + "loss": 0.8116, + "step": 11753 + }, + { + "epoch": 0.6469260828884363, + "grad_norm": 0.7166362404823303, + "learning_rate": 7.657931637948974e-06, + "loss": 0.768, + "step": 11754 + }, + { + "epoch": 0.646981121690792, + "grad_norm": 0.6418643593788147, + "learning_rate": 7.657564479951535e-06, + "loss": 0.7488, + "step": 11755 + }, + { + "epoch": 0.6470361604931477, + "grad_norm": 0.7104085087776184, + "learning_rate": 7.657197301980556e-06, + "loss": 0.7518, + "step": 11756 + }, + { + "epoch": 0.6470911992955033, + "grad_norm": 0.7297894358634949, + "learning_rate": 7.656830104038793e-06, + "loss": 0.7877, + "step": 11757 + }, + { + "epoch": 0.647146238097859, + "grad_norm": 0.8037092089653015, + "learning_rate": 7.656462886129006e-06, + "loss": 0.7375, + "step": 11758 + }, + { + "epoch": 0.6472012769002147, + "grad_norm": 0.7498913407325745, + "learning_rate": 7.656095648253955e-06, + "loss": 0.7899, + "step": 11759 + }, + { + "epoch": 0.6472563157025704, + "grad_norm": 0.7383849620819092, + "learning_rate": 7.655728390416398e-06, + "loss": 0.8276, + "step": 11760 + }, + { + "epoch": 0.6473113545049259, + "grad_norm": 0.750481367111206, + "learning_rate": 7.6553611126191e-06, + "loss": 0.7649, + "step": 11761 + }, + { + "epoch": 0.6473663933072816, + "grad_norm": 0.8483286499977112, + "learning_rate": 7.654993814864817e-06, + "loss": 0.877, + "step": 11762 + }, + { + "epoch": 0.6474214321096373, + "grad_norm": 0.7938307523727417, + "learning_rate": 7.654626497156311e-06, + "loss": 0.8159, + "step": 11763 + }, + { + "epoch": 0.647476470911993, + "grad_norm": 0.6576653122901917, + "learning_rate": 7.654259159496343e-06, + "loss": 0.797, + "step": 11764 + }, + { + "epoch": 0.6475315097143486, + "grad_norm": 0.6495664715766907, + "learning_rate": 7.653891801887675e-06, + "loss": 0.6641, + "step": 11765 + }, + { + "epoch": 0.6475865485167043, + "grad_norm": 0.7447353601455688, + "learning_rate": 7.653524424333065e-06, + "loss": 0.667, + "step": 11766 + }, + { + "epoch": 0.64764158731906, + "grad_norm": 0.6565769910812378, + "learning_rate": 7.653157026835277e-06, + "loss": 0.7123, + "step": 11767 + }, + { + "epoch": 0.6476966261214157, + "grad_norm": 0.8406145572662354, + "learning_rate": 7.652789609397072e-06, + "loss": 0.7582, + "step": 11768 + }, + { + "epoch": 0.6477516649237712, + "grad_norm": 0.8478217720985413, + "learning_rate": 7.652422172021207e-06, + "loss": 0.6758, + "step": 11769 + }, + { + "epoch": 0.6478067037261269, + "grad_norm": 0.7230110168457031, + "learning_rate": 7.652054714710448e-06, + "loss": 0.8216, + "step": 11770 + }, + { + "epoch": 0.6478617425284826, + "grad_norm": 0.6718668341636658, + "learning_rate": 7.651687237467558e-06, + "loss": 0.7204, + "step": 11771 + }, + { + "epoch": 0.6479167813308382, + "grad_norm": 1.062383770942688, + "learning_rate": 7.651319740295296e-06, + "loss": 0.6853, + "step": 11772 + }, + { + "epoch": 0.6479718201331939, + "grad_norm": 0.7157385945320129, + "learning_rate": 7.650952223196423e-06, + "loss": 0.6826, + "step": 11773 + }, + { + "epoch": 0.6480268589355496, + "grad_norm": 0.6762190461158752, + "learning_rate": 7.650584686173703e-06, + "loss": 0.7673, + "step": 11774 + }, + { + "epoch": 0.6480818977379053, + "grad_norm": 0.7540121674537659, + "learning_rate": 7.650217129229897e-06, + "loss": 0.7361, + "step": 11775 + }, + { + "epoch": 0.6481369365402608, + "grad_norm": 1.0383096933364868, + "learning_rate": 7.649849552367771e-06, + "loss": 0.7936, + "step": 11776 + }, + { + "epoch": 0.6481919753426165, + "grad_norm": 0.6430917382240295, + "learning_rate": 7.649481955590084e-06, + "loss": 0.7738, + "step": 11777 + }, + { + "epoch": 0.6482470141449722, + "grad_norm": 0.7846735715866089, + "learning_rate": 7.6491143388996e-06, + "loss": 0.6892, + "step": 11778 + }, + { + "epoch": 0.6483020529473279, + "grad_norm": 0.7154437899589539, + "learning_rate": 7.64874670229908e-06, + "loss": 0.6889, + "step": 11779 + }, + { + "epoch": 0.6483570917496835, + "grad_norm": 0.731270432472229, + "learning_rate": 7.648379045791291e-06, + "loss": 0.6405, + "step": 11780 + }, + { + "epoch": 0.6484121305520392, + "grad_norm": 0.6782581210136414, + "learning_rate": 7.648011369378993e-06, + "loss": 0.7822, + "step": 11781 + }, + { + "epoch": 0.6484671693543949, + "grad_norm": 0.7025747299194336, + "learning_rate": 7.64764367306495e-06, + "loss": 0.6929, + "step": 11782 + }, + { + "epoch": 0.6485222081567505, + "grad_norm": 0.6791071891784668, + "learning_rate": 7.647275956851928e-06, + "loss": 0.7507, + "step": 11783 + }, + { + "epoch": 0.6485772469591061, + "grad_norm": 0.7598931193351746, + "learning_rate": 7.646908220742686e-06, + "loss": 0.776, + "step": 11784 + }, + { + "epoch": 0.6486322857614618, + "grad_norm": 0.6930273771286011, + "learning_rate": 7.646540464739993e-06, + "loss": 0.7653, + "step": 11785 + }, + { + "epoch": 0.6486873245638175, + "grad_norm": 0.7276393175125122, + "learning_rate": 7.646172688846608e-06, + "loss": 0.8102, + "step": 11786 + }, + { + "epoch": 0.6487423633661732, + "grad_norm": 0.6826562285423279, + "learning_rate": 7.645804893065298e-06, + "loss": 0.6182, + "step": 11787 + }, + { + "epoch": 0.6487974021685288, + "grad_norm": 0.7837507128715515, + "learning_rate": 7.645437077398827e-06, + "loss": 0.8124, + "step": 11788 + }, + { + "epoch": 0.6488524409708845, + "grad_norm": 0.6937540769577026, + "learning_rate": 7.645069241849959e-06, + "loss": 0.7831, + "step": 11789 + }, + { + "epoch": 0.6489074797732401, + "grad_norm": 0.6531546115875244, + "learning_rate": 7.644701386421458e-06, + "loss": 0.755, + "step": 11790 + }, + { + "epoch": 0.6489625185755958, + "grad_norm": 0.8563246726989746, + "learning_rate": 7.644333511116088e-06, + "loss": 0.7715, + "step": 11791 + }, + { + "epoch": 0.6490175573779514, + "grad_norm": 0.8330580592155457, + "learning_rate": 7.643965615936619e-06, + "loss": 0.6651, + "step": 11792 + }, + { + "epoch": 0.6490725961803071, + "grad_norm": 0.6478384137153625, + "learning_rate": 7.643597700885809e-06, + "loss": 0.7063, + "step": 11793 + }, + { + "epoch": 0.6491276349826628, + "grad_norm": 0.7169124484062195, + "learning_rate": 7.643229765966428e-06, + "loss": 0.7578, + "step": 11794 + }, + { + "epoch": 0.6491826737850185, + "grad_norm": 0.726198136806488, + "learning_rate": 7.642861811181239e-06, + "loss": 0.783, + "step": 11795 + }, + { + "epoch": 0.6492377125873741, + "grad_norm": 0.7167587280273438, + "learning_rate": 7.642493836533008e-06, + "loss": 0.81, + "step": 11796 + }, + { + "epoch": 0.6492927513897297, + "grad_norm": 0.7215337157249451, + "learning_rate": 7.642125842024502e-06, + "loss": 0.8176, + "step": 11797 + }, + { + "epoch": 0.6493477901920854, + "grad_norm": 0.7041502594947815, + "learning_rate": 7.641757827658484e-06, + "loss": 0.8117, + "step": 11798 + }, + { + "epoch": 0.6494028289944411, + "grad_norm": 1.0303698778152466, + "learning_rate": 7.64138979343772e-06, + "loss": 0.781, + "step": 11799 + }, + { + "epoch": 0.6494578677967967, + "grad_norm": 0.626518189907074, + "learning_rate": 7.64102173936498e-06, + "loss": 0.6668, + "step": 11800 + }, + { + "epoch": 0.6495129065991524, + "grad_norm": 0.8889065980911255, + "learning_rate": 7.640653665443025e-06, + "loss": 0.8076, + "step": 11801 + }, + { + "epoch": 0.6495679454015081, + "grad_norm": 0.8333556652069092, + "learning_rate": 7.640285571674626e-06, + "loss": 0.8111, + "step": 11802 + }, + { + "epoch": 0.6496229842038638, + "grad_norm": 0.7248615622520447, + "learning_rate": 7.639917458062547e-06, + "loss": 0.7876, + "step": 11803 + }, + { + "epoch": 0.6496780230062194, + "grad_norm": 0.8870820999145508, + "learning_rate": 7.639549324609554e-06, + "loss": 0.8586, + "step": 11804 + }, + { + "epoch": 0.649733061808575, + "grad_norm": 0.7777245044708252, + "learning_rate": 7.639181171318417e-06, + "loss": 0.7793, + "step": 11805 + }, + { + "epoch": 0.6497881006109307, + "grad_norm": 0.7858467102050781, + "learning_rate": 7.638812998191897e-06, + "loss": 0.7842, + "step": 11806 + }, + { + "epoch": 0.6498431394132864, + "grad_norm": 0.6278610825538635, + "learning_rate": 7.638444805232769e-06, + "loss": 0.6659, + "step": 11807 + }, + { + "epoch": 0.649898178215642, + "grad_norm": 0.6758826971054077, + "learning_rate": 7.638076592443795e-06, + "loss": 0.7047, + "step": 11808 + }, + { + "epoch": 0.6499532170179977, + "grad_norm": 0.745007336139679, + "learning_rate": 7.637708359827743e-06, + "loss": 0.8557, + "step": 11809 + }, + { + "epoch": 0.6500082558203534, + "grad_norm": 0.8092321157455444, + "learning_rate": 7.63734010738738e-06, + "loss": 0.7895, + "step": 11810 + }, + { + "epoch": 0.6500632946227091, + "grad_norm": 0.7055220603942871, + "learning_rate": 7.636971835125476e-06, + "loss": 0.7678, + "step": 11811 + }, + { + "epoch": 0.6501183334250646, + "grad_norm": 0.7130264043807983, + "learning_rate": 7.636603543044797e-06, + "loss": 0.7648, + "step": 11812 + }, + { + "epoch": 0.6501733722274203, + "grad_norm": 0.7494268417358398, + "learning_rate": 7.636235231148112e-06, + "loss": 0.7883, + "step": 11813 + }, + { + "epoch": 0.650228411029776, + "grad_norm": 0.7998068332672119, + "learning_rate": 7.635866899438189e-06, + "loss": 0.7849, + "step": 11814 + }, + { + "epoch": 0.6502834498321316, + "grad_norm": 0.6749094128608704, + "learning_rate": 7.635498547917795e-06, + "loss": 0.8488, + "step": 11815 + }, + { + "epoch": 0.6503384886344873, + "grad_norm": 0.743679940700531, + "learning_rate": 7.635130176589698e-06, + "loss": 0.7562, + "step": 11816 + }, + { + "epoch": 0.650393527436843, + "grad_norm": 0.8368289470672607, + "learning_rate": 7.634761785456671e-06, + "loss": 0.7012, + "step": 11817 + }, + { + "epoch": 0.6504485662391987, + "grad_norm": 0.7214943170547485, + "learning_rate": 7.634393374521478e-06, + "loss": 0.7386, + "step": 11818 + }, + { + "epoch": 0.6505036050415542, + "grad_norm": 0.7026216387748718, + "learning_rate": 7.63402494378689e-06, + "loss": 0.7444, + "step": 11819 + }, + { + "epoch": 0.6505586438439099, + "grad_norm": 0.6271201372146606, + "learning_rate": 7.633656493255677e-06, + "loss": 0.6567, + "step": 11820 + }, + { + "epoch": 0.6506136826462656, + "grad_norm": 0.8359349370002747, + "learning_rate": 7.633288022930606e-06, + "loss": 0.7081, + "step": 11821 + }, + { + "epoch": 0.6506687214486213, + "grad_norm": 0.7009666562080383, + "learning_rate": 7.632919532814444e-06, + "loss": 0.6892, + "step": 11822 + }, + { + "epoch": 0.6507237602509769, + "grad_norm": 0.7445069551467896, + "learning_rate": 7.632551022909966e-06, + "loss": 0.7854, + "step": 11823 + }, + { + "epoch": 0.6507787990533326, + "grad_norm": 0.7204466462135315, + "learning_rate": 7.63218249321994e-06, + "loss": 0.8065, + "step": 11824 + }, + { + "epoch": 0.6508338378556883, + "grad_norm": 0.7058166265487671, + "learning_rate": 7.631813943747135e-06, + "loss": 0.6668, + "step": 11825 + }, + { + "epoch": 0.650888876658044, + "grad_norm": 0.739919126033783, + "learning_rate": 7.631445374494319e-06, + "loss": 0.8657, + "step": 11826 + }, + { + "epoch": 0.6509439154603995, + "grad_norm": 1.0444670915603638, + "learning_rate": 7.631076785464263e-06, + "loss": 0.7226, + "step": 11827 + }, + { + "epoch": 0.6509989542627552, + "grad_norm": 0.7146627306938171, + "learning_rate": 7.630708176659743e-06, + "loss": 0.7567, + "step": 11828 + }, + { + "epoch": 0.6510539930651109, + "grad_norm": 0.6981074810028076, + "learning_rate": 7.630339548083521e-06, + "loss": 0.7158, + "step": 11829 + }, + { + "epoch": 0.6511090318674666, + "grad_norm": 0.7620309591293335, + "learning_rate": 7.629970899738372e-06, + "loss": 0.811, + "step": 11830 + }, + { + "epoch": 0.6511640706698222, + "grad_norm": 0.7017341256141663, + "learning_rate": 7.629602231627066e-06, + "loss": 0.7092, + "step": 11831 + }, + { + "epoch": 0.6512191094721779, + "grad_norm": 0.733524739742279, + "learning_rate": 7.629233543752373e-06, + "loss": 0.859, + "step": 11832 + }, + { + "epoch": 0.6512741482745336, + "grad_norm": 0.7246975898742676, + "learning_rate": 7.628864836117065e-06, + "loss": 0.7732, + "step": 11833 + }, + { + "epoch": 0.6513291870768892, + "grad_norm": 0.5763251185417175, + "learning_rate": 7.628496108723911e-06, + "loss": 0.6632, + "step": 11834 + }, + { + "epoch": 0.6513842258792448, + "grad_norm": 0.6120070815086365, + "learning_rate": 7.628127361575685e-06, + "loss": 0.6809, + "step": 11835 + }, + { + "epoch": 0.6514392646816005, + "grad_norm": 0.8650742769241333, + "learning_rate": 7.627758594675157e-06, + "loss": 0.6388, + "step": 11836 + }, + { + "epoch": 0.6514943034839562, + "grad_norm": 0.8650027513504028, + "learning_rate": 7.627389808025099e-06, + "loss": 0.7622, + "step": 11837 + }, + { + "epoch": 0.6515493422863119, + "grad_norm": 0.6683071851730347, + "learning_rate": 7.627021001628283e-06, + "loss": 0.7424, + "step": 11838 + }, + { + "epoch": 0.6516043810886675, + "grad_norm": 0.6821237206459045, + "learning_rate": 7.626652175487479e-06, + "loss": 0.7844, + "step": 11839 + }, + { + "epoch": 0.6516594198910232, + "grad_norm": 0.7142770886421204, + "learning_rate": 7.626283329605462e-06, + "loss": 0.7706, + "step": 11840 + }, + { + "epoch": 0.6517144586933789, + "grad_norm": 0.7870625257492065, + "learning_rate": 7.625914463985002e-06, + "loss": 0.7673, + "step": 11841 + }, + { + "epoch": 0.6517694974957345, + "grad_norm": 0.7386491894721985, + "learning_rate": 7.62554557862887e-06, + "loss": 0.7562, + "step": 11842 + }, + { + "epoch": 0.6518245362980901, + "grad_norm": 0.6529993414878845, + "learning_rate": 7.625176673539843e-06, + "loss": 0.8258, + "step": 11843 + }, + { + "epoch": 0.6518795751004458, + "grad_norm": 0.7010294795036316, + "learning_rate": 7.6248077487206895e-06, + "loss": 0.7773, + "step": 11844 + }, + { + "epoch": 0.6519346139028015, + "grad_norm": 0.6699075698852539, + "learning_rate": 7.624438804174184e-06, + "loss": 0.7163, + "step": 11845 + }, + { + "epoch": 0.6519896527051572, + "grad_norm": 0.6600161790847778, + "learning_rate": 7.624069839903099e-06, + "loss": 0.7355, + "step": 11846 + }, + { + "epoch": 0.6520446915075128, + "grad_norm": 0.6556873321533203, + "learning_rate": 7.623700855910205e-06, + "loss": 0.627, + "step": 11847 + }, + { + "epoch": 0.6520997303098685, + "grad_norm": 0.6867008805274963, + "learning_rate": 7.623331852198281e-06, + "loss": 0.8228, + "step": 11848 + }, + { + "epoch": 0.6521547691122241, + "grad_norm": 0.6885474324226379, + "learning_rate": 7.622962828770095e-06, + "loss": 0.6804, + "step": 11849 + }, + { + "epoch": 0.6522098079145798, + "grad_norm": 0.6903913021087646, + "learning_rate": 7.622593785628425e-06, + "loss": 0.6553, + "step": 11850 + }, + { + "epoch": 0.6522648467169354, + "grad_norm": 0.6581684947013855, + "learning_rate": 7.622224722776039e-06, + "loss": 0.7102, + "step": 11851 + }, + { + "epoch": 0.6523198855192911, + "grad_norm": 0.8261715769767761, + "learning_rate": 7.621855640215716e-06, + "loss": 0.676, + "step": 11852 + }, + { + "epoch": 0.6523749243216468, + "grad_norm": 0.6238247752189636, + "learning_rate": 7.6214865379502265e-06, + "loss": 0.7065, + "step": 11853 + }, + { + "epoch": 0.6524299631240025, + "grad_norm": 0.7350416779518127, + "learning_rate": 7.621117415982346e-06, + "loss": 0.7512, + "step": 11854 + }, + { + "epoch": 0.652485001926358, + "grad_norm": 0.7337208390235901, + "learning_rate": 7.620748274314851e-06, + "loss": 0.7593, + "step": 11855 + }, + { + "epoch": 0.6525400407287137, + "grad_norm": 0.6568214297294617, + "learning_rate": 7.620379112950511e-06, + "loss": 0.7363, + "step": 11856 + }, + { + "epoch": 0.6525950795310694, + "grad_norm": 0.7099055647850037, + "learning_rate": 7.620009931892105e-06, + "loss": 0.6631, + "step": 11857 + }, + { + "epoch": 0.652650118333425, + "grad_norm": 0.6563010215759277, + "learning_rate": 7.6196407311424035e-06, + "loss": 0.6617, + "step": 11858 + }, + { + "epoch": 0.6527051571357807, + "grad_norm": 0.6664251685142517, + "learning_rate": 7.6192715107041845e-06, + "loss": 0.7898, + "step": 11859 + }, + { + "epoch": 0.6527601959381364, + "grad_norm": 0.6524507403373718, + "learning_rate": 7.618902270580222e-06, + "loss": 0.767, + "step": 11860 + }, + { + "epoch": 0.6528152347404921, + "grad_norm": 0.7391313910484314, + "learning_rate": 7.61853301077329e-06, + "loss": 0.6015, + "step": 11861 + }, + { + "epoch": 0.6528702735428477, + "grad_norm": 0.7691878080368042, + "learning_rate": 7.618163731286167e-06, + "loss": 0.718, + "step": 11862 + }, + { + "epoch": 0.6529253123452033, + "grad_norm": 0.6524633765220642, + "learning_rate": 7.617794432121625e-06, + "loss": 0.6841, + "step": 11863 + }, + { + "epoch": 0.652980351147559, + "grad_norm": 0.7125405073165894, + "learning_rate": 7.61742511328244e-06, + "loss": 0.7654, + "step": 11864 + }, + { + "epoch": 0.6530353899499147, + "grad_norm": 0.7123568058013916, + "learning_rate": 7.617055774771389e-06, + "loss": 0.7189, + "step": 11865 + }, + { + "epoch": 0.6530904287522703, + "grad_norm": 0.6968240141868591, + "learning_rate": 7.616686416591248e-06, + "loss": 0.7201, + "step": 11866 + }, + { + "epoch": 0.653145467554626, + "grad_norm": 0.7208551168441772, + "learning_rate": 7.616317038744792e-06, + "loss": 0.6644, + "step": 11867 + }, + { + "epoch": 0.6532005063569817, + "grad_norm": 0.7320911884307861, + "learning_rate": 7.615947641234798e-06, + "loss": 0.7118, + "step": 11868 + }, + { + "epoch": 0.6532555451593374, + "grad_norm": 0.7762041687965393, + "learning_rate": 7.615578224064041e-06, + "loss": 0.7501, + "step": 11869 + }, + { + "epoch": 0.653310583961693, + "grad_norm": 0.7455989718437195, + "learning_rate": 7.6152087872352975e-06, + "loss": 0.8058, + "step": 11870 + }, + { + "epoch": 0.6533656227640486, + "grad_norm": 0.736044704914093, + "learning_rate": 7.614839330751347e-06, + "loss": 0.727, + "step": 11871 + }, + { + "epoch": 0.6534206615664043, + "grad_norm": 0.680171012878418, + "learning_rate": 7.614469854614961e-06, + "loss": 0.6722, + "step": 11872 + }, + { + "epoch": 0.65347570036876, + "grad_norm": 0.7598134279251099, + "learning_rate": 7.614100358828922e-06, + "loss": 0.7472, + "step": 11873 + }, + { + "epoch": 0.6535307391711156, + "grad_norm": 0.8288099765777588, + "learning_rate": 7.613730843396003e-06, + "loss": 0.7493, + "step": 11874 + }, + { + "epoch": 0.6535857779734713, + "grad_norm": 0.6436724066734314, + "learning_rate": 7.613361308318984e-06, + "loss": 0.7103, + "step": 11875 + }, + { + "epoch": 0.653640816775827, + "grad_norm": 0.671334981918335, + "learning_rate": 7.612991753600639e-06, + "loss": 0.6949, + "step": 11876 + }, + { + "epoch": 0.6536958555781827, + "grad_norm": 0.6019170880317688, + "learning_rate": 7.61262217924375e-06, + "loss": 0.6116, + "step": 11877 + }, + { + "epoch": 0.6537508943805382, + "grad_norm": 1.4682546854019165, + "learning_rate": 7.61225258525109e-06, + "loss": 0.9343, + "step": 11878 + }, + { + "epoch": 0.6538059331828939, + "grad_norm": 0.656822681427002, + "learning_rate": 7.611882971625439e-06, + "loss": 0.7357, + "step": 11879 + }, + { + "epoch": 0.6538609719852496, + "grad_norm": 0.635734498500824, + "learning_rate": 7.611513338369576e-06, + "loss": 0.6263, + "step": 11880 + }, + { + "epoch": 0.6539160107876053, + "grad_norm": 0.7123430967330933, + "learning_rate": 7.611143685486277e-06, + "loss": 0.8446, + "step": 11881 + }, + { + "epoch": 0.6539710495899609, + "grad_norm": 0.7597065567970276, + "learning_rate": 7.610774012978322e-06, + "loss": 0.7449, + "step": 11882 + }, + { + "epoch": 0.6540260883923166, + "grad_norm": 0.7555896043777466, + "learning_rate": 7.610404320848486e-06, + "loss": 0.7575, + "step": 11883 + }, + { + "epoch": 0.6540811271946723, + "grad_norm": 0.7572906613349915, + "learning_rate": 7.6100346090995506e-06, + "loss": 0.7547, + "step": 11884 + }, + { + "epoch": 0.654136165997028, + "grad_norm": 0.6663275957107544, + "learning_rate": 7.609664877734295e-06, + "loss": 0.7038, + "step": 11885 + }, + { + "epoch": 0.6541912047993835, + "grad_norm": 0.7346611618995667, + "learning_rate": 7.609295126755496e-06, + "loss": 0.7902, + "step": 11886 + }, + { + "epoch": 0.6542462436017392, + "grad_norm": 0.6846545338630676, + "learning_rate": 7.608925356165934e-06, + "loss": 0.7334, + "step": 11887 + }, + { + "epoch": 0.6543012824040949, + "grad_norm": 0.6714815497398376, + "learning_rate": 7.608555565968385e-06, + "loss": 0.7204, + "step": 11888 + }, + { + "epoch": 0.6543563212064506, + "grad_norm": 0.805095374584198, + "learning_rate": 7.608185756165634e-06, + "loss": 0.8521, + "step": 11889 + }, + { + "epoch": 0.6544113600088062, + "grad_norm": 0.8415316343307495, + "learning_rate": 7.607815926760456e-06, + "loss": 0.7076, + "step": 11890 + }, + { + "epoch": 0.6544663988111619, + "grad_norm": 0.7665743231773376, + "learning_rate": 7.607446077755632e-06, + "loss": 0.8072, + "step": 11891 + }, + { + "epoch": 0.6545214376135176, + "grad_norm": 0.6705248355865479, + "learning_rate": 7.607076209153939e-06, + "loss": 0.6607, + "step": 11892 + }, + { + "epoch": 0.6545764764158732, + "grad_norm": 0.6791796684265137, + "learning_rate": 7.606706320958159e-06, + "loss": 0.773, + "step": 11893 + }, + { + "epoch": 0.6546315152182288, + "grad_norm": 0.8177357316017151, + "learning_rate": 7.606336413171075e-06, + "loss": 0.8114, + "step": 11894 + }, + { + "epoch": 0.6546865540205845, + "grad_norm": 0.9491637945175171, + "learning_rate": 7.605966485795462e-06, + "loss": 0.7424, + "step": 11895 + }, + { + "epoch": 0.6547415928229402, + "grad_norm": 0.7326256036758423, + "learning_rate": 7.605596538834103e-06, + "loss": 0.8176, + "step": 11896 + }, + { + "epoch": 0.6547966316252959, + "grad_norm": 0.6081808805465698, + "learning_rate": 7.6052265722897775e-06, + "loss": 0.6827, + "step": 11897 + }, + { + "epoch": 0.6548516704276515, + "grad_norm": 0.7165681719779968, + "learning_rate": 7.604856586165268e-06, + "loss": 0.7854, + "step": 11898 + }, + { + "epoch": 0.6549067092300072, + "grad_norm": 0.8777725100517273, + "learning_rate": 7.604486580463353e-06, + "loss": 0.8084, + "step": 11899 + }, + { + "epoch": 0.6549617480323628, + "grad_norm": 0.6814439296722412, + "learning_rate": 7.604116555186811e-06, + "loss": 0.6869, + "step": 11900 + }, + { + "epoch": 0.6550167868347184, + "grad_norm": 0.7060914635658264, + "learning_rate": 7.60374651033843e-06, + "loss": 0.7066, + "step": 11901 + }, + { + "epoch": 0.6550718256370741, + "grad_norm": 0.6823089718818665, + "learning_rate": 7.603376445920987e-06, + "loss": 0.6095, + "step": 11902 + }, + { + "epoch": 0.6551268644394298, + "grad_norm": 0.7099863290786743, + "learning_rate": 7.603006361937262e-06, + "loss": 0.8037, + "step": 11903 + }, + { + "epoch": 0.6551819032417855, + "grad_norm": 0.6479066610336304, + "learning_rate": 7.602636258390037e-06, + "loss": 0.6844, + "step": 11904 + }, + { + "epoch": 0.6552369420441411, + "grad_norm": 0.6663268804550171, + "learning_rate": 7.602266135282097e-06, + "loss": 0.735, + "step": 11905 + }, + { + "epoch": 0.6552919808464968, + "grad_norm": 0.8670598268508911, + "learning_rate": 7.60189599261622e-06, + "loss": 0.779, + "step": 11906 + }, + { + "epoch": 0.6553470196488524, + "grad_norm": 0.607631504535675, + "learning_rate": 7.601525830395189e-06, + "loss": 0.6288, + "step": 11907 + }, + { + "epoch": 0.6554020584512081, + "grad_norm": 0.9054927229881287, + "learning_rate": 7.601155648621786e-06, + "loss": 0.8562, + "step": 11908 + }, + { + "epoch": 0.6554570972535637, + "grad_norm": 0.8069004416465759, + "learning_rate": 7.6007854472987955e-06, + "loss": 0.88, + "step": 11909 + }, + { + "epoch": 0.6555121360559194, + "grad_norm": 0.6393092274665833, + "learning_rate": 7.600415226428995e-06, + "loss": 0.6908, + "step": 11910 + }, + { + "epoch": 0.6555671748582751, + "grad_norm": 0.7533125281333923, + "learning_rate": 7.600044986015172e-06, + "loss": 0.8061, + "step": 11911 + }, + { + "epoch": 0.6556222136606308, + "grad_norm": 0.6859326958656311, + "learning_rate": 7.599674726060105e-06, + "loss": 0.7603, + "step": 11912 + }, + { + "epoch": 0.6556772524629864, + "grad_norm": 0.7284619808197021, + "learning_rate": 7.59930444656658e-06, + "loss": 0.7698, + "step": 11913 + }, + { + "epoch": 0.655732291265342, + "grad_norm": 1.074234127998352, + "learning_rate": 7.598934147537378e-06, + "loss": 0.8252, + "step": 11914 + }, + { + "epoch": 0.6557873300676977, + "grad_norm": 0.6899133920669556, + "learning_rate": 7.598563828975283e-06, + "loss": 0.6023, + "step": 11915 + }, + { + "epoch": 0.6558423688700534, + "grad_norm": 0.6736464500427246, + "learning_rate": 7.598193490883077e-06, + "loss": 0.788, + "step": 11916 + }, + { + "epoch": 0.655897407672409, + "grad_norm": 0.7646307349205017, + "learning_rate": 7.597823133263545e-06, + "loss": 0.7607, + "step": 11917 + }, + { + "epoch": 0.6559524464747647, + "grad_norm": 0.6413717865943909, + "learning_rate": 7.59745275611947e-06, + "loss": 0.6415, + "step": 11918 + }, + { + "epoch": 0.6560074852771204, + "grad_norm": 0.6605532169342041, + "learning_rate": 7.597082359453636e-06, + "loss": 0.6655, + "step": 11919 + }, + { + "epoch": 0.6560625240794761, + "grad_norm": 0.6573199033737183, + "learning_rate": 7.596711943268824e-06, + "loss": 0.624, + "step": 11920 + }, + { + "epoch": 0.6561175628818317, + "grad_norm": 0.8312102556228638, + "learning_rate": 7.596341507567822e-06, + "loss": 0.6803, + "step": 11921 + }, + { + "epoch": 0.6561726016841873, + "grad_norm": 0.6915873289108276, + "learning_rate": 7.59597105235341e-06, + "loss": 0.6897, + "step": 11922 + }, + { + "epoch": 0.656227640486543, + "grad_norm": 0.6916965842247009, + "learning_rate": 7.595600577628377e-06, + "loss": 0.7154, + "step": 11923 + }, + { + "epoch": 0.6562826792888987, + "grad_norm": 0.6712722182273865, + "learning_rate": 7.595230083395501e-06, + "loss": 0.7236, + "step": 11924 + }, + { + "epoch": 0.6563377180912543, + "grad_norm": 0.6514019966125488, + "learning_rate": 7.594859569657575e-06, + "loss": 0.6895, + "step": 11925 + }, + { + "epoch": 0.65639275689361, + "grad_norm": 0.7300555109977722, + "learning_rate": 7.594489036417378e-06, + "loss": 0.7563, + "step": 11926 + }, + { + "epoch": 0.6564477956959657, + "grad_norm": 0.8076907396316528, + "learning_rate": 7.594118483677695e-06, + "loss": 0.8883, + "step": 11927 + }, + { + "epoch": 0.6565028344983214, + "grad_norm": 0.666466236114502, + "learning_rate": 7.5937479114413114e-06, + "loss": 0.7641, + "step": 11928 + }, + { + "epoch": 0.6565578733006769, + "grad_norm": 0.6621832251548767, + "learning_rate": 7.593377319711013e-06, + "loss": 0.6687, + "step": 11929 + }, + { + "epoch": 0.6566129121030326, + "grad_norm": 0.8757139444351196, + "learning_rate": 7.593006708489585e-06, + "loss": 0.7746, + "step": 11930 + }, + { + "epoch": 0.6566679509053883, + "grad_norm": 0.646801769733429, + "learning_rate": 7.5926360777798135e-06, + "loss": 0.6884, + "step": 11931 + }, + { + "epoch": 0.656722989707744, + "grad_norm": 0.6703395843505859, + "learning_rate": 7.592265427584482e-06, + "loss": 0.6822, + "step": 11932 + }, + { + "epoch": 0.6567780285100996, + "grad_norm": 0.7653201222419739, + "learning_rate": 7.591894757906378e-06, + "loss": 0.7999, + "step": 11933 + }, + { + "epoch": 0.6568330673124553, + "grad_norm": 0.6921548247337341, + "learning_rate": 7.591524068748288e-06, + "loss": 0.7177, + "step": 11934 + }, + { + "epoch": 0.656888106114811, + "grad_norm": 0.7085320353507996, + "learning_rate": 7.591153360112995e-06, + "loss": 0.8395, + "step": 11935 + }, + { + "epoch": 0.6569431449171667, + "grad_norm": 0.6565294861793518, + "learning_rate": 7.590782632003287e-06, + "loss": 0.6969, + "step": 11936 + }, + { + "epoch": 0.6569981837195222, + "grad_norm": 0.7023206353187561, + "learning_rate": 7.590411884421952e-06, + "loss": 0.7321, + "step": 11937 + }, + { + "epoch": 0.6570532225218779, + "grad_norm": 0.7848044633865356, + "learning_rate": 7.590041117371774e-06, + "loss": 0.8857, + "step": 11938 + }, + { + "epoch": 0.6571082613242336, + "grad_norm": 1.004591703414917, + "learning_rate": 7.589670330855541e-06, + "loss": 0.8267, + "step": 11939 + }, + { + "epoch": 0.6571633001265893, + "grad_norm": 0.7525139451026917, + "learning_rate": 7.589299524876036e-06, + "loss": 0.6857, + "step": 11940 + }, + { + "epoch": 0.6572183389289449, + "grad_norm": 0.746224582195282, + "learning_rate": 7.588928699436051e-06, + "loss": 0.805, + "step": 11941 + }, + { + "epoch": 0.6572733777313006, + "grad_norm": 0.6304495930671692, + "learning_rate": 7.588557854538371e-06, + "loss": 0.652, + "step": 11942 + }, + { + "epoch": 0.6573284165336563, + "grad_norm": 0.761688768863678, + "learning_rate": 7.588186990185783e-06, + "loss": 0.7954, + "step": 11943 + }, + { + "epoch": 0.6573834553360118, + "grad_norm": 0.7735103368759155, + "learning_rate": 7.587816106381073e-06, + "loss": 0.7584, + "step": 11944 + }, + { + "epoch": 0.6574384941383675, + "grad_norm": 0.7351566553115845, + "learning_rate": 7.5874452031270305e-06, + "loss": 0.7984, + "step": 11945 + }, + { + "epoch": 0.6574935329407232, + "grad_norm": 0.7054993510246277, + "learning_rate": 7.587074280426443e-06, + "loss": 0.7057, + "step": 11946 + }, + { + "epoch": 0.6575485717430789, + "grad_norm": 0.7444368004798889, + "learning_rate": 7.586703338282099e-06, + "loss": 0.7476, + "step": 11947 + }, + { + "epoch": 0.6576036105454345, + "grad_norm": 0.6944568157196045, + "learning_rate": 7.586332376696782e-06, + "loss": 0.6874, + "step": 11948 + }, + { + "epoch": 0.6576586493477902, + "grad_norm": 0.6595578193664551, + "learning_rate": 7.585961395673287e-06, + "loss": 0.7541, + "step": 11949 + }, + { + "epoch": 0.6577136881501459, + "grad_norm": 0.6669502258300781, + "learning_rate": 7.585590395214396e-06, + "loss": 0.7515, + "step": 11950 + }, + { + "epoch": 0.6577687269525015, + "grad_norm": 0.7254583835601807, + "learning_rate": 7.585219375322901e-06, + "loss": 0.8089, + "step": 11951 + }, + { + "epoch": 0.6578237657548571, + "grad_norm": 1.0479141473770142, + "learning_rate": 7.584848336001587e-06, + "loss": 0.8108, + "step": 11952 + }, + { + "epoch": 0.6578788045572128, + "grad_norm": 0.6928718686103821, + "learning_rate": 7.584477277253246e-06, + "loss": 0.6325, + "step": 11953 + }, + { + "epoch": 0.6579338433595685, + "grad_norm": 0.8926869630813599, + "learning_rate": 7.584106199080666e-06, + "loss": 0.7294, + "step": 11954 + }, + { + "epoch": 0.6579888821619242, + "grad_norm": 0.7209964394569397, + "learning_rate": 7.583735101486635e-06, + "loss": 0.7646, + "step": 11955 + }, + { + "epoch": 0.6580439209642798, + "grad_norm": 0.7619316577911377, + "learning_rate": 7.583363984473941e-06, + "loss": 0.7756, + "step": 11956 + }, + { + "epoch": 0.6580989597666355, + "grad_norm": 0.6974903345108032, + "learning_rate": 7.582992848045378e-06, + "loss": 0.6497, + "step": 11957 + }, + { + "epoch": 0.6581539985689911, + "grad_norm": 0.8338617086410522, + "learning_rate": 7.582621692203731e-06, + "loss": 0.6619, + "step": 11958 + }, + { + "epoch": 0.6582090373713468, + "grad_norm": 0.9330396056175232, + "learning_rate": 7.5822505169517905e-06, + "loss": 0.8219, + "step": 11959 + }, + { + "epoch": 0.6582640761737024, + "grad_norm": 0.7725355625152588, + "learning_rate": 7.5818793222923445e-06, + "loss": 0.7262, + "step": 11960 + }, + { + "epoch": 0.6583191149760581, + "grad_norm": 0.7049654722213745, + "learning_rate": 7.5815081082281885e-06, + "loss": 0.7917, + "step": 11961 + }, + { + "epoch": 0.6583741537784138, + "grad_norm": 0.6801711916923523, + "learning_rate": 7.581136874762105e-06, + "loss": 0.6984, + "step": 11962 + }, + { + "epoch": 0.6584291925807695, + "grad_norm": 0.7774253487586975, + "learning_rate": 7.58076562189689e-06, + "loss": 0.7615, + "step": 11963 + }, + { + "epoch": 0.6584842313831251, + "grad_norm": 0.7436443567276001, + "learning_rate": 7.58039434963533e-06, + "loss": 0.7419, + "step": 11964 + }, + { + "epoch": 0.6585392701854808, + "grad_norm": 0.6857719421386719, + "learning_rate": 7.580023057980217e-06, + "loss": 0.8009, + "step": 11965 + }, + { + "epoch": 0.6585943089878364, + "grad_norm": 0.7194758653640747, + "learning_rate": 7.579651746934342e-06, + "loss": 0.7338, + "step": 11966 + }, + { + "epoch": 0.6586493477901921, + "grad_norm": 0.7248701453208923, + "learning_rate": 7.579280416500495e-06, + "loss": 0.6972, + "step": 11967 + }, + { + "epoch": 0.6587043865925477, + "grad_norm": 0.6719415783882141, + "learning_rate": 7.578909066681466e-06, + "loss": 0.7552, + "step": 11968 + }, + { + "epoch": 0.6587594253949034, + "grad_norm": 0.728338897228241, + "learning_rate": 7.578537697480046e-06, + "loss": 0.8386, + "step": 11969 + }, + { + "epoch": 0.6588144641972591, + "grad_norm": 0.7151786684989929, + "learning_rate": 7.578166308899029e-06, + "loss": 0.7186, + "step": 11970 + }, + { + "epoch": 0.6588695029996148, + "grad_norm": 0.664412260055542, + "learning_rate": 7.577794900941205e-06, + "loss": 0.6672, + "step": 11971 + }, + { + "epoch": 0.6589245418019704, + "grad_norm": 0.6915827989578247, + "learning_rate": 7.577423473609361e-06, + "loss": 0.7427, + "step": 11972 + }, + { + "epoch": 0.658979580604326, + "grad_norm": 0.705243706703186, + "learning_rate": 7.577052026906295e-06, + "loss": 0.7526, + "step": 11973 + }, + { + "epoch": 0.6590346194066817, + "grad_norm": 0.6559640169143677, + "learning_rate": 7.576680560834795e-06, + "loss": 0.8187, + "step": 11974 + }, + { + "epoch": 0.6590896582090374, + "grad_norm": 0.7359572649002075, + "learning_rate": 7.576309075397653e-06, + "loss": 0.8127, + "step": 11975 + }, + { + "epoch": 0.659144697011393, + "grad_norm": 0.6581039428710938, + "learning_rate": 7.575937570597661e-06, + "loss": 0.7066, + "step": 11976 + }, + { + "epoch": 0.6591997358137487, + "grad_norm": 0.8360844254493713, + "learning_rate": 7.5755660464376134e-06, + "loss": 0.7998, + "step": 11977 + }, + { + "epoch": 0.6592547746161044, + "grad_norm": 0.7201453447341919, + "learning_rate": 7.5751945029203e-06, + "loss": 0.7884, + "step": 11978 + }, + { + "epoch": 0.6593098134184601, + "grad_norm": 0.6985270977020264, + "learning_rate": 7.574822940048514e-06, + "loss": 0.7268, + "step": 11979 + }, + { + "epoch": 0.6593648522208156, + "grad_norm": 0.6405925154685974, + "learning_rate": 7.574451357825048e-06, + "loss": 0.6848, + "step": 11980 + }, + { + "epoch": 0.6594198910231713, + "grad_norm": 0.6656618714332581, + "learning_rate": 7.574079756252694e-06, + "loss": 0.7755, + "step": 11981 + }, + { + "epoch": 0.659474929825527, + "grad_norm": 0.8461045622825623, + "learning_rate": 7.573708135334248e-06, + "loss": 0.7171, + "step": 11982 + }, + { + "epoch": 0.6595299686278827, + "grad_norm": 0.5527384877204895, + "learning_rate": 7.573336495072498e-06, + "loss": 0.6668, + "step": 11983 + }, + { + "epoch": 0.6595850074302383, + "grad_norm": 0.6703749299049377, + "learning_rate": 7.572964835470241e-06, + "loss": 0.7128, + "step": 11984 + }, + { + "epoch": 0.659640046232594, + "grad_norm": 0.6824783682823181, + "learning_rate": 7.57259315653027e-06, + "loss": 0.8007, + "step": 11985 + }, + { + "epoch": 0.6596950850349497, + "grad_norm": 0.7369599938392639, + "learning_rate": 7.572221458255377e-06, + "loss": 0.7507, + "step": 11986 + }, + { + "epoch": 0.6597501238373052, + "grad_norm": 0.6976807713508606, + "learning_rate": 7.571849740648356e-06, + "loss": 0.7787, + "step": 11987 + }, + { + "epoch": 0.6598051626396609, + "grad_norm": 0.6735848784446716, + "learning_rate": 7.571478003711998e-06, + "loss": 0.6791, + "step": 11988 + }, + { + "epoch": 0.6598602014420166, + "grad_norm": 0.7245956659317017, + "learning_rate": 7.5711062474491025e-06, + "loss": 0.7999, + "step": 11989 + }, + { + "epoch": 0.6599152402443723, + "grad_norm": 0.760748565196991, + "learning_rate": 7.5707344718624595e-06, + "loss": 0.7904, + "step": 11990 + }, + { + "epoch": 0.6599702790467279, + "grad_norm": 0.6745715141296387, + "learning_rate": 7.5703626769548654e-06, + "loss": 0.6938, + "step": 11991 + }, + { + "epoch": 0.6600253178490836, + "grad_norm": 0.7301452159881592, + "learning_rate": 7.569990862729113e-06, + "loss": 0.7546, + "step": 11992 + }, + { + "epoch": 0.6600803566514393, + "grad_norm": 0.68801349401474, + "learning_rate": 7.569619029187998e-06, + "loss": 0.7592, + "step": 11993 + }, + { + "epoch": 0.660135395453795, + "grad_norm": 0.6839548349380493, + "learning_rate": 7.569247176334313e-06, + "loss": 0.7139, + "step": 11994 + }, + { + "epoch": 0.6601904342561505, + "grad_norm": 0.7490861415863037, + "learning_rate": 7.568875304170854e-06, + "loss": 0.7939, + "step": 11995 + }, + { + "epoch": 0.6602454730585062, + "grad_norm": 0.7098836302757263, + "learning_rate": 7.568503412700416e-06, + "loss": 0.7824, + "step": 11996 + }, + { + "epoch": 0.6603005118608619, + "grad_norm": 0.7427988052368164, + "learning_rate": 7.568131501925795e-06, + "loss": 0.7492, + "step": 11997 + }, + { + "epoch": 0.6603555506632176, + "grad_norm": 0.6715356111526489, + "learning_rate": 7.567759571849784e-06, + "loss": 0.6444, + "step": 11998 + }, + { + "epoch": 0.6604105894655732, + "grad_norm": 0.6697829961776733, + "learning_rate": 7.5673876224751795e-06, + "loss": 0.7064, + "step": 11999 + }, + { + "epoch": 0.6604656282679289, + "grad_norm": 0.6778494119644165, + "learning_rate": 7.567015653804777e-06, + "loss": 0.7517, + "step": 12000 + }, + { + "epoch": 0.6605206670702846, + "grad_norm": 0.6423540711402893, + "learning_rate": 7.566643665841371e-06, + "loss": 0.6321, + "step": 12001 + }, + { + "epoch": 0.6605757058726403, + "grad_norm": 0.6874244213104248, + "learning_rate": 7.566271658587761e-06, + "loss": 0.762, + "step": 12002 + }, + { + "epoch": 0.6606307446749958, + "grad_norm": 0.6805301308631897, + "learning_rate": 7.565899632046737e-06, + "loss": 0.765, + "step": 12003 + }, + { + "epoch": 0.6606857834773515, + "grad_norm": 0.7039558291435242, + "learning_rate": 7.5655275862211e-06, + "loss": 0.728, + "step": 12004 + }, + { + "epoch": 0.6607408222797072, + "grad_norm": 0.6513119339942932, + "learning_rate": 7.565155521113643e-06, + "loss": 0.7711, + "step": 12005 + }, + { + "epoch": 0.6607958610820629, + "grad_norm": 0.6483618021011353, + "learning_rate": 7.5647834367271655e-06, + "loss": 0.7015, + "step": 12006 + }, + { + "epoch": 0.6608508998844185, + "grad_norm": 0.7180553674697876, + "learning_rate": 7.564411333064461e-06, + "loss": 0.812, + "step": 12007 + }, + { + "epoch": 0.6609059386867742, + "grad_norm": 0.9036096334457397, + "learning_rate": 7.5640392101283285e-06, + "loss": 0.7858, + "step": 12008 + }, + { + "epoch": 0.6609609774891299, + "grad_norm": 0.7380802035331726, + "learning_rate": 7.563667067921563e-06, + "loss": 0.6615, + "step": 12009 + }, + { + "epoch": 0.6610160162914855, + "grad_norm": 0.6830628514289856, + "learning_rate": 7.5632949064469615e-06, + "loss": 0.7465, + "step": 12010 + }, + { + "epoch": 0.6610710550938411, + "grad_norm": 0.7562816143035889, + "learning_rate": 7.562922725707323e-06, + "loss": 0.8559, + "step": 12011 + }, + { + "epoch": 0.6611260938961968, + "grad_norm": 0.7376649379730225, + "learning_rate": 7.562550525705442e-06, + "loss": 0.7769, + "step": 12012 + }, + { + "epoch": 0.6611811326985525, + "grad_norm": 0.715466320514679, + "learning_rate": 7.562178306444116e-06, + "loss": 0.8233, + "step": 12013 + }, + { + "epoch": 0.6612361715009082, + "grad_norm": 0.6714800596237183, + "learning_rate": 7.561806067926147e-06, + "loss": 0.6025, + "step": 12014 + }, + { + "epoch": 0.6612912103032638, + "grad_norm": 0.7083391547203064, + "learning_rate": 7.561433810154328e-06, + "loss": 0.7063, + "step": 12015 + }, + { + "epoch": 0.6613462491056195, + "grad_norm": 0.8062768578529358, + "learning_rate": 7.561061533131457e-06, + "loss": 0.7992, + "step": 12016 + }, + { + "epoch": 0.6614012879079751, + "grad_norm": 0.741889476776123, + "learning_rate": 7.560689236860334e-06, + "loss": 0.8149, + "step": 12017 + }, + { + "epoch": 0.6614563267103308, + "grad_norm": 0.6834374666213989, + "learning_rate": 7.560316921343756e-06, + "loss": 0.782, + "step": 12018 + }, + { + "epoch": 0.6615113655126864, + "grad_norm": 0.7469872236251831, + "learning_rate": 7.559944586584522e-06, + "loss": 0.759, + "step": 12019 + }, + { + "epoch": 0.6615664043150421, + "grad_norm": 0.8300836086273193, + "learning_rate": 7.559572232585428e-06, + "loss": 0.8637, + "step": 12020 + }, + { + "epoch": 0.6616214431173978, + "grad_norm": 0.6241582632064819, + "learning_rate": 7.559199859349276e-06, + "loss": 0.7134, + "step": 12021 + }, + { + "epoch": 0.6616764819197535, + "grad_norm": 0.6696488261222839, + "learning_rate": 7.5588274668788634e-06, + "loss": 0.7457, + "step": 12022 + }, + { + "epoch": 0.6617315207221091, + "grad_norm": 0.7090815305709839, + "learning_rate": 7.558455055176987e-06, + "loss": 0.7449, + "step": 12023 + }, + { + "epoch": 0.6617865595244647, + "grad_norm": 0.6925215125083923, + "learning_rate": 7.558082624246448e-06, + "loss": 0.758, + "step": 12024 + }, + { + "epoch": 0.6618415983268204, + "grad_norm": 0.6658454537391663, + "learning_rate": 7.5577101740900425e-06, + "loss": 0.6918, + "step": 12025 + }, + { + "epoch": 0.6618966371291761, + "grad_norm": 0.6646405458450317, + "learning_rate": 7.557337704710574e-06, + "loss": 0.7293, + "step": 12026 + }, + { + "epoch": 0.6619516759315317, + "grad_norm": 0.6630399227142334, + "learning_rate": 7.556965216110841e-06, + "loss": 0.7572, + "step": 12027 + }, + { + "epoch": 0.6620067147338874, + "grad_norm": 0.7333918809890747, + "learning_rate": 7.556592708293641e-06, + "loss": 0.8012, + "step": 12028 + }, + { + "epoch": 0.6620617535362431, + "grad_norm": 0.7399254441261292, + "learning_rate": 7.556220181261773e-06, + "loss": 0.8406, + "step": 12029 + }, + { + "epoch": 0.6621167923385987, + "grad_norm": 0.6244909167289734, + "learning_rate": 7.55584763501804e-06, + "loss": 0.7427, + "step": 12030 + }, + { + "epoch": 0.6621718311409543, + "grad_norm": 0.6991485953330994, + "learning_rate": 7.55547506956524e-06, + "loss": 0.7583, + "step": 12031 + }, + { + "epoch": 0.66222686994331, + "grad_norm": 0.7115411162376404, + "learning_rate": 7.555102484906174e-06, + "loss": 0.7951, + "step": 12032 + }, + { + "epoch": 0.6622819087456657, + "grad_norm": 0.7684284448623657, + "learning_rate": 7.554729881043641e-06, + "loss": 0.717, + "step": 12033 + }, + { + "epoch": 0.6623369475480213, + "grad_norm": 0.7705931067466736, + "learning_rate": 7.554357257980443e-06, + "loss": 0.6903, + "step": 12034 + }, + { + "epoch": 0.662391986350377, + "grad_norm": 0.9283333420753479, + "learning_rate": 7.553984615719379e-06, + "loss": 0.7845, + "step": 12035 + }, + { + "epoch": 0.6624470251527327, + "grad_norm": 0.6867572665214539, + "learning_rate": 7.553611954263249e-06, + "loss": 0.8796, + "step": 12036 + }, + { + "epoch": 0.6625020639550884, + "grad_norm": 0.6129451990127563, + "learning_rate": 7.553239273614855e-06, + "loss": 0.6308, + "step": 12037 + }, + { + "epoch": 0.662557102757444, + "grad_norm": 0.749679446220398, + "learning_rate": 7.552866573777e-06, + "loss": 0.8308, + "step": 12038 + }, + { + "epoch": 0.6626121415597996, + "grad_norm": 0.7651422619819641, + "learning_rate": 7.552493854752483e-06, + "loss": 0.7266, + "step": 12039 + }, + { + "epoch": 0.6626671803621553, + "grad_norm": 0.9293195009231567, + "learning_rate": 7.552121116544104e-06, + "loss": 0.7795, + "step": 12040 + }, + { + "epoch": 0.662722219164511, + "grad_norm": 0.7321802973747253, + "learning_rate": 7.5517483591546655e-06, + "loss": 0.7294, + "step": 12041 + }, + { + "epoch": 0.6627772579668666, + "grad_norm": 0.702414333820343, + "learning_rate": 7.551375582586971e-06, + "loss": 0.7954, + "step": 12042 + }, + { + "epoch": 0.6628322967692223, + "grad_norm": 0.7497946619987488, + "learning_rate": 7.551002786843819e-06, + "loss": 0.7654, + "step": 12043 + }, + { + "epoch": 0.662887335571578, + "grad_norm": 0.6125331521034241, + "learning_rate": 7.550629971928017e-06, + "loss": 0.7299, + "step": 12044 + }, + { + "epoch": 0.6629423743739337, + "grad_norm": 0.7252177596092224, + "learning_rate": 7.550257137842358e-06, + "loss": 0.7553, + "step": 12045 + }, + { + "epoch": 0.6629974131762892, + "grad_norm": 0.6463978886604309, + "learning_rate": 7.5498842845896515e-06, + "loss": 0.7114, + "step": 12046 + }, + { + "epoch": 0.6630524519786449, + "grad_norm": 0.7392497062683105, + "learning_rate": 7.549511412172696e-06, + "loss": 0.6801, + "step": 12047 + }, + { + "epoch": 0.6631074907810006, + "grad_norm": 0.8068972229957581, + "learning_rate": 7.549138520594297e-06, + "loss": 0.8207, + "step": 12048 + }, + { + "epoch": 0.6631625295833563, + "grad_norm": 0.7632858753204346, + "learning_rate": 7.548765609857254e-06, + "loss": 0.7095, + "step": 12049 + }, + { + "epoch": 0.6632175683857119, + "grad_norm": 0.7252069115638733, + "learning_rate": 7.5483926799643705e-06, + "loss": 0.7796, + "step": 12050 + }, + { + "epoch": 0.6632726071880676, + "grad_norm": 1.048311471939087, + "learning_rate": 7.54801973091845e-06, + "loss": 0.7306, + "step": 12051 + }, + { + "epoch": 0.6633276459904233, + "grad_norm": 0.7432072758674622, + "learning_rate": 7.547646762722296e-06, + "loss": 0.8209, + "step": 12052 + }, + { + "epoch": 0.663382684792779, + "grad_norm": 0.7191399335861206, + "learning_rate": 7.547273775378709e-06, + "loss": 0.7011, + "step": 12053 + }, + { + "epoch": 0.6634377235951345, + "grad_norm": 0.5776329636573792, + "learning_rate": 7.5469007688904975e-06, + "loss": 0.6055, + "step": 12054 + }, + { + "epoch": 0.6634927623974902, + "grad_norm": 0.9296837449073792, + "learning_rate": 7.546527743260459e-06, + "loss": 0.7413, + "step": 12055 + }, + { + "epoch": 0.6635478011998459, + "grad_norm": 0.7279512286186218, + "learning_rate": 7.5461546984914e-06, + "loss": 0.7734, + "step": 12056 + }, + { + "epoch": 0.6636028400022016, + "grad_norm": 0.7297198176383972, + "learning_rate": 7.545781634586125e-06, + "loss": 0.7535, + "step": 12057 + }, + { + "epoch": 0.6636578788045572, + "grad_norm": 0.7094287872314453, + "learning_rate": 7.545408551547435e-06, + "loss": 0.7587, + "step": 12058 + }, + { + "epoch": 0.6637129176069129, + "grad_norm": 0.7559607028961182, + "learning_rate": 7.5450354493781374e-06, + "loss": 0.7358, + "step": 12059 + }, + { + "epoch": 0.6637679564092686, + "grad_norm": 0.8472892045974731, + "learning_rate": 7.544662328081034e-06, + "loss": 0.7537, + "step": 12060 + }, + { + "epoch": 0.6638229952116242, + "grad_norm": 0.6346176862716675, + "learning_rate": 7.544289187658929e-06, + "loss": 0.7658, + "step": 12061 + }, + { + "epoch": 0.6638780340139798, + "grad_norm": 0.7949367165565491, + "learning_rate": 7.543916028114628e-06, + "loss": 0.6837, + "step": 12062 + }, + { + "epoch": 0.6639330728163355, + "grad_norm": 0.7177689671516418, + "learning_rate": 7.5435428494509355e-06, + "loss": 0.7218, + "step": 12063 + }, + { + "epoch": 0.6639881116186912, + "grad_norm": 0.90680330991745, + "learning_rate": 7.5431696516706555e-06, + "loss": 0.8274, + "step": 12064 + }, + { + "epoch": 0.6640431504210469, + "grad_norm": 0.7799603939056396, + "learning_rate": 7.5427964347765916e-06, + "loss": 0.7528, + "step": 12065 + }, + { + "epoch": 0.6640981892234025, + "grad_norm": 0.7668048739433289, + "learning_rate": 7.542423198771553e-06, + "loss": 0.746, + "step": 12066 + }, + { + "epoch": 0.6641532280257582, + "grad_norm": 1.0042381286621094, + "learning_rate": 7.542049943658341e-06, + "loss": 0.7836, + "step": 12067 + }, + { + "epoch": 0.6642082668281138, + "grad_norm": 0.6915723085403442, + "learning_rate": 7.541676669439761e-06, + "loss": 0.8042, + "step": 12068 + }, + { + "epoch": 0.6642633056304695, + "grad_norm": 0.7268955707550049, + "learning_rate": 7.5413033761186215e-06, + "loss": 0.689, + "step": 12069 + }, + { + "epoch": 0.6643183444328251, + "grad_norm": 0.6418740749359131, + "learning_rate": 7.540930063697726e-06, + "loss": 0.6302, + "step": 12070 + }, + { + "epoch": 0.6643733832351808, + "grad_norm": 0.696384847164154, + "learning_rate": 7.540556732179879e-06, + "loss": 0.7978, + "step": 12071 + }, + { + "epoch": 0.6644284220375365, + "grad_norm": 0.7400668859481812, + "learning_rate": 7.540183381567889e-06, + "loss": 0.8768, + "step": 12072 + }, + { + "epoch": 0.6644834608398921, + "grad_norm": 0.6653871536254883, + "learning_rate": 7.539810011864559e-06, + "loss": 0.8107, + "step": 12073 + }, + { + "epoch": 0.6645384996422478, + "grad_norm": 0.7635810971260071, + "learning_rate": 7.539436623072698e-06, + "loss": 0.8476, + "step": 12074 + }, + { + "epoch": 0.6645935384446034, + "grad_norm": 0.6583054661750793, + "learning_rate": 7.53906321519511e-06, + "loss": 0.7093, + "step": 12075 + }, + { + "epoch": 0.6646485772469591, + "grad_norm": 0.8294859528541565, + "learning_rate": 7.538689788234604e-06, + "loss": 0.8107, + "step": 12076 + }, + { + "epoch": 0.6647036160493147, + "grad_norm": 0.6711081862449646, + "learning_rate": 7.538316342193983e-06, + "loss": 0.7491, + "step": 12077 + }, + { + "epoch": 0.6647586548516704, + "grad_norm": 0.7375408411026001, + "learning_rate": 7.5379428770760575e-06, + "loss": 0.7853, + "step": 12078 + }, + { + "epoch": 0.6648136936540261, + "grad_norm": 0.7322511672973633, + "learning_rate": 7.537569392883633e-06, + "loss": 0.7568, + "step": 12079 + }, + { + "epoch": 0.6648687324563818, + "grad_norm": 0.6390300393104553, + "learning_rate": 7.537195889619515e-06, + "loss": 0.7191, + "step": 12080 + }, + { + "epoch": 0.6649237712587374, + "grad_norm": 0.8155800104141235, + "learning_rate": 7.536822367286514e-06, + "loss": 0.7499, + "step": 12081 + }, + { + "epoch": 0.664978810061093, + "grad_norm": 0.7942230701446533, + "learning_rate": 7.536448825887432e-06, + "loss": 0.7797, + "step": 12082 + }, + { + "epoch": 0.6650338488634487, + "grad_norm": 0.7103378176689148, + "learning_rate": 7.536075265425083e-06, + "loss": 0.6814, + "step": 12083 + }, + { + "epoch": 0.6650888876658044, + "grad_norm": 0.8164991736412048, + "learning_rate": 7.535701685902268e-06, + "loss": 0.7917, + "step": 12084 + }, + { + "epoch": 0.66514392646816, + "grad_norm": 0.6970370411872864, + "learning_rate": 7.535328087321799e-06, + "loss": 0.7266, + "step": 12085 + }, + { + "epoch": 0.6651989652705157, + "grad_norm": 0.6468706130981445, + "learning_rate": 7.534954469686484e-06, + "loss": 0.7229, + "step": 12086 + }, + { + "epoch": 0.6652540040728714, + "grad_norm": 0.6551242470741272, + "learning_rate": 7.534580832999128e-06, + "loss": 0.6759, + "step": 12087 + }, + { + "epoch": 0.6653090428752271, + "grad_norm": 0.670215368270874, + "learning_rate": 7.534207177262543e-06, + "loss": 0.761, + "step": 12088 + }, + { + "epoch": 0.6653640816775827, + "grad_norm": 0.7365970015525818, + "learning_rate": 7.533833502479533e-06, + "loss": 0.7628, + "step": 12089 + }, + { + "epoch": 0.6654191204799383, + "grad_norm": 0.7419471740722656, + "learning_rate": 7.53345980865291e-06, + "loss": 0.8093, + "step": 12090 + }, + { + "epoch": 0.665474159282294, + "grad_norm": 0.6573269963264465, + "learning_rate": 7.53308609578548e-06, + "loss": 0.6806, + "step": 12091 + }, + { + "epoch": 0.6655291980846497, + "grad_norm": 0.9270638227462769, + "learning_rate": 7.5327123638800545e-06, + "loss": 0.8612, + "step": 12092 + }, + { + "epoch": 0.6655842368870053, + "grad_norm": 0.85124671459198, + "learning_rate": 7.532338612939441e-06, + "loss": 0.6776, + "step": 12093 + }, + { + "epoch": 0.665639275689361, + "grad_norm": 0.7791070342063904, + "learning_rate": 7.531964842966446e-06, + "loss": 0.7571, + "step": 12094 + }, + { + "epoch": 0.6656943144917167, + "grad_norm": 0.6604436635971069, + "learning_rate": 7.5315910539638825e-06, + "loss": 0.781, + "step": 12095 + }, + { + "epoch": 0.6657493532940724, + "grad_norm": 0.7567091584205627, + "learning_rate": 7.531217245934559e-06, + "loss": 0.8005, + "step": 12096 + }, + { + "epoch": 0.6658043920964279, + "grad_norm": 0.660637378692627, + "learning_rate": 7.530843418881282e-06, + "loss": 0.7351, + "step": 12097 + }, + { + "epoch": 0.6658594308987836, + "grad_norm": 0.6305738687515259, + "learning_rate": 7.530469572806865e-06, + "loss": 0.7452, + "step": 12098 + }, + { + "epoch": 0.6659144697011393, + "grad_norm": 0.8291265368461609, + "learning_rate": 7.5300957077141164e-06, + "loss": 0.7799, + "step": 12099 + }, + { + "epoch": 0.665969508503495, + "grad_norm": 0.7459661364555359, + "learning_rate": 7.5297218236058456e-06, + "loss": 0.8273, + "step": 12100 + }, + { + "epoch": 0.6660245473058506, + "grad_norm": 0.7570028901100159, + "learning_rate": 7.529347920484862e-06, + "loss": 0.7622, + "step": 12101 + }, + { + "epoch": 0.6660795861082063, + "grad_norm": 0.733403205871582, + "learning_rate": 7.528973998353977e-06, + "loss": 0.8357, + "step": 12102 + }, + { + "epoch": 0.666134624910562, + "grad_norm": 0.8814442753791809, + "learning_rate": 7.528600057216e-06, + "loss": 0.727, + "step": 12103 + }, + { + "epoch": 0.6661896637129177, + "grad_norm": 0.629338800907135, + "learning_rate": 7.528226097073742e-06, + "loss": 0.6758, + "step": 12104 + }, + { + "epoch": 0.6662447025152732, + "grad_norm": 0.7786098122596741, + "learning_rate": 7.527852117930014e-06, + "loss": 0.7476, + "step": 12105 + }, + { + "epoch": 0.6662997413176289, + "grad_norm": 0.6604528427124023, + "learning_rate": 7.527478119787626e-06, + "loss": 0.7275, + "step": 12106 + }, + { + "epoch": 0.6663547801199846, + "grad_norm": 0.6937400698661804, + "learning_rate": 7.527104102649387e-06, + "loss": 0.7187, + "step": 12107 + }, + { + "epoch": 0.6664098189223403, + "grad_norm": 0.6863219738006592, + "learning_rate": 7.526730066518113e-06, + "loss": 0.7512, + "step": 12108 + }, + { + "epoch": 0.6664648577246959, + "grad_norm": 0.7771461606025696, + "learning_rate": 7.526356011396609e-06, + "loss": 0.8439, + "step": 12109 + }, + { + "epoch": 0.6665198965270516, + "grad_norm": 0.7223722338676453, + "learning_rate": 7.525981937287692e-06, + "loss": 0.6488, + "step": 12110 + }, + { + "epoch": 0.6665749353294073, + "grad_norm": 0.8091556429862976, + "learning_rate": 7.52560784419417e-06, + "loss": 0.6618, + "step": 12111 + }, + { + "epoch": 0.666629974131763, + "grad_norm": 0.6435044407844543, + "learning_rate": 7.525233732118856e-06, + "loss": 0.6994, + "step": 12112 + }, + { + "epoch": 0.6666850129341185, + "grad_norm": 0.6933714151382446, + "learning_rate": 7.52485960106456e-06, + "loss": 0.6917, + "step": 12113 + }, + { + "epoch": 0.6667400517364742, + "grad_norm": 0.693192720413208, + "learning_rate": 7.524485451034097e-06, + "loss": 0.7941, + "step": 12114 + }, + { + "epoch": 0.6667950905388299, + "grad_norm": 1.1374844312667847, + "learning_rate": 7.524111282030275e-06, + "loss": 0.9112, + "step": 12115 + }, + { + "epoch": 0.6668501293411855, + "grad_norm": 0.6917465329170227, + "learning_rate": 7.523737094055911e-06, + "loss": 0.681, + "step": 12116 + }, + { + "epoch": 0.6669051681435412, + "grad_norm": 0.8057913184165955, + "learning_rate": 7.523362887113812e-06, + "loss": 0.8186, + "step": 12117 + }, + { + "epoch": 0.6669602069458969, + "grad_norm": 0.7194918394088745, + "learning_rate": 7.522988661206795e-06, + "loss": 0.7875, + "step": 12118 + }, + { + "epoch": 0.6670152457482525, + "grad_norm": 0.6829916834831238, + "learning_rate": 7.52261441633767e-06, + "loss": 0.6506, + "step": 12119 + }, + { + "epoch": 0.6670702845506081, + "grad_norm": 0.7869738936424255, + "learning_rate": 7.5222401525092495e-06, + "loss": 0.7091, + "step": 12120 + }, + { + "epoch": 0.6671253233529638, + "grad_norm": 0.6835895776748657, + "learning_rate": 7.5218658697243475e-06, + "loss": 0.7839, + "step": 12121 + }, + { + "epoch": 0.6671803621553195, + "grad_norm": 0.7462154030799866, + "learning_rate": 7.521491567985776e-06, + "loss": 0.7073, + "step": 12122 + }, + { + "epoch": 0.6672354009576752, + "grad_norm": 0.6413764953613281, + "learning_rate": 7.52111724729635e-06, + "loss": 0.6472, + "step": 12123 + }, + { + "epoch": 0.6672904397600308, + "grad_norm": 0.7085923552513123, + "learning_rate": 7.520742907658881e-06, + "loss": 0.8167, + "step": 12124 + }, + { + "epoch": 0.6673454785623865, + "grad_norm": 0.6490428447723389, + "learning_rate": 7.520368549076182e-06, + "loss": 0.7693, + "step": 12125 + }, + { + "epoch": 0.6674005173647422, + "grad_norm": 0.7082974910736084, + "learning_rate": 7.51999417155107e-06, + "loss": 0.6707, + "step": 12126 + }, + { + "epoch": 0.6674555561670978, + "grad_norm": 0.704335629940033, + "learning_rate": 7.519619775086355e-06, + "loss": 0.825, + "step": 12127 + }, + { + "epoch": 0.6675105949694534, + "grad_norm": 0.6815123558044434, + "learning_rate": 7.519245359684852e-06, + "loss": 0.762, + "step": 12128 + }, + { + "epoch": 0.6675656337718091, + "grad_norm": 0.6497910618782043, + "learning_rate": 7.518870925349376e-06, + "loss": 0.6934, + "step": 12129 + }, + { + "epoch": 0.6676206725741648, + "grad_norm": 0.6699943542480469, + "learning_rate": 7.51849647208274e-06, + "loss": 0.7816, + "step": 12130 + }, + { + "epoch": 0.6676757113765205, + "grad_norm": 0.7139337062835693, + "learning_rate": 7.51812199988776e-06, + "loss": 0.679, + "step": 12131 + }, + { + "epoch": 0.6677307501788761, + "grad_norm": 0.6762346029281616, + "learning_rate": 7.517747508767248e-06, + "loss": 0.7477, + "step": 12132 + }, + { + "epoch": 0.6677857889812318, + "grad_norm": 0.7429338693618774, + "learning_rate": 7.517372998724017e-06, + "loss": 0.7549, + "step": 12133 + }, + { + "epoch": 0.6678408277835874, + "grad_norm": 0.7392850518226624, + "learning_rate": 7.516998469760888e-06, + "loss": 0.8167, + "step": 12134 + }, + { + "epoch": 0.6678958665859431, + "grad_norm": 0.7511306405067444, + "learning_rate": 7.516623921880671e-06, + "loss": 0.7264, + "step": 12135 + }, + { + "epoch": 0.6679509053882987, + "grad_norm": 0.6757550835609436, + "learning_rate": 7.516249355086183e-06, + "loss": 0.7405, + "step": 12136 + }, + { + "epoch": 0.6680059441906544, + "grad_norm": 0.7433735132217407, + "learning_rate": 7.515874769380238e-06, + "loss": 0.7954, + "step": 12137 + }, + { + "epoch": 0.6680609829930101, + "grad_norm": 0.7390886545181274, + "learning_rate": 7.51550016476565e-06, + "loss": 0.7487, + "step": 12138 + }, + { + "epoch": 0.6681160217953658, + "grad_norm": 0.7405929565429688, + "learning_rate": 7.5151255412452385e-06, + "loss": 0.8127, + "step": 12139 + }, + { + "epoch": 0.6681710605977214, + "grad_norm": 0.6628968715667725, + "learning_rate": 7.514750898821817e-06, + "loss": 0.7009, + "step": 12140 + }, + { + "epoch": 0.668226099400077, + "grad_norm": 0.6777421832084656, + "learning_rate": 7.514376237498199e-06, + "loss": 0.6689, + "step": 12141 + }, + { + "epoch": 0.6682811382024327, + "grad_norm": 0.617261528968811, + "learning_rate": 7.514001557277202e-06, + "loss": 0.7597, + "step": 12142 + }, + { + "epoch": 0.6683361770047884, + "grad_norm": 0.6666202545166016, + "learning_rate": 7.5136268581616446e-06, + "loss": 0.6623, + "step": 12143 + }, + { + "epoch": 0.668391215807144, + "grad_norm": 0.7170178890228271, + "learning_rate": 7.513252140154339e-06, + "loss": 0.8224, + "step": 12144 + }, + { + "epoch": 0.6684462546094997, + "grad_norm": 0.6173199415206909, + "learning_rate": 7.512877403258103e-06, + "loss": 0.6784, + "step": 12145 + }, + { + "epoch": 0.6685012934118554, + "grad_norm": 0.6906641125679016, + "learning_rate": 7.512502647475753e-06, + "loss": 0.6649, + "step": 12146 + }, + { + "epoch": 0.6685563322142111, + "grad_norm": 0.6435873508453369, + "learning_rate": 7.5121278728101065e-06, + "loss": 0.751, + "step": 12147 + }, + { + "epoch": 0.6686113710165666, + "grad_norm": 0.8345947861671448, + "learning_rate": 7.511753079263978e-06, + "loss": 0.7841, + "step": 12148 + }, + { + "epoch": 0.6686664098189223, + "grad_norm": 0.6952378153800964, + "learning_rate": 7.511378266840187e-06, + "loss": 0.8187, + "step": 12149 + }, + { + "epoch": 0.668721448621278, + "grad_norm": 0.6878920793533325, + "learning_rate": 7.5110034355415484e-06, + "loss": 0.6726, + "step": 12150 + }, + { + "epoch": 0.6687764874236337, + "grad_norm": 0.7119094729423523, + "learning_rate": 7.5106285853708805e-06, + "loss": 0.7824, + "step": 12151 + }, + { + "epoch": 0.6688315262259893, + "grad_norm": 0.7261053323745728, + "learning_rate": 7.5102537163309994e-06, + "loss": 0.7122, + "step": 12152 + }, + { + "epoch": 0.668886565028345, + "grad_norm": 0.717268168926239, + "learning_rate": 7.509878828424725e-06, + "loss": 0.7144, + "step": 12153 + }, + { + "epoch": 0.6689416038307007, + "grad_norm": 0.8373270630836487, + "learning_rate": 7.5095039216548725e-06, + "loss": 0.7941, + "step": 12154 + }, + { + "epoch": 0.6689966426330564, + "grad_norm": 0.7113829851150513, + "learning_rate": 7.509128996024259e-06, + "loss": 0.705, + "step": 12155 + }, + { + "epoch": 0.6690516814354119, + "grad_norm": 0.7894094586372375, + "learning_rate": 7.508754051535705e-06, + "loss": 0.8284, + "step": 12156 + }, + { + "epoch": 0.6691067202377676, + "grad_norm": 0.6739659905433655, + "learning_rate": 7.508379088192028e-06, + "loss": 0.7264, + "step": 12157 + }, + { + "epoch": 0.6691617590401233, + "grad_norm": 0.735211193561554, + "learning_rate": 7.508004105996043e-06, + "loss": 0.8187, + "step": 12158 + }, + { + "epoch": 0.6692167978424789, + "grad_norm": 0.7438055872917175, + "learning_rate": 7.507629104950571e-06, + "loss": 0.8949, + "step": 12159 + }, + { + "epoch": 0.6692718366448346, + "grad_norm": 1.0734246969223022, + "learning_rate": 7.507254085058431e-06, + "loss": 0.7687, + "step": 12160 + }, + { + "epoch": 0.6693268754471903, + "grad_norm": 0.6719897985458374, + "learning_rate": 7.50687904632244e-06, + "loss": 0.7522, + "step": 12161 + }, + { + "epoch": 0.669381914249546, + "grad_norm": 0.7063966989517212, + "learning_rate": 7.506503988745416e-06, + "loss": 0.7794, + "step": 12162 + }, + { + "epoch": 0.6694369530519015, + "grad_norm": 0.6582265496253967, + "learning_rate": 7.506128912330179e-06, + "loss": 0.7012, + "step": 12163 + }, + { + "epoch": 0.6694919918542572, + "grad_norm": 0.7764506340026855, + "learning_rate": 7.50575381707955e-06, + "loss": 0.7816, + "step": 12164 + }, + { + "epoch": 0.6695470306566129, + "grad_norm": 0.7659780383110046, + "learning_rate": 7.505378702996344e-06, + "loss": 0.753, + "step": 12165 + }, + { + "epoch": 0.6696020694589686, + "grad_norm": 0.9013122916221619, + "learning_rate": 7.505003570083385e-06, + "loss": 0.8255, + "step": 12166 + }, + { + "epoch": 0.6696571082613242, + "grad_norm": 0.6417272686958313, + "learning_rate": 7.504628418343487e-06, + "loss": 0.6236, + "step": 12167 + }, + { + "epoch": 0.6697121470636799, + "grad_norm": 0.7511595487594604, + "learning_rate": 7.504253247779474e-06, + "loss": 0.7961, + "step": 12168 + }, + { + "epoch": 0.6697671858660356, + "grad_norm": 0.7987878918647766, + "learning_rate": 7.503878058394163e-06, + "loss": 0.7249, + "step": 12169 + }, + { + "epoch": 0.6698222246683913, + "grad_norm": 0.6860646605491638, + "learning_rate": 7.503502850190374e-06, + "loss": 0.7973, + "step": 12170 + }, + { + "epoch": 0.6698772634707468, + "grad_norm": 0.7334334850311279, + "learning_rate": 7.50312762317093e-06, + "loss": 0.8756, + "step": 12171 + }, + { + "epoch": 0.6699323022731025, + "grad_norm": 0.7792186737060547, + "learning_rate": 7.502752377338647e-06, + "loss": 0.8393, + "step": 12172 + }, + { + "epoch": 0.6699873410754582, + "grad_norm": 0.6532536149024963, + "learning_rate": 7.502377112696346e-06, + "loss": 0.6509, + "step": 12173 + }, + { + "epoch": 0.6700423798778139, + "grad_norm": 0.6595458984375, + "learning_rate": 7.50200182924685e-06, + "loss": 0.781, + "step": 12174 + }, + { + "epoch": 0.6700974186801695, + "grad_norm": 0.6668636202812195, + "learning_rate": 7.501626526992978e-06, + "loss": 0.7702, + "step": 12175 + }, + { + "epoch": 0.6701524574825252, + "grad_norm": 0.686851441860199, + "learning_rate": 7.501251205937551e-06, + "loss": 0.8648, + "step": 12176 + }, + { + "epoch": 0.6702074962848809, + "grad_norm": 0.7363078594207764, + "learning_rate": 7.500875866083388e-06, + "loss": 0.7309, + "step": 12177 + }, + { + "epoch": 0.6702625350872365, + "grad_norm": 0.6927379369735718, + "learning_rate": 7.500500507433312e-06, + "loss": 0.7258, + "step": 12178 + }, + { + "epoch": 0.6703175738895921, + "grad_norm": 0.6589936017990112, + "learning_rate": 7.5001251299901455e-06, + "loss": 0.6776, + "step": 12179 + }, + { + "epoch": 0.6703726126919478, + "grad_norm": 0.6402539610862732, + "learning_rate": 7.499749733756707e-06, + "loss": 0.7467, + "step": 12180 + }, + { + "epoch": 0.6704276514943035, + "grad_norm": 0.776469886302948, + "learning_rate": 7.499374318735817e-06, + "loss": 0.7856, + "step": 12181 + }, + { + "epoch": 0.6704826902966592, + "grad_norm": 0.7062460780143738, + "learning_rate": 7.4989988849303e-06, + "loss": 0.8286, + "step": 12182 + }, + { + "epoch": 0.6705377290990148, + "grad_norm": 0.6725799441337585, + "learning_rate": 7.4986234323429755e-06, + "loss": 0.7517, + "step": 12183 + }, + { + "epoch": 0.6705927679013705, + "grad_norm": 0.6444042921066284, + "learning_rate": 7.498247960976667e-06, + "loss": 0.5984, + "step": 12184 + }, + { + "epoch": 0.6706478067037261, + "grad_norm": 0.6968628764152527, + "learning_rate": 7.497872470834195e-06, + "loss": 0.6996, + "step": 12185 + }, + { + "epoch": 0.6707028455060818, + "grad_norm": 0.643500030040741, + "learning_rate": 7.497496961918381e-06, + "loss": 0.6252, + "step": 12186 + }, + { + "epoch": 0.6707578843084374, + "grad_norm": 0.7026870846748352, + "learning_rate": 7.49712143423205e-06, + "loss": 0.7883, + "step": 12187 + }, + { + "epoch": 0.6708129231107931, + "grad_norm": 0.8169240951538086, + "learning_rate": 7.496745887778022e-06, + "loss": 0.6717, + "step": 12188 + }, + { + "epoch": 0.6708679619131488, + "grad_norm": 0.6611927151679993, + "learning_rate": 7.496370322559121e-06, + "loss": 0.6674, + "step": 12189 + }, + { + "epoch": 0.6709230007155045, + "grad_norm": 0.7330195307731628, + "learning_rate": 7.495994738578169e-06, + "loss": 0.7809, + "step": 12190 + }, + { + "epoch": 0.6709780395178601, + "grad_norm": 0.6469636559486389, + "learning_rate": 7.495619135837988e-06, + "loss": 0.6511, + "step": 12191 + }, + { + "epoch": 0.6710330783202157, + "grad_norm": 0.6558564901351929, + "learning_rate": 7.495243514341402e-06, + "loss": 0.7284, + "step": 12192 + }, + { + "epoch": 0.6710881171225714, + "grad_norm": 0.6736281514167786, + "learning_rate": 7.494867874091233e-06, + "loss": 0.7007, + "step": 12193 + }, + { + "epoch": 0.6711431559249271, + "grad_norm": 0.7302053570747375, + "learning_rate": 7.494492215090304e-06, + "loss": 0.77, + "step": 12194 + }, + { + "epoch": 0.6711981947272827, + "grad_norm": 0.7368764877319336, + "learning_rate": 7.494116537341442e-06, + "loss": 0.8478, + "step": 12195 + }, + { + "epoch": 0.6712532335296384, + "grad_norm": 0.782767653465271, + "learning_rate": 7.493740840847466e-06, + "loss": 0.813, + "step": 12196 + }, + { + "epoch": 0.6713082723319941, + "grad_norm": 0.6787601113319397, + "learning_rate": 7.493365125611202e-06, + "loss": 0.7507, + "step": 12197 + }, + { + "epoch": 0.6713633111343498, + "grad_norm": 0.6912569999694824, + "learning_rate": 7.4929893916354715e-06, + "loss": 0.8003, + "step": 12198 + }, + { + "epoch": 0.6714183499367053, + "grad_norm": 0.7625328898429871, + "learning_rate": 7.4926136389231005e-06, + "loss": 0.8021, + "step": 12199 + }, + { + "epoch": 0.671473388739061, + "grad_norm": 0.6720984578132629, + "learning_rate": 7.4922378674769146e-06, + "loss": 0.7757, + "step": 12200 + }, + { + "epoch": 0.6715284275414167, + "grad_norm": 0.7816714644432068, + "learning_rate": 7.491862077299734e-06, + "loss": 0.7086, + "step": 12201 + }, + { + "epoch": 0.6715834663437723, + "grad_norm": 0.7546358108520508, + "learning_rate": 7.491486268394387e-06, + "loss": 0.8365, + "step": 12202 + }, + { + "epoch": 0.671638505146128, + "grad_norm": 0.7201979756355286, + "learning_rate": 7.491110440763695e-06, + "loss": 0.835, + "step": 12203 + }, + { + "epoch": 0.6716935439484837, + "grad_norm": 0.8177551031112671, + "learning_rate": 7.490734594410484e-06, + "loss": 0.8636, + "step": 12204 + }, + { + "epoch": 0.6717485827508394, + "grad_norm": 0.7433933019638062, + "learning_rate": 7.490358729337578e-06, + "loss": 0.745, + "step": 12205 + }, + { + "epoch": 0.671803621553195, + "grad_norm": 0.8013591170310974, + "learning_rate": 7.489982845547802e-06, + "loss": 0.7638, + "step": 12206 + }, + { + "epoch": 0.6718586603555506, + "grad_norm": 0.6561495065689087, + "learning_rate": 7.489606943043982e-06, + "loss": 0.7997, + "step": 12207 + }, + { + "epoch": 0.6719136991579063, + "grad_norm": 0.7291023135185242, + "learning_rate": 7.489231021828943e-06, + "loss": 0.7452, + "step": 12208 + }, + { + "epoch": 0.671968737960262, + "grad_norm": 0.6978216171264648, + "learning_rate": 7.488855081905511e-06, + "loss": 0.7984, + "step": 12209 + }, + { + "epoch": 0.6720237767626176, + "grad_norm": 0.701006293296814, + "learning_rate": 7.488479123276507e-06, + "loss": 0.7218, + "step": 12210 + }, + { + "epoch": 0.6720788155649733, + "grad_norm": 0.7275286912918091, + "learning_rate": 7.488103145944763e-06, + "loss": 0.6872, + "step": 12211 + }, + { + "epoch": 0.672133854367329, + "grad_norm": 0.7319645881652832, + "learning_rate": 7.487727149913101e-06, + "loss": 0.7862, + "step": 12212 + }, + { + "epoch": 0.6721888931696847, + "grad_norm": 0.7143612504005432, + "learning_rate": 7.487351135184348e-06, + "loss": 0.838, + "step": 12213 + }, + { + "epoch": 0.6722439319720402, + "grad_norm": 0.7135382294654846, + "learning_rate": 7.486975101761329e-06, + "loss": 0.7263, + "step": 12214 + }, + { + "epoch": 0.6722989707743959, + "grad_norm": 0.6283460259437561, + "learning_rate": 7.486599049646872e-06, + "loss": 0.7262, + "step": 12215 + }, + { + "epoch": 0.6723540095767516, + "grad_norm": 0.7196768522262573, + "learning_rate": 7.486222978843801e-06, + "loss": 0.6752, + "step": 12216 + }, + { + "epoch": 0.6724090483791073, + "grad_norm": 0.5856572389602661, + "learning_rate": 7.485846889354944e-06, + "loss": 0.6779, + "step": 12217 + }, + { + "epoch": 0.6724640871814629, + "grad_norm": 0.7671294808387756, + "learning_rate": 7.485470781183126e-06, + "loss": 0.766, + "step": 12218 + }, + { + "epoch": 0.6725191259838186, + "grad_norm": 0.6780520677566528, + "learning_rate": 7.485094654331177e-06, + "loss": 0.7474, + "step": 12219 + }, + { + "epoch": 0.6725741647861743, + "grad_norm": 0.7537981867790222, + "learning_rate": 7.484718508801921e-06, + "loss": 0.8347, + "step": 12220 + }, + { + "epoch": 0.67262920358853, + "grad_norm": 0.7451551556587219, + "learning_rate": 7.484342344598186e-06, + "loss": 0.8217, + "step": 12221 + }, + { + "epoch": 0.6726842423908855, + "grad_norm": 0.6656951904296875, + "learning_rate": 7.483966161722798e-06, + "loss": 0.7437, + "step": 12222 + }, + { + "epoch": 0.6727392811932412, + "grad_norm": 0.7306267619132996, + "learning_rate": 7.483589960178586e-06, + "loss": 0.8495, + "step": 12223 + }, + { + "epoch": 0.6727943199955969, + "grad_norm": 0.6619658470153809, + "learning_rate": 7.483213739968376e-06, + "loss": 0.6379, + "step": 12224 + }, + { + "epoch": 0.6728493587979526, + "grad_norm": 0.7066444754600525, + "learning_rate": 7.4828375010949974e-06, + "loss": 0.7307, + "step": 12225 + }, + { + "epoch": 0.6729043976003082, + "grad_norm": 0.7356079816818237, + "learning_rate": 7.482461243561276e-06, + "loss": 0.7781, + "step": 12226 + }, + { + "epoch": 0.6729594364026639, + "grad_norm": 0.6759988069534302, + "learning_rate": 7.48208496737004e-06, + "loss": 0.7808, + "step": 12227 + }, + { + "epoch": 0.6730144752050196, + "grad_norm": 0.7519234418869019, + "learning_rate": 7.481708672524119e-06, + "loss": 0.7948, + "step": 12228 + }, + { + "epoch": 0.6730695140073752, + "grad_norm": 0.6387592554092407, + "learning_rate": 7.48133235902634e-06, + "loss": 0.7423, + "step": 12229 + }, + { + "epoch": 0.6731245528097308, + "grad_norm": 1.0615060329437256, + "learning_rate": 7.480956026879529e-06, + "loss": 0.8668, + "step": 12230 + }, + { + "epoch": 0.6731795916120865, + "grad_norm": 0.7578469514846802, + "learning_rate": 7.480579676086519e-06, + "loss": 0.812, + "step": 12231 + }, + { + "epoch": 0.6732346304144422, + "grad_norm": 0.6669226884841919, + "learning_rate": 7.480203306650134e-06, + "loss": 0.7002, + "step": 12232 + }, + { + "epoch": 0.6732896692167979, + "grad_norm": 0.7110459208488464, + "learning_rate": 7.479826918573208e-06, + "loss": 0.8542, + "step": 12233 + }, + { + "epoch": 0.6733447080191535, + "grad_norm": 0.6632254123687744, + "learning_rate": 7.479450511858563e-06, + "loss": 0.6784, + "step": 12234 + }, + { + "epoch": 0.6733997468215092, + "grad_norm": 0.7368438839912415, + "learning_rate": 7.479074086509032e-06, + "loss": 0.7683, + "step": 12235 + }, + { + "epoch": 0.6734547856238648, + "grad_norm": 0.764905571937561, + "learning_rate": 7.478697642527447e-06, + "loss": 0.7585, + "step": 12236 + }, + { + "epoch": 0.6735098244262205, + "grad_norm": 0.7141197323799133, + "learning_rate": 7.478321179916632e-06, + "loss": 0.7409, + "step": 12237 + }, + { + "epoch": 0.6735648632285761, + "grad_norm": 0.6514197587966919, + "learning_rate": 7.477944698679419e-06, + "loss": 0.7623, + "step": 12238 + }, + { + "epoch": 0.6736199020309318, + "grad_norm": 0.7712671160697937, + "learning_rate": 7.477568198818636e-06, + "loss": 0.777, + "step": 12239 + }, + { + "epoch": 0.6736749408332875, + "grad_norm": 0.6690881252288818, + "learning_rate": 7.4771916803371145e-06, + "loss": 0.7275, + "step": 12240 + }, + { + "epoch": 0.6737299796356432, + "grad_norm": 0.7206465601921082, + "learning_rate": 7.476815143237683e-06, + "loss": 0.853, + "step": 12241 + }, + { + "epoch": 0.6737850184379988, + "grad_norm": 0.7052504420280457, + "learning_rate": 7.476438587523171e-06, + "loss": 0.774, + "step": 12242 + }, + { + "epoch": 0.6738400572403545, + "grad_norm": 1.6168169975280762, + "learning_rate": 7.476062013196411e-06, + "loss": 0.7423, + "step": 12243 + }, + { + "epoch": 0.6738950960427101, + "grad_norm": 0.715300977230072, + "learning_rate": 7.475685420260232e-06, + "loss": 0.78, + "step": 12244 + }, + { + "epoch": 0.6739501348450657, + "grad_norm": 0.7774379253387451, + "learning_rate": 7.475308808717463e-06, + "loss": 0.885, + "step": 12245 + }, + { + "epoch": 0.6740051736474214, + "grad_norm": 0.6998060941696167, + "learning_rate": 7.474932178570935e-06, + "loss": 0.807, + "step": 12246 + }, + { + "epoch": 0.6740602124497771, + "grad_norm": 0.6710013747215271, + "learning_rate": 7.47455552982348e-06, + "loss": 0.7639, + "step": 12247 + }, + { + "epoch": 0.6741152512521328, + "grad_norm": 0.707435667514801, + "learning_rate": 7.474178862477929e-06, + "loss": 0.7914, + "step": 12248 + }, + { + "epoch": 0.6741702900544884, + "grad_norm": 0.7344105243682861, + "learning_rate": 7.47380217653711e-06, + "loss": 0.7464, + "step": 12249 + }, + { + "epoch": 0.674225328856844, + "grad_norm": 0.7157585620880127, + "learning_rate": 7.473425472003858e-06, + "loss": 0.7747, + "step": 12250 + }, + { + "epoch": 0.6742803676591997, + "grad_norm": 0.6978434920310974, + "learning_rate": 7.473048748881001e-06, + "loss": 0.6903, + "step": 12251 + }, + { + "epoch": 0.6743354064615554, + "grad_norm": 0.6454086899757385, + "learning_rate": 7.472672007171372e-06, + "loss": 0.725, + "step": 12252 + }, + { + "epoch": 0.674390445263911, + "grad_norm": 0.6729341745376587, + "learning_rate": 7.4722952468778035e-06, + "loss": 0.7704, + "step": 12253 + }, + { + "epoch": 0.6744454840662667, + "grad_norm": 0.7995265126228333, + "learning_rate": 7.471918468003122e-06, + "loss": 0.7567, + "step": 12254 + }, + { + "epoch": 0.6745005228686224, + "grad_norm": 0.729629397392273, + "learning_rate": 7.471541670550165e-06, + "loss": 0.796, + "step": 12255 + }, + { + "epoch": 0.6745555616709781, + "grad_norm": 0.6923666000366211, + "learning_rate": 7.471164854521764e-06, + "loss": 0.6894, + "step": 12256 + }, + { + "epoch": 0.6746106004733337, + "grad_norm": 0.6485042572021484, + "learning_rate": 7.470788019920747e-06, + "loss": 0.6912, + "step": 12257 + }, + { + "epoch": 0.6746656392756893, + "grad_norm": 0.7569034099578857, + "learning_rate": 7.470411166749949e-06, + "loss": 0.8167, + "step": 12258 + }, + { + "epoch": 0.674720678078045, + "grad_norm": 0.6202835440635681, + "learning_rate": 7.470034295012203e-06, + "loss": 0.6409, + "step": 12259 + }, + { + "epoch": 0.6747757168804007, + "grad_norm": 0.6414007544517517, + "learning_rate": 7.4696574047103395e-06, + "loss": 0.7163, + "step": 12260 + }, + { + "epoch": 0.6748307556827563, + "grad_norm": 0.7012181878089905, + "learning_rate": 7.469280495847193e-06, + "loss": 0.7682, + "step": 12261 + }, + { + "epoch": 0.674885794485112, + "grad_norm": 0.7027888298034668, + "learning_rate": 7.468903568425596e-06, + "loss": 0.7561, + "step": 12262 + }, + { + "epoch": 0.6749408332874677, + "grad_norm": 0.7282221913337708, + "learning_rate": 7.4685266224483785e-06, + "loss": 0.7552, + "step": 12263 + }, + { + "epoch": 0.6749958720898234, + "grad_norm": 0.7349117398262024, + "learning_rate": 7.468149657918377e-06, + "loss": 0.8323, + "step": 12264 + }, + { + "epoch": 0.675050910892179, + "grad_norm": 0.8992187976837158, + "learning_rate": 7.467772674838424e-06, + "loss": 0.7589, + "step": 12265 + }, + { + "epoch": 0.6751059496945346, + "grad_norm": 0.6773034930229187, + "learning_rate": 7.4673956732113505e-06, + "loss": 0.7229, + "step": 12266 + }, + { + "epoch": 0.6751609884968903, + "grad_norm": 0.6563699841499329, + "learning_rate": 7.467018653039992e-06, + "loss": 0.7526, + "step": 12267 + }, + { + "epoch": 0.675216027299246, + "grad_norm": 0.7559765577316284, + "learning_rate": 7.466641614327181e-06, + "loss": 0.708, + "step": 12268 + }, + { + "epoch": 0.6752710661016016, + "grad_norm": 0.7077820897102356, + "learning_rate": 7.4662645570757545e-06, + "loss": 0.6568, + "step": 12269 + }, + { + "epoch": 0.6753261049039573, + "grad_norm": 0.8082162141799927, + "learning_rate": 7.465887481288541e-06, + "loss": 0.8751, + "step": 12270 + }, + { + "epoch": 0.675381143706313, + "grad_norm": 0.6940243244171143, + "learning_rate": 7.465510386968377e-06, + "loss": 0.7826, + "step": 12271 + }, + { + "epoch": 0.6754361825086687, + "grad_norm": 0.6634145379066467, + "learning_rate": 7.465133274118099e-06, + "loss": 0.6816, + "step": 12272 + }, + { + "epoch": 0.6754912213110242, + "grad_norm": 0.6797559857368469, + "learning_rate": 7.464756142740539e-06, + "loss": 0.7101, + "step": 12273 + }, + { + "epoch": 0.6755462601133799, + "grad_norm": 0.7696588635444641, + "learning_rate": 7.464378992838531e-06, + "loss": 0.8114, + "step": 12274 + }, + { + "epoch": 0.6756012989157356, + "grad_norm": 0.6733334064483643, + "learning_rate": 7.4640018244149105e-06, + "loss": 0.7585, + "step": 12275 + }, + { + "epoch": 0.6756563377180913, + "grad_norm": 0.7087474465370178, + "learning_rate": 7.463624637472512e-06, + "loss": 0.6911, + "step": 12276 + }, + { + "epoch": 0.6757113765204469, + "grad_norm": 0.6944451928138733, + "learning_rate": 7.46324743201417e-06, + "loss": 0.7726, + "step": 12277 + }, + { + "epoch": 0.6757664153228026, + "grad_norm": 0.7214855551719666, + "learning_rate": 7.46287020804272e-06, + "loss": 0.7844, + "step": 12278 + }, + { + "epoch": 0.6758214541251583, + "grad_norm": 0.7106257677078247, + "learning_rate": 7.462492965560995e-06, + "loss": 0.7724, + "step": 12279 + }, + { + "epoch": 0.675876492927514, + "grad_norm": 0.7403497695922852, + "learning_rate": 7.462115704571833e-06, + "loss": 0.7558, + "step": 12280 + }, + { + "epoch": 0.6759315317298695, + "grad_norm": 0.7157884836196899, + "learning_rate": 7.4617384250780685e-06, + "loss": 0.6681, + "step": 12281 + }, + { + "epoch": 0.6759865705322252, + "grad_norm": 0.6937661170959473, + "learning_rate": 7.461361127082538e-06, + "loss": 0.7852, + "step": 12282 + }, + { + "epoch": 0.6760416093345809, + "grad_norm": 0.7106412053108215, + "learning_rate": 7.4609838105880735e-06, + "loss": 0.7689, + "step": 12283 + }, + { + "epoch": 0.6760966481369366, + "grad_norm": 0.6860619187355042, + "learning_rate": 7.460606475597516e-06, + "loss": 0.6528, + "step": 12284 + }, + { + "epoch": 0.6761516869392922, + "grad_norm": 0.7085865139961243, + "learning_rate": 7.460229122113698e-06, + "loss": 0.7303, + "step": 12285 + }, + { + "epoch": 0.6762067257416479, + "grad_norm": 0.6648178100585938, + "learning_rate": 7.459851750139457e-06, + "loss": 0.6751, + "step": 12286 + }, + { + "epoch": 0.6762617645440036, + "grad_norm": 0.74468594789505, + "learning_rate": 7.459474359677629e-06, + "loss": 0.756, + "step": 12287 + }, + { + "epoch": 0.6763168033463591, + "grad_norm": 0.6408486366271973, + "learning_rate": 7.459096950731048e-06, + "loss": 0.7737, + "step": 12288 + }, + { + "epoch": 0.6763718421487148, + "grad_norm": 0.7204515933990479, + "learning_rate": 7.458719523302556e-06, + "loss": 0.7845, + "step": 12289 + }, + { + "epoch": 0.6764268809510705, + "grad_norm": 0.7373428344726562, + "learning_rate": 7.458342077394984e-06, + "loss": 0.7245, + "step": 12290 + }, + { + "epoch": 0.6764819197534262, + "grad_norm": 0.701654851436615, + "learning_rate": 7.45796461301117e-06, + "loss": 0.7711, + "step": 12291 + }, + { + "epoch": 0.6765369585557818, + "grad_norm": 0.7002573013305664, + "learning_rate": 7.4575871301539526e-06, + "loss": 0.8138, + "step": 12292 + }, + { + "epoch": 0.6765919973581375, + "grad_norm": 0.7460681200027466, + "learning_rate": 7.45720962882617e-06, + "loss": 0.8012, + "step": 12293 + }, + { + "epoch": 0.6766470361604932, + "grad_norm": 0.6478421092033386, + "learning_rate": 7.456832109030655e-06, + "loss": 0.7161, + "step": 12294 + }, + { + "epoch": 0.6767020749628488, + "grad_norm": 0.7101582288742065, + "learning_rate": 7.456454570770248e-06, + "loss": 0.7348, + "step": 12295 + }, + { + "epoch": 0.6767571137652044, + "grad_norm": 0.7735113501548767, + "learning_rate": 7.4560770140477865e-06, + "loss": 0.7584, + "step": 12296 + }, + { + "epoch": 0.6768121525675601, + "grad_norm": 0.6811535358428955, + "learning_rate": 7.4556994388661085e-06, + "loss": 0.7653, + "step": 12297 + }, + { + "epoch": 0.6768671913699158, + "grad_norm": 0.7445605397224426, + "learning_rate": 7.455321845228051e-06, + "loss": 0.7661, + "step": 12298 + }, + { + "epoch": 0.6769222301722715, + "grad_norm": 0.6862059831619263, + "learning_rate": 7.4549442331364505e-06, + "loss": 0.776, + "step": 12299 + }, + { + "epoch": 0.6769772689746271, + "grad_norm": 0.7030314207077026, + "learning_rate": 7.4545666025941465e-06, + "loss": 0.7393, + "step": 12300 + }, + { + "epoch": 0.6770323077769828, + "grad_norm": 0.6718610525131226, + "learning_rate": 7.454188953603978e-06, + "loss": 0.7375, + "step": 12301 + }, + { + "epoch": 0.6770873465793384, + "grad_norm": 0.6716088652610779, + "learning_rate": 7.453811286168782e-06, + "loss": 0.8021, + "step": 12302 + }, + { + "epoch": 0.6771423853816941, + "grad_norm": 0.8916372656822205, + "learning_rate": 7.453433600291395e-06, + "loss": 0.8274, + "step": 12303 + }, + { + "epoch": 0.6771974241840497, + "grad_norm": 0.7396363615989685, + "learning_rate": 7.45305589597466e-06, + "loss": 0.7892, + "step": 12304 + }, + { + "epoch": 0.6772524629864054, + "grad_norm": 0.8074424862861633, + "learning_rate": 7.452678173221413e-06, + "loss": 0.7586, + "step": 12305 + }, + { + "epoch": 0.6773075017887611, + "grad_norm": 0.6928194165229797, + "learning_rate": 7.452300432034494e-06, + "loss": 0.7914, + "step": 12306 + }, + { + "epoch": 0.6773625405911168, + "grad_norm": 0.7064313292503357, + "learning_rate": 7.451922672416739e-06, + "loss": 0.7948, + "step": 12307 + }, + { + "epoch": 0.6774175793934724, + "grad_norm": 0.6828622221946716, + "learning_rate": 7.451544894370992e-06, + "loss": 0.6723, + "step": 12308 + }, + { + "epoch": 0.677472618195828, + "grad_norm": 0.6794914603233337, + "learning_rate": 7.45116709790009e-06, + "loss": 0.7344, + "step": 12309 + }, + { + "epoch": 0.6775276569981837, + "grad_norm": 0.7643330097198486, + "learning_rate": 7.45078928300687e-06, + "loss": 0.7836, + "step": 12310 + }, + { + "epoch": 0.6775826958005394, + "grad_norm": 0.692569375038147, + "learning_rate": 7.450411449694176e-06, + "loss": 0.7608, + "step": 12311 + }, + { + "epoch": 0.677637734602895, + "grad_norm": 0.7718693614006042, + "learning_rate": 7.4500335979648455e-06, + "loss": 0.7131, + "step": 12312 + }, + { + "epoch": 0.6776927734052507, + "grad_norm": 0.6267405152320862, + "learning_rate": 7.449655727821716e-06, + "loss": 0.7543, + "step": 12313 + }, + { + "epoch": 0.6777478122076064, + "grad_norm": 0.8252732157707214, + "learning_rate": 7.4492778392676325e-06, + "loss": 0.8799, + "step": 12314 + }, + { + "epoch": 0.6778028510099621, + "grad_norm": 0.6310145854949951, + "learning_rate": 7.448899932305429e-06, + "loss": 0.7389, + "step": 12315 + }, + { + "epoch": 0.6778578898123176, + "grad_norm": 0.6115848422050476, + "learning_rate": 7.448522006937951e-06, + "loss": 0.6069, + "step": 12316 + }, + { + "epoch": 0.6779129286146733, + "grad_norm": 0.6809090971946716, + "learning_rate": 7.448144063168038e-06, + "loss": 0.7092, + "step": 12317 + }, + { + "epoch": 0.677967967417029, + "grad_norm": 0.7285470366477966, + "learning_rate": 7.447766100998529e-06, + "loss": 0.714, + "step": 12318 + }, + { + "epoch": 0.6780230062193847, + "grad_norm": 0.6637021899223328, + "learning_rate": 7.447388120432264e-06, + "loss": 0.7247, + "step": 12319 + }, + { + "epoch": 0.6780780450217403, + "grad_norm": 0.7735750675201416, + "learning_rate": 7.447010121472087e-06, + "loss": 0.7616, + "step": 12320 + }, + { + "epoch": 0.678133083824096, + "grad_norm": 0.7643262147903442, + "learning_rate": 7.446632104120836e-06, + "loss": 0.5863, + "step": 12321 + }, + { + "epoch": 0.6781881226264517, + "grad_norm": 0.6957301497459412, + "learning_rate": 7.446254068381352e-06, + "loss": 0.7125, + "step": 12322 + }, + { + "epoch": 0.6782431614288074, + "grad_norm": 0.6573877930641174, + "learning_rate": 7.445876014256479e-06, + "loss": 0.7115, + "step": 12323 + }, + { + "epoch": 0.6782982002311629, + "grad_norm": 0.6507790684700012, + "learning_rate": 7.445497941749056e-06, + "loss": 0.7266, + "step": 12324 + }, + { + "epoch": 0.6783532390335186, + "grad_norm": 0.8314819931983948, + "learning_rate": 7.4451198508619245e-06, + "loss": 0.6902, + "step": 12325 + }, + { + "epoch": 0.6784082778358743, + "grad_norm": 0.6907274127006531, + "learning_rate": 7.444741741597927e-06, + "loss": 0.8253, + "step": 12326 + }, + { + "epoch": 0.67846331663823, + "grad_norm": 0.7311725616455078, + "learning_rate": 7.444363613959904e-06, + "loss": 0.8641, + "step": 12327 + }, + { + "epoch": 0.6785183554405856, + "grad_norm": 0.6690121293067932, + "learning_rate": 7.443985467950701e-06, + "loss": 0.6966, + "step": 12328 + }, + { + "epoch": 0.6785733942429413, + "grad_norm": 0.6444346308708191, + "learning_rate": 7.443607303573155e-06, + "loss": 0.7848, + "step": 12329 + }, + { + "epoch": 0.678628433045297, + "grad_norm": 0.7553900480270386, + "learning_rate": 7.4432291208301125e-06, + "loss": 0.8196, + "step": 12330 + }, + { + "epoch": 0.6786834718476525, + "grad_norm": 0.6393183469772339, + "learning_rate": 7.442850919724411e-06, + "loss": 0.7622, + "step": 12331 + }, + { + "epoch": 0.6787385106500082, + "grad_norm": 0.7045423984527588, + "learning_rate": 7.442472700258898e-06, + "loss": 0.7483, + "step": 12332 + }, + { + "epoch": 0.6787935494523639, + "grad_norm": 0.7536678314208984, + "learning_rate": 7.442094462436414e-06, + "loss": 0.815, + "step": 12333 + }, + { + "epoch": 0.6788485882547196, + "grad_norm": 0.645391047000885, + "learning_rate": 7.441716206259801e-06, + "loss": 0.7394, + "step": 12334 + }, + { + "epoch": 0.6789036270570752, + "grad_norm": 0.8870118260383606, + "learning_rate": 7.441337931731905e-06, + "loss": 0.8076, + "step": 12335 + }, + { + "epoch": 0.6789586658594309, + "grad_norm": 0.6672457456588745, + "learning_rate": 7.440959638855564e-06, + "loss": 0.7573, + "step": 12336 + }, + { + "epoch": 0.6790137046617866, + "grad_norm": 0.7104566693305969, + "learning_rate": 7.440581327633625e-06, + "loss": 0.6855, + "step": 12337 + }, + { + "epoch": 0.6790687434641423, + "grad_norm": 0.7201581001281738, + "learning_rate": 7.4402029980689294e-06, + "loss": 0.7977, + "step": 12338 + }, + { + "epoch": 0.6791237822664978, + "grad_norm": 0.6685218811035156, + "learning_rate": 7.43982465016432e-06, + "loss": 0.8114, + "step": 12339 + }, + { + "epoch": 0.6791788210688535, + "grad_norm": 0.6913738250732422, + "learning_rate": 7.439446283922645e-06, + "loss": 0.7584, + "step": 12340 + }, + { + "epoch": 0.6792338598712092, + "grad_norm": 0.7332273721694946, + "learning_rate": 7.439067899346742e-06, + "loss": 0.7658, + "step": 12341 + }, + { + "epoch": 0.6792888986735649, + "grad_norm": 0.777909517288208, + "learning_rate": 7.438689496439458e-06, + "loss": 0.8064, + "step": 12342 + }, + { + "epoch": 0.6793439374759205, + "grad_norm": 0.7444930076599121, + "learning_rate": 7.438311075203636e-06, + "loss": 0.7896, + "step": 12343 + }, + { + "epoch": 0.6793989762782762, + "grad_norm": 0.7678806781768799, + "learning_rate": 7.4379326356421224e-06, + "loss": 0.8533, + "step": 12344 + }, + { + "epoch": 0.6794540150806319, + "grad_norm": 0.6653377413749695, + "learning_rate": 7.437554177757759e-06, + "loss": 0.7287, + "step": 12345 + }, + { + "epoch": 0.6795090538829875, + "grad_norm": 0.6270567178726196, + "learning_rate": 7.43717570155339e-06, + "loss": 0.6802, + "step": 12346 + }, + { + "epoch": 0.6795640926853431, + "grad_norm": 0.7091223001480103, + "learning_rate": 7.436797207031861e-06, + "loss": 0.7693, + "step": 12347 + }, + { + "epoch": 0.6796191314876988, + "grad_norm": 0.6583104133605957, + "learning_rate": 7.436418694196018e-06, + "loss": 0.7171, + "step": 12348 + }, + { + "epoch": 0.6796741702900545, + "grad_norm": 0.6897410750389099, + "learning_rate": 7.436040163048703e-06, + "loss": 0.7831, + "step": 12349 + }, + { + "epoch": 0.6797292090924102, + "grad_norm": 0.6506269574165344, + "learning_rate": 7.435661613592763e-06, + "loss": 0.8037, + "step": 12350 + }, + { + "epoch": 0.6797842478947658, + "grad_norm": 0.6772280931472778, + "learning_rate": 7.435283045831041e-06, + "loss": 0.8102, + "step": 12351 + }, + { + "epoch": 0.6798392866971215, + "grad_norm": 0.8470273017883301, + "learning_rate": 7.434904459766384e-06, + "loss": 0.7816, + "step": 12352 + }, + { + "epoch": 0.6798943254994771, + "grad_norm": 0.6969698071479797, + "learning_rate": 7.434525855401638e-06, + "loss": 0.6911, + "step": 12353 + }, + { + "epoch": 0.6799493643018328, + "grad_norm": 0.9969611763954163, + "learning_rate": 7.434147232739646e-06, + "loss": 0.7041, + "step": 12354 + }, + { + "epoch": 0.6800044031041884, + "grad_norm": 0.6697688698768616, + "learning_rate": 7.433768591783255e-06, + "loss": 0.6602, + "step": 12355 + }, + { + "epoch": 0.6800594419065441, + "grad_norm": 0.9857928156852722, + "learning_rate": 7.433389932535311e-06, + "loss": 0.6505, + "step": 12356 + }, + { + "epoch": 0.6801144807088998, + "grad_norm": 0.8787727355957031, + "learning_rate": 7.43301125499866e-06, + "loss": 0.7558, + "step": 12357 + }, + { + "epoch": 0.6801695195112555, + "grad_norm": 0.6035268306732178, + "learning_rate": 7.432632559176147e-06, + "loss": 0.6337, + "step": 12358 + }, + { + "epoch": 0.6802245583136111, + "grad_norm": 0.7977258563041687, + "learning_rate": 7.432253845070621e-06, + "loss": 0.7324, + "step": 12359 + }, + { + "epoch": 0.6802795971159667, + "grad_norm": 0.5842836499214172, + "learning_rate": 7.431875112684923e-06, + "loss": 0.677, + "step": 12360 + }, + { + "epoch": 0.6803346359183224, + "grad_norm": 0.7134125828742981, + "learning_rate": 7.431496362021905e-06, + "loss": 0.7034, + "step": 12361 + }, + { + "epoch": 0.6803896747206781, + "grad_norm": 0.7101823091506958, + "learning_rate": 7.431117593084411e-06, + "loss": 0.7526, + "step": 12362 + }, + { + "epoch": 0.6804447135230337, + "grad_norm": 0.6543304920196533, + "learning_rate": 7.4307388058752865e-06, + "loss": 0.7548, + "step": 12363 + }, + { + "epoch": 0.6804997523253894, + "grad_norm": 0.6522945761680603, + "learning_rate": 7.430360000397381e-06, + "loss": 0.7044, + "step": 12364 + }, + { + "epoch": 0.6805547911277451, + "grad_norm": 0.7405091524124146, + "learning_rate": 7.429981176653539e-06, + "loss": 0.8064, + "step": 12365 + }, + { + "epoch": 0.6806098299301008, + "grad_norm": 0.6454355716705322, + "learning_rate": 7.429602334646611e-06, + "loss": 0.7179, + "step": 12366 + }, + { + "epoch": 0.6806648687324564, + "grad_norm": 0.8131621479988098, + "learning_rate": 7.429223474379439e-06, + "loss": 0.7144, + "step": 12367 + }, + { + "epoch": 0.680719907534812, + "grad_norm": 0.7203080058097839, + "learning_rate": 7.428844595854876e-06, + "loss": 0.8189, + "step": 12368 + }, + { + "epoch": 0.6807749463371677, + "grad_norm": 0.650414228439331, + "learning_rate": 7.428465699075767e-06, + "loss": 0.7815, + "step": 12369 + }, + { + "epoch": 0.6808299851395234, + "grad_norm": 0.8152775168418884, + "learning_rate": 7.42808678404496e-06, + "loss": 0.7365, + "step": 12370 + }, + { + "epoch": 0.680885023941879, + "grad_norm": 0.5871601700782776, + "learning_rate": 7.427707850765302e-06, + "loss": 0.6804, + "step": 12371 + }, + { + "epoch": 0.6809400627442347, + "grad_norm": 0.7115684747695923, + "learning_rate": 7.427328899239643e-06, + "loss": 0.728, + "step": 12372 + }, + { + "epoch": 0.6809951015465904, + "grad_norm": 0.6575615406036377, + "learning_rate": 7.426949929470828e-06, + "loss": 0.725, + "step": 12373 + }, + { + "epoch": 0.681050140348946, + "grad_norm": 0.7744095325469971, + "learning_rate": 7.426570941461708e-06, + "loss": 0.7647, + "step": 12374 + }, + { + "epoch": 0.6811051791513016, + "grad_norm": 0.6856220364570618, + "learning_rate": 7.4261919352151305e-06, + "loss": 0.8121, + "step": 12375 + }, + { + "epoch": 0.6811602179536573, + "grad_norm": 0.8197830319404602, + "learning_rate": 7.425812910733943e-06, + "loss": 0.8685, + "step": 12376 + }, + { + "epoch": 0.681215256756013, + "grad_norm": 1.240628719329834, + "learning_rate": 7.425433868020996e-06, + "loss": 0.8063, + "step": 12377 + }, + { + "epoch": 0.6812702955583686, + "grad_norm": 0.8716747760772705, + "learning_rate": 7.425054807079136e-06, + "loss": 0.7384, + "step": 12378 + }, + { + "epoch": 0.6813253343607243, + "grad_norm": 0.7512598037719727, + "learning_rate": 7.4246757279112135e-06, + "loss": 0.7428, + "step": 12379 + }, + { + "epoch": 0.68138037316308, + "grad_norm": 0.7002312541007996, + "learning_rate": 7.424296630520078e-06, + "loss": 0.6066, + "step": 12380 + }, + { + "epoch": 0.6814354119654357, + "grad_norm": 0.6422720551490784, + "learning_rate": 7.423917514908578e-06, + "loss": 0.6645, + "step": 12381 + }, + { + "epoch": 0.6814904507677912, + "grad_norm": 0.8667505383491516, + "learning_rate": 7.423538381079562e-06, + "loss": 0.8663, + "step": 12382 + }, + { + "epoch": 0.6815454895701469, + "grad_norm": 0.7045377492904663, + "learning_rate": 7.423159229035881e-06, + "loss": 0.7684, + "step": 12383 + }, + { + "epoch": 0.6816005283725026, + "grad_norm": 0.7663894295692444, + "learning_rate": 7.422780058780385e-06, + "loss": 0.8051, + "step": 12384 + }, + { + "epoch": 0.6816555671748583, + "grad_norm": 0.7612582445144653, + "learning_rate": 7.42240087031592e-06, + "loss": 0.7771, + "step": 12385 + }, + { + "epoch": 0.6817106059772139, + "grad_norm": 0.8682271838188171, + "learning_rate": 7.42202166364534e-06, + "loss": 0.7761, + "step": 12386 + }, + { + "epoch": 0.6817656447795696, + "grad_norm": 0.712204098701477, + "learning_rate": 7.421642438771492e-06, + "loss": 0.7832, + "step": 12387 + }, + { + "epoch": 0.6818206835819253, + "grad_norm": 0.6726338863372803, + "learning_rate": 7.42126319569723e-06, + "loss": 0.7541, + "step": 12388 + }, + { + "epoch": 0.681875722384281, + "grad_norm": 0.647570788860321, + "learning_rate": 7.420883934425401e-06, + "loss": 0.7281, + "step": 12389 + }, + { + "epoch": 0.6819307611866365, + "grad_norm": 0.7058577537536621, + "learning_rate": 7.420504654958857e-06, + "loss": 0.8315, + "step": 12390 + }, + { + "epoch": 0.6819857999889922, + "grad_norm": 0.6683655977249146, + "learning_rate": 7.420125357300446e-06, + "loss": 0.772, + "step": 12391 + }, + { + "epoch": 0.6820408387913479, + "grad_norm": 0.6768681406974792, + "learning_rate": 7.419746041453022e-06, + "loss": 0.7023, + "step": 12392 + }, + { + "epoch": 0.6820958775937036, + "grad_norm": 0.8037514686584473, + "learning_rate": 7.419366707419434e-06, + "loss": 0.6894, + "step": 12393 + }, + { + "epoch": 0.6821509163960592, + "grad_norm": 0.6510934829711914, + "learning_rate": 7.418987355202534e-06, + "loss": 0.6411, + "step": 12394 + }, + { + "epoch": 0.6822059551984149, + "grad_norm": 0.7628617882728577, + "learning_rate": 7.418607984805173e-06, + "loss": 0.7681, + "step": 12395 + }, + { + "epoch": 0.6822609940007706, + "grad_norm": 0.7146260738372803, + "learning_rate": 7.418228596230201e-06, + "loss": 0.7003, + "step": 12396 + }, + { + "epoch": 0.6823160328031262, + "grad_norm": 0.6208338737487793, + "learning_rate": 7.41784918948047e-06, + "loss": 0.7138, + "step": 12397 + }, + { + "epoch": 0.6823710716054818, + "grad_norm": 0.7859066724777222, + "learning_rate": 7.417469764558832e-06, + "loss": 0.7984, + "step": 12398 + }, + { + "epoch": 0.6824261104078375, + "grad_norm": 0.7636224031448364, + "learning_rate": 7.417090321468138e-06, + "loss": 0.7445, + "step": 12399 + }, + { + "epoch": 0.6824811492101932, + "grad_norm": 0.9071671366691589, + "learning_rate": 7.41671086021124e-06, + "loss": 0.8058, + "step": 12400 + }, + { + "epoch": 0.6825361880125489, + "grad_norm": 0.5986278057098389, + "learning_rate": 7.416331380790991e-06, + "loss": 0.7001, + "step": 12401 + }, + { + "epoch": 0.6825912268149045, + "grad_norm": 0.6812893152236938, + "learning_rate": 7.415951883210242e-06, + "loss": 0.7745, + "step": 12402 + }, + { + "epoch": 0.6826462656172602, + "grad_norm": 0.666362464427948, + "learning_rate": 7.415572367471844e-06, + "loss": 0.7861, + "step": 12403 + }, + { + "epoch": 0.6827013044196159, + "grad_norm": 0.6963029503822327, + "learning_rate": 7.415192833578653e-06, + "loss": 0.7657, + "step": 12404 + }, + { + "epoch": 0.6827563432219715, + "grad_norm": 0.669876217842102, + "learning_rate": 7.414813281533517e-06, + "loss": 0.6441, + "step": 12405 + }, + { + "epoch": 0.6828113820243271, + "grad_norm": 0.6608602404594421, + "learning_rate": 7.414433711339293e-06, + "loss": 0.7203, + "step": 12406 + }, + { + "epoch": 0.6828664208266828, + "grad_norm": 0.7262642979621887, + "learning_rate": 7.41405412299883e-06, + "loss": 0.7842, + "step": 12407 + }, + { + "epoch": 0.6829214596290385, + "grad_norm": 0.7728527188301086, + "learning_rate": 7.413674516514983e-06, + "loss": 0.7551, + "step": 12408 + }, + { + "epoch": 0.6829764984313942, + "grad_norm": 0.7970840930938721, + "learning_rate": 7.4132948918906035e-06, + "loss": 0.8181, + "step": 12409 + }, + { + "epoch": 0.6830315372337498, + "grad_norm": 0.6672868728637695, + "learning_rate": 7.412915249128546e-06, + "loss": 0.7201, + "step": 12410 + }, + { + "epoch": 0.6830865760361055, + "grad_norm": 0.8261075019836426, + "learning_rate": 7.412535588231664e-06, + "loss": 0.6006, + "step": 12411 + }, + { + "epoch": 0.6831416148384611, + "grad_norm": 0.6768019795417786, + "learning_rate": 7.412155909202809e-06, + "loss": 0.7326, + "step": 12412 + }, + { + "epoch": 0.6831966536408168, + "grad_norm": 0.7482851147651672, + "learning_rate": 7.4117762120448364e-06, + "loss": 0.7913, + "step": 12413 + }, + { + "epoch": 0.6832516924431724, + "grad_norm": 0.7315956354141235, + "learning_rate": 7.411396496760601e-06, + "loss": 0.7949, + "step": 12414 + }, + { + "epoch": 0.6833067312455281, + "grad_norm": 0.7460561394691467, + "learning_rate": 7.411016763352954e-06, + "loss": 0.8445, + "step": 12415 + }, + { + "epoch": 0.6833617700478838, + "grad_norm": 0.7025588154792786, + "learning_rate": 7.410637011824749e-06, + "loss": 0.7658, + "step": 12416 + }, + { + "epoch": 0.6834168088502394, + "grad_norm": 0.7507885694503784, + "learning_rate": 7.410257242178842e-06, + "loss": 0.711, + "step": 12417 + }, + { + "epoch": 0.683471847652595, + "grad_norm": 0.6935780048370361, + "learning_rate": 7.409877454418088e-06, + "loss": 0.8376, + "step": 12418 + }, + { + "epoch": 0.6835268864549507, + "grad_norm": 0.7747789025306702, + "learning_rate": 7.409497648545341e-06, + "loss": 0.8173, + "step": 12419 + }, + { + "epoch": 0.6835819252573064, + "grad_norm": 0.6559001803398132, + "learning_rate": 7.4091178245634525e-06, + "loss": 0.7146, + "step": 12420 + }, + { + "epoch": 0.683636964059662, + "grad_norm": 0.7123926877975464, + "learning_rate": 7.408737982475279e-06, + "loss": 0.7544, + "step": 12421 + }, + { + "epoch": 0.6836920028620177, + "grad_norm": 0.8163334131240845, + "learning_rate": 7.408358122283678e-06, + "loss": 0.8008, + "step": 12422 + }, + { + "epoch": 0.6837470416643734, + "grad_norm": 0.6837686896324158, + "learning_rate": 7.4079782439915e-06, + "loss": 0.6595, + "step": 12423 + }, + { + "epoch": 0.6838020804667291, + "grad_norm": 0.9385979175567627, + "learning_rate": 7.407598347601601e-06, + "loss": 0.8135, + "step": 12424 + }, + { + "epoch": 0.6838571192690847, + "grad_norm": 0.7197830677032471, + "learning_rate": 7.407218433116839e-06, + "loss": 0.8401, + "step": 12425 + }, + { + "epoch": 0.6839121580714403, + "grad_norm": 0.7165716290473938, + "learning_rate": 7.406838500540069e-06, + "loss": 0.7864, + "step": 12426 + }, + { + "epoch": 0.683967196873796, + "grad_norm": 0.6844950318336487, + "learning_rate": 7.4064585498741435e-06, + "loss": 0.7409, + "step": 12427 + }, + { + "epoch": 0.6840222356761517, + "grad_norm": 0.6237946152687073, + "learning_rate": 7.40607858112192e-06, + "loss": 0.6915, + "step": 12428 + }, + { + "epoch": 0.6840772744785073, + "grad_norm": 0.7437137365341187, + "learning_rate": 7.405698594286252e-06, + "loss": 0.8191, + "step": 12429 + }, + { + "epoch": 0.684132313280863, + "grad_norm": 0.6956225633621216, + "learning_rate": 7.4053185893700006e-06, + "loss": 0.7662, + "step": 12430 + }, + { + "epoch": 0.6841873520832187, + "grad_norm": 0.6508380174636841, + "learning_rate": 7.404938566376018e-06, + "loss": 0.7758, + "step": 12431 + }, + { + "epoch": 0.6842423908855744, + "grad_norm": 0.6759025454521179, + "learning_rate": 7.404558525307159e-06, + "loss": 0.7713, + "step": 12432 + }, + { + "epoch": 0.68429742968793, + "grad_norm": 0.7280172109603882, + "learning_rate": 7.404178466166283e-06, + "loss": 0.7753, + "step": 12433 + }, + { + "epoch": 0.6843524684902856, + "grad_norm": 0.7599073052406311, + "learning_rate": 7.403798388956245e-06, + "loss": 0.6993, + "step": 12434 + }, + { + "epoch": 0.6844075072926413, + "grad_norm": 0.7962353229522705, + "learning_rate": 7.403418293679903e-06, + "loss": 0.771, + "step": 12435 + }, + { + "epoch": 0.684462546094997, + "grad_norm": 0.6714458465576172, + "learning_rate": 7.40303818034011e-06, + "loss": 0.7077, + "step": 12436 + }, + { + "epoch": 0.6845175848973526, + "grad_norm": 0.6770713925361633, + "learning_rate": 7.402658048939726e-06, + "loss": 0.7695, + "step": 12437 + }, + { + "epoch": 0.6845726236997083, + "grad_norm": 0.7337867617607117, + "learning_rate": 7.402277899481608e-06, + "loss": 0.9453, + "step": 12438 + }, + { + "epoch": 0.684627662502064, + "grad_norm": 0.7457698583602905, + "learning_rate": 7.401897731968612e-06, + "loss": 0.7569, + "step": 12439 + }, + { + "epoch": 0.6846827013044197, + "grad_norm": 0.6683285236358643, + "learning_rate": 7.401517546403595e-06, + "loss": 0.7215, + "step": 12440 + }, + { + "epoch": 0.6847377401067752, + "grad_norm": 0.6516628861427307, + "learning_rate": 7.401137342789415e-06, + "loss": 0.7433, + "step": 12441 + }, + { + "epoch": 0.6847927789091309, + "grad_norm": 0.7572295665740967, + "learning_rate": 7.400757121128932e-06, + "loss": 0.7204, + "step": 12442 + }, + { + "epoch": 0.6848478177114866, + "grad_norm": 0.6884106993675232, + "learning_rate": 7.400376881425e-06, + "loss": 0.6766, + "step": 12443 + }, + { + "epoch": 0.6849028565138423, + "grad_norm": 0.798926591873169, + "learning_rate": 7.399996623680475e-06, + "loss": 0.7673, + "step": 12444 + }, + { + "epoch": 0.6849578953161979, + "grad_norm": 0.7200846672058105, + "learning_rate": 7.399616347898221e-06, + "loss": 0.8032, + "step": 12445 + }, + { + "epoch": 0.6850129341185536, + "grad_norm": 0.7085461020469666, + "learning_rate": 7.3992360540810915e-06, + "loss": 0.7075, + "step": 12446 + }, + { + "epoch": 0.6850679729209093, + "grad_norm": 0.6885339021682739, + "learning_rate": 7.398855742231947e-06, + "loss": 0.7278, + "step": 12447 + }, + { + "epoch": 0.685123011723265, + "grad_norm": 0.6693943738937378, + "learning_rate": 7.398475412353643e-06, + "loss": 0.7134, + "step": 12448 + }, + { + "epoch": 0.6851780505256205, + "grad_norm": 0.6908173561096191, + "learning_rate": 7.398095064449041e-06, + "loss": 0.8054, + "step": 12449 + }, + { + "epoch": 0.6852330893279762, + "grad_norm": 0.6207892894744873, + "learning_rate": 7.397714698520999e-06, + "loss": 0.5789, + "step": 12450 + }, + { + "epoch": 0.6852881281303319, + "grad_norm": 0.8367832899093628, + "learning_rate": 7.397334314572374e-06, + "loss": 0.8186, + "step": 12451 + }, + { + "epoch": 0.6853431669326876, + "grad_norm": 0.7005738615989685, + "learning_rate": 7.396953912606026e-06, + "loss": 0.8177, + "step": 12452 + }, + { + "epoch": 0.6853982057350432, + "grad_norm": 0.7189906239509583, + "learning_rate": 7.396573492624814e-06, + "loss": 0.8387, + "step": 12453 + }, + { + "epoch": 0.6854532445373989, + "grad_norm": 1.040576457977295, + "learning_rate": 7.3961930546315995e-06, + "loss": 0.7165, + "step": 12454 + }, + { + "epoch": 0.6855082833397546, + "grad_norm": 0.6417170166969299, + "learning_rate": 7.3958125986292385e-06, + "loss": 0.6671, + "step": 12455 + }, + { + "epoch": 0.6855633221421102, + "grad_norm": 0.6443242430686951, + "learning_rate": 7.395432124620589e-06, + "loss": 0.6995, + "step": 12456 + }, + { + "epoch": 0.6856183609444658, + "grad_norm": 0.5764951705932617, + "learning_rate": 7.395051632608516e-06, + "loss": 0.6088, + "step": 12457 + }, + { + "epoch": 0.6856733997468215, + "grad_norm": 0.6193686127662659, + "learning_rate": 7.394671122595873e-06, + "loss": 0.7283, + "step": 12458 + }, + { + "epoch": 0.6857284385491772, + "grad_norm": 0.6773817539215088, + "learning_rate": 7.394290594585525e-06, + "loss": 0.8204, + "step": 12459 + }, + { + "epoch": 0.6857834773515328, + "grad_norm": 0.7906570434570312, + "learning_rate": 7.393910048580328e-06, + "loss": 0.7057, + "step": 12460 + }, + { + "epoch": 0.6858385161538885, + "grad_norm": 0.7544124126434326, + "learning_rate": 7.393529484583145e-06, + "loss": 0.8053, + "step": 12461 + }, + { + "epoch": 0.6858935549562442, + "grad_norm": 0.6878008842468262, + "learning_rate": 7.3931489025968365e-06, + "loss": 0.6972, + "step": 12462 + }, + { + "epoch": 0.6859485937585998, + "grad_norm": 0.6734861731529236, + "learning_rate": 7.392768302624259e-06, + "loss": 0.7921, + "step": 12463 + }, + { + "epoch": 0.6860036325609554, + "grad_norm": 0.6845618486404419, + "learning_rate": 7.392387684668276e-06, + "loss": 0.7461, + "step": 12464 + }, + { + "epoch": 0.6860586713633111, + "grad_norm": 0.6362663507461548, + "learning_rate": 7.392007048731748e-06, + "loss": 0.7108, + "step": 12465 + }, + { + "epoch": 0.6861137101656668, + "grad_norm": 0.7441046237945557, + "learning_rate": 7.391626394817537e-06, + "loss": 0.6944, + "step": 12466 + }, + { + "epoch": 0.6861687489680225, + "grad_norm": 1.0933935642242432, + "learning_rate": 7.391245722928501e-06, + "loss": 0.7744, + "step": 12467 + }, + { + "epoch": 0.6862237877703781, + "grad_norm": 0.6531348824501038, + "learning_rate": 7.3908650330675e-06, + "loss": 0.6772, + "step": 12468 + }, + { + "epoch": 0.6862788265727338, + "grad_norm": 0.7533715963363647, + "learning_rate": 7.390484325237399e-06, + "loss": 0.7385, + "step": 12469 + }, + { + "epoch": 0.6863338653750894, + "grad_norm": 0.618679940700531, + "learning_rate": 7.390103599441058e-06, + "loss": 0.6053, + "step": 12470 + }, + { + "epoch": 0.6863889041774451, + "grad_norm": 0.7102347612380981, + "learning_rate": 7.389722855681338e-06, + "loss": 0.7246, + "step": 12471 + }, + { + "epoch": 0.6864439429798007, + "grad_norm": 0.8545061945915222, + "learning_rate": 7.3893420939611e-06, + "loss": 0.7386, + "step": 12472 + }, + { + "epoch": 0.6864989817821564, + "grad_norm": 0.6298168897628784, + "learning_rate": 7.388961314283207e-06, + "loss": 0.6573, + "step": 12473 + }, + { + "epoch": 0.6865540205845121, + "grad_norm": 0.6909272074699402, + "learning_rate": 7.388580516650521e-06, + "loss": 0.7973, + "step": 12474 + }, + { + "epoch": 0.6866090593868678, + "grad_norm": 0.6782366037368774, + "learning_rate": 7.388199701065904e-06, + "loss": 0.7437, + "step": 12475 + }, + { + "epoch": 0.6866640981892234, + "grad_norm": 0.6826187372207642, + "learning_rate": 7.387818867532213e-06, + "loss": 0.6254, + "step": 12476 + }, + { + "epoch": 0.686719136991579, + "grad_norm": 0.7471422553062439, + "learning_rate": 7.387438016052318e-06, + "loss": 0.8668, + "step": 12477 + }, + { + "epoch": 0.6867741757939347, + "grad_norm": 0.7987646460533142, + "learning_rate": 7.38705714662908e-06, + "loss": 0.6759, + "step": 12478 + }, + { + "epoch": 0.6868292145962904, + "grad_norm": 0.7318877577781677, + "learning_rate": 7.386676259265356e-06, + "loss": 0.7167, + "step": 12479 + }, + { + "epoch": 0.686884253398646, + "grad_norm": 0.6655439138412476, + "learning_rate": 7.386295353964013e-06, + "loss": 0.7184, + "step": 12480 + }, + { + "epoch": 0.6869392922010017, + "grad_norm": 0.7323878407478333, + "learning_rate": 7.385914430727912e-06, + "loss": 0.7562, + "step": 12481 + }, + { + "epoch": 0.6869943310033574, + "grad_norm": 0.7813006639480591, + "learning_rate": 7.385533489559918e-06, + "loss": 0.7665, + "step": 12482 + }, + { + "epoch": 0.6870493698057131, + "grad_norm": 0.6889718770980835, + "learning_rate": 7.385152530462894e-06, + "loss": 0.6587, + "step": 12483 + }, + { + "epoch": 0.6871044086080687, + "grad_norm": 0.6930332183837891, + "learning_rate": 7.384771553439698e-06, + "loss": 0.8244, + "step": 12484 + }, + { + "epoch": 0.6871594474104243, + "grad_norm": 0.8294679522514343, + "learning_rate": 7.384390558493201e-06, + "loss": 0.6977, + "step": 12485 + }, + { + "epoch": 0.68721448621278, + "grad_norm": 0.7235204577445984, + "learning_rate": 7.384009545626262e-06, + "loss": 0.7946, + "step": 12486 + }, + { + "epoch": 0.6872695250151357, + "grad_norm": 0.6346727609634399, + "learning_rate": 7.3836285148417456e-06, + "loss": 0.6109, + "step": 12487 + }, + { + "epoch": 0.6873245638174913, + "grad_norm": 0.7168872356414795, + "learning_rate": 7.383247466142513e-06, + "loss": 0.7485, + "step": 12488 + }, + { + "epoch": 0.687379602619847, + "grad_norm": 0.6511938571929932, + "learning_rate": 7.382866399531434e-06, + "loss": 0.8048, + "step": 12489 + }, + { + "epoch": 0.6874346414222027, + "grad_norm": 0.7569704651832581, + "learning_rate": 7.3824853150113674e-06, + "loss": 0.8017, + "step": 12490 + }, + { + "epoch": 0.6874896802245584, + "grad_norm": 0.7708210945129395, + "learning_rate": 7.382104212585178e-06, + "loss": 0.7258, + "step": 12491 + }, + { + "epoch": 0.6875447190269139, + "grad_norm": 0.709702730178833, + "learning_rate": 7.381723092255731e-06, + "loss": 0.7707, + "step": 12492 + }, + { + "epoch": 0.6875997578292696, + "grad_norm": 0.6683183908462524, + "learning_rate": 7.381341954025892e-06, + "loss": 0.702, + "step": 12493 + }, + { + "epoch": 0.6876547966316253, + "grad_norm": 0.7639274597167969, + "learning_rate": 7.380960797898524e-06, + "loss": 0.7027, + "step": 12494 + }, + { + "epoch": 0.687709835433981, + "grad_norm": 0.6735698580741882, + "learning_rate": 7.380579623876492e-06, + "loss": 0.7124, + "step": 12495 + }, + { + "epoch": 0.6877648742363366, + "grad_norm": 0.6635340452194214, + "learning_rate": 7.38019843196266e-06, + "loss": 0.6968, + "step": 12496 + }, + { + "epoch": 0.6878199130386923, + "grad_norm": 0.7459729313850403, + "learning_rate": 7.379817222159895e-06, + "loss": 0.7629, + "step": 12497 + }, + { + "epoch": 0.687874951841048, + "grad_norm": 0.7408778667449951, + "learning_rate": 7.37943599447106e-06, + "loss": 0.8327, + "step": 12498 + }, + { + "epoch": 0.6879299906434037, + "grad_norm": 0.659736156463623, + "learning_rate": 7.379054748899021e-06, + "loss": 0.6746, + "step": 12499 + }, + { + "epoch": 0.6879850294457592, + "grad_norm": 0.7429264783859253, + "learning_rate": 7.3786734854466435e-06, + "loss": 0.8555, + "step": 12500 + }, + { + "epoch": 0.6880400682481149, + "grad_norm": 0.7492697834968567, + "learning_rate": 7.378292204116793e-06, + "loss": 0.7825, + "step": 12501 + }, + { + "epoch": 0.6880951070504706, + "grad_norm": 0.6664871573448181, + "learning_rate": 7.377910904912336e-06, + "loss": 0.7343, + "step": 12502 + }, + { + "epoch": 0.6881501458528262, + "grad_norm": 0.8010555505752563, + "learning_rate": 7.377529587836135e-06, + "loss": 0.6789, + "step": 12503 + }, + { + "epoch": 0.6882051846551819, + "grad_norm": 0.6339166164398193, + "learning_rate": 7.3771482528910585e-06, + "loss": 0.7471, + "step": 12504 + }, + { + "epoch": 0.6882602234575376, + "grad_norm": 0.6750906109809875, + "learning_rate": 7.376766900079973e-06, + "loss": 0.665, + "step": 12505 + }, + { + "epoch": 0.6883152622598933, + "grad_norm": 0.6440090537071228, + "learning_rate": 7.376385529405743e-06, + "loss": 0.6804, + "step": 12506 + }, + { + "epoch": 0.6883703010622488, + "grad_norm": 0.7159061431884766, + "learning_rate": 7.376004140871236e-06, + "loss": 0.7524, + "step": 12507 + }, + { + "epoch": 0.6884253398646045, + "grad_norm": 0.7551491260528564, + "learning_rate": 7.375622734479316e-06, + "loss": 0.891, + "step": 12508 + }, + { + "epoch": 0.6884803786669602, + "grad_norm": 0.6584289073944092, + "learning_rate": 7.375241310232854e-06, + "loss": 0.7313, + "step": 12509 + }, + { + "epoch": 0.6885354174693159, + "grad_norm": 0.7616147398948669, + "learning_rate": 7.374859868134713e-06, + "loss": 0.8351, + "step": 12510 + }, + { + "epoch": 0.6885904562716715, + "grad_norm": 0.669541597366333, + "learning_rate": 7.374478408187761e-06, + "loss": 0.6836, + "step": 12511 + }, + { + "epoch": 0.6886454950740272, + "grad_norm": 0.6483158469200134, + "learning_rate": 7.374096930394864e-06, + "loss": 0.6909, + "step": 12512 + }, + { + "epoch": 0.6887005338763829, + "grad_norm": 0.7079604864120483, + "learning_rate": 7.3737154347588925e-06, + "loss": 0.7151, + "step": 12513 + }, + { + "epoch": 0.6887555726787385, + "grad_norm": 0.6805073618888855, + "learning_rate": 7.373333921282709e-06, + "loss": 0.7761, + "step": 12514 + }, + { + "epoch": 0.6888106114810941, + "grad_norm": 0.757008969783783, + "learning_rate": 7.372952389969183e-06, + "loss": 0.7249, + "step": 12515 + }, + { + "epoch": 0.6888656502834498, + "grad_norm": 0.6990587711334229, + "learning_rate": 7.372570840821183e-06, + "loss": 0.7463, + "step": 12516 + }, + { + "epoch": 0.6889206890858055, + "grad_norm": 0.7405683398246765, + "learning_rate": 7.3721892738415745e-06, + "loss": 0.8039, + "step": 12517 + }, + { + "epoch": 0.6889757278881612, + "grad_norm": 0.6736571192741394, + "learning_rate": 7.371807689033228e-06, + "loss": 0.7084, + "step": 12518 + }, + { + "epoch": 0.6890307666905168, + "grad_norm": 0.752955436706543, + "learning_rate": 7.3714260863990095e-06, + "loss": 0.7951, + "step": 12519 + }, + { + "epoch": 0.6890858054928725, + "grad_norm": 0.6810917258262634, + "learning_rate": 7.3710444659417855e-06, + "loss": 0.7884, + "step": 12520 + }, + { + "epoch": 0.6891408442952281, + "grad_norm": 0.727500855922699, + "learning_rate": 7.370662827664427e-06, + "loss": 0.7617, + "step": 12521 + }, + { + "epoch": 0.6891958830975838, + "grad_norm": 0.6739845871925354, + "learning_rate": 7.3702811715698016e-06, + "loss": 0.6831, + "step": 12522 + }, + { + "epoch": 0.6892509218999394, + "grad_norm": 0.850913941860199, + "learning_rate": 7.369899497660779e-06, + "loss": 0.7658, + "step": 12523 + }, + { + "epoch": 0.6893059607022951, + "grad_norm": 0.7352884411811829, + "learning_rate": 7.369517805940223e-06, + "loss": 0.7748, + "step": 12524 + }, + { + "epoch": 0.6893609995046508, + "grad_norm": 0.6702300310134888, + "learning_rate": 7.369136096411008e-06, + "loss": 0.7557, + "step": 12525 + }, + { + "epoch": 0.6894160383070065, + "grad_norm": 0.7117186784744263, + "learning_rate": 7.368754369075999e-06, + "loss": 0.8147, + "step": 12526 + }, + { + "epoch": 0.6894710771093621, + "grad_norm": 0.6896687746047974, + "learning_rate": 7.368372623938067e-06, + "loss": 0.7753, + "step": 12527 + }, + { + "epoch": 0.6895261159117178, + "grad_norm": 0.669207751750946, + "learning_rate": 7.367990861000078e-06, + "loss": 0.739, + "step": 12528 + }, + { + "epoch": 0.6895811547140734, + "grad_norm": 0.7014279961585999, + "learning_rate": 7.367609080264906e-06, + "loss": 0.7712, + "step": 12529 + }, + { + "epoch": 0.6896361935164291, + "grad_norm": 1.0029237270355225, + "learning_rate": 7.367227281735418e-06, + "loss": 0.7641, + "step": 12530 + }, + { + "epoch": 0.6896912323187847, + "grad_norm": 0.6342340707778931, + "learning_rate": 7.3668454654144824e-06, + "loss": 0.7572, + "step": 12531 + }, + { + "epoch": 0.6897462711211404, + "grad_norm": 0.7475802302360535, + "learning_rate": 7.3664636313049696e-06, + "loss": 0.7969, + "step": 12532 + }, + { + "epoch": 0.6898013099234961, + "grad_norm": 0.7478888630867004, + "learning_rate": 7.36608177940975e-06, + "loss": 0.8299, + "step": 12533 + }, + { + "epoch": 0.6898563487258518, + "grad_norm": 0.7017174363136292, + "learning_rate": 7.365699909731694e-06, + "loss": 0.6608, + "step": 12534 + }, + { + "epoch": 0.6899113875282074, + "grad_norm": 0.7259606122970581, + "learning_rate": 7.3653180222736695e-06, + "loss": 0.7088, + "step": 12535 + }, + { + "epoch": 0.689966426330563, + "grad_norm": 0.7049521207809448, + "learning_rate": 7.364936117038548e-06, + "loss": 0.8177, + "step": 12536 + }, + { + "epoch": 0.6900214651329187, + "grad_norm": 0.6557304263114929, + "learning_rate": 7.364554194029201e-06, + "loss": 0.73, + "step": 12537 + }, + { + "epoch": 0.6900765039352744, + "grad_norm": 0.704140305519104, + "learning_rate": 7.364172253248497e-06, + "loss": 0.7671, + "step": 12538 + }, + { + "epoch": 0.69013154273763, + "grad_norm": 0.6879541873931885, + "learning_rate": 7.3637902946993064e-06, + "loss": 0.6707, + "step": 12539 + }, + { + "epoch": 0.6901865815399857, + "grad_norm": 0.7715931534767151, + "learning_rate": 7.363408318384501e-06, + "loss": 0.7494, + "step": 12540 + }, + { + "epoch": 0.6902416203423414, + "grad_norm": 0.7890990972518921, + "learning_rate": 7.363026324306952e-06, + "loss": 0.7499, + "step": 12541 + }, + { + "epoch": 0.6902966591446971, + "grad_norm": 0.7177792191505432, + "learning_rate": 7.362644312469529e-06, + "loss": 0.8053, + "step": 12542 + }, + { + "epoch": 0.6903516979470526, + "grad_norm": 0.7434332370758057, + "learning_rate": 7.3622622828751044e-06, + "loss": 0.7371, + "step": 12543 + }, + { + "epoch": 0.6904067367494083, + "grad_norm": 0.5836912989616394, + "learning_rate": 7.361880235526547e-06, + "loss": 0.6681, + "step": 12544 + }, + { + "epoch": 0.690461775551764, + "grad_norm": 0.6814625263214111, + "learning_rate": 7.3614981704267315e-06, + "loss": 0.7408, + "step": 12545 + }, + { + "epoch": 0.6905168143541196, + "grad_norm": 0.6524162292480469, + "learning_rate": 7.361116087578528e-06, + "loss": 0.6788, + "step": 12546 + }, + { + "epoch": 0.6905718531564753, + "grad_norm": 0.6614788174629211, + "learning_rate": 7.360733986984808e-06, + "loss": 0.75, + "step": 12547 + }, + { + "epoch": 0.690626891958831, + "grad_norm": 1.035152792930603, + "learning_rate": 7.360351868648442e-06, + "loss": 0.7181, + "step": 12548 + }, + { + "epoch": 0.6906819307611867, + "grad_norm": 0.7525657415390015, + "learning_rate": 7.359969732572305e-06, + "loss": 0.8149, + "step": 12549 + }, + { + "epoch": 0.6907369695635422, + "grad_norm": 0.8323431015014648, + "learning_rate": 7.359587578759267e-06, + "loss": 0.6908, + "step": 12550 + }, + { + "epoch": 0.6907920083658979, + "grad_norm": 0.7551344633102417, + "learning_rate": 7.3592054072122e-06, + "loss": 0.794, + "step": 12551 + }, + { + "epoch": 0.6908470471682536, + "grad_norm": 0.5937384366989136, + "learning_rate": 7.358823217933977e-06, + "loss": 0.6532, + "step": 12552 + }, + { + "epoch": 0.6909020859706093, + "grad_norm": 1.5515329837799072, + "learning_rate": 7.358441010927468e-06, + "loss": 0.7003, + "step": 12553 + }, + { + "epoch": 0.6909571247729649, + "grad_norm": 0.6838175654411316, + "learning_rate": 7.3580587861955495e-06, + "loss": 0.7184, + "step": 12554 + }, + { + "epoch": 0.6910121635753206, + "grad_norm": 0.7055354714393616, + "learning_rate": 7.357676543741092e-06, + "loss": 0.8372, + "step": 12555 + }, + { + "epoch": 0.6910672023776763, + "grad_norm": 0.8683249950408936, + "learning_rate": 7.3572942835669695e-06, + "loss": 0.7594, + "step": 12556 + }, + { + "epoch": 0.691122241180032, + "grad_norm": 0.8586179614067078, + "learning_rate": 7.3569120056760535e-06, + "loss": 0.8422, + "step": 12557 + }, + { + "epoch": 0.6911772799823875, + "grad_norm": 0.692132830619812, + "learning_rate": 7.356529710071217e-06, + "loss": 0.7872, + "step": 12558 + }, + { + "epoch": 0.6912323187847432, + "grad_norm": 0.7342404723167419, + "learning_rate": 7.356147396755335e-06, + "loss": 0.6908, + "step": 12559 + }, + { + "epoch": 0.6912873575870989, + "grad_norm": 0.6941357254981995, + "learning_rate": 7.35576506573128e-06, + "loss": 0.608, + "step": 12560 + }, + { + "epoch": 0.6913423963894546, + "grad_norm": 0.648225724697113, + "learning_rate": 7.355382717001925e-06, + "loss": 0.6923, + "step": 12561 + }, + { + "epoch": 0.6913974351918102, + "grad_norm": 0.6735422015190125, + "learning_rate": 7.355000350570144e-06, + "loss": 0.7502, + "step": 12562 + }, + { + "epoch": 0.6914524739941659, + "grad_norm": 0.8507662415504456, + "learning_rate": 7.3546179664388105e-06, + "loss": 0.7883, + "step": 12563 + }, + { + "epoch": 0.6915075127965216, + "grad_norm": 0.7287268042564392, + "learning_rate": 7.3542355646108e-06, + "loss": 0.8687, + "step": 12564 + }, + { + "epoch": 0.6915625515988773, + "grad_norm": 0.6085666418075562, + "learning_rate": 7.353853145088983e-06, + "loss": 0.6675, + "step": 12565 + }, + { + "epoch": 0.6916175904012328, + "grad_norm": 0.727668046951294, + "learning_rate": 7.353470707876237e-06, + "loss": 0.8591, + "step": 12566 + }, + { + "epoch": 0.6916726292035885, + "grad_norm": 0.724846601486206, + "learning_rate": 7.353088252975436e-06, + "loss": 0.8501, + "step": 12567 + }, + { + "epoch": 0.6917276680059442, + "grad_norm": 0.6801046133041382, + "learning_rate": 7.352705780389452e-06, + "loss": 0.7637, + "step": 12568 + }, + { + "epoch": 0.6917827068082999, + "grad_norm": 0.680496335029602, + "learning_rate": 7.352323290121161e-06, + "loss": 0.7308, + "step": 12569 + }, + { + "epoch": 0.6918377456106555, + "grad_norm": 0.7143607139587402, + "learning_rate": 7.351940782173439e-06, + "loss": 0.7494, + "step": 12570 + }, + { + "epoch": 0.6918927844130112, + "grad_norm": 0.679755687713623, + "learning_rate": 7.351558256549158e-06, + "loss": 0.7731, + "step": 12571 + }, + { + "epoch": 0.6919478232153669, + "grad_norm": 0.6626351475715637, + "learning_rate": 7.351175713251197e-06, + "loss": 0.8593, + "step": 12572 + }, + { + "epoch": 0.6920028620177225, + "grad_norm": 0.6830954551696777, + "learning_rate": 7.350793152282427e-06, + "loss": 0.6327, + "step": 12573 + }, + { + "epoch": 0.6920579008200781, + "grad_norm": 0.653810977935791, + "learning_rate": 7.350410573645726e-06, + "loss": 0.7341, + "step": 12574 + }, + { + "epoch": 0.6921129396224338, + "grad_norm": 0.6939566731452942, + "learning_rate": 7.3500279773439675e-06, + "loss": 0.7823, + "step": 12575 + }, + { + "epoch": 0.6921679784247895, + "grad_norm": 0.8212422728538513, + "learning_rate": 7.349645363380029e-06, + "loss": 0.6388, + "step": 12576 + }, + { + "epoch": 0.6922230172271452, + "grad_norm": 0.7703338265419006, + "learning_rate": 7.349262731756783e-06, + "loss": 0.7476, + "step": 12577 + }, + { + "epoch": 0.6922780560295008, + "grad_norm": 0.6710889935493469, + "learning_rate": 7.348880082477108e-06, + "loss": 0.7869, + "step": 12578 + }, + { + "epoch": 0.6923330948318565, + "grad_norm": 0.7384413480758667, + "learning_rate": 7.3484974155438795e-06, + "loss": 0.6628, + "step": 12579 + }, + { + "epoch": 0.6923881336342121, + "grad_norm": 0.7628176212310791, + "learning_rate": 7.348114730959973e-06, + "loss": 0.7599, + "step": 12580 + }, + { + "epoch": 0.6924431724365678, + "grad_norm": 0.683885931968689, + "learning_rate": 7.347732028728264e-06, + "loss": 0.7134, + "step": 12581 + }, + { + "epoch": 0.6924982112389234, + "grad_norm": 0.6710503697395325, + "learning_rate": 7.34734930885163e-06, + "loss": 0.7147, + "step": 12582 + }, + { + "epoch": 0.6925532500412791, + "grad_norm": 0.6984537243843079, + "learning_rate": 7.346966571332947e-06, + "loss": 0.7517, + "step": 12583 + }, + { + "epoch": 0.6926082888436348, + "grad_norm": 0.7563193440437317, + "learning_rate": 7.346583816175092e-06, + "loss": 0.7971, + "step": 12584 + }, + { + "epoch": 0.6926633276459905, + "grad_norm": 0.8407838940620422, + "learning_rate": 7.346201043380941e-06, + "loss": 0.8227, + "step": 12585 + }, + { + "epoch": 0.6927183664483461, + "grad_norm": 0.673098623752594, + "learning_rate": 7.345818252953369e-06, + "loss": 0.7514, + "step": 12586 + }, + { + "epoch": 0.6927734052507017, + "grad_norm": 0.6452111005783081, + "learning_rate": 7.345435444895257e-06, + "loss": 0.7201, + "step": 12587 + }, + { + "epoch": 0.6928284440530574, + "grad_norm": 0.8728383779525757, + "learning_rate": 7.345052619209481e-06, + "loss": 0.7452, + "step": 12588 + }, + { + "epoch": 0.692883482855413, + "grad_norm": 0.7032049298286438, + "learning_rate": 7.344669775898914e-06, + "loss": 0.8885, + "step": 12589 + }, + { + "epoch": 0.6929385216577687, + "grad_norm": 0.7744605541229248, + "learning_rate": 7.344286914966438e-06, + "loss": 0.8048, + "step": 12590 + }, + { + "epoch": 0.6929935604601244, + "grad_norm": 0.7334163784980774, + "learning_rate": 7.343904036414931e-06, + "loss": 0.8502, + "step": 12591 + }, + { + "epoch": 0.6930485992624801, + "grad_norm": 0.6684108376502991, + "learning_rate": 7.343521140247266e-06, + "loss": 0.8264, + "step": 12592 + }, + { + "epoch": 0.6931036380648357, + "grad_norm": 0.6192718744277954, + "learning_rate": 7.343138226466324e-06, + "loss": 0.6625, + "step": 12593 + }, + { + "epoch": 0.6931586768671913, + "grad_norm": 0.6410724520683289, + "learning_rate": 7.342755295074984e-06, + "loss": 0.717, + "step": 12594 + }, + { + "epoch": 0.693213715669547, + "grad_norm": 0.6854361891746521, + "learning_rate": 7.342372346076121e-06, + "loss": 0.7246, + "step": 12595 + }, + { + "epoch": 0.6932687544719027, + "grad_norm": 0.6920250058174133, + "learning_rate": 7.341989379472614e-06, + "loss": 0.7414, + "step": 12596 + }, + { + "epoch": 0.6933237932742583, + "grad_norm": 0.6545842885971069, + "learning_rate": 7.341606395267342e-06, + "loss": 0.7731, + "step": 12597 + }, + { + "epoch": 0.693378832076614, + "grad_norm": 0.6879072785377502, + "learning_rate": 7.341223393463184e-06, + "loss": 0.7272, + "step": 12598 + }, + { + "epoch": 0.6934338708789697, + "grad_norm": 0.7460979223251343, + "learning_rate": 7.340840374063018e-06, + "loss": 0.771, + "step": 12599 + }, + { + "epoch": 0.6934889096813254, + "grad_norm": 0.7836858630180359, + "learning_rate": 7.340457337069722e-06, + "loss": 0.846, + "step": 12600 + }, + { + "epoch": 0.693543948483681, + "grad_norm": 0.958403468132019, + "learning_rate": 7.340074282486174e-06, + "loss": 0.8913, + "step": 12601 + }, + { + "epoch": 0.6935989872860366, + "grad_norm": 0.6614813208580017, + "learning_rate": 7.339691210315254e-06, + "loss": 0.7129, + "step": 12602 + }, + { + "epoch": 0.6936540260883923, + "grad_norm": 0.7303252816200256, + "learning_rate": 7.339308120559843e-06, + "loss": 0.8395, + "step": 12603 + }, + { + "epoch": 0.693709064890748, + "grad_norm": 0.7341620922088623, + "learning_rate": 7.338925013222817e-06, + "loss": 0.8341, + "step": 12604 + }, + { + "epoch": 0.6937641036931036, + "grad_norm": 0.7077179551124573, + "learning_rate": 7.338541888307056e-06, + "loss": 0.7813, + "step": 12605 + }, + { + "epoch": 0.6938191424954593, + "grad_norm": 0.6654969453811646, + "learning_rate": 7.338158745815441e-06, + "loss": 0.7337, + "step": 12606 + }, + { + "epoch": 0.693874181297815, + "grad_norm": 0.6637474894523621, + "learning_rate": 7.337775585750852e-06, + "loss": 0.8197, + "step": 12607 + }, + { + "epoch": 0.6939292201001707, + "grad_norm": 0.654712975025177, + "learning_rate": 7.337392408116166e-06, + "loss": 0.6991, + "step": 12608 + }, + { + "epoch": 0.6939842589025262, + "grad_norm": 0.6698346138000488, + "learning_rate": 7.337009212914265e-06, + "loss": 0.7991, + "step": 12609 + }, + { + "epoch": 0.6940392977048819, + "grad_norm": 0.9616294503211975, + "learning_rate": 7.336626000148028e-06, + "loss": 0.7326, + "step": 12610 + }, + { + "epoch": 0.6940943365072376, + "grad_norm": 0.7749543786048889, + "learning_rate": 7.336242769820335e-06, + "loss": 0.8015, + "step": 12611 + }, + { + "epoch": 0.6941493753095933, + "grad_norm": 0.7263140678405762, + "learning_rate": 7.335859521934068e-06, + "loss": 0.7538, + "step": 12612 + }, + { + "epoch": 0.6942044141119489, + "grad_norm": 0.6383689641952515, + "learning_rate": 7.335476256492105e-06, + "loss": 0.7611, + "step": 12613 + }, + { + "epoch": 0.6942594529143046, + "grad_norm": 0.7464908957481384, + "learning_rate": 7.335092973497326e-06, + "loss": 0.7904, + "step": 12614 + }, + { + "epoch": 0.6943144917166603, + "grad_norm": 1.114864468574524, + "learning_rate": 7.334709672952615e-06, + "loss": 0.8518, + "step": 12615 + }, + { + "epoch": 0.694369530519016, + "grad_norm": 0.6712734699249268, + "learning_rate": 7.334326354860852e-06, + "loss": 0.7431, + "step": 12616 + }, + { + "epoch": 0.6944245693213715, + "grad_norm": 0.7559850811958313, + "learning_rate": 7.3339430192249166e-06, + "loss": 0.7556, + "step": 12617 + }, + { + "epoch": 0.6944796081237272, + "grad_norm": 0.7262033224105835, + "learning_rate": 7.333559666047689e-06, + "loss": 0.7624, + "step": 12618 + }, + { + "epoch": 0.6945346469260829, + "grad_norm": 0.6428695917129517, + "learning_rate": 7.333176295332053e-06, + "loss": 0.6894, + "step": 12619 + }, + { + "epoch": 0.6945896857284386, + "grad_norm": 0.7353672385215759, + "learning_rate": 7.3327929070808875e-06, + "loss": 0.7611, + "step": 12620 + }, + { + "epoch": 0.6946447245307942, + "grad_norm": 0.7063810229301453, + "learning_rate": 7.332409501297076e-06, + "loss": 0.7428, + "step": 12621 + }, + { + "epoch": 0.6946997633331499, + "grad_norm": 0.6552421450614929, + "learning_rate": 7.332026077983498e-06, + "loss": 0.7046, + "step": 12622 + }, + { + "epoch": 0.6947548021355056, + "grad_norm": 0.8843327760696411, + "learning_rate": 7.331642637143037e-06, + "loss": 0.6952, + "step": 12623 + }, + { + "epoch": 0.6948098409378612, + "grad_norm": 0.7279102802276611, + "learning_rate": 7.331259178778574e-06, + "loss": 0.7911, + "step": 12624 + }, + { + "epoch": 0.6948648797402168, + "grad_norm": 0.6585525870323181, + "learning_rate": 7.33087570289299e-06, + "loss": 0.7684, + "step": 12625 + }, + { + "epoch": 0.6949199185425725, + "grad_norm": 0.663185715675354, + "learning_rate": 7.3304922094891695e-06, + "loss": 0.6753, + "step": 12626 + }, + { + "epoch": 0.6949749573449282, + "grad_norm": 0.652765691280365, + "learning_rate": 7.330108698569993e-06, + "loss": 0.7333, + "step": 12627 + }, + { + "epoch": 0.6950299961472839, + "grad_norm": 0.7781688570976257, + "learning_rate": 7.329725170138343e-06, + "loss": 0.7312, + "step": 12628 + }, + { + "epoch": 0.6950850349496395, + "grad_norm": 0.6798241138458252, + "learning_rate": 7.329341624197102e-06, + "loss": 0.7747, + "step": 12629 + }, + { + "epoch": 0.6951400737519952, + "grad_norm": 0.7588373422622681, + "learning_rate": 7.328958060749153e-06, + "loss": 0.8535, + "step": 12630 + }, + { + "epoch": 0.6951951125543508, + "grad_norm": 0.8833348155021667, + "learning_rate": 7.328574479797379e-06, + "loss": 0.8345, + "step": 12631 + }, + { + "epoch": 0.6952501513567064, + "grad_norm": 0.799454927444458, + "learning_rate": 7.328190881344663e-06, + "loss": 0.7571, + "step": 12632 + }, + { + "epoch": 0.6953051901590621, + "grad_norm": 0.8030340671539307, + "learning_rate": 7.327807265393887e-06, + "loss": 0.7426, + "step": 12633 + }, + { + "epoch": 0.6953602289614178, + "grad_norm": 0.6246228218078613, + "learning_rate": 7.327423631947934e-06, + "loss": 0.6712, + "step": 12634 + }, + { + "epoch": 0.6954152677637735, + "grad_norm": 0.7203500866889954, + "learning_rate": 7.32703998100969e-06, + "loss": 0.8315, + "step": 12635 + }, + { + "epoch": 0.6954703065661291, + "grad_norm": 0.6128239035606384, + "learning_rate": 7.326656312582035e-06, + "loss": 0.6788, + "step": 12636 + }, + { + "epoch": 0.6955253453684848, + "grad_norm": 0.8052619695663452, + "learning_rate": 7.326272626667852e-06, + "loss": 0.8076, + "step": 12637 + }, + { + "epoch": 0.6955803841708404, + "grad_norm": 0.9128470420837402, + "learning_rate": 7.325888923270029e-06, + "loss": 0.7135, + "step": 12638 + }, + { + "epoch": 0.6956354229731961, + "grad_norm": 0.6815299391746521, + "learning_rate": 7.325505202391447e-06, + "loss": 0.7756, + "step": 12639 + }, + { + "epoch": 0.6956904617755517, + "grad_norm": 0.6278733611106873, + "learning_rate": 7.325121464034991e-06, + "loss": 0.6583, + "step": 12640 + }, + { + "epoch": 0.6957455005779074, + "grad_norm": 0.7161649465560913, + "learning_rate": 7.324737708203543e-06, + "loss": 0.7106, + "step": 12641 + }, + { + "epoch": 0.6958005393802631, + "grad_norm": 0.6827715635299683, + "learning_rate": 7.324353934899989e-06, + "loss": 0.7988, + "step": 12642 + }, + { + "epoch": 0.6958555781826188, + "grad_norm": 0.9999695420265198, + "learning_rate": 7.323970144127215e-06, + "loss": 0.8222, + "step": 12643 + }, + { + "epoch": 0.6959106169849744, + "grad_norm": 0.8048173785209656, + "learning_rate": 7.323586335888102e-06, + "loss": 0.7157, + "step": 12644 + }, + { + "epoch": 0.69596565578733, + "grad_norm": 0.7403637170791626, + "learning_rate": 7.323202510185536e-06, + "loss": 0.7516, + "step": 12645 + }, + { + "epoch": 0.6960206945896857, + "grad_norm": 0.6660793423652649, + "learning_rate": 7.322818667022402e-06, + "loss": 0.7081, + "step": 12646 + }, + { + "epoch": 0.6960757333920414, + "grad_norm": 0.713985800743103, + "learning_rate": 7.322434806401585e-06, + "loss": 0.7682, + "step": 12647 + }, + { + "epoch": 0.696130772194397, + "grad_norm": 0.739253044128418, + "learning_rate": 7.322050928325969e-06, + "loss": 0.838, + "step": 12648 + }, + { + "epoch": 0.6961858109967527, + "grad_norm": 0.8350489735603333, + "learning_rate": 7.32166703279844e-06, + "loss": 0.7627, + "step": 12649 + }, + { + "epoch": 0.6962408497991084, + "grad_norm": 0.580456018447876, + "learning_rate": 7.321283119821883e-06, + "loss": 0.6248, + "step": 12650 + }, + { + "epoch": 0.6962958886014641, + "grad_norm": 0.8619480729103088, + "learning_rate": 7.320899189399183e-06, + "loss": 0.848, + "step": 12651 + }, + { + "epoch": 0.6963509274038197, + "grad_norm": 0.6201381087303162, + "learning_rate": 7.320515241533227e-06, + "loss": 0.6506, + "step": 12652 + }, + { + "epoch": 0.6964059662061753, + "grad_norm": 0.6956773400306702, + "learning_rate": 7.320131276226898e-06, + "loss": 0.7561, + "step": 12653 + }, + { + "epoch": 0.696461005008531, + "grad_norm": 0.6382080912590027, + "learning_rate": 7.319747293483085e-06, + "loss": 0.6462, + "step": 12654 + }, + { + "epoch": 0.6965160438108867, + "grad_norm": 0.7288708686828613, + "learning_rate": 7.319363293304672e-06, + "loss": 0.7907, + "step": 12655 + }, + { + "epoch": 0.6965710826132423, + "grad_norm": 0.6280390024185181, + "learning_rate": 7.318979275694546e-06, + "loss": 0.6882, + "step": 12656 + }, + { + "epoch": 0.696626121415598, + "grad_norm": 0.7260308861732483, + "learning_rate": 7.31859524065559e-06, + "loss": 0.756, + "step": 12657 + }, + { + "epoch": 0.6966811602179537, + "grad_norm": 0.6715009212493896, + "learning_rate": 7.318211188190696e-06, + "loss": 0.7194, + "step": 12658 + }, + { + "epoch": 0.6967361990203094, + "grad_norm": 0.6770408749580383, + "learning_rate": 7.3178271183027465e-06, + "loss": 0.808, + "step": 12659 + }, + { + "epoch": 0.6967912378226649, + "grad_norm": 0.7209904789924622, + "learning_rate": 7.317443030994628e-06, + "loss": 0.7242, + "step": 12660 + }, + { + "epoch": 0.6968462766250206, + "grad_norm": 0.6943202018737793, + "learning_rate": 7.317058926269227e-06, + "loss": 0.758, + "step": 12661 + }, + { + "epoch": 0.6969013154273763, + "grad_norm": 0.6073412299156189, + "learning_rate": 7.316674804129432e-06, + "loss": 0.6571, + "step": 12662 + }, + { + "epoch": 0.696956354229732, + "grad_norm": 0.7065439224243164, + "learning_rate": 7.316290664578129e-06, + "loss": 0.7333, + "step": 12663 + }, + { + "epoch": 0.6970113930320876, + "grad_norm": 0.6275133490562439, + "learning_rate": 7.315906507618207e-06, + "loss": 0.6785, + "step": 12664 + }, + { + "epoch": 0.6970664318344433, + "grad_norm": 0.6484677791595459, + "learning_rate": 7.315522333252551e-06, + "loss": 0.7461, + "step": 12665 + }, + { + "epoch": 0.697121470636799, + "grad_norm": 0.6815413236618042, + "learning_rate": 7.315138141484049e-06, + "loss": 0.673, + "step": 12666 + }, + { + "epoch": 0.6971765094391547, + "grad_norm": 0.7227872610092163, + "learning_rate": 7.314753932315587e-06, + "loss": 0.7212, + "step": 12667 + }, + { + "epoch": 0.6972315482415102, + "grad_norm": 0.661568284034729, + "learning_rate": 7.314369705750055e-06, + "loss": 0.7633, + "step": 12668 + }, + { + "epoch": 0.6972865870438659, + "grad_norm": 0.5873990654945374, + "learning_rate": 7.3139854617903405e-06, + "loss": 0.6142, + "step": 12669 + }, + { + "epoch": 0.6973416258462216, + "grad_norm": 0.7015652656555176, + "learning_rate": 7.313601200439331e-06, + "loss": 0.6762, + "step": 12670 + }, + { + "epoch": 0.6973966646485773, + "grad_norm": 0.7060853242874146, + "learning_rate": 7.313216921699913e-06, + "loss": 0.8111, + "step": 12671 + }, + { + "epoch": 0.6974517034509329, + "grad_norm": 0.6198092699050903, + "learning_rate": 7.312832625574977e-06, + "loss": 0.7058, + "step": 12672 + }, + { + "epoch": 0.6975067422532886, + "grad_norm": 0.6785464286804199, + "learning_rate": 7.312448312067408e-06, + "loss": 0.7509, + "step": 12673 + }, + { + "epoch": 0.6975617810556443, + "grad_norm": 0.74974524974823, + "learning_rate": 7.312063981180097e-06, + "loss": 0.7679, + "step": 12674 + }, + { + "epoch": 0.6976168198579998, + "grad_norm": 0.6188651919364929, + "learning_rate": 7.311679632915934e-06, + "loss": 0.663, + "step": 12675 + }, + { + "epoch": 0.6976718586603555, + "grad_norm": 0.7458493113517761, + "learning_rate": 7.3112952672778044e-06, + "loss": 0.7316, + "step": 12676 + }, + { + "epoch": 0.6977268974627112, + "grad_norm": 0.7480403780937195, + "learning_rate": 7.310910884268597e-06, + "loss": 0.8476, + "step": 12677 + }, + { + "epoch": 0.6977819362650669, + "grad_norm": 0.6921943426132202, + "learning_rate": 7.310526483891204e-06, + "loss": 0.7931, + "step": 12678 + }, + { + "epoch": 0.6978369750674225, + "grad_norm": 0.7384023666381836, + "learning_rate": 7.3101420661485124e-06, + "loss": 0.7698, + "step": 12679 + }, + { + "epoch": 0.6978920138697782, + "grad_norm": 0.6693310141563416, + "learning_rate": 7.3097576310434105e-06, + "loss": 0.6838, + "step": 12680 + }, + { + "epoch": 0.6979470526721339, + "grad_norm": 0.6888617873191833, + "learning_rate": 7.309373178578789e-06, + "loss": 0.7196, + "step": 12681 + }, + { + "epoch": 0.6980020914744895, + "grad_norm": 0.7608165144920349, + "learning_rate": 7.308988708757536e-06, + "loss": 0.7483, + "step": 12682 + }, + { + "epoch": 0.6980571302768451, + "grad_norm": 0.6969812512397766, + "learning_rate": 7.308604221582543e-06, + "loss": 0.7415, + "step": 12683 + }, + { + "epoch": 0.6981121690792008, + "grad_norm": 0.7440872192382812, + "learning_rate": 7.3082197170566996e-06, + "loss": 0.7776, + "step": 12684 + }, + { + "epoch": 0.6981672078815565, + "grad_norm": 0.7920299768447876, + "learning_rate": 7.307835195182892e-06, + "loss": 0.746, + "step": 12685 + }, + { + "epoch": 0.6982222466839122, + "grad_norm": 0.7002919912338257, + "learning_rate": 7.3074506559640134e-06, + "loss": 0.7948, + "step": 12686 + }, + { + "epoch": 0.6982772854862678, + "grad_norm": 0.7199681997299194, + "learning_rate": 7.3070660994029554e-06, + "loss": 0.7568, + "step": 12687 + }, + { + "epoch": 0.6983323242886235, + "grad_norm": 0.6287575960159302, + "learning_rate": 7.306681525502604e-06, + "loss": 0.6564, + "step": 12688 + }, + { + "epoch": 0.6983873630909792, + "grad_norm": 0.6910778880119324, + "learning_rate": 7.306296934265853e-06, + "loss": 0.7892, + "step": 12689 + }, + { + "epoch": 0.6984424018933348, + "grad_norm": 0.6454603672027588, + "learning_rate": 7.30591232569559e-06, + "loss": 0.7848, + "step": 12690 + }, + { + "epoch": 0.6984974406956904, + "grad_norm": 0.7337101101875305, + "learning_rate": 7.305527699794709e-06, + "loss": 0.8012, + "step": 12691 + }, + { + "epoch": 0.6985524794980461, + "grad_norm": 0.6694337129592896, + "learning_rate": 7.305143056566098e-06, + "loss": 0.7767, + "step": 12692 + }, + { + "epoch": 0.6986075183004018, + "grad_norm": 0.6485214233398438, + "learning_rate": 7.30475839601265e-06, + "loss": 0.7142, + "step": 12693 + }, + { + "epoch": 0.6986625571027575, + "grad_norm": 0.6401854753494263, + "learning_rate": 7.304373718137253e-06, + "loss": 0.6562, + "step": 12694 + }, + { + "epoch": 0.6987175959051131, + "grad_norm": 0.7190635800361633, + "learning_rate": 7.303989022942801e-06, + "loss": 0.7513, + "step": 12695 + }, + { + "epoch": 0.6987726347074688, + "grad_norm": 0.7100299596786499, + "learning_rate": 7.3036043104321854e-06, + "loss": 0.759, + "step": 12696 + }, + { + "epoch": 0.6988276735098244, + "grad_norm": 0.8507145047187805, + "learning_rate": 7.303219580608295e-06, + "loss": 0.7567, + "step": 12697 + }, + { + "epoch": 0.6988827123121801, + "grad_norm": 0.6758378744125366, + "learning_rate": 7.302834833474022e-06, + "loss": 0.6751, + "step": 12698 + }, + { + "epoch": 0.6989377511145357, + "grad_norm": 0.7602974772453308, + "learning_rate": 7.30245006903226e-06, + "loss": 0.7304, + "step": 12699 + }, + { + "epoch": 0.6989927899168914, + "grad_norm": 0.7519045472145081, + "learning_rate": 7.3020652872859e-06, + "loss": 0.7573, + "step": 12700 + }, + { + "epoch": 0.6990478287192471, + "grad_norm": 0.6076456904411316, + "learning_rate": 7.301680488237832e-06, + "loss": 0.6335, + "step": 12701 + }, + { + "epoch": 0.6991028675216028, + "grad_norm": 0.6900685429573059, + "learning_rate": 7.30129567189095e-06, + "loss": 0.7787, + "step": 12702 + }, + { + "epoch": 0.6991579063239584, + "grad_norm": 0.7366316318511963, + "learning_rate": 7.300910838248146e-06, + "loss": 0.8176, + "step": 12703 + }, + { + "epoch": 0.699212945126314, + "grad_norm": 0.6658521890640259, + "learning_rate": 7.300525987312312e-06, + "loss": 0.6436, + "step": 12704 + }, + { + "epoch": 0.6992679839286697, + "grad_norm": 0.7635871171951294, + "learning_rate": 7.300141119086341e-06, + "loss": 0.8421, + "step": 12705 + }, + { + "epoch": 0.6993230227310254, + "grad_norm": 0.7257800698280334, + "learning_rate": 7.299756233573125e-06, + "loss": 0.6468, + "step": 12706 + }, + { + "epoch": 0.699378061533381, + "grad_norm": 0.7536096572875977, + "learning_rate": 7.299371330775558e-06, + "loss": 0.7782, + "step": 12707 + }, + { + "epoch": 0.6994331003357367, + "grad_norm": 0.7504379153251648, + "learning_rate": 7.298986410696529e-06, + "loss": 0.7097, + "step": 12708 + }, + { + "epoch": 0.6994881391380924, + "grad_norm": 0.7340306043624878, + "learning_rate": 7.298601473338936e-06, + "loss": 0.8165, + "step": 12709 + }, + { + "epoch": 0.6995431779404481, + "grad_norm": 0.6928045749664307, + "learning_rate": 7.298216518705667e-06, + "loss": 0.777, + "step": 12710 + }, + { + "epoch": 0.6995982167428036, + "grad_norm": 0.6942496299743652, + "learning_rate": 7.29783154679962e-06, + "loss": 0.6607, + "step": 12711 + }, + { + "epoch": 0.6996532555451593, + "grad_norm": 0.6646896600723267, + "learning_rate": 7.297446557623684e-06, + "loss": 0.712, + "step": 12712 + }, + { + "epoch": 0.699708294347515, + "grad_norm": 0.6828078627586365, + "learning_rate": 7.297061551180758e-06, + "loss": 0.7251, + "step": 12713 + }, + { + "epoch": 0.6997633331498707, + "grad_norm": 0.7554219365119934, + "learning_rate": 7.296676527473729e-06, + "loss": 0.8279, + "step": 12714 + }, + { + "epoch": 0.6998183719522263, + "grad_norm": 0.8122106194496155, + "learning_rate": 7.296291486505495e-06, + "loss": 0.8039, + "step": 12715 + }, + { + "epoch": 0.699873410754582, + "grad_norm": 0.6602222323417664, + "learning_rate": 7.295906428278949e-06, + "loss": 0.7149, + "step": 12716 + }, + { + "epoch": 0.6999284495569377, + "grad_norm": 0.8341954350471497, + "learning_rate": 7.2955213527969845e-06, + "loss": 0.7868, + "step": 12717 + }, + { + "epoch": 0.6999834883592932, + "grad_norm": 0.7157256603240967, + "learning_rate": 7.295136260062496e-06, + "loss": 0.745, + "step": 12718 + }, + { + "epoch": 0.7000385271616489, + "grad_norm": 0.5845672488212585, + "learning_rate": 7.294751150078379e-06, + "loss": 0.657, + "step": 12719 + }, + { + "epoch": 0.7000935659640046, + "grad_norm": 0.7370786070823669, + "learning_rate": 7.2943660228475265e-06, + "loss": 0.7883, + "step": 12720 + }, + { + "epoch": 0.7001486047663603, + "grad_norm": 0.6687451004981995, + "learning_rate": 7.293980878372833e-06, + "loss": 0.7945, + "step": 12721 + }, + { + "epoch": 0.7002036435687159, + "grad_norm": 0.6352105736732483, + "learning_rate": 7.293595716657192e-06, + "loss": 0.6581, + "step": 12722 + }, + { + "epoch": 0.7002586823710716, + "grad_norm": 0.7371370196342468, + "learning_rate": 7.293210537703499e-06, + "loss": 0.7859, + "step": 12723 + }, + { + "epoch": 0.7003137211734273, + "grad_norm": 0.6885504722595215, + "learning_rate": 7.292825341514651e-06, + "loss": 0.7355, + "step": 12724 + }, + { + "epoch": 0.700368759975783, + "grad_norm": 0.6930849552154541, + "learning_rate": 7.292440128093542e-06, + "loss": 0.8145, + "step": 12725 + }, + { + "epoch": 0.7004237987781385, + "grad_norm": 0.6767199635505676, + "learning_rate": 7.292054897443065e-06, + "loss": 0.7136, + "step": 12726 + }, + { + "epoch": 0.7004788375804942, + "grad_norm": 0.6672216653823853, + "learning_rate": 7.291669649566117e-06, + "loss": 0.6131, + "step": 12727 + }, + { + "epoch": 0.7005338763828499, + "grad_norm": 0.6618815064430237, + "learning_rate": 7.291284384465595e-06, + "loss": 0.7633, + "step": 12728 + }, + { + "epoch": 0.7005889151852056, + "grad_norm": 0.6573876142501831, + "learning_rate": 7.290899102144392e-06, + "loss": 0.7621, + "step": 12729 + }, + { + "epoch": 0.7006439539875612, + "grad_norm": 0.7449564337730408, + "learning_rate": 7.290513802605405e-06, + "loss": 0.6488, + "step": 12730 + }, + { + "epoch": 0.7006989927899169, + "grad_norm": 0.7307295203208923, + "learning_rate": 7.290128485851529e-06, + "loss": 0.7095, + "step": 12731 + }, + { + "epoch": 0.7007540315922726, + "grad_norm": 0.698699951171875, + "learning_rate": 7.2897431518856596e-06, + "loss": 0.7428, + "step": 12732 + }, + { + "epoch": 0.7008090703946283, + "grad_norm": 0.6334750056266785, + "learning_rate": 7.289357800710695e-06, + "loss": 0.6977, + "step": 12733 + }, + { + "epoch": 0.7008641091969838, + "grad_norm": 0.6526468396186829, + "learning_rate": 7.288972432329529e-06, + "loss": 0.6375, + "step": 12734 + }, + { + "epoch": 0.7009191479993395, + "grad_norm": 0.7282149791717529, + "learning_rate": 7.288587046745059e-06, + "loss": 0.7494, + "step": 12735 + }, + { + "epoch": 0.7009741868016952, + "grad_norm": 0.8511056900024414, + "learning_rate": 7.288201643960182e-06, + "loss": 0.7494, + "step": 12736 + }, + { + "epoch": 0.7010292256040509, + "grad_norm": 0.6908526420593262, + "learning_rate": 7.287816223977793e-06, + "loss": 0.6861, + "step": 12737 + }, + { + "epoch": 0.7010842644064065, + "grad_norm": 0.7582982182502747, + "learning_rate": 7.2874307868007896e-06, + "loss": 0.7758, + "step": 12738 + }, + { + "epoch": 0.7011393032087622, + "grad_norm": 0.9717779159545898, + "learning_rate": 7.2870453324320685e-06, + "loss": 0.7221, + "step": 12739 + }, + { + "epoch": 0.7011943420111179, + "grad_norm": 0.6532751321792603, + "learning_rate": 7.286659860874529e-06, + "loss": 0.8009, + "step": 12740 + }, + { + "epoch": 0.7012493808134735, + "grad_norm": 0.6708540320396423, + "learning_rate": 7.286274372131065e-06, + "loss": 0.7177, + "step": 12741 + }, + { + "epoch": 0.7013044196158291, + "grad_norm": 0.7624804973602295, + "learning_rate": 7.285888866204575e-06, + "loss": 0.7878, + "step": 12742 + }, + { + "epoch": 0.7013594584181848, + "grad_norm": 0.7167851328849792, + "learning_rate": 7.285503343097955e-06, + "loss": 0.7276, + "step": 12743 + }, + { + "epoch": 0.7014144972205405, + "grad_norm": 0.6592209935188293, + "learning_rate": 7.2851178028141045e-06, + "loss": 0.7665, + "step": 12744 + }, + { + "epoch": 0.7014695360228962, + "grad_norm": 0.684847354888916, + "learning_rate": 7.284732245355921e-06, + "loss": 0.7358, + "step": 12745 + }, + { + "epoch": 0.7015245748252518, + "grad_norm": 0.6852415800094604, + "learning_rate": 7.2843466707262985e-06, + "loss": 0.7805, + "step": 12746 + }, + { + "epoch": 0.7015796136276075, + "grad_norm": 0.6422114968299866, + "learning_rate": 7.283961078928141e-06, + "loss": 0.7386, + "step": 12747 + }, + { + "epoch": 0.7016346524299631, + "grad_norm": 0.7538495659828186, + "learning_rate": 7.283575469964343e-06, + "loss": 0.798, + "step": 12748 + }, + { + "epoch": 0.7016896912323188, + "grad_norm": 0.6646687984466553, + "learning_rate": 7.2831898438378025e-06, + "loss": 0.7048, + "step": 12749 + }, + { + "epoch": 0.7017447300346744, + "grad_norm": 0.8338429927825928, + "learning_rate": 7.2828042005514176e-06, + "loss": 0.8585, + "step": 12750 + }, + { + "epoch": 0.7017997688370301, + "grad_norm": 0.7086663842201233, + "learning_rate": 7.282418540108088e-06, + "loss": 0.8011, + "step": 12751 + }, + { + "epoch": 0.7018548076393858, + "grad_norm": 0.6040074229240417, + "learning_rate": 7.282032862510712e-06, + "loss": 0.6327, + "step": 12752 + }, + { + "epoch": 0.7019098464417415, + "grad_norm": 0.7030978798866272, + "learning_rate": 7.281647167762187e-06, + "loss": 0.6373, + "step": 12753 + }, + { + "epoch": 0.7019648852440971, + "grad_norm": 0.662308394908905, + "learning_rate": 7.281261455865414e-06, + "loss": 0.7283, + "step": 12754 + }, + { + "epoch": 0.7020199240464527, + "grad_norm": 0.7369368672370911, + "learning_rate": 7.28087572682329e-06, + "loss": 0.7632, + "step": 12755 + }, + { + "epoch": 0.7020749628488084, + "grad_norm": 0.6887282729148865, + "learning_rate": 7.280489980638714e-06, + "loss": 0.7629, + "step": 12756 + }, + { + "epoch": 0.702130001651164, + "grad_norm": 0.656512975692749, + "learning_rate": 7.280104217314587e-06, + "loss": 0.8028, + "step": 12757 + }, + { + "epoch": 0.7021850404535197, + "grad_norm": 0.7006264328956604, + "learning_rate": 7.279718436853805e-06, + "loss": 0.7025, + "step": 12758 + }, + { + "epoch": 0.7022400792558754, + "grad_norm": 0.675585925579071, + "learning_rate": 7.279332639259271e-06, + "loss": 0.8001, + "step": 12759 + }, + { + "epoch": 0.7022951180582311, + "grad_norm": 0.7105827331542969, + "learning_rate": 7.278946824533883e-06, + "loss": 0.7767, + "step": 12760 + }, + { + "epoch": 0.7023501568605867, + "grad_norm": 0.8310064673423767, + "learning_rate": 7.27856099268054e-06, + "loss": 0.7828, + "step": 12761 + }, + { + "epoch": 0.7024051956629423, + "grad_norm": 0.6885055899620056, + "learning_rate": 7.278175143702142e-06, + "loss": 0.7018, + "step": 12762 + }, + { + "epoch": 0.702460234465298, + "grad_norm": 0.6542866826057434, + "learning_rate": 7.27778927760159e-06, + "loss": 0.7118, + "step": 12763 + }, + { + "epoch": 0.7025152732676537, + "grad_norm": 0.9102655053138733, + "learning_rate": 7.277403394381784e-06, + "loss": 0.8381, + "step": 12764 + }, + { + "epoch": 0.7025703120700093, + "grad_norm": 0.6538355946540833, + "learning_rate": 7.277017494045624e-06, + "loss": 0.7766, + "step": 12765 + }, + { + "epoch": 0.702625350872365, + "grad_norm": 0.6691237092018127, + "learning_rate": 7.27663157659601e-06, + "loss": 0.8077, + "step": 12766 + }, + { + "epoch": 0.7026803896747207, + "grad_norm": 0.7159995436668396, + "learning_rate": 7.2762456420358414e-06, + "loss": 0.8333, + "step": 12767 + }, + { + "epoch": 0.7027354284770764, + "grad_norm": 0.6518422365188599, + "learning_rate": 7.275859690368022e-06, + "loss": 0.7634, + "step": 12768 + }, + { + "epoch": 0.702790467279432, + "grad_norm": 0.6969057321548462, + "learning_rate": 7.275473721595449e-06, + "loss": 0.7481, + "step": 12769 + }, + { + "epoch": 0.7028455060817876, + "grad_norm": 0.6788915395736694, + "learning_rate": 7.2750877357210225e-06, + "loss": 0.7402, + "step": 12770 + }, + { + "epoch": 0.7029005448841433, + "grad_norm": 0.7323998212814331, + "learning_rate": 7.274701732747649e-06, + "loss": 0.7122, + "step": 12771 + }, + { + "epoch": 0.702955583686499, + "grad_norm": 0.7224077582359314, + "learning_rate": 7.274315712678224e-06, + "loss": 0.7333, + "step": 12772 + }, + { + "epoch": 0.7030106224888546, + "grad_norm": 0.9009444117546082, + "learning_rate": 7.273929675515652e-06, + "loss": 0.6912, + "step": 12773 + }, + { + "epoch": 0.7030656612912103, + "grad_norm": 0.7076312899589539, + "learning_rate": 7.273543621262832e-06, + "loss": 0.7651, + "step": 12774 + }, + { + "epoch": 0.703120700093566, + "grad_norm": 0.78575599193573, + "learning_rate": 7.273157549922668e-06, + "loss": 0.7443, + "step": 12775 + }, + { + "epoch": 0.7031757388959217, + "grad_norm": 0.6957094669342041, + "learning_rate": 7.27277146149806e-06, + "loss": 0.7684, + "step": 12776 + }, + { + "epoch": 0.7032307776982772, + "grad_norm": 1.177878975868225, + "learning_rate": 7.27238535599191e-06, + "loss": 0.9033, + "step": 12777 + }, + { + "epoch": 0.7032858165006329, + "grad_norm": 0.6929007768630981, + "learning_rate": 7.27199923340712e-06, + "loss": 0.7411, + "step": 12778 + }, + { + "epoch": 0.7033408553029886, + "grad_norm": 0.7725315093994141, + "learning_rate": 7.2716130937465926e-06, + "loss": 0.7833, + "step": 12779 + }, + { + "epoch": 0.7033958941053443, + "grad_norm": 0.6512928605079651, + "learning_rate": 7.271226937013228e-06, + "loss": 0.7918, + "step": 12780 + }, + { + "epoch": 0.7034509329076999, + "grad_norm": 0.7033893465995789, + "learning_rate": 7.270840763209931e-06, + "loss": 0.843, + "step": 12781 + }, + { + "epoch": 0.7035059717100556, + "grad_norm": 0.7596432566642761, + "learning_rate": 7.2704545723396e-06, + "loss": 0.7916, + "step": 12782 + }, + { + "epoch": 0.7035610105124113, + "grad_norm": 0.6256046891212463, + "learning_rate": 7.270068364405143e-06, + "loss": 0.6531, + "step": 12783 + }, + { + "epoch": 0.703616049314767, + "grad_norm": 0.8107615113258362, + "learning_rate": 7.26968213940946e-06, + "loss": 0.7755, + "step": 12784 + }, + { + "epoch": 0.7036710881171225, + "grad_norm": 0.6742845177650452, + "learning_rate": 7.269295897355451e-06, + "loss": 0.834, + "step": 12785 + }, + { + "epoch": 0.7037261269194782, + "grad_norm": 0.6665072441101074, + "learning_rate": 7.268909638246024e-06, + "loss": 0.6864, + "step": 12786 + }, + { + "epoch": 0.7037811657218339, + "grad_norm": 0.68357914686203, + "learning_rate": 7.268523362084078e-06, + "loss": 0.7789, + "step": 12787 + }, + { + "epoch": 0.7038362045241896, + "grad_norm": 0.6878114938735962, + "learning_rate": 7.268137068872519e-06, + "loss": 0.7277, + "step": 12788 + }, + { + "epoch": 0.7038912433265452, + "grad_norm": 0.7173313498497009, + "learning_rate": 7.267750758614247e-06, + "loss": 0.8156, + "step": 12789 + }, + { + "epoch": 0.7039462821289009, + "grad_norm": 0.6523084044456482, + "learning_rate": 7.267364431312169e-06, + "loss": 0.7143, + "step": 12790 + }, + { + "epoch": 0.7040013209312566, + "grad_norm": 0.7403815388679504, + "learning_rate": 7.2669780869691865e-06, + "loss": 0.8196, + "step": 12791 + }, + { + "epoch": 0.7040563597336122, + "grad_norm": 0.6411255598068237, + "learning_rate": 7.266591725588204e-06, + "loss": 0.6645, + "step": 12792 + }, + { + "epoch": 0.7041113985359678, + "grad_norm": 0.9094020128250122, + "learning_rate": 7.266205347172124e-06, + "loss": 0.8023, + "step": 12793 + }, + { + "epoch": 0.7041664373383235, + "grad_norm": 1.1041208505630493, + "learning_rate": 7.265818951723851e-06, + "loss": 0.7011, + "step": 12794 + }, + { + "epoch": 0.7042214761406792, + "grad_norm": 0.7339954376220703, + "learning_rate": 7.265432539246289e-06, + "loss": 0.7467, + "step": 12795 + }, + { + "epoch": 0.7042765149430349, + "grad_norm": 0.7055865526199341, + "learning_rate": 7.265046109742344e-06, + "loss": 0.7364, + "step": 12796 + }, + { + "epoch": 0.7043315537453905, + "grad_norm": 0.7052320241928101, + "learning_rate": 7.264659663214917e-06, + "loss": 0.7611, + "step": 12797 + }, + { + "epoch": 0.7043865925477462, + "grad_norm": 0.7374194860458374, + "learning_rate": 7.264273199666915e-06, + "loss": 0.7612, + "step": 12798 + }, + { + "epoch": 0.7044416313501018, + "grad_norm": 0.634986162185669, + "learning_rate": 7.263886719101242e-06, + "loss": 0.8001, + "step": 12799 + }, + { + "epoch": 0.7044966701524574, + "grad_norm": 0.8178644180297852, + "learning_rate": 7.2635002215208014e-06, + "loss": 0.8404, + "step": 12800 + }, + { + "epoch": 0.7045517089548131, + "grad_norm": 0.7743822336196899, + "learning_rate": 7.263113706928501e-06, + "loss": 0.7297, + "step": 12801 + }, + { + "epoch": 0.7046067477571688, + "grad_norm": 0.6558601260185242, + "learning_rate": 7.262727175327242e-06, + "loss": 0.6933, + "step": 12802 + }, + { + "epoch": 0.7046617865595245, + "grad_norm": 1.0608787536621094, + "learning_rate": 7.262340626719933e-06, + "loss": 0.8792, + "step": 12803 + }, + { + "epoch": 0.7047168253618801, + "grad_norm": 0.7488270401954651, + "learning_rate": 7.261954061109475e-06, + "loss": 0.7755, + "step": 12804 + }, + { + "epoch": 0.7047718641642358, + "grad_norm": 0.8960574865341187, + "learning_rate": 7.261567478498778e-06, + "loss": 0.7274, + "step": 12805 + }, + { + "epoch": 0.7048269029665915, + "grad_norm": 0.6289944648742676, + "learning_rate": 7.2611808788907436e-06, + "loss": 0.6469, + "step": 12806 + }, + { + "epoch": 0.7048819417689471, + "grad_norm": 0.6488339900970459, + "learning_rate": 7.26079426228828e-06, + "loss": 0.7581, + "step": 12807 + }, + { + "epoch": 0.7049369805713027, + "grad_norm": 0.7354650497436523, + "learning_rate": 7.260407628694292e-06, + "loss": 0.7596, + "step": 12808 + }, + { + "epoch": 0.7049920193736584, + "grad_norm": 0.8163169026374817, + "learning_rate": 7.2600209781116834e-06, + "loss": 0.8291, + "step": 12809 + }, + { + "epoch": 0.7050470581760141, + "grad_norm": 0.8223916292190552, + "learning_rate": 7.259634310543364e-06, + "loss": 0.7089, + "step": 12810 + }, + { + "epoch": 0.7051020969783698, + "grad_norm": 0.7815924286842346, + "learning_rate": 7.2592476259922374e-06, + "loss": 0.8098, + "step": 12811 + }, + { + "epoch": 0.7051571357807254, + "grad_norm": 0.7027734518051147, + "learning_rate": 7.2588609244612105e-06, + "loss": 0.7276, + "step": 12812 + }, + { + "epoch": 0.705212174583081, + "grad_norm": 0.7345930337905884, + "learning_rate": 7.2584742059531894e-06, + "loss": 0.803, + "step": 12813 + }, + { + "epoch": 0.7052672133854367, + "grad_norm": 0.6998127102851868, + "learning_rate": 7.258087470471081e-06, + "loss": 0.7938, + "step": 12814 + }, + { + "epoch": 0.7053222521877924, + "grad_norm": 0.6418118476867676, + "learning_rate": 7.257700718017793e-06, + "loss": 0.66, + "step": 12815 + }, + { + "epoch": 0.705377290990148, + "grad_norm": 0.6774695515632629, + "learning_rate": 7.257313948596228e-06, + "loss": 0.7143, + "step": 12816 + }, + { + "epoch": 0.7054323297925037, + "grad_norm": 0.7107009291648865, + "learning_rate": 7.256927162209298e-06, + "loss": 0.8378, + "step": 12817 + }, + { + "epoch": 0.7054873685948594, + "grad_norm": 0.7287374138832092, + "learning_rate": 7.256540358859906e-06, + "loss": 0.88, + "step": 12818 + }, + { + "epoch": 0.7055424073972151, + "grad_norm": 0.651221752166748, + "learning_rate": 7.256153538550961e-06, + "loss": 0.7092, + "step": 12819 + }, + { + "epoch": 0.7055974461995707, + "grad_norm": 0.6549085974693298, + "learning_rate": 7.255766701285371e-06, + "loss": 0.6697, + "step": 12820 + }, + { + "epoch": 0.7056524850019263, + "grad_norm": 0.6617292165756226, + "learning_rate": 7.255379847066041e-06, + "loss": 0.7779, + "step": 12821 + }, + { + "epoch": 0.705707523804282, + "grad_norm": 0.6677221655845642, + "learning_rate": 7.254992975895879e-06, + "loss": 0.7821, + "step": 12822 + }, + { + "epoch": 0.7057625626066377, + "grad_norm": 0.8183515667915344, + "learning_rate": 7.2546060877777945e-06, + "loss": 0.7727, + "step": 12823 + }, + { + "epoch": 0.7058176014089933, + "grad_norm": 0.6574132442474365, + "learning_rate": 7.2542191827146945e-06, + "loss": 0.7118, + "step": 12824 + }, + { + "epoch": 0.705872640211349, + "grad_norm": 0.6874130964279175, + "learning_rate": 7.253832260709487e-06, + "loss": 0.7677, + "step": 12825 + }, + { + "epoch": 0.7059276790137047, + "grad_norm": 0.6460297107696533, + "learning_rate": 7.253445321765079e-06, + "loss": 0.725, + "step": 12826 + }, + { + "epoch": 0.7059827178160604, + "grad_norm": 0.6618219614028931, + "learning_rate": 7.253058365884379e-06, + "loss": 0.7504, + "step": 12827 + }, + { + "epoch": 0.706037756618416, + "grad_norm": 0.6519019603729248, + "learning_rate": 7.252671393070295e-06, + "loss": 0.7382, + "step": 12828 + }, + { + "epoch": 0.7060927954207716, + "grad_norm": 0.7114588022232056, + "learning_rate": 7.252284403325737e-06, + "loss": 0.8364, + "step": 12829 + }, + { + "epoch": 0.7061478342231273, + "grad_norm": 0.6304726600646973, + "learning_rate": 7.251897396653611e-06, + "loss": 0.6972, + "step": 12830 + }, + { + "epoch": 0.706202873025483, + "grad_norm": 0.6728807687759399, + "learning_rate": 7.251510373056827e-06, + "loss": 0.671, + "step": 12831 + }, + { + "epoch": 0.7062579118278386, + "grad_norm": 0.690641462802887, + "learning_rate": 7.251123332538295e-06, + "loss": 0.7381, + "step": 12832 + }, + { + "epoch": 0.7063129506301943, + "grad_norm": 0.7018027305603027, + "learning_rate": 7.2507362751009226e-06, + "loss": 0.7546, + "step": 12833 + }, + { + "epoch": 0.70636798943255, + "grad_norm": 0.7203684449195862, + "learning_rate": 7.250349200747617e-06, + "loss": 0.7534, + "step": 12834 + }, + { + "epoch": 0.7064230282349057, + "grad_norm": 0.6936585903167725, + "learning_rate": 7.24996210948129e-06, + "loss": 0.7716, + "step": 12835 + }, + { + "epoch": 0.7064780670372612, + "grad_norm": 0.7421281337738037, + "learning_rate": 7.249575001304851e-06, + "loss": 0.7517, + "step": 12836 + }, + { + "epoch": 0.7065331058396169, + "grad_norm": 0.6622288227081299, + "learning_rate": 7.249187876221207e-06, + "loss": 0.6799, + "step": 12837 + }, + { + "epoch": 0.7065881446419726, + "grad_norm": 0.7267055511474609, + "learning_rate": 7.24880073423327e-06, + "loss": 0.7871, + "step": 12838 + }, + { + "epoch": 0.7066431834443283, + "grad_norm": 0.6978085041046143, + "learning_rate": 7.2484135753439485e-06, + "loss": 0.7812, + "step": 12839 + }, + { + "epoch": 0.7066982222466839, + "grad_norm": 0.8353652358055115, + "learning_rate": 7.248026399556153e-06, + "loss": 0.7481, + "step": 12840 + }, + { + "epoch": 0.7067532610490396, + "grad_norm": 0.8402471542358398, + "learning_rate": 7.247639206872792e-06, + "loss": 0.783, + "step": 12841 + }, + { + "epoch": 0.7068082998513953, + "grad_norm": 0.8279419541358948, + "learning_rate": 7.247251997296777e-06, + "loss": 0.8177, + "step": 12842 + }, + { + "epoch": 0.7068633386537508, + "grad_norm": 0.6850735545158386, + "learning_rate": 7.246864770831017e-06, + "loss": 0.7586, + "step": 12843 + }, + { + "epoch": 0.7069183774561065, + "grad_norm": 0.7327665090560913, + "learning_rate": 7.246477527478422e-06, + "loss": 0.9327, + "step": 12844 + }, + { + "epoch": 0.7069734162584622, + "grad_norm": 0.6343075037002563, + "learning_rate": 7.246090267241905e-06, + "loss": 0.6957, + "step": 12845 + }, + { + "epoch": 0.7070284550608179, + "grad_norm": 0.7028965353965759, + "learning_rate": 7.245702990124373e-06, + "loss": 0.7524, + "step": 12846 + }, + { + "epoch": 0.7070834938631735, + "grad_norm": 0.7578299045562744, + "learning_rate": 7.24531569612874e-06, + "loss": 0.7302, + "step": 12847 + }, + { + "epoch": 0.7071385326655292, + "grad_norm": 0.8113438487052917, + "learning_rate": 7.2449283852579146e-06, + "loss": 0.7658, + "step": 12848 + }, + { + "epoch": 0.7071935714678849, + "grad_norm": 0.6442512273788452, + "learning_rate": 7.244541057514809e-06, + "loss": 0.6742, + "step": 12849 + }, + { + "epoch": 0.7072486102702406, + "grad_norm": 0.8595272898674011, + "learning_rate": 7.244153712902333e-06, + "loss": 0.7944, + "step": 12850 + }, + { + "epoch": 0.7073036490725961, + "grad_norm": 0.6565983891487122, + "learning_rate": 7.243766351423398e-06, + "loss": 0.7411, + "step": 12851 + }, + { + "epoch": 0.7073586878749518, + "grad_norm": 0.7935337424278259, + "learning_rate": 7.243378973080917e-06, + "loss": 0.8109, + "step": 12852 + }, + { + "epoch": 0.7074137266773075, + "grad_norm": 0.7083927392959595, + "learning_rate": 7.242991577877799e-06, + "loss": 0.8405, + "step": 12853 + }, + { + "epoch": 0.7074687654796632, + "grad_norm": 0.7452830672264099, + "learning_rate": 7.242604165816958e-06, + "loss": 0.7972, + "step": 12854 + }, + { + "epoch": 0.7075238042820188, + "grad_norm": 0.6775808334350586, + "learning_rate": 7.242216736901302e-06, + "loss": 0.7114, + "step": 12855 + }, + { + "epoch": 0.7075788430843745, + "grad_norm": 0.8069992661476135, + "learning_rate": 7.241829291133748e-06, + "loss": 0.6606, + "step": 12856 + }, + { + "epoch": 0.7076338818867302, + "grad_norm": 0.6690802574157715, + "learning_rate": 7.241441828517203e-06, + "loss": 0.742, + "step": 12857 + }, + { + "epoch": 0.7076889206890858, + "grad_norm": 0.8077805638313293, + "learning_rate": 7.2410543490545814e-06, + "loss": 0.7786, + "step": 12858 + }, + { + "epoch": 0.7077439594914414, + "grad_norm": 0.6906875967979431, + "learning_rate": 7.240666852748795e-06, + "loss": 0.7445, + "step": 12859 + }, + { + "epoch": 0.7077989982937971, + "grad_norm": 0.6830704808235168, + "learning_rate": 7.2402793396027585e-06, + "loss": 0.7664, + "step": 12860 + }, + { + "epoch": 0.7078540370961528, + "grad_norm": 0.8118640780448914, + "learning_rate": 7.23989180961938e-06, + "loss": 0.7654, + "step": 12861 + }, + { + "epoch": 0.7079090758985085, + "grad_norm": 0.6819882392883301, + "learning_rate": 7.2395042628015755e-06, + "loss": 0.649, + "step": 12862 + }, + { + "epoch": 0.7079641147008641, + "grad_norm": 0.6543441414833069, + "learning_rate": 7.239116699152256e-06, + "loss": 0.8054, + "step": 12863 + }, + { + "epoch": 0.7080191535032198, + "grad_norm": 0.8613989353179932, + "learning_rate": 7.238729118674335e-06, + "loss": 0.7283, + "step": 12864 + }, + { + "epoch": 0.7080741923055754, + "grad_norm": 0.6993124485015869, + "learning_rate": 7.238341521370725e-06, + "loss": 0.8145, + "step": 12865 + }, + { + "epoch": 0.7081292311079311, + "grad_norm": 0.7047560811042786, + "learning_rate": 7.237953907244339e-06, + "loss": 0.6729, + "step": 12866 + }, + { + "epoch": 0.7081842699102867, + "grad_norm": 0.7923689484596252, + "learning_rate": 7.237566276298091e-06, + "loss": 0.7615, + "step": 12867 + }, + { + "epoch": 0.7082393087126424, + "grad_norm": 0.6873850226402283, + "learning_rate": 7.237178628534894e-06, + "loss": 0.7638, + "step": 12868 + }, + { + "epoch": 0.7082943475149981, + "grad_norm": 0.6483134031295776, + "learning_rate": 7.236790963957661e-06, + "loss": 0.6366, + "step": 12869 + }, + { + "epoch": 0.7083493863173538, + "grad_norm": 0.6623784899711609, + "learning_rate": 7.236403282569305e-06, + "loss": 0.7032, + "step": 12870 + }, + { + "epoch": 0.7084044251197094, + "grad_norm": 0.7004366517066956, + "learning_rate": 7.236015584372741e-06, + "loss": 0.6436, + "step": 12871 + }, + { + "epoch": 0.708459463922065, + "grad_norm": 0.5676529407501221, + "learning_rate": 7.235627869370883e-06, + "loss": 0.6395, + "step": 12872 + }, + { + "epoch": 0.7085145027244207, + "grad_norm": 0.6909729838371277, + "learning_rate": 7.235240137566644e-06, + "loss": 0.7063, + "step": 12873 + }, + { + "epoch": 0.7085695415267764, + "grad_norm": 0.7635348439216614, + "learning_rate": 7.234852388962939e-06, + "loss": 0.7518, + "step": 12874 + }, + { + "epoch": 0.708624580329132, + "grad_norm": 0.7217742204666138, + "learning_rate": 7.2344646235626815e-06, + "loss": 0.7782, + "step": 12875 + }, + { + "epoch": 0.7086796191314877, + "grad_norm": 0.6506509184837341, + "learning_rate": 7.2340768413687855e-06, + "loss": 0.7456, + "step": 12876 + }, + { + "epoch": 0.7087346579338434, + "grad_norm": 0.6537386775016785, + "learning_rate": 7.2336890423841664e-06, + "loss": 0.7395, + "step": 12877 + }, + { + "epoch": 0.7087896967361991, + "grad_norm": 0.7759900689125061, + "learning_rate": 7.233301226611737e-06, + "loss": 0.8098, + "step": 12878 + }, + { + "epoch": 0.7088447355385546, + "grad_norm": 0.8476354479789734, + "learning_rate": 7.232913394054415e-06, + "loss": 0.8241, + "step": 12879 + }, + { + "epoch": 0.7088997743409103, + "grad_norm": 0.6770507097244263, + "learning_rate": 7.232525544715114e-06, + "loss": 0.6966, + "step": 12880 + }, + { + "epoch": 0.708954813143266, + "grad_norm": 0.7750027775764465, + "learning_rate": 7.232137678596747e-06, + "loss": 0.8038, + "step": 12881 + }, + { + "epoch": 0.7090098519456217, + "grad_norm": 0.6507213711738586, + "learning_rate": 7.231749795702232e-06, + "loss": 0.6446, + "step": 12882 + }, + { + "epoch": 0.7090648907479773, + "grad_norm": 0.7554625272750854, + "learning_rate": 7.231361896034481e-06, + "loss": 0.7769, + "step": 12883 + }, + { + "epoch": 0.709119929550333, + "grad_norm": 0.8175020813941956, + "learning_rate": 7.230973979596414e-06, + "loss": 0.8283, + "step": 12884 + }, + { + "epoch": 0.7091749683526887, + "grad_norm": 0.7528663873672485, + "learning_rate": 7.2305860463909416e-06, + "loss": 0.7737, + "step": 12885 + }, + { + "epoch": 0.7092300071550443, + "grad_norm": 0.9242768883705139, + "learning_rate": 7.230198096420983e-06, + "loss": 0.647, + "step": 12886 + }, + { + "epoch": 0.7092850459573999, + "grad_norm": 0.899874746799469, + "learning_rate": 7.229810129689452e-06, + "loss": 0.8952, + "step": 12887 + }, + { + "epoch": 0.7093400847597556, + "grad_norm": 0.8221275806427002, + "learning_rate": 7.229422146199266e-06, + "loss": 0.6845, + "step": 12888 + }, + { + "epoch": 0.7093951235621113, + "grad_norm": 0.6964027285575867, + "learning_rate": 7.229034145953338e-06, + "loss": 0.7153, + "step": 12889 + }, + { + "epoch": 0.7094501623644669, + "grad_norm": 0.8018684387207031, + "learning_rate": 7.228646128954588e-06, + "loss": 0.6421, + "step": 12890 + }, + { + "epoch": 0.7095052011668226, + "grad_norm": 0.6874614953994751, + "learning_rate": 7.228258095205928e-06, + "loss": 0.8024, + "step": 12891 + }, + { + "epoch": 0.7095602399691783, + "grad_norm": 0.7141417860984802, + "learning_rate": 7.227870044710277e-06, + "loss": 0.7746, + "step": 12892 + }, + { + "epoch": 0.709615278771534, + "grad_norm": 0.7109399437904358, + "learning_rate": 7.227481977470552e-06, + "loss": 0.7826, + "step": 12893 + }, + { + "epoch": 0.7096703175738895, + "grad_norm": 0.7021867036819458, + "learning_rate": 7.227093893489669e-06, + "loss": 0.7196, + "step": 12894 + }, + { + "epoch": 0.7097253563762452, + "grad_norm": 0.6896560788154602, + "learning_rate": 7.226705792770543e-06, + "loss": 0.6925, + "step": 12895 + }, + { + "epoch": 0.7097803951786009, + "grad_norm": 0.7138262987136841, + "learning_rate": 7.226317675316094e-06, + "loss": 0.7417, + "step": 12896 + }, + { + "epoch": 0.7098354339809566, + "grad_norm": 0.6789212226867676, + "learning_rate": 7.225929541129236e-06, + "loss": 0.7095, + "step": 12897 + }, + { + "epoch": 0.7098904727833122, + "grad_norm": 0.8102045059204102, + "learning_rate": 7.225541390212889e-06, + "loss": 0.9252, + "step": 12898 + }, + { + "epoch": 0.7099455115856679, + "grad_norm": 0.6220358610153198, + "learning_rate": 7.2251532225699674e-06, + "loss": 0.7205, + "step": 12899 + }, + { + "epoch": 0.7100005503880236, + "grad_norm": 0.6375265121459961, + "learning_rate": 7.224765038203391e-06, + "loss": 0.7974, + "step": 12900 + }, + { + "epoch": 0.7100555891903793, + "grad_norm": 0.7457360029220581, + "learning_rate": 7.224376837116075e-06, + "loss": 0.7083, + "step": 12901 + }, + { + "epoch": 0.7101106279927348, + "grad_norm": 0.7012878060340881, + "learning_rate": 7.2239886193109374e-06, + "loss": 0.7334, + "step": 12902 + }, + { + "epoch": 0.7101656667950905, + "grad_norm": 0.7437683343887329, + "learning_rate": 7.223600384790898e-06, + "loss": 0.82, + "step": 12903 + }, + { + "epoch": 0.7102207055974462, + "grad_norm": 0.6727370619773865, + "learning_rate": 7.223212133558872e-06, + "loss": 0.7339, + "step": 12904 + }, + { + "epoch": 0.7102757443998019, + "grad_norm": 0.9253849983215332, + "learning_rate": 7.222823865617781e-06, + "loss": 0.7398, + "step": 12905 + }, + { + "epoch": 0.7103307832021575, + "grad_norm": 0.6664100885391235, + "learning_rate": 7.222435580970539e-06, + "loss": 0.7519, + "step": 12906 + }, + { + "epoch": 0.7103858220045132, + "grad_norm": 0.7452943325042725, + "learning_rate": 7.222047279620066e-06, + "loss": 0.7382, + "step": 12907 + }, + { + "epoch": 0.7104408608068689, + "grad_norm": 0.7235015630722046, + "learning_rate": 7.22165896156928e-06, + "loss": 0.7726, + "step": 12908 + }, + { + "epoch": 0.7104958996092245, + "grad_norm": 0.6324653029441833, + "learning_rate": 7.221270626821102e-06, + "loss": 0.7451, + "step": 12909 + }, + { + "epoch": 0.7105509384115801, + "grad_norm": 0.789829432964325, + "learning_rate": 7.220882275378447e-06, + "loss": 0.7375, + "step": 12910 + }, + { + "epoch": 0.7106059772139358, + "grad_norm": 0.9090244174003601, + "learning_rate": 7.220493907244236e-06, + "loss": 0.8935, + "step": 12911 + }, + { + "epoch": 0.7106610160162915, + "grad_norm": 0.6570677757263184, + "learning_rate": 7.220105522421388e-06, + "loss": 0.7259, + "step": 12912 + }, + { + "epoch": 0.7107160548186472, + "grad_norm": 0.7142132520675659, + "learning_rate": 7.219717120912819e-06, + "loss": 0.7862, + "step": 12913 + }, + { + "epoch": 0.7107710936210028, + "grad_norm": 0.7359404563903809, + "learning_rate": 7.219328702721452e-06, + "loss": 0.7074, + "step": 12914 + }, + { + "epoch": 0.7108261324233585, + "grad_norm": 0.7118046283721924, + "learning_rate": 7.218940267850203e-06, + "loss": 0.8151, + "step": 12915 + }, + { + "epoch": 0.7108811712257141, + "grad_norm": 0.8301580548286438, + "learning_rate": 7.218551816301994e-06, + "loss": 0.7031, + "step": 12916 + }, + { + "epoch": 0.7109362100280698, + "grad_norm": 0.6647501587867737, + "learning_rate": 7.218163348079743e-06, + "loss": 0.8309, + "step": 12917 + }, + { + "epoch": 0.7109912488304254, + "grad_norm": 0.6546997427940369, + "learning_rate": 7.217774863186371e-06, + "loss": 0.717, + "step": 12918 + }, + { + "epoch": 0.7110462876327811, + "grad_norm": 0.6639735102653503, + "learning_rate": 7.217386361624795e-06, + "loss": 0.7308, + "step": 12919 + }, + { + "epoch": 0.7111013264351368, + "grad_norm": 0.724433183670044, + "learning_rate": 7.216997843397938e-06, + "loss": 0.7576, + "step": 12920 + }, + { + "epoch": 0.7111563652374925, + "grad_norm": 0.750253438949585, + "learning_rate": 7.216609308508719e-06, + "loss": 0.7014, + "step": 12921 + }, + { + "epoch": 0.7112114040398481, + "grad_norm": 0.7010897397994995, + "learning_rate": 7.216220756960058e-06, + "loss": 0.6951, + "step": 12922 + }, + { + "epoch": 0.7112664428422037, + "grad_norm": 0.7739251852035522, + "learning_rate": 7.215832188754873e-06, + "loss": 0.7392, + "step": 12923 + }, + { + "epoch": 0.7113214816445594, + "grad_norm": 0.6893059015274048, + "learning_rate": 7.215443603896088e-06, + "loss": 0.7029, + "step": 12924 + }, + { + "epoch": 0.7113765204469151, + "grad_norm": 0.8061872124671936, + "learning_rate": 7.215055002386622e-06, + "loss": 0.7557, + "step": 12925 + }, + { + "epoch": 0.7114315592492707, + "grad_norm": 1.089525580406189, + "learning_rate": 7.214666384229395e-06, + "loss": 0.6701, + "step": 12926 + }, + { + "epoch": 0.7114865980516264, + "grad_norm": 0.7601733207702637, + "learning_rate": 7.2142777494273275e-06, + "loss": 0.8113, + "step": 12927 + }, + { + "epoch": 0.7115416368539821, + "grad_norm": 0.7863540649414062, + "learning_rate": 7.213889097983342e-06, + "loss": 0.7945, + "step": 12928 + }, + { + "epoch": 0.7115966756563377, + "grad_norm": 0.7722556591033936, + "learning_rate": 7.21350042990036e-06, + "loss": 0.9492, + "step": 12929 + }, + { + "epoch": 0.7116517144586934, + "grad_norm": 0.6834682822227478, + "learning_rate": 7.213111745181299e-06, + "loss": 0.7138, + "step": 12930 + }, + { + "epoch": 0.711706753261049, + "grad_norm": 0.6974432468414307, + "learning_rate": 7.212723043829083e-06, + "loss": 0.7654, + "step": 12931 + }, + { + "epoch": 0.7117617920634047, + "grad_norm": 0.9797543883323669, + "learning_rate": 7.2123343258466334e-06, + "loss": 0.7786, + "step": 12932 + }, + { + "epoch": 0.7118168308657603, + "grad_norm": 0.6337804794311523, + "learning_rate": 7.211945591236872e-06, + "loss": 0.7147, + "step": 12933 + }, + { + "epoch": 0.711871869668116, + "grad_norm": 0.7450474500656128, + "learning_rate": 7.211556840002718e-06, + "loss": 0.8516, + "step": 12934 + }, + { + "epoch": 0.7119269084704717, + "grad_norm": 0.7786532640457153, + "learning_rate": 7.2111680721470965e-06, + "loss": 0.837, + "step": 12935 + }, + { + "epoch": 0.7119819472728274, + "grad_norm": 0.666020393371582, + "learning_rate": 7.210779287672927e-06, + "loss": 0.7646, + "step": 12936 + }, + { + "epoch": 0.712036986075183, + "grad_norm": 0.622648298740387, + "learning_rate": 7.210390486583132e-06, + "loss": 0.7102, + "step": 12937 + }, + { + "epoch": 0.7120920248775386, + "grad_norm": 0.7175952792167664, + "learning_rate": 7.210001668880634e-06, + "loss": 0.7043, + "step": 12938 + }, + { + "epoch": 0.7121470636798943, + "grad_norm": 0.8019681572914124, + "learning_rate": 7.209612834568353e-06, + "loss": 0.8166, + "step": 12939 + }, + { + "epoch": 0.71220210248225, + "grad_norm": 0.804457426071167, + "learning_rate": 7.209223983649216e-06, + "loss": 0.7182, + "step": 12940 + }, + { + "epoch": 0.7122571412846056, + "grad_norm": 0.7261730432510376, + "learning_rate": 7.208835116126143e-06, + "loss": 0.6634, + "step": 12941 + }, + { + "epoch": 0.7123121800869613, + "grad_norm": 0.7461307644844055, + "learning_rate": 7.208446232002055e-06, + "loss": 0.709, + "step": 12942 + }, + { + "epoch": 0.712367218889317, + "grad_norm": 0.6730383634567261, + "learning_rate": 7.208057331279877e-06, + "loss": 0.7111, + "step": 12943 + }, + { + "epoch": 0.7124222576916727, + "grad_norm": 0.829530656337738, + "learning_rate": 7.207668413962531e-06, + "loss": 0.729, + "step": 12944 + }, + { + "epoch": 0.7124772964940282, + "grad_norm": 0.5997991561889648, + "learning_rate": 7.20727948005294e-06, + "loss": 0.6385, + "step": 12945 + }, + { + "epoch": 0.7125323352963839, + "grad_norm": 0.9590086936950684, + "learning_rate": 7.206890529554027e-06, + "loss": 0.7217, + "step": 12946 + }, + { + "epoch": 0.7125873740987396, + "grad_norm": 0.7818330526351929, + "learning_rate": 7.206501562468717e-06, + "loss": 0.7276, + "step": 12947 + }, + { + "epoch": 0.7126424129010953, + "grad_norm": 0.6033679842948914, + "learning_rate": 7.206112578799931e-06, + "loss": 0.5935, + "step": 12948 + }, + { + "epoch": 0.7126974517034509, + "grad_norm": 0.7431650757789612, + "learning_rate": 7.205723578550593e-06, + "loss": 0.8649, + "step": 12949 + }, + { + "epoch": 0.7127524905058066, + "grad_norm": 0.7026848793029785, + "learning_rate": 7.205334561723627e-06, + "loss": 0.7484, + "step": 12950 + }, + { + "epoch": 0.7128075293081623, + "grad_norm": 0.6328058242797852, + "learning_rate": 7.204945528321956e-06, + "loss": 0.6994, + "step": 12951 + }, + { + "epoch": 0.712862568110518, + "grad_norm": 0.6806536912918091, + "learning_rate": 7.204556478348507e-06, + "loss": 0.7461, + "step": 12952 + }, + { + "epoch": 0.7129176069128735, + "grad_norm": 0.6822162866592407, + "learning_rate": 7.2041674118062e-06, + "loss": 0.7947, + "step": 12953 + }, + { + "epoch": 0.7129726457152292, + "grad_norm": 0.7283263802528381, + "learning_rate": 7.203778328697962e-06, + "loss": 0.7559, + "step": 12954 + }, + { + "epoch": 0.7130276845175849, + "grad_norm": 0.663564920425415, + "learning_rate": 7.203389229026714e-06, + "loss": 0.6898, + "step": 12955 + }, + { + "epoch": 0.7130827233199406, + "grad_norm": 0.7218708395957947, + "learning_rate": 7.203000112795383e-06, + "loss": 0.8095, + "step": 12956 + }, + { + "epoch": 0.7131377621222962, + "grad_norm": 0.6931518912315369, + "learning_rate": 7.202610980006893e-06, + "loss": 0.7591, + "step": 12957 + }, + { + "epoch": 0.7131928009246519, + "grad_norm": 0.6982918381690979, + "learning_rate": 7.2022218306641704e-06, + "loss": 0.7651, + "step": 12958 + }, + { + "epoch": 0.7132478397270076, + "grad_norm": 0.8033974170684814, + "learning_rate": 7.201832664770135e-06, + "loss": 0.8857, + "step": 12959 + }, + { + "epoch": 0.7133028785293632, + "grad_norm": 0.6625493764877319, + "learning_rate": 7.201443482327717e-06, + "loss": 0.752, + "step": 12960 + }, + { + "epoch": 0.7133579173317188, + "grad_norm": 0.8149683475494385, + "learning_rate": 7.201054283339838e-06, + "loss": 0.8528, + "step": 12961 + }, + { + "epoch": 0.7134129561340745, + "grad_norm": 0.7894958257675171, + "learning_rate": 7.200665067809425e-06, + "loss": 0.8554, + "step": 12962 + }, + { + "epoch": 0.7134679949364302, + "grad_norm": 0.7613523602485657, + "learning_rate": 7.200275835739401e-06, + "loss": 0.7435, + "step": 12963 + }, + { + "epoch": 0.7135230337387859, + "grad_norm": 0.665985643863678, + "learning_rate": 7.199886587132693e-06, + "loss": 0.7072, + "step": 12964 + }, + { + "epoch": 0.7135780725411415, + "grad_norm": 0.7523592710494995, + "learning_rate": 7.199497321992227e-06, + "loss": 0.7945, + "step": 12965 + }, + { + "epoch": 0.7136331113434972, + "grad_norm": 0.8894450664520264, + "learning_rate": 7.199108040320928e-06, + "loss": 0.7885, + "step": 12966 + }, + { + "epoch": 0.7136881501458529, + "grad_norm": 0.639108419418335, + "learning_rate": 7.198718742121722e-06, + "loss": 0.6975, + "step": 12967 + }, + { + "epoch": 0.7137431889482085, + "grad_norm": 0.670013964176178, + "learning_rate": 7.198329427397532e-06, + "loss": 0.7441, + "step": 12968 + }, + { + "epoch": 0.7137982277505641, + "grad_norm": 0.7695425748825073, + "learning_rate": 7.197940096151289e-06, + "loss": 0.7616, + "step": 12969 + }, + { + "epoch": 0.7138532665529198, + "grad_norm": 0.9098057150840759, + "learning_rate": 7.197550748385917e-06, + "loss": 0.9028, + "step": 12970 + }, + { + "epoch": 0.7139083053552755, + "grad_norm": 0.7677769660949707, + "learning_rate": 7.197161384104341e-06, + "loss": 0.7926, + "step": 12971 + }, + { + "epoch": 0.7139633441576311, + "grad_norm": 0.7020674347877502, + "learning_rate": 7.196772003309487e-06, + "loss": 0.7248, + "step": 12972 + }, + { + "epoch": 0.7140183829599868, + "grad_norm": 0.6616366505622864, + "learning_rate": 7.196382606004283e-06, + "loss": 0.7137, + "step": 12973 + }, + { + "epoch": 0.7140734217623425, + "grad_norm": 0.7174738645553589, + "learning_rate": 7.195993192191656e-06, + "loss": 0.8167, + "step": 12974 + }, + { + "epoch": 0.7141284605646981, + "grad_norm": 0.6672176122665405, + "learning_rate": 7.1956037618745325e-06, + "loss": 0.6516, + "step": 12975 + }, + { + "epoch": 0.7141834993670537, + "grad_norm": 0.714790403842926, + "learning_rate": 7.195214315055837e-06, + "loss": 0.865, + "step": 12976 + }, + { + "epoch": 0.7142385381694094, + "grad_norm": 0.6637690663337708, + "learning_rate": 7.1948248517385e-06, + "loss": 0.7328, + "step": 12977 + }, + { + "epoch": 0.7142935769717651, + "grad_norm": 0.8998367786407471, + "learning_rate": 7.194435371925446e-06, + "loss": 0.7097, + "step": 12978 + }, + { + "epoch": 0.7143486157741208, + "grad_norm": 0.7472445964813232, + "learning_rate": 7.194045875619604e-06, + "loss": 0.7556, + "step": 12979 + }, + { + "epoch": 0.7144036545764764, + "grad_norm": 0.7897135019302368, + "learning_rate": 7.1936563628239e-06, + "loss": 0.8728, + "step": 12980 + }, + { + "epoch": 0.714458693378832, + "grad_norm": 0.6520817279815674, + "learning_rate": 7.193266833541261e-06, + "loss": 0.6824, + "step": 12981 + }, + { + "epoch": 0.7145137321811877, + "grad_norm": 0.833849310874939, + "learning_rate": 7.192877287774618e-06, + "loss": 0.8877, + "step": 12982 + }, + { + "epoch": 0.7145687709835434, + "grad_norm": 0.7105151414871216, + "learning_rate": 7.192487725526896e-06, + "loss": 0.7799, + "step": 12983 + }, + { + "epoch": 0.714623809785899, + "grad_norm": 0.7515869140625, + "learning_rate": 7.192098146801021e-06, + "loss": 0.7012, + "step": 12984 + }, + { + "epoch": 0.7146788485882547, + "grad_norm": 0.7447199821472168, + "learning_rate": 7.191708551599923e-06, + "loss": 0.7545, + "step": 12985 + }, + { + "epoch": 0.7147338873906104, + "grad_norm": 0.8502823114395142, + "learning_rate": 7.191318939926532e-06, + "loss": 0.7232, + "step": 12986 + }, + { + "epoch": 0.7147889261929661, + "grad_norm": 0.7193031907081604, + "learning_rate": 7.190929311783774e-06, + "loss": 0.762, + "step": 12987 + }, + { + "epoch": 0.7148439649953217, + "grad_norm": 0.8479939699172974, + "learning_rate": 7.190539667174576e-06, + "loss": 0.7238, + "step": 12988 + }, + { + "epoch": 0.7148990037976773, + "grad_norm": 0.8313719630241394, + "learning_rate": 7.1901500061018704e-06, + "loss": 0.8145, + "step": 12989 + }, + { + "epoch": 0.714954042600033, + "grad_norm": 0.7019978165626526, + "learning_rate": 7.189760328568584e-06, + "loss": 0.6461, + "step": 12990 + }, + { + "epoch": 0.7150090814023887, + "grad_norm": 0.897280216217041, + "learning_rate": 7.1893706345776436e-06, + "loss": 0.818, + "step": 12991 + }, + { + "epoch": 0.7150641202047443, + "grad_norm": 0.7495617866516113, + "learning_rate": 7.1889809241319795e-06, + "loss": 0.7533, + "step": 12992 + }, + { + "epoch": 0.7151191590071, + "grad_norm": 0.733496904373169, + "learning_rate": 7.188591197234522e-06, + "loss": 0.7405, + "step": 12993 + }, + { + "epoch": 0.7151741978094557, + "grad_norm": 0.8873284459114075, + "learning_rate": 7.1882014538882e-06, + "loss": 0.7525, + "step": 12994 + }, + { + "epoch": 0.7152292366118114, + "grad_norm": 0.6693230271339417, + "learning_rate": 7.187811694095939e-06, + "loss": 0.7509, + "step": 12995 + }, + { + "epoch": 0.715284275414167, + "grad_norm": 0.8513357043266296, + "learning_rate": 7.187421917860671e-06, + "loss": 0.8111, + "step": 12996 + }, + { + "epoch": 0.7153393142165226, + "grad_norm": 0.6986566185951233, + "learning_rate": 7.187032125185326e-06, + "loss": 0.8013, + "step": 12997 + }, + { + "epoch": 0.7153943530188783, + "grad_norm": 0.7062557339668274, + "learning_rate": 7.1866423160728335e-06, + "loss": 0.7266, + "step": 12998 + }, + { + "epoch": 0.715449391821234, + "grad_norm": 0.6329573392868042, + "learning_rate": 7.186252490526122e-06, + "loss": 0.6753, + "step": 12999 + }, + { + "epoch": 0.7155044306235896, + "grad_norm": 0.6740719079971313, + "learning_rate": 7.185862648548122e-06, + "loss": 0.7197, + "step": 13000 + }, + { + "epoch": 0.7155594694259453, + "grad_norm": 0.7911732196807861, + "learning_rate": 7.185472790141764e-06, + "loss": 0.6939, + "step": 13001 + }, + { + "epoch": 0.715614508228301, + "grad_norm": 0.7368680238723755, + "learning_rate": 7.185082915309978e-06, + "loss": 0.6919, + "step": 13002 + }, + { + "epoch": 0.7156695470306567, + "grad_norm": 0.6374472975730896, + "learning_rate": 7.1846930240556925e-06, + "loss": 0.6645, + "step": 13003 + }, + { + "epoch": 0.7157245858330122, + "grad_norm": 0.6727073192596436, + "learning_rate": 7.184303116381839e-06, + "loss": 0.5995, + "step": 13004 + }, + { + "epoch": 0.7157796246353679, + "grad_norm": 0.6122208833694458, + "learning_rate": 7.183913192291348e-06, + "loss": 0.6755, + "step": 13005 + }, + { + "epoch": 0.7158346634377236, + "grad_norm": 0.7095892429351807, + "learning_rate": 7.1835232517871525e-06, + "loss": 0.8009, + "step": 13006 + }, + { + "epoch": 0.7158897022400793, + "grad_norm": 0.6828192472457886, + "learning_rate": 7.1831332948721786e-06, + "loss": 0.7755, + "step": 13007 + }, + { + "epoch": 0.7159447410424349, + "grad_norm": 0.7997334003448486, + "learning_rate": 7.182743321549359e-06, + "loss": 0.7259, + "step": 13008 + }, + { + "epoch": 0.7159997798447906, + "grad_norm": 0.7431252002716064, + "learning_rate": 7.182353331821626e-06, + "loss": 0.7765, + "step": 13009 + }, + { + "epoch": 0.7160548186471463, + "grad_norm": 0.7202625870704651, + "learning_rate": 7.181963325691907e-06, + "loss": 0.7638, + "step": 13010 + }, + { + "epoch": 0.716109857449502, + "grad_norm": 0.7617568373680115, + "learning_rate": 7.181573303163139e-06, + "loss": 0.825, + "step": 13011 + }, + { + "epoch": 0.7161648962518575, + "grad_norm": 0.7382665276527405, + "learning_rate": 7.181183264238247e-06, + "loss": 0.8005, + "step": 13012 + }, + { + "epoch": 0.7162199350542132, + "grad_norm": 0.7782611846923828, + "learning_rate": 7.180793208920167e-06, + "loss": 0.7044, + "step": 13013 + }, + { + "epoch": 0.7162749738565689, + "grad_norm": 0.7020898461341858, + "learning_rate": 7.18040313721183e-06, + "loss": 0.8059, + "step": 13014 + }, + { + "epoch": 0.7163300126589245, + "grad_norm": 1.2005099058151245, + "learning_rate": 7.1800130491161656e-06, + "loss": 0.6663, + "step": 13015 + }, + { + "epoch": 0.7163850514612802, + "grad_norm": 0.6663569211959839, + "learning_rate": 7.1796229446361066e-06, + "loss": 0.7046, + "step": 13016 + }, + { + "epoch": 0.7164400902636359, + "grad_norm": 0.7010110020637512, + "learning_rate": 7.1792328237745845e-06, + "loss": 0.6433, + "step": 13017 + }, + { + "epoch": 0.7164951290659916, + "grad_norm": 0.6447514891624451, + "learning_rate": 7.178842686534534e-06, + "loss": 0.7794, + "step": 13018 + }, + { + "epoch": 0.7165501678683471, + "grad_norm": 0.6813021302223206, + "learning_rate": 7.1784525329188835e-06, + "loss": 0.7413, + "step": 13019 + }, + { + "epoch": 0.7166052066707028, + "grad_norm": 0.6894733905792236, + "learning_rate": 7.178062362930567e-06, + "loss": 0.7896, + "step": 13020 + }, + { + "epoch": 0.7166602454730585, + "grad_norm": 0.6717034578323364, + "learning_rate": 7.177672176572517e-06, + "loss": 0.7599, + "step": 13021 + }, + { + "epoch": 0.7167152842754142, + "grad_norm": 0.7861666083335876, + "learning_rate": 7.177281973847665e-06, + "loss": 0.9068, + "step": 13022 + }, + { + "epoch": 0.7167703230777698, + "grad_norm": 0.6784214973449707, + "learning_rate": 7.176891754758946e-06, + "loss": 0.8319, + "step": 13023 + }, + { + "epoch": 0.7168253618801255, + "grad_norm": 0.7053580284118652, + "learning_rate": 7.176501519309289e-06, + "loss": 0.8085, + "step": 13024 + }, + { + "epoch": 0.7168804006824812, + "grad_norm": 0.9643208980560303, + "learning_rate": 7.176111267501631e-06, + "loss": 0.7799, + "step": 13025 + }, + { + "epoch": 0.7169354394848368, + "grad_norm": 0.8921111822128296, + "learning_rate": 7.175720999338902e-06, + "loss": 0.6465, + "step": 13026 + }, + { + "epoch": 0.7169904782871924, + "grad_norm": 0.7356166839599609, + "learning_rate": 7.1753307148240385e-06, + "loss": 0.7862, + "step": 13027 + }, + { + "epoch": 0.7170455170895481, + "grad_norm": 0.6906836628913879, + "learning_rate": 7.174940413959968e-06, + "loss": 0.7341, + "step": 13028 + }, + { + "epoch": 0.7171005558919038, + "grad_norm": 0.6229632496833801, + "learning_rate": 7.174550096749632e-06, + "loss": 0.721, + "step": 13029 + }, + { + "epoch": 0.7171555946942595, + "grad_norm": 0.6832499504089355, + "learning_rate": 7.174159763195958e-06, + "loss": 0.6733, + "step": 13030 + }, + { + "epoch": 0.7172106334966151, + "grad_norm": 0.8304060697555542, + "learning_rate": 7.1737694133018806e-06, + "loss": 0.7732, + "step": 13031 + }, + { + "epoch": 0.7172656722989708, + "grad_norm": 0.6813186407089233, + "learning_rate": 7.173379047070333e-06, + "loss": 0.7742, + "step": 13032 + }, + { + "epoch": 0.7173207111013264, + "grad_norm": 0.6671963930130005, + "learning_rate": 7.172988664504252e-06, + "loss": 0.6516, + "step": 13033 + }, + { + "epoch": 0.7173757499036821, + "grad_norm": 0.661108136177063, + "learning_rate": 7.172598265606569e-06, + "loss": 0.7361, + "step": 13034 + }, + { + "epoch": 0.7174307887060377, + "grad_norm": 0.7097620368003845, + "learning_rate": 7.1722078503802196e-06, + "loss": 0.8142, + "step": 13035 + }, + { + "epoch": 0.7174858275083934, + "grad_norm": 0.7663383483886719, + "learning_rate": 7.1718174188281365e-06, + "loss": 0.8149, + "step": 13036 + }, + { + "epoch": 0.7175408663107491, + "grad_norm": 0.7142401337623596, + "learning_rate": 7.171426970953256e-06, + "loss": 0.7539, + "step": 13037 + }, + { + "epoch": 0.7175959051131048, + "grad_norm": 0.667346715927124, + "learning_rate": 7.171036506758512e-06, + "loss": 0.7517, + "step": 13038 + }, + { + "epoch": 0.7176509439154604, + "grad_norm": 0.5933231711387634, + "learning_rate": 7.170646026246838e-06, + "loss": 0.6852, + "step": 13039 + }, + { + "epoch": 0.717705982717816, + "grad_norm": 0.730015218257904, + "learning_rate": 7.170255529421168e-06, + "loss": 0.7316, + "step": 13040 + }, + { + "epoch": 0.7177610215201717, + "grad_norm": 0.6146146059036255, + "learning_rate": 7.169865016284442e-06, + "loss": 0.6715, + "step": 13041 + }, + { + "epoch": 0.7178160603225274, + "grad_norm": 0.694131076335907, + "learning_rate": 7.16947448683959e-06, + "loss": 0.7944, + "step": 13042 + }, + { + "epoch": 0.717871099124883, + "grad_norm": 0.6736807823181152, + "learning_rate": 7.169083941089547e-06, + "loss": 0.7922, + "step": 13043 + }, + { + "epoch": 0.7179261379272387, + "grad_norm": 0.6748425364494324, + "learning_rate": 7.16869337903725e-06, + "loss": 0.6738, + "step": 13044 + }, + { + "epoch": 0.7179811767295944, + "grad_norm": 0.6807510852813721, + "learning_rate": 7.168302800685635e-06, + "loss": 0.7291, + "step": 13045 + }, + { + "epoch": 0.7180362155319501, + "grad_norm": 0.6613160371780396, + "learning_rate": 7.167912206037637e-06, + "loss": 0.6839, + "step": 13046 + }, + { + "epoch": 0.7180912543343057, + "grad_norm": 0.7184692621231079, + "learning_rate": 7.16752159509619e-06, + "loss": 0.6748, + "step": 13047 + }, + { + "epoch": 0.7181462931366613, + "grad_norm": 0.6938989758491516, + "learning_rate": 7.167130967864231e-06, + "loss": 0.7926, + "step": 13048 + }, + { + "epoch": 0.718201331939017, + "grad_norm": 0.6871020793914795, + "learning_rate": 7.166740324344696e-06, + "loss": 0.8229, + "step": 13049 + }, + { + "epoch": 0.7182563707413727, + "grad_norm": 0.8003624081611633, + "learning_rate": 7.166349664540521e-06, + "loss": 0.8488, + "step": 13050 + }, + { + "epoch": 0.7183114095437283, + "grad_norm": 0.7309357523918152, + "learning_rate": 7.165958988454642e-06, + "loss": 0.7442, + "step": 13051 + }, + { + "epoch": 0.718366448346084, + "grad_norm": 0.7462141513824463, + "learning_rate": 7.165568296089993e-06, + "loss": 0.8014, + "step": 13052 + }, + { + "epoch": 0.7184214871484397, + "grad_norm": 0.8335661292076111, + "learning_rate": 7.165177587449516e-06, + "loss": 0.6773, + "step": 13053 + }, + { + "epoch": 0.7184765259507954, + "grad_norm": 0.6996884346008301, + "learning_rate": 7.164786862536142e-06, + "loss": 0.7491, + "step": 13054 + }, + { + "epoch": 0.7185315647531509, + "grad_norm": 0.7203043103218079, + "learning_rate": 7.164396121352809e-06, + "loss": 0.7196, + "step": 13055 + }, + { + "epoch": 0.7185866035555066, + "grad_norm": 0.7109461426734924, + "learning_rate": 7.164005363902453e-06, + "loss": 0.7336, + "step": 13056 + }, + { + "epoch": 0.7186416423578623, + "grad_norm": 0.7057282328605652, + "learning_rate": 7.1636145901880135e-06, + "loss": 0.734, + "step": 13057 + }, + { + "epoch": 0.7186966811602179, + "grad_norm": 0.7288782000541687, + "learning_rate": 7.163223800212427e-06, + "loss": 0.8141, + "step": 13058 + }, + { + "epoch": 0.7187517199625736, + "grad_norm": 0.6812320947647095, + "learning_rate": 7.162832993978628e-06, + "loss": 0.7525, + "step": 13059 + }, + { + "epoch": 0.7188067587649293, + "grad_norm": 0.6782627105712891, + "learning_rate": 7.1624421714895546e-06, + "loss": 0.7647, + "step": 13060 + }, + { + "epoch": 0.718861797567285, + "grad_norm": 0.7361965775489807, + "learning_rate": 7.162051332748146e-06, + "loss": 0.7774, + "step": 13061 + }, + { + "epoch": 0.7189168363696405, + "grad_norm": 0.68894362449646, + "learning_rate": 7.161660477757337e-06, + "loss": 0.767, + "step": 13062 + }, + { + "epoch": 0.7189718751719962, + "grad_norm": 0.6440854668617249, + "learning_rate": 7.161269606520067e-06, + "loss": 0.7062, + "step": 13063 + }, + { + "epoch": 0.7190269139743519, + "grad_norm": 0.8411546945571899, + "learning_rate": 7.160878719039273e-06, + "loss": 0.728, + "step": 13064 + }, + { + "epoch": 0.7190819527767076, + "grad_norm": 0.6895145177841187, + "learning_rate": 7.160487815317895e-06, + "loss": 0.6667, + "step": 13065 + }, + { + "epoch": 0.7191369915790632, + "grad_norm": 0.6943626403808594, + "learning_rate": 7.160096895358866e-06, + "loss": 0.7579, + "step": 13066 + }, + { + "epoch": 0.7191920303814189, + "grad_norm": 0.7940205335617065, + "learning_rate": 7.1597059591651294e-06, + "loss": 0.7286, + "step": 13067 + }, + { + "epoch": 0.7192470691837746, + "grad_norm": 0.7350896000862122, + "learning_rate": 7.159315006739619e-06, + "loss": 0.7174, + "step": 13068 + }, + { + "epoch": 0.7193021079861303, + "grad_norm": 0.7663372159004211, + "learning_rate": 7.158924038085275e-06, + "loss": 0.7871, + "step": 13069 + }, + { + "epoch": 0.7193571467884858, + "grad_norm": 0.7368965744972229, + "learning_rate": 7.1585330532050375e-06, + "loss": 0.7356, + "step": 13070 + }, + { + "epoch": 0.7194121855908415, + "grad_norm": 0.7345212697982788, + "learning_rate": 7.158142052101843e-06, + "loss": 0.7784, + "step": 13071 + }, + { + "epoch": 0.7194672243931972, + "grad_norm": 0.7847188711166382, + "learning_rate": 7.157751034778629e-06, + "loss": 0.7899, + "step": 13072 + }, + { + "epoch": 0.7195222631955529, + "grad_norm": 0.757514476776123, + "learning_rate": 7.157360001238337e-06, + "loss": 0.8899, + "step": 13073 + }, + { + "epoch": 0.7195773019979085, + "grad_norm": 0.73405522108078, + "learning_rate": 7.156968951483905e-06, + "loss": 0.7283, + "step": 13074 + }, + { + "epoch": 0.7196323408002642, + "grad_norm": 0.7950206398963928, + "learning_rate": 7.156577885518271e-06, + "loss": 0.7338, + "step": 13075 + }, + { + "epoch": 0.7196873796026199, + "grad_norm": 0.8082411289215088, + "learning_rate": 7.156186803344374e-06, + "loss": 0.711, + "step": 13076 + }, + { + "epoch": 0.7197424184049755, + "grad_norm": 0.6868693828582764, + "learning_rate": 7.1557957049651574e-06, + "loss": 0.7583, + "step": 13077 + }, + { + "epoch": 0.7197974572073311, + "grad_norm": 0.7226251363754272, + "learning_rate": 7.155404590383554e-06, + "loss": 0.746, + "step": 13078 + }, + { + "epoch": 0.7198524960096868, + "grad_norm": 0.7437220811843872, + "learning_rate": 7.155013459602509e-06, + "loss": 0.6884, + "step": 13079 + }, + { + "epoch": 0.7199075348120425, + "grad_norm": 0.7486164569854736, + "learning_rate": 7.154622312624958e-06, + "loss": 0.6968, + "step": 13080 + }, + { + "epoch": 0.7199625736143982, + "grad_norm": 0.7709106802940369, + "learning_rate": 7.154231149453843e-06, + "loss": 0.838, + "step": 13081 + }, + { + "epoch": 0.7200176124167538, + "grad_norm": 0.6962981224060059, + "learning_rate": 7.153839970092104e-06, + "loss": 0.7186, + "step": 13082 + }, + { + "epoch": 0.7200726512191095, + "grad_norm": 0.8195380568504333, + "learning_rate": 7.15344877454268e-06, + "loss": 0.7949, + "step": 13083 + }, + { + "epoch": 0.7201276900214651, + "grad_norm": 0.735285758972168, + "learning_rate": 7.15305756280851e-06, + "loss": 0.7477, + "step": 13084 + }, + { + "epoch": 0.7201827288238208, + "grad_norm": 0.6121101379394531, + "learning_rate": 7.1526663348925375e-06, + "loss": 0.6686, + "step": 13085 + }, + { + "epoch": 0.7202377676261764, + "grad_norm": 0.7204885482788086, + "learning_rate": 7.1522750907977e-06, + "loss": 0.8013, + "step": 13086 + }, + { + "epoch": 0.7202928064285321, + "grad_norm": 0.6808584332466125, + "learning_rate": 7.15188383052694e-06, + "loss": 0.7847, + "step": 13087 + }, + { + "epoch": 0.7203478452308878, + "grad_norm": 0.7049086093902588, + "learning_rate": 7.151492554083195e-06, + "loss": 0.7563, + "step": 13088 + }, + { + "epoch": 0.7204028840332435, + "grad_norm": 0.765708327293396, + "learning_rate": 7.151101261469411e-06, + "loss": 0.7648, + "step": 13089 + }, + { + "epoch": 0.7204579228355991, + "grad_norm": 0.6810007095336914, + "learning_rate": 7.150709952688525e-06, + "loss": 0.731, + "step": 13090 + }, + { + "epoch": 0.7205129616379548, + "grad_norm": 0.7242745757102966, + "learning_rate": 7.150318627743478e-06, + "loss": 0.8027, + "step": 13091 + }, + { + "epoch": 0.7205680004403104, + "grad_norm": 0.7452220916748047, + "learning_rate": 7.14992728663721e-06, + "loss": 0.7848, + "step": 13092 + }, + { + "epoch": 0.7206230392426661, + "grad_norm": 0.6333943605422974, + "learning_rate": 7.149535929372667e-06, + "loss": 0.7105, + "step": 13093 + }, + { + "epoch": 0.7206780780450217, + "grad_norm": 0.7565333247184753, + "learning_rate": 7.149144555952785e-06, + "loss": 0.8006, + "step": 13094 + }, + { + "epoch": 0.7207331168473774, + "grad_norm": 0.7703632712364197, + "learning_rate": 7.14875316638051e-06, + "loss": 0.7323, + "step": 13095 + }, + { + "epoch": 0.7207881556497331, + "grad_norm": 0.6275011301040649, + "learning_rate": 7.148361760658779e-06, + "loss": 0.6817, + "step": 13096 + }, + { + "epoch": 0.7208431944520888, + "grad_norm": 0.7363598942756653, + "learning_rate": 7.147970338790537e-06, + "loss": 0.7641, + "step": 13097 + }, + { + "epoch": 0.7208982332544444, + "grad_norm": 0.6284294724464417, + "learning_rate": 7.147578900778727e-06, + "loss": 0.7117, + "step": 13098 + }, + { + "epoch": 0.7209532720568, + "grad_norm": 0.7878503203392029, + "learning_rate": 7.147187446626287e-06, + "loss": 0.8184, + "step": 13099 + }, + { + "epoch": 0.7210083108591557, + "grad_norm": 0.6973691582679749, + "learning_rate": 7.146795976336159e-06, + "loss": 0.7815, + "step": 13100 + }, + { + "epoch": 0.7210633496615113, + "grad_norm": 0.7018479704856873, + "learning_rate": 7.146404489911291e-06, + "loss": 0.7305, + "step": 13101 + }, + { + "epoch": 0.721118388463867, + "grad_norm": 0.6903830766677856, + "learning_rate": 7.14601298735462e-06, + "loss": 0.7074, + "step": 13102 + }, + { + "epoch": 0.7211734272662227, + "grad_norm": 0.7612621188163757, + "learning_rate": 7.145621468669089e-06, + "loss": 0.8189, + "step": 13103 + }, + { + "epoch": 0.7212284660685784, + "grad_norm": 0.7256856560707092, + "learning_rate": 7.145229933857643e-06, + "loss": 0.5959, + "step": 13104 + }, + { + "epoch": 0.721283504870934, + "grad_norm": 0.6632323265075684, + "learning_rate": 7.1448383829232205e-06, + "loss": 0.7519, + "step": 13105 + }, + { + "epoch": 0.7213385436732896, + "grad_norm": 0.6320651769638062, + "learning_rate": 7.144446815868768e-06, + "loss": 0.7259, + "step": 13106 + }, + { + "epoch": 0.7213935824756453, + "grad_norm": 0.6883212924003601, + "learning_rate": 7.144055232697227e-06, + "loss": 0.7776, + "step": 13107 + }, + { + "epoch": 0.721448621278001, + "grad_norm": 0.7159759402275085, + "learning_rate": 7.1436636334115415e-06, + "loss": 0.6915, + "step": 13108 + }, + { + "epoch": 0.7215036600803566, + "grad_norm": 0.7108080983161926, + "learning_rate": 7.1432720180146535e-06, + "loss": 0.731, + "step": 13109 + }, + { + "epoch": 0.7215586988827123, + "grad_norm": 0.7765033841133118, + "learning_rate": 7.142880386509506e-06, + "loss": 0.6965, + "step": 13110 + }, + { + "epoch": 0.721613737685068, + "grad_norm": 0.7205119132995605, + "learning_rate": 7.142488738899045e-06, + "loss": 0.7262, + "step": 13111 + }, + { + "epoch": 0.7216687764874237, + "grad_norm": 0.6786921620368958, + "learning_rate": 7.142097075186212e-06, + "loss": 0.805, + "step": 13112 + }, + { + "epoch": 0.7217238152897792, + "grad_norm": 0.7947409152984619, + "learning_rate": 7.141705395373949e-06, + "loss": 0.7701, + "step": 13113 + }, + { + "epoch": 0.7217788540921349, + "grad_norm": 0.6672971844673157, + "learning_rate": 7.141313699465204e-06, + "loss": 0.7325, + "step": 13114 + }, + { + "epoch": 0.7218338928944906, + "grad_norm": 0.641765296459198, + "learning_rate": 7.140921987462916e-06, + "loss": 0.7902, + "step": 13115 + }, + { + "epoch": 0.7218889316968463, + "grad_norm": 0.6675699353218079, + "learning_rate": 7.140530259370032e-06, + "loss": 0.7422, + "step": 13116 + }, + { + "epoch": 0.7219439704992019, + "grad_norm": 0.6940729022026062, + "learning_rate": 7.140138515189495e-06, + "loss": 0.6978, + "step": 13117 + }, + { + "epoch": 0.7219990093015576, + "grad_norm": 0.6805779337882996, + "learning_rate": 7.1397467549242514e-06, + "loss": 0.7498, + "step": 13118 + }, + { + "epoch": 0.7220540481039133, + "grad_norm": 0.6231662631034851, + "learning_rate": 7.139354978577243e-06, + "loss": 0.7344, + "step": 13119 + }, + { + "epoch": 0.722109086906269, + "grad_norm": 0.6883575916290283, + "learning_rate": 7.138963186151416e-06, + "loss": 0.835, + "step": 13120 + }, + { + "epoch": 0.7221641257086245, + "grad_norm": 0.6902666687965393, + "learning_rate": 7.138571377649712e-06, + "loss": 0.7427, + "step": 13121 + }, + { + "epoch": 0.7222191645109802, + "grad_norm": 0.7156440019607544, + "learning_rate": 7.1381795530750805e-06, + "loss": 0.7661, + "step": 13122 + }, + { + "epoch": 0.7222742033133359, + "grad_norm": 0.6727150678634644, + "learning_rate": 7.137787712430464e-06, + "loss": 0.7872, + "step": 13123 + }, + { + "epoch": 0.7223292421156916, + "grad_norm": 0.6200405359268188, + "learning_rate": 7.137395855718806e-06, + "loss": 0.6108, + "step": 13124 + }, + { + "epoch": 0.7223842809180472, + "grad_norm": 0.6384756565093994, + "learning_rate": 7.137003982943054e-06, + "loss": 0.698, + "step": 13125 + }, + { + "epoch": 0.7224393197204029, + "grad_norm": 0.7212089896202087, + "learning_rate": 7.1366120941061515e-06, + "loss": 0.7679, + "step": 13126 + }, + { + "epoch": 0.7224943585227586, + "grad_norm": 0.737352192401886, + "learning_rate": 7.136220189211044e-06, + "loss": 0.8173, + "step": 13127 + }, + { + "epoch": 0.7225493973251143, + "grad_norm": 0.6244099736213684, + "learning_rate": 7.135828268260679e-06, + "loss": 0.7224, + "step": 13128 + }, + { + "epoch": 0.7226044361274698, + "grad_norm": 0.8191885948181152, + "learning_rate": 7.135436331257997e-06, + "loss": 0.8122, + "step": 13129 + }, + { + "epoch": 0.7226594749298255, + "grad_norm": 0.7069095373153687, + "learning_rate": 7.135044378205949e-06, + "loss": 0.7844, + "step": 13130 + }, + { + "epoch": 0.7227145137321812, + "grad_norm": 0.6094380021095276, + "learning_rate": 7.13465240910748e-06, + "loss": 0.7093, + "step": 13131 + }, + { + "epoch": 0.7227695525345369, + "grad_norm": 0.7075843811035156, + "learning_rate": 7.134260423965534e-06, + "loss": 0.8109, + "step": 13132 + }, + { + "epoch": 0.7228245913368925, + "grad_norm": 0.6684398651123047, + "learning_rate": 7.133868422783057e-06, + "loss": 0.7224, + "step": 13133 + }, + { + "epoch": 0.7228796301392482, + "grad_norm": 0.6574007272720337, + "learning_rate": 7.133476405562998e-06, + "loss": 0.6763, + "step": 13134 + }, + { + "epoch": 0.7229346689416039, + "grad_norm": 0.7124022841453552, + "learning_rate": 7.133084372308301e-06, + "loss": 0.8047, + "step": 13135 + }, + { + "epoch": 0.7229897077439595, + "grad_norm": 0.7035976648330688, + "learning_rate": 7.1326923230219124e-06, + "loss": 0.7544, + "step": 13136 + }, + { + "epoch": 0.7230447465463151, + "grad_norm": 0.7007604241371155, + "learning_rate": 7.132300257706779e-06, + "loss": 0.7584, + "step": 13137 + }, + { + "epoch": 0.7230997853486708, + "grad_norm": 0.6917324066162109, + "learning_rate": 7.131908176365848e-06, + "loss": 0.6846, + "step": 13138 + }, + { + "epoch": 0.7231548241510265, + "grad_norm": 0.6857448816299438, + "learning_rate": 7.1315160790020666e-06, + "loss": 0.8142, + "step": 13139 + }, + { + "epoch": 0.7232098629533822, + "grad_norm": 0.8381820321083069, + "learning_rate": 7.13112396561838e-06, + "loss": 0.8132, + "step": 13140 + }, + { + "epoch": 0.7232649017557378, + "grad_norm": 0.7024879455566406, + "learning_rate": 7.130731836217735e-06, + "loss": 0.7157, + "step": 13141 + }, + { + "epoch": 0.7233199405580935, + "grad_norm": 0.7313332557678223, + "learning_rate": 7.130339690803081e-06, + "loss": 0.7623, + "step": 13142 + }, + { + "epoch": 0.7233749793604491, + "grad_norm": 0.697536051273346, + "learning_rate": 7.129947529377364e-06, + "loss": 0.7202, + "step": 13143 + }, + { + "epoch": 0.7234300181628047, + "grad_norm": 0.6946722865104675, + "learning_rate": 7.129555351943533e-06, + "loss": 0.7862, + "step": 13144 + }, + { + "epoch": 0.7234850569651604, + "grad_norm": 0.6643924117088318, + "learning_rate": 7.129163158504532e-06, + "loss": 0.7055, + "step": 13145 + }, + { + "epoch": 0.7235400957675161, + "grad_norm": 0.7285693287849426, + "learning_rate": 7.1287709490633104e-06, + "loss": 0.6815, + "step": 13146 + }, + { + "epoch": 0.7235951345698718, + "grad_norm": 1.2701799869537354, + "learning_rate": 7.128378723622818e-06, + "loss": 0.8596, + "step": 13147 + }, + { + "epoch": 0.7236501733722274, + "grad_norm": 0.7067306041717529, + "learning_rate": 7.127986482186e-06, + "loss": 0.7077, + "step": 13148 + }, + { + "epoch": 0.7237052121745831, + "grad_norm": 0.8863486051559448, + "learning_rate": 7.127594224755805e-06, + "loss": 0.8961, + "step": 13149 + }, + { + "epoch": 0.7237602509769387, + "grad_norm": 0.7286190986633301, + "learning_rate": 7.127201951335182e-06, + "loss": 0.7941, + "step": 13150 + }, + { + "epoch": 0.7238152897792944, + "grad_norm": 0.8756779432296753, + "learning_rate": 7.126809661927079e-06, + "loss": 0.7862, + "step": 13151 + }, + { + "epoch": 0.72387032858165, + "grad_norm": 0.7780876755714417, + "learning_rate": 7.126417356534443e-06, + "loss": 0.7095, + "step": 13152 + }, + { + "epoch": 0.7239253673840057, + "grad_norm": 0.6332812905311584, + "learning_rate": 7.1260250351602225e-06, + "loss": 0.7057, + "step": 13153 + }, + { + "epoch": 0.7239804061863614, + "grad_norm": 0.8350435495376587, + "learning_rate": 7.125632697807368e-06, + "loss": 0.7695, + "step": 13154 + }, + { + "epoch": 0.7240354449887171, + "grad_norm": 0.8306411504745483, + "learning_rate": 7.125240344478827e-06, + "loss": 0.6605, + "step": 13155 + }, + { + "epoch": 0.7240904837910727, + "grad_norm": 0.7495117783546448, + "learning_rate": 7.124847975177548e-06, + "loss": 0.8078, + "step": 13156 + }, + { + "epoch": 0.7241455225934283, + "grad_norm": 0.6481010317802429, + "learning_rate": 7.12445558990648e-06, + "loss": 0.8094, + "step": 13157 + }, + { + "epoch": 0.724200561395784, + "grad_norm": 0.7742613554000854, + "learning_rate": 7.124063188668573e-06, + "loss": 0.78, + "step": 13158 + }, + { + "epoch": 0.7242556001981397, + "grad_norm": 0.8394206762313843, + "learning_rate": 7.123670771466776e-06, + "loss": 0.8983, + "step": 13159 + }, + { + "epoch": 0.7243106390004953, + "grad_norm": 0.7196840047836304, + "learning_rate": 7.123278338304038e-06, + "loss": 0.7203, + "step": 13160 + }, + { + "epoch": 0.724365677802851, + "grad_norm": 0.5964440107345581, + "learning_rate": 7.122885889183309e-06, + "loss": 0.6251, + "step": 13161 + }, + { + "epoch": 0.7244207166052067, + "grad_norm": 0.7394048571586609, + "learning_rate": 7.1224934241075375e-06, + "loss": 0.7755, + "step": 13162 + }, + { + "epoch": 0.7244757554075624, + "grad_norm": 0.6427145004272461, + "learning_rate": 7.1221009430796724e-06, + "loss": 0.74, + "step": 13163 + }, + { + "epoch": 0.724530794209918, + "grad_norm": 0.7084387540817261, + "learning_rate": 7.121708446102667e-06, + "loss": 0.7464, + "step": 13164 + }, + { + "epoch": 0.7245858330122736, + "grad_norm": 0.6623230576515198, + "learning_rate": 7.121315933179466e-06, + "loss": 0.7237, + "step": 13165 + }, + { + "epoch": 0.7246408718146293, + "grad_norm": 0.9234243631362915, + "learning_rate": 7.120923404313024e-06, + "loss": 0.8238, + "step": 13166 + }, + { + "epoch": 0.724695910616985, + "grad_norm": 0.6458896994590759, + "learning_rate": 7.120530859506289e-06, + "loss": 0.8105, + "step": 13167 + }, + { + "epoch": 0.7247509494193406, + "grad_norm": 0.7160854935646057, + "learning_rate": 7.1201382987622115e-06, + "loss": 0.7954, + "step": 13168 + }, + { + "epoch": 0.7248059882216963, + "grad_norm": 0.6896069645881653, + "learning_rate": 7.119745722083742e-06, + "loss": 0.7281, + "step": 13169 + }, + { + "epoch": 0.724861027024052, + "grad_norm": 0.6609574556350708, + "learning_rate": 7.119353129473831e-06, + "loss": 0.7682, + "step": 13170 + }, + { + "epoch": 0.7249160658264077, + "grad_norm": 0.6477035880088806, + "learning_rate": 7.118960520935429e-06, + "loss": 0.8183, + "step": 13171 + }, + { + "epoch": 0.7249711046287632, + "grad_norm": 1.4488556385040283, + "learning_rate": 7.1185678964714885e-06, + "loss": 0.8321, + "step": 13172 + }, + { + "epoch": 0.7250261434311189, + "grad_norm": 0.8502382040023804, + "learning_rate": 7.118175256084958e-06, + "loss": 0.7881, + "step": 13173 + }, + { + "epoch": 0.7250811822334746, + "grad_norm": 0.6969912648200989, + "learning_rate": 7.117782599778788e-06, + "loss": 0.7598, + "step": 13174 + }, + { + "epoch": 0.7251362210358303, + "grad_norm": 0.7254889011383057, + "learning_rate": 7.117389927555933e-06, + "loss": 0.8473, + "step": 13175 + }, + { + "epoch": 0.7251912598381859, + "grad_norm": 0.9958444237709045, + "learning_rate": 7.116997239419341e-06, + "loss": 0.7558, + "step": 13176 + }, + { + "epoch": 0.7252462986405416, + "grad_norm": 0.6694881916046143, + "learning_rate": 7.116604535371963e-06, + "loss": 0.7072, + "step": 13177 + }, + { + "epoch": 0.7253013374428973, + "grad_norm": 1.0730634927749634, + "learning_rate": 7.116211815416754e-06, + "loss": 0.7607, + "step": 13178 + }, + { + "epoch": 0.725356376245253, + "grad_norm": 0.6770226359367371, + "learning_rate": 7.115819079556663e-06, + "loss": 0.7213, + "step": 13179 + }, + { + "epoch": 0.7254114150476085, + "grad_norm": 0.866215705871582, + "learning_rate": 7.115426327794642e-06, + "loss": 0.7273, + "step": 13180 + }, + { + "epoch": 0.7254664538499642, + "grad_norm": 0.7303730845451355, + "learning_rate": 7.115033560133642e-06, + "loss": 0.764, + "step": 13181 + }, + { + "epoch": 0.7255214926523199, + "grad_norm": 0.6900389194488525, + "learning_rate": 7.114640776576617e-06, + "loss": 0.6958, + "step": 13182 + }, + { + "epoch": 0.7255765314546756, + "grad_norm": 0.7255710959434509, + "learning_rate": 7.114247977126518e-06, + "loss": 0.6507, + "step": 13183 + }, + { + "epoch": 0.7256315702570312, + "grad_norm": 0.6848479509353638, + "learning_rate": 7.113855161786297e-06, + "loss": 0.6848, + "step": 13184 + }, + { + "epoch": 0.7256866090593869, + "grad_norm": 0.6800528764724731, + "learning_rate": 7.113462330558907e-06, + "loss": 0.7354, + "step": 13185 + }, + { + "epoch": 0.7257416478617426, + "grad_norm": 0.7271339297294617, + "learning_rate": 7.113069483447299e-06, + "loss": 0.7695, + "step": 13186 + }, + { + "epoch": 0.7257966866640981, + "grad_norm": 0.8212381601333618, + "learning_rate": 7.112676620454427e-06, + "loss": 0.7348, + "step": 13187 + }, + { + "epoch": 0.7258517254664538, + "grad_norm": 0.6714771389961243, + "learning_rate": 7.112283741583242e-06, + "loss": 0.75, + "step": 13188 + }, + { + "epoch": 0.7259067642688095, + "grad_norm": 0.7834941148757935, + "learning_rate": 7.111890846836699e-06, + "loss": 0.6914, + "step": 13189 + }, + { + "epoch": 0.7259618030711652, + "grad_norm": 0.8107824325561523, + "learning_rate": 7.111497936217748e-06, + "loss": 0.803, + "step": 13190 + }, + { + "epoch": 0.7260168418735208, + "grad_norm": 0.6306549906730652, + "learning_rate": 7.1111050097293464e-06, + "loss": 0.7915, + "step": 13191 + }, + { + "epoch": 0.7260718806758765, + "grad_norm": 0.7030252814292908, + "learning_rate": 7.110712067374444e-06, + "loss": 0.7091, + "step": 13192 + }, + { + "epoch": 0.7261269194782322, + "grad_norm": 0.7625641226768494, + "learning_rate": 7.110319109155992e-06, + "loss": 0.774, + "step": 13193 + }, + { + "epoch": 0.7261819582805878, + "grad_norm": 0.6382628083229065, + "learning_rate": 7.109926135076949e-06, + "loss": 0.6774, + "step": 13194 + }, + { + "epoch": 0.7262369970829434, + "grad_norm": 0.6594563722610474, + "learning_rate": 7.109533145140265e-06, + "loss": 0.7977, + "step": 13195 + }, + { + "epoch": 0.7262920358852991, + "grad_norm": 0.7177248001098633, + "learning_rate": 7.109140139348895e-06, + "loss": 0.6771, + "step": 13196 + }, + { + "epoch": 0.7263470746876548, + "grad_norm": 0.6631305813789368, + "learning_rate": 7.108747117705792e-06, + "loss": 0.6877, + "step": 13197 + }, + { + "epoch": 0.7264021134900105, + "grad_norm": 0.6783736944198608, + "learning_rate": 7.10835408021391e-06, + "loss": 0.8048, + "step": 13198 + }, + { + "epoch": 0.7264571522923661, + "grad_norm": 0.7368303537368774, + "learning_rate": 7.107961026876204e-06, + "loss": 0.7962, + "step": 13199 + }, + { + "epoch": 0.7265121910947218, + "grad_norm": 0.7697044014930725, + "learning_rate": 7.107567957695627e-06, + "loss": 0.769, + "step": 13200 + }, + { + "epoch": 0.7265672298970774, + "grad_norm": 0.639934241771698, + "learning_rate": 7.1071748726751325e-06, + "loss": 0.722, + "step": 13201 + }, + { + "epoch": 0.7266222686994331, + "grad_norm": 0.8410669565200806, + "learning_rate": 7.106781771817676e-06, + "loss": 0.8861, + "step": 13202 + }, + { + "epoch": 0.7266773075017887, + "grad_norm": 0.654924213886261, + "learning_rate": 7.106388655126212e-06, + "loss": 0.7463, + "step": 13203 + }, + { + "epoch": 0.7267323463041444, + "grad_norm": 0.719714879989624, + "learning_rate": 7.105995522603695e-06, + "loss": 0.759, + "step": 13204 + }, + { + "epoch": 0.7267873851065001, + "grad_norm": 0.7019139528274536, + "learning_rate": 7.105602374253078e-06, + "loss": 0.7965, + "step": 13205 + }, + { + "epoch": 0.7268424239088558, + "grad_norm": 0.7289487719535828, + "learning_rate": 7.105209210077318e-06, + "loss": 0.8591, + "step": 13206 + }, + { + "epoch": 0.7268974627112114, + "grad_norm": 0.670274019241333, + "learning_rate": 7.104816030079369e-06, + "loss": 0.7707, + "step": 13207 + }, + { + "epoch": 0.726952501513567, + "grad_norm": 0.7156813740730286, + "learning_rate": 7.104422834262187e-06, + "loss": 0.7724, + "step": 13208 + }, + { + "epoch": 0.7270075403159227, + "grad_norm": 0.6776198148727417, + "learning_rate": 7.104029622628726e-06, + "loss": 0.7331, + "step": 13209 + }, + { + "epoch": 0.7270625791182784, + "grad_norm": 0.8008358478546143, + "learning_rate": 7.103636395181941e-06, + "loss": 0.8279, + "step": 13210 + }, + { + "epoch": 0.727117617920634, + "grad_norm": 0.6622886061668396, + "learning_rate": 7.1032431519247876e-06, + "loss": 0.6646, + "step": 13211 + }, + { + "epoch": 0.7271726567229897, + "grad_norm": 0.6834877729415894, + "learning_rate": 7.102849892860223e-06, + "loss": 0.75, + "step": 13212 + }, + { + "epoch": 0.7272276955253454, + "grad_norm": 0.7659596800804138, + "learning_rate": 7.1024566179912e-06, + "loss": 0.6999, + "step": 13213 + }, + { + "epoch": 0.7272827343277011, + "grad_norm": 0.7368002533912659, + "learning_rate": 7.102063327320677e-06, + "loss": 0.7376, + "step": 13214 + }, + { + "epoch": 0.7273377731300567, + "grad_norm": 0.7286058664321899, + "learning_rate": 7.101670020851609e-06, + "loss": 0.8139, + "step": 13215 + }, + { + "epoch": 0.7273928119324123, + "grad_norm": 1.0521546602249146, + "learning_rate": 7.101276698586951e-06, + "loss": 0.8545, + "step": 13216 + }, + { + "epoch": 0.727447850734768, + "grad_norm": 0.6940305233001709, + "learning_rate": 7.100883360529659e-06, + "loss": 0.7534, + "step": 13217 + }, + { + "epoch": 0.7275028895371237, + "grad_norm": 0.8279024362564087, + "learning_rate": 7.100490006682691e-06, + "loss": 0.852, + "step": 13218 + }, + { + "epoch": 0.7275579283394793, + "grad_norm": 0.63093501329422, + "learning_rate": 7.100096637049002e-06, + "loss": 0.6728, + "step": 13219 + }, + { + "epoch": 0.727612967141835, + "grad_norm": 0.7576018571853638, + "learning_rate": 7.099703251631549e-06, + "loss": 0.6343, + "step": 13220 + }, + { + "epoch": 0.7276680059441907, + "grad_norm": 0.9493140578269958, + "learning_rate": 7.0993098504332894e-06, + "loss": 0.82, + "step": 13221 + }, + { + "epoch": 0.7277230447465464, + "grad_norm": 0.7279804944992065, + "learning_rate": 7.098916433457177e-06, + "loss": 0.8149, + "step": 13222 + }, + { + "epoch": 0.7277780835489019, + "grad_norm": 0.7660531401634216, + "learning_rate": 7.0985230007061725e-06, + "loss": 0.8278, + "step": 13223 + }, + { + "epoch": 0.7278331223512576, + "grad_norm": 0.6468318104743958, + "learning_rate": 7.09812955218323e-06, + "loss": 0.7193, + "step": 13224 + }, + { + "epoch": 0.7278881611536133, + "grad_norm": 0.6389151811599731, + "learning_rate": 7.097736087891306e-06, + "loss": 0.6744, + "step": 13225 + }, + { + "epoch": 0.727943199955969, + "grad_norm": 0.6565649509429932, + "learning_rate": 7.097342607833361e-06, + "loss": 0.7586, + "step": 13226 + }, + { + "epoch": 0.7279982387583246, + "grad_norm": 0.6867381930351257, + "learning_rate": 7.09694911201235e-06, + "loss": 0.684, + "step": 13227 + }, + { + "epoch": 0.7280532775606803, + "grad_norm": 0.7509286403656006, + "learning_rate": 7.096555600431229e-06, + "loss": 0.8242, + "step": 13228 + }, + { + "epoch": 0.728108316363036, + "grad_norm": 0.6997731328010559, + "learning_rate": 7.096162073092959e-06, + "loss": 0.8182, + "step": 13229 + }, + { + "epoch": 0.7281633551653915, + "grad_norm": 0.6698907017707825, + "learning_rate": 7.095768530000496e-06, + "loss": 0.7752, + "step": 13230 + }, + { + "epoch": 0.7282183939677472, + "grad_norm": 0.7219094634056091, + "learning_rate": 7.095374971156799e-06, + "loss": 0.792, + "step": 13231 + }, + { + "epoch": 0.7282734327701029, + "grad_norm": 0.6479744911193848, + "learning_rate": 7.094981396564822e-06, + "loss": 0.7556, + "step": 13232 + }, + { + "epoch": 0.7283284715724586, + "grad_norm": 0.6795497536659241, + "learning_rate": 7.094587806227527e-06, + "loss": 0.7611, + "step": 13233 + }, + { + "epoch": 0.7283835103748142, + "grad_norm": 0.7145074605941772, + "learning_rate": 7.094194200147871e-06, + "loss": 0.8064, + "step": 13234 + }, + { + "epoch": 0.7284385491771699, + "grad_norm": 0.6750605702400208, + "learning_rate": 7.093800578328811e-06, + "loss": 0.7054, + "step": 13235 + }, + { + "epoch": 0.7284935879795256, + "grad_norm": 0.7574751377105713, + "learning_rate": 7.093406940773307e-06, + "loss": 0.7878, + "step": 13236 + }, + { + "epoch": 0.7285486267818813, + "grad_norm": 0.7836418747901917, + "learning_rate": 7.093013287484316e-06, + "loss": 0.7445, + "step": 13237 + }, + { + "epoch": 0.7286036655842368, + "grad_norm": 0.7658870220184326, + "learning_rate": 7.092619618464799e-06, + "loss": 0.7513, + "step": 13238 + }, + { + "epoch": 0.7286587043865925, + "grad_norm": 1.1127573251724243, + "learning_rate": 7.092225933717711e-06, + "loss": 0.7601, + "step": 13239 + }, + { + "epoch": 0.7287137431889482, + "grad_norm": 0.7003853917121887, + "learning_rate": 7.091832233246015e-06, + "loss": 0.8533, + "step": 13240 + }, + { + "epoch": 0.7287687819913039, + "grad_norm": 0.6513979434967041, + "learning_rate": 7.091438517052667e-06, + "loss": 0.7285, + "step": 13241 + }, + { + "epoch": 0.7288238207936595, + "grad_norm": 0.7072234153747559, + "learning_rate": 7.091044785140626e-06, + "loss": 0.7741, + "step": 13242 + }, + { + "epoch": 0.7288788595960152, + "grad_norm": 0.8117190599441528, + "learning_rate": 7.090651037512854e-06, + "loss": 0.6851, + "step": 13243 + }, + { + "epoch": 0.7289338983983709, + "grad_norm": 0.6876427531242371, + "learning_rate": 7.090257274172306e-06, + "loss": 0.7162, + "step": 13244 + }, + { + "epoch": 0.7289889372007266, + "grad_norm": 0.7128324508666992, + "learning_rate": 7.0898634951219455e-06, + "loss": 0.7302, + "step": 13245 + }, + { + "epoch": 0.7290439760030821, + "grad_norm": 0.6918201446533203, + "learning_rate": 7.089469700364731e-06, + "loss": 0.8582, + "step": 13246 + }, + { + "epoch": 0.7290990148054378, + "grad_norm": 0.6172242164611816, + "learning_rate": 7.08907588990362e-06, + "loss": 0.6846, + "step": 13247 + }, + { + "epoch": 0.7291540536077935, + "grad_norm": 0.6799596548080444, + "learning_rate": 7.088682063741575e-06, + "loss": 0.7174, + "step": 13248 + }, + { + "epoch": 0.7292090924101492, + "grad_norm": 0.6663293838500977, + "learning_rate": 7.088288221881554e-06, + "loss": 0.7237, + "step": 13249 + }, + { + "epoch": 0.7292641312125048, + "grad_norm": 0.6758549213409424, + "learning_rate": 7.0878943643265175e-06, + "loss": 0.7912, + "step": 13250 + }, + { + "epoch": 0.7293191700148605, + "grad_norm": 0.6937153339385986, + "learning_rate": 7.087500491079427e-06, + "loss": 0.742, + "step": 13251 + }, + { + "epoch": 0.7293742088172162, + "grad_norm": 0.6441238522529602, + "learning_rate": 7.087106602143241e-06, + "loss": 0.7676, + "step": 13252 + }, + { + "epoch": 0.7294292476195718, + "grad_norm": 0.6615588068962097, + "learning_rate": 7.08671269752092e-06, + "loss": 0.7069, + "step": 13253 + }, + { + "epoch": 0.7294842864219274, + "grad_norm": 0.8052160739898682, + "learning_rate": 7.086318777215424e-06, + "loss": 0.811, + "step": 13254 + }, + { + "epoch": 0.7295393252242831, + "grad_norm": 0.7293280363082886, + "learning_rate": 7.085924841229716e-06, + "loss": 0.7127, + "step": 13255 + }, + { + "epoch": 0.7295943640266388, + "grad_norm": 0.7104617953300476, + "learning_rate": 7.085530889566756e-06, + "loss": 0.716, + "step": 13256 + }, + { + "epoch": 0.7296494028289945, + "grad_norm": 0.72947758436203, + "learning_rate": 7.085136922229503e-06, + "loss": 0.8144, + "step": 13257 + }, + { + "epoch": 0.7297044416313501, + "grad_norm": 0.7993913292884827, + "learning_rate": 7.08474293922092e-06, + "loss": 0.7609, + "step": 13258 + }, + { + "epoch": 0.7297594804337058, + "grad_norm": 0.7810680270195007, + "learning_rate": 7.0843489405439656e-06, + "loss": 0.8107, + "step": 13259 + }, + { + "epoch": 0.7298145192360614, + "grad_norm": 0.6383776664733887, + "learning_rate": 7.083954926201604e-06, + "loss": 0.7842, + "step": 13260 + }, + { + "epoch": 0.7298695580384171, + "grad_norm": 0.7653967142105103, + "learning_rate": 7.083560896196795e-06, + "loss": 0.729, + "step": 13261 + }, + { + "epoch": 0.7299245968407727, + "grad_norm": 0.6693821549415588, + "learning_rate": 7.083166850532498e-06, + "loss": 0.6901, + "step": 13262 + }, + { + "epoch": 0.7299796356431284, + "grad_norm": 0.7408621907234192, + "learning_rate": 7.082772789211678e-06, + "loss": 0.7415, + "step": 13263 + }, + { + "epoch": 0.7300346744454841, + "grad_norm": 0.6693123579025269, + "learning_rate": 7.082378712237295e-06, + "loss": 0.8102, + "step": 13264 + }, + { + "epoch": 0.7300897132478398, + "grad_norm": 0.6572727560997009, + "learning_rate": 7.081984619612311e-06, + "loss": 0.6595, + "step": 13265 + }, + { + "epoch": 0.7301447520501954, + "grad_norm": 0.7934693694114685, + "learning_rate": 7.081590511339687e-06, + "loss": 0.8024, + "step": 13266 + }, + { + "epoch": 0.730199790852551, + "grad_norm": 1.0663061141967773, + "learning_rate": 7.081196387422388e-06, + "loss": 0.7844, + "step": 13267 + }, + { + "epoch": 0.7302548296549067, + "grad_norm": 0.8005035519599915, + "learning_rate": 7.080802247863372e-06, + "loss": 0.751, + "step": 13268 + }, + { + "epoch": 0.7303098684572624, + "grad_norm": 0.6480177044868469, + "learning_rate": 7.0804080926656046e-06, + "loss": 0.7745, + "step": 13269 + }, + { + "epoch": 0.730364907259618, + "grad_norm": 0.7026820182800293, + "learning_rate": 7.080013921832047e-06, + "loss": 0.7545, + "step": 13270 + }, + { + "epoch": 0.7304199460619737, + "grad_norm": 0.673954427242279, + "learning_rate": 7.079619735365662e-06, + "loss": 0.7142, + "step": 13271 + }, + { + "epoch": 0.7304749848643294, + "grad_norm": 0.7296637296676636, + "learning_rate": 7.079225533269411e-06, + "loss": 0.8493, + "step": 13272 + }, + { + "epoch": 0.730530023666685, + "grad_norm": 0.7147308588027954, + "learning_rate": 7.0788313155462576e-06, + "loss": 0.7638, + "step": 13273 + }, + { + "epoch": 0.7305850624690406, + "grad_norm": 0.7531922459602356, + "learning_rate": 7.078437082199163e-06, + "loss": 0.8644, + "step": 13274 + }, + { + "epoch": 0.7306401012713963, + "grad_norm": 0.6581404805183411, + "learning_rate": 7.078042833231092e-06, + "loss": 0.7555, + "step": 13275 + }, + { + "epoch": 0.730695140073752, + "grad_norm": 0.6781187057495117, + "learning_rate": 7.0776485686450095e-06, + "loss": 0.7536, + "step": 13276 + }, + { + "epoch": 0.7307501788761076, + "grad_norm": 0.7164949774742126, + "learning_rate": 7.077254288443874e-06, + "loss": 0.7275, + "step": 13277 + }, + { + "epoch": 0.7308052176784633, + "grad_norm": 0.8158305287361145, + "learning_rate": 7.076859992630652e-06, + "loss": 0.6821, + "step": 13278 + }, + { + "epoch": 0.730860256480819, + "grad_norm": 0.7101448178291321, + "learning_rate": 7.076465681208307e-06, + "loss": 0.69, + "step": 13279 + }, + { + "epoch": 0.7309152952831747, + "grad_norm": 0.6844518780708313, + "learning_rate": 7.076071354179802e-06, + "loss": 0.7577, + "step": 13280 + }, + { + "epoch": 0.7309703340855302, + "grad_norm": 0.6564158797264099, + "learning_rate": 7.0756770115481e-06, + "loss": 0.6752, + "step": 13281 + }, + { + "epoch": 0.7310253728878859, + "grad_norm": 0.7444283962249756, + "learning_rate": 7.0752826533161655e-06, + "loss": 0.8118, + "step": 13282 + }, + { + "epoch": 0.7310804116902416, + "grad_norm": 0.7657533884048462, + "learning_rate": 7.074888279486962e-06, + "loss": 0.8819, + "step": 13283 + }, + { + "epoch": 0.7311354504925973, + "grad_norm": 0.6924453973770142, + "learning_rate": 7.074493890063453e-06, + "loss": 0.7674, + "step": 13284 + }, + { + "epoch": 0.7311904892949529, + "grad_norm": 0.676188588142395, + "learning_rate": 7.074099485048603e-06, + "loss": 0.7266, + "step": 13285 + }, + { + "epoch": 0.7312455280973086, + "grad_norm": 0.6325914263725281, + "learning_rate": 7.073705064445378e-06, + "loss": 0.6856, + "step": 13286 + }, + { + "epoch": 0.7313005668996643, + "grad_norm": 0.662558913230896, + "learning_rate": 7.073310628256739e-06, + "loss": 0.751, + "step": 13287 + }, + { + "epoch": 0.73135560570202, + "grad_norm": 0.8313137292861938, + "learning_rate": 7.072916176485654e-06, + "loss": 0.7187, + "step": 13288 + }, + { + "epoch": 0.7314106445043755, + "grad_norm": 0.7033550143241882, + "learning_rate": 7.072521709135084e-06, + "loss": 0.8132, + "step": 13289 + }, + { + "epoch": 0.7314656833067312, + "grad_norm": 0.715242862701416, + "learning_rate": 7.0721272262079965e-06, + "loss": 0.8551, + "step": 13290 + }, + { + "epoch": 0.7315207221090869, + "grad_norm": 0.7545164227485657, + "learning_rate": 7.071732727707356e-06, + "loss": 0.7772, + "step": 13291 + }, + { + "epoch": 0.7315757609114426, + "grad_norm": 0.7181825637817383, + "learning_rate": 7.071338213636126e-06, + "loss": 0.7378, + "step": 13292 + }, + { + "epoch": 0.7316307997137982, + "grad_norm": 0.7793779969215393, + "learning_rate": 7.070943683997273e-06, + "loss": 0.7801, + "step": 13293 + }, + { + "epoch": 0.7316858385161539, + "grad_norm": 0.7456476092338562, + "learning_rate": 7.070549138793762e-06, + "loss": 0.8038, + "step": 13294 + }, + { + "epoch": 0.7317408773185096, + "grad_norm": 0.652519702911377, + "learning_rate": 7.0701545780285576e-06, + "loss": 0.746, + "step": 13295 + }, + { + "epoch": 0.7317959161208653, + "grad_norm": 0.784450888633728, + "learning_rate": 7.069760001704625e-06, + "loss": 0.8065, + "step": 13296 + }, + { + "epoch": 0.7318509549232208, + "grad_norm": 0.8052587509155273, + "learning_rate": 7.069365409824931e-06, + "loss": 0.8098, + "step": 13297 + }, + { + "epoch": 0.7319059937255765, + "grad_norm": 0.6890794038772583, + "learning_rate": 7.06897080239244e-06, + "loss": 0.783, + "step": 13298 + }, + { + "epoch": 0.7319610325279322, + "grad_norm": 0.7470653057098389, + "learning_rate": 7.068576179410119e-06, + "loss": 0.7658, + "step": 13299 + }, + { + "epoch": 0.7320160713302879, + "grad_norm": 0.6831437945365906, + "learning_rate": 7.068181540880932e-06, + "loss": 0.7864, + "step": 13300 + }, + { + "epoch": 0.7320711101326435, + "grad_norm": 0.7058265209197998, + "learning_rate": 7.067786886807847e-06, + "loss": 0.8254, + "step": 13301 + }, + { + "epoch": 0.7321261489349992, + "grad_norm": 0.7938248515129089, + "learning_rate": 7.067392217193828e-06, + "loss": 0.7291, + "step": 13302 + }, + { + "epoch": 0.7321811877373549, + "grad_norm": 0.7261865735054016, + "learning_rate": 7.066997532041844e-06, + "loss": 0.8115, + "step": 13303 + }, + { + "epoch": 0.7322362265397105, + "grad_norm": 0.6971743702888489, + "learning_rate": 7.0666028313548586e-06, + "loss": 0.7504, + "step": 13304 + }, + { + "epoch": 0.7322912653420661, + "grad_norm": 0.844879150390625, + "learning_rate": 7.0662081151358405e-06, + "loss": 0.7903, + "step": 13305 + }, + { + "epoch": 0.7323463041444218, + "grad_norm": 0.6670572757720947, + "learning_rate": 7.065813383387755e-06, + "loss": 0.7597, + "step": 13306 + }, + { + "epoch": 0.7324013429467775, + "grad_norm": 0.669711172580719, + "learning_rate": 7.06541863611357e-06, + "loss": 0.7179, + "step": 13307 + }, + { + "epoch": 0.7324563817491332, + "grad_norm": 0.7176600098609924, + "learning_rate": 7.0650238733162506e-06, + "loss": 0.8157, + "step": 13308 + }, + { + "epoch": 0.7325114205514888, + "grad_norm": 0.7230100631713867, + "learning_rate": 7.064629094998765e-06, + "loss": 0.7902, + "step": 13309 + }, + { + "epoch": 0.7325664593538445, + "grad_norm": 0.8811234831809998, + "learning_rate": 7.064234301164078e-06, + "loss": 0.7746, + "step": 13310 + }, + { + "epoch": 0.7326214981562001, + "grad_norm": 0.6777653098106384, + "learning_rate": 7.06383949181516e-06, + "loss": 0.7708, + "step": 13311 + }, + { + "epoch": 0.7326765369585558, + "grad_norm": 0.6692547798156738, + "learning_rate": 7.063444666954977e-06, + "loss": 0.7103, + "step": 13312 + }, + { + "epoch": 0.7327315757609114, + "grad_norm": 1.2304950952529907, + "learning_rate": 7.063049826586496e-06, + "loss": 0.7878, + "step": 13313 + }, + { + "epoch": 0.7327866145632671, + "grad_norm": 0.7073930501937866, + "learning_rate": 7.0626549707126834e-06, + "loss": 0.7546, + "step": 13314 + }, + { + "epoch": 0.7328416533656228, + "grad_norm": 0.7184866070747375, + "learning_rate": 7.06226009933651e-06, + "loss": 0.7207, + "step": 13315 + }, + { + "epoch": 0.7328966921679784, + "grad_norm": 0.7098046541213989, + "learning_rate": 7.061865212460941e-06, + "loss": 0.6415, + "step": 13316 + }, + { + "epoch": 0.7329517309703341, + "grad_norm": 0.714379608631134, + "learning_rate": 7.0614703100889445e-06, + "loss": 0.7305, + "step": 13317 + }, + { + "epoch": 0.7330067697726897, + "grad_norm": 0.655060887336731, + "learning_rate": 7.061075392223491e-06, + "loss": 0.6125, + "step": 13318 + }, + { + "epoch": 0.7330618085750454, + "grad_norm": 0.6481055617332458, + "learning_rate": 7.060680458867545e-06, + "loss": 0.7059, + "step": 13319 + }, + { + "epoch": 0.733116847377401, + "grad_norm": 0.7123916745185852, + "learning_rate": 7.060285510024076e-06, + "loss": 0.8007, + "step": 13320 + }, + { + "epoch": 0.7331718861797567, + "grad_norm": 0.7231262922286987, + "learning_rate": 7.059890545696053e-06, + "loss": 0.7781, + "step": 13321 + }, + { + "epoch": 0.7332269249821124, + "grad_norm": 0.8415369391441345, + "learning_rate": 7.0594955658864435e-06, + "loss": 0.6649, + "step": 13322 + }, + { + "epoch": 0.7332819637844681, + "grad_norm": 0.7243070006370544, + "learning_rate": 7.059100570598217e-06, + "loss": 0.6588, + "step": 13323 + }, + { + "epoch": 0.7333370025868237, + "grad_norm": 0.6581026315689087, + "learning_rate": 7.058705559834342e-06, + "loss": 0.7938, + "step": 13324 + }, + { + "epoch": 0.7333920413891793, + "grad_norm": 0.6213739514350891, + "learning_rate": 7.058310533597787e-06, + "loss": 0.7092, + "step": 13325 + }, + { + "epoch": 0.733447080191535, + "grad_norm": 0.6857954859733582, + "learning_rate": 7.057915491891522e-06, + "loss": 0.698, + "step": 13326 + }, + { + "epoch": 0.7335021189938907, + "grad_norm": 0.7528544068336487, + "learning_rate": 7.0575204347185135e-06, + "loss": 0.7234, + "step": 13327 + }, + { + "epoch": 0.7335571577962463, + "grad_norm": 0.6449099779129028, + "learning_rate": 7.057125362081733e-06, + "loss": 0.7391, + "step": 13328 + }, + { + "epoch": 0.733612196598602, + "grad_norm": 0.640689492225647, + "learning_rate": 7.0567302739841495e-06, + "loss": 0.5316, + "step": 13329 + }, + { + "epoch": 0.7336672354009577, + "grad_norm": 0.6686868071556091, + "learning_rate": 7.056335170428731e-06, + "loss": 0.7713, + "step": 13330 + }, + { + "epoch": 0.7337222742033134, + "grad_norm": 0.7627772688865662, + "learning_rate": 7.055940051418447e-06, + "loss": 0.7706, + "step": 13331 + }, + { + "epoch": 0.733777313005669, + "grad_norm": 0.7421852350234985, + "learning_rate": 7.055544916956269e-06, + "loss": 0.6418, + "step": 13332 + }, + { + "epoch": 0.7338323518080246, + "grad_norm": 0.7414699196815491, + "learning_rate": 7.0551497670451666e-06, + "loss": 0.811, + "step": 13333 + }, + { + "epoch": 0.7338873906103803, + "grad_norm": 0.7054136991500854, + "learning_rate": 7.0547546016881064e-06, + "loss": 0.8005, + "step": 13334 + }, + { + "epoch": 0.733942429412736, + "grad_norm": 0.670174241065979, + "learning_rate": 7.054359420888062e-06, + "loss": 0.6136, + "step": 13335 + }, + { + "epoch": 0.7339974682150916, + "grad_norm": 0.728255033493042, + "learning_rate": 7.053964224648001e-06, + "loss": 0.848, + "step": 13336 + }, + { + "epoch": 0.7340525070174473, + "grad_norm": 0.729815661907196, + "learning_rate": 7.053569012970896e-06, + "loss": 0.6985, + "step": 13337 + }, + { + "epoch": 0.734107545819803, + "grad_norm": 0.7564244866371155, + "learning_rate": 7.053173785859715e-06, + "loss": 0.7995, + "step": 13338 + }, + { + "epoch": 0.7341625846221587, + "grad_norm": 0.7746061682701111, + "learning_rate": 7.05277854331743e-06, + "loss": 0.7663, + "step": 13339 + }, + { + "epoch": 0.7342176234245142, + "grad_norm": 0.6878651976585388, + "learning_rate": 7.052383285347011e-06, + "loss": 0.8624, + "step": 13340 + }, + { + "epoch": 0.7342726622268699, + "grad_norm": 0.6989734768867493, + "learning_rate": 7.051988011951428e-06, + "loss": 0.7221, + "step": 13341 + }, + { + "epoch": 0.7343277010292256, + "grad_norm": 0.6854223012924194, + "learning_rate": 7.051592723133654e-06, + "loss": 0.7878, + "step": 13342 + }, + { + "epoch": 0.7343827398315813, + "grad_norm": 0.746696949005127, + "learning_rate": 7.051197418896657e-06, + "loss": 0.7074, + "step": 13343 + }, + { + "epoch": 0.7344377786339369, + "grad_norm": 0.6933150887489319, + "learning_rate": 7.050802099243409e-06, + "loss": 0.7587, + "step": 13344 + }, + { + "epoch": 0.7344928174362926, + "grad_norm": 0.7285788655281067, + "learning_rate": 7.050406764176882e-06, + "loss": 0.6589, + "step": 13345 + }, + { + "epoch": 0.7345478562386483, + "grad_norm": 0.6834994554519653, + "learning_rate": 7.050011413700046e-06, + "loss": 0.7196, + "step": 13346 + }, + { + "epoch": 0.734602895041004, + "grad_norm": 0.6504353880882263, + "learning_rate": 7.049616047815873e-06, + "loss": 0.7675, + "step": 13347 + }, + { + "epoch": 0.7346579338433595, + "grad_norm": 0.7009296417236328, + "learning_rate": 7.049220666527335e-06, + "loss": 0.7638, + "step": 13348 + }, + { + "epoch": 0.7347129726457152, + "grad_norm": 0.6210034489631653, + "learning_rate": 7.0488252698374024e-06, + "loss": 0.6872, + "step": 13349 + }, + { + "epoch": 0.7347680114480709, + "grad_norm": 0.6280165910720825, + "learning_rate": 7.0484298577490485e-06, + "loss": 0.7084, + "step": 13350 + }, + { + "epoch": 0.7348230502504266, + "grad_norm": 0.8055418133735657, + "learning_rate": 7.048034430265242e-06, + "loss": 0.8202, + "step": 13351 + }, + { + "epoch": 0.7348780890527822, + "grad_norm": 0.6674166917800903, + "learning_rate": 7.047638987388959e-06, + "loss": 0.6368, + "step": 13352 + }, + { + "epoch": 0.7349331278551379, + "grad_norm": 0.9182783961296082, + "learning_rate": 7.04724352912317e-06, + "loss": 0.6734, + "step": 13353 + }, + { + "epoch": 0.7349881666574936, + "grad_norm": 0.6371243596076965, + "learning_rate": 7.046848055470845e-06, + "loss": 0.7308, + "step": 13354 + }, + { + "epoch": 0.7350432054598492, + "grad_norm": 0.6454519033432007, + "learning_rate": 7.046452566434959e-06, + "loss": 0.6882, + "step": 13355 + }, + { + "epoch": 0.7350982442622048, + "grad_norm": 0.648970365524292, + "learning_rate": 7.046057062018483e-06, + "loss": 0.7247, + "step": 13356 + }, + { + "epoch": 0.7351532830645605, + "grad_norm": 0.668886661529541, + "learning_rate": 7.04566154222439e-06, + "loss": 0.7379, + "step": 13357 + }, + { + "epoch": 0.7352083218669162, + "grad_norm": 0.6593654751777649, + "learning_rate": 7.045266007055651e-06, + "loss": 0.7473, + "step": 13358 + }, + { + "epoch": 0.7352633606692718, + "grad_norm": 0.8418927192687988, + "learning_rate": 7.044870456515241e-06, + "loss": 0.7949, + "step": 13359 + }, + { + "epoch": 0.7353183994716275, + "grad_norm": 0.7350470423698425, + "learning_rate": 7.044474890606132e-06, + "loss": 0.7545, + "step": 13360 + }, + { + "epoch": 0.7353734382739832, + "grad_norm": 0.7786250114440918, + "learning_rate": 7.044079309331298e-06, + "loss": 0.8587, + "step": 13361 + }, + { + "epoch": 0.7354284770763388, + "grad_norm": 0.6345693469047546, + "learning_rate": 7.04368371269371e-06, + "loss": 0.77, + "step": 13362 + }, + { + "epoch": 0.7354835158786944, + "grad_norm": 0.7030417919158936, + "learning_rate": 7.043288100696343e-06, + "loss": 0.7624, + "step": 13363 + }, + { + "epoch": 0.7355385546810501, + "grad_norm": 0.7526041865348816, + "learning_rate": 7.042892473342169e-06, + "loss": 0.8018, + "step": 13364 + }, + { + "epoch": 0.7355935934834058, + "grad_norm": 0.6419941782951355, + "learning_rate": 7.042496830634162e-06, + "loss": 0.6788, + "step": 13365 + }, + { + "epoch": 0.7356486322857615, + "grad_norm": 0.6952203512191772, + "learning_rate": 7.042101172575297e-06, + "loss": 0.7747, + "step": 13366 + }, + { + "epoch": 0.7357036710881171, + "grad_norm": 0.8046327829360962, + "learning_rate": 7.041705499168544e-06, + "loss": 0.8216, + "step": 13367 + }, + { + "epoch": 0.7357587098904728, + "grad_norm": 0.6641537547111511, + "learning_rate": 7.041309810416881e-06, + "loss": 0.7313, + "step": 13368 + }, + { + "epoch": 0.7358137486928285, + "grad_norm": 0.6824444532394409, + "learning_rate": 7.040914106323278e-06, + "loss": 0.7179, + "step": 13369 + }, + { + "epoch": 0.7358687874951841, + "grad_norm": 0.6469557285308838, + "learning_rate": 7.040518386890711e-06, + "loss": 0.7671, + "step": 13370 + }, + { + "epoch": 0.7359238262975397, + "grad_norm": 0.6826488971710205, + "learning_rate": 7.040122652122156e-06, + "loss": 0.7, + "step": 13371 + }, + { + "epoch": 0.7359788650998954, + "grad_norm": 0.6931618452072144, + "learning_rate": 7.039726902020583e-06, + "loss": 0.7641, + "step": 13372 + }, + { + "epoch": 0.7360339039022511, + "grad_norm": 0.7445465922355652, + "learning_rate": 7.039331136588971e-06, + "loss": 0.7458, + "step": 13373 + }, + { + "epoch": 0.7360889427046068, + "grad_norm": 0.6358756422996521, + "learning_rate": 7.038935355830289e-06, + "loss": 0.6125, + "step": 13374 + }, + { + "epoch": 0.7361439815069624, + "grad_norm": 0.6966063380241394, + "learning_rate": 7.038539559747517e-06, + "loss": 0.6812, + "step": 13375 + }, + { + "epoch": 0.736199020309318, + "grad_norm": 0.9898090362548828, + "learning_rate": 7.038143748343626e-06, + "loss": 0.707, + "step": 13376 + }, + { + "epoch": 0.7362540591116737, + "grad_norm": 0.685951828956604, + "learning_rate": 7.0377479216215935e-06, + "loss": 0.7932, + "step": 13377 + }, + { + "epoch": 0.7363090979140294, + "grad_norm": 0.7056856751441956, + "learning_rate": 7.037352079584392e-06, + "loss": 0.7432, + "step": 13378 + }, + { + "epoch": 0.736364136716385, + "grad_norm": 0.7802489995956421, + "learning_rate": 7.036956222234999e-06, + "loss": 0.8275, + "step": 13379 + }, + { + "epoch": 0.7364191755187407, + "grad_norm": 0.7990192770957947, + "learning_rate": 7.036560349576387e-06, + "loss": 0.893, + "step": 13380 + }, + { + "epoch": 0.7364742143210964, + "grad_norm": 0.6454586386680603, + "learning_rate": 7.0361644616115334e-06, + "loss": 0.751, + "step": 13381 + }, + { + "epoch": 0.7365292531234521, + "grad_norm": 0.7071009278297424, + "learning_rate": 7.035768558343412e-06, + "loss": 0.7771, + "step": 13382 + }, + { + "epoch": 0.7365842919258077, + "grad_norm": 0.6530466079711914, + "learning_rate": 7.035372639774999e-06, + "loss": 0.7529, + "step": 13383 + }, + { + "epoch": 0.7366393307281633, + "grad_norm": 0.728689968585968, + "learning_rate": 7.03497670590927e-06, + "loss": 0.7862, + "step": 13384 + }, + { + "epoch": 0.736694369530519, + "grad_norm": 0.6640015244483948, + "learning_rate": 7.034580756749202e-06, + "loss": 0.6876, + "step": 13385 + }, + { + "epoch": 0.7367494083328747, + "grad_norm": 0.7388426661491394, + "learning_rate": 7.034184792297769e-06, + "loss": 0.8168, + "step": 13386 + }, + { + "epoch": 0.7368044471352303, + "grad_norm": 0.6543731093406677, + "learning_rate": 7.0337888125579465e-06, + "loss": 0.7555, + "step": 13387 + }, + { + "epoch": 0.736859485937586, + "grad_norm": 0.7783555388450623, + "learning_rate": 7.0333928175327125e-06, + "loss": 0.755, + "step": 13388 + }, + { + "epoch": 0.7369145247399417, + "grad_norm": 0.6275887489318848, + "learning_rate": 7.032996807225043e-06, + "loss": 0.7187, + "step": 13389 + }, + { + "epoch": 0.7369695635422974, + "grad_norm": 0.7007517218589783, + "learning_rate": 7.032600781637913e-06, + "loss": 0.6993, + "step": 13390 + }, + { + "epoch": 0.737024602344653, + "grad_norm": 0.6322247385978699, + "learning_rate": 7.0322047407743e-06, + "loss": 0.7178, + "step": 13391 + }, + { + "epoch": 0.7370796411470086, + "grad_norm": 0.7160976529121399, + "learning_rate": 7.0318086846371804e-06, + "loss": 0.6884, + "step": 13392 + }, + { + "epoch": 0.7371346799493643, + "grad_norm": 0.6056101322174072, + "learning_rate": 7.03141261322953e-06, + "loss": 0.6672, + "step": 13393 + }, + { + "epoch": 0.73718971875172, + "grad_norm": 0.8779410123825073, + "learning_rate": 7.0310165265543264e-06, + "loss": 0.7564, + "step": 13394 + }, + { + "epoch": 0.7372447575540756, + "grad_norm": 0.6868176460266113, + "learning_rate": 7.030620424614546e-06, + "loss": 0.7658, + "step": 13395 + }, + { + "epoch": 0.7372997963564313, + "grad_norm": 0.7611618041992188, + "learning_rate": 7.030224307413166e-06, + "loss": 0.6445, + "step": 13396 + }, + { + "epoch": 0.737354835158787, + "grad_norm": 0.7688242793083191, + "learning_rate": 7.0298281749531636e-06, + "loss": 0.8061, + "step": 13397 + }, + { + "epoch": 0.7374098739611427, + "grad_norm": 0.6781700849533081, + "learning_rate": 7.029432027237518e-06, + "loss": 0.6374, + "step": 13398 + }, + { + "epoch": 0.7374649127634982, + "grad_norm": 0.6719028353691101, + "learning_rate": 7.0290358642692e-06, + "loss": 0.7585, + "step": 13399 + }, + { + "epoch": 0.7375199515658539, + "grad_norm": 0.704429030418396, + "learning_rate": 7.028639686051195e-06, + "loss": 0.7052, + "step": 13400 + }, + { + "epoch": 0.7375749903682096, + "grad_norm": 0.714914083480835, + "learning_rate": 7.028243492586478e-06, + "loss": 0.7785, + "step": 13401 + }, + { + "epoch": 0.7376300291705652, + "grad_norm": 0.7732700705528259, + "learning_rate": 7.027847283878023e-06, + "loss": 0.7812, + "step": 13402 + }, + { + "epoch": 0.7376850679729209, + "grad_norm": 0.6849464178085327, + "learning_rate": 7.027451059928813e-06, + "loss": 0.7657, + "step": 13403 + }, + { + "epoch": 0.7377401067752766, + "grad_norm": 0.6924402117729187, + "learning_rate": 7.027054820741822e-06, + "loss": 0.677, + "step": 13404 + }, + { + "epoch": 0.7377951455776323, + "grad_norm": 0.7142716646194458, + "learning_rate": 7.02665856632003e-06, + "loss": 0.7071, + "step": 13405 + }, + { + "epoch": 0.7378501843799878, + "grad_norm": 0.7227265238761902, + "learning_rate": 7.0262622966664154e-06, + "loss": 0.6986, + "step": 13406 + }, + { + "epoch": 0.7379052231823435, + "grad_norm": 0.6387726664543152, + "learning_rate": 7.025866011783954e-06, + "loss": 0.6563, + "step": 13407 + }, + { + "epoch": 0.7379602619846992, + "grad_norm": 0.6411992311477661, + "learning_rate": 7.025469711675628e-06, + "loss": 0.5842, + "step": 13408 + }, + { + "epoch": 0.7380153007870549, + "grad_norm": 0.6811027526855469, + "learning_rate": 7.025073396344413e-06, + "loss": 0.6746, + "step": 13409 + }, + { + "epoch": 0.7380703395894105, + "grad_norm": 1.0705479383468628, + "learning_rate": 7.024677065793289e-06, + "loss": 0.7457, + "step": 13410 + }, + { + "epoch": 0.7381253783917662, + "grad_norm": 0.6920849084854126, + "learning_rate": 7.024280720025232e-06, + "loss": 0.6838, + "step": 13411 + }, + { + "epoch": 0.7381804171941219, + "grad_norm": 0.8089182376861572, + "learning_rate": 7.0238843590432236e-06, + "loss": 0.6682, + "step": 13412 + }, + { + "epoch": 0.7382354559964776, + "grad_norm": 0.6140334010124207, + "learning_rate": 7.023487982850244e-06, + "loss": 0.6992, + "step": 13413 + }, + { + "epoch": 0.7382904947988331, + "grad_norm": 0.8564643263816833, + "learning_rate": 7.023091591449269e-06, + "loss": 0.8512, + "step": 13414 + }, + { + "epoch": 0.7383455336011888, + "grad_norm": 0.655516505241394, + "learning_rate": 7.02269518484328e-06, + "loss": 0.7291, + "step": 13415 + }, + { + "epoch": 0.7384005724035445, + "grad_norm": 0.6373177766799927, + "learning_rate": 7.022298763035255e-06, + "loss": 0.7553, + "step": 13416 + }, + { + "epoch": 0.7384556112059002, + "grad_norm": 0.7023805379867554, + "learning_rate": 7.021902326028174e-06, + "loss": 0.7562, + "step": 13417 + }, + { + "epoch": 0.7385106500082558, + "grad_norm": 0.654181182384491, + "learning_rate": 7.021505873825016e-06, + "loss": 0.7153, + "step": 13418 + }, + { + "epoch": 0.7385656888106115, + "grad_norm": 0.6633459329605103, + "learning_rate": 7.02110940642876e-06, + "loss": 0.6779, + "step": 13419 + }, + { + "epoch": 0.7386207276129672, + "grad_norm": 0.7050659656524658, + "learning_rate": 7.020712923842388e-06, + "loss": 0.741, + "step": 13420 + }, + { + "epoch": 0.7386757664153228, + "grad_norm": 0.7241182327270508, + "learning_rate": 7.020316426068879e-06, + "loss": 0.7479, + "step": 13421 + }, + { + "epoch": 0.7387308052176784, + "grad_norm": 1.0262155532836914, + "learning_rate": 7.019919913111212e-06, + "loss": 0.8418, + "step": 13422 + }, + { + "epoch": 0.7387858440200341, + "grad_norm": 0.6765457391738892, + "learning_rate": 7.019523384972366e-06, + "loss": 0.727, + "step": 13423 + }, + { + "epoch": 0.7388408828223898, + "grad_norm": 0.6871724724769592, + "learning_rate": 7.0191268416553245e-06, + "loss": 0.8273, + "step": 13424 + }, + { + "epoch": 0.7388959216247455, + "grad_norm": 0.8085252046585083, + "learning_rate": 7.018730283163067e-06, + "loss": 0.7306, + "step": 13425 + }, + { + "epoch": 0.7389509604271011, + "grad_norm": 0.6822873950004578, + "learning_rate": 7.018333709498572e-06, + "loss": 0.7454, + "step": 13426 + }, + { + "epoch": 0.7390059992294568, + "grad_norm": 0.7210521697998047, + "learning_rate": 7.01793712066482e-06, + "loss": 0.8306, + "step": 13427 + }, + { + "epoch": 0.7390610380318124, + "grad_norm": 0.6404997110366821, + "learning_rate": 7.017540516664795e-06, + "loss": 0.7151, + "step": 13428 + }, + { + "epoch": 0.7391160768341681, + "grad_norm": 0.6662821769714355, + "learning_rate": 7.017143897501475e-06, + "loss": 0.7446, + "step": 13429 + }, + { + "epoch": 0.7391711156365237, + "grad_norm": 0.8048129081726074, + "learning_rate": 7.0167472631778415e-06, + "loss": 0.7953, + "step": 13430 + }, + { + "epoch": 0.7392261544388794, + "grad_norm": 0.7215000987052917, + "learning_rate": 7.016350613696873e-06, + "loss": 0.8373, + "step": 13431 + }, + { + "epoch": 0.7392811932412351, + "grad_norm": 0.7309150099754333, + "learning_rate": 7.015953949061555e-06, + "loss": 0.7654, + "step": 13432 + }, + { + "epoch": 0.7393362320435908, + "grad_norm": 0.6487464904785156, + "learning_rate": 7.0155572692748665e-06, + "loss": 0.6473, + "step": 13433 + }, + { + "epoch": 0.7393912708459464, + "grad_norm": 0.6172077059745789, + "learning_rate": 7.01516057433979e-06, + "loss": 0.6672, + "step": 13434 + }, + { + "epoch": 0.739446309648302, + "grad_norm": 0.7569651007652283, + "learning_rate": 7.014763864259304e-06, + "loss": 0.8501, + "step": 13435 + }, + { + "epoch": 0.7395013484506577, + "grad_norm": 0.824669599533081, + "learning_rate": 7.014367139036393e-06, + "loss": 0.8596, + "step": 13436 + }, + { + "epoch": 0.7395563872530134, + "grad_norm": 0.6904401183128357, + "learning_rate": 7.013970398674038e-06, + "loss": 0.7403, + "step": 13437 + }, + { + "epoch": 0.739611426055369, + "grad_norm": 0.7999581098556519, + "learning_rate": 7.013573643175221e-06, + "loss": 0.8879, + "step": 13438 + }, + { + "epoch": 0.7396664648577247, + "grad_norm": 0.6600533723831177, + "learning_rate": 7.0131768725429236e-06, + "loss": 0.7324, + "step": 13439 + }, + { + "epoch": 0.7397215036600804, + "grad_norm": 0.7174191474914551, + "learning_rate": 7.0127800867801275e-06, + "loss": 0.7474, + "step": 13440 + }, + { + "epoch": 0.7397765424624361, + "grad_norm": 0.7023884654045105, + "learning_rate": 7.012383285889814e-06, + "loss": 0.7826, + "step": 13441 + }, + { + "epoch": 0.7398315812647916, + "grad_norm": 0.6486913561820984, + "learning_rate": 7.011986469874969e-06, + "loss": 0.6553, + "step": 13442 + }, + { + "epoch": 0.7398866200671473, + "grad_norm": 0.7238486409187317, + "learning_rate": 7.011589638738569e-06, + "loss": 0.6759, + "step": 13443 + }, + { + "epoch": 0.739941658869503, + "grad_norm": 0.7879656553268433, + "learning_rate": 7.011192792483601e-06, + "loss": 0.886, + "step": 13444 + }, + { + "epoch": 0.7399966976718586, + "grad_norm": 0.6592407822608948, + "learning_rate": 7.010795931113047e-06, + "loss": 0.7746, + "step": 13445 + }, + { + "epoch": 0.7400517364742143, + "grad_norm": 0.8274507522583008, + "learning_rate": 7.010399054629889e-06, + "loss": 0.7615, + "step": 13446 + }, + { + "epoch": 0.74010677527657, + "grad_norm": 0.6233614087104797, + "learning_rate": 7.010002163037109e-06, + "loss": 0.695, + "step": 13447 + }, + { + "epoch": 0.7401618140789257, + "grad_norm": 0.7082701921463013, + "learning_rate": 7.00960525633769e-06, + "loss": 0.6677, + "step": 13448 + }, + { + "epoch": 0.7402168528812813, + "grad_norm": 1.0694652795791626, + "learning_rate": 7.009208334534618e-06, + "loss": 0.7792, + "step": 13449 + }, + { + "epoch": 0.7402718916836369, + "grad_norm": 0.7189109325408936, + "learning_rate": 7.008811397630874e-06, + "loss": 0.8606, + "step": 13450 + }, + { + "epoch": 0.7403269304859926, + "grad_norm": 0.7136901617050171, + "learning_rate": 7.00841444562944e-06, + "loss": 0.7142, + "step": 13451 + }, + { + "epoch": 0.7403819692883483, + "grad_norm": 0.6508508920669556, + "learning_rate": 7.008017478533301e-06, + "loss": 0.6748, + "step": 13452 + }, + { + "epoch": 0.7404370080907039, + "grad_norm": 0.6560903191566467, + "learning_rate": 7.007620496345441e-06, + "loss": 0.7929, + "step": 13453 + }, + { + "epoch": 0.7404920468930596, + "grad_norm": 0.6909067034721375, + "learning_rate": 7.007223499068841e-06, + "loss": 0.6118, + "step": 13454 + }, + { + "epoch": 0.7405470856954153, + "grad_norm": 0.6554582715034485, + "learning_rate": 7.0068264867064874e-06, + "loss": 0.7687, + "step": 13455 + }, + { + "epoch": 0.740602124497771, + "grad_norm": 0.7788346409797668, + "learning_rate": 7.006429459261363e-06, + "loss": 0.7535, + "step": 13456 + }, + { + "epoch": 0.7406571633001265, + "grad_norm": 0.7702943682670593, + "learning_rate": 7.006032416736452e-06, + "loss": 0.833, + "step": 13457 + }, + { + "epoch": 0.7407122021024822, + "grad_norm": 0.6860190033912659, + "learning_rate": 7.005635359134738e-06, + "loss": 0.6643, + "step": 13458 + }, + { + "epoch": 0.7407672409048379, + "grad_norm": 0.7470136880874634, + "learning_rate": 7.005238286459205e-06, + "loss": 0.7811, + "step": 13459 + }, + { + "epoch": 0.7408222797071936, + "grad_norm": 0.6769132614135742, + "learning_rate": 7.004841198712839e-06, + "loss": 0.7322, + "step": 13460 + }, + { + "epoch": 0.7408773185095492, + "grad_norm": 0.7865259647369385, + "learning_rate": 7.004444095898623e-06, + "loss": 0.817, + "step": 13461 + }, + { + "epoch": 0.7409323573119049, + "grad_norm": 0.7352784276008606, + "learning_rate": 7.004046978019542e-06, + "loss": 0.7373, + "step": 13462 + }, + { + "epoch": 0.7409873961142606, + "grad_norm": 0.7647448182106018, + "learning_rate": 7.00364984507858e-06, + "loss": 0.7129, + "step": 13463 + }, + { + "epoch": 0.7410424349166163, + "grad_norm": 0.6979989409446716, + "learning_rate": 7.003252697078722e-06, + "loss": 0.7833, + "step": 13464 + }, + { + "epoch": 0.7410974737189718, + "grad_norm": 0.6117465496063232, + "learning_rate": 7.002855534022953e-06, + "loss": 0.6732, + "step": 13465 + }, + { + "epoch": 0.7411525125213275, + "grad_norm": 0.6754159331321716, + "learning_rate": 7.002458355914258e-06, + "loss": 0.6939, + "step": 13466 + }, + { + "epoch": 0.7412075513236832, + "grad_norm": 0.6713566184043884, + "learning_rate": 7.002061162755621e-06, + "loss": 0.7459, + "step": 13467 + }, + { + "epoch": 0.7412625901260389, + "grad_norm": 0.6475394368171692, + "learning_rate": 7.001663954550029e-06, + "loss": 0.7912, + "step": 13468 + }, + { + "epoch": 0.7413176289283945, + "grad_norm": 0.6577908992767334, + "learning_rate": 7.001266731300467e-06, + "loss": 0.6903, + "step": 13469 + }, + { + "epoch": 0.7413726677307502, + "grad_norm": 0.8129748106002808, + "learning_rate": 7.00086949300992e-06, + "loss": 0.8277, + "step": 13470 + }, + { + "epoch": 0.7414277065331059, + "grad_norm": 0.6730444431304932, + "learning_rate": 7.000472239681372e-06, + "loss": 0.7357, + "step": 13471 + }, + { + "epoch": 0.7414827453354615, + "grad_norm": 0.7166460156440735, + "learning_rate": 7.000074971317812e-06, + "loss": 0.7544, + "step": 13472 + }, + { + "epoch": 0.7415377841378171, + "grad_norm": 0.6668731570243835, + "learning_rate": 6.9996776879222225e-06, + "loss": 0.7073, + "step": 13473 + }, + { + "epoch": 0.7415928229401728, + "grad_norm": 0.7031315565109253, + "learning_rate": 6.999280389497591e-06, + "loss": 0.7262, + "step": 13474 + }, + { + "epoch": 0.7416478617425285, + "grad_norm": 0.7426775693893433, + "learning_rate": 6.998883076046904e-06, + "loss": 0.7394, + "step": 13475 + }, + { + "epoch": 0.7417029005448842, + "grad_norm": 0.665226399898529, + "learning_rate": 6.9984857475731475e-06, + "loss": 0.7365, + "step": 13476 + }, + { + "epoch": 0.7417579393472398, + "grad_norm": 0.7762128114700317, + "learning_rate": 6.998088404079306e-06, + "loss": 0.8551, + "step": 13477 + }, + { + "epoch": 0.7418129781495955, + "grad_norm": 0.7129524350166321, + "learning_rate": 6.997691045568366e-06, + "loss": 0.7646, + "step": 13478 + }, + { + "epoch": 0.7418680169519511, + "grad_norm": 0.7199442386627197, + "learning_rate": 6.997293672043316e-06, + "loss": 0.6879, + "step": 13479 + }, + { + "epoch": 0.7419230557543068, + "grad_norm": 0.6559237241744995, + "learning_rate": 6.9968962835071415e-06, + "loss": 0.6965, + "step": 13480 + }, + { + "epoch": 0.7419780945566624, + "grad_norm": 0.7428768277168274, + "learning_rate": 6.996498879962829e-06, + "loss": 0.7748, + "step": 13481 + }, + { + "epoch": 0.7420331333590181, + "grad_norm": 0.7344076633453369, + "learning_rate": 6.996101461413365e-06, + "loss": 0.6554, + "step": 13482 + }, + { + "epoch": 0.7420881721613738, + "grad_norm": 0.7080272436141968, + "learning_rate": 6.995704027861736e-06, + "loss": 0.7335, + "step": 13483 + }, + { + "epoch": 0.7421432109637295, + "grad_norm": 0.6296887397766113, + "learning_rate": 6.9953065793109306e-06, + "loss": 0.6411, + "step": 13484 + }, + { + "epoch": 0.7421982497660851, + "grad_norm": 0.7597532868385315, + "learning_rate": 6.994909115763935e-06, + "loss": 0.8281, + "step": 13485 + }, + { + "epoch": 0.7422532885684407, + "grad_norm": 0.7059680819511414, + "learning_rate": 6.994511637223737e-06, + "loss": 0.8075, + "step": 13486 + }, + { + "epoch": 0.7423083273707964, + "grad_norm": 0.8097653388977051, + "learning_rate": 6.994114143693323e-06, + "loss": 0.772, + "step": 13487 + }, + { + "epoch": 0.742363366173152, + "grad_norm": 0.7609913945198059, + "learning_rate": 6.993716635175681e-06, + "loss": 0.8265, + "step": 13488 + }, + { + "epoch": 0.7424184049755077, + "grad_norm": 0.6209948062896729, + "learning_rate": 6.993319111673799e-06, + "loss": 0.6266, + "step": 13489 + }, + { + "epoch": 0.7424734437778634, + "grad_norm": 0.6655107140541077, + "learning_rate": 6.992921573190663e-06, + "loss": 0.7519, + "step": 13490 + }, + { + "epoch": 0.7425284825802191, + "grad_norm": 1.1243617534637451, + "learning_rate": 6.992524019729262e-06, + "loss": 0.7707, + "step": 13491 + }, + { + "epoch": 0.7425835213825747, + "grad_norm": 0.6680326461791992, + "learning_rate": 6.9921264512925845e-06, + "loss": 0.7344, + "step": 13492 + }, + { + "epoch": 0.7426385601849304, + "grad_norm": 0.7689213156700134, + "learning_rate": 6.991728867883618e-06, + "loss": 0.7591, + "step": 13493 + }, + { + "epoch": 0.742693598987286, + "grad_norm": 0.8587394952774048, + "learning_rate": 6.99133126950535e-06, + "loss": 0.6991, + "step": 13494 + }, + { + "epoch": 0.7427486377896417, + "grad_norm": 0.6736756563186646, + "learning_rate": 6.990933656160768e-06, + "loss": 0.7604, + "step": 13495 + }, + { + "epoch": 0.7428036765919973, + "grad_norm": 0.6538887023925781, + "learning_rate": 6.990536027852864e-06, + "loss": 0.7332, + "step": 13496 + }, + { + "epoch": 0.742858715394353, + "grad_norm": 0.6578357815742493, + "learning_rate": 6.990138384584623e-06, + "loss": 0.7238, + "step": 13497 + }, + { + "epoch": 0.7429137541967087, + "grad_norm": 0.6865534782409668, + "learning_rate": 6.989740726359035e-06, + "loss": 0.7012, + "step": 13498 + }, + { + "epoch": 0.7429687929990644, + "grad_norm": 0.6198129057884216, + "learning_rate": 6.989343053179088e-06, + "loss": 0.7391, + "step": 13499 + }, + { + "epoch": 0.74302383180142, + "grad_norm": 0.6929547786712646, + "learning_rate": 6.98894536504777e-06, + "loss": 0.8498, + "step": 13500 + }, + { + "epoch": 0.7430788706037756, + "grad_norm": 0.6863006353378296, + "learning_rate": 6.988547661968072e-06, + "loss": 0.6589, + "step": 13501 + }, + { + "epoch": 0.7431339094061313, + "grad_norm": 0.7490457892417908, + "learning_rate": 6.988149943942982e-06, + "loss": 0.8145, + "step": 13502 + }, + { + "epoch": 0.743188948208487, + "grad_norm": 0.6597211360931396, + "learning_rate": 6.987752210975489e-06, + "loss": 0.7786, + "step": 13503 + }, + { + "epoch": 0.7432439870108426, + "grad_norm": 0.7211003303527832, + "learning_rate": 6.987354463068583e-06, + "loss": 0.7668, + "step": 13504 + }, + { + "epoch": 0.7432990258131983, + "grad_norm": 0.6257827877998352, + "learning_rate": 6.9869567002252526e-06, + "loss": 0.7378, + "step": 13505 + }, + { + "epoch": 0.743354064615554, + "grad_norm": 0.656944751739502, + "learning_rate": 6.986558922448488e-06, + "loss": 0.6408, + "step": 13506 + }, + { + "epoch": 0.7434091034179097, + "grad_norm": 0.6862110495567322, + "learning_rate": 6.986161129741276e-06, + "loss": 0.7648, + "step": 13507 + }, + { + "epoch": 0.7434641422202652, + "grad_norm": 0.6216374039649963, + "learning_rate": 6.985763322106612e-06, + "loss": 0.6826, + "step": 13508 + }, + { + "epoch": 0.7435191810226209, + "grad_norm": 0.7959128618240356, + "learning_rate": 6.985365499547479e-06, + "loss": 0.7554, + "step": 13509 + }, + { + "epoch": 0.7435742198249766, + "grad_norm": 0.5882300734519958, + "learning_rate": 6.984967662066875e-06, + "loss": 0.6523, + "step": 13510 + }, + { + "epoch": 0.7436292586273323, + "grad_norm": 0.8529833555221558, + "learning_rate": 6.9845698096677805e-06, + "loss": 0.7871, + "step": 13511 + }, + { + "epoch": 0.7436842974296879, + "grad_norm": 1.2988953590393066, + "learning_rate": 6.9841719423531925e-06, + "loss": 0.708, + "step": 13512 + }, + { + "epoch": 0.7437393362320436, + "grad_norm": 0.6735696792602539, + "learning_rate": 6.983774060126101e-06, + "loss": 0.7962, + "step": 13513 + }, + { + "epoch": 0.7437943750343993, + "grad_norm": 0.8145982623100281, + "learning_rate": 6.9833761629894925e-06, + "loss": 0.9067, + "step": 13514 + }, + { + "epoch": 0.743849413836755, + "grad_norm": 0.7107387781143188, + "learning_rate": 6.98297825094636e-06, + "loss": 0.7986, + "step": 13515 + }, + { + "epoch": 0.7439044526391105, + "grad_norm": 0.7350436449050903, + "learning_rate": 6.9825803239996934e-06, + "loss": 0.7724, + "step": 13516 + }, + { + "epoch": 0.7439594914414662, + "grad_norm": 0.7300962805747986, + "learning_rate": 6.982182382152485e-06, + "loss": 0.734, + "step": 13517 + }, + { + "epoch": 0.7440145302438219, + "grad_norm": 0.7088475823402405, + "learning_rate": 6.981784425407724e-06, + "loss": 0.818, + "step": 13518 + }, + { + "epoch": 0.7440695690461776, + "grad_norm": 0.6911785006523132, + "learning_rate": 6.981386453768402e-06, + "loss": 0.6857, + "step": 13519 + }, + { + "epoch": 0.7441246078485332, + "grad_norm": 0.794143795967102, + "learning_rate": 6.980988467237508e-06, + "loss": 0.7496, + "step": 13520 + }, + { + "epoch": 0.7441796466508889, + "grad_norm": 0.7116371989250183, + "learning_rate": 6.980590465818037e-06, + "loss": 0.7082, + "step": 13521 + }, + { + "epoch": 0.7442346854532446, + "grad_norm": 0.6306180953979492, + "learning_rate": 6.980192449512978e-06, + "loss": 0.7227, + "step": 13522 + }, + { + "epoch": 0.7442897242556002, + "grad_norm": 0.6662481427192688, + "learning_rate": 6.979794418325323e-06, + "loss": 0.7323, + "step": 13523 + }, + { + "epoch": 0.7443447630579558, + "grad_norm": 0.6824387907981873, + "learning_rate": 6.97939637225806e-06, + "loss": 0.7188, + "step": 13524 + }, + { + "epoch": 0.7443998018603115, + "grad_norm": 0.7429190278053284, + "learning_rate": 6.9789983113141865e-06, + "loss": 0.7818, + "step": 13525 + }, + { + "epoch": 0.7444548406626672, + "grad_norm": 0.7148364782333374, + "learning_rate": 6.978600235496692e-06, + "loss": 0.7665, + "step": 13526 + }, + { + "epoch": 0.7445098794650229, + "grad_norm": 0.711482584476471, + "learning_rate": 6.978202144808567e-06, + "loss": 0.7865, + "step": 13527 + }, + { + "epoch": 0.7445649182673785, + "grad_norm": 0.6913465857505798, + "learning_rate": 6.977804039252802e-06, + "loss": 0.8206, + "step": 13528 + }, + { + "epoch": 0.7446199570697342, + "grad_norm": 0.9090713858604431, + "learning_rate": 6.977405918832394e-06, + "loss": 0.7243, + "step": 13529 + }, + { + "epoch": 0.7446749958720899, + "grad_norm": 0.7680408954620361, + "learning_rate": 6.977007783550331e-06, + "loss": 0.847, + "step": 13530 + }, + { + "epoch": 0.7447300346744454, + "grad_norm": 0.6486232876777649, + "learning_rate": 6.976609633409608e-06, + "loss": 0.7258, + "step": 13531 + }, + { + "epoch": 0.7447850734768011, + "grad_norm": 0.7612336277961731, + "learning_rate": 6.976211468413214e-06, + "loss": 0.7452, + "step": 13532 + }, + { + "epoch": 0.7448401122791568, + "grad_norm": 0.7539309859275818, + "learning_rate": 6.975813288564146e-06, + "loss": 0.8292, + "step": 13533 + }, + { + "epoch": 0.7448951510815125, + "grad_norm": 0.64984530210495, + "learning_rate": 6.975415093865394e-06, + "loss": 0.6818, + "step": 13534 + }, + { + "epoch": 0.7449501898838681, + "grad_norm": 0.6415309309959412, + "learning_rate": 6.9750168843199506e-06, + "loss": 0.7369, + "step": 13535 + }, + { + "epoch": 0.7450052286862238, + "grad_norm": 0.7107319235801697, + "learning_rate": 6.974618659930807e-06, + "loss": 0.7364, + "step": 13536 + }, + { + "epoch": 0.7450602674885795, + "grad_norm": 0.7358448505401611, + "learning_rate": 6.9742204207009605e-06, + "loss": 0.7784, + "step": 13537 + }, + { + "epoch": 0.7451153062909351, + "grad_norm": 0.6950068473815918, + "learning_rate": 6.9738221666334e-06, + "loss": 0.792, + "step": 13538 + }, + { + "epoch": 0.7451703450932907, + "grad_norm": 0.7355311512947083, + "learning_rate": 6.973423897731122e-06, + "loss": 0.7631, + "step": 13539 + }, + { + "epoch": 0.7452253838956464, + "grad_norm": 0.6813983917236328, + "learning_rate": 6.9730256139971175e-06, + "loss": 0.7397, + "step": 13540 + }, + { + "epoch": 0.7452804226980021, + "grad_norm": 0.7698497772216797, + "learning_rate": 6.9726273154343806e-06, + "loss": 0.7769, + "step": 13541 + }, + { + "epoch": 0.7453354615003578, + "grad_norm": 0.7406428456306458, + "learning_rate": 6.972229002045905e-06, + "loss": 0.6502, + "step": 13542 + }, + { + "epoch": 0.7453905003027134, + "grad_norm": 0.6976667046546936, + "learning_rate": 6.9718306738346846e-06, + "loss": 0.773, + "step": 13543 + }, + { + "epoch": 0.745445539105069, + "grad_norm": 0.6932592391967773, + "learning_rate": 6.9714323308037115e-06, + "loss": 0.7315, + "step": 13544 + }, + { + "epoch": 0.7455005779074247, + "grad_norm": 0.7329851984977722, + "learning_rate": 6.971033972955981e-06, + "loss": 0.7432, + "step": 13545 + }, + { + "epoch": 0.7455556167097804, + "grad_norm": 0.6262860298156738, + "learning_rate": 6.970635600294489e-06, + "loss": 0.6368, + "step": 13546 + }, + { + "epoch": 0.745610655512136, + "grad_norm": 0.7157273292541504, + "learning_rate": 6.970237212822225e-06, + "loss": 0.7209, + "step": 13547 + }, + { + "epoch": 0.7456656943144917, + "grad_norm": 0.7256374955177307, + "learning_rate": 6.9698388105421855e-06, + "loss": 0.794, + "step": 13548 + }, + { + "epoch": 0.7457207331168474, + "grad_norm": 0.7763124704360962, + "learning_rate": 6.969440393457365e-06, + "loss": 0.7211, + "step": 13549 + }, + { + "epoch": 0.7457757719192031, + "grad_norm": 0.7139148712158203, + "learning_rate": 6.9690419615707585e-06, + "loss": 0.6612, + "step": 13550 + }, + { + "epoch": 0.7458308107215587, + "grad_norm": 0.7532974481582642, + "learning_rate": 6.968643514885359e-06, + "loss": 0.6952, + "step": 13551 + }, + { + "epoch": 0.7458858495239143, + "grad_norm": 0.6845714449882507, + "learning_rate": 6.968245053404161e-06, + "loss": 0.6972, + "step": 13552 + }, + { + "epoch": 0.74594088832627, + "grad_norm": 0.7445462346076965, + "learning_rate": 6.967846577130162e-06, + "loss": 0.7826, + "step": 13553 + }, + { + "epoch": 0.7459959271286257, + "grad_norm": 0.7269366383552551, + "learning_rate": 6.967448086066353e-06, + "loss": 0.7353, + "step": 13554 + }, + { + "epoch": 0.7460509659309813, + "grad_norm": 0.7366362810134888, + "learning_rate": 6.967049580215732e-06, + "loss": 0.7955, + "step": 13555 + }, + { + "epoch": 0.746106004733337, + "grad_norm": 0.6456870436668396, + "learning_rate": 6.966651059581292e-06, + "loss": 0.7467, + "step": 13556 + }, + { + "epoch": 0.7461610435356927, + "grad_norm": 0.7196624279022217, + "learning_rate": 6.966252524166031e-06, + "loss": 0.6621, + "step": 13557 + }, + { + "epoch": 0.7462160823380484, + "grad_norm": 0.6776413917541504, + "learning_rate": 6.965853973972941e-06, + "loss": 0.7647, + "step": 13558 + }, + { + "epoch": 0.746271121140404, + "grad_norm": 0.7319629192352295, + "learning_rate": 6.9654554090050195e-06, + "loss": 0.8172, + "step": 13559 + }, + { + "epoch": 0.7463261599427596, + "grad_norm": 0.6995210647583008, + "learning_rate": 6.96505682926526e-06, + "loss": 0.7252, + "step": 13560 + }, + { + "epoch": 0.7463811987451153, + "grad_norm": 0.6520518064498901, + "learning_rate": 6.964658234756659e-06, + "loss": 0.6856, + "step": 13561 + }, + { + "epoch": 0.746436237547471, + "grad_norm": 0.7562724947929382, + "learning_rate": 6.964259625482215e-06, + "loss": 0.7088, + "step": 13562 + }, + { + "epoch": 0.7464912763498266, + "grad_norm": 0.788045346736908, + "learning_rate": 6.963861001444919e-06, + "loss": 0.7183, + "step": 13563 + }, + { + "epoch": 0.7465463151521823, + "grad_norm": 0.7461729049682617, + "learning_rate": 6.96346236264777e-06, + "loss": 0.6725, + "step": 13564 + }, + { + "epoch": 0.746601353954538, + "grad_norm": 0.7283952832221985, + "learning_rate": 6.963063709093764e-06, + "loss": 0.7765, + "step": 13565 + }, + { + "epoch": 0.7466563927568937, + "grad_norm": 0.7947741150856018, + "learning_rate": 6.962665040785896e-06, + "loss": 0.8423, + "step": 13566 + }, + { + "epoch": 0.7467114315592492, + "grad_norm": 0.7964398264884949, + "learning_rate": 6.962266357727164e-06, + "loss": 0.7589, + "step": 13567 + }, + { + "epoch": 0.7467664703616049, + "grad_norm": 0.7807595133781433, + "learning_rate": 6.961867659920563e-06, + "loss": 0.7843, + "step": 13568 + }, + { + "epoch": 0.7468215091639606, + "grad_norm": 0.678011417388916, + "learning_rate": 6.961468947369089e-06, + "loss": 0.6664, + "step": 13569 + }, + { + "epoch": 0.7468765479663163, + "grad_norm": 0.6768447756767273, + "learning_rate": 6.961070220075741e-06, + "loss": 0.7531, + "step": 13570 + }, + { + "epoch": 0.7469315867686719, + "grad_norm": 0.7405245304107666, + "learning_rate": 6.960671478043514e-06, + "loss": 0.8278, + "step": 13571 + }, + { + "epoch": 0.7469866255710276, + "grad_norm": 0.605675458908081, + "learning_rate": 6.960272721275403e-06, + "loss": 0.7167, + "step": 13572 + }, + { + "epoch": 0.7470416643733833, + "grad_norm": 0.7406657338142395, + "learning_rate": 6.959873949774409e-06, + "loss": 0.8191, + "step": 13573 + }, + { + "epoch": 0.7470967031757388, + "grad_norm": 0.6163522601127625, + "learning_rate": 6.959475163543526e-06, + "loss": 0.6711, + "step": 13574 + }, + { + "epoch": 0.7471517419780945, + "grad_norm": 0.6036590337753296, + "learning_rate": 6.9590763625857525e-06, + "loss": 0.7029, + "step": 13575 + }, + { + "epoch": 0.7472067807804502, + "grad_norm": 0.8638957738876343, + "learning_rate": 6.9586775469040845e-06, + "loss": 0.6288, + "step": 13576 + }, + { + "epoch": 0.7472618195828059, + "grad_norm": 0.7490845322608948, + "learning_rate": 6.958278716501521e-06, + "loss": 0.7375, + "step": 13577 + }, + { + "epoch": 0.7473168583851615, + "grad_norm": 0.7788114547729492, + "learning_rate": 6.957879871381059e-06, + "loss": 0.814, + "step": 13578 + }, + { + "epoch": 0.7473718971875172, + "grad_norm": 0.7247292995452881, + "learning_rate": 6.957481011545697e-06, + "loss": 0.6187, + "step": 13579 + }, + { + "epoch": 0.7474269359898729, + "grad_norm": 0.9642785787582397, + "learning_rate": 6.95708213699843e-06, + "loss": 0.8745, + "step": 13580 + }, + { + "epoch": 0.7474819747922286, + "grad_norm": 0.701675295829773, + "learning_rate": 6.956683247742259e-06, + "loss": 0.8474, + "step": 13581 + }, + { + "epoch": 0.7475370135945841, + "grad_norm": 0.6338050961494446, + "learning_rate": 6.9562843437801795e-06, + "loss": 0.7346, + "step": 13582 + }, + { + "epoch": 0.7475920523969398, + "grad_norm": 0.6954126358032227, + "learning_rate": 6.955885425115191e-06, + "loss": 0.8083, + "step": 13583 + }, + { + "epoch": 0.7476470911992955, + "grad_norm": 0.7316300272941589, + "learning_rate": 6.95548649175029e-06, + "loss": 0.8009, + "step": 13584 + }, + { + "epoch": 0.7477021300016512, + "grad_norm": 0.6314196586608887, + "learning_rate": 6.955087543688477e-06, + "loss": 0.6375, + "step": 13585 + }, + { + "epoch": 0.7477571688040068, + "grad_norm": 0.6604906320571899, + "learning_rate": 6.9546885809327495e-06, + "loss": 0.7081, + "step": 13586 + }, + { + "epoch": 0.7478122076063625, + "grad_norm": 0.8251973986625671, + "learning_rate": 6.9542896034861064e-06, + "loss": 0.7483, + "step": 13587 + }, + { + "epoch": 0.7478672464087182, + "grad_norm": 0.6946399211883545, + "learning_rate": 6.953890611351544e-06, + "loss": 0.8849, + "step": 13588 + }, + { + "epoch": 0.7479222852110738, + "grad_norm": 0.7713609933853149, + "learning_rate": 6.953491604532063e-06, + "loss": 0.7913, + "step": 13589 + }, + { + "epoch": 0.7479773240134294, + "grad_norm": 0.734355092048645, + "learning_rate": 6.953092583030664e-06, + "loss": 0.7216, + "step": 13590 + }, + { + "epoch": 0.7480323628157851, + "grad_norm": 0.6147064566612244, + "learning_rate": 6.952693546850342e-06, + "loss": 0.6894, + "step": 13591 + }, + { + "epoch": 0.7480874016181408, + "grad_norm": 0.7472255229949951, + "learning_rate": 6.9522944959940986e-06, + "loss": 0.7941, + "step": 13592 + }, + { + "epoch": 0.7481424404204965, + "grad_norm": 0.6478431224822998, + "learning_rate": 6.951895430464935e-06, + "loss": 0.6995, + "step": 13593 + }, + { + "epoch": 0.7481974792228521, + "grad_norm": 0.6956225633621216, + "learning_rate": 6.951496350265844e-06, + "loss": 0.7637, + "step": 13594 + }, + { + "epoch": 0.7482525180252078, + "grad_norm": 1.0637938976287842, + "learning_rate": 6.95109725539983e-06, + "loss": 0.7448, + "step": 13595 + }, + { + "epoch": 0.7483075568275634, + "grad_norm": 0.6948299407958984, + "learning_rate": 6.9506981458698916e-06, + "loss": 0.7343, + "step": 13596 + }, + { + "epoch": 0.7483625956299191, + "grad_norm": 0.9034255743026733, + "learning_rate": 6.950299021679028e-06, + "loss": 0.6481, + "step": 13597 + }, + { + "epoch": 0.7484176344322747, + "grad_norm": 0.7901731729507446, + "learning_rate": 6.949899882830239e-06, + "loss": 0.8368, + "step": 13598 + }, + { + "epoch": 0.7484726732346304, + "grad_norm": 0.7791730761528015, + "learning_rate": 6.949500729326525e-06, + "loss": 0.7912, + "step": 13599 + }, + { + "epoch": 0.7485277120369861, + "grad_norm": 0.7678626179695129, + "learning_rate": 6.949101561170883e-06, + "loss": 0.7514, + "step": 13600 + }, + { + "epoch": 0.7485827508393418, + "grad_norm": 0.709762454032898, + "learning_rate": 6.948702378366318e-06, + "loss": 0.6809, + "step": 13601 + }, + { + "epoch": 0.7486377896416974, + "grad_norm": 0.706031084060669, + "learning_rate": 6.948303180915827e-06, + "loss": 0.7454, + "step": 13602 + }, + { + "epoch": 0.748692828444053, + "grad_norm": 0.658869743347168, + "learning_rate": 6.9479039688224105e-06, + "loss": 0.6498, + "step": 13603 + }, + { + "epoch": 0.7487478672464087, + "grad_norm": 0.7253865599632263, + "learning_rate": 6.9475047420890685e-06, + "loss": 0.8063, + "step": 13604 + }, + { + "epoch": 0.7488029060487644, + "grad_norm": 0.752839207649231, + "learning_rate": 6.947105500718804e-06, + "loss": 0.7708, + "step": 13605 + }, + { + "epoch": 0.74885794485112, + "grad_norm": 0.6694571375846863, + "learning_rate": 6.946706244714615e-06, + "loss": 0.7121, + "step": 13606 + }, + { + "epoch": 0.7489129836534757, + "grad_norm": 0.751380443572998, + "learning_rate": 6.946306974079503e-06, + "loss": 0.8797, + "step": 13607 + }, + { + "epoch": 0.7489680224558314, + "grad_norm": 0.8001984357833862, + "learning_rate": 6.9459076888164676e-06, + "loss": 0.8963, + "step": 13608 + }, + { + "epoch": 0.7490230612581871, + "grad_norm": 0.7149432301521301, + "learning_rate": 6.945508388928511e-06, + "loss": 0.8311, + "step": 13609 + }, + { + "epoch": 0.7490781000605427, + "grad_norm": 0.8295183777809143, + "learning_rate": 6.945109074418635e-06, + "loss": 0.7466, + "step": 13610 + }, + { + "epoch": 0.7491331388628983, + "grad_norm": 0.7480556964874268, + "learning_rate": 6.94470974528984e-06, + "loss": 0.8277, + "step": 13611 + }, + { + "epoch": 0.749188177665254, + "grad_norm": 0.7962234616279602, + "learning_rate": 6.944310401545127e-06, + "loss": 0.7143, + "step": 13612 + }, + { + "epoch": 0.7492432164676097, + "grad_norm": 0.7722699642181396, + "learning_rate": 6.943911043187497e-06, + "loss": 0.6619, + "step": 13613 + }, + { + "epoch": 0.7492982552699653, + "grad_norm": 0.8495624661445618, + "learning_rate": 6.943511670219952e-06, + "loss": 0.8475, + "step": 13614 + }, + { + "epoch": 0.749353294072321, + "grad_norm": 0.7702826261520386, + "learning_rate": 6.943112282645494e-06, + "loss": 0.826, + "step": 13615 + }, + { + "epoch": 0.7494083328746767, + "grad_norm": 0.7435297966003418, + "learning_rate": 6.942712880467124e-06, + "loss": 0.8121, + "step": 13616 + }, + { + "epoch": 0.7494633716770323, + "grad_norm": 0.8108325600624084, + "learning_rate": 6.942313463687844e-06, + "loss": 0.7282, + "step": 13617 + }, + { + "epoch": 0.7495184104793879, + "grad_norm": 0.6840381622314453, + "learning_rate": 6.9419140323106574e-06, + "loss": 0.7446, + "step": 13618 + }, + { + "epoch": 0.7495734492817436, + "grad_norm": 0.7155357599258423, + "learning_rate": 6.941514586338562e-06, + "loss": 0.7598, + "step": 13619 + }, + { + "epoch": 0.7496284880840993, + "grad_norm": 0.7693290114402771, + "learning_rate": 6.941115125774564e-06, + "loss": 0.7666, + "step": 13620 + }, + { + "epoch": 0.7496835268864549, + "grad_norm": 0.6918750405311584, + "learning_rate": 6.940715650621665e-06, + "loss": 0.6831, + "step": 13621 + }, + { + "epoch": 0.7497385656888106, + "grad_norm": 0.8241471648216248, + "learning_rate": 6.9403161608828654e-06, + "loss": 0.6753, + "step": 13622 + }, + { + "epoch": 0.7497936044911663, + "grad_norm": 0.6659193634986877, + "learning_rate": 6.93991665656117e-06, + "loss": 0.6988, + "step": 13623 + }, + { + "epoch": 0.749848643293522, + "grad_norm": 0.8012998700141907, + "learning_rate": 6.9395171376595795e-06, + "loss": 0.7922, + "step": 13624 + }, + { + "epoch": 0.7499036820958775, + "grad_norm": 0.783018946647644, + "learning_rate": 6.9391176041810974e-06, + "loss": 0.7062, + "step": 13625 + }, + { + "epoch": 0.7499587208982332, + "grad_norm": 0.8228014707565308, + "learning_rate": 6.938718056128726e-06, + "loss": 0.7762, + "step": 13626 + }, + { + "epoch": 0.7500137597005889, + "grad_norm": 0.783525288105011, + "learning_rate": 6.9383184935054705e-06, + "loss": 0.7517, + "step": 13627 + }, + { + "epoch": 0.7500687985029446, + "grad_norm": 0.6686612963676453, + "learning_rate": 6.93791891631433e-06, + "loss": 0.7372, + "step": 13628 + }, + { + "epoch": 0.7501238373053002, + "grad_norm": 0.7089647054672241, + "learning_rate": 6.937519324558312e-06, + "loss": 0.7847, + "step": 13629 + }, + { + "epoch": 0.7501788761076559, + "grad_norm": 0.7674399018287659, + "learning_rate": 6.937119718240415e-06, + "loss": 0.7414, + "step": 13630 + }, + { + "epoch": 0.7502339149100116, + "grad_norm": 0.6331565380096436, + "learning_rate": 6.936720097363646e-06, + "loss": 0.7603, + "step": 13631 + }, + { + "epoch": 0.7502889537123673, + "grad_norm": 0.7084798812866211, + "learning_rate": 6.9363204619310065e-06, + "loss": 0.6844, + "step": 13632 + }, + { + "epoch": 0.7503439925147228, + "grad_norm": 0.8624362945556641, + "learning_rate": 6.9359208119455015e-06, + "loss": 0.7098, + "step": 13633 + }, + { + "epoch": 0.7503990313170785, + "grad_norm": 0.7681849598884583, + "learning_rate": 6.935521147410134e-06, + "loss": 0.7896, + "step": 13634 + }, + { + "epoch": 0.7504540701194342, + "grad_norm": 0.7494263052940369, + "learning_rate": 6.935121468327907e-06, + "loss": 0.7858, + "step": 13635 + }, + { + "epoch": 0.7505091089217899, + "grad_norm": 0.7102827429771423, + "learning_rate": 6.934721774701824e-06, + "loss": 0.7485, + "step": 13636 + }, + { + "epoch": 0.7505641477241455, + "grad_norm": 0.7031061053276062, + "learning_rate": 6.934322066534891e-06, + "loss": 0.7154, + "step": 13637 + }, + { + "epoch": 0.7506191865265012, + "grad_norm": 0.6468148231506348, + "learning_rate": 6.933922343830112e-06, + "loss": 0.729, + "step": 13638 + }, + { + "epoch": 0.7506742253288569, + "grad_norm": 0.8570408225059509, + "learning_rate": 6.933522606590489e-06, + "loss": 0.6922, + "step": 13639 + }, + { + "epoch": 0.7507292641312125, + "grad_norm": 0.6836286783218384, + "learning_rate": 6.933122854819027e-06, + "loss": 0.7982, + "step": 13640 + }, + { + "epoch": 0.7507843029335681, + "grad_norm": 1.052017092704773, + "learning_rate": 6.9327230885187344e-06, + "loss": 0.7522, + "step": 13641 + }, + { + "epoch": 0.7508393417359238, + "grad_norm": 0.6352099180221558, + "learning_rate": 6.932323307692611e-06, + "loss": 0.6724, + "step": 13642 + }, + { + "epoch": 0.7508943805382795, + "grad_norm": 0.7046655416488647, + "learning_rate": 6.931923512343663e-06, + "loss": 0.7732, + "step": 13643 + }, + { + "epoch": 0.7509494193406352, + "grad_norm": 0.7600587010383606, + "learning_rate": 6.931523702474893e-06, + "loss": 0.7013, + "step": 13644 + }, + { + "epoch": 0.7510044581429908, + "grad_norm": 0.674828052520752, + "learning_rate": 6.9311238780893095e-06, + "loss": 0.7022, + "step": 13645 + }, + { + "epoch": 0.7510594969453465, + "grad_norm": 0.7517798542976379, + "learning_rate": 6.930724039189916e-06, + "loss": 0.7248, + "step": 13646 + }, + { + "epoch": 0.7511145357477022, + "grad_norm": 0.7851112484931946, + "learning_rate": 6.930324185779716e-06, + "loss": 0.8025, + "step": 13647 + }, + { + "epoch": 0.7511695745500578, + "grad_norm": 0.6545413732528687, + "learning_rate": 6.929924317861717e-06, + "loss": 0.781, + "step": 13648 + }, + { + "epoch": 0.7512246133524134, + "grad_norm": 0.7079984545707703, + "learning_rate": 6.929524435438923e-06, + "loss": 0.8033, + "step": 13649 + }, + { + "epoch": 0.7512796521547691, + "grad_norm": 0.6501914262771606, + "learning_rate": 6.929124538514341e-06, + "loss": 0.7525, + "step": 13650 + }, + { + "epoch": 0.7513346909571248, + "grad_norm": 0.7697597742080688, + "learning_rate": 6.928724627090975e-06, + "loss": 0.7358, + "step": 13651 + }, + { + "epoch": 0.7513897297594805, + "grad_norm": 0.8155171275138855, + "learning_rate": 6.928324701171832e-06, + "loss": 0.7389, + "step": 13652 + }, + { + "epoch": 0.7514447685618361, + "grad_norm": 0.6969262361526489, + "learning_rate": 6.927924760759914e-06, + "loss": 0.8349, + "step": 13653 + }, + { + "epoch": 0.7514998073641918, + "grad_norm": 0.6736776828765869, + "learning_rate": 6.927524805858233e-06, + "loss": 0.7379, + "step": 13654 + }, + { + "epoch": 0.7515548461665474, + "grad_norm": 0.6362389922142029, + "learning_rate": 6.927124836469788e-06, + "loss": 0.7479, + "step": 13655 + }, + { + "epoch": 0.7516098849689031, + "grad_norm": 0.688922643661499, + "learning_rate": 6.92672485259759e-06, + "loss": 0.7828, + "step": 13656 + }, + { + "epoch": 0.7516649237712587, + "grad_norm": 0.7098214030265808, + "learning_rate": 6.926324854244644e-06, + "loss": 0.6084, + "step": 13657 + }, + { + "epoch": 0.7517199625736144, + "grad_norm": 0.6436209678649902, + "learning_rate": 6.925924841413956e-06, + "loss": 0.687, + "step": 13658 + }, + { + "epoch": 0.7517750013759701, + "grad_norm": 0.6051730513572693, + "learning_rate": 6.925524814108533e-06, + "loss": 0.6884, + "step": 13659 + }, + { + "epoch": 0.7518300401783257, + "grad_norm": 0.6347759962081909, + "learning_rate": 6.92512477233138e-06, + "loss": 0.7057, + "step": 13660 + }, + { + "epoch": 0.7518850789806814, + "grad_norm": 0.6917054653167725, + "learning_rate": 6.924724716085505e-06, + "loss": 0.8374, + "step": 13661 + }, + { + "epoch": 0.751940117783037, + "grad_norm": 0.7676698565483093, + "learning_rate": 6.924324645373914e-06, + "loss": 0.7435, + "step": 13662 + }, + { + "epoch": 0.7519951565853927, + "grad_norm": 0.6601388454437256, + "learning_rate": 6.923924560199613e-06, + "loss": 0.7168, + "step": 13663 + }, + { + "epoch": 0.7520501953877483, + "grad_norm": 0.6342683434486389, + "learning_rate": 6.923524460565611e-06, + "loss": 0.7382, + "step": 13664 + }, + { + "epoch": 0.752105234190104, + "grad_norm": 0.6703974604606628, + "learning_rate": 6.923124346474915e-06, + "loss": 0.7687, + "step": 13665 + }, + { + "epoch": 0.7521602729924597, + "grad_norm": 0.6937074661254883, + "learning_rate": 6.922724217930531e-06, + "loss": 0.7687, + "step": 13666 + }, + { + "epoch": 0.7522153117948154, + "grad_norm": 0.7919568419456482, + "learning_rate": 6.922324074935466e-06, + "loss": 0.7328, + "step": 13667 + }, + { + "epoch": 0.752270350597171, + "grad_norm": 0.668331503868103, + "learning_rate": 6.9219239174927275e-06, + "loss": 0.7654, + "step": 13668 + }, + { + "epoch": 0.7523253893995266, + "grad_norm": 0.6298941969871521, + "learning_rate": 6.921523745605323e-06, + "loss": 0.719, + "step": 13669 + }, + { + "epoch": 0.7523804282018823, + "grad_norm": 0.6539381146430969, + "learning_rate": 6.921123559276262e-06, + "loss": 0.6681, + "step": 13670 + }, + { + "epoch": 0.752435467004238, + "grad_norm": 1.0692330598831177, + "learning_rate": 6.920723358508548e-06, + "loss": 0.7914, + "step": 13671 + }, + { + "epoch": 0.7524905058065936, + "grad_norm": 0.7410482168197632, + "learning_rate": 6.920323143305193e-06, + "loss": 0.8331, + "step": 13672 + }, + { + "epoch": 0.7525455446089493, + "grad_norm": 0.6976327300071716, + "learning_rate": 6.919922913669203e-06, + "loss": 0.8131, + "step": 13673 + }, + { + "epoch": 0.752600583411305, + "grad_norm": 0.646442174911499, + "learning_rate": 6.919522669603587e-06, + "loss": 0.7658, + "step": 13674 + }, + { + "epoch": 0.7526556222136607, + "grad_norm": 0.6257727146148682, + "learning_rate": 6.919122411111352e-06, + "loss": 0.666, + "step": 13675 + }, + { + "epoch": 0.7527106610160162, + "grad_norm": 0.6913230419158936, + "learning_rate": 6.918722138195506e-06, + "loss": 0.6935, + "step": 13676 + }, + { + "epoch": 0.7527656998183719, + "grad_norm": 0.6282557249069214, + "learning_rate": 6.918321850859059e-06, + "loss": 0.7042, + "step": 13677 + }, + { + "epoch": 0.7528207386207276, + "grad_norm": 0.6980175971984863, + "learning_rate": 6.917921549105018e-06, + "loss": 0.6757, + "step": 13678 + }, + { + "epoch": 0.7528757774230833, + "grad_norm": 0.6954337954521179, + "learning_rate": 6.917521232936393e-06, + "loss": 0.729, + "step": 13679 + }, + { + "epoch": 0.7529308162254389, + "grad_norm": 0.6813758015632629, + "learning_rate": 6.91712090235619e-06, + "loss": 0.6964, + "step": 13680 + }, + { + "epoch": 0.7529858550277946, + "grad_norm": 1.0940780639648438, + "learning_rate": 6.916720557367419e-06, + "loss": 0.7853, + "step": 13681 + }, + { + "epoch": 0.7530408938301503, + "grad_norm": 0.6899382472038269, + "learning_rate": 6.9163201979730906e-06, + "loss": 0.7639, + "step": 13682 + }, + { + "epoch": 0.753095932632506, + "grad_norm": 0.660252034664154, + "learning_rate": 6.915919824176213e-06, + "loss": 0.7068, + "step": 13683 + }, + { + "epoch": 0.7531509714348615, + "grad_norm": 0.6454583406448364, + "learning_rate": 6.915519435979795e-06, + "loss": 0.7268, + "step": 13684 + }, + { + "epoch": 0.7532060102372172, + "grad_norm": 0.7292754650115967, + "learning_rate": 6.915119033386843e-06, + "loss": 0.8131, + "step": 13685 + }, + { + "epoch": 0.7532610490395729, + "grad_norm": 0.6312932372093201, + "learning_rate": 6.914718616400372e-06, + "loss": 0.6977, + "step": 13686 + }, + { + "epoch": 0.7533160878419286, + "grad_norm": 0.8528029322624207, + "learning_rate": 6.914318185023388e-06, + "loss": 0.8403, + "step": 13687 + }, + { + "epoch": 0.7533711266442842, + "grad_norm": 0.758721649646759, + "learning_rate": 6.9139177392589e-06, + "loss": 0.7, + "step": 13688 + }, + { + "epoch": 0.7534261654466399, + "grad_norm": 0.6678142547607422, + "learning_rate": 6.913517279109919e-06, + "loss": 0.6251, + "step": 13689 + }, + { + "epoch": 0.7534812042489956, + "grad_norm": 0.6136146783828735, + "learning_rate": 6.913116804579455e-06, + "loss": 0.653, + "step": 13690 + }, + { + "epoch": 0.7535362430513513, + "grad_norm": 0.7546648383140564, + "learning_rate": 6.912716315670517e-06, + "loss": 0.8202, + "step": 13691 + }, + { + "epoch": 0.7535912818537068, + "grad_norm": 0.7232012152671814, + "learning_rate": 6.912315812386114e-06, + "loss": 0.7993, + "step": 13692 + }, + { + "epoch": 0.7536463206560625, + "grad_norm": 0.7288710474967957, + "learning_rate": 6.911915294729258e-06, + "loss": 0.7702, + "step": 13693 + }, + { + "epoch": 0.7537013594584182, + "grad_norm": 0.6847403049468994, + "learning_rate": 6.9115147627029575e-06, + "loss": 0.8141, + "step": 13694 + }, + { + "epoch": 0.7537563982607739, + "grad_norm": 0.62345951795578, + "learning_rate": 6.9111142163102255e-06, + "loss": 0.6832, + "step": 13695 + }, + { + "epoch": 0.7538114370631295, + "grad_norm": 0.7275232672691345, + "learning_rate": 6.9107136555540695e-06, + "loss": 0.7548, + "step": 13696 + }, + { + "epoch": 0.7538664758654852, + "grad_norm": 0.6724695563316345, + "learning_rate": 6.910313080437501e-06, + "loss": 0.7755, + "step": 13697 + }, + { + "epoch": 0.7539215146678409, + "grad_norm": 0.8446974754333496, + "learning_rate": 6.90991249096353e-06, + "loss": 0.827, + "step": 13698 + }, + { + "epoch": 0.7539765534701965, + "grad_norm": 0.7124913930892944, + "learning_rate": 6.9095118871351705e-06, + "loss": 0.7463, + "step": 13699 + }, + { + "epoch": 0.7540315922725521, + "grad_norm": 0.6916043162345886, + "learning_rate": 6.90911126895543e-06, + "loss": 0.714, + "step": 13700 + }, + { + "epoch": 0.7540866310749078, + "grad_norm": 0.7585330009460449, + "learning_rate": 6.908710636427319e-06, + "loss": 0.6731, + "step": 13701 + }, + { + "epoch": 0.7541416698772635, + "grad_norm": 0.6905520558357239, + "learning_rate": 6.90830998955385e-06, + "loss": 0.726, + "step": 13702 + }, + { + "epoch": 0.7541967086796191, + "grad_norm": 0.7482494115829468, + "learning_rate": 6.907909328338035e-06, + "loss": 0.7269, + "step": 13703 + }, + { + "epoch": 0.7542517474819748, + "grad_norm": 0.7565957307815552, + "learning_rate": 6.907508652782884e-06, + "loss": 0.6959, + "step": 13704 + }, + { + "epoch": 0.7543067862843305, + "grad_norm": 0.7458370923995972, + "learning_rate": 6.9071079628914075e-06, + "loss": 0.7448, + "step": 13705 + }, + { + "epoch": 0.7543618250866861, + "grad_norm": 1.3538293838500977, + "learning_rate": 6.9067072586666185e-06, + "loss": 0.8164, + "step": 13706 + }, + { + "epoch": 0.7544168638890417, + "grad_norm": 0.6217493414878845, + "learning_rate": 6.906306540111528e-06, + "loss": 0.7001, + "step": 13707 + }, + { + "epoch": 0.7544719026913974, + "grad_norm": 0.6862730383872986, + "learning_rate": 6.9059058072291485e-06, + "loss": 0.7921, + "step": 13708 + }, + { + "epoch": 0.7545269414937531, + "grad_norm": 0.6684688925743103, + "learning_rate": 6.905505060022491e-06, + "loss": 0.6736, + "step": 13709 + }, + { + "epoch": 0.7545819802961088, + "grad_norm": 0.6581160426139832, + "learning_rate": 6.905104298494567e-06, + "loss": 0.7581, + "step": 13710 + }, + { + "epoch": 0.7546370190984644, + "grad_norm": 0.7772610783576965, + "learning_rate": 6.9047035226483885e-06, + "loss": 0.7984, + "step": 13711 + }, + { + "epoch": 0.7546920579008201, + "grad_norm": 0.6856822371482849, + "learning_rate": 6.90430273248697e-06, + "loss": 0.8232, + "step": 13712 + }, + { + "epoch": 0.7547470967031757, + "grad_norm": 0.7250725626945496, + "learning_rate": 6.903901928013322e-06, + "loss": 0.7844, + "step": 13713 + }, + { + "epoch": 0.7548021355055314, + "grad_norm": 0.7034164667129517, + "learning_rate": 6.9035011092304545e-06, + "loss": 0.8293, + "step": 13714 + }, + { + "epoch": 0.754857174307887, + "grad_norm": 0.6783095002174377, + "learning_rate": 6.903100276141383e-06, + "loss": 0.6841, + "step": 13715 + }, + { + "epoch": 0.7549122131102427, + "grad_norm": 0.6180121302604675, + "learning_rate": 6.90269942874912e-06, + "loss": 0.7111, + "step": 13716 + }, + { + "epoch": 0.7549672519125984, + "grad_norm": 0.70428466796875, + "learning_rate": 6.902298567056677e-06, + "loss": 0.8758, + "step": 13717 + }, + { + "epoch": 0.7550222907149541, + "grad_norm": 0.8130238652229309, + "learning_rate": 6.9018976910670665e-06, + "loss": 0.6443, + "step": 13718 + }, + { + "epoch": 0.7550773295173097, + "grad_norm": 0.6910800933837891, + "learning_rate": 6.901496800783302e-06, + "loss": 0.7231, + "step": 13719 + }, + { + "epoch": 0.7551323683196653, + "grad_norm": 0.700933575630188, + "learning_rate": 6.901095896208398e-06, + "loss": 0.6785, + "step": 13720 + }, + { + "epoch": 0.755187407122021, + "grad_norm": 0.7407829761505127, + "learning_rate": 6.9006949773453656e-06, + "loss": 0.694, + "step": 13721 + }, + { + "epoch": 0.7552424459243767, + "grad_norm": 0.7907935380935669, + "learning_rate": 6.900294044197218e-06, + "loss": 0.7674, + "step": 13722 + }, + { + "epoch": 0.7552974847267323, + "grad_norm": 0.6585111021995544, + "learning_rate": 6.89989309676697e-06, + "loss": 0.6785, + "step": 13723 + }, + { + "epoch": 0.755352523529088, + "grad_norm": 0.7611724138259888, + "learning_rate": 6.899492135057633e-06, + "loss": 0.8028, + "step": 13724 + }, + { + "epoch": 0.7554075623314437, + "grad_norm": 0.6412070989608765, + "learning_rate": 6.899091159072222e-06, + "loss": 0.7634, + "step": 13725 + }, + { + "epoch": 0.7554626011337994, + "grad_norm": 0.7712366580963135, + "learning_rate": 6.898690168813751e-06, + "loss": 0.8275, + "step": 13726 + }, + { + "epoch": 0.755517639936155, + "grad_norm": 0.6826579570770264, + "learning_rate": 6.898289164285232e-06, + "loss": 0.7949, + "step": 13727 + }, + { + "epoch": 0.7555726787385106, + "grad_norm": 0.7501955628395081, + "learning_rate": 6.897888145489681e-06, + "loss": 0.7846, + "step": 13728 + }, + { + "epoch": 0.7556277175408663, + "grad_norm": 0.6493077874183655, + "learning_rate": 6.8974871124301075e-06, + "loss": 0.7294, + "step": 13729 + }, + { + "epoch": 0.755682756343222, + "grad_norm": 0.6854347586631775, + "learning_rate": 6.897086065109532e-06, + "loss": 0.7121, + "step": 13730 + }, + { + "epoch": 0.7557377951455776, + "grad_norm": 0.7376317977905273, + "learning_rate": 6.896685003530964e-06, + "loss": 0.7719, + "step": 13731 + }, + { + "epoch": 0.7557928339479333, + "grad_norm": 0.8477175235748291, + "learning_rate": 6.89628392769742e-06, + "loss": 0.7981, + "step": 13732 + }, + { + "epoch": 0.755847872750289, + "grad_norm": 0.6611722111701965, + "learning_rate": 6.8958828376119125e-06, + "loss": 0.7628, + "step": 13733 + }, + { + "epoch": 0.7559029115526447, + "grad_norm": 0.6898290514945984, + "learning_rate": 6.895481733277458e-06, + "loss": 0.7578, + "step": 13734 + }, + { + "epoch": 0.7559579503550002, + "grad_norm": 0.6566810607910156, + "learning_rate": 6.89508061469707e-06, + "loss": 0.6919, + "step": 13735 + }, + { + "epoch": 0.7560129891573559, + "grad_norm": 0.6395933032035828, + "learning_rate": 6.894679481873763e-06, + "loss": 0.7334, + "step": 13736 + }, + { + "epoch": 0.7560680279597116, + "grad_norm": 0.7060876488685608, + "learning_rate": 6.8942783348105535e-06, + "loss": 0.7405, + "step": 13737 + }, + { + "epoch": 0.7561230667620673, + "grad_norm": 0.7303228974342346, + "learning_rate": 6.893877173510454e-06, + "loss": 0.8563, + "step": 13738 + }, + { + "epoch": 0.7561781055644229, + "grad_norm": 0.663474977016449, + "learning_rate": 6.893475997976481e-06, + "loss": 0.703, + "step": 13739 + }, + { + "epoch": 0.7562331443667786, + "grad_norm": 0.8005428910255432, + "learning_rate": 6.893074808211649e-06, + "loss": 0.7219, + "step": 13740 + }, + { + "epoch": 0.7562881831691343, + "grad_norm": 1.3285688161849976, + "learning_rate": 6.892673604218972e-06, + "loss": 0.672, + "step": 13741 + }, + { + "epoch": 0.75634322197149, + "grad_norm": 0.6958948373794556, + "learning_rate": 6.892272386001469e-06, + "loss": 0.7728, + "step": 13742 + }, + { + "epoch": 0.7563982607738455, + "grad_norm": 0.6840598583221436, + "learning_rate": 6.891871153562153e-06, + "loss": 0.7881, + "step": 13743 + }, + { + "epoch": 0.7564532995762012, + "grad_norm": 0.7184257507324219, + "learning_rate": 6.891469906904039e-06, + "loss": 0.736, + "step": 13744 + }, + { + "epoch": 0.7565083383785569, + "grad_norm": 0.6611571311950684, + "learning_rate": 6.891068646030143e-06, + "loss": 0.7171, + "step": 13745 + }, + { + "epoch": 0.7565633771809125, + "grad_norm": 0.8237559795379639, + "learning_rate": 6.890667370943482e-06, + "loss": 0.8669, + "step": 13746 + }, + { + "epoch": 0.7566184159832682, + "grad_norm": 0.6898388266563416, + "learning_rate": 6.890266081647072e-06, + "loss": 0.6654, + "step": 13747 + }, + { + "epoch": 0.7566734547856239, + "grad_norm": 0.6541711688041687, + "learning_rate": 6.889864778143928e-06, + "loss": 0.7455, + "step": 13748 + }, + { + "epoch": 0.7567284935879796, + "grad_norm": 0.6518157124519348, + "learning_rate": 6.8894634604370655e-06, + "loss": 0.7174, + "step": 13749 + }, + { + "epoch": 0.7567835323903351, + "grad_norm": 0.7992080450057983, + "learning_rate": 6.889062128529502e-06, + "loss": 0.7349, + "step": 13750 + }, + { + "epoch": 0.7568385711926908, + "grad_norm": 0.5748338103294373, + "learning_rate": 6.888660782424253e-06, + "loss": 0.5398, + "step": 13751 + }, + { + "epoch": 0.7568936099950465, + "grad_norm": 0.6507781744003296, + "learning_rate": 6.8882594221243344e-06, + "loss": 0.6762, + "step": 13752 + }, + { + "epoch": 0.7569486487974022, + "grad_norm": 0.6908432841300964, + "learning_rate": 6.887858047632764e-06, + "loss": 0.8034, + "step": 13753 + }, + { + "epoch": 0.7570036875997578, + "grad_norm": 0.6497751474380493, + "learning_rate": 6.887456658952557e-06, + "loss": 0.6351, + "step": 13754 + }, + { + "epoch": 0.7570587264021135, + "grad_norm": 0.7233273386955261, + "learning_rate": 6.887055256086732e-06, + "loss": 0.7096, + "step": 13755 + }, + { + "epoch": 0.7571137652044692, + "grad_norm": 0.6587454676628113, + "learning_rate": 6.886653839038305e-06, + "loss": 0.7354, + "step": 13756 + }, + { + "epoch": 0.7571688040068248, + "grad_norm": 0.6654310822486877, + "learning_rate": 6.886252407810292e-06, + "loss": 0.7776, + "step": 13757 + }, + { + "epoch": 0.7572238428091804, + "grad_norm": 0.796604573726654, + "learning_rate": 6.885850962405711e-06, + "loss": 0.7925, + "step": 13758 + }, + { + "epoch": 0.7572788816115361, + "grad_norm": 0.7053457498550415, + "learning_rate": 6.8854495028275795e-06, + "loss": 0.7893, + "step": 13759 + }, + { + "epoch": 0.7573339204138918, + "grad_norm": 0.7201200127601624, + "learning_rate": 6.885048029078914e-06, + "loss": 0.8346, + "step": 13760 + }, + { + "epoch": 0.7573889592162475, + "grad_norm": 0.8437653183937073, + "learning_rate": 6.884646541162731e-06, + "loss": 0.7468, + "step": 13761 + }, + { + "epoch": 0.7574439980186031, + "grad_norm": 0.6910028457641602, + "learning_rate": 6.884245039082052e-06, + "loss": 0.7362, + "step": 13762 + }, + { + "epoch": 0.7574990368209588, + "grad_norm": 0.6896274089813232, + "learning_rate": 6.883843522839889e-06, + "loss": 0.6515, + "step": 13763 + }, + { + "epoch": 0.7575540756233144, + "grad_norm": 0.9833560585975647, + "learning_rate": 6.8834419924392636e-06, + "loss": 0.8764, + "step": 13764 + }, + { + "epoch": 0.7576091144256701, + "grad_norm": 0.7130032181739807, + "learning_rate": 6.88304044788319e-06, + "loss": 0.7631, + "step": 13765 + }, + { + "epoch": 0.7576641532280257, + "grad_norm": 0.7059195041656494, + "learning_rate": 6.882638889174691e-06, + "loss": 0.8147, + "step": 13766 + }, + { + "epoch": 0.7577191920303814, + "grad_norm": 0.6451989412307739, + "learning_rate": 6.882237316316781e-06, + "loss": 0.6638, + "step": 13767 + }, + { + "epoch": 0.7577742308327371, + "grad_norm": 0.7541074752807617, + "learning_rate": 6.881835729312481e-06, + "loss": 0.6918, + "step": 13768 + }, + { + "epoch": 0.7578292696350928, + "grad_norm": 0.7227535843849182, + "learning_rate": 6.881434128164805e-06, + "loss": 0.7759, + "step": 13769 + }, + { + "epoch": 0.7578843084374484, + "grad_norm": 0.673112154006958, + "learning_rate": 6.881032512876774e-06, + "loss": 0.7328, + "step": 13770 + }, + { + "epoch": 0.757939347239804, + "grad_norm": 0.6536681056022644, + "learning_rate": 6.880630883451407e-06, + "loss": 0.7677, + "step": 13771 + }, + { + "epoch": 0.7579943860421597, + "grad_norm": 0.8517894148826599, + "learning_rate": 6.880229239891721e-06, + "loss": 0.8566, + "step": 13772 + }, + { + "epoch": 0.7580494248445154, + "grad_norm": 0.8260573148727417, + "learning_rate": 6.879827582200737e-06, + "loss": 0.8228, + "step": 13773 + }, + { + "epoch": 0.758104463646871, + "grad_norm": 0.7460072040557861, + "learning_rate": 6.87942591038147e-06, + "loss": 0.8047, + "step": 13774 + }, + { + "epoch": 0.7581595024492267, + "grad_norm": 0.7648436427116394, + "learning_rate": 6.879024224436942e-06, + "loss": 0.852, + "step": 13775 + }, + { + "epoch": 0.7582145412515824, + "grad_norm": 0.7161253094673157, + "learning_rate": 6.878622524370171e-06, + "loss": 0.7638, + "step": 13776 + }, + { + "epoch": 0.7582695800539381, + "grad_norm": 0.6559579372406006, + "learning_rate": 6.878220810184175e-06, + "loss": 0.6932, + "step": 13777 + }, + { + "epoch": 0.7583246188562937, + "grad_norm": 0.6846898198127747, + "learning_rate": 6.877819081881975e-06, + "loss": 0.7098, + "step": 13778 + }, + { + "epoch": 0.7583796576586493, + "grad_norm": 0.7569675445556641, + "learning_rate": 6.87741733946659e-06, + "loss": 0.687, + "step": 13779 + }, + { + "epoch": 0.758434696461005, + "grad_norm": 0.7513766288757324, + "learning_rate": 6.877015582941038e-06, + "loss": 0.8673, + "step": 13780 + }, + { + "epoch": 0.7584897352633607, + "grad_norm": 0.7158082127571106, + "learning_rate": 6.876613812308338e-06, + "loss": 0.7563, + "step": 13781 + }, + { + "epoch": 0.7585447740657163, + "grad_norm": 0.6307277083396912, + "learning_rate": 6.876212027571513e-06, + "loss": 0.6725, + "step": 13782 + }, + { + "epoch": 0.758599812868072, + "grad_norm": 0.735090434551239, + "learning_rate": 6.87581022873358e-06, + "loss": 0.763, + "step": 13783 + }, + { + "epoch": 0.7586548516704277, + "grad_norm": 0.6412403583526611, + "learning_rate": 6.8754084157975594e-06, + "loss": 0.5992, + "step": 13784 + }, + { + "epoch": 0.7587098904727834, + "grad_norm": 0.639854907989502, + "learning_rate": 6.875006588766472e-06, + "loss": 0.7372, + "step": 13785 + }, + { + "epoch": 0.7587649292751389, + "grad_norm": 0.6855082511901855, + "learning_rate": 6.8746047476433365e-06, + "loss": 0.7709, + "step": 13786 + }, + { + "epoch": 0.7588199680774946, + "grad_norm": 0.6838769912719727, + "learning_rate": 6.874202892431173e-06, + "loss": 0.7545, + "step": 13787 + }, + { + "epoch": 0.7588750068798503, + "grad_norm": 1.1560181379318237, + "learning_rate": 6.873801023133002e-06, + "loss": 0.7291, + "step": 13788 + }, + { + "epoch": 0.7589300456822059, + "grad_norm": 0.7140469551086426, + "learning_rate": 6.873399139751844e-06, + "loss": 0.7214, + "step": 13789 + }, + { + "epoch": 0.7589850844845616, + "grad_norm": 0.6856355667114258, + "learning_rate": 6.8729972422907195e-06, + "loss": 0.7417, + "step": 13790 + }, + { + "epoch": 0.7590401232869173, + "grad_norm": 0.7856155633926392, + "learning_rate": 6.8725953307526505e-06, + "loss": 0.7484, + "step": 13791 + }, + { + "epoch": 0.759095162089273, + "grad_norm": 0.8107255697250366, + "learning_rate": 6.8721934051406555e-06, + "loss": 0.7568, + "step": 13792 + }, + { + "epoch": 0.7591502008916285, + "grad_norm": 0.6590837240219116, + "learning_rate": 6.871791465457757e-06, + "loss": 0.7495, + "step": 13793 + }, + { + "epoch": 0.7592052396939842, + "grad_norm": 0.7531588077545166, + "learning_rate": 6.8713895117069715e-06, + "loss": 0.7434, + "step": 13794 + }, + { + "epoch": 0.7592602784963399, + "grad_norm": 0.6818329095840454, + "learning_rate": 6.870987543891326e-06, + "loss": 0.7128, + "step": 13795 + }, + { + "epoch": 0.7593153172986956, + "grad_norm": 0.6082884669303894, + "learning_rate": 6.8705855620138395e-06, + "loss": 0.7437, + "step": 13796 + }, + { + "epoch": 0.7593703561010512, + "grad_norm": 0.9583787322044373, + "learning_rate": 6.870183566077532e-06, + "loss": 0.7779, + "step": 13797 + }, + { + "epoch": 0.7594253949034069, + "grad_norm": 0.6684621572494507, + "learning_rate": 6.869781556085425e-06, + "loss": 0.5856, + "step": 13798 + }, + { + "epoch": 0.7594804337057626, + "grad_norm": 0.6225603222846985, + "learning_rate": 6.869379532040541e-06, + "loss": 0.7407, + "step": 13799 + }, + { + "epoch": 0.7595354725081183, + "grad_norm": 0.6973103284835815, + "learning_rate": 6.8689774939459005e-06, + "loss": 0.7789, + "step": 13800 + }, + { + "epoch": 0.7595905113104738, + "grad_norm": 0.6655399203300476, + "learning_rate": 6.868575441804526e-06, + "loss": 0.7489, + "step": 13801 + }, + { + "epoch": 0.7596455501128295, + "grad_norm": 0.7066664695739746, + "learning_rate": 6.868173375619437e-06, + "loss": 0.7035, + "step": 13802 + }, + { + "epoch": 0.7597005889151852, + "grad_norm": 1.0646852254867554, + "learning_rate": 6.867771295393658e-06, + "loss": 0.8488, + "step": 13803 + }, + { + "epoch": 0.7597556277175409, + "grad_norm": 0.6551353335380554, + "learning_rate": 6.867369201130209e-06, + "loss": 0.7147, + "step": 13804 + }, + { + "epoch": 0.7598106665198965, + "grad_norm": 0.6749850511550903, + "learning_rate": 6.866967092832115e-06, + "loss": 0.7963, + "step": 13805 + }, + { + "epoch": 0.7598657053222522, + "grad_norm": 0.6704042553901672, + "learning_rate": 6.866564970502394e-06, + "loss": 0.7992, + "step": 13806 + }, + { + "epoch": 0.7599207441246079, + "grad_norm": 0.7027791142463684, + "learning_rate": 6.866162834144071e-06, + "loss": 0.7931, + "step": 13807 + }, + { + "epoch": 0.7599757829269636, + "grad_norm": 0.7925322651863098, + "learning_rate": 6.865760683760169e-06, + "loss": 0.7826, + "step": 13808 + }, + { + "epoch": 0.7600308217293191, + "grad_norm": 0.7152161002159119, + "learning_rate": 6.865358519353708e-06, + "loss": 0.7481, + "step": 13809 + }, + { + "epoch": 0.7600858605316748, + "grad_norm": 0.6572757959365845, + "learning_rate": 6.864956340927711e-06, + "loss": 0.785, + "step": 13810 + }, + { + "epoch": 0.7601408993340305, + "grad_norm": 0.6848406791687012, + "learning_rate": 6.864554148485203e-06, + "loss": 0.6423, + "step": 13811 + }, + { + "epoch": 0.7601959381363862, + "grad_norm": 0.747597873210907, + "learning_rate": 6.864151942029205e-06, + "loss": 0.7901, + "step": 13812 + }, + { + "epoch": 0.7602509769387418, + "grad_norm": 0.7106720805168152, + "learning_rate": 6.863749721562738e-06, + "loss": 0.7488, + "step": 13813 + }, + { + "epoch": 0.7603060157410975, + "grad_norm": 0.6864057779312134, + "learning_rate": 6.8633474870888275e-06, + "loss": 0.7066, + "step": 13814 + }, + { + "epoch": 0.7603610545434532, + "grad_norm": 0.7022056579589844, + "learning_rate": 6.862945238610496e-06, + "loss": 0.6851, + "step": 13815 + }, + { + "epoch": 0.7604160933458088, + "grad_norm": 0.7361913919448853, + "learning_rate": 6.862542976130769e-06, + "loss": 0.7425, + "step": 13816 + }, + { + "epoch": 0.7604711321481644, + "grad_norm": 0.6723676323890686, + "learning_rate": 6.862140699652666e-06, + "loss": 0.7937, + "step": 13817 + }, + { + "epoch": 0.7605261709505201, + "grad_norm": 0.7491924166679382, + "learning_rate": 6.861738409179212e-06, + "loss": 0.7585, + "step": 13818 + }, + { + "epoch": 0.7605812097528758, + "grad_norm": 0.6772211790084839, + "learning_rate": 6.86133610471343e-06, + "loss": 0.7617, + "step": 13819 + }, + { + "epoch": 0.7606362485552315, + "grad_norm": 0.7819864153862, + "learning_rate": 6.860933786258344e-06, + "loss": 0.7924, + "step": 13820 + }, + { + "epoch": 0.7606912873575871, + "grad_norm": 0.6992526650428772, + "learning_rate": 6.86053145381698e-06, + "loss": 0.7054, + "step": 13821 + }, + { + "epoch": 0.7607463261599428, + "grad_norm": 0.7189231514930725, + "learning_rate": 6.860129107392357e-06, + "loss": 0.7603, + "step": 13822 + }, + { + "epoch": 0.7608013649622984, + "grad_norm": 0.7165294885635376, + "learning_rate": 6.859726746987503e-06, + "loss": 0.8118, + "step": 13823 + }, + { + "epoch": 0.7608564037646541, + "grad_norm": 0.6510334014892578, + "learning_rate": 6.85932437260544e-06, + "loss": 0.7584, + "step": 13824 + }, + { + "epoch": 0.7609114425670097, + "grad_norm": 0.7113379836082458, + "learning_rate": 6.8589219842491935e-06, + "loss": 0.7799, + "step": 13825 + }, + { + "epoch": 0.7609664813693654, + "grad_norm": 0.7441100478172302, + "learning_rate": 6.8585195819217856e-06, + "loss": 0.6468, + "step": 13826 + }, + { + "epoch": 0.7610215201717211, + "grad_norm": 1.0703508853912354, + "learning_rate": 6.858117165626244e-06, + "loss": 0.7922, + "step": 13827 + }, + { + "epoch": 0.7610765589740768, + "grad_norm": 0.7097275853157043, + "learning_rate": 6.857714735365589e-06, + "loss": 0.7594, + "step": 13828 + }, + { + "epoch": 0.7611315977764324, + "grad_norm": 0.7001124620437622, + "learning_rate": 6.857312291142848e-06, + "loss": 0.7679, + "step": 13829 + }, + { + "epoch": 0.761186636578788, + "grad_norm": 0.6898123621940613, + "learning_rate": 6.856909832961045e-06, + "loss": 0.7684, + "step": 13830 + }, + { + "epoch": 0.7612416753811437, + "grad_norm": 0.6535243391990662, + "learning_rate": 6.856507360823206e-06, + "loss": 0.6143, + "step": 13831 + }, + { + "epoch": 0.7612967141834993, + "grad_norm": 0.6726056933403015, + "learning_rate": 6.856104874732353e-06, + "loss": 0.7566, + "step": 13832 + }, + { + "epoch": 0.761351752985855, + "grad_norm": 0.8741437196731567, + "learning_rate": 6.855702374691513e-06, + "loss": 0.723, + "step": 13833 + }, + { + "epoch": 0.7614067917882107, + "grad_norm": 0.7025718092918396, + "learning_rate": 6.855299860703712e-06, + "loss": 0.8035, + "step": 13834 + }, + { + "epoch": 0.7614618305905664, + "grad_norm": 1.08286452293396, + "learning_rate": 6.8548973327719726e-06, + "loss": 0.7347, + "step": 13835 + }, + { + "epoch": 0.761516869392922, + "grad_norm": 0.6483243107795715, + "learning_rate": 6.854494790899322e-06, + "loss": 0.7326, + "step": 13836 + }, + { + "epoch": 0.7615719081952776, + "grad_norm": 0.6611089110374451, + "learning_rate": 6.854092235088784e-06, + "loss": 0.7619, + "step": 13837 + }, + { + "epoch": 0.7616269469976333, + "grad_norm": 0.8394322991371155, + "learning_rate": 6.853689665343385e-06, + "loss": 0.7017, + "step": 13838 + }, + { + "epoch": 0.761681985799989, + "grad_norm": 0.7131583094596863, + "learning_rate": 6.853287081666151e-06, + "loss": 0.7367, + "step": 13839 + }, + { + "epoch": 0.7617370246023446, + "grad_norm": 0.7316367626190186, + "learning_rate": 6.852884484060108e-06, + "loss": 0.7323, + "step": 13840 + }, + { + "epoch": 0.7617920634047003, + "grad_norm": 0.7639010548591614, + "learning_rate": 6.852481872528281e-06, + "loss": 0.819, + "step": 13841 + }, + { + "epoch": 0.761847102207056, + "grad_norm": 0.7118390202522278, + "learning_rate": 6.852079247073695e-06, + "loss": 0.7645, + "step": 13842 + }, + { + "epoch": 0.7619021410094117, + "grad_norm": 0.6885393857955933, + "learning_rate": 6.851676607699379e-06, + "loss": 0.8052, + "step": 13843 + }, + { + "epoch": 0.7619571798117672, + "grad_norm": 0.7034374475479126, + "learning_rate": 6.851273954408356e-06, + "loss": 0.8464, + "step": 13844 + }, + { + "epoch": 0.7620122186141229, + "grad_norm": 0.6531803607940674, + "learning_rate": 6.850871287203654e-06, + "loss": 0.7871, + "step": 13845 + }, + { + "epoch": 0.7620672574164786, + "grad_norm": 0.6637283563613892, + "learning_rate": 6.8504686060882995e-06, + "loss": 0.7326, + "step": 13846 + }, + { + "epoch": 0.7621222962188343, + "grad_norm": 0.6467694640159607, + "learning_rate": 6.850065911065318e-06, + "loss": 0.7936, + "step": 13847 + }, + { + "epoch": 0.7621773350211899, + "grad_norm": 0.6829109191894531, + "learning_rate": 6.849663202137735e-06, + "loss": 0.7003, + "step": 13848 + }, + { + "epoch": 0.7622323738235456, + "grad_norm": 0.7321386933326721, + "learning_rate": 6.84926047930858e-06, + "loss": 0.6921, + "step": 13849 + }, + { + "epoch": 0.7622874126259013, + "grad_norm": 0.6900202631950378, + "learning_rate": 6.8488577425808766e-06, + "loss": 0.7496, + "step": 13850 + }, + { + "epoch": 0.762342451428257, + "grad_norm": 0.6304247975349426, + "learning_rate": 6.848454991957655e-06, + "loss": 0.7135, + "step": 13851 + }, + { + "epoch": 0.7623974902306125, + "grad_norm": 0.7087798118591309, + "learning_rate": 6.8480522274419404e-06, + "loss": 0.7032, + "step": 13852 + }, + { + "epoch": 0.7624525290329682, + "grad_norm": 0.7777289152145386, + "learning_rate": 6.84764944903676e-06, + "loss": 0.7345, + "step": 13853 + }, + { + "epoch": 0.7625075678353239, + "grad_norm": 0.7282242774963379, + "learning_rate": 6.847246656745139e-06, + "loss": 0.6408, + "step": 13854 + }, + { + "epoch": 0.7625626066376796, + "grad_norm": 0.7798221707344055, + "learning_rate": 6.846843850570107e-06, + "loss": 0.9058, + "step": 13855 + }, + { + "epoch": 0.7626176454400352, + "grad_norm": 0.6145210266113281, + "learning_rate": 6.846441030514692e-06, + "loss": 0.6331, + "step": 13856 + }, + { + "epoch": 0.7626726842423909, + "grad_norm": 0.7079364061355591, + "learning_rate": 6.846038196581921e-06, + "loss": 0.7511, + "step": 13857 + }, + { + "epoch": 0.7627277230447466, + "grad_norm": 0.733635425567627, + "learning_rate": 6.845635348774821e-06, + "loss": 0.6957, + "step": 13858 + }, + { + "epoch": 0.7627827618471023, + "grad_norm": 0.8099489808082581, + "learning_rate": 6.845232487096419e-06, + "loss": 0.8068, + "step": 13859 + }, + { + "epoch": 0.7628378006494578, + "grad_norm": 0.6241937875747681, + "learning_rate": 6.844829611549744e-06, + "loss": 0.7102, + "step": 13860 + }, + { + "epoch": 0.7628928394518135, + "grad_norm": 0.8009611368179321, + "learning_rate": 6.8444267221378235e-06, + "loss": 0.8369, + "step": 13861 + }, + { + "epoch": 0.7629478782541692, + "grad_norm": 0.6700903177261353, + "learning_rate": 6.844023818863685e-06, + "loss": 0.8075, + "step": 13862 + }, + { + "epoch": 0.7630029170565249, + "grad_norm": 0.9378371834754944, + "learning_rate": 6.843620901730357e-06, + "loss": 0.7539, + "step": 13863 + }, + { + "epoch": 0.7630579558588805, + "grad_norm": 0.6704423427581787, + "learning_rate": 6.843217970740867e-06, + "loss": 0.7285, + "step": 13864 + }, + { + "epoch": 0.7631129946612362, + "grad_norm": 0.7236818075180054, + "learning_rate": 6.842815025898246e-06, + "loss": 0.7223, + "step": 13865 + }, + { + "epoch": 0.7631680334635919, + "grad_norm": 0.676184356212616, + "learning_rate": 6.84241206720552e-06, + "loss": 0.7286, + "step": 13866 + }, + { + "epoch": 0.7632230722659475, + "grad_norm": 0.6443304419517517, + "learning_rate": 6.842009094665717e-06, + "loss": 0.6806, + "step": 13867 + }, + { + "epoch": 0.7632781110683031, + "grad_norm": 0.7931790947914124, + "learning_rate": 6.841606108281868e-06, + "loss": 0.7801, + "step": 13868 + }, + { + "epoch": 0.7633331498706588, + "grad_norm": 0.7440798878669739, + "learning_rate": 6.841203108057e-06, + "loss": 0.8044, + "step": 13869 + }, + { + "epoch": 0.7633881886730145, + "grad_norm": 0.7226675748825073, + "learning_rate": 6.840800093994142e-06, + "loss": 0.718, + "step": 13870 + }, + { + "epoch": 0.7634432274753702, + "grad_norm": 0.7351265549659729, + "learning_rate": 6.8403970660963245e-06, + "loss": 0.8389, + "step": 13871 + }, + { + "epoch": 0.7634982662777258, + "grad_norm": 0.8326215744018555, + "learning_rate": 6.839994024366574e-06, + "loss": 0.8583, + "step": 13872 + }, + { + "epoch": 0.7635533050800815, + "grad_norm": 0.6841259002685547, + "learning_rate": 6.839590968807922e-06, + "loss": 0.7553, + "step": 13873 + }, + { + "epoch": 0.7636083438824371, + "grad_norm": 0.7305078506469727, + "learning_rate": 6.839187899423395e-06, + "loss": 0.7825, + "step": 13874 + }, + { + "epoch": 0.7636633826847927, + "grad_norm": 0.7235193252563477, + "learning_rate": 6.838784816216025e-06, + "loss": 0.7653, + "step": 13875 + }, + { + "epoch": 0.7637184214871484, + "grad_norm": 0.6468761563301086, + "learning_rate": 6.838381719188842e-06, + "loss": 0.6901, + "step": 13876 + }, + { + "epoch": 0.7637734602895041, + "grad_norm": 0.6806310415267944, + "learning_rate": 6.837978608344872e-06, + "loss": 0.6876, + "step": 13877 + }, + { + "epoch": 0.7638284990918598, + "grad_norm": 0.692081093788147, + "learning_rate": 6.837575483687147e-06, + "loss": 0.7506, + "step": 13878 + }, + { + "epoch": 0.7638835378942154, + "grad_norm": 0.6447135806083679, + "learning_rate": 6.837172345218697e-06, + "loss": 0.6841, + "step": 13879 + }, + { + "epoch": 0.7639385766965711, + "grad_norm": 0.7352014183998108, + "learning_rate": 6.8367691929425516e-06, + "loss": 0.8066, + "step": 13880 + }, + { + "epoch": 0.7639936154989267, + "grad_norm": 0.7305072546005249, + "learning_rate": 6.8363660268617405e-06, + "loss": 0.717, + "step": 13881 + }, + { + "epoch": 0.7640486543012824, + "grad_norm": 0.6580411195755005, + "learning_rate": 6.835962846979294e-06, + "loss": 0.7585, + "step": 13882 + }, + { + "epoch": 0.764103693103638, + "grad_norm": 0.7568425536155701, + "learning_rate": 6.835559653298242e-06, + "loss": 0.8273, + "step": 13883 + }, + { + "epoch": 0.7641587319059937, + "grad_norm": 0.8121107816696167, + "learning_rate": 6.835156445821616e-06, + "loss": 0.9064, + "step": 13884 + }, + { + "epoch": 0.7642137707083494, + "grad_norm": 0.6522091031074524, + "learning_rate": 6.834753224552444e-06, + "loss": 0.767, + "step": 13885 + }, + { + "epoch": 0.7642688095107051, + "grad_norm": 1.0779389142990112, + "learning_rate": 6.8343499894937574e-06, + "loss": 0.7702, + "step": 13886 + }, + { + "epoch": 0.7643238483130607, + "grad_norm": 0.6902838349342346, + "learning_rate": 6.833946740648588e-06, + "loss": 0.6529, + "step": 13887 + }, + { + "epoch": 0.7643788871154164, + "grad_norm": 0.692480742931366, + "learning_rate": 6.833543478019966e-06, + "loss": 0.7404, + "step": 13888 + }, + { + "epoch": 0.764433925917772, + "grad_norm": 0.633627712726593, + "learning_rate": 6.833140201610923e-06, + "loss": 0.711, + "step": 13889 + }, + { + "epoch": 0.7644889647201277, + "grad_norm": 0.8653294444084167, + "learning_rate": 6.832736911424487e-06, + "loss": 0.8102, + "step": 13890 + }, + { + "epoch": 0.7645440035224833, + "grad_norm": 0.7864197492599487, + "learning_rate": 6.832333607463692e-06, + "loss": 0.7064, + "step": 13891 + }, + { + "epoch": 0.764599042324839, + "grad_norm": 0.6703711748123169, + "learning_rate": 6.831930289731569e-06, + "loss": 0.7653, + "step": 13892 + }, + { + "epoch": 0.7646540811271947, + "grad_norm": 0.7420178651809692, + "learning_rate": 6.831526958231147e-06, + "loss": 0.8137, + "step": 13893 + }, + { + "epoch": 0.7647091199295504, + "grad_norm": 0.7372543215751648, + "learning_rate": 6.831123612965459e-06, + "loss": 0.6871, + "step": 13894 + }, + { + "epoch": 0.764764158731906, + "grad_norm": 0.77486652135849, + "learning_rate": 6.830720253937536e-06, + "loss": 0.727, + "step": 13895 + }, + { + "epoch": 0.7648191975342616, + "grad_norm": 0.7087406516075134, + "learning_rate": 6.83031688115041e-06, + "loss": 0.7743, + "step": 13896 + }, + { + "epoch": 0.7648742363366173, + "grad_norm": 0.8415336608886719, + "learning_rate": 6.829913494607112e-06, + "loss": 0.774, + "step": 13897 + }, + { + "epoch": 0.764929275138973, + "grad_norm": 0.7736749053001404, + "learning_rate": 6.829510094310674e-06, + "loss": 0.7541, + "step": 13898 + }, + { + "epoch": 0.7649843139413286, + "grad_norm": 0.6749987602233887, + "learning_rate": 6.829106680264128e-06, + "loss": 0.7139, + "step": 13899 + }, + { + "epoch": 0.7650393527436843, + "grad_norm": 0.7079635262489319, + "learning_rate": 6.8287032524705055e-06, + "loss": 0.75, + "step": 13900 + }, + { + "epoch": 0.76509439154604, + "grad_norm": 0.6906388401985168, + "learning_rate": 6.828299810932839e-06, + "loss": 0.6895, + "step": 13901 + }, + { + "epoch": 0.7651494303483957, + "grad_norm": 0.7045881152153015, + "learning_rate": 6.82789635565416e-06, + "loss": 0.8728, + "step": 13902 + }, + { + "epoch": 0.7652044691507512, + "grad_norm": 0.6836426258087158, + "learning_rate": 6.827492886637501e-06, + "loss": 0.7315, + "step": 13903 + }, + { + "epoch": 0.7652595079531069, + "grad_norm": 0.6467520594596863, + "learning_rate": 6.827089403885896e-06, + "loss": 0.7556, + "step": 13904 + }, + { + "epoch": 0.7653145467554626, + "grad_norm": 0.7118285894393921, + "learning_rate": 6.826685907402376e-06, + "loss": 0.8686, + "step": 13905 + }, + { + "epoch": 0.7653695855578183, + "grad_norm": 0.6093236207962036, + "learning_rate": 6.826282397189974e-06, + "loss": 0.7066, + "step": 13906 + }, + { + "epoch": 0.7654246243601739, + "grad_norm": 0.6839649677276611, + "learning_rate": 6.825878873251721e-06, + "loss": 0.7025, + "step": 13907 + }, + { + "epoch": 0.7654796631625296, + "grad_norm": 0.7582715153694153, + "learning_rate": 6.825475335590652e-06, + "loss": 0.7301, + "step": 13908 + }, + { + "epoch": 0.7655347019648853, + "grad_norm": 0.6580978631973267, + "learning_rate": 6.8250717842098e-06, + "loss": 0.6771, + "step": 13909 + }, + { + "epoch": 0.765589740767241, + "grad_norm": 0.6754937171936035, + "learning_rate": 6.824668219112195e-06, + "loss": 0.7446, + "step": 13910 + }, + { + "epoch": 0.7656447795695965, + "grad_norm": 0.7541018724441528, + "learning_rate": 6.8242646403008725e-06, + "loss": 0.802, + "step": 13911 + }, + { + "epoch": 0.7656998183719522, + "grad_norm": 0.6714808344841003, + "learning_rate": 6.823861047778866e-06, + "loss": 0.7334, + "step": 13912 + }, + { + "epoch": 0.7657548571743079, + "grad_norm": 0.6972425580024719, + "learning_rate": 6.823457441549209e-06, + "loss": 0.7859, + "step": 13913 + }, + { + "epoch": 0.7658098959766636, + "grad_norm": 0.6660878658294678, + "learning_rate": 6.823053821614931e-06, + "loss": 0.6594, + "step": 13914 + }, + { + "epoch": 0.7658649347790192, + "grad_norm": 0.7392181158065796, + "learning_rate": 6.82265018797907e-06, + "loss": 0.6667, + "step": 13915 + }, + { + "epoch": 0.7659199735813749, + "grad_norm": 0.7601449489593506, + "learning_rate": 6.822246540644659e-06, + "loss": 0.7349, + "step": 13916 + }, + { + "epoch": 0.7659750123837306, + "grad_norm": 0.6648421287536621, + "learning_rate": 6.821842879614731e-06, + "loss": 0.7597, + "step": 13917 + }, + { + "epoch": 0.7660300511860861, + "grad_norm": 0.6369950175285339, + "learning_rate": 6.821439204892317e-06, + "loss": 0.7452, + "step": 13918 + }, + { + "epoch": 0.7660850899884418, + "grad_norm": 0.747653603553772, + "learning_rate": 6.821035516480457e-06, + "loss": 0.693, + "step": 13919 + }, + { + "epoch": 0.7661401287907975, + "grad_norm": 0.6450137495994568, + "learning_rate": 6.8206318143821795e-06, + "loss": 0.6492, + "step": 13920 + }, + { + "epoch": 0.7661951675931532, + "grad_norm": 0.707801878452301, + "learning_rate": 6.8202280986005205e-06, + "loss": 0.7284, + "step": 13921 + }, + { + "epoch": 0.7662502063955088, + "grad_norm": 0.7191962003707886, + "learning_rate": 6.8198243691385146e-06, + "loss": 0.7714, + "step": 13922 + }, + { + "epoch": 0.7663052451978645, + "grad_norm": 0.7477172613143921, + "learning_rate": 6.819420625999196e-06, + "loss": 0.7076, + "step": 13923 + }, + { + "epoch": 0.7663602840002202, + "grad_norm": 0.6221175193786621, + "learning_rate": 6.819016869185599e-06, + "loss": 0.6848, + "step": 13924 + }, + { + "epoch": 0.7664153228025758, + "grad_norm": 0.7840436697006226, + "learning_rate": 6.818613098700758e-06, + "loss": 0.7028, + "step": 13925 + }, + { + "epoch": 0.7664703616049314, + "grad_norm": 0.7147907018661499, + "learning_rate": 6.818209314547707e-06, + "loss": 0.7242, + "step": 13926 + }, + { + "epoch": 0.7665254004072871, + "grad_norm": 0.6627985835075378, + "learning_rate": 6.817805516729482e-06, + "loss": 0.7177, + "step": 13927 + }, + { + "epoch": 0.7665804392096428, + "grad_norm": 0.8019070625305176, + "learning_rate": 6.817401705249118e-06, + "loss": 0.6594, + "step": 13928 + }, + { + "epoch": 0.7666354780119985, + "grad_norm": 0.7127207517623901, + "learning_rate": 6.816997880109649e-06, + "loss": 0.8282, + "step": 13929 + }, + { + "epoch": 0.7666905168143541, + "grad_norm": 0.7335825562477112, + "learning_rate": 6.816594041314111e-06, + "loss": 0.7593, + "step": 13930 + }, + { + "epoch": 0.7667455556167098, + "grad_norm": 0.6878668069839478, + "learning_rate": 6.816190188865538e-06, + "loss": 0.7898, + "step": 13931 + }, + { + "epoch": 0.7668005944190655, + "grad_norm": 0.6441968679428101, + "learning_rate": 6.815786322766965e-06, + "loss": 0.6795, + "step": 13932 + }, + { + "epoch": 0.7668556332214211, + "grad_norm": 0.6503410339355469, + "learning_rate": 6.815382443021429e-06, + "loss": 0.753, + "step": 13933 + }, + { + "epoch": 0.7669106720237767, + "grad_norm": 0.6734908223152161, + "learning_rate": 6.8149785496319645e-06, + "loss": 0.7145, + "step": 13934 + }, + { + "epoch": 0.7669657108261324, + "grad_norm": 0.8363823890686035, + "learning_rate": 6.814574642601606e-06, + "loss": 0.8499, + "step": 13935 + }, + { + "epoch": 0.7670207496284881, + "grad_norm": 0.6986021995544434, + "learning_rate": 6.81417072193339e-06, + "loss": 0.7101, + "step": 13936 + }, + { + "epoch": 0.7670757884308438, + "grad_norm": 0.9656592011451721, + "learning_rate": 6.813766787630354e-06, + "loss": 0.7841, + "step": 13937 + }, + { + "epoch": 0.7671308272331994, + "grad_norm": 0.6830777525901794, + "learning_rate": 6.813362839695532e-06, + "loss": 0.7443, + "step": 13938 + }, + { + "epoch": 0.767185866035555, + "grad_norm": 0.6358513236045837, + "learning_rate": 6.812958878131959e-06, + "loss": 0.7017, + "step": 13939 + }, + { + "epoch": 0.7672409048379107, + "grad_norm": 0.9075862169265747, + "learning_rate": 6.812554902942673e-06, + "loss": 0.6991, + "step": 13940 + }, + { + "epoch": 0.7672959436402664, + "grad_norm": 0.7004347443580627, + "learning_rate": 6.812150914130709e-06, + "loss": 0.6519, + "step": 13941 + }, + { + "epoch": 0.767350982442622, + "grad_norm": 0.6648300886154175, + "learning_rate": 6.811746911699105e-06, + "loss": 0.7044, + "step": 13942 + }, + { + "epoch": 0.7674060212449777, + "grad_norm": 0.7050208449363708, + "learning_rate": 6.811342895650896e-06, + "loss": 0.78, + "step": 13943 + }, + { + "epoch": 0.7674610600473334, + "grad_norm": 0.6387132406234741, + "learning_rate": 6.810938865989119e-06, + "loss": 0.6062, + "step": 13944 + }, + { + "epoch": 0.7675160988496891, + "grad_norm": 0.6441114544868469, + "learning_rate": 6.81053482271681e-06, + "loss": 0.7252, + "step": 13945 + }, + { + "epoch": 0.7675711376520447, + "grad_norm": 0.7309751510620117, + "learning_rate": 6.810130765837006e-06, + "loss": 0.6407, + "step": 13946 + }, + { + "epoch": 0.7676261764544003, + "grad_norm": 0.7132161259651184, + "learning_rate": 6.809726695352742e-06, + "loss": 0.8341, + "step": 13947 + }, + { + "epoch": 0.767681215256756, + "grad_norm": 0.7214738726615906, + "learning_rate": 6.809322611267058e-06, + "loss": 0.8357, + "step": 13948 + }, + { + "epoch": 0.7677362540591117, + "grad_norm": 0.6410175561904907, + "learning_rate": 6.80891851358299e-06, + "loss": 0.6718, + "step": 13949 + }, + { + "epoch": 0.7677912928614673, + "grad_norm": 0.8888845443725586, + "learning_rate": 6.8085144023035745e-06, + "loss": 0.7823, + "step": 13950 + }, + { + "epoch": 0.767846331663823, + "grad_norm": 0.7327878475189209, + "learning_rate": 6.808110277431848e-06, + "loss": 0.7083, + "step": 13951 + }, + { + "epoch": 0.7679013704661787, + "grad_norm": 0.6871985793113708, + "learning_rate": 6.807706138970849e-06, + "loss": 0.7808, + "step": 13952 + }, + { + "epoch": 0.7679564092685344, + "grad_norm": 0.6939501762390137, + "learning_rate": 6.8073019869236134e-06, + "loss": 0.693, + "step": 13953 + }, + { + "epoch": 0.76801144807089, + "grad_norm": 0.7377064824104309, + "learning_rate": 6.8068978212931814e-06, + "loss": 0.9322, + "step": 13954 + }, + { + "epoch": 0.7680664868732456, + "grad_norm": 0.8165044188499451, + "learning_rate": 6.80649364208259e-06, + "loss": 0.6846, + "step": 13955 + }, + { + "epoch": 0.7681215256756013, + "grad_norm": 0.6774152517318726, + "learning_rate": 6.806089449294875e-06, + "loss": 0.8503, + "step": 13956 + }, + { + "epoch": 0.768176564477957, + "grad_norm": 0.7773441076278687, + "learning_rate": 6.805685242933074e-06, + "loss": 0.8775, + "step": 13957 + }, + { + "epoch": 0.7682316032803126, + "grad_norm": 0.6710473895072937, + "learning_rate": 6.805281023000227e-06, + "loss": 0.7831, + "step": 13958 + }, + { + "epoch": 0.7682866420826683, + "grad_norm": 0.6163424849510193, + "learning_rate": 6.80487678949937e-06, + "loss": 0.7309, + "step": 13959 + }, + { + "epoch": 0.768341680885024, + "grad_norm": 0.6851963400840759, + "learning_rate": 6.804472542433543e-06, + "loss": 0.6556, + "step": 13960 + }, + { + "epoch": 0.7683967196873795, + "grad_norm": 0.6881004571914673, + "learning_rate": 6.804068281805784e-06, + "loss": 0.7115, + "step": 13961 + }, + { + "epoch": 0.7684517584897352, + "grad_norm": 0.7372351884841919, + "learning_rate": 6.8036640076191304e-06, + "loss": 0.7869, + "step": 13962 + }, + { + "epoch": 0.7685067972920909, + "grad_norm": 0.7900989055633545, + "learning_rate": 6.8032597198766205e-06, + "loss": 0.7419, + "step": 13963 + }, + { + "epoch": 0.7685618360944466, + "grad_norm": 0.7245132327079773, + "learning_rate": 6.802855418581294e-06, + "loss": 0.8175, + "step": 13964 + }, + { + "epoch": 0.7686168748968022, + "grad_norm": 0.6681550741195679, + "learning_rate": 6.802451103736188e-06, + "loss": 0.773, + "step": 13965 + }, + { + "epoch": 0.7686719136991579, + "grad_norm": 0.6316970586776733, + "learning_rate": 6.802046775344343e-06, + "loss": 0.6597, + "step": 13966 + }, + { + "epoch": 0.7687269525015136, + "grad_norm": 0.7201604843139648, + "learning_rate": 6.801642433408796e-06, + "loss": 0.7205, + "step": 13967 + }, + { + "epoch": 0.7687819913038693, + "grad_norm": 0.6226171851158142, + "learning_rate": 6.801238077932587e-06, + "loss": 0.7271, + "step": 13968 + }, + { + "epoch": 0.7688370301062248, + "grad_norm": 0.833369255065918, + "learning_rate": 6.800833708918755e-06, + "loss": 0.7731, + "step": 13969 + }, + { + "epoch": 0.7688920689085805, + "grad_norm": 0.7280329465866089, + "learning_rate": 6.800429326370339e-06, + "loss": 0.7833, + "step": 13970 + }, + { + "epoch": 0.7689471077109362, + "grad_norm": 0.7581672072410583, + "learning_rate": 6.800024930290376e-06, + "loss": 0.8008, + "step": 13971 + }, + { + "epoch": 0.7690021465132919, + "grad_norm": 0.7931516170501709, + "learning_rate": 6.79962052068191e-06, + "loss": 0.8884, + "step": 13972 + }, + { + "epoch": 0.7690571853156475, + "grad_norm": 0.8455879092216492, + "learning_rate": 6.799216097547977e-06, + "loss": 0.8109, + "step": 13973 + }, + { + "epoch": 0.7691122241180032, + "grad_norm": 0.687336266040802, + "learning_rate": 6.798811660891618e-06, + "loss": 0.783, + "step": 13974 + }, + { + "epoch": 0.7691672629203589, + "grad_norm": 0.7661089897155762, + "learning_rate": 6.7984072107158696e-06, + "loss": 0.8448, + "step": 13975 + }, + { + "epoch": 0.7692223017227146, + "grad_norm": 0.6965043544769287, + "learning_rate": 6.798002747023776e-06, + "loss": 0.7421, + "step": 13976 + }, + { + "epoch": 0.7692773405250701, + "grad_norm": 0.7373656630516052, + "learning_rate": 6.797598269818375e-06, + "loss": 0.7093, + "step": 13977 + }, + { + "epoch": 0.7693323793274258, + "grad_norm": 0.6387331485748291, + "learning_rate": 6.7971937791027064e-06, + "loss": 0.7811, + "step": 13978 + }, + { + "epoch": 0.7693874181297815, + "grad_norm": 0.7566075325012207, + "learning_rate": 6.796789274879811e-06, + "loss": 0.8245, + "step": 13979 + }, + { + "epoch": 0.7694424569321372, + "grad_norm": 0.7035738229751587, + "learning_rate": 6.796384757152729e-06, + "loss": 0.7674, + "step": 13980 + }, + { + "epoch": 0.7694974957344928, + "grad_norm": 0.8265605568885803, + "learning_rate": 6.795980225924499e-06, + "loss": 0.7755, + "step": 13981 + }, + { + "epoch": 0.7695525345368485, + "grad_norm": 0.709454357624054, + "learning_rate": 6.7955756811981625e-06, + "loss": 0.8651, + "step": 13982 + }, + { + "epoch": 0.7696075733392042, + "grad_norm": 0.7075764536857605, + "learning_rate": 6.795171122976758e-06, + "loss": 0.7371, + "step": 13983 + }, + { + "epoch": 0.7696626121415598, + "grad_norm": 0.7027561664581299, + "learning_rate": 6.79476655126333e-06, + "loss": 0.7763, + "step": 13984 + }, + { + "epoch": 0.7697176509439154, + "grad_norm": 0.7922375202178955, + "learning_rate": 6.794361966060916e-06, + "loss": 0.7677, + "step": 13985 + }, + { + "epoch": 0.7697726897462711, + "grad_norm": 0.7185537219047546, + "learning_rate": 6.793957367372559e-06, + "loss": 0.7229, + "step": 13986 + }, + { + "epoch": 0.7698277285486268, + "grad_norm": 0.7173545956611633, + "learning_rate": 6.793552755201297e-06, + "loss": 0.7508, + "step": 13987 + }, + { + "epoch": 0.7698827673509825, + "grad_norm": 0.7743139863014221, + "learning_rate": 6.793148129550175e-06, + "loss": 0.7305, + "step": 13988 + }, + { + "epoch": 0.7699378061533381, + "grad_norm": 0.7992164492607117, + "learning_rate": 6.792743490422229e-06, + "loss": 0.7212, + "step": 13989 + }, + { + "epoch": 0.7699928449556938, + "grad_norm": 0.7437503337860107, + "learning_rate": 6.792338837820504e-06, + "loss": 0.6396, + "step": 13990 + }, + { + "epoch": 0.7700478837580494, + "grad_norm": 0.6908634305000305, + "learning_rate": 6.79193417174804e-06, + "loss": 0.7279, + "step": 13991 + }, + { + "epoch": 0.7701029225604051, + "grad_norm": 0.6894391775131226, + "learning_rate": 6.7915294922078805e-06, + "loss": 0.7615, + "step": 13992 + }, + { + "epoch": 0.7701579613627607, + "grad_norm": 0.7162172794342041, + "learning_rate": 6.791124799203062e-06, + "loss": 0.7404, + "step": 13993 + }, + { + "epoch": 0.7702130001651164, + "grad_norm": 0.6469258069992065, + "learning_rate": 6.79072009273663e-06, + "loss": 0.7035, + "step": 13994 + }, + { + "epoch": 0.7702680389674721, + "grad_norm": 0.6456457376480103, + "learning_rate": 6.790315372811625e-06, + "loss": 0.708, + "step": 13995 + }, + { + "epoch": 0.7703230777698278, + "grad_norm": 0.7880644798278809, + "learning_rate": 6.789910639431089e-06, + "loss": 0.7723, + "step": 13996 + }, + { + "epoch": 0.7703781165721834, + "grad_norm": 0.7847834229469299, + "learning_rate": 6.789505892598063e-06, + "loss": 0.8585, + "step": 13997 + }, + { + "epoch": 0.770433155374539, + "grad_norm": 0.6909215450286865, + "learning_rate": 6.789101132315591e-06, + "loss": 0.7107, + "step": 13998 + }, + { + "epoch": 0.7704881941768947, + "grad_norm": 0.7883939146995544, + "learning_rate": 6.788696358586713e-06, + "loss": 0.7575, + "step": 13999 + }, + { + "epoch": 0.7705432329792504, + "grad_norm": 0.6629998087882996, + "learning_rate": 6.788291571414472e-06, + "loss": 0.7273, + "step": 14000 + }, + { + "epoch": 0.770598271781606, + "grad_norm": 0.7548647522926331, + "learning_rate": 6.7878867708019106e-06, + "loss": 0.8214, + "step": 14001 + }, + { + "epoch": 0.7706533105839617, + "grad_norm": 0.6721330881118774, + "learning_rate": 6.78748195675207e-06, + "loss": 0.7153, + "step": 14002 + }, + { + "epoch": 0.7707083493863174, + "grad_norm": 0.6921262145042419, + "learning_rate": 6.787077129267994e-06, + "loss": 0.7099, + "step": 14003 + }, + { + "epoch": 0.770763388188673, + "grad_norm": 0.956937849521637, + "learning_rate": 6.786672288352725e-06, + "loss": 0.6765, + "step": 14004 + }, + { + "epoch": 0.7708184269910286, + "grad_norm": 0.7265778183937073, + "learning_rate": 6.786267434009306e-06, + "loss": 0.7653, + "step": 14005 + }, + { + "epoch": 0.7708734657933843, + "grad_norm": 0.7429845929145813, + "learning_rate": 6.785862566240778e-06, + "loss": 0.8064, + "step": 14006 + }, + { + "epoch": 0.77092850459574, + "grad_norm": 0.7437632083892822, + "learning_rate": 6.785457685050184e-06, + "loss": 0.7138, + "step": 14007 + }, + { + "epoch": 0.7709835433980956, + "grad_norm": 0.7218232750892639, + "learning_rate": 6.7850527904405695e-06, + "loss": 0.7785, + "step": 14008 + }, + { + "epoch": 0.7710385822004513, + "grad_norm": 0.7131973505020142, + "learning_rate": 6.784647882414977e-06, + "loss": 0.7651, + "step": 14009 + }, + { + "epoch": 0.771093621002807, + "grad_norm": 0.739919126033783, + "learning_rate": 6.784242960976447e-06, + "loss": 0.7993, + "step": 14010 + }, + { + "epoch": 0.7711486598051627, + "grad_norm": 0.6655608415603638, + "learning_rate": 6.783838026128025e-06, + "loss": 0.7394, + "step": 14011 + }, + { + "epoch": 0.7712036986075183, + "grad_norm": 0.9327310919761658, + "learning_rate": 6.783433077872753e-06, + "loss": 0.8737, + "step": 14012 + }, + { + "epoch": 0.7712587374098739, + "grad_norm": 0.5928294062614441, + "learning_rate": 6.783028116213677e-06, + "loss": 0.5819, + "step": 14013 + }, + { + "epoch": 0.7713137762122296, + "grad_norm": 0.6752136945724487, + "learning_rate": 6.782623141153838e-06, + "loss": 0.8021, + "step": 14014 + }, + { + "epoch": 0.7713688150145853, + "grad_norm": 0.6452222466468811, + "learning_rate": 6.78221815269628e-06, + "loss": 0.7806, + "step": 14015 + }, + { + "epoch": 0.7714238538169409, + "grad_norm": 0.7725237607955933, + "learning_rate": 6.78181315084405e-06, + "loss": 0.7679, + "step": 14016 + }, + { + "epoch": 0.7714788926192966, + "grad_norm": 0.6594743728637695, + "learning_rate": 6.781408135600187e-06, + "loss": 0.7254, + "step": 14017 + }, + { + "epoch": 0.7715339314216523, + "grad_norm": 0.7008917927742004, + "learning_rate": 6.7810031069677385e-06, + "loss": 0.705, + "step": 14018 + }, + { + "epoch": 0.771588970224008, + "grad_norm": 0.9435684084892273, + "learning_rate": 6.780598064949746e-06, + "loss": 0.7787, + "step": 14019 + }, + { + "epoch": 0.7716440090263635, + "grad_norm": 0.6615981459617615, + "learning_rate": 6.780193009549256e-06, + "loss": 0.7592, + "step": 14020 + }, + { + "epoch": 0.7716990478287192, + "grad_norm": 0.7042600512504578, + "learning_rate": 6.7797879407693115e-06, + "loss": 0.719, + "step": 14021 + }, + { + "epoch": 0.7717540866310749, + "grad_norm": 0.7135425209999084, + "learning_rate": 6.779382858612957e-06, + "loss": 0.739, + "step": 14022 + }, + { + "epoch": 0.7718091254334306, + "grad_norm": 0.6546016931533813, + "learning_rate": 6.778977763083238e-06, + "loss": 0.7039, + "step": 14023 + }, + { + "epoch": 0.7718641642357862, + "grad_norm": 0.8549250960350037, + "learning_rate": 6.778572654183198e-06, + "loss": 0.8384, + "step": 14024 + }, + { + "epoch": 0.7719192030381419, + "grad_norm": 0.7008731365203857, + "learning_rate": 6.778167531915882e-06, + "loss": 0.776, + "step": 14025 + }, + { + "epoch": 0.7719742418404976, + "grad_norm": 0.7047393321990967, + "learning_rate": 6.7777623962843355e-06, + "loss": 0.819, + "step": 14026 + }, + { + "epoch": 0.7720292806428533, + "grad_norm": 0.7015580534934998, + "learning_rate": 6.777357247291601e-06, + "loss": 0.8339, + "step": 14027 + }, + { + "epoch": 0.7720843194452088, + "grad_norm": 0.7008551955223083, + "learning_rate": 6.776952084940727e-06, + "loss": 0.783, + "step": 14028 + }, + { + "epoch": 0.7721393582475645, + "grad_norm": 1.0310637950897217, + "learning_rate": 6.776546909234757e-06, + "loss": 0.7447, + "step": 14029 + }, + { + "epoch": 0.7721943970499202, + "grad_norm": 0.6264338493347168, + "learning_rate": 6.776141720176734e-06, + "loss": 0.5542, + "step": 14030 + }, + { + "epoch": 0.7722494358522759, + "grad_norm": 0.6249508261680603, + "learning_rate": 6.775736517769707e-06, + "loss": 0.6514, + "step": 14031 + }, + { + "epoch": 0.7723044746546315, + "grad_norm": 0.6741732954978943, + "learning_rate": 6.775331302016719e-06, + "loss": 0.6967, + "step": 14032 + }, + { + "epoch": 0.7723595134569872, + "grad_norm": 0.7342913746833801, + "learning_rate": 6.774926072920815e-06, + "loss": 0.8279, + "step": 14033 + }, + { + "epoch": 0.7724145522593429, + "grad_norm": 0.7702916264533997, + "learning_rate": 6.774520830485044e-06, + "loss": 0.8539, + "step": 14034 + }, + { + "epoch": 0.7724695910616985, + "grad_norm": 0.7873550057411194, + "learning_rate": 6.774115574712448e-06, + "loss": 0.6999, + "step": 14035 + }, + { + "epoch": 0.7725246298640541, + "grad_norm": 0.6832353472709656, + "learning_rate": 6.773710305606074e-06, + "loss": 0.7246, + "step": 14036 + }, + { + "epoch": 0.7725796686664098, + "grad_norm": 0.7547367215156555, + "learning_rate": 6.773305023168969e-06, + "loss": 0.7357, + "step": 14037 + }, + { + "epoch": 0.7726347074687655, + "grad_norm": 0.7146826386451721, + "learning_rate": 6.772899727404178e-06, + "loss": 0.6742, + "step": 14038 + }, + { + "epoch": 0.7726897462711212, + "grad_norm": 0.7623558640480042, + "learning_rate": 6.772494418314748e-06, + "loss": 0.7729, + "step": 14039 + }, + { + "epoch": 0.7727447850734768, + "grad_norm": 0.637706458568573, + "learning_rate": 6.772089095903723e-06, + "loss": 0.6662, + "step": 14040 + }, + { + "epoch": 0.7727998238758325, + "grad_norm": 0.7293589115142822, + "learning_rate": 6.771683760174151e-06, + "loss": 0.7899, + "step": 14041 + }, + { + "epoch": 0.7728548626781881, + "grad_norm": 0.7191390991210938, + "learning_rate": 6.771278411129079e-06, + "loss": 0.6912, + "step": 14042 + }, + { + "epoch": 0.7729099014805438, + "grad_norm": 0.8264575004577637, + "learning_rate": 6.770873048771552e-06, + "loss": 0.7027, + "step": 14043 + }, + { + "epoch": 0.7729649402828994, + "grad_norm": 0.7490931749343872, + "learning_rate": 6.770467673104617e-06, + "loss": 0.6917, + "step": 14044 + }, + { + "epoch": 0.7730199790852551, + "grad_norm": 0.6901552081108093, + "learning_rate": 6.77006228413132e-06, + "loss": 0.8097, + "step": 14045 + }, + { + "epoch": 0.7730750178876108, + "grad_norm": 0.6340280175209045, + "learning_rate": 6.76965688185471e-06, + "loss": 0.6309, + "step": 14046 + }, + { + "epoch": 0.7731300566899664, + "grad_norm": 0.6807279586791992, + "learning_rate": 6.7692514662778315e-06, + "loss": 0.7744, + "step": 14047 + }, + { + "epoch": 0.7731850954923221, + "grad_norm": 1.2796865701675415, + "learning_rate": 6.7688460374037335e-06, + "loss": 0.7499, + "step": 14048 + }, + { + "epoch": 0.7732401342946778, + "grad_norm": 0.7059674263000488, + "learning_rate": 6.768440595235463e-06, + "loss": 0.8705, + "step": 14049 + }, + { + "epoch": 0.7732951730970334, + "grad_norm": 0.7626641392707825, + "learning_rate": 6.768035139776066e-06, + "loss": 0.8448, + "step": 14050 + }, + { + "epoch": 0.773350211899389, + "grad_norm": 0.6590229868888855, + "learning_rate": 6.767629671028588e-06, + "loss": 0.6796, + "step": 14051 + }, + { + "epoch": 0.7734052507017447, + "grad_norm": 0.6702030301094055, + "learning_rate": 6.767224188996081e-06, + "loss": 0.7087, + "step": 14052 + }, + { + "epoch": 0.7734602895041004, + "grad_norm": 0.670612096786499, + "learning_rate": 6.76681869368159e-06, + "loss": 0.7203, + "step": 14053 + }, + { + "epoch": 0.7735153283064561, + "grad_norm": 0.6892215013504028, + "learning_rate": 6.766413185088161e-06, + "loss": 0.6891, + "step": 14054 + }, + { + "epoch": 0.7735703671088117, + "grad_norm": 0.8354474902153015, + "learning_rate": 6.766007663218843e-06, + "loss": 0.7378, + "step": 14055 + }, + { + "epoch": 0.7736254059111674, + "grad_norm": 0.7633876204490662, + "learning_rate": 6.765602128076686e-06, + "loss": 0.6916, + "step": 14056 + }, + { + "epoch": 0.773680444713523, + "grad_norm": 0.7249060869216919, + "learning_rate": 6.765196579664736e-06, + "loss": 0.791, + "step": 14057 + }, + { + "epoch": 0.7737354835158787, + "grad_norm": 0.7033042311668396, + "learning_rate": 6.7647910179860395e-06, + "loss": 0.6799, + "step": 14058 + }, + { + "epoch": 0.7737905223182343, + "grad_norm": 0.7087684273719788, + "learning_rate": 6.7643854430436466e-06, + "loss": 0.6389, + "step": 14059 + }, + { + "epoch": 0.77384556112059, + "grad_norm": 0.6433978080749512, + "learning_rate": 6.763979854840606e-06, + "loss": 0.7214, + "step": 14060 + }, + { + "epoch": 0.7739005999229457, + "grad_norm": 0.7777101993560791, + "learning_rate": 6.763574253379964e-06, + "loss": 0.7458, + "step": 14061 + }, + { + "epoch": 0.7739556387253014, + "grad_norm": 0.7065346240997314, + "learning_rate": 6.763168638664771e-06, + "loss": 0.7663, + "step": 14062 + }, + { + "epoch": 0.774010677527657, + "grad_norm": 0.7136278748512268, + "learning_rate": 6.762763010698074e-06, + "loss": 0.667, + "step": 14063 + }, + { + "epoch": 0.7740657163300126, + "grad_norm": 0.6670508980751038, + "learning_rate": 6.762357369482921e-06, + "loss": 0.7462, + "step": 14064 + }, + { + "epoch": 0.7741207551323683, + "grad_norm": 0.6366799473762512, + "learning_rate": 6.7619517150223635e-06, + "loss": 0.7147, + "step": 14065 + }, + { + "epoch": 0.774175793934724, + "grad_norm": 0.5999431610107422, + "learning_rate": 6.761546047319447e-06, + "loss": 0.667, + "step": 14066 + }, + { + "epoch": 0.7742308327370796, + "grad_norm": 0.6751196980476379, + "learning_rate": 6.761140366377222e-06, + "loss": 0.7255, + "step": 14067 + }, + { + "epoch": 0.7742858715394353, + "grad_norm": 0.6786272525787354, + "learning_rate": 6.760734672198738e-06, + "loss": 0.7694, + "step": 14068 + }, + { + "epoch": 0.774340910341791, + "grad_norm": 0.6915947794914246, + "learning_rate": 6.760328964787044e-06, + "loss": 0.7955, + "step": 14069 + }, + { + "epoch": 0.7743959491441467, + "grad_norm": 0.7041972279548645, + "learning_rate": 6.759923244145188e-06, + "loss": 0.6542, + "step": 14070 + }, + { + "epoch": 0.7744509879465022, + "grad_norm": 0.6384761333465576, + "learning_rate": 6.759517510276221e-06, + "loss": 0.7384, + "step": 14071 + }, + { + "epoch": 0.7745060267488579, + "grad_norm": 0.7430800199508667, + "learning_rate": 6.759111763183189e-06, + "loss": 0.7587, + "step": 14072 + }, + { + "epoch": 0.7745610655512136, + "grad_norm": 0.6568213701248169, + "learning_rate": 6.758706002869146e-06, + "loss": 0.7118, + "step": 14073 + }, + { + "epoch": 0.7746161043535693, + "grad_norm": 0.8791618943214417, + "learning_rate": 6.75830022933714e-06, + "loss": 0.8049, + "step": 14074 + }, + { + "epoch": 0.7746711431559249, + "grad_norm": 0.6377304792404175, + "learning_rate": 6.75789444259022e-06, + "loss": 0.737, + "step": 14075 + }, + { + "epoch": 0.7747261819582806, + "grad_norm": 0.7253721356391907, + "learning_rate": 6.757488642631434e-06, + "loss": 0.8432, + "step": 14076 + }, + { + "epoch": 0.7747812207606363, + "grad_norm": 0.684626042842865, + "learning_rate": 6.757082829463835e-06, + "loss": 0.7845, + "step": 14077 + }, + { + "epoch": 0.774836259562992, + "grad_norm": 0.7737520337104797, + "learning_rate": 6.756677003090471e-06, + "loss": 0.8055, + "step": 14078 + }, + { + "epoch": 0.7748912983653475, + "grad_norm": 0.7294824719429016, + "learning_rate": 6.756271163514394e-06, + "loss": 0.7666, + "step": 14079 + }, + { + "epoch": 0.7749463371677032, + "grad_norm": 0.7728607654571533, + "learning_rate": 6.755865310738651e-06, + "loss": 0.7748, + "step": 14080 + }, + { + "epoch": 0.7750013759700589, + "grad_norm": 0.6738442778587341, + "learning_rate": 6.755459444766297e-06, + "loss": 0.6711, + "step": 14081 + }, + { + "epoch": 0.7750564147724146, + "grad_norm": 0.7041414976119995, + "learning_rate": 6.7550535656003794e-06, + "loss": 0.7126, + "step": 14082 + }, + { + "epoch": 0.7751114535747702, + "grad_norm": 1.0205422639846802, + "learning_rate": 6.754647673243948e-06, + "loss": 0.7394, + "step": 14083 + }, + { + "epoch": 0.7751664923771259, + "grad_norm": 0.6594380736351013, + "learning_rate": 6.754241767700054e-06, + "loss": 0.7599, + "step": 14084 + }, + { + "epoch": 0.7752215311794816, + "grad_norm": 0.6800520420074463, + "learning_rate": 6.753835848971749e-06, + "loss": 0.7579, + "step": 14085 + }, + { + "epoch": 0.7752765699818372, + "grad_norm": 0.7658087611198425, + "learning_rate": 6.7534299170620846e-06, + "loss": 0.7705, + "step": 14086 + }, + { + "epoch": 0.7753316087841928, + "grad_norm": 0.7242750525474548, + "learning_rate": 6.7530239719741084e-06, + "loss": 0.7683, + "step": 14087 + }, + { + "epoch": 0.7753866475865485, + "grad_norm": 0.6997398138046265, + "learning_rate": 6.752618013710874e-06, + "loss": 0.8023, + "step": 14088 + }, + { + "epoch": 0.7754416863889042, + "grad_norm": 0.7041590809822083, + "learning_rate": 6.752212042275431e-06, + "loss": 0.7013, + "step": 14089 + }, + { + "epoch": 0.7754967251912598, + "grad_norm": 0.7027721405029297, + "learning_rate": 6.751806057670832e-06, + "loss": 0.7678, + "step": 14090 + }, + { + "epoch": 0.7755517639936155, + "grad_norm": 0.714290201663971, + "learning_rate": 6.751400059900128e-06, + "loss": 0.6769, + "step": 14091 + }, + { + "epoch": 0.7756068027959712, + "grad_norm": 0.7385110855102539, + "learning_rate": 6.750994048966369e-06, + "loss": 0.6576, + "step": 14092 + }, + { + "epoch": 0.7756618415983269, + "grad_norm": 0.7665147185325623, + "learning_rate": 6.750588024872607e-06, + "loss": 0.8127, + "step": 14093 + }, + { + "epoch": 0.7757168804006824, + "grad_norm": 0.6774508953094482, + "learning_rate": 6.750181987621895e-06, + "loss": 0.8112, + "step": 14094 + }, + { + "epoch": 0.7757719192030381, + "grad_norm": 0.666394054889679, + "learning_rate": 6.749775937217285e-06, + "loss": 0.6444, + "step": 14095 + }, + { + "epoch": 0.7758269580053938, + "grad_norm": 0.6557022929191589, + "learning_rate": 6.749369873661825e-06, + "loss": 0.7613, + "step": 14096 + }, + { + "epoch": 0.7758819968077495, + "grad_norm": 0.7090621590614319, + "learning_rate": 6.74896379695857e-06, + "loss": 0.7229, + "step": 14097 + }, + { + "epoch": 0.7759370356101051, + "grad_norm": 0.8117626309394836, + "learning_rate": 6.7485577071105734e-06, + "loss": 0.8002, + "step": 14098 + }, + { + "epoch": 0.7759920744124608, + "grad_norm": 0.6743370294570923, + "learning_rate": 6.748151604120883e-06, + "loss": 0.7457, + "step": 14099 + }, + { + "epoch": 0.7760471132148165, + "grad_norm": 0.7637452483177185, + "learning_rate": 6.747745487992553e-06, + "loss": 0.7471, + "step": 14100 + }, + { + "epoch": 0.7761021520171721, + "grad_norm": 0.6732922196388245, + "learning_rate": 6.747339358728636e-06, + "loss": 0.7471, + "step": 14101 + }, + { + "epoch": 0.7761571908195277, + "grad_norm": 0.7510336637496948, + "learning_rate": 6.746933216332184e-06, + "loss": 0.7252, + "step": 14102 + }, + { + "epoch": 0.7762122296218834, + "grad_norm": 0.731719434261322, + "learning_rate": 6.746527060806251e-06, + "loss": 0.8706, + "step": 14103 + }, + { + "epoch": 0.7762672684242391, + "grad_norm": 0.7625692486763, + "learning_rate": 6.746120892153886e-06, + "loss": 0.7518, + "step": 14104 + }, + { + "epoch": 0.7763223072265948, + "grad_norm": 0.6809547543525696, + "learning_rate": 6.745714710378145e-06, + "loss": 0.7172, + "step": 14105 + }, + { + "epoch": 0.7763773460289504, + "grad_norm": 0.709996223449707, + "learning_rate": 6.745308515482079e-06, + "loss": 0.7925, + "step": 14106 + }, + { + "epoch": 0.776432384831306, + "grad_norm": 0.6675372123718262, + "learning_rate": 6.744902307468742e-06, + "loss": 0.8175, + "step": 14107 + }, + { + "epoch": 0.7764874236336617, + "grad_norm": 0.6978115439414978, + "learning_rate": 6.744496086341186e-06, + "loss": 0.7895, + "step": 14108 + }, + { + "epoch": 0.7765424624360174, + "grad_norm": 0.6593814492225647, + "learning_rate": 6.7440898521024634e-06, + "loss": 0.7791, + "step": 14109 + }, + { + "epoch": 0.776597501238373, + "grad_norm": 0.7169299721717834, + "learning_rate": 6.743683604755631e-06, + "loss": 0.7944, + "step": 14110 + }, + { + "epoch": 0.7766525400407287, + "grad_norm": 0.6805511713027954, + "learning_rate": 6.743277344303738e-06, + "loss": 0.7671, + "step": 14111 + }, + { + "epoch": 0.7767075788430844, + "grad_norm": 0.7300780415534973, + "learning_rate": 6.742871070749838e-06, + "loss": 0.7789, + "step": 14112 + }, + { + "epoch": 0.7767626176454401, + "grad_norm": 0.6475857496261597, + "learning_rate": 6.742464784096987e-06, + "loss": 0.6652, + "step": 14113 + }, + { + "epoch": 0.7768176564477957, + "grad_norm": 0.6941269040107727, + "learning_rate": 6.742058484348236e-06, + "loss": 0.8138, + "step": 14114 + }, + { + "epoch": 0.7768726952501513, + "grad_norm": 0.6175981760025024, + "learning_rate": 6.7416521715066405e-06, + "loss": 0.7667, + "step": 14115 + }, + { + "epoch": 0.776927734052507, + "grad_norm": 0.6499401330947876, + "learning_rate": 6.741245845575252e-06, + "loss": 0.7415, + "step": 14116 + }, + { + "epoch": 0.7769827728548627, + "grad_norm": 0.6601547598838806, + "learning_rate": 6.740839506557127e-06, + "loss": 0.732, + "step": 14117 + }, + { + "epoch": 0.7770378116572183, + "grad_norm": 0.7939042448997498, + "learning_rate": 6.740433154455319e-06, + "loss": 0.7043, + "step": 14118 + }, + { + "epoch": 0.777092850459574, + "grad_norm": 0.7381628751754761, + "learning_rate": 6.740026789272881e-06, + "loss": 0.8256, + "step": 14119 + }, + { + "epoch": 0.7771478892619297, + "grad_norm": 0.6131769418716431, + "learning_rate": 6.739620411012866e-06, + "loss": 0.726, + "step": 14120 + }, + { + "epoch": 0.7772029280642854, + "grad_norm": 1.201745867729187, + "learning_rate": 6.739214019678332e-06, + "loss": 0.7097, + "step": 14121 + }, + { + "epoch": 0.777257966866641, + "grad_norm": 0.6618456244468689, + "learning_rate": 6.7388076152723295e-06, + "loss": 0.6396, + "step": 14122 + }, + { + "epoch": 0.7773130056689966, + "grad_norm": 0.7490836977958679, + "learning_rate": 6.738401197797915e-06, + "loss": 0.6475, + "step": 14123 + }, + { + "epoch": 0.7773680444713523, + "grad_norm": 0.8125407099723816, + "learning_rate": 6.737994767258142e-06, + "loss": 0.7693, + "step": 14124 + }, + { + "epoch": 0.777423083273708, + "grad_norm": 0.7501794099807739, + "learning_rate": 6.737588323656065e-06, + "loss": 0.7333, + "step": 14125 + }, + { + "epoch": 0.7774781220760636, + "grad_norm": 1.3062889575958252, + "learning_rate": 6.73718186699474e-06, + "loss": 0.6909, + "step": 14126 + }, + { + "epoch": 0.7775331608784193, + "grad_norm": 0.6784525513648987, + "learning_rate": 6.736775397277221e-06, + "loss": 0.7256, + "step": 14127 + }, + { + "epoch": 0.777588199680775, + "grad_norm": 0.7018646597862244, + "learning_rate": 6.736368914506562e-06, + "loss": 0.7632, + "step": 14128 + }, + { + "epoch": 0.7776432384831307, + "grad_norm": 0.7596307992935181, + "learning_rate": 6.735962418685821e-06, + "loss": 0.7117, + "step": 14129 + }, + { + "epoch": 0.7776982772854862, + "grad_norm": 0.7582107186317444, + "learning_rate": 6.7355559098180504e-06, + "loss": 0.7808, + "step": 14130 + }, + { + "epoch": 0.7777533160878419, + "grad_norm": 0.6460647583007812, + "learning_rate": 6.7351493879063056e-06, + "loss": 0.675, + "step": 14131 + }, + { + "epoch": 0.7778083548901976, + "grad_norm": 0.6801304221153259, + "learning_rate": 6.7347428529536415e-06, + "loss": 0.6504, + "step": 14132 + }, + { + "epoch": 0.7778633936925532, + "grad_norm": 0.8122933506965637, + "learning_rate": 6.7343363049631176e-06, + "loss": 0.7949, + "step": 14133 + }, + { + "epoch": 0.7779184324949089, + "grad_norm": 0.6750267744064331, + "learning_rate": 6.733929743937784e-06, + "loss": 0.7689, + "step": 14134 + }, + { + "epoch": 0.7779734712972646, + "grad_norm": 0.7141891121864319, + "learning_rate": 6.7335231698807005e-06, + "loss": 0.7099, + "step": 14135 + }, + { + "epoch": 0.7780285100996203, + "grad_norm": 0.7904065251350403, + "learning_rate": 6.733116582794918e-06, + "loss": 0.8458, + "step": 14136 + }, + { + "epoch": 0.7780835489019758, + "grad_norm": 0.6905248165130615, + "learning_rate": 6.732709982683496e-06, + "loss": 0.7848, + "step": 14137 + }, + { + "epoch": 0.7781385877043315, + "grad_norm": 0.6707245707511902, + "learning_rate": 6.732303369549491e-06, + "loss": 0.8319, + "step": 14138 + }, + { + "epoch": 0.7781936265066872, + "grad_norm": 0.6611519455909729, + "learning_rate": 6.731896743395957e-06, + "loss": 0.7025, + "step": 14139 + }, + { + "epoch": 0.7782486653090429, + "grad_norm": 0.7113156914710999, + "learning_rate": 6.73149010422595e-06, + "loss": 0.8297, + "step": 14140 + }, + { + "epoch": 0.7783037041113985, + "grad_norm": 0.7279486060142517, + "learning_rate": 6.7310834520425265e-06, + "loss": 0.8134, + "step": 14141 + }, + { + "epoch": 0.7783587429137542, + "grad_norm": 0.7561796307563782, + "learning_rate": 6.730676786848744e-06, + "loss": 0.806, + "step": 14142 + }, + { + "epoch": 0.7784137817161099, + "grad_norm": 0.6724728345870972, + "learning_rate": 6.7302701086476585e-06, + "loss": 0.7782, + "step": 14143 + }, + { + "epoch": 0.7784688205184656, + "grad_norm": 0.6363211274147034, + "learning_rate": 6.729863417442325e-06, + "loss": 0.6298, + "step": 14144 + }, + { + "epoch": 0.7785238593208211, + "grad_norm": 0.6920950412750244, + "learning_rate": 6.729456713235803e-06, + "loss": 0.5804, + "step": 14145 + }, + { + "epoch": 0.7785788981231768, + "grad_norm": 0.7388806343078613, + "learning_rate": 6.729049996031145e-06, + "loss": 0.6594, + "step": 14146 + }, + { + "epoch": 0.7786339369255325, + "grad_norm": 0.7736972570419312, + "learning_rate": 6.728643265831412e-06, + "loss": 0.8244, + "step": 14147 + }, + { + "epoch": 0.7786889757278882, + "grad_norm": 0.6928302049636841, + "learning_rate": 6.728236522639658e-06, + "loss": 0.6713, + "step": 14148 + }, + { + "epoch": 0.7787440145302438, + "grad_norm": 0.8058464527130127, + "learning_rate": 6.72782976645894e-06, + "loss": 0.7647, + "step": 14149 + }, + { + "epoch": 0.7787990533325995, + "grad_norm": 0.7111127376556396, + "learning_rate": 6.727422997292317e-06, + "loss": 0.7629, + "step": 14150 + }, + { + "epoch": 0.7788540921349552, + "grad_norm": 0.9375373721122742, + "learning_rate": 6.7270162151428455e-06, + "loss": 0.8306, + "step": 14151 + }, + { + "epoch": 0.7789091309373108, + "grad_norm": 0.6894392371177673, + "learning_rate": 6.726609420013581e-06, + "loss": 0.6995, + "step": 14152 + }, + { + "epoch": 0.7789641697396664, + "grad_norm": 0.7058690786361694, + "learning_rate": 6.726202611907583e-06, + "loss": 0.844, + "step": 14153 + }, + { + "epoch": 0.7790192085420221, + "grad_norm": 0.7672932744026184, + "learning_rate": 6.725795790827909e-06, + "loss": 0.6613, + "step": 14154 + }, + { + "epoch": 0.7790742473443778, + "grad_norm": 0.8575173020362854, + "learning_rate": 6.7253889567776146e-06, + "loss": 0.6946, + "step": 14155 + }, + { + "epoch": 0.7791292861467335, + "grad_norm": 0.6832261085510254, + "learning_rate": 6.724982109759759e-06, + "loss": 0.7121, + "step": 14156 + }, + { + "epoch": 0.7791843249490891, + "grad_norm": 0.8188209533691406, + "learning_rate": 6.724575249777401e-06, + "loss": 0.6479, + "step": 14157 + }, + { + "epoch": 0.7792393637514448, + "grad_norm": 0.6514336466789246, + "learning_rate": 6.724168376833595e-06, + "loss": 0.6117, + "step": 14158 + }, + { + "epoch": 0.7792944025538004, + "grad_norm": 0.7283767461776733, + "learning_rate": 6.723761490931403e-06, + "loss": 0.6882, + "step": 14159 + }, + { + "epoch": 0.7793494413561561, + "grad_norm": 0.7681146860122681, + "learning_rate": 6.7233545920738785e-06, + "loss": 0.8028, + "step": 14160 + }, + { + "epoch": 0.7794044801585117, + "grad_norm": 0.6202995181083679, + "learning_rate": 6.722947680264084e-06, + "loss": 0.713, + "step": 14161 + }, + { + "epoch": 0.7794595189608674, + "grad_norm": 0.7137139439582825, + "learning_rate": 6.722540755505076e-06, + "loss": 0.7842, + "step": 14162 + }, + { + "epoch": 0.7795145577632231, + "grad_norm": 0.6852554678916931, + "learning_rate": 6.722133817799913e-06, + "loss": 0.7329, + "step": 14163 + }, + { + "epoch": 0.7795695965655788, + "grad_norm": 0.7520774602890015, + "learning_rate": 6.7217268671516525e-06, + "loss": 0.7498, + "step": 14164 + }, + { + "epoch": 0.7796246353679344, + "grad_norm": 0.708577573299408, + "learning_rate": 6.7213199035633525e-06, + "loss": 0.675, + "step": 14165 + }, + { + "epoch": 0.77967967417029, + "grad_norm": 0.8061410188674927, + "learning_rate": 6.7209129270380744e-06, + "loss": 0.7176, + "step": 14166 + }, + { + "epoch": 0.7797347129726457, + "grad_norm": 0.8070787787437439, + "learning_rate": 6.720505937578876e-06, + "loss": 0.8138, + "step": 14167 + }, + { + "epoch": 0.7797897517750014, + "grad_norm": 0.7127004265785217, + "learning_rate": 6.720098935188815e-06, + "loss": 0.7004, + "step": 14168 + }, + { + "epoch": 0.779844790577357, + "grad_norm": 0.7188708782196045, + "learning_rate": 6.719691919870951e-06, + "loss": 0.6996, + "step": 14169 + }, + { + "epoch": 0.7798998293797127, + "grad_norm": 0.6346360445022583, + "learning_rate": 6.719284891628342e-06, + "loss": 0.7349, + "step": 14170 + }, + { + "epoch": 0.7799548681820684, + "grad_norm": 0.6262187361717224, + "learning_rate": 6.71887785046405e-06, + "loss": 0.7279, + "step": 14171 + }, + { + "epoch": 0.7800099069844241, + "grad_norm": 0.7538053393363953, + "learning_rate": 6.718470796381129e-06, + "loss": 0.754, + "step": 14172 + }, + { + "epoch": 0.7800649457867797, + "grad_norm": 0.6569569706916809, + "learning_rate": 6.718063729382643e-06, + "loss": 0.6787, + "step": 14173 + }, + { + "epoch": 0.7801199845891353, + "grad_norm": 0.6446678042411804, + "learning_rate": 6.71765664947165e-06, + "loss": 0.6338, + "step": 14174 + }, + { + "epoch": 0.780175023391491, + "grad_norm": 0.7559269666671753, + "learning_rate": 6.7172495566512095e-06, + "loss": 0.7472, + "step": 14175 + }, + { + "epoch": 0.7802300621938466, + "grad_norm": 0.6920101642608643, + "learning_rate": 6.71684245092438e-06, + "loss": 0.7189, + "step": 14176 + }, + { + "epoch": 0.7802851009962023, + "grad_norm": 0.6513105034828186, + "learning_rate": 6.716435332294223e-06, + "loss": 0.6104, + "step": 14177 + }, + { + "epoch": 0.780340139798558, + "grad_norm": 0.7076418399810791, + "learning_rate": 6.716028200763798e-06, + "loss": 0.7974, + "step": 14178 + }, + { + "epoch": 0.7803951786009137, + "grad_norm": 0.7291662693023682, + "learning_rate": 6.715621056336164e-06, + "loss": 0.7661, + "step": 14179 + }, + { + "epoch": 0.7804502174032693, + "grad_norm": 0.682321310043335, + "learning_rate": 6.715213899014381e-06, + "loss": 0.7345, + "step": 14180 + }, + { + "epoch": 0.7805052562056249, + "grad_norm": 0.7170400619506836, + "learning_rate": 6.71480672880151e-06, + "loss": 0.6968, + "step": 14181 + }, + { + "epoch": 0.7805602950079806, + "grad_norm": 0.7504192590713501, + "learning_rate": 6.714399545700611e-06, + "loss": 0.7868, + "step": 14182 + }, + { + "epoch": 0.7806153338103363, + "grad_norm": 0.7334801554679871, + "learning_rate": 6.713992349714744e-06, + "loss": 0.8806, + "step": 14183 + }, + { + "epoch": 0.7806703726126919, + "grad_norm": 0.6495537161827087, + "learning_rate": 6.713585140846969e-06, + "loss": 0.7272, + "step": 14184 + }, + { + "epoch": 0.7807254114150476, + "grad_norm": 0.7101101279258728, + "learning_rate": 6.713177919100347e-06, + "loss": 0.8038, + "step": 14185 + }, + { + "epoch": 0.7807804502174033, + "grad_norm": 0.7013083100318909, + "learning_rate": 6.712770684477937e-06, + "loss": 0.7576, + "step": 14186 + }, + { + "epoch": 0.780835489019759, + "grad_norm": 0.7535369992256165, + "learning_rate": 6.712363436982802e-06, + "loss": 0.6537, + "step": 14187 + }, + { + "epoch": 0.7808905278221145, + "grad_norm": 0.7432667016983032, + "learning_rate": 6.711956176618001e-06, + "loss": 0.7734, + "step": 14188 + }, + { + "epoch": 0.7809455666244702, + "grad_norm": 0.718006432056427, + "learning_rate": 6.711548903386597e-06, + "loss": 0.7291, + "step": 14189 + }, + { + "epoch": 0.7810006054268259, + "grad_norm": 0.7983072400093079, + "learning_rate": 6.711141617291649e-06, + "loss": 0.8403, + "step": 14190 + }, + { + "epoch": 0.7810556442291816, + "grad_norm": 0.7017259001731873, + "learning_rate": 6.710734318336218e-06, + "loss": 0.7293, + "step": 14191 + }, + { + "epoch": 0.7811106830315372, + "grad_norm": 0.6061737537384033, + "learning_rate": 6.710327006523366e-06, + "loss": 0.6624, + "step": 14192 + }, + { + "epoch": 0.7811657218338929, + "grad_norm": 0.6876726746559143, + "learning_rate": 6.709919681856155e-06, + "loss": 0.723, + "step": 14193 + }, + { + "epoch": 0.7812207606362486, + "grad_norm": 0.6926757097244263, + "learning_rate": 6.709512344337646e-06, + "loss": 0.7392, + "step": 14194 + }, + { + "epoch": 0.7812757994386043, + "grad_norm": 0.6464381217956543, + "learning_rate": 6.7091049939708985e-06, + "loss": 0.7301, + "step": 14195 + }, + { + "epoch": 0.7813308382409598, + "grad_norm": 0.7292629480361938, + "learning_rate": 6.708697630758974e-06, + "loss": 0.7511, + "step": 14196 + }, + { + "epoch": 0.7813858770433155, + "grad_norm": 0.7483099102973938, + "learning_rate": 6.708290254704937e-06, + "loss": 0.7981, + "step": 14197 + }, + { + "epoch": 0.7814409158456712, + "grad_norm": 0.6766877770423889, + "learning_rate": 6.707882865811848e-06, + "loss": 0.7987, + "step": 14198 + }, + { + "epoch": 0.7814959546480269, + "grad_norm": 0.7340181469917297, + "learning_rate": 6.707475464082769e-06, + "loss": 0.799, + "step": 14199 + }, + { + "epoch": 0.7815509934503825, + "grad_norm": 0.6247759461402893, + "learning_rate": 6.707068049520759e-06, + "loss": 0.7299, + "step": 14200 + }, + { + "epoch": 0.7816060322527382, + "grad_norm": 0.6783067584037781, + "learning_rate": 6.706660622128885e-06, + "loss": 0.6987, + "step": 14201 + }, + { + "epoch": 0.7816610710550939, + "grad_norm": 0.7613719701766968, + "learning_rate": 6.706253181910205e-06, + "loss": 0.7894, + "step": 14202 + }, + { + "epoch": 0.7817161098574495, + "grad_norm": 0.6673761606216431, + "learning_rate": 6.705845728867784e-06, + "loss": 0.8015, + "step": 14203 + }, + { + "epoch": 0.7817711486598051, + "grad_norm": 0.6551307439804077, + "learning_rate": 6.705438263004683e-06, + "loss": 0.7057, + "step": 14204 + }, + { + "epoch": 0.7818261874621608, + "grad_norm": 0.6815405488014221, + "learning_rate": 6.705030784323965e-06, + "loss": 0.7466, + "step": 14205 + }, + { + "epoch": 0.7818812262645165, + "grad_norm": 0.6838087439537048, + "learning_rate": 6.704623292828692e-06, + "loss": 0.8226, + "step": 14206 + }, + { + "epoch": 0.7819362650668722, + "grad_norm": 0.6704637408256531, + "learning_rate": 6.704215788521925e-06, + "loss": 0.8101, + "step": 14207 + }, + { + "epoch": 0.7819913038692278, + "grad_norm": 0.6606172919273376, + "learning_rate": 6.70380827140673e-06, + "loss": 0.7824, + "step": 14208 + }, + { + "epoch": 0.7820463426715835, + "grad_norm": 0.6641090512275696, + "learning_rate": 6.703400741486166e-06, + "loss": 0.7507, + "step": 14209 + }, + { + "epoch": 0.7821013814739392, + "grad_norm": 1.6413429975509644, + "learning_rate": 6.702993198763299e-06, + "loss": 0.7793, + "step": 14210 + }, + { + "epoch": 0.7821564202762948, + "grad_norm": 0.6664854884147644, + "learning_rate": 6.7025856432411915e-06, + "loss": 0.7304, + "step": 14211 + }, + { + "epoch": 0.7822114590786504, + "grad_norm": 0.6968172192573547, + "learning_rate": 6.7021780749229075e-06, + "loss": 0.7506, + "step": 14212 + }, + { + "epoch": 0.7822664978810061, + "grad_norm": 0.6443943381309509, + "learning_rate": 6.701770493811506e-06, + "loss": 0.7511, + "step": 14213 + }, + { + "epoch": 0.7823215366833618, + "grad_norm": 0.67723548412323, + "learning_rate": 6.701362899910053e-06, + "loss": 0.6839, + "step": 14214 + }, + { + "epoch": 0.7823765754857175, + "grad_norm": 0.7601221203804016, + "learning_rate": 6.700955293221614e-06, + "loss": 0.7397, + "step": 14215 + }, + { + "epoch": 0.7824316142880731, + "grad_norm": 0.6056920289993286, + "learning_rate": 6.700547673749249e-06, + "loss": 0.7706, + "step": 14216 + }, + { + "epoch": 0.7824866530904288, + "grad_norm": 0.6421142816543579, + "learning_rate": 6.700140041496024e-06, + "loss": 0.7209, + "step": 14217 + }, + { + "epoch": 0.7825416918927844, + "grad_norm": 0.6653133034706116, + "learning_rate": 6.6997323964650005e-06, + "loss": 0.708, + "step": 14218 + }, + { + "epoch": 0.78259673069514, + "grad_norm": 0.8854939937591553, + "learning_rate": 6.699324738659243e-06, + "loss": 0.7658, + "step": 14219 + }, + { + "epoch": 0.7826517694974957, + "grad_norm": 0.7130745649337769, + "learning_rate": 6.6989170680818175e-06, + "loss": 0.7827, + "step": 14220 + }, + { + "epoch": 0.7827068082998514, + "grad_norm": 0.953117847442627, + "learning_rate": 6.698509384735783e-06, + "loss": 0.7852, + "step": 14221 + }, + { + "epoch": 0.7827618471022071, + "grad_norm": 0.655768871307373, + "learning_rate": 6.698101688624209e-06, + "loss": 0.8461, + "step": 14222 + }, + { + "epoch": 0.7828168859045627, + "grad_norm": 0.656775951385498, + "learning_rate": 6.6976939797501575e-06, + "loss": 0.7254, + "step": 14223 + }, + { + "epoch": 0.7828719247069184, + "grad_norm": 0.6901991963386536, + "learning_rate": 6.697286258116691e-06, + "loss": 0.7242, + "step": 14224 + }, + { + "epoch": 0.782926963509274, + "grad_norm": 0.8289571404457092, + "learning_rate": 6.696878523726875e-06, + "loss": 0.8578, + "step": 14225 + }, + { + "epoch": 0.7829820023116297, + "grad_norm": 0.6268846392631531, + "learning_rate": 6.696470776583775e-06, + "loss": 0.737, + "step": 14226 + }, + { + "epoch": 0.7830370411139853, + "grad_norm": 0.7026770114898682, + "learning_rate": 6.696063016690455e-06, + "loss": 0.6771, + "step": 14227 + }, + { + "epoch": 0.783092079916341, + "grad_norm": 0.7377839088439941, + "learning_rate": 6.69565524404998e-06, + "loss": 0.7174, + "step": 14228 + }, + { + "epoch": 0.7831471187186967, + "grad_norm": 0.6778523921966553, + "learning_rate": 6.695247458665414e-06, + "loss": 0.8255, + "step": 14229 + }, + { + "epoch": 0.7832021575210524, + "grad_norm": 0.7624330520629883, + "learning_rate": 6.69483966053982e-06, + "loss": 0.7495, + "step": 14230 + }, + { + "epoch": 0.783257196323408, + "grad_norm": 0.8944052457809448, + "learning_rate": 6.694431849676267e-06, + "loss": 0.868, + "step": 14231 + }, + { + "epoch": 0.7833122351257636, + "grad_norm": 0.7391701936721802, + "learning_rate": 6.694024026077816e-06, + "loss": 0.7032, + "step": 14232 + }, + { + "epoch": 0.7833672739281193, + "grad_norm": 0.7548620104789734, + "learning_rate": 6.693616189747535e-06, + "loss": 0.8272, + "step": 14233 + }, + { + "epoch": 0.783422312730475, + "grad_norm": 0.6795994639396667, + "learning_rate": 6.693208340688489e-06, + "loss": 0.703, + "step": 14234 + }, + { + "epoch": 0.7834773515328306, + "grad_norm": 0.6580816507339478, + "learning_rate": 6.69280047890374e-06, + "loss": 0.7454, + "step": 14235 + }, + { + "epoch": 0.7835323903351863, + "grad_norm": 0.7124443650245667, + "learning_rate": 6.6923926043963576e-06, + "loss": 0.6655, + "step": 14236 + }, + { + "epoch": 0.783587429137542, + "grad_norm": 0.6730241179466248, + "learning_rate": 6.691984717169404e-06, + "loss": 0.7522, + "step": 14237 + }, + { + "epoch": 0.7836424679398977, + "grad_norm": 0.8156033158302307, + "learning_rate": 6.6915768172259466e-06, + "loss": 0.8955, + "step": 14238 + }, + { + "epoch": 0.7836975067422532, + "grad_norm": 0.8041443228721619, + "learning_rate": 6.6911689045690506e-06, + "loss": 0.8019, + "step": 14239 + }, + { + "epoch": 0.7837525455446089, + "grad_norm": 0.7252053618431091, + "learning_rate": 6.690760979201782e-06, + "loss": 0.7014, + "step": 14240 + }, + { + "epoch": 0.7838075843469646, + "grad_norm": 0.6969071626663208, + "learning_rate": 6.690353041127208e-06, + "loss": 0.7304, + "step": 14241 + }, + { + "epoch": 0.7838626231493203, + "grad_norm": 0.8254885673522949, + "learning_rate": 6.6899450903483906e-06, + "loss": 0.7193, + "step": 14242 + }, + { + "epoch": 0.7839176619516759, + "grad_norm": 0.7426590323448181, + "learning_rate": 6.6895371268684e-06, + "loss": 0.697, + "step": 14243 + }, + { + "epoch": 0.7839727007540316, + "grad_norm": 0.6744338274002075, + "learning_rate": 6.6891291506903e-06, + "loss": 0.8363, + "step": 14244 + }, + { + "epoch": 0.7840277395563873, + "grad_norm": 0.6609839797019958, + "learning_rate": 6.688721161817156e-06, + "loss": 0.7756, + "step": 14245 + }, + { + "epoch": 0.784082778358743, + "grad_norm": 0.8377131223678589, + "learning_rate": 6.688313160252038e-06, + "loss": 0.8355, + "step": 14246 + }, + { + "epoch": 0.7841378171610985, + "grad_norm": 0.6922308802604675, + "learning_rate": 6.687905145998009e-06, + "loss": 0.756, + "step": 14247 + }, + { + "epoch": 0.7841928559634542, + "grad_norm": 0.7217739820480347, + "learning_rate": 6.687497119058137e-06, + "loss": 0.7309, + "step": 14248 + }, + { + "epoch": 0.7842478947658099, + "grad_norm": 0.6906038522720337, + "learning_rate": 6.687089079435488e-06, + "loss": 0.6645, + "step": 14249 + }, + { + "epoch": 0.7843029335681656, + "grad_norm": 0.6800183057785034, + "learning_rate": 6.6866810271331305e-06, + "loss": 0.6791, + "step": 14250 + }, + { + "epoch": 0.7843579723705212, + "grad_norm": 0.6835503578186035, + "learning_rate": 6.686272962154129e-06, + "loss": 0.699, + "step": 14251 + }, + { + "epoch": 0.7844130111728769, + "grad_norm": 0.6643723845481873, + "learning_rate": 6.685864884501552e-06, + "loss": 0.7808, + "step": 14252 + }, + { + "epoch": 0.7844680499752326, + "grad_norm": 0.6742954850196838, + "learning_rate": 6.685456794178464e-06, + "loss": 0.7704, + "step": 14253 + }, + { + "epoch": 0.7845230887775883, + "grad_norm": 0.6374711990356445, + "learning_rate": 6.6850486911879355e-06, + "loss": 0.7557, + "step": 14254 + }, + { + "epoch": 0.7845781275799438, + "grad_norm": 0.7354347109794617, + "learning_rate": 6.684640575533031e-06, + "loss": 0.7928, + "step": 14255 + }, + { + "epoch": 0.7846331663822995, + "grad_norm": 0.6694937348365784, + "learning_rate": 6.684232447216821e-06, + "loss": 0.7247, + "step": 14256 + }, + { + "epoch": 0.7846882051846552, + "grad_norm": 0.716623842716217, + "learning_rate": 6.683824306242368e-06, + "loss": 0.8638, + "step": 14257 + }, + { + "epoch": 0.7847432439870109, + "grad_norm": 0.667164146900177, + "learning_rate": 6.683416152612743e-06, + "loss": 0.7455, + "step": 14258 + }, + { + "epoch": 0.7847982827893665, + "grad_norm": 0.7302100658416748, + "learning_rate": 6.683007986331014e-06, + "loss": 0.707, + "step": 14259 + }, + { + "epoch": 0.7848533215917222, + "grad_norm": 0.7605045437812805, + "learning_rate": 6.682599807400246e-06, + "loss": 0.7727, + "step": 14260 + }, + { + "epoch": 0.7849083603940779, + "grad_norm": 0.6819437146186829, + "learning_rate": 6.682191615823508e-06, + "loss": 0.7538, + "step": 14261 + }, + { + "epoch": 0.7849633991964334, + "grad_norm": 0.7399439811706543, + "learning_rate": 6.6817834116038695e-06, + "loss": 0.7499, + "step": 14262 + }, + { + "epoch": 0.7850184379987891, + "grad_norm": 0.7864901423454285, + "learning_rate": 6.681375194744397e-06, + "loss": 0.7128, + "step": 14263 + }, + { + "epoch": 0.7850734768011448, + "grad_norm": 0.7308626174926758, + "learning_rate": 6.680966965248159e-06, + "loss": 0.7239, + "step": 14264 + }, + { + "epoch": 0.7851285156035005, + "grad_norm": 0.6553478837013245, + "learning_rate": 6.680558723118222e-06, + "loss": 0.6984, + "step": 14265 + }, + { + "epoch": 0.7851835544058561, + "grad_norm": 0.621415376663208, + "learning_rate": 6.680150468357656e-06, + "loss": 0.6428, + "step": 14266 + }, + { + "epoch": 0.7852385932082118, + "grad_norm": 1.0505764484405518, + "learning_rate": 6.679742200969529e-06, + "loss": 0.8073, + "step": 14267 + }, + { + "epoch": 0.7852936320105675, + "grad_norm": 0.7393355369567871, + "learning_rate": 6.67933392095691e-06, + "loss": 0.7396, + "step": 14268 + }, + { + "epoch": 0.7853486708129231, + "grad_norm": 0.7346563935279846, + "learning_rate": 6.678925628322864e-06, + "loss": 0.7398, + "step": 14269 + }, + { + "epoch": 0.7854037096152787, + "grad_norm": 0.6694674491882324, + "learning_rate": 6.678517323070465e-06, + "loss": 0.7346, + "step": 14270 + }, + { + "epoch": 0.7854587484176344, + "grad_norm": 0.6907033920288086, + "learning_rate": 6.678109005202779e-06, + "loss": 0.7617, + "step": 14271 + }, + { + "epoch": 0.7855137872199901, + "grad_norm": 0.6588131189346313, + "learning_rate": 6.677700674722873e-06, + "loss": 0.7514, + "step": 14272 + }, + { + "epoch": 0.7855688260223458, + "grad_norm": 0.6535136699676514, + "learning_rate": 6.677292331633819e-06, + "loss": 0.7154, + "step": 14273 + }, + { + "epoch": 0.7856238648247014, + "grad_norm": 0.7013682723045349, + "learning_rate": 6.676883975938685e-06, + "loss": 0.8506, + "step": 14274 + }, + { + "epoch": 0.7856789036270571, + "grad_norm": 0.7128416895866394, + "learning_rate": 6.67647560764054e-06, + "loss": 0.7669, + "step": 14275 + }, + { + "epoch": 0.7857339424294127, + "grad_norm": 0.7021318674087524, + "learning_rate": 6.676067226742453e-06, + "loss": 0.8236, + "step": 14276 + }, + { + "epoch": 0.7857889812317684, + "grad_norm": 0.7067561745643616, + "learning_rate": 6.675658833247493e-06, + "loss": 0.6848, + "step": 14277 + }, + { + "epoch": 0.785844020034124, + "grad_norm": 0.6488254070281982, + "learning_rate": 6.675250427158731e-06, + "loss": 0.7877, + "step": 14278 + }, + { + "epoch": 0.7858990588364797, + "grad_norm": 0.7153946757316589, + "learning_rate": 6.674842008479234e-06, + "loss": 0.7994, + "step": 14279 + }, + { + "epoch": 0.7859540976388354, + "grad_norm": 0.7290914058685303, + "learning_rate": 6.6744335772120735e-06, + "loss": 0.8074, + "step": 14280 + }, + { + "epoch": 0.7860091364411911, + "grad_norm": 0.726309061050415, + "learning_rate": 6.674025133360316e-06, + "loss": 0.7789, + "step": 14281 + }, + { + "epoch": 0.7860641752435467, + "grad_norm": 0.6294347047805786, + "learning_rate": 6.673616676927037e-06, + "loss": 0.6405, + "step": 14282 + }, + { + "epoch": 0.7861192140459023, + "grad_norm": 0.654400646686554, + "learning_rate": 6.673208207915302e-06, + "loss": 0.7876, + "step": 14283 + }, + { + "epoch": 0.786174252848258, + "grad_norm": 0.6729328632354736, + "learning_rate": 6.672799726328182e-06, + "loss": 0.7773, + "step": 14284 + }, + { + "epoch": 0.7862292916506137, + "grad_norm": 0.7607905268669128, + "learning_rate": 6.672391232168745e-06, + "loss": 0.8262, + "step": 14285 + }, + { + "epoch": 0.7862843304529693, + "grad_norm": 0.6475018858909607, + "learning_rate": 6.671982725440065e-06, + "loss": 0.7383, + "step": 14286 + }, + { + "epoch": 0.786339369255325, + "grad_norm": 0.8290789723396301, + "learning_rate": 6.671574206145211e-06, + "loss": 0.7968, + "step": 14287 + }, + { + "epoch": 0.7863944080576807, + "grad_norm": 0.7462177872657776, + "learning_rate": 6.671165674287252e-06, + "loss": 0.7465, + "step": 14288 + }, + { + "epoch": 0.7864494468600364, + "grad_norm": 0.7029373049736023, + "learning_rate": 6.6707571298692595e-06, + "loss": 0.7342, + "step": 14289 + }, + { + "epoch": 0.786504485662392, + "grad_norm": 0.8253761529922485, + "learning_rate": 6.670348572894303e-06, + "loss": 0.8196, + "step": 14290 + }, + { + "epoch": 0.7865595244647476, + "grad_norm": 0.7234970331192017, + "learning_rate": 6.669940003365455e-06, + "loss": 0.7966, + "step": 14291 + }, + { + "epoch": 0.7866145632671033, + "grad_norm": 0.8699348568916321, + "learning_rate": 6.6695314212857845e-06, + "loss": 0.8761, + "step": 14292 + }, + { + "epoch": 0.786669602069459, + "grad_norm": 0.6620158553123474, + "learning_rate": 6.66912282665836e-06, + "loss": 0.7534, + "step": 14293 + }, + { + "epoch": 0.7867246408718146, + "grad_norm": 0.6469776630401611, + "learning_rate": 6.668714219486259e-06, + "loss": 0.7812, + "step": 14294 + }, + { + "epoch": 0.7867796796741703, + "grad_norm": 0.6477407813072205, + "learning_rate": 6.668305599772546e-06, + "loss": 0.7144, + "step": 14295 + }, + { + "epoch": 0.786834718476526, + "grad_norm": 0.6626473665237427, + "learning_rate": 6.667896967520297e-06, + "loss": 0.7283, + "step": 14296 + }, + { + "epoch": 0.7868897572788817, + "grad_norm": 0.6214945316314697, + "learning_rate": 6.667488322732578e-06, + "loss": 0.6835, + "step": 14297 + }, + { + "epoch": 0.7869447960812372, + "grad_norm": 0.6199555397033691, + "learning_rate": 6.667079665412465e-06, + "loss": 0.706, + "step": 14298 + }, + { + "epoch": 0.7869998348835929, + "grad_norm": 0.8127612471580505, + "learning_rate": 6.666670995563027e-06, + "loss": 0.7099, + "step": 14299 + }, + { + "epoch": 0.7870548736859486, + "grad_norm": 0.6241362690925598, + "learning_rate": 6.6662623131873374e-06, + "loss": 0.7076, + "step": 14300 + }, + { + "epoch": 0.7871099124883043, + "grad_norm": 0.7260692715644836, + "learning_rate": 6.665853618288465e-06, + "loss": 0.7842, + "step": 14301 + }, + { + "epoch": 0.7871649512906599, + "grad_norm": 0.6644107103347778, + "learning_rate": 6.665444910869482e-06, + "loss": 0.6515, + "step": 14302 + }, + { + "epoch": 0.7872199900930156, + "grad_norm": 0.6629641056060791, + "learning_rate": 6.6650361909334616e-06, + "loss": 0.7062, + "step": 14303 + }, + { + "epoch": 0.7872750288953713, + "grad_norm": 0.6616516709327698, + "learning_rate": 6.6646274584834745e-06, + "loss": 0.8195, + "step": 14304 + }, + { + "epoch": 0.7873300676977268, + "grad_norm": 0.7184805870056152, + "learning_rate": 6.664218713522593e-06, + "loss": 0.8699, + "step": 14305 + }, + { + "epoch": 0.7873851065000825, + "grad_norm": 0.6567219495773315, + "learning_rate": 6.6638099560538905e-06, + "loss": 0.7679, + "step": 14306 + }, + { + "epoch": 0.7874401453024382, + "grad_norm": 0.6952399611473083, + "learning_rate": 6.663401186080436e-06, + "loss": 0.603, + "step": 14307 + }, + { + "epoch": 0.7874951841047939, + "grad_norm": 0.7298767566680908, + "learning_rate": 6.662992403605304e-06, + "loss": 0.7655, + "step": 14308 + }, + { + "epoch": 0.7875502229071495, + "grad_norm": 0.7162219882011414, + "learning_rate": 6.662583608631567e-06, + "loss": 0.7797, + "step": 14309 + }, + { + "epoch": 0.7876052617095052, + "grad_norm": 0.6489827036857605, + "learning_rate": 6.662174801162296e-06, + "loss": 0.8165, + "step": 14310 + }, + { + "epoch": 0.7876603005118609, + "grad_norm": 0.7893611192703247, + "learning_rate": 6.6617659812005635e-06, + "loss": 0.8082, + "step": 14311 + }, + { + "epoch": 0.7877153393142166, + "grad_norm": 0.6709675192832947, + "learning_rate": 6.661357148749443e-06, + "loss": 0.7549, + "step": 14312 + }, + { + "epoch": 0.7877703781165721, + "grad_norm": 0.6166689991950989, + "learning_rate": 6.660948303812009e-06, + "loss": 0.7116, + "step": 14313 + }, + { + "epoch": 0.7878254169189278, + "grad_norm": 0.7941738367080688, + "learning_rate": 6.660539446391329e-06, + "loss": 0.7981, + "step": 14314 + }, + { + "epoch": 0.7878804557212835, + "grad_norm": 0.6339346170425415, + "learning_rate": 6.660130576490481e-06, + "loss": 0.7306, + "step": 14315 + }, + { + "epoch": 0.7879354945236392, + "grad_norm": 0.7044192552566528, + "learning_rate": 6.659721694112535e-06, + "loss": 0.7811, + "step": 14316 + }, + { + "epoch": 0.7879905333259948, + "grad_norm": 0.7853406071662903, + "learning_rate": 6.659312799260565e-06, + "loss": 0.7652, + "step": 14317 + }, + { + "epoch": 0.7880455721283505, + "grad_norm": 0.7076637148857117, + "learning_rate": 6.658903891937645e-06, + "loss": 0.7672, + "step": 14318 + }, + { + "epoch": 0.7881006109307062, + "grad_norm": 0.7043278813362122, + "learning_rate": 6.658494972146847e-06, + "loss": 0.726, + "step": 14319 + }, + { + "epoch": 0.7881556497330618, + "grad_norm": 0.8903809785842896, + "learning_rate": 6.658086039891245e-06, + "loss": 0.8, + "step": 14320 + }, + { + "epoch": 0.7882106885354174, + "grad_norm": 0.8239984512329102, + "learning_rate": 6.657677095173911e-06, + "loss": 0.7283, + "step": 14321 + }, + { + "epoch": 0.7882657273377731, + "grad_norm": 0.7221176028251648, + "learning_rate": 6.6572681379979206e-06, + "loss": 0.8058, + "step": 14322 + }, + { + "epoch": 0.7883207661401288, + "grad_norm": 0.8297285437583923, + "learning_rate": 6.6568591683663475e-06, + "loss": 0.8064, + "step": 14323 + }, + { + "epoch": 0.7883758049424845, + "grad_norm": 0.680659294128418, + "learning_rate": 6.656450186282264e-06, + "loss": 0.7259, + "step": 14324 + }, + { + "epoch": 0.7884308437448401, + "grad_norm": 0.7067807912826538, + "learning_rate": 6.656041191748744e-06, + "loss": 0.8414, + "step": 14325 + }, + { + "epoch": 0.7884858825471958, + "grad_norm": 0.6053900718688965, + "learning_rate": 6.655632184768861e-06, + "loss": 0.6762, + "step": 14326 + }, + { + "epoch": 0.7885409213495514, + "grad_norm": 0.7123621106147766, + "learning_rate": 6.65522316534569e-06, + "loss": 0.6968, + "step": 14327 + }, + { + "epoch": 0.7885959601519071, + "grad_norm": 0.7308228015899658, + "learning_rate": 6.6548141334823045e-06, + "loss": 0.6715, + "step": 14328 + }, + { + "epoch": 0.7886509989542627, + "grad_norm": 0.7508199214935303, + "learning_rate": 6.654405089181779e-06, + "loss": 0.7884, + "step": 14329 + }, + { + "epoch": 0.7887060377566184, + "grad_norm": 0.7317141890525818, + "learning_rate": 6.653996032447188e-06, + "loss": 0.7319, + "step": 14330 + }, + { + "epoch": 0.7887610765589741, + "grad_norm": 0.6797091364860535, + "learning_rate": 6.653586963281607e-06, + "loss": 0.7898, + "step": 14331 + }, + { + "epoch": 0.7888161153613298, + "grad_norm": 0.6293582320213318, + "learning_rate": 6.6531778816881065e-06, + "loss": 0.6784, + "step": 14332 + }, + { + "epoch": 0.7888711541636854, + "grad_norm": 0.7604238986968994, + "learning_rate": 6.652768787669763e-06, + "loss": 0.7226, + "step": 14333 + }, + { + "epoch": 0.788926192966041, + "grad_norm": 0.6921128034591675, + "learning_rate": 6.652359681229654e-06, + "loss": 0.7375, + "step": 14334 + }, + { + "epoch": 0.7889812317683967, + "grad_norm": 0.6532993316650391, + "learning_rate": 6.651950562370851e-06, + "loss": 0.703, + "step": 14335 + }, + { + "epoch": 0.7890362705707524, + "grad_norm": 0.6739360094070435, + "learning_rate": 6.651541431096431e-06, + "loss": 0.7488, + "step": 14336 + }, + { + "epoch": 0.789091309373108, + "grad_norm": 0.7503200173377991, + "learning_rate": 6.651132287409466e-06, + "loss": 0.7492, + "step": 14337 + }, + { + "epoch": 0.7891463481754637, + "grad_norm": 0.6537551879882812, + "learning_rate": 6.650723131313035e-06, + "loss": 0.723, + "step": 14338 + }, + { + "epoch": 0.7892013869778194, + "grad_norm": 0.6378511786460876, + "learning_rate": 6.650313962810208e-06, + "loss": 0.7764, + "step": 14339 + }, + { + "epoch": 0.7892564257801751, + "grad_norm": 0.7948685884475708, + "learning_rate": 6.649904781904065e-06, + "loss": 0.7996, + "step": 14340 + }, + { + "epoch": 0.7893114645825307, + "grad_norm": 0.7558071613311768, + "learning_rate": 6.649495588597678e-06, + "loss": 0.8249, + "step": 14341 + }, + { + "epoch": 0.7893665033848863, + "grad_norm": 0.7158063054084778, + "learning_rate": 6.649086382894124e-06, + "loss": 0.815, + "step": 14342 + }, + { + "epoch": 0.789421542187242, + "grad_norm": 0.7551599144935608, + "learning_rate": 6.648677164796479e-06, + "loss": 0.7151, + "step": 14343 + }, + { + "epoch": 0.7894765809895977, + "grad_norm": 0.6966339349746704, + "learning_rate": 6.648267934307817e-06, + "loss": 0.8057, + "step": 14344 + }, + { + "epoch": 0.7895316197919533, + "grad_norm": 0.6863396167755127, + "learning_rate": 6.647858691431214e-06, + "loss": 0.7819, + "step": 14345 + }, + { + "epoch": 0.789586658594309, + "grad_norm": 0.7352383136749268, + "learning_rate": 6.647449436169747e-06, + "loss": 0.8101, + "step": 14346 + }, + { + "epoch": 0.7896416973966647, + "grad_norm": 0.7630855441093445, + "learning_rate": 6.64704016852649e-06, + "loss": 0.7155, + "step": 14347 + }, + { + "epoch": 0.7896967361990203, + "grad_norm": 0.6740198135375977, + "learning_rate": 6.646630888504522e-06, + "loss": 0.7255, + "step": 14348 + }, + { + "epoch": 0.7897517750013759, + "grad_norm": 0.7095367908477783, + "learning_rate": 6.646221596106917e-06, + "loss": 0.7527, + "step": 14349 + }, + { + "epoch": 0.7898068138037316, + "grad_norm": 0.6096131801605225, + "learning_rate": 6.645812291336749e-06, + "loss": 0.7116, + "step": 14350 + }, + { + "epoch": 0.7898618526060873, + "grad_norm": 0.7212585210800171, + "learning_rate": 6.645402974197097e-06, + "loss": 0.7647, + "step": 14351 + }, + { + "epoch": 0.7899168914084429, + "grad_norm": 0.7145454287528992, + "learning_rate": 6.6449936446910376e-06, + "loss": 0.7988, + "step": 14352 + }, + { + "epoch": 0.7899719302107986, + "grad_norm": 0.668269693851471, + "learning_rate": 6.644584302821646e-06, + "loss": 0.8453, + "step": 14353 + }, + { + "epoch": 0.7900269690131543, + "grad_norm": 0.7431649565696716, + "learning_rate": 6.644174948591998e-06, + "loss": 0.6981, + "step": 14354 + }, + { + "epoch": 0.79008200781551, + "grad_norm": 0.6727485060691833, + "learning_rate": 6.643765582005172e-06, + "loss": 0.792, + "step": 14355 + }, + { + "epoch": 0.7901370466178655, + "grad_norm": 0.7102059721946716, + "learning_rate": 6.643356203064244e-06, + "loss": 0.7469, + "step": 14356 + }, + { + "epoch": 0.7901920854202212, + "grad_norm": 0.6719706654548645, + "learning_rate": 6.642946811772291e-06, + "loss": 0.7542, + "step": 14357 + }, + { + "epoch": 0.7902471242225769, + "grad_norm": 0.7044880986213684, + "learning_rate": 6.6425374081323875e-06, + "loss": 0.7884, + "step": 14358 + }, + { + "epoch": 0.7903021630249326, + "grad_norm": 0.656411349773407, + "learning_rate": 6.642127992147614e-06, + "loss": 0.7596, + "step": 14359 + }, + { + "epoch": 0.7903572018272882, + "grad_norm": 0.6256445050239563, + "learning_rate": 6.641718563821047e-06, + "loss": 0.6257, + "step": 14360 + }, + { + "epoch": 0.7904122406296439, + "grad_norm": 0.6761715412139893, + "learning_rate": 6.641309123155761e-06, + "loss": 0.7024, + "step": 14361 + }, + { + "epoch": 0.7904672794319996, + "grad_norm": 0.7567794322967529, + "learning_rate": 6.640899670154837e-06, + "loss": 0.7948, + "step": 14362 + }, + { + "epoch": 0.7905223182343553, + "grad_norm": 0.6192977428436279, + "learning_rate": 6.640490204821349e-06, + "loss": 0.7307, + "step": 14363 + }, + { + "epoch": 0.7905773570367108, + "grad_norm": 0.8120929002761841, + "learning_rate": 6.640080727158376e-06, + "loss": 0.7173, + "step": 14364 + }, + { + "epoch": 0.7906323958390665, + "grad_norm": 0.7303271293640137, + "learning_rate": 6.639671237168996e-06, + "loss": 0.8118, + "step": 14365 + }, + { + "epoch": 0.7906874346414222, + "grad_norm": 0.6731529831886292, + "learning_rate": 6.639261734856284e-06, + "loss": 0.76, + "step": 14366 + }, + { + "epoch": 0.7907424734437779, + "grad_norm": 0.6909935474395752, + "learning_rate": 6.638852220223321e-06, + "loss": 0.7732, + "step": 14367 + }, + { + "epoch": 0.7907975122461335, + "grad_norm": 0.6543979048728943, + "learning_rate": 6.638442693273183e-06, + "loss": 0.7408, + "step": 14368 + }, + { + "epoch": 0.7908525510484892, + "grad_norm": 0.6411511301994324, + "learning_rate": 6.6380331540089485e-06, + "loss": 0.6963, + "step": 14369 + }, + { + "epoch": 0.7909075898508449, + "grad_norm": 0.6657214164733887, + "learning_rate": 6.637623602433694e-06, + "loss": 0.7417, + "step": 14370 + }, + { + "epoch": 0.7909626286532006, + "grad_norm": 0.6852405071258545, + "learning_rate": 6.6372140385505e-06, + "loss": 0.7176, + "step": 14371 + }, + { + "epoch": 0.7910176674555561, + "grad_norm": 0.6453777551651001, + "learning_rate": 6.636804462362444e-06, + "loss": 0.7791, + "step": 14372 + }, + { + "epoch": 0.7910727062579118, + "grad_norm": 0.6806328296661377, + "learning_rate": 6.636394873872603e-06, + "loss": 0.7856, + "step": 14373 + }, + { + "epoch": 0.7911277450602675, + "grad_norm": 0.6819495558738708, + "learning_rate": 6.635985273084058e-06, + "loss": 0.7865, + "step": 14374 + }, + { + "epoch": 0.7911827838626232, + "grad_norm": 0.7372999787330627, + "learning_rate": 6.635575659999883e-06, + "loss": 0.8549, + "step": 14375 + }, + { + "epoch": 0.7912378226649788, + "grad_norm": 0.8146817684173584, + "learning_rate": 6.635166034623162e-06, + "loss": 0.7253, + "step": 14376 + }, + { + "epoch": 0.7912928614673345, + "grad_norm": 0.8205630779266357, + "learning_rate": 6.634756396956969e-06, + "loss": 0.6915, + "step": 14377 + }, + { + "epoch": 0.7913479002696902, + "grad_norm": 0.7168713808059692, + "learning_rate": 6.634346747004383e-06, + "loss": 0.7495, + "step": 14378 + }, + { + "epoch": 0.7914029390720458, + "grad_norm": 0.7210709452629089, + "learning_rate": 6.6339370847684854e-06, + "loss": 0.7323, + "step": 14379 + }, + { + "epoch": 0.7914579778744014, + "grad_norm": 0.9042065143585205, + "learning_rate": 6.633527410252355e-06, + "loss": 0.847, + "step": 14380 + }, + { + "epoch": 0.7915130166767571, + "grad_norm": 0.6700118184089661, + "learning_rate": 6.633117723459071e-06, + "loss": 0.7975, + "step": 14381 + }, + { + "epoch": 0.7915680554791128, + "grad_norm": 0.6355725526809692, + "learning_rate": 6.632708024391707e-06, + "loss": 0.7398, + "step": 14382 + }, + { + "epoch": 0.7916230942814685, + "grad_norm": 0.8274535536766052, + "learning_rate": 6.6322983130533505e-06, + "loss": 0.8641, + "step": 14383 + }, + { + "epoch": 0.7916781330838241, + "grad_norm": 0.5835573077201843, + "learning_rate": 6.631888589447075e-06, + "loss": 0.636, + "step": 14384 + }, + { + "epoch": 0.7917331718861798, + "grad_norm": 0.6933130621910095, + "learning_rate": 6.631478853575963e-06, + "loss": 0.7874, + "step": 14385 + }, + { + "epoch": 0.7917882106885354, + "grad_norm": 0.8125241994857788, + "learning_rate": 6.631069105443092e-06, + "loss": 0.7961, + "step": 14386 + }, + { + "epoch": 0.7918432494908911, + "grad_norm": 0.6661116480827332, + "learning_rate": 6.630659345051542e-06, + "loss": 0.6498, + "step": 14387 + }, + { + "epoch": 0.7918982882932467, + "grad_norm": 0.6807548403739929, + "learning_rate": 6.630249572404393e-06, + "loss": 0.6952, + "step": 14388 + }, + { + "epoch": 0.7919533270956024, + "grad_norm": 0.6886214017868042, + "learning_rate": 6.629839787504726e-06, + "loss": 0.7416, + "step": 14389 + }, + { + "epoch": 0.7920083658979581, + "grad_norm": 0.7633732557296753, + "learning_rate": 6.629429990355617e-06, + "loss": 0.8008, + "step": 14390 + }, + { + "epoch": 0.7920634047003137, + "grad_norm": 0.8401023745536804, + "learning_rate": 6.6290201809601494e-06, + "loss": 0.8312, + "step": 14391 + }, + { + "epoch": 0.7921184435026694, + "grad_norm": 0.6608526706695557, + "learning_rate": 6.628610359321403e-06, + "loss": 0.563, + "step": 14392 + }, + { + "epoch": 0.792173482305025, + "grad_norm": 0.687045156955719, + "learning_rate": 6.6282005254424566e-06, + "loss": 0.7451, + "step": 14393 + }, + { + "epoch": 0.7922285211073807, + "grad_norm": 0.7129287123680115, + "learning_rate": 6.627790679326389e-06, + "loss": 0.8495, + "step": 14394 + }, + { + "epoch": 0.7922835599097363, + "grad_norm": 0.6951952576637268, + "learning_rate": 6.627380820976283e-06, + "loss": 0.7895, + "step": 14395 + }, + { + "epoch": 0.792338598712092, + "grad_norm": 0.8020780086517334, + "learning_rate": 6.626970950395221e-06, + "loss": 0.7136, + "step": 14396 + }, + { + "epoch": 0.7923936375144477, + "grad_norm": 0.6654007434844971, + "learning_rate": 6.626561067586279e-06, + "loss": 0.7865, + "step": 14397 + }, + { + "epoch": 0.7924486763168034, + "grad_norm": 0.844744861125946, + "learning_rate": 6.62615117255254e-06, + "loss": 0.7856, + "step": 14398 + }, + { + "epoch": 0.792503715119159, + "grad_norm": 0.6890879273414612, + "learning_rate": 6.625741265297083e-06, + "loss": 0.7574, + "step": 14399 + }, + { + "epoch": 0.7925587539215146, + "grad_norm": 0.7559735774993896, + "learning_rate": 6.625331345822992e-06, + "loss": 0.634, + "step": 14400 + }, + { + "epoch": 0.7926137927238703, + "grad_norm": 0.6918107867240906, + "learning_rate": 6.624921414133344e-06, + "loss": 0.6935, + "step": 14401 + }, + { + "epoch": 0.792668831526226, + "grad_norm": 0.7468792200088501, + "learning_rate": 6.624511470231221e-06, + "loss": 0.7301, + "step": 14402 + }, + { + "epoch": 0.7927238703285816, + "grad_norm": 0.6749486327171326, + "learning_rate": 6.624101514119705e-06, + "loss": 0.7143, + "step": 14403 + }, + { + "epoch": 0.7927789091309373, + "grad_norm": 0.7765836119651794, + "learning_rate": 6.623691545801878e-06, + "loss": 0.7201, + "step": 14404 + }, + { + "epoch": 0.792833947933293, + "grad_norm": 0.6263312697410583, + "learning_rate": 6.623281565280819e-06, + "loss": 0.5866, + "step": 14405 + }, + { + "epoch": 0.7928889867356487, + "grad_norm": 0.6325232982635498, + "learning_rate": 6.62287157255961e-06, + "loss": 0.7389, + "step": 14406 + }, + { + "epoch": 0.7929440255380042, + "grad_norm": 0.7165958881378174, + "learning_rate": 6.622461567641333e-06, + "loss": 0.7378, + "step": 14407 + }, + { + "epoch": 0.7929990643403599, + "grad_norm": 0.7611519694328308, + "learning_rate": 6.62205155052907e-06, + "loss": 0.7146, + "step": 14408 + }, + { + "epoch": 0.7930541031427156, + "grad_norm": 0.6764969825744629, + "learning_rate": 6.6216415212259e-06, + "loss": 0.7802, + "step": 14409 + }, + { + "epoch": 0.7931091419450713, + "grad_norm": 0.7266956567764282, + "learning_rate": 6.621231479734908e-06, + "loss": 0.7065, + "step": 14410 + }, + { + "epoch": 0.7931641807474269, + "grad_norm": 0.7540454268455505, + "learning_rate": 6.620821426059174e-06, + "loss": 0.7327, + "step": 14411 + }, + { + "epoch": 0.7932192195497826, + "grad_norm": 0.7931423783302307, + "learning_rate": 6.620411360201779e-06, + "loss": 0.8032, + "step": 14412 + }, + { + "epoch": 0.7932742583521383, + "grad_norm": 1.2976648807525635, + "learning_rate": 6.620001282165808e-06, + "loss": 0.7422, + "step": 14413 + }, + { + "epoch": 0.793329297154494, + "grad_norm": 0.6525906920433044, + "learning_rate": 6.619591191954338e-06, + "loss": 0.6857, + "step": 14414 + }, + { + "epoch": 0.7933843359568495, + "grad_norm": 0.6153263449668884, + "learning_rate": 6.619181089570456e-06, + "loss": 0.6117, + "step": 14415 + }, + { + "epoch": 0.7934393747592052, + "grad_norm": 0.7076815962791443, + "learning_rate": 6.6187709750172425e-06, + "loss": 0.8053, + "step": 14416 + }, + { + "epoch": 0.7934944135615609, + "grad_norm": 0.6999046802520752, + "learning_rate": 6.618360848297779e-06, + "loss": 0.6275, + "step": 14417 + }, + { + "epoch": 0.7935494523639166, + "grad_norm": 0.7043859958648682, + "learning_rate": 6.6179507094151484e-06, + "loss": 0.8273, + "step": 14418 + }, + { + "epoch": 0.7936044911662722, + "grad_norm": 0.6295393705368042, + "learning_rate": 6.617540558372434e-06, + "loss": 0.6394, + "step": 14419 + }, + { + "epoch": 0.7936595299686279, + "grad_norm": 0.8165664076805115, + "learning_rate": 6.617130395172718e-06, + "loss": 0.8473, + "step": 14420 + }, + { + "epoch": 0.7937145687709836, + "grad_norm": 0.7598135471343994, + "learning_rate": 6.616720219819082e-06, + "loss": 0.729, + "step": 14421 + }, + { + "epoch": 0.7937696075733393, + "grad_norm": 0.7222034335136414, + "learning_rate": 6.6163100323146105e-06, + "loss": 0.7526, + "step": 14422 + }, + { + "epoch": 0.7938246463756948, + "grad_norm": 0.7994693517684937, + "learning_rate": 6.615899832662385e-06, + "loss": 0.8346, + "step": 14423 + }, + { + "epoch": 0.7938796851780505, + "grad_norm": 0.6603162884712219, + "learning_rate": 6.615489620865489e-06, + "loss": 0.7546, + "step": 14424 + }, + { + "epoch": 0.7939347239804062, + "grad_norm": 0.6525929570198059, + "learning_rate": 6.615079396927005e-06, + "loss": 0.7344, + "step": 14425 + }, + { + "epoch": 0.7939897627827619, + "grad_norm": 0.6144835948944092, + "learning_rate": 6.614669160850016e-06, + "loss": 0.6776, + "step": 14426 + }, + { + "epoch": 0.7940448015851175, + "grad_norm": 0.7205507159233093, + "learning_rate": 6.614258912637607e-06, + "loss": 0.809, + "step": 14427 + }, + { + "epoch": 0.7940998403874732, + "grad_norm": 0.6757732629776001, + "learning_rate": 6.61384865229286e-06, + "loss": 0.7403, + "step": 14428 + }, + { + "epoch": 0.7941548791898289, + "grad_norm": 0.6392103433609009, + "learning_rate": 6.6134383798188586e-06, + "loss": 0.7689, + "step": 14429 + }, + { + "epoch": 0.7942099179921845, + "grad_norm": 0.6647289395332336, + "learning_rate": 6.613028095218685e-06, + "loss": 0.6611, + "step": 14430 + }, + { + "epoch": 0.7942649567945401, + "grad_norm": 0.6961668133735657, + "learning_rate": 6.612617798495426e-06, + "loss": 0.7784, + "step": 14431 + }, + { + "epoch": 0.7943199955968958, + "grad_norm": 1.1188037395477295, + "learning_rate": 6.6122074896521615e-06, + "loss": 0.6518, + "step": 14432 + }, + { + "epoch": 0.7943750343992515, + "grad_norm": 0.6382507085800171, + "learning_rate": 6.611797168691978e-06, + "loss": 0.6954, + "step": 14433 + }, + { + "epoch": 0.7944300732016071, + "grad_norm": 0.6720117330551147, + "learning_rate": 6.6113868356179585e-06, + "loss": 0.7267, + "step": 14434 + }, + { + "epoch": 0.7944851120039628, + "grad_norm": 0.6667274832725525, + "learning_rate": 6.610976490433186e-06, + "loss": 0.6867, + "step": 14435 + }, + { + "epoch": 0.7945401508063185, + "grad_norm": 0.658217191696167, + "learning_rate": 6.610566133140747e-06, + "loss": 0.66, + "step": 14436 + }, + { + "epoch": 0.7945951896086741, + "grad_norm": 0.6820386648178101, + "learning_rate": 6.610155763743723e-06, + "loss": 0.7352, + "step": 14437 + }, + { + "epoch": 0.7946502284110297, + "grad_norm": 0.788696825504303, + "learning_rate": 6.609745382245198e-06, + "loss": 0.6822, + "step": 14438 + }, + { + "epoch": 0.7947052672133854, + "grad_norm": 0.6485540270805359, + "learning_rate": 6.6093349886482596e-06, + "loss": 0.718, + "step": 14439 + }, + { + "epoch": 0.7947603060157411, + "grad_norm": 0.717659056186676, + "learning_rate": 6.60892458295599e-06, + "loss": 0.7898, + "step": 14440 + }, + { + "epoch": 0.7948153448180968, + "grad_norm": 0.6576352119445801, + "learning_rate": 6.608514165171473e-06, + "loss": 0.8041, + "step": 14441 + }, + { + "epoch": 0.7948703836204524, + "grad_norm": 0.7034726738929749, + "learning_rate": 6.608103735297795e-06, + "loss": 0.7901, + "step": 14442 + }, + { + "epoch": 0.7949254224228081, + "grad_norm": 0.7001451253890991, + "learning_rate": 6.6076932933380386e-06, + "loss": 0.6814, + "step": 14443 + }, + { + "epoch": 0.7949804612251637, + "grad_norm": 0.789359450340271, + "learning_rate": 6.607282839295291e-06, + "loss": 0.744, + "step": 14444 + }, + { + "epoch": 0.7950355000275194, + "grad_norm": 0.7830412983894348, + "learning_rate": 6.606872373172636e-06, + "loss": 0.8161, + "step": 14445 + }, + { + "epoch": 0.795090538829875, + "grad_norm": 0.6462455987930298, + "learning_rate": 6.606461894973157e-06, + "loss": 0.7723, + "step": 14446 + }, + { + "epoch": 0.7951455776322307, + "grad_norm": 0.6232526898384094, + "learning_rate": 6.606051404699943e-06, + "loss": 0.6723, + "step": 14447 + }, + { + "epoch": 0.7952006164345864, + "grad_norm": 0.7790026068687439, + "learning_rate": 6.605640902356074e-06, + "loss": 0.7687, + "step": 14448 + }, + { + "epoch": 0.7952556552369421, + "grad_norm": 0.7281851768493652, + "learning_rate": 6.605230387944639e-06, + "loss": 0.827, + "step": 14449 + }, + { + "epoch": 0.7953106940392977, + "grad_norm": 0.6519556045532227, + "learning_rate": 6.604819861468721e-06, + "loss": 0.7039, + "step": 14450 + }, + { + "epoch": 0.7953657328416534, + "grad_norm": 0.6768763661384583, + "learning_rate": 6.604409322931406e-06, + "loss": 0.7288, + "step": 14451 + }, + { + "epoch": 0.795420771644009, + "grad_norm": 0.7457320094108582, + "learning_rate": 6.6039987723357825e-06, + "loss": 0.8386, + "step": 14452 + }, + { + "epoch": 0.7954758104463647, + "grad_norm": 0.9579072594642639, + "learning_rate": 6.6035882096849325e-06, + "loss": 0.7552, + "step": 14453 + }, + { + "epoch": 0.7955308492487203, + "grad_norm": 0.6709916591644287, + "learning_rate": 6.603177634981941e-06, + "loss": 0.724, + "step": 14454 + }, + { + "epoch": 0.795585888051076, + "grad_norm": 0.6097317934036255, + "learning_rate": 6.602767048229897e-06, + "loss": 0.6866, + "step": 14455 + }, + { + "epoch": 0.7956409268534317, + "grad_norm": 0.7303394675254822, + "learning_rate": 6.602356449431885e-06, + "loss": 0.682, + "step": 14456 + }, + { + "epoch": 0.7956959656557874, + "grad_norm": 0.775979220867157, + "learning_rate": 6.601945838590991e-06, + "loss": 0.7784, + "step": 14457 + }, + { + "epoch": 0.795751004458143, + "grad_norm": 0.7016483545303345, + "learning_rate": 6.6015352157103e-06, + "loss": 0.7557, + "step": 14458 + }, + { + "epoch": 0.7958060432604986, + "grad_norm": 0.688946545124054, + "learning_rate": 6.6011245807929e-06, + "loss": 0.707, + "step": 14459 + }, + { + "epoch": 0.7958610820628543, + "grad_norm": 0.7286174297332764, + "learning_rate": 6.600713933841877e-06, + "loss": 0.784, + "step": 14460 + }, + { + "epoch": 0.79591612086521, + "grad_norm": 0.7604749798774719, + "learning_rate": 6.600303274860316e-06, + "loss": 0.7099, + "step": 14461 + }, + { + "epoch": 0.7959711596675656, + "grad_norm": 0.6626706123352051, + "learning_rate": 6.599892603851301e-06, + "loss": 0.7137, + "step": 14462 + }, + { + "epoch": 0.7960261984699213, + "grad_norm": 0.7692080736160278, + "learning_rate": 6.599481920817925e-06, + "loss": 0.847, + "step": 14463 + }, + { + "epoch": 0.796081237272277, + "grad_norm": 0.6811042428016663, + "learning_rate": 6.599071225763269e-06, + "loss": 0.7888, + "step": 14464 + }, + { + "epoch": 0.7961362760746327, + "grad_norm": 0.654481053352356, + "learning_rate": 6.598660518690424e-06, + "loss": 0.6973, + "step": 14465 + }, + { + "epoch": 0.7961913148769882, + "grad_norm": 0.7332738637924194, + "learning_rate": 6.598249799602472e-06, + "loss": 0.8311, + "step": 14466 + }, + { + "epoch": 0.7962463536793439, + "grad_norm": 0.7098381519317627, + "learning_rate": 6.597839068502503e-06, + "loss": 0.8265, + "step": 14467 + }, + { + "epoch": 0.7963013924816996, + "grad_norm": 0.6338212490081787, + "learning_rate": 6.597428325393604e-06, + "loss": 0.6889, + "step": 14468 + }, + { + "epoch": 0.7963564312840553, + "grad_norm": 0.7001339197158813, + "learning_rate": 6.597017570278861e-06, + "loss": 0.7613, + "step": 14469 + }, + { + "epoch": 0.7964114700864109, + "grad_norm": 0.6565783619880676, + "learning_rate": 6.596606803161361e-06, + "loss": 0.6284, + "step": 14470 + }, + { + "epoch": 0.7964665088887666, + "grad_norm": 0.6638015508651733, + "learning_rate": 6.5961960240441935e-06, + "loss": 0.6635, + "step": 14471 + }, + { + "epoch": 0.7965215476911223, + "grad_norm": 0.6389575600624084, + "learning_rate": 6.595785232930443e-06, + "loss": 0.6588, + "step": 14472 + }, + { + "epoch": 0.796576586493478, + "grad_norm": 0.9486858248710632, + "learning_rate": 6.595374429823197e-06, + "loss": 0.8314, + "step": 14473 + }, + { + "epoch": 0.7966316252958335, + "grad_norm": 0.7555649280548096, + "learning_rate": 6.594963614725544e-06, + "loss": 0.8173, + "step": 14474 + }, + { + "epoch": 0.7966866640981892, + "grad_norm": 0.63021320104599, + "learning_rate": 6.5945527876405715e-06, + "loss": 0.7038, + "step": 14475 + }, + { + "epoch": 0.7967417029005449, + "grad_norm": 0.802980899810791, + "learning_rate": 6.594141948571366e-06, + "loss": 0.8031, + "step": 14476 + }, + { + "epoch": 0.7967967417029005, + "grad_norm": 0.7204614281654358, + "learning_rate": 6.593731097521019e-06, + "loss": 0.827, + "step": 14477 + }, + { + "epoch": 0.7968517805052562, + "grad_norm": 0.6805211305618286, + "learning_rate": 6.593320234492613e-06, + "loss": 0.7405, + "step": 14478 + }, + { + "epoch": 0.7969068193076119, + "grad_norm": 0.7011345028877258, + "learning_rate": 6.59290935948924e-06, + "loss": 0.7241, + "step": 14479 + }, + { + "epoch": 0.7969618581099676, + "grad_norm": 0.8995540738105774, + "learning_rate": 6.592498472513986e-06, + "loss": 0.6864, + "step": 14480 + }, + { + "epoch": 0.7970168969123231, + "grad_norm": 0.7518284320831299, + "learning_rate": 6.592087573569941e-06, + "loss": 0.7561, + "step": 14481 + }, + { + "epoch": 0.7970719357146788, + "grad_norm": 0.6359231472015381, + "learning_rate": 6.591676662660191e-06, + "loss": 0.6402, + "step": 14482 + }, + { + "epoch": 0.7971269745170345, + "grad_norm": 0.6610120534896851, + "learning_rate": 6.5912657397878264e-06, + "loss": 0.6419, + "step": 14483 + }, + { + "epoch": 0.7971820133193902, + "grad_norm": 0.7054341435432434, + "learning_rate": 6.590854804955934e-06, + "loss": 0.7252, + "step": 14484 + }, + { + "epoch": 0.7972370521217458, + "grad_norm": 0.6929903626441956, + "learning_rate": 6.5904438581676025e-06, + "loss": 0.6566, + "step": 14485 + }, + { + "epoch": 0.7972920909241015, + "grad_norm": 0.7354124188423157, + "learning_rate": 6.59003289942592e-06, + "loss": 0.763, + "step": 14486 + }, + { + "epoch": 0.7973471297264572, + "grad_norm": 0.6366610527038574, + "learning_rate": 6.5896219287339755e-06, + "loss": 0.6601, + "step": 14487 + }, + { + "epoch": 0.7974021685288128, + "grad_norm": 0.6916924715042114, + "learning_rate": 6.589210946094859e-06, + "loss": 0.7683, + "step": 14488 + }, + { + "epoch": 0.7974572073311684, + "grad_norm": 0.6567399501800537, + "learning_rate": 6.5887999515116586e-06, + "loss": 0.7487, + "step": 14489 + }, + { + "epoch": 0.7975122461335241, + "grad_norm": 0.8082888722419739, + "learning_rate": 6.5883889449874626e-06, + "loss": 0.7579, + "step": 14490 + }, + { + "epoch": 0.7975672849358798, + "grad_norm": 0.7138401865959167, + "learning_rate": 6.58797792652536e-06, + "loss": 0.7256, + "step": 14491 + }, + { + "epoch": 0.7976223237382355, + "grad_norm": 0.6514482498168945, + "learning_rate": 6.587566896128441e-06, + "loss": 0.6612, + "step": 14492 + }, + { + "epoch": 0.7976773625405911, + "grad_norm": 0.6770455837249756, + "learning_rate": 6.587155853799795e-06, + "loss": 0.677, + "step": 14493 + }, + { + "epoch": 0.7977324013429468, + "grad_norm": 0.6956327557563782, + "learning_rate": 6.586744799542511e-06, + "loss": 0.7824, + "step": 14494 + }, + { + "epoch": 0.7977874401453025, + "grad_norm": 0.6565653085708618, + "learning_rate": 6.586333733359676e-06, + "loss": 0.7496, + "step": 14495 + }, + { + "epoch": 0.7978424789476581, + "grad_norm": 0.6353399157524109, + "learning_rate": 6.585922655254382e-06, + "loss": 0.7264, + "step": 14496 + }, + { + "epoch": 0.7978975177500137, + "grad_norm": 1.037051796913147, + "learning_rate": 6.585511565229717e-06, + "loss": 0.7562, + "step": 14497 + }, + { + "epoch": 0.7979525565523694, + "grad_norm": 0.6447896957397461, + "learning_rate": 6.5851004632887725e-06, + "loss": 0.7509, + "step": 14498 + }, + { + "epoch": 0.7980075953547251, + "grad_norm": 0.7022401690483093, + "learning_rate": 6.584689349434636e-06, + "loss": 0.7752, + "step": 14499 + }, + { + "epoch": 0.7980626341570808, + "grad_norm": 0.7033591270446777, + "learning_rate": 6.5842782236703996e-06, + "loss": 0.7693, + "step": 14500 + }, + { + "epoch": 0.7981176729594364, + "grad_norm": 0.7061769962310791, + "learning_rate": 6.583867085999151e-06, + "loss": 0.6833, + "step": 14501 + }, + { + "epoch": 0.798172711761792, + "grad_norm": 0.7934882640838623, + "learning_rate": 6.583455936423984e-06, + "loss": 0.799, + "step": 14502 + }, + { + "epoch": 0.7982277505641477, + "grad_norm": 0.6968011260032654, + "learning_rate": 6.5830447749479835e-06, + "loss": 0.7132, + "step": 14503 + }, + { + "epoch": 0.7982827893665034, + "grad_norm": 1.7348299026489258, + "learning_rate": 6.582633601574243e-06, + "loss": 0.8996, + "step": 14504 + }, + { + "epoch": 0.798337828168859, + "grad_norm": 0.6822964549064636, + "learning_rate": 6.582222416305852e-06, + "loss": 0.7381, + "step": 14505 + }, + { + "epoch": 0.7983928669712147, + "grad_norm": 0.6600543856620789, + "learning_rate": 6.581811219145902e-06, + "loss": 0.711, + "step": 14506 + }, + { + "epoch": 0.7984479057735704, + "grad_norm": 0.8719834089279175, + "learning_rate": 6.581400010097481e-06, + "loss": 0.7567, + "step": 14507 + }, + { + "epoch": 0.7985029445759261, + "grad_norm": 0.7221046090126038, + "learning_rate": 6.580988789163681e-06, + "loss": 0.7417, + "step": 14508 + }, + { + "epoch": 0.7985579833782817, + "grad_norm": 0.6720401048660278, + "learning_rate": 6.580577556347592e-06, + "loss": 0.7467, + "step": 14509 + }, + { + "epoch": 0.7986130221806373, + "grad_norm": 0.7007263898849487, + "learning_rate": 6.580166311652306e-06, + "loss": 0.7356, + "step": 14510 + }, + { + "epoch": 0.798668060982993, + "grad_norm": 0.7384739518165588, + "learning_rate": 6.579755055080912e-06, + "loss": 0.7807, + "step": 14511 + }, + { + "epoch": 0.7987230997853487, + "grad_norm": 0.8054519295692444, + "learning_rate": 6.579343786636503e-06, + "loss": 0.7737, + "step": 14512 + }, + { + "epoch": 0.7987781385877043, + "grad_norm": 1.042319655418396, + "learning_rate": 6.578932506322169e-06, + "loss": 0.8708, + "step": 14513 + }, + { + "epoch": 0.79883317739006, + "grad_norm": 0.7122198343276978, + "learning_rate": 6.578521214141e-06, + "loss": 0.7818, + "step": 14514 + }, + { + "epoch": 0.7988882161924157, + "grad_norm": 0.9158271551132202, + "learning_rate": 6.578109910096088e-06, + "loss": 0.7439, + "step": 14515 + }, + { + "epoch": 0.7989432549947714, + "grad_norm": 0.7280082106590271, + "learning_rate": 6.577698594190524e-06, + "loss": 0.7888, + "step": 14516 + }, + { + "epoch": 0.798998293797127, + "grad_norm": 0.8203748464584351, + "learning_rate": 6.577287266427401e-06, + "loss": 0.7669, + "step": 14517 + }, + { + "epoch": 0.7990533325994826, + "grad_norm": 0.6998257637023926, + "learning_rate": 6.576875926809809e-06, + "loss": 0.7819, + "step": 14518 + }, + { + "epoch": 0.7991083714018383, + "grad_norm": 0.672575831413269, + "learning_rate": 6.57646457534084e-06, + "loss": 0.7359, + "step": 14519 + }, + { + "epoch": 0.7991634102041939, + "grad_norm": 0.931996762752533, + "learning_rate": 6.5760532120235845e-06, + "loss": 0.8816, + "step": 14520 + }, + { + "epoch": 0.7992184490065496, + "grad_norm": 0.7250553369522095, + "learning_rate": 6.575641836861134e-06, + "loss": 0.7924, + "step": 14521 + }, + { + "epoch": 0.7992734878089053, + "grad_norm": 0.6658768057823181, + "learning_rate": 6.575230449856582e-06, + "loss": 0.7064, + "step": 14522 + }, + { + "epoch": 0.799328526611261, + "grad_norm": 0.6901206374168396, + "learning_rate": 6.57481905101302e-06, + "loss": 0.7826, + "step": 14523 + }, + { + "epoch": 0.7993835654136165, + "grad_norm": 0.6772152781486511, + "learning_rate": 6.5744076403335386e-06, + "loss": 0.8143, + "step": 14524 + }, + { + "epoch": 0.7994386042159722, + "grad_norm": 0.6718147397041321, + "learning_rate": 6.5739962178212325e-06, + "loss": 0.765, + "step": 14525 + }, + { + "epoch": 0.7994936430183279, + "grad_norm": 0.7435488700866699, + "learning_rate": 6.573584783479191e-06, + "loss": 0.8685, + "step": 14526 + }, + { + "epoch": 0.7995486818206836, + "grad_norm": 0.7146314382553101, + "learning_rate": 6.573173337310506e-06, + "loss": 0.7605, + "step": 14527 + }, + { + "epoch": 0.7996037206230392, + "grad_norm": 0.6808409690856934, + "learning_rate": 6.572761879318274e-06, + "loss": 0.6996, + "step": 14528 + }, + { + "epoch": 0.7996587594253949, + "grad_norm": 1.1303905248641968, + "learning_rate": 6.572350409505584e-06, + "loss": 0.6107, + "step": 14529 + }, + { + "epoch": 0.7997137982277506, + "grad_norm": 0.7584583163261414, + "learning_rate": 6.571938927875529e-06, + "loss": 0.771, + "step": 14530 + }, + { + "epoch": 0.7997688370301063, + "grad_norm": 0.808233916759491, + "learning_rate": 6.5715274344312015e-06, + "loss": 0.7179, + "step": 14531 + }, + { + "epoch": 0.7998238758324618, + "grad_norm": 0.7067314386367798, + "learning_rate": 6.571115929175695e-06, + "loss": 0.7519, + "step": 14532 + }, + { + "epoch": 0.7998789146348175, + "grad_norm": 0.7611628174781799, + "learning_rate": 6.570704412112101e-06, + "loss": 0.8727, + "step": 14533 + }, + { + "epoch": 0.7999339534371732, + "grad_norm": 0.6485727429389954, + "learning_rate": 6.5702928832435145e-06, + "loss": 0.8455, + "step": 14534 + }, + { + "epoch": 0.7999889922395289, + "grad_norm": 1.5309134721755981, + "learning_rate": 6.569881342573024e-06, + "loss": 0.8362, + "step": 14535 + }, + { + "epoch": 0.8000440310418845, + "grad_norm": 0.7068225145339966, + "learning_rate": 6.569469790103729e-06, + "loss": 0.7924, + "step": 14536 + }, + { + "epoch": 0.8000990698442402, + "grad_norm": 0.7326669692993164, + "learning_rate": 6.569058225838717e-06, + "loss": 0.7594, + "step": 14537 + }, + { + "epoch": 0.8001541086465959, + "grad_norm": 0.6705706119537354, + "learning_rate": 6.568646649781085e-06, + "loss": 0.7331, + "step": 14538 + }, + { + "epoch": 0.8002091474489516, + "grad_norm": 0.7303051948547363, + "learning_rate": 6.568235061933923e-06, + "loss": 0.7274, + "step": 14539 + }, + { + "epoch": 0.8002641862513071, + "grad_norm": 0.6334550380706787, + "learning_rate": 6.567823462300326e-06, + "loss": 0.7105, + "step": 14540 + }, + { + "epoch": 0.8003192250536628, + "grad_norm": 0.7183839678764343, + "learning_rate": 6.56741185088339e-06, + "loss": 0.657, + "step": 14541 + }, + { + "epoch": 0.8003742638560185, + "grad_norm": 0.6896400451660156, + "learning_rate": 6.567000227686204e-06, + "loss": 0.7752, + "step": 14542 + }, + { + "epoch": 0.8004293026583742, + "grad_norm": 0.7214651703834534, + "learning_rate": 6.566588592711864e-06, + "loss": 0.753, + "step": 14543 + }, + { + "epoch": 0.8004843414607298, + "grad_norm": 0.7064470648765564, + "learning_rate": 6.566176945963464e-06, + "loss": 0.744, + "step": 14544 + } + ], + "logging_steps": 1, + "max_steps": 36338, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 909, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.292023542595191e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-14544/training_args.bin b/checkpoint-14544/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4fcf8689837015e25934915ab36e9943776ca6cd --- /dev/null +++ b/checkpoint-14544/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c62f9cafd9057de88f53b2d6143eaf1e38cf3558d65c4e5642eaa284f31d316 +size 7928 diff --git a/checkpoint-14544/zero_to_fp32.py b/checkpoint-14544/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-14544/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-15453/config.json b/checkpoint-15453/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fda0153f8ee396146a87c398da9234b3dce005be --- /dev/null +++ b/checkpoint-15453/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "./meta-llama_Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 128259 +} diff --git a/checkpoint-15453/generation_config.json b/checkpoint-15453/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eab5082496e8b01f9c606a306676cbfabe0cce9d --- /dev/null +++ b/checkpoint-15453/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.46.1" +} diff --git a/checkpoint-15453/global_step15453/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d46292fe69a1361dce8e02608c947a2b697144ef --- /dev/null +++ b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1252e733ad59d0bb150d32f0b3c6500fdf707ce212cf44daca95c0ada2af8a0a +size 12045435328 diff --git a/checkpoint-15453/global_step15453/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2108aa026b1d62654581143d539c5fb4b3c4196f --- /dev/null +++ b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:559c9c6f8e2ead6ea68c57234aa07e81fb4aae54ebf3c0bcece69b705ff1844e +size 12045436096 diff --git a/checkpoint-15453/global_step15453/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b4d6b50d56da8895f0c04d71cfe63a0b325e550 --- /dev/null +++ b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b428d1ad950acdc71c8cb871c362cc032fe9bcd9887c2a0d2301b79c2bce500 +size 12045436352 diff --git a/checkpoint-15453/global_step15453/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0643a538214183f36fdee3efe2030a9477248a6f --- /dev/null +++ b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416a0be6f2ff63b6181bf692e9d694c3002ea522cc32d85e25f3f64c6ee3134f +size 12045436096 diff --git a/checkpoint-15453/global_step15453/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fc11541971b784c4331482aa4b3b5683edc8ab3 --- /dev/null +++ b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6766e0d20cbdcc28c2fa435aa3539a2df37d724dd59fa70ddf2a0022c090bdef +size 12045436352 diff --git a/checkpoint-15453/global_step15453/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0379e8c735062d098e4a3333c80c4ee90721cc3c --- /dev/null +++ b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b1cc76c468d262c82dee1e117a356a6ec255f80d00ee65bbb3fb52b228f806 +size 12045436416 diff --git a/checkpoint-15453/global_step15453/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f8fbec7b5110e4ed0f9d573858d8b448b88e058 --- /dev/null +++ b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6e63e46b2ea718ed053c171b02c0f6f7eec3728305bbd9793ae833094919a3c +size 12045436096 diff --git a/checkpoint-15453/global_step15453/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af17475f7736cef28c67eebc422190b3540b7053 --- /dev/null +++ b/checkpoint-15453/global_step15453/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deacc7c302c0f6b8f8d2a05de38d73b0b24bbca31289f8059a41cad417b88351 +size 12045435008 diff --git a/checkpoint-15453/global_step15453/mp_rank_00_model_states.pt b/checkpoint-15453/global_step15453/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d751a41b2954f7af92d4dc28ba166692faf367e2 --- /dev/null +++ b/checkpoint-15453/global_step15453/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4f0ae1bcd8985534c470be3ba8c57adfdfb91e2ace1cf01dda708362db251f0 +size 16060659704 diff --git a/checkpoint-15453/latest b/checkpoint-15453/latest new file mode 100644 index 0000000000000000000000000000000000000000..9da9f9fbc5f66cf66b54f7c2a776e502478d9976 --- /dev/null +++ b/checkpoint-15453/latest @@ -0,0 +1 @@ +global_step15453 \ No newline at end of file diff --git a/checkpoint-15453/model-00001-of-00004.safetensors b/checkpoint-15453/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5d69badc9fecb288291c1398aa5305ae191a02b0 --- /dev/null +++ b/checkpoint-15453/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:836521e7868b1bc7daeee62fc86018a9bf4910acabae66ba79415171a964e207 +size 4976723248 diff --git a/checkpoint-15453/model-00002-of-00004.safetensors b/checkpoint-15453/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0fc35438a09cb27349c829a186ebf77bd9aa7bbf --- /dev/null +++ b/checkpoint-15453/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf8526d0a328466106902d346d33e46caa67dca91d62988999a65eca51108068 +size 4999802720 diff --git a/checkpoint-15453/model-00003-of-00004.safetensors b/checkpoint-15453/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..047c847928519a2ed9b0474686c095526d8d26f1 --- /dev/null +++ b/checkpoint-15453/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:567dedf0f024a5d12f516d54ebf5799127b23487e2e6ec2d6a79e48f7861d283 +size 4915916176 diff --git a/checkpoint-15453/model-00004-of-00004.safetensors b/checkpoint-15453/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..97af8561f5c1d66d21e4384d03d92be3c1482d93 --- /dev/null +++ b/checkpoint-15453/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9e7d19eb0155b7ef2576b99bab6116fd3e95407e6e8a53ab1e7e4f904436a17 +size 1168163384 diff --git a/checkpoint-15453/model.safetensors.index.json b/checkpoint-15453/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..e734f8f9bcabe95e936a11f19b77148f54640122 --- /dev/null +++ b/checkpoint-15453/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060571648 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/checkpoint-15453/rng_state_0.pth b/checkpoint-15453/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6473612e41c5cfd6973c2e71fa5f3ad2b2bcad1 --- /dev/null +++ b/checkpoint-15453/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2 +size 15984 diff --git a/checkpoint-15453/rng_state_1.pth b/checkpoint-15453/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8506e00431b6ac7067699c0ea4f59adb6fa0ba20 --- /dev/null +++ b/checkpoint-15453/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0728b56dab7abb5ef8a0d4bae3519c5767c97467bdd886d26bf19cc8599d0312 +size 15984 diff --git a/checkpoint-15453/rng_state_2.pth b/checkpoint-15453/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea499e285c97cca07fedd34662c3d4ab44ff6f47 --- /dev/null +++ b/checkpoint-15453/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e481d4ef1546694da7337f6bb6c658b866dcb79b85deeb477da0d27ebe851e +size 15984 diff --git a/checkpoint-15453/rng_state_3.pth b/checkpoint-15453/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..aeb38f92f106ac3f08bae4f82179a8a12243bccb --- /dev/null +++ b/checkpoint-15453/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353c60be37ea56fc992fca446598ceca5d1fd002aa3bd6dbb9ad740e6f47ebb3 +size 15984 diff --git a/checkpoint-15453/rng_state_4.pth b/checkpoint-15453/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d5856cb7a3f15092fa5593507022316916f648e --- /dev/null +++ b/checkpoint-15453/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9107fe964ba7205e354084b85210e5a5ea1c98cfd4d38adb9cd3926945dcae4 +size 15984 diff --git a/checkpoint-15453/rng_state_5.pth b/checkpoint-15453/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b824ee24d256695aad4a69a62d8e7125f51a17f2 --- /dev/null +++ b/checkpoint-15453/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d1bb1abee38b92e53f3f23549b642ce0f1edcdccf7b6129847ac61636e96d5 +size 15984 diff --git a/checkpoint-15453/rng_state_6.pth b/checkpoint-15453/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9fd0364bb8f1a8e91eca45be5e1b6672b4d9afd --- /dev/null +++ b/checkpoint-15453/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd5516048e20f36959601574e29e40106085a7d3cdc7bf425ce5e84633490e6 +size 15984 diff --git a/checkpoint-15453/rng_state_7.pth b/checkpoint-15453/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e80125fd18efcb1097384319888b699f4dce7e7 --- /dev/null +++ b/checkpoint-15453/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2c46927fc06939b4c976a01e4b95dec1f8b98ceaea86d31a5d756fc30ff006 +size 15984 diff --git a/checkpoint-15453/scheduler.pt b/checkpoint-15453/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba63c09273c60bc14b24c9ae9071efeff73a745f --- /dev/null +++ b/checkpoint-15453/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad072d21fc2d31c0824970ba97fd434fdd6608c686b6458a05b723c21e8bc1b8 +size 1064 diff --git a/checkpoint-15453/special_tokens_map.json b/checkpoint-15453/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-15453/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-15453/tokenizer.json b/checkpoint-15453/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9d29771c68b37af9541b4c450532cb095b564ca5 --- /dev/null +++ b/checkpoint-15453/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91a36f231bc2882e8c2e1859bc27098f73c95ea211ccb73ad0cdb441a16f49c6 +size 17210280 diff --git a/checkpoint-15453/tokenizer_config.json b/checkpoint-15453/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a695c457b54a00f10768564f6c25b0142ccc840 --- /dev/null +++ b/checkpoint-15453/tokenizer_config.json @@ -0,0 +1,2087 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|im_title|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|end_title|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|im_op|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|end_op|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|im_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|end_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|begin_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|end_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|im_khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|end_khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|im_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|end_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|autheur|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|sujet|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|><|khey|><|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-15453/trainer_state.json b/checkpoint-15453/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c5614b94bce1a82e88b7343798db245ae2a7bc1f --- /dev/null +++ b/checkpoint-15453/trainer_state.json @@ -0,0 +1,108204 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8505146128020254, + "eval_steps": 500, + "global_step": 15453, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.503880235566074e-05, + "grad_norm": 459.8753356933594, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.303, + "step": 1 + }, + { + "epoch": 0.00011007760471132149, + "grad_norm": 314.2561950683594, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.8226, + "step": 2 + }, + { + "epoch": 0.0001651164070669822, + "grad_norm": 314.1292419433594, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.8517, + "step": 3 + }, + { + "epoch": 0.00022015520942264297, + "grad_norm": 312.4049072265625, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.6248, + "step": 4 + }, + { + "epoch": 0.0002751940117783037, + "grad_norm": 353.7213134765625, + "learning_rate": 5.000000000000001e-07, + "loss": 2.7883, + "step": 5 + }, + { + "epoch": 0.0003302328141339644, + "grad_norm": 278.41668701171875, + "learning_rate": 6.000000000000001e-07, + "loss": 2.5468, + "step": 6 + }, + { + "epoch": 0.0003852716164896252, + "grad_norm": 336.14532470703125, + "learning_rate": 7.000000000000001e-07, + "loss": 2.7721, + "step": 7 + }, + { + "epoch": 0.00044031041884528595, + "grad_norm": 201.19374084472656, + "learning_rate": 8.000000000000001e-07, + "loss": 2.4873, + "step": 8 + }, + { + "epoch": 0.0004953492212009466, + "grad_norm": 184.7027587890625, + "learning_rate": 9.000000000000001e-07, + "loss": 2.6647, + "step": 9 + }, + { + "epoch": 0.0005503880235566074, + "grad_norm": 154.597412109375, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.602, + "step": 10 + }, + { + "epoch": 0.0006054268259122681, + "grad_norm": 40.47785568237305, + "learning_rate": 1.1e-06, + "loss": 2.6716, + "step": 11 + }, + { + "epoch": 0.0006604656282679288, + "grad_norm": 25.338607788085938, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.2631, + "step": 12 + }, + { + "epoch": 0.0007155044306235897, + "grad_norm": 24.976919174194336, + "learning_rate": 1.3e-06, + "loss": 2.3564, + "step": 13 + }, + { + "epoch": 0.0007705432329792504, + "grad_norm": 15.239912033081055, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.3295, + "step": 14 + }, + { + "epoch": 0.0008255820353349112, + "grad_norm": 14.125042915344238, + "learning_rate": 1.5e-06, + "loss": 2.307, + "step": 15 + }, + { + "epoch": 0.0008806208376905719, + "grad_norm": 13.163726806640625, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.1493, + "step": 16 + }, + { + "epoch": 0.0009356596400462326, + "grad_norm": 8.726515769958496, + "learning_rate": 1.7000000000000002e-06, + "loss": 2.0333, + "step": 17 + }, + { + "epoch": 0.0009906984424018933, + "grad_norm": 9.072502136230469, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.2046, + "step": 18 + }, + { + "epoch": 0.001045737244757554, + "grad_norm": 9.412588119506836, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.2001, + "step": 19 + }, + { + "epoch": 0.0011007760471132147, + "grad_norm": 8.67534065246582, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.7679, + "step": 20 + }, + { + "epoch": 0.0011558148494688755, + "grad_norm": 14.015918731689453, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.9566, + "step": 21 + }, + { + "epoch": 0.0012108536518245362, + "grad_norm": 7.9474687576293945, + "learning_rate": 2.2e-06, + "loss": 1.9085, + "step": 22 + }, + { + "epoch": 0.001265892454180197, + "grad_norm": 6.806368350982666, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.7918, + "step": 23 + }, + { + "epoch": 0.0013209312565358577, + "grad_norm": 5.3452582359313965, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.8321, + "step": 24 + }, + { + "epoch": 0.0013759700588915184, + "grad_norm": 8.744244575500488, + "learning_rate": 2.5e-06, + "loss": 1.6317, + "step": 25 + }, + { + "epoch": 0.0014310088612471794, + "grad_norm": 5.304683685302734, + "learning_rate": 2.6e-06, + "loss": 1.6846, + "step": 26 + }, + { + "epoch": 0.00148604766360284, + "grad_norm": 5.650127410888672, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7449, + "step": 27 + }, + { + "epoch": 0.0015410864659585008, + "grad_norm": 5.479269504547119, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.8158, + "step": 28 + }, + { + "epoch": 0.0015961252683141616, + "grad_norm": 4.873537063598633, + "learning_rate": 2.9e-06, + "loss": 1.8015, + "step": 29 + }, + { + "epoch": 0.0016511640706698223, + "grad_norm": 4.971101760864258, + "learning_rate": 3e-06, + "loss": 1.9034, + "step": 30 + }, + { + "epoch": 0.001706202873025483, + "grad_norm": 4.407571315765381, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.9037, + "step": 31 + }, + { + "epoch": 0.0017612416753811438, + "grad_norm": 4.429073810577393, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6812, + "step": 32 + }, + { + "epoch": 0.0018162804777368045, + "grad_norm": 5.16085147857666, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.7627, + "step": 33 + }, + { + "epoch": 0.0018713192800924653, + "grad_norm": 4.0805768966674805, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.6799, + "step": 34 + }, + { + "epoch": 0.001926358082448126, + "grad_norm": 4.548702239990234, + "learning_rate": 3.5e-06, + "loss": 1.7799, + "step": 35 + }, + { + "epoch": 0.0019813968848037865, + "grad_norm": 5.181888580322266, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.8235, + "step": 36 + }, + { + "epoch": 0.0020364356871594475, + "grad_norm": 3.9876129627227783, + "learning_rate": 3.7e-06, + "loss": 1.5999, + "step": 37 + }, + { + "epoch": 0.002091474489515108, + "grad_norm": 6.325051307678223, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.7499, + "step": 38 + }, + { + "epoch": 0.002146513291870769, + "grad_norm": 6.199049949645996, + "learning_rate": 3.900000000000001e-06, + "loss": 1.784, + "step": 39 + }, + { + "epoch": 0.0022015520942264295, + "grad_norm": 4.83912992477417, + "learning_rate": 4.000000000000001e-06, + "loss": 1.8895, + "step": 40 + }, + { + "epoch": 0.0022565908965820904, + "grad_norm": 4.515626907348633, + "learning_rate": 4.1e-06, + "loss": 1.4887, + "step": 41 + }, + { + "epoch": 0.002311629698937751, + "grad_norm": 5.032265663146973, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.7324, + "step": 42 + }, + { + "epoch": 0.002366668501293412, + "grad_norm": 4.1879048347473145, + "learning_rate": 4.3e-06, + "loss": 1.4912, + "step": 43 + }, + { + "epoch": 0.0024217073036490724, + "grad_norm": 4.128026485443115, + "learning_rate": 4.4e-06, + "loss": 1.554, + "step": 44 + }, + { + "epoch": 0.0024767461060047334, + "grad_norm": 4.527958393096924, + "learning_rate": 4.5e-06, + "loss": 1.652, + "step": 45 + }, + { + "epoch": 0.002531784908360394, + "grad_norm": 4.8388190269470215, + "learning_rate": 4.600000000000001e-06, + "loss": 1.6696, + "step": 46 + }, + { + "epoch": 0.002586823710716055, + "grad_norm": 4.2088541984558105, + "learning_rate": 4.7e-06, + "loss": 1.568, + "step": 47 + }, + { + "epoch": 0.0026418625130717154, + "grad_norm": 4.789997577667236, + "learning_rate": 4.800000000000001e-06, + "loss": 1.642, + "step": 48 + }, + { + "epoch": 0.0026969013154273763, + "grad_norm": 4.408346652984619, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.5181, + "step": 49 + }, + { + "epoch": 0.002751940117783037, + "grad_norm": 4.572340488433838, + "learning_rate": 5e-06, + "loss": 1.6698, + "step": 50 + }, + { + "epoch": 0.0028069789201386978, + "grad_norm": 4.728564739227295, + "learning_rate": 5.1e-06, + "loss": 1.5785, + "step": 51 + }, + { + "epoch": 0.0028620177224943587, + "grad_norm": 4.449855327606201, + "learning_rate": 5.2e-06, + "loss": 1.4624, + "step": 52 + }, + { + "epoch": 0.0029170565248500193, + "grad_norm": 4.127189636230469, + "learning_rate": 5.300000000000001e-06, + "loss": 1.6061, + "step": 53 + }, + { + "epoch": 0.00297209532720568, + "grad_norm": 4.244532108306885, + "learning_rate": 5.400000000000001e-06, + "loss": 1.491, + "step": 54 + }, + { + "epoch": 0.0030271341295613407, + "grad_norm": 3.437682628631592, + "learning_rate": 5.500000000000001e-06, + "loss": 1.1967, + "step": 55 + }, + { + "epoch": 0.0030821729319170017, + "grad_norm": 3.83516788482666, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4731, + "step": 56 + }, + { + "epoch": 0.003137211734272662, + "grad_norm": 3.9108972549438477, + "learning_rate": 5.7e-06, + "loss": 1.4393, + "step": 57 + }, + { + "epoch": 0.003192250536628323, + "grad_norm": 3.5258419513702393, + "learning_rate": 5.8e-06, + "loss": 1.4206, + "step": 58 + }, + { + "epoch": 0.0032472893389839837, + "grad_norm": 4.124903678894043, + "learning_rate": 5.9e-06, + "loss": 1.4747, + "step": 59 + }, + { + "epoch": 0.0033023281413396446, + "grad_norm": 4.055769920349121, + "learning_rate": 6e-06, + "loss": 1.4655, + "step": 60 + }, + { + "epoch": 0.003357366943695305, + "grad_norm": 3.904837131500244, + "learning_rate": 6.1e-06, + "loss": 1.5125, + "step": 61 + }, + { + "epoch": 0.003412405746050966, + "grad_norm": 3.2904794216156006, + "learning_rate": 6.200000000000001e-06, + "loss": 1.4596, + "step": 62 + }, + { + "epoch": 0.0034674445484066266, + "grad_norm": 3.24053692817688, + "learning_rate": 6.300000000000001e-06, + "loss": 1.3851, + "step": 63 + }, + { + "epoch": 0.0035224833507622876, + "grad_norm": 3.457639217376709, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.4019, + "step": 64 + }, + { + "epoch": 0.003577522153117948, + "grad_norm": 3.073054790496826, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.2872, + "step": 65 + }, + { + "epoch": 0.003632560955473609, + "grad_norm": 2.6726694107055664, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2361, + "step": 66 + }, + { + "epoch": 0.0036875997578292696, + "grad_norm": 2.9378459453582764, + "learning_rate": 6.700000000000001e-06, + "loss": 1.4452, + "step": 67 + }, + { + "epoch": 0.0037426385601849305, + "grad_norm": 2.81107234954834, + "learning_rate": 6.800000000000001e-06, + "loss": 1.4804, + "step": 68 + }, + { + "epoch": 0.003797677362540591, + "grad_norm": 2.60062313079834, + "learning_rate": 6.9e-06, + "loss": 1.3263, + "step": 69 + }, + { + "epoch": 0.003852716164896252, + "grad_norm": 2.5642921924591064, + "learning_rate": 7e-06, + "loss": 1.2751, + "step": 70 + }, + { + "epoch": 0.0039077549672519125, + "grad_norm": 2.3608031272888184, + "learning_rate": 7.100000000000001e-06, + "loss": 1.2614, + "step": 71 + }, + { + "epoch": 0.003962793769607573, + "grad_norm": 2.7201738357543945, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.5018, + "step": 72 + }, + { + "epoch": 0.004017832571963234, + "grad_norm": 2.584726095199585, + "learning_rate": 7.3e-06, + "loss": 1.3519, + "step": 73 + }, + { + "epoch": 0.004072871374318895, + "grad_norm": 1.9693044424057007, + "learning_rate": 7.4e-06, + "loss": 1.0934, + "step": 74 + }, + { + "epoch": 0.0041279101766745555, + "grad_norm": 2.220736503601074, + "learning_rate": 7.500000000000001e-06, + "loss": 1.4687, + "step": 75 + }, + { + "epoch": 0.004182948979030216, + "grad_norm": 2.2629456520080566, + "learning_rate": 7.600000000000001e-06, + "loss": 1.3328, + "step": 76 + }, + { + "epoch": 0.004237987781385877, + "grad_norm": 2.051820993423462, + "learning_rate": 7.7e-06, + "loss": 1.3058, + "step": 77 + }, + { + "epoch": 0.004293026583741538, + "grad_norm": 2.2451820373535156, + "learning_rate": 7.800000000000002e-06, + "loss": 1.3556, + "step": 78 + }, + { + "epoch": 0.004348065386097198, + "grad_norm": 3.13584303855896, + "learning_rate": 7.9e-06, + "loss": 1.3262, + "step": 79 + }, + { + "epoch": 0.004403104188452859, + "grad_norm": 5.024479866027832, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2103, + "step": 80 + }, + { + "epoch": 0.00445814299080852, + "grad_norm": 2.070889711380005, + "learning_rate": 8.1e-06, + "loss": 1.1994, + "step": 81 + }, + { + "epoch": 0.004513181793164181, + "grad_norm": 2.797286033630371, + "learning_rate": 8.2e-06, + "loss": 1.3075, + "step": 82 + }, + { + "epoch": 0.004568220595519841, + "grad_norm": 2.11370849609375, + "learning_rate": 8.3e-06, + "loss": 1.36, + "step": 83 + }, + { + "epoch": 0.004623259397875502, + "grad_norm": 2.5416152477264404, + "learning_rate": 8.400000000000001e-06, + "loss": 1.3484, + "step": 84 + }, + { + "epoch": 0.004678298200231163, + "grad_norm": 2.4702343940734863, + "learning_rate": 8.5e-06, + "loss": 1.3677, + "step": 85 + }, + { + "epoch": 0.004733337002586824, + "grad_norm": 3.670365333557129, + "learning_rate": 8.6e-06, + "loss": 1.2192, + "step": 86 + }, + { + "epoch": 0.004788375804942484, + "grad_norm": 2.282954692840576, + "learning_rate": 8.700000000000001e-06, + "loss": 1.2982, + "step": 87 + }, + { + "epoch": 0.004843414607298145, + "grad_norm": 2.3659238815307617, + "learning_rate": 8.8e-06, + "loss": 1.3206, + "step": 88 + }, + { + "epoch": 0.004898453409653806, + "grad_norm": 4.939981460571289, + "learning_rate": 8.900000000000001e-06, + "loss": 1.4328, + "step": 89 + }, + { + "epoch": 0.004953492212009467, + "grad_norm": 2.335858106613159, + "learning_rate": 9e-06, + "loss": 1.2603, + "step": 90 + }, + { + "epoch": 0.005008531014365127, + "grad_norm": 2.2165043354034424, + "learning_rate": 9.100000000000001e-06, + "loss": 1.3141, + "step": 91 + }, + { + "epoch": 0.005063569816720788, + "grad_norm": 2.7872185707092285, + "learning_rate": 9.200000000000002e-06, + "loss": 1.3314, + "step": 92 + }, + { + "epoch": 0.005118608619076449, + "grad_norm": 2.6353912353515625, + "learning_rate": 9.3e-06, + "loss": 1.2027, + "step": 93 + }, + { + "epoch": 0.00517364742143211, + "grad_norm": 3.2509102821350098, + "learning_rate": 9.4e-06, + "loss": 1.2316, + "step": 94 + }, + { + "epoch": 0.00522868622378777, + "grad_norm": 2.4560611248016357, + "learning_rate": 9.5e-06, + "loss": 1.1848, + "step": 95 + }, + { + "epoch": 0.005283725026143431, + "grad_norm": 2.338151216506958, + "learning_rate": 9.600000000000001e-06, + "loss": 1.2392, + "step": 96 + }, + { + "epoch": 0.005338763828499092, + "grad_norm": 2.231065034866333, + "learning_rate": 9.7e-06, + "loss": 1.2089, + "step": 97 + }, + { + "epoch": 0.005393802630854753, + "grad_norm": 2.278428077697754, + "learning_rate": 9.800000000000001e-06, + "loss": 1.2267, + "step": 98 + }, + { + "epoch": 0.005448841433210413, + "grad_norm": 2.4422810077667236, + "learning_rate": 9.9e-06, + "loss": 1.2041, + "step": 99 + }, + { + "epoch": 0.005503880235566074, + "grad_norm": 2.216248035430908, + "learning_rate": 1e-05, + "loss": 1.0798, + "step": 100 + }, + { + "epoch": 0.005558919037921735, + "grad_norm": 2.3301615715026855, + "learning_rate": 9.99999998121067e-06, + "loss": 1.3069, + "step": 101 + }, + { + "epoch": 0.0056139578402773956, + "grad_norm": 2.315436363220215, + "learning_rate": 9.999999924842678e-06, + "loss": 1.1589, + "step": 102 + }, + { + "epoch": 0.005668996642633056, + "grad_norm": 2.3522140979766846, + "learning_rate": 9.999999830896024e-06, + "loss": 1.0978, + "step": 103 + }, + { + "epoch": 0.0057240354449887175, + "grad_norm": 2.5798308849334717, + "learning_rate": 9.99999969937071e-06, + "loss": 1.0599, + "step": 104 + }, + { + "epoch": 0.005779074247344378, + "grad_norm": 2.456644058227539, + "learning_rate": 9.999999530266738e-06, + "loss": 1.1682, + "step": 105 + }, + { + "epoch": 0.0058341130497000385, + "grad_norm": 2.1559031009674072, + "learning_rate": 9.999999323584106e-06, + "loss": 1.0631, + "step": 106 + }, + { + "epoch": 0.005889151852055699, + "grad_norm": 2.2985048294067383, + "learning_rate": 9.99999907932282e-06, + "loss": 1.1455, + "step": 107 + }, + { + "epoch": 0.00594419065441136, + "grad_norm": 2.596167802810669, + "learning_rate": 9.999998797482877e-06, + "loss": 1.1686, + "step": 108 + }, + { + "epoch": 0.005999229456767021, + "grad_norm": 2.378618001937866, + "learning_rate": 9.999998478064283e-06, + "loss": 1.2226, + "step": 109 + }, + { + "epoch": 0.0060542682591226814, + "grad_norm": 2.228116750717163, + "learning_rate": 9.999998121067038e-06, + "loss": 1.1396, + "step": 110 + }, + { + "epoch": 0.006109307061478342, + "grad_norm": 2.4419472217559814, + "learning_rate": 9.999997726491146e-06, + "loss": 1.1401, + "step": 111 + }, + { + "epoch": 0.006164345863834003, + "grad_norm": 2.0695526599884033, + "learning_rate": 9.999997294336608e-06, + "loss": 1.1868, + "step": 112 + }, + { + "epoch": 0.006219384666189664, + "grad_norm": 2.3170363903045654, + "learning_rate": 9.99999682460343e-06, + "loss": 1.1172, + "step": 113 + }, + { + "epoch": 0.006274423468545324, + "grad_norm": 2.670466184616089, + "learning_rate": 9.999996317291615e-06, + "loss": 1.2481, + "step": 114 + }, + { + "epoch": 0.006329462270900985, + "grad_norm": 2.1214540004730225, + "learning_rate": 9.999995772401166e-06, + "loss": 0.9994, + "step": 115 + }, + { + "epoch": 0.006384501073256646, + "grad_norm": 1.9283969402313232, + "learning_rate": 9.999995189932085e-06, + "loss": 1.0692, + "step": 116 + }, + { + "epoch": 0.006439539875612307, + "grad_norm": 2.2620882987976074, + "learning_rate": 9.99999456988438e-06, + "loss": 1.0725, + "step": 117 + }, + { + "epoch": 0.006494578677967967, + "grad_norm": 2.2121341228485107, + "learning_rate": 9.999993912258055e-06, + "loss": 1.1328, + "step": 118 + }, + { + "epoch": 0.006549617480323628, + "grad_norm": 2.298126220703125, + "learning_rate": 9.999993217053113e-06, + "loss": 1.1272, + "step": 119 + }, + { + "epoch": 0.006604656282679289, + "grad_norm": 1.81593656539917, + "learning_rate": 9.99999248426956e-06, + "loss": 1.017, + "step": 120 + }, + { + "epoch": 0.00665969508503495, + "grad_norm": 2.1174378395080566, + "learning_rate": 9.999991713907403e-06, + "loss": 1.0557, + "step": 121 + }, + { + "epoch": 0.00671473388739061, + "grad_norm": 1.9061017036437988, + "learning_rate": 9.999990905966647e-06, + "loss": 1.0379, + "step": 122 + }, + { + "epoch": 0.006769772689746271, + "grad_norm": 1.912500023841858, + "learning_rate": 9.999990060447297e-06, + "loss": 1.104, + "step": 123 + }, + { + "epoch": 0.006824811492101932, + "grad_norm": 1.9249529838562012, + "learning_rate": 9.99998917734936e-06, + "loss": 1.0136, + "step": 124 + }, + { + "epoch": 0.006879850294457593, + "grad_norm": 1.8504948616027832, + "learning_rate": 9.999988256672843e-06, + "loss": 0.99, + "step": 125 + }, + { + "epoch": 0.006934889096813253, + "grad_norm": 1.720042109489441, + "learning_rate": 9.999987298417753e-06, + "loss": 1.0666, + "step": 126 + }, + { + "epoch": 0.006989927899168914, + "grad_norm": 1.778251051902771, + "learning_rate": 9.999986302584097e-06, + "loss": 1.0424, + "step": 127 + }, + { + "epoch": 0.007044966701524575, + "grad_norm": 1.9485961198806763, + "learning_rate": 9.999985269171881e-06, + "loss": 1.105, + "step": 128 + }, + { + "epoch": 0.007100005503880236, + "grad_norm": 3.0802104473114014, + "learning_rate": 9.999984198181114e-06, + "loss": 1.1081, + "step": 129 + }, + { + "epoch": 0.007155044306235896, + "grad_norm": 1.7476954460144043, + "learning_rate": 9.999983089611806e-06, + "loss": 0.9677, + "step": 130 + }, + { + "epoch": 0.007210083108591557, + "grad_norm": 1.6127299070358276, + "learning_rate": 9.999981943463963e-06, + "loss": 0.9937, + "step": 131 + }, + { + "epoch": 0.007265121910947218, + "grad_norm": 2.1477208137512207, + "learning_rate": 9.999980759737594e-06, + "loss": 1.0319, + "step": 132 + }, + { + "epoch": 0.007320160713302879, + "grad_norm": 1.531163215637207, + "learning_rate": 9.999979538432707e-06, + "loss": 0.8696, + "step": 133 + }, + { + "epoch": 0.007375199515658539, + "grad_norm": 1.8226820230484009, + "learning_rate": 9.999978279549313e-06, + "loss": 1.2061, + "step": 134 + }, + { + "epoch": 0.0074302383180142, + "grad_norm": 1.481895923614502, + "learning_rate": 9.99997698308742e-06, + "loss": 0.949, + "step": 135 + }, + { + "epoch": 0.007485277120369861, + "grad_norm": 1.6715927124023438, + "learning_rate": 9.99997564904704e-06, + "loss": 1.1579, + "step": 136 + }, + { + "epoch": 0.0075403159227255215, + "grad_norm": 1.4235272407531738, + "learning_rate": 9.999974277428179e-06, + "loss": 1.064, + "step": 137 + }, + { + "epoch": 0.007595354725081182, + "grad_norm": 1.3524872064590454, + "learning_rate": 9.999972868230852e-06, + "loss": 0.9141, + "step": 138 + }, + { + "epoch": 0.007650393527436843, + "grad_norm": 1.3741765022277832, + "learning_rate": 9.999971421455066e-06, + "loss": 1.0256, + "step": 139 + }, + { + "epoch": 0.007705432329792504, + "grad_norm": 1.9869598150253296, + "learning_rate": 9.999969937100835e-06, + "loss": 0.9489, + "step": 140 + }, + { + "epoch": 0.0077604711321481645, + "grad_norm": 1.4785465002059937, + "learning_rate": 9.999968415168166e-06, + "loss": 0.9243, + "step": 141 + }, + { + "epoch": 0.007815509934503825, + "grad_norm": 1.5476176738739014, + "learning_rate": 9.999966855657074e-06, + "loss": 1.178, + "step": 142 + }, + { + "epoch": 0.007870548736859486, + "grad_norm": 1.500401258468628, + "learning_rate": 9.99996525856757e-06, + "loss": 0.9837, + "step": 143 + }, + { + "epoch": 0.007925587539215146, + "grad_norm": 1.3777157068252563, + "learning_rate": 9.999963623899664e-06, + "loss": 1.0732, + "step": 144 + }, + { + "epoch": 0.007980626341570807, + "grad_norm": 1.4466841220855713, + "learning_rate": 9.99996195165337e-06, + "loss": 0.9779, + "step": 145 + }, + { + "epoch": 0.008035665143926469, + "grad_norm": 1.5304051637649536, + "learning_rate": 9.9999602418287e-06, + "loss": 1.196, + "step": 146 + }, + { + "epoch": 0.008090703946282128, + "grad_norm": 1.9012362957000732, + "learning_rate": 9.99995849442567e-06, + "loss": 0.9797, + "step": 147 + }, + { + "epoch": 0.00814574274863779, + "grad_norm": 1.430679202079773, + "learning_rate": 9.999956709444289e-06, + "loss": 0.9869, + "step": 148 + }, + { + "epoch": 0.00820078155099345, + "grad_norm": 1.3489817380905151, + "learning_rate": 9.99995488688457e-06, + "loss": 1.0137, + "step": 149 + }, + { + "epoch": 0.008255820353349111, + "grad_norm": 1.1878125667572021, + "learning_rate": 9.999953026746531e-06, + "loss": 0.9355, + "step": 150 + }, + { + "epoch": 0.008310859155704772, + "grad_norm": 1.3481942415237427, + "learning_rate": 9.999951129030182e-06, + "loss": 1.1235, + "step": 151 + }, + { + "epoch": 0.008365897958060432, + "grad_norm": 1.7335314750671387, + "learning_rate": 9.999949193735539e-06, + "loss": 0.9382, + "step": 152 + }, + { + "epoch": 0.008420936760416093, + "grad_norm": 1.2029480934143066, + "learning_rate": 9.999947220862615e-06, + "loss": 0.9419, + "step": 153 + }, + { + "epoch": 0.008475975562771755, + "grad_norm": 1.2104203701019287, + "learning_rate": 9.999945210411428e-06, + "loss": 0.9196, + "step": 154 + }, + { + "epoch": 0.008531014365127414, + "grad_norm": 1.1857126951217651, + "learning_rate": 9.999943162381991e-06, + "loss": 0.9421, + "step": 155 + }, + { + "epoch": 0.008586053167483076, + "grad_norm": 1.115027904510498, + "learning_rate": 9.999941076774319e-06, + "loss": 0.9634, + "step": 156 + }, + { + "epoch": 0.008641091969838737, + "grad_norm": 1.4227553606033325, + "learning_rate": 9.999938953588428e-06, + "loss": 1.0036, + "step": 157 + }, + { + "epoch": 0.008696130772194397, + "grad_norm": 1.2913776636123657, + "learning_rate": 9.999936792824334e-06, + "loss": 0.9232, + "step": 158 + }, + { + "epoch": 0.008751169574550058, + "grad_norm": 1.2817318439483643, + "learning_rate": 9.999934594482055e-06, + "loss": 0.9691, + "step": 159 + }, + { + "epoch": 0.008806208376905718, + "grad_norm": 1.5647841691970825, + "learning_rate": 9.999932358561604e-06, + "loss": 1.1842, + "step": 160 + }, + { + "epoch": 0.00886124717926138, + "grad_norm": 1.368135929107666, + "learning_rate": 9.999930085063002e-06, + "loss": 1.0873, + "step": 161 + }, + { + "epoch": 0.00891628598161704, + "grad_norm": 1.2297240495681763, + "learning_rate": 9.999927773986262e-06, + "loss": 1.0778, + "step": 162 + }, + { + "epoch": 0.0089713247839727, + "grad_norm": 1.0658279657363892, + "learning_rate": 9.999925425331405e-06, + "loss": 0.9008, + "step": 163 + }, + { + "epoch": 0.009026363586328362, + "grad_norm": 1.3484326601028442, + "learning_rate": 9.999923039098445e-06, + "loss": 1.0664, + "step": 164 + }, + { + "epoch": 0.009081402388684023, + "grad_norm": 1.1839075088500977, + "learning_rate": 9.999920615287401e-06, + "loss": 0.9257, + "step": 165 + }, + { + "epoch": 0.009136441191039683, + "grad_norm": 1.2757254838943481, + "learning_rate": 9.999918153898295e-06, + "loss": 0.9473, + "step": 166 + }, + { + "epoch": 0.009191479993395344, + "grad_norm": 1.2414579391479492, + "learning_rate": 9.99991565493114e-06, + "loss": 1.1091, + "step": 167 + }, + { + "epoch": 0.009246518795751004, + "grad_norm": 1.2802611589431763, + "learning_rate": 9.999913118385959e-06, + "loss": 1.063, + "step": 168 + }, + { + "epoch": 0.009301557598106665, + "grad_norm": 1.2055327892303467, + "learning_rate": 9.99991054426277e-06, + "loss": 0.8, + "step": 169 + }, + { + "epoch": 0.009356596400462327, + "grad_norm": 1.0391098260879517, + "learning_rate": 9.99990793256159e-06, + "loss": 0.8672, + "step": 170 + }, + { + "epoch": 0.009411635202817986, + "grad_norm": 1.131536602973938, + "learning_rate": 9.99990528328244e-06, + "loss": 0.9569, + "step": 171 + }, + { + "epoch": 0.009466674005173648, + "grad_norm": 1.164307951927185, + "learning_rate": 9.999902596425342e-06, + "loss": 0.9999, + "step": 172 + }, + { + "epoch": 0.009521712807529309, + "grad_norm": 1.2099504470825195, + "learning_rate": 9.999899871990313e-06, + "loss": 0.9994, + "step": 173 + }, + { + "epoch": 0.009576751609884969, + "grad_norm": 1.7294539213180542, + "learning_rate": 9.999897109977376e-06, + "loss": 1.0265, + "step": 174 + }, + { + "epoch": 0.00963179041224063, + "grad_norm": 1.3009883165359497, + "learning_rate": 9.99989431038655e-06, + "loss": 0.9022, + "step": 175 + }, + { + "epoch": 0.00968682921459629, + "grad_norm": 1.1014611721038818, + "learning_rate": 9.999891473217857e-06, + "loss": 0.8476, + "step": 176 + }, + { + "epoch": 0.009741868016951951, + "grad_norm": 1.2410900592803955, + "learning_rate": 9.99988859847132e-06, + "loss": 1.0272, + "step": 177 + }, + { + "epoch": 0.009796906819307612, + "grad_norm": 1.336348295211792, + "learning_rate": 9.999885686146957e-06, + "loss": 0.9456, + "step": 178 + }, + { + "epoch": 0.009851945621663272, + "grad_norm": 1.2931095361709595, + "learning_rate": 9.99988273624479e-06, + "loss": 0.9554, + "step": 179 + }, + { + "epoch": 0.009906984424018933, + "grad_norm": 1.2647838592529297, + "learning_rate": 9.999879748764845e-06, + "loss": 1.0394, + "step": 180 + }, + { + "epoch": 0.009962023226374595, + "grad_norm": 1.3485127687454224, + "learning_rate": 9.99987672370714e-06, + "loss": 1.1016, + "step": 181 + }, + { + "epoch": 0.010017062028730254, + "grad_norm": 1.110187292098999, + "learning_rate": 9.999873661071702e-06, + "loss": 0.946, + "step": 182 + }, + { + "epoch": 0.010072100831085916, + "grad_norm": 1.0991623401641846, + "learning_rate": 9.999870560858551e-06, + "loss": 1.0084, + "step": 183 + }, + { + "epoch": 0.010127139633441576, + "grad_norm": 1.049804449081421, + "learning_rate": 9.999867423067713e-06, + "loss": 0.8264, + "step": 184 + }, + { + "epoch": 0.010182178435797237, + "grad_norm": 1.0947058200836182, + "learning_rate": 9.999864247699207e-06, + "loss": 0.8884, + "step": 185 + }, + { + "epoch": 0.010237217238152898, + "grad_norm": 1.1147902011871338, + "learning_rate": 9.999861034753061e-06, + "loss": 0.9657, + "step": 186 + }, + { + "epoch": 0.010292256040508558, + "grad_norm": 1.260027527809143, + "learning_rate": 9.999857784229298e-06, + "loss": 1.0102, + "step": 187 + }, + { + "epoch": 0.01034729484286422, + "grad_norm": 1.1275582313537598, + "learning_rate": 9.999854496127942e-06, + "loss": 1.028, + "step": 188 + }, + { + "epoch": 0.01040233364521988, + "grad_norm": 1.1377174854278564, + "learning_rate": 9.999851170449018e-06, + "loss": 1.032, + "step": 189 + }, + { + "epoch": 0.01045737244757554, + "grad_norm": 1.1734225749969482, + "learning_rate": 9.999847807192552e-06, + "loss": 1.0009, + "step": 190 + }, + { + "epoch": 0.010512411249931202, + "grad_norm": 1.1934596300125122, + "learning_rate": 9.999844406358565e-06, + "loss": 1.0432, + "step": 191 + }, + { + "epoch": 0.010567450052286861, + "grad_norm": 1.0638024806976318, + "learning_rate": 9.99984096794709e-06, + "loss": 0.8651, + "step": 192 + }, + { + "epoch": 0.010622488854642523, + "grad_norm": 1.2381829023361206, + "learning_rate": 9.999837491958147e-06, + "loss": 1.0088, + "step": 193 + }, + { + "epoch": 0.010677527656998184, + "grad_norm": 1.030246615409851, + "learning_rate": 9.999833978391763e-06, + "loss": 0.9488, + "step": 194 + }, + { + "epoch": 0.010732566459353844, + "grad_norm": 1.1640657186508179, + "learning_rate": 9.999830427247965e-06, + "loss": 1.0588, + "step": 195 + }, + { + "epoch": 0.010787605261709505, + "grad_norm": 1.0431616306304932, + "learning_rate": 9.99982683852678e-06, + "loss": 0.8728, + "step": 196 + }, + { + "epoch": 0.010842644064065167, + "grad_norm": 1.032263159751892, + "learning_rate": 9.999823212228235e-06, + "loss": 0.9498, + "step": 197 + }, + { + "epoch": 0.010897682866420826, + "grad_norm": 1.1383745670318604, + "learning_rate": 9.999819548352358e-06, + "loss": 0.9498, + "step": 198 + }, + { + "epoch": 0.010952721668776488, + "grad_norm": 1.1324639320373535, + "learning_rate": 9.999815846899175e-06, + "loss": 1.0432, + "step": 199 + }, + { + "epoch": 0.011007760471132147, + "grad_norm": 1.188672661781311, + "learning_rate": 9.999812107868714e-06, + "loss": 0.982, + "step": 200 + }, + { + "epoch": 0.011062799273487809, + "grad_norm": 1.1011098623275757, + "learning_rate": 9.999808331261005e-06, + "loss": 0.9587, + "step": 201 + }, + { + "epoch": 0.01111783807584347, + "grad_norm": 1.1782938241958618, + "learning_rate": 9.999804517076073e-06, + "loss": 1.0659, + "step": 202 + }, + { + "epoch": 0.01117287687819913, + "grad_norm": 1.0520117282867432, + "learning_rate": 9.99980066531395e-06, + "loss": 1.0056, + "step": 203 + }, + { + "epoch": 0.011227915680554791, + "grad_norm": 1.1584919691085815, + "learning_rate": 9.999796775974663e-06, + "loss": 0.9435, + "step": 204 + }, + { + "epoch": 0.011282954482910452, + "grad_norm": 1.2201849222183228, + "learning_rate": 9.999792849058242e-06, + "loss": 1.0562, + "step": 205 + }, + { + "epoch": 0.011337993285266112, + "grad_norm": 1.2985976934432983, + "learning_rate": 9.999788884564715e-06, + "loss": 1.0126, + "step": 206 + }, + { + "epoch": 0.011393032087621774, + "grad_norm": 0.9926307201385498, + "learning_rate": 9.999784882494115e-06, + "loss": 0.7875, + "step": 207 + }, + { + "epoch": 0.011448070889977435, + "grad_norm": 1.103365182876587, + "learning_rate": 9.99978084284647e-06, + "loss": 0.9833, + "step": 208 + }, + { + "epoch": 0.011503109692333095, + "grad_norm": 1.1798462867736816, + "learning_rate": 9.99977676562181e-06, + "loss": 0.8479, + "step": 209 + }, + { + "epoch": 0.011558148494688756, + "grad_norm": 1.2887194156646729, + "learning_rate": 9.999772650820168e-06, + "loss": 0.9606, + "step": 210 + }, + { + "epoch": 0.011613187297044416, + "grad_norm": 1.1120634078979492, + "learning_rate": 9.99976849844157e-06, + "loss": 0.9604, + "step": 211 + }, + { + "epoch": 0.011668226099400077, + "grad_norm": 1.1248979568481445, + "learning_rate": 9.999764308486052e-06, + "loss": 0.9428, + "step": 212 + }, + { + "epoch": 0.011723264901755738, + "grad_norm": 1.274610161781311, + "learning_rate": 9.999760080953643e-06, + "loss": 0.9044, + "step": 213 + }, + { + "epoch": 0.011778303704111398, + "grad_norm": 1.1746865510940552, + "learning_rate": 9.999755815844377e-06, + "loss": 0.9114, + "step": 214 + }, + { + "epoch": 0.01183334250646706, + "grad_norm": 1.2531086206436157, + "learning_rate": 9.999751513158282e-06, + "loss": 1.0785, + "step": 215 + }, + { + "epoch": 0.01188838130882272, + "grad_norm": 1.0789539813995361, + "learning_rate": 9.999747172895395e-06, + "loss": 0.9794, + "step": 216 + }, + { + "epoch": 0.01194342011117838, + "grad_norm": 1.1805329322814941, + "learning_rate": 9.999742795055746e-06, + "loss": 0.9602, + "step": 217 + }, + { + "epoch": 0.011998458913534042, + "grad_norm": 2.309329032897949, + "learning_rate": 9.99973837963937e-06, + "loss": 0.9482, + "step": 218 + }, + { + "epoch": 0.012053497715889702, + "grad_norm": 1.2379088401794434, + "learning_rate": 9.999733926646296e-06, + "loss": 1.0237, + "step": 219 + }, + { + "epoch": 0.012108536518245363, + "grad_norm": 1.1581377983093262, + "learning_rate": 9.999729436076562e-06, + "loss": 1.0583, + "step": 220 + }, + { + "epoch": 0.012163575320601024, + "grad_norm": 1.3006727695465088, + "learning_rate": 9.999724907930199e-06, + "loss": 0.9581, + "step": 221 + }, + { + "epoch": 0.012218614122956684, + "grad_norm": 1.3215982913970947, + "learning_rate": 9.999720342207243e-06, + "loss": 0.9438, + "step": 222 + }, + { + "epoch": 0.012273652925312345, + "grad_norm": 1.1107337474822998, + "learning_rate": 9.999715738907727e-06, + "loss": 0.9987, + "step": 223 + }, + { + "epoch": 0.012328691727668007, + "grad_norm": 1.0745457410812378, + "learning_rate": 9.999711098031685e-06, + "loss": 0.9637, + "step": 224 + }, + { + "epoch": 0.012383730530023666, + "grad_norm": 1.110861897468567, + "learning_rate": 9.999706419579154e-06, + "loss": 1.0225, + "step": 225 + }, + { + "epoch": 0.012438769332379328, + "grad_norm": 1.0755527019500732, + "learning_rate": 9.999701703550167e-06, + "loss": 1.0204, + "step": 226 + }, + { + "epoch": 0.012493808134734987, + "grad_norm": 1.1694976091384888, + "learning_rate": 9.99969694994476e-06, + "loss": 1.0566, + "step": 227 + }, + { + "epoch": 0.012548846937090649, + "grad_norm": 1.455856442451477, + "learning_rate": 9.99969215876297e-06, + "loss": 0.9397, + "step": 228 + }, + { + "epoch": 0.01260388573944631, + "grad_norm": 1.0707073211669922, + "learning_rate": 9.99968733000483e-06, + "loss": 0.8286, + "step": 229 + }, + { + "epoch": 0.01265892454180197, + "grad_norm": 1.189548134803772, + "learning_rate": 9.99968246367038e-06, + "loss": 0.8762, + "step": 230 + }, + { + "epoch": 0.012713963344157631, + "grad_norm": 1.1439214944839478, + "learning_rate": 9.999677559759655e-06, + "loss": 0.9187, + "step": 231 + }, + { + "epoch": 0.012769002146513293, + "grad_norm": 1.2329761981964111, + "learning_rate": 9.999672618272691e-06, + "loss": 1.0374, + "step": 232 + }, + { + "epoch": 0.012824040948868952, + "grad_norm": 1.1545134782791138, + "learning_rate": 9.999667639209527e-06, + "loss": 0.9343, + "step": 233 + }, + { + "epoch": 0.012879079751224614, + "grad_norm": 1.0946775674819946, + "learning_rate": 9.999662622570198e-06, + "loss": 0.9568, + "step": 234 + }, + { + "epoch": 0.012934118553580273, + "grad_norm": 1.2099589109420776, + "learning_rate": 9.999657568354743e-06, + "loss": 1.0364, + "step": 235 + }, + { + "epoch": 0.012989157355935935, + "grad_norm": 1.09062922000885, + "learning_rate": 9.999652476563202e-06, + "loss": 1.0289, + "step": 236 + }, + { + "epoch": 0.013044196158291596, + "grad_norm": 1.154557228088379, + "learning_rate": 9.999647347195612e-06, + "loss": 0.9925, + "step": 237 + }, + { + "epoch": 0.013099234960647256, + "grad_norm": 1.025374174118042, + "learning_rate": 9.999642180252008e-06, + "loss": 0.9346, + "step": 238 + }, + { + "epoch": 0.013154273763002917, + "grad_norm": 1.1473641395568848, + "learning_rate": 9.999636975732433e-06, + "loss": 1.0244, + "step": 239 + }, + { + "epoch": 0.013209312565358578, + "grad_norm": 1.0421240329742432, + "learning_rate": 9.999631733636923e-06, + "loss": 0.9368, + "step": 240 + }, + { + "epoch": 0.013264351367714238, + "grad_norm": 1.1076610088348389, + "learning_rate": 9.99962645396552e-06, + "loss": 1.0276, + "step": 241 + }, + { + "epoch": 0.0133193901700699, + "grad_norm": 1.143559455871582, + "learning_rate": 9.999621136718266e-06, + "loss": 0.9626, + "step": 242 + }, + { + "epoch": 0.01337442897242556, + "grad_norm": 1.0958378314971924, + "learning_rate": 9.999615781895195e-06, + "loss": 1.0254, + "step": 243 + }, + { + "epoch": 0.01342946777478122, + "grad_norm": 1.117688536643982, + "learning_rate": 9.99961038949635e-06, + "loss": 0.9685, + "step": 244 + }, + { + "epoch": 0.013484506577136882, + "grad_norm": 1.1645647287368774, + "learning_rate": 9.999604959521771e-06, + "loss": 1.0666, + "step": 245 + }, + { + "epoch": 0.013539545379492542, + "grad_norm": 1.1238516569137573, + "learning_rate": 9.999599491971502e-06, + "loss": 1.0252, + "step": 246 + }, + { + "epoch": 0.013594584181848203, + "grad_norm": 1.0196914672851562, + "learning_rate": 9.999593986845579e-06, + "loss": 0.9389, + "step": 247 + }, + { + "epoch": 0.013649622984203864, + "grad_norm": 1.0231372117996216, + "learning_rate": 9.999588444144049e-06, + "loss": 0.8786, + "step": 248 + }, + { + "epoch": 0.013704661786559524, + "grad_norm": 1.2504147291183472, + "learning_rate": 9.999582863866947e-06, + "loss": 1.0969, + "step": 249 + }, + { + "epoch": 0.013759700588915185, + "grad_norm": 1.1123549938201904, + "learning_rate": 9.99957724601432e-06, + "loss": 0.8833, + "step": 250 + }, + { + "epoch": 0.013814739391270847, + "grad_norm": 1.1068202257156372, + "learning_rate": 9.999571590586208e-06, + "loss": 0.9709, + "step": 251 + }, + { + "epoch": 0.013869778193626506, + "grad_norm": 0.9891651272773743, + "learning_rate": 9.999565897582655e-06, + "loss": 0.8598, + "step": 252 + }, + { + "epoch": 0.013924816995982168, + "grad_norm": 0.9866491556167603, + "learning_rate": 9.999560167003703e-06, + "loss": 0.8101, + "step": 253 + }, + { + "epoch": 0.013979855798337828, + "grad_norm": 1.0862594842910767, + "learning_rate": 9.999554398849396e-06, + "loss": 0.9411, + "step": 254 + }, + { + "epoch": 0.014034894600693489, + "grad_norm": 1.1898949146270752, + "learning_rate": 9.999548593119774e-06, + "loss": 0.9548, + "step": 255 + }, + { + "epoch": 0.01408993340304915, + "grad_norm": 1.2167880535125732, + "learning_rate": 9.999542749814886e-06, + "loss": 1.0302, + "step": 256 + }, + { + "epoch": 0.01414497220540481, + "grad_norm": 1.0784146785736084, + "learning_rate": 9.999536868934771e-06, + "loss": 0.8875, + "step": 257 + }, + { + "epoch": 0.014200011007760471, + "grad_norm": 1.1128027439117432, + "learning_rate": 9.999530950479475e-06, + "loss": 0.9498, + "step": 258 + }, + { + "epoch": 0.014255049810116133, + "grad_norm": 1.1311595439910889, + "learning_rate": 9.999524994449044e-06, + "loss": 0.9035, + "step": 259 + }, + { + "epoch": 0.014310088612471792, + "grad_norm": 1.225615382194519, + "learning_rate": 9.999519000843521e-06, + "loss": 1.0104, + "step": 260 + }, + { + "epoch": 0.014365127414827454, + "grad_norm": 1.2347793579101562, + "learning_rate": 9.99951296966295e-06, + "loss": 1.0288, + "step": 261 + }, + { + "epoch": 0.014420166217183113, + "grad_norm": 1.1837103366851807, + "learning_rate": 9.99950690090738e-06, + "loss": 0.9553, + "step": 262 + }, + { + "epoch": 0.014475205019538775, + "grad_norm": 1.1985397338867188, + "learning_rate": 9.999500794576852e-06, + "loss": 0.9561, + "step": 263 + }, + { + "epoch": 0.014530243821894436, + "grad_norm": 1.036928415298462, + "learning_rate": 9.999494650671418e-06, + "loss": 0.8906, + "step": 264 + }, + { + "epoch": 0.014585282624250096, + "grad_norm": 1.0797842741012573, + "learning_rate": 9.999488469191116e-06, + "loss": 0.8975, + "step": 265 + }, + { + "epoch": 0.014640321426605757, + "grad_norm": 1.0571156740188599, + "learning_rate": 9.999482250136e-06, + "loss": 0.9334, + "step": 266 + }, + { + "epoch": 0.014695360228961419, + "grad_norm": 1.2065023183822632, + "learning_rate": 9.999475993506114e-06, + "loss": 0.8986, + "step": 267 + }, + { + "epoch": 0.014750399031317078, + "grad_norm": 1.201586127281189, + "learning_rate": 9.999469699301502e-06, + "loss": 0.9192, + "step": 268 + }, + { + "epoch": 0.01480543783367274, + "grad_norm": 1.0470168590545654, + "learning_rate": 9.999463367522216e-06, + "loss": 0.8604, + "step": 269 + }, + { + "epoch": 0.0148604766360284, + "grad_norm": 1.1142147779464722, + "learning_rate": 9.9994569981683e-06, + "loss": 0.9847, + "step": 270 + }, + { + "epoch": 0.01491551543838406, + "grad_norm": 1.0352061986923218, + "learning_rate": 9.999450591239805e-06, + "loss": 0.8927, + "step": 271 + }, + { + "epoch": 0.014970554240739722, + "grad_norm": 1.0353184938430786, + "learning_rate": 9.999444146736779e-06, + "loss": 0.8435, + "step": 272 + }, + { + "epoch": 0.015025593043095382, + "grad_norm": 1.2091951370239258, + "learning_rate": 9.999437664659267e-06, + "loss": 0.8959, + "step": 273 + }, + { + "epoch": 0.015080631845451043, + "grad_norm": 1.006361722946167, + "learning_rate": 9.999431145007319e-06, + "loss": 0.8579, + "step": 274 + }, + { + "epoch": 0.015135670647806704, + "grad_norm": 1.1265509128570557, + "learning_rate": 9.999424587780985e-06, + "loss": 0.8808, + "step": 275 + }, + { + "epoch": 0.015190709450162364, + "grad_norm": 1.060882568359375, + "learning_rate": 9.999417992980317e-06, + "loss": 1.044, + "step": 276 + }, + { + "epoch": 0.015245748252518026, + "grad_norm": 1.0216747522354126, + "learning_rate": 9.999411360605358e-06, + "loss": 0.7773, + "step": 277 + }, + { + "epoch": 0.015300787054873685, + "grad_norm": 1.1382462978363037, + "learning_rate": 9.999404690656163e-06, + "loss": 0.8954, + "step": 278 + }, + { + "epoch": 0.015355825857229347, + "grad_norm": 1.113815188407898, + "learning_rate": 9.99939798313278e-06, + "loss": 0.8143, + "step": 279 + }, + { + "epoch": 0.015410864659585008, + "grad_norm": 1.123530387878418, + "learning_rate": 9.99939123803526e-06, + "loss": 0.8872, + "step": 280 + }, + { + "epoch": 0.015465903461940668, + "grad_norm": 1.0873669385910034, + "learning_rate": 9.999384455363656e-06, + "loss": 1.008, + "step": 281 + }, + { + "epoch": 0.015520942264296329, + "grad_norm": 1.5956637859344482, + "learning_rate": 9.999377635118014e-06, + "loss": 0.9456, + "step": 282 + }, + { + "epoch": 0.01557598106665199, + "grad_norm": 1.1471425294876099, + "learning_rate": 9.999370777298389e-06, + "loss": 0.9897, + "step": 283 + }, + { + "epoch": 0.01563101986900765, + "grad_norm": 0.9960193634033203, + "learning_rate": 9.999363881904831e-06, + "loss": 0.8196, + "step": 284 + }, + { + "epoch": 0.01568605867136331, + "grad_norm": 1.1033951044082642, + "learning_rate": 9.999356948937393e-06, + "loss": 0.879, + "step": 285 + }, + { + "epoch": 0.015741097473718973, + "grad_norm": 1.157765507698059, + "learning_rate": 9.999349978396126e-06, + "loss": 1.0116, + "step": 286 + }, + { + "epoch": 0.015796136276074634, + "grad_norm": 1.0472352504730225, + "learning_rate": 9.999342970281084e-06, + "loss": 0.8657, + "step": 287 + }, + { + "epoch": 0.015851175078430292, + "grad_norm": 1.1346659660339355, + "learning_rate": 9.999335924592315e-06, + "loss": 0.8482, + "step": 288 + }, + { + "epoch": 0.015906213880785953, + "grad_norm": 1.1164487600326538, + "learning_rate": 9.999328841329879e-06, + "loss": 1.0542, + "step": 289 + }, + { + "epoch": 0.015961252683141615, + "grad_norm": 1.1890591382980347, + "learning_rate": 9.999321720493825e-06, + "loss": 0.9598, + "step": 290 + }, + { + "epoch": 0.016016291485497276, + "grad_norm": 1.0419867038726807, + "learning_rate": 9.999314562084205e-06, + "loss": 0.9548, + "step": 291 + }, + { + "epoch": 0.016071330287852938, + "grad_norm": 1.0652042627334595, + "learning_rate": 9.999307366101077e-06, + "loss": 0.9359, + "step": 292 + }, + { + "epoch": 0.016126369090208596, + "grad_norm": 1.0166404247283936, + "learning_rate": 9.999300132544492e-06, + "loss": 0.9276, + "step": 293 + }, + { + "epoch": 0.016181407892564257, + "grad_norm": 1.1638866662979126, + "learning_rate": 9.999292861414507e-06, + "loss": 0.957, + "step": 294 + }, + { + "epoch": 0.01623644669491992, + "grad_norm": 1.5505993366241455, + "learning_rate": 9.999285552711173e-06, + "loss": 0.9878, + "step": 295 + }, + { + "epoch": 0.01629148549727558, + "grad_norm": 1.177262783050537, + "learning_rate": 9.999278206434549e-06, + "loss": 0.8631, + "step": 296 + }, + { + "epoch": 0.01634652429963124, + "grad_norm": 1.8578168153762817, + "learning_rate": 9.999270822584687e-06, + "loss": 0.9684, + "step": 297 + }, + { + "epoch": 0.0164015631019869, + "grad_norm": 1.2617360353469849, + "learning_rate": 9.999263401161643e-06, + "loss": 1.014, + "step": 298 + }, + { + "epoch": 0.01645660190434256, + "grad_norm": 0.9740132689476013, + "learning_rate": 9.999255942165475e-06, + "loss": 0.8606, + "step": 299 + }, + { + "epoch": 0.016511640706698222, + "grad_norm": 0.9821745753288269, + "learning_rate": 9.999248445596238e-06, + "loss": 0.8241, + "step": 300 + }, + { + "epoch": 0.016566679509053883, + "grad_norm": 1.0200445652008057, + "learning_rate": 9.999240911453986e-06, + "loss": 0.8256, + "step": 301 + }, + { + "epoch": 0.016621718311409545, + "grad_norm": 1.4100390672683716, + "learning_rate": 9.999233339738779e-06, + "loss": 0.9057, + "step": 302 + }, + { + "epoch": 0.016676757113765206, + "grad_norm": 1.056544303894043, + "learning_rate": 9.99922573045067e-06, + "loss": 1.0808, + "step": 303 + }, + { + "epoch": 0.016731795916120864, + "grad_norm": 0.9271026253700256, + "learning_rate": 9.99921808358972e-06, + "loss": 0.878, + "step": 304 + }, + { + "epoch": 0.016786834718476525, + "grad_norm": 0.9864157438278198, + "learning_rate": 9.999210399155987e-06, + "loss": 0.9198, + "step": 305 + }, + { + "epoch": 0.016841873520832187, + "grad_norm": 1.093995451927185, + "learning_rate": 9.999202677149525e-06, + "loss": 0.9794, + "step": 306 + }, + { + "epoch": 0.016896912323187848, + "grad_norm": 0.9717912077903748, + "learning_rate": 9.999194917570395e-06, + "loss": 0.8764, + "step": 307 + }, + { + "epoch": 0.01695195112554351, + "grad_norm": 1.0026428699493408, + "learning_rate": 9.999187120418653e-06, + "loss": 0.8526, + "step": 308 + }, + { + "epoch": 0.017006989927899167, + "grad_norm": 1.122870922088623, + "learning_rate": 9.999179285694359e-06, + "loss": 0.9773, + "step": 309 + }, + { + "epoch": 0.01706202873025483, + "grad_norm": 1.0522836446762085, + "learning_rate": 9.999171413397572e-06, + "loss": 1.0183, + "step": 310 + }, + { + "epoch": 0.01711706753261049, + "grad_norm": 0.9303658604621887, + "learning_rate": 9.99916350352835e-06, + "loss": 0.8402, + "step": 311 + }, + { + "epoch": 0.01717210633496615, + "grad_norm": 0.9606096148490906, + "learning_rate": 9.999155556086755e-06, + "loss": 0.9692, + "step": 312 + }, + { + "epoch": 0.017227145137321813, + "grad_norm": 1.176992416381836, + "learning_rate": 9.999147571072844e-06, + "loss": 0.8172, + "step": 313 + }, + { + "epoch": 0.017282183939677474, + "grad_norm": 1.1948801279067993, + "learning_rate": 9.999139548486678e-06, + "loss": 1.0205, + "step": 314 + }, + { + "epoch": 0.017337222742033132, + "grad_norm": 1.0064897537231445, + "learning_rate": 9.999131488328318e-06, + "loss": 0.9479, + "step": 315 + }, + { + "epoch": 0.017392261544388794, + "grad_norm": 1.048242449760437, + "learning_rate": 9.999123390597822e-06, + "loss": 0.9862, + "step": 316 + }, + { + "epoch": 0.017447300346744455, + "grad_norm": 1.12875497341156, + "learning_rate": 9.999115255295256e-06, + "loss": 0.9743, + "step": 317 + }, + { + "epoch": 0.017502339149100116, + "grad_norm": 1.0607460737228394, + "learning_rate": 9.999107082420674e-06, + "loss": 0.8878, + "step": 318 + }, + { + "epoch": 0.017557377951455778, + "grad_norm": 1.1480191946029663, + "learning_rate": 9.999098871974144e-06, + "loss": 0.8769, + "step": 319 + }, + { + "epoch": 0.017612416753811436, + "grad_norm": 1.1150004863739014, + "learning_rate": 9.999090623955724e-06, + "loss": 0.8615, + "step": 320 + }, + { + "epoch": 0.017667455556167097, + "grad_norm": 1.137839913368225, + "learning_rate": 9.999082338365478e-06, + "loss": 0.9703, + "step": 321 + }, + { + "epoch": 0.01772249435852276, + "grad_norm": 1.0883489847183228, + "learning_rate": 9.999074015203467e-06, + "loss": 0.9273, + "step": 322 + }, + { + "epoch": 0.01777753316087842, + "grad_norm": 1.0999557971954346, + "learning_rate": 9.999065654469752e-06, + "loss": 0.9605, + "step": 323 + }, + { + "epoch": 0.01783257196323408, + "grad_norm": 0.9911689758300781, + "learning_rate": 9.999057256164401e-06, + "loss": 0.9117, + "step": 324 + }, + { + "epoch": 0.01788761076558974, + "grad_norm": 1.040933609008789, + "learning_rate": 9.999048820287472e-06, + "loss": 0.9229, + "step": 325 + }, + { + "epoch": 0.0179426495679454, + "grad_norm": 1.4341392517089844, + "learning_rate": 9.999040346839031e-06, + "loss": 1.0718, + "step": 326 + }, + { + "epoch": 0.017997688370301062, + "grad_norm": 1.0246332883834839, + "learning_rate": 9.99903183581914e-06, + "loss": 0.9617, + "step": 327 + }, + { + "epoch": 0.018052727172656723, + "grad_norm": 10.162322998046875, + "learning_rate": 9.999023287227863e-06, + "loss": 1.0391, + "step": 328 + }, + { + "epoch": 0.018107765975012385, + "grad_norm": 1.3370027542114258, + "learning_rate": 9.999014701065266e-06, + "loss": 1.0211, + "step": 329 + }, + { + "epoch": 0.018162804777368046, + "grad_norm": 1.0146219730377197, + "learning_rate": 9.999006077331413e-06, + "loss": 0.8611, + "step": 330 + }, + { + "epoch": 0.018217843579723704, + "grad_norm": 1.0899269580841064, + "learning_rate": 9.998997416026368e-06, + "loss": 0.9209, + "step": 331 + }, + { + "epoch": 0.018272882382079365, + "grad_norm": 1.1343204975128174, + "learning_rate": 9.998988717150198e-06, + "loss": 0.9405, + "step": 332 + }, + { + "epoch": 0.018327921184435027, + "grad_norm": 1.2308380603790283, + "learning_rate": 9.998979980702965e-06, + "loss": 0.9579, + "step": 333 + }, + { + "epoch": 0.018382959986790688, + "grad_norm": 1.1433519124984741, + "learning_rate": 9.998971206684737e-06, + "loss": 1.0045, + "step": 334 + }, + { + "epoch": 0.01843799878914635, + "grad_norm": 1.0585781335830688, + "learning_rate": 9.99896239509558e-06, + "loss": 0.9171, + "step": 335 + }, + { + "epoch": 0.018493037591502007, + "grad_norm": 1.2735164165496826, + "learning_rate": 9.99895354593556e-06, + "loss": 1.1001, + "step": 336 + }, + { + "epoch": 0.01854807639385767, + "grad_norm": 1.2905755043029785, + "learning_rate": 9.998944659204744e-06, + "loss": 1.0294, + "step": 337 + }, + { + "epoch": 0.01860311519621333, + "grad_norm": 1.1442075967788696, + "learning_rate": 9.998935734903198e-06, + "loss": 0.9385, + "step": 338 + }, + { + "epoch": 0.01865815399856899, + "grad_norm": 1.1005232334136963, + "learning_rate": 9.998926773030987e-06, + "loss": 1.026, + "step": 339 + }, + { + "epoch": 0.018713192800924653, + "grad_norm": 1.2770785093307495, + "learning_rate": 9.998917773588182e-06, + "loss": 1.0015, + "step": 340 + }, + { + "epoch": 0.01876823160328031, + "grad_norm": 1.0963070392608643, + "learning_rate": 9.998908736574849e-06, + "loss": 0.9347, + "step": 341 + }, + { + "epoch": 0.018823270405635972, + "grad_norm": 1.10364830493927, + "learning_rate": 9.998899661991055e-06, + "loss": 0.869, + "step": 342 + }, + { + "epoch": 0.018878309207991634, + "grad_norm": 1.0364975929260254, + "learning_rate": 9.99889054983687e-06, + "loss": 0.9855, + "step": 343 + }, + { + "epoch": 0.018933348010347295, + "grad_norm": 1.104702115058899, + "learning_rate": 9.998881400112362e-06, + "loss": 0.9555, + "step": 344 + }, + { + "epoch": 0.018988386812702956, + "grad_norm": 0.9957441687583923, + "learning_rate": 9.998872212817599e-06, + "loss": 0.9634, + "step": 345 + }, + { + "epoch": 0.019043425615058618, + "grad_norm": 1.262271523475647, + "learning_rate": 9.998862987952651e-06, + "loss": 1.0133, + "step": 346 + }, + { + "epoch": 0.019098464417414276, + "grad_norm": 1.2075226306915283, + "learning_rate": 9.998853725517587e-06, + "loss": 1.0588, + "step": 347 + }, + { + "epoch": 0.019153503219769937, + "grad_norm": 1.0609898567199707, + "learning_rate": 9.998844425512477e-06, + "loss": 0.9952, + "step": 348 + }, + { + "epoch": 0.0192085420221256, + "grad_norm": 1.1930195093154907, + "learning_rate": 9.998835087937389e-06, + "loss": 0.9617, + "step": 349 + }, + { + "epoch": 0.01926358082448126, + "grad_norm": 1.2359932661056519, + "learning_rate": 9.998825712792396e-06, + "loss": 0.8768, + "step": 350 + }, + { + "epoch": 0.01931861962683692, + "grad_norm": 0.9984115362167358, + "learning_rate": 9.998816300077566e-06, + "loss": 0.8205, + "step": 351 + }, + { + "epoch": 0.01937365842919258, + "grad_norm": 1.6853677034378052, + "learning_rate": 9.998806849792972e-06, + "loss": 0.9066, + "step": 352 + }, + { + "epoch": 0.01942869723154824, + "grad_norm": 1.2869856357574463, + "learning_rate": 9.998797361938683e-06, + "loss": 1.0054, + "step": 353 + }, + { + "epoch": 0.019483736033903902, + "grad_norm": 1.2791584730148315, + "learning_rate": 9.99878783651477e-06, + "loss": 0.7627, + "step": 354 + }, + { + "epoch": 0.019538774836259563, + "grad_norm": 1.0795867443084717, + "learning_rate": 9.998778273521307e-06, + "loss": 0.9343, + "step": 355 + }, + { + "epoch": 0.019593813638615225, + "grad_norm": 1.0926088094711304, + "learning_rate": 9.998768672958365e-06, + "loss": 0.943, + "step": 356 + }, + { + "epoch": 0.019648852440970886, + "grad_norm": 1.0530847311019897, + "learning_rate": 9.998759034826015e-06, + "loss": 0.9656, + "step": 357 + }, + { + "epoch": 0.019703891243326544, + "grad_norm": 1.1793400049209595, + "learning_rate": 9.99874935912433e-06, + "loss": 0.9799, + "step": 358 + }, + { + "epoch": 0.019758930045682205, + "grad_norm": 1.0726191997528076, + "learning_rate": 9.998739645853383e-06, + "loss": 0.8739, + "step": 359 + }, + { + "epoch": 0.019813968848037867, + "grad_norm": 1.0488981008529663, + "learning_rate": 9.998729895013246e-06, + "loss": 0.8986, + "step": 360 + }, + { + "epoch": 0.019869007650393528, + "grad_norm": 1.8267477750778198, + "learning_rate": 9.998720106603993e-06, + "loss": 0.9175, + "step": 361 + }, + { + "epoch": 0.01992404645274919, + "grad_norm": 0.9868306517601013, + "learning_rate": 9.9987102806257e-06, + "loss": 0.9609, + "step": 362 + }, + { + "epoch": 0.019979085255104848, + "grad_norm": 1.0171183347702026, + "learning_rate": 9.998700417078438e-06, + "loss": 0.8904, + "step": 363 + }, + { + "epoch": 0.02003412405746051, + "grad_norm": 0.9800812602043152, + "learning_rate": 9.998690515962282e-06, + "loss": 0.8344, + "step": 364 + }, + { + "epoch": 0.02008916285981617, + "grad_norm": 1.024707317352295, + "learning_rate": 9.998680577277304e-06, + "loss": 0.9026, + "step": 365 + }, + { + "epoch": 0.02014420166217183, + "grad_norm": 1.1056619882583618, + "learning_rate": 9.998670601023584e-06, + "loss": 1.017, + "step": 366 + }, + { + "epoch": 0.020199240464527493, + "grad_norm": 1.0555908679962158, + "learning_rate": 9.998660587201191e-06, + "loss": 0.9627, + "step": 367 + }, + { + "epoch": 0.02025427926688315, + "grad_norm": 0.9502031803131104, + "learning_rate": 9.998650535810204e-06, + "loss": 0.935, + "step": 368 + }, + { + "epoch": 0.020309318069238812, + "grad_norm": 1.0355613231658936, + "learning_rate": 9.998640446850699e-06, + "loss": 0.9946, + "step": 369 + }, + { + "epoch": 0.020364356871594474, + "grad_norm": 0.9906355142593384, + "learning_rate": 9.99863032032275e-06, + "loss": 0.9389, + "step": 370 + }, + { + "epoch": 0.020419395673950135, + "grad_norm": 0.9483911395072937, + "learning_rate": 9.99862015622643e-06, + "loss": 0.979, + "step": 371 + }, + { + "epoch": 0.020474434476305797, + "grad_norm": 0.9769986271858215, + "learning_rate": 9.998609954561822e-06, + "loss": 0.8972, + "step": 372 + }, + { + "epoch": 0.020529473278661458, + "grad_norm": 1.1682699918746948, + "learning_rate": 9.998599715329e-06, + "loss": 0.943, + "step": 373 + }, + { + "epoch": 0.020584512081017116, + "grad_norm": 1.007912516593933, + "learning_rate": 9.99858943852804e-06, + "loss": 0.8825, + "step": 374 + }, + { + "epoch": 0.020639550883372777, + "grad_norm": 0.9788785576820374, + "learning_rate": 9.99857912415902e-06, + "loss": 0.9667, + "step": 375 + }, + { + "epoch": 0.02069458968572844, + "grad_norm": 1.0804275274276733, + "learning_rate": 9.998568772222017e-06, + "loss": 1.0026, + "step": 376 + }, + { + "epoch": 0.0207496284880841, + "grad_norm": 1.0859237909317017, + "learning_rate": 9.998558382717109e-06, + "loss": 0.9592, + "step": 377 + }, + { + "epoch": 0.02080466729043976, + "grad_norm": 1.2925337553024292, + "learning_rate": 9.998547955644373e-06, + "loss": 0.9067, + "step": 378 + }, + { + "epoch": 0.02085970609279542, + "grad_norm": 0.9853373765945435, + "learning_rate": 9.99853749100389e-06, + "loss": 0.9538, + "step": 379 + }, + { + "epoch": 0.02091474489515108, + "grad_norm": 1.0461076498031616, + "learning_rate": 9.998526988795738e-06, + "loss": 0.9261, + "step": 380 + }, + { + "epoch": 0.020969783697506742, + "grad_norm": 1.024559497833252, + "learning_rate": 9.998516449019995e-06, + "loss": 0.9117, + "step": 381 + }, + { + "epoch": 0.021024822499862404, + "grad_norm": 1.1474825143814087, + "learning_rate": 9.998505871676739e-06, + "loss": 1.0177, + "step": 382 + }, + { + "epoch": 0.021079861302218065, + "grad_norm": 0.9587596654891968, + "learning_rate": 9.998495256766051e-06, + "loss": 0.8809, + "step": 383 + }, + { + "epoch": 0.021134900104573723, + "grad_norm": 0.9505122303962708, + "learning_rate": 9.998484604288013e-06, + "loss": 0.9266, + "step": 384 + }, + { + "epoch": 0.021189938906929384, + "grad_norm": 0.9625647664070129, + "learning_rate": 9.9984739142427e-06, + "loss": 0.9073, + "step": 385 + }, + { + "epoch": 0.021244977709285046, + "grad_norm": 0.9650934338569641, + "learning_rate": 9.998463186630196e-06, + "loss": 0.9042, + "step": 386 + }, + { + "epoch": 0.021300016511640707, + "grad_norm": 1.0289491415023804, + "learning_rate": 9.99845242145058e-06, + "loss": 0.929, + "step": 387 + }, + { + "epoch": 0.02135505531399637, + "grad_norm": 0.9543869495391846, + "learning_rate": 9.998441618703935e-06, + "loss": 0.9406, + "step": 388 + }, + { + "epoch": 0.02141009411635203, + "grad_norm": 0.9276942610740662, + "learning_rate": 9.99843077839034e-06, + "loss": 0.8982, + "step": 389 + }, + { + "epoch": 0.021465132918707688, + "grad_norm": 0.9264664053916931, + "learning_rate": 9.998419900509877e-06, + "loss": 0.7255, + "step": 390 + }, + { + "epoch": 0.02152017172106335, + "grad_norm": 0.9961187243461609, + "learning_rate": 9.998408985062628e-06, + "loss": 0.9826, + "step": 391 + }, + { + "epoch": 0.02157521052341901, + "grad_norm": 0.966596245765686, + "learning_rate": 9.998398032048676e-06, + "loss": 0.8159, + "step": 392 + }, + { + "epoch": 0.021630249325774672, + "grad_norm": 1.1336095333099365, + "learning_rate": 9.998387041468102e-06, + "loss": 0.9289, + "step": 393 + }, + { + "epoch": 0.021685288128130333, + "grad_norm": 1.0453619956970215, + "learning_rate": 9.998376013320989e-06, + "loss": 0.8816, + "step": 394 + }, + { + "epoch": 0.02174032693048599, + "grad_norm": 0.8961821794509888, + "learning_rate": 9.998364947607419e-06, + "loss": 0.871, + "step": 395 + }, + { + "epoch": 0.021795365732841653, + "grad_norm": 1.3420332670211792, + "learning_rate": 9.998353844327477e-06, + "loss": 0.9338, + "step": 396 + }, + { + "epoch": 0.021850404535197314, + "grad_norm": 0.9635335206985474, + "learning_rate": 9.998342703481246e-06, + "loss": 0.9592, + "step": 397 + }, + { + "epoch": 0.021905443337552975, + "grad_norm": 1.3322341442108154, + "learning_rate": 9.998331525068807e-06, + "loss": 1.0974, + "step": 398 + }, + { + "epoch": 0.021960482139908637, + "grad_norm": 1.017220377922058, + "learning_rate": 9.998320309090247e-06, + "loss": 0.9827, + "step": 399 + }, + { + "epoch": 0.022015520942264295, + "grad_norm": 1.0080329179763794, + "learning_rate": 9.99830905554565e-06, + "loss": 0.877, + "step": 400 + }, + { + "epoch": 0.022070559744619956, + "grad_norm": 0.9883211255073547, + "learning_rate": 9.998297764435101e-06, + "loss": 0.9625, + "step": 401 + }, + { + "epoch": 0.022125598546975617, + "grad_norm": 1.0948412418365479, + "learning_rate": 9.998286435758684e-06, + "loss": 0.9058, + "step": 402 + }, + { + "epoch": 0.02218063734933128, + "grad_norm": 0.9402000308036804, + "learning_rate": 9.998275069516482e-06, + "loss": 0.8882, + "step": 403 + }, + { + "epoch": 0.02223567615168694, + "grad_norm": 0.9858806133270264, + "learning_rate": 9.998263665708583e-06, + "loss": 0.9086, + "step": 404 + }, + { + "epoch": 0.0222907149540426, + "grad_norm": 1.0556131601333618, + "learning_rate": 9.998252224335073e-06, + "loss": 0.9583, + "step": 405 + }, + { + "epoch": 0.02234575375639826, + "grad_norm": 1.092766284942627, + "learning_rate": 9.998240745396037e-06, + "loss": 0.9124, + "step": 406 + }, + { + "epoch": 0.02240079255875392, + "grad_norm": 1.1902250051498413, + "learning_rate": 9.998229228891563e-06, + "loss": 1.0566, + "step": 407 + }, + { + "epoch": 0.022455831361109582, + "grad_norm": 1.067906141281128, + "learning_rate": 9.998217674821734e-06, + "loss": 0.9823, + "step": 408 + }, + { + "epoch": 0.022510870163465244, + "grad_norm": 1.0051710605621338, + "learning_rate": 9.998206083186638e-06, + "loss": 0.9141, + "step": 409 + }, + { + "epoch": 0.022565908965820905, + "grad_norm": 1.046412467956543, + "learning_rate": 9.998194453986367e-06, + "loss": 0.9439, + "step": 410 + }, + { + "epoch": 0.022620947768176563, + "grad_norm": 1.1103553771972656, + "learning_rate": 9.998182787221e-06, + "loss": 0.9494, + "step": 411 + }, + { + "epoch": 0.022675986570532224, + "grad_norm": 1.0508466958999634, + "learning_rate": 9.998171082890632e-06, + "loss": 0.9202, + "step": 412 + }, + { + "epoch": 0.022731025372887886, + "grad_norm": 1.1364226341247559, + "learning_rate": 9.998159340995347e-06, + "loss": 0.9859, + "step": 413 + }, + { + "epoch": 0.022786064175243547, + "grad_norm": 1.2073607444763184, + "learning_rate": 9.998147561535234e-06, + "loss": 0.8883, + "step": 414 + }, + { + "epoch": 0.02284110297759921, + "grad_norm": 1.0657012462615967, + "learning_rate": 9.998135744510384e-06, + "loss": 0.8321, + "step": 415 + }, + { + "epoch": 0.02289614177995487, + "grad_norm": 1.0101548433303833, + "learning_rate": 9.998123889920881e-06, + "loss": 0.9374, + "step": 416 + }, + { + "epoch": 0.022951180582310528, + "grad_norm": 1.057455062866211, + "learning_rate": 9.998111997766817e-06, + "loss": 0.8831, + "step": 417 + }, + { + "epoch": 0.02300621938466619, + "grad_norm": 1.206092357635498, + "learning_rate": 9.998100068048282e-06, + "loss": 0.8812, + "step": 418 + }, + { + "epoch": 0.02306125818702185, + "grad_norm": 1.0709773302078247, + "learning_rate": 9.998088100765366e-06, + "loss": 0.9486, + "step": 419 + }, + { + "epoch": 0.023116296989377512, + "grad_norm": 1.066469669342041, + "learning_rate": 9.998076095918156e-06, + "loss": 1.0229, + "step": 420 + }, + { + "epoch": 0.023171335791733173, + "grad_norm": 1.0443583726882935, + "learning_rate": 9.998064053506744e-06, + "loss": 0.8615, + "step": 421 + }, + { + "epoch": 0.02322637459408883, + "grad_norm": 1.103096842765808, + "learning_rate": 9.99805197353122e-06, + "loss": 0.9909, + "step": 422 + }, + { + "epoch": 0.023281413396444493, + "grad_norm": 0.9804643392562866, + "learning_rate": 9.998039855991677e-06, + "loss": 0.9214, + "step": 423 + }, + { + "epoch": 0.023336452198800154, + "grad_norm": 0.9880676865577698, + "learning_rate": 9.998027700888202e-06, + "loss": 0.9345, + "step": 424 + }, + { + "epoch": 0.023391491001155815, + "grad_norm": 0.9633826017379761, + "learning_rate": 9.99801550822089e-06, + "loss": 0.9897, + "step": 425 + }, + { + "epoch": 0.023446529803511477, + "grad_norm": 1.0159331560134888, + "learning_rate": 9.998003277989831e-06, + "loss": 0.9385, + "step": 426 + }, + { + "epoch": 0.023501568605867135, + "grad_norm": 1.009667158126831, + "learning_rate": 9.99799101019512e-06, + "loss": 0.9013, + "step": 427 + }, + { + "epoch": 0.023556607408222796, + "grad_norm": 0.9478578567504883, + "learning_rate": 9.997978704836842e-06, + "loss": 0.8775, + "step": 428 + }, + { + "epoch": 0.023611646210578457, + "grad_norm": 1.013181447982788, + "learning_rate": 9.997966361915096e-06, + "loss": 0.8797, + "step": 429 + }, + { + "epoch": 0.02366668501293412, + "grad_norm": 1.0337481498718262, + "learning_rate": 9.997953981429974e-06, + "loss": 1.0047, + "step": 430 + }, + { + "epoch": 0.02372172381528978, + "grad_norm": 0.9423721432685852, + "learning_rate": 9.997941563381566e-06, + "loss": 0.8639, + "step": 431 + }, + { + "epoch": 0.02377676261764544, + "grad_norm": 1.100492000579834, + "learning_rate": 9.997929107769968e-06, + "loss": 1.0022, + "step": 432 + }, + { + "epoch": 0.0238318014200011, + "grad_norm": 1.1232364177703857, + "learning_rate": 9.997916614595272e-06, + "loss": 0.9145, + "step": 433 + }, + { + "epoch": 0.02388684022235676, + "grad_norm": 0.9466833472251892, + "learning_rate": 9.997904083857572e-06, + "loss": 0.9397, + "step": 434 + }, + { + "epoch": 0.023941879024712422, + "grad_norm": 0.9514566659927368, + "learning_rate": 9.997891515556963e-06, + "loss": 0.8025, + "step": 435 + }, + { + "epoch": 0.023996917827068084, + "grad_norm": 0.9292222261428833, + "learning_rate": 9.997878909693539e-06, + "loss": 0.7739, + "step": 436 + }, + { + "epoch": 0.024051956629423745, + "grad_norm": 1.1049963235855103, + "learning_rate": 9.997866266267397e-06, + "loss": 0.9439, + "step": 437 + }, + { + "epoch": 0.024106995431779403, + "grad_norm": 1.0938019752502441, + "learning_rate": 9.997853585278627e-06, + "loss": 0.9479, + "step": 438 + }, + { + "epoch": 0.024162034234135064, + "grad_norm": 1.0423611402511597, + "learning_rate": 9.997840866727331e-06, + "loss": 0.9309, + "step": 439 + }, + { + "epoch": 0.024217073036490726, + "grad_norm": 1.0584756135940552, + "learning_rate": 9.997828110613598e-06, + "loss": 1.0218, + "step": 440 + }, + { + "epoch": 0.024272111838846387, + "grad_norm": 0.9986408948898315, + "learning_rate": 9.997815316937527e-06, + "loss": 0.9734, + "step": 441 + }, + { + "epoch": 0.02432715064120205, + "grad_norm": 0.9680983424186707, + "learning_rate": 9.997802485699215e-06, + "loss": 0.9286, + "step": 442 + }, + { + "epoch": 0.024382189443557706, + "grad_norm": 1.2231700420379639, + "learning_rate": 9.997789616898757e-06, + "loss": 0.8083, + "step": 443 + }, + { + "epoch": 0.024437228245913368, + "grad_norm": 1.0064021348953247, + "learning_rate": 9.99777671053625e-06, + "loss": 0.9161, + "step": 444 + }, + { + "epoch": 0.02449226704826903, + "grad_norm": 0.9658541679382324, + "learning_rate": 9.99776376661179e-06, + "loss": 0.8027, + "step": 445 + }, + { + "epoch": 0.02454730585062469, + "grad_norm": 0.9440343379974365, + "learning_rate": 9.997750785125477e-06, + "loss": 0.9124, + "step": 446 + }, + { + "epoch": 0.024602344652980352, + "grad_norm": 0.998792827129364, + "learning_rate": 9.997737766077404e-06, + "loss": 0.8699, + "step": 447 + }, + { + "epoch": 0.024657383455336013, + "grad_norm": 1.430880069732666, + "learning_rate": 9.997724709467676e-06, + "loss": 0.9158, + "step": 448 + }, + { + "epoch": 0.02471242225769167, + "grad_norm": 0.9737820029258728, + "learning_rate": 9.997711615296384e-06, + "loss": 0.9496, + "step": 449 + }, + { + "epoch": 0.024767461060047333, + "grad_norm": 0.9710075855255127, + "learning_rate": 9.997698483563629e-06, + "loss": 0.8714, + "step": 450 + }, + { + "epoch": 0.024822499862402994, + "grad_norm": 1.5286253690719604, + "learning_rate": 9.997685314269511e-06, + "loss": 0.8421, + "step": 451 + }, + { + "epoch": 0.024877538664758655, + "grad_norm": 1.0269445180892944, + "learning_rate": 9.99767210741413e-06, + "loss": 1.0131, + "step": 452 + }, + { + "epoch": 0.024932577467114317, + "grad_norm": 0.9780508279800415, + "learning_rate": 9.99765886299758e-06, + "loss": 0.9897, + "step": 453 + }, + { + "epoch": 0.024987616269469975, + "grad_norm": 0.998332679271698, + "learning_rate": 9.997645581019965e-06, + "loss": 0.9647, + "step": 454 + }, + { + "epoch": 0.025042655071825636, + "grad_norm": 1.7062602043151855, + "learning_rate": 9.997632261481383e-06, + "loss": 1.0729, + "step": 455 + }, + { + "epoch": 0.025097693874181298, + "grad_norm": 0.9793694615364075, + "learning_rate": 9.997618904381936e-06, + "loss": 0.9556, + "step": 456 + }, + { + "epoch": 0.02515273267653696, + "grad_norm": 1.0183895826339722, + "learning_rate": 9.997605509721721e-06, + "loss": 0.9194, + "step": 457 + }, + { + "epoch": 0.02520777147889262, + "grad_norm": 1.0288400650024414, + "learning_rate": 9.997592077500844e-06, + "loss": 0.955, + "step": 458 + }, + { + "epoch": 0.025262810281248282, + "grad_norm": 0.9551253914833069, + "learning_rate": 9.997578607719401e-06, + "loss": 0.8498, + "step": 459 + }, + { + "epoch": 0.02531784908360394, + "grad_norm": 0.9648008942604065, + "learning_rate": 9.997565100377494e-06, + "loss": 0.9306, + "step": 460 + }, + { + "epoch": 0.0253728878859596, + "grad_norm": 0.9206677675247192, + "learning_rate": 9.997551555475225e-06, + "loss": 0.7874, + "step": 461 + }, + { + "epoch": 0.025427926688315262, + "grad_norm": 1.0479545593261719, + "learning_rate": 9.997537973012698e-06, + "loss": 0.9201, + "step": 462 + }, + { + "epoch": 0.025482965490670924, + "grad_norm": 1.0329946279525757, + "learning_rate": 9.997524352990013e-06, + "loss": 0.9577, + "step": 463 + }, + { + "epoch": 0.025538004293026585, + "grad_norm": 1.1177828311920166, + "learning_rate": 9.997510695407273e-06, + "loss": 1.0041, + "step": 464 + }, + { + "epoch": 0.025593043095382243, + "grad_norm": 1.0351577997207642, + "learning_rate": 9.99749700026458e-06, + "loss": 0.9952, + "step": 465 + }, + { + "epoch": 0.025648081897737905, + "grad_norm": 0.905274510383606, + "learning_rate": 9.997483267562035e-06, + "loss": 0.8185, + "step": 466 + }, + { + "epoch": 0.025703120700093566, + "grad_norm": 1.0749776363372803, + "learning_rate": 9.997469497299747e-06, + "loss": 1.0611, + "step": 467 + }, + { + "epoch": 0.025758159502449227, + "grad_norm": 0.8972223401069641, + "learning_rate": 9.997455689477815e-06, + "loss": 0.8994, + "step": 468 + }, + { + "epoch": 0.02581319830480489, + "grad_norm": 1.0669914484024048, + "learning_rate": 9.997441844096342e-06, + "loss": 1.06, + "step": 469 + }, + { + "epoch": 0.025868237107160547, + "grad_norm": 1.0431914329528809, + "learning_rate": 9.997427961155435e-06, + "loss": 0.8657, + "step": 470 + }, + { + "epoch": 0.025923275909516208, + "grad_norm": 0.9609962701797485, + "learning_rate": 9.997414040655198e-06, + "loss": 0.8864, + "step": 471 + }, + { + "epoch": 0.02597831471187187, + "grad_norm": 1.0829721689224243, + "learning_rate": 9.997400082595735e-06, + "loss": 0.9221, + "step": 472 + }, + { + "epoch": 0.02603335351422753, + "grad_norm": 0.992082953453064, + "learning_rate": 9.99738608697715e-06, + "loss": 0.8455, + "step": 473 + }, + { + "epoch": 0.026088392316583192, + "grad_norm": 1.0486301183700562, + "learning_rate": 9.997372053799547e-06, + "loss": 0.8729, + "step": 474 + }, + { + "epoch": 0.026143431118938854, + "grad_norm": 1.0328491926193237, + "learning_rate": 9.997357983063036e-06, + "loss": 0.8788, + "step": 475 + }, + { + "epoch": 0.02619846992129451, + "grad_norm": 0.963333249092102, + "learning_rate": 9.997343874767719e-06, + "loss": 0.892, + "step": 476 + }, + { + "epoch": 0.026253508723650173, + "grad_norm": 1.1606497764587402, + "learning_rate": 9.997329728913704e-06, + "loss": 0.9984, + "step": 477 + }, + { + "epoch": 0.026308547526005834, + "grad_norm": 1.241650104522705, + "learning_rate": 9.997315545501096e-06, + "loss": 0.946, + "step": 478 + }, + { + "epoch": 0.026363586328361496, + "grad_norm": 1.008004069328308, + "learning_rate": 9.99730132453e-06, + "loss": 0.849, + "step": 479 + }, + { + "epoch": 0.026418625130717157, + "grad_norm": 0.9883478879928589, + "learning_rate": 9.997287066000527e-06, + "loss": 0.9478, + "step": 480 + }, + { + "epoch": 0.026473663933072815, + "grad_norm": 1.0224446058273315, + "learning_rate": 9.997272769912783e-06, + "loss": 1.0318, + "step": 481 + }, + { + "epoch": 0.026528702735428476, + "grad_norm": 0.9412569403648376, + "learning_rate": 9.997258436266874e-06, + "loss": 0.9119, + "step": 482 + }, + { + "epoch": 0.026583741537784138, + "grad_norm": 0.9214537739753723, + "learning_rate": 9.997244065062906e-06, + "loss": 0.8785, + "step": 483 + }, + { + "epoch": 0.0266387803401398, + "grad_norm": 1.0015628337860107, + "learning_rate": 9.997229656300991e-06, + "loss": 0.8869, + "step": 484 + }, + { + "epoch": 0.02669381914249546, + "grad_norm": 0.8965190052986145, + "learning_rate": 9.997215209981237e-06, + "loss": 0.7009, + "step": 485 + }, + { + "epoch": 0.02674885794485112, + "grad_norm": 1.1976135969161987, + "learning_rate": 9.997200726103749e-06, + "loss": 0.9795, + "step": 486 + }, + { + "epoch": 0.02680389674720678, + "grad_norm": 0.864780843257904, + "learning_rate": 9.997186204668639e-06, + "loss": 0.7687, + "step": 487 + }, + { + "epoch": 0.02685893554956244, + "grad_norm": 0.9946566820144653, + "learning_rate": 9.997171645676013e-06, + "loss": 0.9672, + "step": 488 + }, + { + "epoch": 0.026913974351918103, + "grad_norm": 1.043835997581482, + "learning_rate": 9.997157049125985e-06, + "loss": 0.862, + "step": 489 + }, + { + "epoch": 0.026969013154273764, + "grad_norm": 0.9697456955909729, + "learning_rate": 9.99714241501866e-06, + "loss": 0.8368, + "step": 490 + }, + { + "epoch": 0.027024051956629425, + "grad_norm": 0.9975618124008179, + "learning_rate": 9.997127743354153e-06, + "loss": 0.8739, + "step": 491 + }, + { + "epoch": 0.027079090758985083, + "grad_norm": 1.0055313110351562, + "learning_rate": 9.99711303413257e-06, + "loss": 0.9227, + "step": 492 + }, + { + "epoch": 0.027134129561340745, + "grad_norm": 1.0418384075164795, + "learning_rate": 9.997098287354024e-06, + "loss": 0.9978, + "step": 493 + }, + { + "epoch": 0.027189168363696406, + "grad_norm": 0.8648970723152161, + "learning_rate": 9.997083503018625e-06, + "loss": 0.8363, + "step": 494 + }, + { + "epoch": 0.027244207166052067, + "grad_norm": 1.13506019115448, + "learning_rate": 9.997068681126483e-06, + "loss": 0.8851, + "step": 495 + }, + { + "epoch": 0.02729924596840773, + "grad_norm": 0.974400520324707, + "learning_rate": 9.997053821677712e-06, + "loss": 0.8533, + "step": 496 + }, + { + "epoch": 0.027354284770763387, + "grad_norm": 1.226507544517517, + "learning_rate": 9.997038924672419e-06, + "loss": 0.8586, + "step": 497 + }, + { + "epoch": 0.027409323573119048, + "grad_norm": 1.004753589630127, + "learning_rate": 9.997023990110721e-06, + "loss": 0.8974, + "step": 498 + }, + { + "epoch": 0.02746436237547471, + "grad_norm": 1.0492571592330933, + "learning_rate": 9.997009017992729e-06, + "loss": 0.8457, + "step": 499 + }, + { + "epoch": 0.02751940117783037, + "grad_norm": 1.0068167448043823, + "learning_rate": 9.996994008318554e-06, + "loss": 0.9608, + "step": 500 + }, + { + "epoch": 0.027574439980186032, + "grad_norm": 0.9686044454574585, + "learning_rate": 9.996978961088311e-06, + "loss": 0.9041, + "step": 501 + }, + { + "epoch": 0.027629478782541694, + "grad_norm": 1.281728744506836, + "learning_rate": 9.99696387630211e-06, + "loss": 0.9739, + "step": 502 + }, + { + "epoch": 0.02768451758489735, + "grad_norm": 0.9069758653640747, + "learning_rate": 9.996948753960065e-06, + "loss": 0.8467, + "step": 503 + }, + { + "epoch": 0.027739556387253013, + "grad_norm": 1.0337222814559937, + "learning_rate": 9.996933594062293e-06, + "loss": 0.9638, + "step": 504 + }, + { + "epoch": 0.027794595189608674, + "grad_norm": 0.9695359468460083, + "learning_rate": 9.996918396608905e-06, + "loss": 0.8986, + "step": 505 + }, + { + "epoch": 0.027849633991964336, + "grad_norm": 0.9120615124702454, + "learning_rate": 9.996903161600016e-06, + "loss": 0.9103, + "step": 506 + }, + { + "epoch": 0.027904672794319997, + "grad_norm": 0.9736546874046326, + "learning_rate": 9.996887889035741e-06, + "loss": 0.9308, + "step": 507 + }, + { + "epoch": 0.027959711596675655, + "grad_norm": 1.0184897184371948, + "learning_rate": 9.996872578916192e-06, + "loss": 0.8978, + "step": 508 + }, + { + "epoch": 0.028014750399031316, + "grad_norm": 0.9791838526725769, + "learning_rate": 9.996857231241489e-06, + "loss": 0.8639, + "step": 509 + }, + { + "epoch": 0.028069789201386978, + "grad_norm": 1.2985681295394897, + "learning_rate": 9.996841846011742e-06, + "loss": 0.9581, + "step": 510 + }, + { + "epoch": 0.02812482800374264, + "grad_norm": 1.0647368431091309, + "learning_rate": 9.996826423227071e-06, + "loss": 1.0565, + "step": 511 + }, + { + "epoch": 0.0281798668060983, + "grad_norm": 1.0336421728134155, + "learning_rate": 9.996810962887591e-06, + "loss": 1.008, + "step": 512 + }, + { + "epoch": 0.02823490560845396, + "grad_norm": 1.1838933229446411, + "learning_rate": 9.996795464993416e-06, + "loss": 0.8359, + "step": 513 + }, + { + "epoch": 0.02828994441080962, + "grad_norm": 0.9898360371589661, + "learning_rate": 9.996779929544663e-06, + "loss": 0.8501, + "step": 514 + }, + { + "epoch": 0.02834498321316528, + "grad_norm": 0.9836066365242004, + "learning_rate": 9.99676435654145e-06, + "loss": 0.8795, + "step": 515 + }, + { + "epoch": 0.028400022015520943, + "grad_norm": 1.0621601343154907, + "learning_rate": 9.996748745983895e-06, + "loss": 0.8746, + "step": 516 + }, + { + "epoch": 0.028455060817876604, + "grad_norm": 1.0082437992095947, + "learning_rate": 9.996733097872113e-06, + "loss": 0.9278, + "step": 517 + }, + { + "epoch": 0.028510099620232265, + "grad_norm": 0.9903931617736816, + "learning_rate": 9.996717412206222e-06, + "loss": 0.8264, + "step": 518 + }, + { + "epoch": 0.028565138422587923, + "grad_norm": 1.0797243118286133, + "learning_rate": 9.996701688986342e-06, + "loss": 1.0077, + "step": 519 + }, + { + "epoch": 0.028620177224943585, + "grad_norm": 1.147133231163025, + "learning_rate": 9.99668592821259e-06, + "loss": 0.9374, + "step": 520 + }, + { + "epoch": 0.028675216027299246, + "grad_norm": 0.9993947744369507, + "learning_rate": 9.996670129885082e-06, + "loss": 0.9562, + "step": 521 + }, + { + "epoch": 0.028730254829654907, + "grad_norm": 0.8580895066261292, + "learning_rate": 9.99665429400394e-06, + "loss": 0.7985, + "step": 522 + }, + { + "epoch": 0.02878529363201057, + "grad_norm": 0.9251388907432556, + "learning_rate": 9.996638420569281e-06, + "loss": 0.7323, + "step": 523 + }, + { + "epoch": 0.028840332434366227, + "grad_norm": 1.0010193586349487, + "learning_rate": 9.996622509581227e-06, + "loss": 0.9316, + "step": 524 + }, + { + "epoch": 0.028895371236721888, + "grad_norm": 0.9822579026222229, + "learning_rate": 9.996606561039894e-06, + "loss": 0.8978, + "step": 525 + }, + { + "epoch": 0.02895041003907755, + "grad_norm": 1.0760595798492432, + "learning_rate": 9.996590574945403e-06, + "loss": 0.9125, + "step": 526 + }, + { + "epoch": 0.02900544884143321, + "grad_norm": 1.138869285583496, + "learning_rate": 9.996574551297876e-06, + "loss": 0.8185, + "step": 527 + }, + { + "epoch": 0.029060487643788872, + "grad_norm": 1.002994179725647, + "learning_rate": 9.996558490097433e-06, + "loss": 0.9404, + "step": 528 + }, + { + "epoch": 0.02911552644614453, + "grad_norm": 0.9550611972808838, + "learning_rate": 9.996542391344194e-06, + "loss": 0.859, + "step": 529 + }, + { + "epoch": 0.02917056524850019, + "grad_norm": 0.9236055612564087, + "learning_rate": 9.996526255038277e-06, + "loss": 0.7758, + "step": 530 + }, + { + "epoch": 0.029225604050855853, + "grad_norm": 1.103966474533081, + "learning_rate": 9.996510081179808e-06, + "loss": 1.0147, + "step": 531 + }, + { + "epoch": 0.029280642853211514, + "grad_norm": 0.9884665012359619, + "learning_rate": 9.996493869768906e-06, + "loss": 0.8784, + "step": 532 + }, + { + "epoch": 0.029335681655567176, + "grad_norm": 0.9173223376274109, + "learning_rate": 9.996477620805694e-06, + "loss": 0.8741, + "step": 533 + }, + { + "epoch": 0.029390720457922837, + "grad_norm": 0.965548574924469, + "learning_rate": 9.996461334290294e-06, + "loss": 0.8989, + "step": 534 + }, + { + "epoch": 0.029445759260278495, + "grad_norm": 0.9939296245574951, + "learning_rate": 9.996445010222828e-06, + "loss": 0.8552, + "step": 535 + }, + { + "epoch": 0.029500798062634156, + "grad_norm": 1.0081578493118286, + "learning_rate": 9.996428648603417e-06, + "loss": 0.9138, + "step": 536 + }, + { + "epoch": 0.029555836864989818, + "grad_norm": 1.0139487981796265, + "learning_rate": 9.996412249432188e-06, + "loss": 0.9452, + "step": 537 + }, + { + "epoch": 0.02961087566734548, + "grad_norm": 0.9463647603988647, + "learning_rate": 9.996395812709262e-06, + "loss": 0.8721, + "step": 538 + }, + { + "epoch": 0.02966591446970114, + "grad_norm": 0.9981473684310913, + "learning_rate": 9.99637933843476e-06, + "loss": 0.7791, + "step": 539 + }, + { + "epoch": 0.0297209532720568, + "grad_norm": 1.1637190580368042, + "learning_rate": 9.996362826608812e-06, + "loss": 0.8798, + "step": 540 + }, + { + "epoch": 0.02977599207441246, + "grad_norm": 2.2887051105499268, + "learning_rate": 9.996346277231536e-06, + "loss": 0.9303, + "step": 541 + }, + { + "epoch": 0.02983103087676812, + "grad_norm": 0.9173391461372375, + "learning_rate": 9.99632969030306e-06, + "loss": 0.8627, + "step": 542 + }, + { + "epoch": 0.029886069679123783, + "grad_norm": 1.033355474472046, + "learning_rate": 9.996313065823506e-06, + "loss": 0.9906, + "step": 543 + }, + { + "epoch": 0.029941108481479444, + "grad_norm": 0.9286639094352722, + "learning_rate": 9.996296403793002e-06, + "loss": 0.7043, + "step": 544 + }, + { + "epoch": 0.029996147283835102, + "grad_norm": 0.963238000869751, + "learning_rate": 9.996279704211671e-06, + "loss": 1.0236, + "step": 545 + }, + { + "epoch": 0.030051186086190763, + "grad_norm": 1.0275089740753174, + "learning_rate": 9.99626296707964e-06, + "loss": 0.976, + "step": 546 + }, + { + "epoch": 0.030106224888546425, + "grad_norm": 1.0944674015045166, + "learning_rate": 9.996246192397032e-06, + "loss": 0.9209, + "step": 547 + }, + { + "epoch": 0.030161263690902086, + "grad_norm": 0.9620945453643799, + "learning_rate": 9.996229380163976e-06, + "loss": 0.8973, + "step": 548 + }, + { + "epoch": 0.030216302493257748, + "grad_norm": 1.032549500465393, + "learning_rate": 9.996212530380597e-06, + "loss": 0.892, + "step": 549 + }, + { + "epoch": 0.03027134129561341, + "grad_norm": 1.0433719158172607, + "learning_rate": 9.996195643047023e-06, + "loss": 0.8428, + "step": 550 + }, + { + "epoch": 0.030326380097969067, + "grad_norm": 1.1541085243225098, + "learning_rate": 9.996178718163378e-06, + "loss": 0.9084, + "step": 551 + }, + { + "epoch": 0.03038141890032473, + "grad_norm": 0.9386873245239258, + "learning_rate": 9.996161755729793e-06, + "loss": 0.9246, + "step": 552 + }, + { + "epoch": 0.03043645770268039, + "grad_norm": 1.092236042022705, + "learning_rate": 9.996144755746393e-06, + "loss": 0.8419, + "step": 553 + }, + { + "epoch": 0.03049149650503605, + "grad_norm": 0.9517606496810913, + "learning_rate": 9.996127718213306e-06, + "loss": 0.9002, + "step": 554 + }, + { + "epoch": 0.030546535307391712, + "grad_norm": 0.965972900390625, + "learning_rate": 9.996110643130661e-06, + "loss": 0.9197, + "step": 555 + }, + { + "epoch": 0.03060157410974737, + "grad_norm": 0.9396095275878906, + "learning_rate": 9.996093530498586e-06, + "loss": 0.8686, + "step": 556 + }, + { + "epoch": 0.030656612912103032, + "grad_norm": 1.0154120922088623, + "learning_rate": 9.99607638031721e-06, + "loss": 0.9773, + "step": 557 + }, + { + "epoch": 0.030711651714458693, + "grad_norm": 1.3572301864624023, + "learning_rate": 9.99605919258666e-06, + "loss": 0.911, + "step": 558 + }, + { + "epoch": 0.030766690516814355, + "grad_norm": 0.968278169631958, + "learning_rate": 9.996041967307066e-06, + "loss": 0.7704, + "step": 559 + }, + { + "epoch": 0.030821729319170016, + "grad_norm": 0.9867869019508362, + "learning_rate": 9.99602470447856e-06, + "loss": 0.873, + "step": 560 + }, + { + "epoch": 0.030876768121525677, + "grad_norm": 1.056450605392456, + "learning_rate": 9.996007404101269e-06, + "loss": 0.941, + "step": 561 + }, + { + "epoch": 0.030931806923881335, + "grad_norm": 1.0419799089431763, + "learning_rate": 9.995990066175321e-06, + "loss": 0.957, + "step": 562 + }, + { + "epoch": 0.030986845726236997, + "grad_norm": 0.9789314866065979, + "learning_rate": 9.995972690700852e-06, + "loss": 0.9229, + "step": 563 + }, + { + "epoch": 0.031041884528592658, + "grad_norm": 0.917783796787262, + "learning_rate": 9.995955277677989e-06, + "loss": 0.8186, + "step": 564 + }, + { + "epoch": 0.03109692333094832, + "grad_norm": 1.0231432914733887, + "learning_rate": 9.995937827106863e-06, + "loss": 0.8624, + "step": 565 + }, + { + "epoch": 0.03115196213330398, + "grad_norm": 0.9552083015441895, + "learning_rate": 9.995920338987605e-06, + "loss": 0.7967, + "step": 566 + }, + { + "epoch": 0.03120700093565964, + "grad_norm": 0.9441083669662476, + "learning_rate": 9.995902813320349e-06, + "loss": 0.8471, + "step": 567 + }, + { + "epoch": 0.0312620397380153, + "grad_norm": 1.0025299787521362, + "learning_rate": 9.995885250105223e-06, + "loss": 0.8646, + "step": 568 + }, + { + "epoch": 0.03131707854037096, + "grad_norm": 0.8997280597686768, + "learning_rate": 9.99586764934236e-06, + "loss": 0.8736, + "step": 569 + }, + { + "epoch": 0.03137211734272662, + "grad_norm": 0.9090663194656372, + "learning_rate": 9.995850011031896e-06, + "loss": 0.8548, + "step": 570 + }, + { + "epoch": 0.031427156145082284, + "grad_norm": 0.9641294479370117, + "learning_rate": 9.995832335173959e-06, + "loss": 0.8667, + "step": 571 + }, + { + "epoch": 0.031482194947437946, + "grad_norm": 0.9165804982185364, + "learning_rate": 9.995814621768682e-06, + "loss": 0.803, + "step": 572 + }, + { + "epoch": 0.03153723374979361, + "grad_norm": 0.9672492742538452, + "learning_rate": 9.995796870816202e-06, + "loss": 0.8335, + "step": 573 + }, + { + "epoch": 0.03159227255214927, + "grad_norm": 0.9359404444694519, + "learning_rate": 9.995779082316648e-06, + "loss": 0.8294, + "step": 574 + }, + { + "epoch": 0.03164731135450492, + "grad_norm": 0.926925003528595, + "learning_rate": 9.995761256270157e-06, + "loss": 0.7714, + "step": 575 + }, + { + "epoch": 0.031702350156860584, + "grad_norm": 1.1848629713058472, + "learning_rate": 9.995743392676862e-06, + "loss": 0.8925, + "step": 576 + }, + { + "epoch": 0.031757388959216246, + "grad_norm": 0.9624786972999573, + "learning_rate": 9.995725491536897e-06, + "loss": 0.9292, + "step": 577 + }, + { + "epoch": 0.03181242776157191, + "grad_norm": 0.9479736089706421, + "learning_rate": 9.995707552850396e-06, + "loss": 0.8797, + "step": 578 + }, + { + "epoch": 0.03186746656392757, + "grad_norm": 0.9551546573638916, + "learning_rate": 9.995689576617494e-06, + "loss": 0.8793, + "step": 579 + }, + { + "epoch": 0.03192250536628323, + "grad_norm": 0.9210056662559509, + "learning_rate": 9.995671562838325e-06, + "loss": 0.9714, + "step": 580 + }, + { + "epoch": 0.03197754416863889, + "grad_norm": 1.063117504119873, + "learning_rate": 9.995653511513029e-06, + "loss": 0.9608, + "step": 581 + }, + { + "epoch": 0.03203258297099455, + "grad_norm": 0.9426459670066833, + "learning_rate": 9.995635422641736e-06, + "loss": 0.9102, + "step": 582 + }, + { + "epoch": 0.032087621773350214, + "grad_norm": 1.0176693201065063, + "learning_rate": 9.995617296224584e-06, + "loss": 0.9109, + "step": 583 + }, + { + "epoch": 0.032142660575705875, + "grad_norm": 0.9457042217254639, + "learning_rate": 9.995599132261711e-06, + "loss": 0.9017, + "step": 584 + }, + { + "epoch": 0.03219769937806154, + "grad_norm": 1.5851638317108154, + "learning_rate": 9.995580930753252e-06, + "loss": 0.967, + "step": 585 + }, + { + "epoch": 0.03225273818041719, + "grad_norm": 0.9961487054824829, + "learning_rate": 9.995562691699345e-06, + "loss": 0.9396, + "step": 586 + }, + { + "epoch": 0.03230777698277285, + "grad_norm": 0.9892112016677856, + "learning_rate": 9.995544415100125e-06, + "loss": 0.9058, + "step": 587 + }, + { + "epoch": 0.032362815785128514, + "grad_norm": 0.9052272439002991, + "learning_rate": 9.99552610095573e-06, + "loss": 0.9194, + "step": 588 + }, + { + "epoch": 0.032417854587484175, + "grad_norm": 0.8381399512290955, + "learning_rate": 9.995507749266297e-06, + "loss": 0.7465, + "step": 589 + }, + { + "epoch": 0.03247289338983984, + "grad_norm": 1.018964171409607, + "learning_rate": 9.995489360031969e-06, + "loss": 0.841, + "step": 590 + }, + { + "epoch": 0.0325279321921955, + "grad_norm": 0.908311128616333, + "learning_rate": 9.995470933252876e-06, + "loss": 0.8592, + "step": 591 + }, + { + "epoch": 0.03258297099455116, + "grad_norm": 1.2986040115356445, + "learning_rate": 9.995452468929162e-06, + "loss": 0.8341, + "step": 592 + }, + { + "epoch": 0.03263800979690682, + "grad_norm": 1.6565190553665161, + "learning_rate": 9.995433967060966e-06, + "loss": 0.8681, + "step": 593 + }, + { + "epoch": 0.03269304859926248, + "grad_norm": 0.9725674390792847, + "learning_rate": 9.995415427648423e-06, + "loss": 0.8449, + "step": 594 + }, + { + "epoch": 0.032748087401618144, + "grad_norm": 0.8683852553367615, + "learning_rate": 9.995396850691677e-06, + "loss": 0.8478, + "step": 595 + }, + { + "epoch": 0.0328031262039738, + "grad_norm": 0.9912856817245483, + "learning_rate": 9.995378236190862e-06, + "loss": 0.8912, + "step": 596 + }, + { + "epoch": 0.03285816500632946, + "grad_norm": 0.9396800398826599, + "learning_rate": 9.995359584146125e-06, + "loss": 0.856, + "step": 597 + }, + { + "epoch": 0.03291320380868512, + "grad_norm": 1.385006308555603, + "learning_rate": 9.995340894557601e-06, + "loss": 0.9633, + "step": 598 + }, + { + "epoch": 0.03296824261104078, + "grad_norm": 0.8982875943183899, + "learning_rate": 9.995322167425433e-06, + "loss": 0.9244, + "step": 599 + }, + { + "epoch": 0.033023281413396444, + "grad_norm": 0.8981022834777832, + "learning_rate": 9.995303402749759e-06, + "loss": 0.8854, + "step": 600 + }, + { + "epoch": 0.033078320215752105, + "grad_norm": 0.9917197227478027, + "learning_rate": 9.995284600530724e-06, + "loss": 1.0086, + "step": 601 + }, + { + "epoch": 0.033133359018107766, + "grad_norm": 1.0540626049041748, + "learning_rate": 9.995265760768464e-06, + "loss": 1.0022, + "step": 602 + }, + { + "epoch": 0.03318839782046343, + "grad_norm": 0.9523479342460632, + "learning_rate": 9.995246883463126e-06, + "loss": 0.9893, + "step": 603 + }, + { + "epoch": 0.03324343662281909, + "grad_norm": 0.9824770092964172, + "learning_rate": 9.99522796861485e-06, + "loss": 0.8385, + "step": 604 + }, + { + "epoch": 0.03329847542517475, + "grad_norm": 1.0968893766403198, + "learning_rate": 9.995209016223776e-06, + "loss": 1.0109, + "step": 605 + }, + { + "epoch": 0.03335351422753041, + "grad_norm": 0.9115625023841858, + "learning_rate": 9.995190026290049e-06, + "loss": 0.8656, + "step": 606 + }, + { + "epoch": 0.033408553029886066, + "grad_norm": 0.9795814156532288, + "learning_rate": 9.99517099881381e-06, + "loss": 0.8941, + "step": 607 + }, + { + "epoch": 0.03346359183224173, + "grad_norm": 0.9317291378974915, + "learning_rate": 9.995151933795204e-06, + "loss": 0.7819, + "step": 608 + }, + { + "epoch": 0.03351863063459739, + "grad_norm": 0.9936283230781555, + "learning_rate": 9.995132831234373e-06, + "loss": 0.8674, + "step": 609 + }, + { + "epoch": 0.03357366943695305, + "grad_norm": 0.9872812032699585, + "learning_rate": 9.995113691131462e-06, + "loss": 0.9038, + "step": 610 + }, + { + "epoch": 0.03362870823930871, + "grad_norm": 0.9516895413398743, + "learning_rate": 9.995094513486611e-06, + "loss": 0.9038, + "step": 611 + }, + { + "epoch": 0.03368374704166437, + "grad_norm": 1.090579867362976, + "learning_rate": 9.995075298299968e-06, + "loss": 0.9587, + "step": 612 + }, + { + "epoch": 0.033738785844020035, + "grad_norm": 1.021398663520813, + "learning_rate": 9.995056045571677e-06, + "loss": 0.9569, + "step": 613 + }, + { + "epoch": 0.033793824646375696, + "grad_norm": 1.009657382965088, + "learning_rate": 9.99503675530188e-06, + "loss": 0.8346, + "step": 614 + }, + { + "epoch": 0.03384886344873136, + "grad_norm": 1.0478712320327759, + "learning_rate": 9.995017427490725e-06, + "loss": 1.0566, + "step": 615 + }, + { + "epoch": 0.03390390225108702, + "grad_norm": 1.1391830444335938, + "learning_rate": 9.994998062138355e-06, + "loss": 1.0727, + "step": 616 + }, + { + "epoch": 0.03395894105344268, + "grad_norm": 1.0172302722930908, + "learning_rate": 9.994978659244918e-06, + "loss": 0.7869, + "step": 617 + }, + { + "epoch": 0.034013979855798335, + "grad_norm": 1.0532630681991577, + "learning_rate": 9.994959218810558e-06, + "loss": 0.8626, + "step": 618 + }, + { + "epoch": 0.034069018658153996, + "grad_norm": 0.8300478458404541, + "learning_rate": 9.99493974083542e-06, + "loss": 0.8166, + "step": 619 + }, + { + "epoch": 0.03412405746050966, + "grad_norm": 1.0613664388656616, + "learning_rate": 9.994920225319656e-06, + "loss": 0.8899, + "step": 620 + }, + { + "epoch": 0.03417909626286532, + "grad_norm": 0.9827042818069458, + "learning_rate": 9.994900672263406e-06, + "loss": 0.8243, + "step": 621 + }, + { + "epoch": 0.03423413506522098, + "grad_norm": 0.8790082931518555, + "learning_rate": 9.994881081666818e-06, + "loss": 0.8153, + "step": 622 + }, + { + "epoch": 0.03428917386757664, + "grad_norm": 1.033378005027771, + "learning_rate": 9.994861453530044e-06, + "loss": 0.8916, + "step": 623 + }, + { + "epoch": 0.0343442126699323, + "grad_norm": 0.9547238349914551, + "learning_rate": 9.994841787853227e-06, + "loss": 0.9141, + "step": 624 + }, + { + "epoch": 0.034399251472287964, + "grad_norm": 0.9606438279151917, + "learning_rate": 9.994822084636514e-06, + "loss": 0.9435, + "step": 625 + }, + { + "epoch": 0.034454290274643626, + "grad_norm": 0.8461503982543945, + "learning_rate": 9.994802343880059e-06, + "loss": 0.7914, + "step": 626 + }, + { + "epoch": 0.03450932907699929, + "grad_norm": 1.144538402557373, + "learning_rate": 9.994782565584004e-06, + "loss": 0.8025, + "step": 627 + }, + { + "epoch": 0.03456436787935495, + "grad_norm": 1.0099962949752808, + "learning_rate": 9.994762749748502e-06, + "loss": 0.9607, + "step": 628 + }, + { + "epoch": 0.0346194066817106, + "grad_norm": 0.9822041988372803, + "learning_rate": 9.9947428963737e-06, + "loss": 0.9216, + "step": 629 + }, + { + "epoch": 0.034674445484066264, + "grad_norm": 0.9056866765022278, + "learning_rate": 9.994723005459746e-06, + "loss": 0.7913, + "step": 630 + }, + { + "epoch": 0.034729484286421926, + "grad_norm": 1.0099287033081055, + "learning_rate": 9.994703077006792e-06, + "loss": 0.9937, + "step": 631 + }, + { + "epoch": 0.03478452308877759, + "grad_norm": 0.9559167623519897, + "learning_rate": 9.994683111014984e-06, + "loss": 0.9774, + "step": 632 + }, + { + "epoch": 0.03483956189113325, + "grad_norm": 1.0359059572219849, + "learning_rate": 9.994663107484478e-06, + "loss": 0.9062, + "step": 633 + }, + { + "epoch": 0.03489460069348891, + "grad_norm": 0.8803057074546814, + "learning_rate": 9.99464306641542e-06, + "loss": 0.9638, + "step": 634 + }, + { + "epoch": 0.03494963949584457, + "grad_norm": 1.0926579236984253, + "learning_rate": 9.994622987807962e-06, + "loss": 1.0467, + "step": 635 + }, + { + "epoch": 0.03500467829820023, + "grad_norm": 1.0051401853561401, + "learning_rate": 9.994602871662253e-06, + "loss": 0.8717, + "step": 636 + }, + { + "epoch": 0.035059717100555894, + "grad_norm": 1.2007508277893066, + "learning_rate": 9.994582717978448e-06, + "loss": 0.8004, + "step": 637 + }, + { + "epoch": 0.035114755902911556, + "grad_norm": 0.8826266527175903, + "learning_rate": 9.994562526756695e-06, + "loss": 0.8888, + "step": 638 + }, + { + "epoch": 0.03516979470526721, + "grad_norm": 0.9953717589378357, + "learning_rate": 9.994542297997147e-06, + "loss": 0.8999, + "step": 639 + }, + { + "epoch": 0.03522483350762287, + "grad_norm": 1.0203614234924316, + "learning_rate": 9.994522031699958e-06, + "loss": 0.8241, + "step": 640 + }, + { + "epoch": 0.03527987230997853, + "grad_norm": 0.8760203719139099, + "learning_rate": 9.994501727865276e-06, + "loss": 0.7893, + "step": 641 + }, + { + "epoch": 0.035334911112334194, + "grad_norm": 1.024888277053833, + "learning_rate": 9.994481386493257e-06, + "loss": 0.9865, + "step": 642 + }, + { + "epoch": 0.035389949914689856, + "grad_norm": 0.907454788684845, + "learning_rate": 9.994461007584052e-06, + "loss": 0.891, + "step": 643 + }, + { + "epoch": 0.03544498871704552, + "grad_norm": 1.0400965213775635, + "learning_rate": 9.994440591137816e-06, + "loss": 0.9345, + "step": 644 + }, + { + "epoch": 0.03550002751940118, + "grad_norm": 0.9816616177558899, + "learning_rate": 9.9944201371547e-06, + "loss": 0.91, + "step": 645 + }, + { + "epoch": 0.03555506632175684, + "grad_norm": 1.0528117418289185, + "learning_rate": 9.99439964563486e-06, + "loss": 0.952, + "step": 646 + }, + { + "epoch": 0.0356101051241125, + "grad_norm": 0.9802080988883972, + "learning_rate": 9.99437911657845e-06, + "loss": 0.9392, + "step": 647 + }, + { + "epoch": 0.03566514392646816, + "grad_norm": 0.9580393433570862, + "learning_rate": 9.994358549985623e-06, + "loss": 0.874, + "step": 648 + }, + { + "epoch": 0.035720182728823824, + "grad_norm": 0.8935576677322388, + "learning_rate": 9.994337945856533e-06, + "loss": 0.8435, + "step": 649 + }, + { + "epoch": 0.03577522153117948, + "grad_norm": 1.009699821472168, + "learning_rate": 9.994317304191337e-06, + "loss": 0.9436, + "step": 650 + }, + { + "epoch": 0.03583026033353514, + "grad_norm": 0.9126121401786804, + "learning_rate": 9.994296624990188e-06, + "loss": 0.8424, + "step": 651 + }, + { + "epoch": 0.0358852991358908, + "grad_norm": 0.9555553197860718, + "learning_rate": 9.994275908253243e-06, + "loss": 0.93, + "step": 652 + }, + { + "epoch": 0.03594033793824646, + "grad_norm": 0.8359857797622681, + "learning_rate": 9.994255153980658e-06, + "loss": 0.6326, + "step": 653 + }, + { + "epoch": 0.035995376740602124, + "grad_norm": 0.8918783664703369, + "learning_rate": 9.994234362172587e-06, + "loss": 0.8287, + "step": 654 + }, + { + "epoch": 0.036050415542957785, + "grad_norm": 0.9878549575805664, + "learning_rate": 9.994213532829188e-06, + "loss": 0.8841, + "step": 655 + }, + { + "epoch": 0.03610545434531345, + "grad_norm": 0.9504040479660034, + "learning_rate": 9.994192665950617e-06, + "loss": 1.0182, + "step": 656 + }, + { + "epoch": 0.03616049314766911, + "grad_norm": 0.9531422257423401, + "learning_rate": 9.99417176153703e-06, + "loss": 0.8504, + "step": 657 + }, + { + "epoch": 0.03621553195002477, + "grad_norm": 0.9580292105674744, + "learning_rate": 9.994150819588587e-06, + "loss": 0.8048, + "step": 658 + }, + { + "epoch": 0.03627057075238043, + "grad_norm": 0.9786819815635681, + "learning_rate": 9.99412984010544e-06, + "loss": 0.9124, + "step": 659 + }, + { + "epoch": 0.03632560955473609, + "grad_norm": 0.9733422994613647, + "learning_rate": 9.994108823087751e-06, + "loss": 0.8868, + "step": 660 + }, + { + "epoch": 0.03638064835709175, + "grad_norm": 1.093173623085022, + "learning_rate": 9.994087768535679e-06, + "loss": 0.9428, + "step": 661 + }, + { + "epoch": 0.03643568715944741, + "grad_norm": 0.9067148566246033, + "learning_rate": 9.994066676449378e-06, + "loss": 0.8838, + "step": 662 + }, + { + "epoch": 0.03649072596180307, + "grad_norm": 0.9509521722793579, + "learning_rate": 9.99404554682901e-06, + "loss": 0.9034, + "step": 663 + }, + { + "epoch": 0.03654576476415873, + "grad_norm": 0.9523824453353882, + "learning_rate": 9.994024379674731e-06, + "loss": 0.9623, + "step": 664 + }, + { + "epoch": 0.03660080356651439, + "grad_norm": 0.987276554107666, + "learning_rate": 9.994003174986703e-06, + "loss": 0.8817, + "step": 665 + }, + { + "epoch": 0.036655842368870054, + "grad_norm": 0.9500744342803955, + "learning_rate": 9.993981932765083e-06, + "loss": 0.9742, + "step": 666 + }, + { + "epoch": 0.036710881171225715, + "grad_norm": 0.9420705437660217, + "learning_rate": 9.993960653010034e-06, + "loss": 0.9657, + "step": 667 + }, + { + "epoch": 0.036765919973581376, + "grad_norm": 0.9443248510360718, + "learning_rate": 9.99393933572171e-06, + "loss": 0.8468, + "step": 668 + }, + { + "epoch": 0.03682095877593704, + "grad_norm": 0.9666558504104614, + "learning_rate": 9.993917980900276e-06, + "loss": 0.9871, + "step": 669 + }, + { + "epoch": 0.0368759975782927, + "grad_norm": 1.0236201286315918, + "learning_rate": 9.993896588545892e-06, + "loss": 0.9814, + "step": 670 + }, + { + "epoch": 0.03693103638064836, + "grad_norm": 1.016190528869629, + "learning_rate": 9.993875158658716e-06, + "loss": 1.0156, + "step": 671 + }, + { + "epoch": 0.036986075183004015, + "grad_norm": 0.9296661019325256, + "learning_rate": 9.993853691238913e-06, + "loss": 0.7956, + "step": 672 + }, + { + "epoch": 0.037041113985359676, + "grad_norm": 0.9276684522628784, + "learning_rate": 9.993832186286643e-06, + "loss": 0.9253, + "step": 673 + }, + { + "epoch": 0.03709615278771534, + "grad_norm": 0.8588787913322449, + "learning_rate": 9.993810643802065e-06, + "loss": 0.7878, + "step": 674 + }, + { + "epoch": 0.037151191590071, + "grad_norm": 0.9955212473869324, + "learning_rate": 9.993789063785344e-06, + "loss": 0.8711, + "step": 675 + }, + { + "epoch": 0.03720623039242666, + "grad_norm": 0.925578236579895, + "learning_rate": 9.993767446236642e-06, + "loss": 0.9431, + "step": 676 + }, + { + "epoch": 0.03726126919478232, + "grad_norm": 0.9610552787780762, + "learning_rate": 9.99374579115612e-06, + "loss": 0.887, + "step": 677 + }, + { + "epoch": 0.03731630799713798, + "grad_norm": 1.0052428245544434, + "learning_rate": 9.99372409854394e-06, + "loss": 0.8751, + "step": 678 + }, + { + "epoch": 0.037371346799493645, + "grad_norm": 0.9503066539764404, + "learning_rate": 9.99370236840027e-06, + "loss": 0.8556, + "step": 679 + }, + { + "epoch": 0.037426385601849306, + "grad_norm": 2.426232099533081, + "learning_rate": 9.993680600725266e-06, + "loss": 0.9077, + "step": 680 + }, + { + "epoch": 0.03748142440420497, + "grad_norm": 0.9119723439216614, + "learning_rate": 9.993658795519096e-06, + "loss": 0.8575, + "step": 681 + }, + { + "epoch": 0.03753646320656062, + "grad_norm": 0.9688286781311035, + "learning_rate": 9.993636952781923e-06, + "loss": 0.8921, + "step": 682 + }, + { + "epoch": 0.03759150200891628, + "grad_norm": 1.030013084411621, + "learning_rate": 9.993615072513913e-06, + "loss": 0.8622, + "step": 683 + }, + { + "epoch": 0.037646540811271945, + "grad_norm": 1.055187463760376, + "learning_rate": 9.993593154715228e-06, + "loss": 0.9251, + "step": 684 + }, + { + "epoch": 0.037701579613627606, + "grad_norm": 1.0518591403961182, + "learning_rate": 9.993571199386032e-06, + "loss": 0.9575, + "step": 685 + }, + { + "epoch": 0.03775661841598327, + "grad_norm": 0.9232666492462158, + "learning_rate": 9.993549206526495e-06, + "loss": 0.8522, + "step": 686 + }, + { + "epoch": 0.03781165721833893, + "grad_norm": 1.0212332010269165, + "learning_rate": 9.993527176136775e-06, + "loss": 0.9358, + "step": 687 + }, + { + "epoch": 0.03786669602069459, + "grad_norm": 0.9137141108512878, + "learning_rate": 9.993505108217045e-06, + "loss": 0.8561, + "step": 688 + }, + { + "epoch": 0.03792173482305025, + "grad_norm": 1.0069375038146973, + "learning_rate": 9.993483002767465e-06, + "loss": 0.8274, + "step": 689 + }, + { + "epoch": 0.03797677362540591, + "grad_norm": 0.9820672869682312, + "learning_rate": 9.993460859788204e-06, + "loss": 0.907, + "step": 690 + }, + { + "epoch": 0.038031812427761574, + "grad_norm": 1.0042002201080322, + "learning_rate": 9.993438679279428e-06, + "loss": 0.9263, + "step": 691 + }, + { + "epoch": 0.038086851230117236, + "grad_norm": 0.9733695983886719, + "learning_rate": 9.993416461241304e-06, + "loss": 0.8455, + "step": 692 + }, + { + "epoch": 0.03814189003247289, + "grad_norm": 0.9106015563011169, + "learning_rate": 9.993394205673996e-06, + "loss": 0.8469, + "step": 693 + }, + { + "epoch": 0.03819692883482855, + "grad_norm": 0.9802660346031189, + "learning_rate": 9.993371912577677e-06, + "loss": 0.8662, + "step": 694 + }, + { + "epoch": 0.03825196763718421, + "grad_norm": 0.9183964729309082, + "learning_rate": 9.99334958195251e-06, + "loss": 0.8968, + "step": 695 + }, + { + "epoch": 0.038307006439539874, + "grad_norm": 0.9572185277938843, + "learning_rate": 9.993327213798663e-06, + "loss": 0.953, + "step": 696 + }, + { + "epoch": 0.038362045241895536, + "grad_norm": 1.4480071067810059, + "learning_rate": 9.993304808116307e-06, + "loss": 1.1131, + "step": 697 + }, + { + "epoch": 0.0384170840442512, + "grad_norm": 0.9297361969947815, + "learning_rate": 9.993282364905607e-06, + "loss": 0.884, + "step": 698 + }, + { + "epoch": 0.03847212284660686, + "grad_norm": 0.9400073885917664, + "learning_rate": 9.993259884166735e-06, + "loss": 0.932, + "step": 699 + }, + { + "epoch": 0.03852716164896252, + "grad_norm": 0.9231798052787781, + "learning_rate": 9.993237365899858e-06, + "loss": 0.8981, + "step": 700 + }, + { + "epoch": 0.03858220045131818, + "grad_norm": 0.8233712911605835, + "learning_rate": 9.993214810105144e-06, + "loss": 0.8218, + "step": 701 + }, + { + "epoch": 0.03863723925367384, + "grad_norm": 1.0997854471206665, + "learning_rate": 9.993192216782768e-06, + "loss": 0.9298, + "step": 702 + }, + { + "epoch": 0.038692278056029504, + "grad_norm": 0.9570802450180054, + "learning_rate": 9.993169585932893e-06, + "loss": 0.7815, + "step": 703 + }, + { + "epoch": 0.03874731685838516, + "grad_norm": 0.9913730025291443, + "learning_rate": 9.993146917555692e-06, + "loss": 0.9621, + "step": 704 + }, + { + "epoch": 0.03880235566074082, + "grad_norm": 1.088767409324646, + "learning_rate": 9.993124211651334e-06, + "loss": 0.9295, + "step": 705 + }, + { + "epoch": 0.03885739446309648, + "grad_norm": 0.8199124336242676, + "learning_rate": 9.993101468219995e-06, + "loss": 0.7613, + "step": 706 + }, + { + "epoch": 0.03891243326545214, + "grad_norm": 1.112566351890564, + "learning_rate": 9.99307868726184e-06, + "loss": 0.791, + "step": 707 + }, + { + "epoch": 0.038967472067807804, + "grad_norm": 0.9372578859329224, + "learning_rate": 9.99305586877704e-06, + "loss": 0.8567, + "step": 708 + }, + { + "epoch": 0.039022510870163465, + "grad_norm": 1.0167721509933472, + "learning_rate": 9.99303301276577e-06, + "loss": 0.9787, + "step": 709 + }, + { + "epoch": 0.03907754967251913, + "grad_norm": 1.3526856899261475, + "learning_rate": 9.993010119228202e-06, + "loss": 1.2215, + "step": 710 + }, + { + "epoch": 0.03913258847487479, + "grad_norm": 0.8819016814231873, + "learning_rate": 9.992987188164505e-06, + "loss": 0.7736, + "step": 711 + }, + { + "epoch": 0.03918762727723045, + "grad_norm": 1.0033677816390991, + "learning_rate": 9.992964219574852e-06, + "loss": 0.9919, + "step": 712 + }, + { + "epoch": 0.03924266607958611, + "grad_norm": 0.894926130771637, + "learning_rate": 9.992941213459417e-06, + "loss": 0.9058, + "step": 713 + }, + { + "epoch": 0.03929770488194177, + "grad_norm": 0.9481377005577087, + "learning_rate": 9.992918169818373e-06, + "loss": 0.8436, + "step": 714 + }, + { + "epoch": 0.03935274368429743, + "grad_norm": 0.9312933087348938, + "learning_rate": 9.992895088651893e-06, + "loss": 0.8869, + "step": 715 + }, + { + "epoch": 0.03940778248665309, + "grad_norm": 0.9765705466270447, + "learning_rate": 9.99287196996015e-06, + "loss": 0.9512, + "step": 716 + }, + { + "epoch": 0.03946282128900875, + "grad_norm": 0.9610235691070557, + "learning_rate": 9.992848813743317e-06, + "loss": 0.8005, + "step": 717 + }, + { + "epoch": 0.03951786009136441, + "grad_norm": 1.102995753288269, + "learning_rate": 9.99282562000157e-06, + "loss": 0.8017, + "step": 718 + }, + { + "epoch": 0.03957289889372007, + "grad_norm": 1.023317575454712, + "learning_rate": 9.99280238873508e-06, + "loss": 0.911, + "step": 719 + }, + { + "epoch": 0.039627937696075734, + "grad_norm": 1.0531049966812134, + "learning_rate": 9.992779119944025e-06, + "loss": 0.8562, + "step": 720 + }, + { + "epoch": 0.039682976498431395, + "grad_norm": 0.918250322341919, + "learning_rate": 9.992755813628579e-06, + "loss": 0.92, + "step": 721 + }, + { + "epoch": 0.039738015300787057, + "grad_norm": 0.8508251309394836, + "learning_rate": 9.992732469788915e-06, + "loss": 0.7347, + "step": 722 + }, + { + "epoch": 0.03979305410314272, + "grad_norm": 0.9184926152229309, + "learning_rate": 9.992709088425211e-06, + "loss": 0.8732, + "step": 723 + }, + { + "epoch": 0.03984809290549838, + "grad_norm": 1.1613929271697998, + "learning_rate": 9.992685669537643e-06, + "loss": 0.9522, + "step": 724 + }, + { + "epoch": 0.039903131707854034, + "grad_norm": 1.091513752937317, + "learning_rate": 9.992662213126386e-06, + "loss": 0.9646, + "step": 725 + }, + { + "epoch": 0.039958170510209695, + "grad_norm": 1.057803750038147, + "learning_rate": 9.992638719191615e-06, + "loss": 0.7032, + "step": 726 + }, + { + "epoch": 0.040013209312565357, + "grad_norm": 0.8771823644638062, + "learning_rate": 9.992615187733508e-06, + "loss": 0.8577, + "step": 727 + }, + { + "epoch": 0.04006824811492102, + "grad_norm": 0.9471028447151184, + "learning_rate": 9.992591618752244e-06, + "loss": 0.9057, + "step": 728 + }, + { + "epoch": 0.04012328691727668, + "grad_norm": 0.9547705054283142, + "learning_rate": 9.992568012247995e-06, + "loss": 0.9549, + "step": 729 + }, + { + "epoch": 0.04017832571963234, + "grad_norm": 0.8862974047660828, + "learning_rate": 9.992544368220941e-06, + "loss": 0.8593, + "step": 730 + }, + { + "epoch": 0.040233364521988, + "grad_norm": 0.906334400177002, + "learning_rate": 9.992520686671261e-06, + "loss": 0.8832, + "step": 731 + }, + { + "epoch": 0.04028840332434366, + "grad_norm": 1.07270085811615, + "learning_rate": 9.992496967599133e-06, + "loss": 0.9409, + "step": 732 + }, + { + "epoch": 0.040343442126699325, + "grad_norm": 0.9026005268096924, + "learning_rate": 9.992473211004734e-06, + "loss": 0.8326, + "step": 733 + }, + { + "epoch": 0.040398480929054986, + "grad_norm": 0.9762942790985107, + "learning_rate": 9.992449416888241e-06, + "loss": 0.9048, + "step": 734 + }, + { + "epoch": 0.04045351973141065, + "grad_norm": 0.9658033847808838, + "learning_rate": 9.992425585249837e-06, + "loss": 0.9219, + "step": 735 + }, + { + "epoch": 0.0405085585337663, + "grad_norm": 0.8909044861793518, + "learning_rate": 9.992401716089698e-06, + "loss": 0.8564, + "step": 736 + }, + { + "epoch": 0.04056359733612196, + "grad_norm": 1.0387929677963257, + "learning_rate": 9.992377809408001e-06, + "loss": 0.9533, + "step": 737 + }, + { + "epoch": 0.040618636138477625, + "grad_norm": 0.9044275879859924, + "learning_rate": 9.99235386520493e-06, + "loss": 0.8508, + "step": 738 + }, + { + "epoch": 0.040673674940833286, + "grad_norm": 1.019377589225769, + "learning_rate": 9.992329883480667e-06, + "loss": 0.8684, + "step": 739 + }, + { + "epoch": 0.04072871374318895, + "grad_norm": 0.9394627213478088, + "learning_rate": 9.992305864235385e-06, + "loss": 0.7665, + "step": 740 + }, + { + "epoch": 0.04078375254554461, + "grad_norm": 0.8652323484420776, + "learning_rate": 9.99228180746927e-06, + "loss": 0.8576, + "step": 741 + }, + { + "epoch": 0.04083879134790027, + "grad_norm": 0.9347619414329529, + "learning_rate": 9.992257713182502e-06, + "loss": 0.9586, + "step": 742 + }, + { + "epoch": 0.04089383015025593, + "grad_norm": 0.9510203003883362, + "learning_rate": 9.99223358137526e-06, + "loss": 0.9092, + "step": 743 + }, + { + "epoch": 0.04094886895261159, + "grad_norm": 0.8242866396903992, + "learning_rate": 9.992209412047729e-06, + "loss": 0.6997, + "step": 744 + }, + { + "epoch": 0.041003907754967255, + "grad_norm": 0.8842730522155762, + "learning_rate": 9.992185205200087e-06, + "loss": 0.8873, + "step": 745 + }, + { + "epoch": 0.041058946557322916, + "grad_norm": 1.0813730955123901, + "learning_rate": 9.992160960832518e-06, + "loss": 1.0162, + "step": 746 + }, + { + "epoch": 0.04111398535967857, + "grad_norm": 1.1276283264160156, + "learning_rate": 9.9921366789452e-06, + "loss": 1.0004, + "step": 747 + }, + { + "epoch": 0.04116902416203423, + "grad_norm": 0.8810326457023621, + "learning_rate": 9.992112359538323e-06, + "loss": 0.7823, + "step": 748 + }, + { + "epoch": 0.04122406296438989, + "grad_norm": 0.9939407110214233, + "learning_rate": 9.992088002612066e-06, + "loss": 1.0016, + "step": 749 + }, + { + "epoch": 0.041279101766745555, + "grad_norm": 1.0963523387908936, + "learning_rate": 9.99206360816661e-06, + "loss": 0.9252, + "step": 750 + }, + { + "epoch": 0.041334140569101216, + "grad_norm": 1.1346478462219238, + "learning_rate": 9.99203917620214e-06, + "loss": 0.9608, + "step": 751 + }, + { + "epoch": 0.04138917937145688, + "grad_norm": 1.0108580589294434, + "learning_rate": 9.992014706718841e-06, + "loss": 0.9179, + "step": 752 + }, + { + "epoch": 0.04144421817381254, + "grad_norm": 0.897293210029602, + "learning_rate": 9.991990199716894e-06, + "loss": 0.9295, + "step": 753 + }, + { + "epoch": 0.0414992569761682, + "grad_norm": 1.0152363777160645, + "learning_rate": 9.991965655196488e-06, + "loss": 0.8467, + "step": 754 + }, + { + "epoch": 0.04155429577852386, + "grad_norm": 0.8655388355255127, + "learning_rate": 9.9919410731578e-06, + "loss": 0.796, + "step": 755 + }, + { + "epoch": 0.04160933458087952, + "grad_norm": 1.0140331983566284, + "learning_rate": 9.991916453601023e-06, + "loss": 0.8444, + "step": 756 + }, + { + "epoch": 0.041664373383235184, + "grad_norm": 0.9387341141700745, + "learning_rate": 9.991891796526338e-06, + "loss": 0.8669, + "step": 757 + }, + { + "epoch": 0.04171941218559084, + "grad_norm": 0.9395696520805359, + "learning_rate": 9.991867101933928e-06, + "loss": 0.8376, + "step": 758 + }, + { + "epoch": 0.0417744509879465, + "grad_norm": 1.0856634378433228, + "learning_rate": 9.991842369823983e-06, + "loss": 0.9271, + "step": 759 + }, + { + "epoch": 0.04182948979030216, + "grad_norm": 0.8777190446853638, + "learning_rate": 9.991817600196687e-06, + "loss": 0.9197, + "step": 760 + }, + { + "epoch": 0.04188452859265782, + "grad_norm": 0.9639917016029358, + "learning_rate": 9.991792793052225e-06, + "loss": 0.8835, + "step": 761 + }, + { + "epoch": 0.041939567395013484, + "grad_norm": 0.9384773969650269, + "learning_rate": 9.991767948390785e-06, + "loss": 0.8403, + "step": 762 + }, + { + "epoch": 0.041994606197369146, + "grad_norm": 0.8987650275230408, + "learning_rate": 9.991743066212554e-06, + "loss": 0.7948, + "step": 763 + }, + { + "epoch": 0.04204964499972481, + "grad_norm": 1.0545049905776978, + "learning_rate": 9.991718146517717e-06, + "loss": 0.9359, + "step": 764 + }, + { + "epoch": 0.04210468380208047, + "grad_norm": 0.9840022325515747, + "learning_rate": 9.991693189306463e-06, + "loss": 0.9188, + "step": 765 + }, + { + "epoch": 0.04215972260443613, + "grad_norm": 0.8769927620887756, + "learning_rate": 9.991668194578981e-06, + "loss": 0.8647, + "step": 766 + }, + { + "epoch": 0.04221476140679179, + "grad_norm": 0.9268791675567627, + "learning_rate": 9.991643162335455e-06, + "loss": 0.897, + "step": 767 + }, + { + "epoch": 0.042269800209147446, + "grad_norm": 0.9316747784614563, + "learning_rate": 9.991618092576075e-06, + "loss": 0.9341, + "step": 768 + }, + { + "epoch": 0.04232483901150311, + "grad_norm": 0.8348364233970642, + "learning_rate": 9.991592985301031e-06, + "loss": 0.7528, + "step": 769 + }, + { + "epoch": 0.04237987781385877, + "grad_norm": 0.9139068126678467, + "learning_rate": 9.99156784051051e-06, + "loss": 0.8596, + "step": 770 + }, + { + "epoch": 0.04243491661621443, + "grad_norm": 0.9403928518295288, + "learning_rate": 9.991542658204701e-06, + "loss": 0.974, + "step": 771 + }, + { + "epoch": 0.04248995541857009, + "grad_norm": 0.993549108505249, + "learning_rate": 9.991517438383793e-06, + "loss": 0.9479, + "step": 772 + }, + { + "epoch": 0.04254499422092575, + "grad_norm": 0.8494916558265686, + "learning_rate": 9.991492181047975e-06, + "loss": 0.9149, + "step": 773 + }, + { + "epoch": 0.042600033023281414, + "grad_norm": 1.0351910591125488, + "learning_rate": 9.991466886197441e-06, + "loss": 0.9552, + "step": 774 + }, + { + "epoch": 0.042655071825637075, + "grad_norm": 0.916829526424408, + "learning_rate": 9.991441553832375e-06, + "loss": 0.8781, + "step": 775 + }, + { + "epoch": 0.04271011062799274, + "grad_norm": 1.113476276397705, + "learning_rate": 9.991416183952972e-06, + "loss": 0.8137, + "step": 776 + }, + { + "epoch": 0.0427651494303484, + "grad_norm": 1.1608171463012695, + "learning_rate": 9.991390776559421e-06, + "loss": 1.0045, + "step": 777 + }, + { + "epoch": 0.04282018823270406, + "grad_norm": 1.0045493841171265, + "learning_rate": 9.991365331651913e-06, + "loss": 0.8813, + "step": 778 + }, + { + "epoch": 0.042875227035059714, + "grad_norm": 0.918820858001709, + "learning_rate": 9.991339849230639e-06, + "loss": 0.9198, + "step": 779 + }, + { + "epoch": 0.042930265837415375, + "grad_norm": 0.9875735640525818, + "learning_rate": 9.991314329295792e-06, + "loss": 0.8665, + "step": 780 + }, + { + "epoch": 0.04298530463977104, + "grad_norm": 0.873768150806427, + "learning_rate": 9.991288771847561e-06, + "loss": 0.8606, + "step": 781 + }, + { + "epoch": 0.0430403434421267, + "grad_norm": 0.8892746567726135, + "learning_rate": 9.991263176886139e-06, + "loss": 0.9011, + "step": 782 + }, + { + "epoch": 0.04309538224448236, + "grad_norm": 1.097734808921814, + "learning_rate": 9.99123754441172e-06, + "loss": 1.009, + "step": 783 + }, + { + "epoch": 0.04315042104683802, + "grad_norm": 1.0065964460372925, + "learning_rate": 9.991211874424497e-06, + "loss": 0.9492, + "step": 784 + }, + { + "epoch": 0.04320545984919368, + "grad_norm": 1.0791678428649902, + "learning_rate": 9.99118616692466e-06, + "loss": 1.0142, + "step": 785 + }, + { + "epoch": 0.043260498651549344, + "grad_norm": 0.9454777836799622, + "learning_rate": 9.991160421912404e-06, + "loss": 0.8058, + "step": 786 + }, + { + "epoch": 0.043315537453905005, + "grad_norm": 0.9448156952857971, + "learning_rate": 9.991134639387922e-06, + "loss": 0.8184, + "step": 787 + }, + { + "epoch": 0.043370576256260666, + "grad_norm": 0.9636550545692444, + "learning_rate": 9.99110881935141e-06, + "loss": 0.8606, + "step": 788 + }, + { + "epoch": 0.04342561505861633, + "grad_norm": 0.9933613538742065, + "learning_rate": 9.991082961803058e-06, + "loss": 0.9449, + "step": 789 + }, + { + "epoch": 0.04348065386097198, + "grad_norm": 0.8906797170639038, + "learning_rate": 9.991057066743065e-06, + "loss": 0.8053, + "step": 790 + }, + { + "epoch": 0.043535692663327644, + "grad_norm": 1.0393906831741333, + "learning_rate": 9.991031134171621e-06, + "loss": 0.8487, + "step": 791 + }, + { + "epoch": 0.043590731465683305, + "grad_norm": 1.0618231296539307, + "learning_rate": 9.991005164088923e-06, + "loss": 0.9847, + "step": 792 + }, + { + "epoch": 0.043645770268038966, + "grad_norm": 0.9525149464607239, + "learning_rate": 9.990979156495167e-06, + "loss": 0.9318, + "step": 793 + }, + { + "epoch": 0.04370080907039463, + "grad_norm": 0.9430851936340332, + "learning_rate": 9.990953111390546e-06, + "loss": 0.8483, + "step": 794 + }, + { + "epoch": 0.04375584787275029, + "grad_norm": 0.9259672164916992, + "learning_rate": 9.99092702877526e-06, + "loss": 0.9365, + "step": 795 + }, + { + "epoch": 0.04381088667510595, + "grad_norm": 0.942609965801239, + "learning_rate": 9.9909009086495e-06, + "loss": 0.8408, + "step": 796 + }, + { + "epoch": 0.04386592547746161, + "grad_norm": 0.939255952835083, + "learning_rate": 9.990874751013467e-06, + "loss": 0.8749, + "step": 797 + }, + { + "epoch": 0.04392096427981727, + "grad_norm": 1.1701711416244507, + "learning_rate": 9.990848555867353e-06, + "loss": 0.9312, + "step": 798 + }, + { + "epoch": 0.043976003082172935, + "grad_norm": 1.0441124439239502, + "learning_rate": 9.990822323211358e-06, + "loss": 0.8618, + "step": 799 + }, + { + "epoch": 0.04403104188452859, + "grad_norm": 0.9601489305496216, + "learning_rate": 9.990796053045679e-06, + "loss": 0.9569, + "step": 800 + }, + { + "epoch": 0.04408608068688425, + "grad_norm": 0.9394032955169678, + "learning_rate": 9.990769745370513e-06, + "loss": 0.846, + "step": 801 + }, + { + "epoch": 0.04414111948923991, + "grad_norm": 0.9631348252296448, + "learning_rate": 9.990743400186056e-06, + "loss": 0.8754, + "step": 802 + }, + { + "epoch": 0.04419615829159557, + "grad_norm": 0.9234963059425354, + "learning_rate": 9.990717017492508e-06, + "loss": 0.8613, + "step": 803 + }, + { + "epoch": 0.044251197093951235, + "grad_norm": 0.9169090390205383, + "learning_rate": 9.990690597290069e-06, + "loss": 0.8867, + "step": 804 + }, + { + "epoch": 0.044306235896306896, + "grad_norm": 1.0194867849349976, + "learning_rate": 9.990664139578933e-06, + "loss": 0.8675, + "step": 805 + }, + { + "epoch": 0.04436127469866256, + "grad_norm": 1.3226114511489868, + "learning_rate": 9.990637644359302e-06, + "loss": 0.997, + "step": 806 + }, + { + "epoch": 0.04441631350101822, + "grad_norm": 0.8904317617416382, + "learning_rate": 9.990611111631374e-06, + "loss": 0.7274, + "step": 807 + }, + { + "epoch": 0.04447135230337388, + "grad_norm": 0.8909007906913757, + "learning_rate": 9.99058454139535e-06, + "loss": 0.8141, + "step": 808 + }, + { + "epoch": 0.04452639110572954, + "grad_norm": 1.004015564918518, + "learning_rate": 9.990557933651429e-06, + "loss": 0.9883, + "step": 809 + }, + { + "epoch": 0.0445814299080852, + "grad_norm": 1.1215732097625732, + "learning_rate": 9.990531288399807e-06, + "loss": 0.9355, + "step": 810 + }, + { + "epoch": 0.04463646871044086, + "grad_norm": 1.0545012950897217, + "learning_rate": 9.99050460564069e-06, + "loss": 0.9532, + "step": 811 + }, + { + "epoch": 0.04469150751279652, + "grad_norm": 0.9608867168426514, + "learning_rate": 9.990477885374277e-06, + "loss": 0.9363, + "step": 812 + }, + { + "epoch": 0.04474654631515218, + "grad_norm": 0.8750461935997009, + "learning_rate": 9.990451127600766e-06, + "loss": 0.7343, + "step": 813 + }, + { + "epoch": 0.04480158511750784, + "grad_norm": 0.891740620136261, + "learning_rate": 9.99042433232036e-06, + "loss": 0.8541, + "step": 814 + }, + { + "epoch": 0.0448566239198635, + "grad_norm": 1.1520029306411743, + "learning_rate": 9.990397499533264e-06, + "loss": 0.7696, + "step": 815 + }, + { + "epoch": 0.044911662722219164, + "grad_norm": 0.9526278972625732, + "learning_rate": 9.990370629239673e-06, + "loss": 0.8953, + "step": 816 + }, + { + "epoch": 0.044966701524574826, + "grad_norm": 0.9218434691429138, + "learning_rate": 9.990343721439795e-06, + "loss": 0.8198, + "step": 817 + }, + { + "epoch": 0.04502174032693049, + "grad_norm": 0.8502745628356934, + "learning_rate": 9.990316776133827e-06, + "loss": 0.8035, + "step": 818 + }, + { + "epoch": 0.04507677912928615, + "grad_norm": 0.8861565589904785, + "learning_rate": 9.990289793321975e-06, + "loss": 0.8626, + "step": 819 + }, + { + "epoch": 0.04513181793164181, + "grad_norm": 1.1113256216049194, + "learning_rate": 9.99026277300444e-06, + "loss": 0.9363, + "step": 820 + }, + { + "epoch": 0.04518685673399747, + "grad_norm": 0.9984708428382874, + "learning_rate": 9.990235715181426e-06, + "loss": 1.0376, + "step": 821 + }, + { + "epoch": 0.045241895536353126, + "grad_norm": 0.9026711583137512, + "learning_rate": 9.990208619853137e-06, + "loss": 0.9079, + "step": 822 + }, + { + "epoch": 0.04529693433870879, + "grad_norm": 0.8724965453147888, + "learning_rate": 9.990181487019775e-06, + "loss": 0.8665, + "step": 823 + }, + { + "epoch": 0.04535197314106445, + "grad_norm": 0.8923047780990601, + "learning_rate": 9.990154316681543e-06, + "loss": 0.7779, + "step": 824 + }, + { + "epoch": 0.04540701194342011, + "grad_norm": 0.9024640321731567, + "learning_rate": 9.99012710883865e-06, + "loss": 0.8859, + "step": 825 + }, + { + "epoch": 0.04546205074577577, + "grad_norm": 0.9245888590812683, + "learning_rate": 9.990099863491296e-06, + "loss": 0.8501, + "step": 826 + }, + { + "epoch": 0.04551708954813143, + "grad_norm": 0.9257050156593323, + "learning_rate": 9.990072580639687e-06, + "loss": 0.9561, + "step": 827 + }, + { + "epoch": 0.045572128350487094, + "grad_norm": 0.995610773563385, + "learning_rate": 9.99004526028403e-06, + "loss": 0.917, + "step": 828 + }, + { + "epoch": 0.045627167152842756, + "grad_norm": 0.9524009823799133, + "learning_rate": 9.990017902424525e-06, + "loss": 0.9184, + "step": 829 + }, + { + "epoch": 0.04568220595519842, + "grad_norm": 0.9264503121376038, + "learning_rate": 9.989990507061385e-06, + "loss": 0.8615, + "step": 830 + }, + { + "epoch": 0.04573724475755408, + "grad_norm": 1.0068570375442505, + "learning_rate": 9.989963074194809e-06, + "loss": 0.8331, + "step": 831 + }, + { + "epoch": 0.04579228355990974, + "grad_norm": 0.9295952320098877, + "learning_rate": 9.989935603825009e-06, + "loss": 0.8387, + "step": 832 + }, + { + "epoch": 0.045847322362265394, + "grad_norm": 1.0408827066421509, + "learning_rate": 9.989908095952186e-06, + "loss": 0.9686, + "step": 833 + }, + { + "epoch": 0.045902361164621056, + "grad_norm": 0.8874136209487915, + "learning_rate": 9.989880550576551e-06, + "loss": 0.815, + "step": 834 + }, + { + "epoch": 0.04595739996697672, + "grad_norm": 0.9898836016654968, + "learning_rate": 9.989852967698311e-06, + "loss": 0.9458, + "step": 835 + }, + { + "epoch": 0.04601243876933238, + "grad_norm": 0.9828970432281494, + "learning_rate": 9.989825347317668e-06, + "loss": 0.7922, + "step": 836 + }, + { + "epoch": 0.04606747757168804, + "grad_norm": 1.025447964668274, + "learning_rate": 9.989797689434836e-06, + "loss": 0.9349, + "step": 837 + }, + { + "epoch": 0.0461225163740437, + "grad_norm": 0.8623831272125244, + "learning_rate": 9.98976999405002e-06, + "loss": 0.8786, + "step": 838 + }, + { + "epoch": 0.04617755517639936, + "grad_norm": 0.9614997506141663, + "learning_rate": 9.98974226116343e-06, + "loss": 0.7885, + "step": 839 + }, + { + "epoch": 0.046232593978755024, + "grad_norm": 1.0207616090774536, + "learning_rate": 9.989714490775269e-06, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.046287632781110685, + "grad_norm": 0.8509595990180969, + "learning_rate": 9.98968668288575e-06, + "loss": 0.7312, + "step": 841 + }, + { + "epoch": 0.04634267158346635, + "grad_norm": 0.9822607040405273, + "learning_rate": 9.989658837495084e-06, + "loss": 0.952, + "step": 842 + }, + { + "epoch": 0.046397710385822, + "grad_norm": 1.0058252811431885, + "learning_rate": 9.989630954603477e-06, + "loss": 0.8811, + "step": 843 + }, + { + "epoch": 0.04645274918817766, + "grad_norm": 1.0146985054016113, + "learning_rate": 9.989603034211139e-06, + "loss": 0.9051, + "step": 844 + }, + { + "epoch": 0.046507787990533324, + "grad_norm": 0.8976503610610962, + "learning_rate": 9.98957507631828e-06, + "loss": 0.879, + "step": 845 + }, + { + "epoch": 0.046562826792888985, + "grad_norm": 0.8791939616203308, + "learning_rate": 9.989547080925111e-06, + "loss": 0.8944, + "step": 846 + }, + { + "epoch": 0.04661786559524465, + "grad_norm": 0.8530884981155396, + "learning_rate": 9.989519048031842e-06, + "loss": 0.9029, + "step": 847 + }, + { + "epoch": 0.04667290439760031, + "grad_norm": 0.9621617197990417, + "learning_rate": 9.989490977638683e-06, + "loss": 0.8374, + "step": 848 + }, + { + "epoch": 0.04672794319995597, + "grad_norm": 0.9629075527191162, + "learning_rate": 9.989462869745845e-06, + "loss": 0.9032, + "step": 849 + }, + { + "epoch": 0.04678298200231163, + "grad_norm": 1.3256126642227173, + "learning_rate": 9.989434724353541e-06, + "loss": 0.9748, + "step": 850 + }, + { + "epoch": 0.04683802080466729, + "grad_norm": 1.0230494737625122, + "learning_rate": 9.989406541461979e-06, + "loss": 0.9752, + "step": 851 + }, + { + "epoch": 0.046893059607022954, + "grad_norm": 0.8454533219337463, + "learning_rate": 9.989378321071375e-06, + "loss": 0.8426, + "step": 852 + }, + { + "epoch": 0.046948098409378615, + "grad_norm": 0.9995863437652588, + "learning_rate": 9.989350063181939e-06, + "loss": 0.9955, + "step": 853 + }, + { + "epoch": 0.04700313721173427, + "grad_norm": 0.8956604599952698, + "learning_rate": 9.989321767793883e-06, + "loss": 0.9024, + "step": 854 + }, + { + "epoch": 0.04705817601408993, + "grad_norm": 1.0123292207717896, + "learning_rate": 9.989293434907419e-06, + "loss": 0.7856, + "step": 855 + }, + { + "epoch": 0.04711321481644559, + "grad_norm": 0.814577043056488, + "learning_rate": 9.989265064522762e-06, + "loss": 0.8377, + "step": 856 + }, + { + "epoch": 0.047168253618801254, + "grad_norm": 1.1571552753448486, + "learning_rate": 9.989236656640125e-06, + "loss": 0.8562, + "step": 857 + }, + { + "epoch": 0.047223292421156915, + "grad_norm": 0.9681577682495117, + "learning_rate": 9.98920821125972e-06, + "loss": 0.8473, + "step": 858 + }, + { + "epoch": 0.047278331223512576, + "grad_norm": 0.9680121541023254, + "learning_rate": 9.989179728381761e-06, + "loss": 0.9811, + "step": 859 + }, + { + "epoch": 0.04733337002586824, + "grad_norm": 0.985477089881897, + "learning_rate": 9.989151208006464e-06, + "loss": 0.6994, + "step": 860 + }, + { + "epoch": 0.0473884088282239, + "grad_norm": 0.8612962365150452, + "learning_rate": 9.98912265013404e-06, + "loss": 0.7667, + "step": 861 + }, + { + "epoch": 0.04744344763057956, + "grad_norm": 0.8884604573249817, + "learning_rate": 9.989094054764708e-06, + "loss": 0.8382, + "step": 862 + }, + { + "epoch": 0.04749848643293522, + "grad_norm": 1.036881923675537, + "learning_rate": 9.989065421898681e-06, + "loss": 0.8748, + "step": 863 + }, + { + "epoch": 0.04755352523529088, + "grad_norm": 0.9954493045806885, + "learning_rate": 9.989036751536171e-06, + "loss": 0.9174, + "step": 864 + }, + { + "epoch": 0.04760856403764654, + "grad_norm": 0.9984694123268127, + "learning_rate": 9.989008043677399e-06, + "loss": 0.7636, + "step": 865 + }, + { + "epoch": 0.0476636028400022, + "grad_norm": 1.0412588119506836, + "learning_rate": 9.988979298322576e-06, + "loss": 0.773, + "step": 866 + }, + { + "epoch": 0.04771864164235786, + "grad_norm": 0.8034874796867371, + "learning_rate": 9.98895051547192e-06, + "loss": 0.7914, + "step": 867 + }, + { + "epoch": 0.04777368044471352, + "grad_norm": 0.8983979225158691, + "learning_rate": 9.988921695125648e-06, + "loss": 0.7292, + "step": 868 + }, + { + "epoch": 0.04782871924706918, + "grad_norm": 0.9445077776908875, + "learning_rate": 9.988892837283976e-06, + "loss": 0.8263, + "step": 869 + }, + { + "epoch": 0.047883758049424845, + "grad_norm": 1.0753306150436401, + "learning_rate": 9.988863941947121e-06, + "loss": 1.1122, + "step": 870 + }, + { + "epoch": 0.047938796851780506, + "grad_norm": 1.0091484785079956, + "learning_rate": 9.9888350091153e-06, + "loss": 0.9276, + "step": 871 + }, + { + "epoch": 0.04799383565413617, + "grad_norm": 1.0977306365966797, + "learning_rate": 9.988806038788732e-06, + "loss": 0.854, + "step": 872 + }, + { + "epoch": 0.04804887445649183, + "grad_norm": 1.0285007953643799, + "learning_rate": 9.988777030967632e-06, + "loss": 0.9441, + "step": 873 + }, + { + "epoch": 0.04810391325884749, + "grad_norm": 0.8973976373672485, + "learning_rate": 9.988747985652218e-06, + "loss": 0.786, + "step": 874 + }, + { + "epoch": 0.04815895206120315, + "grad_norm": 0.9809553623199463, + "learning_rate": 9.98871890284271e-06, + "loss": 0.9042, + "step": 875 + }, + { + "epoch": 0.048213990863558806, + "grad_norm": 0.8514279723167419, + "learning_rate": 9.988689782539326e-06, + "loss": 0.7874, + "step": 876 + }, + { + "epoch": 0.04826902966591447, + "grad_norm": 0.8299674391746521, + "learning_rate": 9.988660624742286e-06, + "loss": 0.8704, + "step": 877 + }, + { + "epoch": 0.04832406846827013, + "grad_norm": 0.9862462282180786, + "learning_rate": 9.988631429451809e-06, + "loss": 0.9963, + "step": 878 + }, + { + "epoch": 0.04837910727062579, + "grad_norm": 0.9041131734848022, + "learning_rate": 9.988602196668111e-06, + "loss": 0.9207, + "step": 879 + }, + { + "epoch": 0.04843414607298145, + "grad_norm": 0.8597276210784912, + "learning_rate": 9.988572926391416e-06, + "loss": 0.8226, + "step": 880 + }, + { + "epoch": 0.04848918487533711, + "grad_norm": 0.9494329690933228, + "learning_rate": 9.988543618621941e-06, + "loss": 0.8834, + "step": 881 + }, + { + "epoch": 0.048544223677692774, + "grad_norm": 0.9129118323326111, + "learning_rate": 9.98851427335991e-06, + "loss": 0.7819, + "step": 882 + }, + { + "epoch": 0.048599262480048436, + "grad_norm": 0.9145999550819397, + "learning_rate": 9.988484890605539e-06, + "loss": 0.885, + "step": 883 + }, + { + "epoch": 0.0486543012824041, + "grad_norm": 1.0115307569503784, + "learning_rate": 9.98845547035905e-06, + "loss": 0.8347, + "step": 884 + }, + { + "epoch": 0.04870934008475976, + "grad_norm": 1.1372706890106201, + "learning_rate": 9.988426012620667e-06, + "loss": 0.944, + "step": 885 + }, + { + "epoch": 0.04876437888711541, + "grad_norm": 0.9502811431884766, + "learning_rate": 9.98839651739061e-06, + "loss": 0.9054, + "step": 886 + }, + { + "epoch": 0.048819417689471074, + "grad_norm": 0.9612823128700256, + "learning_rate": 9.988366984669097e-06, + "loss": 0.8796, + "step": 887 + }, + { + "epoch": 0.048874456491826736, + "grad_norm": 0.9551461935043335, + "learning_rate": 9.988337414456355e-06, + "loss": 0.8769, + "step": 888 + }, + { + "epoch": 0.0489294952941824, + "grad_norm": 0.8554086089134216, + "learning_rate": 9.988307806752603e-06, + "loss": 0.892, + "step": 889 + }, + { + "epoch": 0.04898453409653806, + "grad_norm": 0.8418886661529541, + "learning_rate": 9.988278161558067e-06, + "loss": 0.7568, + "step": 890 + }, + { + "epoch": 0.04903957289889372, + "grad_norm": 1.4780360460281372, + "learning_rate": 9.988248478872967e-06, + "loss": 0.9126, + "step": 891 + }, + { + "epoch": 0.04909461170124938, + "grad_norm": 0.8236714005470276, + "learning_rate": 9.988218758697526e-06, + "loss": 0.7317, + "step": 892 + }, + { + "epoch": 0.04914965050360504, + "grad_norm": 0.8777141571044922, + "learning_rate": 9.988189001031968e-06, + "loss": 0.7989, + "step": 893 + }, + { + "epoch": 0.049204689305960704, + "grad_norm": 1.0235031843185425, + "learning_rate": 9.988159205876516e-06, + "loss": 0.8335, + "step": 894 + }, + { + "epoch": 0.049259728108316365, + "grad_norm": 0.9340357184410095, + "learning_rate": 9.988129373231395e-06, + "loss": 0.8129, + "step": 895 + }, + { + "epoch": 0.04931476691067203, + "grad_norm": 1.7686667442321777, + "learning_rate": 9.98809950309683e-06, + "loss": 0.9792, + "step": 896 + }, + { + "epoch": 0.04936980571302768, + "grad_norm": 0.9252369403839111, + "learning_rate": 9.988069595473044e-06, + "loss": 0.8671, + "step": 897 + }, + { + "epoch": 0.04942484451538334, + "grad_norm": 0.9989960789680481, + "learning_rate": 9.988039650360262e-06, + "loss": 0.9245, + "step": 898 + }, + { + "epoch": 0.049479883317739004, + "grad_norm": 1.062912106513977, + "learning_rate": 9.98800966775871e-06, + "loss": 0.9146, + "step": 899 + }, + { + "epoch": 0.049534922120094665, + "grad_norm": 0.8698169589042664, + "learning_rate": 9.98797964766861e-06, + "loss": 0.8606, + "step": 900 + }, + { + "epoch": 0.04958996092245033, + "grad_norm": 1.6754224300384521, + "learning_rate": 9.98794959009019e-06, + "loss": 0.9236, + "step": 901 + }, + { + "epoch": 0.04964499972480599, + "grad_norm": 1.084174394607544, + "learning_rate": 9.98791949502368e-06, + "loss": 0.9252, + "step": 902 + }, + { + "epoch": 0.04970003852716165, + "grad_norm": 0.9866724610328674, + "learning_rate": 9.987889362469301e-06, + "loss": 0.9096, + "step": 903 + }, + { + "epoch": 0.04975507732951731, + "grad_norm": 0.8814040422439575, + "learning_rate": 9.987859192427279e-06, + "loss": 0.8475, + "step": 904 + }, + { + "epoch": 0.04981011613187297, + "grad_norm": 0.8796457052230835, + "learning_rate": 9.987828984897843e-06, + "loss": 0.8478, + "step": 905 + }, + { + "epoch": 0.049865154934228634, + "grad_norm": 1.0541884899139404, + "learning_rate": 9.98779873988122e-06, + "loss": 0.9799, + "step": 906 + }, + { + "epoch": 0.049920193736584295, + "grad_norm": 0.91409832239151, + "learning_rate": 9.987768457377636e-06, + "loss": 0.8701, + "step": 907 + }, + { + "epoch": 0.04997523253893995, + "grad_norm": 1.0120370388031006, + "learning_rate": 9.98773813738732e-06, + "loss": 0.8417, + "step": 908 + }, + { + "epoch": 0.05003027134129561, + "grad_norm": 1.7744206190109253, + "learning_rate": 9.987707779910499e-06, + "loss": 0.9263, + "step": 909 + }, + { + "epoch": 0.05008531014365127, + "grad_norm": 0.9423969388008118, + "learning_rate": 9.987677384947402e-06, + "loss": 0.9667, + "step": 910 + }, + { + "epoch": 0.050140348946006934, + "grad_norm": 1.5940319299697876, + "learning_rate": 9.987646952498256e-06, + "loss": 0.9223, + "step": 911 + }, + { + "epoch": 0.050195387748362595, + "grad_norm": 0.941792368888855, + "learning_rate": 9.987616482563292e-06, + "loss": 0.895, + "step": 912 + }, + { + "epoch": 0.05025042655071826, + "grad_norm": 3.1945221424102783, + "learning_rate": 9.987585975142738e-06, + "loss": 0.837, + "step": 913 + }, + { + "epoch": 0.05030546535307392, + "grad_norm": 2.0819199085235596, + "learning_rate": 9.98755543023682e-06, + "loss": 0.918, + "step": 914 + }, + { + "epoch": 0.05036050415542958, + "grad_norm": 0.984282910823822, + "learning_rate": 9.987524847845773e-06, + "loss": 0.8589, + "step": 915 + }, + { + "epoch": 0.05041554295778524, + "grad_norm": 0.9021026492118835, + "learning_rate": 9.987494227969823e-06, + "loss": 0.9053, + "step": 916 + }, + { + "epoch": 0.0504705817601409, + "grad_norm": 2.6515462398529053, + "learning_rate": 9.9874635706092e-06, + "loss": 0.8874, + "step": 917 + }, + { + "epoch": 0.050525620562496563, + "grad_norm": 0.8893095254898071, + "learning_rate": 9.98743287576414e-06, + "loss": 0.8259, + "step": 918 + }, + { + "epoch": 0.05058065936485222, + "grad_norm": 0.9897775650024414, + "learning_rate": 9.987402143434868e-06, + "loss": 0.877, + "step": 919 + }, + { + "epoch": 0.05063569816720788, + "grad_norm": 0.9391944408416748, + "learning_rate": 9.987371373621614e-06, + "loss": 0.9363, + "step": 920 + }, + { + "epoch": 0.05069073696956354, + "grad_norm": 0.9585913419723511, + "learning_rate": 9.987340566324615e-06, + "loss": 0.8704, + "step": 921 + }, + { + "epoch": 0.0507457757719192, + "grad_norm": 0.9210980534553528, + "learning_rate": 9.987309721544098e-06, + "loss": 0.9321, + "step": 922 + }, + { + "epoch": 0.05080081457427486, + "grad_norm": 1.0713307857513428, + "learning_rate": 9.987278839280295e-06, + "loss": 0.9489, + "step": 923 + }, + { + "epoch": 0.050855853376630525, + "grad_norm": 1.0178636312484741, + "learning_rate": 9.98724791953344e-06, + "loss": 0.853, + "step": 924 + }, + { + "epoch": 0.050910892178986186, + "grad_norm": 0.9782636761665344, + "learning_rate": 9.987216962303766e-06, + "loss": 0.924, + "step": 925 + }, + { + "epoch": 0.05096593098134185, + "grad_norm": 0.9474522471427917, + "learning_rate": 9.987185967591503e-06, + "loss": 0.8619, + "step": 926 + }, + { + "epoch": 0.05102096978369751, + "grad_norm": 1.1875778436660767, + "learning_rate": 9.987154935396885e-06, + "loss": 1.012, + "step": 927 + }, + { + "epoch": 0.05107600858605317, + "grad_norm": 1.0585243701934814, + "learning_rate": 9.987123865720147e-06, + "loss": 0.9019, + "step": 928 + }, + { + "epoch": 0.051131047388408825, + "grad_norm": 0.9848800897598267, + "learning_rate": 9.98709275856152e-06, + "loss": 0.9665, + "step": 929 + }, + { + "epoch": 0.051186086190764486, + "grad_norm": 1.04201078414917, + "learning_rate": 9.987061613921238e-06, + "loss": 0.9269, + "step": 930 + }, + { + "epoch": 0.05124112499312015, + "grad_norm": 1.1748600006103516, + "learning_rate": 9.987030431799537e-06, + "loss": 0.8565, + "step": 931 + }, + { + "epoch": 0.05129616379547581, + "grad_norm": 1.879232406616211, + "learning_rate": 9.98699921219665e-06, + "loss": 0.8535, + "step": 932 + }, + { + "epoch": 0.05135120259783147, + "grad_norm": 0.9837847948074341, + "learning_rate": 9.986967955112812e-06, + "loss": 0.927, + "step": 933 + }, + { + "epoch": 0.05140624140018713, + "grad_norm": 0.8637211918830872, + "learning_rate": 9.986936660548257e-06, + "loss": 0.7903, + "step": 934 + }, + { + "epoch": 0.05146128020254279, + "grad_norm": 0.9078792929649353, + "learning_rate": 9.986905328503222e-06, + "loss": 0.9135, + "step": 935 + }, + { + "epoch": 0.051516319004898455, + "grad_norm": 0.9763005971908569, + "learning_rate": 9.98687395897794e-06, + "loss": 0.9006, + "step": 936 + }, + { + "epoch": 0.051571357807254116, + "grad_norm": 1.0174345970153809, + "learning_rate": 9.98684255197265e-06, + "loss": 0.9294, + "step": 937 + }, + { + "epoch": 0.05162639660960978, + "grad_norm": 0.8709769248962402, + "learning_rate": 9.986811107487584e-06, + "loss": 0.7986, + "step": 938 + }, + { + "epoch": 0.05168143541196544, + "grad_norm": 0.8717525601387024, + "learning_rate": 9.986779625522983e-06, + "loss": 0.8705, + "step": 939 + }, + { + "epoch": 0.05173647421432109, + "grad_norm": 0.9682945013046265, + "learning_rate": 9.98674810607908e-06, + "loss": 0.8127, + "step": 940 + }, + { + "epoch": 0.051791513016676755, + "grad_norm": 1.0248037576675415, + "learning_rate": 9.986716549156113e-06, + "loss": 0.9217, + "step": 941 + }, + { + "epoch": 0.051846551819032416, + "grad_norm": 0.9883397221565247, + "learning_rate": 9.98668495475432e-06, + "loss": 0.853, + "step": 942 + }, + { + "epoch": 0.05190159062138808, + "grad_norm": 0.9271108508110046, + "learning_rate": 9.986653322873937e-06, + "loss": 0.8807, + "step": 943 + }, + { + "epoch": 0.05195662942374374, + "grad_norm": 0.9027101397514343, + "learning_rate": 9.986621653515203e-06, + "loss": 0.88, + "step": 944 + }, + { + "epoch": 0.0520116682260994, + "grad_norm": 0.9807021617889404, + "learning_rate": 9.986589946678354e-06, + "loss": 0.8922, + "step": 945 + }, + { + "epoch": 0.05206670702845506, + "grad_norm": 0.8779157400131226, + "learning_rate": 9.98655820236363e-06, + "loss": 0.8988, + "step": 946 + }, + { + "epoch": 0.05212174583081072, + "grad_norm": 0.8182910680770874, + "learning_rate": 9.986526420571272e-06, + "loss": 0.7534, + "step": 947 + }, + { + "epoch": 0.052176784633166384, + "grad_norm": 0.9205981492996216, + "learning_rate": 9.986494601301513e-06, + "loss": 0.7516, + "step": 948 + }, + { + "epoch": 0.052231823435522046, + "grad_norm": 0.9829681515693665, + "learning_rate": 9.986462744554598e-06, + "loss": 0.9358, + "step": 949 + }, + { + "epoch": 0.05228686223787771, + "grad_norm": 0.8869890570640564, + "learning_rate": 9.986430850330762e-06, + "loss": 0.7933, + "step": 950 + }, + { + "epoch": 0.05234190104023336, + "grad_norm": 1.0226716995239258, + "learning_rate": 9.986398918630248e-06, + "loss": 0.9523, + "step": 951 + }, + { + "epoch": 0.05239693984258902, + "grad_norm": 0.9549778699874878, + "learning_rate": 9.986366949453293e-06, + "loss": 0.9368, + "step": 952 + }, + { + "epoch": 0.052451978644944684, + "grad_norm": 0.860454797744751, + "learning_rate": 9.98633494280014e-06, + "loss": 0.7618, + "step": 953 + }, + { + "epoch": 0.052507017447300346, + "grad_norm": 0.9623841643333435, + "learning_rate": 9.986302898671027e-06, + "loss": 0.8356, + "step": 954 + }, + { + "epoch": 0.05256205624965601, + "grad_norm": 0.9236606359481812, + "learning_rate": 9.986270817066196e-06, + "loss": 0.921, + "step": 955 + }, + { + "epoch": 0.05261709505201167, + "grad_norm": 1.0599812269210815, + "learning_rate": 9.98623869798589e-06, + "loss": 0.8082, + "step": 956 + }, + { + "epoch": 0.05267213385436733, + "grad_norm": 1.0321687459945679, + "learning_rate": 9.986206541430347e-06, + "loss": 0.9001, + "step": 957 + }, + { + "epoch": 0.05272717265672299, + "grad_norm": 0.8884543776512146, + "learning_rate": 9.986174347399813e-06, + "loss": 0.8317, + "step": 958 + }, + { + "epoch": 0.05278221145907865, + "grad_norm": 0.9592668414115906, + "learning_rate": 9.986142115894526e-06, + "loss": 0.9955, + "step": 959 + }, + { + "epoch": 0.052837250261434314, + "grad_norm": 0.9604032039642334, + "learning_rate": 9.986109846914729e-06, + "loss": 0.876, + "step": 960 + }, + { + "epoch": 0.052892289063789975, + "grad_norm": 0.9837536811828613, + "learning_rate": 9.986077540460664e-06, + "loss": 0.8247, + "step": 961 + }, + { + "epoch": 0.05294732786614563, + "grad_norm": 0.8570861220359802, + "learning_rate": 9.986045196532576e-06, + "loss": 0.879, + "step": 962 + }, + { + "epoch": 0.05300236666850129, + "grad_norm": 0.8441471457481384, + "learning_rate": 9.986012815130708e-06, + "loss": 0.8979, + "step": 963 + }, + { + "epoch": 0.05305740547085695, + "grad_norm": 0.8976197838783264, + "learning_rate": 9.985980396255302e-06, + "loss": 0.9382, + "step": 964 + }, + { + "epoch": 0.053112444273212614, + "grad_norm": 0.9685307741165161, + "learning_rate": 9.985947939906599e-06, + "loss": 0.8627, + "step": 965 + }, + { + "epoch": 0.053167483075568275, + "grad_norm": 0.8939018249511719, + "learning_rate": 9.98591544608485e-06, + "loss": 0.9221, + "step": 966 + }, + { + "epoch": 0.05322252187792394, + "grad_norm": 0.9218310713768005, + "learning_rate": 9.985882914790292e-06, + "loss": 0.8356, + "step": 967 + }, + { + "epoch": 0.0532775606802796, + "grad_norm": 0.9342261552810669, + "learning_rate": 9.985850346023174e-06, + "loss": 0.971, + "step": 968 + }, + { + "epoch": 0.05333259948263526, + "grad_norm": 1.0860705375671387, + "learning_rate": 9.985817739783741e-06, + "loss": 0.906, + "step": 969 + }, + { + "epoch": 0.05338763828499092, + "grad_norm": 0.8675006031990051, + "learning_rate": 9.985785096072234e-06, + "loss": 0.906, + "step": 970 + }, + { + "epoch": 0.05344267708734658, + "grad_norm": 0.8170626163482666, + "learning_rate": 9.985752414888903e-06, + "loss": 0.8109, + "step": 971 + }, + { + "epoch": 0.05349771588970224, + "grad_norm": 0.936434805393219, + "learning_rate": 9.98571969623399e-06, + "loss": 0.9219, + "step": 972 + }, + { + "epoch": 0.0535527546920579, + "grad_norm": 0.9316715002059937, + "learning_rate": 9.985686940107741e-06, + "loss": 0.8569, + "step": 973 + }, + { + "epoch": 0.05360779349441356, + "grad_norm": 1.183008074760437, + "learning_rate": 9.985654146510405e-06, + "loss": 0.837, + "step": 974 + }, + { + "epoch": 0.05366283229676922, + "grad_norm": 1.0788745880126953, + "learning_rate": 9.98562131544223e-06, + "loss": 0.8822, + "step": 975 + }, + { + "epoch": 0.05371787109912488, + "grad_norm": 0.9285461902618408, + "learning_rate": 9.985588446903455e-06, + "loss": 0.9279, + "step": 976 + }, + { + "epoch": 0.053772909901480544, + "grad_norm": 0.9389022588729858, + "learning_rate": 9.985555540894334e-06, + "loss": 0.9839, + "step": 977 + }, + { + "epoch": 0.053827948703836205, + "grad_norm": 0.8920616507530212, + "learning_rate": 9.985522597415112e-06, + "loss": 0.9205, + "step": 978 + }, + { + "epoch": 0.053882987506191866, + "grad_norm": 0.9755093455314636, + "learning_rate": 9.985489616466035e-06, + "loss": 0.8987, + "step": 979 + }, + { + "epoch": 0.05393802630854753, + "grad_norm": 0.96027010679245, + "learning_rate": 9.985456598047356e-06, + "loss": 0.8543, + "step": 980 + }, + { + "epoch": 0.05399306511090319, + "grad_norm": 1.0489718914031982, + "learning_rate": 9.985423542159317e-06, + "loss": 0.9179, + "step": 981 + }, + { + "epoch": 0.05404810391325885, + "grad_norm": 0.8665526509284973, + "learning_rate": 9.985390448802171e-06, + "loss": 0.9047, + "step": 982 + }, + { + "epoch": 0.054103142715614505, + "grad_norm": 0.8849464654922485, + "learning_rate": 9.985357317976163e-06, + "loss": 0.8892, + "step": 983 + }, + { + "epoch": 0.054158181517970166, + "grad_norm": 1.0083115100860596, + "learning_rate": 9.985324149681545e-06, + "loss": 0.7713, + "step": 984 + }, + { + "epoch": 0.05421322032032583, + "grad_norm": 0.8233863711357117, + "learning_rate": 9.985290943918565e-06, + "loss": 0.7967, + "step": 985 + }, + { + "epoch": 0.05426825912268149, + "grad_norm": 0.9615303874015808, + "learning_rate": 9.985257700687472e-06, + "loss": 0.8576, + "step": 986 + }, + { + "epoch": 0.05432329792503715, + "grad_norm": 0.8856416344642639, + "learning_rate": 9.985224419988517e-06, + "loss": 0.8614, + "step": 987 + }, + { + "epoch": 0.05437833672739281, + "grad_norm": 0.968325674533844, + "learning_rate": 9.98519110182195e-06, + "loss": 0.8247, + "step": 988 + }, + { + "epoch": 0.05443337552974847, + "grad_norm": 0.878402054309845, + "learning_rate": 9.985157746188021e-06, + "loss": 0.8661, + "step": 989 + }, + { + "epoch": 0.054488414332104135, + "grad_norm": 0.8376438021659851, + "learning_rate": 9.985124353086981e-06, + "loss": 0.8554, + "step": 990 + }, + { + "epoch": 0.054543453134459796, + "grad_norm": 1.0293036699295044, + "learning_rate": 9.98509092251908e-06, + "loss": 0.8049, + "step": 991 + }, + { + "epoch": 0.05459849193681546, + "grad_norm": 1.2345234155654907, + "learning_rate": 9.98505745448457e-06, + "loss": 1.0358, + "step": 992 + }, + { + "epoch": 0.05465353073917112, + "grad_norm": 0.9974482655525208, + "learning_rate": 9.985023948983703e-06, + "loss": 0.9329, + "step": 993 + }, + { + "epoch": 0.05470856954152677, + "grad_norm": 1.383955478668213, + "learning_rate": 9.984990406016732e-06, + "loss": 0.8688, + "step": 994 + }, + { + "epoch": 0.054763608343882435, + "grad_norm": 0.9369306564331055, + "learning_rate": 9.984956825583906e-06, + "loss": 0.8308, + "step": 995 + }, + { + "epoch": 0.054818647146238096, + "grad_norm": 0.8676120042800903, + "learning_rate": 9.984923207685478e-06, + "loss": 0.8283, + "step": 996 + }, + { + "epoch": 0.05487368594859376, + "grad_norm": 0.9218453168869019, + "learning_rate": 9.984889552321704e-06, + "loss": 0.7247, + "step": 997 + }, + { + "epoch": 0.05492872475094942, + "grad_norm": 0.8575478196144104, + "learning_rate": 9.984855859492833e-06, + "loss": 0.8462, + "step": 998 + }, + { + "epoch": 0.05498376355330508, + "grad_norm": 1.0042616128921509, + "learning_rate": 9.98482212919912e-06, + "loss": 0.9383, + "step": 999 + }, + { + "epoch": 0.05503880235566074, + "grad_norm": 0.8642181158065796, + "learning_rate": 9.984788361440817e-06, + "loss": 0.8805, + "step": 1000 + }, + { + "epoch": 0.0550938411580164, + "grad_norm": 0.8413823843002319, + "learning_rate": 9.984754556218178e-06, + "loss": 0.8161, + "step": 1001 + }, + { + "epoch": 0.055148879960372064, + "grad_norm": 0.9473856091499329, + "learning_rate": 9.984720713531462e-06, + "loss": 0.8425, + "step": 1002 + }, + { + "epoch": 0.055203918762727726, + "grad_norm": 0.7854379415512085, + "learning_rate": 9.984686833380917e-06, + "loss": 0.7506, + "step": 1003 + }, + { + "epoch": 0.05525895756508339, + "grad_norm": 0.9481745958328247, + "learning_rate": 9.984652915766801e-06, + "loss": 0.954, + "step": 1004 + }, + { + "epoch": 0.05531399636743904, + "grad_norm": 0.767803966999054, + "learning_rate": 9.984618960689366e-06, + "loss": 0.8113, + "step": 1005 + }, + { + "epoch": 0.0553690351697947, + "grad_norm": 0.8957781195640564, + "learning_rate": 9.984584968148871e-06, + "loss": 0.9042, + "step": 1006 + }, + { + "epoch": 0.055424073972150364, + "grad_norm": 1.116646409034729, + "learning_rate": 9.98455093814557e-06, + "loss": 0.8648, + "step": 1007 + }, + { + "epoch": 0.055479112774506026, + "grad_norm": 0.9567018151283264, + "learning_rate": 9.98451687067972e-06, + "loss": 0.9446, + "step": 1008 + }, + { + "epoch": 0.05553415157686169, + "grad_norm": 0.8470665812492371, + "learning_rate": 9.98448276575157e-06, + "loss": 0.8186, + "step": 1009 + }, + { + "epoch": 0.05558919037921735, + "grad_norm": 0.9595193862915039, + "learning_rate": 9.984448623361387e-06, + "loss": 0.8406, + "step": 1010 + }, + { + "epoch": 0.05564422918157301, + "grad_norm": 1.0579735040664673, + "learning_rate": 9.98441444350942e-06, + "loss": 0.9676, + "step": 1011 + }, + { + "epoch": 0.05569926798392867, + "grad_norm": 0.8693701028823853, + "learning_rate": 9.98438022619593e-06, + "loss": 0.9451, + "step": 1012 + }, + { + "epoch": 0.05575430678628433, + "grad_norm": 0.9251859784126282, + "learning_rate": 9.98434597142117e-06, + "loss": 0.7858, + "step": 1013 + }, + { + "epoch": 0.055809345588639994, + "grad_norm": 0.8584280014038086, + "learning_rate": 9.984311679185402e-06, + "loss": 0.8481, + "step": 1014 + }, + { + "epoch": 0.05586438439099565, + "grad_norm": 0.8903968334197998, + "learning_rate": 9.98427734948888e-06, + "loss": 0.7832, + "step": 1015 + }, + { + "epoch": 0.05591942319335131, + "grad_norm": 0.905581533908844, + "learning_rate": 9.984242982331864e-06, + "loss": 0.8088, + "step": 1016 + }, + { + "epoch": 0.05597446199570697, + "grad_norm": 0.9866476655006409, + "learning_rate": 9.984208577714612e-06, + "loss": 0.8366, + "step": 1017 + }, + { + "epoch": 0.05602950079806263, + "grad_norm": 0.8843809962272644, + "learning_rate": 9.984174135637384e-06, + "loss": 0.8961, + "step": 1018 + }, + { + "epoch": 0.056084539600418294, + "grad_norm": 0.9071753621101379, + "learning_rate": 9.984139656100435e-06, + "loss": 0.8671, + "step": 1019 + }, + { + "epoch": 0.056139578402773956, + "grad_norm": 0.9894018173217773, + "learning_rate": 9.984105139104028e-06, + "loss": 0.9099, + "step": 1020 + }, + { + "epoch": 0.05619461720512962, + "grad_norm": 0.8432741165161133, + "learning_rate": 9.98407058464842e-06, + "loss": 0.7817, + "step": 1021 + }, + { + "epoch": 0.05624965600748528, + "grad_norm": 0.9538390040397644, + "learning_rate": 9.984035992733873e-06, + "loss": 0.8689, + "step": 1022 + }, + { + "epoch": 0.05630469480984094, + "grad_norm": 0.9263421297073364, + "learning_rate": 9.984001363360645e-06, + "loss": 0.9066, + "step": 1023 + }, + { + "epoch": 0.0563597336121966, + "grad_norm": 0.8921047449111938, + "learning_rate": 9.983966696528996e-06, + "loss": 0.8304, + "step": 1024 + }, + { + "epoch": 0.05641477241455226, + "grad_norm": 0.8379812240600586, + "learning_rate": 9.983931992239188e-06, + "loss": 0.866, + "step": 1025 + }, + { + "epoch": 0.05646981121690792, + "grad_norm": 0.9444219470024109, + "learning_rate": 9.983897250491481e-06, + "loss": 0.9456, + "step": 1026 + }, + { + "epoch": 0.05652485001926358, + "grad_norm": 1.0268759727478027, + "learning_rate": 9.983862471286137e-06, + "loss": 0.8277, + "step": 1027 + }, + { + "epoch": 0.05657988882161924, + "grad_norm": 1.3949217796325684, + "learning_rate": 9.983827654623418e-06, + "loss": 0.9721, + "step": 1028 + }, + { + "epoch": 0.0566349276239749, + "grad_norm": 0.8899377584457397, + "learning_rate": 9.983792800503582e-06, + "loss": 0.8794, + "step": 1029 + }, + { + "epoch": 0.05668996642633056, + "grad_norm": 0.989072322845459, + "learning_rate": 9.983757908926895e-06, + "loss": 0.8852, + "step": 1030 + }, + { + "epoch": 0.056745005228686224, + "grad_norm": 0.9797759056091309, + "learning_rate": 9.983722979893615e-06, + "loss": 1.0405, + "step": 1031 + }, + { + "epoch": 0.056800044031041885, + "grad_norm": 0.9044767618179321, + "learning_rate": 9.98368801340401e-06, + "loss": 0.7243, + "step": 1032 + }, + { + "epoch": 0.05685508283339755, + "grad_norm": 1.116324782371521, + "learning_rate": 9.983653009458338e-06, + "loss": 0.9183, + "step": 1033 + }, + { + "epoch": 0.05691012163575321, + "grad_norm": 0.9373337030410767, + "learning_rate": 9.983617968056866e-06, + "loss": 0.9417, + "step": 1034 + }, + { + "epoch": 0.05696516043810887, + "grad_norm": 1.0587197542190552, + "learning_rate": 9.983582889199855e-06, + "loss": 0.896, + "step": 1035 + }, + { + "epoch": 0.05702019924046453, + "grad_norm": 1.0080119371414185, + "learning_rate": 9.983547772887568e-06, + "loss": 0.924, + "step": 1036 + }, + { + "epoch": 0.057075238042820185, + "grad_norm": 0.847091019153595, + "learning_rate": 9.98351261912027e-06, + "loss": 0.7443, + "step": 1037 + }, + { + "epoch": 0.05713027684517585, + "grad_norm": 0.9876272082328796, + "learning_rate": 9.983477427898225e-06, + "loss": 0.9365, + "step": 1038 + }, + { + "epoch": 0.05718531564753151, + "grad_norm": 0.9188169240951538, + "learning_rate": 9.983442199221698e-06, + "loss": 0.9213, + "step": 1039 + }, + { + "epoch": 0.05724035444988717, + "grad_norm": 0.932399332523346, + "learning_rate": 9.983406933090954e-06, + "loss": 0.958, + "step": 1040 + }, + { + "epoch": 0.05729539325224283, + "grad_norm": 0.9126465320587158, + "learning_rate": 9.983371629506258e-06, + "loss": 0.8913, + "step": 1041 + }, + { + "epoch": 0.05735043205459849, + "grad_norm": 0.80904620885849, + "learning_rate": 9.983336288467873e-06, + "loss": 0.7719, + "step": 1042 + }, + { + "epoch": 0.057405470856954154, + "grad_norm": 0.873833417892456, + "learning_rate": 9.983300909976067e-06, + "loss": 0.9201, + "step": 1043 + }, + { + "epoch": 0.057460509659309815, + "grad_norm": 0.8331829309463501, + "learning_rate": 9.983265494031107e-06, + "loss": 0.8605, + "step": 1044 + }, + { + "epoch": 0.057515548461665476, + "grad_norm": 0.8364768624305725, + "learning_rate": 9.983230040633255e-06, + "loss": 0.8627, + "step": 1045 + }, + { + "epoch": 0.05757058726402114, + "grad_norm": 0.9226736426353455, + "learning_rate": 9.98319454978278e-06, + "loss": 0.9759, + "step": 1046 + }, + { + "epoch": 0.05762562606637679, + "grad_norm": 0.8174427151679993, + "learning_rate": 9.98315902147995e-06, + "loss": 0.8066, + "step": 1047 + }, + { + "epoch": 0.057680664868732454, + "grad_norm": 0.9154924750328064, + "learning_rate": 9.98312345572503e-06, + "loss": 0.9112, + "step": 1048 + }, + { + "epoch": 0.057735703671088115, + "grad_norm": 0.8884655237197876, + "learning_rate": 9.983087852518289e-06, + "loss": 0.8699, + "step": 1049 + }, + { + "epoch": 0.057790742473443776, + "grad_norm": 0.8849230408668518, + "learning_rate": 9.983052211859992e-06, + "loss": 0.8999, + "step": 1050 + }, + { + "epoch": 0.05784578127579944, + "grad_norm": 1.025843858718872, + "learning_rate": 9.98301653375041e-06, + "loss": 0.7764, + "step": 1051 + }, + { + "epoch": 0.0579008200781551, + "grad_norm": 0.900505006313324, + "learning_rate": 9.98298081818981e-06, + "loss": 0.9196, + "step": 1052 + }, + { + "epoch": 0.05795585888051076, + "grad_norm": 0.9506704211235046, + "learning_rate": 9.982945065178457e-06, + "loss": 0.8319, + "step": 1053 + }, + { + "epoch": 0.05801089768286642, + "grad_norm": 0.9439849853515625, + "learning_rate": 9.982909274716626e-06, + "loss": 0.8561, + "step": 1054 + }, + { + "epoch": 0.05806593648522208, + "grad_norm": 0.8761240243911743, + "learning_rate": 9.982873446804579e-06, + "loss": 0.9681, + "step": 1055 + }, + { + "epoch": 0.058120975287577745, + "grad_norm": 0.8756145238876343, + "learning_rate": 9.982837581442592e-06, + "loss": 0.8452, + "step": 1056 + }, + { + "epoch": 0.058176014089933406, + "grad_norm": 0.8732383847236633, + "learning_rate": 9.982801678630932e-06, + "loss": 0.9018, + "step": 1057 + }, + { + "epoch": 0.05823105289228906, + "grad_norm": 0.8338272571563721, + "learning_rate": 9.982765738369867e-06, + "loss": 0.9308, + "step": 1058 + }, + { + "epoch": 0.05828609169464472, + "grad_norm": 0.843163013458252, + "learning_rate": 9.982729760659669e-06, + "loss": 0.7802, + "step": 1059 + }, + { + "epoch": 0.05834113049700038, + "grad_norm": 1.2007580995559692, + "learning_rate": 9.982693745500606e-06, + "loss": 0.8406, + "step": 1060 + }, + { + "epoch": 0.058396169299356045, + "grad_norm": 0.8760073781013489, + "learning_rate": 9.982657692892954e-06, + "loss": 0.8528, + "step": 1061 + }, + { + "epoch": 0.058451208101711706, + "grad_norm": 0.925309419631958, + "learning_rate": 9.982621602836978e-06, + "loss": 0.9601, + "step": 1062 + }, + { + "epoch": 0.05850624690406737, + "grad_norm": 0.9277135133743286, + "learning_rate": 9.982585475332952e-06, + "loss": 0.8405, + "step": 1063 + }, + { + "epoch": 0.05856128570642303, + "grad_norm": 0.928044319152832, + "learning_rate": 9.98254931038115e-06, + "loss": 0.8259, + "step": 1064 + }, + { + "epoch": 0.05861632450877869, + "grad_norm": 0.8363838195800781, + "learning_rate": 9.982513107981837e-06, + "loss": 0.8655, + "step": 1065 + }, + { + "epoch": 0.05867136331113435, + "grad_norm": 0.9800984859466553, + "learning_rate": 9.982476868135292e-06, + "loss": 0.9285, + "step": 1066 + }, + { + "epoch": 0.05872640211349001, + "grad_norm": 0.8062636256217957, + "learning_rate": 9.982440590841785e-06, + "loss": 0.754, + "step": 1067 + }, + { + "epoch": 0.058781440915845674, + "grad_norm": 1.2010705471038818, + "learning_rate": 9.982404276101586e-06, + "loss": 0.9872, + "step": 1068 + }, + { + "epoch": 0.05883647971820133, + "grad_norm": 1.0036406517028809, + "learning_rate": 9.982367923914971e-06, + "loss": 0.8724, + "step": 1069 + }, + { + "epoch": 0.05889151852055699, + "grad_norm": 0.8768866658210754, + "learning_rate": 9.982331534282212e-06, + "loss": 0.838, + "step": 1070 + }, + { + "epoch": 0.05894655732291265, + "grad_norm": 0.7892739176750183, + "learning_rate": 9.982295107203584e-06, + "loss": 0.6974, + "step": 1071 + }, + { + "epoch": 0.05900159612526831, + "grad_norm": 0.863315999507904, + "learning_rate": 9.982258642679358e-06, + "loss": 0.9282, + "step": 1072 + }, + { + "epoch": 0.059056634927623974, + "grad_norm": 0.8645132780075073, + "learning_rate": 9.982222140709812e-06, + "loss": 0.8504, + "step": 1073 + }, + { + "epoch": 0.059111673729979636, + "grad_norm": 1.0003199577331543, + "learning_rate": 9.982185601295216e-06, + "loss": 1.0293, + "step": 1074 + }, + { + "epoch": 0.0591667125323353, + "grad_norm": 0.8391831517219543, + "learning_rate": 9.982149024435848e-06, + "loss": 0.8609, + "step": 1075 + }, + { + "epoch": 0.05922175133469096, + "grad_norm": 0.9940230846405029, + "learning_rate": 9.982112410131981e-06, + "loss": 0.9623, + "step": 1076 + }, + { + "epoch": 0.05927679013704662, + "grad_norm": 1.0670262575149536, + "learning_rate": 9.98207575838389e-06, + "loss": 0.9952, + "step": 1077 + }, + { + "epoch": 0.05933182893940228, + "grad_norm": 0.8506165742874146, + "learning_rate": 9.982039069191853e-06, + "loss": 0.8401, + "step": 1078 + }, + { + "epoch": 0.05938686774175794, + "grad_norm": 0.8956409096717834, + "learning_rate": 9.982002342556144e-06, + "loss": 0.8779, + "step": 1079 + }, + { + "epoch": 0.0594419065441136, + "grad_norm": 0.8955749273300171, + "learning_rate": 9.981965578477038e-06, + "loss": 0.8946, + "step": 1080 + }, + { + "epoch": 0.05949694534646926, + "grad_norm": 0.9035234451293945, + "learning_rate": 9.981928776954811e-06, + "loss": 0.9352, + "step": 1081 + }, + { + "epoch": 0.05955198414882492, + "grad_norm": 0.8748759627342224, + "learning_rate": 9.981891937989743e-06, + "loss": 0.8803, + "step": 1082 + }, + { + "epoch": 0.05960702295118058, + "grad_norm": 0.9966281056404114, + "learning_rate": 9.981855061582108e-06, + "loss": 0.9304, + "step": 1083 + }, + { + "epoch": 0.05966206175353624, + "grad_norm": 0.8696668148040771, + "learning_rate": 9.981818147732183e-06, + "loss": 0.8706, + "step": 1084 + }, + { + "epoch": 0.059717100555891904, + "grad_norm": 0.9823188185691833, + "learning_rate": 9.981781196440249e-06, + "loss": 0.9431, + "step": 1085 + }, + { + "epoch": 0.059772139358247565, + "grad_norm": 0.8401583433151245, + "learning_rate": 9.981744207706577e-06, + "loss": 0.8369, + "step": 1086 + }, + { + "epoch": 0.05982717816060323, + "grad_norm": 0.8775757551193237, + "learning_rate": 9.981707181531452e-06, + "loss": 0.9516, + "step": 1087 + }, + { + "epoch": 0.05988221696295889, + "grad_norm": 0.9153465628623962, + "learning_rate": 9.981670117915148e-06, + "loss": 0.8997, + "step": 1088 + }, + { + "epoch": 0.05993725576531455, + "grad_norm": 0.9053078889846802, + "learning_rate": 9.981633016857946e-06, + "loss": 0.9452, + "step": 1089 + }, + { + "epoch": 0.059992294567670204, + "grad_norm": 0.9154480695724487, + "learning_rate": 9.981595878360123e-06, + "loss": 0.8293, + "step": 1090 + }, + { + "epoch": 0.060047333370025865, + "grad_norm": 0.85718834400177, + "learning_rate": 9.981558702421958e-06, + "loss": 0.876, + "step": 1091 + }, + { + "epoch": 0.06010237217238153, + "grad_norm": 0.9437130689620972, + "learning_rate": 9.981521489043734e-06, + "loss": 0.9731, + "step": 1092 + }, + { + "epoch": 0.06015741097473719, + "grad_norm": 0.9014891386032104, + "learning_rate": 9.981484238225725e-06, + "loss": 0.811, + "step": 1093 + }, + { + "epoch": 0.06021244977709285, + "grad_norm": 0.8942846655845642, + "learning_rate": 9.981446949968216e-06, + "loss": 0.808, + "step": 1094 + }, + { + "epoch": 0.06026748857944851, + "grad_norm": 0.855297863483429, + "learning_rate": 9.981409624271483e-06, + "loss": 0.8319, + "step": 1095 + }, + { + "epoch": 0.06032252738180417, + "grad_norm": 0.9310913681983948, + "learning_rate": 9.981372261135811e-06, + "loss": 0.899, + "step": 1096 + }, + { + "epoch": 0.060377566184159834, + "grad_norm": 0.8472979664802551, + "learning_rate": 9.981334860561478e-06, + "loss": 0.8818, + "step": 1097 + }, + { + "epoch": 0.060432604986515495, + "grad_norm": 0.896617591381073, + "learning_rate": 9.981297422548764e-06, + "loss": 0.8991, + "step": 1098 + }, + { + "epoch": 0.06048764378887116, + "grad_norm": 0.8543037176132202, + "learning_rate": 9.981259947097954e-06, + "loss": 0.8595, + "step": 1099 + }, + { + "epoch": 0.06054268259122682, + "grad_norm": 0.8794904947280884, + "learning_rate": 9.981222434209327e-06, + "loss": 0.8561, + "step": 1100 + }, + { + "epoch": 0.06059772139358247, + "grad_norm": 0.8882116675376892, + "learning_rate": 9.981184883883165e-06, + "loss": 0.8099, + "step": 1101 + }, + { + "epoch": 0.060652760195938134, + "grad_norm": 1.0068262815475464, + "learning_rate": 9.98114729611975e-06, + "loss": 0.8104, + "step": 1102 + }, + { + "epoch": 0.060707798998293795, + "grad_norm": 1.072316288948059, + "learning_rate": 9.981109670919366e-06, + "loss": 0.9877, + "step": 1103 + }, + { + "epoch": 0.06076283780064946, + "grad_norm": 0.9959045052528381, + "learning_rate": 9.981072008282298e-06, + "loss": 0.906, + "step": 1104 + }, + { + "epoch": 0.06081787660300512, + "grad_norm": 0.8712790608406067, + "learning_rate": 9.981034308208823e-06, + "loss": 0.8725, + "step": 1105 + }, + { + "epoch": 0.06087291540536078, + "grad_norm": 0.9114679098129272, + "learning_rate": 9.980996570699228e-06, + "loss": 0.8385, + "step": 1106 + }, + { + "epoch": 0.06092795420771644, + "grad_norm": 1.0024466514587402, + "learning_rate": 9.980958795753796e-06, + "loss": 0.8661, + "step": 1107 + }, + { + "epoch": 0.0609829930100721, + "grad_norm": 0.9578461050987244, + "learning_rate": 9.98092098337281e-06, + "loss": 0.9358, + "step": 1108 + }, + { + "epoch": 0.061038031812427763, + "grad_norm": 0.8677787780761719, + "learning_rate": 9.980883133556557e-06, + "loss": 0.8146, + "step": 1109 + }, + { + "epoch": 0.061093070614783425, + "grad_norm": 0.9072276949882507, + "learning_rate": 9.98084524630532e-06, + "loss": 0.91, + "step": 1110 + }, + { + "epoch": 0.061148109417139086, + "grad_norm": 0.8827292919158936, + "learning_rate": 9.980807321619381e-06, + "loss": 0.8854, + "step": 1111 + }, + { + "epoch": 0.06120314821949474, + "grad_norm": 1.0012744665145874, + "learning_rate": 9.98076935949903e-06, + "loss": 0.8242, + "step": 1112 + }, + { + "epoch": 0.0612581870218504, + "grad_norm": 0.9152620434761047, + "learning_rate": 9.980731359944548e-06, + "loss": 0.8832, + "step": 1113 + }, + { + "epoch": 0.061313225824206063, + "grad_norm": 0.8986824750900269, + "learning_rate": 9.980693322956222e-06, + "loss": 0.7975, + "step": 1114 + }, + { + "epoch": 0.061368264626561725, + "grad_norm": 0.9373019933700562, + "learning_rate": 9.98065524853434e-06, + "loss": 0.9541, + "step": 1115 + }, + { + "epoch": 0.061423303428917386, + "grad_norm": 0.9875593781471252, + "learning_rate": 9.980617136679185e-06, + "loss": 1.0052, + "step": 1116 + }, + { + "epoch": 0.06147834223127305, + "grad_norm": 1.0664819478988647, + "learning_rate": 9.980578987391045e-06, + "loss": 0.8584, + "step": 1117 + }, + { + "epoch": 0.06153338103362871, + "grad_norm": 0.9149377942085266, + "learning_rate": 9.980540800670207e-06, + "loss": 0.8467, + "step": 1118 + }, + { + "epoch": 0.06158841983598437, + "grad_norm": 0.9303194284439087, + "learning_rate": 9.980502576516959e-06, + "loss": 0.8219, + "step": 1119 + }, + { + "epoch": 0.06164345863834003, + "grad_norm": 0.9059457778930664, + "learning_rate": 9.980464314931583e-06, + "loss": 0.8459, + "step": 1120 + }, + { + "epoch": 0.06169849744069569, + "grad_norm": 0.9368849396705627, + "learning_rate": 9.980426015914375e-06, + "loss": 0.8933, + "step": 1121 + }, + { + "epoch": 0.061753536243051355, + "grad_norm": 0.8188626766204834, + "learning_rate": 9.980387679465615e-06, + "loss": 0.807, + "step": 1122 + }, + { + "epoch": 0.06180857504540701, + "grad_norm": 1.027171015739441, + "learning_rate": 9.980349305585595e-06, + "loss": 0.8919, + "step": 1123 + }, + { + "epoch": 0.06186361384776267, + "grad_norm": 0.831649899482727, + "learning_rate": 9.980310894274603e-06, + "loss": 0.8109, + "step": 1124 + }, + { + "epoch": 0.06191865265011833, + "grad_norm": 1.0170252323150635, + "learning_rate": 9.980272445532928e-06, + "loss": 0.9537, + "step": 1125 + }, + { + "epoch": 0.06197369145247399, + "grad_norm": 0.97837233543396, + "learning_rate": 9.980233959360858e-06, + "loss": 0.9104, + "step": 1126 + }, + { + "epoch": 0.062028730254829655, + "grad_norm": 0.9548324942588806, + "learning_rate": 9.980195435758681e-06, + "loss": 0.9473, + "step": 1127 + }, + { + "epoch": 0.062083769057185316, + "grad_norm": 0.8675842881202698, + "learning_rate": 9.980156874726692e-06, + "loss": 0.8313, + "step": 1128 + }, + { + "epoch": 0.06213880785954098, + "grad_norm": 0.8948968052864075, + "learning_rate": 9.980118276265173e-06, + "loss": 0.8008, + "step": 1129 + }, + { + "epoch": 0.06219384666189664, + "grad_norm": 0.8914239406585693, + "learning_rate": 9.98007964037442e-06, + "loss": 0.7642, + "step": 1130 + }, + { + "epoch": 0.0622488854642523, + "grad_norm": 0.9499951004981995, + "learning_rate": 9.980040967054723e-06, + "loss": 0.8669, + "step": 1131 + }, + { + "epoch": 0.06230392426660796, + "grad_norm": 0.8959251642227173, + "learning_rate": 9.980002256306369e-06, + "loss": 0.9177, + "step": 1132 + }, + { + "epoch": 0.062358963068963616, + "grad_norm": 0.8634380102157593, + "learning_rate": 9.97996350812965e-06, + "loss": 0.8252, + "step": 1133 + }, + { + "epoch": 0.06241400187131928, + "grad_norm": 0.9380598068237305, + "learning_rate": 9.97992472252486e-06, + "loss": 0.9335, + "step": 1134 + }, + { + "epoch": 0.06246904067367494, + "grad_norm": 0.8373183608055115, + "learning_rate": 9.97988589949229e-06, + "loss": 0.848, + "step": 1135 + }, + { + "epoch": 0.0625240794760306, + "grad_norm": 0.9649023413658142, + "learning_rate": 9.97984703903223e-06, + "loss": 0.9648, + "step": 1136 + }, + { + "epoch": 0.06257911827838626, + "grad_norm": 0.9972373843193054, + "learning_rate": 9.979808141144972e-06, + "loss": 0.9104, + "step": 1137 + }, + { + "epoch": 0.06263415708074192, + "grad_norm": 0.8230985403060913, + "learning_rate": 9.97976920583081e-06, + "loss": 0.8393, + "step": 1138 + }, + { + "epoch": 0.06268919588309758, + "grad_norm": 0.9775324463844299, + "learning_rate": 9.979730233090034e-06, + "loss": 0.8385, + "step": 1139 + }, + { + "epoch": 0.06274423468545325, + "grad_norm": 0.8288110494613647, + "learning_rate": 9.97969122292294e-06, + "loss": 0.7308, + "step": 1140 + }, + { + "epoch": 0.06279927348780891, + "grad_norm": 0.8980758786201477, + "learning_rate": 9.979652175329819e-06, + "loss": 0.863, + "step": 1141 + }, + { + "epoch": 0.06285431229016457, + "grad_norm": 7.43889045715332, + "learning_rate": 9.979613090310965e-06, + "loss": 0.9412, + "step": 1142 + }, + { + "epoch": 0.06290935109252023, + "grad_norm": 0.9758191704750061, + "learning_rate": 9.97957396786667e-06, + "loss": 0.8896, + "step": 1143 + }, + { + "epoch": 0.06296438989487589, + "grad_norm": 0.8211693167686462, + "learning_rate": 9.979534807997234e-06, + "loss": 0.7352, + "step": 1144 + }, + { + "epoch": 0.06301942869723155, + "grad_norm": 0.8643441796302795, + "learning_rate": 9.979495610702945e-06, + "loss": 0.8701, + "step": 1145 + }, + { + "epoch": 0.06307446749958721, + "grad_norm": 1.0199437141418457, + "learning_rate": 9.9794563759841e-06, + "loss": 0.9025, + "step": 1146 + }, + { + "epoch": 0.06312950630194288, + "grad_norm": 0.8367893695831299, + "learning_rate": 9.979417103840994e-06, + "loss": 0.8491, + "step": 1147 + }, + { + "epoch": 0.06318454510429854, + "grad_norm": 0.9411819577217102, + "learning_rate": 9.979377794273923e-06, + "loss": 0.8501, + "step": 1148 + }, + { + "epoch": 0.06323958390665418, + "grad_norm": 1.1497365236282349, + "learning_rate": 9.97933844728318e-06, + "loss": 1.0227, + "step": 1149 + }, + { + "epoch": 0.06329462270900985, + "grad_norm": 0.9892984628677368, + "learning_rate": 9.979299062869064e-06, + "loss": 0.8942, + "step": 1150 + }, + { + "epoch": 0.06334966151136551, + "grad_norm": 0.947952926158905, + "learning_rate": 9.979259641031867e-06, + "loss": 1.0149, + "step": 1151 + }, + { + "epoch": 0.06340470031372117, + "grad_norm": 0.9060251712799072, + "learning_rate": 9.979220181771889e-06, + "loss": 0.8607, + "step": 1152 + }, + { + "epoch": 0.06345973911607683, + "grad_norm": 0.8331984281539917, + "learning_rate": 9.979180685089424e-06, + "loss": 0.8777, + "step": 1153 + }, + { + "epoch": 0.06351477791843249, + "grad_norm": 0.9133188724517822, + "learning_rate": 9.97914115098477e-06, + "loss": 0.7409, + "step": 1154 + }, + { + "epoch": 0.06356981672078815, + "grad_norm": 0.9095513820648193, + "learning_rate": 9.979101579458224e-06, + "loss": 0.8938, + "step": 1155 + }, + { + "epoch": 0.06362485552314381, + "grad_norm": 0.9584553241729736, + "learning_rate": 9.979061970510082e-06, + "loss": 0.8765, + "step": 1156 + }, + { + "epoch": 0.06367989432549948, + "grad_norm": 0.8742124438285828, + "learning_rate": 9.979022324140644e-06, + "loss": 0.8564, + "step": 1157 + }, + { + "epoch": 0.06373493312785514, + "grad_norm": 0.8776904344558716, + "learning_rate": 9.978982640350208e-06, + "loss": 0.8713, + "step": 1158 + }, + { + "epoch": 0.0637899719302108, + "grad_norm": 0.8667464852333069, + "learning_rate": 9.97894291913907e-06, + "loss": 0.8705, + "step": 1159 + }, + { + "epoch": 0.06384501073256646, + "grad_norm": 0.9028087854385376, + "learning_rate": 9.978903160507531e-06, + "loss": 0.8297, + "step": 1160 + }, + { + "epoch": 0.06390004953492212, + "grad_norm": 0.900812029838562, + "learning_rate": 9.978863364455887e-06, + "loss": 0.8456, + "step": 1161 + }, + { + "epoch": 0.06395508833727778, + "grad_norm": 0.9667207598686218, + "learning_rate": 9.97882353098444e-06, + "loss": 0.8081, + "step": 1162 + }, + { + "epoch": 0.06401012713963344, + "grad_norm": 0.8959711194038391, + "learning_rate": 9.978783660093488e-06, + "loss": 0.8455, + "step": 1163 + }, + { + "epoch": 0.0640651659419891, + "grad_norm": 0.8519117832183838, + "learning_rate": 9.97874375178333e-06, + "loss": 0.849, + "step": 1164 + }, + { + "epoch": 0.06412020474434477, + "grad_norm": 1.0532654523849487, + "learning_rate": 9.978703806054267e-06, + "loss": 0.7356, + "step": 1165 + }, + { + "epoch": 0.06417524354670043, + "grad_norm": 1.0313252210617065, + "learning_rate": 9.9786638229066e-06, + "loss": 1.024, + "step": 1166 + }, + { + "epoch": 0.06423028234905609, + "grad_norm": 1.0567537546157837, + "learning_rate": 9.978623802340627e-06, + "loss": 0.9423, + "step": 1167 + }, + { + "epoch": 0.06428532115141175, + "grad_norm": 0.8198097348213196, + "learning_rate": 9.97858374435665e-06, + "loss": 0.829, + "step": 1168 + }, + { + "epoch": 0.06434035995376741, + "grad_norm": 0.8718193173408508, + "learning_rate": 9.97854364895497e-06, + "loss": 0.7184, + "step": 1169 + }, + { + "epoch": 0.06439539875612307, + "grad_norm": 0.8037594556808472, + "learning_rate": 9.978503516135892e-06, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.06445043755847872, + "grad_norm": 0.9052229523658752, + "learning_rate": 9.978463345899709e-06, + "loss": 0.8016, + "step": 1171 + }, + { + "epoch": 0.06450547636083438, + "grad_norm": 1.0194638967514038, + "learning_rate": 9.978423138246731e-06, + "loss": 0.9045, + "step": 1172 + }, + { + "epoch": 0.06456051516319004, + "grad_norm": 0.953078031539917, + "learning_rate": 9.978382893177259e-06, + "loss": 0.9661, + "step": 1173 + }, + { + "epoch": 0.0646155539655457, + "grad_norm": 0.8777341842651367, + "learning_rate": 9.978342610691592e-06, + "loss": 0.8685, + "step": 1174 + }, + { + "epoch": 0.06467059276790137, + "grad_norm": 1.0178394317626953, + "learning_rate": 9.978302290790034e-06, + "loss": 0.9075, + "step": 1175 + }, + { + "epoch": 0.06472563157025703, + "grad_norm": 0.935694694519043, + "learning_rate": 9.978261933472889e-06, + "loss": 0.8438, + "step": 1176 + }, + { + "epoch": 0.06478067037261269, + "grad_norm": 1.0022411346435547, + "learning_rate": 9.97822153874046e-06, + "loss": 0.8701, + "step": 1177 + }, + { + "epoch": 0.06483570917496835, + "grad_norm": 1.0371203422546387, + "learning_rate": 9.97818110659305e-06, + "loss": 0.9111, + "step": 1178 + }, + { + "epoch": 0.06489074797732401, + "grad_norm": 0.7972478866577148, + "learning_rate": 9.978140637030963e-06, + "loss": 0.8602, + "step": 1179 + }, + { + "epoch": 0.06494578677967967, + "grad_norm": 0.8556679487228394, + "learning_rate": 9.978100130054505e-06, + "loss": 0.9149, + "step": 1180 + }, + { + "epoch": 0.06500082558203533, + "grad_norm": 0.92474365234375, + "learning_rate": 9.978059585663979e-06, + "loss": 0.8608, + "step": 1181 + }, + { + "epoch": 0.065055864384391, + "grad_norm": 1.0170830488204956, + "learning_rate": 9.978019003859687e-06, + "loss": 0.9986, + "step": 1182 + }, + { + "epoch": 0.06511090318674666, + "grad_norm": 0.9405049681663513, + "learning_rate": 9.97797838464194e-06, + "loss": 0.9023, + "step": 1183 + }, + { + "epoch": 0.06516594198910232, + "grad_norm": 0.9351203441619873, + "learning_rate": 9.977937728011038e-06, + "loss": 0.8698, + "step": 1184 + }, + { + "epoch": 0.06522098079145798, + "grad_norm": 0.8620241284370422, + "learning_rate": 9.97789703396729e-06, + "loss": 0.9393, + "step": 1185 + }, + { + "epoch": 0.06527601959381364, + "grad_norm": 0.9440441131591797, + "learning_rate": 9.977856302511e-06, + "loss": 0.8249, + "step": 1186 + }, + { + "epoch": 0.0653310583961693, + "grad_norm": 0.8311079144477844, + "learning_rate": 9.977815533642474e-06, + "loss": 0.8614, + "step": 1187 + }, + { + "epoch": 0.06538609719852496, + "grad_norm": 0.8911672830581665, + "learning_rate": 9.977774727362018e-06, + "loss": 0.7909, + "step": 1188 + }, + { + "epoch": 0.06544113600088063, + "grad_norm": 0.9237088561058044, + "learning_rate": 9.97773388366994e-06, + "loss": 0.7116, + "step": 1189 + }, + { + "epoch": 0.06549617480323629, + "grad_norm": 1.1155747175216675, + "learning_rate": 9.977693002566549e-06, + "loss": 0.9248, + "step": 1190 + }, + { + "epoch": 0.06555121360559195, + "grad_norm": 0.9386736750602722, + "learning_rate": 9.977652084052148e-06, + "loss": 0.8307, + "step": 1191 + }, + { + "epoch": 0.0656062524079476, + "grad_norm": 1.1666040420532227, + "learning_rate": 9.977611128127044e-06, + "loss": 0.9723, + "step": 1192 + }, + { + "epoch": 0.06566129121030326, + "grad_norm": 1.2366368770599365, + "learning_rate": 9.977570134791552e-06, + "loss": 0.8253, + "step": 1193 + }, + { + "epoch": 0.06571633001265892, + "grad_norm": 0.823443591594696, + "learning_rate": 9.977529104045971e-06, + "loss": 0.7472, + "step": 1194 + }, + { + "epoch": 0.06577136881501458, + "grad_norm": 0.9481683969497681, + "learning_rate": 9.977488035890617e-06, + "loss": 0.887, + "step": 1195 + }, + { + "epoch": 0.06582640761737024, + "grad_norm": 0.9120422005653381, + "learning_rate": 9.977446930325794e-06, + "loss": 0.867, + "step": 1196 + }, + { + "epoch": 0.0658814464197259, + "grad_norm": 0.8595587015151978, + "learning_rate": 9.977405787351811e-06, + "loss": 0.8532, + "step": 1197 + }, + { + "epoch": 0.06593648522208156, + "grad_norm": 0.8590419888496399, + "learning_rate": 9.97736460696898e-06, + "loss": 0.8998, + "step": 1198 + }, + { + "epoch": 0.06599152402443723, + "grad_norm": 0.9670939445495605, + "learning_rate": 9.977323389177609e-06, + "loss": 0.8964, + "step": 1199 + }, + { + "epoch": 0.06604656282679289, + "grad_norm": 0.8870261907577515, + "learning_rate": 9.977282133978006e-06, + "loss": 0.9542, + "step": 1200 + }, + { + "epoch": 0.06610160162914855, + "grad_norm": 0.942294180393219, + "learning_rate": 9.977240841370484e-06, + "loss": 0.8681, + "step": 1201 + }, + { + "epoch": 0.06615664043150421, + "grad_norm": 0.9632517099380493, + "learning_rate": 9.977199511355353e-06, + "loss": 0.7327, + "step": 1202 + }, + { + "epoch": 0.06621167923385987, + "grad_norm": 4.8085479736328125, + "learning_rate": 9.97715814393292e-06, + "loss": 0.8528, + "step": 1203 + }, + { + "epoch": 0.06626671803621553, + "grad_norm": 0.9084093570709229, + "learning_rate": 9.977116739103503e-06, + "loss": 0.7836, + "step": 1204 + }, + { + "epoch": 0.0663217568385712, + "grad_norm": 0.8961902260780334, + "learning_rate": 9.977075296867406e-06, + "loss": 0.854, + "step": 1205 + }, + { + "epoch": 0.06637679564092686, + "grad_norm": 0.8727987408638, + "learning_rate": 9.977033817224945e-06, + "loss": 0.7931, + "step": 1206 + }, + { + "epoch": 0.06643183444328252, + "grad_norm": 0.8263267874717712, + "learning_rate": 9.976992300176428e-06, + "loss": 0.852, + "step": 1207 + }, + { + "epoch": 0.06648687324563818, + "grad_norm": 1.0499639511108398, + "learning_rate": 9.97695074572217e-06, + "loss": 1.0427, + "step": 1208 + }, + { + "epoch": 0.06654191204799384, + "grad_norm": 0.9337313771247864, + "learning_rate": 9.976909153862482e-06, + "loss": 0.8035, + "step": 1209 + }, + { + "epoch": 0.0665969508503495, + "grad_norm": 0.8795992732048035, + "learning_rate": 9.976867524597678e-06, + "loss": 0.9022, + "step": 1210 + }, + { + "epoch": 0.06665198965270516, + "grad_norm": 0.9787294268608093, + "learning_rate": 9.976825857928069e-06, + "loss": 0.8259, + "step": 1211 + }, + { + "epoch": 0.06670702845506082, + "grad_norm": 0.8570082187652588, + "learning_rate": 9.976784153853969e-06, + "loss": 0.8567, + "step": 1212 + }, + { + "epoch": 0.06676206725741649, + "grad_norm": 1.0620380640029907, + "learning_rate": 9.976742412375694e-06, + "loss": 0.851, + "step": 1213 + }, + { + "epoch": 0.06681710605977213, + "grad_norm": 0.8545439839363098, + "learning_rate": 9.976700633493551e-06, + "loss": 0.8827, + "step": 1214 + }, + { + "epoch": 0.0668721448621278, + "grad_norm": 0.8543682098388672, + "learning_rate": 9.97665881720786e-06, + "loss": 0.8524, + "step": 1215 + }, + { + "epoch": 0.06692718366448346, + "grad_norm": 0.7748527526855469, + "learning_rate": 9.976616963518935e-06, + "loss": 0.7459, + "step": 1216 + }, + { + "epoch": 0.06698222246683912, + "grad_norm": 0.9876659512519836, + "learning_rate": 9.976575072427087e-06, + "loss": 0.8426, + "step": 1217 + }, + { + "epoch": 0.06703726126919478, + "grad_norm": 0.8763901591300964, + "learning_rate": 9.976533143932635e-06, + "loss": 0.8561, + "step": 1218 + }, + { + "epoch": 0.06709230007155044, + "grad_norm": 0.7816654443740845, + "learning_rate": 9.97649117803589e-06, + "loss": 0.8361, + "step": 1219 + }, + { + "epoch": 0.0671473388739061, + "grad_norm": 0.8659802675247192, + "learning_rate": 9.97644917473717e-06, + "loss": 0.897, + "step": 1220 + }, + { + "epoch": 0.06720237767626176, + "grad_norm": 0.9180877208709717, + "learning_rate": 9.97640713403679e-06, + "loss": 0.9516, + "step": 1221 + }, + { + "epoch": 0.06725741647861742, + "grad_norm": 0.9624410271644592, + "learning_rate": 9.976365055935067e-06, + "loss": 0.9119, + "step": 1222 + }, + { + "epoch": 0.06731245528097309, + "grad_norm": 0.8291105031967163, + "learning_rate": 9.976322940432314e-06, + "loss": 0.788, + "step": 1223 + }, + { + "epoch": 0.06736749408332875, + "grad_norm": 0.9858983755111694, + "learning_rate": 9.976280787528854e-06, + "loss": 0.8794, + "step": 1224 + }, + { + "epoch": 0.06742253288568441, + "grad_norm": 0.8283948302268982, + "learning_rate": 9.976238597224996e-06, + "loss": 0.8571, + "step": 1225 + }, + { + "epoch": 0.06747757168804007, + "grad_norm": 0.8585363626480103, + "learning_rate": 9.976196369521063e-06, + "loss": 0.9005, + "step": 1226 + }, + { + "epoch": 0.06753261049039573, + "grad_norm": 0.847882091999054, + "learning_rate": 9.976154104417369e-06, + "loss": 0.8058, + "step": 1227 + }, + { + "epoch": 0.06758764929275139, + "grad_norm": 0.9045611023902893, + "learning_rate": 9.976111801914232e-06, + "loss": 0.7864, + "step": 1228 + }, + { + "epoch": 0.06764268809510705, + "grad_norm": 0.805932879447937, + "learning_rate": 9.976069462011972e-06, + "loss": 0.8436, + "step": 1229 + }, + { + "epoch": 0.06769772689746271, + "grad_norm": 0.8809003233909607, + "learning_rate": 9.976027084710906e-06, + "loss": 0.7876, + "step": 1230 + }, + { + "epoch": 0.06775276569981838, + "grad_norm": 0.8681740760803223, + "learning_rate": 9.975984670011352e-06, + "loss": 0.877, + "step": 1231 + }, + { + "epoch": 0.06780780450217404, + "grad_norm": 0.9909854531288147, + "learning_rate": 9.975942217913627e-06, + "loss": 0.8957, + "step": 1232 + }, + { + "epoch": 0.0678628433045297, + "grad_norm": 0.9213934540748596, + "learning_rate": 9.975899728418056e-06, + "loss": 0.8344, + "step": 1233 + }, + { + "epoch": 0.06791788210688536, + "grad_norm": 0.8289967179298401, + "learning_rate": 9.975857201524952e-06, + "loss": 0.876, + "step": 1234 + }, + { + "epoch": 0.06797292090924101, + "grad_norm": 0.891812264919281, + "learning_rate": 9.97581463723464e-06, + "loss": 0.8611, + "step": 1235 + }, + { + "epoch": 0.06802795971159667, + "grad_norm": 1.0301382541656494, + "learning_rate": 9.975772035547435e-06, + "loss": 0.8177, + "step": 1236 + }, + { + "epoch": 0.06808299851395233, + "grad_norm": 0.8380662798881531, + "learning_rate": 9.975729396463659e-06, + "loss": 0.8631, + "step": 1237 + }, + { + "epoch": 0.06813803731630799, + "grad_norm": 0.9226046204566956, + "learning_rate": 9.975686719983633e-06, + "loss": 0.8927, + "step": 1238 + }, + { + "epoch": 0.06819307611866365, + "grad_norm": 0.8917136192321777, + "learning_rate": 9.975644006107679e-06, + "loss": 0.9048, + "step": 1239 + }, + { + "epoch": 0.06824811492101931, + "grad_norm": 0.8559191226959229, + "learning_rate": 9.975601254836114e-06, + "loss": 0.8169, + "step": 1240 + }, + { + "epoch": 0.06830315372337498, + "grad_norm": 0.9345341920852661, + "learning_rate": 9.975558466169263e-06, + "loss": 0.7929, + "step": 1241 + }, + { + "epoch": 0.06835819252573064, + "grad_norm": 0.9155850410461426, + "learning_rate": 9.975515640107447e-06, + "loss": 0.8825, + "step": 1242 + }, + { + "epoch": 0.0684132313280863, + "grad_norm": 0.899712860584259, + "learning_rate": 9.975472776650987e-06, + "loss": 0.825, + "step": 1243 + }, + { + "epoch": 0.06846827013044196, + "grad_norm": 0.8280880451202393, + "learning_rate": 9.975429875800206e-06, + "loss": 0.8539, + "step": 1244 + }, + { + "epoch": 0.06852330893279762, + "grad_norm": 0.9589636325836182, + "learning_rate": 9.975386937555426e-06, + "loss": 0.9465, + "step": 1245 + }, + { + "epoch": 0.06857834773515328, + "grad_norm": 1.1027253866195679, + "learning_rate": 9.97534396191697e-06, + "loss": 0.87, + "step": 1246 + }, + { + "epoch": 0.06863338653750894, + "grad_norm": 1.0510318279266357, + "learning_rate": 9.975300948885158e-06, + "loss": 0.8569, + "step": 1247 + }, + { + "epoch": 0.0686884253398646, + "grad_norm": 0.8897958397865295, + "learning_rate": 9.975257898460317e-06, + "loss": 0.8431, + "step": 1248 + }, + { + "epoch": 0.06874346414222027, + "grad_norm": 0.8827036619186401, + "learning_rate": 9.975214810642771e-06, + "loss": 0.922, + "step": 1249 + }, + { + "epoch": 0.06879850294457593, + "grad_norm": 0.8798324465751648, + "learning_rate": 9.97517168543284e-06, + "loss": 0.7837, + "step": 1250 + }, + { + "epoch": 0.06885354174693159, + "grad_norm": 0.9053803086280823, + "learning_rate": 9.975128522830853e-06, + "loss": 0.82, + "step": 1251 + }, + { + "epoch": 0.06890858054928725, + "grad_norm": 0.8362607359886169, + "learning_rate": 9.975085322837129e-06, + "loss": 0.7684, + "step": 1252 + }, + { + "epoch": 0.06896361935164291, + "grad_norm": 0.8898602724075317, + "learning_rate": 9.975042085451997e-06, + "loss": 0.8205, + "step": 1253 + }, + { + "epoch": 0.06901865815399857, + "grad_norm": 0.9210274815559387, + "learning_rate": 9.97499881067578e-06, + "loss": 0.8364, + "step": 1254 + }, + { + "epoch": 0.06907369695635424, + "grad_norm": 1.0881952047348022, + "learning_rate": 9.974955498508804e-06, + "loss": 0.8234, + "step": 1255 + }, + { + "epoch": 0.0691287357587099, + "grad_norm": 0.8875024914741516, + "learning_rate": 9.974912148951394e-06, + "loss": 0.7974, + "step": 1256 + }, + { + "epoch": 0.06918377456106554, + "grad_norm": 0.9065666794776917, + "learning_rate": 9.974868762003876e-06, + "loss": 0.7721, + "step": 1257 + }, + { + "epoch": 0.0692388133634212, + "grad_norm": 0.8904553651809692, + "learning_rate": 9.974825337666576e-06, + "loss": 0.8551, + "step": 1258 + }, + { + "epoch": 0.06929385216577687, + "grad_norm": 0.8586102724075317, + "learning_rate": 9.974781875939821e-06, + "loss": 0.8666, + "step": 1259 + }, + { + "epoch": 0.06934889096813253, + "grad_norm": 0.9103402495384216, + "learning_rate": 9.974738376823935e-06, + "loss": 0.8361, + "step": 1260 + }, + { + "epoch": 0.06940392977048819, + "grad_norm": 0.8657701015472412, + "learning_rate": 9.974694840319249e-06, + "loss": 0.8217, + "step": 1261 + }, + { + "epoch": 0.06945896857284385, + "grad_norm": 0.865703821182251, + "learning_rate": 9.974651266426088e-06, + "loss": 0.8751, + "step": 1262 + }, + { + "epoch": 0.06951400737519951, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.974607655144779e-06, + "loss": 0.8709, + "step": 1263 + }, + { + "epoch": 0.06956904617755517, + "grad_norm": 0.8417405486106873, + "learning_rate": 9.97456400647565e-06, + "loss": 0.8104, + "step": 1264 + }, + { + "epoch": 0.06962408497991084, + "grad_norm": 0.8578035235404968, + "learning_rate": 9.974520320419032e-06, + "loss": 0.9173, + "step": 1265 + }, + { + "epoch": 0.0696791237822665, + "grad_norm": 0.957539439201355, + "learning_rate": 9.974476596975249e-06, + "loss": 0.8955, + "step": 1266 + }, + { + "epoch": 0.06973416258462216, + "grad_norm": 0.851222038269043, + "learning_rate": 9.974432836144632e-06, + "loss": 0.8696, + "step": 1267 + }, + { + "epoch": 0.06978920138697782, + "grad_norm": 0.8178789615631104, + "learning_rate": 9.974389037927508e-06, + "loss": 0.7921, + "step": 1268 + }, + { + "epoch": 0.06984424018933348, + "grad_norm": 0.954091489315033, + "learning_rate": 9.97434520232421e-06, + "loss": 0.9362, + "step": 1269 + }, + { + "epoch": 0.06989927899168914, + "grad_norm": 0.8525053858757019, + "learning_rate": 9.974301329335063e-06, + "loss": 0.7996, + "step": 1270 + }, + { + "epoch": 0.0699543177940448, + "grad_norm": 0.9340476393699646, + "learning_rate": 9.9742574189604e-06, + "loss": 0.9091, + "step": 1271 + }, + { + "epoch": 0.07000935659640047, + "grad_norm": 0.7946187257766724, + "learning_rate": 9.974213471200548e-06, + "loss": 0.874, + "step": 1272 + }, + { + "epoch": 0.07006439539875613, + "grad_norm": 0.8048381209373474, + "learning_rate": 9.97416948605584e-06, + "loss": 0.8557, + "step": 1273 + }, + { + "epoch": 0.07011943420111179, + "grad_norm": 0.9849064946174622, + "learning_rate": 9.974125463526607e-06, + "loss": 0.8154, + "step": 1274 + }, + { + "epoch": 0.07017447300346745, + "grad_norm": 0.9030239582061768, + "learning_rate": 9.974081403613178e-06, + "loss": 0.9411, + "step": 1275 + }, + { + "epoch": 0.07022951180582311, + "grad_norm": 0.8869300484657288, + "learning_rate": 9.974037306315882e-06, + "loss": 0.8978, + "step": 1276 + }, + { + "epoch": 0.07028455060817877, + "grad_norm": 0.8558536767959595, + "learning_rate": 9.973993171635057e-06, + "loss": 0.8937, + "step": 1277 + }, + { + "epoch": 0.07033958941053442, + "grad_norm": 0.9005453586578369, + "learning_rate": 9.973948999571029e-06, + "loss": 0.9336, + "step": 1278 + }, + { + "epoch": 0.07039462821289008, + "grad_norm": 0.8489978909492493, + "learning_rate": 9.973904790124131e-06, + "loss": 0.8267, + "step": 1279 + }, + { + "epoch": 0.07044966701524574, + "grad_norm": 0.8295948505401611, + "learning_rate": 9.973860543294696e-06, + "loss": 0.8478, + "step": 1280 + }, + { + "epoch": 0.0705047058176014, + "grad_norm": 0.8111379742622375, + "learning_rate": 9.973816259083058e-06, + "loss": 0.8333, + "step": 1281 + }, + { + "epoch": 0.07055974461995707, + "grad_norm": 0.9380189776420593, + "learning_rate": 9.973771937489547e-06, + "loss": 0.9718, + "step": 1282 + }, + { + "epoch": 0.07061478342231273, + "grad_norm": 1.251194953918457, + "learning_rate": 9.973727578514499e-06, + "loss": 0.9531, + "step": 1283 + }, + { + "epoch": 0.07066982222466839, + "grad_norm": 0.9897224307060242, + "learning_rate": 9.973683182158243e-06, + "loss": 0.7853, + "step": 1284 + }, + { + "epoch": 0.07072486102702405, + "grad_norm": 0.8409335017204285, + "learning_rate": 9.973638748421119e-06, + "loss": 0.7692, + "step": 1285 + }, + { + "epoch": 0.07077989982937971, + "grad_norm": 0.9019681215286255, + "learning_rate": 9.973594277303456e-06, + "loss": 0.8135, + "step": 1286 + }, + { + "epoch": 0.07083493863173537, + "grad_norm": 0.9236096739768982, + "learning_rate": 9.973549768805588e-06, + "loss": 0.9304, + "step": 1287 + }, + { + "epoch": 0.07088997743409103, + "grad_norm": 0.9244743585586548, + "learning_rate": 9.973505222927854e-06, + "loss": 0.9056, + "step": 1288 + }, + { + "epoch": 0.0709450162364467, + "grad_norm": 1.3418753147125244, + "learning_rate": 9.973460639670585e-06, + "loss": 0.8419, + "step": 1289 + }, + { + "epoch": 0.07100005503880236, + "grad_norm": 0.8715767860412598, + "learning_rate": 9.973416019034117e-06, + "loss": 0.9704, + "step": 1290 + }, + { + "epoch": 0.07105509384115802, + "grad_norm": 0.9609012007713318, + "learning_rate": 9.973371361018787e-06, + "loss": 0.8807, + "step": 1291 + }, + { + "epoch": 0.07111013264351368, + "grad_norm": 0.8085873126983643, + "learning_rate": 9.973326665624927e-06, + "loss": 0.7947, + "step": 1292 + }, + { + "epoch": 0.07116517144586934, + "grad_norm": 0.919280469417572, + "learning_rate": 9.973281932852877e-06, + "loss": 0.9743, + "step": 1293 + }, + { + "epoch": 0.071220210248225, + "grad_norm": 1.0651074647903442, + "learning_rate": 9.973237162702968e-06, + "loss": 0.7164, + "step": 1294 + }, + { + "epoch": 0.07127524905058066, + "grad_norm": 0.987251341342926, + "learning_rate": 9.973192355175542e-06, + "loss": 0.9286, + "step": 1295 + }, + { + "epoch": 0.07133028785293632, + "grad_norm": 1.5507274866104126, + "learning_rate": 9.973147510270935e-06, + "loss": 0.9733, + "step": 1296 + }, + { + "epoch": 0.07138532665529199, + "grad_norm": 0.8439416885375977, + "learning_rate": 9.97310262798948e-06, + "loss": 0.7462, + "step": 1297 + }, + { + "epoch": 0.07144036545764765, + "grad_norm": 0.9604889750480652, + "learning_rate": 9.973057708331519e-06, + "loss": 1.0006, + "step": 1298 + }, + { + "epoch": 0.07149540426000331, + "grad_norm": 0.8568960428237915, + "learning_rate": 9.973012751297386e-06, + "loss": 0.878, + "step": 1299 + }, + { + "epoch": 0.07155044306235896, + "grad_norm": 0.8169522285461426, + "learning_rate": 9.972967756887419e-06, + "loss": 0.8241, + "step": 1300 + }, + { + "epoch": 0.07160548186471462, + "grad_norm": 0.875738799571991, + "learning_rate": 9.97292272510196e-06, + "loss": 0.854, + "step": 1301 + }, + { + "epoch": 0.07166052066707028, + "grad_norm": 0.7877739071846008, + "learning_rate": 9.972877655941345e-06, + "loss": 0.779, + "step": 1302 + }, + { + "epoch": 0.07171555946942594, + "grad_norm": 0.8148574829101562, + "learning_rate": 9.972832549405912e-06, + "loss": 0.6965, + "step": 1303 + }, + { + "epoch": 0.0717705982717816, + "grad_norm": 0.936720609664917, + "learning_rate": 9.972787405495998e-06, + "loss": 0.798, + "step": 1304 + }, + { + "epoch": 0.07182563707413726, + "grad_norm": 0.8932886123657227, + "learning_rate": 9.972742224211949e-06, + "loss": 0.9196, + "step": 1305 + }, + { + "epoch": 0.07188067587649292, + "grad_norm": 0.899246871471405, + "learning_rate": 9.972697005554099e-06, + "loss": 0.8081, + "step": 1306 + }, + { + "epoch": 0.07193571467884859, + "grad_norm": 0.8789899349212646, + "learning_rate": 9.972651749522788e-06, + "loss": 0.89, + "step": 1307 + }, + { + "epoch": 0.07199075348120425, + "grad_norm": 1.2412173748016357, + "learning_rate": 9.97260645611836e-06, + "loss": 0.9866, + "step": 1308 + }, + { + "epoch": 0.07204579228355991, + "grad_norm": 0.8655833005905151, + "learning_rate": 9.972561125341152e-06, + "loss": 0.8144, + "step": 1309 + }, + { + "epoch": 0.07210083108591557, + "grad_norm": 0.8705299496650696, + "learning_rate": 9.972515757191506e-06, + "loss": 0.8431, + "step": 1310 + }, + { + "epoch": 0.07215586988827123, + "grad_norm": 0.8813188672065735, + "learning_rate": 9.972470351669761e-06, + "loss": 0.859, + "step": 1311 + }, + { + "epoch": 0.0722109086906269, + "grad_norm": 2.043627977371216, + "learning_rate": 9.972424908776262e-06, + "loss": 0.9886, + "step": 1312 + }, + { + "epoch": 0.07226594749298255, + "grad_norm": 0.9167500734329224, + "learning_rate": 9.972379428511348e-06, + "loss": 0.7203, + "step": 1313 + }, + { + "epoch": 0.07232098629533822, + "grad_norm": 1.3145136833190918, + "learning_rate": 9.972333910875358e-06, + "loss": 0.9325, + "step": 1314 + }, + { + "epoch": 0.07237602509769388, + "grad_norm": 0.834710419178009, + "learning_rate": 9.972288355868641e-06, + "loss": 0.9361, + "step": 1315 + }, + { + "epoch": 0.07243106390004954, + "grad_norm": 0.9039230942726135, + "learning_rate": 9.972242763491535e-06, + "loss": 0.8027, + "step": 1316 + }, + { + "epoch": 0.0724861027024052, + "grad_norm": 0.8911495208740234, + "learning_rate": 9.972197133744384e-06, + "loss": 0.951, + "step": 1317 + }, + { + "epoch": 0.07254114150476086, + "grad_norm": 1.0752439498901367, + "learning_rate": 9.972151466627529e-06, + "loss": 0.8421, + "step": 1318 + }, + { + "epoch": 0.07259618030711652, + "grad_norm": 0.926135778427124, + "learning_rate": 9.972105762141314e-06, + "loss": 0.8901, + "step": 1319 + }, + { + "epoch": 0.07265121910947218, + "grad_norm": 0.8166295289993286, + "learning_rate": 9.972060020286085e-06, + "loss": 0.7845, + "step": 1320 + }, + { + "epoch": 0.07270625791182783, + "grad_norm": 1.0000934600830078, + "learning_rate": 9.972014241062182e-06, + "loss": 0.8383, + "step": 1321 + }, + { + "epoch": 0.0727612967141835, + "grad_norm": 1.2617899179458618, + "learning_rate": 9.971968424469951e-06, + "loss": 0.9826, + "step": 1322 + }, + { + "epoch": 0.07281633551653915, + "grad_norm": 0.8451040983200073, + "learning_rate": 9.971922570509738e-06, + "loss": 0.8262, + "step": 1323 + }, + { + "epoch": 0.07287137431889482, + "grad_norm": 0.8101939558982849, + "learning_rate": 9.971876679181884e-06, + "loss": 0.6904, + "step": 1324 + }, + { + "epoch": 0.07292641312125048, + "grad_norm": 0.8805514574050903, + "learning_rate": 9.971830750486736e-06, + "loss": 0.8491, + "step": 1325 + }, + { + "epoch": 0.07298145192360614, + "grad_norm": 0.8236901164054871, + "learning_rate": 9.97178478442464e-06, + "loss": 0.8462, + "step": 1326 + }, + { + "epoch": 0.0730364907259618, + "grad_norm": 0.9183042645454407, + "learning_rate": 9.971738780995938e-06, + "loss": 0.7577, + "step": 1327 + }, + { + "epoch": 0.07309152952831746, + "grad_norm": 0.8425934314727783, + "learning_rate": 9.971692740200982e-06, + "loss": 0.8462, + "step": 1328 + }, + { + "epoch": 0.07314656833067312, + "grad_norm": 0.9114993214607239, + "learning_rate": 9.971646662040112e-06, + "loss": 0.9132, + "step": 1329 + }, + { + "epoch": 0.07320160713302878, + "grad_norm": 0.8516649603843689, + "learning_rate": 9.971600546513675e-06, + "loss": 0.8819, + "step": 1330 + }, + { + "epoch": 0.07325664593538445, + "grad_norm": 1.0859558582305908, + "learning_rate": 9.971554393622023e-06, + "loss": 0.9929, + "step": 1331 + }, + { + "epoch": 0.07331168473774011, + "grad_norm": 0.8906900882720947, + "learning_rate": 9.971508203365497e-06, + "loss": 0.9166, + "step": 1332 + }, + { + "epoch": 0.07336672354009577, + "grad_norm": 0.8931803703308105, + "learning_rate": 9.971461975744445e-06, + "loss": 0.864, + "step": 1333 + }, + { + "epoch": 0.07342176234245143, + "grad_norm": 0.8404982686042786, + "learning_rate": 9.971415710759216e-06, + "loss": 0.8609, + "step": 1334 + }, + { + "epoch": 0.07347680114480709, + "grad_norm": 0.8016490340232849, + "learning_rate": 9.971369408410157e-06, + "loss": 0.7694, + "step": 1335 + }, + { + "epoch": 0.07353183994716275, + "grad_norm": 0.7700600028038025, + "learning_rate": 9.971323068697618e-06, + "loss": 0.7875, + "step": 1336 + }, + { + "epoch": 0.07358687874951841, + "grad_norm": 0.8679799437522888, + "learning_rate": 9.971276691621946e-06, + "loss": 0.8409, + "step": 1337 + }, + { + "epoch": 0.07364191755187408, + "grad_norm": 0.8329173922538757, + "learning_rate": 9.971230277183486e-06, + "loss": 0.8707, + "step": 1338 + }, + { + "epoch": 0.07369695635422974, + "grad_norm": 0.8790140151977539, + "learning_rate": 9.97118382538259e-06, + "loss": 0.7631, + "step": 1339 + }, + { + "epoch": 0.0737519951565854, + "grad_norm": 1.1895341873168945, + "learning_rate": 9.97113733621961e-06, + "loss": 0.8555, + "step": 1340 + }, + { + "epoch": 0.07380703395894106, + "grad_norm": 0.8531593680381775, + "learning_rate": 9.97109080969489e-06, + "loss": 0.7192, + "step": 1341 + }, + { + "epoch": 0.07386207276129672, + "grad_norm": 1.0388946533203125, + "learning_rate": 9.971044245808784e-06, + "loss": 0.8182, + "step": 1342 + }, + { + "epoch": 0.07391711156365237, + "grad_norm": 0.8858556747436523, + "learning_rate": 9.970997644561639e-06, + "loss": 0.7981, + "step": 1343 + }, + { + "epoch": 0.07397215036600803, + "grad_norm": 0.8710204362869263, + "learning_rate": 9.970951005953807e-06, + "loss": 0.7667, + "step": 1344 + }, + { + "epoch": 0.07402718916836369, + "grad_norm": 0.9788708090782166, + "learning_rate": 9.970904329985638e-06, + "loss": 0.9693, + "step": 1345 + }, + { + "epoch": 0.07408222797071935, + "grad_norm": 0.7805914878845215, + "learning_rate": 9.970857616657482e-06, + "loss": 0.6683, + "step": 1346 + }, + { + "epoch": 0.07413726677307501, + "grad_norm": 0.9977933168411255, + "learning_rate": 9.97081086596969e-06, + "loss": 0.8288, + "step": 1347 + }, + { + "epoch": 0.07419230557543068, + "grad_norm": 0.829115629196167, + "learning_rate": 9.970764077922617e-06, + "loss": 0.8361, + "step": 1348 + }, + { + "epoch": 0.07424734437778634, + "grad_norm": 1.226120114326477, + "learning_rate": 9.97071725251661e-06, + "loss": 1.0008, + "step": 1349 + }, + { + "epoch": 0.074302383180142, + "grad_norm": 0.8997750878334045, + "learning_rate": 9.970670389752021e-06, + "loss": 0.8048, + "step": 1350 + }, + { + "epoch": 0.07435742198249766, + "grad_norm": 1.0885238647460938, + "learning_rate": 9.970623489629205e-06, + "loss": 0.9202, + "step": 1351 + }, + { + "epoch": 0.07441246078485332, + "grad_norm": 0.8736100792884827, + "learning_rate": 9.970576552148515e-06, + "loss": 0.8515, + "step": 1352 + }, + { + "epoch": 0.07446749958720898, + "grad_norm": 0.9211294651031494, + "learning_rate": 9.970529577310301e-06, + "loss": 0.9389, + "step": 1353 + }, + { + "epoch": 0.07452253838956464, + "grad_norm": 0.9334765672683716, + "learning_rate": 9.970482565114917e-06, + "loss": 0.8165, + "step": 1354 + }, + { + "epoch": 0.0745775771919203, + "grad_norm": 0.8307162523269653, + "learning_rate": 9.970435515562717e-06, + "loss": 0.7829, + "step": 1355 + }, + { + "epoch": 0.07463261599427597, + "grad_norm": 0.987634003162384, + "learning_rate": 9.970388428654055e-06, + "loss": 0.848, + "step": 1356 + }, + { + "epoch": 0.07468765479663163, + "grad_norm": 1.094752311706543, + "learning_rate": 9.970341304389281e-06, + "loss": 1.003, + "step": 1357 + }, + { + "epoch": 0.07474269359898729, + "grad_norm": 0.9865909814834595, + "learning_rate": 9.970294142768755e-06, + "loss": 0.9116, + "step": 1358 + }, + { + "epoch": 0.07479773240134295, + "grad_norm": 0.8404149413108826, + "learning_rate": 9.970246943792828e-06, + "loss": 0.8699, + "step": 1359 + }, + { + "epoch": 0.07485277120369861, + "grad_norm": 0.9602416753768921, + "learning_rate": 9.970199707461855e-06, + "loss": 0.8166, + "step": 1360 + }, + { + "epoch": 0.07490781000605427, + "grad_norm": 0.9748693704605103, + "learning_rate": 9.970152433776193e-06, + "loss": 0.8767, + "step": 1361 + }, + { + "epoch": 0.07496284880840993, + "grad_norm": 0.8721657991409302, + "learning_rate": 9.970105122736194e-06, + "loss": 0.8825, + "step": 1362 + }, + { + "epoch": 0.0750178876107656, + "grad_norm": 0.8683610558509827, + "learning_rate": 9.970057774342215e-06, + "loss": 0.7873, + "step": 1363 + }, + { + "epoch": 0.07507292641312124, + "grad_norm": 0.856396496295929, + "learning_rate": 9.970010388594613e-06, + "loss": 0.8505, + "step": 1364 + }, + { + "epoch": 0.0751279652154769, + "grad_norm": 1.0709880590438843, + "learning_rate": 9.969962965493744e-06, + "loss": 0.9519, + "step": 1365 + }, + { + "epoch": 0.07518300401783257, + "grad_norm": 0.8839450478553772, + "learning_rate": 9.969915505039963e-06, + "loss": 0.8041, + "step": 1366 + }, + { + "epoch": 0.07523804282018823, + "grad_norm": 0.89545738697052, + "learning_rate": 9.969868007233627e-06, + "loss": 0.8713, + "step": 1367 + }, + { + "epoch": 0.07529308162254389, + "grad_norm": 0.9870849251747131, + "learning_rate": 9.969820472075094e-06, + "loss": 0.8655, + "step": 1368 + }, + { + "epoch": 0.07534812042489955, + "grad_norm": 1.3123797178268433, + "learning_rate": 9.96977289956472e-06, + "loss": 1.0425, + "step": 1369 + }, + { + "epoch": 0.07540315922725521, + "grad_norm": 0.8538400530815125, + "learning_rate": 9.969725289702865e-06, + "loss": 0.7052, + "step": 1370 + }, + { + "epoch": 0.07545819802961087, + "grad_norm": 0.933397114276886, + "learning_rate": 9.969677642489884e-06, + "loss": 0.9819, + "step": 1371 + }, + { + "epoch": 0.07551323683196653, + "grad_norm": 0.8428112268447876, + "learning_rate": 9.969629957926134e-06, + "loss": 0.7313, + "step": 1372 + }, + { + "epoch": 0.0755682756343222, + "grad_norm": 0.9023239612579346, + "learning_rate": 9.96958223601198e-06, + "loss": 0.8297, + "step": 1373 + }, + { + "epoch": 0.07562331443667786, + "grad_norm": 0.8971324563026428, + "learning_rate": 9.969534476747771e-06, + "loss": 0.8832, + "step": 1374 + }, + { + "epoch": 0.07567835323903352, + "grad_norm": 0.8709388375282288, + "learning_rate": 9.969486680133874e-06, + "loss": 0.743, + "step": 1375 + }, + { + "epoch": 0.07573339204138918, + "grad_norm": 0.9094591736793518, + "learning_rate": 9.969438846170644e-06, + "loss": 0.8294, + "step": 1376 + }, + { + "epoch": 0.07578843084374484, + "grad_norm": 1.0753988027572632, + "learning_rate": 9.969390974858444e-06, + "loss": 0.7479, + "step": 1377 + }, + { + "epoch": 0.0758434696461005, + "grad_norm": 0.933775007724762, + "learning_rate": 9.96934306619763e-06, + "loss": 0.8235, + "step": 1378 + }, + { + "epoch": 0.07589850844845616, + "grad_norm": 0.8419735431671143, + "learning_rate": 9.969295120188565e-06, + "loss": 0.8103, + "step": 1379 + }, + { + "epoch": 0.07595354725081183, + "grad_norm": 0.8912790417671204, + "learning_rate": 9.969247136831606e-06, + "loss": 0.911, + "step": 1380 + }, + { + "epoch": 0.07600858605316749, + "grad_norm": 0.8780983090400696, + "learning_rate": 9.969199116127118e-06, + "loss": 0.8619, + "step": 1381 + }, + { + "epoch": 0.07606362485552315, + "grad_norm": 0.8503809571266174, + "learning_rate": 9.969151058075459e-06, + "loss": 0.8093, + "step": 1382 + }, + { + "epoch": 0.07611866365787881, + "grad_norm": 0.8633087277412415, + "learning_rate": 9.96910296267699e-06, + "loss": 0.7524, + "step": 1383 + }, + { + "epoch": 0.07617370246023447, + "grad_norm": 1.1203595399856567, + "learning_rate": 9.969054829932074e-06, + "loss": 0.945, + "step": 1384 + }, + { + "epoch": 0.07622874126259013, + "grad_norm": 0.8766878843307495, + "learning_rate": 9.969006659841072e-06, + "loss": 0.7537, + "step": 1385 + }, + { + "epoch": 0.07628378006494578, + "grad_norm": 0.9795958399772644, + "learning_rate": 9.968958452404345e-06, + "loss": 0.7963, + "step": 1386 + }, + { + "epoch": 0.07633881886730144, + "grad_norm": 0.9117506146430969, + "learning_rate": 9.968910207622257e-06, + "loss": 0.9469, + "step": 1387 + }, + { + "epoch": 0.0763938576696571, + "grad_norm": 0.9731466770172119, + "learning_rate": 9.96886192549517e-06, + "loss": 0.9536, + "step": 1388 + }, + { + "epoch": 0.07644889647201276, + "grad_norm": 0.8923571109771729, + "learning_rate": 9.968813606023446e-06, + "loss": 0.8362, + "step": 1389 + }, + { + "epoch": 0.07650393527436843, + "grad_norm": 0.8819600343704224, + "learning_rate": 9.96876524920745e-06, + "loss": 0.6938, + "step": 1390 + }, + { + "epoch": 0.07655897407672409, + "grad_norm": 0.9629887342453003, + "learning_rate": 9.968716855047545e-06, + "loss": 0.9104, + "step": 1391 + }, + { + "epoch": 0.07661401287907975, + "grad_norm": 0.992770254611969, + "learning_rate": 9.968668423544093e-06, + "loss": 0.944, + "step": 1392 + }, + { + "epoch": 0.07666905168143541, + "grad_norm": 0.8578491806983948, + "learning_rate": 9.96861995469746e-06, + "loss": 0.898, + "step": 1393 + }, + { + "epoch": 0.07672409048379107, + "grad_norm": 1.1169229745864868, + "learning_rate": 9.968571448508008e-06, + "loss": 0.8324, + "step": 1394 + }, + { + "epoch": 0.07677912928614673, + "grad_norm": 0.9600160121917725, + "learning_rate": 9.968522904976106e-06, + "loss": 0.9519, + "step": 1395 + }, + { + "epoch": 0.0768341680885024, + "grad_norm": 0.8271373510360718, + "learning_rate": 9.968474324102112e-06, + "loss": 0.8576, + "step": 1396 + }, + { + "epoch": 0.07688920689085806, + "grad_norm": 0.9437325596809387, + "learning_rate": 9.968425705886397e-06, + "loss": 0.9201, + "step": 1397 + }, + { + "epoch": 0.07694424569321372, + "grad_norm": 0.8679039478302002, + "learning_rate": 9.968377050329325e-06, + "loss": 0.8893, + "step": 1398 + }, + { + "epoch": 0.07699928449556938, + "grad_norm": 1.0178717374801636, + "learning_rate": 9.96832835743126e-06, + "loss": 0.9718, + "step": 1399 + }, + { + "epoch": 0.07705432329792504, + "grad_norm": 0.8354432582855225, + "learning_rate": 9.96827962719257e-06, + "loss": 0.83, + "step": 1400 + }, + { + "epoch": 0.0771093621002807, + "grad_norm": 1.2244631052017212, + "learning_rate": 9.968230859613619e-06, + "loss": 0.907, + "step": 1401 + }, + { + "epoch": 0.07716440090263636, + "grad_norm": 0.9099625945091248, + "learning_rate": 9.968182054694775e-06, + "loss": 0.809, + "step": 1402 + }, + { + "epoch": 0.07721943970499202, + "grad_norm": 0.8591424226760864, + "learning_rate": 9.968133212436404e-06, + "loss": 0.8869, + "step": 1403 + }, + { + "epoch": 0.07727447850734769, + "grad_norm": 1.068003535270691, + "learning_rate": 9.968084332838876e-06, + "loss": 0.8747, + "step": 1404 + }, + { + "epoch": 0.07732951730970335, + "grad_norm": 0.8503691554069519, + "learning_rate": 9.968035415902555e-06, + "loss": 0.7478, + "step": 1405 + }, + { + "epoch": 0.07738455611205901, + "grad_norm": 0.9209537506103516, + "learning_rate": 9.967986461627808e-06, + "loss": 0.9052, + "step": 1406 + }, + { + "epoch": 0.07743959491441466, + "grad_norm": 0.8447962999343872, + "learning_rate": 9.967937470015006e-06, + "loss": 0.7897, + "step": 1407 + }, + { + "epoch": 0.07749463371677032, + "grad_norm": 0.8731846809387207, + "learning_rate": 9.967888441064515e-06, + "loss": 0.837, + "step": 1408 + }, + { + "epoch": 0.07754967251912598, + "grad_norm": 0.9810444712638855, + "learning_rate": 9.967839374776705e-06, + "loss": 0.8236, + "step": 1409 + }, + { + "epoch": 0.07760471132148164, + "grad_norm": 0.8283190131187439, + "learning_rate": 9.967790271151944e-06, + "loss": 0.8443, + "step": 1410 + }, + { + "epoch": 0.0776597501238373, + "grad_norm": 0.7999932765960693, + "learning_rate": 9.9677411301906e-06, + "loss": 0.7945, + "step": 1411 + }, + { + "epoch": 0.07771478892619296, + "grad_norm": 0.9435983300209045, + "learning_rate": 9.967691951893044e-06, + "loss": 0.9745, + "step": 1412 + }, + { + "epoch": 0.07776982772854862, + "grad_norm": 0.8885984420776367, + "learning_rate": 9.967642736259646e-06, + "loss": 0.9163, + "step": 1413 + }, + { + "epoch": 0.07782486653090429, + "grad_norm": 0.993928074836731, + "learning_rate": 9.967593483290776e-06, + "loss": 0.7797, + "step": 1414 + }, + { + "epoch": 0.07787990533325995, + "grad_norm": 1.058830976486206, + "learning_rate": 9.9675441929868e-06, + "loss": 0.8671, + "step": 1415 + }, + { + "epoch": 0.07793494413561561, + "grad_norm": 1.0469766855239868, + "learning_rate": 9.967494865348093e-06, + "loss": 0.8671, + "step": 1416 + }, + { + "epoch": 0.07798998293797127, + "grad_norm": 0.902729868888855, + "learning_rate": 9.967445500375025e-06, + "loss": 0.8748, + "step": 1417 + }, + { + "epoch": 0.07804502174032693, + "grad_norm": 0.90755295753479, + "learning_rate": 9.967396098067965e-06, + "loss": 0.8279, + "step": 1418 + }, + { + "epoch": 0.07810006054268259, + "grad_norm": 0.8822374939918518, + "learning_rate": 9.967346658427287e-06, + "loss": 0.9386, + "step": 1419 + }, + { + "epoch": 0.07815509934503825, + "grad_norm": 0.9201469421386719, + "learning_rate": 9.96729718145336e-06, + "loss": 0.8684, + "step": 1420 + }, + { + "epoch": 0.07821013814739392, + "grad_norm": 0.9451109766960144, + "learning_rate": 9.967247667146558e-06, + "loss": 0.7854, + "step": 1421 + }, + { + "epoch": 0.07826517694974958, + "grad_norm": 0.9146197438240051, + "learning_rate": 9.96719811550725e-06, + "loss": 0.8496, + "step": 1422 + }, + { + "epoch": 0.07832021575210524, + "grad_norm": 0.9771224856376648, + "learning_rate": 9.967148526535813e-06, + "loss": 0.9657, + "step": 1423 + }, + { + "epoch": 0.0783752545544609, + "grad_norm": 0.8437683582305908, + "learning_rate": 9.967098900232616e-06, + "loss": 0.8336, + "step": 1424 + }, + { + "epoch": 0.07843029335681656, + "grad_norm": 0.8232185244560242, + "learning_rate": 9.967049236598034e-06, + "loss": 0.8878, + "step": 1425 + }, + { + "epoch": 0.07848533215917222, + "grad_norm": 1.0200369358062744, + "learning_rate": 9.96699953563244e-06, + "loss": 0.8135, + "step": 1426 + }, + { + "epoch": 0.07854037096152788, + "grad_norm": 0.8779187202453613, + "learning_rate": 9.966949797336208e-06, + "loss": 0.9124, + "step": 1427 + }, + { + "epoch": 0.07859540976388354, + "grad_norm": 0.9557466506958008, + "learning_rate": 9.966900021709708e-06, + "loss": 0.9118, + "step": 1428 + }, + { + "epoch": 0.07865044856623919, + "grad_norm": 0.8431050777435303, + "learning_rate": 9.966850208753317e-06, + "loss": 0.8361, + "step": 1429 + }, + { + "epoch": 0.07870548736859485, + "grad_norm": 0.9269648194313049, + "learning_rate": 9.966800358467412e-06, + "loss": 0.9194, + "step": 1430 + }, + { + "epoch": 0.07876052617095052, + "grad_norm": 0.818681538105011, + "learning_rate": 9.966750470852363e-06, + "loss": 0.7483, + "step": 1431 + }, + { + "epoch": 0.07881556497330618, + "grad_norm": 0.8788284659385681, + "learning_rate": 9.966700545908547e-06, + "loss": 0.858, + "step": 1432 + }, + { + "epoch": 0.07887060377566184, + "grad_norm": 0.7734160423278809, + "learning_rate": 9.966650583636342e-06, + "loss": 0.694, + "step": 1433 + }, + { + "epoch": 0.0789256425780175, + "grad_norm": 0.8846608996391296, + "learning_rate": 9.966600584036117e-06, + "loss": 0.8144, + "step": 1434 + }, + { + "epoch": 0.07898068138037316, + "grad_norm": 0.9740058183670044, + "learning_rate": 9.966550547108254e-06, + "loss": 0.9314, + "step": 1435 + }, + { + "epoch": 0.07903572018272882, + "grad_norm": 0.8731759786605835, + "learning_rate": 9.966500472853124e-06, + "loss": 0.8475, + "step": 1436 + }, + { + "epoch": 0.07909075898508448, + "grad_norm": 0.8984843492507935, + "learning_rate": 9.966450361271109e-06, + "loss": 0.7803, + "step": 1437 + }, + { + "epoch": 0.07914579778744014, + "grad_norm": 0.8897966742515564, + "learning_rate": 9.96640021236258e-06, + "loss": 0.8879, + "step": 1438 + }, + { + "epoch": 0.0792008365897958, + "grad_norm": 0.80704265832901, + "learning_rate": 9.966350026127917e-06, + "loss": 0.7585, + "step": 1439 + }, + { + "epoch": 0.07925587539215147, + "grad_norm": 1.0807467699050903, + "learning_rate": 9.966299802567499e-06, + "loss": 1.078, + "step": 1440 + }, + { + "epoch": 0.07931091419450713, + "grad_norm": 0.7994028925895691, + "learning_rate": 9.966249541681697e-06, + "loss": 0.8074, + "step": 1441 + }, + { + "epoch": 0.07936595299686279, + "grad_norm": 0.877592921257019, + "learning_rate": 9.966199243470895e-06, + "loss": 0.8084, + "step": 1442 + }, + { + "epoch": 0.07942099179921845, + "grad_norm": 0.7704572081565857, + "learning_rate": 9.966148907935469e-06, + "loss": 0.7206, + "step": 1443 + }, + { + "epoch": 0.07947603060157411, + "grad_norm": 0.8222140669822693, + "learning_rate": 9.966098535075797e-06, + "loss": 0.7768, + "step": 1444 + }, + { + "epoch": 0.07953106940392977, + "grad_norm": 1.389320731163025, + "learning_rate": 9.966048124892257e-06, + "loss": 1.0356, + "step": 1445 + }, + { + "epoch": 0.07958610820628544, + "grad_norm": 0.9082457423210144, + "learning_rate": 9.965997677385229e-06, + "loss": 0.7379, + "step": 1446 + }, + { + "epoch": 0.0796411470086411, + "grad_norm": 0.8029153943061829, + "learning_rate": 9.965947192555093e-06, + "loss": 0.7826, + "step": 1447 + }, + { + "epoch": 0.07969618581099676, + "grad_norm": 0.8752758502960205, + "learning_rate": 9.965896670402227e-06, + "loss": 0.8526, + "step": 1448 + }, + { + "epoch": 0.07975122461335242, + "grad_norm": 1.0665404796600342, + "learning_rate": 9.965846110927009e-06, + "loss": 0.858, + "step": 1449 + }, + { + "epoch": 0.07980626341570807, + "grad_norm": 0.9468502402305603, + "learning_rate": 9.96579551412982e-06, + "loss": 0.9658, + "step": 1450 + }, + { + "epoch": 0.07986130221806373, + "grad_norm": 1.0239403247833252, + "learning_rate": 9.965744880011046e-06, + "loss": 0.7995, + "step": 1451 + }, + { + "epoch": 0.07991634102041939, + "grad_norm": 0.9808099865913391, + "learning_rate": 9.965694208571059e-06, + "loss": 1.0173, + "step": 1452 + }, + { + "epoch": 0.07997137982277505, + "grad_norm": 0.9338780641555786, + "learning_rate": 9.965643499810245e-06, + "loss": 0.7917, + "step": 1453 + }, + { + "epoch": 0.08002641862513071, + "grad_norm": 0.9294295310974121, + "learning_rate": 9.965592753728981e-06, + "loss": 0.88, + "step": 1454 + }, + { + "epoch": 0.08008145742748637, + "grad_norm": 1.0261508226394653, + "learning_rate": 9.965541970327654e-06, + "loss": 0.8825, + "step": 1455 + }, + { + "epoch": 0.08013649622984204, + "grad_norm": 0.8964946269989014, + "learning_rate": 9.965491149606642e-06, + "loss": 0.81, + "step": 1456 + }, + { + "epoch": 0.0801915350321977, + "grad_norm": 0.9468267560005188, + "learning_rate": 9.965440291566329e-06, + "loss": 0.9453, + "step": 1457 + }, + { + "epoch": 0.08024657383455336, + "grad_norm": 0.8289040327072144, + "learning_rate": 9.965389396207092e-06, + "loss": 0.7373, + "step": 1458 + }, + { + "epoch": 0.08030161263690902, + "grad_norm": 0.8782384991645813, + "learning_rate": 9.965338463529322e-06, + "loss": 0.9199, + "step": 1459 + }, + { + "epoch": 0.08035665143926468, + "grad_norm": 0.8613787293434143, + "learning_rate": 9.965287493533395e-06, + "loss": 0.8719, + "step": 1460 + }, + { + "epoch": 0.08041169024162034, + "grad_norm": 0.8474903106689453, + "learning_rate": 9.965236486219696e-06, + "loss": 0.8033, + "step": 1461 + }, + { + "epoch": 0.080466729043976, + "grad_norm": 1.1442681550979614, + "learning_rate": 9.965185441588609e-06, + "loss": 0.8996, + "step": 1462 + }, + { + "epoch": 0.08052176784633167, + "grad_norm": 1.564138412475586, + "learning_rate": 9.965134359640518e-06, + "loss": 0.7451, + "step": 1463 + }, + { + "epoch": 0.08057680664868733, + "grad_norm": 0.9211083054542542, + "learning_rate": 9.965083240375806e-06, + "loss": 0.8939, + "step": 1464 + }, + { + "epoch": 0.08063184545104299, + "grad_norm": 0.9503418207168579, + "learning_rate": 9.965032083794856e-06, + "loss": 0.8544, + "step": 1465 + }, + { + "epoch": 0.08068688425339865, + "grad_norm": 0.9304021596908569, + "learning_rate": 9.964980889898055e-06, + "loss": 0.9192, + "step": 1466 + }, + { + "epoch": 0.08074192305575431, + "grad_norm": 0.8430425524711609, + "learning_rate": 9.964929658685787e-06, + "loss": 0.8586, + "step": 1467 + }, + { + "epoch": 0.08079696185810997, + "grad_norm": 0.8671759366989136, + "learning_rate": 9.964878390158437e-06, + "loss": 0.8807, + "step": 1468 + }, + { + "epoch": 0.08085200066046563, + "grad_norm": 0.9548830986022949, + "learning_rate": 9.964827084316389e-06, + "loss": 0.9033, + "step": 1469 + }, + { + "epoch": 0.0809070394628213, + "grad_norm": 0.8736767768859863, + "learning_rate": 9.964775741160029e-06, + "loss": 0.8509, + "step": 1470 + }, + { + "epoch": 0.08096207826517696, + "grad_norm": 0.8827025890350342, + "learning_rate": 9.964724360689745e-06, + "loss": 0.897, + "step": 1471 + }, + { + "epoch": 0.0810171170675326, + "grad_norm": 1.02822744846344, + "learning_rate": 9.964672942905921e-06, + "loss": 1.0371, + "step": 1472 + }, + { + "epoch": 0.08107215586988827, + "grad_norm": 0.8619557619094849, + "learning_rate": 9.964621487808946e-06, + "loss": 0.7654, + "step": 1473 + }, + { + "epoch": 0.08112719467224393, + "grad_norm": 0.7855951189994812, + "learning_rate": 9.9645699953992e-06, + "loss": 0.7767, + "step": 1474 + }, + { + "epoch": 0.08118223347459959, + "grad_norm": 0.8139809370040894, + "learning_rate": 9.96451846567708e-06, + "loss": 0.7535, + "step": 1475 + }, + { + "epoch": 0.08123727227695525, + "grad_norm": 0.8491657376289368, + "learning_rate": 9.964466898642966e-06, + "loss": 0.854, + "step": 1476 + }, + { + "epoch": 0.08129231107931091, + "grad_norm": 0.8968605399131775, + "learning_rate": 9.964415294297247e-06, + "loss": 0.8914, + "step": 1477 + }, + { + "epoch": 0.08134734988166657, + "grad_norm": 0.8692505359649658, + "learning_rate": 9.964363652640313e-06, + "loss": 0.9245, + "step": 1478 + }, + { + "epoch": 0.08140238868402223, + "grad_norm": 0.8916530013084412, + "learning_rate": 9.964311973672549e-06, + "loss": 0.7662, + "step": 1479 + }, + { + "epoch": 0.0814574274863779, + "grad_norm": 0.8239215612411499, + "learning_rate": 9.964260257394347e-06, + "loss": 0.9191, + "step": 1480 + }, + { + "epoch": 0.08151246628873356, + "grad_norm": 0.8672100901603699, + "learning_rate": 9.964208503806092e-06, + "loss": 0.7656, + "step": 1481 + }, + { + "epoch": 0.08156750509108922, + "grad_norm": 0.9195712208747864, + "learning_rate": 9.964156712908177e-06, + "loss": 0.8656, + "step": 1482 + }, + { + "epoch": 0.08162254389344488, + "grad_norm": 0.8282535672187805, + "learning_rate": 9.964104884700986e-06, + "loss": 0.8264, + "step": 1483 + }, + { + "epoch": 0.08167758269580054, + "grad_norm": 0.8492032289505005, + "learning_rate": 9.964053019184913e-06, + "loss": 0.7816, + "step": 1484 + }, + { + "epoch": 0.0817326214981562, + "grad_norm": 0.8491117358207703, + "learning_rate": 9.964001116360347e-06, + "loss": 0.7885, + "step": 1485 + }, + { + "epoch": 0.08178766030051186, + "grad_norm": 0.9415153861045837, + "learning_rate": 9.963949176227677e-06, + "loss": 0.8165, + "step": 1486 + }, + { + "epoch": 0.08184269910286752, + "grad_norm": 0.8462526202201843, + "learning_rate": 9.963897198787294e-06, + "loss": 0.8498, + "step": 1487 + }, + { + "epoch": 0.08189773790522319, + "grad_norm": 0.8591959476470947, + "learning_rate": 9.963845184039586e-06, + "loss": 0.8906, + "step": 1488 + }, + { + "epoch": 0.08195277670757885, + "grad_norm": 0.840761661529541, + "learning_rate": 9.963793131984949e-06, + "loss": 0.7831, + "step": 1489 + }, + { + "epoch": 0.08200781550993451, + "grad_norm": 0.931404173374176, + "learning_rate": 9.96374104262377e-06, + "loss": 0.889, + "step": 1490 + }, + { + "epoch": 0.08206285431229017, + "grad_norm": 0.9048783779144287, + "learning_rate": 9.963688915956443e-06, + "loss": 0.8321, + "step": 1491 + }, + { + "epoch": 0.08211789311464583, + "grad_norm": 0.9145931601524353, + "learning_rate": 9.96363675198336e-06, + "loss": 0.9918, + "step": 1492 + }, + { + "epoch": 0.08217293191700148, + "grad_norm": 0.9256643652915955, + "learning_rate": 9.963584550704908e-06, + "loss": 0.8731, + "step": 1493 + }, + { + "epoch": 0.08222797071935714, + "grad_norm": 1.0212007761001587, + "learning_rate": 9.963532312121486e-06, + "loss": 0.9077, + "step": 1494 + }, + { + "epoch": 0.0822830095217128, + "grad_norm": 0.9206242561340332, + "learning_rate": 9.963480036233483e-06, + "loss": 0.9076, + "step": 1495 + }, + { + "epoch": 0.08233804832406846, + "grad_norm": 0.8846865296363831, + "learning_rate": 9.963427723041294e-06, + "loss": 0.6826, + "step": 1496 + }, + { + "epoch": 0.08239308712642412, + "grad_norm": 0.8745351433753967, + "learning_rate": 9.963375372545309e-06, + "loss": 0.7935, + "step": 1497 + }, + { + "epoch": 0.08244812592877979, + "grad_norm": 0.9019666314125061, + "learning_rate": 9.963322984745924e-06, + "loss": 0.8435, + "step": 1498 + }, + { + "epoch": 0.08250316473113545, + "grad_norm": 0.8586859703063965, + "learning_rate": 9.963270559643531e-06, + "loss": 0.8118, + "step": 1499 + }, + { + "epoch": 0.08255820353349111, + "grad_norm": 0.9192817807197571, + "learning_rate": 9.963218097238528e-06, + "loss": 0.824, + "step": 1500 + }, + { + "epoch": 0.08261324233584677, + "grad_norm": 0.8972243070602417, + "learning_rate": 9.963165597531304e-06, + "loss": 0.8404, + "step": 1501 + }, + { + "epoch": 0.08266828113820243, + "grad_norm": 0.8953961133956909, + "learning_rate": 9.963113060522256e-06, + "loss": 0.9031, + "step": 1502 + }, + { + "epoch": 0.0827233199405581, + "grad_norm": 0.9551270604133606, + "learning_rate": 9.963060486211779e-06, + "loss": 0.9177, + "step": 1503 + }, + { + "epoch": 0.08277835874291375, + "grad_norm": 0.8524616956710815, + "learning_rate": 9.963007874600268e-06, + "loss": 0.8582, + "step": 1504 + }, + { + "epoch": 0.08283339754526942, + "grad_norm": 0.8148764371871948, + "learning_rate": 9.962955225688118e-06, + "loss": 0.6859, + "step": 1505 + }, + { + "epoch": 0.08288843634762508, + "grad_norm": 0.9110590219497681, + "learning_rate": 9.962902539475728e-06, + "loss": 0.7189, + "step": 1506 + }, + { + "epoch": 0.08294347514998074, + "grad_norm": 0.8700116872787476, + "learning_rate": 9.962849815963487e-06, + "loss": 0.9462, + "step": 1507 + }, + { + "epoch": 0.0829985139523364, + "grad_norm": 0.877109706401825, + "learning_rate": 9.962797055151797e-06, + "loss": 0.8138, + "step": 1508 + }, + { + "epoch": 0.08305355275469206, + "grad_norm": 0.7818365097045898, + "learning_rate": 9.962744257041053e-06, + "loss": 0.8474, + "step": 1509 + }, + { + "epoch": 0.08310859155704772, + "grad_norm": 0.88360196352005, + "learning_rate": 9.96269142163165e-06, + "loss": 0.8724, + "step": 1510 + }, + { + "epoch": 0.08316363035940338, + "grad_norm": 0.8982682228088379, + "learning_rate": 9.962638548923988e-06, + "loss": 0.9687, + "step": 1511 + }, + { + "epoch": 0.08321866916175905, + "grad_norm": 0.7362002730369568, + "learning_rate": 9.962585638918462e-06, + "loss": 0.7666, + "step": 1512 + }, + { + "epoch": 0.08327370796411471, + "grad_norm": 1.0993375778198242, + "learning_rate": 9.962532691615472e-06, + "loss": 0.8869, + "step": 1513 + }, + { + "epoch": 0.08332874676647037, + "grad_norm": 0.8684842586517334, + "learning_rate": 9.962479707015415e-06, + "loss": 0.872, + "step": 1514 + }, + { + "epoch": 0.08338378556882602, + "grad_norm": 1.0598478317260742, + "learning_rate": 9.962426685118689e-06, + "loss": 0.9102, + "step": 1515 + }, + { + "epoch": 0.08343882437118168, + "grad_norm": 0.8492125272750854, + "learning_rate": 9.96237362592569e-06, + "loss": 0.7554, + "step": 1516 + }, + { + "epoch": 0.08349386317353734, + "grad_norm": 0.8489052653312683, + "learning_rate": 9.962320529436821e-06, + "loss": 0.9139, + "step": 1517 + }, + { + "epoch": 0.083548901975893, + "grad_norm": 0.8650774359703064, + "learning_rate": 9.962267395652479e-06, + "loss": 0.8717, + "step": 1518 + }, + { + "epoch": 0.08360394077824866, + "grad_norm": 0.8393206596374512, + "learning_rate": 9.962214224573064e-06, + "loss": 0.8256, + "step": 1519 + }, + { + "epoch": 0.08365897958060432, + "grad_norm": 0.8304896354675293, + "learning_rate": 9.962161016198974e-06, + "loss": 0.8232, + "step": 1520 + }, + { + "epoch": 0.08371401838295998, + "grad_norm": 0.8718386292457581, + "learning_rate": 9.962107770530612e-06, + "loss": 0.8206, + "step": 1521 + }, + { + "epoch": 0.08376905718531565, + "grad_norm": 0.9109341502189636, + "learning_rate": 9.962054487568373e-06, + "loss": 0.9576, + "step": 1522 + }, + { + "epoch": 0.08382409598767131, + "grad_norm": 0.9543303847312927, + "learning_rate": 9.962001167312663e-06, + "loss": 0.8816, + "step": 1523 + }, + { + "epoch": 0.08387913479002697, + "grad_norm": 0.9992844462394714, + "learning_rate": 9.961947809763881e-06, + "loss": 0.8682, + "step": 1524 + }, + { + "epoch": 0.08393417359238263, + "grad_norm": 0.8092770576477051, + "learning_rate": 9.961894414922425e-06, + "loss": 0.6352, + "step": 1525 + }, + { + "epoch": 0.08398921239473829, + "grad_norm": 0.9888653755187988, + "learning_rate": 9.961840982788703e-06, + "loss": 0.8721, + "step": 1526 + }, + { + "epoch": 0.08404425119709395, + "grad_norm": 1.0092703104019165, + "learning_rate": 9.961787513363108e-06, + "loss": 0.7776, + "step": 1527 + }, + { + "epoch": 0.08409928999944961, + "grad_norm": 0.8654646277427673, + "learning_rate": 9.961734006646049e-06, + "loss": 0.8835, + "step": 1528 + }, + { + "epoch": 0.08415432880180528, + "grad_norm": 0.7630153298377991, + "learning_rate": 9.961680462637924e-06, + "loss": 0.7501, + "step": 1529 + }, + { + "epoch": 0.08420936760416094, + "grad_norm": 1.1883158683776855, + "learning_rate": 9.961626881339138e-06, + "loss": 0.9476, + "step": 1530 + }, + { + "epoch": 0.0842644064065166, + "grad_norm": 0.8710927963256836, + "learning_rate": 9.96157326275009e-06, + "loss": 0.749, + "step": 1531 + }, + { + "epoch": 0.08431944520887226, + "grad_norm": 0.9500633478164673, + "learning_rate": 9.961519606871188e-06, + "loss": 0.8994, + "step": 1532 + }, + { + "epoch": 0.08437448401122792, + "grad_norm": 0.873257577419281, + "learning_rate": 9.961465913702833e-06, + "loss": 0.816, + "step": 1533 + }, + { + "epoch": 0.08442952281358358, + "grad_norm": 0.8007022142410278, + "learning_rate": 9.961412183245426e-06, + "loss": 0.787, + "step": 1534 + }, + { + "epoch": 0.08448456161593924, + "grad_norm": 0.8998435139656067, + "learning_rate": 9.961358415499374e-06, + "loss": 0.8741, + "step": 1535 + }, + { + "epoch": 0.08453960041829489, + "grad_norm": 0.9152502417564392, + "learning_rate": 9.961304610465081e-06, + "loss": 0.9749, + "step": 1536 + }, + { + "epoch": 0.08459463922065055, + "grad_norm": 0.8961958289146423, + "learning_rate": 9.961250768142949e-06, + "loss": 0.8683, + "step": 1537 + }, + { + "epoch": 0.08464967802300621, + "grad_norm": 0.8683995008468628, + "learning_rate": 9.961196888533387e-06, + "loss": 0.8347, + "step": 1538 + }, + { + "epoch": 0.08470471682536188, + "grad_norm": 0.835221529006958, + "learning_rate": 9.961142971636795e-06, + "loss": 0.8936, + "step": 1539 + }, + { + "epoch": 0.08475975562771754, + "grad_norm": 0.8666725158691406, + "learning_rate": 9.96108901745358e-06, + "loss": 0.7344, + "step": 1540 + }, + { + "epoch": 0.0848147944300732, + "grad_norm": 0.9509082436561584, + "learning_rate": 9.96103502598415e-06, + "loss": 0.8965, + "step": 1541 + }, + { + "epoch": 0.08486983323242886, + "grad_norm": 0.8134233951568604, + "learning_rate": 9.960980997228908e-06, + "loss": 0.797, + "step": 1542 + }, + { + "epoch": 0.08492487203478452, + "grad_norm": 1.0432242155075073, + "learning_rate": 9.96092693118826e-06, + "loss": 0.8754, + "step": 1543 + }, + { + "epoch": 0.08497991083714018, + "grad_norm": 0.9560218453407288, + "learning_rate": 9.960872827862613e-06, + "loss": 0.9238, + "step": 1544 + }, + { + "epoch": 0.08503494963949584, + "grad_norm": 0.8471649885177612, + "learning_rate": 9.960818687252374e-06, + "loss": 0.8622, + "step": 1545 + }, + { + "epoch": 0.0850899884418515, + "grad_norm": 1.2584747076034546, + "learning_rate": 9.960764509357951e-06, + "loss": 0.8007, + "step": 1546 + }, + { + "epoch": 0.08514502724420717, + "grad_norm": 0.8730618953704834, + "learning_rate": 9.960710294179748e-06, + "loss": 0.7412, + "step": 1547 + }, + { + "epoch": 0.08520006604656283, + "grad_norm": 0.8361592292785645, + "learning_rate": 9.960656041718176e-06, + "loss": 0.7018, + "step": 1548 + }, + { + "epoch": 0.08525510484891849, + "grad_norm": 0.8351722359657288, + "learning_rate": 9.96060175197364e-06, + "loss": 0.843, + "step": 1549 + }, + { + "epoch": 0.08531014365127415, + "grad_norm": 0.8665090203285217, + "learning_rate": 9.960547424946549e-06, + "loss": 0.8235, + "step": 1550 + }, + { + "epoch": 0.08536518245362981, + "grad_norm": 0.9254478812217712, + "learning_rate": 9.960493060637313e-06, + "loss": 0.8122, + "step": 1551 + }, + { + "epoch": 0.08542022125598547, + "grad_norm": 0.8712261319160461, + "learning_rate": 9.960438659046337e-06, + "loss": 0.823, + "step": 1552 + }, + { + "epoch": 0.08547526005834113, + "grad_norm": 0.9027207493782043, + "learning_rate": 9.960384220174033e-06, + "loss": 0.7964, + "step": 1553 + }, + { + "epoch": 0.0855302988606968, + "grad_norm": 0.854626476764679, + "learning_rate": 9.960329744020808e-06, + "loss": 0.755, + "step": 1554 + }, + { + "epoch": 0.08558533766305246, + "grad_norm": 0.9398048520088196, + "learning_rate": 9.960275230587073e-06, + "loss": 0.8607, + "step": 1555 + }, + { + "epoch": 0.08564037646540812, + "grad_norm": 1.008002758026123, + "learning_rate": 9.960220679873238e-06, + "loss": 0.9711, + "step": 1556 + }, + { + "epoch": 0.08569541526776378, + "grad_norm": 0.8999453783035278, + "learning_rate": 9.96016609187971e-06, + "loss": 0.8233, + "step": 1557 + }, + { + "epoch": 0.08575045407011943, + "grad_norm": 0.8912106156349182, + "learning_rate": 9.960111466606903e-06, + "loss": 0.8271, + "step": 1558 + }, + { + "epoch": 0.08580549287247509, + "grad_norm": 0.9269998073577881, + "learning_rate": 9.960056804055227e-06, + "loss": 0.7959, + "step": 1559 + }, + { + "epoch": 0.08586053167483075, + "grad_norm": 1.083815336227417, + "learning_rate": 9.96000210422509e-06, + "loss": 0.9436, + "step": 1560 + }, + { + "epoch": 0.08591557047718641, + "grad_norm": 0.8906280398368835, + "learning_rate": 9.959947367116905e-06, + "loss": 0.9317, + "step": 1561 + }, + { + "epoch": 0.08597060927954207, + "grad_norm": 1.211696743965149, + "learning_rate": 9.959892592731084e-06, + "loss": 0.9076, + "step": 1562 + }, + { + "epoch": 0.08602564808189773, + "grad_norm": 0.9050534963607788, + "learning_rate": 9.959837781068038e-06, + "loss": 0.8728, + "step": 1563 + }, + { + "epoch": 0.0860806868842534, + "grad_norm": 0.9384796619415283, + "learning_rate": 9.959782932128178e-06, + "loss": 0.9277, + "step": 1564 + }, + { + "epoch": 0.08613572568660906, + "grad_norm": 0.795844316482544, + "learning_rate": 9.959728045911915e-06, + "loss": 0.7666, + "step": 1565 + }, + { + "epoch": 0.08619076448896472, + "grad_norm": 0.925956666469574, + "learning_rate": 9.959673122419668e-06, + "loss": 0.815, + "step": 1566 + }, + { + "epoch": 0.08624580329132038, + "grad_norm": 0.898047924041748, + "learning_rate": 9.959618161651843e-06, + "loss": 0.8131, + "step": 1567 + }, + { + "epoch": 0.08630084209367604, + "grad_norm": 0.8656220436096191, + "learning_rate": 9.959563163608856e-06, + "loss": 0.9336, + "step": 1568 + }, + { + "epoch": 0.0863558808960317, + "grad_norm": 0.9184645414352417, + "learning_rate": 9.95950812829112e-06, + "loss": 0.9557, + "step": 1569 + }, + { + "epoch": 0.08641091969838736, + "grad_norm": 0.8607667684555054, + "learning_rate": 9.959453055699048e-06, + "loss": 0.8272, + "step": 1570 + }, + { + "epoch": 0.08646595850074303, + "grad_norm": 0.9561272263526917, + "learning_rate": 9.959397945833056e-06, + "loss": 0.8876, + "step": 1571 + }, + { + "epoch": 0.08652099730309869, + "grad_norm": 0.8562412261962891, + "learning_rate": 9.959342798693556e-06, + "loss": 0.8404, + "step": 1572 + }, + { + "epoch": 0.08657603610545435, + "grad_norm": 0.8924610614776611, + "learning_rate": 9.95928761428096e-06, + "loss": 0.8779, + "step": 1573 + }, + { + "epoch": 0.08663107490781001, + "grad_norm": 0.8343208432197571, + "learning_rate": 9.95923239259569e-06, + "loss": 0.8992, + "step": 1574 + }, + { + "epoch": 0.08668611371016567, + "grad_norm": 0.8835015296936035, + "learning_rate": 9.959177133638155e-06, + "loss": 1.0026, + "step": 1575 + }, + { + "epoch": 0.08674115251252133, + "grad_norm": 0.9540221095085144, + "learning_rate": 9.959121837408771e-06, + "loss": 0.8507, + "step": 1576 + }, + { + "epoch": 0.086796191314877, + "grad_norm": 1.087817668914795, + "learning_rate": 9.959066503907957e-06, + "loss": 0.8607, + "step": 1577 + }, + { + "epoch": 0.08685123011723266, + "grad_norm": 0.8072447180747986, + "learning_rate": 9.959011133136124e-06, + "loss": 0.882, + "step": 1578 + }, + { + "epoch": 0.0869062689195883, + "grad_norm": 0.7646876573562622, + "learning_rate": 9.958955725093694e-06, + "loss": 0.7653, + "step": 1579 + }, + { + "epoch": 0.08696130772194396, + "grad_norm": 0.8979537487030029, + "learning_rate": 9.958900279781078e-06, + "loss": 0.9033, + "step": 1580 + }, + { + "epoch": 0.08701634652429963, + "grad_norm": 0.9445611834526062, + "learning_rate": 9.958844797198696e-06, + "loss": 0.9423, + "step": 1581 + }, + { + "epoch": 0.08707138532665529, + "grad_norm": 0.8836671113967896, + "learning_rate": 9.958789277346963e-06, + "loss": 0.839, + "step": 1582 + }, + { + "epoch": 0.08712642412901095, + "grad_norm": 1.0333542823791504, + "learning_rate": 9.958733720226296e-06, + "loss": 0.9211, + "step": 1583 + }, + { + "epoch": 0.08718146293136661, + "grad_norm": 0.8084085583686829, + "learning_rate": 9.958678125837117e-06, + "loss": 0.8387, + "step": 1584 + }, + { + "epoch": 0.08723650173372227, + "grad_norm": 0.7769419550895691, + "learning_rate": 9.958622494179838e-06, + "loss": 0.8307, + "step": 1585 + }, + { + "epoch": 0.08729154053607793, + "grad_norm": 0.8387578129768372, + "learning_rate": 9.95856682525488e-06, + "loss": 0.8001, + "step": 1586 + }, + { + "epoch": 0.0873465793384336, + "grad_norm": 0.8989812731742859, + "learning_rate": 9.95851111906266e-06, + "loss": 0.7752, + "step": 1587 + }, + { + "epoch": 0.08740161814078926, + "grad_norm": 0.8558734655380249, + "learning_rate": 9.958455375603602e-06, + "loss": 0.8149, + "step": 1588 + }, + { + "epoch": 0.08745665694314492, + "grad_norm": 0.8890896439552307, + "learning_rate": 9.958399594878117e-06, + "loss": 0.8232, + "step": 1589 + }, + { + "epoch": 0.08751169574550058, + "grad_norm": 0.875912070274353, + "learning_rate": 9.95834377688663e-06, + "loss": 0.7458, + "step": 1590 + }, + { + "epoch": 0.08756673454785624, + "grad_norm": 0.808355987071991, + "learning_rate": 9.958287921629557e-06, + "loss": 0.8296, + "step": 1591 + }, + { + "epoch": 0.0876217733502119, + "grad_norm": 0.9637090563774109, + "learning_rate": 9.958232029107318e-06, + "loss": 0.8769, + "step": 1592 + }, + { + "epoch": 0.08767681215256756, + "grad_norm": 0.8980715870857239, + "learning_rate": 9.958176099320336e-06, + "loss": 0.7995, + "step": 1593 + }, + { + "epoch": 0.08773185095492322, + "grad_norm": 0.9369860291481018, + "learning_rate": 9.95812013226903e-06, + "loss": 0.8545, + "step": 1594 + }, + { + "epoch": 0.08778688975727889, + "grad_norm": 0.8589349389076233, + "learning_rate": 9.958064127953819e-06, + "loss": 0.8693, + "step": 1595 + }, + { + "epoch": 0.08784192855963455, + "grad_norm": 0.929207444190979, + "learning_rate": 9.958008086375126e-06, + "loss": 0.811, + "step": 1596 + }, + { + "epoch": 0.08789696736199021, + "grad_norm": 1.0825661420822144, + "learning_rate": 9.957952007533371e-06, + "loss": 1.0145, + "step": 1597 + }, + { + "epoch": 0.08795200616434587, + "grad_norm": 0.8818382024765015, + "learning_rate": 9.957895891428978e-06, + "loss": 0.7771, + "step": 1598 + }, + { + "epoch": 0.08800704496670153, + "grad_norm": 0.882780909538269, + "learning_rate": 9.957839738062363e-06, + "loss": 0.8857, + "step": 1599 + }, + { + "epoch": 0.08806208376905718, + "grad_norm": 0.9136924743652344, + "learning_rate": 9.957783547433955e-06, + "loss": 0.8873, + "step": 1600 + }, + { + "epoch": 0.08811712257141284, + "grad_norm": 0.8896858096122742, + "learning_rate": 9.95772731954417e-06, + "loss": 0.8463, + "step": 1601 + }, + { + "epoch": 0.0881721613737685, + "grad_norm": 0.8671631813049316, + "learning_rate": 9.957671054393436e-06, + "loss": 0.8333, + "step": 1602 + }, + { + "epoch": 0.08822720017612416, + "grad_norm": 0.9442896246910095, + "learning_rate": 9.957614751982172e-06, + "loss": 0.9676, + "step": 1603 + }, + { + "epoch": 0.08828223897847982, + "grad_norm": 0.8249240517616272, + "learning_rate": 9.957558412310803e-06, + "loss": 0.7746, + "step": 1604 + }, + { + "epoch": 0.08833727778083549, + "grad_norm": 0.8125253319740295, + "learning_rate": 9.957502035379751e-06, + "loss": 0.7816, + "step": 1605 + }, + { + "epoch": 0.08839231658319115, + "grad_norm": 0.8467233777046204, + "learning_rate": 9.957445621189442e-06, + "loss": 0.7697, + "step": 1606 + }, + { + "epoch": 0.08844735538554681, + "grad_norm": 0.8322175145149231, + "learning_rate": 9.957389169740299e-06, + "loss": 0.7561, + "step": 1607 + }, + { + "epoch": 0.08850239418790247, + "grad_norm": 0.869163453578949, + "learning_rate": 9.957332681032746e-06, + "loss": 0.8984, + "step": 1608 + }, + { + "epoch": 0.08855743299025813, + "grad_norm": 0.8755944967269897, + "learning_rate": 9.957276155067206e-06, + "loss": 0.8016, + "step": 1609 + }, + { + "epoch": 0.08861247179261379, + "grad_norm": 0.8152669668197632, + "learning_rate": 9.957219591844108e-06, + "loss": 0.7763, + "step": 1610 + }, + { + "epoch": 0.08866751059496945, + "grad_norm": 0.979752779006958, + "learning_rate": 9.957162991363871e-06, + "loss": 0.7755, + "step": 1611 + }, + { + "epoch": 0.08872254939732512, + "grad_norm": 1.0481054782867432, + "learning_rate": 9.957106353626926e-06, + "loss": 0.9395, + "step": 1612 + }, + { + "epoch": 0.08877758819968078, + "grad_norm": 0.7773686647415161, + "learning_rate": 9.957049678633697e-06, + "loss": 0.7713, + "step": 1613 + }, + { + "epoch": 0.08883262700203644, + "grad_norm": 0.838979959487915, + "learning_rate": 9.956992966384609e-06, + "loss": 0.7909, + "step": 1614 + }, + { + "epoch": 0.0888876658043921, + "grad_norm": 0.9527049660682678, + "learning_rate": 9.956936216880089e-06, + "loss": 0.7944, + "step": 1615 + }, + { + "epoch": 0.08894270460674776, + "grad_norm": 0.7967305183410645, + "learning_rate": 9.956879430120561e-06, + "loss": 0.7703, + "step": 1616 + }, + { + "epoch": 0.08899774340910342, + "grad_norm": 0.9065802097320557, + "learning_rate": 9.956822606106456e-06, + "loss": 0.8188, + "step": 1617 + }, + { + "epoch": 0.08905278221145908, + "grad_norm": 0.7329322099685669, + "learning_rate": 9.956765744838199e-06, + "loss": 0.8043, + "step": 1618 + }, + { + "epoch": 0.08910782101381474, + "grad_norm": 0.864973247051239, + "learning_rate": 9.95670884631622e-06, + "loss": 0.8334, + "step": 1619 + }, + { + "epoch": 0.0891628598161704, + "grad_norm": 1.073559045791626, + "learning_rate": 9.95665191054094e-06, + "loss": 0.7755, + "step": 1620 + }, + { + "epoch": 0.08921789861852607, + "grad_norm": 0.7347918748855591, + "learning_rate": 9.956594937512794e-06, + "loss": 0.7556, + "step": 1621 + }, + { + "epoch": 0.08927293742088172, + "grad_norm": 0.8756610751152039, + "learning_rate": 9.956537927232205e-06, + "loss": 0.8129, + "step": 1622 + }, + { + "epoch": 0.08932797622323738, + "grad_norm": 0.9132435917854309, + "learning_rate": 9.956480879699605e-06, + "loss": 0.8221, + "step": 1623 + }, + { + "epoch": 0.08938301502559304, + "grad_norm": 1.1978256702423096, + "learning_rate": 9.956423794915421e-06, + "loss": 0.8651, + "step": 1624 + }, + { + "epoch": 0.0894380538279487, + "grad_norm": 0.8493894934654236, + "learning_rate": 9.956366672880082e-06, + "loss": 0.7267, + "step": 1625 + }, + { + "epoch": 0.08949309263030436, + "grad_norm": 1.0971951484680176, + "learning_rate": 9.956309513594019e-06, + "loss": 0.7852, + "step": 1626 + }, + { + "epoch": 0.08954813143266002, + "grad_norm": 0.899974524974823, + "learning_rate": 9.95625231705766e-06, + "loss": 0.8868, + "step": 1627 + }, + { + "epoch": 0.08960317023501568, + "grad_norm": 0.8995566368103027, + "learning_rate": 9.956195083271436e-06, + "loss": 0.87, + "step": 1628 + }, + { + "epoch": 0.08965820903737134, + "grad_norm": 0.8924218416213989, + "learning_rate": 9.956137812235776e-06, + "loss": 0.7885, + "step": 1629 + }, + { + "epoch": 0.089713247839727, + "grad_norm": 0.9232820868492126, + "learning_rate": 9.956080503951108e-06, + "loss": 0.7923, + "step": 1630 + }, + { + "epoch": 0.08976828664208267, + "grad_norm": 0.9298982620239258, + "learning_rate": 9.956023158417869e-06, + "loss": 0.8625, + "step": 1631 + }, + { + "epoch": 0.08982332544443833, + "grad_norm": 0.86515212059021, + "learning_rate": 9.955965775636488e-06, + "loss": 0.7683, + "step": 1632 + }, + { + "epoch": 0.08987836424679399, + "grad_norm": 0.8016952276229858, + "learning_rate": 9.955908355607392e-06, + "loss": 0.8122, + "step": 1633 + }, + { + "epoch": 0.08993340304914965, + "grad_norm": 0.842703640460968, + "learning_rate": 9.955850898331015e-06, + "loss": 0.8487, + "step": 1634 + }, + { + "epoch": 0.08998844185150531, + "grad_norm": 0.8239083886146545, + "learning_rate": 9.95579340380779e-06, + "loss": 0.8701, + "step": 1635 + }, + { + "epoch": 0.09004348065386097, + "grad_norm": 0.8575418591499329, + "learning_rate": 9.955735872038149e-06, + "loss": 0.8263, + "step": 1636 + }, + { + "epoch": 0.09009851945621664, + "grad_norm": 0.8884586095809937, + "learning_rate": 9.955678303022522e-06, + "loss": 0.8112, + "step": 1637 + }, + { + "epoch": 0.0901535582585723, + "grad_norm": 0.9024681448936462, + "learning_rate": 9.955620696761345e-06, + "loss": 0.9174, + "step": 1638 + }, + { + "epoch": 0.09020859706092796, + "grad_norm": 0.8151944875717163, + "learning_rate": 9.955563053255049e-06, + "loss": 0.806, + "step": 1639 + }, + { + "epoch": 0.09026363586328362, + "grad_norm": 0.8292184472084045, + "learning_rate": 9.955505372504069e-06, + "loss": 0.8007, + "step": 1640 + }, + { + "epoch": 0.09031867466563928, + "grad_norm": 0.9445936679840088, + "learning_rate": 9.955447654508835e-06, + "loss": 0.7089, + "step": 1641 + }, + { + "epoch": 0.09037371346799494, + "grad_norm": 0.781579315662384, + "learning_rate": 9.955389899269782e-06, + "loss": 0.8224, + "step": 1642 + }, + { + "epoch": 0.09042875227035059, + "grad_norm": 0.9028880596160889, + "learning_rate": 9.955332106787348e-06, + "loss": 0.7976, + "step": 1643 + }, + { + "epoch": 0.09048379107270625, + "grad_norm": 1.0336887836456299, + "learning_rate": 9.955274277061963e-06, + "loss": 0.9296, + "step": 1644 + }, + { + "epoch": 0.09053882987506191, + "grad_norm": 0.8894197940826416, + "learning_rate": 9.955216410094062e-06, + "loss": 0.815, + "step": 1645 + }, + { + "epoch": 0.09059386867741757, + "grad_norm": 0.8955528140068054, + "learning_rate": 9.955158505884083e-06, + "loss": 0.8707, + "step": 1646 + }, + { + "epoch": 0.09064890747977324, + "grad_norm": 0.8012683987617493, + "learning_rate": 9.955100564432458e-06, + "loss": 0.7467, + "step": 1647 + }, + { + "epoch": 0.0907039462821289, + "grad_norm": 0.917969286441803, + "learning_rate": 9.955042585739623e-06, + "loss": 0.8835, + "step": 1648 + }, + { + "epoch": 0.09075898508448456, + "grad_norm": 0.8066666722297668, + "learning_rate": 9.954984569806014e-06, + "loss": 0.8338, + "step": 1649 + }, + { + "epoch": 0.09081402388684022, + "grad_norm": 1.1324070692062378, + "learning_rate": 9.954926516632069e-06, + "loss": 0.8245, + "step": 1650 + }, + { + "epoch": 0.09086906268919588, + "grad_norm": 0.8196014761924744, + "learning_rate": 9.954868426218222e-06, + "loss": 0.7897, + "step": 1651 + }, + { + "epoch": 0.09092410149155154, + "grad_norm": 0.8713478446006775, + "learning_rate": 9.95481029856491e-06, + "loss": 0.891, + "step": 1652 + }, + { + "epoch": 0.0909791402939072, + "grad_norm": 0.8489059805870056, + "learning_rate": 9.954752133672569e-06, + "loss": 0.7748, + "step": 1653 + }, + { + "epoch": 0.09103417909626287, + "grad_norm": 0.8914602994918823, + "learning_rate": 9.954693931541638e-06, + "loss": 0.8657, + "step": 1654 + }, + { + "epoch": 0.09108921789861853, + "grad_norm": 0.9031614661216736, + "learning_rate": 9.954635692172555e-06, + "loss": 0.7409, + "step": 1655 + }, + { + "epoch": 0.09114425670097419, + "grad_norm": 0.8680000305175781, + "learning_rate": 9.954577415565756e-06, + "loss": 0.8535, + "step": 1656 + }, + { + "epoch": 0.09119929550332985, + "grad_norm": 0.830596923828125, + "learning_rate": 9.954519101721679e-06, + "loss": 0.8601, + "step": 1657 + }, + { + "epoch": 0.09125433430568551, + "grad_norm": 0.9041332602500916, + "learning_rate": 9.954460750640762e-06, + "loss": 0.9104, + "step": 1658 + }, + { + "epoch": 0.09130937310804117, + "grad_norm": 0.7786296606063843, + "learning_rate": 9.954402362323445e-06, + "loss": 0.7671, + "step": 1659 + }, + { + "epoch": 0.09136441191039683, + "grad_norm": 1.0363564491271973, + "learning_rate": 9.954343936770165e-06, + "loss": 0.9339, + "step": 1660 + }, + { + "epoch": 0.0914194507127525, + "grad_norm": 0.8049986958503723, + "learning_rate": 9.954285473981363e-06, + "loss": 0.8125, + "step": 1661 + }, + { + "epoch": 0.09147448951510816, + "grad_norm": 0.7842011451721191, + "learning_rate": 9.954226973957477e-06, + "loss": 0.7153, + "step": 1662 + }, + { + "epoch": 0.09152952831746382, + "grad_norm": 0.8929729461669922, + "learning_rate": 9.954168436698948e-06, + "loss": 0.9563, + "step": 1663 + }, + { + "epoch": 0.09158456711981948, + "grad_norm": 0.8850226402282715, + "learning_rate": 9.954109862206216e-06, + "loss": 0.8257, + "step": 1664 + }, + { + "epoch": 0.09163960592217513, + "grad_norm": 0.8673348426818848, + "learning_rate": 9.954051250479719e-06, + "loss": 0.9489, + "step": 1665 + }, + { + "epoch": 0.09169464472453079, + "grad_norm": 0.8726119995117188, + "learning_rate": 9.9539926015199e-06, + "loss": 0.8222, + "step": 1666 + }, + { + "epoch": 0.09174968352688645, + "grad_norm": 0.7609312534332275, + "learning_rate": 9.953933915327196e-06, + "loss": 0.7749, + "step": 1667 + }, + { + "epoch": 0.09180472232924211, + "grad_norm": 0.857404887676239, + "learning_rate": 9.953875191902055e-06, + "loss": 0.8496, + "step": 1668 + }, + { + "epoch": 0.09185976113159777, + "grad_norm": 0.7835526466369629, + "learning_rate": 9.953816431244909e-06, + "loss": 0.7258, + "step": 1669 + }, + { + "epoch": 0.09191479993395343, + "grad_norm": 0.944984495639801, + "learning_rate": 9.95375763335621e-06, + "loss": 0.902, + "step": 1670 + }, + { + "epoch": 0.0919698387363091, + "grad_norm": 0.9038936495780945, + "learning_rate": 9.953698798236391e-06, + "loss": 0.7559, + "step": 1671 + }, + { + "epoch": 0.09202487753866476, + "grad_norm": 0.8450848460197449, + "learning_rate": 9.953639925885898e-06, + "loss": 0.8338, + "step": 1672 + }, + { + "epoch": 0.09207991634102042, + "grad_norm": 0.827419102191925, + "learning_rate": 9.953581016305175e-06, + "loss": 0.8167, + "step": 1673 + }, + { + "epoch": 0.09213495514337608, + "grad_norm": 0.8517075777053833, + "learning_rate": 9.953522069494663e-06, + "loss": 0.8681, + "step": 1674 + }, + { + "epoch": 0.09218999394573174, + "grad_norm": 0.9504323601722717, + "learning_rate": 9.953463085454804e-06, + "loss": 0.8688, + "step": 1675 + }, + { + "epoch": 0.0922450327480874, + "grad_norm": 0.8905719518661499, + "learning_rate": 9.953404064186044e-06, + "loss": 0.8818, + "step": 1676 + }, + { + "epoch": 0.09230007155044306, + "grad_norm": 0.9223340153694153, + "learning_rate": 9.953345005688822e-06, + "loss": 0.8752, + "step": 1677 + }, + { + "epoch": 0.09235511035279872, + "grad_norm": 1.0500547885894775, + "learning_rate": 9.953285909963588e-06, + "loss": 0.7816, + "step": 1678 + }, + { + "epoch": 0.09241014915515439, + "grad_norm": 0.8407441973686218, + "learning_rate": 9.953226777010781e-06, + "loss": 0.745, + "step": 1679 + }, + { + "epoch": 0.09246518795751005, + "grad_norm": 0.7997288107872009, + "learning_rate": 9.953167606830847e-06, + "loss": 0.8171, + "step": 1680 + }, + { + "epoch": 0.09252022675986571, + "grad_norm": 0.9752318859100342, + "learning_rate": 9.953108399424234e-06, + "loss": 0.8719, + "step": 1681 + }, + { + "epoch": 0.09257526556222137, + "grad_norm": 0.8524298667907715, + "learning_rate": 9.953049154791382e-06, + "loss": 0.8257, + "step": 1682 + }, + { + "epoch": 0.09263030436457703, + "grad_norm": 0.9460529088973999, + "learning_rate": 9.952989872932739e-06, + "loss": 0.7278, + "step": 1683 + }, + { + "epoch": 0.0926853431669327, + "grad_norm": 0.8959575891494751, + "learning_rate": 9.95293055384875e-06, + "loss": 0.903, + "step": 1684 + }, + { + "epoch": 0.09274038196928835, + "grad_norm": 0.8764386177062988, + "learning_rate": 9.95287119753986e-06, + "loss": 0.7958, + "step": 1685 + }, + { + "epoch": 0.092795420771644, + "grad_norm": 0.9611337184906006, + "learning_rate": 9.952811804006517e-06, + "loss": 0.8726, + "step": 1686 + }, + { + "epoch": 0.09285045957399966, + "grad_norm": 0.8155574202537537, + "learning_rate": 9.952752373249165e-06, + "loss": 0.7882, + "step": 1687 + }, + { + "epoch": 0.09290549837635532, + "grad_norm": 0.8789697289466858, + "learning_rate": 9.952692905268253e-06, + "loss": 0.8642, + "step": 1688 + }, + { + "epoch": 0.09296053717871099, + "grad_norm": 0.7910027503967285, + "learning_rate": 9.952633400064227e-06, + "loss": 0.7852, + "step": 1689 + }, + { + "epoch": 0.09301557598106665, + "grad_norm": 0.815819501876831, + "learning_rate": 9.952573857637533e-06, + "loss": 0.8606, + "step": 1690 + }, + { + "epoch": 0.09307061478342231, + "grad_norm": 0.9840701818466187, + "learning_rate": 9.95251427798862e-06, + "loss": 0.9349, + "step": 1691 + }, + { + "epoch": 0.09312565358577797, + "grad_norm": 0.8715788722038269, + "learning_rate": 9.952454661117936e-06, + "loss": 0.813, + "step": 1692 + }, + { + "epoch": 0.09318069238813363, + "grad_norm": 0.8287779092788696, + "learning_rate": 9.952395007025926e-06, + "loss": 0.8346, + "step": 1693 + }, + { + "epoch": 0.0932357311904893, + "grad_norm": 0.9375059008598328, + "learning_rate": 9.952335315713044e-06, + "loss": 0.8868, + "step": 1694 + }, + { + "epoch": 0.09329076999284495, + "grad_norm": 0.9063667058944702, + "learning_rate": 9.952275587179734e-06, + "loss": 0.9562, + "step": 1695 + }, + { + "epoch": 0.09334580879520062, + "grad_norm": 0.816643476486206, + "learning_rate": 9.952215821426447e-06, + "loss": 0.7456, + "step": 1696 + }, + { + "epoch": 0.09340084759755628, + "grad_norm": 0.9004347324371338, + "learning_rate": 9.95215601845363e-06, + "loss": 0.8545, + "step": 1697 + }, + { + "epoch": 0.09345588639991194, + "grad_norm": 0.919195830821991, + "learning_rate": 9.952096178261736e-06, + "loss": 0.9347, + "step": 1698 + }, + { + "epoch": 0.0935109252022676, + "grad_norm": 0.8313261866569519, + "learning_rate": 9.952036300851211e-06, + "loss": 0.9169, + "step": 1699 + }, + { + "epoch": 0.09356596400462326, + "grad_norm": 0.8674910664558411, + "learning_rate": 9.951976386222507e-06, + "loss": 0.7621, + "step": 1700 + }, + { + "epoch": 0.09362100280697892, + "grad_norm": 0.8931052684783936, + "learning_rate": 9.951916434376074e-06, + "loss": 0.8702, + "step": 1701 + }, + { + "epoch": 0.09367604160933458, + "grad_norm": 0.8748393058776855, + "learning_rate": 9.951856445312364e-06, + "loss": 0.7446, + "step": 1702 + }, + { + "epoch": 0.09373108041169025, + "grad_norm": 1.005459189414978, + "learning_rate": 9.951796419031825e-06, + "loss": 0.9843, + "step": 1703 + }, + { + "epoch": 0.09378611921404591, + "grad_norm": 1.0155184268951416, + "learning_rate": 9.95173635553491e-06, + "loss": 0.8868, + "step": 1704 + }, + { + "epoch": 0.09384115801640157, + "grad_norm": 2.1387271881103516, + "learning_rate": 9.951676254822072e-06, + "loss": 0.8691, + "step": 1705 + }, + { + "epoch": 0.09389619681875723, + "grad_norm": 0.9768403768539429, + "learning_rate": 9.951616116893757e-06, + "loss": 0.8409, + "step": 1706 + }, + { + "epoch": 0.09395123562111289, + "grad_norm": 0.7994607090950012, + "learning_rate": 9.951555941750424e-06, + "loss": 0.7836, + "step": 1707 + }, + { + "epoch": 0.09400627442346854, + "grad_norm": 0.8460201025009155, + "learning_rate": 9.95149572939252e-06, + "loss": 0.8216, + "step": 1708 + }, + { + "epoch": 0.0940613132258242, + "grad_norm": 0.8904135227203369, + "learning_rate": 9.951435479820499e-06, + "loss": 0.9053, + "step": 1709 + }, + { + "epoch": 0.09411635202817986, + "grad_norm": 0.9084494113922119, + "learning_rate": 9.951375193034815e-06, + "loss": 0.9308, + "step": 1710 + }, + { + "epoch": 0.09417139083053552, + "grad_norm": 1.0826482772827148, + "learning_rate": 9.951314869035921e-06, + "loss": 0.8468, + "step": 1711 + }, + { + "epoch": 0.09422642963289118, + "grad_norm": 0.8068915009498596, + "learning_rate": 9.95125450782427e-06, + "loss": 0.8253, + "step": 1712 + }, + { + "epoch": 0.09428146843524685, + "grad_norm": 0.8445400595664978, + "learning_rate": 9.951194109400316e-06, + "loss": 0.8386, + "step": 1713 + }, + { + "epoch": 0.09433650723760251, + "grad_norm": 0.8180645704269409, + "learning_rate": 9.951133673764513e-06, + "loss": 0.7907, + "step": 1714 + }, + { + "epoch": 0.09439154603995817, + "grad_norm": 0.8111036419868469, + "learning_rate": 9.951073200917311e-06, + "loss": 0.7918, + "step": 1715 + }, + { + "epoch": 0.09444658484231383, + "grad_norm": 0.862042248249054, + "learning_rate": 9.951012690859172e-06, + "loss": 0.783, + "step": 1716 + }, + { + "epoch": 0.09450162364466949, + "grad_norm": 0.8189615607261658, + "learning_rate": 9.950952143590544e-06, + "loss": 0.8192, + "step": 1717 + }, + { + "epoch": 0.09455666244702515, + "grad_norm": 0.9714062809944153, + "learning_rate": 9.950891559111887e-06, + "loss": 0.774, + "step": 1718 + }, + { + "epoch": 0.09461170124938081, + "grad_norm": 0.9691846370697021, + "learning_rate": 9.950830937423655e-06, + "loss": 0.8347, + "step": 1719 + }, + { + "epoch": 0.09466674005173648, + "grad_norm": 0.8488250970840454, + "learning_rate": 9.950770278526301e-06, + "loss": 0.8228, + "step": 1720 + }, + { + "epoch": 0.09472177885409214, + "grad_norm": 0.8638359904289246, + "learning_rate": 9.950709582420282e-06, + "loss": 0.8973, + "step": 1721 + }, + { + "epoch": 0.0947768176564478, + "grad_norm": 1.0148643255233765, + "learning_rate": 9.950648849106058e-06, + "loss": 0.9638, + "step": 1722 + }, + { + "epoch": 0.09483185645880346, + "grad_norm": 0.8870131969451904, + "learning_rate": 9.95058807858408e-06, + "loss": 0.8259, + "step": 1723 + }, + { + "epoch": 0.09488689526115912, + "grad_norm": 0.9134769439697266, + "learning_rate": 9.950527270854807e-06, + "loss": 0.865, + "step": 1724 + }, + { + "epoch": 0.09494193406351478, + "grad_norm": 0.7221654653549194, + "learning_rate": 9.950466425918697e-06, + "loss": 0.7593, + "step": 1725 + }, + { + "epoch": 0.09499697286587044, + "grad_norm": 0.9386674165725708, + "learning_rate": 9.950405543776207e-06, + "loss": 0.9508, + "step": 1726 + }, + { + "epoch": 0.0950520116682261, + "grad_norm": 0.7850627899169922, + "learning_rate": 9.950344624427795e-06, + "loss": 0.7999, + "step": 1727 + }, + { + "epoch": 0.09510705047058177, + "grad_norm": 0.921198308467865, + "learning_rate": 9.950283667873916e-06, + "loss": 0.8249, + "step": 1728 + }, + { + "epoch": 0.09516208927293741, + "grad_norm": 0.9503389000892639, + "learning_rate": 9.95022267411503e-06, + "loss": 0.901, + "step": 1729 + }, + { + "epoch": 0.09521712807529308, + "grad_norm": 0.7977343201637268, + "learning_rate": 9.950161643151597e-06, + "loss": 0.838, + "step": 1730 + }, + { + "epoch": 0.09527216687764874, + "grad_norm": 0.9056238532066345, + "learning_rate": 9.950100574984072e-06, + "loss": 0.9756, + "step": 1731 + }, + { + "epoch": 0.0953272056800044, + "grad_norm": 0.8092935681343079, + "learning_rate": 9.950039469612918e-06, + "loss": 0.8812, + "step": 1732 + }, + { + "epoch": 0.09538224448236006, + "grad_norm": 0.823693573474884, + "learning_rate": 9.949978327038592e-06, + "loss": 0.7914, + "step": 1733 + }, + { + "epoch": 0.09543728328471572, + "grad_norm": 0.9114876389503479, + "learning_rate": 9.949917147261554e-06, + "loss": 0.7944, + "step": 1734 + }, + { + "epoch": 0.09549232208707138, + "grad_norm": 1.0084123611450195, + "learning_rate": 9.949855930282262e-06, + "loss": 0.8544, + "step": 1735 + }, + { + "epoch": 0.09554736088942704, + "grad_norm": 0.842462956905365, + "learning_rate": 9.949794676101181e-06, + "loss": 0.7056, + "step": 1736 + }, + { + "epoch": 0.0956023996917827, + "grad_norm": 1.00497305393219, + "learning_rate": 9.949733384718766e-06, + "loss": 0.8372, + "step": 1737 + }, + { + "epoch": 0.09565743849413837, + "grad_norm": 1.0166410207748413, + "learning_rate": 9.94967205613548e-06, + "loss": 0.9316, + "step": 1738 + }, + { + "epoch": 0.09571247729649403, + "grad_norm": 0.8520192503929138, + "learning_rate": 9.949610690351784e-06, + "loss": 0.786, + "step": 1739 + }, + { + "epoch": 0.09576751609884969, + "grad_norm": 0.8003227114677429, + "learning_rate": 9.949549287368139e-06, + "loss": 0.8003, + "step": 1740 + }, + { + "epoch": 0.09582255490120535, + "grad_norm": 0.8657151460647583, + "learning_rate": 9.949487847185006e-06, + "loss": 0.8407, + "step": 1741 + }, + { + "epoch": 0.09587759370356101, + "grad_norm": 1.1119858026504517, + "learning_rate": 9.949426369802848e-06, + "loss": 0.8594, + "step": 1742 + }, + { + "epoch": 0.09593263250591667, + "grad_norm": 0.8968474864959717, + "learning_rate": 9.949364855222126e-06, + "loss": 0.8254, + "step": 1743 + }, + { + "epoch": 0.09598767130827233, + "grad_norm": 0.8740531206130981, + "learning_rate": 9.949303303443304e-06, + "loss": 0.8748, + "step": 1744 + }, + { + "epoch": 0.096042710110628, + "grad_norm": 0.8833459615707397, + "learning_rate": 9.94924171446684e-06, + "loss": 0.838, + "step": 1745 + }, + { + "epoch": 0.09609774891298366, + "grad_norm": 0.8783486485481262, + "learning_rate": 9.949180088293201e-06, + "loss": 0.7972, + "step": 1746 + }, + { + "epoch": 0.09615278771533932, + "grad_norm": 0.9197877049446106, + "learning_rate": 9.949118424922852e-06, + "loss": 0.8669, + "step": 1747 + }, + { + "epoch": 0.09620782651769498, + "grad_norm": 0.9771283864974976, + "learning_rate": 9.949056724356251e-06, + "loss": 0.8461, + "step": 1748 + }, + { + "epoch": 0.09626286532005064, + "grad_norm": 0.8325022459030151, + "learning_rate": 9.948994986593864e-06, + "loss": 0.8482, + "step": 1749 + }, + { + "epoch": 0.0963179041224063, + "grad_norm": 0.9732363224029541, + "learning_rate": 9.948933211636158e-06, + "loss": 0.8825, + "step": 1750 + }, + { + "epoch": 0.09637294292476195, + "grad_norm": 0.8229798078536987, + "learning_rate": 9.948871399483592e-06, + "loss": 0.8079, + "step": 1751 + }, + { + "epoch": 0.09642798172711761, + "grad_norm": 0.8861554265022278, + "learning_rate": 9.948809550136635e-06, + "loss": 0.8323, + "step": 1752 + }, + { + "epoch": 0.09648302052947327, + "grad_norm": 1.0618904829025269, + "learning_rate": 9.94874766359575e-06, + "loss": 0.8519, + "step": 1753 + }, + { + "epoch": 0.09653805933182893, + "grad_norm": 0.8494864702224731, + "learning_rate": 9.948685739861403e-06, + "loss": 0.961, + "step": 1754 + }, + { + "epoch": 0.0965930981341846, + "grad_norm": 0.8872213959693909, + "learning_rate": 9.948623778934058e-06, + "loss": 0.9367, + "step": 1755 + }, + { + "epoch": 0.09664813693654026, + "grad_norm": 0.8441230058670044, + "learning_rate": 9.948561780814181e-06, + "loss": 0.7654, + "step": 1756 + }, + { + "epoch": 0.09670317573889592, + "grad_norm": 0.8072223663330078, + "learning_rate": 9.948499745502239e-06, + "loss": 0.7894, + "step": 1757 + }, + { + "epoch": 0.09675821454125158, + "grad_norm": 0.8285261392593384, + "learning_rate": 9.948437672998696e-06, + "loss": 0.8351, + "step": 1758 + }, + { + "epoch": 0.09681325334360724, + "grad_norm": 0.9272124767303467, + "learning_rate": 9.94837556330402e-06, + "loss": 0.8708, + "step": 1759 + }, + { + "epoch": 0.0968682921459629, + "grad_norm": 0.8689375519752502, + "learning_rate": 9.94831341641868e-06, + "loss": 0.8478, + "step": 1760 + }, + { + "epoch": 0.09692333094831856, + "grad_norm": 1.040784239768982, + "learning_rate": 9.94825123234314e-06, + "loss": 0.8915, + "step": 1761 + }, + { + "epoch": 0.09697836975067423, + "grad_norm": 0.7819718718528748, + "learning_rate": 9.948189011077867e-06, + "loss": 0.7728, + "step": 1762 + }, + { + "epoch": 0.09703340855302989, + "grad_norm": 0.7959379553794861, + "learning_rate": 9.948126752623331e-06, + "loss": 0.8248, + "step": 1763 + }, + { + "epoch": 0.09708844735538555, + "grad_norm": 0.8844753503799438, + "learning_rate": 9.94806445698e-06, + "loss": 0.7742, + "step": 1764 + }, + { + "epoch": 0.09714348615774121, + "grad_norm": 0.9168505668640137, + "learning_rate": 9.948002124148339e-06, + "loss": 0.9145, + "step": 1765 + }, + { + "epoch": 0.09719852496009687, + "grad_norm": 0.7199662923812866, + "learning_rate": 9.947939754128819e-06, + "loss": 0.6652, + "step": 1766 + }, + { + "epoch": 0.09725356376245253, + "grad_norm": 0.866470992565155, + "learning_rate": 9.947877346921909e-06, + "loss": 0.8293, + "step": 1767 + }, + { + "epoch": 0.0973086025648082, + "grad_norm": 0.9124754667282104, + "learning_rate": 9.947814902528078e-06, + "loss": 0.8599, + "step": 1768 + }, + { + "epoch": 0.09736364136716386, + "grad_norm": 0.9169870615005493, + "learning_rate": 9.947752420947792e-06, + "loss": 0.8382, + "step": 1769 + }, + { + "epoch": 0.09741868016951952, + "grad_norm": 1.0147640705108643, + "learning_rate": 9.947689902181526e-06, + "loss": 0.8425, + "step": 1770 + }, + { + "epoch": 0.09747371897187518, + "grad_norm": 0.778575599193573, + "learning_rate": 9.947627346229745e-06, + "loss": 0.6979, + "step": 1771 + }, + { + "epoch": 0.09752875777423083, + "grad_norm": 0.815101146697998, + "learning_rate": 9.947564753092922e-06, + "loss": 0.8617, + "step": 1772 + }, + { + "epoch": 0.09758379657658649, + "grad_norm": 0.9556358456611633, + "learning_rate": 9.947502122771527e-06, + "loss": 0.9009, + "step": 1773 + }, + { + "epoch": 0.09763883537894215, + "grad_norm": 0.8603761196136475, + "learning_rate": 9.94743945526603e-06, + "loss": 0.9443, + "step": 1774 + }, + { + "epoch": 0.09769387418129781, + "grad_norm": 0.8621761798858643, + "learning_rate": 9.947376750576903e-06, + "loss": 0.7537, + "step": 1775 + }, + { + "epoch": 0.09774891298365347, + "grad_norm": 0.7399948835372925, + "learning_rate": 9.947314008704616e-06, + "loss": 0.7477, + "step": 1776 + }, + { + "epoch": 0.09780395178600913, + "grad_norm": 0.8855582475662231, + "learning_rate": 9.947251229649641e-06, + "loss": 0.8745, + "step": 1777 + }, + { + "epoch": 0.0978589905883648, + "grad_norm": 0.8718472719192505, + "learning_rate": 9.947188413412452e-06, + "loss": 0.9672, + "step": 1778 + }, + { + "epoch": 0.09791402939072046, + "grad_norm": 0.8598514795303345, + "learning_rate": 9.947125559993517e-06, + "loss": 0.8278, + "step": 1779 + }, + { + "epoch": 0.09796906819307612, + "grad_norm": 1.0373798608779907, + "learning_rate": 9.947062669393312e-06, + "loss": 0.8123, + "step": 1780 + }, + { + "epoch": 0.09802410699543178, + "grad_norm": 1.0198705196380615, + "learning_rate": 9.946999741612306e-06, + "loss": 0.9039, + "step": 1781 + }, + { + "epoch": 0.09807914579778744, + "grad_norm": 0.8770025968551636, + "learning_rate": 9.946936776650977e-06, + "loss": 0.8326, + "step": 1782 + }, + { + "epoch": 0.0981341846001431, + "grad_norm": 0.7970215678215027, + "learning_rate": 9.946873774509794e-06, + "loss": 0.848, + "step": 1783 + }, + { + "epoch": 0.09818922340249876, + "grad_norm": 0.90342777967453, + "learning_rate": 9.946810735189231e-06, + "loss": 0.7993, + "step": 1784 + }, + { + "epoch": 0.09824426220485442, + "grad_norm": 1.2095681428909302, + "learning_rate": 9.946747658689763e-06, + "loss": 0.8544, + "step": 1785 + }, + { + "epoch": 0.09829930100721009, + "grad_norm": 0.8500953316688538, + "learning_rate": 9.946684545011866e-06, + "loss": 0.8398, + "step": 1786 + }, + { + "epoch": 0.09835433980956575, + "grad_norm": 0.8570724725723267, + "learning_rate": 9.946621394156011e-06, + "loss": 0.9255, + "step": 1787 + }, + { + "epoch": 0.09840937861192141, + "grad_norm": 0.8314846158027649, + "learning_rate": 9.946558206122672e-06, + "loss": 0.8398, + "step": 1788 + }, + { + "epoch": 0.09846441741427707, + "grad_norm": 0.8894716501235962, + "learning_rate": 9.946494980912326e-06, + "loss": 0.8612, + "step": 1789 + }, + { + "epoch": 0.09851945621663273, + "grad_norm": 0.9555756449699402, + "learning_rate": 9.94643171852545e-06, + "loss": 0.9551, + "step": 1790 + }, + { + "epoch": 0.09857449501898839, + "grad_norm": 0.9556692838668823, + "learning_rate": 9.946368418962515e-06, + "loss": 0.8175, + "step": 1791 + }, + { + "epoch": 0.09862953382134405, + "grad_norm": 0.7288535833358765, + "learning_rate": 9.946305082224e-06, + "loss": 0.6162, + "step": 1792 + }, + { + "epoch": 0.09868457262369972, + "grad_norm": 0.95478355884552, + "learning_rate": 9.94624170831038e-06, + "loss": 0.9089, + "step": 1793 + }, + { + "epoch": 0.09873961142605536, + "grad_norm": 0.9080137610435486, + "learning_rate": 9.946178297222133e-06, + "loss": 0.9443, + "step": 1794 + }, + { + "epoch": 0.09879465022841102, + "grad_norm": 0.8060124516487122, + "learning_rate": 9.946114848959732e-06, + "loss": 0.7412, + "step": 1795 + }, + { + "epoch": 0.09884968903076669, + "grad_norm": 0.8487932085990906, + "learning_rate": 9.946051363523655e-06, + "loss": 0.7098, + "step": 1796 + }, + { + "epoch": 0.09890472783312235, + "grad_norm": 0.8982037901878357, + "learning_rate": 9.945987840914381e-06, + "loss": 0.8304, + "step": 1797 + }, + { + "epoch": 0.09895976663547801, + "grad_norm": 0.8124602437019348, + "learning_rate": 9.945924281132386e-06, + "loss": 0.8441, + "step": 1798 + }, + { + "epoch": 0.09901480543783367, + "grad_norm": 0.8081663250923157, + "learning_rate": 9.945860684178147e-06, + "loss": 0.732, + "step": 1799 + }, + { + "epoch": 0.09906984424018933, + "grad_norm": 0.7662907242774963, + "learning_rate": 9.945797050052147e-06, + "loss": 0.7538, + "step": 1800 + }, + { + "epoch": 0.09912488304254499, + "grad_norm": 0.8418399095535278, + "learning_rate": 9.945733378754856e-06, + "loss": 0.8488, + "step": 1801 + }, + { + "epoch": 0.09917992184490065, + "grad_norm": 0.7298988699913025, + "learning_rate": 9.94566967028676e-06, + "loss": 0.7822, + "step": 1802 + }, + { + "epoch": 0.09923496064725632, + "grad_norm": 0.7788695693016052, + "learning_rate": 9.945605924648332e-06, + "loss": 0.8037, + "step": 1803 + }, + { + "epoch": 0.09928999944961198, + "grad_norm": 0.939297080039978, + "learning_rate": 9.945542141840054e-06, + "loss": 0.8654, + "step": 1804 + }, + { + "epoch": 0.09934503825196764, + "grad_norm": 0.9274358749389648, + "learning_rate": 9.945478321862406e-06, + "loss": 0.7712, + "step": 1805 + }, + { + "epoch": 0.0994000770543233, + "grad_norm": 0.816561222076416, + "learning_rate": 9.945414464715866e-06, + "loss": 0.7676, + "step": 1806 + }, + { + "epoch": 0.09945511585667896, + "grad_norm": 0.867915153503418, + "learning_rate": 9.945350570400916e-06, + "loss": 0.8343, + "step": 1807 + }, + { + "epoch": 0.09951015465903462, + "grad_norm": 0.8446162939071655, + "learning_rate": 9.945286638918034e-06, + "loss": 0.8128, + "step": 1808 + }, + { + "epoch": 0.09956519346139028, + "grad_norm": 0.8372986316680908, + "learning_rate": 9.945222670267703e-06, + "loss": 0.8611, + "step": 1809 + }, + { + "epoch": 0.09962023226374594, + "grad_norm": 0.787836492061615, + "learning_rate": 9.945158664450399e-06, + "loss": 0.7286, + "step": 1810 + }, + { + "epoch": 0.0996752710661016, + "grad_norm": 0.9293436408042908, + "learning_rate": 9.945094621466609e-06, + "loss": 0.8699, + "step": 1811 + }, + { + "epoch": 0.09973030986845727, + "grad_norm": 0.8336932063102722, + "learning_rate": 9.94503054131681e-06, + "loss": 0.8222, + "step": 1812 + }, + { + "epoch": 0.09978534867081293, + "grad_norm": 0.8310953378677368, + "learning_rate": 9.944966424001486e-06, + "loss": 0.8131, + "step": 1813 + }, + { + "epoch": 0.09984038747316859, + "grad_norm": 0.7703443169593811, + "learning_rate": 9.944902269521117e-06, + "loss": 0.8135, + "step": 1814 + }, + { + "epoch": 0.09989542627552424, + "grad_norm": 0.750990092754364, + "learning_rate": 9.944838077876186e-06, + "loss": 0.8137, + "step": 1815 + }, + { + "epoch": 0.0999504650778799, + "grad_norm": 0.8502481579780579, + "learning_rate": 9.944773849067178e-06, + "loss": 0.8973, + "step": 1816 + }, + { + "epoch": 0.10000550388023556, + "grad_norm": 0.8299791812896729, + "learning_rate": 9.94470958309457e-06, + "loss": 0.8341, + "step": 1817 + }, + { + "epoch": 0.10006054268259122, + "grad_norm": 0.8519022464752197, + "learning_rate": 9.94464527995885e-06, + "loss": 0.8529, + "step": 1818 + }, + { + "epoch": 0.10011558148494688, + "grad_norm": 0.9318063259124756, + "learning_rate": 9.944580939660501e-06, + "loss": 0.8978, + "step": 1819 + }, + { + "epoch": 0.10017062028730254, + "grad_norm": 0.847023069858551, + "learning_rate": 9.944516562200004e-06, + "loss": 0.8007, + "step": 1820 + }, + { + "epoch": 0.1002256590896582, + "grad_norm": 0.8817011117935181, + "learning_rate": 9.944452147577844e-06, + "loss": 0.8819, + "step": 1821 + }, + { + "epoch": 0.10028069789201387, + "grad_norm": 0.8560144901275635, + "learning_rate": 9.944387695794505e-06, + "loss": 0.8219, + "step": 1822 + }, + { + "epoch": 0.10033573669436953, + "grad_norm": 0.9358342885971069, + "learning_rate": 9.944323206850472e-06, + "loss": 0.8533, + "step": 1823 + }, + { + "epoch": 0.10039077549672519, + "grad_norm": 0.8327087163925171, + "learning_rate": 9.94425868074623e-06, + "loss": 0.8359, + "step": 1824 + }, + { + "epoch": 0.10044581429908085, + "grad_norm": 1.0590367317199707, + "learning_rate": 9.944194117482263e-06, + "loss": 0.9659, + "step": 1825 + }, + { + "epoch": 0.10050085310143651, + "grad_norm": 0.8739829063415527, + "learning_rate": 9.944129517059055e-06, + "loss": 0.7868, + "step": 1826 + }, + { + "epoch": 0.10055589190379217, + "grad_norm": 0.8465235233306885, + "learning_rate": 9.944064879477093e-06, + "loss": 0.8554, + "step": 1827 + }, + { + "epoch": 0.10061093070614784, + "grad_norm": 0.9068321585655212, + "learning_rate": 9.944000204736864e-06, + "loss": 0.8648, + "step": 1828 + }, + { + "epoch": 0.1006659695085035, + "grad_norm": 0.8308066725730896, + "learning_rate": 9.943935492838853e-06, + "loss": 0.8471, + "step": 1829 + }, + { + "epoch": 0.10072100831085916, + "grad_norm": 0.9973901510238647, + "learning_rate": 9.943870743783545e-06, + "loss": 0.9398, + "step": 1830 + }, + { + "epoch": 0.10077604711321482, + "grad_norm": 0.8532593250274658, + "learning_rate": 9.94380595757143e-06, + "loss": 0.9001, + "step": 1831 + }, + { + "epoch": 0.10083108591557048, + "grad_norm": 0.8571139574050903, + "learning_rate": 9.94374113420299e-06, + "loss": 0.85, + "step": 1832 + }, + { + "epoch": 0.10088612471792614, + "grad_norm": 0.905624508857727, + "learning_rate": 9.943676273678717e-06, + "loss": 0.9587, + "step": 1833 + }, + { + "epoch": 0.1009411635202818, + "grad_norm": 1.0224663019180298, + "learning_rate": 9.943611375999097e-06, + "loss": 0.8236, + "step": 1834 + }, + { + "epoch": 0.10099620232263747, + "grad_norm": 0.8900588154792786, + "learning_rate": 9.943546441164615e-06, + "loss": 0.877, + "step": 1835 + }, + { + "epoch": 0.10105124112499313, + "grad_norm": 0.8852938413619995, + "learning_rate": 9.943481469175765e-06, + "loss": 0.9521, + "step": 1836 + }, + { + "epoch": 0.10110627992734877, + "grad_norm": 0.9249371290206909, + "learning_rate": 9.943416460033027e-06, + "loss": 0.8541, + "step": 1837 + }, + { + "epoch": 0.10116131872970444, + "grad_norm": 0.8533583283424377, + "learning_rate": 9.943351413736897e-06, + "loss": 0.8571, + "step": 1838 + }, + { + "epoch": 0.1012163575320601, + "grad_norm": 0.743800699710846, + "learning_rate": 9.94328633028786e-06, + "loss": 0.749, + "step": 1839 + }, + { + "epoch": 0.10127139633441576, + "grad_norm": 0.7836641669273376, + "learning_rate": 9.943221209686407e-06, + "loss": 0.8237, + "step": 1840 + }, + { + "epoch": 0.10132643513677142, + "grad_norm": 0.800782322883606, + "learning_rate": 9.943156051933024e-06, + "loss": 0.8323, + "step": 1841 + }, + { + "epoch": 0.10138147393912708, + "grad_norm": 0.7531478404998779, + "learning_rate": 9.943090857028206e-06, + "loss": 0.8041, + "step": 1842 + }, + { + "epoch": 0.10143651274148274, + "grad_norm": 0.9837996959686279, + "learning_rate": 9.94302562497244e-06, + "loss": 0.8084, + "step": 1843 + }, + { + "epoch": 0.1014915515438384, + "grad_norm": 0.8038331866264343, + "learning_rate": 9.942960355766216e-06, + "loss": 0.8454, + "step": 1844 + }, + { + "epoch": 0.10154659034619407, + "grad_norm": 0.7822145819664001, + "learning_rate": 9.942895049410024e-06, + "loss": 0.8137, + "step": 1845 + }, + { + "epoch": 0.10160162914854973, + "grad_norm": 0.8222663998603821, + "learning_rate": 9.942829705904358e-06, + "loss": 0.8981, + "step": 1846 + }, + { + "epoch": 0.10165666795090539, + "grad_norm": 1.0095717906951904, + "learning_rate": 9.942764325249707e-06, + "loss": 0.9159, + "step": 1847 + }, + { + "epoch": 0.10171170675326105, + "grad_norm": 0.8264054656028748, + "learning_rate": 9.942698907446561e-06, + "loss": 0.9233, + "step": 1848 + }, + { + "epoch": 0.10176674555561671, + "grad_norm": 0.8244288563728333, + "learning_rate": 9.942633452495414e-06, + "loss": 0.8507, + "step": 1849 + }, + { + "epoch": 0.10182178435797237, + "grad_norm": 0.8457715511322021, + "learning_rate": 9.942567960396755e-06, + "loss": 0.7897, + "step": 1850 + }, + { + "epoch": 0.10187682316032803, + "grad_norm": 0.8356698155403137, + "learning_rate": 9.94250243115108e-06, + "loss": 0.7927, + "step": 1851 + }, + { + "epoch": 0.1019318619626837, + "grad_norm": 0.8251230716705322, + "learning_rate": 9.94243686475888e-06, + "loss": 0.8977, + "step": 1852 + }, + { + "epoch": 0.10198690076503936, + "grad_norm": 0.8370125889778137, + "learning_rate": 9.942371261220647e-06, + "loss": 0.8204, + "step": 1853 + }, + { + "epoch": 0.10204193956739502, + "grad_norm": 1.6722066402435303, + "learning_rate": 9.942305620536876e-06, + "loss": 0.9284, + "step": 1854 + }, + { + "epoch": 0.10209697836975068, + "grad_norm": 0.8424906730651855, + "learning_rate": 9.942239942708057e-06, + "loss": 0.833, + "step": 1855 + }, + { + "epoch": 0.10215201717210634, + "grad_norm": 0.7475115656852722, + "learning_rate": 9.942174227734686e-06, + "loss": 0.6158, + "step": 1856 + }, + { + "epoch": 0.102207055974462, + "grad_norm": 0.8652095198631287, + "learning_rate": 9.942108475617256e-06, + "loss": 0.8781, + "step": 1857 + }, + { + "epoch": 0.10226209477681765, + "grad_norm": 1.0621691942214966, + "learning_rate": 9.942042686356263e-06, + "loss": 1.0276, + "step": 1858 + }, + { + "epoch": 0.10231713357917331, + "grad_norm": 1.113357424736023, + "learning_rate": 9.941976859952199e-06, + "loss": 0.8799, + "step": 1859 + }, + { + "epoch": 0.10237217238152897, + "grad_norm": 0.9153568148612976, + "learning_rate": 9.94191099640556e-06, + "loss": 0.7988, + "step": 1860 + }, + { + "epoch": 0.10242721118388463, + "grad_norm": 0.9217341542243958, + "learning_rate": 9.941845095716842e-06, + "loss": 0.7785, + "step": 1861 + }, + { + "epoch": 0.1024822499862403, + "grad_norm": 0.8702190518379211, + "learning_rate": 9.941779157886538e-06, + "loss": 0.7648, + "step": 1862 + }, + { + "epoch": 0.10253728878859596, + "grad_norm": 0.8609822988510132, + "learning_rate": 9.941713182915144e-06, + "loss": 0.9095, + "step": 1863 + }, + { + "epoch": 0.10259232759095162, + "grad_norm": 0.7766719460487366, + "learning_rate": 9.941647170803157e-06, + "loss": 0.6984, + "step": 1864 + }, + { + "epoch": 0.10264736639330728, + "grad_norm": 0.8497375249862671, + "learning_rate": 9.941581121551074e-06, + "loss": 0.9161, + "step": 1865 + }, + { + "epoch": 0.10270240519566294, + "grad_norm": 0.8007600903511047, + "learning_rate": 9.941515035159388e-06, + "loss": 0.8099, + "step": 1866 + }, + { + "epoch": 0.1027574439980186, + "grad_norm": 0.7932959794998169, + "learning_rate": 9.941448911628599e-06, + "loss": 0.8049, + "step": 1867 + }, + { + "epoch": 0.10281248280037426, + "grad_norm": 1.3169244527816772, + "learning_rate": 9.941382750959203e-06, + "loss": 0.8601, + "step": 1868 + }, + { + "epoch": 0.10286752160272992, + "grad_norm": 0.8011140823364258, + "learning_rate": 9.941316553151696e-06, + "loss": 0.8397, + "step": 1869 + }, + { + "epoch": 0.10292256040508559, + "grad_norm": 0.811210572719574, + "learning_rate": 9.941250318206577e-06, + "loss": 0.7863, + "step": 1870 + }, + { + "epoch": 0.10297759920744125, + "grad_norm": 0.8172751665115356, + "learning_rate": 9.941184046124342e-06, + "loss": 0.8114, + "step": 1871 + }, + { + "epoch": 0.10303263800979691, + "grad_norm": 0.8072887063026428, + "learning_rate": 9.941117736905493e-06, + "loss": 0.8928, + "step": 1872 + }, + { + "epoch": 0.10308767681215257, + "grad_norm": 0.9111380577087402, + "learning_rate": 9.941051390550524e-06, + "loss": 0.866, + "step": 1873 + }, + { + "epoch": 0.10314271561450823, + "grad_norm": 0.8158383369445801, + "learning_rate": 9.940985007059936e-06, + "loss": 0.7805, + "step": 1874 + }, + { + "epoch": 0.1031977544168639, + "grad_norm": 0.8858961462974548, + "learning_rate": 9.940918586434226e-06, + "loss": 0.8424, + "step": 1875 + }, + { + "epoch": 0.10325279321921955, + "grad_norm": 0.8835182189941406, + "learning_rate": 9.940852128673895e-06, + "loss": 0.7816, + "step": 1876 + }, + { + "epoch": 0.10330783202157522, + "grad_norm": 1.044227123260498, + "learning_rate": 9.940785633779444e-06, + "loss": 0.8952, + "step": 1877 + }, + { + "epoch": 0.10336287082393088, + "grad_norm": 0.8255050778388977, + "learning_rate": 9.940719101751367e-06, + "loss": 0.8215, + "step": 1878 + }, + { + "epoch": 0.10341790962628654, + "grad_norm": 0.8561689257621765, + "learning_rate": 9.940652532590172e-06, + "loss": 0.9686, + "step": 1879 + }, + { + "epoch": 0.10347294842864219, + "grad_norm": 0.8798959255218506, + "learning_rate": 9.94058592629635e-06, + "loss": 0.8993, + "step": 1880 + }, + { + "epoch": 0.10352798723099785, + "grad_norm": 0.9292098879814148, + "learning_rate": 9.940519282870411e-06, + "loss": 0.8536, + "step": 1881 + }, + { + "epoch": 0.10358302603335351, + "grad_norm": 0.8865400552749634, + "learning_rate": 9.940452602312851e-06, + "loss": 0.8024, + "step": 1882 + }, + { + "epoch": 0.10363806483570917, + "grad_norm": 0.8985510468482971, + "learning_rate": 9.94038588462417e-06, + "loss": 0.7748, + "step": 1883 + }, + { + "epoch": 0.10369310363806483, + "grad_norm": 0.9973617196083069, + "learning_rate": 9.940319129804872e-06, + "loss": 0.875, + "step": 1884 + }, + { + "epoch": 0.1037481424404205, + "grad_norm": 0.8615350723266602, + "learning_rate": 9.940252337855458e-06, + "loss": 0.904, + "step": 1885 + }, + { + "epoch": 0.10380318124277615, + "grad_norm": 0.8752412796020508, + "learning_rate": 9.940185508776429e-06, + "loss": 0.8735, + "step": 1886 + }, + { + "epoch": 0.10385822004513182, + "grad_norm": 0.8639446496963501, + "learning_rate": 9.94011864256829e-06, + "loss": 0.7952, + "step": 1887 + }, + { + "epoch": 0.10391325884748748, + "grad_norm": 0.7932116389274597, + "learning_rate": 9.94005173923154e-06, + "loss": 0.8721, + "step": 1888 + }, + { + "epoch": 0.10396829764984314, + "grad_norm": 0.8573791980743408, + "learning_rate": 9.939984798766685e-06, + "loss": 0.9271, + "step": 1889 + }, + { + "epoch": 0.1040233364521988, + "grad_norm": 0.9080122113227844, + "learning_rate": 9.939917821174225e-06, + "loss": 0.8991, + "step": 1890 + }, + { + "epoch": 0.10407837525455446, + "grad_norm": 0.7883808612823486, + "learning_rate": 9.939850806454664e-06, + "loss": 0.6895, + "step": 1891 + }, + { + "epoch": 0.10413341405691012, + "grad_norm": 0.8067768216133118, + "learning_rate": 9.93978375460851e-06, + "loss": 0.835, + "step": 1892 + }, + { + "epoch": 0.10418845285926578, + "grad_norm": 0.8756459951400757, + "learning_rate": 9.939716665636262e-06, + "loss": 0.8144, + "step": 1893 + }, + { + "epoch": 0.10424349166162145, + "grad_norm": 0.8056700825691223, + "learning_rate": 9.939649539538425e-06, + "loss": 0.7454, + "step": 1894 + }, + { + "epoch": 0.10429853046397711, + "grad_norm": 1.0756300687789917, + "learning_rate": 9.939582376315505e-06, + "loss": 0.8096, + "step": 1895 + }, + { + "epoch": 0.10435356926633277, + "grad_norm": 0.8938102126121521, + "learning_rate": 9.939515175968006e-06, + "loss": 0.7496, + "step": 1896 + }, + { + "epoch": 0.10440860806868843, + "grad_norm": 0.9371656775474548, + "learning_rate": 9.939447938496434e-06, + "loss": 0.9817, + "step": 1897 + }, + { + "epoch": 0.10446364687104409, + "grad_norm": 1.0216082334518433, + "learning_rate": 9.939380663901292e-06, + "loss": 0.8804, + "step": 1898 + }, + { + "epoch": 0.10451868567339975, + "grad_norm": 0.8791126012802124, + "learning_rate": 9.939313352183088e-06, + "loss": 0.7811, + "step": 1899 + }, + { + "epoch": 0.10457372447575541, + "grad_norm": 0.9925445914268494, + "learning_rate": 9.939246003342326e-06, + "loss": 0.8892, + "step": 1900 + }, + { + "epoch": 0.10462876327811106, + "grad_norm": 1.0459916591644287, + "learning_rate": 9.939178617379514e-06, + "loss": 0.7938, + "step": 1901 + }, + { + "epoch": 0.10468380208046672, + "grad_norm": 0.9103816747665405, + "learning_rate": 9.93911119429516e-06, + "loss": 0.8282, + "step": 1902 + }, + { + "epoch": 0.10473884088282238, + "grad_norm": 0.9602296352386475, + "learning_rate": 9.939043734089764e-06, + "loss": 0.919, + "step": 1903 + }, + { + "epoch": 0.10479387968517805, + "grad_norm": 0.9529246687889099, + "learning_rate": 9.93897623676384e-06, + "loss": 0.9469, + "step": 1904 + }, + { + "epoch": 0.10484891848753371, + "grad_norm": 0.9619705080986023, + "learning_rate": 9.938908702317893e-06, + "loss": 0.9371, + "step": 1905 + }, + { + "epoch": 0.10490395728988937, + "grad_norm": 1.0106935501098633, + "learning_rate": 9.938841130752428e-06, + "loss": 0.7502, + "step": 1906 + }, + { + "epoch": 0.10495899609224503, + "grad_norm": 0.913985013961792, + "learning_rate": 9.938773522067957e-06, + "loss": 0.8172, + "step": 1907 + }, + { + "epoch": 0.10501403489460069, + "grad_norm": 0.9474983215332031, + "learning_rate": 9.938705876264985e-06, + "loss": 0.8999, + "step": 1908 + }, + { + "epoch": 0.10506907369695635, + "grad_norm": 0.9185097813606262, + "learning_rate": 9.938638193344024e-06, + "loss": 0.8976, + "step": 1909 + }, + { + "epoch": 0.10512411249931201, + "grad_norm": 0.7633675932884216, + "learning_rate": 9.938570473305578e-06, + "loss": 0.7777, + "step": 1910 + }, + { + "epoch": 0.10517915130166768, + "grad_norm": 0.9547691345214844, + "learning_rate": 9.938502716150159e-06, + "loss": 0.8154, + "step": 1911 + }, + { + "epoch": 0.10523419010402334, + "grad_norm": 0.8556191921234131, + "learning_rate": 9.938434921878275e-06, + "loss": 0.828, + "step": 1912 + }, + { + "epoch": 0.105289228906379, + "grad_norm": 0.9826140999794006, + "learning_rate": 9.938367090490437e-06, + "loss": 0.8085, + "step": 1913 + }, + { + "epoch": 0.10534426770873466, + "grad_norm": 0.8610432744026184, + "learning_rate": 9.938299221987154e-06, + "loss": 0.9103, + "step": 1914 + }, + { + "epoch": 0.10539930651109032, + "grad_norm": 0.8383543491363525, + "learning_rate": 9.938231316368934e-06, + "loss": 0.8182, + "step": 1915 + }, + { + "epoch": 0.10545434531344598, + "grad_norm": 0.8552964925765991, + "learning_rate": 9.93816337363629e-06, + "loss": 0.8024, + "step": 1916 + }, + { + "epoch": 0.10550938411580164, + "grad_norm": 0.9255730509757996, + "learning_rate": 9.938095393789732e-06, + "loss": 0.8566, + "step": 1917 + }, + { + "epoch": 0.1055644229181573, + "grad_norm": 0.9882987141609192, + "learning_rate": 9.938027376829774e-06, + "loss": 0.7119, + "step": 1918 + }, + { + "epoch": 0.10561946172051297, + "grad_norm": 1.139404535293579, + "learning_rate": 9.93795932275692e-06, + "loss": 0.8839, + "step": 1919 + }, + { + "epoch": 0.10567450052286863, + "grad_norm": 1.004782795906067, + "learning_rate": 9.937891231571686e-06, + "loss": 0.904, + "step": 1920 + }, + { + "epoch": 0.10572953932522429, + "grad_norm": 0.8437260389328003, + "learning_rate": 9.937823103274585e-06, + "loss": 0.7942, + "step": 1921 + }, + { + "epoch": 0.10578457812757995, + "grad_norm": 1.1388722658157349, + "learning_rate": 9.937754937866127e-06, + "loss": 0.9491, + "step": 1922 + }, + { + "epoch": 0.1058396169299356, + "grad_norm": 0.9266740083694458, + "learning_rate": 9.937686735346823e-06, + "loss": 0.9067, + "step": 1923 + }, + { + "epoch": 0.10589465573229126, + "grad_norm": 0.7536123991012573, + "learning_rate": 9.93761849571719e-06, + "loss": 0.6533, + "step": 1924 + }, + { + "epoch": 0.10594969453464692, + "grad_norm": 0.8781737089157104, + "learning_rate": 9.937550218977737e-06, + "loss": 0.8319, + "step": 1925 + }, + { + "epoch": 0.10600473333700258, + "grad_norm": 0.8577924966812134, + "learning_rate": 9.937481905128976e-06, + "loss": 0.8604, + "step": 1926 + }, + { + "epoch": 0.10605977213935824, + "grad_norm": 0.8351713418960571, + "learning_rate": 9.937413554171424e-06, + "loss": 0.946, + "step": 1927 + }, + { + "epoch": 0.1061148109417139, + "grad_norm": 0.971491813659668, + "learning_rate": 9.937345166105594e-06, + "loss": 0.7383, + "step": 1928 + }, + { + "epoch": 0.10616984974406957, + "grad_norm": 0.8020079731941223, + "learning_rate": 9.937276740932001e-06, + "loss": 0.7468, + "step": 1929 + }, + { + "epoch": 0.10622488854642523, + "grad_norm": 0.9057347178459167, + "learning_rate": 9.937208278651153e-06, + "loss": 0.8223, + "step": 1930 + }, + { + "epoch": 0.10627992734878089, + "grad_norm": 0.8384734392166138, + "learning_rate": 9.937139779263574e-06, + "loss": 0.8773, + "step": 1931 + }, + { + "epoch": 0.10633496615113655, + "grad_norm": 0.8732065558433533, + "learning_rate": 9.93707124276977e-06, + "loss": 0.8265, + "step": 1932 + }, + { + "epoch": 0.10639000495349221, + "grad_norm": 0.8744868040084839, + "learning_rate": 9.937002669170264e-06, + "loss": 0.8497, + "step": 1933 + }, + { + "epoch": 0.10644504375584787, + "grad_norm": 0.8589879870414734, + "learning_rate": 9.936934058465564e-06, + "loss": 0.8116, + "step": 1934 + }, + { + "epoch": 0.10650008255820353, + "grad_norm": 0.8614563941955566, + "learning_rate": 9.936865410656192e-06, + "loss": 0.7823, + "step": 1935 + }, + { + "epoch": 0.1065551213605592, + "grad_norm": 0.8381434082984924, + "learning_rate": 9.93679672574266e-06, + "loss": 0.7889, + "step": 1936 + }, + { + "epoch": 0.10661016016291486, + "grad_norm": 0.9834293127059937, + "learning_rate": 9.936728003725484e-06, + "loss": 0.8358, + "step": 1937 + }, + { + "epoch": 0.10666519896527052, + "grad_norm": 0.8461851477622986, + "learning_rate": 9.936659244605184e-06, + "loss": 0.8408, + "step": 1938 + }, + { + "epoch": 0.10672023776762618, + "grad_norm": 1.0186371803283691, + "learning_rate": 9.936590448382273e-06, + "loss": 0.8118, + "step": 1939 + }, + { + "epoch": 0.10677527656998184, + "grad_norm": 0.866321325302124, + "learning_rate": 9.93652161505727e-06, + "loss": 0.8696, + "step": 1940 + }, + { + "epoch": 0.1068303153723375, + "grad_norm": 0.9179622530937195, + "learning_rate": 9.936452744630692e-06, + "loss": 0.8419, + "step": 1941 + }, + { + "epoch": 0.10688535417469316, + "grad_norm": 0.8250496983528137, + "learning_rate": 9.936383837103057e-06, + "loss": 0.8511, + "step": 1942 + }, + { + "epoch": 0.10694039297704883, + "grad_norm": 0.8475700616836548, + "learning_rate": 9.936314892474883e-06, + "loss": 0.8404, + "step": 1943 + }, + { + "epoch": 0.10699543177940447, + "grad_norm": 0.774334192276001, + "learning_rate": 9.936245910746684e-06, + "loss": 0.7461, + "step": 1944 + }, + { + "epoch": 0.10705047058176013, + "grad_norm": 0.9313948154449463, + "learning_rate": 9.936176891918986e-06, + "loss": 0.8486, + "step": 1945 + }, + { + "epoch": 0.1071055093841158, + "grad_norm": 0.8784124255180359, + "learning_rate": 9.936107835992304e-06, + "loss": 0.84, + "step": 1946 + }, + { + "epoch": 0.10716054818647146, + "grad_norm": 0.9087465405464172, + "learning_rate": 9.936038742967154e-06, + "loss": 0.9012, + "step": 1947 + }, + { + "epoch": 0.10721558698882712, + "grad_norm": 0.8462012410163879, + "learning_rate": 9.93596961284406e-06, + "loss": 0.9193, + "step": 1948 + }, + { + "epoch": 0.10727062579118278, + "grad_norm": 0.8984553813934326, + "learning_rate": 9.935900445623538e-06, + "loss": 0.781, + "step": 1949 + }, + { + "epoch": 0.10732566459353844, + "grad_norm": 0.9197295308113098, + "learning_rate": 9.935831241306111e-06, + "loss": 0.8861, + "step": 1950 + }, + { + "epoch": 0.1073807033958941, + "grad_norm": 0.8452801704406738, + "learning_rate": 9.935761999892296e-06, + "loss": 0.8649, + "step": 1951 + }, + { + "epoch": 0.10743574219824976, + "grad_norm": 0.8047192096710205, + "learning_rate": 9.935692721382618e-06, + "loss": 0.8704, + "step": 1952 + }, + { + "epoch": 0.10749078100060543, + "grad_norm": 0.9536359906196594, + "learning_rate": 9.935623405777593e-06, + "loss": 0.7803, + "step": 1953 + }, + { + "epoch": 0.10754581980296109, + "grad_norm": 0.8215291500091553, + "learning_rate": 9.935554053077744e-06, + "loss": 0.8247, + "step": 1954 + }, + { + "epoch": 0.10760085860531675, + "grad_norm": 0.9261930584907532, + "learning_rate": 9.93548466328359e-06, + "loss": 0.8594, + "step": 1955 + }, + { + "epoch": 0.10765589740767241, + "grad_norm": 0.7973492741584778, + "learning_rate": 9.935415236395656e-06, + "loss": 0.7464, + "step": 1956 + }, + { + "epoch": 0.10771093621002807, + "grad_norm": 0.9328988790512085, + "learning_rate": 9.935345772414463e-06, + "loss": 0.8472, + "step": 1957 + }, + { + "epoch": 0.10776597501238373, + "grad_norm": 0.9490759968757629, + "learning_rate": 9.935276271340532e-06, + "loss": 0.806, + "step": 1958 + }, + { + "epoch": 0.1078210138147394, + "grad_norm": 0.9149925112724304, + "learning_rate": 9.935206733174385e-06, + "loss": 0.8741, + "step": 1959 + }, + { + "epoch": 0.10787605261709506, + "grad_norm": 1.0074039697647095, + "learning_rate": 9.935137157916546e-06, + "loss": 0.8493, + "step": 1960 + }, + { + "epoch": 0.10793109141945072, + "grad_norm": 0.8783678412437439, + "learning_rate": 9.935067545567535e-06, + "loss": 0.8132, + "step": 1961 + }, + { + "epoch": 0.10798613022180638, + "grad_norm": 0.8273885250091553, + "learning_rate": 9.934997896127879e-06, + "loss": 0.7448, + "step": 1962 + }, + { + "epoch": 0.10804116902416204, + "grad_norm": 0.761947512626648, + "learning_rate": 9.9349282095981e-06, + "loss": 0.7933, + "step": 1963 + }, + { + "epoch": 0.1080962078265177, + "grad_norm": 0.814809262752533, + "learning_rate": 9.934858485978722e-06, + "loss": 0.7551, + "step": 1964 + }, + { + "epoch": 0.10815124662887336, + "grad_norm": 0.8108895421028137, + "learning_rate": 9.934788725270266e-06, + "loss": 0.6787, + "step": 1965 + }, + { + "epoch": 0.10820628543122901, + "grad_norm": 0.8669139742851257, + "learning_rate": 9.934718927473262e-06, + "loss": 0.8395, + "step": 1966 + }, + { + "epoch": 0.10826132423358467, + "grad_norm": 0.9093756079673767, + "learning_rate": 9.93464909258823e-06, + "loss": 0.8341, + "step": 1967 + }, + { + "epoch": 0.10831636303594033, + "grad_norm": 0.8923841714859009, + "learning_rate": 9.934579220615697e-06, + "loss": 0.9422, + "step": 1968 + }, + { + "epoch": 0.108371401838296, + "grad_norm": 0.850429117679596, + "learning_rate": 9.934509311556186e-06, + "loss": 0.8446, + "step": 1969 + }, + { + "epoch": 0.10842644064065166, + "grad_norm": 0.8762460350990295, + "learning_rate": 9.934439365410224e-06, + "loss": 0.7788, + "step": 1970 + }, + { + "epoch": 0.10848147944300732, + "grad_norm": 0.9700387716293335, + "learning_rate": 9.934369382178338e-06, + "loss": 0.8455, + "step": 1971 + }, + { + "epoch": 0.10853651824536298, + "grad_norm": 0.8003185987472534, + "learning_rate": 9.934299361861053e-06, + "loss": 0.8026, + "step": 1972 + }, + { + "epoch": 0.10859155704771864, + "grad_norm": 0.9626984596252441, + "learning_rate": 9.934229304458893e-06, + "loss": 0.8219, + "step": 1973 + }, + { + "epoch": 0.1086465958500743, + "grad_norm": 0.8722280859947205, + "learning_rate": 9.934159209972386e-06, + "loss": 0.8866, + "step": 1974 + }, + { + "epoch": 0.10870163465242996, + "grad_norm": 0.838736355304718, + "learning_rate": 9.934089078402061e-06, + "loss": 0.7723, + "step": 1975 + }, + { + "epoch": 0.10875667345478562, + "grad_norm": 0.8373032808303833, + "learning_rate": 9.934018909748443e-06, + "loss": 0.9003, + "step": 1976 + }, + { + "epoch": 0.10881171225714129, + "grad_norm": 0.8704653978347778, + "learning_rate": 9.93394870401206e-06, + "loss": 0.8926, + "step": 1977 + }, + { + "epoch": 0.10886675105949695, + "grad_norm": 0.8088163733482361, + "learning_rate": 9.933878461193437e-06, + "loss": 0.8059, + "step": 1978 + }, + { + "epoch": 0.10892178986185261, + "grad_norm": 0.856421947479248, + "learning_rate": 9.933808181293108e-06, + "loss": 0.8447, + "step": 1979 + }, + { + "epoch": 0.10897682866420827, + "grad_norm": 0.9676237106323242, + "learning_rate": 9.933737864311595e-06, + "loss": 0.9009, + "step": 1980 + }, + { + "epoch": 0.10903186746656393, + "grad_norm": 0.7955103516578674, + "learning_rate": 9.933667510249428e-06, + "loss": 0.881, + "step": 1981 + }, + { + "epoch": 0.10908690626891959, + "grad_norm": 0.7935854196548462, + "learning_rate": 9.933597119107136e-06, + "loss": 0.8773, + "step": 1982 + }, + { + "epoch": 0.10914194507127525, + "grad_norm": 0.7726008296012878, + "learning_rate": 9.933526690885251e-06, + "loss": 0.8133, + "step": 1983 + }, + { + "epoch": 0.10919698387363092, + "grad_norm": 0.8577712178230286, + "learning_rate": 9.9334562255843e-06, + "loss": 0.7455, + "step": 1984 + }, + { + "epoch": 0.10925202267598658, + "grad_norm": 0.9996447563171387, + "learning_rate": 9.933385723204812e-06, + "loss": 0.7312, + "step": 1985 + }, + { + "epoch": 0.10930706147834224, + "grad_norm": 0.9600629806518555, + "learning_rate": 9.933315183747318e-06, + "loss": 0.8792, + "step": 1986 + }, + { + "epoch": 0.10936210028069789, + "grad_norm": 0.9126206636428833, + "learning_rate": 9.933244607212347e-06, + "loss": 1.0023, + "step": 1987 + }, + { + "epoch": 0.10941713908305355, + "grad_norm": 0.774153470993042, + "learning_rate": 9.93317399360043e-06, + "loss": 0.7877, + "step": 1988 + }, + { + "epoch": 0.10947217788540921, + "grad_norm": 0.848495364189148, + "learning_rate": 9.933103342912096e-06, + "loss": 0.8825, + "step": 1989 + }, + { + "epoch": 0.10952721668776487, + "grad_norm": 0.806408166885376, + "learning_rate": 9.933032655147881e-06, + "loss": 0.7389, + "step": 1990 + }, + { + "epoch": 0.10958225549012053, + "grad_norm": 0.8579222559928894, + "learning_rate": 9.932961930308312e-06, + "loss": 0.8283, + "step": 1991 + }, + { + "epoch": 0.10963729429247619, + "grad_norm": 0.7548109292984009, + "learning_rate": 9.93289116839392e-06, + "loss": 0.7971, + "step": 1992 + }, + { + "epoch": 0.10969233309483185, + "grad_norm": 0.7954711318016052, + "learning_rate": 9.93282036940524e-06, + "loss": 0.849, + "step": 1993 + }, + { + "epoch": 0.10974737189718752, + "grad_norm": 0.7911425232887268, + "learning_rate": 9.932749533342802e-06, + "loss": 0.86, + "step": 1994 + }, + { + "epoch": 0.10980241069954318, + "grad_norm": 0.8505094051361084, + "learning_rate": 9.932678660207141e-06, + "loss": 0.7871, + "step": 1995 + }, + { + "epoch": 0.10985744950189884, + "grad_norm": 0.809612512588501, + "learning_rate": 9.932607749998784e-06, + "loss": 0.8337, + "step": 1996 + }, + { + "epoch": 0.1099124883042545, + "grad_norm": 0.738523006439209, + "learning_rate": 9.93253680271827e-06, + "loss": 0.7634, + "step": 1997 + }, + { + "epoch": 0.10996752710661016, + "grad_norm": 0.8434372544288635, + "learning_rate": 9.932465818366128e-06, + "loss": 0.7987, + "step": 1998 + }, + { + "epoch": 0.11002256590896582, + "grad_norm": 0.8068081140518188, + "learning_rate": 9.932394796942895e-06, + "loss": 0.9496, + "step": 1999 + }, + { + "epoch": 0.11007760471132148, + "grad_norm": 0.754342794418335, + "learning_rate": 9.932323738449103e-06, + "loss": 0.7355, + "step": 2000 + }, + { + "epoch": 0.11013264351367714, + "grad_norm": 0.8830806612968445, + "learning_rate": 9.932252642885285e-06, + "loss": 0.8458, + "step": 2001 + }, + { + "epoch": 0.1101876823160328, + "grad_norm": 0.9915485978126526, + "learning_rate": 9.932181510251977e-06, + "loss": 0.8116, + "step": 2002 + }, + { + "epoch": 0.11024272111838847, + "grad_norm": 0.858368992805481, + "learning_rate": 9.932110340549712e-06, + "loss": 0.8354, + "step": 2003 + }, + { + "epoch": 0.11029775992074413, + "grad_norm": 0.8591521382331848, + "learning_rate": 9.932039133779028e-06, + "loss": 0.8316, + "step": 2004 + }, + { + "epoch": 0.11035279872309979, + "grad_norm": 0.8714838624000549, + "learning_rate": 9.931967889940455e-06, + "loss": 0.8106, + "step": 2005 + }, + { + "epoch": 0.11040783752545545, + "grad_norm": 0.8082797527313232, + "learning_rate": 9.931896609034534e-06, + "loss": 0.7762, + "step": 2006 + }, + { + "epoch": 0.11046287632781111, + "grad_norm": 0.9226199984550476, + "learning_rate": 9.931825291061797e-06, + "loss": 0.8641, + "step": 2007 + }, + { + "epoch": 0.11051791513016677, + "grad_norm": 0.8883050680160522, + "learning_rate": 9.931753936022783e-06, + "loss": 0.9014, + "step": 2008 + }, + { + "epoch": 0.11057295393252242, + "grad_norm": 0.9024807810783386, + "learning_rate": 9.931682543918024e-06, + "loss": 0.9085, + "step": 2009 + }, + { + "epoch": 0.11062799273487808, + "grad_norm": 0.8381460905075073, + "learning_rate": 9.931611114748062e-06, + "loss": 0.8043, + "step": 2010 + }, + { + "epoch": 0.11068303153723374, + "grad_norm": 1.1222339868545532, + "learning_rate": 9.931539648513429e-06, + "loss": 0.8388, + "step": 2011 + }, + { + "epoch": 0.1107380703395894, + "grad_norm": 0.9710868000984192, + "learning_rate": 9.931468145214665e-06, + "loss": 0.8934, + "step": 2012 + }, + { + "epoch": 0.11079310914194507, + "grad_norm": 0.9821141958236694, + "learning_rate": 9.931396604852304e-06, + "loss": 0.931, + "step": 2013 + }, + { + "epoch": 0.11084814794430073, + "grad_norm": 1.0658717155456543, + "learning_rate": 9.931325027426889e-06, + "loss": 0.9032, + "step": 2014 + }, + { + "epoch": 0.11090318674665639, + "grad_norm": 0.8836946487426758, + "learning_rate": 9.931253412938956e-06, + "loss": 0.9131, + "step": 2015 + }, + { + "epoch": 0.11095822554901205, + "grad_norm": 0.8438361883163452, + "learning_rate": 9.93118176138904e-06, + "loss": 0.8674, + "step": 2016 + }, + { + "epoch": 0.11101326435136771, + "grad_norm": 0.928142786026001, + "learning_rate": 9.93111007277768e-06, + "loss": 0.8882, + "step": 2017 + }, + { + "epoch": 0.11106830315372337, + "grad_norm": 0.9176276922225952, + "learning_rate": 9.93103834710542e-06, + "loss": 0.8904, + "step": 2018 + }, + { + "epoch": 0.11112334195607904, + "grad_norm": 1.0462889671325684, + "learning_rate": 9.930966584372795e-06, + "loss": 0.8029, + "step": 2019 + }, + { + "epoch": 0.1111783807584347, + "grad_norm": 0.7627375721931458, + "learning_rate": 9.930894784580344e-06, + "loss": 0.8474, + "step": 2020 + }, + { + "epoch": 0.11123341956079036, + "grad_norm": 1.0545588731765747, + "learning_rate": 9.93082294772861e-06, + "loss": 0.7985, + "step": 2021 + }, + { + "epoch": 0.11128845836314602, + "grad_norm": 0.9752298593521118, + "learning_rate": 9.93075107381813e-06, + "loss": 0.8725, + "step": 2022 + }, + { + "epoch": 0.11134349716550168, + "grad_norm": 0.8403159379959106, + "learning_rate": 9.930679162849444e-06, + "loss": 0.8854, + "step": 2023 + }, + { + "epoch": 0.11139853596785734, + "grad_norm": 0.8879380226135254, + "learning_rate": 9.930607214823094e-06, + "loss": 0.7269, + "step": 2024 + }, + { + "epoch": 0.111453574770213, + "grad_norm": 0.907256543636322, + "learning_rate": 9.930535229739618e-06, + "loss": 0.8145, + "step": 2025 + }, + { + "epoch": 0.11150861357256867, + "grad_norm": 1.1066968441009521, + "learning_rate": 9.93046320759956e-06, + "loss": 0.9281, + "step": 2026 + }, + { + "epoch": 0.11156365237492433, + "grad_norm": 0.9226258397102356, + "learning_rate": 9.930391148403462e-06, + "loss": 0.9048, + "step": 2027 + }, + { + "epoch": 0.11161869117727999, + "grad_norm": 0.9652156829833984, + "learning_rate": 9.930319052151862e-06, + "loss": 0.9321, + "step": 2028 + }, + { + "epoch": 0.11167372997963565, + "grad_norm": 0.9102638363838196, + "learning_rate": 9.930246918845305e-06, + "loss": 0.8169, + "step": 2029 + }, + { + "epoch": 0.1117287687819913, + "grad_norm": 0.7765716314315796, + "learning_rate": 9.93017474848433e-06, + "loss": 0.7691, + "step": 2030 + }, + { + "epoch": 0.11178380758434696, + "grad_norm": 0.9053775072097778, + "learning_rate": 9.930102541069484e-06, + "loss": 0.782, + "step": 2031 + }, + { + "epoch": 0.11183884638670262, + "grad_norm": 0.8892827033996582, + "learning_rate": 9.930030296601306e-06, + "loss": 0.8575, + "step": 2032 + }, + { + "epoch": 0.11189388518905828, + "grad_norm": 0.8947604894638062, + "learning_rate": 9.929958015080339e-06, + "loss": 0.8607, + "step": 2033 + }, + { + "epoch": 0.11194892399141394, + "grad_norm": 0.8936871290206909, + "learning_rate": 9.929885696507127e-06, + "loss": 0.8111, + "step": 2034 + }, + { + "epoch": 0.1120039627937696, + "grad_norm": 0.9579165577888489, + "learning_rate": 9.929813340882214e-06, + "loss": 0.911, + "step": 2035 + }, + { + "epoch": 0.11205900159612527, + "grad_norm": 0.7885386347770691, + "learning_rate": 9.929740948206146e-06, + "loss": 0.8074, + "step": 2036 + }, + { + "epoch": 0.11211404039848093, + "grad_norm": 0.817939281463623, + "learning_rate": 9.929668518479462e-06, + "loss": 0.8451, + "step": 2037 + }, + { + "epoch": 0.11216907920083659, + "grad_norm": 0.8695761561393738, + "learning_rate": 9.92959605170271e-06, + "loss": 0.7158, + "step": 2038 + }, + { + "epoch": 0.11222411800319225, + "grad_norm": 0.8569639325141907, + "learning_rate": 9.929523547876433e-06, + "loss": 0.8568, + "step": 2039 + }, + { + "epoch": 0.11227915680554791, + "grad_norm": 0.8569897413253784, + "learning_rate": 9.929451007001176e-06, + "loss": 0.8971, + "step": 2040 + }, + { + "epoch": 0.11233419560790357, + "grad_norm": 0.8520069718360901, + "learning_rate": 9.929378429077487e-06, + "loss": 0.9027, + "step": 2041 + }, + { + "epoch": 0.11238923441025923, + "grad_norm": 0.9338961839675903, + "learning_rate": 9.929305814105907e-06, + "loss": 0.8646, + "step": 2042 + }, + { + "epoch": 0.1124442732126149, + "grad_norm": 0.8497192859649658, + "learning_rate": 9.929233162086985e-06, + "loss": 0.9068, + "step": 2043 + }, + { + "epoch": 0.11249931201497056, + "grad_norm": 0.8570863008499146, + "learning_rate": 9.929160473021267e-06, + "loss": 0.962, + "step": 2044 + }, + { + "epoch": 0.11255435081732622, + "grad_norm": 0.9072359800338745, + "learning_rate": 9.929087746909296e-06, + "loss": 0.8454, + "step": 2045 + }, + { + "epoch": 0.11260938961968188, + "grad_norm": 0.7920698523521423, + "learning_rate": 9.929014983751623e-06, + "loss": 0.8031, + "step": 2046 + }, + { + "epoch": 0.11266442842203754, + "grad_norm": 1.0180169343948364, + "learning_rate": 9.928942183548791e-06, + "loss": 0.7759, + "step": 2047 + }, + { + "epoch": 0.1127194672243932, + "grad_norm": 0.8746892809867859, + "learning_rate": 9.928869346301351e-06, + "loss": 0.9038, + "step": 2048 + }, + { + "epoch": 0.11277450602674886, + "grad_norm": 0.8283438086509705, + "learning_rate": 9.928796472009846e-06, + "loss": 0.8883, + "step": 2049 + }, + { + "epoch": 0.11282954482910452, + "grad_norm": 1.321917176246643, + "learning_rate": 9.928723560674828e-06, + "loss": 0.835, + "step": 2050 + }, + { + "epoch": 0.11288458363146017, + "grad_norm": 0.9356202483177185, + "learning_rate": 9.928650612296841e-06, + "loss": 0.8077, + "step": 2051 + }, + { + "epoch": 0.11293962243381583, + "grad_norm": 0.8493767380714417, + "learning_rate": 9.928577626876439e-06, + "loss": 0.8295, + "step": 2052 + }, + { + "epoch": 0.1129946612361715, + "grad_norm": 0.784818708896637, + "learning_rate": 9.928504604414164e-06, + "loss": 0.8322, + "step": 2053 + }, + { + "epoch": 0.11304970003852716, + "grad_norm": 0.9095364809036255, + "learning_rate": 9.928431544910567e-06, + "loss": 0.8757, + "step": 2054 + }, + { + "epoch": 0.11310473884088282, + "grad_norm": 0.8889689445495605, + "learning_rate": 9.9283584483662e-06, + "loss": 0.8583, + "step": 2055 + }, + { + "epoch": 0.11315977764323848, + "grad_norm": 0.8702652454376221, + "learning_rate": 9.928285314781607e-06, + "loss": 0.8414, + "step": 2056 + }, + { + "epoch": 0.11321481644559414, + "grad_norm": 0.8531168699264526, + "learning_rate": 9.928212144157342e-06, + "loss": 0.7844, + "step": 2057 + }, + { + "epoch": 0.1132698552479498, + "grad_norm": 1.0250271558761597, + "learning_rate": 9.928138936493956e-06, + "loss": 0.8766, + "step": 2058 + }, + { + "epoch": 0.11332489405030546, + "grad_norm": 0.7963449358940125, + "learning_rate": 9.928065691791996e-06, + "loss": 0.8166, + "step": 2059 + }, + { + "epoch": 0.11337993285266112, + "grad_norm": 1.1033011674880981, + "learning_rate": 9.927992410052013e-06, + "loss": 0.8748, + "step": 2060 + }, + { + "epoch": 0.11343497165501679, + "grad_norm": 0.8760959506034851, + "learning_rate": 9.927919091274558e-06, + "loss": 0.8623, + "step": 2061 + }, + { + "epoch": 0.11349001045737245, + "grad_norm": 1.1783028841018677, + "learning_rate": 9.927845735460182e-06, + "loss": 0.9144, + "step": 2062 + }, + { + "epoch": 0.11354504925972811, + "grad_norm": 0.8868625164031982, + "learning_rate": 9.927772342609437e-06, + "loss": 0.8614, + "step": 2063 + }, + { + "epoch": 0.11360008806208377, + "grad_norm": 0.8784704804420471, + "learning_rate": 9.927698912722874e-06, + "loss": 0.7802, + "step": 2064 + }, + { + "epoch": 0.11365512686443943, + "grad_norm": 1.0090643167495728, + "learning_rate": 9.927625445801046e-06, + "loss": 0.8876, + "step": 2065 + }, + { + "epoch": 0.1137101656667951, + "grad_norm": 0.7624390721321106, + "learning_rate": 9.927551941844502e-06, + "loss": 0.794, + "step": 2066 + }, + { + "epoch": 0.11376520446915075, + "grad_norm": 0.7814189791679382, + "learning_rate": 9.927478400853798e-06, + "loss": 0.8176, + "step": 2067 + }, + { + "epoch": 0.11382024327150642, + "grad_norm": 0.876338541507721, + "learning_rate": 9.927404822829486e-06, + "loss": 0.8634, + "step": 2068 + }, + { + "epoch": 0.11387528207386208, + "grad_norm": 0.7931430339813232, + "learning_rate": 9.927331207772117e-06, + "loss": 0.8012, + "step": 2069 + }, + { + "epoch": 0.11393032087621774, + "grad_norm": 1.0064504146575928, + "learning_rate": 9.927257555682246e-06, + "loss": 0.8321, + "step": 2070 + }, + { + "epoch": 0.1139853596785734, + "grad_norm": 0.8233053684234619, + "learning_rate": 9.927183866560425e-06, + "loss": 0.8004, + "step": 2071 + }, + { + "epoch": 0.11404039848092906, + "grad_norm": 1.0106632709503174, + "learning_rate": 9.927110140407211e-06, + "loss": 0.8627, + "step": 2072 + }, + { + "epoch": 0.11409543728328471, + "grad_norm": 0.8262843489646912, + "learning_rate": 9.927036377223155e-06, + "loss": 0.737, + "step": 2073 + }, + { + "epoch": 0.11415047608564037, + "grad_norm": 0.9349029660224915, + "learning_rate": 9.926962577008813e-06, + "loss": 0.9049, + "step": 2074 + }, + { + "epoch": 0.11420551488799603, + "grad_norm": 0.8689929842948914, + "learning_rate": 9.926888739764739e-06, + "loss": 0.7858, + "step": 2075 + }, + { + "epoch": 0.1142605536903517, + "grad_norm": 0.8442347645759583, + "learning_rate": 9.926814865491487e-06, + "loss": 0.8145, + "step": 2076 + }, + { + "epoch": 0.11431559249270735, + "grad_norm": 0.9143397212028503, + "learning_rate": 9.926740954189615e-06, + "loss": 0.8025, + "step": 2077 + }, + { + "epoch": 0.11437063129506302, + "grad_norm": 1.293251395225525, + "learning_rate": 9.926667005859676e-06, + "loss": 1.0256, + "step": 2078 + }, + { + "epoch": 0.11442567009741868, + "grad_norm": 0.9661351442337036, + "learning_rate": 9.926593020502226e-06, + "loss": 0.991, + "step": 2079 + }, + { + "epoch": 0.11448070889977434, + "grad_norm": 0.8110861778259277, + "learning_rate": 9.926518998117823e-06, + "loss": 0.7129, + "step": 2080 + }, + { + "epoch": 0.11453574770213, + "grad_norm": 0.8351119160652161, + "learning_rate": 9.92644493870702e-06, + "loss": 0.8894, + "step": 2081 + }, + { + "epoch": 0.11459078650448566, + "grad_norm": 0.8492733240127563, + "learning_rate": 9.926370842270377e-06, + "loss": 0.8039, + "step": 2082 + }, + { + "epoch": 0.11464582530684132, + "grad_norm": 0.895353376865387, + "learning_rate": 9.92629670880845e-06, + "loss": 0.8743, + "step": 2083 + }, + { + "epoch": 0.11470086410919698, + "grad_norm": 0.7871271967887878, + "learning_rate": 9.926222538321795e-06, + "loss": 0.8426, + "step": 2084 + }, + { + "epoch": 0.11475590291155265, + "grad_norm": 0.8904643058776855, + "learning_rate": 9.92614833081097e-06, + "loss": 0.8454, + "step": 2085 + }, + { + "epoch": 0.11481094171390831, + "grad_norm": 0.9166308641433716, + "learning_rate": 9.926074086276532e-06, + "loss": 0.9162, + "step": 2086 + }, + { + "epoch": 0.11486598051626397, + "grad_norm": 0.8730728626251221, + "learning_rate": 9.92599980471904e-06, + "loss": 0.8524, + "step": 2087 + }, + { + "epoch": 0.11492101931861963, + "grad_norm": 0.7932829260826111, + "learning_rate": 9.925925486139052e-06, + "loss": 0.7838, + "step": 2088 + }, + { + "epoch": 0.11497605812097529, + "grad_norm": 1.0033760070800781, + "learning_rate": 9.925851130537127e-06, + "loss": 0.8746, + "step": 2089 + }, + { + "epoch": 0.11503109692333095, + "grad_norm": 0.7783192992210388, + "learning_rate": 9.925776737913823e-06, + "loss": 0.7308, + "step": 2090 + }, + { + "epoch": 0.11508613572568661, + "grad_norm": 0.8441587686538696, + "learning_rate": 9.925702308269702e-06, + "loss": 0.7933, + "step": 2091 + }, + { + "epoch": 0.11514117452804228, + "grad_norm": 0.9433023929595947, + "learning_rate": 9.925627841605319e-06, + "loss": 0.7857, + "step": 2092 + }, + { + "epoch": 0.11519621333039794, + "grad_norm": 0.8958256244659424, + "learning_rate": 9.925553337921235e-06, + "loss": 0.9116, + "step": 2093 + }, + { + "epoch": 0.11525125213275358, + "grad_norm": 0.7610845565795898, + "learning_rate": 9.925478797218011e-06, + "loss": 0.8006, + "step": 2094 + }, + { + "epoch": 0.11530629093510925, + "grad_norm": 0.7977023720741272, + "learning_rate": 9.925404219496207e-06, + "loss": 0.8068, + "step": 2095 + }, + { + "epoch": 0.11536132973746491, + "grad_norm": 0.8087283372879028, + "learning_rate": 9.925329604756383e-06, + "loss": 0.7968, + "step": 2096 + }, + { + "epoch": 0.11541636853982057, + "grad_norm": 1.1066477298736572, + "learning_rate": 9.925254952999102e-06, + "loss": 0.8167, + "step": 2097 + }, + { + "epoch": 0.11547140734217623, + "grad_norm": 0.7806832194328308, + "learning_rate": 9.925180264224921e-06, + "loss": 0.8069, + "step": 2098 + }, + { + "epoch": 0.11552644614453189, + "grad_norm": 0.7745190858840942, + "learning_rate": 9.925105538434406e-06, + "loss": 0.7968, + "step": 2099 + }, + { + "epoch": 0.11558148494688755, + "grad_norm": 0.9045543074607849, + "learning_rate": 9.925030775628113e-06, + "loss": 0.8417, + "step": 2100 + }, + { + "epoch": 0.11563652374924321, + "grad_norm": 1.2962623834609985, + "learning_rate": 9.924955975806608e-06, + "loss": 0.8162, + "step": 2101 + }, + { + "epoch": 0.11569156255159888, + "grad_norm": 0.8571485877037048, + "learning_rate": 9.924881138970453e-06, + "loss": 0.8581, + "step": 2102 + }, + { + "epoch": 0.11574660135395454, + "grad_norm": 0.8326650857925415, + "learning_rate": 9.92480626512021e-06, + "loss": 0.8438, + "step": 2103 + }, + { + "epoch": 0.1158016401563102, + "grad_norm": 0.7973701357841492, + "learning_rate": 9.924731354256441e-06, + "loss": 0.8337, + "step": 2104 + }, + { + "epoch": 0.11585667895866586, + "grad_norm": 0.8614075779914856, + "learning_rate": 9.924656406379708e-06, + "loss": 0.8275, + "step": 2105 + }, + { + "epoch": 0.11591171776102152, + "grad_norm": 0.7911350131034851, + "learning_rate": 9.924581421490577e-06, + "loss": 0.8032, + "step": 2106 + }, + { + "epoch": 0.11596675656337718, + "grad_norm": 0.8763116598129272, + "learning_rate": 9.92450639958961e-06, + "loss": 0.8725, + "step": 2107 + }, + { + "epoch": 0.11602179536573284, + "grad_norm": 0.9754133224487305, + "learning_rate": 9.92443134067737e-06, + "loss": 0.9115, + "step": 2108 + }, + { + "epoch": 0.1160768341680885, + "grad_norm": 0.7783731818199158, + "learning_rate": 9.924356244754425e-06, + "loss": 0.8223, + "step": 2109 + }, + { + "epoch": 0.11613187297044417, + "grad_norm": 0.865301787853241, + "learning_rate": 9.924281111821335e-06, + "loss": 0.8053, + "step": 2110 + }, + { + "epoch": 0.11618691177279983, + "grad_norm": 0.8654297590255737, + "learning_rate": 9.924205941878666e-06, + "loss": 0.716, + "step": 2111 + }, + { + "epoch": 0.11624195057515549, + "grad_norm": 0.7646550536155701, + "learning_rate": 9.924130734926982e-06, + "loss": 0.8027, + "step": 2112 + }, + { + "epoch": 0.11629698937751115, + "grad_norm": 0.810587465763092, + "learning_rate": 9.924055490966851e-06, + "loss": 0.7416, + "step": 2113 + }, + { + "epoch": 0.11635202817986681, + "grad_norm": 0.8610082268714905, + "learning_rate": 9.923980209998838e-06, + "loss": 0.8527, + "step": 2114 + }, + { + "epoch": 0.11640706698222247, + "grad_norm": 0.8409233689308167, + "learning_rate": 9.923904892023506e-06, + "loss": 0.8169, + "step": 2115 + }, + { + "epoch": 0.11646210578457812, + "grad_norm": 0.7786587476730347, + "learning_rate": 9.923829537041425e-06, + "loss": 0.6897, + "step": 2116 + }, + { + "epoch": 0.11651714458693378, + "grad_norm": 0.852908730506897, + "learning_rate": 9.923754145053158e-06, + "loss": 0.7821, + "step": 2117 + }, + { + "epoch": 0.11657218338928944, + "grad_norm": 0.9130391478538513, + "learning_rate": 9.923678716059273e-06, + "loss": 1.0377, + "step": 2118 + }, + { + "epoch": 0.1166272221916451, + "grad_norm": 0.8371701240539551, + "learning_rate": 9.923603250060336e-06, + "loss": 0.8312, + "step": 2119 + }, + { + "epoch": 0.11668226099400077, + "grad_norm": 0.8045756220817566, + "learning_rate": 9.923527747056916e-06, + "loss": 0.7971, + "step": 2120 + }, + { + "epoch": 0.11673729979635643, + "grad_norm": 0.8832160234451294, + "learning_rate": 9.923452207049577e-06, + "loss": 0.7362, + "step": 2121 + }, + { + "epoch": 0.11679233859871209, + "grad_norm": 0.8253088593482971, + "learning_rate": 9.923376630038893e-06, + "loss": 0.8177, + "step": 2122 + }, + { + "epoch": 0.11684737740106775, + "grad_norm": 0.7953168749809265, + "learning_rate": 9.923301016025424e-06, + "loss": 0.7053, + "step": 2123 + }, + { + "epoch": 0.11690241620342341, + "grad_norm": 0.7256457805633545, + "learning_rate": 9.923225365009745e-06, + "loss": 0.7554, + "step": 2124 + }, + { + "epoch": 0.11695745500577907, + "grad_norm": 0.9896693229675293, + "learning_rate": 9.923149676992424e-06, + "loss": 0.8285, + "step": 2125 + }, + { + "epoch": 0.11701249380813473, + "grad_norm": 0.7846312522888184, + "learning_rate": 9.923073951974023e-06, + "loss": 0.7527, + "step": 2126 + }, + { + "epoch": 0.1170675326104904, + "grad_norm": 0.8949825167655945, + "learning_rate": 9.92299818995512e-06, + "loss": 0.8545, + "step": 2127 + }, + { + "epoch": 0.11712257141284606, + "grad_norm": 1.0023548603057861, + "learning_rate": 9.922922390936278e-06, + "loss": 0.7668, + "step": 2128 + }, + { + "epoch": 0.11717761021520172, + "grad_norm": 0.8663881421089172, + "learning_rate": 9.92284655491807e-06, + "loss": 0.8073, + "step": 2129 + }, + { + "epoch": 0.11723264901755738, + "grad_norm": 0.8274385929107666, + "learning_rate": 9.922770681901064e-06, + "loss": 0.9002, + "step": 2130 + }, + { + "epoch": 0.11728768781991304, + "grad_norm": 0.8508959412574768, + "learning_rate": 9.922694771885832e-06, + "loss": 0.9325, + "step": 2131 + }, + { + "epoch": 0.1173427266222687, + "grad_norm": 0.8176792860031128, + "learning_rate": 9.922618824872946e-06, + "loss": 0.8415, + "step": 2132 + }, + { + "epoch": 0.11739776542462436, + "grad_norm": 0.770951509475708, + "learning_rate": 9.922542840862971e-06, + "loss": 0.8051, + "step": 2133 + }, + { + "epoch": 0.11745280422698003, + "grad_norm": 0.8558167219161987, + "learning_rate": 9.922466819856484e-06, + "loss": 0.85, + "step": 2134 + }, + { + "epoch": 0.11750784302933569, + "grad_norm": 0.8288151025772095, + "learning_rate": 9.922390761854053e-06, + "loss": 0.8141, + "step": 2135 + }, + { + "epoch": 0.11756288183169135, + "grad_norm": 0.8220882415771484, + "learning_rate": 9.922314666856252e-06, + "loss": 0.8109, + "step": 2136 + }, + { + "epoch": 0.117617920634047, + "grad_norm": 0.7875000238418579, + "learning_rate": 9.92223853486365e-06, + "loss": 0.9085, + "step": 2137 + }, + { + "epoch": 0.11767295943640266, + "grad_norm": 0.8052374124526978, + "learning_rate": 9.922162365876822e-06, + "loss": 0.8785, + "step": 2138 + }, + { + "epoch": 0.11772799823875832, + "grad_norm": 1.0311180353164673, + "learning_rate": 9.922086159896338e-06, + "loss": 0.9112, + "step": 2139 + }, + { + "epoch": 0.11778303704111398, + "grad_norm": 0.943911075592041, + "learning_rate": 9.922009916922773e-06, + "loss": 0.8332, + "step": 2140 + }, + { + "epoch": 0.11783807584346964, + "grad_norm": 0.8156648278236389, + "learning_rate": 9.921933636956697e-06, + "loss": 0.8837, + "step": 2141 + }, + { + "epoch": 0.1178931146458253, + "grad_norm": 0.860292375087738, + "learning_rate": 9.921857319998688e-06, + "loss": 0.7963, + "step": 2142 + }, + { + "epoch": 0.11794815344818096, + "grad_norm": 0.8861456513404846, + "learning_rate": 9.921780966049315e-06, + "loss": 0.8335, + "step": 2143 + }, + { + "epoch": 0.11800319225053663, + "grad_norm": 0.793533205986023, + "learning_rate": 9.921704575109155e-06, + "loss": 0.7881, + "step": 2144 + }, + { + "epoch": 0.11805823105289229, + "grad_norm": 0.8039320111274719, + "learning_rate": 9.921628147178781e-06, + "loss": 0.8369, + "step": 2145 + }, + { + "epoch": 0.11811326985524795, + "grad_norm": 0.8785450458526611, + "learning_rate": 9.921551682258765e-06, + "loss": 0.7981, + "step": 2146 + }, + { + "epoch": 0.11816830865760361, + "grad_norm": 0.810251772403717, + "learning_rate": 9.921475180349687e-06, + "loss": 0.7926, + "step": 2147 + }, + { + "epoch": 0.11822334745995927, + "grad_norm": 0.8470801115036011, + "learning_rate": 9.921398641452117e-06, + "loss": 0.8061, + "step": 2148 + }, + { + "epoch": 0.11827838626231493, + "grad_norm": 0.8147469162940979, + "learning_rate": 9.921322065566633e-06, + "loss": 0.7906, + "step": 2149 + }, + { + "epoch": 0.1183334250646706, + "grad_norm": 0.8792327046394348, + "learning_rate": 9.92124545269381e-06, + "loss": 0.9025, + "step": 2150 + }, + { + "epoch": 0.11838846386702626, + "grad_norm": 0.794607400894165, + "learning_rate": 9.921168802834223e-06, + "loss": 0.8284, + "step": 2151 + }, + { + "epoch": 0.11844350266938192, + "grad_norm": 0.8601556420326233, + "learning_rate": 9.921092115988447e-06, + "loss": 0.8196, + "step": 2152 + }, + { + "epoch": 0.11849854147173758, + "grad_norm": 0.786967933177948, + "learning_rate": 9.921015392157062e-06, + "loss": 0.8744, + "step": 2153 + }, + { + "epoch": 0.11855358027409324, + "grad_norm": 0.8481432199478149, + "learning_rate": 9.920938631340641e-06, + "loss": 0.7206, + "step": 2154 + }, + { + "epoch": 0.1186086190764489, + "grad_norm": 0.8025142550468445, + "learning_rate": 9.920861833539765e-06, + "loss": 0.8126, + "step": 2155 + }, + { + "epoch": 0.11866365787880456, + "grad_norm": 0.9853057265281677, + "learning_rate": 9.920784998755006e-06, + "loss": 0.8883, + "step": 2156 + }, + { + "epoch": 0.11871869668116022, + "grad_norm": 1.0008476972579956, + "learning_rate": 9.920708126986947e-06, + "loss": 0.9326, + "step": 2157 + }, + { + "epoch": 0.11877373548351589, + "grad_norm": 0.837347686290741, + "learning_rate": 9.920631218236161e-06, + "loss": 0.9002, + "step": 2158 + }, + { + "epoch": 0.11882877428587153, + "grad_norm": 0.7866735458374023, + "learning_rate": 9.920554272503227e-06, + "loss": 0.765, + "step": 2159 + }, + { + "epoch": 0.1188838130882272, + "grad_norm": 0.8714935779571533, + "learning_rate": 9.920477289788726e-06, + "loss": 1.0294, + "step": 2160 + }, + { + "epoch": 0.11893885189058286, + "grad_norm": 1.0671826601028442, + "learning_rate": 9.920400270093234e-06, + "loss": 0.8341, + "step": 2161 + }, + { + "epoch": 0.11899389069293852, + "grad_norm": 0.8594604134559631, + "learning_rate": 9.92032321341733e-06, + "loss": 0.8731, + "step": 2162 + }, + { + "epoch": 0.11904892949529418, + "grad_norm": 0.8387738466262817, + "learning_rate": 9.920246119761597e-06, + "loss": 0.7898, + "step": 2163 + }, + { + "epoch": 0.11910396829764984, + "grad_norm": 0.8957195281982422, + "learning_rate": 9.920168989126608e-06, + "loss": 0.8475, + "step": 2164 + }, + { + "epoch": 0.1191590071000055, + "grad_norm": 0.8224207162857056, + "learning_rate": 9.920091821512948e-06, + "loss": 0.7944, + "step": 2165 + }, + { + "epoch": 0.11921404590236116, + "grad_norm": 1.0309031009674072, + "learning_rate": 9.920014616921192e-06, + "loss": 0.8992, + "step": 2166 + }, + { + "epoch": 0.11926908470471682, + "grad_norm": 0.7300832271575928, + "learning_rate": 9.919937375351925e-06, + "loss": 0.7016, + "step": 2167 + }, + { + "epoch": 0.11932412350707249, + "grad_norm": 0.7565537691116333, + "learning_rate": 9.919860096805724e-06, + "loss": 0.8113, + "step": 2168 + }, + { + "epoch": 0.11937916230942815, + "grad_norm": 1.0101505517959595, + "learning_rate": 9.919782781283174e-06, + "loss": 0.8765, + "step": 2169 + }, + { + "epoch": 0.11943420111178381, + "grad_norm": 0.8369461894035339, + "learning_rate": 9.919705428784852e-06, + "loss": 0.8248, + "step": 2170 + }, + { + "epoch": 0.11948923991413947, + "grad_norm": 0.8106105327606201, + "learning_rate": 9.919628039311342e-06, + "loss": 0.8585, + "step": 2171 + }, + { + "epoch": 0.11954427871649513, + "grad_norm": 0.7863745093345642, + "learning_rate": 9.919550612863224e-06, + "loss": 0.8393, + "step": 2172 + }, + { + "epoch": 0.11959931751885079, + "grad_norm": 0.8664719462394714, + "learning_rate": 9.919473149441081e-06, + "loss": 0.8882, + "step": 2173 + }, + { + "epoch": 0.11965435632120645, + "grad_norm": 0.6977574825286865, + "learning_rate": 9.919395649045494e-06, + "loss": 0.7264, + "step": 2174 + }, + { + "epoch": 0.11970939512356212, + "grad_norm": 0.8000102639198303, + "learning_rate": 9.919318111677045e-06, + "loss": 0.7828, + "step": 2175 + }, + { + "epoch": 0.11976443392591778, + "grad_norm": 0.868228018283844, + "learning_rate": 9.91924053733632e-06, + "loss": 0.7904, + "step": 2176 + }, + { + "epoch": 0.11981947272827344, + "grad_norm": 0.839080274105072, + "learning_rate": 9.9191629260239e-06, + "loss": 0.7663, + "step": 2177 + }, + { + "epoch": 0.1198745115306291, + "grad_norm": 0.8222747445106506, + "learning_rate": 9.919085277740366e-06, + "loss": 0.7208, + "step": 2178 + }, + { + "epoch": 0.11992955033298476, + "grad_norm": 1.4550986289978027, + "learning_rate": 9.919007592486304e-06, + "loss": 0.8154, + "step": 2179 + }, + { + "epoch": 0.11998458913534041, + "grad_norm": 0.9110257625579834, + "learning_rate": 9.9189298702623e-06, + "loss": 0.8134, + "step": 2180 + }, + { + "epoch": 0.12003962793769607, + "grad_norm": 0.84796142578125, + "learning_rate": 9.918852111068935e-06, + "loss": 0.8074, + "step": 2181 + }, + { + "epoch": 0.12009466674005173, + "grad_norm": 0.8134179711341858, + "learning_rate": 9.918774314906793e-06, + "loss": 0.6335, + "step": 2182 + }, + { + "epoch": 0.12014970554240739, + "grad_norm": 0.8481448888778687, + "learning_rate": 9.918696481776461e-06, + "loss": 0.8804, + "step": 2183 + }, + { + "epoch": 0.12020474434476305, + "grad_norm": 0.88057941198349, + "learning_rate": 9.918618611678523e-06, + "loss": 0.9326, + "step": 2184 + }, + { + "epoch": 0.12025978314711872, + "grad_norm": 0.8435977697372437, + "learning_rate": 9.918540704613564e-06, + "loss": 0.8141, + "step": 2185 + }, + { + "epoch": 0.12031482194947438, + "grad_norm": 0.8186982870101929, + "learning_rate": 9.918462760582169e-06, + "loss": 0.837, + "step": 2186 + }, + { + "epoch": 0.12036986075183004, + "grad_norm": 0.887783944606781, + "learning_rate": 9.918384779584924e-06, + "loss": 0.8062, + "step": 2187 + }, + { + "epoch": 0.1204248995541857, + "grad_norm": 0.9368415474891663, + "learning_rate": 9.918306761622417e-06, + "loss": 1.0098, + "step": 2188 + }, + { + "epoch": 0.12047993835654136, + "grad_norm": 0.8443986773490906, + "learning_rate": 9.918228706695232e-06, + "loss": 0.8178, + "step": 2189 + }, + { + "epoch": 0.12053497715889702, + "grad_norm": 0.7897284626960754, + "learning_rate": 9.918150614803956e-06, + "loss": 0.8013, + "step": 2190 + }, + { + "epoch": 0.12059001596125268, + "grad_norm": 0.886012077331543, + "learning_rate": 9.91807248594918e-06, + "loss": 0.8141, + "step": 2191 + }, + { + "epoch": 0.12064505476360834, + "grad_norm": 0.8585757613182068, + "learning_rate": 9.917994320131484e-06, + "loss": 0.8381, + "step": 2192 + }, + { + "epoch": 0.120700093565964, + "grad_norm": 1.6192269325256348, + "learning_rate": 9.917916117351459e-06, + "loss": 0.9082, + "step": 2193 + }, + { + "epoch": 0.12075513236831967, + "grad_norm": 1.160414457321167, + "learning_rate": 9.917837877609695e-06, + "loss": 0.8673, + "step": 2194 + }, + { + "epoch": 0.12081017117067533, + "grad_norm": 0.8363412022590637, + "learning_rate": 9.917759600906775e-06, + "loss": 0.816, + "step": 2195 + }, + { + "epoch": 0.12086520997303099, + "grad_norm": 0.8344097137451172, + "learning_rate": 9.917681287243292e-06, + "loss": 0.8629, + "step": 2196 + }, + { + "epoch": 0.12092024877538665, + "grad_norm": 0.9817582368850708, + "learning_rate": 9.917602936619834e-06, + "loss": 0.8106, + "step": 2197 + }, + { + "epoch": 0.12097528757774231, + "grad_norm": 0.8828088641166687, + "learning_rate": 9.917524549036987e-06, + "loss": 0.8465, + "step": 2198 + }, + { + "epoch": 0.12103032638009797, + "grad_norm": 0.8428277969360352, + "learning_rate": 9.917446124495344e-06, + "loss": 0.7721, + "step": 2199 + }, + { + "epoch": 0.12108536518245364, + "grad_norm": 0.8748664855957031, + "learning_rate": 9.917367662995489e-06, + "loss": 0.8679, + "step": 2200 + }, + { + "epoch": 0.1211404039848093, + "grad_norm": 0.8652347922325134, + "learning_rate": 9.917289164538018e-06, + "loss": 0.8906, + "step": 2201 + }, + { + "epoch": 0.12119544278716494, + "grad_norm": 1.157142162322998, + "learning_rate": 9.917210629123518e-06, + "loss": 0.9046, + "step": 2202 + }, + { + "epoch": 0.1212504815895206, + "grad_norm": 0.8186333179473877, + "learning_rate": 9.917132056752576e-06, + "loss": 0.8494, + "step": 2203 + }, + { + "epoch": 0.12130552039187627, + "grad_norm": 0.7769078612327576, + "learning_rate": 9.917053447425788e-06, + "loss": 0.8018, + "step": 2204 + }, + { + "epoch": 0.12136055919423193, + "grad_norm": 0.9190469980239868, + "learning_rate": 9.916974801143742e-06, + "loss": 0.8206, + "step": 2205 + }, + { + "epoch": 0.12141559799658759, + "grad_norm": 1.2200725078582764, + "learning_rate": 9.91689611790703e-06, + "loss": 0.9109, + "step": 2206 + }, + { + "epoch": 0.12147063679894325, + "grad_norm": 0.7902093529701233, + "learning_rate": 9.916817397716243e-06, + "loss": 0.8314, + "step": 2207 + }, + { + "epoch": 0.12152567560129891, + "grad_norm": 0.8160610198974609, + "learning_rate": 9.91673864057197e-06, + "loss": 0.8605, + "step": 2208 + }, + { + "epoch": 0.12158071440365457, + "grad_norm": 0.833163857460022, + "learning_rate": 9.916659846474807e-06, + "loss": 0.8125, + "step": 2209 + }, + { + "epoch": 0.12163575320601024, + "grad_norm": 0.776314377784729, + "learning_rate": 9.916581015425346e-06, + "loss": 0.8137, + "step": 2210 + }, + { + "epoch": 0.1216907920083659, + "grad_norm": 0.8525915145874023, + "learning_rate": 9.916502147424178e-06, + "loss": 0.8703, + "step": 2211 + }, + { + "epoch": 0.12174583081072156, + "grad_norm": 0.8268684148788452, + "learning_rate": 9.916423242471895e-06, + "loss": 0.7775, + "step": 2212 + }, + { + "epoch": 0.12180086961307722, + "grad_norm": 0.8717706799507141, + "learning_rate": 9.916344300569091e-06, + "loss": 0.8002, + "step": 2213 + }, + { + "epoch": 0.12185590841543288, + "grad_norm": 0.9499961137771606, + "learning_rate": 9.91626532171636e-06, + "loss": 0.8861, + "step": 2214 + }, + { + "epoch": 0.12191094721778854, + "grad_norm": 0.9521885514259338, + "learning_rate": 9.916186305914296e-06, + "loss": 0.7602, + "step": 2215 + }, + { + "epoch": 0.1219659860201442, + "grad_norm": 0.8945447206497192, + "learning_rate": 9.916107253163488e-06, + "loss": 0.8603, + "step": 2216 + }, + { + "epoch": 0.12202102482249987, + "grad_norm": 0.8232392072677612, + "learning_rate": 9.916028163464536e-06, + "loss": 0.8419, + "step": 2217 + }, + { + "epoch": 0.12207606362485553, + "grad_norm": 0.8183467984199524, + "learning_rate": 9.915949036818032e-06, + "loss": 0.9038, + "step": 2218 + }, + { + "epoch": 0.12213110242721119, + "grad_norm": 0.7805467247962952, + "learning_rate": 9.915869873224571e-06, + "loss": 0.7313, + "step": 2219 + }, + { + "epoch": 0.12218614122956685, + "grad_norm": 0.838101327419281, + "learning_rate": 9.915790672684749e-06, + "loss": 0.7973, + "step": 2220 + }, + { + "epoch": 0.12224118003192251, + "grad_norm": 0.7795171141624451, + "learning_rate": 9.915711435199158e-06, + "loss": 0.7796, + "step": 2221 + }, + { + "epoch": 0.12229621883427817, + "grad_norm": 0.7971234917640686, + "learning_rate": 9.915632160768398e-06, + "loss": 0.8309, + "step": 2222 + }, + { + "epoch": 0.12235125763663382, + "grad_norm": 0.8543851375579834, + "learning_rate": 9.915552849393061e-06, + "loss": 0.7826, + "step": 2223 + }, + { + "epoch": 0.12240629643898948, + "grad_norm": 0.9315086007118225, + "learning_rate": 9.915473501073744e-06, + "loss": 0.9294, + "step": 2224 + }, + { + "epoch": 0.12246133524134514, + "grad_norm": 0.8794427514076233, + "learning_rate": 9.915394115811046e-06, + "loss": 0.8968, + "step": 2225 + }, + { + "epoch": 0.1225163740437008, + "grad_norm": 0.9499204754829407, + "learning_rate": 9.91531469360556e-06, + "loss": 0.9841, + "step": 2226 + }, + { + "epoch": 0.12257141284605647, + "grad_norm": 0.9233788251876831, + "learning_rate": 9.915235234457885e-06, + "loss": 0.7794, + "step": 2227 + }, + { + "epoch": 0.12262645164841213, + "grad_norm": 0.8971870541572571, + "learning_rate": 9.915155738368618e-06, + "loss": 0.919, + "step": 2228 + }, + { + "epoch": 0.12268149045076779, + "grad_norm": 0.8122105002403259, + "learning_rate": 9.915076205338356e-06, + "loss": 0.8227, + "step": 2229 + }, + { + "epoch": 0.12273652925312345, + "grad_norm": 0.7878004908561707, + "learning_rate": 9.914996635367696e-06, + "loss": 0.7622, + "step": 2230 + }, + { + "epoch": 0.12279156805547911, + "grad_norm": 0.8229606747627258, + "learning_rate": 9.914917028457238e-06, + "loss": 0.8265, + "step": 2231 + }, + { + "epoch": 0.12284660685783477, + "grad_norm": 0.8972312808036804, + "learning_rate": 9.914837384607578e-06, + "loss": 0.8914, + "step": 2232 + }, + { + "epoch": 0.12290164566019043, + "grad_norm": 0.762922465801239, + "learning_rate": 9.914757703819318e-06, + "loss": 0.6853, + "step": 2233 + }, + { + "epoch": 0.1229566844625461, + "grad_norm": 0.8949442505836487, + "learning_rate": 9.914677986093054e-06, + "loss": 0.8303, + "step": 2234 + }, + { + "epoch": 0.12301172326490176, + "grad_norm": 1.0220820903778076, + "learning_rate": 9.914598231429384e-06, + "loss": 1.0027, + "step": 2235 + }, + { + "epoch": 0.12306676206725742, + "grad_norm": 0.8265436887741089, + "learning_rate": 9.914518439828911e-06, + "loss": 0.8317, + "step": 2236 + }, + { + "epoch": 0.12312180086961308, + "grad_norm": 0.780444324016571, + "learning_rate": 9.914438611292231e-06, + "loss": 0.756, + "step": 2237 + }, + { + "epoch": 0.12317683967196874, + "grad_norm": 0.8569482564926147, + "learning_rate": 9.914358745819948e-06, + "loss": 0.8126, + "step": 2238 + }, + { + "epoch": 0.1232318784743244, + "grad_norm": 0.8167145848274231, + "learning_rate": 9.91427884341266e-06, + "loss": 0.8345, + "step": 2239 + }, + { + "epoch": 0.12328691727668006, + "grad_norm": 0.7915990948677063, + "learning_rate": 9.914198904070967e-06, + "loss": 0.7416, + "step": 2240 + }, + { + "epoch": 0.12334195607903573, + "grad_norm": 0.8568083047866821, + "learning_rate": 9.91411892779547e-06, + "loss": 0.8329, + "step": 2241 + }, + { + "epoch": 0.12339699488139139, + "grad_norm": 1.1727303266525269, + "learning_rate": 9.914038914586772e-06, + "loss": 0.8421, + "step": 2242 + }, + { + "epoch": 0.12345203368374705, + "grad_norm": 0.8706398010253906, + "learning_rate": 9.913958864445472e-06, + "loss": 0.9013, + "step": 2243 + }, + { + "epoch": 0.12350707248610271, + "grad_norm": 0.8376144170761108, + "learning_rate": 9.913878777372173e-06, + "loss": 0.8456, + "step": 2244 + }, + { + "epoch": 0.12356211128845836, + "grad_norm": 0.8388974070549011, + "learning_rate": 9.913798653367478e-06, + "loss": 0.787, + "step": 2245 + }, + { + "epoch": 0.12361715009081402, + "grad_norm": 0.8625446557998657, + "learning_rate": 9.913718492431984e-06, + "loss": 0.7758, + "step": 2246 + }, + { + "epoch": 0.12367218889316968, + "grad_norm": 0.8805570006370544, + "learning_rate": 9.913638294566299e-06, + "loss": 0.8755, + "step": 2247 + }, + { + "epoch": 0.12372722769552534, + "grad_norm": 0.8102611899375916, + "learning_rate": 9.913558059771025e-06, + "loss": 0.8495, + "step": 2248 + }, + { + "epoch": 0.123782266497881, + "grad_norm": 0.8506311774253845, + "learning_rate": 9.913477788046762e-06, + "loss": 0.7413, + "step": 2249 + }, + { + "epoch": 0.12383730530023666, + "grad_norm": 1.0789196491241455, + "learning_rate": 9.913397479394116e-06, + "loss": 0.8993, + "step": 2250 + }, + { + "epoch": 0.12389234410259232, + "grad_norm": 1.5664849281311035, + "learning_rate": 9.91331713381369e-06, + "loss": 0.8322, + "step": 2251 + }, + { + "epoch": 0.12394738290494799, + "grad_norm": 1.1347390413284302, + "learning_rate": 9.913236751306085e-06, + "loss": 0.8756, + "step": 2252 + }, + { + "epoch": 0.12400242170730365, + "grad_norm": 0.8111063241958618, + "learning_rate": 9.913156331871911e-06, + "loss": 0.831, + "step": 2253 + }, + { + "epoch": 0.12405746050965931, + "grad_norm": 0.817812979221344, + "learning_rate": 9.913075875511769e-06, + "loss": 0.8531, + "step": 2254 + }, + { + "epoch": 0.12411249931201497, + "grad_norm": 0.7678318619728088, + "learning_rate": 9.912995382226263e-06, + "loss": 0.8028, + "step": 2255 + }, + { + "epoch": 0.12416753811437063, + "grad_norm": 0.8207805156707764, + "learning_rate": 9.912914852015998e-06, + "loss": 0.8856, + "step": 2256 + }, + { + "epoch": 0.1242225769167263, + "grad_norm": 0.978484570980072, + "learning_rate": 9.912834284881582e-06, + "loss": 0.933, + "step": 2257 + }, + { + "epoch": 0.12427761571908195, + "grad_norm": 0.9215858578681946, + "learning_rate": 9.912753680823617e-06, + "loss": 0.7771, + "step": 2258 + }, + { + "epoch": 0.12433265452143762, + "grad_norm": 0.8542179465293884, + "learning_rate": 9.91267303984271e-06, + "loss": 0.8652, + "step": 2259 + }, + { + "epoch": 0.12438769332379328, + "grad_norm": 0.7985575199127197, + "learning_rate": 9.912592361939469e-06, + "loss": 0.7011, + "step": 2260 + }, + { + "epoch": 0.12444273212614894, + "grad_norm": 0.8868670463562012, + "learning_rate": 9.912511647114498e-06, + "loss": 0.8222, + "step": 2261 + }, + { + "epoch": 0.1244977709285046, + "grad_norm": 0.7966209650039673, + "learning_rate": 9.912430895368405e-06, + "loss": 0.776, + "step": 2262 + }, + { + "epoch": 0.12455280973086026, + "grad_norm": 0.7844830751419067, + "learning_rate": 9.912350106701796e-06, + "loss": 0.7513, + "step": 2263 + }, + { + "epoch": 0.12460784853321592, + "grad_norm": 0.7788559794425964, + "learning_rate": 9.912269281115278e-06, + "loss": 0.8517, + "step": 2264 + }, + { + "epoch": 0.12466288733557158, + "grad_norm": 0.778225839138031, + "learning_rate": 9.912188418609461e-06, + "loss": 0.7504, + "step": 2265 + }, + { + "epoch": 0.12471792613792723, + "grad_norm": 0.7955968976020813, + "learning_rate": 9.912107519184947e-06, + "loss": 0.8152, + "step": 2266 + }, + { + "epoch": 0.1247729649402829, + "grad_norm": 1.1202566623687744, + "learning_rate": 9.912026582842352e-06, + "loss": 0.9325, + "step": 2267 + }, + { + "epoch": 0.12482800374263855, + "grad_norm": 0.9762749671936035, + "learning_rate": 9.911945609582279e-06, + "loss": 0.9027, + "step": 2268 + }, + { + "epoch": 0.12488304254499422, + "grad_norm": 0.8311051726341248, + "learning_rate": 9.911864599405336e-06, + "loss": 0.838, + "step": 2269 + }, + { + "epoch": 0.12493808134734988, + "grad_norm": 1.0136815309524536, + "learning_rate": 9.911783552312134e-06, + "loss": 0.9288, + "step": 2270 + }, + { + "epoch": 0.12499312014970554, + "grad_norm": 0.7960494160652161, + "learning_rate": 9.911702468303282e-06, + "loss": 0.8007, + "step": 2271 + }, + { + "epoch": 0.1250481589520612, + "grad_norm": 0.9980880618095398, + "learning_rate": 9.911621347379388e-06, + "loss": 0.8613, + "step": 2272 + }, + { + "epoch": 0.12510319775441686, + "grad_norm": 0.8916807770729065, + "learning_rate": 9.911540189541065e-06, + "loss": 0.8783, + "step": 2273 + }, + { + "epoch": 0.12515823655677252, + "grad_norm": 0.9455892443656921, + "learning_rate": 9.911458994788919e-06, + "loss": 0.8676, + "step": 2274 + }, + { + "epoch": 0.12521327535912818, + "grad_norm": 0.7649906277656555, + "learning_rate": 9.911377763123561e-06, + "loss": 0.7763, + "step": 2275 + }, + { + "epoch": 0.12526831416148385, + "grad_norm": 0.8971202373504639, + "learning_rate": 9.911296494545604e-06, + "loss": 0.9022, + "step": 2276 + }, + { + "epoch": 0.1253233529638395, + "grad_norm": 0.833678126335144, + "learning_rate": 9.911215189055657e-06, + "loss": 0.8401, + "step": 2277 + }, + { + "epoch": 0.12537839176619517, + "grad_norm": 0.8967958688735962, + "learning_rate": 9.911133846654331e-06, + "loss": 0.8678, + "step": 2278 + }, + { + "epoch": 0.12543343056855083, + "grad_norm": 0.8195546865463257, + "learning_rate": 9.911052467342239e-06, + "loss": 0.842, + "step": 2279 + }, + { + "epoch": 0.1254884693709065, + "grad_norm": 1.095815896987915, + "learning_rate": 9.910971051119988e-06, + "loss": 0.845, + "step": 2280 + }, + { + "epoch": 0.12554350817326215, + "grad_norm": 0.9452629685401917, + "learning_rate": 9.910889597988197e-06, + "loss": 0.8971, + "step": 2281 + }, + { + "epoch": 0.12559854697561781, + "grad_norm": 0.9872332215309143, + "learning_rate": 9.910808107947471e-06, + "loss": 0.7994, + "step": 2282 + }, + { + "epoch": 0.12565358577797348, + "grad_norm": 0.7761966586112976, + "learning_rate": 9.910726580998427e-06, + "loss": 0.7791, + "step": 2283 + }, + { + "epoch": 0.12570862458032914, + "grad_norm": 0.8950315713882446, + "learning_rate": 9.910645017141678e-06, + "loss": 0.8499, + "step": 2284 + }, + { + "epoch": 0.1257636633826848, + "grad_norm": 0.8796371221542358, + "learning_rate": 9.910563416377834e-06, + "loss": 0.8587, + "step": 2285 + }, + { + "epoch": 0.12581870218504046, + "grad_norm": 0.8291982412338257, + "learning_rate": 9.91048177870751e-06, + "loss": 0.9166, + "step": 2286 + }, + { + "epoch": 0.12587374098739612, + "grad_norm": 0.758369505405426, + "learning_rate": 9.91040010413132e-06, + "loss": 0.8305, + "step": 2287 + }, + { + "epoch": 0.12592877978975178, + "grad_norm": 0.8775640726089478, + "learning_rate": 9.910318392649876e-06, + "loss": 0.8513, + "step": 2288 + }, + { + "epoch": 0.12598381859210744, + "grad_norm": 0.8581671118736267, + "learning_rate": 9.910236644263796e-06, + "loss": 0.8134, + "step": 2289 + }, + { + "epoch": 0.1260388573944631, + "grad_norm": 0.8570736050605774, + "learning_rate": 9.910154858973689e-06, + "loss": 0.826, + "step": 2290 + }, + { + "epoch": 0.12609389619681877, + "grad_norm": 0.8712487816810608, + "learning_rate": 9.910073036780173e-06, + "loss": 0.8042, + "step": 2291 + }, + { + "epoch": 0.12614893499917443, + "grad_norm": 0.7584837675094604, + "learning_rate": 9.909991177683862e-06, + "loss": 0.7715, + "step": 2292 + }, + { + "epoch": 0.1262039738015301, + "grad_norm": 0.8618917465209961, + "learning_rate": 9.909909281685373e-06, + "loss": 0.8755, + "step": 2293 + }, + { + "epoch": 0.12625901260388575, + "grad_norm": 0.9530277848243713, + "learning_rate": 9.90982734878532e-06, + "loss": 0.8538, + "step": 2294 + }, + { + "epoch": 0.1263140514062414, + "grad_norm": 0.8394436836242676, + "learning_rate": 9.909745378984319e-06, + "loss": 0.8401, + "step": 2295 + }, + { + "epoch": 0.12636909020859707, + "grad_norm": 0.8224034309387207, + "learning_rate": 9.909663372282984e-06, + "loss": 0.7201, + "step": 2296 + }, + { + "epoch": 0.12642412901095273, + "grad_norm": 0.8215349912643433, + "learning_rate": 9.909581328681934e-06, + "loss": 0.8824, + "step": 2297 + }, + { + "epoch": 0.12647916781330837, + "grad_norm": 0.839389443397522, + "learning_rate": 9.909499248181786e-06, + "loss": 0.8056, + "step": 2298 + }, + { + "epoch": 0.12653420661566403, + "grad_norm": 0.9440048933029175, + "learning_rate": 9.909417130783156e-06, + "loss": 0.908, + "step": 2299 + }, + { + "epoch": 0.1265892454180197, + "grad_norm": 0.8336486220359802, + "learning_rate": 9.90933497648666e-06, + "loss": 0.8382, + "step": 2300 + }, + { + "epoch": 0.12664428422037535, + "grad_norm": 1.1541366577148438, + "learning_rate": 9.909252785292918e-06, + "loss": 0.8782, + "step": 2301 + }, + { + "epoch": 0.12669932302273101, + "grad_norm": 0.8730320334434509, + "learning_rate": 9.909170557202545e-06, + "loss": 0.7687, + "step": 2302 + }, + { + "epoch": 0.12675436182508668, + "grad_norm": 0.9927527904510498, + "learning_rate": 9.90908829221616e-06, + "loss": 0.8134, + "step": 2303 + }, + { + "epoch": 0.12680940062744234, + "grad_norm": 0.9521791338920593, + "learning_rate": 9.909005990334381e-06, + "loss": 0.9187, + "step": 2304 + }, + { + "epoch": 0.126864439429798, + "grad_norm": 0.8012455701828003, + "learning_rate": 9.908923651557828e-06, + "loss": 0.8581, + "step": 2305 + }, + { + "epoch": 0.12691947823215366, + "grad_norm": 0.8882689476013184, + "learning_rate": 9.90884127588712e-06, + "loss": 0.9317, + "step": 2306 + }, + { + "epoch": 0.12697451703450932, + "grad_norm": 0.8408340215682983, + "learning_rate": 9.908758863322872e-06, + "loss": 0.8444, + "step": 2307 + }, + { + "epoch": 0.12702955583686498, + "grad_norm": 0.7856307029724121, + "learning_rate": 9.908676413865709e-06, + "loss": 0.8457, + "step": 2308 + }, + { + "epoch": 0.12708459463922064, + "grad_norm": 0.9459167718887329, + "learning_rate": 9.908593927516247e-06, + "loss": 0.8153, + "step": 2309 + }, + { + "epoch": 0.1271396334415763, + "grad_norm": 0.8629655838012695, + "learning_rate": 9.908511404275107e-06, + "loss": 0.8279, + "step": 2310 + }, + { + "epoch": 0.12719467224393197, + "grad_norm": 1.2012875080108643, + "learning_rate": 9.90842884414291e-06, + "loss": 1.4388, + "step": 2311 + }, + { + "epoch": 0.12724971104628763, + "grad_norm": 1.20725417137146, + "learning_rate": 9.908346247120274e-06, + "loss": 0.8704, + "step": 2312 + }, + { + "epoch": 0.1273047498486433, + "grad_norm": 0.8152929544448853, + "learning_rate": 9.908263613207822e-06, + "loss": 0.8618, + "step": 2313 + }, + { + "epoch": 0.12735978865099895, + "grad_norm": 0.8400965332984924, + "learning_rate": 9.908180942406175e-06, + "loss": 0.7881, + "step": 2314 + }, + { + "epoch": 0.1274148274533546, + "grad_norm": 0.8856974840164185, + "learning_rate": 9.908098234715956e-06, + "loss": 0.9073, + "step": 2315 + }, + { + "epoch": 0.12746986625571027, + "grad_norm": 0.8708439469337463, + "learning_rate": 9.908015490137782e-06, + "loss": 0.8099, + "step": 2316 + }, + { + "epoch": 0.12752490505806593, + "grad_norm": 0.8632444143295288, + "learning_rate": 9.907932708672277e-06, + "loss": 0.8472, + "step": 2317 + }, + { + "epoch": 0.1275799438604216, + "grad_norm": 0.8977149128913879, + "learning_rate": 9.907849890320062e-06, + "loss": 0.8878, + "step": 2318 + }, + { + "epoch": 0.12763498266277726, + "grad_norm": 0.8589425086975098, + "learning_rate": 9.907767035081765e-06, + "loss": 0.7905, + "step": 2319 + }, + { + "epoch": 0.12769002146513292, + "grad_norm": 0.9873501062393188, + "learning_rate": 9.907684142958002e-06, + "loss": 0.9002, + "step": 2320 + }, + { + "epoch": 0.12774506026748858, + "grad_norm": 0.8963840007781982, + "learning_rate": 9.9076012139494e-06, + "loss": 0.92, + "step": 2321 + }, + { + "epoch": 0.12780009906984424, + "grad_norm": 0.7933574318885803, + "learning_rate": 9.90751824805658e-06, + "loss": 0.7664, + "step": 2322 + }, + { + "epoch": 0.1278551378721999, + "grad_norm": 0.9660933017730713, + "learning_rate": 9.907435245280167e-06, + "loss": 0.9162, + "step": 2323 + }, + { + "epoch": 0.12791017667455556, + "grad_norm": 0.8698949217796326, + "learning_rate": 9.907352205620783e-06, + "loss": 0.7988, + "step": 2324 + }, + { + "epoch": 0.12796521547691123, + "grad_norm": 0.9077615141868591, + "learning_rate": 9.907269129079055e-06, + "loss": 0.8581, + "step": 2325 + }, + { + "epoch": 0.1280202542792669, + "grad_norm": 0.9128179550170898, + "learning_rate": 9.907186015655607e-06, + "loss": 0.8552, + "step": 2326 + }, + { + "epoch": 0.12807529308162255, + "grad_norm": 0.9321265816688538, + "learning_rate": 9.907102865351062e-06, + "loss": 0.889, + "step": 2327 + }, + { + "epoch": 0.1281303318839782, + "grad_norm": 0.9687464833259583, + "learning_rate": 9.907019678166044e-06, + "loss": 0.7944, + "step": 2328 + }, + { + "epoch": 0.12818537068633387, + "grad_norm": 0.862223207950592, + "learning_rate": 9.90693645410118e-06, + "loss": 0.7699, + "step": 2329 + }, + { + "epoch": 0.12824040948868953, + "grad_norm": 0.9662127494812012, + "learning_rate": 9.906853193157095e-06, + "loss": 0.7818, + "step": 2330 + }, + { + "epoch": 0.1282954482910452, + "grad_norm": 0.8008295297622681, + "learning_rate": 9.906769895334413e-06, + "loss": 0.8443, + "step": 2331 + }, + { + "epoch": 0.12835048709340086, + "grad_norm": 0.8638464212417603, + "learning_rate": 9.906686560633765e-06, + "loss": 0.8438, + "step": 2332 + }, + { + "epoch": 0.12840552589575652, + "grad_norm": 0.9215866327285767, + "learning_rate": 9.906603189055773e-06, + "loss": 0.7481, + "step": 2333 + }, + { + "epoch": 0.12846056469811218, + "grad_norm": 0.7926739454269409, + "learning_rate": 9.906519780601066e-06, + "loss": 0.7404, + "step": 2334 + }, + { + "epoch": 0.12851560350046784, + "grad_norm": 0.9590242505073547, + "learning_rate": 9.906436335270268e-06, + "loss": 0.8319, + "step": 2335 + }, + { + "epoch": 0.1285706423028235, + "grad_norm": 1.0300076007843018, + "learning_rate": 9.906352853064009e-06, + "loss": 0.8635, + "step": 2336 + }, + { + "epoch": 0.12862568110517916, + "grad_norm": 0.8401443958282471, + "learning_rate": 9.906269333982915e-06, + "loss": 0.9584, + "step": 2337 + }, + { + "epoch": 0.12868071990753482, + "grad_norm": 0.8144069910049438, + "learning_rate": 9.906185778027613e-06, + "loss": 0.7375, + "step": 2338 + }, + { + "epoch": 0.12873575870989049, + "grad_norm": 0.8513948917388916, + "learning_rate": 9.906102185198733e-06, + "loss": 0.8353, + "step": 2339 + }, + { + "epoch": 0.12879079751224615, + "grad_norm": 0.8243077397346497, + "learning_rate": 9.906018555496903e-06, + "loss": 0.8665, + "step": 2340 + }, + { + "epoch": 0.12884583631460178, + "grad_norm": 0.8699066042900085, + "learning_rate": 9.905934888922749e-06, + "loss": 0.8537, + "step": 2341 + }, + { + "epoch": 0.12890087511695744, + "grad_norm": 1.0980210304260254, + "learning_rate": 9.905851185476902e-06, + "loss": 0.8887, + "step": 2342 + }, + { + "epoch": 0.1289559139193131, + "grad_norm": 0.8189190030097961, + "learning_rate": 9.905767445159992e-06, + "loss": 0.8467, + "step": 2343 + }, + { + "epoch": 0.12901095272166876, + "grad_norm": 0.8273541331291199, + "learning_rate": 9.905683667972645e-06, + "loss": 0.8701, + "step": 2344 + }, + { + "epoch": 0.12906599152402443, + "grad_norm": 0.8987969160079956, + "learning_rate": 9.905599853915496e-06, + "loss": 0.909, + "step": 2345 + }, + { + "epoch": 0.1291210303263801, + "grad_norm": 0.818268895149231, + "learning_rate": 9.905516002989168e-06, + "loss": 0.7946, + "step": 2346 + }, + { + "epoch": 0.12917606912873575, + "grad_norm": 0.7401725053787231, + "learning_rate": 9.905432115194296e-06, + "loss": 0.7006, + "step": 2347 + }, + { + "epoch": 0.1292311079310914, + "grad_norm": 0.8263179659843445, + "learning_rate": 9.905348190531511e-06, + "loss": 0.7768, + "step": 2348 + }, + { + "epoch": 0.12928614673344707, + "grad_norm": 0.9241918921470642, + "learning_rate": 9.90526422900144e-06, + "loss": 0.8593, + "step": 2349 + }, + { + "epoch": 0.12934118553580273, + "grad_norm": 0.7804501056671143, + "learning_rate": 9.905180230604718e-06, + "loss": 0.7607, + "step": 2350 + }, + { + "epoch": 0.1293962243381584, + "grad_norm": 0.9408491253852844, + "learning_rate": 9.905096195341973e-06, + "loss": 0.8906, + "step": 2351 + }, + { + "epoch": 0.12945126314051406, + "grad_norm": 1.0356301069259644, + "learning_rate": 9.905012123213838e-06, + "loss": 0.8051, + "step": 2352 + }, + { + "epoch": 0.12950630194286972, + "grad_norm": 0.8546886444091797, + "learning_rate": 9.904928014220945e-06, + "loss": 0.7543, + "step": 2353 + }, + { + "epoch": 0.12956134074522538, + "grad_norm": 0.9229897856712341, + "learning_rate": 9.904843868363927e-06, + "loss": 0.8823, + "step": 2354 + }, + { + "epoch": 0.12961637954758104, + "grad_norm": 0.8364199995994568, + "learning_rate": 9.904759685643414e-06, + "loss": 0.8825, + "step": 2355 + }, + { + "epoch": 0.1296714183499367, + "grad_norm": 0.9092077016830444, + "learning_rate": 9.90467546606004e-06, + "loss": 0.8721, + "step": 2356 + }, + { + "epoch": 0.12972645715229236, + "grad_norm": 1.042973518371582, + "learning_rate": 9.904591209614441e-06, + "loss": 0.7984, + "step": 2357 + }, + { + "epoch": 0.12978149595464802, + "grad_norm": 0.7262618541717529, + "learning_rate": 9.904506916307243e-06, + "loss": 0.6721, + "step": 2358 + }, + { + "epoch": 0.12983653475700369, + "grad_norm": 0.7562826871871948, + "learning_rate": 9.904422586139086e-06, + "loss": 0.7702, + "step": 2359 + }, + { + "epoch": 0.12989157355935935, + "grad_norm": 0.8821595907211304, + "learning_rate": 9.904338219110603e-06, + "loss": 0.8555, + "step": 2360 + }, + { + "epoch": 0.129946612361715, + "grad_norm": 1.0340098142623901, + "learning_rate": 9.904253815222424e-06, + "loss": 0.9004, + "step": 2361 + }, + { + "epoch": 0.13000165116407067, + "grad_norm": 0.8533693552017212, + "learning_rate": 9.904169374475188e-06, + "loss": 0.836, + "step": 2362 + }, + { + "epoch": 0.13005668996642633, + "grad_norm": 0.8564199805259705, + "learning_rate": 9.904084896869528e-06, + "loss": 0.9281, + "step": 2363 + }, + { + "epoch": 0.130111728768782, + "grad_norm": 0.7817538976669312, + "learning_rate": 9.904000382406079e-06, + "loss": 0.7444, + "step": 2364 + }, + { + "epoch": 0.13016676757113765, + "grad_norm": 1.1420893669128418, + "learning_rate": 9.903915831085473e-06, + "loss": 0.9116, + "step": 2365 + }, + { + "epoch": 0.13022180637349332, + "grad_norm": 0.9671920537948608, + "learning_rate": 9.903831242908351e-06, + "loss": 0.899, + "step": 2366 + }, + { + "epoch": 0.13027684517584898, + "grad_norm": 0.8528717756271362, + "learning_rate": 9.903746617875345e-06, + "loss": 0.7231, + "step": 2367 + }, + { + "epoch": 0.13033188397820464, + "grad_norm": 0.786960244178772, + "learning_rate": 9.903661955987091e-06, + "loss": 0.7997, + "step": 2368 + }, + { + "epoch": 0.1303869227805603, + "grad_norm": 0.941683292388916, + "learning_rate": 9.903577257244228e-06, + "loss": 0.9127, + "step": 2369 + }, + { + "epoch": 0.13044196158291596, + "grad_norm": 0.886900007724762, + "learning_rate": 9.903492521647391e-06, + "loss": 0.9086, + "step": 2370 + }, + { + "epoch": 0.13049700038527162, + "grad_norm": 0.9924801588058472, + "learning_rate": 9.903407749197216e-06, + "loss": 0.9055, + "step": 2371 + }, + { + "epoch": 0.13055203918762728, + "grad_norm": 0.6998724341392517, + "learning_rate": 9.903322939894342e-06, + "loss": 0.6972, + "step": 2372 + }, + { + "epoch": 0.13060707798998294, + "grad_norm": 0.8448702096939087, + "learning_rate": 9.903238093739404e-06, + "loss": 0.7862, + "step": 2373 + }, + { + "epoch": 0.1306621167923386, + "grad_norm": 0.8557441830635071, + "learning_rate": 9.90315321073304e-06, + "loss": 0.8364, + "step": 2374 + }, + { + "epoch": 0.13071715559469427, + "grad_norm": 0.7978441119194031, + "learning_rate": 9.903068290875892e-06, + "loss": 0.7671, + "step": 2375 + }, + { + "epoch": 0.13077219439704993, + "grad_norm": 0.781315803527832, + "learning_rate": 9.902983334168594e-06, + "loss": 0.7963, + "step": 2376 + }, + { + "epoch": 0.1308272331994056, + "grad_norm": 0.7326155304908752, + "learning_rate": 9.902898340611785e-06, + "loss": 0.8, + "step": 2377 + }, + { + "epoch": 0.13088227200176125, + "grad_norm": 0.7693139314651489, + "learning_rate": 9.902813310206105e-06, + "loss": 0.8459, + "step": 2378 + }, + { + "epoch": 0.1309373108041169, + "grad_norm": 0.9441308975219727, + "learning_rate": 9.902728242952191e-06, + "loss": 0.8519, + "step": 2379 + }, + { + "epoch": 0.13099234960647257, + "grad_norm": 0.8350616693496704, + "learning_rate": 9.902643138850686e-06, + "loss": 0.876, + "step": 2380 + }, + { + "epoch": 0.13104738840882824, + "grad_norm": 0.8675554394721985, + "learning_rate": 9.902557997902227e-06, + "loss": 0.8172, + "step": 2381 + }, + { + "epoch": 0.1311024272111839, + "grad_norm": 0.9618930220603943, + "learning_rate": 9.902472820107454e-06, + "loss": 0.8852, + "step": 2382 + }, + { + "epoch": 0.13115746601353956, + "grad_norm": 0.862341046333313, + "learning_rate": 9.902387605467007e-06, + "loss": 0.9256, + "step": 2383 + }, + { + "epoch": 0.1312125048158952, + "grad_norm": 0.8749859929084778, + "learning_rate": 9.902302353981527e-06, + "loss": 0.8809, + "step": 2384 + }, + { + "epoch": 0.13126754361825085, + "grad_norm": 0.9061958193778992, + "learning_rate": 9.902217065651657e-06, + "loss": 0.779, + "step": 2385 + }, + { + "epoch": 0.13132258242060652, + "grad_norm": 0.8909298777580261, + "learning_rate": 9.902131740478033e-06, + "loss": 0.8203, + "step": 2386 + }, + { + "epoch": 0.13137762122296218, + "grad_norm": 0.8507269024848938, + "learning_rate": 9.902046378461302e-06, + "loss": 0.776, + "step": 2387 + }, + { + "epoch": 0.13143266002531784, + "grad_norm": 0.9577299356460571, + "learning_rate": 9.901960979602101e-06, + "loss": 0.8104, + "step": 2388 + }, + { + "epoch": 0.1314876988276735, + "grad_norm": 0.9244948625564575, + "learning_rate": 9.901875543901074e-06, + "loss": 0.9035, + "step": 2389 + }, + { + "epoch": 0.13154273763002916, + "grad_norm": 0.7534334063529968, + "learning_rate": 9.901790071358861e-06, + "loss": 0.7262, + "step": 2390 + }, + { + "epoch": 0.13159777643238482, + "grad_norm": 0.8920090198516846, + "learning_rate": 9.901704561976106e-06, + "loss": 0.932, + "step": 2391 + }, + { + "epoch": 0.13165281523474048, + "grad_norm": 0.8524243235588074, + "learning_rate": 9.901619015753455e-06, + "loss": 0.8107, + "step": 2392 + }, + { + "epoch": 0.13170785403709614, + "grad_norm": 0.8170381784439087, + "learning_rate": 9.901533432691543e-06, + "loss": 0.8814, + "step": 2393 + }, + { + "epoch": 0.1317628928394518, + "grad_norm": 0.8281697034835815, + "learning_rate": 9.90144781279102e-06, + "loss": 0.8221, + "step": 2394 + }, + { + "epoch": 0.13181793164180747, + "grad_norm": 0.9283351302146912, + "learning_rate": 9.901362156052528e-06, + "loss": 0.8346, + "step": 2395 + }, + { + "epoch": 0.13187297044416313, + "grad_norm": 0.8331275582313538, + "learning_rate": 9.901276462476708e-06, + "loss": 0.7498, + "step": 2396 + }, + { + "epoch": 0.1319280092465188, + "grad_norm": 0.8427191972732544, + "learning_rate": 9.901190732064207e-06, + "loss": 0.8265, + "step": 2397 + }, + { + "epoch": 0.13198304804887445, + "grad_norm": 0.8510351777076721, + "learning_rate": 9.901104964815669e-06, + "loss": 0.8369, + "step": 2398 + }, + { + "epoch": 0.1320380868512301, + "grad_norm": 0.8468914031982422, + "learning_rate": 9.901019160731738e-06, + "loss": 0.8585, + "step": 2399 + }, + { + "epoch": 0.13209312565358577, + "grad_norm": 0.8302182555198669, + "learning_rate": 9.900933319813058e-06, + "loss": 0.8611, + "step": 2400 + }, + { + "epoch": 0.13214816445594144, + "grad_norm": 0.8527448773384094, + "learning_rate": 9.900847442060277e-06, + "loss": 0.899, + "step": 2401 + }, + { + "epoch": 0.1322032032582971, + "grad_norm": 0.8354688286781311, + "learning_rate": 9.900761527474037e-06, + "loss": 0.8083, + "step": 2402 + }, + { + "epoch": 0.13225824206065276, + "grad_norm": 0.8612173795700073, + "learning_rate": 9.900675576054986e-06, + "loss": 0.8124, + "step": 2403 + }, + { + "epoch": 0.13231328086300842, + "grad_norm": 0.7424876689910889, + "learning_rate": 9.900589587803767e-06, + "loss": 0.6884, + "step": 2404 + }, + { + "epoch": 0.13236831966536408, + "grad_norm": 0.8431115746498108, + "learning_rate": 9.90050356272103e-06, + "loss": 0.9575, + "step": 2405 + }, + { + "epoch": 0.13242335846771974, + "grad_norm": 0.7958092093467712, + "learning_rate": 9.90041750080742e-06, + "loss": 0.7608, + "step": 2406 + }, + { + "epoch": 0.1324783972700754, + "grad_norm": 0.926258385181427, + "learning_rate": 9.900331402063583e-06, + "loss": 0.9072, + "step": 2407 + }, + { + "epoch": 0.13253343607243107, + "grad_norm": 0.7952526807785034, + "learning_rate": 9.900245266490169e-06, + "loss": 0.8001, + "step": 2408 + }, + { + "epoch": 0.13258847487478673, + "grad_norm": 0.8309933543205261, + "learning_rate": 9.900159094087822e-06, + "loss": 0.9154, + "step": 2409 + }, + { + "epoch": 0.1326435136771424, + "grad_norm": 0.858007550239563, + "learning_rate": 9.90007288485719e-06, + "loss": 0.855, + "step": 2410 + }, + { + "epoch": 0.13269855247949805, + "grad_norm": 0.9513822197914124, + "learning_rate": 9.899986638798923e-06, + "loss": 0.8162, + "step": 2411 + }, + { + "epoch": 0.1327535912818537, + "grad_norm": 0.8387427926063538, + "learning_rate": 9.899900355913668e-06, + "loss": 0.8955, + "step": 2412 + }, + { + "epoch": 0.13280863008420937, + "grad_norm": 0.7727940678596497, + "learning_rate": 9.899814036202073e-06, + "loss": 0.6765, + "step": 2413 + }, + { + "epoch": 0.13286366888656503, + "grad_norm": 0.7760928869247437, + "learning_rate": 9.899727679664788e-06, + "loss": 0.7179, + "step": 2414 + }, + { + "epoch": 0.1329187076889207, + "grad_norm": 0.7798073887825012, + "learning_rate": 9.899641286302462e-06, + "loss": 0.8541, + "step": 2415 + }, + { + "epoch": 0.13297374649127636, + "grad_norm": 0.8302769660949707, + "learning_rate": 9.899554856115743e-06, + "loss": 0.8925, + "step": 2416 + }, + { + "epoch": 0.13302878529363202, + "grad_norm": 0.8300751447677612, + "learning_rate": 9.89946838910528e-06, + "loss": 0.7489, + "step": 2417 + }, + { + "epoch": 0.13308382409598768, + "grad_norm": 0.8032094240188599, + "learning_rate": 9.899381885271725e-06, + "loss": 0.811, + "step": 2418 + }, + { + "epoch": 0.13313886289834334, + "grad_norm": 5.237870216369629, + "learning_rate": 9.899295344615727e-06, + "loss": 0.7609, + "step": 2419 + }, + { + "epoch": 0.133193901700699, + "grad_norm": 0.8145740628242493, + "learning_rate": 9.899208767137935e-06, + "loss": 0.8435, + "step": 2420 + }, + { + "epoch": 0.13324894050305466, + "grad_norm": 0.9716018438339233, + "learning_rate": 9.899122152839004e-06, + "loss": 0.7924, + "step": 2421 + }, + { + "epoch": 0.13330397930541033, + "grad_norm": 0.7846183776855469, + "learning_rate": 9.899035501719582e-06, + "loss": 0.8941, + "step": 2422 + }, + { + "epoch": 0.133359018107766, + "grad_norm": 0.7653689980506897, + "learning_rate": 9.89894881378032e-06, + "loss": 0.811, + "step": 2423 + }, + { + "epoch": 0.13341405691012165, + "grad_norm": 0.8221875429153442, + "learning_rate": 9.89886208902187e-06, + "loss": 0.8131, + "step": 2424 + }, + { + "epoch": 0.1334690957124773, + "grad_norm": 0.7422335147857666, + "learning_rate": 9.898775327444885e-06, + "loss": 0.6366, + "step": 2425 + }, + { + "epoch": 0.13352413451483297, + "grad_norm": 0.8072695136070251, + "learning_rate": 9.898688529050014e-06, + "loss": 0.7989, + "step": 2426 + }, + { + "epoch": 0.1335791733171886, + "grad_norm": 0.7717600464820862, + "learning_rate": 9.898601693837911e-06, + "loss": 0.7524, + "step": 2427 + }, + { + "epoch": 0.13363421211954427, + "grad_norm": 0.8070919513702393, + "learning_rate": 9.898514821809231e-06, + "loss": 0.7724, + "step": 2428 + }, + { + "epoch": 0.13368925092189993, + "grad_norm": 0.8184726238250732, + "learning_rate": 9.898427912964624e-06, + "loss": 0.845, + "step": 2429 + }, + { + "epoch": 0.1337442897242556, + "grad_norm": 0.8168759346008301, + "learning_rate": 9.898340967304744e-06, + "loss": 0.8377, + "step": 2430 + }, + { + "epoch": 0.13379932852661125, + "grad_norm": 0.8701872825622559, + "learning_rate": 9.898253984830244e-06, + "loss": 0.908, + "step": 2431 + }, + { + "epoch": 0.1338543673289669, + "grad_norm": 0.8092133402824402, + "learning_rate": 9.898166965541779e-06, + "loss": 0.866, + "step": 2432 + }, + { + "epoch": 0.13390940613132257, + "grad_norm": 0.8337095975875854, + "learning_rate": 9.898079909440002e-06, + "loss": 0.8622, + "step": 2433 + }, + { + "epoch": 0.13396444493367823, + "grad_norm": 1.1016209125518799, + "learning_rate": 9.897992816525567e-06, + "loss": 0.8486, + "step": 2434 + }, + { + "epoch": 0.1340194837360339, + "grad_norm": 0.8136518597602844, + "learning_rate": 9.89790568679913e-06, + "loss": 0.8681, + "step": 2435 + }, + { + "epoch": 0.13407452253838956, + "grad_norm": 0.8202341794967651, + "learning_rate": 9.897818520261344e-06, + "loss": 0.9144, + "step": 2436 + }, + { + "epoch": 0.13412956134074522, + "grad_norm": 0.8836861848831177, + "learning_rate": 9.897731316912866e-06, + "loss": 0.8643, + "step": 2437 + }, + { + "epoch": 0.13418460014310088, + "grad_norm": 0.9040210247039795, + "learning_rate": 9.89764407675435e-06, + "loss": 0.7681, + "step": 2438 + }, + { + "epoch": 0.13423963894545654, + "grad_norm": 0.8762359619140625, + "learning_rate": 9.897556799786452e-06, + "loss": 0.8765, + "step": 2439 + }, + { + "epoch": 0.1342946777478122, + "grad_norm": 0.8859462738037109, + "learning_rate": 9.897469486009827e-06, + "loss": 0.9051, + "step": 2440 + }, + { + "epoch": 0.13434971655016786, + "grad_norm": 0.7727539539337158, + "learning_rate": 9.897382135425134e-06, + "loss": 0.7397, + "step": 2441 + }, + { + "epoch": 0.13440475535252353, + "grad_norm": 0.9018967151641846, + "learning_rate": 9.897294748033028e-06, + "loss": 0.8542, + "step": 2442 + }, + { + "epoch": 0.1344597941548792, + "grad_norm": 0.8228337168693542, + "learning_rate": 9.897207323834165e-06, + "loss": 0.7585, + "step": 2443 + }, + { + "epoch": 0.13451483295723485, + "grad_norm": 0.7509974241256714, + "learning_rate": 9.897119862829203e-06, + "loss": 0.7285, + "step": 2444 + }, + { + "epoch": 0.1345698717595905, + "grad_norm": 0.9225835800170898, + "learning_rate": 9.897032365018797e-06, + "loss": 0.8352, + "step": 2445 + }, + { + "epoch": 0.13462491056194617, + "grad_norm": 0.800981879234314, + "learning_rate": 9.896944830403609e-06, + "loss": 0.7352, + "step": 2446 + }, + { + "epoch": 0.13467994936430183, + "grad_norm": 0.8263673186302185, + "learning_rate": 9.896857258984294e-06, + "loss": 0.8426, + "step": 2447 + }, + { + "epoch": 0.1347349881666575, + "grad_norm": 0.8857110738754272, + "learning_rate": 9.89676965076151e-06, + "loss": 0.8078, + "step": 2448 + }, + { + "epoch": 0.13479002696901315, + "grad_norm": 0.8637158274650574, + "learning_rate": 9.896682005735916e-06, + "loss": 0.8688, + "step": 2449 + }, + { + "epoch": 0.13484506577136882, + "grad_norm": 0.9050095081329346, + "learning_rate": 9.89659432390817e-06, + "loss": 0.831, + "step": 2450 + }, + { + "epoch": 0.13490010457372448, + "grad_norm": 0.829757034778595, + "learning_rate": 9.896506605278933e-06, + "loss": 0.8095, + "step": 2451 + }, + { + "epoch": 0.13495514337608014, + "grad_norm": 0.8910449743270874, + "learning_rate": 9.896418849848864e-06, + "loss": 0.9134, + "step": 2452 + }, + { + "epoch": 0.1350101821784358, + "grad_norm": 0.8856307864189148, + "learning_rate": 9.89633105761862e-06, + "loss": 0.8171, + "step": 2453 + }, + { + "epoch": 0.13506522098079146, + "grad_norm": 0.8159938454627991, + "learning_rate": 9.896243228588864e-06, + "loss": 0.8205, + "step": 2454 + }, + { + "epoch": 0.13512025978314712, + "grad_norm": 0.8200929760932922, + "learning_rate": 9.896155362760254e-06, + "loss": 0.7529, + "step": 2455 + }, + { + "epoch": 0.13517529858550278, + "grad_norm": 0.7591279149055481, + "learning_rate": 9.89606746013345e-06, + "loss": 0.8205, + "step": 2456 + }, + { + "epoch": 0.13523033738785845, + "grad_norm": 0.8598676323890686, + "learning_rate": 9.895979520709114e-06, + "loss": 0.8212, + "step": 2457 + }, + { + "epoch": 0.1352853761902141, + "grad_norm": 0.7290365099906921, + "learning_rate": 9.895891544487905e-06, + "loss": 0.7893, + "step": 2458 + }, + { + "epoch": 0.13534041499256977, + "grad_norm": 0.8040594458580017, + "learning_rate": 9.895803531470487e-06, + "loss": 0.8358, + "step": 2459 + }, + { + "epoch": 0.13539545379492543, + "grad_norm": 0.9286525249481201, + "learning_rate": 9.895715481657522e-06, + "loss": 0.8104, + "step": 2460 + }, + { + "epoch": 0.1354504925972811, + "grad_norm": 0.843054473400116, + "learning_rate": 9.895627395049668e-06, + "loss": 0.7872, + "step": 2461 + }, + { + "epoch": 0.13550553139963675, + "grad_norm": 0.7894387245178223, + "learning_rate": 9.895539271647588e-06, + "loss": 0.8615, + "step": 2462 + }, + { + "epoch": 0.13556057020199241, + "grad_norm": 0.9185294508934021, + "learning_rate": 9.895451111451948e-06, + "loss": 0.8732, + "step": 2463 + }, + { + "epoch": 0.13561560900434808, + "grad_norm": 0.8586474657058716, + "learning_rate": 9.895362914463405e-06, + "loss": 0.9658, + "step": 2464 + }, + { + "epoch": 0.13567064780670374, + "grad_norm": 0.8810474276542664, + "learning_rate": 9.895274680682628e-06, + "loss": 0.8622, + "step": 2465 + }, + { + "epoch": 0.1357256866090594, + "grad_norm": 0.8862990736961365, + "learning_rate": 9.895186410110273e-06, + "loss": 0.916, + "step": 2466 + }, + { + "epoch": 0.13578072541141506, + "grad_norm": 0.7916743159294128, + "learning_rate": 9.89509810274701e-06, + "loss": 0.837, + "step": 2467 + }, + { + "epoch": 0.13583576421377072, + "grad_norm": 0.9063515663146973, + "learning_rate": 9.8950097585935e-06, + "loss": 0.8065, + "step": 2468 + }, + { + "epoch": 0.13589080301612638, + "grad_norm": 0.7656043767929077, + "learning_rate": 9.894921377650405e-06, + "loss": 0.7064, + "step": 2469 + }, + { + "epoch": 0.13594584181848202, + "grad_norm": 1.0630278587341309, + "learning_rate": 9.894832959918392e-06, + "loss": 0.8168, + "step": 2470 + }, + { + "epoch": 0.13600088062083768, + "grad_norm": 0.9118956923484802, + "learning_rate": 9.894744505398126e-06, + "loss": 0.8972, + "step": 2471 + }, + { + "epoch": 0.13605591942319334, + "grad_norm": 0.8989213705062866, + "learning_rate": 9.89465601409027e-06, + "loss": 0.8374, + "step": 2472 + }, + { + "epoch": 0.136110958225549, + "grad_norm": 0.9398229718208313, + "learning_rate": 9.894567485995489e-06, + "loss": 0.8956, + "step": 2473 + }, + { + "epoch": 0.13616599702790466, + "grad_norm": 0.7980280518531799, + "learning_rate": 9.894478921114449e-06, + "loss": 0.8055, + "step": 2474 + }, + { + "epoch": 0.13622103583026032, + "grad_norm": 0.8910034894943237, + "learning_rate": 9.894390319447816e-06, + "loss": 0.8371, + "step": 2475 + }, + { + "epoch": 0.13627607463261598, + "grad_norm": 0.7848070859909058, + "learning_rate": 9.894301680996255e-06, + "loss": 0.8024, + "step": 2476 + }, + { + "epoch": 0.13633111343497165, + "grad_norm": 0.8538175821304321, + "learning_rate": 9.894213005760434e-06, + "loss": 0.8819, + "step": 2477 + }, + { + "epoch": 0.1363861522373273, + "grad_norm": 0.7885367274284363, + "learning_rate": 9.894124293741017e-06, + "loss": 0.7916, + "step": 2478 + }, + { + "epoch": 0.13644119103968297, + "grad_norm": 0.8555673956871033, + "learning_rate": 9.894035544938672e-06, + "loss": 0.8521, + "step": 2479 + }, + { + "epoch": 0.13649622984203863, + "grad_norm": 0.8104771971702576, + "learning_rate": 9.893946759354066e-06, + "loss": 0.8437, + "step": 2480 + }, + { + "epoch": 0.1365512686443943, + "grad_norm": 0.9131864309310913, + "learning_rate": 9.893857936987866e-06, + "loss": 0.8123, + "step": 2481 + }, + { + "epoch": 0.13660630744674995, + "grad_norm": 0.9414293766021729, + "learning_rate": 9.893769077840739e-06, + "loss": 0.7897, + "step": 2482 + }, + { + "epoch": 0.13666134624910561, + "grad_norm": 0.823265016078949, + "learning_rate": 9.893680181913355e-06, + "loss": 0.847, + "step": 2483 + }, + { + "epoch": 0.13671638505146128, + "grad_norm": 0.82098788022995, + "learning_rate": 9.89359124920638e-06, + "loss": 0.7823, + "step": 2484 + }, + { + "epoch": 0.13677142385381694, + "grad_norm": 0.817551851272583, + "learning_rate": 9.893502279720483e-06, + "loss": 0.8084, + "step": 2485 + }, + { + "epoch": 0.1368264626561726, + "grad_norm": 1.0722150802612305, + "learning_rate": 9.893413273456333e-06, + "loss": 0.7394, + "step": 2486 + }, + { + "epoch": 0.13688150145852826, + "grad_norm": 0.8045433759689331, + "learning_rate": 9.893324230414598e-06, + "loss": 0.7528, + "step": 2487 + }, + { + "epoch": 0.13693654026088392, + "grad_norm": 0.8694071173667908, + "learning_rate": 9.893235150595949e-06, + "loss": 0.803, + "step": 2488 + }, + { + "epoch": 0.13699157906323958, + "grad_norm": 0.8238615989685059, + "learning_rate": 9.893146034001054e-06, + "loss": 0.7909, + "step": 2489 + }, + { + "epoch": 0.13704661786559524, + "grad_norm": 0.7782405018806458, + "learning_rate": 9.893056880630583e-06, + "loss": 0.6859, + "step": 2490 + }, + { + "epoch": 0.1371016566679509, + "grad_norm": 0.7865599989891052, + "learning_rate": 9.892967690485207e-06, + "loss": 0.7982, + "step": 2491 + }, + { + "epoch": 0.13715669547030657, + "grad_norm": 0.768120288848877, + "learning_rate": 9.892878463565595e-06, + "loss": 0.8234, + "step": 2492 + }, + { + "epoch": 0.13721173427266223, + "grad_norm": 0.812493085861206, + "learning_rate": 9.89278919987242e-06, + "loss": 0.9152, + "step": 2493 + }, + { + "epoch": 0.1372667730750179, + "grad_norm": 0.7256335616111755, + "learning_rate": 9.892699899406348e-06, + "loss": 0.6703, + "step": 2494 + }, + { + "epoch": 0.13732181187737355, + "grad_norm": 0.8022804260253906, + "learning_rate": 9.892610562168054e-06, + "loss": 0.7918, + "step": 2495 + }, + { + "epoch": 0.1373768506797292, + "grad_norm": 0.8204907774925232, + "learning_rate": 9.89252118815821e-06, + "loss": 0.9094, + "step": 2496 + }, + { + "epoch": 0.13743188948208487, + "grad_norm": 0.9986788630485535, + "learning_rate": 9.892431777377484e-06, + "loss": 0.8921, + "step": 2497 + }, + { + "epoch": 0.13748692828444053, + "grad_norm": 0.7937983870506287, + "learning_rate": 9.892342329826554e-06, + "loss": 0.8048, + "step": 2498 + }, + { + "epoch": 0.1375419670867962, + "grad_norm": 0.9295744895935059, + "learning_rate": 9.892252845506086e-06, + "loss": 0.755, + "step": 2499 + }, + { + "epoch": 0.13759700588915186, + "grad_norm": 0.7920984625816345, + "learning_rate": 9.892163324416757e-06, + "loss": 0.7603, + "step": 2500 + }, + { + "epoch": 0.13765204469150752, + "grad_norm": 0.9229464530944824, + "learning_rate": 9.892073766559236e-06, + "loss": 0.8115, + "step": 2501 + }, + { + "epoch": 0.13770708349386318, + "grad_norm": 0.8205353021621704, + "learning_rate": 9.8919841719342e-06, + "loss": 0.8357, + "step": 2502 + }, + { + "epoch": 0.13776212229621884, + "grad_norm": 0.86461341381073, + "learning_rate": 9.891894540542318e-06, + "loss": 0.748, + "step": 2503 + }, + { + "epoch": 0.1378171610985745, + "grad_norm": 0.767145574092865, + "learning_rate": 9.891804872384267e-06, + "loss": 0.7404, + "step": 2504 + }, + { + "epoch": 0.13787219990093016, + "grad_norm": 0.7492040991783142, + "learning_rate": 9.891715167460721e-06, + "loss": 0.6958, + "step": 2505 + }, + { + "epoch": 0.13792723870328583, + "grad_norm": 0.8643150329589844, + "learning_rate": 9.891625425772353e-06, + "loss": 0.8408, + "step": 2506 + }, + { + "epoch": 0.1379822775056415, + "grad_norm": 0.8026981353759766, + "learning_rate": 9.891535647319838e-06, + "loss": 0.7895, + "step": 2507 + }, + { + "epoch": 0.13803731630799715, + "grad_norm": 1.2780394554138184, + "learning_rate": 9.89144583210385e-06, + "loss": 0.9113, + "step": 2508 + }, + { + "epoch": 0.1380923551103528, + "grad_norm": 0.8476191163063049, + "learning_rate": 9.891355980125064e-06, + "loss": 0.8224, + "step": 2509 + }, + { + "epoch": 0.13814739391270847, + "grad_norm": 1.048682689666748, + "learning_rate": 9.891266091384157e-06, + "loss": 0.8913, + "step": 2510 + }, + { + "epoch": 0.13820243271506413, + "grad_norm": 1.0314993858337402, + "learning_rate": 9.891176165881801e-06, + "loss": 0.8315, + "step": 2511 + }, + { + "epoch": 0.1382574715174198, + "grad_norm": 0.9500058889389038, + "learning_rate": 9.891086203618676e-06, + "loss": 0.9185, + "step": 2512 + }, + { + "epoch": 0.13831251031977543, + "grad_norm": 0.7860653400421143, + "learning_rate": 9.890996204595457e-06, + "loss": 0.804, + "step": 2513 + }, + { + "epoch": 0.1383675491221311, + "grad_norm": 0.8354741930961609, + "learning_rate": 9.89090616881282e-06, + "loss": 0.8214, + "step": 2514 + }, + { + "epoch": 0.13842258792448675, + "grad_norm": 0.9115905165672302, + "learning_rate": 9.890816096271438e-06, + "loss": 0.8801, + "step": 2515 + }, + { + "epoch": 0.1384776267268424, + "grad_norm": 0.8852075338363647, + "learning_rate": 9.890725986971994e-06, + "loss": 0.8821, + "step": 2516 + }, + { + "epoch": 0.13853266552919807, + "grad_norm": 0.804314374923706, + "learning_rate": 9.890635840915164e-06, + "loss": 0.8412, + "step": 2517 + }, + { + "epoch": 0.13858770433155373, + "grad_norm": 0.8242805600166321, + "learning_rate": 9.890545658101623e-06, + "loss": 0.8447, + "step": 2518 + }, + { + "epoch": 0.1386427431339094, + "grad_norm": 0.8385655879974365, + "learning_rate": 9.890455438532048e-06, + "loss": 0.8161, + "step": 2519 + }, + { + "epoch": 0.13869778193626506, + "grad_norm": 0.7950524687767029, + "learning_rate": 9.89036518220712e-06, + "loss": 0.8024, + "step": 2520 + }, + { + "epoch": 0.13875282073862072, + "grad_norm": 1.0031861066818237, + "learning_rate": 9.890274889127518e-06, + "loss": 0.8399, + "step": 2521 + }, + { + "epoch": 0.13880785954097638, + "grad_norm": 0.8403242230415344, + "learning_rate": 9.890184559293917e-06, + "loss": 0.8115, + "step": 2522 + }, + { + "epoch": 0.13886289834333204, + "grad_norm": 0.8389976024627686, + "learning_rate": 9.890094192706998e-06, + "loss": 0.9573, + "step": 2523 + }, + { + "epoch": 0.1389179371456877, + "grad_norm": 0.8408516645431519, + "learning_rate": 9.890003789367442e-06, + "loss": 0.8572, + "step": 2524 + }, + { + "epoch": 0.13897297594804336, + "grad_norm": 0.7607787251472473, + "learning_rate": 9.889913349275925e-06, + "loss": 0.8119, + "step": 2525 + }, + { + "epoch": 0.13902801475039903, + "grad_norm": 0.7696373462677002, + "learning_rate": 9.889822872433127e-06, + "loss": 0.8287, + "step": 2526 + }, + { + "epoch": 0.1390830535527547, + "grad_norm": 0.8518380522727966, + "learning_rate": 9.889732358839732e-06, + "loss": 0.9008, + "step": 2527 + }, + { + "epoch": 0.13913809235511035, + "grad_norm": 0.8851314783096313, + "learning_rate": 9.889641808496416e-06, + "loss": 0.8148, + "step": 2528 + }, + { + "epoch": 0.139193131157466, + "grad_norm": 0.9245797395706177, + "learning_rate": 9.889551221403862e-06, + "loss": 0.846, + "step": 2529 + }, + { + "epoch": 0.13924816995982167, + "grad_norm": 0.8445762991905212, + "learning_rate": 9.889460597562748e-06, + "loss": 0.8306, + "step": 2530 + }, + { + "epoch": 0.13930320876217733, + "grad_norm": 0.9149277806282043, + "learning_rate": 9.88936993697376e-06, + "loss": 0.8033, + "step": 2531 + }, + { + "epoch": 0.139358247564533, + "grad_norm": 0.894666850566864, + "learning_rate": 9.889279239637572e-06, + "loss": 0.8299, + "step": 2532 + }, + { + "epoch": 0.13941328636688866, + "grad_norm": 1.2897371053695679, + "learning_rate": 9.889188505554871e-06, + "loss": 0.7776, + "step": 2533 + }, + { + "epoch": 0.13946832516924432, + "grad_norm": 0.8927022218704224, + "learning_rate": 9.889097734726341e-06, + "loss": 0.8706, + "step": 2534 + }, + { + "epoch": 0.13952336397159998, + "grad_norm": 0.7688571214675903, + "learning_rate": 9.889006927152658e-06, + "loss": 0.8191, + "step": 2535 + }, + { + "epoch": 0.13957840277395564, + "grad_norm": 0.926671028137207, + "learning_rate": 9.88891608283451e-06, + "loss": 0.7489, + "step": 2536 + }, + { + "epoch": 0.1396334415763113, + "grad_norm": 0.8316965699195862, + "learning_rate": 9.888825201772577e-06, + "loss": 0.7783, + "step": 2537 + }, + { + "epoch": 0.13968848037866696, + "grad_norm": 0.8619750738143921, + "learning_rate": 9.88873428396754e-06, + "loss": 0.8269, + "step": 2538 + }, + { + "epoch": 0.13974351918102262, + "grad_norm": 0.8588540554046631, + "learning_rate": 9.888643329420086e-06, + "loss": 0.8133, + "step": 2539 + }, + { + "epoch": 0.13979855798337829, + "grad_norm": 0.7947841882705688, + "learning_rate": 9.8885523381309e-06, + "loss": 0.8041, + "step": 2540 + }, + { + "epoch": 0.13985359678573395, + "grad_norm": 0.8440257906913757, + "learning_rate": 9.888461310100661e-06, + "loss": 0.8324, + "step": 2541 + }, + { + "epoch": 0.1399086355880896, + "grad_norm": 0.7842260003089905, + "learning_rate": 9.888370245330055e-06, + "loss": 0.8031, + "step": 2542 + }, + { + "epoch": 0.13996367439044527, + "grad_norm": 0.8108223080635071, + "learning_rate": 9.888279143819768e-06, + "loss": 0.7998, + "step": 2543 + }, + { + "epoch": 0.14001871319280093, + "grad_norm": 0.9748625159263611, + "learning_rate": 9.888188005570482e-06, + "loss": 0.9553, + "step": 2544 + }, + { + "epoch": 0.1400737519951566, + "grad_norm": 0.8465562462806702, + "learning_rate": 9.888096830582883e-06, + "loss": 0.7884, + "step": 2545 + }, + { + "epoch": 0.14012879079751225, + "grad_norm": 0.9339833855628967, + "learning_rate": 9.88800561885766e-06, + "loss": 0.8135, + "step": 2546 + }, + { + "epoch": 0.14018382959986792, + "grad_norm": 0.7749297022819519, + "learning_rate": 9.887914370395492e-06, + "loss": 0.8411, + "step": 2547 + }, + { + "epoch": 0.14023886840222358, + "grad_norm": 0.862606942653656, + "learning_rate": 9.887823085197068e-06, + "loss": 0.7631, + "step": 2548 + }, + { + "epoch": 0.14029390720457924, + "grad_norm": 1.3383793830871582, + "learning_rate": 9.887731763263076e-06, + "loss": 0.7979, + "step": 2549 + }, + { + "epoch": 0.1403489460069349, + "grad_norm": 0.8092008233070374, + "learning_rate": 9.887640404594199e-06, + "loss": 0.7566, + "step": 2550 + }, + { + "epoch": 0.14040398480929056, + "grad_norm": 0.9233745336532593, + "learning_rate": 9.887549009191126e-06, + "loss": 0.8954, + "step": 2551 + }, + { + "epoch": 0.14045902361164622, + "grad_norm": 0.8533664345741272, + "learning_rate": 9.887457577054542e-06, + "loss": 0.8311, + "step": 2552 + }, + { + "epoch": 0.14051406241400188, + "grad_norm": 0.7679287791252136, + "learning_rate": 9.887366108185135e-06, + "loss": 0.7641, + "step": 2553 + }, + { + "epoch": 0.14056910121635754, + "grad_norm": 0.7998354434967041, + "learning_rate": 9.887274602583594e-06, + "loss": 0.7759, + "step": 2554 + }, + { + "epoch": 0.1406241400187132, + "grad_norm": 0.8877138495445251, + "learning_rate": 9.887183060250605e-06, + "loss": 0.8928, + "step": 2555 + }, + { + "epoch": 0.14067917882106884, + "grad_norm": 0.8022066354751587, + "learning_rate": 9.887091481186855e-06, + "loss": 0.8233, + "step": 2556 + }, + { + "epoch": 0.1407342176234245, + "grad_norm": 0.8419097065925598, + "learning_rate": 9.886999865393035e-06, + "loss": 0.8044, + "step": 2557 + }, + { + "epoch": 0.14078925642578016, + "grad_norm": 0.9581286311149597, + "learning_rate": 9.88690821286983e-06, + "loss": 0.8531, + "step": 2558 + }, + { + "epoch": 0.14084429522813582, + "grad_norm": 0.894851803779602, + "learning_rate": 9.886816523617933e-06, + "loss": 0.8594, + "step": 2559 + }, + { + "epoch": 0.14089933403049149, + "grad_norm": 0.7813432812690735, + "learning_rate": 9.886724797638032e-06, + "loss": 0.7311, + "step": 2560 + }, + { + "epoch": 0.14095437283284715, + "grad_norm": 0.8194118142127991, + "learning_rate": 9.886633034930814e-06, + "loss": 0.8067, + "step": 2561 + }, + { + "epoch": 0.1410094116352028, + "grad_norm": 0.8091121912002563, + "learning_rate": 9.88654123549697e-06, + "loss": 0.7558, + "step": 2562 + }, + { + "epoch": 0.14106445043755847, + "grad_norm": 0.8334764242172241, + "learning_rate": 9.88644939933719e-06, + "loss": 0.8375, + "step": 2563 + }, + { + "epoch": 0.14111948923991413, + "grad_norm": 0.8283817768096924, + "learning_rate": 9.886357526452166e-06, + "loss": 0.7839, + "step": 2564 + }, + { + "epoch": 0.1411745280422698, + "grad_norm": 0.8708772659301758, + "learning_rate": 9.886265616842585e-06, + "loss": 0.8193, + "step": 2565 + }, + { + "epoch": 0.14122956684462545, + "grad_norm": 0.9883641600608826, + "learning_rate": 9.886173670509141e-06, + "loss": 0.9409, + "step": 2566 + }, + { + "epoch": 0.14128460564698112, + "grad_norm": 0.8601766228675842, + "learning_rate": 9.886081687452523e-06, + "loss": 0.9391, + "step": 2567 + }, + { + "epoch": 0.14133964444933678, + "grad_norm": 0.8729620575904846, + "learning_rate": 9.885989667673422e-06, + "loss": 0.8372, + "step": 2568 + }, + { + "epoch": 0.14139468325169244, + "grad_norm": 0.7899564504623413, + "learning_rate": 9.885897611172532e-06, + "loss": 0.7773, + "step": 2569 + }, + { + "epoch": 0.1414497220540481, + "grad_norm": 0.8120512962341309, + "learning_rate": 9.885805517950542e-06, + "loss": 0.887, + "step": 2570 + }, + { + "epoch": 0.14150476085640376, + "grad_norm": 0.8475256562232971, + "learning_rate": 9.885713388008148e-06, + "loss": 0.7935, + "step": 2571 + }, + { + "epoch": 0.14155979965875942, + "grad_norm": 0.7669919729232788, + "learning_rate": 9.885621221346038e-06, + "loss": 0.7728, + "step": 2572 + }, + { + "epoch": 0.14161483846111508, + "grad_norm": 0.8298916220664978, + "learning_rate": 9.885529017964906e-06, + "loss": 0.7723, + "step": 2573 + }, + { + "epoch": 0.14166987726347074, + "grad_norm": 0.8630721569061279, + "learning_rate": 9.885436777865447e-06, + "loss": 0.8395, + "step": 2574 + }, + { + "epoch": 0.1417249160658264, + "grad_norm": 0.7566008567810059, + "learning_rate": 9.885344501048352e-06, + "loss": 0.806, + "step": 2575 + }, + { + "epoch": 0.14177995486818207, + "grad_norm": 0.7870769500732422, + "learning_rate": 9.885252187514316e-06, + "loss": 0.7683, + "step": 2576 + }, + { + "epoch": 0.14183499367053773, + "grad_norm": 0.879648745059967, + "learning_rate": 9.885159837264033e-06, + "loss": 0.8472, + "step": 2577 + }, + { + "epoch": 0.1418900324728934, + "grad_norm": 0.76839280128479, + "learning_rate": 9.885067450298196e-06, + "loss": 0.8534, + "step": 2578 + }, + { + "epoch": 0.14194507127524905, + "grad_norm": 0.8268701434135437, + "learning_rate": 9.884975026617498e-06, + "loss": 0.7799, + "step": 2579 + }, + { + "epoch": 0.1420001100776047, + "grad_norm": 0.8226090669631958, + "learning_rate": 9.884882566222638e-06, + "loss": 0.6756, + "step": 2580 + }, + { + "epoch": 0.14205514887996037, + "grad_norm": 0.8299756050109863, + "learning_rate": 9.884790069114307e-06, + "loss": 0.734, + "step": 2581 + }, + { + "epoch": 0.14211018768231604, + "grad_norm": 0.8241812586784363, + "learning_rate": 9.8846975352932e-06, + "loss": 0.8335, + "step": 2582 + }, + { + "epoch": 0.1421652264846717, + "grad_norm": 0.8458926677703857, + "learning_rate": 9.884604964760016e-06, + "loss": 0.7376, + "step": 2583 + }, + { + "epoch": 0.14222026528702736, + "grad_norm": 0.876966655254364, + "learning_rate": 9.884512357515447e-06, + "loss": 0.9414, + "step": 2584 + }, + { + "epoch": 0.14227530408938302, + "grad_norm": 0.770252525806427, + "learning_rate": 9.88441971356019e-06, + "loss": 0.8312, + "step": 2585 + }, + { + "epoch": 0.14233034289173868, + "grad_norm": 0.7883023023605347, + "learning_rate": 9.884327032894945e-06, + "loss": 0.8568, + "step": 2586 + }, + { + "epoch": 0.14238538169409434, + "grad_norm": 0.9092289209365845, + "learning_rate": 9.884234315520405e-06, + "loss": 0.9078, + "step": 2587 + }, + { + "epoch": 0.14244042049645, + "grad_norm": 0.7946531176567078, + "learning_rate": 9.884141561437266e-06, + "loss": 0.6895, + "step": 2588 + }, + { + "epoch": 0.14249545929880567, + "grad_norm": 0.7791070342063904, + "learning_rate": 9.884048770646227e-06, + "loss": 0.6984, + "step": 2589 + }, + { + "epoch": 0.14255049810116133, + "grad_norm": 0.7775537371635437, + "learning_rate": 9.883955943147982e-06, + "loss": 0.7568, + "step": 2590 + }, + { + "epoch": 0.142605536903517, + "grad_norm": 0.7735158801078796, + "learning_rate": 9.883863078943234e-06, + "loss": 0.8215, + "step": 2591 + }, + { + "epoch": 0.14266057570587265, + "grad_norm": 0.881365180015564, + "learning_rate": 9.88377017803268e-06, + "loss": 0.8817, + "step": 2592 + }, + { + "epoch": 0.1427156145082283, + "grad_norm": 0.8643443584442139, + "learning_rate": 9.883677240417014e-06, + "loss": 0.8024, + "step": 2593 + }, + { + "epoch": 0.14277065331058397, + "grad_norm": 0.885713517665863, + "learning_rate": 9.883584266096938e-06, + "loss": 0.7612, + "step": 2594 + }, + { + "epoch": 0.14282569211293963, + "grad_norm": 0.771340012550354, + "learning_rate": 9.88349125507315e-06, + "loss": 0.8293, + "step": 2595 + }, + { + "epoch": 0.1428807309152953, + "grad_norm": 0.8284093737602234, + "learning_rate": 9.88339820734635e-06, + "loss": 0.8539, + "step": 2596 + }, + { + "epoch": 0.14293576971765096, + "grad_norm": 0.9597725868225098, + "learning_rate": 9.883305122917233e-06, + "loss": 0.9054, + "step": 2597 + }, + { + "epoch": 0.14299080852000662, + "grad_norm": 0.7552937269210815, + "learning_rate": 9.883212001786504e-06, + "loss": 0.8047, + "step": 2598 + }, + { + "epoch": 0.14304584732236225, + "grad_norm": 0.8008492588996887, + "learning_rate": 9.883118843954861e-06, + "loss": 0.802, + "step": 2599 + }, + { + "epoch": 0.1431008861247179, + "grad_norm": 0.8169753551483154, + "learning_rate": 9.883025649423003e-06, + "loss": 0.8837, + "step": 2600 + }, + { + "epoch": 0.14315592492707357, + "grad_norm": 0.8521036505699158, + "learning_rate": 9.882932418191632e-06, + "loss": 0.8266, + "step": 2601 + }, + { + "epoch": 0.14321096372942924, + "grad_norm": 0.8647341728210449, + "learning_rate": 9.882839150261449e-06, + "loss": 0.8949, + "step": 2602 + }, + { + "epoch": 0.1432660025317849, + "grad_norm": 0.9236162304878235, + "learning_rate": 9.882745845633153e-06, + "loss": 0.8474, + "step": 2603 + }, + { + "epoch": 0.14332104133414056, + "grad_norm": 0.8422677516937256, + "learning_rate": 9.882652504307445e-06, + "loss": 0.8396, + "step": 2604 + }, + { + "epoch": 0.14337608013649622, + "grad_norm": 0.902036190032959, + "learning_rate": 9.88255912628503e-06, + "loss": 0.8075, + "step": 2605 + }, + { + "epoch": 0.14343111893885188, + "grad_norm": 0.8972339630126953, + "learning_rate": 9.882465711566605e-06, + "loss": 0.8143, + "step": 2606 + }, + { + "epoch": 0.14348615774120754, + "grad_norm": 0.8025243282318115, + "learning_rate": 9.882372260152877e-06, + "loss": 0.771, + "step": 2607 + }, + { + "epoch": 0.1435411965435632, + "grad_norm": 0.8260911107063293, + "learning_rate": 9.882278772044545e-06, + "loss": 0.7679, + "step": 2608 + }, + { + "epoch": 0.14359623534591887, + "grad_norm": 0.8069774508476257, + "learning_rate": 9.882185247242313e-06, + "loss": 0.8489, + "step": 2609 + }, + { + "epoch": 0.14365127414827453, + "grad_norm": 0.8702567219734192, + "learning_rate": 9.882091685746883e-06, + "loss": 0.9258, + "step": 2610 + }, + { + "epoch": 0.1437063129506302, + "grad_norm": 0.8841683268547058, + "learning_rate": 9.881998087558959e-06, + "loss": 0.7858, + "step": 2611 + }, + { + "epoch": 0.14376135175298585, + "grad_norm": 0.7302986979484558, + "learning_rate": 9.881904452679246e-06, + "loss": 0.7339, + "step": 2612 + }, + { + "epoch": 0.1438163905553415, + "grad_norm": 0.7852466106414795, + "learning_rate": 9.881810781108442e-06, + "loss": 0.8397, + "step": 2613 + }, + { + "epoch": 0.14387142935769717, + "grad_norm": 0.7986249327659607, + "learning_rate": 9.881717072847258e-06, + "loss": 0.7573, + "step": 2614 + }, + { + "epoch": 0.14392646816005283, + "grad_norm": 0.750000536441803, + "learning_rate": 9.881623327896395e-06, + "loss": 0.7128, + "step": 2615 + }, + { + "epoch": 0.1439815069624085, + "grad_norm": 0.8796436786651611, + "learning_rate": 9.881529546256557e-06, + "loss": 0.9364, + "step": 2616 + }, + { + "epoch": 0.14403654576476416, + "grad_norm": 0.8621297478675842, + "learning_rate": 9.881435727928449e-06, + "loss": 0.9323, + "step": 2617 + }, + { + "epoch": 0.14409158456711982, + "grad_norm": 0.8213173151016235, + "learning_rate": 9.881341872912777e-06, + "loss": 0.7746, + "step": 2618 + }, + { + "epoch": 0.14414662336947548, + "grad_norm": 0.7761938571929932, + "learning_rate": 9.881247981210247e-06, + "loss": 0.8065, + "step": 2619 + }, + { + "epoch": 0.14420166217183114, + "grad_norm": 0.8333988785743713, + "learning_rate": 9.881154052821564e-06, + "loss": 0.8727, + "step": 2620 + }, + { + "epoch": 0.1442567009741868, + "grad_norm": 0.7263909578323364, + "learning_rate": 9.881060087747433e-06, + "loss": 0.8194, + "step": 2621 + }, + { + "epoch": 0.14431173977654246, + "grad_norm": 0.7472667098045349, + "learning_rate": 9.880966085988562e-06, + "loss": 0.77, + "step": 2622 + }, + { + "epoch": 0.14436677857889813, + "grad_norm": 0.7999943494796753, + "learning_rate": 9.880872047545656e-06, + "loss": 0.7936, + "step": 2623 + }, + { + "epoch": 0.1444218173812538, + "grad_norm": 0.8359610438346863, + "learning_rate": 9.88077797241942e-06, + "loss": 0.7946, + "step": 2624 + }, + { + "epoch": 0.14447685618360945, + "grad_norm": 0.8666403889656067, + "learning_rate": 9.880683860610566e-06, + "loss": 0.8152, + "step": 2625 + }, + { + "epoch": 0.1445318949859651, + "grad_norm": 0.7883741855621338, + "learning_rate": 9.880589712119797e-06, + "loss": 0.7972, + "step": 2626 + }, + { + "epoch": 0.14458693378832077, + "grad_norm": 0.8048827648162842, + "learning_rate": 9.880495526947824e-06, + "loss": 0.8221, + "step": 2627 + }, + { + "epoch": 0.14464197259067643, + "grad_norm": 0.718292236328125, + "learning_rate": 9.88040130509535e-06, + "loss": 0.7648, + "step": 2628 + }, + { + "epoch": 0.1446970113930321, + "grad_norm": 0.7748421430587769, + "learning_rate": 9.880307046563088e-06, + "loss": 0.8146, + "step": 2629 + }, + { + "epoch": 0.14475205019538775, + "grad_norm": 0.8015987873077393, + "learning_rate": 9.880212751351745e-06, + "loss": 0.7935, + "step": 2630 + }, + { + "epoch": 0.14480708899774342, + "grad_norm": 0.7628459930419922, + "learning_rate": 9.88011841946203e-06, + "loss": 0.7469, + "step": 2631 + }, + { + "epoch": 0.14486212780009908, + "grad_norm": 0.7152888774871826, + "learning_rate": 9.88002405089465e-06, + "loss": 0.7721, + "step": 2632 + }, + { + "epoch": 0.14491716660245474, + "grad_norm": 0.8075545430183411, + "learning_rate": 9.879929645650315e-06, + "loss": 0.8799, + "step": 2633 + }, + { + "epoch": 0.1449722054048104, + "grad_norm": 0.7981964945793152, + "learning_rate": 9.879835203729736e-06, + "loss": 0.8265, + "step": 2634 + }, + { + "epoch": 0.14502724420716606, + "grad_norm": 0.7699866890907288, + "learning_rate": 9.879740725133623e-06, + "loss": 0.8489, + "step": 2635 + }, + { + "epoch": 0.14508228300952172, + "grad_norm": 0.7991634011268616, + "learning_rate": 9.879646209862682e-06, + "loss": 0.8754, + "step": 2636 + }, + { + "epoch": 0.14513732181187738, + "grad_norm": 0.8284991383552551, + "learning_rate": 9.879551657917628e-06, + "loss": 0.811, + "step": 2637 + }, + { + "epoch": 0.14519236061423305, + "grad_norm": 0.9189227819442749, + "learning_rate": 9.87945706929917e-06, + "loss": 0.8486, + "step": 2638 + }, + { + "epoch": 0.1452473994165887, + "grad_norm": 0.8599026799201965, + "learning_rate": 9.879362444008018e-06, + "loss": 0.8383, + "step": 2639 + }, + { + "epoch": 0.14530243821894437, + "grad_norm": 0.8764603137969971, + "learning_rate": 9.879267782044885e-06, + "loss": 0.7918, + "step": 2640 + }, + { + "epoch": 0.14535747702130003, + "grad_norm": 0.8061341047286987, + "learning_rate": 9.87917308341048e-06, + "loss": 0.8292, + "step": 2641 + }, + { + "epoch": 0.14541251582365566, + "grad_norm": 1.031220555305481, + "learning_rate": 9.879078348105518e-06, + "loss": 0.6612, + "step": 2642 + }, + { + "epoch": 0.14546755462601133, + "grad_norm": 1.014491319656372, + "learning_rate": 9.878983576130708e-06, + "loss": 0.8512, + "step": 2643 + }, + { + "epoch": 0.145522593428367, + "grad_norm": 0.8365896940231323, + "learning_rate": 9.878888767486764e-06, + "loss": 0.7995, + "step": 2644 + }, + { + "epoch": 0.14557763223072265, + "grad_norm": 0.8086197972297668, + "learning_rate": 9.878793922174397e-06, + "loss": 0.8069, + "step": 2645 + }, + { + "epoch": 0.1456326710330783, + "grad_norm": 0.8075234889984131, + "learning_rate": 9.878699040194322e-06, + "loss": 0.8415, + "step": 2646 + }, + { + "epoch": 0.14568770983543397, + "grad_norm": 0.9413748979568481, + "learning_rate": 9.87860412154725e-06, + "loss": 0.7811, + "step": 2647 + }, + { + "epoch": 0.14574274863778963, + "grad_norm": 0.7744552493095398, + "learning_rate": 9.878509166233895e-06, + "loss": 0.7983, + "step": 2648 + }, + { + "epoch": 0.1457977874401453, + "grad_norm": 0.8184664845466614, + "learning_rate": 9.878414174254974e-06, + "loss": 0.8052, + "step": 2649 + }, + { + "epoch": 0.14585282624250095, + "grad_norm": 0.928814172744751, + "learning_rate": 9.878319145611195e-06, + "loss": 0.7695, + "step": 2650 + }, + { + "epoch": 0.14590786504485662, + "grad_norm": 0.9623318314552307, + "learning_rate": 9.878224080303276e-06, + "loss": 0.9025, + "step": 2651 + }, + { + "epoch": 0.14596290384721228, + "grad_norm": 0.866538405418396, + "learning_rate": 9.87812897833193e-06, + "loss": 0.7895, + "step": 2652 + }, + { + "epoch": 0.14601794264956794, + "grad_norm": 0.9248599410057068, + "learning_rate": 9.878033839697874e-06, + "loss": 0.8532, + "step": 2653 + }, + { + "epoch": 0.1460729814519236, + "grad_norm": 0.7866301536560059, + "learning_rate": 9.87793866440182e-06, + "loss": 0.8724, + "step": 2654 + }, + { + "epoch": 0.14612802025427926, + "grad_norm": 0.8471634387969971, + "learning_rate": 9.877843452444485e-06, + "loss": 0.9184, + "step": 2655 + }, + { + "epoch": 0.14618305905663492, + "grad_norm": 0.7367103695869446, + "learning_rate": 9.877748203826585e-06, + "loss": 0.7328, + "step": 2656 + }, + { + "epoch": 0.14623809785899058, + "grad_norm": 0.95980304479599, + "learning_rate": 9.877652918548834e-06, + "loss": 0.9274, + "step": 2657 + }, + { + "epoch": 0.14629313666134625, + "grad_norm": 1.0511064529418945, + "learning_rate": 9.87755759661195e-06, + "loss": 0.8223, + "step": 2658 + }, + { + "epoch": 0.1463481754637019, + "grad_norm": 0.7616510391235352, + "learning_rate": 9.877462238016649e-06, + "loss": 0.7473, + "step": 2659 + }, + { + "epoch": 0.14640321426605757, + "grad_norm": 0.7814056873321533, + "learning_rate": 9.877366842763647e-06, + "loss": 0.8898, + "step": 2660 + }, + { + "epoch": 0.14645825306841323, + "grad_norm": 0.8707298636436462, + "learning_rate": 9.877271410853662e-06, + "loss": 0.8792, + "step": 2661 + }, + { + "epoch": 0.1465132918707689, + "grad_norm": 0.8618701696395874, + "learning_rate": 9.877175942287409e-06, + "loss": 0.8761, + "step": 2662 + }, + { + "epoch": 0.14656833067312455, + "grad_norm": 0.9437732100486755, + "learning_rate": 9.877080437065609e-06, + "loss": 0.7922, + "step": 2663 + }, + { + "epoch": 0.14662336947548021, + "grad_norm": 0.9465780258178711, + "learning_rate": 9.876984895188976e-06, + "loss": 0.8449, + "step": 2664 + }, + { + "epoch": 0.14667840827783588, + "grad_norm": 0.7149911522865295, + "learning_rate": 9.876889316658233e-06, + "loss": 0.6408, + "step": 2665 + }, + { + "epoch": 0.14673344708019154, + "grad_norm": 0.9996811151504517, + "learning_rate": 9.876793701474092e-06, + "loss": 0.9324, + "step": 2666 + }, + { + "epoch": 0.1467884858825472, + "grad_norm": 0.7941329479217529, + "learning_rate": 9.876698049637277e-06, + "loss": 0.8115, + "step": 2667 + }, + { + "epoch": 0.14684352468490286, + "grad_norm": 0.754175066947937, + "learning_rate": 9.876602361148504e-06, + "loss": 0.7709, + "step": 2668 + }, + { + "epoch": 0.14689856348725852, + "grad_norm": 0.7867946624755859, + "learning_rate": 9.876506636008494e-06, + "loss": 0.8578, + "step": 2669 + }, + { + "epoch": 0.14695360228961418, + "grad_norm": 0.7441185116767883, + "learning_rate": 9.876410874217965e-06, + "loss": 0.8491, + "step": 2670 + }, + { + "epoch": 0.14700864109196984, + "grad_norm": 0.8414027690887451, + "learning_rate": 9.876315075777638e-06, + "loss": 0.8404, + "step": 2671 + }, + { + "epoch": 0.1470636798943255, + "grad_norm": 0.7911489009857178, + "learning_rate": 9.876219240688231e-06, + "loss": 0.8606, + "step": 2672 + }, + { + "epoch": 0.14711871869668117, + "grad_norm": 0.8601381778717041, + "learning_rate": 9.876123368950465e-06, + "loss": 0.7753, + "step": 2673 + }, + { + "epoch": 0.14717375749903683, + "grad_norm": 0.8672378659248352, + "learning_rate": 9.876027460565062e-06, + "loss": 0.7763, + "step": 2674 + }, + { + "epoch": 0.1472287963013925, + "grad_norm": 0.7192933559417725, + "learning_rate": 9.875931515532742e-06, + "loss": 0.7681, + "step": 2675 + }, + { + "epoch": 0.14728383510374815, + "grad_norm": 0.7483426332473755, + "learning_rate": 9.875835533854226e-06, + "loss": 0.8129, + "step": 2676 + }, + { + "epoch": 0.1473388739061038, + "grad_norm": 0.8883694410324097, + "learning_rate": 9.875739515530235e-06, + "loss": 0.8912, + "step": 2677 + }, + { + "epoch": 0.14739391270845947, + "grad_norm": 0.8440148234367371, + "learning_rate": 9.87564346056149e-06, + "loss": 0.8411, + "step": 2678 + }, + { + "epoch": 0.14744895151081513, + "grad_norm": 0.8916668891906738, + "learning_rate": 9.875547368948715e-06, + "loss": 0.8484, + "step": 2679 + }, + { + "epoch": 0.1475039903131708, + "grad_norm": 0.805258572101593, + "learning_rate": 9.875451240692631e-06, + "loss": 0.8172, + "step": 2680 + }, + { + "epoch": 0.14755902911552646, + "grad_norm": 0.8322305679321289, + "learning_rate": 9.87535507579396e-06, + "loss": 0.809, + "step": 2681 + }, + { + "epoch": 0.14761406791788212, + "grad_norm": 0.7320597767829895, + "learning_rate": 9.875258874253424e-06, + "loss": 0.7346, + "step": 2682 + }, + { + "epoch": 0.14766910672023778, + "grad_norm": 1.018036127090454, + "learning_rate": 9.875162636071749e-06, + "loss": 0.931, + "step": 2683 + }, + { + "epoch": 0.14772414552259344, + "grad_norm": 0.8601503968238831, + "learning_rate": 9.875066361249657e-06, + "loss": 0.7689, + "step": 2684 + }, + { + "epoch": 0.14777918432494908, + "grad_norm": 0.8478472232818604, + "learning_rate": 9.87497004978787e-06, + "loss": 0.9545, + "step": 2685 + }, + { + "epoch": 0.14783422312730474, + "grad_norm": 0.7510890364646912, + "learning_rate": 9.874873701687115e-06, + "loss": 0.7794, + "step": 2686 + }, + { + "epoch": 0.1478892619296604, + "grad_norm": 0.8226999044418335, + "learning_rate": 9.874777316948112e-06, + "loss": 0.8477, + "step": 2687 + }, + { + "epoch": 0.14794430073201606, + "grad_norm": 0.8284991979598999, + "learning_rate": 9.874680895571588e-06, + "loss": 0.8498, + "step": 2688 + }, + { + "epoch": 0.14799933953437172, + "grad_norm": 0.9007356762886047, + "learning_rate": 9.874584437558267e-06, + "loss": 0.8526, + "step": 2689 + }, + { + "epoch": 0.14805437833672738, + "grad_norm": 0.8770126104354858, + "learning_rate": 9.874487942908877e-06, + "loss": 0.844, + "step": 2690 + }, + { + "epoch": 0.14810941713908304, + "grad_norm": 1.1561466455459595, + "learning_rate": 9.874391411624138e-06, + "loss": 0.976, + "step": 2691 + }, + { + "epoch": 0.1481644559414387, + "grad_norm": 0.8162640929222107, + "learning_rate": 9.874294843704777e-06, + "loss": 0.8581, + "step": 2692 + }, + { + "epoch": 0.14821949474379437, + "grad_norm": 0.8308132290840149, + "learning_rate": 9.874198239151522e-06, + "loss": 0.8303, + "step": 2693 + }, + { + "epoch": 0.14827453354615003, + "grad_norm": 0.771024227142334, + "learning_rate": 9.874101597965098e-06, + "loss": 0.8351, + "step": 2694 + }, + { + "epoch": 0.1483295723485057, + "grad_norm": 0.7588162422180176, + "learning_rate": 9.874004920146232e-06, + "loss": 0.7858, + "step": 2695 + }, + { + "epoch": 0.14838461115086135, + "grad_norm": 0.8282446265220642, + "learning_rate": 9.873908205695648e-06, + "loss": 0.8465, + "step": 2696 + }, + { + "epoch": 0.148439649953217, + "grad_norm": 0.8342786431312561, + "learning_rate": 9.873811454614076e-06, + "loss": 0.8688, + "step": 2697 + }, + { + "epoch": 0.14849468875557267, + "grad_norm": 0.7957108020782471, + "learning_rate": 9.87371466690224e-06, + "loss": 0.8381, + "step": 2698 + }, + { + "epoch": 0.14854972755792833, + "grad_norm": 0.8763726353645325, + "learning_rate": 9.87361784256087e-06, + "loss": 0.8922, + "step": 2699 + }, + { + "epoch": 0.148604766360284, + "grad_norm": 0.7760055661201477, + "learning_rate": 9.873520981590693e-06, + "loss": 0.8384, + "step": 2700 + }, + { + "epoch": 0.14865980516263966, + "grad_norm": 0.9691097736358643, + "learning_rate": 9.873424083992436e-06, + "loss": 0.8581, + "step": 2701 + }, + { + "epoch": 0.14871484396499532, + "grad_norm": 0.9072558879852295, + "learning_rate": 9.87332714976683e-06, + "loss": 0.8942, + "step": 2702 + }, + { + "epoch": 0.14876988276735098, + "grad_norm": 0.8961714506149292, + "learning_rate": 9.8732301789146e-06, + "loss": 0.8062, + "step": 2703 + }, + { + "epoch": 0.14882492156970664, + "grad_norm": 1.4835050106048584, + "learning_rate": 9.873133171436477e-06, + "loss": 0.886, + "step": 2704 + }, + { + "epoch": 0.1488799603720623, + "grad_norm": 0.8153702616691589, + "learning_rate": 9.87303612733319e-06, + "loss": 0.8369, + "step": 2705 + }, + { + "epoch": 0.14893499917441796, + "grad_norm": 0.8755800724029541, + "learning_rate": 9.872939046605467e-06, + "loss": 0.7591, + "step": 2706 + }, + { + "epoch": 0.14899003797677363, + "grad_norm": 0.8173243403434753, + "learning_rate": 9.872841929254038e-06, + "loss": 0.8626, + "step": 2707 + }, + { + "epoch": 0.1490450767791293, + "grad_norm": 0.7891639471054077, + "learning_rate": 9.872744775279634e-06, + "loss": 0.737, + "step": 2708 + }, + { + "epoch": 0.14910011558148495, + "grad_norm": 1.0270631313323975, + "learning_rate": 9.872647584682985e-06, + "loss": 0.9202, + "step": 2709 + }, + { + "epoch": 0.1491551543838406, + "grad_norm": 0.7736123204231262, + "learning_rate": 9.872550357464822e-06, + "loss": 0.7835, + "step": 2710 + }, + { + "epoch": 0.14921019318619627, + "grad_norm": 0.7791550159454346, + "learning_rate": 9.872453093625873e-06, + "loss": 0.8375, + "step": 2711 + }, + { + "epoch": 0.14926523198855193, + "grad_norm": 0.8410583734512329, + "learning_rate": 9.872355793166872e-06, + "loss": 0.877, + "step": 2712 + }, + { + "epoch": 0.1493202707909076, + "grad_norm": 0.8277738094329834, + "learning_rate": 9.87225845608855e-06, + "loss": 0.7255, + "step": 2713 + }, + { + "epoch": 0.14937530959326326, + "grad_norm": 0.8617290258407593, + "learning_rate": 9.872161082391635e-06, + "loss": 0.7885, + "step": 2714 + }, + { + "epoch": 0.14943034839561892, + "grad_norm": 0.8866406679153442, + "learning_rate": 9.872063672076864e-06, + "loss": 0.8621, + "step": 2715 + }, + { + "epoch": 0.14948538719797458, + "grad_norm": 0.7581049799919128, + "learning_rate": 9.871966225144964e-06, + "loss": 0.8177, + "step": 2716 + }, + { + "epoch": 0.14954042600033024, + "grad_norm": 0.833696722984314, + "learning_rate": 9.871868741596673e-06, + "loss": 0.8382, + "step": 2717 + }, + { + "epoch": 0.1495954648026859, + "grad_norm": 1.0857365131378174, + "learning_rate": 9.871771221432718e-06, + "loss": 0.9254, + "step": 2718 + }, + { + "epoch": 0.14965050360504156, + "grad_norm": 0.7622446417808533, + "learning_rate": 9.871673664653837e-06, + "loss": 0.832, + "step": 2719 + }, + { + "epoch": 0.14970554240739722, + "grad_norm": 0.7436832785606384, + "learning_rate": 9.871576071260758e-06, + "loss": 0.7642, + "step": 2720 + }, + { + "epoch": 0.14976058120975289, + "grad_norm": 0.8547641634941101, + "learning_rate": 9.87147844125422e-06, + "loss": 0.7584, + "step": 2721 + }, + { + "epoch": 0.14981562001210855, + "grad_norm": 0.7634096145629883, + "learning_rate": 9.871380774634953e-06, + "loss": 0.8332, + "step": 2722 + }, + { + "epoch": 0.1498706588144642, + "grad_norm": 0.7949081063270569, + "learning_rate": 9.871283071403692e-06, + "loss": 0.7812, + "step": 2723 + }, + { + "epoch": 0.14992569761681987, + "grad_norm": 0.8089914321899414, + "learning_rate": 9.871185331561171e-06, + "loss": 0.8503, + "step": 2724 + }, + { + "epoch": 0.14998073641917553, + "grad_norm": 0.8451627492904663, + "learning_rate": 9.871087555108125e-06, + "loss": 0.9101, + "step": 2725 + }, + { + "epoch": 0.1500357752215312, + "grad_norm": 0.8399865627288818, + "learning_rate": 9.87098974204529e-06, + "loss": 0.8222, + "step": 2726 + }, + { + "epoch": 0.15009081402388685, + "grad_norm": 0.7786773443222046, + "learning_rate": 9.870891892373397e-06, + "loss": 0.8069, + "step": 2727 + }, + { + "epoch": 0.1501458528262425, + "grad_norm": 0.8530564308166504, + "learning_rate": 9.870794006093188e-06, + "loss": 0.9229, + "step": 2728 + }, + { + "epoch": 0.15020089162859815, + "grad_norm": 0.7640067934989929, + "learning_rate": 9.870696083205394e-06, + "loss": 0.761, + "step": 2729 + }, + { + "epoch": 0.1502559304309538, + "grad_norm": 0.8953121900558472, + "learning_rate": 9.87059812371075e-06, + "loss": 0.8537, + "step": 2730 + }, + { + "epoch": 0.15031096923330947, + "grad_norm": 0.7779926657676697, + "learning_rate": 9.870500127609996e-06, + "loss": 0.8184, + "step": 2731 + }, + { + "epoch": 0.15036600803566513, + "grad_norm": 0.9181544184684753, + "learning_rate": 9.870402094903865e-06, + "loss": 0.8583, + "step": 2732 + }, + { + "epoch": 0.1504210468380208, + "grad_norm": 0.7629374861717224, + "learning_rate": 9.870304025593097e-06, + "loss": 0.6741, + "step": 2733 + }, + { + "epoch": 0.15047608564037646, + "grad_norm": 1.1455601453781128, + "learning_rate": 9.87020591967843e-06, + "loss": 0.8602, + "step": 2734 + }, + { + "epoch": 0.15053112444273212, + "grad_norm": 0.83924800157547, + "learning_rate": 9.870107777160596e-06, + "loss": 0.8847, + "step": 2735 + }, + { + "epoch": 0.15058616324508778, + "grad_norm": 0.9293402433395386, + "learning_rate": 9.870009598040336e-06, + "loss": 0.9008, + "step": 2736 + }, + { + "epoch": 0.15064120204744344, + "grad_norm": 0.8198057413101196, + "learning_rate": 9.869911382318389e-06, + "loss": 0.8004, + "step": 2737 + }, + { + "epoch": 0.1506962408497991, + "grad_norm": 0.8139753341674805, + "learning_rate": 9.86981312999549e-06, + "loss": 0.7316, + "step": 2738 + }, + { + "epoch": 0.15075127965215476, + "grad_norm": 0.854184091091156, + "learning_rate": 9.86971484107238e-06, + "loss": 0.9424, + "step": 2739 + }, + { + "epoch": 0.15080631845451042, + "grad_norm": 0.8626797199249268, + "learning_rate": 9.869616515549797e-06, + "loss": 0.8882, + "step": 2740 + }, + { + "epoch": 0.15086135725686609, + "grad_norm": 0.8447514176368713, + "learning_rate": 9.869518153428479e-06, + "loss": 0.7762, + "step": 2741 + }, + { + "epoch": 0.15091639605922175, + "grad_norm": 1.1359349489212036, + "learning_rate": 9.869419754709166e-06, + "loss": 0.9233, + "step": 2742 + }, + { + "epoch": 0.1509714348615774, + "grad_norm": 0.8095758557319641, + "learning_rate": 9.869321319392597e-06, + "loss": 0.8833, + "step": 2743 + }, + { + "epoch": 0.15102647366393307, + "grad_norm": 0.8364169001579285, + "learning_rate": 9.869222847479514e-06, + "loss": 0.833, + "step": 2744 + }, + { + "epoch": 0.15108151246628873, + "grad_norm": 0.7664803266525269, + "learning_rate": 9.869124338970653e-06, + "loss": 0.8125, + "step": 2745 + }, + { + "epoch": 0.1511365512686444, + "grad_norm": 0.8129634857177734, + "learning_rate": 9.86902579386676e-06, + "loss": 0.8277, + "step": 2746 + }, + { + "epoch": 0.15119159007100005, + "grad_norm": 0.8195592164993286, + "learning_rate": 9.86892721216857e-06, + "loss": 0.8489, + "step": 2747 + }, + { + "epoch": 0.15124662887335572, + "grad_norm": 0.8116651177406311, + "learning_rate": 9.868828593876827e-06, + "loss": 0.7831, + "step": 2748 + }, + { + "epoch": 0.15130166767571138, + "grad_norm": 0.8200114369392395, + "learning_rate": 9.868729938992272e-06, + "loss": 0.8956, + "step": 2749 + }, + { + "epoch": 0.15135670647806704, + "grad_norm": 0.8521816730499268, + "learning_rate": 9.868631247515645e-06, + "loss": 0.804, + "step": 2750 + }, + { + "epoch": 0.1514117452804227, + "grad_norm": 1.0386497974395752, + "learning_rate": 9.868532519447691e-06, + "loss": 0.8563, + "step": 2751 + }, + { + "epoch": 0.15146678408277836, + "grad_norm": 0.8345486521720886, + "learning_rate": 9.868433754789149e-06, + "loss": 0.9838, + "step": 2752 + }, + { + "epoch": 0.15152182288513402, + "grad_norm": 0.7207526564598083, + "learning_rate": 9.868334953540762e-06, + "loss": 0.6711, + "step": 2753 + }, + { + "epoch": 0.15157686168748968, + "grad_norm": 0.8159164786338806, + "learning_rate": 9.86823611570327e-06, + "loss": 0.7591, + "step": 2754 + }, + { + "epoch": 0.15163190048984534, + "grad_norm": 0.9062225818634033, + "learning_rate": 9.868137241277422e-06, + "loss": 0.8009, + "step": 2755 + }, + { + "epoch": 0.151686939292201, + "grad_norm": 0.8136696219444275, + "learning_rate": 9.868038330263957e-06, + "loss": 0.7014, + "step": 2756 + }, + { + "epoch": 0.15174197809455667, + "grad_norm": 0.7237691283226013, + "learning_rate": 9.867939382663618e-06, + "loss": 0.7766, + "step": 2757 + }, + { + "epoch": 0.15179701689691233, + "grad_norm": 0.8913742303848267, + "learning_rate": 9.86784039847715e-06, + "loss": 0.9362, + "step": 2758 + }, + { + "epoch": 0.151852055699268, + "grad_norm": 0.7763763070106506, + "learning_rate": 9.867741377705296e-06, + "loss": 0.7843, + "step": 2759 + }, + { + "epoch": 0.15190709450162365, + "grad_norm": 0.8973854780197144, + "learning_rate": 9.867642320348803e-06, + "loss": 0.911, + "step": 2760 + }, + { + "epoch": 0.1519621333039793, + "grad_norm": 0.7979685664176941, + "learning_rate": 9.86754322640841e-06, + "loss": 0.81, + "step": 2761 + }, + { + "epoch": 0.15201717210633497, + "grad_norm": 0.7740911841392517, + "learning_rate": 9.867444095884867e-06, + "loss": 0.8197, + "step": 2762 + }, + { + "epoch": 0.15207221090869064, + "grad_norm": 0.8400475978851318, + "learning_rate": 9.867344928778916e-06, + "loss": 0.8809, + "step": 2763 + }, + { + "epoch": 0.1521272497110463, + "grad_norm": 0.8995040655136108, + "learning_rate": 9.867245725091305e-06, + "loss": 0.8382, + "step": 2764 + }, + { + "epoch": 0.15218228851340196, + "grad_norm": 0.8162381052970886, + "learning_rate": 9.867146484822779e-06, + "loss": 0.9238, + "step": 2765 + }, + { + "epoch": 0.15223732731575762, + "grad_norm": 0.7668827176094055, + "learning_rate": 9.867047207974079e-06, + "loss": 0.8345, + "step": 2766 + }, + { + "epoch": 0.15229236611811328, + "grad_norm": 0.8719204664230347, + "learning_rate": 9.866947894545957e-06, + "loss": 0.7899, + "step": 2767 + }, + { + "epoch": 0.15234740492046894, + "grad_norm": 0.9043570756912231, + "learning_rate": 9.866848544539159e-06, + "loss": 0.8783, + "step": 2768 + }, + { + "epoch": 0.1524024437228246, + "grad_norm": 0.8859694004058838, + "learning_rate": 9.866749157954428e-06, + "loss": 0.862, + "step": 2769 + }, + { + "epoch": 0.15245748252518027, + "grad_norm": 1.022719144821167, + "learning_rate": 9.866649734792514e-06, + "loss": 0.8943, + "step": 2770 + }, + { + "epoch": 0.1525125213275359, + "grad_norm": 0.8710635900497437, + "learning_rate": 9.866550275054163e-06, + "loss": 0.7002, + "step": 2771 + }, + { + "epoch": 0.15256756012989156, + "grad_norm": 0.8482942581176758, + "learning_rate": 9.866450778740122e-06, + "loss": 0.7529, + "step": 2772 + }, + { + "epoch": 0.15262259893224722, + "grad_norm": 0.9637784361839294, + "learning_rate": 9.866351245851142e-06, + "loss": 0.8147, + "step": 2773 + }, + { + "epoch": 0.15267763773460288, + "grad_norm": 1.0472246408462524, + "learning_rate": 9.866251676387967e-06, + "loss": 0.8019, + "step": 2774 + }, + { + "epoch": 0.15273267653695854, + "grad_norm": 0.7916847467422485, + "learning_rate": 9.866152070351347e-06, + "loss": 0.7698, + "step": 2775 + }, + { + "epoch": 0.1527877153393142, + "grad_norm": 0.8421853184700012, + "learning_rate": 9.86605242774203e-06, + "loss": 0.8085, + "step": 2776 + }, + { + "epoch": 0.15284275414166987, + "grad_norm": 0.7990233898162842, + "learning_rate": 9.865952748560768e-06, + "loss": 0.8878, + "step": 2777 + }, + { + "epoch": 0.15289779294402553, + "grad_norm": 0.8017451167106628, + "learning_rate": 9.865853032808305e-06, + "loss": 0.8707, + "step": 2778 + }, + { + "epoch": 0.1529528317463812, + "grad_norm": 0.739850640296936, + "learning_rate": 9.865753280485393e-06, + "loss": 0.7884, + "step": 2779 + }, + { + "epoch": 0.15300787054873685, + "grad_norm": 1.0682430267333984, + "learning_rate": 9.865653491592784e-06, + "loss": 0.8548, + "step": 2780 + }, + { + "epoch": 0.1530629093510925, + "grad_norm": 0.7766296863555908, + "learning_rate": 9.865553666131225e-06, + "loss": 0.7786, + "step": 2781 + }, + { + "epoch": 0.15311794815344817, + "grad_norm": 0.8903290629386902, + "learning_rate": 9.865453804101466e-06, + "loss": 0.8978, + "step": 2782 + }, + { + "epoch": 0.15317298695580384, + "grad_norm": 0.8624514937400818, + "learning_rate": 9.86535390550426e-06, + "loss": 0.8472, + "step": 2783 + }, + { + "epoch": 0.1532280257581595, + "grad_norm": 0.7765294909477234, + "learning_rate": 9.865253970340356e-06, + "loss": 0.7702, + "step": 2784 + }, + { + "epoch": 0.15328306456051516, + "grad_norm": 0.9349095225334167, + "learning_rate": 9.865153998610504e-06, + "loss": 0.9154, + "step": 2785 + }, + { + "epoch": 0.15333810336287082, + "grad_norm": 0.8435478210449219, + "learning_rate": 9.865053990315458e-06, + "loss": 0.8986, + "step": 2786 + }, + { + "epoch": 0.15339314216522648, + "grad_norm": 0.8003486394882202, + "learning_rate": 9.864953945455968e-06, + "loss": 0.767, + "step": 2787 + }, + { + "epoch": 0.15344818096758214, + "grad_norm": 0.8060823678970337, + "learning_rate": 9.86485386403279e-06, + "loss": 0.8332, + "step": 2788 + }, + { + "epoch": 0.1535032197699378, + "grad_norm": 0.7914995551109314, + "learning_rate": 9.864753746046668e-06, + "loss": 0.6706, + "step": 2789 + }, + { + "epoch": 0.15355825857229347, + "grad_norm": 0.7792215943336487, + "learning_rate": 9.86465359149836e-06, + "loss": 0.8721, + "step": 2790 + }, + { + "epoch": 0.15361329737464913, + "grad_norm": 0.8572536110877991, + "learning_rate": 9.864553400388619e-06, + "loss": 0.8378, + "step": 2791 + }, + { + "epoch": 0.1536683361770048, + "grad_norm": 0.7645615339279175, + "learning_rate": 9.864453172718195e-06, + "loss": 0.6909, + "step": 2792 + }, + { + "epoch": 0.15372337497936045, + "grad_norm": 0.7627308964729309, + "learning_rate": 9.864352908487846e-06, + "loss": 0.7918, + "step": 2793 + }, + { + "epoch": 0.1537784137817161, + "grad_norm": 1.0830100774765015, + "learning_rate": 9.86425260769832e-06, + "loss": 0.9007, + "step": 2794 + }, + { + "epoch": 0.15383345258407177, + "grad_norm": 0.7667998671531677, + "learning_rate": 9.864152270350374e-06, + "loss": 0.832, + "step": 2795 + }, + { + "epoch": 0.15388849138642743, + "grad_norm": 0.9967591762542725, + "learning_rate": 9.864051896444764e-06, + "loss": 0.8917, + "step": 2796 + }, + { + "epoch": 0.1539435301887831, + "grad_norm": 0.8948462605476379, + "learning_rate": 9.86395148598224e-06, + "loss": 0.983, + "step": 2797 + }, + { + "epoch": 0.15399856899113876, + "grad_norm": 0.7857423424720764, + "learning_rate": 9.863851038963556e-06, + "loss": 0.7826, + "step": 2798 + }, + { + "epoch": 0.15405360779349442, + "grad_norm": 0.8821337223052979, + "learning_rate": 9.863750555389473e-06, + "loss": 0.8918, + "step": 2799 + }, + { + "epoch": 0.15410864659585008, + "grad_norm": 0.7896875143051147, + "learning_rate": 9.863650035260742e-06, + "loss": 0.8199, + "step": 2800 + }, + { + "epoch": 0.15416368539820574, + "grad_norm": 0.8046941161155701, + "learning_rate": 9.86354947857812e-06, + "loss": 0.8572, + "step": 2801 + }, + { + "epoch": 0.1542187242005614, + "grad_norm": 0.7266830205917358, + "learning_rate": 9.863448885342361e-06, + "loss": 0.8315, + "step": 2802 + }, + { + "epoch": 0.15427376300291706, + "grad_norm": 0.9009475708007812, + "learning_rate": 9.863348255554222e-06, + "loss": 0.7928, + "step": 2803 + }, + { + "epoch": 0.15432880180527273, + "grad_norm": 0.963364839553833, + "learning_rate": 9.863247589214459e-06, + "loss": 0.918, + "step": 2804 + }, + { + "epoch": 0.1543838406076284, + "grad_norm": 0.8278035521507263, + "learning_rate": 9.863146886323829e-06, + "loss": 0.8497, + "step": 2805 + }, + { + "epoch": 0.15443887940998405, + "grad_norm": 0.7360561490058899, + "learning_rate": 9.86304614688309e-06, + "loss": 0.676, + "step": 2806 + }, + { + "epoch": 0.1544939182123397, + "grad_norm": 0.7679837346076965, + "learning_rate": 9.862945370892996e-06, + "loss": 0.8114, + "step": 2807 + }, + { + "epoch": 0.15454895701469537, + "grad_norm": 0.8550567030906677, + "learning_rate": 9.862844558354309e-06, + "loss": 0.8222, + "step": 2808 + }, + { + "epoch": 0.15460399581705103, + "grad_norm": 0.7852397561073303, + "learning_rate": 9.86274370926778e-06, + "loss": 0.7449, + "step": 2809 + }, + { + "epoch": 0.1546590346194067, + "grad_norm": 0.9120833277702332, + "learning_rate": 9.862642823634175e-06, + "loss": 0.8702, + "step": 2810 + }, + { + "epoch": 0.15471407342176235, + "grad_norm": 0.8729703426361084, + "learning_rate": 9.862541901454246e-06, + "loss": 0.8064, + "step": 2811 + }, + { + "epoch": 0.15476911222411802, + "grad_norm": 0.7935470342636108, + "learning_rate": 9.862440942728754e-06, + "loss": 0.8502, + "step": 2812 + }, + { + "epoch": 0.15482415102647368, + "grad_norm": 0.8640689849853516, + "learning_rate": 9.86233994745846e-06, + "loss": 0.8159, + "step": 2813 + }, + { + "epoch": 0.1548791898288293, + "grad_norm": 0.9959222078323364, + "learning_rate": 9.862238915644116e-06, + "loss": 0.7767, + "step": 2814 + }, + { + "epoch": 0.15493422863118497, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.862137847286487e-06, + "loss": 0.8293, + "step": 2815 + }, + { + "epoch": 0.15498926743354063, + "grad_norm": 0.8764606714248657, + "learning_rate": 9.862036742386335e-06, + "loss": 0.856, + "step": 2816 + }, + { + "epoch": 0.1550443062358963, + "grad_norm": 0.743727445602417, + "learning_rate": 9.861935600944413e-06, + "loss": 0.7099, + "step": 2817 + }, + { + "epoch": 0.15509934503825196, + "grad_norm": 0.7866224050521851, + "learning_rate": 9.861834422961485e-06, + "loss": 0.8805, + "step": 2818 + }, + { + "epoch": 0.15515438384060762, + "grad_norm": 0.8333723545074463, + "learning_rate": 9.861733208438311e-06, + "loss": 0.8486, + "step": 2819 + }, + { + "epoch": 0.15520942264296328, + "grad_norm": 0.8261659741401672, + "learning_rate": 9.861631957375652e-06, + "loss": 0.8896, + "step": 2820 + }, + { + "epoch": 0.15526446144531894, + "grad_norm": 0.8381538987159729, + "learning_rate": 9.861530669774268e-06, + "loss": 0.8686, + "step": 2821 + }, + { + "epoch": 0.1553195002476746, + "grad_norm": 0.9184440970420837, + "learning_rate": 9.861429345634923e-06, + "loss": 0.9702, + "step": 2822 + }, + { + "epoch": 0.15537453905003026, + "grad_norm": 0.8170294165611267, + "learning_rate": 9.861327984958374e-06, + "loss": 0.8298, + "step": 2823 + }, + { + "epoch": 0.15542957785238593, + "grad_norm": 0.8361968398094177, + "learning_rate": 9.861226587745385e-06, + "loss": 0.8232, + "step": 2824 + }, + { + "epoch": 0.1554846166547416, + "grad_norm": 0.7437820434570312, + "learning_rate": 9.861125153996718e-06, + "loss": 0.8271, + "step": 2825 + }, + { + "epoch": 0.15553965545709725, + "grad_norm": 0.715887188911438, + "learning_rate": 9.861023683713137e-06, + "loss": 0.7726, + "step": 2826 + }, + { + "epoch": 0.1555946942594529, + "grad_norm": 0.8358462452888489, + "learning_rate": 9.860922176895403e-06, + "loss": 0.8247, + "step": 2827 + }, + { + "epoch": 0.15564973306180857, + "grad_norm": 0.8620158433914185, + "learning_rate": 9.860820633544278e-06, + "loss": 0.8804, + "step": 2828 + }, + { + "epoch": 0.15570477186416423, + "grad_norm": 0.9035346508026123, + "learning_rate": 9.860719053660527e-06, + "loss": 0.7973, + "step": 2829 + }, + { + "epoch": 0.1557598106665199, + "grad_norm": 0.8014782071113586, + "learning_rate": 9.860617437244914e-06, + "loss": 0.7914, + "step": 2830 + }, + { + "epoch": 0.15581484946887555, + "grad_norm": 0.7788864970207214, + "learning_rate": 9.8605157842982e-06, + "loss": 0.7377, + "step": 2831 + }, + { + "epoch": 0.15586988827123122, + "grad_norm": 0.7475222945213318, + "learning_rate": 9.860414094821152e-06, + "loss": 0.7173, + "step": 2832 + }, + { + "epoch": 0.15592492707358688, + "grad_norm": 0.8866652846336365, + "learning_rate": 9.86031236881453e-06, + "loss": 0.8231, + "step": 2833 + }, + { + "epoch": 0.15597996587594254, + "grad_norm": 0.8725677728652954, + "learning_rate": 9.860210606279102e-06, + "loss": 0.9025, + "step": 2834 + }, + { + "epoch": 0.1560350046782982, + "grad_norm": 0.7608423233032227, + "learning_rate": 9.860108807215634e-06, + "loss": 0.8385, + "step": 2835 + }, + { + "epoch": 0.15609004348065386, + "grad_norm": 0.8237566351890564, + "learning_rate": 9.860006971624887e-06, + "loss": 0.8635, + "step": 2836 + }, + { + "epoch": 0.15614508228300952, + "grad_norm": 0.8078347444534302, + "learning_rate": 9.859905099507629e-06, + "loss": 0.7916, + "step": 2837 + }, + { + "epoch": 0.15620012108536518, + "grad_norm": 0.8282070755958557, + "learning_rate": 9.859803190864626e-06, + "loss": 0.8141, + "step": 2838 + }, + { + "epoch": 0.15625515988772085, + "grad_norm": 0.7639191150665283, + "learning_rate": 9.859701245696642e-06, + "loss": 0.7457, + "step": 2839 + }, + { + "epoch": 0.1563101986900765, + "grad_norm": 0.8429144620895386, + "learning_rate": 9.859599264004446e-06, + "loss": 0.9176, + "step": 2840 + }, + { + "epoch": 0.15636523749243217, + "grad_norm": 0.7792791724205017, + "learning_rate": 9.859497245788801e-06, + "loss": 0.8738, + "step": 2841 + }, + { + "epoch": 0.15642027629478783, + "grad_norm": 0.9018417596817017, + "learning_rate": 9.859395191050476e-06, + "loss": 0.841, + "step": 2842 + }, + { + "epoch": 0.1564753150971435, + "grad_norm": 0.7113705277442932, + "learning_rate": 9.859293099790239e-06, + "loss": 0.6576, + "step": 2843 + }, + { + "epoch": 0.15653035389949915, + "grad_norm": 0.8376311659812927, + "learning_rate": 9.859190972008853e-06, + "loss": 0.8559, + "step": 2844 + }, + { + "epoch": 0.15658539270185481, + "grad_norm": 0.7689141035079956, + "learning_rate": 9.859088807707092e-06, + "loss": 0.7844, + "step": 2845 + }, + { + "epoch": 0.15664043150421048, + "grad_norm": 0.7559483647346497, + "learning_rate": 9.858986606885717e-06, + "loss": 0.8676, + "step": 2846 + }, + { + "epoch": 0.15669547030656614, + "grad_norm": 0.7743827700614929, + "learning_rate": 9.8588843695455e-06, + "loss": 0.7995, + "step": 2847 + }, + { + "epoch": 0.1567505091089218, + "grad_norm": 0.8631327152252197, + "learning_rate": 9.85878209568721e-06, + "loss": 0.801, + "step": 2848 + }, + { + "epoch": 0.15680554791127746, + "grad_norm": 0.7454009056091309, + "learning_rate": 9.858679785311613e-06, + "loss": 0.8172, + "step": 2849 + }, + { + "epoch": 0.15686058671363312, + "grad_norm": 0.7915313839912415, + "learning_rate": 9.858577438419479e-06, + "loss": 0.833, + "step": 2850 + }, + { + "epoch": 0.15691562551598878, + "grad_norm": 0.8472526669502258, + "learning_rate": 9.858475055011578e-06, + "loss": 0.8249, + "step": 2851 + }, + { + "epoch": 0.15697066431834444, + "grad_norm": 0.7967580556869507, + "learning_rate": 9.85837263508868e-06, + "loss": 0.7533, + "step": 2852 + }, + { + "epoch": 0.1570257031207001, + "grad_norm": 0.7476257085800171, + "learning_rate": 9.858270178651554e-06, + "loss": 0.7918, + "step": 2853 + }, + { + "epoch": 0.15708074192305577, + "grad_norm": 0.8736184239387512, + "learning_rate": 9.858167685700968e-06, + "loss": 0.8254, + "step": 2854 + }, + { + "epoch": 0.15713578072541143, + "grad_norm": 0.8734819889068604, + "learning_rate": 9.858065156237694e-06, + "loss": 0.749, + "step": 2855 + }, + { + "epoch": 0.1571908195277671, + "grad_norm": 1.0344874858856201, + "learning_rate": 9.857962590262506e-06, + "loss": 0.9578, + "step": 2856 + }, + { + "epoch": 0.15724585833012272, + "grad_norm": 0.81183922290802, + "learning_rate": 9.857859987776168e-06, + "loss": 0.8845, + "step": 2857 + }, + { + "epoch": 0.15730089713247838, + "grad_norm": 0.8252540230751038, + "learning_rate": 9.857757348779456e-06, + "loss": 0.7862, + "step": 2858 + }, + { + "epoch": 0.15735593593483405, + "grad_norm": 0.7468119859695435, + "learning_rate": 9.85765467327314e-06, + "loss": 0.7587, + "step": 2859 + }, + { + "epoch": 0.1574109747371897, + "grad_norm": 0.8095998167991638, + "learning_rate": 9.857551961257993e-06, + "loss": 0.7467, + "step": 2860 + }, + { + "epoch": 0.15746601353954537, + "grad_norm": 0.8908564448356628, + "learning_rate": 9.857449212734785e-06, + "loss": 0.8199, + "step": 2861 + }, + { + "epoch": 0.15752105234190103, + "grad_norm": 0.7605593204498291, + "learning_rate": 9.857346427704288e-06, + "loss": 0.7196, + "step": 2862 + }, + { + "epoch": 0.1575760911442567, + "grad_norm": 0.9250784516334534, + "learning_rate": 9.857243606167276e-06, + "loss": 0.7366, + "step": 2863 + }, + { + "epoch": 0.15763112994661235, + "grad_norm": 0.882796585559845, + "learning_rate": 9.85714074812452e-06, + "loss": 0.8422, + "step": 2864 + }, + { + "epoch": 0.15768616874896801, + "grad_norm": 1.0014574527740479, + "learning_rate": 9.857037853576797e-06, + "loss": 0.8762, + "step": 2865 + }, + { + "epoch": 0.15774120755132368, + "grad_norm": 0.86713045835495, + "learning_rate": 9.856934922524877e-06, + "loss": 0.9282, + "step": 2866 + }, + { + "epoch": 0.15779624635367934, + "grad_norm": 1.1457390785217285, + "learning_rate": 9.856831954969532e-06, + "loss": 0.7947, + "step": 2867 + }, + { + "epoch": 0.157851285156035, + "grad_norm": 0.8902556896209717, + "learning_rate": 9.85672895091154e-06, + "loss": 0.928, + "step": 2868 + }, + { + "epoch": 0.15790632395839066, + "grad_norm": 0.7978467345237732, + "learning_rate": 9.856625910351674e-06, + "loss": 0.7382, + "step": 2869 + }, + { + "epoch": 0.15796136276074632, + "grad_norm": 0.741457462310791, + "learning_rate": 9.856522833290705e-06, + "loss": 0.7736, + "step": 2870 + }, + { + "epoch": 0.15801640156310198, + "grad_norm": 0.8330628871917725, + "learning_rate": 9.856419719729413e-06, + "loss": 0.8396, + "step": 2871 + }, + { + "epoch": 0.15807144036545764, + "grad_norm": 0.8771876692771912, + "learning_rate": 9.85631656966857e-06, + "loss": 0.6669, + "step": 2872 + }, + { + "epoch": 0.1581264791678133, + "grad_norm": 0.8073394298553467, + "learning_rate": 9.85621338310895e-06, + "loss": 0.8206, + "step": 2873 + }, + { + "epoch": 0.15818151797016897, + "grad_norm": 1.1058349609375, + "learning_rate": 9.85611016005133e-06, + "loss": 0.9526, + "step": 2874 + }, + { + "epoch": 0.15823655677252463, + "grad_norm": 0.7734992504119873, + "learning_rate": 9.856006900496488e-06, + "loss": 0.7477, + "step": 2875 + }, + { + "epoch": 0.1582915955748803, + "grad_norm": 0.9053219556808472, + "learning_rate": 9.855903604445196e-06, + "loss": 0.8009, + "step": 2876 + }, + { + "epoch": 0.15834663437723595, + "grad_norm": 0.8774041533470154, + "learning_rate": 9.855800271898233e-06, + "loss": 0.854, + "step": 2877 + }, + { + "epoch": 0.1584016731795916, + "grad_norm": 0.8346550464630127, + "learning_rate": 9.855696902856376e-06, + "loss": 0.7976, + "step": 2878 + }, + { + "epoch": 0.15845671198194727, + "grad_norm": 0.7781139016151428, + "learning_rate": 9.855593497320401e-06, + "loss": 0.7693, + "step": 2879 + }, + { + "epoch": 0.15851175078430293, + "grad_norm": 0.8707864880561829, + "learning_rate": 9.855490055291084e-06, + "loss": 0.882, + "step": 2880 + }, + { + "epoch": 0.1585667895866586, + "grad_norm": 0.7982275485992432, + "learning_rate": 9.855386576769203e-06, + "loss": 0.8457, + "step": 2881 + }, + { + "epoch": 0.15862182838901426, + "grad_norm": 0.7577090263366699, + "learning_rate": 9.855283061755536e-06, + "loss": 0.754, + "step": 2882 + }, + { + "epoch": 0.15867686719136992, + "grad_norm": 0.7422069311141968, + "learning_rate": 9.855179510250863e-06, + "loss": 0.673, + "step": 2883 + }, + { + "epoch": 0.15873190599372558, + "grad_norm": 0.7730041742324829, + "learning_rate": 9.85507592225596e-06, + "loss": 0.7888, + "step": 2884 + }, + { + "epoch": 0.15878694479608124, + "grad_norm": 0.7370560169219971, + "learning_rate": 9.854972297771605e-06, + "loss": 0.7762, + "step": 2885 + }, + { + "epoch": 0.1588419835984369, + "grad_norm": 0.725074291229248, + "learning_rate": 9.854868636798577e-06, + "loss": 0.7951, + "step": 2886 + }, + { + "epoch": 0.15889702240079256, + "grad_norm": 0.8088375926017761, + "learning_rate": 9.854764939337657e-06, + "loss": 0.8557, + "step": 2887 + }, + { + "epoch": 0.15895206120314823, + "grad_norm": 0.8268256187438965, + "learning_rate": 9.854661205389624e-06, + "loss": 0.7641, + "step": 2888 + }, + { + "epoch": 0.1590071000055039, + "grad_norm": 0.8079462051391602, + "learning_rate": 9.854557434955257e-06, + "loss": 0.7947, + "step": 2889 + }, + { + "epoch": 0.15906213880785955, + "grad_norm": 0.8257912993431091, + "learning_rate": 9.854453628035335e-06, + "loss": 0.771, + "step": 2890 + }, + { + "epoch": 0.1591171776102152, + "grad_norm": 0.8901774287223816, + "learning_rate": 9.85434978463064e-06, + "loss": 0.9415, + "step": 2891 + }, + { + "epoch": 0.15917221641257087, + "grad_norm": 0.8283013105392456, + "learning_rate": 9.854245904741948e-06, + "loss": 0.7267, + "step": 2892 + }, + { + "epoch": 0.15922725521492653, + "grad_norm": 0.8665382266044617, + "learning_rate": 9.854141988370045e-06, + "loss": 0.8681, + "step": 2893 + }, + { + "epoch": 0.1592822940172822, + "grad_norm": 0.786494255065918, + "learning_rate": 9.854038035515712e-06, + "loss": 0.7614, + "step": 2894 + }, + { + "epoch": 0.15933733281963786, + "grad_norm": 1.0548759698867798, + "learning_rate": 9.853934046179727e-06, + "loss": 0.861, + "step": 2895 + }, + { + "epoch": 0.15939237162199352, + "grad_norm": 0.8565425276756287, + "learning_rate": 9.853830020362873e-06, + "loss": 0.7858, + "step": 2896 + }, + { + "epoch": 0.15944741042434918, + "grad_norm": 0.7982691526412964, + "learning_rate": 9.853725958065933e-06, + "loss": 0.8797, + "step": 2897 + }, + { + "epoch": 0.15950244922670484, + "grad_norm": 0.8613169193267822, + "learning_rate": 9.853621859289686e-06, + "loss": 0.9217, + "step": 2898 + }, + { + "epoch": 0.1595574880290605, + "grad_norm": 0.950639009475708, + "learning_rate": 9.853517724034918e-06, + "loss": 0.8315, + "step": 2899 + }, + { + "epoch": 0.15961252683141613, + "grad_norm": 0.7940176129341125, + "learning_rate": 9.853413552302409e-06, + "loss": 0.7713, + "step": 2900 + }, + { + "epoch": 0.1596675656337718, + "grad_norm": 0.7716153264045715, + "learning_rate": 9.853309344092944e-06, + "loss": 0.7922, + "step": 2901 + }, + { + "epoch": 0.15972260443612746, + "grad_norm": 0.7626190781593323, + "learning_rate": 9.853205099407303e-06, + "loss": 0.7278, + "step": 2902 + }, + { + "epoch": 0.15977764323848312, + "grad_norm": 0.8523116707801819, + "learning_rate": 9.853100818246274e-06, + "loss": 0.8136, + "step": 2903 + }, + { + "epoch": 0.15983268204083878, + "grad_norm": 0.7636643052101135, + "learning_rate": 9.852996500610637e-06, + "loss": 0.6984, + "step": 2904 + }, + { + "epoch": 0.15988772084319444, + "grad_norm": 0.799201250076294, + "learning_rate": 9.852892146501179e-06, + "loss": 0.8319, + "step": 2905 + }, + { + "epoch": 0.1599427596455501, + "grad_norm": 0.7743694186210632, + "learning_rate": 9.85278775591868e-06, + "loss": 0.81, + "step": 2906 + }, + { + "epoch": 0.15999779844790576, + "grad_norm": 0.8964856863021851, + "learning_rate": 9.85268332886393e-06, + "loss": 0.9227, + "step": 2907 + }, + { + "epoch": 0.16005283725026143, + "grad_norm": 0.8809369802474976, + "learning_rate": 9.852578865337708e-06, + "loss": 0.9285, + "step": 2908 + }, + { + "epoch": 0.1601078760526171, + "grad_norm": 0.8960002064704895, + "learning_rate": 9.852474365340806e-06, + "loss": 0.8611, + "step": 2909 + }, + { + "epoch": 0.16016291485497275, + "grad_norm": 0.7539754509925842, + "learning_rate": 9.852369828874002e-06, + "loss": 0.7455, + "step": 2910 + }, + { + "epoch": 0.1602179536573284, + "grad_norm": 0.8189692497253418, + "learning_rate": 9.852265255938088e-06, + "loss": 0.8321, + "step": 2911 + }, + { + "epoch": 0.16027299245968407, + "grad_norm": 0.8708549737930298, + "learning_rate": 9.852160646533844e-06, + "loss": 0.8373, + "step": 2912 + }, + { + "epoch": 0.16032803126203973, + "grad_norm": 0.7701451778411865, + "learning_rate": 9.852056000662063e-06, + "loss": 0.805, + "step": 2913 + }, + { + "epoch": 0.1603830700643954, + "grad_norm": 0.9111948609352112, + "learning_rate": 9.851951318323526e-06, + "loss": 0.8513, + "step": 2914 + }, + { + "epoch": 0.16043810886675106, + "grad_norm": 0.7863909602165222, + "learning_rate": 9.85184659951902e-06, + "loss": 0.7856, + "step": 2915 + }, + { + "epoch": 0.16049314766910672, + "grad_norm": 0.9000817537307739, + "learning_rate": 9.851741844249336e-06, + "loss": 0.9172, + "step": 2916 + }, + { + "epoch": 0.16054818647146238, + "grad_norm": 1.0953118801116943, + "learning_rate": 9.851637052515259e-06, + "loss": 0.8564, + "step": 2917 + }, + { + "epoch": 0.16060322527381804, + "grad_norm": 0.8405389785766602, + "learning_rate": 9.851532224317575e-06, + "loss": 0.8317, + "step": 2918 + }, + { + "epoch": 0.1606582640761737, + "grad_norm": 0.8524565100669861, + "learning_rate": 9.851427359657075e-06, + "loss": 0.8765, + "step": 2919 + }, + { + "epoch": 0.16071330287852936, + "grad_norm": 0.8234089016914368, + "learning_rate": 9.851322458534546e-06, + "loss": 0.7873, + "step": 2920 + }, + { + "epoch": 0.16076834168088502, + "grad_norm": 0.7879638671875, + "learning_rate": 9.851217520950775e-06, + "loss": 0.8394, + "step": 2921 + }, + { + "epoch": 0.16082338048324069, + "grad_norm": 0.8168820738792419, + "learning_rate": 9.851112546906552e-06, + "loss": 0.8223, + "step": 2922 + }, + { + "epoch": 0.16087841928559635, + "grad_norm": 0.9423845410346985, + "learning_rate": 9.851007536402666e-06, + "loss": 0.9256, + "step": 2923 + }, + { + "epoch": 0.160933458087952, + "grad_norm": 0.7875099778175354, + "learning_rate": 9.850902489439906e-06, + "loss": 0.8199, + "step": 2924 + }, + { + "epoch": 0.16098849689030767, + "grad_norm": 0.6934793591499329, + "learning_rate": 9.85079740601906e-06, + "loss": 0.671, + "step": 2925 + }, + { + "epoch": 0.16104353569266333, + "grad_norm": 0.8172206282615662, + "learning_rate": 9.85069228614092e-06, + "loss": 0.7633, + "step": 2926 + }, + { + "epoch": 0.161098574495019, + "grad_norm": 0.72749263048172, + "learning_rate": 9.850587129806274e-06, + "loss": 0.8719, + "step": 2927 + }, + { + "epoch": 0.16115361329737465, + "grad_norm": 0.8416743874549866, + "learning_rate": 9.850481937015917e-06, + "loss": 0.8438, + "step": 2928 + }, + { + "epoch": 0.16120865209973032, + "grad_norm": 0.7415444850921631, + "learning_rate": 9.850376707770633e-06, + "loss": 0.7673, + "step": 2929 + }, + { + "epoch": 0.16126369090208598, + "grad_norm": 0.9364289045333862, + "learning_rate": 9.850271442071217e-06, + "loss": 0.7224, + "step": 2930 + }, + { + "epoch": 0.16131872970444164, + "grad_norm": 0.7314212918281555, + "learning_rate": 9.85016613991846e-06, + "loss": 0.7759, + "step": 2931 + }, + { + "epoch": 0.1613737685067973, + "grad_norm": 0.8940219283103943, + "learning_rate": 9.850060801313151e-06, + "loss": 0.8432, + "step": 2932 + }, + { + "epoch": 0.16142880730915296, + "grad_norm": 0.7499691843986511, + "learning_rate": 9.849955426256084e-06, + "loss": 0.8171, + "step": 2933 + }, + { + "epoch": 0.16148384611150862, + "grad_norm": 0.8123053312301636, + "learning_rate": 9.84985001474805e-06, + "loss": 0.7839, + "step": 2934 + }, + { + "epoch": 0.16153888491386428, + "grad_norm": 0.819618821144104, + "learning_rate": 9.849744566789842e-06, + "loss": 0.9123, + "step": 2935 + }, + { + "epoch": 0.16159392371621994, + "grad_norm": 0.791088342666626, + "learning_rate": 9.849639082382251e-06, + "loss": 0.8347, + "step": 2936 + }, + { + "epoch": 0.1616489625185756, + "grad_norm": 0.8166706562042236, + "learning_rate": 9.849533561526072e-06, + "loss": 0.8309, + "step": 2937 + }, + { + "epoch": 0.16170400132093127, + "grad_norm": 0.7944774031639099, + "learning_rate": 9.849428004222098e-06, + "loss": 0.8387, + "step": 2938 + }, + { + "epoch": 0.16175904012328693, + "grad_norm": 0.7414719462394714, + "learning_rate": 9.849322410471119e-06, + "loss": 0.71, + "step": 2939 + }, + { + "epoch": 0.1618140789256426, + "grad_norm": 0.8983511924743652, + "learning_rate": 9.849216780273931e-06, + "loss": 0.8902, + "step": 2940 + }, + { + "epoch": 0.16186911772799825, + "grad_norm": 0.9058687686920166, + "learning_rate": 9.849111113631329e-06, + "loss": 0.8804, + "step": 2941 + }, + { + "epoch": 0.1619241565303539, + "grad_norm": 0.948871374130249, + "learning_rate": 9.849005410544105e-06, + "loss": 0.9871, + "step": 2942 + }, + { + "epoch": 0.16197919533270955, + "grad_norm": 0.8240115642547607, + "learning_rate": 9.848899671013055e-06, + "loss": 0.8708, + "step": 2943 + }, + { + "epoch": 0.1620342341350652, + "grad_norm": 0.879953145980835, + "learning_rate": 9.848793895038972e-06, + "loss": 0.9279, + "step": 2944 + }, + { + "epoch": 0.16208927293742087, + "grad_norm": 0.8464690446853638, + "learning_rate": 9.848688082622653e-06, + "loss": 0.8418, + "step": 2945 + }, + { + "epoch": 0.16214431173977653, + "grad_norm": 0.8990732431411743, + "learning_rate": 9.848582233764891e-06, + "loss": 0.8622, + "step": 2946 + }, + { + "epoch": 0.1621993505421322, + "grad_norm": 0.8054911494255066, + "learning_rate": 9.848476348466483e-06, + "loss": 0.8295, + "step": 2947 + }, + { + "epoch": 0.16225438934448785, + "grad_norm": 0.7904845476150513, + "learning_rate": 9.848370426728226e-06, + "loss": 0.7777, + "step": 2948 + }, + { + "epoch": 0.16230942814684352, + "grad_norm": 1.0143954753875732, + "learning_rate": 9.848264468550915e-06, + "loss": 0.8556, + "step": 2949 + }, + { + "epoch": 0.16236446694919918, + "grad_norm": 0.7201125621795654, + "learning_rate": 9.848158473935344e-06, + "loss": 0.7981, + "step": 2950 + }, + { + "epoch": 0.16241950575155484, + "grad_norm": 0.8322157263755798, + "learning_rate": 9.848052442882312e-06, + "loss": 0.8323, + "step": 2951 + }, + { + "epoch": 0.1624745445539105, + "grad_norm": 0.7740346193313599, + "learning_rate": 9.847946375392617e-06, + "loss": 0.8355, + "step": 2952 + }, + { + "epoch": 0.16252958335626616, + "grad_norm": 0.8955645561218262, + "learning_rate": 9.847840271467053e-06, + "loss": 0.7161, + "step": 2953 + }, + { + "epoch": 0.16258462215862182, + "grad_norm": 0.800364077091217, + "learning_rate": 9.847734131106421e-06, + "loss": 0.8165, + "step": 2954 + }, + { + "epoch": 0.16263966096097748, + "grad_norm": 0.8305484056472778, + "learning_rate": 9.847627954311516e-06, + "loss": 0.7846, + "step": 2955 + }, + { + "epoch": 0.16269469976333314, + "grad_norm": 0.7354590892791748, + "learning_rate": 9.847521741083136e-06, + "loss": 0.7743, + "step": 2956 + }, + { + "epoch": 0.1627497385656888, + "grad_norm": 0.8173812627792358, + "learning_rate": 9.847415491422083e-06, + "loss": 0.8626, + "step": 2957 + }, + { + "epoch": 0.16280477736804447, + "grad_norm": 0.7959356307983398, + "learning_rate": 9.84730920532915e-06, + "loss": 0.8016, + "step": 2958 + }, + { + "epoch": 0.16285981617040013, + "grad_norm": 0.8256500363349915, + "learning_rate": 9.84720288280514e-06, + "loss": 0.7407, + "step": 2959 + }, + { + "epoch": 0.1629148549727558, + "grad_norm": 0.8522148728370667, + "learning_rate": 9.84709652385085e-06, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.16296989377511145, + "grad_norm": 0.7791039943695068, + "learning_rate": 9.84699012846708e-06, + "loss": 0.7239, + "step": 2961 + }, + { + "epoch": 0.1630249325774671, + "grad_norm": 0.84294193983078, + "learning_rate": 9.84688369665463e-06, + "loss": 0.7498, + "step": 2962 + }, + { + "epoch": 0.16307997137982277, + "grad_norm": 0.7948899865150452, + "learning_rate": 9.846777228414299e-06, + "loss": 0.7713, + "step": 2963 + }, + { + "epoch": 0.16313501018217844, + "grad_norm": 0.6673180460929871, + "learning_rate": 9.846670723746888e-06, + "loss": 0.6759, + "step": 2964 + }, + { + "epoch": 0.1631900489845341, + "grad_norm": 0.8141015768051147, + "learning_rate": 9.846564182653199e-06, + "loss": 0.7928, + "step": 2965 + }, + { + "epoch": 0.16324508778688976, + "grad_norm": 0.967830240726471, + "learning_rate": 9.846457605134028e-06, + "loss": 0.823, + "step": 2966 + }, + { + "epoch": 0.16330012658924542, + "grad_norm": 0.8099361658096313, + "learning_rate": 9.84635099119018e-06, + "loss": 0.8724, + "step": 2967 + }, + { + "epoch": 0.16335516539160108, + "grad_norm": 0.7913978099822998, + "learning_rate": 9.846244340822456e-06, + "loss": 0.7106, + "step": 2968 + }, + { + "epoch": 0.16341020419395674, + "grad_norm": 0.8308563828468323, + "learning_rate": 9.846137654031655e-06, + "loss": 0.7631, + "step": 2969 + }, + { + "epoch": 0.1634652429963124, + "grad_norm": 0.8634191751480103, + "learning_rate": 9.846030930818582e-06, + "loss": 0.7363, + "step": 2970 + }, + { + "epoch": 0.16352028179866807, + "grad_norm": 0.8936432600021362, + "learning_rate": 9.845924171184038e-06, + "loss": 0.8714, + "step": 2971 + }, + { + "epoch": 0.16357532060102373, + "grad_norm": 0.8776300549507141, + "learning_rate": 9.845817375128825e-06, + "loss": 0.914, + "step": 2972 + }, + { + "epoch": 0.1636303594033794, + "grad_norm": 0.8793039321899414, + "learning_rate": 9.845710542653745e-06, + "loss": 0.7999, + "step": 2973 + }, + { + "epoch": 0.16368539820573505, + "grad_norm": 0.8391743302345276, + "learning_rate": 9.845603673759603e-06, + "loss": 0.8124, + "step": 2974 + }, + { + "epoch": 0.1637404370080907, + "grad_norm": 0.8487186431884766, + "learning_rate": 9.845496768447199e-06, + "loss": 0.8551, + "step": 2975 + }, + { + "epoch": 0.16379547581044637, + "grad_norm": 0.7780638933181763, + "learning_rate": 9.845389826717339e-06, + "loss": 0.7281, + "step": 2976 + }, + { + "epoch": 0.16385051461280203, + "grad_norm": 0.7209637761116028, + "learning_rate": 9.845282848570825e-06, + "loss": 0.6737, + "step": 2977 + }, + { + "epoch": 0.1639055534151577, + "grad_norm": 0.8414756059646606, + "learning_rate": 9.845175834008464e-06, + "loss": 0.8003, + "step": 2978 + }, + { + "epoch": 0.16396059221751336, + "grad_norm": 1.2730679512023926, + "learning_rate": 9.845068783031057e-06, + "loss": 0.8243, + "step": 2979 + }, + { + "epoch": 0.16401563101986902, + "grad_norm": 0.8573475480079651, + "learning_rate": 9.844961695639413e-06, + "loss": 0.7844, + "step": 2980 + }, + { + "epoch": 0.16407066982222468, + "grad_norm": 0.8029958605766296, + "learning_rate": 9.84485457183433e-06, + "loss": 0.7722, + "step": 2981 + }, + { + "epoch": 0.16412570862458034, + "grad_norm": 0.7839805483818054, + "learning_rate": 9.844747411616619e-06, + "loss": 0.8146, + "step": 2982 + }, + { + "epoch": 0.164180747426936, + "grad_norm": 0.7563499212265015, + "learning_rate": 9.844640214987082e-06, + "loss": 0.6909, + "step": 2983 + }, + { + "epoch": 0.16423578622929166, + "grad_norm": 0.7199193239212036, + "learning_rate": 9.844532981946527e-06, + "loss": 0.702, + "step": 2984 + }, + { + "epoch": 0.16429082503164733, + "grad_norm": 0.7519383430480957, + "learning_rate": 9.844425712495758e-06, + "loss": 0.6493, + "step": 2985 + }, + { + "epoch": 0.16434586383400296, + "grad_norm": 0.7493193745613098, + "learning_rate": 9.844318406635584e-06, + "loss": 0.8318, + "step": 2986 + }, + { + "epoch": 0.16440090263635862, + "grad_norm": 0.7951106429100037, + "learning_rate": 9.84421106436681e-06, + "loss": 0.923, + "step": 2987 + }, + { + "epoch": 0.16445594143871428, + "grad_norm": 0.8350820541381836, + "learning_rate": 9.844103685690238e-06, + "loss": 0.8091, + "step": 2988 + }, + { + "epoch": 0.16451098024106994, + "grad_norm": 0.773932695388794, + "learning_rate": 9.843996270606683e-06, + "loss": 0.8016, + "step": 2989 + }, + { + "epoch": 0.1645660190434256, + "grad_norm": 0.8208432793617249, + "learning_rate": 9.843888819116947e-06, + "loss": 0.7704, + "step": 2990 + }, + { + "epoch": 0.16462105784578127, + "grad_norm": 0.8552223443984985, + "learning_rate": 9.84378133122184e-06, + "loss": 0.8519, + "step": 2991 + }, + { + "epoch": 0.16467609664813693, + "grad_norm": 0.8015661835670471, + "learning_rate": 9.84367380692217e-06, + "loss": 0.8389, + "step": 2992 + }, + { + "epoch": 0.1647311354504926, + "grad_norm": 0.7828749418258667, + "learning_rate": 9.843566246218743e-06, + "loss": 0.7385, + "step": 2993 + }, + { + "epoch": 0.16478617425284825, + "grad_norm": 0.7761647701263428, + "learning_rate": 9.84345864911237e-06, + "loss": 0.8419, + "step": 2994 + }, + { + "epoch": 0.1648412130552039, + "grad_norm": 0.8839839100837708, + "learning_rate": 9.843351015603857e-06, + "loss": 0.8069, + "step": 2995 + }, + { + "epoch": 0.16489625185755957, + "grad_norm": 0.8611735105514526, + "learning_rate": 9.843243345694014e-06, + "loss": 0.9406, + "step": 2996 + }, + { + "epoch": 0.16495129065991523, + "grad_norm": 0.9042683839797974, + "learning_rate": 9.84313563938365e-06, + "loss": 0.821, + "step": 2997 + }, + { + "epoch": 0.1650063294622709, + "grad_norm": 0.8333690762519836, + "learning_rate": 9.843027896673577e-06, + "loss": 0.781, + "step": 2998 + }, + { + "epoch": 0.16506136826462656, + "grad_norm": 0.819922924041748, + "learning_rate": 9.8429201175646e-06, + "loss": 0.869, + "step": 2999 + }, + { + "epoch": 0.16511640706698222, + "grad_norm": 0.8349948525428772, + "learning_rate": 9.842812302057534e-06, + "loss": 0.9271, + "step": 3000 + }, + { + "epoch": 0.16517144586933788, + "grad_norm": 0.8981684446334839, + "learning_rate": 9.842704450153187e-06, + "loss": 0.7384, + "step": 3001 + }, + { + "epoch": 0.16522648467169354, + "grad_norm": 0.839133083820343, + "learning_rate": 9.842596561852369e-06, + "loss": 0.9016, + "step": 3002 + }, + { + "epoch": 0.1652815234740492, + "grad_norm": 0.8303349614143372, + "learning_rate": 9.842488637155891e-06, + "loss": 0.7488, + "step": 3003 + }, + { + "epoch": 0.16533656227640486, + "grad_norm": 0.8748323917388916, + "learning_rate": 9.842380676064566e-06, + "loss": 0.8163, + "step": 3004 + }, + { + "epoch": 0.16539160107876053, + "grad_norm": 0.782844603061676, + "learning_rate": 9.842272678579203e-06, + "loss": 0.8465, + "step": 3005 + }, + { + "epoch": 0.1654466398811162, + "grad_norm": 0.8068844676017761, + "learning_rate": 9.842164644700615e-06, + "loss": 0.8856, + "step": 3006 + }, + { + "epoch": 0.16550167868347185, + "grad_norm": 0.9174006581306458, + "learning_rate": 9.842056574429615e-06, + "loss": 0.7748, + "step": 3007 + }, + { + "epoch": 0.1655567174858275, + "grad_norm": 0.7453809380531311, + "learning_rate": 9.841948467767012e-06, + "loss": 0.7565, + "step": 3008 + }, + { + "epoch": 0.16561175628818317, + "grad_norm": 0.8408182859420776, + "learning_rate": 9.841840324713622e-06, + "loss": 0.7345, + "step": 3009 + }, + { + "epoch": 0.16566679509053883, + "grad_norm": 0.8599638938903809, + "learning_rate": 9.841732145270254e-06, + "loss": 0.8163, + "step": 3010 + }, + { + "epoch": 0.1657218338928945, + "grad_norm": 0.877616822719574, + "learning_rate": 9.841623929437725e-06, + "loss": 0.8685, + "step": 3011 + }, + { + "epoch": 0.16577687269525015, + "grad_norm": 0.7765643000602722, + "learning_rate": 9.841515677216846e-06, + "loss": 0.7281, + "step": 3012 + }, + { + "epoch": 0.16583191149760582, + "grad_norm": 0.7891712784767151, + "learning_rate": 9.841407388608431e-06, + "loss": 0.8618, + "step": 3013 + }, + { + "epoch": 0.16588695029996148, + "grad_norm": 0.9215571284294128, + "learning_rate": 9.841299063613295e-06, + "loss": 0.8709, + "step": 3014 + }, + { + "epoch": 0.16594198910231714, + "grad_norm": 0.8428288698196411, + "learning_rate": 9.841190702232249e-06, + "loss": 0.8227, + "step": 3015 + }, + { + "epoch": 0.1659970279046728, + "grad_norm": 0.9294042587280273, + "learning_rate": 9.841082304466112e-06, + "loss": 0.8203, + "step": 3016 + }, + { + "epoch": 0.16605206670702846, + "grad_norm": 0.7530880570411682, + "learning_rate": 9.840973870315695e-06, + "loss": 0.7681, + "step": 3017 + }, + { + "epoch": 0.16610710550938412, + "grad_norm": 1.0149626731872559, + "learning_rate": 9.840865399781814e-06, + "loss": 0.9212, + "step": 3018 + }, + { + "epoch": 0.16616214431173978, + "grad_norm": 0.7595353722572327, + "learning_rate": 9.840756892865285e-06, + "loss": 0.795, + "step": 3019 + }, + { + "epoch": 0.16621718311409545, + "grad_norm": 0.7893253564834595, + "learning_rate": 9.840648349566924e-06, + "loss": 0.8147, + "step": 3020 + }, + { + "epoch": 0.1662722219164511, + "grad_norm": 0.8190789222717285, + "learning_rate": 9.840539769887543e-06, + "loss": 0.7233, + "step": 3021 + }, + { + "epoch": 0.16632726071880677, + "grad_norm": 0.7771229147911072, + "learning_rate": 9.840431153827963e-06, + "loss": 0.7172, + "step": 3022 + }, + { + "epoch": 0.16638229952116243, + "grad_norm": 0.7379328012466431, + "learning_rate": 9.840322501388998e-06, + "loss": 0.7603, + "step": 3023 + }, + { + "epoch": 0.1664373383235181, + "grad_norm": 0.9488499760627747, + "learning_rate": 9.840213812571464e-06, + "loss": 0.8025, + "step": 3024 + }, + { + "epoch": 0.16649237712587375, + "grad_norm": 0.7135865092277527, + "learning_rate": 9.84010508737618e-06, + "loss": 0.7412, + "step": 3025 + }, + { + "epoch": 0.16654741592822941, + "grad_norm": 1.6780112981796265, + "learning_rate": 9.83999632580396e-06, + "loss": 0.9231, + "step": 3026 + }, + { + "epoch": 0.16660245473058508, + "grad_norm": 0.8815935850143433, + "learning_rate": 9.839887527855623e-06, + "loss": 0.7903, + "step": 3027 + }, + { + "epoch": 0.16665749353294074, + "grad_norm": 0.8942261338233948, + "learning_rate": 9.83977869353199e-06, + "loss": 0.8328, + "step": 3028 + }, + { + "epoch": 0.16671253233529637, + "grad_norm": 0.7866815328598022, + "learning_rate": 9.839669822833873e-06, + "loss": 0.8483, + "step": 3029 + }, + { + "epoch": 0.16676757113765203, + "grad_norm": 0.8133070468902588, + "learning_rate": 9.839560915762094e-06, + "loss": 0.8665, + "step": 3030 + }, + { + "epoch": 0.1668226099400077, + "grad_norm": 0.7442927360534668, + "learning_rate": 9.839451972317469e-06, + "loss": 0.6296, + "step": 3031 + }, + { + "epoch": 0.16687764874236335, + "grad_norm": 0.7505021691322327, + "learning_rate": 9.83934299250082e-06, + "loss": 0.7976, + "step": 3032 + }, + { + "epoch": 0.16693268754471902, + "grad_norm": 0.8310422897338867, + "learning_rate": 9.839233976312964e-06, + "loss": 0.9022, + "step": 3033 + }, + { + "epoch": 0.16698772634707468, + "grad_norm": 0.8175413012504578, + "learning_rate": 9.839124923754721e-06, + "loss": 0.8653, + "step": 3034 + }, + { + "epoch": 0.16704276514943034, + "grad_norm": 0.7963089346885681, + "learning_rate": 9.839015834826912e-06, + "loss": 0.7888, + "step": 3035 + }, + { + "epoch": 0.167097803951786, + "grad_norm": 0.8923391699790955, + "learning_rate": 9.838906709530353e-06, + "loss": 0.9396, + "step": 3036 + }, + { + "epoch": 0.16715284275414166, + "grad_norm": 0.7851678133010864, + "learning_rate": 9.838797547865869e-06, + "loss": 0.8163, + "step": 3037 + }, + { + "epoch": 0.16720788155649732, + "grad_norm": 0.817877471446991, + "learning_rate": 9.838688349834275e-06, + "loss": 0.8928, + "step": 3038 + }, + { + "epoch": 0.16726292035885298, + "grad_norm": 0.7603926062583923, + "learning_rate": 9.838579115436395e-06, + "loss": 0.7418, + "step": 3039 + }, + { + "epoch": 0.16731795916120865, + "grad_norm": 0.8086446523666382, + "learning_rate": 9.83846984467305e-06, + "loss": 0.8017, + "step": 3040 + }, + { + "epoch": 0.1673729979635643, + "grad_norm": 1.4745439291000366, + "learning_rate": 9.838360537545061e-06, + "loss": 0.7964, + "step": 3041 + }, + { + "epoch": 0.16742803676591997, + "grad_norm": 0.778404176235199, + "learning_rate": 9.83825119405325e-06, + "loss": 0.7395, + "step": 3042 + }, + { + "epoch": 0.16748307556827563, + "grad_norm": 0.8245886564254761, + "learning_rate": 9.838141814198439e-06, + "loss": 0.8697, + "step": 3043 + }, + { + "epoch": 0.1675381143706313, + "grad_norm": 0.8395472764968872, + "learning_rate": 9.838032397981448e-06, + "loss": 0.8545, + "step": 3044 + }, + { + "epoch": 0.16759315317298695, + "grad_norm": 0.8973744511604309, + "learning_rate": 9.8379229454031e-06, + "loss": 0.8999, + "step": 3045 + }, + { + "epoch": 0.16764819197534261, + "grad_norm": 1.2034368515014648, + "learning_rate": 9.837813456464219e-06, + "loss": 0.9039, + "step": 3046 + }, + { + "epoch": 0.16770323077769828, + "grad_norm": 0.862167477607727, + "learning_rate": 9.837703931165625e-06, + "loss": 0.889, + "step": 3047 + }, + { + "epoch": 0.16775826958005394, + "grad_norm": 0.7624714970588684, + "learning_rate": 9.837594369508146e-06, + "loss": 0.7072, + "step": 3048 + }, + { + "epoch": 0.1678133083824096, + "grad_norm": 0.7771621346473694, + "learning_rate": 9.8374847714926e-06, + "loss": 0.8769, + "step": 3049 + }, + { + "epoch": 0.16786834718476526, + "grad_norm": 0.7834492921829224, + "learning_rate": 9.837375137119816e-06, + "loss": 0.841, + "step": 3050 + }, + { + "epoch": 0.16792338598712092, + "grad_norm": 0.8175067901611328, + "learning_rate": 9.837265466390612e-06, + "loss": 0.8149, + "step": 3051 + }, + { + "epoch": 0.16797842478947658, + "grad_norm": 0.7474493384361267, + "learning_rate": 9.83715575930582e-06, + "loss": 0.7716, + "step": 3052 + }, + { + "epoch": 0.16803346359183224, + "grad_norm": 1.1263303756713867, + "learning_rate": 9.837046015866257e-06, + "loss": 0.8026, + "step": 3053 + }, + { + "epoch": 0.1680885023941879, + "grad_norm": 0.8741740584373474, + "learning_rate": 9.836936236072752e-06, + "loss": 0.8795, + "step": 3054 + }, + { + "epoch": 0.16814354119654357, + "grad_norm": 0.8108506798744202, + "learning_rate": 9.83682641992613e-06, + "loss": 0.7682, + "step": 3055 + }, + { + "epoch": 0.16819857999889923, + "grad_norm": 0.9380543231964111, + "learning_rate": 9.836716567427213e-06, + "loss": 0.8739, + "step": 3056 + }, + { + "epoch": 0.1682536188012549, + "grad_norm": 0.7755940556526184, + "learning_rate": 9.83660667857683e-06, + "loss": 0.7287, + "step": 3057 + }, + { + "epoch": 0.16830865760361055, + "grad_norm": 0.808907151222229, + "learning_rate": 9.836496753375807e-06, + "loss": 0.7988, + "step": 3058 + }, + { + "epoch": 0.1683636964059662, + "grad_norm": 1.1496524810791016, + "learning_rate": 9.836386791824967e-06, + "loss": 0.8621, + "step": 3059 + }, + { + "epoch": 0.16841873520832187, + "grad_norm": 0.8550384640693665, + "learning_rate": 9.83627679392514e-06, + "loss": 0.913, + "step": 3060 + }, + { + "epoch": 0.16847377401067753, + "grad_norm": 0.761142909526825, + "learning_rate": 9.83616675967715e-06, + "loss": 0.7271, + "step": 3061 + }, + { + "epoch": 0.1685288128130332, + "grad_norm": 0.8496200442314148, + "learning_rate": 9.836056689081828e-06, + "loss": 0.7885, + "step": 3062 + }, + { + "epoch": 0.16858385161538886, + "grad_norm": 0.8310382962226868, + "learning_rate": 9.835946582139996e-06, + "loss": 0.858, + "step": 3063 + }, + { + "epoch": 0.16863889041774452, + "grad_norm": 0.7870821952819824, + "learning_rate": 9.835836438852485e-06, + "loss": 0.7791, + "step": 3064 + }, + { + "epoch": 0.16869392922010018, + "grad_norm": 0.7170534729957581, + "learning_rate": 9.83572625922012e-06, + "loss": 0.6666, + "step": 3065 + }, + { + "epoch": 0.16874896802245584, + "grad_norm": 0.9764187932014465, + "learning_rate": 9.835616043243732e-06, + "loss": 0.8341, + "step": 3066 + }, + { + "epoch": 0.1688040068248115, + "grad_norm": 0.7453315258026123, + "learning_rate": 9.83550579092415e-06, + "loss": 0.7032, + "step": 3067 + }, + { + "epoch": 0.16885904562716716, + "grad_norm": 0.9205759763717651, + "learning_rate": 9.835395502262196e-06, + "loss": 0.813, + "step": 3068 + }, + { + "epoch": 0.16891408442952283, + "grad_norm": 0.8152205944061279, + "learning_rate": 9.835285177258708e-06, + "loss": 0.8275, + "step": 3069 + }, + { + "epoch": 0.1689691232318785, + "grad_norm": 0.8065707087516785, + "learning_rate": 9.83517481591451e-06, + "loss": 0.8648, + "step": 3070 + }, + { + "epoch": 0.16902416203423415, + "grad_norm": 0.7774410247802734, + "learning_rate": 9.835064418230432e-06, + "loss": 0.7818, + "step": 3071 + }, + { + "epoch": 0.16907920083658978, + "grad_norm": 0.8591069579124451, + "learning_rate": 9.834953984207305e-06, + "loss": 0.8055, + "step": 3072 + }, + { + "epoch": 0.16913423963894544, + "grad_norm": 0.7421612739562988, + "learning_rate": 9.834843513845958e-06, + "loss": 0.7543, + "step": 3073 + }, + { + "epoch": 0.1691892784413011, + "grad_norm": 0.7855183482170105, + "learning_rate": 9.83473300714722e-06, + "loss": 0.7011, + "step": 3074 + }, + { + "epoch": 0.16924431724365677, + "grad_norm": 0.8061636686325073, + "learning_rate": 9.834622464111924e-06, + "loss": 0.8096, + "step": 3075 + }, + { + "epoch": 0.16929935604601243, + "grad_norm": 0.8048406839370728, + "learning_rate": 9.834511884740898e-06, + "loss": 0.8166, + "step": 3076 + }, + { + "epoch": 0.1693543948483681, + "grad_norm": 0.8776549696922302, + "learning_rate": 9.834401269034977e-06, + "loss": 0.8169, + "step": 3077 + }, + { + "epoch": 0.16940943365072375, + "grad_norm": 1.0208356380462646, + "learning_rate": 9.83429061699499e-06, + "loss": 0.6976, + "step": 3078 + }, + { + "epoch": 0.1694644724530794, + "grad_norm": 0.7641016840934753, + "learning_rate": 9.834179928621767e-06, + "loss": 0.7109, + "step": 3079 + }, + { + "epoch": 0.16951951125543507, + "grad_norm": 0.7648905515670776, + "learning_rate": 9.834069203916143e-06, + "loss": 0.7927, + "step": 3080 + }, + { + "epoch": 0.16957455005779073, + "grad_norm": 0.7898744344711304, + "learning_rate": 9.833958442878948e-06, + "loss": 0.7911, + "step": 3081 + }, + { + "epoch": 0.1696295888601464, + "grad_norm": 0.8812462687492371, + "learning_rate": 9.833847645511016e-06, + "loss": 0.8381, + "step": 3082 + }, + { + "epoch": 0.16968462766250206, + "grad_norm": 0.8141197562217712, + "learning_rate": 9.833736811813179e-06, + "loss": 0.7422, + "step": 3083 + }, + { + "epoch": 0.16973966646485772, + "grad_norm": 0.7860949635505676, + "learning_rate": 9.83362594178627e-06, + "loss": 0.7568, + "step": 3084 + }, + { + "epoch": 0.16979470526721338, + "grad_norm": 0.6688396334648132, + "learning_rate": 9.833515035431123e-06, + "loss": 0.7143, + "step": 3085 + }, + { + "epoch": 0.16984974406956904, + "grad_norm": 0.7525103092193604, + "learning_rate": 9.833404092748569e-06, + "loss": 0.8026, + "step": 3086 + }, + { + "epoch": 0.1699047828719247, + "grad_norm": 0.8505181670188904, + "learning_rate": 9.833293113739444e-06, + "loss": 0.8894, + "step": 3087 + }, + { + "epoch": 0.16995982167428036, + "grad_norm": 0.8432300090789795, + "learning_rate": 9.833182098404583e-06, + "loss": 0.7801, + "step": 3088 + }, + { + "epoch": 0.17001486047663603, + "grad_norm": 0.7655903100967407, + "learning_rate": 9.833071046744819e-06, + "loss": 0.7838, + "step": 3089 + }, + { + "epoch": 0.1700698992789917, + "grad_norm": 0.8436369895935059, + "learning_rate": 9.832959958760986e-06, + "loss": 0.8636, + "step": 3090 + }, + { + "epoch": 0.17012493808134735, + "grad_norm": 0.7880234122276306, + "learning_rate": 9.83284883445392e-06, + "loss": 0.7701, + "step": 3091 + }, + { + "epoch": 0.170179976883703, + "grad_norm": 0.7713757753372192, + "learning_rate": 9.832737673824455e-06, + "loss": 0.8652, + "step": 3092 + }, + { + "epoch": 0.17023501568605867, + "grad_norm": 0.7905295491218567, + "learning_rate": 9.832626476873428e-06, + "loss": 0.8666, + "step": 3093 + }, + { + "epoch": 0.17029005448841433, + "grad_norm": 0.7589883804321289, + "learning_rate": 9.832515243601675e-06, + "loss": 0.8051, + "step": 3094 + }, + { + "epoch": 0.17034509329077, + "grad_norm": 0.9068838953971863, + "learning_rate": 9.83240397401003e-06, + "loss": 0.9037, + "step": 3095 + }, + { + "epoch": 0.17040013209312566, + "grad_norm": 0.7465278506278992, + "learning_rate": 9.83229266809933e-06, + "loss": 0.7425, + "step": 3096 + }, + { + "epoch": 0.17045517089548132, + "grad_norm": 0.8111177086830139, + "learning_rate": 9.83218132587041e-06, + "loss": 0.8034, + "step": 3097 + }, + { + "epoch": 0.17051020969783698, + "grad_norm": 1.1007672548294067, + "learning_rate": 9.832069947324112e-06, + "loss": 0.9139, + "step": 3098 + }, + { + "epoch": 0.17056524850019264, + "grad_norm": 0.881179690361023, + "learning_rate": 9.831958532461269e-06, + "loss": 0.9062, + "step": 3099 + }, + { + "epoch": 0.1706202873025483, + "grad_norm": 0.8012413382530212, + "learning_rate": 9.831847081282718e-06, + "loss": 0.7956, + "step": 3100 + }, + { + "epoch": 0.17067532610490396, + "grad_norm": 0.741731584072113, + "learning_rate": 9.831735593789298e-06, + "loss": 0.8754, + "step": 3101 + }, + { + "epoch": 0.17073036490725962, + "grad_norm": 0.8945604562759399, + "learning_rate": 9.831624069981848e-06, + "loss": 0.8293, + "step": 3102 + }, + { + "epoch": 0.17078540370961529, + "grad_norm": 0.7865545749664307, + "learning_rate": 9.831512509861203e-06, + "loss": 0.7812, + "step": 3103 + }, + { + "epoch": 0.17084044251197095, + "grad_norm": 0.832847535610199, + "learning_rate": 9.831400913428205e-06, + "loss": 0.8925, + "step": 3104 + }, + { + "epoch": 0.1708954813143266, + "grad_norm": 0.7374216914176941, + "learning_rate": 9.83128928068369e-06, + "loss": 0.8275, + "step": 3105 + }, + { + "epoch": 0.17095052011668227, + "grad_norm": 0.748725414276123, + "learning_rate": 9.831177611628497e-06, + "loss": 0.8364, + "step": 3106 + }, + { + "epoch": 0.17100555891903793, + "grad_norm": 0.810276448726654, + "learning_rate": 9.831065906263468e-06, + "loss": 0.861, + "step": 3107 + }, + { + "epoch": 0.1710605977213936, + "grad_norm": 0.7607758641242981, + "learning_rate": 9.83095416458944e-06, + "loss": 0.7989, + "step": 3108 + }, + { + "epoch": 0.17111563652374925, + "grad_norm": 0.7206127047538757, + "learning_rate": 9.830842386607253e-06, + "loss": 0.7187, + "step": 3109 + }, + { + "epoch": 0.17117067532610492, + "grad_norm": 0.7775895595550537, + "learning_rate": 9.83073057231775e-06, + "loss": 0.8008, + "step": 3110 + }, + { + "epoch": 0.17122571412846058, + "grad_norm": 0.8351094722747803, + "learning_rate": 9.830618721721768e-06, + "loss": 0.8025, + "step": 3111 + }, + { + "epoch": 0.17128075293081624, + "grad_norm": 0.8090646266937256, + "learning_rate": 9.830506834820148e-06, + "loss": 0.8012, + "step": 3112 + }, + { + "epoch": 0.1713357917331719, + "grad_norm": 0.7762801051139832, + "learning_rate": 9.830394911613733e-06, + "loss": 0.8428, + "step": 3113 + }, + { + "epoch": 0.17139083053552756, + "grad_norm": 0.8117541074752808, + "learning_rate": 9.83028295210336e-06, + "loss": 0.8566, + "step": 3114 + }, + { + "epoch": 0.1714458693378832, + "grad_norm": 0.8786184787750244, + "learning_rate": 9.830170956289876e-06, + "loss": 0.8386, + "step": 3115 + }, + { + "epoch": 0.17150090814023886, + "grad_norm": 1.0181046724319458, + "learning_rate": 9.83005892417412e-06, + "loss": 0.8555, + "step": 3116 + }, + { + "epoch": 0.17155594694259452, + "grad_norm": 0.8236173391342163, + "learning_rate": 9.829946855756934e-06, + "loss": 0.7933, + "step": 3117 + }, + { + "epoch": 0.17161098574495018, + "grad_norm": 0.8058149814605713, + "learning_rate": 9.829834751039157e-06, + "loss": 0.842, + "step": 3118 + }, + { + "epoch": 0.17166602454730584, + "grad_norm": 0.7419908046722412, + "learning_rate": 9.82972261002164e-06, + "loss": 0.8397, + "step": 3119 + }, + { + "epoch": 0.1717210633496615, + "grad_norm": 0.7528164982795715, + "learning_rate": 9.829610432705216e-06, + "loss": 0.7931, + "step": 3120 + }, + { + "epoch": 0.17177610215201716, + "grad_norm": 0.7357296943664551, + "learning_rate": 9.829498219090736e-06, + "loss": 0.8089, + "step": 3121 + }, + { + "epoch": 0.17183114095437282, + "grad_norm": 0.7635773420333862, + "learning_rate": 9.829385969179039e-06, + "loss": 0.7442, + "step": 3122 + }, + { + "epoch": 0.17188617975672849, + "grad_norm": 0.8200171589851379, + "learning_rate": 9.82927368297097e-06, + "loss": 0.757, + "step": 3123 + }, + { + "epoch": 0.17194121855908415, + "grad_norm": 0.8367171287536621, + "learning_rate": 9.829161360467374e-06, + "loss": 0.915, + "step": 3124 + }, + { + "epoch": 0.1719962573614398, + "grad_norm": 0.8460778594017029, + "learning_rate": 9.829049001669091e-06, + "loss": 0.8568, + "step": 3125 + }, + { + "epoch": 0.17205129616379547, + "grad_norm": 0.7301799058914185, + "learning_rate": 9.82893660657697e-06, + "loss": 0.8041, + "step": 3126 + }, + { + "epoch": 0.17210633496615113, + "grad_norm": 0.7858132123947144, + "learning_rate": 9.828824175191854e-06, + "loss": 0.8367, + "step": 3127 + }, + { + "epoch": 0.1721613737685068, + "grad_norm": 0.8118360042572021, + "learning_rate": 9.82871170751459e-06, + "loss": 0.85, + "step": 3128 + }, + { + "epoch": 0.17221641257086245, + "grad_norm": 0.9020261764526367, + "learning_rate": 9.828599203546019e-06, + "loss": 0.789, + "step": 3129 + }, + { + "epoch": 0.17227145137321812, + "grad_norm": 0.8194546699523926, + "learning_rate": 9.828486663286989e-06, + "loss": 0.8644, + "step": 3130 + }, + { + "epoch": 0.17232649017557378, + "grad_norm": 0.7764905095100403, + "learning_rate": 9.828374086738345e-06, + "loss": 0.7961, + "step": 3131 + }, + { + "epoch": 0.17238152897792944, + "grad_norm": 0.7712632417678833, + "learning_rate": 9.828261473900935e-06, + "loss": 0.8082, + "step": 3132 + }, + { + "epoch": 0.1724365677802851, + "grad_norm": 0.7100280523300171, + "learning_rate": 9.828148824775604e-06, + "loss": 0.7514, + "step": 3133 + }, + { + "epoch": 0.17249160658264076, + "grad_norm": 0.7812890410423279, + "learning_rate": 9.8280361393632e-06, + "loss": 0.7125, + "step": 3134 + }, + { + "epoch": 0.17254664538499642, + "grad_norm": 0.8772642612457275, + "learning_rate": 9.827923417664568e-06, + "loss": 0.8355, + "step": 3135 + }, + { + "epoch": 0.17260168418735208, + "grad_norm": 0.9161205291748047, + "learning_rate": 9.827810659680555e-06, + "loss": 0.7511, + "step": 3136 + }, + { + "epoch": 0.17265672298970774, + "grad_norm": 0.7628560662269592, + "learning_rate": 9.82769786541201e-06, + "loss": 0.882, + "step": 3137 + }, + { + "epoch": 0.1727117617920634, + "grad_norm": 0.8203405737876892, + "learning_rate": 9.827585034859781e-06, + "loss": 0.8172, + "step": 3138 + }, + { + "epoch": 0.17276680059441907, + "grad_norm": 0.8318095207214355, + "learning_rate": 9.827472168024715e-06, + "loss": 0.7784, + "step": 3139 + }, + { + "epoch": 0.17282183939677473, + "grad_norm": 0.9137747287750244, + "learning_rate": 9.827359264907658e-06, + "loss": 0.8643, + "step": 3140 + }, + { + "epoch": 0.1728768781991304, + "grad_norm": 0.9441068768501282, + "learning_rate": 9.827246325509463e-06, + "loss": 0.7936, + "step": 3141 + }, + { + "epoch": 0.17293191700148605, + "grad_norm": 0.7402390837669373, + "learning_rate": 9.827133349830977e-06, + "loss": 0.7813, + "step": 3142 + }, + { + "epoch": 0.1729869558038417, + "grad_norm": 0.8328836560249329, + "learning_rate": 9.827020337873048e-06, + "loss": 0.7676, + "step": 3143 + }, + { + "epoch": 0.17304199460619737, + "grad_norm": 0.8106881380081177, + "learning_rate": 9.826907289636526e-06, + "loss": 0.9037, + "step": 3144 + }, + { + "epoch": 0.17309703340855304, + "grad_norm": 0.8457425236701965, + "learning_rate": 9.826794205122263e-06, + "loss": 0.78, + "step": 3145 + }, + { + "epoch": 0.1731520722109087, + "grad_norm": 0.9335517883300781, + "learning_rate": 9.826681084331105e-06, + "loss": 0.9197, + "step": 3146 + }, + { + "epoch": 0.17320711101326436, + "grad_norm": 0.9098715782165527, + "learning_rate": 9.826567927263904e-06, + "loss": 0.932, + "step": 3147 + }, + { + "epoch": 0.17326214981562002, + "grad_norm": 0.767234206199646, + "learning_rate": 9.826454733921512e-06, + "loss": 0.8717, + "step": 3148 + }, + { + "epoch": 0.17331718861797568, + "grad_norm": 0.8114444017410278, + "learning_rate": 9.826341504304775e-06, + "loss": 0.8744, + "step": 3149 + }, + { + "epoch": 0.17337222742033134, + "grad_norm": 0.7948976755142212, + "learning_rate": 9.82622823841455e-06, + "loss": 0.7947, + "step": 3150 + }, + { + "epoch": 0.173427266222687, + "grad_norm": 0.7808204889297485, + "learning_rate": 9.826114936251684e-06, + "loss": 0.8151, + "step": 3151 + }, + { + "epoch": 0.17348230502504267, + "grad_norm": 0.733860969543457, + "learning_rate": 9.82600159781703e-06, + "loss": 0.8018, + "step": 3152 + }, + { + "epoch": 0.17353734382739833, + "grad_norm": 0.7630699276924133, + "learning_rate": 9.825888223111442e-06, + "loss": 0.7937, + "step": 3153 + }, + { + "epoch": 0.173592382629754, + "grad_norm": 0.7892931699752808, + "learning_rate": 9.825774812135766e-06, + "loss": 0.782, + "step": 3154 + }, + { + "epoch": 0.17364742143210965, + "grad_norm": 0.6642436385154724, + "learning_rate": 9.825661364890862e-06, + "loss": 0.6611, + "step": 3155 + }, + { + "epoch": 0.1737024602344653, + "grad_norm": 0.7755968570709229, + "learning_rate": 9.825547881377577e-06, + "loss": 0.7835, + "step": 3156 + }, + { + "epoch": 0.17375749903682097, + "grad_norm": 0.8406579494476318, + "learning_rate": 9.825434361596766e-06, + "loss": 0.9178, + "step": 3157 + }, + { + "epoch": 0.1738125378391766, + "grad_norm": 0.8887308835983276, + "learning_rate": 9.825320805549284e-06, + "loss": 0.7951, + "step": 3158 + }, + { + "epoch": 0.17386757664153227, + "grad_norm": 0.85418701171875, + "learning_rate": 9.825207213235978e-06, + "loss": 0.8671, + "step": 3159 + }, + { + "epoch": 0.17392261544388793, + "grad_norm": 0.8831202983856201, + "learning_rate": 9.82509358465771e-06, + "loss": 0.8708, + "step": 3160 + }, + { + "epoch": 0.1739776542462436, + "grad_norm": 0.9041616320610046, + "learning_rate": 9.82497991981533e-06, + "loss": 0.8981, + "step": 3161 + }, + { + "epoch": 0.17403269304859925, + "grad_norm": 0.8169258832931519, + "learning_rate": 9.824866218709692e-06, + "loss": 0.8857, + "step": 3162 + }, + { + "epoch": 0.1740877318509549, + "grad_norm": 0.8714475631713867, + "learning_rate": 9.824752481341651e-06, + "loss": 0.8552, + "step": 3163 + }, + { + "epoch": 0.17414277065331057, + "grad_norm": 0.8261111378669739, + "learning_rate": 9.824638707712061e-06, + "loss": 0.808, + "step": 3164 + }, + { + "epoch": 0.17419780945566624, + "grad_norm": 0.7542527914047241, + "learning_rate": 9.82452489782178e-06, + "loss": 0.8078, + "step": 3165 + }, + { + "epoch": 0.1742528482580219, + "grad_norm": 1.309218168258667, + "learning_rate": 9.824411051671658e-06, + "loss": 0.9325, + "step": 3166 + }, + { + "epoch": 0.17430788706037756, + "grad_norm": 0.8528563380241394, + "learning_rate": 9.824297169262555e-06, + "loss": 0.8493, + "step": 3167 + }, + { + "epoch": 0.17436292586273322, + "grad_norm": 0.7777062058448792, + "learning_rate": 9.824183250595328e-06, + "loss": 0.7002, + "step": 3168 + }, + { + "epoch": 0.17441796466508888, + "grad_norm": 0.7385506629943848, + "learning_rate": 9.824069295670828e-06, + "loss": 0.8396, + "step": 3169 + }, + { + "epoch": 0.17447300346744454, + "grad_norm": 0.8316949605941772, + "learning_rate": 9.823955304489918e-06, + "loss": 0.8769, + "step": 3170 + }, + { + "epoch": 0.1745280422698002, + "grad_norm": 0.8149139285087585, + "learning_rate": 9.823841277053448e-06, + "loss": 0.8009, + "step": 3171 + }, + { + "epoch": 0.17458308107215587, + "grad_norm": 0.8761584162712097, + "learning_rate": 9.82372721336228e-06, + "loss": 0.7366, + "step": 3172 + }, + { + "epoch": 0.17463811987451153, + "grad_norm": 0.7104084491729736, + "learning_rate": 9.82361311341727e-06, + "loss": 0.6704, + "step": 3173 + }, + { + "epoch": 0.1746931586768672, + "grad_norm": 0.791806697845459, + "learning_rate": 9.823498977219273e-06, + "loss": 0.9054, + "step": 3174 + }, + { + "epoch": 0.17474819747922285, + "grad_norm": 0.7675086855888367, + "learning_rate": 9.82338480476915e-06, + "loss": 0.751, + "step": 3175 + }, + { + "epoch": 0.1748032362815785, + "grad_norm": 0.7380725145339966, + "learning_rate": 9.823270596067759e-06, + "loss": 0.7618, + "step": 3176 + }, + { + "epoch": 0.17485827508393417, + "grad_norm": 0.7311519384384155, + "learning_rate": 9.823156351115954e-06, + "loss": 0.7424, + "step": 3177 + }, + { + "epoch": 0.17491331388628983, + "grad_norm": 0.7888365387916565, + "learning_rate": 9.8230420699146e-06, + "loss": 0.7717, + "step": 3178 + }, + { + "epoch": 0.1749683526886455, + "grad_norm": 0.9329265356063843, + "learning_rate": 9.822927752464552e-06, + "loss": 0.8256, + "step": 3179 + }, + { + "epoch": 0.17502339149100116, + "grad_norm": 0.711794912815094, + "learning_rate": 9.822813398766671e-06, + "loss": 0.7373, + "step": 3180 + }, + { + "epoch": 0.17507843029335682, + "grad_norm": 0.8713497519493103, + "learning_rate": 9.822699008821813e-06, + "loss": 0.8135, + "step": 3181 + }, + { + "epoch": 0.17513346909571248, + "grad_norm": 0.6923471689224243, + "learning_rate": 9.822584582630841e-06, + "loss": 0.7589, + "step": 3182 + }, + { + "epoch": 0.17518850789806814, + "grad_norm": 0.8648017048835754, + "learning_rate": 9.822470120194616e-06, + "loss": 0.7828, + "step": 3183 + }, + { + "epoch": 0.1752435467004238, + "grad_norm": 0.8407077789306641, + "learning_rate": 9.822355621513994e-06, + "loss": 0.8537, + "step": 3184 + }, + { + "epoch": 0.17529858550277946, + "grad_norm": 0.8076738119125366, + "learning_rate": 9.822241086589841e-06, + "loss": 0.7827, + "step": 3185 + }, + { + "epoch": 0.17535362430513513, + "grad_norm": 0.8402661085128784, + "learning_rate": 9.822126515423011e-06, + "loss": 0.8247, + "step": 3186 + }, + { + "epoch": 0.1754086631074908, + "grad_norm": 0.8911813497543335, + "learning_rate": 9.822011908014373e-06, + "loss": 0.8996, + "step": 3187 + }, + { + "epoch": 0.17546370190984645, + "grad_norm": 0.8060111999511719, + "learning_rate": 9.821897264364782e-06, + "loss": 0.796, + "step": 3188 + }, + { + "epoch": 0.1755187407122021, + "grad_norm": 0.8476423621177673, + "learning_rate": 9.8217825844751e-06, + "loss": 0.8657, + "step": 3189 + }, + { + "epoch": 0.17557377951455777, + "grad_norm": 0.7614054083824158, + "learning_rate": 9.821667868346194e-06, + "loss": 0.8583, + "step": 3190 + }, + { + "epoch": 0.17562881831691343, + "grad_norm": 0.8312287330627441, + "learning_rate": 9.821553115978923e-06, + "loss": 0.7718, + "step": 3191 + }, + { + "epoch": 0.1756838571192691, + "grad_norm": 0.8199487328529358, + "learning_rate": 9.82143832737415e-06, + "loss": 0.7617, + "step": 3192 + }, + { + "epoch": 0.17573889592162475, + "grad_norm": 0.7529115080833435, + "learning_rate": 9.821323502532733e-06, + "loss": 0.7587, + "step": 3193 + }, + { + "epoch": 0.17579393472398042, + "grad_norm": 0.9205463528633118, + "learning_rate": 9.821208641455542e-06, + "loss": 0.7871, + "step": 3194 + }, + { + "epoch": 0.17584897352633608, + "grad_norm": 0.8055161833763123, + "learning_rate": 9.821093744143437e-06, + "loss": 0.8133, + "step": 3195 + }, + { + "epoch": 0.17590401232869174, + "grad_norm": 0.7322981953620911, + "learning_rate": 9.82097881059728e-06, + "loss": 0.7442, + "step": 3196 + }, + { + "epoch": 0.1759590511310474, + "grad_norm": 1.0465941429138184, + "learning_rate": 9.82086384081794e-06, + "loss": 1.0073, + "step": 3197 + }, + { + "epoch": 0.17601408993340306, + "grad_norm": 0.7607331275939941, + "learning_rate": 9.820748834806278e-06, + "loss": 0.8128, + "step": 3198 + }, + { + "epoch": 0.17606912873575872, + "grad_norm": 0.7901879549026489, + "learning_rate": 9.820633792563156e-06, + "loss": 0.7928, + "step": 3199 + }, + { + "epoch": 0.17612416753811436, + "grad_norm": 0.8010839223861694, + "learning_rate": 9.820518714089442e-06, + "loss": 0.7025, + "step": 3200 + }, + { + "epoch": 0.17617920634047002, + "grad_norm": 0.8511317372322083, + "learning_rate": 9.820403599385999e-06, + "loss": 0.7947, + "step": 3201 + }, + { + "epoch": 0.17623424514282568, + "grad_norm": 0.7978847026824951, + "learning_rate": 9.820288448453693e-06, + "loss": 0.7395, + "step": 3202 + }, + { + "epoch": 0.17628928394518134, + "grad_norm": 0.6991232633590698, + "learning_rate": 9.820173261293388e-06, + "loss": 0.7113, + "step": 3203 + }, + { + "epoch": 0.176344322747537, + "grad_norm": 0.8966444730758667, + "learning_rate": 9.820058037905954e-06, + "loss": 0.7399, + "step": 3204 + }, + { + "epoch": 0.17639936154989266, + "grad_norm": 0.8042632341384888, + "learning_rate": 9.819942778292253e-06, + "loss": 0.8183, + "step": 3205 + }, + { + "epoch": 0.17645440035224833, + "grad_norm": 0.8047537803649902, + "learning_rate": 9.81982748245315e-06, + "loss": 0.852, + "step": 3206 + }, + { + "epoch": 0.176509439154604, + "grad_norm": 0.8277122378349304, + "learning_rate": 9.819712150389517e-06, + "loss": 0.8828, + "step": 3207 + }, + { + "epoch": 0.17656447795695965, + "grad_norm": 0.8677185773849487, + "learning_rate": 9.819596782102216e-06, + "loss": 0.8416, + "step": 3208 + }, + { + "epoch": 0.1766195167593153, + "grad_norm": 0.8750975728034973, + "learning_rate": 9.819481377592115e-06, + "loss": 0.9289, + "step": 3209 + }, + { + "epoch": 0.17667455556167097, + "grad_norm": 0.7665122151374817, + "learning_rate": 9.819365936860084e-06, + "loss": 0.8653, + "step": 3210 + }, + { + "epoch": 0.17672959436402663, + "grad_norm": 0.9341353178024292, + "learning_rate": 9.819250459906989e-06, + "loss": 0.7225, + "step": 3211 + }, + { + "epoch": 0.1767846331663823, + "grad_norm": 0.7007241249084473, + "learning_rate": 9.819134946733696e-06, + "loss": 0.7429, + "step": 3212 + }, + { + "epoch": 0.17683967196873795, + "grad_norm": 0.8001461029052734, + "learning_rate": 9.819019397341074e-06, + "loss": 0.759, + "step": 3213 + }, + { + "epoch": 0.17689471077109362, + "grad_norm": 0.8936446905136108, + "learning_rate": 9.818903811729993e-06, + "loss": 0.8248, + "step": 3214 + }, + { + "epoch": 0.17694974957344928, + "grad_norm": 0.805570125579834, + "learning_rate": 9.818788189901321e-06, + "loss": 0.9214, + "step": 3215 + }, + { + "epoch": 0.17700478837580494, + "grad_norm": 0.7762455940246582, + "learning_rate": 9.818672531855926e-06, + "loss": 0.7848, + "step": 3216 + }, + { + "epoch": 0.1770598271781606, + "grad_norm": 0.8391497731208801, + "learning_rate": 9.81855683759468e-06, + "loss": 0.7543, + "step": 3217 + }, + { + "epoch": 0.17711486598051626, + "grad_norm": 0.8489046692848206, + "learning_rate": 9.818441107118449e-06, + "loss": 0.7908, + "step": 3218 + }, + { + "epoch": 0.17716990478287192, + "grad_norm": 1.0949461460113525, + "learning_rate": 9.818325340428105e-06, + "loss": 0.8255, + "step": 3219 + }, + { + "epoch": 0.17722494358522758, + "grad_norm": 0.8710842132568359, + "learning_rate": 9.81820953752452e-06, + "loss": 0.859, + "step": 3220 + }, + { + "epoch": 0.17727998238758325, + "grad_norm": 0.7936064600944519, + "learning_rate": 9.818093698408558e-06, + "loss": 0.8475, + "step": 3221 + }, + { + "epoch": 0.1773350211899389, + "grad_norm": 0.790341854095459, + "learning_rate": 9.817977823081095e-06, + "loss": 0.8137, + "step": 3222 + }, + { + "epoch": 0.17739005999229457, + "grad_norm": 0.8154531717300415, + "learning_rate": 9.817861911543002e-06, + "loss": 0.8687, + "step": 3223 + }, + { + "epoch": 0.17744509879465023, + "grad_norm": 0.8346067070960999, + "learning_rate": 9.817745963795144e-06, + "loss": 0.8905, + "step": 3224 + }, + { + "epoch": 0.1775001375970059, + "grad_norm": 0.7137764096260071, + "learning_rate": 9.817629979838401e-06, + "loss": 0.7715, + "step": 3225 + }, + { + "epoch": 0.17755517639936155, + "grad_norm": 0.7237628102302551, + "learning_rate": 9.81751395967364e-06, + "loss": 0.7824, + "step": 3226 + }, + { + "epoch": 0.17761021520171721, + "grad_norm": 0.9481163024902344, + "learning_rate": 9.817397903301733e-06, + "loss": 0.7451, + "step": 3227 + }, + { + "epoch": 0.17766525400407288, + "grad_norm": 0.9472424387931824, + "learning_rate": 9.817281810723552e-06, + "loss": 0.8774, + "step": 3228 + }, + { + "epoch": 0.17772029280642854, + "grad_norm": 0.9295538663864136, + "learning_rate": 9.81716568193997e-06, + "loss": 0.8507, + "step": 3229 + }, + { + "epoch": 0.1777753316087842, + "grad_norm": 0.7668172717094421, + "learning_rate": 9.817049516951863e-06, + "loss": 0.8547, + "step": 3230 + }, + { + "epoch": 0.17783037041113986, + "grad_norm": 0.8640413880348206, + "learning_rate": 9.8169333157601e-06, + "loss": 0.8485, + "step": 3231 + }, + { + "epoch": 0.17788540921349552, + "grad_norm": 0.9901431798934937, + "learning_rate": 9.816817078365554e-06, + "loss": 0.9236, + "step": 3232 + }, + { + "epoch": 0.17794044801585118, + "grad_norm": 1.0242371559143066, + "learning_rate": 9.816700804769104e-06, + "loss": 0.8096, + "step": 3233 + }, + { + "epoch": 0.17799548681820684, + "grad_norm": 0.910498857498169, + "learning_rate": 9.816584494971617e-06, + "loss": 0.829, + "step": 3234 + }, + { + "epoch": 0.1780505256205625, + "grad_norm": 0.8254473805427551, + "learning_rate": 9.816468148973972e-06, + "loss": 0.7828, + "step": 3235 + }, + { + "epoch": 0.17810556442291817, + "grad_norm": 0.7971221804618835, + "learning_rate": 9.816351766777039e-06, + "loss": 0.8057, + "step": 3236 + }, + { + "epoch": 0.17816060322527383, + "grad_norm": 0.8151674270629883, + "learning_rate": 9.816235348381697e-06, + "loss": 0.7801, + "step": 3237 + }, + { + "epoch": 0.1782156420276295, + "grad_norm": 0.7587556838989258, + "learning_rate": 9.81611889378882e-06, + "loss": 0.7814, + "step": 3238 + }, + { + "epoch": 0.17827068082998515, + "grad_norm": 0.8843516111373901, + "learning_rate": 9.816002402999283e-06, + "loss": 0.8873, + "step": 3239 + }, + { + "epoch": 0.1783257196323408, + "grad_norm": 0.917859673500061, + "learning_rate": 9.81588587601396e-06, + "loss": 0.8963, + "step": 3240 + }, + { + "epoch": 0.17838075843469647, + "grad_norm": 0.8256439566612244, + "learning_rate": 9.815769312833727e-06, + "loss": 0.9157, + "step": 3241 + }, + { + "epoch": 0.17843579723705214, + "grad_norm": 0.8364603519439697, + "learning_rate": 9.815652713459462e-06, + "loss": 0.8253, + "step": 3242 + }, + { + "epoch": 0.17849083603940777, + "grad_norm": 0.7717131972312927, + "learning_rate": 9.81553607789204e-06, + "loss": 0.7211, + "step": 3243 + }, + { + "epoch": 0.17854587484176343, + "grad_norm": 0.8069111704826355, + "learning_rate": 9.815419406132338e-06, + "loss": 0.8986, + "step": 3244 + }, + { + "epoch": 0.1786009136441191, + "grad_norm": 0.9176943302154541, + "learning_rate": 9.815302698181233e-06, + "loss": 0.8084, + "step": 3245 + }, + { + "epoch": 0.17865595244647475, + "grad_norm": 0.769183874130249, + "learning_rate": 9.815185954039601e-06, + "loss": 0.8084, + "step": 3246 + }, + { + "epoch": 0.17871099124883041, + "grad_norm": 0.8070697784423828, + "learning_rate": 9.815069173708321e-06, + "loss": 0.8371, + "step": 3247 + }, + { + "epoch": 0.17876603005118608, + "grad_norm": 0.7837347388267517, + "learning_rate": 9.81495235718827e-06, + "loss": 0.8015, + "step": 3248 + }, + { + "epoch": 0.17882106885354174, + "grad_norm": 0.9248430728912354, + "learning_rate": 9.814835504480327e-06, + "loss": 0.8396, + "step": 3249 + }, + { + "epoch": 0.1788761076558974, + "grad_norm": 0.7914367914199829, + "learning_rate": 9.814718615585367e-06, + "loss": 0.8068, + "step": 3250 + }, + { + "epoch": 0.17893114645825306, + "grad_norm": 0.8612570762634277, + "learning_rate": 9.814601690504273e-06, + "loss": 0.8227, + "step": 3251 + }, + { + "epoch": 0.17898618526060872, + "grad_norm": 0.7476248741149902, + "learning_rate": 9.81448472923792e-06, + "loss": 0.8609, + "step": 3252 + }, + { + "epoch": 0.17904122406296438, + "grad_norm": 0.7455218434333801, + "learning_rate": 9.81436773178719e-06, + "loss": 0.7992, + "step": 3253 + }, + { + "epoch": 0.17909626286532004, + "grad_norm": 0.7917896509170532, + "learning_rate": 9.814250698152958e-06, + "loss": 0.8383, + "step": 3254 + }, + { + "epoch": 0.1791513016676757, + "grad_norm": 0.6926130652427673, + "learning_rate": 9.81413362833611e-06, + "loss": 0.709, + "step": 3255 + }, + { + "epoch": 0.17920634047003137, + "grad_norm": 0.8219630718231201, + "learning_rate": 9.814016522337519e-06, + "loss": 0.9387, + "step": 3256 + }, + { + "epoch": 0.17926137927238703, + "grad_norm": 0.8588619828224182, + "learning_rate": 9.81389938015807e-06, + "loss": 0.8354, + "step": 3257 + }, + { + "epoch": 0.1793164180747427, + "grad_norm": 0.7868718504905701, + "learning_rate": 9.81378220179864e-06, + "loss": 0.8464, + "step": 3258 + }, + { + "epoch": 0.17937145687709835, + "grad_norm": 0.789479672908783, + "learning_rate": 9.813664987260114e-06, + "loss": 0.8577, + "step": 3259 + }, + { + "epoch": 0.179426495679454, + "grad_norm": 0.8280717730522156, + "learning_rate": 9.81354773654337e-06, + "loss": 0.765, + "step": 3260 + }, + { + "epoch": 0.17948153448180967, + "grad_norm": 0.7660181522369385, + "learning_rate": 9.813430449649289e-06, + "loss": 0.7116, + "step": 3261 + }, + { + "epoch": 0.17953657328416534, + "grad_norm": 0.8043892979621887, + "learning_rate": 9.813313126578754e-06, + "loss": 0.8398, + "step": 3262 + }, + { + "epoch": 0.179591612086521, + "grad_norm": 0.8708420991897583, + "learning_rate": 9.813195767332647e-06, + "loss": 0.8246, + "step": 3263 + }, + { + "epoch": 0.17964665088887666, + "grad_norm": 1.1456964015960693, + "learning_rate": 9.813078371911846e-06, + "loss": 0.8798, + "step": 3264 + }, + { + "epoch": 0.17970168969123232, + "grad_norm": 0.9668154716491699, + "learning_rate": 9.812960940317238e-06, + "loss": 0.9645, + "step": 3265 + }, + { + "epoch": 0.17975672849358798, + "grad_norm": 0.862050473690033, + "learning_rate": 9.812843472549705e-06, + "loss": 0.8675, + "step": 3266 + }, + { + "epoch": 0.17981176729594364, + "grad_norm": 0.7776491641998291, + "learning_rate": 9.812725968610126e-06, + "loss": 0.7727, + "step": 3267 + }, + { + "epoch": 0.1798668060982993, + "grad_norm": 0.7197048664093018, + "learning_rate": 9.812608428499389e-06, + "loss": 0.6877, + "step": 3268 + }, + { + "epoch": 0.17992184490065496, + "grad_norm": 0.7995713353157043, + "learning_rate": 9.812490852218375e-06, + "loss": 0.8576, + "step": 3269 + }, + { + "epoch": 0.17997688370301063, + "grad_norm": 0.8300820589065552, + "learning_rate": 9.812373239767967e-06, + "loss": 0.8119, + "step": 3270 + }, + { + "epoch": 0.1800319225053663, + "grad_norm": 0.8625856041908264, + "learning_rate": 9.812255591149052e-06, + "loss": 0.7547, + "step": 3271 + }, + { + "epoch": 0.18008696130772195, + "grad_norm": 1.016419768333435, + "learning_rate": 9.812137906362511e-06, + "loss": 0.8457, + "step": 3272 + }, + { + "epoch": 0.1801420001100776, + "grad_norm": 0.7303110361099243, + "learning_rate": 9.812020185409229e-06, + "loss": 0.7954, + "step": 3273 + }, + { + "epoch": 0.18019703891243327, + "grad_norm": 0.8632498383522034, + "learning_rate": 9.811902428290093e-06, + "loss": 0.8952, + "step": 3274 + }, + { + "epoch": 0.18025207771478893, + "grad_norm": 0.7666932940483093, + "learning_rate": 9.811784635005984e-06, + "loss": 0.746, + "step": 3275 + }, + { + "epoch": 0.1803071165171446, + "grad_norm": 0.8962032198905945, + "learning_rate": 9.811666805557791e-06, + "loss": 0.8654, + "step": 3276 + }, + { + "epoch": 0.18036215531950026, + "grad_norm": 0.9399656057357788, + "learning_rate": 9.811548939946397e-06, + "loss": 0.8062, + "step": 3277 + }, + { + "epoch": 0.18041719412185592, + "grad_norm": 0.7469807863235474, + "learning_rate": 9.811431038172692e-06, + "loss": 0.79, + "step": 3278 + }, + { + "epoch": 0.18047223292421158, + "grad_norm": 0.7661105394363403, + "learning_rate": 9.811313100237556e-06, + "loss": 0.7768, + "step": 3279 + }, + { + "epoch": 0.18052727172656724, + "grad_norm": 0.7567458748817444, + "learning_rate": 9.811195126141881e-06, + "loss": 0.7329, + "step": 3280 + }, + { + "epoch": 0.1805823105289229, + "grad_norm": 0.7187278866767883, + "learning_rate": 9.811077115886552e-06, + "loss": 0.6511, + "step": 3281 + }, + { + "epoch": 0.18063734933127856, + "grad_norm": 0.7641230821609497, + "learning_rate": 9.810959069472452e-06, + "loss": 0.7704, + "step": 3282 + }, + { + "epoch": 0.18069238813363422, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.810840986900474e-06, + "loss": 0.8142, + "step": 3283 + }, + { + "epoch": 0.18074742693598989, + "grad_norm": 0.8102816343307495, + "learning_rate": 9.810722868171502e-06, + "loss": 0.765, + "step": 3284 + }, + { + "epoch": 0.18080246573834555, + "grad_norm": 0.7251957058906555, + "learning_rate": 9.810604713286424e-06, + "loss": 0.7836, + "step": 3285 + }, + { + "epoch": 0.18085750454070118, + "grad_norm": 0.845348060131073, + "learning_rate": 9.81048652224613e-06, + "loss": 0.8386, + "step": 3286 + }, + { + "epoch": 0.18091254334305684, + "grad_norm": 0.8397864103317261, + "learning_rate": 9.810368295051507e-06, + "loss": 0.805, + "step": 3287 + }, + { + "epoch": 0.1809675821454125, + "grad_norm": 1.0739909410476685, + "learning_rate": 9.810250031703444e-06, + "loss": 0.8735, + "step": 3288 + }, + { + "epoch": 0.18102262094776816, + "grad_norm": 0.752091646194458, + "learning_rate": 9.810131732202826e-06, + "loss": 0.7814, + "step": 3289 + }, + { + "epoch": 0.18107765975012383, + "grad_norm": 0.7826841473579407, + "learning_rate": 9.810013396550548e-06, + "loss": 0.7761, + "step": 3290 + }, + { + "epoch": 0.1811326985524795, + "grad_norm": 0.6979131102561951, + "learning_rate": 9.809895024747498e-06, + "loss": 0.672, + "step": 3291 + }, + { + "epoch": 0.18118773735483515, + "grad_norm": 0.8571394085884094, + "learning_rate": 9.809776616794562e-06, + "loss": 0.8795, + "step": 3292 + }, + { + "epoch": 0.1812427761571908, + "grad_norm": 0.8287902474403381, + "learning_rate": 9.809658172692634e-06, + "loss": 0.9032, + "step": 3293 + }, + { + "epoch": 0.18129781495954647, + "grad_norm": 0.7884420156478882, + "learning_rate": 9.809539692442602e-06, + "loss": 0.87, + "step": 3294 + }, + { + "epoch": 0.18135285376190213, + "grad_norm": 0.8955305218696594, + "learning_rate": 9.809421176045358e-06, + "loss": 0.7982, + "step": 3295 + }, + { + "epoch": 0.1814078925642578, + "grad_norm": 0.7893335819244385, + "learning_rate": 9.809302623501791e-06, + "loss": 0.7792, + "step": 3296 + }, + { + "epoch": 0.18146293136661346, + "grad_norm": 0.8077870011329651, + "learning_rate": 9.809184034812794e-06, + "loss": 0.829, + "step": 3297 + }, + { + "epoch": 0.18151797016896912, + "grad_norm": 0.8282631635665894, + "learning_rate": 9.809065409979256e-06, + "loss": 0.8502, + "step": 3298 + }, + { + "epoch": 0.18157300897132478, + "grad_norm": 0.7988418936729431, + "learning_rate": 9.808946749002068e-06, + "loss": 0.7853, + "step": 3299 + }, + { + "epoch": 0.18162804777368044, + "grad_norm": 0.7776056528091431, + "learning_rate": 9.808828051882127e-06, + "loss": 0.7843, + "step": 3300 + }, + { + "epoch": 0.1816830865760361, + "grad_norm": 0.8772258758544922, + "learning_rate": 9.80870931862032e-06, + "loss": 0.896, + "step": 3301 + }, + { + "epoch": 0.18173812537839176, + "grad_norm": 0.8080328702926636, + "learning_rate": 9.80859054921754e-06, + "loss": 0.8464, + "step": 3302 + }, + { + "epoch": 0.18179316418074742, + "grad_norm": 0.862707257270813, + "learning_rate": 9.808471743674682e-06, + "loss": 0.8732, + "step": 3303 + }, + { + "epoch": 0.18184820298310309, + "grad_norm": 1.1964820623397827, + "learning_rate": 9.808352901992637e-06, + "loss": 0.9911, + "step": 3304 + }, + { + "epoch": 0.18190324178545875, + "grad_norm": 0.8597685694694519, + "learning_rate": 9.808234024172298e-06, + "loss": 0.8724, + "step": 3305 + }, + { + "epoch": 0.1819582805878144, + "grad_norm": 0.8068556189537048, + "learning_rate": 9.80811511021456e-06, + "loss": 0.8116, + "step": 3306 + }, + { + "epoch": 0.18201331939017007, + "grad_norm": 1.0014268159866333, + "learning_rate": 9.807996160120317e-06, + "loss": 0.8585, + "step": 3307 + }, + { + "epoch": 0.18206835819252573, + "grad_norm": 0.8541132807731628, + "learning_rate": 9.80787717389046e-06, + "loss": 0.8505, + "step": 3308 + }, + { + "epoch": 0.1821233969948814, + "grad_norm": 0.7973629832267761, + "learning_rate": 9.807758151525886e-06, + "loss": 0.8312, + "step": 3309 + }, + { + "epoch": 0.18217843579723705, + "grad_norm": 0.82973712682724, + "learning_rate": 9.807639093027488e-06, + "loss": 0.894, + "step": 3310 + }, + { + "epoch": 0.18223347459959272, + "grad_norm": 0.7729674577713013, + "learning_rate": 9.807519998396162e-06, + "loss": 0.7459, + "step": 3311 + }, + { + "epoch": 0.18228851340194838, + "grad_norm": 0.8106189370155334, + "learning_rate": 9.807400867632804e-06, + "loss": 0.914, + "step": 3312 + }, + { + "epoch": 0.18234355220430404, + "grad_norm": 0.7672377228736877, + "learning_rate": 9.807281700738305e-06, + "loss": 0.8475, + "step": 3313 + }, + { + "epoch": 0.1823985910066597, + "grad_norm": 0.8776688575744629, + "learning_rate": 9.807162497713566e-06, + "loss": 0.7641, + "step": 3314 + }, + { + "epoch": 0.18245362980901536, + "grad_norm": 0.8781917691230774, + "learning_rate": 9.807043258559479e-06, + "loss": 0.86, + "step": 3315 + }, + { + "epoch": 0.18250866861137102, + "grad_norm": 0.819362998008728, + "learning_rate": 9.806923983276942e-06, + "loss": 0.8829, + "step": 3316 + }, + { + "epoch": 0.18256370741372668, + "grad_norm": 0.8065270185470581, + "learning_rate": 9.80680467186685e-06, + "loss": 0.7335, + "step": 3317 + }, + { + "epoch": 0.18261874621608234, + "grad_norm": 0.8692485690116882, + "learning_rate": 9.806685324330102e-06, + "loss": 0.8582, + "step": 3318 + }, + { + "epoch": 0.182673785018438, + "grad_norm": 0.7910160422325134, + "learning_rate": 9.806565940667594e-06, + "loss": 0.8569, + "step": 3319 + }, + { + "epoch": 0.18272882382079367, + "grad_norm": 0.8282253742218018, + "learning_rate": 9.806446520880225e-06, + "loss": 0.7791, + "step": 3320 + }, + { + "epoch": 0.18278386262314933, + "grad_norm": 0.7513861060142517, + "learning_rate": 9.806327064968887e-06, + "loss": 0.7287, + "step": 3321 + }, + { + "epoch": 0.182838901425505, + "grad_norm": 0.8141188621520996, + "learning_rate": 9.806207572934483e-06, + "loss": 0.7772, + "step": 3322 + }, + { + "epoch": 0.18289394022786065, + "grad_norm": 0.7963125705718994, + "learning_rate": 9.806088044777909e-06, + "loss": 0.7993, + "step": 3323 + }, + { + "epoch": 0.1829489790302163, + "grad_norm": 0.8527218103408813, + "learning_rate": 9.805968480500063e-06, + "loss": 0.822, + "step": 3324 + }, + { + "epoch": 0.18300401783257197, + "grad_norm": 0.822467565536499, + "learning_rate": 9.805848880101845e-06, + "loss": 0.8606, + "step": 3325 + }, + { + "epoch": 0.18305905663492764, + "grad_norm": 0.8197154402732849, + "learning_rate": 9.805729243584154e-06, + "loss": 0.9004, + "step": 3326 + }, + { + "epoch": 0.1831140954372833, + "grad_norm": 0.8379594683647156, + "learning_rate": 9.805609570947887e-06, + "loss": 0.8467, + "step": 3327 + }, + { + "epoch": 0.18316913423963896, + "grad_norm": 0.7787355184555054, + "learning_rate": 9.805489862193947e-06, + "loss": 0.8221, + "step": 3328 + }, + { + "epoch": 0.1832241730419946, + "grad_norm": 0.8464100956916809, + "learning_rate": 9.80537011732323e-06, + "loss": 0.7722, + "step": 3329 + }, + { + "epoch": 0.18327921184435025, + "grad_norm": 0.8351306319236755, + "learning_rate": 9.805250336336637e-06, + "loss": 0.7638, + "step": 3330 + }, + { + "epoch": 0.18333425064670592, + "grad_norm": 0.8098864555358887, + "learning_rate": 9.805130519235068e-06, + "loss": 0.8448, + "step": 3331 + }, + { + "epoch": 0.18338928944906158, + "grad_norm": 0.8290563821792603, + "learning_rate": 9.805010666019427e-06, + "loss": 0.6574, + "step": 3332 + }, + { + "epoch": 0.18344432825141724, + "grad_norm": 0.7748262882232666, + "learning_rate": 9.804890776690611e-06, + "loss": 0.8002, + "step": 3333 + }, + { + "epoch": 0.1834993670537729, + "grad_norm": 0.8422787189483643, + "learning_rate": 9.80477085124952e-06, + "loss": 0.8452, + "step": 3334 + }, + { + "epoch": 0.18355440585612856, + "grad_norm": 0.7776510119438171, + "learning_rate": 9.804650889697061e-06, + "loss": 0.8774, + "step": 3335 + }, + { + "epoch": 0.18360944465848422, + "grad_norm": 0.8449370861053467, + "learning_rate": 9.80453089203413e-06, + "loss": 0.8233, + "step": 3336 + }, + { + "epoch": 0.18366448346083988, + "grad_norm": 0.8254217505455017, + "learning_rate": 9.804410858261632e-06, + "loss": 0.8778, + "step": 3337 + }, + { + "epoch": 0.18371952226319554, + "grad_norm": 0.8673515915870667, + "learning_rate": 9.804290788380466e-06, + "loss": 0.8005, + "step": 3338 + }, + { + "epoch": 0.1837745610655512, + "grad_norm": 0.8106067776679993, + "learning_rate": 9.804170682391538e-06, + "loss": 0.86, + "step": 3339 + }, + { + "epoch": 0.18382959986790687, + "grad_norm": 0.8211669325828552, + "learning_rate": 9.804050540295749e-06, + "loss": 0.8013, + "step": 3340 + }, + { + "epoch": 0.18388463867026253, + "grad_norm": 0.7866180539131165, + "learning_rate": 9.803930362094003e-06, + "loss": 0.8108, + "step": 3341 + }, + { + "epoch": 0.1839396774726182, + "grad_norm": 0.8192055225372314, + "learning_rate": 9.8038101477872e-06, + "loss": 0.7586, + "step": 3342 + }, + { + "epoch": 0.18399471627497385, + "grad_norm": 0.940910279750824, + "learning_rate": 9.803689897376248e-06, + "loss": 0.8174, + "step": 3343 + }, + { + "epoch": 0.1840497550773295, + "grad_norm": 0.7979292869567871, + "learning_rate": 9.803569610862048e-06, + "loss": 0.8341, + "step": 3344 + }, + { + "epoch": 0.18410479387968517, + "grad_norm": 0.7577546238899231, + "learning_rate": 9.803449288245504e-06, + "loss": 0.7775, + "step": 3345 + }, + { + "epoch": 0.18415983268204084, + "grad_norm": 0.7255160212516785, + "learning_rate": 9.80332892952752e-06, + "loss": 0.7648, + "step": 3346 + }, + { + "epoch": 0.1842148714843965, + "grad_norm": 0.8269388675689697, + "learning_rate": 9.803208534709004e-06, + "loss": 0.8902, + "step": 3347 + }, + { + "epoch": 0.18426991028675216, + "grad_norm": 0.783867359161377, + "learning_rate": 9.803088103790857e-06, + "loss": 0.8191, + "step": 3348 + }, + { + "epoch": 0.18432494908910782, + "grad_norm": 0.7658863663673401, + "learning_rate": 9.802967636773986e-06, + "loss": 0.7505, + "step": 3349 + }, + { + "epoch": 0.18437998789146348, + "grad_norm": 0.701225757598877, + "learning_rate": 9.802847133659294e-06, + "loss": 0.7159, + "step": 3350 + }, + { + "epoch": 0.18443502669381914, + "grad_norm": 0.9224311709403992, + "learning_rate": 9.802726594447692e-06, + "loss": 0.7766, + "step": 3351 + }, + { + "epoch": 0.1844900654961748, + "grad_norm": 0.8835979700088501, + "learning_rate": 9.80260601914008e-06, + "loss": 0.9304, + "step": 3352 + }, + { + "epoch": 0.18454510429853047, + "grad_norm": 0.7918481826782227, + "learning_rate": 9.802485407737368e-06, + "loss": 0.7691, + "step": 3353 + }, + { + "epoch": 0.18460014310088613, + "grad_norm": 0.8855286240577698, + "learning_rate": 9.80236476024046e-06, + "loss": 0.9213, + "step": 3354 + }, + { + "epoch": 0.1846551819032418, + "grad_norm": 0.7863314747810364, + "learning_rate": 9.802244076650264e-06, + "loss": 0.7675, + "step": 3355 + }, + { + "epoch": 0.18471022070559745, + "grad_norm": 0.8230198621749878, + "learning_rate": 9.802123356967687e-06, + "loss": 0.7243, + "step": 3356 + }, + { + "epoch": 0.1847652595079531, + "grad_norm": 0.8038737773895264, + "learning_rate": 9.80200260119364e-06, + "loss": 0.8094, + "step": 3357 + }, + { + "epoch": 0.18482029831030877, + "grad_norm": 0.7656993269920349, + "learning_rate": 9.801881809329022e-06, + "loss": 0.7736, + "step": 3358 + }, + { + "epoch": 0.18487533711266443, + "grad_norm": 0.8222082853317261, + "learning_rate": 9.801760981374747e-06, + "loss": 0.844, + "step": 3359 + }, + { + "epoch": 0.1849303759150201, + "grad_norm": 0.7632889747619629, + "learning_rate": 9.801640117331723e-06, + "loss": 0.8354, + "step": 3360 + }, + { + "epoch": 0.18498541471737576, + "grad_norm": 0.8308513760566711, + "learning_rate": 9.801519217200857e-06, + "loss": 0.8277, + "step": 3361 + }, + { + "epoch": 0.18504045351973142, + "grad_norm": 0.7865434885025024, + "learning_rate": 9.801398280983057e-06, + "loss": 0.8614, + "step": 3362 + }, + { + "epoch": 0.18509549232208708, + "grad_norm": 0.7249410152435303, + "learning_rate": 9.801277308679232e-06, + "loss": 0.7259, + "step": 3363 + }, + { + "epoch": 0.18515053112444274, + "grad_norm": 0.7604461908340454, + "learning_rate": 9.801156300290293e-06, + "loss": 0.8507, + "step": 3364 + }, + { + "epoch": 0.1852055699267984, + "grad_norm": 0.8725959062576294, + "learning_rate": 9.801035255817149e-06, + "loss": 0.7688, + "step": 3365 + }, + { + "epoch": 0.18526060872915406, + "grad_norm": 0.7798827290534973, + "learning_rate": 9.800914175260708e-06, + "loss": 0.8788, + "step": 3366 + }, + { + "epoch": 0.18531564753150973, + "grad_norm": 0.7060996890068054, + "learning_rate": 9.800793058621882e-06, + "loss": 0.8183, + "step": 3367 + }, + { + "epoch": 0.1853706863338654, + "grad_norm": 0.7558063268661499, + "learning_rate": 9.80067190590158e-06, + "loss": 0.7834, + "step": 3368 + }, + { + "epoch": 0.18542572513622105, + "grad_norm": 0.7411057353019714, + "learning_rate": 9.800550717100714e-06, + "loss": 0.8298, + "step": 3369 + }, + { + "epoch": 0.1854807639385767, + "grad_norm": 0.8466144800186157, + "learning_rate": 9.800429492220193e-06, + "loss": 0.8297, + "step": 3370 + }, + { + "epoch": 0.18553580274093237, + "grad_norm": 0.7302330136299133, + "learning_rate": 9.800308231260928e-06, + "loss": 0.72, + "step": 3371 + }, + { + "epoch": 0.185590841543288, + "grad_norm": 0.8140530586242676, + "learning_rate": 9.800186934223832e-06, + "loss": 0.9287, + "step": 3372 + }, + { + "epoch": 0.18564588034564367, + "grad_norm": 0.8246129751205444, + "learning_rate": 9.800065601109817e-06, + "loss": 0.7891, + "step": 3373 + }, + { + "epoch": 0.18570091914799933, + "grad_norm": 0.8746623396873474, + "learning_rate": 9.799944231919794e-06, + "loss": 0.8549, + "step": 3374 + }, + { + "epoch": 0.185755957950355, + "grad_norm": 0.9977195858955383, + "learning_rate": 9.799822826654672e-06, + "loss": 0.821, + "step": 3375 + }, + { + "epoch": 0.18581099675271065, + "grad_norm": 0.8937395811080933, + "learning_rate": 9.79970138531537e-06, + "loss": 0.8639, + "step": 3376 + }, + { + "epoch": 0.1858660355550663, + "grad_norm": 1.039695143699646, + "learning_rate": 9.799579907902794e-06, + "loss": 1.0425, + "step": 3377 + }, + { + "epoch": 0.18592107435742197, + "grad_norm": 0.7847749590873718, + "learning_rate": 9.799458394417863e-06, + "loss": 0.8505, + "step": 3378 + }, + { + "epoch": 0.18597611315977763, + "grad_norm": 0.760334312915802, + "learning_rate": 9.799336844861486e-06, + "loss": 0.7418, + "step": 3379 + }, + { + "epoch": 0.1860311519621333, + "grad_norm": 0.7599604725837708, + "learning_rate": 9.799215259234578e-06, + "loss": 0.8305, + "step": 3380 + }, + { + "epoch": 0.18608619076448896, + "grad_norm": 0.846767246723175, + "learning_rate": 9.799093637538054e-06, + "loss": 0.7526, + "step": 3381 + }, + { + "epoch": 0.18614122956684462, + "grad_norm": 0.7840956449508667, + "learning_rate": 9.798971979772825e-06, + "loss": 0.8009, + "step": 3382 + }, + { + "epoch": 0.18619626836920028, + "grad_norm": 0.7826499342918396, + "learning_rate": 9.798850285939809e-06, + "loss": 0.821, + "step": 3383 + }, + { + "epoch": 0.18625130717155594, + "grad_norm": 0.7829813361167908, + "learning_rate": 9.798728556039918e-06, + "loss": 0.8053, + "step": 3384 + }, + { + "epoch": 0.1863063459739116, + "grad_norm": 0.7267470359802246, + "learning_rate": 9.798606790074067e-06, + "loss": 0.6797, + "step": 3385 + }, + { + "epoch": 0.18636138477626726, + "grad_norm": 0.8560196757316589, + "learning_rate": 9.798484988043173e-06, + "loss": 0.8476, + "step": 3386 + }, + { + "epoch": 0.18641642357862293, + "grad_norm": 0.7920921444892883, + "learning_rate": 9.798363149948148e-06, + "loss": 0.8832, + "step": 3387 + }, + { + "epoch": 0.1864714623809786, + "grad_norm": 0.8414384126663208, + "learning_rate": 9.798241275789912e-06, + "loss": 0.8607, + "step": 3388 + }, + { + "epoch": 0.18652650118333425, + "grad_norm": 0.7255431413650513, + "learning_rate": 9.798119365569378e-06, + "loss": 0.6426, + "step": 3389 + }, + { + "epoch": 0.1865815399856899, + "grad_norm": 0.8842852711677551, + "learning_rate": 9.797997419287465e-06, + "loss": 0.9058, + "step": 3390 + }, + { + "epoch": 0.18663657878804557, + "grad_norm": 0.7178265452384949, + "learning_rate": 9.797875436945086e-06, + "loss": 0.8134, + "step": 3391 + }, + { + "epoch": 0.18669161759040123, + "grad_norm": 0.7275096774101257, + "learning_rate": 9.797753418543161e-06, + "loss": 0.6858, + "step": 3392 + }, + { + "epoch": 0.1867466563927569, + "grad_norm": 0.7587800025939941, + "learning_rate": 9.797631364082605e-06, + "loss": 0.7437, + "step": 3393 + }, + { + "epoch": 0.18680169519511255, + "grad_norm": 0.9769744873046875, + "learning_rate": 9.797509273564336e-06, + "loss": 0.8024, + "step": 3394 + }, + { + "epoch": 0.18685673399746822, + "grad_norm": 0.7662433385848999, + "learning_rate": 9.79738714698927e-06, + "loss": 0.8122, + "step": 3395 + }, + { + "epoch": 0.18691177279982388, + "grad_norm": 0.8620306849479675, + "learning_rate": 9.797264984358328e-06, + "loss": 0.7952, + "step": 3396 + }, + { + "epoch": 0.18696681160217954, + "grad_norm": 0.7542591094970703, + "learning_rate": 9.797142785672427e-06, + "loss": 0.8315, + "step": 3397 + }, + { + "epoch": 0.1870218504045352, + "grad_norm": 0.7273713946342468, + "learning_rate": 9.797020550932483e-06, + "loss": 0.7316, + "step": 3398 + }, + { + "epoch": 0.18707688920689086, + "grad_norm": 1.031592845916748, + "learning_rate": 9.796898280139417e-06, + "loss": 0.7478, + "step": 3399 + }, + { + "epoch": 0.18713192800924652, + "grad_norm": 0.791407585144043, + "learning_rate": 9.796775973294147e-06, + "loss": 0.7742, + "step": 3400 + }, + { + "epoch": 0.18718696681160218, + "grad_norm": 0.8311418294906616, + "learning_rate": 9.796653630397595e-06, + "loss": 0.8182, + "step": 3401 + }, + { + "epoch": 0.18724200561395785, + "grad_norm": 0.7960993051528931, + "learning_rate": 9.796531251450678e-06, + "loss": 0.7606, + "step": 3402 + }, + { + "epoch": 0.1872970444163135, + "grad_norm": 0.8671618103981018, + "learning_rate": 9.796408836454316e-06, + "loss": 0.7136, + "step": 3403 + }, + { + "epoch": 0.18735208321866917, + "grad_norm": 1.1071348190307617, + "learning_rate": 9.796286385409428e-06, + "loss": 0.7729, + "step": 3404 + }, + { + "epoch": 0.18740712202102483, + "grad_norm": 0.738217294216156, + "learning_rate": 9.796163898316935e-06, + "loss": 0.7425, + "step": 3405 + }, + { + "epoch": 0.1874621608233805, + "grad_norm": 0.7567199468612671, + "learning_rate": 9.796041375177758e-06, + "loss": 0.8442, + "step": 3406 + }, + { + "epoch": 0.18751719962573615, + "grad_norm": 0.7942413091659546, + "learning_rate": 9.79591881599282e-06, + "loss": 0.852, + "step": 3407 + }, + { + "epoch": 0.18757223842809181, + "grad_norm": 0.7529355883598328, + "learning_rate": 9.795796220763038e-06, + "loss": 0.8086, + "step": 3408 + }, + { + "epoch": 0.18762727723044748, + "grad_norm": 0.7645192742347717, + "learning_rate": 9.795673589489337e-06, + "loss": 0.831, + "step": 3409 + }, + { + "epoch": 0.18768231603280314, + "grad_norm": 0.694791853427887, + "learning_rate": 9.795550922172635e-06, + "loss": 0.6919, + "step": 3410 + }, + { + "epoch": 0.1877373548351588, + "grad_norm": 0.7041944265365601, + "learning_rate": 9.795428218813858e-06, + "loss": 0.7284, + "step": 3411 + }, + { + "epoch": 0.18779239363751446, + "grad_norm": 0.8972276449203491, + "learning_rate": 9.795305479413924e-06, + "loss": 0.7156, + "step": 3412 + }, + { + "epoch": 0.18784743243987012, + "grad_norm": 0.9730873107910156, + "learning_rate": 9.795182703973758e-06, + "loss": 0.8739, + "step": 3413 + }, + { + "epoch": 0.18790247124222578, + "grad_norm": 0.8137956261634827, + "learning_rate": 9.795059892494283e-06, + "loss": 0.8189, + "step": 3414 + }, + { + "epoch": 0.18795751004458142, + "grad_norm": 0.8171416521072388, + "learning_rate": 9.794937044976422e-06, + "loss": 0.9449, + "step": 3415 + }, + { + "epoch": 0.18801254884693708, + "grad_norm": 0.7929911017417908, + "learning_rate": 9.794814161421098e-06, + "loss": 0.8034, + "step": 3416 + }, + { + "epoch": 0.18806758764929274, + "grad_norm": 1.1045749187469482, + "learning_rate": 9.794691241829233e-06, + "loss": 0.875, + "step": 3417 + }, + { + "epoch": 0.1881226264516484, + "grad_norm": 0.8141040205955505, + "learning_rate": 9.794568286201752e-06, + "loss": 0.787, + "step": 3418 + }, + { + "epoch": 0.18817766525400406, + "grad_norm": 0.7615541815757751, + "learning_rate": 9.79444529453958e-06, + "loss": 0.8491, + "step": 3419 + }, + { + "epoch": 0.18823270405635972, + "grad_norm": 0.848419189453125, + "learning_rate": 9.79432226684364e-06, + "loss": 0.7445, + "step": 3420 + }, + { + "epoch": 0.18828774285871538, + "grad_norm": 0.8075067400932312, + "learning_rate": 9.794199203114858e-06, + "loss": 0.6581, + "step": 3421 + }, + { + "epoch": 0.18834278166107105, + "grad_norm": 0.8473401069641113, + "learning_rate": 9.794076103354158e-06, + "loss": 0.839, + "step": 3422 + }, + { + "epoch": 0.1883978204634267, + "grad_norm": 0.8211609721183777, + "learning_rate": 9.793952967562463e-06, + "loss": 0.7709, + "step": 3423 + }, + { + "epoch": 0.18845285926578237, + "grad_norm": 0.7527804374694824, + "learning_rate": 9.793829795740703e-06, + "loss": 0.7315, + "step": 3424 + }, + { + "epoch": 0.18850789806813803, + "grad_norm": 0.7971188426017761, + "learning_rate": 9.793706587889802e-06, + "loss": 0.7507, + "step": 3425 + }, + { + "epoch": 0.1885629368704937, + "grad_norm": 1.024066686630249, + "learning_rate": 9.793583344010684e-06, + "loss": 0.9043, + "step": 3426 + }, + { + "epoch": 0.18861797567284935, + "grad_norm": 0.7428625226020813, + "learning_rate": 9.793460064104276e-06, + "loss": 0.7435, + "step": 3427 + }, + { + "epoch": 0.18867301447520501, + "grad_norm": 0.8438264727592468, + "learning_rate": 9.793336748171507e-06, + "loss": 0.8618, + "step": 3428 + }, + { + "epoch": 0.18872805327756068, + "grad_norm": 0.7846877574920654, + "learning_rate": 9.793213396213302e-06, + "loss": 0.8064, + "step": 3429 + }, + { + "epoch": 0.18878309207991634, + "grad_norm": 0.7527204751968384, + "learning_rate": 9.793090008230587e-06, + "loss": 0.7596, + "step": 3430 + }, + { + "epoch": 0.188838130882272, + "grad_norm": 1.1236757040023804, + "learning_rate": 9.792966584224292e-06, + "loss": 0.8292, + "step": 3431 + }, + { + "epoch": 0.18889316968462766, + "grad_norm": 0.8128102421760559, + "learning_rate": 9.792843124195343e-06, + "loss": 0.8073, + "step": 3432 + }, + { + "epoch": 0.18894820848698332, + "grad_norm": 0.7668742537498474, + "learning_rate": 9.792719628144667e-06, + "loss": 0.7848, + "step": 3433 + }, + { + "epoch": 0.18900324728933898, + "grad_norm": 1.8663485050201416, + "learning_rate": 9.792596096073193e-06, + "loss": 0.9388, + "step": 3434 + }, + { + "epoch": 0.18905828609169464, + "grad_norm": 0.8066239356994629, + "learning_rate": 9.792472527981852e-06, + "loss": 0.6647, + "step": 3435 + }, + { + "epoch": 0.1891133248940503, + "grad_norm": 0.8268817067146301, + "learning_rate": 9.792348923871567e-06, + "loss": 0.9676, + "step": 3436 + }, + { + "epoch": 0.18916836369640597, + "grad_norm": 0.7165037393569946, + "learning_rate": 9.792225283743272e-06, + "loss": 0.6937, + "step": 3437 + }, + { + "epoch": 0.18922340249876163, + "grad_norm": 0.7850403785705566, + "learning_rate": 9.792101607597895e-06, + "loss": 0.7782, + "step": 3438 + }, + { + "epoch": 0.1892784413011173, + "grad_norm": 0.8839808702468872, + "learning_rate": 9.791977895436365e-06, + "loss": 0.7639, + "step": 3439 + }, + { + "epoch": 0.18933348010347295, + "grad_norm": 0.8260362148284912, + "learning_rate": 9.791854147259611e-06, + "loss": 0.8201, + "step": 3440 + }, + { + "epoch": 0.1893885189058286, + "grad_norm": 0.8792916536331177, + "learning_rate": 9.791730363068564e-06, + "loss": 0.8251, + "step": 3441 + }, + { + "epoch": 0.18944355770818427, + "grad_norm": 0.8192774653434753, + "learning_rate": 9.791606542864154e-06, + "loss": 0.7944, + "step": 3442 + }, + { + "epoch": 0.18949859651053994, + "grad_norm": 0.751470685005188, + "learning_rate": 9.791482686647313e-06, + "loss": 0.7563, + "step": 3443 + }, + { + "epoch": 0.1895536353128956, + "grad_norm": 0.8902072906494141, + "learning_rate": 9.79135879441897e-06, + "loss": 0.7719, + "step": 3444 + }, + { + "epoch": 0.18960867411525126, + "grad_norm": 0.7166435122489929, + "learning_rate": 9.791234866180058e-06, + "loss": 0.7871, + "step": 3445 + }, + { + "epoch": 0.18966371291760692, + "grad_norm": 0.763416588306427, + "learning_rate": 9.791110901931505e-06, + "loss": 0.8226, + "step": 3446 + }, + { + "epoch": 0.18971875171996258, + "grad_norm": 0.806633472442627, + "learning_rate": 9.790986901674246e-06, + "loss": 0.7828, + "step": 3447 + }, + { + "epoch": 0.18977379052231824, + "grad_norm": 0.8139312863349915, + "learning_rate": 9.790862865409213e-06, + "loss": 0.8441, + "step": 3448 + }, + { + "epoch": 0.1898288293246739, + "grad_norm": 0.8362452387809753, + "learning_rate": 9.790738793137335e-06, + "loss": 0.8765, + "step": 3449 + }, + { + "epoch": 0.18988386812702956, + "grad_norm": 0.7736263871192932, + "learning_rate": 9.790614684859549e-06, + "loss": 0.8373, + "step": 3450 + }, + { + "epoch": 0.18993890692938523, + "grad_norm": 0.8742800354957581, + "learning_rate": 9.790490540576784e-06, + "loss": 0.8976, + "step": 3451 + }, + { + "epoch": 0.1899939457317409, + "grad_norm": 0.701505720615387, + "learning_rate": 9.790366360289974e-06, + "loss": 0.7799, + "step": 3452 + }, + { + "epoch": 0.19004898453409655, + "grad_norm": 0.7771356701850891, + "learning_rate": 9.790242144000055e-06, + "loss": 0.7617, + "step": 3453 + }, + { + "epoch": 0.1901040233364522, + "grad_norm": 0.897576093673706, + "learning_rate": 9.790117891707955e-06, + "loss": 0.7817, + "step": 3454 + }, + { + "epoch": 0.19015906213880787, + "grad_norm": 0.7296561002731323, + "learning_rate": 9.789993603414613e-06, + "loss": 0.8344, + "step": 3455 + }, + { + "epoch": 0.19021410094116353, + "grad_norm": 0.8099396228790283, + "learning_rate": 9.789869279120962e-06, + "loss": 0.7369, + "step": 3456 + }, + { + "epoch": 0.1902691397435192, + "grad_norm": 0.7802554368972778, + "learning_rate": 9.789744918827935e-06, + "loss": 0.8383, + "step": 3457 + }, + { + "epoch": 0.19032417854587483, + "grad_norm": 0.7508029341697693, + "learning_rate": 9.789620522536467e-06, + "loss": 0.825, + "step": 3458 + }, + { + "epoch": 0.1903792173482305, + "grad_norm": 0.7782164216041565, + "learning_rate": 9.789496090247494e-06, + "loss": 0.7737, + "step": 3459 + }, + { + "epoch": 0.19043425615058615, + "grad_norm": 0.7711489796638489, + "learning_rate": 9.78937162196195e-06, + "loss": 0.7694, + "step": 3460 + }, + { + "epoch": 0.1904892949529418, + "grad_norm": 0.821579098701477, + "learning_rate": 9.789247117680769e-06, + "loss": 0.7493, + "step": 3461 + }, + { + "epoch": 0.19054433375529747, + "grad_norm": 0.6700833439826965, + "learning_rate": 9.789122577404892e-06, + "loss": 0.7696, + "step": 3462 + }, + { + "epoch": 0.19059937255765314, + "grad_norm": 0.854340136051178, + "learning_rate": 9.78899800113525e-06, + "loss": 0.9503, + "step": 3463 + }, + { + "epoch": 0.1906544113600088, + "grad_norm": 0.8095537424087524, + "learning_rate": 9.78887338887278e-06, + "loss": 0.8435, + "step": 3464 + }, + { + "epoch": 0.19070945016236446, + "grad_norm": 0.8156480193138123, + "learning_rate": 9.78874874061842e-06, + "loss": 0.8561, + "step": 3465 + }, + { + "epoch": 0.19076448896472012, + "grad_norm": 0.8065482378005981, + "learning_rate": 9.788624056373108e-06, + "loss": 0.7793, + "step": 3466 + }, + { + "epoch": 0.19081952776707578, + "grad_norm": 0.789601743221283, + "learning_rate": 9.788499336137778e-06, + "loss": 0.7523, + "step": 3467 + }, + { + "epoch": 0.19087456656943144, + "grad_norm": 0.8322301506996155, + "learning_rate": 9.788374579913369e-06, + "loss": 0.9034, + "step": 3468 + }, + { + "epoch": 0.1909296053717871, + "grad_norm": 0.8194506764411926, + "learning_rate": 9.788249787700818e-06, + "loss": 0.8601, + "step": 3469 + }, + { + "epoch": 0.19098464417414276, + "grad_norm": 0.8419962525367737, + "learning_rate": 9.788124959501065e-06, + "loss": 0.869, + "step": 3470 + }, + { + "epoch": 0.19103968297649843, + "grad_norm": 0.760637104511261, + "learning_rate": 9.788000095315044e-06, + "loss": 0.7293, + "step": 3471 + }, + { + "epoch": 0.1910947217788541, + "grad_norm": 1.3964574337005615, + "learning_rate": 9.787875195143697e-06, + "loss": 0.8032, + "step": 3472 + }, + { + "epoch": 0.19114976058120975, + "grad_norm": 0.8205012679100037, + "learning_rate": 9.787750258987962e-06, + "loss": 0.8868, + "step": 3473 + }, + { + "epoch": 0.1912047993835654, + "grad_norm": 0.8183104991912842, + "learning_rate": 9.78762528684878e-06, + "loss": 0.7531, + "step": 3474 + }, + { + "epoch": 0.19125983818592107, + "grad_norm": 0.7659775018692017, + "learning_rate": 9.787500278727083e-06, + "loss": 0.8081, + "step": 3475 + }, + { + "epoch": 0.19131487698827673, + "grad_norm": 0.8262091279029846, + "learning_rate": 9.787375234623819e-06, + "loss": 0.82, + "step": 3476 + }, + { + "epoch": 0.1913699157906324, + "grad_norm": 0.857761025428772, + "learning_rate": 9.787250154539923e-06, + "loss": 0.9133, + "step": 3477 + }, + { + "epoch": 0.19142495459298806, + "grad_norm": 0.7551915645599365, + "learning_rate": 9.787125038476334e-06, + "loss": 0.7822, + "step": 3478 + }, + { + "epoch": 0.19147999339534372, + "grad_norm": 0.7777357697486877, + "learning_rate": 9.786999886433998e-06, + "loss": 0.7676, + "step": 3479 + }, + { + "epoch": 0.19153503219769938, + "grad_norm": 0.8389080166816711, + "learning_rate": 9.786874698413852e-06, + "loss": 0.7901, + "step": 3480 + }, + { + "epoch": 0.19159007100005504, + "grad_norm": 0.7894837856292725, + "learning_rate": 9.786749474416836e-06, + "loss": 0.8393, + "step": 3481 + }, + { + "epoch": 0.1916451098024107, + "grad_norm": 1.9752860069274902, + "learning_rate": 9.786624214443893e-06, + "loss": 0.7611, + "step": 3482 + }, + { + "epoch": 0.19170014860476636, + "grad_norm": 0.8023802042007446, + "learning_rate": 9.786498918495963e-06, + "loss": 0.8426, + "step": 3483 + }, + { + "epoch": 0.19175518740712202, + "grad_norm": 0.7232086658477783, + "learning_rate": 9.78637358657399e-06, + "loss": 0.6611, + "step": 3484 + }, + { + "epoch": 0.19181022620947769, + "grad_norm": 0.8198665380477905, + "learning_rate": 9.786248218678912e-06, + "loss": 0.8795, + "step": 3485 + }, + { + "epoch": 0.19186526501183335, + "grad_norm": 0.942404568195343, + "learning_rate": 9.786122814811675e-06, + "loss": 0.9146, + "step": 3486 + }, + { + "epoch": 0.191920303814189, + "grad_norm": 0.7602691054344177, + "learning_rate": 9.78599737497322e-06, + "loss": 0.7514, + "step": 3487 + }, + { + "epoch": 0.19197534261654467, + "grad_norm": 0.7981933951377869, + "learning_rate": 9.785871899164489e-06, + "loss": 0.7722, + "step": 3488 + }, + { + "epoch": 0.19203038141890033, + "grad_norm": 0.8617631793022156, + "learning_rate": 9.785746387386427e-06, + "loss": 0.8989, + "step": 3489 + }, + { + "epoch": 0.192085420221256, + "grad_norm": 0.7691803574562073, + "learning_rate": 9.785620839639976e-06, + "loss": 0.7929, + "step": 3490 + }, + { + "epoch": 0.19214045902361165, + "grad_norm": 1.3053189516067505, + "learning_rate": 9.785495255926078e-06, + "loss": 0.8478, + "step": 3491 + }, + { + "epoch": 0.19219549782596732, + "grad_norm": 0.807064950466156, + "learning_rate": 9.785369636245681e-06, + "loss": 0.7452, + "step": 3492 + }, + { + "epoch": 0.19225053662832298, + "grad_norm": 0.8182778358459473, + "learning_rate": 9.785243980599726e-06, + "loss": 0.8371, + "step": 3493 + }, + { + "epoch": 0.19230557543067864, + "grad_norm": 0.7654449343681335, + "learning_rate": 9.785118288989157e-06, + "loss": 0.8321, + "step": 3494 + }, + { + "epoch": 0.1923606142330343, + "grad_norm": 0.7192448973655701, + "learning_rate": 9.784992561414922e-06, + "loss": 0.7451, + "step": 3495 + }, + { + "epoch": 0.19241565303538996, + "grad_norm": 0.8639407753944397, + "learning_rate": 9.784866797877964e-06, + "loss": 0.9272, + "step": 3496 + }, + { + "epoch": 0.19247069183774562, + "grad_norm": 0.8329927921295166, + "learning_rate": 9.784740998379225e-06, + "loss": 0.8034, + "step": 3497 + }, + { + "epoch": 0.19252573064010128, + "grad_norm": 0.7975476980209351, + "learning_rate": 9.784615162919656e-06, + "loss": 0.6885, + "step": 3498 + }, + { + "epoch": 0.19258076944245694, + "grad_norm": 0.8077559471130371, + "learning_rate": 9.7844892915002e-06, + "loss": 0.8745, + "step": 3499 + }, + { + "epoch": 0.1926358082448126, + "grad_norm": 0.7957825660705566, + "learning_rate": 9.7843633841218e-06, + "loss": 0.7612, + "step": 3500 + }, + { + "epoch": 0.19269084704716824, + "grad_norm": 0.8478250503540039, + "learning_rate": 9.784237440785408e-06, + "loss": 0.8675, + "step": 3501 + }, + { + "epoch": 0.1927458858495239, + "grad_norm": 0.7289726138114929, + "learning_rate": 9.78411146149197e-06, + "loss": 0.7126, + "step": 3502 + }, + { + "epoch": 0.19280092465187956, + "grad_norm": 0.7608509063720703, + "learning_rate": 9.783985446242427e-06, + "loss": 0.7049, + "step": 3503 + }, + { + "epoch": 0.19285596345423522, + "grad_norm": 0.8985201120376587, + "learning_rate": 9.783859395037733e-06, + "loss": 0.8067, + "step": 3504 + }, + { + "epoch": 0.19291100225659089, + "grad_norm": 0.7563273906707764, + "learning_rate": 9.78373330787883e-06, + "loss": 0.7018, + "step": 3505 + }, + { + "epoch": 0.19296604105894655, + "grad_norm": 0.8022900223731995, + "learning_rate": 9.78360718476667e-06, + "loss": 0.8346, + "step": 3506 + }, + { + "epoch": 0.1930210798613022, + "grad_norm": 0.897566020488739, + "learning_rate": 9.783481025702197e-06, + "loss": 0.9465, + "step": 3507 + }, + { + "epoch": 0.19307611866365787, + "grad_norm": 0.9550303220748901, + "learning_rate": 9.783354830686363e-06, + "loss": 0.8904, + "step": 3508 + }, + { + "epoch": 0.19313115746601353, + "grad_norm": 0.8152582049369812, + "learning_rate": 9.783228599720114e-06, + "loss": 0.7776, + "step": 3509 + }, + { + "epoch": 0.1931861962683692, + "grad_norm": 0.7421940565109253, + "learning_rate": 9.783102332804398e-06, + "loss": 0.6847, + "step": 3510 + }, + { + "epoch": 0.19324123507072485, + "grad_norm": 0.7414368391036987, + "learning_rate": 9.782976029940167e-06, + "loss": 0.8435, + "step": 3511 + }, + { + "epoch": 0.19329627387308052, + "grad_norm": 0.7845529317855835, + "learning_rate": 9.782849691128366e-06, + "loss": 0.8255, + "step": 3512 + }, + { + "epoch": 0.19335131267543618, + "grad_norm": 0.7779788970947266, + "learning_rate": 9.78272331636995e-06, + "loss": 0.7801, + "step": 3513 + }, + { + "epoch": 0.19340635147779184, + "grad_norm": 0.7537885904312134, + "learning_rate": 9.782596905665865e-06, + "loss": 0.7501, + "step": 3514 + }, + { + "epoch": 0.1934613902801475, + "grad_norm": 0.7585812211036682, + "learning_rate": 9.782470459017059e-06, + "loss": 0.8425, + "step": 3515 + }, + { + "epoch": 0.19351642908250316, + "grad_norm": 0.7923589944839478, + "learning_rate": 9.78234397642449e-06, + "loss": 0.8412, + "step": 3516 + }, + { + "epoch": 0.19357146788485882, + "grad_norm": 0.8710628151893616, + "learning_rate": 9.7822174578891e-06, + "loss": 0.8014, + "step": 3517 + }, + { + "epoch": 0.19362650668721448, + "grad_norm": 0.7646920084953308, + "learning_rate": 9.782090903411845e-06, + "loss": 0.8256, + "step": 3518 + }, + { + "epoch": 0.19368154548957014, + "grad_norm": 0.7560480833053589, + "learning_rate": 9.781964312993675e-06, + "loss": 0.7816, + "step": 3519 + }, + { + "epoch": 0.1937365842919258, + "grad_norm": 0.7438123226165771, + "learning_rate": 9.78183768663554e-06, + "loss": 0.8319, + "step": 3520 + }, + { + "epoch": 0.19379162309428147, + "grad_norm": 0.7239874601364136, + "learning_rate": 9.781711024338394e-06, + "loss": 0.6968, + "step": 3521 + }, + { + "epoch": 0.19384666189663713, + "grad_norm": 0.881197988986969, + "learning_rate": 9.781584326103188e-06, + "loss": 0.9493, + "step": 3522 + }, + { + "epoch": 0.1939017006989928, + "grad_norm": 0.7903854846954346, + "learning_rate": 9.781457591930874e-06, + "loss": 0.8312, + "step": 3523 + }, + { + "epoch": 0.19395673950134845, + "grad_norm": 0.7375456094741821, + "learning_rate": 9.781330821822405e-06, + "loss": 0.7434, + "step": 3524 + }, + { + "epoch": 0.1940117783037041, + "grad_norm": 0.7101724743843079, + "learning_rate": 9.781204015778733e-06, + "loss": 0.75, + "step": 3525 + }, + { + "epoch": 0.19406681710605977, + "grad_norm": 0.8267471194267273, + "learning_rate": 9.781077173800812e-06, + "loss": 0.8807, + "step": 3526 + }, + { + "epoch": 0.19412185590841544, + "grad_norm": 0.9014178514480591, + "learning_rate": 9.780950295889594e-06, + "loss": 0.7836, + "step": 3527 + }, + { + "epoch": 0.1941768947107711, + "grad_norm": 0.7579739689826965, + "learning_rate": 9.780823382046034e-06, + "loss": 0.8331, + "step": 3528 + }, + { + "epoch": 0.19423193351312676, + "grad_norm": 0.8308925032615662, + "learning_rate": 9.780696432271084e-06, + "loss": 0.794, + "step": 3529 + }, + { + "epoch": 0.19428697231548242, + "grad_norm": 0.7461574673652649, + "learning_rate": 9.780569446565701e-06, + "loss": 0.8155, + "step": 3530 + }, + { + "epoch": 0.19434201111783808, + "grad_norm": 0.8658885359764099, + "learning_rate": 9.780442424930836e-06, + "loss": 0.7907, + "step": 3531 + }, + { + "epoch": 0.19439704992019374, + "grad_norm": 0.7243279218673706, + "learning_rate": 9.780315367367449e-06, + "loss": 0.7985, + "step": 3532 + }, + { + "epoch": 0.1944520887225494, + "grad_norm": 0.8482224345207214, + "learning_rate": 9.780188273876486e-06, + "loss": 0.9095, + "step": 3533 + }, + { + "epoch": 0.19450712752490507, + "grad_norm": 0.8675364255905151, + "learning_rate": 9.78006114445891e-06, + "loss": 0.759, + "step": 3534 + }, + { + "epoch": 0.19456216632726073, + "grad_norm": 0.8388474583625793, + "learning_rate": 9.779933979115675e-06, + "loss": 0.8331, + "step": 3535 + }, + { + "epoch": 0.1946172051296164, + "grad_norm": 0.8050872683525085, + "learning_rate": 9.779806777847735e-06, + "loss": 0.861, + "step": 3536 + }, + { + "epoch": 0.19467224393197205, + "grad_norm": 0.8401390910148621, + "learning_rate": 9.779679540656046e-06, + "loss": 0.755, + "step": 3537 + }, + { + "epoch": 0.1947272827343277, + "grad_norm": 0.865160346031189, + "learning_rate": 9.779552267541566e-06, + "loss": 0.7515, + "step": 3538 + }, + { + "epoch": 0.19478232153668337, + "grad_norm": 0.923086941242218, + "learning_rate": 9.77942495850525e-06, + "loss": 0.8032, + "step": 3539 + }, + { + "epoch": 0.19483736033903903, + "grad_norm": 0.8402467966079712, + "learning_rate": 9.779297613548056e-06, + "loss": 0.9198, + "step": 3540 + }, + { + "epoch": 0.1948923991413947, + "grad_norm": 0.7875306606292725, + "learning_rate": 9.779170232670939e-06, + "loss": 0.712, + "step": 3541 + }, + { + "epoch": 0.19494743794375036, + "grad_norm": 0.7996379137039185, + "learning_rate": 9.779042815874858e-06, + "loss": 0.8126, + "step": 3542 + }, + { + "epoch": 0.19500247674610602, + "grad_norm": 0.7644525766372681, + "learning_rate": 9.778915363160773e-06, + "loss": 0.8602, + "step": 3543 + }, + { + "epoch": 0.19505751554846165, + "grad_norm": 0.8068630695343018, + "learning_rate": 9.778787874529635e-06, + "loss": 0.736, + "step": 3544 + }, + { + "epoch": 0.1951125543508173, + "grad_norm": 0.7889519929885864, + "learning_rate": 9.77866034998241e-06, + "loss": 0.755, + "step": 3545 + }, + { + "epoch": 0.19516759315317297, + "grad_norm": 0.7895978689193726, + "learning_rate": 9.778532789520053e-06, + "loss": 0.8213, + "step": 3546 + }, + { + "epoch": 0.19522263195552864, + "grad_norm": 0.8571796417236328, + "learning_rate": 9.77840519314352e-06, + "loss": 0.8193, + "step": 3547 + }, + { + "epoch": 0.1952776707578843, + "grad_norm": 0.6880007982254028, + "learning_rate": 9.778277560853775e-06, + "loss": 0.6354, + "step": 3548 + }, + { + "epoch": 0.19533270956023996, + "grad_norm": 0.8155353665351868, + "learning_rate": 9.778149892651775e-06, + "loss": 0.8518, + "step": 3549 + }, + { + "epoch": 0.19538774836259562, + "grad_norm": 0.851021945476532, + "learning_rate": 9.778022188538479e-06, + "loss": 0.8506, + "step": 3550 + }, + { + "epoch": 0.19544278716495128, + "grad_norm": 0.8910510540008545, + "learning_rate": 9.777894448514847e-06, + "loss": 0.8825, + "step": 3551 + }, + { + "epoch": 0.19549782596730694, + "grad_norm": 0.8156018853187561, + "learning_rate": 9.777766672581838e-06, + "loss": 0.8262, + "step": 3552 + }, + { + "epoch": 0.1955528647696626, + "grad_norm": 0.756340503692627, + "learning_rate": 9.777638860740415e-06, + "loss": 0.7094, + "step": 3553 + }, + { + "epoch": 0.19560790357201827, + "grad_norm": 0.7604243159294128, + "learning_rate": 9.777511012991538e-06, + "loss": 0.8089, + "step": 3554 + }, + { + "epoch": 0.19566294237437393, + "grad_norm": 0.7609277963638306, + "learning_rate": 9.777383129336167e-06, + "loss": 0.7853, + "step": 3555 + }, + { + "epoch": 0.1957179811767296, + "grad_norm": 1.3562177419662476, + "learning_rate": 9.77725520977526e-06, + "loss": 0.7051, + "step": 3556 + }, + { + "epoch": 0.19577301997908525, + "grad_norm": 0.7428582310676575, + "learning_rate": 9.777127254309784e-06, + "loss": 0.734, + "step": 3557 + }, + { + "epoch": 0.1958280587814409, + "grad_norm": 0.6941032409667969, + "learning_rate": 9.776999262940698e-06, + "loss": 0.7862, + "step": 3558 + }, + { + "epoch": 0.19588309758379657, + "grad_norm": 0.8249906301498413, + "learning_rate": 9.776871235668966e-06, + "loss": 0.8324, + "step": 3559 + }, + { + "epoch": 0.19593813638615223, + "grad_norm": 0.6778795719146729, + "learning_rate": 9.776743172495546e-06, + "loss": 0.743, + "step": 3560 + }, + { + "epoch": 0.1959931751885079, + "grad_norm": 0.8454411625862122, + "learning_rate": 9.776615073421405e-06, + "loss": 0.8625, + "step": 3561 + }, + { + "epoch": 0.19604821399086356, + "grad_norm": 0.8303809762001038, + "learning_rate": 9.776486938447503e-06, + "loss": 0.8806, + "step": 3562 + }, + { + "epoch": 0.19610325279321922, + "grad_norm": 0.8814080357551575, + "learning_rate": 9.776358767574803e-06, + "loss": 0.9096, + "step": 3563 + }, + { + "epoch": 0.19615829159557488, + "grad_norm": 0.7860022187232971, + "learning_rate": 9.77623056080427e-06, + "loss": 0.8101, + "step": 3564 + }, + { + "epoch": 0.19621333039793054, + "grad_norm": 0.7604898810386658, + "learning_rate": 9.776102318136866e-06, + "loss": 0.8121, + "step": 3565 + }, + { + "epoch": 0.1962683692002862, + "grad_norm": 0.810708224773407, + "learning_rate": 9.775974039573555e-06, + "loss": 0.8334, + "step": 3566 + }, + { + "epoch": 0.19632340800264186, + "grad_norm": 1.0174707174301147, + "learning_rate": 9.775845725115301e-06, + "loss": 0.8147, + "step": 3567 + }, + { + "epoch": 0.19637844680499753, + "grad_norm": 0.825137734413147, + "learning_rate": 9.77571737476307e-06, + "loss": 0.816, + "step": 3568 + }, + { + "epoch": 0.1964334856073532, + "grad_norm": 0.9023691415786743, + "learning_rate": 9.775588988517826e-06, + "loss": 0.9157, + "step": 3569 + }, + { + "epoch": 0.19648852440970885, + "grad_norm": 0.7287655472755432, + "learning_rate": 9.775460566380534e-06, + "loss": 0.7414, + "step": 3570 + }, + { + "epoch": 0.1965435632120645, + "grad_norm": 0.8675361275672913, + "learning_rate": 9.775332108352158e-06, + "loss": 0.7212, + "step": 3571 + }, + { + "epoch": 0.19659860201442017, + "grad_norm": 0.8633139729499817, + "learning_rate": 9.775203614433664e-06, + "loss": 0.7254, + "step": 3572 + }, + { + "epoch": 0.19665364081677583, + "grad_norm": 0.8628275394439697, + "learning_rate": 9.775075084626017e-06, + "loss": 0.7403, + "step": 3573 + }, + { + "epoch": 0.1967086796191315, + "grad_norm": 0.86918044090271, + "learning_rate": 9.774946518930184e-06, + "loss": 0.8208, + "step": 3574 + }, + { + "epoch": 0.19676371842148715, + "grad_norm": 1.3616218566894531, + "learning_rate": 9.774817917347132e-06, + "loss": 0.7432, + "step": 3575 + }, + { + "epoch": 0.19681875722384282, + "grad_norm": 0.929084062576294, + "learning_rate": 9.774689279877827e-06, + "loss": 0.9567, + "step": 3576 + }, + { + "epoch": 0.19687379602619848, + "grad_norm": 0.7732542753219604, + "learning_rate": 9.774560606523234e-06, + "loss": 0.8682, + "step": 3577 + }, + { + "epoch": 0.19692883482855414, + "grad_norm": 0.7933471202850342, + "learning_rate": 9.774431897284323e-06, + "loss": 0.7112, + "step": 3578 + }, + { + "epoch": 0.1969838736309098, + "grad_norm": 0.8229583501815796, + "learning_rate": 9.77430315216206e-06, + "loss": 0.762, + "step": 3579 + }, + { + "epoch": 0.19703891243326546, + "grad_norm": 0.7571341395378113, + "learning_rate": 9.774174371157412e-06, + "loss": 0.7627, + "step": 3580 + }, + { + "epoch": 0.19709395123562112, + "grad_norm": 1.1551839113235474, + "learning_rate": 9.774045554271347e-06, + "loss": 0.8621, + "step": 3581 + }, + { + "epoch": 0.19714899003797678, + "grad_norm": 0.8546237349510193, + "learning_rate": 9.773916701504833e-06, + "loss": 0.8183, + "step": 3582 + }, + { + "epoch": 0.19720402884033245, + "grad_norm": 0.7297555804252625, + "learning_rate": 9.773787812858841e-06, + "loss": 0.8098, + "step": 3583 + }, + { + "epoch": 0.1972590676426881, + "grad_norm": 0.7846053838729858, + "learning_rate": 9.773658888334336e-06, + "loss": 0.7874, + "step": 3584 + }, + { + "epoch": 0.19731410644504377, + "grad_norm": 0.8949562907218933, + "learning_rate": 9.773529927932288e-06, + "loss": 0.8651, + "step": 3585 + }, + { + "epoch": 0.19736914524739943, + "grad_norm": 0.8041829466819763, + "learning_rate": 9.773400931653668e-06, + "loss": 0.7519, + "step": 3586 + }, + { + "epoch": 0.19742418404975506, + "grad_norm": 0.8090983033180237, + "learning_rate": 9.773271899499444e-06, + "loss": 0.8606, + "step": 3587 + }, + { + "epoch": 0.19747922285211073, + "grad_norm": 0.7954100966453552, + "learning_rate": 9.773142831470587e-06, + "loss": 0.9028, + "step": 3588 + }, + { + "epoch": 0.1975342616544664, + "grad_norm": 0.6865562796592712, + "learning_rate": 9.773013727568066e-06, + "loss": 0.7323, + "step": 3589 + }, + { + "epoch": 0.19758930045682205, + "grad_norm": 0.9144858717918396, + "learning_rate": 9.772884587792851e-06, + "loss": 0.8178, + "step": 3590 + }, + { + "epoch": 0.1976443392591777, + "grad_norm": 0.8096563220024109, + "learning_rate": 9.772755412145913e-06, + "loss": 0.7749, + "step": 3591 + }, + { + "epoch": 0.19769937806153337, + "grad_norm": 1.4496957063674927, + "learning_rate": 9.772626200628222e-06, + "loss": 0.7981, + "step": 3592 + }, + { + "epoch": 0.19775441686388903, + "grad_norm": 0.7699438333511353, + "learning_rate": 9.77249695324075e-06, + "loss": 0.7683, + "step": 3593 + }, + { + "epoch": 0.1978094556662447, + "grad_norm": 0.7883017063140869, + "learning_rate": 9.77236766998447e-06, + "loss": 0.7668, + "step": 3594 + }, + { + "epoch": 0.19786449446860035, + "grad_norm": 0.7552568912506104, + "learning_rate": 9.772238350860352e-06, + "loss": 0.7914, + "step": 3595 + }, + { + "epoch": 0.19791953327095602, + "grad_norm": 0.8585009574890137, + "learning_rate": 9.772108995869366e-06, + "loss": 0.9888, + "step": 3596 + }, + { + "epoch": 0.19797457207331168, + "grad_norm": 0.9459839463233948, + "learning_rate": 9.77197960501249e-06, + "loss": 0.9923, + "step": 3597 + }, + { + "epoch": 0.19802961087566734, + "grad_norm": 0.844771683216095, + "learning_rate": 9.77185017829069e-06, + "loss": 0.8427, + "step": 3598 + }, + { + "epoch": 0.198084649678023, + "grad_norm": 0.749700665473938, + "learning_rate": 9.77172071570494e-06, + "loss": 0.8111, + "step": 3599 + }, + { + "epoch": 0.19813968848037866, + "grad_norm": 0.7297450304031372, + "learning_rate": 9.771591217256216e-06, + "loss": 0.7783, + "step": 3600 + }, + { + "epoch": 0.19819472728273432, + "grad_norm": 0.7928450703620911, + "learning_rate": 9.77146168294549e-06, + "loss": 0.8755, + "step": 3601 + }, + { + "epoch": 0.19824976608508998, + "grad_norm": 0.7236143946647644, + "learning_rate": 9.771332112773734e-06, + "loss": 0.7159, + "step": 3602 + }, + { + "epoch": 0.19830480488744565, + "grad_norm": 0.8170965313911438, + "learning_rate": 9.771202506741926e-06, + "loss": 0.9093, + "step": 3603 + }, + { + "epoch": 0.1983598436898013, + "grad_norm": 0.8834578990936279, + "learning_rate": 9.771072864851035e-06, + "loss": 0.8961, + "step": 3604 + }, + { + "epoch": 0.19841488249215697, + "grad_norm": 1.3750289678573608, + "learning_rate": 9.770943187102037e-06, + "loss": 0.8175, + "step": 3605 + }, + { + "epoch": 0.19846992129451263, + "grad_norm": 0.7016286253929138, + "learning_rate": 9.770813473495909e-06, + "loss": 0.7171, + "step": 3606 + }, + { + "epoch": 0.1985249600968683, + "grad_norm": 0.7792307734489441, + "learning_rate": 9.770683724033622e-06, + "loss": 0.6892, + "step": 3607 + }, + { + "epoch": 0.19857999889922395, + "grad_norm": 0.789820671081543, + "learning_rate": 9.770553938716153e-06, + "loss": 0.8531, + "step": 3608 + }, + { + "epoch": 0.19863503770157961, + "grad_norm": 0.7585997581481934, + "learning_rate": 9.77042411754448e-06, + "loss": 0.8195, + "step": 3609 + }, + { + "epoch": 0.19869007650393528, + "grad_norm": 0.8989273905754089, + "learning_rate": 9.770294260519573e-06, + "loss": 0.891, + "step": 3610 + }, + { + "epoch": 0.19874511530629094, + "grad_norm": 0.8044012188911438, + "learning_rate": 9.770164367642414e-06, + "loss": 0.8428, + "step": 3611 + }, + { + "epoch": 0.1988001541086466, + "grad_norm": 0.7847021222114563, + "learning_rate": 9.770034438913975e-06, + "loss": 0.8302, + "step": 3612 + }, + { + "epoch": 0.19885519291100226, + "grad_norm": 0.9260531663894653, + "learning_rate": 9.769904474335234e-06, + "loss": 0.8187, + "step": 3613 + }, + { + "epoch": 0.19891023171335792, + "grad_norm": 0.7491805553436279, + "learning_rate": 9.769774473907168e-06, + "loss": 0.8374, + "step": 3614 + }, + { + "epoch": 0.19896527051571358, + "grad_norm": 1.1665992736816406, + "learning_rate": 9.769644437630754e-06, + "loss": 0.8154, + "step": 3615 + }, + { + "epoch": 0.19902030931806924, + "grad_norm": 0.9162279963493347, + "learning_rate": 9.769514365506968e-06, + "loss": 0.8883, + "step": 3616 + }, + { + "epoch": 0.1990753481204249, + "grad_norm": 0.8980437517166138, + "learning_rate": 9.769384257536791e-06, + "loss": 0.8948, + "step": 3617 + }, + { + "epoch": 0.19913038692278057, + "grad_norm": 0.7544137835502625, + "learning_rate": 9.769254113721197e-06, + "loss": 0.7763, + "step": 3618 + }, + { + "epoch": 0.19918542572513623, + "grad_norm": 0.8393334746360779, + "learning_rate": 9.769123934061168e-06, + "loss": 0.8361, + "step": 3619 + }, + { + "epoch": 0.1992404645274919, + "grad_norm": 0.8184031248092651, + "learning_rate": 9.768993718557678e-06, + "loss": 0.8104, + "step": 3620 + }, + { + "epoch": 0.19929550332984755, + "grad_norm": 0.8023706674575806, + "learning_rate": 9.76886346721171e-06, + "loss": 0.7824, + "step": 3621 + }, + { + "epoch": 0.1993505421322032, + "grad_norm": 0.9354264736175537, + "learning_rate": 9.768733180024238e-06, + "loss": 0.7782, + "step": 3622 + }, + { + "epoch": 0.19940558093455887, + "grad_norm": 0.7037177681922913, + "learning_rate": 9.768602856996244e-06, + "loss": 0.8054, + "step": 3623 + }, + { + "epoch": 0.19946061973691454, + "grad_norm": 0.7926928997039795, + "learning_rate": 9.768472498128709e-06, + "loss": 0.8864, + "step": 3624 + }, + { + "epoch": 0.1995156585392702, + "grad_norm": 0.7963769435882568, + "learning_rate": 9.76834210342261e-06, + "loss": 0.8505, + "step": 3625 + }, + { + "epoch": 0.19957069734162586, + "grad_norm": 0.8553926944732666, + "learning_rate": 9.768211672878929e-06, + "loss": 0.8519, + "step": 3626 + }, + { + "epoch": 0.19962573614398152, + "grad_norm": 0.8147156834602356, + "learning_rate": 9.768081206498644e-06, + "loss": 0.8091, + "step": 3627 + }, + { + "epoch": 0.19968077494633718, + "grad_norm": 0.8226443529129028, + "learning_rate": 9.767950704282739e-06, + "loss": 0.8561, + "step": 3628 + }, + { + "epoch": 0.19973581374869284, + "grad_norm": 0.7246909141540527, + "learning_rate": 9.76782016623219e-06, + "loss": 0.7318, + "step": 3629 + }, + { + "epoch": 0.19979085255104848, + "grad_norm": 1.0527293682098389, + "learning_rate": 9.767689592347983e-06, + "loss": 0.7699, + "step": 3630 + }, + { + "epoch": 0.19984589135340414, + "grad_norm": 0.7433847188949585, + "learning_rate": 9.767558982631097e-06, + "loss": 0.8619, + "step": 3631 + }, + { + "epoch": 0.1999009301557598, + "grad_norm": 0.7901468873023987, + "learning_rate": 9.767428337082513e-06, + "loss": 0.8365, + "step": 3632 + }, + { + "epoch": 0.19995596895811546, + "grad_norm": 0.7766845226287842, + "learning_rate": 9.767297655703215e-06, + "loss": 0.7767, + "step": 3633 + }, + { + "epoch": 0.20001100776047112, + "grad_norm": 0.7785109281539917, + "learning_rate": 9.767166938494183e-06, + "loss": 0.7114, + "step": 3634 + }, + { + "epoch": 0.20006604656282678, + "grad_norm": 0.8068187832832336, + "learning_rate": 9.767036185456402e-06, + "loss": 0.8142, + "step": 3635 + }, + { + "epoch": 0.20012108536518244, + "grad_norm": 0.7893292307853699, + "learning_rate": 9.766905396590851e-06, + "loss": 0.8658, + "step": 3636 + }, + { + "epoch": 0.2001761241675381, + "grad_norm": 0.8647506237030029, + "learning_rate": 9.766774571898516e-06, + "loss": 0.84, + "step": 3637 + }, + { + "epoch": 0.20023116296989377, + "grad_norm": 0.8545078635215759, + "learning_rate": 9.766643711380378e-06, + "loss": 0.8455, + "step": 3638 + }, + { + "epoch": 0.20028620177224943, + "grad_norm": 0.924404501914978, + "learning_rate": 9.766512815037424e-06, + "loss": 0.6954, + "step": 3639 + }, + { + "epoch": 0.2003412405746051, + "grad_norm": 0.8077614903450012, + "learning_rate": 9.766381882870635e-06, + "loss": 0.7724, + "step": 3640 + }, + { + "epoch": 0.20039627937696075, + "grad_norm": 0.8886739015579224, + "learning_rate": 9.766250914880994e-06, + "loss": 0.8318, + "step": 3641 + }, + { + "epoch": 0.2004513181793164, + "grad_norm": 0.8086267113685608, + "learning_rate": 9.76611991106949e-06, + "loss": 0.8494, + "step": 3642 + }, + { + "epoch": 0.20050635698167207, + "grad_norm": 0.8606873750686646, + "learning_rate": 9.765988871437101e-06, + "loss": 0.8488, + "step": 3643 + }, + { + "epoch": 0.20056139578402774, + "grad_norm": 0.6966355443000793, + "learning_rate": 9.76585779598482e-06, + "loss": 0.7361, + "step": 3644 + }, + { + "epoch": 0.2006164345863834, + "grad_norm": 0.8474385738372803, + "learning_rate": 9.765726684713623e-06, + "loss": 0.8354, + "step": 3645 + }, + { + "epoch": 0.20067147338873906, + "grad_norm": 0.7609736919403076, + "learning_rate": 9.765595537624502e-06, + "loss": 0.7297, + "step": 3646 + }, + { + "epoch": 0.20072651219109472, + "grad_norm": 1.08648681640625, + "learning_rate": 9.76546435471844e-06, + "loss": 0.7534, + "step": 3647 + }, + { + "epoch": 0.20078155099345038, + "grad_norm": 0.7437332272529602, + "learning_rate": 9.765333135996425e-06, + "loss": 0.8532, + "step": 3648 + }, + { + "epoch": 0.20083658979580604, + "grad_norm": 0.9016552567481995, + "learning_rate": 9.76520188145944e-06, + "loss": 0.7968, + "step": 3649 + }, + { + "epoch": 0.2008916285981617, + "grad_norm": 0.8916428089141846, + "learning_rate": 9.765070591108473e-06, + "loss": 0.9601, + "step": 3650 + }, + { + "epoch": 0.20094666740051736, + "grad_norm": 0.7679058313369751, + "learning_rate": 9.764939264944512e-06, + "loss": 0.816, + "step": 3651 + }, + { + "epoch": 0.20100170620287303, + "grad_norm": 0.7716549634933472, + "learning_rate": 9.764807902968543e-06, + "loss": 0.876, + "step": 3652 + }, + { + "epoch": 0.2010567450052287, + "grad_norm": 0.8288074731826782, + "learning_rate": 9.764676505181554e-06, + "loss": 0.8054, + "step": 3653 + }, + { + "epoch": 0.20111178380758435, + "grad_norm": 0.7906842827796936, + "learning_rate": 9.76454507158453e-06, + "loss": 0.8026, + "step": 3654 + }, + { + "epoch": 0.20116682260994, + "grad_norm": 0.8093311190605164, + "learning_rate": 9.764413602178461e-06, + "loss": 0.8093, + "step": 3655 + }, + { + "epoch": 0.20122186141229567, + "grad_norm": 0.7234730124473572, + "learning_rate": 9.764282096964335e-06, + "loss": 0.7194, + "step": 3656 + }, + { + "epoch": 0.20127690021465133, + "grad_norm": 0.9048555493354797, + "learning_rate": 9.76415055594314e-06, + "loss": 0.8996, + "step": 3657 + }, + { + "epoch": 0.201331939017007, + "grad_norm": 0.7630691528320312, + "learning_rate": 9.764018979115864e-06, + "loss": 0.7876, + "step": 3658 + }, + { + "epoch": 0.20138697781936266, + "grad_norm": 0.9551032781600952, + "learning_rate": 9.763887366483498e-06, + "loss": 0.8249, + "step": 3659 + }, + { + "epoch": 0.20144201662171832, + "grad_norm": 0.6988314986228943, + "learning_rate": 9.76375571804703e-06, + "loss": 0.8011, + "step": 3660 + }, + { + "epoch": 0.20149705542407398, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.763624033807448e-06, + "loss": 0.8287, + "step": 3661 + }, + { + "epoch": 0.20155209422642964, + "grad_norm": 0.7201293706893921, + "learning_rate": 9.763492313765743e-06, + "loss": 0.7854, + "step": 3662 + }, + { + "epoch": 0.2016071330287853, + "grad_norm": 0.8691730499267578, + "learning_rate": 9.763360557922905e-06, + "loss": 0.8348, + "step": 3663 + }, + { + "epoch": 0.20166217183114096, + "grad_norm": 0.7660881876945496, + "learning_rate": 9.763228766279924e-06, + "loss": 0.7686, + "step": 3664 + }, + { + "epoch": 0.20171721063349662, + "grad_norm": 1.083796501159668, + "learning_rate": 9.76309693883779e-06, + "loss": 0.8848, + "step": 3665 + }, + { + "epoch": 0.20177224943585229, + "grad_norm": 0.7892678380012512, + "learning_rate": 9.762965075597496e-06, + "loss": 0.7804, + "step": 3666 + }, + { + "epoch": 0.20182728823820795, + "grad_norm": 0.7166122198104858, + "learning_rate": 9.762833176560031e-06, + "loss": 0.761, + "step": 3667 + }, + { + "epoch": 0.2018823270405636, + "grad_norm": 0.8187084794044495, + "learning_rate": 9.762701241726386e-06, + "loss": 0.8251, + "step": 3668 + }, + { + "epoch": 0.20193736584291927, + "grad_norm": 0.6930577158927917, + "learning_rate": 9.762569271097556e-06, + "loss": 0.6795, + "step": 3669 + }, + { + "epoch": 0.20199240464527493, + "grad_norm": 0.8085465431213379, + "learning_rate": 9.762437264674527e-06, + "loss": 0.8415, + "step": 3670 + }, + { + "epoch": 0.2020474434476306, + "grad_norm": 0.8111084699630737, + "learning_rate": 9.762305222458294e-06, + "loss": 0.792, + "step": 3671 + }, + { + "epoch": 0.20210248224998625, + "grad_norm": 0.8200401067733765, + "learning_rate": 9.762173144449852e-06, + "loss": 0.8224, + "step": 3672 + }, + { + "epoch": 0.2021575210523419, + "grad_norm": 0.8460109233856201, + "learning_rate": 9.762041030650192e-06, + "loss": 0.9025, + "step": 3673 + }, + { + "epoch": 0.20221255985469755, + "grad_norm": 0.8152671456336975, + "learning_rate": 9.761908881060303e-06, + "loss": 0.9002, + "step": 3674 + }, + { + "epoch": 0.2022675986570532, + "grad_norm": 0.8204773664474487, + "learning_rate": 9.761776695681185e-06, + "loss": 0.8324, + "step": 3675 + }, + { + "epoch": 0.20232263745940887, + "grad_norm": 0.8121044039726257, + "learning_rate": 9.761644474513825e-06, + "loss": 0.855, + "step": 3676 + }, + { + "epoch": 0.20237767626176453, + "grad_norm": 0.79920494556427, + "learning_rate": 9.76151221755922e-06, + "loss": 0.7837, + "step": 3677 + }, + { + "epoch": 0.2024327150641202, + "grad_norm": 0.862808346748352, + "learning_rate": 9.761379924818367e-06, + "loss": 0.8714, + "step": 3678 + }, + { + "epoch": 0.20248775386647586, + "grad_norm": 0.7135004997253418, + "learning_rate": 9.761247596292254e-06, + "loss": 0.774, + "step": 3679 + }, + { + "epoch": 0.20254279266883152, + "grad_norm": 0.7967603802680969, + "learning_rate": 9.761115231981878e-06, + "loss": 0.919, + "step": 3680 + }, + { + "epoch": 0.20259783147118718, + "grad_norm": 0.7425099611282349, + "learning_rate": 9.760982831888236e-06, + "loss": 0.819, + "step": 3681 + }, + { + "epoch": 0.20265287027354284, + "grad_norm": 0.7631763815879822, + "learning_rate": 9.760850396012323e-06, + "loss": 0.816, + "step": 3682 + }, + { + "epoch": 0.2027079090758985, + "grad_norm": 0.7931755185127258, + "learning_rate": 9.76071792435513e-06, + "loss": 0.8299, + "step": 3683 + }, + { + "epoch": 0.20276294787825416, + "grad_norm": 0.8409438729286194, + "learning_rate": 9.760585416917657e-06, + "loss": 0.8503, + "step": 3684 + }, + { + "epoch": 0.20281798668060982, + "grad_norm": 0.7632728815078735, + "learning_rate": 9.760452873700898e-06, + "loss": 0.8394, + "step": 3685 + }, + { + "epoch": 0.20287302548296549, + "grad_norm": 0.7765083312988281, + "learning_rate": 9.76032029470585e-06, + "loss": 0.8879, + "step": 3686 + }, + { + "epoch": 0.20292806428532115, + "grad_norm": 0.7736936807632446, + "learning_rate": 9.760187679933507e-06, + "loss": 0.7987, + "step": 3687 + }, + { + "epoch": 0.2029831030876768, + "grad_norm": 0.8270270824432373, + "learning_rate": 9.760055029384869e-06, + "loss": 0.8267, + "step": 3688 + }, + { + "epoch": 0.20303814189003247, + "grad_norm": 0.7742369174957275, + "learning_rate": 9.759922343060932e-06, + "loss": 0.8447, + "step": 3689 + }, + { + "epoch": 0.20309318069238813, + "grad_norm": 0.7543869018554688, + "learning_rate": 9.759789620962692e-06, + "loss": 0.7325, + "step": 3690 + }, + { + "epoch": 0.2031482194947438, + "grad_norm": 0.7913174033164978, + "learning_rate": 9.759656863091147e-06, + "loss": 0.8622, + "step": 3691 + }, + { + "epoch": 0.20320325829709945, + "grad_norm": 0.7445376515388489, + "learning_rate": 9.759524069447296e-06, + "loss": 0.7115, + "step": 3692 + }, + { + "epoch": 0.20325829709945512, + "grad_norm": 0.7744696140289307, + "learning_rate": 9.759391240032136e-06, + "loss": 0.8437, + "step": 3693 + }, + { + "epoch": 0.20331333590181078, + "grad_norm": 0.6984724998474121, + "learning_rate": 9.759258374846665e-06, + "loss": 0.7415, + "step": 3694 + }, + { + "epoch": 0.20336837470416644, + "grad_norm": 0.7453249096870422, + "learning_rate": 9.759125473891882e-06, + "loss": 0.7708, + "step": 3695 + }, + { + "epoch": 0.2034234135065221, + "grad_norm": 0.7459438443183899, + "learning_rate": 9.758992537168787e-06, + "loss": 0.7961, + "step": 3696 + }, + { + "epoch": 0.20347845230887776, + "grad_norm": 0.808944582939148, + "learning_rate": 9.758859564678377e-06, + "loss": 0.8875, + "step": 3697 + }, + { + "epoch": 0.20353349111123342, + "grad_norm": 0.7202889323234558, + "learning_rate": 9.758726556421652e-06, + "loss": 0.8064, + "step": 3698 + }, + { + "epoch": 0.20358852991358908, + "grad_norm": 0.7874952554702759, + "learning_rate": 9.758593512399613e-06, + "loss": 0.7881, + "step": 3699 + }, + { + "epoch": 0.20364356871594474, + "grad_norm": 0.771300733089447, + "learning_rate": 9.758460432613259e-06, + "loss": 0.8938, + "step": 3700 + }, + { + "epoch": 0.2036986075183004, + "grad_norm": 0.7332000136375427, + "learning_rate": 9.758327317063589e-06, + "loss": 0.7369, + "step": 3701 + }, + { + "epoch": 0.20375364632065607, + "grad_norm": 0.8206236958503723, + "learning_rate": 9.758194165751604e-06, + "loss": 0.8727, + "step": 3702 + }, + { + "epoch": 0.20380868512301173, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.758060978678308e-06, + "loss": 0.8013, + "step": 3703 + }, + { + "epoch": 0.2038637239253674, + "grad_norm": 0.7213704586029053, + "learning_rate": 9.757927755844698e-06, + "loss": 0.7413, + "step": 3704 + }, + { + "epoch": 0.20391876272772305, + "grad_norm": 0.8982640504837036, + "learning_rate": 9.757794497251776e-06, + "loss": 0.9077, + "step": 3705 + }, + { + "epoch": 0.2039738015300787, + "grad_norm": 0.8439363241195679, + "learning_rate": 9.757661202900544e-06, + "loss": 0.7887, + "step": 3706 + }, + { + "epoch": 0.20402884033243437, + "grad_norm": 0.7700560688972473, + "learning_rate": 9.757527872792005e-06, + "loss": 0.8677, + "step": 3707 + }, + { + "epoch": 0.20408387913479004, + "grad_norm": 0.7462438941001892, + "learning_rate": 9.75739450692716e-06, + "loss": 0.7937, + "step": 3708 + }, + { + "epoch": 0.2041389179371457, + "grad_norm": 0.9125999808311462, + "learning_rate": 9.75726110530701e-06, + "loss": 0.9374, + "step": 3709 + }, + { + "epoch": 0.20419395673950136, + "grad_norm": 0.8949875831604004, + "learning_rate": 9.75712766793256e-06, + "loss": 0.8586, + "step": 3710 + }, + { + "epoch": 0.20424899554185702, + "grad_norm": 0.9042442440986633, + "learning_rate": 9.756994194804812e-06, + "loss": 0.9411, + "step": 3711 + }, + { + "epoch": 0.20430403434421268, + "grad_norm": 0.7646238207817078, + "learning_rate": 9.756860685924769e-06, + "loss": 0.8353, + "step": 3712 + }, + { + "epoch": 0.20435907314656834, + "grad_norm": 0.7551934123039246, + "learning_rate": 9.756727141293434e-06, + "loss": 0.8109, + "step": 3713 + }, + { + "epoch": 0.204414111948924, + "grad_norm": 0.7526532411575317, + "learning_rate": 9.756593560911811e-06, + "loss": 0.8509, + "step": 3714 + }, + { + "epoch": 0.20446915075127967, + "grad_norm": 0.8423319458961487, + "learning_rate": 9.756459944780903e-06, + "loss": 0.9003, + "step": 3715 + }, + { + "epoch": 0.2045241895536353, + "grad_norm": 0.7966015934944153, + "learning_rate": 9.756326292901716e-06, + "loss": 0.7606, + "step": 3716 + }, + { + "epoch": 0.20457922835599096, + "grad_norm": 0.7642805576324463, + "learning_rate": 9.756192605275256e-06, + "loss": 0.8321, + "step": 3717 + }, + { + "epoch": 0.20463426715834662, + "grad_norm": 0.7285729646682739, + "learning_rate": 9.756058881902524e-06, + "loss": 0.7375, + "step": 3718 + }, + { + "epoch": 0.20468930596070228, + "grad_norm": 0.852020263671875, + "learning_rate": 9.755925122784525e-06, + "loss": 0.8207, + "step": 3719 + }, + { + "epoch": 0.20474434476305794, + "grad_norm": 0.8227072358131409, + "learning_rate": 9.755791327922268e-06, + "loss": 0.872, + "step": 3720 + }, + { + "epoch": 0.2047993835654136, + "grad_norm": 1.0128127336502075, + "learning_rate": 9.755657497316755e-06, + "loss": 0.9186, + "step": 3721 + }, + { + "epoch": 0.20485442236776927, + "grad_norm": 0.8208017349243164, + "learning_rate": 9.755523630968994e-06, + "loss": 0.6968, + "step": 3722 + }, + { + "epoch": 0.20490946117012493, + "grad_norm": 0.7716407179832458, + "learning_rate": 9.75538972887999e-06, + "loss": 0.8068, + "step": 3723 + }, + { + "epoch": 0.2049644999724806, + "grad_norm": 0.779608964920044, + "learning_rate": 9.75525579105075e-06, + "loss": 0.6968, + "step": 3724 + }, + { + "epoch": 0.20501953877483625, + "grad_norm": 0.7463479042053223, + "learning_rate": 9.75512181748228e-06, + "loss": 0.7581, + "step": 3725 + }, + { + "epoch": 0.2050745775771919, + "grad_norm": 0.8104956150054932, + "learning_rate": 9.754987808175587e-06, + "loss": 0.7838, + "step": 3726 + }, + { + "epoch": 0.20512961637954757, + "grad_norm": 0.7911564707756042, + "learning_rate": 9.75485376313168e-06, + "loss": 0.848, + "step": 3727 + }, + { + "epoch": 0.20518465518190324, + "grad_norm": 0.8340871334075928, + "learning_rate": 9.754719682351564e-06, + "loss": 0.7879, + "step": 3728 + }, + { + "epoch": 0.2052396939842589, + "grad_norm": 1.5543067455291748, + "learning_rate": 9.754585565836247e-06, + "loss": 0.8091, + "step": 3729 + }, + { + "epoch": 0.20529473278661456, + "grad_norm": 0.8262580633163452, + "learning_rate": 9.754451413586739e-06, + "loss": 0.9076, + "step": 3730 + }, + { + "epoch": 0.20534977158897022, + "grad_norm": 0.7558280825614929, + "learning_rate": 9.754317225604045e-06, + "loss": 0.7781, + "step": 3731 + }, + { + "epoch": 0.20540481039132588, + "grad_norm": 0.7197710275650024, + "learning_rate": 9.754183001889177e-06, + "loss": 0.765, + "step": 3732 + }, + { + "epoch": 0.20545984919368154, + "grad_norm": 0.8053440451622009, + "learning_rate": 9.754048742443141e-06, + "loss": 0.7986, + "step": 3733 + }, + { + "epoch": 0.2055148879960372, + "grad_norm": 0.9183983206748962, + "learning_rate": 9.753914447266947e-06, + "loss": 0.8522, + "step": 3734 + }, + { + "epoch": 0.20556992679839287, + "grad_norm": 0.8095504641532898, + "learning_rate": 9.753780116361607e-06, + "loss": 0.7243, + "step": 3735 + }, + { + "epoch": 0.20562496560074853, + "grad_norm": 0.816818356513977, + "learning_rate": 9.753645749728127e-06, + "loss": 0.8262, + "step": 3736 + }, + { + "epoch": 0.2056800044031042, + "grad_norm": 0.8425988554954529, + "learning_rate": 9.753511347367516e-06, + "loss": 0.8142, + "step": 3737 + }, + { + "epoch": 0.20573504320545985, + "grad_norm": 0.7719724178314209, + "learning_rate": 9.753376909280789e-06, + "loss": 0.8444, + "step": 3738 + }, + { + "epoch": 0.2057900820078155, + "grad_norm": 0.877646803855896, + "learning_rate": 9.753242435468952e-06, + "loss": 0.8515, + "step": 3739 + }, + { + "epoch": 0.20584512081017117, + "grad_norm": 0.9261211156845093, + "learning_rate": 9.753107925933017e-06, + "loss": 0.7605, + "step": 3740 + }, + { + "epoch": 0.20590015961252683, + "grad_norm": 0.7790889739990234, + "learning_rate": 9.752973380673995e-06, + "loss": 0.7911, + "step": 3741 + }, + { + "epoch": 0.2059551984148825, + "grad_norm": 0.7112367153167725, + "learning_rate": 9.752838799692899e-06, + "loss": 0.8212, + "step": 3742 + }, + { + "epoch": 0.20601023721723816, + "grad_norm": 0.7568365335464478, + "learning_rate": 9.752704182990736e-06, + "loss": 0.8505, + "step": 3743 + }, + { + "epoch": 0.20606527601959382, + "grad_norm": 0.7501981258392334, + "learning_rate": 9.752569530568523e-06, + "loss": 0.8191, + "step": 3744 + }, + { + "epoch": 0.20612031482194948, + "grad_norm": 0.7822220325469971, + "learning_rate": 9.752434842427268e-06, + "loss": 0.8032, + "step": 3745 + }, + { + "epoch": 0.20617535362430514, + "grad_norm": 0.810197114944458, + "learning_rate": 9.752300118567987e-06, + "loss": 0.7789, + "step": 3746 + }, + { + "epoch": 0.2062303924266608, + "grad_norm": 0.7386943101882935, + "learning_rate": 9.752165358991688e-06, + "loss": 0.7733, + "step": 3747 + }, + { + "epoch": 0.20628543122901646, + "grad_norm": 0.7086807489395142, + "learning_rate": 9.75203056369939e-06, + "loss": 0.6328, + "step": 3748 + }, + { + "epoch": 0.20634047003137213, + "grad_norm": 0.9881154894828796, + "learning_rate": 9.751895732692099e-06, + "loss": 0.8515, + "step": 3749 + }, + { + "epoch": 0.2063955088337278, + "grad_norm": 0.813521683216095, + "learning_rate": 9.751760865970831e-06, + "loss": 0.8438, + "step": 3750 + }, + { + "epoch": 0.20645054763608345, + "grad_norm": 0.8357470631599426, + "learning_rate": 9.751625963536602e-06, + "loss": 0.7635, + "step": 3751 + }, + { + "epoch": 0.2065055864384391, + "grad_norm": 0.8629693388938904, + "learning_rate": 9.751491025390423e-06, + "loss": 0.888, + "step": 3752 + }, + { + "epoch": 0.20656062524079477, + "grad_norm": 0.8844664096832275, + "learning_rate": 9.751356051533311e-06, + "loss": 0.7654, + "step": 3753 + }, + { + "epoch": 0.20661566404315043, + "grad_norm": 0.7006319165229797, + "learning_rate": 9.751221041966276e-06, + "loss": 0.7618, + "step": 3754 + }, + { + "epoch": 0.2066707028455061, + "grad_norm": 0.9291046261787415, + "learning_rate": 9.75108599669034e-06, + "loss": 0.8485, + "step": 3755 + }, + { + "epoch": 0.20672574164786175, + "grad_norm": 0.7670828700065613, + "learning_rate": 9.75095091570651e-06, + "loss": 0.7856, + "step": 3756 + }, + { + "epoch": 0.20678078045021742, + "grad_norm": 0.8709883689880371, + "learning_rate": 9.750815799015804e-06, + "loss": 0.7983, + "step": 3757 + }, + { + "epoch": 0.20683581925257308, + "grad_norm": 0.7688055634498596, + "learning_rate": 9.750680646619241e-06, + "loss": 0.8064, + "step": 3758 + }, + { + "epoch": 0.2068908580549287, + "grad_norm": 0.9492738246917725, + "learning_rate": 9.750545458517832e-06, + "loss": 0.8256, + "step": 3759 + }, + { + "epoch": 0.20694589685728437, + "grad_norm": 0.9685352444648743, + "learning_rate": 9.750410234712596e-06, + "loss": 0.839, + "step": 3760 + }, + { + "epoch": 0.20700093565964003, + "grad_norm": 0.788577139377594, + "learning_rate": 9.750274975204547e-06, + "loss": 0.8743, + "step": 3761 + }, + { + "epoch": 0.2070559744619957, + "grad_norm": 0.8496370315551758, + "learning_rate": 9.750139679994703e-06, + "loss": 0.9286, + "step": 3762 + }, + { + "epoch": 0.20711101326435136, + "grad_norm": 0.9539788961410522, + "learning_rate": 9.750004349084083e-06, + "loss": 0.7568, + "step": 3763 + }, + { + "epoch": 0.20716605206670702, + "grad_norm": 0.8825643062591553, + "learning_rate": 9.7498689824737e-06, + "loss": 0.9339, + "step": 3764 + }, + { + "epoch": 0.20722109086906268, + "grad_norm": 0.7771373391151428, + "learning_rate": 9.749733580164573e-06, + "loss": 0.851, + "step": 3765 + }, + { + "epoch": 0.20727612967141834, + "grad_norm": 0.7460281252861023, + "learning_rate": 9.749598142157721e-06, + "loss": 0.8208, + "step": 3766 + }, + { + "epoch": 0.207331168473774, + "grad_norm": 0.8370739817619324, + "learning_rate": 9.74946266845416e-06, + "loss": 0.8634, + "step": 3767 + }, + { + "epoch": 0.20738620727612966, + "grad_norm": 0.7770463228225708, + "learning_rate": 9.749327159054907e-06, + "loss": 0.7955, + "step": 3768 + }, + { + "epoch": 0.20744124607848533, + "grad_norm": 0.8048208355903625, + "learning_rate": 9.749191613960985e-06, + "loss": 0.7736, + "step": 3769 + }, + { + "epoch": 0.207496284880841, + "grad_norm": 0.9187547564506531, + "learning_rate": 9.74905603317341e-06, + "loss": 0.8534, + "step": 3770 + }, + { + "epoch": 0.20755132368319665, + "grad_norm": 0.7304024696350098, + "learning_rate": 9.7489204166932e-06, + "loss": 0.72, + "step": 3771 + }, + { + "epoch": 0.2076063624855523, + "grad_norm": 0.86177659034729, + "learning_rate": 9.748784764521376e-06, + "loss": 0.7838, + "step": 3772 + }, + { + "epoch": 0.20766140128790797, + "grad_norm": 0.7988011837005615, + "learning_rate": 9.748649076658956e-06, + "loss": 0.7776, + "step": 3773 + }, + { + "epoch": 0.20771644009026363, + "grad_norm": 0.706099808216095, + "learning_rate": 9.74851335310696e-06, + "loss": 0.759, + "step": 3774 + }, + { + "epoch": 0.2077714788926193, + "grad_norm": 0.8125914931297302, + "learning_rate": 9.748377593866412e-06, + "loss": 0.8155, + "step": 3775 + }, + { + "epoch": 0.20782651769497495, + "grad_norm": 0.8603429794311523, + "learning_rate": 9.748241798938326e-06, + "loss": 0.8018, + "step": 3776 + }, + { + "epoch": 0.20788155649733062, + "grad_norm": 0.7735254764556885, + "learning_rate": 9.748105968323726e-06, + "loss": 0.7788, + "step": 3777 + }, + { + "epoch": 0.20793659529968628, + "grad_norm": 0.9037501811981201, + "learning_rate": 9.747970102023635e-06, + "loss": 0.8907, + "step": 3778 + }, + { + "epoch": 0.20799163410204194, + "grad_norm": 0.8781846761703491, + "learning_rate": 9.74783420003907e-06, + "loss": 0.867, + "step": 3779 + }, + { + "epoch": 0.2080466729043976, + "grad_norm": 0.8486423492431641, + "learning_rate": 9.747698262371052e-06, + "loss": 0.817, + "step": 3780 + }, + { + "epoch": 0.20810171170675326, + "grad_norm": 0.8242751359939575, + "learning_rate": 9.747562289020607e-06, + "loss": 0.7385, + "step": 3781 + }, + { + "epoch": 0.20815675050910892, + "grad_norm": 0.8776529431343079, + "learning_rate": 9.747426279988754e-06, + "loss": 0.8222, + "step": 3782 + }, + { + "epoch": 0.20821178931146458, + "grad_norm": 0.7428975105285645, + "learning_rate": 9.747290235276517e-06, + "loss": 0.6954, + "step": 3783 + }, + { + "epoch": 0.20826682811382025, + "grad_norm": 0.8631997108459473, + "learning_rate": 9.747154154884917e-06, + "loss": 0.7956, + "step": 3784 + }, + { + "epoch": 0.2083218669161759, + "grad_norm": 0.7819229364395142, + "learning_rate": 9.747018038814976e-06, + "loss": 0.778, + "step": 3785 + }, + { + "epoch": 0.20837690571853157, + "grad_norm": 0.7770963311195374, + "learning_rate": 9.746881887067718e-06, + "loss": 0.8055, + "step": 3786 + }, + { + "epoch": 0.20843194452088723, + "grad_norm": 0.7168729305267334, + "learning_rate": 9.746745699644169e-06, + "loss": 0.7476, + "step": 3787 + }, + { + "epoch": 0.2084869833232429, + "grad_norm": 0.7963632941246033, + "learning_rate": 9.746609476545348e-06, + "loss": 0.8083, + "step": 3788 + }, + { + "epoch": 0.20854202212559855, + "grad_norm": 0.6689679026603699, + "learning_rate": 9.746473217772281e-06, + "loss": 0.6687, + "step": 3789 + }, + { + "epoch": 0.20859706092795421, + "grad_norm": 0.8085560202598572, + "learning_rate": 9.746336923325991e-06, + "loss": 0.8221, + "step": 3790 + }, + { + "epoch": 0.20865209973030988, + "grad_norm": 0.7215744256973267, + "learning_rate": 9.746200593207505e-06, + "loss": 0.7261, + "step": 3791 + }, + { + "epoch": 0.20870713853266554, + "grad_norm": 0.7821729779243469, + "learning_rate": 9.746064227417844e-06, + "loss": 0.7683, + "step": 3792 + }, + { + "epoch": 0.2087621773350212, + "grad_norm": 1.0014925003051758, + "learning_rate": 9.745927825958036e-06, + "loss": 0.7485, + "step": 3793 + }, + { + "epoch": 0.20881721613737686, + "grad_norm": 0.9447367787361145, + "learning_rate": 9.745791388829102e-06, + "loss": 0.835, + "step": 3794 + }, + { + "epoch": 0.20887225493973252, + "grad_norm": 0.7333751916885376, + "learning_rate": 9.745654916032073e-06, + "loss": 0.811, + "step": 3795 + }, + { + "epoch": 0.20892729374208818, + "grad_norm": 0.7516912221908569, + "learning_rate": 9.745518407567973e-06, + "loss": 0.7669, + "step": 3796 + }, + { + "epoch": 0.20898233254444384, + "grad_norm": 0.7826053500175476, + "learning_rate": 9.745381863437824e-06, + "loss": 0.7963, + "step": 3797 + }, + { + "epoch": 0.2090373713467995, + "grad_norm": 0.8258751630783081, + "learning_rate": 9.745245283642658e-06, + "loss": 0.7929, + "step": 3798 + }, + { + "epoch": 0.20909241014915517, + "grad_norm": 0.7990522980690002, + "learning_rate": 9.745108668183497e-06, + "loss": 0.8518, + "step": 3799 + }, + { + "epoch": 0.20914744895151083, + "grad_norm": 1.3855403661727905, + "learning_rate": 9.744972017061369e-06, + "loss": 0.7768, + "step": 3800 + }, + { + "epoch": 0.2092024877538665, + "grad_norm": 0.8456707000732422, + "learning_rate": 9.744835330277302e-06, + "loss": 0.7629, + "step": 3801 + }, + { + "epoch": 0.20925752655622212, + "grad_norm": 0.8992564678192139, + "learning_rate": 9.744698607832323e-06, + "loss": 0.8991, + "step": 3802 + }, + { + "epoch": 0.20931256535857778, + "grad_norm": 0.8533509969711304, + "learning_rate": 9.744561849727459e-06, + "loss": 0.8883, + "step": 3803 + }, + { + "epoch": 0.20936760416093345, + "grad_norm": 0.8363122940063477, + "learning_rate": 9.744425055963739e-06, + "loss": 0.8537, + "step": 3804 + }, + { + "epoch": 0.2094226429632891, + "grad_norm": 0.7462213039398193, + "learning_rate": 9.744288226542189e-06, + "loss": 0.7713, + "step": 3805 + }, + { + "epoch": 0.20947768176564477, + "grad_norm": 0.8148539066314697, + "learning_rate": 9.744151361463841e-06, + "loss": 0.7887, + "step": 3806 + }, + { + "epoch": 0.20953272056800043, + "grad_norm": 0.7504319548606873, + "learning_rate": 9.744014460729718e-06, + "loss": 0.7385, + "step": 3807 + }, + { + "epoch": 0.2095877593703561, + "grad_norm": 0.9291114807128906, + "learning_rate": 9.743877524340854e-06, + "loss": 0.9886, + "step": 3808 + }, + { + "epoch": 0.20964279817271175, + "grad_norm": 0.7747925519943237, + "learning_rate": 9.743740552298276e-06, + "loss": 0.8772, + "step": 3809 + }, + { + "epoch": 0.20969783697506741, + "grad_norm": 0.7283097505569458, + "learning_rate": 9.743603544603016e-06, + "loss": 0.7403, + "step": 3810 + }, + { + "epoch": 0.20975287577742308, + "grad_norm": 0.8403457999229431, + "learning_rate": 9.743466501256098e-06, + "loss": 0.7998, + "step": 3811 + }, + { + "epoch": 0.20980791457977874, + "grad_norm": 0.8218665719032288, + "learning_rate": 9.743329422258557e-06, + "loss": 0.8019, + "step": 3812 + }, + { + "epoch": 0.2098629533821344, + "grad_norm": 0.6991317868232727, + "learning_rate": 9.743192307611423e-06, + "loss": 0.743, + "step": 3813 + }, + { + "epoch": 0.20991799218449006, + "grad_norm": 0.767295241355896, + "learning_rate": 9.743055157315725e-06, + "loss": 0.8003, + "step": 3814 + }, + { + "epoch": 0.20997303098684572, + "grad_norm": 0.9457303285598755, + "learning_rate": 9.742917971372492e-06, + "loss": 0.8448, + "step": 3815 + }, + { + "epoch": 0.21002806978920138, + "grad_norm": 0.7839058637619019, + "learning_rate": 9.742780749782758e-06, + "loss": 0.8828, + "step": 3816 + }, + { + "epoch": 0.21008310859155704, + "grad_norm": 0.7831344604492188, + "learning_rate": 9.742643492547553e-06, + "loss": 0.7714, + "step": 3817 + }, + { + "epoch": 0.2101381473939127, + "grad_norm": 0.7637175917625427, + "learning_rate": 9.74250619966791e-06, + "loss": 0.7508, + "step": 3818 + }, + { + "epoch": 0.21019318619626837, + "grad_norm": 0.8830221891403198, + "learning_rate": 9.74236887114486e-06, + "loss": 0.8508, + "step": 3819 + }, + { + "epoch": 0.21024822499862403, + "grad_norm": 0.7803365588188171, + "learning_rate": 9.742231506979434e-06, + "loss": 0.8094, + "step": 3820 + }, + { + "epoch": 0.2103032638009797, + "grad_norm": 0.7701493501663208, + "learning_rate": 9.742094107172666e-06, + "loss": 0.8851, + "step": 3821 + }, + { + "epoch": 0.21035830260333535, + "grad_norm": 0.6434544324874878, + "learning_rate": 9.741956671725588e-06, + "loss": 0.7015, + "step": 3822 + }, + { + "epoch": 0.210413341405691, + "grad_norm": 0.7294684052467346, + "learning_rate": 9.741819200639233e-06, + "loss": 0.7357, + "step": 3823 + }, + { + "epoch": 0.21046838020804667, + "grad_norm": 0.702367901802063, + "learning_rate": 9.741681693914635e-06, + "loss": 0.7518, + "step": 3824 + }, + { + "epoch": 0.21052341901040234, + "grad_norm": 0.7567246556282043, + "learning_rate": 9.741544151552826e-06, + "loss": 0.8259, + "step": 3825 + }, + { + "epoch": 0.210578457812758, + "grad_norm": 0.7478607892990112, + "learning_rate": 9.741406573554841e-06, + "loss": 0.81, + "step": 3826 + }, + { + "epoch": 0.21063349661511366, + "grad_norm": 0.7270129323005676, + "learning_rate": 9.741268959921712e-06, + "loss": 0.8201, + "step": 3827 + }, + { + "epoch": 0.21068853541746932, + "grad_norm": 0.8108176589012146, + "learning_rate": 9.741131310654475e-06, + "loss": 0.8425, + "step": 3828 + }, + { + "epoch": 0.21074357421982498, + "grad_norm": 0.7773691415786743, + "learning_rate": 9.740993625754165e-06, + "loss": 0.8372, + "step": 3829 + }, + { + "epoch": 0.21079861302218064, + "grad_norm": 0.8988421559333801, + "learning_rate": 9.740855905221816e-06, + "loss": 0.8285, + "step": 3830 + }, + { + "epoch": 0.2108536518245363, + "grad_norm": 0.7339534759521484, + "learning_rate": 9.740718149058462e-06, + "loss": 0.7567, + "step": 3831 + }, + { + "epoch": 0.21090869062689196, + "grad_norm": 0.8465108275413513, + "learning_rate": 9.740580357265141e-06, + "loss": 0.8747, + "step": 3832 + }, + { + "epoch": 0.21096372942924763, + "grad_norm": 0.7956714034080505, + "learning_rate": 9.740442529842885e-06, + "loss": 0.7665, + "step": 3833 + }, + { + "epoch": 0.2110187682316033, + "grad_norm": 0.96270751953125, + "learning_rate": 9.740304666792733e-06, + "loss": 0.8338, + "step": 3834 + }, + { + "epoch": 0.21107380703395895, + "grad_norm": 0.812329113483429, + "learning_rate": 9.74016676811572e-06, + "loss": 0.8407, + "step": 3835 + }, + { + "epoch": 0.2111288458363146, + "grad_norm": 0.7975192070007324, + "learning_rate": 9.740028833812882e-06, + "loss": 0.7836, + "step": 3836 + }, + { + "epoch": 0.21118388463867027, + "grad_norm": 0.826621949672699, + "learning_rate": 9.739890863885258e-06, + "loss": 0.732, + "step": 3837 + }, + { + "epoch": 0.21123892344102593, + "grad_norm": 0.9015662670135498, + "learning_rate": 9.73975285833388e-06, + "loss": 0.8837, + "step": 3838 + }, + { + "epoch": 0.2112939622433816, + "grad_norm": 0.7641518712043762, + "learning_rate": 9.73961481715979e-06, + "loss": 0.7334, + "step": 3839 + }, + { + "epoch": 0.21134900104573726, + "grad_norm": 0.8062206506729126, + "learning_rate": 9.739476740364023e-06, + "loss": 0.8381, + "step": 3840 + }, + { + "epoch": 0.21140403984809292, + "grad_norm": 0.7301875352859497, + "learning_rate": 9.739338627947619e-06, + "loss": 0.7389, + "step": 3841 + }, + { + "epoch": 0.21145907865044858, + "grad_norm": 0.8995181322097778, + "learning_rate": 9.739200479911612e-06, + "loss": 0.8111, + "step": 3842 + }, + { + "epoch": 0.21151411745280424, + "grad_norm": 0.7154433131217957, + "learning_rate": 9.739062296257045e-06, + "loss": 0.7501, + "step": 3843 + }, + { + "epoch": 0.2115691562551599, + "grad_norm": 0.8403087854385376, + "learning_rate": 9.738924076984954e-06, + "loss": 0.8212, + "step": 3844 + }, + { + "epoch": 0.21162419505751554, + "grad_norm": 0.7616639137268066, + "learning_rate": 9.738785822096377e-06, + "loss": 0.82, + "step": 3845 + }, + { + "epoch": 0.2116792338598712, + "grad_norm": 0.7897970080375671, + "learning_rate": 9.738647531592356e-06, + "loss": 0.7972, + "step": 3846 + }, + { + "epoch": 0.21173427266222686, + "grad_norm": 0.7909015417098999, + "learning_rate": 9.738509205473928e-06, + "loss": 0.7939, + "step": 3847 + }, + { + "epoch": 0.21178931146458252, + "grad_norm": 0.9553212523460388, + "learning_rate": 9.73837084374213e-06, + "loss": 0.8672, + "step": 3848 + }, + { + "epoch": 0.21184435026693818, + "grad_norm": 0.9558283686637878, + "learning_rate": 9.73823244639801e-06, + "loss": 0.897, + "step": 3849 + }, + { + "epoch": 0.21189938906929384, + "grad_norm": 0.819530725479126, + "learning_rate": 9.7380940134426e-06, + "loss": 0.86, + "step": 3850 + }, + { + "epoch": 0.2119544278716495, + "grad_norm": 0.7301751971244812, + "learning_rate": 9.737955544876945e-06, + "loss": 0.8265, + "step": 3851 + }, + { + "epoch": 0.21200946667400516, + "grad_norm": 0.8564972281455994, + "learning_rate": 9.737817040702085e-06, + "loss": 0.8253, + "step": 3852 + }, + { + "epoch": 0.21206450547636083, + "grad_norm": 0.7715204358100891, + "learning_rate": 9.737678500919059e-06, + "loss": 0.7779, + "step": 3853 + }, + { + "epoch": 0.2121195442787165, + "grad_norm": 0.7296929955482483, + "learning_rate": 9.73753992552891e-06, + "loss": 0.787, + "step": 3854 + }, + { + "epoch": 0.21217458308107215, + "grad_norm": 0.8574217557907104, + "learning_rate": 9.73740131453268e-06, + "loss": 0.797, + "step": 3855 + }, + { + "epoch": 0.2122296218834278, + "grad_norm": 0.8320643901824951, + "learning_rate": 9.737262667931409e-06, + "loss": 0.876, + "step": 3856 + }, + { + "epoch": 0.21228466068578347, + "grad_norm": 0.7313587069511414, + "learning_rate": 9.73712398572614e-06, + "loss": 0.7151, + "step": 3857 + }, + { + "epoch": 0.21233969948813913, + "grad_norm": 0.7039312720298767, + "learning_rate": 9.736985267917916e-06, + "loss": 0.7353, + "step": 3858 + }, + { + "epoch": 0.2123947382904948, + "grad_norm": 0.7893409132957458, + "learning_rate": 9.736846514507776e-06, + "loss": 0.8383, + "step": 3859 + }, + { + "epoch": 0.21244977709285046, + "grad_norm": 0.8771371245384216, + "learning_rate": 9.736707725496767e-06, + "loss": 0.7543, + "step": 3860 + }, + { + "epoch": 0.21250481589520612, + "grad_norm": 1.0067707300186157, + "learning_rate": 9.736568900885932e-06, + "loss": 0.796, + "step": 3861 + }, + { + "epoch": 0.21255985469756178, + "grad_norm": 0.9171931743621826, + "learning_rate": 9.736430040676312e-06, + "loss": 0.8174, + "step": 3862 + }, + { + "epoch": 0.21261489349991744, + "grad_norm": 0.7616068720817566, + "learning_rate": 9.736291144868952e-06, + "loss": 0.7762, + "step": 3863 + }, + { + "epoch": 0.2126699323022731, + "grad_norm": 0.789010226726532, + "learning_rate": 9.736152213464895e-06, + "loss": 0.7749, + "step": 3864 + }, + { + "epoch": 0.21272497110462876, + "grad_norm": 0.7943348288536072, + "learning_rate": 9.736013246465187e-06, + "loss": 0.6687, + "step": 3865 + }, + { + "epoch": 0.21278000990698442, + "grad_norm": 0.8351758718490601, + "learning_rate": 9.73587424387087e-06, + "loss": 0.9201, + "step": 3866 + }, + { + "epoch": 0.21283504870934009, + "grad_norm": 0.7710975408554077, + "learning_rate": 9.735735205682991e-06, + "loss": 0.8357, + "step": 3867 + }, + { + "epoch": 0.21289008751169575, + "grad_norm": 0.8955768942832947, + "learning_rate": 9.73559613190259e-06, + "loss": 0.8396, + "step": 3868 + }, + { + "epoch": 0.2129451263140514, + "grad_norm": 0.8664666414260864, + "learning_rate": 9.735457022530722e-06, + "loss": 0.8176, + "step": 3869 + }, + { + "epoch": 0.21300016511640707, + "grad_norm": 0.7955949902534485, + "learning_rate": 9.735317877568424e-06, + "loss": 0.8421, + "step": 3870 + }, + { + "epoch": 0.21305520391876273, + "grad_norm": 0.849866509437561, + "learning_rate": 9.735178697016742e-06, + "loss": 0.7677, + "step": 3871 + }, + { + "epoch": 0.2131102427211184, + "grad_norm": 0.7782625555992126, + "learning_rate": 9.735039480876727e-06, + "loss": 0.7838, + "step": 3872 + }, + { + "epoch": 0.21316528152347405, + "grad_norm": 0.7734919190406799, + "learning_rate": 9.734900229149423e-06, + "loss": 0.757, + "step": 3873 + }, + { + "epoch": 0.21322032032582972, + "grad_norm": 0.8462040424346924, + "learning_rate": 9.734760941835876e-06, + "loss": 0.8841, + "step": 3874 + }, + { + "epoch": 0.21327535912818538, + "grad_norm": 0.7219869494438171, + "learning_rate": 9.734621618937133e-06, + "loss": 0.7651, + "step": 3875 + }, + { + "epoch": 0.21333039793054104, + "grad_norm": 0.7550874352455139, + "learning_rate": 9.734482260454241e-06, + "loss": 0.8032, + "step": 3876 + }, + { + "epoch": 0.2133854367328967, + "grad_norm": 0.7504588961601257, + "learning_rate": 9.734342866388247e-06, + "loss": 0.7923, + "step": 3877 + }, + { + "epoch": 0.21344047553525236, + "grad_norm": 0.7407390475273132, + "learning_rate": 9.7342034367402e-06, + "loss": 0.7569, + "step": 3878 + }, + { + "epoch": 0.21349551433760802, + "grad_norm": 0.7911562323570251, + "learning_rate": 9.734063971511147e-06, + "loss": 0.8726, + "step": 3879 + }, + { + "epoch": 0.21355055313996368, + "grad_norm": 0.9132450819015503, + "learning_rate": 9.733924470702139e-06, + "loss": 0.9445, + "step": 3880 + }, + { + "epoch": 0.21360559194231934, + "grad_norm": 0.9639442563056946, + "learning_rate": 9.733784934314218e-06, + "loss": 0.7307, + "step": 3881 + }, + { + "epoch": 0.213660630744675, + "grad_norm": 0.7724352478981018, + "learning_rate": 9.73364536234844e-06, + "loss": 0.8337, + "step": 3882 + }, + { + "epoch": 0.21371566954703067, + "grad_norm": 0.9643296599388123, + "learning_rate": 9.733505754805848e-06, + "loss": 0.8277, + "step": 3883 + }, + { + "epoch": 0.21377070834938633, + "grad_norm": 0.8135218620300293, + "learning_rate": 9.733366111687494e-06, + "loss": 0.7933, + "step": 3884 + }, + { + "epoch": 0.213825747151742, + "grad_norm": 0.7527105212211609, + "learning_rate": 9.733226432994426e-06, + "loss": 0.7302, + "step": 3885 + }, + { + "epoch": 0.21388078595409765, + "grad_norm": 1.090550184249878, + "learning_rate": 9.733086718727698e-06, + "loss": 0.8646, + "step": 3886 + }, + { + "epoch": 0.2139358247564533, + "grad_norm": 0.9227491617202759, + "learning_rate": 9.732946968888358e-06, + "loss": 0.8525, + "step": 3887 + }, + { + "epoch": 0.21399086355880895, + "grad_norm": 0.7781830430030823, + "learning_rate": 9.732807183477454e-06, + "loss": 0.8757, + "step": 3888 + }, + { + "epoch": 0.2140459023611646, + "grad_norm": 0.7740090489387512, + "learning_rate": 9.732667362496036e-06, + "loss": 0.7557, + "step": 3889 + }, + { + "epoch": 0.21410094116352027, + "grad_norm": 0.7341694831848145, + "learning_rate": 9.732527505945159e-06, + "loss": 0.7481, + "step": 3890 + }, + { + "epoch": 0.21415597996587593, + "grad_norm": 0.8691402673721313, + "learning_rate": 9.732387613825872e-06, + "loss": 0.8395, + "step": 3891 + }, + { + "epoch": 0.2142110187682316, + "grad_norm": 0.7845497131347656, + "learning_rate": 9.732247686139227e-06, + "loss": 0.6999, + "step": 3892 + }, + { + "epoch": 0.21426605757058725, + "grad_norm": 0.7944281697273254, + "learning_rate": 9.732107722886275e-06, + "loss": 0.7677, + "step": 3893 + }, + { + "epoch": 0.21432109637294292, + "grad_norm": 0.904195249080658, + "learning_rate": 9.731967724068065e-06, + "loss": 0.8429, + "step": 3894 + }, + { + "epoch": 0.21437613517529858, + "grad_norm": 0.7968988418579102, + "learning_rate": 9.731827689685655e-06, + "loss": 0.8224, + "step": 3895 + }, + { + "epoch": 0.21443117397765424, + "grad_norm": 0.773674726486206, + "learning_rate": 9.731687619740095e-06, + "loss": 0.7743, + "step": 3896 + }, + { + "epoch": 0.2144862127800099, + "grad_norm": 0.7873631715774536, + "learning_rate": 9.731547514232439e-06, + "loss": 0.8581, + "step": 3897 + }, + { + "epoch": 0.21454125158236556, + "grad_norm": 0.7989653944969177, + "learning_rate": 9.731407373163735e-06, + "loss": 0.8447, + "step": 3898 + }, + { + "epoch": 0.21459629038472122, + "grad_norm": 0.74820876121521, + "learning_rate": 9.73126719653504e-06, + "loss": 0.8745, + "step": 3899 + }, + { + "epoch": 0.21465132918707688, + "grad_norm": 0.7191246747970581, + "learning_rate": 9.731126984347408e-06, + "loss": 0.7533, + "step": 3900 + }, + { + "epoch": 0.21470636798943254, + "grad_norm": 0.7718465328216553, + "learning_rate": 9.730986736601893e-06, + "loss": 0.8184, + "step": 3901 + }, + { + "epoch": 0.2147614067917882, + "grad_norm": 0.7055066823959351, + "learning_rate": 9.730846453299547e-06, + "loss": 0.7352, + "step": 3902 + }, + { + "epoch": 0.21481644559414387, + "grad_norm": 0.7500855326652527, + "learning_rate": 9.730706134441425e-06, + "loss": 0.8111, + "step": 3903 + }, + { + "epoch": 0.21487148439649953, + "grad_norm": 0.7568232417106628, + "learning_rate": 9.730565780028583e-06, + "loss": 0.8126, + "step": 3904 + }, + { + "epoch": 0.2149265231988552, + "grad_norm": 0.7418738007545471, + "learning_rate": 9.730425390062075e-06, + "loss": 0.8014, + "step": 3905 + }, + { + "epoch": 0.21498156200121085, + "grad_norm": 0.7967441082000732, + "learning_rate": 9.730284964542955e-06, + "loss": 0.7965, + "step": 3906 + }, + { + "epoch": 0.2150366008035665, + "grad_norm": 0.7444791197776794, + "learning_rate": 9.730144503472281e-06, + "loss": 0.7113, + "step": 3907 + }, + { + "epoch": 0.21509163960592217, + "grad_norm": 0.8372869491577148, + "learning_rate": 9.730004006851107e-06, + "loss": 0.838, + "step": 3908 + }, + { + "epoch": 0.21514667840827784, + "grad_norm": 0.7984300851821899, + "learning_rate": 9.729863474680488e-06, + "loss": 0.856, + "step": 3909 + }, + { + "epoch": 0.2152017172106335, + "grad_norm": 0.7508612871170044, + "learning_rate": 9.72972290696148e-06, + "loss": 0.7947, + "step": 3910 + }, + { + "epoch": 0.21525675601298916, + "grad_norm": 0.7559992074966431, + "learning_rate": 9.729582303695142e-06, + "loss": 0.785, + "step": 3911 + }, + { + "epoch": 0.21531179481534482, + "grad_norm": 0.7764164209365845, + "learning_rate": 9.729441664882531e-06, + "loss": 0.8297, + "step": 3912 + }, + { + "epoch": 0.21536683361770048, + "grad_norm": 0.8112726211547852, + "learning_rate": 9.7293009905247e-06, + "loss": 0.8073, + "step": 3913 + }, + { + "epoch": 0.21542187242005614, + "grad_norm": 0.9748952388763428, + "learning_rate": 9.729160280622709e-06, + "loss": 0.7584, + "step": 3914 + }, + { + "epoch": 0.2154769112224118, + "grad_norm": 0.789191484451294, + "learning_rate": 9.729019535177617e-06, + "loss": 0.7568, + "step": 3915 + }, + { + "epoch": 0.21553195002476747, + "grad_norm": 0.7300963401794434, + "learning_rate": 9.728878754190478e-06, + "loss": 0.8029, + "step": 3916 + }, + { + "epoch": 0.21558698882712313, + "grad_norm": 0.9201067686080933, + "learning_rate": 9.728737937662354e-06, + "loss": 0.8665, + "step": 3917 + }, + { + "epoch": 0.2156420276294788, + "grad_norm": 0.8820425271987915, + "learning_rate": 9.728597085594301e-06, + "loss": 0.8378, + "step": 3918 + }, + { + "epoch": 0.21569706643183445, + "grad_norm": 0.7762684226036072, + "learning_rate": 9.728456197987376e-06, + "loss": 0.8005, + "step": 3919 + }, + { + "epoch": 0.2157521052341901, + "grad_norm": 0.8429732918739319, + "learning_rate": 9.728315274842641e-06, + "loss": 0.8337, + "step": 3920 + }, + { + "epoch": 0.21580714403654577, + "grad_norm": 0.7820748090744019, + "learning_rate": 9.728174316161156e-06, + "loss": 0.8085, + "step": 3921 + }, + { + "epoch": 0.21586218283890143, + "grad_norm": 0.8748064637184143, + "learning_rate": 9.728033321943977e-06, + "loss": 0.7734, + "step": 3922 + }, + { + "epoch": 0.2159172216412571, + "grad_norm": 0.8878050446510315, + "learning_rate": 9.727892292192166e-06, + "loss": 0.9226, + "step": 3923 + }, + { + "epoch": 0.21597226044361276, + "grad_norm": 0.8156047463417053, + "learning_rate": 9.72775122690678e-06, + "loss": 0.8111, + "step": 3924 + }, + { + "epoch": 0.21602729924596842, + "grad_norm": 0.7392945885658264, + "learning_rate": 9.727610126088883e-06, + "loss": 0.758, + "step": 3925 + }, + { + "epoch": 0.21608233804832408, + "grad_norm": 0.7573148608207703, + "learning_rate": 9.727468989739532e-06, + "loss": 0.8142, + "step": 3926 + }, + { + "epoch": 0.21613737685067974, + "grad_norm": 0.831847608089447, + "learning_rate": 9.727327817859792e-06, + "loss": 0.7337, + "step": 3927 + }, + { + "epoch": 0.2161924156530354, + "grad_norm": 0.8012371063232422, + "learning_rate": 9.72718661045072e-06, + "loss": 0.8128, + "step": 3928 + }, + { + "epoch": 0.21624745445539106, + "grad_norm": 0.7985890507698059, + "learning_rate": 9.72704536751338e-06, + "loss": 0.8549, + "step": 3929 + }, + { + "epoch": 0.21630249325774673, + "grad_norm": 0.7194695472717285, + "learning_rate": 9.726904089048832e-06, + "loss": 0.775, + "step": 3930 + }, + { + "epoch": 0.21635753206010236, + "grad_norm": 0.8029330968856812, + "learning_rate": 9.726762775058138e-06, + "loss": 0.9167, + "step": 3931 + }, + { + "epoch": 0.21641257086245802, + "grad_norm": 0.7388954162597656, + "learning_rate": 9.72662142554236e-06, + "loss": 0.7295, + "step": 3932 + }, + { + "epoch": 0.21646760966481368, + "grad_norm": 0.798796534538269, + "learning_rate": 9.726480040502559e-06, + "loss": 0.8686, + "step": 3933 + }, + { + "epoch": 0.21652264846716934, + "grad_norm": 0.9977202415466309, + "learning_rate": 9.726338619939802e-06, + "loss": 0.8387, + "step": 3934 + }, + { + "epoch": 0.216577687269525, + "grad_norm": 0.8173295855522156, + "learning_rate": 9.726197163855148e-06, + "loss": 0.7773, + "step": 3935 + }, + { + "epoch": 0.21663272607188067, + "grad_norm": 0.6519538760185242, + "learning_rate": 9.72605567224966e-06, + "loss": 0.6319, + "step": 3936 + }, + { + "epoch": 0.21668776487423633, + "grad_norm": 0.8004894852638245, + "learning_rate": 9.725914145124404e-06, + "loss": 0.8281, + "step": 3937 + }, + { + "epoch": 0.216742803676592, + "grad_norm": 0.7327558398246765, + "learning_rate": 9.725772582480442e-06, + "loss": 0.7105, + "step": 3938 + }, + { + "epoch": 0.21679784247894765, + "grad_norm": 0.7624199986457825, + "learning_rate": 9.725630984318839e-06, + "loss": 0.7823, + "step": 3939 + }, + { + "epoch": 0.2168528812813033, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.725489350640658e-06, + "loss": 0.8147, + "step": 3940 + }, + { + "epoch": 0.21690792008365897, + "grad_norm": 0.6886566877365112, + "learning_rate": 9.725347681446964e-06, + "loss": 0.7263, + "step": 3941 + }, + { + "epoch": 0.21696295888601463, + "grad_norm": 0.882060170173645, + "learning_rate": 9.725205976738821e-06, + "loss": 0.8931, + "step": 3942 + }, + { + "epoch": 0.2170179976883703, + "grad_norm": 0.7946881055831909, + "learning_rate": 9.725064236517297e-06, + "loss": 0.8036, + "step": 3943 + }, + { + "epoch": 0.21707303649072596, + "grad_norm": 0.7062187194824219, + "learning_rate": 9.724922460783453e-06, + "loss": 0.6915, + "step": 3944 + }, + { + "epoch": 0.21712807529308162, + "grad_norm": 0.7978640794754028, + "learning_rate": 9.724780649538356e-06, + "loss": 0.8873, + "step": 3945 + }, + { + "epoch": 0.21718311409543728, + "grad_norm": 0.8828096389770508, + "learning_rate": 9.724638802783073e-06, + "loss": 0.7114, + "step": 3946 + }, + { + "epoch": 0.21723815289779294, + "grad_norm": 0.7301073670387268, + "learning_rate": 9.724496920518672e-06, + "loss": 0.8107, + "step": 3947 + }, + { + "epoch": 0.2172931917001486, + "grad_norm": 0.7944212555885315, + "learning_rate": 9.724355002746213e-06, + "loss": 0.8135, + "step": 3948 + }, + { + "epoch": 0.21734823050250426, + "grad_norm": 0.7988898754119873, + "learning_rate": 9.724213049466768e-06, + "loss": 0.7173, + "step": 3949 + }, + { + "epoch": 0.21740326930485993, + "grad_norm": 0.7734915018081665, + "learning_rate": 9.724071060681401e-06, + "loss": 0.8131, + "step": 3950 + }, + { + "epoch": 0.2174583081072156, + "grad_norm": 0.6856646537780762, + "learning_rate": 9.723929036391183e-06, + "loss": 0.6873, + "step": 3951 + }, + { + "epoch": 0.21751334690957125, + "grad_norm": 0.8652976751327515, + "learning_rate": 9.723786976597179e-06, + "loss": 0.7908, + "step": 3952 + }, + { + "epoch": 0.2175683857119269, + "grad_norm": 0.7325445413589478, + "learning_rate": 9.723644881300453e-06, + "loss": 0.7389, + "step": 3953 + }, + { + "epoch": 0.21762342451428257, + "grad_norm": 0.8596270084381104, + "learning_rate": 9.723502750502079e-06, + "loss": 0.7785, + "step": 3954 + }, + { + "epoch": 0.21767846331663823, + "grad_norm": 0.739248514175415, + "learning_rate": 9.723360584203123e-06, + "loss": 0.8125, + "step": 3955 + }, + { + "epoch": 0.2177335021189939, + "grad_norm": 0.815617561340332, + "learning_rate": 9.723218382404652e-06, + "loss": 0.8682, + "step": 3956 + }, + { + "epoch": 0.21778854092134955, + "grad_norm": 0.758756160736084, + "learning_rate": 9.723076145107738e-06, + "loss": 0.7717, + "step": 3957 + }, + { + "epoch": 0.21784357972370522, + "grad_norm": 0.9007643461227417, + "learning_rate": 9.722933872313445e-06, + "loss": 0.7901, + "step": 3958 + }, + { + "epoch": 0.21789861852606088, + "grad_norm": 0.781548023223877, + "learning_rate": 9.722791564022846e-06, + "loss": 0.8338, + "step": 3959 + }, + { + "epoch": 0.21795365732841654, + "grad_norm": 0.7730190753936768, + "learning_rate": 9.722649220237011e-06, + "loss": 0.8032, + "step": 3960 + }, + { + "epoch": 0.2180086961307722, + "grad_norm": 0.8737791776657104, + "learning_rate": 9.722506840957009e-06, + "loss": 0.8436, + "step": 3961 + }, + { + "epoch": 0.21806373493312786, + "grad_norm": 0.8151329159736633, + "learning_rate": 9.722364426183908e-06, + "loss": 0.8115, + "step": 3962 + }, + { + "epoch": 0.21811877373548352, + "grad_norm": 0.7852860689163208, + "learning_rate": 9.722221975918782e-06, + "loss": 0.7977, + "step": 3963 + }, + { + "epoch": 0.21817381253783918, + "grad_norm": 0.9064140319824219, + "learning_rate": 9.722079490162698e-06, + "loss": 0.8799, + "step": 3964 + }, + { + "epoch": 0.21822885134019485, + "grad_norm": 0.8579906821250916, + "learning_rate": 9.72193696891673e-06, + "loss": 0.7825, + "step": 3965 + }, + { + "epoch": 0.2182838901425505, + "grad_norm": 0.8005900382995605, + "learning_rate": 9.721794412181946e-06, + "loss": 0.8601, + "step": 3966 + }, + { + "epoch": 0.21833892894490617, + "grad_norm": 0.7661529183387756, + "learning_rate": 9.721651819959421e-06, + "loss": 0.7446, + "step": 3967 + }, + { + "epoch": 0.21839396774726183, + "grad_norm": 0.7558436989784241, + "learning_rate": 9.721509192250224e-06, + "loss": 0.7484, + "step": 3968 + }, + { + "epoch": 0.2184490065496175, + "grad_norm": 0.765446126461029, + "learning_rate": 9.721366529055427e-06, + "loss": 0.7727, + "step": 3969 + }, + { + "epoch": 0.21850404535197315, + "grad_norm": 0.7329973578453064, + "learning_rate": 9.721223830376103e-06, + "loss": 0.797, + "step": 3970 + }, + { + "epoch": 0.21855908415432881, + "grad_norm": 0.8881974220275879, + "learning_rate": 9.721081096213324e-06, + "loss": 0.9199, + "step": 3971 + }, + { + "epoch": 0.21861412295668448, + "grad_norm": 0.8246786594390869, + "learning_rate": 9.720938326568165e-06, + "loss": 0.9108, + "step": 3972 + }, + { + "epoch": 0.21866916175904014, + "grad_norm": 0.7187291979789734, + "learning_rate": 9.720795521441697e-06, + "loss": 0.7756, + "step": 3973 + }, + { + "epoch": 0.21872420056139577, + "grad_norm": 0.7880695462226868, + "learning_rate": 9.720652680834995e-06, + "loss": 0.8548, + "step": 3974 + }, + { + "epoch": 0.21877923936375143, + "grad_norm": 0.8841108679771423, + "learning_rate": 9.720509804749128e-06, + "loss": 0.8477, + "step": 3975 + }, + { + "epoch": 0.2188342781661071, + "grad_norm": 0.9061402678489685, + "learning_rate": 9.720366893185173e-06, + "loss": 0.8235, + "step": 3976 + }, + { + "epoch": 0.21888931696846275, + "grad_norm": 0.8342392444610596, + "learning_rate": 9.720223946144206e-06, + "loss": 0.7777, + "step": 3977 + }, + { + "epoch": 0.21894435577081842, + "grad_norm": 0.7933762073516846, + "learning_rate": 9.720080963627299e-06, + "loss": 0.7943, + "step": 3978 + }, + { + "epoch": 0.21899939457317408, + "grad_norm": 0.8358896374702454, + "learning_rate": 9.719937945635527e-06, + "loss": 0.8932, + "step": 3979 + }, + { + "epoch": 0.21905443337552974, + "grad_norm": 0.7479808926582336, + "learning_rate": 9.719794892169964e-06, + "loss": 0.7446, + "step": 3980 + }, + { + "epoch": 0.2191094721778854, + "grad_norm": 0.7920958399772644, + "learning_rate": 9.719651803231685e-06, + "loss": 0.7489, + "step": 3981 + }, + { + "epoch": 0.21916451098024106, + "grad_norm": 0.7098824977874756, + "learning_rate": 9.719508678821768e-06, + "loss": 0.7763, + "step": 3982 + }, + { + "epoch": 0.21921954978259672, + "grad_norm": 0.8733491897583008, + "learning_rate": 9.719365518941288e-06, + "loss": 0.7325, + "step": 3983 + }, + { + "epoch": 0.21927458858495238, + "grad_norm": 0.8328796029090881, + "learning_rate": 9.719222323591318e-06, + "loss": 0.9097, + "step": 3984 + }, + { + "epoch": 0.21932962738730805, + "grad_norm": 0.7869352698326111, + "learning_rate": 9.719079092772936e-06, + "loss": 0.759, + "step": 3985 + }, + { + "epoch": 0.2193846661896637, + "grad_norm": 0.8278539180755615, + "learning_rate": 9.718935826487221e-06, + "loss": 0.8545, + "step": 3986 + }, + { + "epoch": 0.21943970499201937, + "grad_norm": 0.8122449517250061, + "learning_rate": 9.718792524735246e-06, + "loss": 0.7646, + "step": 3987 + }, + { + "epoch": 0.21949474379437503, + "grad_norm": 1.072253942489624, + "learning_rate": 9.71864918751809e-06, + "loss": 0.915, + "step": 3988 + }, + { + "epoch": 0.2195497825967307, + "grad_norm": 0.7770013213157654, + "learning_rate": 9.718505814836829e-06, + "loss": 0.7561, + "step": 3989 + }, + { + "epoch": 0.21960482139908635, + "grad_norm": 0.9011678695678711, + "learning_rate": 9.718362406692544e-06, + "loss": 0.7532, + "step": 3990 + }, + { + "epoch": 0.21965986020144201, + "grad_norm": 0.8867584466934204, + "learning_rate": 9.718218963086307e-06, + "loss": 0.8732, + "step": 3991 + }, + { + "epoch": 0.21971489900379768, + "grad_norm": 0.8884773850440979, + "learning_rate": 9.718075484019201e-06, + "loss": 0.7403, + "step": 3992 + }, + { + "epoch": 0.21976993780615334, + "grad_norm": 0.8995673060417175, + "learning_rate": 9.7179319694923e-06, + "loss": 0.9283, + "step": 3993 + }, + { + "epoch": 0.219824976608509, + "grad_norm": 0.7875818014144897, + "learning_rate": 9.717788419506688e-06, + "loss": 0.8633, + "step": 3994 + }, + { + "epoch": 0.21988001541086466, + "grad_norm": 0.7693219184875488, + "learning_rate": 9.71764483406344e-06, + "loss": 0.8073, + "step": 3995 + }, + { + "epoch": 0.21993505421322032, + "grad_norm": 0.7932817339897156, + "learning_rate": 9.717501213163636e-06, + "loss": 0.7537, + "step": 3996 + }, + { + "epoch": 0.21999009301557598, + "grad_norm": 0.8274912238121033, + "learning_rate": 9.717357556808358e-06, + "loss": 0.7715, + "step": 3997 + }, + { + "epoch": 0.22004513181793164, + "grad_norm": 0.7533993124961853, + "learning_rate": 9.71721386499868e-06, + "loss": 0.7482, + "step": 3998 + }, + { + "epoch": 0.2201001706202873, + "grad_norm": 1.028228759765625, + "learning_rate": 9.717070137735687e-06, + "loss": 0.9897, + "step": 3999 + }, + { + "epoch": 0.22015520942264297, + "grad_norm": 1.1093978881835938, + "learning_rate": 9.716926375020457e-06, + "loss": 0.8701, + "step": 4000 + }, + { + "epoch": 0.22021024822499863, + "grad_norm": 0.7891124486923218, + "learning_rate": 9.716782576854073e-06, + "loss": 0.8533, + "step": 4001 + }, + { + "epoch": 0.2202652870273543, + "grad_norm": 1.1783788204193115, + "learning_rate": 9.716638743237611e-06, + "loss": 0.8088, + "step": 4002 + }, + { + "epoch": 0.22032032582970995, + "grad_norm": 0.8713383078575134, + "learning_rate": 9.716494874172157e-06, + "loss": 0.8382, + "step": 4003 + }, + { + "epoch": 0.2203753646320656, + "grad_norm": 0.7821565270423889, + "learning_rate": 9.716350969658787e-06, + "loss": 0.8168, + "step": 4004 + }, + { + "epoch": 0.22043040343442127, + "grad_norm": 0.7642589211463928, + "learning_rate": 9.716207029698589e-06, + "loss": 0.7209, + "step": 4005 + }, + { + "epoch": 0.22048544223677694, + "grad_norm": 0.935625433921814, + "learning_rate": 9.716063054292639e-06, + "loss": 0.8436, + "step": 4006 + }, + { + "epoch": 0.2205404810391326, + "grad_norm": 0.7064627408981323, + "learning_rate": 9.715919043442024e-06, + "loss": 0.7651, + "step": 4007 + }, + { + "epoch": 0.22059551984148826, + "grad_norm": 0.6980876326560974, + "learning_rate": 9.715774997147823e-06, + "loss": 0.7842, + "step": 4008 + }, + { + "epoch": 0.22065055864384392, + "grad_norm": 0.7691119313240051, + "learning_rate": 9.715630915411118e-06, + "loss": 0.7345, + "step": 4009 + }, + { + "epoch": 0.22070559744619958, + "grad_norm": 0.8870186805725098, + "learning_rate": 9.715486798232994e-06, + "loss": 0.7531, + "step": 4010 + }, + { + "epoch": 0.22076063624855524, + "grad_norm": 0.7225383520126343, + "learning_rate": 9.715342645614533e-06, + "loss": 0.8543, + "step": 4011 + }, + { + "epoch": 0.2208156750509109, + "grad_norm": 0.7517428994178772, + "learning_rate": 9.71519845755682e-06, + "loss": 0.84, + "step": 4012 + }, + { + "epoch": 0.22087071385326656, + "grad_norm": 0.8115549087524414, + "learning_rate": 9.715054234060937e-06, + "loss": 0.7823, + "step": 4013 + }, + { + "epoch": 0.22092575265562223, + "grad_norm": 1.6656148433685303, + "learning_rate": 9.714909975127968e-06, + "loss": 0.8951, + "step": 4014 + }, + { + "epoch": 0.2209807914579779, + "grad_norm": 0.906508207321167, + "learning_rate": 9.714765680758997e-06, + "loss": 0.8599, + "step": 4015 + }, + { + "epoch": 0.22103583026033355, + "grad_norm": 0.8274093866348267, + "learning_rate": 9.71462135095511e-06, + "loss": 0.9568, + "step": 4016 + }, + { + "epoch": 0.22109086906268918, + "grad_norm": 0.7745386958122253, + "learning_rate": 9.714476985717393e-06, + "loss": 0.8641, + "step": 4017 + }, + { + "epoch": 0.22114590786504484, + "grad_norm": 0.8112689256668091, + "learning_rate": 9.714332585046928e-06, + "loss": 0.834, + "step": 4018 + }, + { + "epoch": 0.2212009466674005, + "grad_norm": 0.916847825050354, + "learning_rate": 9.714188148944799e-06, + "loss": 0.8546, + "step": 4019 + }, + { + "epoch": 0.22125598546975617, + "grad_norm": 0.8595414161682129, + "learning_rate": 9.714043677412096e-06, + "loss": 0.9388, + "step": 4020 + }, + { + "epoch": 0.22131102427211183, + "grad_norm": 0.8672438263893127, + "learning_rate": 9.713899170449901e-06, + "loss": 0.8151, + "step": 4021 + }, + { + "epoch": 0.2213660630744675, + "grad_norm": 0.699749767780304, + "learning_rate": 9.713754628059304e-06, + "loss": 0.7433, + "step": 4022 + }, + { + "epoch": 0.22142110187682315, + "grad_norm": 0.8071898818016052, + "learning_rate": 9.713610050241387e-06, + "loss": 0.7663, + "step": 4023 + }, + { + "epoch": 0.2214761406791788, + "grad_norm": 0.745030403137207, + "learning_rate": 9.713465436997239e-06, + "loss": 0.7733, + "step": 4024 + }, + { + "epoch": 0.22153117948153447, + "grad_norm": 0.8034930229187012, + "learning_rate": 9.713320788327947e-06, + "loss": 0.9015, + "step": 4025 + }, + { + "epoch": 0.22158621828389014, + "grad_norm": 0.8549708724021912, + "learning_rate": 9.713176104234597e-06, + "loss": 0.7127, + "step": 4026 + }, + { + "epoch": 0.2216412570862458, + "grad_norm": 0.8432256579399109, + "learning_rate": 9.713031384718277e-06, + "loss": 0.8163, + "step": 4027 + }, + { + "epoch": 0.22169629588860146, + "grad_norm": 0.7623703479766846, + "learning_rate": 9.712886629780075e-06, + "loss": 0.8272, + "step": 4028 + }, + { + "epoch": 0.22175133469095712, + "grad_norm": 0.8425806760787964, + "learning_rate": 9.712741839421079e-06, + "loss": 0.7907, + "step": 4029 + }, + { + "epoch": 0.22180637349331278, + "grad_norm": 0.7477750778198242, + "learning_rate": 9.712597013642376e-06, + "loss": 0.7662, + "step": 4030 + }, + { + "epoch": 0.22186141229566844, + "grad_norm": 0.7761805057525635, + "learning_rate": 9.712452152445056e-06, + "loss": 0.7999, + "step": 4031 + }, + { + "epoch": 0.2219164510980241, + "grad_norm": 0.8604531288146973, + "learning_rate": 9.712307255830207e-06, + "loss": 0.812, + "step": 4032 + }, + { + "epoch": 0.22197148990037976, + "grad_norm": 0.8113332986831665, + "learning_rate": 9.712162323798918e-06, + "loss": 0.8092, + "step": 4033 + }, + { + "epoch": 0.22202652870273543, + "grad_norm": 0.7980128526687622, + "learning_rate": 9.71201735635228e-06, + "loss": 0.6934, + "step": 4034 + }, + { + "epoch": 0.2220815675050911, + "grad_norm": 0.7819470763206482, + "learning_rate": 9.711872353491377e-06, + "loss": 0.8531, + "step": 4035 + }, + { + "epoch": 0.22213660630744675, + "grad_norm": 0.8283445835113525, + "learning_rate": 9.711727315217305e-06, + "loss": 0.8594, + "step": 4036 + }, + { + "epoch": 0.2221916451098024, + "grad_norm": 0.7282612919807434, + "learning_rate": 9.711582241531153e-06, + "loss": 0.7374, + "step": 4037 + }, + { + "epoch": 0.22224668391215807, + "grad_norm": 0.9564353823661804, + "learning_rate": 9.711437132434007e-06, + "loss": 0.7996, + "step": 4038 + }, + { + "epoch": 0.22230172271451373, + "grad_norm": 0.8559701442718506, + "learning_rate": 9.711291987926963e-06, + "loss": 0.949, + "step": 4039 + }, + { + "epoch": 0.2223567615168694, + "grad_norm": 0.7515334486961365, + "learning_rate": 9.71114680801111e-06, + "loss": 0.7188, + "step": 4040 + }, + { + "epoch": 0.22241180031922506, + "grad_norm": 0.7685608863830566, + "learning_rate": 9.711001592687537e-06, + "loss": 0.7679, + "step": 4041 + }, + { + "epoch": 0.22246683912158072, + "grad_norm": 0.6848913431167603, + "learning_rate": 9.710856341957337e-06, + "loss": 0.7666, + "step": 4042 + }, + { + "epoch": 0.22252187792393638, + "grad_norm": 0.7270542979240417, + "learning_rate": 9.710711055821602e-06, + "loss": 0.7563, + "step": 4043 + }, + { + "epoch": 0.22257691672629204, + "grad_norm": 0.7965164184570312, + "learning_rate": 9.710565734281424e-06, + "loss": 0.7586, + "step": 4044 + }, + { + "epoch": 0.2226319555286477, + "grad_norm": 0.7872949242591858, + "learning_rate": 9.710420377337895e-06, + "loss": 0.8423, + "step": 4045 + }, + { + "epoch": 0.22268699433100336, + "grad_norm": 0.7466526627540588, + "learning_rate": 9.710274984992107e-06, + "loss": 0.7578, + "step": 4046 + }, + { + "epoch": 0.22274203313335902, + "grad_norm": 0.7208731770515442, + "learning_rate": 9.710129557245154e-06, + "loss": 0.7019, + "step": 4047 + }, + { + "epoch": 0.22279707193571469, + "grad_norm": 0.6953400373458862, + "learning_rate": 9.709984094098127e-06, + "loss": 0.7234, + "step": 4048 + }, + { + "epoch": 0.22285211073807035, + "grad_norm": 0.7866283059120178, + "learning_rate": 9.709838595552122e-06, + "loss": 0.785, + "step": 4049 + }, + { + "epoch": 0.222907149540426, + "grad_norm": 0.7404114007949829, + "learning_rate": 9.709693061608227e-06, + "loss": 0.7706, + "step": 4050 + }, + { + "epoch": 0.22296218834278167, + "grad_norm": 0.8788254857063293, + "learning_rate": 9.709547492267544e-06, + "loss": 0.8392, + "step": 4051 + }, + { + "epoch": 0.22301722714513733, + "grad_norm": 0.7493161559104919, + "learning_rate": 9.70940188753116e-06, + "loss": 0.8346, + "step": 4052 + }, + { + "epoch": 0.223072265947493, + "grad_norm": 0.7340379357337952, + "learning_rate": 9.709256247400174e-06, + "loss": 0.7715, + "step": 4053 + }, + { + "epoch": 0.22312730474984865, + "grad_norm": 0.7291178107261658, + "learning_rate": 9.709110571875677e-06, + "loss": 0.866, + "step": 4054 + }, + { + "epoch": 0.22318234355220432, + "grad_norm": 0.8046013712882996, + "learning_rate": 9.708964860958765e-06, + "loss": 0.7885, + "step": 4055 + }, + { + "epoch": 0.22323738235455998, + "grad_norm": 0.832941472530365, + "learning_rate": 9.708819114650535e-06, + "loss": 0.873, + "step": 4056 + }, + { + "epoch": 0.22329242115691564, + "grad_norm": 0.6933377981185913, + "learning_rate": 9.70867333295208e-06, + "loss": 0.7944, + "step": 4057 + }, + { + "epoch": 0.2233474599592713, + "grad_norm": 0.7976044416427612, + "learning_rate": 9.708527515864499e-06, + "loss": 0.72, + "step": 4058 + }, + { + "epoch": 0.22340249876162696, + "grad_norm": 0.7698904871940613, + "learning_rate": 9.708381663388884e-06, + "loss": 0.7603, + "step": 4059 + }, + { + "epoch": 0.2234575375639826, + "grad_norm": 0.7554401159286499, + "learning_rate": 9.708235775526331e-06, + "loss": 0.7488, + "step": 4060 + }, + { + "epoch": 0.22351257636633826, + "grad_norm": 0.7382954359054565, + "learning_rate": 9.70808985227794e-06, + "loss": 0.7418, + "step": 4061 + }, + { + "epoch": 0.22356761516869392, + "grad_norm": 0.7220499515533447, + "learning_rate": 9.707943893644806e-06, + "loss": 0.7691, + "step": 4062 + }, + { + "epoch": 0.22362265397104958, + "grad_norm": 0.727542519569397, + "learning_rate": 9.707797899628027e-06, + "loss": 0.7603, + "step": 4063 + }, + { + "epoch": 0.22367769277340524, + "grad_norm": 0.7857500910758972, + "learning_rate": 9.707651870228697e-06, + "loss": 0.8633, + "step": 4064 + }, + { + "epoch": 0.2237327315757609, + "grad_norm": 0.7975600361824036, + "learning_rate": 9.707505805447917e-06, + "loss": 0.8591, + "step": 4065 + }, + { + "epoch": 0.22378777037811656, + "grad_norm": 1.0063475370407104, + "learning_rate": 9.707359705286784e-06, + "loss": 0.7935, + "step": 4066 + }, + { + "epoch": 0.22384280918047222, + "grad_norm": 0.7307062745094299, + "learning_rate": 9.707213569746393e-06, + "loss": 0.797, + "step": 4067 + }, + { + "epoch": 0.22389784798282789, + "grad_norm": 0.7891914248466492, + "learning_rate": 9.707067398827847e-06, + "loss": 0.853, + "step": 4068 + }, + { + "epoch": 0.22395288678518355, + "grad_norm": 0.7479422092437744, + "learning_rate": 9.706921192532242e-06, + "loss": 0.7359, + "step": 4069 + }, + { + "epoch": 0.2240079255875392, + "grad_norm": 0.8436065912246704, + "learning_rate": 9.706774950860676e-06, + "loss": 0.7916, + "step": 4070 + }, + { + "epoch": 0.22406296438989487, + "grad_norm": 0.7586960196495056, + "learning_rate": 9.706628673814252e-06, + "loss": 0.7871, + "step": 4071 + }, + { + "epoch": 0.22411800319225053, + "grad_norm": 0.8181111812591553, + "learning_rate": 9.706482361394064e-06, + "loss": 0.7782, + "step": 4072 + }, + { + "epoch": 0.2241730419946062, + "grad_norm": 0.7205253839492798, + "learning_rate": 9.706336013601217e-06, + "loss": 0.7912, + "step": 4073 + }, + { + "epoch": 0.22422808079696185, + "grad_norm": 0.9823397397994995, + "learning_rate": 9.706189630436806e-06, + "loss": 0.8393, + "step": 4074 + }, + { + "epoch": 0.22428311959931752, + "grad_norm": 0.7360854148864746, + "learning_rate": 9.706043211901935e-06, + "loss": 0.8239, + "step": 4075 + }, + { + "epoch": 0.22433815840167318, + "grad_norm": 0.7590144872665405, + "learning_rate": 9.705896757997701e-06, + "loss": 0.7177, + "step": 4076 + }, + { + "epoch": 0.22439319720402884, + "grad_norm": 0.7691343426704407, + "learning_rate": 9.70575026872521e-06, + "loss": 0.7731, + "step": 4077 + }, + { + "epoch": 0.2244482360063845, + "grad_norm": 0.7057286500930786, + "learning_rate": 9.705603744085556e-06, + "loss": 0.7746, + "step": 4078 + }, + { + "epoch": 0.22450327480874016, + "grad_norm": 0.7954769134521484, + "learning_rate": 9.705457184079847e-06, + "loss": 0.8215, + "step": 4079 + }, + { + "epoch": 0.22455831361109582, + "grad_norm": 0.7089072465896606, + "learning_rate": 9.70531058870918e-06, + "loss": 0.7263, + "step": 4080 + }, + { + "epoch": 0.22461335241345148, + "grad_norm": 0.9847552180290222, + "learning_rate": 9.705163957974657e-06, + "loss": 0.8948, + "step": 4081 + }, + { + "epoch": 0.22466839121580715, + "grad_norm": 0.7977012395858765, + "learning_rate": 9.705017291877383e-06, + "loss": 0.7518, + "step": 4082 + }, + { + "epoch": 0.2247234300181628, + "grad_norm": 0.8084518909454346, + "learning_rate": 9.704870590418458e-06, + "loss": 0.8711, + "step": 4083 + }, + { + "epoch": 0.22477846882051847, + "grad_norm": 0.9151536822319031, + "learning_rate": 9.704723853598986e-06, + "loss": 0.8217, + "step": 4084 + }, + { + "epoch": 0.22483350762287413, + "grad_norm": 0.908136248588562, + "learning_rate": 9.704577081420065e-06, + "loss": 0.6961, + "step": 4085 + }, + { + "epoch": 0.2248885464252298, + "grad_norm": 0.8569996953010559, + "learning_rate": 9.704430273882806e-06, + "loss": 0.8405, + "step": 4086 + }, + { + "epoch": 0.22494358522758545, + "grad_norm": 0.7687774300575256, + "learning_rate": 9.704283430988307e-06, + "loss": 0.6903, + "step": 4087 + }, + { + "epoch": 0.2249986240299411, + "grad_norm": 0.863203763961792, + "learning_rate": 9.704136552737673e-06, + "loss": 0.8927, + "step": 4088 + }, + { + "epoch": 0.22505366283229677, + "grad_norm": 1.252581238746643, + "learning_rate": 9.703989639132008e-06, + "loss": 0.8792, + "step": 4089 + }, + { + "epoch": 0.22510870163465244, + "grad_norm": 0.7844160795211792, + "learning_rate": 9.703842690172415e-06, + "loss": 0.844, + "step": 4090 + }, + { + "epoch": 0.2251637404370081, + "grad_norm": 0.8669766187667847, + "learning_rate": 9.703695705860002e-06, + "loss": 0.7008, + "step": 4091 + }, + { + "epoch": 0.22521877923936376, + "grad_norm": 0.7180137634277344, + "learning_rate": 9.703548686195869e-06, + "loss": 0.8242, + "step": 4092 + }, + { + "epoch": 0.22527381804171942, + "grad_norm": 0.7225000858306885, + "learning_rate": 9.703401631181124e-06, + "loss": 0.724, + "step": 4093 + }, + { + "epoch": 0.22532885684407508, + "grad_norm": 0.8348065614700317, + "learning_rate": 9.70325454081687e-06, + "loss": 0.7996, + "step": 4094 + }, + { + "epoch": 0.22538389564643074, + "grad_norm": 0.8099488019943237, + "learning_rate": 9.703107415104216e-06, + "loss": 0.7498, + "step": 4095 + }, + { + "epoch": 0.2254389344487864, + "grad_norm": 0.7051188945770264, + "learning_rate": 9.702960254044264e-06, + "loss": 0.7322, + "step": 4096 + }, + { + "epoch": 0.22549397325114207, + "grad_norm": 0.742859423160553, + "learning_rate": 9.702813057638122e-06, + "loss": 0.746, + "step": 4097 + }, + { + "epoch": 0.22554901205349773, + "grad_norm": 0.7981536984443665, + "learning_rate": 9.702665825886897e-06, + "loss": 0.8705, + "step": 4098 + }, + { + "epoch": 0.2256040508558534, + "grad_norm": 1.0317178964614868, + "learning_rate": 9.702518558791693e-06, + "loss": 0.8261, + "step": 4099 + }, + { + "epoch": 0.22565908965820905, + "grad_norm": 0.7811983823776245, + "learning_rate": 9.702371256353618e-06, + "loss": 0.7633, + "step": 4100 + }, + { + "epoch": 0.2257141284605647, + "grad_norm": 0.8288078308105469, + "learning_rate": 9.702223918573782e-06, + "loss": 0.7974, + "step": 4101 + }, + { + "epoch": 0.22576916726292034, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.702076545453286e-06, + "loss": 0.7517, + "step": 4102 + }, + { + "epoch": 0.225824206065276, + "grad_norm": 0.8342248201370239, + "learning_rate": 9.701929136993243e-06, + "loss": 0.8634, + "step": 4103 + }, + { + "epoch": 0.22587924486763167, + "grad_norm": 0.790392279624939, + "learning_rate": 9.701781693194761e-06, + "loss": 0.7705, + "step": 4104 + }, + { + "epoch": 0.22593428366998733, + "grad_norm": 0.824691891670227, + "learning_rate": 9.701634214058944e-06, + "loss": 0.877, + "step": 4105 + }, + { + "epoch": 0.225989322472343, + "grad_norm": 0.9237051010131836, + "learning_rate": 9.701486699586904e-06, + "loss": 0.842, + "step": 4106 + }, + { + "epoch": 0.22604436127469865, + "grad_norm": 0.7453535199165344, + "learning_rate": 9.701339149779747e-06, + "loss": 0.8217, + "step": 4107 + }, + { + "epoch": 0.2260994000770543, + "grad_norm": 0.727872371673584, + "learning_rate": 9.701191564638586e-06, + "loss": 0.849, + "step": 4108 + }, + { + "epoch": 0.22615443887940997, + "grad_norm": 0.966585636138916, + "learning_rate": 9.701043944164526e-06, + "loss": 0.7742, + "step": 4109 + }, + { + "epoch": 0.22620947768176564, + "grad_norm": 0.7556117177009583, + "learning_rate": 9.700896288358678e-06, + "loss": 0.7498, + "step": 4110 + }, + { + "epoch": 0.2262645164841213, + "grad_norm": 0.848143458366394, + "learning_rate": 9.700748597222151e-06, + "loss": 0.7237, + "step": 4111 + }, + { + "epoch": 0.22631955528647696, + "grad_norm": 0.9046787619590759, + "learning_rate": 9.700600870756056e-06, + "loss": 0.8066, + "step": 4112 + }, + { + "epoch": 0.22637459408883262, + "grad_norm": 0.923159658908844, + "learning_rate": 9.700453108961505e-06, + "loss": 0.8404, + "step": 4113 + }, + { + "epoch": 0.22642963289118828, + "grad_norm": 0.8697664737701416, + "learning_rate": 9.700305311839606e-06, + "loss": 0.7269, + "step": 4114 + }, + { + "epoch": 0.22648467169354394, + "grad_norm": 0.8179994821548462, + "learning_rate": 9.70015747939147e-06, + "loss": 0.8083, + "step": 4115 + }, + { + "epoch": 0.2265397104958996, + "grad_norm": 0.7961694002151489, + "learning_rate": 9.700009611618208e-06, + "loss": 0.7327, + "step": 4116 + }, + { + "epoch": 0.22659474929825527, + "grad_norm": 0.7317802309989929, + "learning_rate": 9.699861708520934e-06, + "loss": 0.8273, + "step": 4117 + }, + { + "epoch": 0.22664978810061093, + "grad_norm": 0.9190557599067688, + "learning_rate": 9.699713770100757e-06, + "loss": 0.8027, + "step": 4118 + }, + { + "epoch": 0.2267048269029666, + "grad_norm": 0.7618072628974915, + "learning_rate": 9.699565796358788e-06, + "loss": 0.7669, + "step": 4119 + }, + { + "epoch": 0.22675986570532225, + "grad_norm": 1.0236154794692993, + "learning_rate": 9.699417787296139e-06, + "loss": 0.7511, + "step": 4120 + }, + { + "epoch": 0.2268149045076779, + "grad_norm": 0.8011670708656311, + "learning_rate": 9.699269742913927e-06, + "loss": 0.7644, + "step": 4121 + }, + { + "epoch": 0.22686994331003357, + "grad_norm": 0.7808024287223816, + "learning_rate": 9.69912166321326e-06, + "loss": 0.7894, + "step": 4122 + }, + { + "epoch": 0.22692498211238923, + "grad_norm": 0.8645655512809753, + "learning_rate": 9.698973548195252e-06, + "loss": 0.7989, + "step": 4123 + }, + { + "epoch": 0.2269800209147449, + "grad_norm": 0.7478770613670349, + "learning_rate": 9.698825397861017e-06, + "loss": 0.7758, + "step": 4124 + }, + { + "epoch": 0.22703505971710056, + "grad_norm": 0.8988361954689026, + "learning_rate": 9.698677212211668e-06, + "loss": 0.8312, + "step": 4125 + }, + { + "epoch": 0.22709009851945622, + "grad_norm": 0.773028552532196, + "learning_rate": 9.69852899124832e-06, + "loss": 0.7415, + "step": 4126 + }, + { + "epoch": 0.22714513732181188, + "grad_norm": 0.8173778653144836, + "learning_rate": 9.698380734972085e-06, + "loss": 0.8241, + "step": 4127 + }, + { + "epoch": 0.22720017612416754, + "grad_norm": 0.7868672013282776, + "learning_rate": 9.698232443384078e-06, + "loss": 0.7294, + "step": 4128 + }, + { + "epoch": 0.2272552149265232, + "grad_norm": 0.8662189841270447, + "learning_rate": 9.698084116485413e-06, + "loss": 0.9307, + "step": 4129 + }, + { + "epoch": 0.22731025372887886, + "grad_norm": 0.7571321129798889, + "learning_rate": 9.697935754277207e-06, + "loss": 0.7756, + "step": 4130 + }, + { + "epoch": 0.22736529253123453, + "grad_norm": 0.8222649097442627, + "learning_rate": 9.697787356760574e-06, + "loss": 0.8689, + "step": 4131 + }, + { + "epoch": 0.2274203313335902, + "grad_norm": 0.8302241563796997, + "learning_rate": 9.697638923936626e-06, + "loss": 0.8139, + "step": 4132 + }, + { + "epoch": 0.22747537013594585, + "grad_norm": 0.779951274394989, + "learning_rate": 9.697490455806482e-06, + "loss": 0.7493, + "step": 4133 + }, + { + "epoch": 0.2275304089383015, + "grad_norm": 0.8409813046455383, + "learning_rate": 9.697341952371257e-06, + "loss": 0.777, + "step": 4134 + }, + { + "epoch": 0.22758544774065717, + "grad_norm": 0.8599729537963867, + "learning_rate": 9.697193413632068e-06, + "loss": 0.7678, + "step": 4135 + }, + { + "epoch": 0.22764048654301283, + "grad_norm": 0.7505115270614624, + "learning_rate": 9.69704483959003e-06, + "loss": 0.787, + "step": 4136 + }, + { + "epoch": 0.2276955253453685, + "grad_norm": 0.7326868176460266, + "learning_rate": 9.696896230246262e-06, + "loss": 0.7066, + "step": 4137 + }, + { + "epoch": 0.22775056414772415, + "grad_norm": 0.8269753456115723, + "learning_rate": 9.696747585601878e-06, + "loss": 0.7379, + "step": 4138 + }, + { + "epoch": 0.22780560295007982, + "grad_norm": 0.7841970324516296, + "learning_rate": 9.696598905657997e-06, + "loss": 0.764, + "step": 4139 + }, + { + "epoch": 0.22786064175243548, + "grad_norm": 0.7131417989730835, + "learning_rate": 9.696450190415735e-06, + "loss": 0.7629, + "step": 4140 + }, + { + "epoch": 0.22791568055479114, + "grad_norm": 0.7922703623771667, + "learning_rate": 9.69630143987621e-06, + "loss": 0.8354, + "step": 4141 + }, + { + "epoch": 0.2279707193571468, + "grad_norm": 0.9628629684448242, + "learning_rate": 9.696152654040543e-06, + "loss": 0.8077, + "step": 4142 + }, + { + "epoch": 0.22802575815950246, + "grad_norm": 0.8566663265228271, + "learning_rate": 9.696003832909847e-06, + "loss": 0.685, + "step": 4143 + }, + { + "epoch": 0.22808079696185812, + "grad_norm": 0.7181339859962463, + "learning_rate": 9.695854976485244e-06, + "loss": 0.8135, + "step": 4144 + }, + { + "epoch": 0.22813583576421376, + "grad_norm": 0.9119813442230225, + "learning_rate": 9.695706084767853e-06, + "loss": 0.7276, + "step": 4145 + }, + { + "epoch": 0.22819087456656942, + "grad_norm": 0.8547400832176208, + "learning_rate": 9.69555715775879e-06, + "loss": 0.8656, + "step": 4146 + }, + { + "epoch": 0.22824591336892508, + "grad_norm": 0.77585768699646, + "learning_rate": 9.695408195459179e-06, + "loss": 0.8218, + "step": 4147 + }, + { + "epoch": 0.22830095217128074, + "grad_norm": 0.7832447290420532, + "learning_rate": 9.695259197870135e-06, + "loss": 0.8002, + "step": 4148 + }, + { + "epoch": 0.2283559909736364, + "grad_norm": 0.9184865355491638, + "learning_rate": 9.69511016499278e-06, + "loss": 0.8651, + "step": 4149 + }, + { + "epoch": 0.22841102977599206, + "grad_norm": 0.8663797974586487, + "learning_rate": 9.694961096828235e-06, + "loss": 0.7381, + "step": 4150 + }, + { + "epoch": 0.22846606857834773, + "grad_norm": 0.843265950679779, + "learning_rate": 9.694811993377617e-06, + "loss": 0.8546, + "step": 4151 + }, + { + "epoch": 0.2285211073807034, + "grad_norm": 0.8021818399429321, + "learning_rate": 9.694662854642049e-06, + "loss": 0.9166, + "step": 4152 + }, + { + "epoch": 0.22857614618305905, + "grad_norm": 0.7762879729270935, + "learning_rate": 9.694513680622653e-06, + "loss": 0.7055, + "step": 4153 + }, + { + "epoch": 0.2286311849854147, + "grad_norm": 0.809352457523346, + "learning_rate": 9.694364471320548e-06, + "loss": 0.7988, + "step": 4154 + }, + { + "epoch": 0.22868622378777037, + "grad_norm": 0.7239902019500732, + "learning_rate": 9.694215226736858e-06, + "loss": 0.7783, + "step": 4155 + }, + { + "epoch": 0.22874126259012603, + "grad_norm": 0.7072625160217285, + "learning_rate": 9.694065946872702e-06, + "loss": 0.7607, + "step": 4156 + }, + { + "epoch": 0.2287963013924817, + "grad_norm": 0.7696169018745422, + "learning_rate": 9.693916631729201e-06, + "loss": 0.7519, + "step": 4157 + }, + { + "epoch": 0.22885134019483735, + "grad_norm": 0.9198557734489441, + "learning_rate": 9.69376728130748e-06, + "loss": 0.7754, + "step": 4158 + }, + { + "epoch": 0.22890637899719302, + "grad_norm": 0.7589097619056702, + "learning_rate": 9.693617895608662e-06, + "loss": 0.7258, + "step": 4159 + }, + { + "epoch": 0.22896141779954868, + "grad_norm": 0.8351333141326904, + "learning_rate": 9.693468474633867e-06, + "loss": 0.8633, + "step": 4160 + }, + { + "epoch": 0.22901645660190434, + "grad_norm": 0.8331828713417053, + "learning_rate": 9.69331901838422e-06, + "loss": 0.7361, + "step": 4161 + }, + { + "epoch": 0.22907149540426, + "grad_norm": 0.8810774087905884, + "learning_rate": 9.693169526860843e-06, + "loss": 0.7651, + "step": 4162 + }, + { + "epoch": 0.22912653420661566, + "grad_norm": 0.8151684999465942, + "learning_rate": 9.69302000006486e-06, + "loss": 0.8533, + "step": 4163 + }, + { + "epoch": 0.22918157300897132, + "grad_norm": 0.8683320879936218, + "learning_rate": 9.692870437997394e-06, + "loss": 0.8323, + "step": 4164 + }, + { + "epoch": 0.22923661181132698, + "grad_norm": 0.7488875389099121, + "learning_rate": 9.692720840659572e-06, + "loss": 0.8414, + "step": 4165 + }, + { + "epoch": 0.22929165061368265, + "grad_norm": 0.7916452288627625, + "learning_rate": 9.692571208052515e-06, + "loss": 0.7058, + "step": 4166 + }, + { + "epoch": 0.2293466894160383, + "grad_norm": 0.8228384256362915, + "learning_rate": 9.69242154017735e-06, + "loss": 0.7667, + "step": 4167 + }, + { + "epoch": 0.22940172821839397, + "grad_norm": 0.7395613789558411, + "learning_rate": 9.692271837035202e-06, + "loss": 0.7649, + "step": 4168 + }, + { + "epoch": 0.22945676702074963, + "grad_norm": 0.7187666893005371, + "learning_rate": 9.692122098627192e-06, + "loss": 0.7575, + "step": 4169 + }, + { + "epoch": 0.2295118058231053, + "grad_norm": 0.7060030102729797, + "learning_rate": 9.691972324954449e-06, + "loss": 0.8309, + "step": 4170 + }, + { + "epoch": 0.22956684462546095, + "grad_norm": 0.7807210087776184, + "learning_rate": 9.691822516018099e-06, + "loss": 0.8185, + "step": 4171 + }, + { + "epoch": 0.22962188342781661, + "grad_norm": 0.6918593645095825, + "learning_rate": 9.691672671819265e-06, + "loss": 0.6983, + "step": 4172 + }, + { + "epoch": 0.22967692223017228, + "grad_norm": 0.7947858572006226, + "learning_rate": 9.691522792359077e-06, + "loss": 0.8098, + "step": 4173 + }, + { + "epoch": 0.22973196103252794, + "grad_norm": 0.7907306551933289, + "learning_rate": 9.691372877638658e-06, + "loss": 0.8, + "step": 4174 + }, + { + "epoch": 0.2297869998348836, + "grad_norm": 0.7669435739517212, + "learning_rate": 9.691222927659137e-06, + "loss": 0.8121, + "step": 4175 + }, + { + "epoch": 0.22984203863723926, + "grad_norm": 0.8128299117088318, + "learning_rate": 9.691072942421642e-06, + "loss": 0.7554, + "step": 4176 + }, + { + "epoch": 0.22989707743959492, + "grad_norm": 0.9043960571289062, + "learning_rate": 9.690922921927295e-06, + "loss": 0.8601, + "step": 4177 + }, + { + "epoch": 0.22995211624195058, + "grad_norm": 0.835445761680603, + "learning_rate": 9.690772866177229e-06, + "loss": 0.8185, + "step": 4178 + }, + { + "epoch": 0.23000715504430624, + "grad_norm": 0.734601616859436, + "learning_rate": 9.69062277517257e-06, + "loss": 0.6486, + "step": 4179 + }, + { + "epoch": 0.2300621938466619, + "grad_norm": 0.8252671957015991, + "learning_rate": 9.690472648914445e-06, + "loss": 0.8455, + "step": 4180 + }, + { + "epoch": 0.23011723264901757, + "grad_norm": 0.8266329169273376, + "learning_rate": 9.690322487403984e-06, + "loss": 0.7348, + "step": 4181 + }, + { + "epoch": 0.23017227145137323, + "grad_norm": 0.8280256390571594, + "learning_rate": 9.690172290642314e-06, + "loss": 0.8191, + "step": 4182 + }, + { + "epoch": 0.2302273102537289, + "grad_norm": 0.8854276537895203, + "learning_rate": 9.690022058630564e-06, + "loss": 0.9327, + "step": 4183 + }, + { + "epoch": 0.23028234905608455, + "grad_norm": 0.7308807969093323, + "learning_rate": 9.689871791369865e-06, + "loss": 0.8144, + "step": 4184 + }, + { + "epoch": 0.2303373878584402, + "grad_norm": 0.7171719670295715, + "learning_rate": 9.689721488861344e-06, + "loss": 0.8265, + "step": 4185 + }, + { + "epoch": 0.23039242666079587, + "grad_norm": 0.7955548763275146, + "learning_rate": 9.689571151106131e-06, + "loss": 0.7313, + "step": 4186 + }, + { + "epoch": 0.23044746546315154, + "grad_norm": 0.8218876123428345, + "learning_rate": 9.689420778105359e-06, + "loss": 0.883, + "step": 4187 + }, + { + "epoch": 0.23050250426550717, + "grad_norm": 0.79570072889328, + "learning_rate": 9.689270369860154e-06, + "loss": 0.8898, + "step": 4188 + }, + { + "epoch": 0.23055754306786283, + "grad_norm": 0.8163344264030457, + "learning_rate": 9.689119926371649e-06, + "loss": 0.8638, + "step": 4189 + }, + { + "epoch": 0.2306125818702185, + "grad_norm": 0.7767764329910278, + "learning_rate": 9.688969447640972e-06, + "loss": 0.7822, + "step": 4190 + }, + { + "epoch": 0.23066762067257415, + "grad_norm": 0.9357114434242249, + "learning_rate": 9.688818933669258e-06, + "loss": 0.8031, + "step": 4191 + }, + { + "epoch": 0.23072265947492981, + "grad_norm": 0.8340080380439758, + "learning_rate": 9.688668384457635e-06, + "loss": 0.8947, + "step": 4192 + }, + { + "epoch": 0.23077769827728548, + "grad_norm": 0.8187471628189087, + "learning_rate": 9.688517800007235e-06, + "loss": 0.7989, + "step": 4193 + }, + { + "epoch": 0.23083273707964114, + "grad_norm": 0.8131871819496155, + "learning_rate": 9.688367180319191e-06, + "loss": 0.8377, + "step": 4194 + }, + { + "epoch": 0.2308877758819968, + "grad_norm": 0.7933448553085327, + "learning_rate": 9.688216525394634e-06, + "loss": 0.8723, + "step": 4195 + }, + { + "epoch": 0.23094281468435246, + "grad_norm": 0.7262325286865234, + "learning_rate": 9.688065835234695e-06, + "loss": 0.7802, + "step": 4196 + }, + { + "epoch": 0.23099785348670812, + "grad_norm": 0.8289293050765991, + "learning_rate": 9.68791510984051e-06, + "loss": 0.642, + "step": 4197 + }, + { + "epoch": 0.23105289228906378, + "grad_norm": 0.8835988640785217, + "learning_rate": 9.687764349213211e-06, + "loss": 0.9002, + "step": 4198 + }, + { + "epoch": 0.23110793109141944, + "grad_norm": 0.9478649497032166, + "learning_rate": 9.687613553353927e-06, + "loss": 0.8668, + "step": 4199 + }, + { + "epoch": 0.2311629698937751, + "grad_norm": 0.872936487197876, + "learning_rate": 9.687462722263796e-06, + "loss": 0.8312, + "step": 4200 + }, + { + "epoch": 0.23121800869613077, + "grad_norm": 0.7073879241943359, + "learning_rate": 9.68731185594395e-06, + "loss": 0.776, + "step": 4201 + }, + { + "epoch": 0.23127304749848643, + "grad_norm": 0.8265218734741211, + "learning_rate": 9.687160954395522e-06, + "loss": 0.8152, + "step": 4202 + }, + { + "epoch": 0.2313280863008421, + "grad_norm": 0.8027207255363464, + "learning_rate": 9.687010017619649e-06, + "loss": 0.9514, + "step": 4203 + }, + { + "epoch": 0.23138312510319775, + "grad_norm": 0.7416790127754211, + "learning_rate": 9.68685904561746e-06, + "loss": 0.7708, + "step": 4204 + }, + { + "epoch": 0.2314381639055534, + "grad_norm": 0.7916150689125061, + "learning_rate": 9.686708038390096e-06, + "loss": 0.7753, + "step": 4205 + }, + { + "epoch": 0.23149320270790907, + "grad_norm": 0.7213300466537476, + "learning_rate": 9.686556995938688e-06, + "loss": 0.83, + "step": 4206 + }, + { + "epoch": 0.23154824151026474, + "grad_norm": 0.7595892548561096, + "learning_rate": 9.68640591826437e-06, + "loss": 0.8186, + "step": 4207 + }, + { + "epoch": 0.2316032803126204, + "grad_norm": 0.7042104601860046, + "learning_rate": 9.686254805368282e-06, + "loss": 0.7126, + "step": 4208 + }, + { + "epoch": 0.23165831911497606, + "grad_norm": 0.7416805028915405, + "learning_rate": 9.686103657251558e-06, + "loss": 0.7791, + "step": 4209 + }, + { + "epoch": 0.23171335791733172, + "grad_norm": 0.9868568181991577, + "learning_rate": 9.685952473915333e-06, + "loss": 0.8453, + "step": 4210 + }, + { + "epoch": 0.23176839671968738, + "grad_norm": 0.7133191823959351, + "learning_rate": 9.68580125536074e-06, + "loss": 0.6061, + "step": 4211 + }, + { + "epoch": 0.23182343552204304, + "grad_norm": 0.8307366967201233, + "learning_rate": 9.685650001588921e-06, + "loss": 0.8403, + "step": 4212 + }, + { + "epoch": 0.2318784743243987, + "grad_norm": 0.8395226001739502, + "learning_rate": 9.685498712601014e-06, + "loss": 0.7945, + "step": 4213 + }, + { + "epoch": 0.23193351312675436, + "grad_norm": 0.7557219862937927, + "learning_rate": 9.68534738839815e-06, + "loss": 0.7765, + "step": 4214 + }, + { + "epoch": 0.23198855192911003, + "grad_norm": 0.7003554105758667, + "learning_rate": 9.68519602898147e-06, + "loss": 0.7228, + "step": 4215 + }, + { + "epoch": 0.2320435907314657, + "grad_norm": 0.8422999382019043, + "learning_rate": 9.68504463435211e-06, + "loss": 0.8524, + "step": 4216 + }, + { + "epoch": 0.23209862953382135, + "grad_norm": 0.9369016289710999, + "learning_rate": 9.68489320451121e-06, + "loss": 0.7646, + "step": 4217 + }, + { + "epoch": 0.232153668336177, + "grad_norm": 0.8456607460975647, + "learning_rate": 9.684741739459905e-06, + "loss": 0.7481, + "step": 4218 + }, + { + "epoch": 0.23220870713853267, + "grad_norm": 0.9284812211990356, + "learning_rate": 9.684590239199336e-06, + "loss": 0.8192, + "step": 4219 + }, + { + "epoch": 0.23226374594088833, + "grad_norm": 0.8474242687225342, + "learning_rate": 9.68443870373064e-06, + "loss": 0.7143, + "step": 4220 + }, + { + "epoch": 0.232318784743244, + "grad_norm": 0.8259334564208984, + "learning_rate": 9.684287133054957e-06, + "loss": 0.8667, + "step": 4221 + }, + { + "epoch": 0.23237382354559966, + "grad_norm": 0.8016416430473328, + "learning_rate": 9.684135527173427e-06, + "loss": 0.8694, + "step": 4222 + }, + { + "epoch": 0.23242886234795532, + "grad_norm": 0.7575937509536743, + "learning_rate": 9.683983886087186e-06, + "loss": 0.7591, + "step": 4223 + }, + { + "epoch": 0.23248390115031098, + "grad_norm": 0.7004683613777161, + "learning_rate": 9.683832209797377e-06, + "loss": 0.739, + "step": 4224 + }, + { + "epoch": 0.23253893995266664, + "grad_norm": 0.8265832662582397, + "learning_rate": 9.68368049830514e-06, + "loss": 0.7705, + "step": 4225 + }, + { + "epoch": 0.2325939787550223, + "grad_norm": 0.7705711722373962, + "learning_rate": 9.683528751611612e-06, + "loss": 0.7896, + "step": 4226 + }, + { + "epoch": 0.23264901755737796, + "grad_norm": 0.7426978349685669, + "learning_rate": 9.683376969717937e-06, + "loss": 0.8217, + "step": 4227 + }, + { + "epoch": 0.23270405635973362, + "grad_norm": 0.7425839304924011, + "learning_rate": 9.683225152625255e-06, + "loss": 0.7426, + "step": 4228 + }, + { + "epoch": 0.23275909516208929, + "grad_norm": 1.0415440797805786, + "learning_rate": 9.683073300334705e-06, + "loss": 0.8585, + "step": 4229 + }, + { + "epoch": 0.23281413396444495, + "grad_norm": 0.7706055045127869, + "learning_rate": 9.68292141284743e-06, + "loss": 0.8349, + "step": 4230 + }, + { + "epoch": 0.23286917276680058, + "grad_norm": 0.8407607674598694, + "learning_rate": 9.682769490164572e-06, + "loss": 0.8592, + "step": 4231 + }, + { + "epoch": 0.23292421156915624, + "grad_norm": 0.6830767393112183, + "learning_rate": 9.68261753228727e-06, + "loss": 0.6773, + "step": 4232 + }, + { + "epoch": 0.2329792503715119, + "grad_norm": 1.6661429405212402, + "learning_rate": 9.68246553921667e-06, + "loss": 1.005, + "step": 4233 + }, + { + "epoch": 0.23303428917386756, + "grad_norm": 0.7677092552185059, + "learning_rate": 9.682313510953912e-06, + "loss": 0.7689, + "step": 4234 + }, + { + "epoch": 0.23308932797622323, + "grad_norm": 0.7232248187065125, + "learning_rate": 9.682161447500139e-06, + "loss": 0.7765, + "step": 4235 + }, + { + "epoch": 0.2331443667785789, + "grad_norm": 0.8667388558387756, + "learning_rate": 9.682009348856494e-06, + "loss": 0.8099, + "step": 4236 + }, + { + "epoch": 0.23319940558093455, + "grad_norm": 0.8220446705818176, + "learning_rate": 9.68185721502412e-06, + "loss": 0.8078, + "step": 4237 + }, + { + "epoch": 0.2332544443832902, + "grad_norm": 0.9670323133468628, + "learning_rate": 9.68170504600416e-06, + "loss": 0.8912, + "step": 4238 + }, + { + "epoch": 0.23330948318564587, + "grad_norm": 0.7950771450996399, + "learning_rate": 9.68155284179776e-06, + "loss": 0.8165, + "step": 4239 + }, + { + "epoch": 0.23336452198800153, + "grad_norm": 0.7606233358383179, + "learning_rate": 9.68140060240606e-06, + "loss": 0.7795, + "step": 4240 + }, + { + "epoch": 0.2334195607903572, + "grad_norm": 0.9580656886100769, + "learning_rate": 9.681248327830205e-06, + "loss": 0.7949, + "step": 4241 + }, + { + "epoch": 0.23347459959271286, + "grad_norm": 0.6878347992897034, + "learning_rate": 9.681096018071341e-06, + "loss": 0.7776, + "step": 4242 + }, + { + "epoch": 0.23352963839506852, + "grad_norm": 0.8449816107749939, + "learning_rate": 9.680943673130614e-06, + "loss": 0.8456, + "step": 4243 + }, + { + "epoch": 0.23358467719742418, + "grad_norm": 0.77314692735672, + "learning_rate": 9.680791293009167e-06, + "loss": 0.7915, + "step": 4244 + }, + { + "epoch": 0.23363971599977984, + "grad_norm": 0.8034142255783081, + "learning_rate": 9.680638877708146e-06, + "loss": 0.7377, + "step": 4245 + }, + { + "epoch": 0.2336947548021355, + "grad_norm": 0.8754952549934387, + "learning_rate": 9.680486427228695e-06, + "loss": 0.8072, + "step": 4246 + }, + { + "epoch": 0.23374979360449116, + "grad_norm": 0.8169820308685303, + "learning_rate": 9.680333941571963e-06, + "loss": 0.8253, + "step": 4247 + }, + { + "epoch": 0.23380483240684682, + "grad_norm": 0.7848341464996338, + "learning_rate": 9.680181420739092e-06, + "loss": 0.8243, + "step": 4248 + }, + { + "epoch": 0.23385987120920249, + "grad_norm": 0.7599799036979675, + "learning_rate": 9.68002886473123e-06, + "loss": 0.781, + "step": 4249 + }, + { + "epoch": 0.23391491001155815, + "grad_norm": 0.8920254707336426, + "learning_rate": 9.679876273549524e-06, + "loss": 0.8199, + "step": 4250 + }, + { + "epoch": 0.2339699488139138, + "grad_norm": 0.7813586592674255, + "learning_rate": 9.679723647195121e-06, + "loss": 0.7758, + "step": 4251 + }, + { + "epoch": 0.23402498761626947, + "grad_norm": 0.735282838344574, + "learning_rate": 9.679570985669168e-06, + "loss": 0.7651, + "step": 4252 + }, + { + "epoch": 0.23408002641862513, + "grad_norm": 0.7305853962898254, + "learning_rate": 9.679418288972813e-06, + "loss": 0.8202, + "step": 4253 + }, + { + "epoch": 0.2341350652209808, + "grad_norm": 0.8331005573272705, + "learning_rate": 9.6792655571072e-06, + "loss": 0.8784, + "step": 4254 + }, + { + "epoch": 0.23419010402333645, + "grad_norm": 0.8526305556297302, + "learning_rate": 9.679112790073481e-06, + "loss": 0.8116, + "step": 4255 + }, + { + "epoch": 0.23424514282569212, + "grad_norm": 0.741073489189148, + "learning_rate": 9.678959987872805e-06, + "loss": 0.6928, + "step": 4256 + }, + { + "epoch": 0.23430018162804778, + "grad_norm": 0.727859616279602, + "learning_rate": 9.678807150506315e-06, + "loss": 0.7571, + "step": 4257 + }, + { + "epoch": 0.23435522043040344, + "grad_norm": 0.8890698552131653, + "learning_rate": 9.678654277975165e-06, + "loss": 0.8145, + "step": 4258 + }, + { + "epoch": 0.2344102592327591, + "grad_norm": 0.7372937798500061, + "learning_rate": 9.6785013702805e-06, + "loss": 0.7104, + "step": 4259 + }, + { + "epoch": 0.23446529803511476, + "grad_norm": 0.7205008268356323, + "learning_rate": 9.678348427423472e-06, + "loss": 0.7498, + "step": 4260 + }, + { + "epoch": 0.23452033683747042, + "grad_norm": 0.7766392230987549, + "learning_rate": 9.67819544940523e-06, + "loss": 0.7814, + "step": 4261 + }, + { + "epoch": 0.23457537563982608, + "grad_norm": 0.7441498637199402, + "learning_rate": 9.678042436226922e-06, + "loss": 0.7429, + "step": 4262 + }, + { + "epoch": 0.23463041444218175, + "grad_norm": 0.8838522434234619, + "learning_rate": 9.677889387889701e-06, + "loss": 0.8719, + "step": 4263 + }, + { + "epoch": 0.2346854532445374, + "grad_norm": 1.2349655628204346, + "learning_rate": 9.677736304394716e-06, + "loss": 0.8491, + "step": 4264 + }, + { + "epoch": 0.23474049204689307, + "grad_norm": 0.8050087690353394, + "learning_rate": 9.677583185743116e-06, + "loss": 0.795, + "step": 4265 + }, + { + "epoch": 0.23479553084924873, + "grad_norm": 0.7885709404945374, + "learning_rate": 9.677430031936051e-06, + "loss": 0.8594, + "step": 4266 + }, + { + "epoch": 0.2348505696516044, + "grad_norm": 0.7753557562828064, + "learning_rate": 9.677276842974676e-06, + "loss": 0.8196, + "step": 4267 + }, + { + "epoch": 0.23490560845396005, + "grad_norm": 0.7325392961502075, + "learning_rate": 9.67712361886014e-06, + "loss": 0.7905, + "step": 4268 + }, + { + "epoch": 0.2349606472563157, + "grad_norm": 0.7925617694854736, + "learning_rate": 9.676970359593594e-06, + "loss": 0.7416, + "step": 4269 + }, + { + "epoch": 0.23501568605867137, + "grad_norm": 0.7981371283531189, + "learning_rate": 9.676817065176192e-06, + "loss": 0.81, + "step": 4270 + }, + { + "epoch": 0.23507072486102704, + "grad_norm": 0.7490524053573608, + "learning_rate": 9.676663735609084e-06, + "loss": 0.8347, + "step": 4271 + }, + { + "epoch": 0.2351257636633827, + "grad_norm": 1.000349521636963, + "learning_rate": 9.676510370893424e-06, + "loss": 0.7469, + "step": 4272 + }, + { + "epoch": 0.23518080246573836, + "grad_norm": 0.9310774207115173, + "learning_rate": 9.676356971030364e-06, + "loss": 0.8088, + "step": 4273 + }, + { + "epoch": 0.235235841268094, + "grad_norm": 0.8868544101715088, + "learning_rate": 9.676203536021055e-06, + "loss": 0.7472, + "step": 4274 + }, + { + "epoch": 0.23529088007044965, + "grad_norm": 0.7702255845069885, + "learning_rate": 9.676050065866653e-06, + "loss": 0.8395, + "step": 4275 + }, + { + "epoch": 0.23534591887280532, + "grad_norm": 0.7138833999633789, + "learning_rate": 9.675896560568311e-06, + "loss": 0.8529, + "step": 4276 + }, + { + "epoch": 0.23540095767516098, + "grad_norm": 0.8399729132652283, + "learning_rate": 9.675743020127182e-06, + "loss": 0.7844, + "step": 4277 + }, + { + "epoch": 0.23545599647751664, + "grad_norm": 0.8500726819038391, + "learning_rate": 9.67558944454442e-06, + "loss": 0.8209, + "step": 4278 + }, + { + "epoch": 0.2355110352798723, + "grad_norm": 0.766638994216919, + "learning_rate": 9.675435833821178e-06, + "loss": 0.7834, + "step": 4279 + }, + { + "epoch": 0.23556607408222796, + "grad_norm": 0.9121370315551758, + "learning_rate": 9.675282187958613e-06, + "loss": 0.8697, + "step": 4280 + }, + { + "epoch": 0.23562111288458362, + "grad_norm": 0.7862319946289062, + "learning_rate": 9.675128506957879e-06, + "loss": 0.8262, + "step": 4281 + }, + { + "epoch": 0.23567615168693928, + "grad_norm": 1.072777509689331, + "learning_rate": 9.67497479082013e-06, + "loss": 0.7963, + "step": 4282 + }, + { + "epoch": 0.23573119048929495, + "grad_norm": 0.7574695944786072, + "learning_rate": 9.67482103954652e-06, + "loss": 0.8178, + "step": 4283 + }, + { + "epoch": 0.2357862292916506, + "grad_norm": 0.7996877431869507, + "learning_rate": 9.674667253138209e-06, + "loss": 0.8465, + "step": 4284 + }, + { + "epoch": 0.23584126809400627, + "grad_norm": 0.711513340473175, + "learning_rate": 9.674513431596349e-06, + "loss": 0.7445, + "step": 4285 + }, + { + "epoch": 0.23589630689636193, + "grad_norm": 0.7431296706199646, + "learning_rate": 9.674359574922098e-06, + "loss": 0.8102, + "step": 4286 + }, + { + "epoch": 0.2359513456987176, + "grad_norm": 0.7745676040649414, + "learning_rate": 9.674205683116612e-06, + "loss": 0.8733, + "step": 4287 + }, + { + "epoch": 0.23600638450107325, + "grad_norm": 1.0117937326431274, + "learning_rate": 9.674051756181046e-06, + "loss": 0.9035, + "step": 4288 + }, + { + "epoch": 0.2360614233034289, + "grad_norm": 0.7848078608512878, + "learning_rate": 9.67389779411656e-06, + "loss": 0.8486, + "step": 4289 + }, + { + "epoch": 0.23611646210578457, + "grad_norm": 0.8439378142356873, + "learning_rate": 9.673743796924307e-06, + "loss": 0.8032, + "step": 4290 + }, + { + "epoch": 0.23617150090814024, + "grad_norm": 0.8268104791641235, + "learning_rate": 9.673589764605449e-06, + "loss": 0.8182, + "step": 4291 + }, + { + "epoch": 0.2362265397104959, + "grad_norm": 0.8896234631538391, + "learning_rate": 9.67343569716114e-06, + "loss": 0.8081, + "step": 4292 + }, + { + "epoch": 0.23628157851285156, + "grad_norm": 0.8515019416809082, + "learning_rate": 9.67328159459254e-06, + "loss": 0.8239, + "step": 4293 + }, + { + "epoch": 0.23633661731520722, + "grad_norm": 0.7779792547225952, + "learning_rate": 9.673127456900806e-06, + "loss": 0.8437, + "step": 4294 + }, + { + "epoch": 0.23639165611756288, + "grad_norm": 0.7782402634620667, + "learning_rate": 9.672973284087097e-06, + "loss": 0.8498, + "step": 4295 + }, + { + "epoch": 0.23644669491991854, + "grad_norm": 0.7588973641395569, + "learning_rate": 9.67281907615257e-06, + "loss": 0.7034, + "step": 4296 + }, + { + "epoch": 0.2365017337222742, + "grad_norm": 0.8426640629768372, + "learning_rate": 9.67266483309839e-06, + "loss": 0.803, + "step": 4297 + }, + { + "epoch": 0.23655677252462987, + "grad_norm": 0.8945889472961426, + "learning_rate": 9.672510554925707e-06, + "loss": 0.8971, + "step": 4298 + }, + { + "epoch": 0.23661181132698553, + "grad_norm": 0.8604227304458618, + "learning_rate": 9.672356241635688e-06, + "loss": 0.7548, + "step": 4299 + }, + { + "epoch": 0.2366668501293412, + "grad_norm": 0.7277490496635437, + "learning_rate": 9.672201893229489e-06, + "loss": 0.8083, + "step": 4300 + }, + { + "epoch": 0.23672188893169685, + "grad_norm": 0.9089379906654358, + "learning_rate": 9.672047509708273e-06, + "loss": 0.9717, + "step": 4301 + }, + { + "epoch": 0.2367769277340525, + "grad_norm": 0.7207155823707581, + "learning_rate": 9.671893091073198e-06, + "loss": 0.6794, + "step": 4302 + }, + { + "epoch": 0.23683196653640817, + "grad_norm": 0.7319806814193726, + "learning_rate": 9.671738637325425e-06, + "loss": 0.6821, + "step": 4303 + }, + { + "epoch": 0.23688700533876383, + "grad_norm": 0.7339589595794678, + "learning_rate": 9.671584148466112e-06, + "loss": 0.7895, + "step": 4304 + }, + { + "epoch": 0.2369420441411195, + "grad_norm": 0.7725476622581482, + "learning_rate": 9.671429624496428e-06, + "loss": 0.7414, + "step": 4305 + }, + { + "epoch": 0.23699708294347516, + "grad_norm": 0.7040137648582458, + "learning_rate": 9.671275065417527e-06, + "loss": 0.696, + "step": 4306 + }, + { + "epoch": 0.23705212174583082, + "grad_norm": 0.8804189562797546, + "learning_rate": 9.671120471230572e-06, + "loss": 0.8184, + "step": 4307 + }, + { + "epoch": 0.23710716054818648, + "grad_norm": 0.8062872886657715, + "learning_rate": 9.670965841936728e-06, + "loss": 0.7856, + "step": 4308 + }, + { + "epoch": 0.23716219935054214, + "grad_norm": 0.7537097930908203, + "learning_rate": 9.670811177537154e-06, + "loss": 0.7562, + "step": 4309 + }, + { + "epoch": 0.2372172381528978, + "grad_norm": 0.8168618083000183, + "learning_rate": 9.670656478033013e-06, + "loss": 0.7416, + "step": 4310 + }, + { + "epoch": 0.23727227695525346, + "grad_norm": 0.8367040157318115, + "learning_rate": 9.670501743425469e-06, + "loss": 0.7759, + "step": 4311 + }, + { + "epoch": 0.23732731575760913, + "grad_norm": 0.860418975353241, + "learning_rate": 9.670346973715683e-06, + "loss": 0.9013, + "step": 4312 + }, + { + "epoch": 0.2373823545599648, + "grad_norm": 0.8736678957939148, + "learning_rate": 9.67019216890482e-06, + "loss": 0.8677, + "step": 4313 + }, + { + "epoch": 0.23743739336232045, + "grad_norm": 0.8258964419364929, + "learning_rate": 9.670037328994044e-06, + "loss": 0.8208, + "step": 4314 + }, + { + "epoch": 0.2374924321646761, + "grad_norm": 0.7936292886734009, + "learning_rate": 9.669882453984516e-06, + "loss": 0.8643, + "step": 4315 + }, + { + "epoch": 0.23754747096703177, + "grad_norm": 0.805500864982605, + "learning_rate": 9.669727543877401e-06, + "loss": 0.779, + "step": 4316 + }, + { + "epoch": 0.2376025097693874, + "grad_norm": 0.8072311282157898, + "learning_rate": 9.669572598673866e-06, + "loss": 0.8258, + "step": 4317 + }, + { + "epoch": 0.23765754857174307, + "grad_norm": 0.8917607665061951, + "learning_rate": 9.669417618375072e-06, + "loss": 0.7528, + "step": 4318 + }, + { + "epoch": 0.23771258737409873, + "grad_norm": 0.7054246068000793, + "learning_rate": 9.669262602982186e-06, + "loss": 0.86, + "step": 4319 + }, + { + "epoch": 0.2377676261764544, + "grad_norm": 0.8600299954414368, + "learning_rate": 9.66910755249637e-06, + "loss": 0.8165, + "step": 4320 + }, + { + "epoch": 0.23782266497881005, + "grad_norm": 0.8685561418533325, + "learning_rate": 9.668952466918793e-06, + "loss": 0.8129, + "step": 4321 + }, + { + "epoch": 0.2378777037811657, + "grad_norm": 0.7859770655632019, + "learning_rate": 9.668797346250618e-06, + "loss": 0.8703, + "step": 4322 + }, + { + "epoch": 0.23793274258352137, + "grad_norm": 0.8128730058670044, + "learning_rate": 9.668642190493015e-06, + "loss": 0.7595, + "step": 4323 + }, + { + "epoch": 0.23798778138587703, + "grad_norm": 0.8223204612731934, + "learning_rate": 9.668486999647143e-06, + "loss": 0.825, + "step": 4324 + }, + { + "epoch": 0.2380428201882327, + "grad_norm": 0.859619677066803, + "learning_rate": 9.668331773714175e-06, + "loss": 0.8239, + "step": 4325 + }, + { + "epoch": 0.23809785899058836, + "grad_norm": 0.9861679673194885, + "learning_rate": 9.668176512695273e-06, + "loss": 0.8409, + "step": 4326 + }, + { + "epoch": 0.23815289779294402, + "grad_norm": 0.7178627252578735, + "learning_rate": 9.668021216591607e-06, + "loss": 0.818, + "step": 4327 + }, + { + "epoch": 0.23820793659529968, + "grad_norm": 0.9160923957824707, + "learning_rate": 9.667865885404343e-06, + "loss": 0.8703, + "step": 4328 + }, + { + "epoch": 0.23826297539765534, + "grad_norm": 0.7043942213058472, + "learning_rate": 9.667710519134648e-06, + "loss": 0.6884, + "step": 4329 + }, + { + "epoch": 0.238318014200011, + "grad_norm": 1.213121771812439, + "learning_rate": 9.667555117783691e-06, + "loss": 0.7843, + "step": 4330 + }, + { + "epoch": 0.23837305300236666, + "grad_norm": 0.8008033037185669, + "learning_rate": 9.66739968135264e-06, + "loss": 0.9312, + "step": 4331 + }, + { + "epoch": 0.23842809180472233, + "grad_norm": 0.7862009406089783, + "learning_rate": 9.667244209842662e-06, + "loss": 0.6965, + "step": 4332 + }, + { + "epoch": 0.238483130607078, + "grad_norm": 1.081398844718933, + "learning_rate": 9.667088703254923e-06, + "loss": 0.9793, + "step": 4333 + }, + { + "epoch": 0.23853816940943365, + "grad_norm": 0.7672395706176758, + "learning_rate": 9.666933161590597e-06, + "loss": 0.813, + "step": 4334 + }, + { + "epoch": 0.2385932082117893, + "grad_norm": 0.6955493092536926, + "learning_rate": 9.66677758485085e-06, + "loss": 0.7778, + "step": 4335 + }, + { + "epoch": 0.23864824701414497, + "grad_norm": 0.8609682321548462, + "learning_rate": 9.666621973036854e-06, + "loss": 0.7817, + "step": 4336 + }, + { + "epoch": 0.23870328581650063, + "grad_norm": 0.7312196493148804, + "learning_rate": 9.666466326149774e-06, + "loss": 0.7368, + "step": 4337 + }, + { + "epoch": 0.2387583246188563, + "grad_norm": 0.7964538931846619, + "learning_rate": 9.666310644190782e-06, + "loss": 0.8124, + "step": 4338 + }, + { + "epoch": 0.23881336342121195, + "grad_norm": 1.1138910055160522, + "learning_rate": 9.66615492716105e-06, + "loss": 0.8886, + "step": 4339 + }, + { + "epoch": 0.23886840222356762, + "grad_norm": 0.8789949417114258, + "learning_rate": 9.665999175061747e-06, + "loss": 0.7854, + "step": 4340 + }, + { + "epoch": 0.23892344102592328, + "grad_norm": 0.7761380076408386, + "learning_rate": 9.665843387894041e-06, + "loss": 0.7915, + "step": 4341 + }, + { + "epoch": 0.23897847982827894, + "grad_norm": 0.888482928276062, + "learning_rate": 9.665687565659106e-06, + "loss": 0.8799, + "step": 4342 + }, + { + "epoch": 0.2390335186306346, + "grad_norm": 0.7799200415611267, + "learning_rate": 9.665531708358111e-06, + "loss": 0.8519, + "step": 4343 + }, + { + "epoch": 0.23908855743299026, + "grad_norm": 0.7407697439193726, + "learning_rate": 9.665375815992231e-06, + "loss": 0.7637, + "step": 4344 + }, + { + "epoch": 0.23914359623534592, + "grad_norm": 0.8098278045654297, + "learning_rate": 9.665219888562634e-06, + "loss": 0.7991, + "step": 4345 + }, + { + "epoch": 0.23919863503770158, + "grad_norm": 0.7585136294364929, + "learning_rate": 9.665063926070493e-06, + "loss": 0.8478, + "step": 4346 + }, + { + "epoch": 0.23925367384005725, + "grad_norm": 0.7294817566871643, + "learning_rate": 9.66490792851698e-06, + "loss": 0.8312, + "step": 4347 + }, + { + "epoch": 0.2393087126424129, + "grad_norm": 0.8325762748718262, + "learning_rate": 9.664751895903269e-06, + "loss": 0.9365, + "step": 4348 + }, + { + "epoch": 0.23936375144476857, + "grad_norm": 0.9992470741271973, + "learning_rate": 9.66459582823053e-06, + "loss": 0.8649, + "step": 4349 + }, + { + "epoch": 0.23941879024712423, + "grad_norm": 0.7206875681877136, + "learning_rate": 9.664439725499938e-06, + "loss": 0.7013, + "step": 4350 + }, + { + "epoch": 0.2394738290494799, + "grad_norm": 0.946657657623291, + "learning_rate": 9.664283587712665e-06, + "loss": 0.7953, + "step": 4351 + }, + { + "epoch": 0.23952886785183555, + "grad_norm": 0.7684911489486694, + "learning_rate": 9.664127414869887e-06, + "loss": 0.8403, + "step": 4352 + }, + { + "epoch": 0.23958390665419121, + "grad_norm": 0.7875770926475525, + "learning_rate": 9.663971206972773e-06, + "loss": 0.7961, + "step": 4353 + }, + { + "epoch": 0.23963894545654688, + "grad_norm": 0.7387273907661438, + "learning_rate": 9.663814964022502e-06, + "loss": 0.8265, + "step": 4354 + }, + { + "epoch": 0.23969398425890254, + "grad_norm": 0.7413492202758789, + "learning_rate": 9.663658686020245e-06, + "loss": 0.8458, + "step": 4355 + }, + { + "epoch": 0.2397490230612582, + "grad_norm": 0.7563235759735107, + "learning_rate": 9.663502372967177e-06, + "loss": 0.8498, + "step": 4356 + }, + { + "epoch": 0.23980406186361386, + "grad_norm": 0.7529472708702087, + "learning_rate": 9.663346024864475e-06, + "loss": 0.7597, + "step": 4357 + }, + { + "epoch": 0.23985910066596952, + "grad_norm": 0.7582191824913025, + "learning_rate": 9.663189641713314e-06, + "loss": 0.804, + "step": 4358 + }, + { + "epoch": 0.23991413946832518, + "grad_norm": 0.8394485712051392, + "learning_rate": 9.663033223514865e-06, + "loss": 0.8329, + "step": 4359 + }, + { + "epoch": 0.23996917827068082, + "grad_norm": 0.7088292241096497, + "learning_rate": 9.662876770270308e-06, + "loss": 0.7131, + "step": 4360 + }, + { + "epoch": 0.24002421707303648, + "grad_norm": 0.8548080325126648, + "learning_rate": 9.662720281980817e-06, + "loss": 0.8925, + "step": 4361 + }, + { + "epoch": 0.24007925587539214, + "grad_norm": 0.8027567267417908, + "learning_rate": 9.662563758647568e-06, + "loss": 0.8652, + "step": 4362 + }, + { + "epoch": 0.2401342946777478, + "grad_norm": 0.7471736669540405, + "learning_rate": 9.662407200271738e-06, + "loss": 0.7722, + "step": 4363 + }, + { + "epoch": 0.24018933348010346, + "grad_norm": 0.7358804941177368, + "learning_rate": 9.662250606854504e-06, + "loss": 0.767, + "step": 4364 + }, + { + "epoch": 0.24024437228245912, + "grad_norm": 0.7948476672172546, + "learning_rate": 9.662093978397042e-06, + "loss": 0.961, + "step": 4365 + }, + { + "epoch": 0.24029941108481478, + "grad_norm": 0.7030961513519287, + "learning_rate": 9.66193731490053e-06, + "loss": 0.7826, + "step": 4366 + }, + { + "epoch": 0.24035444988717045, + "grad_norm": 0.8376098871231079, + "learning_rate": 9.661780616366145e-06, + "loss": 0.7697, + "step": 4367 + }, + { + "epoch": 0.2404094886895261, + "grad_norm": 0.7449594140052795, + "learning_rate": 9.661623882795065e-06, + "loss": 0.7944, + "step": 4368 + }, + { + "epoch": 0.24046452749188177, + "grad_norm": 0.7317184805870056, + "learning_rate": 9.661467114188468e-06, + "loss": 0.7059, + "step": 4369 + }, + { + "epoch": 0.24051956629423743, + "grad_norm": 0.843912661075592, + "learning_rate": 9.661310310547531e-06, + "loss": 0.7889, + "step": 4370 + }, + { + "epoch": 0.2405746050965931, + "grad_norm": 0.8673211336135864, + "learning_rate": 9.661153471873435e-06, + "loss": 0.7234, + "step": 4371 + }, + { + "epoch": 0.24062964389894875, + "grad_norm": 0.8179688453674316, + "learning_rate": 9.660996598167354e-06, + "loss": 0.8937, + "step": 4372 + }, + { + "epoch": 0.24068468270130441, + "grad_norm": 0.7800211906433105, + "learning_rate": 9.660839689430473e-06, + "loss": 0.8596, + "step": 4373 + }, + { + "epoch": 0.24073972150366008, + "grad_norm": 0.8781671524047852, + "learning_rate": 9.660682745663967e-06, + "loss": 0.8507, + "step": 4374 + }, + { + "epoch": 0.24079476030601574, + "grad_norm": 0.7701708674430847, + "learning_rate": 9.660525766869019e-06, + "loss": 0.8212, + "step": 4375 + }, + { + "epoch": 0.2408497991083714, + "grad_norm": 0.7721084356307983, + "learning_rate": 9.660368753046806e-06, + "loss": 0.7493, + "step": 4376 + }, + { + "epoch": 0.24090483791072706, + "grad_norm": 0.8126489520072937, + "learning_rate": 9.660211704198508e-06, + "loss": 0.8527, + "step": 4377 + }, + { + "epoch": 0.24095987671308272, + "grad_norm": 0.8172717690467834, + "learning_rate": 9.660054620325307e-06, + "loss": 0.8448, + "step": 4378 + }, + { + "epoch": 0.24101491551543838, + "grad_norm": 0.8293611407279968, + "learning_rate": 9.659897501428384e-06, + "loss": 0.9318, + "step": 4379 + }, + { + "epoch": 0.24106995431779404, + "grad_norm": 0.7445098161697388, + "learning_rate": 9.659740347508917e-06, + "loss": 0.7358, + "step": 4380 + }, + { + "epoch": 0.2411249931201497, + "grad_norm": 0.7778907418251038, + "learning_rate": 9.659583158568088e-06, + "loss": 0.7671, + "step": 4381 + }, + { + "epoch": 0.24118003192250537, + "grad_norm": 0.7828608751296997, + "learning_rate": 9.659425934607082e-06, + "loss": 0.8141, + "step": 4382 + }, + { + "epoch": 0.24123507072486103, + "grad_norm": 0.9433113932609558, + "learning_rate": 9.659268675627075e-06, + "loss": 0.7904, + "step": 4383 + }, + { + "epoch": 0.2412901095272167, + "grad_norm": 0.7097491025924683, + "learning_rate": 9.659111381629255e-06, + "loss": 0.7445, + "step": 4384 + }, + { + "epoch": 0.24134514832957235, + "grad_norm": 0.7450230717658997, + "learning_rate": 9.6589540526148e-06, + "loss": 0.6869, + "step": 4385 + }, + { + "epoch": 0.241400187131928, + "grad_norm": 0.7429760694503784, + "learning_rate": 9.658796688584893e-06, + "loss": 0.7367, + "step": 4386 + }, + { + "epoch": 0.24145522593428367, + "grad_norm": 0.7250030040740967, + "learning_rate": 9.658639289540716e-06, + "loss": 0.7502, + "step": 4387 + }, + { + "epoch": 0.24151026473663934, + "grad_norm": 0.6577159762382507, + "learning_rate": 9.658481855483455e-06, + "loss": 0.5785, + "step": 4388 + }, + { + "epoch": 0.241565303538995, + "grad_norm": 0.7846524119377136, + "learning_rate": 9.65832438641429e-06, + "loss": 0.7435, + "step": 4389 + }, + { + "epoch": 0.24162034234135066, + "grad_norm": 0.8370404839515686, + "learning_rate": 9.658166882334408e-06, + "loss": 0.8536, + "step": 4390 + }, + { + "epoch": 0.24167538114370632, + "grad_norm": 0.7451018691062927, + "learning_rate": 9.658009343244987e-06, + "loss": 0.8443, + "step": 4391 + }, + { + "epoch": 0.24173041994606198, + "grad_norm": 0.7629074454307556, + "learning_rate": 9.657851769147218e-06, + "loss": 0.7394, + "step": 4392 + }, + { + "epoch": 0.24178545874841764, + "grad_norm": 0.7767705321311951, + "learning_rate": 9.657694160042282e-06, + "loss": 0.8497, + "step": 4393 + }, + { + "epoch": 0.2418404975507733, + "grad_norm": 0.8635357022285461, + "learning_rate": 9.65753651593136e-06, + "loss": 0.8495, + "step": 4394 + }, + { + "epoch": 0.24189553635312896, + "grad_norm": 0.7652365565299988, + "learning_rate": 9.657378836815643e-06, + "loss": 0.7967, + "step": 4395 + }, + { + "epoch": 0.24195057515548463, + "grad_norm": 0.7721680402755737, + "learning_rate": 9.657221122696313e-06, + "loss": 0.8227, + "step": 4396 + }, + { + "epoch": 0.2420056139578403, + "grad_norm": 1.016366720199585, + "learning_rate": 9.657063373574555e-06, + "loss": 0.8291, + "step": 4397 + }, + { + "epoch": 0.24206065276019595, + "grad_norm": 0.7770145535469055, + "learning_rate": 9.656905589451555e-06, + "loss": 0.8335, + "step": 4398 + }, + { + "epoch": 0.2421156915625516, + "grad_norm": 0.812882125377655, + "learning_rate": 9.6567477703285e-06, + "loss": 0.8189, + "step": 4399 + }, + { + "epoch": 0.24217073036490727, + "grad_norm": 0.7253247499465942, + "learning_rate": 9.656589916206576e-06, + "loss": 0.8418, + "step": 4400 + }, + { + "epoch": 0.24222576916726293, + "grad_norm": 0.7784958481788635, + "learning_rate": 9.656432027086969e-06, + "loss": 0.8541, + "step": 4401 + }, + { + "epoch": 0.2422808079696186, + "grad_norm": 0.8001978397369385, + "learning_rate": 9.656274102970865e-06, + "loss": 0.8888, + "step": 4402 + }, + { + "epoch": 0.24233584677197423, + "grad_norm": 0.7535765767097473, + "learning_rate": 9.656116143859448e-06, + "loss": 0.7691, + "step": 4403 + }, + { + "epoch": 0.2423908855743299, + "grad_norm": 0.6554346680641174, + "learning_rate": 9.655958149753913e-06, + "loss": 0.7592, + "step": 4404 + }, + { + "epoch": 0.24244592437668555, + "grad_norm": 0.8599995374679565, + "learning_rate": 9.655800120655439e-06, + "loss": 0.8396, + "step": 4405 + }, + { + "epoch": 0.2425009631790412, + "grad_norm": 0.8172232508659363, + "learning_rate": 9.65564205656522e-06, + "loss": 0.6931, + "step": 4406 + }, + { + "epoch": 0.24255600198139687, + "grad_norm": 0.8005852699279785, + "learning_rate": 9.65548395748444e-06, + "loss": 0.8344, + "step": 4407 + }, + { + "epoch": 0.24261104078375254, + "grad_norm": 0.7823762893676758, + "learning_rate": 9.65532582341429e-06, + "loss": 0.7991, + "step": 4408 + }, + { + "epoch": 0.2426660795861082, + "grad_norm": 0.7743250727653503, + "learning_rate": 9.655167654355957e-06, + "loss": 0.9048, + "step": 4409 + }, + { + "epoch": 0.24272111838846386, + "grad_norm": 0.9825221300125122, + "learning_rate": 9.655009450310629e-06, + "loss": 0.7491, + "step": 4410 + }, + { + "epoch": 0.24277615719081952, + "grad_norm": 1.2921068668365479, + "learning_rate": 9.654851211279496e-06, + "loss": 0.8175, + "step": 4411 + }, + { + "epoch": 0.24283119599317518, + "grad_norm": 0.8267684578895569, + "learning_rate": 9.65469293726375e-06, + "loss": 0.8896, + "step": 4412 + }, + { + "epoch": 0.24288623479553084, + "grad_norm": 0.8020186424255371, + "learning_rate": 9.654534628264576e-06, + "loss": 0.7145, + "step": 4413 + }, + { + "epoch": 0.2429412735978865, + "grad_norm": 0.8192574977874756, + "learning_rate": 9.654376284283166e-06, + "loss": 0.7451, + "step": 4414 + }, + { + "epoch": 0.24299631240024216, + "grad_norm": 0.7733662128448486, + "learning_rate": 9.65421790532071e-06, + "loss": 0.768, + "step": 4415 + }, + { + "epoch": 0.24305135120259783, + "grad_norm": 0.8342406153678894, + "learning_rate": 9.654059491378396e-06, + "loss": 0.8137, + "step": 4416 + }, + { + "epoch": 0.2431063900049535, + "grad_norm": 1.014755368232727, + "learning_rate": 9.653901042457418e-06, + "loss": 0.8922, + "step": 4417 + }, + { + "epoch": 0.24316142880730915, + "grad_norm": 0.864608645439148, + "learning_rate": 9.653742558558967e-06, + "loss": 0.9412, + "step": 4418 + }, + { + "epoch": 0.2432164676096648, + "grad_norm": 0.7383908033370972, + "learning_rate": 9.65358403968423e-06, + "loss": 0.8261, + "step": 4419 + }, + { + "epoch": 0.24327150641202047, + "grad_norm": 0.7464672923088074, + "learning_rate": 9.653425485834403e-06, + "loss": 0.7074, + "step": 4420 + }, + { + "epoch": 0.24332654521437613, + "grad_norm": 0.7010141611099243, + "learning_rate": 9.653266897010676e-06, + "loss": 0.6849, + "step": 4421 + }, + { + "epoch": 0.2433815840167318, + "grad_norm": 0.7135268449783325, + "learning_rate": 9.653108273214239e-06, + "loss": 0.8228, + "step": 4422 + }, + { + "epoch": 0.24343662281908746, + "grad_norm": 0.8061006665229797, + "learning_rate": 9.652949614446287e-06, + "loss": 0.8345, + "step": 4423 + }, + { + "epoch": 0.24349166162144312, + "grad_norm": 0.6954759955406189, + "learning_rate": 9.652790920708011e-06, + "loss": 0.7189, + "step": 4424 + }, + { + "epoch": 0.24354670042379878, + "grad_norm": 0.8669333457946777, + "learning_rate": 9.652632192000603e-06, + "loss": 0.8872, + "step": 4425 + }, + { + "epoch": 0.24360173922615444, + "grad_norm": 0.7445051670074463, + "learning_rate": 9.652473428325258e-06, + "loss": 0.826, + "step": 4426 + }, + { + "epoch": 0.2436567780285101, + "grad_norm": 0.7444632649421692, + "learning_rate": 9.652314629683165e-06, + "loss": 0.8568, + "step": 4427 + }, + { + "epoch": 0.24371181683086576, + "grad_norm": 0.7160165309906006, + "learning_rate": 9.652155796075524e-06, + "loss": 0.799, + "step": 4428 + }, + { + "epoch": 0.24376685563322142, + "grad_norm": 0.7098904252052307, + "learning_rate": 9.651996927503526e-06, + "loss": 0.8148, + "step": 4429 + }, + { + "epoch": 0.24382189443557709, + "grad_norm": 0.7911115288734436, + "learning_rate": 9.651838023968363e-06, + "loss": 0.8279, + "step": 4430 + }, + { + "epoch": 0.24387693323793275, + "grad_norm": 0.8887501955032349, + "learning_rate": 9.651679085471229e-06, + "loss": 0.8464, + "step": 4431 + }, + { + "epoch": 0.2439319720402884, + "grad_norm": 0.8343196511268616, + "learning_rate": 9.651520112013321e-06, + "loss": 0.7364, + "step": 4432 + }, + { + "epoch": 0.24398701084264407, + "grad_norm": 0.7279361486434937, + "learning_rate": 9.651361103595835e-06, + "loss": 0.7958, + "step": 4433 + }, + { + "epoch": 0.24404204964499973, + "grad_norm": 0.8221089243888855, + "learning_rate": 9.651202060219962e-06, + "loss": 0.7753, + "step": 4434 + }, + { + "epoch": 0.2440970884473554, + "grad_norm": 0.7205086350440979, + "learning_rate": 9.6510429818869e-06, + "loss": 0.7411, + "step": 4435 + }, + { + "epoch": 0.24415212724971105, + "grad_norm": 0.854967474937439, + "learning_rate": 9.650883868597845e-06, + "loss": 0.8192, + "step": 4436 + }, + { + "epoch": 0.24420716605206672, + "grad_norm": 0.7622473835945129, + "learning_rate": 9.65072472035399e-06, + "loss": 0.7645, + "step": 4437 + }, + { + "epoch": 0.24426220485442238, + "grad_norm": 0.7430302500724792, + "learning_rate": 9.650565537156533e-06, + "loss": 0.7817, + "step": 4438 + }, + { + "epoch": 0.24431724365677804, + "grad_norm": 0.8022677898406982, + "learning_rate": 9.650406319006672e-06, + "loss": 0.8035, + "step": 4439 + }, + { + "epoch": 0.2443722824591337, + "grad_norm": 0.7346476912498474, + "learning_rate": 9.6502470659056e-06, + "loss": 0.826, + "step": 4440 + }, + { + "epoch": 0.24442732126148936, + "grad_norm": 0.8393376469612122, + "learning_rate": 9.650087777854517e-06, + "loss": 0.8073, + "step": 4441 + }, + { + "epoch": 0.24448236006384502, + "grad_norm": 0.7920215129852295, + "learning_rate": 9.649928454854618e-06, + "loss": 0.7774, + "step": 4442 + }, + { + "epoch": 0.24453739886620068, + "grad_norm": 0.8192804455757141, + "learning_rate": 9.649769096907102e-06, + "loss": 0.7817, + "step": 4443 + }, + { + "epoch": 0.24459243766855635, + "grad_norm": 0.7727654576301575, + "learning_rate": 9.649609704013167e-06, + "loss": 0.8201, + "step": 4444 + }, + { + "epoch": 0.244647476470912, + "grad_norm": 0.8005746603012085, + "learning_rate": 9.649450276174008e-06, + "loss": 0.8893, + "step": 4445 + }, + { + "epoch": 0.24470251527326764, + "grad_norm": 0.9029125571250916, + "learning_rate": 9.649290813390828e-06, + "loss": 0.7735, + "step": 4446 + }, + { + "epoch": 0.2447575540756233, + "grad_norm": 0.8336170315742493, + "learning_rate": 9.64913131566482e-06, + "loss": 0.7505, + "step": 4447 + }, + { + "epoch": 0.24481259287797896, + "grad_norm": 1.0272265672683716, + "learning_rate": 9.648971782997188e-06, + "loss": 0.8371, + "step": 4448 + }, + { + "epoch": 0.24486763168033462, + "grad_norm": 0.8095843195915222, + "learning_rate": 9.648812215389128e-06, + "loss": 0.7599, + "step": 4449 + }, + { + "epoch": 0.24492267048269029, + "grad_norm": 0.7690166234970093, + "learning_rate": 9.648652612841837e-06, + "loss": 0.8172, + "step": 4450 + }, + { + "epoch": 0.24497770928504595, + "grad_norm": 0.8282617926597595, + "learning_rate": 9.64849297535652e-06, + "loss": 0.8477, + "step": 4451 + }, + { + "epoch": 0.2450327480874016, + "grad_norm": 0.8307822346687317, + "learning_rate": 9.648333302934373e-06, + "loss": 0.7744, + "step": 4452 + }, + { + "epoch": 0.24508778688975727, + "grad_norm": 0.7619080543518066, + "learning_rate": 9.6481735955766e-06, + "loss": 0.8417, + "step": 4453 + }, + { + "epoch": 0.24514282569211293, + "grad_norm": 0.7879447937011719, + "learning_rate": 9.648013853284396e-06, + "loss": 0.7799, + "step": 4454 + }, + { + "epoch": 0.2451978644944686, + "grad_norm": 0.7352256774902344, + "learning_rate": 9.647854076058965e-06, + "loss": 0.8386, + "step": 4455 + }, + { + "epoch": 0.24525290329682425, + "grad_norm": 0.8318933248519897, + "learning_rate": 9.647694263901507e-06, + "loss": 0.7631, + "step": 4456 + }, + { + "epoch": 0.24530794209917992, + "grad_norm": 0.8609912395477295, + "learning_rate": 9.647534416813221e-06, + "loss": 0.7479, + "step": 4457 + }, + { + "epoch": 0.24536298090153558, + "grad_norm": 0.9590480327606201, + "learning_rate": 9.647374534795311e-06, + "loss": 0.8543, + "step": 4458 + }, + { + "epoch": 0.24541801970389124, + "grad_norm": 0.7902723550796509, + "learning_rate": 9.647214617848979e-06, + "loss": 0.6796, + "step": 4459 + }, + { + "epoch": 0.2454730585062469, + "grad_norm": 0.7725642919540405, + "learning_rate": 9.647054665975427e-06, + "loss": 0.7563, + "step": 4460 + }, + { + "epoch": 0.24552809730860256, + "grad_norm": 0.8387014269828796, + "learning_rate": 9.646894679175853e-06, + "loss": 0.8184, + "step": 4461 + }, + { + "epoch": 0.24558313611095822, + "grad_norm": 0.9200852513313293, + "learning_rate": 9.646734657451464e-06, + "loss": 0.8436, + "step": 4462 + }, + { + "epoch": 0.24563817491331388, + "grad_norm": 0.7565840482711792, + "learning_rate": 9.646574600803462e-06, + "loss": 0.7393, + "step": 4463 + }, + { + "epoch": 0.24569321371566955, + "grad_norm": 0.7685559988021851, + "learning_rate": 9.646414509233048e-06, + "loss": 0.7836, + "step": 4464 + }, + { + "epoch": 0.2457482525180252, + "grad_norm": 0.8172003030776978, + "learning_rate": 9.646254382741428e-06, + "loss": 0.787, + "step": 4465 + }, + { + "epoch": 0.24580329132038087, + "grad_norm": 0.902632474899292, + "learning_rate": 9.646094221329802e-06, + "loss": 0.7139, + "step": 4466 + }, + { + "epoch": 0.24585833012273653, + "grad_norm": 0.7810692191123962, + "learning_rate": 9.645934024999374e-06, + "loss": 0.6904, + "step": 4467 + }, + { + "epoch": 0.2459133689250922, + "grad_norm": 0.7242134213447571, + "learning_rate": 9.645773793751352e-06, + "loss": 0.7035, + "step": 4468 + }, + { + "epoch": 0.24596840772744785, + "grad_norm": 0.7192920446395874, + "learning_rate": 9.645613527586938e-06, + "loss": 0.7081, + "step": 4469 + }, + { + "epoch": 0.2460234465298035, + "grad_norm": 0.7613840103149414, + "learning_rate": 9.645453226507336e-06, + "loss": 0.8066, + "step": 4470 + }, + { + "epoch": 0.24607848533215917, + "grad_norm": 0.8154922127723694, + "learning_rate": 9.64529289051375e-06, + "loss": 0.812, + "step": 4471 + }, + { + "epoch": 0.24613352413451484, + "grad_norm": 0.9521573185920715, + "learning_rate": 9.645132519607387e-06, + "loss": 0.7456, + "step": 4472 + }, + { + "epoch": 0.2461885629368705, + "grad_norm": 0.785943329334259, + "learning_rate": 9.64497211378945e-06, + "loss": 0.832, + "step": 4473 + }, + { + "epoch": 0.24624360173922616, + "grad_norm": 0.7675127983093262, + "learning_rate": 9.644811673061148e-06, + "loss": 0.7984, + "step": 4474 + }, + { + "epoch": 0.24629864054158182, + "grad_norm": 0.7317580580711365, + "learning_rate": 9.644651197423683e-06, + "loss": 0.7634, + "step": 4475 + }, + { + "epoch": 0.24635367934393748, + "grad_norm": 0.744937539100647, + "learning_rate": 9.644490686878265e-06, + "loss": 0.729, + "step": 4476 + }, + { + "epoch": 0.24640871814629314, + "grad_norm": 0.7472458481788635, + "learning_rate": 9.644330141426097e-06, + "loss": 0.7517, + "step": 4477 + }, + { + "epoch": 0.2464637569486488, + "grad_norm": 0.8379414677619934, + "learning_rate": 9.644169561068387e-06, + "loss": 0.8008, + "step": 4478 + }, + { + "epoch": 0.24651879575100447, + "grad_norm": 0.8845154047012329, + "learning_rate": 9.64400894580634e-06, + "loss": 0.8135, + "step": 4479 + }, + { + "epoch": 0.24657383455336013, + "grad_norm": 0.7394443154335022, + "learning_rate": 9.643848295641167e-06, + "loss": 0.7697, + "step": 4480 + }, + { + "epoch": 0.2466288733557158, + "grad_norm": 0.8840840458869934, + "learning_rate": 9.643687610574073e-06, + "loss": 0.825, + "step": 4481 + }, + { + "epoch": 0.24668391215807145, + "grad_norm": 0.7924874424934387, + "learning_rate": 9.643526890606265e-06, + "loss": 0.793, + "step": 4482 + }, + { + "epoch": 0.2467389509604271, + "grad_norm": 0.7966769933700562, + "learning_rate": 9.643366135738951e-06, + "loss": 0.8042, + "step": 4483 + }, + { + "epoch": 0.24679398976278277, + "grad_norm": 0.911756694316864, + "learning_rate": 9.643205345973343e-06, + "loss": 0.7801, + "step": 4484 + }, + { + "epoch": 0.24684902856513843, + "grad_norm": 0.903378963470459, + "learning_rate": 9.643044521310645e-06, + "loss": 0.7863, + "step": 4485 + }, + { + "epoch": 0.2469040673674941, + "grad_norm": 0.9021226167678833, + "learning_rate": 9.642883661752067e-06, + "loss": 0.8005, + "step": 4486 + }, + { + "epoch": 0.24695910616984976, + "grad_norm": 0.8853413462638855, + "learning_rate": 9.64272276729882e-06, + "loss": 0.8371, + "step": 4487 + }, + { + "epoch": 0.24701414497220542, + "grad_norm": 1.0654630661010742, + "learning_rate": 9.642561837952108e-06, + "loss": 0.92, + "step": 4488 + }, + { + "epoch": 0.24706918377456105, + "grad_norm": 0.8663573265075684, + "learning_rate": 9.642400873713146e-06, + "loss": 0.8066, + "step": 4489 + }, + { + "epoch": 0.2471242225769167, + "grad_norm": 0.7483134269714355, + "learning_rate": 9.642239874583143e-06, + "loss": 0.9013, + "step": 4490 + }, + { + "epoch": 0.24717926137927237, + "grad_norm": 0.7582293748855591, + "learning_rate": 9.642078840563306e-06, + "loss": 0.7795, + "step": 4491 + }, + { + "epoch": 0.24723430018162804, + "grad_norm": 0.8276637196540833, + "learning_rate": 9.641917771654848e-06, + "loss": 0.7756, + "step": 4492 + }, + { + "epoch": 0.2472893389839837, + "grad_norm": 0.697088360786438, + "learning_rate": 9.641756667858976e-06, + "loss": 0.7092, + "step": 4493 + }, + { + "epoch": 0.24734437778633936, + "grad_norm": 0.8960816860198975, + "learning_rate": 9.641595529176907e-06, + "loss": 0.8835, + "step": 4494 + }, + { + "epoch": 0.24739941658869502, + "grad_norm": 0.9210898280143738, + "learning_rate": 9.641434355609846e-06, + "loss": 0.7881, + "step": 4495 + }, + { + "epoch": 0.24745445539105068, + "grad_norm": 0.7205467820167542, + "learning_rate": 9.64127314715901e-06, + "loss": 0.7204, + "step": 4496 + }, + { + "epoch": 0.24750949419340634, + "grad_norm": 0.7313701510429382, + "learning_rate": 9.641111903825603e-06, + "loss": 0.8296, + "step": 4497 + }, + { + "epoch": 0.247564532995762, + "grad_norm": 0.771159827709198, + "learning_rate": 9.640950625610845e-06, + "loss": 0.7974, + "step": 4498 + }, + { + "epoch": 0.24761957179811767, + "grad_norm": 0.9227705597877502, + "learning_rate": 9.64078931251594e-06, + "loss": 0.9215, + "step": 4499 + }, + { + "epoch": 0.24767461060047333, + "grad_norm": 0.7569915652275085, + "learning_rate": 9.64062796454211e-06, + "loss": 0.83, + "step": 4500 + }, + { + "epoch": 0.247729649402829, + "grad_norm": 0.7453131675720215, + "learning_rate": 9.64046658169056e-06, + "loss": 0.6747, + "step": 4501 + }, + { + "epoch": 0.24778468820518465, + "grad_norm": 0.7228132486343384, + "learning_rate": 9.640305163962504e-06, + "loss": 0.7535, + "step": 4502 + }, + { + "epoch": 0.2478397270075403, + "grad_norm": 0.8160690069198608, + "learning_rate": 9.640143711359159e-06, + "loss": 0.8655, + "step": 4503 + }, + { + "epoch": 0.24789476580989597, + "grad_norm": 0.7641691565513611, + "learning_rate": 9.639982223881735e-06, + "loss": 0.8353, + "step": 4504 + }, + { + "epoch": 0.24794980461225163, + "grad_norm": 0.8669107556343079, + "learning_rate": 9.639820701531445e-06, + "loss": 0.8614, + "step": 4505 + }, + { + "epoch": 0.2480048434146073, + "grad_norm": 0.7433111667633057, + "learning_rate": 9.639659144309508e-06, + "loss": 0.6891, + "step": 4506 + }, + { + "epoch": 0.24805988221696296, + "grad_norm": 1.4303346872329712, + "learning_rate": 9.639497552217131e-06, + "loss": 0.8016, + "step": 4507 + }, + { + "epoch": 0.24811492101931862, + "grad_norm": 0.8684772253036499, + "learning_rate": 9.639335925255535e-06, + "loss": 0.8324, + "step": 4508 + }, + { + "epoch": 0.24816995982167428, + "grad_norm": 0.9222162365913391, + "learning_rate": 9.639174263425932e-06, + "loss": 0.8715, + "step": 4509 + }, + { + "epoch": 0.24822499862402994, + "grad_norm": 0.9789180755615234, + "learning_rate": 9.639012566729535e-06, + "loss": 0.823, + "step": 4510 + }, + { + "epoch": 0.2482800374263856, + "grad_norm": 0.8475140333175659, + "learning_rate": 9.638850835167564e-06, + "loss": 0.768, + "step": 4511 + }, + { + "epoch": 0.24833507622874126, + "grad_norm": 0.7943722605705261, + "learning_rate": 9.63868906874123e-06, + "loss": 0.788, + "step": 4512 + }, + { + "epoch": 0.24839011503109693, + "grad_norm": 0.8723915815353394, + "learning_rate": 9.63852726745175e-06, + "loss": 0.7865, + "step": 4513 + }, + { + "epoch": 0.2484451538334526, + "grad_norm": 0.837001383304596, + "learning_rate": 9.638365431300342e-06, + "loss": 0.7799, + "step": 4514 + }, + { + "epoch": 0.24850019263580825, + "grad_norm": 0.7992665767669678, + "learning_rate": 9.638203560288222e-06, + "loss": 0.8951, + "step": 4515 + }, + { + "epoch": 0.2485552314381639, + "grad_norm": 0.8712993264198303, + "learning_rate": 9.638041654416603e-06, + "loss": 0.8157, + "step": 4516 + }, + { + "epoch": 0.24861027024051957, + "grad_norm": 0.7176356911659241, + "learning_rate": 9.637879713686706e-06, + "loss": 0.8197, + "step": 4517 + }, + { + "epoch": 0.24866530904287523, + "grad_norm": 0.7624368071556091, + "learning_rate": 9.637717738099747e-06, + "loss": 0.7545, + "step": 4518 + }, + { + "epoch": 0.2487203478452309, + "grad_norm": 0.857222318649292, + "learning_rate": 9.637555727656943e-06, + "loss": 0.8146, + "step": 4519 + }, + { + "epoch": 0.24877538664758655, + "grad_norm": 0.7461313605308533, + "learning_rate": 9.637393682359511e-06, + "loss": 0.8569, + "step": 4520 + }, + { + "epoch": 0.24883042544994222, + "grad_norm": 0.8491896986961365, + "learning_rate": 9.637231602208668e-06, + "loss": 0.863, + "step": 4521 + }, + { + "epoch": 0.24888546425229788, + "grad_norm": 0.8139386177062988, + "learning_rate": 9.637069487205635e-06, + "loss": 0.7105, + "step": 4522 + }, + { + "epoch": 0.24894050305465354, + "grad_norm": 0.7782894968986511, + "learning_rate": 9.636907337351629e-06, + "loss": 0.8044, + "step": 4523 + }, + { + "epoch": 0.2489955418570092, + "grad_norm": 0.8225486874580383, + "learning_rate": 9.636745152647868e-06, + "loss": 0.7877, + "step": 4524 + }, + { + "epoch": 0.24905058065936486, + "grad_norm": 0.9087927341461182, + "learning_rate": 9.636582933095573e-06, + "loss": 0.8017, + "step": 4525 + }, + { + "epoch": 0.24910561946172052, + "grad_norm": 0.7392508387565613, + "learning_rate": 9.636420678695962e-06, + "loss": 0.7953, + "step": 4526 + }, + { + "epoch": 0.24916065826407618, + "grad_norm": 0.7906273007392883, + "learning_rate": 9.636258389450253e-06, + "loss": 0.9491, + "step": 4527 + }, + { + "epoch": 0.24921569706643185, + "grad_norm": 0.840394139289856, + "learning_rate": 9.636096065359666e-06, + "loss": 0.8621, + "step": 4528 + }, + { + "epoch": 0.2492707358687875, + "grad_norm": 0.7923862934112549, + "learning_rate": 9.635933706425424e-06, + "loss": 0.8215, + "step": 4529 + }, + { + "epoch": 0.24932577467114317, + "grad_norm": 0.8372805714607239, + "learning_rate": 9.635771312648744e-06, + "loss": 0.8845, + "step": 4530 + }, + { + "epoch": 0.24938081347349883, + "grad_norm": 0.7569165229797363, + "learning_rate": 9.635608884030848e-06, + "loss": 0.8406, + "step": 4531 + }, + { + "epoch": 0.24943585227585446, + "grad_norm": 0.8260865807533264, + "learning_rate": 9.635446420572956e-06, + "loss": 0.8418, + "step": 4532 + }, + { + "epoch": 0.24949089107821013, + "grad_norm": 0.6841318607330322, + "learning_rate": 9.635283922276291e-06, + "loss": 0.6732, + "step": 4533 + }, + { + "epoch": 0.2495459298805658, + "grad_norm": 0.7055326104164124, + "learning_rate": 9.635121389142072e-06, + "loss": 0.7702, + "step": 4534 + }, + { + "epoch": 0.24960096868292145, + "grad_norm": 0.7293457388877869, + "learning_rate": 9.63495882117152e-06, + "loss": 0.6836, + "step": 4535 + }, + { + "epoch": 0.2496560074852771, + "grad_norm": 0.7411924004554749, + "learning_rate": 9.63479621836586e-06, + "loss": 0.8686, + "step": 4536 + }, + { + "epoch": 0.24971104628763277, + "grad_norm": 0.7864643931388855, + "learning_rate": 9.634633580726313e-06, + "loss": 0.7801, + "step": 4537 + }, + { + "epoch": 0.24976608508998843, + "grad_norm": 0.9730797410011292, + "learning_rate": 9.634470908254099e-06, + "loss": 0.8362, + "step": 4538 + }, + { + "epoch": 0.2498211238923441, + "grad_norm": 0.8390370011329651, + "learning_rate": 9.634308200950442e-06, + "loss": 0.8079, + "step": 4539 + }, + { + "epoch": 0.24987616269469975, + "grad_norm": 0.8951246738433838, + "learning_rate": 9.634145458816566e-06, + "loss": 0.7662, + "step": 4540 + }, + { + "epoch": 0.24993120149705542, + "grad_norm": 0.7654157280921936, + "learning_rate": 9.633982681853693e-06, + "loss": 0.8699, + "step": 4541 + }, + { + "epoch": 0.24998624029941108, + "grad_norm": 0.8152109980583191, + "learning_rate": 9.633819870063046e-06, + "loss": 0.7875, + "step": 4542 + }, + { + "epoch": 0.25004127910176677, + "grad_norm": 0.9407321214675903, + "learning_rate": 9.63365702344585e-06, + "loss": 0.7708, + "step": 4543 + }, + { + "epoch": 0.2500963179041224, + "grad_norm": 0.8169927597045898, + "learning_rate": 9.633494142003327e-06, + "loss": 0.8078, + "step": 4544 + }, + { + "epoch": 0.2501513567064781, + "grad_norm": 0.7380755543708801, + "learning_rate": 9.633331225736704e-06, + "loss": 0.7818, + "step": 4545 + }, + { + "epoch": 0.2502063955088337, + "grad_norm": 0.8124812841415405, + "learning_rate": 9.633168274647203e-06, + "loss": 0.8133, + "step": 4546 + }, + { + "epoch": 0.2502614343111894, + "grad_norm": 0.8511367440223694, + "learning_rate": 9.63300528873605e-06, + "loss": 0.7747, + "step": 4547 + }, + { + "epoch": 0.25031647311354505, + "grad_norm": 0.7305121421813965, + "learning_rate": 9.632842268004469e-06, + "loss": 0.8479, + "step": 4548 + }, + { + "epoch": 0.25037151191590074, + "grad_norm": 0.7127692103385925, + "learning_rate": 9.632679212453686e-06, + "loss": 0.8514, + "step": 4549 + }, + { + "epoch": 0.25042655071825637, + "grad_norm": 0.8251872062683105, + "learning_rate": 9.632516122084926e-06, + "loss": 0.7686, + "step": 4550 + }, + { + "epoch": 0.25048158952061206, + "grad_norm": 0.6756613850593567, + "learning_rate": 9.632352996899413e-06, + "loss": 0.5959, + "step": 4551 + }, + { + "epoch": 0.2505366283229677, + "grad_norm": 0.9266120791435242, + "learning_rate": 9.632189836898377e-06, + "loss": 0.7889, + "step": 4552 + }, + { + "epoch": 0.2505916671253233, + "grad_norm": 0.769890546798706, + "learning_rate": 9.63202664208304e-06, + "loss": 0.7864, + "step": 4553 + }, + { + "epoch": 0.250646705927679, + "grad_norm": 0.7314025163650513, + "learning_rate": 9.631863412454634e-06, + "loss": 0.8088, + "step": 4554 + }, + { + "epoch": 0.25070174473003465, + "grad_norm": 0.818317711353302, + "learning_rate": 9.63170014801438e-06, + "loss": 0.7096, + "step": 4555 + }, + { + "epoch": 0.25075678353239034, + "grad_norm": 0.7538807392120361, + "learning_rate": 9.631536848763508e-06, + "loss": 0.7779, + "step": 4556 + }, + { + "epoch": 0.25081182233474597, + "grad_norm": 0.7658100128173828, + "learning_rate": 9.631373514703247e-06, + "loss": 0.8535, + "step": 4557 + }, + { + "epoch": 0.25086686113710166, + "grad_norm": 0.8019290566444397, + "learning_rate": 9.631210145834819e-06, + "loss": 0.8141, + "step": 4558 + }, + { + "epoch": 0.2509218999394573, + "grad_norm": 0.7257653474807739, + "learning_rate": 9.631046742159456e-06, + "loss": 0.7451, + "step": 4559 + }, + { + "epoch": 0.250976938741813, + "grad_norm": 0.7546024918556213, + "learning_rate": 9.630883303678386e-06, + "loss": 0.7707, + "step": 4560 + }, + { + "epoch": 0.2510319775441686, + "grad_norm": 0.7288938760757446, + "learning_rate": 9.630719830392835e-06, + "loss": 0.7362, + "step": 4561 + }, + { + "epoch": 0.2510870163465243, + "grad_norm": 0.7814223170280457, + "learning_rate": 9.630556322304036e-06, + "loss": 0.8514, + "step": 4562 + }, + { + "epoch": 0.25114205514887994, + "grad_norm": 0.7561381459236145, + "learning_rate": 9.630392779413214e-06, + "loss": 0.7659, + "step": 4563 + }, + { + "epoch": 0.25119709395123563, + "grad_norm": 0.750641942024231, + "learning_rate": 9.6302292017216e-06, + "loss": 0.8496, + "step": 4564 + }, + { + "epoch": 0.25125213275359126, + "grad_norm": 0.832155704498291, + "learning_rate": 9.630065589230422e-06, + "loss": 0.7778, + "step": 4565 + }, + { + "epoch": 0.25130717155594695, + "grad_norm": 0.8202440142631531, + "learning_rate": 9.62990194194091e-06, + "loss": 0.8962, + "step": 4566 + }, + { + "epoch": 0.2513622103583026, + "grad_norm": 0.8777977824211121, + "learning_rate": 9.629738259854295e-06, + "loss": 0.7215, + "step": 4567 + }, + { + "epoch": 0.2514172491606583, + "grad_norm": 1.1868599653244019, + "learning_rate": 9.629574542971806e-06, + "loss": 0.8238, + "step": 4568 + }, + { + "epoch": 0.2514722879630139, + "grad_norm": 0.9128753542900085, + "learning_rate": 9.629410791294675e-06, + "loss": 0.7638, + "step": 4569 + }, + { + "epoch": 0.2515273267653696, + "grad_norm": 0.7350082993507385, + "learning_rate": 9.629247004824132e-06, + "loss": 0.8041, + "step": 4570 + }, + { + "epoch": 0.25158236556772523, + "grad_norm": 0.7279660701751709, + "learning_rate": 9.629083183561407e-06, + "loss": 0.7377, + "step": 4571 + }, + { + "epoch": 0.2516374043700809, + "grad_norm": 0.8570461273193359, + "learning_rate": 9.628919327507732e-06, + "loss": 0.8106, + "step": 4572 + }, + { + "epoch": 0.25169244317243655, + "grad_norm": 0.8998312950134277, + "learning_rate": 9.62875543666434e-06, + "loss": 0.8171, + "step": 4573 + }, + { + "epoch": 0.25174748197479224, + "grad_norm": 0.7631624937057495, + "learning_rate": 9.628591511032456e-06, + "loss": 0.7871, + "step": 4574 + }, + { + "epoch": 0.2518025207771479, + "grad_norm": 0.7752320766448975, + "learning_rate": 9.628427550613322e-06, + "loss": 0.8241, + "step": 4575 + }, + { + "epoch": 0.25185755957950356, + "grad_norm": 0.8741563558578491, + "learning_rate": 9.628263555408163e-06, + "loss": 0.7312, + "step": 4576 + }, + { + "epoch": 0.2519125983818592, + "grad_norm": 0.8615008592605591, + "learning_rate": 9.628099525418216e-06, + "loss": 0.8586, + "step": 4577 + }, + { + "epoch": 0.2519676371842149, + "grad_norm": 0.8273662328720093, + "learning_rate": 9.62793546064471e-06, + "loss": 0.7838, + "step": 4578 + }, + { + "epoch": 0.2520226759865705, + "grad_norm": 0.7454090118408203, + "learning_rate": 9.627771361088882e-06, + "loss": 0.8461, + "step": 4579 + }, + { + "epoch": 0.2520777147889262, + "grad_norm": 0.8225379586219788, + "learning_rate": 9.627607226751962e-06, + "loss": 0.7792, + "step": 4580 + }, + { + "epoch": 0.25213275359128184, + "grad_norm": 0.8655416369438171, + "learning_rate": 9.627443057635184e-06, + "loss": 0.8165, + "step": 4581 + }, + { + "epoch": 0.25218779239363753, + "grad_norm": 0.7735984921455383, + "learning_rate": 9.627278853739783e-06, + "loss": 0.8208, + "step": 4582 + }, + { + "epoch": 0.25224283119599317, + "grad_norm": 0.8293350338935852, + "learning_rate": 9.627114615066994e-06, + "loss": 0.7394, + "step": 4583 + }, + { + "epoch": 0.25229786999834886, + "grad_norm": 0.7840214371681213, + "learning_rate": 9.626950341618048e-06, + "loss": 0.8522, + "step": 4584 + }, + { + "epoch": 0.2523529088007045, + "grad_norm": 0.7724186182022095, + "learning_rate": 9.626786033394185e-06, + "loss": 0.8175, + "step": 4585 + }, + { + "epoch": 0.2524079476030602, + "grad_norm": 1.0751588344573975, + "learning_rate": 9.626621690396634e-06, + "loss": 0.9229, + "step": 4586 + }, + { + "epoch": 0.2524629864054158, + "grad_norm": 0.7016913294792175, + "learning_rate": 9.626457312626634e-06, + "loss": 0.6883, + "step": 4587 + }, + { + "epoch": 0.2525180252077715, + "grad_norm": 0.918377697467804, + "learning_rate": 9.626292900085419e-06, + "loss": 0.7889, + "step": 4588 + }, + { + "epoch": 0.25257306401012714, + "grad_norm": 1.006564736366272, + "learning_rate": 9.626128452774226e-06, + "loss": 0.7888, + "step": 4589 + }, + { + "epoch": 0.2526281028124828, + "grad_norm": 1.0214998722076416, + "learning_rate": 9.625963970694287e-06, + "loss": 0.768, + "step": 4590 + }, + { + "epoch": 0.25268314161483846, + "grad_norm": 0.7980843186378479, + "learning_rate": 9.625799453846844e-06, + "loss": 0.8662, + "step": 4591 + }, + { + "epoch": 0.25273818041719415, + "grad_norm": 0.734582245349884, + "learning_rate": 9.625634902233128e-06, + "loss": 0.759, + "step": 4592 + }, + { + "epoch": 0.2527932192195498, + "grad_norm": 0.7185904383659363, + "learning_rate": 9.62547031585438e-06, + "loss": 0.774, + "step": 4593 + }, + { + "epoch": 0.25284825802190547, + "grad_norm": 0.7356622219085693, + "learning_rate": 9.625305694711835e-06, + "loss": 0.7435, + "step": 4594 + }, + { + "epoch": 0.2529032968242611, + "grad_norm": 0.7589355707168579, + "learning_rate": 9.62514103880673e-06, + "loss": 0.807, + "step": 4595 + }, + { + "epoch": 0.25295833562661674, + "grad_norm": 0.889228880405426, + "learning_rate": 9.624976348140305e-06, + "loss": 0.8609, + "step": 4596 + }, + { + "epoch": 0.2530133744289724, + "grad_norm": 0.7546125650405884, + "learning_rate": 9.624811622713793e-06, + "loss": 0.8379, + "step": 4597 + }, + { + "epoch": 0.25306841323132806, + "grad_norm": 0.8262770175933838, + "learning_rate": 9.624646862528436e-06, + "loss": 0.7611, + "step": 4598 + }, + { + "epoch": 0.25312345203368375, + "grad_norm": 0.8876076936721802, + "learning_rate": 9.624482067585472e-06, + "loss": 0.8106, + "step": 4599 + }, + { + "epoch": 0.2531784908360394, + "grad_norm": 0.7045544981956482, + "learning_rate": 9.624317237886137e-06, + "loss": 0.7121, + "step": 4600 + }, + { + "epoch": 0.25323352963839507, + "grad_norm": 0.7693355083465576, + "learning_rate": 9.624152373431672e-06, + "loss": 0.8052, + "step": 4601 + }, + { + "epoch": 0.2532885684407507, + "grad_norm": 0.8072683811187744, + "learning_rate": 9.623987474223316e-06, + "loss": 0.8543, + "step": 4602 + }, + { + "epoch": 0.2533436072431064, + "grad_norm": 0.8158687949180603, + "learning_rate": 9.62382254026231e-06, + "loss": 0.6922, + "step": 4603 + }, + { + "epoch": 0.25339864604546203, + "grad_norm": 0.7688641548156738, + "learning_rate": 9.623657571549887e-06, + "loss": 0.7198, + "step": 4604 + }, + { + "epoch": 0.2534536848478177, + "grad_norm": 0.7806578278541565, + "learning_rate": 9.623492568087293e-06, + "loss": 0.8539, + "step": 4605 + }, + { + "epoch": 0.25350872365017335, + "grad_norm": 0.9557347893714905, + "learning_rate": 9.623327529875769e-06, + "loss": 0.6996, + "step": 4606 + }, + { + "epoch": 0.25356376245252904, + "grad_norm": 0.9465067386627197, + "learning_rate": 9.62316245691655e-06, + "loss": 0.8756, + "step": 4607 + }, + { + "epoch": 0.2536188012548847, + "grad_norm": 0.8029165863990784, + "learning_rate": 9.62299734921088e-06, + "loss": 0.8573, + "step": 4608 + }, + { + "epoch": 0.25367384005724036, + "grad_norm": 0.7530128955841064, + "learning_rate": 9.62283220676e-06, + "loss": 0.7466, + "step": 4609 + }, + { + "epoch": 0.253728878859596, + "grad_norm": 0.6704453825950623, + "learning_rate": 9.622667029565151e-06, + "loss": 0.6512, + "step": 4610 + }, + { + "epoch": 0.2537839176619517, + "grad_norm": 0.7162728309631348, + "learning_rate": 9.622501817627574e-06, + "loss": 0.7615, + "step": 4611 + }, + { + "epoch": 0.2538389564643073, + "grad_norm": 0.7599188089370728, + "learning_rate": 9.622336570948509e-06, + "loss": 0.8463, + "step": 4612 + }, + { + "epoch": 0.253893995266663, + "grad_norm": 0.7922326922416687, + "learning_rate": 9.6221712895292e-06, + "loss": 0.9221, + "step": 4613 + }, + { + "epoch": 0.25394903406901864, + "grad_norm": 1.4635218381881714, + "learning_rate": 9.622005973370892e-06, + "loss": 0.9159, + "step": 4614 + }, + { + "epoch": 0.25400407287137433, + "grad_norm": 0.8695057034492493, + "learning_rate": 9.62184062247482e-06, + "loss": 0.6792, + "step": 4615 + }, + { + "epoch": 0.25405911167372996, + "grad_norm": 0.8070930242538452, + "learning_rate": 9.621675236842235e-06, + "loss": 0.8257, + "step": 4616 + }, + { + "epoch": 0.25411415047608565, + "grad_norm": 0.8642075061798096, + "learning_rate": 9.621509816474372e-06, + "loss": 0.8223, + "step": 4617 + }, + { + "epoch": 0.2541691892784413, + "grad_norm": 0.7131080031394958, + "learning_rate": 9.621344361372483e-06, + "loss": 0.6831, + "step": 4618 + }, + { + "epoch": 0.254224228080797, + "grad_norm": 0.7582216262817383, + "learning_rate": 9.621178871537804e-06, + "loss": 0.8091, + "step": 4619 + }, + { + "epoch": 0.2542792668831526, + "grad_norm": 0.7705016732215881, + "learning_rate": 9.62101334697158e-06, + "loss": 0.7537, + "step": 4620 + }, + { + "epoch": 0.2543343056855083, + "grad_norm": 0.7638342976570129, + "learning_rate": 9.62084778767506e-06, + "loss": 0.7661, + "step": 4621 + }, + { + "epoch": 0.25438934448786393, + "grad_norm": 0.9296607971191406, + "learning_rate": 9.620682193649482e-06, + "loss": 0.8875, + "step": 4622 + }, + { + "epoch": 0.2544443832902196, + "grad_norm": 0.795394778251648, + "learning_rate": 9.620516564896096e-06, + "loss": 0.6884, + "step": 4623 + }, + { + "epoch": 0.25449942209257526, + "grad_norm": 0.9164957404136658, + "learning_rate": 9.620350901416142e-06, + "loss": 0.8693, + "step": 4624 + }, + { + "epoch": 0.25455446089493095, + "grad_norm": 0.8306281566619873, + "learning_rate": 9.62018520321087e-06, + "loss": 0.8972, + "step": 4625 + }, + { + "epoch": 0.2546094996972866, + "grad_norm": 0.778831422328949, + "learning_rate": 9.620019470281521e-06, + "loss": 0.7574, + "step": 4626 + }, + { + "epoch": 0.25466453849964227, + "grad_norm": 0.9326225519180298, + "learning_rate": 9.619853702629343e-06, + "loss": 0.7712, + "step": 4627 + }, + { + "epoch": 0.2547195773019979, + "grad_norm": 0.8772255182266235, + "learning_rate": 9.619687900255581e-06, + "loss": 0.8241, + "step": 4628 + }, + { + "epoch": 0.2547746161043536, + "grad_norm": 0.8777550458908081, + "learning_rate": 9.619522063161482e-06, + "loss": 0.8724, + "step": 4629 + }, + { + "epoch": 0.2548296549067092, + "grad_norm": 0.8332602381706238, + "learning_rate": 9.61935619134829e-06, + "loss": 0.8716, + "step": 4630 + }, + { + "epoch": 0.2548846937090649, + "grad_norm": 0.8246355056762695, + "learning_rate": 9.619190284817255e-06, + "loss": 0.7789, + "step": 4631 + }, + { + "epoch": 0.25493973251142055, + "grad_norm": 0.7200644612312317, + "learning_rate": 9.61902434356962e-06, + "loss": 0.7956, + "step": 4632 + }, + { + "epoch": 0.25499477131377624, + "grad_norm": 0.827756404876709, + "learning_rate": 9.618858367606638e-06, + "loss": 0.7925, + "step": 4633 + }, + { + "epoch": 0.25504981011613187, + "grad_norm": 0.7749341726303101, + "learning_rate": 9.618692356929551e-06, + "loss": 0.8706, + "step": 4634 + }, + { + "epoch": 0.25510484891848756, + "grad_norm": 0.7233432531356812, + "learning_rate": 9.618526311539608e-06, + "loss": 0.7725, + "step": 4635 + }, + { + "epoch": 0.2551598877208432, + "grad_norm": 0.846340537071228, + "learning_rate": 9.618360231438058e-06, + "loss": 0.8758, + "step": 4636 + }, + { + "epoch": 0.2552149265231989, + "grad_norm": 0.8262908458709717, + "learning_rate": 9.61819411662615e-06, + "loss": 0.7758, + "step": 4637 + }, + { + "epoch": 0.2552699653255545, + "grad_norm": 0.7829110026359558, + "learning_rate": 9.61802796710513e-06, + "loss": 0.8494, + "step": 4638 + }, + { + "epoch": 0.25532500412791015, + "grad_norm": 0.7480815649032593, + "learning_rate": 9.617861782876247e-06, + "loss": 0.7639, + "step": 4639 + }, + { + "epoch": 0.25538004293026584, + "grad_norm": 0.8782994747161865, + "learning_rate": 9.617695563940752e-06, + "loss": 0.9651, + "step": 4640 + }, + { + "epoch": 0.25543508173262147, + "grad_norm": 0.7215868234634399, + "learning_rate": 9.617529310299895e-06, + "loss": 0.7833, + "step": 4641 + }, + { + "epoch": 0.25549012053497716, + "grad_norm": 0.8287535905838013, + "learning_rate": 9.617363021954922e-06, + "loss": 0.901, + "step": 4642 + }, + { + "epoch": 0.2555451593373328, + "grad_norm": 0.7679935097694397, + "learning_rate": 9.617196698907084e-06, + "loss": 0.761, + "step": 4643 + }, + { + "epoch": 0.2556001981396885, + "grad_norm": 0.7765942811965942, + "learning_rate": 9.617030341157632e-06, + "loss": 0.7356, + "step": 4644 + }, + { + "epoch": 0.2556552369420441, + "grad_norm": 0.6964583396911621, + "learning_rate": 9.616863948707816e-06, + "loss": 0.7683, + "step": 4645 + }, + { + "epoch": 0.2557102757443998, + "grad_norm": 0.8031953573226929, + "learning_rate": 9.616697521558886e-06, + "loss": 0.7875, + "step": 4646 + }, + { + "epoch": 0.25576531454675544, + "grad_norm": 0.7155965566635132, + "learning_rate": 9.616531059712094e-06, + "loss": 0.6516, + "step": 4647 + }, + { + "epoch": 0.25582035334911113, + "grad_norm": 0.6870070099830627, + "learning_rate": 9.61636456316869e-06, + "loss": 0.7217, + "step": 4648 + }, + { + "epoch": 0.25587539215146676, + "grad_norm": 0.7686315774917603, + "learning_rate": 9.616198031929926e-06, + "loss": 0.8136, + "step": 4649 + }, + { + "epoch": 0.25593043095382245, + "grad_norm": 0.7532772421836853, + "learning_rate": 9.616031465997054e-06, + "loss": 0.696, + "step": 4650 + }, + { + "epoch": 0.2559854697561781, + "grad_norm": 0.8111574053764343, + "learning_rate": 9.615864865371323e-06, + "loss": 0.8501, + "step": 4651 + }, + { + "epoch": 0.2560405085585338, + "grad_norm": 0.771065890789032, + "learning_rate": 9.615698230053989e-06, + "loss": 0.7417, + "step": 4652 + }, + { + "epoch": 0.2560955473608894, + "grad_norm": 0.7468003034591675, + "learning_rate": 9.6155315600463e-06, + "loss": 0.7303, + "step": 4653 + }, + { + "epoch": 0.2561505861632451, + "grad_norm": 0.8041057586669922, + "learning_rate": 9.615364855349514e-06, + "loss": 0.8689, + "step": 4654 + }, + { + "epoch": 0.25620562496560073, + "grad_norm": 0.8439033627510071, + "learning_rate": 9.61519811596488e-06, + "loss": 0.8654, + "step": 4655 + }, + { + "epoch": 0.2562606637679564, + "grad_norm": 0.7768430113792419, + "learning_rate": 9.615031341893653e-06, + "loss": 0.8789, + "step": 4656 + }, + { + "epoch": 0.25631570257031205, + "grad_norm": 0.712876558303833, + "learning_rate": 9.614864533137086e-06, + "loss": 0.7497, + "step": 4657 + }, + { + "epoch": 0.25637074137266774, + "grad_norm": 0.7586949467658997, + "learning_rate": 9.614697689696431e-06, + "loss": 0.81, + "step": 4658 + }, + { + "epoch": 0.2564257801750234, + "grad_norm": 0.717078447341919, + "learning_rate": 9.614530811572946e-06, + "loss": 0.8023, + "step": 4659 + }, + { + "epoch": 0.25648081897737907, + "grad_norm": 0.7369407415390015, + "learning_rate": 9.61436389876788e-06, + "loss": 0.784, + "step": 4660 + }, + { + "epoch": 0.2565358577797347, + "grad_norm": 0.7536265850067139, + "learning_rate": 9.61419695128249e-06, + "loss": 0.7687, + "step": 4661 + }, + { + "epoch": 0.2565908965820904, + "grad_norm": 0.9718124866485596, + "learning_rate": 9.614029969118033e-06, + "loss": 0.8495, + "step": 4662 + }, + { + "epoch": 0.256645935384446, + "grad_norm": 1.1578630208969116, + "learning_rate": 9.613862952275762e-06, + "loss": 0.9189, + "step": 4663 + }, + { + "epoch": 0.2567009741868017, + "grad_norm": 0.7752498984336853, + "learning_rate": 9.613695900756929e-06, + "loss": 0.7677, + "step": 4664 + }, + { + "epoch": 0.25675601298915735, + "grad_norm": 0.9640393257141113, + "learning_rate": 9.613528814562795e-06, + "loss": 0.719, + "step": 4665 + }, + { + "epoch": 0.25681105179151303, + "grad_norm": 0.7690972089767456, + "learning_rate": 9.613361693694614e-06, + "loss": 0.7977, + "step": 4666 + }, + { + "epoch": 0.25686609059386867, + "grad_norm": 0.8390190601348877, + "learning_rate": 9.61319453815364e-06, + "loss": 0.8032, + "step": 4667 + }, + { + "epoch": 0.25692112939622436, + "grad_norm": 0.8293220400810242, + "learning_rate": 9.613027347941131e-06, + "loss": 0.8645, + "step": 4668 + }, + { + "epoch": 0.25697616819858, + "grad_norm": 0.8020731210708618, + "learning_rate": 9.612860123058344e-06, + "loss": 0.8374, + "step": 4669 + }, + { + "epoch": 0.2570312070009357, + "grad_norm": 0.7756736278533936, + "learning_rate": 9.612692863506534e-06, + "loss": 0.7318, + "step": 4670 + }, + { + "epoch": 0.2570862458032913, + "grad_norm": 0.895416259765625, + "learning_rate": 9.61252556928696e-06, + "loss": 0.9654, + "step": 4671 + }, + { + "epoch": 0.257141284605647, + "grad_norm": 0.8647375106811523, + "learning_rate": 9.61235824040088e-06, + "loss": 0.7411, + "step": 4672 + }, + { + "epoch": 0.25719632340800264, + "grad_norm": 0.6927250623703003, + "learning_rate": 9.612190876849546e-06, + "loss": 0.7558, + "step": 4673 + }, + { + "epoch": 0.2572513622103583, + "grad_norm": 0.7614898085594177, + "learning_rate": 9.612023478634222e-06, + "loss": 0.7696, + "step": 4674 + }, + { + "epoch": 0.25730640101271396, + "grad_norm": 0.7910586595535278, + "learning_rate": 9.611856045756166e-06, + "loss": 0.8207, + "step": 4675 + }, + { + "epoch": 0.25736143981506965, + "grad_norm": 0.7330125570297241, + "learning_rate": 9.611688578216632e-06, + "loss": 0.8615, + "step": 4676 + }, + { + "epoch": 0.2574164786174253, + "grad_norm": 0.7703417539596558, + "learning_rate": 9.611521076016882e-06, + "loss": 0.8321, + "step": 4677 + }, + { + "epoch": 0.25747151741978097, + "grad_norm": 0.7121796607971191, + "learning_rate": 9.611353539158174e-06, + "loss": 0.8228, + "step": 4678 + }, + { + "epoch": 0.2575265562221366, + "grad_norm": 0.8313117027282715, + "learning_rate": 9.611185967641768e-06, + "loss": 0.9012, + "step": 4679 + }, + { + "epoch": 0.2575815950244923, + "grad_norm": 0.806776225566864, + "learning_rate": 9.61101836146892e-06, + "loss": 0.769, + "step": 4680 + }, + { + "epoch": 0.2576366338268479, + "grad_norm": 0.7049515843391418, + "learning_rate": 9.610850720640894e-06, + "loss": 0.7938, + "step": 4681 + }, + { + "epoch": 0.25769167262920356, + "grad_norm": 0.7286638021469116, + "learning_rate": 9.610683045158948e-06, + "loss": 0.8168, + "step": 4682 + }, + { + "epoch": 0.25774671143155925, + "grad_norm": 0.7916898727416992, + "learning_rate": 9.610515335024345e-06, + "loss": 0.7681, + "step": 4683 + }, + { + "epoch": 0.2578017502339149, + "grad_norm": 0.7649673819541931, + "learning_rate": 9.61034759023834e-06, + "loss": 0.7273, + "step": 4684 + }, + { + "epoch": 0.2578567890362706, + "grad_norm": 0.8280686736106873, + "learning_rate": 9.610179810802196e-06, + "loss": 0.7968, + "step": 4685 + }, + { + "epoch": 0.2579118278386262, + "grad_norm": 0.7206569910049438, + "learning_rate": 9.610011996717175e-06, + "loss": 0.7359, + "step": 4686 + }, + { + "epoch": 0.2579668666409819, + "grad_norm": 0.7365424036979675, + "learning_rate": 9.60984414798454e-06, + "loss": 0.7962, + "step": 4687 + }, + { + "epoch": 0.25802190544333753, + "grad_norm": 0.8030344247817993, + "learning_rate": 9.609676264605549e-06, + "loss": 0.7931, + "step": 4688 + }, + { + "epoch": 0.2580769442456932, + "grad_norm": 0.8812693357467651, + "learning_rate": 9.609508346581464e-06, + "loss": 0.8493, + "step": 4689 + }, + { + "epoch": 0.25813198304804885, + "grad_norm": 0.8026734590530396, + "learning_rate": 9.60934039391355e-06, + "loss": 0.8368, + "step": 4690 + }, + { + "epoch": 0.25818702185040454, + "grad_norm": 0.8270768523216248, + "learning_rate": 9.609172406603067e-06, + "loss": 0.9077, + "step": 4691 + }, + { + "epoch": 0.2582420606527602, + "grad_norm": 0.7362856864929199, + "learning_rate": 9.609004384651276e-06, + "loss": 0.7384, + "step": 4692 + }, + { + "epoch": 0.25829709945511586, + "grad_norm": 0.7195929288864136, + "learning_rate": 9.608836328059444e-06, + "loss": 0.8475, + "step": 4693 + }, + { + "epoch": 0.2583521382574715, + "grad_norm": 0.7653167843818665, + "learning_rate": 9.60866823682883e-06, + "loss": 0.7704, + "step": 4694 + }, + { + "epoch": 0.2584071770598272, + "grad_norm": 0.7056792974472046, + "learning_rate": 9.6085001109607e-06, + "loss": 0.7835, + "step": 4695 + }, + { + "epoch": 0.2584622158621828, + "grad_norm": 0.7299804091453552, + "learning_rate": 9.60833195045632e-06, + "loss": 0.7894, + "step": 4696 + }, + { + "epoch": 0.2585172546645385, + "grad_norm": 0.7235645055770874, + "learning_rate": 9.608163755316948e-06, + "loss": 0.8113, + "step": 4697 + }, + { + "epoch": 0.25857229346689414, + "grad_norm": 0.7066782116889954, + "learning_rate": 9.60799552554385e-06, + "loss": 0.739, + "step": 4698 + }, + { + "epoch": 0.25862733226924983, + "grad_norm": 0.769930362701416, + "learning_rate": 9.607827261138291e-06, + "loss": 0.7565, + "step": 4699 + }, + { + "epoch": 0.25868237107160547, + "grad_norm": 0.8875935077667236, + "learning_rate": 9.607658962101538e-06, + "loss": 0.849, + "step": 4700 + }, + { + "epoch": 0.25873740987396115, + "grad_norm": 0.7887380123138428, + "learning_rate": 9.60749062843485e-06, + "loss": 0.8795, + "step": 4701 + }, + { + "epoch": 0.2587924486763168, + "grad_norm": 0.7600420117378235, + "learning_rate": 9.607322260139499e-06, + "loss": 0.7581, + "step": 4702 + }, + { + "epoch": 0.2588474874786725, + "grad_norm": 0.7431491017341614, + "learning_rate": 9.607153857216746e-06, + "loss": 0.7119, + "step": 4703 + }, + { + "epoch": 0.2589025262810281, + "grad_norm": 0.7444193363189697, + "learning_rate": 9.606985419667858e-06, + "loss": 0.7492, + "step": 4704 + }, + { + "epoch": 0.2589575650833838, + "grad_norm": 0.8348917365074158, + "learning_rate": 9.6068169474941e-06, + "loss": 0.7656, + "step": 4705 + }, + { + "epoch": 0.25901260388573943, + "grad_norm": 0.6790240406990051, + "learning_rate": 9.60664844069674e-06, + "loss": 0.6354, + "step": 4706 + }, + { + "epoch": 0.2590676426880951, + "grad_norm": 0.8425769805908203, + "learning_rate": 9.606479899277044e-06, + "loss": 0.7927, + "step": 4707 + }, + { + "epoch": 0.25912268149045076, + "grad_norm": 0.7234740853309631, + "learning_rate": 9.606311323236277e-06, + "loss": 0.8122, + "step": 4708 + }, + { + "epoch": 0.25917772029280645, + "grad_norm": 0.839507520198822, + "learning_rate": 9.606142712575707e-06, + "loss": 0.8807, + "step": 4709 + }, + { + "epoch": 0.2592327590951621, + "grad_norm": 0.7155291438102722, + "learning_rate": 9.605974067296601e-06, + "loss": 0.7852, + "step": 4710 + }, + { + "epoch": 0.25928779789751777, + "grad_norm": 0.7222152352333069, + "learning_rate": 9.605805387400228e-06, + "loss": 0.7362, + "step": 4711 + }, + { + "epoch": 0.2593428366998734, + "grad_norm": 0.8350114226341248, + "learning_rate": 9.605636672887854e-06, + "loss": 0.7201, + "step": 4712 + }, + { + "epoch": 0.2593978755022291, + "grad_norm": 0.6805943250656128, + "learning_rate": 9.605467923760747e-06, + "loss": 0.6936, + "step": 4713 + }, + { + "epoch": 0.2594529143045847, + "grad_norm": 0.7863980531692505, + "learning_rate": 9.605299140020177e-06, + "loss": 0.9079, + "step": 4714 + }, + { + "epoch": 0.2595079531069404, + "grad_norm": 0.838843584060669, + "learning_rate": 9.60513032166741e-06, + "loss": 0.839, + "step": 4715 + }, + { + "epoch": 0.25956299190929605, + "grad_norm": 0.7872797250747681, + "learning_rate": 9.60496146870372e-06, + "loss": 0.9164, + "step": 4716 + }, + { + "epoch": 0.25961803071165174, + "grad_norm": 0.7300794720649719, + "learning_rate": 9.604792581130369e-06, + "loss": 0.8227, + "step": 4717 + }, + { + "epoch": 0.25967306951400737, + "grad_norm": 0.8420879244804382, + "learning_rate": 9.60462365894863e-06, + "loss": 0.7865, + "step": 4718 + }, + { + "epoch": 0.25972810831636306, + "grad_norm": 0.807697057723999, + "learning_rate": 9.604454702159771e-06, + "loss": 0.9081, + "step": 4719 + }, + { + "epoch": 0.2597831471187187, + "grad_norm": 0.9041245579719543, + "learning_rate": 9.604285710765064e-06, + "loss": 0.8102, + "step": 4720 + }, + { + "epoch": 0.2598381859210744, + "grad_norm": 0.7061690092086792, + "learning_rate": 9.604116684765779e-06, + "loss": 0.762, + "step": 4721 + }, + { + "epoch": 0.25989322472343, + "grad_norm": 0.7790346741676331, + "learning_rate": 9.603947624163186e-06, + "loss": 0.8038, + "step": 4722 + }, + { + "epoch": 0.2599482635257857, + "grad_norm": 0.8109704256057739, + "learning_rate": 9.603778528958553e-06, + "loss": 0.9105, + "step": 4723 + }, + { + "epoch": 0.26000330232814134, + "grad_norm": 0.7396997213363647, + "learning_rate": 9.603609399153153e-06, + "loss": 0.8384, + "step": 4724 + }, + { + "epoch": 0.260058341130497, + "grad_norm": 0.8594317436218262, + "learning_rate": 9.603440234748257e-06, + "loss": 0.8301, + "step": 4725 + }, + { + "epoch": 0.26011337993285266, + "grad_norm": 0.7087241411209106, + "learning_rate": 9.603271035745138e-06, + "loss": 0.6652, + "step": 4726 + }, + { + "epoch": 0.2601684187352083, + "grad_norm": 0.7405440211296082, + "learning_rate": 9.603101802145065e-06, + "loss": 0.7804, + "step": 4727 + }, + { + "epoch": 0.260223457537564, + "grad_norm": 0.8637508749961853, + "learning_rate": 9.602932533949312e-06, + "loss": 0.8509, + "step": 4728 + }, + { + "epoch": 0.2602784963399196, + "grad_norm": 0.7040451765060425, + "learning_rate": 9.60276323115915e-06, + "loss": 0.7842, + "step": 4729 + }, + { + "epoch": 0.2603335351422753, + "grad_norm": 0.7743955254554749, + "learning_rate": 9.602593893775852e-06, + "loss": 0.8492, + "step": 4730 + }, + { + "epoch": 0.26038857394463094, + "grad_norm": 0.7110480070114136, + "learning_rate": 9.602424521800688e-06, + "loss": 0.7227, + "step": 4731 + }, + { + "epoch": 0.26044361274698663, + "grad_norm": 1.0066583156585693, + "learning_rate": 9.602255115234936e-06, + "loss": 0.8825, + "step": 4732 + }, + { + "epoch": 0.26049865154934226, + "grad_norm": 0.7746492624282837, + "learning_rate": 9.602085674079864e-06, + "loss": 0.8316, + "step": 4733 + }, + { + "epoch": 0.26055369035169795, + "grad_norm": 0.7394356727600098, + "learning_rate": 9.60191619833675e-06, + "loss": 0.746, + "step": 4734 + }, + { + "epoch": 0.2606087291540536, + "grad_norm": 0.7140582203865051, + "learning_rate": 9.601746688006866e-06, + "loss": 0.7204, + "step": 4735 + }, + { + "epoch": 0.2606637679564093, + "grad_norm": 0.753399133682251, + "learning_rate": 9.601577143091483e-06, + "loss": 0.8157, + "step": 4736 + }, + { + "epoch": 0.2607188067587649, + "grad_norm": 0.674320638179779, + "learning_rate": 9.601407563591881e-06, + "loss": 0.7279, + "step": 4737 + }, + { + "epoch": 0.2607738455611206, + "grad_norm": 0.855944037437439, + "learning_rate": 9.60123794950933e-06, + "loss": 0.804, + "step": 4738 + }, + { + "epoch": 0.26082888436347623, + "grad_norm": 0.6833948493003845, + "learning_rate": 9.601068300845106e-06, + "loss": 0.701, + "step": 4739 + }, + { + "epoch": 0.2608839231658319, + "grad_norm": 0.8085536360740662, + "learning_rate": 9.600898617600485e-06, + "loss": 0.8435, + "step": 4740 + }, + { + "epoch": 0.26093896196818755, + "grad_norm": 0.752849817276001, + "learning_rate": 9.600728899776741e-06, + "loss": 0.7205, + "step": 4741 + }, + { + "epoch": 0.26099400077054324, + "grad_norm": 0.7320554852485657, + "learning_rate": 9.600559147375151e-06, + "loss": 0.7556, + "step": 4742 + }, + { + "epoch": 0.2610490395728989, + "grad_norm": 0.7789202928543091, + "learning_rate": 9.600389360396988e-06, + "loss": 0.8467, + "step": 4743 + }, + { + "epoch": 0.26110407837525457, + "grad_norm": 0.8480898141860962, + "learning_rate": 9.600219538843532e-06, + "loss": 0.7762, + "step": 4744 + }, + { + "epoch": 0.2611591171776102, + "grad_norm": 0.8382542133331299, + "learning_rate": 9.600049682716055e-06, + "loss": 0.9051, + "step": 4745 + }, + { + "epoch": 0.2612141559799659, + "grad_norm": 0.8319274187088013, + "learning_rate": 9.599879792015838e-06, + "loss": 0.8221, + "step": 4746 + }, + { + "epoch": 0.2612691947823215, + "grad_norm": 0.7325875163078308, + "learning_rate": 9.599709866744156e-06, + "loss": 0.7968, + "step": 4747 + }, + { + "epoch": 0.2613242335846772, + "grad_norm": 0.7053360342979431, + "learning_rate": 9.599539906902285e-06, + "loss": 0.7073, + "step": 4748 + }, + { + "epoch": 0.26137927238703285, + "grad_norm": 0.763017475605011, + "learning_rate": 9.599369912491503e-06, + "loss": 0.7031, + "step": 4749 + }, + { + "epoch": 0.26143431118938854, + "grad_norm": 0.6816151738166809, + "learning_rate": 9.599199883513088e-06, + "loss": 0.7295, + "step": 4750 + }, + { + "epoch": 0.26148934999174417, + "grad_norm": 0.8143941164016724, + "learning_rate": 9.599029819968319e-06, + "loss": 0.8449, + "step": 4751 + }, + { + "epoch": 0.26154438879409986, + "grad_norm": 0.8093858361244202, + "learning_rate": 9.598859721858471e-06, + "loss": 0.8397, + "step": 4752 + }, + { + "epoch": 0.2615994275964555, + "grad_norm": 0.7431835532188416, + "learning_rate": 9.598689589184827e-06, + "loss": 0.7299, + "step": 4753 + }, + { + "epoch": 0.2616544663988112, + "grad_norm": 0.9871510863304138, + "learning_rate": 9.59851942194866e-06, + "loss": 0.7992, + "step": 4754 + }, + { + "epoch": 0.2617095052011668, + "grad_norm": 0.9304273724555969, + "learning_rate": 9.598349220151254e-06, + "loss": 0.7519, + "step": 4755 + }, + { + "epoch": 0.2617645440035225, + "grad_norm": 0.9361812472343445, + "learning_rate": 9.598178983793886e-06, + "loss": 0.8131, + "step": 4756 + }, + { + "epoch": 0.26181958280587814, + "grad_norm": 0.7783429622650146, + "learning_rate": 9.598008712877835e-06, + "loss": 0.7351, + "step": 4757 + }, + { + "epoch": 0.2618746216082338, + "grad_norm": 0.8739376068115234, + "learning_rate": 9.597838407404381e-06, + "loss": 0.9458, + "step": 4758 + }, + { + "epoch": 0.26192966041058946, + "grad_norm": 0.7076277732849121, + "learning_rate": 9.597668067374805e-06, + "loss": 0.7632, + "step": 4759 + }, + { + "epoch": 0.26198469921294515, + "grad_norm": 0.7652345299720764, + "learning_rate": 9.597497692790386e-06, + "loss": 0.8018, + "step": 4760 + }, + { + "epoch": 0.2620397380153008, + "grad_norm": 0.7332149147987366, + "learning_rate": 9.597327283652405e-06, + "loss": 0.8223, + "step": 4761 + }, + { + "epoch": 0.26209477681765647, + "grad_norm": 0.8361638784408569, + "learning_rate": 9.597156839962145e-06, + "loss": 0.8784, + "step": 4762 + }, + { + "epoch": 0.2621498156200121, + "grad_norm": 1.183772325515747, + "learning_rate": 9.596986361720882e-06, + "loss": 0.8768, + "step": 4763 + }, + { + "epoch": 0.2622048544223678, + "grad_norm": 0.9895418882369995, + "learning_rate": 9.596815848929902e-06, + "loss": 0.714, + "step": 4764 + }, + { + "epoch": 0.26225989322472343, + "grad_norm": 0.8210558295249939, + "learning_rate": 9.59664530159048e-06, + "loss": 0.7246, + "step": 4765 + }, + { + "epoch": 0.2623149320270791, + "grad_norm": 0.8003455996513367, + "learning_rate": 9.596474719703908e-06, + "loss": 0.8385, + "step": 4766 + }, + { + "epoch": 0.26236997082943475, + "grad_norm": 0.7555826306343079, + "learning_rate": 9.59630410327146e-06, + "loss": 0.7243, + "step": 4767 + }, + { + "epoch": 0.2624250096317904, + "grad_norm": 0.7746273279190063, + "learning_rate": 9.596133452294421e-06, + "loss": 0.8763, + "step": 4768 + }, + { + "epoch": 0.2624800484341461, + "grad_norm": 0.7238507866859436, + "learning_rate": 9.595962766774074e-06, + "loss": 0.8302, + "step": 4769 + }, + { + "epoch": 0.2625350872365017, + "grad_norm": 0.7874132394790649, + "learning_rate": 9.595792046711699e-06, + "loss": 0.7979, + "step": 4770 + }, + { + "epoch": 0.2625901260388574, + "grad_norm": 0.8792033791542053, + "learning_rate": 9.595621292108583e-06, + "loss": 0.8555, + "step": 4771 + }, + { + "epoch": 0.26264516484121303, + "grad_norm": 0.7026945948600769, + "learning_rate": 9.595450502966006e-06, + "loss": 0.718, + "step": 4772 + }, + { + "epoch": 0.2627002036435687, + "grad_norm": 0.7747959494590759, + "learning_rate": 9.595279679285254e-06, + "loss": 0.8329, + "step": 4773 + }, + { + "epoch": 0.26275524244592435, + "grad_norm": 0.697979748249054, + "learning_rate": 9.59510882106761e-06, + "loss": 0.7456, + "step": 4774 + }, + { + "epoch": 0.26281028124828004, + "grad_norm": 0.7600447535514832, + "learning_rate": 9.594937928314359e-06, + "loss": 0.875, + "step": 4775 + }, + { + "epoch": 0.2628653200506357, + "grad_norm": 0.7591384649276733, + "learning_rate": 9.594767001026783e-06, + "loss": 0.7607, + "step": 4776 + }, + { + "epoch": 0.26292035885299136, + "grad_norm": 0.9267380833625793, + "learning_rate": 9.59459603920617e-06, + "loss": 0.8926, + "step": 4777 + }, + { + "epoch": 0.262975397655347, + "grad_norm": 0.7751328349113464, + "learning_rate": 9.594425042853802e-06, + "loss": 0.7449, + "step": 4778 + }, + { + "epoch": 0.2630304364577027, + "grad_norm": 0.7066012620925903, + "learning_rate": 9.594254011970966e-06, + "loss": 0.8374, + "step": 4779 + }, + { + "epoch": 0.2630854752600583, + "grad_norm": 0.7564317584037781, + "learning_rate": 9.594082946558945e-06, + "loss": 0.735, + "step": 4780 + }, + { + "epoch": 0.263140514062414, + "grad_norm": 0.8151416182518005, + "learning_rate": 9.593911846619027e-06, + "loss": 0.8575, + "step": 4781 + }, + { + "epoch": 0.26319555286476964, + "grad_norm": 0.719261646270752, + "learning_rate": 9.593740712152497e-06, + "loss": 0.7981, + "step": 4782 + }, + { + "epoch": 0.26325059166712533, + "grad_norm": 0.8627344369888306, + "learning_rate": 9.593569543160642e-06, + "loss": 0.895, + "step": 4783 + }, + { + "epoch": 0.26330563046948097, + "grad_norm": 1.293272614479065, + "learning_rate": 9.593398339644748e-06, + "loss": 0.7531, + "step": 4784 + }, + { + "epoch": 0.26336066927183666, + "grad_norm": 0.8475207686424255, + "learning_rate": 9.593227101606102e-06, + "loss": 0.9091, + "step": 4785 + }, + { + "epoch": 0.2634157080741923, + "grad_norm": 0.78054279088974, + "learning_rate": 9.593055829045989e-06, + "loss": 0.7692, + "step": 4786 + }, + { + "epoch": 0.263470746876548, + "grad_norm": 0.7677399516105652, + "learning_rate": 9.592884521965699e-06, + "loss": 0.6232, + "step": 4787 + }, + { + "epoch": 0.2635257856789036, + "grad_norm": 0.7232677340507507, + "learning_rate": 9.59271318036652e-06, + "loss": 0.8087, + "step": 4788 + }, + { + "epoch": 0.2635808244812593, + "grad_norm": 0.8728463649749756, + "learning_rate": 9.592541804249735e-06, + "loss": 0.7824, + "step": 4789 + }, + { + "epoch": 0.26363586328361494, + "grad_norm": 0.7569910883903503, + "learning_rate": 9.592370393616637e-06, + "loss": 0.7418, + "step": 4790 + }, + { + "epoch": 0.2636909020859706, + "grad_norm": 0.7631934285163879, + "learning_rate": 9.592198948468511e-06, + "loss": 0.7929, + "step": 4791 + }, + { + "epoch": 0.26374594088832626, + "grad_norm": 0.8021631240844727, + "learning_rate": 9.592027468806649e-06, + "loss": 0.8111, + "step": 4792 + }, + { + "epoch": 0.26380097969068195, + "grad_norm": 0.9454651474952698, + "learning_rate": 9.591855954632336e-06, + "loss": 0.8239, + "step": 4793 + }, + { + "epoch": 0.2638560184930376, + "grad_norm": 0.672924280166626, + "learning_rate": 9.591684405946863e-06, + "loss": 0.6877, + "step": 4794 + }, + { + "epoch": 0.26391105729539327, + "grad_norm": 0.7942802906036377, + "learning_rate": 9.59151282275152e-06, + "loss": 0.9002, + "step": 4795 + }, + { + "epoch": 0.2639660960977489, + "grad_norm": 0.7131155133247375, + "learning_rate": 9.591341205047596e-06, + "loss": 0.7692, + "step": 4796 + }, + { + "epoch": 0.2640211349001046, + "grad_norm": 1.0395869016647339, + "learning_rate": 9.59116955283638e-06, + "loss": 0.8352, + "step": 4797 + }, + { + "epoch": 0.2640761737024602, + "grad_norm": 0.9503256678581238, + "learning_rate": 9.590997866119163e-06, + "loss": 1.0287, + "step": 4798 + }, + { + "epoch": 0.2641312125048159, + "grad_norm": 0.7539612054824829, + "learning_rate": 9.590826144897235e-06, + "loss": 0.872, + "step": 4799 + }, + { + "epoch": 0.26418625130717155, + "grad_norm": 0.7067893743515015, + "learning_rate": 9.590654389171885e-06, + "loss": 0.7636, + "step": 4800 + }, + { + "epoch": 0.26424129010952724, + "grad_norm": 0.7355281710624695, + "learning_rate": 9.590482598944407e-06, + "loss": 0.7715, + "step": 4801 + }, + { + "epoch": 0.26429632891188287, + "grad_norm": 0.7589674592018127, + "learning_rate": 9.590310774216089e-06, + "loss": 0.7451, + "step": 4802 + }, + { + "epoch": 0.26435136771423856, + "grad_norm": 0.701386034488678, + "learning_rate": 9.590138914988226e-06, + "loss": 0.7317, + "step": 4803 + }, + { + "epoch": 0.2644064065165942, + "grad_norm": 0.7663118243217468, + "learning_rate": 9.589967021262105e-06, + "loss": 0.8227, + "step": 4804 + }, + { + "epoch": 0.2644614453189499, + "grad_norm": 0.7059655785560608, + "learning_rate": 9.589795093039023e-06, + "loss": 0.7829, + "step": 4805 + }, + { + "epoch": 0.2645164841213055, + "grad_norm": 0.7377020120620728, + "learning_rate": 9.58962313032027e-06, + "loss": 0.8308, + "step": 4806 + }, + { + "epoch": 0.2645715229236612, + "grad_norm": 0.8635388612747192, + "learning_rate": 9.589451133107134e-06, + "loss": 0.7882, + "step": 4807 + }, + { + "epoch": 0.26462656172601684, + "grad_norm": 0.8282824754714966, + "learning_rate": 9.589279101400915e-06, + "loss": 0.8055, + "step": 4808 + }, + { + "epoch": 0.26468160052837253, + "grad_norm": 0.7026814818382263, + "learning_rate": 9.589107035202903e-06, + "loss": 0.7567, + "step": 4809 + }, + { + "epoch": 0.26473663933072816, + "grad_norm": 0.7575708031654358, + "learning_rate": 9.588934934514392e-06, + "loss": 0.7456, + "step": 4810 + }, + { + "epoch": 0.2647916781330838, + "grad_norm": 0.9732069969177246, + "learning_rate": 9.588762799336671e-06, + "loss": 0.8217, + "step": 4811 + }, + { + "epoch": 0.2648467169354395, + "grad_norm": 0.786803126335144, + "learning_rate": 9.58859062967104e-06, + "loss": 0.729, + "step": 4812 + }, + { + "epoch": 0.2649017557377951, + "grad_norm": 0.8068973422050476, + "learning_rate": 9.588418425518789e-06, + "loss": 0.8204, + "step": 4813 + }, + { + "epoch": 0.2649567945401508, + "grad_norm": 0.8222702145576477, + "learning_rate": 9.588246186881213e-06, + "loss": 0.8349, + "step": 4814 + }, + { + "epoch": 0.26501183334250644, + "grad_norm": 0.7560802698135376, + "learning_rate": 9.588073913759608e-06, + "loss": 0.7601, + "step": 4815 + }, + { + "epoch": 0.26506687214486213, + "grad_norm": 0.9221365451812744, + "learning_rate": 9.587901606155266e-06, + "loss": 0.7725, + "step": 4816 + }, + { + "epoch": 0.26512191094721776, + "grad_norm": 0.8092262744903564, + "learning_rate": 9.587729264069485e-06, + "loss": 0.9074, + "step": 4817 + }, + { + "epoch": 0.26517694974957345, + "grad_norm": 0.8183920979499817, + "learning_rate": 9.587556887503557e-06, + "loss": 0.8321, + "step": 4818 + }, + { + "epoch": 0.2652319885519291, + "grad_norm": 0.7023420929908752, + "learning_rate": 9.587384476458781e-06, + "loss": 0.7842, + "step": 4819 + }, + { + "epoch": 0.2652870273542848, + "grad_norm": 1.2864880561828613, + "learning_rate": 9.58721203093645e-06, + "loss": 0.7519, + "step": 4820 + }, + { + "epoch": 0.2653420661566404, + "grad_norm": 0.8133784532546997, + "learning_rate": 9.587039550937864e-06, + "loss": 0.8208, + "step": 4821 + }, + { + "epoch": 0.2653971049589961, + "grad_norm": 0.739732027053833, + "learning_rate": 9.586867036464314e-06, + "loss": 0.8553, + "step": 4822 + }, + { + "epoch": 0.26545214376135173, + "grad_norm": 0.7539162635803223, + "learning_rate": 9.5866944875171e-06, + "loss": 0.7385, + "step": 4823 + }, + { + "epoch": 0.2655071825637074, + "grad_norm": 0.8012336492538452, + "learning_rate": 9.58652190409752e-06, + "loss": 0.8343, + "step": 4824 + }, + { + "epoch": 0.26556222136606306, + "grad_norm": 0.7972521185874939, + "learning_rate": 9.586349286206865e-06, + "loss": 0.8481, + "step": 4825 + }, + { + "epoch": 0.26561726016841875, + "grad_norm": 0.7772900462150574, + "learning_rate": 9.58617663384644e-06, + "loss": 0.7655, + "step": 4826 + }, + { + "epoch": 0.2656722989707744, + "grad_norm": 0.677916944026947, + "learning_rate": 9.586003947017537e-06, + "loss": 0.696, + "step": 4827 + }, + { + "epoch": 0.26572733777313007, + "grad_norm": 0.8254117369651794, + "learning_rate": 9.585831225721455e-06, + "loss": 0.7841, + "step": 4828 + }, + { + "epoch": 0.2657823765754857, + "grad_norm": 0.7256904244422913, + "learning_rate": 9.585658469959496e-06, + "loss": 0.8057, + "step": 4829 + }, + { + "epoch": 0.2658374153778414, + "grad_norm": 0.7651757001876831, + "learning_rate": 9.585485679732953e-06, + "loss": 0.7918, + "step": 4830 + }, + { + "epoch": 0.265892454180197, + "grad_norm": 0.7581052184104919, + "learning_rate": 9.58531285504313e-06, + "loss": 0.759, + "step": 4831 + }, + { + "epoch": 0.2659474929825527, + "grad_norm": 0.7190486192703247, + "learning_rate": 9.58513999589132e-06, + "loss": 0.7403, + "step": 4832 + }, + { + "epoch": 0.26600253178490835, + "grad_norm": 0.8603141903877258, + "learning_rate": 9.584967102278825e-06, + "loss": 0.8944, + "step": 4833 + }, + { + "epoch": 0.26605757058726404, + "grad_norm": 0.806297779083252, + "learning_rate": 9.584794174206947e-06, + "loss": 0.7039, + "step": 4834 + }, + { + "epoch": 0.26611260938961967, + "grad_norm": 0.7604451775550842, + "learning_rate": 9.584621211676981e-06, + "loss": 0.8076, + "step": 4835 + }, + { + "epoch": 0.26616764819197536, + "grad_norm": 0.7276773452758789, + "learning_rate": 9.584448214690232e-06, + "loss": 0.786, + "step": 4836 + }, + { + "epoch": 0.266222686994331, + "grad_norm": 0.8737080693244934, + "learning_rate": 9.584275183247994e-06, + "loss": 0.8071, + "step": 4837 + }, + { + "epoch": 0.2662777257966867, + "grad_norm": 0.8447219133377075, + "learning_rate": 9.584102117351574e-06, + "loss": 0.7682, + "step": 4838 + }, + { + "epoch": 0.2663327645990423, + "grad_norm": 0.7001703381538391, + "learning_rate": 9.583929017002268e-06, + "loss": 0.7077, + "step": 4839 + }, + { + "epoch": 0.266387803401398, + "grad_norm": 0.7935730218887329, + "learning_rate": 9.583755882201377e-06, + "loss": 0.8122, + "step": 4840 + }, + { + "epoch": 0.26644284220375364, + "grad_norm": 0.8763312697410583, + "learning_rate": 9.583582712950207e-06, + "loss": 0.8241, + "step": 4841 + }, + { + "epoch": 0.2664978810061093, + "grad_norm": 0.7910245656967163, + "learning_rate": 9.583409509250055e-06, + "loss": 0.7717, + "step": 4842 + }, + { + "epoch": 0.26655291980846496, + "grad_norm": 0.7975226640701294, + "learning_rate": 9.583236271102222e-06, + "loss": 0.7165, + "step": 4843 + }, + { + "epoch": 0.26660795861082065, + "grad_norm": 0.8060342073440552, + "learning_rate": 9.583062998508014e-06, + "loss": 0.7659, + "step": 4844 + }, + { + "epoch": 0.2666629974131763, + "grad_norm": 0.8779375553131104, + "learning_rate": 9.582889691468732e-06, + "loss": 0.8207, + "step": 4845 + }, + { + "epoch": 0.266718036215532, + "grad_norm": 0.7409310936927795, + "learning_rate": 9.582716349985677e-06, + "loss": 0.8439, + "step": 4846 + }, + { + "epoch": 0.2667730750178876, + "grad_norm": 0.8871899843215942, + "learning_rate": 9.582542974060152e-06, + "loss": 0.8305, + "step": 4847 + }, + { + "epoch": 0.2668281138202433, + "grad_norm": 0.9003115296363831, + "learning_rate": 9.58236956369346e-06, + "loss": 0.8334, + "step": 4848 + }, + { + "epoch": 0.26688315262259893, + "grad_norm": 1.0149577856063843, + "learning_rate": 9.582196118886909e-06, + "loss": 0.7962, + "step": 4849 + }, + { + "epoch": 0.2669381914249546, + "grad_norm": 0.785214900970459, + "learning_rate": 9.582022639641795e-06, + "loss": 0.7806, + "step": 4850 + }, + { + "epoch": 0.26699323022731025, + "grad_norm": 0.9833952188491821, + "learning_rate": 9.581849125959426e-06, + "loss": 0.7607, + "step": 4851 + }, + { + "epoch": 0.26704826902966594, + "grad_norm": 1.404751181602478, + "learning_rate": 9.581675577841104e-06, + "loss": 0.9046, + "step": 4852 + }, + { + "epoch": 0.2671033078320216, + "grad_norm": 0.791159451007843, + "learning_rate": 9.581501995288137e-06, + "loss": 0.6582, + "step": 4853 + }, + { + "epoch": 0.2671583466343772, + "grad_norm": 0.8507272005081177, + "learning_rate": 9.581328378301827e-06, + "loss": 0.8946, + "step": 4854 + }, + { + "epoch": 0.2672133854367329, + "grad_norm": 0.7372786998748779, + "learning_rate": 9.58115472688348e-06, + "loss": 0.7865, + "step": 4855 + }, + { + "epoch": 0.26726842423908853, + "grad_norm": 0.8293853998184204, + "learning_rate": 9.580981041034398e-06, + "loss": 0.9113, + "step": 4856 + }, + { + "epoch": 0.2673234630414442, + "grad_norm": 0.7212402820587158, + "learning_rate": 9.580807320755889e-06, + "loss": 0.7149, + "step": 4857 + }, + { + "epoch": 0.26737850184379985, + "grad_norm": 0.7885197401046753, + "learning_rate": 9.58063356604926e-06, + "loss": 0.8651, + "step": 4858 + }, + { + "epoch": 0.26743354064615554, + "grad_norm": 0.8444308042526245, + "learning_rate": 9.580459776915814e-06, + "loss": 0.7968, + "step": 4859 + }, + { + "epoch": 0.2674885794485112, + "grad_norm": 0.7974254488945007, + "learning_rate": 9.58028595335686e-06, + "loss": 0.8499, + "step": 4860 + }, + { + "epoch": 0.26754361825086687, + "grad_norm": 0.7491242289543152, + "learning_rate": 9.580112095373702e-06, + "loss": 0.8278, + "step": 4861 + }, + { + "epoch": 0.2675986570532225, + "grad_norm": 0.6856499314308167, + "learning_rate": 9.579938202967646e-06, + "loss": 0.7466, + "step": 4862 + }, + { + "epoch": 0.2676536958555782, + "grad_norm": 0.7347447872161865, + "learning_rate": 9.579764276140002e-06, + "loss": 0.8046, + "step": 4863 + }, + { + "epoch": 0.2677087346579338, + "grad_norm": 0.6797083020210266, + "learning_rate": 9.579590314892077e-06, + "loss": 0.7012, + "step": 4864 + }, + { + "epoch": 0.2677637734602895, + "grad_norm": 0.8219562768936157, + "learning_rate": 9.579416319225175e-06, + "loss": 0.7592, + "step": 4865 + }, + { + "epoch": 0.26781881226264515, + "grad_norm": 0.7388357520103455, + "learning_rate": 9.579242289140607e-06, + "loss": 0.8179, + "step": 4866 + }, + { + "epoch": 0.26787385106500083, + "grad_norm": 0.7394490838050842, + "learning_rate": 9.579068224639679e-06, + "loss": 0.694, + "step": 4867 + }, + { + "epoch": 0.26792888986735647, + "grad_norm": 0.7309017181396484, + "learning_rate": 9.578894125723699e-06, + "loss": 0.7882, + "step": 4868 + }, + { + "epoch": 0.26798392866971216, + "grad_norm": 0.7785035967826843, + "learning_rate": 9.578719992393978e-06, + "loss": 0.8142, + "step": 4869 + }, + { + "epoch": 0.2680389674720678, + "grad_norm": 0.8983079195022583, + "learning_rate": 9.57854582465182e-06, + "loss": 0.7809, + "step": 4870 + }, + { + "epoch": 0.2680940062744235, + "grad_norm": 0.7433765530586243, + "learning_rate": 9.578371622498542e-06, + "loss": 0.8937, + "step": 4871 + }, + { + "epoch": 0.2681490450767791, + "grad_norm": 0.8808667659759521, + "learning_rate": 9.578197385935446e-06, + "loss": 0.7821, + "step": 4872 + }, + { + "epoch": 0.2682040838791348, + "grad_norm": 0.825794517993927, + "learning_rate": 9.578023114963843e-06, + "loss": 0.8228, + "step": 4873 + }, + { + "epoch": 0.26825912268149044, + "grad_norm": 1.0165129899978638, + "learning_rate": 9.577848809585046e-06, + "loss": 0.7964, + "step": 4874 + }, + { + "epoch": 0.2683141614838461, + "grad_norm": 0.742028534412384, + "learning_rate": 9.577674469800362e-06, + "loss": 0.9126, + "step": 4875 + }, + { + "epoch": 0.26836920028620176, + "grad_norm": 0.7571890354156494, + "learning_rate": 9.577500095611101e-06, + "loss": 0.879, + "step": 4876 + }, + { + "epoch": 0.26842423908855745, + "grad_norm": 0.7577160596847534, + "learning_rate": 9.577325687018575e-06, + "loss": 0.8048, + "step": 4877 + }, + { + "epoch": 0.2684792778909131, + "grad_norm": 0.7704411745071411, + "learning_rate": 9.577151244024095e-06, + "loss": 0.7451, + "step": 4878 + }, + { + "epoch": 0.26853431669326877, + "grad_norm": 0.8323166966438293, + "learning_rate": 9.57697676662897e-06, + "loss": 0.7591, + "step": 4879 + }, + { + "epoch": 0.2685893554956244, + "grad_norm": 0.7257028222084045, + "learning_rate": 9.576802254834516e-06, + "loss": 0.7941, + "step": 4880 + }, + { + "epoch": 0.2686443942979801, + "grad_norm": 0.8170442581176758, + "learning_rate": 9.57662770864204e-06, + "loss": 0.8617, + "step": 4881 + }, + { + "epoch": 0.2686994331003357, + "grad_norm": 0.7435339689254761, + "learning_rate": 9.576453128052852e-06, + "loss": 0.7683, + "step": 4882 + }, + { + "epoch": 0.2687544719026914, + "grad_norm": 0.7932955026626587, + "learning_rate": 9.576278513068271e-06, + "loss": 0.7103, + "step": 4883 + }, + { + "epoch": 0.26880951070504705, + "grad_norm": 0.8008469939231873, + "learning_rate": 9.576103863689604e-06, + "loss": 0.8144, + "step": 4884 + }, + { + "epoch": 0.26886454950740274, + "grad_norm": 0.8573774695396423, + "learning_rate": 9.575929179918167e-06, + "loss": 0.8992, + "step": 4885 + }, + { + "epoch": 0.2689195883097584, + "grad_norm": 0.7326993942260742, + "learning_rate": 9.57575446175527e-06, + "loss": 0.699, + "step": 4886 + }, + { + "epoch": 0.26897462711211406, + "grad_norm": 0.8249791264533997, + "learning_rate": 9.575579709202228e-06, + "loss": 0.7445, + "step": 4887 + }, + { + "epoch": 0.2690296659144697, + "grad_norm": 0.7136644124984741, + "learning_rate": 9.575404922260351e-06, + "loss": 0.779, + "step": 4888 + }, + { + "epoch": 0.2690847047168254, + "grad_norm": 1.0130438804626465, + "learning_rate": 9.575230100930958e-06, + "loss": 0.8535, + "step": 4889 + }, + { + "epoch": 0.269139743519181, + "grad_norm": 0.6784926652908325, + "learning_rate": 9.575055245215358e-06, + "loss": 0.6745, + "step": 4890 + }, + { + "epoch": 0.2691947823215367, + "grad_norm": 0.7492508888244629, + "learning_rate": 9.57488035511487e-06, + "loss": 0.6748, + "step": 4891 + }, + { + "epoch": 0.26924982112389234, + "grad_norm": 0.7951217889785767, + "learning_rate": 9.574705430630807e-06, + "loss": 0.8119, + "step": 4892 + }, + { + "epoch": 0.26930485992624803, + "grad_norm": 0.9756677746772766, + "learning_rate": 9.574530471764478e-06, + "loss": 0.855, + "step": 4893 + }, + { + "epoch": 0.26935989872860366, + "grad_norm": 0.7806811928749084, + "learning_rate": 9.574355478517206e-06, + "loss": 0.8432, + "step": 4894 + }, + { + "epoch": 0.26941493753095935, + "grad_norm": 0.7814774513244629, + "learning_rate": 9.574180450890301e-06, + "loss": 0.8226, + "step": 4895 + }, + { + "epoch": 0.269469976333315, + "grad_norm": 0.7745325565338135, + "learning_rate": 9.574005388885081e-06, + "loss": 0.7722, + "step": 4896 + }, + { + "epoch": 0.2695250151356706, + "grad_norm": 0.7805666327476501, + "learning_rate": 9.573830292502862e-06, + "loss": 0.8357, + "step": 4897 + }, + { + "epoch": 0.2695800539380263, + "grad_norm": 0.8428031802177429, + "learning_rate": 9.573655161744958e-06, + "loss": 0.8056, + "step": 4898 + }, + { + "epoch": 0.26963509274038194, + "grad_norm": 0.7896600961685181, + "learning_rate": 9.573479996612684e-06, + "loss": 0.7984, + "step": 4899 + }, + { + "epoch": 0.26969013154273763, + "grad_norm": 0.7718683481216431, + "learning_rate": 9.57330479710736e-06, + "loss": 0.7527, + "step": 4900 + }, + { + "epoch": 0.26974517034509327, + "grad_norm": 0.7868129014968872, + "learning_rate": 9.573129563230302e-06, + "loss": 0.7876, + "step": 4901 + }, + { + "epoch": 0.26980020914744895, + "grad_norm": 0.8493777513504028, + "learning_rate": 9.572954294982826e-06, + "loss": 0.864, + "step": 4902 + }, + { + "epoch": 0.2698552479498046, + "grad_norm": 0.7492502331733704, + "learning_rate": 9.57277899236625e-06, + "loss": 0.8236, + "step": 4903 + }, + { + "epoch": 0.2699102867521603, + "grad_norm": 1.0534250736236572, + "learning_rate": 9.57260365538189e-06, + "loss": 0.8012, + "step": 4904 + }, + { + "epoch": 0.2699653255545159, + "grad_norm": 0.7557470202445984, + "learning_rate": 9.572428284031065e-06, + "loss": 0.9084, + "step": 4905 + }, + { + "epoch": 0.2700203643568716, + "grad_norm": 0.8055123686790466, + "learning_rate": 9.572252878315094e-06, + "loss": 0.7468, + "step": 4906 + }, + { + "epoch": 0.27007540315922723, + "grad_norm": 0.8399039506912231, + "learning_rate": 9.572077438235294e-06, + "loss": 0.9293, + "step": 4907 + }, + { + "epoch": 0.2701304419615829, + "grad_norm": 0.9800041317939758, + "learning_rate": 9.571901963792983e-06, + "loss": 0.8664, + "step": 4908 + }, + { + "epoch": 0.27018548076393856, + "grad_norm": 0.7732129096984863, + "learning_rate": 9.571726454989482e-06, + "loss": 0.7227, + "step": 4909 + }, + { + "epoch": 0.27024051956629425, + "grad_norm": 0.730754017829895, + "learning_rate": 9.571550911826109e-06, + "loss": 0.6467, + "step": 4910 + }, + { + "epoch": 0.2702955583686499, + "grad_norm": 0.8245325684547424, + "learning_rate": 9.57137533430418e-06, + "loss": 0.7847, + "step": 4911 + }, + { + "epoch": 0.27035059717100557, + "grad_norm": 0.8606786131858826, + "learning_rate": 9.57119972242502e-06, + "loss": 0.9556, + "step": 4912 + }, + { + "epoch": 0.2704056359733612, + "grad_norm": 0.7480195164680481, + "learning_rate": 9.571024076189947e-06, + "loss": 0.8504, + "step": 4913 + }, + { + "epoch": 0.2704606747757169, + "grad_norm": 0.718913197517395, + "learning_rate": 9.57084839560028e-06, + "loss": 0.7869, + "step": 4914 + }, + { + "epoch": 0.2705157135780725, + "grad_norm": 0.9778180122375488, + "learning_rate": 9.57067268065734e-06, + "loss": 0.8514, + "step": 4915 + }, + { + "epoch": 0.2705707523804282, + "grad_norm": 0.7394844889640808, + "learning_rate": 9.570496931362448e-06, + "loss": 0.7906, + "step": 4916 + }, + { + "epoch": 0.27062579118278385, + "grad_norm": 0.7648600339889526, + "learning_rate": 9.570321147716923e-06, + "loss": 0.8194, + "step": 4917 + }, + { + "epoch": 0.27068082998513954, + "grad_norm": 0.8002632260322571, + "learning_rate": 9.57014532972209e-06, + "loss": 0.8079, + "step": 4918 + }, + { + "epoch": 0.27073586878749517, + "grad_norm": 0.8668341040611267, + "learning_rate": 9.569969477379267e-06, + "loss": 0.8954, + "step": 4919 + }, + { + "epoch": 0.27079090758985086, + "grad_norm": 0.7403327226638794, + "learning_rate": 9.569793590689775e-06, + "loss": 0.7755, + "step": 4920 + }, + { + "epoch": 0.2708459463922065, + "grad_norm": 0.7399682998657227, + "learning_rate": 9.569617669654938e-06, + "loss": 0.8203, + "step": 4921 + }, + { + "epoch": 0.2709009851945622, + "grad_norm": 0.788600504398346, + "learning_rate": 9.56944171427608e-06, + "loss": 0.7565, + "step": 4922 + }, + { + "epoch": 0.2709560239969178, + "grad_norm": 0.7044861912727356, + "learning_rate": 9.56926572455452e-06, + "loss": 0.7073, + "step": 4923 + }, + { + "epoch": 0.2710110627992735, + "grad_norm": 0.8195114135742188, + "learning_rate": 9.569089700491581e-06, + "loss": 0.8658, + "step": 4924 + }, + { + "epoch": 0.27106610160162914, + "grad_norm": 0.7792258858680725, + "learning_rate": 9.568913642088589e-06, + "loss": 0.8628, + "step": 4925 + }, + { + "epoch": 0.27112114040398483, + "grad_norm": 0.764930248260498, + "learning_rate": 9.568737549346862e-06, + "loss": 0.7761, + "step": 4926 + }, + { + "epoch": 0.27117617920634046, + "grad_norm": 0.7226328253746033, + "learning_rate": 9.56856142226773e-06, + "loss": 0.7208, + "step": 4927 + }, + { + "epoch": 0.27123121800869615, + "grad_norm": 0.8726598620414734, + "learning_rate": 9.568385260852512e-06, + "loss": 0.8599, + "step": 4928 + }, + { + "epoch": 0.2712862568110518, + "grad_norm": 1.0126571655273438, + "learning_rate": 9.568209065102533e-06, + "loss": 0.8145, + "step": 4929 + }, + { + "epoch": 0.2713412956134075, + "grad_norm": 0.7764692306518555, + "learning_rate": 9.568032835019116e-06, + "loss": 0.6758, + "step": 4930 + }, + { + "epoch": 0.2713963344157631, + "grad_norm": 0.6955474019050598, + "learning_rate": 9.567856570603589e-06, + "loss": 0.7461, + "step": 4931 + }, + { + "epoch": 0.2714513732181188, + "grad_norm": 0.7136832475662231, + "learning_rate": 9.567680271857274e-06, + "loss": 0.7692, + "step": 4932 + }, + { + "epoch": 0.27150641202047443, + "grad_norm": 1.2288198471069336, + "learning_rate": 9.567503938781497e-06, + "loss": 0.7815, + "step": 4933 + }, + { + "epoch": 0.2715614508228301, + "grad_norm": 0.9182234406471252, + "learning_rate": 9.567327571377584e-06, + "loss": 0.8822, + "step": 4934 + }, + { + "epoch": 0.27161648962518575, + "grad_norm": 0.7684763669967651, + "learning_rate": 9.567151169646859e-06, + "loss": 0.7618, + "step": 4935 + }, + { + "epoch": 0.27167152842754144, + "grad_norm": 0.872360348701477, + "learning_rate": 9.566974733590647e-06, + "loss": 0.7975, + "step": 4936 + }, + { + "epoch": 0.2717265672298971, + "grad_norm": 0.9010463356971741, + "learning_rate": 9.566798263210277e-06, + "loss": 0.7159, + "step": 4937 + }, + { + "epoch": 0.27178160603225276, + "grad_norm": 0.7254281044006348, + "learning_rate": 9.566621758507072e-06, + "loss": 0.6724, + "step": 4938 + }, + { + "epoch": 0.2718366448346084, + "grad_norm": 0.8478212356567383, + "learning_rate": 9.566445219482363e-06, + "loss": 0.659, + "step": 4939 + }, + { + "epoch": 0.27189168363696403, + "grad_norm": 0.9038714170455933, + "learning_rate": 9.56626864613747e-06, + "loss": 0.8766, + "step": 4940 + }, + { + "epoch": 0.2719467224393197, + "grad_norm": 0.9704582691192627, + "learning_rate": 9.566092038473728e-06, + "loss": 0.8972, + "step": 4941 + }, + { + "epoch": 0.27200176124167535, + "grad_norm": 0.7069430947303772, + "learning_rate": 9.565915396492459e-06, + "loss": 0.8116, + "step": 4942 + }, + { + "epoch": 0.27205680004403104, + "grad_norm": 0.7432642579078674, + "learning_rate": 9.565738720194993e-06, + "loss": 0.847, + "step": 4943 + }, + { + "epoch": 0.2721118388463867, + "grad_norm": 0.6813814043998718, + "learning_rate": 9.565562009582655e-06, + "loss": 0.7146, + "step": 4944 + }, + { + "epoch": 0.27216687764874237, + "grad_norm": 0.7447707056999207, + "learning_rate": 9.565385264656776e-06, + "loss": 0.7696, + "step": 4945 + }, + { + "epoch": 0.272221916451098, + "grad_norm": 0.875073254108429, + "learning_rate": 9.565208485418685e-06, + "loss": 0.8714, + "step": 4946 + }, + { + "epoch": 0.2722769552534537, + "grad_norm": 0.7753880620002747, + "learning_rate": 9.565031671869707e-06, + "loss": 0.739, + "step": 4947 + }, + { + "epoch": 0.2723319940558093, + "grad_norm": 0.749264121055603, + "learning_rate": 9.564854824011172e-06, + "loss": 0.7957, + "step": 4948 + }, + { + "epoch": 0.272387032858165, + "grad_norm": 0.6733991503715515, + "learning_rate": 9.564677941844412e-06, + "loss": 0.7402, + "step": 4949 + }, + { + "epoch": 0.27244207166052065, + "grad_norm": 0.7426447868347168, + "learning_rate": 9.564501025370753e-06, + "loss": 0.7977, + "step": 4950 + }, + { + "epoch": 0.27249711046287634, + "grad_norm": 0.7930514812469482, + "learning_rate": 9.564324074591529e-06, + "loss": 0.8485, + "step": 4951 + }, + { + "epoch": 0.27255214926523197, + "grad_norm": 0.8087072968482971, + "learning_rate": 9.564147089508064e-06, + "loss": 0.9215, + "step": 4952 + }, + { + "epoch": 0.27260718806758766, + "grad_norm": 0.7560327053070068, + "learning_rate": 9.563970070121694e-06, + "loss": 0.7966, + "step": 4953 + }, + { + "epoch": 0.2726622268699433, + "grad_norm": 0.735573947429657, + "learning_rate": 9.563793016433744e-06, + "loss": 0.7737, + "step": 4954 + }, + { + "epoch": 0.272717265672299, + "grad_norm": 0.7603545784950256, + "learning_rate": 9.563615928445548e-06, + "loss": 0.7717, + "step": 4955 + }, + { + "epoch": 0.2727723044746546, + "grad_norm": 0.7185375094413757, + "learning_rate": 9.563438806158437e-06, + "loss": 0.8057, + "step": 4956 + }, + { + "epoch": 0.2728273432770103, + "grad_norm": 0.7619272470474243, + "learning_rate": 9.56326164957374e-06, + "loss": 0.8173, + "step": 4957 + }, + { + "epoch": 0.27288238207936594, + "grad_norm": 0.7868000864982605, + "learning_rate": 9.563084458692793e-06, + "loss": 0.6855, + "step": 4958 + }, + { + "epoch": 0.2729374208817216, + "grad_norm": 0.7949535846710205, + "learning_rate": 9.562907233516923e-06, + "loss": 0.7754, + "step": 4959 + }, + { + "epoch": 0.27299245968407726, + "grad_norm": 0.7037919163703918, + "learning_rate": 9.562729974047462e-06, + "loss": 0.7419, + "step": 4960 + }, + { + "epoch": 0.27304749848643295, + "grad_norm": 0.7236568927764893, + "learning_rate": 9.562552680285746e-06, + "loss": 0.7135, + "step": 4961 + }, + { + "epoch": 0.2731025372887886, + "grad_norm": 0.8410467505455017, + "learning_rate": 9.562375352233105e-06, + "loss": 0.8507, + "step": 4962 + }, + { + "epoch": 0.27315757609114427, + "grad_norm": 0.8043560981750488, + "learning_rate": 9.562197989890871e-06, + "loss": 0.8484, + "step": 4963 + }, + { + "epoch": 0.2732126148934999, + "grad_norm": 0.6926127672195435, + "learning_rate": 9.56202059326038e-06, + "loss": 0.8087, + "step": 4964 + }, + { + "epoch": 0.2732676536958556, + "grad_norm": 0.7149024605751038, + "learning_rate": 9.561843162342961e-06, + "loss": 0.7349, + "step": 4965 + }, + { + "epoch": 0.27332269249821123, + "grad_norm": 0.7165781855583191, + "learning_rate": 9.561665697139952e-06, + "loss": 0.8139, + "step": 4966 + }, + { + "epoch": 0.2733777313005669, + "grad_norm": 0.7481133341789246, + "learning_rate": 9.561488197652684e-06, + "loss": 0.7712, + "step": 4967 + }, + { + "epoch": 0.27343277010292255, + "grad_norm": 0.6928209066390991, + "learning_rate": 9.561310663882491e-06, + "loss": 0.7524, + "step": 4968 + }, + { + "epoch": 0.27348780890527824, + "grad_norm": 0.7397856116294861, + "learning_rate": 9.561133095830708e-06, + "loss": 0.718, + "step": 4969 + }, + { + "epoch": 0.2735428477076339, + "grad_norm": 0.7712383270263672, + "learning_rate": 9.560955493498672e-06, + "loss": 0.8201, + "step": 4970 + }, + { + "epoch": 0.27359788650998956, + "grad_norm": 0.96076899766922, + "learning_rate": 9.560777856887714e-06, + "loss": 0.8555, + "step": 4971 + }, + { + "epoch": 0.2736529253123452, + "grad_norm": 0.7331019639968872, + "learning_rate": 9.56060018599917e-06, + "loss": 0.8315, + "step": 4972 + }, + { + "epoch": 0.2737079641147009, + "grad_norm": 0.7157140970230103, + "learning_rate": 9.560422480834374e-06, + "loss": 0.7177, + "step": 4973 + }, + { + "epoch": 0.2737630029170565, + "grad_norm": 0.807614266872406, + "learning_rate": 9.560244741394666e-06, + "loss": 0.8413, + "step": 4974 + }, + { + "epoch": 0.2738180417194122, + "grad_norm": 0.7618574500083923, + "learning_rate": 9.560066967681378e-06, + "loss": 0.8248, + "step": 4975 + }, + { + "epoch": 0.27387308052176784, + "grad_norm": 0.7886885404586792, + "learning_rate": 9.559889159695848e-06, + "loss": 0.8793, + "step": 4976 + }, + { + "epoch": 0.27392811932412353, + "grad_norm": 1.0090755224227905, + "learning_rate": 9.559711317439411e-06, + "loss": 0.9255, + "step": 4977 + }, + { + "epoch": 0.27398315812647916, + "grad_norm": 0.7855443358421326, + "learning_rate": 9.559533440913405e-06, + "loss": 0.8001, + "step": 4978 + }, + { + "epoch": 0.27403819692883485, + "grad_norm": 0.768741250038147, + "learning_rate": 9.559355530119165e-06, + "loss": 0.8109, + "step": 4979 + }, + { + "epoch": 0.2740932357311905, + "grad_norm": 0.759589672088623, + "learning_rate": 9.55917758505803e-06, + "loss": 0.8001, + "step": 4980 + }, + { + "epoch": 0.2741482745335462, + "grad_norm": 0.7937445640563965, + "learning_rate": 9.558999605731338e-06, + "loss": 0.8924, + "step": 4981 + }, + { + "epoch": 0.2742033133359018, + "grad_norm": 0.9041592478752136, + "learning_rate": 9.558821592140423e-06, + "loss": 0.9167, + "step": 4982 + }, + { + "epoch": 0.27425835213825744, + "grad_norm": 0.6971380710601807, + "learning_rate": 9.558643544286627e-06, + "loss": 0.7589, + "step": 4983 + }, + { + "epoch": 0.27431339094061313, + "grad_norm": 0.9292929172515869, + "learning_rate": 9.558465462171287e-06, + "loss": 0.9566, + "step": 4984 + }, + { + "epoch": 0.27436842974296877, + "grad_norm": 0.8320629000663757, + "learning_rate": 9.558287345795738e-06, + "loss": 0.8854, + "step": 4985 + }, + { + "epoch": 0.27442346854532446, + "grad_norm": 0.797272801399231, + "learning_rate": 9.558109195161325e-06, + "loss": 0.7838, + "step": 4986 + }, + { + "epoch": 0.2744785073476801, + "grad_norm": 0.9702700972557068, + "learning_rate": 9.557931010269382e-06, + "loss": 0.8593, + "step": 4987 + }, + { + "epoch": 0.2745335461500358, + "grad_norm": 0.8309103846549988, + "learning_rate": 9.557752791121248e-06, + "loss": 0.8902, + "step": 4988 + }, + { + "epoch": 0.2745885849523914, + "grad_norm": 0.706667959690094, + "learning_rate": 9.557574537718265e-06, + "loss": 0.7259, + "step": 4989 + }, + { + "epoch": 0.2746436237547471, + "grad_norm": 0.770239531993866, + "learning_rate": 9.557396250061771e-06, + "loss": 0.8644, + "step": 4990 + }, + { + "epoch": 0.27469866255710274, + "grad_norm": 0.8695803880691528, + "learning_rate": 9.557217928153108e-06, + "loss": 0.895, + "step": 4991 + }, + { + "epoch": 0.2747537013594584, + "grad_norm": 0.7525948286056519, + "learning_rate": 9.557039571993614e-06, + "loss": 0.7029, + "step": 4992 + }, + { + "epoch": 0.27480874016181406, + "grad_norm": 0.7616680264472961, + "learning_rate": 9.556861181584631e-06, + "loss": 0.8025, + "step": 4993 + }, + { + "epoch": 0.27486377896416975, + "grad_norm": 0.7216167449951172, + "learning_rate": 9.5566827569275e-06, + "loss": 0.8314, + "step": 4994 + }, + { + "epoch": 0.2749188177665254, + "grad_norm": 0.7412614226341248, + "learning_rate": 9.55650429802356e-06, + "loss": 0.7877, + "step": 4995 + }, + { + "epoch": 0.27497385656888107, + "grad_norm": 0.7176525592803955, + "learning_rate": 9.556325804874154e-06, + "loss": 0.7615, + "step": 4996 + }, + { + "epoch": 0.2750288953712367, + "grad_norm": 0.7544515132904053, + "learning_rate": 9.556147277480623e-06, + "loss": 0.8352, + "step": 4997 + }, + { + "epoch": 0.2750839341735924, + "grad_norm": 0.7318205833435059, + "learning_rate": 9.555968715844309e-06, + "loss": 0.7403, + "step": 4998 + }, + { + "epoch": 0.275138972975948, + "grad_norm": 0.7495027780532837, + "learning_rate": 9.555790119966552e-06, + "loss": 0.7611, + "step": 4999 + }, + { + "epoch": 0.2751940117783037, + "grad_norm": 0.7544401288032532, + "learning_rate": 9.555611489848697e-06, + "loss": 0.8594, + "step": 5000 + }, + { + "epoch": 0.27524905058065935, + "grad_norm": 0.7698250412940979, + "learning_rate": 9.555432825492084e-06, + "loss": 0.8438, + "step": 5001 + }, + { + "epoch": 0.27530408938301504, + "grad_norm": 0.7668892741203308, + "learning_rate": 9.555254126898059e-06, + "loss": 0.8082, + "step": 5002 + }, + { + "epoch": 0.27535912818537067, + "grad_norm": 0.9170669317245483, + "learning_rate": 9.555075394067963e-06, + "loss": 0.7443, + "step": 5003 + }, + { + "epoch": 0.27541416698772636, + "grad_norm": 0.7890255451202393, + "learning_rate": 9.55489662700314e-06, + "loss": 0.8269, + "step": 5004 + }, + { + "epoch": 0.275469205790082, + "grad_norm": 0.6740512847900391, + "learning_rate": 9.554717825704932e-06, + "loss": 0.6906, + "step": 5005 + }, + { + "epoch": 0.2755242445924377, + "grad_norm": 0.8032376170158386, + "learning_rate": 9.554538990174685e-06, + "loss": 0.812, + "step": 5006 + }, + { + "epoch": 0.2755792833947933, + "grad_norm": 0.6932135224342346, + "learning_rate": 9.554360120413741e-06, + "loss": 0.7823, + "step": 5007 + }, + { + "epoch": 0.275634322197149, + "grad_norm": 0.7447643876075745, + "learning_rate": 9.554181216423447e-06, + "loss": 0.8753, + "step": 5008 + }, + { + "epoch": 0.27568936099950464, + "grad_norm": 0.8035081624984741, + "learning_rate": 9.554002278205145e-06, + "loss": 0.7135, + "step": 5009 + }, + { + "epoch": 0.27574439980186033, + "grad_norm": 0.7544171214103699, + "learning_rate": 9.553823305760182e-06, + "loss": 0.7574, + "step": 5010 + }, + { + "epoch": 0.27579943860421596, + "grad_norm": 0.6648419499397278, + "learning_rate": 9.553644299089902e-06, + "loss": 0.7566, + "step": 5011 + }, + { + "epoch": 0.27585447740657165, + "grad_norm": 0.7481752038002014, + "learning_rate": 9.55346525819565e-06, + "loss": 0.7862, + "step": 5012 + }, + { + "epoch": 0.2759095162089273, + "grad_norm": 0.7000668048858643, + "learning_rate": 9.55328618307877e-06, + "loss": 0.7767, + "step": 5013 + }, + { + "epoch": 0.275964555011283, + "grad_norm": 0.7435166239738464, + "learning_rate": 9.553107073740612e-06, + "loss": 0.6888, + "step": 5014 + }, + { + "epoch": 0.2760195938136386, + "grad_norm": 0.7593170404434204, + "learning_rate": 9.552927930182521e-06, + "loss": 0.7272, + "step": 5015 + }, + { + "epoch": 0.2760746326159943, + "grad_norm": 0.870079755783081, + "learning_rate": 9.55274875240584e-06, + "loss": 0.8692, + "step": 5016 + }, + { + "epoch": 0.27612967141834993, + "grad_norm": 0.8550307750701904, + "learning_rate": 9.55256954041192e-06, + "loss": 0.8729, + "step": 5017 + }, + { + "epoch": 0.2761847102207056, + "grad_norm": 0.888830304145813, + "learning_rate": 9.552390294202105e-06, + "loss": 0.8607, + "step": 5018 + }, + { + "epoch": 0.27623974902306125, + "grad_norm": 0.8295729160308838, + "learning_rate": 9.552211013777743e-06, + "loss": 0.8722, + "step": 5019 + }, + { + "epoch": 0.27629478782541694, + "grad_norm": 0.7732356190681458, + "learning_rate": 9.552031699140182e-06, + "loss": 0.8332, + "step": 5020 + }, + { + "epoch": 0.2763498266277726, + "grad_norm": 0.9132987856864929, + "learning_rate": 9.55185235029077e-06, + "loss": 0.769, + "step": 5021 + }, + { + "epoch": 0.27640486543012827, + "grad_norm": 0.7221076488494873, + "learning_rate": 9.551672967230851e-06, + "loss": 0.8505, + "step": 5022 + }, + { + "epoch": 0.2764599042324839, + "grad_norm": 0.8526949882507324, + "learning_rate": 9.551493549961778e-06, + "loss": 0.8002, + "step": 5023 + }, + { + "epoch": 0.2765149430348396, + "grad_norm": 0.9513188004493713, + "learning_rate": 9.551314098484901e-06, + "loss": 0.8558, + "step": 5024 + }, + { + "epoch": 0.2765699818371952, + "grad_norm": 0.7543003559112549, + "learning_rate": 9.551134612801563e-06, + "loss": 0.8292, + "step": 5025 + }, + { + "epoch": 0.27662502063955086, + "grad_norm": 0.7531017065048218, + "learning_rate": 9.550955092913115e-06, + "loss": 0.7837, + "step": 5026 + }, + { + "epoch": 0.27668005944190655, + "grad_norm": 0.8725717663764954, + "learning_rate": 9.550775538820907e-06, + "loss": 0.8362, + "step": 5027 + }, + { + "epoch": 0.2767350982442622, + "grad_norm": 0.8122721910476685, + "learning_rate": 9.550595950526288e-06, + "loss": 0.8539, + "step": 5028 + }, + { + "epoch": 0.27679013704661787, + "grad_norm": 0.7756829261779785, + "learning_rate": 9.550416328030608e-06, + "loss": 0.787, + "step": 5029 + }, + { + "epoch": 0.2768451758489735, + "grad_norm": 0.9086001515388489, + "learning_rate": 9.550236671335218e-06, + "loss": 0.7972, + "step": 5030 + }, + { + "epoch": 0.2769002146513292, + "grad_norm": 0.7857060432434082, + "learning_rate": 9.550056980441466e-06, + "loss": 0.7577, + "step": 5031 + }, + { + "epoch": 0.2769552534536848, + "grad_norm": 0.8190392851829529, + "learning_rate": 9.549877255350703e-06, + "loss": 0.81, + "step": 5032 + }, + { + "epoch": 0.2770102922560405, + "grad_norm": 0.7714588642120361, + "learning_rate": 9.549697496064283e-06, + "loss": 0.7916, + "step": 5033 + }, + { + "epoch": 0.27706533105839615, + "grad_norm": 0.7178533673286438, + "learning_rate": 9.549517702583552e-06, + "loss": 0.8001, + "step": 5034 + }, + { + "epoch": 0.27712036986075184, + "grad_norm": 0.7552955150604248, + "learning_rate": 9.549337874909865e-06, + "loss": 0.8361, + "step": 5035 + }, + { + "epoch": 0.27717540866310747, + "grad_norm": 0.7823992371559143, + "learning_rate": 9.549158013044573e-06, + "loss": 0.7033, + "step": 5036 + }, + { + "epoch": 0.27723044746546316, + "grad_norm": 0.731504499912262, + "learning_rate": 9.548978116989026e-06, + "loss": 0.73, + "step": 5037 + }, + { + "epoch": 0.2772854862678188, + "grad_norm": 0.7455994486808777, + "learning_rate": 9.548798186744578e-06, + "loss": 0.8005, + "step": 5038 + }, + { + "epoch": 0.2773405250701745, + "grad_norm": 0.7020164728164673, + "learning_rate": 9.54861822231258e-06, + "loss": 0.6707, + "step": 5039 + }, + { + "epoch": 0.2773955638725301, + "grad_norm": 0.7526360750198364, + "learning_rate": 9.548438223694385e-06, + "loss": 0.7686, + "step": 5040 + }, + { + "epoch": 0.2774506026748858, + "grad_norm": 0.7268579006195068, + "learning_rate": 9.548258190891344e-06, + "loss": 0.7039, + "step": 5041 + }, + { + "epoch": 0.27750564147724144, + "grad_norm": 0.9361631274223328, + "learning_rate": 9.548078123904815e-06, + "loss": 0.8023, + "step": 5042 + }, + { + "epoch": 0.2775606802795971, + "grad_norm": 0.7786710262298584, + "learning_rate": 9.547898022736147e-06, + "loss": 0.6866, + "step": 5043 + }, + { + "epoch": 0.27761571908195276, + "grad_norm": 0.7175624370574951, + "learning_rate": 9.547717887386695e-06, + "loss": 0.7554, + "step": 5044 + }, + { + "epoch": 0.27767075788430845, + "grad_norm": 0.9157657623291016, + "learning_rate": 9.547537717857813e-06, + "loss": 0.7936, + "step": 5045 + }, + { + "epoch": 0.2777257966866641, + "grad_norm": 0.7881377935409546, + "learning_rate": 9.547357514150854e-06, + "loss": 0.8198, + "step": 5046 + }, + { + "epoch": 0.2777808354890198, + "grad_norm": 1.0444039106369019, + "learning_rate": 9.547177276267173e-06, + "loss": 0.7954, + "step": 5047 + }, + { + "epoch": 0.2778358742913754, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.546997004208124e-06, + "loss": 0.7697, + "step": 5048 + }, + { + "epoch": 0.2778909130937311, + "grad_norm": 0.7304134368896484, + "learning_rate": 9.546816697975066e-06, + "loss": 0.7034, + "step": 5049 + }, + { + "epoch": 0.27794595189608673, + "grad_norm": 0.7783082723617554, + "learning_rate": 9.546636357569347e-06, + "loss": 0.8185, + "step": 5050 + }, + { + "epoch": 0.2780009906984424, + "grad_norm": 0.750712513923645, + "learning_rate": 9.54645598299233e-06, + "loss": 0.7336, + "step": 5051 + }, + { + "epoch": 0.27805602950079805, + "grad_norm": 0.7849590182304382, + "learning_rate": 9.546275574245364e-06, + "loss": 0.8088, + "step": 5052 + }, + { + "epoch": 0.27811106830315374, + "grad_norm": 0.8490208983421326, + "learning_rate": 9.546095131329809e-06, + "loss": 0.8507, + "step": 5053 + }, + { + "epoch": 0.2781661071055094, + "grad_norm": 0.8107250928878784, + "learning_rate": 9.54591465424702e-06, + "loss": 0.7787, + "step": 5054 + }, + { + "epoch": 0.27822114590786506, + "grad_norm": 0.8278594613075256, + "learning_rate": 9.54573414299835e-06, + "loss": 0.7836, + "step": 5055 + }, + { + "epoch": 0.2782761847102207, + "grad_norm": 0.7982015013694763, + "learning_rate": 9.545553597585163e-06, + "loss": 0.7672, + "step": 5056 + }, + { + "epoch": 0.2783312235125764, + "grad_norm": 0.7311522364616394, + "learning_rate": 9.54537301800881e-06, + "loss": 0.7571, + "step": 5057 + }, + { + "epoch": 0.278386262314932, + "grad_norm": 0.8039999604225159, + "learning_rate": 9.545192404270651e-06, + "loss": 0.764, + "step": 5058 + }, + { + "epoch": 0.2784413011172877, + "grad_norm": 0.7810946702957153, + "learning_rate": 9.545011756372042e-06, + "loss": 0.9217, + "step": 5059 + }, + { + "epoch": 0.27849633991964334, + "grad_norm": 0.7092248797416687, + "learning_rate": 9.544831074314343e-06, + "loss": 0.7599, + "step": 5060 + }, + { + "epoch": 0.27855137872199903, + "grad_norm": 0.831550657749176, + "learning_rate": 9.544650358098908e-06, + "loss": 0.7278, + "step": 5061 + }, + { + "epoch": 0.27860641752435467, + "grad_norm": 0.7645474076271057, + "learning_rate": 9.544469607727098e-06, + "loss": 0.7945, + "step": 5062 + }, + { + "epoch": 0.27866145632671036, + "grad_norm": 0.6956788301467896, + "learning_rate": 9.544288823200273e-06, + "loss": 0.749, + "step": 5063 + }, + { + "epoch": 0.278716495129066, + "grad_norm": 0.7262974381446838, + "learning_rate": 9.544108004519786e-06, + "loss": 0.8074, + "step": 5064 + }, + { + "epoch": 0.2787715339314217, + "grad_norm": 0.7439202666282654, + "learning_rate": 9.543927151687001e-06, + "loss": 0.9403, + "step": 5065 + }, + { + "epoch": 0.2788265727337773, + "grad_norm": 0.8468778133392334, + "learning_rate": 9.543746264703277e-06, + "loss": 0.8182, + "step": 5066 + }, + { + "epoch": 0.278881611536133, + "grad_norm": 0.8396204113960266, + "learning_rate": 9.54356534356997e-06, + "loss": 0.8067, + "step": 5067 + }, + { + "epoch": 0.27893665033848863, + "grad_norm": 0.718758225440979, + "learning_rate": 9.543384388288445e-06, + "loss": 0.8172, + "step": 5068 + }, + { + "epoch": 0.27899168914084427, + "grad_norm": 0.7562685012817383, + "learning_rate": 9.543203398860056e-06, + "loss": 0.9053, + "step": 5069 + }, + { + "epoch": 0.27904672794319996, + "grad_norm": 0.9592792987823486, + "learning_rate": 9.543022375286169e-06, + "loss": 0.9375, + "step": 5070 + }, + { + "epoch": 0.2791017667455556, + "grad_norm": 0.7162739634513855, + "learning_rate": 9.54284131756814e-06, + "loss": 0.7297, + "step": 5071 + }, + { + "epoch": 0.2791568055479113, + "grad_norm": 0.7703517079353333, + "learning_rate": 9.542660225707335e-06, + "loss": 0.8863, + "step": 5072 + }, + { + "epoch": 0.2792118443502669, + "grad_norm": 0.7860418558120728, + "learning_rate": 9.542479099705109e-06, + "loss": 0.8335, + "step": 5073 + }, + { + "epoch": 0.2792668831526226, + "grad_norm": 0.8880825042724609, + "learning_rate": 9.542297939562825e-06, + "loss": 0.8344, + "step": 5074 + }, + { + "epoch": 0.27932192195497824, + "grad_norm": 0.7900505661964417, + "learning_rate": 9.542116745281849e-06, + "loss": 0.7613, + "step": 5075 + }, + { + "epoch": 0.2793769607573339, + "grad_norm": 0.7446081042289734, + "learning_rate": 9.541935516863536e-06, + "loss": 0.6615, + "step": 5076 + }, + { + "epoch": 0.27943199955968956, + "grad_norm": 0.7831308245658875, + "learning_rate": 9.541754254309254e-06, + "loss": 0.779, + "step": 5077 + }, + { + "epoch": 0.27948703836204525, + "grad_norm": 0.9007606506347656, + "learning_rate": 9.541572957620361e-06, + "loss": 0.8883, + "step": 5078 + }, + { + "epoch": 0.2795420771644009, + "grad_norm": 0.8033407330513, + "learning_rate": 9.541391626798222e-06, + "loss": 0.7354, + "step": 5079 + }, + { + "epoch": 0.27959711596675657, + "grad_norm": 0.9259470105171204, + "learning_rate": 9.5412102618442e-06, + "loss": 0.7602, + "step": 5080 + }, + { + "epoch": 0.2796521547691122, + "grad_norm": 0.786523163318634, + "learning_rate": 9.541028862759656e-06, + "loss": 0.7402, + "step": 5081 + }, + { + "epoch": 0.2797071935714679, + "grad_norm": 0.8053372502326965, + "learning_rate": 9.540847429545954e-06, + "loss": 0.825, + "step": 5082 + }, + { + "epoch": 0.2797622323738235, + "grad_norm": 0.8578022122383118, + "learning_rate": 9.54066596220446e-06, + "loss": 0.7866, + "step": 5083 + }, + { + "epoch": 0.2798172711761792, + "grad_norm": 0.916161835193634, + "learning_rate": 9.540484460736535e-06, + "loss": 0.5961, + "step": 5084 + }, + { + "epoch": 0.27987230997853485, + "grad_norm": 0.7843562960624695, + "learning_rate": 9.540302925143545e-06, + "loss": 0.764, + "step": 5085 + }, + { + "epoch": 0.27992734878089054, + "grad_norm": 0.7392510771751404, + "learning_rate": 9.540121355426852e-06, + "loss": 0.8038, + "step": 5086 + }, + { + "epoch": 0.2799823875832462, + "grad_norm": 0.7406296133995056, + "learning_rate": 9.539939751587825e-06, + "loss": 0.8202, + "step": 5087 + }, + { + "epoch": 0.28003742638560186, + "grad_norm": 0.7274924516677856, + "learning_rate": 9.539758113627823e-06, + "loss": 0.7691, + "step": 5088 + }, + { + "epoch": 0.2800924651879575, + "grad_norm": 0.8563184142112732, + "learning_rate": 9.539576441548218e-06, + "loss": 0.8341, + "step": 5089 + }, + { + "epoch": 0.2801475039903132, + "grad_norm": 0.7708351016044617, + "learning_rate": 9.539394735350366e-06, + "loss": 0.7126, + "step": 5090 + }, + { + "epoch": 0.2802025427926688, + "grad_norm": 0.7314836382865906, + "learning_rate": 9.539212995035642e-06, + "loss": 0.7465, + "step": 5091 + }, + { + "epoch": 0.2802575815950245, + "grad_norm": 0.7594754695892334, + "learning_rate": 9.539031220605409e-06, + "loss": 0.7563, + "step": 5092 + }, + { + "epoch": 0.28031262039738014, + "grad_norm": 0.699414074420929, + "learning_rate": 9.53884941206103e-06, + "loss": 0.7847, + "step": 5093 + }, + { + "epoch": 0.28036765919973583, + "grad_norm": 0.8013063073158264, + "learning_rate": 9.538667569403877e-06, + "loss": 0.7769, + "step": 5094 + }, + { + "epoch": 0.28042269800209146, + "grad_norm": 0.7778805494308472, + "learning_rate": 9.538485692635312e-06, + "loss": 0.7646, + "step": 5095 + }, + { + "epoch": 0.28047773680444715, + "grad_norm": 0.785649299621582, + "learning_rate": 9.538303781756702e-06, + "loss": 0.8162, + "step": 5096 + }, + { + "epoch": 0.2805327756068028, + "grad_norm": 0.7073212265968323, + "learning_rate": 9.538121836769417e-06, + "loss": 0.7208, + "step": 5097 + }, + { + "epoch": 0.2805878144091585, + "grad_norm": 0.7545642852783203, + "learning_rate": 9.53793985767482e-06, + "loss": 0.8673, + "step": 5098 + }, + { + "epoch": 0.2806428532115141, + "grad_norm": 0.6818416118621826, + "learning_rate": 9.537757844474285e-06, + "loss": 0.7576, + "step": 5099 + }, + { + "epoch": 0.2806978920138698, + "grad_norm": 0.6718038320541382, + "learning_rate": 9.537575797169176e-06, + "loss": 0.6683, + "step": 5100 + }, + { + "epoch": 0.28075293081622543, + "grad_norm": 0.7851004600524902, + "learning_rate": 9.53739371576086e-06, + "loss": 0.8871, + "step": 5101 + }, + { + "epoch": 0.2808079696185811, + "grad_norm": 0.7565650343894958, + "learning_rate": 9.53721160025071e-06, + "loss": 0.8799, + "step": 5102 + }, + { + "epoch": 0.28086300842093676, + "grad_norm": 0.7522932887077332, + "learning_rate": 9.537029450640091e-06, + "loss": 0.838, + "step": 5103 + }, + { + "epoch": 0.28091804722329244, + "grad_norm": 0.929634690284729, + "learning_rate": 9.536847266930375e-06, + "loss": 0.7997, + "step": 5104 + }, + { + "epoch": 0.2809730860256481, + "grad_norm": 0.8050084710121155, + "learning_rate": 9.536665049122928e-06, + "loss": 0.7652, + "step": 5105 + }, + { + "epoch": 0.28102812482800377, + "grad_norm": 0.7401233315467834, + "learning_rate": 9.53648279721912e-06, + "loss": 0.7904, + "step": 5106 + }, + { + "epoch": 0.2810831636303594, + "grad_norm": 0.7125453948974609, + "learning_rate": 9.536300511220322e-06, + "loss": 0.7349, + "step": 5107 + }, + { + "epoch": 0.2811382024327151, + "grad_norm": 0.7165758609771729, + "learning_rate": 9.536118191127905e-06, + "loss": 0.7314, + "step": 5108 + }, + { + "epoch": 0.2811932412350707, + "grad_norm": 0.7507439851760864, + "learning_rate": 9.535935836943237e-06, + "loss": 0.7603, + "step": 5109 + }, + { + "epoch": 0.2812482800374264, + "grad_norm": 0.7832109332084656, + "learning_rate": 9.535753448667688e-06, + "loss": 0.7279, + "step": 5110 + }, + { + "epoch": 0.28130331883978205, + "grad_norm": 0.7346609234809875, + "learning_rate": 9.535571026302633e-06, + "loss": 0.6882, + "step": 5111 + }, + { + "epoch": 0.2813583576421377, + "grad_norm": 0.7569608688354492, + "learning_rate": 9.535388569849437e-06, + "loss": 0.8451, + "step": 5112 + }, + { + "epoch": 0.28141339644449337, + "grad_norm": 0.7319865822792053, + "learning_rate": 9.535206079309478e-06, + "loss": 0.8161, + "step": 5113 + }, + { + "epoch": 0.281468435246849, + "grad_norm": 0.7744631171226501, + "learning_rate": 9.535023554684122e-06, + "loss": 0.8025, + "step": 5114 + }, + { + "epoch": 0.2815234740492047, + "grad_norm": 0.6867525577545166, + "learning_rate": 9.534840995974743e-06, + "loss": 0.7693, + "step": 5115 + }, + { + "epoch": 0.2815785128515603, + "grad_norm": 0.7625848054885864, + "learning_rate": 9.534658403182715e-06, + "loss": 0.8034, + "step": 5116 + }, + { + "epoch": 0.281633551653916, + "grad_norm": 0.7369832992553711, + "learning_rate": 9.534475776309406e-06, + "loss": 0.873, + "step": 5117 + }, + { + "epoch": 0.28168859045627165, + "grad_norm": 0.7267127633094788, + "learning_rate": 9.534293115356191e-06, + "loss": 0.7954, + "step": 5118 + }, + { + "epoch": 0.28174362925862734, + "grad_norm": 0.7244247794151306, + "learning_rate": 9.534110420324443e-06, + "loss": 0.7784, + "step": 5119 + }, + { + "epoch": 0.28179866806098297, + "grad_norm": 0.8207812905311584, + "learning_rate": 9.533927691215534e-06, + "loss": 0.8696, + "step": 5120 + }, + { + "epoch": 0.28185370686333866, + "grad_norm": 0.8669891357421875, + "learning_rate": 9.53374492803084e-06, + "loss": 0.8203, + "step": 5121 + }, + { + "epoch": 0.2819087456656943, + "grad_norm": 0.7650816440582275, + "learning_rate": 9.533562130771732e-06, + "loss": 0.77, + "step": 5122 + }, + { + "epoch": 0.28196378446805, + "grad_norm": 0.7664972543716431, + "learning_rate": 9.533379299439584e-06, + "loss": 0.7187, + "step": 5123 + }, + { + "epoch": 0.2820188232704056, + "grad_norm": 0.7921896576881409, + "learning_rate": 9.533196434035772e-06, + "loss": 0.8669, + "step": 5124 + }, + { + "epoch": 0.2820738620727613, + "grad_norm": 0.7714456915855408, + "learning_rate": 9.533013534561669e-06, + "loss": 0.8783, + "step": 5125 + }, + { + "epoch": 0.28212890087511694, + "grad_norm": 0.7222065329551697, + "learning_rate": 9.532830601018648e-06, + "loss": 0.7449, + "step": 5126 + }, + { + "epoch": 0.28218393967747263, + "grad_norm": 0.718142569065094, + "learning_rate": 9.532647633408085e-06, + "loss": 0.8226, + "step": 5127 + }, + { + "epoch": 0.28223897847982826, + "grad_norm": 0.730592668056488, + "learning_rate": 9.532464631731357e-06, + "loss": 0.7878, + "step": 5128 + }, + { + "epoch": 0.28229401728218395, + "grad_norm": 0.7841802835464478, + "learning_rate": 9.532281595989839e-06, + "loss": 0.8262, + "step": 5129 + }, + { + "epoch": 0.2823490560845396, + "grad_norm": 0.8617212772369385, + "learning_rate": 9.532098526184904e-06, + "loss": 0.8368, + "step": 5130 + }, + { + "epoch": 0.2824040948868953, + "grad_norm": 0.6968556642532349, + "learning_rate": 9.53191542231793e-06, + "loss": 0.6848, + "step": 5131 + }, + { + "epoch": 0.2824591336892509, + "grad_norm": 0.7872157096862793, + "learning_rate": 9.531732284390294e-06, + "loss": 0.7898, + "step": 5132 + }, + { + "epoch": 0.2825141724916066, + "grad_norm": 0.7727276086807251, + "learning_rate": 9.53154911240337e-06, + "loss": 0.8506, + "step": 5133 + }, + { + "epoch": 0.28256921129396223, + "grad_norm": 0.7279896140098572, + "learning_rate": 9.531365906358536e-06, + "loss": 0.7415, + "step": 5134 + }, + { + "epoch": 0.2826242500963179, + "grad_norm": 0.7457457780838013, + "learning_rate": 9.53118266625717e-06, + "loss": 0.7652, + "step": 5135 + }, + { + "epoch": 0.28267928889867355, + "grad_norm": 0.8989270329475403, + "learning_rate": 9.530999392100646e-06, + "loss": 0.9085, + "step": 5136 + }, + { + "epoch": 0.28273432770102924, + "grad_norm": 0.9622626304626465, + "learning_rate": 9.530816083890347e-06, + "loss": 0.8726, + "step": 5137 + }, + { + "epoch": 0.2827893665033849, + "grad_norm": 0.7712846994400024, + "learning_rate": 9.530632741627643e-06, + "loss": 0.765, + "step": 5138 + }, + { + "epoch": 0.28284440530574056, + "grad_norm": 0.8320727348327637, + "learning_rate": 9.530449365313918e-06, + "loss": 0.7828, + "step": 5139 + }, + { + "epoch": 0.2828994441080962, + "grad_norm": 0.9310963153839111, + "learning_rate": 9.530265954950549e-06, + "loss": 0.8482, + "step": 5140 + }, + { + "epoch": 0.2829544829104519, + "grad_norm": 0.9984502792358398, + "learning_rate": 9.530082510538914e-06, + "loss": 0.8673, + "step": 5141 + }, + { + "epoch": 0.2830095217128075, + "grad_norm": 0.8300992250442505, + "learning_rate": 9.52989903208039e-06, + "loss": 0.8232, + "step": 5142 + }, + { + "epoch": 0.2830645605151632, + "grad_norm": 0.930052638053894, + "learning_rate": 9.529715519576356e-06, + "loss": 0.7766, + "step": 5143 + }, + { + "epoch": 0.28311959931751884, + "grad_norm": 0.8038359880447388, + "learning_rate": 9.529531973028194e-06, + "loss": 0.712, + "step": 5144 + }, + { + "epoch": 0.28317463811987453, + "grad_norm": 0.856250524520874, + "learning_rate": 9.529348392437283e-06, + "loss": 0.8578, + "step": 5145 + }, + { + "epoch": 0.28322967692223017, + "grad_norm": 0.7602483630180359, + "learning_rate": 9.529164777805002e-06, + "loss": 0.749, + "step": 5146 + }, + { + "epoch": 0.28328471572458586, + "grad_norm": 0.8946549892425537, + "learning_rate": 9.52898112913273e-06, + "loss": 0.8101, + "step": 5147 + }, + { + "epoch": 0.2833397545269415, + "grad_norm": 0.8015615344047546, + "learning_rate": 9.52879744642185e-06, + "loss": 0.8203, + "step": 5148 + }, + { + "epoch": 0.2833947933292972, + "grad_norm": 0.7767183780670166, + "learning_rate": 9.528613729673738e-06, + "loss": 0.8409, + "step": 5149 + }, + { + "epoch": 0.2834498321316528, + "grad_norm": 0.7604000568389893, + "learning_rate": 9.52842997888978e-06, + "loss": 0.8853, + "step": 5150 + }, + { + "epoch": 0.2835048709340085, + "grad_norm": 0.7079401016235352, + "learning_rate": 9.528246194071353e-06, + "loss": 0.6855, + "step": 5151 + }, + { + "epoch": 0.28355990973636414, + "grad_norm": 0.7616782188415527, + "learning_rate": 9.52806237521984e-06, + "loss": 0.785, + "step": 5152 + }, + { + "epoch": 0.2836149485387198, + "grad_norm": 0.7408583760261536, + "learning_rate": 9.527878522336622e-06, + "loss": 0.7105, + "step": 5153 + }, + { + "epoch": 0.28366998734107546, + "grad_norm": 0.694821834564209, + "learning_rate": 9.52769463542308e-06, + "loss": 0.6552, + "step": 5154 + }, + { + "epoch": 0.2837250261434311, + "grad_norm": 0.796925961971283, + "learning_rate": 9.5275107144806e-06, + "loss": 0.7122, + "step": 5155 + }, + { + "epoch": 0.2837800649457868, + "grad_norm": 0.8001971244812012, + "learning_rate": 9.527326759510558e-06, + "loss": 0.8528, + "step": 5156 + }, + { + "epoch": 0.2838351037481424, + "grad_norm": 0.8605831265449524, + "learning_rate": 9.527142770514341e-06, + "loss": 0.7948, + "step": 5157 + }, + { + "epoch": 0.2838901425504981, + "grad_norm": 0.8380078077316284, + "learning_rate": 9.526958747493334e-06, + "loss": 0.8184, + "step": 5158 + }, + { + "epoch": 0.28394518135285374, + "grad_norm": 0.8758485317230225, + "learning_rate": 9.526774690448913e-06, + "loss": 0.7625, + "step": 5159 + }, + { + "epoch": 0.2840002201552094, + "grad_norm": 0.7078989744186401, + "learning_rate": 9.526590599382466e-06, + "loss": 0.8179, + "step": 5160 + }, + { + "epoch": 0.28405525895756506, + "grad_norm": 0.6668990850448608, + "learning_rate": 9.526406474295376e-06, + "loss": 0.7169, + "step": 5161 + }, + { + "epoch": 0.28411029775992075, + "grad_norm": 0.7666084170341492, + "learning_rate": 9.526222315189026e-06, + "loss": 0.8511, + "step": 5162 + }, + { + "epoch": 0.2841653365622764, + "grad_norm": 0.7390545606613159, + "learning_rate": 9.526038122064802e-06, + "loss": 0.7926, + "step": 5163 + }, + { + "epoch": 0.28422037536463207, + "grad_norm": 0.7972092032432556, + "learning_rate": 9.525853894924086e-06, + "loss": 0.9166, + "step": 5164 + }, + { + "epoch": 0.2842754141669877, + "grad_norm": 0.8988455533981323, + "learning_rate": 9.525669633768265e-06, + "loss": 0.9497, + "step": 5165 + }, + { + "epoch": 0.2843304529693434, + "grad_norm": 0.7092710137367249, + "learning_rate": 9.525485338598722e-06, + "loss": 0.7241, + "step": 5166 + }, + { + "epoch": 0.28438549177169903, + "grad_norm": 0.8630063533782959, + "learning_rate": 9.525301009416843e-06, + "loss": 0.8318, + "step": 5167 + }, + { + "epoch": 0.2844405305740547, + "grad_norm": 0.7336890697479248, + "learning_rate": 9.52511664622401e-06, + "loss": 0.7077, + "step": 5168 + }, + { + "epoch": 0.28449556937641035, + "grad_norm": 0.8156722784042358, + "learning_rate": 9.524932249021615e-06, + "loss": 0.8573, + "step": 5169 + }, + { + "epoch": 0.28455060817876604, + "grad_norm": 0.7061388492584229, + "learning_rate": 9.524747817811038e-06, + "loss": 0.7432, + "step": 5170 + }, + { + "epoch": 0.2846056469811217, + "grad_norm": 0.7948413491249084, + "learning_rate": 9.52456335259367e-06, + "loss": 0.8082, + "step": 5171 + }, + { + "epoch": 0.28466068578347736, + "grad_norm": 0.7208091020584106, + "learning_rate": 9.524378853370893e-06, + "loss": 0.7027, + "step": 5172 + }, + { + "epoch": 0.284715724585833, + "grad_norm": 0.8377540111541748, + "learning_rate": 9.524194320144096e-06, + "loss": 0.7093, + "step": 5173 + }, + { + "epoch": 0.2847707633881887, + "grad_norm": 0.8734563589096069, + "learning_rate": 9.524009752914666e-06, + "loss": 0.8422, + "step": 5174 + }, + { + "epoch": 0.2848258021905443, + "grad_norm": 0.7303940653800964, + "learning_rate": 9.523825151683989e-06, + "loss": 0.811, + "step": 5175 + }, + { + "epoch": 0.2848808409929, + "grad_norm": 0.7653842568397522, + "learning_rate": 9.523640516453455e-06, + "loss": 0.8595, + "step": 5176 + }, + { + "epoch": 0.28493587979525564, + "grad_norm": 0.7366930246353149, + "learning_rate": 9.523455847224448e-06, + "loss": 0.7832, + "step": 5177 + }, + { + "epoch": 0.28499091859761133, + "grad_norm": 0.7908505797386169, + "learning_rate": 9.523271143998357e-06, + "loss": 0.8115, + "step": 5178 + }, + { + "epoch": 0.28504595739996696, + "grad_norm": 0.8176048398017883, + "learning_rate": 9.523086406776572e-06, + "loss": 0.8377, + "step": 5179 + }, + { + "epoch": 0.28510099620232265, + "grad_norm": 0.724086344242096, + "learning_rate": 9.52290163556048e-06, + "loss": 0.7804, + "step": 5180 + }, + { + "epoch": 0.2851560350046783, + "grad_norm": 0.6461299657821655, + "learning_rate": 9.52271683035147e-06, + "loss": 0.5727, + "step": 5181 + }, + { + "epoch": 0.285211073807034, + "grad_norm": 0.7275353074073792, + "learning_rate": 9.522531991150932e-06, + "loss": 0.8345, + "step": 5182 + }, + { + "epoch": 0.2852661126093896, + "grad_norm": 0.7321951985359192, + "learning_rate": 9.522347117960253e-06, + "loss": 0.8832, + "step": 5183 + }, + { + "epoch": 0.2853211514117453, + "grad_norm": 0.7526552677154541, + "learning_rate": 9.522162210780825e-06, + "loss": 0.831, + "step": 5184 + }, + { + "epoch": 0.28537619021410093, + "grad_norm": 0.7592381238937378, + "learning_rate": 9.521977269614036e-06, + "loss": 0.7293, + "step": 5185 + }, + { + "epoch": 0.2854312290164566, + "grad_norm": 0.8060448169708252, + "learning_rate": 9.521792294461274e-06, + "loss": 0.819, + "step": 5186 + }, + { + "epoch": 0.28548626781881226, + "grad_norm": 0.7178553342819214, + "learning_rate": 9.521607285323932e-06, + "loss": 0.7526, + "step": 5187 + }, + { + "epoch": 0.28554130662116795, + "grad_norm": 0.8186969757080078, + "learning_rate": 9.521422242203401e-06, + "loss": 0.8526, + "step": 5188 + }, + { + "epoch": 0.2855963454235236, + "grad_norm": 0.8480883240699768, + "learning_rate": 9.521237165101071e-06, + "loss": 0.8088, + "step": 5189 + }, + { + "epoch": 0.28565138422587927, + "grad_norm": 0.8053719401359558, + "learning_rate": 9.521052054018333e-06, + "loss": 0.928, + "step": 5190 + }, + { + "epoch": 0.2857064230282349, + "grad_norm": 0.6937163472175598, + "learning_rate": 9.52086690895658e-06, + "loss": 0.7418, + "step": 5191 + }, + { + "epoch": 0.2857614618305906, + "grad_norm": 1.0616179704666138, + "learning_rate": 9.520681729917196e-06, + "loss": 0.8726, + "step": 5192 + }, + { + "epoch": 0.2858165006329462, + "grad_norm": 0.7504106163978577, + "learning_rate": 9.520496516901582e-06, + "loss": 0.844, + "step": 5193 + }, + { + "epoch": 0.2858715394353019, + "grad_norm": 0.7634509205818176, + "learning_rate": 9.520311269911127e-06, + "loss": 0.7595, + "step": 5194 + }, + { + "epoch": 0.28592657823765755, + "grad_norm": 0.7069799900054932, + "learning_rate": 9.52012598894722e-06, + "loss": 0.7566, + "step": 5195 + }, + { + "epoch": 0.28598161704001324, + "grad_norm": 0.695737361907959, + "learning_rate": 9.519940674011256e-06, + "loss": 0.7534, + "step": 5196 + }, + { + "epoch": 0.28603665584236887, + "grad_norm": 0.7212124466896057, + "learning_rate": 9.51975532510463e-06, + "loss": 0.8237, + "step": 5197 + }, + { + "epoch": 0.2860916946447245, + "grad_norm": 0.7274062633514404, + "learning_rate": 9.519569942228732e-06, + "loss": 0.756, + "step": 5198 + }, + { + "epoch": 0.2861467334470802, + "grad_norm": 0.7038697600364685, + "learning_rate": 9.519384525384956e-06, + "loss": 0.7308, + "step": 5199 + }, + { + "epoch": 0.2862017722494358, + "grad_norm": 0.6897109150886536, + "learning_rate": 9.519199074574694e-06, + "loss": 0.7858, + "step": 5200 + }, + { + "epoch": 0.2862568110517915, + "grad_norm": 0.8471527099609375, + "learning_rate": 9.519013589799343e-06, + "loss": 0.8198, + "step": 5201 + }, + { + "epoch": 0.28631184985414715, + "grad_norm": 0.6828129291534424, + "learning_rate": 9.518828071060295e-06, + "loss": 0.7734, + "step": 5202 + }, + { + "epoch": 0.28636688865650284, + "grad_norm": 0.7437755465507507, + "learning_rate": 9.518642518358946e-06, + "loss": 0.7669, + "step": 5203 + }, + { + "epoch": 0.28642192745885847, + "grad_norm": 0.8841923475265503, + "learning_rate": 9.518456931696689e-06, + "loss": 0.8201, + "step": 5204 + }, + { + "epoch": 0.28647696626121416, + "grad_norm": 0.9514154195785522, + "learning_rate": 9.518271311074917e-06, + "loss": 0.7864, + "step": 5205 + }, + { + "epoch": 0.2865320050635698, + "grad_norm": 0.830795407295227, + "learning_rate": 9.51808565649503e-06, + "loss": 0.8024, + "step": 5206 + }, + { + "epoch": 0.2865870438659255, + "grad_norm": 0.7274934649467468, + "learning_rate": 9.51789996795842e-06, + "loss": 0.7631, + "step": 5207 + }, + { + "epoch": 0.2866420826682811, + "grad_norm": 0.7004290223121643, + "learning_rate": 9.517714245466482e-06, + "loss": 0.7344, + "step": 5208 + }, + { + "epoch": 0.2866971214706368, + "grad_norm": 0.8559010624885559, + "learning_rate": 9.517528489020614e-06, + "loss": 0.7502, + "step": 5209 + }, + { + "epoch": 0.28675216027299244, + "grad_norm": 0.8913494348526001, + "learning_rate": 9.517342698622212e-06, + "loss": 0.8908, + "step": 5210 + }, + { + "epoch": 0.28680719907534813, + "grad_norm": 0.8375207781791687, + "learning_rate": 9.51715687427267e-06, + "loss": 0.7701, + "step": 5211 + }, + { + "epoch": 0.28686223787770376, + "grad_norm": 1.1804776191711426, + "learning_rate": 9.516971015973386e-06, + "loss": 0.8449, + "step": 5212 + }, + { + "epoch": 0.28691727668005945, + "grad_norm": 0.7260473370552063, + "learning_rate": 9.516785123725758e-06, + "loss": 0.7978, + "step": 5213 + }, + { + "epoch": 0.2869723154824151, + "grad_norm": 0.8159041404724121, + "learning_rate": 9.516599197531182e-06, + "loss": 0.7454, + "step": 5214 + }, + { + "epoch": 0.2870273542847708, + "grad_norm": 0.7850227952003479, + "learning_rate": 9.516413237391056e-06, + "loss": 0.8082, + "step": 5215 + }, + { + "epoch": 0.2870823930871264, + "grad_norm": 0.7596960067749023, + "learning_rate": 9.516227243306774e-06, + "loss": 0.7286, + "step": 5216 + }, + { + "epoch": 0.2871374318894821, + "grad_norm": 0.8763321042060852, + "learning_rate": 9.516041215279741e-06, + "loss": 0.8685, + "step": 5217 + }, + { + "epoch": 0.28719247069183773, + "grad_norm": 1.2130110263824463, + "learning_rate": 9.515855153311349e-06, + "loss": 0.8374, + "step": 5218 + }, + { + "epoch": 0.2872475094941934, + "grad_norm": 0.7578628063201904, + "learning_rate": 9.515669057402999e-06, + "loss": 0.793, + "step": 5219 + }, + { + "epoch": 0.28730254829654905, + "grad_norm": 0.9085225462913513, + "learning_rate": 9.515482927556088e-06, + "loss": 0.8366, + "step": 5220 + }, + { + "epoch": 0.28735758709890474, + "grad_norm": 0.7107900977134705, + "learning_rate": 9.515296763772017e-06, + "loss": 0.6571, + "step": 5221 + }, + { + "epoch": 0.2874126259012604, + "grad_norm": 0.7742018699645996, + "learning_rate": 9.515110566052183e-06, + "loss": 0.8387, + "step": 5222 + }, + { + "epoch": 0.28746766470361607, + "grad_norm": 0.8934319615364075, + "learning_rate": 9.514924334397987e-06, + "loss": 0.8546, + "step": 5223 + }, + { + "epoch": 0.2875227035059717, + "grad_norm": 0.720245897769928, + "learning_rate": 9.51473806881083e-06, + "loss": 0.7459, + "step": 5224 + }, + { + "epoch": 0.2875777423083274, + "grad_norm": 0.7074370384216309, + "learning_rate": 9.514551769292109e-06, + "loss": 0.8598, + "step": 5225 + }, + { + "epoch": 0.287632781110683, + "grad_norm": 0.7608621120452881, + "learning_rate": 9.514365435843226e-06, + "loss": 0.7263, + "step": 5226 + }, + { + "epoch": 0.2876878199130387, + "grad_norm": 0.7581011652946472, + "learning_rate": 9.51417906846558e-06, + "loss": 0.7498, + "step": 5227 + }, + { + "epoch": 0.28774285871539435, + "grad_norm": 0.8184412121772766, + "learning_rate": 9.513992667160572e-06, + "loss": 0.6889, + "step": 5228 + }, + { + "epoch": 0.28779789751775003, + "grad_norm": 0.6835145354270935, + "learning_rate": 9.513806231929605e-06, + "loss": 0.7399, + "step": 5229 + }, + { + "epoch": 0.28785293632010567, + "grad_norm": 0.7601536512374878, + "learning_rate": 9.513619762774077e-06, + "loss": 0.846, + "step": 5230 + }, + { + "epoch": 0.28790797512246136, + "grad_norm": 0.781491219997406, + "learning_rate": 9.513433259695392e-06, + "loss": 0.8326, + "step": 5231 + }, + { + "epoch": 0.287963013924817, + "grad_norm": 0.7978106141090393, + "learning_rate": 9.513246722694951e-06, + "loss": 0.7917, + "step": 5232 + }, + { + "epoch": 0.2880180527271727, + "grad_norm": 0.8071381449699402, + "learning_rate": 9.513060151774156e-06, + "loss": 0.8054, + "step": 5233 + }, + { + "epoch": 0.2880730915295283, + "grad_norm": 0.815567135810852, + "learning_rate": 9.512873546934406e-06, + "loss": 0.8647, + "step": 5234 + }, + { + "epoch": 0.288128130331884, + "grad_norm": 0.8255048990249634, + "learning_rate": 9.512686908177111e-06, + "loss": 0.9011, + "step": 5235 + }, + { + "epoch": 0.28818316913423964, + "grad_norm": 0.8392062187194824, + "learning_rate": 9.512500235503666e-06, + "loss": 0.8778, + "step": 5236 + }, + { + "epoch": 0.2882382079365953, + "grad_norm": 0.7256191372871399, + "learning_rate": 9.512313528915478e-06, + "loss": 0.7231, + "step": 5237 + }, + { + "epoch": 0.28829324673895096, + "grad_norm": 0.9041032195091248, + "learning_rate": 9.51212678841395e-06, + "loss": 0.8469, + "step": 5238 + }, + { + "epoch": 0.28834828554130665, + "grad_norm": 0.7857525944709778, + "learning_rate": 9.511940014000485e-06, + "loss": 0.7447, + "step": 5239 + }, + { + "epoch": 0.2884033243436623, + "grad_norm": 0.6925225257873535, + "learning_rate": 9.511753205676485e-06, + "loss": 0.8302, + "step": 5240 + }, + { + "epoch": 0.2884583631460179, + "grad_norm": 0.7253623008728027, + "learning_rate": 9.511566363443356e-06, + "loss": 0.8373, + "step": 5241 + }, + { + "epoch": 0.2885134019483736, + "grad_norm": 0.7198607921600342, + "learning_rate": 9.511379487302504e-06, + "loss": 0.79, + "step": 5242 + }, + { + "epoch": 0.28856844075072924, + "grad_norm": 0.7966421246528625, + "learning_rate": 9.511192577255328e-06, + "loss": 0.7933, + "step": 5243 + }, + { + "epoch": 0.2886234795530849, + "grad_norm": 0.9159359931945801, + "learning_rate": 9.511005633303239e-06, + "loss": 0.7254, + "step": 5244 + }, + { + "epoch": 0.28867851835544056, + "grad_norm": 0.9514481425285339, + "learning_rate": 9.510818655447638e-06, + "loss": 0.8916, + "step": 5245 + }, + { + "epoch": 0.28873355715779625, + "grad_norm": 0.7505099773406982, + "learning_rate": 9.510631643689932e-06, + "loss": 0.765, + "step": 5246 + }, + { + "epoch": 0.2887885959601519, + "grad_norm": 0.7824658751487732, + "learning_rate": 9.510444598031526e-06, + "loss": 0.6972, + "step": 5247 + }, + { + "epoch": 0.2888436347625076, + "grad_norm": 0.7778681516647339, + "learning_rate": 9.510257518473824e-06, + "loss": 0.8705, + "step": 5248 + }, + { + "epoch": 0.2888986735648632, + "grad_norm": 0.6785199642181396, + "learning_rate": 9.510070405018235e-06, + "loss": 0.6889, + "step": 5249 + }, + { + "epoch": 0.2889537123672189, + "grad_norm": 0.7045316100120544, + "learning_rate": 9.509883257666164e-06, + "loss": 0.7979, + "step": 5250 + }, + { + "epoch": 0.28900875116957453, + "grad_norm": 1.3174562454223633, + "learning_rate": 9.509696076419018e-06, + "loss": 0.8802, + "step": 5251 + }, + { + "epoch": 0.2890637899719302, + "grad_norm": 1.1800767183303833, + "learning_rate": 9.509508861278205e-06, + "loss": 0.9246, + "step": 5252 + }, + { + "epoch": 0.28911882877428585, + "grad_norm": 0.7057580947875977, + "learning_rate": 9.509321612245128e-06, + "loss": 0.7565, + "step": 5253 + }, + { + "epoch": 0.28917386757664154, + "grad_norm": 0.7681905031204224, + "learning_rate": 9.509134329321197e-06, + "loss": 0.8678, + "step": 5254 + }, + { + "epoch": 0.2892289063789972, + "grad_norm": 0.96025550365448, + "learning_rate": 9.50894701250782e-06, + "loss": 0.9108, + "step": 5255 + }, + { + "epoch": 0.28928394518135286, + "grad_norm": 0.7786841988563538, + "learning_rate": 9.508759661806405e-06, + "loss": 0.7747, + "step": 5256 + }, + { + "epoch": 0.2893389839837085, + "grad_norm": 0.7073540091514587, + "learning_rate": 9.508572277218358e-06, + "loss": 0.7573, + "step": 5257 + }, + { + "epoch": 0.2893940227860642, + "grad_norm": 0.6648856401443481, + "learning_rate": 9.50838485874509e-06, + "loss": 0.7294, + "step": 5258 + }, + { + "epoch": 0.2894490615884198, + "grad_norm": 0.6794270873069763, + "learning_rate": 9.508197406388007e-06, + "loss": 0.7001, + "step": 5259 + }, + { + "epoch": 0.2895041003907755, + "grad_norm": 0.6819350123405457, + "learning_rate": 9.50800992014852e-06, + "loss": 0.7114, + "step": 5260 + }, + { + "epoch": 0.28955913919313114, + "grad_norm": 0.6616997122764587, + "learning_rate": 9.507822400028036e-06, + "loss": 0.7108, + "step": 5261 + }, + { + "epoch": 0.28961417799548683, + "grad_norm": 0.7447230219841003, + "learning_rate": 9.507634846027966e-06, + "loss": 0.7865, + "step": 5262 + }, + { + "epoch": 0.28966921679784247, + "grad_norm": 0.7826278209686279, + "learning_rate": 9.50744725814972e-06, + "loss": 0.7922, + "step": 5263 + }, + { + "epoch": 0.28972425560019816, + "grad_norm": 0.8054459095001221, + "learning_rate": 9.507259636394706e-06, + "loss": 0.795, + "step": 5264 + }, + { + "epoch": 0.2897792944025538, + "grad_norm": 0.9539191722869873, + "learning_rate": 9.507071980764335e-06, + "loss": 0.9495, + "step": 5265 + }, + { + "epoch": 0.2898343332049095, + "grad_norm": 0.8877993226051331, + "learning_rate": 9.506884291260017e-06, + "loss": 0.8418, + "step": 5266 + }, + { + "epoch": 0.2898893720072651, + "grad_norm": 0.6620327234268188, + "learning_rate": 9.506696567883164e-06, + "loss": 0.6285, + "step": 5267 + }, + { + "epoch": 0.2899444108096208, + "grad_norm": 0.7604434490203857, + "learning_rate": 9.506508810635187e-06, + "loss": 0.8562, + "step": 5268 + }, + { + "epoch": 0.28999944961197643, + "grad_norm": 0.8181812763214111, + "learning_rate": 9.506321019517494e-06, + "loss": 0.905, + "step": 5269 + }, + { + "epoch": 0.2900544884143321, + "grad_norm": 0.7776391506195068, + "learning_rate": 9.5061331945315e-06, + "loss": 0.8871, + "step": 5270 + }, + { + "epoch": 0.29010952721668776, + "grad_norm": 0.8125039339065552, + "learning_rate": 9.505945335678613e-06, + "loss": 0.7254, + "step": 5271 + }, + { + "epoch": 0.29016456601904345, + "grad_norm": 0.7229846715927124, + "learning_rate": 9.50575744296025e-06, + "loss": 0.8192, + "step": 5272 + }, + { + "epoch": 0.2902196048213991, + "grad_norm": 0.72443026304245, + "learning_rate": 9.505569516377817e-06, + "loss": 0.7813, + "step": 5273 + }, + { + "epoch": 0.29027464362375477, + "grad_norm": 0.6798073053359985, + "learning_rate": 9.505381555932731e-06, + "loss": 0.7655, + "step": 5274 + }, + { + "epoch": 0.2903296824261104, + "grad_norm": 1.0805624723434448, + "learning_rate": 9.505193561626404e-06, + "loss": 0.9035, + "step": 5275 + }, + { + "epoch": 0.2903847212284661, + "grad_norm": 0.7579694986343384, + "learning_rate": 9.505005533460247e-06, + "loss": 0.8612, + "step": 5276 + }, + { + "epoch": 0.2904397600308217, + "grad_norm": 1.2496099472045898, + "learning_rate": 9.504817471435676e-06, + "loss": 0.813, + "step": 5277 + }, + { + "epoch": 0.2904947988331774, + "grad_norm": 0.6915673017501831, + "learning_rate": 9.504629375554102e-06, + "loss": 0.6891, + "step": 5278 + }, + { + "epoch": 0.29054983763553305, + "grad_norm": 0.8581767082214355, + "learning_rate": 9.504441245816937e-06, + "loss": 0.7137, + "step": 5279 + }, + { + "epoch": 0.29060487643788874, + "grad_norm": 0.7469545006752014, + "learning_rate": 9.504253082225601e-06, + "loss": 0.7621, + "step": 5280 + }, + { + "epoch": 0.29065991524024437, + "grad_norm": 0.7725615501403809, + "learning_rate": 9.504064884781503e-06, + "loss": 0.7988, + "step": 5281 + }, + { + "epoch": 0.29071495404260006, + "grad_norm": 1.0187722444534302, + "learning_rate": 9.503876653486058e-06, + "loss": 0.7772, + "step": 5282 + }, + { + "epoch": 0.2907699928449557, + "grad_norm": 0.675574779510498, + "learning_rate": 9.503688388340683e-06, + "loss": 0.7096, + "step": 5283 + }, + { + "epoch": 0.2908250316473113, + "grad_norm": 0.7980207800865173, + "learning_rate": 9.503500089346792e-06, + "loss": 0.8291, + "step": 5284 + }, + { + "epoch": 0.290880070449667, + "grad_norm": 0.6891655325889587, + "learning_rate": 9.503311756505797e-06, + "loss": 0.7186, + "step": 5285 + }, + { + "epoch": 0.29093510925202265, + "grad_norm": 0.7273408770561218, + "learning_rate": 9.50312338981912e-06, + "loss": 0.7483, + "step": 5286 + }, + { + "epoch": 0.29099014805437834, + "grad_norm": 0.7346869111061096, + "learning_rate": 9.50293498928817e-06, + "loss": 0.766, + "step": 5287 + }, + { + "epoch": 0.291045186856734, + "grad_norm": 0.7627394795417786, + "learning_rate": 9.502746554914368e-06, + "loss": 0.867, + "step": 5288 + }, + { + "epoch": 0.29110022565908966, + "grad_norm": 0.8477200865745544, + "learning_rate": 9.502558086699128e-06, + "loss": 0.8317, + "step": 5289 + }, + { + "epoch": 0.2911552644614453, + "grad_norm": 0.7696006894111633, + "learning_rate": 9.502369584643867e-06, + "loss": 0.7814, + "step": 5290 + }, + { + "epoch": 0.291210303263801, + "grad_norm": 0.7614455819129944, + "learning_rate": 9.502181048749999e-06, + "loss": 0.7398, + "step": 5291 + }, + { + "epoch": 0.2912653420661566, + "grad_norm": 0.7877628207206726, + "learning_rate": 9.501992479018946e-06, + "loss": 0.8731, + "step": 5292 + }, + { + "epoch": 0.2913203808685123, + "grad_norm": 0.7455846667289734, + "learning_rate": 9.50180387545212e-06, + "loss": 0.7059, + "step": 5293 + }, + { + "epoch": 0.29137541967086794, + "grad_norm": 1.145520567893982, + "learning_rate": 9.501615238050944e-06, + "loss": 0.6968, + "step": 5294 + }, + { + "epoch": 0.29143045847322363, + "grad_norm": 0.8100234866142273, + "learning_rate": 9.501426566816831e-06, + "loss": 0.8122, + "step": 5295 + }, + { + "epoch": 0.29148549727557926, + "grad_norm": 0.6813066005706787, + "learning_rate": 9.501237861751203e-06, + "loss": 0.6718, + "step": 5296 + }, + { + "epoch": 0.29154053607793495, + "grad_norm": 0.7400195002555847, + "learning_rate": 9.501049122855473e-06, + "loss": 0.802, + "step": 5297 + }, + { + "epoch": 0.2915955748802906, + "grad_norm": 0.7948681712150574, + "learning_rate": 9.500860350131065e-06, + "loss": 0.8237, + "step": 5298 + }, + { + "epoch": 0.2916506136826463, + "grad_norm": 0.772093653678894, + "learning_rate": 9.500671543579394e-06, + "loss": 0.7687, + "step": 5299 + }, + { + "epoch": 0.2917056524850019, + "grad_norm": 0.7468486428260803, + "learning_rate": 9.500482703201881e-06, + "loss": 0.7827, + "step": 5300 + }, + { + "epoch": 0.2917606912873576, + "grad_norm": 0.7284440398216248, + "learning_rate": 9.500293828999945e-06, + "loss": 0.8086, + "step": 5301 + }, + { + "epoch": 0.29181573008971323, + "grad_norm": 0.8014211654663086, + "learning_rate": 9.500104920975005e-06, + "loss": 0.8409, + "step": 5302 + }, + { + "epoch": 0.2918707688920689, + "grad_norm": 0.7588346004486084, + "learning_rate": 9.49991597912848e-06, + "loss": 0.7149, + "step": 5303 + }, + { + "epoch": 0.29192580769442456, + "grad_norm": 0.8098518252372742, + "learning_rate": 9.499727003461794e-06, + "loss": 0.8375, + "step": 5304 + }, + { + "epoch": 0.29198084649678024, + "grad_norm": 0.8502426743507385, + "learning_rate": 9.499537993976363e-06, + "loss": 0.8177, + "step": 5305 + }, + { + "epoch": 0.2920358852991359, + "grad_norm": 0.8010903596878052, + "learning_rate": 9.499348950673607e-06, + "loss": 0.8457, + "step": 5306 + }, + { + "epoch": 0.29209092410149157, + "grad_norm": 0.6628156304359436, + "learning_rate": 9.49915987355495e-06, + "loss": 0.7327, + "step": 5307 + }, + { + "epoch": 0.2921459629038472, + "grad_norm": 0.7414939999580383, + "learning_rate": 9.49897076262181e-06, + "loss": 0.8271, + "step": 5308 + }, + { + "epoch": 0.2922010017062029, + "grad_norm": 0.7490847706794739, + "learning_rate": 9.498781617875613e-06, + "loss": 0.7689, + "step": 5309 + }, + { + "epoch": 0.2922560405085585, + "grad_norm": 0.7913424968719482, + "learning_rate": 9.498592439317777e-06, + "loss": 0.8571, + "step": 5310 + }, + { + "epoch": 0.2923110793109142, + "grad_norm": 0.6903867125511169, + "learning_rate": 9.498403226949724e-06, + "loss": 0.7325, + "step": 5311 + }, + { + "epoch": 0.29236611811326985, + "grad_norm": 0.8087130188941956, + "learning_rate": 9.498213980772875e-06, + "loss": 0.8167, + "step": 5312 + }, + { + "epoch": 0.29242115691562554, + "grad_norm": 1.1316752433776855, + "learning_rate": 9.498024700788655e-06, + "loss": 0.912, + "step": 5313 + }, + { + "epoch": 0.29247619571798117, + "grad_norm": 0.8701719045639038, + "learning_rate": 9.497835386998486e-06, + "loss": 0.8728, + "step": 5314 + }, + { + "epoch": 0.29253123452033686, + "grad_norm": 0.6688953638076782, + "learning_rate": 9.49764603940379e-06, + "loss": 0.6561, + "step": 5315 + }, + { + "epoch": 0.2925862733226925, + "grad_norm": 0.8067505359649658, + "learning_rate": 9.49745665800599e-06, + "loss": 0.8419, + "step": 5316 + }, + { + "epoch": 0.2926413121250482, + "grad_norm": 0.7157390117645264, + "learning_rate": 9.49726724280651e-06, + "loss": 0.7964, + "step": 5317 + }, + { + "epoch": 0.2926963509274038, + "grad_norm": 0.7038627862930298, + "learning_rate": 9.497077793806772e-06, + "loss": 0.7343, + "step": 5318 + }, + { + "epoch": 0.2927513897297595, + "grad_norm": 0.7674478888511658, + "learning_rate": 9.4968883110082e-06, + "loss": 0.7624, + "step": 5319 + }, + { + "epoch": 0.29280642853211514, + "grad_norm": 0.6708847284317017, + "learning_rate": 9.496698794412223e-06, + "loss": 0.6554, + "step": 5320 + }, + { + "epoch": 0.2928614673344708, + "grad_norm": 0.8332329392433167, + "learning_rate": 9.49650924402026e-06, + "loss": 0.9357, + "step": 5321 + }, + { + "epoch": 0.29291650613682646, + "grad_norm": 0.7601341605186462, + "learning_rate": 9.496319659833737e-06, + "loss": 0.8208, + "step": 5322 + }, + { + "epoch": 0.29297154493918215, + "grad_norm": 0.8320396542549133, + "learning_rate": 9.496130041854077e-06, + "loss": 0.8423, + "step": 5323 + }, + { + "epoch": 0.2930265837415378, + "grad_norm": 0.8242839574813843, + "learning_rate": 9.49594039008271e-06, + "loss": 0.9101, + "step": 5324 + }, + { + "epoch": 0.29308162254389347, + "grad_norm": 0.8906320333480835, + "learning_rate": 9.495750704521058e-06, + "loss": 0.7343, + "step": 5325 + }, + { + "epoch": 0.2931366613462491, + "grad_norm": 0.7964318990707397, + "learning_rate": 9.495560985170546e-06, + "loss": 0.7789, + "step": 5326 + }, + { + "epoch": 0.29319170014860474, + "grad_norm": 0.8267771601676941, + "learning_rate": 9.495371232032602e-06, + "loss": 0.7447, + "step": 5327 + }, + { + "epoch": 0.29324673895096043, + "grad_norm": 0.8120046257972717, + "learning_rate": 9.49518144510865e-06, + "loss": 0.7803, + "step": 5328 + }, + { + "epoch": 0.29330177775331606, + "grad_norm": 0.7314801812171936, + "learning_rate": 9.494991624400119e-06, + "loss": 0.6758, + "step": 5329 + }, + { + "epoch": 0.29335681655567175, + "grad_norm": 0.6989930272102356, + "learning_rate": 9.494801769908433e-06, + "loss": 0.7945, + "step": 5330 + }, + { + "epoch": 0.2934118553580274, + "grad_norm": 0.7804785966873169, + "learning_rate": 9.494611881635021e-06, + "loss": 0.7977, + "step": 5331 + }, + { + "epoch": 0.2934668941603831, + "grad_norm": 0.8377045392990112, + "learning_rate": 9.494421959581308e-06, + "loss": 0.8077, + "step": 5332 + }, + { + "epoch": 0.2935219329627387, + "grad_norm": 0.7463418245315552, + "learning_rate": 9.494232003748724e-06, + "loss": 0.783, + "step": 5333 + }, + { + "epoch": 0.2935769717650944, + "grad_norm": 0.7598912715911865, + "learning_rate": 9.494042014138695e-06, + "loss": 0.7869, + "step": 5334 + }, + { + "epoch": 0.29363201056745003, + "grad_norm": 0.7634113430976868, + "learning_rate": 9.493851990752648e-06, + "loss": 0.8108, + "step": 5335 + }, + { + "epoch": 0.2936870493698057, + "grad_norm": 0.8056474328041077, + "learning_rate": 9.493661933592013e-06, + "loss": 0.7921, + "step": 5336 + }, + { + "epoch": 0.29374208817216135, + "grad_norm": 0.8699371218681335, + "learning_rate": 9.493471842658219e-06, + "loss": 0.8833, + "step": 5337 + }, + { + "epoch": 0.29379712697451704, + "grad_norm": 0.8803261518478394, + "learning_rate": 9.493281717952691e-06, + "loss": 0.7848, + "step": 5338 + }, + { + "epoch": 0.2938521657768727, + "grad_norm": 0.7678453922271729, + "learning_rate": 9.493091559476864e-06, + "loss": 0.836, + "step": 5339 + }, + { + "epoch": 0.29390720457922836, + "grad_norm": 0.7653701305389404, + "learning_rate": 9.49290136723216e-06, + "loss": 0.8215, + "step": 5340 + }, + { + "epoch": 0.293962243381584, + "grad_norm": 0.768120527267456, + "learning_rate": 9.492711141220013e-06, + "loss": 0.7498, + "step": 5341 + }, + { + "epoch": 0.2940172821839397, + "grad_norm": 0.7665749788284302, + "learning_rate": 9.492520881441854e-06, + "loss": 0.7883, + "step": 5342 + }, + { + "epoch": 0.2940723209862953, + "grad_norm": 0.7405015230178833, + "learning_rate": 9.492330587899108e-06, + "loss": 0.8112, + "step": 5343 + }, + { + "epoch": 0.294127359788651, + "grad_norm": 0.7183459997177124, + "learning_rate": 9.492140260593208e-06, + "loss": 0.8227, + "step": 5344 + }, + { + "epoch": 0.29418239859100664, + "grad_norm": 0.7453572154045105, + "learning_rate": 9.491949899525585e-06, + "loss": 0.8148, + "step": 5345 + }, + { + "epoch": 0.29423743739336233, + "grad_norm": 0.8963750600814819, + "learning_rate": 9.491759504697669e-06, + "loss": 0.9261, + "step": 5346 + }, + { + "epoch": 0.29429247619571797, + "grad_norm": 0.7631667256355286, + "learning_rate": 9.49156907611089e-06, + "loss": 0.7708, + "step": 5347 + }, + { + "epoch": 0.29434751499807366, + "grad_norm": 0.6324381232261658, + "learning_rate": 9.49137861376668e-06, + "loss": 0.6688, + "step": 5348 + }, + { + "epoch": 0.2944025538004293, + "grad_norm": 0.6969807147979736, + "learning_rate": 9.491188117666472e-06, + "loss": 0.7516, + "step": 5349 + }, + { + "epoch": 0.294457592602785, + "grad_norm": 1.633340835571289, + "learning_rate": 9.490997587811697e-06, + "loss": 0.8111, + "step": 5350 + }, + { + "epoch": 0.2945126314051406, + "grad_norm": 0.7084371447563171, + "learning_rate": 9.490807024203785e-06, + "loss": 0.8375, + "step": 5351 + }, + { + "epoch": 0.2945676702074963, + "grad_norm": 0.7335958480834961, + "learning_rate": 9.490616426844169e-06, + "loss": 0.7884, + "step": 5352 + }, + { + "epoch": 0.29462270900985194, + "grad_norm": 0.7560276985168457, + "learning_rate": 9.490425795734282e-06, + "loss": 0.8918, + "step": 5353 + }, + { + "epoch": 0.2946777478122076, + "grad_norm": 0.9185894727706909, + "learning_rate": 9.490235130875557e-06, + "loss": 0.7976, + "step": 5354 + }, + { + "epoch": 0.29473278661456326, + "grad_norm": 0.7871553897857666, + "learning_rate": 9.490044432269427e-06, + "loss": 0.8564, + "step": 5355 + }, + { + "epoch": 0.29478782541691895, + "grad_norm": 0.8736812472343445, + "learning_rate": 9.489853699917326e-06, + "loss": 0.8114, + "step": 5356 + }, + { + "epoch": 0.2948428642192746, + "grad_norm": 0.8068968653678894, + "learning_rate": 9.489662933820684e-06, + "loss": 0.9198, + "step": 5357 + }, + { + "epoch": 0.29489790302163027, + "grad_norm": 0.7816325426101685, + "learning_rate": 9.489472133980939e-06, + "loss": 0.8012, + "step": 5358 + }, + { + "epoch": 0.2949529418239859, + "grad_norm": 0.7248200178146362, + "learning_rate": 9.489281300399522e-06, + "loss": 0.8099, + "step": 5359 + }, + { + "epoch": 0.2950079806263416, + "grad_norm": 0.7887724041938782, + "learning_rate": 9.48909043307787e-06, + "loss": 0.884, + "step": 5360 + }, + { + "epoch": 0.2950630194286972, + "grad_norm": 0.765163004398346, + "learning_rate": 9.488899532017415e-06, + "loss": 0.8563, + "step": 5361 + }, + { + "epoch": 0.2951180582310529, + "grad_norm": 0.7658557295799255, + "learning_rate": 9.488708597219592e-06, + "loss": 0.8897, + "step": 5362 + }, + { + "epoch": 0.29517309703340855, + "grad_norm": 0.6653227806091309, + "learning_rate": 9.488517628685838e-06, + "loss": 0.7107, + "step": 5363 + }, + { + "epoch": 0.29522813583576424, + "grad_norm": 0.787739098072052, + "learning_rate": 9.488326626417586e-06, + "loss": 0.8181, + "step": 5364 + }, + { + "epoch": 0.29528317463811987, + "grad_norm": 0.7822532057762146, + "learning_rate": 9.488135590416275e-06, + "loss": 0.8238, + "step": 5365 + }, + { + "epoch": 0.29533821344047556, + "grad_norm": 0.7797419428825378, + "learning_rate": 9.487944520683334e-06, + "loss": 0.8484, + "step": 5366 + }, + { + "epoch": 0.2953932522428312, + "grad_norm": 0.7230222225189209, + "learning_rate": 9.487753417220207e-06, + "loss": 0.8193, + "step": 5367 + }, + { + "epoch": 0.2954482910451869, + "grad_norm": 0.8256810307502747, + "learning_rate": 9.487562280028325e-06, + "loss": 0.7691, + "step": 5368 + }, + { + "epoch": 0.2955033298475425, + "grad_norm": 0.7704648375511169, + "learning_rate": 9.487371109109127e-06, + "loss": 0.8235, + "step": 5369 + }, + { + "epoch": 0.29555836864989815, + "grad_norm": 0.7580391764640808, + "learning_rate": 9.487179904464048e-06, + "loss": 0.7911, + "step": 5370 + }, + { + "epoch": 0.29561340745225384, + "grad_norm": 0.7211806774139404, + "learning_rate": 9.486988666094526e-06, + "loss": 0.7188, + "step": 5371 + }, + { + "epoch": 0.2956684462546095, + "grad_norm": 0.8375828862190247, + "learning_rate": 9.486797394001999e-06, + "loss": 0.881, + "step": 5372 + }, + { + "epoch": 0.29572348505696516, + "grad_norm": 0.8500093221664429, + "learning_rate": 9.486606088187903e-06, + "loss": 0.8632, + "step": 5373 + }, + { + "epoch": 0.2957785238593208, + "grad_norm": 0.7754727005958557, + "learning_rate": 9.486414748653677e-06, + "loss": 0.8124, + "step": 5374 + }, + { + "epoch": 0.2958335626616765, + "grad_norm": 0.9395208954811096, + "learning_rate": 9.486223375400759e-06, + "loss": 0.8046, + "step": 5375 + }, + { + "epoch": 0.2958886014640321, + "grad_norm": 0.7587517499923706, + "learning_rate": 9.486031968430587e-06, + "loss": 0.7852, + "step": 5376 + }, + { + "epoch": 0.2959436402663878, + "grad_norm": 0.6921781301498413, + "learning_rate": 9.485840527744599e-06, + "loss": 0.7392, + "step": 5377 + }, + { + "epoch": 0.29599867906874344, + "grad_norm": 0.8768522143363953, + "learning_rate": 9.485649053344233e-06, + "loss": 0.7819, + "step": 5378 + }, + { + "epoch": 0.29605371787109913, + "grad_norm": 0.7565680146217346, + "learning_rate": 9.485457545230932e-06, + "loss": 0.7489, + "step": 5379 + }, + { + "epoch": 0.29610875667345476, + "grad_norm": 0.7760992050170898, + "learning_rate": 9.485266003406132e-06, + "loss": 0.8129, + "step": 5380 + }, + { + "epoch": 0.29616379547581045, + "grad_norm": 0.7726097106933594, + "learning_rate": 9.485074427871272e-06, + "loss": 0.725, + "step": 5381 + }, + { + "epoch": 0.2962188342781661, + "grad_norm": 0.6885473728179932, + "learning_rate": 9.484882818627796e-06, + "loss": 0.685, + "step": 5382 + }, + { + "epoch": 0.2962738730805218, + "grad_norm": 0.776509702205658, + "learning_rate": 9.484691175677138e-06, + "loss": 0.8077, + "step": 5383 + }, + { + "epoch": 0.2963289118828774, + "grad_norm": 0.7436297535896301, + "learning_rate": 9.484499499020744e-06, + "loss": 0.8161, + "step": 5384 + }, + { + "epoch": 0.2963839506852331, + "grad_norm": 0.7604314088821411, + "learning_rate": 9.484307788660052e-06, + "loss": 0.825, + "step": 5385 + }, + { + "epoch": 0.29643898948758873, + "grad_norm": 0.7230789065361023, + "learning_rate": 9.484116044596501e-06, + "loss": 0.8005, + "step": 5386 + }, + { + "epoch": 0.2964940282899444, + "grad_norm": 0.820442259311676, + "learning_rate": 9.483924266831536e-06, + "loss": 0.789, + "step": 5387 + }, + { + "epoch": 0.29654906709230006, + "grad_norm": 0.7514582276344299, + "learning_rate": 9.483732455366596e-06, + "loss": 0.8531, + "step": 5388 + }, + { + "epoch": 0.29660410589465575, + "grad_norm": 0.6671503782272339, + "learning_rate": 9.483540610203124e-06, + "loss": 0.7627, + "step": 5389 + }, + { + "epoch": 0.2966591446970114, + "grad_norm": 0.6955942511558533, + "learning_rate": 9.483348731342559e-06, + "loss": 0.726, + "step": 5390 + }, + { + "epoch": 0.29671418349936707, + "grad_norm": 0.769781768321991, + "learning_rate": 9.483156818786347e-06, + "loss": 0.8064, + "step": 5391 + }, + { + "epoch": 0.2967692223017227, + "grad_norm": 1.0764707326889038, + "learning_rate": 9.482964872535927e-06, + "loss": 0.8249, + "step": 5392 + }, + { + "epoch": 0.2968242611040784, + "grad_norm": 1.0508921146392822, + "learning_rate": 9.482772892592744e-06, + "loss": 0.706, + "step": 5393 + }, + { + "epoch": 0.296879299906434, + "grad_norm": 0.6442564129829407, + "learning_rate": 9.482580878958239e-06, + "loss": 0.6025, + "step": 5394 + }, + { + "epoch": 0.2969343387087897, + "grad_norm": 0.7622735500335693, + "learning_rate": 9.482388831633856e-06, + "loss": 0.7639, + "step": 5395 + }, + { + "epoch": 0.29698937751114535, + "grad_norm": 0.8179057240486145, + "learning_rate": 9.482196750621038e-06, + "loss": 0.7641, + "step": 5396 + }, + { + "epoch": 0.29704441631350104, + "grad_norm": 0.7955192923545837, + "learning_rate": 9.48200463592123e-06, + "loss": 0.8407, + "step": 5397 + }, + { + "epoch": 0.29709945511585667, + "grad_norm": 0.7909773588180542, + "learning_rate": 9.481812487535875e-06, + "loss": 0.7833, + "step": 5398 + }, + { + "epoch": 0.29715449391821236, + "grad_norm": 0.8409042954444885, + "learning_rate": 9.481620305466417e-06, + "loss": 0.7788, + "step": 5399 + }, + { + "epoch": 0.297209532720568, + "grad_norm": 0.7521414160728455, + "learning_rate": 9.4814280897143e-06, + "loss": 0.7192, + "step": 5400 + }, + { + "epoch": 0.2972645715229237, + "grad_norm": 0.7016280889511108, + "learning_rate": 9.481235840280969e-06, + "loss": 0.7181, + "step": 5401 + }, + { + "epoch": 0.2973196103252793, + "grad_norm": 0.7257362604141235, + "learning_rate": 9.48104355716787e-06, + "loss": 0.7845, + "step": 5402 + }, + { + "epoch": 0.297374649127635, + "grad_norm": 0.8048765659332275, + "learning_rate": 9.480851240376445e-06, + "loss": 0.7921, + "step": 5403 + }, + { + "epoch": 0.29742968792999064, + "grad_norm": 0.8715546131134033, + "learning_rate": 9.480658889908143e-06, + "loss": 0.856, + "step": 5404 + }, + { + "epoch": 0.2974847267323463, + "grad_norm": 0.7211160063743591, + "learning_rate": 9.480466505764408e-06, + "loss": 0.7687, + "step": 5405 + }, + { + "epoch": 0.29753976553470196, + "grad_norm": 0.8749645352363586, + "learning_rate": 9.480274087946686e-06, + "loss": 0.8419, + "step": 5406 + }, + { + "epoch": 0.29759480433705765, + "grad_norm": 0.7986398935317993, + "learning_rate": 9.480081636456424e-06, + "loss": 0.8309, + "step": 5407 + }, + { + "epoch": 0.2976498431394133, + "grad_norm": 0.8435508012771606, + "learning_rate": 9.479889151295067e-06, + "loss": 0.7457, + "step": 5408 + }, + { + "epoch": 0.297704881941769, + "grad_norm": 0.8725010752677917, + "learning_rate": 9.479696632464063e-06, + "loss": 0.8069, + "step": 5409 + }, + { + "epoch": 0.2977599207441246, + "grad_norm": 0.7364320158958435, + "learning_rate": 9.479504079964856e-06, + "loss": 0.8316, + "step": 5410 + }, + { + "epoch": 0.2978149595464803, + "grad_norm": 0.7967824935913086, + "learning_rate": 9.479311493798898e-06, + "loss": 0.7689, + "step": 5411 + }, + { + "epoch": 0.29786999834883593, + "grad_norm": 0.8415414094924927, + "learning_rate": 9.479118873967632e-06, + "loss": 0.8288, + "step": 5412 + }, + { + "epoch": 0.29792503715119156, + "grad_norm": 0.9723265767097473, + "learning_rate": 9.478926220472508e-06, + "loss": 0.7422, + "step": 5413 + }, + { + "epoch": 0.29798007595354725, + "grad_norm": 0.7203155159950256, + "learning_rate": 9.478733533314974e-06, + "loss": 0.707, + "step": 5414 + }, + { + "epoch": 0.2980351147559029, + "grad_norm": 0.7643926739692688, + "learning_rate": 9.478540812496478e-06, + "loss": 0.7793, + "step": 5415 + }, + { + "epoch": 0.2980901535582586, + "grad_norm": 0.9177087545394897, + "learning_rate": 9.478348058018467e-06, + "loss": 0.865, + "step": 5416 + }, + { + "epoch": 0.2981451923606142, + "grad_norm": 0.678931713104248, + "learning_rate": 9.478155269882392e-06, + "loss": 0.7716, + "step": 5417 + }, + { + "epoch": 0.2982002311629699, + "grad_norm": 0.8440513610839844, + "learning_rate": 9.4779624480897e-06, + "loss": 0.8904, + "step": 5418 + }, + { + "epoch": 0.29825526996532553, + "grad_norm": 0.8508756756782532, + "learning_rate": 9.47776959264184e-06, + "loss": 0.7994, + "step": 5419 + }, + { + "epoch": 0.2983103087676812, + "grad_norm": 0.8736951947212219, + "learning_rate": 9.477576703540265e-06, + "loss": 0.8374, + "step": 5420 + }, + { + "epoch": 0.29836534757003685, + "grad_norm": 0.8063240051269531, + "learning_rate": 9.47738378078642e-06, + "loss": 0.7217, + "step": 5421 + }, + { + "epoch": 0.29842038637239254, + "grad_norm": 1.1495088338851929, + "learning_rate": 9.477190824381757e-06, + "loss": 0.8902, + "step": 5422 + }, + { + "epoch": 0.2984754251747482, + "grad_norm": 1.0241554975509644, + "learning_rate": 9.476997834327725e-06, + "loss": 0.9354, + "step": 5423 + }, + { + "epoch": 0.29853046397710387, + "grad_norm": 0.939950168132782, + "learning_rate": 9.476804810625779e-06, + "loss": 0.8714, + "step": 5424 + }, + { + "epoch": 0.2985855027794595, + "grad_norm": 0.7592660188674927, + "learning_rate": 9.476611753277364e-06, + "loss": 0.7513, + "step": 5425 + }, + { + "epoch": 0.2986405415818152, + "grad_norm": 0.776153028011322, + "learning_rate": 9.476418662283935e-06, + "loss": 0.7828, + "step": 5426 + }, + { + "epoch": 0.2986955803841708, + "grad_norm": 0.9317814707756042, + "learning_rate": 9.47622553764694e-06, + "loss": 0.865, + "step": 5427 + }, + { + "epoch": 0.2987506191865265, + "grad_norm": 0.7770501971244812, + "learning_rate": 9.476032379367832e-06, + "loss": 0.7281, + "step": 5428 + }, + { + "epoch": 0.29880565798888215, + "grad_norm": 0.7815201282501221, + "learning_rate": 9.475839187448064e-06, + "loss": 0.7565, + "step": 5429 + }, + { + "epoch": 0.29886069679123783, + "grad_norm": 0.7992607951164246, + "learning_rate": 9.475645961889086e-06, + "loss": 0.8109, + "step": 5430 + }, + { + "epoch": 0.29891573559359347, + "grad_norm": 0.7780614495277405, + "learning_rate": 9.475452702692351e-06, + "loss": 0.7814, + "step": 5431 + }, + { + "epoch": 0.29897077439594916, + "grad_norm": 0.7409062385559082, + "learning_rate": 9.475259409859313e-06, + "loss": 0.7712, + "step": 5432 + }, + { + "epoch": 0.2990258131983048, + "grad_norm": 0.7935584187507629, + "learning_rate": 9.47506608339142e-06, + "loss": 0.8301, + "step": 5433 + }, + { + "epoch": 0.2990808520006605, + "grad_norm": 0.6931030750274658, + "learning_rate": 9.474872723290132e-06, + "loss": 0.7471, + "step": 5434 + }, + { + "epoch": 0.2991358908030161, + "grad_norm": 0.7622918486595154, + "learning_rate": 9.474679329556894e-06, + "loss": 0.7727, + "step": 5435 + }, + { + "epoch": 0.2991909296053718, + "grad_norm": 0.7957701086997986, + "learning_rate": 9.474485902193169e-06, + "loss": 0.7663, + "step": 5436 + }, + { + "epoch": 0.29924596840772744, + "grad_norm": 1.0600612163543701, + "learning_rate": 9.474292441200404e-06, + "loss": 0.7861, + "step": 5437 + }, + { + "epoch": 0.2993010072100831, + "grad_norm": 0.7343600392341614, + "learning_rate": 9.474098946580053e-06, + "loss": 0.8609, + "step": 5438 + }, + { + "epoch": 0.29935604601243876, + "grad_norm": 0.7477726340293884, + "learning_rate": 9.473905418333573e-06, + "loss": 0.7683, + "step": 5439 + }, + { + "epoch": 0.29941108481479445, + "grad_norm": 0.7955546379089355, + "learning_rate": 9.473711856462417e-06, + "loss": 0.8406, + "step": 5440 + }, + { + "epoch": 0.2994661236171501, + "grad_norm": 0.8291183114051819, + "learning_rate": 9.47351826096804e-06, + "loss": 0.6919, + "step": 5441 + }, + { + "epoch": 0.29952116241950577, + "grad_norm": 0.8899849057197571, + "learning_rate": 9.473324631851898e-06, + "loss": 0.9403, + "step": 5442 + }, + { + "epoch": 0.2995762012218614, + "grad_norm": 0.837066650390625, + "learning_rate": 9.473130969115445e-06, + "loss": 0.8676, + "step": 5443 + }, + { + "epoch": 0.2996312400242171, + "grad_norm": 0.8385708928108215, + "learning_rate": 9.472937272760138e-06, + "loss": 0.7588, + "step": 5444 + }, + { + "epoch": 0.2996862788265727, + "grad_norm": 0.6990595459938049, + "learning_rate": 9.472743542787431e-06, + "loss": 0.6769, + "step": 5445 + }, + { + "epoch": 0.2997413176289284, + "grad_norm": 0.789165735244751, + "learning_rate": 9.472549779198781e-06, + "loss": 0.8084, + "step": 5446 + }, + { + "epoch": 0.29979635643128405, + "grad_norm": 0.8820298314094543, + "learning_rate": 9.472355981995643e-06, + "loss": 0.8262, + "step": 5447 + }, + { + "epoch": 0.29985139523363974, + "grad_norm": 0.8928382992744446, + "learning_rate": 9.472162151179475e-06, + "loss": 0.8123, + "step": 5448 + }, + { + "epoch": 0.2999064340359954, + "grad_norm": 0.7688086032867432, + "learning_rate": 9.471968286751735e-06, + "loss": 0.6846, + "step": 5449 + }, + { + "epoch": 0.29996147283835106, + "grad_norm": 0.6962918043136597, + "learning_rate": 9.471774388713877e-06, + "loss": 0.7872, + "step": 5450 + }, + { + "epoch": 0.3000165116407067, + "grad_norm": 0.7467569708824158, + "learning_rate": 9.47158045706736e-06, + "loss": 0.8201, + "step": 5451 + }, + { + "epoch": 0.3000715504430624, + "grad_norm": 0.7651814222335815, + "learning_rate": 9.471386491813642e-06, + "loss": 0.7734, + "step": 5452 + }, + { + "epoch": 0.300126589245418, + "grad_norm": 0.8001144528388977, + "learning_rate": 9.47119249295418e-06, + "loss": 0.8266, + "step": 5453 + }, + { + "epoch": 0.3001816280477737, + "grad_norm": 0.7937704920768738, + "learning_rate": 9.47099846049043e-06, + "loss": 0.8025, + "step": 5454 + }, + { + "epoch": 0.30023666685012934, + "grad_norm": 0.7353448867797852, + "learning_rate": 9.470804394423853e-06, + "loss": 0.7926, + "step": 5455 + }, + { + "epoch": 0.300291705652485, + "grad_norm": 0.9116304516792297, + "learning_rate": 9.470610294755908e-06, + "loss": 0.8295, + "step": 5456 + }, + { + "epoch": 0.30034674445484066, + "grad_norm": 0.7169163823127747, + "learning_rate": 9.470416161488053e-06, + "loss": 0.822, + "step": 5457 + }, + { + "epoch": 0.3004017832571963, + "grad_norm": 1.0421968698501587, + "learning_rate": 9.470221994621747e-06, + "loss": 0.9273, + "step": 5458 + }, + { + "epoch": 0.300456822059552, + "grad_norm": 0.9064405560493469, + "learning_rate": 9.470027794158447e-06, + "loss": 0.7087, + "step": 5459 + }, + { + "epoch": 0.3005118608619076, + "grad_norm": 0.6766010522842407, + "learning_rate": 9.469833560099617e-06, + "loss": 0.7063, + "step": 5460 + }, + { + "epoch": 0.3005668996642633, + "grad_norm": 0.7987816333770752, + "learning_rate": 9.469639292446712e-06, + "loss": 0.8216, + "step": 5461 + }, + { + "epoch": 0.30062193846661894, + "grad_norm": 0.776792049407959, + "learning_rate": 9.469444991201197e-06, + "loss": 0.8598, + "step": 5462 + }, + { + "epoch": 0.30067697726897463, + "grad_norm": 0.8048756718635559, + "learning_rate": 9.469250656364529e-06, + "loss": 0.8645, + "step": 5463 + }, + { + "epoch": 0.30073201607133027, + "grad_norm": 1.0650218725204468, + "learning_rate": 9.46905628793817e-06, + "loss": 0.8918, + "step": 5464 + }, + { + "epoch": 0.30078705487368596, + "grad_norm": 0.7378712296485901, + "learning_rate": 9.468861885923577e-06, + "loss": 0.6866, + "step": 5465 + }, + { + "epoch": 0.3008420936760416, + "grad_norm": 0.7382808327674866, + "learning_rate": 9.468667450322218e-06, + "loss": 0.8413, + "step": 5466 + }, + { + "epoch": 0.3008971324783973, + "grad_norm": 0.8390250205993652, + "learning_rate": 9.468472981135548e-06, + "loss": 0.8275, + "step": 5467 + }, + { + "epoch": 0.3009521712807529, + "grad_norm": 0.9169766902923584, + "learning_rate": 9.468278478365034e-06, + "loss": 0.8274, + "step": 5468 + }, + { + "epoch": 0.3010072100831086, + "grad_norm": 0.7487995028495789, + "learning_rate": 9.468083942012134e-06, + "loss": 0.7729, + "step": 5469 + }, + { + "epoch": 0.30106224888546423, + "grad_norm": 0.7457556128501892, + "learning_rate": 9.467889372078309e-06, + "loss": 0.7435, + "step": 5470 + }, + { + "epoch": 0.3011172876878199, + "grad_norm": 0.7085639834403992, + "learning_rate": 9.467694768565026e-06, + "loss": 0.7686, + "step": 5471 + }, + { + "epoch": 0.30117232649017556, + "grad_norm": 0.7396196722984314, + "learning_rate": 9.467500131473744e-06, + "loss": 0.7496, + "step": 5472 + }, + { + "epoch": 0.30122736529253125, + "grad_norm": 0.7906790971755981, + "learning_rate": 9.467305460805927e-06, + "loss": 0.8341, + "step": 5473 + }, + { + "epoch": 0.3012824040948869, + "grad_norm": 0.673541247844696, + "learning_rate": 9.467110756563039e-06, + "loss": 0.8041, + "step": 5474 + }, + { + "epoch": 0.30133744289724257, + "grad_norm": 0.8247049450874329, + "learning_rate": 9.46691601874654e-06, + "loss": 0.8227, + "step": 5475 + }, + { + "epoch": 0.3013924816995982, + "grad_norm": 0.7564057111740112, + "learning_rate": 9.466721247357898e-06, + "loss": 0.8181, + "step": 5476 + }, + { + "epoch": 0.3014475205019539, + "grad_norm": 0.7533192038536072, + "learning_rate": 9.466526442398574e-06, + "loss": 0.782, + "step": 5477 + }, + { + "epoch": 0.3015025593043095, + "grad_norm": 0.6934120059013367, + "learning_rate": 9.466331603870033e-06, + "loss": 0.7153, + "step": 5478 + }, + { + "epoch": 0.3015575981066652, + "grad_norm": 0.7417232990264893, + "learning_rate": 9.466136731773738e-06, + "loss": 0.753, + "step": 5479 + }, + { + "epoch": 0.30161263690902085, + "grad_norm": 0.7421486973762512, + "learning_rate": 9.465941826111156e-06, + "loss": 0.7668, + "step": 5480 + }, + { + "epoch": 0.30166767571137654, + "grad_norm": 1.0851647853851318, + "learning_rate": 9.465746886883751e-06, + "loss": 0.8019, + "step": 5481 + }, + { + "epoch": 0.30172271451373217, + "grad_norm": 0.9209244847297668, + "learning_rate": 9.465551914092987e-06, + "loss": 0.7912, + "step": 5482 + }, + { + "epoch": 0.30177775331608786, + "grad_norm": 0.6915135383605957, + "learning_rate": 9.465356907740331e-06, + "loss": 0.8112, + "step": 5483 + }, + { + "epoch": 0.3018327921184435, + "grad_norm": 0.824593722820282, + "learning_rate": 9.465161867827247e-06, + "loss": 0.7969, + "step": 5484 + }, + { + "epoch": 0.3018878309207992, + "grad_norm": 0.7985100746154785, + "learning_rate": 9.464966794355201e-06, + "loss": 0.8258, + "step": 5485 + }, + { + "epoch": 0.3019428697231548, + "grad_norm": 0.8471764326095581, + "learning_rate": 9.464771687325663e-06, + "loss": 0.8241, + "step": 5486 + }, + { + "epoch": 0.3019979085255105, + "grad_norm": 0.8133455514907837, + "learning_rate": 9.464576546740093e-06, + "loss": 0.7809, + "step": 5487 + }, + { + "epoch": 0.30205294732786614, + "grad_norm": 0.7684013843536377, + "learning_rate": 9.464381372599961e-06, + "loss": 0.9023, + "step": 5488 + }, + { + "epoch": 0.30210798613022183, + "grad_norm": 0.7818747758865356, + "learning_rate": 9.464186164906735e-06, + "loss": 0.7152, + "step": 5489 + }, + { + "epoch": 0.30216302493257746, + "grad_norm": 0.7524297833442688, + "learning_rate": 9.46399092366188e-06, + "loss": 0.782, + "step": 5490 + }, + { + "epoch": 0.30221806373493315, + "grad_norm": 0.6550590991973877, + "learning_rate": 9.463795648866864e-06, + "loss": 0.7696, + "step": 5491 + }, + { + "epoch": 0.3022731025372888, + "grad_norm": 0.8679335117340088, + "learning_rate": 9.463600340523154e-06, + "loss": 0.8115, + "step": 5492 + }, + { + "epoch": 0.3023281413396445, + "grad_norm": 0.692500114440918, + "learning_rate": 9.46340499863222e-06, + "loss": 0.7692, + "step": 5493 + }, + { + "epoch": 0.3023831801420001, + "grad_norm": 0.8604017496109009, + "learning_rate": 9.463209623195528e-06, + "loss": 0.8547, + "step": 5494 + }, + { + "epoch": 0.3024382189443558, + "grad_norm": 0.6715821623802185, + "learning_rate": 9.463014214214548e-06, + "loss": 0.7638, + "step": 5495 + }, + { + "epoch": 0.30249325774671143, + "grad_norm": 0.7803179025650024, + "learning_rate": 9.462818771690747e-06, + "loss": 0.7795, + "step": 5496 + }, + { + "epoch": 0.3025482965490671, + "grad_norm": 0.787323534488678, + "learning_rate": 9.462623295625596e-06, + "loss": 0.735, + "step": 5497 + }, + { + "epoch": 0.30260333535142275, + "grad_norm": 0.9943159222602844, + "learning_rate": 9.462427786020563e-06, + "loss": 0.7451, + "step": 5498 + }, + { + "epoch": 0.3026583741537784, + "grad_norm": 0.772524893283844, + "learning_rate": 9.462232242877116e-06, + "loss": 0.9167, + "step": 5499 + }, + { + "epoch": 0.3027134129561341, + "grad_norm": 0.7204643487930298, + "learning_rate": 9.462036666196726e-06, + "loss": 0.7442, + "step": 5500 + }, + { + "epoch": 0.3027684517584897, + "grad_norm": 0.7450547218322754, + "learning_rate": 9.461841055980863e-06, + "loss": 0.8002, + "step": 5501 + }, + { + "epoch": 0.3028234905608454, + "grad_norm": 0.8096264004707336, + "learning_rate": 9.461645412230997e-06, + "loss": 0.8601, + "step": 5502 + }, + { + "epoch": 0.30287852936320103, + "grad_norm": 0.684968888759613, + "learning_rate": 9.461449734948597e-06, + "loss": 0.7251, + "step": 5503 + }, + { + "epoch": 0.3029335681655567, + "grad_norm": 0.7727203369140625, + "learning_rate": 9.461254024135138e-06, + "loss": 0.7797, + "step": 5504 + }, + { + "epoch": 0.30298860696791236, + "grad_norm": 0.9292891025543213, + "learning_rate": 9.461058279792086e-06, + "loss": 0.7519, + "step": 5505 + }, + { + "epoch": 0.30304364577026804, + "grad_norm": 0.7836466431617737, + "learning_rate": 9.460862501920915e-06, + "loss": 0.8201, + "step": 5506 + }, + { + "epoch": 0.3030986845726237, + "grad_norm": 0.9043576121330261, + "learning_rate": 9.460666690523094e-06, + "loss": 0.79, + "step": 5507 + }, + { + "epoch": 0.30315372337497937, + "grad_norm": 0.8339952230453491, + "learning_rate": 9.460470845600098e-06, + "loss": 0.8392, + "step": 5508 + }, + { + "epoch": 0.303208762177335, + "grad_norm": 0.7603133320808411, + "learning_rate": 9.460274967153395e-06, + "loss": 0.7168, + "step": 5509 + }, + { + "epoch": 0.3032638009796907, + "grad_norm": 0.7287996411323547, + "learning_rate": 9.460079055184461e-06, + "loss": 0.7452, + "step": 5510 + }, + { + "epoch": 0.3033188397820463, + "grad_norm": 0.707953691482544, + "learning_rate": 9.459883109694767e-06, + "loss": 0.8081, + "step": 5511 + }, + { + "epoch": 0.303373878584402, + "grad_norm": 0.7556451559066772, + "learning_rate": 9.459687130685784e-06, + "loss": 0.8145, + "step": 5512 + }, + { + "epoch": 0.30342891738675765, + "grad_norm": 0.8076426386833191, + "learning_rate": 9.459491118158987e-06, + "loss": 0.8006, + "step": 5513 + }, + { + "epoch": 0.30348395618911334, + "grad_norm": 0.7343682646751404, + "learning_rate": 9.459295072115849e-06, + "loss": 0.7574, + "step": 5514 + }, + { + "epoch": 0.30353899499146897, + "grad_norm": 0.68440181016922, + "learning_rate": 9.459098992557843e-06, + "loss": 0.7432, + "step": 5515 + }, + { + "epoch": 0.30359403379382466, + "grad_norm": 0.8278071880340576, + "learning_rate": 9.458902879486441e-06, + "loss": 0.8357, + "step": 5516 + }, + { + "epoch": 0.3036490725961803, + "grad_norm": 0.8377245664596558, + "learning_rate": 9.458706732903121e-06, + "loss": 0.7552, + "step": 5517 + }, + { + "epoch": 0.303704111398536, + "grad_norm": 0.7354543805122375, + "learning_rate": 9.458510552809353e-06, + "loss": 0.7862, + "step": 5518 + }, + { + "epoch": 0.3037591502008916, + "grad_norm": 0.8071799874305725, + "learning_rate": 9.458314339206611e-06, + "loss": 0.8428, + "step": 5519 + }, + { + "epoch": 0.3038141890032473, + "grad_norm": 0.7452389597892761, + "learning_rate": 9.458118092096376e-06, + "loss": 0.8252, + "step": 5520 + }, + { + "epoch": 0.30386922780560294, + "grad_norm": 0.7370620965957642, + "learning_rate": 9.457921811480115e-06, + "loss": 0.8143, + "step": 5521 + }, + { + "epoch": 0.3039242666079586, + "grad_norm": 0.8816156387329102, + "learning_rate": 9.45772549735931e-06, + "loss": 0.7163, + "step": 5522 + }, + { + "epoch": 0.30397930541031426, + "grad_norm": 0.7208901643753052, + "learning_rate": 9.457529149735432e-06, + "loss": 0.7877, + "step": 5523 + }, + { + "epoch": 0.30403434421266995, + "grad_norm": 0.820792019367218, + "learning_rate": 9.457332768609959e-06, + "loss": 0.8275, + "step": 5524 + }, + { + "epoch": 0.3040893830150256, + "grad_norm": 0.8471686244010925, + "learning_rate": 9.457136353984365e-06, + "loss": 0.8127, + "step": 5525 + }, + { + "epoch": 0.30414442181738127, + "grad_norm": 0.9448342323303223, + "learning_rate": 9.456939905860127e-06, + "loss": 0.8157, + "step": 5526 + }, + { + "epoch": 0.3041994606197369, + "grad_norm": 0.7835188508033752, + "learning_rate": 9.456743424238723e-06, + "loss": 0.7116, + "step": 5527 + }, + { + "epoch": 0.3042544994220926, + "grad_norm": 0.8884950876235962, + "learning_rate": 9.456546909121629e-06, + "loss": 0.8514, + "step": 5528 + }, + { + "epoch": 0.30430953822444823, + "grad_norm": 0.7400928735733032, + "learning_rate": 9.45635036051032e-06, + "loss": 0.8207, + "step": 5529 + }, + { + "epoch": 0.3043645770268039, + "grad_norm": 0.8278732299804688, + "learning_rate": 9.456153778406274e-06, + "loss": 0.8269, + "step": 5530 + }, + { + "epoch": 0.30441961582915955, + "grad_norm": 0.7423332929611206, + "learning_rate": 9.45595716281097e-06, + "loss": 0.7937, + "step": 5531 + }, + { + "epoch": 0.30447465463151524, + "grad_norm": 1.5018088817596436, + "learning_rate": 9.455760513725885e-06, + "loss": 0.7935, + "step": 5532 + }, + { + "epoch": 0.3045296934338709, + "grad_norm": 0.8105388283729553, + "learning_rate": 9.455563831152496e-06, + "loss": 0.8225, + "step": 5533 + }, + { + "epoch": 0.30458473223622656, + "grad_norm": 0.6874535083770752, + "learning_rate": 9.455367115092283e-06, + "loss": 0.7301, + "step": 5534 + }, + { + "epoch": 0.3046397710385822, + "grad_norm": 0.8085837960243225, + "learning_rate": 9.455170365546721e-06, + "loss": 0.83, + "step": 5535 + }, + { + "epoch": 0.3046948098409379, + "grad_norm": 0.810773491859436, + "learning_rate": 9.454973582517293e-06, + "loss": 0.7186, + "step": 5536 + }, + { + "epoch": 0.3047498486432935, + "grad_norm": 0.7290367484092712, + "learning_rate": 9.454776766005476e-06, + "loss": 0.8181, + "step": 5537 + }, + { + "epoch": 0.3048048874456492, + "grad_norm": 0.773728609085083, + "learning_rate": 9.45457991601275e-06, + "loss": 0.8454, + "step": 5538 + }, + { + "epoch": 0.30485992624800484, + "grad_norm": 0.792169451713562, + "learning_rate": 9.454383032540592e-06, + "loss": 0.8797, + "step": 5539 + }, + { + "epoch": 0.30491496505036053, + "grad_norm": 0.7478733658790588, + "learning_rate": 9.454186115590485e-06, + "loss": 0.7544, + "step": 5540 + }, + { + "epoch": 0.30497000385271616, + "grad_norm": 0.8527306318283081, + "learning_rate": 9.453989165163906e-06, + "loss": 0.8379, + "step": 5541 + }, + { + "epoch": 0.3050250426550718, + "grad_norm": 0.8829329013824463, + "learning_rate": 9.453792181262337e-06, + "loss": 0.7643, + "step": 5542 + }, + { + "epoch": 0.3050800814574275, + "grad_norm": 0.9477338790893555, + "learning_rate": 9.453595163887258e-06, + "loss": 0.7414, + "step": 5543 + }, + { + "epoch": 0.3051351202597831, + "grad_norm": 0.8311536312103271, + "learning_rate": 9.453398113040151e-06, + "loss": 0.8133, + "step": 5544 + }, + { + "epoch": 0.3051901590621388, + "grad_norm": 0.8035525679588318, + "learning_rate": 9.453201028722497e-06, + "loss": 0.7841, + "step": 5545 + }, + { + "epoch": 0.30524519786449444, + "grad_norm": 0.7779183983802795, + "learning_rate": 9.453003910935775e-06, + "loss": 0.7696, + "step": 5546 + }, + { + "epoch": 0.30530023666685013, + "grad_norm": 0.7843946218490601, + "learning_rate": 9.452806759681465e-06, + "loss": 0.6018, + "step": 5547 + }, + { + "epoch": 0.30535527546920577, + "grad_norm": 0.7215032577514648, + "learning_rate": 9.452609574961053e-06, + "loss": 0.7457, + "step": 5548 + }, + { + "epoch": 0.30541031427156146, + "grad_norm": 0.9628198742866516, + "learning_rate": 9.452412356776021e-06, + "loss": 0.8061, + "step": 5549 + }, + { + "epoch": 0.3054653530739171, + "grad_norm": 0.9468308687210083, + "learning_rate": 9.452215105127848e-06, + "loss": 0.7909, + "step": 5550 + }, + { + "epoch": 0.3055203918762728, + "grad_norm": 0.876402735710144, + "learning_rate": 9.452017820018017e-06, + "loss": 0.69, + "step": 5551 + }, + { + "epoch": 0.3055754306786284, + "grad_norm": 1.03409743309021, + "learning_rate": 9.451820501448014e-06, + "loss": 0.8375, + "step": 5552 + }, + { + "epoch": 0.3056304694809841, + "grad_norm": 0.8057541847229004, + "learning_rate": 9.45162314941932e-06, + "loss": 0.7704, + "step": 5553 + }, + { + "epoch": 0.30568550828333974, + "grad_norm": 0.7256304025650024, + "learning_rate": 9.451425763933417e-06, + "loss": 0.7819, + "step": 5554 + }, + { + "epoch": 0.3057405470856954, + "grad_norm": 0.7982180118560791, + "learning_rate": 9.451228344991788e-06, + "loss": 0.8094, + "step": 5555 + }, + { + "epoch": 0.30579558588805106, + "grad_norm": 1.0314620733261108, + "learning_rate": 9.45103089259592e-06, + "loss": 0.7777, + "step": 5556 + }, + { + "epoch": 0.30585062469040675, + "grad_norm": 0.6948755383491516, + "learning_rate": 9.450833406747294e-06, + "loss": 0.7189, + "step": 5557 + }, + { + "epoch": 0.3059056634927624, + "grad_norm": 0.7412117719650269, + "learning_rate": 9.450635887447396e-06, + "loss": 0.783, + "step": 5558 + }, + { + "epoch": 0.30596070229511807, + "grad_norm": 0.7394647002220154, + "learning_rate": 9.450438334697711e-06, + "loss": 0.7888, + "step": 5559 + }, + { + "epoch": 0.3060157410974737, + "grad_norm": 0.692701518535614, + "learning_rate": 9.450240748499725e-06, + "loss": 0.7427, + "step": 5560 + }, + { + "epoch": 0.3060707798998294, + "grad_norm": 0.6854925751686096, + "learning_rate": 9.450043128854916e-06, + "loss": 0.7877, + "step": 5561 + }, + { + "epoch": 0.306125818702185, + "grad_norm": 0.8073517680168152, + "learning_rate": 9.449845475764776e-06, + "loss": 0.8715, + "step": 5562 + }, + { + "epoch": 0.3061808575045407, + "grad_norm": 0.9672908186912537, + "learning_rate": 9.449647789230789e-06, + "loss": 0.782, + "step": 5563 + }, + { + "epoch": 0.30623589630689635, + "grad_norm": 0.7409735918045044, + "learning_rate": 9.44945006925444e-06, + "loss": 0.7956, + "step": 5564 + }, + { + "epoch": 0.30629093510925204, + "grad_norm": 0.7839213609695435, + "learning_rate": 9.449252315837215e-06, + "loss": 0.7559, + "step": 5565 + }, + { + "epoch": 0.30634597391160767, + "grad_norm": 0.668393075466156, + "learning_rate": 9.449054528980602e-06, + "loss": 0.717, + "step": 5566 + }, + { + "epoch": 0.30640101271396336, + "grad_norm": 0.8818438053131104, + "learning_rate": 9.448856708686084e-06, + "loss": 0.7801, + "step": 5567 + }, + { + "epoch": 0.306456051516319, + "grad_norm": 0.7331361770629883, + "learning_rate": 9.44865885495515e-06, + "loss": 0.6999, + "step": 5568 + }, + { + "epoch": 0.3065110903186747, + "grad_norm": 0.7818138599395752, + "learning_rate": 9.448460967789288e-06, + "loss": 0.7437, + "step": 5569 + }, + { + "epoch": 0.3065661291210303, + "grad_norm": 0.7713417410850525, + "learning_rate": 9.448263047189985e-06, + "loss": 0.8523, + "step": 5570 + }, + { + "epoch": 0.306621167923386, + "grad_norm": 0.7152866125106812, + "learning_rate": 9.448065093158726e-06, + "loss": 0.7706, + "step": 5571 + }, + { + "epoch": 0.30667620672574164, + "grad_norm": 0.7486638426780701, + "learning_rate": 9.447867105697e-06, + "loss": 0.7738, + "step": 5572 + }, + { + "epoch": 0.30673124552809733, + "grad_norm": 0.7014918923377991, + "learning_rate": 9.447669084806297e-06, + "loss": 0.7013, + "step": 5573 + }, + { + "epoch": 0.30678628433045296, + "grad_norm": 0.8328303694725037, + "learning_rate": 9.447471030488102e-06, + "loss": 0.8113, + "step": 5574 + }, + { + "epoch": 0.30684132313280865, + "grad_norm": 0.6800024509429932, + "learning_rate": 9.447272942743906e-06, + "loss": 0.6786, + "step": 5575 + }, + { + "epoch": 0.3068963619351643, + "grad_norm": 0.6827595829963684, + "learning_rate": 9.447074821575198e-06, + "loss": 0.812, + "step": 5576 + }, + { + "epoch": 0.30695140073752, + "grad_norm": 0.8775614500045776, + "learning_rate": 9.446876666983465e-06, + "loss": 0.7683, + "step": 5577 + }, + { + "epoch": 0.3070064395398756, + "grad_norm": 0.7440332174301147, + "learning_rate": 9.446678478970198e-06, + "loss": 0.7152, + "step": 5578 + }, + { + "epoch": 0.3070614783422313, + "grad_norm": 0.7031408548355103, + "learning_rate": 9.446480257536885e-06, + "loss": 0.7603, + "step": 5579 + }, + { + "epoch": 0.30711651714458693, + "grad_norm": 0.8419817090034485, + "learning_rate": 9.446282002685019e-06, + "loss": 0.9939, + "step": 5580 + }, + { + "epoch": 0.3071715559469426, + "grad_norm": 0.7622908353805542, + "learning_rate": 9.446083714416085e-06, + "loss": 0.8682, + "step": 5581 + }, + { + "epoch": 0.30722659474929825, + "grad_norm": 0.7341362833976746, + "learning_rate": 9.445885392731576e-06, + "loss": 0.848, + "step": 5582 + }, + { + "epoch": 0.30728163355165394, + "grad_norm": 0.7248286604881287, + "learning_rate": 9.445687037632984e-06, + "loss": 0.7699, + "step": 5583 + }, + { + "epoch": 0.3073366723540096, + "grad_norm": 0.9409947991371155, + "learning_rate": 9.445488649121797e-06, + "loss": 1.0051, + "step": 5584 + }, + { + "epoch": 0.3073917111563652, + "grad_norm": 0.7279968857765198, + "learning_rate": 9.445290227199509e-06, + "loss": 0.8001, + "step": 5585 + }, + { + "epoch": 0.3074467499587209, + "grad_norm": 0.7904797196388245, + "learning_rate": 9.445091771867607e-06, + "loss": 0.8892, + "step": 5586 + }, + { + "epoch": 0.30750178876107653, + "grad_norm": 0.7090430855751038, + "learning_rate": 9.444893283127587e-06, + "loss": 0.5983, + "step": 5587 + }, + { + "epoch": 0.3075568275634322, + "grad_norm": 0.8363901376724243, + "learning_rate": 9.444694760980939e-06, + "loss": 0.7688, + "step": 5588 + }, + { + "epoch": 0.30761186636578786, + "grad_norm": 0.7487169504165649, + "learning_rate": 9.444496205429152e-06, + "loss": 0.7585, + "step": 5589 + }, + { + "epoch": 0.30766690516814355, + "grad_norm": 0.750801146030426, + "learning_rate": 9.444297616473724e-06, + "loss": 0.6493, + "step": 5590 + }, + { + "epoch": 0.3077219439704992, + "grad_norm": 0.754846453666687, + "learning_rate": 9.444098994116144e-06, + "loss": 0.8528, + "step": 5591 + }, + { + "epoch": 0.30777698277285487, + "grad_norm": 0.7088152766227722, + "learning_rate": 9.443900338357907e-06, + "loss": 0.7927, + "step": 5592 + }, + { + "epoch": 0.3078320215752105, + "grad_norm": 0.7077113389968872, + "learning_rate": 9.443701649200503e-06, + "loss": 0.7996, + "step": 5593 + }, + { + "epoch": 0.3078870603775662, + "grad_norm": 0.732982873916626, + "learning_rate": 9.443502926645427e-06, + "loss": 0.7473, + "step": 5594 + }, + { + "epoch": 0.3079420991799218, + "grad_norm": 0.7068434357643127, + "learning_rate": 9.443304170694174e-06, + "loss": 0.7575, + "step": 5595 + }, + { + "epoch": 0.3079971379822775, + "grad_norm": 0.7703887224197388, + "learning_rate": 9.443105381348234e-06, + "loss": 0.8157, + "step": 5596 + }, + { + "epoch": 0.30805217678463315, + "grad_norm": 0.806924045085907, + "learning_rate": 9.442906558609103e-06, + "loss": 0.7572, + "step": 5597 + }, + { + "epoch": 0.30810721558698884, + "grad_norm": 0.8364617824554443, + "learning_rate": 9.442707702478278e-06, + "loss": 0.7491, + "step": 5598 + }, + { + "epoch": 0.30816225438934447, + "grad_norm": 0.9269624352455139, + "learning_rate": 9.442508812957249e-06, + "loss": 0.8746, + "step": 5599 + }, + { + "epoch": 0.30821729319170016, + "grad_norm": 0.7308455109596252, + "learning_rate": 9.442309890047515e-06, + "loss": 0.8068, + "step": 5600 + }, + { + "epoch": 0.3082723319940558, + "grad_norm": 0.812622606754303, + "learning_rate": 9.442110933750567e-06, + "loss": 0.9137, + "step": 5601 + }, + { + "epoch": 0.3083273707964115, + "grad_norm": 0.7100754976272583, + "learning_rate": 9.441911944067905e-06, + "loss": 0.7471, + "step": 5602 + }, + { + "epoch": 0.3083824095987671, + "grad_norm": 0.760208010673523, + "learning_rate": 9.44171292100102e-06, + "loss": 0.8243, + "step": 5603 + }, + { + "epoch": 0.3084374484011228, + "grad_norm": 0.6931812763214111, + "learning_rate": 9.44151386455141e-06, + "loss": 0.7523, + "step": 5604 + }, + { + "epoch": 0.30849248720347844, + "grad_norm": 0.6584734916687012, + "learning_rate": 9.44131477472057e-06, + "loss": 0.6929, + "step": 5605 + }, + { + "epoch": 0.3085475260058341, + "grad_norm": 0.977661669254303, + "learning_rate": 9.441115651509997e-06, + "loss": 0.8003, + "step": 5606 + }, + { + "epoch": 0.30860256480818976, + "grad_norm": 0.650434672832489, + "learning_rate": 9.440916494921189e-06, + "loss": 0.6629, + "step": 5607 + }, + { + "epoch": 0.30865760361054545, + "grad_norm": 0.6804447770118713, + "learning_rate": 9.44071730495564e-06, + "loss": 0.7216, + "step": 5608 + }, + { + "epoch": 0.3087126424129011, + "grad_norm": 0.7942929267883301, + "learning_rate": 9.44051808161485e-06, + "loss": 0.7593, + "step": 5609 + }, + { + "epoch": 0.3087676812152568, + "grad_norm": 0.7069621086120605, + "learning_rate": 9.440318824900313e-06, + "loss": 0.7453, + "step": 5610 + }, + { + "epoch": 0.3088227200176124, + "grad_norm": 0.7903168797492981, + "learning_rate": 9.440119534813528e-06, + "loss": 0.8084, + "step": 5611 + }, + { + "epoch": 0.3088777588199681, + "grad_norm": 0.7828298807144165, + "learning_rate": 9.439920211355993e-06, + "loss": 0.7556, + "step": 5612 + }, + { + "epoch": 0.30893279762232373, + "grad_norm": 0.8118648529052734, + "learning_rate": 9.43972085452921e-06, + "loss": 0.8548, + "step": 5613 + }, + { + "epoch": 0.3089878364246794, + "grad_norm": 0.9169642329216003, + "learning_rate": 9.439521464334669e-06, + "loss": 0.833, + "step": 5614 + }, + { + "epoch": 0.30904287522703505, + "grad_norm": 0.7844422459602356, + "learning_rate": 9.439322040773875e-06, + "loss": 0.8363, + "step": 5615 + }, + { + "epoch": 0.30909791402939074, + "grad_norm": 1.4801305532455444, + "learning_rate": 9.439122583848324e-06, + "loss": 0.7617, + "step": 5616 + }, + { + "epoch": 0.3091529528317464, + "grad_norm": 0.7737647891044617, + "learning_rate": 9.438923093559517e-06, + "loss": 0.7224, + "step": 5617 + }, + { + "epoch": 0.30920799163410206, + "grad_norm": 0.7279127836227417, + "learning_rate": 9.438723569908952e-06, + "loss": 0.7783, + "step": 5618 + }, + { + "epoch": 0.3092630304364577, + "grad_norm": 0.7635996341705322, + "learning_rate": 9.438524012898127e-06, + "loss": 0.8408, + "step": 5619 + }, + { + "epoch": 0.3093180692388134, + "grad_norm": 0.818445086479187, + "learning_rate": 9.438324422528547e-06, + "loss": 0.8836, + "step": 5620 + }, + { + "epoch": 0.309373108041169, + "grad_norm": 0.8620640635490417, + "learning_rate": 9.438124798801706e-06, + "loss": 0.925, + "step": 5621 + }, + { + "epoch": 0.3094281468435247, + "grad_norm": 0.7294883728027344, + "learning_rate": 9.437925141719108e-06, + "loss": 0.8387, + "step": 5622 + }, + { + "epoch": 0.30948318564588034, + "grad_norm": 0.6696046590805054, + "learning_rate": 9.437725451282252e-06, + "loss": 0.6712, + "step": 5623 + }, + { + "epoch": 0.30953822444823603, + "grad_norm": 0.8200504779815674, + "learning_rate": 9.43752572749264e-06, + "loss": 0.8191, + "step": 5624 + }, + { + "epoch": 0.30959326325059167, + "grad_norm": 0.8440756797790527, + "learning_rate": 9.437325970351773e-06, + "loss": 0.7412, + "step": 5625 + }, + { + "epoch": 0.30964830205294736, + "grad_norm": 0.8550771474838257, + "learning_rate": 9.43712617986115e-06, + "loss": 0.7842, + "step": 5626 + }, + { + "epoch": 0.309703340855303, + "grad_norm": 0.8203451037406921, + "learning_rate": 9.436926356022275e-06, + "loss": 0.8298, + "step": 5627 + }, + { + "epoch": 0.3097583796576586, + "grad_norm": 1.0105336904525757, + "learning_rate": 9.436726498836651e-06, + "loss": 0.8416, + "step": 5628 + }, + { + "epoch": 0.3098134184600143, + "grad_norm": 0.7684324383735657, + "learning_rate": 9.436526608305777e-06, + "loss": 0.7051, + "step": 5629 + }, + { + "epoch": 0.30986845726236995, + "grad_norm": 0.7284610867500305, + "learning_rate": 9.436326684431157e-06, + "loss": 0.755, + "step": 5630 + }, + { + "epoch": 0.30992349606472563, + "grad_norm": 0.7125874161720276, + "learning_rate": 9.436126727214293e-06, + "loss": 0.7336, + "step": 5631 + }, + { + "epoch": 0.30997853486708127, + "grad_norm": 0.7008525729179382, + "learning_rate": 9.435926736656687e-06, + "loss": 0.7185, + "step": 5632 + }, + { + "epoch": 0.31003357366943696, + "grad_norm": 0.7087175250053406, + "learning_rate": 9.435726712759844e-06, + "loss": 0.717, + "step": 5633 + }, + { + "epoch": 0.3100886124717926, + "grad_norm": 0.7892497777938843, + "learning_rate": 9.435526655525267e-06, + "loss": 0.8308, + "step": 5634 + }, + { + "epoch": 0.3101436512741483, + "grad_norm": 0.733906626701355, + "learning_rate": 9.435326564954457e-06, + "loss": 0.7421, + "step": 5635 + }, + { + "epoch": 0.3101986900765039, + "grad_norm": 0.7874915599822998, + "learning_rate": 9.43512644104892e-06, + "loss": 0.8808, + "step": 5636 + }, + { + "epoch": 0.3102537288788596, + "grad_norm": 0.6849297881126404, + "learning_rate": 9.434926283810162e-06, + "loss": 0.7297, + "step": 5637 + }, + { + "epoch": 0.31030876768121524, + "grad_norm": 0.7847834825515747, + "learning_rate": 9.434726093239685e-06, + "loss": 0.7873, + "step": 5638 + }, + { + "epoch": 0.3103638064835709, + "grad_norm": 0.6999106407165527, + "learning_rate": 9.434525869338992e-06, + "loss": 0.7699, + "step": 5639 + }, + { + "epoch": 0.31041884528592656, + "grad_norm": 0.7662788033485413, + "learning_rate": 9.43432561210959e-06, + "loss": 0.7583, + "step": 5640 + }, + { + "epoch": 0.31047388408828225, + "grad_norm": 0.8336607217788696, + "learning_rate": 9.434125321552985e-06, + "loss": 0.7297, + "step": 5641 + }, + { + "epoch": 0.3105289228906379, + "grad_norm": 0.8038349151611328, + "learning_rate": 9.433924997670681e-06, + "loss": 0.798, + "step": 5642 + }, + { + "epoch": 0.31058396169299357, + "grad_norm": 0.6819794178009033, + "learning_rate": 9.433724640464181e-06, + "loss": 0.7951, + "step": 5643 + }, + { + "epoch": 0.3106390004953492, + "grad_norm": 0.916238009929657, + "learning_rate": 9.433524249934995e-06, + "loss": 0.7371, + "step": 5644 + }, + { + "epoch": 0.3106940392977049, + "grad_norm": 0.8390263915061951, + "learning_rate": 9.433323826084628e-06, + "loss": 0.8211, + "step": 5645 + }, + { + "epoch": 0.3107490781000605, + "grad_norm": 0.7957239747047424, + "learning_rate": 9.433123368914586e-06, + "loss": 0.8406, + "step": 5646 + }, + { + "epoch": 0.3108041169024162, + "grad_norm": 0.6771933436393738, + "learning_rate": 9.432922878426374e-06, + "loss": 0.7664, + "step": 5647 + }, + { + "epoch": 0.31085915570477185, + "grad_norm": 0.7874065041542053, + "learning_rate": 9.432722354621503e-06, + "loss": 0.7445, + "step": 5648 + }, + { + "epoch": 0.31091419450712754, + "grad_norm": 0.674749493598938, + "learning_rate": 9.432521797501475e-06, + "loss": 0.745, + "step": 5649 + }, + { + "epoch": 0.3109692333094832, + "grad_norm": 0.7695828676223755, + "learning_rate": 9.432321207067799e-06, + "loss": 0.7555, + "step": 5650 + }, + { + "epoch": 0.31102427211183886, + "grad_norm": 0.8050221800804138, + "learning_rate": 9.432120583321984e-06, + "loss": 0.8464, + "step": 5651 + }, + { + "epoch": 0.3110793109141945, + "grad_norm": 0.7242713570594788, + "learning_rate": 9.431919926265538e-06, + "loss": 0.7439, + "step": 5652 + }, + { + "epoch": 0.3111343497165502, + "grad_norm": 0.7372434735298157, + "learning_rate": 9.431719235899967e-06, + "loss": 0.7973, + "step": 5653 + }, + { + "epoch": 0.3111893885189058, + "grad_norm": 0.7573439478874207, + "learning_rate": 9.431518512226783e-06, + "loss": 0.8259, + "step": 5654 + }, + { + "epoch": 0.3112444273212615, + "grad_norm": 0.7098552584648132, + "learning_rate": 9.43131775524749e-06, + "loss": 0.8159, + "step": 5655 + }, + { + "epoch": 0.31129946612361714, + "grad_norm": 0.7804632186889648, + "learning_rate": 9.431116964963599e-06, + "loss": 0.7795, + "step": 5656 + }, + { + "epoch": 0.31135450492597283, + "grad_norm": 1.0158027410507202, + "learning_rate": 9.43091614137662e-06, + "loss": 0.7935, + "step": 5657 + }, + { + "epoch": 0.31140954372832846, + "grad_norm": 0.708238422870636, + "learning_rate": 9.430715284488059e-06, + "loss": 0.7592, + "step": 5658 + }, + { + "epoch": 0.31146458253068415, + "grad_norm": 0.7086984515190125, + "learning_rate": 9.43051439429943e-06, + "loss": 0.7303, + "step": 5659 + }, + { + "epoch": 0.3115196213330398, + "grad_norm": 0.7620081305503845, + "learning_rate": 9.43031347081224e-06, + "loss": 0.7429, + "step": 5660 + }, + { + "epoch": 0.3115746601353955, + "grad_norm": 0.746126115322113, + "learning_rate": 9.430112514028e-06, + "loss": 0.8836, + "step": 5661 + }, + { + "epoch": 0.3116296989377511, + "grad_norm": 0.9113686680793762, + "learning_rate": 9.429911523948221e-06, + "loss": 0.6343, + "step": 5662 + }, + { + "epoch": 0.3116847377401068, + "grad_norm": 0.700890839099884, + "learning_rate": 9.429710500574413e-06, + "loss": 0.8201, + "step": 5663 + }, + { + "epoch": 0.31173977654246243, + "grad_norm": 0.7428706288337708, + "learning_rate": 9.429509443908085e-06, + "loss": 0.6838, + "step": 5664 + }, + { + "epoch": 0.3117948153448181, + "grad_norm": 0.851725697517395, + "learning_rate": 9.429308353950752e-06, + "loss": 0.7151, + "step": 5665 + }, + { + "epoch": 0.31184985414717376, + "grad_norm": 0.8555309176445007, + "learning_rate": 9.42910723070392e-06, + "loss": 0.7384, + "step": 5666 + }, + { + "epoch": 0.31190489294952944, + "grad_norm": 0.735927939414978, + "learning_rate": 9.428906074169107e-06, + "loss": 0.6911, + "step": 5667 + }, + { + "epoch": 0.3119599317518851, + "grad_norm": 0.8007609844207764, + "learning_rate": 9.42870488434782e-06, + "loss": 0.869, + "step": 5668 + }, + { + "epoch": 0.31201497055424077, + "grad_norm": 0.7604133486747742, + "learning_rate": 9.42850366124157e-06, + "loss": 0.7633, + "step": 5669 + }, + { + "epoch": 0.3120700093565964, + "grad_norm": 0.8181144595146179, + "learning_rate": 9.428302404851875e-06, + "loss": 0.7631, + "step": 5670 + }, + { + "epoch": 0.31212504815895203, + "grad_norm": 0.7115523219108582, + "learning_rate": 9.428101115180243e-06, + "loss": 0.734, + "step": 5671 + }, + { + "epoch": 0.3121800869613077, + "grad_norm": 0.7165855765342712, + "learning_rate": 9.42789979222819e-06, + "loss": 0.8068, + "step": 5672 + }, + { + "epoch": 0.31223512576366336, + "grad_norm": 0.6515665650367737, + "learning_rate": 9.427698435997225e-06, + "loss": 0.6946, + "step": 5673 + }, + { + "epoch": 0.31229016456601905, + "grad_norm": 0.7692676186561584, + "learning_rate": 9.427497046488867e-06, + "loss": 0.7387, + "step": 5674 + }, + { + "epoch": 0.3123452033683747, + "grad_norm": 0.70064777135849, + "learning_rate": 9.427295623704625e-06, + "loss": 0.7976, + "step": 5675 + }, + { + "epoch": 0.31240024217073037, + "grad_norm": 0.7464852333068848, + "learning_rate": 9.427094167646013e-06, + "loss": 0.7574, + "step": 5676 + }, + { + "epoch": 0.312455280973086, + "grad_norm": 0.7721675634384155, + "learning_rate": 9.426892678314548e-06, + "loss": 0.7405, + "step": 5677 + }, + { + "epoch": 0.3125103197754417, + "grad_norm": 0.6581596732139587, + "learning_rate": 9.42669115571174e-06, + "loss": 0.6972, + "step": 5678 + }, + { + "epoch": 0.3125653585777973, + "grad_norm": 0.8722662329673767, + "learning_rate": 9.426489599839108e-06, + "loss": 0.8073, + "step": 5679 + }, + { + "epoch": 0.312620397380153, + "grad_norm": 0.6800306439399719, + "learning_rate": 9.426288010698165e-06, + "loss": 0.7721, + "step": 5680 + }, + { + "epoch": 0.31267543618250865, + "grad_norm": 0.7443979382514954, + "learning_rate": 9.426086388290428e-06, + "loss": 0.7719, + "step": 5681 + }, + { + "epoch": 0.31273047498486434, + "grad_norm": 0.7818729877471924, + "learning_rate": 9.425884732617407e-06, + "loss": 0.7815, + "step": 5682 + }, + { + "epoch": 0.31278551378721997, + "grad_norm": 0.7640877366065979, + "learning_rate": 9.425683043680624e-06, + "loss": 0.8315, + "step": 5683 + }, + { + "epoch": 0.31284055258957566, + "grad_norm": 0.6871064305305481, + "learning_rate": 9.42548132148159e-06, + "loss": 0.8017, + "step": 5684 + }, + { + "epoch": 0.3128955913919313, + "grad_norm": 0.8394801616668701, + "learning_rate": 9.425279566021824e-06, + "loss": 0.763, + "step": 5685 + }, + { + "epoch": 0.312950630194287, + "grad_norm": 0.7104960083961487, + "learning_rate": 9.42507777730284e-06, + "loss": 0.7991, + "step": 5686 + }, + { + "epoch": 0.3130056689966426, + "grad_norm": 0.7820347547531128, + "learning_rate": 9.424875955326159e-06, + "loss": 0.825, + "step": 5687 + }, + { + "epoch": 0.3130607077989983, + "grad_norm": 0.783343493938446, + "learning_rate": 9.424674100093292e-06, + "loss": 0.8189, + "step": 5688 + }, + { + "epoch": 0.31311574660135394, + "grad_norm": 0.7998474836349487, + "learning_rate": 9.42447221160576e-06, + "loss": 0.7382, + "step": 5689 + }, + { + "epoch": 0.31317078540370963, + "grad_norm": 0.7232120633125305, + "learning_rate": 9.424270289865078e-06, + "loss": 0.8556, + "step": 5690 + }, + { + "epoch": 0.31322582420606526, + "grad_norm": 0.7944191694259644, + "learning_rate": 9.424068334872764e-06, + "loss": 0.8272, + "step": 5691 + }, + { + "epoch": 0.31328086300842095, + "grad_norm": 0.7951859831809998, + "learning_rate": 9.42386634663034e-06, + "loss": 0.7612, + "step": 5692 + }, + { + "epoch": 0.3133359018107766, + "grad_norm": 1.394667387008667, + "learning_rate": 9.423664325139318e-06, + "loss": 0.8108, + "step": 5693 + }, + { + "epoch": 0.3133909406131323, + "grad_norm": 0.868886411190033, + "learning_rate": 9.42346227040122e-06, + "loss": 0.8308, + "step": 5694 + }, + { + "epoch": 0.3134459794154879, + "grad_norm": 0.9442586302757263, + "learning_rate": 9.423260182417563e-06, + "loss": 0.9145, + "step": 5695 + }, + { + "epoch": 0.3135010182178436, + "grad_norm": 0.7432793974876404, + "learning_rate": 9.423058061189868e-06, + "loss": 0.7715, + "step": 5696 + }, + { + "epoch": 0.31355605702019923, + "grad_norm": 0.7221946120262146, + "learning_rate": 9.422855906719652e-06, + "loss": 0.7588, + "step": 5697 + }, + { + "epoch": 0.3136110958225549, + "grad_norm": 0.7459834814071655, + "learning_rate": 9.422653719008434e-06, + "loss": 0.7834, + "step": 5698 + }, + { + "epoch": 0.31366613462491055, + "grad_norm": 0.8562330007553101, + "learning_rate": 9.422451498057737e-06, + "loss": 0.6994, + "step": 5699 + }, + { + "epoch": 0.31372117342726624, + "grad_norm": 0.672696053981781, + "learning_rate": 9.422249243869075e-06, + "loss": 0.7201, + "step": 5700 + }, + { + "epoch": 0.3137762122296219, + "grad_norm": 0.7459990382194519, + "learning_rate": 9.422046956443973e-06, + "loss": 0.7663, + "step": 5701 + }, + { + "epoch": 0.31383125103197757, + "grad_norm": 0.9653169512748718, + "learning_rate": 9.42184463578395e-06, + "loss": 0.8899, + "step": 5702 + }, + { + "epoch": 0.3138862898343332, + "grad_norm": 0.7137778997421265, + "learning_rate": 9.421642281890526e-06, + "loss": 0.74, + "step": 5703 + }, + { + "epoch": 0.3139413286366889, + "grad_norm": 0.6961745619773865, + "learning_rate": 9.421439894765222e-06, + "loss": 0.7309, + "step": 5704 + }, + { + "epoch": 0.3139963674390445, + "grad_norm": 0.7843212485313416, + "learning_rate": 9.421237474409559e-06, + "loss": 0.8654, + "step": 5705 + }, + { + "epoch": 0.3140514062414002, + "grad_norm": 0.7560604810714722, + "learning_rate": 9.42103502082506e-06, + "loss": 0.7949, + "step": 5706 + }, + { + "epoch": 0.31410644504375584, + "grad_norm": 0.756200909614563, + "learning_rate": 9.420832534013245e-06, + "loss": 0.7315, + "step": 5707 + }, + { + "epoch": 0.31416148384611153, + "grad_norm": 0.7857967615127563, + "learning_rate": 9.420630013975635e-06, + "loss": 0.7698, + "step": 5708 + }, + { + "epoch": 0.31421652264846717, + "grad_norm": 0.6943809986114502, + "learning_rate": 9.420427460713754e-06, + "loss": 0.7691, + "step": 5709 + }, + { + "epoch": 0.31427156145082286, + "grad_norm": 0.7460532188415527, + "learning_rate": 9.420224874229123e-06, + "loss": 0.7679, + "step": 5710 + }, + { + "epoch": 0.3143266002531785, + "grad_norm": 0.764406144618988, + "learning_rate": 9.420022254523265e-06, + "loss": 0.9545, + "step": 5711 + }, + { + "epoch": 0.3143816390555342, + "grad_norm": 0.7191083431243896, + "learning_rate": 9.419819601597703e-06, + "loss": 0.728, + "step": 5712 + }, + { + "epoch": 0.3144366778578898, + "grad_norm": 0.8799699544906616, + "learning_rate": 9.419616915453959e-06, + "loss": 0.6911, + "step": 5713 + }, + { + "epoch": 0.31449171666024545, + "grad_norm": 0.7505975365638733, + "learning_rate": 9.419414196093558e-06, + "loss": 0.7953, + "step": 5714 + }, + { + "epoch": 0.31454675546260114, + "grad_norm": 0.7575502395629883, + "learning_rate": 9.419211443518023e-06, + "loss": 0.7752, + "step": 5715 + }, + { + "epoch": 0.31460179426495677, + "grad_norm": 0.7220337986946106, + "learning_rate": 9.419008657728879e-06, + "loss": 0.7894, + "step": 5716 + }, + { + "epoch": 0.31465683306731246, + "grad_norm": 0.7797306776046753, + "learning_rate": 9.418805838727648e-06, + "loss": 0.7582, + "step": 5717 + }, + { + "epoch": 0.3147118718696681, + "grad_norm": 0.9011242985725403, + "learning_rate": 9.418602986515855e-06, + "loss": 0.7379, + "step": 5718 + }, + { + "epoch": 0.3147669106720238, + "grad_norm": 0.7568445801734924, + "learning_rate": 9.418400101095025e-06, + "loss": 0.8003, + "step": 5719 + }, + { + "epoch": 0.3148219494743794, + "grad_norm": 0.6810547709465027, + "learning_rate": 9.418197182466681e-06, + "loss": 0.7186, + "step": 5720 + }, + { + "epoch": 0.3148769882767351, + "grad_norm": 0.7390284538269043, + "learning_rate": 9.417994230632352e-06, + "loss": 0.7478, + "step": 5721 + }, + { + "epoch": 0.31493202707909074, + "grad_norm": 0.695286214351654, + "learning_rate": 9.41779124559356e-06, + "loss": 0.7467, + "step": 5722 + }, + { + "epoch": 0.3149870658814464, + "grad_norm": 0.7783445715904236, + "learning_rate": 9.41758822735183e-06, + "loss": 0.824, + "step": 5723 + }, + { + "epoch": 0.31504210468380206, + "grad_norm": 0.7176268696784973, + "learning_rate": 9.41738517590869e-06, + "loss": 0.7596, + "step": 5724 + }, + { + "epoch": 0.31509714348615775, + "grad_norm": 0.7829678058624268, + "learning_rate": 9.417182091265668e-06, + "loss": 0.8184, + "step": 5725 + }, + { + "epoch": 0.3151521822885134, + "grad_norm": 0.7461703419685364, + "learning_rate": 9.416978973424286e-06, + "loss": 0.8732, + "step": 5726 + }, + { + "epoch": 0.31520722109086907, + "grad_norm": 0.7186999320983887, + "learning_rate": 9.416775822386073e-06, + "loss": 0.6878, + "step": 5727 + }, + { + "epoch": 0.3152622598932247, + "grad_norm": 0.6775033473968506, + "learning_rate": 9.416572638152553e-06, + "loss": 0.7211, + "step": 5728 + }, + { + "epoch": 0.3153172986955804, + "grad_norm": 0.6845641732215881, + "learning_rate": 9.416369420725258e-06, + "loss": 0.7282, + "step": 5729 + }, + { + "epoch": 0.31537233749793603, + "grad_norm": 0.8301281929016113, + "learning_rate": 9.416166170105712e-06, + "loss": 0.7999, + "step": 5730 + }, + { + "epoch": 0.3154273763002917, + "grad_norm": 0.8487183451652527, + "learning_rate": 9.415962886295442e-06, + "loss": 0.8202, + "step": 5731 + }, + { + "epoch": 0.31548241510264735, + "grad_norm": 0.74607914686203, + "learning_rate": 9.415759569295979e-06, + "loss": 0.7552, + "step": 5732 + }, + { + "epoch": 0.31553745390500304, + "grad_norm": 0.7774194478988647, + "learning_rate": 9.415556219108846e-06, + "loss": 0.7847, + "step": 5733 + }, + { + "epoch": 0.3155924927073587, + "grad_norm": 0.7782126069068909, + "learning_rate": 9.415352835735576e-06, + "loss": 0.8001, + "step": 5734 + }, + { + "epoch": 0.31564753150971436, + "grad_norm": 0.7577764987945557, + "learning_rate": 9.415149419177698e-06, + "loss": 0.8262, + "step": 5735 + }, + { + "epoch": 0.31570257031207, + "grad_norm": 0.7949855327606201, + "learning_rate": 9.414945969436737e-06, + "loss": 0.8259, + "step": 5736 + }, + { + "epoch": 0.3157576091144257, + "grad_norm": 0.7670153379440308, + "learning_rate": 9.414742486514224e-06, + "loss": 0.7181, + "step": 5737 + }, + { + "epoch": 0.3158126479167813, + "grad_norm": 0.7852359414100647, + "learning_rate": 9.414538970411687e-06, + "loss": 0.8802, + "step": 5738 + }, + { + "epoch": 0.315867686719137, + "grad_norm": 0.8300517201423645, + "learning_rate": 9.414335421130658e-06, + "loss": 0.7665, + "step": 5739 + }, + { + "epoch": 0.31592272552149264, + "grad_norm": 0.7631614804267883, + "learning_rate": 9.414131838672666e-06, + "loss": 0.8864, + "step": 5740 + }, + { + "epoch": 0.31597776432384833, + "grad_norm": 0.7946471571922302, + "learning_rate": 9.41392822303924e-06, + "loss": 0.7587, + "step": 5741 + }, + { + "epoch": 0.31603280312620396, + "grad_norm": 0.7043818235397339, + "learning_rate": 9.413724574231912e-06, + "loss": 0.7793, + "step": 5742 + }, + { + "epoch": 0.31608784192855965, + "grad_norm": 0.7276063561439514, + "learning_rate": 9.41352089225221e-06, + "loss": 0.8064, + "step": 5743 + }, + { + "epoch": 0.3161428807309153, + "grad_norm": 0.7141419053077698, + "learning_rate": 9.413317177101667e-06, + "loss": 0.7251, + "step": 5744 + }, + { + "epoch": 0.316197919533271, + "grad_norm": 0.7961493730545044, + "learning_rate": 9.413113428781815e-06, + "loss": 0.8438, + "step": 5745 + }, + { + "epoch": 0.3162529583356266, + "grad_norm": 0.7046970129013062, + "learning_rate": 9.412909647294181e-06, + "loss": 0.8319, + "step": 5746 + }, + { + "epoch": 0.3163079971379823, + "grad_norm": 0.8231918215751648, + "learning_rate": 9.412705832640302e-06, + "loss": 0.7707, + "step": 5747 + }, + { + "epoch": 0.31636303594033793, + "grad_norm": 0.769840657711029, + "learning_rate": 9.412501984821705e-06, + "loss": 0.6819, + "step": 5748 + }, + { + "epoch": 0.3164180747426936, + "grad_norm": 0.7526834607124329, + "learning_rate": 9.412298103839925e-06, + "loss": 0.8106, + "step": 5749 + }, + { + "epoch": 0.31647311354504926, + "grad_norm": 0.6763152480125427, + "learning_rate": 9.412094189696494e-06, + "loss": 0.7577, + "step": 5750 + }, + { + "epoch": 0.31652815234740495, + "grad_norm": 0.8460820317268372, + "learning_rate": 9.411890242392945e-06, + "loss": 0.752, + "step": 5751 + }, + { + "epoch": 0.3165831911497606, + "grad_norm": 0.7610277533531189, + "learning_rate": 9.411686261930809e-06, + "loss": 0.7284, + "step": 5752 + }, + { + "epoch": 0.31663822995211627, + "grad_norm": 0.7596566081047058, + "learning_rate": 9.411482248311619e-06, + "loss": 0.8518, + "step": 5753 + }, + { + "epoch": 0.3166932687544719, + "grad_norm": 0.7615048885345459, + "learning_rate": 9.41127820153691e-06, + "loss": 0.8232, + "step": 5754 + }, + { + "epoch": 0.3167483075568276, + "grad_norm": 0.7882834672927856, + "learning_rate": 9.411074121608215e-06, + "loss": 0.8682, + "step": 5755 + }, + { + "epoch": 0.3168033463591832, + "grad_norm": 0.748002827167511, + "learning_rate": 9.410870008527067e-06, + "loss": 0.7934, + "step": 5756 + }, + { + "epoch": 0.31685838516153886, + "grad_norm": 0.7677696943283081, + "learning_rate": 9.410665862295003e-06, + "loss": 0.8114, + "step": 5757 + }, + { + "epoch": 0.31691342396389455, + "grad_norm": 0.8966217041015625, + "learning_rate": 9.410461682913552e-06, + "loss": 0.8005, + "step": 5758 + }, + { + "epoch": 0.3169684627662502, + "grad_norm": 0.8769435286521912, + "learning_rate": 9.410257470384253e-06, + "loss": 0.7935, + "step": 5759 + }, + { + "epoch": 0.31702350156860587, + "grad_norm": 0.9828680753707886, + "learning_rate": 9.41005322470864e-06, + "loss": 0.8182, + "step": 5760 + }, + { + "epoch": 0.3170785403709615, + "grad_norm": 0.7340976595878601, + "learning_rate": 9.409848945888245e-06, + "loss": 0.7832, + "step": 5761 + }, + { + "epoch": 0.3171335791733172, + "grad_norm": 0.7516821622848511, + "learning_rate": 9.409644633924609e-06, + "loss": 0.8223, + "step": 5762 + }, + { + "epoch": 0.3171886179756728, + "grad_norm": 0.7556331157684326, + "learning_rate": 9.409440288819263e-06, + "loss": 0.7631, + "step": 5763 + }, + { + "epoch": 0.3172436567780285, + "grad_norm": 0.6182114481925964, + "learning_rate": 9.409235910573743e-06, + "loss": 0.558, + "step": 5764 + }, + { + "epoch": 0.31729869558038415, + "grad_norm": 0.7854578495025635, + "learning_rate": 9.409031499189586e-06, + "loss": 0.8496, + "step": 5765 + }, + { + "epoch": 0.31735373438273984, + "grad_norm": 0.7246551513671875, + "learning_rate": 9.40882705466833e-06, + "loss": 0.8407, + "step": 5766 + }, + { + "epoch": 0.31740877318509547, + "grad_norm": 1.089107632637024, + "learning_rate": 9.40862257701151e-06, + "loss": 0.8363, + "step": 5767 + }, + { + "epoch": 0.31746381198745116, + "grad_norm": 0.9886558055877686, + "learning_rate": 9.408418066220664e-06, + "loss": 0.6888, + "step": 5768 + }, + { + "epoch": 0.3175188507898068, + "grad_norm": 0.8724960088729858, + "learning_rate": 9.408213522297325e-06, + "loss": 0.7717, + "step": 5769 + }, + { + "epoch": 0.3175738895921625, + "grad_norm": 0.7453228831291199, + "learning_rate": 9.408008945243035e-06, + "loss": 0.7081, + "step": 5770 + }, + { + "epoch": 0.3176289283945181, + "grad_norm": 0.7601909637451172, + "learning_rate": 9.40780433505933e-06, + "loss": 0.7974, + "step": 5771 + }, + { + "epoch": 0.3176839671968738, + "grad_norm": 0.7704907655715942, + "learning_rate": 9.407599691747746e-06, + "loss": 0.7521, + "step": 5772 + }, + { + "epoch": 0.31773900599922944, + "grad_norm": 0.7639214396476746, + "learning_rate": 9.407395015309824e-06, + "loss": 0.7888, + "step": 5773 + }, + { + "epoch": 0.31779404480158513, + "grad_norm": 0.711355984210968, + "learning_rate": 9.4071903057471e-06, + "loss": 0.7482, + "step": 5774 + }, + { + "epoch": 0.31784908360394076, + "grad_norm": 0.6097242832183838, + "learning_rate": 9.406985563061114e-06, + "loss": 0.6533, + "step": 5775 + }, + { + "epoch": 0.31790412240629645, + "grad_norm": 0.807133138179779, + "learning_rate": 9.406780787253402e-06, + "loss": 0.7788, + "step": 5776 + }, + { + "epoch": 0.3179591612086521, + "grad_norm": 0.6938545107841492, + "learning_rate": 9.406575978325508e-06, + "loss": 0.8046, + "step": 5777 + }, + { + "epoch": 0.3180142000110078, + "grad_norm": 0.848858118057251, + "learning_rate": 9.406371136278968e-06, + "loss": 0.8481, + "step": 5778 + }, + { + "epoch": 0.3180692388133634, + "grad_norm": 0.8496920466423035, + "learning_rate": 9.40616626111532e-06, + "loss": 0.8172, + "step": 5779 + }, + { + "epoch": 0.3181242776157191, + "grad_norm": 0.8169928193092346, + "learning_rate": 9.405961352836107e-06, + "loss": 0.792, + "step": 5780 + }, + { + "epoch": 0.31817931641807473, + "grad_norm": 0.9380607604980469, + "learning_rate": 9.405756411442868e-06, + "loss": 0.8371, + "step": 5781 + }, + { + "epoch": 0.3182343552204304, + "grad_norm": 0.6938190460205078, + "learning_rate": 9.405551436937144e-06, + "loss": 0.7825, + "step": 5782 + }, + { + "epoch": 0.31828939402278605, + "grad_norm": 0.7726871371269226, + "learning_rate": 9.405346429320473e-06, + "loss": 0.6481, + "step": 5783 + }, + { + "epoch": 0.31834443282514174, + "grad_norm": 0.77762770652771, + "learning_rate": 9.4051413885944e-06, + "loss": 0.6916, + "step": 5784 + }, + { + "epoch": 0.3183994716274974, + "grad_norm": 0.7580817341804504, + "learning_rate": 9.404936314760459e-06, + "loss": 0.8222, + "step": 5785 + }, + { + "epoch": 0.31845451042985307, + "grad_norm": 0.6984102725982666, + "learning_rate": 9.4047312078202e-06, + "loss": 0.707, + "step": 5786 + }, + { + "epoch": 0.3185095492322087, + "grad_norm": 0.6887965202331543, + "learning_rate": 9.404526067775159e-06, + "loss": 0.7289, + "step": 5787 + }, + { + "epoch": 0.3185645880345644, + "grad_norm": 0.7022155523300171, + "learning_rate": 9.404320894626879e-06, + "loss": 0.741, + "step": 5788 + }, + { + "epoch": 0.31861962683692, + "grad_norm": 0.8007381558418274, + "learning_rate": 9.404115688376903e-06, + "loss": 0.8332, + "step": 5789 + }, + { + "epoch": 0.3186746656392757, + "grad_norm": 0.6985924243927002, + "learning_rate": 9.40391044902677e-06, + "loss": 0.7849, + "step": 5790 + }, + { + "epoch": 0.31872970444163135, + "grad_norm": 0.771060585975647, + "learning_rate": 9.403705176578028e-06, + "loss": 0.8728, + "step": 5791 + }, + { + "epoch": 0.31878474324398703, + "grad_norm": 0.6976794600486755, + "learning_rate": 9.403499871032214e-06, + "loss": 0.7621, + "step": 5792 + }, + { + "epoch": 0.31883978204634267, + "grad_norm": 0.7552126049995422, + "learning_rate": 9.403294532390876e-06, + "loss": 0.7641, + "step": 5793 + }, + { + "epoch": 0.31889482084869836, + "grad_norm": 1.0032007694244385, + "learning_rate": 9.403089160655553e-06, + "loss": 0.8497, + "step": 5794 + }, + { + "epoch": 0.318949859651054, + "grad_norm": 0.7193583250045776, + "learning_rate": 9.402883755827792e-06, + "loss": 0.7991, + "step": 5795 + }, + { + "epoch": 0.3190048984534097, + "grad_norm": 0.7665852308273315, + "learning_rate": 9.402678317909135e-06, + "loss": 0.7692, + "step": 5796 + }, + { + "epoch": 0.3190599372557653, + "grad_norm": 0.7514237761497498, + "learning_rate": 9.402472846901125e-06, + "loss": 0.7388, + "step": 5797 + }, + { + "epoch": 0.319114976058121, + "grad_norm": 0.6817325353622437, + "learning_rate": 9.402267342805309e-06, + "loss": 0.7249, + "step": 5798 + }, + { + "epoch": 0.31917001486047664, + "grad_norm": 0.7659624218940735, + "learning_rate": 9.402061805623229e-06, + "loss": 0.755, + "step": 5799 + }, + { + "epoch": 0.31922505366283227, + "grad_norm": 0.7860668301582336, + "learning_rate": 9.401856235356431e-06, + "loss": 0.8175, + "step": 5800 + }, + { + "epoch": 0.31928009246518796, + "grad_norm": 0.714030921459198, + "learning_rate": 9.401650632006461e-06, + "loss": 0.7359, + "step": 5801 + }, + { + "epoch": 0.3193351312675436, + "grad_norm": 0.6052672266960144, + "learning_rate": 9.401444995574862e-06, + "loss": 0.6167, + "step": 5802 + }, + { + "epoch": 0.3193901700698993, + "grad_norm": 0.7960858941078186, + "learning_rate": 9.40123932606318e-06, + "loss": 0.7542, + "step": 5803 + }, + { + "epoch": 0.3194452088722549, + "grad_norm": 0.7926718592643738, + "learning_rate": 9.401033623472962e-06, + "loss": 0.8292, + "step": 5804 + }, + { + "epoch": 0.3195002476746106, + "grad_norm": 0.7950098514556885, + "learning_rate": 9.400827887805754e-06, + "loss": 0.9332, + "step": 5805 + }, + { + "epoch": 0.31955528647696624, + "grad_norm": 0.7564939260482788, + "learning_rate": 9.400622119063101e-06, + "loss": 0.7217, + "step": 5806 + }, + { + "epoch": 0.3196103252793219, + "grad_norm": 0.7582511901855469, + "learning_rate": 9.40041631724655e-06, + "loss": 0.723, + "step": 5807 + }, + { + "epoch": 0.31966536408167756, + "grad_norm": 0.8826366066932678, + "learning_rate": 9.400210482357648e-06, + "loss": 0.6977, + "step": 5808 + }, + { + "epoch": 0.31972040288403325, + "grad_norm": 0.7029523253440857, + "learning_rate": 9.400004614397941e-06, + "loss": 0.6949, + "step": 5809 + }, + { + "epoch": 0.3197754416863889, + "grad_norm": 0.7651532888412476, + "learning_rate": 9.399798713368979e-06, + "loss": 0.7158, + "step": 5810 + }, + { + "epoch": 0.3198304804887446, + "grad_norm": 0.9379491806030273, + "learning_rate": 9.399592779272307e-06, + "loss": 0.7639, + "step": 5811 + }, + { + "epoch": 0.3198855192911002, + "grad_norm": 0.7945839762687683, + "learning_rate": 9.399386812109474e-06, + "loss": 0.8175, + "step": 5812 + }, + { + "epoch": 0.3199405580934559, + "grad_norm": 0.9462345242500305, + "learning_rate": 9.399180811882025e-06, + "loss": 0.6635, + "step": 5813 + }, + { + "epoch": 0.31999559689581153, + "grad_norm": 1.0449726581573486, + "learning_rate": 9.398974778591513e-06, + "loss": 0.789, + "step": 5814 + }, + { + "epoch": 0.3200506356981672, + "grad_norm": 0.8295683860778809, + "learning_rate": 9.398768712239483e-06, + "loss": 0.7937, + "step": 5815 + }, + { + "epoch": 0.32010567450052285, + "grad_norm": 0.7578030228614807, + "learning_rate": 9.398562612827485e-06, + "loss": 0.8291, + "step": 5816 + }, + { + "epoch": 0.32016071330287854, + "grad_norm": 0.804563581943512, + "learning_rate": 9.398356480357068e-06, + "loss": 0.7604, + "step": 5817 + }, + { + "epoch": 0.3202157521052342, + "grad_norm": 0.8073337078094482, + "learning_rate": 9.39815031482978e-06, + "loss": 0.8288, + "step": 5818 + }, + { + "epoch": 0.32027079090758986, + "grad_norm": 0.8054978251457214, + "learning_rate": 9.397944116247173e-06, + "loss": 0.819, + "step": 5819 + }, + { + "epoch": 0.3203258297099455, + "grad_norm": 0.8304697871208191, + "learning_rate": 9.397737884610794e-06, + "loss": 0.7991, + "step": 5820 + }, + { + "epoch": 0.3203808685123012, + "grad_norm": 0.784662663936615, + "learning_rate": 9.397531619922195e-06, + "loss": 0.763, + "step": 5821 + }, + { + "epoch": 0.3204359073146568, + "grad_norm": 0.726046085357666, + "learning_rate": 9.397325322182926e-06, + "loss": 0.7926, + "step": 5822 + }, + { + "epoch": 0.3204909461170125, + "grad_norm": 0.7291107773780823, + "learning_rate": 9.397118991394535e-06, + "loss": 0.6871, + "step": 5823 + }, + { + "epoch": 0.32054598491936814, + "grad_norm": 0.7870203256607056, + "learning_rate": 9.396912627558577e-06, + "loss": 0.7827, + "step": 5824 + }, + { + "epoch": 0.32060102372172383, + "grad_norm": 0.8665844798088074, + "learning_rate": 9.3967062306766e-06, + "loss": 0.8098, + "step": 5825 + }, + { + "epoch": 0.32065606252407947, + "grad_norm": 0.7743843793869019, + "learning_rate": 9.396499800750157e-06, + "loss": 0.835, + "step": 5826 + }, + { + "epoch": 0.32071110132643516, + "grad_norm": 0.7724023461341858, + "learning_rate": 9.396293337780796e-06, + "loss": 0.8928, + "step": 5827 + }, + { + "epoch": 0.3207661401287908, + "grad_norm": 0.7497217655181885, + "learning_rate": 9.39608684177007e-06, + "loss": 0.8035, + "step": 5828 + }, + { + "epoch": 0.3208211789311465, + "grad_norm": 0.8346971869468689, + "learning_rate": 9.395880312719536e-06, + "loss": 0.8879, + "step": 5829 + }, + { + "epoch": 0.3208762177335021, + "grad_norm": 0.836626410484314, + "learning_rate": 9.39567375063074e-06, + "loss": 0.8523, + "step": 5830 + }, + { + "epoch": 0.3209312565358578, + "grad_norm": 0.734428346157074, + "learning_rate": 9.395467155505237e-06, + "loss": 0.7568, + "step": 5831 + }, + { + "epoch": 0.32098629533821343, + "grad_norm": 0.6620383858680725, + "learning_rate": 9.39526052734458e-06, + "loss": 0.7296, + "step": 5832 + }, + { + "epoch": 0.3210413341405691, + "grad_norm": 0.9356484413146973, + "learning_rate": 9.39505386615032e-06, + "loss": 0.8233, + "step": 5833 + }, + { + "epoch": 0.32109637294292476, + "grad_norm": 0.9238032698631287, + "learning_rate": 9.394847171924013e-06, + "loss": 0.7397, + "step": 5834 + }, + { + "epoch": 0.32115141174528045, + "grad_norm": 0.7161185145378113, + "learning_rate": 9.39464044466721e-06, + "loss": 0.7541, + "step": 5835 + }, + { + "epoch": 0.3212064505476361, + "grad_norm": 0.8381507396697998, + "learning_rate": 9.394433684381467e-06, + "loss": 0.7839, + "step": 5836 + }, + { + "epoch": 0.32126148934999177, + "grad_norm": 0.8299819231033325, + "learning_rate": 9.394226891068337e-06, + "loss": 0.871, + "step": 5837 + }, + { + "epoch": 0.3213165281523474, + "grad_norm": 0.7443987131118774, + "learning_rate": 9.394020064729372e-06, + "loss": 0.7661, + "step": 5838 + }, + { + "epoch": 0.3213715669547031, + "grad_norm": 0.7084206938743591, + "learning_rate": 9.393813205366128e-06, + "loss": 0.7609, + "step": 5839 + }, + { + "epoch": 0.3214266057570587, + "grad_norm": 0.7443114519119263, + "learning_rate": 9.393606312980164e-06, + "loss": 0.8189, + "step": 5840 + }, + { + "epoch": 0.3214816445594144, + "grad_norm": 0.7157652974128723, + "learning_rate": 9.393399387573028e-06, + "loss": 0.8369, + "step": 5841 + }, + { + "epoch": 0.32153668336177005, + "grad_norm": 0.709507942199707, + "learning_rate": 9.393192429146278e-06, + "loss": 0.7314, + "step": 5842 + }, + { + "epoch": 0.3215917221641257, + "grad_norm": 0.7704687714576721, + "learning_rate": 9.39298543770147e-06, + "loss": 0.8793, + "step": 5843 + }, + { + "epoch": 0.32164676096648137, + "grad_norm": 0.8123828172683716, + "learning_rate": 9.39277841324016e-06, + "loss": 0.8748, + "step": 5844 + }, + { + "epoch": 0.321701799768837, + "grad_norm": 0.6951777338981628, + "learning_rate": 9.392571355763903e-06, + "loss": 0.7883, + "step": 5845 + }, + { + "epoch": 0.3217568385711927, + "grad_norm": 0.6753274202346802, + "learning_rate": 9.392364265274256e-06, + "loss": 0.7292, + "step": 5846 + }, + { + "epoch": 0.3218118773735483, + "grad_norm": 0.7940227389335632, + "learning_rate": 9.392157141772775e-06, + "loss": 0.7919, + "step": 5847 + }, + { + "epoch": 0.321866916175904, + "grad_norm": 0.6706317067146301, + "learning_rate": 9.391949985261016e-06, + "loss": 0.6791, + "step": 5848 + }, + { + "epoch": 0.32192195497825965, + "grad_norm": 0.7898741960525513, + "learning_rate": 9.391742795740537e-06, + "loss": 0.7539, + "step": 5849 + }, + { + "epoch": 0.32197699378061534, + "grad_norm": 0.7623887658119202, + "learning_rate": 9.391535573212895e-06, + "loss": 0.7891, + "step": 5850 + }, + { + "epoch": 0.322032032582971, + "grad_norm": 0.6852909326553345, + "learning_rate": 9.391328317679647e-06, + "loss": 0.6587, + "step": 5851 + }, + { + "epoch": 0.32208707138532666, + "grad_norm": 0.7944231033325195, + "learning_rate": 9.39112102914235e-06, + "loss": 0.8316, + "step": 5852 + }, + { + "epoch": 0.3221421101876823, + "grad_norm": 0.6720889806747437, + "learning_rate": 9.390913707602563e-06, + "loss": 0.7791, + "step": 5853 + }, + { + "epoch": 0.322197148990038, + "grad_norm": 0.7482234239578247, + "learning_rate": 9.390706353061845e-06, + "loss": 0.826, + "step": 5854 + }, + { + "epoch": 0.3222521877923936, + "grad_norm": 0.6821579933166504, + "learning_rate": 9.390498965521752e-06, + "loss": 0.7183, + "step": 5855 + }, + { + "epoch": 0.3223072265947493, + "grad_norm": 0.755171537399292, + "learning_rate": 9.390291544983845e-06, + "loss": 0.6887, + "step": 5856 + }, + { + "epoch": 0.32236226539710494, + "grad_norm": 0.748824417591095, + "learning_rate": 9.39008409144968e-06, + "loss": 0.7169, + "step": 5857 + }, + { + "epoch": 0.32241730419946063, + "grad_norm": 0.7479343414306641, + "learning_rate": 9.38987660492082e-06, + "loss": 0.8122, + "step": 5858 + }, + { + "epoch": 0.32247234300181626, + "grad_norm": 0.7459376454353333, + "learning_rate": 9.389669085398823e-06, + "loss": 0.7782, + "step": 5859 + }, + { + "epoch": 0.32252738180417195, + "grad_norm": 0.7016253471374512, + "learning_rate": 9.389461532885246e-06, + "loss": 0.7866, + "step": 5860 + }, + { + "epoch": 0.3225824206065276, + "grad_norm": 0.6711822152137756, + "learning_rate": 9.389253947381654e-06, + "loss": 0.7223, + "step": 5861 + }, + { + "epoch": 0.3226374594088833, + "grad_norm": 0.855045735836029, + "learning_rate": 9.389046328889602e-06, + "loss": 0.7327, + "step": 5862 + }, + { + "epoch": 0.3226924982112389, + "grad_norm": 0.7309823632240295, + "learning_rate": 9.388838677410654e-06, + "loss": 0.7737, + "step": 5863 + }, + { + "epoch": 0.3227475370135946, + "grad_norm": 0.7737841010093689, + "learning_rate": 9.388630992946369e-06, + "loss": 0.7061, + "step": 5864 + }, + { + "epoch": 0.32280257581595023, + "grad_norm": 0.9448195099830627, + "learning_rate": 9.388423275498307e-06, + "loss": 0.8382, + "step": 5865 + }, + { + "epoch": 0.3228576146183059, + "grad_norm": 0.7348229885101318, + "learning_rate": 9.388215525068032e-06, + "loss": 0.8317, + "step": 5866 + }, + { + "epoch": 0.32291265342066156, + "grad_norm": 1.2628185749053955, + "learning_rate": 9.388007741657103e-06, + "loss": 0.7959, + "step": 5867 + }, + { + "epoch": 0.32296769222301724, + "grad_norm": 0.7730327844619751, + "learning_rate": 9.387799925267083e-06, + "loss": 0.7455, + "step": 5868 + }, + { + "epoch": 0.3230227310253729, + "grad_norm": 0.8273047804832458, + "learning_rate": 9.387592075899532e-06, + "loss": 0.877, + "step": 5869 + }, + { + "epoch": 0.32307776982772857, + "grad_norm": 0.7413405776023865, + "learning_rate": 9.387384193556014e-06, + "loss": 0.7734, + "step": 5870 + }, + { + "epoch": 0.3231328086300842, + "grad_norm": 1.0173207521438599, + "learning_rate": 9.387176278238092e-06, + "loss": 0.8674, + "step": 5871 + }, + { + "epoch": 0.3231878474324399, + "grad_norm": 0.7741677761077881, + "learning_rate": 9.386968329947327e-06, + "loss": 0.8226, + "step": 5872 + }, + { + "epoch": 0.3232428862347955, + "grad_norm": 0.8912034034729004, + "learning_rate": 9.38676034868528e-06, + "loss": 0.7977, + "step": 5873 + }, + { + "epoch": 0.3232979250371512, + "grad_norm": 0.7343642711639404, + "learning_rate": 9.386552334453519e-06, + "loss": 0.7639, + "step": 5874 + }, + { + "epoch": 0.32335296383950685, + "grad_norm": 0.697225034236908, + "learning_rate": 9.386344287253603e-06, + "loss": 0.6801, + "step": 5875 + }, + { + "epoch": 0.32340800264186254, + "grad_norm": 0.7082511186599731, + "learning_rate": 9.386136207087099e-06, + "loss": 0.746, + "step": 5876 + }, + { + "epoch": 0.32346304144421817, + "grad_norm": 0.671419620513916, + "learning_rate": 9.38592809395557e-06, + "loss": 0.7023, + "step": 5877 + }, + { + "epoch": 0.32351808024657386, + "grad_norm": 0.775834321975708, + "learning_rate": 9.385719947860579e-06, + "loss": 0.7797, + "step": 5878 + }, + { + "epoch": 0.3235731190489295, + "grad_norm": 0.7867023348808289, + "learning_rate": 9.38551176880369e-06, + "loss": 0.8165, + "step": 5879 + }, + { + "epoch": 0.3236281578512852, + "grad_norm": 0.7099916934967041, + "learning_rate": 9.385303556786469e-06, + "loss": 0.7598, + "step": 5880 + }, + { + "epoch": 0.3236831966536408, + "grad_norm": 0.7362176179885864, + "learning_rate": 9.385095311810479e-06, + "loss": 0.8002, + "step": 5881 + }, + { + "epoch": 0.3237382354559965, + "grad_norm": 0.7310882806777954, + "learning_rate": 9.384887033877288e-06, + "loss": 0.7641, + "step": 5882 + }, + { + "epoch": 0.32379327425835214, + "grad_norm": 0.7769907116889954, + "learning_rate": 9.384678722988458e-06, + "loss": 0.7938, + "step": 5883 + }, + { + "epoch": 0.3238483130607078, + "grad_norm": 0.9913623929023743, + "learning_rate": 9.384470379145558e-06, + "loss": 0.8203, + "step": 5884 + }, + { + "epoch": 0.32390335186306346, + "grad_norm": 0.8765702247619629, + "learning_rate": 9.384262002350153e-06, + "loss": 0.9343, + "step": 5885 + }, + { + "epoch": 0.3239583906654191, + "grad_norm": 0.8122400641441345, + "learning_rate": 9.384053592603808e-06, + "loss": 0.8325, + "step": 5886 + }, + { + "epoch": 0.3240134294677748, + "grad_norm": 0.7600317597389221, + "learning_rate": 9.383845149908089e-06, + "loss": 0.8335, + "step": 5887 + }, + { + "epoch": 0.3240684682701304, + "grad_norm": 0.9472025632858276, + "learning_rate": 9.383636674264563e-06, + "loss": 0.7265, + "step": 5888 + }, + { + "epoch": 0.3241235070724861, + "grad_norm": 0.6961854100227356, + "learning_rate": 9.383428165674797e-06, + "loss": 0.6962, + "step": 5889 + }, + { + "epoch": 0.32417854587484174, + "grad_norm": 0.7032504081726074, + "learning_rate": 9.38321962414036e-06, + "loss": 0.7627, + "step": 5890 + }, + { + "epoch": 0.32423358467719743, + "grad_norm": 0.7727648019790649, + "learning_rate": 9.383011049662816e-06, + "loss": 0.757, + "step": 5891 + }, + { + "epoch": 0.32428862347955306, + "grad_norm": 0.7263824343681335, + "learning_rate": 9.382802442243735e-06, + "loss": 0.8057, + "step": 5892 + }, + { + "epoch": 0.32434366228190875, + "grad_norm": 0.7576926350593567, + "learning_rate": 9.382593801884683e-06, + "loss": 0.763, + "step": 5893 + }, + { + "epoch": 0.3243987010842644, + "grad_norm": 0.7468064427375793, + "learning_rate": 9.38238512858723e-06, + "loss": 0.731, + "step": 5894 + }, + { + "epoch": 0.3244537398866201, + "grad_norm": 0.9570005536079407, + "learning_rate": 9.382176422352944e-06, + "loss": 0.7985, + "step": 5895 + }, + { + "epoch": 0.3245087786889757, + "grad_norm": 0.7296027541160583, + "learning_rate": 9.381967683183393e-06, + "loss": 0.8117, + "step": 5896 + }, + { + "epoch": 0.3245638174913314, + "grad_norm": 0.7330880165100098, + "learning_rate": 9.381758911080145e-06, + "loss": 0.7229, + "step": 5897 + }, + { + "epoch": 0.32461885629368703, + "grad_norm": 0.7247695922851562, + "learning_rate": 9.38155010604477e-06, + "loss": 0.7704, + "step": 5898 + }, + { + "epoch": 0.3246738950960427, + "grad_norm": 0.8011599779129028, + "learning_rate": 9.381341268078836e-06, + "loss": 0.6982, + "step": 5899 + }, + { + "epoch": 0.32472893389839835, + "grad_norm": 0.7931570410728455, + "learning_rate": 9.381132397183917e-06, + "loss": 0.8188, + "step": 5900 + }, + { + "epoch": 0.32478397270075404, + "grad_norm": 0.7469003200531006, + "learning_rate": 9.380923493361577e-06, + "loss": 0.7638, + "step": 5901 + }, + { + "epoch": 0.3248390115031097, + "grad_norm": 0.7442750334739685, + "learning_rate": 9.380714556613391e-06, + "loss": 0.8134, + "step": 5902 + }, + { + "epoch": 0.32489405030546537, + "grad_norm": 0.8014402985572815, + "learning_rate": 9.380505586940925e-06, + "loss": 0.838, + "step": 5903 + }, + { + "epoch": 0.324949089107821, + "grad_norm": 0.7287543416023254, + "learning_rate": 9.380296584345751e-06, + "loss": 0.7317, + "step": 5904 + }, + { + "epoch": 0.3250041279101767, + "grad_norm": 0.7754266262054443, + "learning_rate": 9.380087548829441e-06, + "loss": 0.7205, + "step": 5905 + }, + { + "epoch": 0.3250591667125323, + "grad_norm": 0.7439714074134827, + "learning_rate": 9.379878480393567e-06, + "loss": 0.821, + "step": 5906 + }, + { + "epoch": 0.325114205514888, + "grad_norm": 0.7142870426177979, + "learning_rate": 9.379669379039698e-06, + "loss": 0.7462, + "step": 5907 + }, + { + "epoch": 0.32516924431724364, + "grad_norm": 0.6522948145866394, + "learning_rate": 9.379460244769407e-06, + "loss": 0.739, + "step": 5908 + }, + { + "epoch": 0.32522428311959933, + "grad_norm": 0.7879271507263184, + "learning_rate": 9.379251077584263e-06, + "loss": 0.719, + "step": 5909 + }, + { + "epoch": 0.32527932192195497, + "grad_norm": 0.6969109773635864, + "learning_rate": 9.379041877485842e-06, + "loss": 0.7517, + "step": 5910 + }, + { + "epoch": 0.32533436072431066, + "grad_norm": 0.736890971660614, + "learning_rate": 9.378832644475714e-06, + "loss": 0.7797, + "step": 5911 + }, + { + "epoch": 0.3253893995266663, + "grad_norm": 0.7504066824913025, + "learning_rate": 9.378623378555451e-06, + "loss": 0.7502, + "step": 5912 + }, + { + "epoch": 0.325444438329022, + "grad_norm": 0.9339223504066467, + "learning_rate": 9.378414079726629e-06, + "loss": 0.8842, + "step": 5913 + }, + { + "epoch": 0.3254994771313776, + "grad_norm": 1.08317232131958, + "learning_rate": 9.378204747990818e-06, + "loss": 0.7503, + "step": 5914 + }, + { + "epoch": 0.3255545159337333, + "grad_norm": 0.722665011882782, + "learning_rate": 9.37799538334959e-06, + "loss": 0.7825, + "step": 5915 + }, + { + "epoch": 0.32560955473608894, + "grad_norm": 0.7969509959220886, + "learning_rate": 9.377785985804521e-06, + "loss": 0.8678, + "step": 5916 + }, + { + "epoch": 0.3256645935384446, + "grad_norm": 0.7944697141647339, + "learning_rate": 9.377576555357187e-06, + "loss": 0.8067, + "step": 5917 + }, + { + "epoch": 0.32571963234080026, + "grad_norm": 0.905580461025238, + "learning_rate": 9.377367092009158e-06, + "loss": 0.7689, + "step": 5918 + }, + { + "epoch": 0.32577467114315595, + "grad_norm": 0.7428018450737, + "learning_rate": 9.37715759576201e-06, + "loss": 0.7748, + "step": 5919 + }, + { + "epoch": 0.3258297099455116, + "grad_norm": 0.7746098041534424, + "learning_rate": 9.376948066617316e-06, + "loss": 0.7235, + "step": 5920 + }, + { + "epoch": 0.32588474874786727, + "grad_norm": 0.6842886805534363, + "learning_rate": 9.376738504576653e-06, + "loss": 0.7697, + "step": 5921 + }, + { + "epoch": 0.3259397875502229, + "grad_norm": 0.7858961224555969, + "learning_rate": 9.376528909641595e-06, + "loss": 0.7746, + "step": 5922 + }, + { + "epoch": 0.3259948263525786, + "grad_norm": 0.7534621357917786, + "learning_rate": 9.376319281813717e-06, + "loss": 0.8183, + "step": 5923 + }, + { + "epoch": 0.3260498651549342, + "grad_norm": 1.2406045198440552, + "learning_rate": 9.376109621094594e-06, + "loss": 0.8173, + "step": 5924 + }, + { + "epoch": 0.3261049039572899, + "grad_norm": 0.740075945854187, + "learning_rate": 9.375899927485804e-06, + "loss": 0.725, + "step": 5925 + }, + { + "epoch": 0.32615994275964555, + "grad_norm": 0.8432604074478149, + "learning_rate": 9.375690200988921e-06, + "loss": 0.7805, + "step": 5926 + }, + { + "epoch": 0.32621498156200124, + "grad_norm": 0.7652943134307861, + "learning_rate": 9.37548044160552e-06, + "loss": 0.8609, + "step": 5927 + }, + { + "epoch": 0.32627002036435687, + "grad_norm": 0.7629607915878296, + "learning_rate": 9.37527064933718e-06, + "loss": 0.8776, + "step": 5928 + }, + { + "epoch": 0.3263250591667125, + "grad_norm": 0.8648995757102966, + "learning_rate": 9.375060824185479e-06, + "loss": 0.7543, + "step": 5929 + }, + { + "epoch": 0.3263800979690682, + "grad_norm": 0.8069457411766052, + "learning_rate": 9.374850966151989e-06, + "loss": 0.7995, + "step": 5930 + }, + { + "epoch": 0.32643513677142383, + "grad_norm": 0.7948445677757263, + "learning_rate": 9.374641075238293e-06, + "loss": 0.8312, + "step": 5931 + }, + { + "epoch": 0.3264901755737795, + "grad_norm": 0.7739841341972351, + "learning_rate": 9.374431151445963e-06, + "loss": 0.8442, + "step": 5932 + }, + { + "epoch": 0.32654521437613515, + "grad_norm": 0.7382220029830933, + "learning_rate": 9.374221194776583e-06, + "loss": 0.7519, + "step": 5933 + }, + { + "epoch": 0.32660025317849084, + "grad_norm": 0.7876916527748108, + "learning_rate": 9.374011205231725e-06, + "loss": 0.817, + "step": 5934 + }, + { + "epoch": 0.3266552919808465, + "grad_norm": 0.7175565958023071, + "learning_rate": 9.373801182812969e-06, + "loss": 0.7317, + "step": 5935 + }, + { + "epoch": 0.32671033078320216, + "grad_norm": 0.7739143967628479, + "learning_rate": 9.373591127521894e-06, + "loss": 0.8134, + "step": 5936 + }, + { + "epoch": 0.3267653695855578, + "grad_norm": 0.7388991713523865, + "learning_rate": 9.373381039360082e-06, + "loss": 0.8758, + "step": 5937 + }, + { + "epoch": 0.3268204083879135, + "grad_norm": 0.7393535375595093, + "learning_rate": 9.373170918329105e-06, + "loss": 0.7453, + "step": 5938 + }, + { + "epoch": 0.3268754471902691, + "grad_norm": 0.7168294191360474, + "learning_rate": 9.372960764430547e-06, + "loss": 0.6535, + "step": 5939 + }, + { + "epoch": 0.3269304859926248, + "grad_norm": 0.7472337484359741, + "learning_rate": 9.372750577665988e-06, + "loss": 0.8065, + "step": 5940 + }, + { + "epoch": 0.32698552479498044, + "grad_norm": 0.7211272120475769, + "learning_rate": 9.372540358037005e-06, + "loss": 0.7389, + "step": 5941 + }, + { + "epoch": 0.32704056359733613, + "grad_norm": 0.8097178339958191, + "learning_rate": 9.37233010554518e-06, + "loss": 0.8034, + "step": 5942 + }, + { + "epoch": 0.32709560239969176, + "grad_norm": 0.7929103970527649, + "learning_rate": 9.372119820192091e-06, + "loss": 0.796, + "step": 5943 + }, + { + "epoch": 0.32715064120204745, + "grad_norm": 0.701171875, + "learning_rate": 9.37190950197932e-06, + "loss": 0.7092, + "step": 5944 + }, + { + "epoch": 0.3272056800044031, + "grad_norm": 0.679142951965332, + "learning_rate": 9.371699150908448e-06, + "loss": 0.6995, + "step": 5945 + }, + { + "epoch": 0.3272607188067588, + "grad_norm": 0.7757906913757324, + "learning_rate": 9.371488766981057e-06, + "loss": 0.8662, + "step": 5946 + }, + { + "epoch": 0.3273157576091144, + "grad_norm": 0.8086597323417664, + "learning_rate": 9.371278350198724e-06, + "loss": 0.7455, + "step": 5947 + }, + { + "epoch": 0.3273707964114701, + "grad_norm": 0.6443416476249695, + "learning_rate": 9.371067900563033e-06, + "loss": 0.7262, + "step": 5948 + }, + { + "epoch": 0.32742583521382573, + "grad_norm": 0.8132354021072388, + "learning_rate": 9.370857418075567e-06, + "loss": 0.7841, + "step": 5949 + }, + { + "epoch": 0.3274808740161814, + "grad_norm": 0.6811150908470154, + "learning_rate": 9.370646902737907e-06, + "loss": 0.6955, + "step": 5950 + }, + { + "epoch": 0.32753591281853706, + "grad_norm": 0.8956614136695862, + "learning_rate": 9.370436354551633e-06, + "loss": 0.8218, + "step": 5951 + }, + { + "epoch": 0.32759095162089275, + "grad_norm": 0.6807655692100525, + "learning_rate": 9.370225773518332e-06, + "loss": 0.7869, + "step": 5952 + }, + { + "epoch": 0.3276459904232484, + "grad_norm": 0.7506592869758606, + "learning_rate": 9.37001515963958e-06, + "loss": 0.7975, + "step": 5953 + }, + { + "epoch": 0.32770102922560407, + "grad_norm": 0.7488718032836914, + "learning_rate": 9.369804512916966e-06, + "loss": 0.7611, + "step": 5954 + }, + { + "epoch": 0.3277560680279597, + "grad_norm": 0.734569251537323, + "learning_rate": 9.369593833352073e-06, + "loss": 0.8532, + "step": 5955 + }, + { + "epoch": 0.3278111068303154, + "grad_norm": 0.780170738697052, + "learning_rate": 9.36938312094648e-06, + "loss": 0.7766, + "step": 5956 + }, + { + "epoch": 0.327866145632671, + "grad_norm": 0.6329935193061829, + "learning_rate": 9.369172375701774e-06, + "loss": 0.6789, + "step": 5957 + }, + { + "epoch": 0.3279211844350267, + "grad_norm": 1.0177193880081177, + "learning_rate": 9.368961597619537e-06, + "loss": 0.8362, + "step": 5958 + }, + { + "epoch": 0.32797622323738235, + "grad_norm": 0.730696439743042, + "learning_rate": 9.368750786701354e-06, + "loss": 0.7696, + "step": 5959 + }, + { + "epoch": 0.32803126203973804, + "grad_norm": 0.7946468591690063, + "learning_rate": 9.36853994294881e-06, + "loss": 0.8559, + "step": 5960 + }, + { + "epoch": 0.32808630084209367, + "grad_norm": 0.9353142976760864, + "learning_rate": 9.368329066363489e-06, + "loss": 0.9041, + "step": 5961 + }, + { + "epoch": 0.32814133964444936, + "grad_norm": 0.7256187796592712, + "learning_rate": 9.368118156946977e-06, + "loss": 0.787, + "step": 5962 + }, + { + "epoch": 0.328196378446805, + "grad_norm": 0.7454268336296082, + "learning_rate": 9.367907214700858e-06, + "loss": 0.7255, + "step": 5963 + }, + { + "epoch": 0.3282514172491607, + "grad_norm": 0.7087902426719666, + "learning_rate": 9.367696239626716e-06, + "loss": 0.7166, + "step": 5964 + }, + { + "epoch": 0.3283064560515163, + "grad_norm": 0.8217566609382629, + "learning_rate": 9.36748523172614e-06, + "loss": 0.8351, + "step": 5965 + }, + { + "epoch": 0.328361494853872, + "grad_norm": 0.7712824940681458, + "learning_rate": 9.367274191000713e-06, + "loss": 0.7561, + "step": 5966 + }, + { + "epoch": 0.32841653365622764, + "grad_norm": 0.6798166036605835, + "learning_rate": 9.367063117452024e-06, + "loss": 0.7447, + "step": 5967 + }, + { + "epoch": 0.3284715724585833, + "grad_norm": 0.7139115929603577, + "learning_rate": 9.366852011081655e-06, + "loss": 0.7728, + "step": 5968 + }, + { + "epoch": 0.32852661126093896, + "grad_norm": 1.0488213300704956, + "learning_rate": 9.366640871891196e-06, + "loss": 0.8283, + "step": 5969 + }, + { + "epoch": 0.32858165006329465, + "grad_norm": 0.7939574122428894, + "learning_rate": 9.366429699882233e-06, + "loss": 0.849, + "step": 5970 + }, + { + "epoch": 0.3286366888656503, + "grad_norm": 0.7959052324295044, + "learning_rate": 9.366218495056356e-06, + "loss": 0.7469, + "step": 5971 + }, + { + "epoch": 0.3286917276680059, + "grad_norm": 0.7293235063552856, + "learning_rate": 9.366007257415146e-06, + "loss": 0.8537, + "step": 5972 + }, + { + "epoch": 0.3287467664703616, + "grad_norm": 0.7490390539169312, + "learning_rate": 9.365795986960196e-06, + "loss": 0.8166, + "step": 5973 + }, + { + "epoch": 0.32880180527271724, + "grad_norm": 0.6572316884994507, + "learning_rate": 9.365584683693093e-06, + "loss": 0.6919, + "step": 5974 + }, + { + "epoch": 0.32885684407507293, + "grad_norm": 0.7286609411239624, + "learning_rate": 9.365373347615421e-06, + "loss": 0.768, + "step": 5975 + }, + { + "epoch": 0.32891188287742856, + "grad_norm": 0.7798202037811279, + "learning_rate": 9.365161978728772e-06, + "loss": 0.788, + "step": 5976 + }, + { + "epoch": 0.32896692167978425, + "grad_norm": 0.7224245071411133, + "learning_rate": 9.364950577034737e-06, + "loss": 0.7551, + "step": 5977 + }, + { + "epoch": 0.3290219604821399, + "grad_norm": 0.7238701581954956, + "learning_rate": 9.364739142534898e-06, + "loss": 0.6663, + "step": 5978 + }, + { + "epoch": 0.3290769992844956, + "grad_norm": 0.8947147727012634, + "learning_rate": 9.36452767523085e-06, + "loss": 0.8559, + "step": 5979 + }, + { + "epoch": 0.3291320380868512, + "grad_norm": 0.7346563935279846, + "learning_rate": 9.36431617512418e-06, + "loss": 0.7915, + "step": 5980 + }, + { + "epoch": 0.3291870768892069, + "grad_norm": 0.7674046158790588, + "learning_rate": 9.364104642216479e-06, + "loss": 0.7643, + "step": 5981 + }, + { + "epoch": 0.32924211569156253, + "grad_norm": 0.7288179397583008, + "learning_rate": 9.363893076509335e-06, + "loss": 0.7796, + "step": 5982 + }, + { + "epoch": 0.3292971544939182, + "grad_norm": 0.6603766083717346, + "learning_rate": 9.363681478004339e-06, + "loss": 0.7035, + "step": 5983 + }, + { + "epoch": 0.32935219329627385, + "grad_norm": 0.7523066997528076, + "learning_rate": 9.36346984670308e-06, + "loss": 0.8196, + "step": 5984 + }, + { + "epoch": 0.32940723209862954, + "grad_norm": 0.730312168598175, + "learning_rate": 9.36325818260715e-06, + "loss": 0.7967, + "step": 5985 + }, + { + "epoch": 0.3294622709009852, + "grad_norm": 0.7341319918632507, + "learning_rate": 9.363046485718139e-06, + "loss": 0.8361, + "step": 5986 + }, + { + "epoch": 0.32951730970334087, + "grad_norm": 0.839894711971283, + "learning_rate": 9.36283475603764e-06, + "loss": 0.862, + "step": 5987 + }, + { + "epoch": 0.3295723485056965, + "grad_norm": 0.7794893980026245, + "learning_rate": 9.362622993567243e-06, + "loss": 0.8521, + "step": 5988 + }, + { + "epoch": 0.3296273873080522, + "grad_norm": 0.929410457611084, + "learning_rate": 9.362411198308538e-06, + "loss": 0.7644, + "step": 5989 + }, + { + "epoch": 0.3296824261104078, + "grad_norm": 0.7687333226203918, + "learning_rate": 9.362199370263118e-06, + "loss": 0.8047, + "step": 5990 + }, + { + "epoch": 0.3297374649127635, + "grad_norm": 0.8040616512298584, + "learning_rate": 9.361987509432576e-06, + "loss": 0.7574, + "step": 5991 + }, + { + "epoch": 0.32979250371511915, + "grad_norm": 0.7743237614631653, + "learning_rate": 9.361775615818503e-06, + "loss": 0.8491, + "step": 5992 + }, + { + "epoch": 0.32984754251747483, + "grad_norm": 1.2796664237976074, + "learning_rate": 9.361563689422493e-06, + "loss": 0.7975, + "step": 5993 + }, + { + "epoch": 0.32990258131983047, + "grad_norm": 0.9493466019630432, + "learning_rate": 9.361351730246136e-06, + "loss": 1.0258, + "step": 5994 + }, + { + "epoch": 0.32995762012218616, + "grad_norm": 0.7148050665855408, + "learning_rate": 9.36113973829103e-06, + "loss": 0.805, + "step": 5995 + }, + { + "epoch": 0.3300126589245418, + "grad_norm": 0.723426342010498, + "learning_rate": 9.360927713558762e-06, + "loss": 0.6886, + "step": 5996 + }, + { + "epoch": 0.3300676977268975, + "grad_norm": 0.8274679183959961, + "learning_rate": 9.360715656050929e-06, + "loss": 0.8559, + "step": 5997 + }, + { + "epoch": 0.3301227365292531, + "grad_norm": 0.7493795156478882, + "learning_rate": 9.360503565769126e-06, + "loss": 0.8266, + "step": 5998 + }, + { + "epoch": 0.3301777753316088, + "grad_norm": 0.7690125703811646, + "learning_rate": 9.360291442714944e-06, + "loss": 0.783, + "step": 5999 + }, + { + "epoch": 0.33023281413396444, + "grad_norm": 0.8740219473838806, + "learning_rate": 9.360079286889981e-06, + "loss": 0.8409, + "step": 6000 + }, + { + "epoch": 0.3302878529363201, + "grad_norm": 0.6931017637252808, + "learning_rate": 9.359867098295827e-06, + "loss": 0.7985, + "step": 6001 + }, + { + "epoch": 0.33034289173867576, + "grad_norm": 0.915532112121582, + "learning_rate": 9.35965487693408e-06, + "loss": 0.8718, + "step": 6002 + }, + { + "epoch": 0.33039793054103145, + "grad_norm": 0.7898837924003601, + "learning_rate": 9.359442622806332e-06, + "loss": 0.8571, + "step": 6003 + }, + { + "epoch": 0.3304529693433871, + "grad_norm": 0.8661002516746521, + "learning_rate": 9.359230335914182e-06, + "loss": 0.7963, + "step": 6004 + }, + { + "epoch": 0.33050800814574277, + "grad_norm": 0.7188493013381958, + "learning_rate": 9.359018016259223e-06, + "loss": 0.8188, + "step": 6005 + }, + { + "epoch": 0.3305630469480984, + "grad_norm": 0.8648282289505005, + "learning_rate": 9.358805663843051e-06, + "loss": 0.9136, + "step": 6006 + }, + { + "epoch": 0.3306180857504541, + "grad_norm": 0.8010255098342896, + "learning_rate": 9.358593278667265e-06, + "loss": 0.849, + "step": 6007 + }, + { + "epoch": 0.3306731245528097, + "grad_norm": 0.8128451108932495, + "learning_rate": 9.358380860733456e-06, + "loss": 0.8082, + "step": 6008 + }, + { + "epoch": 0.3307281633551654, + "grad_norm": 1.0003761053085327, + "learning_rate": 9.358168410043224e-06, + "loss": 0.9064, + "step": 6009 + }, + { + "epoch": 0.33078320215752105, + "grad_norm": 0.7412391901016235, + "learning_rate": 9.357955926598163e-06, + "loss": 0.8049, + "step": 6010 + }, + { + "epoch": 0.33083824095987674, + "grad_norm": 0.795615553855896, + "learning_rate": 9.357743410399875e-06, + "loss": 0.7923, + "step": 6011 + }, + { + "epoch": 0.3308932797622324, + "grad_norm": 0.8696123957633972, + "learning_rate": 9.357530861449953e-06, + "loss": 0.8543, + "step": 6012 + }, + { + "epoch": 0.33094831856458806, + "grad_norm": 0.8909900784492493, + "learning_rate": 9.357318279749994e-06, + "loss": 0.6157, + "step": 6013 + }, + { + "epoch": 0.3310033573669437, + "grad_norm": 0.7326250672340393, + "learning_rate": 9.357105665301597e-06, + "loss": 0.7647, + "step": 6014 + }, + { + "epoch": 0.33105839616929933, + "grad_norm": 0.8425576090812683, + "learning_rate": 9.356893018106364e-06, + "loss": 0.7832, + "step": 6015 + }, + { + "epoch": 0.331113434971655, + "grad_norm": 0.7404599785804749, + "learning_rate": 9.356680338165885e-06, + "loss": 0.7759, + "step": 6016 + }, + { + "epoch": 0.33116847377401065, + "grad_norm": 0.6935396790504456, + "learning_rate": 9.356467625481765e-06, + "loss": 0.7488, + "step": 6017 + }, + { + "epoch": 0.33122351257636634, + "grad_norm": 0.7799031138420105, + "learning_rate": 9.3562548800556e-06, + "loss": 0.7617, + "step": 6018 + }, + { + "epoch": 0.331278551378722, + "grad_norm": 0.7824636101722717, + "learning_rate": 9.35604210188899e-06, + "loss": 0.7936, + "step": 6019 + }, + { + "epoch": 0.33133359018107766, + "grad_norm": 0.7051861882209778, + "learning_rate": 9.355829290983531e-06, + "loss": 0.7869, + "step": 6020 + }, + { + "epoch": 0.3313886289834333, + "grad_norm": 0.8172006607055664, + "learning_rate": 9.355616447340826e-06, + "loss": 0.8888, + "step": 6021 + }, + { + "epoch": 0.331443667785789, + "grad_norm": 0.7263272404670715, + "learning_rate": 9.355403570962475e-06, + "loss": 0.8393, + "step": 6022 + }, + { + "epoch": 0.3314987065881446, + "grad_norm": 0.7143926620483398, + "learning_rate": 9.355190661850077e-06, + "loss": 0.6693, + "step": 6023 + }, + { + "epoch": 0.3315537453905003, + "grad_norm": 0.7294363975524902, + "learning_rate": 9.354977720005232e-06, + "loss": 0.8035, + "step": 6024 + }, + { + "epoch": 0.33160878419285594, + "grad_norm": 0.7072308659553528, + "learning_rate": 9.354764745429538e-06, + "loss": 0.761, + "step": 6025 + }, + { + "epoch": 0.33166382299521163, + "grad_norm": 0.6945865154266357, + "learning_rate": 9.3545517381246e-06, + "loss": 0.7212, + "step": 6026 + }, + { + "epoch": 0.33171886179756727, + "grad_norm": 0.7645060420036316, + "learning_rate": 9.354338698092016e-06, + "loss": 0.812, + "step": 6027 + }, + { + "epoch": 0.33177390059992296, + "grad_norm": 0.9494503140449524, + "learning_rate": 9.354125625333387e-06, + "loss": 0.9037, + "step": 6028 + }, + { + "epoch": 0.3318289394022786, + "grad_norm": 0.7311872243881226, + "learning_rate": 9.353912519850317e-06, + "loss": 0.7137, + "step": 6029 + }, + { + "epoch": 0.3318839782046343, + "grad_norm": 0.658562958240509, + "learning_rate": 9.353699381644405e-06, + "loss": 0.7048, + "step": 6030 + }, + { + "epoch": 0.3319390170069899, + "grad_norm": 0.8106339573860168, + "learning_rate": 9.353486210717253e-06, + "loss": 0.8905, + "step": 6031 + }, + { + "epoch": 0.3319940558093456, + "grad_norm": 0.8166239261627197, + "learning_rate": 9.353273007070465e-06, + "loss": 0.7011, + "step": 6032 + }, + { + "epoch": 0.33204909461170123, + "grad_norm": 0.730172872543335, + "learning_rate": 9.353059770705643e-06, + "loss": 0.6934, + "step": 6033 + }, + { + "epoch": 0.3321041334140569, + "grad_norm": 0.7633965611457825, + "learning_rate": 9.352846501624387e-06, + "loss": 0.7379, + "step": 6034 + }, + { + "epoch": 0.33215917221641256, + "grad_norm": 0.7786447405815125, + "learning_rate": 9.352633199828304e-06, + "loss": 0.8533, + "step": 6035 + }, + { + "epoch": 0.33221421101876825, + "grad_norm": 0.7211753726005554, + "learning_rate": 9.352419865318993e-06, + "loss": 0.815, + "step": 6036 + }, + { + "epoch": 0.3322692498211239, + "grad_norm": 0.6861024498939514, + "learning_rate": 9.352206498098062e-06, + "loss": 0.7678, + "step": 6037 + }, + { + "epoch": 0.33232428862347957, + "grad_norm": 0.7702088952064514, + "learning_rate": 9.35199309816711e-06, + "loss": 0.8463, + "step": 6038 + }, + { + "epoch": 0.3323793274258352, + "grad_norm": 0.7179547548294067, + "learning_rate": 9.351779665527742e-06, + "loss": 0.8315, + "step": 6039 + }, + { + "epoch": 0.3324343662281909, + "grad_norm": 0.8686990737915039, + "learning_rate": 9.351566200181565e-06, + "loss": 0.8396, + "step": 6040 + }, + { + "epoch": 0.3324894050305465, + "grad_norm": 0.7269062995910645, + "learning_rate": 9.351352702130181e-06, + "loss": 0.7126, + "step": 6041 + }, + { + "epoch": 0.3325444438329022, + "grad_norm": 0.7759222984313965, + "learning_rate": 9.351139171375195e-06, + "loss": 0.8383, + "step": 6042 + }, + { + "epoch": 0.33259948263525785, + "grad_norm": 0.6882128119468689, + "learning_rate": 9.350925607918212e-06, + "loss": 0.6371, + "step": 6043 + }, + { + "epoch": 0.33265452143761354, + "grad_norm": 0.7552365660667419, + "learning_rate": 9.350712011760834e-06, + "loss": 0.8018, + "step": 6044 + }, + { + "epoch": 0.33270956023996917, + "grad_norm": 0.8320692181587219, + "learning_rate": 9.350498382904672e-06, + "loss": 0.8556, + "step": 6045 + }, + { + "epoch": 0.33276459904232486, + "grad_norm": 0.7542223334312439, + "learning_rate": 9.350284721351326e-06, + "loss": 0.8006, + "step": 6046 + }, + { + "epoch": 0.3328196378446805, + "grad_norm": 1.2724859714508057, + "learning_rate": 9.350071027102406e-06, + "loss": 0.9253, + "step": 6047 + }, + { + "epoch": 0.3328746766470362, + "grad_norm": 0.731383204460144, + "learning_rate": 9.349857300159517e-06, + "loss": 0.83, + "step": 6048 + }, + { + "epoch": 0.3329297154493918, + "grad_norm": 0.731419026851654, + "learning_rate": 9.349643540524265e-06, + "loss": 0.779, + "step": 6049 + }, + { + "epoch": 0.3329847542517475, + "grad_norm": 0.8462278842926025, + "learning_rate": 9.349429748198256e-06, + "loss": 0.84, + "step": 6050 + }, + { + "epoch": 0.33303979305410314, + "grad_norm": 0.8199888467788696, + "learning_rate": 9.349215923183098e-06, + "loss": 0.844, + "step": 6051 + }, + { + "epoch": 0.33309483185645883, + "grad_norm": 0.8696722984313965, + "learning_rate": 9.349002065480397e-06, + "loss": 0.709, + "step": 6052 + }, + { + "epoch": 0.33314987065881446, + "grad_norm": 0.8484870195388794, + "learning_rate": 9.34878817509176e-06, + "loss": 0.7434, + "step": 6053 + }, + { + "epoch": 0.33320490946117015, + "grad_norm": 0.8392589688301086, + "learning_rate": 9.348574252018796e-06, + "loss": 0.8972, + "step": 6054 + }, + { + "epoch": 0.3332599482635258, + "grad_norm": 0.673829972743988, + "learning_rate": 9.34836029626311e-06, + "loss": 0.6789, + "step": 6055 + }, + { + "epoch": 0.3333149870658815, + "grad_norm": 0.6693649888038635, + "learning_rate": 9.348146307826315e-06, + "loss": 0.68, + "step": 6056 + }, + { + "epoch": 0.3333700258682371, + "grad_norm": 0.8516272306442261, + "learning_rate": 9.347932286710014e-06, + "loss": 0.8585, + "step": 6057 + }, + { + "epoch": 0.33342506467059274, + "grad_norm": 0.7431588768959045, + "learning_rate": 9.347718232915818e-06, + "loss": 0.8239, + "step": 6058 + }, + { + "epoch": 0.33348010347294843, + "grad_norm": 0.8823427557945251, + "learning_rate": 9.347504146445336e-06, + "loss": 0.845, + "step": 6059 + }, + { + "epoch": 0.33353514227530406, + "grad_norm": 0.7884035110473633, + "learning_rate": 9.347290027300177e-06, + "loss": 0.8503, + "step": 6060 + }, + { + "epoch": 0.33359018107765975, + "grad_norm": 0.841397225856781, + "learning_rate": 9.34707587548195e-06, + "loss": 0.7551, + "step": 6061 + }, + { + "epoch": 0.3336452198800154, + "grad_norm": 0.7592034935951233, + "learning_rate": 9.346861690992263e-06, + "loss": 0.8516, + "step": 6062 + }, + { + "epoch": 0.3337002586823711, + "grad_norm": 0.6925262212753296, + "learning_rate": 9.346647473832728e-06, + "loss": 0.7351, + "step": 6063 + }, + { + "epoch": 0.3337552974847267, + "grad_norm": 0.8152759075164795, + "learning_rate": 9.346433224004955e-06, + "loss": 0.7673, + "step": 6064 + }, + { + "epoch": 0.3338103362870824, + "grad_norm": 0.7383455038070679, + "learning_rate": 9.346218941510551e-06, + "loss": 0.7312, + "step": 6065 + }, + { + "epoch": 0.33386537508943803, + "grad_norm": 0.7905310392379761, + "learning_rate": 9.346004626351131e-06, + "loss": 0.7891, + "step": 6066 + }, + { + "epoch": 0.3339204138917937, + "grad_norm": 0.7032167315483093, + "learning_rate": 9.345790278528305e-06, + "loss": 0.8358, + "step": 6067 + }, + { + "epoch": 0.33397545269414936, + "grad_norm": 0.6415952444076538, + "learning_rate": 9.34557589804368e-06, + "loss": 0.6716, + "step": 6068 + }, + { + "epoch": 0.33403049149650504, + "grad_norm": 0.7558899521827698, + "learning_rate": 9.34536148489887e-06, + "loss": 0.781, + "step": 6069 + }, + { + "epoch": 0.3340855302988607, + "grad_norm": 0.8913301825523376, + "learning_rate": 9.345147039095485e-06, + "loss": 0.8482, + "step": 6070 + }, + { + "epoch": 0.33414056910121637, + "grad_norm": 0.768984854221344, + "learning_rate": 9.34493256063514e-06, + "loss": 0.7578, + "step": 6071 + }, + { + "epoch": 0.334195607903572, + "grad_norm": 0.7428637742996216, + "learning_rate": 9.344718049519445e-06, + "loss": 0.7812, + "step": 6072 + }, + { + "epoch": 0.3342506467059277, + "grad_norm": 0.7290430665016174, + "learning_rate": 9.344503505750012e-06, + "loss": 0.7536, + "step": 6073 + }, + { + "epoch": 0.3343056855082833, + "grad_norm": 0.7637680172920227, + "learning_rate": 9.344288929328453e-06, + "loss": 0.8576, + "step": 6074 + }, + { + "epoch": 0.334360724310639, + "grad_norm": 0.9568214416503906, + "learning_rate": 9.344074320256379e-06, + "loss": 0.897, + "step": 6075 + }, + { + "epoch": 0.33441576311299465, + "grad_norm": 0.7516217827796936, + "learning_rate": 9.34385967853541e-06, + "loss": 0.7853, + "step": 6076 + }, + { + "epoch": 0.33447080191535034, + "grad_norm": 0.833039402961731, + "learning_rate": 9.34364500416715e-06, + "loss": 0.702, + "step": 6077 + }, + { + "epoch": 0.33452584071770597, + "grad_norm": 0.8080580830574036, + "learning_rate": 9.34343029715322e-06, + "loss": 0.7867, + "step": 6078 + }, + { + "epoch": 0.33458087952006166, + "grad_norm": 0.8039596080780029, + "learning_rate": 9.343215557495229e-06, + "loss": 0.8221, + "step": 6079 + }, + { + "epoch": 0.3346359183224173, + "grad_norm": 0.7003986835479736, + "learning_rate": 9.343000785194794e-06, + "loss": 0.746, + "step": 6080 + }, + { + "epoch": 0.334690957124773, + "grad_norm": 0.6623722314834595, + "learning_rate": 9.342785980253526e-06, + "loss": 0.6998, + "step": 6081 + }, + { + "epoch": 0.3347459959271286, + "grad_norm": 0.8425901532173157, + "learning_rate": 9.342571142673042e-06, + "loss": 0.8789, + "step": 6082 + }, + { + "epoch": 0.3348010347294843, + "grad_norm": 0.7263861894607544, + "learning_rate": 9.342356272454954e-06, + "loss": 0.7299, + "step": 6083 + }, + { + "epoch": 0.33485607353183994, + "grad_norm": 0.8420364260673523, + "learning_rate": 9.34214136960088e-06, + "loss": 0.8073, + "step": 6084 + }, + { + "epoch": 0.3349111123341956, + "grad_norm": 0.950019359588623, + "learning_rate": 9.341926434112435e-06, + "loss": 0.9288, + "step": 6085 + }, + { + "epoch": 0.33496615113655126, + "grad_norm": 0.7583657503128052, + "learning_rate": 9.341711465991231e-06, + "loss": 0.8079, + "step": 6086 + }, + { + "epoch": 0.33502118993890695, + "grad_norm": 0.7623111605644226, + "learning_rate": 9.341496465238887e-06, + "loss": 0.879, + "step": 6087 + }, + { + "epoch": 0.3350762287412626, + "grad_norm": 0.8934749960899353, + "learning_rate": 9.341281431857017e-06, + "loss": 0.9348, + "step": 6088 + }, + { + "epoch": 0.33513126754361827, + "grad_norm": 0.7363337874412537, + "learning_rate": 9.341066365847238e-06, + "loss": 0.8284, + "step": 6089 + }, + { + "epoch": 0.3351863063459739, + "grad_norm": 0.6408932209014893, + "learning_rate": 9.340851267211166e-06, + "loss": 0.6019, + "step": 6090 + }, + { + "epoch": 0.3352413451483296, + "grad_norm": 0.8491614460945129, + "learning_rate": 9.34063613595042e-06, + "loss": 0.7287, + "step": 6091 + }, + { + "epoch": 0.33529638395068523, + "grad_norm": 0.6922628879547119, + "learning_rate": 9.340420972066612e-06, + "loss": 0.6649, + "step": 6092 + }, + { + "epoch": 0.3353514227530409, + "grad_norm": 0.7304210662841797, + "learning_rate": 9.340205775561364e-06, + "loss": 0.7373, + "step": 6093 + }, + { + "epoch": 0.33540646155539655, + "grad_norm": 0.8924282193183899, + "learning_rate": 9.339990546436289e-06, + "loss": 0.8337, + "step": 6094 + }, + { + "epoch": 0.33546150035775224, + "grad_norm": 0.7671791315078735, + "learning_rate": 9.339775284693008e-06, + "loss": 0.856, + "step": 6095 + }, + { + "epoch": 0.3355165391601079, + "grad_norm": 0.830427348613739, + "learning_rate": 9.339559990333138e-06, + "loss": 0.7204, + "step": 6096 + }, + { + "epoch": 0.33557157796246356, + "grad_norm": 0.7064357399940491, + "learning_rate": 9.339344663358297e-06, + "loss": 0.8533, + "step": 6097 + }, + { + "epoch": 0.3356266167648192, + "grad_norm": 0.7828566431999207, + "learning_rate": 9.3391293037701e-06, + "loss": 0.7203, + "step": 6098 + }, + { + "epoch": 0.3356816555671749, + "grad_norm": 0.7686871886253357, + "learning_rate": 9.338913911570172e-06, + "loss": 0.7813, + "step": 6099 + }, + { + "epoch": 0.3357366943695305, + "grad_norm": 0.7536553740501404, + "learning_rate": 9.338698486760126e-06, + "loss": 0.7581, + "step": 6100 + }, + { + "epoch": 0.33579173317188615, + "grad_norm": 0.7240094542503357, + "learning_rate": 9.338483029341586e-06, + "loss": 0.7513, + "step": 6101 + }, + { + "epoch": 0.33584677197424184, + "grad_norm": 0.7519696354866028, + "learning_rate": 9.338267539316169e-06, + "loss": 0.8139, + "step": 6102 + }, + { + "epoch": 0.3359018107765975, + "grad_norm": 0.7267377376556396, + "learning_rate": 9.338052016685492e-06, + "loss": 0.7807, + "step": 6103 + }, + { + "epoch": 0.33595684957895317, + "grad_norm": 0.6925491094589233, + "learning_rate": 9.33783646145118e-06, + "loss": 0.8124, + "step": 6104 + }, + { + "epoch": 0.3360118883813088, + "grad_norm": 0.6896460652351379, + "learning_rate": 9.337620873614848e-06, + "loss": 0.7459, + "step": 6105 + }, + { + "epoch": 0.3360669271836645, + "grad_norm": 0.8631082773208618, + "learning_rate": 9.337405253178121e-06, + "loss": 0.7662, + "step": 6106 + }, + { + "epoch": 0.3361219659860201, + "grad_norm": 0.76750248670578, + "learning_rate": 9.337189600142614e-06, + "loss": 0.9016, + "step": 6107 + }, + { + "epoch": 0.3361770047883758, + "grad_norm": 0.9230479001998901, + "learning_rate": 9.336973914509952e-06, + "loss": 0.7631, + "step": 6108 + }, + { + "epoch": 0.33623204359073144, + "grad_norm": 0.746776282787323, + "learning_rate": 9.336758196281756e-06, + "loss": 0.6934, + "step": 6109 + }, + { + "epoch": 0.33628708239308713, + "grad_norm": 0.7631211280822754, + "learning_rate": 9.336542445459646e-06, + "loss": 0.7957, + "step": 6110 + }, + { + "epoch": 0.33634212119544277, + "grad_norm": 0.7460417151451111, + "learning_rate": 9.336326662045243e-06, + "loss": 0.7979, + "step": 6111 + }, + { + "epoch": 0.33639715999779846, + "grad_norm": 0.7072319388389587, + "learning_rate": 9.336110846040171e-06, + "loss": 0.763, + "step": 6112 + }, + { + "epoch": 0.3364521988001541, + "grad_norm": 0.822266697883606, + "learning_rate": 9.33589499744605e-06, + "loss": 0.7719, + "step": 6113 + }, + { + "epoch": 0.3365072376025098, + "grad_norm": 0.778685986995697, + "learning_rate": 9.335679116264502e-06, + "loss": 0.896, + "step": 6114 + }, + { + "epoch": 0.3365622764048654, + "grad_norm": 0.9335552453994751, + "learning_rate": 9.33546320249715e-06, + "loss": 0.7317, + "step": 6115 + }, + { + "epoch": 0.3366173152072211, + "grad_norm": 0.755109965801239, + "learning_rate": 9.33524725614562e-06, + "loss": 0.8184, + "step": 6116 + }, + { + "epoch": 0.33667235400957674, + "grad_norm": 0.7963696122169495, + "learning_rate": 9.33503127721153e-06, + "loss": 0.7835, + "step": 6117 + }, + { + "epoch": 0.3367273928119324, + "grad_norm": 0.8298614621162415, + "learning_rate": 9.334815265696506e-06, + "loss": 0.7946, + "step": 6118 + }, + { + "epoch": 0.33678243161428806, + "grad_norm": 0.728638768196106, + "learning_rate": 9.33459922160217e-06, + "loss": 0.801, + "step": 6119 + }, + { + "epoch": 0.33683747041664375, + "grad_norm": 0.7275198698043823, + "learning_rate": 9.334383144930146e-06, + "loss": 0.7721, + "step": 6120 + }, + { + "epoch": 0.3368925092189994, + "grad_norm": 0.7146986722946167, + "learning_rate": 9.33416703568206e-06, + "loss": 0.7573, + "step": 6121 + }, + { + "epoch": 0.33694754802135507, + "grad_norm": 0.7875215411186218, + "learning_rate": 9.333950893859533e-06, + "loss": 0.8223, + "step": 6122 + }, + { + "epoch": 0.3370025868237107, + "grad_norm": 0.7636967301368713, + "learning_rate": 9.333734719464193e-06, + "loss": 0.7596, + "step": 6123 + }, + { + "epoch": 0.3370576256260664, + "grad_norm": 0.8068925142288208, + "learning_rate": 9.333518512497663e-06, + "loss": 0.834, + "step": 6124 + }, + { + "epoch": 0.337112664428422, + "grad_norm": 0.7153680920600891, + "learning_rate": 9.333302272961566e-06, + "loss": 0.703, + "step": 6125 + }, + { + "epoch": 0.3371677032307777, + "grad_norm": 0.7429617047309875, + "learning_rate": 9.33308600085753e-06, + "loss": 0.7327, + "step": 6126 + }, + { + "epoch": 0.33722274203313335, + "grad_norm": 0.6937283873558044, + "learning_rate": 9.33286969618718e-06, + "loss": 0.6494, + "step": 6127 + }, + { + "epoch": 0.33727778083548904, + "grad_norm": 0.7775923609733582, + "learning_rate": 9.33265335895214e-06, + "loss": 0.8668, + "step": 6128 + }, + { + "epoch": 0.33733281963784467, + "grad_norm": 0.6911064386367798, + "learning_rate": 9.33243698915404e-06, + "loss": 0.6462, + "step": 6129 + }, + { + "epoch": 0.33738785844020036, + "grad_norm": 0.8951280117034912, + "learning_rate": 9.3322205867945e-06, + "loss": 0.825, + "step": 6130 + }, + { + "epoch": 0.337442897242556, + "grad_norm": 0.9521064758300781, + "learning_rate": 9.332004151875151e-06, + "loss": 0.641, + "step": 6131 + }, + { + "epoch": 0.3374979360449117, + "grad_norm": 0.7036865949630737, + "learning_rate": 9.33178768439762e-06, + "loss": 0.804, + "step": 6132 + }, + { + "epoch": 0.3375529748472673, + "grad_norm": 1.0232574939727783, + "learning_rate": 9.331571184363529e-06, + "loss": 0.8577, + "step": 6133 + }, + { + "epoch": 0.337608013649623, + "grad_norm": 0.9680090546607971, + "learning_rate": 9.33135465177451e-06, + "loss": 0.7725, + "step": 6134 + }, + { + "epoch": 0.33766305245197864, + "grad_norm": 0.7664901614189148, + "learning_rate": 9.33113808663219e-06, + "loss": 0.8406, + "step": 6135 + }, + { + "epoch": 0.33771809125433433, + "grad_norm": 0.6703250408172607, + "learning_rate": 9.330921488938193e-06, + "loss": 0.7311, + "step": 6136 + }, + { + "epoch": 0.33777313005668996, + "grad_norm": 0.7364899516105652, + "learning_rate": 9.330704858694151e-06, + "loss": 0.8571, + "step": 6137 + }, + { + "epoch": 0.33782816885904565, + "grad_norm": 0.7167731523513794, + "learning_rate": 9.33048819590169e-06, + "loss": 0.7597, + "step": 6138 + }, + { + "epoch": 0.3378832076614013, + "grad_norm": 0.7761037945747375, + "learning_rate": 9.33027150056244e-06, + "loss": 0.8112, + "step": 6139 + }, + { + "epoch": 0.337938246463757, + "grad_norm": 0.8143900632858276, + "learning_rate": 9.330054772678028e-06, + "loss": 0.8213, + "step": 6140 + }, + { + "epoch": 0.3379932852661126, + "grad_norm": 0.7181026339530945, + "learning_rate": 9.329838012250083e-06, + "loss": 0.8228, + "step": 6141 + }, + { + "epoch": 0.3380483240684683, + "grad_norm": 0.7229815721511841, + "learning_rate": 9.329621219280235e-06, + "loss": 0.8205, + "step": 6142 + }, + { + "epoch": 0.33810336287082393, + "grad_norm": 0.7120887637138367, + "learning_rate": 9.329404393770113e-06, + "loss": 0.8012, + "step": 6143 + }, + { + "epoch": 0.33815840167317956, + "grad_norm": 0.7859634757041931, + "learning_rate": 9.329187535721346e-06, + "loss": 0.7583, + "step": 6144 + }, + { + "epoch": 0.33821344047553525, + "grad_norm": 0.7630401253700256, + "learning_rate": 9.328970645135564e-06, + "loss": 0.9087, + "step": 6145 + }, + { + "epoch": 0.3382684792778909, + "grad_norm": 0.7028466463088989, + "learning_rate": 9.328753722014399e-06, + "loss": 0.7253, + "step": 6146 + }, + { + "epoch": 0.3383235180802466, + "grad_norm": 0.8910240530967712, + "learning_rate": 9.328536766359477e-06, + "loss": 0.9048, + "step": 6147 + }, + { + "epoch": 0.3383785568826022, + "grad_norm": 0.6695914268493652, + "learning_rate": 9.328319778172435e-06, + "loss": 0.6817, + "step": 6148 + }, + { + "epoch": 0.3384335956849579, + "grad_norm": 0.9667700529098511, + "learning_rate": 9.328102757454898e-06, + "loss": 0.7721, + "step": 6149 + }, + { + "epoch": 0.33848863448731353, + "grad_norm": 0.7267603874206543, + "learning_rate": 9.3278857042085e-06, + "loss": 0.7263, + "step": 6150 + }, + { + "epoch": 0.3385436732896692, + "grad_norm": 0.7603437900543213, + "learning_rate": 9.32766861843487e-06, + "loss": 0.7856, + "step": 6151 + }, + { + "epoch": 0.33859871209202486, + "grad_norm": 0.7355918288230896, + "learning_rate": 9.327451500135641e-06, + "loss": 0.7687, + "step": 6152 + }, + { + "epoch": 0.33865375089438055, + "grad_norm": 0.712210476398468, + "learning_rate": 9.327234349312446e-06, + "loss": 0.7689, + "step": 6153 + }, + { + "epoch": 0.3387087896967362, + "grad_norm": 0.9011964797973633, + "learning_rate": 9.327017165966916e-06, + "loss": 0.888, + "step": 6154 + }, + { + "epoch": 0.33876382849909187, + "grad_norm": 0.7334766387939453, + "learning_rate": 9.326799950100683e-06, + "loss": 0.7577, + "step": 6155 + }, + { + "epoch": 0.3388188673014475, + "grad_norm": 0.711370587348938, + "learning_rate": 9.32658270171538e-06, + "loss": 0.7653, + "step": 6156 + }, + { + "epoch": 0.3388739061038032, + "grad_norm": 0.8465714454650879, + "learning_rate": 9.32636542081264e-06, + "loss": 0.7252, + "step": 6157 + }, + { + "epoch": 0.3389289449061588, + "grad_norm": 0.8105099201202393, + "learning_rate": 9.326148107394094e-06, + "loss": 0.7886, + "step": 6158 + }, + { + "epoch": 0.3389839837085145, + "grad_norm": 0.8082063794136047, + "learning_rate": 9.32593076146138e-06, + "loss": 0.8968, + "step": 6159 + }, + { + "epoch": 0.33903902251087015, + "grad_norm": 0.7451661229133606, + "learning_rate": 9.325713383016125e-06, + "loss": 0.762, + "step": 6160 + }, + { + "epoch": 0.33909406131322584, + "grad_norm": 0.8174484372138977, + "learning_rate": 9.325495972059968e-06, + "loss": 0.8285, + "step": 6161 + }, + { + "epoch": 0.33914910011558147, + "grad_norm": 0.7690935134887695, + "learning_rate": 9.32527852859454e-06, + "loss": 0.8908, + "step": 6162 + }, + { + "epoch": 0.33920413891793716, + "grad_norm": 0.7730095386505127, + "learning_rate": 9.325061052621476e-06, + "loss": 0.8571, + "step": 6163 + }, + { + "epoch": 0.3392591777202928, + "grad_norm": 0.7750043869018555, + "learning_rate": 9.324843544142412e-06, + "loss": 0.8314, + "step": 6164 + }, + { + "epoch": 0.3393142165226485, + "grad_norm": 0.8184822797775269, + "learning_rate": 9.32462600315898e-06, + "loss": 0.8783, + "step": 6165 + }, + { + "epoch": 0.3393692553250041, + "grad_norm": 0.8553629517555237, + "learning_rate": 9.32440842967282e-06, + "loss": 0.7116, + "step": 6166 + }, + { + "epoch": 0.3394242941273598, + "grad_norm": 0.8072115778923035, + "learning_rate": 9.324190823685562e-06, + "loss": 0.7498, + "step": 6167 + }, + { + "epoch": 0.33947933292971544, + "grad_norm": 0.7787594795227051, + "learning_rate": 9.323973185198843e-06, + "loss": 0.7567, + "step": 6168 + }, + { + "epoch": 0.3395343717320711, + "grad_norm": 0.7571421265602112, + "learning_rate": 9.323755514214299e-06, + "loss": 0.8349, + "step": 6169 + }, + { + "epoch": 0.33958941053442676, + "grad_norm": 0.6768494248390198, + "learning_rate": 9.323537810733565e-06, + "loss": 0.7382, + "step": 6170 + }, + { + "epoch": 0.33964444933678245, + "grad_norm": 0.7091678380966187, + "learning_rate": 9.32332007475828e-06, + "loss": 0.8107, + "step": 6171 + }, + { + "epoch": 0.3396994881391381, + "grad_norm": 0.6896559596061707, + "learning_rate": 9.323102306290078e-06, + "loss": 0.7973, + "step": 6172 + }, + { + "epoch": 0.3397545269414938, + "grad_norm": 0.7383756637573242, + "learning_rate": 9.322884505330595e-06, + "loss": 0.7998, + "step": 6173 + }, + { + "epoch": 0.3398095657438494, + "grad_norm": 0.7487883567810059, + "learning_rate": 9.32266667188147e-06, + "loss": 0.7928, + "step": 6174 + }, + { + "epoch": 0.3398646045462051, + "grad_norm": 0.7935298681259155, + "learning_rate": 9.32244880594434e-06, + "loss": 0.8457, + "step": 6175 + }, + { + "epoch": 0.33991964334856073, + "grad_norm": 0.6571856737136841, + "learning_rate": 9.322230907520841e-06, + "loss": 0.7177, + "step": 6176 + }, + { + "epoch": 0.3399746821509164, + "grad_norm": 0.7694165706634521, + "learning_rate": 9.322012976612613e-06, + "loss": 0.7124, + "step": 6177 + }, + { + "epoch": 0.34002972095327205, + "grad_norm": 0.8665503263473511, + "learning_rate": 9.32179501322129e-06, + "loss": 0.8054, + "step": 6178 + }, + { + "epoch": 0.34008475975562774, + "grad_norm": 0.6794337034225464, + "learning_rate": 9.321577017348515e-06, + "loss": 0.6468, + "step": 6179 + }, + { + "epoch": 0.3401397985579834, + "grad_norm": 0.7875672578811646, + "learning_rate": 9.32135898899592e-06, + "loss": 0.8384, + "step": 6180 + }, + { + "epoch": 0.34019483736033906, + "grad_norm": 0.8050880432128906, + "learning_rate": 9.321140928165152e-06, + "loss": 0.7261, + "step": 6181 + }, + { + "epoch": 0.3402498761626947, + "grad_norm": 0.7489742040634155, + "learning_rate": 9.320922834857844e-06, + "loss": 0.8252, + "step": 6182 + }, + { + "epoch": 0.3403049149650504, + "grad_norm": 0.7785589098930359, + "learning_rate": 9.320704709075637e-06, + "loss": 0.7123, + "step": 6183 + }, + { + "epoch": 0.340359953767406, + "grad_norm": 0.7698208689689636, + "learning_rate": 9.320486550820169e-06, + "loss": 0.704, + "step": 6184 + }, + { + "epoch": 0.3404149925697617, + "grad_norm": 0.78490149974823, + "learning_rate": 9.320268360093081e-06, + "loss": 0.8446, + "step": 6185 + }, + { + "epoch": 0.34047003137211734, + "grad_norm": 0.6684672236442566, + "learning_rate": 9.320050136896012e-06, + "loss": 0.6728, + "step": 6186 + }, + { + "epoch": 0.340525070174473, + "grad_norm": 0.818122386932373, + "learning_rate": 9.319831881230603e-06, + "loss": 0.7744, + "step": 6187 + }, + { + "epoch": 0.34058010897682867, + "grad_norm": 0.83867347240448, + "learning_rate": 9.319613593098494e-06, + "loss": 0.7423, + "step": 6188 + }, + { + "epoch": 0.3406351477791843, + "grad_norm": 0.7800338268280029, + "learning_rate": 9.319395272501326e-06, + "loss": 0.8189, + "step": 6189 + }, + { + "epoch": 0.34069018658154, + "grad_norm": 0.7530137300491333, + "learning_rate": 9.319176919440737e-06, + "loss": 0.7978, + "step": 6190 + }, + { + "epoch": 0.3407452253838956, + "grad_norm": 0.8916274309158325, + "learning_rate": 9.318958533918374e-06, + "loss": 0.8828, + "step": 6191 + }, + { + "epoch": 0.3408002641862513, + "grad_norm": 0.76950603723526, + "learning_rate": 9.318740115935873e-06, + "loss": 0.7691, + "step": 6192 + }, + { + "epoch": 0.34085530298860695, + "grad_norm": 0.8348222970962524, + "learning_rate": 9.318521665494877e-06, + "loss": 0.8022, + "step": 6193 + }, + { + "epoch": 0.34091034179096263, + "grad_norm": 0.6879388689994812, + "learning_rate": 9.318303182597029e-06, + "loss": 0.747, + "step": 6194 + }, + { + "epoch": 0.34096538059331827, + "grad_norm": 0.8032572269439697, + "learning_rate": 9.31808466724397e-06, + "loss": 0.7621, + "step": 6195 + }, + { + "epoch": 0.34102041939567396, + "grad_norm": 0.6842368841171265, + "learning_rate": 9.317866119437342e-06, + "loss": 0.6867, + "step": 6196 + }, + { + "epoch": 0.3410754581980296, + "grad_norm": 0.7797672152519226, + "learning_rate": 9.317647539178788e-06, + "loss": 0.8329, + "step": 6197 + }, + { + "epoch": 0.3411304970003853, + "grad_norm": 0.6865420341491699, + "learning_rate": 9.317428926469952e-06, + "loss": 0.7544, + "step": 6198 + }, + { + "epoch": 0.3411855358027409, + "grad_norm": 0.818217396736145, + "learning_rate": 9.317210281312475e-06, + "loss": 0.8853, + "step": 6199 + }, + { + "epoch": 0.3412405746050966, + "grad_norm": 0.7531415224075317, + "learning_rate": 9.316991603708001e-06, + "loss": 0.8225, + "step": 6200 + }, + { + "epoch": 0.34129561340745224, + "grad_norm": 0.7347036600112915, + "learning_rate": 9.316772893658173e-06, + "loss": 0.7817, + "step": 6201 + }, + { + "epoch": 0.3413506522098079, + "grad_norm": 0.7162033915519714, + "learning_rate": 9.316554151164636e-06, + "loss": 0.7836, + "step": 6202 + }, + { + "epoch": 0.34140569101216356, + "grad_norm": 0.7421988248825073, + "learning_rate": 9.316335376229035e-06, + "loss": 0.7782, + "step": 6203 + }, + { + "epoch": 0.34146072981451925, + "grad_norm": 0.7672573328018188, + "learning_rate": 9.31611656885301e-06, + "loss": 0.8585, + "step": 6204 + }, + { + "epoch": 0.3415157686168749, + "grad_norm": 0.6898330450057983, + "learning_rate": 9.31589772903821e-06, + "loss": 0.7719, + "step": 6205 + }, + { + "epoch": 0.34157080741923057, + "grad_norm": 0.7700635194778442, + "learning_rate": 9.315678856786279e-06, + "loss": 0.7345, + "step": 6206 + }, + { + "epoch": 0.3416258462215862, + "grad_norm": 0.6982038617134094, + "learning_rate": 9.315459952098858e-06, + "loss": 0.8332, + "step": 6207 + }, + { + "epoch": 0.3416808850239419, + "grad_norm": 0.8882858753204346, + "learning_rate": 9.315241014977598e-06, + "loss": 0.9029, + "step": 6208 + }, + { + "epoch": 0.3417359238262975, + "grad_norm": 0.7313854098320007, + "learning_rate": 9.31502204542414e-06, + "loss": 0.8061, + "step": 6209 + }, + { + "epoch": 0.3417909626286532, + "grad_norm": 0.7324157953262329, + "learning_rate": 9.314803043440131e-06, + "loss": 0.7889, + "step": 6210 + }, + { + "epoch": 0.34184600143100885, + "grad_norm": 0.7498225569725037, + "learning_rate": 9.314584009027218e-06, + "loss": 0.7937, + "step": 6211 + }, + { + "epoch": 0.34190104023336454, + "grad_norm": 0.7093212008476257, + "learning_rate": 9.314364942187048e-06, + "loss": 0.8404, + "step": 6212 + }, + { + "epoch": 0.3419560790357202, + "grad_norm": 0.7008668780326843, + "learning_rate": 9.314145842921264e-06, + "loss": 0.8175, + "step": 6213 + }, + { + "epoch": 0.34201111783807586, + "grad_norm": 0.8049909472465515, + "learning_rate": 9.313926711231516e-06, + "loss": 0.78, + "step": 6214 + }, + { + "epoch": 0.3420661566404315, + "grad_norm": 0.7777613997459412, + "learning_rate": 9.313707547119448e-06, + "loss": 0.9566, + "step": 6215 + }, + { + "epoch": 0.3421211954427872, + "grad_norm": 0.7787579894065857, + "learning_rate": 9.31348835058671e-06, + "loss": 0.7698, + "step": 6216 + }, + { + "epoch": 0.3421762342451428, + "grad_norm": 0.7779031991958618, + "learning_rate": 9.313269121634947e-06, + "loss": 0.8853, + "step": 6217 + }, + { + "epoch": 0.3422312730474985, + "grad_norm": 0.7194382548332214, + "learning_rate": 9.313049860265809e-06, + "loss": 0.8399, + "step": 6218 + }, + { + "epoch": 0.34228631184985414, + "grad_norm": 0.6513093709945679, + "learning_rate": 9.312830566480943e-06, + "loss": 0.7156, + "step": 6219 + }, + { + "epoch": 0.34234135065220983, + "grad_norm": 0.935325026512146, + "learning_rate": 9.312611240281996e-06, + "loss": 0.7525, + "step": 6220 + }, + { + "epoch": 0.34239638945456546, + "grad_norm": 0.7539558410644531, + "learning_rate": 9.312391881670618e-06, + "loss": 0.7716, + "step": 6221 + }, + { + "epoch": 0.34245142825692115, + "grad_norm": 0.7239616513252258, + "learning_rate": 9.312172490648457e-06, + "loss": 0.7272, + "step": 6222 + }, + { + "epoch": 0.3425064670592768, + "grad_norm": 0.7742316126823425, + "learning_rate": 9.311953067217162e-06, + "loss": 0.7657, + "step": 6223 + }, + { + "epoch": 0.3425615058616325, + "grad_norm": 0.782691240310669, + "learning_rate": 9.311733611378379e-06, + "loss": 0.813, + "step": 6224 + }, + { + "epoch": 0.3426165446639881, + "grad_norm": 0.7448118329048157, + "learning_rate": 9.311514123133765e-06, + "loss": 0.8298, + "step": 6225 + }, + { + "epoch": 0.3426715834663438, + "grad_norm": 0.8201695680618286, + "learning_rate": 9.311294602484961e-06, + "loss": 0.7738, + "step": 6226 + }, + { + "epoch": 0.34272662226869943, + "grad_norm": 0.6928383111953735, + "learning_rate": 9.311075049433625e-06, + "loss": 0.6829, + "step": 6227 + }, + { + "epoch": 0.3427816610710551, + "grad_norm": 0.7509302496910095, + "learning_rate": 9.310855463981399e-06, + "loss": 0.6265, + "step": 6228 + }, + { + "epoch": 0.34283669987341076, + "grad_norm": 0.7012569308280945, + "learning_rate": 9.310635846129938e-06, + "loss": 0.7478, + "step": 6229 + }, + { + "epoch": 0.3428917386757664, + "grad_norm": 0.7428532242774963, + "learning_rate": 9.310416195880894e-06, + "loss": 0.7434, + "step": 6230 + }, + { + "epoch": 0.3429467774781221, + "grad_norm": 0.9089111685752869, + "learning_rate": 9.310196513235915e-06, + "loss": 0.6991, + "step": 6231 + }, + { + "epoch": 0.3430018162804777, + "grad_norm": 0.7633285522460938, + "learning_rate": 9.309976798196651e-06, + "loss": 0.7789, + "step": 6232 + }, + { + "epoch": 0.3430568550828334, + "grad_norm": 0.7035595178604126, + "learning_rate": 9.309757050764756e-06, + "loss": 0.6784, + "step": 6233 + }, + { + "epoch": 0.34311189388518903, + "grad_norm": 0.8782615661621094, + "learning_rate": 9.309537270941881e-06, + "loss": 0.8861, + "step": 6234 + }, + { + "epoch": 0.3431669326875447, + "grad_norm": 0.7690381407737732, + "learning_rate": 9.309317458729677e-06, + "loss": 0.7701, + "step": 6235 + }, + { + "epoch": 0.34322197148990036, + "grad_norm": 0.7730939388275146, + "learning_rate": 9.309097614129797e-06, + "loss": 0.8004, + "step": 6236 + }, + { + "epoch": 0.34327701029225605, + "grad_norm": 0.9295101761817932, + "learning_rate": 9.308877737143894e-06, + "loss": 0.6964, + "step": 6237 + }, + { + "epoch": 0.3433320490946117, + "grad_norm": 0.7496231198310852, + "learning_rate": 9.308657827773617e-06, + "loss": 0.8107, + "step": 6238 + }, + { + "epoch": 0.34338708789696737, + "grad_norm": 0.7656146287918091, + "learning_rate": 9.308437886020622e-06, + "loss": 0.8016, + "step": 6239 + }, + { + "epoch": 0.343442126699323, + "grad_norm": 0.8925992846488953, + "learning_rate": 9.308217911886562e-06, + "loss": 0.7136, + "step": 6240 + }, + { + "epoch": 0.3434971655016787, + "grad_norm": 0.7669470906257629, + "learning_rate": 9.307997905373087e-06, + "loss": 0.8284, + "step": 6241 + }, + { + "epoch": 0.3435522043040343, + "grad_norm": 0.6964572072029114, + "learning_rate": 9.307777866481855e-06, + "loss": 0.7926, + "step": 6242 + }, + { + "epoch": 0.34360724310639, + "grad_norm": 0.8405120968818665, + "learning_rate": 9.307557795214517e-06, + "loss": 0.9398, + "step": 6243 + }, + { + "epoch": 0.34366228190874565, + "grad_norm": 0.7517451643943787, + "learning_rate": 9.30733769157273e-06, + "loss": 0.8315, + "step": 6244 + }, + { + "epoch": 0.34371732071110134, + "grad_norm": 0.7740843892097473, + "learning_rate": 9.307117555558144e-06, + "loss": 0.8287, + "step": 6245 + }, + { + "epoch": 0.34377235951345697, + "grad_norm": 0.7214275598526001, + "learning_rate": 9.306897387172413e-06, + "loss": 0.7416, + "step": 6246 + }, + { + "epoch": 0.34382739831581266, + "grad_norm": 0.8217877745628357, + "learning_rate": 9.306677186417197e-06, + "loss": 0.8365, + "step": 6247 + }, + { + "epoch": 0.3438824371181683, + "grad_norm": 0.7397332191467285, + "learning_rate": 9.306456953294148e-06, + "loss": 0.7284, + "step": 6248 + }, + { + "epoch": 0.343937475920524, + "grad_norm": 0.8141350746154785, + "learning_rate": 9.30623668780492e-06, + "loss": 0.8976, + "step": 6249 + }, + { + "epoch": 0.3439925147228796, + "grad_norm": 0.7078670263290405, + "learning_rate": 9.306016389951171e-06, + "loss": 0.8167, + "step": 6250 + }, + { + "epoch": 0.3440475535252353, + "grad_norm": 0.7136256098747253, + "learning_rate": 9.305796059734553e-06, + "loss": 0.7916, + "step": 6251 + }, + { + "epoch": 0.34410259232759094, + "grad_norm": 1.6186310052871704, + "learning_rate": 9.305575697156726e-06, + "loss": 0.8148, + "step": 6252 + }, + { + "epoch": 0.34415763112994663, + "grad_norm": 0.7567281126976013, + "learning_rate": 9.305355302219346e-06, + "loss": 0.8676, + "step": 6253 + }, + { + "epoch": 0.34421266993230226, + "grad_norm": 0.9036027193069458, + "learning_rate": 9.305134874924067e-06, + "loss": 0.8111, + "step": 6254 + }, + { + "epoch": 0.34426770873465795, + "grad_norm": 0.9375718235969543, + "learning_rate": 9.304914415272547e-06, + "loss": 0.6176, + "step": 6255 + }, + { + "epoch": 0.3443227475370136, + "grad_norm": 0.7309718132019043, + "learning_rate": 9.304693923266441e-06, + "loss": 0.7313, + "step": 6256 + }, + { + "epoch": 0.3443777863393693, + "grad_norm": 0.7499229311943054, + "learning_rate": 9.30447339890741e-06, + "loss": 0.6704, + "step": 6257 + }, + { + "epoch": 0.3444328251417249, + "grad_norm": 0.7553356289863586, + "learning_rate": 9.304252842197108e-06, + "loss": 0.8671, + "step": 6258 + }, + { + "epoch": 0.3444878639440806, + "grad_norm": 0.7144323587417603, + "learning_rate": 9.304032253137194e-06, + "loss": 0.7684, + "step": 6259 + }, + { + "epoch": 0.34454290274643623, + "grad_norm": 0.7566905617713928, + "learning_rate": 9.303811631729324e-06, + "loss": 0.8381, + "step": 6260 + }, + { + "epoch": 0.3445979415487919, + "grad_norm": 0.7300242185592651, + "learning_rate": 9.30359097797516e-06, + "loss": 0.7044, + "step": 6261 + }, + { + "epoch": 0.34465298035114755, + "grad_norm": 0.6504725813865662, + "learning_rate": 9.303370291876359e-06, + "loss": 0.6693, + "step": 6262 + }, + { + "epoch": 0.34470801915350324, + "grad_norm": 0.7010672688484192, + "learning_rate": 9.303149573434576e-06, + "loss": 0.6635, + "step": 6263 + }, + { + "epoch": 0.3447630579558589, + "grad_norm": 0.8416483998298645, + "learning_rate": 9.302928822651473e-06, + "loss": 0.8408, + "step": 6264 + }, + { + "epoch": 0.34481809675821457, + "grad_norm": 0.7011786699295044, + "learning_rate": 9.302708039528712e-06, + "loss": 0.7636, + "step": 6265 + }, + { + "epoch": 0.3448731355605702, + "grad_norm": 0.7361586689949036, + "learning_rate": 9.302487224067947e-06, + "loss": 0.824, + "step": 6266 + }, + { + "epoch": 0.3449281743629259, + "grad_norm": 0.7747073173522949, + "learning_rate": 9.302266376270839e-06, + "loss": 0.8012, + "step": 6267 + }, + { + "epoch": 0.3449832131652815, + "grad_norm": 0.9407958388328552, + "learning_rate": 9.302045496139049e-06, + "loss": 0.8664, + "step": 6268 + }, + { + "epoch": 0.3450382519676372, + "grad_norm": 0.8674719929695129, + "learning_rate": 9.301824583674238e-06, + "loss": 0.8842, + "step": 6269 + }, + { + "epoch": 0.34509329076999284, + "grad_norm": 0.7697336673736572, + "learning_rate": 9.301603638878062e-06, + "loss": 0.7148, + "step": 6270 + }, + { + "epoch": 0.34514832957234853, + "grad_norm": 0.7220168709754944, + "learning_rate": 9.301382661752187e-06, + "loss": 0.7199, + "step": 6271 + }, + { + "epoch": 0.34520336837470417, + "grad_norm": 0.6745235919952393, + "learning_rate": 9.301161652298272e-06, + "loss": 0.708, + "step": 6272 + }, + { + "epoch": 0.3452584071770598, + "grad_norm": 0.7062309980392456, + "learning_rate": 9.300940610517974e-06, + "loss": 0.863, + "step": 6273 + }, + { + "epoch": 0.3453134459794155, + "grad_norm": 0.7499971985816956, + "learning_rate": 9.300719536412961e-06, + "loss": 0.7976, + "step": 6274 + }, + { + "epoch": 0.3453684847817711, + "grad_norm": 0.8304464221000671, + "learning_rate": 9.30049842998489e-06, + "loss": 0.8689, + "step": 6275 + }, + { + "epoch": 0.3454235235841268, + "grad_norm": 0.7460494041442871, + "learning_rate": 9.300277291235423e-06, + "loss": 0.7499, + "step": 6276 + }, + { + "epoch": 0.34547856238648245, + "grad_norm": 0.758788526058197, + "learning_rate": 9.300056120166225e-06, + "loss": 0.7501, + "step": 6277 + }, + { + "epoch": 0.34553360118883814, + "grad_norm": 0.7204456925392151, + "learning_rate": 9.299834916778955e-06, + "loss": 0.8234, + "step": 6278 + }, + { + "epoch": 0.34558863999119377, + "grad_norm": 0.7647501826286316, + "learning_rate": 9.299613681075277e-06, + "loss": 0.8653, + "step": 6279 + }, + { + "epoch": 0.34564367879354946, + "grad_norm": 0.7543594837188721, + "learning_rate": 9.299392413056853e-06, + "loss": 0.7915, + "step": 6280 + }, + { + "epoch": 0.3456987175959051, + "grad_norm": 0.7691700458526611, + "learning_rate": 9.299171112725347e-06, + "loss": 0.7429, + "step": 6281 + }, + { + "epoch": 0.3457537563982608, + "grad_norm": 0.7703940272331238, + "learning_rate": 9.29894978008242e-06, + "loss": 0.7424, + "step": 6282 + }, + { + "epoch": 0.3458087952006164, + "grad_norm": 0.8482547402381897, + "learning_rate": 9.29872841512974e-06, + "loss": 0.8971, + "step": 6283 + }, + { + "epoch": 0.3458638340029721, + "grad_norm": 0.755224883556366, + "learning_rate": 9.298507017868966e-06, + "loss": 0.7984, + "step": 6284 + }, + { + "epoch": 0.34591887280532774, + "grad_norm": 1.079891324043274, + "learning_rate": 9.298285588301766e-06, + "loss": 0.8301, + "step": 6285 + }, + { + "epoch": 0.3459739116076834, + "grad_norm": 0.7357321381568909, + "learning_rate": 9.2980641264298e-06, + "loss": 0.9018, + "step": 6286 + }, + { + "epoch": 0.34602895041003906, + "grad_norm": 0.7541963458061218, + "learning_rate": 9.297842632254734e-06, + "loss": 0.8716, + "step": 6287 + }, + { + "epoch": 0.34608398921239475, + "grad_norm": 1.1570138931274414, + "learning_rate": 9.297621105778235e-06, + "loss": 0.9163, + "step": 6288 + }, + { + "epoch": 0.3461390280147504, + "grad_norm": 0.7626895904541016, + "learning_rate": 9.297399547001965e-06, + "loss": 0.8162, + "step": 6289 + }, + { + "epoch": 0.34619406681710607, + "grad_norm": 0.758469820022583, + "learning_rate": 9.297177955927593e-06, + "loss": 0.8966, + "step": 6290 + }, + { + "epoch": 0.3462491056194617, + "grad_norm": 0.8998799324035645, + "learning_rate": 9.296956332556779e-06, + "loss": 0.8127, + "step": 6291 + }, + { + "epoch": 0.3463041444218174, + "grad_norm": 0.7470666170120239, + "learning_rate": 9.29673467689119e-06, + "loss": 0.7738, + "step": 6292 + }, + { + "epoch": 0.34635918322417303, + "grad_norm": 0.8066977858543396, + "learning_rate": 9.296512988932497e-06, + "loss": 0.8958, + "step": 6293 + }, + { + "epoch": 0.3464142220265287, + "grad_norm": 0.8394894003868103, + "learning_rate": 9.29629126868236e-06, + "loss": 0.8023, + "step": 6294 + }, + { + "epoch": 0.34646926082888435, + "grad_norm": 0.9053472876548767, + "learning_rate": 9.29606951614245e-06, + "loss": 0.8244, + "step": 6295 + }, + { + "epoch": 0.34652429963124004, + "grad_norm": 0.6996710896492004, + "learning_rate": 9.295847731314428e-06, + "loss": 0.8203, + "step": 6296 + }, + { + "epoch": 0.3465793384335957, + "grad_norm": 0.7236999273300171, + "learning_rate": 9.295625914199968e-06, + "loss": 0.6982, + "step": 6297 + }, + { + "epoch": 0.34663437723595136, + "grad_norm": 0.7006070017814636, + "learning_rate": 9.295404064800733e-06, + "loss": 0.7881, + "step": 6298 + }, + { + "epoch": 0.346689416038307, + "grad_norm": 0.8188902735710144, + "learning_rate": 9.29518218311839e-06, + "loss": 0.7472, + "step": 6299 + }, + { + "epoch": 0.3467444548406627, + "grad_norm": 0.7708863019943237, + "learning_rate": 9.294960269154608e-06, + "loss": 0.7572, + "step": 6300 + }, + { + "epoch": 0.3467994936430183, + "grad_norm": 0.7819802761077881, + "learning_rate": 9.294738322911052e-06, + "loss": 0.8486, + "step": 6301 + }, + { + "epoch": 0.346854532445374, + "grad_norm": 0.7160501480102539, + "learning_rate": 9.294516344389394e-06, + "loss": 0.8104, + "step": 6302 + }, + { + "epoch": 0.34690957124772964, + "grad_norm": 0.7426022887229919, + "learning_rate": 9.294294333591302e-06, + "loss": 0.7158, + "step": 6303 + }, + { + "epoch": 0.34696461005008533, + "grad_norm": 0.8397019505500793, + "learning_rate": 9.294072290518441e-06, + "loss": 0.8466, + "step": 6304 + }, + { + "epoch": 0.34701964885244097, + "grad_norm": 0.7220905423164368, + "learning_rate": 9.293850215172483e-06, + "loss": 0.7619, + "step": 6305 + }, + { + "epoch": 0.34707468765479665, + "grad_norm": 0.7401862740516663, + "learning_rate": 9.293628107555097e-06, + "loss": 0.7873, + "step": 6306 + }, + { + "epoch": 0.3471297264571523, + "grad_norm": 0.6764525175094604, + "learning_rate": 9.29340596766795e-06, + "loss": 0.7278, + "step": 6307 + }, + { + "epoch": 0.347184765259508, + "grad_norm": 0.8553194403648376, + "learning_rate": 9.293183795512715e-06, + "loss": 0.9074, + "step": 6308 + }, + { + "epoch": 0.3472398040618636, + "grad_norm": 0.6796454191207886, + "learning_rate": 9.292961591091058e-06, + "loss": 0.7179, + "step": 6309 + }, + { + "epoch": 0.3472948428642193, + "grad_norm": 0.6075254082679749, + "learning_rate": 9.292739354404652e-06, + "loss": 0.7228, + "step": 6310 + }, + { + "epoch": 0.34734988166657493, + "grad_norm": 0.7366840243339539, + "learning_rate": 9.292517085455166e-06, + "loss": 0.7934, + "step": 6311 + }, + { + "epoch": 0.3474049204689306, + "grad_norm": 0.6820569038391113, + "learning_rate": 9.29229478424427e-06, + "loss": 0.7315, + "step": 6312 + }, + { + "epoch": 0.34745995927128626, + "grad_norm": 0.8356956243515015, + "learning_rate": 9.292072450773635e-06, + "loss": 0.7787, + "step": 6313 + }, + { + "epoch": 0.34751499807364195, + "grad_norm": 0.70506352186203, + "learning_rate": 9.291850085044933e-06, + "loss": 0.7411, + "step": 6314 + }, + { + "epoch": 0.3475700368759976, + "grad_norm": 0.9074786901473999, + "learning_rate": 9.291627687059835e-06, + "loss": 0.7352, + "step": 6315 + }, + { + "epoch": 0.3476250756783532, + "grad_norm": 0.7858747839927673, + "learning_rate": 9.291405256820013e-06, + "loss": 0.7816, + "step": 6316 + }, + { + "epoch": 0.3476801144807089, + "grad_norm": 0.8576731085777283, + "learning_rate": 9.291182794327134e-06, + "loss": 0.7861, + "step": 6317 + }, + { + "epoch": 0.34773515328306454, + "grad_norm": 0.7500558495521545, + "learning_rate": 9.290960299582877e-06, + "loss": 0.8028, + "step": 6318 + }, + { + "epoch": 0.3477901920854202, + "grad_norm": 0.6577744483947754, + "learning_rate": 9.29073777258891e-06, + "loss": 0.7458, + "step": 6319 + }, + { + "epoch": 0.34784523088777586, + "grad_norm": 0.742855429649353, + "learning_rate": 9.290515213346906e-06, + "loss": 0.755, + "step": 6320 + }, + { + "epoch": 0.34790026969013155, + "grad_norm": 0.7626619338989258, + "learning_rate": 9.290292621858542e-06, + "loss": 0.6671, + "step": 6321 + }, + { + "epoch": 0.3479553084924872, + "grad_norm": 0.7139305472373962, + "learning_rate": 9.290069998125481e-06, + "loss": 0.7981, + "step": 6322 + }, + { + "epoch": 0.34801034729484287, + "grad_norm": 0.9249686002731323, + "learning_rate": 9.289847342149407e-06, + "loss": 0.7243, + "step": 6323 + }, + { + "epoch": 0.3480653860971985, + "grad_norm": 0.8090649843215942, + "learning_rate": 9.289624653931986e-06, + "loss": 0.7892, + "step": 6324 + }, + { + "epoch": 0.3481204248995542, + "grad_norm": 0.6845510005950928, + "learning_rate": 9.289401933474895e-06, + "loss": 0.7427, + "step": 6325 + }, + { + "epoch": 0.3481754637019098, + "grad_norm": 0.7620648741722107, + "learning_rate": 9.289179180779808e-06, + "loss": 0.7715, + "step": 6326 + }, + { + "epoch": 0.3482305025042655, + "grad_norm": 0.7441076040267944, + "learning_rate": 9.288956395848398e-06, + "loss": 0.7814, + "step": 6327 + }, + { + "epoch": 0.34828554130662115, + "grad_norm": 0.6777048707008362, + "learning_rate": 9.28873357868234e-06, + "loss": 0.759, + "step": 6328 + }, + { + "epoch": 0.34834058010897684, + "grad_norm": 0.6534250974655151, + "learning_rate": 9.288510729283307e-06, + "loss": 0.6777, + "step": 6329 + }, + { + "epoch": 0.34839561891133247, + "grad_norm": 0.8205152153968811, + "learning_rate": 9.288287847652977e-06, + "loss": 0.8027, + "step": 6330 + }, + { + "epoch": 0.34845065771368816, + "grad_norm": 0.7152554392814636, + "learning_rate": 9.288064933793024e-06, + "loss": 0.7956, + "step": 6331 + }, + { + "epoch": 0.3485056965160438, + "grad_norm": 0.9816664457321167, + "learning_rate": 9.287841987705121e-06, + "loss": 0.828, + "step": 6332 + }, + { + "epoch": 0.3485607353183995, + "grad_norm": 0.826554000377655, + "learning_rate": 9.287619009390945e-06, + "loss": 0.8544, + "step": 6333 + }, + { + "epoch": 0.3486157741207551, + "grad_norm": 0.7255695462226868, + "learning_rate": 9.287395998852175e-06, + "loss": 0.7749, + "step": 6334 + }, + { + "epoch": 0.3486708129231108, + "grad_norm": 0.7161709070205688, + "learning_rate": 9.287172956090482e-06, + "loss": 0.7114, + "step": 6335 + }, + { + "epoch": 0.34872585172546644, + "grad_norm": 0.7219997644424438, + "learning_rate": 9.286949881107546e-06, + "loss": 0.8309, + "step": 6336 + }, + { + "epoch": 0.34878089052782213, + "grad_norm": 0.7269770503044128, + "learning_rate": 9.286726773905042e-06, + "loss": 0.8039, + "step": 6337 + }, + { + "epoch": 0.34883592933017776, + "grad_norm": 0.8142165541648865, + "learning_rate": 9.286503634484645e-06, + "loss": 0.7673, + "step": 6338 + }, + { + "epoch": 0.34889096813253345, + "grad_norm": 0.7568639516830444, + "learning_rate": 9.286280462848037e-06, + "loss": 0.8471, + "step": 6339 + }, + { + "epoch": 0.3489460069348891, + "grad_norm": 0.7927737236022949, + "learning_rate": 9.28605725899689e-06, + "loss": 0.8828, + "step": 6340 + }, + { + "epoch": 0.3490010457372448, + "grad_norm": 0.9755893349647522, + "learning_rate": 9.285834022932885e-06, + "loss": 0.837, + "step": 6341 + }, + { + "epoch": 0.3490560845396004, + "grad_norm": 0.6831560730934143, + "learning_rate": 9.2856107546577e-06, + "loss": 0.7169, + "step": 6342 + }, + { + "epoch": 0.3491111233419561, + "grad_norm": 0.728239119052887, + "learning_rate": 9.285387454173009e-06, + "loss": 0.7805, + "step": 6343 + }, + { + "epoch": 0.34916616214431173, + "grad_norm": 0.6979145407676697, + "learning_rate": 9.285164121480495e-06, + "loss": 0.7794, + "step": 6344 + }, + { + "epoch": 0.3492212009466674, + "grad_norm": 0.7206674218177795, + "learning_rate": 9.284940756581834e-06, + "loss": 0.7198, + "step": 6345 + }, + { + "epoch": 0.34927623974902305, + "grad_norm": 0.8156035542488098, + "learning_rate": 9.284717359478705e-06, + "loss": 0.884, + "step": 6346 + }, + { + "epoch": 0.34933127855137874, + "grad_norm": 0.6876983046531677, + "learning_rate": 9.284493930172788e-06, + "loss": 0.7426, + "step": 6347 + }, + { + "epoch": 0.3493863173537344, + "grad_norm": 0.6856677532196045, + "learning_rate": 9.284270468665762e-06, + "loss": 0.7085, + "step": 6348 + }, + { + "epoch": 0.34944135615609007, + "grad_norm": 0.8378047943115234, + "learning_rate": 9.284046974959304e-06, + "loss": 0.725, + "step": 6349 + }, + { + "epoch": 0.3494963949584457, + "grad_norm": 0.7410693764686584, + "learning_rate": 9.283823449055097e-06, + "loss": 0.7953, + "step": 6350 + }, + { + "epoch": 0.3495514337608014, + "grad_norm": 0.7558375000953674, + "learning_rate": 9.28359989095482e-06, + "loss": 0.8052, + "step": 6351 + }, + { + "epoch": 0.349606472563157, + "grad_norm": 0.7176862955093384, + "learning_rate": 9.283376300660151e-06, + "loss": 0.7077, + "step": 6352 + }, + { + "epoch": 0.3496615113655127, + "grad_norm": 0.7443307042121887, + "learning_rate": 9.283152678172774e-06, + "loss": 0.7557, + "step": 6353 + }, + { + "epoch": 0.34971655016786835, + "grad_norm": 0.6653748750686646, + "learning_rate": 9.282929023494368e-06, + "loss": 0.7558, + "step": 6354 + }, + { + "epoch": 0.34977158897022403, + "grad_norm": 0.8139400482177734, + "learning_rate": 9.282705336626615e-06, + "loss": 0.847, + "step": 6355 + }, + { + "epoch": 0.34982662777257967, + "grad_norm": 1.012450933456421, + "learning_rate": 9.282481617571193e-06, + "loss": 0.744, + "step": 6356 + }, + { + "epoch": 0.34988166657493536, + "grad_norm": 0.7877402305603027, + "learning_rate": 9.282257866329784e-06, + "loss": 0.7475, + "step": 6357 + }, + { + "epoch": 0.349936705377291, + "grad_norm": 0.7989935874938965, + "learning_rate": 9.282034082904075e-06, + "loss": 0.7379, + "step": 6358 + }, + { + "epoch": 0.3499917441796466, + "grad_norm": 0.6665796637535095, + "learning_rate": 9.281810267295741e-06, + "loss": 0.7253, + "step": 6359 + }, + { + "epoch": 0.3500467829820023, + "grad_norm": 0.8344665765762329, + "learning_rate": 9.28158641950647e-06, + "loss": 0.8095, + "step": 6360 + }, + { + "epoch": 0.35010182178435795, + "grad_norm": 0.8312307596206665, + "learning_rate": 9.281362539537939e-06, + "loss": 0.8452, + "step": 6361 + }, + { + "epoch": 0.35015686058671364, + "grad_norm": 0.7423825263977051, + "learning_rate": 9.281138627391834e-06, + "loss": 0.8291, + "step": 6362 + }, + { + "epoch": 0.35021189938906927, + "grad_norm": 0.7594212293624878, + "learning_rate": 9.280914683069837e-06, + "loss": 0.8314, + "step": 6363 + }, + { + "epoch": 0.35026693819142496, + "grad_norm": 0.8059762716293335, + "learning_rate": 9.280690706573633e-06, + "loss": 0.7695, + "step": 6364 + }, + { + "epoch": 0.3503219769937806, + "grad_norm": 0.8053386807441711, + "learning_rate": 9.280466697904902e-06, + "loss": 0.8941, + "step": 6365 + }, + { + "epoch": 0.3503770157961363, + "grad_norm": 0.6703817248344421, + "learning_rate": 9.280242657065329e-06, + "loss": 0.5978, + "step": 6366 + }, + { + "epoch": 0.3504320545984919, + "grad_norm": 0.9359784722328186, + "learning_rate": 9.280018584056598e-06, + "loss": 0.8479, + "step": 6367 + }, + { + "epoch": 0.3504870934008476, + "grad_norm": 0.7692418098449707, + "learning_rate": 9.279794478880393e-06, + "loss": 0.7254, + "step": 6368 + }, + { + "epoch": 0.35054213220320324, + "grad_norm": 0.7992031574249268, + "learning_rate": 9.279570341538397e-06, + "loss": 0.6749, + "step": 6369 + }, + { + "epoch": 0.3505971710055589, + "grad_norm": 0.7735288739204407, + "learning_rate": 9.279346172032297e-06, + "loss": 0.8545, + "step": 6370 + }, + { + "epoch": 0.35065220980791456, + "grad_norm": 0.7124339938163757, + "learning_rate": 9.279121970363778e-06, + "loss": 0.8066, + "step": 6371 + }, + { + "epoch": 0.35070724861027025, + "grad_norm": 0.8116535544395447, + "learning_rate": 9.278897736534521e-06, + "loss": 0.8197, + "step": 6372 + }, + { + "epoch": 0.3507622874126259, + "grad_norm": 0.9377869963645935, + "learning_rate": 9.278673470546217e-06, + "loss": 0.74, + "step": 6373 + }, + { + "epoch": 0.3508173262149816, + "grad_norm": 0.6726253628730774, + "learning_rate": 9.278449172400548e-06, + "loss": 0.6389, + "step": 6374 + }, + { + "epoch": 0.3508723650173372, + "grad_norm": 0.8470593094825745, + "learning_rate": 9.278224842099198e-06, + "loss": 0.8059, + "step": 6375 + }, + { + "epoch": 0.3509274038196929, + "grad_norm": 0.7041867971420288, + "learning_rate": 9.278000479643857e-06, + "loss": 0.7409, + "step": 6376 + }, + { + "epoch": 0.35098244262204853, + "grad_norm": 0.7467322945594788, + "learning_rate": 9.27777608503621e-06, + "loss": 0.823, + "step": 6377 + }, + { + "epoch": 0.3510374814244042, + "grad_norm": 0.7211065888404846, + "learning_rate": 9.277551658277942e-06, + "loss": 0.7655, + "step": 6378 + }, + { + "epoch": 0.35109252022675985, + "grad_norm": 0.7709450125694275, + "learning_rate": 9.27732719937074e-06, + "loss": 0.8938, + "step": 6379 + }, + { + "epoch": 0.35114755902911554, + "grad_norm": 0.7672929167747498, + "learning_rate": 9.277102708316293e-06, + "loss": 0.6814, + "step": 6380 + }, + { + "epoch": 0.3512025978314712, + "grad_norm": 0.7334907650947571, + "learning_rate": 9.276878185116287e-06, + "loss": 0.6608, + "step": 6381 + }, + { + "epoch": 0.35125763663382686, + "grad_norm": 0.7011460065841675, + "learning_rate": 9.27665362977241e-06, + "loss": 0.8196, + "step": 6382 + }, + { + "epoch": 0.3513126754361825, + "grad_norm": 0.7388820052146912, + "learning_rate": 9.276429042286349e-06, + "loss": 0.8793, + "step": 6383 + }, + { + "epoch": 0.3513677142385382, + "grad_norm": 0.809725821018219, + "learning_rate": 9.27620442265979e-06, + "loss": 0.6976, + "step": 6384 + }, + { + "epoch": 0.3514227530408938, + "grad_norm": 0.6933012008666992, + "learning_rate": 9.275979770894424e-06, + "loss": 0.759, + "step": 6385 + }, + { + "epoch": 0.3514777918432495, + "grad_norm": 0.7928480505943298, + "learning_rate": 9.27575508699194e-06, + "loss": 0.7462, + "step": 6386 + }, + { + "epoch": 0.35153283064560514, + "grad_norm": 0.8461304903030396, + "learning_rate": 9.275530370954024e-06, + "loss": 0.8184, + "step": 6387 + }, + { + "epoch": 0.35158786944796083, + "grad_norm": 0.7624425292015076, + "learning_rate": 9.275305622782366e-06, + "loss": 0.7913, + "step": 6388 + }, + { + "epoch": 0.35164290825031647, + "grad_norm": 0.7103675007820129, + "learning_rate": 9.275080842478657e-06, + "loss": 0.7633, + "step": 6389 + }, + { + "epoch": 0.35169794705267216, + "grad_norm": 0.9002664089202881, + "learning_rate": 9.274856030044583e-06, + "loss": 0.7643, + "step": 6390 + }, + { + "epoch": 0.3517529858550278, + "grad_norm": 0.7658692002296448, + "learning_rate": 9.274631185481836e-06, + "loss": 0.8028, + "step": 6391 + }, + { + "epoch": 0.3518080246573835, + "grad_norm": 0.6747875809669495, + "learning_rate": 9.274406308792106e-06, + "loss": 0.695, + "step": 6392 + }, + { + "epoch": 0.3518630634597391, + "grad_norm": 0.8197165131568909, + "learning_rate": 9.27418139997708e-06, + "loss": 0.7218, + "step": 6393 + }, + { + "epoch": 0.3519181022620948, + "grad_norm": 0.7597750425338745, + "learning_rate": 9.273956459038453e-06, + "loss": 0.7738, + "step": 6394 + }, + { + "epoch": 0.35197314106445043, + "grad_norm": 0.7365928888320923, + "learning_rate": 9.273731485977912e-06, + "loss": 0.7906, + "step": 6395 + }, + { + "epoch": 0.3520281798668061, + "grad_norm": 0.7313928604125977, + "learning_rate": 9.273506480797151e-06, + "loss": 0.834, + "step": 6396 + }, + { + "epoch": 0.35208321866916176, + "grad_norm": 0.758886456489563, + "learning_rate": 9.273281443497858e-06, + "loss": 0.8883, + "step": 6397 + }, + { + "epoch": 0.35213825747151745, + "grad_norm": 0.7318256497383118, + "learning_rate": 9.273056374081726e-06, + "loss": 0.7463, + "step": 6398 + }, + { + "epoch": 0.3521932962738731, + "grad_norm": 0.778448224067688, + "learning_rate": 9.272831272550446e-06, + "loss": 0.6838, + "step": 6399 + }, + { + "epoch": 0.3522483350762287, + "grad_norm": 0.7392274141311646, + "learning_rate": 9.272606138905709e-06, + "loss": 0.7237, + "step": 6400 + }, + { + "epoch": 0.3523033738785844, + "grad_norm": 0.8803032040596008, + "learning_rate": 9.272380973149209e-06, + "loss": 0.7839, + "step": 6401 + }, + { + "epoch": 0.35235841268094004, + "grad_norm": 0.7506754994392395, + "learning_rate": 9.272155775282636e-06, + "loss": 0.7665, + "step": 6402 + }, + { + "epoch": 0.3524134514832957, + "grad_norm": 0.8136595487594604, + "learning_rate": 9.271930545307686e-06, + "loss": 0.9111, + "step": 6403 + }, + { + "epoch": 0.35246849028565136, + "grad_norm": 0.7976880073547363, + "learning_rate": 9.271705283226047e-06, + "loss": 0.735, + "step": 6404 + }, + { + "epoch": 0.35252352908800705, + "grad_norm": 0.89708411693573, + "learning_rate": 9.271479989039415e-06, + "loss": 0.7698, + "step": 6405 + }, + { + "epoch": 0.3525785678903627, + "grad_norm": 0.8618703484535217, + "learning_rate": 9.271254662749484e-06, + "loss": 0.9001, + "step": 6406 + }, + { + "epoch": 0.35263360669271837, + "grad_norm": 0.7143027186393738, + "learning_rate": 9.271029304357946e-06, + "loss": 0.8188, + "step": 6407 + }, + { + "epoch": 0.352688645495074, + "grad_norm": 0.795365571975708, + "learning_rate": 9.270803913866496e-06, + "loss": 0.7389, + "step": 6408 + }, + { + "epoch": 0.3527436842974297, + "grad_norm": 0.6947643756866455, + "learning_rate": 9.270578491276825e-06, + "loss": 0.7278, + "step": 6409 + }, + { + "epoch": 0.3527987230997853, + "grad_norm": 0.7806137204170227, + "learning_rate": 9.27035303659063e-06, + "loss": 0.808, + "step": 6410 + }, + { + "epoch": 0.352853761902141, + "grad_norm": 0.8908704519271851, + "learning_rate": 9.270127549809606e-06, + "loss": 0.8659, + "step": 6411 + }, + { + "epoch": 0.35290880070449665, + "grad_norm": 0.8171417713165283, + "learning_rate": 9.269902030935445e-06, + "loss": 0.7918, + "step": 6412 + }, + { + "epoch": 0.35296383950685234, + "grad_norm": 0.7556712627410889, + "learning_rate": 9.269676479969842e-06, + "loss": 0.7121, + "step": 6413 + }, + { + "epoch": 0.353018878309208, + "grad_norm": 0.8080483675003052, + "learning_rate": 9.269450896914495e-06, + "loss": 0.8185, + "step": 6414 + }, + { + "epoch": 0.35307391711156366, + "grad_norm": 0.8514583706855774, + "learning_rate": 9.2692252817711e-06, + "loss": 0.8055, + "step": 6415 + }, + { + "epoch": 0.3531289559139193, + "grad_norm": 0.7914162278175354, + "learning_rate": 9.268999634541347e-06, + "loss": 0.759, + "step": 6416 + }, + { + "epoch": 0.353183994716275, + "grad_norm": 0.6452118754386902, + "learning_rate": 9.268773955226937e-06, + "loss": 0.6797, + "step": 6417 + }, + { + "epoch": 0.3532390335186306, + "grad_norm": 0.6876220107078552, + "learning_rate": 9.268548243829565e-06, + "loss": 0.7365, + "step": 6418 + }, + { + "epoch": 0.3532940723209863, + "grad_norm": 0.758550226688385, + "learning_rate": 9.268322500350926e-06, + "loss": 0.7069, + "step": 6419 + }, + { + "epoch": 0.35334911112334194, + "grad_norm": 0.7905879020690918, + "learning_rate": 9.268096724792718e-06, + "loss": 0.8024, + "step": 6420 + }, + { + "epoch": 0.35340414992569763, + "grad_norm": 0.755253255367279, + "learning_rate": 9.267870917156638e-06, + "loss": 0.8018, + "step": 6421 + }, + { + "epoch": 0.35345918872805326, + "grad_norm": 0.6879923343658447, + "learning_rate": 9.267645077444382e-06, + "loss": 0.7267, + "step": 6422 + }, + { + "epoch": 0.35351422753040895, + "grad_norm": 0.766214907169342, + "learning_rate": 9.267419205657649e-06, + "loss": 0.7801, + "step": 6423 + }, + { + "epoch": 0.3535692663327646, + "grad_norm": 0.868776798248291, + "learning_rate": 9.267193301798135e-06, + "loss": 0.9234, + "step": 6424 + }, + { + "epoch": 0.3536243051351203, + "grad_norm": 1.2007492780685425, + "learning_rate": 9.266967365867536e-06, + "loss": 0.7743, + "step": 6425 + }, + { + "epoch": 0.3536793439374759, + "grad_norm": 0.7445551156997681, + "learning_rate": 9.266741397867556e-06, + "loss": 0.6755, + "step": 6426 + }, + { + "epoch": 0.3537343827398316, + "grad_norm": 0.7493785619735718, + "learning_rate": 9.266515397799889e-06, + "loss": 0.7891, + "step": 6427 + }, + { + "epoch": 0.35378942154218723, + "grad_norm": 0.6718230843544006, + "learning_rate": 9.266289365666234e-06, + "loss": 0.6908, + "step": 6428 + }, + { + "epoch": 0.3538444603445429, + "grad_norm": 0.7783547639846802, + "learning_rate": 9.266063301468289e-06, + "loss": 0.7115, + "step": 6429 + }, + { + "epoch": 0.35389949914689856, + "grad_norm": 0.745627224445343, + "learning_rate": 9.265837205207755e-06, + "loss": 0.8421, + "step": 6430 + }, + { + "epoch": 0.35395453794925424, + "grad_norm": 0.7314152717590332, + "learning_rate": 9.26561107688633e-06, + "loss": 0.807, + "step": 6431 + }, + { + "epoch": 0.3540095767516099, + "grad_norm": 0.6975863575935364, + "learning_rate": 9.265384916505714e-06, + "loss": 0.7787, + "step": 6432 + }, + { + "epoch": 0.35406461555396557, + "grad_norm": 0.9758319854736328, + "learning_rate": 9.265158724067608e-06, + "loss": 0.8668, + "step": 6433 + }, + { + "epoch": 0.3541196543563212, + "grad_norm": 0.7686764001846313, + "learning_rate": 9.264932499573711e-06, + "loss": 0.7428, + "step": 6434 + }, + { + "epoch": 0.3541746931586769, + "grad_norm": 0.8761935830116272, + "learning_rate": 9.26470624302572e-06, + "loss": 0.8022, + "step": 6435 + }, + { + "epoch": 0.3542297319610325, + "grad_norm": 0.9145118594169617, + "learning_rate": 9.264479954425341e-06, + "loss": 0.7994, + "step": 6436 + }, + { + "epoch": 0.3542847707633882, + "grad_norm": 0.8217951655387878, + "learning_rate": 9.264253633774271e-06, + "loss": 0.7235, + "step": 6437 + }, + { + "epoch": 0.35433980956574385, + "grad_norm": 0.7624716758728027, + "learning_rate": 9.264027281074214e-06, + "loss": 0.8238, + "step": 6438 + }, + { + "epoch": 0.35439484836809954, + "grad_norm": 0.7772085070610046, + "learning_rate": 9.26380089632687e-06, + "loss": 0.7941, + "step": 6439 + }, + { + "epoch": 0.35444988717045517, + "grad_norm": 1.0462371110916138, + "learning_rate": 9.263574479533937e-06, + "loss": 0.8255, + "step": 6440 + }, + { + "epoch": 0.35450492597281086, + "grad_norm": 0.8523101210594177, + "learning_rate": 9.263348030697119e-06, + "loss": 0.8489, + "step": 6441 + }, + { + "epoch": 0.3545599647751665, + "grad_norm": 1.0292255878448486, + "learning_rate": 9.26312154981812e-06, + "loss": 0.7989, + "step": 6442 + }, + { + "epoch": 0.3546150035775221, + "grad_norm": 0.7621143460273743, + "learning_rate": 9.262895036898641e-06, + "loss": 0.8154, + "step": 6443 + }, + { + "epoch": 0.3546700423798778, + "grad_norm": 0.7158074378967285, + "learning_rate": 9.262668491940382e-06, + "loss": 0.7821, + "step": 6444 + }, + { + "epoch": 0.35472508118223345, + "grad_norm": 0.7969478964805603, + "learning_rate": 9.26244191494505e-06, + "loss": 0.8535, + "step": 6445 + }, + { + "epoch": 0.35478011998458914, + "grad_norm": 0.9244762063026428, + "learning_rate": 9.262215305914345e-06, + "loss": 0.7585, + "step": 6446 + }, + { + "epoch": 0.35483515878694477, + "grad_norm": 0.6862454414367676, + "learning_rate": 9.26198866484997e-06, + "loss": 0.7294, + "step": 6447 + }, + { + "epoch": 0.35489019758930046, + "grad_norm": 0.6816834211349487, + "learning_rate": 9.261761991753629e-06, + "loss": 0.7763, + "step": 6448 + }, + { + "epoch": 0.3549452363916561, + "grad_norm": 0.792539119720459, + "learning_rate": 9.261535286627025e-06, + "loss": 0.7829, + "step": 6449 + }, + { + "epoch": 0.3550002751940118, + "grad_norm": 0.8563211560249329, + "learning_rate": 9.261308549471866e-06, + "loss": 0.8945, + "step": 6450 + }, + { + "epoch": 0.3550553139963674, + "grad_norm": 0.7241078019142151, + "learning_rate": 9.26108178028985e-06, + "loss": 0.6936, + "step": 6451 + }, + { + "epoch": 0.3551103527987231, + "grad_norm": 0.7150034308433533, + "learning_rate": 9.260854979082682e-06, + "loss": 0.7689, + "step": 6452 + }, + { + "epoch": 0.35516539160107874, + "grad_norm": 0.8630193471908569, + "learning_rate": 9.260628145852073e-06, + "loss": 0.8506, + "step": 6453 + }, + { + "epoch": 0.35522043040343443, + "grad_norm": 0.7133893370628357, + "learning_rate": 9.26040128059972e-06, + "loss": 0.7976, + "step": 6454 + }, + { + "epoch": 0.35527546920579006, + "grad_norm": 0.6984630823135376, + "learning_rate": 9.260174383327332e-06, + "loss": 0.7442, + "step": 6455 + }, + { + "epoch": 0.35533050800814575, + "grad_norm": 0.7166933417320251, + "learning_rate": 9.259947454036613e-06, + "loss": 0.813, + "step": 6456 + }, + { + "epoch": 0.3553855468105014, + "grad_norm": 0.7353581190109253, + "learning_rate": 9.259720492729272e-06, + "loss": 0.8157, + "step": 6457 + }, + { + "epoch": 0.3554405856128571, + "grad_norm": 0.6810038089752197, + "learning_rate": 9.259493499407011e-06, + "loss": 0.7423, + "step": 6458 + }, + { + "epoch": 0.3554956244152127, + "grad_norm": 1.1599586009979248, + "learning_rate": 9.259266474071535e-06, + "loss": 0.7159, + "step": 6459 + }, + { + "epoch": 0.3555506632175684, + "grad_norm": 0.7857629060745239, + "learning_rate": 9.259039416724554e-06, + "loss": 0.7846, + "step": 6460 + }, + { + "epoch": 0.35560570201992403, + "grad_norm": 0.705333948135376, + "learning_rate": 9.258812327367773e-06, + "loss": 0.751, + "step": 6461 + }, + { + "epoch": 0.3556607408222797, + "grad_norm": 0.6899998188018799, + "learning_rate": 9.258585206002897e-06, + "loss": 0.7303, + "step": 6462 + }, + { + "epoch": 0.35571577962463535, + "grad_norm": 0.8007912039756775, + "learning_rate": 9.258358052631637e-06, + "loss": 0.7363, + "step": 6463 + }, + { + "epoch": 0.35577081842699104, + "grad_norm": 0.9403146505355835, + "learning_rate": 9.258130867255695e-06, + "loss": 0.9096, + "step": 6464 + }, + { + "epoch": 0.3558258572293467, + "grad_norm": 0.7069174647331238, + "learning_rate": 9.257903649876782e-06, + "loss": 0.7362, + "step": 6465 + }, + { + "epoch": 0.35588089603170237, + "grad_norm": 0.770807683467865, + "learning_rate": 9.257676400496607e-06, + "loss": 0.7904, + "step": 6466 + }, + { + "epoch": 0.355935934834058, + "grad_norm": 0.8586871027946472, + "learning_rate": 9.257449119116874e-06, + "loss": 0.7596, + "step": 6467 + }, + { + "epoch": 0.3559909736364137, + "grad_norm": 0.6934101581573486, + "learning_rate": 9.257221805739294e-06, + "loss": 0.6655, + "step": 6468 + }, + { + "epoch": 0.3560460124387693, + "grad_norm": 0.9494497179985046, + "learning_rate": 9.256994460365573e-06, + "loss": 0.7923, + "step": 6469 + }, + { + "epoch": 0.356101051241125, + "grad_norm": 0.7131130695343018, + "learning_rate": 9.256767082997422e-06, + "loss": 0.819, + "step": 6470 + }, + { + "epoch": 0.35615609004348064, + "grad_norm": 0.8641398549079895, + "learning_rate": 9.25653967363655e-06, + "loss": 0.8275, + "step": 6471 + }, + { + "epoch": 0.35621112884583633, + "grad_norm": 0.7350367307662964, + "learning_rate": 9.256312232284665e-06, + "loss": 0.7991, + "step": 6472 + }, + { + "epoch": 0.35626616764819197, + "grad_norm": 0.8174671530723572, + "learning_rate": 9.256084758943476e-06, + "loss": 0.7147, + "step": 6473 + }, + { + "epoch": 0.35632120645054766, + "grad_norm": 0.7560263872146606, + "learning_rate": 9.255857253614693e-06, + "loss": 0.7435, + "step": 6474 + }, + { + "epoch": 0.3563762452529033, + "grad_norm": 0.7465197443962097, + "learning_rate": 9.255629716300025e-06, + "loss": 0.8228, + "step": 6475 + }, + { + "epoch": 0.356431284055259, + "grad_norm": 0.7130733728408813, + "learning_rate": 9.255402147001184e-06, + "loss": 0.8361, + "step": 6476 + }, + { + "epoch": 0.3564863228576146, + "grad_norm": 0.7200759053230286, + "learning_rate": 9.255174545719882e-06, + "loss": 0.7387, + "step": 6477 + }, + { + "epoch": 0.3565413616599703, + "grad_norm": 0.8387622237205505, + "learning_rate": 9.254946912457826e-06, + "loss": 0.8427, + "step": 6478 + }, + { + "epoch": 0.35659640046232594, + "grad_norm": 0.7263510823249817, + "learning_rate": 9.254719247216725e-06, + "loss": 0.712, + "step": 6479 + }, + { + "epoch": 0.3566514392646816, + "grad_norm": 0.7393862009048462, + "learning_rate": 9.254491549998296e-06, + "loss": 0.6916, + "step": 6480 + }, + { + "epoch": 0.35670647806703726, + "grad_norm": 0.7289569973945618, + "learning_rate": 9.254263820804246e-06, + "loss": 0.7561, + "step": 6481 + }, + { + "epoch": 0.35676151686939295, + "grad_norm": 0.7597448825836182, + "learning_rate": 9.254036059636288e-06, + "loss": 0.853, + "step": 6482 + }, + { + "epoch": 0.3568165556717486, + "grad_norm": 0.7652063369750977, + "learning_rate": 9.253808266496136e-06, + "loss": 0.7652, + "step": 6483 + }, + { + "epoch": 0.35687159447410427, + "grad_norm": 1.193938136100769, + "learning_rate": 9.253580441385497e-06, + "loss": 0.8288, + "step": 6484 + }, + { + "epoch": 0.3569266332764599, + "grad_norm": 0.9258719086647034, + "learning_rate": 9.253352584306087e-06, + "loss": 0.807, + "step": 6485 + }, + { + "epoch": 0.35698167207881554, + "grad_norm": 0.78384929895401, + "learning_rate": 9.253124695259617e-06, + "loss": 0.7785, + "step": 6486 + }, + { + "epoch": 0.3570367108811712, + "grad_norm": 0.801403284072876, + "learning_rate": 9.252896774247802e-06, + "loss": 0.8382, + "step": 6487 + }, + { + "epoch": 0.35709174968352686, + "grad_norm": 0.9472376108169556, + "learning_rate": 9.25266882127235e-06, + "loss": 0.8661, + "step": 6488 + }, + { + "epoch": 0.35714678848588255, + "grad_norm": 0.7575686573982239, + "learning_rate": 9.252440836334981e-06, + "loss": 0.8428, + "step": 6489 + }, + { + "epoch": 0.3572018272882382, + "grad_norm": 0.736282467842102, + "learning_rate": 9.252212819437402e-06, + "loss": 0.801, + "step": 6490 + }, + { + "epoch": 0.35725686609059387, + "grad_norm": 0.7420864701271057, + "learning_rate": 9.251984770581332e-06, + "loss": 0.8849, + "step": 6491 + }, + { + "epoch": 0.3573119048929495, + "grad_norm": 0.7129189372062683, + "learning_rate": 9.251756689768482e-06, + "loss": 0.7716, + "step": 6492 + }, + { + "epoch": 0.3573669436953052, + "grad_norm": 0.7777297496795654, + "learning_rate": 9.251528577000566e-06, + "loss": 0.8183, + "step": 6493 + }, + { + "epoch": 0.35742198249766083, + "grad_norm": 0.7644590139389038, + "learning_rate": 9.2513004322793e-06, + "loss": 0.6319, + "step": 6494 + }, + { + "epoch": 0.3574770213000165, + "grad_norm": 0.7112484574317932, + "learning_rate": 9.251072255606399e-06, + "loss": 0.8012, + "step": 6495 + }, + { + "epoch": 0.35753206010237215, + "grad_norm": 0.7772265076637268, + "learning_rate": 9.250844046983576e-06, + "loss": 0.8372, + "step": 6496 + }, + { + "epoch": 0.35758709890472784, + "grad_norm": 0.9530157446861267, + "learning_rate": 9.250615806412546e-06, + "loss": 0.8683, + "step": 6497 + }, + { + "epoch": 0.3576421377070835, + "grad_norm": 0.7249575257301331, + "learning_rate": 9.250387533895026e-06, + "loss": 0.7091, + "step": 6498 + }, + { + "epoch": 0.35769717650943916, + "grad_norm": 0.8549422025680542, + "learning_rate": 9.25015922943273e-06, + "loss": 0.8376, + "step": 6499 + }, + { + "epoch": 0.3577522153117948, + "grad_norm": 0.74477618932724, + "learning_rate": 9.249930893027376e-06, + "loss": 0.7594, + "step": 6500 + }, + { + "epoch": 0.3578072541141505, + "grad_norm": 0.8269739151000977, + "learning_rate": 9.24970252468068e-06, + "loss": 0.6473, + "step": 6501 + }, + { + "epoch": 0.3578622929165061, + "grad_norm": 0.8375437259674072, + "learning_rate": 9.249474124394358e-06, + "loss": 0.7631, + "step": 6502 + }, + { + "epoch": 0.3579173317188618, + "grad_norm": 0.8680340051651001, + "learning_rate": 9.249245692170123e-06, + "loss": 0.7863, + "step": 6503 + }, + { + "epoch": 0.35797237052121744, + "grad_norm": 0.7179692983627319, + "learning_rate": 9.249017228009696e-06, + "loss": 0.8022, + "step": 6504 + }, + { + "epoch": 0.35802740932357313, + "grad_norm": 0.7797464728355408, + "learning_rate": 9.248788731914794e-06, + "loss": 0.8067, + "step": 6505 + }, + { + "epoch": 0.35808244812592877, + "grad_norm": 0.8032993674278259, + "learning_rate": 9.248560203887133e-06, + "loss": 0.7383, + "step": 6506 + }, + { + "epoch": 0.35813748692828445, + "grad_norm": 0.7714722156524658, + "learning_rate": 9.24833164392843e-06, + "loss": 0.7149, + "step": 6507 + }, + { + "epoch": 0.3581925257306401, + "grad_norm": 0.7492430210113525, + "learning_rate": 9.248103052040404e-06, + "loss": 0.7645, + "step": 6508 + }, + { + "epoch": 0.3582475645329958, + "grad_norm": 0.6843901872634888, + "learning_rate": 9.247874428224773e-06, + "loss": 0.7183, + "step": 6509 + }, + { + "epoch": 0.3583026033353514, + "grad_norm": 0.8370186686515808, + "learning_rate": 9.247645772483254e-06, + "loss": 0.7832, + "step": 6510 + }, + { + "epoch": 0.3583576421377071, + "grad_norm": 0.7907791137695312, + "learning_rate": 9.247417084817567e-06, + "loss": 0.8742, + "step": 6511 + }, + { + "epoch": 0.35841268094006273, + "grad_norm": 0.7950869798660278, + "learning_rate": 9.247188365229428e-06, + "loss": 0.8705, + "step": 6512 + }, + { + "epoch": 0.3584677197424184, + "grad_norm": 0.7276936173439026, + "learning_rate": 9.24695961372056e-06, + "loss": 0.7629, + "step": 6513 + }, + { + "epoch": 0.35852275854477406, + "grad_norm": 0.7761141657829285, + "learning_rate": 9.24673083029268e-06, + "loss": 0.8813, + "step": 6514 + }, + { + "epoch": 0.35857779734712975, + "grad_norm": 0.7528283596038818, + "learning_rate": 9.24650201494751e-06, + "loss": 0.7885, + "step": 6515 + }, + { + "epoch": 0.3586328361494854, + "grad_norm": 0.8972534537315369, + "learning_rate": 9.246273167686765e-06, + "loss": 0.9081, + "step": 6516 + }, + { + "epoch": 0.35868787495184107, + "grad_norm": 0.7658557891845703, + "learning_rate": 9.246044288512168e-06, + "loss": 0.8451, + "step": 6517 + }, + { + "epoch": 0.3587429137541967, + "grad_norm": 0.8013193607330322, + "learning_rate": 9.245815377425438e-06, + "loss": 0.7236, + "step": 6518 + }, + { + "epoch": 0.3587979525565524, + "grad_norm": 0.8134163022041321, + "learning_rate": 9.245586434428298e-06, + "loss": 0.908, + "step": 6519 + }, + { + "epoch": 0.358852991358908, + "grad_norm": 0.6479801535606384, + "learning_rate": 9.245357459522466e-06, + "loss": 0.7397, + "step": 6520 + }, + { + "epoch": 0.3589080301612637, + "grad_norm": 0.70014488697052, + "learning_rate": 9.245128452709665e-06, + "loss": 0.6898, + "step": 6521 + }, + { + "epoch": 0.35896306896361935, + "grad_norm": 0.7645437717437744, + "learning_rate": 9.244899413991613e-06, + "loss": 0.8319, + "step": 6522 + }, + { + "epoch": 0.35901810776597504, + "grad_norm": 0.6812799572944641, + "learning_rate": 9.244670343370033e-06, + "loss": 0.7359, + "step": 6523 + }, + { + "epoch": 0.35907314656833067, + "grad_norm": 0.6573774218559265, + "learning_rate": 9.244441240846647e-06, + "loss": 0.742, + "step": 6524 + }, + { + "epoch": 0.35912818537068636, + "grad_norm": 0.7870661020278931, + "learning_rate": 9.244212106423178e-06, + "loss": 0.7307, + "step": 6525 + }, + { + "epoch": 0.359183224173042, + "grad_norm": 0.9163166284561157, + "learning_rate": 9.243982940101347e-06, + "loss": 0.8584, + "step": 6526 + }, + { + "epoch": 0.3592382629753977, + "grad_norm": 0.766888439655304, + "learning_rate": 9.243753741882874e-06, + "loss": 0.8093, + "step": 6527 + }, + { + "epoch": 0.3592933017777533, + "grad_norm": 0.7831236124038696, + "learning_rate": 9.243524511769486e-06, + "loss": 0.8665, + "step": 6528 + }, + { + "epoch": 0.35934834058010895, + "grad_norm": 0.7485133409500122, + "learning_rate": 9.243295249762904e-06, + "loss": 0.7336, + "step": 6529 + }, + { + "epoch": 0.35940337938246464, + "grad_norm": 0.7231502532958984, + "learning_rate": 9.24306595586485e-06, + "loss": 0.8095, + "step": 6530 + }, + { + "epoch": 0.35945841818482027, + "grad_norm": 0.821898877620697, + "learning_rate": 9.242836630077048e-06, + "loss": 0.831, + "step": 6531 + }, + { + "epoch": 0.35951345698717596, + "grad_norm": 0.6792737245559692, + "learning_rate": 9.242607272401223e-06, + "loss": 0.7183, + "step": 6532 + }, + { + "epoch": 0.3595684957895316, + "grad_norm": 0.7200430631637573, + "learning_rate": 9.242377882839095e-06, + "loss": 0.7256, + "step": 6533 + }, + { + "epoch": 0.3596235345918873, + "grad_norm": 0.6713700890541077, + "learning_rate": 9.242148461392393e-06, + "loss": 0.7416, + "step": 6534 + }, + { + "epoch": 0.3596785733942429, + "grad_norm": 0.7054564356803894, + "learning_rate": 9.241919008062836e-06, + "loss": 0.6856, + "step": 6535 + }, + { + "epoch": 0.3597336121965986, + "grad_norm": 0.7516196966171265, + "learning_rate": 9.241689522852152e-06, + "loss": 0.7149, + "step": 6536 + }, + { + "epoch": 0.35978865099895424, + "grad_norm": 0.8547651767730713, + "learning_rate": 9.241460005762067e-06, + "loss": 0.7075, + "step": 6537 + }, + { + "epoch": 0.35984368980130993, + "grad_norm": 0.6791819334030151, + "learning_rate": 9.241230456794302e-06, + "loss": 0.6449, + "step": 6538 + }, + { + "epoch": 0.35989872860366556, + "grad_norm": 0.8365122079849243, + "learning_rate": 9.241000875950583e-06, + "loss": 0.7619, + "step": 6539 + }, + { + "epoch": 0.35995376740602125, + "grad_norm": 0.763829231262207, + "learning_rate": 9.24077126323264e-06, + "loss": 0.71, + "step": 6540 + }, + { + "epoch": 0.3600088062083769, + "grad_norm": 0.7698483467102051, + "learning_rate": 9.240541618642193e-06, + "loss": 0.7949, + "step": 6541 + }, + { + "epoch": 0.3600638450107326, + "grad_norm": 0.7331508994102478, + "learning_rate": 9.24031194218097e-06, + "loss": 0.8292, + "step": 6542 + }, + { + "epoch": 0.3601188838130882, + "grad_norm": 0.7507451772689819, + "learning_rate": 9.2400822338507e-06, + "loss": 0.8651, + "step": 6543 + }, + { + "epoch": 0.3601739226154439, + "grad_norm": 0.8537001609802246, + "learning_rate": 9.239852493653104e-06, + "loss": 0.848, + "step": 6544 + }, + { + "epoch": 0.36022896141779953, + "grad_norm": 0.683311939239502, + "learning_rate": 9.239622721589913e-06, + "loss": 0.803, + "step": 6545 + }, + { + "epoch": 0.3602840002201552, + "grad_norm": 0.6916974186897278, + "learning_rate": 9.239392917662852e-06, + "loss": 0.8037, + "step": 6546 + }, + { + "epoch": 0.36033903902251085, + "grad_norm": 0.798795223236084, + "learning_rate": 9.23916308187365e-06, + "loss": 0.8037, + "step": 6547 + }, + { + "epoch": 0.36039407782486654, + "grad_norm": 0.7284069657325745, + "learning_rate": 9.238933214224032e-06, + "loss": 0.7365, + "step": 6548 + }, + { + "epoch": 0.3604491166272222, + "grad_norm": 0.7789250016212463, + "learning_rate": 9.238703314715727e-06, + "loss": 0.788, + "step": 6549 + }, + { + "epoch": 0.36050415542957787, + "grad_norm": 0.7029675841331482, + "learning_rate": 9.238473383350462e-06, + "loss": 0.7796, + "step": 6550 + }, + { + "epoch": 0.3605591942319335, + "grad_norm": 0.9094457626342773, + "learning_rate": 9.238243420129965e-06, + "loss": 0.7884, + "step": 6551 + }, + { + "epoch": 0.3606142330342892, + "grad_norm": 0.8253848552703857, + "learning_rate": 9.238013425055965e-06, + "loss": 0.7671, + "step": 6552 + }, + { + "epoch": 0.3606692718366448, + "grad_norm": 0.7052987813949585, + "learning_rate": 9.237783398130193e-06, + "loss": 0.7511, + "step": 6553 + }, + { + "epoch": 0.3607243106390005, + "grad_norm": 0.7506607174873352, + "learning_rate": 9.237553339354373e-06, + "loss": 0.6804, + "step": 6554 + }, + { + "epoch": 0.36077934944135615, + "grad_norm": 0.725106418132782, + "learning_rate": 9.237323248730237e-06, + "loss": 0.7658, + "step": 6555 + }, + { + "epoch": 0.36083438824371183, + "grad_norm": 0.8164945244789124, + "learning_rate": 9.237093126259515e-06, + "loss": 0.7857, + "step": 6556 + }, + { + "epoch": 0.36088942704606747, + "grad_norm": 0.6937377452850342, + "learning_rate": 9.236862971943934e-06, + "loss": 0.6985, + "step": 6557 + }, + { + "epoch": 0.36094446584842316, + "grad_norm": 0.7511105537414551, + "learning_rate": 9.236632785785225e-06, + "loss": 0.7891, + "step": 6558 + }, + { + "epoch": 0.3609995046507788, + "grad_norm": 0.7217637896537781, + "learning_rate": 9.236402567785118e-06, + "loss": 0.7942, + "step": 6559 + }, + { + "epoch": 0.3610545434531345, + "grad_norm": 1.1438478231430054, + "learning_rate": 9.236172317945343e-06, + "loss": 0.8311, + "step": 6560 + }, + { + "epoch": 0.3611095822554901, + "grad_norm": 0.7414245009422302, + "learning_rate": 9.23594203626763e-06, + "loss": 0.7726, + "step": 6561 + }, + { + "epoch": 0.3611646210578458, + "grad_norm": 0.7762154340744019, + "learning_rate": 9.235711722753712e-06, + "loss": 0.7891, + "step": 6562 + }, + { + "epoch": 0.36121965986020144, + "grad_norm": 0.7368801832199097, + "learning_rate": 9.23548137740532e-06, + "loss": 0.7656, + "step": 6563 + }, + { + "epoch": 0.3612746986625571, + "grad_norm": 0.7571502923965454, + "learning_rate": 9.235251000224181e-06, + "loss": 0.7845, + "step": 6564 + }, + { + "epoch": 0.36132973746491276, + "grad_norm": 0.8078309297561646, + "learning_rate": 9.235020591212031e-06, + "loss": 0.7969, + "step": 6565 + }, + { + "epoch": 0.36138477626726845, + "grad_norm": 0.6897913813591003, + "learning_rate": 9.234790150370599e-06, + "loss": 0.6922, + "step": 6566 + }, + { + "epoch": 0.3614398150696241, + "grad_norm": 0.8053449988365173, + "learning_rate": 9.234559677701618e-06, + "loss": 0.9126, + "step": 6567 + }, + { + "epoch": 0.36149485387197977, + "grad_norm": 0.8400903940200806, + "learning_rate": 9.23432917320682e-06, + "loss": 0.8144, + "step": 6568 + }, + { + "epoch": 0.3615498926743354, + "grad_norm": 0.7753110527992249, + "learning_rate": 9.234098636887935e-06, + "loss": 0.7025, + "step": 6569 + }, + { + "epoch": 0.3616049314766911, + "grad_norm": 0.7901243567466736, + "learning_rate": 9.233868068746702e-06, + "loss": 0.783, + "step": 6570 + }, + { + "epoch": 0.3616599702790467, + "grad_norm": 1.2297497987747192, + "learning_rate": 9.233637468784849e-06, + "loss": 0.8541, + "step": 6571 + }, + { + "epoch": 0.36171500908140236, + "grad_norm": 0.7590478658676147, + "learning_rate": 9.233406837004108e-06, + "loss": 0.7856, + "step": 6572 + }, + { + "epoch": 0.36177004788375805, + "grad_norm": 0.6651493310928345, + "learning_rate": 9.233176173406216e-06, + "loss": 0.6822, + "step": 6573 + }, + { + "epoch": 0.3618250866861137, + "grad_norm": 0.7760787010192871, + "learning_rate": 9.232945477992905e-06, + "loss": 0.8017, + "step": 6574 + }, + { + "epoch": 0.3618801254884694, + "grad_norm": 0.8788009285926819, + "learning_rate": 9.232714750765908e-06, + "loss": 0.7812, + "step": 6575 + }, + { + "epoch": 0.361935164290825, + "grad_norm": 0.7014517188072205, + "learning_rate": 9.232483991726961e-06, + "loss": 0.7293, + "step": 6576 + }, + { + "epoch": 0.3619902030931807, + "grad_norm": 0.7586061954498291, + "learning_rate": 9.232253200877797e-06, + "loss": 0.7953, + "step": 6577 + }, + { + "epoch": 0.36204524189553633, + "grad_norm": 0.8202564120292664, + "learning_rate": 9.232022378220151e-06, + "loss": 0.8545, + "step": 6578 + }, + { + "epoch": 0.362100280697892, + "grad_norm": 0.7816846966743469, + "learning_rate": 9.231791523755758e-06, + "loss": 0.8573, + "step": 6579 + }, + { + "epoch": 0.36215531950024765, + "grad_norm": 0.883222222328186, + "learning_rate": 9.23156063748635e-06, + "loss": 0.7733, + "step": 6580 + }, + { + "epoch": 0.36221035830260334, + "grad_norm": 0.8472830057144165, + "learning_rate": 9.231329719413668e-06, + "loss": 0.8931, + "step": 6581 + }, + { + "epoch": 0.362265397104959, + "grad_norm": 0.7916087508201599, + "learning_rate": 9.231098769539443e-06, + "loss": 0.8806, + "step": 6582 + }, + { + "epoch": 0.36232043590731466, + "grad_norm": 0.815339982509613, + "learning_rate": 9.230867787865414e-06, + "loss": 0.9081, + "step": 6583 + }, + { + "epoch": 0.3623754747096703, + "grad_norm": 1.2352560758590698, + "learning_rate": 9.230636774393312e-06, + "loss": 0.726, + "step": 6584 + }, + { + "epoch": 0.362430513512026, + "grad_norm": 0.759308397769928, + "learning_rate": 9.230405729124878e-06, + "loss": 0.7648, + "step": 6585 + }, + { + "epoch": 0.3624855523143816, + "grad_norm": 0.8285754323005676, + "learning_rate": 9.230174652061847e-06, + "loss": 0.7972, + "step": 6586 + }, + { + "epoch": 0.3625405911167373, + "grad_norm": 0.7393043041229248, + "learning_rate": 9.229943543205956e-06, + "loss": 0.7859, + "step": 6587 + }, + { + "epoch": 0.36259562991909294, + "grad_norm": 0.7354594469070435, + "learning_rate": 9.229712402558942e-06, + "loss": 0.6683, + "step": 6588 + }, + { + "epoch": 0.36265066872144863, + "grad_norm": 0.8244406580924988, + "learning_rate": 9.229481230122543e-06, + "loss": 0.6977, + "step": 6589 + }, + { + "epoch": 0.36270570752380427, + "grad_norm": 0.810565173625946, + "learning_rate": 9.229250025898493e-06, + "loss": 0.7278, + "step": 6590 + }, + { + "epoch": 0.36276074632615996, + "grad_norm": 0.7443352937698364, + "learning_rate": 9.229018789888532e-06, + "loss": 0.7821, + "step": 6591 + }, + { + "epoch": 0.3628157851285156, + "grad_norm": 0.9211748838424683, + "learning_rate": 9.228787522094398e-06, + "loss": 0.9174, + "step": 6592 + }, + { + "epoch": 0.3628708239308713, + "grad_norm": 0.7099255919456482, + "learning_rate": 9.22855622251783e-06, + "loss": 0.74, + "step": 6593 + }, + { + "epoch": 0.3629258627332269, + "grad_norm": 0.7373029589653015, + "learning_rate": 9.228324891160564e-06, + "loss": 0.7909, + "step": 6594 + }, + { + "epoch": 0.3629809015355826, + "grad_norm": 0.8774755001068115, + "learning_rate": 9.22809352802434e-06, + "loss": 0.8354, + "step": 6595 + }, + { + "epoch": 0.36303594033793823, + "grad_norm": 0.7547696232795715, + "learning_rate": 9.227862133110899e-06, + "loss": 0.6942, + "step": 6596 + }, + { + "epoch": 0.3630909791402939, + "grad_norm": 0.7868191003799438, + "learning_rate": 9.227630706421975e-06, + "loss": 0.7575, + "step": 6597 + }, + { + "epoch": 0.36314601794264956, + "grad_norm": 0.6753721237182617, + "learning_rate": 9.227399247959312e-06, + "loss": 0.7092, + "step": 6598 + }, + { + "epoch": 0.36320105674500525, + "grad_norm": 0.7317304611206055, + "learning_rate": 9.227167757724646e-06, + "loss": 0.8372, + "step": 6599 + }, + { + "epoch": 0.3632560955473609, + "grad_norm": 0.8928040266036987, + "learning_rate": 9.226936235719721e-06, + "loss": 0.8536, + "step": 6600 + }, + { + "epoch": 0.36331113434971657, + "grad_norm": 0.7178280353546143, + "learning_rate": 9.226704681946275e-06, + "loss": 0.7648, + "step": 6601 + }, + { + "epoch": 0.3633661731520722, + "grad_norm": 0.7439851760864258, + "learning_rate": 9.226473096406046e-06, + "loss": 0.8284, + "step": 6602 + }, + { + "epoch": 0.3634212119544279, + "grad_norm": 0.7000887989997864, + "learning_rate": 9.226241479100777e-06, + "loss": 0.7797, + "step": 6603 + }, + { + "epoch": 0.3634762507567835, + "grad_norm": 0.7882626056671143, + "learning_rate": 9.226009830032209e-06, + "loss": 0.72, + "step": 6604 + }, + { + "epoch": 0.3635312895591392, + "grad_norm": 0.6445927619934082, + "learning_rate": 9.225778149202081e-06, + "loss": 0.6785, + "step": 6605 + }, + { + "epoch": 0.36358632836149485, + "grad_norm": 0.7348469495773315, + "learning_rate": 9.225546436612137e-06, + "loss": 0.8117, + "step": 6606 + }, + { + "epoch": 0.36364136716385054, + "grad_norm": 0.7455001473426819, + "learning_rate": 9.225314692264118e-06, + "loss": 0.8196, + "step": 6607 + }, + { + "epoch": 0.36369640596620617, + "grad_norm": 0.7149390578269958, + "learning_rate": 9.225082916159762e-06, + "loss": 0.8841, + "step": 6608 + }, + { + "epoch": 0.36375144476856186, + "grad_norm": 0.7095748782157898, + "learning_rate": 9.224851108300816e-06, + "loss": 0.7336, + "step": 6609 + }, + { + "epoch": 0.3638064835709175, + "grad_norm": 0.7112231850624084, + "learning_rate": 9.224619268689019e-06, + "loss": 0.8606, + "step": 6610 + }, + { + "epoch": 0.3638615223732732, + "grad_norm": 0.8052846789360046, + "learning_rate": 9.224387397326115e-06, + "loss": 0.7838, + "step": 6611 + }, + { + "epoch": 0.3639165611756288, + "grad_norm": 0.7538836002349854, + "learning_rate": 9.224155494213846e-06, + "loss": 0.8252, + "step": 6612 + }, + { + "epoch": 0.3639715999779845, + "grad_norm": 0.6968722343444824, + "learning_rate": 9.223923559353956e-06, + "loss": 0.759, + "step": 6613 + }, + { + "epoch": 0.36402663878034014, + "grad_norm": 0.7797368168830872, + "learning_rate": 9.223691592748185e-06, + "loss": 0.8452, + "step": 6614 + }, + { + "epoch": 0.3640816775826958, + "grad_norm": 0.7738572955131531, + "learning_rate": 9.223459594398278e-06, + "loss": 0.806, + "step": 6615 + }, + { + "epoch": 0.36413671638505146, + "grad_norm": 0.7998547554016113, + "learning_rate": 9.223227564305983e-06, + "loss": 0.748, + "step": 6616 + }, + { + "epoch": 0.3641917551874071, + "grad_norm": 0.838666558265686, + "learning_rate": 9.222995502473037e-06, + "loss": 0.8252, + "step": 6617 + }, + { + "epoch": 0.3642467939897628, + "grad_norm": 1.1672697067260742, + "learning_rate": 9.222763408901189e-06, + "loss": 0.806, + "step": 6618 + }, + { + "epoch": 0.3643018327921184, + "grad_norm": 0.6721193194389343, + "learning_rate": 9.22253128359218e-06, + "loss": 0.6897, + "step": 6619 + }, + { + "epoch": 0.3643568715944741, + "grad_norm": 0.8152795433998108, + "learning_rate": 9.222299126547758e-06, + "loss": 0.8377, + "step": 6620 + }, + { + "epoch": 0.36441191039682974, + "grad_norm": 0.7959492206573486, + "learning_rate": 9.222066937769664e-06, + "loss": 0.8496, + "step": 6621 + }, + { + "epoch": 0.36446694919918543, + "grad_norm": 0.7759784460067749, + "learning_rate": 9.221834717259646e-06, + "loss": 0.7736, + "step": 6622 + }, + { + "epoch": 0.36452198800154106, + "grad_norm": 0.6929076313972473, + "learning_rate": 9.221602465019449e-06, + "loss": 0.7759, + "step": 6623 + }, + { + "epoch": 0.36457702680389675, + "grad_norm": 0.7323315143585205, + "learning_rate": 9.221370181050817e-06, + "loss": 0.7958, + "step": 6624 + }, + { + "epoch": 0.3646320656062524, + "grad_norm": 0.7177294492721558, + "learning_rate": 9.221137865355496e-06, + "loss": 0.8405, + "step": 6625 + }, + { + "epoch": 0.3646871044086081, + "grad_norm": 0.7425093650817871, + "learning_rate": 9.220905517935235e-06, + "loss": 0.7722, + "step": 6626 + }, + { + "epoch": 0.3647421432109637, + "grad_norm": 0.8761040568351746, + "learning_rate": 9.220673138791775e-06, + "loss": 0.8617, + "step": 6627 + }, + { + "epoch": 0.3647971820133194, + "grad_norm": 0.927509069442749, + "learning_rate": 9.220440727926869e-06, + "loss": 0.7839, + "step": 6628 + }, + { + "epoch": 0.36485222081567503, + "grad_norm": 0.874399721622467, + "learning_rate": 9.220208285342258e-06, + "loss": 0.9697, + "step": 6629 + }, + { + "epoch": 0.3649072596180307, + "grad_norm": 0.931384801864624, + "learning_rate": 9.219975811039691e-06, + "loss": 0.8142, + "step": 6630 + }, + { + "epoch": 0.36496229842038636, + "grad_norm": 0.8567885160446167, + "learning_rate": 9.219743305020916e-06, + "loss": 0.7623, + "step": 6631 + }, + { + "epoch": 0.36501733722274204, + "grad_norm": 0.7287514209747314, + "learning_rate": 9.21951076728768e-06, + "loss": 0.8044, + "step": 6632 + }, + { + "epoch": 0.3650723760250977, + "grad_norm": 0.7234703302383423, + "learning_rate": 9.21927819784173e-06, + "loss": 0.7736, + "step": 6633 + }, + { + "epoch": 0.36512741482745337, + "grad_norm": 0.7174978256225586, + "learning_rate": 9.219045596684815e-06, + "loss": 0.7658, + "step": 6634 + }, + { + "epoch": 0.365182453629809, + "grad_norm": 0.751075804233551, + "learning_rate": 9.218812963818682e-06, + "loss": 0.7586, + "step": 6635 + }, + { + "epoch": 0.3652374924321647, + "grad_norm": 0.755283534526825, + "learning_rate": 9.21858029924508e-06, + "loss": 0.8904, + "step": 6636 + }, + { + "epoch": 0.3652925312345203, + "grad_norm": 0.6439716815948486, + "learning_rate": 9.21834760296576e-06, + "loss": 0.7335, + "step": 6637 + }, + { + "epoch": 0.365347570036876, + "grad_norm": 0.735285758972168, + "learning_rate": 9.218114874982467e-06, + "loss": 0.7193, + "step": 6638 + }, + { + "epoch": 0.36540260883923165, + "grad_norm": 0.7724307775497437, + "learning_rate": 9.217882115296952e-06, + "loss": 0.8322, + "step": 6639 + }, + { + "epoch": 0.36545764764158734, + "grad_norm": 0.7771303653717041, + "learning_rate": 9.217649323910964e-06, + "loss": 0.7952, + "step": 6640 + }, + { + "epoch": 0.36551268644394297, + "grad_norm": 0.7753337621688843, + "learning_rate": 9.217416500826251e-06, + "loss": 0.8501, + "step": 6641 + }, + { + "epoch": 0.36556772524629866, + "grad_norm": 0.8104514479637146, + "learning_rate": 9.217183646044567e-06, + "loss": 0.8503, + "step": 6642 + }, + { + "epoch": 0.3656227640486543, + "grad_norm": 0.7191929221153259, + "learning_rate": 9.21695075956766e-06, + "loss": 0.7578, + "step": 6643 + }, + { + "epoch": 0.36567780285101, + "grad_norm": 0.745837926864624, + "learning_rate": 9.216717841397277e-06, + "loss": 0.819, + "step": 6644 + }, + { + "epoch": 0.3657328416533656, + "grad_norm": 0.7019662261009216, + "learning_rate": 9.216484891535174e-06, + "loss": 0.8024, + "step": 6645 + }, + { + "epoch": 0.3657878804557213, + "grad_norm": 0.9709738492965698, + "learning_rate": 9.216251909983095e-06, + "loss": 0.7653, + "step": 6646 + }, + { + "epoch": 0.36584291925807694, + "grad_norm": 0.7973032593727112, + "learning_rate": 9.2160188967428e-06, + "loss": 0.8071, + "step": 6647 + }, + { + "epoch": 0.3658979580604326, + "grad_norm": 0.6945796012878418, + "learning_rate": 9.215785851816034e-06, + "loss": 0.6831, + "step": 6648 + }, + { + "epoch": 0.36595299686278826, + "grad_norm": 0.8685100674629211, + "learning_rate": 9.21555277520455e-06, + "loss": 0.821, + "step": 6649 + }, + { + "epoch": 0.36600803566514395, + "grad_norm": 1.0164310932159424, + "learning_rate": 9.2153196669101e-06, + "loss": 0.7861, + "step": 6650 + }, + { + "epoch": 0.3660630744674996, + "grad_norm": 0.8572850227355957, + "learning_rate": 9.215086526934435e-06, + "loss": 0.7982, + "step": 6651 + }, + { + "epoch": 0.36611811326985527, + "grad_norm": 0.7481987476348877, + "learning_rate": 9.214853355279307e-06, + "loss": 0.8258, + "step": 6652 + }, + { + "epoch": 0.3661731520722109, + "grad_norm": 0.750344455242157, + "learning_rate": 9.214620151946472e-06, + "loss": 0.7842, + "step": 6653 + }, + { + "epoch": 0.3662281908745666, + "grad_norm": 1.0266414880752563, + "learning_rate": 9.214386916937678e-06, + "loss": 0.7313, + "step": 6654 + }, + { + "epoch": 0.36628322967692223, + "grad_norm": 0.7913589477539062, + "learning_rate": 9.214153650254682e-06, + "loss": 0.8251, + "step": 6655 + }, + { + "epoch": 0.3663382684792779, + "grad_norm": 0.7185465693473816, + "learning_rate": 9.213920351899235e-06, + "loss": 0.7145, + "step": 6656 + }, + { + "epoch": 0.36639330728163355, + "grad_norm": 0.7185063362121582, + "learning_rate": 9.213687021873088e-06, + "loss": 0.8321, + "step": 6657 + }, + { + "epoch": 0.3664483460839892, + "grad_norm": 0.8380091190338135, + "learning_rate": 9.213453660178e-06, + "loss": 0.8293, + "step": 6658 + }, + { + "epoch": 0.3665033848863449, + "grad_norm": 0.7569485306739807, + "learning_rate": 9.21322026681572e-06, + "loss": 0.7201, + "step": 6659 + }, + { + "epoch": 0.3665584236887005, + "grad_norm": 0.7212445735931396, + "learning_rate": 9.212986841788005e-06, + "loss": 0.7869, + "step": 6660 + }, + { + "epoch": 0.3666134624910562, + "grad_norm": 0.9435489773750305, + "learning_rate": 9.212753385096612e-06, + "loss": 0.8469, + "step": 6661 + }, + { + "epoch": 0.36666850129341183, + "grad_norm": 0.6609265208244324, + "learning_rate": 9.212519896743289e-06, + "loss": 0.6446, + "step": 6662 + }, + { + "epoch": 0.3667235400957675, + "grad_norm": 0.7232604026794434, + "learning_rate": 9.212286376729794e-06, + "loss": 0.7138, + "step": 6663 + }, + { + "epoch": 0.36677857889812315, + "grad_norm": 0.7276197075843811, + "learning_rate": 9.212052825057882e-06, + "loss": 0.725, + "step": 6664 + }, + { + "epoch": 0.36683361770047884, + "grad_norm": 0.7029727101325989, + "learning_rate": 9.21181924172931e-06, + "loss": 0.6973, + "step": 6665 + }, + { + "epoch": 0.3668886565028345, + "grad_norm": 0.7292968034744263, + "learning_rate": 9.21158562674583e-06, + "loss": 0.6984, + "step": 6666 + }, + { + "epoch": 0.36694369530519017, + "grad_norm": 0.6977009177207947, + "learning_rate": 9.2113519801092e-06, + "loss": 0.7752, + "step": 6667 + }, + { + "epoch": 0.3669987341075458, + "grad_norm": 0.8019471764564514, + "learning_rate": 9.211118301821176e-06, + "loss": 0.7481, + "step": 6668 + }, + { + "epoch": 0.3670537729099015, + "grad_norm": 0.8097867965698242, + "learning_rate": 9.210884591883516e-06, + "loss": 0.8077, + "step": 6669 + }, + { + "epoch": 0.3671088117122571, + "grad_norm": 1.1622828245162964, + "learning_rate": 9.210650850297973e-06, + "loss": 0.8053, + "step": 6670 + }, + { + "epoch": 0.3671638505146128, + "grad_norm": 0.8188957571983337, + "learning_rate": 9.210417077066304e-06, + "loss": 0.7731, + "step": 6671 + }, + { + "epoch": 0.36721888931696844, + "grad_norm": 0.8531584739685059, + "learning_rate": 9.210183272190269e-06, + "loss": 0.8183, + "step": 6672 + }, + { + "epoch": 0.36727392811932413, + "grad_norm": 0.8007203936576843, + "learning_rate": 9.209949435671624e-06, + "loss": 0.7906, + "step": 6673 + }, + { + "epoch": 0.36732896692167977, + "grad_norm": 0.8284860253334045, + "learning_rate": 9.209715567512126e-06, + "loss": 0.7845, + "step": 6674 + }, + { + "epoch": 0.36738400572403546, + "grad_norm": 0.7735304236412048, + "learning_rate": 9.209481667713533e-06, + "loss": 0.7333, + "step": 6675 + }, + { + "epoch": 0.3674390445263911, + "grad_norm": 0.7390912771224976, + "learning_rate": 9.209247736277601e-06, + "loss": 0.7992, + "step": 6676 + }, + { + "epoch": 0.3674940833287468, + "grad_norm": 0.6871926784515381, + "learning_rate": 9.209013773206091e-06, + "loss": 0.7765, + "step": 6677 + }, + { + "epoch": 0.3675491221311024, + "grad_norm": 0.7241746187210083, + "learning_rate": 9.208779778500758e-06, + "loss": 0.7124, + "step": 6678 + }, + { + "epoch": 0.3676041609334581, + "grad_norm": 0.7362630367279053, + "learning_rate": 9.208545752163365e-06, + "loss": 0.7695, + "step": 6679 + }, + { + "epoch": 0.36765919973581374, + "grad_norm": 0.7577944993972778, + "learning_rate": 9.208311694195669e-06, + "loss": 0.8302, + "step": 6680 + }, + { + "epoch": 0.3677142385381694, + "grad_norm": 0.7182355523109436, + "learning_rate": 9.208077604599427e-06, + "loss": 0.8182, + "step": 6681 + }, + { + "epoch": 0.36776927734052506, + "grad_norm": 0.7636679410934448, + "learning_rate": 9.207843483376402e-06, + "loss": 0.7266, + "step": 6682 + }, + { + "epoch": 0.36782431614288075, + "grad_norm": 0.7325936555862427, + "learning_rate": 9.207609330528349e-06, + "loss": 0.735, + "step": 6683 + }, + { + "epoch": 0.3678793549452364, + "grad_norm": 1.1119143962860107, + "learning_rate": 9.207375146057033e-06, + "loss": 1.0124, + "step": 6684 + }, + { + "epoch": 0.36793439374759207, + "grad_norm": 0.7694228291511536, + "learning_rate": 9.207140929964212e-06, + "loss": 0.7803, + "step": 6685 + }, + { + "epoch": 0.3679894325499477, + "grad_norm": 0.7628658413887024, + "learning_rate": 9.206906682251644e-06, + "loss": 0.8057, + "step": 6686 + }, + { + "epoch": 0.3680444713523034, + "grad_norm": 0.766266942024231, + "learning_rate": 9.206672402921092e-06, + "loss": 0.7827, + "step": 6687 + }, + { + "epoch": 0.368099510154659, + "grad_norm": 0.7355746626853943, + "learning_rate": 9.206438091974316e-06, + "loss": 0.8146, + "step": 6688 + }, + { + "epoch": 0.3681545489570147, + "grad_norm": 0.8464547395706177, + "learning_rate": 9.20620374941308e-06, + "loss": 0.8296, + "step": 6689 + }, + { + "epoch": 0.36820958775937035, + "grad_norm": 0.7113955616950989, + "learning_rate": 9.20596937523914e-06, + "loss": 0.7621, + "step": 6690 + }, + { + "epoch": 0.36826462656172604, + "grad_norm": 0.7141324877738953, + "learning_rate": 9.205734969454259e-06, + "loss": 0.738, + "step": 6691 + }, + { + "epoch": 0.36831966536408167, + "grad_norm": 0.7576237320899963, + "learning_rate": 9.2055005320602e-06, + "loss": 0.7727, + "step": 6692 + }, + { + "epoch": 0.36837470416643736, + "grad_norm": 0.7448444962501526, + "learning_rate": 9.205266063058727e-06, + "loss": 0.8238, + "step": 6693 + }, + { + "epoch": 0.368429742968793, + "grad_norm": 0.7441811561584473, + "learning_rate": 9.205031562451599e-06, + "loss": 0.7518, + "step": 6694 + }, + { + "epoch": 0.3684847817711487, + "grad_norm": 0.9284115433692932, + "learning_rate": 9.20479703024058e-06, + "loss": 0.817, + "step": 6695 + }, + { + "epoch": 0.3685398205735043, + "grad_norm": 0.7019243836402893, + "learning_rate": 9.204562466427431e-06, + "loss": 0.7403, + "step": 6696 + }, + { + "epoch": 0.36859485937586, + "grad_norm": 0.6345306634902954, + "learning_rate": 9.204327871013917e-06, + "loss": 0.7058, + "step": 6697 + }, + { + "epoch": 0.36864989817821564, + "grad_norm": 0.7375063300132751, + "learning_rate": 9.2040932440018e-06, + "loss": 0.831, + "step": 6698 + }, + { + "epoch": 0.36870493698057133, + "grad_norm": 0.8213731050491333, + "learning_rate": 9.203858585392842e-06, + "loss": 0.7677, + "step": 6699 + }, + { + "epoch": 0.36875997578292696, + "grad_norm": 0.7114601731300354, + "learning_rate": 9.203623895188809e-06, + "loss": 0.8015, + "step": 6700 + }, + { + "epoch": 0.3688150145852826, + "grad_norm": 0.7707667350769043, + "learning_rate": 9.203389173391463e-06, + "loss": 0.7758, + "step": 6701 + }, + { + "epoch": 0.3688700533876383, + "grad_norm": 0.7374396920204163, + "learning_rate": 9.203154420002572e-06, + "loss": 0.7583, + "step": 6702 + }, + { + "epoch": 0.3689250921899939, + "grad_norm": 0.7156866192817688, + "learning_rate": 9.202919635023895e-06, + "loss": 0.8173, + "step": 6703 + }, + { + "epoch": 0.3689801309923496, + "grad_norm": 0.6811904311180115, + "learning_rate": 9.2026848184572e-06, + "loss": 0.7441, + "step": 6704 + }, + { + "epoch": 0.36903516979470524, + "grad_norm": 0.7515163421630859, + "learning_rate": 9.20244997030425e-06, + "loss": 0.7927, + "step": 6705 + }, + { + "epoch": 0.36909020859706093, + "grad_norm": 0.761116087436676, + "learning_rate": 9.202215090566813e-06, + "loss": 0.7686, + "step": 6706 + }, + { + "epoch": 0.36914524739941657, + "grad_norm": 0.8726711869239807, + "learning_rate": 9.20198017924665e-06, + "loss": 0.7831, + "step": 6707 + }, + { + "epoch": 0.36920028620177225, + "grad_norm": 0.6868153810501099, + "learning_rate": 9.20174523634553e-06, + "loss": 0.7855, + "step": 6708 + }, + { + "epoch": 0.3692553250041279, + "grad_norm": 0.7140498757362366, + "learning_rate": 9.201510261865218e-06, + "loss": 0.8144, + "step": 6709 + }, + { + "epoch": 0.3693103638064836, + "grad_norm": 0.8745181560516357, + "learning_rate": 9.201275255807478e-06, + "loss": 0.9204, + "step": 6710 + }, + { + "epoch": 0.3693654026088392, + "grad_norm": 0.6535945534706116, + "learning_rate": 9.20104021817408e-06, + "loss": 0.7729, + "step": 6711 + }, + { + "epoch": 0.3694204414111949, + "grad_norm": 0.655857503414154, + "learning_rate": 9.200805148966785e-06, + "loss": 0.8373, + "step": 6712 + }, + { + "epoch": 0.36947548021355053, + "grad_norm": 0.8393271565437317, + "learning_rate": 9.200570048187365e-06, + "loss": 0.8532, + "step": 6713 + }, + { + "epoch": 0.3695305190159062, + "grad_norm": 0.7484574913978577, + "learning_rate": 9.200334915837585e-06, + "loss": 0.8411, + "step": 6714 + }, + { + "epoch": 0.36958555781826186, + "grad_norm": 0.9913665652275085, + "learning_rate": 9.200099751919212e-06, + "loss": 0.9011, + "step": 6715 + }, + { + "epoch": 0.36964059662061755, + "grad_norm": 0.7314063310623169, + "learning_rate": 9.199864556434013e-06, + "loss": 0.7184, + "step": 6716 + }, + { + "epoch": 0.3696956354229732, + "grad_norm": 0.7881553173065186, + "learning_rate": 9.199629329383758e-06, + "loss": 0.796, + "step": 6717 + }, + { + "epoch": 0.36975067422532887, + "grad_norm": 0.7440283298492432, + "learning_rate": 9.199394070770212e-06, + "loss": 0.7472, + "step": 6718 + }, + { + "epoch": 0.3698057130276845, + "grad_norm": 0.6916326880455017, + "learning_rate": 9.199158780595144e-06, + "loss": 0.6808, + "step": 6719 + }, + { + "epoch": 0.3698607518300402, + "grad_norm": 0.8482714295387268, + "learning_rate": 9.198923458860323e-06, + "loss": 0.7795, + "step": 6720 + }, + { + "epoch": 0.3699157906323958, + "grad_norm": 0.7541999816894531, + "learning_rate": 9.198688105567516e-06, + "loss": 0.7917, + "step": 6721 + }, + { + "epoch": 0.3699708294347515, + "grad_norm": 0.794335126876831, + "learning_rate": 9.198452720718494e-06, + "loss": 0.8463, + "step": 6722 + }, + { + "epoch": 0.37002586823710715, + "grad_norm": 0.7866827845573425, + "learning_rate": 9.198217304315025e-06, + "loss": 0.7938, + "step": 6723 + }, + { + "epoch": 0.37008090703946284, + "grad_norm": 0.7393556833267212, + "learning_rate": 9.19798185635888e-06, + "loss": 0.7825, + "step": 6724 + }, + { + "epoch": 0.37013594584181847, + "grad_norm": 0.7131090760231018, + "learning_rate": 9.197746376851825e-06, + "loss": 0.7184, + "step": 6725 + }, + { + "epoch": 0.37019098464417416, + "grad_norm": 0.7054039239883423, + "learning_rate": 9.197510865795634e-06, + "loss": 0.7458, + "step": 6726 + }, + { + "epoch": 0.3702460234465298, + "grad_norm": 0.7437009811401367, + "learning_rate": 9.197275323192073e-06, + "loss": 0.7921, + "step": 6727 + }, + { + "epoch": 0.3703010622488855, + "grad_norm": 1.0703076124191284, + "learning_rate": 9.197039749042916e-06, + "loss": 0.771, + "step": 6728 + }, + { + "epoch": 0.3703561010512411, + "grad_norm": 0.8278045654296875, + "learning_rate": 9.196804143349929e-06, + "loss": 0.8984, + "step": 6729 + }, + { + "epoch": 0.3704111398535968, + "grad_norm": 0.7713067531585693, + "learning_rate": 9.196568506114887e-06, + "loss": 0.7702, + "step": 6730 + }, + { + "epoch": 0.37046617865595244, + "grad_norm": 0.9040505290031433, + "learning_rate": 9.19633283733956e-06, + "loss": 0.7113, + "step": 6731 + }, + { + "epoch": 0.3705212174583081, + "grad_norm": 0.8853700757026672, + "learning_rate": 9.196097137025718e-06, + "loss": 0.8445, + "step": 6732 + }, + { + "epoch": 0.37057625626066376, + "grad_norm": 0.6870817542076111, + "learning_rate": 9.195861405175133e-06, + "loss": 0.7613, + "step": 6733 + }, + { + "epoch": 0.37063129506301945, + "grad_norm": 0.7539152503013611, + "learning_rate": 9.195625641789579e-06, + "loss": 0.7478, + "step": 6734 + }, + { + "epoch": 0.3706863338653751, + "grad_norm": 0.7084356546401978, + "learning_rate": 9.195389846870822e-06, + "loss": 0.7803, + "step": 6735 + }, + { + "epoch": 0.3707413726677308, + "grad_norm": 0.7883948087692261, + "learning_rate": 9.19515402042064e-06, + "loss": 0.8606, + "step": 6736 + }, + { + "epoch": 0.3707964114700864, + "grad_norm": 0.714948296546936, + "learning_rate": 9.194918162440804e-06, + "loss": 0.8066, + "step": 6737 + }, + { + "epoch": 0.3708514502724421, + "grad_norm": 0.7110786437988281, + "learning_rate": 9.194682272933085e-06, + "loss": 0.7439, + "step": 6738 + }, + { + "epoch": 0.37090648907479773, + "grad_norm": 0.7281045317649841, + "learning_rate": 9.194446351899257e-06, + "loss": 0.7772, + "step": 6739 + }, + { + "epoch": 0.3709615278771534, + "grad_norm": 0.7351245880126953, + "learning_rate": 9.194210399341093e-06, + "loss": 0.8777, + "step": 6740 + }, + { + "epoch": 0.37101656667950905, + "grad_norm": 0.8028532266616821, + "learning_rate": 9.193974415260367e-06, + "loss": 0.7461, + "step": 6741 + }, + { + "epoch": 0.37107160548186474, + "grad_norm": 0.8015451431274414, + "learning_rate": 9.19373839965885e-06, + "loss": 0.8006, + "step": 6742 + }, + { + "epoch": 0.3711266442842204, + "grad_norm": 0.9567442536354065, + "learning_rate": 9.193502352538321e-06, + "loss": 0.8636, + "step": 6743 + }, + { + "epoch": 0.371181683086576, + "grad_norm": 1.1413114070892334, + "learning_rate": 9.193266273900547e-06, + "loss": 0.8976, + "step": 6744 + }, + { + "epoch": 0.3712367218889317, + "grad_norm": 0.6971789002418518, + "learning_rate": 9.19303016374731e-06, + "loss": 0.7419, + "step": 6745 + }, + { + "epoch": 0.37129176069128733, + "grad_norm": 0.8117435574531555, + "learning_rate": 9.192794022080378e-06, + "loss": 0.8166, + "step": 6746 + }, + { + "epoch": 0.371346799493643, + "grad_norm": 0.7748119831085205, + "learning_rate": 9.19255784890153e-06, + "loss": 0.8073, + "step": 6747 + }, + { + "epoch": 0.37140183829599865, + "grad_norm": 0.6550068259239197, + "learning_rate": 9.192321644212539e-06, + "loss": 0.6976, + "step": 6748 + }, + { + "epoch": 0.37145687709835434, + "grad_norm": 0.7931404709815979, + "learning_rate": 9.19208540801518e-06, + "loss": 0.7153, + "step": 6749 + }, + { + "epoch": 0.37151191590071, + "grad_norm": 0.7107539176940918, + "learning_rate": 9.19184914031123e-06, + "loss": 0.7616, + "step": 6750 + }, + { + "epoch": 0.37156695470306567, + "grad_norm": 0.6983848810195923, + "learning_rate": 9.191612841102463e-06, + "loss": 0.6507, + "step": 6751 + }, + { + "epoch": 0.3716219935054213, + "grad_norm": 0.7653477787971497, + "learning_rate": 9.191376510390657e-06, + "loss": 0.708, + "step": 6752 + }, + { + "epoch": 0.371677032307777, + "grad_norm": 0.8903954029083252, + "learning_rate": 9.191140148177586e-06, + "loss": 0.8131, + "step": 6753 + }, + { + "epoch": 0.3717320711101326, + "grad_norm": 0.7584933042526245, + "learning_rate": 9.190903754465028e-06, + "loss": 0.8178, + "step": 6754 + }, + { + "epoch": 0.3717871099124883, + "grad_norm": 0.7338405847549438, + "learning_rate": 9.19066732925476e-06, + "loss": 0.7717, + "step": 6755 + }, + { + "epoch": 0.37184214871484395, + "grad_norm": 0.764944851398468, + "learning_rate": 9.190430872548557e-06, + "loss": 0.7762, + "step": 6756 + }, + { + "epoch": 0.37189718751719963, + "grad_norm": 0.7362231612205505, + "learning_rate": 9.190194384348199e-06, + "loss": 0.8277, + "step": 6757 + }, + { + "epoch": 0.37195222631955527, + "grad_norm": 0.7462226748466492, + "learning_rate": 9.18995786465546e-06, + "loss": 0.7362, + "step": 6758 + }, + { + "epoch": 0.37200726512191096, + "grad_norm": 0.7769725322723389, + "learning_rate": 9.18972131347212e-06, + "loss": 0.8217, + "step": 6759 + }, + { + "epoch": 0.3720623039242666, + "grad_norm": 0.7263969779014587, + "learning_rate": 9.189484730799956e-06, + "loss": 0.7719, + "step": 6760 + }, + { + "epoch": 0.3721173427266223, + "grad_norm": 0.7612473964691162, + "learning_rate": 9.189248116640746e-06, + "loss": 0.7149, + "step": 6761 + }, + { + "epoch": 0.3721723815289779, + "grad_norm": 0.6813042759895325, + "learning_rate": 9.189011470996268e-06, + "loss": 0.7119, + "step": 6762 + }, + { + "epoch": 0.3722274203313336, + "grad_norm": 0.7376571297645569, + "learning_rate": 9.188774793868302e-06, + "loss": 0.7998, + "step": 6763 + }, + { + "epoch": 0.37228245913368924, + "grad_norm": 0.8592102527618408, + "learning_rate": 9.188538085258626e-06, + "loss": 0.8026, + "step": 6764 + }, + { + "epoch": 0.3723374979360449, + "grad_norm": 0.7666613459587097, + "learning_rate": 9.188301345169017e-06, + "loss": 0.8571, + "step": 6765 + }, + { + "epoch": 0.37239253673840056, + "grad_norm": 0.7118985652923584, + "learning_rate": 9.188064573601258e-06, + "loss": 0.7637, + "step": 6766 + }, + { + "epoch": 0.37244757554075625, + "grad_norm": 0.8247082233428955, + "learning_rate": 9.187827770557127e-06, + "loss": 0.8209, + "step": 6767 + }, + { + "epoch": 0.3725026143431119, + "grad_norm": 0.7259567975997925, + "learning_rate": 9.187590936038403e-06, + "loss": 0.7918, + "step": 6768 + }, + { + "epoch": 0.37255765314546757, + "grad_norm": 0.7409893274307251, + "learning_rate": 9.187354070046867e-06, + "loss": 0.8004, + "step": 6769 + }, + { + "epoch": 0.3726126919478232, + "grad_norm": 0.8163084387779236, + "learning_rate": 9.187117172584298e-06, + "loss": 0.8452, + "step": 6770 + }, + { + "epoch": 0.3726677307501789, + "grad_norm": 0.9241586923599243, + "learning_rate": 9.186880243652477e-06, + "loss": 0.8939, + "step": 6771 + }, + { + "epoch": 0.3727227695525345, + "grad_norm": 0.710434079170227, + "learning_rate": 9.186643283253185e-06, + "loss": 0.7337, + "step": 6772 + }, + { + "epoch": 0.3727778083548902, + "grad_norm": 0.7850505709648132, + "learning_rate": 9.186406291388203e-06, + "loss": 0.7892, + "step": 6773 + }, + { + "epoch": 0.37283284715724585, + "grad_norm": 0.813979983329773, + "learning_rate": 9.186169268059311e-06, + "loss": 0.7993, + "step": 6774 + }, + { + "epoch": 0.37288788595960154, + "grad_norm": 0.7923213243484497, + "learning_rate": 9.185932213268292e-06, + "loss": 0.7501, + "step": 6775 + }, + { + "epoch": 0.3729429247619572, + "grad_norm": 0.7923155426979065, + "learning_rate": 9.185695127016928e-06, + "loss": 0.8435, + "step": 6776 + }, + { + "epoch": 0.37299796356431286, + "grad_norm": 0.69893479347229, + "learning_rate": 9.185458009306999e-06, + "loss": 0.7155, + "step": 6777 + }, + { + "epoch": 0.3730530023666685, + "grad_norm": 0.7848305106163025, + "learning_rate": 9.185220860140289e-06, + "loss": 0.7971, + "step": 6778 + }, + { + "epoch": 0.3731080411690242, + "grad_norm": 0.6707655787467957, + "learning_rate": 9.184983679518578e-06, + "loss": 0.6939, + "step": 6779 + }, + { + "epoch": 0.3731630799713798, + "grad_norm": 0.6612532734870911, + "learning_rate": 9.18474646744365e-06, + "loss": 0.7361, + "step": 6780 + }, + { + "epoch": 0.3732181187737355, + "grad_norm": 0.7753985524177551, + "learning_rate": 9.184509223917288e-06, + "loss": 0.7263, + "step": 6781 + }, + { + "epoch": 0.37327315757609114, + "grad_norm": 0.6856646537780762, + "learning_rate": 9.184271948941275e-06, + "loss": 0.6923, + "step": 6782 + }, + { + "epoch": 0.37332819637844683, + "grad_norm": 0.7223647832870483, + "learning_rate": 9.184034642517393e-06, + "loss": 0.793, + "step": 6783 + }, + { + "epoch": 0.37338323518080246, + "grad_norm": 0.7428838014602661, + "learning_rate": 9.183797304647428e-06, + "loss": 0.7781, + "step": 6784 + }, + { + "epoch": 0.37343827398315815, + "grad_norm": 0.7301773428916931, + "learning_rate": 9.183559935333161e-06, + "loss": 0.7964, + "step": 6785 + }, + { + "epoch": 0.3734933127855138, + "grad_norm": 0.7883384823799133, + "learning_rate": 9.183322534576378e-06, + "loss": 0.8904, + "step": 6786 + }, + { + "epoch": 0.3735483515878694, + "grad_norm": 0.7943564653396606, + "learning_rate": 9.183085102378864e-06, + "loss": 0.7229, + "step": 6787 + }, + { + "epoch": 0.3736033903902251, + "grad_norm": 0.7385129928588867, + "learning_rate": 9.1828476387424e-06, + "loss": 0.7967, + "step": 6788 + }, + { + "epoch": 0.37365842919258074, + "grad_norm": 0.7968102097511292, + "learning_rate": 9.182610143668775e-06, + "loss": 0.8016, + "step": 6789 + }, + { + "epoch": 0.37371346799493643, + "grad_norm": 0.7810283303260803, + "learning_rate": 9.18237261715977e-06, + "loss": 0.8956, + "step": 6790 + }, + { + "epoch": 0.37376850679729207, + "grad_norm": 0.7110065221786499, + "learning_rate": 9.182135059217172e-06, + "loss": 0.7808, + "step": 6791 + }, + { + "epoch": 0.37382354559964776, + "grad_norm": 0.7513633370399475, + "learning_rate": 9.181897469842767e-06, + "loss": 0.8236, + "step": 6792 + }, + { + "epoch": 0.3738785844020034, + "grad_norm": 0.7850426435470581, + "learning_rate": 9.18165984903834e-06, + "loss": 0.8642, + "step": 6793 + }, + { + "epoch": 0.3739336232043591, + "grad_norm": 1.4948225021362305, + "learning_rate": 9.181422196805676e-06, + "loss": 0.8765, + "step": 6794 + }, + { + "epoch": 0.3739886620067147, + "grad_norm": 0.8242343068122864, + "learning_rate": 9.181184513146563e-06, + "loss": 0.7213, + "step": 6795 + }, + { + "epoch": 0.3740437008090704, + "grad_norm": 0.8017476797103882, + "learning_rate": 9.180946798062786e-06, + "loss": 0.655, + "step": 6796 + }, + { + "epoch": 0.37409873961142603, + "grad_norm": 0.9573387503623962, + "learning_rate": 9.180709051556132e-06, + "loss": 0.8674, + "step": 6797 + }, + { + "epoch": 0.3741537784137817, + "grad_norm": 0.7575511932373047, + "learning_rate": 9.180471273628388e-06, + "loss": 0.8672, + "step": 6798 + }, + { + "epoch": 0.37420881721613736, + "grad_norm": 0.7723323702812195, + "learning_rate": 9.180233464281343e-06, + "loss": 0.7698, + "step": 6799 + }, + { + "epoch": 0.37426385601849305, + "grad_norm": 0.8352731466293335, + "learning_rate": 9.17999562351678e-06, + "loss": 0.9248, + "step": 6800 + }, + { + "epoch": 0.3743188948208487, + "grad_norm": 0.7459322214126587, + "learning_rate": 9.179757751336488e-06, + "loss": 0.7561, + "step": 6801 + }, + { + "epoch": 0.37437393362320437, + "grad_norm": 0.8053051829338074, + "learning_rate": 9.179519847742257e-06, + "loss": 0.8743, + "step": 6802 + }, + { + "epoch": 0.37442897242556, + "grad_norm": 0.7781768441200256, + "learning_rate": 9.179281912735873e-06, + "loss": 0.7426, + "step": 6803 + }, + { + "epoch": 0.3744840112279157, + "grad_norm": 0.6812007427215576, + "learning_rate": 9.179043946319126e-06, + "loss": 0.761, + "step": 6804 + }, + { + "epoch": 0.3745390500302713, + "grad_norm": 0.8327108025550842, + "learning_rate": 9.178805948493803e-06, + "loss": 0.7633, + "step": 6805 + }, + { + "epoch": 0.374594088832627, + "grad_norm": 0.7519007921218872, + "learning_rate": 9.178567919261692e-06, + "loss": 0.8268, + "step": 6806 + }, + { + "epoch": 0.37464912763498265, + "grad_norm": 0.7507897019386292, + "learning_rate": 9.178329858624584e-06, + "loss": 0.8734, + "step": 6807 + }, + { + "epoch": 0.37470416643733834, + "grad_norm": 0.6874666213989258, + "learning_rate": 9.178091766584267e-06, + "loss": 0.6669, + "step": 6808 + }, + { + "epoch": 0.37475920523969397, + "grad_norm": 0.6987403631210327, + "learning_rate": 9.17785364314253e-06, + "loss": 0.7627, + "step": 6809 + }, + { + "epoch": 0.37481424404204966, + "grad_norm": 0.7777343392372131, + "learning_rate": 9.177615488301163e-06, + "loss": 0.7637, + "step": 6810 + }, + { + "epoch": 0.3748692828444053, + "grad_norm": 0.71980881690979, + "learning_rate": 9.177377302061958e-06, + "loss": 0.7964, + "step": 6811 + }, + { + "epoch": 0.374924321646761, + "grad_norm": 0.627328634262085, + "learning_rate": 9.177139084426704e-06, + "loss": 0.6862, + "step": 6812 + }, + { + "epoch": 0.3749793604491166, + "grad_norm": 0.7099852561950684, + "learning_rate": 9.176900835397188e-06, + "loss": 0.7592, + "step": 6813 + }, + { + "epoch": 0.3750343992514723, + "grad_norm": 0.7880212664604187, + "learning_rate": 9.176662554975205e-06, + "loss": 0.756, + "step": 6814 + }, + { + "epoch": 0.37508943805382794, + "grad_norm": 0.7347460389137268, + "learning_rate": 9.176424243162546e-06, + "loss": 0.8537, + "step": 6815 + }, + { + "epoch": 0.37514447685618363, + "grad_norm": 0.7020999789237976, + "learning_rate": 9.176185899960996e-06, + "loss": 0.7844, + "step": 6816 + }, + { + "epoch": 0.37519951565853926, + "grad_norm": 0.6857696175575256, + "learning_rate": 9.175947525372355e-06, + "loss": 0.8491, + "step": 6817 + }, + { + "epoch": 0.37525455446089495, + "grad_norm": 0.6882391571998596, + "learning_rate": 9.175709119398409e-06, + "loss": 0.7797, + "step": 6818 + }, + { + "epoch": 0.3753095932632506, + "grad_norm": 0.7788485288619995, + "learning_rate": 9.17547068204095e-06, + "loss": 0.6898, + "step": 6819 + }, + { + "epoch": 0.3753646320656063, + "grad_norm": 0.8529300093650818, + "learning_rate": 9.17523221330177e-06, + "loss": 0.8113, + "step": 6820 + }, + { + "epoch": 0.3754196708679619, + "grad_norm": 0.6297540068626404, + "learning_rate": 9.174993713182663e-06, + "loss": 0.7133, + "step": 6821 + }, + { + "epoch": 0.3754747096703176, + "grad_norm": 0.8225051760673523, + "learning_rate": 9.174755181685422e-06, + "loss": 0.83, + "step": 6822 + }, + { + "epoch": 0.37552974847267323, + "grad_norm": 0.7445290684700012, + "learning_rate": 9.174516618811838e-06, + "loss": 0.8597, + "step": 6823 + }, + { + "epoch": 0.3755847872750289, + "grad_norm": 0.7890744209289551, + "learning_rate": 9.174278024563706e-06, + "loss": 0.8021, + "step": 6824 + }, + { + "epoch": 0.37563982607738455, + "grad_norm": 0.644434928894043, + "learning_rate": 9.174039398942815e-06, + "loss": 0.7154, + "step": 6825 + }, + { + "epoch": 0.37569486487974024, + "grad_norm": 0.7664980292320251, + "learning_rate": 9.173800741950962e-06, + "loss": 0.8496, + "step": 6826 + }, + { + "epoch": 0.3757499036820959, + "grad_norm": 0.8062339425086975, + "learning_rate": 9.173562053589942e-06, + "loss": 0.7736, + "step": 6827 + }, + { + "epoch": 0.37580494248445157, + "grad_norm": 0.6334213018417358, + "learning_rate": 9.173323333861543e-06, + "loss": 0.6513, + "step": 6828 + }, + { + "epoch": 0.3758599812868072, + "grad_norm": 0.6825501322746277, + "learning_rate": 9.173084582767567e-06, + "loss": 0.755, + "step": 6829 + }, + { + "epoch": 0.37591502008916283, + "grad_norm": 0.7353835105895996, + "learning_rate": 9.172845800309801e-06, + "loss": 0.7783, + "step": 6830 + }, + { + "epoch": 0.3759700588915185, + "grad_norm": 0.7830193638801575, + "learning_rate": 9.172606986490046e-06, + "loss": 0.7352, + "step": 6831 + }, + { + "epoch": 0.37602509769387416, + "grad_norm": 0.7464943528175354, + "learning_rate": 9.172368141310091e-06, + "loss": 0.6454, + "step": 6832 + }, + { + "epoch": 0.37608013649622984, + "grad_norm": 0.7171493172645569, + "learning_rate": 9.172129264771736e-06, + "loss": 0.7978, + "step": 6833 + }, + { + "epoch": 0.3761351752985855, + "grad_norm": 0.6929624676704407, + "learning_rate": 9.171890356876774e-06, + "loss": 0.8026, + "step": 6834 + }, + { + "epoch": 0.37619021410094117, + "grad_norm": 0.7240758538246155, + "learning_rate": 9.171651417627e-06, + "loss": 0.8469, + "step": 6835 + }, + { + "epoch": 0.3762452529032968, + "grad_norm": 0.7713736891746521, + "learning_rate": 9.17141244702421e-06, + "loss": 0.8307, + "step": 6836 + }, + { + "epoch": 0.3763002917056525, + "grad_norm": 0.7417639493942261, + "learning_rate": 9.171173445070203e-06, + "loss": 0.8165, + "step": 6837 + }, + { + "epoch": 0.3763553305080081, + "grad_norm": 0.811005711555481, + "learning_rate": 9.17093441176677e-06, + "loss": 0.8418, + "step": 6838 + }, + { + "epoch": 0.3764103693103638, + "grad_norm": 0.9996818900108337, + "learning_rate": 9.170695347115713e-06, + "loss": 0.851, + "step": 6839 + }, + { + "epoch": 0.37646540811271945, + "grad_norm": 0.7703381776809692, + "learning_rate": 9.170456251118824e-06, + "loss": 0.8308, + "step": 6840 + }, + { + "epoch": 0.37652044691507514, + "grad_norm": 0.7194466590881348, + "learning_rate": 9.170217123777904e-06, + "loss": 0.699, + "step": 6841 + }, + { + "epoch": 0.37657548571743077, + "grad_norm": 0.7146462202072144, + "learning_rate": 9.169977965094748e-06, + "loss": 0.8247, + "step": 6842 + }, + { + "epoch": 0.37663052451978646, + "grad_norm": 0.7490555047988892, + "learning_rate": 9.169738775071153e-06, + "loss": 0.8627, + "step": 6843 + }, + { + "epoch": 0.3766855633221421, + "grad_norm": 0.827996015548706, + "learning_rate": 9.169499553708919e-06, + "loss": 0.7454, + "step": 6844 + }, + { + "epoch": 0.3767406021244978, + "grad_norm": 0.7185913324356079, + "learning_rate": 9.16926030100984e-06, + "loss": 0.7018, + "step": 6845 + }, + { + "epoch": 0.3767956409268534, + "grad_norm": 0.7879654169082642, + "learning_rate": 9.169021016975718e-06, + "loss": 0.8144, + "step": 6846 + }, + { + "epoch": 0.3768506797292091, + "grad_norm": 0.7072417736053467, + "learning_rate": 9.168781701608352e-06, + "loss": 0.7572, + "step": 6847 + }, + { + "epoch": 0.37690571853156474, + "grad_norm": 0.7359803915023804, + "learning_rate": 9.168542354909536e-06, + "loss": 0.7712, + "step": 6848 + }, + { + "epoch": 0.3769607573339204, + "grad_norm": 0.7672479748725891, + "learning_rate": 9.168302976881072e-06, + "loss": 0.7696, + "step": 6849 + }, + { + "epoch": 0.37701579613627606, + "grad_norm": 0.7276006937026978, + "learning_rate": 9.168063567524758e-06, + "loss": 0.8235, + "step": 6850 + }, + { + "epoch": 0.37707083493863175, + "grad_norm": 0.673577606678009, + "learning_rate": 9.167824126842396e-06, + "loss": 0.6515, + "step": 6851 + }, + { + "epoch": 0.3771258737409874, + "grad_norm": 0.7257997989654541, + "learning_rate": 9.167584654835782e-06, + "loss": 0.729, + "step": 6852 + }, + { + "epoch": 0.37718091254334307, + "grad_norm": 0.6655071377754211, + "learning_rate": 9.167345151506717e-06, + "loss": 0.7917, + "step": 6853 + }, + { + "epoch": 0.3772359513456987, + "grad_norm": 0.7603726983070374, + "learning_rate": 9.167105616857002e-06, + "loss": 0.8383, + "step": 6854 + }, + { + "epoch": 0.3772909901480544, + "grad_norm": 0.7066939473152161, + "learning_rate": 9.166866050888437e-06, + "loss": 0.7589, + "step": 6855 + }, + { + "epoch": 0.37734602895041003, + "grad_norm": 0.7002355456352234, + "learning_rate": 9.16662645360282e-06, + "loss": 0.8305, + "step": 6856 + }, + { + "epoch": 0.3774010677527657, + "grad_norm": 0.9499780535697937, + "learning_rate": 9.166386825001957e-06, + "loss": 0.78, + "step": 6857 + }, + { + "epoch": 0.37745610655512135, + "grad_norm": 0.7136938571929932, + "learning_rate": 9.166147165087645e-06, + "loss": 0.7449, + "step": 6858 + }, + { + "epoch": 0.37751114535747704, + "grad_norm": 0.740443766117096, + "learning_rate": 9.165907473861687e-06, + "loss": 0.8228, + "step": 6859 + }, + { + "epoch": 0.3775661841598327, + "grad_norm": 0.7649856209754944, + "learning_rate": 9.165667751325879e-06, + "loss": 0.7762, + "step": 6860 + }, + { + "epoch": 0.37762122296218836, + "grad_norm": 0.743251383304596, + "learning_rate": 9.165427997482032e-06, + "loss": 0.7536, + "step": 6861 + }, + { + "epoch": 0.377676261764544, + "grad_norm": 0.7023851871490479, + "learning_rate": 9.165188212331941e-06, + "loss": 0.7327, + "step": 6862 + }, + { + "epoch": 0.3777313005668997, + "grad_norm": 0.7304333448410034, + "learning_rate": 9.164948395877411e-06, + "loss": 0.8816, + "step": 6863 + }, + { + "epoch": 0.3777863393692553, + "grad_norm": 0.6666659116744995, + "learning_rate": 9.164708548120244e-06, + "loss": 0.7821, + "step": 6864 + }, + { + "epoch": 0.377841378171611, + "grad_norm": 0.6542865037918091, + "learning_rate": 9.164468669062242e-06, + "loss": 0.7044, + "step": 6865 + }, + { + "epoch": 0.37789641697396664, + "grad_norm": 0.7436043620109558, + "learning_rate": 9.16422875870521e-06, + "loss": 0.8492, + "step": 6866 + }, + { + "epoch": 0.37795145577632233, + "grad_norm": 0.7660424709320068, + "learning_rate": 9.163988817050947e-06, + "loss": 0.7236, + "step": 6867 + }, + { + "epoch": 0.37800649457867797, + "grad_norm": 0.7288914918899536, + "learning_rate": 9.16374884410126e-06, + "loss": 0.6361, + "step": 6868 + }, + { + "epoch": 0.37806153338103365, + "grad_norm": 0.884832501411438, + "learning_rate": 9.163508839857948e-06, + "loss": 0.8112, + "step": 6869 + }, + { + "epoch": 0.3781165721833893, + "grad_norm": 0.937660813331604, + "learning_rate": 9.163268804322822e-06, + "loss": 0.6405, + "step": 6870 + }, + { + "epoch": 0.378171610985745, + "grad_norm": 0.8295212388038635, + "learning_rate": 9.16302873749768e-06, + "loss": 0.8107, + "step": 6871 + }, + { + "epoch": 0.3782266497881006, + "grad_norm": 1.0573647022247314, + "learning_rate": 9.16278863938433e-06, + "loss": 0.7792, + "step": 6872 + }, + { + "epoch": 0.37828168859045624, + "grad_norm": 0.8450027108192444, + "learning_rate": 9.162548509984574e-06, + "loss": 0.8103, + "step": 6873 + }, + { + "epoch": 0.37833672739281193, + "grad_norm": 0.7372947931289673, + "learning_rate": 9.162308349300218e-06, + "loss": 0.8232, + "step": 6874 + }, + { + "epoch": 0.37839176619516757, + "grad_norm": 0.7573776841163635, + "learning_rate": 9.162068157333066e-06, + "loss": 0.773, + "step": 6875 + }, + { + "epoch": 0.37844680499752326, + "grad_norm": 0.7883201241493225, + "learning_rate": 9.161827934084924e-06, + "loss": 0.7561, + "step": 6876 + }, + { + "epoch": 0.3785018437998789, + "grad_norm": 0.7195025086402893, + "learning_rate": 9.161587679557598e-06, + "loss": 0.798, + "step": 6877 + }, + { + "epoch": 0.3785568826022346, + "grad_norm": 0.7047843337059021, + "learning_rate": 9.161347393752891e-06, + "loss": 0.8122, + "step": 6878 + }, + { + "epoch": 0.3786119214045902, + "grad_norm": 0.7354363203048706, + "learning_rate": 9.161107076672613e-06, + "loss": 0.7296, + "step": 6879 + }, + { + "epoch": 0.3786669602069459, + "grad_norm": 0.7748313546180725, + "learning_rate": 9.160866728318567e-06, + "loss": 0.9576, + "step": 6880 + }, + { + "epoch": 0.37872199900930154, + "grad_norm": 0.7197638750076294, + "learning_rate": 9.16062634869256e-06, + "loss": 0.8054, + "step": 6881 + }, + { + "epoch": 0.3787770378116572, + "grad_norm": 0.7086492776870728, + "learning_rate": 9.1603859377964e-06, + "loss": 0.8938, + "step": 6882 + }, + { + "epoch": 0.37883207661401286, + "grad_norm": 0.7764425873756409, + "learning_rate": 9.160145495631894e-06, + "loss": 0.7562, + "step": 6883 + }, + { + "epoch": 0.37888711541636855, + "grad_norm": 0.7673479914665222, + "learning_rate": 9.159905022200846e-06, + "loss": 0.6783, + "step": 6884 + }, + { + "epoch": 0.3789421542187242, + "grad_norm": 0.7323669195175171, + "learning_rate": 9.159664517505067e-06, + "loss": 0.8274, + "step": 6885 + }, + { + "epoch": 0.37899719302107987, + "grad_norm": 0.8283136487007141, + "learning_rate": 9.159423981546362e-06, + "loss": 0.7184, + "step": 6886 + }, + { + "epoch": 0.3790522318234355, + "grad_norm": 0.6949145793914795, + "learning_rate": 9.15918341432654e-06, + "loss": 0.7843, + "step": 6887 + }, + { + "epoch": 0.3791072706257912, + "grad_norm": 0.8584639430046082, + "learning_rate": 9.158942815847408e-06, + "loss": 0.71, + "step": 6888 + }, + { + "epoch": 0.3791623094281468, + "grad_norm": 0.7125271558761597, + "learning_rate": 9.158702186110777e-06, + "loss": 0.7432, + "step": 6889 + }, + { + "epoch": 0.3792173482305025, + "grad_norm": 0.6657430529594421, + "learning_rate": 9.158461525118452e-06, + "loss": 0.6715, + "step": 6890 + }, + { + "epoch": 0.37927238703285815, + "grad_norm": 0.770226240158081, + "learning_rate": 9.158220832872243e-06, + "loss": 0.7029, + "step": 6891 + }, + { + "epoch": 0.37932742583521384, + "grad_norm": 0.7697272300720215, + "learning_rate": 9.15798010937396e-06, + "loss": 0.686, + "step": 6892 + }, + { + "epoch": 0.37938246463756947, + "grad_norm": 0.7693290710449219, + "learning_rate": 9.157739354625413e-06, + "loss": 0.7669, + "step": 6893 + }, + { + "epoch": 0.37943750343992516, + "grad_norm": 0.8365996479988098, + "learning_rate": 9.157498568628406e-06, + "loss": 0.8254, + "step": 6894 + }, + { + "epoch": 0.3794925422422808, + "grad_norm": 0.8075883388519287, + "learning_rate": 9.157257751384756e-06, + "loss": 0.8311, + "step": 6895 + }, + { + "epoch": 0.3795475810446365, + "grad_norm": 0.8422812819480896, + "learning_rate": 9.15701690289627e-06, + "loss": 0.9173, + "step": 6896 + }, + { + "epoch": 0.3796026198469921, + "grad_norm": 0.7930355072021484, + "learning_rate": 9.156776023164755e-06, + "loss": 0.9376, + "step": 6897 + }, + { + "epoch": 0.3796576586493478, + "grad_norm": 0.7877563238143921, + "learning_rate": 9.156535112192026e-06, + "loss": 0.8358, + "step": 6898 + }, + { + "epoch": 0.37971269745170344, + "grad_norm": 0.7712885141372681, + "learning_rate": 9.156294169979891e-06, + "loss": 0.8781, + "step": 6899 + }, + { + "epoch": 0.37976773625405913, + "grad_norm": 0.6953728199005127, + "learning_rate": 9.156053196530162e-06, + "loss": 0.7861, + "step": 6900 + }, + { + "epoch": 0.37982277505641476, + "grad_norm": 0.9581564664840698, + "learning_rate": 9.155812191844649e-06, + "loss": 0.8294, + "step": 6901 + }, + { + "epoch": 0.37987781385877045, + "grad_norm": 0.738571286201477, + "learning_rate": 9.155571155925166e-06, + "loss": 0.7998, + "step": 6902 + }, + { + "epoch": 0.3799328526611261, + "grad_norm": 0.7059765458106995, + "learning_rate": 9.155330088773519e-06, + "loss": 0.7877, + "step": 6903 + }, + { + "epoch": 0.3799878914634818, + "grad_norm": 0.8572642207145691, + "learning_rate": 9.155088990391527e-06, + "loss": 0.7333, + "step": 6904 + }, + { + "epoch": 0.3800429302658374, + "grad_norm": 0.7442637085914612, + "learning_rate": 9.154847860780996e-06, + "loss": 0.685, + "step": 6905 + }, + { + "epoch": 0.3800979690681931, + "grad_norm": 0.7787682414054871, + "learning_rate": 9.154606699943741e-06, + "loss": 0.7893, + "step": 6906 + }, + { + "epoch": 0.38015300787054873, + "grad_norm": 0.8973822593688965, + "learning_rate": 9.154365507881574e-06, + "loss": 0.8297, + "step": 6907 + }, + { + "epoch": 0.3802080466729044, + "grad_norm": 0.7759919166564941, + "learning_rate": 9.154124284596311e-06, + "loss": 0.8257, + "step": 6908 + }, + { + "epoch": 0.38026308547526005, + "grad_norm": 0.8042850494384766, + "learning_rate": 9.153883030089759e-06, + "loss": 0.8024, + "step": 6909 + }, + { + "epoch": 0.38031812427761574, + "grad_norm": 0.8285790085792542, + "learning_rate": 9.153641744363733e-06, + "loss": 0.7824, + "step": 6910 + }, + { + "epoch": 0.3803731630799714, + "grad_norm": 0.7225445508956909, + "learning_rate": 9.15340042742005e-06, + "loss": 0.8065, + "step": 6911 + }, + { + "epoch": 0.38042820188232707, + "grad_norm": 0.7685298919677734, + "learning_rate": 9.15315907926052e-06, + "loss": 0.8151, + "step": 6912 + }, + { + "epoch": 0.3804832406846827, + "grad_norm": 0.9005589485168457, + "learning_rate": 9.152917699886958e-06, + "loss": 0.8413, + "step": 6913 + }, + { + "epoch": 0.3805382794870384, + "grad_norm": 0.8715279698371887, + "learning_rate": 9.152676289301178e-06, + "loss": 0.7233, + "step": 6914 + }, + { + "epoch": 0.380593318289394, + "grad_norm": 0.8764133453369141, + "learning_rate": 9.152434847504996e-06, + "loss": 0.783, + "step": 6915 + }, + { + "epoch": 0.38064835709174966, + "grad_norm": 0.6847019195556641, + "learning_rate": 9.152193374500225e-06, + "loss": 0.7133, + "step": 6916 + }, + { + "epoch": 0.38070339589410535, + "grad_norm": 0.7562721371650696, + "learning_rate": 9.151951870288678e-06, + "loss": 0.8155, + "step": 6917 + }, + { + "epoch": 0.380758434696461, + "grad_norm": 0.6888439059257507, + "learning_rate": 9.151710334872173e-06, + "loss": 0.6395, + "step": 6918 + }, + { + "epoch": 0.38081347349881667, + "grad_norm": 1.0951511859893799, + "learning_rate": 9.151468768252525e-06, + "loss": 0.8936, + "step": 6919 + }, + { + "epoch": 0.3808685123011723, + "grad_norm": 0.7261115908622742, + "learning_rate": 9.151227170431549e-06, + "loss": 0.7864, + "step": 6920 + }, + { + "epoch": 0.380923551103528, + "grad_norm": 1.2851859331130981, + "learning_rate": 9.150985541411061e-06, + "loss": 0.9419, + "step": 6921 + }, + { + "epoch": 0.3809785899058836, + "grad_norm": 0.7621721625328064, + "learning_rate": 9.150743881192876e-06, + "loss": 0.7773, + "step": 6922 + }, + { + "epoch": 0.3810336287082393, + "grad_norm": 0.7605605721473694, + "learning_rate": 9.150502189778811e-06, + "loss": 0.8752, + "step": 6923 + }, + { + "epoch": 0.38108866751059495, + "grad_norm": 0.8422327041625977, + "learning_rate": 9.150260467170683e-06, + "loss": 0.8555, + "step": 6924 + }, + { + "epoch": 0.38114370631295064, + "grad_norm": 0.7227829098701477, + "learning_rate": 9.15001871337031e-06, + "loss": 0.7637, + "step": 6925 + }, + { + "epoch": 0.38119874511530627, + "grad_norm": 0.6568942666053772, + "learning_rate": 9.149776928379506e-06, + "loss": 0.6944, + "step": 6926 + }, + { + "epoch": 0.38125378391766196, + "grad_norm": 0.9317567944526672, + "learning_rate": 9.149535112200087e-06, + "loss": 0.8098, + "step": 6927 + }, + { + "epoch": 0.3813088227200176, + "grad_norm": 0.6374759674072266, + "learning_rate": 9.149293264833877e-06, + "loss": 0.6654, + "step": 6928 + }, + { + "epoch": 0.3813638615223733, + "grad_norm": 0.7276837825775146, + "learning_rate": 9.149051386282685e-06, + "loss": 0.7728, + "step": 6929 + }, + { + "epoch": 0.3814189003247289, + "grad_norm": 0.7573683261871338, + "learning_rate": 9.148809476548337e-06, + "loss": 0.7681, + "step": 6930 + }, + { + "epoch": 0.3814739391270846, + "grad_norm": 0.7535703778266907, + "learning_rate": 9.148567535632647e-06, + "loss": 0.8498, + "step": 6931 + }, + { + "epoch": 0.38152897792944024, + "grad_norm": 0.7510126233100891, + "learning_rate": 9.148325563537432e-06, + "loss": 0.7874, + "step": 6932 + }, + { + "epoch": 0.3815840167317959, + "grad_norm": 0.7809224724769592, + "learning_rate": 9.148083560264515e-06, + "loss": 0.7223, + "step": 6933 + }, + { + "epoch": 0.38163905553415156, + "grad_norm": 0.7433155179023743, + "learning_rate": 9.14784152581571e-06, + "loss": 0.7914, + "step": 6934 + }, + { + "epoch": 0.38169409433650725, + "grad_norm": 0.7142858505249023, + "learning_rate": 9.14759946019284e-06, + "loss": 0.781, + "step": 6935 + }, + { + "epoch": 0.3817491331388629, + "grad_norm": 0.7910202741622925, + "learning_rate": 9.147357363397721e-06, + "loss": 0.755, + "step": 6936 + }, + { + "epoch": 0.3818041719412186, + "grad_norm": 1.007727026939392, + "learning_rate": 9.147115235432176e-06, + "loss": 0.7809, + "step": 6937 + }, + { + "epoch": 0.3818592107435742, + "grad_norm": 0.7227005362510681, + "learning_rate": 9.146873076298024e-06, + "loss": 0.7276, + "step": 6938 + }, + { + "epoch": 0.3819142495459299, + "grad_norm": 0.6945967674255371, + "learning_rate": 9.146630885997081e-06, + "loss": 0.825, + "step": 6939 + }, + { + "epoch": 0.38196928834828553, + "grad_norm": 0.6719669103622437, + "learning_rate": 9.146388664531172e-06, + "loss": 0.6486, + "step": 6940 + }, + { + "epoch": 0.3820243271506412, + "grad_norm": 0.7528467178344727, + "learning_rate": 9.146146411902115e-06, + "loss": 0.8143, + "step": 6941 + }, + { + "epoch": 0.38207936595299685, + "grad_norm": 0.6835548877716064, + "learning_rate": 9.145904128111732e-06, + "loss": 0.7742, + "step": 6942 + }, + { + "epoch": 0.38213440475535254, + "grad_norm": 0.7829870581626892, + "learning_rate": 9.145661813161844e-06, + "loss": 0.8147, + "step": 6943 + }, + { + "epoch": 0.3821894435577082, + "grad_norm": 0.6833155155181885, + "learning_rate": 9.145419467054271e-06, + "loss": 0.7615, + "step": 6944 + }, + { + "epoch": 0.38224448236006386, + "grad_norm": 0.7577275037765503, + "learning_rate": 9.145177089790833e-06, + "loss": 0.8611, + "step": 6945 + }, + { + "epoch": 0.3822995211624195, + "grad_norm": 0.7102984189987183, + "learning_rate": 9.144934681373356e-06, + "loss": 0.8373, + "step": 6946 + }, + { + "epoch": 0.3823545599647752, + "grad_norm": 0.6906121373176575, + "learning_rate": 9.144692241803658e-06, + "loss": 0.8314, + "step": 6947 + }, + { + "epoch": 0.3824095987671308, + "grad_norm": 0.7790967226028442, + "learning_rate": 9.144449771083563e-06, + "loss": 0.8285, + "step": 6948 + }, + { + "epoch": 0.3824646375694865, + "grad_norm": 0.8420237898826599, + "learning_rate": 9.144207269214893e-06, + "loss": 0.8159, + "step": 6949 + }, + { + "epoch": 0.38251967637184214, + "grad_norm": 0.7944310307502747, + "learning_rate": 9.143964736199471e-06, + "loss": 0.7981, + "step": 6950 + }, + { + "epoch": 0.38257471517419783, + "grad_norm": 0.7610076069831848, + "learning_rate": 9.14372217203912e-06, + "loss": 0.8011, + "step": 6951 + }, + { + "epoch": 0.38262975397655347, + "grad_norm": 0.7183333039283752, + "learning_rate": 9.143479576735661e-06, + "loss": 0.7504, + "step": 6952 + }, + { + "epoch": 0.38268479277890916, + "grad_norm": 0.7363573312759399, + "learning_rate": 9.14323695029092e-06, + "loss": 0.7561, + "step": 6953 + }, + { + "epoch": 0.3827398315812648, + "grad_norm": 0.7330427765846252, + "learning_rate": 9.142994292706716e-06, + "loss": 0.754, + "step": 6954 + }, + { + "epoch": 0.3827948703836205, + "grad_norm": 0.8307509422302246, + "learning_rate": 9.142751603984879e-06, + "loss": 0.8059, + "step": 6955 + }, + { + "epoch": 0.3828499091859761, + "grad_norm": 0.7340347766876221, + "learning_rate": 9.142508884127228e-06, + "loss": 0.8636, + "step": 6956 + }, + { + "epoch": 0.3829049479883318, + "grad_norm": 0.7032678127288818, + "learning_rate": 9.14226613313559e-06, + "loss": 0.8237, + "step": 6957 + }, + { + "epoch": 0.38295998679068743, + "grad_norm": 0.769809365272522, + "learning_rate": 9.142023351011788e-06, + "loss": 0.7523, + "step": 6958 + }, + { + "epoch": 0.38301502559304307, + "grad_norm": 0.7446833252906799, + "learning_rate": 9.141780537757647e-06, + "loss": 0.8382, + "step": 6959 + }, + { + "epoch": 0.38307006439539876, + "grad_norm": 0.6926285028457642, + "learning_rate": 9.141537693374994e-06, + "loss": 0.7997, + "step": 6960 + }, + { + "epoch": 0.3831251031977544, + "grad_norm": 0.7303034067153931, + "learning_rate": 9.141294817865651e-06, + "loss": 0.794, + "step": 6961 + }, + { + "epoch": 0.3831801420001101, + "grad_norm": 0.7453297972679138, + "learning_rate": 9.141051911231445e-06, + "loss": 0.7031, + "step": 6962 + }, + { + "epoch": 0.3832351808024657, + "grad_norm": 0.8503912091255188, + "learning_rate": 9.140808973474201e-06, + "loss": 0.7855, + "step": 6963 + }, + { + "epoch": 0.3832902196048214, + "grad_norm": 0.7304036617279053, + "learning_rate": 9.140566004595746e-06, + "loss": 0.7062, + "step": 6964 + }, + { + "epoch": 0.38334525840717704, + "grad_norm": 0.7534968852996826, + "learning_rate": 9.140323004597904e-06, + "loss": 0.8138, + "step": 6965 + }, + { + "epoch": 0.3834002972095327, + "grad_norm": 0.8122013807296753, + "learning_rate": 9.140079973482503e-06, + "loss": 0.7769, + "step": 6966 + }, + { + "epoch": 0.38345533601188836, + "grad_norm": 0.7345744967460632, + "learning_rate": 9.13983691125137e-06, + "loss": 0.7588, + "step": 6967 + }, + { + "epoch": 0.38351037481424405, + "grad_norm": 0.7251620292663574, + "learning_rate": 9.13959381790633e-06, + "loss": 0.8027, + "step": 6968 + }, + { + "epoch": 0.3835654136165997, + "grad_norm": 0.7157594561576843, + "learning_rate": 9.139350693449212e-06, + "loss": 0.7233, + "step": 6969 + }, + { + "epoch": 0.38362045241895537, + "grad_norm": 0.8076621890068054, + "learning_rate": 9.139107537881842e-06, + "loss": 0.7256, + "step": 6970 + }, + { + "epoch": 0.383675491221311, + "grad_norm": 0.717182993888855, + "learning_rate": 9.138864351206047e-06, + "loss": 0.7003, + "step": 6971 + }, + { + "epoch": 0.3837305300236667, + "grad_norm": 0.7534194588661194, + "learning_rate": 9.138621133423656e-06, + "loss": 0.7315, + "step": 6972 + }, + { + "epoch": 0.3837855688260223, + "grad_norm": 0.6400160193443298, + "learning_rate": 9.138377884536494e-06, + "loss": 0.6814, + "step": 6973 + }, + { + "epoch": 0.383840607628378, + "grad_norm": 0.7319507002830505, + "learning_rate": 9.138134604546394e-06, + "loss": 0.7942, + "step": 6974 + }, + { + "epoch": 0.38389564643073365, + "grad_norm": 0.7109829783439636, + "learning_rate": 9.137891293455181e-06, + "loss": 0.7528, + "step": 6975 + }, + { + "epoch": 0.38395068523308934, + "grad_norm": 1.006724238395691, + "learning_rate": 9.137647951264685e-06, + "loss": 0.7652, + "step": 6976 + }, + { + "epoch": 0.384005724035445, + "grad_norm": 0.7080540060997009, + "learning_rate": 9.137404577976736e-06, + "loss": 0.7706, + "step": 6977 + }, + { + "epoch": 0.38406076283780066, + "grad_norm": 0.7551368474960327, + "learning_rate": 9.137161173593161e-06, + "loss": 0.8202, + "step": 6978 + }, + { + "epoch": 0.3841158016401563, + "grad_norm": 0.6624314785003662, + "learning_rate": 9.13691773811579e-06, + "loss": 0.7258, + "step": 6979 + }, + { + "epoch": 0.384170840442512, + "grad_norm": 0.9603848457336426, + "learning_rate": 9.136674271546451e-06, + "loss": 0.9415, + "step": 6980 + }, + { + "epoch": 0.3842258792448676, + "grad_norm": 0.6964829564094543, + "learning_rate": 9.136430773886977e-06, + "loss": 0.7604, + "step": 6981 + }, + { + "epoch": 0.3842809180472233, + "grad_norm": 0.6503588557243347, + "learning_rate": 9.136187245139197e-06, + "loss": 0.7141, + "step": 6982 + }, + { + "epoch": 0.38433595684957894, + "grad_norm": 0.9179829359054565, + "learning_rate": 9.13594368530494e-06, + "loss": 0.7619, + "step": 6983 + }, + { + "epoch": 0.38439099565193463, + "grad_norm": 0.7993278503417969, + "learning_rate": 9.135700094386038e-06, + "loss": 0.832, + "step": 6984 + }, + { + "epoch": 0.38444603445429026, + "grad_norm": 0.8136988282203674, + "learning_rate": 9.13545647238432e-06, + "loss": 0.8127, + "step": 6985 + }, + { + "epoch": 0.38450107325664595, + "grad_norm": 0.9918104410171509, + "learning_rate": 9.135212819301619e-06, + "loss": 0.836, + "step": 6986 + }, + { + "epoch": 0.3845561120590016, + "grad_norm": 0.7767511010169983, + "learning_rate": 9.134969135139765e-06, + "loss": 0.8391, + "step": 6987 + }, + { + "epoch": 0.3846111508613573, + "grad_norm": 0.6889285445213318, + "learning_rate": 9.134725419900589e-06, + "loss": 0.7639, + "step": 6988 + }, + { + "epoch": 0.3846661896637129, + "grad_norm": 1.803467035293579, + "learning_rate": 9.134481673585924e-06, + "loss": 0.7629, + "step": 6989 + }, + { + "epoch": 0.3847212284660686, + "grad_norm": 0.721581757068634, + "learning_rate": 9.134237896197603e-06, + "loss": 0.8194, + "step": 6990 + }, + { + "epoch": 0.38477626726842423, + "grad_norm": 0.8163189888000488, + "learning_rate": 9.133994087737456e-06, + "loss": 0.7789, + "step": 6991 + }, + { + "epoch": 0.3848313060707799, + "grad_norm": 0.7518420815467834, + "learning_rate": 9.133750248207315e-06, + "loss": 0.7529, + "step": 6992 + }, + { + "epoch": 0.38488634487313556, + "grad_norm": 0.7318000197410583, + "learning_rate": 9.133506377609015e-06, + "loss": 0.7829, + "step": 6993 + }, + { + "epoch": 0.38494138367549124, + "grad_norm": 0.7765058875083923, + "learning_rate": 9.133262475944386e-06, + "loss": 0.7902, + "step": 6994 + }, + { + "epoch": 0.3849964224778469, + "grad_norm": 0.845567524433136, + "learning_rate": 9.133018543215265e-06, + "loss": 0.8117, + "step": 6995 + }, + { + "epoch": 0.38505146128020257, + "grad_norm": 0.7081887125968933, + "learning_rate": 9.13277457942348e-06, + "loss": 0.8131, + "step": 6996 + }, + { + "epoch": 0.3851065000825582, + "grad_norm": 0.7447869777679443, + "learning_rate": 9.132530584570869e-06, + "loss": 0.7765, + "step": 6997 + }, + { + "epoch": 0.3851615388849139, + "grad_norm": 0.8554795384407043, + "learning_rate": 9.132286558659265e-06, + "loss": 0.8966, + "step": 6998 + }, + { + "epoch": 0.3852165776872695, + "grad_norm": 0.7117023468017578, + "learning_rate": 9.1320425016905e-06, + "loss": 0.7461, + "step": 6999 + }, + { + "epoch": 0.3852716164896252, + "grad_norm": 0.6965934038162231, + "learning_rate": 9.131798413666411e-06, + "loss": 0.6827, + "step": 7000 + }, + { + "epoch": 0.38532665529198085, + "grad_norm": 0.7449018359184265, + "learning_rate": 9.13155429458883e-06, + "loss": 0.7562, + "step": 7001 + }, + { + "epoch": 0.3853816940943365, + "grad_norm": 0.7764221429824829, + "learning_rate": 9.131310144459593e-06, + "loss": 0.7842, + "step": 7002 + }, + { + "epoch": 0.38543673289669217, + "grad_norm": 0.9788658618927002, + "learning_rate": 9.131065963280536e-06, + "loss": 0.7857, + "step": 7003 + }, + { + "epoch": 0.3854917716990478, + "grad_norm": 0.7900908589363098, + "learning_rate": 9.13082175105349e-06, + "loss": 0.8733, + "step": 7004 + }, + { + "epoch": 0.3855468105014035, + "grad_norm": 0.814822793006897, + "learning_rate": 9.130577507780298e-06, + "loss": 0.8032, + "step": 7005 + }, + { + "epoch": 0.3856018493037591, + "grad_norm": 1.0648475885391235, + "learning_rate": 9.130333233462789e-06, + "loss": 0.8078, + "step": 7006 + }, + { + "epoch": 0.3856568881061148, + "grad_norm": 0.7359917163848877, + "learning_rate": 9.130088928102799e-06, + "loss": 0.6491, + "step": 7007 + }, + { + "epoch": 0.38571192690847045, + "grad_norm": 0.7321771383285522, + "learning_rate": 9.129844591702169e-06, + "loss": 0.7663, + "step": 7008 + }, + { + "epoch": 0.38576696571082614, + "grad_norm": 0.6937146186828613, + "learning_rate": 9.129600224262732e-06, + "loss": 0.7835, + "step": 7009 + }, + { + "epoch": 0.38582200451318177, + "grad_norm": 0.7330107688903809, + "learning_rate": 9.129355825786323e-06, + "loss": 0.7626, + "step": 7010 + }, + { + "epoch": 0.38587704331553746, + "grad_norm": 0.7021715044975281, + "learning_rate": 9.129111396274783e-06, + "loss": 0.7115, + "step": 7011 + }, + { + "epoch": 0.3859320821178931, + "grad_norm": 0.6599563360214233, + "learning_rate": 9.128866935729947e-06, + "loss": 0.6554, + "step": 7012 + }, + { + "epoch": 0.3859871209202488, + "grad_norm": 0.7323513031005859, + "learning_rate": 9.128622444153652e-06, + "loss": 0.7392, + "step": 7013 + }, + { + "epoch": 0.3860421597226044, + "grad_norm": 0.681888222694397, + "learning_rate": 9.128377921547736e-06, + "loss": 0.7474, + "step": 7014 + }, + { + "epoch": 0.3860971985249601, + "grad_norm": 0.8454889059066772, + "learning_rate": 9.128133367914036e-06, + "loss": 0.8355, + "step": 7015 + }, + { + "epoch": 0.38615223732731574, + "grad_norm": 0.7514123916625977, + "learning_rate": 9.12788878325439e-06, + "loss": 0.7683, + "step": 7016 + }, + { + "epoch": 0.38620727612967143, + "grad_norm": 0.7317092418670654, + "learning_rate": 9.12764416757064e-06, + "loss": 0.7201, + "step": 7017 + }, + { + "epoch": 0.38626231493202706, + "grad_norm": 0.7626729011535645, + "learning_rate": 9.127399520864619e-06, + "loss": 0.7701, + "step": 7018 + }, + { + "epoch": 0.38631735373438275, + "grad_norm": 0.9790363311767578, + "learning_rate": 9.127154843138168e-06, + "loss": 0.8034, + "step": 7019 + }, + { + "epoch": 0.3863723925367384, + "grad_norm": 0.663593590259552, + "learning_rate": 9.126910134393125e-06, + "loss": 0.661, + "step": 7020 + }, + { + "epoch": 0.3864274313390941, + "grad_norm": 0.6599924564361572, + "learning_rate": 9.126665394631332e-06, + "loss": 0.7395, + "step": 7021 + }, + { + "epoch": 0.3864824701414497, + "grad_norm": 0.8493411540985107, + "learning_rate": 9.126420623854625e-06, + "loss": 0.8008, + "step": 7022 + }, + { + "epoch": 0.3865375089438054, + "grad_norm": 0.7587194442749023, + "learning_rate": 9.126175822064846e-06, + "loss": 0.7533, + "step": 7023 + }, + { + "epoch": 0.38659254774616103, + "grad_norm": 0.773764431476593, + "learning_rate": 9.125930989263835e-06, + "loss": 0.75, + "step": 7024 + }, + { + "epoch": 0.3866475865485167, + "grad_norm": 0.7126749753952026, + "learning_rate": 9.12568612545343e-06, + "loss": 0.7794, + "step": 7025 + }, + { + "epoch": 0.38670262535087235, + "grad_norm": 0.7404584884643555, + "learning_rate": 9.125441230635472e-06, + "loss": 0.7264, + "step": 7026 + }, + { + "epoch": 0.38675766415322804, + "grad_norm": 0.8057644367218018, + "learning_rate": 9.125196304811804e-06, + "loss": 0.8058, + "step": 7027 + }, + { + "epoch": 0.3868127029555837, + "grad_norm": 0.9586995840072632, + "learning_rate": 9.124951347984263e-06, + "loss": 0.7659, + "step": 7028 + }, + { + "epoch": 0.38686774175793937, + "grad_norm": 0.7567793726921082, + "learning_rate": 9.124706360154693e-06, + "loss": 0.8961, + "step": 7029 + }, + { + "epoch": 0.386922780560295, + "grad_norm": 0.8523182272911072, + "learning_rate": 9.124461341324934e-06, + "loss": 0.8815, + "step": 7030 + }, + { + "epoch": 0.3869778193626507, + "grad_norm": 0.7466379404067993, + "learning_rate": 9.124216291496826e-06, + "loss": 0.7817, + "step": 7031 + }, + { + "epoch": 0.3870328581650063, + "grad_norm": 0.6721325516700745, + "learning_rate": 9.123971210672214e-06, + "loss": 0.7637, + "step": 7032 + }, + { + "epoch": 0.387087896967362, + "grad_norm": 0.6620928049087524, + "learning_rate": 9.123726098852936e-06, + "loss": 0.6956, + "step": 7033 + }, + { + "epoch": 0.38714293576971764, + "grad_norm": 0.6784290671348572, + "learning_rate": 9.12348095604084e-06, + "loss": 0.7034, + "step": 7034 + }, + { + "epoch": 0.38719797457207333, + "grad_norm": 0.7138848304748535, + "learning_rate": 9.123235782237763e-06, + "loss": 0.6037, + "step": 7035 + }, + { + "epoch": 0.38725301337442897, + "grad_norm": 0.8473613858222961, + "learning_rate": 9.122990577445548e-06, + "loss": 0.8157, + "step": 7036 + }, + { + "epoch": 0.38730805217678466, + "grad_norm": 0.835381031036377, + "learning_rate": 9.122745341666041e-06, + "loss": 0.8736, + "step": 7037 + }, + { + "epoch": 0.3873630909791403, + "grad_norm": 0.8823271989822388, + "learning_rate": 9.122500074901083e-06, + "loss": 0.7448, + "step": 7038 + }, + { + "epoch": 0.387418129781496, + "grad_norm": 0.6494244933128357, + "learning_rate": 9.122254777152519e-06, + "loss": 0.7423, + "step": 7039 + }, + { + "epoch": 0.3874731685838516, + "grad_norm": 0.7232181429862976, + "learning_rate": 9.122009448422191e-06, + "loss": 0.8489, + "step": 7040 + }, + { + "epoch": 0.3875282073862073, + "grad_norm": 0.7357699275016785, + "learning_rate": 9.121764088711945e-06, + "loss": 0.8799, + "step": 7041 + }, + { + "epoch": 0.38758324618856294, + "grad_norm": 0.7638574838638306, + "learning_rate": 9.121518698023621e-06, + "loss": 0.8539, + "step": 7042 + }, + { + "epoch": 0.3876382849909186, + "grad_norm": 0.7407062649726868, + "learning_rate": 9.121273276359068e-06, + "loss": 0.7152, + "step": 7043 + }, + { + "epoch": 0.38769332379327426, + "grad_norm": 0.6945983171463013, + "learning_rate": 9.121027823720126e-06, + "loss": 0.8224, + "step": 7044 + }, + { + "epoch": 0.3877483625956299, + "grad_norm": 0.7163639068603516, + "learning_rate": 9.120782340108643e-06, + "loss": 0.808, + "step": 7045 + }, + { + "epoch": 0.3878034013979856, + "grad_norm": 0.7062035799026489, + "learning_rate": 9.120536825526463e-06, + "loss": 0.783, + "step": 7046 + }, + { + "epoch": 0.3878584402003412, + "grad_norm": 0.7459971308708191, + "learning_rate": 9.120291279975431e-06, + "loss": 0.8219, + "step": 7047 + }, + { + "epoch": 0.3879134790026969, + "grad_norm": 0.9016150236129761, + "learning_rate": 9.120045703457394e-06, + "loss": 0.8605, + "step": 7048 + }, + { + "epoch": 0.38796851780505254, + "grad_norm": 0.78440922498703, + "learning_rate": 9.119800095974193e-06, + "loss": 0.8424, + "step": 7049 + }, + { + "epoch": 0.3880235566074082, + "grad_norm": 0.751504123210907, + "learning_rate": 9.119554457527681e-06, + "loss": 0.701, + "step": 7050 + }, + { + "epoch": 0.38807859540976386, + "grad_norm": 0.7540284991264343, + "learning_rate": 9.119308788119698e-06, + "loss": 0.7912, + "step": 7051 + }, + { + "epoch": 0.38813363421211955, + "grad_norm": 0.7977007627487183, + "learning_rate": 9.119063087752094e-06, + "loss": 0.9297, + "step": 7052 + }, + { + "epoch": 0.3881886730144752, + "grad_norm": 0.6923508644104004, + "learning_rate": 9.118817356426715e-06, + "loss": 0.7458, + "step": 7053 + }, + { + "epoch": 0.38824371181683087, + "grad_norm": 0.7170272469520569, + "learning_rate": 9.118571594145406e-06, + "loss": 0.733, + "step": 7054 + }, + { + "epoch": 0.3882987506191865, + "grad_norm": 0.7547701001167297, + "learning_rate": 9.118325800910015e-06, + "loss": 0.7758, + "step": 7055 + }, + { + "epoch": 0.3883537894215422, + "grad_norm": 0.7921421527862549, + "learning_rate": 9.118079976722391e-06, + "loss": 0.8262, + "step": 7056 + }, + { + "epoch": 0.38840882822389783, + "grad_norm": 0.734470784664154, + "learning_rate": 9.117834121584379e-06, + "loss": 0.817, + "step": 7057 + }, + { + "epoch": 0.3884638670262535, + "grad_norm": 0.8106420040130615, + "learning_rate": 9.117588235497829e-06, + "loss": 0.8203, + "step": 7058 + }, + { + "epoch": 0.38851890582860915, + "grad_norm": 0.7355543375015259, + "learning_rate": 9.117342318464588e-06, + "loss": 0.8076, + "step": 7059 + }, + { + "epoch": 0.38857394463096484, + "grad_norm": 0.7665252685546875, + "learning_rate": 9.117096370486504e-06, + "loss": 0.7611, + "step": 7060 + }, + { + "epoch": 0.3886289834333205, + "grad_norm": 0.7968598008155823, + "learning_rate": 9.116850391565426e-06, + "loss": 0.6461, + "step": 7061 + }, + { + "epoch": 0.38868402223567616, + "grad_norm": 0.7187741994857788, + "learning_rate": 9.116604381703203e-06, + "loss": 0.7982, + "step": 7062 + }, + { + "epoch": 0.3887390610380318, + "grad_norm": 0.8566913604736328, + "learning_rate": 9.11635834090168e-06, + "loss": 0.9072, + "step": 7063 + }, + { + "epoch": 0.3887940998403875, + "grad_norm": 0.7120797038078308, + "learning_rate": 9.116112269162714e-06, + "loss": 0.7353, + "step": 7064 + }, + { + "epoch": 0.3888491386427431, + "grad_norm": 0.7230019569396973, + "learning_rate": 9.115866166488148e-06, + "loss": 0.7717, + "step": 7065 + }, + { + "epoch": 0.3889041774450988, + "grad_norm": 0.6650584936141968, + "learning_rate": 9.115620032879833e-06, + "loss": 0.7384, + "step": 7066 + }, + { + "epoch": 0.38895921624745444, + "grad_norm": 0.970750629901886, + "learning_rate": 9.115373868339621e-06, + "loss": 0.8478, + "step": 7067 + }, + { + "epoch": 0.38901425504981013, + "grad_norm": 0.7066280245780945, + "learning_rate": 9.115127672869359e-06, + "loss": 0.7638, + "step": 7068 + }, + { + "epoch": 0.38906929385216577, + "grad_norm": 0.6952232718467712, + "learning_rate": 9.1148814464709e-06, + "loss": 0.7869, + "step": 7069 + }, + { + "epoch": 0.38912433265452145, + "grad_norm": 0.804489254951477, + "learning_rate": 9.114635189146094e-06, + "loss": 0.7905, + "step": 7070 + }, + { + "epoch": 0.3891793714568771, + "grad_norm": 0.6988457441329956, + "learning_rate": 9.114388900896791e-06, + "loss": 0.7107, + "step": 7071 + }, + { + "epoch": 0.3892344102592328, + "grad_norm": 0.6379980444908142, + "learning_rate": 9.114142581724842e-06, + "loss": 0.733, + "step": 7072 + }, + { + "epoch": 0.3892894490615884, + "grad_norm": 0.7238649129867554, + "learning_rate": 9.113896231632098e-06, + "loss": 0.8252, + "step": 7073 + }, + { + "epoch": 0.3893444878639441, + "grad_norm": 0.7168585062026978, + "learning_rate": 9.113649850620412e-06, + "loss": 0.6459, + "step": 7074 + }, + { + "epoch": 0.38939952666629973, + "grad_norm": 0.7315915822982788, + "learning_rate": 9.113403438691634e-06, + "loss": 0.7557, + "step": 7075 + }, + { + "epoch": 0.3894545654686554, + "grad_norm": 0.7438754439353943, + "learning_rate": 9.11315699584762e-06, + "loss": 0.7938, + "step": 7076 + }, + { + "epoch": 0.38950960427101106, + "grad_norm": 0.7497848272323608, + "learning_rate": 9.112910522090215e-06, + "loss": 0.8232, + "step": 7077 + }, + { + "epoch": 0.38956464307336675, + "grad_norm": 0.8072896003723145, + "learning_rate": 9.112664017421277e-06, + "loss": 0.7974, + "step": 7078 + }, + { + "epoch": 0.3896196818757224, + "grad_norm": 0.7255920767784119, + "learning_rate": 9.112417481842657e-06, + "loss": 0.7658, + "step": 7079 + }, + { + "epoch": 0.38967472067807807, + "grad_norm": 0.6263132095336914, + "learning_rate": 9.112170915356209e-06, + "loss": 0.7188, + "step": 7080 + }, + { + "epoch": 0.3897297594804337, + "grad_norm": 0.6817660927772522, + "learning_rate": 9.111924317963785e-06, + "loss": 0.7406, + "step": 7081 + }, + { + "epoch": 0.3897847982827894, + "grad_norm": 0.7829134464263916, + "learning_rate": 9.111677689667238e-06, + "loss": 0.8406, + "step": 7082 + }, + { + "epoch": 0.389839837085145, + "grad_norm": 0.7122843861579895, + "learning_rate": 9.111431030468421e-06, + "loss": 0.7722, + "step": 7083 + }, + { + "epoch": 0.3898948758875007, + "grad_norm": 0.7041764259338379, + "learning_rate": 9.11118434036919e-06, + "loss": 0.8307, + "step": 7084 + }, + { + "epoch": 0.38994991468985635, + "grad_norm": 0.7582009434700012, + "learning_rate": 9.110937619371398e-06, + "loss": 0.7461, + "step": 7085 + }, + { + "epoch": 0.39000495349221204, + "grad_norm": 0.7156100273132324, + "learning_rate": 9.110690867476899e-06, + "loss": 0.7294, + "step": 7086 + }, + { + "epoch": 0.39005999229456767, + "grad_norm": 0.79449063539505, + "learning_rate": 9.110444084687549e-06, + "loss": 0.8652, + "step": 7087 + }, + { + "epoch": 0.3901150310969233, + "grad_norm": 0.7692831754684448, + "learning_rate": 9.1101972710052e-06, + "loss": 0.7899, + "step": 7088 + }, + { + "epoch": 0.390170069899279, + "grad_norm": 0.7189639806747437, + "learning_rate": 9.109950426431708e-06, + "loss": 0.726, + "step": 7089 + }, + { + "epoch": 0.3902251087016346, + "grad_norm": 0.7491177916526794, + "learning_rate": 9.10970355096893e-06, + "loss": 0.8881, + "step": 7090 + }, + { + "epoch": 0.3902801475039903, + "grad_norm": 0.783027172088623, + "learning_rate": 9.10945664461872e-06, + "loss": 0.7728, + "step": 7091 + }, + { + "epoch": 0.39033518630634595, + "grad_norm": 1.0871556997299194, + "learning_rate": 9.109209707382934e-06, + "loss": 0.8059, + "step": 7092 + }, + { + "epoch": 0.39039022510870164, + "grad_norm": 0.7287113666534424, + "learning_rate": 9.108962739263429e-06, + "loss": 0.7896, + "step": 7093 + }, + { + "epoch": 0.39044526391105727, + "grad_norm": 0.7801700234413147, + "learning_rate": 9.108715740262058e-06, + "loss": 0.8012, + "step": 7094 + }, + { + "epoch": 0.39050030271341296, + "grad_norm": 0.846709132194519, + "learning_rate": 9.10846871038068e-06, + "loss": 0.8392, + "step": 7095 + }, + { + "epoch": 0.3905553415157686, + "grad_norm": 0.7408092617988586, + "learning_rate": 9.10822164962115e-06, + "loss": 0.8657, + "step": 7096 + }, + { + "epoch": 0.3906103803181243, + "grad_norm": 0.6748743057250977, + "learning_rate": 9.107974557985328e-06, + "loss": 0.7659, + "step": 7097 + }, + { + "epoch": 0.3906654191204799, + "grad_norm": 0.7512170672416687, + "learning_rate": 9.107727435475067e-06, + "loss": 0.7704, + "step": 7098 + }, + { + "epoch": 0.3907204579228356, + "grad_norm": 0.9039596319198608, + "learning_rate": 9.107480282092227e-06, + "loss": 0.8412, + "step": 7099 + }, + { + "epoch": 0.39077549672519124, + "grad_norm": 0.829785943031311, + "learning_rate": 9.107233097838663e-06, + "loss": 0.8229, + "step": 7100 + }, + { + "epoch": 0.39083053552754693, + "grad_norm": 0.7597842812538147, + "learning_rate": 9.106985882716238e-06, + "loss": 0.7798, + "step": 7101 + }, + { + "epoch": 0.39088557432990256, + "grad_norm": 0.7619945406913757, + "learning_rate": 9.106738636726802e-06, + "loss": 0.7504, + "step": 7102 + }, + { + "epoch": 0.39094061313225825, + "grad_norm": 0.6791092157363892, + "learning_rate": 9.10649135987222e-06, + "loss": 0.8167, + "step": 7103 + }, + { + "epoch": 0.3909956519346139, + "grad_norm": 0.7977412343025208, + "learning_rate": 9.10624405215435e-06, + "loss": 0.8252, + "step": 7104 + }, + { + "epoch": 0.3910506907369696, + "grad_norm": 0.7329283356666565, + "learning_rate": 9.105996713575047e-06, + "loss": 0.7084, + "step": 7105 + }, + { + "epoch": 0.3911057295393252, + "grad_norm": 0.7125133872032166, + "learning_rate": 9.105749344136172e-06, + "loss": 0.6672, + "step": 7106 + }, + { + "epoch": 0.3911607683416809, + "grad_norm": 0.6974679827690125, + "learning_rate": 9.105501943839583e-06, + "loss": 0.7354, + "step": 7107 + }, + { + "epoch": 0.39121580714403653, + "grad_norm": 0.7191265225410461, + "learning_rate": 9.10525451268714e-06, + "loss": 0.8133, + "step": 7108 + }, + { + "epoch": 0.3912708459463922, + "grad_norm": 0.7188206911087036, + "learning_rate": 9.105007050680704e-06, + "loss": 0.7947, + "step": 7109 + }, + { + "epoch": 0.39132588474874785, + "grad_norm": 0.9017364382743835, + "learning_rate": 9.104759557822135e-06, + "loss": 0.7848, + "step": 7110 + }, + { + "epoch": 0.39138092355110354, + "grad_norm": 0.7551164031028748, + "learning_rate": 9.104512034113292e-06, + "loss": 0.8266, + "step": 7111 + }, + { + "epoch": 0.3914359623534592, + "grad_norm": 0.7810001969337463, + "learning_rate": 9.104264479556033e-06, + "loss": 0.7731, + "step": 7112 + }, + { + "epoch": 0.39149100115581487, + "grad_norm": 0.787723183631897, + "learning_rate": 9.104016894152223e-06, + "loss": 0.8008, + "step": 7113 + }, + { + "epoch": 0.3915460399581705, + "grad_norm": 0.7303524017333984, + "learning_rate": 9.103769277903718e-06, + "loss": 0.826, + "step": 7114 + }, + { + "epoch": 0.3916010787605262, + "grad_norm": 0.707759439945221, + "learning_rate": 9.103521630812384e-06, + "loss": 0.6303, + "step": 7115 + }, + { + "epoch": 0.3916561175628818, + "grad_norm": 0.6929940581321716, + "learning_rate": 9.10327395288008e-06, + "loss": 0.733, + "step": 7116 + }, + { + "epoch": 0.3917111563652375, + "grad_norm": 0.7133205533027649, + "learning_rate": 9.103026244108667e-06, + "loss": 0.8421, + "step": 7117 + }, + { + "epoch": 0.39176619516759315, + "grad_norm": 1.2049434185028076, + "learning_rate": 9.102778504500005e-06, + "loss": 0.8618, + "step": 7118 + }, + { + "epoch": 0.39182123396994883, + "grad_norm": 0.7792720198631287, + "learning_rate": 9.10253073405596e-06, + "loss": 0.717, + "step": 7119 + }, + { + "epoch": 0.39187627277230447, + "grad_norm": 0.7234412431716919, + "learning_rate": 9.10228293277839e-06, + "loss": 0.7547, + "step": 7120 + }, + { + "epoch": 0.39193131157466016, + "grad_norm": 0.6845420002937317, + "learning_rate": 9.102035100669162e-06, + "loss": 0.7255, + "step": 7121 + }, + { + "epoch": 0.3919863503770158, + "grad_norm": 0.7446799874305725, + "learning_rate": 9.101787237730135e-06, + "loss": 0.7947, + "step": 7122 + }, + { + "epoch": 0.3920413891793715, + "grad_norm": 0.812924325466156, + "learning_rate": 9.101539343963176e-06, + "loss": 0.843, + "step": 7123 + }, + { + "epoch": 0.3920964279817271, + "grad_norm": 0.7373847365379333, + "learning_rate": 9.101291419370141e-06, + "loss": 0.7703, + "step": 7124 + }, + { + "epoch": 0.3921514667840828, + "grad_norm": 0.8305120468139648, + "learning_rate": 9.101043463952899e-06, + "loss": 0.8904, + "step": 7125 + }, + { + "epoch": 0.39220650558643844, + "grad_norm": 0.7263030409812927, + "learning_rate": 9.100795477713313e-06, + "loss": 0.8319, + "step": 7126 + }, + { + "epoch": 0.3922615443887941, + "grad_norm": 0.8358581066131592, + "learning_rate": 9.100547460653245e-06, + "loss": 0.8305, + "step": 7127 + }, + { + "epoch": 0.39231658319114976, + "grad_norm": 0.6608800292015076, + "learning_rate": 9.10029941277456e-06, + "loss": 0.7815, + "step": 7128 + }, + { + "epoch": 0.39237162199350545, + "grad_norm": 0.8590257167816162, + "learning_rate": 9.100051334079122e-06, + "loss": 0.8292, + "step": 7129 + }, + { + "epoch": 0.3924266607958611, + "grad_norm": 0.6241755485534668, + "learning_rate": 9.099803224568797e-06, + "loss": 0.6568, + "step": 7130 + }, + { + "epoch": 0.3924816995982167, + "grad_norm": 0.7298059463500977, + "learning_rate": 9.099555084245447e-06, + "loss": 0.727, + "step": 7131 + }, + { + "epoch": 0.3925367384005724, + "grad_norm": 0.7741055488586426, + "learning_rate": 9.099306913110939e-06, + "loss": 0.8481, + "step": 7132 + }, + { + "epoch": 0.39259177720292804, + "grad_norm": 0.9674170613288879, + "learning_rate": 9.099058711167137e-06, + "loss": 0.8507, + "step": 7133 + }, + { + "epoch": 0.3926468160052837, + "grad_norm": 0.7285159826278687, + "learning_rate": 9.098810478415907e-06, + "loss": 0.766, + "step": 7134 + }, + { + "epoch": 0.39270185480763936, + "grad_norm": 0.7215660810470581, + "learning_rate": 9.098562214859115e-06, + "loss": 0.794, + "step": 7135 + }, + { + "epoch": 0.39275689360999505, + "grad_norm": 0.764437735080719, + "learning_rate": 9.098313920498627e-06, + "loss": 0.8228, + "step": 7136 + }, + { + "epoch": 0.3928119324123507, + "grad_norm": 0.7222796082496643, + "learning_rate": 9.098065595336309e-06, + "loss": 0.8064, + "step": 7137 + }, + { + "epoch": 0.3928669712147064, + "grad_norm": 0.7044625878334045, + "learning_rate": 9.097817239374024e-06, + "loss": 0.8017, + "step": 7138 + }, + { + "epoch": 0.392922010017062, + "grad_norm": 0.7929979562759399, + "learning_rate": 9.097568852613646e-06, + "loss": 0.7527, + "step": 7139 + }, + { + "epoch": 0.3929770488194177, + "grad_norm": 0.7833721041679382, + "learning_rate": 9.097320435057033e-06, + "loss": 0.8335, + "step": 7140 + }, + { + "epoch": 0.39303208762177333, + "grad_norm": 0.8365728259086609, + "learning_rate": 9.097071986706058e-06, + "loss": 0.6439, + "step": 7141 + }, + { + "epoch": 0.393087126424129, + "grad_norm": 0.7547842264175415, + "learning_rate": 9.096823507562588e-06, + "loss": 0.8316, + "step": 7142 + }, + { + "epoch": 0.39314216522648465, + "grad_norm": 0.6598891019821167, + "learning_rate": 9.09657499762849e-06, + "loss": 0.6547, + "step": 7143 + }, + { + "epoch": 0.39319720402884034, + "grad_norm": 0.7913638949394226, + "learning_rate": 9.096326456905627e-06, + "loss": 0.7964, + "step": 7144 + }, + { + "epoch": 0.393252242831196, + "grad_norm": 0.6927905082702637, + "learning_rate": 9.096077885395874e-06, + "loss": 0.7836, + "step": 7145 + }, + { + "epoch": 0.39330728163355166, + "grad_norm": 0.7505417466163635, + "learning_rate": 9.095829283101094e-06, + "loss": 0.7707, + "step": 7146 + }, + { + "epoch": 0.3933623204359073, + "grad_norm": 0.8797083497047424, + "learning_rate": 9.095580650023158e-06, + "loss": 0.866, + "step": 7147 + }, + { + "epoch": 0.393417359238263, + "grad_norm": 0.7023645639419556, + "learning_rate": 9.095331986163935e-06, + "loss": 0.7013, + "step": 7148 + }, + { + "epoch": 0.3934723980406186, + "grad_norm": 0.697354793548584, + "learning_rate": 9.095083291525293e-06, + "loss": 0.7691, + "step": 7149 + }, + { + "epoch": 0.3935274368429743, + "grad_norm": 0.7211105227470398, + "learning_rate": 9.094834566109101e-06, + "loss": 0.6816, + "step": 7150 + }, + { + "epoch": 0.39358247564532994, + "grad_norm": 0.8593278527259827, + "learning_rate": 9.094585809917227e-06, + "loss": 0.915, + "step": 7151 + }, + { + "epoch": 0.39363751444768563, + "grad_norm": 0.7406070828437805, + "learning_rate": 9.094337022951545e-06, + "loss": 0.7825, + "step": 7152 + }, + { + "epoch": 0.39369255325004127, + "grad_norm": 0.7644504308700562, + "learning_rate": 9.09408820521392e-06, + "loss": 0.6796, + "step": 7153 + }, + { + "epoch": 0.39374759205239696, + "grad_norm": 0.8239033222198486, + "learning_rate": 9.093839356706224e-06, + "loss": 0.8396, + "step": 7154 + }, + { + "epoch": 0.3938026308547526, + "grad_norm": 0.6433991193771362, + "learning_rate": 9.093590477430327e-06, + "loss": 0.6941, + "step": 7155 + }, + { + "epoch": 0.3938576696571083, + "grad_norm": 0.6979972124099731, + "learning_rate": 9.093341567388102e-06, + "loss": 0.8142, + "step": 7156 + }, + { + "epoch": 0.3939127084594639, + "grad_norm": 0.7062026262283325, + "learning_rate": 9.093092626581414e-06, + "loss": 0.804, + "step": 7157 + }, + { + "epoch": 0.3939677472618196, + "grad_norm": 0.7070814967155457, + "learning_rate": 9.09284365501214e-06, + "loss": 0.765, + "step": 7158 + }, + { + "epoch": 0.39402278606417523, + "grad_norm": 0.8577908873558044, + "learning_rate": 9.092594652682147e-06, + "loss": 0.7074, + "step": 7159 + }, + { + "epoch": 0.3940778248665309, + "grad_norm": 0.7386197447776794, + "learning_rate": 9.092345619593309e-06, + "loss": 0.7629, + "step": 7160 + }, + { + "epoch": 0.39413286366888656, + "grad_norm": 0.8048123121261597, + "learning_rate": 9.092096555747496e-06, + "loss": 0.9225, + "step": 7161 + }, + { + "epoch": 0.39418790247124225, + "grad_norm": 0.7479888200759888, + "learning_rate": 9.091847461146582e-06, + "loss": 0.7284, + "step": 7162 + }, + { + "epoch": 0.3942429412735979, + "grad_norm": 0.7448734045028687, + "learning_rate": 9.091598335792438e-06, + "loss": 0.8694, + "step": 7163 + }, + { + "epoch": 0.39429798007595357, + "grad_norm": 0.7511261701583862, + "learning_rate": 9.091349179686935e-06, + "loss": 0.7822, + "step": 7164 + }, + { + "epoch": 0.3943530188783092, + "grad_norm": 0.7079344391822815, + "learning_rate": 9.091099992831946e-06, + "loss": 0.7238, + "step": 7165 + }, + { + "epoch": 0.3944080576806649, + "grad_norm": 0.7007229328155518, + "learning_rate": 9.090850775229347e-06, + "loss": 0.7269, + "step": 7166 + }, + { + "epoch": 0.3944630964830205, + "grad_norm": 0.769800066947937, + "learning_rate": 9.090601526881007e-06, + "loss": 0.7894, + "step": 7167 + }, + { + "epoch": 0.3945181352853762, + "grad_norm": 0.7211676836013794, + "learning_rate": 9.090352247788801e-06, + "loss": 0.7998, + "step": 7168 + }, + { + "epoch": 0.39457317408773185, + "grad_norm": 0.6784254312515259, + "learning_rate": 9.090102937954602e-06, + "loss": 0.7576, + "step": 7169 + }, + { + "epoch": 0.39462821289008754, + "grad_norm": 0.7696946859359741, + "learning_rate": 9.089853597380285e-06, + "loss": 0.8395, + "step": 7170 + }, + { + "epoch": 0.39468325169244317, + "grad_norm": 0.8720405697822571, + "learning_rate": 9.089604226067723e-06, + "loss": 0.8971, + "step": 7171 + }, + { + "epoch": 0.39473829049479886, + "grad_norm": 0.8457947373390198, + "learning_rate": 9.08935482401879e-06, + "loss": 0.7002, + "step": 7172 + }, + { + "epoch": 0.3947933292971545, + "grad_norm": 0.8181997537612915, + "learning_rate": 9.089105391235361e-06, + "loss": 0.8949, + "step": 7173 + }, + { + "epoch": 0.3948483680995101, + "grad_norm": 0.7717136144638062, + "learning_rate": 9.08885592771931e-06, + "loss": 0.829, + "step": 7174 + }, + { + "epoch": 0.3949034069018658, + "grad_norm": 0.6941567063331604, + "learning_rate": 9.088606433472514e-06, + "loss": 0.7592, + "step": 7175 + }, + { + "epoch": 0.39495844570422145, + "grad_norm": 0.7358599901199341, + "learning_rate": 9.088356908496845e-06, + "loss": 0.8657, + "step": 7176 + }, + { + "epoch": 0.39501348450657714, + "grad_norm": 1.1329307556152344, + "learning_rate": 9.08810735279418e-06, + "loss": 0.8307, + "step": 7177 + }, + { + "epoch": 0.3950685233089328, + "grad_norm": 0.7011532187461853, + "learning_rate": 9.087857766366395e-06, + "loss": 0.7487, + "step": 7178 + }, + { + "epoch": 0.39512356211128846, + "grad_norm": 0.7390572428703308, + "learning_rate": 9.087608149215366e-06, + "loss": 0.8244, + "step": 7179 + }, + { + "epoch": 0.3951786009136441, + "grad_norm": 0.6907634735107422, + "learning_rate": 9.087358501342966e-06, + "loss": 0.751, + "step": 7180 + }, + { + "epoch": 0.3952336397159998, + "grad_norm": 0.7467379570007324, + "learning_rate": 9.087108822751076e-06, + "loss": 0.8549, + "step": 7181 + }, + { + "epoch": 0.3952886785183554, + "grad_norm": 0.7493302226066589, + "learning_rate": 9.086859113441568e-06, + "loss": 0.8332, + "step": 7182 + }, + { + "epoch": 0.3953437173207111, + "grad_norm": 0.8364959955215454, + "learning_rate": 9.086609373416321e-06, + "loss": 0.7873, + "step": 7183 + }, + { + "epoch": 0.39539875612306674, + "grad_norm": 0.7330418825149536, + "learning_rate": 9.086359602677214e-06, + "loss": 0.7861, + "step": 7184 + }, + { + "epoch": 0.39545379492542243, + "grad_norm": 0.7296311855316162, + "learning_rate": 9.086109801226121e-06, + "loss": 0.7946, + "step": 7185 + }, + { + "epoch": 0.39550883372777806, + "grad_norm": 0.7884660363197327, + "learning_rate": 9.085859969064921e-06, + "loss": 0.7851, + "step": 7186 + }, + { + "epoch": 0.39556387253013375, + "grad_norm": 0.7311955690383911, + "learning_rate": 9.08561010619549e-06, + "loss": 0.7645, + "step": 7187 + }, + { + "epoch": 0.3956189113324894, + "grad_norm": 0.7447296977043152, + "learning_rate": 9.085360212619707e-06, + "loss": 0.7446, + "step": 7188 + }, + { + "epoch": 0.3956739501348451, + "grad_norm": 0.755628228187561, + "learning_rate": 9.08511028833945e-06, + "loss": 0.8107, + "step": 7189 + }, + { + "epoch": 0.3957289889372007, + "grad_norm": 0.6800833940505981, + "learning_rate": 9.0848603333566e-06, + "loss": 0.7471, + "step": 7190 + }, + { + "epoch": 0.3957840277395564, + "grad_norm": 0.6396341919898987, + "learning_rate": 9.08461034767303e-06, + "loss": 0.6797, + "step": 7191 + }, + { + "epoch": 0.39583906654191203, + "grad_norm": 0.729680597782135, + "learning_rate": 9.084360331290625e-06, + "loss": 0.7224, + "step": 7192 + }, + { + "epoch": 0.3958941053442677, + "grad_norm": 0.7630584239959717, + "learning_rate": 9.084110284211259e-06, + "loss": 0.8203, + "step": 7193 + }, + { + "epoch": 0.39594914414662336, + "grad_norm": 0.8799235820770264, + "learning_rate": 9.083860206436813e-06, + "loss": 0.8312, + "step": 7194 + }, + { + "epoch": 0.39600418294897904, + "grad_norm": 0.797081708908081, + "learning_rate": 9.083610097969169e-06, + "loss": 0.7561, + "step": 7195 + }, + { + "epoch": 0.3960592217513347, + "grad_norm": 0.7408759593963623, + "learning_rate": 9.083359958810203e-06, + "loss": 0.7854, + "step": 7196 + }, + { + "epoch": 0.39611426055369037, + "grad_norm": 0.7552130222320557, + "learning_rate": 9.083109788961797e-06, + "loss": 0.8145, + "step": 7197 + }, + { + "epoch": 0.396169299356046, + "grad_norm": 0.7147447466850281, + "learning_rate": 9.08285958842583e-06, + "loss": 0.792, + "step": 7198 + }, + { + "epoch": 0.3962243381584017, + "grad_norm": 0.7416259050369263, + "learning_rate": 9.082609357204183e-06, + "loss": 0.7801, + "step": 7199 + }, + { + "epoch": 0.3962793769607573, + "grad_norm": 0.7551109194755554, + "learning_rate": 9.082359095298741e-06, + "loss": 0.8841, + "step": 7200 + }, + { + "epoch": 0.396334415763113, + "grad_norm": 0.761472225189209, + "learning_rate": 9.082108802711377e-06, + "loss": 0.9061, + "step": 7201 + }, + { + "epoch": 0.39638945456546865, + "grad_norm": 0.7234126329421997, + "learning_rate": 9.081858479443977e-06, + "loss": 0.8308, + "step": 7202 + }, + { + "epoch": 0.39644449336782434, + "grad_norm": 0.7204816341400146, + "learning_rate": 9.08160812549842e-06, + "loss": 0.7481, + "step": 7203 + }, + { + "epoch": 0.39649953217017997, + "grad_norm": 0.7207956910133362, + "learning_rate": 9.081357740876591e-06, + "loss": 0.762, + "step": 7204 + }, + { + "epoch": 0.39655457097253566, + "grad_norm": 0.7967123985290527, + "learning_rate": 9.081107325580367e-06, + "loss": 0.7931, + "step": 7205 + }, + { + "epoch": 0.3966096097748913, + "grad_norm": 0.9839354753494263, + "learning_rate": 9.080856879611635e-06, + "loss": 0.8182, + "step": 7206 + }, + { + "epoch": 0.396664648577247, + "grad_norm": 0.8468357920646667, + "learning_rate": 9.080606402972274e-06, + "loss": 0.7056, + "step": 7207 + }, + { + "epoch": 0.3967196873796026, + "grad_norm": 0.6549574136734009, + "learning_rate": 9.080355895664169e-06, + "loss": 0.7604, + "step": 7208 + }, + { + "epoch": 0.3967747261819583, + "grad_norm": 0.7475417256355286, + "learning_rate": 9.080105357689201e-06, + "loss": 0.7107, + "step": 7209 + }, + { + "epoch": 0.39682976498431394, + "grad_norm": 0.7464179992675781, + "learning_rate": 9.079854789049251e-06, + "loss": 0.793, + "step": 7210 + }, + { + "epoch": 0.3968848037866696, + "grad_norm": 0.8332071900367737, + "learning_rate": 9.079604189746207e-06, + "loss": 0.8383, + "step": 7211 + }, + { + "epoch": 0.39693984258902526, + "grad_norm": 0.722055196762085, + "learning_rate": 9.07935355978195e-06, + "loss": 0.8569, + "step": 7212 + }, + { + "epoch": 0.39699488139138095, + "grad_norm": 0.7442018389701843, + "learning_rate": 9.079102899158363e-06, + "loss": 0.8165, + "step": 7213 + }, + { + "epoch": 0.3970499201937366, + "grad_norm": 0.6986141204833984, + "learning_rate": 9.07885220787733e-06, + "loss": 0.7562, + "step": 7214 + }, + { + "epoch": 0.39710495899609227, + "grad_norm": 0.7718464732170105, + "learning_rate": 9.078601485940736e-06, + "loss": 0.8529, + "step": 7215 + }, + { + "epoch": 0.3971599977984479, + "grad_norm": 0.7583653330802917, + "learning_rate": 9.078350733350464e-06, + "loss": 0.7855, + "step": 7216 + }, + { + "epoch": 0.39721503660080354, + "grad_norm": 0.7699223160743713, + "learning_rate": 9.078099950108401e-06, + "loss": 0.8061, + "step": 7217 + }, + { + "epoch": 0.39727007540315923, + "grad_norm": 0.7374141812324524, + "learning_rate": 9.07784913621643e-06, + "loss": 0.789, + "step": 7218 + }, + { + "epoch": 0.39732511420551486, + "grad_norm": 0.7446104884147644, + "learning_rate": 9.077598291676436e-06, + "loss": 0.8381, + "step": 7219 + }, + { + "epoch": 0.39738015300787055, + "grad_norm": 0.7017301917076111, + "learning_rate": 9.077347416490305e-06, + "loss": 0.7153, + "step": 7220 + }, + { + "epoch": 0.3974351918102262, + "grad_norm": 0.7676172852516174, + "learning_rate": 9.077096510659922e-06, + "loss": 0.8029, + "step": 7221 + }, + { + "epoch": 0.3974902306125819, + "grad_norm": 0.9340602159500122, + "learning_rate": 9.076845574187174e-06, + "loss": 0.7865, + "step": 7222 + }, + { + "epoch": 0.3975452694149375, + "grad_norm": 0.8634235262870789, + "learning_rate": 9.076594607073945e-06, + "loss": 0.7606, + "step": 7223 + }, + { + "epoch": 0.3976003082172932, + "grad_norm": 0.8967369198799133, + "learning_rate": 9.076343609322123e-06, + "loss": 0.7011, + "step": 7224 + }, + { + "epoch": 0.39765534701964883, + "grad_norm": 0.7269352078437805, + "learning_rate": 9.076092580933594e-06, + "loss": 0.8043, + "step": 7225 + }, + { + "epoch": 0.3977103858220045, + "grad_norm": 0.7550628781318665, + "learning_rate": 9.075841521910243e-06, + "loss": 0.7344, + "step": 7226 + }, + { + "epoch": 0.39776542462436015, + "grad_norm": 0.6973844766616821, + "learning_rate": 9.075590432253958e-06, + "loss": 0.6995, + "step": 7227 + }, + { + "epoch": 0.39782046342671584, + "grad_norm": 0.648560643196106, + "learning_rate": 9.075339311966627e-06, + "loss": 0.6997, + "step": 7228 + }, + { + "epoch": 0.3978755022290715, + "grad_norm": 0.8457548022270203, + "learning_rate": 9.075088161050134e-06, + "loss": 0.8548, + "step": 7229 + }, + { + "epoch": 0.39793054103142717, + "grad_norm": 0.7644637823104858, + "learning_rate": 9.074836979506373e-06, + "loss": 0.6966, + "step": 7230 + }, + { + "epoch": 0.3979855798337828, + "grad_norm": 0.7146210670471191, + "learning_rate": 9.074585767337227e-06, + "loss": 0.7673, + "step": 7231 + }, + { + "epoch": 0.3980406186361385, + "grad_norm": 0.8570694327354431, + "learning_rate": 9.074334524544585e-06, + "loss": 0.8233, + "step": 7232 + }, + { + "epoch": 0.3980956574384941, + "grad_norm": 0.7257633805274963, + "learning_rate": 9.074083251130334e-06, + "loss": 0.7464, + "step": 7233 + }, + { + "epoch": 0.3981506962408498, + "grad_norm": 0.9377032518386841, + "learning_rate": 9.073831947096365e-06, + "loss": 0.7814, + "step": 7234 + }, + { + "epoch": 0.39820573504320544, + "grad_norm": 0.8105629086494446, + "learning_rate": 9.073580612444566e-06, + "loss": 0.8069, + "step": 7235 + }, + { + "epoch": 0.39826077384556113, + "grad_norm": 0.7874456644058228, + "learning_rate": 9.073329247176824e-06, + "loss": 0.8414, + "step": 7236 + }, + { + "epoch": 0.39831581264791677, + "grad_norm": 0.6829617023468018, + "learning_rate": 9.07307785129503e-06, + "loss": 0.7633, + "step": 7237 + }, + { + "epoch": 0.39837085145027246, + "grad_norm": 0.6838501691818237, + "learning_rate": 9.072826424801075e-06, + "loss": 0.6972, + "step": 7238 + }, + { + "epoch": 0.3984258902526281, + "grad_norm": 0.7054216861724854, + "learning_rate": 9.072574967696845e-06, + "loss": 0.8049, + "step": 7239 + }, + { + "epoch": 0.3984809290549838, + "grad_norm": 0.9462615847587585, + "learning_rate": 9.072323479984232e-06, + "loss": 0.7988, + "step": 7240 + }, + { + "epoch": 0.3985359678573394, + "grad_norm": 0.7334465980529785, + "learning_rate": 9.072071961665128e-06, + "loss": 0.7538, + "step": 7241 + }, + { + "epoch": 0.3985910066596951, + "grad_norm": 0.7506609559059143, + "learning_rate": 9.071820412741418e-06, + "loss": 0.7991, + "step": 7242 + }, + { + "epoch": 0.39864604546205074, + "grad_norm": 0.6858688592910767, + "learning_rate": 9.071568833214998e-06, + "loss": 0.7258, + "step": 7243 + }, + { + "epoch": 0.3987010842644064, + "grad_norm": 0.8117396235466003, + "learning_rate": 9.071317223087754e-06, + "loss": 0.752, + "step": 7244 + }, + { + "epoch": 0.39875612306676206, + "grad_norm": 0.7772389054298401, + "learning_rate": 9.071065582361582e-06, + "loss": 0.7444, + "step": 7245 + }, + { + "epoch": 0.39881116186911775, + "grad_norm": 0.7221882939338684, + "learning_rate": 9.07081391103837e-06, + "loss": 0.8035, + "step": 7246 + }, + { + "epoch": 0.3988662006714734, + "grad_norm": 0.8113289475440979, + "learning_rate": 9.07056220912001e-06, + "loss": 0.7623, + "step": 7247 + }, + { + "epoch": 0.39892123947382907, + "grad_norm": 0.730823278427124, + "learning_rate": 9.070310476608395e-06, + "loss": 0.7872, + "step": 7248 + }, + { + "epoch": 0.3989762782761847, + "grad_norm": 0.7690893411636353, + "learning_rate": 9.070058713505415e-06, + "loss": 0.7402, + "step": 7249 + }, + { + "epoch": 0.3990313170785404, + "grad_norm": 0.6768597364425659, + "learning_rate": 9.069806919812963e-06, + "loss": 0.7283, + "step": 7250 + }, + { + "epoch": 0.399086355880896, + "grad_norm": 0.6938686370849609, + "learning_rate": 9.069555095532932e-06, + "loss": 0.7209, + "step": 7251 + }, + { + "epoch": 0.3991413946832517, + "grad_norm": 0.7162025570869446, + "learning_rate": 9.069303240667215e-06, + "loss": 0.7915, + "step": 7252 + }, + { + "epoch": 0.39919643348560735, + "grad_norm": 0.9170399308204651, + "learning_rate": 9.069051355217704e-06, + "loss": 0.8399, + "step": 7253 + }, + { + "epoch": 0.39925147228796304, + "grad_norm": 0.7080186009407043, + "learning_rate": 9.068799439186291e-06, + "loss": 0.8678, + "step": 7254 + }, + { + "epoch": 0.39930651109031867, + "grad_norm": 1.013613224029541, + "learning_rate": 9.068547492574872e-06, + "loss": 0.817, + "step": 7255 + }, + { + "epoch": 0.39936154989267436, + "grad_norm": 0.6911013722419739, + "learning_rate": 9.068295515385337e-06, + "loss": 0.7048, + "step": 7256 + }, + { + "epoch": 0.39941658869503, + "grad_norm": 0.748219907283783, + "learning_rate": 9.068043507619584e-06, + "loss": 0.8115, + "step": 7257 + }, + { + "epoch": 0.3994716274973857, + "grad_norm": 0.6763347387313843, + "learning_rate": 9.067791469279504e-06, + "loss": 0.763, + "step": 7258 + }, + { + "epoch": 0.3995266662997413, + "grad_norm": 0.7291030287742615, + "learning_rate": 9.067539400366993e-06, + "loss": 0.7319, + "step": 7259 + }, + { + "epoch": 0.39958170510209695, + "grad_norm": 0.6515628695487976, + "learning_rate": 9.067287300883945e-06, + "loss": 0.7903, + "step": 7260 + }, + { + "epoch": 0.39963674390445264, + "grad_norm": 0.7815985679626465, + "learning_rate": 9.067035170832253e-06, + "loss": 0.8241, + "step": 7261 + }, + { + "epoch": 0.3996917827068083, + "grad_norm": 0.6747417449951172, + "learning_rate": 9.066783010213812e-06, + "loss": 0.7544, + "step": 7262 + }, + { + "epoch": 0.39974682150916396, + "grad_norm": 0.6568340063095093, + "learning_rate": 9.066530819030522e-06, + "loss": 0.7754, + "step": 7263 + }, + { + "epoch": 0.3998018603115196, + "grad_norm": 0.6703339219093323, + "learning_rate": 9.066278597284273e-06, + "loss": 0.7581, + "step": 7264 + }, + { + "epoch": 0.3998568991138753, + "grad_norm": 0.7421279549598694, + "learning_rate": 9.066026344976962e-06, + "loss": 0.7974, + "step": 7265 + }, + { + "epoch": 0.3999119379162309, + "grad_norm": 0.7226015329360962, + "learning_rate": 9.065774062110486e-06, + "loss": 0.7777, + "step": 7266 + }, + { + "epoch": 0.3999669767185866, + "grad_norm": 0.7092894911766052, + "learning_rate": 9.06552174868674e-06, + "loss": 0.7885, + "step": 7267 + }, + { + "epoch": 0.40002201552094224, + "grad_norm": 0.837902307510376, + "learning_rate": 9.065269404707622e-06, + "loss": 0.7425, + "step": 7268 + }, + { + "epoch": 0.40007705432329793, + "grad_norm": 0.803811252117157, + "learning_rate": 9.065017030175027e-06, + "loss": 0.8418, + "step": 7269 + }, + { + "epoch": 0.40013209312565357, + "grad_norm": 0.8110278248786926, + "learning_rate": 9.064764625090854e-06, + "loss": 0.7724, + "step": 7270 + }, + { + "epoch": 0.40018713192800925, + "grad_norm": 0.7305173277854919, + "learning_rate": 9.064512189456995e-06, + "loss": 0.7465, + "step": 7271 + }, + { + "epoch": 0.4002421707303649, + "grad_norm": 0.7312467694282532, + "learning_rate": 9.06425972327535e-06, + "loss": 0.8406, + "step": 7272 + }, + { + "epoch": 0.4002972095327206, + "grad_norm": 0.7348741292953491, + "learning_rate": 9.064007226547819e-06, + "loss": 0.8103, + "step": 7273 + }, + { + "epoch": 0.4003522483350762, + "grad_norm": 0.6561787724494934, + "learning_rate": 9.063754699276297e-06, + "loss": 0.6634, + "step": 7274 + }, + { + "epoch": 0.4004072871374319, + "grad_norm": 0.7924866080284119, + "learning_rate": 9.063502141462682e-06, + "loss": 0.6592, + "step": 7275 + }, + { + "epoch": 0.40046232593978753, + "grad_norm": 0.6873973608016968, + "learning_rate": 9.063249553108873e-06, + "loss": 0.7912, + "step": 7276 + }, + { + "epoch": 0.4005173647421432, + "grad_norm": 0.6872708797454834, + "learning_rate": 9.062996934216768e-06, + "loss": 0.732, + "step": 7277 + }, + { + "epoch": 0.40057240354449886, + "grad_norm": 0.7381585836410522, + "learning_rate": 9.062744284788265e-06, + "loss": 0.84, + "step": 7278 + }, + { + "epoch": 0.40062744234685455, + "grad_norm": 0.7885964512825012, + "learning_rate": 9.062491604825266e-06, + "loss": 0.8229, + "step": 7279 + }, + { + "epoch": 0.4006824811492102, + "grad_norm": 0.9066407680511475, + "learning_rate": 9.062238894329664e-06, + "loss": 0.7299, + "step": 7280 + }, + { + "epoch": 0.40073751995156587, + "grad_norm": 0.7694007754325867, + "learning_rate": 9.061986153303364e-06, + "loss": 0.8033, + "step": 7281 + }, + { + "epoch": 0.4007925587539215, + "grad_norm": 1.021766185760498, + "learning_rate": 9.061733381748263e-06, + "loss": 0.79, + "step": 7282 + }, + { + "epoch": 0.4008475975562772, + "grad_norm": 0.7776662111282349, + "learning_rate": 9.06148057966626e-06, + "loss": 0.8484, + "step": 7283 + }, + { + "epoch": 0.4009026363586328, + "grad_norm": 0.8646043539047241, + "learning_rate": 9.061227747059257e-06, + "loss": 0.8223, + "step": 7284 + }, + { + "epoch": 0.4009576751609885, + "grad_norm": 0.7347257733345032, + "learning_rate": 9.060974883929154e-06, + "loss": 0.8062, + "step": 7285 + }, + { + "epoch": 0.40101271396334415, + "grad_norm": 0.8233902454376221, + "learning_rate": 9.06072199027785e-06, + "loss": 0.8922, + "step": 7286 + }, + { + "epoch": 0.40106775276569984, + "grad_norm": 0.7099601030349731, + "learning_rate": 9.060469066107246e-06, + "loss": 0.7125, + "step": 7287 + }, + { + "epoch": 0.40112279156805547, + "grad_norm": 0.7549998164176941, + "learning_rate": 9.060216111419246e-06, + "loss": 0.7851, + "step": 7288 + }, + { + "epoch": 0.40117783037041116, + "grad_norm": 0.753516435623169, + "learning_rate": 9.059963126215748e-06, + "loss": 0.7831, + "step": 7289 + }, + { + "epoch": 0.4012328691727668, + "grad_norm": 0.6718429327011108, + "learning_rate": 9.059710110498651e-06, + "loss": 0.7305, + "step": 7290 + }, + { + "epoch": 0.4012879079751225, + "grad_norm": 0.6796036958694458, + "learning_rate": 9.05945706426986e-06, + "loss": 0.802, + "step": 7291 + }, + { + "epoch": 0.4013429467774781, + "grad_norm": 0.8046827912330627, + "learning_rate": 9.05920398753128e-06, + "loss": 0.7286, + "step": 7292 + }, + { + "epoch": 0.4013979855798338, + "grad_norm": 0.7518643140792847, + "learning_rate": 9.058950880284807e-06, + "loss": 0.7287, + "step": 7293 + }, + { + "epoch": 0.40145302438218944, + "grad_norm": 0.8386855125427246, + "learning_rate": 9.058697742532345e-06, + "loss": 0.8201, + "step": 7294 + }, + { + "epoch": 0.4015080631845451, + "grad_norm": 0.7780192494392395, + "learning_rate": 9.058444574275797e-06, + "loss": 0.7999, + "step": 7295 + }, + { + "epoch": 0.40156310198690076, + "grad_norm": 0.7715566754341125, + "learning_rate": 9.058191375517068e-06, + "loss": 0.732, + "step": 7296 + }, + { + "epoch": 0.40161814078925645, + "grad_norm": 0.9940280914306641, + "learning_rate": 9.057938146258057e-06, + "loss": 0.8247, + "step": 7297 + }, + { + "epoch": 0.4016731795916121, + "grad_norm": 0.7567923069000244, + "learning_rate": 9.05768488650067e-06, + "loss": 0.8254, + "step": 7298 + }, + { + "epoch": 0.4017282183939678, + "grad_norm": 0.7544496655464172, + "learning_rate": 9.05743159624681e-06, + "loss": 0.811, + "step": 7299 + }, + { + "epoch": 0.4017832571963234, + "grad_norm": 0.63368821144104, + "learning_rate": 9.05717827549838e-06, + "loss": 0.6498, + "step": 7300 + }, + { + "epoch": 0.4018382959986791, + "grad_norm": 0.7077621221542358, + "learning_rate": 9.056924924257284e-06, + "loss": 0.7401, + "step": 7301 + }, + { + "epoch": 0.40189333480103473, + "grad_norm": 0.6782366037368774, + "learning_rate": 9.056671542525426e-06, + "loss": 0.8013, + "step": 7302 + }, + { + "epoch": 0.40194837360339036, + "grad_norm": 0.6605678200721741, + "learning_rate": 9.056418130304709e-06, + "loss": 0.8038, + "step": 7303 + }, + { + "epoch": 0.40200341240574605, + "grad_norm": 0.8716840147972107, + "learning_rate": 9.056164687597041e-06, + "loss": 0.7652, + "step": 7304 + }, + { + "epoch": 0.4020584512081017, + "grad_norm": 0.8464542031288147, + "learning_rate": 9.055911214404325e-06, + "loss": 0.8663, + "step": 7305 + }, + { + "epoch": 0.4021134900104574, + "grad_norm": 0.7165409326553345, + "learning_rate": 9.055657710728466e-06, + "loss": 0.8028, + "step": 7306 + }, + { + "epoch": 0.402168528812813, + "grad_norm": 0.7313430309295654, + "learning_rate": 9.055404176571369e-06, + "loss": 0.7538, + "step": 7307 + }, + { + "epoch": 0.4022235676151687, + "grad_norm": 0.7757230401039124, + "learning_rate": 9.05515061193494e-06, + "loss": 0.9096, + "step": 7308 + }, + { + "epoch": 0.40227860641752433, + "grad_norm": 0.7178354859352112, + "learning_rate": 9.054897016821085e-06, + "loss": 0.7186, + "step": 7309 + }, + { + "epoch": 0.40233364521988, + "grad_norm": 0.8331356048583984, + "learning_rate": 9.054643391231708e-06, + "loss": 0.8724, + "step": 7310 + }, + { + "epoch": 0.40238868402223565, + "grad_norm": 0.7709757685661316, + "learning_rate": 9.054389735168717e-06, + "loss": 0.692, + "step": 7311 + }, + { + "epoch": 0.40244372282459134, + "grad_norm": 0.7393380999565125, + "learning_rate": 9.054136048634018e-06, + "loss": 0.7863, + "step": 7312 + }, + { + "epoch": 0.402498761626947, + "grad_norm": 0.7372385859489441, + "learning_rate": 9.053882331629518e-06, + "loss": 0.781, + "step": 7313 + }, + { + "epoch": 0.40255380042930267, + "grad_norm": 0.7076019048690796, + "learning_rate": 9.053628584157123e-06, + "loss": 0.7598, + "step": 7314 + }, + { + "epoch": 0.4026088392316583, + "grad_norm": 0.7465673685073853, + "learning_rate": 9.053374806218742e-06, + "loss": 0.7454, + "step": 7315 + }, + { + "epoch": 0.402663878034014, + "grad_norm": 0.7414120435714722, + "learning_rate": 9.05312099781628e-06, + "loss": 0.7135, + "step": 7316 + }, + { + "epoch": 0.4027189168363696, + "grad_norm": 0.7490748167037964, + "learning_rate": 9.052867158951646e-06, + "loss": 0.6833, + "step": 7317 + }, + { + "epoch": 0.4027739556387253, + "grad_norm": 0.8027878999710083, + "learning_rate": 9.052613289626747e-06, + "loss": 0.7466, + "step": 7318 + }, + { + "epoch": 0.40282899444108095, + "grad_norm": 0.6777862310409546, + "learning_rate": 9.052359389843493e-06, + "loss": 0.7446, + "step": 7319 + }, + { + "epoch": 0.40288403324343663, + "grad_norm": 0.9240381717681885, + "learning_rate": 9.052105459603787e-06, + "loss": 0.7801, + "step": 7320 + }, + { + "epoch": 0.40293907204579227, + "grad_norm": 0.9592602252960205, + "learning_rate": 9.051851498909543e-06, + "loss": 0.9648, + "step": 7321 + }, + { + "epoch": 0.40299411084814796, + "grad_norm": 0.8469638228416443, + "learning_rate": 9.051597507762669e-06, + "loss": 0.8303, + "step": 7322 + }, + { + "epoch": 0.4030491496505036, + "grad_norm": 0.6981443166732788, + "learning_rate": 9.05134348616507e-06, + "loss": 0.7245, + "step": 7323 + }, + { + "epoch": 0.4031041884528593, + "grad_norm": 0.7133469581604004, + "learning_rate": 9.05108943411866e-06, + "loss": 0.7763, + "step": 7324 + }, + { + "epoch": 0.4031592272552149, + "grad_norm": 0.7043703198432922, + "learning_rate": 9.050835351625344e-06, + "loss": 0.8247, + "step": 7325 + }, + { + "epoch": 0.4032142660575706, + "grad_norm": 0.6662501692771912, + "learning_rate": 9.050581238687036e-06, + "loss": 0.7669, + "step": 7326 + }, + { + "epoch": 0.40326930485992624, + "grad_norm": 0.6482356786727905, + "learning_rate": 9.050327095305643e-06, + "loss": 0.6477, + "step": 7327 + }, + { + "epoch": 0.4033243436622819, + "grad_norm": 0.7465450167655945, + "learning_rate": 9.050072921483076e-06, + "loss": 0.8053, + "step": 7328 + }, + { + "epoch": 0.40337938246463756, + "grad_norm": 0.6765472292900085, + "learning_rate": 9.049818717221245e-06, + "loss": 0.765, + "step": 7329 + }, + { + "epoch": 0.40343442126699325, + "grad_norm": 0.7098689675331116, + "learning_rate": 9.04956448252206e-06, + "loss": 0.8059, + "step": 7330 + }, + { + "epoch": 0.4034894600693489, + "grad_norm": 0.6773823499679565, + "learning_rate": 9.049310217387432e-06, + "loss": 0.6848, + "step": 7331 + }, + { + "epoch": 0.40354449887170457, + "grad_norm": 0.6884829998016357, + "learning_rate": 9.049055921819275e-06, + "loss": 0.696, + "step": 7332 + }, + { + "epoch": 0.4035995376740602, + "grad_norm": 0.662545919418335, + "learning_rate": 9.048801595819494e-06, + "loss": 0.8286, + "step": 7333 + }, + { + "epoch": 0.4036545764764159, + "grad_norm": 0.6863077878952026, + "learning_rate": 9.048547239390007e-06, + "loss": 0.7215, + "step": 7334 + }, + { + "epoch": 0.4037096152787715, + "grad_norm": 0.6982632875442505, + "learning_rate": 9.048292852532721e-06, + "loss": 0.7635, + "step": 7335 + }, + { + "epoch": 0.4037646540811272, + "grad_norm": 0.8512400984764099, + "learning_rate": 9.048038435249548e-06, + "loss": 0.6226, + "step": 7336 + }, + { + "epoch": 0.40381969288348285, + "grad_norm": 0.6952843070030212, + "learning_rate": 9.047783987542405e-06, + "loss": 0.8317, + "step": 7337 + }, + { + "epoch": 0.40387473168583854, + "grad_norm": 0.7802778482437134, + "learning_rate": 9.0475295094132e-06, + "loss": 0.8615, + "step": 7338 + }, + { + "epoch": 0.4039297704881942, + "grad_norm": 0.8783930540084839, + "learning_rate": 9.047275000863844e-06, + "loss": 0.743, + "step": 7339 + }, + { + "epoch": 0.40398480929054986, + "grad_norm": 0.7205806970596313, + "learning_rate": 9.047020461896256e-06, + "loss": 0.7953, + "step": 7340 + }, + { + "epoch": 0.4040398480929055, + "grad_norm": 0.8438451290130615, + "learning_rate": 9.046765892512344e-06, + "loss": 0.7613, + "step": 7341 + }, + { + "epoch": 0.4040948868952612, + "grad_norm": 0.7300973534584045, + "learning_rate": 9.046511292714021e-06, + "loss": 0.7856, + "step": 7342 + }, + { + "epoch": 0.4041499256976168, + "grad_norm": 0.8472041487693787, + "learning_rate": 9.046256662503206e-06, + "loss": 0.8526, + "step": 7343 + }, + { + "epoch": 0.4042049644999725, + "grad_norm": 0.789465606212616, + "learning_rate": 9.046002001881807e-06, + "loss": 0.7792, + "step": 7344 + }, + { + "epoch": 0.40426000330232814, + "grad_norm": 0.7720938920974731, + "learning_rate": 9.04574731085174e-06, + "loss": 0.8065, + "step": 7345 + }, + { + "epoch": 0.4043150421046838, + "grad_norm": 0.6968526840209961, + "learning_rate": 9.04549258941492e-06, + "loss": 0.8135, + "step": 7346 + }, + { + "epoch": 0.40437008090703946, + "grad_norm": 0.746865451335907, + "learning_rate": 9.04523783757326e-06, + "loss": 0.8216, + "step": 7347 + }, + { + "epoch": 0.4044251197093951, + "grad_norm": 0.6750560998916626, + "learning_rate": 9.044983055328676e-06, + "loss": 0.7883, + "step": 7348 + }, + { + "epoch": 0.4044801585117508, + "grad_norm": 0.6791195273399353, + "learning_rate": 9.044728242683081e-06, + "loss": 0.7721, + "step": 7349 + }, + { + "epoch": 0.4045351973141064, + "grad_norm": 0.7238358855247498, + "learning_rate": 9.044473399638392e-06, + "loss": 0.739, + "step": 7350 + }, + { + "epoch": 0.4045902361164621, + "grad_norm": 0.6793557405471802, + "learning_rate": 9.044218526196523e-06, + "loss": 0.7853, + "step": 7351 + }, + { + "epoch": 0.40464527491881774, + "grad_norm": 0.767564058303833, + "learning_rate": 9.043963622359392e-06, + "loss": 0.8158, + "step": 7352 + }, + { + "epoch": 0.40470031372117343, + "grad_norm": 0.6800708770751953, + "learning_rate": 9.043708688128909e-06, + "loss": 0.7493, + "step": 7353 + }, + { + "epoch": 0.40475535252352907, + "grad_norm": 0.75978022813797, + "learning_rate": 9.043453723506996e-06, + "loss": 0.7066, + "step": 7354 + }, + { + "epoch": 0.40481039132588476, + "grad_norm": 1.0194984674453735, + "learning_rate": 9.043198728495568e-06, + "loss": 0.6238, + "step": 7355 + }, + { + "epoch": 0.4048654301282404, + "grad_norm": 0.7102386355400085, + "learning_rate": 9.04294370309654e-06, + "loss": 0.75, + "step": 7356 + }, + { + "epoch": 0.4049204689305961, + "grad_norm": 0.8468191623687744, + "learning_rate": 9.04268864731183e-06, + "loss": 0.8095, + "step": 7357 + }, + { + "epoch": 0.4049755077329517, + "grad_norm": 0.7022871971130371, + "learning_rate": 9.042433561143353e-06, + "loss": 0.8394, + "step": 7358 + }, + { + "epoch": 0.4050305465353074, + "grad_norm": 1.1873482465744019, + "learning_rate": 9.042178444593028e-06, + "loss": 0.7863, + "step": 7359 + }, + { + "epoch": 0.40508558533766303, + "grad_norm": 0.7074940204620361, + "learning_rate": 9.041923297662772e-06, + "loss": 0.7067, + "step": 7360 + }, + { + "epoch": 0.4051406241400187, + "grad_norm": 0.7602211833000183, + "learning_rate": 9.041668120354503e-06, + "loss": 0.6594, + "step": 7361 + }, + { + "epoch": 0.40519566294237436, + "grad_norm": 0.7903324365615845, + "learning_rate": 9.041412912670138e-06, + "loss": 0.7978, + "step": 7362 + }, + { + "epoch": 0.40525070174473005, + "grad_norm": 0.7422891855239868, + "learning_rate": 9.041157674611595e-06, + "loss": 0.8162, + "step": 7363 + }, + { + "epoch": 0.4053057405470857, + "grad_norm": 0.7978767156600952, + "learning_rate": 9.040902406180791e-06, + "loss": 0.762, + "step": 7364 + }, + { + "epoch": 0.40536077934944137, + "grad_norm": 0.7719776630401611, + "learning_rate": 9.04064710737965e-06, + "loss": 0.8098, + "step": 7365 + }, + { + "epoch": 0.405415818151797, + "grad_norm": 0.8646591305732727, + "learning_rate": 9.040391778210083e-06, + "loss": 0.9372, + "step": 7366 + }, + { + "epoch": 0.4054708569541527, + "grad_norm": 0.6616937518119812, + "learning_rate": 9.040136418674015e-06, + "loss": 0.7424, + "step": 7367 + }, + { + "epoch": 0.4055258957565083, + "grad_norm": 0.7676553130149841, + "learning_rate": 9.039881028773363e-06, + "loss": 0.6327, + "step": 7368 + }, + { + "epoch": 0.405580934558864, + "grad_norm": 0.6838239431381226, + "learning_rate": 9.039625608510047e-06, + "loss": 0.7548, + "step": 7369 + }, + { + "epoch": 0.40563597336121965, + "grad_norm": 0.7476304769515991, + "learning_rate": 9.039370157885986e-06, + "loss": 0.7262, + "step": 7370 + }, + { + "epoch": 0.40569101216357534, + "grad_norm": 0.8985139727592468, + "learning_rate": 9.0391146769031e-06, + "loss": 0.7729, + "step": 7371 + }, + { + "epoch": 0.40574605096593097, + "grad_norm": 0.7840422987937927, + "learning_rate": 9.038859165563308e-06, + "loss": 0.7855, + "step": 7372 + }, + { + "epoch": 0.40580108976828666, + "grad_norm": 0.6777672171592712, + "learning_rate": 9.038603623868534e-06, + "loss": 0.7379, + "step": 7373 + }, + { + "epoch": 0.4058561285706423, + "grad_norm": 0.7226746678352356, + "learning_rate": 9.038348051820694e-06, + "loss": 0.7686, + "step": 7374 + }, + { + "epoch": 0.405911167372998, + "grad_norm": 0.7647444605827332, + "learning_rate": 9.038092449421713e-06, + "loss": 0.8859, + "step": 7375 + }, + { + "epoch": 0.4059662061753536, + "grad_norm": 0.6524979472160339, + "learning_rate": 9.037836816673508e-06, + "loss": 0.6982, + "step": 7376 + }, + { + "epoch": 0.4060212449777093, + "grad_norm": 0.7842861413955688, + "learning_rate": 9.037581153578004e-06, + "loss": 0.8099, + "step": 7377 + }, + { + "epoch": 0.40607628378006494, + "grad_norm": 0.6424387693405151, + "learning_rate": 9.03732546013712e-06, + "loss": 0.7387, + "step": 7378 + }, + { + "epoch": 0.40613132258242063, + "grad_norm": 0.8444356918334961, + "learning_rate": 9.037069736352779e-06, + "loss": 0.8813, + "step": 7379 + }, + { + "epoch": 0.40618636138477626, + "grad_norm": 0.6487529277801514, + "learning_rate": 9.036813982226904e-06, + "loss": 0.7609, + "step": 7380 + }, + { + "epoch": 0.40624140018713195, + "grad_norm": 0.7891185879707336, + "learning_rate": 9.036558197761413e-06, + "loss": 0.8589, + "step": 7381 + }, + { + "epoch": 0.4062964389894876, + "grad_norm": 0.7183120250701904, + "learning_rate": 9.036302382958233e-06, + "loss": 0.8429, + "step": 7382 + }, + { + "epoch": 0.4063514777918433, + "grad_norm": 0.6386578679084778, + "learning_rate": 9.036046537819283e-06, + "loss": 0.6955, + "step": 7383 + }, + { + "epoch": 0.4064065165941989, + "grad_norm": 0.7572369575500488, + "learning_rate": 9.035790662346488e-06, + "loss": 0.8018, + "step": 7384 + }, + { + "epoch": 0.4064615553965546, + "grad_norm": 0.7105650305747986, + "learning_rate": 9.035534756541771e-06, + "loss": 0.8527, + "step": 7385 + }, + { + "epoch": 0.40651659419891023, + "grad_norm": 0.7031856179237366, + "learning_rate": 9.035278820407056e-06, + "loss": 0.6991, + "step": 7386 + }, + { + "epoch": 0.4065716330012659, + "grad_norm": 0.7407381534576416, + "learning_rate": 9.035022853944266e-06, + "loss": 0.708, + "step": 7387 + }, + { + "epoch": 0.40662667180362155, + "grad_norm": 0.7078498601913452, + "learning_rate": 9.034766857155322e-06, + "loss": 0.7584, + "step": 7388 + }, + { + "epoch": 0.4066817106059772, + "grad_norm": 0.7643301486968994, + "learning_rate": 9.034510830042151e-06, + "loss": 0.7836, + "step": 7389 + }, + { + "epoch": 0.4067367494083329, + "grad_norm": 0.7165302038192749, + "learning_rate": 9.034254772606676e-06, + "loss": 0.7769, + "step": 7390 + }, + { + "epoch": 0.4067917882106885, + "grad_norm": 0.7442395091056824, + "learning_rate": 9.033998684850824e-06, + "loss": 0.7231, + "step": 7391 + }, + { + "epoch": 0.4068468270130442, + "grad_norm": 0.7425046563148499, + "learning_rate": 9.033742566776517e-06, + "loss": 0.7709, + "step": 7392 + }, + { + "epoch": 0.40690186581539983, + "grad_norm": 0.768419086933136, + "learning_rate": 9.03348641838568e-06, + "loss": 0.7768, + "step": 7393 + }, + { + "epoch": 0.4069569046177555, + "grad_norm": 0.6785634160041809, + "learning_rate": 9.03323023968024e-06, + "loss": 0.7468, + "step": 7394 + }, + { + "epoch": 0.40701194342011116, + "grad_norm": 0.7075444459915161, + "learning_rate": 9.03297403066212e-06, + "loss": 0.7757, + "step": 7395 + }, + { + "epoch": 0.40706698222246684, + "grad_norm": 0.7580223679542542, + "learning_rate": 9.032717791333247e-06, + "loss": 0.7311, + "step": 7396 + }, + { + "epoch": 0.4071220210248225, + "grad_norm": 0.8110041618347168, + "learning_rate": 9.032461521695546e-06, + "loss": 0.7923, + "step": 7397 + }, + { + "epoch": 0.40717705982717817, + "grad_norm": 0.7204881310462952, + "learning_rate": 9.032205221750945e-06, + "loss": 0.759, + "step": 7398 + }, + { + "epoch": 0.4072320986295338, + "grad_norm": 0.8392491340637207, + "learning_rate": 9.031948891501368e-06, + "loss": 0.8292, + "step": 7399 + }, + { + "epoch": 0.4072871374318895, + "grad_norm": 0.7134600281715393, + "learning_rate": 9.031692530948742e-06, + "loss": 0.7, + "step": 7400 + }, + { + "epoch": 0.4073421762342451, + "grad_norm": 0.6324336528778076, + "learning_rate": 9.031436140094995e-06, + "loss": 0.6964, + "step": 7401 + }, + { + "epoch": 0.4073972150366008, + "grad_norm": 0.7281947731971741, + "learning_rate": 9.031179718942052e-06, + "loss": 0.7567, + "step": 7402 + }, + { + "epoch": 0.40745225383895645, + "grad_norm": 0.8828619718551636, + "learning_rate": 9.030923267491842e-06, + "loss": 0.8139, + "step": 7403 + }, + { + "epoch": 0.40750729264131214, + "grad_norm": 0.7039986252784729, + "learning_rate": 9.030666785746292e-06, + "loss": 0.7339, + "step": 7404 + }, + { + "epoch": 0.40756233144366777, + "grad_norm": 0.7049984931945801, + "learning_rate": 9.030410273707331e-06, + "loss": 0.6842, + "step": 7405 + }, + { + "epoch": 0.40761737024602346, + "grad_norm": 0.7149737477302551, + "learning_rate": 9.030153731376883e-06, + "loss": 0.6837, + "step": 7406 + }, + { + "epoch": 0.4076724090483791, + "grad_norm": 1.0804089307785034, + "learning_rate": 9.029897158756878e-06, + "loss": 0.7726, + "step": 7407 + }, + { + "epoch": 0.4077274478507348, + "grad_norm": 0.8354909420013428, + "learning_rate": 9.029640555849244e-06, + "loss": 0.8058, + "step": 7408 + }, + { + "epoch": 0.4077824866530904, + "grad_norm": 0.7091527581214905, + "learning_rate": 9.029383922655914e-06, + "loss": 0.7636, + "step": 7409 + }, + { + "epoch": 0.4078375254554461, + "grad_norm": 0.6720988750457764, + "learning_rate": 9.029127259178809e-06, + "loss": 0.7179, + "step": 7410 + }, + { + "epoch": 0.40789256425780174, + "grad_norm": 0.685858964920044, + "learning_rate": 9.028870565419865e-06, + "loss": 0.7637, + "step": 7411 + }, + { + "epoch": 0.4079476030601574, + "grad_norm": 0.7505033016204834, + "learning_rate": 9.028613841381007e-06, + "loss": 0.7463, + "step": 7412 + }, + { + "epoch": 0.40800264186251306, + "grad_norm": 0.8801671862602234, + "learning_rate": 9.028357087064166e-06, + "loss": 0.8399, + "step": 7413 + }, + { + "epoch": 0.40805768066486875, + "grad_norm": 0.7441918849945068, + "learning_rate": 9.02810030247127e-06, + "loss": 0.7689, + "step": 7414 + }, + { + "epoch": 0.4081127194672244, + "grad_norm": 0.7410128712654114, + "learning_rate": 9.027843487604251e-06, + "loss": 0.8013, + "step": 7415 + }, + { + "epoch": 0.40816775826958007, + "grad_norm": 0.8075226545333862, + "learning_rate": 9.02758664246504e-06, + "loss": 0.7717, + "step": 7416 + }, + { + "epoch": 0.4082227970719357, + "grad_norm": 0.7985545992851257, + "learning_rate": 9.027329767055566e-06, + "loss": 0.8459, + "step": 7417 + }, + { + "epoch": 0.4082778358742914, + "grad_norm": 0.7887235283851624, + "learning_rate": 9.027072861377757e-06, + "loss": 0.8201, + "step": 7418 + }, + { + "epoch": 0.40833287467664703, + "grad_norm": 0.7876266241073608, + "learning_rate": 9.02681592543355e-06, + "loss": 0.8205, + "step": 7419 + }, + { + "epoch": 0.4083879134790027, + "grad_norm": 0.758168637752533, + "learning_rate": 9.02655895922487e-06, + "loss": 0.6619, + "step": 7420 + }, + { + "epoch": 0.40844295228135835, + "grad_norm": 0.7279811501502991, + "learning_rate": 9.02630196275365e-06, + "loss": 0.7634, + "step": 7421 + }, + { + "epoch": 0.40849799108371404, + "grad_norm": 0.7540523409843445, + "learning_rate": 9.026044936021822e-06, + "loss": 0.7819, + "step": 7422 + }, + { + "epoch": 0.4085530298860697, + "grad_norm": 0.8091018795967102, + "learning_rate": 9.02578787903132e-06, + "loss": 0.7749, + "step": 7423 + }, + { + "epoch": 0.40860806868842536, + "grad_norm": 0.7625396847724915, + "learning_rate": 9.025530791784074e-06, + "loss": 0.7635, + "step": 7424 + }, + { + "epoch": 0.408663107490781, + "grad_norm": 0.7663947939872742, + "learning_rate": 9.025273674282015e-06, + "loss": 0.8281, + "step": 7425 + }, + { + "epoch": 0.4087181462931367, + "grad_norm": 0.6672662496566772, + "learning_rate": 9.025016526527077e-06, + "loss": 0.641, + "step": 7426 + }, + { + "epoch": 0.4087731850954923, + "grad_norm": 0.7649143934249878, + "learning_rate": 9.024759348521193e-06, + "loss": 0.7462, + "step": 7427 + }, + { + "epoch": 0.408828223897848, + "grad_norm": 0.7540067434310913, + "learning_rate": 9.024502140266293e-06, + "loss": 0.8756, + "step": 7428 + }, + { + "epoch": 0.40888326270020364, + "grad_norm": 0.721615731716156, + "learning_rate": 9.024244901764314e-06, + "loss": 0.8507, + "step": 7429 + }, + { + "epoch": 0.40893830150255933, + "grad_norm": 0.6949496269226074, + "learning_rate": 9.023987633017186e-06, + "loss": 0.7021, + "step": 7430 + }, + { + "epoch": 0.40899334030491497, + "grad_norm": 0.7108990550041199, + "learning_rate": 9.023730334026845e-06, + "loss": 0.807, + "step": 7431 + }, + { + "epoch": 0.4090483791072706, + "grad_norm": 0.7606124877929688, + "learning_rate": 9.023473004795225e-06, + "loss": 0.7769, + "step": 7432 + }, + { + "epoch": 0.4091034179096263, + "grad_norm": 0.7792031764984131, + "learning_rate": 9.023215645324256e-06, + "loss": 0.728, + "step": 7433 + }, + { + "epoch": 0.4091584567119819, + "grad_norm": 0.728884756565094, + "learning_rate": 9.022958255615877e-06, + "loss": 0.7831, + "step": 7434 + }, + { + "epoch": 0.4092134955143376, + "grad_norm": 0.8196625709533691, + "learning_rate": 9.022700835672022e-06, + "loss": 0.8265, + "step": 7435 + }, + { + "epoch": 0.40926853431669324, + "grad_norm": 0.762734055519104, + "learning_rate": 9.022443385494621e-06, + "loss": 0.8028, + "step": 7436 + }, + { + "epoch": 0.40932357311904893, + "grad_norm": 0.7259558439254761, + "learning_rate": 9.022185905085614e-06, + "loss": 0.789, + "step": 7437 + }, + { + "epoch": 0.40937861192140457, + "grad_norm": 0.7402371764183044, + "learning_rate": 9.021928394446936e-06, + "loss": 0.7667, + "step": 7438 + }, + { + "epoch": 0.40943365072376026, + "grad_norm": 0.8399797677993774, + "learning_rate": 9.021670853580519e-06, + "loss": 0.8451, + "step": 7439 + }, + { + "epoch": 0.4094886895261159, + "grad_norm": 0.6439585089683533, + "learning_rate": 9.0214132824883e-06, + "loss": 0.776, + "step": 7440 + }, + { + "epoch": 0.4095437283284716, + "grad_norm": 0.6956612467765808, + "learning_rate": 9.021155681172215e-06, + "loss": 0.6921, + "step": 7441 + }, + { + "epoch": 0.4095987671308272, + "grad_norm": 0.855413556098938, + "learning_rate": 9.020898049634203e-06, + "loss": 0.8552, + "step": 7442 + }, + { + "epoch": 0.4096538059331829, + "grad_norm": 0.6690535545349121, + "learning_rate": 9.020640387876194e-06, + "loss": 0.7552, + "step": 7443 + }, + { + "epoch": 0.40970884473553854, + "grad_norm": 0.6615462899208069, + "learning_rate": 9.020382695900131e-06, + "loss": 0.8216, + "step": 7444 + }, + { + "epoch": 0.4097638835378942, + "grad_norm": 0.6975858211517334, + "learning_rate": 9.020124973707947e-06, + "loss": 0.7453, + "step": 7445 + }, + { + "epoch": 0.40981892234024986, + "grad_norm": 0.6461964249610901, + "learning_rate": 9.019867221301579e-06, + "loss": 0.656, + "step": 7446 + }, + { + "epoch": 0.40987396114260555, + "grad_norm": 0.7221645712852478, + "learning_rate": 9.019609438682967e-06, + "loss": 0.661, + "step": 7447 + }, + { + "epoch": 0.4099289999449612, + "grad_norm": 0.6785755753517151, + "learning_rate": 9.019351625854044e-06, + "loss": 0.7294, + "step": 7448 + }, + { + "epoch": 0.40998403874731687, + "grad_norm": 0.7040538787841797, + "learning_rate": 9.019093782816751e-06, + "loss": 0.8546, + "step": 7449 + }, + { + "epoch": 0.4100390775496725, + "grad_norm": 0.737922191619873, + "learning_rate": 9.018835909573025e-06, + "loss": 0.8144, + "step": 7450 + }, + { + "epoch": 0.4100941163520282, + "grad_norm": 0.6705496311187744, + "learning_rate": 9.018578006124802e-06, + "loss": 0.6937, + "step": 7451 + }, + { + "epoch": 0.4101491551543838, + "grad_norm": 0.7347431182861328, + "learning_rate": 9.018320072474026e-06, + "loss": 0.7716, + "step": 7452 + }, + { + "epoch": 0.4102041939567395, + "grad_norm": 0.7023493647575378, + "learning_rate": 9.018062108622631e-06, + "loss": 0.7295, + "step": 7453 + }, + { + "epoch": 0.41025923275909515, + "grad_norm": 0.8017870187759399, + "learning_rate": 9.017804114572556e-06, + "loss": 0.7471, + "step": 7454 + }, + { + "epoch": 0.41031427156145084, + "grad_norm": 0.9171211123466492, + "learning_rate": 9.01754609032574e-06, + "loss": 0.8262, + "step": 7455 + }, + { + "epoch": 0.41036931036380647, + "grad_norm": 0.6682952046394348, + "learning_rate": 9.017288035884124e-06, + "loss": 0.7165, + "step": 7456 + }, + { + "epoch": 0.41042434916616216, + "grad_norm": 0.9339122772216797, + "learning_rate": 9.017029951249648e-06, + "loss": 0.8618, + "step": 7457 + }, + { + "epoch": 0.4104793879685178, + "grad_norm": 0.7063136696815491, + "learning_rate": 9.016771836424248e-06, + "loss": 0.8068, + "step": 7458 + }, + { + "epoch": 0.4105344267708735, + "grad_norm": 0.6717063784599304, + "learning_rate": 9.016513691409867e-06, + "loss": 0.738, + "step": 7459 + }, + { + "epoch": 0.4105894655732291, + "grad_norm": 0.6807749271392822, + "learning_rate": 9.016255516208443e-06, + "loss": 0.7842, + "step": 7460 + }, + { + "epoch": 0.4106445043755848, + "grad_norm": 0.6990453600883484, + "learning_rate": 9.01599731082192e-06, + "loss": 0.7726, + "step": 7461 + }, + { + "epoch": 0.41069954317794044, + "grad_norm": 0.6704931259155273, + "learning_rate": 9.015739075252234e-06, + "loss": 0.7006, + "step": 7462 + }, + { + "epoch": 0.41075458198029613, + "grad_norm": 0.7162300944328308, + "learning_rate": 9.01548080950133e-06, + "loss": 0.8462, + "step": 7463 + }, + { + "epoch": 0.41080962078265176, + "grad_norm": 0.6845411658287048, + "learning_rate": 9.015222513571144e-06, + "loss": 0.7466, + "step": 7464 + }, + { + "epoch": 0.41086465958500745, + "grad_norm": 0.7146134376525879, + "learning_rate": 9.014964187463623e-06, + "loss": 0.7594, + "step": 7465 + }, + { + "epoch": 0.4109196983873631, + "grad_norm": 0.7664906978607178, + "learning_rate": 9.014705831180706e-06, + "loss": 0.8376, + "step": 7466 + }, + { + "epoch": 0.4109747371897188, + "grad_norm": 0.7319341897964478, + "learning_rate": 9.014447444724332e-06, + "loss": 0.7748, + "step": 7467 + }, + { + "epoch": 0.4110297759920744, + "grad_norm": 0.7269605398178101, + "learning_rate": 9.014189028096448e-06, + "loss": 0.6941, + "step": 7468 + }, + { + "epoch": 0.4110848147944301, + "grad_norm": 0.72607421875, + "learning_rate": 9.013930581298993e-06, + "loss": 0.7174, + "step": 7469 + }, + { + "epoch": 0.41113985359678573, + "grad_norm": 0.7385421991348267, + "learning_rate": 9.01367210433391e-06, + "loss": 0.7761, + "step": 7470 + }, + { + "epoch": 0.4111948923991414, + "grad_norm": 0.8392042517662048, + "learning_rate": 9.013413597203144e-06, + "loss": 0.7417, + "step": 7471 + }, + { + "epoch": 0.41124993120149705, + "grad_norm": 0.7454584836959839, + "learning_rate": 9.013155059908634e-06, + "loss": 0.8976, + "step": 7472 + }, + { + "epoch": 0.41130497000385274, + "grad_norm": 0.7358037829399109, + "learning_rate": 9.012896492452325e-06, + "loss": 0.7706, + "step": 7473 + }, + { + "epoch": 0.4113600088062084, + "grad_norm": 0.7454121708869934, + "learning_rate": 9.01263789483616e-06, + "loss": 0.7425, + "step": 7474 + }, + { + "epoch": 0.411415047608564, + "grad_norm": 0.7842294573783875, + "learning_rate": 9.012379267062081e-06, + "loss": 0.7739, + "step": 7475 + }, + { + "epoch": 0.4114700864109197, + "grad_norm": 0.7181714773178101, + "learning_rate": 9.012120609132036e-06, + "loss": 0.8466, + "step": 7476 + }, + { + "epoch": 0.41152512521327533, + "grad_norm": 0.7239206433296204, + "learning_rate": 9.011861921047966e-06, + "loss": 0.7493, + "step": 7477 + }, + { + "epoch": 0.411580164015631, + "grad_norm": 0.6773414611816406, + "learning_rate": 9.011603202811816e-06, + "loss": 0.7433, + "step": 7478 + }, + { + "epoch": 0.41163520281798666, + "grad_norm": 0.7770900130271912, + "learning_rate": 9.011344454425527e-06, + "loss": 0.7488, + "step": 7479 + }, + { + "epoch": 0.41169024162034235, + "grad_norm": 0.7305957674980164, + "learning_rate": 9.011085675891051e-06, + "loss": 0.7989, + "step": 7480 + }, + { + "epoch": 0.411745280422698, + "grad_norm": 0.734603762626648, + "learning_rate": 9.010826867210327e-06, + "loss": 0.805, + "step": 7481 + }, + { + "epoch": 0.41180031922505367, + "grad_norm": 0.7438979148864746, + "learning_rate": 9.010568028385303e-06, + "loss": 0.8407, + "step": 7482 + }, + { + "epoch": 0.4118553580274093, + "grad_norm": 0.6718543767929077, + "learning_rate": 9.01030915941792e-06, + "loss": 0.7575, + "step": 7483 + }, + { + "epoch": 0.411910396829765, + "grad_norm": 0.8157614469528198, + "learning_rate": 9.01005026031013e-06, + "loss": 0.8231, + "step": 7484 + }, + { + "epoch": 0.4119654356321206, + "grad_norm": 0.8927714824676514, + "learning_rate": 9.009791331063874e-06, + "loss": 0.808, + "step": 7485 + }, + { + "epoch": 0.4120204744344763, + "grad_norm": 0.7604075074195862, + "learning_rate": 9.009532371681101e-06, + "loss": 0.7505, + "step": 7486 + }, + { + "epoch": 0.41207551323683195, + "grad_norm": 0.6861944794654846, + "learning_rate": 9.009273382163754e-06, + "loss": 0.719, + "step": 7487 + }, + { + "epoch": 0.41213055203918764, + "grad_norm": 0.7043709754943848, + "learning_rate": 9.009014362513784e-06, + "loss": 0.8193, + "step": 7488 + }, + { + "epoch": 0.41218559084154327, + "grad_norm": 0.7459648847579956, + "learning_rate": 9.008755312733136e-06, + "loss": 0.8617, + "step": 7489 + }, + { + "epoch": 0.41224062964389896, + "grad_norm": 0.7272594571113586, + "learning_rate": 9.008496232823754e-06, + "loss": 0.7255, + "step": 7490 + }, + { + "epoch": 0.4122956684462546, + "grad_norm": 0.7486668229103088, + "learning_rate": 9.008237122787586e-06, + "loss": 0.6479, + "step": 7491 + }, + { + "epoch": 0.4123507072486103, + "grad_norm": 0.8149027228355408, + "learning_rate": 9.007977982626582e-06, + "loss": 0.8052, + "step": 7492 + }, + { + "epoch": 0.4124057460509659, + "grad_norm": 0.7054859399795532, + "learning_rate": 9.00771881234269e-06, + "loss": 0.8215, + "step": 7493 + }, + { + "epoch": 0.4124607848533216, + "grad_norm": 0.6840499639511108, + "learning_rate": 9.007459611937854e-06, + "loss": 0.776, + "step": 7494 + }, + { + "epoch": 0.41251582365567724, + "grad_norm": 0.7340932488441467, + "learning_rate": 9.007200381414026e-06, + "loss": 0.713, + "step": 7495 + }, + { + "epoch": 0.4125708624580329, + "grad_norm": 0.8282599449157715, + "learning_rate": 9.00694112077315e-06, + "loss": 0.7037, + "step": 7496 + }, + { + "epoch": 0.41262590126038856, + "grad_norm": 0.849588930606842, + "learning_rate": 9.00668183001718e-06, + "loss": 0.7845, + "step": 7497 + }, + { + "epoch": 0.41268094006274425, + "grad_norm": 0.8330783843994141, + "learning_rate": 9.00642250914806e-06, + "loss": 0.9049, + "step": 7498 + }, + { + "epoch": 0.4127359788650999, + "grad_norm": 0.7020101547241211, + "learning_rate": 9.00616315816774e-06, + "loss": 0.8146, + "step": 7499 + }, + { + "epoch": 0.4127910176674556, + "grad_norm": 0.7632037997245789, + "learning_rate": 9.005903777078173e-06, + "loss": 0.6629, + "step": 7500 + }, + { + "epoch": 0.4128460564698112, + "grad_norm": 0.7286840081214905, + "learning_rate": 9.005644365881304e-06, + "loss": 0.7795, + "step": 7501 + }, + { + "epoch": 0.4129010952721669, + "grad_norm": 0.710451066493988, + "learning_rate": 9.005384924579084e-06, + "loss": 0.7615, + "step": 7502 + }, + { + "epoch": 0.41295613407452253, + "grad_norm": 0.7657510042190552, + "learning_rate": 9.005125453173463e-06, + "loss": 0.8938, + "step": 7503 + }, + { + "epoch": 0.4130111728768782, + "grad_norm": 0.6978467702865601, + "learning_rate": 9.004865951666392e-06, + "loss": 0.7464, + "step": 7504 + }, + { + "epoch": 0.41306621167923385, + "grad_norm": 0.7028319835662842, + "learning_rate": 9.00460642005982e-06, + "loss": 0.7899, + "step": 7505 + }, + { + "epoch": 0.41312125048158954, + "grad_norm": 0.923951268196106, + "learning_rate": 9.004346858355698e-06, + "loss": 0.8851, + "step": 7506 + }, + { + "epoch": 0.4131762892839452, + "grad_norm": 0.7293704748153687, + "learning_rate": 9.004087266555978e-06, + "loss": 0.7594, + "step": 7507 + }, + { + "epoch": 0.41323132808630086, + "grad_norm": 0.7458868622779846, + "learning_rate": 9.003827644662608e-06, + "loss": 0.7538, + "step": 7508 + }, + { + "epoch": 0.4132863668886565, + "grad_norm": 0.6764113306999207, + "learning_rate": 9.003567992677543e-06, + "loss": 0.7303, + "step": 7509 + }, + { + "epoch": 0.4133414056910122, + "grad_norm": 0.7827350497245789, + "learning_rate": 9.003308310602732e-06, + "loss": 0.7708, + "step": 7510 + }, + { + "epoch": 0.4133964444933678, + "grad_norm": 0.7683281302452087, + "learning_rate": 9.003048598440127e-06, + "loss": 0.7971, + "step": 7511 + }, + { + "epoch": 0.4134514832957235, + "grad_norm": 0.8793813586235046, + "learning_rate": 9.002788856191679e-06, + "loss": 0.7434, + "step": 7512 + }, + { + "epoch": 0.41350652209807914, + "grad_norm": 0.6598063111305237, + "learning_rate": 9.002529083859343e-06, + "loss": 0.7082, + "step": 7513 + }, + { + "epoch": 0.41356156090043483, + "grad_norm": 0.8239839673042297, + "learning_rate": 9.002269281445071e-06, + "loss": 0.8457, + "step": 7514 + }, + { + "epoch": 0.41361659970279047, + "grad_norm": 0.7433123588562012, + "learning_rate": 9.002009448950812e-06, + "loss": 0.7399, + "step": 7515 + }, + { + "epoch": 0.41367163850514616, + "grad_norm": 0.8310487866401672, + "learning_rate": 9.001749586378524e-06, + "loss": 0.7482, + "step": 7516 + }, + { + "epoch": 0.4137266773075018, + "grad_norm": 0.7170824408531189, + "learning_rate": 9.001489693730155e-06, + "loss": 0.7856, + "step": 7517 + }, + { + "epoch": 0.4137817161098574, + "grad_norm": 0.9063520431518555, + "learning_rate": 9.00122977100766e-06, + "loss": 0.8623, + "step": 7518 + }, + { + "epoch": 0.4138367549122131, + "grad_norm": 0.8753733038902283, + "learning_rate": 9.000969818212996e-06, + "loss": 0.7875, + "step": 7519 + }, + { + "epoch": 0.41389179371456875, + "grad_norm": 0.7013519406318665, + "learning_rate": 9.000709835348112e-06, + "loss": 0.724, + "step": 7520 + }, + { + "epoch": 0.41394683251692443, + "grad_norm": 0.7385973334312439, + "learning_rate": 9.000449822414963e-06, + "loss": 0.7286, + "step": 7521 + }, + { + "epoch": 0.41400187131928007, + "grad_norm": 0.7605431079864502, + "learning_rate": 9.000189779415505e-06, + "loss": 0.728, + "step": 7522 + }, + { + "epoch": 0.41405691012163576, + "grad_norm": 0.7631710767745972, + "learning_rate": 8.99992970635169e-06, + "loss": 0.8276, + "step": 7523 + }, + { + "epoch": 0.4141119489239914, + "grad_norm": 0.8066657185554504, + "learning_rate": 8.999669603225477e-06, + "loss": 0.8319, + "step": 7524 + }, + { + "epoch": 0.4141669877263471, + "grad_norm": 0.689407229423523, + "learning_rate": 8.999409470038815e-06, + "loss": 0.6675, + "step": 7525 + }, + { + "epoch": 0.4142220265287027, + "grad_norm": 0.7391255497932434, + "learning_rate": 8.999149306793664e-06, + "loss": 0.8228, + "step": 7526 + }, + { + "epoch": 0.4142770653310584, + "grad_norm": 0.7208844423294067, + "learning_rate": 8.998889113491977e-06, + "loss": 0.7689, + "step": 7527 + }, + { + "epoch": 0.41433210413341404, + "grad_norm": 0.8278803825378418, + "learning_rate": 8.99862889013571e-06, + "loss": 0.7964, + "step": 7528 + }, + { + "epoch": 0.4143871429357697, + "grad_norm": 0.7287253141403198, + "learning_rate": 8.998368636726817e-06, + "loss": 0.7689, + "step": 7529 + }, + { + "epoch": 0.41444218173812536, + "grad_norm": 0.7159145474433899, + "learning_rate": 8.998108353267257e-06, + "loss": 0.7537, + "step": 7530 + }, + { + "epoch": 0.41449722054048105, + "grad_norm": 0.7605739235877991, + "learning_rate": 8.997848039758985e-06, + "loss": 0.7327, + "step": 7531 + }, + { + "epoch": 0.4145522593428367, + "grad_norm": 0.7290406227111816, + "learning_rate": 8.997587696203958e-06, + "loss": 0.6804, + "step": 7532 + }, + { + "epoch": 0.41460729814519237, + "grad_norm": 0.7613189816474915, + "learning_rate": 8.997327322604131e-06, + "loss": 0.7465, + "step": 7533 + }, + { + "epoch": 0.414662336947548, + "grad_norm": 0.7796703577041626, + "learning_rate": 8.99706691896146e-06, + "loss": 0.7444, + "step": 7534 + }, + { + "epoch": 0.4147173757499037, + "grad_norm": 0.8758549094200134, + "learning_rate": 8.996806485277904e-06, + "loss": 0.8586, + "step": 7535 + }, + { + "epoch": 0.4147724145522593, + "grad_norm": 0.9599420428276062, + "learning_rate": 8.996546021555423e-06, + "loss": 0.7554, + "step": 7536 + }, + { + "epoch": 0.414827453354615, + "grad_norm": 0.8216326236724854, + "learning_rate": 8.996285527795972e-06, + "loss": 0.7995, + "step": 7537 + }, + { + "epoch": 0.41488249215697065, + "grad_norm": 0.6777452230453491, + "learning_rate": 8.996025004001507e-06, + "loss": 0.7809, + "step": 7538 + }, + { + "epoch": 0.41493753095932634, + "grad_norm": 0.7354100942611694, + "learning_rate": 8.995764450173989e-06, + "loss": 0.6548, + "step": 7539 + }, + { + "epoch": 0.414992569761682, + "grad_norm": 0.7548280358314514, + "learning_rate": 8.995503866315373e-06, + "loss": 0.8308, + "step": 7540 + }, + { + "epoch": 0.41504760856403766, + "grad_norm": 0.6891447901725769, + "learning_rate": 8.995243252427622e-06, + "loss": 0.8386, + "step": 7541 + }, + { + "epoch": 0.4151026473663933, + "grad_norm": 0.6848340034484863, + "learning_rate": 8.99498260851269e-06, + "loss": 0.7587, + "step": 7542 + }, + { + "epoch": 0.415157686168749, + "grad_norm": 0.7109090685844421, + "learning_rate": 8.994721934572538e-06, + "loss": 0.6847, + "step": 7543 + }, + { + "epoch": 0.4152127249711046, + "grad_norm": 0.6708144545555115, + "learning_rate": 8.994461230609128e-06, + "loss": 0.7266, + "step": 7544 + }, + { + "epoch": 0.4152677637734603, + "grad_norm": 0.6985414028167725, + "learning_rate": 8.994200496624415e-06, + "loss": 0.7696, + "step": 7545 + }, + { + "epoch": 0.41532280257581594, + "grad_norm": 0.6989198923110962, + "learning_rate": 8.993939732620359e-06, + "loss": 0.7894, + "step": 7546 + }, + { + "epoch": 0.41537784137817163, + "grad_norm": 0.6667589545249939, + "learning_rate": 8.993678938598921e-06, + "loss": 0.7417, + "step": 7547 + }, + { + "epoch": 0.41543288018052726, + "grad_norm": 1.0692487955093384, + "learning_rate": 8.993418114562064e-06, + "loss": 0.7147, + "step": 7548 + }, + { + "epoch": 0.41548791898288295, + "grad_norm": 0.6709207892417908, + "learning_rate": 8.993157260511742e-06, + "loss": 0.7694, + "step": 7549 + }, + { + "epoch": 0.4155429577852386, + "grad_norm": 0.6714604496955872, + "learning_rate": 8.992896376449923e-06, + "loss": 0.6969, + "step": 7550 + }, + { + "epoch": 0.4155979965875943, + "grad_norm": 0.8266897201538086, + "learning_rate": 8.99263546237856e-06, + "loss": 0.8392, + "step": 7551 + }, + { + "epoch": 0.4156530353899499, + "grad_norm": 0.675188422203064, + "learning_rate": 8.992374518299619e-06, + "loss": 0.7525, + "step": 7552 + }, + { + "epoch": 0.4157080741923056, + "grad_norm": 0.7406265139579773, + "learning_rate": 8.992113544215059e-06, + "loss": 0.7895, + "step": 7553 + }, + { + "epoch": 0.41576311299466123, + "grad_norm": 0.837336003780365, + "learning_rate": 8.991852540126844e-06, + "loss": 0.7376, + "step": 7554 + }, + { + "epoch": 0.4158181517970169, + "grad_norm": 0.6774994730949402, + "learning_rate": 8.991591506036931e-06, + "loss": 0.7231, + "step": 7555 + }, + { + "epoch": 0.41587319059937256, + "grad_norm": 0.6941245794296265, + "learning_rate": 8.991330441947287e-06, + "loss": 0.7213, + "step": 7556 + }, + { + "epoch": 0.41592822940172824, + "grad_norm": 0.7588210105895996, + "learning_rate": 8.991069347859871e-06, + "loss": 0.7829, + "step": 7557 + }, + { + "epoch": 0.4159832682040839, + "grad_norm": 0.7580196857452393, + "learning_rate": 8.990808223776647e-06, + "loss": 0.7782, + "step": 7558 + }, + { + "epoch": 0.41603830700643957, + "grad_norm": 0.7597478032112122, + "learning_rate": 8.990547069699576e-06, + "loss": 0.7764, + "step": 7559 + }, + { + "epoch": 0.4160933458087952, + "grad_norm": 0.7950314283370972, + "learning_rate": 8.990285885630622e-06, + "loss": 0.7263, + "step": 7560 + }, + { + "epoch": 0.41614838461115083, + "grad_norm": 0.6962432265281677, + "learning_rate": 8.990024671571747e-06, + "loss": 0.6616, + "step": 7561 + }, + { + "epoch": 0.4162034234135065, + "grad_norm": 0.682816207408905, + "learning_rate": 8.989763427524915e-06, + "loss": 0.7862, + "step": 7562 + }, + { + "epoch": 0.41625846221586216, + "grad_norm": 0.686673104763031, + "learning_rate": 8.989502153492089e-06, + "loss": 0.8199, + "step": 7563 + }, + { + "epoch": 0.41631350101821785, + "grad_norm": 0.7954965233802795, + "learning_rate": 8.989240849475231e-06, + "loss": 0.8021, + "step": 7564 + }, + { + "epoch": 0.4163685398205735, + "grad_norm": 0.7516284584999084, + "learning_rate": 8.988979515476309e-06, + "loss": 0.7803, + "step": 7565 + }, + { + "epoch": 0.41642357862292917, + "grad_norm": 0.7148317694664001, + "learning_rate": 8.988718151497284e-06, + "loss": 0.7407, + "step": 7566 + }, + { + "epoch": 0.4164786174252848, + "grad_norm": 0.7898986339569092, + "learning_rate": 8.98845675754012e-06, + "loss": 0.8382, + "step": 7567 + }, + { + "epoch": 0.4165336562276405, + "grad_norm": 0.7014235854148865, + "learning_rate": 8.988195333606784e-06, + "loss": 0.7205, + "step": 7568 + }, + { + "epoch": 0.4165886950299961, + "grad_norm": 0.6520957350730896, + "learning_rate": 8.987933879699238e-06, + "loss": 0.7452, + "step": 7569 + }, + { + "epoch": 0.4166437338323518, + "grad_norm": 0.7462863922119141, + "learning_rate": 8.987672395819449e-06, + "loss": 0.7787, + "step": 7570 + }, + { + "epoch": 0.41669877263470745, + "grad_norm": 0.7366049885749817, + "learning_rate": 8.987410881969382e-06, + "loss": 0.7662, + "step": 7571 + }, + { + "epoch": 0.41675381143706314, + "grad_norm": 0.7732293009757996, + "learning_rate": 8.987149338151002e-06, + "loss": 0.8258, + "step": 7572 + }, + { + "epoch": 0.41680885023941877, + "grad_norm": 0.9309358596801758, + "learning_rate": 8.986887764366275e-06, + "loss": 0.6538, + "step": 7573 + }, + { + "epoch": 0.41686388904177446, + "grad_norm": 0.6976680755615234, + "learning_rate": 8.986626160617167e-06, + "loss": 0.7175, + "step": 7574 + }, + { + "epoch": 0.4169189278441301, + "grad_norm": 0.7541783452033997, + "learning_rate": 8.986364526905645e-06, + "loss": 0.8153, + "step": 7575 + }, + { + "epoch": 0.4169739666464858, + "grad_norm": 0.8968943357467651, + "learning_rate": 8.986102863233673e-06, + "loss": 0.7859, + "step": 7576 + }, + { + "epoch": 0.4170290054488414, + "grad_norm": 0.6910044550895691, + "learning_rate": 8.985841169603218e-06, + "loss": 0.8381, + "step": 7577 + }, + { + "epoch": 0.4170840442511971, + "grad_norm": 0.8944257497787476, + "learning_rate": 8.985579446016249e-06, + "loss": 0.7062, + "step": 7578 + }, + { + "epoch": 0.41713908305355274, + "grad_norm": 0.6665629744529724, + "learning_rate": 8.98531769247473e-06, + "loss": 0.7928, + "step": 7579 + }, + { + "epoch": 0.41719412185590843, + "grad_norm": 0.7642979621887207, + "learning_rate": 8.985055908980634e-06, + "loss": 0.8442, + "step": 7580 + }, + { + "epoch": 0.41724916065826406, + "grad_norm": 0.7575559020042419, + "learning_rate": 8.98479409553592e-06, + "loss": 0.795, + "step": 7581 + }, + { + "epoch": 0.41730419946061975, + "grad_norm": 0.6567206978797913, + "learning_rate": 8.984532252142563e-06, + "loss": 0.713, + "step": 7582 + }, + { + "epoch": 0.4173592382629754, + "grad_norm": 0.6677179336547852, + "learning_rate": 8.984270378802527e-06, + "loss": 0.8173, + "step": 7583 + }, + { + "epoch": 0.4174142770653311, + "grad_norm": 0.6846007704734802, + "learning_rate": 8.984008475517782e-06, + "loss": 0.7154, + "step": 7584 + }, + { + "epoch": 0.4174693158676867, + "grad_norm": 0.7758762836456299, + "learning_rate": 8.983746542290294e-06, + "loss": 0.8686, + "step": 7585 + }, + { + "epoch": 0.4175243546700424, + "grad_norm": 0.6850305199623108, + "learning_rate": 8.983484579122036e-06, + "loss": 0.7568, + "step": 7586 + }, + { + "epoch": 0.41757939347239803, + "grad_norm": 0.7165307998657227, + "learning_rate": 8.983222586014973e-06, + "loss": 0.7856, + "step": 7587 + }, + { + "epoch": 0.4176344322747537, + "grad_norm": 0.7747449278831482, + "learning_rate": 8.982960562971074e-06, + "loss": 0.8148, + "step": 7588 + }, + { + "epoch": 0.41768947107710935, + "grad_norm": 0.789235532283783, + "learning_rate": 8.982698509992311e-06, + "loss": 0.8021, + "step": 7589 + }, + { + "epoch": 0.41774450987946504, + "grad_norm": 0.664186954498291, + "learning_rate": 8.982436427080652e-06, + "loss": 0.7394, + "step": 7590 + }, + { + "epoch": 0.4177995486818207, + "grad_norm": 0.7045899033546448, + "learning_rate": 8.982174314238069e-06, + "loss": 0.7029, + "step": 7591 + }, + { + "epoch": 0.41785458748417637, + "grad_norm": 0.7569751739501953, + "learning_rate": 8.981912171466525e-06, + "loss": 0.6106, + "step": 7592 + }, + { + "epoch": 0.417909626286532, + "grad_norm": 0.7383938431739807, + "learning_rate": 8.981649998767998e-06, + "loss": 0.8163, + "step": 7593 + }, + { + "epoch": 0.4179646650888877, + "grad_norm": 0.7314342856407166, + "learning_rate": 8.981387796144456e-06, + "loss": 0.6847, + "step": 7594 + }, + { + "epoch": 0.4180197038912433, + "grad_norm": 0.7249840497970581, + "learning_rate": 8.981125563597867e-06, + "loss": 0.8025, + "step": 7595 + }, + { + "epoch": 0.418074742693599, + "grad_norm": 0.7260022759437561, + "learning_rate": 8.980863301130206e-06, + "loss": 0.7807, + "step": 7596 + }, + { + "epoch": 0.41812978149595464, + "grad_norm": 0.6249421834945679, + "learning_rate": 8.980601008743441e-06, + "loss": 0.6744, + "step": 7597 + }, + { + "epoch": 0.41818482029831033, + "grad_norm": 0.8132835626602173, + "learning_rate": 8.980338686439544e-06, + "loss": 0.7992, + "step": 7598 + }, + { + "epoch": 0.41823985910066597, + "grad_norm": 0.7279506921768188, + "learning_rate": 8.980076334220487e-06, + "loss": 0.8402, + "step": 7599 + }, + { + "epoch": 0.41829489790302166, + "grad_norm": 0.7168325781822205, + "learning_rate": 8.979813952088242e-06, + "loss": 0.9107, + "step": 7600 + }, + { + "epoch": 0.4183499367053773, + "grad_norm": 0.633661150932312, + "learning_rate": 8.97955154004478e-06, + "loss": 0.6328, + "step": 7601 + }, + { + "epoch": 0.418404975507733, + "grad_norm": 0.6770638227462769, + "learning_rate": 8.979289098092074e-06, + "loss": 0.7604, + "step": 7602 + }, + { + "epoch": 0.4184600143100886, + "grad_norm": 0.7589067816734314, + "learning_rate": 8.979026626232098e-06, + "loss": 0.7774, + "step": 7603 + }, + { + "epoch": 0.41851505311244425, + "grad_norm": 0.7116312980651855, + "learning_rate": 8.97876412446682e-06, + "loss": 0.8186, + "step": 7604 + }, + { + "epoch": 0.41857009191479994, + "grad_norm": 0.7369259595870972, + "learning_rate": 8.978501592798219e-06, + "loss": 0.6705, + "step": 7605 + }, + { + "epoch": 0.41862513071715557, + "grad_norm": 0.6201806664466858, + "learning_rate": 8.978239031228265e-06, + "loss": 0.7011, + "step": 7606 + }, + { + "epoch": 0.41868016951951126, + "grad_norm": 0.7652842998504639, + "learning_rate": 8.977976439758929e-06, + "loss": 0.8112, + "step": 7607 + }, + { + "epoch": 0.4187352083218669, + "grad_norm": 0.7214640974998474, + "learning_rate": 8.97771381839219e-06, + "loss": 0.767, + "step": 7608 + }, + { + "epoch": 0.4187902471242226, + "grad_norm": 0.8093706369400024, + "learning_rate": 8.977451167130015e-06, + "loss": 0.8112, + "step": 7609 + }, + { + "epoch": 0.4188452859265782, + "grad_norm": 0.7023005485534668, + "learning_rate": 8.977188485974382e-06, + "loss": 0.7678, + "step": 7610 + }, + { + "epoch": 0.4189003247289339, + "grad_norm": 0.8126183748245239, + "learning_rate": 8.976925774927267e-06, + "loss": 0.8207, + "step": 7611 + }, + { + "epoch": 0.41895536353128954, + "grad_norm": 0.9624595642089844, + "learning_rate": 8.976663033990643e-06, + "loss": 0.7853, + "step": 7612 + }, + { + "epoch": 0.4190104023336452, + "grad_norm": 0.7866421937942505, + "learning_rate": 8.976400263166483e-06, + "loss": 0.6319, + "step": 7613 + }, + { + "epoch": 0.41906544113600086, + "grad_norm": 0.7555810213088989, + "learning_rate": 8.976137462456762e-06, + "loss": 0.7781, + "step": 7614 + }, + { + "epoch": 0.41912047993835655, + "grad_norm": 0.7383303046226501, + "learning_rate": 8.975874631863457e-06, + "loss": 0.8152, + "step": 7615 + }, + { + "epoch": 0.4191755187407122, + "grad_norm": 0.7873355746269226, + "learning_rate": 8.975611771388542e-06, + "loss": 0.723, + "step": 7616 + }, + { + "epoch": 0.41923055754306787, + "grad_norm": 0.7265962362289429, + "learning_rate": 8.975348881033993e-06, + "loss": 0.8016, + "step": 7617 + }, + { + "epoch": 0.4192855963454235, + "grad_norm": 0.7074393033981323, + "learning_rate": 8.975085960801788e-06, + "loss": 0.7453, + "step": 7618 + }, + { + "epoch": 0.4193406351477792, + "grad_norm": 0.6975581049919128, + "learning_rate": 8.9748230106939e-06, + "loss": 0.6516, + "step": 7619 + }, + { + "epoch": 0.41939567395013483, + "grad_norm": 0.7730469107627869, + "learning_rate": 8.974560030712304e-06, + "loss": 0.7297, + "step": 7620 + }, + { + "epoch": 0.4194507127524905, + "grad_norm": 0.7289026379585266, + "learning_rate": 8.974297020858982e-06, + "loss": 0.7087, + "step": 7621 + }, + { + "epoch": 0.41950575155484615, + "grad_norm": 0.8029256463050842, + "learning_rate": 8.974033981135906e-06, + "loss": 0.7923, + "step": 7622 + }, + { + "epoch": 0.41956079035720184, + "grad_norm": 0.765312135219574, + "learning_rate": 8.973770911545055e-06, + "loss": 0.7824, + "step": 7623 + }, + { + "epoch": 0.4196158291595575, + "grad_norm": 0.7903861403465271, + "learning_rate": 8.973507812088404e-06, + "loss": 0.8207, + "step": 7624 + }, + { + "epoch": 0.41967086796191316, + "grad_norm": 0.6875497698783875, + "learning_rate": 8.973244682767934e-06, + "loss": 0.7972, + "step": 7625 + }, + { + "epoch": 0.4197259067642688, + "grad_norm": 0.7781878709793091, + "learning_rate": 8.972981523585617e-06, + "loss": 0.754, + "step": 7626 + }, + { + "epoch": 0.4197809455666245, + "grad_norm": 0.6495640873908997, + "learning_rate": 8.972718334543437e-06, + "loss": 0.6851, + "step": 7627 + }, + { + "epoch": 0.4198359843689801, + "grad_norm": 0.7610780596733093, + "learning_rate": 8.97245511564337e-06, + "loss": 0.8161, + "step": 7628 + }, + { + "epoch": 0.4198910231713358, + "grad_norm": 0.7764771580696106, + "learning_rate": 8.972191866887393e-06, + "loss": 0.8341, + "step": 7629 + }, + { + "epoch": 0.41994606197369144, + "grad_norm": 0.7709774374961853, + "learning_rate": 8.971928588277485e-06, + "loss": 0.765, + "step": 7630 + }, + { + "epoch": 0.42000110077604713, + "grad_norm": 0.8213009238243103, + "learning_rate": 8.971665279815625e-06, + "loss": 0.8971, + "step": 7631 + }, + { + "epoch": 0.42005613957840277, + "grad_norm": 0.7232406735420227, + "learning_rate": 8.971401941503792e-06, + "loss": 0.7919, + "step": 7632 + }, + { + "epoch": 0.42011117838075845, + "grad_norm": 0.7322028279304504, + "learning_rate": 8.971138573343964e-06, + "loss": 0.8167, + "step": 7633 + }, + { + "epoch": 0.4201662171831141, + "grad_norm": 0.7204442024230957, + "learning_rate": 8.970875175338123e-06, + "loss": 0.8152, + "step": 7634 + }, + { + "epoch": 0.4202212559854698, + "grad_norm": 0.7385342121124268, + "learning_rate": 8.970611747488246e-06, + "loss": 0.8204, + "step": 7635 + }, + { + "epoch": 0.4202762947878254, + "grad_norm": 0.758941113948822, + "learning_rate": 8.970348289796316e-06, + "loss": 0.8402, + "step": 7636 + }, + { + "epoch": 0.4203313335901811, + "grad_norm": 0.7331902384757996, + "learning_rate": 8.970084802264309e-06, + "loss": 0.7305, + "step": 7637 + }, + { + "epoch": 0.42038637239253673, + "grad_norm": 0.7822885513305664, + "learning_rate": 8.969821284894208e-06, + "loss": 0.8708, + "step": 7638 + }, + { + "epoch": 0.4204414111948924, + "grad_norm": 0.6625984311103821, + "learning_rate": 8.969557737687992e-06, + "loss": 0.7806, + "step": 7639 + }, + { + "epoch": 0.42049644999724806, + "grad_norm": 1.02848482131958, + "learning_rate": 8.969294160647645e-06, + "loss": 0.7176, + "step": 7640 + }, + { + "epoch": 0.42055148879960375, + "grad_norm": 0.7888724207878113, + "learning_rate": 8.969030553775144e-06, + "loss": 0.8326, + "step": 7641 + }, + { + "epoch": 0.4206065276019594, + "grad_norm": 0.7148883938789368, + "learning_rate": 8.968766917072472e-06, + "loss": 0.7405, + "step": 7642 + }, + { + "epoch": 0.42066156640431507, + "grad_norm": 0.6629698872566223, + "learning_rate": 8.96850325054161e-06, + "loss": 0.845, + "step": 7643 + }, + { + "epoch": 0.4207166052066707, + "grad_norm": 0.8414682149887085, + "learning_rate": 8.96823955418454e-06, + "loss": 1.3631, + "step": 7644 + }, + { + "epoch": 0.4207716440090264, + "grad_norm": 0.7105298638343811, + "learning_rate": 8.967975828003244e-06, + "loss": 0.6808, + "step": 7645 + }, + { + "epoch": 0.420826682811382, + "grad_norm": 0.7324852347373962, + "learning_rate": 8.967712071999703e-06, + "loss": 0.8237, + "step": 7646 + }, + { + "epoch": 0.42088172161373766, + "grad_norm": 0.737324595451355, + "learning_rate": 8.9674482861759e-06, + "loss": 0.8486, + "step": 7647 + }, + { + "epoch": 0.42093676041609335, + "grad_norm": 0.6763800382614136, + "learning_rate": 8.967184470533818e-06, + "loss": 0.72, + "step": 7648 + }, + { + "epoch": 0.420991799218449, + "grad_norm": 0.7560757994651794, + "learning_rate": 8.96692062507544e-06, + "loss": 0.7704, + "step": 7649 + }, + { + "epoch": 0.42104683802080467, + "grad_norm": 0.7289260029792786, + "learning_rate": 8.966656749802748e-06, + "loss": 0.7411, + "step": 7650 + }, + { + "epoch": 0.4211018768231603, + "grad_norm": 0.6935442686080933, + "learning_rate": 8.966392844717726e-06, + "loss": 0.7848, + "step": 7651 + }, + { + "epoch": 0.421156915625516, + "grad_norm": 0.7111918330192566, + "learning_rate": 8.966128909822356e-06, + "loss": 0.8377, + "step": 7652 + }, + { + "epoch": 0.4212119544278716, + "grad_norm": 0.8594884872436523, + "learning_rate": 8.965864945118625e-06, + "loss": 0.8227, + "step": 7653 + }, + { + "epoch": 0.4212669932302273, + "grad_norm": 0.6521008014678955, + "learning_rate": 8.965600950608513e-06, + "loss": 0.7034, + "step": 7654 + }, + { + "epoch": 0.42132203203258295, + "grad_norm": 0.6362404823303223, + "learning_rate": 8.965336926294007e-06, + "loss": 0.6712, + "step": 7655 + }, + { + "epoch": 0.42137707083493864, + "grad_norm": 0.6955040097236633, + "learning_rate": 8.965072872177088e-06, + "loss": 0.7789, + "step": 7656 + }, + { + "epoch": 0.42143210963729427, + "grad_norm": 0.7311720252037048, + "learning_rate": 8.964808788259745e-06, + "loss": 0.7522, + "step": 7657 + }, + { + "epoch": 0.42148714843964996, + "grad_norm": 0.781131386756897, + "learning_rate": 8.96454467454396e-06, + "loss": 0.7831, + "step": 7658 + }, + { + "epoch": 0.4215421872420056, + "grad_norm": 0.6740639805793762, + "learning_rate": 8.964280531031718e-06, + "loss": 0.7102, + "step": 7659 + }, + { + "epoch": 0.4215972260443613, + "grad_norm": 0.7843424677848816, + "learning_rate": 8.964016357725003e-06, + "loss": 0.8325, + "step": 7660 + }, + { + "epoch": 0.4216522648467169, + "grad_norm": 0.7833517789840698, + "learning_rate": 8.963752154625804e-06, + "loss": 0.8603, + "step": 7661 + }, + { + "epoch": 0.4217073036490726, + "grad_norm": 0.7270992994308472, + "learning_rate": 8.963487921736104e-06, + "loss": 0.745, + "step": 7662 + }, + { + "epoch": 0.42176234245142824, + "grad_norm": 0.6517582535743713, + "learning_rate": 8.963223659057892e-06, + "loss": 0.6983, + "step": 7663 + }, + { + "epoch": 0.42181738125378393, + "grad_norm": 0.6974934935569763, + "learning_rate": 8.962959366593149e-06, + "loss": 0.733, + "step": 7664 + }, + { + "epoch": 0.42187242005613956, + "grad_norm": 0.712045431137085, + "learning_rate": 8.962695044343865e-06, + "loss": 0.725, + "step": 7665 + }, + { + "epoch": 0.42192745885849525, + "grad_norm": 0.7311459183692932, + "learning_rate": 8.962430692312028e-06, + "loss": 0.8025, + "step": 7666 + }, + { + "epoch": 0.4219824976608509, + "grad_norm": 0.7439966201782227, + "learning_rate": 8.962166310499621e-06, + "loss": 0.7711, + "step": 7667 + }, + { + "epoch": 0.4220375364632066, + "grad_norm": 0.690832257270813, + "learning_rate": 8.961901898908632e-06, + "loss": 0.8414, + "step": 7668 + }, + { + "epoch": 0.4220925752655622, + "grad_norm": 0.8437964916229248, + "learning_rate": 8.961637457541049e-06, + "loss": 0.8253, + "step": 7669 + }, + { + "epoch": 0.4221476140679179, + "grad_norm": 0.7876344323158264, + "learning_rate": 8.96137298639886e-06, + "loss": 0.754, + "step": 7670 + }, + { + "epoch": 0.42220265287027353, + "grad_norm": 0.7551780343055725, + "learning_rate": 8.961108485484052e-06, + "loss": 0.8555, + "step": 7671 + }, + { + "epoch": 0.4222576916726292, + "grad_norm": 0.6867276430130005, + "learning_rate": 8.96084395479861e-06, + "loss": 0.7216, + "step": 7672 + }, + { + "epoch": 0.42231273047498485, + "grad_norm": 0.9052873849868774, + "learning_rate": 8.960579394344528e-06, + "loss": 0.7945, + "step": 7673 + }, + { + "epoch": 0.42236776927734054, + "grad_norm": 0.6731994152069092, + "learning_rate": 8.96031480412379e-06, + "loss": 0.7691, + "step": 7674 + }, + { + "epoch": 0.4224228080796962, + "grad_norm": 0.7074670195579529, + "learning_rate": 8.960050184138389e-06, + "loss": 0.8008, + "step": 7675 + }, + { + "epoch": 0.42247784688205187, + "grad_norm": 0.9482604265213013, + "learning_rate": 8.959785534390309e-06, + "loss": 0.7095, + "step": 7676 + }, + { + "epoch": 0.4225328856844075, + "grad_norm": 0.6915413737297058, + "learning_rate": 8.95952085488154e-06, + "loss": 0.6717, + "step": 7677 + }, + { + "epoch": 0.4225879244867632, + "grad_norm": 0.7565900087356567, + "learning_rate": 8.959256145614073e-06, + "loss": 0.8311, + "step": 7678 + }, + { + "epoch": 0.4226429632891188, + "grad_norm": 0.8307167887687683, + "learning_rate": 8.958991406589896e-06, + "loss": 0.8585, + "step": 7679 + }, + { + "epoch": 0.4226980020914745, + "grad_norm": 0.7955091595649719, + "learning_rate": 8.958726637811e-06, + "loss": 0.8154, + "step": 7680 + }, + { + "epoch": 0.42275304089383015, + "grad_norm": 0.7692292332649231, + "learning_rate": 8.958461839279376e-06, + "loss": 0.7965, + "step": 7681 + }, + { + "epoch": 0.42280807969618583, + "grad_norm": 0.7355942726135254, + "learning_rate": 8.95819701099701e-06, + "loss": 0.7557, + "step": 7682 + }, + { + "epoch": 0.42286311849854147, + "grad_norm": 0.8781518936157227, + "learning_rate": 8.957932152965895e-06, + "loss": 0.8033, + "step": 7683 + }, + { + "epoch": 0.42291815730089716, + "grad_norm": 0.7180802226066589, + "learning_rate": 8.957667265188022e-06, + "loss": 0.7283, + "step": 7684 + }, + { + "epoch": 0.4229731961032528, + "grad_norm": 0.6967236995697021, + "learning_rate": 8.95740234766538e-06, + "loss": 0.769, + "step": 7685 + }, + { + "epoch": 0.4230282349056085, + "grad_norm": 0.7462503910064697, + "learning_rate": 8.957137400399963e-06, + "loss": 0.8179, + "step": 7686 + }, + { + "epoch": 0.4230832737079641, + "grad_norm": 0.67714524269104, + "learning_rate": 8.956872423393761e-06, + "loss": 0.7976, + "step": 7687 + }, + { + "epoch": 0.4231383125103198, + "grad_norm": 0.8239946365356445, + "learning_rate": 8.956607416648763e-06, + "loss": 0.7946, + "step": 7688 + }, + { + "epoch": 0.42319335131267544, + "grad_norm": 0.6724610924720764, + "learning_rate": 8.956342380166963e-06, + "loss": 0.7633, + "step": 7689 + }, + { + "epoch": 0.42324839011503107, + "grad_norm": 0.744987964630127, + "learning_rate": 8.956077313950354e-06, + "loss": 0.9028, + "step": 7690 + }, + { + "epoch": 0.42330342891738676, + "grad_norm": 0.7700596451759338, + "learning_rate": 8.955812218000925e-06, + "loss": 0.8954, + "step": 7691 + }, + { + "epoch": 0.4233584677197424, + "grad_norm": 0.6952996253967285, + "learning_rate": 8.955547092320673e-06, + "loss": 0.8094, + "step": 7692 + }, + { + "epoch": 0.4234135065220981, + "grad_norm": 0.6410536766052246, + "learning_rate": 8.955281936911586e-06, + "loss": 0.6281, + "step": 7693 + }, + { + "epoch": 0.4234685453244537, + "grad_norm": 1.0939754247665405, + "learning_rate": 8.95501675177566e-06, + "loss": 0.8239, + "step": 7694 + }, + { + "epoch": 0.4235235841268094, + "grad_norm": 0.7419464588165283, + "learning_rate": 8.954751536914885e-06, + "loss": 0.8015, + "step": 7695 + }, + { + "epoch": 0.42357862292916504, + "grad_norm": 0.8171356320381165, + "learning_rate": 8.954486292331257e-06, + "loss": 0.8183, + "step": 7696 + }, + { + "epoch": 0.4236336617315207, + "grad_norm": 0.745884358882904, + "learning_rate": 8.95422101802677e-06, + "loss": 0.7457, + "step": 7697 + }, + { + "epoch": 0.42368870053387636, + "grad_norm": 0.7355740070343018, + "learning_rate": 8.953955714003414e-06, + "loss": 0.7517, + "step": 7698 + }, + { + "epoch": 0.42374373933623205, + "grad_norm": 0.7103458642959595, + "learning_rate": 8.953690380263186e-06, + "loss": 0.7306, + "step": 7699 + }, + { + "epoch": 0.4237987781385877, + "grad_norm": 0.7453970909118652, + "learning_rate": 8.95342501680808e-06, + "loss": 0.8396, + "step": 7700 + }, + { + "epoch": 0.4238538169409434, + "grad_norm": 0.7132760286331177, + "learning_rate": 8.953159623640088e-06, + "loss": 0.7861, + "step": 7701 + }, + { + "epoch": 0.423908855743299, + "grad_norm": 0.785827100276947, + "learning_rate": 8.952894200761209e-06, + "loss": 0.8681, + "step": 7702 + }, + { + "epoch": 0.4239638945456547, + "grad_norm": 0.7075281143188477, + "learning_rate": 8.952628748173433e-06, + "loss": 0.7257, + "step": 7703 + }, + { + "epoch": 0.42401893334801033, + "grad_norm": 0.8205186724662781, + "learning_rate": 8.952363265878758e-06, + "loss": 0.7361, + "step": 7704 + }, + { + "epoch": 0.424073972150366, + "grad_norm": 0.6517061591148376, + "learning_rate": 8.952097753879181e-06, + "loss": 0.7127, + "step": 7705 + }, + { + "epoch": 0.42412901095272165, + "grad_norm": 0.7252761125564575, + "learning_rate": 8.951832212176692e-06, + "loss": 0.796, + "step": 7706 + }, + { + "epoch": 0.42418404975507734, + "grad_norm": 0.6688609719276428, + "learning_rate": 8.951566640773292e-06, + "loss": 0.7698, + "step": 7707 + }, + { + "epoch": 0.424239088557433, + "grad_norm": 0.7163566946983337, + "learning_rate": 8.951301039670974e-06, + "loss": 0.8069, + "step": 7708 + }, + { + "epoch": 0.42429412735978866, + "grad_norm": 0.7027623057365417, + "learning_rate": 8.951035408871735e-06, + "loss": 0.7061, + "step": 7709 + }, + { + "epoch": 0.4243491661621443, + "grad_norm": 0.9558683037757874, + "learning_rate": 8.950769748377572e-06, + "loss": 0.926, + "step": 7710 + }, + { + "epoch": 0.4244042049645, + "grad_norm": 0.7173893451690674, + "learning_rate": 8.950504058190482e-06, + "loss": 0.7519, + "step": 7711 + }, + { + "epoch": 0.4244592437668556, + "grad_norm": 0.8481128811836243, + "learning_rate": 8.950238338312459e-06, + "loss": 0.7804, + "step": 7712 + }, + { + "epoch": 0.4245142825692113, + "grad_norm": 0.6957072615623474, + "learning_rate": 8.949972588745502e-06, + "loss": 0.611, + "step": 7713 + }, + { + "epoch": 0.42456932137156694, + "grad_norm": 0.7910122871398926, + "learning_rate": 8.94970680949161e-06, + "loss": 0.8435, + "step": 7714 + }, + { + "epoch": 0.42462436017392263, + "grad_norm": 0.8068616986274719, + "learning_rate": 8.949441000552777e-06, + "loss": 0.8658, + "step": 7715 + }, + { + "epoch": 0.42467939897627827, + "grad_norm": 0.718110978603363, + "learning_rate": 8.949175161931006e-06, + "loss": 0.7908, + "step": 7716 + }, + { + "epoch": 0.42473443777863396, + "grad_norm": 0.7329656481742859, + "learning_rate": 8.948909293628289e-06, + "loss": 0.7477, + "step": 7717 + }, + { + "epoch": 0.4247894765809896, + "grad_norm": 0.7046940326690674, + "learning_rate": 8.948643395646625e-06, + "loss": 0.7985, + "step": 7718 + }, + { + "epoch": 0.4248445153833453, + "grad_norm": 0.6699581742286682, + "learning_rate": 8.948377467988017e-06, + "loss": 0.6575, + "step": 7719 + }, + { + "epoch": 0.4248995541857009, + "grad_norm": 0.8055217266082764, + "learning_rate": 8.94811151065446e-06, + "loss": 0.7008, + "step": 7720 + }, + { + "epoch": 0.4249545929880566, + "grad_norm": 0.8374543190002441, + "learning_rate": 8.947845523647954e-06, + "loss": 0.8918, + "step": 7721 + }, + { + "epoch": 0.42500963179041223, + "grad_norm": 0.6974833607673645, + "learning_rate": 8.947579506970498e-06, + "loss": 0.8594, + "step": 7722 + }, + { + "epoch": 0.4250646705927679, + "grad_norm": 0.7466567754745483, + "learning_rate": 8.947313460624091e-06, + "loss": 0.6935, + "step": 7723 + }, + { + "epoch": 0.42511970939512356, + "grad_norm": 0.8118101358413696, + "learning_rate": 8.947047384610734e-06, + "loss": 0.8432, + "step": 7724 + }, + { + "epoch": 0.42517474819747925, + "grad_norm": 0.6885644197463989, + "learning_rate": 8.946781278932422e-06, + "loss": 0.8059, + "step": 7725 + }, + { + "epoch": 0.4252297869998349, + "grad_norm": 0.7257012128829956, + "learning_rate": 8.94651514359116e-06, + "loss": 0.8239, + "step": 7726 + }, + { + "epoch": 0.42528482580219057, + "grad_norm": 1.311591386795044, + "learning_rate": 8.946248978588947e-06, + "loss": 0.8207, + "step": 7727 + }, + { + "epoch": 0.4253398646045462, + "grad_norm": 0.7694151997566223, + "learning_rate": 8.945982783927784e-06, + "loss": 0.8948, + "step": 7728 + }, + { + "epoch": 0.4253949034069019, + "grad_norm": 0.6922980546951294, + "learning_rate": 8.945716559609669e-06, + "loss": 0.7883, + "step": 7729 + }, + { + "epoch": 0.4254499422092575, + "grad_norm": 0.7803757786750793, + "learning_rate": 8.945450305636605e-06, + "loss": 0.9166, + "step": 7730 + }, + { + "epoch": 0.4255049810116132, + "grad_norm": 0.6775311827659607, + "learning_rate": 8.945184022010593e-06, + "loss": 0.6976, + "step": 7731 + }, + { + "epoch": 0.42556001981396885, + "grad_norm": 0.7108052968978882, + "learning_rate": 8.944917708733634e-06, + "loss": 0.7763, + "step": 7732 + }, + { + "epoch": 0.4256150586163245, + "grad_norm": 0.7215770483016968, + "learning_rate": 8.94465136580773e-06, + "loss": 0.7907, + "step": 7733 + }, + { + "epoch": 0.42567009741868017, + "grad_norm": 0.6690788865089417, + "learning_rate": 8.944384993234881e-06, + "loss": 0.8403, + "step": 7734 + }, + { + "epoch": 0.4257251362210358, + "grad_norm": 0.7372478246688843, + "learning_rate": 8.94411859101709e-06, + "loss": 0.7618, + "step": 7735 + }, + { + "epoch": 0.4257801750233915, + "grad_norm": 0.9398306608200073, + "learning_rate": 8.94385215915636e-06, + "loss": 0.9043, + "step": 7736 + }, + { + "epoch": 0.4258352138257471, + "grad_norm": 0.8790311217308044, + "learning_rate": 8.943585697654693e-06, + "loss": 0.9378, + "step": 7737 + }, + { + "epoch": 0.4258902526281028, + "grad_norm": 0.7579166889190674, + "learning_rate": 8.943319206514091e-06, + "loss": 0.7913, + "step": 7738 + }, + { + "epoch": 0.42594529143045845, + "grad_norm": 0.6426860690116882, + "learning_rate": 8.943052685736559e-06, + "loss": 0.744, + "step": 7739 + }, + { + "epoch": 0.42600033023281414, + "grad_norm": 0.688117265701294, + "learning_rate": 8.942786135324098e-06, + "loss": 0.8386, + "step": 7740 + }, + { + "epoch": 0.4260553690351698, + "grad_norm": 0.7178692817687988, + "learning_rate": 8.94251955527871e-06, + "loss": 0.7937, + "step": 7741 + }, + { + "epoch": 0.42611040783752546, + "grad_norm": 0.7980415225028992, + "learning_rate": 8.942252945602403e-06, + "loss": 0.76, + "step": 7742 + }, + { + "epoch": 0.4261654466398811, + "grad_norm": 0.6858333349227905, + "learning_rate": 8.941986306297175e-06, + "loss": 0.8155, + "step": 7743 + }, + { + "epoch": 0.4262204854422368, + "grad_norm": 0.763297975063324, + "learning_rate": 8.941719637365037e-06, + "loss": 0.8003, + "step": 7744 + }, + { + "epoch": 0.4262755242445924, + "grad_norm": 0.661016047000885, + "learning_rate": 8.941452938807986e-06, + "loss": 0.6788, + "step": 7745 + }, + { + "epoch": 0.4263305630469481, + "grad_norm": 0.7168089151382446, + "learning_rate": 8.94118621062803e-06, + "loss": 0.7791, + "step": 7746 + }, + { + "epoch": 0.42638560184930374, + "grad_norm": 0.6879743337631226, + "learning_rate": 8.940919452827174e-06, + "loss": 0.7978, + "step": 7747 + }, + { + "epoch": 0.42644064065165943, + "grad_norm": 0.672298014163971, + "learning_rate": 8.940652665407424e-06, + "loss": 0.7569, + "step": 7748 + }, + { + "epoch": 0.42649567945401506, + "grad_norm": 0.7237414717674255, + "learning_rate": 8.940385848370782e-06, + "loss": 0.6788, + "step": 7749 + }, + { + "epoch": 0.42655071825637075, + "grad_norm": 0.6793895363807678, + "learning_rate": 8.940119001719255e-06, + "loss": 0.749, + "step": 7750 + }, + { + "epoch": 0.4266057570587264, + "grad_norm": 1.1172789335250854, + "learning_rate": 8.939852125454847e-06, + "loss": 0.9017, + "step": 7751 + }, + { + "epoch": 0.4266607958610821, + "grad_norm": 0.7138717770576477, + "learning_rate": 8.939585219579567e-06, + "loss": 0.8586, + "step": 7752 + }, + { + "epoch": 0.4267158346634377, + "grad_norm": 0.8678629398345947, + "learning_rate": 8.939318284095417e-06, + "loss": 0.7333, + "step": 7753 + }, + { + "epoch": 0.4267708734657934, + "grad_norm": 0.7274941802024841, + "learning_rate": 8.939051319004407e-06, + "loss": 0.8426, + "step": 7754 + }, + { + "epoch": 0.42682591226814903, + "grad_norm": 0.6845358610153198, + "learning_rate": 8.93878432430854e-06, + "loss": 0.7731, + "step": 7755 + }, + { + "epoch": 0.4268809510705047, + "grad_norm": 0.7042781710624695, + "learning_rate": 8.938517300009826e-06, + "loss": 0.6703, + "step": 7756 + }, + { + "epoch": 0.42693598987286036, + "grad_norm": 0.7147190570831299, + "learning_rate": 8.93825024611027e-06, + "loss": 0.7977, + "step": 7757 + }, + { + "epoch": 0.42699102867521604, + "grad_norm": 0.6584187150001526, + "learning_rate": 8.93798316261188e-06, + "loss": 0.716, + "step": 7758 + }, + { + "epoch": 0.4270460674775717, + "grad_norm": 0.8061439990997314, + "learning_rate": 8.93771604951666e-06, + "loss": 0.9075, + "step": 7759 + }, + { + "epoch": 0.42710110627992737, + "grad_norm": 0.6741406917572021, + "learning_rate": 8.937448906826622e-06, + "loss": 0.7828, + "step": 7760 + }, + { + "epoch": 0.427156145082283, + "grad_norm": 0.8791692852973938, + "learning_rate": 8.937181734543773e-06, + "loss": 0.7685, + "step": 7761 + }, + { + "epoch": 0.4272111838846387, + "grad_norm": 0.6804112195968628, + "learning_rate": 8.936914532670119e-06, + "loss": 0.7672, + "step": 7762 + }, + { + "epoch": 0.4272662226869943, + "grad_norm": 0.6983451843261719, + "learning_rate": 8.936647301207668e-06, + "loss": 0.8228, + "step": 7763 + }, + { + "epoch": 0.42732126148935, + "grad_norm": 0.8248929977416992, + "learning_rate": 8.936380040158432e-06, + "loss": 0.7628, + "step": 7764 + }, + { + "epoch": 0.42737630029170565, + "grad_norm": 0.8324941992759705, + "learning_rate": 8.936112749524415e-06, + "loss": 0.8125, + "step": 7765 + }, + { + "epoch": 0.42743133909406134, + "grad_norm": 0.7489150762557983, + "learning_rate": 8.935845429307631e-06, + "loss": 0.8766, + "step": 7766 + }, + { + "epoch": 0.42748637789641697, + "grad_norm": 0.7323104739189148, + "learning_rate": 8.935578079510083e-06, + "loss": 0.8607, + "step": 7767 + }, + { + "epoch": 0.42754141669877266, + "grad_norm": 0.6825152635574341, + "learning_rate": 8.935310700133786e-06, + "loss": 0.7817, + "step": 7768 + }, + { + "epoch": 0.4275964555011283, + "grad_norm": 0.8928677439689636, + "learning_rate": 8.935043291180748e-06, + "loss": 0.7621, + "step": 7769 + }, + { + "epoch": 0.427651494303484, + "grad_norm": 0.7071405649185181, + "learning_rate": 8.934775852652975e-06, + "loss": 0.7798, + "step": 7770 + }, + { + "epoch": 0.4277065331058396, + "grad_norm": 0.8225427269935608, + "learning_rate": 8.934508384552481e-06, + "loss": 0.7212, + "step": 7771 + }, + { + "epoch": 0.4277615719081953, + "grad_norm": 0.6931234002113342, + "learning_rate": 8.934240886881276e-06, + "loss": 0.7301, + "step": 7772 + }, + { + "epoch": 0.42781661071055094, + "grad_norm": 0.6901859641075134, + "learning_rate": 8.933973359641369e-06, + "loss": 0.6974, + "step": 7773 + }, + { + "epoch": 0.4278716495129066, + "grad_norm": 0.7736960649490356, + "learning_rate": 8.93370580283477e-06, + "loss": 0.6562, + "step": 7774 + }, + { + "epoch": 0.42792668831526226, + "grad_norm": 0.7363499999046326, + "learning_rate": 8.933438216463495e-06, + "loss": 0.8274, + "step": 7775 + }, + { + "epoch": 0.4279817271176179, + "grad_norm": 0.6855602860450745, + "learning_rate": 8.933170600529548e-06, + "loss": 0.7576, + "step": 7776 + }, + { + "epoch": 0.4280367659199736, + "grad_norm": 0.7641676664352417, + "learning_rate": 8.932902955034945e-06, + "loss": 0.7837, + "step": 7777 + }, + { + "epoch": 0.4280918047223292, + "grad_norm": 0.74812251329422, + "learning_rate": 8.932635279981695e-06, + "loss": 0.8402, + "step": 7778 + }, + { + "epoch": 0.4281468435246849, + "grad_norm": 0.7445259094238281, + "learning_rate": 8.932367575371813e-06, + "loss": 0.862, + "step": 7779 + }, + { + "epoch": 0.42820188232704054, + "grad_norm": 0.8977177739143372, + "learning_rate": 8.932099841207306e-06, + "loss": 0.7735, + "step": 7780 + }, + { + "epoch": 0.42825692112939623, + "grad_norm": 0.74172043800354, + "learning_rate": 8.93183207749019e-06, + "loss": 0.7053, + "step": 7781 + }, + { + "epoch": 0.42831195993175186, + "grad_norm": 0.6670083999633789, + "learning_rate": 8.931564284222479e-06, + "loss": 0.6348, + "step": 7782 + }, + { + "epoch": 0.42836699873410755, + "grad_norm": 0.7575422525405884, + "learning_rate": 8.93129646140618e-06, + "loss": 0.9354, + "step": 7783 + }, + { + "epoch": 0.4284220375364632, + "grad_norm": 0.7436977624893188, + "learning_rate": 8.931028609043311e-06, + "loss": 0.7461, + "step": 7784 + }, + { + "epoch": 0.4284770763388189, + "grad_norm": 0.7383070588111877, + "learning_rate": 8.930760727135882e-06, + "loss": 0.7629, + "step": 7785 + }, + { + "epoch": 0.4285321151411745, + "grad_norm": 0.6926067471504211, + "learning_rate": 8.93049281568591e-06, + "loss": 0.6788, + "step": 7786 + }, + { + "epoch": 0.4285871539435302, + "grad_norm": 0.7680530548095703, + "learning_rate": 8.930224874695404e-06, + "loss": 0.722, + "step": 7787 + }, + { + "epoch": 0.42864219274588583, + "grad_norm": 0.9880867004394531, + "learning_rate": 8.92995690416638e-06, + "loss": 0.833, + "step": 7788 + }, + { + "epoch": 0.4286972315482415, + "grad_norm": 0.7915430068969727, + "learning_rate": 8.929688904100853e-06, + "loss": 0.7643, + "step": 7789 + }, + { + "epoch": 0.42875227035059715, + "grad_norm": 0.6972275376319885, + "learning_rate": 8.929420874500836e-06, + "loss": 0.7697, + "step": 7790 + }, + { + "epoch": 0.42880730915295284, + "grad_norm": 0.9583331346511841, + "learning_rate": 8.929152815368343e-06, + "loss": 0.7591, + "step": 7791 + }, + { + "epoch": 0.4288623479553085, + "grad_norm": 0.7254299521446228, + "learning_rate": 8.928884726705388e-06, + "loss": 0.7913, + "step": 7792 + }, + { + "epoch": 0.42891738675766417, + "grad_norm": 0.7925865054130554, + "learning_rate": 8.928616608513989e-06, + "loss": 0.8248, + "step": 7793 + }, + { + "epoch": 0.4289724255600198, + "grad_norm": 0.9367457628250122, + "learning_rate": 8.928348460796157e-06, + "loss": 0.7767, + "step": 7794 + }, + { + "epoch": 0.4290274643623755, + "grad_norm": 0.8511868119239807, + "learning_rate": 8.928080283553912e-06, + "loss": 0.841, + "step": 7795 + }, + { + "epoch": 0.4290825031647311, + "grad_norm": 0.8518061637878418, + "learning_rate": 8.927812076789267e-06, + "loss": 0.7907, + "step": 7796 + }, + { + "epoch": 0.4291375419670868, + "grad_norm": 0.7208365797996521, + "learning_rate": 8.927543840504236e-06, + "loss": 0.7344, + "step": 7797 + }, + { + "epoch": 0.42919258076944244, + "grad_norm": 0.7541850209236145, + "learning_rate": 8.927275574700838e-06, + "loss": 0.7724, + "step": 7798 + }, + { + "epoch": 0.42924761957179813, + "grad_norm": 0.7378629446029663, + "learning_rate": 8.927007279381087e-06, + "loss": 0.7614, + "step": 7799 + }, + { + "epoch": 0.42930265837415377, + "grad_norm": 0.7358561158180237, + "learning_rate": 8.926738954547001e-06, + "loss": 0.7288, + "step": 7800 + }, + { + "epoch": 0.42935769717650946, + "grad_norm": 0.7385967969894409, + "learning_rate": 8.926470600200597e-06, + "loss": 0.7562, + "step": 7801 + }, + { + "epoch": 0.4294127359788651, + "grad_norm": 0.6904877424240112, + "learning_rate": 8.92620221634389e-06, + "loss": 0.6507, + "step": 7802 + }, + { + "epoch": 0.4294677747812208, + "grad_norm": 0.7205148935317993, + "learning_rate": 8.925933802978898e-06, + "loss": 0.7683, + "step": 7803 + }, + { + "epoch": 0.4295228135835764, + "grad_norm": 0.6830344200134277, + "learning_rate": 8.925665360107639e-06, + "loss": 0.6886, + "step": 7804 + }, + { + "epoch": 0.4295778523859321, + "grad_norm": 0.7648812532424927, + "learning_rate": 8.92539688773213e-06, + "loss": 0.7559, + "step": 7805 + }, + { + "epoch": 0.42963289118828774, + "grad_norm": 0.7819112539291382, + "learning_rate": 8.925128385854389e-06, + "loss": 0.7443, + "step": 7806 + }, + { + "epoch": 0.4296879299906434, + "grad_norm": 0.6742433309555054, + "learning_rate": 8.924859854476433e-06, + "loss": 0.7191, + "step": 7807 + }, + { + "epoch": 0.42974296879299906, + "grad_norm": 0.7368177771568298, + "learning_rate": 8.924591293600281e-06, + "loss": 0.6946, + "step": 7808 + }, + { + "epoch": 0.42979800759535475, + "grad_norm": 0.663112998008728, + "learning_rate": 8.924322703227953e-06, + "loss": 0.7405, + "step": 7809 + }, + { + "epoch": 0.4298530463977104, + "grad_norm": 0.6735410690307617, + "learning_rate": 8.924054083361465e-06, + "loss": 0.7982, + "step": 7810 + }, + { + "epoch": 0.42990808520006607, + "grad_norm": 0.7770369648933411, + "learning_rate": 8.923785434002834e-06, + "loss": 0.9179, + "step": 7811 + }, + { + "epoch": 0.4299631240024217, + "grad_norm": 0.7464482188224792, + "learning_rate": 8.923516755154085e-06, + "loss": 0.8514, + "step": 7812 + }, + { + "epoch": 0.4300181628047774, + "grad_norm": 0.9249551892280579, + "learning_rate": 8.923248046817235e-06, + "loss": 0.8287, + "step": 7813 + }, + { + "epoch": 0.430073201607133, + "grad_norm": 0.7071338891983032, + "learning_rate": 8.922979308994302e-06, + "loss": 0.7509, + "step": 7814 + }, + { + "epoch": 0.4301282404094887, + "grad_norm": 0.6910794377326965, + "learning_rate": 8.922710541687305e-06, + "loss": 0.7373, + "step": 7815 + }, + { + "epoch": 0.43018327921184435, + "grad_norm": 0.8424028158187866, + "learning_rate": 8.922441744898267e-06, + "loss": 0.741, + "step": 7816 + }, + { + "epoch": 0.43023831801420004, + "grad_norm": 0.8162125945091248, + "learning_rate": 8.922172918629208e-06, + "loss": 0.8044, + "step": 7817 + }, + { + "epoch": 0.43029335681655567, + "grad_norm": 0.7415170669555664, + "learning_rate": 8.921904062882145e-06, + "loss": 0.7427, + "step": 7818 + }, + { + "epoch": 0.4303483956189113, + "grad_norm": 1.1357808113098145, + "learning_rate": 8.921635177659103e-06, + "loss": 0.7802, + "step": 7819 + }, + { + "epoch": 0.430403434421267, + "grad_norm": 0.7039839625358582, + "learning_rate": 8.9213662629621e-06, + "loss": 0.7368, + "step": 7820 + }, + { + "epoch": 0.43045847322362263, + "grad_norm": 0.721077024936676, + "learning_rate": 8.921097318793157e-06, + "loss": 0.6575, + "step": 7821 + }, + { + "epoch": 0.4305135120259783, + "grad_norm": 0.7823510766029358, + "learning_rate": 8.920828345154297e-06, + "loss": 0.7499, + "step": 7822 + }, + { + "epoch": 0.43056855082833395, + "grad_norm": 0.6400569677352905, + "learning_rate": 8.920559342047539e-06, + "loss": 0.7091, + "step": 7823 + }, + { + "epoch": 0.43062358963068964, + "grad_norm": 0.8974951505661011, + "learning_rate": 8.920290309474908e-06, + "loss": 0.7228, + "step": 7824 + }, + { + "epoch": 0.4306786284330453, + "grad_norm": 0.8176010847091675, + "learning_rate": 8.920021247438426e-06, + "loss": 0.8852, + "step": 7825 + }, + { + "epoch": 0.43073366723540096, + "grad_norm": 0.7591422200202942, + "learning_rate": 8.919752155940112e-06, + "loss": 0.8382, + "step": 7826 + }, + { + "epoch": 0.4307887060377566, + "grad_norm": 0.7089776396751404, + "learning_rate": 8.919483034981988e-06, + "loss": 0.7188, + "step": 7827 + }, + { + "epoch": 0.4308437448401123, + "grad_norm": 0.7328840494155884, + "learning_rate": 8.919213884566081e-06, + "loss": 0.7609, + "step": 7828 + }, + { + "epoch": 0.4308987836424679, + "grad_norm": 0.6473509669303894, + "learning_rate": 8.918944704694411e-06, + "loss": 0.7027, + "step": 7829 + }, + { + "epoch": 0.4309538224448236, + "grad_norm": 0.6585624814033508, + "learning_rate": 8.918675495369003e-06, + "loss": 0.7133, + "step": 7830 + }, + { + "epoch": 0.43100886124717924, + "grad_norm": 0.7232397794723511, + "learning_rate": 8.918406256591876e-06, + "loss": 0.7458, + "step": 7831 + }, + { + "epoch": 0.43106390004953493, + "grad_norm": 0.8752645254135132, + "learning_rate": 8.918136988365059e-06, + "loss": 0.671, + "step": 7832 + }, + { + "epoch": 0.43111893885189057, + "grad_norm": 0.7890885472297668, + "learning_rate": 8.917867690690573e-06, + "loss": 0.7674, + "step": 7833 + }, + { + "epoch": 0.43117397765424625, + "grad_norm": 0.6725128293037415, + "learning_rate": 8.917598363570441e-06, + "loss": 0.7373, + "step": 7834 + }, + { + "epoch": 0.4312290164566019, + "grad_norm": 0.808897852897644, + "learning_rate": 8.917329007006688e-06, + "loss": 0.8397, + "step": 7835 + }, + { + "epoch": 0.4312840552589576, + "grad_norm": 0.7268605828285217, + "learning_rate": 8.91705962100134e-06, + "loss": 0.7957, + "step": 7836 + }, + { + "epoch": 0.4313390940613132, + "grad_norm": 0.7336069345474243, + "learning_rate": 8.916790205556421e-06, + "loss": 0.746, + "step": 7837 + }, + { + "epoch": 0.4313941328636689, + "grad_norm": 0.7380902171134949, + "learning_rate": 8.916520760673955e-06, + "loss": 0.674, + "step": 7838 + }, + { + "epoch": 0.43144917166602453, + "grad_norm": 0.8041831851005554, + "learning_rate": 8.916251286355967e-06, + "loss": 0.8392, + "step": 7839 + }, + { + "epoch": 0.4315042104683802, + "grad_norm": 0.6745681166648865, + "learning_rate": 8.915981782604481e-06, + "loss": 0.7676, + "step": 7840 + }, + { + "epoch": 0.43155924927073586, + "grad_norm": 0.6572039127349854, + "learning_rate": 8.915712249421526e-06, + "loss": 0.7471, + "step": 7841 + }, + { + "epoch": 0.43161428807309155, + "grad_norm": 0.7250062227249146, + "learning_rate": 8.915442686809124e-06, + "loss": 0.8566, + "step": 7842 + }, + { + "epoch": 0.4316693268754472, + "grad_norm": 0.7008941769599915, + "learning_rate": 8.915173094769306e-06, + "loss": 0.7876, + "step": 7843 + }, + { + "epoch": 0.43172436567780287, + "grad_norm": 0.7078337073326111, + "learning_rate": 8.914903473304093e-06, + "loss": 0.756, + "step": 7844 + }, + { + "epoch": 0.4317794044801585, + "grad_norm": 0.7822949886322021, + "learning_rate": 8.914633822415513e-06, + "loss": 0.9423, + "step": 7845 + }, + { + "epoch": 0.4318344432825142, + "grad_norm": 0.6707580089569092, + "learning_rate": 8.914364142105593e-06, + "loss": 0.639, + "step": 7846 + }, + { + "epoch": 0.4318894820848698, + "grad_norm": 0.7868423461914062, + "learning_rate": 8.914094432376362e-06, + "loss": 0.7768, + "step": 7847 + }, + { + "epoch": 0.4319445208872255, + "grad_norm": 0.6147592067718506, + "learning_rate": 8.913824693229845e-06, + "loss": 0.6693, + "step": 7848 + }, + { + "epoch": 0.43199955968958115, + "grad_norm": 0.6901249885559082, + "learning_rate": 8.913554924668067e-06, + "loss": 0.7779, + "step": 7849 + }, + { + "epoch": 0.43205459849193684, + "grad_norm": 0.7062137126922607, + "learning_rate": 8.913285126693058e-06, + "loss": 0.7951, + "step": 7850 + }, + { + "epoch": 0.43210963729429247, + "grad_norm": 0.6363390684127808, + "learning_rate": 8.913015299306846e-06, + "loss": 0.6723, + "step": 7851 + }, + { + "epoch": 0.43216467609664816, + "grad_norm": 0.7168677449226379, + "learning_rate": 8.912745442511459e-06, + "loss": 0.7442, + "step": 7852 + }, + { + "epoch": 0.4322197148990038, + "grad_norm": 0.7347995042800903, + "learning_rate": 8.912475556308925e-06, + "loss": 0.8361, + "step": 7853 + }, + { + "epoch": 0.4322747537013595, + "grad_norm": 0.683777391910553, + "learning_rate": 8.91220564070127e-06, + "loss": 0.7583, + "step": 7854 + }, + { + "epoch": 0.4323297925037151, + "grad_norm": 0.7436330914497375, + "learning_rate": 8.911935695690527e-06, + "loss": 0.8414, + "step": 7855 + }, + { + "epoch": 0.4323848313060708, + "grad_norm": 0.7748109102249146, + "learning_rate": 8.911665721278721e-06, + "loss": 0.7812, + "step": 7856 + }, + { + "epoch": 0.43243987010842644, + "grad_norm": 0.7984411120414734, + "learning_rate": 8.911395717467883e-06, + "loss": 0.6845, + "step": 7857 + }, + { + "epoch": 0.4324949089107821, + "grad_norm": 0.680144727230072, + "learning_rate": 8.911125684260042e-06, + "loss": 0.7156, + "step": 7858 + }, + { + "epoch": 0.43254994771313776, + "grad_norm": 0.7738325595855713, + "learning_rate": 8.910855621657228e-06, + "loss": 0.7295, + "step": 7859 + }, + { + "epoch": 0.43260498651549345, + "grad_norm": 0.7276971340179443, + "learning_rate": 8.910585529661469e-06, + "loss": 0.7982, + "step": 7860 + }, + { + "epoch": 0.4326600253178491, + "grad_norm": 0.7655037641525269, + "learning_rate": 8.910315408274796e-06, + "loss": 0.8416, + "step": 7861 + }, + { + "epoch": 0.4327150641202047, + "grad_norm": 0.7220892906188965, + "learning_rate": 8.910045257499238e-06, + "loss": 0.8002, + "step": 7862 + }, + { + "epoch": 0.4327701029225604, + "grad_norm": 0.6255655884742737, + "learning_rate": 8.90977507733683e-06, + "loss": 0.6477, + "step": 7863 + }, + { + "epoch": 0.43282514172491604, + "grad_norm": 0.649472713470459, + "learning_rate": 8.909504867789594e-06, + "loss": 0.6838, + "step": 7864 + }, + { + "epoch": 0.43288018052727173, + "grad_norm": 0.6915234923362732, + "learning_rate": 8.909234628859568e-06, + "loss": 0.7146, + "step": 7865 + }, + { + "epoch": 0.43293521932962736, + "grad_norm": 0.7120145559310913, + "learning_rate": 8.908964360548783e-06, + "loss": 0.7782, + "step": 7866 + }, + { + "epoch": 0.43299025813198305, + "grad_norm": 0.8125410079956055, + "learning_rate": 8.908694062859267e-06, + "loss": 0.7514, + "step": 7867 + }, + { + "epoch": 0.4330452969343387, + "grad_norm": 0.6821436882019043, + "learning_rate": 8.908423735793053e-06, + "loss": 0.8074, + "step": 7868 + }, + { + "epoch": 0.4331003357366944, + "grad_norm": 0.8079590201377869, + "learning_rate": 8.908153379352171e-06, + "loss": 0.7932, + "step": 7869 + }, + { + "epoch": 0.43315537453905, + "grad_norm": 0.676013708114624, + "learning_rate": 8.907882993538655e-06, + "loss": 0.6611, + "step": 7870 + }, + { + "epoch": 0.4332104133414057, + "grad_norm": 0.706624448299408, + "learning_rate": 8.907612578354537e-06, + "loss": 0.8241, + "step": 7871 + }, + { + "epoch": 0.43326545214376133, + "grad_norm": 0.6533300876617432, + "learning_rate": 8.907342133801848e-06, + "loss": 0.6969, + "step": 7872 + }, + { + "epoch": 0.433320490946117, + "grad_norm": 0.6778282523155212, + "learning_rate": 8.907071659882622e-06, + "loss": 0.6877, + "step": 7873 + }, + { + "epoch": 0.43337552974847265, + "grad_norm": 0.7068879008293152, + "learning_rate": 8.906801156598892e-06, + "loss": 0.7912, + "step": 7874 + }, + { + "epoch": 0.43343056855082834, + "grad_norm": 0.6620263457298279, + "learning_rate": 8.90653062395269e-06, + "loss": 0.7317, + "step": 7875 + }, + { + "epoch": 0.433485607353184, + "grad_norm": 0.7084807753562927, + "learning_rate": 8.906260061946049e-06, + "loss": 0.7268, + "step": 7876 + }, + { + "epoch": 0.43354064615553967, + "grad_norm": 0.7899147272109985, + "learning_rate": 8.905989470581003e-06, + "loss": 0.8258, + "step": 7877 + }, + { + "epoch": 0.4335956849578953, + "grad_norm": 0.6657128930091858, + "learning_rate": 8.905718849859585e-06, + "loss": 0.6564, + "step": 7878 + }, + { + "epoch": 0.433650723760251, + "grad_norm": 0.8737723231315613, + "learning_rate": 8.905448199783831e-06, + "loss": 0.8646, + "step": 7879 + }, + { + "epoch": 0.4337057625626066, + "grad_norm": 0.7517673969268799, + "learning_rate": 8.905177520355775e-06, + "loss": 0.7658, + "step": 7880 + }, + { + "epoch": 0.4337608013649623, + "grad_norm": 0.6724270582199097, + "learning_rate": 8.904906811577447e-06, + "loss": 0.7509, + "step": 7881 + }, + { + "epoch": 0.43381584016731795, + "grad_norm": 0.6490511894226074, + "learning_rate": 8.904636073450885e-06, + "loss": 0.7282, + "step": 7882 + }, + { + "epoch": 0.43387087896967363, + "grad_norm": 0.73885178565979, + "learning_rate": 8.904365305978126e-06, + "loss": 0.7575, + "step": 7883 + }, + { + "epoch": 0.43392591777202927, + "grad_norm": 0.6823462843894958, + "learning_rate": 8.9040945091612e-06, + "loss": 0.7566, + "step": 7884 + }, + { + "epoch": 0.43398095657438496, + "grad_norm": 0.6705971956253052, + "learning_rate": 8.903823683002146e-06, + "loss": 0.7726, + "step": 7885 + }, + { + "epoch": 0.4340359953767406, + "grad_norm": 0.6898428201675415, + "learning_rate": 8.903552827502998e-06, + "loss": 0.7545, + "step": 7886 + }, + { + "epoch": 0.4340910341790963, + "grad_norm": 0.810357928276062, + "learning_rate": 8.90328194266579e-06, + "loss": 0.8883, + "step": 7887 + }, + { + "epoch": 0.4341460729814519, + "grad_norm": 0.6505162119865417, + "learning_rate": 8.903011028492563e-06, + "loss": 0.7205, + "step": 7888 + }, + { + "epoch": 0.4342011117838076, + "grad_norm": 0.8401693105697632, + "learning_rate": 8.902740084985348e-06, + "loss": 0.8105, + "step": 7889 + }, + { + "epoch": 0.43425615058616324, + "grad_norm": 0.7151880860328674, + "learning_rate": 8.902469112146183e-06, + "loss": 0.7748, + "step": 7890 + }, + { + "epoch": 0.4343111893885189, + "grad_norm": 0.7257007956504822, + "learning_rate": 8.902198109977107e-06, + "loss": 0.7818, + "step": 7891 + }, + { + "epoch": 0.43436622819087456, + "grad_norm": 0.786691427230835, + "learning_rate": 8.901927078480153e-06, + "loss": 0.8527, + "step": 7892 + }, + { + "epoch": 0.43442126699323025, + "grad_norm": 0.7420910596847534, + "learning_rate": 8.901656017657358e-06, + "loss": 0.7087, + "step": 7893 + }, + { + "epoch": 0.4344763057955859, + "grad_norm": 0.6713958978652954, + "learning_rate": 8.901384927510763e-06, + "loss": 0.7366, + "step": 7894 + }, + { + "epoch": 0.43453134459794157, + "grad_norm": 1.0276658535003662, + "learning_rate": 8.901113808042402e-06, + "loss": 0.7462, + "step": 7895 + }, + { + "epoch": 0.4345863834002972, + "grad_norm": 0.7207444906234741, + "learning_rate": 8.900842659254314e-06, + "loss": 0.6777, + "step": 7896 + }, + { + "epoch": 0.4346414222026529, + "grad_norm": 0.7581979036331177, + "learning_rate": 8.900571481148538e-06, + "loss": 0.8081, + "step": 7897 + }, + { + "epoch": 0.4346964610050085, + "grad_norm": 0.9224075675010681, + "learning_rate": 8.90030027372711e-06, + "loss": 0.892, + "step": 7898 + }, + { + "epoch": 0.4347514998073642, + "grad_norm": 0.6844260096549988, + "learning_rate": 8.900029036992069e-06, + "loss": 0.8063, + "step": 7899 + }, + { + "epoch": 0.43480653860971985, + "grad_norm": 0.7008691430091858, + "learning_rate": 8.899757770945453e-06, + "loss": 0.6998, + "step": 7900 + }, + { + "epoch": 0.43486157741207554, + "grad_norm": 0.7311949729919434, + "learning_rate": 8.899486475589303e-06, + "loss": 0.7724, + "step": 7901 + }, + { + "epoch": 0.4349166162144312, + "grad_norm": 0.7441468238830566, + "learning_rate": 8.899215150925656e-06, + "loss": 0.7728, + "step": 7902 + }, + { + "epoch": 0.43497165501678686, + "grad_norm": 0.7405179142951965, + "learning_rate": 8.89894379695655e-06, + "loss": 0.8267, + "step": 7903 + }, + { + "epoch": 0.4350266938191425, + "grad_norm": 0.6967620253562927, + "learning_rate": 8.898672413684029e-06, + "loss": 0.7284, + "step": 7904 + }, + { + "epoch": 0.43508173262149813, + "grad_norm": 0.8979219794273376, + "learning_rate": 8.898401001110127e-06, + "loss": 0.8267, + "step": 7905 + }, + { + "epoch": 0.4351367714238538, + "grad_norm": 0.7905356884002686, + "learning_rate": 8.898129559236888e-06, + "loss": 0.8011, + "step": 7906 + }, + { + "epoch": 0.43519181022620945, + "grad_norm": 0.6740859150886536, + "learning_rate": 8.897858088066351e-06, + "loss": 0.6597, + "step": 7907 + }, + { + "epoch": 0.43524684902856514, + "grad_norm": 0.7451572418212891, + "learning_rate": 8.897586587600555e-06, + "loss": 0.7466, + "step": 7908 + }, + { + "epoch": 0.4353018878309208, + "grad_norm": 0.7726565003395081, + "learning_rate": 8.897315057841542e-06, + "loss": 0.7873, + "step": 7909 + }, + { + "epoch": 0.43535692663327646, + "grad_norm": 0.8348171710968018, + "learning_rate": 8.897043498791354e-06, + "loss": 0.7583, + "step": 7910 + }, + { + "epoch": 0.4354119654356321, + "grad_norm": 0.6714087724685669, + "learning_rate": 8.896771910452027e-06, + "loss": 0.7909, + "step": 7911 + }, + { + "epoch": 0.4354670042379878, + "grad_norm": 0.7397969365119934, + "learning_rate": 8.896500292825607e-06, + "loss": 0.7734, + "step": 7912 + }, + { + "epoch": 0.4355220430403434, + "grad_norm": 0.6806391477584839, + "learning_rate": 8.896228645914133e-06, + "loss": 0.7898, + "step": 7913 + }, + { + "epoch": 0.4355770818426991, + "grad_norm": 0.7135224342346191, + "learning_rate": 8.89595696971965e-06, + "loss": 0.7453, + "step": 7914 + }, + { + "epoch": 0.43563212064505474, + "grad_norm": 0.8275992274284363, + "learning_rate": 8.895685264244195e-06, + "loss": 0.7326, + "step": 7915 + }, + { + "epoch": 0.43568715944741043, + "grad_norm": 0.7254159450531006, + "learning_rate": 8.895413529489813e-06, + "loss": 0.7523, + "step": 7916 + }, + { + "epoch": 0.43574219824976607, + "grad_norm": 0.8060647249221802, + "learning_rate": 8.895141765458546e-06, + "loss": 0.7878, + "step": 7917 + }, + { + "epoch": 0.43579723705212176, + "grad_norm": 0.7007316946983337, + "learning_rate": 8.894869972152435e-06, + "loss": 0.7837, + "step": 7918 + }, + { + "epoch": 0.4358522758544774, + "grad_norm": 0.6874841451644897, + "learning_rate": 8.894598149573524e-06, + "loss": 0.7773, + "step": 7919 + }, + { + "epoch": 0.4359073146568331, + "grad_norm": 0.7557696104049683, + "learning_rate": 8.894326297723856e-06, + "loss": 0.6905, + "step": 7920 + }, + { + "epoch": 0.4359623534591887, + "grad_norm": 0.7589512467384338, + "learning_rate": 8.894054416605475e-06, + "loss": 0.8292, + "step": 7921 + }, + { + "epoch": 0.4360173922615444, + "grad_norm": 0.9062818884849548, + "learning_rate": 8.893782506220424e-06, + "loss": 0.9149, + "step": 7922 + }, + { + "epoch": 0.43607243106390003, + "grad_norm": 0.7553420662879944, + "learning_rate": 8.893510566570744e-06, + "loss": 0.7256, + "step": 7923 + }, + { + "epoch": 0.4361274698662557, + "grad_norm": 0.7130489349365234, + "learning_rate": 8.89323859765848e-06, + "loss": 0.7375, + "step": 7924 + }, + { + "epoch": 0.43618250866861136, + "grad_norm": 0.6234793066978455, + "learning_rate": 8.89296659948568e-06, + "loss": 0.716, + "step": 7925 + }, + { + "epoch": 0.43623754747096705, + "grad_norm": 0.7527539134025574, + "learning_rate": 8.892694572054383e-06, + "loss": 0.7884, + "step": 7926 + }, + { + "epoch": 0.4362925862733227, + "grad_norm": 0.7677647471427917, + "learning_rate": 8.892422515366636e-06, + "loss": 0.7136, + "step": 7927 + }, + { + "epoch": 0.43634762507567837, + "grad_norm": 0.7212143540382385, + "learning_rate": 8.892150429424484e-06, + "loss": 0.8113, + "step": 7928 + }, + { + "epoch": 0.436402663878034, + "grad_norm": 0.6735568046569824, + "learning_rate": 8.89187831422997e-06, + "loss": 0.6472, + "step": 7929 + }, + { + "epoch": 0.4364577026803897, + "grad_norm": 0.7120702862739563, + "learning_rate": 8.891606169785141e-06, + "loss": 0.8032, + "step": 7930 + }, + { + "epoch": 0.4365127414827453, + "grad_norm": 0.679499089717865, + "learning_rate": 8.891333996092041e-06, + "loss": 0.7366, + "step": 7931 + }, + { + "epoch": 0.436567780285101, + "grad_norm": 0.7774114012718201, + "learning_rate": 8.891061793152718e-06, + "loss": 0.7917, + "step": 7932 + }, + { + "epoch": 0.43662281908745665, + "grad_norm": 0.6951174139976501, + "learning_rate": 8.890789560969216e-06, + "loss": 0.7518, + "step": 7933 + }, + { + "epoch": 0.43667785788981234, + "grad_norm": 0.7645227909088135, + "learning_rate": 8.89051729954358e-06, + "loss": 0.7787, + "step": 7934 + }, + { + "epoch": 0.43673289669216797, + "grad_norm": 0.7127084732055664, + "learning_rate": 8.890245008877857e-06, + "loss": 0.8137, + "step": 7935 + }, + { + "epoch": 0.43678793549452366, + "grad_norm": 0.7541413903236389, + "learning_rate": 8.889972688974095e-06, + "loss": 0.776, + "step": 7936 + }, + { + "epoch": 0.4368429742968793, + "grad_norm": 0.690963625907898, + "learning_rate": 8.889700339834339e-06, + "loss": 0.7691, + "step": 7937 + }, + { + "epoch": 0.436898013099235, + "grad_norm": 0.750221848487854, + "learning_rate": 8.889427961460636e-06, + "loss": 0.7831, + "step": 7938 + }, + { + "epoch": 0.4369530519015906, + "grad_norm": 0.7255545854568481, + "learning_rate": 8.889155553855035e-06, + "loss": 0.7831, + "step": 7939 + }, + { + "epoch": 0.4370080907039463, + "grad_norm": 0.7187026143074036, + "learning_rate": 8.88888311701958e-06, + "loss": 0.792, + "step": 7940 + }, + { + "epoch": 0.43706312950630194, + "grad_norm": 0.8313350081443787, + "learning_rate": 8.888610650956322e-06, + "loss": 0.706, + "step": 7941 + }, + { + "epoch": 0.43711816830865763, + "grad_norm": 0.8083454370498657, + "learning_rate": 8.888338155667307e-06, + "loss": 0.7857, + "step": 7942 + }, + { + "epoch": 0.43717320711101326, + "grad_norm": 0.8200840353965759, + "learning_rate": 8.888065631154583e-06, + "loss": 0.8601, + "step": 7943 + }, + { + "epoch": 0.43722824591336895, + "grad_norm": 0.7503816485404968, + "learning_rate": 8.887793077420198e-06, + "loss": 0.7744, + "step": 7944 + }, + { + "epoch": 0.4372832847157246, + "grad_norm": 0.7466493248939514, + "learning_rate": 8.887520494466202e-06, + "loss": 0.7818, + "step": 7945 + }, + { + "epoch": 0.4373383235180803, + "grad_norm": 0.728118360042572, + "learning_rate": 8.887247882294641e-06, + "loss": 0.7157, + "step": 7946 + }, + { + "epoch": 0.4373933623204359, + "grad_norm": 0.9199670553207397, + "learning_rate": 8.886975240907568e-06, + "loss": 0.8283, + "step": 7947 + }, + { + "epoch": 0.43744840112279154, + "grad_norm": 0.735584557056427, + "learning_rate": 8.886702570307027e-06, + "loss": 0.6588, + "step": 7948 + }, + { + "epoch": 0.43750343992514723, + "grad_norm": 0.8619036674499512, + "learning_rate": 8.886429870495072e-06, + "loss": 0.7269, + "step": 7949 + }, + { + "epoch": 0.43755847872750286, + "grad_norm": 0.7304830551147461, + "learning_rate": 8.886157141473747e-06, + "loss": 0.6725, + "step": 7950 + }, + { + "epoch": 0.43761351752985855, + "grad_norm": 0.7669086456298828, + "learning_rate": 8.885884383245109e-06, + "loss": 0.6957, + "step": 7951 + }, + { + "epoch": 0.4376685563322142, + "grad_norm": 0.7558299899101257, + "learning_rate": 8.885611595811203e-06, + "loss": 0.8159, + "step": 7952 + }, + { + "epoch": 0.4377235951345699, + "grad_norm": 0.7661786079406738, + "learning_rate": 8.88533877917408e-06, + "loss": 0.764, + "step": 7953 + }, + { + "epoch": 0.4377786339369255, + "grad_norm": 0.7461101412773132, + "learning_rate": 8.88506593333579e-06, + "loss": 0.7544, + "step": 7954 + }, + { + "epoch": 0.4378336727392812, + "grad_norm": 0.7989180088043213, + "learning_rate": 8.884793058298387e-06, + "loss": 0.6913, + "step": 7955 + }, + { + "epoch": 0.43788871154163683, + "grad_norm": 0.7964022755622864, + "learning_rate": 8.884520154063917e-06, + "loss": 0.7339, + "step": 7956 + }, + { + "epoch": 0.4379437503439925, + "grad_norm": 0.7278034687042236, + "learning_rate": 8.884247220634433e-06, + "loss": 0.8477, + "step": 7957 + }, + { + "epoch": 0.43799878914634816, + "grad_norm": 0.7294753789901733, + "learning_rate": 8.883974258011988e-06, + "loss": 0.8412, + "step": 7958 + }, + { + "epoch": 0.43805382794870384, + "grad_norm": 0.665734589099884, + "learning_rate": 8.88370126619863e-06, + "loss": 0.7838, + "step": 7959 + }, + { + "epoch": 0.4381088667510595, + "grad_norm": 0.6984216570854187, + "learning_rate": 8.883428245196414e-06, + "loss": 0.7657, + "step": 7960 + }, + { + "epoch": 0.43816390555341517, + "grad_norm": 0.8048402070999146, + "learning_rate": 8.883155195007393e-06, + "loss": 0.7553, + "step": 7961 + }, + { + "epoch": 0.4382189443557708, + "grad_norm": 0.7145794630050659, + "learning_rate": 8.882882115633616e-06, + "loss": 0.6583, + "step": 7962 + }, + { + "epoch": 0.4382739831581265, + "grad_norm": 0.7073546648025513, + "learning_rate": 8.882609007077135e-06, + "loss": 0.7869, + "step": 7963 + }, + { + "epoch": 0.4383290219604821, + "grad_norm": 0.8300859928131104, + "learning_rate": 8.882335869340004e-06, + "loss": 0.773, + "step": 7964 + }, + { + "epoch": 0.4383840607628378, + "grad_norm": 0.8343188762664795, + "learning_rate": 8.882062702424276e-06, + "loss": 0.6743, + "step": 7965 + }, + { + "epoch": 0.43843909956519345, + "grad_norm": 0.7106530666351318, + "learning_rate": 8.881789506332007e-06, + "loss": 0.7414, + "step": 7966 + }, + { + "epoch": 0.43849413836754914, + "grad_norm": 0.7015630602836609, + "learning_rate": 8.881516281065244e-06, + "loss": 0.7434, + "step": 7967 + }, + { + "epoch": 0.43854917716990477, + "grad_norm": 0.8106673955917358, + "learning_rate": 8.881243026626044e-06, + "loss": 0.7741, + "step": 7968 + }, + { + "epoch": 0.43860421597226046, + "grad_norm": 0.8181495070457458, + "learning_rate": 8.88096974301646e-06, + "loss": 0.8046, + "step": 7969 + }, + { + "epoch": 0.4386592547746161, + "grad_norm": 0.7767857313156128, + "learning_rate": 8.880696430238546e-06, + "loss": 0.8586, + "step": 7970 + }, + { + "epoch": 0.4387142935769718, + "grad_norm": 0.7257522940635681, + "learning_rate": 8.880423088294359e-06, + "loss": 0.7799, + "step": 7971 + }, + { + "epoch": 0.4387693323793274, + "grad_norm": 0.6896021366119385, + "learning_rate": 8.880149717185948e-06, + "loss": 0.8178, + "step": 7972 + }, + { + "epoch": 0.4388243711816831, + "grad_norm": 0.7646406292915344, + "learning_rate": 8.879876316915372e-06, + "loss": 0.8754, + "step": 7973 + }, + { + "epoch": 0.43887940998403874, + "grad_norm": 0.8043848872184753, + "learning_rate": 8.879602887484684e-06, + "loss": 0.8562, + "step": 7974 + }, + { + "epoch": 0.4389344487863944, + "grad_norm": 0.6727305054664612, + "learning_rate": 8.879329428895937e-06, + "loss": 0.6168, + "step": 7975 + }, + { + "epoch": 0.43898948758875006, + "grad_norm": 0.7634731531143188, + "learning_rate": 8.87905594115119e-06, + "loss": 0.857, + "step": 7976 + }, + { + "epoch": 0.43904452639110575, + "grad_norm": 0.6544492244720459, + "learning_rate": 8.878782424252497e-06, + "loss": 0.6302, + "step": 7977 + }, + { + "epoch": 0.4390995651934614, + "grad_norm": 0.8126636743545532, + "learning_rate": 8.878508878201915e-06, + "loss": 0.7823, + "step": 7978 + }, + { + "epoch": 0.43915460399581707, + "grad_norm": 0.7235779166221619, + "learning_rate": 8.878235303001497e-06, + "loss": 0.7527, + "step": 7979 + }, + { + "epoch": 0.4392096427981727, + "grad_norm": 0.6961055397987366, + "learning_rate": 8.8779616986533e-06, + "loss": 0.7383, + "step": 7980 + }, + { + "epoch": 0.4392646816005284, + "grad_norm": 0.7684490084648132, + "learning_rate": 8.877688065159382e-06, + "loss": 0.8009, + "step": 7981 + }, + { + "epoch": 0.43931972040288403, + "grad_norm": 0.7897803783416748, + "learning_rate": 8.877414402521797e-06, + "loss": 0.7561, + "step": 7982 + }, + { + "epoch": 0.4393747592052397, + "grad_norm": 0.7877688407897949, + "learning_rate": 8.877140710742606e-06, + "loss": 0.7949, + "step": 7983 + }, + { + "epoch": 0.43942979800759535, + "grad_norm": 0.8341611623764038, + "learning_rate": 8.876866989823862e-06, + "loss": 0.7585, + "step": 7984 + }, + { + "epoch": 0.43948483680995104, + "grad_norm": 0.7663636207580566, + "learning_rate": 8.876593239767622e-06, + "loss": 0.771, + "step": 7985 + }, + { + "epoch": 0.4395398756123067, + "grad_norm": 0.6824129223823547, + "learning_rate": 8.876319460575946e-06, + "loss": 0.7852, + "step": 7986 + }, + { + "epoch": 0.43959491441466236, + "grad_norm": 0.6533854007720947, + "learning_rate": 8.876045652250891e-06, + "loss": 0.723, + "step": 7987 + }, + { + "epoch": 0.439649953217018, + "grad_norm": 0.7174259424209595, + "learning_rate": 8.875771814794515e-06, + "loss": 0.749, + "step": 7988 + }, + { + "epoch": 0.4397049920193737, + "grad_norm": 0.8585928678512573, + "learning_rate": 8.875497948208875e-06, + "loss": 0.6727, + "step": 7989 + }, + { + "epoch": 0.4397600308217293, + "grad_norm": 0.7558062672615051, + "learning_rate": 8.875224052496029e-06, + "loss": 0.7929, + "step": 7990 + }, + { + "epoch": 0.43981506962408495, + "grad_norm": 0.7063853144645691, + "learning_rate": 8.874950127658037e-06, + "loss": 0.7397, + "step": 7991 + }, + { + "epoch": 0.43987010842644064, + "grad_norm": 0.7165526747703552, + "learning_rate": 8.874676173696956e-06, + "loss": 0.7678, + "step": 7992 + }, + { + "epoch": 0.4399251472287963, + "grad_norm": 0.7657830715179443, + "learning_rate": 8.874402190614847e-06, + "loss": 0.8318, + "step": 7993 + }, + { + "epoch": 0.43998018603115197, + "grad_norm": 0.7776834964752197, + "learning_rate": 8.874128178413769e-06, + "loss": 0.8589, + "step": 7994 + }, + { + "epoch": 0.4400352248335076, + "grad_norm": 0.6805633306503296, + "learning_rate": 8.873854137095778e-06, + "loss": 0.7009, + "step": 7995 + }, + { + "epoch": 0.4400902636358633, + "grad_norm": 0.6962490677833557, + "learning_rate": 8.87358006666294e-06, + "loss": 0.7896, + "step": 7996 + }, + { + "epoch": 0.4401453024382189, + "grad_norm": 0.611610472202301, + "learning_rate": 8.873305967117307e-06, + "loss": 0.5993, + "step": 7997 + }, + { + "epoch": 0.4402003412405746, + "grad_norm": 0.7442964911460876, + "learning_rate": 8.873031838460946e-06, + "loss": 0.8277, + "step": 7998 + }, + { + "epoch": 0.44025538004293024, + "grad_norm": 0.6858734488487244, + "learning_rate": 8.872757680695914e-06, + "loss": 0.8064, + "step": 7999 + }, + { + "epoch": 0.44031041884528593, + "grad_norm": 0.6654849052429199, + "learning_rate": 8.872483493824273e-06, + "loss": 0.7408, + "step": 8000 + }, + { + "epoch": 0.44036545764764157, + "grad_norm": 0.8241575956344604, + "learning_rate": 8.87220927784808e-06, + "loss": 0.8819, + "step": 8001 + }, + { + "epoch": 0.44042049644999726, + "grad_norm": 0.7078573107719421, + "learning_rate": 8.8719350327694e-06, + "loss": 0.7709, + "step": 8002 + }, + { + "epoch": 0.4404755352523529, + "grad_norm": 0.7369210720062256, + "learning_rate": 8.871660758590292e-06, + "loss": 0.7867, + "step": 8003 + }, + { + "epoch": 0.4405305740547086, + "grad_norm": 0.7206673622131348, + "learning_rate": 8.87138645531282e-06, + "loss": 0.8697, + "step": 8004 + }, + { + "epoch": 0.4405856128570642, + "grad_norm": 0.8370183706283569, + "learning_rate": 8.871112122939041e-06, + "loss": 0.7201, + "step": 8005 + }, + { + "epoch": 0.4406406516594199, + "grad_norm": 0.8015196323394775, + "learning_rate": 8.870837761471023e-06, + "loss": 0.774, + "step": 8006 + }, + { + "epoch": 0.44069569046177554, + "grad_norm": 0.730185329914093, + "learning_rate": 8.870563370910821e-06, + "loss": 0.7153, + "step": 8007 + }, + { + "epoch": 0.4407507292641312, + "grad_norm": 0.6719930768013, + "learning_rate": 8.870288951260503e-06, + "loss": 0.7949, + "step": 8008 + }, + { + "epoch": 0.44080576806648686, + "grad_norm": 0.7614291906356812, + "learning_rate": 8.870014502522128e-06, + "loss": 0.7143, + "step": 8009 + }, + { + "epoch": 0.44086080686884255, + "grad_norm": 0.7438056468963623, + "learning_rate": 8.86974002469776e-06, + "loss": 0.6859, + "step": 8010 + }, + { + "epoch": 0.4409158456711982, + "grad_norm": 0.759903073310852, + "learning_rate": 8.869465517789463e-06, + "loss": 0.8095, + "step": 8011 + }, + { + "epoch": 0.44097088447355387, + "grad_norm": 0.7622823119163513, + "learning_rate": 8.869190981799298e-06, + "loss": 0.786, + "step": 8012 + }, + { + "epoch": 0.4410259232759095, + "grad_norm": 0.677003800868988, + "learning_rate": 8.86891641672933e-06, + "loss": 0.7074, + "step": 8013 + }, + { + "epoch": 0.4410809620782652, + "grad_norm": 0.9258451461791992, + "learning_rate": 8.86864182258162e-06, + "loss": 0.7218, + "step": 8014 + }, + { + "epoch": 0.4411360008806208, + "grad_norm": 0.7027828693389893, + "learning_rate": 8.868367199358236e-06, + "loss": 0.7654, + "step": 8015 + }, + { + "epoch": 0.4411910396829765, + "grad_norm": 0.8279967308044434, + "learning_rate": 8.868092547061239e-06, + "loss": 0.8969, + "step": 8016 + }, + { + "epoch": 0.44124607848533215, + "grad_norm": 0.7366079688072205, + "learning_rate": 8.867817865692693e-06, + "loss": 0.8421, + "step": 8017 + }, + { + "epoch": 0.44130111728768784, + "grad_norm": 0.7548787593841553, + "learning_rate": 8.867543155254665e-06, + "loss": 0.79, + "step": 8018 + }, + { + "epoch": 0.44135615609004347, + "grad_norm": 0.7558487057685852, + "learning_rate": 8.867268415749215e-06, + "loss": 0.8461, + "step": 8019 + }, + { + "epoch": 0.44141119489239916, + "grad_norm": 0.6413403153419495, + "learning_rate": 8.866993647178413e-06, + "loss": 0.6811, + "step": 8020 + }, + { + "epoch": 0.4414662336947548, + "grad_norm": 0.9251089692115784, + "learning_rate": 8.86671884954432e-06, + "loss": 0.868, + "step": 8021 + }, + { + "epoch": 0.4415212724971105, + "grad_norm": 0.7920099496841431, + "learning_rate": 8.866444022849006e-06, + "loss": 0.8131, + "step": 8022 + }, + { + "epoch": 0.4415763112994661, + "grad_norm": 0.8738380670547485, + "learning_rate": 8.866169167094532e-06, + "loss": 0.857, + "step": 8023 + }, + { + "epoch": 0.4416313501018218, + "grad_norm": 0.7181336283683777, + "learning_rate": 8.865894282282965e-06, + "loss": 0.7869, + "step": 8024 + }, + { + "epoch": 0.44168638890417744, + "grad_norm": 0.8003776669502258, + "learning_rate": 8.865619368416373e-06, + "loss": 0.8874, + "step": 8025 + }, + { + "epoch": 0.44174142770653313, + "grad_norm": 0.7186623215675354, + "learning_rate": 8.86534442549682e-06, + "loss": 0.7931, + "step": 8026 + }, + { + "epoch": 0.44179646650888876, + "grad_norm": 0.7006831765174866, + "learning_rate": 8.865069453526371e-06, + "loss": 0.7046, + "step": 8027 + }, + { + "epoch": 0.44185150531124445, + "grad_norm": 0.7394786477088928, + "learning_rate": 8.864794452507097e-06, + "loss": 0.685, + "step": 8028 + }, + { + "epoch": 0.4419065441136001, + "grad_norm": 0.7512097358703613, + "learning_rate": 8.864519422441062e-06, + "loss": 0.8047, + "step": 8029 + }, + { + "epoch": 0.4419615829159558, + "grad_norm": 0.6866902709007263, + "learning_rate": 8.864244363330333e-06, + "loss": 0.7099, + "step": 8030 + }, + { + "epoch": 0.4420166217183114, + "grad_norm": 0.7316723465919495, + "learning_rate": 8.863969275176978e-06, + "loss": 0.7767, + "step": 8031 + }, + { + "epoch": 0.4420716605206671, + "grad_norm": 0.7103593349456787, + "learning_rate": 8.863694157983064e-06, + "loss": 0.7832, + "step": 8032 + }, + { + "epoch": 0.44212669932302273, + "grad_norm": 0.6922749876976013, + "learning_rate": 8.863419011750659e-06, + "loss": 0.7833, + "step": 8033 + }, + { + "epoch": 0.44218173812537837, + "grad_norm": 0.7989425659179688, + "learning_rate": 8.863143836481831e-06, + "loss": 0.8651, + "step": 8034 + }, + { + "epoch": 0.44223677692773405, + "grad_norm": 0.6765440702438354, + "learning_rate": 8.862868632178648e-06, + "loss": 0.7858, + "step": 8035 + }, + { + "epoch": 0.4422918157300897, + "grad_norm": 0.670767068862915, + "learning_rate": 8.862593398843178e-06, + "loss": 0.6789, + "step": 8036 + }, + { + "epoch": 0.4423468545324454, + "grad_norm": 0.7556853294372559, + "learning_rate": 8.86231813647749e-06, + "loss": 0.8036, + "step": 8037 + }, + { + "epoch": 0.442401893334801, + "grad_norm": 0.788690984249115, + "learning_rate": 8.862042845083654e-06, + "loss": 0.8355, + "step": 8038 + }, + { + "epoch": 0.4424569321371567, + "grad_norm": 0.8439056873321533, + "learning_rate": 8.861767524663736e-06, + "loss": 0.7327, + "step": 8039 + }, + { + "epoch": 0.44251197093951233, + "grad_norm": 0.7101821899414062, + "learning_rate": 8.861492175219808e-06, + "loss": 0.8303, + "step": 8040 + }, + { + "epoch": 0.442567009741868, + "grad_norm": 0.741680383682251, + "learning_rate": 8.861216796753937e-06, + "loss": 0.7377, + "step": 8041 + }, + { + "epoch": 0.44262204854422366, + "grad_norm": 0.7588099837303162, + "learning_rate": 8.860941389268196e-06, + "loss": 0.8217, + "step": 8042 + }, + { + "epoch": 0.44267708734657935, + "grad_norm": 0.7654829025268555, + "learning_rate": 8.860665952764654e-06, + "loss": 0.8416, + "step": 8043 + }, + { + "epoch": 0.442732126148935, + "grad_norm": 0.7025987505912781, + "learning_rate": 8.860390487245378e-06, + "loss": 0.7312, + "step": 8044 + }, + { + "epoch": 0.44278716495129067, + "grad_norm": 0.7206251621246338, + "learning_rate": 8.860114992712441e-06, + "loss": 0.7522, + "step": 8045 + }, + { + "epoch": 0.4428422037536463, + "grad_norm": 0.7041749954223633, + "learning_rate": 8.859839469167912e-06, + "loss": 0.746, + "step": 8046 + }, + { + "epoch": 0.442897242556002, + "grad_norm": 0.6941862106323242, + "learning_rate": 8.859563916613864e-06, + "loss": 0.7692, + "step": 8047 + }, + { + "epoch": 0.4429522813583576, + "grad_norm": 0.6897740364074707, + "learning_rate": 8.859288335052367e-06, + "loss": 0.7963, + "step": 8048 + }, + { + "epoch": 0.4430073201607133, + "grad_norm": 0.6744545698165894, + "learning_rate": 8.859012724485492e-06, + "loss": 0.7647, + "step": 8049 + }, + { + "epoch": 0.44306235896306895, + "grad_norm": 0.7899364829063416, + "learning_rate": 8.858737084915309e-06, + "loss": 0.8373, + "step": 8050 + }, + { + "epoch": 0.44311739776542464, + "grad_norm": 0.806016743183136, + "learning_rate": 8.85846141634389e-06, + "loss": 0.7871, + "step": 8051 + }, + { + "epoch": 0.44317243656778027, + "grad_norm": 0.7444993257522583, + "learning_rate": 8.85818571877331e-06, + "loss": 0.8099, + "step": 8052 + }, + { + "epoch": 0.44322747537013596, + "grad_norm": 0.772735059261322, + "learning_rate": 8.85790999220564e-06, + "loss": 0.7113, + "step": 8053 + }, + { + "epoch": 0.4432825141724916, + "grad_norm": 0.7743984460830688, + "learning_rate": 8.85763423664295e-06, + "loss": 0.8935, + "step": 8054 + }, + { + "epoch": 0.4433375529748473, + "grad_norm": 0.6751214265823364, + "learning_rate": 8.857358452087313e-06, + "loss": 0.6769, + "step": 8055 + }, + { + "epoch": 0.4433925917772029, + "grad_norm": 0.6921005845069885, + "learning_rate": 8.857082638540803e-06, + "loss": 0.7071, + "step": 8056 + }, + { + "epoch": 0.4434476305795586, + "grad_norm": 0.7884092330932617, + "learning_rate": 8.856806796005491e-06, + "loss": 0.7919, + "step": 8057 + }, + { + "epoch": 0.44350266938191424, + "grad_norm": 0.6522679924964905, + "learning_rate": 8.856530924483452e-06, + "loss": 0.7449, + "step": 8058 + }, + { + "epoch": 0.4435577081842699, + "grad_norm": 0.7172590494155884, + "learning_rate": 8.85625502397676e-06, + "loss": 0.7306, + "step": 8059 + }, + { + "epoch": 0.44361274698662556, + "grad_norm": 0.698658287525177, + "learning_rate": 8.855979094487488e-06, + "loss": 0.803, + "step": 8060 + }, + { + "epoch": 0.44366778578898125, + "grad_norm": 0.685589075088501, + "learning_rate": 8.855703136017708e-06, + "loss": 0.763, + "step": 8061 + }, + { + "epoch": 0.4437228245913369, + "grad_norm": 0.8259774446487427, + "learning_rate": 8.855427148569495e-06, + "loss": 0.811, + "step": 8062 + }, + { + "epoch": 0.4437778633936926, + "grad_norm": 0.6976660490036011, + "learning_rate": 8.855151132144926e-06, + "loss": 0.7345, + "step": 8063 + }, + { + "epoch": 0.4438329021960482, + "grad_norm": 0.7696738243103027, + "learning_rate": 8.854875086746071e-06, + "loss": 0.823, + "step": 8064 + }, + { + "epoch": 0.4438879409984039, + "grad_norm": 0.6627930998802185, + "learning_rate": 8.854599012375006e-06, + "loss": 0.7455, + "step": 8065 + }, + { + "epoch": 0.44394297980075953, + "grad_norm": 0.7492700815200806, + "learning_rate": 8.854322909033809e-06, + "loss": 0.8195, + "step": 8066 + }, + { + "epoch": 0.4439980186031152, + "grad_norm": 0.8335888981819153, + "learning_rate": 8.85404677672455e-06, + "loss": 0.7683, + "step": 8067 + }, + { + "epoch": 0.44405305740547085, + "grad_norm": 0.7448242902755737, + "learning_rate": 8.853770615449309e-06, + "loss": 0.8352, + "step": 8068 + }, + { + "epoch": 0.44410809620782654, + "grad_norm": 0.700616180896759, + "learning_rate": 8.853494425210158e-06, + "loss": 0.7892, + "step": 8069 + }, + { + "epoch": 0.4441631350101822, + "grad_norm": 0.6959284543991089, + "learning_rate": 8.853218206009176e-06, + "loss": 0.6944, + "step": 8070 + }, + { + "epoch": 0.44421817381253786, + "grad_norm": 0.7507375478744507, + "learning_rate": 8.852941957848438e-06, + "loss": 0.8921, + "step": 8071 + }, + { + "epoch": 0.4442732126148935, + "grad_norm": 0.7843918204307556, + "learning_rate": 8.852665680730019e-06, + "loss": 0.816, + "step": 8072 + }, + { + "epoch": 0.4443282514172492, + "grad_norm": 0.8702702522277832, + "learning_rate": 8.852389374655995e-06, + "loss": 0.8191, + "step": 8073 + }, + { + "epoch": 0.4443832902196048, + "grad_norm": 0.6784317493438721, + "learning_rate": 8.852113039628445e-06, + "loss": 0.7726, + "step": 8074 + }, + { + "epoch": 0.4444383290219605, + "grad_norm": 0.724530041217804, + "learning_rate": 8.851836675649443e-06, + "loss": 0.8214, + "step": 8075 + }, + { + "epoch": 0.44449336782431614, + "grad_norm": 0.9814287424087524, + "learning_rate": 8.851560282721067e-06, + "loss": 0.8368, + "step": 8076 + }, + { + "epoch": 0.4445484066266718, + "grad_norm": 0.6606815457344055, + "learning_rate": 8.851283860845398e-06, + "loss": 0.7772, + "step": 8077 + }, + { + "epoch": 0.44460344542902747, + "grad_norm": 0.6910951137542725, + "learning_rate": 8.851007410024507e-06, + "loss": 0.7007, + "step": 8078 + }, + { + "epoch": 0.4446584842313831, + "grad_norm": 0.6764300465583801, + "learning_rate": 8.850730930260479e-06, + "loss": 0.7265, + "step": 8079 + }, + { + "epoch": 0.4447135230337388, + "grad_norm": 0.669622004032135, + "learning_rate": 8.850454421555386e-06, + "loss": 0.7551, + "step": 8080 + }, + { + "epoch": 0.4447685618360944, + "grad_norm": 0.7068240642547607, + "learning_rate": 8.850177883911307e-06, + "loss": 0.8358, + "step": 8081 + }, + { + "epoch": 0.4448236006384501, + "grad_norm": 0.7100360989570618, + "learning_rate": 8.849901317330324e-06, + "loss": 0.7074, + "step": 8082 + }, + { + "epoch": 0.44487863944080575, + "grad_norm": 0.7510328888893127, + "learning_rate": 8.849624721814511e-06, + "loss": 0.6654, + "step": 8083 + }, + { + "epoch": 0.44493367824316143, + "grad_norm": 0.8106432557106018, + "learning_rate": 8.849348097365951e-06, + "loss": 0.6944, + "step": 8084 + }, + { + "epoch": 0.44498871704551707, + "grad_norm": 0.6852346062660217, + "learning_rate": 8.84907144398672e-06, + "loss": 0.7203, + "step": 8085 + }, + { + "epoch": 0.44504375584787276, + "grad_norm": 0.8495593667030334, + "learning_rate": 8.848794761678898e-06, + "loss": 0.7918, + "step": 8086 + }, + { + "epoch": 0.4450987946502284, + "grad_norm": 0.7110981941223145, + "learning_rate": 8.848518050444565e-06, + "loss": 0.8176, + "step": 8087 + }, + { + "epoch": 0.4451538334525841, + "grad_norm": 0.7740922570228577, + "learning_rate": 8.8482413102858e-06, + "loss": 0.7573, + "step": 8088 + }, + { + "epoch": 0.4452088722549397, + "grad_norm": 0.9645134806632996, + "learning_rate": 8.847964541204685e-06, + "loss": 0.7842, + "step": 8089 + }, + { + "epoch": 0.4452639110572954, + "grad_norm": 0.767621636390686, + "learning_rate": 8.847687743203299e-06, + "loss": 0.8182, + "step": 8090 + }, + { + "epoch": 0.44531894985965104, + "grad_norm": 0.6842975616455078, + "learning_rate": 8.84741091628372e-06, + "loss": 0.7795, + "step": 8091 + }, + { + "epoch": 0.4453739886620067, + "grad_norm": 0.768644392490387, + "learning_rate": 8.847134060448032e-06, + "loss": 0.7363, + "step": 8092 + }, + { + "epoch": 0.44542902746436236, + "grad_norm": 0.6813824772834778, + "learning_rate": 8.846857175698314e-06, + "loss": 0.7601, + "step": 8093 + }, + { + "epoch": 0.44548406626671805, + "grad_norm": 0.8608306646347046, + "learning_rate": 8.846580262036645e-06, + "loss": 0.8205, + "step": 8094 + }, + { + "epoch": 0.4455391050690737, + "grad_norm": 0.6917694807052612, + "learning_rate": 8.84630331946511e-06, + "loss": 0.7207, + "step": 8095 + }, + { + "epoch": 0.44559414387142937, + "grad_norm": 0.6777203679084778, + "learning_rate": 8.84602634798579e-06, + "loss": 0.6939, + "step": 8096 + }, + { + "epoch": 0.445649182673785, + "grad_norm": 0.7249894142150879, + "learning_rate": 8.845749347600764e-06, + "loss": 0.7918, + "step": 8097 + }, + { + "epoch": 0.4457042214761407, + "grad_norm": 0.7446995973587036, + "learning_rate": 8.845472318312116e-06, + "loss": 0.7379, + "step": 8098 + }, + { + "epoch": 0.4457592602784963, + "grad_norm": 0.8245479464530945, + "learning_rate": 8.845195260121927e-06, + "loss": 0.8532, + "step": 8099 + }, + { + "epoch": 0.445814299080852, + "grad_norm": 0.7160329818725586, + "learning_rate": 8.84491817303228e-06, + "loss": 0.7042, + "step": 8100 + }, + { + "epoch": 0.44586933788320765, + "grad_norm": 0.8056026101112366, + "learning_rate": 8.844641057045257e-06, + "loss": 0.8581, + "step": 8101 + }, + { + "epoch": 0.44592437668556334, + "grad_norm": 0.7257886528968811, + "learning_rate": 8.84436391216294e-06, + "loss": 0.7297, + "step": 8102 + }, + { + "epoch": 0.445979415487919, + "grad_norm": 0.7400404810905457, + "learning_rate": 8.844086738387415e-06, + "loss": 0.7703, + "step": 8103 + }, + { + "epoch": 0.44603445429027466, + "grad_norm": 0.665271520614624, + "learning_rate": 8.843809535720763e-06, + "loss": 0.7769, + "step": 8104 + }, + { + "epoch": 0.4460894930926303, + "grad_norm": 0.7041043639183044, + "learning_rate": 8.843532304165066e-06, + "loss": 0.7995, + "step": 8105 + }, + { + "epoch": 0.446144531894986, + "grad_norm": 0.8517841100692749, + "learning_rate": 8.84325504372241e-06, + "loss": 0.8239, + "step": 8106 + }, + { + "epoch": 0.4461995706973416, + "grad_norm": 0.7045741677284241, + "learning_rate": 8.842977754394877e-06, + "loss": 0.7982, + "step": 8107 + }, + { + "epoch": 0.4462546094996973, + "grad_norm": 0.7056185007095337, + "learning_rate": 8.842700436184552e-06, + "loss": 0.8003, + "step": 8108 + }, + { + "epoch": 0.44630964830205294, + "grad_norm": 0.9042232632637024, + "learning_rate": 8.842423089093519e-06, + "loss": 0.7534, + "step": 8109 + }, + { + "epoch": 0.44636468710440863, + "grad_norm": 0.8584854602813721, + "learning_rate": 8.842145713123863e-06, + "loss": 0.7759, + "step": 8110 + }, + { + "epoch": 0.44641972590676426, + "grad_norm": 0.7333530187606812, + "learning_rate": 8.841868308277668e-06, + "loss": 0.7218, + "step": 8111 + }, + { + "epoch": 0.44647476470911995, + "grad_norm": 0.7866941094398499, + "learning_rate": 8.84159087455702e-06, + "loss": 0.7016, + "step": 8112 + }, + { + "epoch": 0.4465298035114756, + "grad_norm": 0.7785252928733826, + "learning_rate": 8.841313411964001e-06, + "loss": 0.8232, + "step": 8113 + }, + { + "epoch": 0.4465848423138313, + "grad_norm": 0.7060698866844177, + "learning_rate": 8.841035920500702e-06, + "loss": 0.6987, + "step": 8114 + }, + { + "epoch": 0.4466398811161869, + "grad_norm": 0.7211717963218689, + "learning_rate": 8.840758400169203e-06, + "loss": 0.8604, + "step": 8115 + }, + { + "epoch": 0.4466949199185426, + "grad_norm": 0.979678213596344, + "learning_rate": 8.840480850971593e-06, + "loss": 0.9028, + "step": 8116 + }, + { + "epoch": 0.44674995872089823, + "grad_norm": 0.6595104336738586, + "learning_rate": 8.840203272909957e-06, + "loss": 0.6899, + "step": 8117 + }, + { + "epoch": 0.4468049975232539, + "grad_norm": 0.6392405033111572, + "learning_rate": 8.83992566598638e-06, + "loss": 0.7729, + "step": 8118 + }, + { + "epoch": 0.44686003632560956, + "grad_norm": 1.1084040403366089, + "learning_rate": 8.839648030202949e-06, + "loss": 0.822, + "step": 8119 + }, + { + "epoch": 0.4469150751279652, + "grad_norm": 0.7024106383323669, + "learning_rate": 8.839370365561754e-06, + "loss": 0.7615, + "step": 8120 + }, + { + "epoch": 0.4469701139303209, + "grad_norm": 0.7204060554504395, + "learning_rate": 8.839092672064878e-06, + "loss": 0.7527, + "step": 8121 + }, + { + "epoch": 0.4470251527326765, + "grad_norm": 0.7307723760604858, + "learning_rate": 8.838814949714407e-06, + "loss": 0.8139, + "step": 8122 + }, + { + "epoch": 0.4470801915350322, + "grad_norm": 0.824034571647644, + "learning_rate": 8.838537198512434e-06, + "loss": 0.8299, + "step": 8123 + }, + { + "epoch": 0.44713523033738783, + "grad_norm": 0.6603747606277466, + "learning_rate": 8.83825941846104e-06, + "loss": 0.6762, + "step": 8124 + }, + { + "epoch": 0.4471902691397435, + "grad_norm": 0.7403088808059692, + "learning_rate": 8.837981609562316e-06, + "loss": 0.716, + "step": 8125 + }, + { + "epoch": 0.44724530794209916, + "grad_norm": 0.742173969745636, + "learning_rate": 8.837703771818351e-06, + "loss": 0.7672, + "step": 8126 + }, + { + "epoch": 0.44730034674445485, + "grad_norm": 0.7158839106559753, + "learning_rate": 8.837425905231232e-06, + "loss": 0.6941, + "step": 8127 + }, + { + "epoch": 0.4473553855468105, + "grad_norm": 0.7659464478492737, + "learning_rate": 8.837148009803044e-06, + "loss": 0.7293, + "step": 8128 + }, + { + "epoch": 0.44741042434916617, + "grad_norm": 0.8681113719940186, + "learning_rate": 8.836870085535882e-06, + "loss": 0.8647, + "step": 8129 + }, + { + "epoch": 0.4474654631515218, + "grad_norm": 0.7117272615432739, + "learning_rate": 8.83659213243183e-06, + "loss": 0.8035, + "step": 8130 + }, + { + "epoch": 0.4475205019538775, + "grad_norm": 0.8220957517623901, + "learning_rate": 8.836314150492978e-06, + "loss": 0.6978, + "step": 8131 + }, + { + "epoch": 0.4475755407562331, + "grad_norm": 0.7045003175735474, + "learning_rate": 8.836036139721418e-06, + "loss": 0.747, + "step": 8132 + }, + { + "epoch": 0.4476305795585888, + "grad_norm": 0.6833191514015198, + "learning_rate": 8.835758100119235e-06, + "loss": 0.7604, + "step": 8133 + }, + { + "epoch": 0.44768561836094445, + "grad_norm": 0.7305697798728943, + "learning_rate": 8.835480031688521e-06, + "loss": 0.7301, + "step": 8134 + }, + { + "epoch": 0.44774065716330014, + "grad_norm": 0.7266964912414551, + "learning_rate": 8.835201934431366e-06, + "loss": 0.7675, + "step": 8135 + }, + { + "epoch": 0.44779569596565577, + "grad_norm": 0.6822015047073364, + "learning_rate": 8.834923808349861e-06, + "loss": 0.8226, + "step": 8136 + }, + { + "epoch": 0.44785073476801146, + "grad_norm": 0.7443515062332153, + "learning_rate": 8.834645653446095e-06, + "loss": 0.9289, + "step": 8137 + }, + { + "epoch": 0.4479057735703671, + "grad_norm": 0.7337210178375244, + "learning_rate": 8.834367469722158e-06, + "loss": 0.7758, + "step": 8138 + }, + { + "epoch": 0.4479608123727228, + "grad_norm": 0.6794925332069397, + "learning_rate": 8.83408925718014e-06, + "loss": 0.8426, + "step": 8139 + }, + { + "epoch": 0.4480158511750784, + "grad_norm": 0.7808265089988708, + "learning_rate": 8.833811015822135e-06, + "loss": 0.8464, + "step": 8140 + }, + { + "epoch": 0.4480708899774341, + "grad_norm": 0.7837018370628357, + "learning_rate": 8.833532745650234e-06, + "loss": 0.8722, + "step": 8141 + }, + { + "epoch": 0.44812592877978974, + "grad_norm": 0.9218140840530396, + "learning_rate": 8.833254446666526e-06, + "loss": 0.7981, + "step": 8142 + }, + { + "epoch": 0.44818096758214543, + "grad_norm": 0.7980387806892395, + "learning_rate": 8.832976118873103e-06, + "loss": 0.7705, + "step": 8143 + }, + { + "epoch": 0.44823600638450106, + "grad_norm": 0.7354007363319397, + "learning_rate": 8.832697762272057e-06, + "loss": 0.8286, + "step": 8144 + }, + { + "epoch": 0.44829104518685675, + "grad_norm": 0.7006223201751709, + "learning_rate": 8.832419376865482e-06, + "loss": 0.7107, + "step": 8145 + }, + { + "epoch": 0.4483460839892124, + "grad_norm": 0.7838212847709656, + "learning_rate": 8.83214096265547e-06, + "loss": 0.7676, + "step": 8146 + }, + { + "epoch": 0.4484011227915681, + "grad_norm": 0.7768213748931885, + "learning_rate": 8.83186251964411e-06, + "loss": 0.8689, + "step": 8147 + }, + { + "epoch": 0.4484561615939237, + "grad_norm": 0.7451630234718323, + "learning_rate": 8.831584047833497e-06, + "loss": 0.8625, + "step": 8148 + }, + { + "epoch": 0.4485112003962794, + "grad_norm": 0.7573269605636597, + "learning_rate": 8.831305547225725e-06, + "loss": 0.7357, + "step": 8149 + }, + { + "epoch": 0.44856623919863503, + "grad_norm": 0.6884848475456238, + "learning_rate": 8.831027017822886e-06, + "loss": 0.7306, + "step": 8150 + }, + { + "epoch": 0.4486212780009907, + "grad_norm": 0.7715907096862793, + "learning_rate": 8.830748459627073e-06, + "loss": 0.8311, + "step": 8151 + }, + { + "epoch": 0.44867631680334635, + "grad_norm": 0.6919859647750854, + "learning_rate": 8.83046987264038e-06, + "loss": 0.845, + "step": 8152 + }, + { + "epoch": 0.44873135560570204, + "grad_norm": 0.7066411972045898, + "learning_rate": 8.830191256864902e-06, + "loss": 0.7554, + "step": 8153 + }, + { + "epoch": 0.4487863944080577, + "grad_norm": 0.754196047782898, + "learning_rate": 8.829912612302729e-06, + "loss": 0.7396, + "step": 8154 + }, + { + "epoch": 0.44884143321041337, + "grad_norm": 0.7612286806106567, + "learning_rate": 8.82963393895596e-06, + "loss": 0.8154, + "step": 8155 + }, + { + "epoch": 0.448896472012769, + "grad_norm": 0.8576892614364624, + "learning_rate": 8.829355236826688e-06, + "loss": 0.7395, + "step": 8156 + }, + { + "epoch": 0.4489515108151247, + "grad_norm": 0.6813738346099854, + "learning_rate": 8.829076505917005e-06, + "loss": 0.7661, + "step": 8157 + }, + { + "epoch": 0.4490065496174803, + "grad_norm": 0.7453964948654175, + "learning_rate": 8.828797746229009e-06, + "loss": 0.8221, + "step": 8158 + }, + { + "epoch": 0.449061588419836, + "grad_norm": 0.7546728849411011, + "learning_rate": 8.828518957764795e-06, + "loss": 0.7717, + "step": 8159 + }, + { + "epoch": 0.44911662722219164, + "grad_norm": 0.8270652890205383, + "learning_rate": 8.828240140526456e-06, + "loss": 0.7582, + "step": 8160 + }, + { + "epoch": 0.44917166602454733, + "grad_norm": 0.8188696503639221, + "learning_rate": 8.827961294516089e-06, + "loss": 0.8841, + "step": 8161 + }, + { + "epoch": 0.44922670482690297, + "grad_norm": 0.9101365208625793, + "learning_rate": 8.82768241973579e-06, + "loss": 0.7099, + "step": 8162 + }, + { + "epoch": 0.4492817436292586, + "grad_norm": 0.6749762892723083, + "learning_rate": 8.827403516187656e-06, + "loss": 0.7766, + "step": 8163 + }, + { + "epoch": 0.4493367824316143, + "grad_norm": 1.1351534128189087, + "learning_rate": 8.827124583873781e-06, + "loss": 0.7536, + "step": 8164 + }, + { + "epoch": 0.4493918212339699, + "grad_norm": 0.8729487061500549, + "learning_rate": 8.826845622796261e-06, + "loss": 0.8613, + "step": 8165 + }, + { + "epoch": 0.4494468600363256, + "grad_norm": 0.7495871782302856, + "learning_rate": 8.826566632957193e-06, + "loss": 0.8365, + "step": 8166 + }, + { + "epoch": 0.44950189883868125, + "grad_norm": 0.6414516568183899, + "learning_rate": 8.826287614358677e-06, + "loss": 0.6574, + "step": 8167 + }, + { + "epoch": 0.44955693764103694, + "grad_norm": 0.6954017281532288, + "learning_rate": 8.826008567002805e-06, + "loss": 0.7857, + "step": 8168 + }, + { + "epoch": 0.44961197644339257, + "grad_norm": 0.7199459075927734, + "learning_rate": 8.825729490891678e-06, + "loss": 0.8585, + "step": 8169 + }, + { + "epoch": 0.44966701524574826, + "grad_norm": 0.8245406746864319, + "learning_rate": 8.825450386027392e-06, + "loss": 0.7238, + "step": 8170 + }, + { + "epoch": 0.4497220540481039, + "grad_norm": 0.6348667740821838, + "learning_rate": 8.825171252412044e-06, + "loss": 0.6991, + "step": 8171 + }, + { + "epoch": 0.4497770928504596, + "grad_norm": 0.6304741501808167, + "learning_rate": 8.824892090047734e-06, + "loss": 0.7101, + "step": 8172 + }, + { + "epoch": 0.4498321316528152, + "grad_norm": 0.7088820338249207, + "learning_rate": 8.82461289893656e-06, + "loss": 0.8217, + "step": 8173 + }, + { + "epoch": 0.4498871704551709, + "grad_norm": 0.7570851445198059, + "learning_rate": 8.824333679080617e-06, + "loss": 0.8029, + "step": 8174 + }, + { + "epoch": 0.44994220925752654, + "grad_norm": 0.7544378042221069, + "learning_rate": 8.824054430482007e-06, + "loss": 0.777, + "step": 8175 + }, + { + "epoch": 0.4499972480598822, + "grad_norm": 0.8226260542869568, + "learning_rate": 8.823775153142827e-06, + "loss": 0.8391, + "step": 8176 + }, + { + "epoch": 0.45005228686223786, + "grad_norm": 0.6861422061920166, + "learning_rate": 8.823495847065176e-06, + "loss": 0.7491, + "step": 8177 + }, + { + "epoch": 0.45010732566459355, + "grad_norm": 0.6643275618553162, + "learning_rate": 8.823216512251153e-06, + "loss": 0.6773, + "step": 8178 + }, + { + "epoch": 0.4501623644669492, + "grad_norm": 0.8201391100883484, + "learning_rate": 8.82293714870286e-06, + "loss": 0.8065, + "step": 8179 + }, + { + "epoch": 0.45021740326930487, + "grad_norm": 0.7783405780792236, + "learning_rate": 8.822657756422394e-06, + "loss": 0.7884, + "step": 8180 + }, + { + "epoch": 0.4502724420716605, + "grad_norm": 0.720745861530304, + "learning_rate": 8.822378335411856e-06, + "loss": 0.765, + "step": 8181 + }, + { + "epoch": 0.4503274808740162, + "grad_norm": 0.740364670753479, + "learning_rate": 8.822098885673346e-06, + "loss": 0.6354, + "step": 8182 + }, + { + "epoch": 0.45038251967637183, + "grad_norm": 0.8049225807189941, + "learning_rate": 8.821819407208963e-06, + "loss": 0.7023, + "step": 8183 + }, + { + "epoch": 0.4504375584787275, + "grad_norm": 0.7320911288261414, + "learning_rate": 8.821539900020808e-06, + "loss": 0.8429, + "step": 8184 + }, + { + "epoch": 0.45049259728108315, + "grad_norm": 0.7065376043319702, + "learning_rate": 8.821260364110984e-06, + "loss": 0.7283, + "step": 8185 + }, + { + "epoch": 0.45054763608343884, + "grad_norm": 0.7172972559928894, + "learning_rate": 8.820980799481588e-06, + "loss": 0.7673, + "step": 8186 + }, + { + "epoch": 0.4506026748857945, + "grad_norm": 0.712273895740509, + "learning_rate": 8.820701206134724e-06, + "loss": 0.7317, + "step": 8187 + }, + { + "epoch": 0.45065771368815016, + "grad_norm": 0.6954227685928345, + "learning_rate": 8.820421584072492e-06, + "loss": 0.7037, + "step": 8188 + }, + { + "epoch": 0.4507127524905058, + "grad_norm": 0.6790304780006409, + "learning_rate": 8.820141933296994e-06, + "loss": 0.7544, + "step": 8189 + }, + { + "epoch": 0.4507677912928615, + "grad_norm": 0.7483745813369751, + "learning_rate": 8.819862253810332e-06, + "loss": 0.7894, + "step": 8190 + }, + { + "epoch": 0.4508228300952171, + "grad_norm": 0.7926133871078491, + "learning_rate": 8.819582545614608e-06, + "loss": 0.8085, + "step": 8191 + }, + { + "epoch": 0.4508778688975728, + "grad_norm": 0.8442840576171875, + "learning_rate": 8.819302808711924e-06, + "loss": 0.8252, + "step": 8192 + }, + { + "epoch": 0.45093290769992844, + "grad_norm": 0.8359581232070923, + "learning_rate": 8.819023043104383e-06, + "loss": 0.8187, + "step": 8193 + }, + { + "epoch": 0.45098794650228413, + "grad_norm": 0.7793936133384705, + "learning_rate": 8.818743248794085e-06, + "loss": 0.8425, + "step": 8194 + }, + { + "epoch": 0.45104298530463977, + "grad_norm": 0.735509991645813, + "learning_rate": 8.818463425783136e-06, + "loss": 0.7781, + "step": 8195 + }, + { + "epoch": 0.45109802410699545, + "grad_norm": 0.6735361814498901, + "learning_rate": 8.818183574073639e-06, + "loss": 0.6987, + "step": 8196 + }, + { + "epoch": 0.4511530629093511, + "grad_norm": 0.7780157923698425, + "learning_rate": 8.817903693667695e-06, + "loss": 0.8474, + "step": 8197 + }, + { + "epoch": 0.4512081017117068, + "grad_norm": 0.6714445948600769, + "learning_rate": 8.817623784567411e-06, + "loss": 0.7216, + "step": 8198 + }, + { + "epoch": 0.4512631405140624, + "grad_norm": 0.6311395168304443, + "learning_rate": 8.817343846774886e-06, + "loss": 0.5724, + "step": 8199 + }, + { + "epoch": 0.4513181793164181, + "grad_norm": 0.7446333169937134, + "learning_rate": 8.817063880292227e-06, + "loss": 0.7867, + "step": 8200 + }, + { + "epoch": 0.45137321811877373, + "grad_norm": 0.7684246301651001, + "learning_rate": 8.816783885121539e-06, + "loss": 0.8141, + "step": 8201 + }, + { + "epoch": 0.4514282569211294, + "grad_norm": 0.754781186580658, + "learning_rate": 8.816503861264925e-06, + "loss": 0.8438, + "step": 8202 + }, + { + "epoch": 0.45148329572348506, + "grad_norm": 0.7705762982368469, + "learning_rate": 8.816223808724488e-06, + "loss": 0.8948, + "step": 8203 + }, + { + "epoch": 0.4515383345258407, + "grad_norm": 0.7731552720069885, + "learning_rate": 8.815943727502333e-06, + "loss": 0.7462, + "step": 8204 + }, + { + "epoch": 0.4515933733281964, + "grad_norm": 0.6615393757820129, + "learning_rate": 8.81566361760057e-06, + "loss": 0.7499, + "step": 8205 + }, + { + "epoch": 0.451648412130552, + "grad_norm": 0.724453866481781, + "learning_rate": 8.8153834790213e-06, + "loss": 0.7382, + "step": 8206 + }, + { + "epoch": 0.4517034509329077, + "grad_norm": 0.6369735598564148, + "learning_rate": 8.815103311766629e-06, + "loss": 0.7452, + "step": 8207 + }, + { + "epoch": 0.45175848973526334, + "grad_norm": 0.686000406742096, + "learning_rate": 8.814823115838659e-06, + "loss": 0.6971, + "step": 8208 + }, + { + "epoch": 0.451813528537619, + "grad_norm": 0.7372714281082153, + "learning_rate": 8.814542891239505e-06, + "loss": 0.8553, + "step": 8209 + }, + { + "epoch": 0.45186856733997466, + "grad_norm": 0.8348672986030579, + "learning_rate": 8.814262637971264e-06, + "loss": 0.7135, + "step": 8210 + }, + { + "epoch": 0.45192360614233035, + "grad_norm": 0.7829258441925049, + "learning_rate": 8.813982356036049e-06, + "loss": 0.7974, + "step": 8211 + }, + { + "epoch": 0.451978644944686, + "grad_norm": 0.7013983726501465, + "learning_rate": 8.81370204543596e-06, + "loss": 0.7531, + "step": 8212 + }, + { + "epoch": 0.45203368374704167, + "grad_norm": 0.8424196243286133, + "learning_rate": 8.81342170617311e-06, + "loss": 0.8217, + "step": 8213 + }, + { + "epoch": 0.4520887225493973, + "grad_norm": 0.7113365530967712, + "learning_rate": 8.813141338249603e-06, + "loss": 0.7728, + "step": 8214 + }, + { + "epoch": 0.452143761351753, + "grad_norm": 0.958642303943634, + "learning_rate": 8.812860941667545e-06, + "loss": 0.7234, + "step": 8215 + }, + { + "epoch": 0.4521988001541086, + "grad_norm": 0.6712706685066223, + "learning_rate": 8.812580516429045e-06, + "loss": 0.6998, + "step": 8216 + }, + { + "epoch": 0.4522538389564643, + "grad_norm": 0.7258469462394714, + "learning_rate": 8.812300062536212e-06, + "loss": 0.6758, + "step": 8217 + }, + { + "epoch": 0.45230887775881995, + "grad_norm": 0.735047459602356, + "learning_rate": 8.812019579991152e-06, + "loss": 0.7045, + "step": 8218 + }, + { + "epoch": 0.45236391656117564, + "grad_norm": 0.8339886665344238, + "learning_rate": 8.811739068795971e-06, + "loss": 0.8069, + "step": 8219 + }, + { + "epoch": 0.45241895536353127, + "grad_norm": 0.7170082926750183, + "learning_rate": 8.81145852895278e-06, + "loss": 0.6345, + "step": 8220 + }, + { + "epoch": 0.45247399416588696, + "grad_norm": 0.6892569661140442, + "learning_rate": 8.81117796046369e-06, + "loss": 0.712, + "step": 8221 + }, + { + "epoch": 0.4525290329682426, + "grad_norm": 0.6837140321731567, + "learning_rate": 8.810897363330804e-06, + "loss": 0.7184, + "step": 8222 + }, + { + "epoch": 0.4525840717705983, + "grad_norm": 0.7410069108009338, + "learning_rate": 8.810616737556235e-06, + "loss": 0.8265, + "step": 8223 + }, + { + "epoch": 0.4526391105729539, + "grad_norm": 0.6945875883102417, + "learning_rate": 8.810336083142089e-06, + "loss": 0.7163, + "step": 8224 + }, + { + "epoch": 0.4526941493753096, + "grad_norm": 0.6978884339332581, + "learning_rate": 8.810055400090477e-06, + "loss": 0.795, + "step": 8225 + }, + { + "epoch": 0.45274918817766524, + "grad_norm": 0.7209095358848572, + "learning_rate": 8.809774688403509e-06, + "loss": 0.7317, + "step": 8226 + }, + { + "epoch": 0.45280422698002093, + "grad_norm": 0.7279626727104187, + "learning_rate": 8.809493948083294e-06, + "loss": 0.7699, + "step": 8227 + }, + { + "epoch": 0.45285926578237656, + "grad_norm": 0.7642556428909302, + "learning_rate": 8.809213179131943e-06, + "loss": 0.8518, + "step": 8228 + }, + { + "epoch": 0.45291430458473225, + "grad_norm": 0.6868709325790405, + "learning_rate": 8.808932381551565e-06, + "loss": 0.737, + "step": 8229 + }, + { + "epoch": 0.4529693433870879, + "grad_norm": 0.7012789845466614, + "learning_rate": 8.80865155534427e-06, + "loss": 0.8146, + "step": 8230 + }, + { + "epoch": 0.4530243821894436, + "grad_norm": 0.678683340549469, + "learning_rate": 8.808370700512171e-06, + "loss": 0.7531, + "step": 8231 + }, + { + "epoch": 0.4530794209917992, + "grad_norm": 0.690559983253479, + "learning_rate": 8.808089817057377e-06, + "loss": 0.6779, + "step": 8232 + }, + { + "epoch": 0.4531344597941549, + "grad_norm": 0.7179763317108154, + "learning_rate": 8.807808904981997e-06, + "loss": 0.8815, + "step": 8233 + }, + { + "epoch": 0.45318949859651053, + "grad_norm": 0.7708277702331543, + "learning_rate": 8.807527964288147e-06, + "loss": 0.8084, + "step": 8234 + }, + { + "epoch": 0.4532445373988662, + "grad_norm": 0.6828494071960449, + "learning_rate": 8.807246994977936e-06, + "loss": 0.7587, + "step": 8235 + }, + { + "epoch": 0.45329957620122185, + "grad_norm": 0.7085250616073608, + "learning_rate": 8.806965997053475e-06, + "loss": 0.7894, + "step": 8236 + }, + { + "epoch": 0.45335461500357754, + "grad_norm": 0.7723467946052551, + "learning_rate": 8.806684970516876e-06, + "loss": 0.7408, + "step": 8237 + }, + { + "epoch": 0.4534096538059332, + "grad_norm": 0.8887566328048706, + "learning_rate": 8.806403915370253e-06, + "loss": 0.9022, + "step": 8238 + }, + { + "epoch": 0.45346469260828887, + "grad_norm": 0.7379833459854126, + "learning_rate": 8.806122831615718e-06, + "loss": 0.8264, + "step": 8239 + }, + { + "epoch": 0.4535197314106445, + "grad_norm": 0.903279721736908, + "learning_rate": 8.80584171925538e-06, + "loss": 0.7432, + "step": 8240 + }, + { + "epoch": 0.4535747702130002, + "grad_norm": 0.7671363353729248, + "learning_rate": 8.805560578291356e-06, + "loss": 0.8109, + "step": 8241 + }, + { + "epoch": 0.4536298090153558, + "grad_norm": 0.6047827005386353, + "learning_rate": 8.805279408725755e-06, + "loss": 0.6628, + "step": 8242 + }, + { + "epoch": 0.4536848478177115, + "grad_norm": 1.0570796728134155, + "learning_rate": 8.804998210560696e-06, + "loss": 0.7981, + "step": 8243 + }, + { + "epoch": 0.45373988662006715, + "grad_norm": 0.7116600871086121, + "learning_rate": 8.804716983798288e-06, + "loss": 0.7601, + "step": 8244 + }, + { + "epoch": 0.45379492542242283, + "grad_norm": 0.7162767648696899, + "learning_rate": 8.804435728440644e-06, + "loss": 0.8389, + "step": 8245 + }, + { + "epoch": 0.45384996422477847, + "grad_norm": 0.6715626120567322, + "learning_rate": 8.80415444448988e-06, + "loss": 0.6377, + "step": 8246 + }, + { + "epoch": 0.4539050030271341, + "grad_norm": 0.7168908715248108, + "learning_rate": 8.80387313194811e-06, + "loss": 0.7946, + "step": 8247 + }, + { + "epoch": 0.4539600418294898, + "grad_norm": 0.7497992515563965, + "learning_rate": 8.803591790817448e-06, + "loss": 0.8026, + "step": 8248 + }, + { + "epoch": 0.4540150806318454, + "grad_norm": 0.6665049195289612, + "learning_rate": 8.803310421100009e-06, + "loss": 0.779, + "step": 8249 + }, + { + "epoch": 0.4540701194342011, + "grad_norm": 0.766674280166626, + "learning_rate": 8.803029022797905e-06, + "loss": 0.7467, + "step": 8250 + }, + { + "epoch": 0.45412515823655675, + "grad_norm": 0.7306104302406311, + "learning_rate": 8.802747595913255e-06, + "loss": 0.8323, + "step": 8251 + }, + { + "epoch": 0.45418019703891244, + "grad_norm": 0.6425766944885254, + "learning_rate": 8.802466140448169e-06, + "loss": 0.7226, + "step": 8252 + }, + { + "epoch": 0.45423523584126807, + "grad_norm": 0.7992560267448425, + "learning_rate": 8.802184656404769e-06, + "loss": 0.7285, + "step": 8253 + }, + { + "epoch": 0.45429027464362376, + "grad_norm": 0.6935924887657166, + "learning_rate": 8.801903143785164e-06, + "loss": 0.5757, + "step": 8254 + }, + { + "epoch": 0.4543453134459794, + "grad_norm": 0.7091512084007263, + "learning_rate": 8.801621602591473e-06, + "loss": 0.7719, + "step": 8255 + }, + { + "epoch": 0.4544003522483351, + "grad_norm": 0.851231038570404, + "learning_rate": 8.801340032825814e-06, + "loss": 0.7804, + "step": 8256 + }, + { + "epoch": 0.4544553910506907, + "grad_norm": 0.7443445920944214, + "learning_rate": 8.801058434490298e-06, + "loss": 0.7172, + "step": 8257 + }, + { + "epoch": 0.4545104298530464, + "grad_norm": 0.7156546115875244, + "learning_rate": 8.800776807587046e-06, + "loss": 0.7756, + "step": 8258 + }, + { + "epoch": 0.45456546865540204, + "grad_norm": 0.8027580380439758, + "learning_rate": 8.800495152118172e-06, + "loss": 0.8035, + "step": 8259 + }, + { + "epoch": 0.4546205074577577, + "grad_norm": 0.6868240833282471, + "learning_rate": 8.800213468085794e-06, + "loss": 0.7159, + "step": 8260 + }, + { + "epoch": 0.45467554626011336, + "grad_norm": 0.9127504229545593, + "learning_rate": 8.79993175549203e-06, + "loss": 0.7705, + "step": 8261 + }, + { + "epoch": 0.45473058506246905, + "grad_norm": 0.7074575424194336, + "learning_rate": 8.799650014338994e-06, + "loss": 0.7841, + "step": 8262 + }, + { + "epoch": 0.4547856238648247, + "grad_norm": 0.7462378740310669, + "learning_rate": 8.799368244628807e-06, + "loss": 0.8125, + "step": 8263 + }, + { + "epoch": 0.4548406626671804, + "grad_norm": 0.7510300874710083, + "learning_rate": 8.799086446363585e-06, + "loss": 0.8354, + "step": 8264 + }, + { + "epoch": 0.454895701469536, + "grad_norm": 0.7134591937065125, + "learning_rate": 8.798804619545446e-06, + "loss": 0.7968, + "step": 8265 + }, + { + "epoch": 0.4549507402718917, + "grad_norm": 1.0424071550369263, + "learning_rate": 8.798522764176509e-06, + "loss": 0.8638, + "step": 8266 + }, + { + "epoch": 0.45500577907424733, + "grad_norm": 0.6805267930030823, + "learning_rate": 8.79824088025889e-06, + "loss": 0.757, + "step": 8267 + }, + { + "epoch": 0.455060817876603, + "grad_norm": 0.8145313262939453, + "learning_rate": 8.79795896779471e-06, + "loss": 0.7589, + "step": 8268 + }, + { + "epoch": 0.45511585667895865, + "grad_norm": 0.7611781358718872, + "learning_rate": 8.79767702678609e-06, + "loss": 0.8426, + "step": 8269 + }, + { + "epoch": 0.45517089548131434, + "grad_norm": 0.7639568448066711, + "learning_rate": 8.797395057235142e-06, + "loss": 0.6609, + "step": 8270 + }, + { + "epoch": 0.45522593428367, + "grad_norm": 0.8577544093132019, + "learning_rate": 8.79711305914399e-06, + "loss": 0.8085, + "step": 8271 + }, + { + "epoch": 0.45528097308602566, + "grad_norm": 0.7740383148193359, + "learning_rate": 8.796831032514754e-06, + "loss": 0.8689, + "step": 8272 + }, + { + "epoch": 0.4553360118883813, + "grad_norm": 0.7300885915756226, + "learning_rate": 8.796548977349553e-06, + "loss": 0.8303, + "step": 8273 + }, + { + "epoch": 0.455391050690737, + "grad_norm": 0.6677057147026062, + "learning_rate": 8.796266893650504e-06, + "loss": 0.7449, + "step": 8274 + }, + { + "epoch": 0.4554460894930926, + "grad_norm": 0.7269144058227539, + "learning_rate": 8.79598478141973e-06, + "loss": 0.8744, + "step": 8275 + }, + { + "epoch": 0.4555011282954483, + "grad_norm": 0.7458559274673462, + "learning_rate": 8.795702640659351e-06, + "loss": 0.8036, + "step": 8276 + }, + { + "epoch": 0.45555616709780394, + "grad_norm": 0.7693114280700684, + "learning_rate": 8.795420471371487e-06, + "loss": 0.7617, + "step": 8277 + }, + { + "epoch": 0.45561120590015963, + "grad_norm": 0.7594510316848755, + "learning_rate": 8.79513827355826e-06, + "loss": 0.7049, + "step": 8278 + }, + { + "epoch": 0.45566624470251527, + "grad_norm": 0.7481217980384827, + "learning_rate": 8.794856047221786e-06, + "loss": 0.804, + "step": 8279 + }, + { + "epoch": 0.45572128350487096, + "grad_norm": 0.726859986782074, + "learning_rate": 8.794573792364192e-06, + "loss": 0.7322, + "step": 8280 + }, + { + "epoch": 0.4557763223072266, + "grad_norm": 0.6800149083137512, + "learning_rate": 8.794291508987597e-06, + "loss": 0.8467, + "step": 8281 + }, + { + "epoch": 0.4558313611095823, + "grad_norm": 0.6264217495918274, + "learning_rate": 8.794009197094122e-06, + "loss": 0.6203, + "step": 8282 + }, + { + "epoch": 0.4558863999119379, + "grad_norm": 0.6973850131034851, + "learning_rate": 8.79372685668589e-06, + "loss": 0.8211, + "step": 8283 + }, + { + "epoch": 0.4559414387142936, + "grad_norm": 0.6992879509925842, + "learning_rate": 8.793444487765022e-06, + "loss": 0.7831, + "step": 8284 + }, + { + "epoch": 0.45599647751664923, + "grad_norm": 0.7641519904136658, + "learning_rate": 8.793162090333643e-06, + "loss": 0.7519, + "step": 8285 + }, + { + "epoch": 0.4560515163190049, + "grad_norm": 0.7296152710914612, + "learning_rate": 8.79287966439387e-06, + "loss": 0.8738, + "step": 8286 + }, + { + "epoch": 0.45610655512136056, + "grad_norm": 0.7549383044242859, + "learning_rate": 8.79259720994783e-06, + "loss": 0.7868, + "step": 8287 + }, + { + "epoch": 0.45616159392371625, + "grad_norm": 0.7932083606719971, + "learning_rate": 8.792314726997644e-06, + "loss": 0.8443, + "step": 8288 + }, + { + "epoch": 0.4562166327260719, + "grad_norm": 0.7999894022941589, + "learning_rate": 8.792032215545437e-06, + "loss": 0.852, + "step": 8289 + }, + { + "epoch": 0.4562716715284275, + "grad_norm": 0.8092383742332458, + "learning_rate": 8.79174967559333e-06, + "loss": 0.7922, + "step": 8290 + }, + { + "epoch": 0.4563267103307832, + "grad_norm": 0.7481340169906616, + "learning_rate": 8.791467107143447e-06, + "loss": 0.7086, + "step": 8291 + }, + { + "epoch": 0.45638174913313884, + "grad_norm": 0.8096129298210144, + "learning_rate": 8.791184510197912e-06, + "loss": 0.6645, + "step": 8292 + }, + { + "epoch": 0.4564367879354945, + "grad_norm": 0.7276492118835449, + "learning_rate": 8.79090188475885e-06, + "loss": 0.7174, + "step": 8293 + }, + { + "epoch": 0.45649182673785016, + "grad_norm": 0.815535843372345, + "learning_rate": 8.790619230828385e-06, + "loss": 0.8622, + "step": 8294 + }, + { + "epoch": 0.45654686554020585, + "grad_norm": 0.8191169500350952, + "learning_rate": 8.790336548408637e-06, + "loss": 0.8666, + "step": 8295 + }, + { + "epoch": 0.4566019043425615, + "grad_norm": 0.7449167966842651, + "learning_rate": 8.790053837501737e-06, + "loss": 0.7728, + "step": 8296 + }, + { + "epoch": 0.45665694314491717, + "grad_norm": 0.7311065196990967, + "learning_rate": 8.789771098109808e-06, + "loss": 0.8059, + "step": 8297 + }, + { + "epoch": 0.4567119819472728, + "grad_norm": 0.7381907105445862, + "learning_rate": 8.789488330234971e-06, + "loss": 0.7722, + "step": 8298 + }, + { + "epoch": 0.4567670207496285, + "grad_norm": 0.8180661201477051, + "learning_rate": 8.789205533879355e-06, + "loss": 0.9032, + "step": 8299 + }, + { + "epoch": 0.4568220595519841, + "grad_norm": 0.7993118762969971, + "learning_rate": 8.788922709045087e-06, + "loss": 0.8065, + "step": 8300 + }, + { + "epoch": 0.4568770983543398, + "grad_norm": 0.8449206948280334, + "learning_rate": 8.788639855734287e-06, + "loss": 0.7895, + "step": 8301 + }, + { + "epoch": 0.45693213715669545, + "grad_norm": 0.9224583506584167, + "learning_rate": 8.788356973949084e-06, + "loss": 0.78, + "step": 8302 + }, + { + "epoch": 0.45698717595905114, + "grad_norm": 0.7109915614128113, + "learning_rate": 8.788074063691604e-06, + "loss": 0.8029, + "step": 8303 + }, + { + "epoch": 0.4570422147614068, + "grad_norm": 0.7372310757637024, + "learning_rate": 8.787791124963976e-06, + "loss": 0.8118, + "step": 8304 + }, + { + "epoch": 0.45709725356376246, + "grad_norm": 0.8127168416976929, + "learning_rate": 8.787508157768323e-06, + "loss": 0.8665, + "step": 8305 + }, + { + "epoch": 0.4571522923661181, + "grad_norm": 0.7193050980567932, + "learning_rate": 8.787225162106771e-06, + "loss": 0.749, + "step": 8306 + }, + { + "epoch": 0.4572073311684738, + "grad_norm": 0.8825041651725769, + "learning_rate": 8.786942137981449e-06, + "loss": 0.9651, + "step": 8307 + }, + { + "epoch": 0.4572623699708294, + "grad_norm": 0.6854885816574097, + "learning_rate": 8.786659085394485e-06, + "loss": 0.8259, + "step": 8308 + }, + { + "epoch": 0.4573174087731851, + "grad_norm": 0.6698010563850403, + "learning_rate": 8.786376004348004e-06, + "loss": 0.7212, + "step": 8309 + }, + { + "epoch": 0.45737244757554074, + "grad_norm": 0.7706398963928223, + "learning_rate": 8.786092894844132e-06, + "loss": 0.719, + "step": 8310 + }, + { + "epoch": 0.45742748637789643, + "grad_norm": 0.8905620574951172, + "learning_rate": 8.785809756885002e-06, + "loss": 0.7518, + "step": 8311 + }, + { + "epoch": 0.45748252518025206, + "grad_norm": 0.7537117004394531, + "learning_rate": 8.78552659047274e-06, + "loss": 0.8267, + "step": 8312 + }, + { + "epoch": 0.45753756398260775, + "grad_norm": 0.7840754985809326, + "learning_rate": 8.78524339560947e-06, + "loss": 0.8417, + "step": 8313 + }, + { + "epoch": 0.4575926027849634, + "grad_norm": 0.7373713254928589, + "learning_rate": 8.784960172297327e-06, + "loss": 0.784, + "step": 8314 + }, + { + "epoch": 0.4576476415873191, + "grad_norm": 0.6648432016372681, + "learning_rate": 8.784676920538436e-06, + "loss": 0.7252, + "step": 8315 + }, + { + "epoch": 0.4577026803896747, + "grad_norm": 0.7904912829399109, + "learning_rate": 8.784393640334925e-06, + "loss": 0.7777, + "step": 8316 + }, + { + "epoch": 0.4577577191920304, + "grad_norm": 0.7691501379013062, + "learning_rate": 8.784110331688927e-06, + "loss": 0.733, + "step": 8317 + }, + { + "epoch": 0.45781275799438603, + "grad_norm": 0.6054617762565613, + "learning_rate": 8.783826994602566e-06, + "loss": 0.6367, + "step": 8318 + }, + { + "epoch": 0.4578677967967417, + "grad_norm": 0.7495457530021667, + "learning_rate": 8.783543629077976e-06, + "loss": 0.8672, + "step": 8319 + }, + { + "epoch": 0.45792283559909736, + "grad_norm": 0.6979867815971375, + "learning_rate": 8.783260235117283e-06, + "loss": 0.7338, + "step": 8320 + }, + { + "epoch": 0.45797787440145304, + "grad_norm": 0.6927759647369385, + "learning_rate": 8.78297681272262e-06, + "loss": 0.6925, + "step": 8321 + }, + { + "epoch": 0.4580329132038087, + "grad_norm": 0.9076687097549438, + "learning_rate": 8.782693361896115e-06, + "loss": 0.8225, + "step": 8322 + }, + { + "epoch": 0.45808795200616437, + "grad_norm": 0.7990893721580505, + "learning_rate": 8.782409882639902e-06, + "loss": 0.8144, + "step": 8323 + }, + { + "epoch": 0.45814299080852, + "grad_norm": 0.7958230376243591, + "learning_rate": 8.782126374956107e-06, + "loss": 0.7717, + "step": 8324 + }, + { + "epoch": 0.4581980296108757, + "grad_norm": 0.7694645524024963, + "learning_rate": 8.781842838846861e-06, + "loss": 0.8314, + "step": 8325 + }, + { + "epoch": 0.4582530684132313, + "grad_norm": 0.8653621077537537, + "learning_rate": 8.781559274314297e-06, + "loss": 0.7567, + "step": 8326 + }, + { + "epoch": 0.458308107215587, + "grad_norm": 0.7834668755531311, + "learning_rate": 8.781275681360548e-06, + "loss": 0.7431, + "step": 8327 + }, + { + "epoch": 0.45836314601794265, + "grad_norm": 0.6800104975700378, + "learning_rate": 8.780992059987742e-06, + "loss": 0.8266, + "step": 8328 + }, + { + "epoch": 0.45841818482029834, + "grad_norm": 0.7274910807609558, + "learning_rate": 8.780708410198011e-06, + "loss": 0.7358, + "step": 8329 + }, + { + "epoch": 0.45847322362265397, + "grad_norm": 0.8102344870567322, + "learning_rate": 8.780424731993488e-06, + "loss": 0.7397, + "step": 8330 + }, + { + "epoch": 0.45852826242500966, + "grad_norm": 0.7536956071853638, + "learning_rate": 8.780141025376305e-06, + "loss": 0.7053, + "step": 8331 + }, + { + "epoch": 0.4585833012273653, + "grad_norm": 0.678535521030426, + "learning_rate": 8.779857290348594e-06, + "loss": 0.792, + "step": 8332 + }, + { + "epoch": 0.4586383400297209, + "grad_norm": 0.8847216963768005, + "learning_rate": 8.779573526912487e-06, + "loss": 0.8117, + "step": 8333 + }, + { + "epoch": 0.4586933788320766, + "grad_norm": 0.6997288465499878, + "learning_rate": 8.779289735070117e-06, + "loss": 0.7797, + "step": 8334 + }, + { + "epoch": 0.45874841763443225, + "grad_norm": 0.7445441484451294, + "learning_rate": 8.779005914823617e-06, + "loss": 0.7505, + "step": 8335 + }, + { + "epoch": 0.45880345643678794, + "grad_norm": 0.618844211101532, + "learning_rate": 8.778722066175121e-06, + "loss": 0.661, + "step": 8336 + }, + { + "epoch": 0.45885849523914357, + "grad_norm": 0.6810492873191833, + "learning_rate": 8.778438189126761e-06, + "loss": 0.6819, + "step": 8337 + }, + { + "epoch": 0.45891353404149926, + "grad_norm": 0.6785591244697571, + "learning_rate": 8.778154283680671e-06, + "loss": 0.7808, + "step": 8338 + }, + { + "epoch": 0.4589685728438549, + "grad_norm": 0.7461212873458862, + "learning_rate": 8.777870349838984e-06, + "loss": 0.8566, + "step": 8339 + }, + { + "epoch": 0.4590236116462106, + "grad_norm": 0.6731496453285217, + "learning_rate": 8.777586387603836e-06, + "loss": 0.823, + "step": 8340 + }, + { + "epoch": 0.4590786504485662, + "grad_norm": 0.7295553684234619, + "learning_rate": 8.77730239697736e-06, + "loss": 0.9229, + "step": 8341 + }, + { + "epoch": 0.4591336892509219, + "grad_norm": 0.783275842666626, + "learning_rate": 8.77701837796169e-06, + "loss": 0.782, + "step": 8342 + }, + { + "epoch": 0.45918872805327754, + "grad_norm": 0.6952852606773376, + "learning_rate": 8.77673433055896e-06, + "loss": 0.7977, + "step": 8343 + }, + { + "epoch": 0.45924376685563323, + "grad_norm": 0.7381969094276428, + "learning_rate": 8.776450254771305e-06, + "loss": 0.768, + "step": 8344 + }, + { + "epoch": 0.45929880565798886, + "grad_norm": 0.7911093831062317, + "learning_rate": 8.776166150600862e-06, + "loss": 0.8284, + "step": 8345 + }, + { + "epoch": 0.45935384446034455, + "grad_norm": 0.7319246530532837, + "learning_rate": 8.775882018049765e-06, + "loss": 0.8135, + "step": 8346 + }, + { + "epoch": 0.4594088832627002, + "grad_norm": 0.7888429760932922, + "learning_rate": 8.77559785712015e-06, + "loss": 0.9001, + "step": 8347 + }, + { + "epoch": 0.4594639220650559, + "grad_norm": 0.6983326077461243, + "learning_rate": 8.775313667814151e-06, + "loss": 0.7537, + "step": 8348 + }, + { + "epoch": 0.4595189608674115, + "grad_norm": 0.7532416582107544, + "learning_rate": 8.775029450133905e-06, + "loss": 0.8307, + "step": 8349 + }, + { + "epoch": 0.4595739996697672, + "grad_norm": 0.7159993052482605, + "learning_rate": 8.774745204081549e-06, + "loss": 0.7874, + "step": 8350 + }, + { + "epoch": 0.45962903847212283, + "grad_norm": 0.6898767352104187, + "learning_rate": 8.774460929659218e-06, + "loss": 0.7453, + "step": 8351 + }, + { + "epoch": 0.4596840772744785, + "grad_norm": 0.6833236813545227, + "learning_rate": 8.774176626869051e-06, + "loss": 0.7281, + "step": 8352 + }, + { + "epoch": 0.45973911607683415, + "grad_norm": 0.7840244770050049, + "learning_rate": 8.77389229571318e-06, + "loss": 0.7194, + "step": 8353 + }, + { + "epoch": 0.45979415487918984, + "grad_norm": 0.7920441627502441, + "learning_rate": 8.773607936193747e-06, + "loss": 0.7135, + "step": 8354 + }, + { + "epoch": 0.4598491936815455, + "grad_norm": 0.7395668625831604, + "learning_rate": 8.773323548312884e-06, + "loss": 0.8162, + "step": 8355 + }, + { + "epoch": 0.45990423248390117, + "grad_norm": 0.7854128479957581, + "learning_rate": 8.773039132072734e-06, + "loss": 0.8252, + "step": 8356 + }, + { + "epoch": 0.4599592712862568, + "grad_norm": 0.694997251033783, + "learning_rate": 8.772754687475431e-06, + "loss": 0.6627, + "step": 8357 + }, + { + "epoch": 0.4600143100886125, + "grad_norm": 0.7698866724967957, + "learning_rate": 8.772470214523112e-06, + "loss": 0.8814, + "step": 8358 + }, + { + "epoch": 0.4600693488909681, + "grad_norm": 0.7323407530784607, + "learning_rate": 8.77218571321792e-06, + "loss": 0.7769, + "step": 8359 + }, + { + "epoch": 0.4601243876933238, + "grad_norm": 0.6637027263641357, + "learning_rate": 8.771901183561986e-06, + "loss": 0.6741, + "step": 8360 + }, + { + "epoch": 0.46017942649567944, + "grad_norm": 0.7423702478408813, + "learning_rate": 8.771616625557455e-06, + "loss": 0.7303, + "step": 8361 + }, + { + "epoch": 0.46023446529803513, + "grad_norm": 0.7599568367004395, + "learning_rate": 8.771332039206463e-06, + "loss": 0.8161, + "step": 8362 + }, + { + "epoch": 0.46028950410039077, + "grad_norm": 0.9063183069229126, + "learning_rate": 8.771047424511148e-06, + "loss": 0.8098, + "step": 8363 + }, + { + "epoch": 0.46034454290274646, + "grad_norm": 0.658210813999176, + "learning_rate": 8.770762781473651e-06, + "loss": 0.7097, + "step": 8364 + }, + { + "epoch": 0.4603995817051021, + "grad_norm": 0.8396975994110107, + "learning_rate": 8.770478110096111e-06, + "loss": 0.8731, + "step": 8365 + }, + { + "epoch": 0.4604546205074578, + "grad_norm": 0.7334815263748169, + "learning_rate": 8.770193410380663e-06, + "loss": 0.7689, + "step": 8366 + }, + { + "epoch": 0.4605096593098134, + "grad_norm": 0.8220386505126953, + "learning_rate": 8.769908682329453e-06, + "loss": 0.8139, + "step": 8367 + }, + { + "epoch": 0.4605646981121691, + "grad_norm": 0.8077995181083679, + "learning_rate": 8.76962392594462e-06, + "loss": 0.7379, + "step": 8368 + }, + { + "epoch": 0.46061973691452474, + "grad_norm": 0.8007730841636658, + "learning_rate": 8.7693391412283e-06, + "loss": 0.7835, + "step": 8369 + }, + { + "epoch": 0.4606747757168804, + "grad_norm": 0.7108187079429626, + "learning_rate": 8.769054328182637e-06, + "loss": 0.6787, + "step": 8370 + }, + { + "epoch": 0.46072981451923606, + "grad_norm": 0.7623056173324585, + "learning_rate": 8.768769486809772e-06, + "loss": 0.8056, + "step": 8371 + }, + { + "epoch": 0.46078485332159175, + "grad_norm": 0.6991614103317261, + "learning_rate": 8.768484617111843e-06, + "loss": 0.7404, + "step": 8372 + }, + { + "epoch": 0.4608398921239474, + "grad_norm": 0.7531471848487854, + "learning_rate": 8.768199719090991e-06, + "loss": 0.8104, + "step": 8373 + }, + { + "epoch": 0.46089493092630307, + "grad_norm": 1.0271111726760864, + "learning_rate": 8.76791479274936e-06, + "loss": 0.9028, + "step": 8374 + }, + { + "epoch": 0.4609499697286587, + "grad_norm": 0.7346897125244141, + "learning_rate": 8.76762983808909e-06, + "loss": 0.8179, + "step": 8375 + }, + { + "epoch": 0.46100500853101434, + "grad_norm": 0.6413559913635254, + "learning_rate": 8.767344855112324e-06, + "loss": 0.7995, + "step": 8376 + }, + { + "epoch": 0.46106004733337, + "grad_norm": 0.7187537550926208, + "learning_rate": 8.767059843821199e-06, + "loss": 0.7973, + "step": 8377 + }, + { + "epoch": 0.46111508613572566, + "grad_norm": 0.6819092035293579, + "learning_rate": 8.766774804217864e-06, + "loss": 0.8255, + "step": 8378 + }, + { + "epoch": 0.46117012493808135, + "grad_norm": 0.683318018913269, + "learning_rate": 8.766489736304457e-06, + "loss": 0.6794, + "step": 8379 + }, + { + "epoch": 0.461225163740437, + "grad_norm": 0.7345470786094666, + "learning_rate": 8.76620464008312e-06, + "loss": 0.8741, + "step": 8380 + }, + { + "epoch": 0.46128020254279267, + "grad_norm": 0.7369397282600403, + "learning_rate": 8.765919515556e-06, + "loss": 0.8301, + "step": 8381 + }, + { + "epoch": 0.4613352413451483, + "grad_norm": 0.7304979562759399, + "learning_rate": 8.765634362725233e-06, + "loss": 0.7507, + "step": 8382 + }, + { + "epoch": 0.461390280147504, + "grad_norm": 0.7968454957008362, + "learning_rate": 8.765349181592969e-06, + "loss": 0.7396, + "step": 8383 + }, + { + "epoch": 0.46144531894985963, + "grad_norm": 0.691439151763916, + "learning_rate": 8.765063972161347e-06, + "loss": 0.7199, + "step": 8384 + }, + { + "epoch": 0.4615003577522153, + "grad_norm": 0.8355879187583923, + "learning_rate": 8.764778734432513e-06, + "loss": 0.7369, + "step": 8385 + }, + { + "epoch": 0.46155539655457095, + "grad_norm": 0.908017098903656, + "learning_rate": 8.76449346840861e-06, + "loss": 0.8271, + "step": 8386 + }, + { + "epoch": 0.46161043535692664, + "grad_norm": 0.6426172852516174, + "learning_rate": 8.764208174091781e-06, + "loss": 0.6646, + "step": 8387 + }, + { + "epoch": 0.4616654741592823, + "grad_norm": 0.7003652453422546, + "learning_rate": 8.763922851484171e-06, + "loss": 0.7272, + "step": 8388 + }, + { + "epoch": 0.46172051296163796, + "grad_norm": 0.7470494508743286, + "learning_rate": 8.763637500587925e-06, + "loss": 0.8333, + "step": 8389 + }, + { + "epoch": 0.4617755517639936, + "grad_norm": 0.6974903345108032, + "learning_rate": 8.763352121405187e-06, + "loss": 0.834, + "step": 8390 + }, + { + "epoch": 0.4618305905663493, + "grad_norm": 0.8146659135818481, + "learning_rate": 8.7630667139381e-06, + "loss": 0.724, + "step": 8391 + }, + { + "epoch": 0.4618856293687049, + "grad_norm": 0.6614096164703369, + "learning_rate": 8.762781278188813e-06, + "loss": 0.6822, + "step": 8392 + }, + { + "epoch": 0.4619406681710606, + "grad_norm": 0.712944746017456, + "learning_rate": 8.762495814159469e-06, + "loss": 0.7864, + "step": 8393 + }, + { + "epoch": 0.46199570697341624, + "grad_norm": 0.7531552910804749, + "learning_rate": 8.762210321852213e-06, + "loss": 0.7494, + "step": 8394 + }, + { + "epoch": 0.46205074577577193, + "grad_norm": 0.8150199055671692, + "learning_rate": 8.761924801269191e-06, + "loss": 0.7869, + "step": 8395 + }, + { + "epoch": 0.46210578457812757, + "grad_norm": 0.8586462736129761, + "learning_rate": 8.76163925241255e-06, + "loss": 0.7647, + "step": 8396 + }, + { + "epoch": 0.46216082338048325, + "grad_norm": 0.7258061766624451, + "learning_rate": 8.761353675284434e-06, + "loss": 0.7672, + "step": 8397 + }, + { + "epoch": 0.4622158621828389, + "grad_norm": 0.6592851281166077, + "learning_rate": 8.761068069886992e-06, + "loss": 0.7488, + "step": 8398 + }, + { + "epoch": 0.4622709009851946, + "grad_norm": 0.7410836219787598, + "learning_rate": 8.760782436222368e-06, + "loss": 0.6669, + "step": 8399 + }, + { + "epoch": 0.4623259397875502, + "grad_norm": 0.7121642231941223, + "learning_rate": 8.76049677429271e-06, + "loss": 0.7005, + "step": 8400 + }, + { + "epoch": 0.4623809785899059, + "grad_norm": 0.7170663475990295, + "learning_rate": 8.760211084100166e-06, + "loss": 0.8154, + "step": 8401 + }, + { + "epoch": 0.46243601739226153, + "grad_norm": 0.6851769685745239, + "learning_rate": 8.759925365646882e-06, + "loss": 0.7948, + "step": 8402 + }, + { + "epoch": 0.4624910561946172, + "grad_norm": 0.7728533744812012, + "learning_rate": 8.759639618935006e-06, + "loss": 0.8263, + "step": 8403 + }, + { + "epoch": 0.46254609499697286, + "grad_norm": 0.7276784777641296, + "learning_rate": 8.759353843966682e-06, + "loss": 0.6992, + "step": 8404 + }, + { + "epoch": 0.46260113379932855, + "grad_norm": 0.7533649802207947, + "learning_rate": 8.759068040744063e-06, + "loss": 0.7744, + "step": 8405 + }, + { + "epoch": 0.4626561726016842, + "grad_norm": 0.6911979913711548, + "learning_rate": 8.758782209269294e-06, + "loss": 0.6977, + "step": 8406 + }, + { + "epoch": 0.46271121140403987, + "grad_norm": 0.6723766922950745, + "learning_rate": 8.758496349544526e-06, + "loss": 0.7286, + "step": 8407 + }, + { + "epoch": 0.4627662502063955, + "grad_norm": 0.7327921390533447, + "learning_rate": 8.758210461571903e-06, + "loss": 0.7708, + "step": 8408 + }, + { + "epoch": 0.4628212890087512, + "grad_norm": 0.7498626708984375, + "learning_rate": 8.757924545353578e-06, + "loss": 0.7476, + "step": 8409 + }, + { + "epoch": 0.4628763278111068, + "grad_norm": 0.8944914937019348, + "learning_rate": 8.757638600891696e-06, + "loss": 0.7814, + "step": 8410 + }, + { + "epoch": 0.4629313666134625, + "grad_norm": 0.7242841124534607, + "learning_rate": 8.757352628188411e-06, + "loss": 0.7564, + "step": 8411 + }, + { + "epoch": 0.46298640541581815, + "grad_norm": 0.6706324815750122, + "learning_rate": 8.757066627245866e-06, + "loss": 0.7792, + "step": 8412 + }, + { + "epoch": 0.46304144421817384, + "grad_norm": 0.8044155836105347, + "learning_rate": 8.756780598066218e-06, + "loss": 0.7873, + "step": 8413 + }, + { + "epoch": 0.46309648302052947, + "grad_norm": 0.9265295267105103, + "learning_rate": 8.75649454065161e-06, + "loss": 0.878, + "step": 8414 + }, + { + "epoch": 0.46315152182288516, + "grad_norm": 0.8162378668785095, + "learning_rate": 8.756208455004194e-06, + "loss": 0.8758, + "step": 8415 + }, + { + "epoch": 0.4632065606252408, + "grad_norm": 0.7081401348114014, + "learning_rate": 8.755922341126121e-06, + "loss": 0.8053, + "step": 8416 + }, + { + "epoch": 0.4632615994275965, + "grad_norm": 0.663885235786438, + "learning_rate": 8.755636199019544e-06, + "loss": 0.7456, + "step": 8417 + }, + { + "epoch": 0.4633166382299521, + "grad_norm": 0.6934974193572998, + "learning_rate": 8.755350028686608e-06, + "loss": 0.7316, + "step": 8418 + }, + { + "epoch": 0.46337167703230775, + "grad_norm": 0.7162168025970459, + "learning_rate": 8.755063830129467e-06, + "loss": 0.8566, + "step": 8419 + }, + { + "epoch": 0.46342671583466344, + "grad_norm": 0.7507640719413757, + "learning_rate": 8.75477760335027e-06, + "loss": 0.8141, + "step": 8420 + }, + { + "epoch": 0.46348175463701907, + "grad_norm": 0.6853382587432861, + "learning_rate": 8.754491348351172e-06, + "loss": 0.6995, + "step": 8421 + }, + { + "epoch": 0.46353679343937476, + "grad_norm": 0.6421381831169128, + "learning_rate": 8.75420506513432e-06, + "loss": 0.6344, + "step": 8422 + }, + { + "epoch": 0.4635918322417304, + "grad_norm": 0.8042624592781067, + "learning_rate": 8.753918753701868e-06, + "loss": 0.7506, + "step": 8423 + }, + { + "epoch": 0.4636468710440861, + "grad_norm": 0.7184088230133057, + "learning_rate": 8.753632414055969e-06, + "loss": 0.7997, + "step": 8424 + }, + { + "epoch": 0.4637019098464417, + "grad_norm": 0.749919593334198, + "learning_rate": 8.753346046198773e-06, + "loss": 0.8168, + "step": 8425 + }, + { + "epoch": 0.4637569486487974, + "grad_norm": 0.6583670973777771, + "learning_rate": 8.753059650132433e-06, + "loss": 0.6615, + "step": 8426 + }, + { + "epoch": 0.46381198745115304, + "grad_norm": 0.7560496926307678, + "learning_rate": 8.7527732258591e-06, + "loss": 0.7221, + "step": 8427 + }, + { + "epoch": 0.46386702625350873, + "grad_norm": 0.7031972408294678, + "learning_rate": 8.752486773380928e-06, + "loss": 0.8124, + "step": 8428 + }, + { + "epoch": 0.46392206505586436, + "grad_norm": 0.684124767780304, + "learning_rate": 8.752200292700072e-06, + "loss": 0.6862, + "step": 8429 + }, + { + "epoch": 0.46397710385822005, + "grad_norm": 0.8015589118003845, + "learning_rate": 8.751913783818682e-06, + "loss": 0.7863, + "step": 8430 + }, + { + "epoch": 0.4640321426605757, + "grad_norm": 0.6815705299377441, + "learning_rate": 8.751627246738912e-06, + "loss": 0.8116, + "step": 8431 + }, + { + "epoch": 0.4640871814629314, + "grad_norm": 0.7402058839797974, + "learning_rate": 8.751340681462914e-06, + "loss": 0.7341, + "step": 8432 + }, + { + "epoch": 0.464142220265287, + "grad_norm": 0.7484470009803772, + "learning_rate": 8.751054087992848e-06, + "loss": 0.8103, + "step": 8433 + }, + { + "epoch": 0.4641972590676427, + "grad_norm": 0.8148707151412964, + "learning_rate": 8.75076746633086e-06, + "loss": 0.8995, + "step": 8434 + }, + { + "epoch": 0.46425229786999833, + "grad_norm": 0.6403086185455322, + "learning_rate": 8.750480816479107e-06, + "loss": 0.6705, + "step": 8435 + }, + { + "epoch": 0.464307336672354, + "grad_norm": 0.7787690758705139, + "learning_rate": 8.750194138439748e-06, + "loss": 0.854, + "step": 8436 + }, + { + "epoch": 0.46436237547470965, + "grad_norm": 0.6975393891334534, + "learning_rate": 8.749907432214931e-06, + "loss": 0.7588, + "step": 8437 + }, + { + "epoch": 0.46441741427706534, + "grad_norm": 0.8002430200576782, + "learning_rate": 8.749620697806812e-06, + "loss": 0.8244, + "step": 8438 + }, + { + "epoch": 0.464472453079421, + "grad_norm": 0.8049100637435913, + "learning_rate": 8.74933393521755e-06, + "loss": 0.7686, + "step": 8439 + }, + { + "epoch": 0.46452749188177667, + "grad_norm": 0.6716971397399902, + "learning_rate": 8.749047144449298e-06, + "loss": 0.7823, + "step": 8440 + }, + { + "epoch": 0.4645825306841323, + "grad_norm": 0.7292011380195618, + "learning_rate": 8.748760325504212e-06, + "loss": 0.7643, + "step": 8441 + }, + { + "epoch": 0.464637569486488, + "grad_norm": 0.6823335886001587, + "learning_rate": 8.748473478384444e-06, + "loss": 0.7539, + "step": 8442 + }, + { + "epoch": 0.4646926082888436, + "grad_norm": 0.761730968952179, + "learning_rate": 8.748186603092155e-06, + "loss": 0.7279, + "step": 8443 + }, + { + "epoch": 0.4647476470911993, + "grad_norm": 0.694007933139801, + "learning_rate": 8.747899699629498e-06, + "loss": 0.7907, + "step": 8444 + }, + { + "epoch": 0.46480268589355495, + "grad_norm": 0.7638683319091797, + "learning_rate": 8.74761276799863e-06, + "loss": 0.7278, + "step": 8445 + }, + { + "epoch": 0.46485772469591063, + "grad_norm": 0.6281229853630066, + "learning_rate": 8.747325808201708e-06, + "loss": 0.6609, + "step": 8446 + }, + { + "epoch": 0.46491276349826627, + "grad_norm": 0.7273259162902832, + "learning_rate": 8.747038820240887e-06, + "loss": 0.7553, + "step": 8447 + }, + { + "epoch": 0.46496780230062196, + "grad_norm": 0.807482898235321, + "learning_rate": 8.746751804118326e-06, + "loss": 0.7783, + "step": 8448 + }, + { + "epoch": 0.4650228411029776, + "grad_norm": 0.7088230848312378, + "learning_rate": 8.746464759836182e-06, + "loss": 0.762, + "step": 8449 + }, + { + "epoch": 0.4650778799053333, + "grad_norm": 0.7039850354194641, + "learning_rate": 8.746177687396612e-06, + "loss": 0.7811, + "step": 8450 + }, + { + "epoch": 0.4651329187076889, + "grad_norm": 0.7154161334037781, + "learning_rate": 8.745890586801773e-06, + "loss": 0.76, + "step": 8451 + }, + { + "epoch": 0.4651879575100446, + "grad_norm": 0.6738846302032471, + "learning_rate": 8.745603458053822e-06, + "loss": 0.7119, + "step": 8452 + }, + { + "epoch": 0.46524299631240024, + "grad_norm": 0.6615753173828125, + "learning_rate": 8.745316301154919e-06, + "loss": 0.8061, + "step": 8453 + }, + { + "epoch": 0.4652980351147559, + "grad_norm": 0.7285076379776001, + "learning_rate": 8.74502911610722e-06, + "loss": 0.7522, + "step": 8454 + }, + { + "epoch": 0.46535307391711156, + "grad_norm": 0.7100732922554016, + "learning_rate": 8.744741902912886e-06, + "loss": 0.7665, + "step": 8455 + }, + { + "epoch": 0.46540811271946725, + "grad_norm": 0.6564487814903259, + "learning_rate": 8.744454661574074e-06, + "loss": 0.7352, + "step": 8456 + }, + { + "epoch": 0.4654631515218229, + "grad_norm": 0.689549446105957, + "learning_rate": 8.744167392092944e-06, + "loss": 0.7011, + "step": 8457 + }, + { + "epoch": 0.46551819032417857, + "grad_norm": 0.6660958528518677, + "learning_rate": 8.743880094471651e-06, + "loss": 0.7074, + "step": 8458 + }, + { + "epoch": 0.4655732291265342, + "grad_norm": 0.7470804452896118, + "learning_rate": 8.743592768712361e-06, + "loss": 0.6684, + "step": 8459 + }, + { + "epoch": 0.4656282679288899, + "grad_norm": 0.8058002591133118, + "learning_rate": 8.743305414817227e-06, + "loss": 0.7945, + "step": 8460 + }, + { + "epoch": 0.4656833067312455, + "grad_norm": 0.7756261825561523, + "learning_rate": 8.743018032788413e-06, + "loss": 0.8442, + "step": 8461 + }, + { + "epoch": 0.46573834553360116, + "grad_norm": 0.9267478585243225, + "learning_rate": 8.742730622628077e-06, + "loss": 0.8721, + "step": 8462 + }, + { + "epoch": 0.46579338433595685, + "grad_norm": 0.8684219121932983, + "learning_rate": 8.74244318433838e-06, + "loss": 0.7833, + "step": 8463 + }, + { + "epoch": 0.4658484231383125, + "grad_norm": 0.7060475945472717, + "learning_rate": 8.742155717921481e-06, + "loss": 0.7724, + "step": 8464 + }, + { + "epoch": 0.4659034619406682, + "grad_norm": 0.7316318154335022, + "learning_rate": 8.741868223379543e-06, + "loss": 0.7489, + "step": 8465 + }, + { + "epoch": 0.4659585007430238, + "grad_norm": 0.8131282925605774, + "learning_rate": 8.741580700714724e-06, + "loss": 0.7453, + "step": 8466 + }, + { + "epoch": 0.4660135395453795, + "grad_norm": 0.6985850930213928, + "learning_rate": 8.741293149929187e-06, + "loss": 0.7083, + "step": 8467 + }, + { + "epoch": 0.46606857834773513, + "grad_norm": 0.7512301206588745, + "learning_rate": 8.74100557102509e-06, + "loss": 0.7343, + "step": 8468 + }, + { + "epoch": 0.4661236171500908, + "grad_norm": 0.7547290921211243, + "learning_rate": 8.740717964004596e-06, + "loss": 0.8358, + "step": 8469 + }, + { + "epoch": 0.46617865595244645, + "grad_norm": 0.9091271758079529, + "learning_rate": 8.740430328869868e-06, + "loss": 0.762, + "step": 8470 + }, + { + "epoch": 0.46623369475480214, + "grad_norm": 0.6960130333900452, + "learning_rate": 8.740142665623069e-06, + "loss": 0.7317, + "step": 8471 + }, + { + "epoch": 0.4662887335571578, + "grad_norm": 0.684309184551239, + "learning_rate": 8.739854974266357e-06, + "loss": 0.7653, + "step": 8472 + }, + { + "epoch": 0.46634377235951346, + "grad_norm": 0.7669411301612854, + "learning_rate": 8.739567254801898e-06, + "loss": 0.7152, + "step": 8473 + }, + { + "epoch": 0.4663988111618691, + "grad_norm": 0.7072784900665283, + "learning_rate": 8.73927950723185e-06, + "loss": 0.7508, + "step": 8474 + }, + { + "epoch": 0.4664538499642248, + "grad_norm": 0.7249277234077454, + "learning_rate": 8.73899173155838e-06, + "loss": 0.7469, + "step": 8475 + }, + { + "epoch": 0.4665088887665804, + "grad_norm": 0.7664750218391418, + "learning_rate": 8.738703927783647e-06, + "loss": 0.8692, + "step": 8476 + }, + { + "epoch": 0.4665639275689361, + "grad_norm": 0.7579765319824219, + "learning_rate": 8.738416095909818e-06, + "loss": 0.8283, + "step": 8477 + }, + { + "epoch": 0.46661896637129174, + "grad_norm": 0.7066456079483032, + "learning_rate": 8.738128235939054e-06, + "loss": 0.7125, + "step": 8478 + }, + { + "epoch": 0.46667400517364743, + "grad_norm": 0.766106367111206, + "learning_rate": 8.737840347873518e-06, + "loss": 0.7683, + "step": 8479 + }, + { + "epoch": 0.46672904397600307, + "grad_norm": 0.7599226236343384, + "learning_rate": 8.737552431715374e-06, + "loss": 0.8375, + "step": 8480 + }, + { + "epoch": 0.46678408277835876, + "grad_norm": 0.6955341100692749, + "learning_rate": 8.737264487466789e-06, + "loss": 0.7012, + "step": 8481 + }, + { + "epoch": 0.4668391215807144, + "grad_norm": 0.6096246242523193, + "learning_rate": 8.736976515129923e-06, + "loss": 0.6126, + "step": 8482 + }, + { + "epoch": 0.4668941603830701, + "grad_norm": 0.7469536066055298, + "learning_rate": 8.73668851470694e-06, + "loss": 0.7675, + "step": 8483 + }, + { + "epoch": 0.4669491991854257, + "grad_norm": 0.8018775582313538, + "learning_rate": 8.73640048620001e-06, + "loss": 0.7372, + "step": 8484 + }, + { + "epoch": 0.4670042379877814, + "grad_norm": 0.7446827292442322, + "learning_rate": 8.736112429611293e-06, + "loss": 0.7277, + "step": 8485 + }, + { + "epoch": 0.46705927679013703, + "grad_norm": 0.6292026042938232, + "learning_rate": 8.735824344942954e-06, + "loss": 0.6172, + "step": 8486 + }, + { + "epoch": 0.4671143155924927, + "grad_norm": 0.7207980751991272, + "learning_rate": 8.735536232197159e-06, + "loss": 0.8363, + "step": 8487 + }, + { + "epoch": 0.46716935439484836, + "grad_norm": 0.8585891127586365, + "learning_rate": 8.735248091376073e-06, + "loss": 0.8006, + "step": 8488 + }, + { + "epoch": 0.46722439319720405, + "grad_norm": 0.8149702548980713, + "learning_rate": 8.734959922481863e-06, + "loss": 0.7869, + "step": 8489 + }, + { + "epoch": 0.4672794319995597, + "grad_norm": 0.7113268971443176, + "learning_rate": 8.734671725516695e-06, + "loss": 0.7774, + "step": 8490 + }, + { + "epoch": 0.46733447080191537, + "grad_norm": 0.6940683722496033, + "learning_rate": 8.734383500482733e-06, + "loss": 0.7157, + "step": 8491 + }, + { + "epoch": 0.467389509604271, + "grad_norm": 0.7823536396026611, + "learning_rate": 8.734095247382145e-06, + "loss": 0.8161, + "step": 8492 + }, + { + "epoch": 0.4674445484066267, + "grad_norm": 0.7094922065734863, + "learning_rate": 8.733806966217096e-06, + "loss": 0.7593, + "step": 8493 + }, + { + "epoch": 0.4674995872089823, + "grad_norm": 0.656432569026947, + "learning_rate": 8.733518656989753e-06, + "loss": 0.7853, + "step": 8494 + }, + { + "epoch": 0.467554626011338, + "grad_norm": 0.6715715527534485, + "learning_rate": 8.733230319702284e-06, + "loss": 0.839, + "step": 8495 + }, + { + "epoch": 0.46760966481369365, + "grad_norm": 0.7496705055236816, + "learning_rate": 8.732941954356854e-06, + "loss": 0.8231, + "step": 8496 + }, + { + "epoch": 0.46766470361604934, + "grad_norm": 0.7728047370910645, + "learning_rate": 8.732653560955635e-06, + "loss": 0.7852, + "step": 8497 + }, + { + "epoch": 0.46771974241840497, + "grad_norm": 1.5637458562850952, + "learning_rate": 8.732365139500787e-06, + "loss": 0.7749, + "step": 8498 + }, + { + "epoch": 0.46777478122076066, + "grad_norm": 0.6603190898895264, + "learning_rate": 8.732076689994484e-06, + "loss": 0.6628, + "step": 8499 + }, + { + "epoch": 0.4678298200231163, + "grad_norm": 0.7170974612236023, + "learning_rate": 8.73178821243889e-06, + "loss": 0.7855, + "step": 8500 + }, + { + "epoch": 0.467884858825472, + "grad_norm": 0.7220103740692139, + "learning_rate": 8.731499706836175e-06, + "loss": 0.7035, + "step": 8501 + }, + { + "epoch": 0.4679398976278276, + "grad_norm": 0.6940942406654358, + "learning_rate": 8.731211173188507e-06, + "loss": 0.7857, + "step": 8502 + }, + { + "epoch": 0.4679949364301833, + "grad_norm": 2.441596508026123, + "learning_rate": 8.730922611498057e-06, + "loss": 0.695, + "step": 8503 + }, + { + "epoch": 0.46804997523253894, + "grad_norm": 0.7654910087585449, + "learning_rate": 8.730634021766989e-06, + "loss": 0.788, + "step": 8504 + }, + { + "epoch": 0.4681050140348946, + "grad_norm": 0.791824996471405, + "learning_rate": 8.730345403997475e-06, + "loss": 0.7899, + "step": 8505 + }, + { + "epoch": 0.46816005283725026, + "grad_norm": 0.6863934993743896, + "learning_rate": 8.730056758191682e-06, + "loss": 0.7402, + "step": 8506 + }, + { + "epoch": 0.4682150916396059, + "grad_norm": 0.7920359373092651, + "learning_rate": 8.729768084351783e-06, + "loss": 0.7835, + "step": 8507 + }, + { + "epoch": 0.4682701304419616, + "grad_norm": 0.7077129483222961, + "learning_rate": 8.729479382479944e-06, + "loss": 0.7761, + "step": 8508 + }, + { + "epoch": 0.4683251692443172, + "grad_norm": 0.6870049238204956, + "learning_rate": 8.729190652578337e-06, + "loss": 0.8169, + "step": 8509 + }, + { + "epoch": 0.4683802080466729, + "grad_norm": 0.6802713871002197, + "learning_rate": 8.728901894649131e-06, + "loss": 0.7914, + "step": 8510 + }, + { + "epoch": 0.46843524684902854, + "grad_norm": 0.6645112633705139, + "learning_rate": 8.728613108694497e-06, + "loss": 0.7543, + "step": 8511 + }, + { + "epoch": 0.46849028565138423, + "grad_norm": 0.708292543888092, + "learning_rate": 8.728324294716604e-06, + "loss": 0.7015, + "step": 8512 + }, + { + "epoch": 0.46854532445373986, + "grad_norm": 0.7444465160369873, + "learning_rate": 8.728035452717625e-06, + "loss": 0.7999, + "step": 8513 + }, + { + "epoch": 0.46860036325609555, + "grad_norm": 0.7028616666793823, + "learning_rate": 8.727746582699728e-06, + "loss": 0.8094, + "step": 8514 + }, + { + "epoch": 0.4686554020584512, + "grad_norm": 0.7063208222389221, + "learning_rate": 8.727457684665088e-06, + "loss": 0.8028, + "step": 8515 + }, + { + "epoch": 0.4687104408608069, + "grad_norm": 0.8455138802528381, + "learning_rate": 8.727168758615871e-06, + "loss": 0.7691, + "step": 8516 + }, + { + "epoch": 0.4687654796631625, + "grad_norm": 1.0325778722763062, + "learning_rate": 8.726879804554252e-06, + "loss": 0.7042, + "step": 8517 + }, + { + "epoch": 0.4688205184655182, + "grad_norm": 0.7352754473686218, + "learning_rate": 8.726590822482402e-06, + "loss": 0.8467, + "step": 8518 + }, + { + "epoch": 0.46887555726787383, + "grad_norm": 0.7247193455696106, + "learning_rate": 8.726301812402494e-06, + "loss": 0.8034, + "step": 8519 + }, + { + "epoch": 0.4689305960702295, + "grad_norm": 0.6876820921897888, + "learning_rate": 8.726012774316699e-06, + "loss": 0.7308, + "step": 8520 + }, + { + "epoch": 0.46898563487258516, + "grad_norm": 0.6987231969833374, + "learning_rate": 8.725723708227188e-06, + "loss": 0.7655, + "step": 8521 + }, + { + "epoch": 0.46904067367494084, + "grad_norm": 0.7471843361854553, + "learning_rate": 8.725434614136135e-06, + "loss": 0.7271, + "step": 8522 + }, + { + "epoch": 0.4690957124772965, + "grad_norm": 0.7564642429351807, + "learning_rate": 8.725145492045715e-06, + "loss": 0.7335, + "step": 8523 + }, + { + "epoch": 0.46915075127965217, + "grad_norm": 0.7488992214202881, + "learning_rate": 8.724856341958095e-06, + "loss": 0.8815, + "step": 8524 + }, + { + "epoch": 0.4692057900820078, + "grad_norm": 0.6776759028434753, + "learning_rate": 8.724567163875455e-06, + "loss": 0.7452, + "step": 8525 + }, + { + "epoch": 0.4692608288843635, + "grad_norm": 0.6905981302261353, + "learning_rate": 8.724277957799963e-06, + "loss": 0.6815, + "step": 8526 + }, + { + "epoch": 0.4693158676867191, + "grad_norm": 0.7392297983169556, + "learning_rate": 8.723988723733795e-06, + "loss": 0.7546, + "step": 8527 + }, + { + "epoch": 0.4693709064890748, + "grad_norm": 0.7479110360145569, + "learning_rate": 8.723699461679128e-06, + "loss": 0.7455, + "step": 8528 + }, + { + "epoch": 0.46942594529143045, + "grad_norm": 0.7231360673904419, + "learning_rate": 8.723410171638129e-06, + "loss": 0.7611, + "step": 8529 + }, + { + "epoch": 0.46948098409378614, + "grad_norm": 0.7493714690208435, + "learning_rate": 8.723120853612976e-06, + "loss": 0.6997, + "step": 8530 + }, + { + "epoch": 0.46953602289614177, + "grad_norm": 0.8056793808937073, + "learning_rate": 8.722831507605844e-06, + "loss": 0.7431, + "step": 8531 + }, + { + "epoch": 0.46959106169849746, + "grad_norm": 0.7528547048568726, + "learning_rate": 8.722542133618907e-06, + "loss": 0.8798, + "step": 8532 + }, + { + "epoch": 0.4696461005008531, + "grad_norm": 0.6964863538742065, + "learning_rate": 8.72225273165434e-06, + "loss": 0.8462, + "step": 8533 + }, + { + "epoch": 0.4697011393032088, + "grad_norm": 0.7354302406311035, + "learning_rate": 8.721963301714318e-06, + "loss": 0.7882, + "step": 8534 + }, + { + "epoch": 0.4697561781055644, + "grad_norm": 0.7365205883979797, + "learning_rate": 8.721673843801014e-06, + "loss": 0.7483, + "step": 8535 + }, + { + "epoch": 0.4698112169079201, + "grad_norm": 0.7485378384590149, + "learning_rate": 8.72138435791661e-06, + "loss": 0.8539, + "step": 8536 + }, + { + "epoch": 0.46986625571027574, + "grad_norm": 0.7674353718757629, + "learning_rate": 8.721094844063274e-06, + "loss": 0.834, + "step": 8537 + }, + { + "epoch": 0.4699212945126314, + "grad_norm": 0.7054184079170227, + "learning_rate": 8.720805302243185e-06, + "loss": 0.7938, + "step": 8538 + }, + { + "epoch": 0.46997633331498706, + "grad_norm": 0.7414574027061462, + "learning_rate": 8.72051573245852e-06, + "loss": 0.7932, + "step": 8539 + }, + { + "epoch": 0.47003137211734275, + "grad_norm": 0.6734428405761719, + "learning_rate": 8.720226134711455e-06, + "loss": 0.8775, + "step": 8540 + }, + { + "epoch": 0.4700864109196984, + "grad_norm": 0.6588559150695801, + "learning_rate": 8.719936509004166e-06, + "loss": 0.6985, + "step": 8541 + }, + { + "epoch": 0.4701414497220541, + "grad_norm": 0.6557223200798035, + "learning_rate": 8.71964685533883e-06, + "loss": 0.7243, + "step": 8542 + }, + { + "epoch": 0.4701964885244097, + "grad_norm": 0.7876269221305847, + "learning_rate": 8.719357173717624e-06, + "loss": 0.8075, + "step": 8543 + }, + { + "epoch": 0.4702515273267654, + "grad_norm": 0.8346554040908813, + "learning_rate": 8.719067464142726e-06, + "loss": 0.8427, + "step": 8544 + }, + { + "epoch": 0.47030656612912103, + "grad_norm": 0.7190483808517456, + "learning_rate": 8.718777726616311e-06, + "loss": 0.7689, + "step": 8545 + }, + { + "epoch": 0.4703616049314767, + "grad_norm": 1.303118109703064, + "learning_rate": 8.718487961140558e-06, + "loss": 0.7537, + "step": 8546 + }, + { + "epoch": 0.47041664373383235, + "grad_norm": 0.7733024954795837, + "learning_rate": 8.718198167717647e-06, + "loss": 0.747, + "step": 8547 + }, + { + "epoch": 0.470471682536188, + "grad_norm": 0.6692484617233276, + "learning_rate": 8.717908346349751e-06, + "loss": 0.725, + "step": 8548 + }, + { + "epoch": 0.4705267213385437, + "grad_norm": 0.9639461636543274, + "learning_rate": 8.717618497039054e-06, + "loss": 0.8642, + "step": 8549 + }, + { + "epoch": 0.4705817601408993, + "grad_norm": 0.7584646344184875, + "learning_rate": 8.717328619787728e-06, + "loss": 0.8174, + "step": 8550 + }, + { + "epoch": 0.470636798943255, + "grad_norm": 0.7051709890365601, + "learning_rate": 8.717038714597957e-06, + "loss": 0.7962, + "step": 8551 + }, + { + "epoch": 0.47069183774561063, + "grad_norm": 0.738913893699646, + "learning_rate": 8.716748781471918e-06, + "loss": 0.7367, + "step": 8552 + }, + { + "epoch": 0.4707468765479663, + "grad_norm": 0.7027214169502258, + "learning_rate": 8.716458820411791e-06, + "loss": 0.7613, + "step": 8553 + }, + { + "epoch": 0.47080191535032195, + "grad_norm": 0.6701993346214294, + "learning_rate": 8.716168831419754e-06, + "loss": 0.638, + "step": 8554 + }, + { + "epoch": 0.47085695415267764, + "grad_norm": 0.7422072887420654, + "learning_rate": 8.715878814497984e-06, + "loss": 0.8338, + "step": 8555 + }, + { + "epoch": 0.4709119929550333, + "grad_norm": 0.985992968082428, + "learning_rate": 8.715588769648667e-06, + "loss": 0.7765, + "step": 8556 + }, + { + "epoch": 0.47096703175738897, + "grad_norm": 0.6937553882598877, + "learning_rate": 8.715298696873978e-06, + "loss": 0.7306, + "step": 8557 + }, + { + "epoch": 0.4710220705597446, + "grad_norm": 1.1683214902877808, + "learning_rate": 8.715008596176099e-06, + "loss": 0.7782, + "step": 8558 + }, + { + "epoch": 0.4710771093621003, + "grad_norm": 0.7493681907653809, + "learning_rate": 8.714718467557209e-06, + "loss": 0.9166, + "step": 8559 + }, + { + "epoch": 0.4711321481644559, + "grad_norm": 0.7562084794044495, + "learning_rate": 8.71442831101949e-06, + "loss": 0.7999, + "step": 8560 + }, + { + "epoch": 0.4711871869668116, + "grad_norm": 0.7950266003608704, + "learning_rate": 8.71413812656512e-06, + "loss": 0.8094, + "step": 8561 + }, + { + "epoch": 0.47124222576916724, + "grad_norm": 1.1411044597625732, + "learning_rate": 8.713847914196287e-06, + "loss": 0.7631, + "step": 8562 + }, + { + "epoch": 0.47129726457152293, + "grad_norm": 0.7270122170448303, + "learning_rate": 8.713557673915162e-06, + "loss": 0.7529, + "step": 8563 + }, + { + "epoch": 0.47135230337387857, + "grad_norm": 0.8138573169708252, + "learning_rate": 8.713267405723935e-06, + "loss": 0.8215, + "step": 8564 + }, + { + "epoch": 0.47140734217623426, + "grad_norm": 0.732982873916626, + "learning_rate": 8.712977109624783e-06, + "loss": 0.7099, + "step": 8565 + }, + { + "epoch": 0.4714623809785899, + "grad_norm": 0.7307591438293457, + "learning_rate": 8.712686785619888e-06, + "loss": 0.7035, + "step": 8566 + }, + { + "epoch": 0.4715174197809456, + "grad_norm": 0.8684857487678528, + "learning_rate": 8.712396433711434e-06, + "loss": 0.8605, + "step": 8567 + }, + { + "epoch": 0.4715724585833012, + "grad_norm": 0.7490718364715576, + "learning_rate": 8.712106053901603e-06, + "loss": 0.7439, + "step": 8568 + }, + { + "epoch": 0.4716274973856569, + "grad_norm": 0.8572973012924194, + "learning_rate": 8.711815646192575e-06, + "loss": 0.8187, + "step": 8569 + }, + { + "epoch": 0.47168253618801254, + "grad_norm": 0.785270094871521, + "learning_rate": 8.711525210586536e-06, + "loss": 0.7812, + "step": 8570 + }, + { + "epoch": 0.4717375749903682, + "grad_norm": 0.683651864528656, + "learning_rate": 8.711234747085663e-06, + "loss": 0.7682, + "step": 8571 + }, + { + "epoch": 0.47179261379272386, + "grad_norm": 0.7990714907646179, + "learning_rate": 8.710944255692147e-06, + "loss": 0.8114, + "step": 8572 + }, + { + "epoch": 0.47184765259507955, + "grad_norm": 0.9354856610298157, + "learning_rate": 8.710653736408165e-06, + "loss": 0.7353, + "step": 8573 + }, + { + "epoch": 0.4719026913974352, + "grad_norm": 0.8309356570243835, + "learning_rate": 8.710363189235904e-06, + "loss": 0.8635, + "step": 8574 + }, + { + "epoch": 0.47195773019979087, + "grad_norm": 0.7018463015556335, + "learning_rate": 8.710072614177547e-06, + "loss": 0.6372, + "step": 8575 + }, + { + "epoch": 0.4720127690021465, + "grad_norm": 0.7626469135284424, + "learning_rate": 8.709782011235277e-06, + "loss": 0.7684, + "step": 8576 + }, + { + "epoch": 0.4720678078045022, + "grad_norm": 0.6995826959609985, + "learning_rate": 8.70949138041128e-06, + "loss": 0.7301, + "step": 8577 + }, + { + "epoch": 0.4721228466068578, + "grad_norm": 0.719307541847229, + "learning_rate": 8.709200721707736e-06, + "loss": 0.7437, + "step": 8578 + }, + { + "epoch": 0.4721778854092135, + "grad_norm": 0.7355539202690125, + "learning_rate": 8.708910035126832e-06, + "loss": 0.7926, + "step": 8579 + }, + { + "epoch": 0.47223292421156915, + "grad_norm": 0.7262680530548096, + "learning_rate": 8.708619320670755e-06, + "loss": 0.7641, + "step": 8580 + }, + { + "epoch": 0.47228796301392484, + "grad_norm": 0.844745934009552, + "learning_rate": 8.708328578341687e-06, + "loss": 0.7228, + "step": 8581 + }, + { + "epoch": 0.47234300181628047, + "grad_norm": 0.8169287443161011, + "learning_rate": 8.708037808141814e-06, + "loss": 0.7076, + "step": 8582 + }, + { + "epoch": 0.47239804061863616, + "grad_norm": 0.7342209219932556, + "learning_rate": 8.707747010073322e-06, + "loss": 0.7997, + "step": 8583 + }, + { + "epoch": 0.4724530794209918, + "grad_norm": 0.7138200402259827, + "learning_rate": 8.707456184138394e-06, + "loss": 0.7796, + "step": 8584 + }, + { + "epoch": 0.4725081182233475, + "grad_norm": 0.7168061137199402, + "learning_rate": 8.70716533033922e-06, + "loss": 0.6876, + "step": 8585 + }, + { + "epoch": 0.4725631570257031, + "grad_norm": 0.7256397604942322, + "learning_rate": 8.706874448677982e-06, + "loss": 0.8296, + "step": 8586 + }, + { + "epoch": 0.4726181958280588, + "grad_norm": 0.8232730627059937, + "learning_rate": 8.70658353915687e-06, + "loss": 0.8001, + "step": 8587 + }, + { + "epoch": 0.47267323463041444, + "grad_norm": 0.7110162973403931, + "learning_rate": 8.706292601778067e-06, + "loss": 0.7061, + "step": 8588 + }, + { + "epoch": 0.47272827343277013, + "grad_norm": 0.9466721415519714, + "learning_rate": 8.706001636543761e-06, + "loss": 0.8713, + "step": 8589 + }, + { + "epoch": 0.47278331223512576, + "grad_norm": 0.7017776370048523, + "learning_rate": 8.705710643456138e-06, + "loss": 0.759, + "step": 8590 + }, + { + "epoch": 0.4728383510374814, + "grad_norm": 0.7140772938728333, + "learning_rate": 8.705419622517386e-06, + "loss": 0.6962, + "step": 8591 + }, + { + "epoch": 0.4728933898398371, + "grad_norm": 1.1076452732086182, + "learning_rate": 8.705128573729694e-06, + "loss": 0.8264, + "step": 8592 + }, + { + "epoch": 0.4729484286421927, + "grad_norm": 0.7308200597763062, + "learning_rate": 8.704837497095247e-06, + "loss": 0.6243, + "step": 8593 + }, + { + "epoch": 0.4730034674445484, + "grad_norm": 0.9445781111717224, + "learning_rate": 8.704546392616231e-06, + "loss": 0.6676, + "step": 8594 + }, + { + "epoch": 0.47305850624690404, + "grad_norm": 0.6527873277664185, + "learning_rate": 8.704255260294837e-06, + "loss": 0.6979, + "step": 8595 + }, + { + "epoch": 0.47311354504925973, + "grad_norm": 0.6732963919639587, + "learning_rate": 8.703964100133252e-06, + "loss": 0.7724, + "step": 8596 + }, + { + "epoch": 0.47316858385161537, + "grad_norm": 0.7661726474761963, + "learning_rate": 8.703672912133665e-06, + "loss": 0.7988, + "step": 8597 + }, + { + "epoch": 0.47322362265397105, + "grad_norm": 0.7006877660751343, + "learning_rate": 8.703381696298262e-06, + "loss": 0.6765, + "step": 8598 + }, + { + "epoch": 0.4732786614563267, + "grad_norm": 0.7195086479187012, + "learning_rate": 8.703090452629236e-06, + "loss": 0.6676, + "step": 8599 + }, + { + "epoch": 0.4733337002586824, + "grad_norm": 0.6692042350769043, + "learning_rate": 8.702799181128771e-06, + "loss": 0.7882, + "step": 8600 + }, + { + "epoch": 0.473388739061038, + "grad_norm": 0.7736524343490601, + "learning_rate": 8.70250788179906e-06, + "loss": 0.7977, + "step": 8601 + }, + { + "epoch": 0.4734437778633937, + "grad_norm": 0.8821607828140259, + "learning_rate": 8.70221655464229e-06, + "loss": 0.7465, + "step": 8602 + }, + { + "epoch": 0.47349881666574933, + "grad_norm": 0.7565156817436218, + "learning_rate": 8.701925199660652e-06, + "loss": 0.831, + "step": 8603 + }, + { + "epoch": 0.473553855468105, + "grad_norm": 0.8542304039001465, + "learning_rate": 8.701633816856335e-06, + "loss": 0.7538, + "step": 8604 + }, + { + "epoch": 0.47360889427046066, + "grad_norm": 0.6891050338745117, + "learning_rate": 8.701342406231529e-06, + "loss": 0.7687, + "step": 8605 + }, + { + "epoch": 0.47366393307281635, + "grad_norm": 0.8570719361305237, + "learning_rate": 8.701050967788424e-06, + "loss": 0.7236, + "step": 8606 + }, + { + "epoch": 0.473718971875172, + "grad_norm": 0.7921456098556519, + "learning_rate": 8.700759501529212e-06, + "loss": 0.8214, + "step": 8607 + }, + { + "epoch": 0.47377401067752767, + "grad_norm": 0.7584527730941772, + "learning_rate": 8.70046800745608e-06, + "loss": 0.8204, + "step": 8608 + }, + { + "epoch": 0.4738290494798833, + "grad_norm": 0.8033978343009949, + "learning_rate": 8.700176485571222e-06, + "loss": 0.8278, + "step": 8609 + }, + { + "epoch": 0.473884088282239, + "grad_norm": 0.9950750470161438, + "learning_rate": 8.699884935876828e-06, + "loss": 0.8181, + "step": 8610 + }, + { + "epoch": 0.4739391270845946, + "grad_norm": 0.7213684916496277, + "learning_rate": 8.69959335837509e-06, + "loss": 0.7099, + "step": 8611 + }, + { + "epoch": 0.4739941658869503, + "grad_norm": 0.7847200632095337, + "learning_rate": 8.699301753068199e-06, + "loss": 0.8272, + "step": 8612 + }, + { + "epoch": 0.47404920468930595, + "grad_norm": 0.7075058221817017, + "learning_rate": 8.699010119958344e-06, + "loss": 0.7127, + "step": 8613 + }, + { + "epoch": 0.47410424349166164, + "grad_norm": 0.682741641998291, + "learning_rate": 8.69871845904772e-06, + "loss": 0.8446, + "step": 8614 + }, + { + "epoch": 0.47415928229401727, + "grad_norm": 0.7120605111122131, + "learning_rate": 8.69842677033852e-06, + "loss": 0.7776, + "step": 8615 + }, + { + "epoch": 0.47421432109637296, + "grad_norm": 0.822405219078064, + "learning_rate": 8.698135053832933e-06, + "loss": 0.8018, + "step": 8616 + }, + { + "epoch": 0.4742693598987286, + "grad_norm": 0.6815186738967896, + "learning_rate": 8.697843309533152e-06, + "loss": 0.7413, + "step": 8617 + }, + { + "epoch": 0.4743243987010843, + "grad_norm": 0.7587849497795105, + "learning_rate": 8.69755153744137e-06, + "loss": 0.7809, + "step": 8618 + }, + { + "epoch": 0.4743794375034399, + "grad_norm": 0.7092488408088684, + "learning_rate": 8.697259737559782e-06, + "loss": 0.7921, + "step": 8619 + }, + { + "epoch": 0.4744344763057956, + "grad_norm": 0.7396836280822754, + "learning_rate": 8.69696790989058e-06, + "loss": 0.7946, + "step": 8620 + }, + { + "epoch": 0.47448951510815124, + "grad_norm": 0.6760729551315308, + "learning_rate": 8.696676054435955e-06, + "loss": 0.7389, + "step": 8621 + }, + { + "epoch": 0.4745445539105069, + "grad_norm": 1.1640692949295044, + "learning_rate": 8.696384171198105e-06, + "loss": 0.8291, + "step": 8622 + }, + { + "epoch": 0.47459959271286256, + "grad_norm": 0.7415158152580261, + "learning_rate": 8.696092260179219e-06, + "loss": 0.7534, + "step": 8623 + }, + { + "epoch": 0.47465463151521825, + "grad_norm": 0.7730052471160889, + "learning_rate": 8.695800321381492e-06, + "loss": 0.8447, + "step": 8624 + }, + { + "epoch": 0.4747096703175739, + "grad_norm": 0.811522364616394, + "learning_rate": 8.695508354807121e-06, + "loss": 0.7466, + "step": 8625 + }, + { + "epoch": 0.4747647091199296, + "grad_norm": 0.7908332347869873, + "learning_rate": 8.695216360458298e-06, + "loss": 0.7769, + "step": 8626 + }, + { + "epoch": 0.4748197479222852, + "grad_norm": 0.744971752166748, + "learning_rate": 8.694924338337217e-06, + "loss": 0.7651, + "step": 8627 + }, + { + "epoch": 0.4748747867246409, + "grad_norm": 0.705565869808197, + "learning_rate": 8.694632288446075e-06, + "loss": 0.8258, + "step": 8628 + }, + { + "epoch": 0.47492982552699653, + "grad_norm": 0.8199328780174255, + "learning_rate": 8.694340210787065e-06, + "loss": 0.733, + "step": 8629 + }, + { + "epoch": 0.4749848643293522, + "grad_norm": 0.6965511441230774, + "learning_rate": 8.694048105362382e-06, + "loss": 0.7548, + "step": 8630 + }, + { + "epoch": 0.47503990313170785, + "grad_norm": 0.7943055629730225, + "learning_rate": 8.693755972174225e-06, + "loss": 0.7518, + "step": 8631 + }, + { + "epoch": 0.47509494193406354, + "grad_norm": 0.6277437806129456, + "learning_rate": 8.693463811224785e-06, + "loss": 0.6941, + "step": 8632 + }, + { + "epoch": 0.4751499807364192, + "grad_norm": 1.0745574235916138, + "learning_rate": 8.693171622516259e-06, + "loss": 0.8056, + "step": 8633 + }, + { + "epoch": 0.4752050195387748, + "grad_norm": 0.7005153894424438, + "learning_rate": 8.692879406050844e-06, + "loss": 0.757, + "step": 8634 + }, + { + "epoch": 0.4752600583411305, + "grad_norm": 0.6971127986907959, + "learning_rate": 8.692587161830737e-06, + "loss": 0.7509, + "step": 8635 + }, + { + "epoch": 0.47531509714348613, + "grad_norm": 0.7583497762680054, + "learning_rate": 8.692294889858133e-06, + "loss": 0.7895, + "step": 8636 + }, + { + "epoch": 0.4753701359458418, + "grad_norm": 0.719932496547699, + "learning_rate": 8.692002590135228e-06, + "loss": 0.762, + "step": 8637 + }, + { + "epoch": 0.47542517474819745, + "grad_norm": 0.7041804790496826, + "learning_rate": 8.691710262664222e-06, + "loss": 0.7101, + "step": 8638 + }, + { + "epoch": 0.47548021355055314, + "grad_norm": 0.7395016551017761, + "learning_rate": 8.691417907447309e-06, + "loss": 0.723, + "step": 8639 + }, + { + "epoch": 0.4755352523529088, + "grad_norm": 0.6605637073516846, + "learning_rate": 8.691125524486686e-06, + "loss": 0.644, + "step": 8640 + }, + { + "epoch": 0.47559029115526447, + "grad_norm": 0.694732129573822, + "learning_rate": 8.690833113784552e-06, + "loss": 0.7162, + "step": 8641 + }, + { + "epoch": 0.4756453299576201, + "grad_norm": 0.7622451186180115, + "learning_rate": 8.690540675343105e-06, + "loss": 0.6995, + "step": 8642 + }, + { + "epoch": 0.4757003687599758, + "grad_norm": 0.6961628794670105, + "learning_rate": 8.69024820916454e-06, + "loss": 0.7955, + "step": 8643 + }, + { + "epoch": 0.4757554075623314, + "grad_norm": 0.706266462802887, + "learning_rate": 8.68995571525106e-06, + "loss": 0.7237, + "step": 8644 + }, + { + "epoch": 0.4758104463646871, + "grad_norm": 0.7727495431900024, + "learning_rate": 8.689663193604858e-06, + "loss": 0.7215, + "step": 8645 + }, + { + "epoch": 0.47586548516704275, + "grad_norm": 0.7320648431777954, + "learning_rate": 8.689370644228136e-06, + "loss": 0.7592, + "step": 8646 + }, + { + "epoch": 0.47592052396939843, + "grad_norm": 0.8149487376213074, + "learning_rate": 8.689078067123093e-06, + "loss": 0.7666, + "step": 8647 + }, + { + "epoch": 0.47597556277175407, + "grad_norm": 0.6584552526473999, + "learning_rate": 8.688785462291927e-06, + "loss": 0.7497, + "step": 8648 + }, + { + "epoch": 0.47603060157410976, + "grad_norm": 0.7197825312614441, + "learning_rate": 8.688492829736836e-06, + "loss": 0.7559, + "step": 8649 + }, + { + "epoch": 0.4760856403764654, + "grad_norm": 0.8116913437843323, + "learning_rate": 8.68820016946002e-06, + "loss": 0.7029, + "step": 8650 + }, + { + "epoch": 0.4761406791788211, + "grad_norm": 0.6733378171920776, + "learning_rate": 8.68790748146368e-06, + "loss": 0.7242, + "step": 8651 + }, + { + "epoch": 0.4761957179811767, + "grad_norm": 0.690464437007904, + "learning_rate": 8.687614765750012e-06, + "loss": 0.6668, + "step": 8652 + }, + { + "epoch": 0.4762507567835324, + "grad_norm": 0.7901185154914856, + "learning_rate": 8.687322022321221e-06, + "loss": 0.7436, + "step": 8653 + }, + { + "epoch": 0.47630579558588804, + "grad_norm": 0.7608267068862915, + "learning_rate": 8.687029251179504e-06, + "loss": 0.8292, + "step": 8654 + }, + { + "epoch": 0.4763608343882437, + "grad_norm": 0.6851119995117188, + "learning_rate": 8.686736452327062e-06, + "loss": 0.7974, + "step": 8655 + }, + { + "epoch": 0.47641587319059936, + "grad_norm": 0.6946395635604858, + "learning_rate": 8.686443625766094e-06, + "loss": 0.6745, + "step": 8656 + }, + { + "epoch": 0.47647091199295505, + "grad_norm": 0.7403521537780762, + "learning_rate": 8.686150771498804e-06, + "loss": 0.7759, + "step": 8657 + }, + { + "epoch": 0.4765259507953107, + "grad_norm": 0.8415689468383789, + "learning_rate": 8.685857889527393e-06, + "loss": 0.7911, + "step": 8658 + }, + { + "epoch": 0.47658098959766637, + "grad_norm": 0.6947778463363647, + "learning_rate": 8.68556497985406e-06, + "loss": 0.8026, + "step": 8659 + }, + { + "epoch": 0.476636028400022, + "grad_norm": 0.6807059645652771, + "learning_rate": 8.685272042481006e-06, + "loss": 0.7194, + "step": 8660 + }, + { + "epoch": 0.4766910672023777, + "grad_norm": 0.8948639631271362, + "learning_rate": 8.684979077410434e-06, + "loss": 0.8017, + "step": 8661 + }, + { + "epoch": 0.4767461060047333, + "grad_norm": 0.6697849035263062, + "learning_rate": 8.684686084644546e-06, + "loss": 0.7653, + "step": 8662 + }, + { + "epoch": 0.476801144807089, + "grad_norm": 0.7303311228752136, + "learning_rate": 8.684393064185543e-06, + "loss": 0.8287, + "step": 8663 + }, + { + "epoch": 0.47685618360944465, + "grad_norm": 0.6545100808143616, + "learning_rate": 8.68410001603563e-06, + "loss": 0.7438, + "step": 8664 + }, + { + "epoch": 0.47691122241180034, + "grad_norm": 0.8757766485214233, + "learning_rate": 8.683806940197006e-06, + "loss": 0.8343, + "step": 8665 + }, + { + "epoch": 0.476966261214156, + "grad_norm": 0.6414330005645752, + "learning_rate": 8.683513836671876e-06, + "loss": 0.7201, + "step": 8666 + }, + { + "epoch": 0.47702130001651166, + "grad_norm": 0.6736441850662231, + "learning_rate": 8.68322070546244e-06, + "loss": 0.7365, + "step": 8667 + }, + { + "epoch": 0.4770763388188673, + "grad_norm": 0.780491054058075, + "learning_rate": 8.682927546570905e-06, + "loss": 0.924, + "step": 8668 + }, + { + "epoch": 0.477131377621223, + "grad_norm": 0.6913807988166809, + "learning_rate": 8.68263435999947e-06, + "loss": 0.8269, + "step": 8669 + }, + { + "epoch": 0.4771864164235786, + "grad_norm": 0.7264360189437866, + "learning_rate": 8.682341145750344e-06, + "loss": 0.788, + "step": 8670 + }, + { + "epoch": 0.4772414552259343, + "grad_norm": 0.7777243852615356, + "learning_rate": 8.682047903825725e-06, + "loss": 0.8691, + "step": 8671 + }, + { + "epoch": 0.47729649402828994, + "grad_norm": 0.7590457797050476, + "learning_rate": 8.681754634227821e-06, + "loss": 0.8249, + "step": 8672 + }, + { + "epoch": 0.47735153283064563, + "grad_norm": 0.7672324776649475, + "learning_rate": 8.681461336958836e-06, + "loss": 0.8334, + "step": 8673 + }, + { + "epoch": 0.47740657163300126, + "grad_norm": 0.7181395888328552, + "learning_rate": 8.681168012020971e-06, + "loss": 0.8089, + "step": 8674 + }, + { + "epoch": 0.47746161043535695, + "grad_norm": 0.7671428918838501, + "learning_rate": 8.680874659416433e-06, + "loss": 0.7634, + "step": 8675 + }, + { + "epoch": 0.4775166492377126, + "grad_norm": 0.73219895362854, + "learning_rate": 8.680581279147427e-06, + "loss": 0.7013, + "step": 8676 + }, + { + "epoch": 0.4775716880400682, + "grad_norm": 0.8050867319107056, + "learning_rate": 8.680287871216158e-06, + "loss": 0.7524, + "step": 8677 + }, + { + "epoch": 0.4776267268424239, + "grad_norm": 0.7154340744018555, + "learning_rate": 8.679994435624828e-06, + "loss": 0.802, + "step": 8678 + }, + { + "epoch": 0.47768176564477954, + "grad_norm": 0.7005884051322937, + "learning_rate": 8.679700972375647e-06, + "loss": 0.7633, + "step": 8679 + }, + { + "epoch": 0.47773680444713523, + "grad_norm": 0.8203871846199036, + "learning_rate": 8.679407481470818e-06, + "loss": 0.7782, + "step": 8680 + }, + { + "epoch": 0.47779184324949087, + "grad_norm": 0.6582844853401184, + "learning_rate": 8.679113962912547e-06, + "loss": 0.6799, + "step": 8681 + }, + { + "epoch": 0.47784688205184656, + "grad_norm": 0.7052889466285706, + "learning_rate": 8.67882041670304e-06, + "loss": 0.7814, + "step": 8682 + }, + { + "epoch": 0.4779019208542022, + "grad_norm": 0.7533165812492371, + "learning_rate": 8.678526842844504e-06, + "loss": 0.7983, + "step": 8683 + }, + { + "epoch": 0.4779569596565579, + "grad_norm": 0.7335212230682373, + "learning_rate": 8.678233241339144e-06, + "loss": 0.8023, + "step": 8684 + }, + { + "epoch": 0.4780119984589135, + "grad_norm": 0.7824274897575378, + "learning_rate": 8.67793961218917e-06, + "loss": 0.8219, + "step": 8685 + }, + { + "epoch": 0.4780670372612692, + "grad_norm": 0.6547996401786804, + "learning_rate": 8.677645955396784e-06, + "loss": 0.715, + "step": 8686 + }, + { + "epoch": 0.47812207606362483, + "grad_norm": 0.7507368326187134, + "learning_rate": 8.677352270964196e-06, + "loss": 0.9379, + "step": 8687 + }, + { + "epoch": 0.4781771148659805, + "grad_norm": 0.6403020620346069, + "learning_rate": 8.677058558893613e-06, + "loss": 0.659, + "step": 8688 + }, + { + "epoch": 0.47823215366833616, + "grad_norm": 0.7075803279876709, + "learning_rate": 8.676764819187242e-06, + "loss": 0.7515, + "step": 8689 + }, + { + "epoch": 0.47828719247069185, + "grad_norm": 0.6899601817131042, + "learning_rate": 8.676471051847291e-06, + "loss": 0.8398, + "step": 8690 + }, + { + "epoch": 0.4783422312730475, + "grad_norm": 0.7145645618438721, + "learning_rate": 8.676177256875969e-06, + "loss": 0.7711, + "step": 8691 + }, + { + "epoch": 0.47839727007540317, + "grad_norm": 0.7139655351638794, + "learning_rate": 8.675883434275479e-06, + "loss": 0.8664, + "step": 8692 + }, + { + "epoch": 0.4784523088777588, + "grad_norm": 0.7100433111190796, + "learning_rate": 8.675589584048037e-06, + "loss": 0.7812, + "step": 8693 + }, + { + "epoch": 0.4785073476801145, + "grad_norm": 0.6103882789611816, + "learning_rate": 8.675295706195845e-06, + "loss": 0.6565, + "step": 8694 + }, + { + "epoch": 0.4785623864824701, + "grad_norm": 0.7236714959144592, + "learning_rate": 8.675001800721114e-06, + "loss": 0.6849, + "step": 8695 + }, + { + "epoch": 0.4786174252848258, + "grad_norm": 0.7567160129547119, + "learning_rate": 8.674707867626056e-06, + "loss": 0.8289, + "step": 8696 + }, + { + "epoch": 0.47867246408718145, + "grad_norm": 0.7004136443138123, + "learning_rate": 8.674413906912876e-06, + "loss": 0.7466, + "step": 8697 + }, + { + "epoch": 0.47872750288953714, + "grad_norm": 0.713835597038269, + "learning_rate": 8.674119918583783e-06, + "loss": 0.7875, + "step": 8698 + }, + { + "epoch": 0.47878254169189277, + "grad_norm": 0.8476874232292175, + "learning_rate": 8.67382590264099e-06, + "loss": 0.8028, + "step": 8699 + }, + { + "epoch": 0.47883758049424846, + "grad_norm": 0.720273494720459, + "learning_rate": 8.673531859086706e-06, + "loss": 0.7829, + "step": 8700 + }, + { + "epoch": 0.4788926192966041, + "grad_norm": 0.8042417168617249, + "learning_rate": 8.673237787923137e-06, + "loss": 0.7914, + "step": 8701 + }, + { + "epoch": 0.4789476580989598, + "grad_norm": 0.7779260277748108, + "learning_rate": 8.672943689152498e-06, + "loss": 0.6921, + "step": 8702 + }, + { + "epoch": 0.4790026969013154, + "grad_norm": 0.7957637906074524, + "learning_rate": 8.672649562776997e-06, + "loss": 0.8761, + "step": 8703 + }, + { + "epoch": 0.4790577357036711, + "grad_norm": 0.7467649579048157, + "learning_rate": 8.672355408798845e-06, + "loss": 0.7984, + "step": 8704 + }, + { + "epoch": 0.47911277450602674, + "grad_norm": 0.6746538877487183, + "learning_rate": 8.672061227220252e-06, + "loss": 0.7392, + "step": 8705 + }, + { + "epoch": 0.47916781330838243, + "grad_norm": 0.7331795692443848, + "learning_rate": 8.671767018043432e-06, + "loss": 0.7171, + "step": 8706 + }, + { + "epoch": 0.47922285211073806, + "grad_norm": 0.7879608273506165, + "learning_rate": 8.671472781270592e-06, + "loss": 0.8497, + "step": 8707 + }, + { + "epoch": 0.47927789091309375, + "grad_norm": 0.8659428358078003, + "learning_rate": 8.671178516903946e-06, + "loss": 0.8102, + "step": 8708 + }, + { + "epoch": 0.4793329297154494, + "grad_norm": 0.6489408612251282, + "learning_rate": 8.670884224945704e-06, + "loss": 0.6752, + "step": 8709 + }, + { + "epoch": 0.4793879685178051, + "grad_norm": 0.8182825446128845, + "learning_rate": 8.670589905398079e-06, + "loss": 0.7972, + "step": 8710 + }, + { + "epoch": 0.4794430073201607, + "grad_norm": 0.7759343981742859, + "learning_rate": 8.670295558263285e-06, + "loss": 0.7856, + "step": 8711 + }, + { + "epoch": 0.4794980461225164, + "grad_norm": 0.7421835064888, + "learning_rate": 8.670001183543528e-06, + "loss": 0.8165, + "step": 8712 + }, + { + "epoch": 0.47955308492487203, + "grad_norm": 0.6498512625694275, + "learning_rate": 8.669706781241028e-06, + "loss": 0.7212, + "step": 8713 + }, + { + "epoch": 0.4796081237272277, + "grad_norm": 0.8493219614028931, + "learning_rate": 8.669412351357993e-06, + "loss": 0.8036, + "step": 8714 + }, + { + "epoch": 0.47966316252958335, + "grad_norm": 0.6834331750869751, + "learning_rate": 8.669117893896637e-06, + "loss": 0.8127, + "step": 8715 + }, + { + "epoch": 0.47971820133193904, + "grad_norm": 0.7793670296669006, + "learning_rate": 8.668823408859172e-06, + "loss": 0.7276, + "step": 8716 + }, + { + "epoch": 0.4797732401342947, + "grad_norm": 0.7108075022697449, + "learning_rate": 8.668528896247815e-06, + "loss": 0.8328, + "step": 8717 + }, + { + "epoch": 0.47982827893665037, + "grad_norm": 0.6662433743476868, + "learning_rate": 8.668234356064774e-06, + "loss": 0.6751, + "step": 8718 + }, + { + "epoch": 0.479883317739006, + "grad_norm": 0.6595591902732849, + "learning_rate": 8.667939788312267e-06, + "loss": 0.707, + "step": 8719 + }, + { + "epoch": 0.47993835654136163, + "grad_norm": 0.7435836791992188, + "learning_rate": 8.667645192992506e-06, + "loss": 0.7885, + "step": 8720 + }, + { + "epoch": 0.4799933953437173, + "grad_norm": 0.6999356746673584, + "learning_rate": 8.667350570107706e-06, + "loss": 0.7538, + "step": 8721 + }, + { + "epoch": 0.48004843414607296, + "grad_norm": 0.7111191749572754, + "learning_rate": 8.66705591966008e-06, + "loss": 0.6814, + "step": 8722 + }, + { + "epoch": 0.48010347294842864, + "grad_norm": 0.6752734780311584, + "learning_rate": 8.666761241651844e-06, + "loss": 0.7221, + "step": 8723 + }, + { + "epoch": 0.4801585117507843, + "grad_norm": 0.7432951331138611, + "learning_rate": 8.666466536085212e-06, + "loss": 0.7689, + "step": 8724 + }, + { + "epoch": 0.48021355055313997, + "grad_norm": 0.7384392023086548, + "learning_rate": 8.666171802962398e-06, + "loss": 0.7862, + "step": 8725 + }, + { + "epoch": 0.4802685893554956, + "grad_norm": 0.6878762245178223, + "learning_rate": 8.66587704228562e-06, + "loss": 0.7246, + "step": 8726 + }, + { + "epoch": 0.4803236281578513, + "grad_norm": 0.6640586853027344, + "learning_rate": 8.66558225405709e-06, + "loss": 0.7181, + "step": 8727 + }, + { + "epoch": 0.4803786669602069, + "grad_norm": 0.6808595061302185, + "learning_rate": 8.665287438279024e-06, + "loss": 0.7866, + "step": 8728 + }, + { + "epoch": 0.4804337057625626, + "grad_norm": 0.5966268181800842, + "learning_rate": 8.66499259495364e-06, + "loss": 0.6755, + "step": 8729 + }, + { + "epoch": 0.48048874456491825, + "grad_norm": 0.742016077041626, + "learning_rate": 8.664697724083152e-06, + "loss": 0.8682, + "step": 8730 + }, + { + "epoch": 0.48054378336727394, + "grad_norm": 0.6621154546737671, + "learning_rate": 8.66440282566978e-06, + "loss": 0.7525, + "step": 8731 + }, + { + "epoch": 0.48059882216962957, + "grad_norm": 0.7347434759140015, + "learning_rate": 8.664107899715733e-06, + "loss": 0.7919, + "step": 8732 + }, + { + "epoch": 0.48065386097198526, + "grad_norm": 0.7564681172370911, + "learning_rate": 8.663812946223234e-06, + "loss": 0.9172, + "step": 8733 + }, + { + "epoch": 0.4807088997743409, + "grad_norm": 0.7193084359169006, + "learning_rate": 8.663517965194497e-06, + "loss": 0.7931, + "step": 8734 + }, + { + "epoch": 0.4807639385766966, + "grad_norm": 0.6882064938545227, + "learning_rate": 8.66322295663174e-06, + "loss": 0.7678, + "step": 8735 + }, + { + "epoch": 0.4808189773790522, + "grad_norm": 0.7954713106155396, + "learning_rate": 8.662927920537179e-06, + "loss": 0.6357, + "step": 8736 + }, + { + "epoch": 0.4808740161814079, + "grad_norm": 0.7123041749000549, + "learning_rate": 8.662632856913034e-06, + "loss": 0.7234, + "step": 8737 + }, + { + "epoch": 0.48092905498376354, + "grad_norm": 0.745145320892334, + "learning_rate": 8.66233776576152e-06, + "loss": 0.7516, + "step": 8738 + }, + { + "epoch": 0.4809840937861192, + "grad_norm": 0.6904219388961792, + "learning_rate": 8.662042647084856e-06, + "loss": 0.7995, + "step": 8739 + }, + { + "epoch": 0.48103913258847486, + "grad_norm": 0.71831214427948, + "learning_rate": 8.661747500885258e-06, + "loss": 0.7965, + "step": 8740 + }, + { + "epoch": 0.48109417139083055, + "grad_norm": 0.8514378666877747, + "learning_rate": 8.661452327164948e-06, + "loss": 0.8023, + "step": 8741 + }, + { + "epoch": 0.4811492101931862, + "grad_norm": 0.7411143779754639, + "learning_rate": 8.66115712592614e-06, + "loss": 0.797, + "step": 8742 + }, + { + "epoch": 0.4812042489955419, + "grad_norm": 0.737178385257721, + "learning_rate": 8.660861897171057e-06, + "loss": 0.7286, + "step": 8743 + }, + { + "epoch": 0.4812592877978975, + "grad_norm": 0.6823513507843018, + "learning_rate": 8.660566640901918e-06, + "loss": 0.7482, + "step": 8744 + }, + { + "epoch": 0.4813143266002532, + "grad_norm": 0.7205879092216492, + "learning_rate": 8.660271357120937e-06, + "loss": 0.8294, + "step": 8745 + }, + { + "epoch": 0.48136936540260883, + "grad_norm": 0.6887338757514954, + "learning_rate": 8.659976045830337e-06, + "loss": 0.7711, + "step": 8746 + }, + { + "epoch": 0.4814244042049645, + "grad_norm": 0.7498533129692078, + "learning_rate": 8.659680707032336e-06, + "loss": 0.7296, + "step": 8747 + }, + { + "epoch": 0.48147944300732015, + "grad_norm": 0.8041636943817139, + "learning_rate": 8.659385340729155e-06, + "loss": 0.9213, + "step": 8748 + }, + { + "epoch": 0.48153448180967584, + "grad_norm": 0.8623721599578857, + "learning_rate": 8.659089946923014e-06, + "loss": 0.8024, + "step": 8749 + }, + { + "epoch": 0.4815895206120315, + "grad_norm": 0.7212050557136536, + "learning_rate": 8.658794525616132e-06, + "loss": 0.732, + "step": 8750 + }, + { + "epoch": 0.48164455941438716, + "grad_norm": 0.7141492366790771, + "learning_rate": 8.658499076810729e-06, + "loss": 0.8062, + "step": 8751 + }, + { + "epoch": 0.4816995982167428, + "grad_norm": 0.7191516160964966, + "learning_rate": 8.658203600509027e-06, + "loss": 0.805, + "step": 8752 + }, + { + "epoch": 0.4817546370190985, + "grad_norm": 0.71059650182724, + "learning_rate": 8.657908096713245e-06, + "loss": 0.6755, + "step": 8753 + }, + { + "epoch": 0.4818096758214541, + "grad_norm": 0.6715459823608398, + "learning_rate": 8.657612565425607e-06, + "loss": 0.8093, + "step": 8754 + }, + { + "epoch": 0.4818647146238098, + "grad_norm": 0.7438814640045166, + "learning_rate": 8.65731700664833e-06, + "loss": 0.8059, + "step": 8755 + }, + { + "epoch": 0.48191975342616544, + "grad_norm": 0.7295387387275696, + "learning_rate": 8.657021420383637e-06, + "loss": 0.8437, + "step": 8756 + }, + { + "epoch": 0.48197479222852113, + "grad_norm": 0.7053797245025635, + "learning_rate": 8.656725806633753e-06, + "loss": 0.8424, + "step": 8757 + }, + { + "epoch": 0.48202983103087677, + "grad_norm": 0.6902007460594177, + "learning_rate": 8.656430165400894e-06, + "loss": 0.6967, + "step": 8758 + }, + { + "epoch": 0.48208486983323245, + "grad_norm": 0.66749507188797, + "learning_rate": 8.656134496687286e-06, + "loss": 0.7858, + "step": 8759 + }, + { + "epoch": 0.4821399086355881, + "grad_norm": 0.6755428314208984, + "learning_rate": 8.65583880049515e-06, + "loss": 0.6669, + "step": 8760 + }, + { + "epoch": 0.4821949474379438, + "grad_norm": 0.921096920967102, + "learning_rate": 8.655543076826706e-06, + "loss": 0.8545, + "step": 8761 + }, + { + "epoch": 0.4822499862402994, + "grad_norm": 0.7931553721427917, + "learning_rate": 8.65524732568418e-06, + "loss": 0.8708, + "step": 8762 + }, + { + "epoch": 0.48230502504265504, + "grad_norm": 0.7891780734062195, + "learning_rate": 8.654951547069794e-06, + "loss": 0.687, + "step": 8763 + }, + { + "epoch": 0.48236006384501073, + "grad_norm": 0.747662365436554, + "learning_rate": 8.65465574098577e-06, + "loss": 0.8153, + "step": 8764 + }, + { + "epoch": 0.48241510264736637, + "grad_norm": 0.7758497595787048, + "learning_rate": 8.65435990743433e-06, + "loss": 0.8018, + "step": 8765 + }, + { + "epoch": 0.48247014144972206, + "grad_norm": 0.6997805237770081, + "learning_rate": 8.654064046417703e-06, + "loss": 0.7845, + "step": 8766 + }, + { + "epoch": 0.4825251802520777, + "grad_norm": 0.7188366651535034, + "learning_rate": 8.653768157938106e-06, + "loss": 0.7528, + "step": 8767 + }, + { + "epoch": 0.4825802190544334, + "grad_norm": 0.6848055124282837, + "learning_rate": 8.653472241997767e-06, + "loss": 0.7658, + "step": 8768 + }, + { + "epoch": 0.482635257856789, + "grad_norm": 1.0603824853897095, + "learning_rate": 8.653176298598907e-06, + "loss": 0.7692, + "step": 8769 + }, + { + "epoch": 0.4826902966591447, + "grad_norm": 0.8191514611244202, + "learning_rate": 8.652880327743753e-06, + "loss": 0.7706, + "step": 8770 + }, + { + "epoch": 0.48274533546150034, + "grad_norm": 0.6318503618240356, + "learning_rate": 8.652584329434527e-06, + "loss": 0.6635, + "step": 8771 + }, + { + "epoch": 0.482800374263856, + "grad_norm": 0.6860769391059875, + "learning_rate": 8.652288303673457e-06, + "loss": 0.739, + "step": 8772 + }, + { + "epoch": 0.48285541306621166, + "grad_norm": 0.7414761185646057, + "learning_rate": 8.651992250462765e-06, + "loss": 0.7949, + "step": 8773 + }, + { + "epoch": 0.48291045186856735, + "grad_norm": 0.7255183458328247, + "learning_rate": 8.651696169804676e-06, + "loss": 0.8569, + "step": 8774 + }, + { + "epoch": 0.482965490670923, + "grad_norm": 0.7034135460853577, + "learning_rate": 8.651400061701417e-06, + "loss": 0.7562, + "step": 8775 + }, + { + "epoch": 0.48302052947327867, + "grad_norm": 0.7041038274765015, + "learning_rate": 8.651103926155212e-06, + "loss": 0.7194, + "step": 8776 + }, + { + "epoch": 0.4830755682756343, + "grad_norm": 1.0965619087219238, + "learning_rate": 8.650807763168287e-06, + "loss": 0.9033, + "step": 8777 + }, + { + "epoch": 0.48313060707799, + "grad_norm": 0.7400044798851013, + "learning_rate": 8.650511572742869e-06, + "loss": 0.7626, + "step": 8778 + }, + { + "epoch": 0.4831856458803456, + "grad_norm": 0.6957885026931763, + "learning_rate": 8.650215354881182e-06, + "loss": 0.7283, + "step": 8779 + }, + { + "epoch": 0.4832406846827013, + "grad_norm": 0.7992473840713501, + "learning_rate": 8.649919109585454e-06, + "loss": 0.8376, + "step": 8780 + }, + { + "epoch": 0.48329572348505695, + "grad_norm": 0.8556981086730957, + "learning_rate": 8.649622836857911e-06, + "loss": 0.7737, + "step": 8781 + }, + { + "epoch": 0.48335076228741264, + "grad_norm": 0.8476192355155945, + "learning_rate": 8.64932653670078e-06, + "loss": 0.8926, + "step": 8782 + }, + { + "epoch": 0.48340580108976827, + "grad_norm": 0.6461093425750732, + "learning_rate": 8.649030209116289e-06, + "loss": 0.7452, + "step": 8783 + }, + { + "epoch": 0.48346083989212396, + "grad_norm": 0.6997528076171875, + "learning_rate": 8.648733854106661e-06, + "loss": 0.7962, + "step": 8784 + }, + { + "epoch": 0.4835158786944796, + "grad_norm": 0.7606356739997864, + "learning_rate": 8.648437471674128e-06, + "loss": 0.6517, + "step": 8785 + }, + { + "epoch": 0.4835709174968353, + "grad_norm": 0.8118630051612854, + "learning_rate": 8.648141061820913e-06, + "loss": 0.7539, + "step": 8786 + }, + { + "epoch": 0.4836259562991909, + "grad_norm": 0.8778805136680603, + "learning_rate": 8.64784462454925e-06, + "loss": 0.763, + "step": 8787 + }, + { + "epoch": 0.4836809951015466, + "grad_norm": 0.7741022706031799, + "learning_rate": 8.647548159861361e-06, + "loss": 0.7749, + "step": 8788 + }, + { + "epoch": 0.48373603390390224, + "grad_norm": 0.76578688621521, + "learning_rate": 8.647251667759478e-06, + "loss": 0.6968, + "step": 8789 + }, + { + "epoch": 0.48379107270625793, + "grad_norm": 0.8477250933647156, + "learning_rate": 8.646955148245827e-06, + "loss": 0.8364, + "step": 8790 + }, + { + "epoch": 0.48384611150861356, + "grad_norm": 0.9105041027069092, + "learning_rate": 8.646658601322635e-06, + "loss": 0.823, + "step": 8791 + }, + { + "epoch": 0.48390115031096925, + "grad_norm": 0.7642726898193359, + "learning_rate": 8.646362026992135e-06, + "loss": 0.721, + "step": 8792 + }, + { + "epoch": 0.4839561891133249, + "grad_norm": 0.7567259669303894, + "learning_rate": 8.646065425256555e-06, + "loss": 0.7876, + "step": 8793 + }, + { + "epoch": 0.4840112279156806, + "grad_norm": 0.7691231966018677, + "learning_rate": 8.64576879611812e-06, + "loss": 0.8308, + "step": 8794 + }, + { + "epoch": 0.4840662667180362, + "grad_norm": 1.0769426822662354, + "learning_rate": 8.645472139579067e-06, + "loss": 0.892, + "step": 8795 + }, + { + "epoch": 0.4841213055203919, + "grad_norm": 0.6987955570220947, + "learning_rate": 8.64517545564162e-06, + "loss": 0.8254, + "step": 8796 + }, + { + "epoch": 0.48417634432274753, + "grad_norm": 0.7736005783081055, + "learning_rate": 8.644878744308007e-06, + "loss": 0.7666, + "step": 8797 + }, + { + "epoch": 0.4842313831251032, + "grad_norm": 0.6233380436897278, + "learning_rate": 8.644582005580464e-06, + "loss": 0.6443, + "step": 8798 + }, + { + "epoch": 0.48428642192745885, + "grad_norm": 0.7343530654907227, + "learning_rate": 8.644285239461217e-06, + "loss": 0.724, + "step": 8799 + }, + { + "epoch": 0.48434146072981454, + "grad_norm": 0.725321352481842, + "learning_rate": 8.643988445952499e-06, + "loss": 0.7249, + "step": 8800 + }, + { + "epoch": 0.4843964995321702, + "grad_norm": 0.7256256341934204, + "learning_rate": 8.643691625056539e-06, + "loss": 0.8656, + "step": 8801 + }, + { + "epoch": 0.48445153833452587, + "grad_norm": 0.8559528589248657, + "learning_rate": 8.643394776775567e-06, + "loss": 0.9186, + "step": 8802 + }, + { + "epoch": 0.4845065771368815, + "grad_norm": 0.6735692024230957, + "learning_rate": 8.643097901111815e-06, + "loss": 0.7007, + "step": 8803 + }, + { + "epoch": 0.4845616159392372, + "grad_norm": 0.8373280167579651, + "learning_rate": 8.642800998067515e-06, + "loss": 0.8774, + "step": 8804 + }, + { + "epoch": 0.4846166547415928, + "grad_norm": 0.731311023235321, + "learning_rate": 8.642504067644898e-06, + "loss": 0.7102, + "step": 8805 + }, + { + "epoch": 0.48467169354394846, + "grad_norm": 0.7259742617607117, + "learning_rate": 8.642207109846195e-06, + "loss": 0.7174, + "step": 8806 + }, + { + "epoch": 0.48472673234630415, + "grad_norm": 0.6454386115074158, + "learning_rate": 8.641910124673638e-06, + "loss": 0.7656, + "step": 8807 + }, + { + "epoch": 0.4847817711486598, + "grad_norm": 0.7701624631881714, + "learning_rate": 8.641613112129462e-06, + "loss": 0.7926, + "step": 8808 + }, + { + "epoch": 0.48483680995101547, + "grad_norm": 0.6812854409217834, + "learning_rate": 8.641316072215893e-06, + "loss": 0.7072, + "step": 8809 + }, + { + "epoch": 0.4848918487533711, + "grad_norm": 0.8180119395256042, + "learning_rate": 8.641019004935169e-06, + "loss": 0.8621, + "step": 8810 + }, + { + "epoch": 0.4849468875557268, + "grad_norm": 0.6346331834793091, + "learning_rate": 8.64072191028952e-06, + "loss": 0.6907, + "step": 8811 + }, + { + "epoch": 0.4850019263580824, + "grad_norm": 0.6819741129875183, + "learning_rate": 8.64042478828118e-06, + "loss": 0.77, + "step": 8812 + }, + { + "epoch": 0.4850569651604381, + "grad_norm": 0.9074214100837708, + "learning_rate": 8.640127638912383e-06, + "loss": 0.7799, + "step": 8813 + }, + { + "epoch": 0.48511200396279375, + "grad_norm": 0.8065158724784851, + "learning_rate": 8.63983046218536e-06, + "loss": 0.8033, + "step": 8814 + }, + { + "epoch": 0.48516704276514944, + "grad_norm": 0.6241241097450256, + "learning_rate": 8.639533258102345e-06, + "loss": 0.6936, + "step": 8815 + }, + { + "epoch": 0.48522208156750507, + "grad_norm": 0.6928265690803528, + "learning_rate": 8.639236026665573e-06, + "loss": 0.7526, + "step": 8816 + }, + { + "epoch": 0.48527712036986076, + "grad_norm": 0.8171425461769104, + "learning_rate": 8.638938767877276e-06, + "loss": 0.8227, + "step": 8817 + }, + { + "epoch": 0.4853321591722164, + "grad_norm": 0.7007083296775818, + "learning_rate": 8.638641481739692e-06, + "loss": 0.7439, + "step": 8818 + }, + { + "epoch": 0.4853871979745721, + "grad_norm": 0.8905115127563477, + "learning_rate": 8.63834416825505e-06, + "loss": 0.6873, + "step": 8819 + }, + { + "epoch": 0.4854422367769277, + "grad_norm": 0.702198326587677, + "learning_rate": 8.638046827425588e-06, + "loss": 0.7999, + "step": 8820 + }, + { + "epoch": 0.4854972755792834, + "grad_norm": 0.7280104160308838, + "learning_rate": 8.63774945925354e-06, + "loss": 0.8562, + "step": 8821 + }, + { + "epoch": 0.48555231438163904, + "grad_norm": 0.9803630113601685, + "learning_rate": 8.63745206374114e-06, + "loss": 0.8347, + "step": 8822 + }, + { + "epoch": 0.4856073531839947, + "grad_norm": 0.6781168580055237, + "learning_rate": 8.637154640890625e-06, + "loss": 0.8124, + "step": 8823 + }, + { + "epoch": 0.48566239198635036, + "grad_norm": 0.7219669222831726, + "learning_rate": 8.63685719070423e-06, + "loss": 0.8053, + "step": 8824 + }, + { + "epoch": 0.48571743078870605, + "grad_norm": 0.7077241539955139, + "learning_rate": 8.636559713184187e-06, + "loss": 0.7534, + "step": 8825 + }, + { + "epoch": 0.4857724695910617, + "grad_norm": 0.70063316822052, + "learning_rate": 8.636262208332737e-06, + "loss": 0.7509, + "step": 8826 + }, + { + "epoch": 0.4858275083934174, + "grad_norm": 0.7292184233665466, + "learning_rate": 8.635964676152114e-06, + "loss": 0.7485, + "step": 8827 + }, + { + "epoch": 0.485882547195773, + "grad_norm": 0.7970258593559265, + "learning_rate": 8.635667116644552e-06, + "loss": 0.8874, + "step": 8828 + }, + { + "epoch": 0.4859375859981287, + "grad_norm": 0.7090024352073669, + "learning_rate": 8.63536952981229e-06, + "loss": 0.7665, + "step": 8829 + }, + { + "epoch": 0.48599262480048433, + "grad_norm": 0.761409342288971, + "learning_rate": 8.635071915657565e-06, + "loss": 0.7977, + "step": 8830 + }, + { + "epoch": 0.48604766360284, + "grad_norm": 0.724896252155304, + "learning_rate": 8.634774274182611e-06, + "loss": 0.8591, + "step": 8831 + }, + { + "epoch": 0.48610270240519565, + "grad_norm": 0.737424910068512, + "learning_rate": 8.634476605389666e-06, + "loss": 0.8256, + "step": 8832 + }, + { + "epoch": 0.48615774120755134, + "grad_norm": 0.8261227607727051, + "learning_rate": 8.63417890928097e-06, + "loss": 0.8089, + "step": 8833 + }, + { + "epoch": 0.486212780009907, + "grad_norm": 0.6744595766067505, + "learning_rate": 8.633881185858756e-06, + "loss": 0.7821, + "step": 8834 + }, + { + "epoch": 0.48626781881226266, + "grad_norm": 0.6717672944068909, + "learning_rate": 8.633583435125263e-06, + "loss": 0.7823, + "step": 8835 + }, + { + "epoch": 0.4863228576146183, + "grad_norm": 0.753616213798523, + "learning_rate": 8.633285657082732e-06, + "loss": 0.8044, + "step": 8836 + }, + { + "epoch": 0.486377896416974, + "grad_norm": 0.6910914182662964, + "learning_rate": 8.632987851733397e-06, + "loss": 0.8244, + "step": 8837 + }, + { + "epoch": 0.4864329352193296, + "grad_norm": 0.9127064347267151, + "learning_rate": 8.632690019079499e-06, + "loss": 0.7918, + "step": 8838 + }, + { + "epoch": 0.4864879740216853, + "grad_norm": 0.715918779373169, + "learning_rate": 8.632392159123274e-06, + "loss": 0.744, + "step": 8839 + }, + { + "epoch": 0.48654301282404094, + "grad_norm": 0.8206684589385986, + "learning_rate": 8.632094271866963e-06, + "loss": 0.7852, + "step": 8840 + }, + { + "epoch": 0.48659805162639663, + "grad_norm": 0.6502171158790588, + "learning_rate": 8.631796357312802e-06, + "loss": 0.7653, + "step": 8841 + }, + { + "epoch": 0.48665309042875227, + "grad_norm": 0.6987786889076233, + "learning_rate": 8.631498415463033e-06, + "loss": 0.7669, + "step": 8842 + }, + { + "epoch": 0.48670812923110796, + "grad_norm": 0.7902390360832214, + "learning_rate": 8.631200446319894e-06, + "loss": 0.8438, + "step": 8843 + }, + { + "epoch": 0.4867631680334636, + "grad_norm": 0.7464659810066223, + "learning_rate": 8.630902449885625e-06, + "loss": 0.8276, + "step": 8844 + }, + { + "epoch": 0.4868182068358193, + "grad_norm": 0.7375630736351013, + "learning_rate": 8.630604426162465e-06, + "loss": 0.7921, + "step": 8845 + }, + { + "epoch": 0.4868732456381749, + "grad_norm": 0.7206295728683472, + "learning_rate": 8.630306375152653e-06, + "loss": 0.8424, + "step": 8846 + }, + { + "epoch": 0.4869282844405306, + "grad_norm": 0.7384368181228638, + "learning_rate": 8.63000829685843e-06, + "loss": 0.8702, + "step": 8847 + }, + { + "epoch": 0.48698332324288623, + "grad_norm": 0.7839015126228333, + "learning_rate": 8.629710191282037e-06, + "loss": 0.7064, + "step": 8848 + }, + { + "epoch": 0.48703836204524187, + "grad_norm": 0.6909724473953247, + "learning_rate": 8.629412058425712e-06, + "loss": 0.6924, + "step": 8849 + }, + { + "epoch": 0.48709340084759756, + "grad_norm": 0.6553036570549011, + "learning_rate": 8.6291138982917e-06, + "loss": 0.6526, + "step": 8850 + }, + { + "epoch": 0.4871484396499532, + "grad_norm": 0.7202072143554688, + "learning_rate": 8.628815710882239e-06, + "loss": 0.7272, + "step": 8851 + }, + { + "epoch": 0.4872034784523089, + "grad_norm": 0.6898619532585144, + "learning_rate": 8.62851749619957e-06, + "loss": 0.7687, + "step": 8852 + }, + { + "epoch": 0.4872585172546645, + "grad_norm": 0.7888908386230469, + "learning_rate": 8.628219254245935e-06, + "loss": 0.7654, + "step": 8853 + }, + { + "epoch": 0.4873135560570202, + "grad_norm": 0.7312424778938293, + "learning_rate": 8.627920985023575e-06, + "loss": 0.8053, + "step": 8854 + }, + { + "epoch": 0.48736859485937584, + "grad_norm": 0.6588439345359802, + "learning_rate": 8.627622688534731e-06, + "loss": 0.7229, + "step": 8855 + }, + { + "epoch": 0.4874236336617315, + "grad_norm": 0.8292293548583984, + "learning_rate": 8.627324364781647e-06, + "loss": 0.8482, + "step": 8856 + }, + { + "epoch": 0.48747867246408716, + "grad_norm": 0.7573973536491394, + "learning_rate": 8.627026013766564e-06, + "loss": 0.7282, + "step": 8857 + }, + { + "epoch": 0.48753371126644285, + "grad_norm": 1.2215768098831177, + "learning_rate": 8.626727635491726e-06, + "loss": 0.7771, + "step": 8858 + }, + { + "epoch": 0.4875887500687985, + "grad_norm": 0.7324759364128113, + "learning_rate": 8.626429229959369e-06, + "loss": 0.781, + "step": 8859 + }, + { + "epoch": 0.48764378887115417, + "grad_norm": 0.6995676159858704, + "learning_rate": 8.626130797171745e-06, + "loss": 0.6907, + "step": 8860 + }, + { + "epoch": 0.4876988276735098, + "grad_norm": 0.7400509119033813, + "learning_rate": 8.625832337131092e-06, + "loss": 0.6572, + "step": 8861 + }, + { + "epoch": 0.4877538664758655, + "grad_norm": 0.6634842753410339, + "learning_rate": 8.625533849839653e-06, + "loss": 0.7229, + "step": 8862 + }, + { + "epoch": 0.4878089052782211, + "grad_norm": 0.7357299327850342, + "learning_rate": 8.625235335299673e-06, + "loss": 0.6418, + "step": 8863 + }, + { + "epoch": 0.4878639440805768, + "grad_norm": 0.6473466157913208, + "learning_rate": 8.624936793513394e-06, + "loss": 0.6796, + "step": 8864 + }, + { + "epoch": 0.48791898288293245, + "grad_norm": 0.9110734462738037, + "learning_rate": 8.62463822448306e-06, + "loss": 0.8143, + "step": 8865 + }, + { + "epoch": 0.48797402168528814, + "grad_norm": 0.7932308316230774, + "learning_rate": 8.624339628210916e-06, + "loss": 0.9103, + "step": 8866 + }, + { + "epoch": 0.4880290604876438, + "grad_norm": 0.6677752137184143, + "learning_rate": 8.624041004699205e-06, + "loss": 0.8073, + "step": 8867 + }, + { + "epoch": 0.48808409928999946, + "grad_norm": 0.7379121780395508, + "learning_rate": 8.623742353950171e-06, + "loss": 0.8643, + "step": 8868 + }, + { + "epoch": 0.4881391380923551, + "grad_norm": 0.7479479312896729, + "learning_rate": 8.623443675966062e-06, + "loss": 0.6117, + "step": 8869 + }, + { + "epoch": 0.4881941768947108, + "grad_norm": 0.7822794914245605, + "learning_rate": 8.623144970749118e-06, + "loss": 0.8629, + "step": 8870 + }, + { + "epoch": 0.4882492156970664, + "grad_norm": 0.7040950655937195, + "learning_rate": 8.622846238301587e-06, + "loss": 0.7519, + "step": 8871 + }, + { + "epoch": 0.4883042544994221, + "grad_norm": 0.747368574142456, + "learning_rate": 8.622547478625714e-06, + "loss": 0.7459, + "step": 8872 + }, + { + "epoch": 0.48835929330177774, + "grad_norm": 0.6755948066711426, + "learning_rate": 8.622248691723742e-06, + "loss": 0.7515, + "step": 8873 + }, + { + "epoch": 0.48841433210413343, + "grad_norm": 0.7265586256980896, + "learning_rate": 8.62194987759792e-06, + "loss": 0.7691, + "step": 8874 + }, + { + "epoch": 0.48846937090648906, + "grad_norm": 0.6696380972862244, + "learning_rate": 8.621651036250493e-06, + "loss": 0.778, + "step": 8875 + }, + { + "epoch": 0.48852440970884475, + "grad_norm": 0.7666454911231995, + "learning_rate": 8.621352167683705e-06, + "loss": 0.7396, + "step": 8876 + }, + { + "epoch": 0.4885794485112004, + "grad_norm": 0.7079235315322876, + "learning_rate": 8.621053271899803e-06, + "loss": 0.7917, + "step": 8877 + }, + { + "epoch": 0.4886344873135561, + "grad_norm": 0.6888919472694397, + "learning_rate": 8.620754348901034e-06, + "loss": 0.605, + "step": 8878 + }, + { + "epoch": 0.4886895261159117, + "grad_norm": 0.7177572250366211, + "learning_rate": 8.620455398689645e-06, + "loss": 0.7534, + "step": 8879 + }, + { + "epoch": 0.4887445649182674, + "grad_norm": 0.7268772721290588, + "learning_rate": 8.620156421267883e-06, + "loss": 0.7748, + "step": 8880 + }, + { + "epoch": 0.48879960372062303, + "grad_norm": 0.8015080690383911, + "learning_rate": 8.619857416637993e-06, + "loss": 0.6716, + "step": 8881 + }, + { + "epoch": 0.4888546425229787, + "grad_norm": 0.7464118599891663, + "learning_rate": 8.619558384802226e-06, + "loss": 0.796, + "step": 8882 + }, + { + "epoch": 0.48890968132533436, + "grad_norm": 0.6829718351364136, + "learning_rate": 8.619259325762826e-06, + "loss": 0.788, + "step": 8883 + }, + { + "epoch": 0.48896472012769004, + "grad_norm": 0.6553084850311279, + "learning_rate": 8.618960239522041e-06, + "loss": 0.7215, + "step": 8884 + }, + { + "epoch": 0.4890197589300457, + "grad_norm": 0.8056252598762512, + "learning_rate": 8.618661126082119e-06, + "loss": 0.8588, + "step": 8885 + }, + { + "epoch": 0.48907479773240137, + "grad_norm": 0.8145674467086792, + "learning_rate": 8.618361985445309e-06, + "loss": 0.8095, + "step": 8886 + }, + { + "epoch": 0.489129836534757, + "grad_norm": 0.740031898021698, + "learning_rate": 8.61806281761386e-06, + "loss": 0.7029, + "step": 8887 + }, + { + "epoch": 0.4891848753371127, + "grad_norm": 0.7442640662193298, + "learning_rate": 8.617763622590019e-06, + "loss": 0.782, + "step": 8888 + }, + { + "epoch": 0.4892399141394683, + "grad_norm": 0.6992725133895874, + "learning_rate": 8.617464400376035e-06, + "loss": 0.7877, + "step": 8889 + }, + { + "epoch": 0.489294952941824, + "grad_norm": 1.19756281375885, + "learning_rate": 8.617165150974157e-06, + "loss": 0.6985, + "step": 8890 + }, + { + "epoch": 0.48934999174417965, + "grad_norm": 0.6418262720108032, + "learning_rate": 8.616865874386633e-06, + "loss": 0.7385, + "step": 8891 + }, + { + "epoch": 0.4894050305465353, + "grad_norm": 0.787406325340271, + "learning_rate": 8.616566570615714e-06, + "loss": 0.8686, + "step": 8892 + }, + { + "epoch": 0.48946006934889097, + "grad_norm": 0.6990430951118469, + "learning_rate": 8.616267239663648e-06, + "loss": 0.7683, + "step": 8893 + }, + { + "epoch": 0.4895151081512466, + "grad_norm": 0.7180235981941223, + "learning_rate": 8.615967881532687e-06, + "loss": 0.8337, + "step": 8894 + }, + { + "epoch": 0.4895701469536023, + "grad_norm": 0.7647475600242615, + "learning_rate": 8.615668496225077e-06, + "loss": 0.8668, + "step": 8895 + }, + { + "epoch": 0.4896251857559579, + "grad_norm": 0.843063473701477, + "learning_rate": 8.615369083743072e-06, + "loss": 0.7968, + "step": 8896 + }, + { + "epoch": 0.4896802245583136, + "grad_norm": 0.9526075124740601, + "learning_rate": 8.61506964408892e-06, + "loss": 0.8766, + "step": 8897 + }, + { + "epoch": 0.48973526336066925, + "grad_norm": 0.7850056290626526, + "learning_rate": 8.614770177264874e-06, + "loss": 0.8033, + "step": 8898 + }, + { + "epoch": 0.48979030216302494, + "grad_norm": 0.8658629655838013, + "learning_rate": 8.614470683273182e-06, + "loss": 0.8206, + "step": 8899 + }, + { + "epoch": 0.48984534096538057, + "grad_norm": 0.8060176968574524, + "learning_rate": 8.614171162116096e-06, + "loss": 0.7602, + "step": 8900 + }, + { + "epoch": 0.48990037976773626, + "grad_norm": 0.7398280501365662, + "learning_rate": 8.613871613795865e-06, + "loss": 0.8067, + "step": 8901 + }, + { + "epoch": 0.4899554185700919, + "grad_norm": 0.7341256141662598, + "learning_rate": 8.613572038314744e-06, + "loss": 0.7305, + "step": 8902 + }, + { + "epoch": 0.4900104573724476, + "grad_norm": 0.7832887172698975, + "learning_rate": 8.613272435674984e-06, + "loss": 0.7012, + "step": 8903 + }, + { + "epoch": 0.4900654961748032, + "grad_norm": 0.6536995768547058, + "learning_rate": 8.612972805878834e-06, + "loss": 0.745, + "step": 8904 + }, + { + "epoch": 0.4901205349771589, + "grad_norm": 0.7511856555938721, + "learning_rate": 8.612673148928547e-06, + "loss": 0.7741, + "step": 8905 + }, + { + "epoch": 0.49017557377951454, + "grad_norm": 0.6117261648178101, + "learning_rate": 8.612373464826377e-06, + "loss": 0.5813, + "step": 8906 + }, + { + "epoch": 0.49023061258187023, + "grad_norm": 0.7832254767417908, + "learning_rate": 8.612073753574574e-06, + "loss": 0.7426, + "step": 8907 + }, + { + "epoch": 0.49028565138422586, + "grad_norm": 0.7516622543334961, + "learning_rate": 8.611774015175393e-06, + "loss": 0.8205, + "step": 8908 + }, + { + "epoch": 0.49034069018658155, + "grad_norm": 0.7776936888694763, + "learning_rate": 8.611474249631085e-06, + "loss": 0.8457, + "step": 8909 + }, + { + "epoch": 0.4903957289889372, + "grad_norm": 0.9364853501319885, + "learning_rate": 8.6111744569439e-06, + "loss": 0.9114, + "step": 8910 + }, + { + "epoch": 0.4904507677912929, + "grad_norm": 0.7584181427955627, + "learning_rate": 8.610874637116099e-06, + "loss": 0.6852, + "step": 8911 + }, + { + "epoch": 0.4905058065936485, + "grad_norm": 0.7326254844665527, + "learning_rate": 8.610574790149929e-06, + "loss": 0.7843, + "step": 8912 + }, + { + "epoch": 0.4905608453960042, + "grad_norm": 0.918258547782898, + "learning_rate": 8.610274916047645e-06, + "loss": 0.766, + "step": 8913 + }, + { + "epoch": 0.49061588419835983, + "grad_norm": 1.0083420276641846, + "learning_rate": 8.609975014811502e-06, + "loss": 0.7436, + "step": 8914 + }, + { + "epoch": 0.4906709230007155, + "grad_norm": 0.712664783000946, + "learning_rate": 8.609675086443752e-06, + "loss": 0.7891, + "step": 8915 + }, + { + "epoch": 0.49072596180307115, + "grad_norm": 0.7635206580162048, + "learning_rate": 8.609375130946651e-06, + "loss": 0.7842, + "step": 8916 + }, + { + "epoch": 0.49078100060542684, + "grad_norm": 0.7567723989486694, + "learning_rate": 8.609075148322452e-06, + "loss": 0.8435, + "step": 8917 + }, + { + "epoch": 0.4908360394077825, + "grad_norm": 0.8918718099594116, + "learning_rate": 8.60877513857341e-06, + "loss": 0.8015, + "step": 8918 + }, + { + "epoch": 0.49089107821013817, + "grad_norm": 0.8701914548873901, + "learning_rate": 8.608475101701781e-06, + "loss": 0.7806, + "step": 8919 + }, + { + "epoch": 0.4909461170124938, + "grad_norm": 0.7528215646743774, + "learning_rate": 8.608175037709819e-06, + "loss": 0.7958, + "step": 8920 + }, + { + "epoch": 0.4910011558148495, + "grad_norm": 0.7277387380599976, + "learning_rate": 8.60787494659978e-06, + "loss": 0.7878, + "step": 8921 + }, + { + "epoch": 0.4910561946172051, + "grad_norm": 0.6739892959594727, + "learning_rate": 8.607574828373917e-06, + "loss": 0.7212, + "step": 8922 + }, + { + "epoch": 0.4911112334195608, + "grad_norm": 0.712480366230011, + "learning_rate": 8.607274683034487e-06, + "loss": 0.7966, + "step": 8923 + }, + { + "epoch": 0.49116627222191644, + "grad_norm": 0.7192126512527466, + "learning_rate": 8.606974510583747e-06, + "loss": 0.7032, + "step": 8924 + }, + { + "epoch": 0.49122131102427213, + "grad_norm": 0.7502614855766296, + "learning_rate": 8.606674311023953e-06, + "loss": 0.7465, + "step": 8925 + }, + { + "epoch": 0.49127634982662777, + "grad_norm": 0.8475236892700195, + "learning_rate": 8.606374084357361e-06, + "loss": 0.8083, + "step": 8926 + }, + { + "epoch": 0.49133138862898346, + "grad_norm": 0.6972761750221252, + "learning_rate": 8.606073830586224e-06, + "loss": 0.7206, + "step": 8927 + }, + { + "epoch": 0.4913864274313391, + "grad_norm": 0.6209561824798584, + "learning_rate": 8.605773549712803e-06, + "loss": 0.6664, + "step": 8928 + }, + { + "epoch": 0.4914414662336948, + "grad_norm": 0.7905771732330322, + "learning_rate": 8.605473241739353e-06, + "loss": 0.7243, + "step": 8929 + }, + { + "epoch": 0.4914965050360504, + "grad_norm": 0.762959897518158, + "learning_rate": 8.605172906668131e-06, + "loss": 0.7747, + "step": 8930 + }, + { + "epoch": 0.4915515438384061, + "grad_norm": 0.7297530174255371, + "learning_rate": 8.604872544501394e-06, + "loss": 0.7441, + "step": 8931 + }, + { + "epoch": 0.49160658264076174, + "grad_norm": 0.6732318997383118, + "learning_rate": 8.6045721552414e-06, + "loss": 0.7621, + "step": 8932 + }, + { + "epoch": 0.4916616214431174, + "grad_norm": 0.7010045647621155, + "learning_rate": 8.604271738890407e-06, + "loss": 0.7971, + "step": 8933 + }, + { + "epoch": 0.49171666024547306, + "grad_norm": 0.6996648907661438, + "learning_rate": 8.603971295450672e-06, + "loss": 0.8119, + "step": 8934 + }, + { + "epoch": 0.4917716990478287, + "grad_norm": 0.7679941058158875, + "learning_rate": 8.603670824924456e-06, + "loss": 0.8035, + "step": 8935 + }, + { + "epoch": 0.4918267378501844, + "grad_norm": 0.8009630441665649, + "learning_rate": 8.603370327314011e-06, + "loss": 0.7817, + "step": 8936 + }, + { + "epoch": 0.49188177665254, + "grad_norm": 0.7167709469795227, + "learning_rate": 8.603069802621601e-06, + "loss": 0.7621, + "step": 8937 + }, + { + "epoch": 0.4919368154548957, + "grad_norm": 0.7447960376739502, + "learning_rate": 8.602769250849483e-06, + "loss": 0.7664, + "step": 8938 + }, + { + "epoch": 0.49199185425725134, + "grad_norm": 0.653131365776062, + "learning_rate": 8.602468671999915e-06, + "loss": 0.6927, + "step": 8939 + }, + { + "epoch": 0.492046893059607, + "grad_norm": 0.6758691072463989, + "learning_rate": 8.602168066075158e-06, + "loss": 0.7519, + "step": 8940 + }, + { + "epoch": 0.49210193186196266, + "grad_norm": 0.9186220765113831, + "learning_rate": 8.60186743307747e-06, + "loss": 0.7265, + "step": 8941 + }, + { + "epoch": 0.49215697066431835, + "grad_norm": 0.6781855225563049, + "learning_rate": 8.60156677300911e-06, + "loss": 0.6719, + "step": 8942 + }, + { + "epoch": 0.492212009466674, + "grad_norm": 0.7262865304946899, + "learning_rate": 8.601266085872336e-06, + "loss": 0.6449, + "step": 8943 + }, + { + "epoch": 0.4922670482690297, + "grad_norm": 0.6877585053443909, + "learning_rate": 8.600965371669411e-06, + "loss": 0.6999, + "step": 8944 + }, + { + "epoch": 0.4923220870713853, + "grad_norm": 1.1133443117141724, + "learning_rate": 8.600664630402596e-06, + "loss": 0.7842, + "step": 8945 + }, + { + "epoch": 0.492377125873741, + "grad_norm": 0.643478274345398, + "learning_rate": 8.600363862074149e-06, + "loss": 0.7009, + "step": 8946 + }, + { + "epoch": 0.49243216467609663, + "grad_norm": 0.7692574262619019, + "learning_rate": 8.600063066686331e-06, + "loss": 0.7777, + "step": 8947 + }, + { + "epoch": 0.4924872034784523, + "grad_norm": 0.884963870048523, + "learning_rate": 8.599762244241403e-06, + "loss": 0.7789, + "step": 8948 + }, + { + "epoch": 0.49254224228080795, + "grad_norm": 0.6918813586235046, + "learning_rate": 8.599461394741624e-06, + "loss": 0.7769, + "step": 8949 + }, + { + "epoch": 0.49259728108316364, + "grad_norm": 0.7432044148445129, + "learning_rate": 8.599160518189258e-06, + "loss": 0.7972, + "step": 8950 + }, + { + "epoch": 0.4926523198855193, + "grad_norm": 0.7530491948127747, + "learning_rate": 8.598859614586564e-06, + "loss": 0.8812, + "step": 8951 + }, + { + "epoch": 0.49270735868787496, + "grad_norm": 0.8738592267036438, + "learning_rate": 8.598558683935806e-06, + "loss": 0.6967, + "step": 8952 + }, + { + "epoch": 0.4927623974902306, + "grad_norm": 1.032084584236145, + "learning_rate": 8.598257726239242e-06, + "loss": 0.8513, + "step": 8953 + }, + { + "epoch": 0.4928174362925863, + "grad_norm": 0.8717961311340332, + "learning_rate": 8.597956741499136e-06, + "loss": 0.7703, + "step": 8954 + }, + { + "epoch": 0.4928724750949419, + "grad_norm": 0.6788356900215149, + "learning_rate": 8.597655729717753e-06, + "loss": 0.7649, + "step": 8955 + }, + { + "epoch": 0.4929275138972976, + "grad_norm": 1.0595613718032837, + "learning_rate": 8.59735469089735e-06, + "loss": 0.6967, + "step": 8956 + }, + { + "epoch": 0.49298255269965324, + "grad_norm": 0.7583820819854736, + "learning_rate": 8.597053625040193e-06, + "loss": 0.8384, + "step": 8957 + }, + { + "epoch": 0.49303759150200893, + "grad_norm": 0.7232168912887573, + "learning_rate": 8.596752532148545e-06, + "loss": 0.7643, + "step": 8958 + }, + { + "epoch": 0.49309263030436457, + "grad_norm": 0.727190375328064, + "learning_rate": 8.596451412224666e-06, + "loss": 0.845, + "step": 8959 + }, + { + "epoch": 0.49314766910672025, + "grad_norm": 0.6844252347946167, + "learning_rate": 8.596150265270821e-06, + "loss": 0.7099, + "step": 8960 + }, + { + "epoch": 0.4932027079090759, + "grad_norm": 0.7379910945892334, + "learning_rate": 8.595849091289275e-06, + "loss": 0.8168, + "step": 8961 + }, + { + "epoch": 0.4932577467114316, + "grad_norm": 0.77718186378479, + "learning_rate": 8.595547890282288e-06, + "loss": 0.8457, + "step": 8962 + }, + { + "epoch": 0.4933127855137872, + "grad_norm": 0.686126172542572, + "learning_rate": 8.595246662252127e-06, + "loss": 0.7918, + "step": 8963 + }, + { + "epoch": 0.4933678243161429, + "grad_norm": 0.7406145930290222, + "learning_rate": 8.594945407201051e-06, + "loss": 0.6866, + "step": 8964 + }, + { + "epoch": 0.49342286311849853, + "grad_norm": 0.9543277025222778, + "learning_rate": 8.594644125131331e-06, + "loss": 0.8444, + "step": 8965 + }, + { + "epoch": 0.4934779019208542, + "grad_norm": 0.8659517765045166, + "learning_rate": 8.594342816045228e-06, + "loss": 0.7661, + "step": 8966 + }, + { + "epoch": 0.49353294072320986, + "grad_norm": 0.7289552092552185, + "learning_rate": 8.594041479945005e-06, + "loss": 0.7734, + "step": 8967 + }, + { + "epoch": 0.49358797952556555, + "grad_norm": 0.7232840657234192, + "learning_rate": 8.59374011683293e-06, + "loss": 0.8557, + "step": 8968 + }, + { + "epoch": 0.4936430183279212, + "grad_norm": 0.738684356212616, + "learning_rate": 8.593438726711265e-06, + "loss": 0.7779, + "step": 8969 + }, + { + "epoch": 0.49369805713027687, + "grad_norm": 0.7486668229103088, + "learning_rate": 8.593137309582276e-06, + "loss": 0.7326, + "step": 8970 + }, + { + "epoch": 0.4937530959326325, + "grad_norm": 0.6564297080039978, + "learning_rate": 8.59283586544823e-06, + "loss": 0.6927, + "step": 8971 + }, + { + "epoch": 0.4938081347349882, + "grad_norm": 0.722540557384491, + "learning_rate": 8.592534394311392e-06, + "loss": 0.7254, + "step": 8972 + }, + { + "epoch": 0.4938631735373438, + "grad_norm": 0.7466141581535339, + "learning_rate": 8.592232896174026e-06, + "loss": 0.8551, + "step": 8973 + }, + { + "epoch": 0.4939182123396995, + "grad_norm": 0.7819109559059143, + "learning_rate": 8.591931371038398e-06, + "loss": 0.7271, + "step": 8974 + }, + { + "epoch": 0.49397325114205515, + "grad_norm": 0.7847672700881958, + "learning_rate": 8.591629818906776e-06, + "loss": 0.8404, + "step": 8975 + }, + { + "epoch": 0.49402828994441084, + "grad_norm": 0.8167426586151123, + "learning_rate": 8.591328239781428e-06, + "loss": 0.7375, + "step": 8976 + }, + { + "epoch": 0.49408332874676647, + "grad_norm": 0.7894755005836487, + "learning_rate": 8.591026633664615e-06, + "loss": 0.7872, + "step": 8977 + }, + { + "epoch": 0.4941383675491221, + "grad_norm": 0.726204514503479, + "learning_rate": 8.590725000558609e-06, + "loss": 0.7289, + "step": 8978 + }, + { + "epoch": 0.4941934063514778, + "grad_norm": 0.7116577625274658, + "learning_rate": 8.590423340465675e-06, + "loss": 0.7379, + "step": 8979 + }, + { + "epoch": 0.4942484451538334, + "grad_norm": 0.7302193641662598, + "learning_rate": 8.59012165338808e-06, + "loss": 0.7951, + "step": 8980 + }, + { + "epoch": 0.4943034839561891, + "grad_norm": 0.680555522441864, + "learning_rate": 8.58981993932809e-06, + "loss": 0.7609, + "step": 8981 + }, + { + "epoch": 0.49435852275854475, + "grad_norm": 0.874546229839325, + "learning_rate": 8.589518198287976e-06, + "loss": 0.8025, + "step": 8982 + }, + { + "epoch": 0.49441356156090044, + "grad_norm": 0.7164583206176758, + "learning_rate": 8.589216430270004e-06, + "loss": 0.7466, + "step": 8983 + }, + { + "epoch": 0.49446860036325607, + "grad_norm": 0.9155141115188599, + "learning_rate": 8.588914635276442e-06, + "loss": 0.7896, + "step": 8984 + }, + { + "epoch": 0.49452363916561176, + "grad_norm": 0.6777059435844421, + "learning_rate": 8.588612813309558e-06, + "loss": 0.7468, + "step": 8985 + }, + { + "epoch": 0.4945786779679674, + "grad_norm": 0.7100371718406677, + "learning_rate": 8.58831096437162e-06, + "loss": 0.7216, + "step": 8986 + }, + { + "epoch": 0.4946337167703231, + "grad_norm": 0.6842584609985352, + "learning_rate": 8.5880090884649e-06, + "loss": 0.7103, + "step": 8987 + }, + { + "epoch": 0.4946887555726787, + "grad_norm": 0.6347573399543762, + "learning_rate": 8.587707185591661e-06, + "loss": 0.7103, + "step": 8988 + }, + { + "epoch": 0.4947437943750344, + "grad_norm": 0.7175829410552979, + "learning_rate": 8.587405255754177e-06, + "loss": 0.8375, + "step": 8989 + }, + { + "epoch": 0.49479883317739004, + "grad_norm": 0.8402735590934753, + "learning_rate": 8.587103298954715e-06, + "loss": 0.6841, + "step": 8990 + }, + { + "epoch": 0.49485387197974573, + "grad_norm": 0.6988743543624878, + "learning_rate": 8.586801315195545e-06, + "loss": 0.7637, + "step": 8991 + }, + { + "epoch": 0.49490891078210136, + "grad_norm": 0.6672561168670654, + "learning_rate": 8.586499304478934e-06, + "loss": 0.7103, + "step": 8992 + }, + { + "epoch": 0.49496394958445705, + "grad_norm": 0.6821330189704895, + "learning_rate": 8.586197266807158e-06, + "loss": 0.6881, + "step": 8993 + }, + { + "epoch": 0.4950189883868127, + "grad_norm": 0.7886170744895935, + "learning_rate": 8.585895202182482e-06, + "loss": 0.7892, + "step": 8994 + }, + { + "epoch": 0.4950740271891684, + "grad_norm": 0.7348074913024902, + "learning_rate": 8.585593110607177e-06, + "loss": 0.7835, + "step": 8995 + }, + { + "epoch": 0.495129065991524, + "grad_norm": 0.9375506639480591, + "learning_rate": 8.585290992083514e-06, + "loss": 0.8017, + "step": 8996 + }, + { + "epoch": 0.4951841047938797, + "grad_norm": 0.7442331910133362, + "learning_rate": 8.584988846613765e-06, + "loss": 0.72, + "step": 8997 + }, + { + "epoch": 0.49523914359623533, + "grad_norm": 0.7347918748855591, + "learning_rate": 8.584686674200197e-06, + "loss": 0.8229, + "step": 8998 + }, + { + "epoch": 0.495294182398591, + "grad_norm": 0.7168740630149841, + "learning_rate": 8.584384474845084e-06, + "loss": 0.7288, + "step": 8999 + }, + { + "epoch": 0.49534922120094665, + "grad_norm": 0.7834853529930115, + "learning_rate": 8.584082248550697e-06, + "loss": 0.8521, + "step": 9000 + }, + { + "epoch": 0.49540426000330234, + "grad_norm": 0.6499035358428955, + "learning_rate": 8.58377999531931e-06, + "loss": 0.6887, + "step": 9001 + }, + { + "epoch": 0.495459298805658, + "grad_norm": 0.8000181913375854, + "learning_rate": 8.583477715153189e-06, + "loss": 0.8688, + "step": 9002 + }, + { + "epoch": 0.49551433760801367, + "grad_norm": 0.7539342045783997, + "learning_rate": 8.58317540805461e-06, + "loss": 0.6151, + "step": 9003 + }, + { + "epoch": 0.4955693764103693, + "grad_norm": 0.7677812576293945, + "learning_rate": 8.582873074025841e-06, + "loss": 0.8168, + "step": 9004 + }, + { + "epoch": 0.495624415212725, + "grad_norm": 0.7679157853126526, + "learning_rate": 8.58257071306916e-06, + "loss": 0.7719, + "step": 9005 + }, + { + "epoch": 0.4956794540150806, + "grad_norm": 0.9745703935623169, + "learning_rate": 8.582268325186836e-06, + "loss": 0.8272, + "step": 9006 + }, + { + "epoch": 0.4957344928174363, + "grad_norm": 0.66932612657547, + "learning_rate": 8.581965910381143e-06, + "loss": 0.7256, + "step": 9007 + }, + { + "epoch": 0.49578953161979195, + "grad_norm": 0.7630981206893921, + "learning_rate": 8.581663468654351e-06, + "loss": 0.7594, + "step": 9008 + }, + { + "epoch": 0.49584457042214763, + "grad_norm": 0.7420778870582581, + "learning_rate": 8.581361000008737e-06, + "loss": 0.7834, + "step": 9009 + }, + { + "epoch": 0.49589960922450327, + "grad_norm": 0.6775205731391907, + "learning_rate": 8.58105850444657e-06, + "loss": 0.7609, + "step": 9010 + }, + { + "epoch": 0.49595464802685896, + "grad_norm": 0.6588264107704163, + "learning_rate": 8.580755981970128e-06, + "loss": 0.805, + "step": 9011 + }, + { + "epoch": 0.4960096868292146, + "grad_norm": 0.7325689196586609, + "learning_rate": 8.580453432581681e-06, + "loss": 0.8817, + "step": 9012 + }, + { + "epoch": 0.4960647256315703, + "grad_norm": 0.7319273948669434, + "learning_rate": 8.580150856283505e-06, + "loss": 0.8001, + "step": 9013 + }, + { + "epoch": 0.4961197644339259, + "grad_norm": 0.7841789126396179, + "learning_rate": 8.579848253077875e-06, + "loss": 0.8415, + "step": 9014 + }, + { + "epoch": 0.4961748032362816, + "grad_norm": 0.7593979239463806, + "learning_rate": 8.579545622967062e-06, + "loss": 0.8238, + "step": 9015 + }, + { + "epoch": 0.49622984203863724, + "grad_norm": 0.6938808560371399, + "learning_rate": 8.579242965953343e-06, + "loss": 0.7325, + "step": 9016 + }, + { + "epoch": 0.4962848808409929, + "grad_norm": 0.7907594442367554, + "learning_rate": 8.578940282038993e-06, + "loss": 0.6947, + "step": 9017 + }, + { + "epoch": 0.49633991964334856, + "grad_norm": 0.708703875541687, + "learning_rate": 8.578637571226283e-06, + "loss": 0.6712, + "step": 9018 + }, + { + "epoch": 0.49639495844570425, + "grad_norm": 0.6820377707481384, + "learning_rate": 8.578334833517492e-06, + "loss": 0.7269, + "step": 9019 + }, + { + "epoch": 0.4964499972480599, + "grad_norm": 0.6858653426170349, + "learning_rate": 8.578032068914896e-06, + "loss": 0.7325, + "step": 9020 + }, + { + "epoch": 0.4965050360504155, + "grad_norm": 0.8758736848831177, + "learning_rate": 8.577729277420768e-06, + "loss": 0.6652, + "step": 9021 + }, + { + "epoch": 0.4965600748527712, + "grad_norm": 0.731316328048706, + "learning_rate": 8.577426459037383e-06, + "loss": 0.7835, + "step": 9022 + }, + { + "epoch": 0.49661511365512684, + "grad_norm": 0.813778817653656, + "learning_rate": 8.57712361376702e-06, + "loss": 0.8025, + "step": 9023 + }, + { + "epoch": 0.4966701524574825, + "grad_norm": 0.7167351841926575, + "learning_rate": 8.576820741611952e-06, + "loss": 0.7483, + "step": 9024 + }, + { + "epoch": 0.49672519125983816, + "grad_norm": 0.7243192791938782, + "learning_rate": 8.576517842574457e-06, + "loss": 0.8411, + "step": 9025 + }, + { + "epoch": 0.49678023006219385, + "grad_norm": 0.5869036316871643, + "learning_rate": 8.576214916656814e-06, + "loss": 0.6661, + "step": 9026 + }, + { + "epoch": 0.4968352688645495, + "grad_norm": 0.7502203583717346, + "learning_rate": 8.575911963861293e-06, + "loss": 0.8838, + "step": 9027 + }, + { + "epoch": 0.4968903076669052, + "grad_norm": 0.687562108039856, + "learning_rate": 8.575608984190177e-06, + "loss": 0.7446, + "step": 9028 + }, + { + "epoch": 0.4969453464692608, + "grad_norm": 0.7735342383384705, + "learning_rate": 8.57530597764574e-06, + "loss": 0.8464, + "step": 9029 + }, + { + "epoch": 0.4970003852716165, + "grad_norm": 0.7828487753868103, + "learning_rate": 8.575002944230261e-06, + "loss": 0.7504, + "step": 9030 + }, + { + "epoch": 0.49705542407397213, + "grad_norm": 0.6359286904335022, + "learning_rate": 8.574699883946018e-06, + "loss": 0.6805, + "step": 9031 + }, + { + "epoch": 0.4971104628763278, + "grad_norm": 0.7462830543518066, + "learning_rate": 8.574396796795285e-06, + "loss": 0.8317, + "step": 9032 + }, + { + "epoch": 0.49716550167868345, + "grad_norm": 0.705115795135498, + "learning_rate": 8.574093682780344e-06, + "loss": 0.7401, + "step": 9033 + }, + { + "epoch": 0.49722054048103914, + "grad_norm": 0.6466538310050964, + "learning_rate": 8.573790541903472e-06, + "loss": 0.7761, + "step": 9034 + }, + { + "epoch": 0.4972755792833948, + "grad_norm": 0.7479867339134216, + "learning_rate": 8.573487374166946e-06, + "loss": 0.8394, + "step": 9035 + }, + { + "epoch": 0.49733061808575046, + "grad_norm": 0.7378019094467163, + "learning_rate": 8.573184179573046e-06, + "loss": 0.8215, + "step": 9036 + }, + { + "epoch": 0.4973856568881061, + "grad_norm": 0.6526094675064087, + "learning_rate": 8.57288095812405e-06, + "loss": 0.8055, + "step": 9037 + }, + { + "epoch": 0.4974406956904618, + "grad_norm": 0.679595947265625, + "learning_rate": 8.572577709822238e-06, + "loss": 0.8241, + "step": 9038 + }, + { + "epoch": 0.4974957344928174, + "grad_norm": 0.753466010093689, + "learning_rate": 8.572274434669886e-06, + "loss": 0.896, + "step": 9039 + }, + { + "epoch": 0.4975507732951731, + "grad_norm": 0.7068368792533875, + "learning_rate": 8.571971132669277e-06, + "loss": 0.778, + "step": 9040 + }, + { + "epoch": 0.49760581209752874, + "grad_norm": 0.7397973537445068, + "learning_rate": 8.571667803822689e-06, + "loss": 0.782, + "step": 9041 + }, + { + "epoch": 0.49766085089988443, + "grad_norm": 0.7837033271789551, + "learning_rate": 8.571364448132402e-06, + "loss": 0.7509, + "step": 9042 + }, + { + "epoch": 0.49771588970224007, + "grad_norm": 0.6808765530586243, + "learning_rate": 8.571061065600696e-06, + "loss": 0.672, + "step": 9043 + }, + { + "epoch": 0.49777092850459576, + "grad_norm": 0.6574100255966187, + "learning_rate": 8.570757656229852e-06, + "loss": 0.751, + "step": 9044 + }, + { + "epoch": 0.4978259673069514, + "grad_norm": 0.7357671856880188, + "learning_rate": 8.570454220022146e-06, + "loss": 0.7977, + "step": 9045 + }, + { + "epoch": 0.4978810061093071, + "grad_norm": 0.7937216758728027, + "learning_rate": 8.570150756979865e-06, + "loss": 0.8151, + "step": 9046 + }, + { + "epoch": 0.4979360449116627, + "grad_norm": 0.7050907611846924, + "learning_rate": 8.569847267105285e-06, + "loss": 0.7667, + "step": 9047 + }, + { + "epoch": 0.4979910837140184, + "grad_norm": 0.7105300426483154, + "learning_rate": 8.569543750400688e-06, + "loss": 0.7031, + "step": 9048 + }, + { + "epoch": 0.49804612251637403, + "grad_norm": 0.7174646854400635, + "learning_rate": 8.569240206868358e-06, + "loss": 0.7692, + "step": 9049 + }, + { + "epoch": 0.4981011613187297, + "grad_norm": 0.7525906562805176, + "learning_rate": 8.568936636510573e-06, + "loss": 0.7584, + "step": 9050 + }, + { + "epoch": 0.49815620012108536, + "grad_norm": 1.5518100261688232, + "learning_rate": 8.568633039329615e-06, + "loss": 0.7932, + "step": 9051 + }, + { + "epoch": 0.49821123892344105, + "grad_norm": 0.7037720084190369, + "learning_rate": 8.568329415327766e-06, + "loss": 0.8345, + "step": 9052 + }, + { + "epoch": 0.4982662777257967, + "grad_norm": 0.6422694325447083, + "learning_rate": 8.568025764507308e-06, + "loss": 0.7396, + "step": 9053 + }, + { + "epoch": 0.49832131652815237, + "grad_norm": 0.777306854724884, + "learning_rate": 8.567722086870525e-06, + "loss": 0.8605, + "step": 9054 + }, + { + "epoch": 0.498376355330508, + "grad_norm": 0.6619865298271179, + "learning_rate": 8.567418382419697e-06, + "loss": 0.7395, + "step": 9055 + }, + { + "epoch": 0.4984313941328637, + "grad_norm": 0.7214456796646118, + "learning_rate": 8.567114651157106e-06, + "loss": 0.7932, + "step": 9056 + }, + { + "epoch": 0.4984864329352193, + "grad_norm": 0.75806725025177, + "learning_rate": 8.566810893085037e-06, + "loss": 0.7998, + "step": 9057 + }, + { + "epoch": 0.498541471737575, + "grad_norm": 0.8089895844459534, + "learning_rate": 8.566507108205773e-06, + "loss": 0.7849, + "step": 9058 + }, + { + "epoch": 0.49859651053993065, + "grad_norm": 0.817814290523529, + "learning_rate": 8.566203296521597e-06, + "loss": 0.7261, + "step": 9059 + }, + { + "epoch": 0.49865154934228634, + "grad_norm": 0.7417539954185486, + "learning_rate": 8.56589945803479e-06, + "loss": 0.7087, + "step": 9060 + }, + { + "epoch": 0.49870658814464197, + "grad_norm": 0.7518000602722168, + "learning_rate": 8.565595592747639e-06, + "loss": 0.7245, + "step": 9061 + }, + { + "epoch": 0.49876162694699766, + "grad_norm": 0.9537304043769836, + "learning_rate": 8.565291700662423e-06, + "loss": 0.901, + "step": 9062 + }, + { + "epoch": 0.4988166657493533, + "grad_norm": 0.784545361995697, + "learning_rate": 8.56498778178143e-06, + "loss": 0.7813, + "step": 9063 + }, + { + "epoch": 0.4988717045517089, + "grad_norm": 0.9218429923057556, + "learning_rate": 8.564683836106945e-06, + "loss": 0.8452, + "step": 9064 + }, + { + "epoch": 0.4989267433540646, + "grad_norm": 0.6902065277099609, + "learning_rate": 8.56437986364125e-06, + "loss": 0.7527, + "step": 9065 + }, + { + "epoch": 0.49898178215642025, + "grad_norm": 0.7388677000999451, + "learning_rate": 8.56407586438663e-06, + "loss": 0.82, + "step": 9066 + }, + { + "epoch": 0.49903682095877594, + "grad_norm": 0.6959313154220581, + "learning_rate": 8.563771838345369e-06, + "loss": 0.7274, + "step": 9067 + }, + { + "epoch": 0.4990918597611316, + "grad_norm": 0.6582610607147217, + "learning_rate": 8.563467785519753e-06, + "loss": 0.6518, + "step": 9068 + }, + { + "epoch": 0.49914689856348726, + "grad_norm": 0.6525924801826477, + "learning_rate": 8.563163705912066e-06, + "loss": 0.7006, + "step": 9069 + }, + { + "epoch": 0.4992019373658429, + "grad_norm": 0.8092843890190125, + "learning_rate": 8.562859599524596e-06, + "loss": 0.6915, + "step": 9070 + }, + { + "epoch": 0.4992569761681986, + "grad_norm": 0.6540575623512268, + "learning_rate": 8.562555466359626e-06, + "loss": 0.6729, + "step": 9071 + }, + { + "epoch": 0.4993120149705542, + "grad_norm": 0.8220445513725281, + "learning_rate": 8.562251306419443e-06, + "loss": 0.8172, + "step": 9072 + }, + { + "epoch": 0.4993670537729099, + "grad_norm": 0.7461502552032471, + "learning_rate": 8.561947119706334e-06, + "loss": 0.6902, + "step": 9073 + }, + { + "epoch": 0.49942209257526554, + "grad_norm": 0.8166316151618958, + "learning_rate": 8.56164290622258e-06, + "loss": 0.8238, + "step": 9074 + }, + { + "epoch": 0.49947713137762123, + "grad_norm": 0.8453896641731262, + "learning_rate": 8.561338665970476e-06, + "loss": 0.7697, + "step": 9075 + }, + { + "epoch": 0.49953217017997686, + "grad_norm": 0.7606340050697327, + "learning_rate": 8.5610343989523e-06, + "loss": 0.6951, + "step": 9076 + }, + { + "epoch": 0.49958720898233255, + "grad_norm": 0.7408013343811035, + "learning_rate": 8.560730105170345e-06, + "loss": 0.8298, + "step": 9077 + }, + { + "epoch": 0.4996422477846882, + "grad_norm": 0.7625541090965271, + "learning_rate": 8.560425784626896e-06, + "loss": 0.6738, + "step": 9078 + }, + { + "epoch": 0.4996972865870439, + "grad_norm": 0.6940996646881104, + "learning_rate": 8.560121437324238e-06, + "loss": 0.78, + "step": 9079 + }, + { + "epoch": 0.4997523253893995, + "grad_norm": 0.8087461590766907, + "learning_rate": 8.559817063264661e-06, + "loss": 0.7831, + "step": 9080 + }, + { + "epoch": 0.4998073641917552, + "grad_norm": 0.7418510317802429, + "learning_rate": 8.559512662450452e-06, + "loss": 0.801, + "step": 9081 + }, + { + "epoch": 0.49986240299411083, + "grad_norm": 0.6793946027755737, + "learning_rate": 8.5592082348839e-06, + "loss": 0.7329, + "step": 9082 + }, + { + "epoch": 0.4999174417964665, + "grad_norm": 0.8197429180145264, + "learning_rate": 8.55890378056729e-06, + "loss": 0.804, + "step": 9083 + }, + { + "epoch": 0.49997248059882216, + "grad_norm": 0.7526460886001587, + "learning_rate": 8.558599299502912e-06, + "loss": 0.8378, + "step": 9084 + }, + { + "epoch": 0.5000275194011778, + "grad_norm": 0.8169133067131042, + "learning_rate": 8.558294791693055e-06, + "loss": 0.828, + "step": 9085 + }, + { + "epoch": 0.5000825582035335, + "grad_norm": 0.8386932015419006, + "learning_rate": 8.557990257140007e-06, + "loss": 0.7961, + "step": 9086 + }, + { + "epoch": 0.5001375970058891, + "grad_norm": 0.7183443903923035, + "learning_rate": 8.557685695846057e-06, + "loss": 0.6964, + "step": 9087 + }, + { + "epoch": 0.5001926358082448, + "grad_norm": 0.77079176902771, + "learning_rate": 8.557381107813491e-06, + "loss": 0.8222, + "step": 9088 + }, + { + "epoch": 0.5002476746106005, + "grad_norm": 0.6519342660903931, + "learning_rate": 8.557076493044603e-06, + "loss": 0.772, + "step": 9089 + }, + { + "epoch": 0.5003027134129562, + "grad_norm": 0.7039975523948669, + "learning_rate": 8.556771851541678e-06, + "loss": 0.7491, + "step": 9090 + }, + { + "epoch": 0.5003577522153118, + "grad_norm": 0.6459039449691772, + "learning_rate": 8.556467183307012e-06, + "loss": 0.7104, + "step": 9091 + }, + { + "epoch": 0.5004127910176674, + "grad_norm": 0.7359183430671692, + "learning_rate": 8.556162488342887e-06, + "loss": 0.829, + "step": 9092 + }, + { + "epoch": 0.5004678298200231, + "grad_norm": 0.7029602527618408, + "learning_rate": 8.555857766651599e-06, + "loss": 0.8163, + "step": 9093 + }, + { + "epoch": 0.5005228686223788, + "grad_norm": 0.6687049865722656, + "learning_rate": 8.555553018235435e-06, + "loss": 0.7589, + "step": 9094 + }, + { + "epoch": 0.5005779074247344, + "grad_norm": 0.7277147173881531, + "learning_rate": 8.555248243096686e-06, + "loss": 0.8334, + "step": 9095 + }, + { + "epoch": 0.5006329462270901, + "grad_norm": 0.6512065529823303, + "learning_rate": 8.554943441237642e-06, + "loss": 0.7174, + "step": 9096 + }, + { + "epoch": 0.5006879850294458, + "grad_norm": 0.725351095199585, + "learning_rate": 8.554638612660594e-06, + "loss": 0.6514, + "step": 9097 + }, + { + "epoch": 0.5007430238318015, + "grad_norm": 0.7983208894729614, + "learning_rate": 8.554333757367836e-06, + "loss": 0.8385, + "step": 9098 + }, + { + "epoch": 0.500798062634157, + "grad_norm": 0.6631388068199158, + "learning_rate": 8.554028875361657e-06, + "loss": 0.7103, + "step": 9099 + }, + { + "epoch": 0.5008531014365127, + "grad_norm": 0.730421245098114, + "learning_rate": 8.553723966644347e-06, + "loss": 0.8005, + "step": 9100 + }, + { + "epoch": 0.5009081402388684, + "grad_norm": 0.7385838627815247, + "learning_rate": 8.5534190312182e-06, + "loss": 0.7586, + "step": 9101 + }, + { + "epoch": 0.5009631790412241, + "grad_norm": 0.712458610534668, + "learning_rate": 8.553114069085506e-06, + "loss": 0.7587, + "step": 9102 + }, + { + "epoch": 0.5010182178435797, + "grad_norm": 0.7393542528152466, + "learning_rate": 8.552809080248559e-06, + "loss": 0.746, + "step": 9103 + }, + { + "epoch": 0.5010732566459354, + "grad_norm": 0.6596370935440063, + "learning_rate": 8.552504064709649e-06, + "loss": 0.6968, + "step": 9104 + }, + { + "epoch": 0.5011282954482911, + "grad_norm": 0.7340545654296875, + "learning_rate": 8.552199022471069e-06, + "loss": 0.8326, + "step": 9105 + }, + { + "epoch": 0.5011833342506467, + "grad_norm": 0.6586140990257263, + "learning_rate": 8.55189395353511e-06, + "loss": 0.7144, + "step": 9106 + }, + { + "epoch": 0.5012383730530023, + "grad_norm": 0.6875959038734436, + "learning_rate": 8.551588857904071e-06, + "loss": 0.721, + "step": 9107 + }, + { + "epoch": 0.501293411855358, + "grad_norm": 0.6754499077796936, + "learning_rate": 8.551283735580238e-06, + "loss": 0.6771, + "step": 9108 + }, + { + "epoch": 0.5013484506577137, + "grad_norm": 0.8027325868606567, + "learning_rate": 8.55097858656591e-06, + "loss": 0.8196, + "step": 9109 + }, + { + "epoch": 0.5014034894600693, + "grad_norm": 0.6992260217666626, + "learning_rate": 8.550673410863376e-06, + "loss": 0.7923, + "step": 9110 + }, + { + "epoch": 0.501458528262425, + "grad_norm": 0.741205632686615, + "learning_rate": 8.550368208474928e-06, + "loss": 0.7036, + "step": 9111 + }, + { + "epoch": 0.5015135670647807, + "grad_norm": 0.6485981345176697, + "learning_rate": 8.550062979402866e-06, + "loss": 0.6351, + "step": 9112 + }, + { + "epoch": 0.5015686058671364, + "grad_norm": 0.6984226703643799, + "learning_rate": 8.549757723649481e-06, + "loss": 0.7714, + "step": 9113 + }, + { + "epoch": 0.5016236446694919, + "grad_norm": 0.7773998975753784, + "learning_rate": 8.549452441217067e-06, + "loss": 0.8901, + "step": 9114 + }, + { + "epoch": 0.5016786834718476, + "grad_norm": 0.6912227272987366, + "learning_rate": 8.549147132107918e-06, + "loss": 0.7702, + "step": 9115 + }, + { + "epoch": 0.5017337222742033, + "grad_norm": 0.6742583513259888, + "learning_rate": 8.54884179632433e-06, + "loss": 0.7789, + "step": 9116 + }, + { + "epoch": 0.501788761076559, + "grad_norm": 0.7896195650100708, + "learning_rate": 8.548536433868595e-06, + "loss": 0.7358, + "step": 9117 + }, + { + "epoch": 0.5018437998789146, + "grad_norm": 0.7112523913383484, + "learning_rate": 8.548231044743011e-06, + "loss": 0.7286, + "step": 9118 + }, + { + "epoch": 0.5018988386812703, + "grad_norm": 0.9162774085998535, + "learning_rate": 8.547925628949873e-06, + "loss": 0.935, + "step": 9119 + }, + { + "epoch": 0.501953877483626, + "grad_norm": 0.6319599747657776, + "learning_rate": 8.547620186491477e-06, + "loss": 0.625, + "step": 9120 + }, + { + "epoch": 0.5020089162859817, + "grad_norm": 0.7074719667434692, + "learning_rate": 8.547314717370115e-06, + "loss": 0.6614, + "step": 9121 + }, + { + "epoch": 0.5020639550883372, + "grad_norm": 0.7417262196540833, + "learning_rate": 8.547009221588086e-06, + "loss": 0.8476, + "step": 9122 + }, + { + "epoch": 0.5021189938906929, + "grad_norm": 0.7057339549064636, + "learning_rate": 8.546703699147685e-06, + "loss": 0.805, + "step": 9123 + }, + { + "epoch": 0.5021740326930486, + "grad_norm": 0.7420887351036072, + "learning_rate": 8.546398150051207e-06, + "loss": 0.7331, + "step": 9124 + }, + { + "epoch": 0.5022290714954043, + "grad_norm": 0.9526195526123047, + "learning_rate": 8.546092574300953e-06, + "loss": 0.7803, + "step": 9125 + }, + { + "epoch": 0.5022841102977599, + "grad_norm": 0.748130202293396, + "learning_rate": 8.545786971899214e-06, + "loss": 0.7998, + "step": 9126 + }, + { + "epoch": 0.5023391491001156, + "grad_norm": 0.7266026139259338, + "learning_rate": 8.545481342848289e-06, + "loss": 0.8377, + "step": 9127 + }, + { + "epoch": 0.5023941879024713, + "grad_norm": 0.6762456893920898, + "learning_rate": 8.545175687150478e-06, + "loss": 0.7312, + "step": 9128 + }, + { + "epoch": 0.502449226704827, + "grad_norm": 0.7011429667472839, + "learning_rate": 8.544870004808072e-06, + "loss": 0.7666, + "step": 9129 + }, + { + "epoch": 0.5025042655071825, + "grad_norm": 0.6652229428291321, + "learning_rate": 8.544564295823375e-06, + "loss": 0.6904, + "step": 9130 + }, + { + "epoch": 0.5025593043095382, + "grad_norm": 0.8333765268325806, + "learning_rate": 8.54425856019868e-06, + "loss": 0.7318, + "step": 9131 + }, + { + "epoch": 0.5026143431118939, + "grad_norm": 0.6827245950698853, + "learning_rate": 8.543952797936285e-06, + "loss": 0.7692, + "step": 9132 + }, + { + "epoch": 0.5026693819142496, + "grad_norm": 0.8744323253631592, + "learning_rate": 8.543647009038491e-06, + "loss": 0.7316, + "step": 9133 + }, + { + "epoch": 0.5027244207166052, + "grad_norm": 0.7024276852607727, + "learning_rate": 8.543341193507594e-06, + "loss": 0.7008, + "step": 9134 + }, + { + "epoch": 0.5027794595189609, + "grad_norm": 0.8786055445671082, + "learning_rate": 8.543035351345895e-06, + "loss": 0.7054, + "step": 9135 + }, + { + "epoch": 0.5028344983213165, + "grad_norm": 0.727924108505249, + "learning_rate": 8.54272948255569e-06, + "loss": 0.8049, + "step": 9136 + }, + { + "epoch": 0.5028895371236722, + "grad_norm": 0.8366256356239319, + "learning_rate": 8.542423587139277e-06, + "loss": 0.7926, + "step": 9137 + }, + { + "epoch": 0.5029445759260278, + "grad_norm": 0.7657913565635681, + "learning_rate": 8.542117665098958e-06, + "loss": 0.8152, + "step": 9138 + }, + { + "epoch": 0.5029996147283835, + "grad_norm": 0.7543498277664185, + "learning_rate": 8.54181171643703e-06, + "loss": 0.7566, + "step": 9139 + }, + { + "epoch": 0.5030546535307392, + "grad_norm": 0.7771349549293518, + "learning_rate": 8.541505741155794e-06, + "loss": 0.7907, + "step": 9140 + }, + { + "epoch": 0.5031096923330949, + "grad_norm": 0.6661877632141113, + "learning_rate": 8.541199739257548e-06, + "loss": 0.7481, + "step": 9141 + }, + { + "epoch": 0.5031647311354505, + "grad_norm": 0.7700417637825012, + "learning_rate": 8.540893710744593e-06, + "loss": 0.7544, + "step": 9142 + }, + { + "epoch": 0.5032197699378061, + "grad_norm": 0.6476640105247498, + "learning_rate": 8.54058765561923e-06, + "loss": 0.7221, + "step": 9143 + }, + { + "epoch": 0.5032748087401618, + "grad_norm": 0.7098944187164307, + "learning_rate": 8.540281573883755e-06, + "loss": 0.8083, + "step": 9144 + }, + { + "epoch": 0.5033298475425175, + "grad_norm": 0.9733545184135437, + "learning_rate": 8.539975465540473e-06, + "loss": 0.7381, + "step": 9145 + }, + { + "epoch": 0.5033848863448731, + "grad_norm": 0.641211986541748, + "learning_rate": 8.539669330591685e-06, + "loss": 0.7511, + "step": 9146 + }, + { + "epoch": 0.5034399251472288, + "grad_norm": 0.626027524471283, + "learning_rate": 8.539363169039687e-06, + "loss": 0.7321, + "step": 9147 + }, + { + "epoch": 0.5034949639495845, + "grad_norm": 0.7627241611480713, + "learning_rate": 8.539056980886785e-06, + "loss": 0.7269, + "step": 9148 + }, + { + "epoch": 0.5035500027519401, + "grad_norm": 0.6711145639419556, + "learning_rate": 8.538750766135275e-06, + "loss": 0.8179, + "step": 9149 + }, + { + "epoch": 0.5036050415542958, + "grad_norm": 0.6981950998306274, + "learning_rate": 8.538444524787463e-06, + "loss": 0.8095, + "step": 9150 + }, + { + "epoch": 0.5036600803566514, + "grad_norm": 0.8869871497154236, + "learning_rate": 8.53813825684565e-06, + "loss": 0.8549, + "step": 9151 + }, + { + "epoch": 0.5037151191590071, + "grad_norm": 0.6461544036865234, + "learning_rate": 8.537831962312137e-06, + "loss": 0.7388, + "step": 9152 + }, + { + "epoch": 0.5037701579613627, + "grad_norm": 0.8279222249984741, + "learning_rate": 8.537525641189224e-06, + "loss": 0.8609, + "step": 9153 + }, + { + "epoch": 0.5038251967637184, + "grad_norm": 0.7117578387260437, + "learning_rate": 8.537219293479217e-06, + "loss": 0.802, + "step": 9154 + }, + { + "epoch": 0.5038802355660741, + "grad_norm": 0.6831860542297363, + "learning_rate": 8.536912919184416e-06, + "loss": 0.7821, + "step": 9155 + }, + { + "epoch": 0.5039352743684298, + "grad_norm": 1.1528539657592773, + "learning_rate": 8.536606518307125e-06, + "loss": 0.8578, + "step": 9156 + }, + { + "epoch": 0.5039903131707854, + "grad_norm": 0.6545060873031616, + "learning_rate": 8.536300090849645e-06, + "loss": 0.7744, + "step": 9157 + }, + { + "epoch": 0.504045351973141, + "grad_norm": 0.7176601886749268, + "learning_rate": 8.535993636814281e-06, + "loss": 0.8104, + "step": 9158 + }, + { + "epoch": 0.5041003907754967, + "grad_norm": 0.8458410501480103, + "learning_rate": 8.535687156203334e-06, + "loss": 0.8653, + "step": 9159 + }, + { + "epoch": 0.5041554295778524, + "grad_norm": 0.7500274777412415, + "learning_rate": 8.53538064901911e-06, + "loss": 0.8043, + "step": 9160 + }, + { + "epoch": 0.504210468380208, + "grad_norm": 0.6982965469360352, + "learning_rate": 8.535074115263911e-06, + "loss": 0.7564, + "step": 9161 + }, + { + "epoch": 0.5042655071825637, + "grad_norm": 0.8344218134880066, + "learning_rate": 8.534767554940042e-06, + "loss": 0.7575, + "step": 9162 + }, + { + "epoch": 0.5043205459849194, + "grad_norm": 0.7527137398719788, + "learning_rate": 8.534460968049806e-06, + "loss": 0.7757, + "step": 9163 + }, + { + "epoch": 0.5043755847872751, + "grad_norm": 0.7136969566345215, + "learning_rate": 8.534154354595508e-06, + "loss": 0.826, + "step": 9164 + }, + { + "epoch": 0.5044306235896306, + "grad_norm": 0.8102819919586182, + "learning_rate": 8.533847714579449e-06, + "loss": 0.7247, + "step": 9165 + }, + { + "epoch": 0.5044856623919863, + "grad_norm": 0.7568309903144836, + "learning_rate": 8.53354104800394e-06, + "loss": 0.8509, + "step": 9166 + }, + { + "epoch": 0.504540701194342, + "grad_norm": 0.7719592452049255, + "learning_rate": 8.53323435487128e-06, + "loss": 0.8039, + "step": 9167 + }, + { + "epoch": 0.5045957399966977, + "grad_norm": 0.7514411807060242, + "learning_rate": 8.532927635183778e-06, + "loss": 0.8759, + "step": 9168 + }, + { + "epoch": 0.5046507787990533, + "grad_norm": 0.9781903028488159, + "learning_rate": 8.532620888943736e-06, + "loss": 0.8022, + "step": 9169 + }, + { + "epoch": 0.504705817601409, + "grad_norm": 0.7713304758071899, + "learning_rate": 8.532314116153462e-06, + "loss": 0.8372, + "step": 9170 + }, + { + "epoch": 0.5047608564037647, + "grad_norm": 0.7519709467887878, + "learning_rate": 8.53200731681526e-06, + "loss": 0.7374, + "step": 9171 + }, + { + "epoch": 0.5048158952061204, + "grad_norm": 0.6923980712890625, + "learning_rate": 8.531700490931438e-06, + "loss": 0.7511, + "step": 9172 + }, + { + "epoch": 0.5048709340084759, + "grad_norm": 0.682357907295227, + "learning_rate": 8.5313936385043e-06, + "loss": 0.7647, + "step": 9173 + }, + { + "epoch": 0.5049259728108316, + "grad_norm": 0.8255659341812134, + "learning_rate": 8.531086759536152e-06, + "loss": 0.7533, + "step": 9174 + }, + { + "epoch": 0.5049810116131873, + "grad_norm": 0.6774975061416626, + "learning_rate": 8.530779854029301e-06, + "loss": 0.7019, + "step": 9175 + }, + { + "epoch": 0.505036050415543, + "grad_norm": 0.7973241209983826, + "learning_rate": 8.530472921986053e-06, + "loss": 0.7824, + "step": 9176 + }, + { + "epoch": 0.5050910892178986, + "grad_norm": 0.8216109275817871, + "learning_rate": 8.530165963408716e-06, + "loss": 0.8063, + "step": 9177 + }, + { + "epoch": 0.5051461280202543, + "grad_norm": 0.7277935743331909, + "learning_rate": 8.5298589782996e-06, + "loss": 0.7631, + "step": 9178 + }, + { + "epoch": 0.50520116682261, + "grad_norm": 0.6647855043411255, + "learning_rate": 8.529551966661004e-06, + "loss": 0.7462, + "step": 9179 + }, + { + "epoch": 0.5052562056249656, + "grad_norm": 0.766272783279419, + "learning_rate": 8.529244928495241e-06, + "loss": 0.8075, + "step": 9180 + }, + { + "epoch": 0.5053112444273212, + "grad_norm": 0.7276293635368347, + "learning_rate": 8.52893786380462e-06, + "loss": 0.7908, + "step": 9181 + }, + { + "epoch": 0.5053662832296769, + "grad_norm": 0.7864169478416443, + "learning_rate": 8.528630772591447e-06, + "loss": 0.8082, + "step": 9182 + }, + { + "epoch": 0.5054213220320326, + "grad_norm": 0.9106804132461548, + "learning_rate": 8.528323654858028e-06, + "loss": 0.8989, + "step": 9183 + }, + { + "epoch": 0.5054763608343883, + "grad_norm": 0.7288523316383362, + "learning_rate": 8.52801651060667e-06, + "loss": 0.7972, + "step": 9184 + }, + { + "epoch": 0.5055313996367439, + "grad_norm": 0.7149643301963806, + "learning_rate": 8.527709339839689e-06, + "loss": 0.8191, + "step": 9185 + }, + { + "epoch": 0.5055864384390996, + "grad_norm": 0.6661714911460876, + "learning_rate": 8.527402142559388e-06, + "loss": 0.6596, + "step": 9186 + }, + { + "epoch": 0.5056414772414553, + "grad_norm": 0.7071447372436523, + "learning_rate": 8.527094918768076e-06, + "loss": 0.7633, + "step": 9187 + }, + { + "epoch": 0.5056965160438109, + "grad_norm": 0.7314093112945557, + "learning_rate": 8.526787668468064e-06, + "loss": 0.7815, + "step": 9188 + }, + { + "epoch": 0.5057515548461665, + "grad_norm": 0.8200539946556091, + "learning_rate": 8.526480391661657e-06, + "loss": 0.8376, + "step": 9189 + }, + { + "epoch": 0.5058065936485222, + "grad_norm": 0.7422435283660889, + "learning_rate": 8.52617308835117e-06, + "loss": 0.8783, + "step": 9190 + }, + { + "epoch": 0.5058616324508779, + "grad_norm": 0.7845084071159363, + "learning_rate": 8.525865758538909e-06, + "loss": 0.8005, + "step": 9191 + }, + { + "epoch": 0.5059166712532335, + "grad_norm": 0.6854296922683716, + "learning_rate": 8.525558402227185e-06, + "loss": 0.8118, + "step": 9192 + }, + { + "epoch": 0.5059717100555892, + "grad_norm": 0.6805297136306763, + "learning_rate": 8.525251019418309e-06, + "loss": 0.6765, + "step": 9193 + }, + { + "epoch": 0.5060267488579449, + "grad_norm": 0.7194867134094238, + "learning_rate": 8.524943610114587e-06, + "loss": 0.6752, + "step": 9194 + }, + { + "epoch": 0.5060817876603005, + "grad_norm": 0.6935137510299683, + "learning_rate": 8.524636174318335e-06, + "loss": 0.7122, + "step": 9195 + }, + { + "epoch": 0.5061368264626561, + "grad_norm": 0.8652825951576233, + "learning_rate": 8.52432871203186e-06, + "loss": 0.7725, + "step": 9196 + }, + { + "epoch": 0.5061918652650118, + "grad_norm": 0.9104461669921875, + "learning_rate": 8.524021223257472e-06, + "loss": 0.8589, + "step": 9197 + }, + { + "epoch": 0.5062469040673675, + "grad_norm": 0.7680580019950867, + "learning_rate": 8.523713707997486e-06, + "loss": 0.842, + "step": 9198 + }, + { + "epoch": 0.5063019428697232, + "grad_norm": 0.7324872612953186, + "learning_rate": 8.52340616625421e-06, + "loss": 0.802, + "step": 9199 + }, + { + "epoch": 0.5063569816720788, + "grad_norm": 0.8812359571456909, + "learning_rate": 8.523098598029958e-06, + "loss": 0.8286, + "step": 9200 + }, + { + "epoch": 0.5064120204744345, + "grad_norm": 0.6992496848106384, + "learning_rate": 8.522791003327038e-06, + "loss": 0.811, + "step": 9201 + }, + { + "epoch": 0.5064670592767901, + "grad_norm": 0.8191942572593689, + "learning_rate": 8.522483382147766e-06, + "loss": 0.7192, + "step": 9202 + }, + { + "epoch": 0.5065220980791458, + "grad_norm": 0.9354501366615295, + "learning_rate": 8.522175734494452e-06, + "loss": 0.7424, + "step": 9203 + }, + { + "epoch": 0.5065771368815014, + "grad_norm": 0.6481999754905701, + "learning_rate": 8.521868060369405e-06, + "loss": 0.6385, + "step": 9204 + }, + { + "epoch": 0.5066321756838571, + "grad_norm": 0.7158499360084534, + "learning_rate": 8.521560359774943e-06, + "loss": 0.6116, + "step": 9205 + }, + { + "epoch": 0.5066872144862128, + "grad_norm": 0.8738408088684082, + "learning_rate": 8.521252632713376e-06, + "loss": 0.894, + "step": 9206 + }, + { + "epoch": 0.5067422532885685, + "grad_norm": 0.7037062644958496, + "learning_rate": 8.520944879187015e-06, + "loss": 0.6958, + "step": 9207 + }, + { + "epoch": 0.5067972920909241, + "grad_norm": 0.7205594778060913, + "learning_rate": 8.520637099198175e-06, + "loss": 0.7188, + "step": 9208 + }, + { + "epoch": 0.5068523308932797, + "grad_norm": 0.6761966347694397, + "learning_rate": 8.520329292749169e-06, + "loss": 0.7669, + "step": 9209 + }, + { + "epoch": 0.5069073696956354, + "grad_norm": 0.682556688785553, + "learning_rate": 8.520021459842312e-06, + "loss": 0.7745, + "step": 9210 + }, + { + "epoch": 0.5069624084979911, + "grad_norm": 0.6687794923782349, + "learning_rate": 8.519713600479913e-06, + "loss": 0.7814, + "step": 9211 + }, + { + "epoch": 0.5070174473003467, + "grad_norm": 0.6391967535018921, + "learning_rate": 8.51940571466429e-06, + "loss": 0.7331, + "step": 9212 + }, + { + "epoch": 0.5070724861027024, + "grad_norm": 0.8420151472091675, + "learning_rate": 8.519097802397758e-06, + "loss": 0.8257, + "step": 9213 + }, + { + "epoch": 0.5071275249050581, + "grad_norm": 0.692787230014801, + "learning_rate": 8.518789863682625e-06, + "loss": 0.7179, + "step": 9214 + }, + { + "epoch": 0.5071825637074138, + "grad_norm": 0.6874318718910217, + "learning_rate": 8.518481898521213e-06, + "loss": 0.6847, + "step": 9215 + }, + { + "epoch": 0.5072376025097693, + "grad_norm": 0.8107750415802002, + "learning_rate": 8.518173906915832e-06, + "loss": 0.8459, + "step": 9216 + }, + { + "epoch": 0.507292641312125, + "grad_norm": 0.7952812910079956, + "learning_rate": 8.517865888868797e-06, + "loss": 0.8503, + "step": 9217 + }, + { + "epoch": 0.5073476801144807, + "grad_norm": 0.6926921606063843, + "learning_rate": 8.517557844382424e-06, + "loss": 0.6713, + "step": 9218 + }, + { + "epoch": 0.5074027189168364, + "grad_norm": 0.8203585147857666, + "learning_rate": 8.517249773459026e-06, + "loss": 0.8483, + "step": 9219 + }, + { + "epoch": 0.507457757719192, + "grad_norm": 0.6788125038146973, + "learning_rate": 8.516941676100923e-06, + "loss": 0.7521, + "step": 9220 + }, + { + "epoch": 0.5075127965215477, + "grad_norm": 0.6439838409423828, + "learning_rate": 8.516633552310426e-06, + "loss": 0.7359, + "step": 9221 + }, + { + "epoch": 0.5075678353239034, + "grad_norm": 0.6872217655181885, + "learning_rate": 8.516325402089854e-06, + "loss": 0.73, + "step": 9222 + }, + { + "epoch": 0.5076228741262591, + "grad_norm": 0.6695985794067383, + "learning_rate": 8.51601722544152e-06, + "loss": 0.7519, + "step": 9223 + }, + { + "epoch": 0.5076779129286146, + "grad_norm": 0.7779402136802673, + "learning_rate": 8.515709022367741e-06, + "loss": 0.7325, + "step": 9224 + }, + { + "epoch": 0.5077329517309703, + "grad_norm": 0.9289746284484863, + "learning_rate": 8.515400792870836e-06, + "loss": 0.7839, + "step": 9225 + }, + { + "epoch": 0.507787990533326, + "grad_norm": 0.6949248313903809, + "learning_rate": 8.51509253695312e-06, + "loss": 0.7363, + "step": 9226 + }, + { + "epoch": 0.5078430293356817, + "grad_norm": 0.6463130116462708, + "learning_rate": 8.514784254616908e-06, + "loss": 0.7607, + "step": 9227 + }, + { + "epoch": 0.5078980681380373, + "grad_norm": 0.7332046031951904, + "learning_rate": 8.514475945864519e-06, + "loss": 0.6833, + "step": 9228 + }, + { + "epoch": 0.507953106940393, + "grad_norm": 0.8674100637435913, + "learning_rate": 8.51416761069827e-06, + "loss": 0.669, + "step": 9229 + }, + { + "epoch": 0.5080081457427487, + "grad_norm": 0.8073185682296753, + "learning_rate": 8.513859249120477e-06, + "loss": 0.7215, + "step": 9230 + }, + { + "epoch": 0.5080631845451044, + "grad_norm": 0.674117386341095, + "learning_rate": 8.51355086113346e-06, + "loss": 0.7813, + "step": 9231 + }, + { + "epoch": 0.5081182233474599, + "grad_norm": 0.8564596176147461, + "learning_rate": 8.513242446739534e-06, + "loss": 0.7393, + "step": 9232 + }, + { + "epoch": 0.5081732621498156, + "grad_norm": 0.684637188911438, + "learning_rate": 8.512934005941015e-06, + "loss": 0.781, + "step": 9233 + }, + { + "epoch": 0.5082283009521713, + "grad_norm": 0.816123902797699, + "learning_rate": 8.51262553874023e-06, + "loss": 0.8597, + "step": 9234 + }, + { + "epoch": 0.5082833397545269, + "grad_norm": 0.6582320332527161, + "learning_rate": 8.512317045139488e-06, + "loss": 0.6654, + "step": 9235 + }, + { + "epoch": 0.5083383785568826, + "grad_norm": 1.0153518915176392, + "learning_rate": 8.512008525141113e-06, + "loss": 0.7946, + "step": 9236 + }, + { + "epoch": 0.5083934173592383, + "grad_norm": 0.7455416917800903, + "learning_rate": 8.511699978747422e-06, + "loss": 0.8365, + "step": 9237 + }, + { + "epoch": 0.508448456161594, + "grad_norm": 0.6498221755027771, + "learning_rate": 8.511391405960733e-06, + "loss": 0.7252, + "step": 9238 + }, + { + "epoch": 0.5085034949639495, + "grad_norm": 0.6856792569160461, + "learning_rate": 8.511082806783368e-06, + "loss": 0.7282, + "step": 9239 + }, + { + "epoch": 0.5085585337663052, + "grad_norm": 0.6930065751075745, + "learning_rate": 8.510774181217643e-06, + "loss": 0.7404, + "step": 9240 + }, + { + "epoch": 0.5086135725686609, + "grad_norm": 0.6953150033950806, + "learning_rate": 8.51046552926588e-06, + "loss": 0.7684, + "step": 9241 + }, + { + "epoch": 0.5086686113710166, + "grad_norm": 0.7307711839675903, + "learning_rate": 8.510156850930395e-06, + "loss": 0.7557, + "step": 9242 + }, + { + "epoch": 0.5087236501733722, + "grad_norm": 0.7296478152275085, + "learning_rate": 8.509848146213513e-06, + "loss": 0.7469, + "step": 9243 + }, + { + "epoch": 0.5087786889757279, + "grad_norm": 0.7035672664642334, + "learning_rate": 8.509539415117553e-06, + "loss": 0.7151, + "step": 9244 + }, + { + "epoch": 0.5088337277780836, + "grad_norm": 0.7818698883056641, + "learning_rate": 8.509230657644832e-06, + "loss": 0.7134, + "step": 9245 + }, + { + "epoch": 0.5088887665804392, + "grad_norm": 0.7503119111061096, + "learning_rate": 8.508921873797674e-06, + "loss": 0.7028, + "step": 9246 + }, + { + "epoch": 0.5089438053827948, + "grad_norm": 0.7733498215675354, + "learning_rate": 8.508613063578397e-06, + "loss": 0.8159, + "step": 9247 + }, + { + "epoch": 0.5089988441851505, + "grad_norm": 0.9236353635787964, + "learning_rate": 8.508304226989326e-06, + "loss": 0.8013, + "step": 9248 + }, + { + "epoch": 0.5090538829875062, + "grad_norm": 0.6567198634147644, + "learning_rate": 8.507995364032777e-06, + "loss": 0.8285, + "step": 9249 + }, + { + "epoch": 0.5091089217898619, + "grad_norm": 0.6555445790290833, + "learning_rate": 8.507686474711074e-06, + "loss": 0.6917, + "step": 9250 + }, + { + "epoch": 0.5091639605922175, + "grad_norm": 0.8505375385284424, + "learning_rate": 8.507377559026539e-06, + "loss": 0.824, + "step": 9251 + }, + { + "epoch": 0.5092189993945732, + "grad_norm": 0.703413188457489, + "learning_rate": 8.507068616981493e-06, + "loss": 0.7162, + "step": 9252 + }, + { + "epoch": 0.5092740381969288, + "grad_norm": 0.7257823944091797, + "learning_rate": 8.50675964857826e-06, + "loss": 0.8031, + "step": 9253 + }, + { + "epoch": 0.5093290769992845, + "grad_norm": 0.6861198544502258, + "learning_rate": 8.506450653819159e-06, + "loss": 0.7724, + "step": 9254 + }, + { + "epoch": 0.5093841158016401, + "grad_norm": 0.7733107209205627, + "learning_rate": 8.506141632706512e-06, + "loss": 0.7834, + "step": 9255 + }, + { + "epoch": 0.5094391546039958, + "grad_norm": 0.7472217082977295, + "learning_rate": 8.505832585242644e-06, + "loss": 0.7594, + "step": 9256 + }, + { + "epoch": 0.5094941934063515, + "grad_norm": 0.6273325085639954, + "learning_rate": 8.505523511429876e-06, + "loss": 0.6798, + "step": 9257 + }, + { + "epoch": 0.5095492322087072, + "grad_norm": 0.7366517186164856, + "learning_rate": 8.505214411270533e-06, + "loss": 0.7916, + "step": 9258 + }, + { + "epoch": 0.5096042710110628, + "grad_norm": 0.6654453873634338, + "learning_rate": 8.504905284766936e-06, + "loss": 0.7228, + "step": 9259 + }, + { + "epoch": 0.5096593098134184, + "grad_norm": 0.7926275134086609, + "learning_rate": 8.50459613192141e-06, + "loss": 0.8303, + "step": 9260 + }, + { + "epoch": 0.5097143486157741, + "grad_norm": 0.7256377935409546, + "learning_rate": 8.504286952736277e-06, + "loss": 0.7977, + "step": 9261 + }, + { + "epoch": 0.5097693874181298, + "grad_norm": 0.7333946824073792, + "learning_rate": 8.50397774721386e-06, + "loss": 0.7978, + "step": 9262 + }, + { + "epoch": 0.5098244262204854, + "grad_norm": 0.6102882623672485, + "learning_rate": 8.503668515356485e-06, + "loss": 0.6386, + "step": 9263 + }, + { + "epoch": 0.5098794650228411, + "grad_norm": 0.7939823865890503, + "learning_rate": 8.503359257166477e-06, + "loss": 0.7328, + "step": 9264 + }, + { + "epoch": 0.5099345038251968, + "grad_norm": 0.7245013117790222, + "learning_rate": 8.503049972646157e-06, + "loss": 0.795, + "step": 9265 + }, + { + "epoch": 0.5099895426275525, + "grad_norm": 0.6722108125686646, + "learning_rate": 8.502740661797852e-06, + "loss": 0.7062, + "step": 9266 + }, + { + "epoch": 0.510044581429908, + "grad_norm": 0.6759012341499329, + "learning_rate": 8.502431324623884e-06, + "loss": 0.7427, + "step": 9267 + }, + { + "epoch": 0.5100996202322637, + "grad_norm": 0.6448835730552673, + "learning_rate": 8.502121961126581e-06, + "loss": 0.7381, + "step": 9268 + }, + { + "epoch": 0.5101546590346194, + "grad_norm": 0.6437426209449768, + "learning_rate": 8.501812571308266e-06, + "loss": 0.6733, + "step": 9269 + }, + { + "epoch": 0.5102096978369751, + "grad_norm": 0.6879013776779175, + "learning_rate": 8.501503155171267e-06, + "loss": 0.7227, + "step": 9270 + }, + { + "epoch": 0.5102647366393307, + "grad_norm": 0.6628512740135193, + "learning_rate": 8.501193712717906e-06, + "loss": 0.7151, + "step": 9271 + }, + { + "epoch": 0.5103197754416864, + "grad_norm": 0.7653747797012329, + "learning_rate": 8.500884243950511e-06, + "loss": 0.8189, + "step": 9272 + }, + { + "epoch": 0.5103748142440421, + "grad_norm": 0.7180060148239136, + "learning_rate": 8.500574748871407e-06, + "loss": 0.7633, + "step": 9273 + }, + { + "epoch": 0.5104298530463978, + "grad_norm": 0.7045086622238159, + "learning_rate": 8.50026522748292e-06, + "loss": 0.746, + "step": 9274 + }, + { + "epoch": 0.5104848918487533, + "grad_norm": 0.6224614381790161, + "learning_rate": 8.499955679787376e-06, + "loss": 0.7436, + "step": 9275 + }, + { + "epoch": 0.510539930651109, + "grad_norm": 0.6716495156288147, + "learning_rate": 8.499646105787103e-06, + "loss": 0.8006, + "step": 9276 + }, + { + "epoch": 0.5105949694534647, + "grad_norm": 0.83705735206604, + "learning_rate": 8.499336505484426e-06, + "loss": 0.886, + "step": 9277 + }, + { + "epoch": 0.5106500082558203, + "grad_norm": 0.7942199110984802, + "learning_rate": 8.499026878881673e-06, + "loss": 0.7709, + "step": 9278 + }, + { + "epoch": 0.510705047058176, + "grad_norm": 0.7500330209732056, + "learning_rate": 8.49871722598117e-06, + "loss": 0.7737, + "step": 9279 + }, + { + "epoch": 0.5107600858605317, + "grad_norm": 0.7283433675765991, + "learning_rate": 8.498407546785245e-06, + "loss": 0.8345, + "step": 9280 + }, + { + "epoch": 0.5108151246628874, + "grad_norm": 0.6970989108085632, + "learning_rate": 8.498097841296224e-06, + "loss": 0.7451, + "step": 9281 + }, + { + "epoch": 0.5108701634652429, + "grad_norm": 0.8338573575019836, + "learning_rate": 8.497788109516438e-06, + "loss": 0.8198, + "step": 9282 + }, + { + "epoch": 0.5109252022675986, + "grad_norm": 0.6544861197471619, + "learning_rate": 8.497478351448213e-06, + "loss": 0.7549, + "step": 9283 + }, + { + "epoch": 0.5109802410699543, + "grad_norm": 0.6627360582351685, + "learning_rate": 8.497168567093876e-06, + "loss": 0.7136, + "step": 9284 + }, + { + "epoch": 0.51103527987231, + "grad_norm": 0.7176669239997864, + "learning_rate": 8.496858756455755e-06, + "loss": 0.766, + "step": 9285 + }, + { + "epoch": 0.5110903186746656, + "grad_norm": 0.8260897397994995, + "learning_rate": 8.496548919536183e-06, + "loss": 0.8167, + "step": 9286 + }, + { + "epoch": 0.5111453574770213, + "grad_norm": 0.7077773809432983, + "learning_rate": 8.496239056337483e-06, + "loss": 0.776, + "step": 9287 + }, + { + "epoch": 0.511200396279377, + "grad_norm": 0.7609447836875916, + "learning_rate": 8.495929166861988e-06, + "loss": 0.7339, + "step": 9288 + }, + { + "epoch": 0.5112554350817327, + "grad_norm": 0.6896487474441528, + "learning_rate": 8.495619251112022e-06, + "loss": 0.7639, + "step": 9289 + }, + { + "epoch": 0.5113104738840882, + "grad_norm": 0.6946871280670166, + "learning_rate": 8.495309309089918e-06, + "loss": 0.8242, + "step": 9290 + }, + { + "epoch": 0.5113655126864439, + "grad_norm": 0.79847252368927, + "learning_rate": 8.494999340798007e-06, + "loss": 0.8226, + "step": 9291 + }, + { + "epoch": 0.5114205514887996, + "grad_norm": 0.7845447063446045, + "learning_rate": 8.494689346238615e-06, + "loss": 0.8593, + "step": 9292 + }, + { + "epoch": 0.5114755902911553, + "grad_norm": 1.1577119827270508, + "learning_rate": 8.494379325414074e-06, + "loss": 0.746, + "step": 9293 + }, + { + "epoch": 0.5115306290935109, + "grad_norm": 0.6720938682556152, + "learning_rate": 8.494069278326713e-06, + "loss": 0.6768, + "step": 9294 + }, + { + "epoch": 0.5115856678958666, + "grad_norm": 0.7389395833015442, + "learning_rate": 8.493759204978862e-06, + "loss": 0.8126, + "step": 9295 + }, + { + "epoch": 0.5116407066982223, + "grad_norm": 0.7629536986351013, + "learning_rate": 8.493449105372853e-06, + "loss": 0.7107, + "step": 9296 + }, + { + "epoch": 0.511695745500578, + "grad_norm": 0.7339474558830261, + "learning_rate": 8.493138979511015e-06, + "loss": 0.8144, + "step": 9297 + }, + { + "epoch": 0.5117507843029335, + "grad_norm": 0.7222825288772583, + "learning_rate": 8.49282882739568e-06, + "loss": 0.7512, + "step": 9298 + }, + { + "epoch": 0.5118058231052892, + "grad_norm": 0.676659107208252, + "learning_rate": 8.49251864902918e-06, + "loss": 0.6515, + "step": 9299 + }, + { + "epoch": 0.5118608619076449, + "grad_norm": 0.6336323618888855, + "learning_rate": 8.492208444413844e-06, + "loss": 0.719, + "step": 9300 + }, + { + "epoch": 0.5119159007100006, + "grad_norm": 0.701543927192688, + "learning_rate": 8.491898213552e-06, + "loss": 0.728, + "step": 9301 + }, + { + "epoch": 0.5119709395123562, + "grad_norm": 0.6809069514274597, + "learning_rate": 8.491587956445988e-06, + "loss": 0.8844, + "step": 9302 + }, + { + "epoch": 0.5120259783147119, + "grad_norm": 0.8046489357948303, + "learning_rate": 8.491277673098135e-06, + "loss": 0.817, + "step": 9303 + }, + { + "epoch": 0.5120810171170675, + "grad_norm": 0.8630616068840027, + "learning_rate": 8.490967363510774e-06, + "loss": 0.7745, + "step": 9304 + }, + { + "epoch": 0.5121360559194232, + "grad_norm": 0.7457678914070129, + "learning_rate": 8.490657027686235e-06, + "loss": 0.7956, + "step": 9305 + }, + { + "epoch": 0.5121910947217788, + "grad_norm": 0.6383466124534607, + "learning_rate": 8.490346665626854e-06, + "loss": 0.8046, + "step": 9306 + }, + { + "epoch": 0.5122461335241345, + "grad_norm": 0.7658202052116394, + "learning_rate": 8.49003627733496e-06, + "loss": 0.7905, + "step": 9307 + }, + { + "epoch": 0.5123011723264902, + "grad_norm": 0.6793283224105835, + "learning_rate": 8.48972586281289e-06, + "loss": 0.6646, + "step": 9308 + }, + { + "epoch": 0.5123562111288459, + "grad_norm": 0.7345246076583862, + "learning_rate": 8.489415422062972e-06, + "loss": 0.788, + "step": 9309 + }, + { + "epoch": 0.5124112499312015, + "grad_norm": 0.6665463447570801, + "learning_rate": 8.489104955087542e-06, + "loss": 0.706, + "step": 9310 + }, + { + "epoch": 0.5124662887335572, + "grad_norm": 0.7895458936691284, + "learning_rate": 8.488794461888934e-06, + "loss": 0.7464, + "step": 9311 + }, + { + "epoch": 0.5125213275359128, + "grad_norm": 0.7375221252441406, + "learning_rate": 8.488483942469481e-06, + "loss": 0.8029, + "step": 9312 + }, + { + "epoch": 0.5125763663382685, + "grad_norm": 0.792348325252533, + "learning_rate": 8.488173396831514e-06, + "loss": 0.7324, + "step": 9313 + }, + { + "epoch": 0.5126314051406241, + "grad_norm": 0.6500192880630493, + "learning_rate": 8.487862824977373e-06, + "loss": 0.7331, + "step": 9314 + }, + { + "epoch": 0.5126864439429798, + "grad_norm": 0.6607314348220825, + "learning_rate": 8.487552226909386e-06, + "loss": 0.7782, + "step": 9315 + }, + { + "epoch": 0.5127414827453355, + "grad_norm": 0.8261791467666626, + "learning_rate": 8.487241602629892e-06, + "loss": 0.8036, + "step": 9316 + }, + { + "epoch": 0.5127965215476912, + "grad_norm": 0.8301663994789124, + "learning_rate": 8.486930952141222e-06, + "loss": 0.7928, + "step": 9317 + }, + { + "epoch": 0.5128515603500468, + "grad_norm": 0.6957940459251404, + "learning_rate": 8.486620275445713e-06, + "loss": 0.7359, + "step": 9318 + }, + { + "epoch": 0.5129065991524024, + "grad_norm": 0.7562606334686279, + "learning_rate": 8.4863095725457e-06, + "loss": 0.7546, + "step": 9319 + }, + { + "epoch": 0.5129616379547581, + "grad_norm": 0.795886218547821, + "learning_rate": 8.485998843443517e-06, + "loss": 0.7558, + "step": 9320 + }, + { + "epoch": 0.5130166767571137, + "grad_norm": 0.6558147072792053, + "learning_rate": 8.4856880881415e-06, + "loss": 0.6832, + "step": 9321 + }, + { + "epoch": 0.5130717155594694, + "grad_norm": 0.7300151586532593, + "learning_rate": 8.485377306641984e-06, + "loss": 0.8018, + "step": 9322 + }, + { + "epoch": 0.5131267543618251, + "grad_norm": 0.7114105224609375, + "learning_rate": 8.485066498947305e-06, + "loss": 0.7374, + "step": 9323 + }, + { + "epoch": 0.5131817931641808, + "grad_norm": 0.7061085104942322, + "learning_rate": 8.484755665059798e-06, + "loss": 0.7905, + "step": 9324 + }, + { + "epoch": 0.5132368319665364, + "grad_norm": 0.8481647968292236, + "learning_rate": 8.484444804981802e-06, + "loss": 0.8518, + "step": 9325 + }, + { + "epoch": 0.513291870768892, + "grad_norm": 0.7583557367324829, + "learning_rate": 8.48413391871565e-06, + "loss": 0.8328, + "step": 9326 + }, + { + "epoch": 0.5133469095712477, + "grad_norm": 0.7381925582885742, + "learning_rate": 8.483823006263683e-06, + "loss": 0.76, + "step": 9327 + }, + { + "epoch": 0.5134019483736034, + "grad_norm": 0.8037852644920349, + "learning_rate": 8.483512067628232e-06, + "loss": 0.711, + "step": 9328 + }, + { + "epoch": 0.513456987175959, + "grad_norm": 0.6682618260383606, + "learning_rate": 8.483201102811637e-06, + "loss": 0.7479, + "step": 9329 + }, + { + "epoch": 0.5135120259783147, + "grad_norm": 0.662234365940094, + "learning_rate": 8.482890111816237e-06, + "loss": 0.7701, + "step": 9330 + }, + { + "epoch": 0.5135670647806704, + "grad_norm": 0.7081482410430908, + "learning_rate": 8.482579094644365e-06, + "loss": 0.8255, + "step": 9331 + }, + { + "epoch": 0.5136221035830261, + "grad_norm": 0.9659954905509949, + "learning_rate": 8.482268051298364e-06, + "loss": 0.8742, + "step": 9332 + }, + { + "epoch": 0.5136771423853816, + "grad_norm": 0.7837772369384766, + "learning_rate": 8.481956981780564e-06, + "loss": 0.7692, + "step": 9333 + }, + { + "epoch": 0.5137321811877373, + "grad_norm": 0.681918203830719, + "learning_rate": 8.481645886093311e-06, + "loss": 0.6952, + "step": 9334 + }, + { + "epoch": 0.513787219990093, + "grad_norm": 0.7253187894821167, + "learning_rate": 8.481334764238937e-06, + "loss": 0.7074, + "step": 9335 + }, + { + "epoch": 0.5138422587924487, + "grad_norm": 0.8845877051353455, + "learning_rate": 8.481023616219783e-06, + "loss": 0.675, + "step": 9336 + }, + { + "epoch": 0.5138972975948043, + "grad_norm": 0.6569344401359558, + "learning_rate": 8.480712442038188e-06, + "loss": 0.7181, + "step": 9337 + }, + { + "epoch": 0.51395233639716, + "grad_norm": 0.7372813820838928, + "learning_rate": 8.480401241696491e-06, + "loss": 0.8137, + "step": 9338 + }, + { + "epoch": 0.5140073751995157, + "grad_norm": 0.843099057674408, + "learning_rate": 8.48009001519703e-06, + "loss": 0.7648, + "step": 9339 + }, + { + "epoch": 0.5140624140018714, + "grad_norm": 0.7762032747268677, + "learning_rate": 8.479778762542142e-06, + "loss": 0.7805, + "step": 9340 + }, + { + "epoch": 0.5141174528042269, + "grad_norm": 0.739086925983429, + "learning_rate": 8.479467483734169e-06, + "loss": 0.7125, + "step": 9341 + }, + { + "epoch": 0.5141724916065826, + "grad_norm": 0.7351683974266052, + "learning_rate": 8.479156178775451e-06, + "loss": 0.7855, + "step": 9342 + }, + { + "epoch": 0.5142275304089383, + "grad_norm": 0.7601314187049866, + "learning_rate": 8.478844847668325e-06, + "loss": 0.8349, + "step": 9343 + }, + { + "epoch": 0.514282569211294, + "grad_norm": 0.6841638684272766, + "learning_rate": 8.478533490415133e-06, + "loss": 0.7986, + "step": 9344 + }, + { + "epoch": 0.5143376080136496, + "grad_norm": 0.6734872460365295, + "learning_rate": 8.478222107018213e-06, + "loss": 0.6941, + "step": 9345 + }, + { + "epoch": 0.5143926468160053, + "grad_norm": 0.801930844783783, + "learning_rate": 8.47791069747991e-06, + "loss": 0.8537, + "step": 9346 + }, + { + "epoch": 0.514447685618361, + "grad_norm": 0.6960629224777222, + "learning_rate": 8.477599261802558e-06, + "loss": 0.6629, + "step": 9347 + }, + { + "epoch": 0.5145027244207167, + "grad_norm": 0.7791358232498169, + "learning_rate": 8.477287799988502e-06, + "loss": 0.8777, + "step": 9348 + }, + { + "epoch": 0.5145577632230722, + "grad_norm": 0.7022722959518433, + "learning_rate": 8.476976312040082e-06, + "loss": 0.7116, + "step": 9349 + }, + { + "epoch": 0.5146128020254279, + "grad_norm": 0.7791306376457214, + "learning_rate": 8.476664797959639e-06, + "loss": 0.7262, + "step": 9350 + }, + { + "epoch": 0.5146678408277836, + "grad_norm": 0.7391177415847778, + "learning_rate": 8.476353257749514e-06, + "loss": 0.7308, + "step": 9351 + }, + { + "epoch": 0.5147228796301393, + "grad_norm": 0.6989552974700928, + "learning_rate": 8.476041691412046e-06, + "loss": 0.7754, + "step": 9352 + }, + { + "epoch": 0.5147779184324949, + "grad_norm": 0.7639930844306946, + "learning_rate": 8.475730098949582e-06, + "loss": 0.8385, + "step": 9353 + }, + { + "epoch": 0.5148329572348506, + "grad_norm": 0.7687931060791016, + "learning_rate": 8.47541848036446e-06, + "loss": 0.8118, + "step": 9354 + }, + { + "epoch": 0.5148879960372063, + "grad_norm": 0.8831589221954346, + "learning_rate": 8.475106835659024e-06, + "loss": 0.7705, + "step": 9355 + }, + { + "epoch": 0.5149430348395619, + "grad_norm": 0.7585502862930298, + "learning_rate": 8.474795164835614e-06, + "loss": 0.8167, + "step": 9356 + }, + { + "epoch": 0.5149980736419175, + "grad_norm": 0.7078690528869629, + "learning_rate": 8.474483467896572e-06, + "loss": 0.7412, + "step": 9357 + }, + { + "epoch": 0.5150531124442732, + "grad_norm": 0.8950889706611633, + "learning_rate": 8.474171744844246e-06, + "loss": 0.8132, + "step": 9358 + }, + { + "epoch": 0.5151081512466289, + "grad_norm": 0.7196077704429626, + "learning_rate": 8.473859995680973e-06, + "loss": 0.8041, + "step": 9359 + }, + { + "epoch": 0.5151631900489846, + "grad_norm": 0.7705141305923462, + "learning_rate": 8.473548220409099e-06, + "loss": 0.8437, + "step": 9360 + }, + { + "epoch": 0.5152182288513402, + "grad_norm": 0.6507467031478882, + "learning_rate": 8.473236419030966e-06, + "loss": 0.7713, + "step": 9361 + }, + { + "epoch": 0.5152732676536959, + "grad_norm": 0.7120817303657532, + "learning_rate": 8.472924591548917e-06, + "loss": 0.7688, + "step": 9362 + }, + { + "epoch": 0.5153283064560515, + "grad_norm": 0.7830487489700317, + "learning_rate": 8.472612737965297e-06, + "loss": 0.8875, + "step": 9363 + }, + { + "epoch": 0.5153833452584071, + "grad_norm": 0.8790529370307922, + "learning_rate": 8.47230085828245e-06, + "loss": 0.7648, + "step": 9364 + }, + { + "epoch": 0.5154383840607628, + "grad_norm": 0.8956806659698486, + "learning_rate": 8.471988952502718e-06, + "loss": 0.7891, + "step": 9365 + }, + { + "epoch": 0.5154934228631185, + "grad_norm": 0.7370011210441589, + "learning_rate": 8.471677020628448e-06, + "loss": 0.7609, + "step": 9366 + }, + { + "epoch": 0.5155484616654742, + "grad_norm": 0.6794238090515137, + "learning_rate": 8.471365062661982e-06, + "loss": 0.6679, + "step": 9367 + }, + { + "epoch": 0.5156035004678298, + "grad_norm": 0.7330273985862732, + "learning_rate": 8.471053078605664e-06, + "loss": 0.7276, + "step": 9368 + }, + { + "epoch": 0.5156585392701855, + "grad_norm": 0.7796601057052612, + "learning_rate": 8.470741068461843e-06, + "loss": 0.7897, + "step": 9369 + }, + { + "epoch": 0.5157135780725411, + "grad_norm": 0.6834099888801575, + "learning_rate": 8.470429032232858e-06, + "loss": 0.7924, + "step": 9370 + }, + { + "epoch": 0.5157686168748968, + "grad_norm": 0.6991616487503052, + "learning_rate": 8.47011696992106e-06, + "loss": 0.7901, + "step": 9371 + }, + { + "epoch": 0.5158236556772524, + "grad_norm": 0.7321401834487915, + "learning_rate": 8.469804881528792e-06, + "loss": 0.6718, + "step": 9372 + }, + { + "epoch": 0.5158786944796081, + "grad_norm": 0.7091043591499329, + "learning_rate": 8.469492767058398e-06, + "loss": 0.8204, + "step": 9373 + }, + { + "epoch": 0.5159337332819638, + "grad_norm": 0.8777012825012207, + "learning_rate": 8.469180626512223e-06, + "loss": 0.8045, + "step": 9374 + }, + { + "epoch": 0.5159887720843195, + "grad_norm": 0.6652738451957703, + "learning_rate": 8.468868459892619e-06, + "loss": 0.7248, + "step": 9375 + }, + { + "epoch": 0.5160438108866751, + "grad_norm": 0.7209659218788147, + "learning_rate": 8.468556267201925e-06, + "loss": 0.7508, + "step": 9376 + }, + { + "epoch": 0.5160988496890307, + "grad_norm": 0.7685441970825195, + "learning_rate": 8.468244048442494e-06, + "loss": 0.7501, + "step": 9377 + }, + { + "epoch": 0.5161538884913864, + "grad_norm": 0.6773725152015686, + "learning_rate": 8.467931803616665e-06, + "loss": 0.8036, + "step": 9378 + }, + { + "epoch": 0.5162089272937421, + "grad_norm": 0.7167890071868896, + "learning_rate": 8.467619532726792e-06, + "loss": 0.7229, + "step": 9379 + }, + { + "epoch": 0.5162639660960977, + "grad_norm": 0.7066929340362549, + "learning_rate": 8.467307235775218e-06, + "loss": 0.7433, + "step": 9380 + }, + { + "epoch": 0.5163190048984534, + "grad_norm": 0.7261828780174255, + "learning_rate": 8.46699491276429e-06, + "loss": 0.7873, + "step": 9381 + }, + { + "epoch": 0.5163740437008091, + "grad_norm": 0.7442463636398315, + "learning_rate": 8.466682563696356e-06, + "loss": 0.7953, + "step": 9382 + }, + { + "epoch": 0.5164290825031648, + "grad_norm": 0.5668768286705017, + "learning_rate": 8.466370188573765e-06, + "loss": 0.5602, + "step": 9383 + }, + { + "epoch": 0.5164841213055203, + "grad_norm": 0.7364997267723083, + "learning_rate": 8.466057787398864e-06, + "loss": 0.8274, + "step": 9384 + }, + { + "epoch": 0.516539160107876, + "grad_norm": 0.7793132066726685, + "learning_rate": 8.465745360174e-06, + "loss": 0.7832, + "step": 9385 + }, + { + "epoch": 0.5165941989102317, + "grad_norm": 0.6818128824234009, + "learning_rate": 8.46543290690152e-06, + "loss": 0.8314, + "step": 9386 + }, + { + "epoch": 0.5166492377125874, + "grad_norm": 0.7392195463180542, + "learning_rate": 8.465120427583778e-06, + "loss": 0.8124, + "step": 9387 + }, + { + "epoch": 0.516704276514943, + "grad_norm": 0.8582521677017212, + "learning_rate": 8.464807922223115e-06, + "loss": 0.7417, + "step": 9388 + }, + { + "epoch": 0.5167593153172987, + "grad_norm": 0.7322097420692444, + "learning_rate": 8.464495390821882e-06, + "loss": 0.7408, + "step": 9389 + }, + { + "epoch": 0.5168143541196544, + "grad_norm": 0.8177433013916016, + "learning_rate": 8.464182833382432e-06, + "loss": 0.87, + "step": 9390 + }, + { + "epoch": 0.5168693929220101, + "grad_norm": 0.7088115215301514, + "learning_rate": 8.46387024990711e-06, + "loss": 0.7748, + "step": 9391 + }, + { + "epoch": 0.5169244317243656, + "grad_norm": 0.6648650169372559, + "learning_rate": 8.463557640398268e-06, + "loss": 0.6302, + "step": 9392 + }, + { + "epoch": 0.5169794705267213, + "grad_norm": 0.6688859462738037, + "learning_rate": 8.463245004858251e-06, + "loss": 0.7252, + "step": 9393 + }, + { + "epoch": 0.517034509329077, + "grad_norm": 0.7231030464172363, + "learning_rate": 8.462932343289412e-06, + "loss": 0.8497, + "step": 9394 + }, + { + "epoch": 0.5170895481314327, + "grad_norm": 0.7142065763473511, + "learning_rate": 8.462619655694103e-06, + "loss": 0.7041, + "step": 9395 + }, + { + "epoch": 0.5171445869337883, + "grad_norm": 0.7197136878967285, + "learning_rate": 8.462306942074669e-06, + "loss": 0.7022, + "step": 9396 + }, + { + "epoch": 0.517199625736144, + "grad_norm": 0.7620192766189575, + "learning_rate": 8.461994202433463e-06, + "loss": 0.8243, + "step": 9397 + }, + { + "epoch": 0.5172546645384997, + "grad_norm": 0.7697533965110779, + "learning_rate": 8.461681436772836e-06, + "loss": 0.7861, + "step": 9398 + }, + { + "epoch": 0.5173097033408554, + "grad_norm": 0.7224711179733276, + "learning_rate": 8.461368645095138e-06, + "loss": 0.7588, + "step": 9399 + }, + { + "epoch": 0.5173647421432109, + "grad_norm": 0.9285979270935059, + "learning_rate": 8.46105582740272e-06, + "loss": 0.8113, + "step": 9400 + }, + { + "epoch": 0.5174197809455666, + "grad_norm": 0.7297842502593994, + "learning_rate": 8.460742983697934e-06, + "loss": 0.7115, + "step": 9401 + }, + { + "epoch": 0.5174748197479223, + "grad_norm": 0.6712872982025146, + "learning_rate": 8.460430113983126e-06, + "loss": 0.751, + "step": 9402 + }, + { + "epoch": 0.517529858550278, + "grad_norm": 0.7807186245918274, + "learning_rate": 8.460117218260657e-06, + "loss": 0.8375, + "step": 9403 + }, + { + "epoch": 0.5175848973526336, + "grad_norm": 0.621530294418335, + "learning_rate": 8.45980429653287e-06, + "loss": 0.638, + "step": 9404 + }, + { + "epoch": 0.5176399361549893, + "grad_norm": 0.7086256146430969, + "learning_rate": 8.45949134880212e-06, + "loss": 0.8304, + "step": 9405 + }, + { + "epoch": 0.517694974957345, + "grad_norm": 0.62705397605896, + "learning_rate": 8.45917837507076e-06, + "loss": 0.7008, + "step": 9406 + }, + { + "epoch": 0.5177500137597005, + "grad_norm": 0.9109121561050415, + "learning_rate": 8.458865375341142e-06, + "loss": 0.7529, + "step": 9407 + }, + { + "epoch": 0.5178050525620562, + "grad_norm": 0.6909900903701782, + "learning_rate": 8.458552349615615e-06, + "loss": 0.8453, + "step": 9408 + }, + { + "epoch": 0.5178600913644119, + "grad_norm": 0.7548434138298035, + "learning_rate": 8.458239297896536e-06, + "loss": 0.7516, + "step": 9409 + }, + { + "epoch": 0.5179151301667676, + "grad_norm": 0.7595730423927307, + "learning_rate": 8.457926220186257e-06, + "loss": 0.7599, + "step": 9410 + }, + { + "epoch": 0.5179701689691232, + "grad_norm": 0.7449337840080261, + "learning_rate": 8.45761311648713e-06, + "loss": 0.8236, + "step": 9411 + }, + { + "epoch": 0.5180252077714789, + "grad_norm": 0.7529160976409912, + "learning_rate": 8.457299986801507e-06, + "loss": 0.8655, + "step": 9412 + }, + { + "epoch": 0.5180802465738346, + "grad_norm": 0.6777701377868652, + "learning_rate": 8.456986831131742e-06, + "loss": 0.7737, + "step": 9413 + }, + { + "epoch": 0.5181352853761902, + "grad_norm": 0.9363510012626648, + "learning_rate": 8.456673649480191e-06, + "loss": 0.8227, + "step": 9414 + }, + { + "epoch": 0.5181903241785458, + "grad_norm": 0.798001229763031, + "learning_rate": 8.456360441849206e-06, + "loss": 0.8881, + "step": 9415 + }, + { + "epoch": 0.5182453629809015, + "grad_norm": 0.7212072610855103, + "learning_rate": 8.456047208241141e-06, + "loss": 0.8165, + "step": 9416 + }, + { + "epoch": 0.5183004017832572, + "grad_norm": 0.6918027997016907, + "learning_rate": 8.45573394865835e-06, + "loss": 0.8048, + "step": 9417 + }, + { + "epoch": 0.5183554405856129, + "grad_norm": 0.6474916338920593, + "learning_rate": 8.455420663103187e-06, + "loss": 0.6502, + "step": 9418 + }, + { + "epoch": 0.5184104793879685, + "grad_norm": 0.6592364311218262, + "learning_rate": 8.455107351578008e-06, + "loss": 0.7509, + "step": 9419 + }, + { + "epoch": 0.5184655181903242, + "grad_norm": 0.7658745646476746, + "learning_rate": 8.454794014085168e-06, + "loss": 0.8444, + "step": 9420 + }, + { + "epoch": 0.5185205569926798, + "grad_norm": 0.6814215183258057, + "learning_rate": 8.45448065062702e-06, + "loss": 0.7367, + "step": 9421 + }, + { + "epoch": 0.5185755957950355, + "grad_norm": 0.644740104675293, + "learning_rate": 8.45416726120592e-06, + "loss": 0.7456, + "step": 9422 + }, + { + "epoch": 0.5186306345973911, + "grad_norm": 0.8578751087188721, + "learning_rate": 8.453853845824225e-06, + "loss": 0.8481, + "step": 9423 + }, + { + "epoch": 0.5186856733997468, + "grad_norm": 0.6630389094352722, + "learning_rate": 8.453540404484288e-06, + "loss": 0.7487, + "step": 9424 + }, + { + "epoch": 0.5187407122021025, + "grad_norm": 0.7756431698799133, + "learning_rate": 8.453226937188466e-06, + "loss": 0.798, + "step": 9425 + }, + { + "epoch": 0.5187957510044582, + "grad_norm": 0.7856318354606628, + "learning_rate": 8.452913443939113e-06, + "loss": 0.785, + "step": 9426 + }, + { + "epoch": 0.5188507898068138, + "grad_norm": 0.7563977837562561, + "learning_rate": 8.45259992473859e-06, + "loss": 0.8182, + "step": 9427 + }, + { + "epoch": 0.5189058286091695, + "grad_norm": 0.6945043802261353, + "learning_rate": 8.452286379589247e-06, + "loss": 0.7262, + "step": 9428 + }, + { + "epoch": 0.5189608674115251, + "grad_norm": 0.6607717275619507, + "learning_rate": 8.451972808493444e-06, + "loss": 0.7257, + "step": 9429 + }, + { + "epoch": 0.5190159062138808, + "grad_norm": 0.6682843565940857, + "learning_rate": 8.451659211453539e-06, + "loss": 0.6775, + "step": 9430 + }, + { + "epoch": 0.5190709450162364, + "grad_norm": 0.7175559401512146, + "learning_rate": 8.451345588471886e-06, + "loss": 0.7154, + "step": 9431 + }, + { + "epoch": 0.5191259838185921, + "grad_norm": 0.7499119639396667, + "learning_rate": 8.451031939550845e-06, + "loss": 0.7537, + "step": 9432 + }, + { + "epoch": 0.5191810226209478, + "grad_norm": 0.65048748254776, + "learning_rate": 8.450718264692771e-06, + "loss": 0.7253, + "step": 9433 + }, + { + "epoch": 0.5192360614233035, + "grad_norm": 0.7067640423774719, + "learning_rate": 8.450404563900022e-06, + "loss": 0.7245, + "step": 9434 + }, + { + "epoch": 0.519291100225659, + "grad_norm": 0.7079932689666748, + "learning_rate": 8.450090837174956e-06, + "loss": 0.7776, + "step": 9435 + }, + { + "epoch": 0.5193461390280147, + "grad_norm": 0.8260107636451721, + "learning_rate": 8.44977708451993e-06, + "loss": 0.8529, + "step": 9436 + }, + { + "epoch": 0.5194011778303704, + "grad_norm": 0.6412167549133301, + "learning_rate": 8.449463305937304e-06, + "loss": 0.7371, + "step": 9437 + }, + { + "epoch": 0.5194562166327261, + "grad_norm": 0.7067576050758362, + "learning_rate": 8.449149501429435e-06, + "loss": 0.7161, + "step": 9438 + }, + { + "epoch": 0.5195112554350817, + "grad_norm": 0.6966904997825623, + "learning_rate": 8.448835670998681e-06, + "loss": 0.7285, + "step": 9439 + }, + { + "epoch": 0.5195662942374374, + "grad_norm": 0.8066132664680481, + "learning_rate": 8.448521814647401e-06, + "loss": 0.8265, + "step": 9440 + }, + { + "epoch": 0.5196213330397931, + "grad_norm": 0.7597149610519409, + "learning_rate": 8.448207932377957e-06, + "loss": 0.7721, + "step": 9441 + }, + { + "epoch": 0.5196763718421488, + "grad_norm": 0.6965302228927612, + "learning_rate": 8.447894024192702e-06, + "loss": 0.749, + "step": 9442 + }, + { + "epoch": 0.5197314106445043, + "grad_norm": 0.7032600045204163, + "learning_rate": 8.447580090094e-06, + "loss": 0.7923, + "step": 9443 + }, + { + "epoch": 0.51978644944686, + "grad_norm": 0.7255309820175171, + "learning_rate": 8.447266130084208e-06, + "loss": 0.6739, + "step": 9444 + }, + { + "epoch": 0.5198414882492157, + "grad_norm": 0.6602993011474609, + "learning_rate": 8.446952144165686e-06, + "loss": 0.7886, + "step": 9445 + }, + { + "epoch": 0.5198965270515714, + "grad_norm": 0.7017884850502014, + "learning_rate": 8.446638132340796e-06, + "loss": 0.7554, + "step": 9446 + }, + { + "epoch": 0.519951565853927, + "grad_norm": 0.7234843969345093, + "learning_rate": 8.446324094611894e-06, + "loss": 0.8294, + "step": 9447 + }, + { + "epoch": 0.5200066046562827, + "grad_norm": 0.6859332919120789, + "learning_rate": 8.446010030981347e-06, + "loss": 0.7563, + "step": 9448 + }, + { + "epoch": 0.5200616434586384, + "grad_norm": 0.7759458422660828, + "learning_rate": 8.445695941451507e-06, + "loss": 0.7577, + "step": 9449 + }, + { + "epoch": 0.520116682260994, + "grad_norm": 0.7852263450622559, + "learning_rate": 8.44538182602474e-06, + "loss": 0.7446, + "step": 9450 + }, + { + "epoch": 0.5201717210633496, + "grad_norm": 0.8143053650856018, + "learning_rate": 8.445067684703406e-06, + "loss": 0.7995, + "step": 9451 + }, + { + "epoch": 0.5202267598657053, + "grad_norm": 0.692738950252533, + "learning_rate": 8.444753517489865e-06, + "loss": 0.7185, + "step": 9452 + }, + { + "epoch": 0.520281798668061, + "grad_norm": 0.6615390181541443, + "learning_rate": 8.444439324386478e-06, + "loss": 0.7128, + "step": 9453 + }, + { + "epoch": 0.5203368374704166, + "grad_norm": 0.7360419034957886, + "learning_rate": 8.444125105395608e-06, + "loss": 0.6565, + "step": 9454 + }, + { + "epoch": 0.5203918762727723, + "grad_norm": 0.7280182838439941, + "learning_rate": 8.443810860519615e-06, + "loss": 0.7295, + "step": 9455 + }, + { + "epoch": 0.520446915075128, + "grad_norm": 0.787367582321167, + "learning_rate": 8.44349658976086e-06, + "loss": 0.7342, + "step": 9456 + }, + { + "epoch": 0.5205019538774837, + "grad_norm": 0.7496024966239929, + "learning_rate": 8.44318229312171e-06, + "loss": 0.7499, + "step": 9457 + }, + { + "epoch": 0.5205569926798392, + "grad_norm": 0.9167383909225464, + "learning_rate": 8.44286797060452e-06, + "loss": 0.7797, + "step": 9458 + }, + { + "epoch": 0.5206120314821949, + "grad_norm": 0.7032341957092285, + "learning_rate": 8.442553622211659e-06, + "loss": 0.7627, + "step": 9459 + }, + { + "epoch": 0.5206670702845506, + "grad_norm": 1.2905993461608887, + "learning_rate": 8.442239247945485e-06, + "loss": 0.7841, + "step": 9460 + }, + { + "epoch": 0.5207221090869063, + "grad_norm": 0.6909230351448059, + "learning_rate": 8.441924847808362e-06, + "loss": 0.7234, + "step": 9461 + }, + { + "epoch": 0.5207771478892619, + "grad_norm": 0.6632175445556641, + "learning_rate": 8.441610421802653e-06, + "loss": 0.6733, + "step": 9462 + }, + { + "epoch": 0.5208321866916176, + "grad_norm": 0.7838154435157776, + "learning_rate": 8.441295969930722e-06, + "loss": 0.7583, + "step": 9463 + }, + { + "epoch": 0.5208872254939733, + "grad_norm": 0.6380481123924255, + "learning_rate": 8.440981492194932e-06, + "loss": 0.7109, + "step": 9464 + }, + { + "epoch": 0.520942264296329, + "grad_norm": 0.6859052181243896, + "learning_rate": 8.440666988597646e-06, + "loss": 0.7387, + "step": 9465 + }, + { + "epoch": 0.5209973030986845, + "grad_norm": 0.7411379814147949, + "learning_rate": 8.440352459141226e-06, + "loss": 0.7852, + "step": 9466 + }, + { + "epoch": 0.5210523419010402, + "grad_norm": 0.6925216913223267, + "learning_rate": 8.44003790382804e-06, + "loss": 0.8228, + "step": 9467 + }, + { + "epoch": 0.5211073807033959, + "grad_norm": 0.7136396169662476, + "learning_rate": 8.43972332266045e-06, + "loss": 0.8168, + "step": 9468 + }, + { + "epoch": 0.5211624195057516, + "grad_norm": 0.719639003276825, + "learning_rate": 8.43940871564082e-06, + "loss": 0.6728, + "step": 9469 + }, + { + "epoch": 0.5212174583081072, + "grad_norm": 0.647861897945404, + "learning_rate": 8.439094082771513e-06, + "loss": 0.6986, + "step": 9470 + }, + { + "epoch": 0.5212724971104629, + "grad_norm": 0.6644579172134399, + "learning_rate": 8.438779424054897e-06, + "loss": 0.6263, + "step": 9471 + }, + { + "epoch": 0.5213275359128186, + "grad_norm": 0.7157352566719055, + "learning_rate": 8.438464739493335e-06, + "loss": 0.827, + "step": 9472 + }, + { + "epoch": 0.5213825747151742, + "grad_norm": 0.793765127658844, + "learning_rate": 8.438150029089193e-06, + "loss": 0.741, + "step": 9473 + }, + { + "epoch": 0.5214376135175298, + "grad_norm": 0.7078518867492676, + "learning_rate": 8.437835292844836e-06, + "loss": 0.7618, + "step": 9474 + }, + { + "epoch": 0.5214926523198855, + "grad_norm": 0.7492140531539917, + "learning_rate": 8.437520530762628e-06, + "loss": 0.7894, + "step": 9475 + }, + { + "epoch": 0.5215476911222412, + "grad_norm": 0.6534473299980164, + "learning_rate": 8.437205742844937e-06, + "loss": 0.7567, + "step": 9476 + }, + { + "epoch": 0.5216027299245969, + "grad_norm": 0.8745388984680176, + "learning_rate": 8.436890929094126e-06, + "loss": 0.8758, + "step": 9477 + }, + { + "epoch": 0.5216577687269525, + "grad_norm": 0.6804752349853516, + "learning_rate": 8.436576089512564e-06, + "loss": 0.7841, + "step": 9478 + }, + { + "epoch": 0.5217128075293082, + "grad_norm": 0.712065577507019, + "learning_rate": 8.436261224102615e-06, + "loss": 0.8079, + "step": 9479 + }, + { + "epoch": 0.5217678463316638, + "grad_norm": 0.8733783960342407, + "learning_rate": 8.435946332866648e-06, + "loss": 0.8295, + "step": 9480 + }, + { + "epoch": 0.5218228851340195, + "grad_norm": 0.6871289610862732, + "learning_rate": 8.435631415807028e-06, + "loss": 0.7087, + "step": 9481 + }, + { + "epoch": 0.5218779239363751, + "grad_norm": 0.8363185524940491, + "learning_rate": 8.43531647292612e-06, + "loss": 0.7329, + "step": 9482 + }, + { + "epoch": 0.5219329627387308, + "grad_norm": 0.6845195293426514, + "learning_rate": 8.435001504226295e-06, + "loss": 0.7651, + "step": 9483 + }, + { + "epoch": 0.5219880015410865, + "grad_norm": 0.7527645826339722, + "learning_rate": 8.434686509709917e-06, + "loss": 0.6856, + "step": 9484 + }, + { + "epoch": 0.5220430403434422, + "grad_norm": 0.6945710778236389, + "learning_rate": 8.434371489379356e-06, + "loss": 0.6875, + "step": 9485 + }, + { + "epoch": 0.5220980791457978, + "grad_norm": 0.7668873071670532, + "learning_rate": 8.434056443236977e-06, + "loss": 0.7662, + "step": 9486 + }, + { + "epoch": 0.5221531179481534, + "grad_norm": 0.9873473048210144, + "learning_rate": 8.433741371285148e-06, + "loss": 0.7662, + "step": 9487 + }, + { + "epoch": 0.5222081567505091, + "grad_norm": 0.8635447025299072, + "learning_rate": 8.43342627352624e-06, + "loss": 0.645, + "step": 9488 + }, + { + "epoch": 0.5222631955528648, + "grad_norm": 0.7836978435516357, + "learning_rate": 8.43311114996262e-06, + "loss": 0.7647, + "step": 9489 + }, + { + "epoch": 0.5223182343552204, + "grad_norm": 0.8370835185050964, + "learning_rate": 8.432796000596652e-06, + "loss": 0.8402, + "step": 9490 + }, + { + "epoch": 0.5223732731575761, + "grad_norm": 0.9627843499183655, + "learning_rate": 8.432480825430712e-06, + "loss": 0.6985, + "step": 9491 + }, + { + "epoch": 0.5224283119599318, + "grad_norm": 0.6774263978004456, + "learning_rate": 8.432165624467163e-06, + "loss": 0.7051, + "step": 9492 + }, + { + "epoch": 0.5224833507622874, + "grad_norm": 0.6590597033500671, + "learning_rate": 8.431850397708375e-06, + "loss": 0.7147, + "step": 9493 + }, + { + "epoch": 0.522538389564643, + "grad_norm": 0.8153522610664368, + "learning_rate": 8.43153514515672e-06, + "loss": 0.6759, + "step": 9494 + }, + { + "epoch": 0.5225934283669987, + "grad_norm": 0.7457708716392517, + "learning_rate": 8.431219866814563e-06, + "loss": 0.7168, + "step": 9495 + }, + { + "epoch": 0.5226484671693544, + "grad_norm": 0.6994161009788513, + "learning_rate": 8.430904562684278e-06, + "loss": 0.8393, + "step": 9496 + }, + { + "epoch": 0.52270350597171, + "grad_norm": 0.780337393283844, + "learning_rate": 8.430589232768232e-06, + "loss": 0.6528, + "step": 9497 + }, + { + "epoch": 0.5227585447740657, + "grad_norm": 0.6833232641220093, + "learning_rate": 8.430273877068796e-06, + "loss": 0.7545, + "step": 9498 + }, + { + "epoch": 0.5228135835764214, + "grad_norm": 0.7330057621002197, + "learning_rate": 8.42995849558834e-06, + "loss": 0.7932, + "step": 9499 + }, + { + "epoch": 0.5228686223787771, + "grad_norm": 0.8131541609764099, + "learning_rate": 8.429643088329233e-06, + "loss": 0.7546, + "step": 9500 + }, + { + "epoch": 0.5229236611811326, + "grad_norm": 0.7353833317756653, + "learning_rate": 8.42932765529385e-06, + "loss": 0.7508, + "step": 9501 + }, + { + "epoch": 0.5229786999834883, + "grad_norm": 0.7166246771812439, + "learning_rate": 8.429012196484554e-06, + "loss": 0.728, + "step": 9502 + }, + { + "epoch": 0.523033738785844, + "grad_norm": 0.732064962387085, + "learning_rate": 8.428696711903721e-06, + "loss": 0.8306, + "step": 9503 + }, + { + "epoch": 0.5230887775881997, + "grad_norm": 0.6858934164047241, + "learning_rate": 8.428381201553721e-06, + "loss": 0.7801, + "step": 9504 + }, + { + "epoch": 0.5231438163905553, + "grad_norm": 0.7046478986740112, + "learning_rate": 8.428065665436928e-06, + "loss": 0.7365, + "step": 9505 + }, + { + "epoch": 0.523198855192911, + "grad_norm": 0.6669325828552246, + "learning_rate": 8.42775010355571e-06, + "loss": 0.7764, + "step": 9506 + }, + { + "epoch": 0.5232538939952667, + "grad_norm": 0.655619740486145, + "learning_rate": 8.427434515912438e-06, + "loss": 0.7919, + "step": 9507 + }, + { + "epoch": 0.5233089327976224, + "grad_norm": 0.6236690878868103, + "learning_rate": 8.427118902509487e-06, + "loss": 0.6653, + "step": 9508 + }, + { + "epoch": 0.5233639715999779, + "grad_norm": 0.8233165740966797, + "learning_rate": 8.426803263349228e-06, + "loss": 0.8012, + "step": 9509 + }, + { + "epoch": 0.5234190104023336, + "grad_norm": 0.6626759171485901, + "learning_rate": 8.426487598434035e-06, + "loss": 0.7728, + "step": 9510 + }, + { + "epoch": 0.5234740492046893, + "grad_norm": 0.9209974408149719, + "learning_rate": 8.426171907766275e-06, + "loss": 0.769, + "step": 9511 + }, + { + "epoch": 0.523529088007045, + "grad_norm": 0.6297587156295776, + "learning_rate": 8.425856191348325e-06, + "loss": 0.7333, + "step": 9512 + }, + { + "epoch": 0.5235841268094006, + "grad_norm": 0.6995256543159485, + "learning_rate": 8.425540449182558e-06, + "loss": 0.7486, + "step": 9513 + }, + { + "epoch": 0.5236391656117563, + "grad_norm": 0.8076607584953308, + "learning_rate": 8.425224681271345e-06, + "loss": 0.8533, + "step": 9514 + }, + { + "epoch": 0.523694204414112, + "grad_norm": 1.2198601961135864, + "learning_rate": 8.42490888761706e-06, + "loss": 0.7291, + "step": 9515 + }, + { + "epoch": 0.5237492432164677, + "grad_norm": 0.7047159671783447, + "learning_rate": 8.424593068222076e-06, + "loss": 0.713, + "step": 9516 + }, + { + "epoch": 0.5238042820188232, + "grad_norm": 0.7652333378791809, + "learning_rate": 8.424277223088768e-06, + "loss": 0.8149, + "step": 9517 + }, + { + "epoch": 0.5238593208211789, + "grad_norm": 1.1311010122299194, + "learning_rate": 8.42396135221951e-06, + "loss": 0.8195, + "step": 9518 + }, + { + "epoch": 0.5239143596235346, + "grad_norm": 0.7855533957481384, + "learning_rate": 8.423645455616674e-06, + "loss": 0.7901, + "step": 9519 + }, + { + "epoch": 0.5239693984258903, + "grad_norm": 0.7028971314430237, + "learning_rate": 8.423329533282635e-06, + "loss": 0.8006, + "step": 9520 + }, + { + "epoch": 0.5240244372282459, + "grad_norm": 0.703809916973114, + "learning_rate": 8.423013585219769e-06, + "loss": 0.7581, + "step": 9521 + }, + { + "epoch": 0.5240794760306016, + "grad_norm": 0.94233238697052, + "learning_rate": 8.422697611430448e-06, + "loss": 0.7689, + "step": 9522 + }, + { + "epoch": 0.5241345148329573, + "grad_norm": 0.8164071440696716, + "learning_rate": 8.422381611917047e-06, + "loss": 0.8761, + "step": 9523 + }, + { + "epoch": 0.5241895536353129, + "grad_norm": 0.6242091059684753, + "learning_rate": 8.422065586681944e-06, + "loss": 0.6975, + "step": 9524 + }, + { + "epoch": 0.5242445924376685, + "grad_norm": 0.6607261300086975, + "learning_rate": 8.42174953572751e-06, + "loss": 0.6847, + "step": 9525 + }, + { + "epoch": 0.5242996312400242, + "grad_norm": 0.7174261212348938, + "learning_rate": 8.421433459056123e-06, + "loss": 0.7905, + "step": 9526 + }, + { + "epoch": 0.5243546700423799, + "grad_norm": 0.7414089441299438, + "learning_rate": 8.42111735667016e-06, + "loss": 0.7788, + "step": 9527 + }, + { + "epoch": 0.5244097088447356, + "grad_norm": 0.7347442507743835, + "learning_rate": 8.420801228571992e-06, + "loss": 0.7691, + "step": 9528 + }, + { + "epoch": 0.5244647476470912, + "grad_norm": 0.6947832107543945, + "learning_rate": 8.420485074763999e-06, + "loss": 0.6702, + "step": 9529 + }, + { + "epoch": 0.5245197864494469, + "grad_norm": 0.6865423321723938, + "learning_rate": 8.420168895248557e-06, + "loss": 0.7577, + "step": 9530 + }, + { + "epoch": 0.5245748252518025, + "grad_norm": 0.7023190855979919, + "learning_rate": 8.419852690028039e-06, + "loss": 0.7711, + "step": 9531 + }, + { + "epoch": 0.5246298640541582, + "grad_norm": 0.8312145471572876, + "learning_rate": 8.419536459104824e-06, + "loss": 0.7999, + "step": 9532 + }, + { + "epoch": 0.5246849028565138, + "grad_norm": 0.6700688600540161, + "learning_rate": 8.419220202481288e-06, + "loss": 0.7163, + "step": 9533 + }, + { + "epoch": 0.5247399416588695, + "grad_norm": 0.767062246799469, + "learning_rate": 8.418903920159809e-06, + "loss": 0.7451, + "step": 9534 + }, + { + "epoch": 0.5247949804612252, + "grad_norm": 0.6814010143280029, + "learning_rate": 8.418587612142763e-06, + "loss": 0.771, + "step": 9535 + }, + { + "epoch": 0.5248500192635808, + "grad_norm": 0.6728426218032837, + "learning_rate": 8.418271278432528e-06, + "loss": 0.8336, + "step": 9536 + }, + { + "epoch": 0.5249050580659365, + "grad_norm": 0.7112382650375366, + "learning_rate": 8.417954919031482e-06, + "loss": 0.7392, + "step": 9537 + }, + { + "epoch": 0.5249600968682921, + "grad_norm": 0.7371365427970886, + "learning_rate": 8.417638533942e-06, + "loss": 0.8233, + "step": 9538 + }, + { + "epoch": 0.5250151356706478, + "grad_norm": 0.6593502163887024, + "learning_rate": 8.41732212316646e-06, + "loss": 0.7455, + "step": 9539 + }, + { + "epoch": 0.5250701744730034, + "grad_norm": 0.685553252696991, + "learning_rate": 8.417005686707245e-06, + "loss": 0.7783, + "step": 9540 + }, + { + "epoch": 0.5251252132753591, + "grad_norm": 0.7003353238105774, + "learning_rate": 8.41668922456673e-06, + "loss": 0.7733, + "step": 9541 + }, + { + "epoch": 0.5251802520777148, + "grad_norm": 0.7602891325950623, + "learning_rate": 8.416372736747292e-06, + "loss": 0.7236, + "step": 9542 + }, + { + "epoch": 0.5252352908800705, + "grad_norm": 0.647531270980835, + "learning_rate": 8.41605622325131e-06, + "loss": 0.7388, + "step": 9543 + }, + { + "epoch": 0.5252903296824261, + "grad_norm": 0.7309756875038147, + "learning_rate": 8.415739684081165e-06, + "loss": 0.7178, + "step": 9544 + }, + { + "epoch": 0.5253453684847817, + "grad_norm": 0.6991532444953918, + "learning_rate": 8.415423119239236e-06, + "loss": 0.8078, + "step": 9545 + }, + { + "epoch": 0.5254004072871374, + "grad_norm": 0.7392330765724182, + "learning_rate": 8.4151065287279e-06, + "loss": 0.8452, + "step": 9546 + }, + { + "epoch": 0.5254554460894931, + "grad_norm": 0.7617329955101013, + "learning_rate": 8.414789912549537e-06, + "loss": 0.7885, + "step": 9547 + }, + { + "epoch": 0.5255104848918487, + "grad_norm": 1.160125732421875, + "learning_rate": 8.414473270706527e-06, + "loss": 0.9628, + "step": 9548 + }, + { + "epoch": 0.5255655236942044, + "grad_norm": 0.7578685879707336, + "learning_rate": 8.414156603201252e-06, + "loss": 0.7745, + "step": 9549 + }, + { + "epoch": 0.5256205624965601, + "grad_norm": 0.6963017582893372, + "learning_rate": 8.413839910036089e-06, + "loss": 0.7693, + "step": 9550 + }, + { + "epoch": 0.5256756012989158, + "grad_norm": 0.6631398797035217, + "learning_rate": 8.413523191213415e-06, + "loss": 0.6606, + "step": 9551 + }, + { + "epoch": 0.5257306401012714, + "grad_norm": 0.707343339920044, + "learning_rate": 8.41320644673562e-06, + "loss": 0.7161, + "step": 9552 + }, + { + "epoch": 0.525785678903627, + "grad_norm": 0.833448588848114, + "learning_rate": 8.412889676605075e-06, + "loss": 0.7509, + "step": 9553 + }, + { + "epoch": 0.5258407177059827, + "grad_norm": 0.6214264631271362, + "learning_rate": 8.412572880824168e-06, + "loss": 0.7436, + "step": 9554 + }, + { + "epoch": 0.5258957565083384, + "grad_norm": 0.6479233503341675, + "learning_rate": 8.412256059395274e-06, + "loss": 0.7359, + "step": 9555 + }, + { + "epoch": 0.525950795310694, + "grad_norm": 0.7596501111984253, + "learning_rate": 8.411939212320778e-06, + "loss": 0.7422, + "step": 9556 + }, + { + "epoch": 0.5260058341130497, + "grad_norm": 0.8040934205055237, + "learning_rate": 8.41162233960306e-06, + "loss": 0.7721, + "step": 9557 + }, + { + "epoch": 0.5260608729154054, + "grad_norm": 0.7190027832984924, + "learning_rate": 8.411305441244505e-06, + "loss": 0.8794, + "step": 9558 + }, + { + "epoch": 0.5261159117177611, + "grad_norm": 0.8002649545669556, + "learning_rate": 8.410988517247486e-06, + "loss": 0.7958, + "step": 9559 + }, + { + "epoch": 0.5261709505201166, + "grad_norm": 0.7151750326156616, + "learning_rate": 8.410671567614394e-06, + "loss": 0.7597, + "step": 9560 + }, + { + "epoch": 0.5262259893224723, + "grad_norm": 0.9718102812767029, + "learning_rate": 8.410354592347607e-06, + "loss": 0.8272, + "step": 9561 + }, + { + "epoch": 0.526281028124828, + "grad_norm": 0.701932966709137, + "learning_rate": 8.410037591449506e-06, + "loss": 0.808, + "step": 9562 + }, + { + "epoch": 0.5263360669271837, + "grad_norm": 0.8247585296630859, + "learning_rate": 8.409720564922476e-06, + "loss": 0.7598, + "step": 9563 + }, + { + "epoch": 0.5263911057295393, + "grad_norm": 0.7305104732513428, + "learning_rate": 8.409403512768899e-06, + "loss": 0.8161, + "step": 9564 + }, + { + "epoch": 0.526446144531895, + "grad_norm": 0.8726410865783691, + "learning_rate": 8.409086434991158e-06, + "loss": 0.8598, + "step": 9565 + }, + { + "epoch": 0.5265011833342507, + "grad_norm": 0.7329155802726746, + "learning_rate": 8.408769331591637e-06, + "loss": 0.7355, + "step": 9566 + }, + { + "epoch": 0.5265562221366064, + "grad_norm": 0.8227902054786682, + "learning_rate": 8.408452202572716e-06, + "loss": 0.7888, + "step": 9567 + }, + { + "epoch": 0.5266112609389619, + "grad_norm": 0.7190666794776917, + "learning_rate": 8.408135047936783e-06, + "loss": 0.669, + "step": 9568 + }, + { + "epoch": 0.5266662997413176, + "grad_norm": 0.6529938578605652, + "learning_rate": 8.407817867686217e-06, + "loss": 0.7345, + "step": 9569 + }, + { + "epoch": 0.5267213385436733, + "grad_norm": 0.6985379457473755, + "learning_rate": 8.407500661823407e-06, + "loss": 0.852, + "step": 9570 + }, + { + "epoch": 0.526776377346029, + "grad_norm": 0.7480047345161438, + "learning_rate": 8.407183430350732e-06, + "loss": 0.7422, + "step": 9571 + }, + { + "epoch": 0.5268314161483846, + "grad_norm": 0.7599420547485352, + "learning_rate": 8.406866173270579e-06, + "loss": 0.7499, + "step": 9572 + }, + { + "epoch": 0.5268864549507403, + "grad_norm": 0.813448965549469, + "learning_rate": 8.406548890585331e-06, + "loss": 0.7979, + "step": 9573 + }, + { + "epoch": 0.526941493753096, + "grad_norm": 0.6029278039932251, + "learning_rate": 8.406231582297374e-06, + "loss": 0.7289, + "step": 9574 + }, + { + "epoch": 0.5269965325554516, + "grad_norm": 0.656829297542572, + "learning_rate": 8.40591424840909e-06, + "loss": 0.6778, + "step": 9575 + }, + { + "epoch": 0.5270515713578072, + "grad_norm": 0.7147198915481567, + "learning_rate": 8.405596888922869e-06, + "loss": 0.7212, + "step": 9576 + }, + { + "epoch": 0.5271066101601629, + "grad_norm": 0.7722035050392151, + "learning_rate": 8.405279503841094e-06, + "loss": 0.8008, + "step": 9577 + }, + { + "epoch": 0.5271616489625186, + "grad_norm": 0.6828493475914001, + "learning_rate": 8.40496209316615e-06, + "loss": 0.787, + "step": 9578 + }, + { + "epoch": 0.5272166877648742, + "grad_norm": 0.6965187788009644, + "learning_rate": 8.40464465690042e-06, + "loss": 0.6803, + "step": 9579 + }, + { + "epoch": 0.5272717265672299, + "grad_norm": 0.7300547957420349, + "learning_rate": 8.404327195046293e-06, + "loss": 0.8165, + "step": 9580 + }, + { + "epoch": 0.5273267653695856, + "grad_norm": 0.7367526292800903, + "learning_rate": 8.404009707606153e-06, + "loss": 0.7709, + "step": 9581 + }, + { + "epoch": 0.5273818041719412, + "grad_norm": 0.6694689989089966, + "learning_rate": 8.40369219458239e-06, + "loss": 0.7971, + "step": 9582 + }, + { + "epoch": 0.5274368429742968, + "grad_norm": 0.6723141074180603, + "learning_rate": 8.403374655977384e-06, + "loss": 0.695, + "step": 9583 + }, + { + "epoch": 0.5274918817766525, + "grad_norm": 0.7737089395523071, + "learning_rate": 8.403057091793528e-06, + "loss": 0.7765, + "step": 9584 + }, + { + "epoch": 0.5275469205790082, + "grad_norm": 0.8378487825393677, + "learning_rate": 8.402739502033204e-06, + "loss": 0.7984, + "step": 9585 + }, + { + "epoch": 0.5276019593813639, + "grad_norm": 0.7496509552001953, + "learning_rate": 8.402421886698802e-06, + "loss": 0.7846, + "step": 9586 + }, + { + "epoch": 0.5276569981837195, + "grad_norm": 0.7020435929298401, + "learning_rate": 8.402104245792706e-06, + "loss": 0.8102, + "step": 9587 + }, + { + "epoch": 0.5277120369860752, + "grad_norm": 0.8877277374267578, + "learning_rate": 8.401786579317308e-06, + "loss": 0.6995, + "step": 9588 + }, + { + "epoch": 0.5277670757884309, + "grad_norm": 0.6975196599960327, + "learning_rate": 8.401468887274991e-06, + "loss": 0.7475, + "step": 9589 + }, + { + "epoch": 0.5278221145907865, + "grad_norm": 0.8267357349395752, + "learning_rate": 8.401151169668144e-06, + "loss": 0.7091, + "step": 9590 + }, + { + "epoch": 0.5278771533931421, + "grad_norm": 0.6778179407119751, + "learning_rate": 8.400833426499156e-06, + "loss": 0.8198, + "step": 9591 + }, + { + "epoch": 0.5279321921954978, + "grad_norm": 0.7343330979347229, + "learning_rate": 8.400515657770414e-06, + "loss": 0.7565, + "step": 9592 + }, + { + "epoch": 0.5279872309978535, + "grad_norm": 0.7745271325111389, + "learning_rate": 8.400197863484307e-06, + "loss": 0.7991, + "step": 9593 + }, + { + "epoch": 0.5280422698002092, + "grad_norm": 0.7652345895767212, + "learning_rate": 8.399880043643224e-06, + "loss": 0.7752, + "step": 9594 + }, + { + "epoch": 0.5280973086025648, + "grad_norm": 0.9764432311058044, + "learning_rate": 8.399562198249551e-06, + "loss": 0.784, + "step": 9595 + }, + { + "epoch": 0.5281523474049205, + "grad_norm": 0.6763052940368652, + "learning_rate": 8.399244327305678e-06, + "loss": 0.7695, + "step": 9596 + }, + { + "epoch": 0.5282073862072761, + "grad_norm": 0.7788934111595154, + "learning_rate": 8.398926430813996e-06, + "loss": 0.8152, + "step": 9597 + }, + { + "epoch": 0.5282624250096318, + "grad_norm": 0.8088317513465881, + "learning_rate": 8.398608508776894e-06, + "loss": 0.7751, + "step": 9598 + }, + { + "epoch": 0.5283174638119874, + "grad_norm": 0.6735319495201111, + "learning_rate": 8.398290561196756e-06, + "loss": 0.7305, + "step": 9599 + }, + { + "epoch": 0.5283725026143431, + "grad_norm": 0.7279297113418579, + "learning_rate": 8.39797258807598e-06, + "loss": 0.7381, + "step": 9600 + }, + { + "epoch": 0.5284275414166988, + "grad_norm": 0.74604332447052, + "learning_rate": 8.39765458941695e-06, + "loss": 0.8138, + "step": 9601 + }, + { + "epoch": 0.5284825802190545, + "grad_norm": 0.7735850214958191, + "learning_rate": 8.397336565222057e-06, + "loss": 0.7364, + "step": 9602 + }, + { + "epoch": 0.52853761902141, + "grad_norm": 0.7890003323554993, + "learning_rate": 8.397018515493693e-06, + "loss": 0.8301, + "step": 9603 + }, + { + "epoch": 0.5285926578237657, + "grad_norm": 0.739054262638092, + "learning_rate": 8.396700440234245e-06, + "loss": 0.7503, + "step": 9604 + }, + { + "epoch": 0.5286476966261214, + "grad_norm": 0.7611023783683777, + "learning_rate": 8.396382339446108e-06, + "loss": 0.7225, + "step": 9605 + }, + { + "epoch": 0.5287027354284771, + "grad_norm": 0.770602285861969, + "learning_rate": 8.39606421313167e-06, + "loss": 0.71, + "step": 9606 + }, + { + "epoch": 0.5287577742308327, + "grad_norm": 0.7495261430740356, + "learning_rate": 8.395746061293322e-06, + "loss": 0.7729, + "step": 9607 + }, + { + "epoch": 0.5288128130331884, + "grad_norm": 0.7159668207168579, + "learning_rate": 8.395427883933456e-06, + "loss": 0.8457, + "step": 9608 + }, + { + "epoch": 0.5288678518355441, + "grad_norm": 0.7663426399230957, + "learning_rate": 8.395109681054463e-06, + "loss": 0.784, + "step": 9609 + }, + { + "epoch": 0.5289228906378998, + "grad_norm": 0.7271933555603027, + "learning_rate": 8.394791452658732e-06, + "loss": 0.7981, + "step": 9610 + }, + { + "epoch": 0.5289779294402553, + "grad_norm": 0.7782096266746521, + "learning_rate": 8.394473198748661e-06, + "loss": 0.7953, + "step": 9611 + }, + { + "epoch": 0.529032968242611, + "grad_norm": 0.8318955302238464, + "learning_rate": 8.394154919326636e-06, + "loss": 0.6875, + "step": 9612 + }, + { + "epoch": 0.5290880070449667, + "grad_norm": 0.7402167916297913, + "learning_rate": 8.393836614395051e-06, + "loss": 0.7805, + "step": 9613 + }, + { + "epoch": 0.5291430458473224, + "grad_norm": 0.6314370632171631, + "learning_rate": 8.393518283956299e-06, + "loss": 0.6841, + "step": 9614 + }, + { + "epoch": 0.529198084649678, + "grad_norm": 0.8387365937232971, + "learning_rate": 8.393199928012772e-06, + "loss": 0.8503, + "step": 9615 + }, + { + "epoch": 0.5292531234520337, + "grad_norm": 0.7066243886947632, + "learning_rate": 8.392881546566863e-06, + "loss": 0.8494, + "step": 9616 + }, + { + "epoch": 0.5293081622543894, + "grad_norm": 0.7034226059913635, + "learning_rate": 8.392563139620964e-06, + "loss": 0.7335, + "step": 9617 + }, + { + "epoch": 0.5293632010567451, + "grad_norm": 0.6969622373580933, + "learning_rate": 8.392244707177468e-06, + "loss": 0.7203, + "step": 9618 + }, + { + "epoch": 0.5294182398591006, + "grad_norm": 0.7694050073623657, + "learning_rate": 8.391926249238768e-06, + "loss": 0.7864, + "step": 9619 + }, + { + "epoch": 0.5294732786614563, + "grad_norm": 0.7284281253814697, + "learning_rate": 8.391607765807262e-06, + "loss": 0.6704, + "step": 9620 + }, + { + "epoch": 0.529528317463812, + "grad_norm": 1.0466688871383667, + "learning_rate": 8.391289256885337e-06, + "loss": 0.7807, + "step": 9621 + }, + { + "epoch": 0.5295833562661676, + "grad_norm": 0.7118388414382935, + "learning_rate": 8.39097072247539e-06, + "loss": 0.738, + "step": 9622 + }, + { + "epoch": 0.5296383950685233, + "grad_norm": 0.794377863407135, + "learning_rate": 8.390652162579815e-06, + "loss": 0.6831, + "step": 9623 + }, + { + "epoch": 0.529693433870879, + "grad_norm": 0.6042492389678955, + "learning_rate": 8.390333577201007e-06, + "loss": 0.6773, + "step": 9624 + }, + { + "epoch": 0.5297484726732347, + "grad_norm": 0.6452521681785583, + "learning_rate": 8.390014966341357e-06, + "loss": 0.7168, + "step": 9625 + }, + { + "epoch": 0.5298035114755902, + "grad_norm": 0.7113651633262634, + "learning_rate": 8.389696330003265e-06, + "loss": 0.709, + "step": 9626 + }, + { + "epoch": 0.5298585502779459, + "grad_norm": 0.6469250917434692, + "learning_rate": 8.38937766818912e-06, + "loss": 0.6804, + "step": 9627 + }, + { + "epoch": 0.5299135890803016, + "grad_norm": 0.7529417872428894, + "learning_rate": 8.389058980901322e-06, + "loss": 0.8537, + "step": 9628 + }, + { + "epoch": 0.5299686278826573, + "grad_norm": 0.7681186199188232, + "learning_rate": 8.388740268142262e-06, + "loss": 0.7383, + "step": 9629 + }, + { + "epoch": 0.5300236666850129, + "grad_norm": 0.6585648655891418, + "learning_rate": 8.388421529914337e-06, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 0.5300787054873686, + "grad_norm": 0.7432085871696472, + "learning_rate": 8.388102766219943e-06, + "loss": 0.7391, + "step": 9631 + }, + { + "epoch": 0.5301337442897243, + "grad_norm": 0.6672815084457397, + "learning_rate": 8.387783977061476e-06, + "loss": 0.8056, + "step": 9632 + }, + { + "epoch": 0.53018878309208, + "grad_norm": 0.7566675543785095, + "learning_rate": 8.387465162441332e-06, + "loss": 0.7858, + "step": 9633 + }, + { + "epoch": 0.5302438218944355, + "grad_norm": 0.6522077322006226, + "learning_rate": 8.387146322361907e-06, + "loss": 0.759, + "step": 9634 + }, + { + "epoch": 0.5302988606967912, + "grad_norm": 0.7246397137641907, + "learning_rate": 8.386827456825597e-06, + "loss": 0.8158, + "step": 9635 + }, + { + "epoch": 0.5303538994991469, + "grad_norm": 0.7577807307243347, + "learning_rate": 8.386508565834797e-06, + "loss": 0.7495, + "step": 9636 + }, + { + "epoch": 0.5304089383015026, + "grad_norm": 0.7080703973770142, + "learning_rate": 8.386189649391906e-06, + "loss": 0.8086, + "step": 9637 + }, + { + "epoch": 0.5304639771038582, + "grad_norm": 0.7505277395248413, + "learning_rate": 8.385870707499321e-06, + "loss": 0.7206, + "step": 9638 + }, + { + "epoch": 0.5305190159062139, + "grad_norm": 0.7044165134429932, + "learning_rate": 8.385551740159437e-06, + "loss": 0.7838, + "step": 9639 + }, + { + "epoch": 0.5305740547085696, + "grad_norm": 0.7921645641326904, + "learning_rate": 8.385232747374652e-06, + "loss": 0.7604, + "step": 9640 + }, + { + "epoch": 0.5306290935109252, + "grad_norm": 0.9930111169815063, + "learning_rate": 8.384913729147364e-06, + "loss": 0.7839, + "step": 9641 + }, + { + "epoch": 0.5306841323132808, + "grad_norm": 0.7333244681358337, + "learning_rate": 8.38459468547997e-06, + "loss": 0.7941, + "step": 9642 + }, + { + "epoch": 0.5307391711156365, + "grad_norm": 0.7857590913772583, + "learning_rate": 8.384275616374868e-06, + "loss": 0.8535, + "step": 9643 + }, + { + "epoch": 0.5307942099179922, + "grad_norm": 0.8568746447563171, + "learning_rate": 8.383956521834459e-06, + "loss": 0.6586, + "step": 9644 + }, + { + "epoch": 0.5308492487203479, + "grad_norm": 0.7061276435852051, + "learning_rate": 8.383637401861136e-06, + "loss": 0.7288, + "step": 9645 + }, + { + "epoch": 0.5309042875227035, + "grad_norm": 0.7348940968513489, + "learning_rate": 8.383318256457303e-06, + "loss": 0.8099, + "step": 9646 + }, + { + "epoch": 0.5309593263250592, + "grad_norm": 0.6526725888252258, + "learning_rate": 8.382999085625353e-06, + "loss": 0.6702, + "step": 9647 + }, + { + "epoch": 0.5310143651274148, + "grad_norm": 0.8122747540473938, + "learning_rate": 8.382679889367687e-06, + "loss": 0.67, + "step": 9648 + }, + { + "epoch": 0.5310694039297705, + "grad_norm": 0.9145376682281494, + "learning_rate": 8.382360667686706e-06, + "loss": 0.7719, + "step": 9649 + }, + { + "epoch": 0.5311244427321261, + "grad_norm": 0.6659818887710571, + "learning_rate": 8.382041420584807e-06, + "loss": 0.806, + "step": 9650 + }, + { + "epoch": 0.5311794815344818, + "grad_norm": 0.7088539004325867, + "learning_rate": 8.381722148064391e-06, + "loss": 0.7046, + "step": 9651 + }, + { + "epoch": 0.5312345203368375, + "grad_norm": 0.8610590696334839, + "learning_rate": 8.381402850127854e-06, + "loss": 0.6998, + "step": 9652 + }, + { + "epoch": 0.5312895591391932, + "grad_norm": 0.775830864906311, + "learning_rate": 8.3810835267776e-06, + "loss": 0.8874, + "step": 9653 + }, + { + "epoch": 0.5313445979415488, + "grad_norm": 0.6871606707572937, + "learning_rate": 8.380764178016028e-06, + "loss": 0.7903, + "step": 9654 + }, + { + "epoch": 0.5313996367439044, + "grad_norm": 0.7005272507667542, + "learning_rate": 8.380444803845537e-06, + "loss": 0.6685, + "step": 9655 + }, + { + "epoch": 0.5314546755462601, + "grad_norm": 0.8922042846679688, + "learning_rate": 8.380125404268527e-06, + "loss": 0.7797, + "step": 9656 + }, + { + "epoch": 0.5315097143486158, + "grad_norm": 0.7242267727851868, + "learning_rate": 8.3798059792874e-06, + "loss": 0.863, + "step": 9657 + }, + { + "epoch": 0.5315647531509714, + "grad_norm": 0.6625328660011292, + "learning_rate": 8.379486528904555e-06, + "loss": 0.7, + "step": 9658 + }, + { + "epoch": 0.5316197919533271, + "grad_norm": 0.9882226586341858, + "learning_rate": 8.379167053122394e-06, + "loss": 0.7534, + "step": 9659 + }, + { + "epoch": 0.5316748307556828, + "grad_norm": 0.6894702911376953, + "learning_rate": 8.378847551943318e-06, + "loss": 0.7503, + "step": 9660 + }, + { + "epoch": 0.5317298695580385, + "grad_norm": 0.6820259690284729, + "learning_rate": 8.37852802536973e-06, + "loss": 0.7713, + "step": 9661 + }, + { + "epoch": 0.531784908360394, + "grad_norm": 0.667918860912323, + "learning_rate": 8.378208473404028e-06, + "loss": 0.7524, + "step": 9662 + }, + { + "epoch": 0.5318399471627497, + "grad_norm": 0.7789241075515747, + "learning_rate": 8.377888896048617e-06, + "loss": 0.6906, + "step": 9663 + }, + { + "epoch": 0.5318949859651054, + "grad_norm": 0.7264542579650879, + "learning_rate": 8.377569293305894e-06, + "loss": 0.7836, + "step": 9664 + }, + { + "epoch": 0.531950024767461, + "grad_norm": 0.6979835629463196, + "learning_rate": 8.377249665178267e-06, + "loss": 0.7739, + "step": 9665 + }, + { + "epoch": 0.5320050635698167, + "grad_norm": 0.8008072376251221, + "learning_rate": 8.376930011668136e-06, + "loss": 0.7853, + "step": 9666 + }, + { + "epoch": 0.5320601023721724, + "grad_norm": 0.7185621857643127, + "learning_rate": 8.376610332777901e-06, + "loss": 0.7311, + "step": 9667 + }, + { + "epoch": 0.5321151411745281, + "grad_norm": 0.7644047141075134, + "learning_rate": 8.376290628509969e-06, + "loss": 0.6919, + "step": 9668 + }, + { + "epoch": 0.5321701799768837, + "grad_norm": 0.7387600541114807, + "learning_rate": 8.37597089886674e-06, + "loss": 0.7285, + "step": 9669 + }, + { + "epoch": 0.5322252187792393, + "grad_norm": 0.7344895005226135, + "learning_rate": 8.375651143850614e-06, + "loss": 0.7514, + "step": 9670 + }, + { + "epoch": 0.532280257581595, + "grad_norm": 0.6930707097053528, + "learning_rate": 8.375331363464002e-06, + "loss": 0.8318, + "step": 9671 + }, + { + "epoch": 0.5323352963839507, + "grad_norm": 0.678162693977356, + "learning_rate": 8.3750115577093e-06, + "loss": 0.7123, + "step": 9672 + }, + { + "epoch": 0.5323903351863063, + "grad_norm": 0.7780481576919556, + "learning_rate": 8.374691726588914e-06, + "loss": 0.7672, + "step": 9673 + }, + { + "epoch": 0.532445373988662, + "grad_norm": 0.6664674282073975, + "learning_rate": 8.374371870105252e-06, + "loss": 0.6994, + "step": 9674 + }, + { + "epoch": 0.5325004127910177, + "grad_norm": 0.6952562928199768, + "learning_rate": 8.374051988260712e-06, + "loss": 0.8638, + "step": 9675 + }, + { + "epoch": 0.5325554515933734, + "grad_norm": 0.764005184173584, + "learning_rate": 8.373732081057699e-06, + "loss": 0.756, + "step": 9676 + }, + { + "epoch": 0.5326104903957289, + "grad_norm": 0.9434393048286438, + "learning_rate": 8.373412148498621e-06, + "loss": 0.8668, + "step": 9677 + }, + { + "epoch": 0.5326655291980846, + "grad_norm": 0.752609133720398, + "learning_rate": 8.373092190585878e-06, + "loss": 0.8078, + "step": 9678 + }, + { + "epoch": 0.5327205680004403, + "grad_norm": 0.671940803527832, + "learning_rate": 8.37277220732188e-06, + "loss": 0.7726, + "step": 9679 + }, + { + "epoch": 0.532775606802796, + "grad_norm": 0.7824863791465759, + "learning_rate": 8.372452198709027e-06, + "loss": 0.8246, + "step": 9680 + }, + { + "epoch": 0.5328306456051516, + "grad_norm": 0.7300587892532349, + "learning_rate": 8.372132164749726e-06, + "loss": 0.7953, + "step": 9681 + }, + { + "epoch": 0.5328856844075073, + "grad_norm": 0.7146018743515015, + "learning_rate": 8.371812105446384e-06, + "loss": 0.7409, + "step": 9682 + }, + { + "epoch": 0.532940723209863, + "grad_norm": 0.73857581615448, + "learning_rate": 8.371492020801404e-06, + "loss": 0.8067, + "step": 9683 + }, + { + "epoch": 0.5329957620122187, + "grad_norm": 0.6760877966880798, + "learning_rate": 8.37117191081719e-06, + "loss": 0.7363, + "step": 9684 + }, + { + "epoch": 0.5330508008145742, + "grad_norm": 0.766482412815094, + "learning_rate": 8.370851775496154e-06, + "loss": 0.7358, + "step": 9685 + }, + { + "epoch": 0.5331058396169299, + "grad_norm": 0.7230576276779175, + "learning_rate": 8.370531614840697e-06, + "loss": 0.8154, + "step": 9686 + }, + { + "epoch": 0.5331608784192856, + "grad_norm": 0.7357933521270752, + "learning_rate": 8.370211428853225e-06, + "loss": 0.7187, + "step": 9687 + }, + { + "epoch": 0.5332159172216413, + "grad_norm": 0.8208534121513367, + "learning_rate": 8.369891217536148e-06, + "loss": 0.8037, + "step": 9688 + }, + { + "epoch": 0.5332709560239969, + "grad_norm": 0.6771863698959351, + "learning_rate": 8.36957098089187e-06, + "loss": 0.733, + "step": 9689 + }, + { + "epoch": 0.5333259948263526, + "grad_norm": 0.6382480263710022, + "learning_rate": 8.369250718922798e-06, + "loss": 0.7391, + "step": 9690 + }, + { + "epoch": 0.5333810336287083, + "grad_norm": 0.6638994812965393, + "learning_rate": 8.368930431631342e-06, + "loss": 0.7176, + "step": 9691 + }, + { + "epoch": 0.533436072431064, + "grad_norm": 0.7599604725837708, + "learning_rate": 8.368610119019903e-06, + "loss": 0.8814, + "step": 9692 + }, + { + "epoch": 0.5334911112334195, + "grad_norm": 0.6896547079086304, + "learning_rate": 8.368289781090894e-06, + "loss": 0.7618, + "step": 9693 + }, + { + "epoch": 0.5335461500357752, + "grad_norm": 0.7081224918365479, + "learning_rate": 8.36796941784672e-06, + "loss": 0.656, + "step": 9694 + }, + { + "epoch": 0.5336011888381309, + "grad_norm": 0.8819646835327148, + "learning_rate": 8.367649029289791e-06, + "loss": 0.8946, + "step": 9695 + }, + { + "epoch": 0.5336562276404866, + "grad_norm": 0.6597925424575806, + "learning_rate": 8.367328615422512e-06, + "loss": 0.6891, + "step": 9696 + }, + { + "epoch": 0.5337112664428422, + "grad_norm": 0.6855770945549011, + "learning_rate": 8.367008176247294e-06, + "loss": 0.7158, + "step": 9697 + }, + { + "epoch": 0.5337663052451979, + "grad_norm": 0.6874905228614807, + "learning_rate": 8.366687711766541e-06, + "loss": 0.7445, + "step": 9698 + }, + { + "epoch": 0.5338213440475535, + "grad_norm": 0.6990895867347717, + "learning_rate": 8.366367221982666e-06, + "loss": 0.6189, + "step": 9699 + }, + { + "epoch": 0.5338763828499092, + "grad_norm": 0.7235365509986877, + "learning_rate": 8.366046706898075e-06, + "loss": 0.6406, + "step": 9700 + }, + { + "epoch": 0.5339314216522648, + "grad_norm": 0.7563154697418213, + "learning_rate": 8.36572616651518e-06, + "loss": 0.7798, + "step": 9701 + }, + { + "epoch": 0.5339864604546205, + "grad_norm": 0.6845980286598206, + "learning_rate": 8.365405600836387e-06, + "loss": 0.7665, + "step": 9702 + }, + { + "epoch": 0.5340414992569762, + "grad_norm": 0.6374378204345703, + "learning_rate": 8.365085009864106e-06, + "loss": 0.6935, + "step": 9703 + }, + { + "epoch": 0.5340965380593319, + "grad_norm": 0.726672887802124, + "learning_rate": 8.364764393600747e-06, + "loss": 0.7821, + "step": 9704 + }, + { + "epoch": 0.5341515768616875, + "grad_norm": 0.6784456372261047, + "learning_rate": 8.364443752048719e-06, + "loss": 0.7722, + "step": 9705 + }, + { + "epoch": 0.5342066156640431, + "grad_norm": 0.6344080567359924, + "learning_rate": 8.364123085210433e-06, + "loss": 0.7256, + "step": 9706 + }, + { + "epoch": 0.5342616544663988, + "grad_norm": 0.7913152575492859, + "learning_rate": 8.363802393088299e-06, + "loss": 0.7892, + "step": 9707 + }, + { + "epoch": 0.5343166932687544, + "grad_norm": 0.6792107820510864, + "learning_rate": 8.363481675684726e-06, + "loss": 0.7374, + "step": 9708 + }, + { + "epoch": 0.5343717320711101, + "grad_norm": 1.0153685808181763, + "learning_rate": 8.363160933002126e-06, + "loss": 0.7396, + "step": 9709 + }, + { + "epoch": 0.5344267708734658, + "grad_norm": 0.7655258774757385, + "learning_rate": 8.362840165042906e-06, + "loss": 0.7746, + "step": 9710 + }, + { + "epoch": 0.5344818096758215, + "grad_norm": 0.7830179929733276, + "learning_rate": 8.362519371809483e-06, + "loss": 0.7082, + "step": 9711 + }, + { + "epoch": 0.5345368484781771, + "grad_norm": 0.7410556674003601, + "learning_rate": 8.362198553304261e-06, + "loss": 0.7055, + "step": 9712 + }, + { + "epoch": 0.5345918872805328, + "grad_norm": 0.6542297005653381, + "learning_rate": 8.361877709529658e-06, + "loss": 0.7153, + "step": 9713 + }, + { + "epoch": 0.5346469260828884, + "grad_norm": 0.6752653121948242, + "learning_rate": 8.36155684048808e-06, + "loss": 0.6901, + "step": 9714 + }, + { + "epoch": 0.5347019648852441, + "grad_norm": 0.7158684134483337, + "learning_rate": 8.361235946181943e-06, + "loss": 0.7775, + "step": 9715 + }, + { + "epoch": 0.5347570036875997, + "grad_norm": 0.6174392700195312, + "learning_rate": 8.360915026613652e-06, + "loss": 0.6501, + "step": 9716 + }, + { + "epoch": 0.5348120424899554, + "grad_norm": 0.7110500931739807, + "learning_rate": 8.360594081785627e-06, + "loss": 0.742, + "step": 9717 + }, + { + "epoch": 0.5348670812923111, + "grad_norm": 0.8456488251686096, + "learning_rate": 8.360273111700276e-06, + "loss": 0.8237, + "step": 9718 + }, + { + "epoch": 0.5349221200946668, + "grad_norm": 0.6660711169242859, + "learning_rate": 8.359952116360011e-06, + "loss": 0.7856, + "step": 9719 + }, + { + "epoch": 0.5349771588970224, + "grad_norm": 0.7661204934120178, + "learning_rate": 8.359631095767244e-06, + "loss": 0.8336, + "step": 9720 + }, + { + "epoch": 0.535032197699378, + "grad_norm": 0.7747855186462402, + "learning_rate": 8.359310049924392e-06, + "loss": 0.7302, + "step": 9721 + }, + { + "epoch": 0.5350872365017337, + "grad_norm": 0.8156001567840576, + "learning_rate": 8.358988978833864e-06, + "loss": 0.7878, + "step": 9722 + }, + { + "epoch": 0.5351422753040894, + "grad_norm": 0.7371010780334473, + "learning_rate": 8.358667882498073e-06, + "loss": 0.803, + "step": 9723 + }, + { + "epoch": 0.535197314106445, + "grad_norm": 0.7141744494438171, + "learning_rate": 8.358346760919431e-06, + "loss": 0.687, + "step": 9724 + }, + { + "epoch": 0.5352523529088007, + "grad_norm": 0.6395956873893738, + "learning_rate": 8.358025614100358e-06, + "loss": 0.7052, + "step": 9725 + }, + { + "epoch": 0.5353073917111564, + "grad_norm": 0.7135289311408997, + "learning_rate": 8.35770444204326e-06, + "loss": 0.7882, + "step": 9726 + }, + { + "epoch": 0.5353624305135121, + "grad_norm": 0.702408492565155, + "learning_rate": 8.357383244750557e-06, + "loss": 0.6965, + "step": 9727 + }, + { + "epoch": 0.5354174693158676, + "grad_norm": 0.731193482875824, + "learning_rate": 8.357062022224658e-06, + "loss": 0.7525, + "step": 9728 + }, + { + "epoch": 0.5354725081182233, + "grad_norm": 0.8115057945251465, + "learning_rate": 8.356740774467982e-06, + "loss": 0.7466, + "step": 9729 + }, + { + "epoch": 0.535527546920579, + "grad_norm": 0.8644380569458008, + "learning_rate": 8.356419501482938e-06, + "loss": 0.7989, + "step": 9730 + }, + { + "epoch": 0.5355825857229347, + "grad_norm": 1.414620041847229, + "learning_rate": 8.356098203271945e-06, + "loss": 0.7782, + "step": 9731 + }, + { + "epoch": 0.5356376245252903, + "grad_norm": 0.7355421185493469, + "learning_rate": 8.355776879837417e-06, + "loss": 0.7163, + "step": 9732 + }, + { + "epoch": 0.535692663327646, + "grad_norm": 0.6556879281997681, + "learning_rate": 8.355455531181766e-06, + "loss": 0.7543, + "step": 9733 + }, + { + "epoch": 0.5357477021300017, + "grad_norm": 0.6632516980171204, + "learning_rate": 8.355134157307412e-06, + "loss": 0.7382, + "step": 9734 + }, + { + "epoch": 0.5358027409323574, + "grad_norm": 0.7096145153045654, + "learning_rate": 8.354812758216767e-06, + "loss": 0.7797, + "step": 9735 + }, + { + "epoch": 0.5358577797347129, + "grad_norm": 0.6404649019241333, + "learning_rate": 8.354491333912244e-06, + "loss": 0.6637, + "step": 9736 + }, + { + "epoch": 0.5359128185370686, + "grad_norm": 0.6987022757530212, + "learning_rate": 8.354169884396266e-06, + "loss": 0.7682, + "step": 9737 + }, + { + "epoch": 0.5359678573394243, + "grad_norm": 0.6593581438064575, + "learning_rate": 8.353848409671245e-06, + "loss": 0.6747, + "step": 9738 + }, + { + "epoch": 0.53602289614178, + "grad_norm": 0.6999880075454712, + "learning_rate": 8.353526909739596e-06, + "loss": 0.6659, + "step": 9739 + }, + { + "epoch": 0.5360779349441356, + "grad_norm": 0.6448989510536194, + "learning_rate": 8.353205384603735e-06, + "loss": 0.7297, + "step": 9740 + }, + { + "epoch": 0.5361329737464913, + "grad_norm": 0.6666765213012695, + "learning_rate": 8.352883834266082e-06, + "loss": 0.6459, + "step": 9741 + }, + { + "epoch": 0.536188012548847, + "grad_norm": 0.8020225763320923, + "learning_rate": 8.352562258729051e-06, + "loss": 0.8122, + "step": 9742 + }, + { + "epoch": 0.5362430513512026, + "grad_norm": 0.6883382201194763, + "learning_rate": 8.35224065799506e-06, + "loss": 0.7084, + "step": 9743 + }, + { + "epoch": 0.5362980901535582, + "grad_norm": 0.7366660237312317, + "learning_rate": 8.351919032066525e-06, + "loss": 0.848, + "step": 9744 + }, + { + "epoch": 0.5363531289559139, + "grad_norm": 0.7408311367034912, + "learning_rate": 8.351597380945863e-06, + "loss": 0.798, + "step": 9745 + }, + { + "epoch": 0.5364081677582696, + "grad_norm": 0.6841676235198975, + "learning_rate": 8.351275704635495e-06, + "loss": 0.7372, + "step": 9746 + }, + { + "epoch": 0.5364632065606253, + "grad_norm": 0.6903505325317383, + "learning_rate": 8.350954003137833e-06, + "loss": 0.7371, + "step": 9747 + }, + { + "epoch": 0.5365182453629809, + "grad_norm": 0.6444700956344604, + "learning_rate": 8.350632276455298e-06, + "loss": 0.6685, + "step": 9748 + }, + { + "epoch": 0.5365732841653366, + "grad_norm": 0.6821029186248779, + "learning_rate": 8.350310524590307e-06, + "loss": 0.8796, + "step": 9749 + }, + { + "epoch": 0.5366283229676923, + "grad_norm": 0.6733999848365784, + "learning_rate": 8.349988747545282e-06, + "loss": 0.6833, + "step": 9750 + }, + { + "epoch": 0.5366833617700478, + "grad_norm": 0.8097321391105652, + "learning_rate": 8.349666945322636e-06, + "loss": 0.834, + "step": 9751 + }, + { + "epoch": 0.5367384005724035, + "grad_norm": 0.7692395448684692, + "learning_rate": 8.34934511792479e-06, + "loss": 0.7866, + "step": 9752 + }, + { + "epoch": 0.5367934393747592, + "grad_norm": 0.7551112174987793, + "learning_rate": 8.349023265354164e-06, + "loss": 0.8378, + "step": 9753 + }, + { + "epoch": 0.5368484781771149, + "grad_norm": 0.5796393156051636, + "learning_rate": 8.348701387613176e-06, + "loss": 0.5995, + "step": 9754 + }, + { + "epoch": 0.5369035169794705, + "grad_norm": 0.6839799284934998, + "learning_rate": 8.348379484704244e-06, + "loss": 0.8262, + "step": 9755 + }, + { + "epoch": 0.5369585557818262, + "grad_norm": 0.7710869908332825, + "learning_rate": 8.348057556629786e-06, + "loss": 0.7796, + "step": 9756 + }, + { + "epoch": 0.5370135945841819, + "grad_norm": 0.733096718788147, + "learning_rate": 8.347735603392225e-06, + "loss": 0.8233, + "step": 9757 + }, + { + "epoch": 0.5370686333865375, + "grad_norm": 0.6438466906547546, + "learning_rate": 8.347413624993982e-06, + "loss": 0.7582, + "step": 9758 + }, + { + "epoch": 0.5371236721888931, + "grad_norm": 0.6877560615539551, + "learning_rate": 8.34709162143747e-06, + "loss": 0.7428, + "step": 9759 + }, + { + "epoch": 0.5371787109912488, + "grad_norm": 1.060831069946289, + "learning_rate": 8.346769592725115e-06, + "loss": 0.8636, + "step": 9760 + }, + { + "epoch": 0.5372337497936045, + "grad_norm": 0.6828434467315674, + "learning_rate": 8.346447538859334e-06, + "loss": 0.7801, + "step": 9761 + }, + { + "epoch": 0.5372887885959602, + "grad_norm": 0.6784753203392029, + "learning_rate": 8.346125459842552e-06, + "loss": 0.7356, + "step": 9762 + }, + { + "epoch": 0.5373438273983158, + "grad_norm": 0.6493560075759888, + "learning_rate": 8.345803355677185e-06, + "loss": 0.749, + "step": 9763 + }, + { + "epoch": 0.5373988662006715, + "grad_norm": 0.7109258770942688, + "learning_rate": 8.345481226365657e-06, + "loss": 0.7599, + "step": 9764 + }, + { + "epoch": 0.5374539050030271, + "grad_norm": 0.8526985049247742, + "learning_rate": 8.345159071910387e-06, + "loss": 0.6605, + "step": 9765 + }, + { + "epoch": 0.5375089438053828, + "grad_norm": 0.9194039702415466, + "learning_rate": 8.344836892313797e-06, + "loss": 0.794, + "step": 9766 + }, + { + "epoch": 0.5375639826077384, + "grad_norm": 0.7258954048156738, + "learning_rate": 8.344514687578307e-06, + "loss": 0.871, + "step": 9767 + }, + { + "epoch": 0.5376190214100941, + "grad_norm": 0.7099377512931824, + "learning_rate": 8.34419245770634e-06, + "loss": 0.8098, + "step": 9768 + }, + { + "epoch": 0.5376740602124498, + "grad_norm": 0.7883020639419556, + "learning_rate": 8.34387020270032e-06, + "loss": 0.8383, + "step": 9769 + }, + { + "epoch": 0.5377290990148055, + "grad_norm": 0.7009730339050293, + "learning_rate": 8.343547922562664e-06, + "loss": 0.7794, + "step": 9770 + }, + { + "epoch": 0.5377841378171611, + "grad_norm": 0.6569581031799316, + "learning_rate": 8.343225617295798e-06, + "loss": 0.7574, + "step": 9771 + }, + { + "epoch": 0.5378391766195167, + "grad_norm": 0.6159278154373169, + "learning_rate": 8.342903286902142e-06, + "loss": 0.7136, + "step": 9772 + }, + { + "epoch": 0.5378942154218724, + "grad_norm": 0.6594879627227783, + "learning_rate": 8.342580931384121e-06, + "loss": 0.6906, + "step": 9773 + }, + { + "epoch": 0.5379492542242281, + "grad_norm": 0.7002933025360107, + "learning_rate": 8.342258550744156e-06, + "loss": 0.7272, + "step": 9774 + }, + { + "epoch": 0.5380042930265837, + "grad_norm": 0.8243216276168823, + "learning_rate": 8.341936144984672e-06, + "loss": 0.8105, + "step": 9775 + }, + { + "epoch": 0.5380593318289394, + "grad_norm": 0.8358921408653259, + "learning_rate": 8.34161371410809e-06, + "loss": 0.7118, + "step": 9776 + }, + { + "epoch": 0.5381143706312951, + "grad_norm": 0.6339066028594971, + "learning_rate": 8.34129125811683e-06, + "loss": 0.7035, + "step": 9777 + }, + { + "epoch": 0.5381694094336508, + "grad_norm": 0.7407625317573547, + "learning_rate": 8.340968777013324e-06, + "loss": 0.7447, + "step": 9778 + }, + { + "epoch": 0.5382244482360063, + "grad_norm": 0.6876600384712219, + "learning_rate": 8.340646270799991e-06, + "loss": 0.7298, + "step": 9779 + }, + { + "epoch": 0.538279487038362, + "grad_norm": 0.7021264433860779, + "learning_rate": 8.340323739479251e-06, + "loss": 0.7869, + "step": 9780 + }, + { + "epoch": 0.5383345258407177, + "grad_norm": 0.7341023087501526, + "learning_rate": 8.340001183053535e-06, + "loss": 0.7447, + "step": 9781 + }, + { + "epoch": 0.5383895646430734, + "grad_norm": 0.6829406023025513, + "learning_rate": 8.339678601525263e-06, + "loss": 0.7438, + "step": 9782 + }, + { + "epoch": 0.538444603445429, + "grad_norm": 0.7671583294868469, + "learning_rate": 8.33935599489686e-06, + "loss": 0.8678, + "step": 9783 + }, + { + "epoch": 0.5384996422477847, + "grad_norm": 0.701797366142273, + "learning_rate": 8.339033363170753e-06, + "loss": 0.8431, + "step": 9784 + }, + { + "epoch": 0.5385546810501404, + "grad_norm": 0.748235285282135, + "learning_rate": 8.338710706349363e-06, + "loss": 0.7905, + "step": 9785 + }, + { + "epoch": 0.5386097198524961, + "grad_norm": 0.8202430605888367, + "learning_rate": 8.338388024435119e-06, + "loss": 0.7734, + "step": 9786 + }, + { + "epoch": 0.5386647586548516, + "grad_norm": 0.8218014240264893, + "learning_rate": 8.338065317430442e-06, + "loss": 0.846, + "step": 9787 + }, + { + "epoch": 0.5387197974572073, + "grad_norm": 0.6773214936256409, + "learning_rate": 8.337742585337762e-06, + "loss": 0.7692, + "step": 9788 + }, + { + "epoch": 0.538774836259563, + "grad_norm": 0.7011464834213257, + "learning_rate": 8.337419828159501e-06, + "loss": 0.7534, + "step": 9789 + }, + { + "epoch": 0.5388298750619187, + "grad_norm": 0.8299004435539246, + "learning_rate": 8.337097045898087e-06, + "loss": 0.7997, + "step": 9790 + }, + { + "epoch": 0.5388849138642743, + "grad_norm": 0.8600753545761108, + "learning_rate": 8.336774238555942e-06, + "loss": 0.8307, + "step": 9791 + }, + { + "epoch": 0.53893995266663, + "grad_norm": 0.676490843296051, + "learning_rate": 8.336451406135498e-06, + "loss": 0.7748, + "step": 9792 + }, + { + "epoch": 0.5389949914689857, + "grad_norm": 0.7094627618789673, + "learning_rate": 8.336128548639177e-06, + "loss": 0.7524, + "step": 9793 + }, + { + "epoch": 0.5390500302713412, + "grad_norm": 0.6804066896438599, + "learning_rate": 8.335805666069407e-06, + "loss": 0.8299, + "step": 9794 + }, + { + "epoch": 0.5391050690736969, + "grad_norm": 0.6992025971412659, + "learning_rate": 8.335482758428614e-06, + "loss": 0.7548, + "step": 9795 + }, + { + "epoch": 0.5391601078760526, + "grad_norm": 0.6649640798568726, + "learning_rate": 8.335159825719227e-06, + "loss": 0.6595, + "step": 9796 + }, + { + "epoch": 0.5392151466784083, + "grad_norm": 0.7292002439498901, + "learning_rate": 8.33483686794367e-06, + "loss": 0.7944, + "step": 9797 + }, + { + "epoch": 0.5392701854807639, + "grad_norm": 0.9124587178230286, + "learning_rate": 8.334513885104375e-06, + "loss": 0.8586, + "step": 9798 + }, + { + "epoch": 0.5393252242831196, + "grad_norm": 0.7091020941734314, + "learning_rate": 8.334190877203761e-06, + "loss": 0.7019, + "step": 9799 + }, + { + "epoch": 0.5393802630854753, + "grad_norm": 0.7470952272415161, + "learning_rate": 8.333867844244265e-06, + "loss": 0.7866, + "step": 9800 + }, + { + "epoch": 0.539435301887831, + "grad_norm": 0.7368966341018677, + "learning_rate": 8.333544786228309e-06, + "loss": 0.8135, + "step": 9801 + }, + { + "epoch": 0.5394903406901865, + "grad_norm": 0.668305516242981, + "learning_rate": 8.333221703158322e-06, + "loss": 0.7549, + "step": 9802 + }, + { + "epoch": 0.5395453794925422, + "grad_norm": 0.6788874268531799, + "learning_rate": 8.332898595036735e-06, + "loss": 0.8077, + "step": 9803 + }, + { + "epoch": 0.5396004182948979, + "grad_norm": 0.654863715171814, + "learning_rate": 8.332575461865972e-06, + "loss": 0.7695, + "step": 9804 + }, + { + "epoch": 0.5396554570972536, + "grad_norm": 0.7460314631462097, + "learning_rate": 8.332252303648464e-06, + "loss": 0.7711, + "step": 9805 + }, + { + "epoch": 0.5397104958996092, + "grad_norm": 0.7923582792282104, + "learning_rate": 8.331929120386643e-06, + "loss": 0.7348, + "step": 9806 + }, + { + "epoch": 0.5397655347019649, + "grad_norm": 0.6570843458175659, + "learning_rate": 8.331605912082932e-06, + "loss": 0.7029, + "step": 9807 + }, + { + "epoch": 0.5398205735043206, + "grad_norm": 0.7728865742683411, + "learning_rate": 8.331282678739762e-06, + "loss": 0.8249, + "step": 9808 + }, + { + "epoch": 0.5398756123066762, + "grad_norm": 0.7121468186378479, + "learning_rate": 8.330959420359565e-06, + "loss": 0.8698, + "step": 9809 + }, + { + "epoch": 0.5399306511090318, + "grad_norm": 0.7779444456100464, + "learning_rate": 8.330636136944768e-06, + "loss": 0.7448, + "step": 9810 + }, + { + "epoch": 0.5399856899113875, + "grad_norm": 0.7770833373069763, + "learning_rate": 8.330312828497801e-06, + "loss": 0.8489, + "step": 9811 + }, + { + "epoch": 0.5400407287137432, + "grad_norm": 0.6705769896507263, + "learning_rate": 8.329989495021096e-06, + "loss": 0.7349, + "step": 9812 + }, + { + "epoch": 0.5400957675160989, + "grad_norm": 0.6775381565093994, + "learning_rate": 8.329666136517079e-06, + "loss": 0.8093, + "step": 9813 + }, + { + "epoch": 0.5401508063184545, + "grad_norm": 0.6621832251548767, + "learning_rate": 8.329342752988183e-06, + "loss": 0.7877, + "step": 9814 + }, + { + "epoch": 0.5402058451208102, + "grad_norm": 0.704339861869812, + "learning_rate": 8.329019344436839e-06, + "loss": 0.7708, + "step": 9815 + }, + { + "epoch": 0.5402608839231658, + "grad_norm": 0.789944052696228, + "learning_rate": 8.328695910865476e-06, + "loss": 0.7563, + "step": 9816 + }, + { + "epoch": 0.5403159227255215, + "grad_norm": 0.6997420191764832, + "learning_rate": 8.328372452276525e-06, + "loss": 0.7023, + "step": 9817 + }, + { + "epoch": 0.5403709615278771, + "grad_norm": 0.6453180313110352, + "learning_rate": 8.328048968672418e-06, + "loss": 0.7193, + "step": 9818 + }, + { + "epoch": 0.5404260003302328, + "grad_norm": 0.7059640884399414, + "learning_rate": 8.327725460055586e-06, + "loss": 0.7875, + "step": 9819 + }, + { + "epoch": 0.5404810391325885, + "grad_norm": 0.7725005745887756, + "learning_rate": 8.327401926428461e-06, + "loss": 0.7503, + "step": 9820 + }, + { + "epoch": 0.5405360779349442, + "grad_norm": 0.7710940837860107, + "learning_rate": 8.327078367793473e-06, + "loss": 0.8314, + "step": 9821 + }, + { + "epoch": 0.5405911167372998, + "grad_norm": 0.9090666770935059, + "learning_rate": 8.326754784153055e-06, + "loss": 0.8021, + "step": 9822 + }, + { + "epoch": 0.5406461555396554, + "grad_norm": 0.7135322690010071, + "learning_rate": 8.326431175509638e-06, + "loss": 0.8084, + "step": 9823 + }, + { + "epoch": 0.5407011943420111, + "grad_norm": 0.9126102328300476, + "learning_rate": 8.326107541865656e-06, + "loss": 0.75, + "step": 9824 + }, + { + "epoch": 0.5407562331443668, + "grad_norm": 0.7263361215591431, + "learning_rate": 8.325783883223539e-06, + "loss": 0.6808, + "step": 9825 + }, + { + "epoch": 0.5408112719467224, + "grad_norm": 0.7234700918197632, + "learning_rate": 8.32546019958572e-06, + "loss": 0.7582, + "step": 9826 + }, + { + "epoch": 0.5408663107490781, + "grad_norm": 0.7043294310569763, + "learning_rate": 8.325136490954633e-06, + "loss": 0.8421, + "step": 9827 + }, + { + "epoch": 0.5409213495514338, + "grad_norm": 0.7947664856910706, + "learning_rate": 8.32481275733271e-06, + "loss": 0.8672, + "step": 9828 + }, + { + "epoch": 0.5409763883537895, + "grad_norm": 0.704590916633606, + "learning_rate": 8.324488998722384e-06, + "loss": 0.7356, + "step": 9829 + }, + { + "epoch": 0.541031427156145, + "grad_norm": 0.7630662322044373, + "learning_rate": 8.32416521512609e-06, + "loss": 0.7082, + "step": 9830 + }, + { + "epoch": 0.5410864659585007, + "grad_norm": 0.728721022605896, + "learning_rate": 8.323841406546259e-06, + "loss": 0.7987, + "step": 9831 + }, + { + "epoch": 0.5411415047608564, + "grad_norm": 0.7164294719696045, + "learning_rate": 8.323517572985326e-06, + "loss": 0.721, + "step": 9832 + }, + { + "epoch": 0.5411965435632121, + "grad_norm": 0.7555723190307617, + "learning_rate": 8.323193714445722e-06, + "loss": 0.814, + "step": 9833 + }, + { + "epoch": 0.5412515823655677, + "grad_norm": 0.827485978603363, + "learning_rate": 8.322869830929887e-06, + "loss": 0.8817, + "step": 9834 + }, + { + "epoch": 0.5413066211679234, + "grad_norm": 0.718950092792511, + "learning_rate": 8.322545922440252e-06, + "loss": 0.8648, + "step": 9835 + }, + { + "epoch": 0.5413616599702791, + "grad_norm": 0.7361611723899841, + "learning_rate": 8.32222198897925e-06, + "loss": 0.7392, + "step": 9836 + }, + { + "epoch": 0.5414166987726347, + "grad_norm": 0.6712168455123901, + "learning_rate": 8.321898030549316e-06, + "loss": 0.7505, + "step": 9837 + }, + { + "epoch": 0.5414717375749903, + "grad_norm": 0.7475710511207581, + "learning_rate": 8.321574047152887e-06, + "loss": 0.7969, + "step": 9838 + }, + { + "epoch": 0.541526776377346, + "grad_norm": 0.9751361608505249, + "learning_rate": 8.321250038792397e-06, + "loss": 0.8534, + "step": 9839 + }, + { + "epoch": 0.5415818151797017, + "grad_norm": 0.6858723163604736, + "learning_rate": 8.32092600547028e-06, + "loss": 0.8277, + "step": 9840 + }, + { + "epoch": 0.5416368539820573, + "grad_norm": 0.8899725675582886, + "learning_rate": 8.320601947188971e-06, + "loss": 0.8599, + "step": 9841 + }, + { + "epoch": 0.541691892784413, + "grad_norm": 0.7140665650367737, + "learning_rate": 8.320277863950907e-06, + "loss": 0.7429, + "step": 9842 + }, + { + "epoch": 0.5417469315867687, + "grad_norm": 0.7467615604400635, + "learning_rate": 8.319953755758525e-06, + "loss": 0.7826, + "step": 9843 + }, + { + "epoch": 0.5418019703891244, + "grad_norm": 0.6578202843666077, + "learning_rate": 8.319629622614258e-06, + "loss": 0.6833, + "step": 9844 + }, + { + "epoch": 0.5418570091914799, + "grad_norm": 0.9430698156356812, + "learning_rate": 8.319305464520543e-06, + "loss": 0.8243, + "step": 9845 + }, + { + "epoch": 0.5419120479938356, + "grad_norm": 0.8632097840309143, + "learning_rate": 8.318981281479817e-06, + "loss": 0.7975, + "step": 9846 + }, + { + "epoch": 0.5419670867961913, + "grad_norm": 0.7241839170455933, + "learning_rate": 8.318657073494517e-06, + "loss": 0.7226, + "step": 9847 + }, + { + "epoch": 0.542022125598547, + "grad_norm": 0.6927164196968079, + "learning_rate": 8.318332840567078e-06, + "loss": 0.7125, + "step": 9848 + }, + { + "epoch": 0.5420771644009026, + "grad_norm": 0.6414939761161804, + "learning_rate": 8.318008582699937e-06, + "loss": 0.7366, + "step": 9849 + }, + { + "epoch": 0.5421322032032583, + "grad_norm": 0.7584436535835266, + "learning_rate": 8.317684299895533e-06, + "loss": 0.8601, + "step": 9850 + }, + { + "epoch": 0.542187242005614, + "grad_norm": 0.6045856475830078, + "learning_rate": 8.317359992156302e-06, + "loss": 0.6697, + "step": 9851 + }, + { + "epoch": 0.5422422808079697, + "grad_norm": 0.715048611164093, + "learning_rate": 8.31703565948468e-06, + "loss": 0.7535, + "step": 9852 + }, + { + "epoch": 0.5422973196103252, + "grad_norm": 0.6925113201141357, + "learning_rate": 8.316711301883106e-06, + "loss": 0.8122, + "step": 9853 + }, + { + "epoch": 0.5423523584126809, + "grad_norm": 0.6787780523300171, + "learning_rate": 8.316386919354018e-06, + "loss": 0.7428, + "step": 9854 + }, + { + "epoch": 0.5424073972150366, + "grad_norm": 0.6831366419792175, + "learning_rate": 8.316062511899855e-06, + "loss": 0.767, + "step": 9855 + }, + { + "epoch": 0.5424624360173923, + "grad_norm": 0.6865691542625427, + "learning_rate": 8.315738079523053e-06, + "loss": 0.6549, + "step": 9856 + }, + { + "epoch": 0.5425174748197479, + "grad_norm": 0.7149406671524048, + "learning_rate": 8.31541362222605e-06, + "loss": 0.8127, + "step": 9857 + }, + { + "epoch": 0.5425725136221036, + "grad_norm": 0.6826779842376709, + "learning_rate": 8.315089140011286e-06, + "loss": 0.706, + "step": 9858 + }, + { + "epoch": 0.5426275524244593, + "grad_norm": 0.688204288482666, + "learning_rate": 8.3147646328812e-06, + "loss": 0.8675, + "step": 9859 + }, + { + "epoch": 0.542682591226815, + "grad_norm": 0.6659492254257202, + "learning_rate": 8.31444010083823e-06, + "loss": 0.7851, + "step": 9860 + }, + { + "epoch": 0.5427376300291705, + "grad_norm": 0.8049291372299194, + "learning_rate": 8.314115543884816e-06, + "loss": 0.7442, + "step": 9861 + }, + { + "epoch": 0.5427926688315262, + "grad_norm": 0.7505989670753479, + "learning_rate": 8.313790962023397e-06, + "loss": 0.8391, + "step": 9862 + }, + { + "epoch": 0.5428477076338819, + "grad_norm": 0.6810199618339539, + "learning_rate": 8.31346635525641e-06, + "loss": 0.8131, + "step": 9863 + }, + { + "epoch": 0.5429027464362376, + "grad_norm": 0.6724215745925903, + "learning_rate": 8.313141723586298e-06, + "loss": 0.75, + "step": 9864 + }, + { + "epoch": 0.5429577852385932, + "grad_norm": 0.7804376482963562, + "learning_rate": 8.3128170670155e-06, + "loss": 0.704, + "step": 9865 + }, + { + "epoch": 0.5430128240409489, + "grad_norm": 0.9494230151176453, + "learning_rate": 8.312492385546455e-06, + "loss": 0.8578, + "step": 9866 + }, + { + "epoch": 0.5430678628433045, + "grad_norm": 0.6780333518981934, + "learning_rate": 8.312167679181606e-06, + "loss": 0.701, + "step": 9867 + }, + { + "epoch": 0.5431229016456602, + "grad_norm": 0.7407701015472412, + "learning_rate": 8.31184294792339e-06, + "loss": 0.8505, + "step": 9868 + }, + { + "epoch": 0.5431779404480158, + "grad_norm": 0.680903434753418, + "learning_rate": 8.311518191774249e-06, + "loss": 0.7645, + "step": 9869 + }, + { + "epoch": 0.5432329792503715, + "grad_norm": 0.6695752143859863, + "learning_rate": 8.311193410736622e-06, + "loss": 0.816, + "step": 9870 + }, + { + "epoch": 0.5432880180527272, + "grad_norm": 0.6725142598152161, + "learning_rate": 8.310868604812954e-06, + "loss": 0.7044, + "step": 9871 + }, + { + "epoch": 0.5433430568550829, + "grad_norm": 0.922627866268158, + "learning_rate": 8.310543774005684e-06, + "loss": 0.7589, + "step": 9872 + }, + { + "epoch": 0.5433980956574385, + "grad_norm": 1.0136839151382446, + "learning_rate": 8.310218918317251e-06, + "loss": 0.7573, + "step": 9873 + }, + { + "epoch": 0.5434531344597942, + "grad_norm": 0.9053532481193542, + "learning_rate": 8.309894037750099e-06, + "loss": 0.8269, + "step": 9874 + }, + { + "epoch": 0.5435081732621498, + "grad_norm": 0.6800149083137512, + "learning_rate": 8.309569132306671e-06, + "loss": 0.716, + "step": 9875 + }, + { + "epoch": 0.5435632120645055, + "grad_norm": 0.7157679796218872, + "learning_rate": 8.309244201989408e-06, + "loss": 0.7433, + "step": 9876 + }, + { + "epoch": 0.5436182508668611, + "grad_norm": 0.9316089749336243, + "learning_rate": 8.308919246800748e-06, + "loss": 0.7499, + "step": 9877 + }, + { + "epoch": 0.5436732896692168, + "grad_norm": 0.6682490110397339, + "learning_rate": 8.308594266743139e-06, + "loss": 0.7286, + "step": 9878 + }, + { + "epoch": 0.5437283284715725, + "grad_norm": 0.7241143584251404, + "learning_rate": 8.308269261819022e-06, + "loss": 0.7934, + "step": 9879 + }, + { + "epoch": 0.5437833672739281, + "grad_norm": 0.7402396202087402, + "learning_rate": 8.307944232030838e-06, + "loss": 0.7361, + "step": 9880 + }, + { + "epoch": 0.5438384060762838, + "grad_norm": 0.6839993596076965, + "learning_rate": 8.307619177381029e-06, + "loss": 0.749, + "step": 9881 + }, + { + "epoch": 0.5438934448786394, + "grad_norm": 0.6536363363265991, + "learning_rate": 8.307294097872041e-06, + "loss": 0.706, + "step": 9882 + }, + { + "epoch": 0.5439484836809951, + "grad_norm": 0.602644681930542, + "learning_rate": 8.306968993506317e-06, + "loss": 0.6857, + "step": 9883 + }, + { + "epoch": 0.5440035224833507, + "grad_norm": 0.6567881107330322, + "learning_rate": 8.306643864286297e-06, + "loss": 0.6989, + "step": 9884 + }, + { + "epoch": 0.5440585612857064, + "grad_norm": 1.0013506412506104, + "learning_rate": 8.306318710214427e-06, + "loss": 0.7251, + "step": 9885 + }, + { + "epoch": 0.5441136000880621, + "grad_norm": 0.7016813158988953, + "learning_rate": 8.305993531293153e-06, + "loss": 0.7535, + "step": 9886 + }, + { + "epoch": 0.5441686388904178, + "grad_norm": 0.7345741391181946, + "learning_rate": 8.305668327524915e-06, + "loss": 0.887, + "step": 9887 + }, + { + "epoch": 0.5442236776927734, + "grad_norm": 1.0925754308700562, + "learning_rate": 8.305343098912158e-06, + "loss": 0.7779, + "step": 9888 + }, + { + "epoch": 0.544278716495129, + "grad_norm": 0.79815274477005, + "learning_rate": 8.305017845457328e-06, + "loss": 0.7736, + "step": 9889 + }, + { + "epoch": 0.5443337552974847, + "grad_norm": 0.6324154138565063, + "learning_rate": 8.304692567162868e-06, + "loss": 0.6823, + "step": 9890 + }, + { + "epoch": 0.5443887940998404, + "grad_norm": 0.6990262866020203, + "learning_rate": 8.304367264031223e-06, + "loss": 0.7804, + "step": 9891 + }, + { + "epoch": 0.544443832902196, + "grad_norm": 1.4203195571899414, + "learning_rate": 8.304041936064839e-06, + "loss": 0.8702, + "step": 9892 + }, + { + "epoch": 0.5444988717045517, + "grad_norm": 0.6986544132232666, + "learning_rate": 8.303716583266161e-06, + "loss": 0.7666, + "step": 9893 + }, + { + "epoch": 0.5445539105069074, + "grad_norm": 0.7037138938903809, + "learning_rate": 8.303391205637632e-06, + "loss": 0.7995, + "step": 9894 + }, + { + "epoch": 0.5446089493092631, + "grad_norm": 0.7101728320121765, + "learning_rate": 8.3030658031817e-06, + "loss": 0.8185, + "step": 9895 + }, + { + "epoch": 0.5446639881116186, + "grad_norm": 0.6571425795555115, + "learning_rate": 8.302740375900808e-06, + "loss": 0.6152, + "step": 9896 + }, + { + "epoch": 0.5447190269139743, + "grad_norm": 0.7560263276100159, + "learning_rate": 8.302414923797406e-06, + "loss": 0.9037, + "step": 9897 + }, + { + "epoch": 0.54477406571633, + "grad_norm": 0.8692007064819336, + "learning_rate": 8.302089446873935e-06, + "loss": 0.7689, + "step": 9898 + }, + { + "epoch": 0.5448291045186857, + "grad_norm": 0.7533506751060486, + "learning_rate": 8.301763945132845e-06, + "loss": 0.7671, + "step": 9899 + }, + { + "epoch": 0.5448841433210413, + "grad_norm": 0.6992233991622925, + "learning_rate": 8.301438418576581e-06, + "loss": 0.723, + "step": 9900 + }, + { + "epoch": 0.544939182123397, + "grad_norm": 0.7966120839118958, + "learning_rate": 8.301112867207589e-06, + "loss": 0.7968, + "step": 9901 + }, + { + "epoch": 0.5449942209257527, + "grad_norm": 0.800558865070343, + "learning_rate": 8.300787291028316e-06, + "loss": 0.8583, + "step": 9902 + }, + { + "epoch": 0.5450492597281084, + "grad_norm": 0.7019909024238586, + "learning_rate": 8.30046169004121e-06, + "loss": 0.7045, + "step": 9903 + }, + { + "epoch": 0.5451042985304639, + "grad_norm": 0.7778449654579163, + "learning_rate": 8.300136064248717e-06, + "loss": 0.7964, + "step": 9904 + }, + { + "epoch": 0.5451593373328196, + "grad_norm": 0.6894309520721436, + "learning_rate": 8.299810413653284e-06, + "loss": 0.7382, + "step": 9905 + }, + { + "epoch": 0.5452143761351753, + "grad_norm": 0.6942182183265686, + "learning_rate": 8.299484738257361e-06, + "loss": 0.73, + "step": 9906 + }, + { + "epoch": 0.545269414937531, + "grad_norm": 0.6607787609100342, + "learning_rate": 8.299159038063394e-06, + "loss": 0.6987, + "step": 9907 + }, + { + "epoch": 0.5453244537398866, + "grad_norm": 0.7447709441184998, + "learning_rate": 8.29883331307383e-06, + "loss": 0.7787, + "step": 9908 + }, + { + "epoch": 0.5453794925422423, + "grad_norm": 0.6315301656723022, + "learning_rate": 8.298507563291116e-06, + "loss": 0.7047, + "step": 9909 + }, + { + "epoch": 0.545434531344598, + "grad_norm": 0.8095656633377075, + "learning_rate": 8.298181788717705e-06, + "loss": 0.691, + "step": 9910 + }, + { + "epoch": 0.5454895701469537, + "grad_norm": 0.6419453024864197, + "learning_rate": 8.29785598935604e-06, + "loss": 0.7333, + "step": 9911 + }, + { + "epoch": 0.5455446089493092, + "grad_norm": 0.7209222316741943, + "learning_rate": 8.297530165208574e-06, + "loss": 0.8174, + "step": 9912 + }, + { + "epoch": 0.5455996477516649, + "grad_norm": 0.6778598427772522, + "learning_rate": 8.297204316277754e-06, + "loss": 0.7696, + "step": 9913 + }, + { + "epoch": 0.5456546865540206, + "grad_norm": 0.6573307514190674, + "learning_rate": 8.296878442566028e-06, + "loss": 0.7843, + "step": 9914 + }, + { + "epoch": 0.5457097253563763, + "grad_norm": 0.6987473964691162, + "learning_rate": 8.296552544075847e-06, + "loss": 0.809, + "step": 9915 + }, + { + "epoch": 0.5457647641587319, + "grad_norm": 0.7149204015731812, + "learning_rate": 8.29622662080966e-06, + "loss": 0.848, + "step": 9916 + }, + { + "epoch": 0.5458198029610876, + "grad_norm": 0.6252632141113281, + "learning_rate": 8.295900672769913e-06, + "loss": 0.7029, + "step": 9917 + }, + { + "epoch": 0.5458748417634433, + "grad_norm": 0.713376522064209, + "learning_rate": 8.295574699959062e-06, + "loss": 0.726, + "step": 9918 + }, + { + "epoch": 0.5459298805657989, + "grad_norm": 0.6864717602729797, + "learning_rate": 8.295248702379552e-06, + "loss": 0.7428, + "step": 9919 + }, + { + "epoch": 0.5459849193681545, + "grad_norm": 0.8085678219795227, + "learning_rate": 8.294922680033837e-06, + "loss": 0.8697, + "step": 9920 + }, + { + "epoch": 0.5460399581705102, + "grad_norm": 0.7366700768470764, + "learning_rate": 8.294596632924363e-06, + "loss": 0.7714, + "step": 9921 + }, + { + "epoch": 0.5460949969728659, + "grad_norm": 0.670632541179657, + "learning_rate": 8.294270561053583e-06, + "loss": 0.7032, + "step": 9922 + }, + { + "epoch": 0.5461500357752215, + "grad_norm": 0.7867220640182495, + "learning_rate": 8.293944464423946e-06, + "loss": 0.8903, + "step": 9923 + }, + { + "epoch": 0.5462050745775772, + "grad_norm": 0.8441565632820129, + "learning_rate": 8.293618343037907e-06, + "loss": 0.8694, + "step": 9924 + }, + { + "epoch": 0.5462601133799329, + "grad_norm": 0.7048027515411377, + "learning_rate": 8.293292196897913e-06, + "loss": 0.8226, + "step": 9925 + }, + { + "epoch": 0.5463151521822885, + "grad_norm": 0.6344078779220581, + "learning_rate": 8.292966026006416e-06, + "loss": 0.7615, + "step": 9926 + }, + { + "epoch": 0.5463701909846441, + "grad_norm": 0.6744484901428223, + "learning_rate": 8.292639830365867e-06, + "loss": 0.6944, + "step": 9927 + }, + { + "epoch": 0.5464252297869998, + "grad_norm": 0.8113303780555725, + "learning_rate": 8.292313609978721e-06, + "loss": 0.7558, + "step": 9928 + }, + { + "epoch": 0.5464802685893555, + "grad_norm": 0.640190839767456, + "learning_rate": 8.291987364847425e-06, + "loss": 0.7167, + "step": 9929 + }, + { + "epoch": 0.5465353073917112, + "grad_norm": 0.7714816331863403, + "learning_rate": 8.291661094974434e-06, + "loss": 0.8662, + "step": 9930 + }, + { + "epoch": 0.5465903461940668, + "grad_norm": 0.6785402894020081, + "learning_rate": 8.291334800362199e-06, + "loss": 0.6835, + "step": 9931 + }, + { + "epoch": 0.5466453849964225, + "grad_norm": 0.704868495464325, + "learning_rate": 8.291008481013173e-06, + "loss": 0.7343, + "step": 9932 + }, + { + "epoch": 0.5467004237987781, + "grad_norm": 0.7587466239929199, + "learning_rate": 8.290682136929809e-06, + "loss": 0.7856, + "step": 9933 + }, + { + "epoch": 0.5467554626011338, + "grad_norm": 0.7460505962371826, + "learning_rate": 8.290355768114557e-06, + "loss": 0.7463, + "step": 9934 + }, + { + "epoch": 0.5468105014034894, + "grad_norm": 0.7185021042823792, + "learning_rate": 8.290029374569873e-06, + "loss": 0.8106, + "step": 9935 + }, + { + "epoch": 0.5468655402058451, + "grad_norm": 0.7023874521255493, + "learning_rate": 8.289702956298209e-06, + "loss": 0.6863, + "step": 9936 + }, + { + "epoch": 0.5469205790082008, + "grad_norm": 0.8688495755195618, + "learning_rate": 8.289376513302017e-06, + "loss": 0.8898, + "step": 9937 + }, + { + "epoch": 0.5469756178105565, + "grad_norm": 0.6405122876167297, + "learning_rate": 8.289050045583752e-06, + "loss": 0.6804, + "step": 9938 + }, + { + "epoch": 0.5470306566129121, + "grad_norm": 0.8364881277084351, + "learning_rate": 8.288723553145868e-06, + "loss": 0.8356, + "step": 9939 + }, + { + "epoch": 0.5470856954152677, + "grad_norm": 0.6621617078781128, + "learning_rate": 8.288397035990818e-06, + "loss": 0.7508, + "step": 9940 + }, + { + "epoch": 0.5471407342176234, + "grad_norm": 0.6822347640991211, + "learning_rate": 8.288070494121056e-06, + "loss": 0.7722, + "step": 9941 + }, + { + "epoch": 0.5471957730199791, + "grad_norm": 0.6727223992347717, + "learning_rate": 8.287743927539036e-06, + "loss": 0.743, + "step": 9942 + }, + { + "epoch": 0.5472508118223347, + "grad_norm": 0.7852441668510437, + "learning_rate": 8.287417336247214e-06, + "loss": 0.8321, + "step": 9943 + }, + { + "epoch": 0.5473058506246904, + "grad_norm": 0.6982126235961914, + "learning_rate": 8.287090720248041e-06, + "loss": 0.6669, + "step": 9944 + }, + { + "epoch": 0.5473608894270461, + "grad_norm": 0.7820166945457458, + "learning_rate": 8.286764079543976e-06, + "loss": 0.7592, + "step": 9945 + }, + { + "epoch": 0.5474159282294018, + "grad_norm": 0.6868422627449036, + "learning_rate": 8.28643741413747e-06, + "loss": 0.8308, + "step": 9946 + }, + { + "epoch": 0.5474709670317573, + "grad_norm": 0.8227942585945129, + "learning_rate": 8.286110724030982e-06, + "loss": 0.7982, + "step": 9947 + }, + { + "epoch": 0.547526005834113, + "grad_norm": 0.6838171482086182, + "learning_rate": 8.285784009226964e-06, + "loss": 0.7907, + "step": 9948 + }, + { + "epoch": 0.5475810446364687, + "grad_norm": 0.7200812697410583, + "learning_rate": 8.285457269727875e-06, + "loss": 0.88, + "step": 9949 + }, + { + "epoch": 0.5476360834388244, + "grad_norm": 0.7469412684440613, + "learning_rate": 8.285130505536168e-06, + "loss": 0.8167, + "step": 9950 + }, + { + "epoch": 0.54769112224118, + "grad_norm": 0.6660227179527283, + "learning_rate": 8.284803716654298e-06, + "loss": 0.7685, + "step": 9951 + }, + { + "epoch": 0.5477461610435357, + "grad_norm": 0.7116572260856628, + "learning_rate": 8.284476903084723e-06, + "loss": 0.7415, + "step": 9952 + }, + { + "epoch": 0.5478011998458914, + "grad_norm": 0.6540791988372803, + "learning_rate": 8.284150064829899e-06, + "loss": 0.6571, + "step": 9953 + }, + { + "epoch": 0.5478562386482471, + "grad_norm": 0.7527759075164795, + "learning_rate": 8.283823201892283e-06, + "loss": 0.8678, + "step": 9954 + }, + { + "epoch": 0.5479112774506026, + "grad_norm": 0.7795953750610352, + "learning_rate": 8.283496314274331e-06, + "loss": 0.8086, + "step": 9955 + }, + { + "epoch": 0.5479663162529583, + "grad_norm": 0.862503170967102, + "learning_rate": 8.283169401978498e-06, + "loss": 0.7442, + "step": 9956 + }, + { + "epoch": 0.548021355055314, + "grad_norm": 0.6552054286003113, + "learning_rate": 8.282842465007244e-06, + "loss": 0.6664, + "step": 9957 + }, + { + "epoch": 0.5480763938576697, + "grad_norm": 0.7242427468299866, + "learning_rate": 8.282515503363024e-06, + "loss": 0.8199, + "step": 9958 + }, + { + "epoch": 0.5481314326600253, + "grad_norm": 0.7529763579368591, + "learning_rate": 8.282188517048295e-06, + "loss": 0.761, + "step": 9959 + }, + { + "epoch": 0.548186471462381, + "grad_norm": 0.7909425497055054, + "learning_rate": 8.281861506065519e-06, + "loss": 0.7389, + "step": 9960 + }, + { + "epoch": 0.5482415102647367, + "grad_norm": 0.6594850420951843, + "learning_rate": 8.281534470417147e-06, + "loss": 0.7473, + "step": 9961 + }, + { + "epoch": 0.5482965490670924, + "grad_norm": 0.6900844573974609, + "learning_rate": 8.281207410105642e-06, + "loss": 0.7551, + "step": 9962 + }, + { + "epoch": 0.5483515878694479, + "grad_norm": 0.6922640204429626, + "learning_rate": 8.28088032513346e-06, + "loss": 0.7654, + "step": 9963 + }, + { + "epoch": 0.5484066266718036, + "grad_norm": 0.7758432626724243, + "learning_rate": 8.28055321550306e-06, + "loss": 0.8033, + "step": 9964 + }, + { + "epoch": 0.5484616654741593, + "grad_norm": 0.7074280977249146, + "learning_rate": 8.2802260812169e-06, + "loss": 0.7302, + "step": 9965 + }, + { + "epoch": 0.5485167042765149, + "grad_norm": 0.7724928259849548, + "learning_rate": 8.27989892227744e-06, + "loss": 0.7621, + "step": 9966 + }, + { + "epoch": 0.5485717430788706, + "grad_norm": 0.7364168167114258, + "learning_rate": 8.279571738687137e-06, + "loss": 0.7587, + "step": 9967 + }, + { + "epoch": 0.5486267818812263, + "grad_norm": 0.7298350930213928, + "learning_rate": 8.27924453044845e-06, + "loss": 0.7371, + "step": 9968 + }, + { + "epoch": 0.548681820683582, + "grad_norm": 0.8056737780570984, + "learning_rate": 8.27891729756384e-06, + "loss": 0.9871, + "step": 9969 + }, + { + "epoch": 0.5487368594859375, + "grad_norm": 0.7499688267707825, + "learning_rate": 8.278590040035763e-06, + "loss": 0.8574, + "step": 9970 + }, + { + "epoch": 0.5487918982882932, + "grad_norm": 0.7398175001144409, + "learning_rate": 8.278262757866683e-06, + "loss": 0.744, + "step": 9971 + }, + { + "epoch": 0.5488469370906489, + "grad_norm": 0.7099171876907349, + "learning_rate": 8.277935451059058e-06, + "loss": 0.7108, + "step": 9972 + }, + { + "epoch": 0.5489019758930046, + "grad_norm": 0.6720188856124878, + "learning_rate": 8.277608119615345e-06, + "loss": 0.8565, + "step": 9973 + }, + { + "epoch": 0.5489570146953602, + "grad_norm": 0.7870737910270691, + "learning_rate": 8.27728076353801e-06, + "loss": 0.7429, + "step": 9974 + }, + { + "epoch": 0.5490120534977159, + "grad_norm": 0.7358133792877197, + "learning_rate": 8.276953382829507e-06, + "loss": 0.7549, + "step": 9975 + }, + { + "epoch": 0.5490670923000716, + "grad_norm": 0.8968467116355896, + "learning_rate": 8.276625977492303e-06, + "loss": 0.6983, + "step": 9976 + }, + { + "epoch": 0.5491221311024272, + "grad_norm": 0.7346875071525574, + "learning_rate": 8.276298547528852e-06, + "loss": 0.8541, + "step": 9977 + }, + { + "epoch": 0.5491771699047828, + "grad_norm": 0.7297229170799255, + "learning_rate": 8.27597109294162e-06, + "loss": 0.8378, + "step": 9978 + }, + { + "epoch": 0.5492322087071385, + "grad_norm": 0.6907635927200317, + "learning_rate": 8.275643613733064e-06, + "loss": 0.7058, + "step": 9979 + }, + { + "epoch": 0.5492872475094942, + "grad_norm": 0.7612239718437195, + "learning_rate": 8.27531610990565e-06, + "loss": 0.6827, + "step": 9980 + }, + { + "epoch": 0.5493422863118499, + "grad_norm": 1.3160386085510254, + "learning_rate": 8.274988581461837e-06, + "loss": 0.7357, + "step": 9981 + }, + { + "epoch": 0.5493973251142055, + "grad_norm": 0.6370541453361511, + "learning_rate": 8.274661028404083e-06, + "loss": 0.7323, + "step": 9982 + }, + { + "epoch": 0.5494523639165612, + "grad_norm": 0.7051724195480347, + "learning_rate": 8.274333450734856e-06, + "loss": 0.7714, + "step": 9983 + }, + { + "epoch": 0.5495074027189168, + "grad_norm": 0.7452969551086426, + "learning_rate": 8.274005848456614e-06, + "loss": 0.7516, + "step": 9984 + }, + { + "epoch": 0.5495624415212725, + "grad_norm": 0.7132626175880432, + "learning_rate": 8.273678221571823e-06, + "loss": 0.6417, + "step": 9985 + }, + { + "epoch": 0.5496174803236281, + "grad_norm": 0.7873446345329285, + "learning_rate": 8.273350570082941e-06, + "loss": 0.8457, + "step": 9986 + }, + { + "epoch": 0.5496725191259838, + "grad_norm": 0.691470205783844, + "learning_rate": 8.273022893992432e-06, + "loss": 0.7871, + "step": 9987 + }, + { + "epoch": 0.5497275579283395, + "grad_norm": 0.6671431064605713, + "learning_rate": 8.27269519330276e-06, + "loss": 0.6919, + "step": 9988 + }, + { + "epoch": 0.5497825967306952, + "grad_norm": 0.8026914596557617, + "learning_rate": 8.272367468016387e-06, + "loss": 0.6885, + "step": 9989 + }, + { + "epoch": 0.5498376355330508, + "grad_norm": 0.9003152251243591, + "learning_rate": 8.272039718135774e-06, + "loss": 0.7671, + "step": 9990 + }, + { + "epoch": 0.5498926743354065, + "grad_norm": 0.6515254378318787, + "learning_rate": 8.271711943663388e-06, + "loss": 0.7589, + "step": 9991 + }, + { + "epoch": 0.5499477131377621, + "grad_norm": 0.6495782136917114, + "learning_rate": 8.27138414460169e-06, + "loss": 0.7277, + "step": 9992 + }, + { + "epoch": 0.5500027519401178, + "grad_norm": 0.7564565539360046, + "learning_rate": 8.271056320953146e-06, + "loss": 0.6977, + "step": 9993 + }, + { + "epoch": 0.5500577907424734, + "grad_norm": 0.8551548719406128, + "learning_rate": 8.270728472720218e-06, + "loss": 0.684, + "step": 9994 + }, + { + "epoch": 0.5501128295448291, + "grad_norm": 0.6614843010902405, + "learning_rate": 8.270400599905369e-06, + "loss": 0.6559, + "step": 9995 + }, + { + "epoch": 0.5501678683471848, + "grad_norm": 0.6920068264007568, + "learning_rate": 8.270072702511065e-06, + "loss": 0.7497, + "step": 9996 + }, + { + "epoch": 0.5502229071495405, + "grad_norm": 0.7426198124885559, + "learning_rate": 8.26974478053977e-06, + "loss": 0.7434, + "step": 9997 + }, + { + "epoch": 0.550277945951896, + "grad_norm": 1.2630934715270996, + "learning_rate": 8.269416833993949e-06, + "loss": 0.7306, + "step": 9998 + }, + { + "epoch": 0.5503329847542517, + "grad_norm": 0.7069457769393921, + "learning_rate": 8.269088862876066e-06, + "loss": 0.6735, + "step": 9999 + }, + { + "epoch": 0.5503880235566074, + "grad_norm": 0.8945016264915466, + "learning_rate": 8.268760867188586e-06, + "loss": 0.7575, + "step": 10000 + }, + { + "epoch": 0.5504430623589631, + "grad_norm": 0.7708195447921753, + "learning_rate": 8.268432846933974e-06, + "loss": 0.6988, + "step": 10001 + }, + { + "epoch": 0.5504981011613187, + "grad_norm": 0.7884799838066101, + "learning_rate": 8.268104802114696e-06, + "loss": 0.8085, + "step": 10002 + }, + { + "epoch": 0.5505531399636744, + "grad_norm": 0.7801569104194641, + "learning_rate": 8.267776732733217e-06, + "loss": 0.886, + "step": 10003 + }, + { + "epoch": 0.5506081787660301, + "grad_norm": 0.714645504951477, + "learning_rate": 8.267448638792004e-06, + "loss": 0.7151, + "step": 10004 + }, + { + "epoch": 0.5506632175683858, + "grad_norm": 0.653136134147644, + "learning_rate": 8.267120520293519e-06, + "loss": 0.6347, + "step": 10005 + }, + { + "epoch": 0.5507182563707413, + "grad_norm": 0.8821585774421692, + "learning_rate": 8.266792377240233e-06, + "loss": 0.6457, + "step": 10006 + }, + { + "epoch": 0.550773295173097, + "grad_norm": 0.7056930661201477, + "learning_rate": 8.266464209634608e-06, + "loss": 0.8709, + "step": 10007 + }, + { + "epoch": 0.5508283339754527, + "grad_norm": 0.6505821347236633, + "learning_rate": 8.266136017479113e-06, + "loss": 0.7674, + "step": 10008 + }, + { + "epoch": 0.5508833727778083, + "grad_norm": 0.7947389483451843, + "learning_rate": 8.265807800776216e-06, + "loss": 0.7882, + "step": 10009 + }, + { + "epoch": 0.550938411580164, + "grad_norm": 0.7466071844100952, + "learning_rate": 8.265479559528379e-06, + "loss": 0.7673, + "step": 10010 + }, + { + "epoch": 0.5509934503825197, + "grad_norm": 0.706430971622467, + "learning_rate": 8.265151293738074e-06, + "loss": 0.7796, + "step": 10011 + }, + { + "epoch": 0.5510484891848754, + "grad_norm": 0.7701015472412109, + "learning_rate": 8.264823003407765e-06, + "loss": 0.7631, + "step": 10012 + }, + { + "epoch": 0.551103527987231, + "grad_norm": 0.6923625469207764, + "learning_rate": 8.264494688539922e-06, + "loss": 0.7659, + "step": 10013 + }, + { + "epoch": 0.5511585667895866, + "grad_norm": 0.6585322618484497, + "learning_rate": 8.264166349137008e-06, + "loss": 0.7248, + "step": 10014 + }, + { + "epoch": 0.5512136055919423, + "grad_norm": 0.698451578617096, + "learning_rate": 8.263837985201493e-06, + "loss": 0.7768, + "step": 10015 + }, + { + "epoch": 0.551268644394298, + "grad_norm": 0.7585058808326721, + "learning_rate": 8.263509596735847e-06, + "loss": 0.8535, + "step": 10016 + }, + { + "epoch": 0.5513236831966536, + "grad_norm": 0.6973930597305298, + "learning_rate": 8.263181183742536e-06, + "loss": 0.8253, + "step": 10017 + }, + { + "epoch": 0.5513787219990093, + "grad_norm": 0.6752467751502991, + "learning_rate": 8.26285274622403e-06, + "loss": 0.7402, + "step": 10018 + }, + { + "epoch": 0.551433760801365, + "grad_norm": 0.717555820941925, + "learning_rate": 8.262524284182794e-06, + "loss": 0.8057, + "step": 10019 + }, + { + "epoch": 0.5514887996037207, + "grad_norm": 0.6975438594818115, + "learning_rate": 8.2621957976213e-06, + "loss": 0.803, + "step": 10020 + }, + { + "epoch": 0.5515438384060762, + "grad_norm": 0.667797327041626, + "learning_rate": 8.261867286542016e-06, + "loss": 0.7387, + "step": 10021 + }, + { + "epoch": 0.5515988772084319, + "grad_norm": 0.7330532670021057, + "learning_rate": 8.261538750947411e-06, + "loss": 0.8143, + "step": 10022 + }, + { + "epoch": 0.5516539160107876, + "grad_norm": 0.7034017443656921, + "learning_rate": 8.261210190839952e-06, + "loss": 0.739, + "step": 10023 + }, + { + "epoch": 0.5517089548131433, + "grad_norm": 0.709284245967865, + "learning_rate": 8.260881606222113e-06, + "loss": 0.8021, + "step": 10024 + }, + { + "epoch": 0.5517639936154989, + "grad_norm": 0.7587909698486328, + "learning_rate": 8.260552997096359e-06, + "loss": 0.8346, + "step": 10025 + }, + { + "epoch": 0.5518190324178546, + "grad_norm": 0.7413986325263977, + "learning_rate": 8.26022436346516e-06, + "loss": 0.6777, + "step": 10026 + }, + { + "epoch": 0.5518740712202103, + "grad_norm": 0.7112768292427063, + "learning_rate": 8.25989570533099e-06, + "loss": 0.7017, + "step": 10027 + }, + { + "epoch": 0.551929110022566, + "grad_norm": 0.7097088098526001, + "learning_rate": 8.259567022696315e-06, + "loss": 0.7315, + "step": 10028 + }, + { + "epoch": 0.5519841488249215, + "grad_norm": 0.6544226408004761, + "learning_rate": 8.259238315563606e-06, + "loss": 0.7729, + "step": 10029 + }, + { + "epoch": 0.5520391876272772, + "grad_norm": 0.6892885565757751, + "learning_rate": 8.258909583935335e-06, + "loss": 0.7919, + "step": 10030 + }, + { + "epoch": 0.5520942264296329, + "grad_norm": 0.697424054145813, + "learning_rate": 8.258580827813972e-06, + "loss": 0.7514, + "step": 10031 + }, + { + "epoch": 0.5521492652319886, + "grad_norm": 0.7021437883377075, + "learning_rate": 8.258252047201989e-06, + "loss": 0.747, + "step": 10032 + }, + { + "epoch": 0.5522043040343442, + "grad_norm": 0.6974816918373108, + "learning_rate": 8.257923242101854e-06, + "loss": 0.7245, + "step": 10033 + }, + { + "epoch": 0.5522593428366999, + "grad_norm": 0.6645311117172241, + "learning_rate": 8.25759441251604e-06, + "loss": 0.649, + "step": 10034 + }, + { + "epoch": 0.5523143816390556, + "grad_norm": 0.7223736643791199, + "learning_rate": 8.25726555844702e-06, + "loss": 0.7792, + "step": 10035 + }, + { + "epoch": 0.5523694204414112, + "grad_norm": 0.7253531813621521, + "learning_rate": 8.256936679897262e-06, + "loss": 0.7636, + "step": 10036 + }, + { + "epoch": 0.5524244592437668, + "grad_norm": 0.6979514956474304, + "learning_rate": 8.256607776869241e-06, + "loss": 0.7929, + "step": 10037 + }, + { + "epoch": 0.5524794980461225, + "grad_norm": 0.7442019581794739, + "learning_rate": 8.25627884936543e-06, + "loss": 0.6984, + "step": 10038 + }, + { + "epoch": 0.5525345368484782, + "grad_norm": 0.7519513964653015, + "learning_rate": 8.255949897388294e-06, + "loss": 0.7228, + "step": 10039 + }, + { + "epoch": 0.5525895756508339, + "grad_norm": 0.7302790880203247, + "learning_rate": 8.255620920940313e-06, + "loss": 0.7555, + "step": 10040 + }, + { + "epoch": 0.5526446144531895, + "grad_norm": 0.6521434187889099, + "learning_rate": 8.255291920023956e-06, + "loss": 0.7825, + "step": 10041 + }, + { + "epoch": 0.5526996532555452, + "grad_norm": 0.8270126581192017, + "learning_rate": 8.254962894641695e-06, + "loss": 0.7939, + "step": 10042 + }, + { + "epoch": 0.5527546920579008, + "grad_norm": 0.7209310531616211, + "learning_rate": 8.254633844796007e-06, + "loss": 0.8286, + "step": 10043 + }, + { + "epoch": 0.5528097308602565, + "grad_norm": 0.6506814360618591, + "learning_rate": 8.25430477048936e-06, + "loss": 0.7209, + "step": 10044 + }, + { + "epoch": 0.5528647696626121, + "grad_norm": 0.6914637684822083, + "learning_rate": 8.25397567172423e-06, + "loss": 0.705, + "step": 10045 + }, + { + "epoch": 0.5529198084649678, + "grad_norm": 0.8369725942611694, + "learning_rate": 8.253646548503091e-06, + "loss": 0.8254, + "step": 10046 + }, + { + "epoch": 0.5529748472673235, + "grad_norm": 0.7809324860572815, + "learning_rate": 8.253317400828414e-06, + "loss": 0.8117, + "step": 10047 + }, + { + "epoch": 0.5530298860696792, + "grad_norm": 0.7184550762176514, + "learning_rate": 8.252988228702676e-06, + "loss": 0.738, + "step": 10048 + }, + { + "epoch": 0.5530849248720348, + "grad_norm": 0.7111478447914124, + "learning_rate": 8.252659032128347e-06, + "loss": 0.7143, + "step": 10049 + }, + { + "epoch": 0.5531399636743904, + "grad_norm": 0.7506794333457947, + "learning_rate": 8.252329811107905e-06, + "loss": 0.7721, + "step": 10050 + }, + { + "epoch": 0.5531950024767461, + "grad_norm": 0.7700625658035278, + "learning_rate": 8.252000565643823e-06, + "loss": 0.7993, + "step": 10051 + }, + { + "epoch": 0.5532500412791017, + "grad_norm": 0.6985816955566406, + "learning_rate": 8.251671295738575e-06, + "loss": 0.7461, + "step": 10052 + }, + { + "epoch": 0.5533050800814574, + "grad_norm": 0.6932175755500793, + "learning_rate": 8.251342001394635e-06, + "loss": 0.6804, + "step": 10053 + }, + { + "epoch": 0.5533601188838131, + "grad_norm": 0.8060765266418457, + "learning_rate": 8.25101268261448e-06, + "loss": 0.7137, + "step": 10054 + }, + { + "epoch": 0.5534151576861688, + "grad_norm": 0.6853482127189636, + "learning_rate": 8.250683339400582e-06, + "loss": 0.7229, + "step": 10055 + }, + { + "epoch": 0.5534701964885244, + "grad_norm": 0.7581862211227417, + "learning_rate": 8.25035397175542e-06, + "loss": 0.8091, + "step": 10056 + }, + { + "epoch": 0.55352523529088, + "grad_norm": 0.7375245094299316, + "learning_rate": 8.250024579681466e-06, + "loss": 0.7234, + "step": 10057 + }, + { + "epoch": 0.5535802740932357, + "grad_norm": 0.7904585599899292, + "learning_rate": 8.249695163181198e-06, + "loss": 0.7295, + "step": 10058 + }, + { + "epoch": 0.5536353128955914, + "grad_norm": 0.6593602895736694, + "learning_rate": 8.249365722257092e-06, + "loss": 0.7492, + "step": 10059 + }, + { + "epoch": 0.553690351697947, + "grad_norm": 0.7226922512054443, + "learning_rate": 8.249036256911622e-06, + "loss": 0.8177, + "step": 10060 + }, + { + "epoch": 0.5537453905003027, + "grad_norm": 0.7268722653388977, + "learning_rate": 8.248706767147265e-06, + "loss": 0.8059, + "step": 10061 + }, + { + "epoch": 0.5538004293026584, + "grad_norm": 0.7797269225120544, + "learning_rate": 8.248377252966499e-06, + "loss": 0.8122, + "step": 10062 + }, + { + "epoch": 0.5538554681050141, + "grad_norm": 0.7199145555496216, + "learning_rate": 8.248047714371797e-06, + "loss": 0.7312, + "step": 10063 + }, + { + "epoch": 0.5539105069073696, + "grad_norm": 0.6950703263282776, + "learning_rate": 8.24771815136564e-06, + "loss": 0.757, + "step": 10064 + }, + { + "epoch": 0.5539655457097253, + "grad_norm": 0.6413441896438599, + "learning_rate": 8.247388563950502e-06, + "loss": 0.6955, + "step": 10065 + }, + { + "epoch": 0.554020584512081, + "grad_norm": 0.7650758624076843, + "learning_rate": 8.24705895212886e-06, + "loss": 0.8355, + "step": 10066 + }, + { + "epoch": 0.5540756233144367, + "grad_norm": 0.7067090272903442, + "learning_rate": 8.246729315903192e-06, + "loss": 0.7409, + "step": 10067 + }, + { + "epoch": 0.5541306621167923, + "grad_norm": 0.7763532996177673, + "learning_rate": 8.246399655275976e-06, + "loss": 0.8097, + "step": 10068 + }, + { + "epoch": 0.554185700919148, + "grad_norm": 0.6865057945251465, + "learning_rate": 8.246069970249689e-06, + "loss": 0.7597, + "step": 10069 + }, + { + "epoch": 0.5542407397215037, + "grad_norm": 0.7643107771873474, + "learning_rate": 8.24574026082681e-06, + "loss": 0.7403, + "step": 10070 + }, + { + "epoch": 0.5542957785238594, + "grad_norm": 0.7354087829589844, + "learning_rate": 8.245410527009815e-06, + "loss": 0.8896, + "step": 10071 + }, + { + "epoch": 0.5543508173262149, + "grad_norm": 0.7971135973930359, + "learning_rate": 8.245080768801183e-06, + "loss": 0.7738, + "step": 10072 + }, + { + "epoch": 0.5544058561285706, + "grad_norm": 1.0506731271743774, + "learning_rate": 8.244750986203394e-06, + "loss": 0.7888, + "step": 10073 + }, + { + "epoch": 0.5544608949309263, + "grad_norm": 0.8305885195732117, + "learning_rate": 8.244421179218925e-06, + "loss": 0.8186, + "step": 10074 + }, + { + "epoch": 0.554515933733282, + "grad_norm": 0.9507874250411987, + "learning_rate": 8.244091347850253e-06, + "loss": 0.7975, + "step": 10075 + }, + { + "epoch": 0.5545709725356376, + "grad_norm": 0.7146797776222229, + "learning_rate": 8.243761492099861e-06, + "loss": 0.6895, + "step": 10076 + }, + { + "epoch": 0.5546260113379933, + "grad_norm": 0.734990656375885, + "learning_rate": 8.243431611970225e-06, + "loss": 0.8087, + "step": 10077 + }, + { + "epoch": 0.554681050140349, + "grad_norm": 0.6807795166969299, + "learning_rate": 8.243101707463825e-06, + "loss": 0.7861, + "step": 10078 + }, + { + "epoch": 0.5547360889427047, + "grad_norm": 0.7412874698638916, + "learning_rate": 8.242771778583142e-06, + "loss": 0.7864, + "step": 10079 + }, + { + "epoch": 0.5547911277450602, + "grad_norm": 0.6655074954032898, + "learning_rate": 8.242441825330652e-06, + "loss": 0.6554, + "step": 10080 + }, + { + "epoch": 0.5548461665474159, + "grad_norm": 0.7549700140953064, + "learning_rate": 8.242111847708838e-06, + "loss": 0.8031, + "step": 10081 + }, + { + "epoch": 0.5549012053497716, + "grad_norm": 0.8907766342163086, + "learning_rate": 8.241781845720181e-06, + "loss": 0.8068, + "step": 10082 + }, + { + "epoch": 0.5549562441521273, + "grad_norm": 0.7347774505615234, + "learning_rate": 8.241451819367157e-06, + "loss": 0.7453, + "step": 10083 + }, + { + "epoch": 0.5550112829544829, + "grad_norm": 0.6856632828712463, + "learning_rate": 8.24112176865225e-06, + "loss": 0.6235, + "step": 10084 + }, + { + "epoch": 0.5550663217568386, + "grad_norm": 0.7134507298469543, + "learning_rate": 8.24079169357794e-06, + "loss": 0.7991, + "step": 10085 + }, + { + "epoch": 0.5551213605591943, + "grad_norm": 0.7814854383468628, + "learning_rate": 8.240461594146704e-06, + "loss": 0.7681, + "step": 10086 + }, + { + "epoch": 0.5551763993615499, + "grad_norm": 0.6893261671066284, + "learning_rate": 8.240131470361028e-06, + "loss": 0.7746, + "step": 10087 + }, + { + "epoch": 0.5552314381639055, + "grad_norm": 0.925003170967102, + "learning_rate": 8.239801322223393e-06, + "loss": 0.7621, + "step": 10088 + }, + { + "epoch": 0.5552864769662612, + "grad_norm": 0.6261017918586731, + "learning_rate": 8.239471149736277e-06, + "loss": 0.7673, + "step": 10089 + }, + { + "epoch": 0.5553415157686169, + "grad_norm": 0.7268226146697998, + "learning_rate": 8.239140952902162e-06, + "loss": 0.7375, + "step": 10090 + }, + { + "epoch": 0.5553965545709726, + "grad_norm": 0.8062194585800171, + "learning_rate": 8.238810731723532e-06, + "loss": 0.8002, + "step": 10091 + }, + { + "epoch": 0.5554515933733282, + "grad_norm": 0.892842173576355, + "learning_rate": 8.238480486202867e-06, + "loss": 0.7959, + "step": 10092 + }, + { + "epoch": 0.5555066321756839, + "grad_norm": 0.7530377507209778, + "learning_rate": 8.23815021634265e-06, + "loss": 0.8137, + "step": 10093 + }, + { + "epoch": 0.5555616709780395, + "grad_norm": 0.6994850635528564, + "learning_rate": 8.237819922145364e-06, + "loss": 0.7966, + "step": 10094 + }, + { + "epoch": 0.5556167097803951, + "grad_norm": 0.8502941727638245, + "learning_rate": 8.237489603613488e-06, + "loss": 0.7668, + "step": 10095 + }, + { + "epoch": 0.5556717485827508, + "grad_norm": 0.6583576798439026, + "learning_rate": 8.237159260749507e-06, + "loss": 0.7379, + "step": 10096 + }, + { + "epoch": 0.5557267873851065, + "grad_norm": 0.9539539217948914, + "learning_rate": 8.236828893555904e-06, + "loss": 0.7563, + "step": 10097 + }, + { + "epoch": 0.5557818261874622, + "grad_norm": 0.7446413040161133, + "learning_rate": 8.236498502035162e-06, + "loss": 0.7329, + "step": 10098 + }, + { + "epoch": 0.5558368649898178, + "grad_norm": 0.8950835466384888, + "learning_rate": 8.236168086189761e-06, + "loss": 0.8144, + "step": 10099 + }, + { + "epoch": 0.5558919037921735, + "grad_norm": 0.7255009412765503, + "learning_rate": 8.235837646022191e-06, + "loss": 0.6946, + "step": 10100 + }, + { + "epoch": 0.5559469425945291, + "grad_norm": 0.6983402967453003, + "learning_rate": 8.235507181534929e-06, + "loss": 0.7371, + "step": 10101 + }, + { + "epoch": 0.5560019813968848, + "grad_norm": 1.043593168258667, + "learning_rate": 8.235176692730463e-06, + "loss": 0.6763, + "step": 10102 + }, + { + "epoch": 0.5560570201992404, + "grad_norm": 0.7452800869941711, + "learning_rate": 8.234846179611272e-06, + "loss": 0.8945, + "step": 10103 + }, + { + "epoch": 0.5561120590015961, + "grad_norm": 0.6367164254188538, + "learning_rate": 8.234515642179845e-06, + "loss": 0.6542, + "step": 10104 + }, + { + "epoch": 0.5561670978039518, + "grad_norm": 0.8377598524093628, + "learning_rate": 8.234185080438664e-06, + "loss": 0.787, + "step": 10105 + }, + { + "epoch": 0.5562221366063075, + "grad_norm": 0.7353680729866028, + "learning_rate": 8.233854494390214e-06, + "loss": 0.6391, + "step": 10106 + }, + { + "epoch": 0.5562771754086631, + "grad_norm": 0.7431599497795105, + "learning_rate": 8.233523884036977e-06, + "loss": 0.8221, + "step": 10107 + }, + { + "epoch": 0.5563322142110187, + "grad_norm": 0.7292743921279907, + "learning_rate": 8.233193249381442e-06, + "loss": 0.7791, + "step": 10108 + }, + { + "epoch": 0.5563872530133744, + "grad_norm": 0.7251895666122437, + "learning_rate": 8.232862590426091e-06, + "loss": 0.7993, + "step": 10109 + }, + { + "epoch": 0.5564422918157301, + "grad_norm": 0.7373167276382446, + "learning_rate": 8.23253190717341e-06, + "loss": 0.861, + "step": 10110 + }, + { + "epoch": 0.5564973306180857, + "grad_norm": 0.6689401268959045, + "learning_rate": 8.232201199625887e-06, + "loss": 0.7002, + "step": 10111 + }, + { + "epoch": 0.5565523694204414, + "grad_norm": 0.7405139207839966, + "learning_rate": 8.231870467786003e-06, + "loss": 0.8041, + "step": 10112 + }, + { + "epoch": 0.5566074082227971, + "grad_norm": 0.7561736702919006, + "learning_rate": 8.231539711656246e-06, + "loss": 0.7687, + "step": 10113 + }, + { + "epoch": 0.5566624470251528, + "grad_norm": 0.6857489943504333, + "learning_rate": 8.231208931239103e-06, + "loss": 0.7175, + "step": 10114 + }, + { + "epoch": 0.5567174858275084, + "grad_norm": 0.7410408854484558, + "learning_rate": 8.230878126537057e-06, + "loss": 0.7337, + "step": 10115 + }, + { + "epoch": 0.556772524629864, + "grad_norm": 0.7533249258995056, + "learning_rate": 8.230547297552595e-06, + "loss": 0.7226, + "step": 10116 + }, + { + "epoch": 0.5568275634322197, + "grad_norm": 0.6227561235427856, + "learning_rate": 8.230216444288207e-06, + "loss": 0.711, + "step": 10117 + }, + { + "epoch": 0.5568826022345754, + "grad_norm": 0.6790871024131775, + "learning_rate": 8.229885566746373e-06, + "loss": 0.728, + "step": 10118 + }, + { + "epoch": 0.556937641036931, + "grad_norm": 1.0007857084274292, + "learning_rate": 8.229554664929587e-06, + "loss": 0.9193, + "step": 10119 + }, + { + "epoch": 0.5569926798392867, + "grad_norm": 0.7167220711708069, + "learning_rate": 8.229223738840331e-06, + "loss": 0.8288, + "step": 10120 + }, + { + "epoch": 0.5570477186416424, + "grad_norm": 0.8037107586860657, + "learning_rate": 8.228892788481095e-06, + "loss": 0.8462, + "step": 10121 + }, + { + "epoch": 0.5571027574439981, + "grad_norm": 0.7355597615242004, + "learning_rate": 8.228561813854363e-06, + "loss": 0.7998, + "step": 10122 + }, + { + "epoch": 0.5571577962463536, + "grad_norm": 0.7384124994277954, + "learning_rate": 8.228230814962625e-06, + "loss": 0.7861, + "step": 10123 + }, + { + "epoch": 0.5572128350487093, + "grad_norm": 0.8170364499092102, + "learning_rate": 8.227899791808371e-06, + "loss": 0.8005, + "step": 10124 + }, + { + "epoch": 0.557267873851065, + "grad_norm": 0.678702175617218, + "learning_rate": 8.227568744394084e-06, + "loss": 0.7408, + "step": 10125 + }, + { + "epoch": 0.5573229126534207, + "grad_norm": 0.7212443947792053, + "learning_rate": 8.227237672722255e-06, + "loss": 0.7127, + "step": 10126 + }, + { + "epoch": 0.5573779514557763, + "grad_norm": 0.7035290002822876, + "learning_rate": 8.22690657679537e-06, + "loss": 0.8263, + "step": 10127 + }, + { + "epoch": 0.557432990258132, + "grad_norm": 0.6535285115242004, + "learning_rate": 8.226575456615921e-06, + "loss": 0.6979, + "step": 10128 + }, + { + "epoch": 0.5574880290604877, + "grad_norm": 0.7353794574737549, + "learning_rate": 8.226244312186396e-06, + "loss": 0.6838, + "step": 10129 + }, + { + "epoch": 0.5575430678628434, + "grad_norm": 0.5839618444442749, + "learning_rate": 8.225913143509278e-06, + "loss": 0.5925, + "step": 10130 + }, + { + "epoch": 0.5575981066651989, + "grad_norm": 0.6922228336334229, + "learning_rate": 8.225581950587063e-06, + "loss": 0.6808, + "step": 10131 + }, + { + "epoch": 0.5576531454675546, + "grad_norm": 0.753989040851593, + "learning_rate": 8.225250733422236e-06, + "loss": 0.6567, + "step": 10132 + }, + { + "epoch": 0.5577081842699103, + "grad_norm": 0.7327600717544556, + "learning_rate": 8.22491949201729e-06, + "loss": 0.8311, + "step": 10133 + }, + { + "epoch": 0.557763223072266, + "grad_norm": 0.6435133218765259, + "learning_rate": 8.224588226374712e-06, + "loss": 0.6684, + "step": 10134 + }, + { + "epoch": 0.5578182618746216, + "grad_norm": 0.6402057409286499, + "learning_rate": 8.22425693649699e-06, + "loss": 0.7569, + "step": 10135 + }, + { + "epoch": 0.5578733006769773, + "grad_norm": 0.7454472780227661, + "learning_rate": 8.223925622386617e-06, + "loss": 0.7908, + "step": 10136 + }, + { + "epoch": 0.557928339479333, + "grad_norm": 0.7373154759407043, + "learning_rate": 8.223594284046084e-06, + "loss": 0.8232, + "step": 10137 + }, + { + "epoch": 0.5579833782816885, + "grad_norm": 0.6478374004364014, + "learning_rate": 8.223262921477878e-06, + "loss": 0.7353, + "step": 10138 + }, + { + "epoch": 0.5580384170840442, + "grad_norm": 0.715212881565094, + "learning_rate": 8.222931534684488e-06, + "loss": 0.729, + "step": 10139 + }, + { + "epoch": 0.5580934558863999, + "grad_norm": 0.9226915240287781, + "learning_rate": 8.22260012366841e-06, + "loss": 0.7846, + "step": 10140 + }, + { + "epoch": 0.5581484946887556, + "grad_norm": 0.6481993198394775, + "learning_rate": 8.222268688432132e-06, + "loss": 0.6955, + "step": 10141 + }, + { + "epoch": 0.5582035334911112, + "grad_norm": 0.7240349054336548, + "learning_rate": 8.221937228978145e-06, + "loss": 0.7956, + "step": 10142 + }, + { + "epoch": 0.5582585722934669, + "grad_norm": 0.7089122533798218, + "learning_rate": 8.221605745308939e-06, + "loss": 0.7481, + "step": 10143 + }, + { + "epoch": 0.5583136110958226, + "grad_norm": 0.7292537093162537, + "learning_rate": 8.221274237427009e-06, + "loss": 0.7797, + "step": 10144 + }, + { + "epoch": 0.5583686498981782, + "grad_norm": 0.7104652523994446, + "learning_rate": 8.220942705334841e-06, + "loss": 0.7966, + "step": 10145 + }, + { + "epoch": 0.5584236887005338, + "grad_norm": 0.7656546831130981, + "learning_rate": 8.220611149034931e-06, + "loss": 0.7541, + "step": 10146 + }, + { + "epoch": 0.5584787275028895, + "grad_norm": 0.7618892788887024, + "learning_rate": 8.22027956852977e-06, + "loss": 0.6994, + "step": 10147 + }, + { + "epoch": 0.5585337663052452, + "grad_norm": 0.6445756554603577, + "learning_rate": 8.219947963821851e-06, + "loss": 0.7303, + "step": 10148 + }, + { + "epoch": 0.5585888051076009, + "grad_norm": 0.6529820561408997, + "learning_rate": 8.219616334913663e-06, + "loss": 0.7008, + "step": 10149 + }, + { + "epoch": 0.5586438439099565, + "grad_norm": 0.6890642046928406, + "learning_rate": 8.219284681807703e-06, + "loss": 0.8124, + "step": 10150 + }, + { + "epoch": 0.5586988827123122, + "grad_norm": 0.7273370027542114, + "learning_rate": 8.218953004506458e-06, + "loss": 0.7507, + "step": 10151 + }, + { + "epoch": 0.5587539215146679, + "grad_norm": 0.7239277362823486, + "learning_rate": 8.218621303012425e-06, + "loss": 0.7929, + "step": 10152 + }, + { + "epoch": 0.5588089603170235, + "grad_norm": 0.660275399684906, + "learning_rate": 8.218289577328096e-06, + "loss": 0.7418, + "step": 10153 + }, + { + "epoch": 0.5588639991193791, + "grad_norm": 0.7406648993492126, + "learning_rate": 8.217957827455965e-06, + "loss": 0.8072, + "step": 10154 + }, + { + "epoch": 0.5589190379217348, + "grad_norm": 0.7051703333854675, + "learning_rate": 8.217626053398522e-06, + "loss": 0.6562, + "step": 10155 + }, + { + "epoch": 0.5589740767240905, + "grad_norm": 0.93423992395401, + "learning_rate": 8.217294255158266e-06, + "loss": 0.738, + "step": 10156 + }, + { + "epoch": 0.5590291155264462, + "grad_norm": 0.8362720608711243, + "learning_rate": 8.216962432737685e-06, + "loss": 0.8585, + "step": 10157 + }, + { + "epoch": 0.5590841543288018, + "grad_norm": 0.9195587038993835, + "learning_rate": 8.216630586139277e-06, + "loss": 0.8778, + "step": 10158 + }, + { + "epoch": 0.5591391931311575, + "grad_norm": 0.7181550860404968, + "learning_rate": 8.216298715365534e-06, + "loss": 0.702, + "step": 10159 + }, + { + "epoch": 0.5591942319335131, + "grad_norm": 0.6900259852409363, + "learning_rate": 8.21596682041895e-06, + "loss": 0.7652, + "step": 10160 + }, + { + "epoch": 0.5592492707358688, + "grad_norm": 0.7523833513259888, + "learning_rate": 8.215634901302022e-06, + "loss": 0.7881, + "step": 10161 + }, + { + "epoch": 0.5593043095382244, + "grad_norm": 0.6659645438194275, + "learning_rate": 8.215302958017241e-06, + "loss": 0.694, + "step": 10162 + }, + { + "epoch": 0.5593593483405801, + "grad_norm": 0.8898606300354004, + "learning_rate": 8.214970990567105e-06, + "loss": 0.8534, + "step": 10163 + }, + { + "epoch": 0.5594143871429358, + "grad_norm": 0.6759241819381714, + "learning_rate": 8.214638998954108e-06, + "loss": 0.8241, + "step": 10164 + }, + { + "epoch": 0.5594694259452915, + "grad_norm": 0.7136911749839783, + "learning_rate": 8.214306983180744e-06, + "loss": 0.7846, + "step": 10165 + }, + { + "epoch": 0.559524464747647, + "grad_norm": 0.6781616806983948, + "learning_rate": 8.213974943249509e-06, + "loss": 0.7116, + "step": 10166 + }, + { + "epoch": 0.5595795035500027, + "grad_norm": 0.7134156227111816, + "learning_rate": 8.213642879162898e-06, + "loss": 0.7537, + "step": 10167 + }, + { + "epoch": 0.5596345423523584, + "grad_norm": 1.306710124015808, + "learning_rate": 8.213310790923408e-06, + "loss": 0.8506, + "step": 10168 + }, + { + "epoch": 0.5596895811547141, + "grad_norm": 0.725304901599884, + "learning_rate": 8.212978678533534e-06, + "loss": 0.8115, + "step": 10169 + }, + { + "epoch": 0.5597446199570697, + "grad_norm": 0.7833520174026489, + "learning_rate": 8.212646541995772e-06, + "loss": 0.919, + "step": 10170 + }, + { + "epoch": 0.5597996587594254, + "grad_norm": 0.6938104033470154, + "learning_rate": 8.212314381312621e-06, + "loss": 0.7303, + "step": 10171 + }, + { + "epoch": 0.5598546975617811, + "grad_norm": 0.6860232949256897, + "learning_rate": 8.211982196486573e-06, + "loss": 0.7709, + "step": 10172 + }, + { + "epoch": 0.5599097363641368, + "grad_norm": 0.6611567139625549, + "learning_rate": 8.211649987520126e-06, + "loss": 0.7711, + "step": 10173 + }, + { + "epoch": 0.5599647751664923, + "grad_norm": 0.8603463172912598, + "learning_rate": 8.211317754415778e-06, + "loss": 0.8527, + "step": 10174 + }, + { + "epoch": 0.560019813968848, + "grad_norm": 0.7350558638572693, + "learning_rate": 8.210985497176025e-06, + "loss": 0.8148, + "step": 10175 + }, + { + "epoch": 0.5600748527712037, + "grad_norm": 0.6881470084190369, + "learning_rate": 8.210653215803365e-06, + "loss": 0.7526, + "step": 10176 + }, + { + "epoch": 0.5601298915735594, + "grad_norm": 0.6879626512527466, + "learning_rate": 8.210320910300296e-06, + "loss": 0.7649, + "step": 10177 + }, + { + "epoch": 0.560184930375915, + "grad_norm": 0.6843587160110474, + "learning_rate": 8.209988580669312e-06, + "loss": 0.8131, + "step": 10178 + }, + { + "epoch": 0.5602399691782707, + "grad_norm": 0.6684302687644958, + "learning_rate": 8.209656226912915e-06, + "loss": 0.7256, + "step": 10179 + }, + { + "epoch": 0.5602950079806264, + "grad_norm": 0.7973861694335938, + "learning_rate": 8.209323849033601e-06, + "loss": 0.7924, + "step": 10180 + }, + { + "epoch": 0.560350046782982, + "grad_norm": 0.6850616931915283, + "learning_rate": 8.208991447033867e-06, + "loss": 0.7423, + "step": 10181 + }, + { + "epoch": 0.5604050855853376, + "grad_norm": 0.8284440636634827, + "learning_rate": 8.208659020916213e-06, + "loss": 0.7637, + "step": 10182 + }, + { + "epoch": 0.5604601243876933, + "grad_norm": 0.7671821713447571, + "learning_rate": 8.208326570683136e-06, + "loss": 0.7688, + "step": 10183 + }, + { + "epoch": 0.560515163190049, + "grad_norm": 0.8359144330024719, + "learning_rate": 8.207994096337135e-06, + "loss": 0.8179, + "step": 10184 + }, + { + "epoch": 0.5605702019924046, + "grad_norm": 0.6389699578285217, + "learning_rate": 8.207661597880709e-06, + "loss": 0.6987, + "step": 10185 + }, + { + "epoch": 0.5606252407947603, + "grad_norm": 0.6472755074501038, + "learning_rate": 8.20732907531636e-06, + "loss": 0.6984, + "step": 10186 + }, + { + "epoch": 0.560680279597116, + "grad_norm": 0.8231903314590454, + "learning_rate": 8.20699652864658e-06, + "loss": 0.8212, + "step": 10187 + }, + { + "epoch": 0.5607353183994717, + "grad_norm": 0.7550386190414429, + "learning_rate": 8.206663957873876e-06, + "loss": 0.7446, + "step": 10188 + }, + { + "epoch": 0.5607903572018272, + "grad_norm": 0.6704659461975098, + "learning_rate": 8.206331363000743e-06, + "loss": 0.7035, + "step": 10189 + }, + { + "epoch": 0.5608453960041829, + "grad_norm": 0.7258654236793518, + "learning_rate": 8.20599874402968e-06, + "loss": 0.7032, + "step": 10190 + }, + { + "epoch": 0.5609004348065386, + "grad_norm": 0.674609363079071, + "learning_rate": 8.20566610096319e-06, + "loss": 0.7545, + "step": 10191 + }, + { + "epoch": 0.5609554736088943, + "grad_norm": 0.6978347301483154, + "learning_rate": 8.205333433803773e-06, + "loss": 0.8198, + "step": 10192 + }, + { + "epoch": 0.5610105124112499, + "grad_norm": 0.6252121329307556, + "learning_rate": 8.205000742553925e-06, + "loss": 0.6639, + "step": 10193 + }, + { + "epoch": 0.5610655512136056, + "grad_norm": 0.7288224101066589, + "learning_rate": 8.204668027216152e-06, + "loss": 0.8035, + "step": 10194 + }, + { + "epoch": 0.5611205900159613, + "grad_norm": 0.6591556072235107, + "learning_rate": 8.20433528779295e-06, + "loss": 0.7552, + "step": 10195 + }, + { + "epoch": 0.561175628818317, + "grad_norm": 0.769827127456665, + "learning_rate": 8.204002524286823e-06, + "loss": 0.7279, + "step": 10196 + }, + { + "epoch": 0.5612306676206725, + "grad_norm": 0.74398273229599, + "learning_rate": 8.203669736700271e-06, + "loss": 0.7638, + "step": 10197 + }, + { + "epoch": 0.5612857064230282, + "grad_norm": 0.9343454241752625, + "learning_rate": 8.203336925035795e-06, + "loss": 0.7513, + "step": 10198 + }, + { + "epoch": 0.5613407452253839, + "grad_norm": 0.6667190194129944, + "learning_rate": 8.203004089295894e-06, + "loss": 0.77, + "step": 10199 + }, + { + "epoch": 0.5613957840277396, + "grad_norm": 0.7684557437896729, + "learning_rate": 8.202671229483073e-06, + "loss": 0.803, + "step": 10200 + }, + { + "epoch": 0.5614508228300952, + "grad_norm": 0.6551374793052673, + "learning_rate": 8.202338345599832e-06, + "loss": 0.6914, + "step": 10201 + }, + { + "epoch": 0.5615058616324509, + "grad_norm": 0.717464029788971, + "learning_rate": 8.202005437648674e-06, + "loss": 0.6797, + "step": 10202 + }, + { + "epoch": 0.5615609004348066, + "grad_norm": 0.7053301334381104, + "learning_rate": 8.2016725056321e-06, + "loss": 0.7857, + "step": 10203 + }, + { + "epoch": 0.5616159392371622, + "grad_norm": 0.8392077684402466, + "learning_rate": 8.20133954955261e-06, + "loss": 0.8321, + "step": 10204 + }, + { + "epoch": 0.5616709780395178, + "grad_norm": 0.6630520820617676, + "learning_rate": 8.201006569412711e-06, + "loss": 0.7093, + "step": 10205 + }, + { + "epoch": 0.5617260168418735, + "grad_norm": 0.6835867762565613, + "learning_rate": 8.200673565214905e-06, + "loss": 0.6623, + "step": 10206 + }, + { + "epoch": 0.5617810556442292, + "grad_norm": 0.7635336518287659, + "learning_rate": 8.200340536961691e-06, + "loss": 0.8378, + "step": 10207 + }, + { + "epoch": 0.5618360944465849, + "grad_norm": 0.6500052213668823, + "learning_rate": 8.200007484655575e-06, + "loss": 0.6836, + "step": 10208 + }, + { + "epoch": 0.5618911332489405, + "grad_norm": 0.6549860835075378, + "learning_rate": 8.199674408299058e-06, + "loss": 0.6868, + "step": 10209 + }, + { + "epoch": 0.5619461720512962, + "grad_norm": 0.7995957732200623, + "learning_rate": 8.199341307894647e-06, + "loss": 0.7719, + "step": 10210 + }, + { + "epoch": 0.5620012108536518, + "grad_norm": 0.6869412064552307, + "learning_rate": 8.199008183444843e-06, + "loss": 0.7921, + "step": 10211 + }, + { + "epoch": 0.5620562496560075, + "grad_norm": 0.9125131964683533, + "learning_rate": 8.198675034952149e-06, + "loss": 0.9015, + "step": 10212 + }, + { + "epoch": 0.5621112884583631, + "grad_norm": 0.6851146221160889, + "learning_rate": 8.198341862419068e-06, + "loss": 0.7773, + "step": 10213 + }, + { + "epoch": 0.5621663272607188, + "grad_norm": 0.6808778047561646, + "learning_rate": 8.198008665848108e-06, + "loss": 0.7375, + "step": 10214 + }, + { + "epoch": 0.5622213660630745, + "grad_norm": 0.6419697999954224, + "learning_rate": 8.19767544524177e-06, + "loss": 0.7496, + "step": 10215 + }, + { + "epoch": 0.5622764048654302, + "grad_norm": 0.7325716614723206, + "learning_rate": 8.197342200602559e-06, + "loss": 0.7424, + "step": 10216 + }, + { + "epoch": 0.5623314436677858, + "grad_norm": 0.6165832281112671, + "learning_rate": 8.19700893193298e-06, + "loss": 0.6364, + "step": 10217 + }, + { + "epoch": 0.5623864824701414, + "grad_norm": 0.7632125020027161, + "learning_rate": 8.196675639235539e-06, + "loss": 0.7175, + "step": 10218 + }, + { + "epoch": 0.5624415212724971, + "grad_norm": 0.6789713501930237, + "learning_rate": 8.196342322512738e-06, + "loss": 0.7122, + "step": 10219 + }, + { + "epoch": 0.5624965600748528, + "grad_norm": 0.7341050505638123, + "learning_rate": 8.196008981767084e-06, + "loss": 0.7598, + "step": 10220 + }, + { + "epoch": 0.5625515988772084, + "grad_norm": 0.7318429350852966, + "learning_rate": 8.195675617001083e-06, + "loss": 0.7723, + "step": 10221 + }, + { + "epoch": 0.5626066376795641, + "grad_norm": 0.6940313577651978, + "learning_rate": 8.195342228217238e-06, + "loss": 0.7885, + "step": 10222 + }, + { + "epoch": 0.5626616764819198, + "grad_norm": 0.8792300820350647, + "learning_rate": 8.195008815418058e-06, + "loss": 0.7657, + "step": 10223 + }, + { + "epoch": 0.5627167152842754, + "grad_norm": 0.7234559655189514, + "learning_rate": 8.194675378606044e-06, + "loss": 0.7988, + "step": 10224 + }, + { + "epoch": 0.562771754086631, + "grad_norm": 0.6698254942893982, + "learning_rate": 8.194341917783708e-06, + "loss": 0.6378, + "step": 10225 + }, + { + "epoch": 0.5628267928889867, + "grad_norm": 0.6546483635902405, + "learning_rate": 8.194008432953552e-06, + "loss": 0.7113, + "step": 10226 + }, + { + "epoch": 0.5628818316913424, + "grad_norm": 0.6532583832740784, + "learning_rate": 8.193674924118085e-06, + "loss": 0.6782, + "step": 10227 + }, + { + "epoch": 0.562936870493698, + "grad_norm": 0.770578920841217, + "learning_rate": 8.19334139127981e-06, + "loss": 0.8519, + "step": 10228 + }, + { + "epoch": 0.5629919092960537, + "grad_norm": 0.7255409359931946, + "learning_rate": 8.193007834441235e-06, + "loss": 0.6555, + "step": 10229 + }, + { + "epoch": 0.5630469480984094, + "grad_norm": 0.6659883856773376, + "learning_rate": 8.19267425360487e-06, + "loss": 0.7836, + "step": 10230 + }, + { + "epoch": 0.5631019869007651, + "grad_norm": 0.6596028208732605, + "learning_rate": 8.192340648773221e-06, + "loss": 0.6199, + "step": 10231 + }, + { + "epoch": 0.5631570257031207, + "grad_norm": 0.8226001858711243, + "learning_rate": 8.192007019948793e-06, + "loss": 0.8101, + "step": 10232 + }, + { + "epoch": 0.5632120645054763, + "grad_norm": 0.7465038895606995, + "learning_rate": 8.191673367134094e-06, + "loss": 0.8437, + "step": 10233 + }, + { + "epoch": 0.563267103307832, + "grad_norm": 1.0008004903793335, + "learning_rate": 8.191339690331632e-06, + "loss": 0.8626, + "step": 10234 + }, + { + "epoch": 0.5633221421101877, + "grad_norm": 0.7538222670555115, + "learning_rate": 8.191005989543917e-06, + "loss": 0.7222, + "step": 10235 + }, + { + "epoch": 0.5633771809125433, + "grad_norm": 0.6252872943878174, + "learning_rate": 8.190672264773454e-06, + "loss": 0.8038, + "step": 10236 + }, + { + "epoch": 0.563432219714899, + "grad_norm": 0.7083514928817749, + "learning_rate": 8.190338516022752e-06, + "loss": 0.7863, + "step": 10237 + }, + { + "epoch": 0.5634872585172547, + "grad_norm": 0.6887454390525818, + "learning_rate": 8.19000474329432e-06, + "loss": 0.7034, + "step": 10238 + }, + { + "epoch": 0.5635422973196104, + "grad_norm": 0.7487072348594666, + "learning_rate": 8.189670946590666e-06, + "loss": 0.8618, + "step": 10239 + }, + { + "epoch": 0.5635973361219659, + "grad_norm": 0.6999371647834778, + "learning_rate": 8.189337125914298e-06, + "loss": 0.7613, + "step": 10240 + }, + { + "epoch": 0.5636523749243216, + "grad_norm": 0.8265380263328552, + "learning_rate": 8.18900328126773e-06, + "loss": 0.7576, + "step": 10241 + }, + { + "epoch": 0.5637074137266773, + "grad_norm": 0.6688962578773499, + "learning_rate": 8.188669412653463e-06, + "loss": 0.712, + "step": 10242 + }, + { + "epoch": 0.563762452529033, + "grad_norm": 0.6343923211097717, + "learning_rate": 8.188335520074011e-06, + "loss": 0.7239, + "step": 10243 + }, + { + "epoch": 0.5638174913313886, + "grad_norm": 0.7122388482093811, + "learning_rate": 8.188001603531883e-06, + "loss": 0.7892, + "step": 10244 + }, + { + "epoch": 0.5638725301337443, + "grad_norm": 0.6646286845207214, + "learning_rate": 8.187667663029587e-06, + "loss": 0.7805, + "step": 10245 + }, + { + "epoch": 0.5639275689361, + "grad_norm": 0.742938220500946, + "learning_rate": 8.187333698569638e-06, + "loss": 0.8444, + "step": 10246 + }, + { + "epoch": 0.5639826077384557, + "grad_norm": 0.7260885238647461, + "learning_rate": 8.18699971015454e-06, + "loss": 0.8621, + "step": 10247 + }, + { + "epoch": 0.5640376465408112, + "grad_norm": 0.7920067310333252, + "learning_rate": 8.186665697786804e-06, + "loss": 0.7391, + "step": 10248 + }, + { + "epoch": 0.5640926853431669, + "grad_norm": 0.7472825646400452, + "learning_rate": 8.186331661468943e-06, + "loss": 0.7249, + "step": 10249 + }, + { + "epoch": 0.5641477241455226, + "grad_norm": 0.692643940448761, + "learning_rate": 8.185997601203465e-06, + "loss": 0.7884, + "step": 10250 + }, + { + "epoch": 0.5642027629478783, + "grad_norm": 0.715455174446106, + "learning_rate": 8.185663516992884e-06, + "loss": 0.7369, + "step": 10251 + }, + { + "epoch": 0.5642578017502339, + "grad_norm": 0.7566105723381042, + "learning_rate": 8.185329408839705e-06, + "loss": 0.7378, + "step": 10252 + }, + { + "epoch": 0.5643128405525896, + "grad_norm": 0.8163520693778992, + "learning_rate": 8.184995276746445e-06, + "loss": 0.7326, + "step": 10253 + }, + { + "epoch": 0.5643678793549453, + "grad_norm": 0.6280468106269836, + "learning_rate": 8.184661120715615e-06, + "loss": 0.6858, + "step": 10254 + }, + { + "epoch": 0.564422918157301, + "grad_norm": 0.7246795892715454, + "learning_rate": 8.184326940749723e-06, + "loss": 0.8111, + "step": 10255 + }, + { + "epoch": 0.5644779569596565, + "grad_norm": 0.7429527640342712, + "learning_rate": 8.18399273685128e-06, + "loss": 0.7642, + "step": 10256 + }, + { + "epoch": 0.5645329957620122, + "grad_norm": 0.7308861017227173, + "learning_rate": 8.183658509022802e-06, + "loss": 0.7844, + "step": 10257 + }, + { + "epoch": 0.5645880345643679, + "grad_norm": 0.7549033164978027, + "learning_rate": 8.1833242572668e-06, + "loss": 0.8585, + "step": 10258 + }, + { + "epoch": 0.5646430733667236, + "grad_norm": 0.6779888868331909, + "learning_rate": 8.182989981585782e-06, + "loss": 0.6808, + "step": 10259 + }, + { + "epoch": 0.5646981121690792, + "grad_norm": 0.887113630771637, + "learning_rate": 8.182655681982266e-06, + "loss": 0.8229, + "step": 10260 + }, + { + "epoch": 0.5647531509714349, + "grad_norm": 0.6405711770057678, + "learning_rate": 8.18232135845876e-06, + "loss": 0.6901, + "step": 10261 + }, + { + "epoch": 0.5648081897737905, + "grad_norm": 0.7302486300468445, + "learning_rate": 8.18198701101778e-06, + "loss": 0.6853, + "step": 10262 + }, + { + "epoch": 0.5648632285761462, + "grad_norm": 0.6374662518501282, + "learning_rate": 8.181652639661837e-06, + "loss": 0.7177, + "step": 10263 + }, + { + "epoch": 0.5649182673785018, + "grad_norm": 0.9267570972442627, + "learning_rate": 8.181318244393444e-06, + "loss": 0.7926, + "step": 10264 + }, + { + "epoch": 0.5649733061808575, + "grad_norm": 0.8196623921394348, + "learning_rate": 8.180983825215114e-06, + "loss": 0.7127, + "step": 10265 + }, + { + "epoch": 0.5650283449832132, + "grad_norm": 0.7004575133323669, + "learning_rate": 8.180649382129361e-06, + "loss": 0.7858, + "step": 10266 + }, + { + "epoch": 0.5650833837855688, + "grad_norm": 0.7667824625968933, + "learning_rate": 8.180314915138701e-06, + "loss": 0.7742, + "step": 10267 + }, + { + "epoch": 0.5651384225879245, + "grad_norm": 0.7372623682022095, + "learning_rate": 8.179980424245644e-06, + "loss": 0.7949, + "step": 10268 + }, + { + "epoch": 0.5651934613902801, + "grad_norm": 0.6417940258979797, + "learning_rate": 8.179645909452704e-06, + "loss": 0.6683, + "step": 10269 + }, + { + "epoch": 0.5652485001926358, + "grad_norm": 0.6736140251159668, + "learning_rate": 8.179311370762398e-06, + "loss": 0.6564, + "step": 10270 + }, + { + "epoch": 0.5653035389949914, + "grad_norm": 0.6727200746536255, + "learning_rate": 8.178976808177239e-06, + "loss": 0.8065, + "step": 10271 + }, + { + "epoch": 0.5653585777973471, + "grad_norm": 0.7565415501594543, + "learning_rate": 8.17864222169974e-06, + "loss": 0.9055, + "step": 10272 + }, + { + "epoch": 0.5654136165997028, + "grad_norm": 0.8938627243041992, + "learning_rate": 8.178307611332418e-06, + "loss": 0.8009, + "step": 10273 + }, + { + "epoch": 0.5654686554020585, + "grad_norm": 0.7439131140708923, + "learning_rate": 8.177972977077786e-06, + "loss": 0.7807, + "step": 10274 + }, + { + "epoch": 0.5655236942044141, + "grad_norm": 0.7603998184204102, + "learning_rate": 8.17763831893836e-06, + "loss": 0.818, + "step": 10275 + }, + { + "epoch": 0.5655787330067698, + "grad_norm": 0.7088946104049683, + "learning_rate": 8.177303636916655e-06, + "loss": 0.7741, + "step": 10276 + }, + { + "epoch": 0.5656337718091254, + "grad_norm": 0.6801518201828003, + "learning_rate": 8.176968931015187e-06, + "loss": 0.7633, + "step": 10277 + }, + { + "epoch": 0.5656888106114811, + "grad_norm": 0.6739299297332764, + "learning_rate": 8.17663420123647e-06, + "loss": 0.7772, + "step": 10278 + }, + { + "epoch": 0.5657438494138367, + "grad_norm": 0.7432494759559631, + "learning_rate": 8.176299447583021e-06, + "loss": 0.7368, + "step": 10279 + }, + { + "epoch": 0.5657988882161924, + "grad_norm": 0.7847158908843994, + "learning_rate": 8.175964670057357e-06, + "loss": 0.7824, + "step": 10280 + }, + { + "epoch": 0.5658539270185481, + "grad_norm": 0.8732449412345886, + "learning_rate": 8.17562986866199e-06, + "loss": 0.8035, + "step": 10281 + }, + { + "epoch": 0.5659089658209038, + "grad_norm": 0.7988447546958923, + "learning_rate": 8.17529504339944e-06, + "loss": 0.828, + "step": 10282 + }, + { + "epoch": 0.5659640046232594, + "grad_norm": 0.7063263058662415, + "learning_rate": 8.174960194272224e-06, + "loss": 0.7723, + "step": 10283 + }, + { + "epoch": 0.566019043425615, + "grad_norm": 0.7635022401809692, + "learning_rate": 8.174625321282856e-06, + "loss": 0.7156, + "step": 10284 + }, + { + "epoch": 0.5660740822279707, + "grad_norm": 0.6505927443504333, + "learning_rate": 8.174290424433853e-06, + "loss": 0.7409, + "step": 10285 + }, + { + "epoch": 0.5661291210303264, + "grad_norm": 0.6919816136360168, + "learning_rate": 8.173955503727734e-06, + "loss": 0.7829, + "step": 10286 + }, + { + "epoch": 0.566184159832682, + "grad_norm": 0.7024216651916504, + "learning_rate": 8.173620559167015e-06, + "loss": 0.7378, + "step": 10287 + }, + { + "epoch": 0.5662391986350377, + "grad_norm": 0.7134365439414978, + "learning_rate": 8.173285590754212e-06, + "loss": 0.7737, + "step": 10288 + }, + { + "epoch": 0.5662942374373934, + "grad_norm": 0.6867973804473877, + "learning_rate": 8.172950598491845e-06, + "loss": 0.7169, + "step": 10289 + }, + { + "epoch": 0.5663492762397491, + "grad_norm": 0.6900742650032043, + "learning_rate": 8.172615582382432e-06, + "loss": 0.7888, + "step": 10290 + }, + { + "epoch": 0.5664043150421046, + "grad_norm": 0.7026718854904175, + "learning_rate": 8.172280542428488e-06, + "loss": 0.8179, + "step": 10291 + }, + { + "epoch": 0.5664593538444603, + "grad_norm": 0.6940855979919434, + "learning_rate": 8.171945478632533e-06, + "loss": 0.7686, + "step": 10292 + }, + { + "epoch": 0.566514392646816, + "grad_norm": 0.6717686653137207, + "learning_rate": 8.171610390997085e-06, + "loss": 0.7865, + "step": 10293 + }, + { + "epoch": 0.5665694314491717, + "grad_norm": 0.6947711110115051, + "learning_rate": 8.171275279524661e-06, + "loss": 0.7811, + "step": 10294 + }, + { + "epoch": 0.5666244702515273, + "grad_norm": 0.6907814741134644, + "learning_rate": 8.170940144217782e-06, + "loss": 0.7095, + "step": 10295 + }, + { + "epoch": 0.566679509053883, + "grad_norm": 0.723952054977417, + "learning_rate": 8.170604985078965e-06, + "loss": 0.7814, + "step": 10296 + }, + { + "epoch": 0.5667345478562387, + "grad_norm": 0.7775490880012512, + "learning_rate": 8.17026980211073e-06, + "loss": 0.797, + "step": 10297 + }, + { + "epoch": 0.5667895866585944, + "grad_norm": 0.7557885646820068, + "learning_rate": 8.169934595315597e-06, + "loss": 0.8423, + "step": 10298 + }, + { + "epoch": 0.5668446254609499, + "grad_norm": 0.7838338017463684, + "learning_rate": 8.169599364696083e-06, + "loss": 0.7114, + "step": 10299 + }, + { + "epoch": 0.5668996642633056, + "grad_norm": 0.6632605791091919, + "learning_rate": 8.169264110254707e-06, + "loss": 0.6723, + "step": 10300 + }, + { + "epoch": 0.5669547030656613, + "grad_norm": 0.735756516456604, + "learning_rate": 8.168928831993991e-06, + "loss": 0.7533, + "step": 10301 + }, + { + "epoch": 0.567009741868017, + "grad_norm": 0.6981016993522644, + "learning_rate": 8.168593529916457e-06, + "loss": 0.7882, + "step": 10302 + }, + { + "epoch": 0.5670647806703726, + "grad_norm": 0.6413942575454712, + "learning_rate": 8.168258204024619e-06, + "loss": 0.6593, + "step": 10303 + }, + { + "epoch": 0.5671198194727283, + "grad_norm": 0.7040891051292419, + "learning_rate": 8.167922854321002e-06, + "loss": 0.7295, + "step": 10304 + }, + { + "epoch": 0.567174858275084, + "grad_norm": 0.7132521867752075, + "learning_rate": 8.167587480808126e-06, + "loss": 0.7128, + "step": 10305 + }, + { + "epoch": 0.5672298970774396, + "grad_norm": 0.756529688835144, + "learning_rate": 8.167252083488508e-06, + "loss": 0.7044, + "step": 10306 + }, + { + "epoch": 0.5672849358797952, + "grad_norm": 0.8456888198852539, + "learning_rate": 8.166916662364672e-06, + "loss": 0.8304, + "step": 10307 + }, + { + "epoch": 0.5673399746821509, + "grad_norm": 0.7758522629737854, + "learning_rate": 8.166581217439138e-06, + "loss": 0.7192, + "step": 10308 + }, + { + "epoch": 0.5673950134845066, + "grad_norm": 0.8110343217849731, + "learning_rate": 8.166245748714428e-06, + "loss": 0.8794, + "step": 10309 + }, + { + "epoch": 0.5674500522868622, + "grad_norm": 0.6803586483001709, + "learning_rate": 8.165910256193062e-06, + "loss": 0.7402, + "step": 10310 + }, + { + "epoch": 0.5675050910892179, + "grad_norm": 0.7294176816940308, + "learning_rate": 8.165574739877563e-06, + "loss": 0.7325, + "step": 10311 + }, + { + "epoch": 0.5675601298915736, + "grad_norm": 0.835488498210907, + "learning_rate": 8.165239199770448e-06, + "loss": 0.8317, + "step": 10312 + }, + { + "epoch": 0.5676151686939293, + "grad_norm": 0.6497608423233032, + "learning_rate": 8.164903635874246e-06, + "loss": 0.6902, + "step": 10313 + }, + { + "epoch": 0.5676702074962848, + "grad_norm": 0.6782082915306091, + "learning_rate": 8.164568048191474e-06, + "loss": 0.7941, + "step": 10314 + }, + { + "epoch": 0.5677252462986405, + "grad_norm": 0.6974388957023621, + "learning_rate": 8.164232436724656e-06, + "loss": 0.7899, + "step": 10315 + }, + { + "epoch": 0.5677802851009962, + "grad_norm": 0.7222558259963989, + "learning_rate": 8.163896801476314e-06, + "loss": 0.8034, + "step": 10316 + }, + { + "epoch": 0.5678353239033519, + "grad_norm": 0.6562586426734924, + "learning_rate": 8.16356114244897e-06, + "loss": 0.7864, + "step": 10317 + }, + { + "epoch": 0.5678903627057075, + "grad_norm": 0.6888270378112793, + "learning_rate": 8.16322545964515e-06, + "loss": 0.8455, + "step": 10318 + }, + { + "epoch": 0.5679454015080632, + "grad_norm": 0.642084002494812, + "learning_rate": 8.162889753067372e-06, + "loss": 0.7478, + "step": 10319 + }, + { + "epoch": 0.5680004403104189, + "grad_norm": 0.7077270746231079, + "learning_rate": 8.16255402271816e-06, + "loss": 0.7281, + "step": 10320 + }, + { + "epoch": 0.5680554791127745, + "grad_norm": 0.7202198505401611, + "learning_rate": 8.16221826860004e-06, + "loss": 0.7893, + "step": 10321 + }, + { + "epoch": 0.5681105179151301, + "grad_norm": 0.8950369954109192, + "learning_rate": 8.161882490715534e-06, + "loss": 0.772, + "step": 10322 + }, + { + "epoch": 0.5681655567174858, + "grad_norm": 0.6986666917800903, + "learning_rate": 8.161546689067166e-06, + "loss": 0.7712, + "step": 10323 + }, + { + "epoch": 0.5682205955198415, + "grad_norm": 0.7095959782600403, + "learning_rate": 8.161210863657458e-06, + "loss": 0.8373, + "step": 10324 + }, + { + "epoch": 0.5682756343221972, + "grad_norm": 0.7510485649108887, + "learning_rate": 8.160875014488936e-06, + "loss": 0.9106, + "step": 10325 + }, + { + "epoch": 0.5683306731245528, + "grad_norm": 0.7558283805847168, + "learning_rate": 8.160539141564123e-06, + "loss": 0.8192, + "step": 10326 + }, + { + "epoch": 0.5683857119269085, + "grad_norm": 0.7523400187492371, + "learning_rate": 8.160203244885545e-06, + "loss": 0.8276, + "step": 10327 + }, + { + "epoch": 0.5684407507292641, + "grad_norm": 0.6911195516586304, + "learning_rate": 8.159867324455724e-06, + "loss": 0.6286, + "step": 10328 + }, + { + "epoch": 0.5684957895316198, + "grad_norm": 0.6456325054168701, + "learning_rate": 8.159531380277188e-06, + "loss": 0.7419, + "step": 10329 + }, + { + "epoch": 0.5685508283339754, + "grad_norm": 0.9318492412567139, + "learning_rate": 8.159195412352458e-06, + "loss": 0.8131, + "step": 10330 + }, + { + "epoch": 0.5686058671363311, + "grad_norm": 0.7012938857078552, + "learning_rate": 8.158859420684062e-06, + "loss": 0.7074, + "step": 10331 + }, + { + "epoch": 0.5686609059386868, + "grad_norm": 0.7152053117752075, + "learning_rate": 8.158523405274523e-06, + "loss": 0.7186, + "step": 10332 + }, + { + "epoch": 0.5687159447410425, + "grad_norm": 0.7074982523918152, + "learning_rate": 8.158187366126368e-06, + "loss": 0.8021, + "step": 10333 + }, + { + "epoch": 0.5687709835433981, + "grad_norm": 0.689536452293396, + "learning_rate": 8.157851303242123e-06, + "loss": 0.7493, + "step": 10334 + }, + { + "epoch": 0.5688260223457537, + "grad_norm": 0.7411753535270691, + "learning_rate": 8.157515216624313e-06, + "loss": 0.8012, + "step": 10335 + }, + { + "epoch": 0.5688810611481094, + "grad_norm": 0.6831420063972473, + "learning_rate": 8.157179106275463e-06, + "loss": 0.7114, + "step": 10336 + }, + { + "epoch": 0.5689360999504651, + "grad_norm": 0.6786901950836182, + "learning_rate": 8.1568429721981e-06, + "loss": 0.7638, + "step": 10337 + }, + { + "epoch": 0.5689911387528207, + "grad_norm": 0.7546970844268799, + "learning_rate": 8.15650681439475e-06, + "loss": 0.7711, + "step": 10338 + }, + { + "epoch": 0.5690461775551764, + "grad_norm": 0.8071785569190979, + "learning_rate": 8.156170632867942e-06, + "loss": 0.8105, + "step": 10339 + }, + { + "epoch": 0.5691012163575321, + "grad_norm": 0.7872087359428406, + "learning_rate": 8.155834427620198e-06, + "loss": 0.7657, + "step": 10340 + }, + { + "epoch": 0.5691562551598878, + "grad_norm": 0.724328875541687, + "learning_rate": 8.155498198654047e-06, + "loss": 0.7978, + "step": 10341 + }, + { + "epoch": 0.5692112939622433, + "grad_norm": 0.8559905886650085, + "learning_rate": 8.155161945972016e-06, + "loss": 0.7766, + "step": 10342 + }, + { + "epoch": 0.569266332764599, + "grad_norm": 0.607418417930603, + "learning_rate": 8.154825669576635e-06, + "loss": 0.642, + "step": 10343 + }, + { + "epoch": 0.5693213715669547, + "grad_norm": 0.7403624653816223, + "learning_rate": 8.154489369470426e-06, + "loss": 0.7301, + "step": 10344 + }, + { + "epoch": 0.5693764103693104, + "grad_norm": 0.7388540506362915, + "learning_rate": 8.154153045655922e-06, + "loss": 0.7895, + "step": 10345 + }, + { + "epoch": 0.569431449171666, + "grad_norm": 0.8327579498291016, + "learning_rate": 8.153816698135646e-06, + "loss": 0.7589, + "step": 10346 + }, + { + "epoch": 0.5694864879740217, + "grad_norm": 0.7738710641860962, + "learning_rate": 8.153480326912128e-06, + "loss": 0.7828, + "step": 10347 + }, + { + "epoch": 0.5695415267763774, + "grad_norm": 0.8280724287033081, + "learning_rate": 8.153143931987896e-06, + "loss": 0.8194, + "step": 10348 + }, + { + "epoch": 0.5695965655787331, + "grad_norm": 0.8290724754333496, + "learning_rate": 8.152807513365478e-06, + "loss": 0.5941, + "step": 10349 + }, + { + "epoch": 0.5696516043810886, + "grad_norm": 0.7514322400093079, + "learning_rate": 8.152471071047403e-06, + "loss": 0.676, + "step": 10350 + }, + { + "epoch": 0.5697066431834443, + "grad_norm": 0.6990258693695068, + "learning_rate": 8.1521346050362e-06, + "loss": 0.804, + "step": 10351 + }, + { + "epoch": 0.5697616819858, + "grad_norm": 0.6781288981437683, + "learning_rate": 8.151798115334396e-06, + "loss": 0.7372, + "step": 10352 + }, + { + "epoch": 0.5698167207881556, + "grad_norm": 0.764301061630249, + "learning_rate": 8.151461601944523e-06, + "loss": 0.8242, + "step": 10353 + }, + { + "epoch": 0.5698717595905113, + "grad_norm": 0.7577376961708069, + "learning_rate": 8.151125064869106e-06, + "loss": 0.7354, + "step": 10354 + }, + { + "epoch": 0.569926798392867, + "grad_norm": 0.767764687538147, + "learning_rate": 8.150788504110678e-06, + "loss": 0.7262, + "step": 10355 + }, + { + "epoch": 0.5699818371952227, + "grad_norm": 0.6634765267372131, + "learning_rate": 8.150451919671767e-06, + "loss": 0.7527, + "step": 10356 + }, + { + "epoch": 0.5700368759975782, + "grad_norm": 0.8803308010101318, + "learning_rate": 8.150115311554901e-06, + "loss": 0.8172, + "step": 10357 + }, + { + "epoch": 0.5700919147999339, + "grad_norm": 0.695791482925415, + "learning_rate": 8.149778679762611e-06, + "loss": 0.7538, + "step": 10358 + }, + { + "epoch": 0.5701469536022896, + "grad_norm": 0.7047555446624756, + "learning_rate": 8.149442024297432e-06, + "loss": 0.7533, + "step": 10359 + }, + { + "epoch": 0.5702019924046453, + "grad_norm": 0.7148274183273315, + "learning_rate": 8.149105345161886e-06, + "loss": 0.6736, + "step": 10360 + }, + { + "epoch": 0.5702570312070009, + "grad_norm": 0.673204243183136, + "learning_rate": 8.148768642358508e-06, + "loss": 0.7713, + "step": 10361 + }, + { + "epoch": 0.5703120700093566, + "grad_norm": 0.6258989572525024, + "learning_rate": 8.148431915889827e-06, + "loss": 0.6578, + "step": 10362 + }, + { + "epoch": 0.5703671088117123, + "grad_norm": 0.8411956429481506, + "learning_rate": 8.148095165758377e-06, + "loss": 0.8387, + "step": 10363 + }, + { + "epoch": 0.570422147614068, + "grad_norm": 0.7802130579948425, + "learning_rate": 8.147758391966685e-06, + "loss": 0.8564, + "step": 10364 + }, + { + "epoch": 0.5704771864164235, + "grad_norm": 0.6665176153182983, + "learning_rate": 8.147421594517282e-06, + "loss": 0.688, + "step": 10365 + }, + { + "epoch": 0.5705322252187792, + "grad_norm": 0.7166683673858643, + "learning_rate": 8.147084773412702e-06, + "loss": 0.6704, + "step": 10366 + }, + { + "epoch": 0.5705872640211349, + "grad_norm": 0.6948957443237305, + "learning_rate": 8.146747928655476e-06, + "loss": 0.7116, + "step": 10367 + }, + { + "epoch": 0.5706423028234906, + "grad_norm": 0.588965892791748, + "learning_rate": 8.146411060248134e-06, + "loss": 0.5644, + "step": 10368 + }, + { + "epoch": 0.5706973416258462, + "grad_norm": 0.8020890355110168, + "learning_rate": 8.14607416819321e-06, + "loss": 0.6978, + "step": 10369 + }, + { + "epoch": 0.5707523804282019, + "grad_norm": 0.9900732040405273, + "learning_rate": 8.145737252493234e-06, + "loss": 0.7295, + "step": 10370 + }, + { + "epoch": 0.5708074192305576, + "grad_norm": 0.7236563563346863, + "learning_rate": 8.145400313150737e-06, + "loss": 0.7555, + "step": 10371 + }, + { + "epoch": 0.5708624580329132, + "grad_norm": 0.6784152984619141, + "learning_rate": 8.145063350168257e-06, + "loss": 0.7283, + "step": 10372 + }, + { + "epoch": 0.5709174968352688, + "grad_norm": 0.6255244612693787, + "learning_rate": 8.14472636354832e-06, + "loss": 0.6722, + "step": 10373 + }, + { + "epoch": 0.5709725356376245, + "grad_norm": 0.8250948786735535, + "learning_rate": 8.14438935329346e-06, + "loss": 0.8406, + "step": 10374 + }, + { + "epoch": 0.5710275744399802, + "grad_norm": 0.7308233380317688, + "learning_rate": 8.144052319406215e-06, + "loss": 0.8084, + "step": 10375 + }, + { + "epoch": 0.5710826132423359, + "grad_norm": 0.7850058674812317, + "learning_rate": 8.143715261889112e-06, + "loss": 0.7892, + "step": 10376 + }, + { + "epoch": 0.5711376520446915, + "grad_norm": 0.81241774559021, + "learning_rate": 8.143378180744687e-06, + "loss": 0.7819, + "step": 10377 + }, + { + "epoch": 0.5711926908470472, + "grad_norm": 0.7174570560455322, + "learning_rate": 8.143041075975473e-06, + "loss": 0.7104, + "step": 10378 + }, + { + "epoch": 0.5712477296494028, + "grad_norm": 0.6954129934310913, + "learning_rate": 8.142703947584004e-06, + "loss": 0.7821, + "step": 10379 + }, + { + "epoch": 0.5713027684517585, + "grad_norm": 0.6895242929458618, + "learning_rate": 8.142366795572813e-06, + "loss": 0.7687, + "step": 10380 + }, + { + "epoch": 0.5713578072541141, + "grad_norm": 0.6543757319450378, + "learning_rate": 8.142029619944434e-06, + "loss": 0.7042, + "step": 10381 + }, + { + "epoch": 0.5714128460564698, + "grad_norm": 0.6712427139282227, + "learning_rate": 8.141692420701404e-06, + "loss": 0.6861, + "step": 10382 + }, + { + "epoch": 0.5714678848588255, + "grad_norm": 1.6716055870056152, + "learning_rate": 8.141355197846253e-06, + "loss": 0.8209, + "step": 10383 + }, + { + "epoch": 0.5715229236611812, + "grad_norm": 0.7509854435920715, + "learning_rate": 8.141017951381516e-06, + "loss": 0.8246, + "step": 10384 + }, + { + "epoch": 0.5715779624635368, + "grad_norm": 0.7161786556243896, + "learning_rate": 8.14068068130973e-06, + "loss": 0.835, + "step": 10385 + }, + { + "epoch": 0.5716330012658924, + "grad_norm": 0.7423714995384216, + "learning_rate": 8.140343387633427e-06, + "loss": 0.8004, + "step": 10386 + }, + { + "epoch": 0.5716880400682481, + "grad_norm": 0.6955768465995789, + "learning_rate": 8.140006070355146e-06, + "loss": 0.7299, + "step": 10387 + }, + { + "epoch": 0.5717430788706038, + "grad_norm": 0.6742254495620728, + "learning_rate": 8.13966872947742e-06, + "loss": 0.6549, + "step": 10388 + }, + { + "epoch": 0.5717981176729594, + "grad_norm": 0.7332299947738647, + "learning_rate": 8.139331365002782e-06, + "loss": 0.7945, + "step": 10389 + }, + { + "epoch": 0.5718531564753151, + "grad_norm": 0.6552133560180664, + "learning_rate": 8.138993976933771e-06, + "loss": 0.7193, + "step": 10390 + }, + { + "epoch": 0.5719081952776708, + "grad_norm": 0.6708530187606812, + "learning_rate": 8.138656565272923e-06, + "loss": 0.8053, + "step": 10391 + }, + { + "epoch": 0.5719632340800265, + "grad_norm": 0.7837093472480774, + "learning_rate": 8.138319130022771e-06, + "loss": 0.7752, + "step": 10392 + }, + { + "epoch": 0.572018272882382, + "grad_norm": 0.6910337805747986, + "learning_rate": 8.137981671185853e-06, + "loss": 0.7573, + "step": 10393 + }, + { + "epoch": 0.5720733116847377, + "grad_norm": 0.6758334636688232, + "learning_rate": 8.137644188764704e-06, + "loss": 0.8251, + "step": 10394 + }, + { + "epoch": 0.5721283504870934, + "grad_norm": 0.7513287663459778, + "learning_rate": 8.137306682761862e-06, + "loss": 0.6491, + "step": 10395 + }, + { + "epoch": 0.572183389289449, + "grad_norm": 0.678210973739624, + "learning_rate": 8.136969153179863e-06, + "loss": 0.7761, + "step": 10396 + }, + { + "epoch": 0.5722384280918047, + "grad_norm": 0.8256083726882935, + "learning_rate": 8.13663160002124e-06, + "loss": 0.7813, + "step": 10397 + }, + { + "epoch": 0.5722934668941604, + "grad_norm": 0.8383314609527588, + "learning_rate": 8.136294023288538e-06, + "loss": 0.7669, + "step": 10398 + }, + { + "epoch": 0.5723485056965161, + "grad_norm": 0.7150036692619324, + "learning_rate": 8.135956422984287e-06, + "loss": 0.8322, + "step": 10399 + }, + { + "epoch": 0.5724035444988717, + "grad_norm": 1.3011385202407837, + "learning_rate": 8.13561879911103e-06, + "loss": 0.8044, + "step": 10400 + }, + { + "epoch": 0.5724585833012273, + "grad_norm": 0.6749194860458374, + "learning_rate": 8.135281151671298e-06, + "loss": 0.6426, + "step": 10401 + }, + { + "epoch": 0.572513622103583, + "grad_norm": 0.7370286583900452, + "learning_rate": 8.134943480667635e-06, + "loss": 0.8051, + "step": 10402 + }, + { + "epoch": 0.5725686609059387, + "grad_norm": 0.6827631592750549, + "learning_rate": 8.134605786102574e-06, + "loss": 0.6961, + "step": 10403 + }, + { + "epoch": 0.5726236997082943, + "grad_norm": 0.7593247294425964, + "learning_rate": 8.134268067978655e-06, + "loss": 0.7514, + "step": 10404 + }, + { + "epoch": 0.57267873851065, + "grad_norm": 0.7229800224304199, + "learning_rate": 8.133930326298417e-06, + "loss": 0.8105, + "step": 10405 + }, + { + "epoch": 0.5727337773130057, + "grad_norm": 0.720973551273346, + "learning_rate": 8.133592561064396e-06, + "loss": 0.6866, + "step": 10406 + }, + { + "epoch": 0.5727888161153614, + "grad_norm": 0.7530742883682251, + "learning_rate": 8.133254772279135e-06, + "loss": 0.773, + "step": 10407 + }, + { + "epoch": 0.5728438549177169, + "grad_norm": 0.6897457838058472, + "learning_rate": 8.132916959945167e-06, + "loss": 0.8107, + "step": 10408 + }, + { + "epoch": 0.5728988937200726, + "grad_norm": 0.6659066081047058, + "learning_rate": 8.132579124065034e-06, + "loss": 0.8036, + "step": 10409 + }, + { + "epoch": 0.5729539325224283, + "grad_norm": 0.6925005316734314, + "learning_rate": 8.132241264641276e-06, + "loss": 0.7869, + "step": 10410 + }, + { + "epoch": 0.573008971324784, + "grad_norm": 0.8681634068489075, + "learning_rate": 8.131903381676433e-06, + "loss": 0.7411, + "step": 10411 + }, + { + "epoch": 0.5730640101271396, + "grad_norm": 0.669561505317688, + "learning_rate": 8.13156547517304e-06, + "loss": 0.7398, + "step": 10412 + }, + { + "epoch": 0.5731190489294953, + "grad_norm": 0.6737409234046936, + "learning_rate": 8.131227545133639e-06, + "loss": 0.7319, + "step": 10413 + }, + { + "epoch": 0.573174087731851, + "grad_norm": 0.7111513614654541, + "learning_rate": 8.130889591560772e-06, + "loss": 0.7192, + "step": 10414 + }, + { + "epoch": 0.5732291265342067, + "grad_norm": 0.6618744134902954, + "learning_rate": 8.130551614456974e-06, + "loss": 0.6636, + "step": 10415 + }, + { + "epoch": 0.5732841653365622, + "grad_norm": 0.8150144815444946, + "learning_rate": 8.13021361382479e-06, + "loss": 0.7168, + "step": 10416 + }, + { + "epoch": 0.5733392041389179, + "grad_norm": 0.744898796081543, + "learning_rate": 8.129875589666758e-06, + "loss": 0.8562, + "step": 10417 + }, + { + "epoch": 0.5733942429412736, + "grad_norm": 0.7831705212593079, + "learning_rate": 8.129537541985419e-06, + "loss": 0.8491, + "step": 10418 + }, + { + "epoch": 0.5734492817436293, + "grad_norm": 0.8097667098045349, + "learning_rate": 8.129199470783313e-06, + "loss": 0.7623, + "step": 10419 + }, + { + "epoch": 0.5735043205459849, + "grad_norm": 0.7951840758323669, + "learning_rate": 8.128861376062982e-06, + "loss": 0.8195, + "step": 10420 + }, + { + "epoch": 0.5735593593483406, + "grad_norm": 0.5902833938598633, + "learning_rate": 8.128523257826966e-06, + "loss": 0.6244, + "step": 10421 + }, + { + "epoch": 0.5736143981506963, + "grad_norm": 1.113287329673767, + "learning_rate": 8.128185116077805e-06, + "loss": 0.8382, + "step": 10422 + }, + { + "epoch": 0.573669436953052, + "grad_norm": 0.6899390816688538, + "learning_rate": 8.127846950818046e-06, + "loss": 0.7632, + "step": 10423 + }, + { + "epoch": 0.5737244757554075, + "grad_norm": 0.6905965805053711, + "learning_rate": 8.127508762050225e-06, + "loss": 0.7429, + "step": 10424 + }, + { + "epoch": 0.5737795145577632, + "grad_norm": 0.7036122679710388, + "learning_rate": 8.127170549776882e-06, + "loss": 0.7699, + "step": 10425 + }, + { + "epoch": 0.5738345533601189, + "grad_norm": 0.6599798202514648, + "learning_rate": 8.126832314000566e-06, + "loss": 0.7169, + "step": 10426 + }, + { + "epoch": 0.5738895921624746, + "grad_norm": 0.8682155609130859, + "learning_rate": 8.126494054723815e-06, + "loss": 0.851, + "step": 10427 + }, + { + "epoch": 0.5739446309648302, + "grad_norm": 0.6661516427993774, + "learning_rate": 8.12615577194917e-06, + "loss": 0.7287, + "step": 10428 + }, + { + "epoch": 0.5739996697671859, + "grad_norm": 0.6805256009101868, + "learning_rate": 8.125817465679176e-06, + "loss": 0.7033, + "step": 10429 + }, + { + "epoch": 0.5740547085695415, + "grad_norm": 0.7088646292686462, + "learning_rate": 8.125479135916375e-06, + "loss": 0.7295, + "step": 10430 + }, + { + "epoch": 0.5741097473718972, + "grad_norm": 0.6854971647262573, + "learning_rate": 8.12514078266331e-06, + "loss": 0.8102, + "step": 10431 + }, + { + "epoch": 0.5741647861742528, + "grad_norm": 0.7481474876403809, + "learning_rate": 8.124802405922521e-06, + "loss": 0.7463, + "step": 10432 + }, + { + "epoch": 0.5742198249766085, + "grad_norm": 0.8280898928642273, + "learning_rate": 8.124464005696556e-06, + "loss": 0.8067, + "step": 10433 + }, + { + "epoch": 0.5742748637789642, + "grad_norm": 0.696812629699707, + "learning_rate": 8.124125581987953e-06, + "loss": 0.7041, + "step": 10434 + }, + { + "epoch": 0.5743299025813199, + "grad_norm": 0.791084349155426, + "learning_rate": 8.123787134799262e-06, + "loss": 0.8244, + "step": 10435 + }, + { + "epoch": 0.5743849413836755, + "grad_norm": 0.7422665953636169, + "learning_rate": 8.123448664133022e-06, + "loss": 0.7792, + "step": 10436 + }, + { + "epoch": 0.5744399801860312, + "grad_norm": 0.7302834987640381, + "learning_rate": 8.123110169991777e-06, + "loss": 0.7617, + "step": 10437 + }, + { + "epoch": 0.5744950189883868, + "grad_norm": 0.6640440821647644, + "learning_rate": 8.122771652378071e-06, + "loss": 0.7965, + "step": 10438 + }, + { + "epoch": 0.5745500577907424, + "grad_norm": 0.7704516649246216, + "learning_rate": 8.12243311129445e-06, + "loss": 0.7814, + "step": 10439 + }, + { + "epoch": 0.5746050965930981, + "grad_norm": 0.673254668712616, + "learning_rate": 8.122094546743459e-06, + "loss": 0.7364, + "step": 10440 + }, + { + "epoch": 0.5746601353954538, + "grad_norm": 0.7648451924324036, + "learning_rate": 8.121755958727639e-06, + "loss": 0.8585, + "step": 10441 + }, + { + "epoch": 0.5747151741978095, + "grad_norm": 0.6660173535346985, + "learning_rate": 8.121417347249539e-06, + "loss": 0.6989, + "step": 10442 + }, + { + "epoch": 0.5747702130001651, + "grad_norm": 0.7128653526306152, + "learning_rate": 8.1210787123117e-06, + "loss": 0.8317, + "step": 10443 + }, + { + "epoch": 0.5748252518025208, + "grad_norm": 0.6404966115951538, + "learning_rate": 8.12074005391667e-06, + "loss": 0.6957, + "step": 10444 + }, + { + "epoch": 0.5748802906048764, + "grad_norm": 0.9597657918930054, + "learning_rate": 8.120401372066993e-06, + "loss": 0.9266, + "step": 10445 + }, + { + "epoch": 0.5749353294072321, + "grad_norm": 0.7735045552253723, + "learning_rate": 8.120062666765213e-06, + "loss": 0.8159, + "step": 10446 + }, + { + "epoch": 0.5749903682095877, + "grad_norm": 0.8031814098358154, + "learning_rate": 8.11972393801388e-06, + "loss": 0.7741, + "step": 10447 + }, + { + "epoch": 0.5750454070119434, + "grad_norm": 0.7008558511734009, + "learning_rate": 8.119385185815535e-06, + "loss": 0.6558, + "step": 10448 + }, + { + "epoch": 0.5751004458142991, + "grad_norm": 0.8162875175476074, + "learning_rate": 8.119046410172725e-06, + "loss": 0.7196, + "step": 10449 + }, + { + "epoch": 0.5751554846166548, + "grad_norm": 0.8142701983451843, + "learning_rate": 8.118707611088e-06, + "loss": 0.7709, + "step": 10450 + }, + { + "epoch": 0.5752105234190104, + "grad_norm": 0.7671986818313599, + "learning_rate": 8.118368788563902e-06, + "loss": 0.8725, + "step": 10451 + }, + { + "epoch": 0.575265562221366, + "grad_norm": 0.6604374051094055, + "learning_rate": 8.118029942602979e-06, + "loss": 0.7119, + "step": 10452 + }, + { + "epoch": 0.5753206010237217, + "grad_norm": 0.7119179368019104, + "learning_rate": 8.117691073207776e-06, + "loss": 0.7445, + "step": 10453 + }, + { + "epoch": 0.5753756398260774, + "grad_norm": 0.7572842240333557, + "learning_rate": 8.117352180380843e-06, + "loss": 0.7672, + "step": 10454 + }, + { + "epoch": 0.575430678628433, + "grad_norm": 0.688667356967926, + "learning_rate": 8.117013264124725e-06, + "loss": 0.7733, + "step": 10455 + }, + { + "epoch": 0.5754857174307887, + "grad_norm": 0.6683163046836853, + "learning_rate": 8.116674324441971e-06, + "loss": 0.6381, + "step": 10456 + }, + { + "epoch": 0.5755407562331444, + "grad_norm": 0.7792099714279175, + "learning_rate": 8.116335361335126e-06, + "loss": 0.7781, + "step": 10457 + }, + { + "epoch": 0.5755957950355001, + "grad_norm": 0.702132523059845, + "learning_rate": 8.115996374806738e-06, + "loss": 0.7442, + "step": 10458 + }, + { + "epoch": 0.5756508338378556, + "grad_norm": 0.7021365761756897, + "learning_rate": 8.115657364859356e-06, + "loss": 0.7215, + "step": 10459 + }, + { + "epoch": 0.5757058726402113, + "grad_norm": 0.7032247185707092, + "learning_rate": 8.115318331495527e-06, + "loss": 0.7069, + "step": 10460 + }, + { + "epoch": 0.575760911442567, + "grad_norm": 0.8301237225532532, + "learning_rate": 8.1149792747178e-06, + "loss": 0.789, + "step": 10461 + }, + { + "epoch": 0.5758159502449227, + "grad_norm": 0.7051018476486206, + "learning_rate": 8.11464019452872e-06, + "loss": 0.7511, + "step": 10462 + }, + { + "epoch": 0.5758709890472783, + "grad_norm": 0.8422626256942749, + "learning_rate": 8.114301090930843e-06, + "loss": 0.6507, + "step": 10463 + }, + { + "epoch": 0.575926027849634, + "grad_norm": 0.7751632332801819, + "learning_rate": 8.113961963926708e-06, + "loss": 0.7357, + "step": 10464 + }, + { + "epoch": 0.5759810666519897, + "grad_norm": 0.7158333659172058, + "learning_rate": 8.11362281351887e-06, + "loss": 0.8382, + "step": 10465 + }, + { + "epoch": 0.5760361054543454, + "grad_norm": 0.6926481127738953, + "learning_rate": 8.113283639709878e-06, + "loss": 0.7078, + "step": 10466 + }, + { + "epoch": 0.5760911442567009, + "grad_norm": 0.7091588973999023, + "learning_rate": 8.112944442502277e-06, + "loss": 0.7932, + "step": 10467 + }, + { + "epoch": 0.5761461830590566, + "grad_norm": 0.6979780197143555, + "learning_rate": 8.11260522189862e-06, + "loss": 0.6812, + "step": 10468 + }, + { + "epoch": 0.5762012218614123, + "grad_norm": 0.6735736131668091, + "learning_rate": 8.112265977901455e-06, + "loss": 0.7499, + "step": 10469 + }, + { + "epoch": 0.576256260663768, + "grad_norm": 0.6995692849159241, + "learning_rate": 8.111926710513334e-06, + "loss": 0.7123, + "step": 10470 + }, + { + "epoch": 0.5763112994661236, + "grad_norm": 0.7162681818008423, + "learning_rate": 8.111587419736802e-06, + "loss": 0.7586, + "step": 10471 + }, + { + "epoch": 0.5763663382684793, + "grad_norm": 0.945935070514679, + "learning_rate": 8.111248105574414e-06, + "loss": 0.8474, + "step": 10472 + }, + { + "epoch": 0.576421377070835, + "grad_norm": 0.608730673789978, + "learning_rate": 8.110908768028716e-06, + "loss": 0.6433, + "step": 10473 + }, + { + "epoch": 0.5764764158731907, + "grad_norm": 0.6777853965759277, + "learning_rate": 8.110569407102263e-06, + "loss": 0.7913, + "step": 10474 + }, + { + "epoch": 0.5765314546755462, + "grad_norm": 0.6310930848121643, + "learning_rate": 8.1102300227976e-06, + "loss": 0.719, + "step": 10475 + }, + { + "epoch": 0.5765864934779019, + "grad_norm": 0.7048485279083252, + "learning_rate": 8.109890615117282e-06, + "loss": 0.7341, + "step": 10476 + }, + { + "epoch": 0.5766415322802576, + "grad_norm": 0.672987163066864, + "learning_rate": 8.10955118406386e-06, + "loss": 0.7637, + "step": 10477 + }, + { + "epoch": 0.5766965710826133, + "grad_norm": 0.7018216252326965, + "learning_rate": 8.109211729639882e-06, + "loss": 0.6924, + "step": 10478 + }, + { + "epoch": 0.5767516098849689, + "grad_norm": 0.7183761596679688, + "learning_rate": 8.108872251847901e-06, + "loss": 0.7945, + "step": 10479 + }, + { + "epoch": 0.5768066486873246, + "grad_norm": 0.7332683801651001, + "learning_rate": 8.108532750690469e-06, + "loss": 0.7686, + "step": 10480 + }, + { + "epoch": 0.5768616874896803, + "grad_norm": 0.7118290066719055, + "learning_rate": 8.108193226170139e-06, + "loss": 0.6917, + "step": 10481 + }, + { + "epoch": 0.5769167262920358, + "grad_norm": 0.8242507576942444, + "learning_rate": 8.107853678289456e-06, + "loss": 0.9119, + "step": 10482 + }, + { + "epoch": 0.5769717650943915, + "grad_norm": 0.7138590216636658, + "learning_rate": 8.10751410705098e-06, + "loss": 0.7095, + "step": 10483 + }, + { + "epoch": 0.5770268038967472, + "grad_norm": 0.7541199326515198, + "learning_rate": 8.107174512457259e-06, + "loss": 0.8042, + "step": 10484 + }, + { + "epoch": 0.5770818426991029, + "grad_norm": 0.7776939868927002, + "learning_rate": 8.106834894510846e-06, + "loss": 0.8075, + "step": 10485 + }, + { + "epoch": 0.5771368815014585, + "grad_norm": 0.6466917395591736, + "learning_rate": 8.106495253214293e-06, + "loss": 0.707, + "step": 10486 + }, + { + "epoch": 0.5771919203038142, + "grad_norm": 0.687101423740387, + "learning_rate": 8.106155588570153e-06, + "loss": 0.6945, + "step": 10487 + }, + { + "epoch": 0.5772469591061699, + "grad_norm": 0.8338418006896973, + "learning_rate": 8.10581590058098e-06, + "loss": 0.8044, + "step": 10488 + }, + { + "epoch": 0.5773019979085255, + "grad_norm": 0.7052263617515564, + "learning_rate": 8.105476189249325e-06, + "loss": 0.8216, + "step": 10489 + }, + { + "epoch": 0.5773570367108811, + "grad_norm": 0.7205906510353088, + "learning_rate": 8.105136454577744e-06, + "loss": 0.8853, + "step": 10490 + }, + { + "epoch": 0.5774120755132368, + "grad_norm": 0.7875076532363892, + "learning_rate": 8.10479669656879e-06, + "loss": 0.822, + "step": 10491 + }, + { + "epoch": 0.5774671143155925, + "grad_norm": 0.6858797669410706, + "learning_rate": 8.104456915225012e-06, + "loss": 0.7924, + "step": 10492 + }, + { + "epoch": 0.5775221531179482, + "grad_norm": 0.6991322636604309, + "learning_rate": 8.104117110548968e-06, + "loss": 0.8144, + "step": 10493 + }, + { + "epoch": 0.5775771919203038, + "grad_norm": 0.7768846750259399, + "learning_rate": 8.103777282543209e-06, + "loss": 0.7793, + "step": 10494 + }, + { + "epoch": 0.5776322307226595, + "grad_norm": 0.7055716514587402, + "learning_rate": 8.103437431210293e-06, + "loss": 0.7653, + "step": 10495 + }, + { + "epoch": 0.5776872695250151, + "grad_norm": 1.009839653968811, + "learning_rate": 8.10309755655277e-06, + "loss": 0.7646, + "step": 10496 + }, + { + "epoch": 0.5777423083273708, + "grad_norm": 0.699435293674469, + "learning_rate": 8.102757658573197e-06, + "loss": 0.7806, + "step": 10497 + }, + { + "epoch": 0.5777973471297264, + "grad_norm": 0.8566381931304932, + "learning_rate": 8.102417737274129e-06, + "loss": 0.8302, + "step": 10498 + }, + { + "epoch": 0.5778523859320821, + "grad_norm": 0.745801568031311, + "learning_rate": 8.10207779265812e-06, + "loss": 0.91, + "step": 10499 + }, + { + "epoch": 0.5779074247344378, + "grad_norm": 0.6867349743843079, + "learning_rate": 8.101737824727724e-06, + "loss": 0.771, + "step": 10500 + }, + { + "epoch": 0.5779624635367935, + "grad_norm": 0.6693048477172852, + "learning_rate": 8.101397833485496e-06, + "loss": 0.7967, + "step": 10501 + }, + { + "epoch": 0.5780175023391491, + "grad_norm": 0.7485450506210327, + "learning_rate": 8.101057818933993e-06, + "loss": 0.7132, + "step": 10502 + }, + { + "epoch": 0.5780725411415047, + "grad_norm": 0.7619839906692505, + "learning_rate": 8.100717781075769e-06, + "loss": 0.7379, + "step": 10503 + }, + { + "epoch": 0.5781275799438604, + "grad_norm": 0.7651955485343933, + "learning_rate": 8.100377719913382e-06, + "loss": 0.8437, + "step": 10504 + }, + { + "epoch": 0.5781826187462161, + "grad_norm": 0.692385196685791, + "learning_rate": 8.100037635449384e-06, + "loss": 0.7666, + "step": 10505 + }, + { + "epoch": 0.5782376575485717, + "grad_norm": 0.7332374453544617, + "learning_rate": 8.099697527686334e-06, + "loss": 0.7476, + "step": 10506 + }, + { + "epoch": 0.5782926963509274, + "grad_norm": 0.6934877634048462, + "learning_rate": 8.099357396626786e-06, + "loss": 0.8054, + "step": 10507 + }, + { + "epoch": 0.5783477351532831, + "grad_norm": 0.8393011689186096, + "learning_rate": 8.099017242273298e-06, + "loss": 0.8655, + "step": 10508 + }, + { + "epoch": 0.5784027739556388, + "grad_norm": 0.6850646734237671, + "learning_rate": 8.098677064628425e-06, + "loss": 0.7424, + "step": 10509 + }, + { + "epoch": 0.5784578127579943, + "grad_norm": 0.7302095293998718, + "learning_rate": 8.098336863694728e-06, + "loss": 0.903, + "step": 10510 + }, + { + "epoch": 0.57851285156035, + "grad_norm": 0.7474033236503601, + "learning_rate": 8.097996639474757e-06, + "loss": 0.7509, + "step": 10511 + }, + { + "epoch": 0.5785678903627057, + "grad_norm": 0.6525655388832092, + "learning_rate": 8.097656391971074e-06, + "loss": 0.7097, + "step": 10512 + }, + { + "epoch": 0.5786229291650614, + "grad_norm": 0.8197451829910278, + "learning_rate": 8.097316121186234e-06, + "loss": 0.7401, + "step": 10513 + }, + { + "epoch": 0.578677967967417, + "grad_norm": 0.7048231959342957, + "learning_rate": 8.096975827122795e-06, + "loss": 0.7964, + "step": 10514 + }, + { + "epoch": 0.5787330067697727, + "grad_norm": 0.8417022228240967, + "learning_rate": 8.096635509783315e-06, + "loss": 0.7703, + "step": 10515 + }, + { + "epoch": 0.5787880455721284, + "grad_norm": 0.7313926815986633, + "learning_rate": 8.096295169170352e-06, + "loss": 0.7565, + "step": 10516 + }, + { + "epoch": 0.5788430843744841, + "grad_norm": 0.7156692147254944, + "learning_rate": 8.095954805286464e-06, + "loss": 0.7456, + "step": 10517 + }, + { + "epoch": 0.5788981231768396, + "grad_norm": 0.7366768717765808, + "learning_rate": 8.095614418134205e-06, + "loss": 0.72, + "step": 10518 + }, + { + "epoch": 0.5789531619791953, + "grad_norm": 0.7011533379554749, + "learning_rate": 8.09527400771614e-06, + "loss": 0.7683, + "step": 10519 + }, + { + "epoch": 0.579008200781551, + "grad_norm": 0.6849086284637451, + "learning_rate": 8.094933574034823e-06, + "loss": 0.6938, + "step": 10520 + }, + { + "epoch": 0.5790632395839067, + "grad_norm": 0.7351469397544861, + "learning_rate": 8.094593117092814e-06, + "loss": 0.7364, + "step": 10521 + }, + { + "epoch": 0.5791182783862623, + "grad_norm": 0.7133724689483643, + "learning_rate": 8.09425263689267e-06, + "loss": 0.7328, + "step": 10522 + }, + { + "epoch": 0.579173317188618, + "grad_norm": 0.6713461875915527, + "learning_rate": 8.093912133436954e-06, + "loss": 0.7296, + "step": 10523 + }, + { + "epoch": 0.5792283559909737, + "grad_norm": 0.7057825922966003, + "learning_rate": 8.093571606728222e-06, + "loss": 0.7732, + "step": 10524 + }, + { + "epoch": 0.5792833947933292, + "grad_norm": 0.7378783226013184, + "learning_rate": 8.093231056769033e-06, + "loss": 0.7907, + "step": 10525 + }, + { + "epoch": 0.5793384335956849, + "grad_norm": 0.8796947598457336, + "learning_rate": 8.092890483561947e-06, + "loss": 0.7325, + "step": 10526 + }, + { + "epoch": 0.5793934723980406, + "grad_norm": 0.7326352000236511, + "learning_rate": 8.092549887109525e-06, + "loss": 0.7948, + "step": 10527 + }, + { + "epoch": 0.5794485112003963, + "grad_norm": 0.7131063342094421, + "learning_rate": 8.092209267414325e-06, + "loss": 0.7595, + "step": 10528 + }, + { + "epoch": 0.5795035500027519, + "grad_norm": 0.6993252635002136, + "learning_rate": 8.091868624478908e-06, + "loss": 0.782, + "step": 10529 + }, + { + "epoch": 0.5795585888051076, + "grad_norm": 0.6945857405662537, + "learning_rate": 8.091527958305835e-06, + "loss": 0.7283, + "step": 10530 + }, + { + "epoch": 0.5796136276074633, + "grad_norm": 0.8203904032707214, + "learning_rate": 8.091187268897667e-06, + "loss": 0.7787, + "step": 10531 + }, + { + "epoch": 0.579668666409819, + "grad_norm": 0.6450221538543701, + "learning_rate": 8.09084655625696e-06, + "loss": 0.7092, + "step": 10532 + }, + { + "epoch": 0.5797237052121745, + "grad_norm": 0.6852096915245056, + "learning_rate": 8.090505820386279e-06, + "loss": 0.7916, + "step": 10533 + }, + { + "epoch": 0.5797787440145302, + "grad_norm": 1.0816445350646973, + "learning_rate": 8.090165061288182e-06, + "loss": 0.7545, + "step": 10534 + }, + { + "epoch": 0.5798337828168859, + "grad_norm": 0.7312847375869751, + "learning_rate": 8.089824278965233e-06, + "loss": 0.7395, + "step": 10535 + }, + { + "epoch": 0.5798888216192416, + "grad_norm": 0.7281426191329956, + "learning_rate": 8.089483473419992e-06, + "loss": 0.7677, + "step": 10536 + }, + { + "epoch": 0.5799438604215972, + "grad_norm": 0.7392409443855286, + "learning_rate": 8.08914264465502e-06, + "loss": 0.7674, + "step": 10537 + }, + { + "epoch": 0.5799988992239529, + "grad_norm": 0.7041863799095154, + "learning_rate": 8.088801792672877e-06, + "loss": 0.6156, + "step": 10538 + }, + { + "epoch": 0.5800539380263086, + "grad_norm": 0.7113755345344543, + "learning_rate": 8.088460917476128e-06, + "loss": 0.7677, + "step": 10539 + }, + { + "epoch": 0.5801089768286642, + "grad_norm": 0.673966646194458, + "learning_rate": 8.088120019067334e-06, + "loss": 0.7557, + "step": 10540 + }, + { + "epoch": 0.5801640156310198, + "grad_norm": 0.8165854215621948, + "learning_rate": 8.087779097449055e-06, + "loss": 0.8102, + "step": 10541 + }, + { + "epoch": 0.5802190544333755, + "grad_norm": 0.7010880708694458, + "learning_rate": 8.087438152623857e-06, + "loss": 0.7816, + "step": 10542 + }, + { + "epoch": 0.5802740932357312, + "grad_norm": 0.726177990436554, + "learning_rate": 8.0870971845943e-06, + "loss": 0.7671, + "step": 10543 + }, + { + "epoch": 0.5803291320380869, + "grad_norm": 0.7403919696807861, + "learning_rate": 8.086756193362946e-06, + "loss": 0.8449, + "step": 10544 + }, + { + "epoch": 0.5803841708404425, + "grad_norm": 0.6897104382514954, + "learning_rate": 8.086415178932358e-06, + "loss": 0.7563, + "step": 10545 + }, + { + "epoch": 0.5804392096427982, + "grad_norm": 0.7682604193687439, + "learning_rate": 8.0860741413051e-06, + "loss": 0.8019, + "step": 10546 + }, + { + "epoch": 0.5804942484451538, + "grad_norm": 0.7317522168159485, + "learning_rate": 8.085733080483736e-06, + "loss": 0.7446, + "step": 10547 + }, + { + "epoch": 0.5805492872475095, + "grad_norm": 0.8503430485725403, + "learning_rate": 8.085391996470826e-06, + "loss": 0.7343, + "step": 10548 + }, + { + "epoch": 0.5806043260498651, + "grad_norm": 0.8550657629966736, + "learning_rate": 8.085050889268937e-06, + "loss": 0.9267, + "step": 10549 + }, + { + "epoch": 0.5806593648522208, + "grad_norm": 0.7751224637031555, + "learning_rate": 8.084709758880633e-06, + "loss": 0.7404, + "step": 10550 + }, + { + "epoch": 0.5807144036545765, + "grad_norm": 0.6346186399459839, + "learning_rate": 8.084368605308475e-06, + "loss": 0.66, + "step": 10551 + }, + { + "epoch": 0.5807694424569322, + "grad_norm": 0.7295717597007751, + "learning_rate": 8.084027428555027e-06, + "loss": 0.8313, + "step": 10552 + }, + { + "epoch": 0.5808244812592878, + "grad_norm": 0.6962289810180664, + "learning_rate": 8.083686228622856e-06, + "loss": 0.7871, + "step": 10553 + }, + { + "epoch": 0.5808795200616435, + "grad_norm": 0.6968896389007568, + "learning_rate": 8.083345005514522e-06, + "loss": 0.7261, + "step": 10554 + }, + { + "epoch": 0.5809345588639991, + "grad_norm": 0.8374869227409363, + "learning_rate": 8.083003759232595e-06, + "loss": 0.797, + "step": 10555 + }, + { + "epoch": 0.5809895976663548, + "grad_norm": 0.6511034369468689, + "learning_rate": 8.082662489779637e-06, + "loss": 0.7237, + "step": 10556 + }, + { + "epoch": 0.5810446364687104, + "grad_norm": 0.6644287705421448, + "learning_rate": 8.082321197158212e-06, + "loss": 0.6969, + "step": 10557 + }, + { + "epoch": 0.5810996752710661, + "grad_norm": 0.7681102752685547, + "learning_rate": 8.081979881370884e-06, + "loss": 0.7193, + "step": 10558 + }, + { + "epoch": 0.5811547140734218, + "grad_norm": 0.7930792570114136, + "learning_rate": 8.081638542420224e-06, + "loss": 0.7198, + "step": 10559 + }, + { + "epoch": 0.5812097528757775, + "grad_norm": 0.7227992415428162, + "learning_rate": 8.081297180308791e-06, + "loss": 0.7533, + "step": 10560 + }, + { + "epoch": 0.581264791678133, + "grad_norm": 0.7293071150779724, + "learning_rate": 8.080955795039156e-06, + "loss": 0.6228, + "step": 10561 + }, + { + "epoch": 0.5813198304804887, + "grad_norm": 0.7356483936309814, + "learning_rate": 8.080614386613879e-06, + "loss": 0.7299, + "step": 10562 + }, + { + "epoch": 0.5813748692828444, + "grad_norm": 0.8181473016738892, + "learning_rate": 8.080272955035531e-06, + "loss": 0.6576, + "step": 10563 + }, + { + "epoch": 0.5814299080852001, + "grad_norm": 0.7066958546638489, + "learning_rate": 8.079931500306675e-06, + "loss": 0.7372, + "step": 10564 + }, + { + "epoch": 0.5814849468875557, + "grad_norm": 0.6821097135543823, + "learning_rate": 8.079590022429877e-06, + "loss": 0.7516, + "step": 10565 + }, + { + "epoch": 0.5815399856899114, + "grad_norm": 0.6879069209098816, + "learning_rate": 8.079248521407707e-06, + "loss": 0.7525, + "step": 10566 + }, + { + "epoch": 0.5815950244922671, + "grad_norm": 0.956345796585083, + "learning_rate": 8.078906997242729e-06, + "loss": 0.8175, + "step": 10567 + }, + { + "epoch": 0.5816500632946227, + "grad_norm": 0.6942328214645386, + "learning_rate": 8.078565449937508e-06, + "loss": 0.6264, + "step": 10568 + }, + { + "epoch": 0.5817051020969783, + "grad_norm": 0.7073766589164734, + "learning_rate": 8.078223879494615e-06, + "loss": 0.766, + "step": 10569 + }, + { + "epoch": 0.581760140899334, + "grad_norm": 0.7649571895599365, + "learning_rate": 8.077882285916614e-06, + "loss": 0.8767, + "step": 10570 + }, + { + "epoch": 0.5818151797016897, + "grad_norm": 0.6384355425834656, + "learning_rate": 8.077540669206076e-06, + "loss": 0.7444, + "step": 10571 + }, + { + "epoch": 0.5818702185040453, + "grad_norm": 0.7173928022384644, + "learning_rate": 8.077199029365565e-06, + "loss": 0.8277, + "step": 10572 + }, + { + "epoch": 0.581925257306401, + "grad_norm": 0.7310757637023926, + "learning_rate": 8.076857366397648e-06, + "loss": 0.8425, + "step": 10573 + }, + { + "epoch": 0.5819802961087567, + "grad_norm": 0.6888872385025024, + "learning_rate": 8.076515680304897e-06, + "loss": 0.6961, + "step": 10574 + }, + { + "epoch": 0.5820353349111124, + "grad_norm": 0.7290124297142029, + "learning_rate": 8.076173971089877e-06, + "loss": 0.7865, + "step": 10575 + }, + { + "epoch": 0.582090373713468, + "grad_norm": 0.7402634024620056, + "learning_rate": 8.075832238755156e-06, + "loss": 0.7196, + "step": 10576 + }, + { + "epoch": 0.5821454125158236, + "grad_norm": 0.74916672706604, + "learning_rate": 8.075490483303305e-06, + "loss": 0.8361, + "step": 10577 + }, + { + "epoch": 0.5822004513181793, + "grad_norm": 0.8146494626998901, + "learning_rate": 8.07514870473689e-06, + "loss": 0.7398, + "step": 10578 + }, + { + "epoch": 0.582255490120535, + "grad_norm": 0.6632487177848816, + "learning_rate": 8.07480690305848e-06, + "loss": 0.7239, + "step": 10579 + }, + { + "epoch": 0.5823105289228906, + "grad_norm": 0.6912766695022583, + "learning_rate": 8.074465078270645e-06, + "loss": 0.7488, + "step": 10580 + }, + { + "epoch": 0.5823655677252463, + "grad_norm": 0.7410522699356079, + "learning_rate": 8.074123230375952e-06, + "loss": 0.7413, + "step": 10581 + }, + { + "epoch": 0.582420606527602, + "grad_norm": 0.7932689189910889, + "learning_rate": 8.073781359376972e-06, + "loss": 0.7894, + "step": 10582 + }, + { + "epoch": 0.5824756453299577, + "grad_norm": 0.6710309982299805, + "learning_rate": 8.073439465276277e-06, + "loss": 0.6727, + "step": 10583 + }, + { + "epoch": 0.5825306841323132, + "grad_norm": 0.7457143068313599, + "learning_rate": 8.07309754807643e-06, + "loss": 0.6719, + "step": 10584 + }, + { + "epoch": 0.5825857229346689, + "grad_norm": 0.7340453863143921, + "learning_rate": 8.072755607780008e-06, + "loss": 0.7397, + "step": 10585 + }, + { + "epoch": 0.5826407617370246, + "grad_norm": 0.7532176971435547, + "learning_rate": 8.072413644389574e-06, + "loss": 0.7368, + "step": 10586 + }, + { + "epoch": 0.5826958005393803, + "grad_norm": 0.9317812919616699, + "learning_rate": 8.072071657907703e-06, + "loss": 0.9113, + "step": 10587 + }, + { + "epoch": 0.5827508393417359, + "grad_norm": 0.8535491228103638, + "learning_rate": 8.071729648336963e-06, + "loss": 0.7708, + "step": 10588 + }, + { + "epoch": 0.5828058781440916, + "grad_norm": 0.6720348000526428, + "learning_rate": 8.071387615679926e-06, + "loss": 0.7521, + "step": 10589 + }, + { + "epoch": 0.5828609169464473, + "grad_norm": 0.7113864421844482, + "learning_rate": 8.071045559939162e-06, + "loss": 0.8713, + "step": 10590 + }, + { + "epoch": 0.582915955748803, + "grad_norm": 0.7760024070739746, + "learning_rate": 8.070703481117242e-06, + "loss": 0.7567, + "step": 10591 + }, + { + "epoch": 0.5829709945511585, + "grad_norm": 0.9548617005348206, + "learning_rate": 8.070361379216735e-06, + "loss": 0.7937, + "step": 10592 + }, + { + "epoch": 0.5830260333535142, + "grad_norm": 0.7796840667724609, + "learning_rate": 8.070019254240216e-06, + "loss": 0.7485, + "step": 10593 + }, + { + "epoch": 0.5830810721558699, + "grad_norm": 0.7006514668464661, + "learning_rate": 8.069677106190253e-06, + "loss": 0.7813, + "step": 10594 + }, + { + "epoch": 0.5831361109582256, + "grad_norm": 0.646396279335022, + "learning_rate": 8.069334935069417e-06, + "loss": 0.7437, + "step": 10595 + }, + { + "epoch": 0.5831911497605812, + "grad_norm": 0.8257368206977844, + "learning_rate": 8.068992740880283e-06, + "loss": 0.7351, + "step": 10596 + }, + { + "epoch": 0.5832461885629369, + "grad_norm": 0.6646208763122559, + "learning_rate": 8.068650523625422e-06, + "loss": 0.6554, + "step": 10597 + }, + { + "epoch": 0.5833012273652926, + "grad_norm": 0.8495579957962036, + "learning_rate": 8.068308283307402e-06, + "loss": 0.791, + "step": 10598 + }, + { + "epoch": 0.5833562661676482, + "grad_norm": 0.7283076047897339, + "learning_rate": 8.0679660199288e-06, + "loss": 0.7327, + "step": 10599 + }, + { + "epoch": 0.5834113049700038, + "grad_norm": 0.704572856426239, + "learning_rate": 8.067623733492187e-06, + "loss": 0.6094, + "step": 10600 + }, + { + "epoch": 0.5834663437723595, + "grad_norm": 0.6435144543647766, + "learning_rate": 8.067281424000136e-06, + "loss": 0.6974, + "step": 10601 + }, + { + "epoch": 0.5835213825747152, + "grad_norm": 0.9628346562385559, + "learning_rate": 8.066939091455215e-06, + "loss": 0.8933, + "step": 10602 + }, + { + "epoch": 0.5835764213770709, + "grad_norm": 0.6856930255889893, + "learning_rate": 8.066596735860004e-06, + "loss": 0.7414, + "step": 10603 + }, + { + "epoch": 0.5836314601794265, + "grad_norm": 0.7341175675392151, + "learning_rate": 8.066254357217072e-06, + "loss": 0.7553, + "step": 10604 + }, + { + "epoch": 0.5836864989817822, + "grad_norm": 0.7124871611595154, + "learning_rate": 8.065911955528995e-06, + "loss": 0.663, + "step": 10605 + }, + { + "epoch": 0.5837415377841378, + "grad_norm": 0.816028892993927, + "learning_rate": 8.065569530798341e-06, + "loss": 0.8778, + "step": 10606 + }, + { + "epoch": 0.5837965765864935, + "grad_norm": 0.8735721111297607, + "learning_rate": 8.06522708302769e-06, + "loss": 0.7866, + "step": 10607 + }, + { + "epoch": 0.5838516153888491, + "grad_norm": 0.6780036687850952, + "learning_rate": 8.06488461221961e-06, + "loss": 0.7329, + "step": 10608 + }, + { + "epoch": 0.5839066541912048, + "grad_norm": 0.7624822854995728, + "learning_rate": 8.06454211837668e-06, + "loss": 0.8095, + "step": 10609 + }, + { + "epoch": 0.5839616929935605, + "grad_norm": 0.8269234895706177, + "learning_rate": 8.06419960150147e-06, + "loss": 0.7194, + "step": 10610 + }, + { + "epoch": 0.5840167317959161, + "grad_norm": 0.6748649477958679, + "learning_rate": 8.063857061596558e-06, + "loss": 0.702, + "step": 10611 + }, + { + "epoch": 0.5840717705982718, + "grad_norm": 0.9700273275375366, + "learning_rate": 8.063514498664515e-06, + "loss": 0.7917, + "step": 10612 + }, + { + "epoch": 0.5841268094006274, + "grad_norm": 0.7798827290534973, + "learning_rate": 8.063171912707916e-06, + "loss": 0.798, + "step": 10613 + }, + { + "epoch": 0.5841818482029831, + "grad_norm": 0.6613249778747559, + "learning_rate": 8.06282930372934e-06, + "loss": 0.7216, + "step": 10614 + }, + { + "epoch": 0.5842368870053387, + "grad_norm": 0.727116048336029, + "learning_rate": 8.062486671731357e-06, + "loss": 0.8054, + "step": 10615 + }, + { + "epoch": 0.5842919258076944, + "grad_norm": 0.6704444289207458, + "learning_rate": 8.062144016716543e-06, + "loss": 0.7503, + "step": 10616 + }, + { + "epoch": 0.5843469646100501, + "grad_norm": 0.6867938041687012, + "learning_rate": 8.061801338687477e-06, + "loss": 0.8005, + "step": 10617 + }, + { + "epoch": 0.5844020034124058, + "grad_norm": 0.7097555994987488, + "learning_rate": 8.061458637646729e-06, + "loss": 0.8515, + "step": 10618 + }, + { + "epoch": 0.5844570422147614, + "grad_norm": 0.6624881625175476, + "learning_rate": 8.061115913596878e-06, + "loss": 0.7735, + "step": 10619 + }, + { + "epoch": 0.584512081017117, + "grad_norm": 0.6649004220962524, + "learning_rate": 8.060773166540498e-06, + "loss": 0.7837, + "step": 10620 + }, + { + "epoch": 0.5845671198194727, + "grad_norm": 0.6732968091964722, + "learning_rate": 8.06043039648017e-06, + "loss": 0.7846, + "step": 10621 + }, + { + "epoch": 0.5846221586218284, + "grad_norm": 0.7551947236061096, + "learning_rate": 8.060087603418464e-06, + "loss": 0.6868, + "step": 10622 + }, + { + "epoch": 0.584677197424184, + "grad_norm": 0.7781728506088257, + "learning_rate": 8.059744787357959e-06, + "loss": 0.8088, + "step": 10623 + }, + { + "epoch": 0.5847322362265397, + "grad_norm": 0.6362790465354919, + "learning_rate": 8.05940194830123e-06, + "loss": 0.664, + "step": 10624 + }, + { + "epoch": 0.5847872750288954, + "grad_norm": 0.670386791229248, + "learning_rate": 8.059059086250856e-06, + "loss": 0.6839, + "step": 10625 + }, + { + "epoch": 0.5848423138312511, + "grad_norm": 0.7030045986175537, + "learning_rate": 8.058716201209414e-06, + "loss": 0.7243, + "step": 10626 + }, + { + "epoch": 0.5848973526336066, + "grad_norm": 0.7881805896759033, + "learning_rate": 8.058373293179477e-06, + "loss": 0.7994, + "step": 10627 + }, + { + "epoch": 0.5849523914359623, + "grad_norm": 0.7077344059944153, + "learning_rate": 8.058030362163628e-06, + "loss": 0.822, + "step": 10628 + }, + { + "epoch": 0.585007430238318, + "grad_norm": 0.6787039637565613, + "learning_rate": 8.057687408164439e-06, + "loss": 0.7619, + "step": 10629 + }, + { + "epoch": 0.5850624690406737, + "grad_norm": 1.1377217769622803, + "learning_rate": 8.05734443118449e-06, + "loss": 0.8632, + "step": 10630 + }, + { + "epoch": 0.5851175078430293, + "grad_norm": 0.7002600431442261, + "learning_rate": 8.05700143122636e-06, + "loss": 0.8184, + "step": 10631 + }, + { + "epoch": 0.585172546645385, + "grad_norm": 0.7016324400901794, + "learning_rate": 8.056658408292626e-06, + "loss": 0.658, + "step": 10632 + }, + { + "epoch": 0.5852275854477407, + "grad_norm": 0.6674843430519104, + "learning_rate": 8.056315362385864e-06, + "loss": 0.7281, + "step": 10633 + }, + { + "epoch": 0.5852826242500964, + "grad_norm": 0.6789288520812988, + "learning_rate": 8.055972293508653e-06, + "loss": 0.8192, + "step": 10634 + }, + { + "epoch": 0.5853376630524519, + "grad_norm": 0.6740062236785889, + "learning_rate": 8.055629201663575e-06, + "loss": 0.7343, + "step": 10635 + }, + { + "epoch": 0.5853927018548076, + "grad_norm": 0.7417730689048767, + "learning_rate": 8.055286086853204e-06, + "loss": 0.8161, + "step": 10636 + }, + { + "epoch": 0.5854477406571633, + "grad_norm": 0.6680465340614319, + "learning_rate": 8.054942949080122e-06, + "loss": 0.7589, + "step": 10637 + }, + { + "epoch": 0.585502779459519, + "grad_norm": 0.7205108404159546, + "learning_rate": 8.054599788346904e-06, + "loss": 0.6837, + "step": 10638 + }, + { + "epoch": 0.5855578182618746, + "grad_norm": 0.8694404363632202, + "learning_rate": 8.054256604656134e-06, + "loss": 0.8033, + "step": 10639 + }, + { + "epoch": 0.5856128570642303, + "grad_norm": 0.685471773147583, + "learning_rate": 8.053913398010389e-06, + "loss": 0.7654, + "step": 10640 + }, + { + "epoch": 0.585667895866586, + "grad_norm": 1.3463424444198608, + "learning_rate": 8.053570168412249e-06, + "loss": 0.7743, + "step": 10641 + }, + { + "epoch": 0.5857229346689417, + "grad_norm": 0.9380106329917908, + "learning_rate": 8.05322691586429e-06, + "loss": 0.8984, + "step": 10642 + }, + { + "epoch": 0.5857779734712972, + "grad_norm": 0.7408519387245178, + "learning_rate": 8.052883640369096e-06, + "loss": 0.7716, + "step": 10643 + }, + { + "epoch": 0.5858330122736529, + "grad_norm": 0.7712904214859009, + "learning_rate": 8.052540341929248e-06, + "loss": 0.7767, + "step": 10644 + }, + { + "epoch": 0.5858880510760086, + "grad_norm": 0.8464158177375793, + "learning_rate": 8.052197020547321e-06, + "loss": 0.8333, + "step": 10645 + }, + { + "epoch": 0.5859430898783643, + "grad_norm": 0.6970158219337463, + "learning_rate": 8.0518536762259e-06, + "loss": 0.7354, + "step": 10646 + }, + { + "epoch": 0.5859981286807199, + "grad_norm": 0.7048965096473694, + "learning_rate": 8.051510308967563e-06, + "loss": 0.8333, + "step": 10647 + }, + { + "epoch": 0.5860531674830756, + "grad_norm": 0.6443868279457092, + "learning_rate": 8.05116691877489e-06, + "loss": 0.7386, + "step": 10648 + }, + { + "epoch": 0.5861082062854313, + "grad_norm": 0.6653542518615723, + "learning_rate": 8.050823505650465e-06, + "loss": 0.8116, + "step": 10649 + }, + { + "epoch": 0.5861632450877869, + "grad_norm": 0.7293158769607544, + "learning_rate": 8.050480069596868e-06, + "loss": 0.8231, + "step": 10650 + }, + { + "epoch": 0.5862182838901425, + "grad_norm": 0.6876117587089539, + "learning_rate": 8.050136610616676e-06, + "loss": 0.7856, + "step": 10651 + }, + { + "epoch": 0.5862733226924982, + "grad_norm": 0.6811665296554565, + "learning_rate": 8.049793128712477e-06, + "loss": 0.7667, + "step": 10652 + }, + { + "epoch": 0.5863283614948539, + "grad_norm": 0.701034426689148, + "learning_rate": 8.049449623886849e-06, + "loss": 0.7812, + "step": 10653 + }, + { + "epoch": 0.5863834002972095, + "grad_norm": 0.6872833967208862, + "learning_rate": 8.049106096142372e-06, + "loss": 0.755, + "step": 10654 + }, + { + "epoch": 0.5864384390995652, + "grad_norm": 0.6643580198287964, + "learning_rate": 8.04876254548163e-06, + "loss": 0.7692, + "step": 10655 + }, + { + "epoch": 0.5864934779019209, + "grad_norm": 0.6672106981277466, + "learning_rate": 8.048418971907206e-06, + "loss": 0.7424, + "step": 10656 + }, + { + "epoch": 0.5865485167042765, + "grad_norm": 0.8030515313148499, + "learning_rate": 8.04807537542168e-06, + "loss": 0.8074, + "step": 10657 + }, + { + "epoch": 0.5866035555066321, + "grad_norm": 0.713417112827301, + "learning_rate": 8.047731756027637e-06, + "loss": 0.6974, + "step": 10658 + }, + { + "epoch": 0.5866585943089878, + "grad_norm": 0.7715572118759155, + "learning_rate": 8.047388113727657e-06, + "loss": 0.7353, + "step": 10659 + }, + { + "epoch": 0.5867136331113435, + "grad_norm": 0.7009812593460083, + "learning_rate": 8.047044448524323e-06, + "loss": 0.7992, + "step": 10660 + }, + { + "epoch": 0.5867686719136992, + "grad_norm": 0.6425079107284546, + "learning_rate": 8.046700760420219e-06, + "loss": 0.7394, + "step": 10661 + }, + { + "epoch": 0.5868237107160548, + "grad_norm": 0.7713460922241211, + "learning_rate": 8.046357049417927e-06, + "loss": 0.7759, + "step": 10662 + }, + { + "epoch": 0.5868787495184105, + "grad_norm": 0.7310347557067871, + "learning_rate": 8.046013315520033e-06, + "loss": 0.7278, + "step": 10663 + }, + { + "epoch": 0.5869337883207661, + "grad_norm": 0.7493315935134888, + "learning_rate": 8.045669558729117e-06, + "loss": 0.7808, + "step": 10664 + }, + { + "epoch": 0.5869888271231218, + "grad_norm": 0.7547439336776733, + "learning_rate": 8.045325779047763e-06, + "loss": 0.8245, + "step": 10665 + }, + { + "epoch": 0.5870438659254774, + "grad_norm": 0.7556985020637512, + "learning_rate": 8.044981976478557e-06, + "loss": 0.8, + "step": 10666 + }, + { + "epoch": 0.5870989047278331, + "grad_norm": 0.8330736756324768, + "learning_rate": 8.04463815102408e-06, + "loss": 0.8177, + "step": 10667 + }, + { + "epoch": 0.5871539435301888, + "grad_norm": 0.7823941111564636, + "learning_rate": 8.04429430268692e-06, + "loss": 0.8306, + "step": 10668 + }, + { + "epoch": 0.5872089823325445, + "grad_norm": 0.9141719937324524, + "learning_rate": 8.043950431469657e-06, + "loss": 0.9137, + "step": 10669 + }, + { + "epoch": 0.5872640211349001, + "grad_norm": 0.6967095732688904, + "learning_rate": 8.043606537374878e-06, + "loss": 0.7262, + "step": 10670 + }, + { + "epoch": 0.5873190599372557, + "grad_norm": 0.7909649014472961, + "learning_rate": 8.043262620405166e-06, + "loss": 0.8332, + "step": 10671 + }, + { + "epoch": 0.5873740987396114, + "grad_norm": 0.7967168092727661, + "learning_rate": 8.042918680563107e-06, + "loss": 0.7966, + "step": 10672 + }, + { + "epoch": 0.5874291375419671, + "grad_norm": 0.7637625336647034, + "learning_rate": 8.042574717851287e-06, + "loss": 0.8322, + "step": 10673 + }, + { + "epoch": 0.5874841763443227, + "grad_norm": 0.6968004107475281, + "learning_rate": 8.04223073227229e-06, + "loss": 0.8061, + "step": 10674 + }, + { + "epoch": 0.5875392151466784, + "grad_norm": 0.7325586080551147, + "learning_rate": 8.0418867238287e-06, + "loss": 0.7922, + "step": 10675 + }, + { + "epoch": 0.5875942539490341, + "grad_norm": 0.6784406304359436, + "learning_rate": 8.041542692523103e-06, + "loss": 0.7327, + "step": 10676 + }, + { + "epoch": 0.5876492927513898, + "grad_norm": 0.8297861218452454, + "learning_rate": 8.041198638358088e-06, + "loss": 0.9347, + "step": 10677 + }, + { + "epoch": 0.5877043315537454, + "grad_norm": 0.6227413415908813, + "learning_rate": 8.040854561336236e-06, + "loss": 0.655, + "step": 10678 + }, + { + "epoch": 0.587759370356101, + "grad_norm": 0.752098023891449, + "learning_rate": 8.040510461460134e-06, + "loss": 0.7608, + "step": 10679 + }, + { + "epoch": 0.5878144091584567, + "grad_norm": 0.7008342146873474, + "learning_rate": 8.040166338732372e-06, + "loss": 0.7385, + "step": 10680 + }, + { + "epoch": 0.5878694479608124, + "grad_norm": 0.6768027544021606, + "learning_rate": 8.039822193155532e-06, + "loss": 0.6812, + "step": 10681 + }, + { + "epoch": 0.587924486763168, + "grad_norm": 0.7728545069694519, + "learning_rate": 8.039478024732203e-06, + "loss": 0.7696, + "step": 10682 + }, + { + "epoch": 0.5879795255655237, + "grad_norm": 0.7257505655288696, + "learning_rate": 8.03913383346497e-06, + "loss": 0.6686, + "step": 10683 + }, + { + "epoch": 0.5880345643678794, + "grad_norm": 0.7755837440490723, + "learning_rate": 8.03878961935642e-06, + "loss": 0.8469, + "step": 10684 + }, + { + "epoch": 0.5880896031702351, + "grad_norm": 0.7187668085098267, + "learning_rate": 8.038445382409142e-06, + "loss": 0.8249, + "step": 10685 + }, + { + "epoch": 0.5881446419725906, + "grad_norm": 0.638053834438324, + "learning_rate": 8.038101122625722e-06, + "loss": 0.6876, + "step": 10686 + }, + { + "epoch": 0.5881996807749463, + "grad_norm": 0.7323756217956543, + "learning_rate": 8.037756840008746e-06, + "loss": 0.7489, + "step": 10687 + }, + { + "epoch": 0.588254719577302, + "grad_norm": 0.6795439720153809, + "learning_rate": 8.037412534560804e-06, + "loss": 0.7246, + "step": 10688 + }, + { + "epoch": 0.5883097583796577, + "grad_norm": 0.8136376142501831, + "learning_rate": 8.037068206284482e-06, + "loss": 0.8518, + "step": 10689 + }, + { + "epoch": 0.5883647971820133, + "grad_norm": 0.6484195590019226, + "learning_rate": 8.036723855182367e-06, + "loss": 0.7018, + "step": 10690 + }, + { + "epoch": 0.588419835984369, + "grad_norm": 0.7465028166770935, + "learning_rate": 8.036379481257048e-06, + "loss": 0.8276, + "step": 10691 + }, + { + "epoch": 0.5884748747867247, + "grad_norm": 0.7761173844337463, + "learning_rate": 8.036035084511116e-06, + "loss": 0.6371, + "step": 10692 + }, + { + "epoch": 0.5885299135890804, + "grad_norm": 0.830008864402771, + "learning_rate": 8.035690664947156e-06, + "loss": 0.8199, + "step": 10693 + }, + { + "epoch": 0.5885849523914359, + "grad_norm": 0.6614254117012024, + "learning_rate": 8.03534622256776e-06, + "loss": 0.656, + "step": 10694 + }, + { + "epoch": 0.5886399911937916, + "grad_norm": 0.7229047417640686, + "learning_rate": 8.035001757375509e-06, + "loss": 0.7622, + "step": 10695 + }, + { + "epoch": 0.5886950299961473, + "grad_norm": 0.7044325470924377, + "learning_rate": 8.034657269373001e-06, + "loss": 0.7678, + "step": 10696 + }, + { + "epoch": 0.5887500687985029, + "grad_norm": 0.7109018564224243, + "learning_rate": 8.03431275856282e-06, + "loss": 0.7976, + "step": 10697 + }, + { + "epoch": 0.5888051076008586, + "grad_norm": 0.7812879085540771, + "learning_rate": 8.033968224947557e-06, + "loss": 0.7163, + "step": 10698 + }, + { + "epoch": 0.5888601464032143, + "grad_norm": 0.7408469915390015, + "learning_rate": 8.033623668529802e-06, + "loss": 0.6895, + "step": 10699 + }, + { + "epoch": 0.58891518520557, + "grad_norm": 0.7654302716255188, + "learning_rate": 8.033279089312142e-06, + "loss": 0.8126, + "step": 10700 + }, + { + "epoch": 0.5889702240079255, + "grad_norm": 0.7307846546173096, + "learning_rate": 8.032934487297169e-06, + "loss": 0.7958, + "step": 10701 + }, + { + "epoch": 0.5890252628102812, + "grad_norm": 0.6658591032028198, + "learning_rate": 8.032589862487472e-06, + "loss": 0.717, + "step": 10702 + }, + { + "epoch": 0.5890803016126369, + "grad_norm": 1.4167139530181885, + "learning_rate": 8.03224521488564e-06, + "loss": 0.8599, + "step": 10703 + }, + { + "epoch": 0.5891353404149926, + "grad_norm": 0.6723609566688538, + "learning_rate": 8.031900544494266e-06, + "loss": 0.8167, + "step": 10704 + }, + { + "epoch": 0.5891903792173482, + "grad_norm": 0.6420501470565796, + "learning_rate": 8.03155585131594e-06, + "loss": 0.692, + "step": 10705 + }, + { + "epoch": 0.5892454180197039, + "grad_norm": 0.6973454356193542, + "learning_rate": 8.031211135353251e-06, + "loss": 0.7709, + "step": 10706 + }, + { + "epoch": 0.5893004568220596, + "grad_norm": 0.7752252221107483, + "learning_rate": 8.03086639660879e-06, + "loss": 0.7795, + "step": 10707 + }, + { + "epoch": 0.5893554956244152, + "grad_norm": 0.8193135857582092, + "learning_rate": 8.030521635085149e-06, + "loss": 0.812, + "step": 10708 + }, + { + "epoch": 0.5894105344267708, + "grad_norm": 0.7976878881454468, + "learning_rate": 8.03017685078492e-06, + "loss": 0.8039, + "step": 10709 + }, + { + "epoch": 0.5894655732291265, + "grad_norm": 0.7545839548110962, + "learning_rate": 8.02983204371069e-06, + "loss": 0.8238, + "step": 10710 + }, + { + "epoch": 0.5895206120314822, + "grad_norm": 0.6544732451438904, + "learning_rate": 8.029487213865054e-06, + "loss": 0.7471, + "step": 10711 + }, + { + "epoch": 0.5895756508338379, + "grad_norm": 0.7054508924484253, + "learning_rate": 8.029142361250603e-06, + "loss": 0.8283, + "step": 10712 + }, + { + "epoch": 0.5896306896361935, + "grad_norm": 0.7425236105918884, + "learning_rate": 8.02879748586993e-06, + "loss": 0.8031, + "step": 10713 + }, + { + "epoch": 0.5896857284385492, + "grad_norm": 0.8390052318572998, + "learning_rate": 8.028452587725626e-06, + "loss": 0.7218, + "step": 10714 + }, + { + "epoch": 0.5897407672409049, + "grad_norm": 0.8116903901100159, + "learning_rate": 8.028107666820282e-06, + "loss": 0.8057, + "step": 10715 + }, + { + "epoch": 0.5897958060432605, + "grad_norm": 0.602308452129364, + "learning_rate": 8.027762723156492e-06, + "loss": 0.6428, + "step": 10716 + }, + { + "epoch": 0.5898508448456161, + "grad_norm": 0.7480159401893616, + "learning_rate": 8.027417756736848e-06, + "loss": 0.7566, + "step": 10717 + }, + { + "epoch": 0.5899058836479718, + "grad_norm": 0.6823177933692932, + "learning_rate": 8.027072767563943e-06, + "loss": 0.8337, + "step": 10718 + }, + { + "epoch": 0.5899609224503275, + "grad_norm": 0.6841796040534973, + "learning_rate": 8.026727755640367e-06, + "loss": 0.751, + "step": 10719 + }, + { + "epoch": 0.5900159612526832, + "grad_norm": 0.7257139086723328, + "learning_rate": 8.026382720968718e-06, + "loss": 0.7373, + "step": 10720 + }, + { + "epoch": 0.5900710000550388, + "grad_norm": 0.6318400502204895, + "learning_rate": 8.026037663551584e-06, + "loss": 0.7205, + "step": 10721 + }, + { + "epoch": 0.5901260388573945, + "grad_norm": 0.6612908840179443, + "learning_rate": 8.025692583391564e-06, + "loss": 0.7613, + "step": 10722 + }, + { + "epoch": 0.5901810776597501, + "grad_norm": 0.7555351853370667, + "learning_rate": 8.025347480491246e-06, + "loss": 0.718, + "step": 10723 + }, + { + "epoch": 0.5902361164621058, + "grad_norm": 0.6944366097450256, + "learning_rate": 8.025002354853227e-06, + "loss": 0.7775, + "step": 10724 + }, + { + "epoch": 0.5902911552644614, + "grad_norm": 0.6968230605125427, + "learning_rate": 8.0246572064801e-06, + "loss": 0.7316, + "step": 10725 + }, + { + "epoch": 0.5903461940668171, + "grad_norm": 0.7083567380905151, + "learning_rate": 8.024312035374459e-06, + "loss": 0.7844, + "step": 10726 + }, + { + "epoch": 0.5904012328691728, + "grad_norm": 0.7183080315589905, + "learning_rate": 8.0239668415389e-06, + "loss": 0.8308, + "step": 10727 + }, + { + "epoch": 0.5904562716715285, + "grad_norm": 0.8350495100021362, + "learning_rate": 8.023621624976014e-06, + "loss": 0.9077, + "step": 10728 + }, + { + "epoch": 0.590511310473884, + "grad_norm": 0.6876987218856812, + "learning_rate": 8.023276385688396e-06, + "loss": 0.7483, + "step": 10729 + }, + { + "epoch": 0.5905663492762397, + "grad_norm": 0.8617128133773804, + "learning_rate": 8.022931123678646e-06, + "loss": 0.7058, + "step": 10730 + }, + { + "epoch": 0.5906213880785954, + "grad_norm": 0.6921959519386292, + "learning_rate": 8.02258583894935e-06, + "loss": 0.7542, + "step": 10731 + }, + { + "epoch": 0.5906764268809511, + "grad_norm": 0.7394077181816101, + "learning_rate": 8.02224053150311e-06, + "loss": 0.7761, + "step": 10732 + }, + { + "epoch": 0.5907314656833067, + "grad_norm": 0.6672187447547913, + "learning_rate": 8.02189520134252e-06, + "loss": 0.6904, + "step": 10733 + }, + { + "epoch": 0.5907865044856624, + "grad_norm": 0.7498076558113098, + "learning_rate": 8.021549848470174e-06, + "loss": 0.7994, + "step": 10734 + }, + { + "epoch": 0.5908415432880181, + "grad_norm": 0.699832558631897, + "learning_rate": 8.021204472888669e-06, + "loss": 0.7413, + "step": 10735 + }, + { + "epoch": 0.5908965820903738, + "grad_norm": 0.7628722190856934, + "learning_rate": 8.020859074600598e-06, + "loss": 0.8202, + "step": 10736 + }, + { + "epoch": 0.5909516208927293, + "grad_norm": 0.8023744225502014, + "learning_rate": 8.020513653608558e-06, + "loss": 0.8225, + "step": 10737 + }, + { + "epoch": 0.591006659695085, + "grad_norm": 0.7283689379692078, + "learning_rate": 8.02016820991515e-06, + "loss": 0.6706, + "step": 10738 + }, + { + "epoch": 0.5910616984974407, + "grad_norm": 0.7199996113777161, + "learning_rate": 8.019822743522962e-06, + "loss": 0.8258, + "step": 10739 + }, + { + "epoch": 0.5911167372997963, + "grad_norm": 0.623249888420105, + "learning_rate": 8.019477254434598e-06, + "loss": 0.6188, + "step": 10740 + }, + { + "epoch": 0.591171776102152, + "grad_norm": 0.7331949472427368, + "learning_rate": 8.01913174265265e-06, + "loss": 0.8013, + "step": 10741 + }, + { + "epoch": 0.5912268149045077, + "grad_norm": 0.7003010511398315, + "learning_rate": 8.018786208179716e-06, + "loss": 0.8305, + "step": 10742 + }, + { + "epoch": 0.5912818537068634, + "grad_norm": 0.6879638433456421, + "learning_rate": 8.01844065101839e-06, + "loss": 0.7622, + "step": 10743 + }, + { + "epoch": 0.591336892509219, + "grad_norm": 0.6597324013710022, + "learning_rate": 8.018095071171276e-06, + "loss": 0.7362, + "step": 10744 + }, + { + "epoch": 0.5913919313115746, + "grad_norm": 0.664905846118927, + "learning_rate": 8.017749468640967e-06, + "loss": 0.7629, + "step": 10745 + }, + { + "epoch": 0.5914469701139303, + "grad_norm": 0.7358053922653198, + "learning_rate": 8.017403843430059e-06, + "loss": 0.7798, + "step": 10746 + }, + { + "epoch": 0.591502008916286, + "grad_norm": 0.699603259563446, + "learning_rate": 8.017058195541152e-06, + "loss": 0.6249, + "step": 10747 + }, + { + "epoch": 0.5915570477186416, + "grad_norm": 0.6736140847206116, + "learning_rate": 8.016712524976843e-06, + "loss": 0.6904, + "step": 10748 + }, + { + "epoch": 0.5916120865209973, + "grad_norm": 0.6803401112556458, + "learning_rate": 8.016366831739732e-06, + "loss": 0.6868, + "step": 10749 + }, + { + "epoch": 0.591667125323353, + "grad_norm": 0.7152959704399109, + "learning_rate": 8.016021115832413e-06, + "loss": 0.7747, + "step": 10750 + }, + { + "epoch": 0.5917221641257087, + "grad_norm": 0.6469255685806274, + "learning_rate": 8.015675377257489e-06, + "loss": 0.7309, + "step": 10751 + }, + { + "epoch": 0.5917772029280642, + "grad_norm": 0.7902734875679016, + "learning_rate": 8.015329616017554e-06, + "loss": 0.7575, + "step": 10752 + }, + { + "epoch": 0.5918322417304199, + "grad_norm": 0.7447189688682556, + "learning_rate": 8.014983832115208e-06, + "loss": 0.7759, + "step": 10753 + }, + { + "epoch": 0.5918872805327756, + "grad_norm": 0.6135374903678894, + "learning_rate": 8.014638025553053e-06, + "loss": 0.6681, + "step": 10754 + }, + { + "epoch": 0.5919423193351313, + "grad_norm": 0.8614835739135742, + "learning_rate": 8.014292196333684e-06, + "loss": 0.7203, + "step": 10755 + }, + { + "epoch": 0.5919973581374869, + "grad_norm": 0.7649008631706238, + "learning_rate": 8.013946344459703e-06, + "loss": 0.7966, + "step": 10756 + }, + { + "epoch": 0.5920523969398426, + "grad_norm": 1.0862764120101929, + "learning_rate": 8.013600469933707e-06, + "loss": 0.866, + "step": 10757 + }, + { + "epoch": 0.5921074357421983, + "grad_norm": 0.7304185628890991, + "learning_rate": 8.013254572758296e-06, + "loss": 0.7599, + "step": 10758 + }, + { + "epoch": 0.592162474544554, + "grad_norm": 0.6329634785652161, + "learning_rate": 8.012908652936072e-06, + "loss": 0.6855, + "step": 10759 + }, + { + "epoch": 0.5922175133469095, + "grad_norm": 0.6692202687263489, + "learning_rate": 8.012562710469631e-06, + "loss": 0.817, + "step": 10760 + }, + { + "epoch": 0.5922725521492652, + "grad_norm": 0.6577631235122681, + "learning_rate": 8.012216745361577e-06, + "loss": 0.7813, + "step": 10761 + }, + { + "epoch": 0.5923275909516209, + "grad_norm": 0.6877861022949219, + "learning_rate": 8.011870757614506e-06, + "loss": 0.7142, + "step": 10762 + }, + { + "epoch": 0.5923826297539766, + "grad_norm": 0.7132022380828857, + "learning_rate": 8.011524747231023e-06, + "loss": 0.747, + "step": 10763 + }, + { + "epoch": 0.5924376685563322, + "grad_norm": 0.7841360569000244, + "learning_rate": 8.011178714213726e-06, + "loss": 0.7511, + "step": 10764 + }, + { + "epoch": 0.5924927073586879, + "grad_norm": 0.8572794198989868, + "learning_rate": 8.010832658565215e-06, + "loss": 0.8704, + "step": 10765 + }, + { + "epoch": 0.5925477461610436, + "grad_norm": 0.6825506687164307, + "learning_rate": 8.010486580288092e-06, + "loss": 0.7472, + "step": 10766 + }, + { + "epoch": 0.5926027849633992, + "grad_norm": 0.7484591603279114, + "learning_rate": 8.010140479384957e-06, + "loss": 0.7679, + "step": 10767 + }, + { + "epoch": 0.5926578237657548, + "grad_norm": 0.712602436542511, + "learning_rate": 8.009794355858412e-06, + "loss": 0.7706, + "step": 10768 + }, + { + "epoch": 0.5927128625681105, + "grad_norm": 0.8911493420600891, + "learning_rate": 8.00944820971106e-06, + "loss": 0.8396, + "step": 10769 + }, + { + "epoch": 0.5927679013704662, + "grad_norm": 0.7300251126289368, + "learning_rate": 8.009102040945498e-06, + "loss": 0.7611, + "step": 10770 + }, + { + "epoch": 0.5928229401728219, + "grad_norm": 0.727343738079071, + "learning_rate": 8.008755849564333e-06, + "loss": 0.6785, + "step": 10771 + }, + { + "epoch": 0.5928779789751775, + "grad_norm": 0.8323808908462524, + "learning_rate": 8.008409635570163e-06, + "loss": 0.7429, + "step": 10772 + }, + { + "epoch": 0.5929330177775332, + "grad_norm": 0.6651942133903503, + "learning_rate": 8.00806339896559e-06, + "loss": 0.7683, + "step": 10773 + }, + { + "epoch": 0.5929880565798888, + "grad_norm": 0.7164554595947266, + "learning_rate": 8.007717139753222e-06, + "loss": 0.7742, + "step": 10774 + }, + { + "epoch": 0.5930430953822445, + "grad_norm": 0.6906408667564392, + "learning_rate": 8.007370857935654e-06, + "loss": 0.7322, + "step": 10775 + }, + { + "epoch": 0.5930981341846001, + "grad_norm": 0.6384999752044678, + "learning_rate": 8.007024553515493e-06, + "loss": 0.7011, + "step": 10776 + }, + { + "epoch": 0.5931531729869558, + "grad_norm": 0.6997355222702026, + "learning_rate": 8.006678226495338e-06, + "loss": 0.7303, + "step": 10777 + }, + { + "epoch": 0.5932082117893115, + "grad_norm": 0.6730707287788391, + "learning_rate": 8.006331876877797e-06, + "loss": 0.7461, + "step": 10778 + }, + { + "epoch": 0.5932632505916672, + "grad_norm": 0.7529115080833435, + "learning_rate": 8.00598550466547e-06, + "loss": 0.7487, + "step": 10779 + }, + { + "epoch": 0.5933182893940228, + "grad_norm": 0.7186329960823059, + "learning_rate": 8.00563910986096e-06, + "loss": 0.8025, + "step": 10780 + }, + { + "epoch": 0.5933733281963784, + "grad_norm": 0.7523752450942993, + "learning_rate": 8.005292692466869e-06, + "loss": 0.8291, + "step": 10781 + }, + { + "epoch": 0.5934283669987341, + "grad_norm": 1.182645559310913, + "learning_rate": 8.004946252485806e-06, + "loss": 0.8037, + "step": 10782 + }, + { + "epoch": 0.5934834058010897, + "grad_norm": 0.736570417881012, + "learning_rate": 8.004599789920369e-06, + "loss": 0.8259, + "step": 10783 + }, + { + "epoch": 0.5935384446034454, + "grad_norm": 0.757665753364563, + "learning_rate": 8.004253304773165e-06, + "loss": 0.7773, + "step": 10784 + }, + { + "epoch": 0.5935934834058011, + "grad_norm": 0.6988566517829895, + "learning_rate": 8.003906797046798e-06, + "loss": 0.7895, + "step": 10785 + }, + { + "epoch": 0.5936485222081568, + "grad_norm": 0.6921454071998596, + "learning_rate": 8.00356026674387e-06, + "loss": 0.8068, + "step": 10786 + }, + { + "epoch": 0.5937035610105124, + "grad_norm": 0.7053877115249634, + "learning_rate": 8.003213713866988e-06, + "loss": 0.7632, + "step": 10787 + }, + { + "epoch": 0.593758599812868, + "grad_norm": 0.8193650245666504, + "learning_rate": 8.002867138418757e-06, + "loss": 0.759, + "step": 10788 + }, + { + "epoch": 0.5938136386152237, + "grad_norm": 0.6089804768562317, + "learning_rate": 8.002520540401779e-06, + "loss": 0.7117, + "step": 10789 + }, + { + "epoch": 0.5938686774175794, + "grad_norm": 0.6869456768035889, + "learning_rate": 8.002173919818662e-06, + "loss": 0.7724, + "step": 10790 + }, + { + "epoch": 0.593923716219935, + "grad_norm": 0.7279118895530701, + "learning_rate": 8.001827276672007e-06, + "loss": 0.7578, + "step": 10791 + }, + { + "epoch": 0.5939787550222907, + "grad_norm": 0.6960133910179138, + "learning_rate": 8.00148061096442e-06, + "loss": 0.7887, + "step": 10792 + }, + { + "epoch": 0.5940337938246464, + "grad_norm": 0.6774740815162659, + "learning_rate": 8.001133922698511e-06, + "loss": 0.7146, + "step": 10793 + }, + { + "epoch": 0.5940888326270021, + "grad_norm": 0.6696349382400513, + "learning_rate": 8.000787211876883e-06, + "loss": 0.7829, + "step": 10794 + }, + { + "epoch": 0.5941438714293577, + "grad_norm": 1.5037024021148682, + "learning_rate": 8.000440478502142e-06, + "loss": 0.8198, + "step": 10795 + }, + { + "epoch": 0.5941989102317133, + "grad_norm": 0.7373353838920593, + "learning_rate": 8.000093722576893e-06, + "loss": 0.7864, + "step": 10796 + }, + { + "epoch": 0.594253949034069, + "grad_norm": 0.8120700120925903, + "learning_rate": 7.999746944103743e-06, + "loss": 0.7918, + "step": 10797 + }, + { + "epoch": 0.5943089878364247, + "grad_norm": 0.7669811844825745, + "learning_rate": 7.999400143085296e-06, + "loss": 0.751, + "step": 10798 + }, + { + "epoch": 0.5943640266387803, + "grad_norm": 0.8090860843658447, + "learning_rate": 7.999053319524163e-06, + "loss": 0.8387, + "step": 10799 + }, + { + "epoch": 0.594419065441136, + "grad_norm": 0.6994315385818481, + "learning_rate": 7.998706473422945e-06, + "loss": 0.7084, + "step": 10800 + }, + { + "epoch": 0.5944741042434917, + "grad_norm": 0.7913107872009277, + "learning_rate": 7.998359604784254e-06, + "loss": 0.7454, + "step": 10801 + }, + { + "epoch": 0.5945291430458474, + "grad_norm": 0.6831398010253906, + "learning_rate": 7.998012713610696e-06, + "loss": 0.7422, + "step": 10802 + }, + { + "epoch": 0.5945841818482029, + "grad_norm": 0.7324068546295166, + "learning_rate": 7.997665799904875e-06, + "loss": 0.7622, + "step": 10803 + }, + { + "epoch": 0.5946392206505586, + "grad_norm": 0.8192811012268066, + "learning_rate": 7.997318863669399e-06, + "loss": 0.7783, + "step": 10804 + }, + { + "epoch": 0.5946942594529143, + "grad_norm": 0.8008341789245605, + "learning_rate": 7.996971904906879e-06, + "loss": 0.7673, + "step": 10805 + }, + { + "epoch": 0.59474929825527, + "grad_norm": 0.6899568438529968, + "learning_rate": 7.99662492361992e-06, + "loss": 0.7477, + "step": 10806 + }, + { + "epoch": 0.5948043370576256, + "grad_norm": 0.7322555780410767, + "learning_rate": 7.996277919811132e-06, + "loss": 0.7673, + "step": 10807 + }, + { + "epoch": 0.5948593758599813, + "grad_norm": 1.008300542831421, + "learning_rate": 7.995930893483117e-06, + "loss": 0.7556, + "step": 10808 + }, + { + "epoch": 0.594914414662337, + "grad_norm": 0.7211925387382507, + "learning_rate": 7.99558384463849e-06, + "loss": 0.761, + "step": 10809 + }, + { + "epoch": 0.5949694534646927, + "grad_norm": 0.7143383622169495, + "learning_rate": 7.995236773279855e-06, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.5950244922670482, + "grad_norm": 0.7682802677154541, + "learning_rate": 7.994889679409825e-06, + "loss": 0.8538, + "step": 10811 + }, + { + "epoch": 0.5950795310694039, + "grad_norm": 0.6304698586463928, + "learning_rate": 7.994542563031004e-06, + "loss": 0.7343, + "step": 10812 + }, + { + "epoch": 0.5951345698717596, + "grad_norm": 0.6704440116882324, + "learning_rate": 7.994195424146002e-06, + "loss": 0.6921, + "step": 10813 + }, + { + "epoch": 0.5951896086741153, + "grad_norm": 0.8626209497451782, + "learning_rate": 7.99384826275743e-06, + "loss": 0.7049, + "step": 10814 + }, + { + "epoch": 0.5952446474764709, + "grad_norm": 0.810922384262085, + "learning_rate": 7.993501078867895e-06, + "loss": 0.793, + "step": 10815 + }, + { + "epoch": 0.5952996862788266, + "grad_norm": 0.8495855927467346, + "learning_rate": 7.993153872480009e-06, + "loss": 0.8078, + "step": 10816 + }, + { + "epoch": 0.5953547250811823, + "grad_norm": 0.7430331707000732, + "learning_rate": 7.992806643596378e-06, + "loss": 0.7957, + "step": 10817 + }, + { + "epoch": 0.595409763883538, + "grad_norm": 0.7188051342964172, + "learning_rate": 7.992459392219614e-06, + "loss": 0.725, + "step": 10818 + }, + { + "epoch": 0.5954648026858935, + "grad_norm": 0.7046926021575928, + "learning_rate": 7.992112118352326e-06, + "loss": 0.7438, + "step": 10819 + }, + { + "epoch": 0.5955198414882492, + "grad_norm": 0.7982804775238037, + "learning_rate": 7.991764821997123e-06, + "loss": 0.7046, + "step": 10820 + }, + { + "epoch": 0.5955748802906049, + "grad_norm": 0.6392245292663574, + "learning_rate": 7.991417503156618e-06, + "loss": 0.7413, + "step": 10821 + }, + { + "epoch": 0.5956299190929606, + "grad_norm": 0.7518960237503052, + "learning_rate": 7.99107016183342e-06, + "loss": 0.7661, + "step": 10822 + }, + { + "epoch": 0.5956849578953162, + "grad_norm": 0.7413721680641174, + "learning_rate": 7.99072279803014e-06, + "loss": 0.6538, + "step": 10823 + }, + { + "epoch": 0.5957399966976719, + "grad_norm": 0.7729454636573792, + "learning_rate": 7.990375411749384e-06, + "loss": 0.8056, + "step": 10824 + }, + { + "epoch": 0.5957950355000275, + "grad_norm": 0.8059296607971191, + "learning_rate": 7.99002800299377e-06, + "loss": 0.8699, + "step": 10825 + }, + { + "epoch": 0.5958500743023831, + "grad_norm": 0.5947105288505554, + "learning_rate": 7.989680571765907e-06, + "loss": 0.6481, + "step": 10826 + }, + { + "epoch": 0.5959051131047388, + "grad_norm": 0.7303743362426758, + "learning_rate": 7.989333118068404e-06, + "loss": 0.7401, + "step": 10827 + }, + { + "epoch": 0.5959601519070945, + "grad_norm": 0.7121400237083435, + "learning_rate": 7.988985641903873e-06, + "loss": 0.78, + "step": 10828 + }, + { + "epoch": 0.5960151907094502, + "grad_norm": 0.6921802163124084, + "learning_rate": 7.988638143274926e-06, + "loss": 0.7234, + "step": 10829 + }, + { + "epoch": 0.5960702295118058, + "grad_norm": 0.6715331673622131, + "learning_rate": 7.988290622184174e-06, + "loss": 0.7606, + "step": 10830 + }, + { + "epoch": 0.5961252683141615, + "grad_norm": 0.6315215229988098, + "learning_rate": 7.98794307863423e-06, + "loss": 0.6902, + "step": 10831 + }, + { + "epoch": 0.5961803071165171, + "grad_norm": 0.6884782314300537, + "learning_rate": 7.987595512627707e-06, + "loss": 0.7808, + "step": 10832 + }, + { + "epoch": 0.5962353459188728, + "grad_norm": 0.7050700783729553, + "learning_rate": 7.987247924167215e-06, + "loss": 0.7248, + "step": 10833 + }, + { + "epoch": 0.5962903847212284, + "grad_norm": 0.7232446074485779, + "learning_rate": 7.986900313255367e-06, + "loss": 0.8686, + "step": 10834 + }, + { + "epoch": 0.5963454235235841, + "grad_norm": 0.693631649017334, + "learning_rate": 7.986552679894778e-06, + "loss": 0.7567, + "step": 10835 + }, + { + "epoch": 0.5964004623259398, + "grad_norm": 0.6462356448173523, + "learning_rate": 7.986205024088054e-06, + "loss": 0.7091, + "step": 10836 + }, + { + "epoch": 0.5964555011282955, + "grad_norm": 0.7465559840202332, + "learning_rate": 7.985857345837814e-06, + "loss": 0.8965, + "step": 10837 + }, + { + "epoch": 0.5965105399306511, + "grad_norm": 0.6803271770477295, + "learning_rate": 7.985509645146672e-06, + "loss": 0.7602, + "step": 10838 + }, + { + "epoch": 0.5965655787330068, + "grad_norm": 1.1414798498153687, + "learning_rate": 7.985161922017238e-06, + "loss": 0.7806, + "step": 10839 + }, + { + "epoch": 0.5966206175353624, + "grad_norm": 0.6583230495452881, + "learning_rate": 7.984814176452123e-06, + "loss": 0.6727, + "step": 10840 + }, + { + "epoch": 0.5966756563377181, + "grad_norm": 0.6582550406455994, + "learning_rate": 7.984466408453946e-06, + "loss": 0.6794, + "step": 10841 + }, + { + "epoch": 0.5967306951400737, + "grad_norm": 0.8680793642997742, + "learning_rate": 7.984118618025318e-06, + "loss": 0.7999, + "step": 10842 + }, + { + "epoch": 0.5967857339424294, + "grad_norm": 0.772777795791626, + "learning_rate": 7.983770805168853e-06, + "loss": 0.6278, + "step": 10843 + }, + { + "epoch": 0.5968407727447851, + "grad_norm": 0.8099700808525085, + "learning_rate": 7.983422969887167e-06, + "loss": 0.7631, + "step": 10844 + }, + { + "epoch": 0.5968958115471408, + "grad_norm": 0.660271406173706, + "learning_rate": 7.983075112182871e-06, + "loss": 0.7557, + "step": 10845 + }, + { + "epoch": 0.5969508503494964, + "grad_norm": 0.7205530405044556, + "learning_rate": 7.982727232058582e-06, + "loss": 0.8258, + "step": 10846 + }, + { + "epoch": 0.597005889151852, + "grad_norm": 0.7925810813903809, + "learning_rate": 7.982379329516912e-06, + "loss": 0.7534, + "step": 10847 + }, + { + "epoch": 0.5970609279542077, + "grad_norm": 0.7255545854568481, + "learning_rate": 7.982031404560477e-06, + "loss": 0.8394, + "step": 10848 + }, + { + "epoch": 0.5971159667565634, + "grad_norm": 0.835394561290741, + "learning_rate": 7.981683457191893e-06, + "loss": 0.8384, + "step": 10849 + }, + { + "epoch": 0.597171005558919, + "grad_norm": 0.6781747937202454, + "learning_rate": 7.981335487413775e-06, + "loss": 0.8173, + "step": 10850 + }, + { + "epoch": 0.5972260443612747, + "grad_norm": 0.8602943420410156, + "learning_rate": 7.980987495228737e-06, + "loss": 0.8257, + "step": 10851 + }, + { + "epoch": 0.5972810831636304, + "grad_norm": 0.7157264947891235, + "learning_rate": 7.980639480639394e-06, + "loss": 0.7267, + "step": 10852 + }, + { + "epoch": 0.5973361219659861, + "grad_norm": 0.7695063352584839, + "learning_rate": 7.980291443648364e-06, + "loss": 0.7794, + "step": 10853 + }, + { + "epoch": 0.5973911607683416, + "grad_norm": 0.723971426486969, + "learning_rate": 7.979943384258262e-06, + "loss": 0.7761, + "step": 10854 + }, + { + "epoch": 0.5974461995706973, + "grad_norm": 0.691722571849823, + "learning_rate": 7.979595302471702e-06, + "loss": 0.7276, + "step": 10855 + }, + { + "epoch": 0.597501238373053, + "grad_norm": 0.7019701600074768, + "learning_rate": 7.9792471982913e-06, + "loss": 0.7965, + "step": 10856 + }, + { + "epoch": 0.5975562771754087, + "grad_norm": 0.6626996994018555, + "learning_rate": 7.978899071719675e-06, + "loss": 0.7124, + "step": 10857 + }, + { + "epoch": 0.5976113159777643, + "grad_norm": 0.6871625781059265, + "learning_rate": 7.978550922759443e-06, + "loss": 0.7742, + "step": 10858 + }, + { + "epoch": 0.59766635478012, + "grad_norm": 0.7153579592704773, + "learning_rate": 7.978202751413217e-06, + "loss": 0.7852, + "step": 10859 + }, + { + "epoch": 0.5977213935824757, + "grad_norm": 0.6891841292381287, + "learning_rate": 7.977854557683619e-06, + "loss": 0.7873, + "step": 10860 + }, + { + "epoch": 0.5977764323848314, + "grad_norm": 0.6864004731178284, + "learning_rate": 7.977506341573262e-06, + "loss": 0.7223, + "step": 10861 + }, + { + "epoch": 0.5978314711871869, + "grad_norm": 0.7163059115409851, + "learning_rate": 7.977158103084764e-06, + "loss": 0.679, + "step": 10862 + }, + { + "epoch": 0.5978865099895426, + "grad_norm": 0.6727336049079895, + "learning_rate": 7.976809842220742e-06, + "loss": 0.7148, + "step": 10863 + }, + { + "epoch": 0.5979415487918983, + "grad_norm": 0.672960638999939, + "learning_rate": 7.976461558983814e-06, + "loss": 0.7263, + "step": 10864 + }, + { + "epoch": 0.597996587594254, + "grad_norm": 0.9124444127082825, + "learning_rate": 7.976113253376601e-06, + "loss": 0.6876, + "step": 10865 + }, + { + "epoch": 0.5980516263966096, + "grad_norm": 0.6415041089057922, + "learning_rate": 7.975764925401715e-06, + "loss": 0.6655, + "step": 10866 + }, + { + "epoch": 0.5981066651989653, + "grad_norm": 0.7342595458030701, + "learning_rate": 7.975416575061776e-06, + "loss": 0.7753, + "step": 10867 + }, + { + "epoch": 0.598161704001321, + "grad_norm": 0.7161775231361389, + "learning_rate": 7.975068202359402e-06, + "loss": 0.7525, + "step": 10868 + }, + { + "epoch": 0.5982167428036765, + "grad_norm": 0.7087578773498535, + "learning_rate": 7.974719807297212e-06, + "loss": 0.7196, + "step": 10869 + }, + { + "epoch": 0.5982717816060322, + "grad_norm": 0.6472536325454712, + "learning_rate": 7.974371389877826e-06, + "loss": 0.6837, + "step": 10870 + }, + { + "epoch": 0.5983268204083879, + "grad_norm": 0.6625581383705139, + "learning_rate": 7.97402295010386e-06, + "loss": 0.6379, + "step": 10871 + }, + { + "epoch": 0.5983818592107436, + "grad_norm": 0.7621071934700012, + "learning_rate": 7.973674487977934e-06, + "loss": 0.8291, + "step": 10872 + }, + { + "epoch": 0.5984368980130992, + "grad_norm": 0.693394660949707, + "learning_rate": 7.973326003502666e-06, + "loss": 0.7677, + "step": 10873 + }, + { + "epoch": 0.5984919368154549, + "grad_norm": 0.6393985152244568, + "learning_rate": 7.972977496680674e-06, + "loss": 0.7058, + "step": 10874 + }, + { + "epoch": 0.5985469756178106, + "grad_norm": 0.7101462483406067, + "learning_rate": 7.972628967514582e-06, + "loss": 0.7396, + "step": 10875 + }, + { + "epoch": 0.5986020144201663, + "grad_norm": 0.8131522536277771, + "learning_rate": 7.972280416007003e-06, + "loss": 0.8461, + "step": 10876 + }, + { + "epoch": 0.5986570532225218, + "grad_norm": 0.7186655402183533, + "learning_rate": 7.971931842160564e-06, + "loss": 0.7721, + "step": 10877 + }, + { + "epoch": 0.5987120920248775, + "grad_norm": 0.7520855069160461, + "learning_rate": 7.971583245977877e-06, + "loss": 0.7733, + "step": 10878 + }, + { + "epoch": 0.5987671308272332, + "grad_norm": 0.6548848748207092, + "learning_rate": 7.971234627461569e-06, + "loss": 0.6555, + "step": 10879 + }, + { + "epoch": 0.5988221696295889, + "grad_norm": 0.7341775894165039, + "learning_rate": 7.970885986614254e-06, + "loss": 0.8292, + "step": 10880 + }, + { + "epoch": 0.5988772084319445, + "grad_norm": 0.7126352190971375, + "learning_rate": 7.970537323438556e-06, + "loss": 0.7704, + "step": 10881 + }, + { + "epoch": 0.5989322472343002, + "grad_norm": 0.7291527390480042, + "learning_rate": 7.970188637937097e-06, + "loss": 0.8175, + "step": 10882 + }, + { + "epoch": 0.5989872860366559, + "grad_norm": 0.682767927646637, + "learning_rate": 7.969839930112493e-06, + "loss": 0.8187, + "step": 10883 + }, + { + "epoch": 0.5990423248390115, + "grad_norm": 0.7820014953613281, + "learning_rate": 7.969491199967368e-06, + "loss": 0.7949, + "step": 10884 + }, + { + "epoch": 0.5990973636413671, + "grad_norm": 0.7257336974143982, + "learning_rate": 7.969142447504341e-06, + "loss": 0.8461, + "step": 10885 + }, + { + "epoch": 0.5991524024437228, + "grad_norm": 0.6813532114028931, + "learning_rate": 7.968793672726033e-06, + "loss": 0.7889, + "step": 10886 + }, + { + "epoch": 0.5992074412460785, + "grad_norm": 0.6868439316749573, + "learning_rate": 7.96844487563507e-06, + "loss": 0.7268, + "step": 10887 + }, + { + "epoch": 0.5992624800484342, + "grad_norm": 0.6547278761863708, + "learning_rate": 7.968096056234067e-06, + "loss": 0.7026, + "step": 10888 + }, + { + "epoch": 0.5993175188507898, + "grad_norm": 0.6704558730125427, + "learning_rate": 7.96774721452565e-06, + "loss": 0.6994, + "step": 10889 + }, + { + "epoch": 0.5993725576531455, + "grad_norm": 0.7134065628051758, + "learning_rate": 7.967398350512439e-06, + "loss": 0.7728, + "step": 10890 + }, + { + "epoch": 0.5994275964555011, + "grad_norm": 0.751265823841095, + "learning_rate": 7.967049464197056e-06, + "loss": 0.8421, + "step": 10891 + }, + { + "epoch": 0.5994826352578568, + "grad_norm": 0.8558571934700012, + "learning_rate": 7.966700555582125e-06, + "loss": 0.9144, + "step": 10892 + }, + { + "epoch": 0.5995376740602124, + "grad_norm": 0.8338084816932678, + "learning_rate": 7.966351624670263e-06, + "loss": 0.7502, + "step": 10893 + }, + { + "epoch": 0.5995927128625681, + "grad_norm": 0.7017131447792053, + "learning_rate": 7.9660026714641e-06, + "loss": 0.7778, + "step": 10894 + }, + { + "epoch": 0.5996477516649238, + "grad_norm": 0.7176111340522766, + "learning_rate": 7.965653695966253e-06, + "loss": 0.8478, + "step": 10895 + }, + { + "epoch": 0.5997027904672795, + "grad_norm": 0.7026060819625854, + "learning_rate": 7.965304698179349e-06, + "loss": 0.7111, + "step": 10896 + }, + { + "epoch": 0.5997578292696351, + "grad_norm": 0.6383810639381409, + "learning_rate": 7.964955678106005e-06, + "loss": 0.6429, + "step": 10897 + }, + { + "epoch": 0.5998128680719907, + "grad_norm": 0.8024059534072876, + "learning_rate": 7.96460663574885e-06, + "loss": 0.7308, + "step": 10898 + }, + { + "epoch": 0.5998679068743464, + "grad_norm": 0.7378466725349426, + "learning_rate": 7.964257571110504e-06, + "loss": 0.7593, + "step": 10899 + }, + { + "epoch": 0.5999229456767021, + "grad_norm": 0.7089043855667114, + "learning_rate": 7.963908484193593e-06, + "loss": 0.6862, + "step": 10900 + }, + { + "epoch": 0.5999779844790577, + "grad_norm": 0.765295684337616, + "learning_rate": 7.963559375000738e-06, + "loss": 0.6759, + "step": 10901 + }, + { + "epoch": 0.6000330232814134, + "grad_norm": 0.7040783166885376, + "learning_rate": 7.963210243534565e-06, + "loss": 0.7754, + "step": 10902 + }, + { + "epoch": 0.6000880620837691, + "grad_norm": 0.8593736886978149, + "learning_rate": 7.962861089797698e-06, + "loss": 0.8765, + "step": 10903 + }, + { + "epoch": 0.6001431008861248, + "grad_norm": 0.6613926291465759, + "learning_rate": 7.962511913792758e-06, + "loss": 0.6697, + "step": 10904 + }, + { + "epoch": 0.6001981396884803, + "grad_norm": 0.6369597911834717, + "learning_rate": 7.962162715522372e-06, + "loss": 0.7145, + "step": 10905 + }, + { + "epoch": 0.600253178490836, + "grad_norm": 1.1790162324905396, + "learning_rate": 7.961813494989164e-06, + "loss": 0.8067, + "step": 10906 + }, + { + "epoch": 0.6003082172931917, + "grad_norm": 0.7548268437385559, + "learning_rate": 7.961464252195759e-06, + "loss": 0.7936, + "step": 10907 + }, + { + "epoch": 0.6003632560955474, + "grad_norm": 0.6204384565353394, + "learning_rate": 7.961114987144781e-06, + "loss": 0.6374, + "step": 10908 + }, + { + "epoch": 0.600418294897903, + "grad_norm": 0.7149941921234131, + "learning_rate": 7.960765699838854e-06, + "loss": 0.8422, + "step": 10909 + }, + { + "epoch": 0.6004733337002587, + "grad_norm": 0.7040171027183533, + "learning_rate": 7.960416390280608e-06, + "loss": 0.8261, + "step": 10910 + }, + { + "epoch": 0.6005283725026144, + "grad_norm": 0.713591456413269, + "learning_rate": 7.960067058472663e-06, + "loss": 0.7908, + "step": 10911 + }, + { + "epoch": 0.60058341130497, + "grad_norm": 0.654086172580719, + "learning_rate": 7.959717704417645e-06, + "loss": 0.6971, + "step": 10912 + }, + { + "epoch": 0.6006384501073256, + "grad_norm": 0.7293223738670349, + "learning_rate": 7.959368328118183e-06, + "loss": 0.7032, + "step": 10913 + }, + { + "epoch": 0.6006934889096813, + "grad_norm": 0.705434262752533, + "learning_rate": 7.959018929576898e-06, + "loss": 0.7193, + "step": 10914 + }, + { + "epoch": 0.600748527712037, + "grad_norm": 0.7406907677650452, + "learning_rate": 7.958669508796422e-06, + "loss": 0.8464, + "step": 10915 + }, + { + "epoch": 0.6008035665143926, + "grad_norm": 0.6683858036994934, + "learning_rate": 7.958320065779377e-06, + "loss": 0.699, + "step": 10916 + }, + { + "epoch": 0.6008586053167483, + "grad_norm": 0.7380560636520386, + "learning_rate": 7.95797060052839e-06, + "loss": 0.7409, + "step": 10917 + }, + { + "epoch": 0.600913644119104, + "grad_norm": 0.7729377746582031, + "learning_rate": 7.957621113046088e-06, + "loss": 0.8838, + "step": 10918 + }, + { + "epoch": 0.6009686829214597, + "grad_norm": 0.6842743158340454, + "learning_rate": 7.957271603335097e-06, + "loss": 0.781, + "step": 10919 + }, + { + "epoch": 0.6010237217238152, + "grad_norm": 0.6864648461341858, + "learning_rate": 7.956922071398045e-06, + "loss": 0.6717, + "step": 10920 + }, + { + "epoch": 0.6010787605261709, + "grad_norm": 0.7718262672424316, + "learning_rate": 7.956572517237557e-06, + "loss": 0.8023, + "step": 10921 + }, + { + "epoch": 0.6011337993285266, + "grad_norm": 0.686338484287262, + "learning_rate": 7.956222940856261e-06, + "loss": 0.7139, + "step": 10922 + }, + { + "epoch": 0.6011888381308823, + "grad_norm": 0.7064465284347534, + "learning_rate": 7.955873342256789e-06, + "loss": 0.845, + "step": 10923 + }, + { + "epoch": 0.6012438769332379, + "grad_norm": 0.6847875714302063, + "learning_rate": 7.955523721441761e-06, + "loss": 0.7078, + "step": 10924 + }, + { + "epoch": 0.6012989157355936, + "grad_norm": 0.6879494786262512, + "learning_rate": 7.955174078413806e-06, + "loss": 0.7532, + "step": 10925 + }, + { + "epoch": 0.6013539545379493, + "grad_norm": 0.6569855213165283, + "learning_rate": 7.954824413175554e-06, + "loss": 0.7529, + "step": 10926 + }, + { + "epoch": 0.601408993340305, + "grad_norm": 0.6225974559783936, + "learning_rate": 7.954474725729635e-06, + "loss": 0.6595, + "step": 10927 + }, + { + "epoch": 0.6014640321426605, + "grad_norm": 0.7067761421203613, + "learning_rate": 7.954125016078675e-06, + "loss": 0.7851, + "step": 10928 + }, + { + "epoch": 0.6015190709450162, + "grad_norm": 0.683030903339386, + "learning_rate": 7.9537752842253e-06, + "loss": 0.7461, + "step": 10929 + }, + { + "epoch": 0.6015741097473719, + "grad_norm": 0.6411080956459045, + "learning_rate": 7.953425530172143e-06, + "loss": 0.6945, + "step": 10930 + }, + { + "epoch": 0.6016291485497276, + "grad_norm": 0.6254550814628601, + "learning_rate": 7.953075753921829e-06, + "loss": 0.7143, + "step": 10931 + }, + { + "epoch": 0.6016841873520832, + "grad_norm": 0.684100866317749, + "learning_rate": 7.952725955476987e-06, + "loss": 0.8137, + "step": 10932 + }, + { + "epoch": 0.6017392261544389, + "grad_norm": 0.6341036558151245, + "learning_rate": 7.95237613484025e-06, + "loss": 0.6692, + "step": 10933 + }, + { + "epoch": 0.6017942649567946, + "grad_norm": 0.7311153411865234, + "learning_rate": 7.952026292014242e-06, + "loss": 0.7091, + "step": 10934 + }, + { + "epoch": 0.6018493037591502, + "grad_norm": 0.7265943884849548, + "learning_rate": 7.951676427001596e-06, + "loss": 0.765, + "step": 10935 + }, + { + "epoch": 0.6019043425615058, + "grad_norm": 0.8777397274971008, + "learning_rate": 7.951326539804938e-06, + "loss": 0.7824, + "step": 10936 + }, + { + "epoch": 0.6019593813638615, + "grad_norm": 0.7241179347038269, + "learning_rate": 7.9509766304269e-06, + "loss": 0.7913, + "step": 10937 + }, + { + "epoch": 0.6020144201662172, + "grad_norm": 0.8090667128562927, + "learning_rate": 7.950626698870113e-06, + "loss": 0.8208, + "step": 10938 + }, + { + "epoch": 0.6020694589685729, + "grad_norm": 0.7376043796539307, + "learning_rate": 7.950276745137206e-06, + "loss": 0.7176, + "step": 10939 + }, + { + "epoch": 0.6021244977709285, + "grad_norm": 0.7149157524108887, + "learning_rate": 7.949926769230809e-06, + "loss": 0.7949, + "step": 10940 + }, + { + "epoch": 0.6021795365732842, + "grad_norm": 0.8721579909324646, + "learning_rate": 7.949576771153549e-06, + "loss": 0.8433, + "step": 10941 + }, + { + "epoch": 0.6022345753756398, + "grad_norm": 0.7946182489395142, + "learning_rate": 7.949226750908062e-06, + "loss": 0.7412, + "step": 10942 + }, + { + "epoch": 0.6022896141779955, + "grad_norm": 0.6661237478256226, + "learning_rate": 7.948876708496975e-06, + "loss": 0.725, + "step": 10943 + }, + { + "epoch": 0.6023446529803511, + "grad_norm": 0.8346213698387146, + "learning_rate": 7.948526643922922e-06, + "loss": 0.6817, + "step": 10944 + }, + { + "epoch": 0.6023996917827068, + "grad_norm": 0.7911655306816101, + "learning_rate": 7.94817655718853e-06, + "loss": 0.7398, + "step": 10945 + }, + { + "epoch": 0.6024547305850625, + "grad_norm": 0.6480078101158142, + "learning_rate": 7.947826448296432e-06, + "loss": 0.6822, + "step": 10946 + }, + { + "epoch": 0.6025097693874182, + "grad_norm": 0.6950085759162903, + "learning_rate": 7.94747631724926e-06, + "loss": 0.8073, + "step": 10947 + }, + { + "epoch": 0.6025648081897738, + "grad_norm": 0.7142168879508972, + "learning_rate": 7.947126164049645e-06, + "loss": 0.6159, + "step": 10948 + }, + { + "epoch": 0.6026198469921294, + "grad_norm": 0.7459015846252441, + "learning_rate": 7.946775988700219e-06, + "loss": 0.8377, + "step": 10949 + }, + { + "epoch": 0.6026748857944851, + "grad_norm": 1.050179362297058, + "learning_rate": 7.946425791203614e-06, + "loss": 0.8098, + "step": 10950 + }, + { + "epoch": 0.6027299245968408, + "grad_norm": 0.7473265528678894, + "learning_rate": 7.94607557156246e-06, + "loss": 0.6846, + "step": 10951 + }, + { + "epoch": 0.6027849633991964, + "grad_norm": 0.7990789413452148, + "learning_rate": 7.945725329779392e-06, + "loss": 0.8216, + "step": 10952 + }, + { + "epoch": 0.6028400022015521, + "grad_norm": 0.6461700201034546, + "learning_rate": 7.94537506585704e-06, + "loss": 0.7864, + "step": 10953 + }, + { + "epoch": 0.6028950410039078, + "grad_norm": 0.661123514175415, + "learning_rate": 7.945024779798038e-06, + "loss": 0.7466, + "step": 10954 + }, + { + "epoch": 0.6029500798062634, + "grad_norm": 0.6998088359832764, + "learning_rate": 7.944674471605018e-06, + "loss": 0.7846, + "step": 10955 + }, + { + "epoch": 0.603005118608619, + "grad_norm": 0.6917386651039124, + "learning_rate": 7.944324141280613e-06, + "loss": 0.7699, + "step": 10956 + }, + { + "epoch": 0.6030601574109747, + "grad_norm": 0.7304503321647644, + "learning_rate": 7.943973788827455e-06, + "loss": 0.8015, + "step": 10957 + }, + { + "epoch": 0.6031151962133304, + "grad_norm": 0.7996858358383179, + "learning_rate": 7.94362341424818e-06, + "loss": 0.7093, + "step": 10958 + }, + { + "epoch": 0.603170235015686, + "grad_norm": 0.7445322871208191, + "learning_rate": 7.943273017545419e-06, + "loss": 0.7388, + "step": 10959 + }, + { + "epoch": 0.6032252738180417, + "grad_norm": 0.6672174334526062, + "learning_rate": 7.942922598721805e-06, + "loss": 0.7703, + "step": 10960 + }, + { + "epoch": 0.6032803126203974, + "grad_norm": 0.7313557267189026, + "learning_rate": 7.94257215777997e-06, + "loss": 0.6637, + "step": 10961 + }, + { + "epoch": 0.6033353514227531, + "grad_norm": 0.7248823642730713, + "learning_rate": 7.942221694722553e-06, + "loss": 0.836, + "step": 10962 + }, + { + "epoch": 0.6033903902251087, + "grad_norm": 0.6583372354507446, + "learning_rate": 7.941871209552187e-06, + "loss": 0.7582, + "step": 10963 + }, + { + "epoch": 0.6034454290274643, + "grad_norm": 0.7502591013908386, + "learning_rate": 7.941520702271503e-06, + "loss": 0.7455, + "step": 10964 + }, + { + "epoch": 0.60350046782982, + "grad_norm": 0.6899349689483643, + "learning_rate": 7.941170172883135e-06, + "loss": 0.7677, + "step": 10965 + }, + { + "epoch": 0.6035555066321757, + "grad_norm": 0.693321943283081, + "learning_rate": 7.940819621389722e-06, + "loss": 0.7754, + "step": 10966 + }, + { + "epoch": 0.6036105454345313, + "grad_norm": 0.7376342415809631, + "learning_rate": 7.940469047793893e-06, + "loss": 0.7761, + "step": 10967 + }, + { + "epoch": 0.603665584236887, + "grad_norm": 0.6377952694892883, + "learning_rate": 7.940118452098289e-06, + "loss": 0.6612, + "step": 10968 + }, + { + "epoch": 0.6037206230392427, + "grad_norm": 0.8041388988494873, + "learning_rate": 7.939767834305538e-06, + "loss": 0.8358, + "step": 10969 + }, + { + "epoch": 0.6037756618415984, + "grad_norm": 1.5993521213531494, + "learning_rate": 7.939417194418282e-06, + "loss": 0.8536, + "step": 10970 + }, + { + "epoch": 0.6038307006439539, + "grad_norm": 0.6718295216560364, + "learning_rate": 7.939066532439153e-06, + "loss": 0.717, + "step": 10971 + }, + { + "epoch": 0.6038857394463096, + "grad_norm": 0.7951062917709351, + "learning_rate": 7.938715848370787e-06, + "loss": 0.6919, + "step": 10972 + }, + { + "epoch": 0.6039407782486653, + "grad_norm": 0.707804262638092, + "learning_rate": 7.938365142215816e-06, + "loss": 0.7346, + "step": 10973 + }, + { + "epoch": 0.603995817051021, + "grad_norm": 0.7244500517845154, + "learning_rate": 7.938014413976883e-06, + "loss": 0.708, + "step": 10974 + }, + { + "epoch": 0.6040508558533766, + "grad_norm": 0.7533566951751709, + "learning_rate": 7.937663663656617e-06, + "loss": 0.6761, + "step": 10975 + }, + { + "epoch": 0.6041058946557323, + "grad_norm": 0.8844665288925171, + "learning_rate": 7.93731289125766e-06, + "loss": 0.7833, + "step": 10976 + }, + { + "epoch": 0.604160933458088, + "grad_norm": 0.6413047313690186, + "learning_rate": 7.936962096782643e-06, + "loss": 0.7175, + "step": 10977 + }, + { + "epoch": 0.6042159722604437, + "grad_norm": 0.765943706035614, + "learning_rate": 7.936611280234206e-06, + "loss": 0.7654, + "step": 10978 + }, + { + "epoch": 0.6042710110627992, + "grad_norm": 0.6833398938179016, + "learning_rate": 7.936260441614985e-06, + "loss": 0.7459, + "step": 10979 + }, + { + "epoch": 0.6043260498651549, + "grad_norm": 0.6363481283187866, + "learning_rate": 7.935909580927617e-06, + "loss": 0.7173, + "step": 10980 + }, + { + "epoch": 0.6043810886675106, + "grad_norm": 0.7731046080589294, + "learning_rate": 7.935558698174738e-06, + "loss": 0.8428, + "step": 10981 + }, + { + "epoch": 0.6044361274698663, + "grad_norm": 0.7346602082252502, + "learning_rate": 7.935207793358986e-06, + "loss": 0.832, + "step": 10982 + }, + { + "epoch": 0.6044911662722219, + "grad_norm": 0.6711193919181824, + "learning_rate": 7.934856866482998e-06, + "loss": 0.742, + "step": 10983 + }, + { + "epoch": 0.6045462050745776, + "grad_norm": 0.6931266784667969, + "learning_rate": 7.934505917549411e-06, + "loss": 0.7779, + "step": 10984 + }, + { + "epoch": 0.6046012438769333, + "grad_norm": 0.7624725699424744, + "learning_rate": 7.934154946560862e-06, + "loss": 0.7229, + "step": 10985 + }, + { + "epoch": 0.604656282679289, + "grad_norm": 0.6594272255897522, + "learning_rate": 7.933803953519991e-06, + "loss": 0.7776, + "step": 10986 + }, + { + "epoch": 0.6047113214816445, + "grad_norm": 0.674521803855896, + "learning_rate": 7.933452938429435e-06, + "loss": 0.6904, + "step": 10987 + }, + { + "epoch": 0.6047663602840002, + "grad_norm": 0.7352569699287415, + "learning_rate": 7.933101901291831e-06, + "loss": 0.7655, + "step": 10988 + }, + { + "epoch": 0.6048213990863559, + "grad_norm": 0.8560347557067871, + "learning_rate": 7.932750842109817e-06, + "loss": 0.7894, + "step": 10989 + }, + { + "epoch": 0.6048764378887116, + "grad_norm": 0.769496500492096, + "learning_rate": 7.932399760886037e-06, + "loss": 0.8255, + "step": 10990 + }, + { + "epoch": 0.6049314766910672, + "grad_norm": 0.9399588108062744, + "learning_rate": 7.932048657623122e-06, + "loss": 0.8554, + "step": 10991 + }, + { + "epoch": 0.6049865154934229, + "grad_norm": 0.6662001609802246, + "learning_rate": 7.931697532323716e-06, + "loss": 0.7788, + "step": 10992 + }, + { + "epoch": 0.6050415542957785, + "grad_norm": 0.758263111114502, + "learning_rate": 7.931346384990455e-06, + "loss": 0.7907, + "step": 10993 + }, + { + "epoch": 0.6050965930981342, + "grad_norm": 0.7283937335014343, + "learning_rate": 7.930995215625978e-06, + "loss": 0.8415, + "step": 10994 + }, + { + "epoch": 0.6051516319004898, + "grad_norm": 0.6611599922180176, + "learning_rate": 7.930644024232927e-06, + "loss": 0.7145, + "step": 10995 + }, + { + "epoch": 0.6052066707028455, + "grad_norm": 0.8450857400894165, + "learning_rate": 7.93029281081394e-06, + "loss": 0.7208, + "step": 10996 + }, + { + "epoch": 0.6052617095052012, + "grad_norm": 0.649010181427002, + "learning_rate": 7.929941575371655e-06, + "loss": 0.6928, + "step": 10997 + }, + { + "epoch": 0.6053167483075568, + "grad_norm": 0.7022100687026978, + "learning_rate": 7.929590317908718e-06, + "loss": 0.7329, + "step": 10998 + }, + { + "epoch": 0.6053717871099125, + "grad_norm": 0.768598198890686, + "learning_rate": 7.92923903842776e-06, + "loss": 0.7799, + "step": 10999 + }, + { + "epoch": 0.6054268259122682, + "grad_norm": 0.6648436784744263, + "learning_rate": 7.928887736931428e-06, + "loss": 0.7728, + "step": 11000 + }, + { + "epoch": 0.6054818647146238, + "grad_norm": 0.6946157813072205, + "learning_rate": 7.928536413422357e-06, + "loss": 0.7609, + "step": 11001 + }, + { + "epoch": 0.6055369035169794, + "grad_norm": 0.7779337167739868, + "learning_rate": 7.928185067903191e-06, + "loss": 0.7679, + "step": 11002 + }, + { + "epoch": 0.6055919423193351, + "grad_norm": 0.6520814895629883, + "learning_rate": 7.927833700376573e-06, + "loss": 0.6734, + "step": 11003 + }, + { + "epoch": 0.6056469811216908, + "grad_norm": 0.7724258899688721, + "learning_rate": 7.927482310845138e-06, + "loss": 0.7564, + "step": 11004 + }, + { + "epoch": 0.6057020199240465, + "grad_norm": 0.6649174690246582, + "learning_rate": 7.927130899311529e-06, + "loss": 0.7217, + "step": 11005 + }, + { + "epoch": 0.6057570587264021, + "grad_norm": 0.6807287931442261, + "learning_rate": 7.926779465778389e-06, + "loss": 0.6966, + "step": 11006 + }, + { + "epoch": 0.6058120975287578, + "grad_norm": 0.6644826531410217, + "learning_rate": 7.926428010248357e-06, + "loss": 0.7238, + "step": 11007 + }, + { + "epoch": 0.6058671363311134, + "grad_norm": 0.7533535957336426, + "learning_rate": 7.926076532724077e-06, + "loss": 0.855, + "step": 11008 + }, + { + "epoch": 0.6059221751334691, + "grad_norm": 0.6457169055938721, + "learning_rate": 7.925725033208187e-06, + "loss": 0.6717, + "step": 11009 + }, + { + "epoch": 0.6059772139358247, + "grad_norm": 0.724719762802124, + "learning_rate": 7.925373511703332e-06, + "loss": 0.8701, + "step": 11010 + }, + { + "epoch": 0.6060322527381804, + "grad_norm": 0.746755063533783, + "learning_rate": 7.925021968212153e-06, + "loss": 0.8509, + "step": 11011 + }, + { + "epoch": 0.6060872915405361, + "grad_norm": 0.7377174496650696, + "learning_rate": 7.924670402737292e-06, + "loss": 0.8053, + "step": 11012 + }, + { + "epoch": 0.6061423303428918, + "grad_norm": 0.9791839718818665, + "learning_rate": 7.92431881528139e-06, + "loss": 0.7893, + "step": 11013 + }, + { + "epoch": 0.6061973691452474, + "grad_norm": 0.7472195029258728, + "learning_rate": 7.923967205847089e-06, + "loss": 0.7195, + "step": 11014 + }, + { + "epoch": 0.606252407947603, + "grad_norm": 0.672851026058197, + "learning_rate": 7.923615574437037e-06, + "loss": 0.8234, + "step": 11015 + }, + { + "epoch": 0.6063074467499587, + "grad_norm": 0.739942729473114, + "learning_rate": 7.923263921053872e-06, + "loss": 0.8582, + "step": 11016 + }, + { + "epoch": 0.6063624855523144, + "grad_norm": 0.7337772846221924, + "learning_rate": 7.922912245700236e-06, + "loss": 0.8008, + "step": 11017 + }, + { + "epoch": 0.60641752435467, + "grad_norm": 0.6707174777984619, + "learning_rate": 7.922560548378774e-06, + "loss": 0.8531, + "step": 11018 + }, + { + "epoch": 0.6064725631570257, + "grad_norm": 0.6783839464187622, + "learning_rate": 7.922208829092133e-06, + "loss": 0.7963, + "step": 11019 + }, + { + "epoch": 0.6065276019593814, + "grad_norm": 0.6133253574371338, + "learning_rate": 7.92185708784295e-06, + "loss": 0.7375, + "step": 11020 + }, + { + "epoch": 0.6065826407617371, + "grad_norm": 0.8300097584724426, + "learning_rate": 7.921505324633868e-06, + "loss": 0.7976, + "step": 11021 + }, + { + "epoch": 0.6066376795640926, + "grad_norm": 0.6800658702850342, + "learning_rate": 7.921153539467538e-06, + "loss": 0.7321, + "step": 11022 + }, + { + "epoch": 0.6066927183664483, + "grad_norm": 0.6849787831306458, + "learning_rate": 7.920801732346602e-06, + "loss": 0.7134, + "step": 11023 + }, + { + "epoch": 0.606747757168804, + "grad_norm": 0.7675080895423889, + "learning_rate": 7.920449903273697e-06, + "loss": 0.7402, + "step": 11024 + }, + { + "epoch": 0.6068027959711597, + "grad_norm": 0.7431055903434753, + "learning_rate": 7.920098052251476e-06, + "loss": 0.7872, + "step": 11025 + }, + { + "epoch": 0.6068578347735153, + "grad_norm": 0.6264036297798157, + "learning_rate": 7.919746179282577e-06, + "loss": 0.7496, + "step": 11026 + }, + { + "epoch": 0.606912873575871, + "grad_norm": 0.7800843715667725, + "learning_rate": 7.919394284369648e-06, + "loss": 0.7917, + "step": 11027 + }, + { + "epoch": 0.6069679123782267, + "grad_norm": 0.7665574550628662, + "learning_rate": 7.919042367515336e-06, + "loss": 0.7905, + "step": 11028 + }, + { + "epoch": 0.6070229511805824, + "grad_norm": 0.7473214864730835, + "learning_rate": 7.918690428722279e-06, + "loss": 0.7732, + "step": 11029 + }, + { + "epoch": 0.6070779899829379, + "grad_norm": 0.6717211008071899, + "learning_rate": 7.918338467993127e-06, + "loss": 0.8221, + "step": 11030 + }, + { + "epoch": 0.6071330287852936, + "grad_norm": 0.6745431423187256, + "learning_rate": 7.917986485330525e-06, + "loss": 0.6899, + "step": 11031 + }, + { + "epoch": 0.6071880675876493, + "grad_norm": 0.6838263273239136, + "learning_rate": 7.917634480737117e-06, + "loss": 0.7133, + "step": 11032 + }, + { + "epoch": 0.607243106390005, + "grad_norm": 0.7975682020187378, + "learning_rate": 7.91728245421555e-06, + "loss": 0.8283, + "step": 11033 + }, + { + "epoch": 0.6072981451923606, + "grad_norm": 0.7112031579017639, + "learning_rate": 7.916930405768468e-06, + "loss": 0.7423, + "step": 11034 + }, + { + "epoch": 0.6073531839947163, + "grad_norm": 0.7006776928901672, + "learning_rate": 7.91657833539852e-06, + "loss": 0.716, + "step": 11035 + }, + { + "epoch": 0.607408222797072, + "grad_norm": 0.7523549795150757, + "learning_rate": 7.916226243108348e-06, + "loss": 0.8591, + "step": 11036 + }, + { + "epoch": 0.6074632615994277, + "grad_norm": 0.7257835268974304, + "learning_rate": 7.9158741289006e-06, + "loss": 0.7471, + "step": 11037 + }, + { + "epoch": 0.6075183004017832, + "grad_norm": 0.8100149631500244, + "learning_rate": 7.915521992777922e-06, + "loss": 0.8373, + "step": 11038 + }, + { + "epoch": 0.6075733392041389, + "grad_norm": 0.7781035304069519, + "learning_rate": 7.915169834742964e-06, + "loss": 0.8471, + "step": 11039 + }, + { + "epoch": 0.6076283780064946, + "grad_norm": 0.7426049709320068, + "learning_rate": 7.914817654798368e-06, + "loss": 0.753, + "step": 11040 + }, + { + "epoch": 0.6076834168088502, + "grad_norm": 0.6990010738372803, + "learning_rate": 7.914465452946782e-06, + "loss": 0.7556, + "step": 11041 + }, + { + "epoch": 0.6077384556112059, + "grad_norm": 0.8038754463195801, + "learning_rate": 7.914113229190856e-06, + "loss": 0.7787, + "step": 11042 + }, + { + "epoch": 0.6077934944135616, + "grad_norm": 0.6434115767478943, + "learning_rate": 7.913760983533233e-06, + "loss": 0.7831, + "step": 11043 + }, + { + "epoch": 0.6078485332159173, + "grad_norm": 0.8119033575057983, + "learning_rate": 7.913408715976562e-06, + "loss": 0.7691, + "step": 11044 + }, + { + "epoch": 0.6079035720182728, + "grad_norm": 0.6710149049758911, + "learning_rate": 7.913056426523493e-06, + "loss": 0.7542, + "step": 11045 + }, + { + "epoch": 0.6079586108206285, + "grad_norm": 0.7458183765411377, + "learning_rate": 7.912704115176671e-06, + "loss": 0.7673, + "step": 11046 + }, + { + "epoch": 0.6080136496229842, + "grad_norm": 0.8061705827713013, + "learning_rate": 7.912351781938745e-06, + "loss": 0.9255, + "step": 11047 + }, + { + "epoch": 0.6080686884253399, + "grad_norm": 0.7193130850791931, + "learning_rate": 7.91199942681236e-06, + "loss": 0.8154, + "step": 11048 + }, + { + "epoch": 0.6081237272276955, + "grad_norm": 0.7785167098045349, + "learning_rate": 7.911647049800171e-06, + "loss": 0.7747, + "step": 11049 + }, + { + "epoch": 0.6081787660300512, + "grad_norm": 0.665765106678009, + "learning_rate": 7.911294650904818e-06, + "loss": 0.7573, + "step": 11050 + }, + { + "epoch": 0.6082338048324069, + "grad_norm": 0.7940623760223389, + "learning_rate": 7.910942230128956e-06, + "loss": 0.6628, + "step": 11051 + }, + { + "epoch": 0.6082888436347625, + "grad_norm": 0.8364549875259399, + "learning_rate": 7.910589787475232e-06, + "loss": 0.8103, + "step": 11052 + }, + { + "epoch": 0.6083438824371181, + "grad_norm": 0.6153101325035095, + "learning_rate": 7.910237322946292e-06, + "loss": 0.76, + "step": 11053 + }, + { + "epoch": 0.6083989212394738, + "grad_norm": 0.8381257653236389, + "learning_rate": 7.909884836544789e-06, + "loss": 0.8366, + "step": 11054 + }, + { + "epoch": 0.6084539600418295, + "grad_norm": 0.6602391600608826, + "learning_rate": 7.90953232827337e-06, + "loss": 0.7389, + "step": 11055 + }, + { + "epoch": 0.6085089988441852, + "grad_norm": 0.7329971194267273, + "learning_rate": 7.909179798134685e-06, + "loss": 0.8217, + "step": 11056 + }, + { + "epoch": 0.6085640376465408, + "grad_norm": 0.7319926023483276, + "learning_rate": 7.908827246131383e-06, + "loss": 0.78, + "step": 11057 + }, + { + "epoch": 0.6086190764488965, + "grad_norm": 0.6491387486457825, + "learning_rate": 7.908474672266114e-06, + "loss": 0.7496, + "step": 11058 + }, + { + "epoch": 0.6086741152512521, + "grad_norm": 0.656434953212738, + "learning_rate": 7.908122076541529e-06, + "loss": 0.7462, + "step": 11059 + }, + { + "epoch": 0.6087291540536078, + "grad_norm": 0.6908577680587769, + "learning_rate": 7.907769458960275e-06, + "loss": 0.7505, + "step": 11060 + }, + { + "epoch": 0.6087841928559634, + "grad_norm": 0.774424135684967, + "learning_rate": 7.907416819525007e-06, + "loss": 0.8275, + "step": 11061 + }, + { + "epoch": 0.6088392316583191, + "grad_norm": 0.6796718835830688, + "learning_rate": 7.90706415823837e-06, + "loss": 0.7606, + "step": 11062 + }, + { + "epoch": 0.6088942704606748, + "grad_norm": 0.9576514959335327, + "learning_rate": 7.906711475103016e-06, + "loss": 0.807, + "step": 11063 + }, + { + "epoch": 0.6089493092630305, + "grad_norm": 0.9848490953445435, + "learning_rate": 7.9063587701216e-06, + "loss": 0.7856, + "step": 11064 + }, + { + "epoch": 0.6090043480653861, + "grad_norm": 0.9490165710449219, + "learning_rate": 7.906006043296768e-06, + "loss": 0.8519, + "step": 11065 + }, + { + "epoch": 0.6090593868677417, + "grad_norm": 0.631382942199707, + "learning_rate": 7.905653294631172e-06, + "loss": 0.7041, + "step": 11066 + }, + { + "epoch": 0.6091144256700974, + "grad_norm": 0.6969574093818665, + "learning_rate": 7.905300524127464e-06, + "loss": 0.7556, + "step": 11067 + }, + { + "epoch": 0.6091694644724531, + "grad_norm": 0.6990532279014587, + "learning_rate": 7.904947731788295e-06, + "loss": 0.799, + "step": 11068 + }, + { + "epoch": 0.6092245032748087, + "grad_norm": 0.7216916084289551, + "learning_rate": 7.904594917616315e-06, + "loss": 0.7617, + "step": 11069 + }, + { + "epoch": 0.6092795420771644, + "grad_norm": 0.6874147653579712, + "learning_rate": 7.904242081614179e-06, + "loss": 0.7616, + "step": 11070 + }, + { + "epoch": 0.6093345808795201, + "grad_norm": 0.6909550428390503, + "learning_rate": 7.903889223784535e-06, + "loss": 0.7649, + "step": 11071 + }, + { + "epoch": 0.6093896196818758, + "grad_norm": 0.7796370387077332, + "learning_rate": 7.90353634413004e-06, + "loss": 0.7557, + "step": 11072 + }, + { + "epoch": 0.6094446584842313, + "grad_norm": 0.807448148727417, + "learning_rate": 7.903183442653341e-06, + "loss": 0.7519, + "step": 11073 + }, + { + "epoch": 0.609499697286587, + "grad_norm": 0.846371054649353, + "learning_rate": 7.902830519357092e-06, + "loss": 0.9342, + "step": 11074 + }, + { + "epoch": 0.6095547360889427, + "grad_norm": 1.0386929512023926, + "learning_rate": 7.902477574243947e-06, + "loss": 0.6802, + "step": 11075 + }, + { + "epoch": 0.6096097748912984, + "grad_norm": 0.8011854887008667, + "learning_rate": 7.902124607316558e-06, + "loss": 0.7756, + "step": 11076 + }, + { + "epoch": 0.609664813693654, + "grad_norm": 0.6560170650482178, + "learning_rate": 7.901771618577574e-06, + "loss": 0.7831, + "step": 11077 + }, + { + "epoch": 0.6097198524960097, + "grad_norm": 0.656891942024231, + "learning_rate": 7.901418608029655e-06, + "loss": 0.7239, + "step": 11078 + }, + { + "epoch": 0.6097748912983654, + "grad_norm": 0.7451794743537903, + "learning_rate": 7.901065575675448e-06, + "loss": 0.7426, + "step": 11079 + }, + { + "epoch": 0.6098299301007211, + "grad_norm": 0.6805453300476074, + "learning_rate": 7.90071252151761e-06, + "loss": 0.7257, + "step": 11080 + }, + { + "epoch": 0.6098849689030766, + "grad_norm": 0.7747140526771545, + "learning_rate": 7.900359445558791e-06, + "loss": 0.8554, + "step": 11081 + }, + { + "epoch": 0.6099400077054323, + "grad_norm": 0.7276260256767273, + "learning_rate": 7.900006347801649e-06, + "loss": 0.7608, + "step": 11082 + }, + { + "epoch": 0.609995046507788, + "grad_norm": 0.7496321201324463, + "learning_rate": 7.899653228248836e-06, + "loss": 0.7707, + "step": 11083 + }, + { + "epoch": 0.6100500853101436, + "grad_norm": 0.6810722947120667, + "learning_rate": 7.899300086903006e-06, + "loss": 0.7425, + "step": 11084 + }, + { + "epoch": 0.6101051241124993, + "grad_norm": 0.7245593070983887, + "learning_rate": 7.89894692376681e-06, + "loss": 0.8404, + "step": 11085 + }, + { + "epoch": 0.610160162914855, + "grad_norm": 0.7139402627944946, + "learning_rate": 7.898593738842906e-06, + "loss": 0.7219, + "step": 11086 + }, + { + "epoch": 0.6102152017172107, + "grad_norm": 0.6483772397041321, + "learning_rate": 7.898240532133947e-06, + "loss": 0.7571, + "step": 11087 + }, + { + "epoch": 0.6102702405195662, + "grad_norm": 0.7347467541694641, + "learning_rate": 7.89788730364259e-06, + "loss": 0.7666, + "step": 11088 + }, + { + "epoch": 0.6103252793219219, + "grad_norm": 0.8899261355400085, + "learning_rate": 7.897534053371485e-06, + "loss": 0.6886, + "step": 11089 + }, + { + "epoch": 0.6103803181242776, + "grad_norm": 0.7005650401115417, + "learning_rate": 7.89718078132329e-06, + "loss": 0.6771, + "step": 11090 + }, + { + "epoch": 0.6104353569266333, + "grad_norm": 0.776589035987854, + "learning_rate": 7.896827487500662e-06, + "loss": 0.7731, + "step": 11091 + }, + { + "epoch": 0.6104903957289889, + "grad_norm": 0.7039395570755005, + "learning_rate": 7.896474171906252e-06, + "loss": 0.7415, + "step": 11092 + }, + { + "epoch": 0.6105454345313446, + "grad_norm": 0.7453792095184326, + "learning_rate": 7.896120834542718e-06, + "loss": 0.8507, + "step": 11093 + }, + { + "epoch": 0.6106004733337003, + "grad_norm": 0.7516497373580933, + "learning_rate": 7.895767475412717e-06, + "loss": 0.8271, + "step": 11094 + }, + { + "epoch": 0.610655512136056, + "grad_norm": 0.6751283407211304, + "learning_rate": 7.895414094518901e-06, + "loss": 0.7788, + "step": 11095 + }, + { + "epoch": 0.6107105509384115, + "grad_norm": 0.7240836024284363, + "learning_rate": 7.895060691863927e-06, + "loss": 0.7507, + "step": 11096 + }, + { + "epoch": 0.6107655897407672, + "grad_norm": 0.8286149501800537, + "learning_rate": 7.894707267450451e-06, + "loss": 0.7033, + "step": 11097 + }, + { + "epoch": 0.6108206285431229, + "grad_norm": 0.8814655542373657, + "learning_rate": 7.894353821281131e-06, + "loss": 0.73, + "step": 11098 + }, + { + "epoch": 0.6108756673454786, + "grad_norm": 0.6792872548103333, + "learning_rate": 7.894000353358624e-06, + "loss": 0.7445, + "step": 11099 + }, + { + "epoch": 0.6109307061478342, + "grad_norm": 0.6442595720291138, + "learning_rate": 7.893646863685584e-06, + "loss": 0.7228, + "step": 11100 + }, + { + "epoch": 0.6109857449501899, + "grad_norm": 0.6775944828987122, + "learning_rate": 7.89329335226467e-06, + "loss": 0.7937, + "step": 11101 + }, + { + "epoch": 0.6110407837525456, + "grad_norm": 0.6315211653709412, + "learning_rate": 7.892939819098534e-06, + "loss": 0.7328, + "step": 11102 + }, + { + "epoch": 0.6110958225549012, + "grad_norm": 0.7419382929801941, + "learning_rate": 7.89258626418984e-06, + "loss": 0.8088, + "step": 11103 + }, + { + "epoch": 0.6111508613572568, + "grad_norm": 0.6645117402076721, + "learning_rate": 7.89223268754124e-06, + "loss": 0.7844, + "step": 11104 + }, + { + "epoch": 0.6112059001596125, + "grad_norm": 0.6389926075935364, + "learning_rate": 7.891879089155397e-06, + "loss": 0.6353, + "step": 11105 + }, + { + "epoch": 0.6112609389619682, + "grad_norm": 0.8223785758018494, + "learning_rate": 7.891525469034963e-06, + "loss": 0.7377, + "step": 11106 + }, + { + "epoch": 0.6113159777643239, + "grad_norm": 0.7627747058868408, + "learning_rate": 7.891171827182595e-06, + "loss": 0.8317, + "step": 11107 + }, + { + "epoch": 0.6113710165666795, + "grad_norm": 0.8015971183776855, + "learning_rate": 7.890818163600956e-06, + "loss": 0.8324, + "step": 11108 + }, + { + "epoch": 0.6114260553690352, + "grad_norm": 0.7180280089378357, + "learning_rate": 7.8904644782927e-06, + "loss": 0.8211, + "step": 11109 + }, + { + "epoch": 0.6114810941713908, + "grad_norm": 0.7855646014213562, + "learning_rate": 7.890110771260487e-06, + "loss": 0.8629, + "step": 11110 + }, + { + "epoch": 0.6115361329737465, + "grad_norm": 0.7389342784881592, + "learning_rate": 7.889757042506976e-06, + "loss": 0.6917, + "step": 11111 + }, + { + "epoch": 0.6115911717761021, + "grad_norm": 0.7996030449867249, + "learning_rate": 7.889403292034825e-06, + "loss": 0.7361, + "step": 11112 + }, + { + "epoch": 0.6116462105784578, + "grad_norm": 0.6658353805541992, + "learning_rate": 7.88904951984669e-06, + "loss": 0.7048, + "step": 11113 + }, + { + "epoch": 0.6117012493808135, + "grad_norm": 0.8128555417060852, + "learning_rate": 7.888695725945235e-06, + "loss": 0.7772, + "step": 11114 + }, + { + "epoch": 0.6117562881831692, + "grad_norm": 0.7597428560256958, + "learning_rate": 7.888341910333114e-06, + "loss": 0.7447, + "step": 11115 + }, + { + "epoch": 0.6118113269855248, + "grad_norm": 0.7330088019371033, + "learning_rate": 7.88798807301299e-06, + "loss": 0.849, + "step": 11116 + }, + { + "epoch": 0.6118663657878805, + "grad_norm": 0.8374074101448059, + "learning_rate": 7.88763421398752e-06, + "loss": 0.6149, + "step": 11117 + }, + { + "epoch": 0.6119214045902361, + "grad_norm": 0.7507160305976868, + "learning_rate": 7.887280333259364e-06, + "loss": 0.7737, + "step": 11118 + }, + { + "epoch": 0.6119764433925918, + "grad_norm": 0.7218281626701355, + "learning_rate": 7.886926430831181e-06, + "loss": 0.8151, + "step": 11119 + }, + { + "epoch": 0.6120314821949474, + "grad_norm": 0.6761744618415833, + "learning_rate": 7.886572506705634e-06, + "loss": 0.7429, + "step": 11120 + }, + { + "epoch": 0.6120865209973031, + "grad_norm": 0.8243520259857178, + "learning_rate": 7.886218560885379e-06, + "loss": 0.819, + "step": 11121 + }, + { + "epoch": 0.6121415597996588, + "grad_norm": 0.9675465822219849, + "learning_rate": 7.885864593373078e-06, + "loss": 0.7834, + "step": 11122 + }, + { + "epoch": 0.6121965986020145, + "grad_norm": 0.7220338582992554, + "learning_rate": 7.885510604171391e-06, + "loss": 0.8266, + "step": 11123 + }, + { + "epoch": 0.61225163740437, + "grad_norm": 0.7185316681861877, + "learning_rate": 7.88515659328298e-06, + "loss": 0.7949, + "step": 11124 + }, + { + "epoch": 0.6123066762067257, + "grad_norm": 0.67637038230896, + "learning_rate": 7.884802560710503e-06, + "loss": 0.7456, + "step": 11125 + }, + { + "epoch": 0.6123617150090814, + "grad_norm": 0.7886855602264404, + "learning_rate": 7.884448506456622e-06, + "loss": 0.7181, + "step": 11126 + }, + { + "epoch": 0.612416753811437, + "grad_norm": 0.7250227928161621, + "learning_rate": 7.884094430523999e-06, + "loss": 0.7537, + "step": 11127 + }, + { + "epoch": 0.6124717926137927, + "grad_norm": 0.6771906614303589, + "learning_rate": 7.883740332915295e-06, + "loss": 0.7642, + "step": 11128 + }, + { + "epoch": 0.6125268314161484, + "grad_norm": 0.8375886082649231, + "learning_rate": 7.88338621363317e-06, + "loss": 0.7231, + "step": 11129 + }, + { + "epoch": 0.6125818702185041, + "grad_norm": 0.6782773733139038, + "learning_rate": 7.883032072680285e-06, + "loss": 0.8391, + "step": 11130 + }, + { + "epoch": 0.6126369090208597, + "grad_norm": 0.7103945016860962, + "learning_rate": 7.882677910059304e-06, + "loss": 0.7838, + "step": 11131 + }, + { + "epoch": 0.6126919478232153, + "grad_norm": 0.7037224769592285, + "learning_rate": 7.882323725772887e-06, + "loss": 0.7906, + "step": 11132 + }, + { + "epoch": 0.612746986625571, + "grad_norm": 0.6872009634971619, + "learning_rate": 7.881969519823695e-06, + "loss": 0.7764, + "step": 11133 + }, + { + "epoch": 0.6128020254279267, + "grad_norm": 0.7377448678016663, + "learning_rate": 7.881615292214393e-06, + "loss": 0.8231, + "step": 11134 + }, + { + "epoch": 0.6128570642302823, + "grad_norm": 0.62479168176651, + "learning_rate": 7.881261042947642e-06, + "loss": 0.6522, + "step": 11135 + }, + { + "epoch": 0.612912103032638, + "grad_norm": 0.7989023923873901, + "learning_rate": 7.880906772026105e-06, + "loss": 0.7326, + "step": 11136 + }, + { + "epoch": 0.6129671418349937, + "grad_norm": 0.6322734951972961, + "learning_rate": 7.880552479452441e-06, + "loss": 0.6775, + "step": 11137 + }, + { + "epoch": 0.6130221806373494, + "grad_norm": 0.8628767132759094, + "learning_rate": 7.880198165229318e-06, + "loss": 0.7705, + "step": 11138 + }, + { + "epoch": 0.613077219439705, + "grad_norm": 0.7386173605918884, + "learning_rate": 7.879843829359396e-06, + "loss": 0.7297, + "step": 11139 + }, + { + "epoch": 0.6131322582420606, + "grad_norm": 0.6882045269012451, + "learning_rate": 7.879489471845339e-06, + "loss": 0.6875, + "step": 11140 + }, + { + "epoch": 0.6131872970444163, + "grad_norm": 0.5986032485961914, + "learning_rate": 7.879135092689809e-06, + "loss": 0.6329, + "step": 11141 + }, + { + "epoch": 0.613242335846772, + "grad_norm": 0.7973099946975708, + "learning_rate": 7.878780691895472e-06, + "loss": 0.809, + "step": 11142 + }, + { + "epoch": 0.6132973746491276, + "grad_norm": 0.6828579902648926, + "learning_rate": 7.878426269464989e-06, + "loss": 0.7777, + "step": 11143 + }, + { + "epoch": 0.6133524134514833, + "grad_norm": 0.8179183006286621, + "learning_rate": 7.878071825401024e-06, + "loss": 0.7275, + "step": 11144 + }, + { + "epoch": 0.613407452253839, + "grad_norm": 0.7290762066841125, + "learning_rate": 7.877717359706242e-06, + "loss": 0.7424, + "step": 11145 + }, + { + "epoch": 0.6134624910561947, + "grad_norm": 0.732510507106781, + "learning_rate": 7.877362872383305e-06, + "loss": 0.6157, + "step": 11146 + }, + { + "epoch": 0.6135175298585502, + "grad_norm": 0.9205982685089111, + "learning_rate": 7.877008363434881e-06, + "loss": 0.7723, + "step": 11147 + }, + { + "epoch": 0.6135725686609059, + "grad_norm": 0.7138587832450867, + "learning_rate": 7.876653832863633e-06, + "loss": 0.7773, + "step": 11148 + }, + { + "epoch": 0.6136276074632616, + "grad_norm": 0.7323171496391296, + "learning_rate": 7.876299280672224e-06, + "loss": 0.8265, + "step": 11149 + }, + { + "epoch": 0.6136826462656173, + "grad_norm": 0.6717494130134583, + "learning_rate": 7.875944706863318e-06, + "loss": 0.788, + "step": 11150 + }, + { + "epoch": 0.6137376850679729, + "grad_norm": 0.7779331207275391, + "learning_rate": 7.875590111439582e-06, + "loss": 0.7864, + "step": 11151 + }, + { + "epoch": 0.6137927238703286, + "grad_norm": 0.6706684827804565, + "learning_rate": 7.875235494403683e-06, + "loss": 0.6673, + "step": 11152 + }, + { + "epoch": 0.6138477626726843, + "grad_norm": 0.7142137885093689, + "learning_rate": 7.874880855758281e-06, + "loss": 0.8031, + "step": 11153 + }, + { + "epoch": 0.61390280147504, + "grad_norm": 0.6962595582008362, + "learning_rate": 7.874526195506045e-06, + "loss": 0.692, + "step": 11154 + }, + { + "epoch": 0.6139578402773955, + "grad_norm": 0.7237100601196289, + "learning_rate": 7.874171513649638e-06, + "loss": 0.7504, + "step": 11155 + }, + { + "epoch": 0.6140128790797512, + "grad_norm": 0.8235127925872803, + "learning_rate": 7.87381681019173e-06, + "loss": 0.8132, + "step": 11156 + }, + { + "epoch": 0.6140679178821069, + "grad_norm": 0.7483351826667786, + "learning_rate": 7.873462085134981e-06, + "loss": 0.7589, + "step": 11157 + }, + { + "epoch": 0.6141229566844626, + "grad_norm": 0.7309976816177368, + "learning_rate": 7.873107338482062e-06, + "loss": 0.7722, + "step": 11158 + }, + { + "epoch": 0.6141779954868182, + "grad_norm": 0.8871245384216309, + "learning_rate": 7.872752570235639e-06, + "loss": 0.882, + "step": 11159 + }, + { + "epoch": 0.6142330342891739, + "grad_norm": 0.5987886190414429, + "learning_rate": 7.872397780398374e-06, + "loss": 0.6312, + "step": 11160 + }, + { + "epoch": 0.6142880730915296, + "grad_norm": 0.7320038080215454, + "learning_rate": 7.872042968972937e-06, + "loss": 0.7444, + "step": 11161 + }, + { + "epoch": 0.6143431118938852, + "grad_norm": 0.8111129999160767, + "learning_rate": 7.871688135961995e-06, + "loss": 0.7413, + "step": 11162 + }, + { + "epoch": 0.6143981506962408, + "grad_norm": 0.7497085332870483, + "learning_rate": 7.871333281368211e-06, + "loss": 0.8413, + "step": 11163 + }, + { + "epoch": 0.6144531894985965, + "grad_norm": 0.8341198563575745, + "learning_rate": 7.870978405194256e-06, + "loss": 0.7959, + "step": 11164 + }, + { + "epoch": 0.6145082283009522, + "grad_norm": 0.6293482780456543, + "learning_rate": 7.870623507442797e-06, + "loss": 0.6429, + "step": 11165 + }, + { + "epoch": 0.6145632671033079, + "grad_norm": 1.2423945665359497, + "learning_rate": 7.870268588116499e-06, + "loss": 0.6309, + "step": 11166 + }, + { + "epoch": 0.6146183059056635, + "grad_norm": 0.7811731100082397, + "learning_rate": 7.86991364721803e-06, + "loss": 0.738, + "step": 11167 + }, + { + "epoch": 0.6146733447080192, + "grad_norm": 0.6904361248016357, + "learning_rate": 7.869558684750061e-06, + "loss": 0.7995, + "step": 11168 + }, + { + "epoch": 0.6147283835103748, + "grad_norm": 0.7267210483551025, + "learning_rate": 7.869203700715254e-06, + "loss": 0.6989, + "step": 11169 + }, + { + "epoch": 0.6147834223127304, + "grad_norm": 0.7183068990707397, + "learning_rate": 7.868848695116282e-06, + "loss": 0.7872, + "step": 11170 + }, + { + "epoch": 0.6148384611150861, + "grad_norm": 0.6774286031723022, + "learning_rate": 7.868493667955808e-06, + "loss": 0.7502, + "step": 11171 + }, + { + "epoch": 0.6148934999174418, + "grad_norm": 0.7587934732437134, + "learning_rate": 7.868138619236507e-06, + "loss": 0.8037, + "step": 11172 + }, + { + "epoch": 0.6149485387197975, + "grad_norm": 0.6825854182243347, + "learning_rate": 7.867783548961043e-06, + "loss": 0.7924, + "step": 11173 + }, + { + "epoch": 0.6150035775221531, + "grad_norm": 0.6243380904197693, + "learning_rate": 7.867428457132084e-06, + "loss": 0.5953, + "step": 11174 + }, + { + "epoch": 0.6150586163245088, + "grad_norm": 0.6630006432533264, + "learning_rate": 7.8670733437523e-06, + "loss": 0.7102, + "step": 11175 + }, + { + "epoch": 0.6151136551268644, + "grad_norm": 0.7059652805328369, + "learning_rate": 7.866718208824362e-06, + "loss": 0.6847, + "step": 11176 + }, + { + "epoch": 0.6151686939292201, + "grad_norm": 0.6768305897712708, + "learning_rate": 7.866363052350938e-06, + "loss": 0.7152, + "step": 11177 + }, + { + "epoch": 0.6152237327315757, + "grad_norm": 0.6850628852844238, + "learning_rate": 7.866007874334696e-06, + "loss": 0.767, + "step": 11178 + }, + { + "epoch": 0.6152787715339314, + "grad_norm": 0.6767143607139587, + "learning_rate": 7.865652674778305e-06, + "loss": 0.6826, + "step": 11179 + }, + { + "epoch": 0.6153338103362871, + "grad_norm": 0.8240014314651489, + "learning_rate": 7.865297453684436e-06, + "loss": 0.8493, + "step": 11180 + }, + { + "epoch": 0.6153888491386428, + "grad_norm": 0.7725485563278198, + "learning_rate": 7.864942211055758e-06, + "loss": 0.8704, + "step": 11181 + }, + { + "epoch": 0.6154438879409984, + "grad_norm": 0.9260931015014648, + "learning_rate": 7.864586946894941e-06, + "loss": 0.7926, + "step": 11182 + }, + { + "epoch": 0.615498926743354, + "grad_norm": 0.7558152079582214, + "learning_rate": 7.864231661204655e-06, + "loss": 0.8436, + "step": 11183 + }, + { + "epoch": 0.6155539655457097, + "grad_norm": 0.7899817824363708, + "learning_rate": 7.863876353987571e-06, + "loss": 0.7579, + "step": 11184 + }, + { + "epoch": 0.6156090043480654, + "grad_norm": 0.7757478952407837, + "learning_rate": 7.863521025246362e-06, + "loss": 0.7534, + "step": 11185 + }, + { + "epoch": 0.615664043150421, + "grad_norm": 0.6563131809234619, + "learning_rate": 7.863165674983693e-06, + "loss": 0.728, + "step": 11186 + }, + { + "epoch": 0.6157190819527767, + "grad_norm": 0.6516488790512085, + "learning_rate": 7.862810303202234e-06, + "loss": 0.736, + "step": 11187 + }, + { + "epoch": 0.6157741207551324, + "grad_norm": 0.6867820620536804, + "learning_rate": 7.862454909904665e-06, + "loss": 0.8032, + "step": 11188 + }, + { + "epoch": 0.6158291595574881, + "grad_norm": 0.7399753928184509, + "learning_rate": 7.862099495093647e-06, + "loss": 0.8681, + "step": 11189 + }, + { + "epoch": 0.6158841983598436, + "grad_norm": 0.7249311804771423, + "learning_rate": 7.861744058771857e-06, + "loss": 0.7868, + "step": 11190 + }, + { + "epoch": 0.6159392371621993, + "grad_norm": 0.8579045534133911, + "learning_rate": 7.861388600941964e-06, + "loss": 0.7915, + "step": 11191 + }, + { + "epoch": 0.615994275964555, + "grad_norm": 0.6855454444885254, + "learning_rate": 7.86103312160664e-06, + "loss": 0.8442, + "step": 11192 + }, + { + "epoch": 0.6160493147669107, + "grad_norm": 0.7412910461425781, + "learning_rate": 7.860677620768558e-06, + "loss": 0.7684, + "step": 11193 + }, + { + "epoch": 0.6161043535692663, + "grad_norm": 0.8567430377006531, + "learning_rate": 7.860322098430389e-06, + "loss": 0.8801, + "step": 11194 + }, + { + "epoch": 0.616159392371622, + "grad_norm": 0.7504804134368896, + "learning_rate": 7.859966554594802e-06, + "loss": 0.7359, + "step": 11195 + }, + { + "epoch": 0.6162144311739777, + "grad_norm": 0.7086803317070007, + "learning_rate": 7.859610989264474e-06, + "loss": 0.8498, + "step": 11196 + }, + { + "epoch": 0.6162694699763334, + "grad_norm": 0.7201757431030273, + "learning_rate": 7.859255402442075e-06, + "loss": 0.608, + "step": 11197 + }, + { + "epoch": 0.6163245087786889, + "grad_norm": 0.8968291282653809, + "learning_rate": 7.858899794130279e-06, + "loss": 0.8067, + "step": 11198 + }, + { + "epoch": 0.6163795475810446, + "grad_norm": 0.7474254965782166, + "learning_rate": 7.858544164331756e-06, + "loss": 0.8355, + "step": 11199 + }, + { + "epoch": 0.6164345863834003, + "grad_norm": 0.6907560229301453, + "learning_rate": 7.85818851304918e-06, + "loss": 0.788, + "step": 11200 + }, + { + "epoch": 0.616489625185756, + "grad_norm": 0.725330650806427, + "learning_rate": 7.857832840285224e-06, + "loss": 0.8157, + "step": 11201 + }, + { + "epoch": 0.6165446639881116, + "grad_norm": 0.682722270488739, + "learning_rate": 7.857477146042562e-06, + "loss": 0.7939, + "step": 11202 + }, + { + "epoch": 0.6165997027904673, + "grad_norm": 0.661533534526825, + "learning_rate": 7.857121430323866e-06, + "loss": 0.7173, + "step": 11203 + }, + { + "epoch": 0.616654741592823, + "grad_norm": 0.6922706961631775, + "learning_rate": 7.856765693131811e-06, + "loss": 0.7719, + "step": 11204 + }, + { + "epoch": 0.6167097803951787, + "grad_norm": 0.72809898853302, + "learning_rate": 7.856409934469071e-06, + "loss": 0.7362, + "step": 11205 + }, + { + "epoch": 0.6167648191975342, + "grad_norm": 0.7540956735610962, + "learning_rate": 7.856054154338317e-06, + "loss": 0.7883, + "step": 11206 + }, + { + "epoch": 0.6168198579998899, + "grad_norm": 0.6777094006538391, + "learning_rate": 7.855698352742224e-06, + "loss": 0.6938, + "step": 11207 + }, + { + "epoch": 0.6168748968022456, + "grad_norm": 0.6771852970123291, + "learning_rate": 7.855342529683467e-06, + "loss": 0.697, + "step": 11208 + }, + { + "epoch": 0.6169299356046013, + "grad_norm": 0.7810118198394775, + "learning_rate": 7.854986685164721e-06, + "loss": 0.6875, + "step": 11209 + }, + { + "epoch": 0.6169849744069569, + "grad_norm": 0.6992766261100769, + "learning_rate": 7.854630819188658e-06, + "loss": 0.6553, + "step": 11210 + }, + { + "epoch": 0.6170400132093126, + "grad_norm": 0.7409703135490417, + "learning_rate": 7.854274931757954e-06, + "loss": 0.7685, + "step": 11211 + }, + { + "epoch": 0.6170950520116683, + "grad_norm": 0.7263410687446594, + "learning_rate": 7.853919022875285e-06, + "loss": 0.7939, + "step": 11212 + }, + { + "epoch": 0.6171500908140238, + "grad_norm": 0.8451918959617615, + "learning_rate": 7.853563092543323e-06, + "loss": 0.7522, + "step": 11213 + }, + { + "epoch": 0.6172051296163795, + "grad_norm": 0.672926664352417, + "learning_rate": 7.853207140764745e-06, + "loss": 0.732, + "step": 11214 + }, + { + "epoch": 0.6172601684187352, + "grad_norm": 0.6607885956764221, + "learning_rate": 7.852851167542226e-06, + "loss": 0.7441, + "step": 11215 + }, + { + "epoch": 0.6173152072210909, + "grad_norm": 0.730385422706604, + "learning_rate": 7.85249517287844e-06, + "loss": 0.7925, + "step": 11216 + }, + { + "epoch": 0.6173702460234465, + "grad_norm": 0.7338821887969971, + "learning_rate": 7.852139156776067e-06, + "loss": 0.8106, + "step": 11217 + }, + { + "epoch": 0.6174252848258022, + "grad_norm": 0.7662163376808167, + "learning_rate": 7.851783119237777e-06, + "loss": 0.8166, + "step": 11218 + }, + { + "epoch": 0.6174803236281579, + "grad_norm": 0.7738409042358398, + "learning_rate": 7.85142706026625e-06, + "loss": 0.7898, + "step": 11219 + }, + { + "epoch": 0.6175353624305135, + "grad_norm": 0.8129978775978088, + "learning_rate": 7.851070979864159e-06, + "loss": 0.7618, + "step": 11220 + }, + { + "epoch": 0.6175904012328691, + "grad_norm": 0.7923482060432434, + "learning_rate": 7.850714878034183e-06, + "loss": 0.7341, + "step": 11221 + }, + { + "epoch": 0.6176454400352248, + "grad_norm": 0.7189306020736694, + "learning_rate": 7.850358754778996e-06, + "loss": 0.7775, + "step": 11222 + }, + { + "epoch": 0.6177004788375805, + "grad_norm": 0.9873724579811096, + "learning_rate": 7.850002610101276e-06, + "loss": 0.8521, + "step": 11223 + }, + { + "epoch": 0.6177555176399362, + "grad_norm": 0.6350038051605225, + "learning_rate": 7.8496464440037e-06, + "loss": 0.6356, + "step": 11224 + }, + { + "epoch": 0.6178105564422918, + "grad_norm": 0.8059771060943604, + "learning_rate": 7.849290256488941e-06, + "loss": 0.821, + "step": 11225 + }, + { + "epoch": 0.6178655952446475, + "grad_norm": 0.7469610571861267, + "learning_rate": 7.848934047559684e-06, + "loss": 0.7782, + "step": 11226 + }, + { + "epoch": 0.6179206340470031, + "grad_norm": 0.6423176527023315, + "learning_rate": 7.848577817218597e-06, + "loss": 0.6693, + "step": 11227 + }, + { + "epoch": 0.6179756728493588, + "grad_norm": 0.7298387885093689, + "learning_rate": 7.848221565468363e-06, + "loss": 0.775, + "step": 11228 + }, + { + "epoch": 0.6180307116517144, + "grad_norm": 0.7125145196914673, + "learning_rate": 7.84786529231166e-06, + "loss": 0.7507, + "step": 11229 + }, + { + "epoch": 0.6180857504540701, + "grad_norm": 0.6658627390861511, + "learning_rate": 7.847508997751163e-06, + "loss": 0.7506, + "step": 11230 + }, + { + "epoch": 0.6181407892564258, + "grad_norm": 0.6425275206565857, + "learning_rate": 7.847152681789549e-06, + "loss": 0.657, + "step": 11231 + }, + { + "epoch": 0.6181958280587815, + "grad_norm": 0.8075960278511047, + "learning_rate": 7.846796344429498e-06, + "loss": 0.5434, + "step": 11232 + }, + { + "epoch": 0.6182508668611371, + "grad_norm": 0.8481889367103577, + "learning_rate": 7.846439985673689e-06, + "loss": 0.8303, + "step": 11233 + }, + { + "epoch": 0.6183059056634927, + "grad_norm": 0.7216358184814453, + "learning_rate": 7.846083605524799e-06, + "loss": 0.7589, + "step": 11234 + }, + { + "epoch": 0.6183609444658484, + "grad_norm": 0.8399745225906372, + "learning_rate": 7.845727203985504e-06, + "loss": 0.8096, + "step": 11235 + }, + { + "epoch": 0.6184159832682041, + "grad_norm": 0.6708692908287048, + "learning_rate": 7.845370781058489e-06, + "loss": 0.6858, + "step": 11236 + }, + { + "epoch": 0.6184710220705597, + "grad_norm": 0.6309100389480591, + "learning_rate": 7.845014336746426e-06, + "loss": 0.6093, + "step": 11237 + }, + { + "epoch": 0.6185260608729154, + "grad_norm": 0.8138728141784668, + "learning_rate": 7.844657871051997e-06, + "loss": 0.8259, + "step": 11238 + }, + { + "epoch": 0.6185810996752711, + "grad_norm": 0.6763564348220825, + "learning_rate": 7.844301383977882e-06, + "loss": 0.7056, + "step": 11239 + }, + { + "epoch": 0.6186361384776268, + "grad_norm": 0.792085587978363, + "learning_rate": 7.843944875526758e-06, + "loss": 0.7364, + "step": 11240 + }, + { + "epoch": 0.6186911772799824, + "grad_norm": 0.8738027811050415, + "learning_rate": 7.843588345701306e-06, + "loss": 0.7092, + "step": 11241 + }, + { + "epoch": 0.618746216082338, + "grad_norm": 0.7694413065910339, + "learning_rate": 7.843231794504205e-06, + "loss": 0.852, + "step": 11242 + }, + { + "epoch": 0.6188012548846937, + "grad_norm": 0.8211640119552612, + "learning_rate": 7.842875221938135e-06, + "loss": 0.8218, + "step": 11243 + }, + { + "epoch": 0.6188562936870494, + "grad_norm": 0.620566189289093, + "learning_rate": 7.842518628005776e-06, + "loss": 0.7176, + "step": 11244 + }, + { + "epoch": 0.618911332489405, + "grad_norm": 0.7044099569320679, + "learning_rate": 7.84216201270981e-06, + "loss": 0.8068, + "step": 11245 + }, + { + "epoch": 0.6189663712917607, + "grad_norm": 0.765209436416626, + "learning_rate": 7.841805376052912e-06, + "loss": 0.8002, + "step": 11246 + }, + { + "epoch": 0.6190214100941164, + "grad_norm": 0.7565444707870483, + "learning_rate": 7.841448718037765e-06, + "loss": 0.7997, + "step": 11247 + }, + { + "epoch": 0.6190764488964721, + "grad_norm": 0.9544101357460022, + "learning_rate": 7.841092038667052e-06, + "loss": 0.647, + "step": 11248 + }, + { + "epoch": 0.6191314876988276, + "grad_norm": 0.7319634556770325, + "learning_rate": 7.840735337943452e-06, + "loss": 0.7982, + "step": 11249 + }, + { + "epoch": 0.6191865265011833, + "grad_norm": 0.6017479300498962, + "learning_rate": 7.840378615869645e-06, + "loss": 0.6817, + "step": 11250 + }, + { + "epoch": 0.619241565303539, + "grad_norm": 0.6936477422714233, + "learning_rate": 7.840021872448312e-06, + "loss": 0.7227, + "step": 11251 + }, + { + "epoch": 0.6192966041058947, + "grad_norm": 0.6962631940841675, + "learning_rate": 7.839665107682135e-06, + "loss": 0.779, + "step": 11252 + }, + { + "epoch": 0.6193516429082503, + "grad_norm": 0.9580947160720825, + "learning_rate": 7.839308321573797e-06, + "loss": 0.8821, + "step": 11253 + }, + { + "epoch": 0.619406681710606, + "grad_norm": 0.7721261978149414, + "learning_rate": 7.838951514125977e-06, + "loss": 0.7146, + "step": 11254 + }, + { + "epoch": 0.6194617205129617, + "grad_norm": 0.7349434494972229, + "learning_rate": 7.838594685341354e-06, + "loss": 0.7601, + "step": 11255 + }, + { + "epoch": 0.6195167593153172, + "grad_norm": 0.6787356734275818, + "learning_rate": 7.838237835222618e-06, + "loss": 0.706, + "step": 11256 + }, + { + "epoch": 0.6195717981176729, + "grad_norm": 0.7658288478851318, + "learning_rate": 7.837880963772445e-06, + "loss": 0.7102, + "step": 11257 + }, + { + "epoch": 0.6196268369200286, + "grad_norm": 0.8083927035331726, + "learning_rate": 7.837524070993516e-06, + "loss": 0.8501, + "step": 11258 + }, + { + "epoch": 0.6196818757223843, + "grad_norm": 0.7656283974647522, + "learning_rate": 7.837167156888516e-06, + "loss": 0.7558, + "step": 11259 + }, + { + "epoch": 0.6197369145247399, + "grad_norm": 0.7897886037826538, + "learning_rate": 7.836810221460128e-06, + "loss": 0.7567, + "step": 11260 + }, + { + "epoch": 0.6197919533270956, + "grad_norm": 0.6858190298080444, + "learning_rate": 7.836453264711035e-06, + "loss": 0.717, + "step": 11261 + }, + { + "epoch": 0.6198469921294513, + "grad_norm": 0.7423431873321533, + "learning_rate": 7.836096286643917e-06, + "loss": 0.7047, + "step": 11262 + }, + { + "epoch": 0.619902030931807, + "grad_norm": 0.8277921676635742, + "learning_rate": 7.835739287261458e-06, + "loss": 0.7418, + "step": 11263 + }, + { + "epoch": 0.6199570697341625, + "grad_norm": 0.7102510929107666, + "learning_rate": 7.835382266566343e-06, + "loss": 0.8202, + "step": 11264 + }, + { + "epoch": 0.6200121085365182, + "grad_norm": 0.6705429553985596, + "learning_rate": 7.835025224561252e-06, + "loss": 0.7332, + "step": 11265 + }, + { + "epoch": 0.6200671473388739, + "grad_norm": 0.6529950499534607, + "learning_rate": 7.834668161248873e-06, + "loss": 0.7579, + "step": 11266 + }, + { + "epoch": 0.6201221861412296, + "grad_norm": 0.7189938426017761, + "learning_rate": 7.834311076631885e-06, + "loss": 0.7323, + "step": 11267 + }, + { + "epoch": 0.6201772249435852, + "grad_norm": 0.6559470891952515, + "learning_rate": 7.833953970712973e-06, + "loss": 0.5973, + "step": 11268 + }, + { + "epoch": 0.6202322637459409, + "grad_norm": 0.7971723675727844, + "learning_rate": 7.833596843494824e-06, + "loss": 0.804, + "step": 11269 + }, + { + "epoch": 0.6202873025482966, + "grad_norm": 0.7800958752632141, + "learning_rate": 7.833239694980118e-06, + "loss": 0.772, + "step": 11270 + }, + { + "epoch": 0.6203423413506522, + "grad_norm": 0.6831466555595398, + "learning_rate": 7.83288252517154e-06, + "loss": 0.7341, + "step": 11271 + }, + { + "epoch": 0.6203973801530078, + "grad_norm": 0.6504807472229004, + "learning_rate": 7.832525334071776e-06, + "loss": 0.6462, + "step": 11272 + }, + { + "epoch": 0.6204524189553635, + "grad_norm": 0.6973552703857422, + "learning_rate": 7.832168121683512e-06, + "loss": 0.7504, + "step": 11273 + }, + { + "epoch": 0.6205074577577192, + "grad_norm": 0.6772480607032776, + "learning_rate": 7.831810888009427e-06, + "loss": 0.7273, + "step": 11274 + }, + { + "epoch": 0.6205624965600749, + "grad_norm": 0.7077416777610779, + "learning_rate": 7.831453633052212e-06, + "loss": 0.7365, + "step": 11275 + }, + { + "epoch": 0.6206175353624305, + "grad_norm": 0.7338337898254395, + "learning_rate": 7.831096356814548e-06, + "loss": 0.7959, + "step": 11276 + }, + { + "epoch": 0.6206725741647862, + "grad_norm": 0.6313255429267883, + "learning_rate": 7.830739059299123e-06, + "loss": 0.7027, + "step": 11277 + }, + { + "epoch": 0.6207276129671419, + "grad_norm": 0.7377570867538452, + "learning_rate": 7.830381740508619e-06, + "loss": 0.6903, + "step": 11278 + }, + { + "epoch": 0.6207826517694975, + "grad_norm": 0.6868650317192078, + "learning_rate": 7.830024400445724e-06, + "loss": 0.6882, + "step": 11279 + }, + { + "epoch": 0.6208376905718531, + "grad_norm": 0.7632661461830139, + "learning_rate": 7.829667039113124e-06, + "loss": 0.8437, + "step": 11280 + }, + { + "epoch": 0.6208927293742088, + "grad_norm": 0.9241608381271362, + "learning_rate": 7.829309656513504e-06, + "loss": 0.779, + "step": 11281 + }, + { + "epoch": 0.6209477681765645, + "grad_norm": 0.6857842206954956, + "learning_rate": 7.828952252649551e-06, + "loss": 0.7882, + "step": 11282 + }, + { + "epoch": 0.6210028069789202, + "grad_norm": 0.695659875869751, + "learning_rate": 7.828594827523947e-06, + "loss": 0.7471, + "step": 11283 + }, + { + "epoch": 0.6210578457812758, + "grad_norm": 0.6398521661758423, + "learning_rate": 7.828237381139383e-06, + "loss": 0.7328, + "step": 11284 + }, + { + "epoch": 0.6211128845836315, + "grad_norm": 0.7386063933372498, + "learning_rate": 7.827879913498544e-06, + "loss": 0.748, + "step": 11285 + }, + { + "epoch": 0.6211679233859871, + "grad_norm": 0.6740923523902893, + "learning_rate": 7.827522424604117e-06, + "loss": 0.6866, + "step": 11286 + }, + { + "epoch": 0.6212229621883428, + "grad_norm": 0.6794413924217224, + "learning_rate": 7.82716491445879e-06, + "loss": 0.7299, + "step": 11287 + }, + { + "epoch": 0.6212780009906984, + "grad_norm": 0.6471715569496155, + "learning_rate": 7.826807383065245e-06, + "loss": 0.7071, + "step": 11288 + }, + { + "epoch": 0.6213330397930541, + "grad_norm": 0.9716162085533142, + "learning_rate": 7.826449830426174e-06, + "loss": 0.7417, + "step": 11289 + }, + { + "epoch": 0.6213880785954098, + "grad_norm": 0.6928716897964478, + "learning_rate": 7.826092256544263e-06, + "loss": 0.7757, + "step": 11290 + }, + { + "epoch": 0.6214431173977655, + "grad_norm": 0.6739227175712585, + "learning_rate": 7.825734661422197e-06, + "loss": 0.7576, + "step": 11291 + }, + { + "epoch": 0.621498156200121, + "grad_norm": 1.2619935274124146, + "learning_rate": 7.825377045062668e-06, + "loss": 0.7454, + "step": 11292 + }, + { + "epoch": 0.6215531950024767, + "grad_norm": 0.6713572144508362, + "learning_rate": 7.825019407468361e-06, + "loss": 0.7916, + "step": 11293 + }, + { + "epoch": 0.6216082338048324, + "grad_norm": 0.6143541932106018, + "learning_rate": 7.824661748641964e-06, + "loss": 0.6765, + "step": 11294 + }, + { + "epoch": 0.6216632726071881, + "grad_norm": 0.7141658067703247, + "learning_rate": 7.824304068586163e-06, + "loss": 0.7773, + "step": 11295 + }, + { + "epoch": 0.6217183114095437, + "grad_norm": 0.7320290803909302, + "learning_rate": 7.823946367303653e-06, + "loss": 0.8062, + "step": 11296 + }, + { + "epoch": 0.6217733502118994, + "grad_norm": 0.7523403167724609, + "learning_rate": 7.823588644797115e-06, + "loss": 0.7126, + "step": 11297 + }, + { + "epoch": 0.6218283890142551, + "grad_norm": 0.6512221097946167, + "learning_rate": 7.823230901069242e-06, + "loss": 0.7563, + "step": 11298 + }, + { + "epoch": 0.6218834278166107, + "grad_norm": 0.6512733697891235, + "learning_rate": 7.82287313612272e-06, + "loss": 0.7603, + "step": 11299 + }, + { + "epoch": 0.6219384666189663, + "grad_norm": 1.0590927600860596, + "learning_rate": 7.82251534996024e-06, + "loss": 0.8325, + "step": 11300 + }, + { + "epoch": 0.621993505421322, + "grad_norm": 0.6763397455215454, + "learning_rate": 7.82215754258449e-06, + "loss": 0.7915, + "step": 11301 + }, + { + "epoch": 0.6220485442236777, + "grad_norm": 0.6640639901161194, + "learning_rate": 7.82179971399816e-06, + "loss": 0.6953, + "step": 11302 + }, + { + "epoch": 0.6221035830260333, + "grad_norm": 0.6611515283584595, + "learning_rate": 7.821441864203938e-06, + "loss": 0.8331, + "step": 11303 + }, + { + "epoch": 0.622158621828389, + "grad_norm": 0.8226057887077332, + "learning_rate": 7.821083993204514e-06, + "loss": 0.7448, + "step": 11304 + }, + { + "epoch": 0.6222136606307447, + "grad_norm": 0.6798059940338135, + "learning_rate": 7.820726101002578e-06, + "loss": 0.717, + "step": 11305 + }, + { + "epoch": 0.6222686994331004, + "grad_norm": 0.7623499631881714, + "learning_rate": 7.820368187600821e-06, + "loss": 0.7343, + "step": 11306 + }, + { + "epoch": 0.622323738235456, + "grad_norm": 0.703886866569519, + "learning_rate": 7.82001025300193e-06, + "loss": 0.8008, + "step": 11307 + }, + { + "epoch": 0.6223787770378116, + "grad_norm": 0.6817659735679626, + "learning_rate": 7.819652297208597e-06, + "loss": 0.7534, + "step": 11308 + }, + { + "epoch": 0.6224338158401673, + "grad_norm": 0.8991402983665466, + "learning_rate": 7.819294320223513e-06, + "loss": 0.6236, + "step": 11309 + }, + { + "epoch": 0.622488854642523, + "grad_norm": 0.791199803352356, + "learning_rate": 7.818936322049366e-06, + "loss": 0.772, + "step": 11310 + }, + { + "epoch": 0.6225438934448786, + "grad_norm": 0.6401470303535461, + "learning_rate": 7.81857830268885e-06, + "loss": 0.7749, + "step": 11311 + }, + { + "epoch": 0.6225989322472343, + "grad_norm": 0.6731516122817993, + "learning_rate": 7.818220262144653e-06, + "loss": 0.7506, + "step": 11312 + }, + { + "epoch": 0.62265397104959, + "grad_norm": 0.7391661405563354, + "learning_rate": 7.817862200419467e-06, + "loss": 0.7288, + "step": 11313 + }, + { + "epoch": 0.6227090098519457, + "grad_norm": 0.7363784909248352, + "learning_rate": 7.817504117515984e-06, + "loss": 0.7087, + "step": 11314 + }, + { + "epoch": 0.6227640486543012, + "grad_norm": 0.7609296441078186, + "learning_rate": 7.817146013436893e-06, + "loss": 0.7553, + "step": 11315 + }, + { + "epoch": 0.6228190874566569, + "grad_norm": 0.6818829774856567, + "learning_rate": 7.816787888184886e-06, + "loss": 0.7534, + "step": 11316 + }, + { + "epoch": 0.6228741262590126, + "grad_norm": 0.7434844374656677, + "learning_rate": 7.816429741762657e-06, + "loss": 0.8008, + "step": 11317 + }, + { + "epoch": 0.6229291650613683, + "grad_norm": 0.6881742477416992, + "learning_rate": 7.816071574172895e-06, + "loss": 0.7324, + "step": 11318 + }, + { + "epoch": 0.6229842038637239, + "grad_norm": 0.7109540104866028, + "learning_rate": 7.815713385418293e-06, + "loss": 0.7954, + "step": 11319 + }, + { + "epoch": 0.6230392426660796, + "grad_norm": 0.6868860721588135, + "learning_rate": 7.815355175501542e-06, + "loss": 0.6703, + "step": 11320 + }, + { + "epoch": 0.6230942814684353, + "grad_norm": 0.7851449847221375, + "learning_rate": 7.814996944425337e-06, + "loss": 0.8321, + "step": 11321 + }, + { + "epoch": 0.623149320270791, + "grad_norm": 0.7966809272766113, + "learning_rate": 7.814638692192367e-06, + "loss": 0.7603, + "step": 11322 + }, + { + "epoch": 0.6232043590731465, + "grad_norm": 0.6612964272499084, + "learning_rate": 7.814280418805327e-06, + "loss": 0.8096, + "step": 11323 + }, + { + "epoch": 0.6232593978755022, + "grad_norm": 0.6398881077766418, + "learning_rate": 7.813922124266908e-06, + "loss": 0.7559, + "step": 11324 + }, + { + "epoch": 0.6233144366778579, + "grad_norm": 0.8062521815299988, + "learning_rate": 7.813563808579804e-06, + "loss": 0.7863, + "step": 11325 + }, + { + "epoch": 0.6233694754802136, + "grad_norm": 0.7083317041397095, + "learning_rate": 7.813205471746708e-06, + "loss": 0.7358, + "step": 11326 + }, + { + "epoch": 0.6234245142825692, + "grad_norm": 0.6190419793128967, + "learning_rate": 7.812847113770312e-06, + "loss": 0.637, + "step": 11327 + }, + { + "epoch": 0.6234795530849249, + "grad_norm": 0.7036548256874084, + "learning_rate": 7.812488734653309e-06, + "loss": 0.8049, + "step": 11328 + }, + { + "epoch": 0.6235345918872806, + "grad_norm": 0.7952288389205933, + "learning_rate": 7.812130334398395e-06, + "loss": 0.781, + "step": 11329 + }, + { + "epoch": 0.6235896306896362, + "grad_norm": 0.7925593852996826, + "learning_rate": 7.811771913008262e-06, + "loss": 0.7913, + "step": 11330 + }, + { + "epoch": 0.6236446694919918, + "grad_norm": 0.7190900444984436, + "learning_rate": 7.811413470485604e-06, + "loss": 0.7464, + "step": 11331 + }, + { + "epoch": 0.6236997082943475, + "grad_norm": 0.6476338505744934, + "learning_rate": 7.811055006833114e-06, + "loss": 0.699, + "step": 11332 + }, + { + "epoch": 0.6237547470967032, + "grad_norm": 0.7412729263305664, + "learning_rate": 7.810696522053487e-06, + "loss": 0.7958, + "step": 11333 + }, + { + "epoch": 0.6238097858990589, + "grad_norm": 0.6646767854690552, + "learning_rate": 7.81033801614942e-06, + "loss": 0.6276, + "step": 11334 + }, + { + "epoch": 0.6238648247014145, + "grad_norm": 0.6912583112716675, + "learning_rate": 7.809979489123601e-06, + "loss": 0.7611, + "step": 11335 + }, + { + "epoch": 0.6239198635037702, + "grad_norm": 0.7324331998825073, + "learning_rate": 7.80962094097873e-06, + "loss": 0.7436, + "step": 11336 + }, + { + "epoch": 0.6239749023061258, + "grad_norm": 0.7046643495559692, + "learning_rate": 7.809262371717501e-06, + "loss": 0.7287, + "step": 11337 + }, + { + "epoch": 0.6240299411084815, + "grad_norm": 0.6013771891593933, + "learning_rate": 7.808903781342607e-06, + "loss": 0.6822, + "step": 11338 + }, + { + "epoch": 0.6240849799108371, + "grad_norm": 0.633074164390564, + "learning_rate": 7.808545169856745e-06, + "loss": 0.7758, + "step": 11339 + }, + { + "epoch": 0.6241400187131928, + "grad_norm": 0.6603411436080933, + "learning_rate": 7.808186537262608e-06, + "loss": 0.6797, + "step": 11340 + }, + { + "epoch": 0.6241950575155485, + "grad_norm": 0.8316327929496765, + "learning_rate": 7.807827883562894e-06, + "loss": 0.777, + "step": 11341 + }, + { + "epoch": 0.6242500963179041, + "grad_norm": 0.7954252362251282, + "learning_rate": 7.807469208760295e-06, + "loss": 0.6581, + "step": 11342 + }, + { + "epoch": 0.6243051351202598, + "grad_norm": 0.6108134984970093, + "learning_rate": 7.80711051285751e-06, + "loss": 0.7126, + "step": 11343 + }, + { + "epoch": 0.6243601739226154, + "grad_norm": 0.7224909067153931, + "learning_rate": 7.806751795857235e-06, + "loss": 0.8677, + "step": 11344 + }, + { + "epoch": 0.6244152127249711, + "grad_norm": 0.720923125743866, + "learning_rate": 7.806393057762165e-06, + "loss": 0.7174, + "step": 11345 + }, + { + "epoch": 0.6244702515273267, + "grad_norm": 0.6837444305419922, + "learning_rate": 7.806034298574993e-06, + "loss": 0.7431, + "step": 11346 + }, + { + "epoch": 0.6245252903296824, + "grad_norm": 0.8486534953117371, + "learning_rate": 7.80567551829842e-06, + "loss": 0.7955, + "step": 11347 + }, + { + "epoch": 0.6245803291320381, + "grad_norm": 0.6459395885467529, + "learning_rate": 7.805316716935143e-06, + "loss": 0.7681, + "step": 11348 + }, + { + "epoch": 0.6246353679343938, + "grad_norm": 0.8414636850357056, + "learning_rate": 7.804957894487854e-06, + "loss": 0.8985, + "step": 11349 + }, + { + "epoch": 0.6246904067367494, + "grad_norm": 0.7930828928947449, + "learning_rate": 7.804599050959254e-06, + "loss": 0.7389, + "step": 11350 + }, + { + "epoch": 0.624745445539105, + "grad_norm": 0.7102516889572144, + "learning_rate": 7.804240186352038e-06, + "loss": 0.8072, + "step": 11351 + }, + { + "epoch": 0.6248004843414607, + "grad_norm": 0.773341178894043, + "learning_rate": 7.803881300668901e-06, + "loss": 0.7531, + "step": 11352 + }, + { + "epoch": 0.6248555231438164, + "grad_norm": 0.6354981064796448, + "learning_rate": 7.803522393912544e-06, + "loss": 0.6761, + "step": 11353 + }, + { + "epoch": 0.624910561946172, + "grad_norm": 0.7833859324455261, + "learning_rate": 7.803163466085663e-06, + "loss": 0.7768, + "step": 11354 + }, + { + "epoch": 0.6249656007485277, + "grad_norm": 0.6982376575469971, + "learning_rate": 7.802804517190957e-06, + "loss": 0.7472, + "step": 11355 + }, + { + "epoch": 0.6250206395508834, + "grad_norm": 0.7214694023132324, + "learning_rate": 7.80244554723112e-06, + "loss": 0.7919, + "step": 11356 + }, + { + "epoch": 0.6250756783532391, + "grad_norm": 0.8002933859825134, + "learning_rate": 7.802086556208855e-06, + "loss": 0.8278, + "step": 11357 + }, + { + "epoch": 0.6251307171555947, + "grad_norm": 0.7619680762290955, + "learning_rate": 7.801727544126858e-06, + "loss": 0.7775, + "step": 11358 + }, + { + "epoch": 0.6251857559579503, + "grad_norm": 0.6340392827987671, + "learning_rate": 7.801368510987825e-06, + "loss": 0.7324, + "step": 11359 + }, + { + "epoch": 0.625240794760306, + "grad_norm": 0.6754844784736633, + "learning_rate": 7.801009456794457e-06, + "loss": 0.7296, + "step": 11360 + }, + { + "epoch": 0.6252958335626617, + "grad_norm": 0.6871771216392517, + "learning_rate": 7.80065038154945e-06, + "loss": 0.7398, + "step": 11361 + }, + { + "epoch": 0.6253508723650173, + "grad_norm": 0.6610772013664246, + "learning_rate": 7.800291285255505e-06, + "loss": 0.738, + "step": 11362 + }, + { + "epoch": 0.625405911167373, + "grad_norm": 0.6858081221580505, + "learning_rate": 7.799932167915322e-06, + "loss": 0.7353, + "step": 11363 + }, + { + "epoch": 0.6254609499697287, + "grad_norm": 0.6698840856552124, + "learning_rate": 7.799573029531597e-06, + "loss": 0.7505, + "step": 11364 + }, + { + "epoch": 0.6255159887720844, + "grad_norm": 0.7374000549316406, + "learning_rate": 7.799213870107031e-06, + "loss": 0.7974, + "step": 11365 + }, + { + "epoch": 0.6255710275744399, + "grad_norm": 0.6962621808052063, + "learning_rate": 7.798854689644324e-06, + "loss": 0.8183, + "step": 11366 + }, + { + "epoch": 0.6256260663767956, + "grad_norm": 0.8477681279182434, + "learning_rate": 7.798495488146173e-06, + "loss": 0.7533, + "step": 11367 + }, + { + "epoch": 0.6256811051791513, + "grad_norm": 0.6963459253311157, + "learning_rate": 7.798136265615278e-06, + "loss": 0.6362, + "step": 11368 + }, + { + "epoch": 0.625736143981507, + "grad_norm": 0.7125601172447205, + "learning_rate": 7.79777702205434e-06, + "loss": 0.7296, + "step": 11369 + }, + { + "epoch": 0.6257911827838626, + "grad_norm": 0.6650554537773132, + "learning_rate": 7.79741775746606e-06, + "loss": 0.8231, + "step": 11370 + }, + { + "epoch": 0.6258462215862183, + "grad_norm": 0.6556620597839355, + "learning_rate": 7.797058471853138e-06, + "loss": 0.6952, + "step": 11371 + }, + { + "epoch": 0.625901260388574, + "grad_norm": 0.6350956559181213, + "learning_rate": 7.79669916521827e-06, + "loss": 0.686, + "step": 11372 + }, + { + "epoch": 0.6259562991909297, + "grad_norm": 0.6346702575683594, + "learning_rate": 7.796339837564163e-06, + "loss": 0.7234, + "step": 11373 + }, + { + "epoch": 0.6260113379932852, + "grad_norm": 0.741437554359436, + "learning_rate": 7.795980488893514e-06, + "loss": 0.8096, + "step": 11374 + }, + { + "epoch": 0.6260663767956409, + "grad_norm": 0.7057582139968872, + "learning_rate": 7.795621119209021e-06, + "loss": 0.8022, + "step": 11375 + }, + { + "epoch": 0.6261214155979966, + "grad_norm": 0.658107578754425, + "learning_rate": 7.79526172851339e-06, + "loss": 0.7564, + "step": 11376 + }, + { + "epoch": 0.6261764544003523, + "grad_norm": 0.7974086403846741, + "learning_rate": 7.79490231680932e-06, + "loss": 0.7721, + "step": 11377 + }, + { + "epoch": 0.6262314932027079, + "grad_norm": 0.6669130921363831, + "learning_rate": 7.794542884099513e-06, + "loss": 0.7652, + "step": 11378 + }, + { + "epoch": 0.6262865320050636, + "grad_norm": 0.7364919185638428, + "learning_rate": 7.794183430386669e-06, + "loss": 0.8679, + "step": 11379 + }, + { + "epoch": 0.6263415708074193, + "grad_norm": 0.7383667230606079, + "learning_rate": 7.793823955673489e-06, + "loss": 0.7715, + "step": 11380 + }, + { + "epoch": 0.626396609609775, + "grad_norm": 0.6688774228096008, + "learning_rate": 7.793464459962679e-06, + "loss": 0.7503, + "step": 11381 + }, + { + "epoch": 0.6264516484121305, + "grad_norm": 0.6771709322929382, + "learning_rate": 7.793104943256935e-06, + "loss": 0.7479, + "step": 11382 + }, + { + "epoch": 0.6265066872144862, + "grad_norm": 0.7121349573135376, + "learning_rate": 7.792745405558964e-06, + "loss": 0.7655, + "step": 11383 + }, + { + "epoch": 0.6265617260168419, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.792385846871465e-06, + "loss": 0.7418, + "step": 11384 + }, + { + "epoch": 0.6266167648191975, + "grad_norm": 0.6701569557189941, + "learning_rate": 7.792026267197142e-06, + "loss": 0.7669, + "step": 11385 + }, + { + "epoch": 0.6266718036215532, + "grad_norm": 0.6890652179718018, + "learning_rate": 7.791666666538697e-06, + "loss": 0.7659, + "step": 11386 + }, + { + "epoch": 0.6267268424239089, + "grad_norm": 0.7636297345161438, + "learning_rate": 7.791307044898833e-06, + "loss": 0.7272, + "step": 11387 + }, + { + "epoch": 0.6267818812262645, + "grad_norm": 0.6563602089881897, + "learning_rate": 7.790947402280252e-06, + "loss": 0.7603, + "step": 11388 + }, + { + "epoch": 0.6268369200286201, + "grad_norm": 0.7252678275108337, + "learning_rate": 7.790587738685655e-06, + "loss": 0.7789, + "step": 11389 + }, + { + "epoch": 0.6268919588309758, + "grad_norm": 0.6703618764877319, + "learning_rate": 7.79022805411775e-06, + "loss": 0.6883, + "step": 11390 + }, + { + "epoch": 0.6269469976333315, + "grad_norm": 0.7165848612785339, + "learning_rate": 7.789868348579239e-06, + "loss": 0.7944, + "step": 11391 + }, + { + "epoch": 0.6270020364356872, + "grad_norm": 0.9325329065322876, + "learning_rate": 7.789508622072822e-06, + "loss": 0.9059, + "step": 11392 + }, + { + "epoch": 0.6270570752380428, + "grad_norm": 0.6875555515289307, + "learning_rate": 7.789148874601204e-06, + "loss": 0.7115, + "step": 11393 + }, + { + "epoch": 0.6271121140403985, + "grad_norm": 0.6470181941986084, + "learning_rate": 7.788789106167093e-06, + "loss": 0.7603, + "step": 11394 + }, + { + "epoch": 0.6271671528427541, + "grad_norm": 0.688685417175293, + "learning_rate": 7.788429316773188e-06, + "loss": 0.8397, + "step": 11395 + }, + { + "epoch": 0.6272221916451098, + "grad_norm": 0.6299887895584106, + "learning_rate": 7.788069506422193e-06, + "loss": 0.7026, + "step": 11396 + }, + { + "epoch": 0.6272772304474654, + "grad_norm": 0.8046191930770874, + "learning_rate": 7.787709675116817e-06, + "loss": 0.8573, + "step": 11397 + }, + { + "epoch": 0.6273322692498211, + "grad_norm": 0.6700685620307922, + "learning_rate": 7.78734982285976e-06, + "loss": 0.7225, + "step": 11398 + }, + { + "epoch": 0.6273873080521768, + "grad_norm": 0.6968538761138916, + "learning_rate": 7.786989949653726e-06, + "loss": 0.6571, + "step": 11399 + }, + { + "epoch": 0.6274423468545325, + "grad_norm": 0.6857314705848694, + "learning_rate": 7.786630055501425e-06, + "loss": 0.8131, + "step": 11400 + }, + { + "epoch": 0.6274973856568881, + "grad_norm": 0.702316403388977, + "learning_rate": 7.786270140405557e-06, + "loss": 0.7222, + "step": 11401 + }, + { + "epoch": 0.6275524244592438, + "grad_norm": 0.6987283825874329, + "learning_rate": 7.785910204368827e-06, + "loss": 0.7171, + "step": 11402 + }, + { + "epoch": 0.6276074632615994, + "grad_norm": 0.6835529208183289, + "learning_rate": 7.785550247393943e-06, + "loss": 0.8077, + "step": 11403 + }, + { + "epoch": 0.6276625020639551, + "grad_norm": 0.6423392295837402, + "learning_rate": 7.785190269483609e-06, + "loss": 0.6689, + "step": 11404 + }, + { + "epoch": 0.6277175408663107, + "grad_norm": 0.6995517611503601, + "learning_rate": 7.78483027064053e-06, + "loss": 0.7417, + "step": 11405 + }, + { + "epoch": 0.6277725796686664, + "grad_norm": 0.6639729142189026, + "learning_rate": 7.784470250867413e-06, + "loss": 0.6521, + "step": 11406 + }, + { + "epoch": 0.6278276184710221, + "grad_norm": 0.7280262112617493, + "learning_rate": 7.784110210166961e-06, + "loss": 0.7686, + "step": 11407 + }, + { + "epoch": 0.6278826572733778, + "grad_norm": 0.6741863489151001, + "learning_rate": 7.783750148541884e-06, + "loss": 0.7794, + "step": 11408 + }, + { + "epoch": 0.6279376960757334, + "grad_norm": 0.8160151243209839, + "learning_rate": 7.783390065994885e-06, + "loss": 0.7065, + "step": 11409 + }, + { + "epoch": 0.627992734878089, + "grad_norm": 0.7288973927497864, + "learning_rate": 7.783029962528672e-06, + "loss": 0.8337, + "step": 11410 + }, + { + "epoch": 0.6280477736804447, + "grad_norm": 0.7764643430709839, + "learning_rate": 7.782669838145952e-06, + "loss": 0.8812, + "step": 11411 + }, + { + "epoch": 0.6281028124828004, + "grad_norm": 0.8145303130149841, + "learning_rate": 7.782309692849425e-06, + "loss": 0.9206, + "step": 11412 + }, + { + "epoch": 0.628157851285156, + "grad_norm": 0.6883288621902466, + "learning_rate": 7.781949526641808e-06, + "loss": 0.7779, + "step": 11413 + }, + { + "epoch": 0.6282128900875117, + "grad_norm": 0.7281043529510498, + "learning_rate": 7.781589339525803e-06, + "loss": 0.7933, + "step": 11414 + }, + { + "epoch": 0.6282679288898674, + "grad_norm": 0.7998347878456116, + "learning_rate": 7.781229131504115e-06, + "loss": 0.8772, + "step": 11415 + }, + { + "epoch": 0.6283229676922231, + "grad_norm": 0.7591177225112915, + "learning_rate": 7.780868902579455e-06, + "loss": 0.9054, + "step": 11416 + }, + { + "epoch": 0.6283780064945786, + "grad_norm": 0.7209650278091431, + "learning_rate": 7.780508652754528e-06, + "loss": 0.7781, + "step": 11417 + }, + { + "epoch": 0.6284330452969343, + "grad_norm": 1.2373511791229248, + "learning_rate": 7.780148382032042e-06, + "loss": 0.7501, + "step": 11418 + }, + { + "epoch": 0.62848808409929, + "grad_norm": 0.6281551122665405, + "learning_rate": 7.779788090414704e-06, + "loss": 0.8122, + "step": 11419 + }, + { + "epoch": 0.6285431229016457, + "grad_norm": 0.6954115629196167, + "learning_rate": 7.779427777905224e-06, + "loss": 0.7815, + "step": 11420 + }, + { + "epoch": 0.6285981617040013, + "grad_norm": 0.727043628692627, + "learning_rate": 7.77906744450631e-06, + "loss": 0.7116, + "step": 11421 + }, + { + "epoch": 0.628653200506357, + "grad_norm": 0.6979809403419495, + "learning_rate": 7.778707090220667e-06, + "loss": 0.7707, + "step": 11422 + }, + { + "epoch": 0.6287082393087127, + "grad_norm": 0.6851169466972351, + "learning_rate": 7.778346715051006e-06, + "loss": 0.811, + "step": 11423 + }, + { + "epoch": 0.6287632781110684, + "grad_norm": 0.70259028673172, + "learning_rate": 7.777986319000036e-06, + "loss": 0.7766, + "step": 11424 + }, + { + "epoch": 0.6288183169134239, + "grad_norm": 0.7436364889144897, + "learning_rate": 7.777625902070463e-06, + "loss": 0.8449, + "step": 11425 + }, + { + "epoch": 0.6288733557157796, + "grad_norm": 0.6452080607414246, + "learning_rate": 7.777265464264998e-06, + "loss": 0.7138, + "step": 11426 + }, + { + "epoch": 0.6289283945181353, + "grad_norm": 0.6329460144042969, + "learning_rate": 7.776905005586349e-06, + "loss": 0.6482, + "step": 11427 + }, + { + "epoch": 0.6289834333204909, + "grad_norm": 0.7521186470985413, + "learning_rate": 7.776544526037225e-06, + "loss": 0.751, + "step": 11428 + }, + { + "epoch": 0.6290384721228466, + "grad_norm": 0.7105319499969482, + "learning_rate": 7.776184025620334e-06, + "loss": 0.843, + "step": 11429 + }, + { + "epoch": 0.6290935109252023, + "grad_norm": 0.7329964637756348, + "learning_rate": 7.77582350433839e-06, + "loss": 0.6992, + "step": 11430 + }, + { + "epoch": 0.629148549727558, + "grad_norm": 0.7492092847824097, + "learning_rate": 7.775462962194098e-06, + "loss": 0.7579, + "step": 11431 + }, + { + "epoch": 0.6292035885299135, + "grad_norm": 0.7332866191864014, + "learning_rate": 7.77510239919017e-06, + "loss": 0.7758, + "step": 11432 + }, + { + "epoch": 0.6292586273322692, + "grad_norm": 0.7532867193222046, + "learning_rate": 7.774741815329315e-06, + "loss": 0.8157, + "step": 11433 + }, + { + "epoch": 0.6293136661346249, + "grad_norm": 0.7498316168785095, + "learning_rate": 7.774381210614244e-06, + "loss": 0.7671, + "step": 11434 + }, + { + "epoch": 0.6293687049369806, + "grad_norm": 0.8017444610595703, + "learning_rate": 7.774020585047666e-06, + "loss": 0.6989, + "step": 11435 + }, + { + "epoch": 0.6294237437393362, + "grad_norm": 0.7827737927436829, + "learning_rate": 7.77365993863229e-06, + "loss": 0.852, + "step": 11436 + }, + { + "epoch": 0.6294787825416919, + "grad_norm": 1.1411668062210083, + "learning_rate": 7.77329927137083e-06, + "loss": 0.9303, + "step": 11437 + }, + { + "epoch": 0.6295338213440476, + "grad_norm": 1.2931067943572998, + "learning_rate": 7.772938583265995e-06, + "loss": 0.8913, + "step": 11438 + }, + { + "epoch": 0.6295888601464033, + "grad_norm": 0.7407616376876831, + "learning_rate": 7.772577874320494e-06, + "loss": 0.9247, + "step": 11439 + }, + { + "epoch": 0.6296438989487588, + "grad_norm": 0.6544716954231262, + "learning_rate": 7.772217144537043e-06, + "loss": 0.7879, + "step": 11440 + }, + { + "epoch": 0.6296989377511145, + "grad_norm": 0.7467932105064392, + "learning_rate": 7.77185639391835e-06, + "loss": 0.7624, + "step": 11441 + }, + { + "epoch": 0.6297539765534702, + "grad_norm": 0.6845136880874634, + "learning_rate": 7.771495622467123e-06, + "loss": 0.691, + "step": 11442 + }, + { + "epoch": 0.6298090153558259, + "grad_norm": 0.7881575226783752, + "learning_rate": 7.771134830186079e-06, + "loss": 0.7567, + "step": 11443 + }, + { + "epoch": 0.6298640541581815, + "grad_norm": 0.6910528540611267, + "learning_rate": 7.770774017077928e-06, + "loss": 0.7527, + "step": 11444 + }, + { + "epoch": 0.6299190929605372, + "grad_norm": 0.7395550608634949, + "learning_rate": 7.770413183145379e-06, + "loss": 0.8288, + "step": 11445 + }, + { + "epoch": 0.6299741317628929, + "grad_norm": 0.6876364350318909, + "learning_rate": 7.770052328391147e-06, + "loss": 0.7759, + "step": 11446 + }, + { + "epoch": 0.6300291705652485, + "grad_norm": 0.7936999201774597, + "learning_rate": 7.769691452817945e-06, + "loss": 0.6885, + "step": 11447 + }, + { + "epoch": 0.6300842093676041, + "grad_norm": 0.721479058265686, + "learning_rate": 7.769330556428482e-06, + "loss": 0.7215, + "step": 11448 + }, + { + "epoch": 0.6301392481699598, + "grad_norm": 0.6549312472343445, + "learning_rate": 7.76896963922547e-06, + "loss": 0.7523, + "step": 11449 + }, + { + "epoch": 0.6301942869723155, + "grad_norm": 0.6684648394584656, + "learning_rate": 7.768608701211627e-06, + "loss": 0.768, + "step": 11450 + }, + { + "epoch": 0.6302493257746712, + "grad_norm": 0.7014286518096924, + "learning_rate": 7.76824774238966e-06, + "loss": 0.7534, + "step": 11451 + }, + { + "epoch": 0.6303043645770268, + "grad_norm": 0.9186445474624634, + "learning_rate": 7.767886762762284e-06, + "loss": 0.8398, + "step": 11452 + }, + { + "epoch": 0.6303594033793825, + "grad_norm": 0.787187933921814, + "learning_rate": 7.76752576233221e-06, + "loss": 0.8035, + "step": 11453 + }, + { + "epoch": 0.6304144421817381, + "grad_norm": 0.7471121549606323, + "learning_rate": 7.767164741102157e-06, + "loss": 0.7983, + "step": 11454 + }, + { + "epoch": 0.6304694809840938, + "grad_norm": 0.6810591816902161, + "learning_rate": 7.766803699074834e-06, + "loss": 0.7132, + "step": 11455 + }, + { + "epoch": 0.6305245197864494, + "grad_norm": 0.7154163122177124, + "learning_rate": 7.766442636252953e-06, + "loss": 0.7942, + "step": 11456 + }, + { + "epoch": 0.6305795585888051, + "grad_norm": 0.6990880966186523, + "learning_rate": 7.766081552639231e-06, + "loss": 0.7296, + "step": 11457 + }, + { + "epoch": 0.6306345973911608, + "grad_norm": 0.8848066926002502, + "learning_rate": 7.76572044823638e-06, + "loss": 0.621, + "step": 11458 + }, + { + "epoch": 0.6306896361935165, + "grad_norm": 0.6929910182952881, + "learning_rate": 7.765359323047116e-06, + "loss": 0.5917, + "step": 11459 + }, + { + "epoch": 0.6307446749958721, + "grad_norm": 0.6874505281448364, + "learning_rate": 7.764998177074149e-06, + "loss": 0.7244, + "step": 11460 + }, + { + "epoch": 0.6307997137982277, + "grad_norm": 0.6823066473007202, + "learning_rate": 7.764637010320197e-06, + "loss": 0.7299, + "step": 11461 + }, + { + "epoch": 0.6308547526005834, + "grad_norm": 0.7315061688423157, + "learning_rate": 7.764275822787972e-06, + "loss": 0.7759, + "step": 11462 + }, + { + "epoch": 0.6309097914029391, + "grad_norm": 0.6186662316322327, + "learning_rate": 7.763914614480192e-06, + "loss": 0.6746, + "step": 11463 + }, + { + "epoch": 0.6309648302052947, + "grad_norm": 0.6751530170440674, + "learning_rate": 7.763553385399569e-06, + "loss": 0.8371, + "step": 11464 + }, + { + "epoch": 0.6310198690076504, + "grad_norm": 1.0283396244049072, + "learning_rate": 7.763192135548818e-06, + "loss": 0.7743, + "step": 11465 + }, + { + "epoch": 0.6310749078100061, + "grad_norm": 0.7695029973983765, + "learning_rate": 7.762830864930655e-06, + "loss": 0.7387, + "step": 11466 + }, + { + "epoch": 0.6311299466123618, + "grad_norm": 0.8087024688720703, + "learning_rate": 7.762469573547795e-06, + "loss": 0.8357, + "step": 11467 + }, + { + "epoch": 0.6311849854147173, + "grad_norm": 0.9203382134437561, + "learning_rate": 7.762108261402951e-06, + "loss": 0.8191, + "step": 11468 + }, + { + "epoch": 0.631240024217073, + "grad_norm": 0.6569168567657471, + "learning_rate": 7.761746928498843e-06, + "loss": 0.7035, + "step": 11469 + }, + { + "epoch": 0.6312950630194287, + "grad_norm": 0.7903677225112915, + "learning_rate": 7.761385574838183e-06, + "loss": 0.8295, + "step": 11470 + }, + { + "epoch": 0.6313501018217843, + "grad_norm": 0.6780279278755188, + "learning_rate": 7.76102420042369e-06, + "loss": 0.6497, + "step": 11471 + }, + { + "epoch": 0.63140514062414, + "grad_norm": 0.7150516510009766, + "learning_rate": 7.760662805258076e-06, + "loss": 0.7979, + "step": 11472 + }, + { + "epoch": 0.6314601794264957, + "grad_norm": 0.7278215885162354, + "learning_rate": 7.760301389344061e-06, + "loss": 0.8503, + "step": 11473 + }, + { + "epoch": 0.6315152182288514, + "grad_norm": 0.8695063591003418, + "learning_rate": 7.75993995268436e-06, + "loss": 0.7796, + "step": 11474 + }, + { + "epoch": 0.631570257031207, + "grad_norm": 0.7154332399368286, + "learning_rate": 7.759578495281688e-06, + "loss": 0.725, + "step": 11475 + }, + { + "epoch": 0.6316252958335626, + "grad_norm": 0.7151778936386108, + "learning_rate": 7.759217017138763e-06, + "loss": 0.6932, + "step": 11476 + }, + { + "epoch": 0.6316803346359183, + "grad_norm": 0.6328319311141968, + "learning_rate": 7.758855518258301e-06, + "loss": 0.7382, + "step": 11477 + }, + { + "epoch": 0.631735373438274, + "grad_norm": 0.8377438187599182, + "learning_rate": 7.75849399864302e-06, + "loss": 0.7782, + "step": 11478 + }, + { + "epoch": 0.6317904122406296, + "grad_norm": 0.6654751896858215, + "learning_rate": 7.758132458295637e-06, + "loss": 0.8076, + "step": 11479 + }, + { + "epoch": 0.6318454510429853, + "grad_norm": 0.6841873526573181, + "learning_rate": 7.757770897218869e-06, + "loss": 0.7195, + "step": 11480 + }, + { + "epoch": 0.631900489845341, + "grad_norm": 0.7791223526000977, + "learning_rate": 7.757409315415431e-06, + "loss": 0.7858, + "step": 11481 + }, + { + "epoch": 0.6319555286476967, + "grad_norm": 0.6412019729614258, + "learning_rate": 7.757047712888044e-06, + "loss": 0.6853, + "step": 11482 + }, + { + "epoch": 0.6320105674500522, + "grad_norm": 0.7058777213096619, + "learning_rate": 7.756686089639425e-06, + "loss": 0.8955, + "step": 11483 + }, + { + "epoch": 0.6320656062524079, + "grad_norm": 0.6950271725654602, + "learning_rate": 7.75632444567229e-06, + "loss": 0.7213, + "step": 11484 + }, + { + "epoch": 0.6321206450547636, + "grad_norm": 0.6938642859458923, + "learning_rate": 7.755962780989359e-06, + "loss": 0.749, + "step": 11485 + }, + { + "epoch": 0.6321756838571193, + "grad_norm": 4.447030544281006, + "learning_rate": 7.755601095593348e-06, + "loss": 0.7603, + "step": 11486 + }, + { + "epoch": 0.6322307226594749, + "grad_norm": 0.6693708896636963, + "learning_rate": 7.755239389486979e-06, + "loss": 0.769, + "step": 11487 + }, + { + "epoch": 0.6322857614618306, + "grad_norm": 0.830352246761322, + "learning_rate": 7.754877662672968e-06, + "loss": 0.8069, + "step": 11488 + }, + { + "epoch": 0.6323408002641863, + "grad_norm": 0.7211840748786926, + "learning_rate": 7.754515915154033e-06, + "loss": 0.7972, + "step": 11489 + }, + { + "epoch": 0.632395839066542, + "grad_norm": 0.723101019859314, + "learning_rate": 7.754154146932893e-06, + "loss": 0.7385, + "step": 11490 + }, + { + "epoch": 0.6324508778688975, + "grad_norm": 0.6515377759933472, + "learning_rate": 7.75379235801227e-06, + "loss": 0.7527, + "step": 11491 + }, + { + "epoch": 0.6325059166712532, + "grad_norm": 0.6296554803848267, + "learning_rate": 7.75343054839488e-06, + "loss": 0.7135, + "step": 11492 + }, + { + "epoch": 0.6325609554736089, + "grad_norm": 0.8153911232948303, + "learning_rate": 7.753068718083441e-06, + "loss": 0.7298, + "step": 11493 + }, + { + "epoch": 0.6326159942759646, + "grad_norm": 0.6735014915466309, + "learning_rate": 7.752706867080676e-06, + "loss": 0.6851, + "step": 11494 + }, + { + "epoch": 0.6326710330783202, + "grad_norm": 0.7077293992042542, + "learning_rate": 7.752344995389303e-06, + "loss": 0.7806, + "step": 11495 + }, + { + "epoch": 0.6327260718806759, + "grad_norm": 0.6928272843360901, + "learning_rate": 7.751983103012042e-06, + "loss": 0.7538, + "step": 11496 + }, + { + "epoch": 0.6327811106830316, + "grad_norm": 0.7058837413787842, + "learning_rate": 7.751621189951612e-06, + "loss": 0.7065, + "step": 11497 + }, + { + "epoch": 0.6328361494853872, + "grad_norm": 0.7272600531578064, + "learning_rate": 7.751259256210735e-06, + "loss": 0.7468, + "step": 11498 + }, + { + "epoch": 0.6328911882877428, + "grad_norm": 0.6175968050956726, + "learning_rate": 7.75089730179213e-06, + "loss": 0.7195, + "step": 11499 + }, + { + "epoch": 0.6329462270900985, + "grad_norm": 0.6567386984825134, + "learning_rate": 7.750535326698514e-06, + "loss": 0.8147, + "step": 11500 + }, + { + "epoch": 0.6330012658924542, + "grad_norm": 0.6325315237045288, + "learning_rate": 7.750173330932613e-06, + "loss": 0.7087, + "step": 11501 + }, + { + "epoch": 0.6330563046948099, + "grad_norm": 0.8607509732246399, + "learning_rate": 7.749811314497147e-06, + "loss": 0.8009, + "step": 11502 + }, + { + "epoch": 0.6331113434971655, + "grad_norm": 0.7452824711799622, + "learning_rate": 7.749449277394833e-06, + "loss": 0.7497, + "step": 11503 + }, + { + "epoch": 0.6331663822995212, + "grad_norm": 0.7371357679367065, + "learning_rate": 7.749087219628395e-06, + "loss": 0.8936, + "step": 11504 + }, + { + "epoch": 0.6332214211018768, + "grad_norm": 0.7177306413650513, + "learning_rate": 7.748725141200552e-06, + "loss": 0.8327, + "step": 11505 + }, + { + "epoch": 0.6332764599042325, + "grad_norm": 0.5938527584075928, + "learning_rate": 7.748363042114028e-06, + "loss": 0.6471, + "step": 11506 + }, + { + "epoch": 0.6333314987065881, + "grad_norm": 0.8827341198921204, + "learning_rate": 7.748000922371543e-06, + "loss": 0.7247, + "step": 11507 + }, + { + "epoch": 0.6333865375089438, + "grad_norm": 0.7008641958236694, + "learning_rate": 7.747638781975818e-06, + "loss": 0.684, + "step": 11508 + }, + { + "epoch": 0.6334415763112995, + "grad_norm": 0.7752355337142944, + "learning_rate": 7.747276620929576e-06, + "loss": 0.7993, + "step": 11509 + }, + { + "epoch": 0.6334966151136552, + "grad_norm": 0.6928088068962097, + "learning_rate": 7.74691443923554e-06, + "loss": 0.7213, + "step": 11510 + }, + { + "epoch": 0.6335516539160108, + "grad_norm": 0.8197296261787415, + "learning_rate": 7.746552236896428e-06, + "loss": 0.847, + "step": 11511 + }, + { + "epoch": 0.6336066927183664, + "grad_norm": 0.7912493348121643, + "learning_rate": 7.746190013914966e-06, + "loss": 0.8217, + "step": 11512 + }, + { + "epoch": 0.6336617315207221, + "grad_norm": 0.7726556062698364, + "learning_rate": 7.745827770293871e-06, + "loss": 0.7626, + "step": 11513 + }, + { + "epoch": 0.6337167703230777, + "grad_norm": 0.668569028377533, + "learning_rate": 7.745465506035873e-06, + "loss": 0.7141, + "step": 11514 + }, + { + "epoch": 0.6337718091254334, + "grad_norm": 0.7226139903068542, + "learning_rate": 7.745103221143694e-06, + "loss": 0.7262, + "step": 11515 + }, + { + "epoch": 0.6338268479277891, + "grad_norm": 0.7315354943275452, + "learning_rate": 7.744740915620051e-06, + "loss": 0.7955, + "step": 11516 + }, + { + "epoch": 0.6338818867301448, + "grad_norm": 0.6815279126167297, + "learning_rate": 7.744378589467668e-06, + "loss": 0.7347, + "step": 11517 + }, + { + "epoch": 0.6339369255325004, + "grad_norm": 0.6931445598602295, + "learning_rate": 7.744016242689272e-06, + "loss": 0.7959, + "step": 11518 + }, + { + "epoch": 0.633991964334856, + "grad_norm": 0.7156991362571716, + "learning_rate": 7.743653875287584e-06, + "loss": 0.7793, + "step": 11519 + }, + { + "epoch": 0.6340470031372117, + "grad_norm": 0.8503926396369934, + "learning_rate": 7.74329148726533e-06, + "loss": 0.823, + "step": 11520 + }, + { + "epoch": 0.6341020419395674, + "grad_norm": 0.6280057430267334, + "learning_rate": 7.742929078625228e-06, + "loss": 0.6729, + "step": 11521 + }, + { + "epoch": 0.634157080741923, + "grad_norm": 0.7004517316818237, + "learning_rate": 7.742566649370008e-06, + "loss": 0.7578, + "step": 11522 + }, + { + "epoch": 0.6342121195442787, + "grad_norm": 0.7147908210754395, + "learning_rate": 7.74220419950239e-06, + "loss": 0.7705, + "step": 11523 + }, + { + "epoch": 0.6342671583466344, + "grad_norm": 0.7191137671470642, + "learning_rate": 7.7418417290251e-06, + "loss": 0.789, + "step": 11524 + }, + { + "epoch": 0.6343221971489901, + "grad_norm": 0.7288943529129028, + "learning_rate": 7.741479237940862e-06, + "loss": 0.8204, + "step": 11525 + }, + { + "epoch": 0.6343772359513457, + "grad_norm": 0.714821994304657, + "learning_rate": 7.741116726252398e-06, + "loss": 0.8252, + "step": 11526 + }, + { + "epoch": 0.6344322747537013, + "grad_norm": 0.6869103312492371, + "learning_rate": 7.740754193962435e-06, + "loss": 0.8136, + "step": 11527 + }, + { + "epoch": 0.634487313556057, + "grad_norm": 0.6629248857498169, + "learning_rate": 7.740391641073698e-06, + "loss": 0.7049, + "step": 11528 + }, + { + "epoch": 0.6345423523584127, + "grad_norm": 0.7078685164451599, + "learning_rate": 7.74002906758891e-06, + "loss": 0.7345, + "step": 11529 + }, + { + "epoch": 0.6345973911607683, + "grad_norm": 0.7748367190361023, + "learning_rate": 7.739666473510798e-06, + "loss": 0.7085, + "step": 11530 + }, + { + "epoch": 0.634652429963124, + "grad_norm": 0.6661930084228516, + "learning_rate": 7.739303858842086e-06, + "loss": 0.7795, + "step": 11531 + }, + { + "epoch": 0.6347074687654797, + "grad_norm": 0.6847965121269226, + "learning_rate": 7.738941223585499e-06, + "loss": 0.797, + "step": 11532 + }, + { + "epoch": 0.6347625075678354, + "grad_norm": 0.695184051990509, + "learning_rate": 7.738578567743762e-06, + "loss": 0.8184, + "step": 11533 + }, + { + "epoch": 0.6348175463701909, + "grad_norm": 0.6620088815689087, + "learning_rate": 7.738215891319603e-06, + "loss": 0.721, + "step": 11534 + }, + { + "epoch": 0.6348725851725466, + "grad_norm": 0.6802023649215698, + "learning_rate": 7.737853194315745e-06, + "loss": 0.9207, + "step": 11535 + }, + { + "epoch": 0.6349276239749023, + "grad_norm": 1.0193618535995483, + "learning_rate": 7.737490476734916e-06, + "loss": 0.8495, + "step": 11536 + }, + { + "epoch": 0.634982662777258, + "grad_norm": 0.6578189730644226, + "learning_rate": 7.737127738579841e-06, + "loss": 0.7455, + "step": 11537 + }, + { + "epoch": 0.6350377015796136, + "grad_norm": 0.70018470287323, + "learning_rate": 7.736764979853248e-06, + "loss": 0.7414, + "step": 11538 + }, + { + "epoch": 0.6350927403819693, + "grad_norm": 0.8136304616928101, + "learning_rate": 7.736402200557862e-06, + "loss": 0.7327, + "step": 11539 + }, + { + "epoch": 0.635147779184325, + "grad_norm": 0.7805309295654297, + "learning_rate": 7.736039400696408e-06, + "loss": 0.7659, + "step": 11540 + }, + { + "epoch": 0.6352028179866807, + "grad_norm": 0.675215482711792, + "learning_rate": 7.735676580271615e-06, + "loss": 0.7532, + "step": 11541 + }, + { + "epoch": 0.6352578567890362, + "grad_norm": 0.6873239874839783, + "learning_rate": 7.735313739286208e-06, + "loss": 0.8123, + "step": 11542 + }, + { + "epoch": 0.6353128955913919, + "grad_norm": 0.6624773144721985, + "learning_rate": 7.734950877742917e-06, + "loss": 0.7642, + "step": 11543 + }, + { + "epoch": 0.6353679343937476, + "grad_norm": 0.8047438859939575, + "learning_rate": 7.734587995644468e-06, + "loss": 0.7452, + "step": 11544 + }, + { + "epoch": 0.6354229731961033, + "grad_norm": 0.7449815273284912, + "learning_rate": 7.734225092993585e-06, + "loss": 0.7756, + "step": 11545 + }, + { + "epoch": 0.6354780119984589, + "grad_norm": 0.693081259727478, + "learning_rate": 7.733862169792999e-06, + "loss": 0.7029, + "step": 11546 + }, + { + "epoch": 0.6355330508008146, + "grad_norm": 0.6593700051307678, + "learning_rate": 7.733499226045437e-06, + "loss": 0.6009, + "step": 11547 + }, + { + "epoch": 0.6355880896031703, + "grad_norm": 0.7402041554450989, + "learning_rate": 7.733136261753627e-06, + "loss": 0.6921, + "step": 11548 + }, + { + "epoch": 0.635643128405526, + "grad_norm": 0.7686228156089783, + "learning_rate": 7.732773276920294e-06, + "loss": 0.855, + "step": 11549 + }, + { + "epoch": 0.6356981672078815, + "grad_norm": 0.6776669025421143, + "learning_rate": 7.732410271548171e-06, + "loss": 0.7146, + "step": 11550 + }, + { + "epoch": 0.6357532060102372, + "grad_norm": 0.6055952906608582, + "learning_rate": 7.732047245639983e-06, + "loss": 0.6926, + "step": 11551 + }, + { + "epoch": 0.6358082448125929, + "grad_norm": 0.7452635765075684, + "learning_rate": 7.731684199198461e-06, + "loss": 0.7766, + "step": 11552 + }, + { + "epoch": 0.6358632836149486, + "grad_norm": 0.7482720017433167, + "learning_rate": 7.73132113222633e-06, + "loss": 0.7725, + "step": 11553 + }, + { + "epoch": 0.6359183224173042, + "grad_norm": 0.6534025073051453, + "learning_rate": 7.73095804472632e-06, + "loss": 0.7902, + "step": 11554 + }, + { + "epoch": 0.6359733612196599, + "grad_norm": 0.7364560961723328, + "learning_rate": 7.730594936701162e-06, + "loss": 0.7998, + "step": 11555 + }, + { + "epoch": 0.6360284000220155, + "grad_norm": 0.6881458163261414, + "learning_rate": 7.730231808153582e-06, + "loss": 0.7586, + "step": 11556 + }, + { + "epoch": 0.6360834388243711, + "grad_norm": 0.6574262976646423, + "learning_rate": 7.72986865908631e-06, + "loss": 0.6999, + "step": 11557 + }, + { + "epoch": 0.6361384776267268, + "grad_norm": 0.6976385712623596, + "learning_rate": 7.729505489502078e-06, + "loss": 0.7387, + "step": 11558 + }, + { + "epoch": 0.6361935164290825, + "grad_norm": 0.6482532620429993, + "learning_rate": 7.729142299403613e-06, + "loss": 0.7715, + "step": 11559 + }, + { + "epoch": 0.6362485552314382, + "grad_norm": 0.7140287160873413, + "learning_rate": 7.728779088793643e-06, + "loss": 0.8562, + "step": 11560 + }, + { + "epoch": 0.6363035940337938, + "grad_norm": 0.6579470634460449, + "learning_rate": 7.728415857674901e-06, + "loss": 0.727, + "step": 11561 + }, + { + "epoch": 0.6363586328361495, + "grad_norm": 0.8670933246612549, + "learning_rate": 7.728052606050116e-06, + "loss": 0.7459, + "step": 11562 + }, + { + "epoch": 0.6364136716385052, + "grad_norm": 0.7995489835739136, + "learning_rate": 7.72768933392202e-06, + "loss": 0.8228, + "step": 11563 + }, + { + "epoch": 0.6364687104408608, + "grad_norm": 0.6467362642288208, + "learning_rate": 7.727326041293336e-06, + "loss": 0.7545, + "step": 11564 + }, + { + "epoch": 0.6365237492432164, + "grad_norm": 0.6646577715873718, + "learning_rate": 7.726962728166803e-06, + "loss": 0.7824, + "step": 11565 + }, + { + "epoch": 0.6365787880455721, + "grad_norm": 0.6576912999153137, + "learning_rate": 7.726599394545149e-06, + "loss": 0.7324, + "step": 11566 + }, + { + "epoch": 0.6366338268479278, + "grad_norm": 0.7514963150024414, + "learning_rate": 7.726236040431101e-06, + "loss": 0.7712, + "step": 11567 + }, + { + "epoch": 0.6366888656502835, + "grad_norm": 0.7313328981399536, + "learning_rate": 7.725872665827394e-06, + "loss": 0.7361, + "step": 11568 + }, + { + "epoch": 0.6367439044526391, + "grad_norm": 0.7109994292259216, + "learning_rate": 7.725509270736759e-06, + "loss": 0.812, + "step": 11569 + }, + { + "epoch": 0.6367989432549948, + "grad_norm": 1.128675103187561, + "learning_rate": 7.725145855161924e-06, + "loss": 0.726, + "step": 11570 + }, + { + "epoch": 0.6368539820573504, + "grad_norm": 0.7357437014579773, + "learning_rate": 7.724782419105622e-06, + "loss": 0.7958, + "step": 11571 + }, + { + "epoch": 0.6369090208597061, + "grad_norm": 0.6874725222587585, + "learning_rate": 7.724418962570587e-06, + "loss": 0.751, + "step": 11572 + }, + { + "epoch": 0.6369640596620617, + "grad_norm": 0.7175989747047424, + "learning_rate": 7.724055485559545e-06, + "loss": 0.7191, + "step": 11573 + }, + { + "epoch": 0.6370190984644174, + "grad_norm": 0.6424688100814819, + "learning_rate": 7.723691988075235e-06, + "loss": 0.608, + "step": 11574 + }, + { + "epoch": 0.6370741372667731, + "grad_norm": 0.6845381855964661, + "learning_rate": 7.723328470120383e-06, + "loss": 0.7465, + "step": 11575 + }, + { + "epoch": 0.6371291760691288, + "grad_norm": 0.7955030202865601, + "learning_rate": 7.722964931697723e-06, + "loss": 0.745, + "step": 11576 + }, + { + "epoch": 0.6371842148714844, + "grad_norm": 0.6855689883232117, + "learning_rate": 7.722601372809989e-06, + "loss": 0.7764, + "step": 11577 + }, + { + "epoch": 0.63723925367384, + "grad_norm": 0.7505692839622498, + "learning_rate": 7.722237793459909e-06, + "loss": 0.8324, + "step": 11578 + }, + { + "epoch": 0.6372942924761957, + "grad_norm": 0.6852842569351196, + "learning_rate": 7.721874193650221e-06, + "loss": 0.7599, + "step": 11579 + }, + { + "epoch": 0.6373493312785514, + "grad_norm": 0.698210597038269, + "learning_rate": 7.721510573383654e-06, + "loss": 0.843, + "step": 11580 + }, + { + "epoch": 0.637404370080907, + "grad_norm": 0.8344444632530212, + "learning_rate": 7.721146932662942e-06, + "loss": 0.8602, + "step": 11581 + }, + { + "epoch": 0.6374594088832627, + "grad_norm": 0.6385721564292908, + "learning_rate": 7.72078327149082e-06, + "loss": 0.7449, + "step": 11582 + }, + { + "epoch": 0.6375144476856184, + "grad_norm": 0.6474401354789734, + "learning_rate": 7.720419589870016e-06, + "loss": 0.6328, + "step": 11583 + }, + { + "epoch": 0.6375694864879741, + "grad_norm": 0.6554263234138489, + "learning_rate": 7.720055887803268e-06, + "loss": 0.6672, + "step": 11584 + }, + { + "epoch": 0.6376245252903296, + "grad_norm": 0.6551910638809204, + "learning_rate": 7.719692165293309e-06, + "loss": 0.8024, + "step": 11585 + }, + { + "epoch": 0.6376795640926853, + "grad_norm": 0.693418025970459, + "learning_rate": 7.719328422342871e-06, + "loss": 0.726, + "step": 11586 + }, + { + "epoch": 0.637734602895041, + "grad_norm": 0.8642090559005737, + "learning_rate": 7.718964658954689e-06, + "loss": 0.8274, + "step": 11587 + }, + { + "epoch": 0.6377896416973967, + "grad_norm": 0.8255778551101685, + "learning_rate": 7.718600875131494e-06, + "loss": 0.7259, + "step": 11588 + }, + { + "epoch": 0.6378446804997523, + "grad_norm": 0.7492913007736206, + "learning_rate": 7.718237070876025e-06, + "loss": 0.7093, + "step": 11589 + }, + { + "epoch": 0.637899719302108, + "grad_norm": 0.7154868245124817, + "learning_rate": 7.717873246191013e-06, + "loss": 0.7909, + "step": 11590 + }, + { + "epoch": 0.6379547581044637, + "grad_norm": 0.7751424312591553, + "learning_rate": 7.717509401079194e-06, + "loss": 0.8528, + "step": 11591 + }, + { + "epoch": 0.6380097969068194, + "grad_norm": 0.68199223279953, + "learning_rate": 7.7171455355433e-06, + "loss": 0.7077, + "step": 11592 + }, + { + "epoch": 0.6380648357091749, + "grad_norm": 0.7340414524078369, + "learning_rate": 7.716781649586069e-06, + "loss": 0.693, + "step": 11593 + }, + { + "epoch": 0.6381198745115306, + "grad_norm": 0.6278988122940063, + "learning_rate": 7.716417743210234e-06, + "loss": 0.7049, + "step": 11594 + }, + { + "epoch": 0.6381749133138863, + "grad_norm": 0.9113193154335022, + "learning_rate": 7.716053816418532e-06, + "loss": 0.7757, + "step": 11595 + }, + { + "epoch": 0.638229952116242, + "grad_norm": 0.7059371471405029, + "learning_rate": 7.715689869213694e-06, + "loss": 0.7805, + "step": 11596 + }, + { + "epoch": 0.6382849909185976, + "grad_norm": 0.7508488297462463, + "learning_rate": 7.71532590159846e-06, + "loss": 0.7394, + "step": 11597 + }, + { + "epoch": 0.6383400297209533, + "grad_norm": 0.8222774863243103, + "learning_rate": 7.71496191357556e-06, + "loss": 0.7675, + "step": 11598 + }, + { + "epoch": 0.638395068523309, + "grad_norm": 0.7295246124267578, + "learning_rate": 7.714597905147736e-06, + "loss": 0.7766, + "step": 11599 + }, + { + "epoch": 0.6384501073256645, + "grad_norm": 0.7482065558433533, + "learning_rate": 7.71423387631772e-06, + "loss": 0.7334, + "step": 11600 + }, + { + "epoch": 0.6385051461280202, + "grad_norm": 0.7654659748077393, + "learning_rate": 7.71386982708825e-06, + "loss": 0.8097, + "step": 11601 + }, + { + "epoch": 0.6385601849303759, + "grad_norm": 0.9125531911849976, + "learning_rate": 7.71350575746206e-06, + "loss": 0.7776, + "step": 11602 + }, + { + "epoch": 0.6386152237327316, + "grad_norm": 0.8063878417015076, + "learning_rate": 7.713141667441886e-06, + "loss": 0.7899, + "step": 11603 + }, + { + "epoch": 0.6386702625350872, + "grad_norm": 0.7315171360969543, + "learning_rate": 7.712777557030466e-06, + "loss": 0.7884, + "step": 11604 + }, + { + "epoch": 0.6387253013374429, + "grad_norm": 0.7306345105171204, + "learning_rate": 7.712413426230536e-06, + "loss": 0.8646, + "step": 11605 + }, + { + "epoch": 0.6387803401397986, + "grad_norm": 0.8300313353538513, + "learning_rate": 7.712049275044833e-06, + "loss": 0.8131, + "step": 11606 + }, + { + "epoch": 0.6388353789421543, + "grad_norm": 0.7513623237609863, + "learning_rate": 7.711685103476093e-06, + "loss": 0.8115, + "step": 11607 + }, + { + "epoch": 0.6388904177445098, + "grad_norm": 0.7126060128211975, + "learning_rate": 7.711320911527054e-06, + "loss": 0.8198, + "step": 11608 + }, + { + "epoch": 0.6389454565468655, + "grad_norm": 0.7017398476600647, + "learning_rate": 7.710956699200454e-06, + "loss": 0.8088, + "step": 11609 + }, + { + "epoch": 0.6390004953492212, + "grad_norm": 0.7345026135444641, + "learning_rate": 7.710592466499027e-06, + "loss": 0.8228, + "step": 11610 + }, + { + "epoch": 0.6390555341515769, + "grad_norm": 0.6903058886528015, + "learning_rate": 7.710228213425514e-06, + "loss": 0.7058, + "step": 11611 + }, + { + "epoch": 0.6391105729539325, + "grad_norm": 0.6838604211807251, + "learning_rate": 7.70986393998265e-06, + "loss": 0.7091, + "step": 11612 + }, + { + "epoch": 0.6391656117562882, + "grad_norm": 0.7067943811416626, + "learning_rate": 7.709499646173177e-06, + "loss": 0.7631, + "step": 11613 + }, + { + "epoch": 0.6392206505586439, + "grad_norm": 0.7577057480812073, + "learning_rate": 7.709135331999827e-06, + "loss": 0.7545, + "step": 11614 + }, + { + "epoch": 0.6392756893609995, + "grad_norm": 0.6425572633743286, + "learning_rate": 7.70877099746534e-06, + "loss": 0.7188, + "step": 11615 + }, + { + "epoch": 0.6393307281633551, + "grad_norm": 0.7257497310638428, + "learning_rate": 7.708406642572459e-06, + "loss": 0.7514, + "step": 11616 + }, + { + "epoch": 0.6393857669657108, + "grad_norm": 0.8214251399040222, + "learning_rate": 7.708042267323916e-06, + "loss": 0.7824, + "step": 11617 + }, + { + "epoch": 0.6394408057680665, + "grad_norm": 0.7879108786582947, + "learning_rate": 7.707677871722453e-06, + "loss": 0.6122, + "step": 11618 + }, + { + "epoch": 0.6394958445704222, + "grad_norm": 0.6656795740127563, + "learning_rate": 7.707313455770808e-06, + "loss": 0.754, + "step": 11619 + }, + { + "epoch": 0.6395508833727778, + "grad_norm": 0.7196451425552368, + "learning_rate": 7.70694901947172e-06, + "loss": 0.7662, + "step": 11620 + }, + { + "epoch": 0.6396059221751335, + "grad_norm": 0.8213779926300049, + "learning_rate": 7.706584562827928e-06, + "loss": 0.8732, + "step": 11621 + }, + { + "epoch": 0.6396609609774891, + "grad_norm": 0.7114893794059753, + "learning_rate": 7.70622008584217e-06, + "loss": 0.8493, + "step": 11622 + }, + { + "epoch": 0.6397159997798448, + "grad_norm": 0.7009783983230591, + "learning_rate": 7.705855588517188e-06, + "loss": 0.738, + "step": 11623 + }, + { + "epoch": 0.6397710385822004, + "grad_norm": 0.7576995491981506, + "learning_rate": 7.705491070855717e-06, + "loss": 0.8839, + "step": 11624 + }, + { + "epoch": 0.6398260773845561, + "grad_norm": 0.705784022808075, + "learning_rate": 7.7051265328605e-06, + "loss": 0.7246, + "step": 11625 + }, + { + "epoch": 0.6398811161869118, + "grad_norm": 0.6696903109550476, + "learning_rate": 7.704761974534277e-06, + "loss": 0.7418, + "step": 11626 + }, + { + "epoch": 0.6399361549892675, + "grad_norm": 0.8617024421691895, + "learning_rate": 7.704397395879786e-06, + "loss": 0.8109, + "step": 11627 + }, + { + "epoch": 0.6399911937916231, + "grad_norm": 0.6819054484367371, + "learning_rate": 7.70403279689977e-06, + "loss": 0.6438, + "step": 11628 + }, + { + "epoch": 0.6400462325939787, + "grad_norm": 0.6145044565200806, + "learning_rate": 7.703668177596966e-06, + "loss": 0.6712, + "step": 11629 + }, + { + "epoch": 0.6401012713963344, + "grad_norm": 0.6946390271186829, + "learning_rate": 7.703303537974116e-06, + "loss": 0.8099, + "step": 11630 + }, + { + "epoch": 0.6401563101986901, + "grad_norm": 0.6791605949401855, + "learning_rate": 7.702938878033961e-06, + "loss": 0.7494, + "step": 11631 + }, + { + "epoch": 0.6402113490010457, + "grad_norm": 0.6718626618385315, + "learning_rate": 7.70257419777924e-06, + "loss": 0.7471, + "step": 11632 + }, + { + "epoch": 0.6402663878034014, + "grad_norm": 0.8051798343658447, + "learning_rate": 7.702209497212694e-06, + "loss": 0.8569, + "step": 11633 + }, + { + "epoch": 0.6403214266057571, + "grad_norm": 0.6602774858474731, + "learning_rate": 7.701844776337067e-06, + "loss": 0.7396, + "step": 11634 + }, + { + "epoch": 0.6403764654081128, + "grad_norm": 0.672363817691803, + "learning_rate": 7.701480035155096e-06, + "loss": 0.7584, + "step": 11635 + }, + { + "epoch": 0.6404315042104683, + "grad_norm": 0.7363641262054443, + "learning_rate": 7.701115273669524e-06, + "loss": 0.8149, + "step": 11636 + }, + { + "epoch": 0.640486543012824, + "grad_norm": 0.7238422632217407, + "learning_rate": 7.700750491883094e-06, + "loss": 0.7598, + "step": 11637 + }, + { + "epoch": 0.6405415818151797, + "grad_norm": 1.3627614974975586, + "learning_rate": 7.700385689798544e-06, + "loss": 0.8303, + "step": 11638 + }, + { + "epoch": 0.6405966206175354, + "grad_norm": 0.6339633464813232, + "learning_rate": 7.70002086741862e-06, + "loss": 0.7308, + "step": 11639 + }, + { + "epoch": 0.640651659419891, + "grad_norm": 0.6821589469909668, + "learning_rate": 7.699656024746062e-06, + "loss": 0.6728, + "step": 11640 + }, + { + "epoch": 0.6407066982222467, + "grad_norm": 0.8514766097068787, + "learning_rate": 7.699291161783611e-06, + "loss": 0.8693, + "step": 11641 + }, + { + "epoch": 0.6407617370246024, + "grad_norm": 0.649075984954834, + "learning_rate": 7.698926278534011e-06, + "loss": 0.7482, + "step": 11642 + }, + { + "epoch": 0.640816775826958, + "grad_norm": 0.6507017016410828, + "learning_rate": 7.698561375000001e-06, + "loss": 0.7841, + "step": 11643 + }, + { + "epoch": 0.6408718146293136, + "grad_norm": 0.6736069321632385, + "learning_rate": 7.69819645118433e-06, + "loss": 0.74, + "step": 11644 + }, + { + "epoch": 0.6409268534316693, + "grad_norm": 0.6727941632270813, + "learning_rate": 7.697831507089734e-06, + "loss": 0.806, + "step": 11645 + }, + { + "epoch": 0.640981892234025, + "grad_norm": 0.7089083194732666, + "learning_rate": 7.697466542718959e-06, + "loss": 0.8091, + "step": 11646 + }, + { + "epoch": 0.6410369310363806, + "grad_norm": 0.6355387568473816, + "learning_rate": 7.69710155807475e-06, + "loss": 0.7033, + "step": 11647 + }, + { + "epoch": 0.6410919698387363, + "grad_norm": 0.6327098608016968, + "learning_rate": 7.696736553159846e-06, + "loss": 0.7664, + "step": 11648 + }, + { + "epoch": 0.641147008641092, + "grad_norm": 0.6971945762634277, + "learning_rate": 7.69637152797699e-06, + "loss": 0.7441, + "step": 11649 + }, + { + "epoch": 0.6412020474434477, + "grad_norm": 0.7420539855957031, + "learning_rate": 7.696006482528929e-06, + "loss": 0.7909, + "step": 11650 + }, + { + "epoch": 0.6412570862458032, + "grad_norm": 0.6877853274345398, + "learning_rate": 7.695641416818405e-06, + "loss": 0.7624, + "step": 11651 + }, + { + "epoch": 0.6413121250481589, + "grad_norm": 0.7337075471878052, + "learning_rate": 7.695276330848162e-06, + "loss": 0.7829, + "step": 11652 + }, + { + "epoch": 0.6413671638505146, + "grad_norm": 0.6423582434654236, + "learning_rate": 7.694911224620944e-06, + "loss": 0.6686, + "step": 11653 + }, + { + "epoch": 0.6414222026528703, + "grad_norm": 0.7826602458953857, + "learning_rate": 7.694546098139492e-06, + "loss": 0.774, + "step": 11654 + }, + { + "epoch": 0.6414772414552259, + "grad_norm": 0.7678147554397583, + "learning_rate": 7.694180951406556e-06, + "loss": 0.8067, + "step": 11655 + }, + { + "epoch": 0.6415322802575816, + "grad_norm": 0.6400566101074219, + "learning_rate": 7.693815784424875e-06, + "loss": 0.7796, + "step": 11656 + }, + { + "epoch": 0.6415873190599373, + "grad_norm": 0.6606197357177734, + "learning_rate": 7.693450597197196e-06, + "loss": 0.7381, + "step": 11657 + }, + { + "epoch": 0.641642357862293, + "grad_norm": 0.7953683137893677, + "learning_rate": 7.693085389726262e-06, + "loss": 0.8867, + "step": 11658 + }, + { + "epoch": 0.6416973966646485, + "grad_norm": 0.6763843894004822, + "learning_rate": 7.692720162014822e-06, + "loss": 0.7579, + "step": 11659 + }, + { + "epoch": 0.6417524354670042, + "grad_norm": 0.6456292867660522, + "learning_rate": 7.692354914065617e-06, + "loss": 0.7814, + "step": 11660 + }, + { + "epoch": 0.6418074742693599, + "grad_norm": 0.702803373336792, + "learning_rate": 7.691989645881393e-06, + "loss": 0.7393, + "step": 11661 + }, + { + "epoch": 0.6418625130717156, + "grad_norm": 0.8328298926353455, + "learning_rate": 7.691624357464895e-06, + "loss": 0.6587, + "step": 11662 + }, + { + "epoch": 0.6419175518740712, + "grad_norm": 0.8409613966941833, + "learning_rate": 7.691259048818871e-06, + "loss": 0.8075, + "step": 11663 + }, + { + "epoch": 0.6419725906764269, + "grad_norm": 0.6969256401062012, + "learning_rate": 7.690893719946062e-06, + "loss": 0.8061, + "step": 11664 + }, + { + "epoch": 0.6420276294787826, + "grad_norm": 0.7689732313156128, + "learning_rate": 7.690528370849217e-06, + "loss": 0.7709, + "step": 11665 + }, + { + "epoch": 0.6420826682811382, + "grad_norm": 0.8239523768424988, + "learning_rate": 7.69016300153108e-06, + "loss": 0.7421, + "step": 11666 + }, + { + "epoch": 0.6421377070834938, + "grad_norm": 0.7199227809906006, + "learning_rate": 7.689797611994398e-06, + "loss": 0.7877, + "step": 11667 + }, + { + "epoch": 0.6421927458858495, + "grad_norm": 0.8315985798835754, + "learning_rate": 7.689432202241919e-06, + "loss": 0.8458, + "step": 11668 + }, + { + "epoch": 0.6422477846882052, + "grad_norm": 0.7213512063026428, + "learning_rate": 7.689066772276385e-06, + "loss": 0.7199, + "step": 11669 + }, + { + "epoch": 0.6423028234905609, + "grad_norm": 0.6023604273796082, + "learning_rate": 7.688701322100547e-06, + "loss": 0.6485, + "step": 11670 + }, + { + "epoch": 0.6423578622929165, + "grad_norm": 0.8171319365501404, + "learning_rate": 7.688335851717148e-06, + "loss": 0.7561, + "step": 11671 + }, + { + "epoch": 0.6424129010952722, + "grad_norm": 0.6545816659927368, + "learning_rate": 7.687970361128937e-06, + "loss": 0.6796, + "step": 11672 + }, + { + "epoch": 0.6424679398976278, + "grad_norm": 0.8093686103820801, + "learning_rate": 7.687604850338661e-06, + "loss": 0.8538, + "step": 11673 + }, + { + "epoch": 0.6425229786999835, + "grad_norm": 0.6438135504722595, + "learning_rate": 7.687239319349066e-06, + "loss": 0.7046, + "step": 11674 + }, + { + "epoch": 0.6425780175023391, + "grad_norm": 0.685100257396698, + "learning_rate": 7.6868737681629e-06, + "loss": 0.7568, + "step": 11675 + }, + { + "epoch": 0.6426330563046948, + "grad_norm": 0.6850112676620483, + "learning_rate": 7.68650819678291e-06, + "loss": 0.7082, + "step": 11676 + }, + { + "epoch": 0.6426880951070505, + "grad_norm": 0.7524490356445312, + "learning_rate": 7.686142605211843e-06, + "loss": 0.7285, + "step": 11677 + }, + { + "epoch": 0.6427431339094062, + "grad_norm": 0.7706617116928101, + "learning_rate": 7.685776993452446e-06, + "loss": 0.7934, + "step": 11678 + }, + { + "epoch": 0.6427981727117618, + "grad_norm": 0.6612235307693481, + "learning_rate": 7.68541136150747e-06, + "loss": 0.6538, + "step": 11679 + }, + { + "epoch": 0.6428532115141175, + "grad_norm": 0.6380587816238403, + "learning_rate": 7.68504570937966e-06, + "loss": 0.7, + "step": 11680 + }, + { + "epoch": 0.6429082503164731, + "grad_norm": 0.6563882231712341, + "learning_rate": 7.684680037071765e-06, + "loss": 0.6912, + "step": 11681 + }, + { + "epoch": 0.6429632891188288, + "grad_norm": 0.6579793095588684, + "learning_rate": 7.684314344586534e-06, + "loss": 0.7263, + "step": 11682 + }, + { + "epoch": 0.6430183279211844, + "grad_norm": 0.7029374837875366, + "learning_rate": 7.683948631926713e-06, + "loss": 0.7151, + "step": 11683 + }, + { + "epoch": 0.6430733667235401, + "grad_norm": 0.6683217883110046, + "learning_rate": 7.683582899095056e-06, + "loss": 0.7643, + "step": 11684 + }, + { + "epoch": 0.6431284055258958, + "grad_norm": 1.0482646226882935, + "learning_rate": 7.683217146094308e-06, + "loss": 0.8889, + "step": 11685 + }, + { + "epoch": 0.6431834443282514, + "grad_norm": 0.7101102471351624, + "learning_rate": 7.682851372927216e-06, + "loss": 0.7762, + "step": 11686 + }, + { + "epoch": 0.643238483130607, + "grad_norm": 0.674961268901825, + "learning_rate": 7.682485579596533e-06, + "loss": 0.736, + "step": 11687 + }, + { + "epoch": 0.6432935219329627, + "grad_norm": 0.7071837782859802, + "learning_rate": 7.682119766105005e-06, + "loss": 0.7231, + "step": 11688 + }, + { + "epoch": 0.6433485607353184, + "grad_norm": 0.6982744932174683, + "learning_rate": 7.681753932455383e-06, + "loss": 0.7498, + "step": 11689 + }, + { + "epoch": 0.643403599537674, + "grad_norm": 0.6927201747894287, + "learning_rate": 7.681388078650415e-06, + "loss": 0.803, + "step": 11690 + }, + { + "epoch": 0.6434586383400297, + "grad_norm": 0.7299236059188843, + "learning_rate": 7.681022204692854e-06, + "loss": 0.7386, + "step": 11691 + }, + { + "epoch": 0.6435136771423854, + "grad_norm": 0.8809047937393188, + "learning_rate": 7.680656310585449e-06, + "loss": 0.741, + "step": 11692 + }, + { + "epoch": 0.6435687159447411, + "grad_norm": 0.862843930721283, + "learning_rate": 7.680290396330947e-06, + "loss": 0.8357, + "step": 11693 + }, + { + "epoch": 0.6436237547470967, + "grad_norm": 0.7436664700508118, + "learning_rate": 7.679924461932098e-06, + "loss": 0.8352, + "step": 11694 + }, + { + "epoch": 0.6436787935494523, + "grad_norm": 0.6582232713699341, + "learning_rate": 7.679558507391657e-06, + "loss": 0.7107, + "step": 11695 + }, + { + "epoch": 0.643733832351808, + "grad_norm": 0.6798850297927856, + "learning_rate": 7.67919253271237e-06, + "loss": 0.6968, + "step": 11696 + }, + { + "epoch": 0.6437888711541637, + "grad_norm": 0.7747187614440918, + "learning_rate": 7.67882653789699e-06, + "loss": 0.7611, + "step": 11697 + }, + { + "epoch": 0.6438439099565193, + "grad_norm": 0.7097567915916443, + "learning_rate": 7.678460522948267e-06, + "loss": 0.7275, + "step": 11698 + }, + { + "epoch": 0.643898948758875, + "grad_norm": 0.6958394050598145, + "learning_rate": 7.678094487868952e-06, + "loss": 0.7441, + "step": 11699 + }, + { + "epoch": 0.6439539875612307, + "grad_norm": 0.9129040837287903, + "learning_rate": 7.677728432661794e-06, + "loss": 0.7693, + "step": 11700 + }, + { + "epoch": 0.6440090263635864, + "grad_norm": 1.1396137475967407, + "learning_rate": 7.677362357329548e-06, + "loss": 0.7479, + "step": 11701 + }, + { + "epoch": 0.644064065165942, + "grad_norm": 0.8163042664527893, + "learning_rate": 7.67699626187496e-06, + "loss": 0.835, + "step": 11702 + }, + { + "epoch": 0.6441191039682976, + "grad_norm": 0.9869117736816406, + "learning_rate": 7.676630146300787e-06, + "loss": 0.769, + "step": 11703 + }, + { + "epoch": 0.6441741427706533, + "grad_norm": 0.7439526915550232, + "learning_rate": 7.676264010609777e-06, + "loss": 0.8239, + "step": 11704 + }, + { + "epoch": 0.644229181573009, + "grad_norm": 0.6943735480308533, + "learning_rate": 7.675897854804685e-06, + "loss": 0.7702, + "step": 11705 + }, + { + "epoch": 0.6442842203753646, + "grad_norm": 0.7384238243103027, + "learning_rate": 7.67553167888826e-06, + "loss": 0.6911, + "step": 11706 + }, + { + "epoch": 0.6443392591777203, + "grad_norm": 0.660022497177124, + "learning_rate": 7.675165482863254e-06, + "loss": 0.7359, + "step": 11707 + }, + { + "epoch": 0.644394297980076, + "grad_norm": 0.6956108808517456, + "learning_rate": 7.674799266732422e-06, + "loss": 0.7845, + "step": 11708 + }, + { + "epoch": 0.6444493367824317, + "grad_norm": 0.7361618280410767, + "learning_rate": 7.674433030498513e-06, + "loss": 0.7391, + "step": 11709 + }, + { + "epoch": 0.6445043755847872, + "grad_norm": 0.7655043005943298, + "learning_rate": 7.674066774164284e-06, + "loss": 0.8305, + "step": 11710 + }, + { + "epoch": 0.6445594143871429, + "grad_norm": 0.7160911560058594, + "learning_rate": 7.673700497732483e-06, + "loss": 0.7654, + "step": 11711 + }, + { + "epoch": 0.6446144531894986, + "grad_norm": 0.7812016010284424, + "learning_rate": 7.673334201205866e-06, + "loss": 0.8212, + "step": 11712 + }, + { + "epoch": 0.6446694919918543, + "grad_norm": 0.7457767128944397, + "learning_rate": 7.672967884587184e-06, + "loss": 0.8084, + "step": 11713 + }, + { + "epoch": 0.6447245307942099, + "grad_norm": 0.7524051070213318, + "learning_rate": 7.672601547879189e-06, + "loss": 0.7525, + "step": 11714 + }, + { + "epoch": 0.6447795695965656, + "grad_norm": 0.7271043062210083, + "learning_rate": 7.672235191084638e-06, + "loss": 0.7627, + "step": 11715 + }, + { + "epoch": 0.6448346083989213, + "grad_norm": 0.6893014907836914, + "learning_rate": 7.671868814206283e-06, + "loss": 0.7969, + "step": 11716 + }, + { + "epoch": 0.644889647201277, + "grad_norm": 0.7057414054870605, + "learning_rate": 7.671502417246876e-06, + "loss": 0.7448, + "step": 11717 + }, + { + "epoch": 0.6449446860036325, + "grad_norm": 0.7490910887718201, + "learning_rate": 7.671136000209172e-06, + "loss": 0.8046, + "step": 11718 + }, + { + "epoch": 0.6449997248059882, + "grad_norm": 0.7338950634002686, + "learning_rate": 7.670769563095926e-06, + "loss": 0.8521, + "step": 11719 + }, + { + "epoch": 0.6450547636083439, + "grad_norm": 0.8669398427009583, + "learning_rate": 7.670403105909891e-06, + "loss": 0.7803, + "step": 11720 + }, + { + "epoch": 0.6451098024106996, + "grad_norm": 0.7012562155723572, + "learning_rate": 7.67003662865382e-06, + "loss": 0.8047, + "step": 11721 + }, + { + "epoch": 0.6451648412130552, + "grad_norm": 0.9933050274848938, + "learning_rate": 7.66967013133047e-06, + "loss": 0.7081, + "step": 11722 + }, + { + "epoch": 0.6452198800154109, + "grad_norm": 1.12044358253479, + "learning_rate": 7.669303613942592e-06, + "loss": 0.7315, + "step": 11723 + }, + { + "epoch": 0.6452749188177666, + "grad_norm": 0.8654733300209045, + "learning_rate": 7.668937076492943e-06, + "loss": 0.6849, + "step": 11724 + }, + { + "epoch": 0.6453299576201222, + "grad_norm": 0.7081291675567627, + "learning_rate": 7.668570518984277e-06, + "loss": 0.7584, + "step": 11725 + }, + { + "epoch": 0.6453849964224778, + "grad_norm": 0.7473898530006409, + "learning_rate": 7.66820394141935e-06, + "loss": 0.8364, + "step": 11726 + }, + { + "epoch": 0.6454400352248335, + "grad_norm": 0.7863657474517822, + "learning_rate": 7.667837343800916e-06, + "loss": 0.7235, + "step": 11727 + }, + { + "epoch": 0.6454950740271892, + "grad_norm": 0.6664546728134155, + "learning_rate": 7.667470726131732e-06, + "loss": 0.7203, + "step": 11728 + }, + { + "epoch": 0.6455501128295448, + "grad_norm": 0.7182374596595764, + "learning_rate": 7.667104088414552e-06, + "loss": 0.7376, + "step": 11729 + }, + { + "epoch": 0.6456051516319005, + "grad_norm": 0.6518070697784424, + "learning_rate": 7.666737430652128e-06, + "loss": 0.6804, + "step": 11730 + }, + { + "epoch": 0.6456601904342562, + "grad_norm": 0.7354047894477844, + "learning_rate": 7.666370752847223e-06, + "loss": 0.7648, + "step": 11731 + }, + { + "epoch": 0.6457152292366118, + "grad_norm": 0.7440805435180664, + "learning_rate": 7.666004055002588e-06, + "loss": 0.7674, + "step": 11732 + }, + { + "epoch": 0.6457702680389674, + "grad_norm": 1.6423569917678833, + "learning_rate": 7.665637337120981e-06, + "loss": 0.8957, + "step": 11733 + }, + { + "epoch": 0.6458253068413231, + "grad_norm": 0.6960558295249939, + "learning_rate": 7.665270599205156e-06, + "loss": 0.7278, + "step": 11734 + }, + { + "epoch": 0.6458803456436788, + "grad_norm": 0.6983850002288818, + "learning_rate": 7.664903841257871e-06, + "loss": 0.7351, + "step": 11735 + }, + { + "epoch": 0.6459353844460345, + "grad_norm": 0.6905686855316162, + "learning_rate": 7.664537063281883e-06, + "loss": 0.7558, + "step": 11736 + }, + { + "epoch": 0.6459904232483901, + "grad_norm": 0.7483980655670166, + "learning_rate": 7.664170265279946e-06, + "loss": 0.813, + "step": 11737 + }, + { + "epoch": 0.6460454620507458, + "grad_norm": 0.767756998538971, + "learning_rate": 7.66380344725482e-06, + "loss": 0.8397, + "step": 11738 + }, + { + "epoch": 0.6461005008531014, + "grad_norm": 0.7813250422477722, + "learning_rate": 7.66343660920926e-06, + "loss": 0.8034, + "step": 11739 + }, + { + "epoch": 0.6461555396554571, + "grad_norm": 0.7357046604156494, + "learning_rate": 7.663069751146022e-06, + "loss": 0.7604, + "step": 11740 + }, + { + "epoch": 0.6462105784578127, + "grad_norm": 0.620285153388977, + "learning_rate": 7.662702873067866e-06, + "loss": 0.6191, + "step": 11741 + }, + { + "epoch": 0.6462656172601684, + "grad_norm": 0.6711301803588867, + "learning_rate": 7.662335974977549e-06, + "loss": 0.7674, + "step": 11742 + }, + { + "epoch": 0.6463206560625241, + "grad_norm": 0.756258487701416, + "learning_rate": 7.661969056877824e-06, + "loss": 0.7074, + "step": 11743 + }, + { + "epoch": 0.6463756948648798, + "grad_norm": 0.8121050596237183, + "learning_rate": 7.661602118771456e-06, + "loss": 0.8028, + "step": 11744 + }, + { + "epoch": 0.6464307336672354, + "grad_norm": 0.735906720161438, + "learning_rate": 7.661235160661197e-06, + "loss": 0.7197, + "step": 11745 + }, + { + "epoch": 0.646485772469591, + "grad_norm": 0.644490122795105, + "learning_rate": 7.660868182549807e-06, + "loss": 0.6172, + "step": 11746 + }, + { + "epoch": 0.6465408112719467, + "grad_norm": 0.7228739261627197, + "learning_rate": 7.660501184440045e-06, + "loss": 0.8302, + "step": 11747 + }, + { + "epoch": 0.6465958500743024, + "grad_norm": 0.8292868137359619, + "learning_rate": 7.660134166334668e-06, + "loss": 0.7506, + "step": 11748 + }, + { + "epoch": 0.646650888876658, + "grad_norm": 0.7224695086479187, + "learning_rate": 7.659767128236433e-06, + "loss": 0.8043, + "step": 11749 + }, + { + "epoch": 0.6467059276790137, + "grad_norm": 0.7092188000679016, + "learning_rate": 7.659400070148102e-06, + "loss": 0.7838, + "step": 11750 + }, + { + "epoch": 0.6467609664813694, + "grad_norm": 0.6975178122520447, + "learning_rate": 7.65903299207243e-06, + "loss": 0.7576, + "step": 11751 + }, + { + "epoch": 0.6468160052837251, + "grad_norm": 0.6524471044540405, + "learning_rate": 7.658665894012179e-06, + "loss": 0.7822, + "step": 11752 + }, + { + "epoch": 0.6468710440860806, + "grad_norm": 0.8134269118309021, + "learning_rate": 7.658298775970107e-06, + "loss": 0.8116, + "step": 11753 + }, + { + "epoch": 0.6469260828884363, + "grad_norm": 0.7166362404823303, + "learning_rate": 7.657931637948974e-06, + "loss": 0.768, + "step": 11754 + }, + { + "epoch": 0.646981121690792, + "grad_norm": 0.6418643593788147, + "learning_rate": 7.657564479951535e-06, + "loss": 0.7488, + "step": 11755 + }, + { + "epoch": 0.6470361604931477, + "grad_norm": 0.7104085087776184, + "learning_rate": 7.657197301980556e-06, + "loss": 0.7518, + "step": 11756 + }, + { + "epoch": 0.6470911992955033, + "grad_norm": 0.7297894358634949, + "learning_rate": 7.656830104038793e-06, + "loss": 0.7877, + "step": 11757 + }, + { + "epoch": 0.647146238097859, + "grad_norm": 0.8037092089653015, + "learning_rate": 7.656462886129006e-06, + "loss": 0.7375, + "step": 11758 + }, + { + "epoch": 0.6472012769002147, + "grad_norm": 0.7498913407325745, + "learning_rate": 7.656095648253955e-06, + "loss": 0.7899, + "step": 11759 + }, + { + "epoch": 0.6472563157025704, + "grad_norm": 0.7383849620819092, + "learning_rate": 7.655728390416398e-06, + "loss": 0.8276, + "step": 11760 + }, + { + "epoch": 0.6473113545049259, + "grad_norm": 0.750481367111206, + "learning_rate": 7.6553611126191e-06, + "loss": 0.7649, + "step": 11761 + }, + { + "epoch": 0.6473663933072816, + "grad_norm": 0.8483286499977112, + "learning_rate": 7.654993814864817e-06, + "loss": 0.877, + "step": 11762 + }, + { + "epoch": 0.6474214321096373, + "grad_norm": 0.7938307523727417, + "learning_rate": 7.654626497156311e-06, + "loss": 0.8159, + "step": 11763 + }, + { + "epoch": 0.647476470911993, + "grad_norm": 0.6576653122901917, + "learning_rate": 7.654259159496343e-06, + "loss": 0.797, + "step": 11764 + }, + { + "epoch": 0.6475315097143486, + "grad_norm": 0.6495664715766907, + "learning_rate": 7.653891801887675e-06, + "loss": 0.6641, + "step": 11765 + }, + { + "epoch": 0.6475865485167043, + "grad_norm": 0.7447353601455688, + "learning_rate": 7.653524424333065e-06, + "loss": 0.667, + "step": 11766 + }, + { + "epoch": 0.64764158731906, + "grad_norm": 0.6565769910812378, + "learning_rate": 7.653157026835277e-06, + "loss": 0.7123, + "step": 11767 + }, + { + "epoch": 0.6476966261214157, + "grad_norm": 0.8406145572662354, + "learning_rate": 7.652789609397072e-06, + "loss": 0.7582, + "step": 11768 + }, + { + "epoch": 0.6477516649237712, + "grad_norm": 0.8478217720985413, + "learning_rate": 7.652422172021207e-06, + "loss": 0.6758, + "step": 11769 + }, + { + "epoch": 0.6478067037261269, + "grad_norm": 0.7230110168457031, + "learning_rate": 7.652054714710448e-06, + "loss": 0.8216, + "step": 11770 + }, + { + "epoch": 0.6478617425284826, + "grad_norm": 0.6718668341636658, + "learning_rate": 7.651687237467558e-06, + "loss": 0.7204, + "step": 11771 + }, + { + "epoch": 0.6479167813308382, + "grad_norm": 1.062383770942688, + "learning_rate": 7.651319740295296e-06, + "loss": 0.6853, + "step": 11772 + }, + { + "epoch": 0.6479718201331939, + "grad_norm": 0.7157385945320129, + "learning_rate": 7.650952223196423e-06, + "loss": 0.6826, + "step": 11773 + }, + { + "epoch": 0.6480268589355496, + "grad_norm": 0.6762190461158752, + "learning_rate": 7.650584686173703e-06, + "loss": 0.7673, + "step": 11774 + }, + { + "epoch": 0.6480818977379053, + "grad_norm": 0.7540121674537659, + "learning_rate": 7.650217129229897e-06, + "loss": 0.7361, + "step": 11775 + }, + { + "epoch": 0.6481369365402608, + "grad_norm": 1.0383096933364868, + "learning_rate": 7.649849552367771e-06, + "loss": 0.7936, + "step": 11776 + }, + { + "epoch": 0.6481919753426165, + "grad_norm": 0.6430917382240295, + "learning_rate": 7.649481955590084e-06, + "loss": 0.7738, + "step": 11777 + }, + { + "epoch": 0.6482470141449722, + "grad_norm": 0.7846735715866089, + "learning_rate": 7.6491143388996e-06, + "loss": 0.6892, + "step": 11778 + }, + { + "epoch": 0.6483020529473279, + "grad_norm": 0.7154437899589539, + "learning_rate": 7.64874670229908e-06, + "loss": 0.6889, + "step": 11779 + }, + { + "epoch": 0.6483570917496835, + "grad_norm": 0.731270432472229, + "learning_rate": 7.648379045791291e-06, + "loss": 0.6405, + "step": 11780 + }, + { + "epoch": 0.6484121305520392, + "grad_norm": 0.6782581210136414, + "learning_rate": 7.648011369378993e-06, + "loss": 0.7822, + "step": 11781 + }, + { + "epoch": 0.6484671693543949, + "grad_norm": 0.7025747299194336, + "learning_rate": 7.64764367306495e-06, + "loss": 0.6929, + "step": 11782 + }, + { + "epoch": 0.6485222081567505, + "grad_norm": 0.6791071891784668, + "learning_rate": 7.647275956851928e-06, + "loss": 0.7507, + "step": 11783 + }, + { + "epoch": 0.6485772469591061, + "grad_norm": 0.7598931193351746, + "learning_rate": 7.646908220742686e-06, + "loss": 0.776, + "step": 11784 + }, + { + "epoch": 0.6486322857614618, + "grad_norm": 0.6930273771286011, + "learning_rate": 7.646540464739993e-06, + "loss": 0.7653, + "step": 11785 + }, + { + "epoch": 0.6486873245638175, + "grad_norm": 0.7276393175125122, + "learning_rate": 7.646172688846608e-06, + "loss": 0.8102, + "step": 11786 + }, + { + "epoch": 0.6487423633661732, + "grad_norm": 0.6826562285423279, + "learning_rate": 7.645804893065298e-06, + "loss": 0.6182, + "step": 11787 + }, + { + "epoch": 0.6487974021685288, + "grad_norm": 0.7837507128715515, + "learning_rate": 7.645437077398827e-06, + "loss": 0.8124, + "step": 11788 + }, + { + "epoch": 0.6488524409708845, + "grad_norm": 0.6937540769577026, + "learning_rate": 7.645069241849959e-06, + "loss": 0.7831, + "step": 11789 + }, + { + "epoch": 0.6489074797732401, + "grad_norm": 0.6531546115875244, + "learning_rate": 7.644701386421458e-06, + "loss": 0.755, + "step": 11790 + }, + { + "epoch": 0.6489625185755958, + "grad_norm": 0.8563246726989746, + "learning_rate": 7.644333511116088e-06, + "loss": 0.7715, + "step": 11791 + }, + { + "epoch": 0.6490175573779514, + "grad_norm": 0.8330580592155457, + "learning_rate": 7.643965615936619e-06, + "loss": 0.6651, + "step": 11792 + }, + { + "epoch": 0.6490725961803071, + "grad_norm": 0.6478384137153625, + "learning_rate": 7.643597700885809e-06, + "loss": 0.7063, + "step": 11793 + }, + { + "epoch": 0.6491276349826628, + "grad_norm": 0.7169124484062195, + "learning_rate": 7.643229765966428e-06, + "loss": 0.7578, + "step": 11794 + }, + { + "epoch": 0.6491826737850185, + "grad_norm": 0.726198136806488, + "learning_rate": 7.642861811181239e-06, + "loss": 0.783, + "step": 11795 + }, + { + "epoch": 0.6492377125873741, + "grad_norm": 0.7167587280273438, + "learning_rate": 7.642493836533008e-06, + "loss": 0.81, + "step": 11796 + }, + { + "epoch": 0.6492927513897297, + "grad_norm": 0.7215337157249451, + "learning_rate": 7.642125842024502e-06, + "loss": 0.8176, + "step": 11797 + }, + { + "epoch": 0.6493477901920854, + "grad_norm": 0.7041502594947815, + "learning_rate": 7.641757827658484e-06, + "loss": 0.8117, + "step": 11798 + }, + { + "epoch": 0.6494028289944411, + "grad_norm": 1.0303698778152466, + "learning_rate": 7.64138979343772e-06, + "loss": 0.781, + "step": 11799 + }, + { + "epoch": 0.6494578677967967, + "grad_norm": 0.626518189907074, + "learning_rate": 7.64102173936498e-06, + "loss": 0.6668, + "step": 11800 + }, + { + "epoch": 0.6495129065991524, + "grad_norm": 0.8889065980911255, + "learning_rate": 7.640653665443025e-06, + "loss": 0.8076, + "step": 11801 + }, + { + "epoch": 0.6495679454015081, + "grad_norm": 0.8333556652069092, + "learning_rate": 7.640285571674626e-06, + "loss": 0.8111, + "step": 11802 + }, + { + "epoch": 0.6496229842038638, + "grad_norm": 0.7248615622520447, + "learning_rate": 7.639917458062547e-06, + "loss": 0.7876, + "step": 11803 + }, + { + "epoch": 0.6496780230062194, + "grad_norm": 0.8870820999145508, + "learning_rate": 7.639549324609554e-06, + "loss": 0.8586, + "step": 11804 + }, + { + "epoch": 0.649733061808575, + "grad_norm": 0.7777245044708252, + "learning_rate": 7.639181171318417e-06, + "loss": 0.7793, + "step": 11805 + }, + { + "epoch": 0.6497881006109307, + "grad_norm": 0.7858467102050781, + "learning_rate": 7.638812998191897e-06, + "loss": 0.7842, + "step": 11806 + }, + { + "epoch": 0.6498431394132864, + "grad_norm": 0.6278610825538635, + "learning_rate": 7.638444805232769e-06, + "loss": 0.6659, + "step": 11807 + }, + { + "epoch": 0.649898178215642, + "grad_norm": 0.6758826971054077, + "learning_rate": 7.638076592443795e-06, + "loss": 0.7047, + "step": 11808 + }, + { + "epoch": 0.6499532170179977, + "grad_norm": 0.745007336139679, + "learning_rate": 7.637708359827743e-06, + "loss": 0.8557, + "step": 11809 + }, + { + "epoch": 0.6500082558203534, + "grad_norm": 0.8092321157455444, + "learning_rate": 7.63734010738738e-06, + "loss": 0.7895, + "step": 11810 + }, + { + "epoch": 0.6500632946227091, + "grad_norm": 0.7055220603942871, + "learning_rate": 7.636971835125476e-06, + "loss": 0.7678, + "step": 11811 + }, + { + "epoch": 0.6501183334250646, + "grad_norm": 0.7130264043807983, + "learning_rate": 7.636603543044797e-06, + "loss": 0.7648, + "step": 11812 + }, + { + "epoch": 0.6501733722274203, + "grad_norm": 0.7494268417358398, + "learning_rate": 7.636235231148112e-06, + "loss": 0.7883, + "step": 11813 + }, + { + "epoch": 0.650228411029776, + "grad_norm": 0.7998068332672119, + "learning_rate": 7.635866899438189e-06, + "loss": 0.7849, + "step": 11814 + }, + { + "epoch": 0.6502834498321316, + "grad_norm": 0.6749094128608704, + "learning_rate": 7.635498547917795e-06, + "loss": 0.8488, + "step": 11815 + }, + { + "epoch": 0.6503384886344873, + "grad_norm": 0.743679940700531, + "learning_rate": 7.635130176589698e-06, + "loss": 0.7562, + "step": 11816 + }, + { + "epoch": 0.650393527436843, + "grad_norm": 0.8368289470672607, + "learning_rate": 7.634761785456671e-06, + "loss": 0.7012, + "step": 11817 + }, + { + "epoch": 0.6504485662391987, + "grad_norm": 0.7214943170547485, + "learning_rate": 7.634393374521478e-06, + "loss": 0.7386, + "step": 11818 + }, + { + "epoch": 0.6505036050415542, + "grad_norm": 0.7026216387748718, + "learning_rate": 7.63402494378689e-06, + "loss": 0.7444, + "step": 11819 + }, + { + "epoch": 0.6505586438439099, + "grad_norm": 0.6271201372146606, + "learning_rate": 7.633656493255677e-06, + "loss": 0.6567, + "step": 11820 + }, + { + "epoch": 0.6506136826462656, + "grad_norm": 0.8359349370002747, + "learning_rate": 7.633288022930606e-06, + "loss": 0.7081, + "step": 11821 + }, + { + "epoch": 0.6506687214486213, + "grad_norm": 0.7009666562080383, + "learning_rate": 7.632919532814444e-06, + "loss": 0.6892, + "step": 11822 + }, + { + "epoch": 0.6507237602509769, + "grad_norm": 0.7445069551467896, + "learning_rate": 7.632551022909966e-06, + "loss": 0.7854, + "step": 11823 + }, + { + "epoch": 0.6507787990533326, + "grad_norm": 0.7204466462135315, + "learning_rate": 7.63218249321994e-06, + "loss": 0.8065, + "step": 11824 + }, + { + "epoch": 0.6508338378556883, + "grad_norm": 0.7058166265487671, + "learning_rate": 7.631813943747135e-06, + "loss": 0.6668, + "step": 11825 + }, + { + "epoch": 0.650888876658044, + "grad_norm": 0.739919126033783, + "learning_rate": 7.631445374494319e-06, + "loss": 0.8657, + "step": 11826 + }, + { + "epoch": 0.6509439154603995, + "grad_norm": 1.0444670915603638, + "learning_rate": 7.631076785464263e-06, + "loss": 0.7226, + "step": 11827 + }, + { + "epoch": 0.6509989542627552, + "grad_norm": 0.7146627306938171, + "learning_rate": 7.630708176659743e-06, + "loss": 0.7567, + "step": 11828 + }, + { + "epoch": 0.6510539930651109, + "grad_norm": 0.6981074810028076, + "learning_rate": 7.630339548083521e-06, + "loss": 0.7158, + "step": 11829 + }, + { + "epoch": 0.6511090318674666, + "grad_norm": 0.7620309591293335, + "learning_rate": 7.629970899738372e-06, + "loss": 0.811, + "step": 11830 + }, + { + "epoch": 0.6511640706698222, + "grad_norm": 0.7017341256141663, + "learning_rate": 7.629602231627066e-06, + "loss": 0.7092, + "step": 11831 + }, + { + "epoch": 0.6512191094721779, + "grad_norm": 0.733524739742279, + "learning_rate": 7.629233543752373e-06, + "loss": 0.859, + "step": 11832 + }, + { + "epoch": 0.6512741482745336, + "grad_norm": 0.7246975898742676, + "learning_rate": 7.628864836117065e-06, + "loss": 0.7732, + "step": 11833 + }, + { + "epoch": 0.6513291870768892, + "grad_norm": 0.5763251185417175, + "learning_rate": 7.628496108723911e-06, + "loss": 0.6632, + "step": 11834 + }, + { + "epoch": 0.6513842258792448, + "grad_norm": 0.6120070815086365, + "learning_rate": 7.628127361575685e-06, + "loss": 0.6809, + "step": 11835 + }, + { + "epoch": 0.6514392646816005, + "grad_norm": 0.8650742769241333, + "learning_rate": 7.627758594675157e-06, + "loss": 0.6388, + "step": 11836 + }, + { + "epoch": 0.6514943034839562, + "grad_norm": 0.8650027513504028, + "learning_rate": 7.627389808025099e-06, + "loss": 0.7622, + "step": 11837 + }, + { + "epoch": 0.6515493422863119, + "grad_norm": 0.6683071851730347, + "learning_rate": 7.627021001628283e-06, + "loss": 0.7424, + "step": 11838 + }, + { + "epoch": 0.6516043810886675, + "grad_norm": 0.6821237206459045, + "learning_rate": 7.626652175487479e-06, + "loss": 0.7844, + "step": 11839 + }, + { + "epoch": 0.6516594198910232, + "grad_norm": 0.7142770886421204, + "learning_rate": 7.626283329605462e-06, + "loss": 0.7706, + "step": 11840 + }, + { + "epoch": 0.6517144586933789, + "grad_norm": 0.7870625257492065, + "learning_rate": 7.625914463985002e-06, + "loss": 0.7673, + "step": 11841 + }, + { + "epoch": 0.6517694974957345, + "grad_norm": 0.7386491894721985, + "learning_rate": 7.62554557862887e-06, + "loss": 0.7562, + "step": 11842 + }, + { + "epoch": 0.6518245362980901, + "grad_norm": 0.6529993414878845, + "learning_rate": 7.625176673539843e-06, + "loss": 0.8258, + "step": 11843 + }, + { + "epoch": 0.6518795751004458, + "grad_norm": 0.7010294795036316, + "learning_rate": 7.6248077487206895e-06, + "loss": 0.7773, + "step": 11844 + }, + { + "epoch": 0.6519346139028015, + "grad_norm": 0.6699075698852539, + "learning_rate": 7.624438804174184e-06, + "loss": 0.7163, + "step": 11845 + }, + { + "epoch": 0.6519896527051572, + "grad_norm": 0.6600161790847778, + "learning_rate": 7.624069839903099e-06, + "loss": 0.7355, + "step": 11846 + }, + { + "epoch": 0.6520446915075128, + "grad_norm": 0.6556873321533203, + "learning_rate": 7.623700855910205e-06, + "loss": 0.627, + "step": 11847 + }, + { + "epoch": 0.6520997303098685, + "grad_norm": 0.6867008805274963, + "learning_rate": 7.623331852198281e-06, + "loss": 0.8228, + "step": 11848 + }, + { + "epoch": 0.6521547691122241, + "grad_norm": 0.6885474324226379, + "learning_rate": 7.622962828770095e-06, + "loss": 0.6804, + "step": 11849 + }, + { + "epoch": 0.6522098079145798, + "grad_norm": 0.6903913021087646, + "learning_rate": 7.622593785628425e-06, + "loss": 0.6553, + "step": 11850 + }, + { + "epoch": 0.6522648467169354, + "grad_norm": 0.6581684947013855, + "learning_rate": 7.622224722776039e-06, + "loss": 0.7102, + "step": 11851 + }, + { + "epoch": 0.6523198855192911, + "grad_norm": 0.8261715769767761, + "learning_rate": 7.621855640215716e-06, + "loss": 0.676, + "step": 11852 + }, + { + "epoch": 0.6523749243216468, + "grad_norm": 0.6238247752189636, + "learning_rate": 7.6214865379502265e-06, + "loss": 0.7065, + "step": 11853 + }, + { + "epoch": 0.6524299631240025, + "grad_norm": 0.7350416779518127, + "learning_rate": 7.621117415982346e-06, + "loss": 0.7512, + "step": 11854 + }, + { + "epoch": 0.652485001926358, + "grad_norm": 0.7337208390235901, + "learning_rate": 7.620748274314851e-06, + "loss": 0.7593, + "step": 11855 + }, + { + "epoch": 0.6525400407287137, + "grad_norm": 0.6568214297294617, + "learning_rate": 7.620379112950511e-06, + "loss": 0.7363, + "step": 11856 + }, + { + "epoch": 0.6525950795310694, + "grad_norm": 0.7099055647850037, + "learning_rate": 7.620009931892105e-06, + "loss": 0.6631, + "step": 11857 + }, + { + "epoch": 0.652650118333425, + "grad_norm": 0.6563010215759277, + "learning_rate": 7.6196407311424035e-06, + "loss": 0.6617, + "step": 11858 + }, + { + "epoch": 0.6527051571357807, + "grad_norm": 0.6664251685142517, + "learning_rate": 7.6192715107041845e-06, + "loss": 0.7898, + "step": 11859 + }, + { + "epoch": 0.6527601959381364, + "grad_norm": 0.6524507403373718, + "learning_rate": 7.618902270580222e-06, + "loss": 0.767, + "step": 11860 + }, + { + "epoch": 0.6528152347404921, + "grad_norm": 0.7391313910484314, + "learning_rate": 7.61853301077329e-06, + "loss": 0.6015, + "step": 11861 + }, + { + "epoch": 0.6528702735428477, + "grad_norm": 0.7691878080368042, + "learning_rate": 7.618163731286167e-06, + "loss": 0.718, + "step": 11862 + }, + { + "epoch": 0.6529253123452033, + "grad_norm": 0.6524633765220642, + "learning_rate": 7.617794432121625e-06, + "loss": 0.6841, + "step": 11863 + }, + { + "epoch": 0.652980351147559, + "grad_norm": 0.7125405073165894, + "learning_rate": 7.61742511328244e-06, + "loss": 0.7654, + "step": 11864 + }, + { + "epoch": 0.6530353899499147, + "grad_norm": 0.7123568058013916, + "learning_rate": 7.617055774771389e-06, + "loss": 0.7189, + "step": 11865 + }, + { + "epoch": 0.6530904287522703, + "grad_norm": 0.6968240141868591, + "learning_rate": 7.616686416591248e-06, + "loss": 0.7201, + "step": 11866 + }, + { + "epoch": 0.653145467554626, + "grad_norm": 0.7208551168441772, + "learning_rate": 7.616317038744792e-06, + "loss": 0.6644, + "step": 11867 + }, + { + "epoch": 0.6532005063569817, + "grad_norm": 0.7320911884307861, + "learning_rate": 7.615947641234798e-06, + "loss": 0.7118, + "step": 11868 + }, + { + "epoch": 0.6532555451593374, + "grad_norm": 0.7762041687965393, + "learning_rate": 7.615578224064041e-06, + "loss": 0.7501, + "step": 11869 + }, + { + "epoch": 0.653310583961693, + "grad_norm": 0.7455989718437195, + "learning_rate": 7.6152087872352975e-06, + "loss": 0.8058, + "step": 11870 + }, + { + "epoch": 0.6533656227640486, + "grad_norm": 0.736044704914093, + "learning_rate": 7.614839330751347e-06, + "loss": 0.727, + "step": 11871 + }, + { + "epoch": 0.6534206615664043, + "grad_norm": 0.680171012878418, + "learning_rate": 7.614469854614961e-06, + "loss": 0.6722, + "step": 11872 + }, + { + "epoch": 0.65347570036876, + "grad_norm": 0.7598134279251099, + "learning_rate": 7.614100358828922e-06, + "loss": 0.7472, + "step": 11873 + }, + { + "epoch": 0.6535307391711156, + "grad_norm": 0.8288099765777588, + "learning_rate": 7.613730843396003e-06, + "loss": 0.7493, + "step": 11874 + }, + { + "epoch": 0.6535857779734713, + "grad_norm": 0.6436724066734314, + "learning_rate": 7.613361308318984e-06, + "loss": 0.7103, + "step": 11875 + }, + { + "epoch": 0.653640816775827, + "grad_norm": 0.671334981918335, + "learning_rate": 7.612991753600639e-06, + "loss": 0.6949, + "step": 11876 + }, + { + "epoch": 0.6536958555781827, + "grad_norm": 0.6019170880317688, + "learning_rate": 7.61262217924375e-06, + "loss": 0.6116, + "step": 11877 + }, + { + "epoch": 0.6537508943805382, + "grad_norm": 1.4682546854019165, + "learning_rate": 7.61225258525109e-06, + "loss": 0.9343, + "step": 11878 + }, + { + "epoch": 0.6538059331828939, + "grad_norm": 0.656822681427002, + "learning_rate": 7.611882971625439e-06, + "loss": 0.7357, + "step": 11879 + }, + { + "epoch": 0.6538609719852496, + "grad_norm": 0.635734498500824, + "learning_rate": 7.611513338369576e-06, + "loss": 0.6263, + "step": 11880 + }, + { + "epoch": 0.6539160107876053, + "grad_norm": 0.7123430967330933, + "learning_rate": 7.611143685486277e-06, + "loss": 0.8446, + "step": 11881 + }, + { + "epoch": 0.6539710495899609, + "grad_norm": 0.7597065567970276, + "learning_rate": 7.610774012978322e-06, + "loss": 0.7449, + "step": 11882 + }, + { + "epoch": 0.6540260883923166, + "grad_norm": 0.7555896043777466, + "learning_rate": 7.610404320848486e-06, + "loss": 0.7575, + "step": 11883 + }, + { + "epoch": 0.6540811271946723, + "grad_norm": 0.7572906613349915, + "learning_rate": 7.6100346090995506e-06, + "loss": 0.7547, + "step": 11884 + }, + { + "epoch": 0.654136165997028, + "grad_norm": 0.6663275957107544, + "learning_rate": 7.609664877734295e-06, + "loss": 0.7038, + "step": 11885 + }, + { + "epoch": 0.6541912047993835, + "grad_norm": 0.7346611618995667, + "learning_rate": 7.609295126755496e-06, + "loss": 0.7902, + "step": 11886 + }, + { + "epoch": 0.6542462436017392, + "grad_norm": 0.6846545338630676, + "learning_rate": 7.608925356165934e-06, + "loss": 0.7334, + "step": 11887 + }, + { + "epoch": 0.6543012824040949, + "grad_norm": 0.6714815497398376, + "learning_rate": 7.608555565968385e-06, + "loss": 0.7204, + "step": 11888 + }, + { + "epoch": 0.6543563212064506, + "grad_norm": 0.805095374584198, + "learning_rate": 7.608185756165634e-06, + "loss": 0.8521, + "step": 11889 + }, + { + "epoch": 0.6544113600088062, + "grad_norm": 0.8415316343307495, + "learning_rate": 7.607815926760456e-06, + "loss": 0.7076, + "step": 11890 + }, + { + "epoch": 0.6544663988111619, + "grad_norm": 0.7665743231773376, + "learning_rate": 7.607446077755632e-06, + "loss": 0.8072, + "step": 11891 + }, + { + "epoch": 0.6545214376135176, + "grad_norm": 0.6705248355865479, + "learning_rate": 7.607076209153939e-06, + "loss": 0.6607, + "step": 11892 + }, + { + "epoch": 0.6545764764158732, + "grad_norm": 0.6791796684265137, + "learning_rate": 7.606706320958159e-06, + "loss": 0.773, + "step": 11893 + }, + { + "epoch": 0.6546315152182288, + "grad_norm": 0.8177357316017151, + "learning_rate": 7.606336413171075e-06, + "loss": 0.8114, + "step": 11894 + }, + { + "epoch": 0.6546865540205845, + "grad_norm": 0.9491637945175171, + "learning_rate": 7.605966485795462e-06, + "loss": 0.7424, + "step": 11895 + }, + { + "epoch": 0.6547415928229402, + "grad_norm": 0.7326256036758423, + "learning_rate": 7.605596538834103e-06, + "loss": 0.8176, + "step": 11896 + }, + { + "epoch": 0.6547966316252959, + "grad_norm": 0.6081808805465698, + "learning_rate": 7.6052265722897775e-06, + "loss": 0.6827, + "step": 11897 + }, + { + "epoch": 0.6548516704276515, + "grad_norm": 0.7165681719779968, + "learning_rate": 7.604856586165268e-06, + "loss": 0.7854, + "step": 11898 + }, + { + "epoch": 0.6549067092300072, + "grad_norm": 0.8777725100517273, + "learning_rate": 7.604486580463353e-06, + "loss": 0.8084, + "step": 11899 + }, + { + "epoch": 0.6549617480323628, + "grad_norm": 0.6814439296722412, + "learning_rate": 7.604116555186811e-06, + "loss": 0.6869, + "step": 11900 + }, + { + "epoch": 0.6550167868347184, + "grad_norm": 0.7060914635658264, + "learning_rate": 7.60374651033843e-06, + "loss": 0.7066, + "step": 11901 + }, + { + "epoch": 0.6550718256370741, + "grad_norm": 0.6823089718818665, + "learning_rate": 7.603376445920987e-06, + "loss": 0.6095, + "step": 11902 + }, + { + "epoch": 0.6551268644394298, + "grad_norm": 0.7099863290786743, + "learning_rate": 7.603006361937262e-06, + "loss": 0.8037, + "step": 11903 + }, + { + "epoch": 0.6551819032417855, + "grad_norm": 0.6479066610336304, + "learning_rate": 7.602636258390037e-06, + "loss": 0.6844, + "step": 11904 + }, + { + "epoch": 0.6552369420441411, + "grad_norm": 0.6663268804550171, + "learning_rate": 7.602266135282097e-06, + "loss": 0.735, + "step": 11905 + }, + { + "epoch": 0.6552919808464968, + "grad_norm": 0.8670598268508911, + "learning_rate": 7.60189599261622e-06, + "loss": 0.779, + "step": 11906 + }, + { + "epoch": 0.6553470196488524, + "grad_norm": 0.607631504535675, + "learning_rate": 7.601525830395189e-06, + "loss": 0.6288, + "step": 11907 + }, + { + "epoch": 0.6554020584512081, + "grad_norm": 0.9054927229881287, + "learning_rate": 7.601155648621786e-06, + "loss": 0.8562, + "step": 11908 + }, + { + "epoch": 0.6554570972535637, + "grad_norm": 0.8069004416465759, + "learning_rate": 7.6007854472987955e-06, + "loss": 0.88, + "step": 11909 + }, + { + "epoch": 0.6555121360559194, + "grad_norm": 0.6393092274665833, + "learning_rate": 7.600415226428995e-06, + "loss": 0.6908, + "step": 11910 + }, + { + "epoch": 0.6555671748582751, + "grad_norm": 0.7533125281333923, + "learning_rate": 7.600044986015172e-06, + "loss": 0.8061, + "step": 11911 + }, + { + "epoch": 0.6556222136606308, + "grad_norm": 0.6859326958656311, + "learning_rate": 7.599674726060105e-06, + "loss": 0.7603, + "step": 11912 + }, + { + "epoch": 0.6556772524629864, + "grad_norm": 0.7284619808197021, + "learning_rate": 7.59930444656658e-06, + "loss": 0.7698, + "step": 11913 + }, + { + "epoch": 0.655732291265342, + "grad_norm": 1.074234127998352, + "learning_rate": 7.598934147537378e-06, + "loss": 0.8252, + "step": 11914 + }, + { + "epoch": 0.6557873300676977, + "grad_norm": 0.6899133920669556, + "learning_rate": 7.598563828975283e-06, + "loss": 0.6023, + "step": 11915 + }, + { + "epoch": 0.6558423688700534, + "grad_norm": 0.6736464500427246, + "learning_rate": 7.598193490883077e-06, + "loss": 0.788, + "step": 11916 + }, + { + "epoch": 0.655897407672409, + "grad_norm": 0.7646307349205017, + "learning_rate": 7.597823133263545e-06, + "loss": 0.7607, + "step": 11917 + }, + { + "epoch": 0.6559524464747647, + "grad_norm": 0.6413717865943909, + "learning_rate": 7.59745275611947e-06, + "loss": 0.6415, + "step": 11918 + }, + { + "epoch": 0.6560074852771204, + "grad_norm": 0.6605532169342041, + "learning_rate": 7.597082359453636e-06, + "loss": 0.6655, + "step": 11919 + }, + { + "epoch": 0.6560625240794761, + "grad_norm": 0.6573199033737183, + "learning_rate": 7.596711943268824e-06, + "loss": 0.624, + "step": 11920 + }, + { + "epoch": 0.6561175628818317, + "grad_norm": 0.8312102556228638, + "learning_rate": 7.596341507567822e-06, + "loss": 0.6803, + "step": 11921 + }, + { + "epoch": 0.6561726016841873, + "grad_norm": 0.6915873289108276, + "learning_rate": 7.59597105235341e-06, + "loss": 0.6897, + "step": 11922 + }, + { + "epoch": 0.656227640486543, + "grad_norm": 0.6916965842247009, + "learning_rate": 7.595600577628377e-06, + "loss": 0.7154, + "step": 11923 + }, + { + "epoch": 0.6562826792888987, + "grad_norm": 0.6712722182273865, + "learning_rate": 7.595230083395501e-06, + "loss": 0.7236, + "step": 11924 + }, + { + "epoch": 0.6563377180912543, + "grad_norm": 0.6514019966125488, + "learning_rate": 7.594859569657575e-06, + "loss": 0.6895, + "step": 11925 + }, + { + "epoch": 0.65639275689361, + "grad_norm": 0.7300555109977722, + "learning_rate": 7.594489036417378e-06, + "loss": 0.7563, + "step": 11926 + }, + { + "epoch": 0.6564477956959657, + "grad_norm": 0.8076907396316528, + "learning_rate": 7.594118483677695e-06, + "loss": 0.8883, + "step": 11927 + }, + { + "epoch": 0.6565028344983214, + "grad_norm": 0.666466236114502, + "learning_rate": 7.5937479114413114e-06, + "loss": 0.7641, + "step": 11928 + }, + { + "epoch": 0.6565578733006769, + "grad_norm": 0.6621832251548767, + "learning_rate": 7.593377319711013e-06, + "loss": 0.6687, + "step": 11929 + }, + { + "epoch": 0.6566129121030326, + "grad_norm": 0.8757139444351196, + "learning_rate": 7.593006708489585e-06, + "loss": 0.7746, + "step": 11930 + }, + { + "epoch": 0.6566679509053883, + "grad_norm": 0.646801769733429, + "learning_rate": 7.5926360777798135e-06, + "loss": 0.6884, + "step": 11931 + }, + { + "epoch": 0.656722989707744, + "grad_norm": 0.6703395843505859, + "learning_rate": 7.592265427584482e-06, + "loss": 0.6822, + "step": 11932 + }, + { + "epoch": 0.6567780285100996, + "grad_norm": 0.7653201222419739, + "learning_rate": 7.591894757906378e-06, + "loss": 0.7999, + "step": 11933 + }, + { + "epoch": 0.6568330673124553, + "grad_norm": 0.6921548247337341, + "learning_rate": 7.591524068748288e-06, + "loss": 0.7177, + "step": 11934 + }, + { + "epoch": 0.656888106114811, + "grad_norm": 0.7085320353507996, + "learning_rate": 7.591153360112995e-06, + "loss": 0.8395, + "step": 11935 + }, + { + "epoch": 0.6569431449171667, + "grad_norm": 0.6565294861793518, + "learning_rate": 7.590782632003287e-06, + "loss": 0.6969, + "step": 11936 + }, + { + "epoch": 0.6569981837195222, + "grad_norm": 0.7023206353187561, + "learning_rate": 7.590411884421952e-06, + "loss": 0.7321, + "step": 11937 + }, + { + "epoch": 0.6570532225218779, + "grad_norm": 0.7848044633865356, + "learning_rate": 7.590041117371774e-06, + "loss": 0.8857, + "step": 11938 + }, + { + "epoch": 0.6571082613242336, + "grad_norm": 1.004591703414917, + "learning_rate": 7.589670330855541e-06, + "loss": 0.8267, + "step": 11939 + }, + { + "epoch": 0.6571633001265893, + "grad_norm": 0.7525139451026917, + "learning_rate": 7.589299524876036e-06, + "loss": 0.6857, + "step": 11940 + }, + { + "epoch": 0.6572183389289449, + "grad_norm": 0.746224582195282, + "learning_rate": 7.588928699436051e-06, + "loss": 0.805, + "step": 11941 + }, + { + "epoch": 0.6572733777313006, + "grad_norm": 0.6304495930671692, + "learning_rate": 7.588557854538371e-06, + "loss": 0.652, + "step": 11942 + }, + { + "epoch": 0.6573284165336563, + "grad_norm": 0.761688768863678, + "learning_rate": 7.588186990185783e-06, + "loss": 0.7954, + "step": 11943 + }, + { + "epoch": 0.6573834553360118, + "grad_norm": 0.7735103368759155, + "learning_rate": 7.587816106381073e-06, + "loss": 0.7584, + "step": 11944 + }, + { + "epoch": 0.6574384941383675, + "grad_norm": 0.7351566553115845, + "learning_rate": 7.5874452031270305e-06, + "loss": 0.7984, + "step": 11945 + }, + { + "epoch": 0.6574935329407232, + "grad_norm": 0.7054993510246277, + "learning_rate": 7.587074280426443e-06, + "loss": 0.7057, + "step": 11946 + }, + { + "epoch": 0.6575485717430789, + "grad_norm": 0.7444368004798889, + "learning_rate": 7.586703338282099e-06, + "loss": 0.7476, + "step": 11947 + }, + { + "epoch": 0.6576036105454345, + "grad_norm": 0.6944568157196045, + "learning_rate": 7.586332376696782e-06, + "loss": 0.6874, + "step": 11948 + }, + { + "epoch": 0.6576586493477902, + "grad_norm": 0.6595578193664551, + "learning_rate": 7.585961395673287e-06, + "loss": 0.7541, + "step": 11949 + }, + { + "epoch": 0.6577136881501459, + "grad_norm": 0.6669502258300781, + "learning_rate": 7.585590395214396e-06, + "loss": 0.7515, + "step": 11950 + }, + { + "epoch": 0.6577687269525015, + "grad_norm": 0.7254583835601807, + "learning_rate": 7.585219375322901e-06, + "loss": 0.8089, + "step": 11951 + }, + { + "epoch": 0.6578237657548571, + "grad_norm": 1.0479141473770142, + "learning_rate": 7.584848336001587e-06, + "loss": 0.8108, + "step": 11952 + }, + { + "epoch": 0.6578788045572128, + "grad_norm": 0.6928718686103821, + "learning_rate": 7.584477277253246e-06, + "loss": 0.6325, + "step": 11953 + }, + { + "epoch": 0.6579338433595685, + "grad_norm": 0.8926869630813599, + "learning_rate": 7.584106199080666e-06, + "loss": 0.7294, + "step": 11954 + }, + { + "epoch": 0.6579888821619242, + "grad_norm": 0.7209964394569397, + "learning_rate": 7.583735101486635e-06, + "loss": 0.7646, + "step": 11955 + }, + { + "epoch": 0.6580439209642798, + "grad_norm": 0.7619316577911377, + "learning_rate": 7.583363984473941e-06, + "loss": 0.7756, + "step": 11956 + }, + { + "epoch": 0.6580989597666355, + "grad_norm": 0.6974903345108032, + "learning_rate": 7.582992848045378e-06, + "loss": 0.6497, + "step": 11957 + }, + { + "epoch": 0.6581539985689911, + "grad_norm": 0.8338617086410522, + "learning_rate": 7.582621692203731e-06, + "loss": 0.6619, + "step": 11958 + }, + { + "epoch": 0.6582090373713468, + "grad_norm": 0.9330396056175232, + "learning_rate": 7.5822505169517905e-06, + "loss": 0.8219, + "step": 11959 + }, + { + "epoch": 0.6582640761737024, + "grad_norm": 0.7725355625152588, + "learning_rate": 7.5818793222923445e-06, + "loss": 0.7262, + "step": 11960 + }, + { + "epoch": 0.6583191149760581, + "grad_norm": 0.7049654722213745, + "learning_rate": 7.5815081082281885e-06, + "loss": 0.7917, + "step": 11961 + }, + { + "epoch": 0.6583741537784138, + "grad_norm": 0.6801711916923523, + "learning_rate": 7.581136874762105e-06, + "loss": 0.6984, + "step": 11962 + }, + { + "epoch": 0.6584291925807695, + "grad_norm": 0.7774253487586975, + "learning_rate": 7.58076562189689e-06, + "loss": 0.7615, + "step": 11963 + }, + { + "epoch": 0.6584842313831251, + "grad_norm": 0.7436443567276001, + "learning_rate": 7.58039434963533e-06, + "loss": 0.7419, + "step": 11964 + }, + { + "epoch": 0.6585392701854808, + "grad_norm": 0.6857719421386719, + "learning_rate": 7.580023057980217e-06, + "loss": 0.8009, + "step": 11965 + }, + { + "epoch": 0.6585943089878364, + "grad_norm": 0.7194758653640747, + "learning_rate": 7.579651746934342e-06, + "loss": 0.7338, + "step": 11966 + }, + { + "epoch": 0.6586493477901921, + "grad_norm": 0.7248701453208923, + "learning_rate": 7.579280416500495e-06, + "loss": 0.6972, + "step": 11967 + }, + { + "epoch": 0.6587043865925477, + "grad_norm": 0.6719415783882141, + "learning_rate": 7.578909066681466e-06, + "loss": 0.7552, + "step": 11968 + }, + { + "epoch": 0.6587594253949034, + "grad_norm": 0.728338897228241, + "learning_rate": 7.578537697480046e-06, + "loss": 0.8386, + "step": 11969 + }, + { + "epoch": 0.6588144641972591, + "grad_norm": 0.7151786684989929, + "learning_rate": 7.578166308899029e-06, + "loss": 0.7186, + "step": 11970 + }, + { + "epoch": 0.6588695029996148, + "grad_norm": 0.664412260055542, + "learning_rate": 7.577794900941205e-06, + "loss": 0.6672, + "step": 11971 + }, + { + "epoch": 0.6589245418019704, + "grad_norm": 0.6915827989578247, + "learning_rate": 7.577423473609361e-06, + "loss": 0.7427, + "step": 11972 + }, + { + "epoch": 0.658979580604326, + "grad_norm": 0.705243706703186, + "learning_rate": 7.577052026906295e-06, + "loss": 0.7526, + "step": 11973 + }, + { + "epoch": 0.6590346194066817, + "grad_norm": 0.6559640169143677, + "learning_rate": 7.576680560834795e-06, + "loss": 0.8187, + "step": 11974 + }, + { + "epoch": 0.6590896582090374, + "grad_norm": 0.7359572649002075, + "learning_rate": 7.576309075397653e-06, + "loss": 0.8127, + "step": 11975 + }, + { + "epoch": 0.659144697011393, + "grad_norm": 0.6581039428710938, + "learning_rate": 7.575937570597661e-06, + "loss": 0.7066, + "step": 11976 + }, + { + "epoch": 0.6591997358137487, + "grad_norm": 0.8360844254493713, + "learning_rate": 7.5755660464376134e-06, + "loss": 0.7998, + "step": 11977 + }, + { + "epoch": 0.6592547746161044, + "grad_norm": 0.7201453447341919, + "learning_rate": 7.5751945029203e-06, + "loss": 0.7884, + "step": 11978 + }, + { + "epoch": 0.6593098134184601, + "grad_norm": 0.6985270977020264, + "learning_rate": 7.574822940048514e-06, + "loss": 0.7268, + "step": 11979 + }, + { + "epoch": 0.6593648522208156, + "grad_norm": 0.6405925154685974, + "learning_rate": 7.574451357825048e-06, + "loss": 0.6848, + "step": 11980 + }, + { + "epoch": 0.6594198910231713, + "grad_norm": 0.6656618714332581, + "learning_rate": 7.574079756252694e-06, + "loss": 0.7755, + "step": 11981 + }, + { + "epoch": 0.659474929825527, + "grad_norm": 0.8461045622825623, + "learning_rate": 7.573708135334248e-06, + "loss": 0.7171, + "step": 11982 + }, + { + "epoch": 0.6595299686278827, + "grad_norm": 0.5527384877204895, + "learning_rate": 7.573336495072498e-06, + "loss": 0.6668, + "step": 11983 + }, + { + "epoch": 0.6595850074302383, + "grad_norm": 0.6703749299049377, + "learning_rate": 7.572964835470241e-06, + "loss": 0.7128, + "step": 11984 + }, + { + "epoch": 0.659640046232594, + "grad_norm": 0.6824783682823181, + "learning_rate": 7.57259315653027e-06, + "loss": 0.8007, + "step": 11985 + }, + { + "epoch": 0.6596950850349497, + "grad_norm": 0.7369599938392639, + "learning_rate": 7.572221458255377e-06, + "loss": 0.7507, + "step": 11986 + }, + { + "epoch": 0.6597501238373052, + "grad_norm": 0.6976807713508606, + "learning_rate": 7.571849740648356e-06, + "loss": 0.7787, + "step": 11987 + }, + { + "epoch": 0.6598051626396609, + "grad_norm": 0.6735848784446716, + "learning_rate": 7.571478003711998e-06, + "loss": 0.6791, + "step": 11988 + }, + { + "epoch": 0.6598602014420166, + "grad_norm": 0.7245956659317017, + "learning_rate": 7.5711062474491025e-06, + "loss": 0.7999, + "step": 11989 + }, + { + "epoch": 0.6599152402443723, + "grad_norm": 0.760748565196991, + "learning_rate": 7.5707344718624595e-06, + "loss": 0.7904, + "step": 11990 + }, + { + "epoch": 0.6599702790467279, + "grad_norm": 0.6745715141296387, + "learning_rate": 7.5703626769548654e-06, + "loss": 0.6938, + "step": 11991 + }, + { + "epoch": 0.6600253178490836, + "grad_norm": 0.7301452159881592, + "learning_rate": 7.569990862729113e-06, + "loss": 0.7546, + "step": 11992 + }, + { + "epoch": 0.6600803566514393, + "grad_norm": 0.68801349401474, + "learning_rate": 7.569619029187998e-06, + "loss": 0.7592, + "step": 11993 + }, + { + "epoch": 0.660135395453795, + "grad_norm": 0.6839548349380493, + "learning_rate": 7.569247176334313e-06, + "loss": 0.7139, + "step": 11994 + }, + { + "epoch": 0.6601904342561505, + "grad_norm": 0.7490861415863037, + "learning_rate": 7.568875304170854e-06, + "loss": 0.7939, + "step": 11995 + }, + { + "epoch": 0.6602454730585062, + "grad_norm": 0.7098836302757263, + "learning_rate": 7.568503412700416e-06, + "loss": 0.7824, + "step": 11996 + }, + { + "epoch": 0.6603005118608619, + "grad_norm": 0.7427988052368164, + "learning_rate": 7.568131501925795e-06, + "loss": 0.7492, + "step": 11997 + }, + { + "epoch": 0.6603555506632176, + "grad_norm": 0.6715356111526489, + "learning_rate": 7.567759571849784e-06, + "loss": 0.6444, + "step": 11998 + }, + { + "epoch": 0.6604105894655732, + "grad_norm": 0.6697829961776733, + "learning_rate": 7.5673876224751795e-06, + "loss": 0.7064, + "step": 11999 + }, + { + "epoch": 0.6604656282679289, + "grad_norm": 0.6778494119644165, + "learning_rate": 7.567015653804777e-06, + "loss": 0.7517, + "step": 12000 + }, + { + "epoch": 0.6605206670702846, + "grad_norm": 0.6423540711402893, + "learning_rate": 7.566643665841371e-06, + "loss": 0.6321, + "step": 12001 + }, + { + "epoch": 0.6605757058726403, + "grad_norm": 0.6874244213104248, + "learning_rate": 7.566271658587761e-06, + "loss": 0.762, + "step": 12002 + }, + { + "epoch": 0.6606307446749958, + "grad_norm": 0.6805301308631897, + "learning_rate": 7.565899632046737e-06, + "loss": 0.765, + "step": 12003 + }, + { + "epoch": 0.6606857834773515, + "grad_norm": 0.7039558291435242, + "learning_rate": 7.5655275862211e-06, + "loss": 0.728, + "step": 12004 + }, + { + "epoch": 0.6607408222797072, + "grad_norm": 0.6513119339942932, + "learning_rate": 7.565155521113643e-06, + "loss": 0.7711, + "step": 12005 + }, + { + "epoch": 0.6607958610820629, + "grad_norm": 0.6483618021011353, + "learning_rate": 7.5647834367271655e-06, + "loss": 0.7015, + "step": 12006 + }, + { + "epoch": 0.6608508998844185, + "grad_norm": 0.7180553674697876, + "learning_rate": 7.564411333064461e-06, + "loss": 0.812, + "step": 12007 + }, + { + "epoch": 0.6609059386867742, + "grad_norm": 0.9036096334457397, + "learning_rate": 7.5640392101283285e-06, + "loss": 0.7858, + "step": 12008 + }, + { + "epoch": 0.6609609774891299, + "grad_norm": 0.7380802035331726, + "learning_rate": 7.563667067921563e-06, + "loss": 0.6615, + "step": 12009 + }, + { + "epoch": 0.6610160162914855, + "grad_norm": 0.6830628514289856, + "learning_rate": 7.5632949064469615e-06, + "loss": 0.7465, + "step": 12010 + }, + { + "epoch": 0.6610710550938411, + "grad_norm": 0.7562816143035889, + "learning_rate": 7.562922725707323e-06, + "loss": 0.8559, + "step": 12011 + }, + { + "epoch": 0.6611260938961968, + "grad_norm": 0.7376649379730225, + "learning_rate": 7.562550525705442e-06, + "loss": 0.7769, + "step": 12012 + }, + { + "epoch": 0.6611811326985525, + "grad_norm": 0.715466320514679, + "learning_rate": 7.562178306444116e-06, + "loss": 0.8233, + "step": 12013 + }, + { + "epoch": 0.6612361715009082, + "grad_norm": 0.6714800596237183, + "learning_rate": 7.561806067926147e-06, + "loss": 0.6025, + "step": 12014 + }, + { + "epoch": 0.6612912103032638, + "grad_norm": 0.7083391547203064, + "learning_rate": 7.561433810154328e-06, + "loss": 0.7063, + "step": 12015 + }, + { + "epoch": 0.6613462491056195, + "grad_norm": 0.8062768578529358, + "learning_rate": 7.561061533131457e-06, + "loss": 0.7992, + "step": 12016 + }, + { + "epoch": 0.6614012879079751, + "grad_norm": 0.741889476776123, + "learning_rate": 7.560689236860334e-06, + "loss": 0.8149, + "step": 12017 + }, + { + "epoch": 0.6614563267103308, + "grad_norm": 0.6834374666213989, + "learning_rate": 7.560316921343756e-06, + "loss": 0.782, + "step": 12018 + }, + { + "epoch": 0.6615113655126864, + "grad_norm": 0.7469872236251831, + "learning_rate": 7.559944586584522e-06, + "loss": 0.759, + "step": 12019 + }, + { + "epoch": 0.6615664043150421, + "grad_norm": 0.8300836086273193, + "learning_rate": 7.559572232585428e-06, + "loss": 0.8637, + "step": 12020 + }, + { + "epoch": 0.6616214431173978, + "grad_norm": 0.6241582632064819, + "learning_rate": 7.559199859349276e-06, + "loss": 0.7134, + "step": 12021 + }, + { + "epoch": 0.6616764819197535, + "grad_norm": 0.6696488261222839, + "learning_rate": 7.5588274668788634e-06, + "loss": 0.7457, + "step": 12022 + }, + { + "epoch": 0.6617315207221091, + "grad_norm": 0.7090815305709839, + "learning_rate": 7.558455055176987e-06, + "loss": 0.7449, + "step": 12023 + }, + { + "epoch": 0.6617865595244647, + "grad_norm": 0.6925215125083923, + "learning_rate": 7.558082624246448e-06, + "loss": 0.758, + "step": 12024 + }, + { + "epoch": 0.6618415983268204, + "grad_norm": 0.6658454537391663, + "learning_rate": 7.5577101740900425e-06, + "loss": 0.6918, + "step": 12025 + }, + { + "epoch": 0.6618966371291761, + "grad_norm": 0.6646405458450317, + "learning_rate": 7.557337704710574e-06, + "loss": 0.7293, + "step": 12026 + }, + { + "epoch": 0.6619516759315317, + "grad_norm": 0.6630399227142334, + "learning_rate": 7.556965216110841e-06, + "loss": 0.7572, + "step": 12027 + }, + { + "epoch": 0.6620067147338874, + "grad_norm": 0.7333918809890747, + "learning_rate": 7.556592708293641e-06, + "loss": 0.8012, + "step": 12028 + }, + { + "epoch": 0.6620617535362431, + "grad_norm": 0.7399254441261292, + "learning_rate": 7.556220181261773e-06, + "loss": 0.8406, + "step": 12029 + }, + { + "epoch": 0.6621167923385987, + "grad_norm": 0.6244909167289734, + "learning_rate": 7.55584763501804e-06, + "loss": 0.7427, + "step": 12030 + }, + { + "epoch": 0.6621718311409543, + "grad_norm": 0.6991485953330994, + "learning_rate": 7.55547506956524e-06, + "loss": 0.7583, + "step": 12031 + }, + { + "epoch": 0.66222686994331, + "grad_norm": 0.7115411162376404, + "learning_rate": 7.555102484906174e-06, + "loss": 0.7951, + "step": 12032 + }, + { + "epoch": 0.6622819087456657, + "grad_norm": 0.7684284448623657, + "learning_rate": 7.554729881043641e-06, + "loss": 0.717, + "step": 12033 + }, + { + "epoch": 0.6623369475480213, + "grad_norm": 0.7705931067466736, + "learning_rate": 7.554357257980443e-06, + "loss": 0.6903, + "step": 12034 + }, + { + "epoch": 0.662391986350377, + "grad_norm": 0.9283333420753479, + "learning_rate": 7.553984615719379e-06, + "loss": 0.7845, + "step": 12035 + }, + { + "epoch": 0.6624470251527327, + "grad_norm": 0.6867572665214539, + "learning_rate": 7.553611954263249e-06, + "loss": 0.8796, + "step": 12036 + }, + { + "epoch": 0.6625020639550884, + "grad_norm": 0.6129451990127563, + "learning_rate": 7.553239273614855e-06, + "loss": 0.6308, + "step": 12037 + }, + { + "epoch": 0.662557102757444, + "grad_norm": 0.749679446220398, + "learning_rate": 7.552866573777e-06, + "loss": 0.8308, + "step": 12038 + }, + { + "epoch": 0.6626121415597996, + "grad_norm": 0.7651422619819641, + "learning_rate": 7.552493854752483e-06, + "loss": 0.7266, + "step": 12039 + }, + { + "epoch": 0.6626671803621553, + "grad_norm": 0.9293195009231567, + "learning_rate": 7.552121116544104e-06, + "loss": 0.7795, + "step": 12040 + }, + { + "epoch": 0.662722219164511, + "grad_norm": 0.7321802973747253, + "learning_rate": 7.5517483591546655e-06, + "loss": 0.7294, + "step": 12041 + }, + { + "epoch": 0.6627772579668666, + "grad_norm": 0.702414333820343, + "learning_rate": 7.551375582586971e-06, + "loss": 0.7954, + "step": 12042 + }, + { + "epoch": 0.6628322967692223, + "grad_norm": 0.7497946619987488, + "learning_rate": 7.551002786843819e-06, + "loss": 0.7654, + "step": 12043 + }, + { + "epoch": 0.662887335571578, + "grad_norm": 0.6125331521034241, + "learning_rate": 7.550629971928017e-06, + "loss": 0.7299, + "step": 12044 + }, + { + "epoch": 0.6629423743739337, + "grad_norm": 0.7252177596092224, + "learning_rate": 7.550257137842358e-06, + "loss": 0.7553, + "step": 12045 + }, + { + "epoch": 0.6629974131762892, + "grad_norm": 0.6463978886604309, + "learning_rate": 7.5498842845896515e-06, + "loss": 0.7114, + "step": 12046 + }, + { + "epoch": 0.6630524519786449, + "grad_norm": 0.7392497062683105, + "learning_rate": 7.549511412172696e-06, + "loss": 0.6801, + "step": 12047 + }, + { + "epoch": 0.6631074907810006, + "grad_norm": 0.8068972229957581, + "learning_rate": 7.549138520594297e-06, + "loss": 0.8207, + "step": 12048 + }, + { + "epoch": 0.6631625295833563, + "grad_norm": 0.7632858753204346, + "learning_rate": 7.548765609857254e-06, + "loss": 0.7095, + "step": 12049 + }, + { + "epoch": 0.6632175683857119, + "grad_norm": 0.7252069115638733, + "learning_rate": 7.5483926799643705e-06, + "loss": 0.7796, + "step": 12050 + }, + { + "epoch": 0.6632726071880676, + "grad_norm": 1.048311471939087, + "learning_rate": 7.54801973091845e-06, + "loss": 0.7306, + "step": 12051 + }, + { + "epoch": 0.6633276459904233, + "grad_norm": 0.7432072758674622, + "learning_rate": 7.547646762722296e-06, + "loss": 0.8209, + "step": 12052 + }, + { + "epoch": 0.663382684792779, + "grad_norm": 0.7191399335861206, + "learning_rate": 7.547273775378709e-06, + "loss": 0.7011, + "step": 12053 + }, + { + "epoch": 0.6634377235951345, + "grad_norm": 0.5776329636573792, + "learning_rate": 7.5469007688904975e-06, + "loss": 0.6055, + "step": 12054 + }, + { + "epoch": 0.6634927623974902, + "grad_norm": 0.9296837449073792, + "learning_rate": 7.546527743260459e-06, + "loss": 0.7413, + "step": 12055 + }, + { + "epoch": 0.6635478011998459, + "grad_norm": 0.7279512286186218, + "learning_rate": 7.5461546984914e-06, + "loss": 0.7734, + "step": 12056 + }, + { + "epoch": 0.6636028400022016, + "grad_norm": 0.7297198176383972, + "learning_rate": 7.545781634586125e-06, + "loss": 0.7535, + "step": 12057 + }, + { + "epoch": 0.6636578788045572, + "grad_norm": 0.7094287872314453, + "learning_rate": 7.545408551547435e-06, + "loss": 0.7587, + "step": 12058 + }, + { + "epoch": 0.6637129176069129, + "grad_norm": 0.7559607028961182, + "learning_rate": 7.5450354493781374e-06, + "loss": 0.7358, + "step": 12059 + }, + { + "epoch": 0.6637679564092686, + "grad_norm": 0.8472892045974731, + "learning_rate": 7.544662328081034e-06, + "loss": 0.7537, + "step": 12060 + }, + { + "epoch": 0.6638229952116242, + "grad_norm": 0.6346176862716675, + "learning_rate": 7.544289187658929e-06, + "loss": 0.7658, + "step": 12061 + }, + { + "epoch": 0.6638780340139798, + "grad_norm": 0.7949367165565491, + "learning_rate": 7.543916028114628e-06, + "loss": 0.6837, + "step": 12062 + }, + { + "epoch": 0.6639330728163355, + "grad_norm": 0.7177689671516418, + "learning_rate": 7.5435428494509355e-06, + "loss": 0.7218, + "step": 12063 + }, + { + "epoch": 0.6639881116186912, + "grad_norm": 0.90680330991745, + "learning_rate": 7.5431696516706555e-06, + "loss": 0.8274, + "step": 12064 + }, + { + "epoch": 0.6640431504210469, + "grad_norm": 0.7799603939056396, + "learning_rate": 7.5427964347765916e-06, + "loss": 0.7528, + "step": 12065 + }, + { + "epoch": 0.6640981892234025, + "grad_norm": 0.7668048739433289, + "learning_rate": 7.542423198771553e-06, + "loss": 0.746, + "step": 12066 + }, + { + "epoch": 0.6641532280257582, + "grad_norm": 1.0042381286621094, + "learning_rate": 7.542049943658341e-06, + "loss": 0.7836, + "step": 12067 + }, + { + "epoch": 0.6642082668281138, + "grad_norm": 0.6915723085403442, + "learning_rate": 7.541676669439761e-06, + "loss": 0.8042, + "step": 12068 + }, + { + "epoch": 0.6642633056304695, + "grad_norm": 0.7268955707550049, + "learning_rate": 7.5413033761186215e-06, + "loss": 0.689, + "step": 12069 + }, + { + "epoch": 0.6643183444328251, + "grad_norm": 0.6418740749359131, + "learning_rate": 7.540930063697726e-06, + "loss": 0.6302, + "step": 12070 + }, + { + "epoch": 0.6643733832351808, + "grad_norm": 0.696384847164154, + "learning_rate": 7.540556732179879e-06, + "loss": 0.7978, + "step": 12071 + }, + { + "epoch": 0.6644284220375365, + "grad_norm": 0.7400668859481812, + "learning_rate": 7.540183381567889e-06, + "loss": 0.8768, + "step": 12072 + }, + { + "epoch": 0.6644834608398921, + "grad_norm": 0.6653871536254883, + "learning_rate": 7.539810011864559e-06, + "loss": 0.8107, + "step": 12073 + }, + { + "epoch": 0.6645384996422478, + "grad_norm": 0.7635810971260071, + "learning_rate": 7.539436623072698e-06, + "loss": 0.8476, + "step": 12074 + }, + { + "epoch": 0.6645935384446034, + "grad_norm": 0.6583054661750793, + "learning_rate": 7.53906321519511e-06, + "loss": 0.7093, + "step": 12075 + }, + { + "epoch": 0.6646485772469591, + "grad_norm": 0.8294859528541565, + "learning_rate": 7.538689788234604e-06, + "loss": 0.8107, + "step": 12076 + }, + { + "epoch": 0.6647036160493147, + "grad_norm": 0.6711081862449646, + "learning_rate": 7.538316342193983e-06, + "loss": 0.7491, + "step": 12077 + }, + { + "epoch": 0.6647586548516704, + "grad_norm": 0.7375408411026001, + "learning_rate": 7.5379428770760575e-06, + "loss": 0.7853, + "step": 12078 + }, + { + "epoch": 0.6648136936540261, + "grad_norm": 0.7322511672973633, + "learning_rate": 7.537569392883633e-06, + "loss": 0.7568, + "step": 12079 + }, + { + "epoch": 0.6648687324563818, + "grad_norm": 0.6390300393104553, + "learning_rate": 7.537195889619515e-06, + "loss": 0.7191, + "step": 12080 + }, + { + "epoch": 0.6649237712587374, + "grad_norm": 0.8155800104141235, + "learning_rate": 7.536822367286514e-06, + "loss": 0.7499, + "step": 12081 + }, + { + "epoch": 0.664978810061093, + "grad_norm": 0.7942230701446533, + "learning_rate": 7.536448825887432e-06, + "loss": 0.7797, + "step": 12082 + }, + { + "epoch": 0.6650338488634487, + "grad_norm": 0.7103378176689148, + "learning_rate": 7.536075265425083e-06, + "loss": 0.6814, + "step": 12083 + }, + { + "epoch": 0.6650888876658044, + "grad_norm": 0.8164991736412048, + "learning_rate": 7.535701685902268e-06, + "loss": 0.7917, + "step": 12084 + }, + { + "epoch": 0.66514392646816, + "grad_norm": 0.6970370411872864, + "learning_rate": 7.535328087321799e-06, + "loss": 0.7266, + "step": 12085 + }, + { + "epoch": 0.6651989652705157, + "grad_norm": 0.6468706130981445, + "learning_rate": 7.534954469686484e-06, + "loss": 0.7229, + "step": 12086 + }, + { + "epoch": 0.6652540040728714, + "grad_norm": 0.6551242470741272, + "learning_rate": 7.534580832999128e-06, + "loss": 0.6759, + "step": 12087 + }, + { + "epoch": 0.6653090428752271, + "grad_norm": 0.670215368270874, + "learning_rate": 7.534207177262543e-06, + "loss": 0.761, + "step": 12088 + }, + { + "epoch": 0.6653640816775827, + "grad_norm": 0.7365970015525818, + "learning_rate": 7.533833502479533e-06, + "loss": 0.7628, + "step": 12089 + }, + { + "epoch": 0.6654191204799383, + "grad_norm": 0.7419471740722656, + "learning_rate": 7.53345980865291e-06, + "loss": 0.8093, + "step": 12090 + }, + { + "epoch": 0.665474159282294, + "grad_norm": 0.6573269963264465, + "learning_rate": 7.53308609578548e-06, + "loss": 0.6806, + "step": 12091 + }, + { + "epoch": 0.6655291980846497, + "grad_norm": 0.9270638227462769, + "learning_rate": 7.5327123638800545e-06, + "loss": 0.8612, + "step": 12092 + }, + { + "epoch": 0.6655842368870053, + "grad_norm": 0.85124671459198, + "learning_rate": 7.532338612939441e-06, + "loss": 0.6776, + "step": 12093 + }, + { + "epoch": 0.665639275689361, + "grad_norm": 0.7791070342063904, + "learning_rate": 7.531964842966446e-06, + "loss": 0.7571, + "step": 12094 + }, + { + "epoch": 0.6656943144917167, + "grad_norm": 0.6604436635971069, + "learning_rate": 7.5315910539638825e-06, + "loss": 0.781, + "step": 12095 + }, + { + "epoch": 0.6657493532940724, + "grad_norm": 0.7567091584205627, + "learning_rate": 7.531217245934559e-06, + "loss": 0.8005, + "step": 12096 + }, + { + "epoch": 0.6658043920964279, + "grad_norm": 0.660637378692627, + "learning_rate": 7.530843418881282e-06, + "loss": 0.7351, + "step": 12097 + }, + { + "epoch": 0.6658594308987836, + "grad_norm": 0.6305738687515259, + "learning_rate": 7.530469572806865e-06, + "loss": 0.7452, + "step": 12098 + }, + { + "epoch": 0.6659144697011393, + "grad_norm": 0.8291265368461609, + "learning_rate": 7.5300957077141164e-06, + "loss": 0.7799, + "step": 12099 + }, + { + "epoch": 0.665969508503495, + "grad_norm": 0.7459661364555359, + "learning_rate": 7.5297218236058456e-06, + "loss": 0.8273, + "step": 12100 + }, + { + "epoch": 0.6660245473058506, + "grad_norm": 0.7570028901100159, + "learning_rate": 7.529347920484862e-06, + "loss": 0.7622, + "step": 12101 + }, + { + "epoch": 0.6660795861082063, + "grad_norm": 0.733403205871582, + "learning_rate": 7.528973998353977e-06, + "loss": 0.8357, + "step": 12102 + }, + { + "epoch": 0.666134624910562, + "grad_norm": 0.8814442753791809, + "learning_rate": 7.528600057216e-06, + "loss": 0.727, + "step": 12103 + }, + { + "epoch": 0.6661896637129177, + "grad_norm": 0.629338800907135, + "learning_rate": 7.528226097073742e-06, + "loss": 0.6758, + "step": 12104 + }, + { + "epoch": 0.6662447025152732, + "grad_norm": 0.7786098122596741, + "learning_rate": 7.527852117930014e-06, + "loss": 0.7476, + "step": 12105 + }, + { + "epoch": 0.6662997413176289, + "grad_norm": 0.6604528427124023, + "learning_rate": 7.527478119787626e-06, + "loss": 0.7275, + "step": 12106 + }, + { + "epoch": 0.6663547801199846, + "grad_norm": 0.6937400698661804, + "learning_rate": 7.527104102649387e-06, + "loss": 0.7187, + "step": 12107 + }, + { + "epoch": 0.6664098189223403, + "grad_norm": 0.6863219738006592, + "learning_rate": 7.526730066518113e-06, + "loss": 0.7512, + "step": 12108 + }, + { + "epoch": 0.6664648577246959, + "grad_norm": 0.7771461606025696, + "learning_rate": 7.526356011396609e-06, + "loss": 0.8439, + "step": 12109 + }, + { + "epoch": 0.6665198965270516, + "grad_norm": 0.7223722338676453, + "learning_rate": 7.525981937287692e-06, + "loss": 0.6488, + "step": 12110 + }, + { + "epoch": 0.6665749353294073, + "grad_norm": 0.8091556429862976, + "learning_rate": 7.52560784419417e-06, + "loss": 0.6618, + "step": 12111 + }, + { + "epoch": 0.666629974131763, + "grad_norm": 0.6435044407844543, + "learning_rate": 7.525233732118856e-06, + "loss": 0.6994, + "step": 12112 + }, + { + "epoch": 0.6666850129341185, + "grad_norm": 0.6933714151382446, + "learning_rate": 7.52485960106456e-06, + "loss": 0.6917, + "step": 12113 + }, + { + "epoch": 0.6667400517364742, + "grad_norm": 0.693192720413208, + "learning_rate": 7.524485451034097e-06, + "loss": 0.7941, + "step": 12114 + }, + { + "epoch": 0.6667950905388299, + "grad_norm": 1.1374844312667847, + "learning_rate": 7.524111282030275e-06, + "loss": 0.9112, + "step": 12115 + }, + { + "epoch": 0.6668501293411855, + "grad_norm": 0.6917465329170227, + "learning_rate": 7.523737094055911e-06, + "loss": 0.681, + "step": 12116 + }, + { + "epoch": 0.6669051681435412, + "grad_norm": 0.8057913184165955, + "learning_rate": 7.523362887113812e-06, + "loss": 0.8186, + "step": 12117 + }, + { + "epoch": 0.6669602069458969, + "grad_norm": 0.7194918394088745, + "learning_rate": 7.522988661206795e-06, + "loss": 0.7875, + "step": 12118 + }, + { + "epoch": 0.6670152457482525, + "grad_norm": 0.6829916834831238, + "learning_rate": 7.52261441633767e-06, + "loss": 0.6506, + "step": 12119 + }, + { + "epoch": 0.6670702845506081, + "grad_norm": 0.7869738936424255, + "learning_rate": 7.5222401525092495e-06, + "loss": 0.7091, + "step": 12120 + }, + { + "epoch": 0.6671253233529638, + "grad_norm": 0.6835895776748657, + "learning_rate": 7.5218658697243475e-06, + "loss": 0.7839, + "step": 12121 + }, + { + "epoch": 0.6671803621553195, + "grad_norm": 0.7462154030799866, + "learning_rate": 7.521491567985776e-06, + "loss": 0.7073, + "step": 12122 + }, + { + "epoch": 0.6672354009576752, + "grad_norm": 0.6413764953613281, + "learning_rate": 7.52111724729635e-06, + "loss": 0.6472, + "step": 12123 + }, + { + "epoch": 0.6672904397600308, + "grad_norm": 0.7085923552513123, + "learning_rate": 7.520742907658881e-06, + "loss": 0.8167, + "step": 12124 + }, + { + "epoch": 0.6673454785623865, + "grad_norm": 0.6490428447723389, + "learning_rate": 7.520368549076182e-06, + "loss": 0.7693, + "step": 12125 + }, + { + "epoch": 0.6674005173647422, + "grad_norm": 0.7082974910736084, + "learning_rate": 7.51999417155107e-06, + "loss": 0.6707, + "step": 12126 + }, + { + "epoch": 0.6674555561670978, + "grad_norm": 0.704335629940033, + "learning_rate": 7.519619775086355e-06, + "loss": 0.825, + "step": 12127 + }, + { + "epoch": 0.6675105949694534, + "grad_norm": 0.6815123558044434, + "learning_rate": 7.519245359684852e-06, + "loss": 0.762, + "step": 12128 + }, + { + "epoch": 0.6675656337718091, + "grad_norm": 0.6497910618782043, + "learning_rate": 7.518870925349376e-06, + "loss": 0.6934, + "step": 12129 + }, + { + "epoch": 0.6676206725741648, + "grad_norm": 0.6699943542480469, + "learning_rate": 7.51849647208274e-06, + "loss": 0.7816, + "step": 12130 + }, + { + "epoch": 0.6676757113765205, + "grad_norm": 0.7139337062835693, + "learning_rate": 7.51812199988776e-06, + "loss": 0.679, + "step": 12131 + }, + { + "epoch": 0.6677307501788761, + "grad_norm": 0.6762346029281616, + "learning_rate": 7.517747508767248e-06, + "loss": 0.7477, + "step": 12132 + }, + { + "epoch": 0.6677857889812318, + "grad_norm": 0.7429338693618774, + "learning_rate": 7.517372998724017e-06, + "loss": 0.7549, + "step": 12133 + }, + { + "epoch": 0.6678408277835874, + "grad_norm": 0.7392850518226624, + "learning_rate": 7.516998469760888e-06, + "loss": 0.8167, + "step": 12134 + }, + { + "epoch": 0.6678958665859431, + "grad_norm": 0.7511306405067444, + "learning_rate": 7.516623921880671e-06, + "loss": 0.7264, + "step": 12135 + }, + { + "epoch": 0.6679509053882987, + "grad_norm": 0.6757550835609436, + "learning_rate": 7.516249355086183e-06, + "loss": 0.7405, + "step": 12136 + }, + { + "epoch": 0.6680059441906544, + "grad_norm": 0.7433735132217407, + "learning_rate": 7.515874769380238e-06, + "loss": 0.7954, + "step": 12137 + }, + { + "epoch": 0.6680609829930101, + "grad_norm": 0.7390886545181274, + "learning_rate": 7.51550016476565e-06, + "loss": 0.7487, + "step": 12138 + }, + { + "epoch": 0.6681160217953658, + "grad_norm": 0.7405929565429688, + "learning_rate": 7.5151255412452385e-06, + "loss": 0.8127, + "step": 12139 + }, + { + "epoch": 0.6681710605977214, + "grad_norm": 0.6628968715667725, + "learning_rate": 7.514750898821817e-06, + "loss": 0.7009, + "step": 12140 + }, + { + "epoch": 0.668226099400077, + "grad_norm": 0.6777421832084656, + "learning_rate": 7.514376237498199e-06, + "loss": 0.6689, + "step": 12141 + }, + { + "epoch": 0.6682811382024327, + "grad_norm": 0.617261528968811, + "learning_rate": 7.514001557277202e-06, + "loss": 0.7597, + "step": 12142 + }, + { + "epoch": 0.6683361770047884, + "grad_norm": 0.6666202545166016, + "learning_rate": 7.5136268581616446e-06, + "loss": 0.6623, + "step": 12143 + }, + { + "epoch": 0.668391215807144, + "grad_norm": 0.7170178890228271, + "learning_rate": 7.513252140154339e-06, + "loss": 0.8224, + "step": 12144 + }, + { + "epoch": 0.6684462546094997, + "grad_norm": 0.6173199415206909, + "learning_rate": 7.512877403258103e-06, + "loss": 0.6784, + "step": 12145 + }, + { + "epoch": 0.6685012934118554, + "grad_norm": 0.6906641125679016, + "learning_rate": 7.512502647475753e-06, + "loss": 0.6649, + "step": 12146 + }, + { + "epoch": 0.6685563322142111, + "grad_norm": 0.6435873508453369, + "learning_rate": 7.5121278728101065e-06, + "loss": 0.751, + "step": 12147 + }, + { + "epoch": 0.6686113710165666, + "grad_norm": 0.8345947861671448, + "learning_rate": 7.511753079263978e-06, + "loss": 0.7841, + "step": 12148 + }, + { + "epoch": 0.6686664098189223, + "grad_norm": 0.6952378153800964, + "learning_rate": 7.511378266840187e-06, + "loss": 0.8187, + "step": 12149 + }, + { + "epoch": 0.668721448621278, + "grad_norm": 0.6878920793533325, + "learning_rate": 7.5110034355415484e-06, + "loss": 0.6726, + "step": 12150 + }, + { + "epoch": 0.6687764874236337, + "grad_norm": 0.7119094729423523, + "learning_rate": 7.5106285853708805e-06, + "loss": 0.7824, + "step": 12151 + }, + { + "epoch": 0.6688315262259893, + "grad_norm": 0.7261053323745728, + "learning_rate": 7.5102537163309994e-06, + "loss": 0.7122, + "step": 12152 + }, + { + "epoch": 0.668886565028345, + "grad_norm": 0.717268168926239, + "learning_rate": 7.509878828424725e-06, + "loss": 0.7144, + "step": 12153 + }, + { + "epoch": 0.6689416038307007, + "grad_norm": 0.8373270630836487, + "learning_rate": 7.5095039216548725e-06, + "loss": 0.7941, + "step": 12154 + }, + { + "epoch": 0.6689966426330564, + "grad_norm": 0.7113829851150513, + "learning_rate": 7.509128996024259e-06, + "loss": 0.705, + "step": 12155 + }, + { + "epoch": 0.6690516814354119, + "grad_norm": 0.7894094586372375, + "learning_rate": 7.508754051535705e-06, + "loss": 0.8284, + "step": 12156 + }, + { + "epoch": 0.6691067202377676, + "grad_norm": 0.6739659905433655, + "learning_rate": 7.508379088192028e-06, + "loss": 0.7264, + "step": 12157 + }, + { + "epoch": 0.6691617590401233, + "grad_norm": 0.735211193561554, + "learning_rate": 7.508004105996043e-06, + "loss": 0.8187, + "step": 12158 + }, + { + "epoch": 0.6692167978424789, + "grad_norm": 0.7438055872917175, + "learning_rate": 7.507629104950571e-06, + "loss": 0.8949, + "step": 12159 + }, + { + "epoch": 0.6692718366448346, + "grad_norm": 1.0734246969223022, + "learning_rate": 7.507254085058431e-06, + "loss": 0.7687, + "step": 12160 + }, + { + "epoch": 0.6693268754471903, + "grad_norm": 0.6719897985458374, + "learning_rate": 7.50687904632244e-06, + "loss": 0.7522, + "step": 12161 + }, + { + "epoch": 0.669381914249546, + "grad_norm": 0.7063966989517212, + "learning_rate": 7.506503988745416e-06, + "loss": 0.7794, + "step": 12162 + }, + { + "epoch": 0.6694369530519015, + "grad_norm": 0.6582265496253967, + "learning_rate": 7.506128912330179e-06, + "loss": 0.7012, + "step": 12163 + }, + { + "epoch": 0.6694919918542572, + "grad_norm": 0.7764506340026855, + "learning_rate": 7.50575381707955e-06, + "loss": 0.7816, + "step": 12164 + }, + { + "epoch": 0.6695470306566129, + "grad_norm": 0.7659780383110046, + "learning_rate": 7.505378702996344e-06, + "loss": 0.753, + "step": 12165 + }, + { + "epoch": 0.6696020694589686, + "grad_norm": 0.9013122916221619, + "learning_rate": 7.505003570083385e-06, + "loss": 0.8255, + "step": 12166 + }, + { + "epoch": 0.6696571082613242, + "grad_norm": 0.6417272686958313, + "learning_rate": 7.504628418343487e-06, + "loss": 0.6236, + "step": 12167 + }, + { + "epoch": 0.6697121470636799, + "grad_norm": 0.7511595487594604, + "learning_rate": 7.504253247779474e-06, + "loss": 0.7961, + "step": 12168 + }, + { + "epoch": 0.6697671858660356, + "grad_norm": 0.7987878918647766, + "learning_rate": 7.503878058394163e-06, + "loss": 0.7249, + "step": 12169 + }, + { + "epoch": 0.6698222246683913, + "grad_norm": 0.6860646605491638, + "learning_rate": 7.503502850190374e-06, + "loss": 0.7973, + "step": 12170 + }, + { + "epoch": 0.6698772634707468, + "grad_norm": 0.7334334850311279, + "learning_rate": 7.50312762317093e-06, + "loss": 0.8756, + "step": 12171 + }, + { + "epoch": 0.6699323022731025, + "grad_norm": 0.7792186737060547, + "learning_rate": 7.502752377338647e-06, + "loss": 0.8393, + "step": 12172 + }, + { + "epoch": 0.6699873410754582, + "grad_norm": 0.6532536149024963, + "learning_rate": 7.502377112696346e-06, + "loss": 0.6509, + "step": 12173 + }, + { + "epoch": 0.6700423798778139, + "grad_norm": 0.6595458984375, + "learning_rate": 7.50200182924685e-06, + "loss": 0.781, + "step": 12174 + }, + { + "epoch": 0.6700974186801695, + "grad_norm": 0.6668636202812195, + "learning_rate": 7.501626526992978e-06, + "loss": 0.7702, + "step": 12175 + }, + { + "epoch": 0.6701524574825252, + "grad_norm": 0.686851441860199, + "learning_rate": 7.501251205937551e-06, + "loss": 0.8648, + "step": 12176 + }, + { + "epoch": 0.6702074962848809, + "grad_norm": 0.7363078594207764, + "learning_rate": 7.500875866083388e-06, + "loss": 0.7309, + "step": 12177 + }, + { + "epoch": 0.6702625350872365, + "grad_norm": 0.6927379369735718, + "learning_rate": 7.500500507433312e-06, + "loss": 0.7258, + "step": 12178 + }, + { + "epoch": 0.6703175738895921, + "grad_norm": 0.6589936017990112, + "learning_rate": 7.5001251299901455e-06, + "loss": 0.6776, + "step": 12179 + }, + { + "epoch": 0.6703726126919478, + "grad_norm": 0.6402539610862732, + "learning_rate": 7.499749733756707e-06, + "loss": 0.7467, + "step": 12180 + }, + { + "epoch": 0.6704276514943035, + "grad_norm": 0.776469886302948, + "learning_rate": 7.499374318735817e-06, + "loss": 0.7856, + "step": 12181 + }, + { + "epoch": 0.6704826902966592, + "grad_norm": 0.7062460780143738, + "learning_rate": 7.4989988849303e-06, + "loss": 0.8286, + "step": 12182 + }, + { + "epoch": 0.6705377290990148, + "grad_norm": 0.6725799441337585, + "learning_rate": 7.4986234323429755e-06, + "loss": 0.7517, + "step": 12183 + }, + { + "epoch": 0.6705927679013705, + "grad_norm": 0.6444042921066284, + "learning_rate": 7.498247960976667e-06, + "loss": 0.5984, + "step": 12184 + }, + { + "epoch": 0.6706478067037261, + "grad_norm": 0.6968628764152527, + "learning_rate": 7.497872470834195e-06, + "loss": 0.6996, + "step": 12185 + }, + { + "epoch": 0.6707028455060818, + "grad_norm": 0.643500030040741, + "learning_rate": 7.497496961918381e-06, + "loss": 0.6252, + "step": 12186 + }, + { + "epoch": 0.6707578843084374, + "grad_norm": 0.7026870846748352, + "learning_rate": 7.49712143423205e-06, + "loss": 0.7883, + "step": 12187 + }, + { + "epoch": 0.6708129231107931, + "grad_norm": 0.8169240951538086, + "learning_rate": 7.496745887778022e-06, + "loss": 0.6717, + "step": 12188 + }, + { + "epoch": 0.6708679619131488, + "grad_norm": 0.6611927151679993, + "learning_rate": 7.496370322559121e-06, + "loss": 0.6674, + "step": 12189 + }, + { + "epoch": 0.6709230007155045, + "grad_norm": 0.7330195307731628, + "learning_rate": 7.495994738578169e-06, + "loss": 0.7809, + "step": 12190 + }, + { + "epoch": 0.6709780395178601, + "grad_norm": 0.6469636559486389, + "learning_rate": 7.495619135837988e-06, + "loss": 0.6511, + "step": 12191 + }, + { + "epoch": 0.6710330783202157, + "grad_norm": 0.6558564901351929, + "learning_rate": 7.495243514341402e-06, + "loss": 0.7284, + "step": 12192 + }, + { + "epoch": 0.6710881171225714, + "grad_norm": 0.6736281514167786, + "learning_rate": 7.494867874091233e-06, + "loss": 0.7007, + "step": 12193 + }, + { + "epoch": 0.6711431559249271, + "grad_norm": 0.7302053570747375, + "learning_rate": 7.494492215090304e-06, + "loss": 0.77, + "step": 12194 + }, + { + "epoch": 0.6711981947272827, + "grad_norm": 0.7368764877319336, + "learning_rate": 7.494116537341442e-06, + "loss": 0.8478, + "step": 12195 + }, + { + "epoch": 0.6712532335296384, + "grad_norm": 0.782767653465271, + "learning_rate": 7.493740840847466e-06, + "loss": 0.813, + "step": 12196 + }, + { + "epoch": 0.6713082723319941, + "grad_norm": 0.6787601113319397, + "learning_rate": 7.493365125611202e-06, + "loss": 0.7507, + "step": 12197 + }, + { + "epoch": 0.6713633111343498, + "grad_norm": 0.6912569999694824, + "learning_rate": 7.4929893916354715e-06, + "loss": 0.8003, + "step": 12198 + }, + { + "epoch": 0.6714183499367053, + "grad_norm": 0.7625328898429871, + "learning_rate": 7.4926136389231005e-06, + "loss": 0.8021, + "step": 12199 + }, + { + "epoch": 0.671473388739061, + "grad_norm": 0.6720984578132629, + "learning_rate": 7.4922378674769146e-06, + "loss": 0.7757, + "step": 12200 + }, + { + "epoch": 0.6715284275414167, + "grad_norm": 0.7816714644432068, + "learning_rate": 7.491862077299734e-06, + "loss": 0.7086, + "step": 12201 + }, + { + "epoch": 0.6715834663437723, + "grad_norm": 0.7546358108520508, + "learning_rate": 7.491486268394387e-06, + "loss": 0.8365, + "step": 12202 + }, + { + "epoch": 0.671638505146128, + "grad_norm": 0.7201979756355286, + "learning_rate": 7.491110440763695e-06, + "loss": 0.835, + "step": 12203 + }, + { + "epoch": 0.6716935439484837, + "grad_norm": 0.8177551031112671, + "learning_rate": 7.490734594410484e-06, + "loss": 0.8636, + "step": 12204 + }, + { + "epoch": 0.6717485827508394, + "grad_norm": 0.7433933019638062, + "learning_rate": 7.490358729337578e-06, + "loss": 0.745, + "step": 12205 + }, + { + "epoch": 0.671803621553195, + "grad_norm": 0.8013591170310974, + "learning_rate": 7.489982845547802e-06, + "loss": 0.7638, + "step": 12206 + }, + { + "epoch": 0.6718586603555506, + "grad_norm": 0.6561495065689087, + "learning_rate": 7.489606943043982e-06, + "loss": 0.7997, + "step": 12207 + }, + { + "epoch": 0.6719136991579063, + "grad_norm": 0.7291023135185242, + "learning_rate": 7.489231021828943e-06, + "loss": 0.7452, + "step": 12208 + }, + { + "epoch": 0.671968737960262, + "grad_norm": 0.6978216171264648, + "learning_rate": 7.488855081905511e-06, + "loss": 0.7984, + "step": 12209 + }, + { + "epoch": 0.6720237767626176, + "grad_norm": 0.701006293296814, + "learning_rate": 7.488479123276507e-06, + "loss": 0.7218, + "step": 12210 + }, + { + "epoch": 0.6720788155649733, + "grad_norm": 0.7275286912918091, + "learning_rate": 7.488103145944763e-06, + "loss": 0.6872, + "step": 12211 + }, + { + "epoch": 0.672133854367329, + "grad_norm": 0.7319645881652832, + "learning_rate": 7.487727149913101e-06, + "loss": 0.7862, + "step": 12212 + }, + { + "epoch": 0.6721888931696847, + "grad_norm": 0.7143612504005432, + "learning_rate": 7.487351135184348e-06, + "loss": 0.838, + "step": 12213 + }, + { + "epoch": 0.6722439319720402, + "grad_norm": 0.7135382294654846, + "learning_rate": 7.486975101761329e-06, + "loss": 0.7263, + "step": 12214 + }, + { + "epoch": 0.6722989707743959, + "grad_norm": 0.6283460259437561, + "learning_rate": 7.486599049646872e-06, + "loss": 0.7262, + "step": 12215 + }, + { + "epoch": 0.6723540095767516, + "grad_norm": 0.7196768522262573, + "learning_rate": 7.486222978843801e-06, + "loss": 0.6752, + "step": 12216 + }, + { + "epoch": 0.6724090483791073, + "grad_norm": 0.5856572389602661, + "learning_rate": 7.485846889354944e-06, + "loss": 0.6779, + "step": 12217 + }, + { + "epoch": 0.6724640871814629, + "grad_norm": 0.7671294808387756, + "learning_rate": 7.485470781183126e-06, + "loss": 0.766, + "step": 12218 + }, + { + "epoch": 0.6725191259838186, + "grad_norm": 0.6780520677566528, + "learning_rate": 7.485094654331177e-06, + "loss": 0.7474, + "step": 12219 + }, + { + "epoch": 0.6725741647861743, + "grad_norm": 0.7537981867790222, + "learning_rate": 7.484718508801921e-06, + "loss": 0.8347, + "step": 12220 + }, + { + "epoch": 0.67262920358853, + "grad_norm": 0.7451551556587219, + "learning_rate": 7.484342344598186e-06, + "loss": 0.8217, + "step": 12221 + }, + { + "epoch": 0.6726842423908855, + "grad_norm": 0.6656951904296875, + "learning_rate": 7.483966161722798e-06, + "loss": 0.7437, + "step": 12222 + }, + { + "epoch": 0.6727392811932412, + "grad_norm": 0.7306267619132996, + "learning_rate": 7.483589960178586e-06, + "loss": 0.8495, + "step": 12223 + }, + { + "epoch": 0.6727943199955969, + "grad_norm": 0.6619658470153809, + "learning_rate": 7.483213739968376e-06, + "loss": 0.6379, + "step": 12224 + }, + { + "epoch": 0.6728493587979526, + "grad_norm": 0.7066444754600525, + "learning_rate": 7.4828375010949974e-06, + "loss": 0.7307, + "step": 12225 + }, + { + "epoch": 0.6729043976003082, + "grad_norm": 0.7356079816818237, + "learning_rate": 7.482461243561276e-06, + "loss": 0.7781, + "step": 12226 + }, + { + "epoch": 0.6729594364026639, + "grad_norm": 0.6759988069534302, + "learning_rate": 7.48208496737004e-06, + "loss": 0.7808, + "step": 12227 + }, + { + "epoch": 0.6730144752050196, + "grad_norm": 0.7519234418869019, + "learning_rate": 7.481708672524119e-06, + "loss": 0.7948, + "step": 12228 + }, + { + "epoch": 0.6730695140073752, + "grad_norm": 0.6387592554092407, + "learning_rate": 7.48133235902634e-06, + "loss": 0.7423, + "step": 12229 + }, + { + "epoch": 0.6731245528097308, + "grad_norm": 1.0615060329437256, + "learning_rate": 7.480956026879529e-06, + "loss": 0.8668, + "step": 12230 + }, + { + "epoch": 0.6731795916120865, + "grad_norm": 0.7578469514846802, + "learning_rate": 7.480579676086519e-06, + "loss": 0.812, + "step": 12231 + }, + { + "epoch": 0.6732346304144422, + "grad_norm": 0.6669226884841919, + "learning_rate": 7.480203306650134e-06, + "loss": 0.7002, + "step": 12232 + }, + { + "epoch": 0.6732896692167979, + "grad_norm": 0.7110459208488464, + "learning_rate": 7.479826918573208e-06, + "loss": 0.8542, + "step": 12233 + }, + { + "epoch": 0.6733447080191535, + "grad_norm": 0.6632254123687744, + "learning_rate": 7.479450511858563e-06, + "loss": 0.6784, + "step": 12234 + }, + { + "epoch": 0.6733997468215092, + "grad_norm": 0.7368438839912415, + "learning_rate": 7.479074086509032e-06, + "loss": 0.7683, + "step": 12235 + }, + { + "epoch": 0.6734547856238648, + "grad_norm": 0.764905571937561, + "learning_rate": 7.478697642527447e-06, + "loss": 0.7585, + "step": 12236 + }, + { + "epoch": 0.6735098244262205, + "grad_norm": 0.7141197323799133, + "learning_rate": 7.478321179916632e-06, + "loss": 0.7409, + "step": 12237 + }, + { + "epoch": 0.6735648632285761, + "grad_norm": 0.6514197587966919, + "learning_rate": 7.477944698679419e-06, + "loss": 0.7623, + "step": 12238 + }, + { + "epoch": 0.6736199020309318, + "grad_norm": 0.7712671160697937, + "learning_rate": 7.477568198818636e-06, + "loss": 0.777, + "step": 12239 + }, + { + "epoch": 0.6736749408332875, + "grad_norm": 0.6690881252288818, + "learning_rate": 7.4771916803371145e-06, + "loss": 0.7275, + "step": 12240 + }, + { + "epoch": 0.6737299796356432, + "grad_norm": 0.7206465601921082, + "learning_rate": 7.476815143237683e-06, + "loss": 0.853, + "step": 12241 + }, + { + "epoch": 0.6737850184379988, + "grad_norm": 0.7052504420280457, + "learning_rate": 7.476438587523171e-06, + "loss": 0.774, + "step": 12242 + }, + { + "epoch": 0.6738400572403545, + "grad_norm": 1.6168169975280762, + "learning_rate": 7.476062013196411e-06, + "loss": 0.7423, + "step": 12243 + }, + { + "epoch": 0.6738950960427101, + "grad_norm": 0.715300977230072, + "learning_rate": 7.475685420260232e-06, + "loss": 0.78, + "step": 12244 + }, + { + "epoch": 0.6739501348450657, + "grad_norm": 0.7774379253387451, + "learning_rate": 7.475308808717463e-06, + "loss": 0.885, + "step": 12245 + }, + { + "epoch": 0.6740051736474214, + "grad_norm": 0.6998060941696167, + "learning_rate": 7.474932178570935e-06, + "loss": 0.807, + "step": 12246 + }, + { + "epoch": 0.6740602124497771, + "grad_norm": 0.6710013747215271, + "learning_rate": 7.47455552982348e-06, + "loss": 0.7639, + "step": 12247 + }, + { + "epoch": 0.6741152512521328, + "grad_norm": 0.707435667514801, + "learning_rate": 7.474178862477929e-06, + "loss": 0.7914, + "step": 12248 + }, + { + "epoch": 0.6741702900544884, + "grad_norm": 0.7344105243682861, + "learning_rate": 7.47380217653711e-06, + "loss": 0.7464, + "step": 12249 + }, + { + "epoch": 0.674225328856844, + "grad_norm": 0.7157585620880127, + "learning_rate": 7.473425472003858e-06, + "loss": 0.7747, + "step": 12250 + }, + { + "epoch": 0.6742803676591997, + "grad_norm": 0.6978434920310974, + "learning_rate": 7.473048748881001e-06, + "loss": 0.6903, + "step": 12251 + }, + { + "epoch": 0.6743354064615554, + "grad_norm": 0.6454086899757385, + "learning_rate": 7.472672007171372e-06, + "loss": 0.725, + "step": 12252 + }, + { + "epoch": 0.674390445263911, + "grad_norm": 0.6729341745376587, + "learning_rate": 7.4722952468778035e-06, + "loss": 0.7704, + "step": 12253 + }, + { + "epoch": 0.6744454840662667, + "grad_norm": 0.7995265126228333, + "learning_rate": 7.471918468003122e-06, + "loss": 0.7567, + "step": 12254 + }, + { + "epoch": 0.6745005228686224, + "grad_norm": 0.729629397392273, + "learning_rate": 7.471541670550165e-06, + "loss": 0.796, + "step": 12255 + }, + { + "epoch": 0.6745555616709781, + "grad_norm": 0.6923666000366211, + "learning_rate": 7.471164854521764e-06, + "loss": 0.6894, + "step": 12256 + }, + { + "epoch": 0.6746106004733337, + "grad_norm": 0.6485042572021484, + "learning_rate": 7.470788019920747e-06, + "loss": 0.6912, + "step": 12257 + }, + { + "epoch": 0.6746656392756893, + "grad_norm": 0.7569034099578857, + "learning_rate": 7.470411166749949e-06, + "loss": 0.8167, + "step": 12258 + }, + { + "epoch": 0.674720678078045, + "grad_norm": 0.6202835440635681, + "learning_rate": 7.470034295012203e-06, + "loss": 0.6409, + "step": 12259 + }, + { + "epoch": 0.6747757168804007, + "grad_norm": 0.6414007544517517, + "learning_rate": 7.4696574047103395e-06, + "loss": 0.7163, + "step": 12260 + }, + { + "epoch": 0.6748307556827563, + "grad_norm": 0.7012181878089905, + "learning_rate": 7.469280495847193e-06, + "loss": 0.7682, + "step": 12261 + }, + { + "epoch": 0.674885794485112, + "grad_norm": 0.7027888298034668, + "learning_rate": 7.468903568425596e-06, + "loss": 0.7561, + "step": 12262 + }, + { + "epoch": 0.6749408332874677, + "grad_norm": 0.7282221913337708, + "learning_rate": 7.4685266224483785e-06, + "loss": 0.7552, + "step": 12263 + }, + { + "epoch": 0.6749958720898234, + "grad_norm": 0.7349117398262024, + "learning_rate": 7.468149657918377e-06, + "loss": 0.8323, + "step": 12264 + }, + { + "epoch": 0.675050910892179, + "grad_norm": 0.8992187976837158, + "learning_rate": 7.467772674838424e-06, + "loss": 0.7589, + "step": 12265 + }, + { + "epoch": 0.6751059496945346, + "grad_norm": 0.6773034930229187, + "learning_rate": 7.4673956732113505e-06, + "loss": 0.7229, + "step": 12266 + }, + { + "epoch": 0.6751609884968903, + "grad_norm": 0.6563699841499329, + "learning_rate": 7.467018653039992e-06, + "loss": 0.7526, + "step": 12267 + }, + { + "epoch": 0.675216027299246, + "grad_norm": 0.7559765577316284, + "learning_rate": 7.466641614327181e-06, + "loss": 0.708, + "step": 12268 + }, + { + "epoch": 0.6752710661016016, + "grad_norm": 0.7077820897102356, + "learning_rate": 7.4662645570757545e-06, + "loss": 0.6568, + "step": 12269 + }, + { + "epoch": 0.6753261049039573, + "grad_norm": 0.8082162141799927, + "learning_rate": 7.465887481288541e-06, + "loss": 0.8751, + "step": 12270 + }, + { + "epoch": 0.675381143706313, + "grad_norm": 0.6940243244171143, + "learning_rate": 7.465510386968377e-06, + "loss": 0.7826, + "step": 12271 + }, + { + "epoch": 0.6754361825086687, + "grad_norm": 0.6634145379066467, + "learning_rate": 7.465133274118099e-06, + "loss": 0.6816, + "step": 12272 + }, + { + "epoch": 0.6754912213110242, + "grad_norm": 0.6797559857368469, + "learning_rate": 7.464756142740539e-06, + "loss": 0.7101, + "step": 12273 + }, + { + "epoch": 0.6755462601133799, + "grad_norm": 0.7696588635444641, + "learning_rate": 7.464378992838531e-06, + "loss": 0.8114, + "step": 12274 + }, + { + "epoch": 0.6756012989157356, + "grad_norm": 0.6733334064483643, + "learning_rate": 7.4640018244149105e-06, + "loss": 0.7585, + "step": 12275 + }, + { + "epoch": 0.6756563377180913, + "grad_norm": 0.7087474465370178, + "learning_rate": 7.463624637472512e-06, + "loss": 0.6911, + "step": 12276 + }, + { + "epoch": 0.6757113765204469, + "grad_norm": 0.6944451928138733, + "learning_rate": 7.46324743201417e-06, + "loss": 0.7726, + "step": 12277 + }, + { + "epoch": 0.6757664153228026, + "grad_norm": 0.7214855551719666, + "learning_rate": 7.46287020804272e-06, + "loss": 0.7844, + "step": 12278 + }, + { + "epoch": 0.6758214541251583, + "grad_norm": 0.7106257677078247, + "learning_rate": 7.462492965560995e-06, + "loss": 0.7724, + "step": 12279 + }, + { + "epoch": 0.675876492927514, + "grad_norm": 0.7403497695922852, + "learning_rate": 7.462115704571833e-06, + "loss": 0.7558, + "step": 12280 + }, + { + "epoch": 0.6759315317298695, + "grad_norm": 0.7157884836196899, + "learning_rate": 7.4617384250780685e-06, + "loss": 0.6681, + "step": 12281 + }, + { + "epoch": 0.6759865705322252, + "grad_norm": 0.6937661170959473, + "learning_rate": 7.461361127082538e-06, + "loss": 0.7852, + "step": 12282 + }, + { + "epoch": 0.6760416093345809, + "grad_norm": 0.7106412053108215, + "learning_rate": 7.4609838105880735e-06, + "loss": 0.7689, + "step": 12283 + }, + { + "epoch": 0.6760966481369366, + "grad_norm": 0.6860619187355042, + "learning_rate": 7.460606475597516e-06, + "loss": 0.6528, + "step": 12284 + }, + { + "epoch": 0.6761516869392922, + "grad_norm": 0.7085865139961243, + "learning_rate": 7.460229122113698e-06, + "loss": 0.7303, + "step": 12285 + }, + { + "epoch": 0.6762067257416479, + "grad_norm": 0.6648178100585938, + "learning_rate": 7.459851750139457e-06, + "loss": 0.6751, + "step": 12286 + }, + { + "epoch": 0.6762617645440036, + "grad_norm": 0.74468594789505, + "learning_rate": 7.459474359677629e-06, + "loss": 0.756, + "step": 12287 + }, + { + "epoch": 0.6763168033463591, + "grad_norm": 0.6408486366271973, + "learning_rate": 7.459096950731048e-06, + "loss": 0.7737, + "step": 12288 + }, + { + "epoch": 0.6763718421487148, + "grad_norm": 0.7204515933990479, + "learning_rate": 7.458719523302556e-06, + "loss": 0.7845, + "step": 12289 + }, + { + "epoch": 0.6764268809510705, + "grad_norm": 0.7373428344726562, + "learning_rate": 7.458342077394984e-06, + "loss": 0.7245, + "step": 12290 + }, + { + "epoch": 0.6764819197534262, + "grad_norm": 0.701654851436615, + "learning_rate": 7.45796461301117e-06, + "loss": 0.7711, + "step": 12291 + }, + { + "epoch": 0.6765369585557818, + "grad_norm": 0.7002573013305664, + "learning_rate": 7.4575871301539526e-06, + "loss": 0.8138, + "step": 12292 + }, + { + "epoch": 0.6765919973581375, + "grad_norm": 0.7460681200027466, + "learning_rate": 7.45720962882617e-06, + "loss": 0.8012, + "step": 12293 + }, + { + "epoch": 0.6766470361604932, + "grad_norm": 0.6478421092033386, + "learning_rate": 7.456832109030655e-06, + "loss": 0.7161, + "step": 12294 + }, + { + "epoch": 0.6767020749628488, + "grad_norm": 0.7101582288742065, + "learning_rate": 7.456454570770248e-06, + "loss": 0.7348, + "step": 12295 + }, + { + "epoch": 0.6767571137652044, + "grad_norm": 0.7735113501548767, + "learning_rate": 7.4560770140477865e-06, + "loss": 0.7584, + "step": 12296 + }, + { + "epoch": 0.6768121525675601, + "grad_norm": 0.6811535358428955, + "learning_rate": 7.4556994388661085e-06, + "loss": 0.7653, + "step": 12297 + }, + { + "epoch": 0.6768671913699158, + "grad_norm": 0.7445605397224426, + "learning_rate": 7.455321845228051e-06, + "loss": 0.7661, + "step": 12298 + }, + { + "epoch": 0.6769222301722715, + "grad_norm": 0.6862059831619263, + "learning_rate": 7.4549442331364505e-06, + "loss": 0.776, + "step": 12299 + }, + { + "epoch": 0.6769772689746271, + "grad_norm": 0.7030314207077026, + "learning_rate": 7.4545666025941465e-06, + "loss": 0.7393, + "step": 12300 + }, + { + "epoch": 0.6770323077769828, + "grad_norm": 0.6718610525131226, + "learning_rate": 7.454188953603978e-06, + "loss": 0.7375, + "step": 12301 + }, + { + "epoch": 0.6770873465793384, + "grad_norm": 0.6716088652610779, + "learning_rate": 7.453811286168782e-06, + "loss": 0.8021, + "step": 12302 + }, + { + "epoch": 0.6771423853816941, + "grad_norm": 0.8916372656822205, + "learning_rate": 7.453433600291395e-06, + "loss": 0.8274, + "step": 12303 + }, + { + "epoch": 0.6771974241840497, + "grad_norm": 0.7396363615989685, + "learning_rate": 7.45305589597466e-06, + "loss": 0.7892, + "step": 12304 + }, + { + "epoch": 0.6772524629864054, + "grad_norm": 0.8074424862861633, + "learning_rate": 7.452678173221413e-06, + "loss": 0.7586, + "step": 12305 + }, + { + "epoch": 0.6773075017887611, + "grad_norm": 0.6928194165229797, + "learning_rate": 7.452300432034494e-06, + "loss": 0.7914, + "step": 12306 + }, + { + "epoch": 0.6773625405911168, + "grad_norm": 0.7064313292503357, + "learning_rate": 7.451922672416739e-06, + "loss": 0.7948, + "step": 12307 + }, + { + "epoch": 0.6774175793934724, + "grad_norm": 0.6828622221946716, + "learning_rate": 7.451544894370992e-06, + "loss": 0.6723, + "step": 12308 + }, + { + "epoch": 0.677472618195828, + "grad_norm": 0.6794914603233337, + "learning_rate": 7.45116709790009e-06, + "loss": 0.7344, + "step": 12309 + }, + { + "epoch": 0.6775276569981837, + "grad_norm": 0.7643330097198486, + "learning_rate": 7.45078928300687e-06, + "loss": 0.7836, + "step": 12310 + }, + { + "epoch": 0.6775826958005394, + "grad_norm": 0.692569375038147, + "learning_rate": 7.450411449694176e-06, + "loss": 0.7608, + "step": 12311 + }, + { + "epoch": 0.677637734602895, + "grad_norm": 0.7718693614006042, + "learning_rate": 7.4500335979648455e-06, + "loss": 0.7131, + "step": 12312 + }, + { + "epoch": 0.6776927734052507, + "grad_norm": 0.6267405152320862, + "learning_rate": 7.449655727821716e-06, + "loss": 0.7543, + "step": 12313 + }, + { + "epoch": 0.6777478122076064, + "grad_norm": 0.8252732157707214, + "learning_rate": 7.4492778392676325e-06, + "loss": 0.8799, + "step": 12314 + }, + { + "epoch": 0.6778028510099621, + "grad_norm": 0.6310145854949951, + "learning_rate": 7.448899932305429e-06, + "loss": 0.7389, + "step": 12315 + }, + { + "epoch": 0.6778578898123176, + "grad_norm": 0.6115848422050476, + "learning_rate": 7.448522006937951e-06, + "loss": 0.6069, + "step": 12316 + }, + { + "epoch": 0.6779129286146733, + "grad_norm": 0.6809090971946716, + "learning_rate": 7.448144063168038e-06, + "loss": 0.7092, + "step": 12317 + }, + { + "epoch": 0.677967967417029, + "grad_norm": 0.7285470366477966, + "learning_rate": 7.447766100998529e-06, + "loss": 0.714, + "step": 12318 + }, + { + "epoch": 0.6780230062193847, + "grad_norm": 0.6637021899223328, + "learning_rate": 7.447388120432264e-06, + "loss": 0.7247, + "step": 12319 + }, + { + "epoch": 0.6780780450217403, + "grad_norm": 0.7735750675201416, + "learning_rate": 7.447010121472087e-06, + "loss": 0.7616, + "step": 12320 + }, + { + "epoch": 0.678133083824096, + "grad_norm": 0.7643262147903442, + "learning_rate": 7.446632104120836e-06, + "loss": 0.5863, + "step": 12321 + }, + { + "epoch": 0.6781881226264517, + "grad_norm": 0.6957301497459412, + "learning_rate": 7.446254068381352e-06, + "loss": 0.7125, + "step": 12322 + }, + { + "epoch": 0.6782431614288074, + "grad_norm": 0.6573877930641174, + "learning_rate": 7.445876014256479e-06, + "loss": 0.7115, + "step": 12323 + }, + { + "epoch": 0.6782982002311629, + "grad_norm": 0.6507790684700012, + "learning_rate": 7.445497941749056e-06, + "loss": 0.7266, + "step": 12324 + }, + { + "epoch": 0.6783532390335186, + "grad_norm": 0.8314819931983948, + "learning_rate": 7.4451198508619245e-06, + "loss": 0.6902, + "step": 12325 + }, + { + "epoch": 0.6784082778358743, + "grad_norm": 0.6907274127006531, + "learning_rate": 7.444741741597927e-06, + "loss": 0.8253, + "step": 12326 + }, + { + "epoch": 0.67846331663823, + "grad_norm": 0.7311725616455078, + "learning_rate": 7.444363613959904e-06, + "loss": 0.8641, + "step": 12327 + }, + { + "epoch": 0.6785183554405856, + "grad_norm": 0.6690121293067932, + "learning_rate": 7.443985467950701e-06, + "loss": 0.6966, + "step": 12328 + }, + { + "epoch": 0.6785733942429413, + "grad_norm": 0.6444346308708191, + "learning_rate": 7.443607303573155e-06, + "loss": 0.7848, + "step": 12329 + }, + { + "epoch": 0.678628433045297, + "grad_norm": 0.7553900480270386, + "learning_rate": 7.4432291208301125e-06, + "loss": 0.8196, + "step": 12330 + }, + { + "epoch": 0.6786834718476525, + "grad_norm": 0.6393183469772339, + "learning_rate": 7.442850919724411e-06, + "loss": 0.7622, + "step": 12331 + }, + { + "epoch": 0.6787385106500082, + "grad_norm": 0.7045423984527588, + "learning_rate": 7.442472700258898e-06, + "loss": 0.7483, + "step": 12332 + }, + { + "epoch": 0.6787935494523639, + "grad_norm": 0.7536678314208984, + "learning_rate": 7.442094462436414e-06, + "loss": 0.815, + "step": 12333 + }, + { + "epoch": 0.6788485882547196, + "grad_norm": 0.645391047000885, + "learning_rate": 7.441716206259801e-06, + "loss": 0.7394, + "step": 12334 + }, + { + "epoch": 0.6789036270570752, + "grad_norm": 0.8870118260383606, + "learning_rate": 7.441337931731905e-06, + "loss": 0.8076, + "step": 12335 + }, + { + "epoch": 0.6789586658594309, + "grad_norm": 0.6672457456588745, + "learning_rate": 7.440959638855564e-06, + "loss": 0.7573, + "step": 12336 + }, + { + "epoch": 0.6790137046617866, + "grad_norm": 0.7104566693305969, + "learning_rate": 7.440581327633625e-06, + "loss": 0.6855, + "step": 12337 + }, + { + "epoch": 0.6790687434641423, + "grad_norm": 0.7201581001281738, + "learning_rate": 7.4402029980689294e-06, + "loss": 0.7977, + "step": 12338 + }, + { + "epoch": 0.6791237822664978, + "grad_norm": 0.6685218811035156, + "learning_rate": 7.43982465016432e-06, + "loss": 0.8114, + "step": 12339 + }, + { + "epoch": 0.6791788210688535, + "grad_norm": 0.6913738250732422, + "learning_rate": 7.439446283922645e-06, + "loss": 0.7584, + "step": 12340 + }, + { + "epoch": 0.6792338598712092, + "grad_norm": 0.7332273721694946, + "learning_rate": 7.439067899346742e-06, + "loss": 0.7658, + "step": 12341 + }, + { + "epoch": 0.6792888986735649, + "grad_norm": 0.777909517288208, + "learning_rate": 7.438689496439458e-06, + "loss": 0.8064, + "step": 12342 + }, + { + "epoch": 0.6793439374759205, + "grad_norm": 0.7444930076599121, + "learning_rate": 7.438311075203636e-06, + "loss": 0.7896, + "step": 12343 + }, + { + "epoch": 0.6793989762782762, + "grad_norm": 0.7678806781768799, + "learning_rate": 7.4379326356421224e-06, + "loss": 0.8533, + "step": 12344 + }, + { + "epoch": 0.6794540150806319, + "grad_norm": 0.6653377413749695, + "learning_rate": 7.437554177757759e-06, + "loss": 0.7287, + "step": 12345 + }, + { + "epoch": 0.6795090538829875, + "grad_norm": 0.6270567178726196, + "learning_rate": 7.43717570155339e-06, + "loss": 0.6802, + "step": 12346 + }, + { + "epoch": 0.6795640926853431, + "grad_norm": 0.7091223001480103, + "learning_rate": 7.436797207031861e-06, + "loss": 0.7693, + "step": 12347 + }, + { + "epoch": 0.6796191314876988, + "grad_norm": 0.6583104133605957, + "learning_rate": 7.436418694196018e-06, + "loss": 0.7171, + "step": 12348 + }, + { + "epoch": 0.6796741702900545, + "grad_norm": 0.6897410750389099, + "learning_rate": 7.436040163048703e-06, + "loss": 0.7831, + "step": 12349 + }, + { + "epoch": 0.6797292090924102, + "grad_norm": 0.6506269574165344, + "learning_rate": 7.435661613592763e-06, + "loss": 0.8037, + "step": 12350 + }, + { + "epoch": 0.6797842478947658, + "grad_norm": 0.6772280931472778, + "learning_rate": 7.435283045831041e-06, + "loss": 0.8102, + "step": 12351 + }, + { + "epoch": 0.6798392866971215, + "grad_norm": 0.8470273017883301, + "learning_rate": 7.434904459766384e-06, + "loss": 0.7816, + "step": 12352 + }, + { + "epoch": 0.6798943254994771, + "grad_norm": 0.6969698071479797, + "learning_rate": 7.434525855401638e-06, + "loss": 0.6911, + "step": 12353 + }, + { + "epoch": 0.6799493643018328, + "grad_norm": 0.9969611763954163, + "learning_rate": 7.434147232739646e-06, + "loss": 0.7041, + "step": 12354 + }, + { + "epoch": 0.6800044031041884, + "grad_norm": 0.6697688698768616, + "learning_rate": 7.433768591783255e-06, + "loss": 0.6602, + "step": 12355 + }, + { + "epoch": 0.6800594419065441, + "grad_norm": 0.9857928156852722, + "learning_rate": 7.433389932535311e-06, + "loss": 0.6505, + "step": 12356 + }, + { + "epoch": 0.6801144807088998, + "grad_norm": 0.8787727355957031, + "learning_rate": 7.43301125499866e-06, + "loss": 0.7558, + "step": 12357 + }, + { + "epoch": 0.6801695195112555, + "grad_norm": 0.6035268306732178, + "learning_rate": 7.432632559176147e-06, + "loss": 0.6337, + "step": 12358 + }, + { + "epoch": 0.6802245583136111, + "grad_norm": 0.7977258563041687, + "learning_rate": 7.432253845070621e-06, + "loss": 0.7324, + "step": 12359 + }, + { + "epoch": 0.6802795971159667, + "grad_norm": 0.5842836499214172, + "learning_rate": 7.431875112684923e-06, + "loss": 0.677, + "step": 12360 + }, + { + "epoch": 0.6803346359183224, + "grad_norm": 0.7134125828742981, + "learning_rate": 7.431496362021905e-06, + "loss": 0.7034, + "step": 12361 + }, + { + "epoch": 0.6803896747206781, + "grad_norm": 0.7101823091506958, + "learning_rate": 7.431117593084411e-06, + "loss": 0.7526, + "step": 12362 + }, + { + "epoch": 0.6804447135230337, + "grad_norm": 0.6543304920196533, + "learning_rate": 7.4307388058752865e-06, + "loss": 0.7548, + "step": 12363 + }, + { + "epoch": 0.6804997523253894, + "grad_norm": 0.6522945761680603, + "learning_rate": 7.430360000397381e-06, + "loss": 0.7044, + "step": 12364 + }, + { + "epoch": 0.6805547911277451, + "grad_norm": 0.7405091524124146, + "learning_rate": 7.429981176653539e-06, + "loss": 0.8064, + "step": 12365 + }, + { + "epoch": 0.6806098299301008, + "grad_norm": 0.6454355716705322, + "learning_rate": 7.429602334646611e-06, + "loss": 0.7179, + "step": 12366 + }, + { + "epoch": 0.6806648687324564, + "grad_norm": 0.8131621479988098, + "learning_rate": 7.429223474379439e-06, + "loss": 0.7144, + "step": 12367 + }, + { + "epoch": 0.680719907534812, + "grad_norm": 0.7203080058097839, + "learning_rate": 7.428844595854876e-06, + "loss": 0.8189, + "step": 12368 + }, + { + "epoch": 0.6807749463371677, + "grad_norm": 0.650414228439331, + "learning_rate": 7.428465699075767e-06, + "loss": 0.7815, + "step": 12369 + }, + { + "epoch": 0.6808299851395234, + "grad_norm": 0.8152775168418884, + "learning_rate": 7.42808678404496e-06, + "loss": 0.7365, + "step": 12370 + }, + { + "epoch": 0.680885023941879, + "grad_norm": 0.5871601700782776, + "learning_rate": 7.427707850765302e-06, + "loss": 0.6804, + "step": 12371 + }, + { + "epoch": 0.6809400627442347, + "grad_norm": 0.7115684747695923, + "learning_rate": 7.427328899239643e-06, + "loss": 0.728, + "step": 12372 + }, + { + "epoch": 0.6809951015465904, + "grad_norm": 0.6575615406036377, + "learning_rate": 7.426949929470828e-06, + "loss": 0.725, + "step": 12373 + }, + { + "epoch": 0.681050140348946, + "grad_norm": 0.7744095325469971, + "learning_rate": 7.426570941461708e-06, + "loss": 0.7647, + "step": 12374 + }, + { + "epoch": 0.6811051791513016, + "grad_norm": 0.6856220364570618, + "learning_rate": 7.4261919352151305e-06, + "loss": 0.8121, + "step": 12375 + }, + { + "epoch": 0.6811602179536573, + "grad_norm": 0.8197830319404602, + "learning_rate": 7.425812910733943e-06, + "loss": 0.8685, + "step": 12376 + }, + { + "epoch": 0.681215256756013, + "grad_norm": 1.240628719329834, + "learning_rate": 7.425433868020996e-06, + "loss": 0.8063, + "step": 12377 + }, + { + "epoch": 0.6812702955583686, + "grad_norm": 0.8716747760772705, + "learning_rate": 7.425054807079136e-06, + "loss": 0.7384, + "step": 12378 + }, + { + "epoch": 0.6813253343607243, + "grad_norm": 0.7512598037719727, + "learning_rate": 7.4246757279112135e-06, + "loss": 0.7428, + "step": 12379 + }, + { + "epoch": 0.68138037316308, + "grad_norm": 0.7002312541007996, + "learning_rate": 7.424296630520078e-06, + "loss": 0.6066, + "step": 12380 + }, + { + "epoch": 0.6814354119654357, + "grad_norm": 0.6422720551490784, + "learning_rate": 7.423917514908578e-06, + "loss": 0.6645, + "step": 12381 + }, + { + "epoch": 0.6814904507677912, + "grad_norm": 0.8667505383491516, + "learning_rate": 7.423538381079562e-06, + "loss": 0.8663, + "step": 12382 + }, + { + "epoch": 0.6815454895701469, + "grad_norm": 0.7045377492904663, + "learning_rate": 7.423159229035881e-06, + "loss": 0.7684, + "step": 12383 + }, + { + "epoch": 0.6816005283725026, + "grad_norm": 0.7663894295692444, + "learning_rate": 7.422780058780385e-06, + "loss": 0.8051, + "step": 12384 + }, + { + "epoch": 0.6816555671748583, + "grad_norm": 0.7612582445144653, + "learning_rate": 7.42240087031592e-06, + "loss": 0.7771, + "step": 12385 + }, + { + "epoch": 0.6817106059772139, + "grad_norm": 0.8682271838188171, + "learning_rate": 7.42202166364534e-06, + "loss": 0.7761, + "step": 12386 + }, + { + "epoch": 0.6817656447795696, + "grad_norm": 0.712204098701477, + "learning_rate": 7.421642438771492e-06, + "loss": 0.7832, + "step": 12387 + }, + { + "epoch": 0.6818206835819253, + "grad_norm": 0.6726338863372803, + "learning_rate": 7.42126319569723e-06, + "loss": 0.7541, + "step": 12388 + }, + { + "epoch": 0.681875722384281, + "grad_norm": 0.647570788860321, + "learning_rate": 7.420883934425401e-06, + "loss": 0.7281, + "step": 12389 + }, + { + "epoch": 0.6819307611866365, + "grad_norm": 0.7058577537536621, + "learning_rate": 7.420504654958857e-06, + "loss": 0.8315, + "step": 12390 + }, + { + "epoch": 0.6819857999889922, + "grad_norm": 0.6683655977249146, + "learning_rate": 7.420125357300446e-06, + "loss": 0.772, + "step": 12391 + }, + { + "epoch": 0.6820408387913479, + "grad_norm": 0.6768681406974792, + "learning_rate": 7.419746041453022e-06, + "loss": 0.7023, + "step": 12392 + }, + { + "epoch": 0.6820958775937036, + "grad_norm": 0.8037514686584473, + "learning_rate": 7.419366707419434e-06, + "loss": 0.6894, + "step": 12393 + }, + { + "epoch": 0.6821509163960592, + "grad_norm": 0.6510934829711914, + "learning_rate": 7.418987355202534e-06, + "loss": 0.6411, + "step": 12394 + }, + { + "epoch": 0.6822059551984149, + "grad_norm": 0.7628617882728577, + "learning_rate": 7.418607984805173e-06, + "loss": 0.7681, + "step": 12395 + }, + { + "epoch": 0.6822609940007706, + "grad_norm": 0.7146260738372803, + "learning_rate": 7.418228596230201e-06, + "loss": 0.7003, + "step": 12396 + }, + { + "epoch": 0.6823160328031262, + "grad_norm": 0.6208338737487793, + "learning_rate": 7.41784918948047e-06, + "loss": 0.7138, + "step": 12397 + }, + { + "epoch": 0.6823710716054818, + "grad_norm": 0.7859066724777222, + "learning_rate": 7.417469764558832e-06, + "loss": 0.7984, + "step": 12398 + }, + { + "epoch": 0.6824261104078375, + "grad_norm": 0.7636224031448364, + "learning_rate": 7.417090321468138e-06, + "loss": 0.7445, + "step": 12399 + }, + { + "epoch": 0.6824811492101932, + "grad_norm": 0.9071671366691589, + "learning_rate": 7.41671086021124e-06, + "loss": 0.8058, + "step": 12400 + }, + { + "epoch": 0.6825361880125489, + "grad_norm": 0.5986278057098389, + "learning_rate": 7.416331380790991e-06, + "loss": 0.7001, + "step": 12401 + }, + { + "epoch": 0.6825912268149045, + "grad_norm": 0.6812893152236938, + "learning_rate": 7.415951883210242e-06, + "loss": 0.7745, + "step": 12402 + }, + { + "epoch": 0.6826462656172602, + "grad_norm": 0.666362464427948, + "learning_rate": 7.415572367471844e-06, + "loss": 0.7861, + "step": 12403 + }, + { + "epoch": 0.6827013044196159, + "grad_norm": 0.6963029503822327, + "learning_rate": 7.415192833578653e-06, + "loss": 0.7657, + "step": 12404 + }, + { + "epoch": 0.6827563432219715, + "grad_norm": 0.669876217842102, + "learning_rate": 7.414813281533517e-06, + "loss": 0.6441, + "step": 12405 + }, + { + "epoch": 0.6828113820243271, + "grad_norm": 0.6608602404594421, + "learning_rate": 7.414433711339293e-06, + "loss": 0.7203, + "step": 12406 + }, + { + "epoch": 0.6828664208266828, + "grad_norm": 0.7262642979621887, + "learning_rate": 7.41405412299883e-06, + "loss": 0.7842, + "step": 12407 + }, + { + "epoch": 0.6829214596290385, + "grad_norm": 0.7728527188301086, + "learning_rate": 7.413674516514983e-06, + "loss": 0.7551, + "step": 12408 + }, + { + "epoch": 0.6829764984313942, + "grad_norm": 0.7970840930938721, + "learning_rate": 7.4132948918906035e-06, + "loss": 0.8181, + "step": 12409 + }, + { + "epoch": 0.6830315372337498, + "grad_norm": 0.6672868728637695, + "learning_rate": 7.412915249128546e-06, + "loss": 0.7201, + "step": 12410 + }, + { + "epoch": 0.6830865760361055, + "grad_norm": 0.8261075019836426, + "learning_rate": 7.412535588231664e-06, + "loss": 0.6006, + "step": 12411 + }, + { + "epoch": 0.6831416148384611, + "grad_norm": 0.6768019795417786, + "learning_rate": 7.412155909202809e-06, + "loss": 0.7326, + "step": 12412 + }, + { + "epoch": 0.6831966536408168, + "grad_norm": 0.7482851147651672, + "learning_rate": 7.4117762120448364e-06, + "loss": 0.7913, + "step": 12413 + }, + { + "epoch": 0.6832516924431724, + "grad_norm": 0.7315956354141235, + "learning_rate": 7.411396496760601e-06, + "loss": 0.7949, + "step": 12414 + }, + { + "epoch": 0.6833067312455281, + "grad_norm": 0.7460561394691467, + "learning_rate": 7.411016763352954e-06, + "loss": 0.8445, + "step": 12415 + }, + { + "epoch": 0.6833617700478838, + "grad_norm": 0.7025588154792786, + "learning_rate": 7.410637011824749e-06, + "loss": 0.7658, + "step": 12416 + }, + { + "epoch": 0.6834168088502394, + "grad_norm": 0.7507885694503784, + "learning_rate": 7.410257242178842e-06, + "loss": 0.711, + "step": 12417 + }, + { + "epoch": 0.683471847652595, + "grad_norm": 0.6935780048370361, + "learning_rate": 7.409877454418088e-06, + "loss": 0.8376, + "step": 12418 + }, + { + "epoch": 0.6835268864549507, + "grad_norm": 0.7747789025306702, + "learning_rate": 7.409497648545341e-06, + "loss": 0.8173, + "step": 12419 + }, + { + "epoch": 0.6835819252573064, + "grad_norm": 0.6559001803398132, + "learning_rate": 7.4091178245634525e-06, + "loss": 0.7146, + "step": 12420 + }, + { + "epoch": 0.683636964059662, + "grad_norm": 0.7123926877975464, + "learning_rate": 7.408737982475279e-06, + "loss": 0.7544, + "step": 12421 + }, + { + "epoch": 0.6836920028620177, + "grad_norm": 0.8163334131240845, + "learning_rate": 7.408358122283678e-06, + "loss": 0.8008, + "step": 12422 + }, + { + "epoch": 0.6837470416643734, + "grad_norm": 0.6837686896324158, + "learning_rate": 7.4079782439915e-06, + "loss": 0.6595, + "step": 12423 + }, + { + "epoch": 0.6838020804667291, + "grad_norm": 0.9385979175567627, + "learning_rate": 7.407598347601601e-06, + "loss": 0.8135, + "step": 12424 + }, + { + "epoch": 0.6838571192690847, + "grad_norm": 0.7197830677032471, + "learning_rate": 7.407218433116839e-06, + "loss": 0.8401, + "step": 12425 + }, + { + "epoch": 0.6839121580714403, + "grad_norm": 0.7165716290473938, + "learning_rate": 7.406838500540069e-06, + "loss": 0.7864, + "step": 12426 + }, + { + "epoch": 0.683967196873796, + "grad_norm": 0.6844950318336487, + "learning_rate": 7.4064585498741435e-06, + "loss": 0.7409, + "step": 12427 + }, + { + "epoch": 0.6840222356761517, + "grad_norm": 0.6237946152687073, + "learning_rate": 7.40607858112192e-06, + "loss": 0.6915, + "step": 12428 + }, + { + "epoch": 0.6840772744785073, + "grad_norm": 0.7437137365341187, + "learning_rate": 7.405698594286252e-06, + "loss": 0.8191, + "step": 12429 + }, + { + "epoch": 0.684132313280863, + "grad_norm": 0.6956225633621216, + "learning_rate": 7.4053185893700006e-06, + "loss": 0.7662, + "step": 12430 + }, + { + "epoch": 0.6841873520832187, + "grad_norm": 0.6508380174636841, + "learning_rate": 7.404938566376018e-06, + "loss": 0.7758, + "step": 12431 + }, + { + "epoch": 0.6842423908855744, + "grad_norm": 0.6759025454521179, + "learning_rate": 7.404558525307159e-06, + "loss": 0.7713, + "step": 12432 + }, + { + "epoch": 0.68429742968793, + "grad_norm": 0.7280172109603882, + "learning_rate": 7.404178466166283e-06, + "loss": 0.7753, + "step": 12433 + }, + { + "epoch": 0.6843524684902856, + "grad_norm": 0.7599073052406311, + "learning_rate": 7.403798388956245e-06, + "loss": 0.6993, + "step": 12434 + }, + { + "epoch": 0.6844075072926413, + "grad_norm": 0.7962353229522705, + "learning_rate": 7.403418293679903e-06, + "loss": 0.771, + "step": 12435 + }, + { + "epoch": 0.684462546094997, + "grad_norm": 0.6714458465576172, + "learning_rate": 7.40303818034011e-06, + "loss": 0.7077, + "step": 12436 + }, + { + "epoch": 0.6845175848973526, + "grad_norm": 0.6770713925361633, + "learning_rate": 7.402658048939726e-06, + "loss": 0.7695, + "step": 12437 + }, + { + "epoch": 0.6845726236997083, + "grad_norm": 0.7337867617607117, + "learning_rate": 7.402277899481608e-06, + "loss": 0.9453, + "step": 12438 + }, + { + "epoch": 0.684627662502064, + "grad_norm": 0.7457698583602905, + "learning_rate": 7.401897731968612e-06, + "loss": 0.7569, + "step": 12439 + }, + { + "epoch": 0.6846827013044197, + "grad_norm": 0.6683285236358643, + "learning_rate": 7.401517546403595e-06, + "loss": 0.7215, + "step": 12440 + }, + { + "epoch": 0.6847377401067752, + "grad_norm": 0.6516628861427307, + "learning_rate": 7.401137342789415e-06, + "loss": 0.7433, + "step": 12441 + }, + { + "epoch": 0.6847927789091309, + "grad_norm": 0.7572295665740967, + "learning_rate": 7.400757121128932e-06, + "loss": 0.7204, + "step": 12442 + }, + { + "epoch": 0.6848478177114866, + "grad_norm": 0.6884106993675232, + "learning_rate": 7.400376881425e-06, + "loss": 0.6766, + "step": 12443 + }, + { + "epoch": 0.6849028565138423, + "grad_norm": 0.798926591873169, + "learning_rate": 7.399996623680475e-06, + "loss": 0.7673, + "step": 12444 + }, + { + "epoch": 0.6849578953161979, + "grad_norm": 0.7200846672058105, + "learning_rate": 7.399616347898221e-06, + "loss": 0.8032, + "step": 12445 + }, + { + "epoch": 0.6850129341185536, + "grad_norm": 0.7085461020469666, + "learning_rate": 7.3992360540810915e-06, + "loss": 0.7075, + "step": 12446 + }, + { + "epoch": 0.6850679729209093, + "grad_norm": 0.6885339021682739, + "learning_rate": 7.398855742231947e-06, + "loss": 0.7278, + "step": 12447 + }, + { + "epoch": 0.685123011723265, + "grad_norm": 0.6693943738937378, + "learning_rate": 7.398475412353643e-06, + "loss": 0.7134, + "step": 12448 + }, + { + "epoch": 0.6851780505256205, + "grad_norm": 0.6908173561096191, + "learning_rate": 7.398095064449041e-06, + "loss": 0.8054, + "step": 12449 + }, + { + "epoch": 0.6852330893279762, + "grad_norm": 0.6207892894744873, + "learning_rate": 7.397714698520999e-06, + "loss": 0.5789, + "step": 12450 + }, + { + "epoch": 0.6852881281303319, + "grad_norm": 0.8367832899093628, + "learning_rate": 7.397334314572374e-06, + "loss": 0.8186, + "step": 12451 + }, + { + "epoch": 0.6853431669326876, + "grad_norm": 0.7005738615989685, + "learning_rate": 7.396953912606026e-06, + "loss": 0.8177, + "step": 12452 + }, + { + "epoch": 0.6853982057350432, + "grad_norm": 0.7189906239509583, + "learning_rate": 7.396573492624814e-06, + "loss": 0.8387, + "step": 12453 + }, + { + "epoch": 0.6854532445373989, + "grad_norm": 1.040576457977295, + "learning_rate": 7.3961930546315995e-06, + "loss": 0.7165, + "step": 12454 + }, + { + "epoch": 0.6855082833397546, + "grad_norm": 0.6417170166969299, + "learning_rate": 7.3958125986292385e-06, + "loss": 0.6671, + "step": 12455 + }, + { + "epoch": 0.6855633221421102, + "grad_norm": 0.6443242430686951, + "learning_rate": 7.395432124620589e-06, + "loss": 0.6995, + "step": 12456 + }, + { + "epoch": 0.6856183609444658, + "grad_norm": 0.5764951705932617, + "learning_rate": 7.395051632608516e-06, + "loss": 0.6088, + "step": 12457 + }, + { + "epoch": 0.6856733997468215, + "grad_norm": 0.6193686127662659, + "learning_rate": 7.394671122595873e-06, + "loss": 0.7283, + "step": 12458 + }, + { + "epoch": 0.6857284385491772, + "grad_norm": 0.6773817539215088, + "learning_rate": 7.394290594585525e-06, + "loss": 0.8204, + "step": 12459 + }, + { + "epoch": 0.6857834773515328, + "grad_norm": 0.7906570434570312, + "learning_rate": 7.393910048580328e-06, + "loss": 0.7057, + "step": 12460 + }, + { + "epoch": 0.6858385161538885, + "grad_norm": 0.7544124126434326, + "learning_rate": 7.393529484583145e-06, + "loss": 0.8053, + "step": 12461 + }, + { + "epoch": 0.6858935549562442, + "grad_norm": 0.6878008842468262, + "learning_rate": 7.3931489025968365e-06, + "loss": 0.6972, + "step": 12462 + }, + { + "epoch": 0.6859485937585998, + "grad_norm": 0.6734861731529236, + "learning_rate": 7.392768302624259e-06, + "loss": 0.7921, + "step": 12463 + }, + { + "epoch": 0.6860036325609554, + "grad_norm": 0.6845618486404419, + "learning_rate": 7.392387684668276e-06, + "loss": 0.7461, + "step": 12464 + }, + { + "epoch": 0.6860586713633111, + "grad_norm": 0.6362663507461548, + "learning_rate": 7.392007048731748e-06, + "loss": 0.7108, + "step": 12465 + }, + { + "epoch": 0.6861137101656668, + "grad_norm": 0.7441046237945557, + "learning_rate": 7.391626394817537e-06, + "loss": 0.6944, + "step": 12466 + }, + { + "epoch": 0.6861687489680225, + "grad_norm": 1.0933935642242432, + "learning_rate": 7.391245722928501e-06, + "loss": 0.7744, + "step": 12467 + }, + { + "epoch": 0.6862237877703781, + "grad_norm": 0.6531348824501038, + "learning_rate": 7.3908650330675e-06, + "loss": 0.6772, + "step": 12468 + }, + { + "epoch": 0.6862788265727338, + "grad_norm": 0.7533715963363647, + "learning_rate": 7.390484325237399e-06, + "loss": 0.7385, + "step": 12469 + }, + { + "epoch": 0.6863338653750894, + "grad_norm": 0.618679940700531, + "learning_rate": 7.390103599441058e-06, + "loss": 0.6053, + "step": 12470 + }, + { + "epoch": 0.6863889041774451, + "grad_norm": 0.7102347612380981, + "learning_rate": 7.389722855681338e-06, + "loss": 0.7246, + "step": 12471 + }, + { + "epoch": 0.6864439429798007, + "grad_norm": 0.8545061945915222, + "learning_rate": 7.3893420939611e-06, + "loss": 0.7386, + "step": 12472 + }, + { + "epoch": 0.6864989817821564, + "grad_norm": 0.6298168897628784, + "learning_rate": 7.388961314283207e-06, + "loss": 0.6573, + "step": 12473 + }, + { + "epoch": 0.6865540205845121, + "grad_norm": 0.6909272074699402, + "learning_rate": 7.388580516650521e-06, + "loss": 0.7973, + "step": 12474 + }, + { + "epoch": 0.6866090593868678, + "grad_norm": 0.6782366037368774, + "learning_rate": 7.388199701065904e-06, + "loss": 0.7437, + "step": 12475 + }, + { + "epoch": 0.6866640981892234, + "grad_norm": 0.6826187372207642, + "learning_rate": 7.387818867532213e-06, + "loss": 0.6254, + "step": 12476 + }, + { + "epoch": 0.686719136991579, + "grad_norm": 0.7471422553062439, + "learning_rate": 7.387438016052318e-06, + "loss": 0.8668, + "step": 12477 + }, + { + "epoch": 0.6867741757939347, + "grad_norm": 0.7987646460533142, + "learning_rate": 7.38705714662908e-06, + "loss": 0.6759, + "step": 12478 + }, + { + "epoch": 0.6868292145962904, + "grad_norm": 0.7318877577781677, + "learning_rate": 7.386676259265356e-06, + "loss": 0.7167, + "step": 12479 + }, + { + "epoch": 0.686884253398646, + "grad_norm": 0.6655439138412476, + "learning_rate": 7.386295353964013e-06, + "loss": 0.7184, + "step": 12480 + }, + { + "epoch": 0.6869392922010017, + "grad_norm": 0.7323878407478333, + "learning_rate": 7.385914430727912e-06, + "loss": 0.7562, + "step": 12481 + }, + { + "epoch": 0.6869943310033574, + "grad_norm": 0.7813006639480591, + "learning_rate": 7.385533489559918e-06, + "loss": 0.7665, + "step": 12482 + }, + { + "epoch": 0.6870493698057131, + "grad_norm": 0.6889718770980835, + "learning_rate": 7.385152530462894e-06, + "loss": 0.6587, + "step": 12483 + }, + { + "epoch": 0.6871044086080687, + "grad_norm": 0.6930332183837891, + "learning_rate": 7.384771553439698e-06, + "loss": 0.8244, + "step": 12484 + }, + { + "epoch": 0.6871594474104243, + "grad_norm": 0.8294679522514343, + "learning_rate": 7.384390558493201e-06, + "loss": 0.6977, + "step": 12485 + }, + { + "epoch": 0.68721448621278, + "grad_norm": 0.7235204577445984, + "learning_rate": 7.384009545626262e-06, + "loss": 0.7946, + "step": 12486 + }, + { + "epoch": 0.6872695250151357, + "grad_norm": 0.6346727609634399, + "learning_rate": 7.3836285148417456e-06, + "loss": 0.6109, + "step": 12487 + }, + { + "epoch": 0.6873245638174913, + "grad_norm": 0.7168872356414795, + "learning_rate": 7.383247466142513e-06, + "loss": 0.7485, + "step": 12488 + }, + { + "epoch": 0.687379602619847, + "grad_norm": 0.6511938571929932, + "learning_rate": 7.382866399531434e-06, + "loss": 0.8048, + "step": 12489 + }, + { + "epoch": 0.6874346414222027, + "grad_norm": 0.7569704651832581, + "learning_rate": 7.3824853150113674e-06, + "loss": 0.8017, + "step": 12490 + }, + { + "epoch": 0.6874896802245584, + "grad_norm": 0.7708210945129395, + "learning_rate": 7.382104212585178e-06, + "loss": 0.7258, + "step": 12491 + }, + { + "epoch": 0.6875447190269139, + "grad_norm": 0.709702730178833, + "learning_rate": 7.381723092255731e-06, + "loss": 0.7707, + "step": 12492 + }, + { + "epoch": 0.6875997578292696, + "grad_norm": 0.6683183908462524, + "learning_rate": 7.381341954025892e-06, + "loss": 0.702, + "step": 12493 + }, + { + "epoch": 0.6876547966316253, + "grad_norm": 0.7639274597167969, + "learning_rate": 7.380960797898524e-06, + "loss": 0.7027, + "step": 12494 + }, + { + "epoch": 0.687709835433981, + "grad_norm": 0.6735698580741882, + "learning_rate": 7.380579623876492e-06, + "loss": 0.7124, + "step": 12495 + }, + { + "epoch": 0.6877648742363366, + "grad_norm": 0.6635340452194214, + "learning_rate": 7.38019843196266e-06, + "loss": 0.6968, + "step": 12496 + }, + { + "epoch": 0.6878199130386923, + "grad_norm": 0.7459729313850403, + "learning_rate": 7.379817222159895e-06, + "loss": 0.7629, + "step": 12497 + }, + { + "epoch": 0.687874951841048, + "grad_norm": 0.7408778667449951, + "learning_rate": 7.37943599447106e-06, + "loss": 0.8327, + "step": 12498 + }, + { + "epoch": 0.6879299906434037, + "grad_norm": 0.659736156463623, + "learning_rate": 7.379054748899021e-06, + "loss": 0.6746, + "step": 12499 + }, + { + "epoch": 0.6879850294457592, + "grad_norm": 0.7429264783859253, + "learning_rate": 7.3786734854466435e-06, + "loss": 0.8555, + "step": 12500 + }, + { + "epoch": 0.6880400682481149, + "grad_norm": 0.7492697834968567, + "learning_rate": 7.378292204116793e-06, + "loss": 0.7825, + "step": 12501 + }, + { + "epoch": 0.6880951070504706, + "grad_norm": 0.6664871573448181, + "learning_rate": 7.377910904912336e-06, + "loss": 0.7343, + "step": 12502 + }, + { + "epoch": 0.6881501458528262, + "grad_norm": 0.8010555505752563, + "learning_rate": 7.377529587836135e-06, + "loss": 0.6789, + "step": 12503 + }, + { + "epoch": 0.6882051846551819, + "grad_norm": 0.6339166164398193, + "learning_rate": 7.3771482528910585e-06, + "loss": 0.7471, + "step": 12504 + }, + { + "epoch": 0.6882602234575376, + "grad_norm": 0.6750906109809875, + "learning_rate": 7.376766900079973e-06, + "loss": 0.665, + "step": 12505 + }, + { + "epoch": 0.6883152622598933, + "grad_norm": 0.6440090537071228, + "learning_rate": 7.376385529405743e-06, + "loss": 0.6804, + "step": 12506 + }, + { + "epoch": 0.6883703010622488, + "grad_norm": 0.7159061431884766, + "learning_rate": 7.376004140871236e-06, + "loss": 0.7524, + "step": 12507 + }, + { + "epoch": 0.6884253398646045, + "grad_norm": 0.7551491260528564, + "learning_rate": 7.375622734479316e-06, + "loss": 0.891, + "step": 12508 + }, + { + "epoch": 0.6884803786669602, + "grad_norm": 0.6584289073944092, + "learning_rate": 7.375241310232854e-06, + "loss": 0.7313, + "step": 12509 + }, + { + "epoch": 0.6885354174693159, + "grad_norm": 0.7616147398948669, + "learning_rate": 7.374859868134713e-06, + "loss": 0.8351, + "step": 12510 + }, + { + "epoch": 0.6885904562716715, + "grad_norm": 0.669541597366333, + "learning_rate": 7.374478408187761e-06, + "loss": 0.6836, + "step": 12511 + }, + { + "epoch": 0.6886454950740272, + "grad_norm": 0.6483158469200134, + "learning_rate": 7.374096930394864e-06, + "loss": 0.6909, + "step": 12512 + }, + { + "epoch": 0.6887005338763829, + "grad_norm": 0.7079604864120483, + "learning_rate": 7.3737154347588925e-06, + "loss": 0.7151, + "step": 12513 + }, + { + "epoch": 0.6887555726787385, + "grad_norm": 0.6805073618888855, + "learning_rate": 7.373333921282709e-06, + "loss": 0.7761, + "step": 12514 + }, + { + "epoch": 0.6888106114810941, + "grad_norm": 0.757008969783783, + "learning_rate": 7.372952389969183e-06, + "loss": 0.7249, + "step": 12515 + }, + { + "epoch": 0.6888656502834498, + "grad_norm": 0.6990587711334229, + "learning_rate": 7.372570840821183e-06, + "loss": 0.7463, + "step": 12516 + }, + { + "epoch": 0.6889206890858055, + "grad_norm": 0.7405683398246765, + "learning_rate": 7.3721892738415745e-06, + "loss": 0.8039, + "step": 12517 + }, + { + "epoch": 0.6889757278881612, + "grad_norm": 0.6736571192741394, + "learning_rate": 7.371807689033228e-06, + "loss": 0.7084, + "step": 12518 + }, + { + "epoch": 0.6890307666905168, + "grad_norm": 0.752955436706543, + "learning_rate": 7.3714260863990095e-06, + "loss": 0.7951, + "step": 12519 + }, + { + "epoch": 0.6890858054928725, + "grad_norm": 0.6810917258262634, + "learning_rate": 7.3710444659417855e-06, + "loss": 0.7884, + "step": 12520 + }, + { + "epoch": 0.6891408442952281, + "grad_norm": 0.727500855922699, + "learning_rate": 7.370662827664427e-06, + "loss": 0.7617, + "step": 12521 + }, + { + "epoch": 0.6891958830975838, + "grad_norm": 0.6739845871925354, + "learning_rate": 7.3702811715698016e-06, + "loss": 0.6831, + "step": 12522 + }, + { + "epoch": 0.6892509218999394, + "grad_norm": 0.850913941860199, + "learning_rate": 7.369899497660779e-06, + "loss": 0.7658, + "step": 12523 + }, + { + "epoch": 0.6893059607022951, + "grad_norm": 0.7352884411811829, + "learning_rate": 7.369517805940223e-06, + "loss": 0.7748, + "step": 12524 + }, + { + "epoch": 0.6893609995046508, + "grad_norm": 0.6702300310134888, + "learning_rate": 7.369136096411008e-06, + "loss": 0.7557, + "step": 12525 + }, + { + "epoch": 0.6894160383070065, + "grad_norm": 0.7117186784744263, + "learning_rate": 7.368754369075999e-06, + "loss": 0.8147, + "step": 12526 + }, + { + "epoch": 0.6894710771093621, + "grad_norm": 0.6896687746047974, + "learning_rate": 7.368372623938067e-06, + "loss": 0.7753, + "step": 12527 + }, + { + "epoch": 0.6895261159117178, + "grad_norm": 0.669207751750946, + "learning_rate": 7.367990861000078e-06, + "loss": 0.739, + "step": 12528 + }, + { + "epoch": 0.6895811547140734, + "grad_norm": 0.7014279961585999, + "learning_rate": 7.367609080264906e-06, + "loss": 0.7712, + "step": 12529 + }, + { + "epoch": 0.6896361935164291, + "grad_norm": 1.0029237270355225, + "learning_rate": 7.367227281735418e-06, + "loss": 0.7641, + "step": 12530 + }, + { + "epoch": 0.6896912323187847, + "grad_norm": 0.6342340707778931, + "learning_rate": 7.3668454654144824e-06, + "loss": 0.7572, + "step": 12531 + }, + { + "epoch": 0.6897462711211404, + "grad_norm": 0.7475802302360535, + "learning_rate": 7.3664636313049696e-06, + "loss": 0.7969, + "step": 12532 + }, + { + "epoch": 0.6898013099234961, + "grad_norm": 0.7478888630867004, + "learning_rate": 7.36608177940975e-06, + "loss": 0.8299, + "step": 12533 + }, + { + "epoch": 0.6898563487258518, + "grad_norm": 0.7017174363136292, + "learning_rate": 7.365699909731694e-06, + "loss": 0.6608, + "step": 12534 + }, + { + "epoch": 0.6899113875282074, + "grad_norm": 0.7259606122970581, + "learning_rate": 7.3653180222736695e-06, + "loss": 0.7088, + "step": 12535 + }, + { + "epoch": 0.689966426330563, + "grad_norm": 0.7049521207809448, + "learning_rate": 7.364936117038548e-06, + "loss": 0.8177, + "step": 12536 + }, + { + "epoch": 0.6900214651329187, + "grad_norm": 0.6557304263114929, + "learning_rate": 7.364554194029201e-06, + "loss": 0.73, + "step": 12537 + }, + { + "epoch": 0.6900765039352744, + "grad_norm": 0.704140305519104, + "learning_rate": 7.364172253248497e-06, + "loss": 0.7671, + "step": 12538 + }, + { + "epoch": 0.69013154273763, + "grad_norm": 0.6879541873931885, + "learning_rate": 7.3637902946993064e-06, + "loss": 0.6707, + "step": 12539 + }, + { + "epoch": 0.6901865815399857, + "grad_norm": 0.7715931534767151, + "learning_rate": 7.363408318384501e-06, + "loss": 0.7494, + "step": 12540 + }, + { + "epoch": 0.6902416203423414, + "grad_norm": 0.7890990972518921, + "learning_rate": 7.363026324306952e-06, + "loss": 0.7499, + "step": 12541 + }, + { + "epoch": 0.6902966591446971, + "grad_norm": 0.7177792191505432, + "learning_rate": 7.362644312469529e-06, + "loss": 0.8053, + "step": 12542 + }, + { + "epoch": 0.6903516979470526, + "grad_norm": 0.7434332370758057, + "learning_rate": 7.3622622828751044e-06, + "loss": 0.7371, + "step": 12543 + }, + { + "epoch": 0.6904067367494083, + "grad_norm": 0.5836912989616394, + "learning_rate": 7.361880235526547e-06, + "loss": 0.6681, + "step": 12544 + }, + { + "epoch": 0.690461775551764, + "grad_norm": 0.6814625263214111, + "learning_rate": 7.3614981704267315e-06, + "loss": 0.7408, + "step": 12545 + }, + { + "epoch": 0.6905168143541196, + "grad_norm": 0.6524162292480469, + "learning_rate": 7.361116087578528e-06, + "loss": 0.6788, + "step": 12546 + }, + { + "epoch": 0.6905718531564753, + "grad_norm": 0.6614788174629211, + "learning_rate": 7.360733986984808e-06, + "loss": 0.75, + "step": 12547 + }, + { + "epoch": 0.690626891958831, + "grad_norm": 1.035152792930603, + "learning_rate": 7.360351868648442e-06, + "loss": 0.7181, + "step": 12548 + }, + { + "epoch": 0.6906819307611867, + "grad_norm": 0.7525657415390015, + "learning_rate": 7.359969732572305e-06, + "loss": 0.8149, + "step": 12549 + }, + { + "epoch": 0.6907369695635422, + "grad_norm": 0.8323431015014648, + "learning_rate": 7.359587578759267e-06, + "loss": 0.6908, + "step": 12550 + }, + { + "epoch": 0.6907920083658979, + "grad_norm": 0.7551344633102417, + "learning_rate": 7.3592054072122e-06, + "loss": 0.794, + "step": 12551 + }, + { + "epoch": 0.6908470471682536, + "grad_norm": 0.5937384366989136, + "learning_rate": 7.358823217933977e-06, + "loss": 0.6532, + "step": 12552 + }, + { + "epoch": 0.6909020859706093, + "grad_norm": 1.5515329837799072, + "learning_rate": 7.358441010927468e-06, + "loss": 0.7003, + "step": 12553 + }, + { + "epoch": 0.6909571247729649, + "grad_norm": 0.6838175654411316, + "learning_rate": 7.3580587861955495e-06, + "loss": 0.7184, + "step": 12554 + }, + { + "epoch": 0.6910121635753206, + "grad_norm": 0.7055354714393616, + "learning_rate": 7.357676543741092e-06, + "loss": 0.8372, + "step": 12555 + }, + { + "epoch": 0.6910672023776763, + "grad_norm": 0.8683249950408936, + "learning_rate": 7.3572942835669695e-06, + "loss": 0.7594, + "step": 12556 + }, + { + "epoch": 0.691122241180032, + "grad_norm": 0.8586179614067078, + "learning_rate": 7.3569120056760535e-06, + "loss": 0.8422, + "step": 12557 + }, + { + "epoch": 0.6911772799823875, + "grad_norm": 0.692132830619812, + "learning_rate": 7.356529710071217e-06, + "loss": 0.7872, + "step": 12558 + }, + { + "epoch": 0.6912323187847432, + "grad_norm": 0.7342404723167419, + "learning_rate": 7.356147396755335e-06, + "loss": 0.6908, + "step": 12559 + }, + { + "epoch": 0.6912873575870989, + "grad_norm": 0.6941357254981995, + "learning_rate": 7.35576506573128e-06, + "loss": 0.608, + "step": 12560 + }, + { + "epoch": 0.6913423963894546, + "grad_norm": 0.648225724697113, + "learning_rate": 7.355382717001925e-06, + "loss": 0.6923, + "step": 12561 + }, + { + "epoch": 0.6913974351918102, + "grad_norm": 0.6735422015190125, + "learning_rate": 7.355000350570144e-06, + "loss": 0.7502, + "step": 12562 + }, + { + "epoch": 0.6914524739941659, + "grad_norm": 0.8507662415504456, + "learning_rate": 7.3546179664388105e-06, + "loss": 0.7883, + "step": 12563 + }, + { + "epoch": 0.6915075127965216, + "grad_norm": 0.7287268042564392, + "learning_rate": 7.3542355646108e-06, + "loss": 0.8687, + "step": 12564 + }, + { + "epoch": 0.6915625515988773, + "grad_norm": 0.6085666418075562, + "learning_rate": 7.353853145088983e-06, + "loss": 0.6675, + "step": 12565 + }, + { + "epoch": 0.6916175904012328, + "grad_norm": 0.727668046951294, + "learning_rate": 7.353470707876237e-06, + "loss": 0.8591, + "step": 12566 + }, + { + "epoch": 0.6916726292035885, + "grad_norm": 0.724846601486206, + "learning_rate": 7.353088252975436e-06, + "loss": 0.8501, + "step": 12567 + }, + { + "epoch": 0.6917276680059442, + "grad_norm": 0.6801046133041382, + "learning_rate": 7.352705780389452e-06, + "loss": 0.7637, + "step": 12568 + }, + { + "epoch": 0.6917827068082999, + "grad_norm": 0.680496335029602, + "learning_rate": 7.352323290121161e-06, + "loss": 0.7308, + "step": 12569 + }, + { + "epoch": 0.6918377456106555, + "grad_norm": 0.7143607139587402, + "learning_rate": 7.351940782173439e-06, + "loss": 0.7494, + "step": 12570 + }, + { + "epoch": 0.6918927844130112, + "grad_norm": 0.679755687713623, + "learning_rate": 7.351558256549158e-06, + "loss": 0.7731, + "step": 12571 + }, + { + "epoch": 0.6919478232153669, + "grad_norm": 0.6626351475715637, + "learning_rate": 7.351175713251197e-06, + "loss": 0.8593, + "step": 12572 + }, + { + "epoch": 0.6920028620177225, + "grad_norm": 0.6830954551696777, + "learning_rate": 7.350793152282427e-06, + "loss": 0.6327, + "step": 12573 + }, + { + "epoch": 0.6920579008200781, + "grad_norm": 0.653810977935791, + "learning_rate": 7.350410573645726e-06, + "loss": 0.7341, + "step": 12574 + }, + { + "epoch": 0.6921129396224338, + "grad_norm": 0.6939566731452942, + "learning_rate": 7.3500279773439675e-06, + "loss": 0.7823, + "step": 12575 + }, + { + "epoch": 0.6921679784247895, + "grad_norm": 0.8212422728538513, + "learning_rate": 7.349645363380029e-06, + "loss": 0.6388, + "step": 12576 + }, + { + "epoch": 0.6922230172271452, + "grad_norm": 0.7703338265419006, + "learning_rate": 7.349262731756783e-06, + "loss": 0.7476, + "step": 12577 + }, + { + "epoch": 0.6922780560295008, + "grad_norm": 0.6710889935493469, + "learning_rate": 7.348880082477108e-06, + "loss": 0.7869, + "step": 12578 + }, + { + "epoch": 0.6923330948318565, + "grad_norm": 0.7384413480758667, + "learning_rate": 7.3484974155438795e-06, + "loss": 0.6628, + "step": 12579 + }, + { + "epoch": 0.6923881336342121, + "grad_norm": 0.7628176212310791, + "learning_rate": 7.348114730959973e-06, + "loss": 0.7599, + "step": 12580 + }, + { + "epoch": 0.6924431724365678, + "grad_norm": 0.683885931968689, + "learning_rate": 7.347732028728264e-06, + "loss": 0.7134, + "step": 12581 + }, + { + "epoch": 0.6924982112389234, + "grad_norm": 0.6710503697395325, + "learning_rate": 7.34734930885163e-06, + "loss": 0.7147, + "step": 12582 + }, + { + "epoch": 0.6925532500412791, + "grad_norm": 0.6984537243843079, + "learning_rate": 7.346966571332947e-06, + "loss": 0.7517, + "step": 12583 + }, + { + "epoch": 0.6926082888436348, + "grad_norm": 0.7563193440437317, + "learning_rate": 7.346583816175092e-06, + "loss": 0.7971, + "step": 12584 + }, + { + "epoch": 0.6926633276459905, + "grad_norm": 0.8407838940620422, + "learning_rate": 7.346201043380941e-06, + "loss": 0.8227, + "step": 12585 + }, + { + "epoch": 0.6927183664483461, + "grad_norm": 0.673098623752594, + "learning_rate": 7.345818252953369e-06, + "loss": 0.7514, + "step": 12586 + }, + { + "epoch": 0.6927734052507017, + "grad_norm": 0.6452111005783081, + "learning_rate": 7.345435444895257e-06, + "loss": 0.7201, + "step": 12587 + }, + { + "epoch": 0.6928284440530574, + "grad_norm": 0.8728383779525757, + "learning_rate": 7.345052619209481e-06, + "loss": 0.7452, + "step": 12588 + }, + { + "epoch": 0.692883482855413, + "grad_norm": 0.7032049298286438, + "learning_rate": 7.344669775898914e-06, + "loss": 0.8885, + "step": 12589 + }, + { + "epoch": 0.6929385216577687, + "grad_norm": 0.7744605541229248, + "learning_rate": 7.344286914966438e-06, + "loss": 0.8048, + "step": 12590 + }, + { + "epoch": 0.6929935604601244, + "grad_norm": 0.7334163784980774, + "learning_rate": 7.343904036414931e-06, + "loss": 0.8502, + "step": 12591 + }, + { + "epoch": 0.6930485992624801, + "grad_norm": 0.6684108376502991, + "learning_rate": 7.343521140247266e-06, + "loss": 0.8264, + "step": 12592 + }, + { + "epoch": 0.6931036380648357, + "grad_norm": 0.6192718744277954, + "learning_rate": 7.343138226466324e-06, + "loss": 0.6625, + "step": 12593 + }, + { + "epoch": 0.6931586768671913, + "grad_norm": 0.6410724520683289, + "learning_rate": 7.342755295074984e-06, + "loss": 0.717, + "step": 12594 + }, + { + "epoch": 0.693213715669547, + "grad_norm": 0.6854361891746521, + "learning_rate": 7.342372346076121e-06, + "loss": 0.7246, + "step": 12595 + }, + { + "epoch": 0.6932687544719027, + "grad_norm": 0.6920250058174133, + "learning_rate": 7.341989379472614e-06, + "loss": 0.7414, + "step": 12596 + }, + { + "epoch": 0.6933237932742583, + "grad_norm": 0.6545842885971069, + "learning_rate": 7.341606395267342e-06, + "loss": 0.7731, + "step": 12597 + }, + { + "epoch": 0.693378832076614, + "grad_norm": 0.6879072785377502, + "learning_rate": 7.341223393463184e-06, + "loss": 0.7272, + "step": 12598 + }, + { + "epoch": 0.6934338708789697, + "grad_norm": 0.7460979223251343, + "learning_rate": 7.340840374063018e-06, + "loss": 0.771, + "step": 12599 + }, + { + "epoch": 0.6934889096813254, + "grad_norm": 0.7836858630180359, + "learning_rate": 7.340457337069722e-06, + "loss": 0.846, + "step": 12600 + }, + { + "epoch": 0.693543948483681, + "grad_norm": 0.958403468132019, + "learning_rate": 7.340074282486174e-06, + "loss": 0.8913, + "step": 12601 + }, + { + "epoch": 0.6935989872860366, + "grad_norm": 0.6614813208580017, + "learning_rate": 7.339691210315254e-06, + "loss": 0.7129, + "step": 12602 + }, + { + "epoch": 0.6936540260883923, + "grad_norm": 0.7303252816200256, + "learning_rate": 7.339308120559843e-06, + "loss": 0.8395, + "step": 12603 + }, + { + "epoch": 0.693709064890748, + "grad_norm": 0.7341620922088623, + "learning_rate": 7.338925013222817e-06, + "loss": 0.8341, + "step": 12604 + }, + { + "epoch": 0.6937641036931036, + "grad_norm": 0.7077179551124573, + "learning_rate": 7.338541888307056e-06, + "loss": 0.7813, + "step": 12605 + }, + { + "epoch": 0.6938191424954593, + "grad_norm": 0.6654969453811646, + "learning_rate": 7.338158745815441e-06, + "loss": 0.7337, + "step": 12606 + }, + { + "epoch": 0.693874181297815, + "grad_norm": 0.6637474894523621, + "learning_rate": 7.337775585750852e-06, + "loss": 0.8197, + "step": 12607 + }, + { + "epoch": 0.6939292201001707, + "grad_norm": 0.654712975025177, + "learning_rate": 7.337392408116166e-06, + "loss": 0.6991, + "step": 12608 + }, + { + "epoch": 0.6939842589025262, + "grad_norm": 0.6698346138000488, + "learning_rate": 7.337009212914265e-06, + "loss": 0.7991, + "step": 12609 + }, + { + "epoch": 0.6940392977048819, + "grad_norm": 0.9616294503211975, + "learning_rate": 7.336626000148028e-06, + "loss": 0.7326, + "step": 12610 + }, + { + "epoch": 0.6940943365072376, + "grad_norm": 0.7749543786048889, + "learning_rate": 7.336242769820335e-06, + "loss": 0.8015, + "step": 12611 + }, + { + "epoch": 0.6941493753095933, + "grad_norm": 0.7263140678405762, + "learning_rate": 7.335859521934068e-06, + "loss": 0.7538, + "step": 12612 + }, + { + "epoch": 0.6942044141119489, + "grad_norm": 0.6383689641952515, + "learning_rate": 7.335476256492105e-06, + "loss": 0.7611, + "step": 12613 + }, + { + "epoch": 0.6942594529143046, + "grad_norm": 0.7464908957481384, + "learning_rate": 7.335092973497326e-06, + "loss": 0.7904, + "step": 12614 + }, + { + "epoch": 0.6943144917166603, + "grad_norm": 1.114864468574524, + "learning_rate": 7.334709672952615e-06, + "loss": 0.8518, + "step": 12615 + }, + { + "epoch": 0.694369530519016, + "grad_norm": 0.6712734699249268, + "learning_rate": 7.334326354860852e-06, + "loss": 0.7431, + "step": 12616 + }, + { + "epoch": 0.6944245693213715, + "grad_norm": 0.7559850811958313, + "learning_rate": 7.3339430192249166e-06, + "loss": 0.7556, + "step": 12617 + }, + { + "epoch": 0.6944796081237272, + "grad_norm": 0.7262033224105835, + "learning_rate": 7.333559666047689e-06, + "loss": 0.7624, + "step": 12618 + }, + { + "epoch": 0.6945346469260829, + "grad_norm": 0.6428695917129517, + "learning_rate": 7.333176295332053e-06, + "loss": 0.6894, + "step": 12619 + }, + { + "epoch": 0.6945896857284386, + "grad_norm": 0.7353672385215759, + "learning_rate": 7.3327929070808875e-06, + "loss": 0.7611, + "step": 12620 + }, + { + "epoch": 0.6946447245307942, + "grad_norm": 0.7063810229301453, + "learning_rate": 7.332409501297076e-06, + "loss": 0.7428, + "step": 12621 + }, + { + "epoch": 0.6946997633331499, + "grad_norm": 0.6552421450614929, + "learning_rate": 7.332026077983498e-06, + "loss": 0.7046, + "step": 12622 + }, + { + "epoch": 0.6947548021355056, + "grad_norm": 0.8843327760696411, + "learning_rate": 7.331642637143037e-06, + "loss": 0.6952, + "step": 12623 + }, + { + "epoch": 0.6948098409378612, + "grad_norm": 0.7279102802276611, + "learning_rate": 7.331259178778574e-06, + "loss": 0.7911, + "step": 12624 + }, + { + "epoch": 0.6948648797402168, + "grad_norm": 0.6585525870323181, + "learning_rate": 7.33087570289299e-06, + "loss": 0.7684, + "step": 12625 + }, + { + "epoch": 0.6949199185425725, + "grad_norm": 0.663185715675354, + "learning_rate": 7.3304922094891695e-06, + "loss": 0.6753, + "step": 12626 + }, + { + "epoch": 0.6949749573449282, + "grad_norm": 0.652765691280365, + "learning_rate": 7.330108698569993e-06, + "loss": 0.7333, + "step": 12627 + }, + { + "epoch": 0.6950299961472839, + "grad_norm": 0.7781688570976257, + "learning_rate": 7.329725170138343e-06, + "loss": 0.7312, + "step": 12628 + }, + { + "epoch": 0.6950850349496395, + "grad_norm": 0.6798241138458252, + "learning_rate": 7.329341624197102e-06, + "loss": 0.7747, + "step": 12629 + }, + { + "epoch": 0.6951400737519952, + "grad_norm": 0.7588373422622681, + "learning_rate": 7.328958060749153e-06, + "loss": 0.8535, + "step": 12630 + }, + { + "epoch": 0.6951951125543508, + "grad_norm": 0.8833348155021667, + "learning_rate": 7.328574479797379e-06, + "loss": 0.8345, + "step": 12631 + }, + { + "epoch": 0.6952501513567064, + "grad_norm": 0.799454927444458, + "learning_rate": 7.328190881344663e-06, + "loss": 0.7571, + "step": 12632 + }, + { + "epoch": 0.6953051901590621, + "grad_norm": 0.8030340671539307, + "learning_rate": 7.327807265393887e-06, + "loss": 0.7426, + "step": 12633 + }, + { + "epoch": 0.6953602289614178, + "grad_norm": 0.6246228218078613, + "learning_rate": 7.327423631947934e-06, + "loss": 0.6712, + "step": 12634 + }, + { + "epoch": 0.6954152677637735, + "grad_norm": 0.7203500866889954, + "learning_rate": 7.32703998100969e-06, + "loss": 0.8315, + "step": 12635 + }, + { + "epoch": 0.6954703065661291, + "grad_norm": 0.6128239035606384, + "learning_rate": 7.326656312582035e-06, + "loss": 0.6788, + "step": 12636 + }, + { + "epoch": 0.6955253453684848, + "grad_norm": 0.8052619695663452, + "learning_rate": 7.326272626667852e-06, + "loss": 0.8076, + "step": 12637 + }, + { + "epoch": 0.6955803841708404, + "grad_norm": 0.9128470420837402, + "learning_rate": 7.325888923270029e-06, + "loss": 0.7135, + "step": 12638 + }, + { + "epoch": 0.6956354229731961, + "grad_norm": 0.6815299391746521, + "learning_rate": 7.325505202391447e-06, + "loss": 0.7756, + "step": 12639 + }, + { + "epoch": 0.6956904617755517, + "grad_norm": 0.6278733611106873, + "learning_rate": 7.325121464034991e-06, + "loss": 0.6583, + "step": 12640 + }, + { + "epoch": 0.6957455005779074, + "grad_norm": 0.7161649465560913, + "learning_rate": 7.324737708203543e-06, + "loss": 0.7106, + "step": 12641 + }, + { + "epoch": 0.6958005393802631, + "grad_norm": 0.6827715635299683, + "learning_rate": 7.324353934899989e-06, + "loss": 0.7988, + "step": 12642 + }, + { + "epoch": 0.6958555781826188, + "grad_norm": 0.9999695420265198, + "learning_rate": 7.323970144127215e-06, + "loss": 0.8222, + "step": 12643 + }, + { + "epoch": 0.6959106169849744, + "grad_norm": 0.8048173785209656, + "learning_rate": 7.323586335888102e-06, + "loss": 0.7157, + "step": 12644 + }, + { + "epoch": 0.69596565578733, + "grad_norm": 0.7403637170791626, + "learning_rate": 7.323202510185536e-06, + "loss": 0.7516, + "step": 12645 + }, + { + "epoch": 0.6960206945896857, + "grad_norm": 0.6660793423652649, + "learning_rate": 7.322818667022402e-06, + "loss": 0.7081, + "step": 12646 + }, + { + "epoch": 0.6960757333920414, + "grad_norm": 0.713985800743103, + "learning_rate": 7.322434806401585e-06, + "loss": 0.7682, + "step": 12647 + }, + { + "epoch": 0.696130772194397, + "grad_norm": 0.739253044128418, + "learning_rate": 7.322050928325969e-06, + "loss": 0.838, + "step": 12648 + }, + { + "epoch": 0.6961858109967527, + "grad_norm": 0.8350489735603333, + "learning_rate": 7.32166703279844e-06, + "loss": 0.7627, + "step": 12649 + }, + { + "epoch": 0.6962408497991084, + "grad_norm": 0.580456018447876, + "learning_rate": 7.321283119821883e-06, + "loss": 0.6248, + "step": 12650 + }, + { + "epoch": 0.6962958886014641, + "grad_norm": 0.8619480729103088, + "learning_rate": 7.320899189399183e-06, + "loss": 0.848, + "step": 12651 + }, + { + "epoch": 0.6963509274038197, + "grad_norm": 0.6201381087303162, + "learning_rate": 7.320515241533227e-06, + "loss": 0.6506, + "step": 12652 + }, + { + "epoch": 0.6964059662061753, + "grad_norm": 0.6956773400306702, + "learning_rate": 7.320131276226898e-06, + "loss": 0.7561, + "step": 12653 + }, + { + "epoch": 0.696461005008531, + "grad_norm": 0.6382080912590027, + "learning_rate": 7.319747293483085e-06, + "loss": 0.6462, + "step": 12654 + }, + { + "epoch": 0.6965160438108867, + "grad_norm": 0.7288708686828613, + "learning_rate": 7.319363293304672e-06, + "loss": 0.7907, + "step": 12655 + }, + { + "epoch": 0.6965710826132423, + "grad_norm": 0.6280390024185181, + "learning_rate": 7.318979275694546e-06, + "loss": 0.6882, + "step": 12656 + }, + { + "epoch": 0.696626121415598, + "grad_norm": 0.7260308861732483, + "learning_rate": 7.31859524065559e-06, + "loss": 0.756, + "step": 12657 + }, + { + "epoch": 0.6966811602179537, + "grad_norm": 0.6715009212493896, + "learning_rate": 7.318211188190696e-06, + "loss": 0.7194, + "step": 12658 + }, + { + "epoch": 0.6967361990203094, + "grad_norm": 0.6770408749580383, + "learning_rate": 7.3178271183027465e-06, + "loss": 0.808, + "step": 12659 + }, + { + "epoch": 0.6967912378226649, + "grad_norm": 0.7209904789924622, + "learning_rate": 7.317443030994628e-06, + "loss": 0.7242, + "step": 12660 + }, + { + "epoch": 0.6968462766250206, + "grad_norm": 0.6943202018737793, + "learning_rate": 7.317058926269227e-06, + "loss": 0.758, + "step": 12661 + }, + { + "epoch": 0.6969013154273763, + "grad_norm": 0.6073412299156189, + "learning_rate": 7.316674804129432e-06, + "loss": 0.6571, + "step": 12662 + }, + { + "epoch": 0.696956354229732, + "grad_norm": 0.7065439224243164, + "learning_rate": 7.316290664578129e-06, + "loss": 0.7333, + "step": 12663 + }, + { + "epoch": 0.6970113930320876, + "grad_norm": 0.6275133490562439, + "learning_rate": 7.315906507618207e-06, + "loss": 0.6785, + "step": 12664 + }, + { + "epoch": 0.6970664318344433, + "grad_norm": 0.6484677791595459, + "learning_rate": 7.315522333252551e-06, + "loss": 0.7461, + "step": 12665 + }, + { + "epoch": 0.697121470636799, + "grad_norm": 0.6815413236618042, + "learning_rate": 7.315138141484049e-06, + "loss": 0.673, + "step": 12666 + }, + { + "epoch": 0.6971765094391547, + "grad_norm": 0.7227872610092163, + "learning_rate": 7.314753932315587e-06, + "loss": 0.7212, + "step": 12667 + }, + { + "epoch": 0.6972315482415102, + "grad_norm": 0.661568284034729, + "learning_rate": 7.314369705750055e-06, + "loss": 0.7633, + "step": 12668 + }, + { + "epoch": 0.6972865870438659, + "grad_norm": 0.5873990654945374, + "learning_rate": 7.3139854617903405e-06, + "loss": 0.6142, + "step": 12669 + }, + { + "epoch": 0.6973416258462216, + "grad_norm": 0.7015652656555176, + "learning_rate": 7.313601200439331e-06, + "loss": 0.6762, + "step": 12670 + }, + { + "epoch": 0.6973966646485773, + "grad_norm": 0.7060853242874146, + "learning_rate": 7.313216921699913e-06, + "loss": 0.8111, + "step": 12671 + }, + { + "epoch": 0.6974517034509329, + "grad_norm": 0.6198092699050903, + "learning_rate": 7.312832625574977e-06, + "loss": 0.7058, + "step": 12672 + }, + { + "epoch": 0.6975067422532886, + "grad_norm": 0.6785464286804199, + "learning_rate": 7.312448312067408e-06, + "loss": 0.7509, + "step": 12673 + }, + { + "epoch": 0.6975617810556443, + "grad_norm": 0.74974524974823, + "learning_rate": 7.312063981180097e-06, + "loss": 0.7679, + "step": 12674 + }, + { + "epoch": 0.6976168198579998, + "grad_norm": 0.6188651919364929, + "learning_rate": 7.311679632915934e-06, + "loss": 0.663, + "step": 12675 + }, + { + "epoch": 0.6976718586603555, + "grad_norm": 0.7458493113517761, + "learning_rate": 7.3112952672778044e-06, + "loss": 0.7316, + "step": 12676 + }, + { + "epoch": 0.6977268974627112, + "grad_norm": 0.7480403780937195, + "learning_rate": 7.310910884268597e-06, + "loss": 0.8476, + "step": 12677 + }, + { + "epoch": 0.6977819362650669, + "grad_norm": 0.6921943426132202, + "learning_rate": 7.310526483891204e-06, + "loss": 0.7931, + "step": 12678 + }, + { + "epoch": 0.6978369750674225, + "grad_norm": 0.7384023666381836, + "learning_rate": 7.3101420661485124e-06, + "loss": 0.7698, + "step": 12679 + }, + { + "epoch": 0.6978920138697782, + "grad_norm": 0.6693310141563416, + "learning_rate": 7.3097576310434105e-06, + "loss": 0.6838, + "step": 12680 + }, + { + "epoch": 0.6979470526721339, + "grad_norm": 0.6888617873191833, + "learning_rate": 7.309373178578789e-06, + "loss": 0.7196, + "step": 12681 + }, + { + "epoch": 0.6980020914744895, + "grad_norm": 0.7608165144920349, + "learning_rate": 7.308988708757536e-06, + "loss": 0.7483, + "step": 12682 + }, + { + "epoch": 0.6980571302768451, + "grad_norm": 0.6969812512397766, + "learning_rate": 7.308604221582543e-06, + "loss": 0.7415, + "step": 12683 + }, + { + "epoch": 0.6981121690792008, + "grad_norm": 0.7440872192382812, + "learning_rate": 7.3082197170566996e-06, + "loss": 0.7776, + "step": 12684 + }, + { + "epoch": 0.6981672078815565, + "grad_norm": 0.7920299768447876, + "learning_rate": 7.307835195182892e-06, + "loss": 0.746, + "step": 12685 + }, + { + "epoch": 0.6982222466839122, + "grad_norm": 0.7002919912338257, + "learning_rate": 7.3074506559640134e-06, + "loss": 0.7948, + "step": 12686 + }, + { + "epoch": 0.6982772854862678, + "grad_norm": 0.7199681997299194, + "learning_rate": 7.3070660994029554e-06, + "loss": 0.7568, + "step": 12687 + }, + { + "epoch": 0.6983323242886235, + "grad_norm": 0.6287575960159302, + "learning_rate": 7.306681525502604e-06, + "loss": 0.6564, + "step": 12688 + }, + { + "epoch": 0.6983873630909792, + "grad_norm": 0.6910778880119324, + "learning_rate": 7.306296934265853e-06, + "loss": 0.7892, + "step": 12689 + }, + { + "epoch": 0.6984424018933348, + "grad_norm": 0.6454603672027588, + "learning_rate": 7.30591232569559e-06, + "loss": 0.7848, + "step": 12690 + }, + { + "epoch": 0.6984974406956904, + "grad_norm": 0.7337101101875305, + "learning_rate": 7.305527699794709e-06, + "loss": 0.8012, + "step": 12691 + }, + { + "epoch": 0.6985524794980461, + "grad_norm": 0.6694337129592896, + "learning_rate": 7.305143056566098e-06, + "loss": 0.7767, + "step": 12692 + }, + { + "epoch": 0.6986075183004018, + "grad_norm": 0.6485214233398438, + "learning_rate": 7.30475839601265e-06, + "loss": 0.7142, + "step": 12693 + }, + { + "epoch": 0.6986625571027575, + "grad_norm": 0.6401854753494263, + "learning_rate": 7.304373718137253e-06, + "loss": 0.6562, + "step": 12694 + }, + { + "epoch": 0.6987175959051131, + "grad_norm": 0.7190635800361633, + "learning_rate": 7.303989022942801e-06, + "loss": 0.7513, + "step": 12695 + }, + { + "epoch": 0.6987726347074688, + "grad_norm": 0.7100299596786499, + "learning_rate": 7.3036043104321854e-06, + "loss": 0.759, + "step": 12696 + }, + { + "epoch": 0.6988276735098244, + "grad_norm": 0.8507145047187805, + "learning_rate": 7.303219580608295e-06, + "loss": 0.7567, + "step": 12697 + }, + { + "epoch": 0.6988827123121801, + "grad_norm": 0.6758378744125366, + "learning_rate": 7.302834833474022e-06, + "loss": 0.6751, + "step": 12698 + }, + { + "epoch": 0.6989377511145357, + "grad_norm": 0.7602974772453308, + "learning_rate": 7.30245006903226e-06, + "loss": 0.7304, + "step": 12699 + }, + { + "epoch": 0.6989927899168914, + "grad_norm": 0.7519045472145081, + "learning_rate": 7.3020652872859e-06, + "loss": 0.7573, + "step": 12700 + }, + { + "epoch": 0.6990478287192471, + "grad_norm": 0.6076456904411316, + "learning_rate": 7.301680488237832e-06, + "loss": 0.6335, + "step": 12701 + }, + { + "epoch": 0.6991028675216028, + "grad_norm": 0.6900685429573059, + "learning_rate": 7.30129567189095e-06, + "loss": 0.7787, + "step": 12702 + }, + { + "epoch": 0.6991579063239584, + "grad_norm": 0.7366316318511963, + "learning_rate": 7.300910838248146e-06, + "loss": 0.8176, + "step": 12703 + }, + { + "epoch": 0.699212945126314, + "grad_norm": 0.6658521890640259, + "learning_rate": 7.300525987312312e-06, + "loss": 0.6436, + "step": 12704 + }, + { + "epoch": 0.6992679839286697, + "grad_norm": 0.7635871171951294, + "learning_rate": 7.300141119086341e-06, + "loss": 0.8421, + "step": 12705 + }, + { + "epoch": 0.6993230227310254, + "grad_norm": 0.7257800698280334, + "learning_rate": 7.299756233573125e-06, + "loss": 0.6468, + "step": 12706 + }, + { + "epoch": 0.699378061533381, + "grad_norm": 0.7536096572875977, + "learning_rate": 7.299371330775558e-06, + "loss": 0.7782, + "step": 12707 + }, + { + "epoch": 0.6994331003357367, + "grad_norm": 0.7504379153251648, + "learning_rate": 7.298986410696529e-06, + "loss": 0.7097, + "step": 12708 + }, + { + "epoch": 0.6994881391380924, + "grad_norm": 0.7340306043624878, + "learning_rate": 7.298601473338936e-06, + "loss": 0.8165, + "step": 12709 + }, + { + "epoch": 0.6995431779404481, + "grad_norm": 0.6928045749664307, + "learning_rate": 7.298216518705667e-06, + "loss": 0.777, + "step": 12710 + }, + { + "epoch": 0.6995982167428036, + "grad_norm": 0.6942496299743652, + "learning_rate": 7.29783154679962e-06, + "loss": 0.6607, + "step": 12711 + }, + { + "epoch": 0.6996532555451593, + "grad_norm": 0.6646896600723267, + "learning_rate": 7.297446557623684e-06, + "loss": 0.712, + "step": 12712 + }, + { + "epoch": 0.699708294347515, + "grad_norm": 0.6828078627586365, + "learning_rate": 7.297061551180758e-06, + "loss": 0.7251, + "step": 12713 + }, + { + "epoch": 0.6997633331498707, + "grad_norm": 0.7554219365119934, + "learning_rate": 7.296676527473729e-06, + "loss": 0.8279, + "step": 12714 + }, + { + "epoch": 0.6998183719522263, + "grad_norm": 0.8122106194496155, + "learning_rate": 7.296291486505495e-06, + "loss": 0.8039, + "step": 12715 + }, + { + "epoch": 0.699873410754582, + "grad_norm": 0.6602222323417664, + "learning_rate": 7.295906428278949e-06, + "loss": 0.7149, + "step": 12716 + }, + { + "epoch": 0.6999284495569377, + "grad_norm": 0.8341954350471497, + "learning_rate": 7.2955213527969845e-06, + "loss": 0.7868, + "step": 12717 + }, + { + "epoch": 0.6999834883592932, + "grad_norm": 0.7157256603240967, + "learning_rate": 7.295136260062496e-06, + "loss": 0.745, + "step": 12718 + }, + { + "epoch": 0.7000385271616489, + "grad_norm": 0.5845672488212585, + "learning_rate": 7.294751150078379e-06, + "loss": 0.657, + "step": 12719 + }, + { + "epoch": 0.7000935659640046, + "grad_norm": 0.7370786070823669, + "learning_rate": 7.2943660228475265e-06, + "loss": 0.7883, + "step": 12720 + }, + { + "epoch": 0.7001486047663603, + "grad_norm": 0.6687451004981995, + "learning_rate": 7.293980878372833e-06, + "loss": 0.7945, + "step": 12721 + }, + { + "epoch": 0.7002036435687159, + "grad_norm": 0.6352105736732483, + "learning_rate": 7.293595716657192e-06, + "loss": 0.6581, + "step": 12722 + }, + { + "epoch": 0.7002586823710716, + "grad_norm": 0.7371370196342468, + "learning_rate": 7.293210537703499e-06, + "loss": 0.7859, + "step": 12723 + }, + { + "epoch": 0.7003137211734273, + "grad_norm": 0.6885504722595215, + "learning_rate": 7.292825341514651e-06, + "loss": 0.7355, + "step": 12724 + }, + { + "epoch": 0.700368759975783, + "grad_norm": 0.6930849552154541, + "learning_rate": 7.292440128093542e-06, + "loss": 0.8145, + "step": 12725 + }, + { + "epoch": 0.7004237987781385, + "grad_norm": 0.6767199635505676, + "learning_rate": 7.292054897443065e-06, + "loss": 0.7136, + "step": 12726 + }, + { + "epoch": 0.7004788375804942, + "grad_norm": 0.6672216653823853, + "learning_rate": 7.291669649566117e-06, + "loss": 0.6131, + "step": 12727 + }, + { + "epoch": 0.7005338763828499, + "grad_norm": 0.6618815064430237, + "learning_rate": 7.291284384465595e-06, + "loss": 0.7633, + "step": 12728 + }, + { + "epoch": 0.7005889151852056, + "grad_norm": 0.6573876142501831, + "learning_rate": 7.290899102144392e-06, + "loss": 0.7621, + "step": 12729 + }, + { + "epoch": 0.7006439539875612, + "grad_norm": 0.7449564337730408, + "learning_rate": 7.290513802605405e-06, + "loss": 0.6488, + "step": 12730 + }, + { + "epoch": 0.7006989927899169, + "grad_norm": 0.7307295203208923, + "learning_rate": 7.290128485851529e-06, + "loss": 0.7095, + "step": 12731 + }, + { + "epoch": 0.7007540315922726, + "grad_norm": 0.698699951171875, + "learning_rate": 7.2897431518856596e-06, + "loss": 0.7428, + "step": 12732 + }, + { + "epoch": 0.7008090703946283, + "grad_norm": 0.6334750056266785, + "learning_rate": 7.289357800710695e-06, + "loss": 0.6977, + "step": 12733 + }, + { + "epoch": 0.7008641091969838, + "grad_norm": 0.6526468396186829, + "learning_rate": 7.288972432329529e-06, + "loss": 0.6375, + "step": 12734 + }, + { + "epoch": 0.7009191479993395, + "grad_norm": 0.7282149791717529, + "learning_rate": 7.288587046745059e-06, + "loss": 0.7494, + "step": 12735 + }, + { + "epoch": 0.7009741868016952, + "grad_norm": 0.8511056900024414, + "learning_rate": 7.288201643960182e-06, + "loss": 0.7494, + "step": 12736 + }, + { + "epoch": 0.7010292256040509, + "grad_norm": 0.6908526420593262, + "learning_rate": 7.287816223977793e-06, + "loss": 0.6861, + "step": 12737 + }, + { + "epoch": 0.7010842644064065, + "grad_norm": 0.7582982182502747, + "learning_rate": 7.2874307868007896e-06, + "loss": 0.7758, + "step": 12738 + }, + { + "epoch": 0.7011393032087622, + "grad_norm": 0.9717779159545898, + "learning_rate": 7.2870453324320685e-06, + "loss": 0.7221, + "step": 12739 + }, + { + "epoch": 0.7011943420111179, + "grad_norm": 0.6532751321792603, + "learning_rate": 7.286659860874529e-06, + "loss": 0.8009, + "step": 12740 + }, + { + "epoch": 0.7012493808134735, + "grad_norm": 0.6708540320396423, + "learning_rate": 7.286274372131065e-06, + "loss": 0.7177, + "step": 12741 + }, + { + "epoch": 0.7013044196158291, + "grad_norm": 0.7624804973602295, + "learning_rate": 7.285888866204575e-06, + "loss": 0.7878, + "step": 12742 + }, + { + "epoch": 0.7013594584181848, + "grad_norm": 0.7167851328849792, + "learning_rate": 7.285503343097955e-06, + "loss": 0.7276, + "step": 12743 + }, + { + "epoch": 0.7014144972205405, + "grad_norm": 0.6592209935188293, + "learning_rate": 7.2851178028141045e-06, + "loss": 0.7665, + "step": 12744 + }, + { + "epoch": 0.7014695360228962, + "grad_norm": 0.684847354888916, + "learning_rate": 7.284732245355921e-06, + "loss": 0.7358, + "step": 12745 + }, + { + "epoch": 0.7015245748252518, + "grad_norm": 0.6852415800094604, + "learning_rate": 7.2843466707262985e-06, + "loss": 0.7805, + "step": 12746 + }, + { + "epoch": 0.7015796136276075, + "grad_norm": 0.6422114968299866, + "learning_rate": 7.283961078928141e-06, + "loss": 0.7386, + "step": 12747 + }, + { + "epoch": 0.7016346524299631, + "grad_norm": 0.7538495659828186, + "learning_rate": 7.283575469964343e-06, + "loss": 0.798, + "step": 12748 + }, + { + "epoch": 0.7016896912323188, + "grad_norm": 0.6646687984466553, + "learning_rate": 7.2831898438378025e-06, + "loss": 0.7048, + "step": 12749 + }, + { + "epoch": 0.7017447300346744, + "grad_norm": 0.8338429927825928, + "learning_rate": 7.2828042005514176e-06, + "loss": 0.8585, + "step": 12750 + }, + { + "epoch": 0.7017997688370301, + "grad_norm": 0.7086663842201233, + "learning_rate": 7.282418540108088e-06, + "loss": 0.8011, + "step": 12751 + }, + { + "epoch": 0.7018548076393858, + "grad_norm": 0.6040074229240417, + "learning_rate": 7.282032862510712e-06, + "loss": 0.6327, + "step": 12752 + }, + { + "epoch": 0.7019098464417415, + "grad_norm": 0.7030978798866272, + "learning_rate": 7.281647167762187e-06, + "loss": 0.6373, + "step": 12753 + }, + { + "epoch": 0.7019648852440971, + "grad_norm": 0.662308394908905, + "learning_rate": 7.281261455865414e-06, + "loss": 0.7283, + "step": 12754 + }, + { + "epoch": 0.7020199240464527, + "grad_norm": 0.7369368672370911, + "learning_rate": 7.28087572682329e-06, + "loss": 0.7632, + "step": 12755 + }, + { + "epoch": 0.7020749628488084, + "grad_norm": 0.6887282729148865, + "learning_rate": 7.280489980638714e-06, + "loss": 0.7629, + "step": 12756 + }, + { + "epoch": 0.702130001651164, + "grad_norm": 0.656512975692749, + "learning_rate": 7.280104217314587e-06, + "loss": 0.8028, + "step": 12757 + }, + { + "epoch": 0.7021850404535197, + "grad_norm": 0.7006264328956604, + "learning_rate": 7.279718436853805e-06, + "loss": 0.7025, + "step": 12758 + }, + { + "epoch": 0.7022400792558754, + "grad_norm": 0.675585925579071, + "learning_rate": 7.279332639259271e-06, + "loss": 0.8001, + "step": 12759 + }, + { + "epoch": 0.7022951180582311, + "grad_norm": 0.7105827331542969, + "learning_rate": 7.278946824533883e-06, + "loss": 0.7767, + "step": 12760 + }, + { + "epoch": 0.7023501568605867, + "grad_norm": 0.8310064673423767, + "learning_rate": 7.27856099268054e-06, + "loss": 0.7828, + "step": 12761 + }, + { + "epoch": 0.7024051956629423, + "grad_norm": 0.6885055899620056, + "learning_rate": 7.278175143702142e-06, + "loss": 0.7018, + "step": 12762 + }, + { + "epoch": 0.702460234465298, + "grad_norm": 0.6542866826057434, + "learning_rate": 7.27778927760159e-06, + "loss": 0.7118, + "step": 12763 + }, + { + "epoch": 0.7025152732676537, + "grad_norm": 0.9102655053138733, + "learning_rate": 7.277403394381784e-06, + "loss": 0.8381, + "step": 12764 + }, + { + "epoch": 0.7025703120700093, + "grad_norm": 0.6538355946540833, + "learning_rate": 7.277017494045624e-06, + "loss": 0.7766, + "step": 12765 + }, + { + "epoch": 0.702625350872365, + "grad_norm": 0.6691237092018127, + "learning_rate": 7.27663157659601e-06, + "loss": 0.8077, + "step": 12766 + }, + { + "epoch": 0.7026803896747207, + "grad_norm": 0.7159995436668396, + "learning_rate": 7.2762456420358414e-06, + "loss": 0.8333, + "step": 12767 + }, + { + "epoch": 0.7027354284770764, + "grad_norm": 0.6518422365188599, + "learning_rate": 7.275859690368022e-06, + "loss": 0.7634, + "step": 12768 + }, + { + "epoch": 0.702790467279432, + "grad_norm": 0.6969057321548462, + "learning_rate": 7.275473721595449e-06, + "loss": 0.7481, + "step": 12769 + }, + { + "epoch": 0.7028455060817876, + "grad_norm": 0.6788915395736694, + "learning_rate": 7.2750877357210225e-06, + "loss": 0.7402, + "step": 12770 + }, + { + "epoch": 0.7029005448841433, + "grad_norm": 0.7323998212814331, + "learning_rate": 7.274701732747649e-06, + "loss": 0.7122, + "step": 12771 + }, + { + "epoch": 0.702955583686499, + "grad_norm": 0.7224077582359314, + "learning_rate": 7.274315712678224e-06, + "loss": 0.7333, + "step": 12772 + }, + { + "epoch": 0.7030106224888546, + "grad_norm": 0.9009444117546082, + "learning_rate": 7.273929675515652e-06, + "loss": 0.6912, + "step": 12773 + }, + { + "epoch": 0.7030656612912103, + "grad_norm": 0.7076312899589539, + "learning_rate": 7.273543621262832e-06, + "loss": 0.7651, + "step": 12774 + }, + { + "epoch": 0.703120700093566, + "grad_norm": 0.78575599193573, + "learning_rate": 7.273157549922668e-06, + "loss": 0.7443, + "step": 12775 + }, + { + "epoch": 0.7031757388959217, + "grad_norm": 0.6957094669342041, + "learning_rate": 7.27277146149806e-06, + "loss": 0.7684, + "step": 12776 + }, + { + "epoch": 0.7032307776982772, + "grad_norm": 1.177878975868225, + "learning_rate": 7.27238535599191e-06, + "loss": 0.9033, + "step": 12777 + }, + { + "epoch": 0.7032858165006329, + "grad_norm": 0.6929007768630981, + "learning_rate": 7.27199923340712e-06, + "loss": 0.7411, + "step": 12778 + }, + { + "epoch": 0.7033408553029886, + "grad_norm": 0.7725315093994141, + "learning_rate": 7.2716130937465926e-06, + "loss": 0.7833, + "step": 12779 + }, + { + "epoch": 0.7033958941053443, + "grad_norm": 0.6512928605079651, + "learning_rate": 7.271226937013228e-06, + "loss": 0.7918, + "step": 12780 + }, + { + "epoch": 0.7034509329076999, + "grad_norm": 0.7033893465995789, + "learning_rate": 7.270840763209931e-06, + "loss": 0.843, + "step": 12781 + }, + { + "epoch": 0.7035059717100556, + "grad_norm": 0.7596432566642761, + "learning_rate": 7.2704545723396e-06, + "loss": 0.7916, + "step": 12782 + }, + { + "epoch": 0.7035610105124113, + "grad_norm": 0.6256046891212463, + "learning_rate": 7.270068364405143e-06, + "loss": 0.6531, + "step": 12783 + }, + { + "epoch": 0.703616049314767, + "grad_norm": 0.8107615113258362, + "learning_rate": 7.26968213940946e-06, + "loss": 0.7755, + "step": 12784 + }, + { + "epoch": 0.7036710881171225, + "grad_norm": 0.6742845177650452, + "learning_rate": 7.269295897355451e-06, + "loss": 0.834, + "step": 12785 + }, + { + "epoch": 0.7037261269194782, + "grad_norm": 0.6665072441101074, + "learning_rate": 7.268909638246024e-06, + "loss": 0.6864, + "step": 12786 + }, + { + "epoch": 0.7037811657218339, + "grad_norm": 0.68357914686203, + "learning_rate": 7.268523362084078e-06, + "loss": 0.7789, + "step": 12787 + }, + { + "epoch": 0.7038362045241896, + "grad_norm": 0.6878114938735962, + "learning_rate": 7.268137068872519e-06, + "loss": 0.7277, + "step": 12788 + }, + { + "epoch": 0.7038912433265452, + "grad_norm": 0.7173313498497009, + "learning_rate": 7.267750758614247e-06, + "loss": 0.8156, + "step": 12789 + }, + { + "epoch": 0.7039462821289009, + "grad_norm": 0.6523084044456482, + "learning_rate": 7.267364431312169e-06, + "loss": 0.7143, + "step": 12790 + }, + { + "epoch": 0.7040013209312566, + "grad_norm": 0.7403815388679504, + "learning_rate": 7.2669780869691865e-06, + "loss": 0.8196, + "step": 12791 + }, + { + "epoch": 0.7040563597336122, + "grad_norm": 0.6411255598068237, + "learning_rate": 7.266591725588204e-06, + "loss": 0.6645, + "step": 12792 + }, + { + "epoch": 0.7041113985359678, + "grad_norm": 0.9094020128250122, + "learning_rate": 7.266205347172124e-06, + "loss": 0.8023, + "step": 12793 + }, + { + "epoch": 0.7041664373383235, + "grad_norm": 1.1041208505630493, + "learning_rate": 7.265818951723851e-06, + "loss": 0.7011, + "step": 12794 + }, + { + "epoch": 0.7042214761406792, + "grad_norm": 0.7339954376220703, + "learning_rate": 7.265432539246289e-06, + "loss": 0.7467, + "step": 12795 + }, + { + "epoch": 0.7042765149430349, + "grad_norm": 0.7055865526199341, + "learning_rate": 7.265046109742344e-06, + "loss": 0.7364, + "step": 12796 + }, + { + "epoch": 0.7043315537453905, + "grad_norm": 0.7052320241928101, + "learning_rate": 7.264659663214917e-06, + "loss": 0.7611, + "step": 12797 + }, + { + "epoch": 0.7043865925477462, + "grad_norm": 0.7374194860458374, + "learning_rate": 7.264273199666915e-06, + "loss": 0.7612, + "step": 12798 + }, + { + "epoch": 0.7044416313501018, + "grad_norm": 0.634986162185669, + "learning_rate": 7.263886719101242e-06, + "loss": 0.8001, + "step": 12799 + }, + { + "epoch": 0.7044966701524574, + "grad_norm": 0.8178644180297852, + "learning_rate": 7.2635002215208014e-06, + "loss": 0.8404, + "step": 12800 + }, + { + "epoch": 0.7045517089548131, + "grad_norm": 0.7743822336196899, + "learning_rate": 7.263113706928501e-06, + "loss": 0.7297, + "step": 12801 + }, + { + "epoch": 0.7046067477571688, + "grad_norm": 0.6558601260185242, + "learning_rate": 7.262727175327242e-06, + "loss": 0.6933, + "step": 12802 + }, + { + "epoch": 0.7046617865595245, + "grad_norm": 1.0608787536621094, + "learning_rate": 7.262340626719933e-06, + "loss": 0.8792, + "step": 12803 + }, + { + "epoch": 0.7047168253618801, + "grad_norm": 0.7488270401954651, + "learning_rate": 7.261954061109475e-06, + "loss": 0.7755, + "step": 12804 + }, + { + "epoch": 0.7047718641642358, + "grad_norm": 0.8960574865341187, + "learning_rate": 7.261567478498778e-06, + "loss": 0.7274, + "step": 12805 + }, + { + "epoch": 0.7048269029665915, + "grad_norm": 0.6289944648742676, + "learning_rate": 7.2611808788907436e-06, + "loss": 0.6469, + "step": 12806 + }, + { + "epoch": 0.7048819417689471, + "grad_norm": 0.6488339900970459, + "learning_rate": 7.26079426228828e-06, + "loss": 0.7581, + "step": 12807 + }, + { + "epoch": 0.7049369805713027, + "grad_norm": 0.7354650497436523, + "learning_rate": 7.260407628694292e-06, + "loss": 0.7596, + "step": 12808 + }, + { + "epoch": 0.7049920193736584, + "grad_norm": 0.8163169026374817, + "learning_rate": 7.2600209781116834e-06, + "loss": 0.8291, + "step": 12809 + }, + { + "epoch": 0.7050470581760141, + "grad_norm": 0.8223916292190552, + "learning_rate": 7.259634310543364e-06, + "loss": 0.7089, + "step": 12810 + }, + { + "epoch": 0.7051020969783698, + "grad_norm": 0.7815924286842346, + "learning_rate": 7.2592476259922374e-06, + "loss": 0.8098, + "step": 12811 + }, + { + "epoch": 0.7051571357807254, + "grad_norm": 0.7027734518051147, + "learning_rate": 7.2588609244612105e-06, + "loss": 0.7276, + "step": 12812 + }, + { + "epoch": 0.705212174583081, + "grad_norm": 0.7345930337905884, + "learning_rate": 7.2584742059531894e-06, + "loss": 0.803, + "step": 12813 + }, + { + "epoch": 0.7052672133854367, + "grad_norm": 0.6998127102851868, + "learning_rate": 7.258087470471081e-06, + "loss": 0.7938, + "step": 12814 + }, + { + "epoch": 0.7053222521877924, + "grad_norm": 0.6418118476867676, + "learning_rate": 7.257700718017793e-06, + "loss": 0.66, + "step": 12815 + }, + { + "epoch": 0.705377290990148, + "grad_norm": 0.6774695515632629, + "learning_rate": 7.257313948596228e-06, + "loss": 0.7143, + "step": 12816 + }, + { + "epoch": 0.7054323297925037, + "grad_norm": 0.7107009291648865, + "learning_rate": 7.256927162209298e-06, + "loss": 0.8378, + "step": 12817 + }, + { + "epoch": 0.7054873685948594, + "grad_norm": 0.7287374138832092, + "learning_rate": 7.256540358859906e-06, + "loss": 0.88, + "step": 12818 + }, + { + "epoch": 0.7055424073972151, + "grad_norm": 0.651221752166748, + "learning_rate": 7.256153538550961e-06, + "loss": 0.7092, + "step": 12819 + }, + { + "epoch": 0.7055974461995707, + "grad_norm": 0.6549085974693298, + "learning_rate": 7.255766701285371e-06, + "loss": 0.6697, + "step": 12820 + }, + { + "epoch": 0.7056524850019263, + "grad_norm": 0.6617292165756226, + "learning_rate": 7.255379847066041e-06, + "loss": 0.7779, + "step": 12821 + }, + { + "epoch": 0.705707523804282, + "grad_norm": 0.6677221655845642, + "learning_rate": 7.254992975895879e-06, + "loss": 0.7821, + "step": 12822 + }, + { + "epoch": 0.7057625626066377, + "grad_norm": 0.8183515667915344, + "learning_rate": 7.2546060877777945e-06, + "loss": 0.7727, + "step": 12823 + }, + { + "epoch": 0.7058176014089933, + "grad_norm": 0.6574132442474365, + "learning_rate": 7.2542191827146945e-06, + "loss": 0.7118, + "step": 12824 + }, + { + "epoch": 0.705872640211349, + "grad_norm": 0.6874130964279175, + "learning_rate": 7.253832260709487e-06, + "loss": 0.7677, + "step": 12825 + }, + { + "epoch": 0.7059276790137047, + "grad_norm": 0.6460297107696533, + "learning_rate": 7.253445321765079e-06, + "loss": 0.725, + "step": 12826 + }, + { + "epoch": 0.7059827178160604, + "grad_norm": 0.6618219614028931, + "learning_rate": 7.253058365884379e-06, + "loss": 0.7504, + "step": 12827 + }, + { + "epoch": 0.706037756618416, + "grad_norm": 0.6519019603729248, + "learning_rate": 7.252671393070295e-06, + "loss": 0.7382, + "step": 12828 + }, + { + "epoch": 0.7060927954207716, + "grad_norm": 0.7114588022232056, + "learning_rate": 7.252284403325737e-06, + "loss": 0.8364, + "step": 12829 + }, + { + "epoch": 0.7061478342231273, + "grad_norm": 0.6304726600646973, + "learning_rate": 7.251897396653611e-06, + "loss": 0.6972, + "step": 12830 + }, + { + "epoch": 0.706202873025483, + "grad_norm": 0.6728807687759399, + "learning_rate": 7.251510373056827e-06, + "loss": 0.671, + "step": 12831 + }, + { + "epoch": 0.7062579118278386, + "grad_norm": 0.690641462802887, + "learning_rate": 7.251123332538295e-06, + "loss": 0.7381, + "step": 12832 + }, + { + "epoch": 0.7063129506301943, + "grad_norm": 0.7018027305603027, + "learning_rate": 7.2507362751009226e-06, + "loss": 0.7546, + "step": 12833 + }, + { + "epoch": 0.70636798943255, + "grad_norm": 0.7203684449195862, + "learning_rate": 7.250349200747617e-06, + "loss": 0.7534, + "step": 12834 + }, + { + "epoch": 0.7064230282349057, + "grad_norm": 0.6936585903167725, + "learning_rate": 7.24996210948129e-06, + "loss": 0.7716, + "step": 12835 + }, + { + "epoch": 0.7064780670372612, + "grad_norm": 0.7421281337738037, + "learning_rate": 7.249575001304851e-06, + "loss": 0.7517, + "step": 12836 + }, + { + "epoch": 0.7065331058396169, + "grad_norm": 0.6622288227081299, + "learning_rate": 7.249187876221207e-06, + "loss": 0.6799, + "step": 12837 + }, + { + "epoch": 0.7065881446419726, + "grad_norm": 0.7267055511474609, + "learning_rate": 7.24880073423327e-06, + "loss": 0.7871, + "step": 12838 + }, + { + "epoch": 0.7066431834443283, + "grad_norm": 0.6978085041046143, + "learning_rate": 7.2484135753439485e-06, + "loss": 0.7812, + "step": 12839 + }, + { + "epoch": 0.7066982222466839, + "grad_norm": 0.8353652358055115, + "learning_rate": 7.248026399556153e-06, + "loss": 0.7481, + "step": 12840 + }, + { + "epoch": 0.7067532610490396, + "grad_norm": 0.8402471542358398, + "learning_rate": 7.247639206872792e-06, + "loss": 0.783, + "step": 12841 + }, + { + "epoch": 0.7068082998513953, + "grad_norm": 0.8279419541358948, + "learning_rate": 7.247251997296777e-06, + "loss": 0.8177, + "step": 12842 + }, + { + "epoch": 0.7068633386537508, + "grad_norm": 0.6850735545158386, + "learning_rate": 7.246864770831017e-06, + "loss": 0.7586, + "step": 12843 + }, + { + "epoch": 0.7069183774561065, + "grad_norm": 0.7327665090560913, + "learning_rate": 7.246477527478422e-06, + "loss": 0.9327, + "step": 12844 + }, + { + "epoch": 0.7069734162584622, + "grad_norm": 0.6343075037002563, + "learning_rate": 7.246090267241905e-06, + "loss": 0.6957, + "step": 12845 + }, + { + "epoch": 0.7070284550608179, + "grad_norm": 0.7028965353965759, + "learning_rate": 7.245702990124373e-06, + "loss": 0.7524, + "step": 12846 + }, + { + "epoch": 0.7070834938631735, + "grad_norm": 0.7578299045562744, + "learning_rate": 7.24531569612874e-06, + "loss": 0.7302, + "step": 12847 + }, + { + "epoch": 0.7071385326655292, + "grad_norm": 0.8113438487052917, + "learning_rate": 7.2449283852579146e-06, + "loss": 0.7658, + "step": 12848 + }, + { + "epoch": 0.7071935714678849, + "grad_norm": 0.6442512273788452, + "learning_rate": 7.244541057514809e-06, + "loss": 0.6742, + "step": 12849 + }, + { + "epoch": 0.7072486102702406, + "grad_norm": 0.8595272898674011, + "learning_rate": 7.244153712902333e-06, + "loss": 0.7944, + "step": 12850 + }, + { + "epoch": 0.7073036490725961, + "grad_norm": 0.6565983891487122, + "learning_rate": 7.243766351423398e-06, + "loss": 0.7411, + "step": 12851 + }, + { + "epoch": 0.7073586878749518, + "grad_norm": 0.7935337424278259, + "learning_rate": 7.243378973080917e-06, + "loss": 0.8109, + "step": 12852 + }, + { + "epoch": 0.7074137266773075, + "grad_norm": 0.7083927392959595, + "learning_rate": 7.242991577877799e-06, + "loss": 0.8405, + "step": 12853 + }, + { + "epoch": 0.7074687654796632, + "grad_norm": 0.7452830672264099, + "learning_rate": 7.242604165816958e-06, + "loss": 0.7972, + "step": 12854 + }, + { + "epoch": 0.7075238042820188, + "grad_norm": 0.6775808334350586, + "learning_rate": 7.242216736901302e-06, + "loss": 0.7114, + "step": 12855 + }, + { + "epoch": 0.7075788430843745, + "grad_norm": 0.8069992661476135, + "learning_rate": 7.241829291133748e-06, + "loss": 0.6606, + "step": 12856 + }, + { + "epoch": 0.7076338818867302, + "grad_norm": 0.6690802574157715, + "learning_rate": 7.241441828517203e-06, + "loss": 0.742, + "step": 12857 + }, + { + "epoch": 0.7076889206890858, + "grad_norm": 0.8077805638313293, + "learning_rate": 7.2410543490545814e-06, + "loss": 0.7786, + "step": 12858 + }, + { + "epoch": 0.7077439594914414, + "grad_norm": 0.6906875967979431, + "learning_rate": 7.240666852748795e-06, + "loss": 0.7445, + "step": 12859 + }, + { + "epoch": 0.7077989982937971, + "grad_norm": 0.6830704808235168, + "learning_rate": 7.2402793396027585e-06, + "loss": 0.7664, + "step": 12860 + }, + { + "epoch": 0.7078540370961528, + "grad_norm": 0.8118640780448914, + "learning_rate": 7.23989180961938e-06, + "loss": 0.7654, + "step": 12861 + }, + { + "epoch": 0.7079090758985085, + "grad_norm": 0.6819882392883301, + "learning_rate": 7.2395042628015755e-06, + "loss": 0.649, + "step": 12862 + }, + { + "epoch": 0.7079641147008641, + "grad_norm": 0.6543441414833069, + "learning_rate": 7.239116699152256e-06, + "loss": 0.8054, + "step": 12863 + }, + { + "epoch": 0.7080191535032198, + "grad_norm": 0.8613989353179932, + "learning_rate": 7.238729118674335e-06, + "loss": 0.7283, + "step": 12864 + }, + { + "epoch": 0.7080741923055754, + "grad_norm": 0.6993124485015869, + "learning_rate": 7.238341521370725e-06, + "loss": 0.8145, + "step": 12865 + }, + { + "epoch": 0.7081292311079311, + "grad_norm": 0.7047560811042786, + "learning_rate": 7.237953907244339e-06, + "loss": 0.6729, + "step": 12866 + }, + { + "epoch": 0.7081842699102867, + "grad_norm": 0.7923689484596252, + "learning_rate": 7.237566276298091e-06, + "loss": 0.7615, + "step": 12867 + }, + { + "epoch": 0.7082393087126424, + "grad_norm": 0.6873850226402283, + "learning_rate": 7.237178628534894e-06, + "loss": 0.7638, + "step": 12868 + }, + { + "epoch": 0.7082943475149981, + "grad_norm": 0.6483134031295776, + "learning_rate": 7.236790963957661e-06, + "loss": 0.6366, + "step": 12869 + }, + { + "epoch": 0.7083493863173538, + "grad_norm": 0.6623784899711609, + "learning_rate": 7.236403282569305e-06, + "loss": 0.7032, + "step": 12870 + }, + { + "epoch": 0.7084044251197094, + "grad_norm": 0.7004366517066956, + "learning_rate": 7.236015584372741e-06, + "loss": 0.6436, + "step": 12871 + }, + { + "epoch": 0.708459463922065, + "grad_norm": 0.5676529407501221, + "learning_rate": 7.235627869370883e-06, + "loss": 0.6395, + "step": 12872 + }, + { + "epoch": 0.7085145027244207, + "grad_norm": 0.6909729838371277, + "learning_rate": 7.235240137566644e-06, + "loss": 0.7063, + "step": 12873 + }, + { + "epoch": 0.7085695415267764, + "grad_norm": 0.7635348439216614, + "learning_rate": 7.234852388962939e-06, + "loss": 0.7518, + "step": 12874 + }, + { + "epoch": 0.708624580329132, + "grad_norm": 0.7217742204666138, + "learning_rate": 7.2344646235626815e-06, + "loss": 0.7782, + "step": 12875 + }, + { + "epoch": 0.7086796191314877, + "grad_norm": 0.6506509184837341, + "learning_rate": 7.2340768413687855e-06, + "loss": 0.7456, + "step": 12876 + }, + { + "epoch": 0.7087346579338434, + "grad_norm": 0.6537386775016785, + "learning_rate": 7.2336890423841664e-06, + "loss": 0.7395, + "step": 12877 + }, + { + "epoch": 0.7087896967361991, + "grad_norm": 0.7759900689125061, + "learning_rate": 7.233301226611737e-06, + "loss": 0.8098, + "step": 12878 + }, + { + "epoch": 0.7088447355385546, + "grad_norm": 0.8476354479789734, + "learning_rate": 7.232913394054415e-06, + "loss": 0.8241, + "step": 12879 + }, + { + "epoch": 0.7088997743409103, + "grad_norm": 0.6770507097244263, + "learning_rate": 7.232525544715114e-06, + "loss": 0.6966, + "step": 12880 + }, + { + "epoch": 0.708954813143266, + "grad_norm": 0.7750027775764465, + "learning_rate": 7.232137678596747e-06, + "loss": 0.8038, + "step": 12881 + }, + { + "epoch": 0.7090098519456217, + "grad_norm": 0.6507213711738586, + "learning_rate": 7.231749795702232e-06, + "loss": 0.6446, + "step": 12882 + }, + { + "epoch": 0.7090648907479773, + "grad_norm": 0.7554625272750854, + "learning_rate": 7.231361896034481e-06, + "loss": 0.7769, + "step": 12883 + }, + { + "epoch": 0.709119929550333, + "grad_norm": 0.8175020813941956, + "learning_rate": 7.230973979596414e-06, + "loss": 0.8283, + "step": 12884 + }, + { + "epoch": 0.7091749683526887, + "grad_norm": 0.7528663873672485, + "learning_rate": 7.2305860463909416e-06, + "loss": 0.7737, + "step": 12885 + }, + { + "epoch": 0.7092300071550443, + "grad_norm": 0.9242768883705139, + "learning_rate": 7.230198096420983e-06, + "loss": 0.647, + "step": 12886 + }, + { + "epoch": 0.7092850459573999, + "grad_norm": 0.899874746799469, + "learning_rate": 7.229810129689452e-06, + "loss": 0.8952, + "step": 12887 + }, + { + "epoch": 0.7093400847597556, + "grad_norm": 0.8221275806427002, + "learning_rate": 7.229422146199266e-06, + "loss": 0.6845, + "step": 12888 + }, + { + "epoch": 0.7093951235621113, + "grad_norm": 0.6964027285575867, + "learning_rate": 7.229034145953338e-06, + "loss": 0.7153, + "step": 12889 + }, + { + "epoch": 0.7094501623644669, + "grad_norm": 0.8018684387207031, + "learning_rate": 7.228646128954588e-06, + "loss": 0.6421, + "step": 12890 + }, + { + "epoch": 0.7095052011668226, + "grad_norm": 0.6874614953994751, + "learning_rate": 7.228258095205928e-06, + "loss": 0.8024, + "step": 12891 + }, + { + "epoch": 0.7095602399691783, + "grad_norm": 0.7141417860984802, + "learning_rate": 7.227870044710277e-06, + "loss": 0.7746, + "step": 12892 + }, + { + "epoch": 0.709615278771534, + "grad_norm": 0.7109399437904358, + "learning_rate": 7.227481977470552e-06, + "loss": 0.7826, + "step": 12893 + }, + { + "epoch": 0.7096703175738895, + "grad_norm": 0.7021867036819458, + "learning_rate": 7.227093893489669e-06, + "loss": 0.7196, + "step": 12894 + }, + { + "epoch": 0.7097253563762452, + "grad_norm": 0.6896560788154602, + "learning_rate": 7.226705792770543e-06, + "loss": 0.6925, + "step": 12895 + }, + { + "epoch": 0.7097803951786009, + "grad_norm": 0.7138262987136841, + "learning_rate": 7.226317675316094e-06, + "loss": 0.7417, + "step": 12896 + }, + { + "epoch": 0.7098354339809566, + "grad_norm": 0.6789212226867676, + "learning_rate": 7.225929541129236e-06, + "loss": 0.7095, + "step": 12897 + }, + { + "epoch": 0.7098904727833122, + "grad_norm": 0.8102045059204102, + "learning_rate": 7.225541390212889e-06, + "loss": 0.9252, + "step": 12898 + }, + { + "epoch": 0.7099455115856679, + "grad_norm": 0.6220358610153198, + "learning_rate": 7.2251532225699674e-06, + "loss": 0.7205, + "step": 12899 + }, + { + "epoch": 0.7100005503880236, + "grad_norm": 0.6375265121459961, + "learning_rate": 7.224765038203391e-06, + "loss": 0.7974, + "step": 12900 + }, + { + "epoch": 0.7100555891903793, + "grad_norm": 0.7457360029220581, + "learning_rate": 7.224376837116075e-06, + "loss": 0.7083, + "step": 12901 + }, + { + "epoch": 0.7101106279927348, + "grad_norm": 0.7012878060340881, + "learning_rate": 7.2239886193109374e-06, + "loss": 0.7334, + "step": 12902 + }, + { + "epoch": 0.7101656667950905, + "grad_norm": 0.7437683343887329, + "learning_rate": 7.223600384790898e-06, + "loss": 0.82, + "step": 12903 + }, + { + "epoch": 0.7102207055974462, + "grad_norm": 0.6727370619773865, + "learning_rate": 7.223212133558872e-06, + "loss": 0.7339, + "step": 12904 + }, + { + "epoch": 0.7102757443998019, + "grad_norm": 0.9253849983215332, + "learning_rate": 7.222823865617781e-06, + "loss": 0.7398, + "step": 12905 + }, + { + "epoch": 0.7103307832021575, + "grad_norm": 0.6664100885391235, + "learning_rate": 7.222435580970539e-06, + "loss": 0.7519, + "step": 12906 + }, + { + "epoch": 0.7103858220045132, + "grad_norm": 0.7452943325042725, + "learning_rate": 7.222047279620066e-06, + "loss": 0.7382, + "step": 12907 + }, + { + "epoch": 0.7104408608068689, + "grad_norm": 0.7235015630722046, + "learning_rate": 7.22165896156928e-06, + "loss": 0.7726, + "step": 12908 + }, + { + "epoch": 0.7104958996092245, + "grad_norm": 0.6324653029441833, + "learning_rate": 7.221270626821102e-06, + "loss": 0.7451, + "step": 12909 + }, + { + "epoch": 0.7105509384115801, + "grad_norm": 0.789829432964325, + "learning_rate": 7.220882275378447e-06, + "loss": 0.7375, + "step": 12910 + }, + { + "epoch": 0.7106059772139358, + "grad_norm": 0.9090244174003601, + "learning_rate": 7.220493907244236e-06, + "loss": 0.8935, + "step": 12911 + }, + { + "epoch": 0.7106610160162915, + "grad_norm": 0.6570677757263184, + "learning_rate": 7.220105522421388e-06, + "loss": 0.7259, + "step": 12912 + }, + { + "epoch": 0.7107160548186472, + "grad_norm": 0.7142132520675659, + "learning_rate": 7.219717120912819e-06, + "loss": 0.7862, + "step": 12913 + }, + { + "epoch": 0.7107710936210028, + "grad_norm": 0.7359404563903809, + "learning_rate": 7.219328702721452e-06, + "loss": 0.7074, + "step": 12914 + }, + { + "epoch": 0.7108261324233585, + "grad_norm": 0.7118046283721924, + "learning_rate": 7.218940267850203e-06, + "loss": 0.8151, + "step": 12915 + }, + { + "epoch": 0.7108811712257141, + "grad_norm": 0.8301580548286438, + "learning_rate": 7.218551816301994e-06, + "loss": 0.7031, + "step": 12916 + }, + { + "epoch": 0.7109362100280698, + "grad_norm": 0.6647501587867737, + "learning_rate": 7.218163348079743e-06, + "loss": 0.8309, + "step": 12917 + }, + { + "epoch": 0.7109912488304254, + "grad_norm": 0.6546997427940369, + "learning_rate": 7.217774863186371e-06, + "loss": 0.717, + "step": 12918 + }, + { + "epoch": 0.7110462876327811, + "grad_norm": 0.6639735102653503, + "learning_rate": 7.217386361624795e-06, + "loss": 0.7308, + "step": 12919 + }, + { + "epoch": 0.7111013264351368, + "grad_norm": 0.724433183670044, + "learning_rate": 7.216997843397938e-06, + "loss": 0.7576, + "step": 12920 + }, + { + "epoch": 0.7111563652374925, + "grad_norm": 0.750253438949585, + "learning_rate": 7.216609308508719e-06, + "loss": 0.7014, + "step": 12921 + }, + { + "epoch": 0.7112114040398481, + "grad_norm": 0.7010897397994995, + "learning_rate": 7.216220756960058e-06, + "loss": 0.6951, + "step": 12922 + }, + { + "epoch": 0.7112664428422037, + "grad_norm": 0.7739251852035522, + "learning_rate": 7.215832188754873e-06, + "loss": 0.7392, + "step": 12923 + }, + { + "epoch": 0.7113214816445594, + "grad_norm": 0.6893059015274048, + "learning_rate": 7.215443603896088e-06, + "loss": 0.7029, + "step": 12924 + }, + { + "epoch": 0.7113765204469151, + "grad_norm": 0.8061872124671936, + "learning_rate": 7.215055002386622e-06, + "loss": 0.7557, + "step": 12925 + }, + { + "epoch": 0.7114315592492707, + "grad_norm": 1.089525580406189, + "learning_rate": 7.214666384229395e-06, + "loss": 0.6701, + "step": 12926 + }, + { + "epoch": 0.7114865980516264, + "grad_norm": 0.7601733207702637, + "learning_rate": 7.2142777494273275e-06, + "loss": 0.8113, + "step": 12927 + }, + { + "epoch": 0.7115416368539821, + "grad_norm": 0.7863540649414062, + "learning_rate": 7.213889097983342e-06, + "loss": 0.7945, + "step": 12928 + }, + { + "epoch": 0.7115966756563377, + "grad_norm": 0.7722556591033936, + "learning_rate": 7.21350042990036e-06, + "loss": 0.9492, + "step": 12929 + }, + { + "epoch": 0.7116517144586934, + "grad_norm": 0.6834682822227478, + "learning_rate": 7.213111745181299e-06, + "loss": 0.7138, + "step": 12930 + }, + { + "epoch": 0.711706753261049, + "grad_norm": 0.6974432468414307, + "learning_rate": 7.212723043829083e-06, + "loss": 0.7654, + "step": 12931 + }, + { + "epoch": 0.7117617920634047, + "grad_norm": 0.9797543883323669, + "learning_rate": 7.2123343258466334e-06, + "loss": 0.7786, + "step": 12932 + }, + { + "epoch": 0.7118168308657603, + "grad_norm": 0.6337804794311523, + "learning_rate": 7.211945591236872e-06, + "loss": 0.7147, + "step": 12933 + }, + { + "epoch": 0.711871869668116, + "grad_norm": 0.7450474500656128, + "learning_rate": 7.211556840002718e-06, + "loss": 0.8516, + "step": 12934 + }, + { + "epoch": 0.7119269084704717, + "grad_norm": 0.7786532640457153, + "learning_rate": 7.2111680721470965e-06, + "loss": 0.837, + "step": 12935 + }, + { + "epoch": 0.7119819472728274, + "grad_norm": 0.666020393371582, + "learning_rate": 7.210779287672927e-06, + "loss": 0.7646, + "step": 12936 + }, + { + "epoch": 0.712036986075183, + "grad_norm": 0.622648298740387, + "learning_rate": 7.210390486583132e-06, + "loss": 0.7102, + "step": 12937 + }, + { + "epoch": 0.7120920248775386, + "grad_norm": 0.7175952792167664, + "learning_rate": 7.210001668880634e-06, + "loss": 0.7043, + "step": 12938 + }, + { + "epoch": 0.7121470636798943, + "grad_norm": 0.8019681572914124, + "learning_rate": 7.209612834568353e-06, + "loss": 0.8166, + "step": 12939 + }, + { + "epoch": 0.71220210248225, + "grad_norm": 0.804457426071167, + "learning_rate": 7.209223983649216e-06, + "loss": 0.7182, + "step": 12940 + }, + { + "epoch": 0.7122571412846056, + "grad_norm": 0.7261730432510376, + "learning_rate": 7.208835116126143e-06, + "loss": 0.6634, + "step": 12941 + }, + { + "epoch": 0.7123121800869613, + "grad_norm": 0.7461307644844055, + "learning_rate": 7.208446232002055e-06, + "loss": 0.709, + "step": 12942 + }, + { + "epoch": 0.712367218889317, + "grad_norm": 0.6730383634567261, + "learning_rate": 7.208057331279877e-06, + "loss": 0.7111, + "step": 12943 + }, + { + "epoch": 0.7124222576916727, + "grad_norm": 0.829530656337738, + "learning_rate": 7.207668413962531e-06, + "loss": 0.729, + "step": 12944 + }, + { + "epoch": 0.7124772964940282, + "grad_norm": 0.5997991561889648, + "learning_rate": 7.20727948005294e-06, + "loss": 0.6385, + "step": 12945 + }, + { + "epoch": 0.7125323352963839, + "grad_norm": 0.9590086936950684, + "learning_rate": 7.206890529554027e-06, + "loss": 0.7217, + "step": 12946 + }, + { + "epoch": 0.7125873740987396, + "grad_norm": 0.7818330526351929, + "learning_rate": 7.206501562468717e-06, + "loss": 0.7276, + "step": 12947 + }, + { + "epoch": 0.7126424129010953, + "grad_norm": 0.6033679842948914, + "learning_rate": 7.206112578799931e-06, + "loss": 0.5935, + "step": 12948 + }, + { + "epoch": 0.7126974517034509, + "grad_norm": 0.7431650757789612, + "learning_rate": 7.205723578550593e-06, + "loss": 0.8649, + "step": 12949 + }, + { + "epoch": 0.7127524905058066, + "grad_norm": 0.7026848793029785, + "learning_rate": 7.205334561723627e-06, + "loss": 0.7484, + "step": 12950 + }, + { + "epoch": 0.7128075293081623, + "grad_norm": 0.6328058242797852, + "learning_rate": 7.204945528321956e-06, + "loss": 0.6994, + "step": 12951 + }, + { + "epoch": 0.712862568110518, + "grad_norm": 0.6806536912918091, + "learning_rate": 7.204556478348507e-06, + "loss": 0.7461, + "step": 12952 + }, + { + "epoch": 0.7129176069128735, + "grad_norm": 0.6822162866592407, + "learning_rate": 7.2041674118062e-06, + "loss": 0.7947, + "step": 12953 + }, + { + "epoch": 0.7129726457152292, + "grad_norm": 0.7283263802528381, + "learning_rate": 7.203778328697962e-06, + "loss": 0.7559, + "step": 12954 + }, + { + "epoch": 0.7130276845175849, + "grad_norm": 0.663564920425415, + "learning_rate": 7.203389229026714e-06, + "loss": 0.6898, + "step": 12955 + }, + { + "epoch": 0.7130827233199406, + "grad_norm": 0.7218708395957947, + "learning_rate": 7.203000112795383e-06, + "loss": 0.8095, + "step": 12956 + }, + { + "epoch": 0.7131377621222962, + "grad_norm": 0.6931518912315369, + "learning_rate": 7.202610980006893e-06, + "loss": 0.7591, + "step": 12957 + }, + { + "epoch": 0.7131928009246519, + "grad_norm": 0.6982918381690979, + "learning_rate": 7.2022218306641704e-06, + "loss": 0.7651, + "step": 12958 + }, + { + "epoch": 0.7132478397270076, + "grad_norm": 0.8033974170684814, + "learning_rate": 7.201832664770135e-06, + "loss": 0.8857, + "step": 12959 + }, + { + "epoch": 0.7133028785293632, + "grad_norm": 0.6625493764877319, + "learning_rate": 7.201443482327717e-06, + "loss": 0.752, + "step": 12960 + }, + { + "epoch": 0.7133579173317188, + "grad_norm": 0.8149683475494385, + "learning_rate": 7.201054283339838e-06, + "loss": 0.8528, + "step": 12961 + }, + { + "epoch": 0.7134129561340745, + "grad_norm": 0.7894958257675171, + "learning_rate": 7.200665067809425e-06, + "loss": 0.8554, + "step": 12962 + }, + { + "epoch": 0.7134679949364302, + "grad_norm": 0.7613523602485657, + "learning_rate": 7.200275835739401e-06, + "loss": 0.7435, + "step": 12963 + }, + { + "epoch": 0.7135230337387859, + "grad_norm": 0.665985643863678, + "learning_rate": 7.199886587132693e-06, + "loss": 0.7072, + "step": 12964 + }, + { + "epoch": 0.7135780725411415, + "grad_norm": 0.7523592710494995, + "learning_rate": 7.199497321992227e-06, + "loss": 0.7945, + "step": 12965 + }, + { + "epoch": 0.7136331113434972, + "grad_norm": 0.8894450664520264, + "learning_rate": 7.199108040320928e-06, + "loss": 0.7885, + "step": 12966 + }, + { + "epoch": 0.7136881501458529, + "grad_norm": 0.639108419418335, + "learning_rate": 7.198718742121722e-06, + "loss": 0.6975, + "step": 12967 + }, + { + "epoch": 0.7137431889482085, + "grad_norm": 0.670013964176178, + "learning_rate": 7.198329427397532e-06, + "loss": 0.7441, + "step": 12968 + }, + { + "epoch": 0.7137982277505641, + "grad_norm": 0.7695425748825073, + "learning_rate": 7.197940096151289e-06, + "loss": 0.7616, + "step": 12969 + }, + { + "epoch": 0.7138532665529198, + "grad_norm": 0.9098057150840759, + "learning_rate": 7.197550748385917e-06, + "loss": 0.9028, + "step": 12970 + }, + { + "epoch": 0.7139083053552755, + "grad_norm": 0.7677769660949707, + "learning_rate": 7.197161384104341e-06, + "loss": 0.7926, + "step": 12971 + }, + { + "epoch": 0.7139633441576311, + "grad_norm": 0.7020674347877502, + "learning_rate": 7.196772003309487e-06, + "loss": 0.7248, + "step": 12972 + }, + { + "epoch": 0.7140183829599868, + "grad_norm": 0.6616366505622864, + "learning_rate": 7.196382606004283e-06, + "loss": 0.7137, + "step": 12973 + }, + { + "epoch": 0.7140734217623425, + "grad_norm": 0.7174738645553589, + "learning_rate": 7.195993192191656e-06, + "loss": 0.8167, + "step": 12974 + }, + { + "epoch": 0.7141284605646981, + "grad_norm": 0.6672176122665405, + "learning_rate": 7.1956037618745325e-06, + "loss": 0.6516, + "step": 12975 + }, + { + "epoch": 0.7141834993670537, + "grad_norm": 0.714790403842926, + "learning_rate": 7.195214315055837e-06, + "loss": 0.865, + "step": 12976 + }, + { + "epoch": 0.7142385381694094, + "grad_norm": 0.6637690663337708, + "learning_rate": 7.1948248517385e-06, + "loss": 0.7328, + "step": 12977 + }, + { + "epoch": 0.7142935769717651, + "grad_norm": 0.8998367786407471, + "learning_rate": 7.194435371925446e-06, + "loss": 0.7097, + "step": 12978 + }, + { + "epoch": 0.7143486157741208, + "grad_norm": 0.7472445964813232, + "learning_rate": 7.194045875619604e-06, + "loss": 0.7556, + "step": 12979 + }, + { + "epoch": 0.7144036545764764, + "grad_norm": 0.7897135019302368, + "learning_rate": 7.1936563628239e-06, + "loss": 0.8728, + "step": 12980 + }, + { + "epoch": 0.714458693378832, + "grad_norm": 0.6520817279815674, + "learning_rate": 7.193266833541261e-06, + "loss": 0.6824, + "step": 12981 + }, + { + "epoch": 0.7145137321811877, + "grad_norm": 0.833849310874939, + "learning_rate": 7.192877287774618e-06, + "loss": 0.8877, + "step": 12982 + }, + { + "epoch": 0.7145687709835434, + "grad_norm": 0.7105151414871216, + "learning_rate": 7.192487725526896e-06, + "loss": 0.7799, + "step": 12983 + }, + { + "epoch": 0.714623809785899, + "grad_norm": 0.7515869140625, + "learning_rate": 7.192098146801021e-06, + "loss": 0.7012, + "step": 12984 + }, + { + "epoch": 0.7146788485882547, + "grad_norm": 0.7447199821472168, + "learning_rate": 7.191708551599923e-06, + "loss": 0.7545, + "step": 12985 + }, + { + "epoch": 0.7147338873906104, + "grad_norm": 0.8502823114395142, + "learning_rate": 7.191318939926532e-06, + "loss": 0.7232, + "step": 12986 + }, + { + "epoch": 0.7147889261929661, + "grad_norm": 0.7193031907081604, + "learning_rate": 7.190929311783774e-06, + "loss": 0.762, + "step": 12987 + }, + { + "epoch": 0.7148439649953217, + "grad_norm": 0.8479939699172974, + "learning_rate": 7.190539667174576e-06, + "loss": 0.7238, + "step": 12988 + }, + { + "epoch": 0.7148990037976773, + "grad_norm": 0.8313719630241394, + "learning_rate": 7.1901500061018704e-06, + "loss": 0.8145, + "step": 12989 + }, + { + "epoch": 0.714954042600033, + "grad_norm": 0.7019978165626526, + "learning_rate": 7.189760328568584e-06, + "loss": 0.6461, + "step": 12990 + }, + { + "epoch": 0.7150090814023887, + "grad_norm": 0.897280216217041, + "learning_rate": 7.1893706345776436e-06, + "loss": 0.818, + "step": 12991 + }, + { + "epoch": 0.7150641202047443, + "grad_norm": 0.7495617866516113, + "learning_rate": 7.1889809241319795e-06, + "loss": 0.7533, + "step": 12992 + }, + { + "epoch": 0.7151191590071, + "grad_norm": 0.733496904373169, + "learning_rate": 7.188591197234522e-06, + "loss": 0.7405, + "step": 12993 + }, + { + "epoch": 0.7151741978094557, + "grad_norm": 0.8873284459114075, + "learning_rate": 7.1882014538882e-06, + "loss": 0.7525, + "step": 12994 + }, + { + "epoch": 0.7152292366118114, + "grad_norm": 0.6693230271339417, + "learning_rate": 7.187811694095939e-06, + "loss": 0.7509, + "step": 12995 + }, + { + "epoch": 0.715284275414167, + "grad_norm": 0.8513357043266296, + "learning_rate": 7.187421917860671e-06, + "loss": 0.8111, + "step": 12996 + }, + { + "epoch": 0.7153393142165226, + "grad_norm": 0.6986566185951233, + "learning_rate": 7.187032125185326e-06, + "loss": 0.8013, + "step": 12997 + }, + { + "epoch": 0.7153943530188783, + "grad_norm": 0.7062557339668274, + "learning_rate": 7.1866423160728335e-06, + "loss": 0.7266, + "step": 12998 + }, + { + "epoch": 0.715449391821234, + "grad_norm": 0.6329573392868042, + "learning_rate": 7.186252490526122e-06, + "loss": 0.6753, + "step": 12999 + }, + { + "epoch": 0.7155044306235896, + "grad_norm": 0.6740719079971313, + "learning_rate": 7.185862648548122e-06, + "loss": 0.7197, + "step": 13000 + }, + { + "epoch": 0.7155594694259453, + "grad_norm": 0.7911732196807861, + "learning_rate": 7.185472790141764e-06, + "loss": 0.6939, + "step": 13001 + }, + { + "epoch": 0.715614508228301, + "grad_norm": 0.7368680238723755, + "learning_rate": 7.185082915309978e-06, + "loss": 0.6919, + "step": 13002 + }, + { + "epoch": 0.7156695470306567, + "grad_norm": 0.6374472975730896, + "learning_rate": 7.1846930240556925e-06, + "loss": 0.6645, + "step": 13003 + }, + { + "epoch": 0.7157245858330122, + "grad_norm": 0.6727073192596436, + "learning_rate": 7.184303116381839e-06, + "loss": 0.5995, + "step": 13004 + }, + { + "epoch": 0.7157796246353679, + "grad_norm": 0.6122208833694458, + "learning_rate": 7.183913192291348e-06, + "loss": 0.6755, + "step": 13005 + }, + { + "epoch": 0.7158346634377236, + "grad_norm": 0.7095892429351807, + "learning_rate": 7.1835232517871525e-06, + "loss": 0.8009, + "step": 13006 + }, + { + "epoch": 0.7158897022400793, + "grad_norm": 0.6828192472457886, + "learning_rate": 7.1831332948721786e-06, + "loss": 0.7755, + "step": 13007 + }, + { + "epoch": 0.7159447410424349, + "grad_norm": 0.7997334003448486, + "learning_rate": 7.182743321549359e-06, + "loss": 0.7259, + "step": 13008 + }, + { + "epoch": 0.7159997798447906, + "grad_norm": 0.7431252002716064, + "learning_rate": 7.182353331821626e-06, + "loss": 0.7765, + "step": 13009 + }, + { + "epoch": 0.7160548186471463, + "grad_norm": 0.7202625870704651, + "learning_rate": 7.181963325691907e-06, + "loss": 0.7638, + "step": 13010 + }, + { + "epoch": 0.716109857449502, + "grad_norm": 0.7617568373680115, + "learning_rate": 7.181573303163139e-06, + "loss": 0.825, + "step": 13011 + }, + { + "epoch": 0.7161648962518575, + "grad_norm": 0.7382665276527405, + "learning_rate": 7.181183264238247e-06, + "loss": 0.8005, + "step": 13012 + }, + { + "epoch": 0.7162199350542132, + "grad_norm": 0.7782611846923828, + "learning_rate": 7.180793208920167e-06, + "loss": 0.7044, + "step": 13013 + }, + { + "epoch": 0.7162749738565689, + "grad_norm": 0.7020898461341858, + "learning_rate": 7.18040313721183e-06, + "loss": 0.8059, + "step": 13014 + }, + { + "epoch": 0.7163300126589245, + "grad_norm": 1.2005099058151245, + "learning_rate": 7.1800130491161656e-06, + "loss": 0.6663, + "step": 13015 + }, + { + "epoch": 0.7163850514612802, + "grad_norm": 0.6663569211959839, + "learning_rate": 7.1796229446361066e-06, + "loss": 0.7046, + "step": 13016 + }, + { + "epoch": 0.7164400902636359, + "grad_norm": 0.7010110020637512, + "learning_rate": 7.1792328237745845e-06, + "loss": 0.6433, + "step": 13017 + }, + { + "epoch": 0.7164951290659916, + "grad_norm": 0.6447514891624451, + "learning_rate": 7.178842686534534e-06, + "loss": 0.7794, + "step": 13018 + }, + { + "epoch": 0.7165501678683471, + "grad_norm": 0.6813021302223206, + "learning_rate": 7.1784525329188835e-06, + "loss": 0.7413, + "step": 13019 + }, + { + "epoch": 0.7166052066707028, + "grad_norm": 0.6894733905792236, + "learning_rate": 7.178062362930567e-06, + "loss": 0.7896, + "step": 13020 + }, + { + "epoch": 0.7166602454730585, + "grad_norm": 0.6717034578323364, + "learning_rate": 7.177672176572517e-06, + "loss": 0.7599, + "step": 13021 + }, + { + "epoch": 0.7167152842754142, + "grad_norm": 0.7861666083335876, + "learning_rate": 7.177281973847665e-06, + "loss": 0.9068, + "step": 13022 + }, + { + "epoch": 0.7167703230777698, + "grad_norm": 0.6784214973449707, + "learning_rate": 7.176891754758946e-06, + "loss": 0.8319, + "step": 13023 + }, + { + "epoch": 0.7168253618801255, + "grad_norm": 0.7053580284118652, + "learning_rate": 7.176501519309289e-06, + "loss": 0.8085, + "step": 13024 + }, + { + "epoch": 0.7168804006824812, + "grad_norm": 0.9643208980560303, + "learning_rate": 7.176111267501631e-06, + "loss": 0.7799, + "step": 13025 + }, + { + "epoch": 0.7169354394848368, + "grad_norm": 0.8921111822128296, + "learning_rate": 7.175720999338902e-06, + "loss": 0.6465, + "step": 13026 + }, + { + "epoch": 0.7169904782871924, + "grad_norm": 0.7356166839599609, + "learning_rate": 7.1753307148240385e-06, + "loss": 0.7862, + "step": 13027 + }, + { + "epoch": 0.7170455170895481, + "grad_norm": 0.6906836628913879, + "learning_rate": 7.174940413959968e-06, + "loss": 0.7341, + "step": 13028 + }, + { + "epoch": 0.7171005558919038, + "grad_norm": 0.6229632496833801, + "learning_rate": 7.174550096749632e-06, + "loss": 0.721, + "step": 13029 + }, + { + "epoch": 0.7171555946942595, + "grad_norm": 0.6832499504089355, + "learning_rate": 7.174159763195958e-06, + "loss": 0.6733, + "step": 13030 + }, + { + "epoch": 0.7172106334966151, + "grad_norm": 0.8304060697555542, + "learning_rate": 7.1737694133018806e-06, + "loss": 0.7732, + "step": 13031 + }, + { + "epoch": 0.7172656722989708, + "grad_norm": 0.6813186407089233, + "learning_rate": 7.173379047070333e-06, + "loss": 0.7742, + "step": 13032 + }, + { + "epoch": 0.7173207111013264, + "grad_norm": 0.6671963930130005, + "learning_rate": 7.172988664504252e-06, + "loss": 0.6516, + "step": 13033 + }, + { + "epoch": 0.7173757499036821, + "grad_norm": 0.661108136177063, + "learning_rate": 7.172598265606569e-06, + "loss": 0.7361, + "step": 13034 + }, + { + "epoch": 0.7174307887060377, + "grad_norm": 0.7097620368003845, + "learning_rate": 7.1722078503802196e-06, + "loss": 0.8142, + "step": 13035 + }, + { + "epoch": 0.7174858275083934, + "grad_norm": 0.7663383483886719, + "learning_rate": 7.1718174188281365e-06, + "loss": 0.8149, + "step": 13036 + }, + { + "epoch": 0.7175408663107491, + "grad_norm": 0.7142401337623596, + "learning_rate": 7.171426970953256e-06, + "loss": 0.7539, + "step": 13037 + }, + { + "epoch": 0.7175959051131048, + "grad_norm": 0.667346715927124, + "learning_rate": 7.171036506758512e-06, + "loss": 0.7517, + "step": 13038 + }, + { + "epoch": 0.7176509439154604, + "grad_norm": 0.5933231711387634, + "learning_rate": 7.170646026246838e-06, + "loss": 0.6852, + "step": 13039 + }, + { + "epoch": 0.717705982717816, + "grad_norm": 0.730015218257904, + "learning_rate": 7.170255529421168e-06, + "loss": 0.7316, + "step": 13040 + }, + { + "epoch": 0.7177610215201717, + "grad_norm": 0.6146146059036255, + "learning_rate": 7.169865016284442e-06, + "loss": 0.6715, + "step": 13041 + }, + { + "epoch": 0.7178160603225274, + "grad_norm": 0.694131076335907, + "learning_rate": 7.16947448683959e-06, + "loss": 0.7944, + "step": 13042 + }, + { + "epoch": 0.717871099124883, + "grad_norm": 0.6736807823181152, + "learning_rate": 7.169083941089547e-06, + "loss": 0.7922, + "step": 13043 + }, + { + "epoch": 0.7179261379272387, + "grad_norm": 0.6748425364494324, + "learning_rate": 7.16869337903725e-06, + "loss": 0.6738, + "step": 13044 + }, + { + "epoch": 0.7179811767295944, + "grad_norm": 0.6807510852813721, + "learning_rate": 7.168302800685635e-06, + "loss": 0.7291, + "step": 13045 + }, + { + "epoch": 0.7180362155319501, + "grad_norm": 0.6613160371780396, + "learning_rate": 7.167912206037637e-06, + "loss": 0.6839, + "step": 13046 + }, + { + "epoch": 0.7180912543343057, + "grad_norm": 0.7184692621231079, + "learning_rate": 7.16752159509619e-06, + "loss": 0.6748, + "step": 13047 + }, + { + "epoch": 0.7181462931366613, + "grad_norm": 0.6938989758491516, + "learning_rate": 7.167130967864231e-06, + "loss": 0.7926, + "step": 13048 + }, + { + "epoch": 0.718201331939017, + "grad_norm": 0.6871020793914795, + "learning_rate": 7.166740324344696e-06, + "loss": 0.8229, + "step": 13049 + }, + { + "epoch": 0.7182563707413727, + "grad_norm": 0.8003624081611633, + "learning_rate": 7.166349664540521e-06, + "loss": 0.8488, + "step": 13050 + }, + { + "epoch": 0.7183114095437283, + "grad_norm": 0.7309357523918152, + "learning_rate": 7.165958988454642e-06, + "loss": 0.7442, + "step": 13051 + }, + { + "epoch": 0.718366448346084, + "grad_norm": 0.7462141513824463, + "learning_rate": 7.165568296089993e-06, + "loss": 0.8014, + "step": 13052 + }, + { + "epoch": 0.7184214871484397, + "grad_norm": 0.8335661292076111, + "learning_rate": 7.165177587449516e-06, + "loss": 0.6773, + "step": 13053 + }, + { + "epoch": 0.7184765259507954, + "grad_norm": 0.6996884346008301, + "learning_rate": 7.164786862536142e-06, + "loss": 0.7491, + "step": 13054 + }, + { + "epoch": 0.7185315647531509, + "grad_norm": 0.7203043103218079, + "learning_rate": 7.164396121352809e-06, + "loss": 0.7196, + "step": 13055 + }, + { + "epoch": 0.7185866035555066, + "grad_norm": 0.7109461426734924, + "learning_rate": 7.164005363902453e-06, + "loss": 0.7336, + "step": 13056 + }, + { + "epoch": 0.7186416423578623, + "grad_norm": 0.7057282328605652, + "learning_rate": 7.1636145901880135e-06, + "loss": 0.734, + "step": 13057 + }, + { + "epoch": 0.7186966811602179, + "grad_norm": 0.7288782000541687, + "learning_rate": 7.163223800212427e-06, + "loss": 0.8141, + "step": 13058 + }, + { + "epoch": 0.7187517199625736, + "grad_norm": 0.6812320947647095, + "learning_rate": 7.162832993978628e-06, + "loss": 0.7525, + "step": 13059 + }, + { + "epoch": 0.7188067587649293, + "grad_norm": 0.6782627105712891, + "learning_rate": 7.1624421714895546e-06, + "loss": 0.7647, + "step": 13060 + }, + { + "epoch": 0.718861797567285, + "grad_norm": 0.7361965775489807, + "learning_rate": 7.162051332748146e-06, + "loss": 0.7774, + "step": 13061 + }, + { + "epoch": 0.7189168363696405, + "grad_norm": 0.68894362449646, + "learning_rate": 7.161660477757337e-06, + "loss": 0.767, + "step": 13062 + }, + { + "epoch": 0.7189718751719962, + "grad_norm": 0.6440854668617249, + "learning_rate": 7.161269606520067e-06, + "loss": 0.7062, + "step": 13063 + }, + { + "epoch": 0.7190269139743519, + "grad_norm": 0.8411546945571899, + "learning_rate": 7.160878719039273e-06, + "loss": 0.728, + "step": 13064 + }, + { + "epoch": 0.7190819527767076, + "grad_norm": 0.6895145177841187, + "learning_rate": 7.160487815317895e-06, + "loss": 0.6667, + "step": 13065 + }, + { + "epoch": 0.7191369915790632, + "grad_norm": 0.6943626403808594, + "learning_rate": 7.160096895358866e-06, + "loss": 0.7579, + "step": 13066 + }, + { + "epoch": 0.7191920303814189, + "grad_norm": 0.7940205335617065, + "learning_rate": 7.1597059591651294e-06, + "loss": 0.7286, + "step": 13067 + }, + { + "epoch": 0.7192470691837746, + "grad_norm": 0.7350896000862122, + "learning_rate": 7.159315006739619e-06, + "loss": 0.7174, + "step": 13068 + }, + { + "epoch": 0.7193021079861303, + "grad_norm": 0.7663372159004211, + "learning_rate": 7.158924038085275e-06, + "loss": 0.7871, + "step": 13069 + }, + { + "epoch": 0.7193571467884858, + "grad_norm": 0.7368965744972229, + "learning_rate": 7.1585330532050375e-06, + "loss": 0.7356, + "step": 13070 + }, + { + "epoch": 0.7194121855908415, + "grad_norm": 0.7345212697982788, + "learning_rate": 7.158142052101843e-06, + "loss": 0.7784, + "step": 13071 + }, + { + "epoch": 0.7194672243931972, + "grad_norm": 0.7847188711166382, + "learning_rate": 7.157751034778629e-06, + "loss": 0.7899, + "step": 13072 + }, + { + "epoch": 0.7195222631955529, + "grad_norm": 0.757514476776123, + "learning_rate": 7.157360001238337e-06, + "loss": 0.8899, + "step": 13073 + }, + { + "epoch": 0.7195773019979085, + "grad_norm": 0.73405522108078, + "learning_rate": 7.156968951483905e-06, + "loss": 0.7283, + "step": 13074 + }, + { + "epoch": 0.7196323408002642, + "grad_norm": 0.7950206398963928, + "learning_rate": 7.156577885518271e-06, + "loss": 0.7338, + "step": 13075 + }, + { + "epoch": 0.7196873796026199, + "grad_norm": 0.8082411289215088, + "learning_rate": 7.156186803344374e-06, + "loss": 0.711, + "step": 13076 + }, + { + "epoch": 0.7197424184049755, + "grad_norm": 0.6868693828582764, + "learning_rate": 7.1557957049651574e-06, + "loss": 0.7583, + "step": 13077 + }, + { + "epoch": 0.7197974572073311, + "grad_norm": 0.7226251363754272, + "learning_rate": 7.155404590383554e-06, + "loss": 0.746, + "step": 13078 + }, + { + "epoch": 0.7198524960096868, + "grad_norm": 0.7437220811843872, + "learning_rate": 7.155013459602509e-06, + "loss": 0.6884, + "step": 13079 + }, + { + "epoch": 0.7199075348120425, + "grad_norm": 0.7486164569854736, + "learning_rate": 7.154622312624958e-06, + "loss": 0.6968, + "step": 13080 + }, + { + "epoch": 0.7199625736143982, + "grad_norm": 0.7709106802940369, + "learning_rate": 7.154231149453843e-06, + "loss": 0.838, + "step": 13081 + }, + { + "epoch": 0.7200176124167538, + "grad_norm": 0.6962981224060059, + "learning_rate": 7.153839970092104e-06, + "loss": 0.7186, + "step": 13082 + }, + { + "epoch": 0.7200726512191095, + "grad_norm": 0.8195380568504333, + "learning_rate": 7.15344877454268e-06, + "loss": 0.7949, + "step": 13083 + }, + { + "epoch": 0.7201276900214651, + "grad_norm": 0.735285758972168, + "learning_rate": 7.15305756280851e-06, + "loss": 0.7477, + "step": 13084 + }, + { + "epoch": 0.7201827288238208, + "grad_norm": 0.6121101379394531, + "learning_rate": 7.1526663348925375e-06, + "loss": 0.6686, + "step": 13085 + }, + { + "epoch": 0.7202377676261764, + "grad_norm": 0.7204885482788086, + "learning_rate": 7.1522750907977e-06, + "loss": 0.8013, + "step": 13086 + }, + { + "epoch": 0.7202928064285321, + "grad_norm": 0.6808584332466125, + "learning_rate": 7.15188383052694e-06, + "loss": 0.7847, + "step": 13087 + }, + { + "epoch": 0.7203478452308878, + "grad_norm": 0.7049086093902588, + "learning_rate": 7.151492554083195e-06, + "loss": 0.7563, + "step": 13088 + }, + { + "epoch": 0.7204028840332435, + "grad_norm": 0.765708327293396, + "learning_rate": 7.151101261469411e-06, + "loss": 0.7648, + "step": 13089 + }, + { + "epoch": 0.7204579228355991, + "grad_norm": 0.6810007095336914, + "learning_rate": 7.150709952688525e-06, + "loss": 0.731, + "step": 13090 + }, + { + "epoch": 0.7205129616379548, + "grad_norm": 0.7242745757102966, + "learning_rate": 7.150318627743478e-06, + "loss": 0.8027, + "step": 13091 + }, + { + "epoch": 0.7205680004403104, + "grad_norm": 0.7452220916748047, + "learning_rate": 7.14992728663721e-06, + "loss": 0.7848, + "step": 13092 + }, + { + "epoch": 0.7206230392426661, + "grad_norm": 0.6333943605422974, + "learning_rate": 7.149535929372667e-06, + "loss": 0.7105, + "step": 13093 + }, + { + "epoch": 0.7206780780450217, + "grad_norm": 0.7565333247184753, + "learning_rate": 7.149144555952785e-06, + "loss": 0.8006, + "step": 13094 + }, + { + "epoch": 0.7207331168473774, + "grad_norm": 0.7703632712364197, + "learning_rate": 7.14875316638051e-06, + "loss": 0.7323, + "step": 13095 + }, + { + "epoch": 0.7207881556497331, + "grad_norm": 0.6275011301040649, + "learning_rate": 7.148361760658779e-06, + "loss": 0.6817, + "step": 13096 + }, + { + "epoch": 0.7208431944520888, + "grad_norm": 0.7363598942756653, + "learning_rate": 7.147970338790537e-06, + "loss": 0.7641, + "step": 13097 + }, + { + "epoch": 0.7208982332544444, + "grad_norm": 0.6284294724464417, + "learning_rate": 7.147578900778727e-06, + "loss": 0.7117, + "step": 13098 + }, + { + "epoch": 0.7209532720568, + "grad_norm": 0.7878503203392029, + "learning_rate": 7.147187446626287e-06, + "loss": 0.8184, + "step": 13099 + }, + { + "epoch": 0.7210083108591557, + "grad_norm": 0.6973691582679749, + "learning_rate": 7.146795976336159e-06, + "loss": 0.7815, + "step": 13100 + }, + { + "epoch": 0.7210633496615113, + "grad_norm": 0.7018479704856873, + "learning_rate": 7.146404489911291e-06, + "loss": 0.7305, + "step": 13101 + }, + { + "epoch": 0.721118388463867, + "grad_norm": 0.6903830766677856, + "learning_rate": 7.14601298735462e-06, + "loss": 0.7074, + "step": 13102 + }, + { + "epoch": 0.7211734272662227, + "grad_norm": 0.7612621188163757, + "learning_rate": 7.145621468669089e-06, + "loss": 0.8189, + "step": 13103 + }, + { + "epoch": 0.7212284660685784, + "grad_norm": 0.7256856560707092, + "learning_rate": 7.145229933857643e-06, + "loss": 0.5959, + "step": 13104 + }, + { + "epoch": 0.721283504870934, + "grad_norm": 0.6632323265075684, + "learning_rate": 7.1448383829232205e-06, + "loss": 0.7519, + "step": 13105 + }, + { + "epoch": 0.7213385436732896, + "grad_norm": 0.6320651769638062, + "learning_rate": 7.144446815868768e-06, + "loss": 0.7259, + "step": 13106 + }, + { + "epoch": 0.7213935824756453, + "grad_norm": 0.6883212924003601, + "learning_rate": 7.144055232697227e-06, + "loss": 0.7776, + "step": 13107 + }, + { + "epoch": 0.721448621278001, + "grad_norm": 0.7159759402275085, + "learning_rate": 7.1436636334115415e-06, + "loss": 0.6915, + "step": 13108 + }, + { + "epoch": 0.7215036600803566, + "grad_norm": 0.7108080983161926, + "learning_rate": 7.1432720180146535e-06, + "loss": 0.731, + "step": 13109 + }, + { + "epoch": 0.7215586988827123, + "grad_norm": 0.7765033841133118, + "learning_rate": 7.142880386509506e-06, + "loss": 0.6965, + "step": 13110 + }, + { + "epoch": 0.721613737685068, + "grad_norm": 0.7205119132995605, + "learning_rate": 7.142488738899045e-06, + "loss": 0.7262, + "step": 13111 + }, + { + "epoch": 0.7216687764874237, + "grad_norm": 0.6786921620368958, + "learning_rate": 7.142097075186212e-06, + "loss": 0.805, + "step": 13112 + }, + { + "epoch": 0.7217238152897792, + "grad_norm": 0.7947409152984619, + "learning_rate": 7.141705395373949e-06, + "loss": 0.7701, + "step": 13113 + }, + { + "epoch": 0.7217788540921349, + "grad_norm": 0.6672971844673157, + "learning_rate": 7.141313699465204e-06, + "loss": 0.7325, + "step": 13114 + }, + { + "epoch": 0.7218338928944906, + "grad_norm": 0.641765296459198, + "learning_rate": 7.140921987462916e-06, + "loss": 0.7902, + "step": 13115 + }, + { + "epoch": 0.7218889316968463, + "grad_norm": 0.6675699353218079, + "learning_rate": 7.140530259370032e-06, + "loss": 0.7422, + "step": 13116 + }, + { + "epoch": 0.7219439704992019, + "grad_norm": 0.6940729022026062, + "learning_rate": 7.140138515189495e-06, + "loss": 0.6978, + "step": 13117 + }, + { + "epoch": 0.7219990093015576, + "grad_norm": 0.6805779337882996, + "learning_rate": 7.1397467549242514e-06, + "loss": 0.7498, + "step": 13118 + }, + { + "epoch": 0.7220540481039133, + "grad_norm": 0.6231662631034851, + "learning_rate": 7.139354978577243e-06, + "loss": 0.7344, + "step": 13119 + }, + { + "epoch": 0.722109086906269, + "grad_norm": 0.6883575916290283, + "learning_rate": 7.138963186151416e-06, + "loss": 0.835, + "step": 13120 + }, + { + "epoch": 0.7221641257086245, + "grad_norm": 0.6902666687965393, + "learning_rate": 7.138571377649712e-06, + "loss": 0.7427, + "step": 13121 + }, + { + "epoch": 0.7222191645109802, + "grad_norm": 0.7156440019607544, + "learning_rate": 7.1381795530750805e-06, + "loss": 0.7661, + "step": 13122 + }, + { + "epoch": 0.7222742033133359, + "grad_norm": 0.6727150678634644, + "learning_rate": 7.137787712430464e-06, + "loss": 0.7872, + "step": 13123 + }, + { + "epoch": 0.7223292421156916, + "grad_norm": 0.6200405359268188, + "learning_rate": 7.137395855718806e-06, + "loss": 0.6108, + "step": 13124 + }, + { + "epoch": 0.7223842809180472, + "grad_norm": 0.6384756565093994, + "learning_rate": 7.137003982943054e-06, + "loss": 0.698, + "step": 13125 + }, + { + "epoch": 0.7224393197204029, + "grad_norm": 0.7212089896202087, + "learning_rate": 7.1366120941061515e-06, + "loss": 0.7679, + "step": 13126 + }, + { + "epoch": 0.7224943585227586, + "grad_norm": 0.737352192401886, + "learning_rate": 7.136220189211044e-06, + "loss": 0.8173, + "step": 13127 + }, + { + "epoch": 0.7225493973251143, + "grad_norm": 0.6244099736213684, + "learning_rate": 7.135828268260679e-06, + "loss": 0.7224, + "step": 13128 + }, + { + "epoch": 0.7226044361274698, + "grad_norm": 0.8191885948181152, + "learning_rate": 7.135436331257997e-06, + "loss": 0.8122, + "step": 13129 + }, + { + "epoch": 0.7226594749298255, + "grad_norm": 0.7069095373153687, + "learning_rate": 7.135044378205949e-06, + "loss": 0.7844, + "step": 13130 + }, + { + "epoch": 0.7227145137321812, + "grad_norm": 0.6094380021095276, + "learning_rate": 7.13465240910748e-06, + "loss": 0.7093, + "step": 13131 + }, + { + "epoch": 0.7227695525345369, + "grad_norm": 0.7075843811035156, + "learning_rate": 7.134260423965534e-06, + "loss": 0.8109, + "step": 13132 + }, + { + "epoch": 0.7228245913368925, + "grad_norm": 0.6684398651123047, + "learning_rate": 7.133868422783057e-06, + "loss": 0.7224, + "step": 13133 + }, + { + "epoch": 0.7228796301392482, + "grad_norm": 0.6574007272720337, + "learning_rate": 7.133476405562998e-06, + "loss": 0.6763, + "step": 13134 + }, + { + "epoch": 0.7229346689416039, + "grad_norm": 0.7124022841453552, + "learning_rate": 7.133084372308301e-06, + "loss": 0.8047, + "step": 13135 + }, + { + "epoch": 0.7229897077439595, + "grad_norm": 0.7035976648330688, + "learning_rate": 7.1326923230219124e-06, + "loss": 0.7544, + "step": 13136 + }, + { + "epoch": 0.7230447465463151, + "grad_norm": 0.7007604241371155, + "learning_rate": 7.132300257706779e-06, + "loss": 0.7584, + "step": 13137 + }, + { + "epoch": 0.7230997853486708, + "grad_norm": 0.6917324066162109, + "learning_rate": 7.131908176365848e-06, + "loss": 0.6846, + "step": 13138 + }, + { + "epoch": 0.7231548241510265, + "grad_norm": 0.6857448816299438, + "learning_rate": 7.1315160790020666e-06, + "loss": 0.8142, + "step": 13139 + }, + { + "epoch": 0.7232098629533822, + "grad_norm": 0.8381820321083069, + "learning_rate": 7.13112396561838e-06, + "loss": 0.8132, + "step": 13140 + }, + { + "epoch": 0.7232649017557378, + "grad_norm": 0.7024879455566406, + "learning_rate": 7.130731836217735e-06, + "loss": 0.7157, + "step": 13141 + }, + { + "epoch": 0.7233199405580935, + "grad_norm": 0.7313332557678223, + "learning_rate": 7.130339690803081e-06, + "loss": 0.7623, + "step": 13142 + }, + { + "epoch": 0.7233749793604491, + "grad_norm": 0.697536051273346, + "learning_rate": 7.129947529377364e-06, + "loss": 0.7202, + "step": 13143 + }, + { + "epoch": 0.7234300181628047, + "grad_norm": 0.6946722865104675, + "learning_rate": 7.129555351943533e-06, + "loss": 0.7862, + "step": 13144 + }, + { + "epoch": 0.7234850569651604, + "grad_norm": 0.6643924117088318, + "learning_rate": 7.129163158504532e-06, + "loss": 0.7055, + "step": 13145 + }, + { + "epoch": 0.7235400957675161, + "grad_norm": 0.7285693287849426, + "learning_rate": 7.1287709490633104e-06, + "loss": 0.6815, + "step": 13146 + }, + { + "epoch": 0.7235951345698718, + "grad_norm": 1.2701799869537354, + "learning_rate": 7.128378723622818e-06, + "loss": 0.8596, + "step": 13147 + }, + { + "epoch": 0.7236501733722274, + "grad_norm": 0.7067306041717529, + "learning_rate": 7.127986482186e-06, + "loss": 0.7077, + "step": 13148 + }, + { + "epoch": 0.7237052121745831, + "grad_norm": 0.8863486051559448, + "learning_rate": 7.127594224755805e-06, + "loss": 0.8961, + "step": 13149 + }, + { + "epoch": 0.7237602509769387, + "grad_norm": 0.7286190986633301, + "learning_rate": 7.127201951335182e-06, + "loss": 0.7941, + "step": 13150 + }, + { + "epoch": 0.7238152897792944, + "grad_norm": 0.8756779432296753, + "learning_rate": 7.126809661927079e-06, + "loss": 0.7862, + "step": 13151 + }, + { + "epoch": 0.72387032858165, + "grad_norm": 0.7780876755714417, + "learning_rate": 7.126417356534443e-06, + "loss": 0.7095, + "step": 13152 + }, + { + "epoch": 0.7239253673840057, + "grad_norm": 0.6332812905311584, + "learning_rate": 7.1260250351602225e-06, + "loss": 0.7057, + "step": 13153 + }, + { + "epoch": 0.7239804061863614, + "grad_norm": 0.8350435495376587, + "learning_rate": 7.125632697807368e-06, + "loss": 0.7695, + "step": 13154 + }, + { + "epoch": 0.7240354449887171, + "grad_norm": 0.8306411504745483, + "learning_rate": 7.125240344478827e-06, + "loss": 0.6605, + "step": 13155 + }, + { + "epoch": 0.7240904837910727, + "grad_norm": 0.7495117783546448, + "learning_rate": 7.124847975177548e-06, + "loss": 0.8078, + "step": 13156 + }, + { + "epoch": 0.7241455225934283, + "grad_norm": 0.6481010317802429, + "learning_rate": 7.12445558990648e-06, + "loss": 0.8094, + "step": 13157 + }, + { + "epoch": 0.724200561395784, + "grad_norm": 0.7742613554000854, + "learning_rate": 7.124063188668573e-06, + "loss": 0.78, + "step": 13158 + }, + { + "epoch": 0.7242556001981397, + "grad_norm": 0.8394206762313843, + "learning_rate": 7.123670771466776e-06, + "loss": 0.8983, + "step": 13159 + }, + { + "epoch": 0.7243106390004953, + "grad_norm": 0.7196840047836304, + "learning_rate": 7.123278338304038e-06, + "loss": 0.7203, + "step": 13160 + }, + { + "epoch": 0.724365677802851, + "grad_norm": 0.5964440107345581, + "learning_rate": 7.122885889183309e-06, + "loss": 0.6251, + "step": 13161 + }, + { + "epoch": 0.7244207166052067, + "grad_norm": 0.7394048571586609, + "learning_rate": 7.1224934241075375e-06, + "loss": 0.7755, + "step": 13162 + }, + { + "epoch": 0.7244757554075624, + "grad_norm": 0.6427145004272461, + "learning_rate": 7.1221009430796724e-06, + "loss": 0.74, + "step": 13163 + }, + { + "epoch": 0.724530794209918, + "grad_norm": 0.7084387540817261, + "learning_rate": 7.121708446102667e-06, + "loss": 0.7464, + "step": 13164 + }, + { + "epoch": 0.7245858330122736, + "grad_norm": 0.6623230576515198, + "learning_rate": 7.121315933179466e-06, + "loss": 0.7237, + "step": 13165 + }, + { + "epoch": 0.7246408718146293, + "grad_norm": 0.9234243631362915, + "learning_rate": 7.120923404313024e-06, + "loss": 0.8238, + "step": 13166 + }, + { + "epoch": 0.724695910616985, + "grad_norm": 0.6458896994590759, + "learning_rate": 7.120530859506289e-06, + "loss": 0.8105, + "step": 13167 + }, + { + "epoch": 0.7247509494193406, + "grad_norm": 0.7160854935646057, + "learning_rate": 7.1201382987622115e-06, + "loss": 0.7954, + "step": 13168 + }, + { + "epoch": 0.7248059882216963, + "grad_norm": 0.6896069645881653, + "learning_rate": 7.119745722083742e-06, + "loss": 0.7281, + "step": 13169 + }, + { + "epoch": 0.724861027024052, + "grad_norm": 0.6609574556350708, + "learning_rate": 7.119353129473831e-06, + "loss": 0.7682, + "step": 13170 + }, + { + "epoch": 0.7249160658264077, + "grad_norm": 0.6477035880088806, + "learning_rate": 7.118960520935429e-06, + "loss": 0.8183, + "step": 13171 + }, + { + "epoch": 0.7249711046287632, + "grad_norm": 1.4488556385040283, + "learning_rate": 7.1185678964714885e-06, + "loss": 0.8321, + "step": 13172 + }, + { + "epoch": 0.7250261434311189, + "grad_norm": 0.8502382040023804, + "learning_rate": 7.118175256084958e-06, + "loss": 0.7881, + "step": 13173 + }, + { + "epoch": 0.7250811822334746, + "grad_norm": 0.6969912648200989, + "learning_rate": 7.117782599778788e-06, + "loss": 0.7598, + "step": 13174 + }, + { + "epoch": 0.7251362210358303, + "grad_norm": 0.7254889011383057, + "learning_rate": 7.117389927555933e-06, + "loss": 0.8473, + "step": 13175 + }, + { + "epoch": 0.7251912598381859, + "grad_norm": 0.9958444237709045, + "learning_rate": 7.116997239419341e-06, + "loss": 0.7558, + "step": 13176 + }, + { + "epoch": 0.7252462986405416, + "grad_norm": 0.6694881916046143, + "learning_rate": 7.116604535371963e-06, + "loss": 0.7072, + "step": 13177 + }, + { + "epoch": 0.7253013374428973, + "grad_norm": 1.0730634927749634, + "learning_rate": 7.116211815416754e-06, + "loss": 0.7607, + "step": 13178 + }, + { + "epoch": 0.725356376245253, + "grad_norm": 0.6770226359367371, + "learning_rate": 7.115819079556663e-06, + "loss": 0.7213, + "step": 13179 + }, + { + "epoch": 0.7254114150476085, + "grad_norm": 0.866215705871582, + "learning_rate": 7.115426327794642e-06, + "loss": 0.7273, + "step": 13180 + }, + { + "epoch": 0.7254664538499642, + "grad_norm": 0.7303730845451355, + "learning_rate": 7.115033560133642e-06, + "loss": 0.764, + "step": 13181 + }, + { + "epoch": 0.7255214926523199, + "grad_norm": 0.6900389194488525, + "learning_rate": 7.114640776576617e-06, + "loss": 0.6958, + "step": 13182 + }, + { + "epoch": 0.7255765314546756, + "grad_norm": 0.7255710959434509, + "learning_rate": 7.114247977126518e-06, + "loss": 0.6507, + "step": 13183 + }, + { + "epoch": 0.7256315702570312, + "grad_norm": 0.6848479509353638, + "learning_rate": 7.113855161786297e-06, + "loss": 0.6848, + "step": 13184 + }, + { + "epoch": 0.7256866090593869, + "grad_norm": 0.6800528764724731, + "learning_rate": 7.113462330558907e-06, + "loss": 0.7354, + "step": 13185 + }, + { + "epoch": 0.7257416478617426, + "grad_norm": 0.7271339297294617, + "learning_rate": 7.113069483447299e-06, + "loss": 0.7695, + "step": 13186 + }, + { + "epoch": 0.7257966866640981, + "grad_norm": 0.8212381601333618, + "learning_rate": 7.112676620454427e-06, + "loss": 0.7348, + "step": 13187 + }, + { + "epoch": 0.7258517254664538, + "grad_norm": 0.6714771389961243, + "learning_rate": 7.112283741583242e-06, + "loss": 0.75, + "step": 13188 + }, + { + "epoch": 0.7259067642688095, + "grad_norm": 0.7834941148757935, + "learning_rate": 7.111890846836699e-06, + "loss": 0.6914, + "step": 13189 + }, + { + "epoch": 0.7259618030711652, + "grad_norm": 0.8107824325561523, + "learning_rate": 7.111497936217748e-06, + "loss": 0.803, + "step": 13190 + }, + { + "epoch": 0.7260168418735208, + "grad_norm": 0.6306549906730652, + "learning_rate": 7.1111050097293464e-06, + "loss": 0.7915, + "step": 13191 + }, + { + "epoch": 0.7260718806758765, + "grad_norm": 0.7030252814292908, + "learning_rate": 7.110712067374444e-06, + "loss": 0.7091, + "step": 13192 + }, + { + "epoch": 0.7261269194782322, + "grad_norm": 0.7625641226768494, + "learning_rate": 7.110319109155992e-06, + "loss": 0.774, + "step": 13193 + }, + { + "epoch": 0.7261819582805878, + "grad_norm": 0.6382628083229065, + "learning_rate": 7.109926135076949e-06, + "loss": 0.6774, + "step": 13194 + }, + { + "epoch": 0.7262369970829434, + "grad_norm": 0.6594563722610474, + "learning_rate": 7.109533145140265e-06, + "loss": 0.7977, + "step": 13195 + }, + { + "epoch": 0.7262920358852991, + "grad_norm": 0.7177248001098633, + "learning_rate": 7.109140139348895e-06, + "loss": 0.6771, + "step": 13196 + }, + { + "epoch": 0.7263470746876548, + "grad_norm": 0.6631305813789368, + "learning_rate": 7.108747117705792e-06, + "loss": 0.6877, + "step": 13197 + }, + { + "epoch": 0.7264021134900105, + "grad_norm": 0.6783736944198608, + "learning_rate": 7.10835408021391e-06, + "loss": 0.8048, + "step": 13198 + }, + { + "epoch": 0.7264571522923661, + "grad_norm": 0.7368303537368774, + "learning_rate": 7.107961026876204e-06, + "loss": 0.7962, + "step": 13199 + }, + { + "epoch": 0.7265121910947218, + "grad_norm": 0.7697044014930725, + "learning_rate": 7.107567957695627e-06, + "loss": 0.769, + "step": 13200 + }, + { + "epoch": 0.7265672298970774, + "grad_norm": 0.639934241771698, + "learning_rate": 7.1071748726751325e-06, + "loss": 0.722, + "step": 13201 + }, + { + "epoch": 0.7266222686994331, + "grad_norm": 0.8410669565200806, + "learning_rate": 7.106781771817676e-06, + "loss": 0.8861, + "step": 13202 + }, + { + "epoch": 0.7266773075017887, + "grad_norm": 0.654924213886261, + "learning_rate": 7.106388655126212e-06, + "loss": 0.7463, + "step": 13203 + }, + { + "epoch": 0.7267323463041444, + "grad_norm": 0.719714879989624, + "learning_rate": 7.105995522603695e-06, + "loss": 0.759, + "step": 13204 + }, + { + "epoch": 0.7267873851065001, + "grad_norm": 0.7019139528274536, + "learning_rate": 7.105602374253078e-06, + "loss": 0.7965, + "step": 13205 + }, + { + "epoch": 0.7268424239088558, + "grad_norm": 0.7289487719535828, + "learning_rate": 7.105209210077318e-06, + "loss": 0.8591, + "step": 13206 + }, + { + "epoch": 0.7268974627112114, + "grad_norm": 0.670274019241333, + "learning_rate": 7.104816030079369e-06, + "loss": 0.7707, + "step": 13207 + }, + { + "epoch": 0.726952501513567, + "grad_norm": 0.7156813740730286, + "learning_rate": 7.104422834262187e-06, + "loss": 0.7724, + "step": 13208 + }, + { + "epoch": 0.7270075403159227, + "grad_norm": 0.6776198148727417, + "learning_rate": 7.104029622628726e-06, + "loss": 0.7331, + "step": 13209 + }, + { + "epoch": 0.7270625791182784, + "grad_norm": 0.8008358478546143, + "learning_rate": 7.103636395181941e-06, + "loss": 0.8279, + "step": 13210 + }, + { + "epoch": 0.727117617920634, + "grad_norm": 0.6622886061668396, + "learning_rate": 7.1032431519247876e-06, + "loss": 0.6646, + "step": 13211 + }, + { + "epoch": 0.7271726567229897, + "grad_norm": 0.6834877729415894, + "learning_rate": 7.102849892860223e-06, + "loss": 0.75, + "step": 13212 + }, + { + "epoch": 0.7272276955253454, + "grad_norm": 0.7659596800804138, + "learning_rate": 7.1024566179912e-06, + "loss": 0.6999, + "step": 13213 + }, + { + "epoch": 0.7272827343277011, + "grad_norm": 0.7368002533912659, + "learning_rate": 7.102063327320677e-06, + "loss": 0.7376, + "step": 13214 + }, + { + "epoch": 0.7273377731300567, + "grad_norm": 0.7286058664321899, + "learning_rate": 7.101670020851609e-06, + "loss": 0.8139, + "step": 13215 + }, + { + "epoch": 0.7273928119324123, + "grad_norm": 1.0521546602249146, + "learning_rate": 7.101276698586951e-06, + "loss": 0.8545, + "step": 13216 + }, + { + "epoch": 0.727447850734768, + "grad_norm": 0.6940305233001709, + "learning_rate": 7.100883360529659e-06, + "loss": 0.7534, + "step": 13217 + }, + { + "epoch": 0.7275028895371237, + "grad_norm": 0.8279024362564087, + "learning_rate": 7.100490006682691e-06, + "loss": 0.852, + "step": 13218 + }, + { + "epoch": 0.7275579283394793, + "grad_norm": 0.63093501329422, + "learning_rate": 7.100096637049002e-06, + "loss": 0.6728, + "step": 13219 + }, + { + "epoch": 0.727612967141835, + "grad_norm": 0.7576018571853638, + "learning_rate": 7.099703251631549e-06, + "loss": 0.6343, + "step": 13220 + }, + { + "epoch": 0.7276680059441907, + "grad_norm": 0.9493140578269958, + "learning_rate": 7.0993098504332894e-06, + "loss": 0.82, + "step": 13221 + }, + { + "epoch": 0.7277230447465464, + "grad_norm": 0.7279804944992065, + "learning_rate": 7.098916433457177e-06, + "loss": 0.8149, + "step": 13222 + }, + { + "epoch": 0.7277780835489019, + "grad_norm": 0.7660531401634216, + "learning_rate": 7.0985230007061725e-06, + "loss": 0.8278, + "step": 13223 + }, + { + "epoch": 0.7278331223512576, + "grad_norm": 0.6468318104743958, + "learning_rate": 7.09812955218323e-06, + "loss": 0.7193, + "step": 13224 + }, + { + "epoch": 0.7278881611536133, + "grad_norm": 0.6389151811599731, + "learning_rate": 7.097736087891306e-06, + "loss": 0.6744, + "step": 13225 + }, + { + "epoch": 0.727943199955969, + "grad_norm": 0.6565649509429932, + "learning_rate": 7.097342607833361e-06, + "loss": 0.7586, + "step": 13226 + }, + { + "epoch": 0.7279982387583246, + "grad_norm": 0.6867381930351257, + "learning_rate": 7.09694911201235e-06, + "loss": 0.684, + "step": 13227 + }, + { + "epoch": 0.7280532775606803, + "grad_norm": 0.7509286403656006, + "learning_rate": 7.096555600431229e-06, + "loss": 0.8242, + "step": 13228 + }, + { + "epoch": 0.728108316363036, + "grad_norm": 0.6997731328010559, + "learning_rate": 7.096162073092959e-06, + "loss": 0.8182, + "step": 13229 + }, + { + "epoch": 0.7281633551653915, + "grad_norm": 0.6698907017707825, + "learning_rate": 7.095768530000496e-06, + "loss": 0.7752, + "step": 13230 + }, + { + "epoch": 0.7282183939677472, + "grad_norm": 0.7219094634056091, + "learning_rate": 7.095374971156799e-06, + "loss": 0.792, + "step": 13231 + }, + { + "epoch": 0.7282734327701029, + "grad_norm": 0.6479744911193848, + "learning_rate": 7.094981396564822e-06, + "loss": 0.7556, + "step": 13232 + }, + { + "epoch": 0.7283284715724586, + "grad_norm": 0.6795497536659241, + "learning_rate": 7.094587806227527e-06, + "loss": 0.7611, + "step": 13233 + }, + { + "epoch": 0.7283835103748142, + "grad_norm": 0.7145074605941772, + "learning_rate": 7.094194200147871e-06, + "loss": 0.8064, + "step": 13234 + }, + { + "epoch": 0.7284385491771699, + "grad_norm": 0.6750605702400208, + "learning_rate": 7.093800578328811e-06, + "loss": 0.7054, + "step": 13235 + }, + { + "epoch": 0.7284935879795256, + "grad_norm": 0.7574751377105713, + "learning_rate": 7.093406940773307e-06, + "loss": 0.7878, + "step": 13236 + }, + { + "epoch": 0.7285486267818813, + "grad_norm": 0.7836418747901917, + "learning_rate": 7.093013287484316e-06, + "loss": 0.7445, + "step": 13237 + }, + { + "epoch": 0.7286036655842368, + "grad_norm": 0.7658870220184326, + "learning_rate": 7.092619618464799e-06, + "loss": 0.7513, + "step": 13238 + }, + { + "epoch": 0.7286587043865925, + "grad_norm": 1.1127573251724243, + "learning_rate": 7.092225933717711e-06, + "loss": 0.7601, + "step": 13239 + }, + { + "epoch": 0.7287137431889482, + "grad_norm": 0.7003853917121887, + "learning_rate": 7.091832233246015e-06, + "loss": 0.8533, + "step": 13240 + }, + { + "epoch": 0.7287687819913039, + "grad_norm": 0.6513979434967041, + "learning_rate": 7.091438517052667e-06, + "loss": 0.7285, + "step": 13241 + }, + { + "epoch": 0.7288238207936595, + "grad_norm": 0.7072234153747559, + "learning_rate": 7.091044785140626e-06, + "loss": 0.7741, + "step": 13242 + }, + { + "epoch": 0.7288788595960152, + "grad_norm": 0.8117190599441528, + "learning_rate": 7.090651037512854e-06, + "loss": 0.6851, + "step": 13243 + }, + { + "epoch": 0.7289338983983709, + "grad_norm": 0.6876427531242371, + "learning_rate": 7.090257274172306e-06, + "loss": 0.7162, + "step": 13244 + }, + { + "epoch": 0.7289889372007266, + "grad_norm": 0.7128324508666992, + "learning_rate": 7.0898634951219455e-06, + "loss": 0.7302, + "step": 13245 + }, + { + "epoch": 0.7290439760030821, + "grad_norm": 0.6918201446533203, + "learning_rate": 7.089469700364731e-06, + "loss": 0.8582, + "step": 13246 + }, + { + "epoch": 0.7290990148054378, + "grad_norm": 0.6172242164611816, + "learning_rate": 7.08907588990362e-06, + "loss": 0.6846, + "step": 13247 + }, + { + "epoch": 0.7291540536077935, + "grad_norm": 0.6799596548080444, + "learning_rate": 7.088682063741575e-06, + "loss": 0.7174, + "step": 13248 + }, + { + "epoch": 0.7292090924101492, + "grad_norm": 0.6663293838500977, + "learning_rate": 7.088288221881554e-06, + "loss": 0.7237, + "step": 13249 + }, + { + "epoch": 0.7292641312125048, + "grad_norm": 0.6758549213409424, + "learning_rate": 7.0878943643265175e-06, + "loss": 0.7912, + "step": 13250 + }, + { + "epoch": 0.7293191700148605, + "grad_norm": 0.6937153339385986, + "learning_rate": 7.087500491079427e-06, + "loss": 0.742, + "step": 13251 + }, + { + "epoch": 0.7293742088172162, + "grad_norm": 0.6441238522529602, + "learning_rate": 7.087106602143241e-06, + "loss": 0.7676, + "step": 13252 + }, + { + "epoch": 0.7294292476195718, + "grad_norm": 0.6615588068962097, + "learning_rate": 7.08671269752092e-06, + "loss": 0.7069, + "step": 13253 + }, + { + "epoch": 0.7294842864219274, + "grad_norm": 0.8052160739898682, + "learning_rate": 7.086318777215424e-06, + "loss": 0.811, + "step": 13254 + }, + { + "epoch": 0.7295393252242831, + "grad_norm": 0.7293280363082886, + "learning_rate": 7.085924841229716e-06, + "loss": 0.7127, + "step": 13255 + }, + { + "epoch": 0.7295943640266388, + "grad_norm": 0.7104617953300476, + "learning_rate": 7.085530889566756e-06, + "loss": 0.716, + "step": 13256 + }, + { + "epoch": 0.7296494028289945, + "grad_norm": 0.72947758436203, + "learning_rate": 7.085136922229503e-06, + "loss": 0.8144, + "step": 13257 + }, + { + "epoch": 0.7297044416313501, + "grad_norm": 0.7993913292884827, + "learning_rate": 7.08474293922092e-06, + "loss": 0.7609, + "step": 13258 + }, + { + "epoch": 0.7297594804337058, + "grad_norm": 0.7810680270195007, + "learning_rate": 7.0843489405439656e-06, + "loss": 0.8107, + "step": 13259 + }, + { + "epoch": 0.7298145192360614, + "grad_norm": 0.6383776664733887, + "learning_rate": 7.083954926201604e-06, + "loss": 0.7842, + "step": 13260 + }, + { + "epoch": 0.7298695580384171, + "grad_norm": 0.7653967142105103, + "learning_rate": 7.083560896196795e-06, + "loss": 0.729, + "step": 13261 + }, + { + "epoch": 0.7299245968407727, + "grad_norm": 0.6693821549415588, + "learning_rate": 7.083166850532498e-06, + "loss": 0.6901, + "step": 13262 + }, + { + "epoch": 0.7299796356431284, + "grad_norm": 0.7408621907234192, + "learning_rate": 7.082772789211678e-06, + "loss": 0.7415, + "step": 13263 + }, + { + "epoch": 0.7300346744454841, + "grad_norm": 0.6693123579025269, + "learning_rate": 7.082378712237295e-06, + "loss": 0.8102, + "step": 13264 + }, + { + "epoch": 0.7300897132478398, + "grad_norm": 0.6572727560997009, + "learning_rate": 7.081984619612311e-06, + "loss": 0.6595, + "step": 13265 + }, + { + "epoch": 0.7301447520501954, + "grad_norm": 0.7934693694114685, + "learning_rate": 7.081590511339687e-06, + "loss": 0.8024, + "step": 13266 + }, + { + "epoch": 0.730199790852551, + "grad_norm": 1.0663061141967773, + "learning_rate": 7.081196387422388e-06, + "loss": 0.7844, + "step": 13267 + }, + { + "epoch": 0.7302548296549067, + "grad_norm": 0.8005035519599915, + "learning_rate": 7.080802247863372e-06, + "loss": 0.751, + "step": 13268 + }, + { + "epoch": 0.7303098684572624, + "grad_norm": 0.6480177044868469, + "learning_rate": 7.0804080926656046e-06, + "loss": 0.7745, + "step": 13269 + }, + { + "epoch": 0.730364907259618, + "grad_norm": 0.7026820182800293, + "learning_rate": 7.080013921832047e-06, + "loss": 0.7545, + "step": 13270 + }, + { + "epoch": 0.7304199460619737, + "grad_norm": 0.673954427242279, + "learning_rate": 7.079619735365662e-06, + "loss": 0.7142, + "step": 13271 + }, + { + "epoch": 0.7304749848643294, + "grad_norm": 0.7296637296676636, + "learning_rate": 7.079225533269411e-06, + "loss": 0.8493, + "step": 13272 + }, + { + "epoch": 0.730530023666685, + "grad_norm": 0.7147308588027954, + "learning_rate": 7.0788313155462576e-06, + "loss": 0.7638, + "step": 13273 + }, + { + "epoch": 0.7305850624690406, + "grad_norm": 0.7531922459602356, + "learning_rate": 7.078437082199163e-06, + "loss": 0.8644, + "step": 13274 + }, + { + "epoch": 0.7306401012713963, + "grad_norm": 0.6581404805183411, + "learning_rate": 7.078042833231092e-06, + "loss": 0.7555, + "step": 13275 + }, + { + "epoch": 0.730695140073752, + "grad_norm": 0.6781187057495117, + "learning_rate": 7.0776485686450095e-06, + "loss": 0.7536, + "step": 13276 + }, + { + "epoch": 0.7307501788761076, + "grad_norm": 0.7164949774742126, + "learning_rate": 7.077254288443874e-06, + "loss": 0.7275, + "step": 13277 + }, + { + "epoch": 0.7308052176784633, + "grad_norm": 0.8158305287361145, + "learning_rate": 7.076859992630652e-06, + "loss": 0.6821, + "step": 13278 + }, + { + "epoch": 0.730860256480819, + "grad_norm": 0.7101448178291321, + "learning_rate": 7.076465681208307e-06, + "loss": 0.69, + "step": 13279 + }, + { + "epoch": 0.7309152952831747, + "grad_norm": 0.6844518780708313, + "learning_rate": 7.076071354179802e-06, + "loss": 0.7577, + "step": 13280 + }, + { + "epoch": 0.7309703340855302, + "grad_norm": 0.6564158797264099, + "learning_rate": 7.0756770115481e-06, + "loss": 0.6752, + "step": 13281 + }, + { + "epoch": 0.7310253728878859, + "grad_norm": 0.7444283962249756, + "learning_rate": 7.0752826533161655e-06, + "loss": 0.8118, + "step": 13282 + }, + { + "epoch": 0.7310804116902416, + "grad_norm": 0.7657533884048462, + "learning_rate": 7.074888279486962e-06, + "loss": 0.8819, + "step": 13283 + }, + { + "epoch": 0.7311354504925973, + "grad_norm": 0.6924453973770142, + "learning_rate": 7.074493890063453e-06, + "loss": 0.7674, + "step": 13284 + }, + { + "epoch": 0.7311904892949529, + "grad_norm": 0.676188588142395, + "learning_rate": 7.074099485048603e-06, + "loss": 0.7266, + "step": 13285 + }, + { + "epoch": 0.7312455280973086, + "grad_norm": 0.6325914263725281, + "learning_rate": 7.073705064445378e-06, + "loss": 0.6856, + "step": 13286 + }, + { + "epoch": 0.7313005668996643, + "grad_norm": 0.662558913230896, + "learning_rate": 7.073310628256739e-06, + "loss": 0.751, + "step": 13287 + }, + { + "epoch": 0.73135560570202, + "grad_norm": 0.8313137292861938, + "learning_rate": 7.072916176485654e-06, + "loss": 0.7187, + "step": 13288 + }, + { + "epoch": 0.7314106445043755, + "grad_norm": 0.7033550143241882, + "learning_rate": 7.072521709135084e-06, + "loss": 0.8132, + "step": 13289 + }, + { + "epoch": 0.7314656833067312, + "grad_norm": 0.715242862701416, + "learning_rate": 7.0721272262079965e-06, + "loss": 0.8551, + "step": 13290 + }, + { + "epoch": 0.7315207221090869, + "grad_norm": 0.7545164227485657, + "learning_rate": 7.071732727707356e-06, + "loss": 0.7772, + "step": 13291 + }, + { + "epoch": 0.7315757609114426, + "grad_norm": 0.7181825637817383, + "learning_rate": 7.071338213636126e-06, + "loss": 0.7378, + "step": 13292 + }, + { + "epoch": 0.7316307997137982, + "grad_norm": 0.7793779969215393, + "learning_rate": 7.070943683997273e-06, + "loss": 0.7801, + "step": 13293 + }, + { + "epoch": 0.7316858385161539, + "grad_norm": 0.7456476092338562, + "learning_rate": 7.070549138793762e-06, + "loss": 0.8038, + "step": 13294 + }, + { + "epoch": 0.7317408773185096, + "grad_norm": 0.652519702911377, + "learning_rate": 7.0701545780285576e-06, + "loss": 0.746, + "step": 13295 + }, + { + "epoch": 0.7317959161208653, + "grad_norm": 0.784450888633728, + "learning_rate": 7.069760001704625e-06, + "loss": 0.8065, + "step": 13296 + }, + { + "epoch": 0.7318509549232208, + "grad_norm": 0.8052587509155273, + "learning_rate": 7.069365409824931e-06, + "loss": 0.8098, + "step": 13297 + }, + { + "epoch": 0.7319059937255765, + "grad_norm": 0.6890794038772583, + "learning_rate": 7.06897080239244e-06, + "loss": 0.783, + "step": 13298 + }, + { + "epoch": 0.7319610325279322, + "grad_norm": 0.7470653057098389, + "learning_rate": 7.068576179410119e-06, + "loss": 0.7658, + "step": 13299 + }, + { + "epoch": 0.7320160713302879, + "grad_norm": 0.6831437945365906, + "learning_rate": 7.068181540880932e-06, + "loss": 0.7864, + "step": 13300 + }, + { + "epoch": 0.7320711101326435, + "grad_norm": 0.7058265209197998, + "learning_rate": 7.067786886807847e-06, + "loss": 0.8254, + "step": 13301 + }, + { + "epoch": 0.7321261489349992, + "grad_norm": 0.7938248515129089, + "learning_rate": 7.067392217193828e-06, + "loss": 0.7291, + "step": 13302 + }, + { + "epoch": 0.7321811877373549, + "grad_norm": 0.7261865735054016, + "learning_rate": 7.066997532041844e-06, + "loss": 0.8115, + "step": 13303 + }, + { + "epoch": 0.7322362265397105, + "grad_norm": 0.6971743702888489, + "learning_rate": 7.0666028313548586e-06, + "loss": 0.7504, + "step": 13304 + }, + { + "epoch": 0.7322912653420661, + "grad_norm": 0.844879150390625, + "learning_rate": 7.0662081151358405e-06, + "loss": 0.7903, + "step": 13305 + }, + { + "epoch": 0.7323463041444218, + "grad_norm": 0.6670572757720947, + "learning_rate": 7.065813383387755e-06, + "loss": 0.7597, + "step": 13306 + }, + { + "epoch": 0.7324013429467775, + "grad_norm": 0.669711172580719, + "learning_rate": 7.06541863611357e-06, + "loss": 0.7179, + "step": 13307 + }, + { + "epoch": 0.7324563817491332, + "grad_norm": 0.7176600098609924, + "learning_rate": 7.0650238733162506e-06, + "loss": 0.8157, + "step": 13308 + }, + { + "epoch": 0.7325114205514888, + "grad_norm": 0.7230100631713867, + "learning_rate": 7.064629094998765e-06, + "loss": 0.7902, + "step": 13309 + }, + { + "epoch": 0.7325664593538445, + "grad_norm": 0.8811234831809998, + "learning_rate": 7.064234301164078e-06, + "loss": 0.7746, + "step": 13310 + }, + { + "epoch": 0.7326214981562001, + "grad_norm": 0.6777653098106384, + "learning_rate": 7.06383949181516e-06, + "loss": 0.7708, + "step": 13311 + }, + { + "epoch": 0.7326765369585558, + "grad_norm": 0.6692547798156738, + "learning_rate": 7.063444666954977e-06, + "loss": 0.7103, + "step": 13312 + }, + { + "epoch": 0.7327315757609114, + "grad_norm": 1.2304950952529907, + "learning_rate": 7.063049826586496e-06, + "loss": 0.7878, + "step": 13313 + }, + { + "epoch": 0.7327866145632671, + "grad_norm": 0.7073930501937866, + "learning_rate": 7.0626549707126834e-06, + "loss": 0.7546, + "step": 13314 + }, + { + "epoch": 0.7328416533656228, + "grad_norm": 0.7184866070747375, + "learning_rate": 7.06226009933651e-06, + "loss": 0.7207, + "step": 13315 + }, + { + "epoch": 0.7328966921679784, + "grad_norm": 0.7098046541213989, + "learning_rate": 7.061865212460941e-06, + "loss": 0.6415, + "step": 13316 + }, + { + "epoch": 0.7329517309703341, + "grad_norm": 0.714379608631134, + "learning_rate": 7.0614703100889445e-06, + "loss": 0.7305, + "step": 13317 + }, + { + "epoch": 0.7330067697726897, + "grad_norm": 0.655060887336731, + "learning_rate": 7.061075392223491e-06, + "loss": 0.6125, + "step": 13318 + }, + { + "epoch": 0.7330618085750454, + "grad_norm": 0.6481055617332458, + "learning_rate": 7.060680458867545e-06, + "loss": 0.7059, + "step": 13319 + }, + { + "epoch": 0.733116847377401, + "grad_norm": 0.7123916745185852, + "learning_rate": 7.060285510024076e-06, + "loss": 0.8007, + "step": 13320 + }, + { + "epoch": 0.7331718861797567, + "grad_norm": 0.7231262922286987, + "learning_rate": 7.059890545696053e-06, + "loss": 0.7781, + "step": 13321 + }, + { + "epoch": 0.7332269249821124, + "grad_norm": 0.8415369391441345, + "learning_rate": 7.0594955658864435e-06, + "loss": 0.6649, + "step": 13322 + }, + { + "epoch": 0.7332819637844681, + "grad_norm": 0.7243070006370544, + "learning_rate": 7.059100570598217e-06, + "loss": 0.6588, + "step": 13323 + }, + { + "epoch": 0.7333370025868237, + "grad_norm": 0.6581026315689087, + "learning_rate": 7.058705559834342e-06, + "loss": 0.7938, + "step": 13324 + }, + { + "epoch": 0.7333920413891793, + "grad_norm": 0.6213739514350891, + "learning_rate": 7.058310533597787e-06, + "loss": 0.7092, + "step": 13325 + }, + { + "epoch": 0.733447080191535, + "grad_norm": 0.6857954859733582, + "learning_rate": 7.057915491891522e-06, + "loss": 0.698, + "step": 13326 + }, + { + "epoch": 0.7335021189938907, + "grad_norm": 0.7528544068336487, + "learning_rate": 7.0575204347185135e-06, + "loss": 0.7234, + "step": 13327 + }, + { + "epoch": 0.7335571577962463, + "grad_norm": 0.6449099779129028, + "learning_rate": 7.057125362081733e-06, + "loss": 0.7391, + "step": 13328 + }, + { + "epoch": 0.733612196598602, + "grad_norm": 0.640689492225647, + "learning_rate": 7.0567302739841495e-06, + "loss": 0.5316, + "step": 13329 + }, + { + "epoch": 0.7336672354009577, + "grad_norm": 0.6686868071556091, + "learning_rate": 7.056335170428731e-06, + "loss": 0.7713, + "step": 13330 + }, + { + "epoch": 0.7337222742033134, + "grad_norm": 0.7627772688865662, + "learning_rate": 7.055940051418447e-06, + "loss": 0.7706, + "step": 13331 + }, + { + "epoch": 0.733777313005669, + "grad_norm": 0.7421852350234985, + "learning_rate": 7.055544916956269e-06, + "loss": 0.6418, + "step": 13332 + }, + { + "epoch": 0.7338323518080246, + "grad_norm": 0.7414699196815491, + "learning_rate": 7.0551497670451666e-06, + "loss": 0.811, + "step": 13333 + }, + { + "epoch": 0.7338873906103803, + "grad_norm": 0.7054136991500854, + "learning_rate": 7.0547546016881064e-06, + "loss": 0.8005, + "step": 13334 + }, + { + "epoch": 0.733942429412736, + "grad_norm": 0.670174241065979, + "learning_rate": 7.054359420888062e-06, + "loss": 0.6136, + "step": 13335 + }, + { + "epoch": 0.7339974682150916, + "grad_norm": 0.728255033493042, + "learning_rate": 7.053964224648001e-06, + "loss": 0.848, + "step": 13336 + }, + { + "epoch": 0.7340525070174473, + "grad_norm": 0.729815661907196, + "learning_rate": 7.053569012970896e-06, + "loss": 0.6985, + "step": 13337 + }, + { + "epoch": 0.734107545819803, + "grad_norm": 0.7564244866371155, + "learning_rate": 7.053173785859715e-06, + "loss": 0.7995, + "step": 13338 + }, + { + "epoch": 0.7341625846221587, + "grad_norm": 0.7746061682701111, + "learning_rate": 7.05277854331743e-06, + "loss": 0.7663, + "step": 13339 + }, + { + "epoch": 0.7342176234245142, + "grad_norm": 0.6878651976585388, + "learning_rate": 7.052383285347011e-06, + "loss": 0.8624, + "step": 13340 + }, + { + "epoch": 0.7342726622268699, + "grad_norm": 0.6989734768867493, + "learning_rate": 7.051988011951428e-06, + "loss": 0.7221, + "step": 13341 + }, + { + "epoch": 0.7343277010292256, + "grad_norm": 0.6854223012924194, + "learning_rate": 7.051592723133654e-06, + "loss": 0.7878, + "step": 13342 + }, + { + "epoch": 0.7343827398315813, + "grad_norm": 0.746696949005127, + "learning_rate": 7.051197418896657e-06, + "loss": 0.7074, + "step": 13343 + }, + { + "epoch": 0.7344377786339369, + "grad_norm": 0.6933150887489319, + "learning_rate": 7.050802099243409e-06, + "loss": 0.7587, + "step": 13344 + }, + { + "epoch": 0.7344928174362926, + "grad_norm": 0.7285788655281067, + "learning_rate": 7.050406764176882e-06, + "loss": 0.6589, + "step": 13345 + }, + { + "epoch": 0.7345478562386483, + "grad_norm": 0.6834994554519653, + "learning_rate": 7.050011413700046e-06, + "loss": 0.7196, + "step": 13346 + }, + { + "epoch": 0.734602895041004, + "grad_norm": 0.6504353880882263, + "learning_rate": 7.049616047815873e-06, + "loss": 0.7675, + "step": 13347 + }, + { + "epoch": 0.7346579338433595, + "grad_norm": 0.7009296417236328, + "learning_rate": 7.049220666527335e-06, + "loss": 0.7638, + "step": 13348 + }, + { + "epoch": 0.7347129726457152, + "grad_norm": 0.6210034489631653, + "learning_rate": 7.0488252698374024e-06, + "loss": 0.6872, + "step": 13349 + }, + { + "epoch": 0.7347680114480709, + "grad_norm": 0.6280165910720825, + "learning_rate": 7.0484298577490485e-06, + "loss": 0.7084, + "step": 13350 + }, + { + "epoch": 0.7348230502504266, + "grad_norm": 0.8055418133735657, + "learning_rate": 7.048034430265242e-06, + "loss": 0.8202, + "step": 13351 + }, + { + "epoch": 0.7348780890527822, + "grad_norm": 0.6674166917800903, + "learning_rate": 7.047638987388959e-06, + "loss": 0.6368, + "step": 13352 + }, + { + "epoch": 0.7349331278551379, + "grad_norm": 0.9182783961296082, + "learning_rate": 7.04724352912317e-06, + "loss": 0.6734, + "step": 13353 + }, + { + "epoch": 0.7349881666574936, + "grad_norm": 0.6371243596076965, + "learning_rate": 7.046848055470845e-06, + "loss": 0.7308, + "step": 13354 + }, + { + "epoch": 0.7350432054598492, + "grad_norm": 0.6454519033432007, + "learning_rate": 7.046452566434959e-06, + "loss": 0.6882, + "step": 13355 + }, + { + "epoch": 0.7350982442622048, + "grad_norm": 0.648970365524292, + "learning_rate": 7.046057062018483e-06, + "loss": 0.7247, + "step": 13356 + }, + { + "epoch": 0.7351532830645605, + "grad_norm": 0.668886661529541, + "learning_rate": 7.04566154222439e-06, + "loss": 0.7379, + "step": 13357 + }, + { + "epoch": 0.7352083218669162, + "grad_norm": 0.6593654751777649, + "learning_rate": 7.045266007055651e-06, + "loss": 0.7473, + "step": 13358 + }, + { + "epoch": 0.7352633606692718, + "grad_norm": 0.8418927192687988, + "learning_rate": 7.044870456515241e-06, + "loss": 0.7949, + "step": 13359 + }, + { + "epoch": 0.7353183994716275, + "grad_norm": 0.7350470423698425, + "learning_rate": 7.044474890606132e-06, + "loss": 0.7545, + "step": 13360 + }, + { + "epoch": 0.7353734382739832, + "grad_norm": 0.7786250114440918, + "learning_rate": 7.044079309331298e-06, + "loss": 0.8587, + "step": 13361 + }, + { + "epoch": 0.7354284770763388, + "grad_norm": 0.6345693469047546, + "learning_rate": 7.04368371269371e-06, + "loss": 0.77, + "step": 13362 + }, + { + "epoch": 0.7354835158786944, + "grad_norm": 0.7030417919158936, + "learning_rate": 7.043288100696343e-06, + "loss": 0.7624, + "step": 13363 + }, + { + "epoch": 0.7355385546810501, + "grad_norm": 0.7526041865348816, + "learning_rate": 7.042892473342169e-06, + "loss": 0.8018, + "step": 13364 + }, + { + "epoch": 0.7355935934834058, + "grad_norm": 0.6419941782951355, + "learning_rate": 7.042496830634162e-06, + "loss": 0.6788, + "step": 13365 + }, + { + "epoch": 0.7356486322857615, + "grad_norm": 0.6952203512191772, + "learning_rate": 7.042101172575297e-06, + "loss": 0.7747, + "step": 13366 + }, + { + "epoch": 0.7357036710881171, + "grad_norm": 0.8046327829360962, + "learning_rate": 7.041705499168544e-06, + "loss": 0.8216, + "step": 13367 + }, + { + "epoch": 0.7357587098904728, + "grad_norm": 0.6641537547111511, + "learning_rate": 7.041309810416881e-06, + "loss": 0.7313, + "step": 13368 + }, + { + "epoch": 0.7358137486928285, + "grad_norm": 0.6824444532394409, + "learning_rate": 7.040914106323278e-06, + "loss": 0.7179, + "step": 13369 + }, + { + "epoch": 0.7358687874951841, + "grad_norm": 0.6469557285308838, + "learning_rate": 7.040518386890711e-06, + "loss": 0.7671, + "step": 13370 + }, + { + "epoch": 0.7359238262975397, + "grad_norm": 0.6826488971710205, + "learning_rate": 7.040122652122156e-06, + "loss": 0.7, + "step": 13371 + }, + { + "epoch": 0.7359788650998954, + "grad_norm": 0.6931618452072144, + "learning_rate": 7.039726902020583e-06, + "loss": 0.7641, + "step": 13372 + }, + { + "epoch": 0.7360339039022511, + "grad_norm": 0.7445465922355652, + "learning_rate": 7.039331136588971e-06, + "loss": 0.7458, + "step": 13373 + }, + { + "epoch": 0.7360889427046068, + "grad_norm": 0.6358756422996521, + "learning_rate": 7.038935355830289e-06, + "loss": 0.6125, + "step": 13374 + }, + { + "epoch": 0.7361439815069624, + "grad_norm": 0.6966063380241394, + "learning_rate": 7.038539559747517e-06, + "loss": 0.6812, + "step": 13375 + }, + { + "epoch": 0.736199020309318, + "grad_norm": 0.9898090362548828, + "learning_rate": 7.038143748343626e-06, + "loss": 0.707, + "step": 13376 + }, + { + "epoch": 0.7362540591116737, + "grad_norm": 0.685951828956604, + "learning_rate": 7.0377479216215935e-06, + "loss": 0.7932, + "step": 13377 + }, + { + "epoch": 0.7363090979140294, + "grad_norm": 0.7056856751441956, + "learning_rate": 7.037352079584392e-06, + "loss": 0.7432, + "step": 13378 + }, + { + "epoch": 0.736364136716385, + "grad_norm": 0.7802489995956421, + "learning_rate": 7.036956222234999e-06, + "loss": 0.8275, + "step": 13379 + }, + { + "epoch": 0.7364191755187407, + "grad_norm": 0.7990192770957947, + "learning_rate": 7.036560349576387e-06, + "loss": 0.893, + "step": 13380 + }, + { + "epoch": 0.7364742143210964, + "grad_norm": 0.6454586386680603, + "learning_rate": 7.0361644616115334e-06, + "loss": 0.751, + "step": 13381 + }, + { + "epoch": 0.7365292531234521, + "grad_norm": 0.7071009278297424, + "learning_rate": 7.035768558343412e-06, + "loss": 0.7771, + "step": 13382 + }, + { + "epoch": 0.7365842919258077, + "grad_norm": 0.6530466079711914, + "learning_rate": 7.035372639774999e-06, + "loss": 0.7529, + "step": 13383 + }, + { + "epoch": 0.7366393307281633, + "grad_norm": 0.728689968585968, + "learning_rate": 7.03497670590927e-06, + "loss": 0.7862, + "step": 13384 + }, + { + "epoch": 0.736694369530519, + "grad_norm": 0.6640015244483948, + "learning_rate": 7.034580756749202e-06, + "loss": 0.6876, + "step": 13385 + }, + { + "epoch": 0.7367494083328747, + "grad_norm": 0.7388426661491394, + "learning_rate": 7.034184792297769e-06, + "loss": 0.8168, + "step": 13386 + }, + { + "epoch": 0.7368044471352303, + "grad_norm": 0.6543731093406677, + "learning_rate": 7.0337888125579465e-06, + "loss": 0.7555, + "step": 13387 + }, + { + "epoch": 0.736859485937586, + "grad_norm": 0.7783555388450623, + "learning_rate": 7.0333928175327125e-06, + "loss": 0.755, + "step": 13388 + }, + { + "epoch": 0.7369145247399417, + "grad_norm": 0.6275887489318848, + "learning_rate": 7.032996807225043e-06, + "loss": 0.7187, + "step": 13389 + }, + { + "epoch": 0.7369695635422974, + "grad_norm": 0.7007517218589783, + "learning_rate": 7.032600781637913e-06, + "loss": 0.6993, + "step": 13390 + }, + { + "epoch": 0.737024602344653, + "grad_norm": 0.6322247385978699, + "learning_rate": 7.0322047407743e-06, + "loss": 0.7178, + "step": 13391 + }, + { + "epoch": 0.7370796411470086, + "grad_norm": 0.7160976529121399, + "learning_rate": 7.0318086846371804e-06, + "loss": 0.6884, + "step": 13392 + }, + { + "epoch": 0.7371346799493643, + "grad_norm": 0.6056101322174072, + "learning_rate": 7.03141261322953e-06, + "loss": 0.6672, + "step": 13393 + }, + { + "epoch": 0.73718971875172, + "grad_norm": 0.8779410123825073, + "learning_rate": 7.0310165265543264e-06, + "loss": 0.7564, + "step": 13394 + }, + { + "epoch": 0.7372447575540756, + "grad_norm": 0.6868176460266113, + "learning_rate": 7.030620424614546e-06, + "loss": 0.7658, + "step": 13395 + }, + { + "epoch": 0.7372997963564313, + "grad_norm": 0.7611618041992188, + "learning_rate": 7.030224307413166e-06, + "loss": 0.6445, + "step": 13396 + }, + { + "epoch": 0.737354835158787, + "grad_norm": 0.7688242793083191, + "learning_rate": 7.0298281749531636e-06, + "loss": 0.8061, + "step": 13397 + }, + { + "epoch": 0.7374098739611427, + "grad_norm": 0.6781700849533081, + "learning_rate": 7.029432027237518e-06, + "loss": 0.6374, + "step": 13398 + }, + { + "epoch": 0.7374649127634982, + "grad_norm": 0.6719028353691101, + "learning_rate": 7.0290358642692e-06, + "loss": 0.7585, + "step": 13399 + }, + { + "epoch": 0.7375199515658539, + "grad_norm": 0.704429030418396, + "learning_rate": 7.028639686051195e-06, + "loss": 0.7052, + "step": 13400 + }, + { + "epoch": 0.7375749903682096, + "grad_norm": 0.714914083480835, + "learning_rate": 7.028243492586478e-06, + "loss": 0.7785, + "step": 13401 + }, + { + "epoch": 0.7376300291705652, + "grad_norm": 0.7732700705528259, + "learning_rate": 7.027847283878023e-06, + "loss": 0.7812, + "step": 13402 + }, + { + "epoch": 0.7376850679729209, + "grad_norm": 0.6849464178085327, + "learning_rate": 7.027451059928813e-06, + "loss": 0.7657, + "step": 13403 + }, + { + "epoch": 0.7377401067752766, + "grad_norm": 0.6924402117729187, + "learning_rate": 7.027054820741822e-06, + "loss": 0.677, + "step": 13404 + }, + { + "epoch": 0.7377951455776323, + "grad_norm": 0.7142716646194458, + "learning_rate": 7.02665856632003e-06, + "loss": 0.7071, + "step": 13405 + }, + { + "epoch": 0.7378501843799878, + "grad_norm": 0.7227265238761902, + "learning_rate": 7.0262622966664154e-06, + "loss": 0.6986, + "step": 13406 + }, + { + "epoch": 0.7379052231823435, + "grad_norm": 0.6387726664543152, + "learning_rate": 7.025866011783954e-06, + "loss": 0.6563, + "step": 13407 + }, + { + "epoch": 0.7379602619846992, + "grad_norm": 0.6411992311477661, + "learning_rate": 7.025469711675628e-06, + "loss": 0.5842, + "step": 13408 + }, + { + "epoch": 0.7380153007870549, + "grad_norm": 0.6811027526855469, + "learning_rate": 7.025073396344413e-06, + "loss": 0.6746, + "step": 13409 + }, + { + "epoch": 0.7380703395894105, + "grad_norm": 1.0705479383468628, + "learning_rate": 7.024677065793289e-06, + "loss": 0.7457, + "step": 13410 + }, + { + "epoch": 0.7381253783917662, + "grad_norm": 0.6920849084854126, + "learning_rate": 7.024280720025232e-06, + "loss": 0.6838, + "step": 13411 + }, + { + "epoch": 0.7381804171941219, + "grad_norm": 0.8089182376861572, + "learning_rate": 7.0238843590432236e-06, + "loss": 0.6682, + "step": 13412 + }, + { + "epoch": 0.7382354559964776, + "grad_norm": 0.6140334010124207, + "learning_rate": 7.023487982850244e-06, + "loss": 0.6992, + "step": 13413 + }, + { + "epoch": 0.7382904947988331, + "grad_norm": 0.8564643263816833, + "learning_rate": 7.023091591449269e-06, + "loss": 0.8512, + "step": 13414 + }, + { + "epoch": 0.7383455336011888, + "grad_norm": 0.655516505241394, + "learning_rate": 7.02269518484328e-06, + "loss": 0.7291, + "step": 13415 + }, + { + "epoch": 0.7384005724035445, + "grad_norm": 0.6373177766799927, + "learning_rate": 7.022298763035255e-06, + "loss": 0.7553, + "step": 13416 + }, + { + "epoch": 0.7384556112059002, + "grad_norm": 0.7023805379867554, + "learning_rate": 7.021902326028174e-06, + "loss": 0.7562, + "step": 13417 + }, + { + "epoch": 0.7385106500082558, + "grad_norm": 0.654181182384491, + "learning_rate": 7.021505873825016e-06, + "loss": 0.7153, + "step": 13418 + }, + { + "epoch": 0.7385656888106115, + "grad_norm": 0.6633459329605103, + "learning_rate": 7.02110940642876e-06, + "loss": 0.6779, + "step": 13419 + }, + { + "epoch": 0.7386207276129672, + "grad_norm": 0.7050659656524658, + "learning_rate": 7.020712923842388e-06, + "loss": 0.741, + "step": 13420 + }, + { + "epoch": 0.7386757664153228, + "grad_norm": 0.7241182327270508, + "learning_rate": 7.020316426068879e-06, + "loss": 0.7479, + "step": 13421 + }, + { + "epoch": 0.7387308052176784, + "grad_norm": 1.0262155532836914, + "learning_rate": 7.019919913111212e-06, + "loss": 0.8418, + "step": 13422 + }, + { + "epoch": 0.7387858440200341, + "grad_norm": 0.6765457391738892, + "learning_rate": 7.019523384972366e-06, + "loss": 0.727, + "step": 13423 + }, + { + "epoch": 0.7388408828223898, + "grad_norm": 0.6871724724769592, + "learning_rate": 7.0191268416553245e-06, + "loss": 0.8273, + "step": 13424 + }, + { + "epoch": 0.7388959216247455, + "grad_norm": 0.8085252046585083, + "learning_rate": 7.018730283163067e-06, + "loss": 0.7306, + "step": 13425 + }, + { + "epoch": 0.7389509604271011, + "grad_norm": 0.6822873950004578, + "learning_rate": 7.018333709498572e-06, + "loss": 0.7454, + "step": 13426 + }, + { + "epoch": 0.7390059992294568, + "grad_norm": 0.7210521697998047, + "learning_rate": 7.01793712066482e-06, + "loss": 0.8306, + "step": 13427 + }, + { + "epoch": 0.7390610380318124, + "grad_norm": 0.6404997110366821, + "learning_rate": 7.017540516664795e-06, + "loss": 0.7151, + "step": 13428 + }, + { + "epoch": 0.7391160768341681, + "grad_norm": 0.6662821769714355, + "learning_rate": 7.017143897501475e-06, + "loss": 0.7446, + "step": 13429 + }, + { + "epoch": 0.7391711156365237, + "grad_norm": 0.8048129081726074, + "learning_rate": 7.0167472631778415e-06, + "loss": 0.7953, + "step": 13430 + }, + { + "epoch": 0.7392261544388794, + "grad_norm": 0.7215000987052917, + "learning_rate": 7.016350613696873e-06, + "loss": 0.8373, + "step": 13431 + }, + { + "epoch": 0.7392811932412351, + "grad_norm": 0.7309150099754333, + "learning_rate": 7.015953949061555e-06, + "loss": 0.7654, + "step": 13432 + }, + { + "epoch": 0.7393362320435908, + "grad_norm": 0.6487464904785156, + "learning_rate": 7.0155572692748665e-06, + "loss": 0.6473, + "step": 13433 + }, + { + "epoch": 0.7393912708459464, + "grad_norm": 0.6172077059745789, + "learning_rate": 7.01516057433979e-06, + "loss": 0.6672, + "step": 13434 + }, + { + "epoch": 0.739446309648302, + "grad_norm": 0.7569651007652283, + "learning_rate": 7.014763864259304e-06, + "loss": 0.8501, + "step": 13435 + }, + { + "epoch": 0.7395013484506577, + "grad_norm": 0.824669599533081, + "learning_rate": 7.014367139036393e-06, + "loss": 0.8596, + "step": 13436 + }, + { + "epoch": 0.7395563872530134, + "grad_norm": 0.6904401183128357, + "learning_rate": 7.013970398674038e-06, + "loss": 0.7403, + "step": 13437 + }, + { + "epoch": 0.739611426055369, + "grad_norm": 0.7999581098556519, + "learning_rate": 7.013573643175221e-06, + "loss": 0.8879, + "step": 13438 + }, + { + "epoch": 0.7396664648577247, + "grad_norm": 0.6600533723831177, + "learning_rate": 7.0131768725429236e-06, + "loss": 0.7324, + "step": 13439 + }, + { + "epoch": 0.7397215036600804, + "grad_norm": 0.7174191474914551, + "learning_rate": 7.0127800867801275e-06, + "loss": 0.7474, + "step": 13440 + }, + { + "epoch": 0.7397765424624361, + "grad_norm": 0.7023884654045105, + "learning_rate": 7.012383285889814e-06, + "loss": 0.7826, + "step": 13441 + }, + { + "epoch": 0.7398315812647916, + "grad_norm": 0.6486913561820984, + "learning_rate": 7.011986469874969e-06, + "loss": 0.6553, + "step": 13442 + }, + { + "epoch": 0.7398866200671473, + "grad_norm": 0.7238486409187317, + "learning_rate": 7.011589638738569e-06, + "loss": 0.6759, + "step": 13443 + }, + { + "epoch": 0.739941658869503, + "grad_norm": 0.7879656553268433, + "learning_rate": 7.011192792483601e-06, + "loss": 0.886, + "step": 13444 + }, + { + "epoch": 0.7399966976718586, + "grad_norm": 0.6592407822608948, + "learning_rate": 7.010795931113047e-06, + "loss": 0.7746, + "step": 13445 + }, + { + "epoch": 0.7400517364742143, + "grad_norm": 0.8274507522583008, + "learning_rate": 7.010399054629889e-06, + "loss": 0.7615, + "step": 13446 + }, + { + "epoch": 0.74010677527657, + "grad_norm": 0.6233614087104797, + "learning_rate": 7.010002163037109e-06, + "loss": 0.695, + "step": 13447 + }, + { + "epoch": 0.7401618140789257, + "grad_norm": 0.7082701921463013, + "learning_rate": 7.00960525633769e-06, + "loss": 0.6677, + "step": 13448 + }, + { + "epoch": 0.7402168528812813, + "grad_norm": 1.0694652795791626, + "learning_rate": 7.009208334534618e-06, + "loss": 0.7792, + "step": 13449 + }, + { + "epoch": 0.7402718916836369, + "grad_norm": 0.7189109325408936, + "learning_rate": 7.008811397630874e-06, + "loss": 0.8606, + "step": 13450 + }, + { + "epoch": 0.7403269304859926, + "grad_norm": 0.7136901617050171, + "learning_rate": 7.00841444562944e-06, + "loss": 0.7142, + "step": 13451 + }, + { + "epoch": 0.7403819692883483, + "grad_norm": 0.6508508920669556, + "learning_rate": 7.008017478533301e-06, + "loss": 0.6748, + "step": 13452 + }, + { + "epoch": 0.7404370080907039, + "grad_norm": 0.6560903191566467, + "learning_rate": 7.007620496345441e-06, + "loss": 0.7929, + "step": 13453 + }, + { + "epoch": 0.7404920468930596, + "grad_norm": 0.6909067034721375, + "learning_rate": 7.007223499068841e-06, + "loss": 0.6118, + "step": 13454 + }, + { + "epoch": 0.7405470856954153, + "grad_norm": 0.6554582715034485, + "learning_rate": 7.0068264867064874e-06, + "loss": 0.7687, + "step": 13455 + }, + { + "epoch": 0.740602124497771, + "grad_norm": 0.7788346409797668, + "learning_rate": 7.006429459261363e-06, + "loss": 0.7535, + "step": 13456 + }, + { + "epoch": 0.7406571633001265, + "grad_norm": 0.7702943682670593, + "learning_rate": 7.006032416736452e-06, + "loss": 0.833, + "step": 13457 + }, + { + "epoch": 0.7407122021024822, + "grad_norm": 0.6860190033912659, + "learning_rate": 7.005635359134738e-06, + "loss": 0.6643, + "step": 13458 + }, + { + "epoch": 0.7407672409048379, + "grad_norm": 0.7470136880874634, + "learning_rate": 7.005238286459205e-06, + "loss": 0.7811, + "step": 13459 + }, + { + "epoch": 0.7408222797071936, + "grad_norm": 0.6769132614135742, + "learning_rate": 7.004841198712839e-06, + "loss": 0.7322, + "step": 13460 + }, + { + "epoch": 0.7408773185095492, + "grad_norm": 0.7865259647369385, + "learning_rate": 7.004444095898623e-06, + "loss": 0.817, + "step": 13461 + }, + { + "epoch": 0.7409323573119049, + "grad_norm": 0.7352784276008606, + "learning_rate": 7.004046978019542e-06, + "loss": 0.7373, + "step": 13462 + }, + { + "epoch": 0.7409873961142606, + "grad_norm": 0.7647448182106018, + "learning_rate": 7.00364984507858e-06, + "loss": 0.7129, + "step": 13463 + }, + { + "epoch": 0.7410424349166163, + "grad_norm": 0.6979989409446716, + "learning_rate": 7.003252697078722e-06, + "loss": 0.7833, + "step": 13464 + }, + { + "epoch": 0.7410974737189718, + "grad_norm": 0.6117465496063232, + "learning_rate": 7.002855534022953e-06, + "loss": 0.6732, + "step": 13465 + }, + { + "epoch": 0.7411525125213275, + "grad_norm": 0.6754159331321716, + "learning_rate": 7.002458355914258e-06, + "loss": 0.6939, + "step": 13466 + }, + { + "epoch": 0.7412075513236832, + "grad_norm": 0.6713566184043884, + "learning_rate": 7.002061162755621e-06, + "loss": 0.7459, + "step": 13467 + }, + { + "epoch": 0.7412625901260389, + "grad_norm": 0.6475394368171692, + "learning_rate": 7.001663954550029e-06, + "loss": 0.7912, + "step": 13468 + }, + { + "epoch": 0.7413176289283945, + "grad_norm": 0.6577908992767334, + "learning_rate": 7.001266731300467e-06, + "loss": 0.6903, + "step": 13469 + }, + { + "epoch": 0.7413726677307502, + "grad_norm": 0.8129748106002808, + "learning_rate": 7.00086949300992e-06, + "loss": 0.8277, + "step": 13470 + }, + { + "epoch": 0.7414277065331059, + "grad_norm": 0.6730444431304932, + "learning_rate": 7.000472239681372e-06, + "loss": 0.7357, + "step": 13471 + }, + { + "epoch": 0.7414827453354615, + "grad_norm": 0.7166460156440735, + "learning_rate": 7.000074971317812e-06, + "loss": 0.7544, + "step": 13472 + }, + { + "epoch": 0.7415377841378171, + "grad_norm": 0.6668731570243835, + "learning_rate": 6.9996776879222225e-06, + "loss": 0.7073, + "step": 13473 + }, + { + "epoch": 0.7415928229401728, + "grad_norm": 0.7031315565109253, + "learning_rate": 6.999280389497591e-06, + "loss": 0.7262, + "step": 13474 + }, + { + "epoch": 0.7416478617425285, + "grad_norm": 0.7426775693893433, + "learning_rate": 6.998883076046904e-06, + "loss": 0.7394, + "step": 13475 + }, + { + "epoch": 0.7417029005448842, + "grad_norm": 0.665226399898529, + "learning_rate": 6.9984857475731475e-06, + "loss": 0.7365, + "step": 13476 + }, + { + "epoch": 0.7417579393472398, + "grad_norm": 0.7762128114700317, + "learning_rate": 6.998088404079306e-06, + "loss": 0.8551, + "step": 13477 + }, + { + "epoch": 0.7418129781495955, + "grad_norm": 0.7129524350166321, + "learning_rate": 6.997691045568366e-06, + "loss": 0.7646, + "step": 13478 + }, + { + "epoch": 0.7418680169519511, + "grad_norm": 0.7199442386627197, + "learning_rate": 6.997293672043316e-06, + "loss": 0.6879, + "step": 13479 + }, + { + "epoch": 0.7419230557543068, + "grad_norm": 0.6559237241744995, + "learning_rate": 6.9968962835071415e-06, + "loss": 0.6965, + "step": 13480 + }, + { + "epoch": 0.7419780945566624, + "grad_norm": 0.7428768277168274, + "learning_rate": 6.996498879962829e-06, + "loss": 0.7748, + "step": 13481 + }, + { + "epoch": 0.7420331333590181, + "grad_norm": 0.7344076633453369, + "learning_rate": 6.996101461413365e-06, + "loss": 0.6554, + "step": 13482 + }, + { + "epoch": 0.7420881721613738, + "grad_norm": 0.7080272436141968, + "learning_rate": 6.995704027861736e-06, + "loss": 0.7335, + "step": 13483 + }, + { + "epoch": 0.7421432109637295, + "grad_norm": 0.6296887397766113, + "learning_rate": 6.9953065793109306e-06, + "loss": 0.6411, + "step": 13484 + }, + { + "epoch": 0.7421982497660851, + "grad_norm": 0.7597532868385315, + "learning_rate": 6.994909115763935e-06, + "loss": 0.8281, + "step": 13485 + }, + { + "epoch": 0.7422532885684407, + "grad_norm": 0.7059680819511414, + "learning_rate": 6.994511637223737e-06, + "loss": 0.8075, + "step": 13486 + }, + { + "epoch": 0.7423083273707964, + "grad_norm": 0.8097653388977051, + "learning_rate": 6.994114143693323e-06, + "loss": 0.772, + "step": 13487 + }, + { + "epoch": 0.742363366173152, + "grad_norm": 0.7609913945198059, + "learning_rate": 6.993716635175681e-06, + "loss": 0.8265, + "step": 13488 + }, + { + "epoch": 0.7424184049755077, + "grad_norm": 0.6209948062896729, + "learning_rate": 6.993319111673799e-06, + "loss": 0.6266, + "step": 13489 + }, + { + "epoch": 0.7424734437778634, + "grad_norm": 0.6655107140541077, + "learning_rate": 6.992921573190663e-06, + "loss": 0.7519, + "step": 13490 + }, + { + "epoch": 0.7425284825802191, + "grad_norm": 1.1243617534637451, + "learning_rate": 6.992524019729262e-06, + "loss": 0.7707, + "step": 13491 + }, + { + "epoch": 0.7425835213825747, + "grad_norm": 0.6680326461791992, + "learning_rate": 6.9921264512925845e-06, + "loss": 0.7344, + "step": 13492 + }, + { + "epoch": 0.7426385601849304, + "grad_norm": 0.7689213156700134, + "learning_rate": 6.991728867883618e-06, + "loss": 0.7591, + "step": 13493 + }, + { + "epoch": 0.742693598987286, + "grad_norm": 0.8587394952774048, + "learning_rate": 6.99133126950535e-06, + "loss": 0.6991, + "step": 13494 + }, + { + "epoch": 0.7427486377896417, + "grad_norm": 0.6736756563186646, + "learning_rate": 6.990933656160768e-06, + "loss": 0.7604, + "step": 13495 + }, + { + "epoch": 0.7428036765919973, + "grad_norm": 0.6538887023925781, + "learning_rate": 6.990536027852864e-06, + "loss": 0.7332, + "step": 13496 + }, + { + "epoch": 0.742858715394353, + "grad_norm": 0.6578357815742493, + "learning_rate": 6.990138384584623e-06, + "loss": 0.7238, + "step": 13497 + }, + { + "epoch": 0.7429137541967087, + "grad_norm": 0.6865534782409668, + "learning_rate": 6.989740726359035e-06, + "loss": 0.7012, + "step": 13498 + }, + { + "epoch": 0.7429687929990644, + "grad_norm": 0.6198129057884216, + "learning_rate": 6.989343053179088e-06, + "loss": 0.7391, + "step": 13499 + }, + { + "epoch": 0.74302383180142, + "grad_norm": 0.6929547786712646, + "learning_rate": 6.98894536504777e-06, + "loss": 0.8498, + "step": 13500 + }, + { + "epoch": 0.7430788706037756, + "grad_norm": 0.6863006353378296, + "learning_rate": 6.988547661968072e-06, + "loss": 0.6589, + "step": 13501 + }, + { + "epoch": 0.7431339094061313, + "grad_norm": 0.7490457892417908, + "learning_rate": 6.988149943942982e-06, + "loss": 0.8145, + "step": 13502 + }, + { + "epoch": 0.743188948208487, + "grad_norm": 0.6597211360931396, + "learning_rate": 6.987752210975489e-06, + "loss": 0.7786, + "step": 13503 + }, + { + "epoch": 0.7432439870108426, + "grad_norm": 0.7211003303527832, + "learning_rate": 6.987354463068583e-06, + "loss": 0.7668, + "step": 13504 + }, + { + "epoch": 0.7432990258131983, + "grad_norm": 0.6257827877998352, + "learning_rate": 6.9869567002252526e-06, + "loss": 0.7378, + "step": 13505 + }, + { + "epoch": 0.743354064615554, + "grad_norm": 0.656944751739502, + "learning_rate": 6.986558922448488e-06, + "loss": 0.6408, + "step": 13506 + }, + { + "epoch": 0.7434091034179097, + "grad_norm": 0.6862110495567322, + "learning_rate": 6.986161129741276e-06, + "loss": 0.7648, + "step": 13507 + }, + { + "epoch": 0.7434641422202652, + "grad_norm": 0.6216374039649963, + "learning_rate": 6.985763322106612e-06, + "loss": 0.6826, + "step": 13508 + }, + { + "epoch": 0.7435191810226209, + "grad_norm": 0.7959128618240356, + "learning_rate": 6.985365499547479e-06, + "loss": 0.7554, + "step": 13509 + }, + { + "epoch": 0.7435742198249766, + "grad_norm": 0.5882300734519958, + "learning_rate": 6.984967662066875e-06, + "loss": 0.6523, + "step": 13510 + }, + { + "epoch": 0.7436292586273323, + "grad_norm": 0.8529833555221558, + "learning_rate": 6.9845698096677805e-06, + "loss": 0.7871, + "step": 13511 + }, + { + "epoch": 0.7436842974296879, + "grad_norm": 1.2988953590393066, + "learning_rate": 6.9841719423531925e-06, + "loss": 0.708, + "step": 13512 + }, + { + "epoch": 0.7437393362320436, + "grad_norm": 0.6735696792602539, + "learning_rate": 6.983774060126101e-06, + "loss": 0.7962, + "step": 13513 + }, + { + "epoch": 0.7437943750343993, + "grad_norm": 0.8145982623100281, + "learning_rate": 6.9833761629894925e-06, + "loss": 0.9067, + "step": 13514 + }, + { + "epoch": 0.743849413836755, + "grad_norm": 0.7107387781143188, + "learning_rate": 6.98297825094636e-06, + "loss": 0.7986, + "step": 13515 + }, + { + "epoch": 0.7439044526391105, + "grad_norm": 0.7350436449050903, + "learning_rate": 6.9825803239996934e-06, + "loss": 0.7724, + "step": 13516 + }, + { + "epoch": 0.7439594914414662, + "grad_norm": 0.7300962805747986, + "learning_rate": 6.982182382152485e-06, + "loss": 0.734, + "step": 13517 + }, + { + "epoch": 0.7440145302438219, + "grad_norm": 0.7088475823402405, + "learning_rate": 6.981784425407724e-06, + "loss": 0.818, + "step": 13518 + }, + { + "epoch": 0.7440695690461776, + "grad_norm": 0.6911785006523132, + "learning_rate": 6.981386453768402e-06, + "loss": 0.6857, + "step": 13519 + }, + { + "epoch": 0.7441246078485332, + "grad_norm": 0.794143795967102, + "learning_rate": 6.980988467237508e-06, + "loss": 0.7496, + "step": 13520 + }, + { + "epoch": 0.7441796466508889, + "grad_norm": 0.7116371989250183, + "learning_rate": 6.980590465818037e-06, + "loss": 0.7082, + "step": 13521 + }, + { + "epoch": 0.7442346854532446, + "grad_norm": 0.6306180953979492, + "learning_rate": 6.980192449512978e-06, + "loss": 0.7227, + "step": 13522 + }, + { + "epoch": 0.7442897242556002, + "grad_norm": 0.6662481427192688, + "learning_rate": 6.979794418325323e-06, + "loss": 0.7323, + "step": 13523 + }, + { + "epoch": 0.7443447630579558, + "grad_norm": 0.6824387907981873, + "learning_rate": 6.97939637225806e-06, + "loss": 0.7188, + "step": 13524 + }, + { + "epoch": 0.7443998018603115, + "grad_norm": 0.7429190278053284, + "learning_rate": 6.9789983113141865e-06, + "loss": 0.7818, + "step": 13525 + }, + { + "epoch": 0.7444548406626672, + "grad_norm": 0.7148364782333374, + "learning_rate": 6.978600235496692e-06, + "loss": 0.7665, + "step": 13526 + }, + { + "epoch": 0.7445098794650229, + "grad_norm": 0.711482584476471, + "learning_rate": 6.978202144808567e-06, + "loss": 0.7865, + "step": 13527 + }, + { + "epoch": 0.7445649182673785, + "grad_norm": 0.6913465857505798, + "learning_rate": 6.977804039252802e-06, + "loss": 0.8206, + "step": 13528 + }, + { + "epoch": 0.7446199570697342, + "grad_norm": 0.9090713858604431, + "learning_rate": 6.977405918832394e-06, + "loss": 0.7243, + "step": 13529 + }, + { + "epoch": 0.7446749958720899, + "grad_norm": 0.7680408954620361, + "learning_rate": 6.977007783550331e-06, + "loss": 0.847, + "step": 13530 + }, + { + "epoch": 0.7447300346744454, + "grad_norm": 0.6486232876777649, + "learning_rate": 6.976609633409608e-06, + "loss": 0.7258, + "step": 13531 + }, + { + "epoch": 0.7447850734768011, + "grad_norm": 0.7612336277961731, + "learning_rate": 6.976211468413214e-06, + "loss": 0.7452, + "step": 13532 + }, + { + "epoch": 0.7448401122791568, + "grad_norm": 0.7539309859275818, + "learning_rate": 6.975813288564146e-06, + "loss": 0.8292, + "step": 13533 + }, + { + "epoch": 0.7448951510815125, + "grad_norm": 0.64984530210495, + "learning_rate": 6.975415093865394e-06, + "loss": 0.6818, + "step": 13534 + }, + { + "epoch": 0.7449501898838681, + "grad_norm": 0.6415309309959412, + "learning_rate": 6.9750168843199506e-06, + "loss": 0.7369, + "step": 13535 + }, + { + "epoch": 0.7450052286862238, + "grad_norm": 0.7107319235801697, + "learning_rate": 6.974618659930807e-06, + "loss": 0.7364, + "step": 13536 + }, + { + "epoch": 0.7450602674885795, + "grad_norm": 0.7358448505401611, + "learning_rate": 6.9742204207009605e-06, + "loss": 0.7784, + "step": 13537 + }, + { + "epoch": 0.7451153062909351, + "grad_norm": 0.6950068473815918, + "learning_rate": 6.9738221666334e-06, + "loss": 0.792, + "step": 13538 + }, + { + "epoch": 0.7451703450932907, + "grad_norm": 0.7355311512947083, + "learning_rate": 6.973423897731122e-06, + "loss": 0.7631, + "step": 13539 + }, + { + "epoch": 0.7452253838956464, + "grad_norm": 0.6813983917236328, + "learning_rate": 6.9730256139971175e-06, + "loss": 0.7397, + "step": 13540 + }, + { + "epoch": 0.7452804226980021, + "grad_norm": 0.7698497772216797, + "learning_rate": 6.9726273154343806e-06, + "loss": 0.7769, + "step": 13541 + }, + { + "epoch": 0.7453354615003578, + "grad_norm": 0.7406428456306458, + "learning_rate": 6.972229002045905e-06, + "loss": 0.6502, + "step": 13542 + }, + { + "epoch": 0.7453905003027134, + "grad_norm": 0.6976667046546936, + "learning_rate": 6.9718306738346846e-06, + "loss": 0.773, + "step": 13543 + }, + { + "epoch": 0.745445539105069, + "grad_norm": 0.6932592391967773, + "learning_rate": 6.9714323308037115e-06, + "loss": 0.7315, + "step": 13544 + }, + { + "epoch": 0.7455005779074247, + "grad_norm": 0.7329851984977722, + "learning_rate": 6.971033972955981e-06, + "loss": 0.7432, + "step": 13545 + }, + { + "epoch": 0.7455556167097804, + "grad_norm": 0.6262860298156738, + "learning_rate": 6.970635600294489e-06, + "loss": 0.6368, + "step": 13546 + }, + { + "epoch": 0.745610655512136, + "grad_norm": 0.7157273292541504, + "learning_rate": 6.970237212822225e-06, + "loss": 0.7209, + "step": 13547 + }, + { + "epoch": 0.7456656943144917, + "grad_norm": 0.7256374955177307, + "learning_rate": 6.9698388105421855e-06, + "loss": 0.794, + "step": 13548 + }, + { + "epoch": 0.7457207331168474, + "grad_norm": 0.7763124704360962, + "learning_rate": 6.969440393457365e-06, + "loss": 0.7211, + "step": 13549 + }, + { + "epoch": 0.7457757719192031, + "grad_norm": 0.7139148712158203, + "learning_rate": 6.9690419615707585e-06, + "loss": 0.6612, + "step": 13550 + }, + { + "epoch": 0.7458308107215587, + "grad_norm": 0.7532974481582642, + "learning_rate": 6.968643514885359e-06, + "loss": 0.6952, + "step": 13551 + }, + { + "epoch": 0.7458858495239143, + "grad_norm": 0.6845714449882507, + "learning_rate": 6.968245053404161e-06, + "loss": 0.6972, + "step": 13552 + }, + { + "epoch": 0.74594088832627, + "grad_norm": 0.7445462346076965, + "learning_rate": 6.967846577130162e-06, + "loss": 0.7826, + "step": 13553 + }, + { + "epoch": 0.7459959271286257, + "grad_norm": 0.7269366383552551, + "learning_rate": 6.967448086066353e-06, + "loss": 0.7353, + "step": 13554 + }, + { + "epoch": 0.7460509659309813, + "grad_norm": 0.7366362810134888, + "learning_rate": 6.967049580215732e-06, + "loss": 0.7955, + "step": 13555 + }, + { + "epoch": 0.746106004733337, + "grad_norm": 0.6456870436668396, + "learning_rate": 6.966651059581292e-06, + "loss": 0.7467, + "step": 13556 + }, + { + "epoch": 0.7461610435356927, + "grad_norm": 0.7196624279022217, + "learning_rate": 6.966252524166031e-06, + "loss": 0.6621, + "step": 13557 + }, + { + "epoch": 0.7462160823380484, + "grad_norm": 0.6776413917541504, + "learning_rate": 6.965853973972941e-06, + "loss": 0.7647, + "step": 13558 + }, + { + "epoch": 0.746271121140404, + "grad_norm": 0.7319629192352295, + "learning_rate": 6.9654554090050195e-06, + "loss": 0.8172, + "step": 13559 + }, + { + "epoch": 0.7463261599427596, + "grad_norm": 0.6995210647583008, + "learning_rate": 6.96505682926526e-06, + "loss": 0.7252, + "step": 13560 + }, + { + "epoch": 0.7463811987451153, + "grad_norm": 0.6520518064498901, + "learning_rate": 6.964658234756659e-06, + "loss": 0.6856, + "step": 13561 + }, + { + "epoch": 0.746436237547471, + "grad_norm": 0.7562724947929382, + "learning_rate": 6.964259625482215e-06, + "loss": 0.7088, + "step": 13562 + }, + { + "epoch": 0.7464912763498266, + "grad_norm": 0.788045346736908, + "learning_rate": 6.963861001444919e-06, + "loss": 0.7183, + "step": 13563 + }, + { + "epoch": 0.7465463151521823, + "grad_norm": 0.7461729049682617, + "learning_rate": 6.96346236264777e-06, + "loss": 0.6725, + "step": 13564 + }, + { + "epoch": 0.746601353954538, + "grad_norm": 0.7283952832221985, + "learning_rate": 6.963063709093764e-06, + "loss": 0.7765, + "step": 13565 + }, + { + "epoch": 0.7466563927568937, + "grad_norm": 0.7947741150856018, + "learning_rate": 6.962665040785896e-06, + "loss": 0.8423, + "step": 13566 + }, + { + "epoch": 0.7467114315592492, + "grad_norm": 0.7964398264884949, + "learning_rate": 6.962266357727164e-06, + "loss": 0.7589, + "step": 13567 + }, + { + "epoch": 0.7467664703616049, + "grad_norm": 0.7807595133781433, + "learning_rate": 6.961867659920563e-06, + "loss": 0.7843, + "step": 13568 + }, + { + "epoch": 0.7468215091639606, + "grad_norm": 0.678011417388916, + "learning_rate": 6.961468947369089e-06, + "loss": 0.6664, + "step": 13569 + }, + { + "epoch": 0.7468765479663163, + "grad_norm": 0.6768447756767273, + "learning_rate": 6.961070220075741e-06, + "loss": 0.7531, + "step": 13570 + }, + { + "epoch": 0.7469315867686719, + "grad_norm": 0.7405245304107666, + "learning_rate": 6.960671478043514e-06, + "loss": 0.8278, + "step": 13571 + }, + { + "epoch": 0.7469866255710276, + "grad_norm": 0.605675458908081, + "learning_rate": 6.960272721275403e-06, + "loss": 0.7167, + "step": 13572 + }, + { + "epoch": 0.7470416643733833, + "grad_norm": 0.7406657338142395, + "learning_rate": 6.959873949774409e-06, + "loss": 0.8191, + "step": 13573 + }, + { + "epoch": 0.7470967031757388, + "grad_norm": 0.6163522601127625, + "learning_rate": 6.959475163543526e-06, + "loss": 0.6711, + "step": 13574 + }, + { + "epoch": 0.7471517419780945, + "grad_norm": 0.6036590337753296, + "learning_rate": 6.9590763625857525e-06, + "loss": 0.7029, + "step": 13575 + }, + { + "epoch": 0.7472067807804502, + "grad_norm": 0.8638957738876343, + "learning_rate": 6.9586775469040845e-06, + "loss": 0.6288, + "step": 13576 + }, + { + "epoch": 0.7472618195828059, + "grad_norm": 0.7490845322608948, + "learning_rate": 6.958278716501521e-06, + "loss": 0.7375, + "step": 13577 + }, + { + "epoch": 0.7473168583851615, + "grad_norm": 0.7788114547729492, + "learning_rate": 6.957879871381059e-06, + "loss": 0.814, + "step": 13578 + }, + { + "epoch": 0.7473718971875172, + "grad_norm": 0.7247292995452881, + "learning_rate": 6.957481011545697e-06, + "loss": 0.6187, + "step": 13579 + }, + { + "epoch": 0.7474269359898729, + "grad_norm": 0.9642785787582397, + "learning_rate": 6.95708213699843e-06, + "loss": 0.8745, + "step": 13580 + }, + { + "epoch": 0.7474819747922286, + "grad_norm": 0.701675295829773, + "learning_rate": 6.956683247742259e-06, + "loss": 0.8474, + "step": 13581 + }, + { + "epoch": 0.7475370135945841, + "grad_norm": 0.6338050961494446, + "learning_rate": 6.9562843437801795e-06, + "loss": 0.7346, + "step": 13582 + }, + { + "epoch": 0.7475920523969398, + "grad_norm": 0.6954126358032227, + "learning_rate": 6.955885425115191e-06, + "loss": 0.8083, + "step": 13583 + }, + { + "epoch": 0.7476470911992955, + "grad_norm": 0.7316300272941589, + "learning_rate": 6.95548649175029e-06, + "loss": 0.8009, + "step": 13584 + }, + { + "epoch": 0.7477021300016512, + "grad_norm": 0.6314196586608887, + "learning_rate": 6.955087543688477e-06, + "loss": 0.6375, + "step": 13585 + }, + { + "epoch": 0.7477571688040068, + "grad_norm": 0.6604906320571899, + "learning_rate": 6.9546885809327495e-06, + "loss": 0.7081, + "step": 13586 + }, + { + "epoch": 0.7478122076063625, + "grad_norm": 0.8251973986625671, + "learning_rate": 6.9542896034861064e-06, + "loss": 0.7483, + "step": 13587 + }, + { + "epoch": 0.7478672464087182, + "grad_norm": 0.6946399211883545, + "learning_rate": 6.953890611351544e-06, + "loss": 0.8849, + "step": 13588 + }, + { + "epoch": 0.7479222852110738, + "grad_norm": 0.7713609933853149, + "learning_rate": 6.953491604532063e-06, + "loss": 0.7913, + "step": 13589 + }, + { + "epoch": 0.7479773240134294, + "grad_norm": 0.734355092048645, + "learning_rate": 6.953092583030664e-06, + "loss": 0.7216, + "step": 13590 + }, + { + "epoch": 0.7480323628157851, + "grad_norm": 0.6147064566612244, + "learning_rate": 6.952693546850342e-06, + "loss": 0.6894, + "step": 13591 + }, + { + "epoch": 0.7480874016181408, + "grad_norm": 0.7472255229949951, + "learning_rate": 6.9522944959940986e-06, + "loss": 0.7941, + "step": 13592 + }, + { + "epoch": 0.7481424404204965, + "grad_norm": 0.6478431224822998, + "learning_rate": 6.951895430464935e-06, + "loss": 0.6995, + "step": 13593 + }, + { + "epoch": 0.7481974792228521, + "grad_norm": 0.6956225633621216, + "learning_rate": 6.951496350265844e-06, + "loss": 0.7637, + "step": 13594 + }, + { + "epoch": 0.7482525180252078, + "grad_norm": 1.0637938976287842, + "learning_rate": 6.95109725539983e-06, + "loss": 0.7448, + "step": 13595 + }, + { + "epoch": 0.7483075568275634, + "grad_norm": 0.6948299407958984, + "learning_rate": 6.9506981458698916e-06, + "loss": 0.7343, + "step": 13596 + }, + { + "epoch": 0.7483625956299191, + "grad_norm": 0.9034255743026733, + "learning_rate": 6.950299021679028e-06, + "loss": 0.6481, + "step": 13597 + }, + { + "epoch": 0.7484176344322747, + "grad_norm": 0.7901731729507446, + "learning_rate": 6.949899882830239e-06, + "loss": 0.8368, + "step": 13598 + }, + { + "epoch": 0.7484726732346304, + "grad_norm": 0.7791730761528015, + "learning_rate": 6.949500729326525e-06, + "loss": 0.7912, + "step": 13599 + }, + { + "epoch": 0.7485277120369861, + "grad_norm": 0.7678626179695129, + "learning_rate": 6.949101561170883e-06, + "loss": 0.7514, + "step": 13600 + }, + { + "epoch": 0.7485827508393418, + "grad_norm": 0.709762454032898, + "learning_rate": 6.948702378366318e-06, + "loss": 0.6809, + "step": 13601 + }, + { + "epoch": 0.7486377896416974, + "grad_norm": 0.706031084060669, + "learning_rate": 6.948303180915827e-06, + "loss": 0.7454, + "step": 13602 + }, + { + "epoch": 0.748692828444053, + "grad_norm": 0.658869743347168, + "learning_rate": 6.9479039688224105e-06, + "loss": 0.6498, + "step": 13603 + }, + { + "epoch": 0.7487478672464087, + "grad_norm": 0.7253865599632263, + "learning_rate": 6.9475047420890685e-06, + "loss": 0.8063, + "step": 13604 + }, + { + "epoch": 0.7488029060487644, + "grad_norm": 0.752839207649231, + "learning_rate": 6.947105500718804e-06, + "loss": 0.7708, + "step": 13605 + }, + { + "epoch": 0.74885794485112, + "grad_norm": 0.6694571375846863, + "learning_rate": 6.946706244714615e-06, + "loss": 0.7121, + "step": 13606 + }, + { + "epoch": 0.7489129836534757, + "grad_norm": 0.751380443572998, + "learning_rate": 6.946306974079503e-06, + "loss": 0.8797, + "step": 13607 + }, + { + "epoch": 0.7489680224558314, + "grad_norm": 0.8001984357833862, + "learning_rate": 6.9459076888164676e-06, + "loss": 0.8963, + "step": 13608 + }, + { + "epoch": 0.7490230612581871, + "grad_norm": 0.7149432301521301, + "learning_rate": 6.945508388928511e-06, + "loss": 0.8311, + "step": 13609 + }, + { + "epoch": 0.7490781000605427, + "grad_norm": 0.8295183777809143, + "learning_rate": 6.945109074418635e-06, + "loss": 0.7466, + "step": 13610 + }, + { + "epoch": 0.7491331388628983, + "grad_norm": 0.7480556964874268, + "learning_rate": 6.94470974528984e-06, + "loss": 0.8277, + "step": 13611 + }, + { + "epoch": 0.749188177665254, + "grad_norm": 0.7962234616279602, + "learning_rate": 6.944310401545127e-06, + "loss": 0.7143, + "step": 13612 + }, + { + "epoch": 0.7492432164676097, + "grad_norm": 0.7722699642181396, + "learning_rate": 6.943911043187497e-06, + "loss": 0.6619, + "step": 13613 + }, + { + "epoch": 0.7492982552699653, + "grad_norm": 0.8495624661445618, + "learning_rate": 6.943511670219952e-06, + "loss": 0.8475, + "step": 13614 + }, + { + "epoch": 0.749353294072321, + "grad_norm": 0.7702826261520386, + "learning_rate": 6.943112282645494e-06, + "loss": 0.826, + "step": 13615 + }, + { + "epoch": 0.7494083328746767, + "grad_norm": 0.7435297966003418, + "learning_rate": 6.942712880467124e-06, + "loss": 0.8121, + "step": 13616 + }, + { + "epoch": 0.7494633716770323, + "grad_norm": 0.8108325600624084, + "learning_rate": 6.942313463687844e-06, + "loss": 0.7282, + "step": 13617 + }, + { + "epoch": 0.7495184104793879, + "grad_norm": 0.6840381622314453, + "learning_rate": 6.9419140323106574e-06, + "loss": 0.7446, + "step": 13618 + }, + { + "epoch": 0.7495734492817436, + "grad_norm": 0.7155357599258423, + "learning_rate": 6.941514586338562e-06, + "loss": 0.7598, + "step": 13619 + }, + { + "epoch": 0.7496284880840993, + "grad_norm": 0.7693290114402771, + "learning_rate": 6.941115125774564e-06, + "loss": 0.7666, + "step": 13620 + }, + { + "epoch": 0.7496835268864549, + "grad_norm": 0.6918750405311584, + "learning_rate": 6.940715650621665e-06, + "loss": 0.6831, + "step": 13621 + }, + { + "epoch": 0.7497385656888106, + "grad_norm": 0.8241471648216248, + "learning_rate": 6.9403161608828654e-06, + "loss": 0.6753, + "step": 13622 + }, + { + "epoch": 0.7497936044911663, + "grad_norm": 0.6659193634986877, + "learning_rate": 6.93991665656117e-06, + "loss": 0.6988, + "step": 13623 + }, + { + "epoch": 0.749848643293522, + "grad_norm": 0.8012998700141907, + "learning_rate": 6.9395171376595795e-06, + "loss": 0.7922, + "step": 13624 + }, + { + "epoch": 0.7499036820958775, + "grad_norm": 0.783018946647644, + "learning_rate": 6.9391176041810974e-06, + "loss": 0.7062, + "step": 13625 + }, + { + "epoch": 0.7499587208982332, + "grad_norm": 0.8228014707565308, + "learning_rate": 6.938718056128726e-06, + "loss": 0.7762, + "step": 13626 + }, + { + "epoch": 0.7500137597005889, + "grad_norm": 0.783525288105011, + "learning_rate": 6.9383184935054705e-06, + "loss": 0.7517, + "step": 13627 + }, + { + "epoch": 0.7500687985029446, + "grad_norm": 0.6686612963676453, + "learning_rate": 6.93791891631433e-06, + "loss": 0.7372, + "step": 13628 + }, + { + "epoch": 0.7501238373053002, + "grad_norm": 0.7089647054672241, + "learning_rate": 6.937519324558312e-06, + "loss": 0.7847, + "step": 13629 + }, + { + "epoch": 0.7501788761076559, + "grad_norm": 0.7674399018287659, + "learning_rate": 6.937119718240415e-06, + "loss": 0.7414, + "step": 13630 + }, + { + "epoch": 0.7502339149100116, + "grad_norm": 0.6331565380096436, + "learning_rate": 6.936720097363646e-06, + "loss": 0.7603, + "step": 13631 + }, + { + "epoch": 0.7502889537123673, + "grad_norm": 0.7084798812866211, + "learning_rate": 6.9363204619310065e-06, + "loss": 0.6844, + "step": 13632 + }, + { + "epoch": 0.7503439925147228, + "grad_norm": 0.8624362945556641, + "learning_rate": 6.9359208119455015e-06, + "loss": 0.7098, + "step": 13633 + }, + { + "epoch": 0.7503990313170785, + "grad_norm": 0.7681849598884583, + "learning_rate": 6.935521147410134e-06, + "loss": 0.7896, + "step": 13634 + }, + { + "epoch": 0.7504540701194342, + "grad_norm": 0.7494263052940369, + "learning_rate": 6.935121468327907e-06, + "loss": 0.7858, + "step": 13635 + }, + { + "epoch": 0.7505091089217899, + "grad_norm": 0.7102827429771423, + "learning_rate": 6.934721774701824e-06, + "loss": 0.7485, + "step": 13636 + }, + { + "epoch": 0.7505641477241455, + "grad_norm": 0.7031061053276062, + "learning_rate": 6.934322066534891e-06, + "loss": 0.7154, + "step": 13637 + }, + { + "epoch": 0.7506191865265012, + "grad_norm": 0.6468148231506348, + "learning_rate": 6.933922343830112e-06, + "loss": 0.729, + "step": 13638 + }, + { + "epoch": 0.7506742253288569, + "grad_norm": 0.8570408225059509, + "learning_rate": 6.933522606590489e-06, + "loss": 0.6922, + "step": 13639 + }, + { + "epoch": 0.7507292641312125, + "grad_norm": 0.6836286783218384, + "learning_rate": 6.933122854819027e-06, + "loss": 0.7982, + "step": 13640 + }, + { + "epoch": 0.7507843029335681, + "grad_norm": 1.052017092704773, + "learning_rate": 6.9327230885187344e-06, + "loss": 0.7522, + "step": 13641 + }, + { + "epoch": 0.7508393417359238, + "grad_norm": 0.6352099180221558, + "learning_rate": 6.932323307692611e-06, + "loss": 0.6724, + "step": 13642 + }, + { + "epoch": 0.7508943805382795, + "grad_norm": 0.7046655416488647, + "learning_rate": 6.931923512343663e-06, + "loss": 0.7732, + "step": 13643 + }, + { + "epoch": 0.7509494193406352, + "grad_norm": 0.7600587010383606, + "learning_rate": 6.931523702474893e-06, + "loss": 0.7013, + "step": 13644 + }, + { + "epoch": 0.7510044581429908, + "grad_norm": 0.674828052520752, + "learning_rate": 6.9311238780893095e-06, + "loss": 0.7022, + "step": 13645 + }, + { + "epoch": 0.7510594969453465, + "grad_norm": 0.7517798542976379, + "learning_rate": 6.930724039189916e-06, + "loss": 0.7248, + "step": 13646 + }, + { + "epoch": 0.7511145357477022, + "grad_norm": 0.7851112484931946, + "learning_rate": 6.930324185779716e-06, + "loss": 0.8025, + "step": 13647 + }, + { + "epoch": 0.7511695745500578, + "grad_norm": 0.6545413732528687, + "learning_rate": 6.929924317861717e-06, + "loss": 0.781, + "step": 13648 + }, + { + "epoch": 0.7512246133524134, + "grad_norm": 0.7079984545707703, + "learning_rate": 6.929524435438923e-06, + "loss": 0.8033, + "step": 13649 + }, + { + "epoch": 0.7512796521547691, + "grad_norm": 0.6501914262771606, + "learning_rate": 6.929124538514341e-06, + "loss": 0.7525, + "step": 13650 + }, + { + "epoch": 0.7513346909571248, + "grad_norm": 0.7697597742080688, + "learning_rate": 6.928724627090975e-06, + "loss": 0.7358, + "step": 13651 + }, + { + "epoch": 0.7513897297594805, + "grad_norm": 0.8155171275138855, + "learning_rate": 6.928324701171832e-06, + "loss": 0.7389, + "step": 13652 + }, + { + "epoch": 0.7514447685618361, + "grad_norm": 0.6969262361526489, + "learning_rate": 6.927924760759914e-06, + "loss": 0.8349, + "step": 13653 + }, + { + "epoch": 0.7514998073641918, + "grad_norm": 0.6736776828765869, + "learning_rate": 6.927524805858233e-06, + "loss": 0.7379, + "step": 13654 + }, + { + "epoch": 0.7515548461665474, + "grad_norm": 0.6362389922142029, + "learning_rate": 6.927124836469788e-06, + "loss": 0.7479, + "step": 13655 + }, + { + "epoch": 0.7516098849689031, + "grad_norm": 0.688922643661499, + "learning_rate": 6.92672485259759e-06, + "loss": 0.7828, + "step": 13656 + }, + { + "epoch": 0.7516649237712587, + "grad_norm": 0.7098214030265808, + "learning_rate": 6.926324854244644e-06, + "loss": 0.6084, + "step": 13657 + }, + { + "epoch": 0.7517199625736144, + "grad_norm": 0.6436209678649902, + "learning_rate": 6.925924841413956e-06, + "loss": 0.687, + "step": 13658 + }, + { + "epoch": 0.7517750013759701, + "grad_norm": 0.6051730513572693, + "learning_rate": 6.925524814108533e-06, + "loss": 0.6884, + "step": 13659 + }, + { + "epoch": 0.7518300401783257, + "grad_norm": 0.6347759962081909, + "learning_rate": 6.92512477233138e-06, + "loss": 0.7057, + "step": 13660 + }, + { + "epoch": 0.7518850789806814, + "grad_norm": 0.6917054653167725, + "learning_rate": 6.924724716085505e-06, + "loss": 0.8374, + "step": 13661 + }, + { + "epoch": 0.751940117783037, + "grad_norm": 0.7676698565483093, + "learning_rate": 6.924324645373914e-06, + "loss": 0.7435, + "step": 13662 + }, + { + "epoch": 0.7519951565853927, + "grad_norm": 0.6601388454437256, + "learning_rate": 6.923924560199613e-06, + "loss": 0.7168, + "step": 13663 + }, + { + "epoch": 0.7520501953877483, + "grad_norm": 0.6342683434486389, + "learning_rate": 6.923524460565611e-06, + "loss": 0.7382, + "step": 13664 + }, + { + "epoch": 0.752105234190104, + "grad_norm": 0.6703974604606628, + "learning_rate": 6.923124346474915e-06, + "loss": 0.7687, + "step": 13665 + }, + { + "epoch": 0.7521602729924597, + "grad_norm": 0.6937074661254883, + "learning_rate": 6.922724217930531e-06, + "loss": 0.7687, + "step": 13666 + }, + { + "epoch": 0.7522153117948154, + "grad_norm": 0.7919568419456482, + "learning_rate": 6.922324074935466e-06, + "loss": 0.7328, + "step": 13667 + }, + { + "epoch": 0.752270350597171, + "grad_norm": 0.668331503868103, + "learning_rate": 6.9219239174927275e-06, + "loss": 0.7654, + "step": 13668 + }, + { + "epoch": 0.7523253893995266, + "grad_norm": 0.6298941969871521, + "learning_rate": 6.921523745605323e-06, + "loss": 0.719, + "step": 13669 + }, + { + "epoch": 0.7523804282018823, + "grad_norm": 0.6539381146430969, + "learning_rate": 6.921123559276262e-06, + "loss": 0.6681, + "step": 13670 + }, + { + "epoch": 0.752435467004238, + "grad_norm": 1.0692330598831177, + "learning_rate": 6.920723358508548e-06, + "loss": 0.7914, + "step": 13671 + }, + { + "epoch": 0.7524905058065936, + "grad_norm": 0.7410482168197632, + "learning_rate": 6.920323143305193e-06, + "loss": 0.8331, + "step": 13672 + }, + { + "epoch": 0.7525455446089493, + "grad_norm": 0.6976327300071716, + "learning_rate": 6.919922913669203e-06, + "loss": 0.8131, + "step": 13673 + }, + { + "epoch": 0.752600583411305, + "grad_norm": 0.646442174911499, + "learning_rate": 6.919522669603587e-06, + "loss": 0.7658, + "step": 13674 + }, + { + "epoch": 0.7526556222136607, + "grad_norm": 0.6257727146148682, + "learning_rate": 6.919122411111352e-06, + "loss": 0.666, + "step": 13675 + }, + { + "epoch": 0.7527106610160162, + "grad_norm": 0.6913230419158936, + "learning_rate": 6.918722138195506e-06, + "loss": 0.6935, + "step": 13676 + }, + { + "epoch": 0.7527656998183719, + "grad_norm": 0.6282557249069214, + "learning_rate": 6.918321850859059e-06, + "loss": 0.7042, + "step": 13677 + }, + { + "epoch": 0.7528207386207276, + "grad_norm": 0.6980175971984863, + "learning_rate": 6.917921549105018e-06, + "loss": 0.6757, + "step": 13678 + }, + { + "epoch": 0.7528757774230833, + "grad_norm": 0.6954337954521179, + "learning_rate": 6.917521232936393e-06, + "loss": 0.729, + "step": 13679 + }, + { + "epoch": 0.7529308162254389, + "grad_norm": 0.6813758015632629, + "learning_rate": 6.91712090235619e-06, + "loss": 0.6964, + "step": 13680 + }, + { + "epoch": 0.7529858550277946, + "grad_norm": 1.0940780639648438, + "learning_rate": 6.916720557367419e-06, + "loss": 0.7853, + "step": 13681 + }, + { + "epoch": 0.7530408938301503, + "grad_norm": 0.6899382472038269, + "learning_rate": 6.9163201979730906e-06, + "loss": 0.7639, + "step": 13682 + }, + { + "epoch": 0.753095932632506, + "grad_norm": 0.660252034664154, + "learning_rate": 6.915919824176213e-06, + "loss": 0.7068, + "step": 13683 + }, + { + "epoch": 0.7531509714348615, + "grad_norm": 0.6454583406448364, + "learning_rate": 6.915519435979795e-06, + "loss": 0.7268, + "step": 13684 + }, + { + "epoch": 0.7532060102372172, + "grad_norm": 0.7292754650115967, + "learning_rate": 6.915119033386843e-06, + "loss": 0.8131, + "step": 13685 + }, + { + "epoch": 0.7532610490395729, + "grad_norm": 0.6312932372093201, + "learning_rate": 6.914718616400372e-06, + "loss": 0.6977, + "step": 13686 + }, + { + "epoch": 0.7533160878419286, + "grad_norm": 0.8528029322624207, + "learning_rate": 6.914318185023388e-06, + "loss": 0.8403, + "step": 13687 + }, + { + "epoch": 0.7533711266442842, + "grad_norm": 0.758721649646759, + "learning_rate": 6.9139177392589e-06, + "loss": 0.7, + "step": 13688 + }, + { + "epoch": 0.7534261654466399, + "grad_norm": 0.6678142547607422, + "learning_rate": 6.913517279109919e-06, + "loss": 0.6251, + "step": 13689 + }, + { + "epoch": 0.7534812042489956, + "grad_norm": 0.6136146783828735, + "learning_rate": 6.913116804579455e-06, + "loss": 0.653, + "step": 13690 + }, + { + "epoch": 0.7535362430513513, + "grad_norm": 0.7546648383140564, + "learning_rate": 6.912716315670517e-06, + "loss": 0.8202, + "step": 13691 + }, + { + "epoch": 0.7535912818537068, + "grad_norm": 0.7232012152671814, + "learning_rate": 6.912315812386114e-06, + "loss": 0.7993, + "step": 13692 + }, + { + "epoch": 0.7536463206560625, + "grad_norm": 0.7288710474967957, + "learning_rate": 6.911915294729258e-06, + "loss": 0.7702, + "step": 13693 + }, + { + "epoch": 0.7537013594584182, + "grad_norm": 0.6847403049468994, + "learning_rate": 6.9115147627029575e-06, + "loss": 0.8141, + "step": 13694 + }, + { + "epoch": 0.7537563982607739, + "grad_norm": 0.62345951795578, + "learning_rate": 6.9111142163102255e-06, + "loss": 0.6832, + "step": 13695 + }, + { + "epoch": 0.7538114370631295, + "grad_norm": 0.7275232672691345, + "learning_rate": 6.9107136555540695e-06, + "loss": 0.7548, + "step": 13696 + }, + { + "epoch": 0.7538664758654852, + "grad_norm": 0.6724695563316345, + "learning_rate": 6.910313080437501e-06, + "loss": 0.7755, + "step": 13697 + }, + { + "epoch": 0.7539215146678409, + "grad_norm": 0.8446974754333496, + "learning_rate": 6.90991249096353e-06, + "loss": 0.827, + "step": 13698 + }, + { + "epoch": 0.7539765534701965, + "grad_norm": 0.7124913930892944, + "learning_rate": 6.9095118871351705e-06, + "loss": 0.7463, + "step": 13699 + }, + { + "epoch": 0.7540315922725521, + "grad_norm": 0.6916043162345886, + "learning_rate": 6.90911126895543e-06, + "loss": 0.714, + "step": 13700 + }, + { + "epoch": 0.7540866310749078, + "grad_norm": 0.7585330009460449, + "learning_rate": 6.908710636427319e-06, + "loss": 0.6731, + "step": 13701 + }, + { + "epoch": 0.7541416698772635, + "grad_norm": 0.6905520558357239, + "learning_rate": 6.90830998955385e-06, + "loss": 0.726, + "step": 13702 + }, + { + "epoch": 0.7541967086796191, + "grad_norm": 0.7482494115829468, + "learning_rate": 6.907909328338035e-06, + "loss": 0.7269, + "step": 13703 + }, + { + "epoch": 0.7542517474819748, + "grad_norm": 0.7565957307815552, + "learning_rate": 6.907508652782884e-06, + "loss": 0.6959, + "step": 13704 + }, + { + "epoch": 0.7543067862843305, + "grad_norm": 0.7458370923995972, + "learning_rate": 6.9071079628914075e-06, + "loss": 0.7448, + "step": 13705 + }, + { + "epoch": 0.7543618250866861, + "grad_norm": 1.3538293838500977, + "learning_rate": 6.9067072586666185e-06, + "loss": 0.8164, + "step": 13706 + }, + { + "epoch": 0.7544168638890417, + "grad_norm": 0.6217493414878845, + "learning_rate": 6.906306540111528e-06, + "loss": 0.7001, + "step": 13707 + }, + { + "epoch": 0.7544719026913974, + "grad_norm": 0.6862730383872986, + "learning_rate": 6.9059058072291485e-06, + "loss": 0.7921, + "step": 13708 + }, + { + "epoch": 0.7545269414937531, + "grad_norm": 0.6684688925743103, + "learning_rate": 6.905505060022491e-06, + "loss": 0.6736, + "step": 13709 + }, + { + "epoch": 0.7545819802961088, + "grad_norm": 0.6581160426139832, + "learning_rate": 6.905104298494567e-06, + "loss": 0.7581, + "step": 13710 + }, + { + "epoch": 0.7546370190984644, + "grad_norm": 0.7772610783576965, + "learning_rate": 6.9047035226483885e-06, + "loss": 0.7984, + "step": 13711 + }, + { + "epoch": 0.7546920579008201, + "grad_norm": 0.6856822371482849, + "learning_rate": 6.90430273248697e-06, + "loss": 0.8232, + "step": 13712 + }, + { + "epoch": 0.7547470967031757, + "grad_norm": 0.7250725626945496, + "learning_rate": 6.903901928013322e-06, + "loss": 0.7844, + "step": 13713 + }, + { + "epoch": 0.7548021355055314, + "grad_norm": 0.7034164667129517, + "learning_rate": 6.9035011092304545e-06, + "loss": 0.8293, + "step": 13714 + }, + { + "epoch": 0.754857174307887, + "grad_norm": 0.6783095002174377, + "learning_rate": 6.903100276141383e-06, + "loss": 0.6841, + "step": 13715 + }, + { + "epoch": 0.7549122131102427, + "grad_norm": 0.6180121302604675, + "learning_rate": 6.90269942874912e-06, + "loss": 0.7111, + "step": 13716 + }, + { + "epoch": 0.7549672519125984, + "grad_norm": 0.70428466796875, + "learning_rate": 6.902298567056677e-06, + "loss": 0.8758, + "step": 13717 + }, + { + "epoch": 0.7550222907149541, + "grad_norm": 0.8130238652229309, + "learning_rate": 6.9018976910670665e-06, + "loss": 0.6443, + "step": 13718 + }, + { + "epoch": 0.7550773295173097, + "grad_norm": 0.6910800933837891, + "learning_rate": 6.901496800783302e-06, + "loss": 0.7231, + "step": 13719 + }, + { + "epoch": 0.7551323683196653, + "grad_norm": 0.700933575630188, + "learning_rate": 6.901095896208398e-06, + "loss": 0.6785, + "step": 13720 + }, + { + "epoch": 0.755187407122021, + "grad_norm": 0.7407829761505127, + "learning_rate": 6.9006949773453656e-06, + "loss": 0.694, + "step": 13721 + }, + { + "epoch": 0.7552424459243767, + "grad_norm": 0.7907935380935669, + "learning_rate": 6.900294044197218e-06, + "loss": 0.7674, + "step": 13722 + }, + { + "epoch": 0.7552974847267323, + "grad_norm": 0.6585111021995544, + "learning_rate": 6.89989309676697e-06, + "loss": 0.6785, + "step": 13723 + }, + { + "epoch": 0.755352523529088, + "grad_norm": 0.7611724138259888, + "learning_rate": 6.899492135057633e-06, + "loss": 0.8028, + "step": 13724 + }, + { + "epoch": 0.7554075623314437, + "grad_norm": 0.6412070989608765, + "learning_rate": 6.899091159072222e-06, + "loss": 0.7634, + "step": 13725 + }, + { + "epoch": 0.7554626011337994, + "grad_norm": 0.7712366580963135, + "learning_rate": 6.898690168813751e-06, + "loss": 0.8275, + "step": 13726 + }, + { + "epoch": 0.755517639936155, + "grad_norm": 0.6826579570770264, + "learning_rate": 6.898289164285232e-06, + "loss": 0.7949, + "step": 13727 + }, + { + "epoch": 0.7555726787385106, + "grad_norm": 0.7501955628395081, + "learning_rate": 6.897888145489681e-06, + "loss": 0.7846, + "step": 13728 + }, + { + "epoch": 0.7556277175408663, + "grad_norm": 0.6493077874183655, + "learning_rate": 6.8974871124301075e-06, + "loss": 0.7294, + "step": 13729 + }, + { + "epoch": 0.755682756343222, + "grad_norm": 0.6854347586631775, + "learning_rate": 6.897086065109532e-06, + "loss": 0.7121, + "step": 13730 + }, + { + "epoch": 0.7557377951455776, + "grad_norm": 0.7376317977905273, + "learning_rate": 6.896685003530964e-06, + "loss": 0.7719, + "step": 13731 + }, + { + "epoch": 0.7557928339479333, + "grad_norm": 0.8477175235748291, + "learning_rate": 6.89628392769742e-06, + "loss": 0.7981, + "step": 13732 + }, + { + "epoch": 0.755847872750289, + "grad_norm": 0.6611722111701965, + "learning_rate": 6.8958828376119125e-06, + "loss": 0.7628, + "step": 13733 + }, + { + "epoch": 0.7559029115526447, + "grad_norm": 0.6898290514945984, + "learning_rate": 6.895481733277458e-06, + "loss": 0.7578, + "step": 13734 + }, + { + "epoch": 0.7559579503550002, + "grad_norm": 0.6566810607910156, + "learning_rate": 6.89508061469707e-06, + "loss": 0.6919, + "step": 13735 + }, + { + "epoch": 0.7560129891573559, + "grad_norm": 0.6395933032035828, + "learning_rate": 6.894679481873763e-06, + "loss": 0.7334, + "step": 13736 + }, + { + "epoch": 0.7560680279597116, + "grad_norm": 0.7060876488685608, + "learning_rate": 6.8942783348105535e-06, + "loss": 0.7405, + "step": 13737 + }, + { + "epoch": 0.7561230667620673, + "grad_norm": 0.7303228974342346, + "learning_rate": 6.893877173510454e-06, + "loss": 0.8563, + "step": 13738 + }, + { + "epoch": 0.7561781055644229, + "grad_norm": 0.663474977016449, + "learning_rate": 6.893475997976481e-06, + "loss": 0.703, + "step": 13739 + }, + { + "epoch": 0.7562331443667786, + "grad_norm": 0.8005428910255432, + "learning_rate": 6.893074808211649e-06, + "loss": 0.7219, + "step": 13740 + }, + { + "epoch": 0.7562881831691343, + "grad_norm": 1.3285688161849976, + "learning_rate": 6.892673604218972e-06, + "loss": 0.672, + "step": 13741 + }, + { + "epoch": 0.75634322197149, + "grad_norm": 0.6958948373794556, + "learning_rate": 6.892272386001469e-06, + "loss": 0.7728, + "step": 13742 + }, + { + "epoch": 0.7563982607738455, + "grad_norm": 0.6840598583221436, + "learning_rate": 6.891871153562153e-06, + "loss": 0.7881, + "step": 13743 + }, + { + "epoch": 0.7564532995762012, + "grad_norm": 0.7184257507324219, + "learning_rate": 6.891469906904039e-06, + "loss": 0.736, + "step": 13744 + }, + { + "epoch": 0.7565083383785569, + "grad_norm": 0.6611571311950684, + "learning_rate": 6.891068646030143e-06, + "loss": 0.7171, + "step": 13745 + }, + { + "epoch": 0.7565633771809125, + "grad_norm": 0.8237559795379639, + "learning_rate": 6.890667370943482e-06, + "loss": 0.8669, + "step": 13746 + }, + { + "epoch": 0.7566184159832682, + "grad_norm": 0.6898388266563416, + "learning_rate": 6.890266081647072e-06, + "loss": 0.6654, + "step": 13747 + }, + { + "epoch": 0.7566734547856239, + "grad_norm": 0.6541711688041687, + "learning_rate": 6.889864778143928e-06, + "loss": 0.7455, + "step": 13748 + }, + { + "epoch": 0.7567284935879796, + "grad_norm": 0.6518157124519348, + "learning_rate": 6.8894634604370655e-06, + "loss": 0.7174, + "step": 13749 + }, + { + "epoch": 0.7567835323903351, + "grad_norm": 0.7992080450057983, + "learning_rate": 6.889062128529502e-06, + "loss": 0.7349, + "step": 13750 + }, + { + "epoch": 0.7568385711926908, + "grad_norm": 0.5748338103294373, + "learning_rate": 6.888660782424253e-06, + "loss": 0.5398, + "step": 13751 + }, + { + "epoch": 0.7568936099950465, + "grad_norm": 0.6507781744003296, + "learning_rate": 6.8882594221243344e-06, + "loss": 0.6762, + "step": 13752 + }, + { + "epoch": 0.7569486487974022, + "grad_norm": 0.6908432841300964, + "learning_rate": 6.887858047632764e-06, + "loss": 0.8034, + "step": 13753 + }, + { + "epoch": 0.7570036875997578, + "grad_norm": 0.6497751474380493, + "learning_rate": 6.887456658952557e-06, + "loss": 0.6351, + "step": 13754 + }, + { + "epoch": 0.7570587264021135, + "grad_norm": 0.7233273386955261, + "learning_rate": 6.887055256086732e-06, + "loss": 0.7096, + "step": 13755 + }, + { + "epoch": 0.7571137652044692, + "grad_norm": 0.6587454676628113, + "learning_rate": 6.886653839038305e-06, + "loss": 0.7354, + "step": 13756 + }, + { + "epoch": 0.7571688040068248, + "grad_norm": 0.6654310822486877, + "learning_rate": 6.886252407810292e-06, + "loss": 0.7776, + "step": 13757 + }, + { + "epoch": 0.7572238428091804, + "grad_norm": 0.796604573726654, + "learning_rate": 6.885850962405711e-06, + "loss": 0.7925, + "step": 13758 + }, + { + "epoch": 0.7572788816115361, + "grad_norm": 0.7053457498550415, + "learning_rate": 6.8854495028275795e-06, + "loss": 0.7893, + "step": 13759 + }, + { + "epoch": 0.7573339204138918, + "grad_norm": 0.7201200127601624, + "learning_rate": 6.885048029078914e-06, + "loss": 0.8346, + "step": 13760 + }, + { + "epoch": 0.7573889592162475, + "grad_norm": 0.8437653183937073, + "learning_rate": 6.884646541162731e-06, + "loss": 0.7468, + "step": 13761 + }, + { + "epoch": 0.7574439980186031, + "grad_norm": 0.6910028457641602, + "learning_rate": 6.884245039082052e-06, + "loss": 0.7362, + "step": 13762 + }, + { + "epoch": 0.7574990368209588, + "grad_norm": 0.6896274089813232, + "learning_rate": 6.883843522839889e-06, + "loss": 0.6515, + "step": 13763 + }, + { + "epoch": 0.7575540756233144, + "grad_norm": 0.9833560585975647, + "learning_rate": 6.8834419924392636e-06, + "loss": 0.8764, + "step": 13764 + }, + { + "epoch": 0.7576091144256701, + "grad_norm": 0.7130032181739807, + "learning_rate": 6.88304044788319e-06, + "loss": 0.7631, + "step": 13765 + }, + { + "epoch": 0.7576641532280257, + "grad_norm": 0.7059195041656494, + "learning_rate": 6.882638889174691e-06, + "loss": 0.8147, + "step": 13766 + }, + { + "epoch": 0.7577191920303814, + "grad_norm": 0.6451989412307739, + "learning_rate": 6.882237316316781e-06, + "loss": 0.6638, + "step": 13767 + }, + { + "epoch": 0.7577742308327371, + "grad_norm": 0.7541074752807617, + "learning_rate": 6.881835729312481e-06, + "loss": 0.6918, + "step": 13768 + }, + { + "epoch": 0.7578292696350928, + "grad_norm": 0.7227535843849182, + "learning_rate": 6.881434128164805e-06, + "loss": 0.7759, + "step": 13769 + }, + { + "epoch": 0.7578843084374484, + "grad_norm": 0.673112154006958, + "learning_rate": 6.881032512876774e-06, + "loss": 0.7328, + "step": 13770 + }, + { + "epoch": 0.757939347239804, + "grad_norm": 0.6536681056022644, + "learning_rate": 6.880630883451407e-06, + "loss": 0.7677, + "step": 13771 + }, + { + "epoch": 0.7579943860421597, + "grad_norm": 0.8517894148826599, + "learning_rate": 6.880229239891721e-06, + "loss": 0.8566, + "step": 13772 + }, + { + "epoch": 0.7580494248445154, + "grad_norm": 0.8260573148727417, + "learning_rate": 6.879827582200737e-06, + "loss": 0.8228, + "step": 13773 + }, + { + "epoch": 0.758104463646871, + "grad_norm": 0.7460072040557861, + "learning_rate": 6.87942591038147e-06, + "loss": 0.8047, + "step": 13774 + }, + { + "epoch": 0.7581595024492267, + "grad_norm": 0.7648436427116394, + "learning_rate": 6.879024224436942e-06, + "loss": 0.852, + "step": 13775 + }, + { + "epoch": 0.7582145412515824, + "grad_norm": 0.7161253094673157, + "learning_rate": 6.878622524370171e-06, + "loss": 0.7638, + "step": 13776 + }, + { + "epoch": 0.7582695800539381, + "grad_norm": 0.6559579372406006, + "learning_rate": 6.878220810184175e-06, + "loss": 0.6932, + "step": 13777 + }, + { + "epoch": 0.7583246188562937, + "grad_norm": 0.6846898198127747, + "learning_rate": 6.877819081881975e-06, + "loss": 0.7098, + "step": 13778 + }, + { + "epoch": 0.7583796576586493, + "grad_norm": 0.7569675445556641, + "learning_rate": 6.87741733946659e-06, + "loss": 0.687, + "step": 13779 + }, + { + "epoch": 0.758434696461005, + "grad_norm": 0.7513766288757324, + "learning_rate": 6.877015582941038e-06, + "loss": 0.8673, + "step": 13780 + }, + { + "epoch": 0.7584897352633607, + "grad_norm": 0.7158082127571106, + "learning_rate": 6.876613812308338e-06, + "loss": 0.7563, + "step": 13781 + }, + { + "epoch": 0.7585447740657163, + "grad_norm": 0.6307277083396912, + "learning_rate": 6.876212027571513e-06, + "loss": 0.6725, + "step": 13782 + }, + { + "epoch": 0.758599812868072, + "grad_norm": 0.735090434551239, + "learning_rate": 6.87581022873358e-06, + "loss": 0.763, + "step": 13783 + }, + { + "epoch": 0.7586548516704277, + "grad_norm": 0.6412403583526611, + "learning_rate": 6.8754084157975594e-06, + "loss": 0.5992, + "step": 13784 + }, + { + "epoch": 0.7587098904727834, + "grad_norm": 0.639854907989502, + "learning_rate": 6.875006588766472e-06, + "loss": 0.7372, + "step": 13785 + }, + { + "epoch": 0.7587649292751389, + "grad_norm": 0.6855082511901855, + "learning_rate": 6.8746047476433365e-06, + "loss": 0.7709, + "step": 13786 + }, + { + "epoch": 0.7588199680774946, + "grad_norm": 0.6838769912719727, + "learning_rate": 6.874202892431173e-06, + "loss": 0.7545, + "step": 13787 + }, + { + "epoch": 0.7588750068798503, + "grad_norm": 1.1560181379318237, + "learning_rate": 6.873801023133002e-06, + "loss": 0.7291, + "step": 13788 + }, + { + "epoch": 0.7589300456822059, + "grad_norm": 0.7140469551086426, + "learning_rate": 6.873399139751844e-06, + "loss": 0.7214, + "step": 13789 + }, + { + "epoch": 0.7589850844845616, + "grad_norm": 0.6856355667114258, + "learning_rate": 6.8729972422907195e-06, + "loss": 0.7417, + "step": 13790 + }, + { + "epoch": 0.7590401232869173, + "grad_norm": 0.7856155633926392, + "learning_rate": 6.8725953307526505e-06, + "loss": 0.7484, + "step": 13791 + }, + { + "epoch": 0.759095162089273, + "grad_norm": 0.8107255697250366, + "learning_rate": 6.8721934051406555e-06, + "loss": 0.7568, + "step": 13792 + }, + { + "epoch": 0.7591502008916285, + "grad_norm": 0.6590837240219116, + "learning_rate": 6.871791465457757e-06, + "loss": 0.7495, + "step": 13793 + }, + { + "epoch": 0.7592052396939842, + "grad_norm": 0.7531588077545166, + "learning_rate": 6.8713895117069715e-06, + "loss": 0.7434, + "step": 13794 + }, + { + "epoch": 0.7592602784963399, + "grad_norm": 0.6818329095840454, + "learning_rate": 6.870987543891326e-06, + "loss": 0.7128, + "step": 13795 + }, + { + "epoch": 0.7593153172986956, + "grad_norm": 0.6082884669303894, + "learning_rate": 6.8705855620138395e-06, + "loss": 0.7437, + "step": 13796 + }, + { + "epoch": 0.7593703561010512, + "grad_norm": 0.9583787322044373, + "learning_rate": 6.870183566077532e-06, + "loss": 0.7779, + "step": 13797 + }, + { + "epoch": 0.7594253949034069, + "grad_norm": 0.6684621572494507, + "learning_rate": 6.869781556085425e-06, + "loss": 0.5856, + "step": 13798 + }, + { + "epoch": 0.7594804337057626, + "grad_norm": 0.6225603222846985, + "learning_rate": 6.869379532040541e-06, + "loss": 0.7407, + "step": 13799 + }, + { + "epoch": 0.7595354725081183, + "grad_norm": 0.6973103284835815, + "learning_rate": 6.8689774939459005e-06, + "loss": 0.7789, + "step": 13800 + }, + { + "epoch": 0.7595905113104738, + "grad_norm": 0.6655399203300476, + "learning_rate": 6.868575441804526e-06, + "loss": 0.7489, + "step": 13801 + }, + { + "epoch": 0.7596455501128295, + "grad_norm": 0.7066664695739746, + "learning_rate": 6.868173375619437e-06, + "loss": 0.7035, + "step": 13802 + }, + { + "epoch": 0.7597005889151852, + "grad_norm": 1.0646852254867554, + "learning_rate": 6.867771295393658e-06, + "loss": 0.8488, + "step": 13803 + }, + { + "epoch": 0.7597556277175409, + "grad_norm": 0.6551353335380554, + "learning_rate": 6.867369201130209e-06, + "loss": 0.7147, + "step": 13804 + }, + { + "epoch": 0.7598106665198965, + "grad_norm": 0.6749850511550903, + "learning_rate": 6.866967092832115e-06, + "loss": 0.7963, + "step": 13805 + }, + { + "epoch": 0.7598657053222522, + "grad_norm": 0.6704042553901672, + "learning_rate": 6.866564970502394e-06, + "loss": 0.7992, + "step": 13806 + }, + { + "epoch": 0.7599207441246079, + "grad_norm": 0.7027791142463684, + "learning_rate": 6.866162834144071e-06, + "loss": 0.7931, + "step": 13807 + }, + { + "epoch": 0.7599757829269636, + "grad_norm": 0.7925322651863098, + "learning_rate": 6.865760683760169e-06, + "loss": 0.7826, + "step": 13808 + }, + { + "epoch": 0.7600308217293191, + "grad_norm": 0.7152161002159119, + "learning_rate": 6.865358519353708e-06, + "loss": 0.7481, + "step": 13809 + }, + { + "epoch": 0.7600858605316748, + "grad_norm": 0.6572757959365845, + "learning_rate": 6.864956340927711e-06, + "loss": 0.785, + "step": 13810 + }, + { + "epoch": 0.7601408993340305, + "grad_norm": 0.6848406791687012, + "learning_rate": 6.864554148485203e-06, + "loss": 0.6423, + "step": 13811 + }, + { + "epoch": 0.7601959381363862, + "grad_norm": 0.747597873210907, + "learning_rate": 6.864151942029205e-06, + "loss": 0.7901, + "step": 13812 + }, + { + "epoch": 0.7602509769387418, + "grad_norm": 0.7106720805168152, + "learning_rate": 6.863749721562738e-06, + "loss": 0.7488, + "step": 13813 + }, + { + "epoch": 0.7603060157410975, + "grad_norm": 0.6864057779312134, + "learning_rate": 6.8633474870888275e-06, + "loss": 0.7066, + "step": 13814 + }, + { + "epoch": 0.7603610545434532, + "grad_norm": 0.7022056579589844, + "learning_rate": 6.862945238610496e-06, + "loss": 0.6851, + "step": 13815 + }, + { + "epoch": 0.7604160933458088, + "grad_norm": 0.7361913919448853, + "learning_rate": 6.862542976130769e-06, + "loss": 0.7425, + "step": 13816 + }, + { + "epoch": 0.7604711321481644, + "grad_norm": 0.6723676323890686, + "learning_rate": 6.862140699652666e-06, + "loss": 0.7937, + "step": 13817 + }, + { + "epoch": 0.7605261709505201, + "grad_norm": 0.7491924166679382, + "learning_rate": 6.861738409179212e-06, + "loss": 0.7585, + "step": 13818 + }, + { + "epoch": 0.7605812097528758, + "grad_norm": 0.6772211790084839, + "learning_rate": 6.86133610471343e-06, + "loss": 0.7617, + "step": 13819 + }, + { + "epoch": 0.7606362485552315, + "grad_norm": 0.7819864153862, + "learning_rate": 6.860933786258344e-06, + "loss": 0.7924, + "step": 13820 + }, + { + "epoch": 0.7606912873575871, + "grad_norm": 0.6992526650428772, + "learning_rate": 6.86053145381698e-06, + "loss": 0.7054, + "step": 13821 + }, + { + "epoch": 0.7607463261599428, + "grad_norm": 0.7189231514930725, + "learning_rate": 6.860129107392357e-06, + "loss": 0.7603, + "step": 13822 + }, + { + "epoch": 0.7608013649622984, + "grad_norm": 0.7165294885635376, + "learning_rate": 6.859726746987503e-06, + "loss": 0.8118, + "step": 13823 + }, + { + "epoch": 0.7608564037646541, + "grad_norm": 0.6510334014892578, + "learning_rate": 6.85932437260544e-06, + "loss": 0.7584, + "step": 13824 + }, + { + "epoch": 0.7609114425670097, + "grad_norm": 0.7113379836082458, + "learning_rate": 6.8589219842491935e-06, + "loss": 0.7799, + "step": 13825 + }, + { + "epoch": 0.7609664813693654, + "grad_norm": 0.7441100478172302, + "learning_rate": 6.8585195819217856e-06, + "loss": 0.6468, + "step": 13826 + }, + { + "epoch": 0.7610215201717211, + "grad_norm": 1.0703508853912354, + "learning_rate": 6.858117165626244e-06, + "loss": 0.7922, + "step": 13827 + }, + { + "epoch": 0.7610765589740768, + "grad_norm": 0.7097275853157043, + "learning_rate": 6.857714735365589e-06, + "loss": 0.7594, + "step": 13828 + }, + { + "epoch": 0.7611315977764324, + "grad_norm": 0.7001124620437622, + "learning_rate": 6.857312291142848e-06, + "loss": 0.7679, + "step": 13829 + }, + { + "epoch": 0.761186636578788, + "grad_norm": 0.6898123621940613, + "learning_rate": 6.856909832961045e-06, + "loss": 0.7684, + "step": 13830 + }, + { + "epoch": 0.7612416753811437, + "grad_norm": 0.6535243391990662, + "learning_rate": 6.856507360823206e-06, + "loss": 0.6143, + "step": 13831 + }, + { + "epoch": 0.7612967141834993, + "grad_norm": 0.6726056933403015, + "learning_rate": 6.856104874732353e-06, + "loss": 0.7566, + "step": 13832 + }, + { + "epoch": 0.761351752985855, + "grad_norm": 0.8741437196731567, + "learning_rate": 6.855702374691513e-06, + "loss": 0.723, + "step": 13833 + }, + { + "epoch": 0.7614067917882107, + "grad_norm": 0.7025718092918396, + "learning_rate": 6.855299860703712e-06, + "loss": 0.8035, + "step": 13834 + }, + { + "epoch": 0.7614618305905664, + "grad_norm": 1.08286452293396, + "learning_rate": 6.8548973327719726e-06, + "loss": 0.7347, + "step": 13835 + }, + { + "epoch": 0.761516869392922, + "grad_norm": 0.6483243107795715, + "learning_rate": 6.854494790899322e-06, + "loss": 0.7326, + "step": 13836 + }, + { + "epoch": 0.7615719081952776, + "grad_norm": 0.6611089110374451, + "learning_rate": 6.854092235088784e-06, + "loss": 0.7619, + "step": 13837 + }, + { + "epoch": 0.7616269469976333, + "grad_norm": 0.8394322991371155, + "learning_rate": 6.853689665343385e-06, + "loss": 0.7017, + "step": 13838 + }, + { + "epoch": 0.761681985799989, + "grad_norm": 0.7131583094596863, + "learning_rate": 6.853287081666151e-06, + "loss": 0.7367, + "step": 13839 + }, + { + "epoch": 0.7617370246023446, + "grad_norm": 0.7316367626190186, + "learning_rate": 6.852884484060108e-06, + "loss": 0.7323, + "step": 13840 + }, + { + "epoch": 0.7617920634047003, + "grad_norm": 0.7639010548591614, + "learning_rate": 6.852481872528281e-06, + "loss": 0.819, + "step": 13841 + }, + { + "epoch": 0.761847102207056, + "grad_norm": 0.7118390202522278, + "learning_rate": 6.852079247073695e-06, + "loss": 0.7645, + "step": 13842 + }, + { + "epoch": 0.7619021410094117, + "grad_norm": 0.6885393857955933, + "learning_rate": 6.851676607699379e-06, + "loss": 0.8052, + "step": 13843 + }, + { + "epoch": 0.7619571798117672, + "grad_norm": 0.7034374475479126, + "learning_rate": 6.851273954408356e-06, + "loss": 0.8464, + "step": 13844 + }, + { + "epoch": 0.7620122186141229, + "grad_norm": 0.6531803607940674, + "learning_rate": 6.850871287203654e-06, + "loss": 0.7871, + "step": 13845 + }, + { + "epoch": 0.7620672574164786, + "grad_norm": 0.6637283563613892, + "learning_rate": 6.8504686060882995e-06, + "loss": 0.7326, + "step": 13846 + }, + { + "epoch": 0.7621222962188343, + "grad_norm": 0.6467694640159607, + "learning_rate": 6.850065911065318e-06, + "loss": 0.7936, + "step": 13847 + }, + { + "epoch": 0.7621773350211899, + "grad_norm": 0.6829109191894531, + "learning_rate": 6.849663202137735e-06, + "loss": 0.7003, + "step": 13848 + }, + { + "epoch": 0.7622323738235456, + "grad_norm": 0.7321386933326721, + "learning_rate": 6.84926047930858e-06, + "loss": 0.6921, + "step": 13849 + }, + { + "epoch": 0.7622874126259013, + "grad_norm": 0.6900202631950378, + "learning_rate": 6.8488577425808766e-06, + "loss": 0.7496, + "step": 13850 + }, + { + "epoch": 0.762342451428257, + "grad_norm": 0.6304247975349426, + "learning_rate": 6.848454991957655e-06, + "loss": 0.7135, + "step": 13851 + }, + { + "epoch": 0.7623974902306125, + "grad_norm": 0.7087798118591309, + "learning_rate": 6.8480522274419404e-06, + "loss": 0.7032, + "step": 13852 + }, + { + "epoch": 0.7624525290329682, + "grad_norm": 0.7777289152145386, + "learning_rate": 6.84764944903676e-06, + "loss": 0.7345, + "step": 13853 + }, + { + "epoch": 0.7625075678353239, + "grad_norm": 0.7282242774963379, + "learning_rate": 6.847246656745139e-06, + "loss": 0.6408, + "step": 13854 + }, + { + "epoch": 0.7625626066376796, + "grad_norm": 0.7798221707344055, + "learning_rate": 6.846843850570107e-06, + "loss": 0.9058, + "step": 13855 + }, + { + "epoch": 0.7626176454400352, + "grad_norm": 0.6145210266113281, + "learning_rate": 6.846441030514692e-06, + "loss": 0.6331, + "step": 13856 + }, + { + "epoch": 0.7626726842423909, + "grad_norm": 0.7079364061355591, + "learning_rate": 6.846038196581921e-06, + "loss": 0.7511, + "step": 13857 + }, + { + "epoch": 0.7627277230447466, + "grad_norm": 0.733635425567627, + "learning_rate": 6.845635348774821e-06, + "loss": 0.6957, + "step": 13858 + }, + { + "epoch": 0.7627827618471023, + "grad_norm": 0.8099489808082581, + "learning_rate": 6.845232487096419e-06, + "loss": 0.8068, + "step": 13859 + }, + { + "epoch": 0.7628378006494578, + "grad_norm": 0.6241937875747681, + "learning_rate": 6.844829611549744e-06, + "loss": 0.7102, + "step": 13860 + }, + { + "epoch": 0.7628928394518135, + "grad_norm": 0.8009611368179321, + "learning_rate": 6.8444267221378235e-06, + "loss": 0.8369, + "step": 13861 + }, + { + "epoch": 0.7629478782541692, + "grad_norm": 0.6700903177261353, + "learning_rate": 6.844023818863685e-06, + "loss": 0.8075, + "step": 13862 + }, + { + "epoch": 0.7630029170565249, + "grad_norm": 0.9378371834754944, + "learning_rate": 6.843620901730357e-06, + "loss": 0.7539, + "step": 13863 + }, + { + "epoch": 0.7630579558588805, + "grad_norm": 0.6704423427581787, + "learning_rate": 6.843217970740867e-06, + "loss": 0.7285, + "step": 13864 + }, + { + "epoch": 0.7631129946612362, + "grad_norm": 0.7236818075180054, + "learning_rate": 6.842815025898246e-06, + "loss": 0.7223, + "step": 13865 + }, + { + "epoch": 0.7631680334635919, + "grad_norm": 0.676184356212616, + "learning_rate": 6.84241206720552e-06, + "loss": 0.7286, + "step": 13866 + }, + { + "epoch": 0.7632230722659475, + "grad_norm": 0.6443304419517517, + "learning_rate": 6.842009094665717e-06, + "loss": 0.6806, + "step": 13867 + }, + { + "epoch": 0.7632781110683031, + "grad_norm": 0.7931790947914124, + "learning_rate": 6.841606108281868e-06, + "loss": 0.7801, + "step": 13868 + }, + { + "epoch": 0.7633331498706588, + "grad_norm": 0.7440798878669739, + "learning_rate": 6.841203108057e-06, + "loss": 0.8044, + "step": 13869 + }, + { + "epoch": 0.7633881886730145, + "grad_norm": 0.7226675748825073, + "learning_rate": 6.840800093994142e-06, + "loss": 0.718, + "step": 13870 + }, + { + "epoch": 0.7634432274753702, + "grad_norm": 0.7351265549659729, + "learning_rate": 6.8403970660963245e-06, + "loss": 0.8389, + "step": 13871 + }, + { + "epoch": 0.7634982662777258, + "grad_norm": 0.8326215744018555, + "learning_rate": 6.839994024366574e-06, + "loss": 0.8583, + "step": 13872 + }, + { + "epoch": 0.7635533050800815, + "grad_norm": 0.6841259002685547, + "learning_rate": 6.839590968807922e-06, + "loss": 0.7553, + "step": 13873 + }, + { + "epoch": 0.7636083438824371, + "grad_norm": 0.7305078506469727, + "learning_rate": 6.839187899423395e-06, + "loss": 0.7825, + "step": 13874 + }, + { + "epoch": 0.7636633826847927, + "grad_norm": 0.7235193252563477, + "learning_rate": 6.838784816216025e-06, + "loss": 0.7653, + "step": 13875 + }, + { + "epoch": 0.7637184214871484, + "grad_norm": 0.6468761563301086, + "learning_rate": 6.838381719188842e-06, + "loss": 0.6901, + "step": 13876 + }, + { + "epoch": 0.7637734602895041, + "grad_norm": 0.6806310415267944, + "learning_rate": 6.837978608344872e-06, + "loss": 0.6876, + "step": 13877 + }, + { + "epoch": 0.7638284990918598, + "grad_norm": 0.692081093788147, + "learning_rate": 6.837575483687147e-06, + "loss": 0.7506, + "step": 13878 + }, + { + "epoch": 0.7638835378942154, + "grad_norm": 0.6447135806083679, + "learning_rate": 6.837172345218697e-06, + "loss": 0.6841, + "step": 13879 + }, + { + "epoch": 0.7639385766965711, + "grad_norm": 0.7352014183998108, + "learning_rate": 6.8367691929425516e-06, + "loss": 0.8066, + "step": 13880 + }, + { + "epoch": 0.7639936154989267, + "grad_norm": 0.7305072546005249, + "learning_rate": 6.8363660268617405e-06, + "loss": 0.717, + "step": 13881 + }, + { + "epoch": 0.7640486543012824, + "grad_norm": 0.6580411195755005, + "learning_rate": 6.835962846979294e-06, + "loss": 0.7585, + "step": 13882 + }, + { + "epoch": 0.764103693103638, + "grad_norm": 0.7568425536155701, + "learning_rate": 6.835559653298242e-06, + "loss": 0.8273, + "step": 13883 + }, + { + "epoch": 0.7641587319059937, + "grad_norm": 0.8121107816696167, + "learning_rate": 6.835156445821616e-06, + "loss": 0.9064, + "step": 13884 + }, + { + "epoch": 0.7642137707083494, + "grad_norm": 0.6522091031074524, + "learning_rate": 6.834753224552444e-06, + "loss": 0.767, + "step": 13885 + }, + { + "epoch": 0.7642688095107051, + "grad_norm": 1.0779389142990112, + "learning_rate": 6.8343499894937574e-06, + "loss": 0.7702, + "step": 13886 + }, + { + "epoch": 0.7643238483130607, + "grad_norm": 0.6902838349342346, + "learning_rate": 6.833946740648588e-06, + "loss": 0.6529, + "step": 13887 + }, + { + "epoch": 0.7643788871154164, + "grad_norm": 0.692480742931366, + "learning_rate": 6.833543478019966e-06, + "loss": 0.7404, + "step": 13888 + }, + { + "epoch": 0.764433925917772, + "grad_norm": 0.633627712726593, + "learning_rate": 6.833140201610923e-06, + "loss": 0.711, + "step": 13889 + }, + { + "epoch": 0.7644889647201277, + "grad_norm": 0.8653294444084167, + "learning_rate": 6.832736911424487e-06, + "loss": 0.8102, + "step": 13890 + }, + { + "epoch": 0.7645440035224833, + "grad_norm": 0.7864197492599487, + "learning_rate": 6.832333607463692e-06, + "loss": 0.7064, + "step": 13891 + }, + { + "epoch": 0.764599042324839, + "grad_norm": 0.6703711748123169, + "learning_rate": 6.831930289731569e-06, + "loss": 0.7653, + "step": 13892 + }, + { + "epoch": 0.7646540811271947, + "grad_norm": 0.7420178651809692, + "learning_rate": 6.831526958231147e-06, + "loss": 0.8137, + "step": 13893 + }, + { + "epoch": 0.7647091199295504, + "grad_norm": 0.7372543215751648, + "learning_rate": 6.831123612965459e-06, + "loss": 0.6871, + "step": 13894 + }, + { + "epoch": 0.764764158731906, + "grad_norm": 0.77486652135849, + "learning_rate": 6.830720253937536e-06, + "loss": 0.727, + "step": 13895 + }, + { + "epoch": 0.7648191975342616, + "grad_norm": 0.7087406516075134, + "learning_rate": 6.83031688115041e-06, + "loss": 0.7743, + "step": 13896 + }, + { + "epoch": 0.7648742363366173, + "grad_norm": 0.8415336608886719, + "learning_rate": 6.829913494607112e-06, + "loss": 0.774, + "step": 13897 + }, + { + "epoch": 0.764929275138973, + "grad_norm": 0.7736749053001404, + "learning_rate": 6.829510094310674e-06, + "loss": 0.7541, + "step": 13898 + }, + { + "epoch": 0.7649843139413286, + "grad_norm": 0.6749987602233887, + "learning_rate": 6.829106680264128e-06, + "loss": 0.7139, + "step": 13899 + }, + { + "epoch": 0.7650393527436843, + "grad_norm": 0.7079635262489319, + "learning_rate": 6.8287032524705055e-06, + "loss": 0.75, + "step": 13900 + }, + { + "epoch": 0.76509439154604, + "grad_norm": 0.6906388401985168, + "learning_rate": 6.828299810932839e-06, + "loss": 0.6895, + "step": 13901 + }, + { + "epoch": 0.7651494303483957, + "grad_norm": 0.7045881152153015, + "learning_rate": 6.82789635565416e-06, + "loss": 0.8728, + "step": 13902 + }, + { + "epoch": 0.7652044691507512, + "grad_norm": 0.6836426258087158, + "learning_rate": 6.827492886637501e-06, + "loss": 0.7315, + "step": 13903 + }, + { + "epoch": 0.7652595079531069, + "grad_norm": 0.6467520594596863, + "learning_rate": 6.827089403885896e-06, + "loss": 0.7556, + "step": 13904 + }, + { + "epoch": 0.7653145467554626, + "grad_norm": 0.7118285894393921, + "learning_rate": 6.826685907402376e-06, + "loss": 0.8686, + "step": 13905 + }, + { + "epoch": 0.7653695855578183, + "grad_norm": 0.6093236207962036, + "learning_rate": 6.826282397189974e-06, + "loss": 0.7066, + "step": 13906 + }, + { + "epoch": 0.7654246243601739, + "grad_norm": 0.6839649677276611, + "learning_rate": 6.825878873251721e-06, + "loss": 0.7025, + "step": 13907 + }, + { + "epoch": 0.7654796631625296, + "grad_norm": 0.7582715153694153, + "learning_rate": 6.825475335590652e-06, + "loss": 0.7301, + "step": 13908 + }, + { + "epoch": 0.7655347019648853, + "grad_norm": 0.6580978631973267, + "learning_rate": 6.8250717842098e-06, + "loss": 0.6771, + "step": 13909 + }, + { + "epoch": 0.765589740767241, + "grad_norm": 0.6754937171936035, + "learning_rate": 6.824668219112195e-06, + "loss": 0.7446, + "step": 13910 + }, + { + "epoch": 0.7656447795695965, + "grad_norm": 0.7541018724441528, + "learning_rate": 6.8242646403008725e-06, + "loss": 0.802, + "step": 13911 + }, + { + "epoch": 0.7656998183719522, + "grad_norm": 0.6714808344841003, + "learning_rate": 6.823861047778866e-06, + "loss": 0.7334, + "step": 13912 + }, + { + "epoch": 0.7657548571743079, + "grad_norm": 0.6972425580024719, + "learning_rate": 6.823457441549209e-06, + "loss": 0.7859, + "step": 13913 + }, + { + "epoch": 0.7658098959766636, + "grad_norm": 0.6660878658294678, + "learning_rate": 6.823053821614931e-06, + "loss": 0.6594, + "step": 13914 + }, + { + "epoch": 0.7658649347790192, + "grad_norm": 0.7392181158065796, + "learning_rate": 6.82265018797907e-06, + "loss": 0.6667, + "step": 13915 + }, + { + "epoch": 0.7659199735813749, + "grad_norm": 0.7601449489593506, + "learning_rate": 6.822246540644659e-06, + "loss": 0.7349, + "step": 13916 + }, + { + "epoch": 0.7659750123837306, + "grad_norm": 0.6648421287536621, + "learning_rate": 6.821842879614731e-06, + "loss": 0.7597, + "step": 13917 + }, + { + "epoch": 0.7660300511860861, + "grad_norm": 0.6369950175285339, + "learning_rate": 6.821439204892317e-06, + "loss": 0.7452, + "step": 13918 + }, + { + "epoch": 0.7660850899884418, + "grad_norm": 0.747653603553772, + "learning_rate": 6.821035516480457e-06, + "loss": 0.693, + "step": 13919 + }, + { + "epoch": 0.7661401287907975, + "grad_norm": 0.6450137495994568, + "learning_rate": 6.8206318143821795e-06, + "loss": 0.6492, + "step": 13920 + }, + { + "epoch": 0.7661951675931532, + "grad_norm": 0.707801878452301, + "learning_rate": 6.8202280986005205e-06, + "loss": 0.7284, + "step": 13921 + }, + { + "epoch": 0.7662502063955088, + "grad_norm": 0.7191962003707886, + "learning_rate": 6.8198243691385146e-06, + "loss": 0.7714, + "step": 13922 + }, + { + "epoch": 0.7663052451978645, + "grad_norm": 0.7477172613143921, + "learning_rate": 6.819420625999196e-06, + "loss": 0.7076, + "step": 13923 + }, + { + "epoch": 0.7663602840002202, + "grad_norm": 0.6221175193786621, + "learning_rate": 6.819016869185599e-06, + "loss": 0.6848, + "step": 13924 + }, + { + "epoch": 0.7664153228025758, + "grad_norm": 0.7840436697006226, + "learning_rate": 6.818613098700758e-06, + "loss": 0.7028, + "step": 13925 + }, + { + "epoch": 0.7664703616049314, + "grad_norm": 0.7147907018661499, + "learning_rate": 6.818209314547707e-06, + "loss": 0.7242, + "step": 13926 + }, + { + "epoch": 0.7665254004072871, + "grad_norm": 0.6627985835075378, + "learning_rate": 6.817805516729482e-06, + "loss": 0.7177, + "step": 13927 + }, + { + "epoch": 0.7665804392096428, + "grad_norm": 0.8019070625305176, + "learning_rate": 6.817401705249118e-06, + "loss": 0.6594, + "step": 13928 + }, + { + "epoch": 0.7666354780119985, + "grad_norm": 0.7127207517623901, + "learning_rate": 6.816997880109649e-06, + "loss": 0.8282, + "step": 13929 + }, + { + "epoch": 0.7666905168143541, + "grad_norm": 0.7335825562477112, + "learning_rate": 6.816594041314111e-06, + "loss": 0.7593, + "step": 13930 + }, + { + "epoch": 0.7667455556167098, + "grad_norm": 0.6878668069839478, + "learning_rate": 6.816190188865538e-06, + "loss": 0.7898, + "step": 13931 + }, + { + "epoch": 0.7668005944190655, + "grad_norm": 0.6441968679428101, + "learning_rate": 6.815786322766965e-06, + "loss": 0.6795, + "step": 13932 + }, + { + "epoch": 0.7668556332214211, + "grad_norm": 0.6503410339355469, + "learning_rate": 6.815382443021429e-06, + "loss": 0.753, + "step": 13933 + }, + { + "epoch": 0.7669106720237767, + "grad_norm": 0.6734908223152161, + "learning_rate": 6.8149785496319645e-06, + "loss": 0.7145, + "step": 13934 + }, + { + "epoch": 0.7669657108261324, + "grad_norm": 0.8363823890686035, + "learning_rate": 6.814574642601606e-06, + "loss": 0.8499, + "step": 13935 + }, + { + "epoch": 0.7670207496284881, + "grad_norm": 0.6986021995544434, + "learning_rate": 6.81417072193339e-06, + "loss": 0.7101, + "step": 13936 + }, + { + "epoch": 0.7670757884308438, + "grad_norm": 0.9656592011451721, + "learning_rate": 6.813766787630354e-06, + "loss": 0.7841, + "step": 13937 + }, + { + "epoch": 0.7671308272331994, + "grad_norm": 0.6830777525901794, + "learning_rate": 6.813362839695532e-06, + "loss": 0.7443, + "step": 13938 + }, + { + "epoch": 0.767185866035555, + "grad_norm": 0.6358513236045837, + "learning_rate": 6.812958878131959e-06, + "loss": 0.7017, + "step": 13939 + }, + { + "epoch": 0.7672409048379107, + "grad_norm": 0.9075862169265747, + "learning_rate": 6.812554902942673e-06, + "loss": 0.6991, + "step": 13940 + }, + { + "epoch": 0.7672959436402664, + "grad_norm": 0.7004347443580627, + "learning_rate": 6.812150914130709e-06, + "loss": 0.6519, + "step": 13941 + }, + { + "epoch": 0.767350982442622, + "grad_norm": 0.6648300886154175, + "learning_rate": 6.811746911699105e-06, + "loss": 0.7044, + "step": 13942 + }, + { + "epoch": 0.7674060212449777, + "grad_norm": 0.7050208449363708, + "learning_rate": 6.811342895650896e-06, + "loss": 0.78, + "step": 13943 + }, + { + "epoch": 0.7674610600473334, + "grad_norm": 0.6387132406234741, + "learning_rate": 6.810938865989119e-06, + "loss": 0.6062, + "step": 13944 + }, + { + "epoch": 0.7675160988496891, + "grad_norm": 0.6441114544868469, + "learning_rate": 6.81053482271681e-06, + "loss": 0.7252, + "step": 13945 + }, + { + "epoch": 0.7675711376520447, + "grad_norm": 0.7309751510620117, + "learning_rate": 6.810130765837006e-06, + "loss": 0.6407, + "step": 13946 + }, + { + "epoch": 0.7676261764544003, + "grad_norm": 0.7132161259651184, + "learning_rate": 6.809726695352742e-06, + "loss": 0.8341, + "step": 13947 + }, + { + "epoch": 0.767681215256756, + "grad_norm": 0.7214738726615906, + "learning_rate": 6.809322611267058e-06, + "loss": 0.8357, + "step": 13948 + }, + { + "epoch": 0.7677362540591117, + "grad_norm": 0.6410175561904907, + "learning_rate": 6.80891851358299e-06, + "loss": 0.6718, + "step": 13949 + }, + { + "epoch": 0.7677912928614673, + "grad_norm": 0.8888845443725586, + "learning_rate": 6.8085144023035745e-06, + "loss": 0.7823, + "step": 13950 + }, + { + "epoch": 0.767846331663823, + "grad_norm": 0.7327878475189209, + "learning_rate": 6.808110277431848e-06, + "loss": 0.7083, + "step": 13951 + }, + { + "epoch": 0.7679013704661787, + "grad_norm": 0.6871985793113708, + "learning_rate": 6.807706138970849e-06, + "loss": 0.7808, + "step": 13952 + }, + { + "epoch": 0.7679564092685344, + "grad_norm": 0.6939501762390137, + "learning_rate": 6.8073019869236134e-06, + "loss": 0.693, + "step": 13953 + }, + { + "epoch": 0.76801144807089, + "grad_norm": 0.7377064824104309, + "learning_rate": 6.8068978212931814e-06, + "loss": 0.9322, + "step": 13954 + }, + { + "epoch": 0.7680664868732456, + "grad_norm": 0.8165044188499451, + "learning_rate": 6.80649364208259e-06, + "loss": 0.6846, + "step": 13955 + }, + { + "epoch": 0.7681215256756013, + "grad_norm": 0.6774152517318726, + "learning_rate": 6.806089449294875e-06, + "loss": 0.8503, + "step": 13956 + }, + { + "epoch": 0.768176564477957, + "grad_norm": 0.7773441076278687, + "learning_rate": 6.805685242933074e-06, + "loss": 0.8775, + "step": 13957 + }, + { + "epoch": 0.7682316032803126, + "grad_norm": 0.6710473895072937, + "learning_rate": 6.805281023000227e-06, + "loss": 0.7831, + "step": 13958 + }, + { + "epoch": 0.7682866420826683, + "grad_norm": 0.6163424849510193, + "learning_rate": 6.80487678949937e-06, + "loss": 0.7309, + "step": 13959 + }, + { + "epoch": 0.768341680885024, + "grad_norm": 0.6851963400840759, + "learning_rate": 6.804472542433543e-06, + "loss": 0.6556, + "step": 13960 + }, + { + "epoch": 0.7683967196873795, + "grad_norm": 0.6881004571914673, + "learning_rate": 6.804068281805784e-06, + "loss": 0.7115, + "step": 13961 + }, + { + "epoch": 0.7684517584897352, + "grad_norm": 0.7372351884841919, + "learning_rate": 6.8036640076191304e-06, + "loss": 0.7869, + "step": 13962 + }, + { + "epoch": 0.7685067972920909, + "grad_norm": 0.7900989055633545, + "learning_rate": 6.8032597198766205e-06, + "loss": 0.7419, + "step": 13963 + }, + { + "epoch": 0.7685618360944466, + "grad_norm": 0.7245132327079773, + "learning_rate": 6.802855418581294e-06, + "loss": 0.8175, + "step": 13964 + }, + { + "epoch": 0.7686168748968022, + "grad_norm": 0.6681550741195679, + "learning_rate": 6.802451103736188e-06, + "loss": 0.773, + "step": 13965 + }, + { + "epoch": 0.7686719136991579, + "grad_norm": 0.6316970586776733, + "learning_rate": 6.802046775344343e-06, + "loss": 0.6597, + "step": 13966 + }, + { + "epoch": 0.7687269525015136, + "grad_norm": 0.7201604843139648, + "learning_rate": 6.801642433408796e-06, + "loss": 0.7205, + "step": 13967 + }, + { + "epoch": 0.7687819913038693, + "grad_norm": 0.6226171851158142, + "learning_rate": 6.801238077932587e-06, + "loss": 0.7271, + "step": 13968 + }, + { + "epoch": 0.7688370301062248, + "grad_norm": 0.833369255065918, + "learning_rate": 6.800833708918755e-06, + "loss": 0.7731, + "step": 13969 + }, + { + "epoch": 0.7688920689085805, + "grad_norm": 0.7280329465866089, + "learning_rate": 6.800429326370339e-06, + "loss": 0.7833, + "step": 13970 + }, + { + "epoch": 0.7689471077109362, + "grad_norm": 0.7581672072410583, + "learning_rate": 6.800024930290376e-06, + "loss": 0.8008, + "step": 13971 + }, + { + "epoch": 0.7690021465132919, + "grad_norm": 0.7931516170501709, + "learning_rate": 6.79962052068191e-06, + "loss": 0.8884, + "step": 13972 + }, + { + "epoch": 0.7690571853156475, + "grad_norm": 0.8455879092216492, + "learning_rate": 6.799216097547977e-06, + "loss": 0.8109, + "step": 13973 + }, + { + "epoch": 0.7691122241180032, + "grad_norm": 0.687336266040802, + "learning_rate": 6.798811660891618e-06, + "loss": 0.783, + "step": 13974 + }, + { + "epoch": 0.7691672629203589, + "grad_norm": 0.7661089897155762, + "learning_rate": 6.7984072107158696e-06, + "loss": 0.8448, + "step": 13975 + }, + { + "epoch": 0.7692223017227146, + "grad_norm": 0.6965043544769287, + "learning_rate": 6.798002747023776e-06, + "loss": 0.7421, + "step": 13976 + }, + { + "epoch": 0.7692773405250701, + "grad_norm": 0.7373656630516052, + "learning_rate": 6.797598269818375e-06, + "loss": 0.7093, + "step": 13977 + }, + { + "epoch": 0.7693323793274258, + "grad_norm": 0.6387331485748291, + "learning_rate": 6.7971937791027064e-06, + "loss": 0.7811, + "step": 13978 + }, + { + "epoch": 0.7693874181297815, + "grad_norm": 0.7566075325012207, + "learning_rate": 6.796789274879811e-06, + "loss": 0.8245, + "step": 13979 + }, + { + "epoch": 0.7694424569321372, + "grad_norm": 0.7035738229751587, + "learning_rate": 6.796384757152729e-06, + "loss": 0.7674, + "step": 13980 + }, + { + "epoch": 0.7694974957344928, + "grad_norm": 0.8265605568885803, + "learning_rate": 6.795980225924499e-06, + "loss": 0.7755, + "step": 13981 + }, + { + "epoch": 0.7695525345368485, + "grad_norm": 0.709454357624054, + "learning_rate": 6.7955756811981625e-06, + "loss": 0.8651, + "step": 13982 + }, + { + "epoch": 0.7696075733392042, + "grad_norm": 0.7075764536857605, + "learning_rate": 6.795171122976758e-06, + "loss": 0.7371, + "step": 13983 + }, + { + "epoch": 0.7696626121415598, + "grad_norm": 0.7027561664581299, + "learning_rate": 6.79476655126333e-06, + "loss": 0.7763, + "step": 13984 + }, + { + "epoch": 0.7697176509439154, + "grad_norm": 0.7922375202178955, + "learning_rate": 6.794361966060916e-06, + "loss": 0.7677, + "step": 13985 + }, + { + "epoch": 0.7697726897462711, + "grad_norm": 0.7185537219047546, + "learning_rate": 6.793957367372559e-06, + "loss": 0.7229, + "step": 13986 + }, + { + "epoch": 0.7698277285486268, + "grad_norm": 0.7173545956611633, + "learning_rate": 6.793552755201297e-06, + "loss": 0.7508, + "step": 13987 + }, + { + "epoch": 0.7698827673509825, + "grad_norm": 0.7743139863014221, + "learning_rate": 6.793148129550175e-06, + "loss": 0.7305, + "step": 13988 + }, + { + "epoch": 0.7699378061533381, + "grad_norm": 0.7992164492607117, + "learning_rate": 6.792743490422229e-06, + "loss": 0.7212, + "step": 13989 + }, + { + "epoch": 0.7699928449556938, + "grad_norm": 0.7437503337860107, + "learning_rate": 6.792338837820504e-06, + "loss": 0.6396, + "step": 13990 + }, + { + "epoch": 0.7700478837580494, + "grad_norm": 0.6908634305000305, + "learning_rate": 6.79193417174804e-06, + "loss": 0.7279, + "step": 13991 + }, + { + "epoch": 0.7701029225604051, + "grad_norm": 0.6894391775131226, + "learning_rate": 6.7915294922078805e-06, + "loss": 0.7615, + "step": 13992 + }, + { + "epoch": 0.7701579613627607, + "grad_norm": 0.7162172794342041, + "learning_rate": 6.791124799203062e-06, + "loss": 0.7404, + "step": 13993 + }, + { + "epoch": 0.7702130001651164, + "grad_norm": 0.6469258069992065, + "learning_rate": 6.79072009273663e-06, + "loss": 0.7035, + "step": 13994 + }, + { + "epoch": 0.7702680389674721, + "grad_norm": 0.6456457376480103, + "learning_rate": 6.790315372811625e-06, + "loss": 0.708, + "step": 13995 + }, + { + "epoch": 0.7703230777698278, + "grad_norm": 0.7880644798278809, + "learning_rate": 6.789910639431089e-06, + "loss": 0.7723, + "step": 13996 + }, + { + "epoch": 0.7703781165721834, + "grad_norm": 0.7847834229469299, + "learning_rate": 6.789505892598063e-06, + "loss": 0.8585, + "step": 13997 + }, + { + "epoch": 0.770433155374539, + "grad_norm": 0.6909215450286865, + "learning_rate": 6.789101132315591e-06, + "loss": 0.7107, + "step": 13998 + }, + { + "epoch": 0.7704881941768947, + "grad_norm": 0.7883939146995544, + "learning_rate": 6.788696358586713e-06, + "loss": 0.7575, + "step": 13999 + }, + { + "epoch": 0.7705432329792504, + "grad_norm": 0.6629998087882996, + "learning_rate": 6.788291571414472e-06, + "loss": 0.7273, + "step": 14000 + }, + { + "epoch": 0.770598271781606, + "grad_norm": 0.7548647522926331, + "learning_rate": 6.7878867708019106e-06, + "loss": 0.8214, + "step": 14001 + }, + { + "epoch": 0.7706533105839617, + "grad_norm": 0.6721330881118774, + "learning_rate": 6.78748195675207e-06, + "loss": 0.7153, + "step": 14002 + }, + { + "epoch": 0.7707083493863174, + "grad_norm": 0.6921262145042419, + "learning_rate": 6.787077129267994e-06, + "loss": 0.7099, + "step": 14003 + }, + { + "epoch": 0.770763388188673, + "grad_norm": 0.956937849521637, + "learning_rate": 6.786672288352725e-06, + "loss": 0.6765, + "step": 14004 + }, + { + "epoch": 0.7708184269910286, + "grad_norm": 0.7265778183937073, + "learning_rate": 6.786267434009306e-06, + "loss": 0.7653, + "step": 14005 + }, + { + "epoch": 0.7708734657933843, + "grad_norm": 0.7429845929145813, + "learning_rate": 6.785862566240778e-06, + "loss": 0.8064, + "step": 14006 + }, + { + "epoch": 0.77092850459574, + "grad_norm": 0.7437632083892822, + "learning_rate": 6.785457685050184e-06, + "loss": 0.7138, + "step": 14007 + }, + { + "epoch": 0.7709835433980956, + "grad_norm": 0.7218232750892639, + "learning_rate": 6.7850527904405695e-06, + "loss": 0.7785, + "step": 14008 + }, + { + "epoch": 0.7710385822004513, + "grad_norm": 0.7131973505020142, + "learning_rate": 6.784647882414977e-06, + "loss": 0.7651, + "step": 14009 + }, + { + "epoch": 0.771093621002807, + "grad_norm": 0.739919126033783, + "learning_rate": 6.784242960976447e-06, + "loss": 0.7993, + "step": 14010 + }, + { + "epoch": 0.7711486598051627, + "grad_norm": 0.6655608415603638, + "learning_rate": 6.783838026128025e-06, + "loss": 0.7394, + "step": 14011 + }, + { + "epoch": 0.7712036986075183, + "grad_norm": 0.9327310919761658, + "learning_rate": 6.783433077872753e-06, + "loss": 0.8737, + "step": 14012 + }, + { + "epoch": 0.7712587374098739, + "grad_norm": 0.5928294062614441, + "learning_rate": 6.783028116213677e-06, + "loss": 0.5819, + "step": 14013 + }, + { + "epoch": 0.7713137762122296, + "grad_norm": 0.6752136945724487, + "learning_rate": 6.782623141153838e-06, + "loss": 0.8021, + "step": 14014 + }, + { + "epoch": 0.7713688150145853, + "grad_norm": 0.6452222466468811, + "learning_rate": 6.78221815269628e-06, + "loss": 0.7806, + "step": 14015 + }, + { + "epoch": 0.7714238538169409, + "grad_norm": 0.7725237607955933, + "learning_rate": 6.78181315084405e-06, + "loss": 0.7679, + "step": 14016 + }, + { + "epoch": 0.7714788926192966, + "grad_norm": 0.6594743728637695, + "learning_rate": 6.781408135600187e-06, + "loss": 0.7254, + "step": 14017 + }, + { + "epoch": 0.7715339314216523, + "grad_norm": 0.7008917927742004, + "learning_rate": 6.7810031069677385e-06, + "loss": 0.705, + "step": 14018 + }, + { + "epoch": 0.771588970224008, + "grad_norm": 0.9435684084892273, + "learning_rate": 6.780598064949746e-06, + "loss": 0.7787, + "step": 14019 + }, + { + "epoch": 0.7716440090263635, + "grad_norm": 0.6615981459617615, + "learning_rate": 6.780193009549256e-06, + "loss": 0.7592, + "step": 14020 + }, + { + "epoch": 0.7716990478287192, + "grad_norm": 0.7042600512504578, + "learning_rate": 6.7797879407693115e-06, + "loss": 0.719, + "step": 14021 + }, + { + "epoch": 0.7717540866310749, + "grad_norm": 0.7135425209999084, + "learning_rate": 6.779382858612957e-06, + "loss": 0.739, + "step": 14022 + }, + { + "epoch": 0.7718091254334306, + "grad_norm": 0.6546016931533813, + "learning_rate": 6.778977763083238e-06, + "loss": 0.7039, + "step": 14023 + }, + { + "epoch": 0.7718641642357862, + "grad_norm": 0.8549250960350037, + "learning_rate": 6.778572654183198e-06, + "loss": 0.8384, + "step": 14024 + }, + { + "epoch": 0.7719192030381419, + "grad_norm": 0.7008731365203857, + "learning_rate": 6.778167531915882e-06, + "loss": 0.776, + "step": 14025 + }, + { + "epoch": 0.7719742418404976, + "grad_norm": 0.7047393321990967, + "learning_rate": 6.7777623962843355e-06, + "loss": 0.819, + "step": 14026 + }, + { + "epoch": 0.7720292806428533, + "grad_norm": 0.7015580534934998, + "learning_rate": 6.777357247291601e-06, + "loss": 0.8339, + "step": 14027 + }, + { + "epoch": 0.7720843194452088, + "grad_norm": 0.7008551955223083, + "learning_rate": 6.776952084940727e-06, + "loss": 0.783, + "step": 14028 + }, + { + "epoch": 0.7721393582475645, + "grad_norm": 1.0310637950897217, + "learning_rate": 6.776546909234757e-06, + "loss": 0.7447, + "step": 14029 + }, + { + "epoch": 0.7721943970499202, + "grad_norm": 0.6264338493347168, + "learning_rate": 6.776141720176734e-06, + "loss": 0.5542, + "step": 14030 + }, + { + "epoch": 0.7722494358522759, + "grad_norm": 0.6249508261680603, + "learning_rate": 6.775736517769707e-06, + "loss": 0.6514, + "step": 14031 + }, + { + "epoch": 0.7723044746546315, + "grad_norm": 0.6741732954978943, + "learning_rate": 6.775331302016719e-06, + "loss": 0.6967, + "step": 14032 + }, + { + "epoch": 0.7723595134569872, + "grad_norm": 0.7342913746833801, + "learning_rate": 6.774926072920815e-06, + "loss": 0.8279, + "step": 14033 + }, + { + "epoch": 0.7724145522593429, + "grad_norm": 0.7702916264533997, + "learning_rate": 6.774520830485044e-06, + "loss": 0.8539, + "step": 14034 + }, + { + "epoch": 0.7724695910616985, + "grad_norm": 0.7873550057411194, + "learning_rate": 6.774115574712448e-06, + "loss": 0.6999, + "step": 14035 + }, + { + "epoch": 0.7725246298640541, + "grad_norm": 0.6832353472709656, + "learning_rate": 6.773710305606074e-06, + "loss": 0.7246, + "step": 14036 + }, + { + "epoch": 0.7725796686664098, + "grad_norm": 0.7547367215156555, + "learning_rate": 6.773305023168969e-06, + "loss": 0.7357, + "step": 14037 + }, + { + "epoch": 0.7726347074687655, + "grad_norm": 0.7146826386451721, + "learning_rate": 6.772899727404178e-06, + "loss": 0.6742, + "step": 14038 + }, + { + "epoch": 0.7726897462711212, + "grad_norm": 0.7623558640480042, + "learning_rate": 6.772494418314748e-06, + "loss": 0.7729, + "step": 14039 + }, + { + "epoch": 0.7727447850734768, + "grad_norm": 0.637706458568573, + "learning_rate": 6.772089095903723e-06, + "loss": 0.6662, + "step": 14040 + }, + { + "epoch": 0.7727998238758325, + "grad_norm": 0.7293589115142822, + "learning_rate": 6.771683760174151e-06, + "loss": 0.7899, + "step": 14041 + }, + { + "epoch": 0.7728548626781881, + "grad_norm": 0.7191390991210938, + "learning_rate": 6.771278411129079e-06, + "loss": 0.6912, + "step": 14042 + }, + { + "epoch": 0.7729099014805438, + "grad_norm": 0.8264575004577637, + "learning_rate": 6.770873048771552e-06, + "loss": 0.7027, + "step": 14043 + }, + { + "epoch": 0.7729649402828994, + "grad_norm": 0.7490931749343872, + "learning_rate": 6.770467673104617e-06, + "loss": 0.6917, + "step": 14044 + }, + { + "epoch": 0.7730199790852551, + "grad_norm": 0.6901552081108093, + "learning_rate": 6.77006228413132e-06, + "loss": 0.8097, + "step": 14045 + }, + { + "epoch": 0.7730750178876108, + "grad_norm": 0.6340280175209045, + "learning_rate": 6.76965688185471e-06, + "loss": 0.6309, + "step": 14046 + }, + { + "epoch": 0.7731300566899664, + "grad_norm": 0.6807279586791992, + "learning_rate": 6.7692514662778315e-06, + "loss": 0.7744, + "step": 14047 + }, + { + "epoch": 0.7731850954923221, + "grad_norm": 1.2796865701675415, + "learning_rate": 6.7688460374037335e-06, + "loss": 0.7499, + "step": 14048 + }, + { + "epoch": 0.7732401342946778, + "grad_norm": 0.7059674263000488, + "learning_rate": 6.768440595235463e-06, + "loss": 0.8705, + "step": 14049 + }, + { + "epoch": 0.7732951730970334, + "grad_norm": 0.7626641392707825, + "learning_rate": 6.768035139776066e-06, + "loss": 0.8448, + "step": 14050 + }, + { + "epoch": 0.773350211899389, + "grad_norm": 0.6590229868888855, + "learning_rate": 6.767629671028588e-06, + "loss": 0.6796, + "step": 14051 + }, + { + "epoch": 0.7734052507017447, + "grad_norm": 0.6702030301094055, + "learning_rate": 6.767224188996081e-06, + "loss": 0.7087, + "step": 14052 + }, + { + "epoch": 0.7734602895041004, + "grad_norm": 0.670612096786499, + "learning_rate": 6.76681869368159e-06, + "loss": 0.7203, + "step": 14053 + }, + { + "epoch": 0.7735153283064561, + "grad_norm": 0.6892215013504028, + "learning_rate": 6.766413185088161e-06, + "loss": 0.6891, + "step": 14054 + }, + { + "epoch": 0.7735703671088117, + "grad_norm": 0.8354474902153015, + "learning_rate": 6.766007663218843e-06, + "loss": 0.7378, + "step": 14055 + }, + { + "epoch": 0.7736254059111674, + "grad_norm": 0.7633876204490662, + "learning_rate": 6.765602128076686e-06, + "loss": 0.6916, + "step": 14056 + }, + { + "epoch": 0.773680444713523, + "grad_norm": 0.7249060869216919, + "learning_rate": 6.765196579664736e-06, + "loss": 0.791, + "step": 14057 + }, + { + "epoch": 0.7737354835158787, + "grad_norm": 0.7033042311668396, + "learning_rate": 6.7647910179860395e-06, + "loss": 0.6799, + "step": 14058 + }, + { + "epoch": 0.7737905223182343, + "grad_norm": 0.7087684273719788, + "learning_rate": 6.7643854430436466e-06, + "loss": 0.6389, + "step": 14059 + }, + { + "epoch": 0.77384556112059, + "grad_norm": 0.6433978080749512, + "learning_rate": 6.763979854840606e-06, + "loss": 0.7214, + "step": 14060 + }, + { + "epoch": 0.7739005999229457, + "grad_norm": 0.7777101993560791, + "learning_rate": 6.763574253379964e-06, + "loss": 0.7458, + "step": 14061 + }, + { + "epoch": 0.7739556387253014, + "grad_norm": 0.7065346240997314, + "learning_rate": 6.763168638664771e-06, + "loss": 0.7663, + "step": 14062 + }, + { + "epoch": 0.774010677527657, + "grad_norm": 0.7136278748512268, + "learning_rate": 6.762763010698074e-06, + "loss": 0.667, + "step": 14063 + }, + { + "epoch": 0.7740657163300126, + "grad_norm": 0.6670508980751038, + "learning_rate": 6.762357369482921e-06, + "loss": 0.7462, + "step": 14064 + }, + { + "epoch": 0.7741207551323683, + "grad_norm": 0.6366799473762512, + "learning_rate": 6.7619517150223635e-06, + "loss": 0.7147, + "step": 14065 + }, + { + "epoch": 0.774175793934724, + "grad_norm": 0.5999431610107422, + "learning_rate": 6.761546047319447e-06, + "loss": 0.667, + "step": 14066 + }, + { + "epoch": 0.7742308327370796, + "grad_norm": 0.6751196980476379, + "learning_rate": 6.761140366377222e-06, + "loss": 0.7255, + "step": 14067 + }, + { + "epoch": 0.7742858715394353, + "grad_norm": 0.6786272525787354, + "learning_rate": 6.760734672198738e-06, + "loss": 0.7694, + "step": 14068 + }, + { + "epoch": 0.774340910341791, + "grad_norm": 0.6915947794914246, + "learning_rate": 6.760328964787044e-06, + "loss": 0.7955, + "step": 14069 + }, + { + "epoch": 0.7743959491441467, + "grad_norm": 0.7041972279548645, + "learning_rate": 6.759923244145188e-06, + "loss": 0.6542, + "step": 14070 + }, + { + "epoch": 0.7744509879465022, + "grad_norm": 0.6384761333465576, + "learning_rate": 6.759517510276221e-06, + "loss": 0.7384, + "step": 14071 + }, + { + "epoch": 0.7745060267488579, + "grad_norm": 0.7430800199508667, + "learning_rate": 6.759111763183189e-06, + "loss": 0.7587, + "step": 14072 + }, + { + "epoch": 0.7745610655512136, + "grad_norm": 0.6568213701248169, + "learning_rate": 6.758706002869146e-06, + "loss": 0.7118, + "step": 14073 + }, + { + "epoch": 0.7746161043535693, + "grad_norm": 0.8791618943214417, + "learning_rate": 6.75830022933714e-06, + "loss": 0.8049, + "step": 14074 + }, + { + "epoch": 0.7746711431559249, + "grad_norm": 0.6377304792404175, + "learning_rate": 6.75789444259022e-06, + "loss": 0.737, + "step": 14075 + }, + { + "epoch": 0.7747261819582806, + "grad_norm": 0.7253721356391907, + "learning_rate": 6.757488642631434e-06, + "loss": 0.8432, + "step": 14076 + }, + { + "epoch": 0.7747812207606363, + "grad_norm": 0.684626042842865, + "learning_rate": 6.757082829463835e-06, + "loss": 0.7845, + "step": 14077 + }, + { + "epoch": 0.774836259562992, + "grad_norm": 0.7737520337104797, + "learning_rate": 6.756677003090471e-06, + "loss": 0.8055, + "step": 14078 + }, + { + "epoch": 0.7748912983653475, + "grad_norm": 0.7294824719429016, + "learning_rate": 6.756271163514394e-06, + "loss": 0.7666, + "step": 14079 + }, + { + "epoch": 0.7749463371677032, + "grad_norm": 0.7728607654571533, + "learning_rate": 6.755865310738651e-06, + "loss": 0.7748, + "step": 14080 + }, + { + "epoch": 0.7750013759700589, + "grad_norm": 0.6738442778587341, + "learning_rate": 6.755459444766297e-06, + "loss": 0.6711, + "step": 14081 + }, + { + "epoch": 0.7750564147724146, + "grad_norm": 0.7041414976119995, + "learning_rate": 6.7550535656003794e-06, + "loss": 0.7126, + "step": 14082 + }, + { + "epoch": 0.7751114535747702, + "grad_norm": 1.0205422639846802, + "learning_rate": 6.754647673243948e-06, + "loss": 0.7394, + "step": 14083 + }, + { + "epoch": 0.7751664923771259, + "grad_norm": 0.6594380736351013, + "learning_rate": 6.754241767700054e-06, + "loss": 0.7599, + "step": 14084 + }, + { + "epoch": 0.7752215311794816, + "grad_norm": 0.6800520420074463, + "learning_rate": 6.753835848971749e-06, + "loss": 0.7579, + "step": 14085 + }, + { + "epoch": 0.7752765699818372, + "grad_norm": 0.7658087611198425, + "learning_rate": 6.7534299170620846e-06, + "loss": 0.7705, + "step": 14086 + }, + { + "epoch": 0.7753316087841928, + "grad_norm": 0.7242750525474548, + "learning_rate": 6.7530239719741084e-06, + "loss": 0.7683, + "step": 14087 + }, + { + "epoch": 0.7753866475865485, + "grad_norm": 0.6997398138046265, + "learning_rate": 6.752618013710874e-06, + "loss": 0.8023, + "step": 14088 + }, + { + "epoch": 0.7754416863889042, + "grad_norm": 0.7041590809822083, + "learning_rate": 6.752212042275431e-06, + "loss": 0.7013, + "step": 14089 + }, + { + "epoch": 0.7754967251912598, + "grad_norm": 0.7027721405029297, + "learning_rate": 6.751806057670832e-06, + "loss": 0.7678, + "step": 14090 + }, + { + "epoch": 0.7755517639936155, + "grad_norm": 0.714290201663971, + "learning_rate": 6.751400059900128e-06, + "loss": 0.6769, + "step": 14091 + }, + { + "epoch": 0.7756068027959712, + "grad_norm": 0.7385110855102539, + "learning_rate": 6.750994048966369e-06, + "loss": 0.6576, + "step": 14092 + }, + { + "epoch": 0.7756618415983269, + "grad_norm": 0.7665147185325623, + "learning_rate": 6.750588024872607e-06, + "loss": 0.8127, + "step": 14093 + }, + { + "epoch": 0.7757168804006824, + "grad_norm": 0.6774508953094482, + "learning_rate": 6.750181987621895e-06, + "loss": 0.8112, + "step": 14094 + }, + { + "epoch": 0.7757719192030381, + "grad_norm": 0.666394054889679, + "learning_rate": 6.749775937217285e-06, + "loss": 0.6444, + "step": 14095 + }, + { + "epoch": 0.7758269580053938, + "grad_norm": 0.6557022929191589, + "learning_rate": 6.749369873661825e-06, + "loss": 0.7613, + "step": 14096 + }, + { + "epoch": 0.7758819968077495, + "grad_norm": 0.7090621590614319, + "learning_rate": 6.74896379695857e-06, + "loss": 0.7229, + "step": 14097 + }, + { + "epoch": 0.7759370356101051, + "grad_norm": 0.8117626309394836, + "learning_rate": 6.7485577071105734e-06, + "loss": 0.8002, + "step": 14098 + }, + { + "epoch": 0.7759920744124608, + "grad_norm": 0.6743370294570923, + "learning_rate": 6.748151604120883e-06, + "loss": 0.7457, + "step": 14099 + }, + { + "epoch": 0.7760471132148165, + "grad_norm": 0.7637452483177185, + "learning_rate": 6.747745487992553e-06, + "loss": 0.7471, + "step": 14100 + }, + { + "epoch": 0.7761021520171721, + "grad_norm": 0.6732922196388245, + "learning_rate": 6.747339358728636e-06, + "loss": 0.7471, + "step": 14101 + }, + { + "epoch": 0.7761571908195277, + "grad_norm": 0.7510336637496948, + "learning_rate": 6.746933216332184e-06, + "loss": 0.7252, + "step": 14102 + }, + { + "epoch": 0.7762122296218834, + "grad_norm": 0.731719434261322, + "learning_rate": 6.746527060806251e-06, + "loss": 0.8706, + "step": 14103 + }, + { + "epoch": 0.7762672684242391, + "grad_norm": 0.7625692486763, + "learning_rate": 6.746120892153886e-06, + "loss": 0.7518, + "step": 14104 + }, + { + "epoch": 0.7763223072265948, + "grad_norm": 0.6809547543525696, + "learning_rate": 6.745714710378145e-06, + "loss": 0.7172, + "step": 14105 + }, + { + "epoch": 0.7763773460289504, + "grad_norm": 0.709996223449707, + "learning_rate": 6.745308515482079e-06, + "loss": 0.7925, + "step": 14106 + }, + { + "epoch": 0.776432384831306, + "grad_norm": 0.6675372123718262, + "learning_rate": 6.744902307468742e-06, + "loss": 0.8175, + "step": 14107 + }, + { + "epoch": 0.7764874236336617, + "grad_norm": 0.6978115439414978, + "learning_rate": 6.744496086341186e-06, + "loss": 0.7895, + "step": 14108 + }, + { + "epoch": 0.7765424624360174, + "grad_norm": 0.6593814492225647, + "learning_rate": 6.7440898521024634e-06, + "loss": 0.7791, + "step": 14109 + }, + { + "epoch": 0.776597501238373, + "grad_norm": 0.7169299721717834, + "learning_rate": 6.743683604755631e-06, + "loss": 0.7944, + "step": 14110 + }, + { + "epoch": 0.7766525400407287, + "grad_norm": 0.6805511713027954, + "learning_rate": 6.743277344303738e-06, + "loss": 0.7671, + "step": 14111 + }, + { + "epoch": 0.7767075788430844, + "grad_norm": 0.7300780415534973, + "learning_rate": 6.742871070749838e-06, + "loss": 0.7789, + "step": 14112 + }, + { + "epoch": 0.7767626176454401, + "grad_norm": 0.6475857496261597, + "learning_rate": 6.742464784096987e-06, + "loss": 0.6652, + "step": 14113 + }, + { + "epoch": 0.7768176564477957, + "grad_norm": 0.6941269040107727, + "learning_rate": 6.742058484348236e-06, + "loss": 0.8138, + "step": 14114 + }, + { + "epoch": 0.7768726952501513, + "grad_norm": 0.6175981760025024, + "learning_rate": 6.7416521715066405e-06, + "loss": 0.7667, + "step": 14115 + }, + { + "epoch": 0.776927734052507, + "grad_norm": 0.6499401330947876, + "learning_rate": 6.741245845575252e-06, + "loss": 0.7415, + "step": 14116 + }, + { + "epoch": 0.7769827728548627, + "grad_norm": 0.6601547598838806, + "learning_rate": 6.740839506557127e-06, + "loss": 0.732, + "step": 14117 + }, + { + "epoch": 0.7770378116572183, + "grad_norm": 0.7939042448997498, + "learning_rate": 6.740433154455319e-06, + "loss": 0.7043, + "step": 14118 + }, + { + "epoch": 0.777092850459574, + "grad_norm": 0.7381628751754761, + "learning_rate": 6.740026789272881e-06, + "loss": 0.8256, + "step": 14119 + }, + { + "epoch": 0.7771478892619297, + "grad_norm": 0.6131769418716431, + "learning_rate": 6.739620411012866e-06, + "loss": 0.726, + "step": 14120 + }, + { + "epoch": 0.7772029280642854, + "grad_norm": 1.201745867729187, + "learning_rate": 6.739214019678332e-06, + "loss": 0.7097, + "step": 14121 + }, + { + "epoch": 0.777257966866641, + "grad_norm": 0.6618456244468689, + "learning_rate": 6.7388076152723295e-06, + "loss": 0.6396, + "step": 14122 + }, + { + "epoch": 0.7773130056689966, + "grad_norm": 0.7490836977958679, + "learning_rate": 6.738401197797915e-06, + "loss": 0.6475, + "step": 14123 + }, + { + "epoch": 0.7773680444713523, + "grad_norm": 0.8125407099723816, + "learning_rate": 6.737994767258142e-06, + "loss": 0.7693, + "step": 14124 + }, + { + "epoch": 0.777423083273708, + "grad_norm": 0.7501794099807739, + "learning_rate": 6.737588323656065e-06, + "loss": 0.7333, + "step": 14125 + }, + { + "epoch": 0.7774781220760636, + "grad_norm": 1.3062889575958252, + "learning_rate": 6.73718186699474e-06, + "loss": 0.6909, + "step": 14126 + }, + { + "epoch": 0.7775331608784193, + "grad_norm": 0.6784525513648987, + "learning_rate": 6.736775397277221e-06, + "loss": 0.7256, + "step": 14127 + }, + { + "epoch": 0.777588199680775, + "grad_norm": 0.7018646597862244, + "learning_rate": 6.736368914506562e-06, + "loss": 0.7632, + "step": 14128 + }, + { + "epoch": 0.7776432384831307, + "grad_norm": 0.7596307992935181, + "learning_rate": 6.735962418685821e-06, + "loss": 0.7117, + "step": 14129 + }, + { + "epoch": 0.7776982772854862, + "grad_norm": 0.7582107186317444, + "learning_rate": 6.7355559098180504e-06, + "loss": 0.7808, + "step": 14130 + }, + { + "epoch": 0.7777533160878419, + "grad_norm": 0.6460647583007812, + "learning_rate": 6.7351493879063056e-06, + "loss": 0.675, + "step": 14131 + }, + { + "epoch": 0.7778083548901976, + "grad_norm": 0.6801304221153259, + "learning_rate": 6.7347428529536415e-06, + "loss": 0.6504, + "step": 14132 + }, + { + "epoch": 0.7778633936925532, + "grad_norm": 0.8122933506965637, + "learning_rate": 6.7343363049631176e-06, + "loss": 0.7949, + "step": 14133 + }, + { + "epoch": 0.7779184324949089, + "grad_norm": 0.6750267744064331, + "learning_rate": 6.733929743937784e-06, + "loss": 0.7689, + "step": 14134 + }, + { + "epoch": 0.7779734712972646, + "grad_norm": 0.7141891121864319, + "learning_rate": 6.7335231698807005e-06, + "loss": 0.7099, + "step": 14135 + }, + { + "epoch": 0.7780285100996203, + "grad_norm": 0.7904065251350403, + "learning_rate": 6.733116582794918e-06, + "loss": 0.8458, + "step": 14136 + }, + { + "epoch": 0.7780835489019758, + "grad_norm": 0.6905248165130615, + "learning_rate": 6.732709982683496e-06, + "loss": 0.7848, + "step": 14137 + }, + { + "epoch": 0.7781385877043315, + "grad_norm": 0.6707245707511902, + "learning_rate": 6.732303369549491e-06, + "loss": 0.8319, + "step": 14138 + }, + { + "epoch": 0.7781936265066872, + "grad_norm": 0.6611519455909729, + "learning_rate": 6.731896743395957e-06, + "loss": 0.7025, + "step": 14139 + }, + { + "epoch": 0.7782486653090429, + "grad_norm": 0.7113156914710999, + "learning_rate": 6.73149010422595e-06, + "loss": 0.8297, + "step": 14140 + }, + { + "epoch": 0.7783037041113985, + "grad_norm": 0.7279486060142517, + "learning_rate": 6.7310834520425265e-06, + "loss": 0.8134, + "step": 14141 + }, + { + "epoch": 0.7783587429137542, + "grad_norm": 0.7561796307563782, + "learning_rate": 6.730676786848744e-06, + "loss": 0.806, + "step": 14142 + }, + { + "epoch": 0.7784137817161099, + "grad_norm": 0.6724728345870972, + "learning_rate": 6.7302701086476585e-06, + "loss": 0.7782, + "step": 14143 + }, + { + "epoch": 0.7784688205184656, + "grad_norm": 0.6363211274147034, + "learning_rate": 6.729863417442325e-06, + "loss": 0.6298, + "step": 14144 + }, + { + "epoch": 0.7785238593208211, + "grad_norm": 0.6920950412750244, + "learning_rate": 6.729456713235803e-06, + "loss": 0.5804, + "step": 14145 + }, + { + "epoch": 0.7785788981231768, + "grad_norm": 0.7388806343078613, + "learning_rate": 6.729049996031145e-06, + "loss": 0.6594, + "step": 14146 + }, + { + "epoch": 0.7786339369255325, + "grad_norm": 0.7736972570419312, + "learning_rate": 6.728643265831412e-06, + "loss": 0.8244, + "step": 14147 + }, + { + "epoch": 0.7786889757278882, + "grad_norm": 0.6928302049636841, + "learning_rate": 6.728236522639658e-06, + "loss": 0.6713, + "step": 14148 + }, + { + "epoch": 0.7787440145302438, + "grad_norm": 0.8058464527130127, + "learning_rate": 6.72782976645894e-06, + "loss": 0.7647, + "step": 14149 + }, + { + "epoch": 0.7787990533325995, + "grad_norm": 0.7111127376556396, + "learning_rate": 6.727422997292317e-06, + "loss": 0.7629, + "step": 14150 + }, + { + "epoch": 0.7788540921349552, + "grad_norm": 0.9375373721122742, + "learning_rate": 6.7270162151428455e-06, + "loss": 0.8306, + "step": 14151 + }, + { + "epoch": 0.7789091309373108, + "grad_norm": 0.6894392371177673, + "learning_rate": 6.726609420013581e-06, + "loss": 0.6995, + "step": 14152 + }, + { + "epoch": 0.7789641697396664, + "grad_norm": 0.7058690786361694, + "learning_rate": 6.726202611907583e-06, + "loss": 0.844, + "step": 14153 + }, + { + "epoch": 0.7790192085420221, + "grad_norm": 0.7672932744026184, + "learning_rate": 6.725795790827909e-06, + "loss": 0.6613, + "step": 14154 + }, + { + "epoch": 0.7790742473443778, + "grad_norm": 0.8575173020362854, + "learning_rate": 6.7253889567776146e-06, + "loss": 0.6946, + "step": 14155 + }, + { + "epoch": 0.7791292861467335, + "grad_norm": 0.6832261085510254, + "learning_rate": 6.724982109759759e-06, + "loss": 0.7121, + "step": 14156 + }, + { + "epoch": 0.7791843249490891, + "grad_norm": 0.8188209533691406, + "learning_rate": 6.724575249777401e-06, + "loss": 0.6479, + "step": 14157 + }, + { + "epoch": 0.7792393637514448, + "grad_norm": 0.6514336466789246, + "learning_rate": 6.724168376833595e-06, + "loss": 0.6117, + "step": 14158 + }, + { + "epoch": 0.7792944025538004, + "grad_norm": 0.7283767461776733, + "learning_rate": 6.723761490931403e-06, + "loss": 0.6882, + "step": 14159 + }, + { + "epoch": 0.7793494413561561, + "grad_norm": 0.7681146860122681, + "learning_rate": 6.7233545920738785e-06, + "loss": 0.8028, + "step": 14160 + }, + { + "epoch": 0.7794044801585117, + "grad_norm": 0.6202995181083679, + "learning_rate": 6.722947680264084e-06, + "loss": 0.713, + "step": 14161 + }, + { + "epoch": 0.7794595189608674, + "grad_norm": 0.7137139439582825, + "learning_rate": 6.722540755505076e-06, + "loss": 0.7842, + "step": 14162 + }, + { + "epoch": 0.7795145577632231, + "grad_norm": 0.6852554678916931, + "learning_rate": 6.722133817799913e-06, + "loss": 0.7329, + "step": 14163 + }, + { + "epoch": 0.7795695965655788, + "grad_norm": 0.7520774602890015, + "learning_rate": 6.7217268671516525e-06, + "loss": 0.7498, + "step": 14164 + }, + { + "epoch": 0.7796246353679344, + "grad_norm": 0.708577573299408, + "learning_rate": 6.7213199035633525e-06, + "loss": 0.675, + "step": 14165 + }, + { + "epoch": 0.77967967417029, + "grad_norm": 0.8061410188674927, + "learning_rate": 6.7209129270380744e-06, + "loss": 0.7176, + "step": 14166 + }, + { + "epoch": 0.7797347129726457, + "grad_norm": 0.8070787787437439, + "learning_rate": 6.720505937578876e-06, + "loss": 0.8138, + "step": 14167 + }, + { + "epoch": 0.7797897517750014, + "grad_norm": 0.7127004265785217, + "learning_rate": 6.720098935188815e-06, + "loss": 0.7004, + "step": 14168 + }, + { + "epoch": 0.779844790577357, + "grad_norm": 0.7188708782196045, + "learning_rate": 6.719691919870951e-06, + "loss": 0.6996, + "step": 14169 + }, + { + "epoch": 0.7798998293797127, + "grad_norm": 0.6346360445022583, + "learning_rate": 6.719284891628342e-06, + "loss": 0.7349, + "step": 14170 + }, + { + "epoch": 0.7799548681820684, + "grad_norm": 0.6262187361717224, + "learning_rate": 6.71887785046405e-06, + "loss": 0.7279, + "step": 14171 + }, + { + "epoch": 0.7800099069844241, + "grad_norm": 0.7538053393363953, + "learning_rate": 6.718470796381129e-06, + "loss": 0.754, + "step": 14172 + }, + { + "epoch": 0.7800649457867797, + "grad_norm": 0.6569569706916809, + "learning_rate": 6.718063729382643e-06, + "loss": 0.6787, + "step": 14173 + }, + { + "epoch": 0.7801199845891353, + "grad_norm": 0.6446678042411804, + "learning_rate": 6.71765664947165e-06, + "loss": 0.6338, + "step": 14174 + }, + { + "epoch": 0.780175023391491, + "grad_norm": 0.7559269666671753, + "learning_rate": 6.7172495566512095e-06, + "loss": 0.7472, + "step": 14175 + }, + { + "epoch": 0.7802300621938466, + "grad_norm": 0.6920101642608643, + "learning_rate": 6.71684245092438e-06, + "loss": 0.7189, + "step": 14176 + }, + { + "epoch": 0.7802851009962023, + "grad_norm": 0.6513105034828186, + "learning_rate": 6.716435332294223e-06, + "loss": 0.6104, + "step": 14177 + }, + { + "epoch": 0.780340139798558, + "grad_norm": 0.7076418399810791, + "learning_rate": 6.716028200763798e-06, + "loss": 0.7974, + "step": 14178 + }, + { + "epoch": 0.7803951786009137, + "grad_norm": 0.7291662693023682, + "learning_rate": 6.715621056336164e-06, + "loss": 0.7661, + "step": 14179 + }, + { + "epoch": 0.7804502174032693, + "grad_norm": 0.682321310043335, + "learning_rate": 6.715213899014381e-06, + "loss": 0.7345, + "step": 14180 + }, + { + "epoch": 0.7805052562056249, + "grad_norm": 0.7170400619506836, + "learning_rate": 6.71480672880151e-06, + "loss": 0.6968, + "step": 14181 + }, + { + "epoch": 0.7805602950079806, + "grad_norm": 0.7504192590713501, + "learning_rate": 6.714399545700611e-06, + "loss": 0.7868, + "step": 14182 + }, + { + "epoch": 0.7806153338103363, + "grad_norm": 0.7334801554679871, + "learning_rate": 6.713992349714744e-06, + "loss": 0.8806, + "step": 14183 + }, + { + "epoch": 0.7806703726126919, + "grad_norm": 0.6495537161827087, + "learning_rate": 6.713585140846969e-06, + "loss": 0.7272, + "step": 14184 + }, + { + "epoch": 0.7807254114150476, + "grad_norm": 0.7101101279258728, + "learning_rate": 6.713177919100347e-06, + "loss": 0.8038, + "step": 14185 + }, + { + "epoch": 0.7807804502174033, + "grad_norm": 0.7013083100318909, + "learning_rate": 6.712770684477937e-06, + "loss": 0.7576, + "step": 14186 + }, + { + "epoch": 0.780835489019759, + "grad_norm": 0.7535369992256165, + "learning_rate": 6.712363436982802e-06, + "loss": 0.6537, + "step": 14187 + }, + { + "epoch": 0.7808905278221145, + "grad_norm": 0.7432667016983032, + "learning_rate": 6.711956176618001e-06, + "loss": 0.7734, + "step": 14188 + }, + { + "epoch": 0.7809455666244702, + "grad_norm": 0.718006432056427, + "learning_rate": 6.711548903386597e-06, + "loss": 0.7291, + "step": 14189 + }, + { + "epoch": 0.7810006054268259, + "grad_norm": 0.7983072400093079, + "learning_rate": 6.711141617291649e-06, + "loss": 0.8403, + "step": 14190 + }, + { + "epoch": 0.7810556442291816, + "grad_norm": 0.7017259001731873, + "learning_rate": 6.710734318336218e-06, + "loss": 0.7293, + "step": 14191 + }, + { + "epoch": 0.7811106830315372, + "grad_norm": 0.6061737537384033, + "learning_rate": 6.710327006523366e-06, + "loss": 0.6624, + "step": 14192 + }, + { + "epoch": 0.7811657218338929, + "grad_norm": 0.6876726746559143, + "learning_rate": 6.709919681856155e-06, + "loss": 0.723, + "step": 14193 + }, + { + "epoch": 0.7812207606362486, + "grad_norm": 0.6926757097244263, + "learning_rate": 6.709512344337646e-06, + "loss": 0.7392, + "step": 14194 + }, + { + "epoch": 0.7812757994386043, + "grad_norm": 0.6464381217956543, + "learning_rate": 6.7091049939708985e-06, + "loss": 0.7301, + "step": 14195 + }, + { + "epoch": 0.7813308382409598, + "grad_norm": 0.7292629480361938, + "learning_rate": 6.708697630758974e-06, + "loss": 0.7511, + "step": 14196 + }, + { + "epoch": 0.7813858770433155, + "grad_norm": 0.7483099102973938, + "learning_rate": 6.708290254704937e-06, + "loss": 0.7981, + "step": 14197 + }, + { + "epoch": 0.7814409158456712, + "grad_norm": 0.6766877770423889, + "learning_rate": 6.707882865811848e-06, + "loss": 0.7987, + "step": 14198 + }, + { + "epoch": 0.7814959546480269, + "grad_norm": 0.7340181469917297, + "learning_rate": 6.707475464082769e-06, + "loss": 0.799, + "step": 14199 + }, + { + "epoch": 0.7815509934503825, + "grad_norm": 0.6247759461402893, + "learning_rate": 6.707068049520759e-06, + "loss": 0.7299, + "step": 14200 + }, + { + "epoch": 0.7816060322527382, + "grad_norm": 0.6783067584037781, + "learning_rate": 6.706660622128885e-06, + "loss": 0.6987, + "step": 14201 + }, + { + "epoch": 0.7816610710550939, + "grad_norm": 0.7613719701766968, + "learning_rate": 6.706253181910205e-06, + "loss": 0.7894, + "step": 14202 + }, + { + "epoch": 0.7817161098574495, + "grad_norm": 0.6673761606216431, + "learning_rate": 6.705845728867784e-06, + "loss": 0.8015, + "step": 14203 + }, + { + "epoch": 0.7817711486598051, + "grad_norm": 0.6551307439804077, + "learning_rate": 6.705438263004683e-06, + "loss": 0.7057, + "step": 14204 + }, + { + "epoch": 0.7818261874621608, + "grad_norm": 0.6815405488014221, + "learning_rate": 6.705030784323965e-06, + "loss": 0.7466, + "step": 14205 + }, + { + "epoch": 0.7818812262645165, + "grad_norm": 0.6838087439537048, + "learning_rate": 6.704623292828692e-06, + "loss": 0.8226, + "step": 14206 + }, + { + "epoch": 0.7819362650668722, + "grad_norm": 0.6704637408256531, + "learning_rate": 6.704215788521925e-06, + "loss": 0.8101, + "step": 14207 + }, + { + "epoch": 0.7819913038692278, + "grad_norm": 0.6606172919273376, + "learning_rate": 6.70380827140673e-06, + "loss": 0.7824, + "step": 14208 + }, + { + "epoch": 0.7820463426715835, + "grad_norm": 0.6641090512275696, + "learning_rate": 6.703400741486166e-06, + "loss": 0.7507, + "step": 14209 + }, + { + "epoch": 0.7821013814739392, + "grad_norm": 1.6413429975509644, + "learning_rate": 6.702993198763299e-06, + "loss": 0.7793, + "step": 14210 + }, + { + "epoch": 0.7821564202762948, + "grad_norm": 0.6664854884147644, + "learning_rate": 6.7025856432411915e-06, + "loss": 0.7304, + "step": 14211 + }, + { + "epoch": 0.7822114590786504, + "grad_norm": 0.6968172192573547, + "learning_rate": 6.7021780749229075e-06, + "loss": 0.7506, + "step": 14212 + }, + { + "epoch": 0.7822664978810061, + "grad_norm": 0.6443943381309509, + "learning_rate": 6.701770493811506e-06, + "loss": 0.7511, + "step": 14213 + }, + { + "epoch": 0.7823215366833618, + "grad_norm": 0.67723548412323, + "learning_rate": 6.701362899910053e-06, + "loss": 0.6839, + "step": 14214 + }, + { + "epoch": 0.7823765754857175, + "grad_norm": 0.7601221203804016, + "learning_rate": 6.700955293221614e-06, + "loss": 0.7397, + "step": 14215 + }, + { + "epoch": 0.7824316142880731, + "grad_norm": 0.6056920289993286, + "learning_rate": 6.700547673749249e-06, + "loss": 0.7706, + "step": 14216 + }, + { + "epoch": 0.7824866530904288, + "grad_norm": 0.6421142816543579, + "learning_rate": 6.700140041496024e-06, + "loss": 0.7209, + "step": 14217 + }, + { + "epoch": 0.7825416918927844, + "grad_norm": 0.6653133034706116, + "learning_rate": 6.6997323964650005e-06, + "loss": 0.708, + "step": 14218 + }, + { + "epoch": 0.78259673069514, + "grad_norm": 0.8854939937591553, + "learning_rate": 6.699324738659243e-06, + "loss": 0.7658, + "step": 14219 + }, + { + "epoch": 0.7826517694974957, + "grad_norm": 0.7130745649337769, + "learning_rate": 6.6989170680818175e-06, + "loss": 0.7827, + "step": 14220 + }, + { + "epoch": 0.7827068082998514, + "grad_norm": 0.953117847442627, + "learning_rate": 6.698509384735783e-06, + "loss": 0.7852, + "step": 14221 + }, + { + "epoch": 0.7827618471022071, + "grad_norm": 0.655768871307373, + "learning_rate": 6.698101688624209e-06, + "loss": 0.8461, + "step": 14222 + }, + { + "epoch": 0.7828168859045627, + "grad_norm": 0.656775951385498, + "learning_rate": 6.6976939797501575e-06, + "loss": 0.7254, + "step": 14223 + }, + { + "epoch": 0.7828719247069184, + "grad_norm": 0.6901991963386536, + "learning_rate": 6.697286258116691e-06, + "loss": 0.7242, + "step": 14224 + }, + { + "epoch": 0.782926963509274, + "grad_norm": 0.8289571404457092, + "learning_rate": 6.696878523726875e-06, + "loss": 0.8578, + "step": 14225 + }, + { + "epoch": 0.7829820023116297, + "grad_norm": 0.6268846392631531, + "learning_rate": 6.696470776583775e-06, + "loss": 0.737, + "step": 14226 + }, + { + "epoch": 0.7830370411139853, + "grad_norm": 0.7026770114898682, + "learning_rate": 6.696063016690455e-06, + "loss": 0.6771, + "step": 14227 + }, + { + "epoch": 0.783092079916341, + "grad_norm": 0.7377839088439941, + "learning_rate": 6.69565524404998e-06, + "loss": 0.7174, + "step": 14228 + }, + { + "epoch": 0.7831471187186967, + "grad_norm": 0.6778523921966553, + "learning_rate": 6.695247458665414e-06, + "loss": 0.8255, + "step": 14229 + }, + { + "epoch": 0.7832021575210524, + "grad_norm": 0.7624330520629883, + "learning_rate": 6.69483966053982e-06, + "loss": 0.7495, + "step": 14230 + }, + { + "epoch": 0.783257196323408, + "grad_norm": 0.8944052457809448, + "learning_rate": 6.694431849676267e-06, + "loss": 0.868, + "step": 14231 + }, + { + "epoch": 0.7833122351257636, + "grad_norm": 0.7391701936721802, + "learning_rate": 6.694024026077816e-06, + "loss": 0.7032, + "step": 14232 + }, + { + "epoch": 0.7833672739281193, + "grad_norm": 0.7548620104789734, + "learning_rate": 6.693616189747535e-06, + "loss": 0.8272, + "step": 14233 + }, + { + "epoch": 0.783422312730475, + "grad_norm": 0.6795994639396667, + "learning_rate": 6.693208340688489e-06, + "loss": 0.703, + "step": 14234 + }, + { + "epoch": 0.7834773515328306, + "grad_norm": 0.6580816507339478, + "learning_rate": 6.69280047890374e-06, + "loss": 0.7454, + "step": 14235 + }, + { + "epoch": 0.7835323903351863, + "grad_norm": 0.7124443650245667, + "learning_rate": 6.6923926043963576e-06, + "loss": 0.6655, + "step": 14236 + }, + { + "epoch": 0.783587429137542, + "grad_norm": 0.6730241179466248, + "learning_rate": 6.691984717169404e-06, + "loss": 0.7522, + "step": 14237 + }, + { + "epoch": 0.7836424679398977, + "grad_norm": 0.8156033158302307, + "learning_rate": 6.6915768172259466e-06, + "loss": 0.8955, + "step": 14238 + }, + { + "epoch": 0.7836975067422532, + "grad_norm": 0.8041443228721619, + "learning_rate": 6.6911689045690506e-06, + "loss": 0.8019, + "step": 14239 + }, + { + "epoch": 0.7837525455446089, + "grad_norm": 0.7252053618431091, + "learning_rate": 6.690760979201782e-06, + "loss": 0.7014, + "step": 14240 + }, + { + "epoch": 0.7838075843469646, + "grad_norm": 0.6969071626663208, + "learning_rate": 6.690353041127208e-06, + "loss": 0.7304, + "step": 14241 + }, + { + "epoch": 0.7838626231493203, + "grad_norm": 0.8254885673522949, + "learning_rate": 6.6899450903483906e-06, + "loss": 0.7193, + "step": 14242 + }, + { + "epoch": 0.7839176619516759, + "grad_norm": 0.7426590323448181, + "learning_rate": 6.6895371268684e-06, + "loss": 0.697, + "step": 14243 + }, + { + "epoch": 0.7839727007540316, + "grad_norm": 0.6744338274002075, + "learning_rate": 6.6891291506903e-06, + "loss": 0.8363, + "step": 14244 + }, + { + "epoch": 0.7840277395563873, + "grad_norm": 0.6609839797019958, + "learning_rate": 6.688721161817156e-06, + "loss": 0.7756, + "step": 14245 + }, + { + "epoch": 0.784082778358743, + "grad_norm": 0.8377131223678589, + "learning_rate": 6.688313160252038e-06, + "loss": 0.8355, + "step": 14246 + }, + { + "epoch": 0.7841378171610985, + "grad_norm": 0.6922308802604675, + "learning_rate": 6.687905145998009e-06, + "loss": 0.756, + "step": 14247 + }, + { + "epoch": 0.7841928559634542, + "grad_norm": 0.7217739820480347, + "learning_rate": 6.687497119058137e-06, + "loss": 0.7309, + "step": 14248 + }, + { + "epoch": 0.7842478947658099, + "grad_norm": 0.6906038522720337, + "learning_rate": 6.687089079435488e-06, + "loss": 0.6645, + "step": 14249 + }, + { + "epoch": 0.7843029335681656, + "grad_norm": 0.6800183057785034, + "learning_rate": 6.6866810271331305e-06, + "loss": 0.6791, + "step": 14250 + }, + { + "epoch": 0.7843579723705212, + "grad_norm": 0.6835503578186035, + "learning_rate": 6.686272962154129e-06, + "loss": 0.699, + "step": 14251 + }, + { + "epoch": 0.7844130111728769, + "grad_norm": 0.6643723845481873, + "learning_rate": 6.685864884501552e-06, + "loss": 0.7808, + "step": 14252 + }, + { + "epoch": 0.7844680499752326, + "grad_norm": 0.6742954850196838, + "learning_rate": 6.685456794178464e-06, + "loss": 0.7704, + "step": 14253 + }, + { + "epoch": 0.7845230887775883, + "grad_norm": 0.6374711990356445, + "learning_rate": 6.6850486911879355e-06, + "loss": 0.7557, + "step": 14254 + }, + { + "epoch": 0.7845781275799438, + "grad_norm": 0.7354347109794617, + "learning_rate": 6.684640575533031e-06, + "loss": 0.7928, + "step": 14255 + }, + { + "epoch": 0.7846331663822995, + "grad_norm": 0.6694937348365784, + "learning_rate": 6.684232447216821e-06, + "loss": 0.7247, + "step": 14256 + }, + { + "epoch": 0.7846882051846552, + "grad_norm": 0.716623842716217, + "learning_rate": 6.683824306242368e-06, + "loss": 0.8638, + "step": 14257 + }, + { + "epoch": 0.7847432439870109, + "grad_norm": 0.667164146900177, + "learning_rate": 6.683416152612743e-06, + "loss": 0.7455, + "step": 14258 + }, + { + "epoch": 0.7847982827893665, + "grad_norm": 0.7302100658416748, + "learning_rate": 6.683007986331014e-06, + "loss": 0.707, + "step": 14259 + }, + { + "epoch": 0.7848533215917222, + "grad_norm": 0.7605045437812805, + "learning_rate": 6.682599807400246e-06, + "loss": 0.7727, + "step": 14260 + }, + { + "epoch": 0.7849083603940779, + "grad_norm": 0.6819437146186829, + "learning_rate": 6.682191615823508e-06, + "loss": 0.7538, + "step": 14261 + }, + { + "epoch": 0.7849633991964334, + "grad_norm": 0.7399439811706543, + "learning_rate": 6.6817834116038695e-06, + "loss": 0.7499, + "step": 14262 + }, + { + "epoch": 0.7850184379987891, + "grad_norm": 0.7864901423454285, + "learning_rate": 6.681375194744397e-06, + "loss": 0.7128, + "step": 14263 + }, + { + "epoch": 0.7850734768011448, + "grad_norm": 0.7308626174926758, + "learning_rate": 6.680966965248159e-06, + "loss": 0.7239, + "step": 14264 + }, + { + "epoch": 0.7851285156035005, + "grad_norm": 0.6553478837013245, + "learning_rate": 6.680558723118222e-06, + "loss": 0.6984, + "step": 14265 + }, + { + "epoch": 0.7851835544058561, + "grad_norm": 0.621415376663208, + "learning_rate": 6.680150468357656e-06, + "loss": 0.6428, + "step": 14266 + }, + { + "epoch": 0.7852385932082118, + "grad_norm": 1.0505764484405518, + "learning_rate": 6.679742200969529e-06, + "loss": 0.8073, + "step": 14267 + }, + { + "epoch": 0.7852936320105675, + "grad_norm": 0.7393355369567871, + "learning_rate": 6.67933392095691e-06, + "loss": 0.7396, + "step": 14268 + }, + { + "epoch": 0.7853486708129231, + "grad_norm": 0.7346563935279846, + "learning_rate": 6.678925628322864e-06, + "loss": 0.7398, + "step": 14269 + }, + { + "epoch": 0.7854037096152787, + "grad_norm": 0.6694674491882324, + "learning_rate": 6.678517323070465e-06, + "loss": 0.7346, + "step": 14270 + }, + { + "epoch": 0.7854587484176344, + "grad_norm": 0.6907033920288086, + "learning_rate": 6.678109005202779e-06, + "loss": 0.7617, + "step": 14271 + }, + { + "epoch": 0.7855137872199901, + "grad_norm": 0.6588131189346313, + "learning_rate": 6.677700674722873e-06, + "loss": 0.7514, + "step": 14272 + }, + { + "epoch": 0.7855688260223458, + "grad_norm": 0.6535136699676514, + "learning_rate": 6.677292331633819e-06, + "loss": 0.7154, + "step": 14273 + }, + { + "epoch": 0.7856238648247014, + "grad_norm": 0.7013682723045349, + "learning_rate": 6.676883975938685e-06, + "loss": 0.8506, + "step": 14274 + }, + { + "epoch": 0.7856789036270571, + "grad_norm": 0.7128416895866394, + "learning_rate": 6.67647560764054e-06, + "loss": 0.7669, + "step": 14275 + }, + { + "epoch": 0.7857339424294127, + "grad_norm": 0.7021318674087524, + "learning_rate": 6.676067226742453e-06, + "loss": 0.8236, + "step": 14276 + }, + { + "epoch": 0.7857889812317684, + "grad_norm": 0.7067561745643616, + "learning_rate": 6.675658833247493e-06, + "loss": 0.6848, + "step": 14277 + }, + { + "epoch": 0.785844020034124, + "grad_norm": 0.6488254070281982, + "learning_rate": 6.675250427158731e-06, + "loss": 0.7877, + "step": 14278 + }, + { + "epoch": 0.7858990588364797, + "grad_norm": 0.7153946757316589, + "learning_rate": 6.674842008479234e-06, + "loss": 0.7994, + "step": 14279 + }, + { + "epoch": 0.7859540976388354, + "grad_norm": 0.7290914058685303, + "learning_rate": 6.6744335772120735e-06, + "loss": 0.8074, + "step": 14280 + }, + { + "epoch": 0.7860091364411911, + "grad_norm": 0.726309061050415, + "learning_rate": 6.674025133360316e-06, + "loss": 0.7789, + "step": 14281 + }, + { + "epoch": 0.7860641752435467, + "grad_norm": 0.6294347047805786, + "learning_rate": 6.673616676927037e-06, + "loss": 0.6405, + "step": 14282 + }, + { + "epoch": 0.7861192140459023, + "grad_norm": 0.654400646686554, + "learning_rate": 6.673208207915302e-06, + "loss": 0.7876, + "step": 14283 + }, + { + "epoch": 0.786174252848258, + "grad_norm": 0.6729328632354736, + "learning_rate": 6.672799726328182e-06, + "loss": 0.7773, + "step": 14284 + }, + { + "epoch": 0.7862292916506137, + "grad_norm": 0.7607905268669128, + "learning_rate": 6.672391232168745e-06, + "loss": 0.8262, + "step": 14285 + }, + { + "epoch": 0.7862843304529693, + "grad_norm": 0.6475018858909607, + "learning_rate": 6.671982725440065e-06, + "loss": 0.7383, + "step": 14286 + }, + { + "epoch": 0.786339369255325, + "grad_norm": 0.8290789723396301, + "learning_rate": 6.671574206145211e-06, + "loss": 0.7968, + "step": 14287 + }, + { + "epoch": 0.7863944080576807, + "grad_norm": 0.7462177872657776, + "learning_rate": 6.671165674287252e-06, + "loss": 0.7465, + "step": 14288 + }, + { + "epoch": 0.7864494468600364, + "grad_norm": 0.7029373049736023, + "learning_rate": 6.6707571298692595e-06, + "loss": 0.7342, + "step": 14289 + }, + { + "epoch": 0.786504485662392, + "grad_norm": 0.8253761529922485, + "learning_rate": 6.670348572894303e-06, + "loss": 0.8196, + "step": 14290 + }, + { + "epoch": 0.7865595244647476, + "grad_norm": 0.7234970331192017, + "learning_rate": 6.669940003365455e-06, + "loss": 0.7966, + "step": 14291 + }, + { + "epoch": 0.7866145632671033, + "grad_norm": 0.8699348568916321, + "learning_rate": 6.6695314212857845e-06, + "loss": 0.8761, + "step": 14292 + }, + { + "epoch": 0.786669602069459, + "grad_norm": 0.6620158553123474, + "learning_rate": 6.66912282665836e-06, + "loss": 0.7534, + "step": 14293 + }, + { + "epoch": 0.7867246408718146, + "grad_norm": 0.6469776630401611, + "learning_rate": 6.668714219486259e-06, + "loss": 0.7812, + "step": 14294 + }, + { + "epoch": 0.7867796796741703, + "grad_norm": 0.6477407813072205, + "learning_rate": 6.668305599772546e-06, + "loss": 0.7144, + "step": 14295 + }, + { + "epoch": 0.786834718476526, + "grad_norm": 0.6626473665237427, + "learning_rate": 6.667896967520297e-06, + "loss": 0.7283, + "step": 14296 + }, + { + "epoch": 0.7868897572788817, + "grad_norm": 0.6214945316314697, + "learning_rate": 6.667488322732578e-06, + "loss": 0.6835, + "step": 14297 + }, + { + "epoch": 0.7869447960812372, + "grad_norm": 0.6199555397033691, + "learning_rate": 6.667079665412465e-06, + "loss": 0.706, + "step": 14298 + }, + { + "epoch": 0.7869998348835929, + "grad_norm": 0.8127612471580505, + "learning_rate": 6.666670995563027e-06, + "loss": 0.7099, + "step": 14299 + }, + { + "epoch": 0.7870548736859486, + "grad_norm": 0.6241362690925598, + "learning_rate": 6.6662623131873374e-06, + "loss": 0.7076, + "step": 14300 + }, + { + "epoch": 0.7871099124883043, + "grad_norm": 0.7260692715644836, + "learning_rate": 6.665853618288465e-06, + "loss": 0.7842, + "step": 14301 + }, + { + "epoch": 0.7871649512906599, + "grad_norm": 0.6644107103347778, + "learning_rate": 6.665444910869482e-06, + "loss": 0.6515, + "step": 14302 + }, + { + "epoch": 0.7872199900930156, + "grad_norm": 0.6629641056060791, + "learning_rate": 6.6650361909334616e-06, + "loss": 0.7062, + "step": 14303 + }, + { + "epoch": 0.7872750288953713, + "grad_norm": 0.6616516709327698, + "learning_rate": 6.6646274584834745e-06, + "loss": 0.8195, + "step": 14304 + }, + { + "epoch": 0.7873300676977268, + "grad_norm": 0.7184805870056152, + "learning_rate": 6.664218713522593e-06, + "loss": 0.8699, + "step": 14305 + }, + { + "epoch": 0.7873851065000825, + "grad_norm": 0.6567219495773315, + "learning_rate": 6.6638099560538905e-06, + "loss": 0.7679, + "step": 14306 + }, + { + "epoch": 0.7874401453024382, + "grad_norm": 0.6952399611473083, + "learning_rate": 6.663401186080436e-06, + "loss": 0.603, + "step": 14307 + }, + { + "epoch": 0.7874951841047939, + "grad_norm": 0.7298767566680908, + "learning_rate": 6.662992403605304e-06, + "loss": 0.7655, + "step": 14308 + }, + { + "epoch": 0.7875502229071495, + "grad_norm": 0.7162219882011414, + "learning_rate": 6.662583608631567e-06, + "loss": 0.7797, + "step": 14309 + }, + { + "epoch": 0.7876052617095052, + "grad_norm": 0.6489827036857605, + "learning_rate": 6.662174801162296e-06, + "loss": 0.8165, + "step": 14310 + }, + { + "epoch": 0.7876603005118609, + "grad_norm": 0.7893611192703247, + "learning_rate": 6.6617659812005635e-06, + "loss": 0.8082, + "step": 14311 + }, + { + "epoch": 0.7877153393142166, + "grad_norm": 0.6709675192832947, + "learning_rate": 6.661357148749443e-06, + "loss": 0.7549, + "step": 14312 + }, + { + "epoch": 0.7877703781165721, + "grad_norm": 0.6166689991950989, + "learning_rate": 6.660948303812009e-06, + "loss": 0.7116, + "step": 14313 + }, + { + "epoch": 0.7878254169189278, + "grad_norm": 0.7941738367080688, + "learning_rate": 6.660539446391329e-06, + "loss": 0.7981, + "step": 14314 + }, + { + "epoch": 0.7878804557212835, + "grad_norm": 0.6339346170425415, + "learning_rate": 6.660130576490481e-06, + "loss": 0.7306, + "step": 14315 + }, + { + "epoch": 0.7879354945236392, + "grad_norm": 0.7044192552566528, + "learning_rate": 6.659721694112535e-06, + "loss": 0.7811, + "step": 14316 + }, + { + "epoch": 0.7879905333259948, + "grad_norm": 0.7853406071662903, + "learning_rate": 6.659312799260565e-06, + "loss": 0.7652, + "step": 14317 + }, + { + "epoch": 0.7880455721283505, + "grad_norm": 0.7076637148857117, + "learning_rate": 6.658903891937645e-06, + "loss": 0.7672, + "step": 14318 + }, + { + "epoch": 0.7881006109307062, + "grad_norm": 0.7043278813362122, + "learning_rate": 6.658494972146847e-06, + "loss": 0.726, + "step": 14319 + }, + { + "epoch": 0.7881556497330618, + "grad_norm": 0.8903809785842896, + "learning_rate": 6.658086039891245e-06, + "loss": 0.8, + "step": 14320 + }, + { + "epoch": 0.7882106885354174, + "grad_norm": 0.8239984512329102, + "learning_rate": 6.657677095173911e-06, + "loss": 0.7283, + "step": 14321 + }, + { + "epoch": 0.7882657273377731, + "grad_norm": 0.7221176028251648, + "learning_rate": 6.6572681379979206e-06, + "loss": 0.8058, + "step": 14322 + }, + { + "epoch": 0.7883207661401288, + "grad_norm": 0.8297285437583923, + "learning_rate": 6.6568591683663475e-06, + "loss": 0.8064, + "step": 14323 + }, + { + "epoch": 0.7883758049424845, + "grad_norm": 0.680659294128418, + "learning_rate": 6.656450186282264e-06, + "loss": 0.7259, + "step": 14324 + }, + { + "epoch": 0.7884308437448401, + "grad_norm": 0.7067807912826538, + "learning_rate": 6.656041191748744e-06, + "loss": 0.8414, + "step": 14325 + }, + { + "epoch": 0.7884858825471958, + "grad_norm": 0.6053900718688965, + "learning_rate": 6.655632184768861e-06, + "loss": 0.6762, + "step": 14326 + }, + { + "epoch": 0.7885409213495514, + "grad_norm": 0.7123621106147766, + "learning_rate": 6.65522316534569e-06, + "loss": 0.6968, + "step": 14327 + }, + { + "epoch": 0.7885959601519071, + "grad_norm": 0.7308228015899658, + "learning_rate": 6.6548141334823045e-06, + "loss": 0.6715, + "step": 14328 + }, + { + "epoch": 0.7886509989542627, + "grad_norm": 0.7508199214935303, + "learning_rate": 6.654405089181779e-06, + "loss": 0.7884, + "step": 14329 + }, + { + "epoch": 0.7887060377566184, + "grad_norm": 0.7317141890525818, + "learning_rate": 6.653996032447188e-06, + "loss": 0.7319, + "step": 14330 + }, + { + "epoch": 0.7887610765589741, + "grad_norm": 0.6797091364860535, + "learning_rate": 6.653586963281607e-06, + "loss": 0.7898, + "step": 14331 + }, + { + "epoch": 0.7888161153613298, + "grad_norm": 0.6293582320213318, + "learning_rate": 6.6531778816881065e-06, + "loss": 0.6784, + "step": 14332 + }, + { + "epoch": 0.7888711541636854, + "grad_norm": 0.7604238986968994, + "learning_rate": 6.652768787669763e-06, + "loss": 0.7226, + "step": 14333 + }, + { + "epoch": 0.788926192966041, + "grad_norm": 0.6921128034591675, + "learning_rate": 6.652359681229654e-06, + "loss": 0.7375, + "step": 14334 + }, + { + "epoch": 0.7889812317683967, + "grad_norm": 0.6532993316650391, + "learning_rate": 6.651950562370851e-06, + "loss": 0.703, + "step": 14335 + }, + { + "epoch": 0.7890362705707524, + "grad_norm": 0.6739360094070435, + "learning_rate": 6.651541431096431e-06, + "loss": 0.7488, + "step": 14336 + }, + { + "epoch": 0.789091309373108, + "grad_norm": 0.7503200173377991, + "learning_rate": 6.651132287409466e-06, + "loss": 0.7492, + "step": 14337 + }, + { + "epoch": 0.7891463481754637, + "grad_norm": 0.6537551879882812, + "learning_rate": 6.650723131313035e-06, + "loss": 0.723, + "step": 14338 + }, + { + "epoch": 0.7892013869778194, + "grad_norm": 0.6378511786460876, + "learning_rate": 6.650313962810208e-06, + "loss": 0.7764, + "step": 14339 + }, + { + "epoch": 0.7892564257801751, + "grad_norm": 0.7948685884475708, + "learning_rate": 6.649904781904065e-06, + "loss": 0.7996, + "step": 14340 + }, + { + "epoch": 0.7893114645825307, + "grad_norm": 0.7558071613311768, + "learning_rate": 6.649495588597678e-06, + "loss": 0.8249, + "step": 14341 + }, + { + "epoch": 0.7893665033848863, + "grad_norm": 0.7158063054084778, + "learning_rate": 6.649086382894124e-06, + "loss": 0.815, + "step": 14342 + }, + { + "epoch": 0.789421542187242, + "grad_norm": 0.7551599144935608, + "learning_rate": 6.648677164796479e-06, + "loss": 0.7151, + "step": 14343 + }, + { + "epoch": 0.7894765809895977, + "grad_norm": 0.6966339349746704, + "learning_rate": 6.648267934307817e-06, + "loss": 0.8057, + "step": 14344 + }, + { + "epoch": 0.7895316197919533, + "grad_norm": 0.6863396167755127, + "learning_rate": 6.647858691431214e-06, + "loss": 0.7819, + "step": 14345 + }, + { + "epoch": 0.789586658594309, + "grad_norm": 0.7352383136749268, + "learning_rate": 6.647449436169747e-06, + "loss": 0.8101, + "step": 14346 + }, + { + "epoch": 0.7896416973966647, + "grad_norm": 0.7630855441093445, + "learning_rate": 6.64704016852649e-06, + "loss": 0.7155, + "step": 14347 + }, + { + "epoch": 0.7896967361990203, + "grad_norm": 0.6740198135375977, + "learning_rate": 6.646630888504522e-06, + "loss": 0.7255, + "step": 14348 + }, + { + "epoch": 0.7897517750013759, + "grad_norm": 0.7095367908477783, + "learning_rate": 6.646221596106917e-06, + "loss": 0.7527, + "step": 14349 + }, + { + "epoch": 0.7898068138037316, + "grad_norm": 0.6096131801605225, + "learning_rate": 6.645812291336749e-06, + "loss": 0.7116, + "step": 14350 + }, + { + "epoch": 0.7898618526060873, + "grad_norm": 0.7212585210800171, + "learning_rate": 6.645402974197097e-06, + "loss": 0.7647, + "step": 14351 + }, + { + "epoch": 0.7899168914084429, + "grad_norm": 0.7145454287528992, + "learning_rate": 6.6449936446910376e-06, + "loss": 0.7988, + "step": 14352 + }, + { + "epoch": 0.7899719302107986, + "grad_norm": 0.668269693851471, + "learning_rate": 6.644584302821646e-06, + "loss": 0.8453, + "step": 14353 + }, + { + "epoch": 0.7900269690131543, + "grad_norm": 0.7431649565696716, + "learning_rate": 6.644174948591998e-06, + "loss": 0.6981, + "step": 14354 + }, + { + "epoch": 0.79008200781551, + "grad_norm": 0.6727485060691833, + "learning_rate": 6.643765582005172e-06, + "loss": 0.792, + "step": 14355 + }, + { + "epoch": 0.7901370466178655, + "grad_norm": 0.7102059721946716, + "learning_rate": 6.643356203064244e-06, + "loss": 0.7469, + "step": 14356 + }, + { + "epoch": 0.7901920854202212, + "grad_norm": 0.6719706654548645, + "learning_rate": 6.642946811772291e-06, + "loss": 0.7542, + "step": 14357 + }, + { + "epoch": 0.7902471242225769, + "grad_norm": 0.7044880986213684, + "learning_rate": 6.6425374081323875e-06, + "loss": 0.7884, + "step": 14358 + }, + { + "epoch": 0.7903021630249326, + "grad_norm": 0.656411349773407, + "learning_rate": 6.642127992147614e-06, + "loss": 0.7596, + "step": 14359 + }, + { + "epoch": 0.7903572018272882, + "grad_norm": 0.6256445050239563, + "learning_rate": 6.641718563821047e-06, + "loss": 0.6257, + "step": 14360 + }, + { + "epoch": 0.7904122406296439, + "grad_norm": 0.6761715412139893, + "learning_rate": 6.641309123155761e-06, + "loss": 0.7024, + "step": 14361 + }, + { + "epoch": 0.7904672794319996, + "grad_norm": 0.7567794322967529, + "learning_rate": 6.640899670154837e-06, + "loss": 0.7948, + "step": 14362 + }, + { + "epoch": 0.7905223182343553, + "grad_norm": 0.6192977428436279, + "learning_rate": 6.640490204821349e-06, + "loss": 0.7307, + "step": 14363 + }, + { + "epoch": 0.7905773570367108, + "grad_norm": 0.8120929002761841, + "learning_rate": 6.640080727158376e-06, + "loss": 0.7173, + "step": 14364 + }, + { + "epoch": 0.7906323958390665, + "grad_norm": 0.7303271293640137, + "learning_rate": 6.639671237168996e-06, + "loss": 0.8118, + "step": 14365 + }, + { + "epoch": 0.7906874346414222, + "grad_norm": 0.6731529831886292, + "learning_rate": 6.639261734856284e-06, + "loss": 0.76, + "step": 14366 + }, + { + "epoch": 0.7907424734437779, + "grad_norm": 0.6909935474395752, + "learning_rate": 6.638852220223321e-06, + "loss": 0.7732, + "step": 14367 + }, + { + "epoch": 0.7907975122461335, + "grad_norm": 0.6543979048728943, + "learning_rate": 6.638442693273183e-06, + "loss": 0.7408, + "step": 14368 + }, + { + "epoch": 0.7908525510484892, + "grad_norm": 0.6411511301994324, + "learning_rate": 6.6380331540089485e-06, + "loss": 0.6963, + "step": 14369 + }, + { + "epoch": 0.7909075898508449, + "grad_norm": 0.6657214164733887, + "learning_rate": 6.637623602433694e-06, + "loss": 0.7417, + "step": 14370 + }, + { + "epoch": 0.7909626286532006, + "grad_norm": 0.6852405071258545, + "learning_rate": 6.6372140385505e-06, + "loss": 0.7176, + "step": 14371 + }, + { + "epoch": 0.7910176674555561, + "grad_norm": 0.6453777551651001, + "learning_rate": 6.636804462362444e-06, + "loss": 0.7791, + "step": 14372 + }, + { + "epoch": 0.7910727062579118, + "grad_norm": 0.6806328296661377, + "learning_rate": 6.636394873872603e-06, + "loss": 0.7856, + "step": 14373 + }, + { + "epoch": 0.7911277450602675, + "grad_norm": 0.6819495558738708, + "learning_rate": 6.635985273084058e-06, + "loss": 0.7865, + "step": 14374 + }, + { + "epoch": 0.7911827838626232, + "grad_norm": 0.7372999787330627, + "learning_rate": 6.635575659999883e-06, + "loss": 0.8549, + "step": 14375 + }, + { + "epoch": 0.7912378226649788, + "grad_norm": 0.8146817684173584, + "learning_rate": 6.635166034623162e-06, + "loss": 0.7253, + "step": 14376 + }, + { + "epoch": 0.7912928614673345, + "grad_norm": 0.8205630779266357, + "learning_rate": 6.634756396956969e-06, + "loss": 0.6915, + "step": 14377 + }, + { + "epoch": 0.7913479002696902, + "grad_norm": 0.7168713808059692, + "learning_rate": 6.634346747004383e-06, + "loss": 0.7495, + "step": 14378 + }, + { + "epoch": 0.7914029390720458, + "grad_norm": 0.7210709452629089, + "learning_rate": 6.6339370847684854e-06, + "loss": 0.7323, + "step": 14379 + }, + { + "epoch": 0.7914579778744014, + "grad_norm": 0.9042065143585205, + "learning_rate": 6.633527410252355e-06, + "loss": 0.847, + "step": 14380 + }, + { + "epoch": 0.7915130166767571, + "grad_norm": 0.6700118184089661, + "learning_rate": 6.633117723459071e-06, + "loss": 0.7975, + "step": 14381 + }, + { + "epoch": 0.7915680554791128, + "grad_norm": 0.6355725526809692, + "learning_rate": 6.632708024391707e-06, + "loss": 0.7398, + "step": 14382 + }, + { + "epoch": 0.7916230942814685, + "grad_norm": 0.8274535536766052, + "learning_rate": 6.6322983130533505e-06, + "loss": 0.8641, + "step": 14383 + }, + { + "epoch": 0.7916781330838241, + "grad_norm": 0.5835573077201843, + "learning_rate": 6.631888589447075e-06, + "loss": 0.636, + "step": 14384 + }, + { + "epoch": 0.7917331718861798, + "grad_norm": 0.6933130621910095, + "learning_rate": 6.631478853575963e-06, + "loss": 0.7874, + "step": 14385 + }, + { + "epoch": 0.7917882106885354, + "grad_norm": 0.8125241994857788, + "learning_rate": 6.631069105443092e-06, + "loss": 0.7961, + "step": 14386 + }, + { + "epoch": 0.7918432494908911, + "grad_norm": 0.6661116480827332, + "learning_rate": 6.630659345051542e-06, + "loss": 0.6498, + "step": 14387 + }, + { + "epoch": 0.7918982882932467, + "grad_norm": 0.6807548403739929, + "learning_rate": 6.630249572404393e-06, + "loss": 0.6952, + "step": 14388 + }, + { + "epoch": 0.7919533270956024, + "grad_norm": 0.6886214017868042, + "learning_rate": 6.629839787504726e-06, + "loss": 0.7416, + "step": 14389 + }, + { + "epoch": 0.7920083658979581, + "grad_norm": 0.7633732557296753, + "learning_rate": 6.629429990355617e-06, + "loss": 0.8008, + "step": 14390 + }, + { + "epoch": 0.7920634047003137, + "grad_norm": 0.8401023745536804, + "learning_rate": 6.6290201809601494e-06, + "loss": 0.8312, + "step": 14391 + }, + { + "epoch": 0.7921184435026694, + "grad_norm": 0.6608526706695557, + "learning_rate": 6.628610359321403e-06, + "loss": 0.563, + "step": 14392 + }, + { + "epoch": 0.792173482305025, + "grad_norm": 0.687045156955719, + "learning_rate": 6.6282005254424566e-06, + "loss": 0.7451, + "step": 14393 + }, + { + "epoch": 0.7922285211073807, + "grad_norm": 0.7129287123680115, + "learning_rate": 6.627790679326389e-06, + "loss": 0.8495, + "step": 14394 + }, + { + "epoch": 0.7922835599097363, + "grad_norm": 0.6951952576637268, + "learning_rate": 6.627380820976283e-06, + "loss": 0.7895, + "step": 14395 + }, + { + "epoch": 0.792338598712092, + "grad_norm": 0.8020780086517334, + "learning_rate": 6.626970950395221e-06, + "loss": 0.7136, + "step": 14396 + }, + { + "epoch": 0.7923936375144477, + "grad_norm": 0.6654007434844971, + "learning_rate": 6.626561067586279e-06, + "loss": 0.7865, + "step": 14397 + }, + { + "epoch": 0.7924486763168034, + "grad_norm": 0.844744861125946, + "learning_rate": 6.62615117255254e-06, + "loss": 0.7856, + "step": 14398 + }, + { + "epoch": 0.792503715119159, + "grad_norm": 0.6890879273414612, + "learning_rate": 6.625741265297083e-06, + "loss": 0.7574, + "step": 14399 + }, + { + "epoch": 0.7925587539215146, + "grad_norm": 0.7559735774993896, + "learning_rate": 6.625331345822992e-06, + "loss": 0.634, + "step": 14400 + }, + { + "epoch": 0.7926137927238703, + "grad_norm": 0.6918107867240906, + "learning_rate": 6.624921414133344e-06, + "loss": 0.6935, + "step": 14401 + }, + { + "epoch": 0.792668831526226, + "grad_norm": 0.7468792200088501, + "learning_rate": 6.624511470231221e-06, + "loss": 0.7301, + "step": 14402 + }, + { + "epoch": 0.7927238703285816, + "grad_norm": 0.6749486327171326, + "learning_rate": 6.624101514119705e-06, + "loss": 0.7143, + "step": 14403 + }, + { + "epoch": 0.7927789091309373, + "grad_norm": 0.7765836119651794, + "learning_rate": 6.623691545801878e-06, + "loss": 0.7201, + "step": 14404 + }, + { + "epoch": 0.792833947933293, + "grad_norm": 0.6263312697410583, + "learning_rate": 6.623281565280819e-06, + "loss": 0.5866, + "step": 14405 + }, + { + "epoch": 0.7928889867356487, + "grad_norm": 0.6325232982635498, + "learning_rate": 6.62287157255961e-06, + "loss": 0.7389, + "step": 14406 + }, + { + "epoch": 0.7929440255380042, + "grad_norm": 0.7165958881378174, + "learning_rate": 6.622461567641333e-06, + "loss": 0.7378, + "step": 14407 + }, + { + "epoch": 0.7929990643403599, + "grad_norm": 0.7611519694328308, + "learning_rate": 6.62205155052907e-06, + "loss": 0.7146, + "step": 14408 + }, + { + "epoch": 0.7930541031427156, + "grad_norm": 0.6764969825744629, + "learning_rate": 6.6216415212259e-06, + "loss": 0.7802, + "step": 14409 + }, + { + "epoch": 0.7931091419450713, + "grad_norm": 0.7266956567764282, + "learning_rate": 6.621231479734908e-06, + "loss": 0.7065, + "step": 14410 + }, + { + "epoch": 0.7931641807474269, + "grad_norm": 0.7540454268455505, + "learning_rate": 6.620821426059174e-06, + "loss": 0.7327, + "step": 14411 + }, + { + "epoch": 0.7932192195497826, + "grad_norm": 0.7931423783302307, + "learning_rate": 6.620411360201779e-06, + "loss": 0.8032, + "step": 14412 + }, + { + "epoch": 0.7932742583521383, + "grad_norm": 1.2976648807525635, + "learning_rate": 6.620001282165808e-06, + "loss": 0.7422, + "step": 14413 + }, + { + "epoch": 0.793329297154494, + "grad_norm": 0.6525906920433044, + "learning_rate": 6.619591191954338e-06, + "loss": 0.6857, + "step": 14414 + }, + { + "epoch": 0.7933843359568495, + "grad_norm": 0.6153263449668884, + "learning_rate": 6.619181089570456e-06, + "loss": 0.6117, + "step": 14415 + }, + { + "epoch": 0.7934393747592052, + "grad_norm": 0.7076815962791443, + "learning_rate": 6.6187709750172425e-06, + "loss": 0.8053, + "step": 14416 + }, + { + "epoch": 0.7934944135615609, + "grad_norm": 0.6999046802520752, + "learning_rate": 6.618360848297779e-06, + "loss": 0.6275, + "step": 14417 + }, + { + "epoch": 0.7935494523639166, + "grad_norm": 0.7043859958648682, + "learning_rate": 6.6179507094151484e-06, + "loss": 0.8273, + "step": 14418 + }, + { + "epoch": 0.7936044911662722, + "grad_norm": 0.6295393705368042, + "learning_rate": 6.617540558372434e-06, + "loss": 0.6394, + "step": 14419 + }, + { + "epoch": 0.7936595299686279, + "grad_norm": 0.8165664076805115, + "learning_rate": 6.617130395172718e-06, + "loss": 0.8473, + "step": 14420 + }, + { + "epoch": 0.7937145687709836, + "grad_norm": 0.7598135471343994, + "learning_rate": 6.616720219819082e-06, + "loss": 0.729, + "step": 14421 + }, + { + "epoch": 0.7937696075733393, + "grad_norm": 0.7222034335136414, + "learning_rate": 6.6163100323146105e-06, + "loss": 0.7526, + "step": 14422 + }, + { + "epoch": 0.7938246463756948, + "grad_norm": 0.7994693517684937, + "learning_rate": 6.615899832662385e-06, + "loss": 0.8346, + "step": 14423 + }, + { + "epoch": 0.7938796851780505, + "grad_norm": 0.6603162884712219, + "learning_rate": 6.615489620865489e-06, + "loss": 0.7546, + "step": 14424 + }, + { + "epoch": 0.7939347239804062, + "grad_norm": 0.6525929570198059, + "learning_rate": 6.615079396927005e-06, + "loss": 0.7344, + "step": 14425 + }, + { + "epoch": 0.7939897627827619, + "grad_norm": 0.6144835948944092, + "learning_rate": 6.614669160850016e-06, + "loss": 0.6776, + "step": 14426 + }, + { + "epoch": 0.7940448015851175, + "grad_norm": 0.7205507159233093, + "learning_rate": 6.614258912637607e-06, + "loss": 0.809, + "step": 14427 + }, + { + "epoch": 0.7940998403874732, + "grad_norm": 0.6757732629776001, + "learning_rate": 6.61384865229286e-06, + "loss": 0.7403, + "step": 14428 + }, + { + "epoch": 0.7941548791898289, + "grad_norm": 0.6392103433609009, + "learning_rate": 6.6134383798188586e-06, + "loss": 0.7689, + "step": 14429 + }, + { + "epoch": 0.7942099179921845, + "grad_norm": 0.6647289395332336, + "learning_rate": 6.613028095218685e-06, + "loss": 0.6611, + "step": 14430 + }, + { + "epoch": 0.7942649567945401, + "grad_norm": 0.6961668133735657, + "learning_rate": 6.612617798495426e-06, + "loss": 0.7784, + "step": 14431 + }, + { + "epoch": 0.7943199955968958, + "grad_norm": 1.1188037395477295, + "learning_rate": 6.6122074896521615e-06, + "loss": 0.6518, + "step": 14432 + }, + { + "epoch": 0.7943750343992515, + "grad_norm": 0.6382507085800171, + "learning_rate": 6.611797168691978e-06, + "loss": 0.6954, + "step": 14433 + }, + { + "epoch": 0.7944300732016071, + "grad_norm": 0.6720117330551147, + "learning_rate": 6.6113868356179585e-06, + "loss": 0.7267, + "step": 14434 + }, + { + "epoch": 0.7944851120039628, + "grad_norm": 0.6667274832725525, + "learning_rate": 6.610976490433186e-06, + "loss": 0.6867, + "step": 14435 + }, + { + "epoch": 0.7945401508063185, + "grad_norm": 0.658217191696167, + "learning_rate": 6.610566133140747e-06, + "loss": 0.66, + "step": 14436 + }, + { + "epoch": 0.7945951896086741, + "grad_norm": 0.6820386648178101, + "learning_rate": 6.610155763743723e-06, + "loss": 0.7352, + "step": 14437 + }, + { + "epoch": 0.7946502284110297, + "grad_norm": 0.788696825504303, + "learning_rate": 6.609745382245198e-06, + "loss": 0.6822, + "step": 14438 + }, + { + "epoch": 0.7947052672133854, + "grad_norm": 0.6485540270805359, + "learning_rate": 6.6093349886482596e-06, + "loss": 0.718, + "step": 14439 + }, + { + "epoch": 0.7947603060157411, + "grad_norm": 0.717659056186676, + "learning_rate": 6.60892458295599e-06, + "loss": 0.7898, + "step": 14440 + }, + { + "epoch": 0.7948153448180968, + "grad_norm": 0.6576352119445801, + "learning_rate": 6.608514165171473e-06, + "loss": 0.8041, + "step": 14441 + }, + { + "epoch": 0.7948703836204524, + "grad_norm": 0.7034726738929749, + "learning_rate": 6.608103735297795e-06, + "loss": 0.7901, + "step": 14442 + }, + { + "epoch": 0.7949254224228081, + "grad_norm": 0.7001451253890991, + "learning_rate": 6.6076932933380386e-06, + "loss": 0.6814, + "step": 14443 + }, + { + "epoch": 0.7949804612251637, + "grad_norm": 0.789359450340271, + "learning_rate": 6.607282839295291e-06, + "loss": 0.744, + "step": 14444 + }, + { + "epoch": 0.7950355000275194, + "grad_norm": 0.7830412983894348, + "learning_rate": 6.606872373172636e-06, + "loss": 0.8161, + "step": 14445 + }, + { + "epoch": 0.795090538829875, + "grad_norm": 0.6462455987930298, + "learning_rate": 6.606461894973157e-06, + "loss": 0.7723, + "step": 14446 + }, + { + "epoch": 0.7951455776322307, + "grad_norm": 0.6232526898384094, + "learning_rate": 6.606051404699943e-06, + "loss": 0.6723, + "step": 14447 + }, + { + "epoch": 0.7952006164345864, + "grad_norm": 0.7790026068687439, + "learning_rate": 6.605640902356074e-06, + "loss": 0.7687, + "step": 14448 + }, + { + "epoch": 0.7952556552369421, + "grad_norm": 0.7281851768493652, + "learning_rate": 6.605230387944639e-06, + "loss": 0.827, + "step": 14449 + }, + { + "epoch": 0.7953106940392977, + "grad_norm": 0.6519556045532227, + "learning_rate": 6.604819861468721e-06, + "loss": 0.7039, + "step": 14450 + }, + { + "epoch": 0.7953657328416534, + "grad_norm": 0.6768763661384583, + "learning_rate": 6.604409322931406e-06, + "loss": 0.7288, + "step": 14451 + }, + { + "epoch": 0.795420771644009, + "grad_norm": 0.7457320094108582, + "learning_rate": 6.6039987723357825e-06, + "loss": 0.8386, + "step": 14452 + }, + { + "epoch": 0.7954758104463647, + "grad_norm": 0.9579072594642639, + "learning_rate": 6.6035882096849325e-06, + "loss": 0.7552, + "step": 14453 + }, + { + "epoch": 0.7955308492487203, + "grad_norm": 0.6709916591644287, + "learning_rate": 6.603177634981941e-06, + "loss": 0.724, + "step": 14454 + }, + { + "epoch": 0.795585888051076, + "grad_norm": 0.6097317934036255, + "learning_rate": 6.602767048229897e-06, + "loss": 0.6866, + "step": 14455 + }, + { + "epoch": 0.7956409268534317, + "grad_norm": 0.7303394675254822, + "learning_rate": 6.602356449431885e-06, + "loss": 0.682, + "step": 14456 + }, + { + "epoch": 0.7956959656557874, + "grad_norm": 0.775979220867157, + "learning_rate": 6.601945838590991e-06, + "loss": 0.7784, + "step": 14457 + }, + { + "epoch": 0.795751004458143, + "grad_norm": 0.7016483545303345, + "learning_rate": 6.6015352157103e-06, + "loss": 0.7557, + "step": 14458 + }, + { + "epoch": 0.7958060432604986, + "grad_norm": 0.688946545124054, + "learning_rate": 6.6011245807929e-06, + "loss": 0.707, + "step": 14459 + }, + { + "epoch": 0.7958610820628543, + "grad_norm": 0.7286174297332764, + "learning_rate": 6.600713933841877e-06, + "loss": 0.784, + "step": 14460 + }, + { + "epoch": 0.79591612086521, + "grad_norm": 0.7604749798774719, + "learning_rate": 6.600303274860316e-06, + "loss": 0.7099, + "step": 14461 + }, + { + "epoch": 0.7959711596675656, + "grad_norm": 0.6626706123352051, + "learning_rate": 6.599892603851301e-06, + "loss": 0.7137, + "step": 14462 + }, + { + "epoch": 0.7960261984699213, + "grad_norm": 0.7692080736160278, + "learning_rate": 6.599481920817925e-06, + "loss": 0.847, + "step": 14463 + }, + { + "epoch": 0.796081237272277, + "grad_norm": 0.6811042428016663, + "learning_rate": 6.599071225763269e-06, + "loss": 0.7888, + "step": 14464 + }, + { + "epoch": 0.7961362760746327, + "grad_norm": 0.654481053352356, + "learning_rate": 6.598660518690424e-06, + "loss": 0.6973, + "step": 14465 + }, + { + "epoch": 0.7961913148769882, + "grad_norm": 0.7332738637924194, + "learning_rate": 6.598249799602472e-06, + "loss": 0.8311, + "step": 14466 + }, + { + "epoch": 0.7962463536793439, + "grad_norm": 0.7098381519317627, + "learning_rate": 6.597839068502503e-06, + "loss": 0.8265, + "step": 14467 + }, + { + "epoch": 0.7963013924816996, + "grad_norm": 0.6338212490081787, + "learning_rate": 6.597428325393604e-06, + "loss": 0.6889, + "step": 14468 + }, + { + "epoch": 0.7963564312840553, + "grad_norm": 0.7001339197158813, + "learning_rate": 6.597017570278861e-06, + "loss": 0.7613, + "step": 14469 + }, + { + "epoch": 0.7964114700864109, + "grad_norm": 0.6565783619880676, + "learning_rate": 6.596606803161361e-06, + "loss": 0.6284, + "step": 14470 + }, + { + "epoch": 0.7964665088887666, + "grad_norm": 0.6638015508651733, + "learning_rate": 6.5961960240441935e-06, + "loss": 0.6635, + "step": 14471 + }, + { + "epoch": 0.7965215476911223, + "grad_norm": 0.6389575600624084, + "learning_rate": 6.595785232930443e-06, + "loss": 0.6588, + "step": 14472 + }, + { + "epoch": 0.796576586493478, + "grad_norm": 0.9486858248710632, + "learning_rate": 6.595374429823197e-06, + "loss": 0.8314, + "step": 14473 + }, + { + "epoch": 0.7966316252958335, + "grad_norm": 0.7555649280548096, + "learning_rate": 6.594963614725544e-06, + "loss": 0.8173, + "step": 14474 + }, + { + "epoch": 0.7966866640981892, + "grad_norm": 0.63021320104599, + "learning_rate": 6.5945527876405715e-06, + "loss": 0.7038, + "step": 14475 + }, + { + "epoch": 0.7967417029005449, + "grad_norm": 0.802980899810791, + "learning_rate": 6.594141948571366e-06, + "loss": 0.8031, + "step": 14476 + }, + { + "epoch": 0.7967967417029005, + "grad_norm": 0.7204614281654358, + "learning_rate": 6.593731097521019e-06, + "loss": 0.827, + "step": 14477 + }, + { + "epoch": 0.7968517805052562, + "grad_norm": 0.6805211305618286, + "learning_rate": 6.593320234492613e-06, + "loss": 0.7405, + "step": 14478 + }, + { + "epoch": 0.7969068193076119, + "grad_norm": 0.7011345028877258, + "learning_rate": 6.59290935948924e-06, + "loss": 0.7241, + "step": 14479 + }, + { + "epoch": 0.7969618581099676, + "grad_norm": 0.8995540738105774, + "learning_rate": 6.592498472513986e-06, + "loss": 0.6864, + "step": 14480 + }, + { + "epoch": 0.7970168969123231, + "grad_norm": 0.7518284320831299, + "learning_rate": 6.592087573569941e-06, + "loss": 0.7561, + "step": 14481 + }, + { + "epoch": 0.7970719357146788, + "grad_norm": 0.6359231472015381, + "learning_rate": 6.591676662660191e-06, + "loss": 0.6402, + "step": 14482 + }, + { + "epoch": 0.7971269745170345, + "grad_norm": 0.6610120534896851, + "learning_rate": 6.5912657397878264e-06, + "loss": 0.6419, + "step": 14483 + }, + { + "epoch": 0.7971820133193902, + "grad_norm": 0.7054341435432434, + "learning_rate": 6.590854804955934e-06, + "loss": 0.7252, + "step": 14484 + }, + { + "epoch": 0.7972370521217458, + "grad_norm": 0.6929903626441956, + "learning_rate": 6.5904438581676025e-06, + "loss": 0.6566, + "step": 14485 + }, + { + "epoch": 0.7972920909241015, + "grad_norm": 0.7354124188423157, + "learning_rate": 6.59003289942592e-06, + "loss": 0.763, + "step": 14486 + }, + { + "epoch": 0.7973471297264572, + "grad_norm": 0.6366610527038574, + "learning_rate": 6.5896219287339755e-06, + "loss": 0.6601, + "step": 14487 + }, + { + "epoch": 0.7974021685288128, + "grad_norm": 0.6916924715042114, + "learning_rate": 6.589210946094859e-06, + "loss": 0.7683, + "step": 14488 + }, + { + "epoch": 0.7974572073311684, + "grad_norm": 0.6567399501800537, + "learning_rate": 6.5887999515116586e-06, + "loss": 0.7487, + "step": 14489 + }, + { + "epoch": 0.7975122461335241, + "grad_norm": 0.8082888722419739, + "learning_rate": 6.5883889449874626e-06, + "loss": 0.7579, + "step": 14490 + }, + { + "epoch": 0.7975672849358798, + "grad_norm": 0.7138401865959167, + "learning_rate": 6.58797792652536e-06, + "loss": 0.7256, + "step": 14491 + }, + { + "epoch": 0.7976223237382355, + "grad_norm": 0.6514482498168945, + "learning_rate": 6.587566896128441e-06, + "loss": 0.6612, + "step": 14492 + }, + { + "epoch": 0.7976773625405911, + "grad_norm": 0.6770455837249756, + "learning_rate": 6.587155853799795e-06, + "loss": 0.677, + "step": 14493 + }, + { + "epoch": 0.7977324013429468, + "grad_norm": 0.6956327557563782, + "learning_rate": 6.586744799542511e-06, + "loss": 0.7824, + "step": 14494 + }, + { + "epoch": 0.7977874401453025, + "grad_norm": 0.6565653085708618, + "learning_rate": 6.586333733359676e-06, + "loss": 0.7496, + "step": 14495 + }, + { + "epoch": 0.7978424789476581, + "grad_norm": 0.6353399157524109, + "learning_rate": 6.585922655254382e-06, + "loss": 0.7264, + "step": 14496 + }, + { + "epoch": 0.7978975177500137, + "grad_norm": 1.037051796913147, + "learning_rate": 6.585511565229717e-06, + "loss": 0.7562, + "step": 14497 + }, + { + "epoch": 0.7979525565523694, + "grad_norm": 0.6447896957397461, + "learning_rate": 6.5851004632887725e-06, + "loss": 0.7509, + "step": 14498 + }, + { + "epoch": 0.7980075953547251, + "grad_norm": 0.7022401690483093, + "learning_rate": 6.584689349434636e-06, + "loss": 0.7752, + "step": 14499 + }, + { + "epoch": 0.7980626341570808, + "grad_norm": 0.7033591270446777, + "learning_rate": 6.5842782236703996e-06, + "loss": 0.7693, + "step": 14500 + }, + { + "epoch": 0.7981176729594364, + "grad_norm": 0.7061769962310791, + "learning_rate": 6.583867085999151e-06, + "loss": 0.6833, + "step": 14501 + }, + { + "epoch": 0.798172711761792, + "grad_norm": 0.7934882640838623, + "learning_rate": 6.583455936423984e-06, + "loss": 0.799, + "step": 14502 + }, + { + "epoch": 0.7982277505641477, + "grad_norm": 0.6968011260032654, + "learning_rate": 6.5830447749479835e-06, + "loss": 0.7132, + "step": 14503 + }, + { + "epoch": 0.7982827893665034, + "grad_norm": 1.7348299026489258, + "learning_rate": 6.582633601574243e-06, + "loss": 0.8996, + "step": 14504 + }, + { + "epoch": 0.798337828168859, + "grad_norm": 0.6822964549064636, + "learning_rate": 6.582222416305852e-06, + "loss": 0.7381, + "step": 14505 + }, + { + "epoch": 0.7983928669712147, + "grad_norm": 0.6600543856620789, + "learning_rate": 6.581811219145902e-06, + "loss": 0.711, + "step": 14506 + }, + { + "epoch": 0.7984479057735704, + "grad_norm": 0.8719834089279175, + "learning_rate": 6.581400010097481e-06, + "loss": 0.7567, + "step": 14507 + }, + { + "epoch": 0.7985029445759261, + "grad_norm": 0.7221046090126038, + "learning_rate": 6.580988789163681e-06, + "loss": 0.7417, + "step": 14508 + }, + { + "epoch": 0.7985579833782817, + "grad_norm": 0.6720401048660278, + "learning_rate": 6.580577556347592e-06, + "loss": 0.7467, + "step": 14509 + }, + { + "epoch": 0.7986130221806373, + "grad_norm": 0.7007263898849487, + "learning_rate": 6.580166311652306e-06, + "loss": 0.7356, + "step": 14510 + }, + { + "epoch": 0.798668060982993, + "grad_norm": 0.7384739518165588, + "learning_rate": 6.579755055080912e-06, + "loss": 0.7807, + "step": 14511 + }, + { + "epoch": 0.7987230997853487, + "grad_norm": 0.8054519295692444, + "learning_rate": 6.579343786636503e-06, + "loss": 0.7737, + "step": 14512 + }, + { + "epoch": 0.7987781385877043, + "grad_norm": 1.042319655418396, + "learning_rate": 6.578932506322169e-06, + "loss": 0.8708, + "step": 14513 + }, + { + "epoch": 0.79883317739006, + "grad_norm": 0.7122198343276978, + "learning_rate": 6.578521214141e-06, + "loss": 0.7818, + "step": 14514 + }, + { + "epoch": 0.7988882161924157, + "grad_norm": 0.9158271551132202, + "learning_rate": 6.578109910096088e-06, + "loss": 0.7439, + "step": 14515 + }, + { + "epoch": 0.7989432549947714, + "grad_norm": 0.7280082106590271, + "learning_rate": 6.577698594190524e-06, + "loss": 0.7888, + "step": 14516 + }, + { + "epoch": 0.798998293797127, + "grad_norm": 0.8203748464584351, + "learning_rate": 6.577287266427401e-06, + "loss": 0.7669, + "step": 14517 + }, + { + "epoch": 0.7990533325994826, + "grad_norm": 0.6998257637023926, + "learning_rate": 6.576875926809809e-06, + "loss": 0.7819, + "step": 14518 + }, + { + "epoch": 0.7991083714018383, + "grad_norm": 0.672575831413269, + "learning_rate": 6.57646457534084e-06, + "loss": 0.7359, + "step": 14519 + }, + { + "epoch": 0.7991634102041939, + "grad_norm": 0.931996762752533, + "learning_rate": 6.5760532120235845e-06, + "loss": 0.8816, + "step": 14520 + }, + { + "epoch": 0.7992184490065496, + "grad_norm": 0.7250553369522095, + "learning_rate": 6.575641836861134e-06, + "loss": 0.7924, + "step": 14521 + }, + { + "epoch": 0.7992734878089053, + "grad_norm": 0.6658768057823181, + "learning_rate": 6.575230449856582e-06, + "loss": 0.7064, + "step": 14522 + }, + { + "epoch": 0.799328526611261, + "grad_norm": 0.6901206374168396, + "learning_rate": 6.57481905101302e-06, + "loss": 0.7826, + "step": 14523 + }, + { + "epoch": 0.7993835654136165, + "grad_norm": 0.6772152781486511, + "learning_rate": 6.5744076403335386e-06, + "loss": 0.8143, + "step": 14524 + }, + { + "epoch": 0.7994386042159722, + "grad_norm": 0.6718147397041321, + "learning_rate": 6.5739962178212325e-06, + "loss": 0.765, + "step": 14525 + }, + { + "epoch": 0.7994936430183279, + "grad_norm": 0.7435488700866699, + "learning_rate": 6.573584783479191e-06, + "loss": 0.8685, + "step": 14526 + }, + { + "epoch": 0.7995486818206836, + "grad_norm": 0.7146314382553101, + "learning_rate": 6.573173337310506e-06, + "loss": 0.7605, + "step": 14527 + }, + { + "epoch": 0.7996037206230392, + "grad_norm": 0.6808409690856934, + "learning_rate": 6.572761879318274e-06, + "loss": 0.6996, + "step": 14528 + }, + { + "epoch": 0.7996587594253949, + "grad_norm": 1.1303905248641968, + "learning_rate": 6.572350409505584e-06, + "loss": 0.6107, + "step": 14529 + }, + { + "epoch": 0.7997137982277506, + "grad_norm": 0.7584583163261414, + "learning_rate": 6.571938927875529e-06, + "loss": 0.771, + "step": 14530 + }, + { + "epoch": 0.7997688370301063, + "grad_norm": 0.808233916759491, + "learning_rate": 6.5715274344312015e-06, + "loss": 0.7179, + "step": 14531 + }, + { + "epoch": 0.7998238758324618, + "grad_norm": 0.7067314386367798, + "learning_rate": 6.571115929175695e-06, + "loss": 0.7519, + "step": 14532 + }, + { + "epoch": 0.7998789146348175, + "grad_norm": 0.7611628174781799, + "learning_rate": 6.570704412112101e-06, + "loss": 0.8727, + "step": 14533 + }, + { + "epoch": 0.7999339534371732, + "grad_norm": 0.6485727429389954, + "learning_rate": 6.5702928832435145e-06, + "loss": 0.8455, + "step": 14534 + }, + { + "epoch": 0.7999889922395289, + "grad_norm": 1.5309134721755981, + "learning_rate": 6.569881342573024e-06, + "loss": 0.8362, + "step": 14535 + }, + { + "epoch": 0.8000440310418845, + "grad_norm": 0.7068225145339966, + "learning_rate": 6.569469790103729e-06, + "loss": 0.7924, + "step": 14536 + }, + { + "epoch": 0.8000990698442402, + "grad_norm": 0.7326669692993164, + "learning_rate": 6.569058225838717e-06, + "loss": 0.7594, + "step": 14537 + }, + { + "epoch": 0.8001541086465959, + "grad_norm": 0.6705706119537354, + "learning_rate": 6.568646649781085e-06, + "loss": 0.7331, + "step": 14538 + }, + { + "epoch": 0.8002091474489516, + "grad_norm": 0.7303051948547363, + "learning_rate": 6.568235061933923e-06, + "loss": 0.7274, + "step": 14539 + }, + { + "epoch": 0.8002641862513071, + "grad_norm": 0.6334550380706787, + "learning_rate": 6.567823462300326e-06, + "loss": 0.7105, + "step": 14540 + }, + { + "epoch": 0.8003192250536628, + "grad_norm": 0.7183839678764343, + "learning_rate": 6.56741185088339e-06, + "loss": 0.657, + "step": 14541 + }, + { + "epoch": 0.8003742638560185, + "grad_norm": 0.6896400451660156, + "learning_rate": 6.567000227686204e-06, + "loss": 0.7752, + "step": 14542 + }, + { + "epoch": 0.8004293026583742, + "grad_norm": 0.7214651703834534, + "learning_rate": 6.566588592711864e-06, + "loss": 0.753, + "step": 14543 + }, + { + "epoch": 0.8004843414607298, + "grad_norm": 0.7064470648765564, + "learning_rate": 6.566176945963464e-06, + "loss": 0.744, + "step": 14544 + }, + { + "epoch": 0.8005393802630855, + "grad_norm": 0.696674644947052, + "learning_rate": 6.565765287444097e-06, + "loss": 0.6822, + "step": 14545 + }, + { + "epoch": 0.8005944190654412, + "grad_norm": 0.711722195148468, + "learning_rate": 6.5653536171568574e-06, + "loss": 0.7724, + "step": 14546 + }, + { + "epoch": 0.8006494578677968, + "grad_norm": 0.791977047920227, + "learning_rate": 6.564941935104838e-06, + "loss": 0.7913, + "step": 14547 + }, + { + "epoch": 0.8007044966701524, + "grad_norm": 0.6904259920120239, + "learning_rate": 6.564530241291135e-06, + "loss": 0.7732, + "step": 14548 + }, + { + "epoch": 0.8007595354725081, + "grad_norm": 0.6089264750480652, + "learning_rate": 6.564118535718842e-06, + "loss": 0.6506, + "step": 14549 + }, + { + "epoch": 0.8008145742748638, + "grad_norm": 0.6502360105514526, + "learning_rate": 6.563706818391051e-06, + "loss": 0.6638, + "step": 14550 + }, + { + "epoch": 0.8008696130772195, + "grad_norm": 0.6249814033508301, + "learning_rate": 6.563295089310859e-06, + "loss": 0.7066, + "step": 14551 + }, + { + "epoch": 0.8009246518795751, + "grad_norm": 0.8013060688972473, + "learning_rate": 6.56288334848136e-06, + "loss": 0.7968, + "step": 14552 + }, + { + "epoch": 0.8009796906819308, + "grad_norm": 0.7289897799491882, + "learning_rate": 6.562471595905648e-06, + "loss": 0.752, + "step": 14553 + }, + { + "epoch": 0.8010347294842864, + "grad_norm": 0.6774812340736389, + "learning_rate": 6.5620598315868176e-06, + "loss": 0.8263, + "step": 14554 + }, + { + "epoch": 0.8010897682866421, + "grad_norm": 0.6756269931793213, + "learning_rate": 6.561648055527965e-06, + "loss": 0.8096, + "step": 14555 + }, + { + "epoch": 0.8011448070889977, + "grad_norm": 0.7138845324516296, + "learning_rate": 6.5612362677321815e-06, + "loss": 0.7513, + "step": 14556 + }, + { + "epoch": 0.8011998458913534, + "grad_norm": 0.6763927340507507, + "learning_rate": 6.5608244682025656e-06, + "loss": 0.7975, + "step": 14557 + }, + { + "epoch": 0.8012548846937091, + "grad_norm": 0.8147655129432678, + "learning_rate": 6.56041265694221e-06, + "loss": 0.8192, + "step": 14558 + }, + { + "epoch": 0.8013099234960648, + "grad_norm": 0.7272641658782959, + "learning_rate": 6.5600008339542095e-06, + "loss": 0.7829, + "step": 14559 + }, + { + "epoch": 0.8013649622984204, + "grad_norm": 0.7464525103569031, + "learning_rate": 6.559588999241661e-06, + "loss": 0.7596, + "step": 14560 + }, + { + "epoch": 0.801420001100776, + "grad_norm": 0.7236443758010864, + "learning_rate": 6.559177152807661e-06, + "loss": 0.8151, + "step": 14561 + }, + { + "epoch": 0.8014750399031317, + "grad_norm": 0.6752793192863464, + "learning_rate": 6.558765294655301e-06, + "loss": 0.7578, + "step": 14562 + }, + { + "epoch": 0.8015300787054873, + "grad_norm": 0.709994375705719, + "learning_rate": 6.558353424787678e-06, + "loss": 0.6847, + "step": 14563 + }, + { + "epoch": 0.801585117507843, + "grad_norm": 0.7082880139350891, + "learning_rate": 6.557941543207889e-06, + "loss": 0.7968, + "step": 14564 + }, + { + "epoch": 0.8016401563101987, + "grad_norm": 0.692663848400116, + "learning_rate": 6.557529649919028e-06, + "loss": 0.6625, + "step": 14565 + }, + { + "epoch": 0.8016951951125544, + "grad_norm": 0.8464102149009705, + "learning_rate": 6.557117744924191e-06, + "loss": 0.7383, + "step": 14566 + }, + { + "epoch": 0.80175023391491, + "grad_norm": 0.6129899024963379, + "learning_rate": 6.5567058282264735e-06, + "loss": 0.7007, + "step": 14567 + }, + { + "epoch": 0.8018052727172656, + "grad_norm": 0.6458886861801147, + "learning_rate": 6.556293899828973e-06, + "loss": 0.7019, + "step": 14568 + }, + { + "epoch": 0.8018603115196213, + "grad_norm": 0.6543694138526917, + "learning_rate": 6.555881959734783e-06, + "loss": 0.7254, + "step": 14569 + }, + { + "epoch": 0.801915350321977, + "grad_norm": 0.7678859829902649, + "learning_rate": 6.555470007947001e-06, + "loss": 0.7952, + "step": 14570 + }, + { + "epoch": 0.8019703891243326, + "grad_norm": 0.7121342420578003, + "learning_rate": 6.555058044468722e-06, + "loss": 0.7951, + "step": 14571 + }, + { + "epoch": 0.8020254279266883, + "grad_norm": 0.6496285200119019, + "learning_rate": 6.554646069303043e-06, + "loss": 0.696, + "step": 14572 + }, + { + "epoch": 0.802080466729044, + "grad_norm": 0.7206087112426758, + "learning_rate": 6.5542340824530614e-06, + "loss": 0.7599, + "step": 14573 + }, + { + "epoch": 0.8021355055313997, + "grad_norm": 0.7285301685333252, + "learning_rate": 6.553822083921872e-06, + "loss": 0.7805, + "step": 14574 + }, + { + "epoch": 0.8021905443337553, + "grad_norm": 0.7524350881576538, + "learning_rate": 6.553410073712572e-06, + "loss": 0.7388, + "step": 14575 + }, + { + "epoch": 0.8022455831361109, + "grad_norm": 0.7634537220001221, + "learning_rate": 6.552998051828256e-06, + "loss": 0.6969, + "step": 14576 + }, + { + "epoch": 0.8023006219384666, + "grad_norm": 0.6950779557228088, + "learning_rate": 6.552586018272024e-06, + "loss": 0.8533, + "step": 14577 + }, + { + "epoch": 0.8023556607408223, + "grad_norm": 0.694496214389801, + "learning_rate": 6.552173973046972e-06, + "loss": 0.766, + "step": 14578 + }, + { + "epoch": 0.8024106995431779, + "grad_norm": 0.8068329691886902, + "learning_rate": 6.5517619161561954e-06, + "loss": 0.7642, + "step": 14579 + }, + { + "epoch": 0.8024657383455336, + "grad_norm": 0.6933363080024719, + "learning_rate": 6.5513498476027905e-06, + "loss": 0.8721, + "step": 14580 + }, + { + "epoch": 0.8025207771478893, + "grad_norm": 0.7041658163070679, + "learning_rate": 6.550937767389857e-06, + "loss": 0.6654, + "step": 14581 + }, + { + "epoch": 0.802575815950245, + "grad_norm": 0.7080103754997253, + "learning_rate": 6.550525675520489e-06, + "loss": 0.6917, + "step": 14582 + }, + { + "epoch": 0.8026308547526005, + "grad_norm": 0.6644875407218933, + "learning_rate": 6.550113571997785e-06, + "loss": 0.7674, + "step": 14583 + }, + { + "epoch": 0.8026858935549562, + "grad_norm": 0.7660395503044128, + "learning_rate": 6.549701456824843e-06, + "loss": 0.792, + "step": 14584 + }, + { + "epoch": 0.8027409323573119, + "grad_norm": 0.6853451132774353, + "learning_rate": 6.549289330004759e-06, + "loss": 0.8038, + "step": 14585 + }, + { + "epoch": 0.8027959711596676, + "grad_norm": 0.7349985837936401, + "learning_rate": 6.548877191540632e-06, + "loss": 0.7658, + "step": 14586 + }, + { + "epoch": 0.8028510099620232, + "grad_norm": 0.7605637311935425, + "learning_rate": 6.548465041435557e-06, + "loss": 0.7691, + "step": 14587 + }, + { + "epoch": 0.8029060487643789, + "grad_norm": 0.7635177969932556, + "learning_rate": 6.548052879692635e-06, + "loss": 0.8337, + "step": 14588 + }, + { + "epoch": 0.8029610875667346, + "grad_norm": 0.6873355507850647, + "learning_rate": 6.5476407063149614e-06, + "loss": 0.64, + "step": 14589 + }, + { + "epoch": 0.8030161263690903, + "grad_norm": 0.7642813920974731, + "learning_rate": 6.547228521305635e-06, + "loss": 0.6961, + "step": 14590 + }, + { + "epoch": 0.8030711651714458, + "grad_norm": 0.6329793334007263, + "learning_rate": 6.546816324667752e-06, + "loss": 0.73, + "step": 14591 + }, + { + "epoch": 0.8031262039738015, + "grad_norm": 0.6932308673858643, + "learning_rate": 6.546404116404412e-06, + "loss": 0.7582, + "step": 14592 + }, + { + "epoch": 0.8031812427761572, + "grad_norm": 0.699260413646698, + "learning_rate": 6.545991896518713e-06, + "loss": 0.7219, + "step": 14593 + }, + { + "epoch": 0.8032362815785129, + "grad_norm": 0.6217201948165894, + "learning_rate": 6.545579665013754e-06, + "loss": 0.6237, + "step": 14594 + }, + { + "epoch": 0.8032913203808685, + "grad_norm": 0.7078647017478943, + "learning_rate": 6.545167421892629e-06, + "loss": 0.666, + "step": 14595 + }, + { + "epoch": 0.8033463591832242, + "grad_norm": 0.6955916881561279, + "learning_rate": 6.544755167158441e-06, + "loss": 0.737, + "step": 14596 + }, + { + "epoch": 0.8034013979855799, + "grad_norm": 0.8195130825042725, + "learning_rate": 6.544342900814287e-06, + "loss": 0.787, + "step": 14597 + }, + { + "epoch": 0.8034564367879355, + "grad_norm": 0.6160768270492554, + "learning_rate": 6.543930622863263e-06, + "loss": 0.6141, + "step": 14598 + }, + { + "epoch": 0.8035114755902911, + "grad_norm": 0.8483116030693054, + "learning_rate": 6.543518333308472e-06, + "loss": 0.7639, + "step": 14599 + }, + { + "epoch": 0.8035665143926468, + "grad_norm": 0.6937680244445801, + "learning_rate": 6.5431060321530105e-06, + "loss": 0.7484, + "step": 14600 + }, + { + "epoch": 0.8036215531950025, + "grad_norm": 0.6298720836639404, + "learning_rate": 6.542693719399975e-06, + "loss": 0.6357, + "step": 14601 + }, + { + "epoch": 0.8036765919973582, + "grad_norm": 0.6431903839111328, + "learning_rate": 6.54228139505247e-06, + "loss": 0.6749, + "step": 14602 + }, + { + "epoch": 0.8037316307997138, + "grad_norm": 0.8972636461257935, + "learning_rate": 6.541869059113588e-06, + "loss": 0.8907, + "step": 14603 + }, + { + "epoch": 0.8037866696020695, + "grad_norm": 0.7302204966545105, + "learning_rate": 6.5414567115864316e-06, + "loss": 0.7494, + "step": 14604 + }, + { + "epoch": 0.8038417084044251, + "grad_norm": 0.7784821391105652, + "learning_rate": 6.541044352474099e-06, + "loss": 0.6582, + "step": 14605 + }, + { + "epoch": 0.8038967472067807, + "grad_norm": 0.7257398366928101, + "learning_rate": 6.54063198177969e-06, + "loss": 0.7362, + "step": 14606 + }, + { + "epoch": 0.8039517860091364, + "grad_norm": 0.6745980381965637, + "learning_rate": 6.540219599506302e-06, + "loss": 0.6756, + "step": 14607 + }, + { + "epoch": 0.8040068248114921, + "grad_norm": 0.8664490580558777, + "learning_rate": 6.539807205657037e-06, + "loss": 0.6728, + "step": 14608 + }, + { + "epoch": 0.8040618636138478, + "grad_norm": 0.704233705997467, + "learning_rate": 6.5393948002349926e-06, + "loss": 0.7713, + "step": 14609 + }, + { + "epoch": 0.8041169024162034, + "grad_norm": 0.7709019780158997, + "learning_rate": 6.538982383243271e-06, + "loss": 0.8148, + "step": 14610 + }, + { + "epoch": 0.8041719412185591, + "grad_norm": 0.7056839466094971, + "learning_rate": 6.538569954684967e-06, + "loss": 0.7143, + "step": 14611 + }, + { + "epoch": 0.8042269800209148, + "grad_norm": 0.715506374835968, + "learning_rate": 6.538157514563184e-06, + "loss": 0.7932, + "step": 14612 + }, + { + "epoch": 0.8042820188232704, + "grad_norm": 0.8245391845703125, + "learning_rate": 6.537745062881021e-06, + "loss": 0.7569, + "step": 14613 + }, + { + "epoch": 0.804337057625626, + "grad_norm": 0.6912628412246704, + "learning_rate": 6.5373325996415794e-06, + "loss": 0.7174, + "step": 14614 + }, + { + "epoch": 0.8043920964279817, + "grad_norm": 0.6994870901107788, + "learning_rate": 6.536920124847955e-06, + "loss": 0.6174, + "step": 14615 + }, + { + "epoch": 0.8044471352303374, + "grad_norm": 0.6660363674163818, + "learning_rate": 6.536507638503251e-06, + "loss": 0.8065, + "step": 14616 + }, + { + "epoch": 0.8045021740326931, + "grad_norm": 0.6742863059043884, + "learning_rate": 6.536095140610567e-06, + "loss": 0.7984, + "step": 14617 + }, + { + "epoch": 0.8045572128350487, + "grad_norm": 0.6868259906768799, + "learning_rate": 6.535682631173005e-06, + "loss": 0.7907, + "step": 14618 + }, + { + "epoch": 0.8046122516374044, + "grad_norm": 0.7442048788070679, + "learning_rate": 6.5352701101936615e-06, + "loss": 0.7893, + "step": 14619 + }, + { + "epoch": 0.80466729043976, + "grad_norm": 0.7389286756515503, + "learning_rate": 6.534857577675639e-06, + "loss": 0.827, + "step": 14620 + }, + { + "epoch": 0.8047223292421157, + "grad_norm": 0.6679701209068298, + "learning_rate": 6.534445033622036e-06, + "loss": 0.6721, + "step": 14621 + }, + { + "epoch": 0.8047773680444713, + "grad_norm": 0.6372442841529846, + "learning_rate": 6.534032478035957e-06, + "loss": 0.7381, + "step": 14622 + }, + { + "epoch": 0.804832406846827, + "grad_norm": 0.7682638764381409, + "learning_rate": 6.533619910920501e-06, + "loss": 0.7003, + "step": 14623 + }, + { + "epoch": 0.8048874456491827, + "grad_norm": 0.6821291446685791, + "learning_rate": 6.533207332278767e-06, + "loss": 0.8164, + "step": 14624 + }, + { + "epoch": 0.8049424844515384, + "grad_norm": 0.6591019034385681, + "learning_rate": 6.532794742113858e-06, + "loss": 0.6772, + "step": 14625 + }, + { + "epoch": 0.804997523253894, + "grad_norm": 0.7331292033195496, + "learning_rate": 6.532382140428874e-06, + "loss": 0.7606, + "step": 14626 + }, + { + "epoch": 0.8050525620562496, + "grad_norm": 0.9654768705368042, + "learning_rate": 6.531969527226917e-06, + "loss": 0.9196, + "step": 14627 + }, + { + "epoch": 0.8051076008586053, + "grad_norm": 0.6320267915725708, + "learning_rate": 6.5315569025110844e-06, + "loss": 0.6982, + "step": 14628 + }, + { + "epoch": 0.805162639660961, + "grad_norm": 0.6921746134757996, + "learning_rate": 6.531144266284481e-06, + "loss": 0.7176, + "step": 14629 + }, + { + "epoch": 0.8052176784633166, + "grad_norm": 0.7233335375785828, + "learning_rate": 6.530731618550208e-06, + "loss": 0.8388, + "step": 14630 + }, + { + "epoch": 0.8052727172656723, + "grad_norm": 0.6576363444328308, + "learning_rate": 6.530318959311366e-06, + "loss": 0.7511, + "step": 14631 + }, + { + "epoch": 0.805327756068028, + "grad_norm": 0.6921162009239197, + "learning_rate": 6.529906288571055e-06, + "loss": 0.8161, + "step": 14632 + }, + { + "epoch": 0.8053827948703837, + "grad_norm": 0.7314246296882629, + "learning_rate": 6.529493606332379e-06, + "loss": 0.7824, + "step": 14633 + }, + { + "epoch": 0.8054378336727392, + "grad_norm": 0.6419001221656799, + "learning_rate": 6.529080912598438e-06, + "loss": 0.7593, + "step": 14634 + }, + { + "epoch": 0.8054928724750949, + "grad_norm": 0.9500213861465454, + "learning_rate": 6.528668207372335e-06, + "loss": 0.7429, + "step": 14635 + }, + { + "epoch": 0.8055479112774506, + "grad_norm": 0.7299035787582397, + "learning_rate": 6.52825549065717e-06, + "loss": 0.8064, + "step": 14636 + }, + { + "epoch": 0.8056029500798063, + "grad_norm": 0.6231887936592102, + "learning_rate": 6.527842762456046e-06, + "loss": 0.6177, + "step": 14637 + }, + { + "epoch": 0.8056579888821619, + "grad_norm": 0.6219315528869629, + "learning_rate": 6.527430022772066e-06, + "loss": 0.6781, + "step": 14638 + }, + { + "epoch": 0.8057130276845176, + "grad_norm": 0.696861982345581, + "learning_rate": 6.527017271608329e-06, + "loss": 0.7508, + "step": 14639 + }, + { + "epoch": 0.8057680664868733, + "grad_norm": 0.7849573493003845, + "learning_rate": 6.5266045089679394e-06, + "loss": 0.7347, + "step": 14640 + }, + { + "epoch": 0.805823105289229, + "grad_norm": 0.6350993514060974, + "learning_rate": 6.526191734853999e-06, + "loss": 0.6863, + "step": 14641 + }, + { + "epoch": 0.8058781440915845, + "grad_norm": 0.6293141841888428, + "learning_rate": 6.5257789492696115e-06, + "loss": 0.7288, + "step": 14642 + }, + { + "epoch": 0.8059331828939402, + "grad_norm": 0.7801508903503418, + "learning_rate": 6.525366152217876e-06, + "loss": 0.7592, + "step": 14643 + }, + { + "epoch": 0.8059882216962959, + "grad_norm": 0.7031479477882385, + "learning_rate": 6.5249533437018964e-06, + "loss": 0.8677, + "step": 14644 + }, + { + "epoch": 0.8060432604986516, + "grad_norm": 0.7052507996559143, + "learning_rate": 6.524540523724777e-06, + "loss": 0.7957, + "step": 14645 + }, + { + "epoch": 0.8060982993010072, + "grad_norm": 0.669743537902832, + "learning_rate": 6.524127692289619e-06, + "loss": 0.7163, + "step": 14646 + }, + { + "epoch": 0.8061533381033629, + "grad_norm": 0.7180876731872559, + "learning_rate": 6.523714849399525e-06, + "loss": 0.8814, + "step": 14647 + }, + { + "epoch": 0.8062083769057186, + "grad_norm": 0.6617746353149414, + "learning_rate": 6.523301995057597e-06, + "loss": 0.721, + "step": 14648 + }, + { + "epoch": 0.8062634157080741, + "grad_norm": 0.6464657783508301, + "learning_rate": 6.5228891292669404e-06, + "loss": 0.7334, + "step": 14649 + }, + { + "epoch": 0.8063184545104298, + "grad_norm": 0.7648638486862183, + "learning_rate": 6.522476252030658e-06, + "loss": 0.7701, + "step": 14650 + }, + { + "epoch": 0.8063734933127855, + "grad_norm": 0.7313019037246704, + "learning_rate": 6.522063363351851e-06, + "loss": 0.7912, + "step": 14651 + }, + { + "epoch": 0.8064285321151412, + "grad_norm": 0.6175631284713745, + "learning_rate": 6.5216504632336195e-06, + "loss": 0.7568, + "step": 14652 + }, + { + "epoch": 0.8064835709174968, + "grad_norm": 0.6935408711433411, + "learning_rate": 6.521237551679074e-06, + "loss": 0.7622, + "step": 14653 + }, + { + "epoch": 0.8065386097198525, + "grad_norm": 0.7232398390769958, + "learning_rate": 6.520824628691314e-06, + "loss": 0.7908, + "step": 14654 + }, + { + "epoch": 0.8065936485222082, + "grad_norm": 0.6642309427261353, + "learning_rate": 6.520411694273443e-06, + "loss": 0.7355, + "step": 14655 + }, + { + "epoch": 0.8066486873245639, + "grad_norm": 0.6679350137710571, + "learning_rate": 6.5199987484285635e-06, + "loss": 0.735, + "step": 14656 + }, + { + "epoch": 0.8067037261269194, + "grad_norm": 0.6861871480941772, + "learning_rate": 6.519585791159782e-06, + "loss": 0.6744, + "step": 14657 + }, + { + "epoch": 0.8067587649292751, + "grad_norm": 0.7689095735549927, + "learning_rate": 6.519172822470199e-06, + "loss": 0.6888, + "step": 14658 + }, + { + "epoch": 0.8068138037316308, + "grad_norm": 0.6604742407798767, + "learning_rate": 6.5187598423629206e-06, + "loss": 0.6943, + "step": 14659 + }, + { + "epoch": 0.8068688425339865, + "grad_norm": 0.6478890776634216, + "learning_rate": 6.518346850841049e-06, + "loss": 0.7161, + "step": 14660 + }, + { + "epoch": 0.8069238813363421, + "grad_norm": 0.6213741302490234, + "learning_rate": 6.517933847907689e-06, + "loss": 0.68, + "step": 14661 + }, + { + "epoch": 0.8069789201386978, + "grad_norm": 0.7663899660110474, + "learning_rate": 6.517520833565945e-06, + "loss": 0.7498, + "step": 14662 + }, + { + "epoch": 0.8070339589410535, + "grad_norm": 0.653498649597168, + "learning_rate": 6.517107807818921e-06, + "loss": 0.7433, + "step": 14663 + }, + { + "epoch": 0.8070889977434091, + "grad_norm": 0.7618738412857056, + "learning_rate": 6.51669477066972e-06, + "loss": 0.7499, + "step": 14664 + }, + { + "epoch": 0.8071440365457647, + "grad_norm": 0.5960344672203064, + "learning_rate": 6.516281722121447e-06, + "loss": 0.6005, + "step": 14665 + }, + { + "epoch": 0.8071990753481204, + "grad_norm": 0.6768549084663391, + "learning_rate": 6.5158686621772075e-06, + "loss": 0.6859, + "step": 14666 + }, + { + "epoch": 0.8072541141504761, + "grad_norm": 0.6475711464881897, + "learning_rate": 6.515455590840104e-06, + "loss": 0.7582, + "step": 14667 + }, + { + "epoch": 0.8073091529528318, + "grad_norm": 0.7188607454299927, + "learning_rate": 6.5150425081132414e-06, + "loss": 0.7241, + "step": 14668 + }, + { + "epoch": 0.8073641917551874, + "grad_norm": 0.6507582068443298, + "learning_rate": 6.514629413999727e-06, + "loss": 0.7659, + "step": 14669 + }, + { + "epoch": 0.807419230557543, + "grad_norm": 0.6676538586616516, + "learning_rate": 6.514216308502661e-06, + "loss": 0.7336, + "step": 14670 + }, + { + "epoch": 0.8074742693598987, + "grad_norm": 0.7141211628913879, + "learning_rate": 6.513803191625152e-06, + "loss": 0.8121, + "step": 14671 + }, + { + "epoch": 0.8075293081622544, + "grad_norm": 0.7497949600219727, + "learning_rate": 6.513390063370302e-06, + "loss": 0.7238, + "step": 14672 + }, + { + "epoch": 0.80758434696461, + "grad_norm": 0.671271562576294, + "learning_rate": 6.51297692374122e-06, + "loss": 0.7876, + "step": 14673 + }, + { + "epoch": 0.8076393857669657, + "grad_norm": 0.7081878781318665, + "learning_rate": 6.512563772741008e-06, + "loss": 0.6774, + "step": 14674 + }, + { + "epoch": 0.8076944245693214, + "grad_norm": 0.640925943851471, + "learning_rate": 6.512150610372769e-06, + "loss": 0.7094, + "step": 14675 + }, + { + "epoch": 0.8077494633716771, + "grad_norm": 0.6333619952201843, + "learning_rate": 6.511737436639611e-06, + "loss": 0.6439, + "step": 14676 + }, + { + "epoch": 0.8078045021740327, + "grad_norm": 0.7294490337371826, + "learning_rate": 6.511324251544642e-06, + "loss": 0.7786, + "step": 14677 + }, + { + "epoch": 0.8078595409763883, + "grad_norm": 0.6488819718360901, + "learning_rate": 6.510911055090963e-06, + "loss": 0.7495, + "step": 14678 + }, + { + "epoch": 0.807914579778744, + "grad_norm": 0.6535395383834839, + "learning_rate": 6.51049784728168e-06, + "loss": 0.6713, + "step": 14679 + }, + { + "epoch": 0.8079696185810997, + "grad_norm": 0.6795744895935059, + "learning_rate": 6.5100846281198995e-06, + "loss": 0.71, + "step": 14680 + }, + { + "epoch": 0.8080246573834553, + "grad_norm": 0.661171019077301, + "learning_rate": 6.509671397608728e-06, + "loss": 0.7009, + "step": 14681 + }, + { + "epoch": 0.808079696185811, + "grad_norm": 0.6474859118461609, + "learning_rate": 6.50925815575127e-06, + "loss": 0.7268, + "step": 14682 + }, + { + "epoch": 0.8081347349881667, + "grad_norm": 0.676891565322876, + "learning_rate": 6.508844902550633e-06, + "loss": 0.8748, + "step": 14683 + }, + { + "epoch": 0.8081897737905224, + "grad_norm": 0.9747083783149719, + "learning_rate": 6.50843163800992e-06, + "loss": 0.6817, + "step": 14684 + }, + { + "epoch": 0.808244812592878, + "grad_norm": 0.655274510383606, + "learning_rate": 6.50801836213224e-06, + "loss": 0.7675, + "step": 14685 + }, + { + "epoch": 0.8082998513952336, + "grad_norm": 0.6916972398757935, + "learning_rate": 6.507605074920697e-06, + "loss": 0.7862, + "step": 14686 + }, + { + "epoch": 0.8083548901975893, + "grad_norm": 0.7079103589057922, + "learning_rate": 6.5071917763783975e-06, + "loss": 0.671, + "step": 14687 + }, + { + "epoch": 0.808409928999945, + "grad_norm": 0.7460986375808716, + "learning_rate": 6.506778466508447e-06, + "loss": 0.7136, + "step": 14688 + }, + { + "epoch": 0.8084649678023006, + "grad_norm": 0.6531261801719666, + "learning_rate": 6.5063651453139555e-06, + "loss": 0.811, + "step": 14689 + }, + { + "epoch": 0.8085200066046563, + "grad_norm": 0.7160762548446655, + "learning_rate": 6.505951812798025e-06, + "loss": 0.8368, + "step": 14690 + }, + { + "epoch": 0.808575045407012, + "grad_norm": 0.7230852842330933, + "learning_rate": 6.505538468963763e-06, + "loss": 0.6908, + "step": 14691 + }, + { + "epoch": 0.8086300842093676, + "grad_norm": 0.6912978887557983, + "learning_rate": 6.505125113814278e-06, + "loss": 0.6716, + "step": 14692 + }, + { + "epoch": 0.8086851230117232, + "grad_norm": 0.6745109558105469, + "learning_rate": 6.504711747352677e-06, + "loss": 0.7119, + "step": 14693 + }, + { + "epoch": 0.8087401618140789, + "grad_norm": 0.678657054901123, + "learning_rate": 6.5042983695820624e-06, + "loss": 0.7548, + "step": 14694 + }, + { + "epoch": 0.8087952006164346, + "grad_norm": 0.7501665949821472, + "learning_rate": 6.503884980505546e-06, + "loss": 0.7493, + "step": 14695 + }, + { + "epoch": 0.8088502394187902, + "grad_norm": 0.6181747317314148, + "learning_rate": 6.503471580126232e-06, + "loss": 0.7217, + "step": 14696 + }, + { + "epoch": 0.8089052782211459, + "grad_norm": 0.6548559069633484, + "learning_rate": 6.5030581684472295e-06, + "loss": 0.7448, + "step": 14697 + }, + { + "epoch": 0.8089603170235016, + "grad_norm": 0.7716642022132874, + "learning_rate": 6.5026447454716426e-06, + "loss": 0.8794, + "step": 14698 + }, + { + "epoch": 0.8090153558258573, + "grad_norm": 0.861995279788971, + "learning_rate": 6.502231311202581e-06, + "loss": 0.7839, + "step": 14699 + }, + { + "epoch": 0.8090703946282128, + "grad_norm": 0.796821117401123, + "learning_rate": 6.501817865643149e-06, + "loss": 0.8541, + "step": 14700 + }, + { + "epoch": 0.8091254334305685, + "grad_norm": 0.6995296478271484, + "learning_rate": 6.501404408796457e-06, + "loss": 0.677, + "step": 14701 + }, + { + "epoch": 0.8091804722329242, + "grad_norm": 0.6681582927703857, + "learning_rate": 6.500990940665611e-06, + "loss": 0.7754, + "step": 14702 + }, + { + "epoch": 0.8092355110352799, + "grad_norm": 0.5945298671722412, + "learning_rate": 6.50057746125372e-06, + "loss": 0.6762, + "step": 14703 + }, + { + "epoch": 0.8092905498376355, + "grad_norm": 0.672554612159729, + "learning_rate": 6.500163970563889e-06, + "loss": 0.6967, + "step": 14704 + }, + { + "epoch": 0.8093455886399912, + "grad_norm": 0.6375272870063782, + "learning_rate": 6.499750468599227e-06, + "loss": 0.7291, + "step": 14705 + }, + { + "epoch": 0.8094006274423469, + "grad_norm": 0.6369407773017883, + "learning_rate": 6.499336955362844e-06, + "loss": 0.6939, + "step": 14706 + }, + { + "epoch": 0.8094556662447026, + "grad_norm": 0.6497664451599121, + "learning_rate": 6.498923430857844e-06, + "loss": 0.7207, + "step": 14707 + }, + { + "epoch": 0.8095107050470581, + "grad_norm": 0.7345920205116272, + "learning_rate": 6.498509895087337e-06, + "loss": 0.8373, + "step": 14708 + }, + { + "epoch": 0.8095657438494138, + "grad_norm": 0.6824862957000732, + "learning_rate": 6.4980963480544324e-06, + "loss": 0.7531, + "step": 14709 + }, + { + "epoch": 0.8096207826517695, + "grad_norm": 0.7067939639091492, + "learning_rate": 6.497682789762236e-06, + "loss": 0.6951, + "step": 14710 + }, + { + "epoch": 0.8096758214541252, + "grad_norm": 0.6856693625450134, + "learning_rate": 6.497269220213856e-06, + "loss": 0.7264, + "step": 14711 + }, + { + "epoch": 0.8097308602564808, + "grad_norm": 0.6881466507911682, + "learning_rate": 6.4968556394124e-06, + "loss": 0.7837, + "step": 14712 + }, + { + "epoch": 0.8097858990588365, + "grad_norm": 0.6211455464363098, + "learning_rate": 6.49644204736098e-06, + "loss": 0.7278, + "step": 14713 + }, + { + "epoch": 0.8098409378611922, + "grad_norm": 0.688604474067688, + "learning_rate": 6.496028444062701e-06, + "loss": 0.7786, + "step": 14714 + }, + { + "epoch": 0.8098959766635478, + "grad_norm": 0.6615015268325806, + "learning_rate": 6.495614829520673e-06, + "loss": 0.7014, + "step": 14715 + }, + { + "epoch": 0.8099510154659034, + "grad_norm": 0.712661623954773, + "learning_rate": 6.495201203738004e-06, + "loss": 0.6792, + "step": 14716 + }, + { + "epoch": 0.8100060542682591, + "grad_norm": 0.6737191677093506, + "learning_rate": 6.494787566717803e-06, + "loss": 0.7937, + "step": 14717 + }, + { + "epoch": 0.8100610930706148, + "grad_norm": 0.8007351160049438, + "learning_rate": 6.494373918463179e-06, + "loss": 0.8367, + "step": 14718 + }, + { + "epoch": 0.8101161318729705, + "grad_norm": 0.7500883936882019, + "learning_rate": 6.493960258977241e-06, + "loss": 0.8102, + "step": 14719 + }, + { + "epoch": 0.8101711706753261, + "grad_norm": 0.7605966925621033, + "learning_rate": 6.493546588263097e-06, + "loss": 0.8316, + "step": 14720 + }, + { + "epoch": 0.8102262094776818, + "grad_norm": 0.746762216091156, + "learning_rate": 6.493132906323858e-06, + "loss": 0.7765, + "step": 14721 + }, + { + "epoch": 0.8102812482800374, + "grad_norm": 0.6034676432609558, + "learning_rate": 6.49271921316263e-06, + "loss": 0.7109, + "step": 14722 + }, + { + "epoch": 0.8103362870823931, + "grad_norm": 0.6965274810791016, + "learning_rate": 6.492305508782525e-06, + "loss": 0.8156, + "step": 14723 + }, + { + "epoch": 0.8103913258847487, + "grad_norm": 0.6813820004463196, + "learning_rate": 6.4918917931866495e-06, + "loss": 0.7016, + "step": 14724 + }, + { + "epoch": 0.8104463646871044, + "grad_norm": 0.8055655360221863, + "learning_rate": 6.491478066378117e-06, + "loss": 0.7837, + "step": 14725 + }, + { + "epoch": 0.8105014034894601, + "grad_norm": 0.6131647229194641, + "learning_rate": 6.491064328360033e-06, + "loss": 0.6716, + "step": 14726 + }, + { + "epoch": 0.8105564422918158, + "grad_norm": 0.6845986247062683, + "learning_rate": 6.49065057913551e-06, + "loss": 0.8112, + "step": 14727 + }, + { + "epoch": 0.8106114810941714, + "grad_norm": 0.6867175698280334, + "learning_rate": 6.490236818707653e-06, + "loss": 0.7953, + "step": 14728 + }, + { + "epoch": 0.810666519896527, + "grad_norm": 0.7170011401176453, + "learning_rate": 6.489823047079578e-06, + "loss": 0.8108, + "step": 14729 + }, + { + "epoch": 0.8107215586988827, + "grad_norm": 0.6280927658081055, + "learning_rate": 6.489409264254393e-06, + "loss": 0.6807, + "step": 14730 + }, + { + "epoch": 0.8107765975012384, + "grad_norm": 0.8344630002975464, + "learning_rate": 6.488995470235204e-06, + "loss": 0.7555, + "step": 14731 + }, + { + "epoch": 0.810831636303594, + "grad_norm": 0.6674200296401978, + "learning_rate": 6.488581665025125e-06, + "loss": 0.5732, + "step": 14732 + }, + { + "epoch": 0.8108866751059497, + "grad_norm": 0.7843313217163086, + "learning_rate": 6.4881678486272646e-06, + "loss": 0.6689, + "step": 14733 + }, + { + "epoch": 0.8109417139083054, + "grad_norm": 0.6951878666877747, + "learning_rate": 6.487754021044732e-06, + "loss": 0.8005, + "step": 14734 + }, + { + "epoch": 0.810996752710661, + "grad_norm": 0.7773714065551758, + "learning_rate": 6.487340182280639e-06, + "loss": 0.8151, + "step": 14735 + }, + { + "epoch": 0.8110517915130167, + "grad_norm": 0.824998140335083, + "learning_rate": 6.486926332338095e-06, + "loss": 0.7947, + "step": 14736 + }, + { + "epoch": 0.8111068303153723, + "grad_norm": 0.6411730647087097, + "learning_rate": 6.486512471220212e-06, + "loss": 0.7272, + "step": 14737 + }, + { + "epoch": 0.811161869117728, + "grad_norm": 0.6758518815040588, + "learning_rate": 6.486098598930097e-06, + "loss": 0.6676, + "step": 14738 + }, + { + "epoch": 0.8112169079200836, + "grad_norm": 0.7147762179374695, + "learning_rate": 6.485684715470866e-06, + "loss": 0.7796, + "step": 14739 + }, + { + "epoch": 0.8112719467224393, + "grad_norm": 0.7641217112541199, + "learning_rate": 6.485270820845623e-06, + "loss": 0.7943, + "step": 14740 + }, + { + "epoch": 0.811326985524795, + "grad_norm": 0.6947311162948608, + "learning_rate": 6.484856915057482e-06, + "loss": 0.7791, + "step": 14741 + }, + { + "epoch": 0.8113820243271507, + "grad_norm": 0.6781480312347412, + "learning_rate": 6.4844429981095565e-06, + "loss": 0.7399, + "step": 14742 + }, + { + "epoch": 0.8114370631295063, + "grad_norm": 0.6716181039810181, + "learning_rate": 6.484029070004953e-06, + "loss": 0.8111, + "step": 14743 + }, + { + "epoch": 0.8114921019318619, + "grad_norm": 0.8642836213111877, + "learning_rate": 6.4836151307467854e-06, + "loss": 0.756, + "step": 14744 + }, + { + "epoch": 0.8115471407342176, + "grad_norm": 0.5997880101203918, + "learning_rate": 6.483201180338163e-06, + "loss": 0.6043, + "step": 14745 + }, + { + "epoch": 0.8116021795365733, + "grad_norm": 0.7397846579551697, + "learning_rate": 6.4827872187821985e-06, + "loss": 0.848, + "step": 14746 + }, + { + "epoch": 0.8116572183389289, + "grad_norm": 0.7586305141448975, + "learning_rate": 6.482373246082001e-06, + "loss": 0.802, + "step": 14747 + }, + { + "epoch": 0.8117122571412846, + "grad_norm": 0.705182671546936, + "learning_rate": 6.4819592622406825e-06, + "loss": 0.7484, + "step": 14748 + }, + { + "epoch": 0.8117672959436403, + "grad_norm": 0.7092768549919128, + "learning_rate": 6.481545267261357e-06, + "loss": 0.7031, + "step": 14749 + }, + { + "epoch": 0.811822334745996, + "grad_norm": 0.6800800561904907, + "learning_rate": 6.4811312611471325e-06, + "loss": 0.7253, + "step": 14750 + }, + { + "epoch": 0.8118773735483515, + "grad_norm": 0.6862359642982483, + "learning_rate": 6.4807172439011215e-06, + "loss": 0.818, + "step": 14751 + }, + { + "epoch": 0.8119324123507072, + "grad_norm": 0.6928552389144897, + "learning_rate": 6.480303215526436e-06, + "loss": 0.7459, + "step": 14752 + }, + { + "epoch": 0.8119874511530629, + "grad_norm": 0.6869228482246399, + "learning_rate": 6.479889176026189e-06, + "loss": 0.7024, + "step": 14753 + }, + { + "epoch": 0.8120424899554186, + "grad_norm": 0.7036190032958984, + "learning_rate": 6.479475125403489e-06, + "loss": 0.766, + "step": 14754 + }, + { + "epoch": 0.8120975287577742, + "grad_norm": 0.6574180722236633, + "learning_rate": 6.479061063661452e-06, + "loss": 0.7355, + "step": 14755 + }, + { + "epoch": 0.8121525675601299, + "grad_norm": 0.6424534916877747, + "learning_rate": 6.478646990803188e-06, + "loss": 0.6837, + "step": 14756 + }, + { + "epoch": 0.8122076063624856, + "grad_norm": 0.6922320127487183, + "learning_rate": 6.478232906831808e-06, + "loss": 0.7535, + "step": 14757 + }, + { + "epoch": 0.8122626451648413, + "grad_norm": 0.6424705386161804, + "learning_rate": 6.477818811750426e-06, + "loss": 0.691, + "step": 14758 + }, + { + "epoch": 0.8123176839671968, + "grad_norm": 0.6180749535560608, + "learning_rate": 6.4774047055621525e-06, + "loss": 0.6944, + "step": 14759 + }, + { + "epoch": 0.8123727227695525, + "grad_norm": 0.8718746900558472, + "learning_rate": 6.4769905882701e-06, + "loss": 0.89, + "step": 14760 + }, + { + "epoch": 0.8124277615719082, + "grad_norm": 0.6664311289787292, + "learning_rate": 6.476576459877384e-06, + "loss": 0.7144, + "step": 14761 + }, + { + "epoch": 0.8124828003742639, + "grad_norm": 0.6547374129295349, + "learning_rate": 6.476162320387112e-06, + "loss": 0.7292, + "step": 14762 + }, + { + "epoch": 0.8125378391766195, + "grad_norm": 0.7387503385543823, + "learning_rate": 6.475748169802401e-06, + "loss": 0.7388, + "step": 14763 + }, + { + "epoch": 0.8125928779789752, + "grad_norm": 0.6013749241828918, + "learning_rate": 6.475334008126361e-06, + "loss": 0.6853, + "step": 14764 + }, + { + "epoch": 0.8126479167813309, + "grad_norm": 0.6720583438873291, + "learning_rate": 6.474919835362105e-06, + "loss": 0.7392, + "step": 14765 + }, + { + "epoch": 0.8127029555836865, + "grad_norm": 0.6651661992073059, + "learning_rate": 6.474505651512748e-06, + "loss": 0.7586, + "step": 14766 + }, + { + "epoch": 0.8127579943860421, + "grad_norm": 0.7653207182884216, + "learning_rate": 6.474091456581401e-06, + "loss": 0.9182, + "step": 14767 + }, + { + "epoch": 0.8128130331883978, + "grad_norm": 0.6322795152664185, + "learning_rate": 6.473677250571176e-06, + "loss": 0.6954, + "step": 14768 + }, + { + "epoch": 0.8128680719907535, + "grad_norm": 0.7423616647720337, + "learning_rate": 6.4732630334851885e-06, + "loss": 0.748, + "step": 14769 + }, + { + "epoch": 0.8129231107931092, + "grad_norm": 0.5989160537719727, + "learning_rate": 6.472848805326549e-06, + "loss": 0.6571, + "step": 14770 + }, + { + "epoch": 0.8129781495954648, + "grad_norm": 0.695566713809967, + "learning_rate": 6.472434566098373e-06, + "loss": 0.6936, + "step": 14771 + }, + { + "epoch": 0.8130331883978205, + "grad_norm": 0.6993961930274963, + "learning_rate": 6.4720203158037734e-06, + "loss": 0.8283, + "step": 14772 + }, + { + "epoch": 0.8130882272001762, + "grad_norm": 0.6430020928382874, + "learning_rate": 6.471606054445861e-06, + "loss": 0.6882, + "step": 14773 + }, + { + "epoch": 0.8131432660025318, + "grad_norm": 0.6834734678268433, + "learning_rate": 6.471191782027754e-06, + "loss": 0.7519, + "step": 14774 + }, + { + "epoch": 0.8131983048048874, + "grad_norm": 0.679432213306427, + "learning_rate": 6.470777498552561e-06, + "loss": 0.7707, + "step": 14775 + }, + { + "epoch": 0.8132533436072431, + "grad_norm": 0.6929466128349304, + "learning_rate": 6.4703632040234e-06, + "loss": 0.7166, + "step": 14776 + }, + { + "epoch": 0.8133083824095988, + "grad_norm": 0.7033447623252869, + "learning_rate": 6.469948898443381e-06, + "loss": 0.7558, + "step": 14777 + }, + { + "epoch": 0.8133634212119544, + "grad_norm": 0.89338618516922, + "learning_rate": 6.469534581815621e-06, + "loss": 0.7829, + "step": 14778 + }, + { + "epoch": 0.8134184600143101, + "grad_norm": 0.7361789345741272, + "learning_rate": 6.469120254143233e-06, + "loss": 0.7885, + "step": 14779 + }, + { + "epoch": 0.8134734988166658, + "grad_norm": 0.7532172203063965, + "learning_rate": 6.468705915429329e-06, + "loss": 0.7791, + "step": 14780 + }, + { + "epoch": 0.8135285376190214, + "grad_norm": 0.7082527279853821, + "learning_rate": 6.468291565677025e-06, + "loss": 0.7809, + "step": 14781 + }, + { + "epoch": 0.813583576421377, + "grad_norm": 0.7854330539703369, + "learning_rate": 6.467877204889435e-06, + "loss": 0.8467, + "step": 14782 + }, + { + "epoch": 0.8136386152237327, + "grad_norm": 0.7649636268615723, + "learning_rate": 6.467462833069672e-06, + "loss": 0.7766, + "step": 14783 + }, + { + "epoch": 0.8136936540260884, + "grad_norm": 0.6293399930000305, + "learning_rate": 6.467048450220852e-06, + "loss": 0.7307, + "step": 14784 + }, + { + "epoch": 0.8137486928284441, + "grad_norm": 0.7131813764572144, + "learning_rate": 6.4666340563460874e-06, + "loss": 0.7614, + "step": 14785 + }, + { + "epoch": 0.8138037316307997, + "grad_norm": 0.6650925874710083, + "learning_rate": 6.466219651448496e-06, + "loss": 0.7576, + "step": 14786 + }, + { + "epoch": 0.8138587704331554, + "grad_norm": 0.8009011745452881, + "learning_rate": 6.4658052355311875e-06, + "loss": 0.7127, + "step": 14787 + }, + { + "epoch": 0.813913809235511, + "grad_norm": 1.009027123451233, + "learning_rate": 6.465390808597281e-06, + "loss": 0.7647, + "step": 14788 + }, + { + "epoch": 0.8139688480378667, + "grad_norm": 0.7495583891868591, + "learning_rate": 6.464976370649888e-06, + "loss": 0.7276, + "step": 14789 + }, + { + "epoch": 0.8140238868402223, + "grad_norm": 0.7181064486503601, + "learning_rate": 6.464561921692125e-06, + "loss": 0.687, + "step": 14790 + }, + { + "epoch": 0.814078925642578, + "grad_norm": 0.7480552196502686, + "learning_rate": 6.464147461727108e-06, + "loss": 0.7813, + "step": 14791 + }, + { + "epoch": 0.8141339644449337, + "grad_norm": 0.6699607968330383, + "learning_rate": 6.4637329907579506e-06, + "loss": 0.7364, + "step": 14792 + }, + { + "epoch": 0.8141890032472894, + "grad_norm": 0.7321322560310364, + "learning_rate": 6.463318508787767e-06, + "loss": 0.6799, + "step": 14793 + }, + { + "epoch": 0.814244042049645, + "grad_norm": 0.8992179036140442, + "learning_rate": 6.462904015819673e-06, + "loss": 0.7602, + "step": 14794 + }, + { + "epoch": 0.8142990808520006, + "grad_norm": 0.6949485540390015, + "learning_rate": 6.462489511856784e-06, + "loss": 0.6701, + "step": 14795 + }, + { + "epoch": 0.8143541196543563, + "grad_norm": 0.6367032527923584, + "learning_rate": 6.462074996902217e-06, + "loss": 0.7132, + "step": 14796 + }, + { + "epoch": 0.814409158456712, + "grad_norm": 0.6424476504325867, + "learning_rate": 6.461660470959084e-06, + "loss": 0.7111, + "step": 14797 + }, + { + "epoch": 0.8144641972590676, + "grad_norm": 0.6649259924888611, + "learning_rate": 6.4612459340305025e-06, + "loss": 0.6583, + "step": 14798 + }, + { + "epoch": 0.8145192360614233, + "grad_norm": 0.7781171798706055, + "learning_rate": 6.460831386119587e-06, + "loss": 0.8145, + "step": 14799 + }, + { + "epoch": 0.814574274863779, + "grad_norm": 0.7409094572067261, + "learning_rate": 6.460416827229455e-06, + "loss": 0.7559, + "step": 14800 + }, + { + "epoch": 0.8146293136661347, + "grad_norm": 1.2152613401412964, + "learning_rate": 6.46000225736322e-06, + "loss": 0.8263, + "step": 14801 + }, + { + "epoch": 0.8146843524684902, + "grad_norm": 0.7133356332778931, + "learning_rate": 6.459587676524e-06, + "loss": 0.7687, + "step": 14802 + }, + { + "epoch": 0.8147393912708459, + "grad_norm": 0.8576061129570007, + "learning_rate": 6.459173084714908e-06, + "loss": 0.8364, + "step": 14803 + }, + { + "epoch": 0.8147944300732016, + "grad_norm": 0.7701650857925415, + "learning_rate": 6.4587584819390634e-06, + "loss": 0.7768, + "step": 14804 + }, + { + "epoch": 0.8148494688755573, + "grad_norm": 0.6629199981689453, + "learning_rate": 6.45834386819958e-06, + "loss": 0.7338, + "step": 14805 + }, + { + "epoch": 0.8149045076779129, + "grad_norm": 0.6498340964317322, + "learning_rate": 6.457929243499574e-06, + "loss": 0.7241, + "step": 14806 + }, + { + "epoch": 0.8149595464802686, + "grad_norm": 0.7107635140419006, + "learning_rate": 6.457514607842164e-06, + "loss": 0.7999, + "step": 14807 + }, + { + "epoch": 0.8150145852826243, + "grad_norm": 0.8689384460449219, + "learning_rate": 6.457099961230462e-06, + "loss": 0.7882, + "step": 14808 + }, + { + "epoch": 0.81506962408498, + "grad_norm": 0.7050377726554871, + "learning_rate": 6.456685303667587e-06, + "loss": 0.8039, + "step": 14809 + }, + { + "epoch": 0.8151246628873355, + "grad_norm": 0.6171709895133972, + "learning_rate": 6.456270635156656e-06, + "loss": 0.6569, + "step": 14810 + }, + { + "epoch": 0.8151797016896912, + "grad_norm": 0.837285041809082, + "learning_rate": 6.455855955700785e-06, + "loss": 0.6529, + "step": 14811 + }, + { + "epoch": 0.8152347404920469, + "grad_norm": 0.7335891723632812, + "learning_rate": 6.45544126530309e-06, + "loss": 0.814, + "step": 14812 + }, + { + "epoch": 0.8152897792944026, + "grad_norm": 0.7217129468917847, + "learning_rate": 6.4550265639666864e-06, + "loss": 0.795, + "step": 14813 + }, + { + "epoch": 0.8153448180967582, + "grad_norm": 0.7292104959487915, + "learning_rate": 6.454611851694694e-06, + "loss": 0.7169, + "step": 14814 + }, + { + "epoch": 0.8153998568991139, + "grad_norm": 0.7190173864364624, + "learning_rate": 6.454197128490229e-06, + "loss": 0.8413, + "step": 14815 + }, + { + "epoch": 0.8154548957014696, + "grad_norm": 0.6679649949073792, + "learning_rate": 6.453782394356407e-06, + "loss": 0.6626, + "step": 14816 + }, + { + "epoch": 0.8155099345038253, + "grad_norm": 0.6829885244369507, + "learning_rate": 6.453367649296347e-06, + "loss": 0.6512, + "step": 14817 + }, + { + "epoch": 0.8155649733061808, + "grad_norm": 0.659461498260498, + "learning_rate": 6.452952893313163e-06, + "loss": 0.7271, + "step": 14818 + }, + { + "epoch": 0.8156200121085365, + "grad_norm": 0.6737749576568604, + "learning_rate": 6.452538126409975e-06, + "loss": 0.6882, + "step": 14819 + }, + { + "epoch": 0.8156750509108922, + "grad_norm": 0.7798036336898804, + "learning_rate": 6.452123348589899e-06, + "loss": 0.7214, + "step": 14820 + }, + { + "epoch": 0.8157300897132478, + "grad_norm": 0.6594774127006531, + "learning_rate": 6.451708559856051e-06, + "loss": 0.7611, + "step": 14821 + }, + { + "epoch": 0.8157851285156035, + "grad_norm": 0.6795164942741394, + "learning_rate": 6.451293760211552e-06, + "loss": 0.6825, + "step": 14822 + }, + { + "epoch": 0.8158401673179592, + "grad_norm": 0.8376501798629761, + "learning_rate": 6.450878949659517e-06, + "loss": 0.7898, + "step": 14823 + }, + { + "epoch": 0.8158952061203149, + "grad_norm": 0.6746712923049927, + "learning_rate": 6.450464128203064e-06, + "loss": 0.6771, + "step": 14824 + }, + { + "epoch": 0.8159502449226704, + "grad_norm": 0.7984384894371033, + "learning_rate": 6.450049295845311e-06, + "loss": 0.7326, + "step": 14825 + }, + { + "epoch": 0.8160052837250261, + "grad_norm": 0.8210996389389038, + "learning_rate": 6.449634452589376e-06, + "loss": 0.8194, + "step": 14826 + }, + { + "epoch": 0.8160603225273818, + "grad_norm": 0.7045891284942627, + "learning_rate": 6.449219598438376e-06, + "loss": 0.7683, + "step": 14827 + }, + { + "epoch": 0.8161153613297375, + "grad_norm": 0.7199337482452393, + "learning_rate": 6.448804733395431e-06, + "loss": 0.7125, + "step": 14828 + }, + { + "epoch": 0.8161704001320931, + "grad_norm": 0.8576976656913757, + "learning_rate": 6.448389857463655e-06, + "loss": 0.6744, + "step": 14829 + }, + { + "epoch": 0.8162254389344488, + "grad_norm": 0.6944701075553894, + "learning_rate": 6.4479749706461705e-06, + "loss": 0.7663, + "step": 14830 + }, + { + "epoch": 0.8162804777368045, + "grad_norm": 0.7436455488204956, + "learning_rate": 6.447560072946093e-06, + "loss": 0.7612, + "step": 14831 + }, + { + "epoch": 0.8163355165391601, + "grad_norm": 0.6023590564727783, + "learning_rate": 6.447145164366542e-06, + "loss": 0.7029, + "step": 14832 + }, + { + "epoch": 0.8163905553415157, + "grad_norm": 0.6720685362815857, + "learning_rate": 6.446730244910633e-06, + "loss": 0.7821, + "step": 14833 + }, + { + "epoch": 0.8164455941438714, + "grad_norm": 0.6359856128692627, + "learning_rate": 6.446315314581488e-06, + "loss": 0.7119, + "step": 14834 + }, + { + "epoch": 0.8165006329462271, + "grad_norm": 0.6796891689300537, + "learning_rate": 6.445900373382225e-06, + "loss": 0.7414, + "step": 14835 + }, + { + "epoch": 0.8165556717485828, + "grad_norm": 0.6865763068199158, + "learning_rate": 6.445485421315963e-06, + "loss": 0.7239, + "step": 14836 + }, + { + "epoch": 0.8166107105509384, + "grad_norm": 0.6696601510047913, + "learning_rate": 6.445070458385816e-06, + "loss": 0.6322, + "step": 14837 + }, + { + "epoch": 0.8166657493532941, + "grad_norm": 0.6800506711006165, + "learning_rate": 6.444655484594909e-06, + "loss": 0.7827, + "step": 14838 + }, + { + "epoch": 0.8167207881556497, + "grad_norm": 0.7590689063072205, + "learning_rate": 6.444240499946357e-06, + "loss": 0.7177, + "step": 14839 + }, + { + "epoch": 0.8167758269580054, + "grad_norm": 0.6692266464233398, + "learning_rate": 6.4438255044432805e-06, + "loss": 0.6631, + "step": 14840 + }, + { + "epoch": 0.816830865760361, + "grad_norm": 0.695164144039154, + "learning_rate": 6.443410498088798e-06, + "loss": 0.6953, + "step": 14841 + }, + { + "epoch": 0.8168859045627167, + "grad_norm": 0.6503697037696838, + "learning_rate": 6.442995480886028e-06, + "loss": 0.7868, + "step": 14842 + }, + { + "epoch": 0.8169409433650724, + "grad_norm": 0.6943323016166687, + "learning_rate": 6.442580452838091e-06, + "loss": 0.7464, + "step": 14843 + }, + { + "epoch": 0.8169959821674281, + "grad_norm": 0.7510622143745422, + "learning_rate": 6.442165413948105e-06, + "loss": 0.7984, + "step": 14844 + }, + { + "epoch": 0.8170510209697837, + "grad_norm": 0.6322263479232788, + "learning_rate": 6.441750364219189e-06, + "loss": 0.7693, + "step": 14845 + }, + { + "epoch": 0.8171060597721393, + "grad_norm": 0.681967556476593, + "learning_rate": 6.4413353036544646e-06, + "loss": 0.6781, + "step": 14846 + }, + { + "epoch": 0.817161098574495, + "grad_norm": 0.6799043416976929, + "learning_rate": 6.440920232257049e-06, + "loss": 0.7791, + "step": 14847 + }, + { + "epoch": 0.8172161373768507, + "grad_norm": 0.673652172088623, + "learning_rate": 6.440505150030064e-06, + "loss": 0.7099, + "step": 14848 + }, + { + "epoch": 0.8172711761792063, + "grad_norm": 0.755377471446991, + "learning_rate": 6.4400900569766255e-06, + "loss": 0.7292, + "step": 14849 + }, + { + "epoch": 0.817326214981562, + "grad_norm": 0.6099830269813538, + "learning_rate": 6.439674953099857e-06, + "loss": 0.7154, + "step": 14850 + }, + { + "epoch": 0.8173812537839177, + "grad_norm": 0.6330500841140747, + "learning_rate": 6.439259838402878e-06, + "loss": 0.6858, + "step": 14851 + }, + { + "epoch": 0.8174362925862734, + "grad_norm": 0.6727203726768494, + "learning_rate": 6.438844712888806e-06, + "loss": 0.7089, + "step": 14852 + }, + { + "epoch": 0.817491331388629, + "grad_norm": 0.7482651472091675, + "learning_rate": 6.438429576560763e-06, + "loss": 0.7065, + "step": 14853 + }, + { + "epoch": 0.8175463701909846, + "grad_norm": 0.6786343455314636, + "learning_rate": 6.438014429421868e-06, + "loss": 0.7049, + "step": 14854 + }, + { + "epoch": 0.8176014089933403, + "grad_norm": 0.6155980825424194, + "learning_rate": 6.437599271475241e-06, + "loss": 0.607, + "step": 14855 + }, + { + "epoch": 0.817656447795696, + "grad_norm": 0.6551154851913452, + "learning_rate": 6.437184102724003e-06, + "loss": 0.7022, + "step": 14856 + }, + { + "epoch": 0.8177114865980516, + "grad_norm": 0.6127358078956604, + "learning_rate": 6.436768923171273e-06, + "loss": 0.6827, + "step": 14857 + }, + { + "epoch": 0.8177665254004073, + "grad_norm": 0.6470245718955994, + "learning_rate": 6.436353732820175e-06, + "loss": 0.6877, + "step": 14858 + }, + { + "epoch": 0.817821564202763, + "grad_norm": 0.704667866230011, + "learning_rate": 6.435938531673825e-06, + "loss": 0.7223, + "step": 14859 + }, + { + "epoch": 0.8178766030051187, + "grad_norm": 0.6328873634338379, + "learning_rate": 6.435523319735345e-06, + "loss": 0.7181, + "step": 14860 + }, + { + "epoch": 0.8179316418074742, + "grad_norm": 0.6489065885543823, + "learning_rate": 6.435108097007856e-06, + "loss": 0.7597, + "step": 14861 + }, + { + "epoch": 0.8179866806098299, + "grad_norm": 0.6398639678955078, + "learning_rate": 6.43469286349448e-06, + "loss": 0.667, + "step": 14862 + }, + { + "epoch": 0.8180417194121856, + "grad_norm": 0.7615578770637512, + "learning_rate": 6.434277619198335e-06, + "loss": 0.8474, + "step": 14863 + }, + { + "epoch": 0.8180967582145412, + "grad_norm": 0.8604047894477844, + "learning_rate": 6.433862364122545e-06, + "loss": 0.7977, + "step": 14864 + }, + { + "epoch": 0.8181517970168969, + "grad_norm": 0.6157855987548828, + "learning_rate": 6.433447098270228e-06, + "loss": 0.6513, + "step": 14865 + }, + { + "epoch": 0.8182068358192526, + "grad_norm": 0.7052211761474609, + "learning_rate": 6.433031821644507e-06, + "loss": 0.7043, + "step": 14866 + }, + { + "epoch": 0.8182618746216083, + "grad_norm": 0.785987138748169, + "learning_rate": 6.432616534248503e-06, + "loss": 0.8722, + "step": 14867 + }, + { + "epoch": 0.8183169134239638, + "grad_norm": 0.7711461782455444, + "learning_rate": 6.432201236085336e-06, + "loss": 0.68, + "step": 14868 + }, + { + "epoch": 0.8183719522263195, + "grad_norm": 0.6299784183502197, + "learning_rate": 6.431785927158126e-06, + "loss": 0.7397, + "step": 14869 + }, + { + "epoch": 0.8184269910286752, + "grad_norm": 0.6292238235473633, + "learning_rate": 6.431370607469998e-06, + "loss": 0.7392, + "step": 14870 + }, + { + "epoch": 0.8184820298310309, + "grad_norm": 0.8696228861808777, + "learning_rate": 6.430955277024071e-06, + "loss": 0.884, + "step": 14871 + }, + { + "epoch": 0.8185370686333865, + "grad_norm": 0.6754364967346191, + "learning_rate": 6.430539935823469e-06, + "loss": 0.7122, + "step": 14872 + }, + { + "epoch": 0.8185921074357422, + "grad_norm": 0.6936547160148621, + "learning_rate": 6.4301245838713085e-06, + "loss": 0.7353, + "step": 14873 + }, + { + "epoch": 0.8186471462380979, + "grad_norm": 0.8840705156326294, + "learning_rate": 6.429709221170717e-06, + "loss": 0.7043, + "step": 14874 + }, + { + "epoch": 0.8187021850404536, + "grad_norm": 0.7349988222122192, + "learning_rate": 6.4292938477248135e-06, + "loss": 0.7861, + "step": 14875 + }, + { + "epoch": 0.8187572238428091, + "grad_norm": 0.697790801525116, + "learning_rate": 6.428878463536721e-06, + "loss": 0.8021, + "step": 14876 + }, + { + "epoch": 0.8188122626451648, + "grad_norm": 0.7873979806900024, + "learning_rate": 6.428463068609559e-06, + "loss": 0.7313, + "step": 14877 + }, + { + "epoch": 0.8188673014475205, + "grad_norm": 0.6542018055915833, + "learning_rate": 6.4280476629464505e-06, + "loss": 0.7811, + "step": 14878 + }, + { + "epoch": 0.8189223402498762, + "grad_norm": 0.7477063536643982, + "learning_rate": 6.427632246550519e-06, + "loss": 0.764, + "step": 14879 + }, + { + "epoch": 0.8189773790522318, + "grad_norm": 0.6456438302993774, + "learning_rate": 6.4272168194248855e-06, + "loss": 0.7517, + "step": 14880 + }, + { + "epoch": 0.8190324178545875, + "grad_norm": 0.699684202671051, + "learning_rate": 6.426801381572671e-06, + "loss": 0.7963, + "step": 14881 + }, + { + "epoch": 0.8190874566569432, + "grad_norm": 0.9158867001533508, + "learning_rate": 6.426385932997001e-06, + "loss": 0.8782, + "step": 14882 + }, + { + "epoch": 0.8191424954592988, + "grad_norm": 0.5998190641403198, + "learning_rate": 6.425970473700995e-06, + "loss": 0.6598, + "step": 14883 + }, + { + "epoch": 0.8191975342616544, + "grad_norm": 0.6674730777740479, + "learning_rate": 6.4255550036877775e-06, + "loss": 0.7232, + "step": 14884 + }, + { + "epoch": 0.8192525730640101, + "grad_norm": 0.6303582191467285, + "learning_rate": 6.42513952296047e-06, + "loss": 0.7614, + "step": 14885 + }, + { + "epoch": 0.8193076118663658, + "grad_norm": 0.6255910992622375, + "learning_rate": 6.424724031522195e-06, + "loss": 0.7052, + "step": 14886 + }, + { + "epoch": 0.8193626506687215, + "grad_norm": 0.6610854268074036, + "learning_rate": 6.424308529376075e-06, + "loss": 0.7403, + "step": 14887 + }, + { + "epoch": 0.8194176894710771, + "grad_norm": 0.6758664846420288, + "learning_rate": 6.4238930165252355e-06, + "loss": 0.7603, + "step": 14888 + }, + { + "epoch": 0.8194727282734328, + "grad_norm": 0.6897797584533691, + "learning_rate": 6.423477492972796e-06, + "loss": 0.7194, + "step": 14889 + }, + { + "epoch": 0.8195277670757884, + "grad_norm": 0.7007622718811035, + "learning_rate": 6.42306195872188e-06, + "loss": 0.7905, + "step": 14890 + }, + { + "epoch": 0.8195828058781441, + "grad_norm": 0.7482092976570129, + "learning_rate": 6.422646413775613e-06, + "loss": 0.7809, + "step": 14891 + }, + { + "epoch": 0.8196378446804997, + "grad_norm": 0.9551613926887512, + "learning_rate": 6.422230858137115e-06, + "loss": 0.8559, + "step": 14892 + }, + { + "epoch": 0.8196928834828554, + "grad_norm": 0.6831939220428467, + "learning_rate": 6.42181529180951e-06, + "loss": 0.7867, + "step": 14893 + }, + { + "epoch": 0.8197479222852111, + "grad_norm": 1.446377158164978, + "learning_rate": 6.421399714795923e-06, + "loss": 0.8745, + "step": 14894 + }, + { + "epoch": 0.8198029610875668, + "grad_norm": 0.6738638877868652, + "learning_rate": 6.420984127099475e-06, + "loss": 0.727, + "step": 14895 + }, + { + "epoch": 0.8198579998899224, + "grad_norm": 0.7388872504234314, + "learning_rate": 6.420568528723292e-06, + "loss": 0.7041, + "step": 14896 + }, + { + "epoch": 0.819913038692278, + "grad_norm": 0.6977630853652954, + "learning_rate": 6.420152919670495e-06, + "loss": 0.7944, + "step": 14897 + }, + { + "epoch": 0.8199680774946337, + "grad_norm": 0.6300190091133118, + "learning_rate": 6.41973729994421e-06, + "loss": 0.6879, + "step": 14898 + }, + { + "epoch": 0.8200231162969894, + "grad_norm": 0.6350599527359009, + "learning_rate": 6.419321669547559e-06, + "loss": 0.6725, + "step": 14899 + }, + { + "epoch": 0.820078155099345, + "grad_norm": 0.8604453206062317, + "learning_rate": 6.418906028483667e-06, + "loss": 0.7706, + "step": 14900 + }, + { + "epoch": 0.8201331939017007, + "grad_norm": 0.6574103236198425, + "learning_rate": 6.418490376755656e-06, + "loss": 0.7008, + "step": 14901 + }, + { + "epoch": 0.8201882327040564, + "grad_norm": 0.706132173538208, + "learning_rate": 6.418074714366651e-06, + "loss": 0.7608, + "step": 14902 + }, + { + "epoch": 0.8202432715064121, + "grad_norm": 1.155480146408081, + "learning_rate": 6.417659041319777e-06, + "loss": 0.6893, + "step": 14903 + }, + { + "epoch": 0.8202983103087677, + "grad_norm": 0.8497835397720337, + "learning_rate": 6.417243357618157e-06, + "loss": 0.6889, + "step": 14904 + }, + { + "epoch": 0.8203533491111233, + "grad_norm": 0.9319966435432434, + "learning_rate": 6.416827663264915e-06, + "loss": 0.8098, + "step": 14905 + }, + { + "epoch": 0.820408387913479, + "grad_norm": 0.744888186454773, + "learning_rate": 6.4164119582631745e-06, + "loss": 0.7871, + "step": 14906 + }, + { + "epoch": 0.8204634267158346, + "grad_norm": 0.6928347945213318, + "learning_rate": 6.415996242616063e-06, + "loss": 0.7693, + "step": 14907 + }, + { + "epoch": 0.8205184655181903, + "grad_norm": 0.7455456852912903, + "learning_rate": 6.415580516326701e-06, + "loss": 0.6475, + "step": 14908 + }, + { + "epoch": 0.820573504320546, + "grad_norm": 0.6823583245277405, + "learning_rate": 6.415164779398215e-06, + "loss": 0.7223, + "step": 14909 + }, + { + "epoch": 0.8206285431229017, + "grad_norm": 0.6989970207214355, + "learning_rate": 6.414749031833729e-06, + "loss": 0.8203, + "step": 14910 + }, + { + "epoch": 0.8206835819252573, + "grad_norm": 0.6026825308799744, + "learning_rate": 6.414333273636369e-06, + "loss": 0.6307, + "step": 14911 + }, + { + "epoch": 0.8207386207276129, + "grad_norm": 0.6102367639541626, + "learning_rate": 6.413917504809258e-06, + "loss": 0.7049, + "step": 14912 + }, + { + "epoch": 0.8207936595299686, + "grad_norm": 0.6658119559288025, + "learning_rate": 6.4135017253555225e-06, + "loss": 0.7541, + "step": 14913 + }, + { + "epoch": 0.8208486983323243, + "grad_norm": 0.7272284626960754, + "learning_rate": 6.413085935278286e-06, + "loss": 0.7581, + "step": 14914 + }, + { + "epoch": 0.8209037371346799, + "grad_norm": 0.7826990485191345, + "learning_rate": 6.412670134580674e-06, + "loss": 0.8121, + "step": 14915 + }, + { + "epoch": 0.8209587759370356, + "grad_norm": 0.5845723748207092, + "learning_rate": 6.412254323265811e-06, + "loss": 0.5921, + "step": 14916 + }, + { + "epoch": 0.8210138147393913, + "grad_norm": 0.655577540397644, + "learning_rate": 6.411838501336823e-06, + "loss": 0.7694, + "step": 14917 + }, + { + "epoch": 0.821068853541747, + "grad_norm": 0.6722497940063477, + "learning_rate": 6.4114226687968325e-06, + "loss": 0.6377, + "step": 14918 + }, + { + "epoch": 0.8211238923441025, + "grad_norm": 0.713169276714325, + "learning_rate": 6.41100682564897e-06, + "loss": 0.7328, + "step": 14919 + }, + { + "epoch": 0.8211789311464582, + "grad_norm": 0.6004113554954529, + "learning_rate": 6.410590971896357e-06, + "loss": 0.6564, + "step": 14920 + }, + { + "epoch": 0.8212339699488139, + "grad_norm": 0.6541520953178406, + "learning_rate": 6.410175107542119e-06, + "loss": 0.7063, + "step": 14921 + }, + { + "epoch": 0.8212890087511696, + "grad_norm": 0.7937784194946289, + "learning_rate": 6.409759232589383e-06, + "loss": 0.7516, + "step": 14922 + }, + { + "epoch": 0.8213440475535252, + "grad_norm": 0.7017408013343811, + "learning_rate": 6.409343347041274e-06, + "loss": 0.6846, + "step": 14923 + }, + { + "epoch": 0.8213990863558809, + "grad_norm": 0.6233413815498352, + "learning_rate": 6.408927450900917e-06, + "loss": 0.6655, + "step": 14924 + }, + { + "epoch": 0.8214541251582366, + "grad_norm": 0.93160480260849, + "learning_rate": 6.4085115441714396e-06, + "loss": 0.7461, + "step": 14925 + }, + { + "epoch": 0.8215091639605923, + "grad_norm": 0.6075658202171326, + "learning_rate": 6.4080956268559655e-06, + "loss": 0.705, + "step": 14926 + }, + { + "epoch": 0.8215642027629478, + "grad_norm": 0.6212051510810852, + "learning_rate": 6.407679698957623e-06, + "loss": 0.6943, + "step": 14927 + }, + { + "epoch": 0.8216192415653035, + "grad_norm": 0.8143971562385559, + "learning_rate": 6.407263760479536e-06, + "loss": 0.6918, + "step": 14928 + }, + { + "epoch": 0.8216742803676592, + "grad_norm": 0.6851963996887207, + "learning_rate": 6.406847811424831e-06, + "loss": 0.7849, + "step": 14929 + }, + { + "epoch": 0.8217293191700149, + "grad_norm": 0.7047909498214722, + "learning_rate": 6.406431851796633e-06, + "loss": 0.7364, + "step": 14930 + }, + { + "epoch": 0.8217843579723705, + "grad_norm": 0.7377674579620361, + "learning_rate": 6.406015881598071e-06, + "loss": 0.7413, + "step": 14931 + }, + { + "epoch": 0.8218393967747262, + "grad_norm": 0.7188243269920349, + "learning_rate": 6.405599900832271e-06, + "loss": 0.8051, + "step": 14932 + }, + { + "epoch": 0.8218944355770819, + "grad_norm": 0.7588842511177063, + "learning_rate": 6.4051839095023575e-06, + "loss": 0.7687, + "step": 14933 + }, + { + "epoch": 0.8219494743794376, + "grad_norm": 0.6396436095237732, + "learning_rate": 6.404767907611457e-06, + "loss": 0.7516, + "step": 14934 + }, + { + "epoch": 0.8220045131817931, + "grad_norm": 0.6896073818206787, + "learning_rate": 6.404351895162698e-06, + "loss": 0.7904, + "step": 14935 + }, + { + "epoch": 0.8220595519841488, + "grad_norm": 0.7475640773773193, + "learning_rate": 6.403935872159206e-06, + "loss": 0.8325, + "step": 14936 + }, + { + "epoch": 0.8221145907865045, + "grad_norm": 0.6456442475318909, + "learning_rate": 6.403519838604107e-06, + "loss": 0.7685, + "step": 14937 + }, + { + "epoch": 0.8221696295888602, + "grad_norm": 0.6446966528892517, + "learning_rate": 6.40310379450053e-06, + "loss": 0.731, + "step": 14938 + }, + { + "epoch": 0.8222246683912158, + "grad_norm": 0.7744176983833313, + "learning_rate": 6.4026877398515995e-06, + "loss": 0.7975, + "step": 14939 + }, + { + "epoch": 0.8222797071935715, + "grad_norm": 0.6441214680671692, + "learning_rate": 6.402271674660444e-06, + "loss": 0.7386, + "step": 14940 + }, + { + "epoch": 0.8223347459959272, + "grad_norm": 0.6788361072540283, + "learning_rate": 6.40185559893019e-06, + "loss": 0.7664, + "step": 14941 + }, + { + "epoch": 0.8223897847982828, + "grad_norm": 0.6565073132514954, + "learning_rate": 6.4014395126639624e-06, + "loss": 0.6716, + "step": 14942 + }, + { + "epoch": 0.8224448236006384, + "grad_norm": 0.6475300788879395, + "learning_rate": 6.401023415864893e-06, + "loss": 0.6887, + "step": 14943 + }, + { + "epoch": 0.8224998624029941, + "grad_norm": 0.7058338522911072, + "learning_rate": 6.400607308536107e-06, + "loss": 0.7248, + "step": 14944 + }, + { + "epoch": 0.8225549012053498, + "grad_norm": 0.7184485197067261, + "learning_rate": 6.4001911906807305e-06, + "loss": 0.693, + "step": 14945 + }, + { + "epoch": 0.8226099400077055, + "grad_norm": 0.6280504465103149, + "learning_rate": 6.399775062301891e-06, + "loss": 0.6776, + "step": 14946 + }, + { + "epoch": 0.8226649788100611, + "grad_norm": 0.6995168328285217, + "learning_rate": 6.399358923402716e-06, + "loss": 0.7536, + "step": 14947 + }, + { + "epoch": 0.8227200176124168, + "grad_norm": 0.7770118713378906, + "learning_rate": 6.398942773986337e-06, + "loss": 0.6966, + "step": 14948 + }, + { + "epoch": 0.8227750564147724, + "grad_norm": 0.6947488188743591, + "learning_rate": 6.398526614055876e-06, + "loss": 0.7317, + "step": 14949 + }, + { + "epoch": 0.822830095217128, + "grad_norm": 0.7234527468681335, + "learning_rate": 6.3981104436144645e-06, + "loss": 0.6495, + "step": 14950 + }, + { + "epoch": 0.8228851340194837, + "grad_norm": 0.6872434020042419, + "learning_rate": 6.3976942626652295e-06, + "loss": 0.651, + "step": 14951 + }, + { + "epoch": 0.8229401728218394, + "grad_norm": 0.6762012243270874, + "learning_rate": 6.397278071211298e-06, + "loss": 0.7115, + "step": 14952 + }, + { + "epoch": 0.8229952116241951, + "grad_norm": 0.7007278800010681, + "learning_rate": 6.396861869255799e-06, + "loss": 0.717, + "step": 14953 + }, + { + "epoch": 0.8230502504265507, + "grad_norm": 0.7403082251548767, + "learning_rate": 6.396445656801859e-06, + "loss": 0.846, + "step": 14954 + }, + { + "epoch": 0.8231052892289064, + "grad_norm": 0.688758373260498, + "learning_rate": 6.396029433852609e-06, + "loss": 0.7871, + "step": 14955 + }, + { + "epoch": 0.823160328031262, + "grad_norm": 0.7264360189437866, + "learning_rate": 6.395613200411173e-06, + "loss": 0.7803, + "step": 14956 + }, + { + "epoch": 0.8232153668336177, + "grad_norm": 0.6858585476875305, + "learning_rate": 6.395196956480683e-06, + "loss": 0.6595, + "step": 14957 + }, + { + "epoch": 0.8232704056359733, + "grad_norm": 0.7834211587905884, + "learning_rate": 6.394780702064266e-06, + "loss": 0.7689, + "step": 14958 + }, + { + "epoch": 0.823325444438329, + "grad_norm": 0.6933274865150452, + "learning_rate": 6.394364437165052e-06, + "loss": 0.758, + "step": 14959 + }, + { + "epoch": 0.8233804832406847, + "grad_norm": 0.7490070462226868, + "learning_rate": 6.3939481617861664e-06, + "loss": 0.8106, + "step": 14960 + }, + { + "epoch": 0.8234355220430404, + "grad_norm": 0.5586501955986023, + "learning_rate": 6.3935318759307405e-06, + "loss": 0.6207, + "step": 14961 + }, + { + "epoch": 0.823490560845396, + "grad_norm": 0.6999693512916565, + "learning_rate": 6.393115579601902e-06, + "loss": 0.7787, + "step": 14962 + }, + { + "epoch": 0.8235455996477516, + "grad_norm": 1.0214177370071411, + "learning_rate": 6.392699272802779e-06, + "loss": 0.6444, + "step": 14963 + }, + { + "epoch": 0.8236006384501073, + "grad_norm": 0.7808836698532104, + "learning_rate": 6.392282955536502e-06, + "loss": 0.7537, + "step": 14964 + }, + { + "epoch": 0.823655677252463, + "grad_norm": 0.6825253963470459, + "learning_rate": 6.391866627806198e-06, + "loss": 0.7346, + "step": 14965 + }, + { + "epoch": 0.8237107160548186, + "grad_norm": 0.6105558276176453, + "learning_rate": 6.391450289614998e-06, + "loss": 0.6631, + "step": 14966 + }, + { + "epoch": 0.8237657548571743, + "grad_norm": 0.721986711025238, + "learning_rate": 6.391033940966029e-06, + "loss": 0.8638, + "step": 14967 + }, + { + "epoch": 0.82382079365953, + "grad_norm": 0.6226428747177124, + "learning_rate": 6.390617581862421e-06, + "loss": 0.7291, + "step": 14968 + }, + { + "epoch": 0.8238758324618857, + "grad_norm": 0.7403777241706848, + "learning_rate": 6.390201212307305e-06, + "loss": 0.7417, + "step": 14969 + }, + { + "epoch": 0.8239308712642412, + "grad_norm": 0.7188371419906616, + "learning_rate": 6.389784832303808e-06, + "loss": 0.757, + "step": 14970 + }, + { + "epoch": 0.8239859100665969, + "grad_norm": 0.8741163611412048, + "learning_rate": 6.389368441855061e-06, + "loss": 0.7264, + "step": 14971 + }, + { + "epoch": 0.8240409488689526, + "grad_norm": 0.7092788219451904, + "learning_rate": 6.388952040964192e-06, + "loss": 0.731, + "step": 14972 + }, + { + "epoch": 0.8240959876713083, + "grad_norm": 0.9291765689849854, + "learning_rate": 6.388535629634331e-06, + "loss": 0.7964, + "step": 14973 + }, + { + "epoch": 0.8241510264736639, + "grad_norm": 0.6140535473823547, + "learning_rate": 6.388119207868608e-06, + "loss": 0.7099, + "step": 14974 + }, + { + "epoch": 0.8242060652760196, + "grad_norm": 0.654778778553009, + "learning_rate": 6.387702775670154e-06, + "loss": 0.6667, + "step": 14975 + }, + { + "epoch": 0.8242611040783753, + "grad_norm": 0.7221185564994812, + "learning_rate": 6.387286333042095e-06, + "loss": 0.7533, + "step": 14976 + }, + { + "epoch": 0.824316142880731, + "grad_norm": 0.6680133938789368, + "learning_rate": 6.386869879987565e-06, + "loss": 0.6404, + "step": 14977 + }, + { + "epoch": 0.8243711816830865, + "grad_norm": 0.7067292928695679, + "learning_rate": 6.386453416509691e-06, + "loss": 0.8493, + "step": 14978 + }, + { + "epoch": 0.8244262204854422, + "grad_norm": 0.6279785633087158, + "learning_rate": 6.386036942611605e-06, + "loss": 0.7465, + "step": 14979 + }, + { + "epoch": 0.8244812592877979, + "grad_norm": 0.7184332013130188, + "learning_rate": 6.385620458296438e-06, + "loss": 0.738, + "step": 14980 + }, + { + "epoch": 0.8245362980901536, + "grad_norm": 0.7318315505981445, + "learning_rate": 6.385203963567316e-06, + "loss": 0.7409, + "step": 14981 + }, + { + "epoch": 0.8245913368925092, + "grad_norm": 0.6848355531692505, + "learning_rate": 6.384787458427372e-06, + "loss": 0.7343, + "step": 14982 + }, + { + "epoch": 0.8246463756948649, + "grad_norm": 0.7097738981246948, + "learning_rate": 6.384370942879736e-06, + "loss": 0.817, + "step": 14983 + }, + { + "epoch": 0.8247014144972206, + "grad_norm": 0.6933857798576355, + "learning_rate": 6.38395441692754e-06, + "loss": 0.7356, + "step": 14984 + }, + { + "epoch": 0.8247564532995763, + "grad_norm": 0.6631865501403809, + "learning_rate": 6.383537880573913e-06, + "loss": 0.752, + "step": 14985 + }, + { + "epoch": 0.8248114921019318, + "grad_norm": 0.6564633846282959, + "learning_rate": 6.3831213338219855e-06, + "loss": 0.7755, + "step": 14986 + }, + { + "epoch": 0.8248665309042875, + "grad_norm": 0.6518037915229797, + "learning_rate": 6.382704776674887e-06, + "loss": 0.7185, + "step": 14987 + }, + { + "epoch": 0.8249215697066432, + "grad_norm": 0.7074370384216309, + "learning_rate": 6.382288209135752e-06, + "loss": 0.7632, + "step": 14988 + }, + { + "epoch": 0.8249766085089989, + "grad_norm": 0.7034205198287964, + "learning_rate": 6.381871631207707e-06, + "loss": 0.8234, + "step": 14989 + }, + { + "epoch": 0.8250316473113545, + "grad_norm": 0.7635502815246582, + "learning_rate": 6.381455042893884e-06, + "loss": 0.7847, + "step": 14990 + }, + { + "epoch": 0.8250866861137102, + "grad_norm": 0.7682950496673584, + "learning_rate": 6.381038444197416e-06, + "loss": 0.6815, + "step": 14991 + }, + { + "epoch": 0.8251417249160659, + "grad_norm": 0.7713856101036072, + "learning_rate": 6.380621835121432e-06, + "loss": 0.7437, + "step": 14992 + }, + { + "epoch": 0.8251967637184214, + "grad_norm": 0.7955800294876099, + "learning_rate": 6.380205215669064e-06, + "loss": 0.876, + "step": 14993 + }, + { + "epoch": 0.8252518025207771, + "grad_norm": 0.6979825496673584, + "learning_rate": 6.379788585843443e-06, + "loss": 0.7018, + "step": 14994 + }, + { + "epoch": 0.8253068413231328, + "grad_norm": 0.6413466930389404, + "learning_rate": 6.379371945647701e-06, + "loss": 0.7345, + "step": 14995 + }, + { + "epoch": 0.8253618801254885, + "grad_norm": 0.6284430027008057, + "learning_rate": 6.378955295084968e-06, + "loss": 0.6758, + "step": 14996 + }, + { + "epoch": 0.8254169189278441, + "grad_norm": 0.5943842530250549, + "learning_rate": 6.378538634158377e-06, + "loss": 0.6572, + "step": 14997 + }, + { + "epoch": 0.8254719577301998, + "grad_norm": 0.7123218774795532, + "learning_rate": 6.378121962871058e-06, + "loss": 0.6993, + "step": 14998 + }, + { + "epoch": 0.8255269965325555, + "grad_norm": 0.6608574390411377, + "learning_rate": 6.377705281226143e-06, + "loss": 0.7802, + "step": 14999 + }, + { + "epoch": 0.8255820353349111, + "grad_norm": 0.6387534141540527, + "learning_rate": 6.377288589226764e-06, + "loss": 0.6572, + "step": 15000 + }, + { + "epoch": 0.8256370741372667, + "grad_norm": 0.6593596935272217, + "learning_rate": 6.376871886876054e-06, + "loss": 0.665, + "step": 15001 + }, + { + "epoch": 0.8256921129396224, + "grad_norm": 0.7146610617637634, + "learning_rate": 6.376455174177141e-06, + "loss": 0.7278, + "step": 15002 + }, + { + "epoch": 0.8257471517419781, + "grad_norm": 0.6776326298713684, + "learning_rate": 6.376038451133161e-06, + "loss": 0.7679, + "step": 15003 + }, + { + "epoch": 0.8258021905443338, + "grad_norm": 0.7008724808692932, + "learning_rate": 6.375621717747244e-06, + "loss": 0.8749, + "step": 15004 + }, + { + "epoch": 0.8258572293466894, + "grad_norm": 0.6809947490692139, + "learning_rate": 6.375204974022522e-06, + "loss": 0.7248, + "step": 15005 + }, + { + "epoch": 0.8259122681490451, + "grad_norm": 0.6921886205673218, + "learning_rate": 6.374788219962127e-06, + "loss": 0.6685, + "step": 15006 + }, + { + "epoch": 0.8259673069514007, + "grad_norm": 0.6471500396728516, + "learning_rate": 6.374371455569192e-06, + "loss": 0.6856, + "step": 15007 + }, + { + "epoch": 0.8260223457537564, + "grad_norm": 0.673425555229187, + "learning_rate": 6.373954680846851e-06, + "loss": 0.7006, + "step": 15008 + }, + { + "epoch": 0.826077384556112, + "grad_norm": 0.710217297077179, + "learning_rate": 6.373537895798233e-06, + "loss": 0.7315, + "step": 15009 + }, + { + "epoch": 0.8261324233584677, + "grad_norm": 0.692030668258667, + "learning_rate": 6.3731211004264725e-06, + "loss": 0.6534, + "step": 15010 + }, + { + "epoch": 0.8261874621608234, + "grad_norm": 0.6370778679847717, + "learning_rate": 6.372704294734701e-06, + "loss": 0.7278, + "step": 15011 + }, + { + "epoch": 0.8262425009631791, + "grad_norm": 0.6571012139320374, + "learning_rate": 6.372287478726052e-06, + "loss": 0.6889, + "step": 15012 + }, + { + "epoch": 0.8262975397655347, + "grad_norm": 0.721810519695282, + "learning_rate": 6.371870652403657e-06, + "loss": 0.8572, + "step": 15013 + }, + { + "epoch": 0.8263525785678904, + "grad_norm": 0.6751163601875305, + "learning_rate": 6.371453815770647e-06, + "loss": 0.7646, + "step": 15014 + }, + { + "epoch": 0.826407617370246, + "grad_norm": 0.724319338798523, + "learning_rate": 6.371036968830161e-06, + "loss": 0.8433, + "step": 15015 + }, + { + "epoch": 0.8264626561726017, + "grad_norm": 0.6961913108825684, + "learning_rate": 6.370620111585326e-06, + "loss": 0.7069, + "step": 15016 + }, + { + "epoch": 0.8265176949749573, + "grad_norm": 0.649428129196167, + "learning_rate": 6.370203244039279e-06, + "loss": 0.7286, + "step": 15017 + }, + { + "epoch": 0.826572733777313, + "grad_norm": 0.6468552947044373, + "learning_rate": 6.369786366195149e-06, + "loss": 0.7006, + "step": 15018 + }, + { + "epoch": 0.8266277725796687, + "grad_norm": 0.6564732789993286, + "learning_rate": 6.369369478056072e-06, + "loss": 0.727, + "step": 15019 + }, + { + "epoch": 0.8266828113820244, + "grad_norm": 0.6573188900947571, + "learning_rate": 6.36895257962518e-06, + "loss": 0.6603, + "step": 15020 + }, + { + "epoch": 0.82673785018438, + "grad_norm": 0.747164785861969, + "learning_rate": 6.368535670905609e-06, + "loss": 0.7426, + "step": 15021 + }, + { + "epoch": 0.8267928889867356, + "grad_norm": 0.6366723775863647, + "learning_rate": 6.368118751900489e-06, + "loss": 0.6487, + "step": 15022 + }, + { + "epoch": 0.8268479277890913, + "grad_norm": 0.6517844200134277, + "learning_rate": 6.367701822612955e-06, + "loss": 0.7131, + "step": 15023 + }, + { + "epoch": 0.826902966591447, + "grad_norm": 0.774309515953064, + "learning_rate": 6.367284883046141e-06, + "loss": 0.7978, + "step": 15024 + }, + { + "epoch": 0.8269580053938026, + "grad_norm": 0.6302667856216431, + "learning_rate": 6.366867933203178e-06, + "loss": 0.7403, + "step": 15025 + }, + { + "epoch": 0.8270130441961583, + "grad_norm": 0.6881224513053894, + "learning_rate": 6.366450973087202e-06, + "loss": 0.7884, + "step": 15026 + }, + { + "epoch": 0.827068082998514, + "grad_norm": 0.6901270747184753, + "learning_rate": 6.366034002701346e-06, + "loss": 0.6596, + "step": 15027 + }, + { + "epoch": 0.8271231218008697, + "grad_norm": 0.7436091303825378, + "learning_rate": 6.365617022048745e-06, + "loss": 0.8141, + "step": 15028 + }, + { + "epoch": 0.8271781606032252, + "grad_norm": 0.6745834350585938, + "learning_rate": 6.365200031132531e-06, + "loss": 0.7738, + "step": 15029 + }, + { + "epoch": 0.8272331994055809, + "grad_norm": 0.6963297724723816, + "learning_rate": 6.364783029955839e-06, + "loss": 0.8649, + "step": 15030 + }, + { + "epoch": 0.8272882382079366, + "grad_norm": 0.6468135714530945, + "learning_rate": 6.364366018521803e-06, + "loss": 0.7403, + "step": 15031 + }, + { + "epoch": 0.8273432770102923, + "grad_norm": 0.6481515169143677, + "learning_rate": 6.363948996833559e-06, + "loss": 0.6268, + "step": 15032 + }, + { + "epoch": 0.8273983158126479, + "grad_norm": 0.6881366968154907, + "learning_rate": 6.3635319648942386e-06, + "loss": 0.6339, + "step": 15033 + }, + { + "epoch": 0.8274533546150036, + "grad_norm": 0.6858122944831848, + "learning_rate": 6.363114922706977e-06, + "loss": 0.7685, + "step": 15034 + }, + { + "epoch": 0.8275083934173593, + "grad_norm": 0.6630339026451111, + "learning_rate": 6.362697870274907e-06, + "loss": 0.7281, + "step": 15035 + }, + { + "epoch": 0.8275634322197148, + "grad_norm": 0.7198584079742432, + "learning_rate": 6.362280807601167e-06, + "loss": 0.7726, + "step": 15036 + }, + { + "epoch": 0.8276184710220705, + "grad_norm": 0.721622884273529, + "learning_rate": 6.361863734688888e-06, + "loss": 0.6471, + "step": 15037 + }, + { + "epoch": 0.8276735098244262, + "grad_norm": 0.6032352447509766, + "learning_rate": 6.3614466515412055e-06, + "loss": 0.6684, + "step": 15038 + }, + { + "epoch": 0.8277285486267819, + "grad_norm": 0.7568576335906982, + "learning_rate": 6.3610295581612535e-06, + "loss": 0.7089, + "step": 15039 + }, + { + "epoch": 0.8277835874291375, + "grad_norm": 0.7461723685264587, + "learning_rate": 6.360612454552168e-06, + "loss": 0.806, + "step": 15040 + }, + { + "epoch": 0.8278386262314932, + "grad_norm": 0.6606107354164124, + "learning_rate": 6.3601953407170855e-06, + "loss": 0.7276, + "step": 15041 + }, + { + "epoch": 0.8278936650338489, + "grad_norm": 0.7203792333602905, + "learning_rate": 6.3597782166591384e-06, + "loss": 0.844, + "step": 15042 + }, + { + "epoch": 0.8279487038362046, + "grad_norm": 0.7327194213867188, + "learning_rate": 6.35936108238146e-06, + "loss": 0.8289, + "step": 15043 + }, + { + "epoch": 0.8280037426385601, + "grad_norm": 0.6741734147071838, + "learning_rate": 6.358943937887189e-06, + "loss": 0.7022, + "step": 15044 + }, + { + "epoch": 0.8280587814409158, + "grad_norm": 0.795724630355835, + "learning_rate": 6.35852678317946e-06, + "loss": 0.7703, + "step": 15045 + }, + { + "epoch": 0.8281138202432715, + "grad_norm": 0.6476230621337891, + "learning_rate": 6.3581096182614055e-06, + "loss": 0.7471, + "step": 15046 + }, + { + "epoch": 0.8281688590456272, + "grad_norm": 0.658829391002655, + "learning_rate": 6.357692443136164e-06, + "loss": 0.7796, + "step": 15047 + }, + { + "epoch": 0.8282238978479828, + "grad_norm": 0.6755202412605286, + "learning_rate": 6.35727525780687e-06, + "loss": 0.8239, + "step": 15048 + }, + { + "epoch": 0.8282789366503385, + "grad_norm": 0.6518263220787048, + "learning_rate": 6.356858062276658e-06, + "loss": 0.7222, + "step": 15049 + }, + { + "epoch": 0.8283339754526942, + "grad_norm": 0.7006294131278992, + "learning_rate": 6.356440856548662e-06, + "loss": 0.7779, + "step": 15050 + }, + { + "epoch": 0.8283890142550498, + "grad_norm": 0.6771633625030518, + "learning_rate": 6.356023640626021e-06, + "loss": 0.7529, + "step": 15051 + }, + { + "epoch": 0.8284440530574054, + "grad_norm": 0.6893792152404785, + "learning_rate": 6.35560641451187e-06, + "loss": 0.834, + "step": 15052 + }, + { + "epoch": 0.8284990918597611, + "grad_norm": 0.7450309991836548, + "learning_rate": 6.355189178209343e-06, + "loss": 0.7017, + "step": 15053 + }, + { + "epoch": 0.8285541306621168, + "grad_norm": 0.7094436883926392, + "learning_rate": 6.3547719317215785e-06, + "loss": 0.7883, + "step": 15054 + }, + { + "epoch": 0.8286091694644725, + "grad_norm": 0.6926944255828857, + "learning_rate": 6.3543546750517085e-06, + "loss": 0.7309, + "step": 15055 + }, + { + "epoch": 0.8286642082668281, + "grad_norm": 0.7394436597824097, + "learning_rate": 6.3539374082028725e-06, + "loss": 0.8819, + "step": 15056 + }, + { + "epoch": 0.8287192470691838, + "grad_norm": 0.7663393616676331, + "learning_rate": 6.353520131178206e-06, + "loss": 0.7269, + "step": 15057 + }, + { + "epoch": 0.8287742858715395, + "grad_norm": 0.702627956867218, + "learning_rate": 6.353102843980844e-06, + "loss": 0.8205, + "step": 15058 + }, + { + "epoch": 0.8288293246738951, + "grad_norm": 0.6575393676757812, + "learning_rate": 6.352685546613924e-06, + "loss": 0.782, + "step": 15059 + }, + { + "epoch": 0.8288843634762507, + "grad_norm": 0.6844787001609802, + "learning_rate": 6.35226823908058e-06, + "loss": 0.7485, + "step": 15060 + }, + { + "epoch": 0.8289394022786064, + "grad_norm": 0.6018843054771423, + "learning_rate": 6.351850921383951e-06, + "loss": 0.6788, + "step": 15061 + }, + { + "epoch": 0.8289944410809621, + "grad_norm": 0.7418997883796692, + "learning_rate": 6.351433593527172e-06, + "loss": 0.6789, + "step": 15062 + }, + { + "epoch": 0.8290494798833178, + "grad_norm": 0.625535786151886, + "learning_rate": 6.351016255513379e-06, + "loss": 0.7405, + "step": 15063 + }, + { + "epoch": 0.8291045186856734, + "grad_norm": 0.678569495677948, + "learning_rate": 6.350598907345711e-06, + "loss": 0.7386, + "step": 15064 + }, + { + "epoch": 0.829159557488029, + "grad_norm": 0.8012919425964355, + "learning_rate": 6.350181549027302e-06, + "loss": 0.7703, + "step": 15065 + }, + { + "epoch": 0.8292145962903847, + "grad_norm": 0.6115431189537048, + "learning_rate": 6.3497641805612905e-06, + "loss": 0.7131, + "step": 15066 + }, + { + "epoch": 0.8292696350927404, + "grad_norm": 0.7392085194587708, + "learning_rate": 6.349346801950812e-06, + "loss": 0.7648, + "step": 15067 + }, + { + "epoch": 0.829324673895096, + "grad_norm": 0.597613513469696, + "learning_rate": 6.348929413199005e-06, + "loss": 0.6023, + "step": 15068 + }, + { + "epoch": 0.8293797126974517, + "grad_norm": 0.6418130397796631, + "learning_rate": 6.348512014309005e-06, + "loss": 0.7507, + "step": 15069 + }, + { + "epoch": 0.8294347514998074, + "grad_norm": 0.6351965665817261, + "learning_rate": 6.34809460528395e-06, + "loss": 0.722, + "step": 15070 + }, + { + "epoch": 0.8294897903021631, + "grad_norm": 0.6593570709228516, + "learning_rate": 6.347677186126977e-06, + "loss": 0.7032, + "step": 15071 + }, + { + "epoch": 0.8295448291045187, + "grad_norm": 0.8040562868118286, + "learning_rate": 6.3472597568412235e-06, + "loss": 0.6519, + "step": 15072 + }, + { + "epoch": 0.8295998679068743, + "grad_norm": 0.7043612599372864, + "learning_rate": 6.346842317429825e-06, + "loss": 0.7765, + "step": 15073 + }, + { + "epoch": 0.82965490670923, + "grad_norm": 0.6304612159729004, + "learning_rate": 6.346424867895922e-06, + "loss": 0.6763, + "step": 15074 + }, + { + "epoch": 0.8297099455115857, + "grad_norm": 0.6402591466903687, + "learning_rate": 6.346007408242647e-06, + "loss": 0.828, + "step": 15075 + }, + { + "epoch": 0.8297649843139413, + "grad_norm": 0.6908280849456787, + "learning_rate": 6.345589938473142e-06, + "loss": 0.855, + "step": 15076 + }, + { + "epoch": 0.829820023116297, + "grad_norm": 0.5829552412033081, + "learning_rate": 6.345172458590545e-06, + "loss": 0.6323, + "step": 15077 + }, + { + "epoch": 0.8298750619186527, + "grad_norm": 0.8221700191497803, + "learning_rate": 6.34475496859799e-06, + "loss": 0.7069, + "step": 15078 + }, + { + "epoch": 0.8299301007210083, + "grad_norm": 0.7065801024436951, + "learning_rate": 6.344337468498616e-06, + "loss": 0.692, + "step": 15079 + }, + { + "epoch": 0.829985139523364, + "grad_norm": 0.6199344396591187, + "learning_rate": 6.343919958295564e-06, + "loss": 0.682, + "step": 15080 + }, + { + "epoch": 0.8300401783257196, + "grad_norm": 0.8999378681182861, + "learning_rate": 6.343502437991968e-06, + "loss": 0.7924, + "step": 15081 + }, + { + "epoch": 0.8300952171280753, + "grad_norm": 0.639163076877594, + "learning_rate": 6.343084907590966e-06, + "loss": 0.6976, + "step": 15082 + }, + { + "epoch": 0.8301502559304309, + "grad_norm": 0.8266178965568542, + "learning_rate": 6.3426673670957e-06, + "loss": 0.6831, + "step": 15083 + }, + { + "epoch": 0.8302052947327866, + "grad_norm": 0.6245449781417847, + "learning_rate": 6.3422498165093034e-06, + "loss": 0.6917, + "step": 15084 + }, + { + "epoch": 0.8302603335351423, + "grad_norm": 0.7809823751449585, + "learning_rate": 6.341832255834918e-06, + "loss": 0.8424, + "step": 15085 + }, + { + "epoch": 0.830315372337498, + "grad_norm": 0.6803410053253174, + "learning_rate": 6.34141468507568e-06, + "loss": 0.8345, + "step": 15086 + }, + { + "epoch": 0.8303704111398535, + "grad_norm": 0.7445305585861206, + "learning_rate": 6.340997104234728e-06, + "loss": 0.8823, + "step": 15087 + }, + { + "epoch": 0.8304254499422092, + "grad_norm": 0.6992506384849548, + "learning_rate": 6.340579513315199e-06, + "loss": 0.7857, + "step": 15088 + }, + { + "epoch": 0.8304804887445649, + "grad_norm": 0.7050431966781616, + "learning_rate": 6.340161912320237e-06, + "loss": 0.7988, + "step": 15089 + }, + { + "epoch": 0.8305355275469206, + "grad_norm": 0.8718838095664978, + "learning_rate": 6.339744301252973e-06, + "loss": 0.9983, + "step": 15090 + }, + { + "epoch": 0.8305905663492762, + "grad_norm": 0.7317140698432922, + "learning_rate": 6.339326680116551e-06, + "loss": 0.6852, + "step": 15091 + }, + { + "epoch": 0.8306456051516319, + "grad_norm": 0.6975864768028259, + "learning_rate": 6.338909048914108e-06, + "loss": 0.7334, + "step": 15092 + }, + { + "epoch": 0.8307006439539876, + "grad_norm": 0.6615436673164368, + "learning_rate": 6.3384914076487834e-06, + "loss": 0.776, + "step": 15093 + }, + { + "epoch": 0.8307556827563433, + "grad_norm": 0.773273766040802, + "learning_rate": 6.338073756323717e-06, + "loss": 0.7868, + "step": 15094 + }, + { + "epoch": 0.8308107215586988, + "grad_norm": 0.6686182022094727, + "learning_rate": 6.337656094942045e-06, + "loss": 0.7487, + "step": 15095 + }, + { + "epoch": 0.8308657603610545, + "grad_norm": 0.8202255368232727, + "learning_rate": 6.337238423506909e-06, + "loss": 0.7748, + "step": 15096 + }, + { + "epoch": 0.8309207991634102, + "grad_norm": 0.6356936693191528, + "learning_rate": 6.336820742021445e-06, + "loss": 0.6539, + "step": 15097 + }, + { + "epoch": 0.8309758379657659, + "grad_norm": 0.6543401479721069, + "learning_rate": 6.3364030504887955e-06, + "loss": 0.7185, + "step": 15098 + }, + { + "epoch": 0.8310308767681215, + "grad_norm": 0.6499043107032776, + "learning_rate": 6.335985348912097e-06, + "loss": 0.7254, + "step": 15099 + }, + { + "epoch": 0.8310859155704772, + "grad_norm": 0.6983271241188049, + "learning_rate": 6.335567637294491e-06, + "loss": 0.784, + "step": 15100 + }, + { + "epoch": 0.8311409543728329, + "grad_norm": 0.7932507395744324, + "learning_rate": 6.335149915639117e-06, + "loss": 0.6708, + "step": 15101 + }, + { + "epoch": 0.8311959931751886, + "grad_norm": 0.6792518496513367, + "learning_rate": 6.334732183949112e-06, + "loss": 0.7365, + "step": 15102 + }, + { + "epoch": 0.8312510319775441, + "grad_norm": 0.6852229237556458, + "learning_rate": 6.334314442227618e-06, + "loss": 0.7283, + "step": 15103 + }, + { + "epoch": 0.8313060707798998, + "grad_norm": 0.6528468728065491, + "learning_rate": 6.333896690477774e-06, + "loss": 0.763, + "step": 15104 + }, + { + "epoch": 0.8313611095822555, + "grad_norm": 0.7215067148208618, + "learning_rate": 6.33347892870272e-06, + "loss": 0.7769, + "step": 15105 + }, + { + "epoch": 0.8314161483846112, + "grad_norm": 0.7171593308448792, + "learning_rate": 6.333061156905596e-06, + "loss": 0.6807, + "step": 15106 + }, + { + "epoch": 0.8314711871869668, + "grad_norm": 0.6781407594680786, + "learning_rate": 6.332643375089539e-06, + "loss": 0.6801, + "step": 15107 + }, + { + "epoch": 0.8315262259893225, + "grad_norm": 0.803057849407196, + "learning_rate": 6.332225583257693e-06, + "loss": 0.682, + "step": 15108 + }, + { + "epoch": 0.8315812647916782, + "grad_norm": 0.6467291712760925, + "learning_rate": 6.331807781413195e-06, + "loss": 0.6675, + "step": 15109 + }, + { + "epoch": 0.8316363035940338, + "grad_norm": 0.7285529971122742, + "learning_rate": 6.331389969559186e-06, + "loss": 0.7333, + "step": 15110 + }, + { + "epoch": 0.8316913423963894, + "grad_norm": 0.6569895148277283, + "learning_rate": 6.330972147698806e-06, + "loss": 0.7202, + "step": 15111 + }, + { + "epoch": 0.8317463811987451, + "grad_norm": 0.7848708033561707, + "learning_rate": 6.330554315835198e-06, + "loss": 0.7936, + "step": 15112 + }, + { + "epoch": 0.8318014200011008, + "grad_norm": 0.6699723601341248, + "learning_rate": 6.330136473971498e-06, + "loss": 0.7107, + "step": 15113 + }, + { + "epoch": 0.8318564588034565, + "grad_norm": 0.7443183660507202, + "learning_rate": 6.329718622110848e-06, + "loss": 0.8102, + "step": 15114 + }, + { + "epoch": 0.8319114976058121, + "grad_norm": 0.6073893904685974, + "learning_rate": 6.329300760256389e-06, + "loss": 0.7061, + "step": 15115 + }, + { + "epoch": 0.8319665364081678, + "grad_norm": 0.6192148923873901, + "learning_rate": 6.328882888411262e-06, + "loss": 0.6929, + "step": 15116 + }, + { + "epoch": 0.8320215752105234, + "grad_norm": 0.7347237467765808, + "learning_rate": 6.3284650065786065e-06, + "loss": 0.6705, + "step": 15117 + }, + { + "epoch": 0.8320766140128791, + "grad_norm": 0.6286477446556091, + "learning_rate": 6.328047114761564e-06, + "loss": 0.6494, + "step": 15118 + }, + { + "epoch": 0.8321316528152347, + "grad_norm": 0.6492440104484558, + "learning_rate": 6.327629212963275e-06, + "loss": 0.6618, + "step": 15119 + }, + { + "epoch": 0.8321866916175904, + "grad_norm": 0.6295114755630493, + "learning_rate": 6.3272113011868804e-06, + "loss": 0.786, + "step": 15120 + }, + { + "epoch": 0.8322417304199461, + "grad_norm": 0.6737865805625916, + "learning_rate": 6.3267933794355206e-06, + "loss": 0.7544, + "step": 15121 + }, + { + "epoch": 0.8322967692223017, + "grad_norm": 0.8025132417678833, + "learning_rate": 6.3263754477123374e-06, + "loss": 0.7736, + "step": 15122 + }, + { + "epoch": 0.8323518080246574, + "grad_norm": 0.6820534467697144, + "learning_rate": 6.32595750602047e-06, + "loss": 0.6616, + "step": 15123 + }, + { + "epoch": 0.832406846827013, + "grad_norm": 0.7022573351860046, + "learning_rate": 6.325539554363061e-06, + "loss": 0.8175, + "step": 15124 + }, + { + "epoch": 0.8324618856293687, + "grad_norm": 0.7034926414489746, + "learning_rate": 6.325121592743253e-06, + "loss": 0.7047, + "step": 15125 + }, + { + "epoch": 0.8325169244317243, + "grad_norm": 0.654296875, + "learning_rate": 6.3247036211641856e-06, + "loss": 0.6468, + "step": 15126 + }, + { + "epoch": 0.83257196323408, + "grad_norm": 0.647859513759613, + "learning_rate": 6.324285639628999e-06, + "loss": 0.694, + "step": 15127 + }, + { + "epoch": 0.8326270020364357, + "grad_norm": 1.0824226140975952, + "learning_rate": 6.323867648140837e-06, + "loss": 0.7226, + "step": 15128 + }, + { + "epoch": 0.8326820408387914, + "grad_norm": 0.8568648099899292, + "learning_rate": 6.323449646702839e-06, + "loss": 0.7524, + "step": 15129 + }, + { + "epoch": 0.832737079641147, + "grad_norm": 0.6550299525260925, + "learning_rate": 6.32303163531815e-06, + "loss": 0.7294, + "step": 15130 + }, + { + "epoch": 0.8327921184435026, + "grad_norm": 0.7722175121307373, + "learning_rate": 6.3226136139899075e-06, + "loss": 0.7864, + "step": 15131 + }, + { + "epoch": 0.8328471572458583, + "grad_norm": 0.6542928218841553, + "learning_rate": 6.322195582721256e-06, + "loss": 0.6614, + "step": 15132 + }, + { + "epoch": 0.832902196048214, + "grad_norm": 0.6617493629455566, + "learning_rate": 6.321777541515337e-06, + "loss": 0.7147, + "step": 15133 + }, + { + "epoch": 0.8329572348505696, + "grad_norm": 0.698868989944458, + "learning_rate": 6.321359490375291e-06, + "loss": 0.6894, + "step": 15134 + }, + { + "epoch": 0.8330122736529253, + "grad_norm": 0.8005796074867249, + "learning_rate": 6.3209414293042595e-06, + "loss": 0.7513, + "step": 15135 + }, + { + "epoch": 0.833067312455281, + "grad_norm": 0.7656713128089905, + "learning_rate": 6.320523358305387e-06, + "loss": 0.7387, + "step": 15136 + }, + { + "epoch": 0.8331223512576367, + "grad_norm": 0.7299987077713013, + "learning_rate": 6.320105277381815e-06, + "loss": 0.7868, + "step": 15137 + }, + { + "epoch": 0.8331773900599923, + "grad_norm": 0.782574474811554, + "learning_rate": 6.319687186536685e-06, + "loss": 0.8307, + "step": 15138 + }, + { + "epoch": 0.8332324288623479, + "grad_norm": 0.6786854863166809, + "learning_rate": 6.319269085773138e-06, + "loss": 0.7819, + "step": 15139 + }, + { + "epoch": 0.8332874676647036, + "grad_norm": 1.173049807548523, + "learning_rate": 6.318850975094318e-06, + "loss": 0.7623, + "step": 15140 + }, + { + "epoch": 0.8333425064670593, + "grad_norm": 0.8410226106643677, + "learning_rate": 6.318432854503368e-06, + "loss": 0.812, + "step": 15141 + }, + { + "epoch": 0.8333975452694149, + "grad_norm": 0.8525705337524414, + "learning_rate": 6.3180147240034304e-06, + "loss": 0.7585, + "step": 15142 + }, + { + "epoch": 0.8334525840717706, + "grad_norm": 0.6345195770263672, + "learning_rate": 6.317596583597645e-06, + "loss": 0.7446, + "step": 15143 + }, + { + "epoch": 0.8335076228741263, + "grad_norm": 0.7238603234291077, + "learning_rate": 6.317178433289157e-06, + "loss": 0.7461, + "step": 15144 + }, + { + "epoch": 0.833562661676482, + "grad_norm": 0.6187044382095337, + "learning_rate": 6.31676027308111e-06, + "loss": 0.7195, + "step": 15145 + }, + { + "epoch": 0.8336177004788375, + "grad_norm": 0.6813417077064514, + "learning_rate": 6.316342102976644e-06, + "loss": 0.772, + "step": 15146 + }, + { + "epoch": 0.8336727392811932, + "grad_norm": 0.665515124797821, + "learning_rate": 6.315923922978902e-06, + "loss": 0.7127, + "step": 15147 + }, + { + "epoch": 0.8337277780835489, + "grad_norm": 0.8104628920555115, + "learning_rate": 6.315505733091028e-06, + "loss": 0.7332, + "step": 15148 + }, + { + "epoch": 0.8337828168859046, + "grad_norm": 0.8447679281234741, + "learning_rate": 6.315087533316166e-06, + "loss": 0.6803, + "step": 15149 + }, + { + "epoch": 0.8338378556882602, + "grad_norm": 0.7588180303573608, + "learning_rate": 6.31466932365746e-06, + "loss": 0.8301, + "step": 15150 + }, + { + "epoch": 0.8338928944906159, + "grad_norm": 0.7697302103042603, + "learning_rate": 6.314251104118048e-06, + "loss": 0.7777, + "step": 15151 + }, + { + "epoch": 0.8339479332929716, + "grad_norm": 0.8361233472824097, + "learning_rate": 6.313832874701078e-06, + "loss": 0.7585, + "step": 15152 + }, + { + "epoch": 0.8340029720953273, + "grad_norm": 0.6954757571220398, + "learning_rate": 6.313414635409692e-06, + "loss": 0.759, + "step": 15153 + }, + { + "epoch": 0.8340580108976828, + "grad_norm": 0.72389155626297, + "learning_rate": 6.312996386247034e-06, + "loss": 0.6679, + "step": 15154 + }, + { + "epoch": 0.8341130497000385, + "grad_norm": 0.781382143497467, + "learning_rate": 6.312578127216245e-06, + "loss": 0.769, + "step": 15155 + }, + { + "epoch": 0.8341680885023942, + "grad_norm": 0.7186244130134583, + "learning_rate": 6.312159858320472e-06, + "loss": 0.7476, + "step": 15156 + }, + { + "epoch": 0.8342231273047499, + "grad_norm": 0.6909130215644836, + "learning_rate": 6.311741579562855e-06, + "loss": 0.749, + "step": 15157 + }, + { + "epoch": 0.8342781661071055, + "grad_norm": 0.7692446708679199, + "learning_rate": 6.31132329094654e-06, + "loss": 0.7141, + "step": 15158 + }, + { + "epoch": 0.8343332049094612, + "grad_norm": 0.6753776669502258, + "learning_rate": 6.310904992474669e-06, + "loss": 0.7259, + "step": 15159 + }, + { + "epoch": 0.8343882437118169, + "grad_norm": 0.7118550539016724, + "learning_rate": 6.3104866841503885e-06, + "loss": 0.8282, + "step": 15160 + }, + { + "epoch": 0.8344432825141725, + "grad_norm": 0.6651625037193298, + "learning_rate": 6.31006836597684e-06, + "loss": 0.7639, + "step": 15161 + }, + { + "epoch": 0.8344983213165281, + "grad_norm": 0.6745681762695312, + "learning_rate": 6.30965003795717e-06, + "loss": 0.5922, + "step": 15162 + }, + { + "epoch": 0.8345533601188838, + "grad_norm": 0.7344138622283936, + "learning_rate": 6.309231700094518e-06, + "loss": 0.7134, + "step": 15163 + }, + { + "epoch": 0.8346083989212395, + "grad_norm": 0.7628228664398193, + "learning_rate": 6.308813352392034e-06, + "loss": 0.7341, + "step": 15164 + }, + { + "epoch": 0.8346634377235951, + "grad_norm": 0.6599448919296265, + "learning_rate": 6.308394994852858e-06, + "loss": 0.6821, + "step": 15165 + }, + { + "epoch": 0.8347184765259508, + "grad_norm": 0.9132193922996521, + "learning_rate": 6.307976627480136e-06, + "loss": 0.7862, + "step": 15166 + }, + { + "epoch": 0.8347735153283065, + "grad_norm": 0.752200722694397, + "learning_rate": 6.307558250277011e-06, + "loss": 0.7942, + "step": 15167 + }, + { + "epoch": 0.8348285541306621, + "grad_norm": 0.6848111748695374, + "learning_rate": 6.307139863246628e-06, + "loss": 0.8161, + "step": 15168 + }, + { + "epoch": 0.8348835929330177, + "grad_norm": 0.7229306697845459, + "learning_rate": 6.306721466392132e-06, + "loss": 0.684, + "step": 15169 + }, + { + "epoch": 0.8349386317353734, + "grad_norm": 0.7294610142707825, + "learning_rate": 6.306303059716667e-06, + "loss": 0.7046, + "step": 15170 + }, + { + "epoch": 0.8349936705377291, + "grad_norm": 0.7153074741363525, + "learning_rate": 6.305884643223378e-06, + "loss": 0.7613, + "step": 15171 + }, + { + "epoch": 0.8350487093400848, + "grad_norm": 0.6200907826423645, + "learning_rate": 6.30546621691541e-06, + "loss": 0.642, + "step": 15172 + }, + { + "epoch": 0.8351037481424404, + "grad_norm": 0.6640743017196655, + "learning_rate": 6.305047780795907e-06, + "loss": 0.7201, + "step": 15173 + }, + { + "epoch": 0.8351587869447961, + "grad_norm": 0.6427313089370728, + "learning_rate": 6.3046293348680144e-06, + "loss": 0.764, + "step": 15174 + }, + { + "epoch": 0.8352138257471518, + "grad_norm": 0.6475403308868408, + "learning_rate": 6.3042108791348755e-06, + "loss": 0.6678, + "step": 15175 + }, + { + "epoch": 0.8352688645495074, + "grad_norm": 0.6376405358314514, + "learning_rate": 6.303792413599638e-06, + "loss": 0.6972, + "step": 15176 + }, + { + "epoch": 0.835323903351863, + "grad_norm": 0.6648433804512024, + "learning_rate": 6.303373938265447e-06, + "loss": 0.6531, + "step": 15177 + }, + { + "epoch": 0.8353789421542187, + "grad_norm": 0.6582038402557373, + "learning_rate": 6.302955453135446e-06, + "loss": 0.7703, + "step": 15178 + }, + { + "epoch": 0.8354339809565744, + "grad_norm": 0.6386045217514038, + "learning_rate": 6.30253695821278e-06, + "loss": 0.6821, + "step": 15179 + }, + { + "epoch": 0.8354890197589301, + "grad_norm": 0.7268567681312561, + "learning_rate": 6.302118453500594e-06, + "loss": 0.7434, + "step": 15180 + }, + { + "epoch": 0.8355440585612857, + "grad_norm": 0.8008975982666016, + "learning_rate": 6.301699939002035e-06, + "loss": 0.8537, + "step": 15181 + }, + { + "epoch": 0.8355990973636414, + "grad_norm": 0.6803351044654846, + "learning_rate": 6.301281414720247e-06, + "loss": 0.6741, + "step": 15182 + }, + { + "epoch": 0.835654136165997, + "grad_norm": 0.6567045450210571, + "learning_rate": 6.3008628806583785e-06, + "loss": 0.7033, + "step": 15183 + }, + { + "epoch": 0.8357091749683527, + "grad_norm": 0.7088850140571594, + "learning_rate": 6.3004443368195685e-06, + "loss": 0.699, + "step": 15184 + }, + { + "epoch": 0.8357642137707083, + "grad_norm": 0.664929986000061, + "learning_rate": 6.3000257832069715e-06, + "loss": 0.6875, + "step": 15185 + }, + { + "epoch": 0.835819252573064, + "grad_norm": 0.7132309079170227, + "learning_rate": 6.299607219823727e-06, + "loss": 0.8172, + "step": 15186 + }, + { + "epoch": 0.8358742913754197, + "grad_norm": 0.7312454581260681, + "learning_rate": 6.2991886466729815e-06, + "loss": 0.7277, + "step": 15187 + }, + { + "epoch": 0.8359293301777754, + "grad_norm": 0.6576625108718872, + "learning_rate": 6.298770063757882e-06, + "loss": 0.7134, + "step": 15188 + }, + { + "epoch": 0.835984368980131, + "grad_norm": 0.6840282678604126, + "learning_rate": 6.2983514710815756e-06, + "loss": 0.777, + "step": 15189 + }, + { + "epoch": 0.8360394077824866, + "grad_norm": 0.7194011211395264, + "learning_rate": 6.297932868647207e-06, + "loss": 0.783, + "step": 15190 + }, + { + "epoch": 0.8360944465848423, + "grad_norm": 0.6619371175765991, + "learning_rate": 6.297514256457922e-06, + "loss": 0.7809, + "step": 15191 + }, + { + "epoch": 0.836149485387198, + "grad_norm": 0.8256712555885315, + "learning_rate": 6.2970956345168666e-06, + "loss": 0.9086, + "step": 15192 + }, + { + "epoch": 0.8362045241895536, + "grad_norm": 0.6951783299446106, + "learning_rate": 6.296677002827188e-06, + "loss": 0.7489, + "step": 15193 + }, + { + "epoch": 0.8362595629919093, + "grad_norm": 0.8535193204879761, + "learning_rate": 6.296258361392033e-06, + "loss": 0.7744, + "step": 15194 + }, + { + "epoch": 0.836314601794265, + "grad_norm": 0.7569966912269592, + "learning_rate": 6.295839710214546e-06, + "loss": 0.7091, + "step": 15195 + }, + { + "epoch": 0.8363696405966207, + "grad_norm": 0.6435930728912354, + "learning_rate": 6.295421049297875e-06, + "loss": 0.6601, + "step": 15196 + }, + { + "epoch": 0.8364246793989762, + "grad_norm": 0.811500608921051, + "learning_rate": 6.295002378645166e-06, + "loss": 0.7304, + "step": 15197 + }, + { + "epoch": 0.8364797182013319, + "grad_norm": 0.7306826114654541, + "learning_rate": 6.294583698259566e-06, + "loss": 0.8471, + "step": 15198 + }, + { + "epoch": 0.8365347570036876, + "grad_norm": 0.6411521434783936, + "learning_rate": 6.294165008144222e-06, + "loss": 0.6572, + "step": 15199 + }, + { + "epoch": 0.8365897958060433, + "grad_norm": 0.6460714340209961, + "learning_rate": 6.293746308302278e-06, + "loss": 0.7514, + "step": 15200 + }, + { + "epoch": 0.8366448346083989, + "grad_norm": 0.9355582594871521, + "learning_rate": 6.2933275987368855e-06, + "loss": 0.8171, + "step": 15201 + }, + { + "epoch": 0.8366998734107546, + "grad_norm": 0.6221946477890015, + "learning_rate": 6.292908879451189e-06, + "loss": 0.7323, + "step": 15202 + }, + { + "epoch": 0.8367549122131103, + "grad_norm": 0.6820993423461914, + "learning_rate": 6.292490150448335e-06, + "loss": 0.8168, + "step": 15203 + }, + { + "epoch": 0.836809951015466, + "grad_norm": 0.6494680643081665, + "learning_rate": 6.29207141173147e-06, + "loss": 0.7926, + "step": 15204 + }, + { + "epoch": 0.8368649898178215, + "grad_norm": 0.7658956050872803, + "learning_rate": 6.291652663303744e-06, + "loss": 0.7304, + "step": 15205 + }, + { + "epoch": 0.8369200286201772, + "grad_norm": 0.6653497219085693, + "learning_rate": 6.2912339051683e-06, + "loss": 0.7284, + "step": 15206 + }, + { + "epoch": 0.8369750674225329, + "grad_norm": 0.6136276721954346, + "learning_rate": 6.290815137328289e-06, + "loss": 0.7313, + "step": 15207 + }, + { + "epoch": 0.8370301062248885, + "grad_norm": 0.7542527914047241, + "learning_rate": 6.2903963597868555e-06, + "loss": 0.7806, + "step": 15208 + }, + { + "epoch": 0.8370851450272442, + "grad_norm": 0.6994839906692505, + "learning_rate": 6.2899775725471505e-06, + "loss": 0.8132, + "step": 15209 + }, + { + "epoch": 0.8371401838295999, + "grad_norm": 0.6558997631072998, + "learning_rate": 6.289558775612319e-06, + "loss": 0.7188, + "step": 15210 + }, + { + "epoch": 0.8371952226319556, + "grad_norm": 0.7155564427375793, + "learning_rate": 6.289139968985507e-06, + "loss": 0.6584, + "step": 15211 + }, + { + "epoch": 0.8372502614343111, + "grad_norm": 0.7645565867424011, + "learning_rate": 6.288721152669865e-06, + "loss": 0.761, + "step": 15212 + }, + { + "epoch": 0.8373053002366668, + "grad_norm": 0.6507940292358398, + "learning_rate": 6.288302326668542e-06, + "loss": 0.7139, + "step": 15213 + }, + { + "epoch": 0.8373603390390225, + "grad_norm": 0.7598558664321899, + "learning_rate": 6.287883490984682e-06, + "loss": 0.7627, + "step": 15214 + }, + { + "epoch": 0.8374153778413782, + "grad_norm": 0.6542350649833679, + "learning_rate": 6.287464645621434e-06, + "loss": 0.7508, + "step": 15215 + }, + { + "epoch": 0.8374704166437338, + "grad_norm": 0.7530503869056702, + "learning_rate": 6.287045790581946e-06, + "loss": 0.8234, + "step": 15216 + }, + { + "epoch": 0.8375254554460895, + "grad_norm": 0.9945759773254395, + "learning_rate": 6.286626925869367e-06, + "loss": 0.7637, + "step": 15217 + }, + { + "epoch": 0.8375804942484452, + "grad_norm": 0.6644982695579529, + "learning_rate": 6.286208051486844e-06, + "loss": 0.7671, + "step": 15218 + }, + { + "epoch": 0.8376355330508009, + "grad_norm": 0.8195061683654785, + "learning_rate": 6.285789167437526e-06, + "loss": 0.662, + "step": 15219 + }, + { + "epoch": 0.8376905718531564, + "grad_norm": 0.6578626036643982, + "learning_rate": 6.2853702737245605e-06, + "loss": 0.7681, + "step": 15220 + }, + { + "epoch": 0.8377456106555121, + "grad_norm": 0.6632179021835327, + "learning_rate": 6.2849513703510955e-06, + "loss": 0.759, + "step": 15221 + }, + { + "epoch": 0.8378006494578678, + "grad_norm": 0.6822313070297241, + "learning_rate": 6.284532457320282e-06, + "loss": 0.7859, + "step": 15222 + }, + { + "epoch": 0.8378556882602235, + "grad_norm": 0.6448203921318054, + "learning_rate": 6.284113534635265e-06, + "loss": 0.7224, + "step": 15223 + }, + { + "epoch": 0.8379107270625791, + "grad_norm": 0.6147580146789551, + "learning_rate": 6.2836946022991926e-06, + "loss": 0.7389, + "step": 15224 + }, + { + "epoch": 0.8379657658649348, + "grad_norm": 0.7476562857627869, + "learning_rate": 6.283275660315219e-06, + "loss": 0.7535, + "step": 15225 + }, + { + "epoch": 0.8380208046672905, + "grad_norm": 0.7396713495254517, + "learning_rate": 6.282856708686488e-06, + "loss": 0.7621, + "step": 15226 + }, + { + "epoch": 0.8380758434696461, + "grad_norm": 0.7220024466514587, + "learning_rate": 6.282437747416148e-06, + "loss": 0.672, + "step": 15227 + }, + { + "epoch": 0.8381308822720017, + "grad_norm": 0.9414284229278564, + "learning_rate": 6.2820187765073495e-06, + "loss": 0.8791, + "step": 15228 + }, + { + "epoch": 0.8381859210743574, + "grad_norm": 0.6074691414833069, + "learning_rate": 6.281599795963241e-06, + "loss": 0.6771, + "step": 15229 + }, + { + "epoch": 0.8382409598767131, + "grad_norm": 0.7367346286773682, + "learning_rate": 6.281180805786973e-06, + "loss": 0.7869, + "step": 15230 + }, + { + "epoch": 0.8382959986790688, + "grad_norm": 0.711016833782196, + "learning_rate": 6.280761805981691e-06, + "loss": 0.7166, + "step": 15231 + }, + { + "epoch": 0.8383510374814244, + "grad_norm": 0.6464707255363464, + "learning_rate": 6.280342796550546e-06, + "loss": 0.6965, + "step": 15232 + }, + { + "epoch": 0.83840607628378, + "grad_norm": 0.7385185956954956, + "learning_rate": 6.279923777496688e-06, + "loss": 0.7031, + "step": 15233 + }, + { + "epoch": 0.8384611150861357, + "grad_norm": 0.6799347996711731, + "learning_rate": 6.2795047488232665e-06, + "loss": 0.6777, + "step": 15234 + }, + { + "epoch": 0.8385161538884914, + "grad_norm": 0.690740168094635, + "learning_rate": 6.279085710533429e-06, + "loss": 0.7675, + "step": 15235 + }, + { + "epoch": 0.838571192690847, + "grad_norm": 0.9359111189842224, + "learning_rate": 6.278666662630325e-06, + "loss": 0.7063, + "step": 15236 + }, + { + "epoch": 0.8386262314932027, + "grad_norm": 0.751430094242096, + "learning_rate": 6.2782476051171075e-06, + "loss": 0.7851, + "step": 15237 + }, + { + "epoch": 0.8386812702955584, + "grad_norm": 0.6865997314453125, + "learning_rate": 6.27782853799692e-06, + "loss": 0.7347, + "step": 15238 + }, + { + "epoch": 0.8387363090979141, + "grad_norm": 0.6713284850120544, + "learning_rate": 6.277409461272916e-06, + "loss": 0.7651, + "step": 15239 + }, + { + "epoch": 0.8387913479002697, + "grad_norm": 0.7481899857521057, + "learning_rate": 6.276990374948244e-06, + "loss": 0.7681, + "step": 15240 + }, + { + "epoch": 0.8388463867026253, + "grad_norm": 0.7126002311706543, + "learning_rate": 6.2765712790260554e-06, + "loss": 0.7772, + "step": 15241 + }, + { + "epoch": 0.838901425504981, + "grad_norm": 0.6616978645324707, + "learning_rate": 6.276152173509497e-06, + "loss": 0.7028, + "step": 15242 + }, + { + "epoch": 0.8389564643073367, + "grad_norm": 0.9032973051071167, + "learning_rate": 6.2757330584017225e-06, + "loss": 0.7646, + "step": 15243 + }, + { + "epoch": 0.8390115031096923, + "grad_norm": 0.6345590353012085, + "learning_rate": 6.275313933705879e-06, + "loss": 0.6692, + "step": 15244 + }, + { + "epoch": 0.839066541912048, + "grad_norm": 0.6989019513130188, + "learning_rate": 6.2748947994251175e-06, + "loss": 0.6916, + "step": 15245 + }, + { + "epoch": 0.8391215807144037, + "grad_norm": 0.7115045189857483, + "learning_rate": 6.2744756555625875e-06, + "loss": 0.6923, + "step": 15246 + }, + { + "epoch": 0.8391766195167594, + "grad_norm": 0.6989235281944275, + "learning_rate": 6.2740565021214406e-06, + "loss": 0.7057, + "step": 15247 + }, + { + "epoch": 0.839231658319115, + "grad_norm": 0.684779942035675, + "learning_rate": 6.273637339104824e-06, + "loss": 0.7777, + "step": 15248 + }, + { + "epoch": 0.8392866971214706, + "grad_norm": 0.6341322064399719, + "learning_rate": 6.2732181665158934e-06, + "loss": 0.7335, + "step": 15249 + }, + { + "epoch": 0.8393417359238263, + "grad_norm": 0.7232723832130432, + "learning_rate": 6.272798984357793e-06, + "loss": 0.8055, + "step": 15250 + }, + { + "epoch": 0.8393967747261819, + "grad_norm": 0.9725174307823181, + "learning_rate": 6.272379792633678e-06, + "loss": 0.6221, + "step": 15251 + }, + { + "epoch": 0.8394518135285376, + "grad_norm": 0.6602086424827576, + "learning_rate": 6.271960591346695e-06, + "loss": 0.8023, + "step": 15252 + }, + { + "epoch": 0.8395068523308933, + "grad_norm": 0.7092040777206421, + "learning_rate": 6.271541380499998e-06, + "loss": 0.8135, + "step": 15253 + }, + { + "epoch": 0.839561891133249, + "grad_norm": 0.5656731724739075, + "learning_rate": 6.271122160096736e-06, + "loss": 0.647, + "step": 15254 + }, + { + "epoch": 0.8396169299356046, + "grad_norm": 1.1831625699996948, + "learning_rate": 6.270702930140061e-06, + "loss": 0.8513, + "step": 15255 + }, + { + "epoch": 0.8396719687379602, + "grad_norm": 0.6398816704750061, + "learning_rate": 6.270283690633121e-06, + "loss": 0.6988, + "step": 15256 + }, + { + "epoch": 0.8397270075403159, + "grad_norm": 0.6856167316436768, + "learning_rate": 6.26986444157907e-06, + "loss": 0.7789, + "step": 15257 + }, + { + "epoch": 0.8397820463426716, + "grad_norm": 0.7355605363845825, + "learning_rate": 6.269445182981058e-06, + "loss": 0.6652, + "step": 15258 + }, + { + "epoch": 0.8398370851450272, + "grad_norm": 0.6691173315048218, + "learning_rate": 6.2690259148422364e-06, + "loss": 0.6807, + "step": 15259 + }, + { + "epoch": 0.8398921239473829, + "grad_norm": 0.6596276164054871, + "learning_rate": 6.268606637165754e-06, + "loss": 0.6947, + "step": 15260 + }, + { + "epoch": 0.8399471627497386, + "grad_norm": 0.7198327779769897, + "learning_rate": 6.268187349954766e-06, + "loss": 0.7981, + "step": 15261 + }, + { + "epoch": 0.8400022015520943, + "grad_norm": 0.7006517648696899, + "learning_rate": 6.267768053212419e-06, + "loss": 0.7756, + "step": 15262 + }, + { + "epoch": 0.8400572403544498, + "grad_norm": 0.769062340259552, + "learning_rate": 6.267348746941869e-06, + "loss": 0.8433, + "step": 15263 + }, + { + "epoch": 0.8401122791568055, + "grad_norm": 0.6317951679229736, + "learning_rate": 6.266929431146263e-06, + "loss": 0.6575, + "step": 15264 + }, + { + "epoch": 0.8401673179591612, + "grad_norm": 0.7127153873443604, + "learning_rate": 6.2665101058287554e-06, + "loss": 0.7745, + "step": 15265 + }, + { + "epoch": 0.8402223567615169, + "grad_norm": 0.6909182667732239, + "learning_rate": 6.266090770992497e-06, + "loss": 0.7567, + "step": 15266 + }, + { + "epoch": 0.8402773955638725, + "grad_norm": 0.7875083684921265, + "learning_rate": 6.2656714266406384e-06, + "loss": 0.7392, + "step": 15267 + }, + { + "epoch": 0.8403324343662282, + "grad_norm": 0.7068803906440735, + "learning_rate": 6.2652520727763326e-06, + "loss": 0.6723, + "step": 15268 + }, + { + "epoch": 0.8403874731685839, + "grad_norm": 0.6994038820266724, + "learning_rate": 6.264832709402731e-06, + "loss": 0.6989, + "step": 15269 + }, + { + "epoch": 0.8404425119709396, + "grad_norm": 0.714044988155365, + "learning_rate": 6.264413336522985e-06, + "loss": 0.7464, + "step": 15270 + }, + { + "epoch": 0.8404975507732951, + "grad_norm": 0.8202210068702698, + "learning_rate": 6.263993954140249e-06, + "loss": 0.7174, + "step": 15271 + }, + { + "epoch": 0.8405525895756508, + "grad_norm": 0.6762316823005676, + "learning_rate": 6.2635745622576694e-06, + "loss": 0.7416, + "step": 15272 + }, + { + "epoch": 0.8406076283780065, + "grad_norm": 0.7461959719657898, + "learning_rate": 6.263155160878405e-06, + "loss": 0.7835, + "step": 15273 + }, + { + "epoch": 0.8406626671803622, + "grad_norm": 0.6263054609298706, + "learning_rate": 6.262735750005602e-06, + "loss": 0.7034, + "step": 15274 + }, + { + "epoch": 0.8407177059827178, + "grad_norm": 0.7489733695983887, + "learning_rate": 6.2623163296424165e-06, + "loss": 0.7387, + "step": 15275 + }, + { + "epoch": 0.8407727447850735, + "grad_norm": 0.7841430306434631, + "learning_rate": 6.261896899791997e-06, + "loss": 0.8487, + "step": 15276 + }, + { + "epoch": 0.8408277835874292, + "grad_norm": 0.8390078544616699, + "learning_rate": 6.2614774604575e-06, + "loss": 0.8335, + "step": 15277 + }, + { + "epoch": 0.8408828223897848, + "grad_norm": 0.9100946187973022, + "learning_rate": 6.261058011642076e-06, + "loss": 0.6196, + "step": 15278 + }, + { + "epoch": 0.8409378611921404, + "grad_norm": 0.7001772522926331, + "learning_rate": 6.260638553348879e-06, + "loss": 0.6935, + "step": 15279 + }, + { + "epoch": 0.8409928999944961, + "grad_norm": 0.7877102494239807, + "learning_rate": 6.260219085581057e-06, + "loss": 0.7378, + "step": 15280 + }, + { + "epoch": 0.8410479387968518, + "grad_norm": 0.687240719795227, + "learning_rate": 6.259799608341768e-06, + "loss": 0.7224, + "step": 15281 + }, + { + "epoch": 0.8411029775992075, + "grad_norm": 0.7766143083572388, + "learning_rate": 6.2593801216341625e-06, + "loss": 0.7157, + "step": 15282 + }, + { + "epoch": 0.8411580164015631, + "grad_norm": 1.1593633890151978, + "learning_rate": 6.258960625461391e-06, + "loss": 0.8555, + "step": 15283 + }, + { + "epoch": 0.8412130552039188, + "grad_norm": 0.6179451942443848, + "learning_rate": 6.2585411198266085e-06, + "loss": 0.6715, + "step": 15284 + }, + { + "epoch": 0.8412680940062744, + "grad_norm": 0.6755460500717163, + "learning_rate": 6.258121604732971e-06, + "loss": 0.7475, + "step": 15285 + }, + { + "epoch": 0.8413231328086301, + "grad_norm": 0.6775393486022949, + "learning_rate": 6.257702080183627e-06, + "loss": 0.6594, + "step": 15286 + }, + { + "epoch": 0.8413781716109857, + "grad_norm": 0.6972197890281677, + "learning_rate": 6.25728254618173e-06, + "loss": 0.7865, + "step": 15287 + }, + { + "epoch": 0.8414332104133414, + "grad_norm": 0.6446948051452637, + "learning_rate": 6.256863002730433e-06, + "loss": 0.6874, + "step": 15288 + }, + { + "epoch": 0.8414882492156971, + "grad_norm": 0.7012035846710205, + "learning_rate": 6.256443449832892e-06, + "loss": 0.7465, + "step": 15289 + }, + { + "epoch": 0.8415432880180528, + "grad_norm": 0.698693573474884, + "learning_rate": 6.256023887492257e-06, + "loss": 0.8206, + "step": 15290 + }, + { + "epoch": 0.8415983268204084, + "grad_norm": 0.7083185315132141, + "learning_rate": 6.255604315711684e-06, + "loss": 0.8306, + "step": 15291 + }, + { + "epoch": 0.841653365622764, + "grad_norm": 0.6605321764945984, + "learning_rate": 6.255184734494324e-06, + "loss": 0.6742, + "step": 15292 + }, + { + "epoch": 0.8417084044251197, + "grad_norm": 0.681881844997406, + "learning_rate": 6.254765143843331e-06, + "loss": 0.7009, + "step": 15293 + }, + { + "epoch": 0.8417634432274753, + "grad_norm": 0.6995699405670166, + "learning_rate": 6.2543455437618605e-06, + "loss": 0.8069, + "step": 15294 + }, + { + "epoch": 0.841818482029831, + "grad_norm": 0.7004442811012268, + "learning_rate": 6.2539259342530644e-06, + "loss": 0.71, + "step": 15295 + }, + { + "epoch": 0.8418735208321867, + "grad_norm": 0.7816279530525208, + "learning_rate": 6.253506315320097e-06, + "loss": 0.7833, + "step": 15296 + }, + { + "epoch": 0.8419285596345424, + "grad_norm": 0.6875490546226501, + "learning_rate": 6.25308668696611e-06, + "loss": 0.7223, + "step": 15297 + }, + { + "epoch": 0.841983598436898, + "grad_norm": 0.7126815915107727, + "learning_rate": 6.252667049194261e-06, + "loss": 0.7934, + "step": 15298 + }, + { + "epoch": 0.8420386372392537, + "grad_norm": 0.8048780560493469, + "learning_rate": 6.252247402007701e-06, + "loss": 0.7775, + "step": 15299 + }, + { + "epoch": 0.8420936760416093, + "grad_norm": 0.6681318879127502, + "learning_rate": 6.251827745409583e-06, + "loss": 0.6516, + "step": 15300 + }, + { + "epoch": 0.842148714843965, + "grad_norm": 0.6467457413673401, + "learning_rate": 6.251408079403064e-06, + "loss": 0.7417, + "step": 15301 + }, + { + "epoch": 0.8422037536463206, + "grad_norm": 0.6815666556358337, + "learning_rate": 6.250988403991297e-06, + "loss": 0.7498, + "step": 15302 + }, + { + "epoch": 0.8422587924486763, + "grad_norm": 0.6596205234527588, + "learning_rate": 6.250568719177437e-06, + "loss": 0.762, + "step": 15303 + }, + { + "epoch": 0.842313831251032, + "grad_norm": 0.7564731240272522, + "learning_rate": 6.250149024964635e-06, + "loss": 0.7592, + "step": 15304 + }, + { + "epoch": 0.8423688700533877, + "grad_norm": 0.6755058169364929, + "learning_rate": 6.249729321356048e-06, + "loss": 0.6953, + "step": 15305 + }, + { + "epoch": 0.8424239088557433, + "grad_norm": 0.7423762083053589, + "learning_rate": 6.249309608354832e-06, + "loss": 0.7018, + "step": 15306 + }, + { + "epoch": 0.8424789476580989, + "grad_norm": 0.727678120136261, + "learning_rate": 6.248889885964138e-06, + "loss": 0.8159, + "step": 15307 + }, + { + "epoch": 0.8425339864604546, + "grad_norm": 1.0823713541030884, + "learning_rate": 6.248470154187123e-06, + "loss": 0.872, + "step": 15308 + }, + { + "epoch": 0.8425890252628103, + "grad_norm": 0.6428259015083313, + "learning_rate": 6.248050413026939e-06, + "loss": 0.683, + "step": 15309 + }, + { + "epoch": 0.8426440640651659, + "grad_norm": 0.6622119545936584, + "learning_rate": 6.247630662486743e-06, + "loss": 0.7891, + "step": 15310 + }, + { + "epoch": 0.8426991028675216, + "grad_norm": 1.2377631664276123, + "learning_rate": 6.247210902569689e-06, + "loss": 0.7675, + "step": 15311 + }, + { + "epoch": 0.8427541416698773, + "grad_norm": 0.7909934520721436, + "learning_rate": 6.246791133278931e-06, + "loss": 0.8688, + "step": 15312 + }, + { + "epoch": 0.842809180472233, + "grad_norm": 0.6541300415992737, + "learning_rate": 6.246371354617625e-06, + "loss": 0.6754, + "step": 15313 + }, + { + "epoch": 0.8428642192745885, + "grad_norm": 0.6664960384368896, + "learning_rate": 6.245951566588926e-06, + "loss": 0.6666, + "step": 15314 + }, + { + "epoch": 0.8429192580769442, + "grad_norm": 0.7288552522659302, + "learning_rate": 6.245531769195988e-06, + "loss": 0.8179, + "step": 15315 + }, + { + "epoch": 0.8429742968792999, + "grad_norm": 0.7044054865837097, + "learning_rate": 6.245111962441966e-06, + "loss": 0.7306, + "step": 15316 + }, + { + "epoch": 0.8430293356816556, + "grad_norm": 0.6108603477478027, + "learning_rate": 6.244692146330016e-06, + "loss": 0.6213, + "step": 15317 + }, + { + "epoch": 0.8430843744840112, + "grad_norm": 0.6381129622459412, + "learning_rate": 6.2442723208632935e-06, + "loss": 0.7709, + "step": 15318 + }, + { + "epoch": 0.8431394132863669, + "grad_norm": 0.7355496883392334, + "learning_rate": 6.243852486044955e-06, + "loss": 0.665, + "step": 15319 + }, + { + "epoch": 0.8431944520887226, + "grad_norm": 0.7450826168060303, + "learning_rate": 6.2434326418781525e-06, + "loss": 0.7551, + "step": 15320 + }, + { + "epoch": 0.8432494908910783, + "grad_norm": 0.6463751792907715, + "learning_rate": 6.243012788366043e-06, + "loss": 0.7956, + "step": 15321 + }, + { + "epoch": 0.8433045296934338, + "grad_norm": 0.6673271059989929, + "learning_rate": 6.242592925511782e-06, + "loss": 0.7148, + "step": 15322 + }, + { + "epoch": 0.8433595684957895, + "grad_norm": 0.7663269639015198, + "learning_rate": 6.242173053318526e-06, + "loss": 0.8594, + "step": 15323 + }, + { + "epoch": 0.8434146072981452, + "grad_norm": 0.8503594994544983, + "learning_rate": 6.2417531717894285e-06, + "loss": 0.7594, + "step": 15324 + }, + { + "epoch": 0.8434696461005009, + "grad_norm": 0.6903344988822937, + "learning_rate": 6.241333280927647e-06, + "loss": 0.7252, + "step": 15325 + }, + { + "epoch": 0.8435246849028565, + "grad_norm": 0.6472830772399902, + "learning_rate": 6.240913380736337e-06, + "loss": 0.7379, + "step": 15326 + }, + { + "epoch": 0.8435797237052122, + "grad_norm": 0.6442959308624268, + "learning_rate": 6.240493471218655e-06, + "loss": 0.7447, + "step": 15327 + }, + { + "epoch": 0.8436347625075679, + "grad_norm": 0.6387843489646912, + "learning_rate": 6.240073552377756e-06, + "loss": 0.7659, + "step": 15328 + }, + { + "epoch": 0.8436898013099235, + "grad_norm": 0.7017341256141663, + "learning_rate": 6.239653624216794e-06, + "loss": 0.6934, + "step": 15329 + }, + { + "epoch": 0.8437448401122791, + "grad_norm": 0.6204355359077454, + "learning_rate": 6.2392336867389294e-06, + "loss": 0.6553, + "step": 15330 + }, + { + "epoch": 0.8437998789146348, + "grad_norm": 0.6765483021736145, + "learning_rate": 6.238813739947315e-06, + "loss": 0.7492, + "step": 15331 + }, + { + "epoch": 0.8438549177169905, + "grad_norm": 0.7261079549789429, + "learning_rate": 6.238393783845109e-06, + "loss": 0.7373, + "step": 15332 + }, + { + "epoch": 0.8439099565193462, + "grad_norm": 0.7019803524017334, + "learning_rate": 6.237973818435466e-06, + "loss": 0.7742, + "step": 15333 + }, + { + "epoch": 0.8439649953217018, + "grad_norm": 0.7521516680717468, + "learning_rate": 6.237553843721545e-06, + "loss": 0.8808, + "step": 15334 + }, + { + "epoch": 0.8440200341240575, + "grad_norm": 0.6796375513076782, + "learning_rate": 6.237133859706499e-06, + "loss": 0.7759, + "step": 15335 + }, + { + "epoch": 0.8440750729264132, + "grad_norm": 0.6199387311935425, + "learning_rate": 6.236713866393487e-06, + "loss": 0.6203, + "step": 15336 + }, + { + "epoch": 0.8441301117287687, + "grad_norm": 0.6968052983283997, + "learning_rate": 6.236293863785663e-06, + "loss": 0.7645, + "step": 15337 + }, + { + "epoch": 0.8441851505311244, + "grad_norm": 0.757556676864624, + "learning_rate": 6.235873851886186e-06, + "loss": 0.8005, + "step": 15338 + }, + { + "epoch": 0.8442401893334801, + "grad_norm": 0.6558085680007935, + "learning_rate": 6.235453830698211e-06, + "loss": 0.796, + "step": 15339 + }, + { + "epoch": 0.8442952281358358, + "grad_norm": 0.6963368654251099, + "learning_rate": 6.235033800224898e-06, + "loss": 0.7077, + "step": 15340 + }, + { + "epoch": 0.8443502669381914, + "grad_norm": 0.6057709455490112, + "learning_rate": 6.234613760469399e-06, + "loss": 0.5443, + "step": 15341 + }, + { + "epoch": 0.8444053057405471, + "grad_norm": 0.7616491317749023, + "learning_rate": 6.234193711434875e-06, + "loss": 0.6764, + "step": 15342 + }, + { + "epoch": 0.8444603445429028, + "grad_norm": 0.7143368721008301, + "learning_rate": 6.233773653124482e-06, + "loss": 0.6647, + "step": 15343 + }, + { + "epoch": 0.8445153833452584, + "grad_norm": 0.8766696453094482, + "learning_rate": 6.233353585541375e-06, + "loss": 0.7112, + "step": 15344 + }, + { + "epoch": 0.844570422147614, + "grad_norm": 0.6184048652648926, + "learning_rate": 6.232933508688714e-06, + "loss": 0.6645, + "step": 15345 + }, + { + "epoch": 0.8446254609499697, + "grad_norm": 0.8119208812713623, + "learning_rate": 6.232513422569655e-06, + "loss": 0.6729, + "step": 15346 + }, + { + "epoch": 0.8446804997523254, + "grad_norm": 0.5964543223381042, + "learning_rate": 6.2320933271873544e-06, + "loss": 0.6931, + "step": 15347 + }, + { + "epoch": 0.8447355385546811, + "grad_norm": 0.696611225605011, + "learning_rate": 6.23167322254497e-06, + "loss": 0.8292, + "step": 15348 + }, + { + "epoch": 0.8447905773570367, + "grad_norm": 0.6196489930152893, + "learning_rate": 6.231253108645658e-06, + "loss": 0.6651, + "step": 15349 + }, + { + "epoch": 0.8448456161593924, + "grad_norm": 0.6222663521766663, + "learning_rate": 6.230832985492579e-06, + "loss": 0.6513, + "step": 15350 + }, + { + "epoch": 0.844900654961748, + "grad_norm": 0.6424199342727661, + "learning_rate": 6.230412853088889e-06, + "loss": 0.7005, + "step": 15351 + }, + { + "epoch": 0.8449556937641037, + "grad_norm": 0.6484132409095764, + "learning_rate": 6.229992711437745e-06, + "loss": 0.6931, + "step": 15352 + }, + { + "epoch": 0.8450107325664593, + "grad_norm": 0.7568885684013367, + "learning_rate": 6.229572560542303e-06, + "loss": 0.7036, + "step": 15353 + }, + { + "epoch": 0.845065771368815, + "grad_norm": 0.665937602519989, + "learning_rate": 6.229152400405724e-06, + "loss": 0.5498, + "step": 15354 + }, + { + "epoch": 0.8451208101711707, + "grad_norm": 0.6861961483955383, + "learning_rate": 6.228732231031165e-06, + "loss": 0.7622, + "step": 15355 + }, + { + "epoch": 0.8451758489735264, + "grad_norm": 0.6793088316917419, + "learning_rate": 6.2283120524217845e-06, + "loss": 0.758, + "step": 15356 + }, + { + "epoch": 0.845230887775882, + "grad_norm": 0.7460890412330627, + "learning_rate": 6.227891864580739e-06, + "loss": 0.6618, + "step": 15357 + }, + { + "epoch": 0.8452859265782376, + "grad_norm": 0.6434195041656494, + "learning_rate": 6.227471667511186e-06, + "loss": 0.7226, + "step": 15358 + }, + { + "epoch": 0.8453409653805933, + "grad_norm": 0.7655256986618042, + "learning_rate": 6.227051461216285e-06, + "loss": 0.8461, + "step": 15359 + }, + { + "epoch": 0.845396004182949, + "grad_norm": 0.6727028489112854, + "learning_rate": 6.226631245699193e-06, + "loss": 0.6765, + "step": 15360 + }, + { + "epoch": 0.8454510429853046, + "grad_norm": 0.6030625700950623, + "learning_rate": 6.226211020963069e-06, + "loss": 0.6548, + "step": 15361 + }, + { + "epoch": 0.8455060817876603, + "grad_norm": 0.6430317163467407, + "learning_rate": 6.225790787011071e-06, + "loss": 0.7564, + "step": 15362 + }, + { + "epoch": 0.845561120590016, + "grad_norm": 0.633975088596344, + "learning_rate": 6.225370543846359e-06, + "loss": 0.716, + "step": 15363 + }, + { + "epoch": 0.8456161593923717, + "grad_norm": 0.6722174286842346, + "learning_rate": 6.2249502914720895e-06, + "loss": 0.7266, + "step": 15364 + }, + { + "epoch": 0.8456711981947272, + "grad_norm": 0.724166214466095, + "learning_rate": 6.22453002989142e-06, + "loss": 0.788, + "step": 15365 + }, + { + "epoch": 0.8457262369970829, + "grad_norm": 0.6406343579292297, + "learning_rate": 6.224109759107512e-06, + "loss": 0.8086, + "step": 15366 + }, + { + "epoch": 0.8457812757994386, + "grad_norm": 0.7344949245452881, + "learning_rate": 6.223689479123523e-06, + "loss": 0.7838, + "step": 15367 + }, + { + "epoch": 0.8458363146017943, + "grad_norm": 0.8572549819946289, + "learning_rate": 6.22326918994261e-06, + "loss": 0.7427, + "step": 15368 + }, + { + "epoch": 0.8458913534041499, + "grad_norm": 0.662644624710083, + "learning_rate": 6.222848891567934e-06, + "loss": 0.7165, + "step": 15369 + }, + { + "epoch": 0.8459463922065056, + "grad_norm": 0.7139797210693359, + "learning_rate": 6.222428584002654e-06, + "loss": 0.8218, + "step": 15370 + }, + { + "epoch": 0.8460014310088613, + "grad_norm": 0.6846550107002258, + "learning_rate": 6.222008267249927e-06, + "loss": 0.6686, + "step": 15371 + }, + { + "epoch": 0.846056469811217, + "grad_norm": 0.6675787568092346, + "learning_rate": 6.221587941312914e-06, + "loss": 0.7151, + "step": 15372 + }, + { + "epoch": 0.8461115086135725, + "grad_norm": 0.626371443271637, + "learning_rate": 6.221167606194771e-06, + "loss": 0.7637, + "step": 15373 + }, + { + "epoch": 0.8461665474159282, + "grad_norm": 0.6768763065338135, + "learning_rate": 6.220747261898661e-06, + "loss": 0.7363, + "step": 15374 + }, + { + "epoch": 0.8462215862182839, + "grad_norm": 0.7771314978599548, + "learning_rate": 6.220326908427741e-06, + "loss": 0.7032, + "step": 15375 + }, + { + "epoch": 0.8462766250206396, + "grad_norm": 0.8215247392654419, + "learning_rate": 6.219906545785171e-06, + "loss": 0.8917, + "step": 15376 + }, + { + "epoch": 0.8463316638229952, + "grad_norm": 0.7277588248252869, + "learning_rate": 6.219486173974107e-06, + "loss": 0.7531, + "step": 15377 + }, + { + "epoch": 0.8463867026253509, + "grad_norm": 0.6487376093864441, + "learning_rate": 6.219065792997714e-06, + "loss": 0.7182, + "step": 15378 + }, + { + "epoch": 0.8464417414277066, + "grad_norm": 0.6960493326187134, + "learning_rate": 6.218645402859148e-06, + "loss": 0.8125, + "step": 15379 + }, + { + "epoch": 0.8464967802300621, + "grad_norm": 0.7183159589767456, + "learning_rate": 6.218225003561571e-06, + "loss": 0.6536, + "step": 15380 + }, + { + "epoch": 0.8465518190324178, + "grad_norm": 0.7001940011978149, + "learning_rate": 6.217804595108139e-06, + "loss": 0.8203, + "step": 15381 + }, + { + "epoch": 0.8466068578347735, + "grad_norm": 0.5986705422401428, + "learning_rate": 6.217384177502015e-06, + "loss": 0.6672, + "step": 15382 + }, + { + "epoch": 0.8466618966371292, + "grad_norm": 0.6191138029098511, + "learning_rate": 6.216963750746356e-06, + "loss": 0.6565, + "step": 15383 + }, + { + "epoch": 0.8467169354394848, + "grad_norm": 1.2927004098892212, + "learning_rate": 6.216543314844326e-06, + "loss": 0.7511, + "step": 15384 + }, + { + "epoch": 0.8467719742418405, + "grad_norm": 0.6715198159217834, + "learning_rate": 6.2161228697990785e-06, + "loss": 0.7712, + "step": 15385 + }, + { + "epoch": 0.8468270130441962, + "grad_norm": 0.7516033053398132, + "learning_rate": 6.215702415613778e-06, + "loss": 0.6595, + "step": 15386 + }, + { + "epoch": 0.8468820518465519, + "grad_norm": 0.6913008689880371, + "learning_rate": 6.215281952291585e-06, + "loss": 0.7262, + "step": 15387 + }, + { + "epoch": 0.8469370906489074, + "grad_norm": 0.7288102507591248, + "learning_rate": 6.214861479835657e-06, + "loss": 0.6628, + "step": 15388 + }, + { + "epoch": 0.8469921294512631, + "grad_norm": 0.7889914512634277, + "learning_rate": 6.214440998249155e-06, + "loss": 0.7744, + "step": 15389 + }, + { + "epoch": 0.8470471682536188, + "grad_norm": 0.7622396945953369, + "learning_rate": 6.21402050753524e-06, + "loss": 0.7818, + "step": 15390 + }, + { + "epoch": 0.8471022070559745, + "grad_norm": 0.6172721982002258, + "learning_rate": 6.213600007697072e-06, + "loss": 0.626, + "step": 15391 + }, + { + "epoch": 0.8471572458583301, + "grad_norm": 0.710991621017456, + "learning_rate": 6.213179498737812e-06, + "loss": 0.7313, + "step": 15392 + }, + { + "epoch": 0.8472122846606858, + "grad_norm": 0.660139262676239, + "learning_rate": 6.2127589806606195e-06, + "loss": 0.6479, + "step": 15393 + }, + { + "epoch": 0.8472673234630415, + "grad_norm": 0.6611735224723816, + "learning_rate": 6.2123384534686534e-06, + "loss": 0.7091, + "step": 15394 + }, + { + "epoch": 0.8473223622653971, + "grad_norm": 0.8392653465270996, + "learning_rate": 6.211917917165078e-06, + "loss": 0.8514, + "step": 15395 + }, + { + "epoch": 0.8473774010677527, + "grad_norm": 0.6202608942985535, + "learning_rate": 6.211497371753052e-06, + "loss": 0.7068, + "step": 15396 + }, + { + "epoch": 0.8474324398701084, + "grad_norm": 0.6785926818847656, + "learning_rate": 6.211076817235734e-06, + "loss": 0.7216, + "step": 15397 + }, + { + "epoch": 0.8474874786724641, + "grad_norm": 0.7234075665473938, + "learning_rate": 6.210656253616288e-06, + "loss": 0.7379, + "step": 15398 + }, + { + "epoch": 0.8475425174748198, + "grad_norm": 0.6223714351654053, + "learning_rate": 6.210235680897874e-06, + "loss": 0.758, + "step": 15399 + }, + { + "epoch": 0.8475975562771754, + "grad_norm": 0.7993804812431335, + "learning_rate": 6.209815099083651e-06, + "loss": 0.8174, + "step": 15400 + }, + { + "epoch": 0.8476525950795311, + "grad_norm": 0.7897897362709045, + "learning_rate": 6.209394508176783e-06, + "loss": 0.6833, + "step": 15401 + }, + { + "epoch": 0.8477076338818867, + "grad_norm": 0.6803291440010071, + "learning_rate": 6.208973908180429e-06, + "loss": 0.7977, + "step": 15402 + }, + { + "epoch": 0.8477626726842424, + "grad_norm": 0.6937161087989807, + "learning_rate": 6.208553299097751e-06, + "loss": 0.7118, + "step": 15403 + }, + { + "epoch": 0.847817711486598, + "grad_norm": 0.7939958572387695, + "learning_rate": 6.208132680931911e-06, + "loss": 0.794, + "step": 15404 + }, + { + "epoch": 0.8478727502889537, + "grad_norm": 0.7009061574935913, + "learning_rate": 6.207712053686068e-06, + "loss": 0.7534, + "step": 15405 + }, + { + "epoch": 0.8479277890913094, + "grad_norm": 0.6890555620193481, + "learning_rate": 6.207291417363384e-06, + "loss": 0.7638, + "step": 15406 + }, + { + "epoch": 0.8479828278936651, + "grad_norm": 0.677119255065918, + "learning_rate": 6.206870771967022e-06, + "loss": 0.6814, + "step": 15407 + }, + { + "epoch": 0.8480378666960207, + "grad_norm": 0.706792950630188, + "learning_rate": 6.2064501175001425e-06, + "loss": 0.7722, + "step": 15408 + }, + { + "epoch": 0.8480929054983763, + "grad_norm": 0.6590496897697449, + "learning_rate": 6.206029453965905e-06, + "loss": 0.772, + "step": 15409 + }, + { + "epoch": 0.848147944300732, + "grad_norm": 0.6821194887161255, + "learning_rate": 6.205608781367475e-06, + "loss": 0.7687, + "step": 15410 + }, + { + "epoch": 0.8482029831030877, + "grad_norm": 0.6030088663101196, + "learning_rate": 6.205188099708011e-06, + "loss": 0.6673, + "step": 15411 + }, + { + "epoch": 0.8482580219054433, + "grad_norm": 0.6877727508544922, + "learning_rate": 6.204767408990676e-06, + "loss": 0.756, + "step": 15412 + }, + { + "epoch": 0.848313060707799, + "grad_norm": 0.7107367515563965, + "learning_rate": 6.204346709218632e-06, + "loss": 0.7481, + "step": 15413 + }, + { + "epoch": 0.8483680995101547, + "grad_norm": 0.7213658094406128, + "learning_rate": 6.2039260003950395e-06, + "loss": 0.7135, + "step": 15414 + }, + { + "epoch": 0.8484231383125104, + "grad_norm": 0.7002324461936951, + "learning_rate": 6.203505282523063e-06, + "loss": 0.6768, + "step": 15415 + }, + { + "epoch": 0.848478177114866, + "grad_norm": 0.7483230829238892, + "learning_rate": 6.2030845556058614e-06, + "loss": 0.633, + "step": 15416 + }, + { + "epoch": 0.8485332159172216, + "grad_norm": 0.6701670289039612, + "learning_rate": 6.2026638196466e-06, + "loss": 0.7936, + "step": 15417 + }, + { + "epoch": 0.8485882547195773, + "grad_norm": 0.6940304636955261, + "learning_rate": 6.202243074648438e-06, + "loss": 0.7787, + "step": 15418 + }, + { + "epoch": 0.848643293521933, + "grad_norm": 0.5912098288536072, + "learning_rate": 6.20182232061454e-06, + "loss": 0.6458, + "step": 15419 + }, + { + "epoch": 0.8486983323242886, + "grad_norm": 0.6538116931915283, + "learning_rate": 6.201401557548066e-06, + "loss": 0.6986, + "step": 15420 + }, + { + "epoch": 0.8487533711266443, + "grad_norm": 1.0245170593261719, + "learning_rate": 6.20098078545218e-06, + "loss": 0.7111, + "step": 15421 + }, + { + "epoch": 0.848808409929, + "grad_norm": 0.6896708011627197, + "learning_rate": 6.200560004330043e-06, + "loss": 0.7921, + "step": 15422 + }, + { + "epoch": 0.8488634487313556, + "grad_norm": 0.6219936013221741, + "learning_rate": 6.2001392141848195e-06, + "loss": 0.7345, + "step": 15423 + }, + { + "epoch": 0.8489184875337112, + "grad_norm": 0.7418678998947144, + "learning_rate": 6.199718415019671e-06, + "loss": 0.8517, + "step": 15424 + }, + { + "epoch": 0.8489735263360669, + "grad_norm": 0.7002347111701965, + "learning_rate": 6.199297606837759e-06, + "loss": 0.7345, + "step": 15425 + }, + { + "epoch": 0.8490285651384226, + "grad_norm": 0.7004539966583252, + "learning_rate": 6.198876789642247e-06, + "loss": 0.7639, + "step": 15426 + }, + { + "epoch": 0.8490836039407782, + "grad_norm": 0.64945387840271, + "learning_rate": 6.1984559634362995e-06, + "loss": 0.7556, + "step": 15427 + }, + { + "epoch": 0.8491386427431339, + "grad_norm": 0.6660465598106384, + "learning_rate": 6.1980351282230764e-06, + "loss": 0.7342, + "step": 15428 + }, + { + "epoch": 0.8491936815454896, + "grad_norm": 0.6177669763565063, + "learning_rate": 6.197614284005743e-06, + "loss": 0.7092, + "step": 15429 + }, + { + "epoch": 0.8492487203478453, + "grad_norm": 0.7604618072509766, + "learning_rate": 6.197193430787462e-06, + "loss": 0.8271, + "step": 15430 + }, + { + "epoch": 0.8493037591502008, + "grad_norm": 0.6788204312324524, + "learning_rate": 6.196772568571394e-06, + "loss": 0.7817, + "step": 15431 + }, + { + "epoch": 0.8493587979525565, + "grad_norm": 0.6073753833770752, + "learning_rate": 6.196351697360704e-06, + "loss": 0.6479, + "step": 15432 + }, + { + "epoch": 0.8494138367549122, + "grad_norm": 0.6842348575592041, + "learning_rate": 6.195930817158555e-06, + "loss": 0.7956, + "step": 15433 + }, + { + "epoch": 0.8494688755572679, + "grad_norm": 0.7863163352012634, + "learning_rate": 6.19550992796811e-06, + "loss": 0.7441, + "step": 15434 + }, + { + "epoch": 0.8495239143596235, + "grad_norm": 0.7495602965354919, + "learning_rate": 6.195089029792532e-06, + "loss": 0.7854, + "step": 15435 + }, + { + "epoch": 0.8495789531619792, + "grad_norm": 0.6595779061317444, + "learning_rate": 6.194668122634986e-06, + "loss": 0.6705, + "step": 15436 + }, + { + "epoch": 0.8496339919643349, + "grad_norm": 0.7727940082550049, + "learning_rate": 6.194247206498633e-06, + "loss": 0.7269, + "step": 15437 + }, + { + "epoch": 0.8496890307666906, + "grad_norm": 0.7433161735534668, + "learning_rate": 6.193826281386639e-06, + "loss": 0.7747, + "step": 15438 + }, + { + "epoch": 0.8497440695690461, + "grad_norm": 0.7075695991516113, + "learning_rate": 6.193405347302165e-06, + "loss": 0.8423, + "step": 15439 + }, + { + "epoch": 0.8497991083714018, + "grad_norm": 0.8821007013320923, + "learning_rate": 6.192984404248377e-06, + "loss": 0.705, + "step": 15440 + }, + { + "epoch": 0.8498541471737575, + "grad_norm": 0.7283695936203003, + "learning_rate": 6.192563452228437e-06, + "loss": 0.7013, + "step": 15441 + }, + { + "epoch": 0.8499091859761132, + "grad_norm": 0.7810649275779724, + "learning_rate": 6.192142491245509e-06, + "loss": 0.8303, + "step": 15442 + }, + { + "epoch": 0.8499642247784688, + "grad_norm": 0.5930086374282837, + "learning_rate": 6.191721521302758e-06, + "loss": 0.7117, + "step": 15443 + }, + { + "epoch": 0.8500192635808245, + "grad_norm": 0.6570530533790588, + "learning_rate": 6.191300542403347e-06, + "loss": 0.7525, + "step": 15444 + }, + { + "epoch": 0.8500743023831802, + "grad_norm": 0.8024932146072388, + "learning_rate": 6.190879554550437e-06, + "loss": 0.8011, + "step": 15445 + }, + { + "epoch": 0.8501293411855358, + "grad_norm": 0.851327121257782, + "learning_rate": 6.190458557747199e-06, + "loss": 0.8117, + "step": 15446 + }, + { + "epoch": 0.8501843799878914, + "grad_norm": 0.816034197807312, + "learning_rate": 6.190037551996791e-06, + "loss": 0.6659, + "step": 15447 + }, + { + "epoch": 0.8502394187902471, + "grad_norm": 0.7001582980155945, + "learning_rate": 6.18961653730238e-06, + "loss": 0.7406, + "step": 15448 + }, + { + "epoch": 0.8502944575926028, + "grad_norm": 0.6798322200775146, + "learning_rate": 6.189195513667129e-06, + "loss": 0.7504, + "step": 15449 + }, + { + "epoch": 0.8503494963949585, + "grad_norm": 0.6565585136413574, + "learning_rate": 6.188774481094203e-06, + "loss": 0.6445, + "step": 15450 + }, + { + "epoch": 0.8504045351973141, + "grad_norm": 0.674721360206604, + "learning_rate": 6.188353439586767e-06, + "loss": 0.6718, + "step": 15451 + }, + { + "epoch": 0.8504595739996698, + "grad_norm": 0.7626152634620667, + "learning_rate": 6.187932389147984e-06, + "loss": 0.7273, + "step": 15452 + }, + { + "epoch": 0.8505146128020254, + "grad_norm": 0.6497740149497986, + "learning_rate": 6.18751132978102e-06, + "loss": 0.7619, + "step": 15453 + } + ], + "logging_steps": 1, + "max_steps": 36338, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 909, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.56027501400739e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-15453/training_args.bin b/checkpoint-15453/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4fcf8689837015e25934915ab36e9943776ca6cd --- /dev/null +++ b/checkpoint-15453/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c62f9cafd9057de88f53b2d6143eaf1e38cf3558d65c4e5642eaa284f31d316 +size 7928 diff --git a/checkpoint-15453/zero_to_fp32.py b/checkpoint-15453/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-15453/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-16362/config.json b/checkpoint-16362/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fda0153f8ee396146a87c398da9234b3dce005be --- /dev/null +++ b/checkpoint-16362/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "./meta-llama_Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 128259 +} diff --git a/checkpoint-16362/generation_config.json b/checkpoint-16362/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eab5082496e8b01f9c606a306676cbfabe0cce9d --- /dev/null +++ b/checkpoint-16362/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.46.1" +} diff --git a/checkpoint-16362/global_step16362/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..208eca3dc9ea9b43180f7f309a749f971ae0b91e --- /dev/null +++ b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:933ce374c000183b56a419eab968cd0a1e3835365602a7a37fbf5bb985805b3e +size 12045435328 diff --git a/checkpoint-16362/global_step16362/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8b9180e0a7e0d78b29f61d807b14e80d8c41112 --- /dev/null +++ b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97988fccd7683dfea1fe3f8eca6195b096d794a28a1b768412a680d9b1a8ed35 +size 12045436096 diff --git a/checkpoint-16362/global_step16362/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..336d99055a88118b985abed612fa3ad7d4cad95f --- /dev/null +++ b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cb14ea5df05ae37c0517d666147798ebe68fca05076f39cf4aecc9dd2e347ed +size 12045436352 diff --git a/checkpoint-16362/global_step16362/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46200ca224046ad261cdb28534730c2aec328cde --- /dev/null +++ b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c26c0ecd59cee11e37d2c5010491a872ece5595d930f3e1fb26891cb07bdc07 +size 12045436096 diff --git a/checkpoint-16362/global_step16362/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce97550c609e8389dae12a730c6691a7130605c4 --- /dev/null +++ b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75ad6ed5097c0522ab0c6bf9984a73f35010697302ce0036df0b42413c66eed7 +size 12045436352 diff --git a/checkpoint-16362/global_step16362/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8adc726dd90f155b7005f5cb8a41311b9e5365e --- /dev/null +++ b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec0eae3bc6d097cc01fa1b34d91409dfdbd05c9150a05849ef07131d88128eed +size 12045436416 diff --git a/checkpoint-16362/global_step16362/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adc20527195a753413d678ad5c8c23d68a44dd5c --- /dev/null +++ b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eb290fe0b07bb65f5fad20416f2e1a5730907e7ecb47273bf5118d43dfef9c3 +size 12045436096 diff --git a/checkpoint-16362/global_step16362/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..247177967abdafdc547927a3b067c7843612a996 --- /dev/null +++ b/checkpoint-16362/global_step16362/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb4f5710b4691930f57fac785196dcf95e5deb80174579db8d6f05f58ab55f0d +size 12045435008 diff --git a/checkpoint-16362/global_step16362/mp_rank_00_model_states.pt b/checkpoint-16362/global_step16362/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6284fc149abd65a5eaf373b5f475b4cb706532a4 --- /dev/null +++ b/checkpoint-16362/global_step16362/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ab5f2df80acc237251ee72d699c484001e1f9d466703467db2da3b59ea27fcc +size 16060659704 diff --git a/checkpoint-16362/latest b/checkpoint-16362/latest new file mode 100644 index 0000000000000000000000000000000000000000..eb3c4d5f7ee1a2d39466c03921bd7e4b1fe64307 --- /dev/null +++ b/checkpoint-16362/latest @@ -0,0 +1 @@ +global_step16362 \ No newline at end of file diff --git a/checkpoint-16362/model-00001-of-00004.safetensors b/checkpoint-16362/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e809e5c1ea0929651b328fe8045434e5f279847f --- /dev/null +++ b/checkpoint-16362/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:749a06336aa47ec31f40c88739b5b6353cc26df367e1cbfaddf59331df676f60 +size 4976723248 diff --git a/checkpoint-16362/model-00002-of-00004.safetensors b/checkpoint-16362/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5258d95941c9e7620b0bca8979a110ce181a6d28 --- /dev/null +++ b/checkpoint-16362/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4856485eb711d19c990579e98a654674783e70cd86f41401353e68f429bfee24 +size 4999802720 diff --git a/checkpoint-16362/model-00003-of-00004.safetensors b/checkpoint-16362/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d5209129bffb835605433653501a85dabc1cf88b --- /dev/null +++ b/checkpoint-16362/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a30cea172a88a56546196eb695a5ae988ce84b879e5cdc11b25c2c4097311074 +size 4915916176 diff --git a/checkpoint-16362/model-00004-of-00004.safetensors b/checkpoint-16362/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2c6983a200d0782f2332579893e0421ea421d1a4 --- /dev/null +++ b/checkpoint-16362/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:324624bec23fdc1b231bde700b117c6d7f3ab28ba5e1b655f5675b276a7ad014 +size 1168163384 diff --git a/checkpoint-16362/model.safetensors.index.json b/checkpoint-16362/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..e734f8f9bcabe95e936a11f19b77148f54640122 --- /dev/null +++ b/checkpoint-16362/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060571648 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/checkpoint-16362/rng_state_0.pth b/checkpoint-16362/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6473612e41c5cfd6973c2e71fa5f3ad2b2bcad1 --- /dev/null +++ b/checkpoint-16362/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2 +size 15984 diff --git a/checkpoint-16362/rng_state_1.pth b/checkpoint-16362/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8506e00431b6ac7067699c0ea4f59adb6fa0ba20 --- /dev/null +++ b/checkpoint-16362/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0728b56dab7abb5ef8a0d4bae3519c5767c97467bdd886d26bf19cc8599d0312 +size 15984 diff --git a/checkpoint-16362/rng_state_2.pth b/checkpoint-16362/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea499e285c97cca07fedd34662c3d4ab44ff6f47 --- /dev/null +++ b/checkpoint-16362/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e481d4ef1546694da7337f6bb6c658b866dcb79b85deeb477da0d27ebe851e +size 15984 diff --git a/checkpoint-16362/rng_state_3.pth b/checkpoint-16362/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..aeb38f92f106ac3f08bae4f82179a8a12243bccb --- /dev/null +++ b/checkpoint-16362/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353c60be37ea56fc992fca446598ceca5d1fd002aa3bd6dbb9ad740e6f47ebb3 +size 15984 diff --git a/checkpoint-16362/rng_state_4.pth b/checkpoint-16362/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d5856cb7a3f15092fa5593507022316916f648e --- /dev/null +++ b/checkpoint-16362/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9107fe964ba7205e354084b85210e5a5ea1c98cfd4d38adb9cd3926945dcae4 +size 15984 diff --git a/checkpoint-16362/rng_state_5.pth b/checkpoint-16362/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b824ee24d256695aad4a69a62d8e7125f51a17f2 --- /dev/null +++ b/checkpoint-16362/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d1bb1abee38b92e53f3f23549b642ce0f1edcdccf7b6129847ac61636e96d5 +size 15984 diff --git a/checkpoint-16362/rng_state_6.pth b/checkpoint-16362/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9fd0364bb8f1a8e91eca45be5e1b6672b4d9afd --- /dev/null +++ b/checkpoint-16362/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd5516048e20f36959601574e29e40106085a7d3cdc7bf425ce5e84633490e6 +size 15984 diff --git a/checkpoint-16362/rng_state_7.pth b/checkpoint-16362/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e80125fd18efcb1097384319888b699f4dce7e7 --- /dev/null +++ b/checkpoint-16362/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2c46927fc06939b4c976a01e4b95dec1f8b98ceaea86d31a5d756fc30ff006 +size 15984 diff --git a/checkpoint-16362/scheduler.pt b/checkpoint-16362/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3a840cbb7f7aac87425e7eedd7f3be61ab24290 --- /dev/null +++ b/checkpoint-16362/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b43175d3456786eac68100efb57397de337a3f5ee909899653ef15f80de3f2d9 +size 1064 diff --git a/checkpoint-16362/special_tokens_map.json b/checkpoint-16362/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-16362/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-16362/tokenizer.json b/checkpoint-16362/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9d29771c68b37af9541b4c450532cb095b564ca5 --- /dev/null +++ b/checkpoint-16362/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91a36f231bc2882e8c2e1859bc27098f73c95ea211ccb73ad0cdb441a16f49c6 +size 17210280 diff --git a/checkpoint-16362/tokenizer_config.json b/checkpoint-16362/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a695c457b54a00f10768564f6c25b0142ccc840 --- /dev/null +++ b/checkpoint-16362/tokenizer_config.json @@ -0,0 +1,2087 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|im_title|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|end_title|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|im_op|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|end_op|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|im_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|end_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|begin_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|end_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|im_khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|end_khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|im_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|end_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|autheur|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|sujet|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|><|khey|><|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-16362/trainer_state.json b/checkpoint-16362/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f050e233469fe543ec49c7caa3922c3ddbabdaed --- /dev/null +++ b/checkpoint-16362/trainer_state.json @@ -0,0 +1,114567 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.900544884143321, + "eval_steps": 500, + "global_step": 16362, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.503880235566074e-05, + "grad_norm": 459.8753356933594, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.303, + "step": 1 + }, + { + "epoch": 0.00011007760471132149, + "grad_norm": 314.2561950683594, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.8226, + "step": 2 + }, + { + "epoch": 0.0001651164070669822, + "grad_norm": 314.1292419433594, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.8517, + "step": 3 + }, + { + "epoch": 0.00022015520942264297, + "grad_norm": 312.4049072265625, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.6248, + "step": 4 + }, + { + "epoch": 0.0002751940117783037, + "grad_norm": 353.7213134765625, + "learning_rate": 5.000000000000001e-07, + "loss": 2.7883, + "step": 5 + }, + { + "epoch": 0.0003302328141339644, + "grad_norm": 278.41668701171875, + "learning_rate": 6.000000000000001e-07, + "loss": 2.5468, + "step": 6 + }, + { + "epoch": 0.0003852716164896252, + "grad_norm": 336.14532470703125, + "learning_rate": 7.000000000000001e-07, + "loss": 2.7721, + "step": 7 + }, + { + "epoch": 0.00044031041884528595, + "grad_norm": 201.19374084472656, + "learning_rate": 8.000000000000001e-07, + "loss": 2.4873, + "step": 8 + }, + { + "epoch": 0.0004953492212009466, + "grad_norm": 184.7027587890625, + "learning_rate": 9.000000000000001e-07, + "loss": 2.6647, + "step": 9 + }, + { + "epoch": 0.0005503880235566074, + "grad_norm": 154.597412109375, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.602, + "step": 10 + }, + { + "epoch": 0.0006054268259122681, + "grad_norm": 40.47785568237305, + "learning_rate": 1.1e-06, + "loss": 2.6716, + "step": 11 + }, + { + "epoch": 0.0006604656282679288, + "grad_norm": 25.338607788085938, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.2631, + "step": 12 + }, + { + "epoch": 0.0007155044306235897, + "grad_norm": 24.976919174194336, + "learning_rate": 1.3e-06, + "loss": 2.3564, + "step": 13 + }, + { + "epoch": 0.0007705432329792504, + "grad_norm": 15.239912033081055, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.3295, + "step": 14 + }, + { + "epoch": 0.0008255820353349112, + "grad_norm": 14.125042915344238, + "learning_rate": 1.5e-06, + "loss": 2.307, + "step": 15 + }, + { + "epoch": 0.0008806208376905719, + "grad_norm": 13.163726806640625, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.1493, + "step": 16 + }, + { + "epoch": 0.0009356596400462326, + "grad_norm": 8.726515769958496, + "learning_rate": 1.7000000000000002e-06, + "loss": 2.0333, + "step": 17 + }, + { + "epoch": 0.0009906984424018933, + "grad_norm": 9.072502136230469, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.2046, + "step": 18 + }, + { + "epoch": 0.001045737244757554, + "grad_norm": 9.412588119506836, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.2001, + "step": 19 + }, + { + "epoch": 0.0011007760471132147, + "grad_norm": 8.67534065246582, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.7679, + "step": 20 + }, + { + "epoch": 0.0011558148494688755, + "grad_norm": 14.015918731689453, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.9566, + "step": 21 + }, + { + "epoch": 0.0012108536518245362, + "grad_norm": 7.9474687576293945, + "learning_rate": 2.2e-06, + "loss": 1.9085, + "step": 22 + }, + { + "epoch": 0.001265892454180197, + "grad_norm": 6.806368350982666, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.7918, + "step": 23 + }, + { + "epoch": 0.0013209312565358577, + "grad_norm": 5.3452582359313965, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.8321, + "step": 24 + }, + { + "epoch": 0.0013759700588915184, + "grad_norm": 8.744244575500488, + "learning_rate": 2.5e-06, + "loss": 1.6317, + "step": 25 + }, + { + "epoch": 0.0014310088612471794, + "grad_norm": 5.304683685302734, + "learning_rate": 2.6e-06, + "loss": 1.6846, + "step": 26 + }, + { + "epoch": 0.00148604766360284, + "grad_norm": 5.650127410888672, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7449, + "step": 27 + }, + { + "epoch": 0.0015410864659585008, + "grad_norm": 5.479269504547119, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.8158, + "step": 28 + }, + { + "epoch": 0.0015961252683141616, + "grad_norm": 4.873537063598633, + "learning_rate": 2.9e-06, + "loss": 1.8015, + "step": 29 + }, + { + "epoch": 0.0016511640706698223, + "grad_norm": 4.971101760864258, + "learning_rate": 3e-06, + "loss": 1.9034, + "step": 30 + }, + { + "epoch": 0.001706202873025483, + "grad_norm": 4.407571315765381, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.9037, + "step": 31 + }, + { + "epoch": 0.0017612416753811438, + "grad_norm": 4.429073810577393, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6812, + "step": 32 + }, + { + "epoch": 0.0018162804777368045, + "grad_norm": 5.16085147857666, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.7627, + "step": 33 + }, + { + "epoch": 0.0018713192800924653, + "grad_norm": 4.0805768966674805, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.6799, + "step": 34 + }, + { + "epoch": 0.001926358082448126, + "grad_norm": 4.548702239990234, + "learning_rate": 3.5e-06, + "loss": 1.7799, + "step": 35 + }, + { + "epoch": 0.0019813968848037865, + "grad_norm": 5.181888580322266, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.8235, + "step": 36 + }, + { + "epoch": 0.0020364356871594475, + "grad_norm": 3.9876129627227783, + "learning_rate": 3.7e-06, + "loss": 1.5999, + "step": 37 + }, + { + "epoch": 0.002091474489515108, + "grad_norm": 6.325051307678223, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.7499, + "step": 38 + }, + { + "epoch": 0.002146513291870769, + "grad_norm": 6.199049949645996, + "learning_rate": 3.900000000000001e-06, + "loss": 1.784, + "step": 39 + }, + { + "epoch": 0.0022015520942264295, + "grad_norm": 4.83912992477417, + "learning_rate": 4.000000000000001e-06, + "loss": 1.8895, + "step": 40 + }, + { + "epoch": 0.0022565908965820904, + "grad_norm": 4.515626907348633, + "learning_rate": 4.1e-06, + "loss": 1.4887, + "step": 41 + }, + { + "epoch": 0.002311629698937751, + "grad_norm": 5.032265663146973, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.7324, + "step": 42 + }, + { + "epoch": 0.002366668501293412, + "grad_norm": 4.1879048347473145, + "learning_rate": 4.3e-06, + "loss": 1.4912, + "step": 43 + }, + { + "epoch": 0.0024217073036490724, + "grad_norm": 4.128026485443115, + "learning_rate": 4.4e-06, + "loss": 1.554, + "step": 44 + }, + { + "epoch": 0.0024767461060047334, + "grad_norm": 4.527958393096924, + "learning_rate": 4.5e-06, + "loss": 1.652, + "step": 45 + }, + { + "epoch": 0.002531784908360394, + "grad_norm": 4.8388190269470215, + "learning_rate": 4.600000000000001e-06, + "loss": 1.6696, + "step": 46 + }, + { + "epoch": 0.002586823710716055, + "grad_norm": 4.2088541984558105, + "learning_rate": 4.7e-06, + "loss": 1.568, + "step": 47 + }, + { + "epoch": 0.0026418625130717154, + "grad_norm": 4.789997577667236, + "learning_rate": 4.800000000000001e-06, + "loss": 1.642, + "step": 48 + }, + { + "epoch": 0.0026969013154273763, + "grad_norm": 4.408346652984619, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.5181, + "step": 49 + }, + { + "epoch": 0.002751940117783037, + "grad_norm": 4.572340488433838, + "learning_rate": 5e-06, + "loss": 1.6698, + "step": 50 + }, + { + "epoch": 0.0028069789201386978, + "grad_norm": 4.728564739227295, + "learning_rate": 5.1e-06, + "loss": 1.5785, + "step": 51 + }, + { + "epoch": 0.0028620177224943587, + "grad_norm": 4.449855327606201, + "learning_rate": 5.2e-06, + "loss": 1.4624, + "step": 52 + }, + { + "epoch": 0.0029170565248500193, + "grad_norm": 4.127189636230469, + "learning_rate": 5.300000000000001e-06, + "loss": 1.6061, + "step": 53 + }, + { + "epoch": 0.00297209532720568, + "grad_norm": 4.244532108306885, + "learning_rate": 5.400000000000001e-06, + "loss": 1.491, + "step": 54 + }, + { + "epoch": 0.0030271341295613407, + "grad_norm": 3.437682628631592, + "learning_rate": 5.500000000000001e-06, + "loss": 1.1967, + "step": 55 + }, + { + "epoch": 0.0030821729319170017, + "grad_norm": 3.83516788482666, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4731, + "step": 56 + }, + { + "epoch": 0.003137211734272662, + "grad_norm": 3.9108972549438477, + "learning_rate": 5.7e-06, + "loss": 1.4393, + "step": 57 + }, + { + "epoch": 0.003192250536628323, + "grad_norm": 3.5258419513702393, + "learning_rate": 5.8e-06, + "loss": 1.4206, + "step": 58 + }, + { + "epoch": 0.0032472893389839837, + "grad_norm": 4.124903678894043, + "learning_rate": 5.9e-06, + "loss": 1.4747, + "step": 59 + }, + { + "epoch": 0.0033023281413396446, + "grad_norm": 4.055769920349121, + "learning_rate": 6e-06, + "loss": 1.4655, + "step": 60 + }, + { + "epoch": 0.003357366943695305, + "grad_norm": 3.904837131500244, + "learning_rate": 6.1e-06, + "loss": 1.5125, + "step": 61 + }, + { + "epoch": 0.003412405746050966, + "grad_norm": 3.2904794216156006, + "learning_rate": 6.200000000000001e-06, + "loss": 1.4596, + "step": 62 + }, + { + "epoch": 0.0034674445484066266, + "grad_norm": 3.24053692817688, + "learning_rate": 6.300000000000001e-06, + "loss": 1.3851, + "step": 63 + }, + { + "epoch": 0.0035224833507622876, + "grad_norm": 3.457639217376709, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.4019, + "step": 64 + }, + { + "epoch": 0.003577522153117948, + "grad_norm": 3.073054790496826, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.2872, + "step": 65 + }, + { + "epoch": 0.003632560955473609, + "grad_norm": 2.6726694107055664, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2361, + "step": 66 + }, + { + "epoch": 0.0036875997578292696, + "grad_norm": 2.9378459453582764, + "learning_rate": 6.700000000000001e-06, + "loss": 1.4452, + "step": 67 + }, + { + "epoch": 0.0037426385601849305, + "grad_norm": 2.81107234954834, + "learning_rate": 6.800000000000001e-06, + "loss": 1.4804, + "step": 68 + }, + { + "epoch": 0.003797677362540591, + "grad_norm": 2.60062313079834, + "learning_rate": 6.9e-06, + "loss": 1.3263, + "step": 69 + }, + { + "epoch": 0.003852716164896252, + "grad_norm": 2.5642921924591064, + "learning_rate": 7e-06, + "loss": 1.2751, + "step": 70 + }, + { + "epoch": 0.0039077549672519125, + "grad_norm": 2.3608031272888184, + "learning_rate": 7.100000000000001e-06, + "loss": 1.2614, + "step": 71 + }, + { + "epoch": 0.003962793769607573, + "grad_norm": 2.7201738357543945, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.5018, + "step": 72 + }, + { + "epoch": 0.004017832571963234, + "grad_norm": 2.584726095199585, + "learning_rate": 7.3e-06, + "loss": 1.3519, + "step": 73 + }, + { + "epoch": 0.004072871374318895, + "grad_norm": 1.9693044424057007, + "learning_rate": 7.4e-06, + "loss": 1.0934, + "step": 74 + }, + { + "epoch": 0.0041279101766745555, + "grad_norm": 2.220736503601074, + "learning_rate": 7.500000000000001e-06, + "loss": 1.4687, + "step": 75 + }, + { + "epoch": 0.004182948979030216, + "grad_norm": 2.2629456520080566, + "learning_rate": 7.600000000000001e-06, + "loss": 1.3328, + "step": 76 + }, + { + "epoch": 0.004237987781385877, + "grad_norm": 2.051820993423462, + "learning_rate": 7.7e-06, + "loss": 1.3058, + "step": 77 + }, + { + "epoch": 0.004293026583741538, + "grad_norm": 2.2451820373535156, + "learning_rate": 7.800000000000002e-06, + "loss": 1.3556, + "step": 78 + }, + { + "epoch": 0.004348065386097198, + "grad_norm": 3.13584303855896, + "learning_rate": 7.9e-06, + "loss": 1.3262, + "step": 79 + }, + { + "epoch": 0.004403104188452859, + "grad_norm": 5.024479866027832, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2103, + "step": 80 + }, + { + "epoch": 0.00445814299080852, + "grad_norm": 2.070889711380005, + "learning_rate": 8.1e-06, + "loss": 1.1994, + "step": 81 + }, + { + "epoch": 0.004513181793164181, + "grad_norm": 2.797286033630371, + "learning_rate": 8.2e-06, + "loss": 1.3075, + "step": 82 + }, + { + "epoch": 0.004568220595519841, + "grad_norm": 2.11370849609375, + "learning_rate": 8.3e-06, + "loss": 1.36, + "step": 83 + }, + { + "epoch": 0.004623259397875502, + "grad_norm": 2.5416152477264404, + "learning_rate": 8.400000000000001e-06, + "loss": 1.3484, + "step": 84 + }, + { + "epoch": 0.004678298200231163, + "grad_norm": 2.4702343940734863, + "learning_rate": 8.5e-06, + "loss": 1.3677, + "step": 85 + }, + { + "epoch": 0.004733337002586824, + "grad_norm": 3.670365333557129, + "learning_rate": 8.6e-06, + "loss": 1.2192, + "step": 86 + }, + { + "epoch": 0.004788375804942484, + "grad_norm": 2.282954692840576, + "learning_rate": 8.700000000000001e-06, + "loss": 1.2982, + "step": 87 + }, + { + "epoch": 0.004843414607298145, + "grad_norm": 2.3659238815307617, + "learning_rate": 8.8e-06, + "loss": 1.3206, + "step": 88 + }, + { + "epoch": 0.004898453409653806, + "grad_norm": 4.939981460571289, + "learning_rate": 8.900000000000001e-06, + "loss": 1.4328, + "step": 89 + }, + { + "epoch": 0.004953492212009467, + "grad_norm": 2.335858106613159, + "learning_rate": 9e-06, + "loss": 1.2603, + "step": 90 + }, + { + "epoch": 0.005008531014365127, + "grad_norm": 2.2165043354034424, + "learning_rate": 9.100000000000001e-06, + "loss": 1.3141, + "step": 91 + }, + { + "epoch": 0.005063569816720788, + "grad_norm": 2.7872185707092285, + "learning_rate": 9.200000000000002e-06, + "loss": 1.3314, + "step": 92 + }, + { + "epoch": 0.005118608619076449, + "grad_norm": 2.6353912353515625, + "learning_rate": 9.3e-06, + "loss": 1.2027, + "step": 93 + }, + { + "epoch": 0.00517364742143211, + "grad_norm": 3.2509102821350098, + "learning_rate": 9.4e-06, + "loss": 1.2316, + "step": 94 + }, + { + "epoch": 0.00522868622378777, + "grad_norm": 2.4560611248016357, + "learning_rate": 9.5e-06, + "loss": 1.1848, + "step": 95 + }, + { + "epoch": 0.005283725026143431, + "grad_norm": 2.338151216506958, + "learning_rate": 9.600000000000001e-06, + "loss": 1.2392, + "step": 96 + }, + { + "epoch": 0.005338763828499092, + "grad_norm": 2.231065034866333, + "learning_rate": 9.7e-06, + "loss": 1.2089, + "step": 97 + }, + { + "epoch": 0.005393802630854753, + "grad_norm": 2.278428077697754, + "learning_rate": 9.800000000000001e-06, + "loss": 1.2267, + "step": 98 + }, + { + "epoch": 0.005448841433210413, + "grad_norm": 2.4422810077667236, + "learning_rate": 9.9e-06, + "loss": 1.2041, + "step": 99 + }, + { + "epoch": 0.005503880235566074, + "grad_norm": 2.216248035430908, + "learning_rate": 1e-05, + "loss": 1.0798, + "step": 100 + }, + { + "epoch": 0.005558919037921735, + "grad_norm": 2.3301615715026855, + "learning_rate": 9.99999998121067e-06, + "loss": 1.3069, + "step": 101 + }, + { + "epoch": 0.0056139578402773956, + "grad_norm": 2.315436363220215, + "learning_rate": 9.999999924842678e-06, + "loss": 1.1589, + "step": 102 + }, + { + "epoch": 0.005668996642633056, + "grad_norm": 2.3522140979766846, + "learning_rate": 9.999999830896024e-06, + "loss": 1.0978, + "step": 103 + }, + { + "epoch": 0.0057240354449887175, + "grad_norm": 2.5798308849334717, + "learning_rate": 9.99999969937071e-06, + "loss": 1.0599, + "step": 104 + }, + { + "epoch": 0.005779074247344378, + "grad_norm": 2.456644058227539, + "learning_rate": 9.999999530266738e-06, + "loss": 1.1682, + "step": 105 + }, + { + "epoch": 0.0058341130497000385, + "grad_norm": 2.1559031009674072, + "learning_rate": 9.999999323584106e-06, + "loss": 1.0631, + "step": 106 + }, + { + "epoch": 0.005889151852055699, + "grad_norm": 2.2985048294067383, + "learning_rate": 9.99999907932282e-06, + "loss": 1.1455, + "step": 107 + }, + { + "epoch": 0.00594419065441136, + "grad_norm": 2.596167802810669, + "learning_rate": 9.999998797482877e-06, + "loss": 1.1686, + "step": 108 + }, + { + "epoch": 0.005999229456767021, + "grad_norm": 2.378618001937866, + "learning_rate": 9.999998478064283e-06, + "loss": 1.2226, + "step": 109 + }, + { + "epoch": 0.0060542682591226814, + "grad_norm": 2.228116750717163, + "learning_rate": 9.999998121067038e-06, + "loss": 1.1396, + "step": 110 + }, + { + "epoch": 0.006109307061478342, + "grad_norm": 2.4419472217559814, + "learning_rate": 9.999997726491146e-06, + "loss": 1.1401, + "step": 111 + }, + { + "epoch": 0.006164345863834003, + "grad_norm": 2.0695526599884033, + "learning_rate": 9.999997294336608e-06, + "loss": 1.1868, + "step": 112 + }, + { + "epoch": 0.006219384666189664, + "grad_norm": 2.3170363903045654, + "learning_rate": 9.99999682460343e-06, + "loss": 1.1172, + "step": 113 + }, + { + "epoch": 0.006274423468545324, + "grad_norm": 2.670466184616089, + "learning_rate": 9.999996317291615e-06, + "loss": 1.2481, + "step": 114 + }, + { + "epoch": 0.006329462270900985, + "grad_norm": 2.1214540004730225, + "learning_rate": 9.999995772401166e-06, + "loss": 0.9994, + "step": 115 + }, + { + "epoch": 0.006384501073256646, + "grad_norm": 1.9283969402313232, + "learning_rate": 9.999995189932085e-06, + "loss": 1.0692, + "step": 116 + }, + { + "epoch": 0.006439539875612307, + "grad_norm": 2.2620882987976074, + "learning_rate": 9.99999456988438e-06, + "loss": 1.0725, + "step": 117 + }, + { + "epoch": 0.006494578677967967, + "grad_norm": 2.2121341228485107, + "learning_rate": 9.999993912258055e-06, + "loss": 1.1328, + "step": 118 + }, + { + "epoch": 0.006549617480323628, + "grad_norm": 2.298126220703125, + "learning_rate": 9.999993217053113e-06, + "loss": 1.1272, + "step": 119 + }, + { + "epoch": 0.006604656282679289, + "grad_norm": 1.81593656539917, + "learning_rate": 9.99999248426956e-06, + "loss": 1.017, + "step": 120 + }, + { + "epoch": 0.00665969508503495, + "grad_norm": 2.1174378395080566, + "learning_rate": 9.999991713907403e-06, + "loss": 1.0557, + "step": 121 + }, + { + "epoch": 0.00671473388739061, + "grad_norm": 1.9061017036437988, + "learning_rate": 9.999990905966647e-06, + "loss": 1.0379, + "step": 122 + }, + { + "epoch": 0.006769772689746271, + "grad_norm": 1.912500023841858, + "learning_rate": 9.999990060447297e-06, + "loss": 1.104, + "step": 123 + }, + { + "epoch": 0.006824811492101932, + "grad_norm": 1.9249529838562012, + "learning_rate": 9.99998917734936e-06, + "loss": 1.0136, + "step": 124 + }, + { + "epoch": 0.006879850294457593, + "grad_norm": 1.8504948616027832, + "learning_rate": 9.999988256672843e-06, + "loss": 0.99, + "step": 125 + }, + { + "epoch": 0.006934889096813253, + "grad_norm": 1.720042109489441, + "learning_rate": 9.999987298417753e-06, + "loss": 1.0666, + "step": 126 + }, + { + "epoch": 0.006989927899168914, + "grad_norm": 1.778251051902771, + "learning_rate": 9.999986302584097e-06, + "loss": 1.0424, + "step": 127 + }, + { + "epoch": 0.007044966701524575, + "grad_norm": 1.9485961198806763, + "learning_rate": 9.999985269171881e-06, + "loss": 1.105, + "step": 128 + }, + { + "epoch": 0.007100005503880236, + "grad_norm": 3.0802104473114014, + "learning_rate": 9.999984198181114e-06, + "loss": 1.1081, + "step": 129 + }, + { + "epoch": 0.007155044306235896, + "grad_norm": 1.7476954460144043, + "learning_rate": 9.999983089611806e-06, + "loss": 0.9677, + "step": 130 + }, + { + "epoch": 0.007210083108591557, + "grad_norm": 1.6127299070358276, + "learning_rate": 9.999981943463963e-06, + "loss": 0.9937, + "step": 131 + }, + { + "epoch": 0.007265121910947218, + "grad_norm": 2.1477208137512207, + "learning_rate": 9.999980759737594e-06, + "loss": 1.0319, + "step": 132 + }, + { + "epoch": 0.007320160713302879, + "grad_norm": 1.531163215637207, + "learning_rate": 9.999979538432707e-06, + "loss": 0.8696, + "step": 133 + }, + { + "epoch": 0.007375199515658539, + "grad_norm": 1.8226820230484009, + "learning_rate": 9.999978279549313e-06, + "loss": 1.2061, + "step": 134 + }, + { + "epoch": 0.0074302383180142, + "grad_norm": 1.481895923614502, + "learning_rate": 9.99997698308742e-06, + "loss": 0.949, + "step": 135 + }, + { + "epoch": 0.007485277120369861, + "grad_norm": 1.6715927124023438, + "learning_rate": 9.99997564904704e-06, + "loss": 1.1579, + "step": 136 + }, + { + "epoch": 0.0075403159227255215, + "grad_norm": 1.4235272407531738, + "learning_rate": 9.999974277428179e-06, + "loss": 1.064, + "step": 137 + }, + { + "epoch": 0.007595354725081182, + "grad_norm": 1.3524872064590454, + "learning_rate": 9.999972868230852e-06, + "loss": 0.9141, + "step": 138 + }, + { + "epoch": 0.007650393527436843, + "grad_norm": 1.3741765022277832, + "learning_rate": 9.999971421455066e-06, + "loss": 1.0256, + "step": 139 + }, + { + "epoch": 0.007705432329792504, + "grad_norm": 1.9869598150253296, + "learning_rate": 9.999969937100835e-06, + "loss": 0.9489, + "step": 140 + }, + { + "epoch": 0.0077604711321481645, + "grad_norm": 1.4785465002059937, + "learning_rate": 9.999968415168166e-06, + "loss": 0.9243, + "step": 141 + }, + { + "epoch": 0.007815509934503825, + "grad_norm": 1.5476176738739014, + "learning_rate": 9.999966855657074e-06, + "loss": 1.178, + "step": 142 + }, + { + "epoch": 0.007870548736859486, + "grad_norm": 1.500401258468628, + "learning_rate": 9.99996525856757e-06, + "loss": 0.9837, + "step": 143 + }, + { + "epoch": 0.007925587539215146, + "grad_norm": 1.3777157068252563, + "learning_rate": 9.999963623899664e-06, + "loss": 1.0732, + "step": 144 + }, + { + "epoch": 0.007980626341570807, + "grad_norm": 1.4466841220855713, + "learning_rate": 9.99996195165337e-06, + "loss": 0.9779, + "step": 145 + }, + { + "epoch": 0.008035665143926469, + "grad_norm": 1.5304051637649536, + "learning_rate": 9.9999602418287e-06, + "loss": 1.196, + "step": 146 + }, + { + "epoch": 0.008090703946282128, + "grad_norm": 1.9012362957000732, + "learning_rate": 9.99995849442567e-06, + "loss": 0.9797, + "step": 147 + }, + { + "epoch": 0.00814574274863779, + "grad_norm": 1.430679202079773, + "learning_rate": 9.999956709444289e-06, + "loss": 0.9869, + "step": 148 + }, + { + "epoch": 0.00820078155099345, + "grad_norm": 1.3489817380905151, + "learning_rate": 9.99995488688457e-06, + "loss": 1.0137, + "step": 149 + }, + { + "epoch": 0.008255820353349111, + "grad_norm": 1.1878125667572021, + "learning_rate": 9.999953026746531e-06, + "loss": 0.9355, + "step": 150 + }, + { + "epoch": 0.008310859155704772, + "grad_norm": 1.3481942415237427, + "learning_rate": 9.999951129030182e-06, + "loss": 1.1235, + "step": 151 + }, + { + "epoch": 0.008365897958060432, + "grad_norm": 1.7335314750671387, + "learning_rate": 9.999949193735539e-06, + "loss": 0.9382, + "step": 152 + }, + { + "epoch": 0.008420936760416093, + "grad_norm": 1.2029480934143066, + "learning_rate": 9.999947220862615e-06, + "loss": 0.9419, + "step": 153 + }, + { + "epoch": 0.008475975562771755, + "grad_norm": 1.2104203701019287, + "learning_rate": 9.999945210411428e-06, + "loss": 0.9196, + "step": 154 + }, + { + "epoch": 0.008531014365127414, + "grad_norm": 1.1857126951217651, + "learning_rate": 9.999943162381991e-06, + "loss": 0.9421, + "step": 155 + }, + { + "epoch": 0.008586053167483076, + "grad_norm": 1.115027904510498, + "learning_rate": 9.999941076774319e-06, + "loss": 0.9634, + "step": 156 + }, + { + "epoch": 0.008641091969838737, + "grad_norm": 1.4227553606033325, + "learning_rate": 9.999938953588428e-06, + "loss": 1.0036, + "step": 157 + }, + { + "epoch": 0.008696130772194397, + "grad_norm": 1.2913776636123657, + "learning_rate": 9.999936792824334e-06, + "loss": 0.9232, + "step": 158 + }, + { + "epoch": 0.008751169574550058, + "grad_norm": 1.2817318439483643, + "learning_rate": 9.999934594482055e-06, + "loss": 0.9691, + "step": 159 + }, + { + "epoch": 0.008806208376905718, + "grad_norm": 1.5647841691970825, + "learning_rate": 9.999932358561604e-06, + "loss": 1.1842, + "step": 160 + }, + { + "epoch": 0.00886124717926138, + "grad_norm": 1.368135929107666, + "learning_rate": 9.999930085063002e-06, + "loss": 1.0873, + "step": 161 + }, + { + "epoch": 0.00891628598161704, + "grad_norm": 1.2297240495681763, + "learning_rate": 9.999927773986262e-06, + "loss": 1.0778, + "step": 162 + }, + { + "epoch": 0.0089713247839727, + "grad_norm": 1.0658279657363892, + "learning_rate": 9.999925425331405e-06, + "loss": 0.9008, + "step": 163 + }, + { + "epoch": 0.009026363586328362, + "grad_norm": 1.3484326601028442, + "learning_rate": 9.999923039098445e-06, + "loss": 1.0664, + "step": 164 + }, + { + "epoch": 0.009081402388684023, + "grad_norm": 1.1839075088500977, + "learning_rate": 9.999920615287401e-06, + "loss": 0.9257, + "step": 165 + }, + { + "epoch": 0.009136441191039683, + "grad_norm": 1.2757254838943481, + "learning_rate": 9.999918153898295e-06, + "loss": 0.9473, + "step": 166 + }, + { + "epoch": 0.009191479993395344, + "grad_norm": 1.2414579391479492, + "learning_rate": 9.99991565493114e-06, + "loss": 1.1091, + "step": 167 + }, + { + "epoch": 0.009246518795751004, + "grad_norm": 1.2802611589431763, + "learning_rate": 9.999913118385959e-06, + "loss": 1.063, + "step": 168 + }, + { + "epoch": 0.009301557598106665, + "grad_norm": 1.2055327892303467, + "learning_rate": 9.99991054426277e-06, + "loss": 0.8, + "step": 169 + }, + { + "epoch": 0.009356596400462327, + "grad_norm": 1.0391098260879517, + "learning_rate": 9.99990793256159e-06, + "loss": 0.8672, + "step": 170 + }, + { + "epoch": 0.009411635202817986, + "grad_norm": 1.131536602973938, + "learning_rate": 9.99990528328244e-06, + "loss": 0.9569, + "step": 171 + }, + { + "epoch": 0.009466674005173648, + "grad_norm": 1.164307951927185, + "learning_rate": 9.999902596425342e-06, + "loss": 0.9999, + "step": 172 + }, + { + "epoch": 0.009521712807529309, + "grad_norm": 1.2099504470825195, + "learning_rate": 9.999899871990313e-06, + "loss": 0.9994, + "step": 173 + }, + { + "epoch": 0.009576751609884969, + "grad_norm": 1.7294539213180542, + "learning_rate": 9.999897109977376e-06, + "loss": 1.0265, + "step": 174 + }, + { + "epoch": 0.00963179041224063, + "grad_norm": 1.3009883165359497, + "learning_rate": 9.99989431038655e-06, + "loss": 0.9022, + "step": 175 + }, + { + "epoch": 0.00968682921459629, + "grad_norm": 1.1014611721038818, + "learning_rate": 9.999891473217857e-06, + "loss": 0.8476, + "step": 176 + }, + { + "epoch": 0.009741868016951951, + "grad_norm": 1.2410900592803955, + "learning_rate": 9.99988859847132e-06, + "loss": 1.0272, + "step": 177 + }, + { + "epoch": 0.009796906819307612, + "grad_norm": 1.336348295211792, + "learning_rate": 9.999885686146957e-06, + "loss": 0.9456, + "step": 178 + }, + { + "epoch": 0.009851945621663272, + "grad_norm": 1.2931095361709595, + "learning_rate": 9.99988273624479e-06, + "loss": 0.9554, + "step": 179 + }, + { + "epoch": 0.009906984424018933, + "grad_norm": 1.2647838592529297, + "learning_rate": 9.999879748764845e-06, + "loss": 1.0394, + "step": 180 + }, + { + "epoch": 0.009962023226374595, + "grad_norm": 1.3485127687454224, + "learning_rate": 9.99987672370714e-06, + "loss": 1.1016, + "step": 181 + }, + { + "epoch": 0.010017062028730254, + "grad_norm": 1.110187292098999, + "learning_rate": 9.999873661071702e-06, + "loss": 0.946, + "step": 182 + }, + { + "epoch": 0.010072100831085916, + "grad_norm": 1.0991623401641846, + "learning_rate": 9.999870560858551e-06, + "loss": 1.0084, + "step": 183 + }, + { + "epoch": 0.010127139633441576, + "grad_norm": 1.049804449081421, + "learning_rate": 9.999867423067713e-06, + "loss": 0.8264, + "step": 184 + }, + { + "epoch": 0.010182178435797237, + "grad_norm": 1.0947058200836182, + "learning_rate": 9.999864247699207e-06, + "loss": 0.8884, + "step": 185 + }, + { + "epoch": 0.010237217238152898, + "grad_norm": 1.1147902011871338, + "learning_rate": 9.999861034753061e-06, + "loss": 0.9657, + "step": 186 + }, + { + "epoch": 0.010292256040508558, + "grad_norm": 1.260027527809143, + "learning_rate": 9.999857784229298e-06, + "loss": 1.0102, + "step": 187 + }, + { + "epoch": 0.01034729484286422, + "grad_norm": 1.1275582313537598, + "learning_rate": 9.999854496127942e-06, + "loss": 1.028, + "step": 188 + }, + { + "epoch": 0.01040233364521988, + "grad_norm": 1.1377174854278564, + "learning_rate": 9.999851170449018e-06, + "loss": 1.032, + "step": 189 + }, + { + "epoch": 0.01045737244757554, + "grad_norm": 1.1734225749969482, + "learning_rate": 9.999847807192552e-06, + "loss": 1.0009, + "step": 190 + }, + { + "epoch": 0.010512411249931202, + "grad_norm": 1.1934596300125122, + "learning_rate": 9.999844406358565e-06, + "loss": 1.0432, + "step": 191 + }, + { + "epoch": 0.010567450052286861, + "grad_norm": 1.0638024806976318, + "learning_rate": 9.99984096794709e-06, + "loss": 0.8651, + "step": 192 + }, + { + "epoch": 0.010622488854642523, + "grad_norm": 1.2381829023361206, + "learning_rate": 9.999837491958147e-06, + "loss": 1.0088, + "step": 193 + }, + { + "epoch": 0.010677527656998184, + "grad_norm": 1.030246615409851, + "learning_rate": 9.999833978391763e-06, + "loss": 0.9488, + "step": 194 + }, + { + "epoch": 0.010732566459353844, + "grad_norm": 1.1640657186508179, + "learning_rate": 9.999830427247965e-06, + "loss": 1.0588, + "step": 195 + }, + { + "epoch": 0.010787605261709505, + "grad_norm": 1.0431616306304932, + "learning_rate": 9.99982683852678e-06, + "loss": 0.8728, + "step": 196 + }, + { + "epoch": 0.010842644064065167, + "grad_norm": 1.032263159751892, + "learning_rate": 9.999823212228235e-06, + "loss": 0.9498, + "step": 197 + }, + { + "epoch": 0.010897682866420826, + "grad_norm": 1.1383745670318604, + "learning_rate": 9.999819548352358e-06, + "loss": 0.9498, + "step": 198 + }, + { + "epoch": 0.010952721668776488, + "grad_norm": 1.1324639320373535, + "learning_rate": 9.999815846899175e-06, + "loss": 1.0432, + "step": 199 + }, + { + "epoch": 0.011007760471132147, + "grad_norm": 1.188672661781311, + "learning_rate": 9.999812107868714e-06, + "loss": 0.982, + "step": 200 + }, + { + "epoch": 0.011062799273487809, + "grad_norm": 1.1011098623275757, + "learning_rate": 9.999808331261005e-06, + "loss": 0.9587, + "step": 201 + }, + { + "epoch": 0.01111783807584347, + "grad_norm": 1.1782938241958618, + "learning_rate": 9.999804517076073e-06, + "loss": 1.0659, + "step": 202 + }, + { + "epoch": 0.01117287687819913, + "grad_norm": 1.0520117282867432, + "learning_rate": 9.99980066531395e-06, + "loss": 1.0056, + "step": 203 + }, + { + "epoch": 0.011227915680554791, + "grad_norm": 1.1584919691085815, + "learning_rate": 9.999796775974663e-06, + "loss": 0.9435, + "step": 204 + }, + { + "epoch": 0.011282954482910452, + "grad_norm": 1.2201849222183228, + "learning_rate": 9.999792849058242e-06, + "loss": 1.0562, + "step": 205 + }, + { + "epoch": 0.011337993285266112, + "grad_norm": 1.2985976934432983, + "learning_rate": 9.999788884564715e-06, + "loss": 1.0126, + "step": 206 + }, + { + "epoch": 0.011393032087621774, + "grad_norm": 0.9926307201385498, + "learning_rate": 9.999784882494115e-06, + "loss": 0.7875, + "step": 207 + }, + { + "epoch": 0.011448070889977435, + "grad_norm": 1.103365182876587, + "learning_rate": 9.99978084284647e-06, + "loss": 0.9833, + "step": 208 + }, + { + "epoch": 0.011503109692333095, + "grad_norm": 1.1798462867736816, + "learning_rate": 9.99977676562181e-06, + "loss": 0.8479, + "step": 209 + }, + { + "epoch": 0.011558148494688756, + "grad_norm": 1.2887194156646729, + "learning_rate": 9.999772650820168e-06, + "loss": 0.9606, + "step": 210 + }, + { + "epoch": 0.011613187297044416, + "grad_norm": 1.1120634078979492, + "learning_rate": 9.99976849844157e-06, + "loss": 0.9604, + "step": 211 + }, + { + "epoch": 0.011668226099400077, + "grad_norm": 1.1248979568481445, + "learning_rate": 9.999764308486052e-06, + "loss": 0.9428, + "step": 212 + }, + { + "epoch": 0.011723264901755738, + "grad_norm": 1.274610161781311, + "learning_rate": 9.999760080953643e-06, + "loss": 0.9044, + "step": 213 + }, + { + "epoch": 0.011778303704111398, + "grad_norm": 1.1746865510940552, + "learning_rate": 9.999755815844377e-06, + "loss": 0.9114, + "step": 214 + }, + { + "epoch": 0.01183334250646706, + "grad_norm": 1.2531086206436157, + "learning_rate": 9.999751513158282e-06, + "loss": 1.0785, + "step": 215 + }, + { + "epoch": 0.01188838130882272, + "grad_norm": 1.0789539813995361, + "learning_rate": 9.999747172895395e-06, + "loss": 0.9794, + "step": 216 + }, + { + "epoch": 0.01194342011117838, + "grad_norm": 1.1805329322814941, + "learning_rate": 9.999742795055746e-06, + "loss": 0.9602, + "step": 217 + }, + { + "epoch": 0.011998458913534042, + "grad_norm": 2.309329032897949, + "learning_rate": 9.99973837963937e-06, + "loss": 0.9482, + "step": 218 + }, + { + "epoch": 0.012053497715889702, + "grad_norm": 1.2379088401794434, + "learning_rate": 9.999733926646296e-06, + "loss": 1.0237, + "step": 219 + }, + { + "epoch": 0.012108536518245363, + "grad_norm": 1.1581377983093262, + "learning_rate": 9.999729436076562e-06, + "loss": 1.0583, + "step": 220 + }, + { + "epoch": 0.012163575320601024, + "grad_norm": 1.3006727695465088, + "learning_rate": 9.999724907930199e-06, + "loss": 0.9581, + "step": 221 + }, + { + "epoch": 0.012218614122956684, + "grad_norm": 1.3215982913970947, + "learning_rate": 9.999720342207243e-06, + "loss": 0.9438, + "step": 222 + }, + { + "epoch": 0.012273652925312345, + "grad_norm": 1.1107337474822998, + "learning_rate": 9.999715738907727e-06, + "loss": 0.9987, + "step": 223 + }, + { + "epoch": 0.012328691727668007, + "grad_norm": 1.0745457410812378, + "learning_rate": 9.999711098031685e-06, + "loss": 0.9637, + "step": 224 + }, + { + "epoch": 0.012383730530023666, + "grad_norm": 1.110861897468567, + "learning_rate": 9.999706419579154e-06, + "loss": 1.0225, + "step": 225 + }, + { + "epoch": 0.012438769332379328, + "grad_norm": 1.0755527019500732, + "learning_rate": 9.999701703550167e-06, + "loss": 1.0204, + "step": 226 + }, + { + "epoch": 0.012493808134734987, + "grad_norm": 1.1694976091384888, + "learning_rate": 9.99969694994476e-06, + "loss": 1.0566, + "step": 227 + }, + { + "epoch": 0.012548846937090649, + "grad_norm": 1.455856442451477, + "learning_rate": 9.99969215876297e-06, + "loss": 0.9397, + "step": 228 + }, + { + "epoch": 0.01260388573944631, + "grad_norm": 1.0707073211669922, + "learning_rate": 9.99968733000483e-06, + "loss": 0.8286, + "step": 229 + }, + { + "epoch": 0.01265892454180197, + "grad_norm": 1.189548134803772, + "learning_rate": 9.99968246367038e-06, + "loss": 0.8762, + "step": 230 + }, + { + "epoch": 0.012713963344157631, + "grad_norm": 1.1439214944839478, + "learning_rate": 9.999677559759655e-06, + "loss": 0.9187, + "step": 231 + }, + { + "epoch": 0.012769002146513293, + "grad_norm": 1.2329761981964111, + "learning_rate": 9.999672618272691e-06, + "loss": 1.0374, + "step": 232 + }, + { + "epoch": 0.012824040948868952, + "grad_norm": 1.1545134782791138, + "learning_rate": 9.999667639209527e-06, + "loss": 0.9343, + "step": 233 + }, + { + "epoch": 0.012879079751224614, + "grad_norm": 1.0946775674819946, + "learning_rate": 9.999662622570198e-06, + "loss": 0.9568, + "step": 234 + }, + { + "epoch": 0.012934118553580273, + "grad_norm": 1.2099589109420776, + "learning_rate": 9.999657568354743e-06, + "loss": 1.0364, + "step": 235 + }, + { + "epoch": 0.012989157355935935, + "grad_norm": 1.09062922000885, + "learning_rate": 9.999652476563202e-06, + "loss": 1.0289, + "step": 236 + }, + { + "epoch": 0.013044196158291596, + "grad_norm": 1.154557228088379, + "learning_rate": 9.999647347195612e-06, + "loss": 0.9925, + "step": 237 + }, + { + "epoch": 0.013099234960647256, + "grad_norm": 1.025374174118042, + "learning_rate": 9.999642180252008e-06, + "loss": 0.9346, + "step": 238 + }, + { + "epoch": 0.013154273763002917, + "grad_norm": 1.1473641395568848, + "learning_rate": 9.999636975732433e-06, + "loss": 1.0244, + "step": 239 + }, + { + "epoch": 0.013209312565358578, + "grad_norm": 1.0421240329742432, + "learning_rate": 9.999631733636923e-06, + "loss": 0.9368, + "step": 240 + }, + { + "epoch": 0.013264351367714238, + "grad_norm": 1.1076610088348389, + "learning_rate": 9.99962645396552e-06, + "loss": 1.0276, + "step": 241 + }, + { + "epoch": 0.0133193901700699, + "grad_norm": 1.143559455871582, + "learning_rate": 9.999621136718266e-06, + "loss": 0.9626, + "step": 242 + }, + { + "epoch": 0.01337442897242556, + "grad_norm": 1.0958378314971924, + "learning_rate": 9.999615781895195e-06, + "loss": 1.0254, + "step": 243 + }, + { + "epoch": 0.01342946777478122, + "grad_norm": 1.117688536643982, + "learning_rate": 9.99961038949635e-06, + "loss": 0.9685, + "step": 244 + }, + { + "epoch": 0.013484506577136882, + "grad_norm": 1.1645647287368774, + "learning_rate": 9.999604959521771e-06, + "loss": 1.0666, + "step": 245 + }, + { + "epoch": 0.013539545379492542, + "grad_norm": 1.1238516569137573, + "learning_rate": 9.999599491971502e-06, + "loss": 1.0252, + "step": 246 + }, + { + "epoch": 0.013594584181848203, + "grad_norm": 1.0196914672851562, + "learning_rate": 9.999593986845579e-06, + "loss": 0.9389, + "step": 247 + }, + { + "epoch": 0.013649622984203864, + "grad_norm": 1.0231372117996216, + "learning_rate": 9.999588444144049e-06, + "loss": 0.8786, + "step": 248 + }, + { + "epoch": 0.013704661786559524, + "grad_norm": 1.2504147291183472, + "learning_rate": 9.999582863866947e-06, + "loss": 1.0969, + "step": 249 + }, + { + "epoch": 0.013759700588915185, + "grad_norm": 1.1123549938201904, + "learning_rate": 9.99957724601432e-06, + "loss": 0.8833, + "step": 250 + }, + { + "epoch": 0.013814739391270847, + "grad_norm": 1.1068202257156372, + "learning_rate": 9.999571590586208e-06, + "loss": 0.9709, + "step": 251 + }, + { + "epoch": 0.013869778193626506, + "grad_norm": 0.9891651272773743, + "learning_rate": 9.999565897582655e-06, + "loss": 0.8598, + "step": 252 + }, + { + "epoch": 0.013924816995982168, + "grad_norm": 0.9866491556167603, + "learning_rate": 9.999560167003703e-06, + "loss": 0.8101, + "step": 253 + }, + { + "epoch": 0.013979855798337828, + "grad_norm": 1.0862594842910767, + "learning_rate": 9.999554398849396e-06, + "loss": 0.9411, + "step": 254 + }, + { + "epoch": 0.014034894600693489, + "grad_norm": 1.1898949146270752, + "learning_rate": 9.999548593119774e-06, + "loss": 0.9548, + "step": 255 + }, + { + "epoch": 0.01408993340304915, + "grad_norm": 1.2167880535125732, + "learning_rate": 9.999542749814886e-06, + "loss": 1.0302, + "step": 256 + }, + { + "epoch": 0.01414497220540481, + "grad_norm": 1.0784146785736084, + "learning_rate": 9.999536868934771e-06, + "loss": 0.8875, + "step": 257 + }, + { + "epoch": 0.014200011007760471, + "grad_norm": 1.1128027439117432, + "learning_rate": 9.999530950479475e-06, + "loss": 0.9498, + "step": 258 + }, + { + "epoch": 0.014255049810116133, + "grad_norm": 1.1311595439910889, + "learning_rate": 9.999524994449044e-06, + "loss": 0.9035, + "step": 259 + }, + { + "epoch": 0.014310088612471792, + "grad_norm": 1.225615382194519, + "learning_rate": 9.999519000843521e-06, + "loss": 1.0104, + "step": 260 + }, + { + "epoch": 0.014365127414827454, + "grad_norm": 1.2347793579101562, + "learning_rate": 9.99951296966295e-06, + "loss": 1.0288, + "step": 261 + }, + { + "epoch": 0.014420166217183113, + "grad_norm": 1.1837103366851807, + "learning_rate": 9.99950690090738e-06, + "loss": 0.9553, + "step": 262 + }, + { + "epoch": 0.014475205019538775, + "grad_norm": 1.1985397338867188, + "learning_rate": 9.999500794576852e-06, + "loss": 0.9561, + "step": 263 + }, + { + "epoch": 0.014530243821894436, + "grad_norm": 1.036928415298462, + "learning_rate": 9.999494650671418e-06, + "loss": 0.8906, + "step": 264 + }, + { + "epoch": 0.014585282624250096, + "grad_norm": 1.0797842741012573, + "learning_rate": 9.999488469191116e-06, + "loss": 0.8975, + "step": 265 + }, + { + "epoch": 0.014640321426605757, + "grad_norm": 1.0571156740188599, + "learning_rate": 9.999482250136e-06, + "loss": 0.9334, + "step": 266 + }, + { + "epoch": 0.014695360228961419, + "grad_norm": 1.2065023183822632, + "learning_rate": 9.999475993506114e-06, + "loss": 0.8986, + "step": 267 + }, + { + "epoch": 0.014750399031317078, + "grad_norm": 1.201586127281189, + "learning_rate": 9.999469699301502e-06, + "loss": 0.9192, + "step": 268 + }, + { + "epoch": 0.01480543783367274, + "grad_norm": 1.0470168590545654, + "learning_rate": 9.999463367522216e-06, + "loss": 0.8604, + "step": 269 + }, + { + "epoch": 0.0148604766360284, + "grad_norm": 1.1142147779464722, + "learning_rate": 9.9994569981683e-06, + "loss": 0.9847, + "step": 270 + }, + { + "epoch": 0.01491551543838406, + "grad_norm": 1.0352061986923218, + "learning_rate": 9.999450591239805e-06, + "loss": 0.8927, + "step": 271 + }, + { + "epoch": 0.014970554240739722, + "grad_norm": 1.0353184938430786, + "learning_rate": 9.999444146736779e-06, + "loss": 0.8435, + "step": 272 + }, + { + "epoch": 0.015025593043095382, + "grad_norm": 1.2091951370239258, + "learning_rate": 9.999437664659267e-06, + "loss": 0.8959, + "step": 273 + }, + { + "epoch": 0.015080631845451043, + "grad_norm": 1.006361722946167, + "learning_rate": 9.999431145007319e-06, + "loss": 0.8579, + "step": 274 + }, + { + "epoch": 0.015135670647806704, + "grad_norm": 1.1265509128570557, + "learning_rate": 9.999424587780985e-06, + "loss": 0.8808, + "step": 275 + }, + { + "epoch": 0.015190709450162364, + "grad_norm": 1.060882568359375, + "learning_rate": 9.999417992980317e-06, + "loss": 1.044, + "step": 276 + }, + { + "epoch": 0.015245748252518026, + "grad_norm": 1.0216747522354126, + "learning_rate": 9.999411360605358e-06, + "loss": 0.7773, + "step": 277 + }, + { + "epoch": 0.015300787054873685, + "grad_norm": 1.1382462978363037, + "learning_rate": 9.999404690656163e-06, + "loss": 0.8954, + "step": 278 + }, + { + "epoch": 0.015355825857229347, + "grad_norm": 1.113815188407898, + "learning_rate": 9.99939798313278e-06, + "loss": 0.8143, + "step": 279 + }, + { + "epoch": 0.015410864659585008, + "grad_norm": 1.123530387878418, + "learning_rate": 9.99939123803526e-06, + "loss": 0.8872, + "step": 280 + }, + { + "epoch": 0.015465903461940668, + "grad_norm": 1.0873669385910034, + "learning_rate": 9.999384455363656e-06, + "loss": 1.008, + "step": 281 + }, + { + "epoch": 0.015520942264296329, + "grad_norm": 1.5956637859344482, + "learning_rate": 9.999377635118014e-06, + "loss": 0.9456, + "step": 282 + }, + { + "epoch": 0.01557598106665199, + "grad_norm": 1.1471425294876099, + "learning_rate": 9.999370777298389e-06, + "loss": 0.9897, + "step": 283 + }, + { + "epoch": 0.01563101986900765, + "grad_norm": 0.9960193634033203, + "learning_rate": 9.999363881904831e-06, + "loss": 0.8196, + "step": 284 + }, + { + "epoch": 0.01568605867136331, + "grad_norm": 1.1033951044082642, + "learning_rate": 9.999356948937393e-06, + "loss": 0.879, + "step": 285 + }, + { + "epoch": 0.015741097473718973, + "grad_norm": 1.157765507698059, + "learning_rate": 9.999349978396126e-06, + "loss": 1.0116, + "step": 286 + }, + { + "epoch": 0.015796136276074634, + "grad_norm": 1.0472352504730225, + "learning_rate": 9.999342970281084e-06, + "loss": 0.8657, + "step": 287 + }, + { + "epoch": 0.015851175078430292, + "grad_norm": 1.1346659660339355, + "learning_rate": 9.999335924592315e-06, + "loss": 0.8482, + "step": 288 + }, + { + "epoch": 0.015906213880785953, + "grad_norm": 1.1164487600326538, + "learning_rate": 9.999328841329879e-06, + "loss": 1.0542, + "step": 289 + }, + { + "epoch": 0.015961252683141615, + "grad_norm": 1.1890591382980347, + "learning_rate": 9.999321720493825e-06, + "loss": 0.9598, + "step": 290 + }, + { + "epoch": 0.016016291485497276, + "grad_norm": 1.0419867038726807, + "learning_rate": 9.999314562084205e-06, + "loss": 0.9548, + "step": 291 + }, + { + "epoch": 0.016071330287852938, + "grad_norm": 1.0652042627334595, + "learning_rate": 9.999307366101077e-06, + "loss": 0.9359, + "step": 292 + }, + { + "epoch": 0.016126369090208596, + "grad_norm": 1.0166404247283936, + "learning_rate": 9.999300132544492e-06, + "loss": 0.9276, + "step": 293 + }, + { + "epoch": 0.016181407892564257, + "grad_norm": 1.1638866662979126, + "learning_rate": 9.999292861414507e-06, + "loss": 0.957, + "step": 294 + }, + { + "epoch": 0.01623644669491992, + "grad_norm": 1.5505993366241455, + "learning_rate": 9.999285552711173e-06, + "loss": 0.9878, + "step": 295 + }, + { + "epoch": 0.01629148549727558, + "grad_norm": 1.177262783050537, + "learning_rate": 9.999278206434549e-06, + "loss": 0.8631, + "step": 296 + }, + { + "epoch": 0.01634652429963124, + "grad_norm": 1.8578168153762817, + "learning_rate": 9.999270822584687e-06, + "loss": 0.9684, + "step": 297 + }, + { + "epoch": 0.0164015631019869, + "grad_norm": 1.2617360353469849, + "learning_rate": 9.999263401161643e-06, + "loss": 1.014, + "step": 298 + }, + { + "epoch": 0.01645660190434256, + "grad_norm": 0.9740132689476013, + "learning_rate": 9.999255942165475e-06, + "loss": 0.8606, + "step": 299 + }, + { + "epoch": 0.016511640706698222, + "grad_norm": 0.9821745753288269, + "learning_rate": 9.999248445596238e-06, + "loss": 0.8241, + "step": 300 + }, + { + "epoch": 0.016566679509053883, + "grad_norm": 1.0200445652008057, + "learning_rate": 9.999240911453986e-06, + "loss": 0.8256, + "step": 301 + }, + { + "epoch": 0.016621718311409545, + "grad_norm": 1.4100390672683716, + "learning_rate": 9.999233339738779e-06, + "loss": 0.9057, + "step": 302 + }, + { + "epoch": 0.016676757113765206, + "grad_norm": 1.056544303894043, + "learning_rate": 9.99922573045067e-06, + "loss": 1.0808, + "step": 303 + }, + { + "epoch": 0.016731795916120864, + "grad_norm": 0.9271026253700256, + "learning_rate": 9.99921808358972e-06, + "loss": 0.878, + "step": 304 + }, + { + "epoch": 0.016786834718476525, + "grad_norm": 0.9864157438278198, + "learning_rate": 9.999210399155987e-06, + "loss": 0.9198, + "step": 305 + }, + { + "epoch": 0.016841873520832187, + "grad_norm": 1.093995451927185, + "learning_rate": 9.999202677149525e-06, + "loss": 0.9794, + "step": 306 + }, + { + "epoch": 0.016896912323187848, + "grad_norm": 0.9717912077903748, + "learning_rate": 9.999194917570395e-06, + "loss": 0.8764, + "step": 307 + }, + { + "epoch": 0.01695195112554351, + "grad_norm": 1.0026428699493408, + "learning_rate": 9.999187120418653e-06, + "loss": 0.8526, + "step": 308 + }, + { + "epoch": 0.017006989927899167, + "grad_norm": 1.122870922088623, + "learning_rate": 9.999179285694359e-06, + "loss": 0.9773, + "step": 309 + }, + { + "epoch": 0.01706202873025483, + "grad_norm": 1.0522836446762085, + "learning_rate": 9.999171413397572e-06, + "loss": 1.0183, + "step": 310 + }, + { + "epoch": 0.01711706753261049, + "grad_norm": 0.9303658604621887, + "learning_rate": 9.99916350352835e-06, + "loss": 0.8402, + "step": 311 + }, + { + "epoch": 0.01717210633496615, + "grad_norm": 0.9606096148490906, + "learning_rate": 9.999155556086755e-06, + "loss": 0.9692, + "step": 312 + }, + { + "epoch": 0.017227145137321813, + "grad_norm": 1.176992416381836, + "learning_rate": 9.999147571072844e-06, + "loss": 0.8172, + "step": 313 + }, + { + "epoch": 0.017282183939677474, + "grad_norm": 1.1948801279067993, + "learning_rate": 9.999139548486678e-06, + "loss": 1.0205, + "step": 314 + }, + { + "epoch": 0.017337222742033132, + "grad_norm": 1.0064897537231445, + "learning_rate": 9.999131488328318e-06, + "loss": 0.9479, + "step": 315 + }, + { + "epoch": 0.017392261544388794, + "grad_norm": 1.048242449760437, + "learning_rate": 9.999123390597822e-06, + "loss": 0.9862, + "step": 316 + }, + { + "epoch": 0.017447300346744455, + "grad_norm": 1.12875497341156, + "learning_rate": 9.999115255295256e-06, + "loss": 0.9743, + "step": 317 + }, + { + "epoch": 0.017502339149100116, + "grad_norm": 1.0607460737228394, + "learning_rate": 9.999107082420674e-06, + "loss": 0.8878, + "step": 318 + }, + { + "epoch": 0.017557377951455778, + "grad_norm": 1.1480191946029663, + "learning_rate": 9.999098871974144e-06, + "loss": 0.8769, + "step": 319 + }, + { + "epoch": 0.017612416753811436, + "grad_norm": 1.1150004863739014, + "learning_rate": 9.999090623955724e-06, + "loss": 0.8615, + "step": 320 + }, + { + "epoch": 0.017667455556167097, + "grad_norm": 1.137839913368225, + "learning_rate": 9.999082338365478e-06, + "loss": 0.9703, + "step": 321 + }, + { + "epoch": 0.01772249435852276, + "grad_norm": 1.0883489847183228, + "learning_rate": 9.999074015203467e-06, + "loss": 0.9273, + "step": 322 + }, + { + "epoch": 0.01777753316087842, + "grad_norm": 1.0999557971954346, + "learning_rate": 9.999065654469752e-06, + "loss": 0.9605, + "step": 323 + }, + { + "epoch": 0.01783257196323408, + "grad_norm": 0.9911689758300781, + "learning_rate": 9.999057256164401e-06, + "loss": 0.9117, + "step": 324 + }, + { + "epoch": 0.01788761076558974, + "grad_norm": 1.040933609008789, + "learning_rate": 9.999048820287472e-06, + "loss": 0.9229, + "step": 325 + }, + { + "epoch": 0.0179426495679454, + "grad_norm": 1.4341392517089844, + "learning_rate": 9.999040346839031e-06, + "loss": 1.0718, + "step": 326 + }, + { + "epoch": 0.017997688370301062, + "grad_norm": 1.0246332883834839, + "learning_rate": 9.99903183581914e-06, + "loss": 0.9617, + "step": 327 + }, + { + "epoch": 0.018052727172656723, + "grad_norm": 10.162322998046875, + "learning_rate": 9.999023287227863e-06, + "loss": 1.0391, + "step": 328 + }, + { + "epoch": 0.018107765975012385, + "grad_norm": 1.3370027542114258, + "learning_rate": 9.999014701065266e-06, + "loss": 1.0211, + "step": 329 + }, + { + "epoch": 0.018162804777368046, + "grad_norm": 1.0146219730377197, + "learning_rate": 9.999006077331413e-06, + "loss": 0.8611, + "step": 330 + }, + { + "epoch": 0.018217843579723704, + "grad_norm": 1.0899269580841064, + "learning_rate": 9.998997416026368e-06, + "loss": 0.9209, + "step": 331 + }, + { + "epoch": 0.018272882382079365, + "grad_norm": 1.1343204975128174, + "learning_rate": 9.998988717150198e-06, + "loss": 0.9405, + "step": 332 + }, + { + "epoch": 0.018327921184435027, + "grad_norm": 1.2308380603790283, + "learning_rate": 9.998979980702965e-06, + "loss": 0.9579, + "step": 333 + }, + { + "epoch": 0.018382959986790688, + "grad_norm": 1.1433519124984741, + "learning_rate": 9.998971206684737e-06, + "loss": 1.0045, + "step": 334 + }, + { + "epoch": 0.01843799878914635, + "grad_norm": 1.0585781335830688, + "learning_rate": 9.99896239509558e-06, + "loss": 0.9171, + "step": 335 + }, + { + "epoch": 0.018493037591502007, + "grad_norm": 1.2735164165496826, + "learning_rate": 9.99895354593556e-06, + "loss": 1.1001, + "step": 336 + }, + { + "epoch": 0.01854807639385767, + "grad_norm": 1.2905755043029785, + "learning_rate": 9.998944659204744e-06, + "loss": 1.0294, + "step": 337 + }, + { + "epoch": 0.01860311519621333, + "grad_norm": 1.1442075967788696, + "learning_rate": 9.998935734903198e-06, + "loss": 0.9385, + "step": 338 + }, + { + "epoch": 0.01865815399856899, + "grad_norm": 1.1005232334136963, + "learning_rate": 9.998926773030987e-06, + "loss": 1.026, + "step": 339 + }, + { + "epoch": 0.018713192800924653, + "grad_norm": 1.2770785093307495, + "learning_rate": 9.998917773588182e-06, + "loss": 1.0015, + "step": 340 + }, + { + "epoch": 0.01876823160328031, + "grad_norm": 1.0963070392608643, + "learning_rate": 9.998908736574849e-06, + "loss": 0.9347, + "step": 341 + }, + { + "epoch": 0.018823270405635972, + "grad_norm": 1.10364830493927, + "learning_rate": 9.998899661991055e-06, + "loss": 0.869, + "step": 342 + }, + { + "epoch": 0.018878309207991634, + "grad_norm": 1.0364975929260254, + "learning_rate": 9.99889054983687e-06, + "loss": 0.9855, + "step": 343 + }, + { + "epoch": 0.018933348010347295, + "grad_norm": 1.104702115058899, + "learning_rate": 9.998881400112362e-06, + "loss": 0.9555, + "step": 344 + }, + { + "epoch": 0.018988386812702956, + "grad_norm": 0.9957441687583923, + "learning_rate": 9.998872212817599e-06, + "loss": 0.9634, + "step": 345 + }, + { + "epoch": 0.019043425615058618, + "grad_norm": 1.262271523475647, + "learning_rate": 9.998862987952651e-06, + "loss": 1.0133, + "step": 346 + }, + { + "epoch": 0.019098464417414276, + "grad_norm": 1.2075226306915283, + "learning_rate": 9.998853725517587e-06, + "loss": 1.0588, + "step": 347 + }, + { + "epoch": 0.019153503219769937, + "grad_norm": 1.0609898567199707, + "learning_rate": 9.998844425512477e-06, + "loss": 0.9952, + "step": 348 + }, + { + "epoch": 0.0192085420221256, + "grad_norm": 1.1930195093154907, + "learning_rate": 9.998835087937389e-06, + "loss": 0.9617, + "step": 349 + }, + { + "epoch": 0.01926358082448126, + "grad_norm": 1.2359932661056519, + "learning_rate": 9.998825712792396e-06, + "loss": 0.8768, + "step": 350 + }, + { + "epoch": 0.01931861962683692, + "grad_norm": 0.9984115362167358, + "learning_rate": 9.998816300077566e-06, + "loss": 0.8205, + "step": 351 + }, + { + "epoch": 0.01937365842919258, + "grad_norm": 1.6853677034378052, + "learning_rate": 9.998806849792972e-06, + "loss": 0.9066, + "step": 352 + }, + { + "epoch": 0.01942869723154824, + "grad_norm": 1.2869856357574463, + "learning_rate": 9.998797361938683e-06, + "loss": 1.0054, + "step": 353 + }, + { + "epoch": 0.019483736033903902, + "grad_norm": 1.2791584730148315, + "learning_rate": 9.99878783651477e-06, + "loss": 0.7627, + "step": 354 + }, + { + "epoch": 0.019538774836259563, + "grad_norm": 1.0795867443084717, + "learning_rate": 9.998778273521307e-06, + "loss": 0.9343, + "step": 355 + }, + { + "epoch": 0.019593813638615225, + "grad_norm": 1.0926088094711304, + "learning_rate": 9.998768672958365e-06, + "loss": 0.943, + "step": 356 + }, + { + "epoch": 0.019648852440970886, + "grad_norm": 1.0530847311019897, + "learning_rate": 9.998759034826015e-06, + "loss": 0.9656, + "step": 357 + }, + { + "epoch": 0.019703891243326544, + "grad_norm": 1.1793400049209595, + "learning_rate": 9.99874935912433e-06, + "loss": 0.9799, + "step": 358 + }, + { + "epoch": 0.019758930045682205, + "grad_norm": 1.0726191997528076, + "learning_rate": 9.998739645853383e-06, + "loss": 0.8739, + "step": 359 + }, + { + "epoch": 0.019813968848037867, + "grad_norm": 1.0488981008529663, + "learning_rate": 9.998729895013246e-06, + "loss": 0.8986, + "step": 360 + }, + { + "epoch": 0.019869007650393528, + "grad_norm": 1.8267477750778198, + "learning_rate": 9.998720106603993e-06, + "loss": 0.9175, + "step": 361 + }, + { + "epoch": 0.01992404645274919, + "grad_norm": 0.9868306517601013, + "learning_rate": 9.9987102806257e-06, + "loss": 0.9609, + "step": 362 + }, + { + "epoch": 0.019979085255104848, + "grad_norm": 1.0171183347702026, + "learning_rate": 9.998700417078438e-06, + "loss": 0.8904, + "step": 363 + }, + { + "epoch": 0.02003412405746051, + "grad_norm": 0.9800812602043152, + "learning_rate": 9.998690515962282e-06, + "loss": 0.8344, + "step": 364 + }, + { + "epoch": 0.02008916285981617, + "grad_norm": 1.024707317352295, + "learning_rate": 9.998680577277304e-06, + "loss": 0.9026, + "step": 365 + }, + { + "epoch": 0.02014420166217183, + "grad_norm": 1.1056619882583618, + "learning_rate": 9.998670601023584e-06, + "loss": 1.017, + "step": 366 + }, + { + "epoch": 0.020199240464527493, + "grad_norm": 1.0555908679962158, + "learning_rate": 9.998660587201191e-06, + "loss": 0.9627, + "step": 367 + }, + { + "epoch": 0.02025427926688315, + "grad_norm": 0.9502031803131104, + "learning_rate": 9.998650535810204e-06, + "loss": 0.935, + "step": 368 + }, + { + "epoch": 0.020309318069238812, + "grad_norm": 1.0355613231658936, + "learning_rate": 9.998640446850699e-06, + "loss": 0.9946, + "step": 369 + }, + { + "epoch": 0.020364356871594474, + "grad_norm": 0.9906355142593384, + "learning_rate": 9.99863032032275e-06, + "loss": 0.9389, + "step": 370 + }, + { + "epoch": 0.020419395673950135, + "grad_norm": 0.9483911395072937, + "learning_rate": 9.99862015622643e-06, + "loss": 0.979, + "step": 371 + }, + { + "epoch": 0.020474434476305797, + "grad_norm": 0.9769986271858215, + "learning_rate": 9.998609954561822e-06, + "loss": 0.8972, + "step": 372 + }, + { + "epoch": 0.020529473278661458, + "grad_norm": 1.1682699918746948, + "learning_rate": 9.998599715329e-06, + "loss": 0.943, + "step": 373 + }, + { + "epoch": 0.020584512081017116, + "grad_norm": 1.007912516593933, + "learning_rate": 9.99858943852804e-06, + "loss": 0.8825, + "step": 374 + }, + { + "epoch": 0.020639550883372777, + "grad_norm": 0.9788785576820374, + "learning_rate": 9.99857912415902e-06, + "loss": 0.9667, + "step": 375 + }, + { + "epoch": 0.02069458968572844, + "grad_norm": 1.0804275274276733, + "learning_rate": 9.998568772222017e-06, + "loss": 1.0026, + "step": 376 + }, + { + "epoch": 0.0207496284880841, + "grad_norm": 1.0859237909317017, + "learning_rate": 9.998558382717109e-06, + "loss": 0.9592, + "step": 377 + }, + { + "epoch": 0.02080466729043976, + "grad_norm": 1.2925337553024292, + "learning_rate": 9.998547955644373e-06, + "loss": 0.9067, + "step": 378 + }, + { + "epoch": 0.02085970609279542, + "grad_norm": 0.9853373765945435, + "learning_rate": 9.99853749100389e-06, + "loss": 0.9538, + "step": 379 + }, + { + "epoch": 0.02091474489515108, + "grad_norm": 1.0461076498031616, + "learning_rate": 9.998526988795738e-06, + "loss": 0.9261, + "step": 380 + }, + { + "epoch": 0.020969783697506742, + "grad_norm": 1.024559497833252, + "learning_rate": 9.998516449019995e-06, + "loss": 0.9117, + "step": 381 + }, + { + "epoch": 0.021024822499862404, + "grad_norm": 1.1474825143814087, + "learning_rate": 9.998505871676739e-06, + "loss": 1.0177, + "step": 382 + }, + { + "epoch": 0.021079861302218065, + "grad_norm": 0.9587596654891968, + "learning_rate": 9.998495256766051e-06, + "loss": 0.8809, + "step": 383 + }, + { + "epoch": 0.021134900104573723, + "grad_norm": 0.9505122303962708, + "learning_rate": 9.998484604288013e-06, + "loss": 0.9266, + "step": 384 + }, + { + "epoch": 0.021189938906929384, + "grad_norm": 0.9625647664070129, + "learning_rate": 9.9984739142427e-06, + "loss": 0.9073, + "step": 385 + }, + { + "epoch": 0.021244977709285046, + "grad_norm": 0.9650934338569641, + "learning_rate": 9.998463186630196e-06, + "loss": 0.9042, + "step": 386 + }, + { + "epoch": 0.021300016511640707, + "grad_norm": 1.0289491415023804, + "learning_rate": 9.99845242145058e-06, + "loss": 0.929, + "step": 387 + }, + { + "epoch": 0.02135505531399637, + "grad_norm": 0.9543869495391846, + "learning_rate": 9.998441618703935e-06, + "loss": 0.9406, + "step": 388 + }, + { + "epoch": 0.02141009411635203, + "grad_norm": 0.9276942610740662, + "learning_rate": 9.99843077839034e-06, + "loss": 0.8982, + "step": 389 + }, + { + "epoch": 0.021465132918707688, + "grad_norm": 0.9264664053916931, + "learning_rate": 9.998419900509877e-06, + "loss": 0.7255, + "step": 390 + }, + { + "epoch": 0.02152017172106335, + "grad_norm": 0.9961187243461609, + "learning_rate": 9.998408985062628e-06, + "loss": 0.9826, + "step": 391 + }, + { + "epoch": 0.02157521052341901, + "grad_norm": 0.966596245765686, + "learning_rate": 9.998398032048676e-06, + "loss": 0.8159, + "step": 392 + }, + { + "epoch": 0.021630249325774672, + "grad_norm": 1.1336095333099365, + "learning_rate": 9.998387041468102e-06, + "loss": 0.9289, + "step": 393 + }, + { + "epoch": 0.021685288128130333, + "grad_norm": 1.0453619956970215, + "learning_rate": 9.998376013320989e-06, + "loss": 0.8816, + "step": 394 + }, + { + "epoch": 0.02174032693048599, + "grad_norm": 0.8961821794509888, + "learning_rate": 9.998364947607419e-06, + "loss": 0.871, + "step": 395 + }, + { + "epoch": 0.021795365732841653, + "grad_norm": 1.3420332670211792, + "learning_rate": 9.998353844327477e-06, + "loss": 0.9338, + "step": 396 + }, + { + "epoch": 0.021850404535197314, + "grad_norm": 0.9635335206985474, + "learning_rate": 9.998342703481246e-06, + "loss": 0.9592, + "step": 397 + }, + { + "epoch": 0.021905443337552975, + "grad_norm": 1.3322341442108154, + "learning_rate": 9.998331525068807e-06, + "loss": 1.0974, + "step": 398 + }, + { + "epoch": 0.021960482139908637, + "grad_norm": 1.017220377922058, + "learning_rate": 9.998320309090247e-06, + "loss": 0.9827, + "step": 399 + }, + { + "epoch": 0.022015520942264295, + "grad_norm": 1.0080329179763794, + "learning_rate": 9.99830905554565e-06, + "loss": 0.877, + "step": 400 + }, + { + "epoch": 0.022070559744619956, + "grad_norm": 0.9883211255073547, + "learning_rate": 9.998297764435101e-06, + "loss": 0.9625, + "step": 401 + }, + { + "epoch": 0.022125598546975617, + "grad_norm": 1.0948412418365479, + "learning_rate": 9.998286435758684e-06, + "loss": 0.9058, + "step": 402 + }, + { + "epoch": 0.02218063734933128, + "grad_norm": 0.9402000308036804, + "learning_rate": 9.998275069516482e-06, + "loss": 0.8882, + "step": 403 + }, + { + "epoch": 0.02223567615168694, + "grad_norm": 0.9858806133270264, + "learning_rate": 9.998263665708583e-06, + "loss": 0.9086, + "step": 404 + }, + { + "epoch": 0.0222907149540426, + "grad_norm": 1.0556131601333618, + "learning_rate": 9.998252224335073e-06, + "loss": 0.9583, + "step": 405 + }, + { + "epoch": 0.02234575375639826, + "grad_norm": 1.092766284942627, + "learning_rate": 9.998240745396037e-06, + "loss": 0.9124, + "step": 406 + }, + { + "epoch": 0.02240079255875392, + "grad_norm": 1.1902250051498413, + "learning_rate": 9.998229228891563e-06, + "loss": 1.0566, + "step": 407 + }, + { + "epoch": 0.022455831361109582, + "grad_norm": 1.067906141281128, + "learning_rate": 9.998217674821734e-06, + "loss": 0.9823, + "step": 408 + }, + { + "epoch": 0.022510870163465244, + "grad_norm": 1.0051710605621338, + "learning_rate": 9.998206083186638e-06, + "loss": 0.9141, + "step": 409 + }, + { + "epoch": 0.022565908965820905, + "grad_norm": 1.046412467956543, + "learning_rate": 9.998194453986367e-06, + "loss": 0.9439, + "step": 410 + }, + { + "epoch": 0.022620947768176563, + "grad_norm": 1.1103553771972656, + "learning_rate": 9.998182787221e-06, + "loss": 0.9494, + "step": 411 + }, + { + "epoch": 0.022675986570532224, + "grad_norm": 1.0508466958999634, + "learning_rate": 9.998171082890632e-06, + "loss": 0.9202, + "step": 412 + }, + { + "epoch": 0.022731025372887886, + "grad_norm": 1.1364226341247559, + "learning_rate": 9.998159340995347e-06, + "loss": 0.9859, + "step": 413 + }, + { + "epoch": 0.022786064175243547, + "grad_norm": 1.2073607444763184, + "learning_rate": 9.998147561535234e-06, + "loss": 0.8883, + "step": 414 + }, + { + "epoch": 0.02284110297759921, + "grad_norm": 1.0657012462615967, + "learning_rate": 9.998135744510384e-06, + "loss": 0.8321, + "step": 415 + }, + { + "epoch": 0.02289614177995487, + "grad_norm": 1.0101548433303833, + "learning_rate": 9.998123889920881e-06, + "loss": 0.9374, + "step": 416 + }, + { + "epoch": 0.022951180582310528, + "grad_norm": 1.057455062866211, + "learning_rate": 9.998111997766817e-06, + "loss": 0.8831, + "step": 417 + }, + { + "epoch": 0.02300621938466619, + "grad_norm": 1.206092357635498, + "learning_rate": 9.998100068048282e-06, + "loss": 0.8812, + "step": 418 + }, + { + "epoch": 0.02306125818702185, + "grad_norm": 1.0709773302078247, + "learning_rate": 9.998088100765366e-06, + "loss": 0.9486, + "step": 419 + }, + { + "epoch": 0.023116296989377512, + "grad_norm": 1.066469669342041, + "learning_rate": 9.998076095918156e-06, + "loss": 1.0229, + "step": 420 + }, + { + "epoch": 0.023171335791733173, + "grad_norm": 1.0443583726882935, + "learning_rate": 9.998064053506744e-06, + "loss": 0.8615, + "step": 421 + }, + { + "epoch": 0.02322637459408883, + "grad_norm": 1.103096842765808, + "learning_rate": 9.99805197353122e-06, + "loss": 0.9909, + "step": 422 + }, + { + "epoch": 0.023281413396444493, + "grad_norm": 0.9804643392562866, + "learning_rate": 9.998039855991677e-06, + "loss": 0.9214, + "step": 423 + }, + { + "epoch": 0.023336452198800154, + "grad_norm": 0.9880676865577698, + "learning_rate": 9.998027700888202e-06, + "loss": 0.9345, + "step": 424 + }, + { + "epoch": 0.023391491001155815, + "grad_norm": 0.9633826017379761, + "learning_rate": 9.99801550822089e-06, + "loss": 0.9897, + "step": 425 + }, + { + "epoch": 0.023446529803511477, + "grad_norm": 1.0159331560134888, + "learning_rate": 9.998003277989831e-06, + "loss": 0.9385, + "step": 426 + }, + { + "epoch": 0.023501568605867135, + "grad_norm": 1.009667158126831, + "learning_rate": 9.99799101019512e-06, + "loss": 0.9013, + "step": 427 + }, + { + "epoch": 0.023556607408222796, + "grad_norm": 0.9478578567504883, + "learning_rate": 9.997978704836842e-06, + "loss": 0.8775, + "step": 428 + }, + { + "epoch": 0.023611646210578457, + "grad_norm": 1.013181447982788, + "learning_rate": 9.997966361915096e-06, + "loss": 0.8797, + "step": 429 + }, + { + "epoch": 0.02366668501293412, + "grad_norm": 1.0337481498718262, + "learning_rate": 9.997953981429974e-06, + "loss": 1.0047, + "step": 430 + }, + { + "epoch": 0.02372172381528978, + "grad_norm": 0.9423721432685852, + "learning_rate": 9.997941563381566e-06, + "loss": 0.8639, + "step": 431 + }, + { + "epoch": 0.02377676261764544, + "grad_norm": 1.100492000579834, + "learning_rate": 9.997929107769968e-06, + "loss": 1.0022, + "step": 432 + }, + { + "epoch": 0.0238318014200011, + "grad_norm": 1.1232364177703857, + "learning_rate": 9.997916614595272e-06, + "loss": 0.9145, + "step": 433 + }, + { + "epoch": 0.02388684022235676, + "grad_norm": 0.9466833472251892, + "learning_rate": 9.997904083857572e-06, + "loss": 0.9397, + "step": 434 + }, + { + "epoch": 0.023941879024712422, + "grad_norm": 0.9514566659927368, + "learning_rate": 9.997891515556963e-06, + "loss": 0.8025, + "step": 435 + }, + { + "epoch": 0.023996917827068084, + "grad_norm": 0.9292222261428833, + "learning_rate": 9.997878909693539e-06, + "loss": 0.7739, + "step": 436 + }, + { + "epoch": 0.024051956629423745, + "grad_norm": 1.1049963235855103, + "learning_rate": 9.997866266267397e-06, + "loss": 0.9439, + "step": 437 + }, + { + "epoch": 0.024106995431779403, + "grad_norm": 1.0938019752502441, + "learning_rate": 9.997853585278627e-06, + "loss": 0.9479, + "step": 438 + }, + { + "epoch": 0.024162034234135064, + "grad_norm": 1.0423611402511597, + "learning_rate": 9.997840866727331e-06, + "loss": 0.9309, + "step": 439 + }, + { + "epoch": 0.024217073036490726, + "grad_norm": 1.0584756135940552, + "learning_rate": 9.997828110613598e-06, + "loss": 1.0218, + "step": 440 + }, + { + "epoch": 0.024272111838846387, + "grad_norm": 0.9986408948898315, + "learning_rate": 9.997815316937527e-06, + "loss": 0.9734, + "step": 441 + }, + { + "epoch": 0.02432715064120205, + "grad_norm": 0.9680983424186707, + "learning_rate": 9.997802485699215e-06, + "loss": 0.9286, + "step": 442 + }, + { + "epoch": 0.024382189443557706, + "grad_norm": 1.2231700420379639, + "learning_rate": 9.997789616898757e-06, + "loss": 0.8083, + "step": 443 + }, + { + "epoch": 0.024437228245913368, + "grad_norm": 1.0064021348953247, + "learning_rate": 9.99777671053625e-06, + "loss": 0.9161, + "step": 444 + }, + { + "epoch": 0.02449226704826903, + "grad_norm": 0.9658541679382324, + "learning_rate": 9.99776376661179e-06, + "loss": 0.8027, + "step": 445 + }, + { + "epoch": 0.02454730585062469, + "grad_norm": 0.9440343379974365, + "learning_rate": 9.997750785125477e-06, + "loss": 0.9124, + "step": 446 + }, + { + "epoch": 0.024602344652980352, + "grad_norm": 0.998792827129364, + "learning_rate": 9.997737766077404e-06, + "loss": 0.8699, + "step": 447 + }, + { + "epoch": 0.024657383455336013, + "grad_norm": 1.430880069732666, + "learning_rate": 9.997724709467676e-06, + "loss": 0.9158, + "step": 448 + }, + { + "epoch": 0.02471242225769167, + "grad_norm": 0.9737820029258728, + "learning_rate": 9.997711615296384e-06, + "loss": 0.9496, + "step": 449 + }, + { + "epoch": 0.024767461060047333, + "grad_norm": 0.9710075855255127, + "learning_rate": 9.997698483563629e-06, + "loss": 0.8714, + "step": 450 + }, + { + "epoch": 0.024822499862402994, + "grad_norm": 1.5286253690719604, + "learning_rate": 9.997685314269511e-06, + "loss": 0.8421, + "step": 451 + }, + { + "epoch": 0.024877538664758655, + "grad_norm": 1.0269445180892944, + "learning_rate": 9.99767210741413e-06, + "loss": 1.0131, + "step": 452 + }, + { + "epoch": 0.024932577467114317, + "grad_norm": 0.9780508279800415, + "learning_rate": 9.99765886299758e-06, + "loss": 0.9897, + "step": 453 + }, + { + "epoch": 0.024987616269469975, + "grad_norm": 0.998332679271698, + "learning_rate": 9.997645581019965e-06, + "loss": 0.9647, + "step": 454 + }, + { + "epoch": 0.025042655071825636, + "grad_norm": 1.7062602043151855, + "learning_rate": 9.997632261481383e-06, + "loss": 1.0729, + "step": 455 + }, + { + "epoch": 0.025097693874181298, + "grad_norm": 0.9793694615364075, + "learning_rate": 9.997618904381936e-06, + "loss": 0.9556, + "step": 456 + }, + { + "epoch": 0.02515273267653696, + "grad_norm": 1.0183895826339722, + "learning_rate": 9.997605509721721e-06, + "loss": 0.9194, + "step": 457 + }, + { + "epoch": 0.02520777147889262, + "grad_norm": 1.0288400650024414, + "learning_rate": 9.997592077500844e-06, + "loss": 0.955, + "step": 458 + }, + { + "epoch": 0.025262810281248282, + "grad_norm": 0.9551253914833069, + "learning_rate": 9.997578607719401e-06, + "loss": 0.8498, + "step": 459 + }, + { + "epoch": 0.02531784908360394, + "grad_norm": 0.9648008942604065, + "learning_rate": 9.997565100377494e-06, + "loss": 0.9306, + "step": 460 + }, + { + "epoch": 0.0253728878859596, + "grad_norm": 0.9206677675247192, + "learning_rate": 9.997551555475225e-06, + "loss": 0.7874, + "step": 461 + }, + { + "epoch": 0.025427926688315262, + "grad_norm": 1.0479545593261719, + "learning_rate": 9.997537973012698e-06, + "loss": 0.9201, + "step": 462 + }, + { + "epoch": 0.025482965490670924, + "grad_norm": 1.0329946279525757, + "learning_rate": 9.997524352990013e-06, + "loss": 0.9577, + "step": 463 + }, + { + "epoch": 0.025538004293026585, + "grad_norm": 1.1177828311920166, + "learning_rate": 9.997510695407273e-06, + "loss": 1.0041, + "step": 464 + }, + { + "epoch": 0.025593043095382243, + "grad_norm": 1.0351577997207642, + "learning_rate": 9.99749700026458e-06, + "loss": 0.9952, + "step": 465 + }, + { + "epoch": 0.025648081897737905, + "grad_norm": 0.905274510383606, + "learning_rate": 9.997483267562035e-06, + "loss": 0.8185, + "step": 466 + }, + { + "epoch": 0.025703120700093566, + "grad_norm": 1.0749776363372803, + "learning_rate": 9.997469497299747e-06, + "loss": 1.0611, + "step": 467 + }, + { + "epoch": 0.025758159502449227, + "grad_norm": 0.8972223401069641, + "learning_rate": 9.997455689477815e-06, + "loss": 0.8994, + "step": 468 + }, + { + "epoch": 0.02581319830480489, + "grad_norm": 1.0669914484024048, + "learning_rate": 9.997441844096342e-06, + "loss": 1.06, + "step": 469 + }, + { + "epoch": 0.025868237107160547, + "grad_norm": 1.0431914329528809, + "learning_rate": 9.997427961155435e-06, + "loss": 0.8657, + "step": 470 + }, + { + "epoch": 0.025923275909516208, + "grad_norm": 0.9609962701797485, + "learning_rate": 9.997414040655198e-06, + "loss": 0.8864, + "step": 471 + }, + { + "epoch": 0.02597831471187187, + "grad_norm": 1.0829721689224243, + "learning_rate": 9.997400082595735e-06, + "loss": 0.9221, + "step": 472 + }, + { + "epoch": 0.02603335351422753, + "grad_norm": 0.992082953453064, + "learning_rate": 9.99738608697715e-06, + "loss": 0.8455, + "step": 473 + }, + { + "epoch": 0.026088392316583192, + "grad_norm": 1.0486301183700562, + "learning_rate": 9.997372053799547e-06, + "loss": 0.8729, + "step": 474 + }, + { + "epoch": 0.026143431118938854, + "grad_norm": 1.0328491926193237, + "learning_rate": 9.997357983063036e-06, + "loss": 0.8788, + "step": 475 + }, + { + "epoch": 0.02619846992129451, + "grad_norm": 0.963333249092102, + "learning_rate": 9.997343874767719e-06, + "loss": 0.892, + "step": 476 + }, + { + "epoch": 0.026253508723650173, + "grad_norm": 1.1606497764587402, + "learning_rate": 9.997329728913704e-06, + "loss": 0.9984, + "step": 477 + }, + { + "epoch": 0.026308547526005834, + "grad_norm": 1.241650104522705, + "learning_rate": 9.997315545501096e-06, + "loss": 0.946, + "step": 478 + }, + { + "epoch": 0.026363586328361496, + "grad_norm": 1.008004069328308, + "learning_rate": 9.99730132453e-06, + "loss": 0.849, + "step": 479 + }, + { + "epoch": 0.026418625130717157, + "grad_norm": 0.9883478879928589, + "learning_rate": 9.997287066000527e-06, + "loss": 0.9478, + "step": 480 + }, + { + "epoch": 0.026473663933072815, + "grad_norm": 1.0224446058273315, + "learning_rate": 9.997272769912783e-06, + "loss": 1.0318, + "step": 481 + }, + { + "epoch": 0.026528702735428476, + "grad_norm": 0.9412569403648376, + "learning_rate": 9.997258436266874e-06, + "loss": 0.9119, + "step": 482 + }, + { + "epoch": 0.026583741537784138, + "grad_norm": 0.9214537739753723, + "learning_rate": 9.997244065062906e-06, + "loss": 0.8785, + "step": 483 + }, + { + "epoch": 0.0266387803401398, + "grad_norm": 1.0015628337860107, + "learning_rate": 9.997229656300991e-06, + "loss": 0.8869, + "step": 484 + }, + { + "epoch": 0.02669381914249546, + "grad_norm": 0.8965190052986145, + "learning_rate": 9.997215209981237e-06, + "loss": 0.7009, + "step": 485 + }, + { + "epoch": 0.02674885794485112, + "grad_norm": 1.1976135969161987, + "learning_rate": 9.997200726103749e-06, + "loss": 0.9795, + "step": 486 + }, + { + "epoch": 0.02680389674720678, + "grad_norm": 0.864780843257904, + "learning_rate": 9.997186204668639e-06, + "loss": 0.7687, + "step": 487 + }, + { + "epoch": 0.02685893554956244, + "grad_norm": 0.9946566820144653, + "learning_rate": 9.997171645676013e-06, + "loss": 0.9672, + "step": 488 + }, + { + "epoch": 0.026913974351918103, + "grad_norm": 1.043835997581482, + "learning_rate": 9.997157049125985e-06, + "loss": 0.862, + "step": 489 + }, + { + "epoch": 0.026969013154273764, + "grad_norm": 0.9697456955909729, + "learning_rate": 9.99714241501866e-06, + "loss": 0.8368, + "step": 490 + }, + { + "epoch": 0.027024051956629425, + "grad_norm": 0.9975618124008179, + "learning_rate": 9.997127743354153e-06, + "loss": 0.8739, + "step": 491 + }, + { + "epoch": 0.027079090758985083, + "grad_norm": 1.0055313110351562, + "learning_rate": 9.99711303413257e-06, + "loss": 0.9227, + "step": 492 + }, + { + "epoch": 0.027134129561340745, + "grad_norm": 1.0418384075164795, + "learning_rate": 9.997098287354024e-06, + "loss": 0.9978, + "step": 493 + }, + { + "epoch": 0.027189168363696406, + "grad_norm": 0.8648970723152161, + "learning_rate": 9.997083503018625e-06, + "loss": 0.8363, + "step": 494 + }, + { + "epoch": 0.027244207166052067, + "grad_norm": 1.13506019115448, + "learning_rate": 9.997068681126483e-06, + "loss": 0.8851, + "step": 495 + }, + { + "epoch": 0.02729924596840773, + "grad_norm": 0.974400520324707, + "learning_rate": 9.997053821677712e-06, + "loss": 0.8533, + "step": 496 + }, + { + "epoch": 0.027354284770763387, + "grad_norm": 1.226507544517517, + "learning_rate": 9.997038924672419e-06, + "loss": 0.8586, + "step": 497 + }, + { + "epoch": 0.027409323573119048, + "grad_norm": 1.004753589630127, + "learning_rate": 9.997023990110721e-06, + "loss": 0.8974, + "step": 498 + }, + { + "epoch": 0.02746436237547471, + "grad_norm": 1.0492571592330933, + "learning_rate": 9.997009017992729e-06, + "loss": 0.8457, + "step": 499 + }, + { + "epoch": 0.02751940117783037, + "grad_norm": 1.0068167448043823, + "learning_rate": 9.996994008318554e-06, + "loss": 0.9608, + "step": 500 + }, + { + "epoch": 0.027574439980186032, + "grad_norm": 0.9686044454574585, + "learning_rate": 9.996978961088311e-06, + "loss": 0.9041, + "step": 501 + }, + { + "epoch": 0.027629478782541694, + "grad_norm": 1.281728744506836, + "learning_rate": 9.99696387630211e-06, + "loss": 0.9739, + "step": 502 + }, + { + "epoch": 0.02768451758489735, + "grad_norm": 0.9069758653640747, + "learning_rate": 9.996948753960065e-06, + "loss": 0.8467, + "step": 503 + }, + { + "epoch": 0.027739556387253013, + "grad_norm": 1.0337222814559937, + "learning_rate": 9.996933594062293e-06, + "loss": 0.9638, + "step": 504 + }, + { + "epoch": 0.027794595189608674, + "grad_norm": 0.9695359468460083, + "learning_rate": 9.996918396608905e-06, + "loss": 0.8986, + "step": 505 + }, + { + "epoch": 0.027849633991964336, + "grad_norm": 0.9120615124702454, + "learning_rate": 9.996903161600016e-06, + "loss": 0.9103, + "step": 506 + }, + { + "epoch": 0.027904672794319997, + "grad_norm": 0.9736546874046326, + "learning_rate": 9.996887889035741e-06, + "loss": 0.9308, + "step": 507 + }, + { + "epoch": 0.027959711596675655, + "grad_norm": 1.0184897184371948, + "learning_rate": 9.996872578916192e-06, + "loss": 0.8978, + "step": 508 + }, + { + "epoch": 0.028014750399031316, + "grad_norm": 0.9791838526725769, + "learning_rate": 9.996857231241489e-06, + "loss": 0.8639, + "step": 509 + }, + { + "epoch": 0.028069789201386978, + "grad_norm": 1.2985681295394897, + "learning_rate": 9.996841846011742e-06, + "loss": 0.9581, + "step": 510 + }, + { + "epoch": 0.02812482800374264, + "grad_norm": 1.0647368431091309, + "learning_rate": 9.996826423227071e-06, + "loss": 1.0565, + "step": 511 + }, + { + "epoch": 0.0281798668060983, + "grad_norm": 1.0336421728134155, + "learning_rate": 9.996810962887591e-06, + "loss": 1.008, + "step": 512 + }, + { + "epoch": 0.02823490560845396, + "grad_norm": 1.1838933229446411, + "learning_rate": 9.996795464993416e-06, + "loss": 0.8359, + "step": 513 + }, + { + "epoch": 0.02828994441080962, + "grad_norm": 0.9898360371589661, + "learning_rate": 9.996779929544663e-06, + "loss": 0.8501, + "step": 514 + }, + { + "epoch": 0.02834498321316528, + "grad_norm": 0.9836066365242004, + "learning_rate": 9.99676435654145e-06, + "loss": 0.8795, + "step": 515 + }, + { + "epoch": 0.028400022015520943, + "grad_norm": 1.0621601343154907, + "learning_rate": 9.996748745983895e-06, + "loss": 0.8746, + "step": 516 + }, + { + "epoch": 0.028455060817876604, + "grad_norm": 1.0082437992095947, + "learning_rate": 9.996733097872113e-06, + "loss": 0.9278, + "step": 517 + }, + { + "epoch": 0.028510099620232265, + "grad_norm": 0.9903931617736816, + "learning_rate": 9.996717412206222e-06, + "loss": 0.8264, + "step": 518 + }, + { + "epoch": 0.028565138422587923, + "grad_norm": 1.0797243118286133, + "learning_rate": 9.996701688986342e-06, + "loss": 1.0077, + "step": 519 + }, + { + "epoch": 0.028620177224943585, + "grad_norm": 1.147133231163025, + "learning_rate": 9.99668592821259e-06, + "loss": 0.9374, + "step": 520 + }, + { + "epoch": 0.028675216027299246, + "grad_norm": 0.9993947744369507, + "learning_rate": 9.996670129885082e-06, + "loss": 0.9562, + "step": 521 + }, + { + "epoch": 0.028730254829654907, + "grad_norm": 0.8580895066261292, + "learning_rate": 9.99665429400394e-06, + "loss": 0.7985, + "step": 522 + }, + { + "epoch": 0.02878529363201057, + "grad_norm": 0.9251388907432556, + "learning_rate": 9.996638420569281e-06, + "loss": 0.7323, + "step": 523 + }, + { + "epoch": 0.028840332434366227, + "grad_norm": 1.0010193586349487, + "learning_rate": 9.996622509581227e-06, + "loss": 0.9316, + "step": 524 + }, + { + "epoch": 0.028895371236721888, + "grad_norm": 0.9822579026222229, + "learning_rate": 9.996606561039894e-06, + "loss": 0.8978, + "step": 525 + }, + { + "epoch": 0.02895041003907755, + "grad_norm": 1.0760595798492432, + "learning_rate": 9.996590574945403e-06, + "loss": 0.9125, + "step": 526 + }, + { + "epoch": 0.02900544884143321, + "grad_norm": 1.138869285583496, + "learning_rate": 9.996574551297876e-06, + "loss": 0.8185, + "step": 527 + }, + { + "epoch": 0.029060487643788872, + "grad_norm": 1.002994179725647, + "learning_rate": 9.996558490097433e-06, + "loss": 0.9404, + "step": 528 + }, + { + "epoch": 0.02911552644614453, + "grad_norm": 0.9550611972808838, + "learning_rate": 9.996542391344194e-06, + "loss": 0.859, + "step": 529 + }, + { + "epoch": 0.02917056524850019, + "grad_norm": 0.9236055612564087, + "learning_rate": 9.996526255038277e-06, + "loss": 0.7758, + "step": 530 + }, + { + "epoch": 0.029225604050855853, + "grad_norm": 1.103966474533081, + "learning_rate": 9.996510081179808e-06, + "loss": 1.0147, + "step": 531 + }, + { + "epoch": 0.029280642853211514, + "grad_norm": 0.9884665012359619, + "learning_rate": 9.996493869768906e-06, + "loss": 0.8784, + "step": 532 + }, + { + "epoch": 0.029335681655567176, + "grad_norm": 0.9173223376274109, + "learning_rate": 9.996477620805694e-06, + "loss": 0.8741, + "step": 533 + }, + { + "epoch": 0.029390720457922837, + "grad_norm": 0.965548574924469, + "learning_rate": 9.996461334290294e-06, + "loss": 0.8989, + "step": 534 + }, + { + "epoch": 0.029445759260278495, + "grad_norm": 0.9939296245574951, + "learning_rate": 9.996445010222828e-06, + "loss": 0.8552, + "step": 535 + }, + { + "epoch": 0.029500798062634156, + "grad_norm": 1.0081578493118286, + "learning_rate": 9.996428648603417e-06, + "loss": 0.9138, + "step": 536 + }, + { + "epoch": 0.029555836864989818, + "grad_norm": 1.0139487981796265, + "learning_rate": 9.996412249432188e-06, + "loss": 0.9452, + "step": 537 + }, + { + "epoch": 0.02961087566734548, + "grad_norm": 0.9463647603988647, + "learning_rate": 9.996395812709262e-06, + "loss": 0.8721, + "step": 538 + }, + { + "epoch": 0.02966591446970114, + "grad_norm": 0.9981473684310913, + "learning_rate": 9.99637933843476e-06, + "loss": 0.7791, + "step": 539 + }, + { + "epoch": 0.0297209532720568, + "grad_norm": 1.1637190580368042, + "learning_rate": 9.996362826608812e-06, + "loss": 0.8798, + "step": 540 + }, + { + "epoch": 0.02977599207441246, + "grad_norm": 2.2887051105499268, + "learning_rate": 9.996346277231536e-06, + "loss": 0.9303, + "step": 541 + }, + { + "epoch": 0.02983103087676812, + "grad_norm": 0.9173391461372375, + "learning_rate": 9.99632969030306e-06, + "loss": 0.8627, + "step": 542 + }, + { + "epoch": 0.029886069679123783, + "grad_norm": 1.033355474472046, + "learning_rate": 9.996313065823506e-06, + "loss": 0.9906, + "step": 543 + }, + { + "epoch": 0.029941108481479444, + "grad_norm": 0.9286639094352722, + "learning_rate": 9.996296403793002e-06, + "loss": 0.7043, + "step": 544 + }, + { + "epoch": 0.029996147283835102, + "grad_norm": 0.963238000869751, + "learning_rate": 9.996279704211671e-06, + "loss": 1.0236, + "step": 545 + }, + { + "epoch": 0.030051186086190763, + "grad_norm": 1.0275089740753174, + "learning_rate": 9.99626296707964e-06, + "loss": 0.976, + "step": 546 + }, + { + "epoch": 0.030106224888546425, + "grad_norm": 1.0944674015045166, + "learning_rate": 9.996246192397032e-06, + "loss": 0.9209, + "step": 547 + }, + { + "epoch": 0.030161263690902086, + "grad_norm": 0.9620945453643799, + "learning_rate": 9.996229380163976e-06, + "loss": 0.8973, + "step": 548 + }, + { + "epoch": 0.030216302493257748, + "grad_norm": 1.032549500465393, + "learning_rate": 9.996212530380597e-06, + "loss": 0.892, + "step": 549 + }, + { + "epoch": 0.03027134129561341, + "grad_norm": 1.0433719158172607, + "learning_rate": 9.996195643047023e-06, + "loss": 0.8428, + "step": 550 + }, + { + "epoch": 0.030326380097969067, + "grad_norm": 1.1541085243225098, + "learning_rate": 9.996178718163378e-06, + "loss": 0.9084, + "step": 551 + }, + { + "epoch": 0.03038141890032473, + "grad_norm": 0.9386873245239258, + "learning_rate": 9.996161755729793e-06, + "loss": 0.9246, + "step": 552 + }, + { + "epoch": 0.03043645770268039, + "grad_norm": 1.092236042022705, + "learning_rate": 9.996144755746393e-06, + "loss": 0.8419, + "step": 553 + }, + { + "epoch": 0.03049149650503605, + "grad_norm": 0.9517606496810913, + "learning_rate": 9.996127718213306e-06, + "loss": 0.9002, + "step": 554 + }, + { + "epoch": 0.030546535307391712, + "grad_norm": 0.965972900390625, + "learning_rate": 9.996110643130661e-06, + "loss": 0.9197, + "step": 555 + }, + { + "epoch": 0.03060157410974737, + "grad_norm": 0.9396095275878906, + "learning_rate": 9.996093530498586e-06, + "loss": 0.8686, + "step": 556 + }, + { + "epoch": 0.030656612912103032, + "grad_norm": 1.0154120922088623, + "learning_rate": 9.99607638031721e-06, + "loss": 0.9773, + "step": 557 + }, + { + "epoch": 0.030711651714458693, + "grad_norm": 1.3572301864624023, + "learning_rate": 9.99605919258666e-06, + "loss": 0.911, + "step": 558 + }, + { + "epoch": 0.030766690516814355, + "grad_norm": 0.968278169631958, + "learning_rate": 9.996041967307066e-06, + "loss": 0.7704, + "step": 559 + }, + { + "epoch": 0.030821729319170016, + "grad_norm": 0.9867869019508362, + "learning_rate": 9.99602470447856e-06, + "loss": 0.873, + "step": 560 + }, + { + "epoch": 0.030876768121525677, + "grad_norm": 1.056450605392456, + "learning_rate": 9.996007404101269e-06, + "loss": 0.941, + "step": 561 + }, + { + "epoch": 0.030931806923881335, + "grad_norm": 1.0419799089431763, + "learning_rate": 9.995990066175321e-06, + "loss": 0.957, + "step": 562 + }, + { + "epoch": 0.030986845726236997, + "grad_norm": 0.9789314866065979, + "learning_rate": 9.995972690700852e-06, + "loss": 0.9229, + "step": 563 + }, + { + "epoch": 0.031041884528592658, + "grad_norm": 0.917783796787262, + "learning_rate": 9.995955277677989e-06, + "loss": 0.8186, + "step": 564 + }, + { + "epoch": 0.03109692333094832, + "grad_norm": 1.0231432914733887, + "learning_rate": 9.995937827106863e-06, + "loss": 0.8624, + "step": 565 + }, + { + "epoch": 0.03115196213330398, + "grad_norm": 0.9552083015441895, + "learning_rate": 9.995920338987605e-06, + "loss": 0.7967, + "step": 566 + }, + { + "epoch": 0.03120700093565964, + "grad_norm": 0.9441083669662476, + "learning_rate": 9.995902813320349e-06, + "loss": 0.8471, + "step": 567 + }, + { + "epoch": 0.0312620397380153, + "grad_norm": 1.0025299787521362, + "learning_rate": 9.995885250105223e-06, + "loss": 0.8646, + "step": 568 + }, + { + "epoch": 0.03131707854037096, + "grad_norm": 0.8997280597686768, + "learning_rate": 9.99586764934236e-06, + "loss": 0.8736, + "step": 569 + }, + { + "epoch": 0.03137211734272662, + "grad_norm": 0.9090663194656372, + "learning_rate": 9.995850011031896e-06, + "loss": 0.8548, + "step": 570 + }, + { + "epoch": 0.031427156145082284, + "grad_norm": 0.9641294479370117, + "learning_rate": 9.995832335173959e-06, + "loss": 0.8667, + "step": 571 + }, + { + "epoch": 0.031482194947437946, + "grad_norm": 0.9165804982185364, + "learning_rate": 9.995814621768682e-06, + "loss": 0.803, + "step": 572 + }, + { + "epoch": 0.03153723374979361, + "grad_norm": 0.9672492742538452, + "learning_rate": 9.995796870816202e-06, + "loss": 0.8335, + "step": 573 + }, + { + "epoch": 0.03159227255214927, + "grad_norm": 0.9359404444694519, + "learning_rate": 9.995779082316648e-06, + "loss": 0.8294, + "step": 574 + }, + { + "epoch": 0.03164731135450492, + "grad_norm": 0.926925003528595, + "learning_rate": 9.995761256270157e-06, + "loss": 0.7714, + "step": 575 + }, + { + "epoch": 0.031702350156860584, + "grad_norm": 1.1848629713058472, + "learning_rate": 9.995743392676862e-06, + "loss": 0.8925, + "step": 576 + }, + { + "epoch": 0.031757388959216246, + "grad_norm": 0.9624786972999573, + "learning_rate": 9.995725491536897e-06, + "loss": 0.9292, + "step": 577 + }, + { + "epoch": 0.03181242776157191, + "grad_norm": 0.9479736089706421, + "learning_rate": 9.995707552850396e-06, + "loss": 0.8797, + "step": 578 + }, + { + "epoch": 0.03186746656392757, + "grad_norm": 0.9551546573638916, + "learning_rate": 9.995689576617494e-06, + "loss": 0.8793, + "step": 579 + }, + { + "epoch": 0.03192250536628323, + "grad_norm": 0.9210056662559509, + "learning_rate": 9.995671562838325e-06, + "loss": 0.9714, + "step": 580 + }, + { + "epoch": 0.03197754416863889, + "grad_norm": 1.063117504119873, + "learning_rate": 9.995653511513029e-06, + "loss": 0.9608, + "step": 581 + }, + { + "epoch": 0.03203258297099455, + "grad_norm": 0.9426459670066833, + "learning_rate": 9.995635422641736e-06, + "loss": 0.9102, + "step": 582 + }, + { + "epoch": 0.032087621773350214, + "grad_norm": 1.0176693201065063, + "learning_rate": 9.995617296224584e-06, + "loss": 0.9109, + "step": 583 + }, + { + "epoch": 0.032142660575705875, + "grad_norm": 0.9457042217254639, + "learning_rate": 9.995599132261711e-06, + "loss": 0.9017, + "step": 584 + }, + { + "epoch": 0.03219769937806154, + "grad_norm": 1.5851638317108154, + "learning_rate": 9.995580930753252e-06, + "loss": 0.967, + "step": 585 + }, + { + "epoch": 0.03225273818041719, + "grad_norm": 0.9961487054824829, + "learning_rate": 9.995562691699345e-06, + "loss": 0.9396, + "step": 586 + }, + { + "epoch": 0.03230777698277285, + "grad_norm": 0.9892112016677856, + "learning_rate": 9.995544415100125e-06, + "loss": 0.9058, + "step": 587 + }, + { + "epoch": 0.032362815785128514, + "grad_norm": 0.9052272439002991, + "learning_rate": 9.99552610095573e-06, + "loss": 0.9194, + "step": 588 + }, + { + "epoch": 0.032417854587484175, + "grad_norm": 0.8381399512290955, + "learning_rate": 9.995507749266297e-06, + "loss": 0.7465, + "step": 589 + }, + { + "epoch": 0.03247289338983984, + "grad_norm": 1.018964171409607, + "learning_rate": 9.995489360031969e-06, + "loss": 0.841, + "step": 590 + }, + { + "epoch": 0.0325279321921955, + "grad_norm": 0.908311128616333, + "learning_rate": 9.995470933252876e-06, + "loss": 0.8592, + "step": 591 + }, + { + "epoch": 0.03258297099455116, + "grad_norm": 1.2986040115356445, + "learning_rate": 9.995452468929162e-06, + "loss": 0.8341, + "step": 592 + }, + { + "epoch": 0.03263800979690682, + "grad_norm": 1.6565190553665161, + "learning_rate": 9.995433967060966e-06, + "loss": 0.8681, + "step": 593 + }, + { + "epoch": 0.03269304859926248, + "grad_norm": 0.9725674390792847, + "learning_rate": 9.995415427648423e-06, + "loss": 0.8449, + "step": 594 + }, + { + "epoch": 0.032748087401618144, + "grad_norm": 0.8683852553367615, + "learning_rate": 9.995396850691677e-06, + "loss": 0.8478, + "step": 595 + }, + { + "epoch": 0.0328031262039738, + "grad_norm": 0.9912856817245483, + "learning_rate": 9.995378236190862e-06, + "loss": 0.8912, + "step": 596 + }, + { + "epoch": 0.03285816500632946, + "grad_norm": 0.9396800398826599, + "learning_rate": 9.995359584146125e-06, + "loss": 0.856, + "step": 597 + }, + { + "epoch": 0.03291320380868512, + "grad_norm": 1.385006308555603, + "learning_rate": 9.995340894557601e-06, + "loss": 0.9633, + "step": 598 + }, + { + "epoch": 0.03296824261104078, + "grad_norm": 0.8982875943183899, + "learning_rate": 9.995322167425433e-06, + "loss": 0.9244, + "step": 599 + }, + { + "epoch": 0.033023281413396444, + "grad_norm": 0.8981022834777832, + "learning_rate": 9.995303402749759e-06, + "loss": 0.8854, + "step": 600 + }, + { + "epoch": 0.033078320215752105, + "grad_norm": 0.9917197227478027, + "learning_rate": 9.995284600530724e-06, + "loss": 1.0086, + "step": 601 + }, + { + "epoch": 0.033133359018107766, + "grad_norm": 1.0540626049041748, + "learning_rate": 9.995265760768464e-06, + "loss": 1.0022, + "step": 602 + }, + { + "epoch": 0.03318839782046343, + "grad_norm": 0.9523479342460632, + "learning_rate": 9.995246883463126e-06, + "loss": 0.9893, + "step": 603 + }, + { + "epoch": 0.03324343662281909, + "grad_norm": 0.9824770092964172, + "learning_rate": 9.99522796861485e-06, + "loss": 0.8385, + "step": 604 + }, + { + "epoch": 0.03329847542517475, + "grad_norm": 1.0968893766403198, + "learning_rate": 9.995209016223776e-06, + "loss": 1.0109, + "step": 605 + }, + { + "epoch": 0.03335351422753041, + "grad_norm": 0.9115625023841858, + "learning_rate": 9.995190026290049e-06, + "loss": 0.8656, + "step": 606 + }, + { + "epoch": 0.033408553029886066, + "grad_norm": 0.9795814156532288, + "learning_rate": 9.99517099881381e-06, + "loss": 0.8941, + "step": 607 + }, + { + "epoch": 0.03346359183224173, + "grad_norm": 0.9317291378974915, + "learning_rate": 9.995151933795204e-06, + "loss": 0.7819, + "step": 608 + }, + { + "epoch": 0.03351863063459739, + "grad_norm": 0.9936283230781555, + "learning_rate": 9.995132831234373e-06, + "loss": 0.8674, + "step": 609 + }, + { + "epoch": 0.03357366943695305, + "grad_norm": 0.9872812032699585, + "learning_rate": 9.995113691131462e-06, + "loss": 0.9038, + "step": 610 + }, + { + "epoch": 0.03362870823930871, + "grad_norm": 0.9516895413398743, + "learning_rate": 9.995094513486611e-06, + "loss": 0.9038, + "step": 611 + }, + { + "epoch": 0.03368374704166437, + "grad_norm": 1.090579867362976, + "learning_rate": 9.995075298299968e-06, + "loss": 0.9587, + "step": 612 + }, + { + "epoch": 0.033738785844020035, + "grad_norm": 1.021398663520813, + "learning_rate": 9.995056045571677e-06, + "loss": 0.9569, + "step": 613 + }, + { + "epoch": 0.033793824646375696, + "grad_norm": 1.009657382965088, + "learning_rate": 9.99503675530188e-06, + "loss": 0.8346, + "step": 614 + }, + { + "epoch": 0.03384886344873136, + "grad_norm": 1.0478712320327759, + "learning_rate": 9.995017427490725e-06, + "loss": 1.0566, + "step": 615 + }, + { + "epoch": 0.03390390225108702, + "grad_norm": 1.1391830444335938, + "learning_rate": 9.994998062138355e-06, + "loss": 1.0727, + "step": 616 + }, + { + "epoch": 0.03395894105344268, + "grad_norm": 1.0172302722930908, + "learning_rate": 9.994978659244918e-06, + "loss": 0.7869, + "step": 617 + }, + { + "epoch": 0.034013979855798335, + "grad_norm": 1.0532630681991577, + "learning_rate": 9.994959218810558e-06, + "loss": 0.8626, + "step": 618 + }, + { + "epoch": 0.034069018658153996, + "grad_norm": 0.8300478458404541, + "learning_rate": 9.99493974083542e-06, + "loss": 0.8166, + "step": 619 + }, + { + "epoch": 0.03412405746050966, + "grad_norm": 1.0613664388656616, + "learning_rate": 9.994920225319656e-06, + "loss": 0.8899, + "step": 620 + }, + { + "epoch": 0.03417909626286532, + "grad_norm": 0.9827042818069458, + "learning_rate": 9.994900672263406e-06, + "loss": 0.8243, + "step": 621 + }, + { + "epoch": 0.03423413506522098, + "grad_norm": 0.8790082931518555, + "learning_rate": 9.994881081666818e-06, + "loss": 0.8153, + "step": 622 + }, + { + "epoch": 0.03428917386757664, + "grad_norm": 1.033378005027771, + "learning_rate": 9.994861453530044e-06, + "loss": 0.8916, + "step": 623 + }, + { + "epoch": 0.0343442126699323, + "grad_norm": 0.9547238349914551, + "learning_rate": 9.994841787853227e-06, + "loss": 0.9141, + "step": 624 + }, + { + "epoch": 0.034399251472287964, + "grad_norm": 0.9606438279151917, + "learning_rate": 9.994822084636514e-06, + "loss": 0.9435, + "step": 625 + }, + { + "epoch": 0.034454290274643626, + "grad_norm": 0.8461503982543945, + "learning_rate": 9.994802343880059e-06, + "loss": 0.7914, + "step": 626 + }, + { + "epoch": 0.03450932907699929, + "grad_norm": 1.144538402557373, + "learning_rate": 9.994782565584004e-06, + "loss": 0.8025, + "step": 627 + }, + { + "epoch": 0.03456436787935495, + "grad_norm": 1.0099962949752808, + "learning_rate": 9.994762749748502e-06, + "loss": 0.9607, + "step": 628 + }, + { + "epoch": 0.0346194066817106, + "grad_norm": 0.9822041988372803, + "learning_rate": 9.9947428963737e-06, + "loss": 0.9216, + "step": 629 + }, + { + "epoch": 0.034674445484066264, + "grad_norm": 0.9056866765022278, + "learning_rate": 9.994723005459746e-06, + "loss": 0.7913, + "step": 630 + }, + { + "epoch": 0.034729484286421926, + "grad_norm": 1.0099287033081055, + "learning_rate": 9.994703077006792e-06, + "loss": 0.9937, + "step": 631 + }, + { + "epoch": 0.03478452308877759, + "grad_norm": 0.9559167623519897, + "learning_rate": 9.994683111014984e-06, + "loss": 0.9774, + "step": 632 + }, + { + "epoch": 0.03483956189113325, + "grad_norm": 1.0359059572219849, + "learning_rate": 9.994663107484478e-06, + "loss": 0.9062, + "step": 633 + }, + { + "epoch": 0.03489460069348891, + "grad_norm": 0.8803057074546814, + "learning_rate": 9.99464306641542e-06, + "loss": 0.9638, + "step": 634 + }, + { + "epoch": 0.03494963949584457, + "grad_norm": 1.0926579236984253, + "learning_rate": 9.994622987807962e-06, + "loss": 1.0467, + "step": 635 + }, + { + "epoch": 0.03500467829820023, + "grad_norm": 1.0051401853561401, + "learning_rate": 9.994602871662253e-06, + "loss": 0.8717, + "step": 636 + }, + { + "epoch": 0.035059717100555894, + "grad_norm": 1.2007508277893066, + "learning_rate": 9.994582717978448e-06, + "loss": 0.8004, + "step": 637 + }, + { + "epoch": 0.035114755902911556, + "grad_norm": 0.8826266527175903, + "learning_rate": 9.994562526756695e-06, + "loss": 0.8888, + "step": 638 + }, + { + "epoch": 0.03516979470526721, + "grad_norm": 0.9953717589378357, + "learning_rate": 9.994542297997147e-06, + "loss": 0.8999, + "step": 639 + }, + { + "epoch": 0.03522483350762287, + "grad_norm": 1.0203614234924316, + "learning_rate": 9.994522031699958e-06, + "loss": 0.8241, + "step": 640 + }, + { + "epoch": 0.03527987230997853, + "grad_norm": 0.8760203719139099, + "learning_rate": 9.994501727865276e-06, + "loss": 0.7893, + "step": 641 + }, + { + "epoch": 0.035334911112334194, + "grad_norm": 1.024888277053833, + "learning_rate": 9.994481386493257e-06, + "loss": 0.9865, + "step": 642 + }, + { + "epoch": 0.035389949914689856, + "grad_norm": 0.907454788684845, + "learning_rate": 9.994461007584052e-06, + "loss": 0.891, + "step": 643 + }, + { + "epoch": 0.03544498871704552, + "grad_norm": 1.0400965213775635, + "learning_rate": 9.994440591137816e-06, + "loss": 0.9345, + "step": 644 + }, + { + "epoch": 0.03550002751940118, + "grad_norm": 0.9816616177558899, + "learning_rate": 9.9944201371547e-06, + "loss": 0.91, + "step": 645 + }, + { + "epoch": 0.03555506632175684, + "grad_norm": 1.0528117418289185, + "learning_rate": 9.99439964563486e-06, + "loss": 0.952, + "step": 646 + }, + { + "epoch": 0.0356101051241125, + "grad_norm": 0.9802080988883972, + "learning_rate": 9.99437911657845e-06, + "loss": 0.9392, + "step": 647 + }, + { + "epoch": 0.03566514392646816, + "grad_norm": 0.9580393433570862, + "learning_rate": 9.994358549985623e-06, + "loss": 0.874, + "step": 648 + }, + { + "epoch": 0.035720182728823824, + "grad_norm": 0.8935576677322388, + "learning_rate": 9.994337945856533e-06, + "loss": 0.8435, + "step": 649 + }, + { + "epoch": 0.03577522153117948, + "grad_norm": 1.009699821472168, + "learning_rate": 9.994317304191337e-06, + "loss": 0.9436, + "step": 650 + }, + { + "epoch": 0.03583026033353514, + "grad_norm": 0.9126121401786804, + "learning_rate": 9.994296624990188e-06, + "loss": 0.8424, + "step": 651 + }, + { + "epoch": 0.0358852991358908, + "grad_norm": 0.9555553197860718, + "learning_rate": 9.994275908253243e-06, + "loss": 0.93, + "step": 652 + }, + { + "epoch": 0.03594033793824646, + "grad_norm": 0.8359857797622681, + "learning_rate": 9.994255153980658e-06, + "loss": 0.6326, + "step": 653 + }, + { + "epoch": 0.035995376740602124, + "grad_norm": 0.8918783664703369, + "learning_rate": 9.994234362172587e-06, + "loss": 0.8287, + "step": 654 + }, + { + "epoch": 0.036050415542957785, + "grad_norm": 0.9878549575805664, + "learning_rate": 9.994213532829188e-06, + "loss": 0.8841, + "step": 655 + }, + { + "epoch": 0.03610545434531345, + "grad_norm": 0.9504040479660034, + "learning_rate": 9.994192665950617e-06, + "loss": 1.0182, + "step": 656 + }, + { + "epoch": 0.03616049314766911, + "grad_norm": 0.9531422257423401, + "learning_rate": 9.99417176153703e-06, + "loss": 0.8504, + "step": 657 + }, + { + "epoch": 0.03621553195002477, + "grad_norm": 0.9580292105674744, + "learning_rate": 9.994150819588587e-06, + "loss": 0.8048, + "step": 658 + }, + { + "epoch": 0.03627057075238043, + "grad_norm": 0.9786819815635681, + "learning_rate": 9.99412984010544e-06, + "loss": 0.9124, + "step": 659 + }, + { + "epoch": 0.03632560955473609, + "grad_norm": 0.9733422994613647, + "learning_rate": 9.994108823087751e-06, + "loss": 0.8868, + "step": 660 + }, + { + "epoch": 0.03638064835709175, + "grad_norm": 1.093173623085022, + "learning_rate": 9.994087768535679e-06, + "loss": 0.9428, + "step": 661 + }, + { + "epoch": 0.03643568715944741, + "grad_norm": 0.9067148566246033, + "learning_rate": 9.994066676449378e-06, + "loss": 0.8838, + "step": 662 + }, + { + "epoch": 0.03649072596180307, + "grad_norm": 0.9509521722793579, + "learning_rate": 9.99404554682901e-06, + "loss": 0.9034, + "step": 663 + }, + { + "epoch": 0.03654576476415873, + "grad_norm": 0.9523824453353882, + "learning_rate": 9.994024379674731e-06, + "loss": 0.9623, + "step": 664 + }, + { + "epoch": 0.03660080356651439, + "grad_norm": 0.987276554107666, + "learning_rate": 9.994003174986703e-06, + "loss": 0.8817, + "step": 665 + }, + { + "epoch": 0.036655842368870054, + "grad_norm": 0.9500744342803955, + "learning_rate": 9.993981932765083e-06, + "loss": 0.9742, + "step": 666 + }, + { + "epoch": 0.036710881171225715, + "grad_norm": 0.9420705437660217, + "learning_rate": 9.993960653010034e-06, + "loss": 0.9657, + "step": 667 + }, + { + "epoch": 0.036765919973581376, + "grad_norm": 0.9443248510360718, + "learning_rate": 9.99393933572171e-06, + "loss": 0.8468, + "step": 668 + }, + { + "epoch": 0.03682095877593704, + "grad_norm": 0.9666558504104614, + "learning_rate": 9.993917980900276e-06, + "loss": 0.9871, + "step": 669 + }, + { + "epoch": 0.0368759975782927, + "grad_norm": 1.0236201286315918, + "learning_rate": 9.993896588545892e-06, + "loss": 0.9814, + "step": 670 + }, + { + "epoch": 0.03693103638064836, + "grad_norm": 1.016190528869629, + "learning_rate": 9.993875158658716e-06, + "loss": 1.0156, + "step": 671 + }, + { + "epoch": 0.036986075183004015, + "grad_norm": 0.9296661019325256, + "learning_rate": 9.993853691238913e-06, + "loss": 0.7956, + "step": 672 + }, + { + "epoch": 0.037041113985359676, + "grad_norm": 0.9276684522628784, + "learning_rate": 9.993832186286643e-06, + "loss": 0.9253, + "step": 673 + }, + { + "epoch": 0.03709615278771534, + "grad_norm": 0.8588787913322449, + "learning_rate": 9.993810643802065e-06, + "loss": 0.7878, + "step": 674 + }, + { + "epoch": 0.037151191590071, + "grad_norm": 0.9955212473869324, + "learning_rate": 9.993789063785344e-06, + "loss": 0.8711, + "step": 675 + }, + { + "epoch": 0.03720623039242666, + "grad_norm": 0.925578236579895, + "learning_rate": 9.993767446236642e-06, + "loss": 0.9431, + "step": 676 + }, + { + "epoch": 0.03726126919478232, + "grad_norm": 0.9610552787780762, + "learning_rate": 9.99374579115612e-06, + "loss": 0.887, + "step": 677 + }, + { + "epoch": 0.03731630799713798, + "grad_norm": 1.0052428245544434, + "learning_rate": 9.99372409854394e-06, + "loss": 0.8751, + "step": 678 + }, + { + "epoch": 0.037371346799493645, + "grad_norm": 0.9503066539764404, + "learning_rate": 9.99370236840027e-06, + "loss": 0.8556, + "step": 679 + }, + { + "epoch": 0.037426385601849306, + "grad_norm": 2.426232099533081, + "learning_rate": 9.993680600725266e-06, + "loss": 0.9077, + "step": 680 + }, + { + "epoch": 0.03748142440420497, + "grad_norm": 0.9119723439216614, + "learning_rate": 9.993658795519096e-06, + "loss": 0.8575, + "step": 681 + }, + { + "epoch": 0.03753646320656062, + "grad_norm": 0.9688286781311035, + "learning_rate": 9.993636952781923e-06, + "loss": 0.8921, + "step": 682 + }, + { + "epoch": 0.03759150200891628, + "grad_norm": 1.030013084411621, + "learning_rate": 9.993615072513913e-06, + "loss": 0.8622, + "step": 683 + }, + { + "epoch": 0.037646540811271945, + "grad_norm": 1.055187463760376, + "learning_rate": 9.993593154715228e-06, + "loss": 0.9251, + "step": 684 + }, + { + "epoch": 0.037701579613627606, + "grad_norm": 1.0518591403961182, + "learning_rate": 9.993571199386032e-06, + "loss": 0.9575, + "step": 685 + }, + { + "epoch": 0.03775661841598327, + "grad_norm": 0.9232666492462158, + "learning_rate": 9.993549206526495e-06, + "loss": 0.8522, + "step": 686 + }, + { + "epoch": 0.03781165721833893, + "grad_norm": 1.0212332010269165, + "learning_rate": 9.993527176136775e-06, + "loss": 0.9358, + "step": 687 + }, + { + "epoch": 0.03786669602069459, + "grad_norm": 0.9137141108512878, + "learning_rate": 9.993505108217045e-06, + "loss": 0.8561, + "step": 688 + }, + { + "epoch": 0.03792173482305025, + "grad_norm": 1.0069375038146973, + "learning_rate": 9.993483002767465e-06, + "loss": 0.8274, + "step": 689 + }, + { + "epoch": 0.03797677362540591, + "grad_norm": 0.9820672869682312, + "learning_rate": 9.993460859788204e-06, + "loss": 0.907, + "step": 690 + }, + { + "epoch": 0.038031812427761574, + "grad_norm": 1.0042002201080322, + "learning_rate": 9.993438679279428e-06, + "loss": 0.9263, + "step": 691 + }, + { + "epoch": 0.038086851230117236, + "grad_norm": 0.9733695983886719, + "learning_rate": 9.993416461241304e-06, + "loss": 0.8455, + "step": 692 + }, + { + "epoch": 0.03814189003247289, + "grad_norm": 0.9106015563011169, + "learning_rate": 9.993394205673996e-06, + "loss": 0.8469, + "step": 693 + }, + { + "epoch": 0.03819692883482855, + "grad_norm": 0.9802660346031189, + "learning_rate": 9.993371912577677e-06, + "loss": 0.8662, + "step": 694 + }, + { + "epoch": 0.03825196763718421, + "grad_norm": 0.9183964729309082, + "learning_rate": 9.99334958195251e-06, + "loss": 0.8968, + "step": 695 + }, + { + "epoch": 0.038307006439539874, + "grad_norm": 0.9572185277938843, + "learning_rate": 9.993327213798663e-06, + "loss": 0.953, + "step": 696 + }, + { + "epoch": 0.038362045241895536, + "grad_norm": 1.4480071067810059, + "learning_rate": 9.993304808116307e-06, + "loss": 1.1131, + "step": 697 + }, + { + "epoch": 0.0384170840442512, + "grad_norm": 0.9297361969947815, + "learning_rate": 9.993282364905607e-06, + "loss": 0.884, + "step": 698 + }, + { + "epoch": 0.03847212284660686, + "grad_norm": 0.9400073885917664, + "learning_rate": 9.993259884166735e-06, + "loss": 0.932, + "step": 699 + }, + { + "epoch": 0.03852716164896252, + "grad_norm": 0.9231798052787781, + "learning_rate": 9.993237365899858e-06, + "loss": 0.8981, + "step": 700 + }, + { + "epoch": 0.03858220045131818, + "grad_norm": 0.8233712911605835, + "learning_rate": 9.993214810105144e-06, + "loss": 0.8218, + "step": 701 + }, + { + "epoch": 0.03863723925367384, + "grad_norm": 1.0997854471206665, + "learning_rate": 9.993192216782768e-06, + "loss": 0.9298, + "step": 702 + }, + { + "epoch": 0.038692278056029504, + "grad_norm": 0.9570802450180054, + "learning_rate": 9.993169585932893e-06, + "loss": 0.7815, + "step": 703 + }, + { + "epoch": 0.03874731685838516, + "grad_norm": 0.9913730025291443, + "learning_rate": 9.993146917555692e-06, + "loss": 0.9621, + "step": 704 + }, + { + "epoch": 0.03880235566074082, + "grad_norm": 1.088767409324646, + "learning_rate": 9.993124211651334e-06, + "loss": 0.9295, + "step": 705 + }, + { + "epoch": 0.03885739446309648, + "grad_norm": 0.8199124336242676, + "learning_rate": 9.993101468219995e-06, + "loss": 0.7613, + "step": 706 + }, + { + "epoch": 0.03891243326545214, + "grad_norm": 1.112566351890564, + "learning_rate": 9.99307868726184e-06, + "loss": 0.791, + "step": 707 + }, + { + "epoch": 0.038967472067807804, + "grad_norm": 0.9372578859329224, + "learning_rate": 9.99305586877704e-06, + "loss": 0.8567, + "step": 708 + }, + { + "epoch": 0.039022510870163465, + "grad_norm": 1.0167721509933472, + "learning_rate": 9.99303301276577e-06, + "loss": 0.9787, + "step": 709 + }, + { + "epoch": 0.03907754967251913, + "grad_norm": 1.3526856899261475, + "learning_rate": 9.993010119228202e-06, + "loss": 1.2215, + "step": 710 + }, + { + "epoch": 0.03913258847487479, + "grad_norm": 0.8819016814231873, + "learning_rate": 9.992987188164505e-06, + "loss": 0.7736, + "step": 711 + }, + { + "epoch": 0.03918762727723045, + "grad_norm": 1.0033677816390991, + "learning_rate": 9.992964219574852e-06, + "loss": 0.9919, + "step": 712 + }, + { + "epoch": 0.03924266607958611, + "grad_norm": 0.894926130771637, + "learning_rate": 9.992941213459417e-06, + "loss": 0.9058, + "step": 713 + }, + { + "epoch": 0.03929770488194177, + "grad_norm": 0.9481377005577087, + "learning_rate": 9.992918169818373e-06, + "loss": 0.8436, + "step": 714 + }, + { + "epoch": 0.03935274368429743, + "grad_norm": 0.9312933087348938, + "learning_rate": 9.992895088651893e-06, + "loss": 0.8869, + "step": 715 + }, + { + "epoch": 0.03940778248665309, + "grad_norm": 0.9765705466270447, + "learning_rate": 9.99287196996015e-06, + "loss": 0.9512, + "step": 716 + }, + { + "epoch": 0.03946282128900875, + "grad_norm": 0.9610235691070557, + "learning_rate": 9.992848813743317e-06, + "loss": 0.8005, + "step": 717 + }, + { + "epoch": 0.03951786009136441, + "grad_norm": 1.102995753288269, + "learning_rate": 9.99282562000157e-06, + "loss": 0.8017, + "step": 718 + }, + { + "epoch": 0.03957289889372007, + "grad_norm": 1.023317575454712, + "learning_rate": 9.99280238873508e-06, + "loss": 0.911, + "step": 719 + }, + { + "epoch": 0.039627937696075734, + "grad_norm": 1.0531049966812134, + "learning_rate": 9.992779119944025e-06, + "loss": 0.8562, + "step": 720 + }, + { + "epoch": 0.039682976498431395, + "grad_norm": 0.918250322341919, + "learning_rate": 9.992755813628579e-06, + "loss": 0.92, + "step": 721 + }, + { + "epoch": 0.039738015300787057, + "grad_norm": 0.8508251309394836, + "learning_rate": 9.992732469788915e-06, + "loss": 0.7347, + "step": 722 + }, + { + "epoch": 0.03979305410314272, + "grad_norm": 0.9184926152229309, + "learning_rate": 9.992709088425211e-06, + "loss": 0.8732, + "step": 723 + }, + { + "epoch": 0.03984809290549838, + "grad_norm": 1.1613929271697998, + "learning_rate": 9.992685669537643e-06, + "loss": 0.9522, + "step": 724 + }, + { + "epoch": 0.039903131707854034, + "grad_norm": 1.091513752937317, + "learning_rate": 9.992662213126386e-06, + "loss": 0.9646, + "step": 725 + }, + { + "epoch": 0.039958170510209695, + "grad_norm": 1.057803750038147, + "learning_rate": 9.992638719191615e-06, + "loss": 0.7032, + "step": 726 + }, + { + "epoch": 0.040013209312565357, + "grad_norm": 0.8771823644638062, + "learning_rate": 9.992615187733508e-06, + "loss": 0.8577, + "step": 727 + }, + { + "epoch": 0.04006824811492102, + "grad_norm": 0.9471028447151184, + "learning_rate": 9.992591618752244e-06, + "loss": 0.9057, + "step": 728 + }, + { + "epoch": 0.04012328691727668, + "grad_norm": 0.9547705054283142, + "learning_rate": 9.992568012247995e-06, + "loss": 0.9549, + "step": 729 + }, + { + "epoch": 0.04017832571963234, + "grad_norm": 0.8862974047660828, + "learning_rate": 9.992544368220941e-06, + "loss": 0.8593, + "step": 730 + }, + { + "epoch": 0.040233364521988, + "grad_norm": 0.906334400177002, + "learning_rate": 9.992520686671261e-06, + "loss": 0.8832, + "step": 731 + }, + { + "epoch": 0.04028840332434366, + "grad_norm": 1.07270085811615, + "learning_rate": 9.992496967599133e-06, + "loss": 0.9409, + "step": 732 + }, + { + "epoch": 0.040343442126699325, + "grad_norm": 0.9026005268096924, + "learning_rate": 9.992473211004734e-06, + "loss": 0.8326, + "step": 733 + }, + { + "epoch": 0.040398480929054986, + "grad_norm": 0.9762942790985107, + "learning_rate": 9.992449416888241e-06, + "loss": 0.9048, + "step": 734 + }, + { + "epoch": 0.04045351973141065, + "grad_norm": 0.9658033847808838, + "learning_rate": 9.992425585249837e-06, + "loss": 0.9219, + "step": 735 + }, + { + "epoch": 0.0405085585337663, + "grad_norm": 0.8909044861793518, + "learning_rate": 9.992401716089698e-06, + "loss": 0.8564, + "step": 736 + }, + { + "epoch": 0.04056359733612196, + "grad_norm": 1.0387929677963257, + "learning_rate": 9.992377809408001e-06, + "loss": 0.9533, + "step": 737 + }, + { + "epoch": 0.040618636138477625, + "grad_norm": 0.9044275879859924, + "learning_rate": 9.99235386520493e-06, + "loss": 0.8508, + "step": 738 + }, + { + "epoch": 0.040673674940833286, + "grad_norm": 1.019377589225769, + "learning_rate": 9.992329883480667e-06, + "loss": 0.8684, + "step": 739 + }, + { + "epoch": 0.04072871374318895, + "grad_norm": 0.9394627213478088, + "learning_rate": 9.992305864235385e-06, + "loss": 0.7665, + "step": 740 + }, + { + "epoch": 0.04078375254554461, + "grad_norm": 0.8652323484420776, + "learning_rate": 9.99228180746927e-06, + "loss": 0.8576, + "step": 741 + }, + { + "epoch": 0.04083879134790027, + "grad_norm": 0.9347619414329529, + "learning_rate": 9.992257713182502e-06, + "loss": 0.9586, + "step": 742 + }, + { + "epoch": 0.04089383015025593, + "grad_norm": 0.9510203003883362, + "learning_rate": 9.99223358137526e-06, + "loss": 0.9092, + "step": 743 + }, + { + "epoch": 0.04094886895261159, + "grad_norm": 0.8242866396903992, + "learning_rate": 9.992209412047729e-06, + "loss": 0.6997, + "step": 744 + }, + { + "epoch": 0.041003907754967255, + "grad_norm": 0.8842730522155762, + "learning_rate": 9.992185205200087e-06, + "loss": 0.8873, + "step": 745 + }, + { + "epoch": 0.041058946557322916, + "grad_norm": 1.0813730955123901, + "learning_rate": 9.992160960832518e-06, + "loss": 1.0162, + "step": 746 + }, + { + "epoch": 0.04111398535967857, + "grad_norm": 1.1276283264160156, + "learning_rate": 9.9921366789452e-06, + "loss": 1.0004, + "step": 747 + }, + { + "epoch": 0.04116902416203423, + "grad_norm": 0.8810326457023621, + "learning_rate": 9.992112359538323e-06, + "loss": 0.7823, + "step": 748 + }, + { + "epoch": 0.04122406296438989, + "grad_norm": 0.9939407110214233, + "learning_rate": 9.992088002612066e-06, + "loss": 1.0016, + "step": 749 + }, + { + "epoch": 0.041279101766745555, + "grad_norm": 1.0963523387908936, + "learning_rate": 9.99206360816661e-06, + "loss": 0.9252, + "step": 750 + }, + { + "epoch": 0.041334140569101216, + "grad_norm": 1.1346478462219238, + "learning_rate": 9.99203917620214e-06, + "loss": 0.9608, + "step": 751 + }, + { + "epoch": 0.04138917937145688, + "grad_norm": 1.0108580589294434, + "learning_rate": 9.992014706718841e-06, + "loss": 0.9179, + "step": 752 + }, + { + "epoch": 0.04144421817381254, + "grad_norm": 0.897293210029602, + "learning_rate": 9.991990199716894e-06, + "loss": 0.9295, + "step": 753 + }, + { + "epoch": 0.0414992569761682, + "grad_norm": 1.0152363777160645, + "learning_rate": 9.991965655196488e-06, + "loss": 0.8467, + "step": 754 + }, + { + "epoch": 0.04155429577852386, + "grad_norm": 0.8655388355255127, + "learning_rate": 9.9919410731578e-06, + "loss": 0.796, + "step": 755 + }, + { + "epoch": 0.04160933458087952, + "grad_norm": 1.0140331983566284, + "learning_rate": 9.991916453601023e-06, + "loss": 0.8444, + "step": 756 + }, + { + "epoch": 0.041664373383235184, + "grad_norm": 0.9387341141700745, + "learning_rate": 9.991891796526338e-06, + "loss": 0.8669, + "step": 757 + }, + { + "epoch": 0.04171941218559084, + "grad_norm": 0.9395696520805359, + "learning_rate": 9.991867101933928e-06, + "loss": 0.8376, + "step": 758 + }, + { + "epoch": 0.0417744509879465, + "grad_norm": 1.0856634378433228, + "learning_rate": 9.991842369823983e-06, + "loss": 0.9271, + "step": 759 + }, + { + "epoch": 0.04182948979030216, + "grad_norm": 0.8777190446853638, + "learning_rate": 9.991817600196687e-06, + "loss": 0.9197, + "step": 760 + }, + { + "epoch": 0.04188452859265782, + "grad_norm": 0.9639917016029358, + "learning_rate": 9.991792793052225e-06, + "loss": 0.8835, + "step": 761 + }, + { + "epoch": 0.041939567395013484, + "grad_norm": 0.9384773969650269, + "learning_rate": 9.991767948390785e-06, + "loss": 0.8403, + "step": 762 + }, + { + "epoch": 0.041994606197369146, + "grad_norm": 0.8987650275230408, + "learning_rate": 9.991743066212554e-06, + "loss": 0.7948, + "step": 763 + }, + { + "epoch": 0.04204964499972481, + "grad_norm": 1.0545049905776978, + "learning_rate": 9.991718146517717e-06, + "loss": 0.9359, + "step": 764 + }, + { + "epoch": 0.04210468380208047, + "grad_norm": 0.9840022325515747, + "learning_rate": 9.991693189306463e-06, + "loss": 0.9188, + "step": 765 + }, + { + "epoch": 0.04215972260443613, + "grad_norm": 0.8769927620887756, + "learning_rate": 9.991668194578981e-06, + "loss": 0.8647, + "step": 766 + }, + { + "epoch": 0.04221476140679179, + "grad_norm": 0.9268791675567627, + "learning_rate": 9.991643162335455e-06, + "loss": 0.897, + "step": 767 + }, + { + "epoch": 0.042269800209147446, + "grad_norm": 0.9316747784614563, + "learning_rate": 9.991618092576075e-06, + "loss": 0.9341, + "step": 768 + }, + { + "epoch": 0.04232483901150311, + "grad_norm": 0.8348364233970642, + "learning_rate": 9.991592985301031e-06, + "loss": 0.7528, + "step": 769 + }, + { + "epoch": 0.04237987781385877, + "grad_norm": 0.9139068126678467, + "learning_rate": 9.99156784051051e-06, + "loss": 0.8596, + "step": 770 + }, + { + "epoch": 0.04243491661621443, + "grad_norm": 0.9403928518295288, + "learning_rate": 9.991542658204701e-06, + "loss": 0.974, + "step": 771 + }, + { + "epoch": 0.04248995541857009, + "grad_norm": 0.993549108505249, + "learning_rate": 9.991517438383793e-06, + "loss": 0.9479, + "step": 772 + }, + { + "epoch": 0.04254499422092575, + "grad_norm": 0.8494916558265686, + "learning_rate": 9.991492181047975e-06, + "loss": 0.9149, + "step": 773 + }, + { + "epoch": 0.042600033023281414, + "grad_norm": 1.0351910591125488, + "learning_rate": 9.991466886197441e-06, + "loss": 0.9552, + "step": 774 + }, + { + "epoch": 0.042655071825637075, + "grad_norm": 0.916829526424408, + "learning_rate": 9.991441553832375e-06, + "loss": 0.8781, + "step": 775 + }, + { + "epoch": 0.04271011062799274, + "grad_norm": 1.113476276397705, + "learning_rate": 9.991416183952972e-06, + "loss": 0.8137, + "step": 776 + }, + { + "epoch": 0.0427651494303484, + "grad_norm": 1.1608171463012695, + "learning_rate": 9.991390776559421e-06, + "loss": 1.0045, + "step": 777 + }, + { + "epoch": 0.04282018823270406, + "grad_norm": 1.0045493841171265, + "learning_rate": 9.991365331651913e-06, + "loss": 0.8813, + "step": 778 + }, + { + "epoch": 0.042875227035059714, + "grad_norm": 0.918820858001709, + "learning_rate": 9.991339849230639e-06, + "loss": 0.9198, + "step": 779 + }, + { + "epoch": 0.042930265837415375, + "grad_norm": 0.9875735640525818, + "learning_rate": 9.991314329295792e-06, + "loss": 0.8665, + "step": 780 + }, + { + "epoch": 0.04298530463977104, + "grad_norm": 0.873768150806427, + "learning_rate": 9.991288771847561e-06, + "loss": 0.8606, + "step": 781 + }, + { + "epoch": 0.0430403434421267, + "grad_norm": 0.8892746567726135, + "learning_rate": 9.991263176886139e-06, + "loss": 0.9011, + "step": 782 + }, + { + "epoch": 0.04309538224448236, + "grad_norm": 1.097734808921814, + "learning_rate": 9.99123754441172e-06, + "loss": 1.009, + "step": 783 + }, + { + "epoch": 0.04315042104683802, + "grad_norm": 1.0065964460372925, + "learning_rate": 9.991211874424497e-06, + "loss": 0.9492, + "step": 784 + }, + { + "epoch": 0.04320545984919368, + "grad_norm": 1.0791678428649902, + "learning_rate": 9.99118616692466e-06, + "loss": 1.0142, + "step": 785 + }, + { + "epoch": 0.043260498651549344, + "grad_norm": 0.9454777836799622, + "learning_rate": 9.991160421912404e-06, + "loss": 0.8058, + "step": 786 + }, + { + "epoch": 0.043315537453905005, + "grad_norm": 0.9448156952857971, + "learning_rate": 9.991134639387922e-06, + "loss": 0.8184, + "step": 787 + }, + { + "epoch": 0.043370576256260666, + "grad_norm": 0.9636550545692444, + "learning_rate": 9.99110881935141e-06, + "loss": 0.8606, + "step": 788 + }, + { + "epoch": 0.04342561505861633, + "grad_norm": 0.9933613538742065, + "learning_rate": 9.991082961803058e-06, + "loss": 0.9449, + "step": 789 + }, + { + "epoch": 0.04348065386097198, + "grad_norm": 0.8906797170639038, + "learning_rate": 9.991057066743065e-06, + "loss": 0.8053, + "step": 790 + }, + { + "epoch": 0.043535692663327644, + "grad_norm": 1.0393906831741333, + "learning_rate": 9.991031134171621e-06, + "loss": 0.8487, + "step": 791 + }, + { + "epoch": 0.043590731465683305, + "grad_norm": 1.0618231296539307, + "learning_rate": 9.991005164088923e-06, + "loss": 0.9847, + "step": 792 + }, + { + "epoch": 0.043645770268038966, + "grad_norm": 0.9525149464607239, + "learning_rate": 9.990979156495167e-06, + "loss": 0.9318, + "step": 793 + }, + { + "epoch": 0.04370080907039463, + "grad_norm": 0.9430851936340332, + "learning_rate": 9.990953111390546e-06, + "loss": 0.8483, + "step": 794 + }, + { + "epoch": 0.04375584787275029, + "grad_norm": 0.9259672164916992, + "learning_rate": 9.99092702877526e-06, + "loss": 0.9365, + "step": 795 + }, + { + "epoch": 0.04381088667510595, + "grad_norm": 0.942609965801239, + "learning_rate": 9.9909009086495e-06, + "loss": 0.8408, + "step": 796 + }, + { + "epoch": 0.04386592547746161, + "grad_norm": 0.939255952835083, + "learning_rate": 9.990874751013467e-06, + "loss": 0.8749, + "step": 797 + }, + { + "epoch": 0.04392096427981727, + "grad_norm": 1.1701711416244507, + "learning_rate": 9.990848555867353e-06, + "loss": 0.9312, + "step": 798 + }, + { + "epoch": 0.043976003082172935, + "grad_norm": 1.0441124439239502, + "learning_rate": 9.990822323211358e-06, + "loss": 0.8618, + "step": 799 + }, + { + "epoch": 0.04403104188452859, + "grad_norm": 0.9601489305496216, + "learning_rate": 9.990796053045679e-06, + "loss": 0.9569, + "step": 800 + }, + { + "epoch": 0.04408608068688425, + "grad_norm": 0.9394032955169678, + "learning_rate": 9.990769745370513e-06, + "loss": 0.846, + "step": 801 + }, + { + "epoch": 0.04414111948923991, + "grad_norm": 0.9631348252296448, + "learning_rate": 9.990743400186056e-06, + "loss": 0.8754, + "step": 802 + }, + { + "epoch": 0.04419615829159557, + "grad_norm": 0.9234963059425354, + "learning_rate": 9.990717017492508e-06, + "loss": 0.8613, + "step": 803 + }, + { + "epoch": 0.044251197093951235, + "grad_norm": 0.9169090390205383, + "learning_rate": 9.990690597290069e-06, + "loss": 0.8867, + "step": 804 + }, + { + "epoch": 0.044306235896306896, + "grad_norm": 1.0194867849349976, + "learning_rate": 9.990664139578933e-06, + "loss": 0.8675, + "step": 805 + }, + { + "epoch": 0.04436127469866256, + "grad_norm": 1.3226114511489868, + "learning_rate": 9.990637644359302e-06, + "loss": 0.997, + "step": 806 + }, + { + "epoch": 0.04441631350101822, + "grad_norm": 0.8904317617416382, + "learning_rate": 9.990611111631374e-06, + "loss": 0.7274, + "step": 807 + }, + { + "epoch": 0.04447135230337388, + "grad_norm": 0.8909007906913757, + "learning_rate": 9.99058454139535e-06, + "loss": 0.8141, + "step": 808 + }, + { + "epoch": 0.04452639110572954, + "grad_norm": 1.004015564918518, + "learning_rate": 9.990557933651429e-06, + "loss": 0.9883, + "step": 809 + }, + { + "epoch": 0.0445814299080852, + "grad_norm": 1.1215732097625732, + "learning_rate": 9.990531288399807e-06, + "loss": 0.9355, + "step": 810 + }, + { + "epoch": 0.04463646871044086, + "grad_norm": 1.0545012950897217, + "learning_rate": 9.99050460564069e-06, + "loss": 0.9532, + "step": 811 + }, + { + "epoch": 0.04469150751279652, + "grad_norm": 0.9608867168426514, + "learning_rate": 9.990477885374277e-06, + "loss": 0.9363, + "step": 812 + }, + { + "epoch": 0.04474654631515218, + "grad_norm": 0.8750461935997009, + "learning_rate": 9.990451127600766e-06, + "loss": 0.7343, + "step": 813 + }, + { + "epoch": 0.04480158511750784, + "grad_norm": 0.891740620136261, + "learning_rate": 9.99042433232036e-06, + "loss": 0.8541, + "step": 814 + }, + { + "epoch": 0.0448566239198635, + "grad_norm": 1.1520029306411743, + "learning_rate": 9.990397499533264e-06, + "loss": 0.7696, + "step": 815 + }, + { + "epoch": 0.044911662722219164, + "grad_norm": 0.9526278972625732, + "learning_rate": 9.990370629239673e-06, + "loss": 0.8953, + "step": 816 + }, + { + "epoch": 0.044966701524574826, + "grad_norm": 0.9218434691429138, + "learning_rate": 9.990343721439795e-06, + "loss": 0.8198, + "step": 817 + }, + { + "epoch": 0.04502174032693049, + "grad_norm": 0.8502745628356934, + "learning_rate": 9.990316776133827e-06, + "loss": 0.8035, + "step": 818 + }, + { + "epoch": 0.04507677912928615, + "grad_norm": 0.8861565589904785, + "learning_rate": 9.990289793321975e-06, + "loss": 0.8626, + "step": 819 + }, + { + "epoch": 0.04513181793164181, + "grad_norm": 1.1113256216049194, + "learning_rate": 9.99026277300444e-06, + "loss": 0.9363, + "step": 820 + }, + { + "epoch": 0.04518685673399747, + "grad_norm": 0.9984708428382874, + "learning_rate": 9.990235715181426e-06, + "loss": 1.0376, + "step": 821 + }, + { + "epoch": 0.045241895536353126, + "grad_norm": 0.9026711583137512, + "learning_rate": 9.990208619853137e-06, + "loss": 0.9079, + "step": 822 + }, + { + "epoch": 0.04529693433870879, + "grad_norm": 0.8724965453147888, + "learning_rate": 9.990181487019775e-06, + "loss": 0.8665, + "step": 823 + }, + { + "epoch": 0.04535197314106445, + "grad_norm": 0.8923047780990601, + "learning_rate": 9.990154316681543e-06, + "loss": 0.7779, + "step": 824 + }, + { + "epoch": 0.04540701194342011, + "grad_norm": 0.9024640321731567, + "learning_rate": 9.99012710883865e-06, + "loss": 0.8859, + "step": 825 + }, + { + "epoch": 0.04546205074577577, + "grad_norm": 0.9245888590812683, + "learning_rate": 9.990099863491296e-06, + "loss": 0.8501, + "step": 826 + }, + { + "epoch": 0.04551708954813143, + "grad_norm": 0.9257050156593323, + "learning_rate": 9.990072580639687e-06, + "loss": 0.9561, + "step": 827 + }, + { + "epoch": 0.045572128350487094, + "grad_norm": 0.995610773563385, + "learning_rate": 9.99004526028403e-06, + "loss": 0.917, + "step": 828 + }, + { + "epoch": 0.045627167152842756, + "grad_norm": 0.9524009823799133, + "learning_rate": 9.990017902424525e-06, + "loss": 0.9184, + "step": 829 + }, + { + "epoch": 0.04568220595519842, + "grad_norm": 0.9264503121376038, + "learning_rate": 9.989990507061385e-06, + "loss": 0.8615, + "step": 830 + }, + { + "epoch": 0.04573724475755408, + "grad_norm": 1.0068570375442505, + "learning_rate": 9.989963074194809e-06, + "loss": 0.8331, + "step": 831 + }, + { + "epoch": 0.04579228355990974, + "grad_norm": 0.9295952320098877, + "learning_rate": 9.989935603825009e-06, + "loss": 0.8387, + "step": 832 + }, + { + "epoch": 0.045847322362265394, + "grad_norm": 1.0408827066421509, + "learning_rate": 9.989908095952186e-06, + "loss": 0.9686, + "step": 833 + }, + { + "epoch": 0.045902361164621056, + "grad_norm": 0.8874136209487915, + "learning_rate": 9.989880550576551e-06, + "loss": 0.815, + "step": 834 + }, + { + "epoch": 0.04595739996697672, + "grad_norm": 0.9898836016654968, + "learning_rate": 9.989852967698311e-06, + "loss": 0.9458, + "step": 835 + }, + { + "epoch": 0.04601243876933238, + "grad_norm": 0.9828970432281494, + "learning_rate": 9.989825347317668e-06, + "loss": 0.7922, + "step": 836 + }, + { + "epoch": 0.04606747757168804, + "grad_norm": 1.025447964668274, + "learning_rate": 9.989797689434836e-06, + "loss": 0.9349, + "step": 837 + }, + { + "epoch": 0.0461225163740437, + "grad_norm": 0.8623831272125244, + "learning_rate": 9.98976999405002e-06, + "loss": 0.8786, + "step": 838 + }, + { + "epoch": 0.04617755517639936, + "grad_norm": 0.9614997506141663, + "learning_rate": 9.98974226116343e-06, + "loss": 0.7885, + "step": 839 + }, + { + "epoch": 0.046232593978755024, + "grad_norm": 1.0207616090774536, + "learning_rate": 9.989714490775269e-06, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.046287632781110685, + "grad_norm": 0.8509595990180969, + "learning_rate": 9.98968668288575e-06, + "loss": 0.7312, + "step": 841 + }, + { + "epoch": 0.04634267158346635, + "grad_norm": 0.9822607040405273, + "learning_rate": 9.989658837495084e-06, + "loss": 0.952, + "step": 842 + }, + { + "epoch": 0.046397710385822, + "grad_norm": 1.0058252811431885, + "learning_rate": 9.989630954603477e-06, + "loss": 0.8811, + "step": 843 + }, + { + "epoch": 0.04645274918817766, + "grad_norm": 1.0146985054016113, + "learning_rate": 9.989603034211139e-06, + "loss": 0.9051, + "step": 844 + }, + { + "epoch": 0.046507787990533324, + "grad_norm": 0.8976503610610962, + "learning_rate": 9.98957507631828e-06, + "loss": 0.879, + "step": 845 + }, + { + "epoch": 0.046562826792888985, + "grad_norm": 0.8791939616203308, + "learning_rate": 9.989547080925111e-06, + "loss": 0.8944, + "step": 846 + }, + { + "epoch": 0.04661786559524465, + "grad_norm": 0.8530884981155396, + "learning_rate": 9.989519048031842e-06, + "loss": 0.9029, + "step": 847 + }, + { + "epoch": 0.04667290439760031, + "grad_norm": 0.9621617197990417, + "learning_rate": 9.989490977638683e-06, + "loss": 0.8374, + "step": 848 + }, + { + "epoch": 0.04672794319995597, + "grad_norm": 0.9629075527191162, + "learning_rate": 9.989462869745845e-06, + "loss": 0.9032, + "step": 849 + }, + { + "epoch": 0.04678298200231163, + "grad_norm": 1.3256126642227173, + "learning_rate": 9.989434724353541e-06, + "loss": 0.9748, + "step": 850 + }, + { + "epoch": 0.04683802080466729, + "grad_norm": 1.0230494737625122, + "learning_rate": 9.989406541461979e-06, + "loss": 0.9752, + "step": 851 + }, + { + "epoch": 0.046893059607022954, + "grad_norm": 0.8454533219337463, + "learning_rate": 9.989378321071375e-06, + "loss": 0.8426, + "step": 852 + }, + { + "epoch": 0.046948098409378615, + "grad_norm": 0.9995863437652588, + "learning_rate": 9.989350063181939e-06, + "loss": 0.9955, + "step": 853 + }, + { + "epoch": 0.04700313721173427, + "grad_norm": 0.8956604599952698, + "learning_rate": 9.989321767793883e-06, + "loss": 0.9024, + "step": 854 + }, + { + "epoch": 0.04705817601408993, + "grad_norm": 1.0123292207717896, + "learning_rate": 9.989293434907419e-06, + "loss": 0.7856, + "step": 855 + }, + { + "epoch": 0.04711321481644559, + "grad_norm": 0.814577043056488, + "learning_rate": 9.989265064522762e-06, + "loss": 0.8377, + "step": 856 + }, + { + "epoch": 0.047168253618801254, + "grad_norm": 1.1571552753448486, + "learning_rate": 9.989236656640125e-06, + "loss": 0.8562, + "step": 857 + }, + { + "epoch": 0.047223292421156915, + "grad_norm": 0.9681577682495117, + "learning_rate": 9.98920821125972e-06, + "loss": 0.8473, + "step": 858 + }, + { + "epoch": 0.047278331223512576, + "grad_norm": 0.9680121541023254, + "learning_rate": 9.989179728381761e-06, + "loss": 0.9811, + "step": 859 + }, + { + "epoch": 0.04733337002586824, + "grad_norm": 0.985477089881897, + "learning_rate": 9.989151208006464e-06, + "loss": 0.6994, + "step": 860 + }, + { + "epoch": 0.0473884088282239, + "grad_norm": 0.8612962365150452, + "learning_rate": 9.98912265013404e-06, + "loss": 0.7667, + "step": 861 + }, + { + "epoch": 0.04744344763057956, + "grad_norm": 0.8884604573249817, + "learning_rate": 9.989094054764708e-06, + "loss": 0.8382, + "step": 862 + }, + { + "epoch": 0.04749848643293522, + "grad_norm": 1.036881923675537, + "learning_rate": 9.989065421898681e-06, + "loss": 0.8748, + "step": 863 + }, + { + "epoch": 0.04755352523529088, + "grad_norm": 0.9954493045806885, + "learning_rate": 9.989036751536171e-06, + "loss": 0.9174, + "step": 864 + }, + { + "epoch": 0.04760856403764654, + "grad_norm": 0.9984694123268127, + "learning_rate": 9.989008043677399e-06, + "loss": 0.7636, + "step": 865 + }, + { + "epoch": 0.0476636028400022, + "grad_norm": 1.0412588119506836, + "learning_rate": 9.988979298322576e-06, + "loss": 0.773, + "step": 866 + }, + { + "epoch": 0.04771864164235786, + "grad_norm": 0.8034874796867371, + "learning_rate": 9.98895051547192e-06, + "loss": 0.7914, + "step": 867 + }, + { + "epoch": 0.04777368044471352, + "grad_norm": 0.8983979225158691, + "learning_rate": 9.988921695125648e-06, + "loss": 0.7292, + "step": 868 + }, + { + "epoch": 0.04782871924706918, + "grad_norm": 0.9445077776908875, + "learning_rate": 9.988892837283976e-06, + "loss": 0.8263, + "step": 869 + }, + { + "epoch": 0.047883758049424845, + "grad_norm": 1.0753306150436401, + "learning_rate": 9.988863941947121e-06, + "loss": 1.1122, + "step": 870 + }, + { + "epoch": 0.047938796851780506, + "grad_norm": 1.0091484785079956, + "learning_rate": 9.9888350091153e-06, + "loss": 0.9276, + "step": 871 + }, + { + "epoch": 0.04799383565413617, + "grad_norm": 1.0977306365966797, + "learning_rate": 9.988806038788732e-06, + "loss": 0.854, + "step": 872 + }, + { + "epoch": 0.04804887445649183, + "grad_norm": 1.0285007953643799, + "learning_rate": 9.988777030967632e-06, + "loss": 0.9441, + "step": 873 + }, + { + "epoch": 0.04810391325884749, + "grad_norm": 0.8973976373672485, + "learning_rate": 9.988747985652218e-06, + "loss": 0.786, + "step": 874 + }, + { + "epoch": 0.04815895206120315, + "grad_norm": 0.9809553623199463, + "learning_rate": 9.98871890284271e-06, + "loss": 0.9042, + "step": 875 + }, + { + "epoch": 0.048213990863558806, + "grad_norm": 0.8514279723167419, + "learning_rate": 9.988689782539326e-06, + "loss": 0.7874, + "step": 876 + }, + { + "epoch": 0.04826902966591447, + "grad_norm": 0.8299674391746521, + "learning_rate": 9.988660624742286e-06, + "loss": 0.8704, + "step": 877 + }, + { + "epoch": 0.04832406846827013, + "grad_norm": 0.9862462282180786, + "learning_rate": 9.988631429451809e-06, + "loss": 0.9963, + "step": 878 + }, + { + "epoch": 0.04837910727062579, + "grad_norm": 0.9041131734848022, + "learning_rate": 9.988602196668111e-06, + "loss": 0.9207, + "step": 879 + }, + { + "epoch": 0.04843414607298145, + "grad_norm": 0.8597276210784912, + "learning_rate": 9.988572926391416e-06, + "loss": 0.8226, + "step": 880 + }, + { + "epoch": 0.04848918487533711, + "grad_norm": 0.9494329690933228, + "learning_rate": 9.988543618621941e-06, + "loss": 0.8834, + "step": 881 + }, + { + "epoch": 0.048544223677692774, + "grad_norm": 0.9129118323326111, + "learning_rate": 9.98851427335991e-06, + "loss": 0.7819, + "step": 882 + }, + { + "epoch": 0.048599262480048436, + "grad_norm": 0.9145999550819397, + "learning_rate": 9.988484890605539e-06, + "loss": 0.885, + "step": 883 + }, + { + "epoch": 0.0486543012824041, + "grad_norm": 1.0115307569503784, + "learning_rate": 9.98845547035905e-06, + "loss": 0.8347, + "step": 884 + }, + { + "epoch": 0.04870934008475976, + "grad_norm": 1.1372706890106201, + "learning_rate": 9.988426012620667e-06, + "loss": 0.944, + "step": 885 + }, + { + "epoch": 0.04876437888711541, + "grad_norm": 0.9502811431884766, + "learning_rate": 9.98839651739061e-06, + "loss": 0.9054, + "step": 886 + }, + { + "epoch": 0.048819417689471074, + "grad_norm": 0.9612823128700256, + "learning_rate": 9.988366984669097e-06, + "loss": 0.8796, + "step": 887 + }, + { + "epoch": 0.048874456491826736, + "grad_norm": 0.9551461935043335, + "learning_rate": 9.988337414456355e-06, + "loss": 0.8769, + "step": 888 + }, + { + "epoch": 0.0489294952941824, + "grad_norm": 0.8554086089134216, + "learning_rate": 9.988307806752603e-06, + "loss": 0.892, + "step": 889 + }, + { + "epoch": 0.04898453409653806, + "grad_norm": 0.8418886661529541, + "learning_rate": 9.988278161558067e-06, + "loss": 0.7568, + "step": 890 + }, + { + "epoch": 0.04903957289889372, + "grad_norm": 1.4780360460281372, + "learning_rate": 9.988248478872967e-06, + "loss": 0.9126, + "step": 891 + }, + { + "epoch": 0.04909461170124938, + "grad_norm": 0.8236714005470276, + "learning_rate": 9.988218758697526e-06, + "loss": 0.7317, + "step": 892 + }, + { + "epoch": 0.04914965050360504, + "grad_norm": 0.8777141571044922, + "learning_rate": 9.988189001031968e-06, + "loss": 0.7989, + "step": 893 + }, + { + "epoch": 0.049204689305960704, + "grad_norm": 1.0235031843185425, + "learning_rate": 9.988159205876516e-06, + "loss": 0.8335, + "step": 894 + }, + { + "epoch": 0.049259728108316365, + "grad_norm": 0.9340357184410095, + "learning_rate": 9.988129373231395e-06, + "loss": 0.8129, + "step": 895 + }, + { + "epoch": 0.04931476691067203, + "grad_norm": 1.7686667442321777, + "learning_rate": 9.98809950309683e-06, + "loss": 0.9792, + "step": 896 + }, + { + "epoch": 0.04936980571302768, + "grad_norm": 0.9252369403839111, + "learning_rate": 9.988069595473044e-06, + "loss": 0.8671, + "step": 897 + }, + { + "epoch": 0.04942484451538334, + "grad_norm": 0.9989960789680481, + "learning_rate": 9.988039650360262e-06, + "loss": 0.9245, + "step": 898 + }, + { + "epoch": 0.049479883317739004, + "grad_norm": 1.062912106513977, + "learning_rate": 9.98800966775871e-06, + "loss": 0.9146, + "step": 899 + }, + { + "epoch": 0.049534922120094665, + "grad_norm": 0.8698169589042664, + "learning_rate": 9.98797964766861e-06, + "loss": 0.8606, + "step": 900 + }, + { + "epoch": 0.04958996092245033, + "grad_norm": 1.6754224300384521, + "learning_rate": 9.98794959009019e-06, + "loss": 0.9236, + "step": 901 + }, + { + "epoch": 0.04964499972480599, + "grad_norm": 1.084174394607544, + "learning_rate": 9.98791949502368e-06, + "loss": 0.9252, + "step": 902 + }, + { + "epoch": 0.04970003852716165, + "grad_norm": 0.9866724610328674, + "learning_rate": 9.987889362469301e-06, + "loss": 0.9096, + "step": 903 + }, + { + "epoch": 0.04975507732951731, + "grad_norm": 0.8814040422439575, + "learning_rate": 9.987859192427279e-06, + "loss": 0.8475, + "step": 904 + }, + { + "epoch": 0.04981011613187297, + "grad_norm": 0.8796457052230835, + "learning_rate": 9.987828984897843e-06, + "loss": 0.8478, + "step": 905 + }, + { + "epoch": 0.049865154934228634, + "grad_norm": 1.0541884899139404, + "learning_rate": 9.98779873988122e-06, + "loss": 0.9799, + "step": 906 + }, + { + "epoch": 0.049920193736584295, + "grad_norm": 0.91409832239151, + "learning_rate": 9.987768457377636e-06, + "loss": 0.8701, + "step": 907 + }, + { + "epoch": 0.04997523253893995, + "grad_norm": 1.0120370388031006, + "learning_rate": 9.98773813738732e-06, + "loss": 0.8417, + "step": 908 + }, + { + "epoch": 0.05003027134129561, + "grad_norm": 1.7744206190109253, + "learning_rate": 9.987707779910499e-06, + "loss": 0.9263, + "step": 909 + }, + { + "epoch": 0.05008531014365127, + "grad_norm": 0.9423969388008118, + "learning_rate": 9.987677384947402e-06, + "loss": 0.9667, + "step": 910 + }, + { + "epoch": 0.050140348946006934, + "grad_norm": 1.5940319299697876, + "learning_rate": 9.987646952498256e-06, + "loss": 0.9223, + "step": 911 + }, + { + "epoch": 0.050195387748362595, + "grad_norm": 0.941792368888855, + "learning_rate": 9.987616482563292e-06, + "loss": 0.895, + "step": 912 + }, + { + "epoch": 0.05025042655071826, + "grad_norm": 3.1945221424102783, + "learning_rate": 9.987585975142738e-06, + "loss": 0.837, + "step": 913 + }, + { + "epoch": 0.05030546535307392, + "grad_norm": 2.0819199085235596, + "learning_rate": 9.98755543023682e-06, + "loss": 0.918, + "step": 914 + }, + { + "epoch": 0.05036050415542958, + "grad_norm": 0.984282910823822, + "learning_rate": 9.987524847845773e-06, + "loss": 0.8589, + "step": 915 + }, + { + "epoch": 0.05041554295778524, + "grad_norm": 0.9021026492118835, + "learning_rate": 9.987494227969823e-06, + "loss": 0.9053, + "step": 916 + }, + { + "epoch": 0.0504705817601409, + "grad_norm": 2.6515462398529053, + "learning_rate": 9.9874635706092e-06, + "loss": 0.8874, + "step": 917 + }, + { + "epoch": 0.050525620562496563, + "grad_norm": 0.8893095254898071, + "learning_rate": 9.98743287576414e-06, + "loss": 0.8259, + "step": 918 + }, + { + "epoch": 0.05058065936485222, + "grad_norm": 0.9897775650024414, + "learning_rate": 9.987402143434868e-06, + "loss": 0.877, + "step": 919 + }, + { + "epoch": 0.05063569816720788, + "grad_norm": 0.9391944408416748, + "learning_rate": 9.987371373621614e-06, + "loss": 0.9363, + "step": 920 + }, + { + "epoch": 0.05069073696956354, + "grad_norm": 0.9585913419723511, + "learning_rate": 9.987340566324615e-06, + "loss": 0.8704, + "step": 921 + }, + { + "epoch": 0.0507457757719192, + "grad_norm": 0.9210980534553528, + "learning_rate": 9.987309721544098e-06, + "loss": 0.9321, + "step": 922 + }, + { + "epoch": 0.05080081457427486, + "grad_norm": 1.0713307857513428, + "learning_rate": 9.987278839280295e-06, + "loss": 0.9489, + "step": 923 + }, + { + "epoch": 0.050855853376630525, + "grad_norm": 1.0178636312484741, + "learning_rate": 9.98724791953344e-06, + "loss": 0.853, + "step": 924 + }, + { + "epoch": 0.050910892178986186, + "grad_norm": 0.9782636761665344, + "learning_rate": 9.987216962303766e-06, + "loss": 0.924, + "step": 925 + }, + { + "epoch": 0.05096593098134185, + "grad_norm": 0.9474522471427917, + "learning_rate": 9.987185967591503e-06, + "loss": 0.8619, + "step": 926 + }, + { + "epoch": 0.05102096978369751, + "grad_norm": 1.1875778436660767, + "learning_rate": 9.987154935396885e-06, + "loss": 1.012, + "step": 927 + }, + { + "epoch": 0.05107600858605317, + "grad_norm": 1.0585243701934814, + "learning_rate": 9.987123865720147e-06, + "loss": 0.9019, + "step": 928 + }, + { + "epoch": 0.051131047388408825, + "grad_norm": 0.9848800897598267, + "learning_rate": 9.98709275856152e-06, + "loss": 0.9665, + "step": 929 + }, + { + "epoch": 0.051186086190764486, + "grad_norm": 1.04201078414917, + "learning_rate": 9.987061613921238e-06, + "loss": 0.9269, + "step": 930 + }, + { + "epoch": 0.05124112499312015, + "grad_norm": 1.1748600006103516, + "learning_rate": 9.987030431799537e-06, + "loss": 0.8565, + "step": 931 + }, + { + "epoch": 0.05129616379547581, + "grad_norm": 1.879232406616211, + "learning_rate": 9.98699921219665e-06, + "loss": 0.8535, + "step": 932 + }, + { + "epoch": 0.05135120259783147, + "grad_norm": 0.9837847948074341, + "learning_rate": 9.986967955112812e-06, + "loss": 0.927, + "step": 933 + }, + { + "epoch": 0.05140624140018713, + "grad_norm": 0.8637211918830872, + "learning_rate": 9.986936660548257e-06, + "loss": 0.7903, + "step": 934 + }, + { + "epoch": 0.05146128020254279, + "grad_norm": 0.9078792929649353, + "learning_rate": 9.986905328503222e-06, + "loss": 0.9135, + "step": 935 + }, + { + "epoch": 0.051516319004898455, + "grad_norm": 0.9763005971908569, + "learning_rate": 9.98687395897794e-06, + "loss": 0.9006, + "step": 936 + }, + { + "epoch": 0.051571357807254116, + "grad_norm": 1.0174345970153809, + "learning_rate": 9.98684255197265e-06, + "loss": 0.9294, + "step": 937 + }, + { + "epoch": 0.05162639660960978, + "grad_norm": 0.8709769248962402, + "learning_rate": 9.986811107487584e-06, + "loss": 0.7986, + "step": 938 + }, + { + "epoch": 0.05168143541196544, + "grad_norm": 0.8717525601387024, + "learning_rate": 9.986779625522983e-06, + "loss": 0.8705, + "step": 939 + }, + { + "epoch": 0.05173647421432109, + "grad_norm": 0.9682945013046265, + "learning_rate": 9.98674810607908e-06, + "loss": 0.8127, + "step": 940 + }, + { + "epoch": 0.051791513016676755, + "grad_norm": 1.0248037576675415, + "learning_rate": 9.986716549156113e-06, + "loss": 0.9217, + "step": 941 + }, + { + "epoch": 0.051846551819032416, + "grad_norm": 0.9883397221565247, + "learning_rate": 9.98668495475432e-06, + "loss": 0.853, + "step": 942 + }, + { + "epoch": 0.05190159062138808, + "grad_norm": 0.9271108508110046, + "learning_rate": 9.986653322873937e-06, + "loss": 0.8807, + "step": 943 + }, + { + "epoch": 0.05195662942374374, + "grad_norm": 0.9027101397514343, + "learning_rate": 9.986621653515203e-06, + "loss": 0.88, + "step": 944 + }, + { + "epoch": 0.0520116682260994, + "grad_norm": 0.9807021617889404, + "learning_rate": 9.986589946678354e-06, + "loss": 0.8922, + "step": 945 + }, + { + "epoch": 0.05206670702845506, + "grad_norm": 0.8779157400131226, + "learning_rate": 9.98655820236363e-06, + "loss": 0.8988, + "step": 946 + }, + { + "epoch": 0.05212174583081072, + "grad_norm": 0.8182910680770874, + "learning_rate": 9.986526420571272e-06, + "loss": 0.7534, + "step": 947 + }, + { + "epoch": 0.052176784633166384, + "grad_norm": 0.9205981492996216, + "learning_rate": 9.986494601301513e-06, + "loss": 0.7516, + "step": 948 + }, + { + "epoch": 0.052231823435522046, + "grad_norm": 0.9829681515693665, + "learning_rate": 9.986462744554598e-06, + "loss": 0.9358, + "step": 949 + }, + { + "epoch": 0.05228686223787771, + "grad_norm": 0.8869890570640564, + "learning_rate": 9.986430850330762e-06, + "loss": 0.7933, + "step": 950 + }, + { + "epoch": 0.05234190104023336, + "grad_norm": 1.0226716995239258, + "learning_rate": 9.986398918630248e-06, + "loss": 0.9523, + "step": 951 + }, + { + "epoch": 0.05239693984258902, + "grad_norm": 0.9549778699874878, + "learning_rate": 9.986366949453293e-06, + "loss": 0.9368, + "step": 952 + }, + { + "epoch": 0.052451978644944684, + "grad_norm": 0.860454797744751, + "learning_rate": 9.98633494280014e-06, + "loss": 0.7618, + "step": 953 + }, + { + "epoch": 0.052507017447300346, + "grad_norm": 0.9623841643333435, + "learning_rate": 9.986302898671027e-06, + "loss": 0.8356, + "step": 954 + }, + { + "epoch": 0.05256205624965601, + "grad_norm": 0.9236606359481812, + "learning_rate": 9.986270817066196e-06, + "loss": 0.921, + "step": 955 + }, + { + "epoch": 0.05261709505201167, + "grad_norm": 1.0599812269210815, + "learning_rate": 9.98623869798589e-06, + "loss": 0.8082, + "step": 956 + }, + { + "epoch": 0.05267213385436733, + "grad_norm": 1.0321687459945679, + "learning_rate": 9.986206541430347e-06, + "loss": 0.9001, + "step": 957 + }, + { + "epoch": 0.05272717265672299, + "grad_norm": 0.8884543776512146, + "learning_rate": 9.986174347399813e-06, + "loss": 0.8317, + "step": 958 + }, + { + "epoch": 0.05278221145907865, + "grad_norm": 0.9592668414115906, + "learning_rate": 9.986142115894526e-06, + "loss": 0.9955, + "step": 959 + }, + { + "epoch": 0.052837250261434314, + "grad_norm": 0.9604032039642334, + "learning_rate": 9.986109846914729e-06, + "loss": 0.876, + "step": 960 + }, + { + "epoch": 0.052892289063789975, + "grad_norm": 0.9837536811828613, + "learning_rate": 9.986077540460664e-06, + "loss": 0.8247, + "step": 961 + }, + { + "epoch": 0.05294732786614563, + "grad_norm": 0.8570861220359802, + "learning_rate": 9.986045196532576e-06, + "loss": 0.879, + "step": 962 + }, + { + "epoch": 0.05300236666850129, + "grad_norm": 0.8441471457481384, + "learning_rate": 9.986012815130708e-06, + "loss": 0.8979, + "step": 963 + }, + { + "epoch": 0.05305740547085695, + "grad_norm": 0.8976197838783264, + "learning_rate": 9.985980396255302e-06, + "loss": 0.9382, + "step": 964 + }, + { + "epoch": 0.053112444273212614, + "grad_norm": 0.9685307741165161, + "learning_rate": 9.985947939906599e-06, + "loss": 0.8627, + "step": 965 + }, + { + "epoch": 0.053167483075568275, + "grad_norm": 0.8939018249511719, + "learning_rate": 9.98591544608485e-06, + "loss": 0.9221, + "step": 966 + }, + { + "epoch": 0.05322252187792394, + "grad_norm": 0.9218310713768005, + "learning_rate": 9.985882914790292e-06, + "loss": 0.8356, + "step": 967 + }, + { + "epoch": 0.0532775606802796, + "grad_norm": 0.9342261552810669, + "learning_rate": 9.985850346023174e-06, + "loss": 0.971, + "step": 968 + }, + { + "epoch": 0.05333259948263526, + "grad_norm": 1.0860705375671387, + "learning_rate": 9.985817739783741e-06, + "loss": 0.906, + "step": 969 + }, + { + "epoch": 0.05338763828499092, + "grad_norm": 0.8675006031990051, + "learning_rate": 9.985785096072234e-06, + "loss": 0.906, + "step": 970 + }, + { + "epoch": 0.05344267708734658, + "grad_norm": 0.8170626163482666, + "learning_rate": 9.985752414888903e-06, + "loss": 0.8109, + "step": 971 + }, + { + "epoch": 0.05349771588970224, + "grad_norm": 0.936434805393219, + "learning_rate": 9.98571969623399e-06, + "loss": 0.9219, + "step": 972 + }, + { + "epoch": 0.0535527546920579, + "grad_norm": 0.9316715002059937, + "learning_rate": 9.985686940107741e-06, + "loss": 0.8569, + "step": 973 + }, + { + "epoch": 0.05360779349441356, + "grad_norm": 1.183008074760437, + "learning_rate": 9.985654146510405e-06, + "loss": 0.837, + "step": 974 + }, + { + "epoch": 0.05366283229676922, + "grad_norm": 1.0788745880126953, + "learning_rate": 9.98562131544223e-06, + "loss": 0.8822, + "step": 975 + }, + { + "epoch": 0.05371787109912488, + "grad_norm": 0.9285461902618408, + "learning_rate": 9.985588446903455e-06, + "loss": 0.9279, + "step": 976 + }, + { + "epoch": 0.053772909901480544, + "grad_norm": 0.9389022588729858, + "learning_rate": 9.985555540894334e-06, + "loss": 0.9839, + "step": 977 + }, + { + "epoch": 0.053827948703836205, + "grad_norm": 0.8920616507530212, + "learning_rate": 9.985522597415112e-06, + "loss": 0.9205, + "step": 978 + }, + { + "epoch": 0.053882987506191866, + "grad_norm": 0.9755093455314636, + "learning_rate": 9.985489616466035e-06, + "loss": 0.8987, + "step": 979 + }, + { + "epoch": 0.05393802630854753, + "grad_norm": 0.96027010679245, + "learning_rate": 9.985456598047356e-06, + "loss": 0.8543, + "step": 980 + }, + { + "epoch": 0.05399306511090319, + "grad_norm": 1.0489718914031982, + "learning_rate": 9.985423542159317e-06, + "loss": 0.9179, + "step": 981 + }, + { + "epoch": 0.05404810391325885, + "grad_norm": 0.8665526509284973, + "learning_rate": 9.985390448802171e-06, + "loss": 0.9047, + "step": 982 + }, + { + "epoch": 0.054103142715614505, + "grad_norm": 0.8849464654922485, + "learning_rate": 9.985357317976163e-06, + "loss": 0.8892, + "step": 983 + }, + { + "epoch": 0.054158181517970166, + "grad_norm": 1.0083115100860596, + "learning_rate": 9.985324149681545e-06, + "loss": 0.7713, + "step": 984 + }, + { + "epoch": 0.05421322032032583, + "grad_norm": 0.8233863711357117, + "learning_rate": 9.985290943918565e-06, + "loss": 0.7967, + "step": 985 + }, + { + "epoch": 0.05426825912268149, + "grad_norm": 0.9615303874015808, + "learning_rate": 9.985257700687472e-06, + "loss": 0.8576, + "step": 986 + }, + { + "epoch": 0.05432329792503715, + "grad_norm": 0.8856416344642639, + "learning_rate": 9.985224419988517e-06, + "loss": 0.8614, + "step": 987 + }, + { + "epoch": 0.05437833672739281, + "grad_norm": 0.968325674533844, + "learning_rate": 9.98519110182195e-06, + "loss": 0.8247, + "step": 988 + }, + { + "epoch": 0.05443337552974847, + "grad_norm": 0.878402054309845, + "learning_rate": 9.985157746188021e-06, + "loss": 0.8661, + "step": 989 + }, + { + "epoch": 0.054488414332104135, + "grad_norm": 0.8376438021659851, + "learning_rate": 9.985124353086981e-06, + "loss": 0.8554, + "step": 990 + }, + { + "epoch": 0.054543453134459796, + "grad_norm": 1.0293036699295044, + "learning_rate": 9.98509092251908e-06, + "loss": 0.8049, + "step": 991 + }, + { + "epoch": 0.05459849193681546, + "grad_norm": 1.2345234155654907, + "learning_rate": 9.98505745448457e-06, + "loss": 1.0358, + "step": 992 + }, + { + "epoch": 0.05465353073917112, + "grad_norm": 0.9974482655525208, + "learning_rate": 9.985023948983703e-06, + "loss": 0.9329, + "step": 993 + }, + { + "epoch": 0.05470856954152677, + "grad_norm": 1.383955478668213, + "learning_rate": 9.984990406016732e-06, + "loss": 0.8688, + "step": 994 + }, + { + "epoch": 0.054763608343882435, + "grad_norm": 0.9369306564331055, + "learning_rate": 9.984956825583906e-06, + "loss": 0.8308, + "step": 995 + }, + { + "epoch": 0.054818647146238096, + "grad_norm": 0.8676120042800903, + "learning_rate": 9.984923207685478e-06, + "loss": 0.8283, + "step": 996 + }, + { + "epoch": 0.05487368594859376, + "grad_norm": 0.9218453168869019, + "learning_rate": 9.984889552321704e-06, + "loss": 0.7247, + "step": 997 + }, + { + "epoch": 0.05492872475094942, + "grad_norm": 0.8575478196144104, + "learning_rate": 9.984855859492833e-06, + "loss": 0.8462, + "step": 998 + }, + { + "epoch": 0.05498376355330508, + "grad_norm": 1.0042616128921509, + "learning_rate": 9.98482212919912e-06, + "loss": 0.9383, + "step": 999 + }, + { + "epoch": 0.05503880235566074, + "grad_norm": 0.8642181158065796, + "learning_rate": 9.984788361440817e-06, + "loss": 0.8805, + "step": 1000 + }, + { + "epoch": 0.0550938411580164, + "grad_norm": 0.8413823843002319, + "learning_rate": 9.984754556218178e-06, + "loss": 0.8161, + "step": 1001 + }, + { + "epoch": 0.055148879960372064, + "grad_norm": 0.9473856091499329, + "learning_rate": 9.984720713531462e-06, + "loss": 0.8425, + "step": 1002 + }, + { + "epoch": 0.055203918762727726, + "grad_norm": 0.7854379415512085, + "learning_rate": 9.984686833380917e-06, + "loss": 0.7506, + "step": 1003 + }, + { + "epoch": 0.05525895756508339, + "grad_norm": 0.9481745958328247, + "learning_rate": 9.984652915766801e-06, + "loss": 0.954, + "step": 1004 + }, + { + "epoch": 0.05531399636743904, + "grad_norm": 0.767803966999054, + "learning_rate": 9.984618960689366e-06, + "loss": 0.8113, + "step": 1005 + }, + { + "epoch": 0.0553690351697947, + "grad_norm": 0.8957781195640564, + "learning_rate": 9.984584968148871e-06, + "loss": 0.9042, + "step": 1006 + }, + { + "epoch": 0.055424073972150364, + "grad_norm": 1.116646409034729, + "learning_rate": 9.98455093814557e-06, + "loss": 0.8648, + "step": 1007 + }, + { + "epoch": 0.055479112774506026, + "grad_norm": 0.9567018151283264, + "learning_rate": 9.98451687067972e-06, + "loss": 0.9446, + "step": 1008 + }, + { + "epoch": 0.05553415157686169, + "grad_norm": 0.8470665812492371, + "learning_rate": 9.98448276575157e-06, + "loss": 0.8186, + "step": 1009 + }, + { + "epoch": 0.05558919037921735, + "grad_norm": 0.9595193862915039, + "learning_rate": 9.984448623361387e-06, + "loss": 0.8406, + "step": 1010 + }, + { + "epoch": 0.05564422918157301, + "grad_norm": 1.0579735040664673, + "learning_rate": 9.98441444350942e-06, + "loss": 0.9676, + "step": 1011 + }, + { + "epoch": 0.05569926798392867, + "grad_norm": 0.8693701028823853, + "learning_rate": 9.98438022619593e-06, + "loss": 0.9451, + "step": 1012 + }, + { + "epoch": 0.05575430678628433, + "grad_norm": 0.9251859784126282, + "learning_rate": 9.98434597142117e-06, + "loss": 0.7858, + "step": 1013 + }, + { + "epoch": 0.055809345588639994, + "grad_norm": 0.8584280014038086, + "learning_rate": 9.984311679185402e-06, + "loss": 0.8481, + "step": 1014 + }, + { + "epoch": 0.05586438439099565, + "grad_norm": 0.8903968334197998, + "learning_rate": 9.98427734948888e-06, + "loss": 0.7832, + "step": 1015 + }, + { + "epoch": 0.05591942319335131, + "grad_norm": 0.905581533908844, + "learning_rate": 9.984242982331864e-06, + "loss": 0.8088, + "step": 1016 + }, + { + "epoch": 0.05597446199570697, + "grad_norm": 0.9866476655006409, + "learning_rate": 9.984208577714612e-06, + "loss": 0.8366, + "step": 1017 + }, + { + "epoch": 0.05602950079806263, + "grad_norm": 0.8843809962272644, + "learning_rate": 9.984174135637384e-06, + "loss": 0.8961, + "step": 1018 + }, + { + "epoch": 0.056084539600418294, + "grad_norm": 0.9071753621101379, + "learning_rate": 9.984139656100435e-06, + "loss": 0.8671, + "step": 1019 + }, + { + "epoch": 0.056139578402773956, + "grad_norm": 0.9894018173217773, + "learning_rate": 9.984105139104028e-06, + "loss": 0.9099, + "step": 1020 + }, + { + "epoch": 0.05619461720512962, + "grad_norm": 0.8432741165161133, + "learning_rate": 9.98407058464842e-06, + "loss": 0.7817, + "step": 1021 + }, + { + "epoch": 0.05624965600748528, + "grad_norm": 0.9538390040397644, + "learning_rate": 9.984035992733873e-06, + "loss": 0.8689, + "step": 1022 + }, + { + "epoch": 0.05630469480984094, + "grad_norm": 0.9263421297073364, + "learning_rate": 9.984001363360645e-06, + "loss": 0.9066, + "step": 1023 + }, + { + "epoch": 0.0563597336121966, + "grad_norm": 0.8921047449111938, + "learning_rate": 9.983966696528996e-06, + "loss": 0.8304, + "step": 1024 + }, + { + "epoch": 0.05641477241455226, + "grad_norm": 0.8379812240600586, + "learning_rate": 9.983931992239188e-06, + "loss": 0.866, + "step": 1025 + }, + { + "epoch": 0.05646981121690792, + "grad_norm": 0.9444219470024109, + "learning_rate": 9.983897250491481e-06, + "loss": 0.9456, + "step": 1026 + }, + { + "epoch": 0.05652485001926358, + "grad_norm": 1.0268759727478027, + "learning_rate": 9.983862471286137e-06, + "loss": 0.8277, + "step": 1027 + }, + { + "epoch": 0.05657988882161924, + "grad_norm": 1.3949217796325684, + "learning_rate": 9.983827654623418e-06, + "loss": 0.9721, + "step": 1028 + }, + { + "epoch": 0.0566349276239749, + "grad_norm": 0.8899377584457397, + "learning_rate": 9.983792800503582e-06, + "loss": 0.8794, + "step": 1029 + }, + { + "epoch": 0.05668996642633056, + "grad_norm": 0.989072322845459, + "learning_rate": 9.983757908926895e-06, + "loss": 0.8852, + "step": 1030 + }, + { + "epoch": 0.056745005228686224, + "grad_norm": 0.9797759056091309, + "learning_rate": 9.983722979893615e-06, + "loss": 1.0405, + "step": 1031 + }, + { + "epoch": 0.056800044031041885, + "grad_norm": 0.9044767618179321, + "learning_rate": 9.98368801340401e-06, + "loss": 0.7243, + "step": 1032 + }, + { + "epoch": 0.05685508283339755, + "grad_norm": 1.116324782371521, + "learning_rate": 9.983653009458338e-06, + "loss": 0.9183, + "step": 1033 + }, + { + "epoch": 0.05691012163575321, + "grad_norm": 0.9373337030410767, + "learning_rate": 9.983617968056866e-06, + "loss": 0.9417, + "step": 1034 + }, + { + "epoch": 0.05696516043810887, + "grad_norm": 1.0587197542190552, + "learning_rate": 9.983582889199855e-06, + "loss": 0.896, + "step": 1035 + }, + { + "epoch": 0.05702019924046453, + "grad_norm": 1.0080119371414185, + "learning_rate": 9.983547772887568e-06, + "loss": 0.924, + "step": 1036 + }, + { + "epoch": 0.057075238042820185, + "grad_norm": 0.847091019153595, + "learning_rate": 9.98351261912027e-06, + "loss": 0.7443, + "step": 1037 + }, + { + "epoch": 0.05713027684517585, + "grad_norm": 0.9876272082328796, + "learning_rate": 9.983477427898225e-06, + "loss": 0.9365, + "step": 1038 + }, + { + "epoch": 0.05718531564753151, + "grad_norm": 0.9188169240951538, + "learning_rate": 9.983442199221698e-06, + "loss": 0.9213, + "step": 1039 + }, + { + "epoch": 0.05724035444988717, + "grad_norm": 0.932399332523346, + "learning_rate": 9.983406933090954e-06, + "loss": 0.958, + "step": 1040 + }, + { + "epoch": 0.05729539325224283, + "grad_norm": 0.9126465320587158, + "learning_rate": 9.983371629506258e-06, + "loss": 0.8913, + "step": 1041 + }, + { + "epoch": 0.05735043205459849, + "grad_norm": 0.80904620885849, + "learning_rate": 9.983336288467873e-06, + "loss": 0.7719, + "step": 1042 + }, + { + "epoch": 0.057405470856954154, + "grad_norm": 0.873833417892456, + "learning_rate": 9.983300909976067e-06, + "loss": 0.9201, + "step": 1043 + }, + { + "epoch": 0.057460509659309815, + "grad_norm": 0.8331829309463501, + "learning_rate": 9.983265494031107e-06, + "loss": 0.8605, + "step": 1044 + }, + { + "epoch": 0.057515548461665476, + "grad_norm": 0.8364768624305725, + "learning_rate": 9.983230040633255e-06, + "loss": 0.8627, + "step": 1045 + }, + { + "epoch": 0.05757058726402114, + "grad_norm": 0.9226736426353455, + "learning_rate": 9.98319454978278e-06, + "loss": 0.9759, + "step": 1046 + }, + { + "epoch": 0.05762562606637679, + "grad_norm": 0.8174427151679993, + "learning_rate": 9.98315902147995e-06, + "loss": 0.8066, + "step": 1047 + }, + { + "epoch": 0.057680664868732454, + "grad_norm": 0.9154924750328064, + "learning_rate": 9.98312345572503e-06, + "loss": 0.9112, + "step": 1048 + }, + { + "epoch": 0.057735703671088115, + "grad_norm": 0.8884655237197876, + "learning_rate": 9.983087852518289e-06, + "loss": 0.8699, + "step": 1049 + }, + { + "epoch": 0.057790742473443776, + "grad_norm": 0.8849230408668518, + "learning_rate": 9.983052211859992e-06, + "loss": 0.8999, + "step": 1050 + }, + { + "epoch": 0.05784578127579944, + "grad_norm": 1.025843858718872, + "learning_rate": 9.98301653375041e-06, + "loss": 0.7764, + "step": 1051 + }, + { + "epoch": 0.0579008200781551, + "grad_norm": 0.900505006313324, + "learning_rate": 9.98298081818981e-06, + "loss": 0.9196, + "step": 1052 + }, + { + "epoch": 0.05795585888051076, + "grad_norm": 0.9506704211235046, + "learning_rate": 9.982945065178457e-06, + "loss": 0.8319, + "step": 1053 + }, + { + "epoch": 0.05801089768286642, + "grad_norm": 0.9439849853515625, + "learning_rate": 9.982909274716626e-06, + "loss": 0.8561, + "step": 1054 + }, + { + "epoch": 0.05806593648522208, + "grad_norm": 0.8761240243911743, + "learning_rate": 9.982873446804579e-06, + "loss": 0.9681, + "step": 1055 + }, + { + "epoch": 0.058120975287577745, + "grad_norm": 0.8756145238876343, + "learning_rate": 9.982837581442592e-06, + "loss": 0.8452, + "step": 1056 + }, + { + "epoch": 0.058176014089933406, + "grad_norm": 0.8732383847236633, + "learning_rate": 9.982801678630932e-06, + "loss": 0.9018, + "step": 1057 + }, + { + "epoch": 0.05823105289228906, + "grad_norm": 0.8338272571563721, + "learning_rate": 9.982765738369867e-06, + "loss": 0.9308, + "step": 1058 + }, + { + "epoch": 0.05828609169464472, + "grad_norm": 0.843163013458252, + "learning_rate": 9.982729760659669e-06, + "loss": 0.7802, + "step": 1059 + }, + { + "epoch": 0.05834113049700038, + "grad_norm": 1.2007580995559692, + "learning_rate": 9.982693745500606e-06, + "loss": 0.8406, + "step": 1060 + }, + { + "epoch": 0.058396169299356045, + "grad_norm": 0.8760073781013489, + "learning_rate": 9.982657692892954e-06, + "loss": 0.8528, + "step": 1061 + }, + { + "epoch": 0.058451208101711706, + "grad_norm": 0.925309419631958, + "learning_rate": 9.982621602836978e-06, + "loss": 0.9601, + "step": 1062 + }, + { + "epoch": 0.05850624690406737, + "grad_norm": 0.9277135133743286, + "learning_rate": 9.982585475332952e-06, + "loss": 0.8405, + "step": 1063 + }, + { + "epoch": 0.05856128570642303, + "grad_norm": 0.928044319152832, + "learning_rate": 9.98254931038115e-06, + "loss": 0.8259, + "step": 1064 + }, + { + "epoch": 0.05861632450877869, + "grad_norm": 0.8363838195800781, + "learning_rate": 9.982513107981837e-06, + "loss": 0.8655, + "step": 1065 + }, + { + "epoch": 0.05867136331113435, + "grad_norm": 0.9800984859466553, + "learning_rate": 9.982476868135292e-06, + "loss": 0.9285, + "step": 1066 + }, + { + "epoch": 0.05872640211349001, + "grad_norm": 0.8062636256217957, + "learning_rate": 9.982440590841785e-06, + "loss": 0.754, + "step": 1067 + }, + { + "epoch": 0.058781440915845674, + "grad_norm": 1.2010705471038818, + "learning_rate": 9.982404276101586e-06, + "loss": 0.9872, + "step": 1068 + }, + { + "epoch": 0.05883647971820133, + "grad_norm": 1.0036406517028809, + "learning_rate": 9.982367923914971e-06, + "loss": 0.8724, + "step": 1069 + }, + { + "epoch": 0.05889151852055699, + "grad_norm": 0.8768866658210754, + "learning_rate": 9.982331534282212e-06, + "loss": 0.838, + "step": 1070 + }, + { + "epoch": 0.05894655732291265, + "grad_norm": 0.7892739176750183, + "learning_rate": 9.982295107203584e-06, + "loss": 0.6974, + "step": 1071 + }, + { + "epoch": 0.05900159612526831, + "grad_norm": 0.863315999507904, + "learning_rate": 9.982258642679358e-06, + "loss": 0.9282, + "step": 1072 + }, + { + "epoch": 0.059056634927623974, + "grad_norm": 0.8645132780075073, + "learning_rate": 9.982222140709812e-06, + "loss": 0.8504, + "step": 1073 + }, + { + "epoch": 0.059111673729979636, + "grad_norm": 1.0003199577331543, + "learning_rate": 9.982185601295216e-06, + "loss": 1.0293, + "step": 1074 + }, + { + "epoch": 0.0591667125323353, + "grad_norm": 0.8391831517219543, + "learning_rate": 9.982149024435848e-06, + "loss": 0.8609, + "step": 1075 + }, + { + "epoch": 0.05922175133469096, + "grad_norm": 0.9940230846405029, + "learning_rate": 9.982112410131981e-06, + "loss": 0.9623, + "step": 1076 + }, + { + "epoch": 0.05927679013704662, + "grad_norm": 1.0670262575149536, + "learning_rate": 9.98207575838389e-06, + "loss": 0.9952, + "step": 1077 + }, + { + "epoch": 0.05933182893940228, + "grad_norm": 0.8506165742874146, + "learning_rate": 9.982039069191853e-06, + "loss": 0.8401, + "step": 1078 + }, + { + "epoch": 0.05938686774175794, + "grad_norm": 0.8956409096717834, + "learning_rate": 9.982002342556144e-06, + "loss": 0.8779, + "step": 1079 + }, + { + "epoch": 0.0594419065441136, + "grad_norm": 0.8955749273300171, + "learning_rate": 9.981965578477038e-06, + "loss": 0.8946, + "step": 1080 + }, + { + "epoch": 0.05949694534646926, + "grad_norm": 0.9035234451293945, + "learning_rate": 9.981928776954811e-06, + "loss": 0.9352, + "step": 1081 + }, + { + "epoch": 0.05955198414882492, + "grad_norm": 0.8748759627342224, + "learning_rate": 9.981891937989743e-06, + "loss": 0.8803, + "step": 1082 + }, + { + "epoch": 0.05960702295118058, + "grad_norm": 0.9966281056404114, + "learning_rate": 9.981855061582108e-06, + "loss": 0.9304, + "step": 1083 + }, + { + "epoch": 0.05966206175353624, + "grad_norm": 0.8696668148040771, + "learning_rate": 9.981818147732183e-06, + "loss": 0.8706, + "step": 1084 + }, + { + "epoch": 0.059717100555891904, + "grad_norm": 0.9823188185691833, + "learning_rate": 9.981781196440249e-06, + "loss": 0.9431, + "step": 1085 + }, + { + "epoch": 0.059772139358247565, + "grad_norm": 0.8401583433151245, + "learning_rate": 9.981744207706577e-06, + "loss": 0.8369, + "step": 1086 + }, + { + "epoch": 0.05982717816060323, + "grad_norm": 0.8775757551193237, + "learning_rate": 9.981707181531452e-06, + "loss": 0.9516, + "step": 1087 + }, + { + "epoch": 0.05988221696295889, + "grad_norm": 0.9153465628623962, + "learning_rate": 9.981670117915148e-06, + "loss": 0.8997, + "step": 1088 + }, + { + "epoch": 0.05993725576531455, + "grad_norm": 0.9053078889846802, + "learning_rate": 9.981633016857946e-06, + "loss": 0.9452, + "step": 1089 + }, + { + "epoch": 0.059992294567670204, + "grad_norm": 0.9154480695724487, + "learning_rate": 9.981595878360123e-06, + "loss": 0.8293, + "step": 1090 + }, + { + "epoch": 0.060047333370025865, + "grad_norm": 0.85718834400177, + "learning_rate": 9.981558702421958e-06, + "loss": 0.876, + "step": 1091 + }, + { + "epoch": 0.06010237217238153, + "grad_norm": 0.9437130689620972, + "learning_rate": 9.981521489043734e-06, + "loss": 0.9731, + "step": 1092 + }, + { + "epoch": 0.06015741097473719, + "grad_norm": 0.9014891386032104, + "learning_rate": 9.981484238225725e-06, + "loss": 0.811, + "step": 1093 + }, + { + "epoch": 0.06021244977709285, + "grad_norm": 0.8942846655845642, + "learning_rate": 9.981446949968216e-06, + "loss": 0.808, + "step": 1094 + }, + { + "epoch": 0.06026748857944851, + "grad_norm": 0.855297863483429, + "learning_rate": 9.981409624271483e-06, + "loss": 0.8319, + "step": 1095 + }, + { + "epoch": 0.06032252738180417, + "grad_norm": 0.9310913681983948, + "learning_rate": 9.981372261135811e-06, + "loss": 0.899, + "step": 1096 + }, + { + "epoch": 0.060377566184159834, + "grad_norm": 0.8472979664802551, + "learning_rate": 9.981334860561478e-06, + "loss": 0.8818, + "step": 1097 + }, + { + "epoch": 0.060432604986515495, + "grad_norm": 0.896617591381073, + "learning_rate": 9.981297422548764e-06, + "loss": 0.8991, + "step": 1098 + }, + { + "epoch": 0.06048764378887116, + "grad_norm": 0.8543037176132202, + "learning_rate": 9.981259947097954e-06, + "loss": 0.8595, + "step": 1099 + }, + { + "epoch": 0.06054268259122682, + "grad_norm": 0.8794904947280884, + "learning_rate": 9.981222434209327e-06, + "loss": 0.8561, + "step": 1100 + }, + { + "epoch": 0.06059772139358247, + "grad_norm": 0.8882116675376892, + "learning_rate": 9.981184883883165e-06, + "loss": 0.8099, + "step": 1101 + }, + { + "epoch": 0.060652760195938134, + "grad_norm": 1.0068262815475464, + "learning_rate": 9.98114729611975e-06, + "loss": 0.8104, + "step": 1102 + }, + { + "epoch": 0.060707798998293795, + "grad_norm": 1.072316288948059, + "learning_rate": 9.981109670919366e-06, + "loss": 0.9877, + "step": 1103 + }, + { + "epoch": 0.06076283780064946, + "grad_norm": 0.9959045052528381, + "learning_rate": 9.981072008282298e-06, + "loss": 0.906, + "step": 1104 + }, + { + "epoch": 0.06081787660300512, + "grad_norm": 0.8712790608406067, + "learning_rate": 9.981034308208823e-06, + "loss": 0.8725, + "step": 1105 + }, + { + "epoch": 0.06087291540536078, + "grad_norm": 0.9114679098129272, + "learning_rate": 9.980996570699228e-06, + "loss": 0.8385, + "step": 1106 + }, + { + "epoch": 0.06092795420771644, + "grad_norm": 1.0024466514587402, + "learning_rate": 9.980958795753796e-06, + "loss": 0.8661, + "step": 1107 + }, + { + "epoch": 0.0609829930100721, + "grad_norm": 0.9578461050987244, + "learning_rate": 9.98092098337281e-06, + "loss": 0.9358, + "step": 1108 + }, + { + "epoch": 0.061038031812427763, + "grad_norm": 0.8677787780761719, + "learning_rate": 9.980883133556557e-06, + "loss": 0.8146, + "step": 1109 + }, + { + "epoch": 0.061093070614783425, + "grad_norm": 0.9072276949882507, + "learning_rate": 9.98084524630532e-06, + "loss": 0.91, + "step": 1110 + }, + { + "epoch": 0.061148109417139086, + "grad_norm": 0.8827292919158936, + "learning_rate": 9.980807321619381e-06, + "loss": 0.8854, + "step": 1111 + }, + { + "epoch": 0.06120314821949474, + "grad_norm": 1.0012744665145874, + "learning_rate": 9.98076935949903e-06, + "loss": 0.8242, + "step": 1112 + }, + { + "epoch": 0.0612581870218504, + "grad_norm": 0.9152620434761047, + "learning_rate": 9.980731359944548e-06, + "loss": 0.8832, + "step": 1113 + }, + { + "epoch": 0.061313225824206063, + "grad_norm": 0.8986824750900269, + "learning_rate": 9.980693322956222e-06, + "loss": 0.7975, + "step": 1114 + }, + { + "epoch": 0.061368264626561725, + "grad_norm": 0.9373019933700562, + "learning_rate": 9.98065524853434e-06, + "loss": 0.9541, + "step": 1115 + }, + { + "epoch": 0.061423303428917386, + "grad_norm": 0.9875593781471252, + "learning_rate": 9.980617136679185e-06, + "loss": 1.0052, + "step": 1116 + }, + { + "epoch": 0.06147834223127305, + "grad_norm": 1.0664819478988647, + "learning_rate": 9.980578987391045e-06, + "loss": 0.8584, + "step": 1117 + }, + { + "epoch": 0.06153338103362871, + "grad_norm": 0.9149377942085266, + "learning_rate": 9.980540800670207e-06, + "loss": 0.8467, + "step": 1118 + }, + { + "epoch": 0.06158841983598437, + "grad_norm": 0.9303194284439087, + "learning_rate": 9.980502576516959e-06, + "loss": 0.8219, + "step": 1119 + }, + { + "epoch": 0.06164345863834003, + "grad_norm": 0.9059457778930664, + "learning_rate": 9.980464314931583e-06, + "loss": 0.8459, + "step": 1120 + }, + { + "epoch": 0.06169849744069569, + "grad_norm": 0.9368849396705627, + "learning_rate": 9.980426015914375e-06, + "loss": 0.8933, + "step": 1121 + }, + { + "epoch": 0.061753536243051355, + "grad_norm": 0.8188626766204834, + "learning_rate": 9.980387679465615e-06, + "loss": 0.807, + "step": 1122 + }, + { + "epoch": 0.06180857504540701, + "grad_norm": 1.027171015739441, + "learning_rate": 9.980349305585595e-06, + "loss": 0.8919, + "step": 1123 + }, + { + "epoch": 0.06186361384776267, + "grad_norm": 0.831649899482727, + "learning_rate": 9.980310894274603e-06, + "loss": 0.8109, + "step": 1124 + }, + { + "epoch": 0.06191865265011833, + "grad_norm": 1.0170252323150635, + "learning_rate": 9.980272445532928e-06, + "loss": 0.9537, + "step": 1125 + }, + { + "epoch": 0.06197369145247399, + "grad_norm": 0.97837233543396, + "learning_rate": 9.980233959360858e-06, + "loss": 0.9104, + "step": 1126 + }, + { + "epoch": 0.062028730254829655, + "grad_norm": 0.9548324942588806, + "learning_rate": 9.980195435758681e-06, + "loss": 0.9473, + "step": 1127 + }, + { + "epoch": 0.062083769057185316, + "grad_norm": 0.8675842881202698, + "learning_rate": 9.980156874726692e-06, + "loss": 0.8313, + "step": 1128 + }, + { + "epoch": 0.06213880785954098, + "grad_norm": 0.8948968052864075, + "learning_rate": 9.980118276265173e-06, + "loss": 0.8008, + "step": 1129 + }, + { + "epoch": 0.06219384666189664, + "grad_norm": 0.8914239406585693, + "learning_rate": 9.98007964037442e-06, + "loss": 0.7642, + "step": 1130 + }, + { + "epoch": 0.0622488854642523, + "grad_norm": 0.9499951004981995, + "learning_rate": 9.980040967054723e-06, + "loss": 0.8669, + "step": 1131 + }, + { + "epoch": 0.06230392426660796, + "grad_norm": 0.8959251642227173, + "learning_rate": 9.980002256306369e-06, + "loss": 0.9177, + "step": 1132 + }, + { + "epoch": 0.062358963068963616, + "grad_norm": 0.8634380102157593, + "learning_rate": 9.97996350812965e-06, + "loss": 0.8252, + "step": 1133 + }, + { + "epoch": 0.06241400187131928, + "grad_norm": 0.9380598068237305, + "learning_rate": 9.97992472252486e-06, + "loss": 0.9335, + "step": 1134 + }, + { + "epoch": 0.06246904067367494, + "grad_norm": 0.8373183608055115, + "learning_rate": 9.97988589949229e-06, + "loss": 0.848, + "step": 1135 + }, + { + "epoch": 0.0625240794760306, + "grad_norm": 0.9649023413658142, + "learning_rate": 9.97984703903223e-06, + "loss": 0.9648, + "step": 1136 + }, + { + "epoch": 0.06257911827838626, + "grad_norm": 0.9972373843193054, + "learning_rate": 9.979808141144972e-06, + "loss": 0.9104, + "step": 1137 + }, + { + "epoch": 0.06263415708074192, + "grad_norm": 0.8230985403060913, + "learning_rate": 9.97976920583081e-06, + "loss": 0.8393, + "step": 1138 + }, + { + "epoch": 0.06268919588309758, + "grad_norm": 0.9775324463844299, + "learning_rate": 9.979730233090034e-06, + "loss": 0.8385, + "step": 1139 + }, + { + "epoch": 0.06274423468545325, + "grad_norm": 0.8288110494613647, + "learning_rate": 9.97969122292294e-06, + "loss": 0.7308, + "step": 1140 + }, + { + "epoch": 0.06279927348780891, + "grad_norm": 0.8980758786201477, + "learning_rate": 9.979652175329819e-06, + "loss": 0.863, + "step": 1141 + }, + { + "epoch": 0.06285431229016457, + "grad_norm": 7.43889045715332, + "learning_rate": 9.979613090310965e-06, + "loss": 0.9412, + "step": 1142 + }, + { + "epoch": 0.06290935109252023, + "grad_norm": 0.9758191704750061, + "learning_rate": 9.97957396786667e-06, + "loss": 0.8896, + "step": 1143 + }, + { + "epoch": 0.06296438989487589, + "grad_norm": 0.8211693167686462, + "learning_rate": 9.979534807997234e-06, + "loss": 0.7352, + "step": 1144 + }, + { + "epoch": 0.06301942869723155, + "grad_norm": 0.8643441796302795, + "learning_rate": 9.979495610702945e-06, + "loss": 0.8701, + "step": 1145 + }, + { + "epoch": 0.06307446749958721, + "grad_norm": 1.0199437141418457, + "learning_rate": 9.9794563759841e-06, + "loss": 0.9025, + "step": 1146 + }, + { + "epoch": 0.06312950630194288, + "grad_norm": 0.8367893695831299, + "learning_rate": 9.979417103840994e-06, + "loss": 0.8491, + "step": 1147 + }, + { + "epoch": 0.06318454510429854, + "grad_norm": 0.9411819577217102, + "learning_rate": 9.979377794273923e-06, + "loss": 0.8501, + "step": 1148 + }, + { + "epoch": 0.06323958390665418, + "grad_norm": 1.1497365236282349, + "learning_rate": 9.97933844728318e-06, + "loss": 1.0227, + "step": 1149 + }, + { + "epoch": 0.06329462270900985, + "grad_norm": 0.9892984628677368, + "learning_rate": 9.979299062869064e-06, + "loss": 0.8942, + "step": 1150 + }, + { + "epoch": 0.06334966151136551, + "grad_norm": 0.947952926158905, + "learning_rate": 9.979259641031867e-06, + "loss": 1.0149, + "step": 1151 + }, + { + "epoch": 0.06340470031372117, + "grad_norm": 0.9060251712799072, + "learning_rate": 9.979220181771889e-06, + "loss": 0.8607, + "step": 1152 + }, + { + "epoch": 0.06345973911607683, + "grad_norm": 0.8331984281539917, + "learning_rate": 9.979180685089424e-06, + "loss": 0.8777, + "step": 1153 + }, + { + "epoch": 0.06351477791843249, + "grad_norm": 0.9133188724517822, + "learning_rate": 9.97914115098477e-06, + "loss": 0.7409, + "step": 1154 + }, + { + "epoch": 0.06356981672078815, + "grad_norm": 0.9095513820648193, + "learning_rate": 9.979101579458224e-06, + "loss": 0.8938, + "step": 1155 + }, + { + "epoch": 0.06362485552314381, + "grad_norm": 0.9584553241729736, + "learning_rate": 9.979061970510082e-06, + "loss": 0.8765, + "step": 1156 + }, + { + "epoch": 0.06367989432549948, + "grad_norm": 0.8742124438285828, + "learning_rate": 9.979022324140644e-06, + "loss": 0.8564, + "step": 1157 + }, + { + "epoch": 0.06373493312785514, + "grad_norm": 0.8776904344558716, + "learning_rate": 9.978982640350208e-06, + "loss": 0.8713, + "step": 1158 + }, + { + "epoch": 0.0637899719302108, + "grad_norm": 0.8667464852333069, + "learning_rate": 9.97894291913907e-06, + "loss": 0.8705, + "step": 1159 + }, + { + "epoch": 0.06384501073256646, + "grad_norm": 0.9028087854385376, + "learning_rate": 9.978903160507531e-06, + "loss": 0.8297, + "step": 1160 + }, + { + "epoch": 0.06390004953492212, + "grad_norm": 0.900812029838562, + "learning_rate": 9.978863364455887e-06, + "loss": 0.8456, + "step": 1161 + }, + { + "epoch": 0.06395508833727778, + "grad_norm": 0.9667207598686218, + "learning_rate": 9.97882353098444e-06, + "loss": 0.8081, + "step": 1162 + }, + { + "epoch": 0.06401012713963344, + "grad_norm": 0.8959711194038391, + "learning_rate": 9.978783660093488e-06, + "loss": 0.8455, + "step": 1163 + }, + { + "epoch": 0.0640651659419891, + "grad_norm": 0.8519117832183838, + "learning_rate": 9.97874375178333e-06, + "loss": 0.849, + "step": 1164 + }, + { + "epoch": 0.06412020474434477, + "grad_norm": 1.0532654523849487, + "learning_rate": 9.978703806054267e-06, + "loss": 0.7356, + "step": 1165 + }, + { + "epoch": 0.06417524354670043, + "grad_norm": 1.0313252210617065, + "learning_rate": 9.9786638229066e-06, + "loss": 1.024, + "step": 1166 + }, + { + "epoch": 0.06423028234905609, + "grad_norm": 1.0567537546157837, + "learning_rate": 9.978623802340627e-06, + "loss": 0.9423, + "step": 1167 + }, + { + "epoch": 0.06428532115141175, + "grad_norm": 0.8198097348213196, + "learning_rate": 9.97858374435665e-06, + "loss": 0.829, + "step": 1168 + }, + { + "epoch": 0.06434035995376741, + "grad_norm": 0.8718193173408508, + "learning_rate": 9.97854364895497e-06, + "loss": 0.7184, + "step": 1169 + }, + { + "epoch": 0.06439539875612307, + "grad_norm": 0.8037594556808472, + "learning_rate": 9.978503516135892e-06, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.06445043755847872, + "grad_norm": 0.9052229523658752, + "learning_rate": 9.978463345899709e-06, + "loss": 0.8016, + "step": 1171 + }, + { + "epoch": 0.06450547636083438, + "grad_norm": 1.0194638967514038, + "learning_rate": 9.978423138246731e-06, + "loss": 0.9045, + "step": 1172 + }, + { + "epoch": 0.06456051516319004, + "grad_norm": 0.953078031539917, + "learning_rate": 9.978382893177259e-06, + "loss": 0.9661, + "step": 1173 + }, + { + "epoch": 0.0646155539655457, + "grad_norm": 0.8777341842651367, + "learning_rate": 9.978342610691592e-06, + "loss": 0.8685, + "step": 1174 + }, + { + "epoch": 0.06467059276790137, + "grad_norm": 1.0178394317626953, + "learning_rate": 9.978302290790034e-06, + "loss": 0.9075, + "step": 1175 + }, + { + "epoch": 0.06472563157025703, + "grad_norm": 0.935694694519043, + "learning_rate": 9.978261933472889e-06, + "loss": 0.8438, + "step": 1176 + }, + { + "epoch": 0.06478067037261269, + "grad_norm": 1.0022411346435547, + "learning_rate": 9.97822153874046e-06, + "loss": 0.8701, + "step": 1177 + }, + { + "epoch": 0.06483570917496835, + "grad_norm": 1.0371203422546387, + "learning_rate": 9.97818110659305e-06, + "loss": 0.9111, + "step": 1178 + }, + { + "epoch": 0.06489074797732401, + "grad_norm": 0.7972478866577148, + "learning_rate": 9.978140637030963e-06, + "loss": 0.8602, + "step": 1179 + }, + { + "epoch": 0.06494578677967967, + "grad_norm": 0.8556679487228394, + "learning_rate": 9.978100130054505e-06, + "loss": 0.9149, + "step": 1180 + }, + { + "epoch": 0.06500082558203533, + "grad_norm": 0.92474365234375, + "learning_rate": 9.978059585663979e-06, + "loss": 0.8608, + "step": 1181 + }, + { + "epoch": 0.065055864384391, + "grad_norm": 1.0170830488204956, + "learning_rate": 9.978019003859687e-06, + "loss": 0.9986, + "step": 1182 + }, + { + "epoch": 0.06511090318674666, + "grad_norm": 0.9405049681663513, + "learning_rate": 9.97797838464194e-06, + "loss": 0.9023, + "step": 1183 + }, + { + "epoch": 0.06516594198910232, + "grad_norm": 0.9351203441619873, + "learning_rate": 9.977937728011038e-06, + "loss": 0.8698, + "step": 1184 + }, + { + "epoch": 0.06522098079145798, + "grad_norm": 0.8620241284370422, + "learning_rate": 9.97789703396729e-06, + "loss": 0.9393, + "step": 1185 + }, + { + "epoch": 0.06527601959381364, + "grad_norm": 0.9440441131591797, + "learning_rate": 9.977856302511e-06, + "loss": 0.8249, + "step": 1186 + }, + { + "epoch": 0.0653310583961693, + "grad_norm": 0.8311079144477844, + "learning_rate": 9.977815533642474e-06, + "loss": 0.8614, + "step": 1187 + }, + { + "epoch": 0.06538609719852496, + "grad_norm": 0.8911672830581665, + "learning_rate": 9.977774727362018e-06, + "loss": 0.7909, + "step": 1188 + }, + { + "epoch": 0.06544113600088063, + "grad_norm": 0.9237088561058044, + "learning_rate": 9.97773388366994e-06, + "loss": 0.7116, + "step": 1189 + }, + { + "epoch": 0.06549617480323629, + "grad_norm": 1.1155747175216675, + "learning_rate": 9.977693002566549e-06, + "loss": 0.9248, + "step": 1190 + }, + { + "epoch": 0.06555121360559195, + "grad_norm": 0.9386736750602722, + "learning_rate": 9.977652084052148e-06, + "loss": 0.8307, + "step": 1191 + }, + { + "epoch": 0.0656062524079476, + "grad_norm": 1.1666040420532227, + "learning_rate": 9.977611128127044e-06, + "loss": 0.9723, + "step": 1192 + }, + { + "epoch": 0.06566129121030326, + "grad_norm": 1.2366368770599365, + "learning_rate": 9.977570134791552e-06, + "loss": 0.8253, + "step": 1193 + }, + { + "epoch": 0.06571633001265892, + "grad_norm": 0.823443591594696, + "learning_rate": 9.977529104045971e-06, + "loss": 0.7472, + "step": 1194 + }, + { + "epoch": 0.06577136881501458, + "grad_norm": 0.9481683969497681, + "learning_rate": 9.977488035890617e-06, + "loss": 0.887, + "step": 1195 + }, + { + "epoch": 0.06582640761737024, + "grad_norm": 0.9120422005653381, + "learning_rate": 9.977446930325794e-06, + "loss": 0.867, + "step": 1196 + }, + { + "epoch": 0.0658814464197259, + "grad_norm": 0.8595587015151978, + "learning_rate": 9.977405787351811e-06, + "loss": 0.8532, + "step": 1197 + }, + { + "epoch": 0.06593648522208156, + "grad_norm": 0.8590419888496399, + "learning_rate": 9.97736460696898e-06, + "loss": 0.8998, + "step": 1198 + }, + { + "epoch": 0.06599152402443723, + "grad_norm": 0.9670939445495605, + "learning_rate": 9.977323389177609e-06, + "loss": 0.8964, + "step": 1199 + }, + { + "epoch": 0.06604656282679289, + "grad_norm": 0.8870261907577515, + "learning_rate": 9.977282133978006e-06, + "loss": 0.9542, + "step": 1200 + }, + { + "epoch": 0.06610160162914855, + "grad_norm": 0.942294180393219, + "learning_rate": 9.977240841370484e-06, + "loss": 0.8681, + "step": 1201 + }, + { + "epoch": 0.06615664043150421, + "grad_norm": 0.9632517099380493, + "learning_rate": 9.977199511355353e-06, + "loss": 0.7327, + "step": 1202 + }, + { + "epoch": 0.06621167923385987, + "grad_norm": 4.8085479736328125, + "learning_rate": 9.97715814393292e-06, + "loss": 0.8528, + "step": 1203 + }, + { + "epoch": 0.06626671803621553, + "grad_norm": 0.9084093570709229, + "learning_rate": 9.977116739103503e-06, + "loss": 0.7836, + "step": 1204 + }, + { + "epoch": 0.0663217568385712, + "grad_norm": 0.8961902260780334, + "learning_rate": 9.977075296867406e-06, + "loss": 0.854, + "step": 1205 + }, + { + "epoch": 0.06637679564092686, + "grad_norm": 0.8727987408638, + "learning_rate": 9.977033817224945e-06, + "loss": 0.7931, + "step": 1206 + }, + { + "epoch": 0.06643183444328252, + "grad_norm": 0.8263267874717712, + "learning_rate": 9.976992300176428e-06, + "loss": 0.852, + "step": 1207 + }, + { + "epoch": 0.06648687324563818, + "grad_norm": 1.0499639511108398, + "learning_rate": 9.97695074572217e-06, + "loss": 1.0427, + "step": 1208 + }, + { + "epoch": 0.06654191204799384, + "grad_norm": 0.9337313771247864, + "learning_rate": 9.976909153862482e-06, + "loss": 0.8035, + "step": 1209 + }, + { + "epoch": 0.0665969508503495, + "grad_norm": 0.8795992732048035, + "learning_rate": 9.976867524597678e-06, + "loss": 0.9022, + "step": 1210 + }, + { + "epoch": 0.06665198965270516, + "grad_norm": 0.9787294268608093, + "learning_rate": 9.976825857928069e-06, + "loss": 0.8259, + "step": 1211 + }, + { + "epoch": 0.06670702845506082, + "grad_norm": 0.8570082187652588, + "learning_rate": 9.976784153853969e-06, + "loss": 0.8567, + "step": 1212 + }, + { + "epoch": 0.06676206725741649, + "grad_norm": 1.0620380640029907, + "learning_rate": 9.976742412375694e-06, + "loss": 0.851, + "step": 1213 + }, + { + "epoch": 0.06681710605977213, + "grad_norm": 0.8545439839363098, + "learning_rate": 9.976700633493551e-06, + "loss": 0.8827, + "step": 1214 + }, + { + "epoch": 0.0668721448621278, + "grad_norm": 0.8543682098388672, + "learning_rate": 9.97665881720786e-06, + "loss": 0.8524, + "step": 1215 + }, + { + "epoch": 0.06692718366448346, + "grad_norm": 0.7748527526855469, + "learning_rate": 9.976616963518935e-06, + "loss": 0.7459, + "step": 1216 + }, + { + "epoch": 0.06698222246683912, + "grad_norm": 0.9876659512519836, + "learning_rate": 9.976575072427087e-06, + "loss": 0.8426, + "step": 1217 + }, + { + "epoch": 0.06703726126919478, + "grad_norm": 0.8763901591300964, + "learning_rate": 9.976533143932635e-06, + "loss": 0.8561, + "step": 1218 + }, + { + "epoch": 0.06709230007155044, + "grad_norm": 0.7816654443740845, + "learning_rate": 9.97649117803589e-06, + "loss": 0.8361, + "step": 1219 + }, + { + "epoch": 0.0671473388739061, + "grad_norm": 0.8659802675247192, + "learning_rate": 9.97644917473717e-06, + "loss": 0.897, + "step": 1220 + }, + { + "epoch": 0.06720237767626176, + "grad_norm": 0.9180877208709717, + "learning_rate": 9.97640713403679e-06, + "loss": 0.9516, + "step": 1221 + }, + { + "epoch": 0.06725741647861742, + "grad_norm": 0.9624410271644592, + "learning_rate": 9.976365055935067e-06, + "loss": 0.9119, + "step": 1222 + }, + { + "epoch": 0.06731245528097309, + "grad_norm": 0.8291105031967163, + "learning_rate": 9.976322940432314e-06, + "loss": 0.788, + "step": 1223 + }, + { + "epoch": 0.06736749408332875, + "grad_norm": 0.9858983755111694, + "learning_rate": 9.976280787528854e-06, + "loss": 0.8794, + "step": 1224 + }, + { + "epoch": 0.06742253288568441, + "grad_norm": 0.8283948302268982, + "learning_rate": 9.976238597224996e-06, + "loss": 0.8571, + "step": 1225 + }, + { + "epoch": 0.06747757168804007, + "grad_norm": 0.8585363626480103, + "learning_rate": 9.976196369521063e-06, + "loss": 0.9005, + "step": 1226 + }, + { + "epoch": 0.06753261049039573, + "grad_norm": 0.847882091999054, + "learning_rate": 9.976154104417369e-06, + "loss": 0.8058, + "step": 1227 + }, + { + "epoch": 0.06758764929275139, + "grad_norm": 0.9045611023902893, + "learning_rate": 9.976111801914232e-06, + "loss": 0.7864, + "step": 1228 + }, + { + "epoch": 0.06764268809510705, + "grad_norm": 0.805932879447937, + "learning_rate": 9.976069462011972e-06, + "loss": 0.8436, + "step": 1229 + }, + { + "epoch": 0.06769772689746271, + "grad_norm": 0.8809003233909607, + "learning_rate": 9.976027084710906e-06, + "loss": 0.7876, + "step": 1230 + }, + { + "epoch": 0.06775276569981838, + "grad_norm": 0.8681740760803223, + "learning_rate": 9.975984670011352e-06, + "loss": 0.877, + "step": 1231 + }, + { + "epoch": 0.06780780450217404, + "grad_norm": 0.9909854531288147, + "learning_rate": 9.975942217913627e-06, + "loss": 0.8957, + "step": 1232 + }, + { + "epoch": 0.0678628433045297, + "grad_norm": 0.9213934540748596, + "learning_rate": 9.975899728418056e-06, + "loss": 0.8344, + "step": 1233 + }, + { + "epoch": 0.06791788210688536, + "grad_norm": 0.8289967179298401, + "learning_rate": 9.975857201524952e-06, + "loss": 0.876, + "step": 1234 + }, + { + "epoch": 0.06797292090924101, + "grad_norm": 0.891812264919281, + "learning_rate": 9.97581463723464e-06, + "loss": 0.8611, + "step": 1235 + }, + { + "epoch": 0.06802795971159667, + "grad_norm": 1.0301382541656494, + "learning_rate": 9.975772035547435e-06, + "loss": 0.8177, + "step": 1236 + }, + { + "epoch": 0.06808299851395233, + "grad_norm": 0.8380662798881531, + "learning_rate": 9.975729396463659e-06, + "loss": 0.8631, + "step": 1237 + }, + { + "epoch": 0.06813803731630799, + "grad_norm": 0.9226046204566956, + "learning_rate": 9.975686719983633e-06, + "loss": 0.8927, + "step": 1238 + }, + { + "epoch": 0.06819307611866365, + "grad_norm": 0.8917136192321777, + "learning_rate": 9.975644006107679e-06, + "loss": 0.9048, + "step": 1239 + }, + { + "epoch": 0.06824811492101931, + "grad_norm": 0.8559191226959229, + "learning_rate": 9.975601254836114e-06, + "loss": 0.8169, + "step": 1240 + }, + { + "epoch": 0.06830315372337498, + "grad_norm": 0.9345341920852661, + "learning_rate": 9.975558466169263e-06, + "loss": 0.7929, + "step": 1241 + }, + { + "epoch": 0.06835819252573064, + "grad_norm": 0.9155850410461426, + "learning_rate": 9.975515640107447e-06, + "loss": 0.8825, + "step": 1242 + }, + { + "epoch": 0.0684132313280863, + "grad_norm": 0.899712860584259, + "learning_rate": 9.975472776650987e-06, + "loss": 0.825, + "step": 1243 + }, + { + "epoch": 0.06846827013044196, + "grad_norm": 0.8280880451202393, + "learning_rate": 9.975429875800206e-06, + "loss": 0.8539, + "step": 1244 + }, + { + "epoch": 0.06852330893279762, + "grad_norm": 0.9589636325836182, + "learning_rate": 9.975386937555426e-06, + "loss": 0.9465, + "step": 1245 + }, + { + "epoch": 0.06857834773515328, + "grad_norm": 1.1027253866195679, + "learning_rate": 9.97534396191697e-06, + "loss": 0.87, + "step": 1246 + }, + { + "epoch": 0.06863338653750894, + "grad_norm": 1.0510318279266357, + "learning_rate": 9.975300948885158e-06, + "loss": 0.8569, + "step": 1247 + }, + { + "epoch": 0.0686884253398646, + "grad_norm": 0.8897958397865295, + "learning_rate": 9.975257898460317e-06, + "loss": 0.8431, + "step": 1248 + }, + { + "epoch": 0.06874346414222027, + "grad_norm": 0.8827036619186401, + "learning_rate": 9.975214810642771e-06, + "loss": 0.922, + "step": 1249 + }, + { + "epoch": 0.06879850294457593, + "grad_norm": 0.8798324465751648, + "learning_rate": 9.97517168543284e-06, + "loss": 0.7837, + "step": 1250 + }, + { + "epoch": 0.06885354174693159, + "grad_norm": 0.9053803086280823, + "learning_rate": 9.975128522830853e-06, + "loss": 0.82, + "step": 1251 + }, + { + "epoch": 0.06890858054928725, + "grad_norm": 0.8362607359886169, + "learning_rate": 9.975085322837129e-06, + "loss": 0.7684, + "step": 1252 + }, + { + "epoch": 0.06896361935164291, + "grad_norm": 0.8898602724075317, + "learning_rate": 9.975042085451997e-06, + "loss": 0.8205, + "step": 1253 + }, + { + "epoch": 0.06901865815399857, + "grad_norm": 0.9210274815559387, + "learning_rate": 9.97499881067578e-06, + "loss": 0.8364, + "step": 1254 + }, + { + "epoch": 0.06907369695635424, + "grad_norm": 1.0881952047348022, + "learning_rate": 9.974955498508804e-06, + "loss": 0.8234, + "step": 1255 + }, + { + "epoch": 0.0691287357587099, + "grad_norm": 0.8875024914741516, + "learning_rate": 9.974912148951394e-06, + "loss": 0.7974, + "step": 1256 + }, + { + "epoch": 0.06918377456106554, + "grad_norm": 0.9065666794776917, + "learning_rate": 9.974868762003876e-06, + "loss": 0.7721, + "step": 1257 + }, + { + "epoch": 0.0692388133634212, + "grad_norm": 0.8904553651809692, + "learning_rate": 9.974825337666576e-06, + "loss": 0.8551, + "step": 1258 + }, + { + "epoch": 0.06929385216577687, + "grad_norm": 0.8586102724075317, + "learning_rate": 9.974781875939821e-06, + "loss": 0.8666, + "step": 1259 + }, + { + "epoch": 0.06934889096813253, + "grad_norm": 0.9103402495384216, + "learning_rate": 9.974738376823935e-06, + "loss": 0.8361, + "step": 1260 + }, + { + "epoch": 0.06940392977048819, + "grad_norm": 0.8657701015472412, + "learning_rate": 9.974694840319249e-06, + "loss": 0.8217, + "step": 1261 + }, + { + "epoch": 0.06945896857284385, + "grad_norm": 0.865703821182251, + "learning_rate": 9.974651266426088e-06, + "loss": 0.8751, + "step": 1262 + }, + { + "epoch": 0.06951400737519951, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.974607655144779e-06, + "loss": 0.8709, + "step": 1263 + }, + { + "epoch": 0.06956904617755517, + "grad_norm": 0.8417405486106873, + "learning_rate": 9.97456400647565e-06, + "loss": 0.8104, + "step": 1264 + }, + { + "epoch": 0.06962408497991084, + "grad_norm": 0.8578035235404968, + "learning_rate": 9.974520320419032e-06, + "loss": 0.9173, + "step": 1265 + }, + { + "epoch": 0.0696791237822665, + "grad_norm": 0.957539439201355, + "learning_rate": 9.974476596975249e-06, + "loss": 0.8955, + "step": 1266 + }, + { + "epoch": 0.06973416258462216, + "grad_norm": 0.851222038269043, + "learning_rate": 9.974432836144632e-06, + "loss": 0.8696, + "step": 1267 + }, + { + "epoch": 0.06978920138697782, + "grad_norm": 0.8178789615631104, + "learning_rate": 9.974389037927508e-06, + "loss": 0.7921, + "step": 1268 + }, + { + "epoch": 0.06984424018933348, + "grad_norm": 0.954091489315033, + "learning_rate": 9.97434520232421e-06, + "loss": 0.9362, + "step": 1269 + }, + { + "epoch": 0.06989927899168914, + "grad_norm": 0.8525053858757019, + "learning_rate": 9.974301329335063e-06, + "loss": 0.7996, + "step": 1270 + }, + { + "epoch": 0.0699543177940448, + "grad_norm": 0.9340476393699646, + "learning_rate": 9.9742574189604e-06, + "loss": 0.9091, + "step": 1271 + }, + { + "epoch": 0.07000935659640047, + "grad_norm": 0.7946187257766724, + "learning_rate": 9.974213471200548e-06, + "loss": 0.874, + "step": 1272 + }, + { + "epoch": 0.07006439539875613, + "grad_norm": 0.8048381209373474, + "learning_rate": 9.97416948605584e-06, + "loss": 0.8557, + "step": 1273 + }, + { + "epoch": 0.07011943420111179, + "grad_norm": 0.9849064946174622, + "learning_rate": 9.974125463526607e-06, + "loss": 0.8154, + "step": 1274 + }, + { + "epoch": 0.07017447300346745, + "grad_norm": 0.9030239582061768, + "learning_rate": 9.974081403613178e-06, + "loss": 0.9411, + "step": 1275 + }, + { + "epoch": 0.07022951180582311, + "grad_norm": 0.8869300484657288, + "learning_rate": 9.974037306315882e-06, + "loss": 0.8978, + "step": 1276 + }, + { + "epoch": 0.07028455060817877, + "grad_norm": 0.8558536767959595, + "learning_rate": 9.973993171635057e-06, + "loss": 0.8937, + "step": 1277 + }, + { + "epoch": 0.07033958941053442, + "grad_norm": 0.9005453586578369, + "learning_rate": 9.973948999571029e-06, + "loss": 0.9336, + "step": 1278 + }, + { + "epoch": 0.07039462821289008, + "grad_norm": 0.8489978909492493, + "learning_rate": 9.973904790124131e-06, + "loss": 0.8267, + "step": 1279 + }, + { + "epoch": 0.07044966701524574, + "grad_norm": 0.8295948505401611, + "learning_rate": 9.973860543294696e-06, + "loss": 0.8478, + "step": 1280 + }, + { + "epoch": 0.0705047058176014, + "grad_norm": 0.8111379742622375, + "learning_rate": 9.973816259083058e-06, + "loss": 0.8333, + "step": 1281 + }, + { + "epoch": 0.07055974461995707, + "grad_norm": 0.9380189776420593, + "learning_rate": 9.973771937489547e-06, + "loss": 0.9718, + "step": 1282 + }, + { + "epoch": 0.07061478342231273, + "grad_norm": 1.251194953918457, + "learning_rate": 9.973727578514499e-06, + "loss": 0.9531, + "step": 1283 + }, + { + "epoch": 0.07066982222466839, + "grad_norm": 0.9897224307060242, + "learning_rate": 9.973683182158243e-06, + "loss": 0.7853, + "step": 1284 + }, + { + "epoch": 0.07072486102702405, + "grad_norm": 0.8409335017204285, + "learning_rate": 9.973638748421119e-06, + "loss": 0.7692, + "step": 1285 + }, + { + "epoch": 0.07077989982937971, + "grad_norm": 0.9019681215286255, + "learning_rate": 9.973594277303456e-06, + "loss": 0.8135, + "step": 1286 + }, + { + "epoch": 0.07083493863173537, + "grad_norm": 0.9236096739768982, + "learning_rate": 9.973549768805588e-06, + "loss": 0.9304, + "step": 1287 + }, + { + "epoch": 0.07088997743409103, + "grad_norm": 0.9244743585586548, + "learning_rate": 9.973505222927854e-06, + "loss": 0.9056, + "step": 1288 + }, + { + "epoch": 0.0709450162364467, + "grad_norm": 1.3418753147125244, + "learning_rate": 9.973460639670585e-06, + "loss": 0.8419, + "step": 1289 + }, + { + "epoch": 0.07100005503880236, + "grad_norm": 0.8715767860412598, + "learning_rate": 9.973416019034117e-06, + "loss": 0.9704, + "step": 1290 + }, + { + "epoch": 0.07105509384115802, + "grad_norm": 0.9609012007713318, + "learning_rate": 9.973371361018787e-06, + "loss": 0.8807, + "step": 1291 + }, + { + "epoch": 0.07111013264351368, + "grad_norm": 0.8085873126983643, + "learning_rate": 9.973326665624927e-06, + "loss": 0.7947, + "step": 1292 + }, + { + "epoch": 0.07116517144586934, + "grad_norm": 0.919280469417572, + "learning_rate": 9.973281932852877e-06, + "loss": 0.9743, + "step": 1293 + }, + { + "epoch": 0.071220210248225, + "grad_norm": 1.0651074647903442, + "learning_rate": 9.973237162702968e-06, + "loss": 0.7164, + "step": 1294 + }, + { + "epoch": 0.07127524905058066, + "grad_norm": 0.987251341342926, + "learning_rate": 9.973192355175542e-06, + "loss": 0.9286, + "step": 1295 + }, + { + "epoch": 0.07133028785293632, + "grad_norm": 1.5507274866104126, + "learning_rate": 9.973147510270935e-06, + "loss": 0.9733, + "step": 1296 + }, + { + "epoch": 0.07138532665529199, + "grad_norm": 0.8439416885375977, + "learning_rate": 9.97310262798948e-06, + "loss": 0.7462, + "step": 1297 + }, + { + "epoch": 0.07144036545764765, + "grad_norm": 0.9604889750480652, + "learning_rate": 9.973057708331519e-06, + "loss": 1.0006, + "step": 1298 + }, + { + "epoch": 0.07149540426000331, + "grad_norm": 0.8568960428237915, + "learning_rate": 9.973012751297386e-06, + "loss": 0.878, + "step": 1299 + }, + { + "epoch": 0.07155044306235896, + "grad_norm": 0.8169522285461426, + "learning_rate": 9.972967756887419e-06, + "loss": 0.8241, + "step": 1300 + }, + { + "epoch": 0.07160548186471462, + "grad_norm": 0.875738799571991, + "learning_rate": 9.97292272510196e-06, + "loss": 0.854, + "step": 1301 + }, + { + "epoch": 0.07166052066707028, + "grad_norm": 0.7877739071846008, + "learning_rate": 9.972877655941345e-06, + "loss": 0.779, + "step": 1302 + }, + { + "epoch": 0.07171555946942594, + "grad_norm": 0.8148574829101562, + "learning_rate": 9.972832549405912e-06, + "loss": 0.6965, + "step": 1303 + }, + { + "epoch": 0.0717705982717816, + "grad_norm": 0.936720609664917, + "learning_rate": 9.972787405495998e-06, + "loss": 0.798, + "step": 1304 + }, + { + "epoch": 0.07182563707413726, + "grad_norm": 0.8932886123657227, + "learning_rate": 9.972742224211949e-06, + "loss": 0.9196, + "step": 1305 + }, + { + "epoch": 0.07188067587649292, + "grad_norm": 0.899246871471405, + "learning_rate": 9.972697005554099e-06, + "loss": 0.8081, + "step": 1306 + }, + { + "epoch": 0.07193571467884859, + "grad_norm": 0.8789899349212646, + "learning_rate": 9.972651749522788e-06, + "loss": 0.89, + "step": 1307 + }, + { + "epoch": 0.07199075348120425, + "grad_norm": 1.2412173748016357, + "learning_rate": 9.97260645611836e-06, + "loss": 0.9866, + "step": 1308 + }, + { + "epoch": 0.07204579228355991, + "grad_norm": 0.8655833005905151, + "learning_rate": 9.972561125341152e-06, + "loss": 0.8144, + "step": 1309 + }, + { + "epoch": 0.07210083108591557, + "grad_norm": 0.8705299496650696, + "learning_rate": 9.972515757191506e-06, + "loss": 0.8431, + "step": 1310 + }, + { + "epoch": 0.07215586988827123, + "grad_norm": 0.8813188672065735, + "learning_rate": 9.972470351669761e-06, + "loss": 0.859, + "step": 1311 + }, + { + "epoch": 0.0722109086906269, + "grad_norm": 2.043627977371216, + "learning_rate": 9.972424908776262e-06, + "loss": 0.9886, + "step": 1312 + }, + { + "epoch": 0.07226594749298255, + "grad_norm": 0.9167500734329224, + "learning_rate": 9.972379428511348e-06, + "loss": 0.7203, + "step": 1313 + }, + { + "epoch": 0.07232098629533822, + "grad_norm": 1.3145136833190918, + "learning_rate": 9.972333910875358e-06, + "loss": 0.9325, + "step": 1314 + }, + { + "epoch": 0.07237602509769388, + "grad_norm": 0.834710419178009, + "learning_rate": 9.972288355868641e-06, + "loss": 0.9361, + "step": 1315 + }, + { + "epoch": 0.07243106390004954, + "grad_norm": 0.9039230942726135, + "learning_rate": 9.972242763491535e-06, + "loss": 0.8027, + "step": 1316 + }, + { + "epoch": 0.0724861027024052, + "grad_norm": 0.8911495208740234, + "learning_rate": 9.972197133744384e-06, + "loss": 0.951, + "step": 1317 + }, + { + "epoch": 0.07254114150476086, + "grad_norm": 1.0752439498901367, + "learning_rate": 9.972151466627529e-06, + "loss": 0.8421, + "step": 1318 + }, + { + "epoch": 0.07259618030711652, + "grad_norm": 0.926135778427124, + "learning_rate": 9.972105762141314e-06, + "loss": 0.8901, + "step": 1319 + }, + { + "epoch": 0.07265121910947218, + "grad_norm": 0.8166295289993286, + "learning_rate": 9.972060020286085e-06, + "loss": 0.7845, + "step": 1320 + }, + { + "epoch": 0.07270625791182783, + "grad_norm": 1.0000934600830078, + "learning_rate": 9.972014241062182e-06, + "loss": 0.8383, + "step": 1321 + }, + { + "epoch": 0.0727612967141835, + "grad_norm": 1.2617899179458618, + "learning_rate": 9.971968424469951e-06, + "loss": 0.9826, + "step": 1322 + }, + { + "epoch": 0.07281633551653915, + "grad_norm": 0.8451040983200073, + "learning_rate": 9.971922570509738e-06, + "loss": 0.8262, + "step": 1323 + }, + { + "epoch": 0.07287137431889482, + "grad_norm": 0.8101939558982849, + "learning_rate": 9.971876679181884e-06, + "loss": 0.6904, + "step": 1324 + }, + { + "epoch": 0.07292641312125048, + "grad_norm": 0.8805514574050903, + "learning_rate": 9.971830750486736e-06, + "loss": 0.8491, + "step": 1325 + }, + { + "epoch": 0.07298145192360614, + "grad_norm": 0.8236901164054871, + "learning_rate": 9.97178478442464e-06, + "loss": 0.8462, + "step": 1326 + }, + { + "epoch": 0.0730364907259618, + "grad_norm": 0.9183042645454407, + "learning_rate": 9.971738780995938e-06, + "loss": 0.7577, + "step": 1327 + }, + { + "epoch": 0.07309152952831746, + "grad_norm": 0.8425934314727783, + "learning_rate": 9.971692740200982e-06, + "loss": 0.8462, + "step": 1328 + }, + { + "epoch": 0.07314656833067312, + "grad_norm": 0.9114993214607239, + "learning_rate": 9.971646662040112e-06, + "loss": 0.9132, + "step": 1329 + }, + { + "epoch": 0.07320160713302878, + "grad_norm": 0.8516649603843689, + "learning_rate": 9.971600546513675e-06, + "loss": 0.8819, + "step": 1330 + }, + { + "epoch": 0.07325664593538445, + "grad_norm": 1.0859558582305908, + "learning_rate": 9.971554393622023e-06, + "loss": 0.9929, + "step": 1331 + }, + { + "epoch": 0.07331168473774011, + "grad_norm": 0.8906900882720947, + "learning_rate": 9.971508203365497e-06, + "loss": 0.9166, + "step": 1332 + }, + { + "epoch": 0.07336672354009577, + "grad_norm": 0.8931803703308105, + "learning_rate": 9.971461975744445e-06, + "loss": 0.864, + "step": 1333 + }, + { + "epoch": 0.07342176234245143, + "grad_norm": 0.8404982686042786, + "learning_rate": 9.971415710759216e-06, + "loss": 0.8609, + "step": 1334 + }, + { + "epoch": 0.07347680114480709, + "grad_norm": 0.8016490340232849, + "learning_rate": 9.971369408410157e-06, + "loss": 0.7694, + "step": 1335 + }, + { + "epoch": 0.07353183994716275, + "grad_norm": 0.7700600028038025, + "learning_rate": 9.971323068697618e-06, + "loss": 0.7875, + "step": 1336 + }, + { + "epoch": 0.07358687874951841, + "grad_norm": 0.8679799437522888, + "learning_rate": 9.971276691621946e-06, + "loss": 0.8409, + "step": 1337 + }, + { + "epoch": 0.07364191755187408, + "grad_norm": 0.8329173922538757, + "learning_rate": 9.971230277183486e-06, + "loss": 0.8707, + "step": 1338 + }, + { + "epoch": 0.07369695635422974, + "grad_norm": 0.8790140151977539, + "learning_rate": 9.97118382538259e-06, + "loss": 0.7631, + "step": 1339 + }, + { + "epoch": 0.0737519951565854, + "grad_norm": 1.1895341873168945, + "learning_rate": 9.97113733621961e-06, + "loss": 0.8555, + "step": 1340 + }, + { + "epoch": 0.07380703395894106, + "grad_norm": 0.8531593680381775, + "learning_rate": 9.97109080969489e-06, + "loss": 0.7192, + "step": 1341 + }, + { + "epoch": 0.07386207276129672, + "grad_norm": 1.0388946533203125, + "learning_rate": 9.971044245808784e-06, + "loss": 0.8182, + "step": 1342 + }, + { + "epoch": 0.07391711156365237, + "grad_norm": 0.8858556747436523, + "learning_rate": 9.970997644561639e-06, + "loss": 0.7981, + "step": 1343 + }, + { + "epoch": 0.07397215036600803, + "grad_norm": 0.8710204362869263, + "learning_rate": 9.970951005953807e-06, + "loss": 0.7667, + "step": 1344 + }, + { + "epoch": 0.07402718916836369, + "grad_norm": 0.9788708090782166, + "learning_rate": 9.970904329985638e-06, + "loss": 0.9693, + "step": 1345 + }, + { + "epoch": 0.07408222797071935, + "grad_norm": 0.7805914878845215, + "learning_rate": 9.970857616657482e-06, + "loss": 0.6683, + "step": 1346 + }, + { + "epoch": 0.07413726677307501, + "grad_norm": 0.9977933168411255, + "learning_rate": 9.97081086596969e-06, + "loss": 0.8288, + "step": 1347 + }, + { + "epoch": 0.07419230557543068, + "grad_norm": 0.829115629196167, + "learning_rate": 9.970764077922617e-06, + "loss": 0.8361, + "step": 1348 + }, + { + "epoch": 0.07424734437778634, + "grad_norm": 1.226120114326477, + "learning_rate": 9.97071725251661e-06, + "loss": 1.0008, + "step": 1349 + }, + { + "epoch": 0.074302383180142, + "grad_norm": 0.8997750878334045, + "learning_rate": 9.970670389752021e-06, + "loss": 0.8048, + "step": 1350 + }, + { + "epoch": 0.07435742198249766, + "grad_norm": 1.0885238647460938, + "learning_rate": 9.970623489629205e-06, + "loss": 0.9202, + "step": 1351 + }, + { + "epoch": 0.07441246078485332, + "grad_norm": 0.8736100792884827, + "learning_rate": 9.970576552148515e-06, + "loss": 0.8515, + "step": 1352 + }, + { + "epoch": 0.07446749958720898, + "grad_norm": 0.9211294651031494, + "learning_rate": 9.970529577310301e-06, + "loss": 0.9389, + "step": 1353 + }, + { + "epoch": 0.07452253838956464, + "grad_norm": 0.9334765672683716, + "learning_rate": 9.970482565114917e-06, + "loss": 0.8165, + "step": 1354 + }, + { + "epoch": 0.0745775771919203, + "grad_norm": 0.8307162523269653, + "learning_rate": 9.970435515562717e-06, + "loss": 0.7829, + "step": 1355 + }, + { + "epoch": 0.07463261599427597, + "grad_norm": 0.987634003162384, + "learning_rate": 9.970388428654055e-06, + "loss": 0.848, + "step": 1356 + }, + { + "epoch": 0.07468765479663163, + "grad_norm": 1.094752311706543, + "learning_rate": 9.970341304389281e-06, + "loss": 1.003, + "step": 1357 + }, + { + "epoch": 0.07474269359898729, + "grad_norm": 0.9865909814834595, + "learning_rate": 9.970294142768755e-06, + "loss": 0.9116, + "step": 1358 + }, + { + "epoch": 0.07479773240134295, + "grad_norm": 0.8404149413108826, + "learning_rate": 9.970246943792828e-06, + "loss": 0.8699, + "step": 1359 + }, + { + "epoch": 0.07485277120369861, + "grad_norm": 0.9602416753768921, + "learning_rate": 9.970199707461855e-06, + "loss": 0.8166, + "step": 1360 + }, + { + "epoch": 0.07490781000605427, + "grad_norm": 0.9748693704605103, + "learning_rate": 9.970152433776193e-06, + "loss": 0.8767, + "step": 1361 + }, + { + "epoch": 0.07496284880840993, + "grad_norm": 0.8721657991409302, + "learning_rate": 9.970105122736194e-06, + "loss": 0.8825, + "step": 1362 + }, + { + "epoch": 0.0750178876107656, + "grad_norm": 0.8683610558509827, + "learning_rate": 9.970057774342215e-06, + "loss": 0.7873, + "step": 1363 + }, + { + "epoch": 0.07507292641312124, + "grad_norm": 0.856396496295929, + "learning_rate": 9.970010388594613e-06, + "loss": 0.8505, + "step": 1364 + }, + { + "epoch": 0.0751279652154769, + "grad_norm": 1.0709880590438843, + "learning_rate": 9.969962965493744e-06, + "loss": 0.9519, + "step": 1365 + }, + { + "epoch": 0.07518300401783257, + "grad_norm": 0.8839450478553772, + "learning_rate": 9.969915505039963e-06, + "loss": 0.8041, + "step": 1366 + }, + { + "epoch": 0.07523804282018823, + "grad_norm": 0.89545738697052, + "learning_rate": 9.969868007233627e-06, + "loss": 0.8713, + "step": 1367 + }, + { + "epoch": 0.07529308162254389, + "grad_norm": 0.9870849251747131, + "learning_rate": 9.969820472075094e-06, + "loss": 0.8655, + "step": 1368 + }, + { + "epoch": 0.07534812042489955, + "grad_norm": 1.3123797178268433, + "learning_rate": 9.96977289956472e-06, + "loss": 1.0425, + "step": 1369 + }, + { + "epoch": 0.07540315922725521, + "grad_norm": 0.8538400530815125, + "learning_rate": 9.969725289702865e-06, + "loss": 0.7052, + "step": 1370 + }, + { + "epoch": 0.07545819802961087, + "grad_norm": 0.933397114276886, + "learning_rate": 9.969677642489884e-06, + "loss": 0.9819, + "step": 1371 + }, + { + "epoch": 0.07551323683196653, + "grad_norm": 0.8428112268447876, + "learning_rate": 9.969629957926134e-06, + "loss": 0.7313, + "step": 1372 + }, + { + "epoch": 0.0755682756343222, + "grad_norm": 0.9023239612579346, + "learning_rate": 9.96958223601198e-06, + "loss": 0.8297, + "step": 1373 + }, + { + "epoch": 0.07562331443667786, + "grad_norm": 0.8971324563026428, + "learning_rate": 9.969534476747771e-06, + "loss": 0.8832, + "step": 1374 + }, + { + "epoch": 0.07567835323903352, + "grad_norm": 0.8709388375282288, + "learning_rate": 9.969486680133874e-06, + "loss": 0.743, + "step": 1375 + }, + { + "epoch": 0.07573339204138918, + "grad_norm": 0.9094591736793518, + "learning_rate": 9.969438846170644e-06, + "loss": 0.8294, + "step": 1376 + }, + { + "epoch": 0.07578843084374484, + "grad_norm": 1.0753988027572632, + "learning_rate": 9.969390974858444e-06, + "loss": 0.7479, + "step": 1377 + }, + { + "epoch": 0.0758434696461005, + "grad_norm": 0.933775007724762, + "learning_rate": 9.96934306619763e-06, + "loss": 0.8235, + "step": 1378 + }, + { + "epoch": 0.07589850844845616, + "grad_norm": 0.8419735431671143, + "learning_rate": 9.969295120188565e-06, + "loss": 0.8103, + "step": 1379 + }, + { + "epoch": 0.07595354725081183, + "grad_norm": 0.8912790417671204, + "learning_rate": 9.969247136831606e-06, + "loss": 0.911, + "step": 1380 + }, + { + "epoch": 0.07600858605316749, + "grad_norm": 0.8780983090400696, + "learning_rate": 9.969199116127118e-06, + "loss": 0.8619, + "step": 1381 + }, + { + "epoch": 0.07606362485552315, + "grad_norm": 0.8503809571266174, + "learning_rate": 9.969151058075459e-06, + "loss": 0.8093, + "step": 1382 + }, + { + "epoch": 0.07611866365787881, + "grad_norm": 0.8633087277412415, + "learning_rate": 9.96910296267699e-06, + "loss": 0.7524, + "step": 1383 + }, + { + "epoch": 0.07617370246023447, + "grad_norm": 1.1203595399856567, + "learning_rate": 9.969054829932074e-06, + "loss": 0.945, + "step": 1384 + }, + { + "epoch": 0.07622874126259013, + "grad_norm": 0.8766878843307495, + "learning_rate": 9.969006659841072e-06, + "loss": 0.7537, + "step": 1385 + }, + { + "epoch": 0.07628378006494578, + "grad_norm": 0.9795958399772644, + "learning_rate": 9.968958452404345e-06, + "loss": 0.7963, + "step": 1386 + }, + { + "epoch": 0.07633881886730144, + "grad_norm": 0.9117506146430969, + "learning_rate": 9.968910207622257e-06, + "loss": 0.9469, + "step": 1387 + }, + { + "epoch": 0.0763938576696571, + "grad_norm": 0.9731466770172119, + "learning_rate": 9.96886192549517e-06, + "loss": 0.9536, + "step": 1388 + }, + { + "epoch": 0.07644889647201276, + "grad_norm": 0.8923571109771729, + "learning_rate": 9.968813606023446e-06, + "loss": 0.8362, + "step": 1389 + }, + { + "epoch": 0.07650393527436843, + "grad_norm": 0.8819600343704224, + "learning_rate": 9.96876524920745e-06, + "loss": 0.6938, + "step": 1390 + }, + { + "epoch": 0.07655897407672409, + "grad_norm": 0.9629887342453003, + "learning_rate": 9.968716855047545e-06, + "loss": 0.9104, + "step": 1391 + }, + { + "epoch": 0.07661401287907975, + "grad_norm": 0.992770254611969, + "learning_rate": 9.968668423544093e-06, + "loss": 0.944, + "step": 1392 + }, + { + "epoch": 0.07666905168143541, + "grad_norm": 0.8578491806983948, + "learning_rate": 9.96861995469746e-06, + "loss": 0.898, + "step": 1393 + }, + { + "epoch": 0.07672409048379107, + "grad_norm": 1.1169229745864868, + "learning_rate": 9.968571448508008e-06, + "loss": 0.8324, + "step": 1394 + }, + { + "epoch": 0.07677912928614673, + "grad_norm": 0.9600160121917725, + "learning_rate": 9.968522904976106e-06, + "loss": 0.9519, + "step": 1395 + }, + { + "epoch": 0.0768341680885024, + "grad_norm": 0.8271373510360718, + "learning_rate": 9.968474324102112e-06, + "loss": 0.8576, + "step": 1396 + }, + { + "epoch": 0.07688920689085806, + "grad_norm": 0.9437325596809387, + "learning_rate": 9.968425705886397e-06, + "loss": 0.9201, + "step": 1397 + }, + { + "epoch": 0.07694424569321372, + "grad_norm": 0.8679039478302002, + "learning_rate": 9.968377050329325e-06, + "loss": 0.8893, + "step": 1398 + }, + { + "epoch": 0.07699928449556938, + "grad_norm": 1.0178717374801636, + "learning_rate": 9.96832835743126e-06, + "loss": 0.9718, + "step": 1399 + }, + { + "epoch": 0.07705432329792504, + "grad_norm": 0.8354432582855225, + "learning_rate": 9.96827962719257e-06, + "loss": 0.83, + "step": 1400 + }, + { + "epoch": 0.0771093621002807, + "grad_norm": 1.2244631052017212, + "learning_rate": 9.968230859613619e-06, + "loss": 0.907, + "step": 1401 + }, + { + "epoch": 0.07716440090263636, + "grad_norm": 0.9099625945091248, + "learning_rate": 9.968182054694775e-06, + "loss": 0.809, + "step": 1402 + }, + { + "epoch": 0.07721943970499202, + "grad_norm": 0.8591424226760864, + "learning_rate": 9.968133212436404e-06, + "loss": 0.8869, + "step": 1403 + }, + { + "epoch": 0.07727447850734769, + "grad_norm": 1.068003535270691, + "learning_rate": 9.968084332838876e-06, + "loss": 0.8747, + "step": 1404 + }, + { + "epoch": 0.07732951730970335, + "grad_norm": 0.8503691554069519, + "learning_rate": 9.968035415902555e-06, + "loss": 0.7478, + "step": 1405 + }, + { + "epoch": 0.07738455611205901, + "grad_norm": 0.9209537506103516, + "learning_rate": 9.967986461627808e-06, + "loss": 0.9052, + "step": 1406 + }, + { + "epoch": 0.07743959491441466, + "grad_norm": 0.8447962999343872, + "learning_rate": 9.967937470015006e-06, + "loss": 0.7897, + "step": 1407 + }, + { + "epoch": 0.07749463371677032, + "grad_norm": 0.8731846809387207, + "learning_rate": 9.967888441064515e-06, + "loss": 0.837, + "step": 1408 + }, + { + "epoch": 0.07754967251912598, + "grad_norm": 0.9810444712638855, + "learning_rate": 9.967839374776705e-06, + "loss": 0.8236, + "step": 1409 + }, + { + "epoch": 0.07760471132148164, + "grad_norm": 0.8283190131187439, + "learning_rate": 9.967790271151944e-06, + "loss": 0.8443, + "step": 1410 + }, + { + "epoch": 0.0776597501238373, + "grad_norm": 0.7999932765960693, + "learning_rate": 9.9677411301906e-06, + "loss": 0.7945, + "step": 1411 + }, + { + "epoch": 0.07771478892619296, + "grad_norm": 0.9435983300209045, + "learning_rate": 9.967691951893044e-06, + "loss": 0.9745, + "step": 1412 + }, + { + "epoch": 0.07776982772854862, + "grad_norm": 0.8885984420776367, + "learning_rate": 9.967642736259646e-06, + "loss": 0.9163, + "step": 1413 + }, + { + "epoch": 0.07782486653090429, + "grad_norm": 0.993928074836731, + "learning_rate": 9.967593483290776e-06, + "loss": 0.7797, + "step": 1414 + }, + { + "epoch": 0.07787990533325995, + "grad_norm": 1.058830976486206, + "learning_rate": 9.9675441929868e-06, + "loss": 0.8671, + "step": 1415 + }, + { + "epoch": 0.07793494413561561, + "grad_norm": 1.0469766855239868, + "learning_rate": 9.967494865348093e-06, + "loss": 0.8671, + "step": 1416 + }, + { + "epoch": 0.07798998293797127, + "grad_norm": 0.902729868888855, + "learning_rate": 9.967445500375025e-06, + "loss": 0.8748, + "step": 1417 + }, + { + "epoch": 0.07804502174032693, + "grad_norm": 0.90755295753479, + "learning_rate": 9.967396098067965e-06, + "loss": 0.8279, + "step": 1418 + }, + { + "epoch": 0.07810006054268259, + "grad_norm": 0.8822374939918518, + "learning_rate": 9.967346658427287e-06, + "loss": 0.9386, + "step": 1419 + }, + { + "epoch": 0.07815509934503825, + "grad_norm": 0.9201469421386719, + "learning_rate": 9.96729718145336e-06, + "loss": 0.8684, + "step": 1420 + }, + { + "epoch": 0.07821013814739392, + "grad_norm": 0.9451109766960144, + "learning_rate": 9.967247667146558e-06, + "loss": 0.7854, + "step": 1421 + }, + { + "epoch": 0.07826517694974958, + "grad_norm": 0.9146197438240051, + "learning_rate": 9.96719811550725e-06, + "loss": 0.8496, + "step": 1422 + }, + { + "epoch": 0.07832021575210524, + "grad_norm": 0.9771224856376648, + "learning_rate": 9.967148526535813e-06, + "loss": 0.9657, + "step": 1423 + }, + { + "epoch": 0.0783752545544609, + "grad_norm": 0.8437683582305908, + "learning_rate": 9.967098900232616e-06, + "loss": 0.8336, + "step": 1424 + }, + { + "epoch": 0.07843029335681656, + "grad_norm": 0.8232185244560242, + "learning_rate": 9.967049236598034e-06, + "loss": 0.8878, + "step": 1425 + }, + { + "epoch": 0.07848533215917222, + "grad_norm": 1.0200369358062744, + "learning_rate": 9.96699953563244e-06, + "loss": 0.8135, + "step": 1426 + }, + { + "epoch": 0.07854037096152788, + "grad_norm": 0.8779187202453613, + "learning_rate": 9.966949797336208e-06, + "loss": 0.9124, + "step": 1427 + }, + { + "epoch": 0.07859540976388354, + "grad_norm": 0.9557466506958008, + "learning_rate": 9.966900021709708e-06, + "loss": 0.9118, + "step": 1428 + }, + { + "epoch": 0.07865044856623919, + "grad_norm": 0.8431050777435303, + "learning_rate": 9.966850208753317e-06, + "loss": 0.8361, + "step": 1429 + }, + { + "epoch": 0.07870548736859485, + "grad_norm": 0.9269648194313049, + "learning_rate": 9.966800358467412e-06, + "loss": 0.9194, + "step": 1430 + }, + { + "epoch": 0.07876052617095052, + "grad_norm": 0.818681538105011, + "learning_rate": 9.966750470852363e-06, + "loss": 0.7483, + "step": 1431 + }, + { + "epoch": 0.07881556497330618, + "grad_norm": 0.8788284659385681, + "learning_rate": 9.966700545908547e-06, + "loss": 0.858, + "step": 1432 + }, + { + "epoch": 0.07887060377566184, + "grad_norm": 0.7734160423278809, + "learning_rate": 9.966650583636342e-06, + "loss": 0.694, + "step": 1433 + }, + { + "epoch": 0.0789256425780175, + "grad_norm": 0.8846608996391296, + "learning_rate": 9.966600584036117e-06, + "loss": 0.8144, + "step": 1434 + }, + { + "epoch": 0.07898068138037316, + "grad_norm": 0.9740058183670044, + "learning_rate": 9.966550547108254e-06, + "loss": 0.9314, + "step": 1435 + }, + { + "epoch": 0.07903572018272882, + "grad_norm": 0.8731759786605835, + "learning_rate": 9.966500472853124e-06, + "loss": 0.8475, + "step": 1436 + }, + { + "epoch": 0.07909075898508448, + "grad_norm": 0.8984843492507935, + "learning_rate": 9.966450361271109e-06, + "loss": 0.7803, + "step": 1437 + }, + { + "epoch": 0.07914579778744014, + "grad_norm": 0.8897966742515564, + "learning_rate": 9.96640021236258e-06, + "loss": 0.8879, + "step": 1438 + }, + { + "epoch": 0.0792008365897958, + "grad_norm": 0.80704265832901, + "learning_rate": 9.966350026127917e-06, + "loss": 0.7585, + "step": 1439 + }, + { + "epoch": 0.07925587539215147, + "grad_norm": 1.0807467699050903, + "learning_rate": 9.966299802567499e-06, + "loss": 1.078, + "step": 1440 + }, + { + "epoch": 0.07931091419450713, + "grad_norm": 0.7994028925895691, + "learning_rate": 9.966249541681697e-06, + "loss": 0.8074, + "step": 1441 + }, + { + "epoch": 0.07936595299686279, + "grad_norm": 0.877592921257019, + "learning_rate": 9.966199243470895e-06, + "loss": 0.8084, + "step": 1442 + }, + { + "epoch": 0.07942099179921845, + "grad_norm": 0.7704572081565857, + "learning_rate": 9.966148907935469e-06, + "loss": 0.7206, + "step": 1443 + }, + { + "epoch": 0.07947603060157411, + "grad_norm": 0.8222140669822693, + "learning_rate": 9.966098535075797e-06, + "loss": 0.7768, + "step": 1444 + }, + { + "epoch": 0.07953106940392977, + "grad_norm": 1.389320731163025, + "learning_rate": 9.966048124892257e-06, + "loss": 1.0356, + "step": 1445 + }, + { + "epoch": 0.07958610820628544, + "grad_norm": 0.9082457423210144, + "learning_rate": 9.965997677385229e-06, + "loss": 0.7379, + "step": 1446 + }, + { + "epoch": 0.0796411470086411, + "grad_norm": 0.8029153943061829, + "learning_rate": 9.965947192555093e-06, + "loss": 0.7826, + "step": 1447 + }, + { + "epoch": 0.07969618581099676, + "grad_norm": 0.8752758502960205, + "learning_rate": 9.965896670402227e-06, + "loss": 0.8526, + "step": 1448 + }, + { + "epoch": 0.07975122461335242, + "grad_norm": 1.0665404796600342, + "learning_rate": 9.965846110927009e-06, + "loss": 0.858, + "step": 1449 + }, + { + "epoch": 0.07980626341570807, + "grad_norm": 0.9468502402305603, + "learning_rate": 9.96579551412982e-06, + "loss": 0.9658, + "step": 1450 + }, + { + "epoch": 0.07986130221806373, + "grad_norm": 1.0239403247833252, + "learning_rate": 9.965744880011046e-06, + "loss": 0.7995, + "step": 1451 + }, + { + "epoch": 0.07991634102041939, + "grad_norm": 0.9808099865913391, + "learning_rate": 9.965694208571059e-06, + "loss": 1.0173, + "step": 1452 + }, + { + "epoch": 0.07997137982277505, + "grad_norm": 0.9338780641555786, + "learning_rate": 9.965643499810245e-06, + "loss": 0.7917, + "step": 1453 + }, + { + "epoch": 0.08002641862513071, + "grad_norm": 0.9294295310974121, + "learning_rate": 9.965592753728981e-06, + "loss": 0.88, + "step": 1454 + }, + { + "epoch": 0.08008145742748637, + "grad_norm": 1.0261508226394653, + "learning_rate": 9.965541970327654e-06, + "loss": 0.8825, + "step": 1455 + }, + { + "epoch": 0.08013649622984204, + "grad_norm": 0.8964946269989014, + "learning_rate": 9.965491149606642e-06, + "loss": 0.81, + "step": 1456 + }, + { + "epoch": 0.0801915350321977, + "grad_norm": 0.9468267560005188, + "learning_rate": 9.965440291566329e-06, + "loss": 0.9453, + "step": 1457 + }, + { + "epoch": 0.08024657383455336, + "grad_norm": 0.8289040327072144, + "learning_rate": 9.965389396207092e-06, + "loss": 0.7373, + "step": 1458 + }, + { + "epoch": 0.08030161263690902, + "grad_norm": 0.8782384991645813, + "learning_rate": 9.965338463529322e-06, + "loss": 0.9199, + "step": 1459 + }, + { + "epoch": 0.08035665143926468, + "grad_norm": 0.8613787293434143, + "learning_rate": 9.965287493533395e-06, + "loss": 0.8719, + "step": 1460 + }, + { + "epoch": 0.08041169024162034, + "grad_norm": 0.8474903106689453, + "learning_rate": 9.965236486219696e-06, + "loss": 0.8033, + "step": 1461 + }, + { + "epoch": 0.080466729043976, + "grad_norm": 1.1442681550979614, + "learning_rate": 9.965185441588609e-06, + "loss": 0.8996, + "step": 1462 + }, + { + "epoch": 0.08052176784633167, + "grad_norm": 1.564138412475586, + "learning_rate": 9.965134359640518e-06, + "loss": 0.7451, + "step": 1463 + }, + { + "epoch": 0.08057680664868733, + "grad_norm": 0.9211083054542542, + "learning_rate": 9.965083240375806e-06, + "loss": 0.8939, + "step": 1464 + }, + { + "epoch": 0.08063184545104299, + "grad_norm": 0.9503418207168579, + "learning_rate": 9.965032083794856e-06, + "loss": 0.8544, + "step": 1465 + }, + { + "epoch": 0.08068688425339865, + "grad_norm": 0.9304021596908569, + "learning_rate": 9.964980889898055e-06, + "loss": 0.9192, + "step": 1466 + }, + { + "epoch": 0.08074192305575431, + "grad_norm": 0.8430425524711609, + "learning_rate": 9.964929658685787e-06, + "loss": 0.8586, + "step": 1467 + }, + { + "epoch": 0.08079696185810997, + "grad_norm": 0.8671759366989136, + "learning_rate": 9.964878390158437e-06, + "loss": 0.8807, + "step": 1468 + }, + { + "epoch": 0.08085200066046563, + "grad_norm": 0.9548830986022949, + "learning_rate": 9.964827084316389e-06, + "loss": 0.9033, + "step": 1469 + }, + { + "epoch": 0.0809070394628213, + "grad_norm": 0.8736767768859863, + "learning_rate": 9.964775741160029e-06, + "loss": 0.8509, + "step": 1470 + }, + { + "epoch": 0.08096207826517696, + "grad_norm": 0.8827025890350342, + "learning_rate": 9.964724360689745e-06, + "loss": 0.897, + "step": 1471 + }, + { + "epoch": 0.0810171170675326, + "grad_norm": 1.02822744846344, + "learning_rate": 9.964672942905921e-06, + "loss": 1.0371, + "step": 1472 + }, + { + "epoch": 0.08107215586988827, + "grad_norm": 0.8619557619094849, + "learning_rate": 9.964621487808946e-06, + "loss": 0.7654, + "step": 1473 + }, + { + "epoch": 0.08112719467224393, + "grad_norm": 0.7855951189994812, + "learning_rate": 9.9645699953992e-06, + "loss": 0.7767, + "step": 1474 + }, + { + "epoch": 0.08118223347459959, + "grad_norm": 0.8139809370040894, + "learning_rate": 9.96451846567708e-06, + "loss": 0.7535, + "step": 1475 + }, + { + "epoch": 0.08123727227695525, + "grad_norm": 0.8491657376289368, + "learning_rate": 9.964466898642966e-06, + "loss": 0.854, + "step": 1476 + }, + { + "epoch": 0.08129231107931091, + "grad_norm": 0.8968605399131775, + "learning_rate": 9.964415294297247e-06, + "loss": 0.8914, + "step": 1477 + }, + { + "epoch": 0.08134734988166657, + "grad_norm": 0.8692505359649658, + "learning_rate": 9.964363652640313e-06, + "loss": 0.9245, + "step": 1478 + }, + { + "epoch": 0.08140238868402223, + "grad_norm": 0.8916530013084412, + "learning_rate": 9.964311973672549e-06, + "loss": 0.7662, + "step": 1479 + }, + { + "epoch": 0.0814574274863779, + "grad_norm": 0.8239215612411499, + "learning_rate": 9.964260257394347e-06, + "loss": 0.9191, + "step": 1480 + }, + { + "epoch": 0.08151246628873356, + "grad_norm": 0.8672100901603699, + "learning_rate": 9.964208503806092e-06, + "loss": 0.7656, + "step": 1481 + }, + { + "epoch": 0.08156750509108922, + "grad_norm": 0.9195712208747864, + "learning_rate": 9.964156712908177e-06, + "loss": 0.8656, + "step": 1482 + }, + { + "epoch": 0.08162254389344488, + "grad_norm": 0.8282535672187805, + "learning_rate": 9.964104884700986e-06, + "loss": 0.8264, + "step": 1483 + }, + { + "epoch": 0.08167758269580054, + "grad_norm": 0.8492032289505005, + "learning_rate": 9.964053019184913e-06, + "loss": 0.7816, + "step": 1484 + }, + { + "epoch": 0.0817326214981562, + "grad_norm": 0.8491117358207703, + "learning_rate": 9.964001116360347e-06, + "loss": 0.7885, + "step": 1485 + }, + { + "epoch": 0.08178766030051186, + "grad_norm": 0.9415153861045837, + "learning_rate": 9.963949176227677e-06, + "loss": 0.8165, + "step": 1486 + }, + { + "epoch": 0.08184269910286752, + "grad_norm": 0.8462526202201843, + "learning_rate": 9.963897198787294e-06, + "loss": 0.8498, + "step": 1487 + }, + { + "epoch": 0.08189773790522319, + "grad_norm": 0.8591959476470947, + "learning_rate": 9.963845184039586e-06, + "loss": 0.8906, + "step": 1488 + }, + { + "epoch": 0.08195277670757885, + "grad_norm": 0.840761661529541, + "learning_rate": 9.963793131984949e-06, + "loss": 0.7831, + "step": 1489 + }, + { + "epoch": 0.08200781550993451, + "grad_norm": 0.931404173374176, + "learning_rate": 9.96374104262377e-06, + "loss": 0.889, + "step": 1490 + }, + { + "epoch": 0.08206285431229017, + "grad_norm": 0.9048783779144287, + "learning_rate": 9.963688915956443e-06, + "loss": 0.8321, + "step": 1491 + }, + { + "epoch": 0.08211789311464583, + "grad_norm": 0.9145931601524353, + "learning_rate": 9.96363675198336e-06, + "loss": 0.9918, + "step": 1492 + }, + { + "epoch": 0.08217293191700148, + "grad_norm": 0.9256643652915955, + "learning_rate": 9.963584550704908e-06, + "loss": 0.8731, + "step": 1493 + }, + { + "epoch": 0.08222797071935714, + "grad_norm": 1.0212007761001587, + "learning_rate": 9.963532312121486e-06, + "loss": 0.9077, + "step": 1494 + }, + { + "epoch": 0.0822830095217128, + "grad_norm": 0.9206242561340332, + "learning_rate": 9.963480036233483e-06, + "loss": 0.9076, + "step": 1495 + }, + { + "epoch": 0.08233804832406846, + "grad_norm": 0.8846865296363831, + "learning_rate": 9.963427723041294e-06, + "loss": 0.6826, + "step": 1496 + }, + { + "epoch": 0.08239308712642412, + "grad_norm": 0.8745351433753967, + "learning_rate": 9.963375372545309e-06, + "loss": 0.7935, + "step": 1497 + }, + { + "epoch": 0.08244812592877979, + "grad_norm": 0.9019666314125061, + "learning_rate": 9.963322984745924e-06, + "loss": 0.8435, + "step": 1498 + }, + { + "epoch": 0.08250316473113545, + "grad_norm": 0.8586859703063965, + "learning_rate": 9.963270559643531e-06, + "loss": 0.8118, + "step": 1499 + }, + { + "epoch": 0.08255820353349111, + "grad_norm": 0.9192817807197571, + "learning_rate": 9.963218097238528e-06, + "loss": 0.824, + "step": 1500 + }, + { + "epoch": 0.08261324233584677, + "grad_norm": 0.8972243070602417, + "learning_rate": 9.963165597531304e-06, + "loss": 0.8404, + "step": 1501 + }, + { + "epoch": 0.08266828113820243, + "grad_norm": 0.8953961133956909, + "learning_rate": 9.963113060522256e-06, + "loss": 0.9031, + "step": 1502 + }, + { + "epoch": 0.0827233199405581, + "grad_norm": 0.9551270604133606, + "learning_rate": 9.963060486211779e-06, + "loss": 0.9177, + "step": 1503 + }, + { + "epoch": 0.08277835874291375, + "grad_norm": 0.8524616956710815, + "learning_rate": 9.963007874600268e-06, + "loss": 0.8582, + "step": 1504 + }, + { + "epoch": 0.08283339754526942, + "grad_norm": 0.8148764371871948, + "learning_rate": 9.962955225688118e-06, + "loss": 0.6859, + "step": 1505 + }, + { + "epoch": 0.08288843634762508, + "grad_norm": 0.9110590219497681, + "learning_rate": 9.962902539475728e-06, + "loss": 0.7189, + "step": 1506 + }, + { + "epoch": 0.08294347514998074, + "grad_norm": 0.8700116872787476, + "learning_rate": 9.962849815963487e-06, + "loss": 0.9462, + "step": 1507 + }, + { + "epoch": 0.0829985139523364, + "grad_norm": 0.877109706401825, + "learning_rate": 9.962797055151797e-06, + "loss": 0.8138, + "step": 1508 + }, + { + "epoch": 0.08305355275469206, + "grad_norm": 0.7818365097045898, + "learning_rate": 9.962744257041053e-06, + "loss": 0.8474, + "step": 1509 + }, + { + "epoch": 0.08310859155704772, + "grad_norm": 0.88360196352005, + "learning_rate": 9.96269142163165e-06, + "loss": 0.8724, + "step": 1510 + }, + { + "epoch": 0.08316363035940338, + "grad_norm": 0.8982682228088379, + "learning_rate": 9.962638548923988e-06, + "loss": 0.9687, + "step": 1511 + }, + { + "epoch": 0.08321866916175905, + "grad_norm": 0.7362002730369568, + "learning_rate": 9.962585638918462e-06, + "loss": 0.7666, + "step": 1512 + }, + { + "epoch": 0.08327370796411471, + "grad_norm": 1.0993375778198242, + "learning_rate": 9.962532691615472e-06, + "loss": 0.8869, + "step": 1513 + }, + { + "epoch": 0.08332874676647037, + "grad_norm": 0.8684842586517334, + "learning_rate": 9.962479707015415e-06, + "loss": 0.872, + "step": 1514 + }, + { + "epoch": 0.08338378556882602, + "grad_norm": 1.0598478317260742, + "learning_rate": 9.962426685118689e-06, + "loss": 0.9102, + "step": 1515 + }, + { + "epoch": 0.08343882437118168, + "grad_norm": 0.8492125272750854, + "learning_rate": 9.96237362592569e-06, + "loss": 0.7554, + "step": 1516 + }, + { + "epoch": 0.08349386317353734, + "grad_norm": 0.8489052653312683, + "learning_rate": 9.962320529436821e-06, + "loss": 0.9139, + "step": 1517 + }, + { + "epoch": 0.083548901975893, + "grad_norm": 0.8650774359703064, + "learning_rate": 9.962267395652479e-06, + "loss": 0.8717, + "step": 1518 + }, + { + "epoch": 0.08360394077824866, + "grad_norm": 0.8393206596374512, + "learning_rate": 9.962214224573064e-06, + "loss": 0.8256, + "step": 1519 + }, + { + "epoch": 0.08365897958060432, + "grad_norm": 0.8304896354675293, + "learning_rate": 9.962161016198974e-06, + "loss": 0.8232, + "step": 1520 + }, + { + "epoch": 0.08371401838295998, + "grad_norm": 0.8718386292457581, + "learning_rate": 9.962107770530612e-06, + "loss": 0.8206, + "step": 1521 + }, + { + "epoch": 0.08376905718531565, + "grad_norm": 0.9109341502189636, + "learning_rate": 9.962054487568373e-06, + "loss": 0.9576, + "step": 1522 + }, + { + "epoch": 0.08382409598767131, + "grad_norm": 0.9543303847312927, + "learning_rate": 9.962001167312663e-06, + "loss": 0.8816, + "step": 1523 + }, + { + "epoch": 0.08387913479002697, + "grad_norm": 0.9992844462394714, + "learning_rate": 9.961947809763881e-06, + "loss": 0.8682, + "step": 1524 + }, + { + "epoch": 0.08393417359238263, + "grad_norm": 0.8092770576477051, + "learning_rate": 9.961894414922425e-06, + "loss": 0.6352, + "step": 1525 + }, + { + "epoch": 0.08398921239473829, + "grad_norm": 0.9888653755187988, + "learning_rate": 9.961840982788703e-06, + "loss": 0.8721, + "step": 1526 + }, + { + "epoch": 0.08404425119709395, + "grad_norm": 1.0092703104019165, + "learning_rate": 9.961787513363108e-06, + "loss": 0.7776, + "step": 1527 + }, + { + "epoch": 0.08409928999944961, + "grad_norm": 0.8654646277427673, + "learning_rate": 9.961734006646049e-06, + "loss": 0.8835, + "step": 1528 + }, + { + "epoch": 0.08415432880180528, + "grad_norm": 0.7630153298377991, + "learning_rate": 9.961680462637924e-06, + "loss": 0.7501, + "step": 1529 + }, + { + "epoch": 0.08420936760416094, + "grad_norm": 1.1883158683776855, + "learning_rate": 9.961626881339138e-06, + "loss": 0.9476, + "step": 1530 + }, + { + "epoch": 0.0842644064065166, + "grad_norm": 0.8710927963256836, + "learning_rate": 9.96157326275009e-06, + "loss": 0.749, + "step": 1531 + }, + { + "epoch": 0.08431944520887226, + "grad_norm": 0.9500633478164673, + "learning_rate": 9.961519606871188e-06, + "loss": 0.8994, + "step": 1532 + }, + { + "epoch": 0.08437448401122792, + "grad_norm": 0.873257577419281, + "learning_rate": 9.961465913702833e-06, + "loss": 0.816, + "step": 1533 + }, + { + "epoch": 0.08442952281358358, + "grad_norm": 0.8007022142410278, + "learning_rate": 9.961412183245426e-06, + "loss": 0.787, + "step": 1534 + }, + { + "epoch": 0.08448456161593924, + "grad_norm": 0.8998435139656067, + "learning_rate": 9.961358415499374e-06, + "loss": 0.8741, + "step": 1535 + }, + { + "epoch": 0.08453960041829489, + "grad_norm": 0.9152502417564392, + "learning_rate": 9.961304610465081e-06, + "loss": 0.9749, + "step": 1536 + }, + { + "epoch": 0.08459463922065055, + "grad_norm": 0.8961958289146423, + "learning_rate": 9.961250768142949e-06, + "loss": 0.8683, + "step": 1537 + }, + { + "epoch": 0.08464967802300621, + "grad_norm": 0.8683995008468628, + "learning_rate": 9.961196888533387e-06, + "loss": 0.8347, + "step": 1538 + }, + { + "epoch": 0.08470471682536188, + "grad_norm": 0.835221529006958, + "learning_rate": 9.961142971636795e-06, + "loss": 0.8936, + "step": 1539 + }, + { + "epoch": 0.08475975562771754, + "grad_norm": 0.8666725158691406, + "learning_rate": 9.96108901745358e-06, + "loss": 0.7344, + "step": 1540 + }, + { + "epoch": 0.0848147944300732, + "grad_norm": 0.9509082436561584, + "learning_rate": 9.96103502598415e-06, + "loss": 0.8965, + "step": 1541 + }, + { + "epoch": 0.08486983323242886, + "grad_norm": 0.8134233951568604, + "learning_rate": 9.960980997228908e-06, + "loss": 0.797, + "step": 1542 + }, + { + "epoch": 0.08492487203478452, + "grad_norm": 1.0432242155075073, + "learning_rate": 9.96092693118826e-06, + "loss": 0.8754, + "step": 1543 + }, + { + "epoch": 0.08497991083714018, + "grad_norm": 0.9560218453407288, + "learning_rate": 9.960872827862613e-06, + "loss": 0.9238, + "step": 1544 + }, + { + "epoch": 0.08503494963949584, + "grad_norm": 0.8471649885177612, + "learning_rate": 9.960818687252374e-06, + "loss": 0.8622, + "step": 1545 + }, + { + "epoch": 0.0850899884418515, + "grad_norm": 1.2584747076034546, + "learning_rate": 9.960764509357951e-06, + "loss": 0.8007, + "step": 1546 + }, + { + "epoch": 0.08514502724420717, + "grad_norm": 0.8730618953704834, + "learning_rate": 9.960710294179748e-06, + "loss": 0.7412, + "step": 1547 + }, + { + "epoch": 0.08520006604656283, + "grad_norm": 0.8361592292785645, + "learning_rate": 9.960656041718176e-06, + "loss": 0.7018, + "step": 1548 + }, + { + "epoch": 0.08525510484891849, + "grad_norm": 0.8351722359657288, + "learning_rate": 9.96060175197364e-06, + "loss": 0.843, + "step": 1549 + }, + { + "epoch": 0.08531014365127415, + "grad_norm": 0.8665090203285217, + "learning_rate": 9.960547424946549e-06, + "loss": 0.8235, + "step": 1550 + }, + { + "epoch": 0.08536518245362981, + "grad_norm": 0.9254478812217712, + "learning_rate": 9.960493060637313e-06, + "loss": 0.8122, + "step": 1551 + }, + { + "epoch": 0.08542022125598547, + "grad_norm": 0.8712261319160461, + "learning_rate": 9.960438659046337e-06, + "loss": 0.823, + "step": 1552 + }, + { + "epoch": 0.08547526005834113, + "grad_norm": 0.9027207493782043, + "learning_rate": 9.960384220174033e-06, + "loss": 0.7964, + "step": 1553 + }, + { + "epoch": 0.0855302988606968, + "grad_norm": 0.854626476764679, + "learning_rate": 9.960329744020808e-06, + "loss": 0.755, + "step": 1554 + }, + { + "epoch": 0.08558533766305246, + "grad_norm": 0.9398048520088196, + "learning_rate": 9.960275230587073e-06, + "loss": 0.8607, + "step": 1555 + }, + { + "epoch": 0.08564037646540812, + "grad_norm": 1.008002758026123, + "learning_rate": 9.960220679873238e-06, + "loss": 0.9711, + "step": 1556 + }, + { + "epoch": 0.08569541526776378, + "grad_norm": 0.8999453783035278, + "learning_rate": 9.96016609187971e-06, + "loss": 0.8233, + "step": 1557 + }, + { + "epoch": 0.08575045407011943, + "grad_norm": 0.8912106156349182, + "learning_rate": 9.960111466606903e-06, + "loss": 0.8271, + "step": 1558 + }, + { + "epoch": 0.08580549287247509, + "grad_norm": 0.9269998073577881, + "learning_rate": 9.960056804055227e-06, + "loss": 0.7959, + "step": 1559 + }, + { + "epoch": 0.08586053167483075, + "grad_norm": 1.083815336227417, + "learning_rate": 9.96000210422509e-06, + "loss": 0.9436, + "step": 1560 + }, + { + "epoch": 0.08591557047718641, + "grad_norm": 0.8906280398368835, + "learning_rate": 9.959947367116905e-06, + "loss": 0.9317, + "step": 1561 + }, + { + "epoch": 0.08597060927954207, + "grad_norm": 1.211696743965149, + "learning_rate": 9.959892592731084e-06, + "loss": 0.9076, + "step": 1562 + }, + { + "epoch": 0.08602564808189773, + "grad_norm": 0.9050534963607788, + "learning_rate": 9.959837781068038e-06, + "loss": 0.8728, + "step": 1563 + }, + { + "epoch": 0.0860806868842534, + "grad_norm": 0.9384796619415283, + "learning_rate": 9.959782932128178e-06, + "loss": 0.9277, + "step": 1564 + }, + { + "epoch": 0.08613572568660906, + "grad_norm": 0.795844316482544, + "learning_rate": 9.959728045911915e-06, + "loss": 0.7666, + "step": 1565 + }, + { + "epoch": 0.08619076448896472, + "grad_norm": 0.925956666469574, + "learning_rate": 9.959673122419668e-06, + "loss": 0.815, + "step": 1566 + }, + { + "epoch": 0.08624580329132038, + "grad_norm": 0.898047924041748, + "learning_rate": 9.959618161651843e-06, + "loss": 0.8131, + "step": 1567 + }, + { + "epoch": 0.08630084209367604, + "grad_norm": 0.8656220436096191, + "learning_rate": 9.959563163608856e-06, + "loss": 0.9336, + "step": 1568 + }, + { + "epoch": 0.0863558808960317, + "grad_norm": 0.9184645414352417, + "learning_rate": 9.95950812829112e-06, + "loss": 0.9557, + "step": 1569 + }, + { + "epoch": 0.08641091969838736, + "grad_norm": 0.8607667684555054, + "learning_rate": 9.959453055699048e-06, + "loss": 0.8272, + "step": 1570 + }, + { + "epoch": 0.08646595850074303, + "grad_norm": 0.9561272263526917, + "learning_rate": 9.959397945833056e-06, + "loss": 0.8876, + "step": 1571 + }, + { + "epoch": 0.08652099730309869, + "grad_norm": 0.8562412261962891, + "learning_rate": 9.959342798693556e-06, + "loss": 0.8404, + "step": 1572 + }, + { + "epoch": 0.08657603610545435, + "grad_norm": 0.8924610614776611, + "learning_rate": 9.95928761428096e-06, + "loss": 0.8779, + "step": 1573 + }, + { + "epoch": 0.08663107490781001, + "grad_norm": 0.8343208432197571, + "learning_rate": 9.95923239259569e-06, + "loss": 0.8992, + "step": 1574 + }, + { + "epoch": 0.08668611371016567, + "grad_norm": 0.8835015296936035, + "learning_rate": 9.959177133638155e-06, + "loss": 1.0026, + "step": 1575 + }, + { + "epoch": 0.08674115251252133, + "grad_norm": 0.9540221095085144, + "learning_rate": 9.959121837408771e-06, + "loss": 0.8507, + "step": 1576 + }, + { + "epoch": 0.086796191314877, + "grad_norm": 1.087817668914795, + "learning_rate": 9.959066503907957e-06, + "loss": 0.8607, + "step": 1577 + }, + { + "epoch": 0.08685123011723266, + "grad_norm": 0.8072447180747986, + "learning_rate": 9.959011133136124e-06, + "loss": 0.882, + "step": 1578 + }, + { + "epoch": 0.0869062689195883, + "grad_norm": 0.7646876573562622, + "learning_rate": 9.958955725093694e-06, + "loss": 0.7653, + "step": 1579 + }, + { + "epoch": 0.08696130772194396, + "grad_norm": 0.8979537487030029, + "learning_rate": 9.958900279781078e-06, + "loss": 0.9033, + "step": 1580 + }, + { + "epoch": 0.08701634652429963, + "grad_norm": 0.9445611834526062, + "learning_rate": 9.958844797198696e-06, + "loss": 0.9423, + "step": 1581 + }, + { + "epoch": 0.08707138532665529, + "grad_norm": 0.8836671113967896, + "learning_rate": 9.958789277346963e-06, + "loss": 0.839, + "step": 1582 + }, + { + "epoch": 0.08712642412901095, + "grad_norm": 1.0333542823791504, + "learning_rate": 9.958733720226296e-06, + "loss": 0.9211, + "step": 1583 + }, + { + "epoch": 0.08718146293136661, + "grad_norm": 0.8084085583686829, + "learning_rate": 9.958678125837117e-06, + "loss": 0.8387, + "step": 1584 + }, + { + "epoch": 0.08723650173372227, + "grad_norm": 0.7769419550895691, + "learning_rate": 9.958622494179838e-06, + "loss": 0.8307, + "step": 1585 + }, + { + "epoch": 0.08729154053607793, + "grad_norm": 0.8387578129768372, + "learning_rate": 9.95856682525488e-06, + "loss": 0.8001, + "step": 1586 + }, + { + "epoch": 0.0873465793384336, + "grad_norm": 0.8989812731742859, + "learning_rate": 9.95851111906266e-06, + "loss": 0.7752, + "step": 1587 + }, + { + "epoch": 0.08740161814078926, + "grad_norm": 0.8558734655380249, + "learning_rate": 9.958455375603602e-06, + "loss": 0.8149, + "step": 1588 + }, + { + "epoch": 0.08745665694314492, + "grad_norm": 0.8890896439552307, + "learning_rate": 9.958399594878117e-06, + "loss": 0.8232, + "step": 1589 + }, + { + "epoch": 0.08751169574550058, + "grad_norm": 0.875912070274353, + "learning_rate": 9.95834377688663e-06, + "loss": 0.7458, + "step": 1590 + }, + { + "epoch": 0.08756673454785624, + "grad_norm": 0.808355987071991, + "learning_rate": 9.958287921629557e-06, + "loss": 0.8296, + "step": 1591 + }, + { + "epoch": 0.0876217733502119, + "grad_norm": 0.9637090563774109, + "learning_rate": 9.958232029107318e-06, + "loss": 0.8769, + "step": 1592 + }, + { + "epoch": 0.08767681215256756, + "grad_norm": 0.8980715870857239, + "learning_rate": 9.958176099320336e-06, + "loss": 0.7995, + "step": 1593 + }, + { + "epoch": 0.08773185095492322, + "grad_norm": 0.9369860291481018, + "learning_rate": 9.95812013226903e-06, + "loss": 0.8545, + "step": 1594 + }, + { + "epoch": 0.08778688975727889, + "grad_norm": 0.8589349389076233, + "learning_rate": 9.958064127953819e-06, + "loss": 0.8693, + "step": 1595 + }, + { + "epoch": 0.08784192855963455, + "grad_norm": 0.929207444190979, + "learning_rate": 9.958008086375126e-06, + "loss": 0.811, + "step": 1596 + }, + { + "epoch": 0.08789696736199021, + "grad_norm": 1.0825661420822144, + "learning_rate": 9.957952007533371e-06, + "loss": 1.0145, + "step": 1597 + }, + { + "epoch": 0.08795200616434587, + "grad_norm": 0.8818382024765015, + "learning_rate": 9.957895891428978e-06, + "loss": 0.7771, + "step": 1598 + }, + { + "epoch": 0.08800704496670153, + "grad_norm": 0.882780909538269, + "learning_rate": 9.957839738062363e-06, + "loss": 0.8857, + "step": 1599 + }, + { + "epoch": 0.08806208376905718, + "grad_norm": 0.9136924743652344, + "learning_rate": 9.957783547433955e-06, + "loss": 0.8873, + "step": 1600 + }, + { + "epoch": 0.08811712257141284, + "grad_norm": 0.8896858096122742, + "learning_rate": 9.95772731954417e-06, + "loss": 0.8463, + "step": 1601 + }, + { + "epoch": 0.0881721613737685, + "grad_norm": 0.8671631813049316, + "learning_rate": 9.957671054393436e-06, + "loss": 0.8333, + "step": 1602 + }, + { + "epoch": 0.08822720017612416, + "grad_norm": 0.9442896246910095, + "learning_rate": 9.957614751982172e-06, + "loss": 0.9676, + "step": 1603 + }, + { + "epoch": 0.08828223897847982, + "grad_norm": 0.8249240517616272, + "learning_rate": 9.957558412310803e-06, + "loss": 0.7746, + "step": 1604 + }, + { + "epoch": 0.08833727778083549, + "grad_norm": 0.8125253319740295, + "learning_rate": 9.957502035379751e-06, + "loss": 0.7816, + "step": 1605 + }, + { + "epoch": 0.08839231658319115, + "grad_norm": 0.8467233777046204, + "learning_rate": 9.957445621189442e-06, + "loss": 0.7697, + "step": 1606 + }, + { + "epoch": 0.08844735538554681, + "grad_norm": 0.8322175145149231, + "learning_rate": 9.957389169740299e-06, + "loss": 0.7561, + "step": 1607 + }, + { + "epoch": 0.08850239418790247, + "grad_norm": 0.869163453578949, + "learning_rate": 9.957332681032746e-06, + "loss": 0.8984, + "step": 1608 + }, + { + "epoch": 0.08855743299025813, + "grad_norm": 0.8755944967269897, + "learning_rate": 9.957276155067206e-06, + "loss": 0.8016, + "step": 1609 + }, + { + "epoch": 0.08861247179261379, + "grad_norm": 0.8152669668197632, + "learning_rate": 9.957219591844108e-06, + "loss": 0.7763, + "step": 1610 + }, + { + "epoch": 0.08866751059496945, + "grad_norm": 0.979752779006958, + "learning_rate": 9.957162991363871e-06, + "loss": 0.7755, + "step": 1611 + }, + { + "epoch": 0.08872254939732512, + "grad_norm": 1.0481054782867432, + "learning_rate": 9.957106353626926e-06, + "loss": 0.9395, + "step": 1612 + }, + { + "epoch": 0.08877758819968078, + "grad_norm": 0.7773686647415161, + "learning_rate": 9.957049678633697e-06, + "loss": 0.7713, + "step": 1613 + }, + { + "epoch": 0.08883262700203644, + "grad_norm": 0.838979959487915, + "learning_rate": 9.956992966384609e-06, + "loss": 0.7909, + "step": 1614 + }, + { + "epoch": 0.0888876658043921, + "grad_norm": 0.9527049660682678, + "learning_rate": 9.956936216880089e-06, + "loss": 0.7944, + "step": 1615 + }, + { + "epoch": 0.08894270460674776, + "grad_norm": 0.7967305183410645, + "learning_rate": 9.956879430120561e-06, + "loss": 0.7703, + "step": 1616 + }, + { + "epoch": 0.08899774340910342, + "grad_norm": 0.9065802097320557, + "learning_rate": 9.956822606106456e-06, + "loss": 0.8188, + "step": 1617 + }, + { + "epoch": 0.08905278221145908, + "grad_norm": 0.7329322099685669, + "learning_rate": 9.956765744838199e-06, + "loss": 0.8043, + "step": 1618 + }, + { + "epoch": 0.08910782101381474, + "grad_norm": 0.864973247051239, + "learning_rate": 9.95670884631622e-06, + "loss": 0.8334, + "step": 1619 + }, + { + "epoch": 0.0891628598161704, + "grad_norm": 1.073559045791626, + "learning_rate": 9.95665191054094e-06, + "loss": 0.7755, + "step": 1620 + }, + { + "epoch": 0.08921789861852607, + "grad_norm": 0.7347918748855591, + "learning_rate": 9.956594937512794e-06, + "loss": 0.7556, + "step": 1621 + }, + { + "epoch": 0.08927293742088172, + "grad_norm": 0.8756610751152039, + "learning_rate": 9.956537927232205e-06, + "loss": 0.8129, + "step": 1622 + }, + { + "epoch": 0.08932797622323738, + "grad_norm": 0.9132435917854309, + "learning_rate": 9.956480879699605e-06, + "loss": 0.8221, + "step": 1623 + }, + { + "epoch": 0.08938301502559304, + "grad_norm": 1.1978256702423096, + "learning_rate": 9.956423794915421e-06, + "loss": 0.8651, + "step": 1624 + }, + { + "epoch": 0.0894380538279487, + "grad_norm": 0.8493894934654236, + "learning_rate": 9.956366672880082e-06, + "loss": 0.7267, + "step": 1625 + }, + { + "epoch": 0.08949309263030436, + "grad_norm": 1.0971951484680176, + "learning_rate": 9.956309513594019e-06, + "loss": 0.7852, + "step": 1626 + }, + { + "epoch": 0.08954813143266002, + "grad_norm": 0.899974524974823, + "learning_rate": 9.95625231705766e-06, + "loss": 0.8868, + "step": 1627 + }, + { + "epoch": 0.08960317023501568, + "grad_norm": 0.8995566368103027, + "learning_rate": 9.956195083271436e-06, + "loss": 0.87, + "step": 1628 + }, + { + "epoch": 0.08965820903737134, + "grad_norm": 0.8924218416213989, + "learning_rate": 9.956137812235776e-06, + "loss": 0.7885, + "step": 1629 + }, + { + "epoch": 0.089713247839727, + "grad_norm": 0.9232820868492126, + "learning_rate": 9.956080503951108e-06, + "loss": 0.7923, + "step": 1630 + }, + { + "epoch": 0.08976828664208267, + "grad_norm": 0.9298982620239258, + "learning_rate": 9.956023158417869e-06, + "loss": 0.8625, + "step": 1631 + }, + { + "epoch": 0.08982332544443833, + "grad_norm": 0.86515212059021, + "learning_rate": 9.955965775636488e-06, + "loss": 0.7683, + "step": 1632 + }, + { + "epoch": 0.08987836424679399, + "grad_norm": 0.8016952276229858, + "learning_rate": 9.955908355607392e-06, + "loss": 0.8122, + "step": 1633 + }, + { + "epoch": 0.08993340304914965, + "grad_norm": 0.842703640460968, + "learning_rate": 9.955850898331015e-06, + "loss": 0.8487, + "step": 1634 + }, + { + "epoch": 0.08998844185150531, + "grad_norm": 0.8239083886146545, + "learning_rate": 9.95579340380779e-06, + "loss": 0.8701, + "step": 1635 + }, + { + "epoch": 0.09004348065386097, + "grad_norm": 0.8575418591499329, + "learning_rate": 9.955735872038149e-06, + "loss": 0.8263, + "step": 1636 + }, + { + "epoch": 0.09009851945621664, + "grad_norm": 0.8884586095809937, + "learning_rate": 9.955678303022522e-06, + "loss": 0.8112, + "step": 1637 + }, + { + "epoch": 0.0901535582585723, + "grad_norm": 0.9024681448936462, + "learning_rate": 9.955620696761345e-06, + "loss": 0.9174, + "step": 1638 + }, + { + "epoch": 0.09020859706092796, + "grad_norm": 0.8151944875717163, + "learning_rate": 9.955563053255049e-06, + "loss": 0.806, + "step": 1639 + }, + { + "epoch": 0.09026363586328362, + "grad_norm": 0.8292184472084045, + "learning_rate": 9.955505372504069e-06, + "loss": 0.8007, + "step": 1640 + }, + { + "epoch": 0.09031867466563928, + "grad_norm": 0.9445936679840088, + "learning_rate": 9.955447654508835e-06, + "loss": 0.7089, + "step": 1641 + }, + { + "epoch": 0.09037371346799494, + "grad_norm": 0.781579315662384, + "learning_rate": 9.955389899269782e-06, + "loss": 0.8224, + "step": 1642 + }, + { + "epoch": 0.09042875227035059, + "grad_norm": 0.9028880596160889, + "learning_rate": 9.955332106787348e-06, + "loss": 0.7976, + "step": 1643 + }, + { + "epoch": 0.09048379107270625, + "grad_norm": 1.0336887836456299, + "learning_rate": 9.955274277061963e-06, + "loss": 0.9296, + "step": 1644 + }, + { + "epoch": 0.09053882987506191, + "grad_norm": 0.8894197940826416, + "learning_rate": 9.955216410094062e-06, + "loss": 0.815, + "step": 1645 + }, + { + "epoch": 0.09059386867741757, + "grad_norm": 0.8955528140068054, + "learning_rate": 9.955158505884083e-06, + "loss": 0.8707, + "step": 1646 + }, + { + "epoch": 0.09064890747977324, + "grad_norm": 0.8012683987617493, + "learning_rate": 9.955100564432458e-06, + "loss": 0.7467, + "step": 1647 + }, + { + "epoch": 0.0907039462821289, + "grad_norm": 0.917969286441803, + "learning_rate": 9.955042585739623e-06, + "loss": 0.8835, + "step": 1648 + }, + { + "epoch": 0.09075898508448456, + "grad_norm": 0.8066666722297668, + "learning_rate": 9.954984569806014e-06, + "loss": 0.8338, + "step": 1649 + }, + { + "epoch": 0.09081402388684022, + "grad_norm": 1.1324070692062378, + "learning_rate": 9.954926516632069e-06, + "loss": 0.8245, + "step": 1650 + }, + { + "epoch": 0.09086906268919588, + "grad_norm": 0.8196014761924744, + "learning_rate": 9.954868426218222e-06, + "loss": 0.7897, + "step": 1651 + }, + { + "epoch": 0.09092410149155154, + "grad_norm": 0.8713478446006775, + "learning_rate": 9.95481029856491e-06, + "loss": 0.891, + "step": 1652 + }, + { + "epoch": 0.0909791402939072, + "grad_norm": 0.8489059805870056, + "learning_rate": 9.954752133672569e-06, + "loss": 0.7748, + "step": 1653 + }, + { + "epoch": 0.09103417909626287, + "grad_norm": 0.8914602994918823, + "learning_rate": 9.954693931541638e-06, + "loss": 0.8657, + "step": 1654 + }, + { + "epoch": 0.09108921789861853, + "grad_norm": 0.9031614661216736, + "learning_rate": 9.954635692172555e-06, + "loss": 0.7409, + "step": 1655 + }, + { + "epoch": 0.09114425670097419, + "grad_norm": 0.8680000305175781, + "learning_rate": 9.954577415565756e-06, + "loss": 0.8535, + "step": 1656 + }, + { + "epoch": 0.09119929550332985, + "grad_norm": 0.830596923828125, + "learning_rate": 9.954519101721679e-06, + "loss": 0.8601, + "step": 1657 + }, + { + "epoch": 0.09125433430568551, + "grad_norm": 0.9041332602500916, + "learning_rate": 9.954460750640762e-06, + "loss": 0.9104, + "step": 1658 + }, + { + "epoch": 0.09130937310804117, + "grad_norm": 0.7786296606063843, + "learning_rate": 9.954402362323445e-06, + "loss": 0.7671, + "step": 1659 + }, + { + "epoch": 0.09136441191039683, + "grad_norm": 1.0363564491271973, + "learning_rate": 9.954343936770165e-06, + "loss": 0.9339, + "step": 1660 + }, + { + "epoch": 0.0914194507127525, + "grad_norm": 0.8049986958503723, + "learning_rate": 9.954285473981363e-06, + "loss": 0.8125, + "step": 1661 + }, + { + "epoch": 0.09147448951510816, + "grad_norm": 0.7842011451721191, + "learning_rate": 9.954226973957477e-06, + "loss": 0.7153, + "step": 1662 + }, + { + "epoch": 0.09152952831746382, + "grad_norm": 0.8929729461669922, + "learning_rate": 9.954168436698948e-06, + "loss": 0.9563, + "step": 1663 + }, + { + "epoch": 0.09158456711981948, + "grad_norm": 0.8850226402282715, + "learning_rate": 9.954109862206216e-06, + "loss": 0.8257, + "step": 1664 + }, + { + "epoch": 0.09163960592217513, + "grad_norm": 0.8673348426818848, + "learning_rate": 9.954051250479719e-06, + "loss": 0.9489, + "step": 1665 + }, + { + "epoch": 0.09169464472453079, + "grad_norm": 0.8726119995117188, + "learning_rate": 9.9539926015199e-06, + "loss": 0.8222, + "step": 1666 + }, + { + "epoch": 0.09174968352688645, + "grad_norm": 0.7609312534332275, + "learning_rate": 9.953933915327196e-06, + "loss": 0.7749, + "step": 1667 + }, + { + "epoch": 0.09180472232924211, + "grad_norm": 0.857404887676239, + "learning_rate": 9.953875191902055e-06, + "loss": 0.8496, + "step": 1668 + }, + { + "epoch": 0.09185976113159777, + "grad_norm": 0.7835526466369629, + "learning_rate": 9.953816431244909e-06, + "loss": 0.7258, + "step": 1669 + }, + { + "epoch": 0.09191479993395343, + "grad_norm": 0.944984495639801, + "learning_rate": 9.95375763335621e-06, + "loss": 0.902, + "step": 1670 + }, + { + "epoch": 0.0919698387363091, + "grad_norm": 0.9038936495780945, + "learning_rate": 9.953698798236391e-06, + "loss": 0.7559, + "step": 1671 + }, + { + "epoch": 0.09202487753866476, + "grad_norm": 0.8450848460197449, + "learning_rate": 9.953639925885898e-06, + "loss": 0.8338, + "step": 1672 + }, + { + "epoch": 0.09207991634102042, + "grad_norm": 0.827419102191925, + "learning_rate": 9.953581016305175e-06, + "loss": 0.8167, + "step": 1673 + }, + { + "epoch": 0.09213495514337608, + "grad_norm": 0.8517075777053833, + "learning_rate": 9.953522069494663e-06, + "loss": 0.8681, + "step": 1674 + }, + { + "epoch": 0.09218999394573174, + "grad_norm": 0.9504323601722717, + "learning_rate": 9.953463085454804e-06, + "loss": 0.8688, + "step": 1675 + }, + { + "epoch": 0.0922450327480874, + "grad_norm": 0.8905719518661499, + "learning_rate": 9.953404064186044e-06, + "loss": 0.8818, + "step": 1676 + }, + { + "epoch": 0.09230007155044306, + "grad_norm": 0.9223340153694153, + "learning_rate": 9.953345005688822e-06, + "loss": 0.8752, + "step": 1677 + }, + { + "epoch": 0.09235511035279872, + "grad_norm": 1.0500547885894775, + "learning_rate": 9.953285909963588e-06, + "loss": 0.7816, + "step": 1678 + }, + { + "epoch": 0.09241014915515439, + "grad_norm": 0.8407441973686218, + "learning_rate": 9.953226777010781e-06, + "loss": 0.745, + "step": 1679 + }, + { + "epoch": 0.09246518795751005, + "grad_norm": 0.7997288107872009, + "learning_rate": 9.953167606830847e-06, + "loss": 0.8171, + "step": 1680 + }, + { + "epoch": 0.09252022675986571, + "grad_norm": 0.9752318859100342, + "learning_rate": 9.953108399424234e-06, + "loss": 0.8719, + "step": 1681 + }, + { + "epoch": 0.09257526556222137, + "grad_norm": 0.8524298667907715, + "learning_rate": 9.953049154791382e-06, + "loss": 0.8257, + "step": 1682 + }, + { + "epoch": 0.09263030436457703, + "grad_norm": 0.9460529088973999, + "learning_rate": 9.952989872932739e-06, + "loss": 0.7278, + "step": 1683 + }, + { + "epoch": 0.0926853431669327, + "grad_norm": 0.8959575891494751, + "learning_rate": 9.95293055384875e-06, + "loss": 0.903, + "step": 1684 + }, + { + "epoch": 0.09274038196928835, + "grad_norm": 0.8764386177062988, + "learning_rate": 9.95287119753986e-06, + "loss": 0.7958, + "step": 1685 + }, + { + "epoch": 0.092795420771644, + "grad_norm": 0.9611337184906006, + "learning_rate": 9.952811804006517e-06, + "loss": 0.8726, + "step": 1686 + }, + { + "epoch": 0.09285045957399966, + "grad_norm": 0.8155574202537537, + "learning_rate": 9.952752373249165e-06, + "loss": 0.7882, + "step": 1687 + }, + { + "epoch": 0.09290549837635532, + "grad_norm": 0.8789697289466858, + "learning_rate": 9.952692905268253e-06, + "loss": 0.8642, + "step": 1688 + }, + { + "epoch": 0.09296053717871099, + "grad_norm": 0.7910027503967285, + "learning_rate": 9.952633400064227e-06, + "loss": 0.7852, + "step": 1689 + }, + { + "epoch": 0.09301557598106665, + "grad_norm": 0.815819501876831, + "learning_rate": 9.952573857637533e-06, + "loss": 0.8606, + "step": 1690 + }, + { + "epoch": 0.09307061478342231, + "grad_norm": 0.9840701818466187, + "learning_rate": 9.95251427798862e-06, + "loss": 0.9349, + "step": 1691 + }, + { + "epoch": 0.09312565358577797, + "grad_norm": 0.8715788722038269, + "learning_rate": 9.952454661117936e-06, + "loss": 0.813, + "step": 1692 + }, + { + "epoch": 0.09318069238813363, + "grad_norm": 0.8287779092788696, + "learning_rate": 9.952395007025926e-06, + "loss": 0.8346, + "step": 1693 + }, + { + "epoch": 0.0932357311904893, + "grad_norm": 0.9375059008598328, + "learning_rate": 9.952335315713044e-06, + "loss": 0.8868, + "step": 1694 + }, + { + "epoch": 0.09329076999284495, + "grad_norm": 0.9063667058944702, + "learning_rate": 9.952275587179734e-06, + "loss": 0.9562, + "step": 1695 + }, + { + "epoch": 0.09334580879520062, + "grad_norm": 0.816643476486206, + "learning_rate": 9.952215821426447e-06, + "loss": 0.7456, + "step": 1696 + }, + { + "epoch": 0.09340084759755628, + "grad_norm": 0.9004347324371338, + "learning_rate": 9.95215601845363e-06, + "loss": 0.8545, + "step": 1697 + }, + { + "epoch": 0.09345588639991194, + "grad_norm": 0.919195830821991, + "learning_rate": 9.952096178261736e-06, + "loss": 0.9347, + "step": 1698 + }, + { + "epoch": 0.0935109252022676, + "grad_norm": 0.8313261866569519, + "learning_rate": 9.952036300851211e-06, + "loss": 0.9169, + "step": 1699 + }, + { + "epoch": 0.09356596400462326, + "grad_norm": 0.8674910664558411, + "learning_rate": 9.951976386222507e-06, + "loss": 0.7621, + "step": 1700 + }, + { + "epoch": 0.09362100280697892, + "grad_norm": 0.8931052684783936, + "learning_rate": 9.951916434376074e-06, + "loss": 0.8702, + "step": 1701 + }, + { + "epoch": 0.09367604160933458, + "grad_norm": 0.8748393058776855, + "learning_rate": 9.951856445312364e-06, + "loss": 0.7446, + "step": 1702 + }, + { + "epoch": 0.09373108041169025, + "grad_norm": 1.005459189414978, + "learning_rate": 9.951796419031825e-06, + "loss": 0.9843, + "step": 1703 + }, + { + "epoch": 0.09378611921404591, + "grad_norm": 1.0155184268951416, + "learning_rate": 9.95173635553491e-06, + "loss": 0.8868, + "step": 1704 + }, + { + "epoch": 0.09384115801640157, + "grad_norm": 2.1387271881103516, + "learning_rate": 9.951676254822072e-06, + "loss": 0.8691, + "step": 1705 + }, + { + "epoch": 0.09389619681875723, + "grad_norm": 0.9768403768539429, + "learning_rate": 9.951616116893757e-06, + "loss": 0.8409, + "step": 1706 + }, + { + "epoch": 0.09395123562111289, + "grad_norm": 0.7994607090950012, + "learning_rate": 9.951555941750424e-06, + "loss": 0.7836, + "step": 1707 + }, + { + "epoch": 0.09400627442346854, + "grad_norm": 0.8460201025009155, + "learning_rate": 9.95149572939252e-06, + "loss": 0.8216, + "step": 1708 + }, + { + "epoch": 0.0940613132258242, + "grad_norm": 0.8904135227203369, + "learning_rate": 9.951435479820499e-06, + "loss": 0.9053, + "step": 1709 + }, + { + "epoch": 0.09411635202817986, + "grad_norm": 0.9084494113922119, + "learning_rate": 9.951375193034815e-06, + "loss": 0.9308, + "step": 1710 + }, + { + "epoch": 0.09417139083053552, + "grad_norm": 1.0826482772827148, + "learning_rate": 9.951314869035921e-06, + "loss": 0.8468, + "step": 1711 + }, + { + "epoch": 0.09422642963289118, + "grad_norm": 0.8068915009498596, + "learning_rate": 9.95125450782427e-06, + "loss": 0.8253, + "step": 1712 + }, + { + "epoch": 0.09428146843524685, + "grad_norm": 0.8445400595664978, + "learning_rate": 9.951194109400316e-06, + "loss": 0.8386, + "step": 1713 + }, + { + "epoch": 0.09433650723760251, + "grad_norm": 0.8180645704269409, + "learning_rate": 9.951133673764513e-06, + "loss": 0.7907, + "step": 1714 + }, + { + "epoch": 0.09439154603995817, + "grad_norm": 0.8111036419868469, + "learning_rate": 9.951073200917311e-06, + "loss": 0.7918, + "step": 1715 + }, + { + "epoch": 0.09444658484231383, + "grad_norm": 0.862042248249054, + "learning_rate": 9.951012690859172e-06, + "loss": 0.783, + "step": 1716 + }, + { + "epoch": 0.09450162364466949, + "grad_norm": 0.8189615607261658, + "learning_rate": 9.950952143590544e-06, + "loss": 0.8192, + "step": 1717 + }, + { + "epoch": 0.09455666244702515, + "grad_norm": 0.9714062809944153, + "learning_rate": 9.950891559111887e-06, + "loss": 0.774, + "step": 1718 + }, + { + "epoch": 0.09461170124938081, + "grad_norm": 0.9691846370697021, + "learning_rate": 9.950830937423655e-06, + "loss": 0.8347, + "step": 1719 + }, + { + "epoch": 0.09466674005173648, + "grad_norm": 0.8488250970840454, + "learning_rate": 9.950770278526301e-06, + "loss": 0.8228, + "step": 1720 + }, + { + "epoch": 0.09472177885409214, + "grad_norm": 0.8638359904289246, + "learning_rate": 9.950709582420282e-06, + "loss": 0.8973, + "step": 1721 + }, + { + "epoch": 0.0947768176564478, + "grad_norm": 1.0148643255233765, + "learning_rate": 9.950648849106058e-06, + "loss": 0.9638, + "step": 1722 + }, + { + "epoch": 0.09483185645880346, + "grad_norm": 0.8870131969451904, + "learning_rate": 9.95058807858408e-06, + "loss": 0.8259, + "step": 1723 + }, + { + "epoch": 0.09488689526115912, + "grad_norm": 0.9134769439697266, + "learning_rate": 9.950527270854807e-06, + "loss": 0.865, + "step": 1724 + }, + { + "epoch": 0.09494193406351478, + "grad_norm": 0.7221654653549194, + "learning_rate": 9.950466425918697e-06, + "loss": 0.7593, + "step": 1725 + }, + { + "epoch": 0.09499697286587044, + "grad_norm": 0.9386674165725708, + "learning_rate": 9.950405543776207e-06, + "loss": 0.9508, + "step": 1726 + }, + { + "epoch": 0.0950520116682261, + "grad_norm": 0.7850627899169922, + "learning_rate": 9.950344624427795e-06, + "loss": 0.7999, + "step": 1727 + }, + { + "epoch": 0.09510705047058177, + "grad_norm": 0.921198308467865, + "learning_rate": 9.950283667873916e-06, + "loss": 0.8249, + "step": 1728 + }, + { + "epoch": 0.09516208927293741, + "grad_norm": 0.9503389000892639, + "learning_rate": 9.95022267411503e-06, + "loss": 0.901, + "step": 1729 + }, + { + "epoch": 0.09521712807529308, + "grad_norm": 0.7977343201637268, + "learning_rate": 9.950161643151597e-06, + "loss": 0.838, + "step": 1730 + }, + { + "epoch": 0.09527216687764874, + "grad_norm": 0.9056238532066345, + "learning_rate": 9.950100574984072e-06, + "loss": 0.9756, + "step": 1731 + }, + { + "epoch": 0.0953272056800044, + "grad_norm": 0.8092935681343079, + "learning_rate": 9.950039469612918e-06, + "loss": 0.8812, + "step": 1732 + }, + { + "epoch": 0.09538224448236006, + "grad_norm": 0.823693573474884, + "learning_rate": 9.949978327038592e-06, + "loss": 0.7914, + "step": 1733 + }, + { + "epoch": 0.09543728328471572, + "grad_norm": 0.9114876389503479, + "learning_rate": 9.949917147261554e-06, + "loss": 0.7944, + "step": 1734 + }, + { + "epoch": 0.09549232208707138, + "grad_norm": 1.0084123611450195, + "learning_rate": 9.949855930282262e-06, + "loss": 0.8544, + "step": 1735 + }, + { + "epoch": 0.09554736088942704, + "grad_norm": 0.842462956905365, + "learning_rate": 9.949794676101181e-06, + "loss": 0.7056, + "step": 1736 + }, + { + "epoch": 0.0956023996917827, + "grad_norm": 1.00497305393219, + "learning_rate": 9.949733384718766e-06, + "loss": 0.8372, + "step": 1737 + }, + { + "epoch": 0.09565743849413837, + "grad_norm": 1.0166410207748413, + "learning_rate": 9.94967205613548e-06, + "loss": 0.9316, + "step": 1738 + }, + { + "epoch": 0.09571247729649403, + "grad_norm": 0.8520192503929138, + "learning_rate": 9.949610690351784e-06, + "loss": 0.786, + "step": 1739 + }, + { + "epoch": 0.09576751609884969, + "grad_norm": 0.8003227114677429, + "learning_rate": 9.949549287368139e-06, + "loss": 0.8003, + "step": 1740 + }, + { + "epoch": 0.09582255490120535, + "grad_norm": 0.8657151460647583, + "learning_rate": 9.949487847185006e-06, + "loss": 0.8407, + "step": 1741 + }, + { + "epoch": 0.09587759370356101, + "grad_norm": 1.1119858026504517, + "learning_rate": 9.949426369802848e-06, + "loss": 0.8594, + "step": 1742 + }, + { + "epoch": 0.09593263250591667, + "grad_norm": 0.8968474864959717, + "learning_rate": 9.949364855222126e-06, + "loss": 0.8254, + "step": 1743 + }, + { + "epoch": 0.09598767130827233, + "grad_norm": 0.8740531206130981, + "learning_rate": 9.949303303443304e-06, + "loss": 0.8748, + "step": 1744 + }, + { + "epoch": 0.096042710110628, + "grad_norm": 0.8833459615707397, + "learning_rate": 9.94924171446684e-06, + "loss": 0.838, + "step": 1745 + }, + { + "epoch": 0.09609774891298366, + "grad_norm": 0.8783486485481262, + "learning_rate": 9.949180088293201e-06, + "loss": 0.7972, + "step": 1746 + }, + { + "epoch": 0.09615278771533932, + "grad_norm": 0.9197877049446106, + "learning_rate": 9.949118424922852e-06, + "loss": 0.8669, + "step": 1747 + }, + { + "epoch": 0.09620782651769498, + "grad_norm": 0.9771283864974976, + "learning_rate": 9.949056724356251e-06, + "loss": 0.8461, + "step": 1748 + }, + { + "epoch": 0.09626286532005064, + "grad_norm": 0.8325022459030151, + "learning_rate": 9.948994986593864e-06, + "loss": 0.8482, + "step": 1749 + }, + { + "epoch": 0.0963179041224063, + "grad_norm": 0.9732363224029541, + "learning_rate": 9.948933211636158e-06, + "loss": 0.8825, + "step": 1750 + }, + { + "epoch": 0.09637294292476195, + "grad_norm": 0.8229798078536987, + "learning_rate": 9.948871399483592e-06, + "loss": 0.8079, + "step": 1751 + }, + { + "epoch": 0.09642798172711761, + "grad_norm": 0.8861554265022278, + "learning_rate": 9.948809550136635e-06, + "loss": 0.8323, + "step": 1752 + }, + { + "epoch": 0.09648302052947327, + "grad_norm": 1.0618904829025269, + "learning_rate": 9.94874766359575e-06, + "loss": 0.8519, + "step": 1753 + }, + { + "epoch": 0.09653805933182893, + "grad_norm": 0.8494864702224731, + "learning_rate": 9.948685739861403e-06, + "loss": 0.961, + "step": 1754 + }, + { + "epoch": 0.0965930981341846, + "grad_norm": 0.8872213959693909, + "learning_rate": 9.948623778934058e-06, + "loss": 0.9367, + "step": 1755 + }, + { + "epoch": 0.09664813693654026, + "grad_norm": 0.8441230058670044, + "learning_rate": 9.948561780814181e-06, + "loss": 0.7654, + "step": 1756 + }, + { + "epoch": 0.09670317573889592, + "grad_norm": 0.8072223663330078, + "learning_rate": 9.948499745502239e-06, + "loss": 0.7894, + "step": 1757 + }, + { + "epoch": 0.09675821454125158, + "grad_norm": 0.8285261392593384, + "learning_rate": 9.948437672998696e-06, + "loss": 0.8351, + "step": 1758 + }, + { + "epoch": 0.09681325334360724, + "grad_norm": 0.9272124767303467, + "learning_rate": 9.94837556330402e-06, + "loss": 0.8708, + "step": 1759 + }, + { + "epoch": 0.0968682921459629, + "grad_norm": 0.8689375519752502, + "learning_rate": 9.94831341641868e-06, + "loss": 0.8478, + "step": 1760 + }, + { + "epoch": 0.09692333094831856, + "grad_norm": 1.040784239768982, + "learning_rate": 9.94825123234314e-06, + "loss": 0.8915, + "step": 1761 + }, + { + "epoch": 0.09697836975067423, + "grad_norm": 0.7819718718528748, + "learning_rate": 9.948189011077867e-06, + "loss": 0.7728, + "step": 1762 + }, + { + "epoch": 0.09703340855302989, + "grad_norm": 0.7959379553794861, + "learning_rate": 9.948126752623331e-06, + "loss": 0.8248, + "step": 1763 + }, + { + "epoch": 0.09708844735538555, + "grad_norm": 0.8844753503799438, + "learning_rate": 9.94806445698e-06, + "loss": 0.7742, + "step": 1764 + }, + { + "epoch": 0.09714348615774121, + "grad_norm": 0.9168505668640137, + "learning_rate": 9.948002124148339e-06, + "loss": 0.9145, + "step": 1765 + }, + { + "epoch": 0.09719852496009687, + "grad_norm": 0.7199662923812866, + "learning_rate": 9.947939754128819e-06, + "loss": 0.6652, + "step": 1766 + }, + { + "epoch": 0.09725356376245253, + "grad_norm": 0.866470992565155, + "learning_rate": 9.947877346921909e-06, + "loss": 0.8293, + "step": 1767 + }, + { + "epoch": 0.0973086025648082, + "grad_norm": 0.9124754667282104, + "learning_rate": 9.947814902528078e-06, + "loss": 0.8599, + "step": 1768 + }, + { + "epoch": 0.09736364136716386, + "grad_norm": 0.9169870615005493, + "learning_rate": 9.947752420947792e-06, + "loss": 0.8382, + "step": 1769 + }, + { + "epoch": 0.09741868016951952, + "grad_norm": 1.0147640705108643, + "learning_rate": 9.947689902181526e-06, + "loss": 0.8425, + "step": 1770 + }, + { + "epoch": 0.09747371897187518, + "grad_norm": 0.778575599193573, + "learning_rate": 9.947627346229745e-06, + "loss": 0.6979, + "step": 1771 + }, + { + "epoch": 0.09752875777423083, + "grad_norm": 0.815101146697998, + "learning_rate": 9.947564753092922e-06, + "loss": 0.8617, + "step": 1772 + }, + { + "epoch": 0.09758379657658649, + "grad_norm": 0.9556358456611633, + "learning_rate": 9.947502122771527e-06, + "loss": 0.9009, + "step": 1773 + }, + { + "epoch": 0.09763883537894215, + "grad_norm": 0.8603761196136475, + "learning_rate": 9.94743945526603e-06, + "loss": 0.9443, + "step": 1774 + }, + { + "epoch": 0.09769387418129781, + "grad_norm": 0.8621761798858643, + "learning_rate": 9.947376750576903e-06, + "loss": 0.7537, + "step": 1775 + }, + { + "epoch": 0.09774891298365347, + "grad_norm": 0.7399948835372925, + "learning_rate": 9.947314008704616e-06, + "loss": 0.7477, + "step": 1776 + }, + { + "epoch": 0.09780395178600913, + "grad_norm": 0.8855582475662231, + "learning_rate": 9.947251229649641e-06, + "loss": 0.8745, + "step": 1777 + }, + { + "epoch": 0.0978589905883648, + "grad_norm": 0.8718472719192505, + "learning_rate": 9.947188413412452e-06, + "loss": 0.9672, + "step": 1778 + }, + { + "epoch": 0.09791402939072046, + "grad_norm": 0.8598514795303345, + "learning_rate": 9.947125559993517e-06, + "loss": 0.8278, + "step": 1779 + }, + { + "epoch": 0.09796906819307612, + "grad_norm": 1.0373798608779907, + "learning_rate": 9.947062669393312e-06, + "loss": 0.8123, + "step": 1780 + }, + { + "epoch": 0.09802410699543178, + "grad_norm": 1.0198705196380615, + "learning_rate": 9.946999741612306e-06, + "loss": 0.9039, + "step": 1781 + }, + { + "epoch": 0.09807914579778744, + "grad_norm": 0.8770025968551636, + "learning_rate": 9.946936776650977e-06, + "loss": 0.8326, + "step": 1782 + }, + { + "epoch": 0.0981341846001431, + "grad_norm": 0.7970215678215027, + "learning_rate": 9.946873774509794e-06, + "loss": 0.848, + "step": 1783 + }, + { + "epoch": 0.09818922340249876, + "grad_norm": 0.90342777967453, + "learning_rate": 9.946810735189231e-06, + "loss": 0.7993, + "step": 1784 + }, + { + "epoch": 0.09824426220485442, + "grad_norm": 1.2095681428909302, + "learning_rate": 9.946747658689763e-06, + "loss": 0.8544, + "step": 1785 + }, + { + "epoch": 0.09829930100721009, + "grad_norm": 0.8500953316688538, + "learning_rate": 9.946684545011866e-06, + "loss": 0.8398, + "step": 1786 + }, + { + "epoch": 0.09835433980956575, + "grad_norm": 0.8570724725723267, + "learning_rate": 9.946621394156011e-06, + "loss": 0.9255, + "step": 1787 + }, + { + "epoch": 0.09840937861192141, + "grad_norm": 0.8314846158027649, + "learning_rate": 9.946558206122672e-06, + "loss": 0.8398, + "step": 1788 + }, + { + "epoch": 0.09846441741427707, + "grad_norm": 0.8894716501235962, + "learning_rate": 9.946494980912326e-06, + "loss": 0.8612, + "step": 1789 + }, + { + "epoch": 0.09851945621663273, + "grad_norm": 0.9555756449699402, + "learning_rate": 9.94643171852545e-06, + "loss": 0.9551, + "step": 1790 + }, + { + "epoch": 0.09857449501898839, + "grad_norm": 0.9556692838668823, + "learning_rate": 9.946368418962515e-06, + "loss": 0.8175, + "step": 1791 + }, + { + "epoch": 0.09862953382134405, + "grad_norm": 0.7288535833358765, + "learning_rate": 9.946305082224e-06, + "loss": 0.6162, + "step": 1792 + }, + { + "epoch": 0.09868457262369972, + "grad_norm": 0.95478355884552, + "learning_rate": 9.94624170831038e-06, + "loss": 0.9089, + "step": 1793 + }, + { + "epoch": 0.09873961142605536, + "grad_norm": 0.9080137610435486, + "learning_rate": 9.946178297222133e-06, + "loss": 0.9443, + "step": 1794 + }, + { + "epoch": 0.09879465022841102, + "grad_norm": 0.8060124516487122, + "learning_rate": 9.946114848959732e-06, + "loss": 0.7412, + "step": 1795 + }, + { + "epoch": 0.09884968903076669, + "grad_norm": 0.8487932085990906, + "learning_rate": 9.946051363523655e-06, + "loss": 0.7098, + "step": 1796 + }, + { + "epoch": 0.09890472783312235, + "grad_norm": 0.8982037901878357, + "learning_rate": 9.945987840914381e-06, + "loss": 0.8304, + "step": 1797 + }, + { + "epoch": 0.09895976663547801, + "grad_norm": 0.8124602437019348, + "learning_rate": 9.945924281132386e-06, + "loss": 0.8441, + "step": 1798 + }, + { + "epoch": 0.09901480543783367, + "grad_norm": 0.8081663250923157, + "learning_rate": 9.945860684178147e-06, + "loss": 0.732, + "step": 1799 + }, + { + "epoch": 0.09906984424018933, + "grad_norm": 0.7662907242774963, + "learning_rate": 9.945797050052147e-06, + "loss": 0.7538, + "step": 1800 + }, + { + "epoch": 0.09912488304254499, + "grad_norm": 0.8418399095535278, + "learning_rate": 9.945733378754856e-06, + "loss": 0.8488, + "step": 1801 + }, + { + "epoch": 0.09917992184490065, + "grad_norm": 0.7298988699913025, + "learning_rate": 9.94566967028676e-06, + "loss": 0.7822, + "step": 1802 + }, + { + "epoch": 0.09923496064725632, + "grad_norm": 0.7788695693016052, + "learning_rate": 9.945605924648332e-06, + "loss": 0.8037, + "step": 1803 + }, + { + "epoch": 0.09928999944961198, + "grad_norm": 0.939297080039978, + "learning_rate": 9.945542141840054e-06, + "loss": 0.8654, + "step": 1804 + }, + { + "epoch": 0.09934503825196764, + "grad_norm": 0.9274358749389648, + "learning_rate": 9.945478321862406e-06, + "loss": 0.7712, + "step": 1805 + }, + { + "epoch": 0.0994000770543233, + "grad_norm": 0.816561222076416, + "learning_rate": 9.945414464715866e-06, + "loss": 0.7676, + "step": 1806 + }, + { + "epoch": 0.09945511585667896, + "grad_norm": 0.867915153503418, + "learning_rate": 9.945350570400916e-06, + "loss": 0.8343, + "step": 1807 + }, + { + "epoch": 0.09951015465903462, + "grad_norm": 0.8446162939071655, + "learning_rate": 9.945286638918034e-06, + "loss": 0.8128, + "step": 1808 + }, + { + "epoch": 0.09956519346139028, + "grad_norm": 0.8372986316680908, + "learning_rate": 9.945222670267703e-06, + "loss": 0.8611, + "step": 1809 + }, + { + "epoch": 0.09962023226374594, + "grad_norm": 0.787836492061615, + "learning_rate": 9.945158664450399e-06, + "loss": 0.7286, + "step": 1810 + }, + { + "epoch": 0.0996752710661016, + "grad_norm": 0.9293436408042908, + "learning_rate": 9.945094621466609e-06, + "loss": 0.8699, + "step": 1811 + }, + { + "epoch": 0.09973030986845727, + "grad_norm": 0.8336932063102722, + "learning_rate": 9.94503054131681e-06, + "loss": 0.8222, + "step": 1812 + }, + { + "epoch": 0.09978534867081293, + "grad_norm": 0.8310953378677368, + "learning_rate": 9.944966424001486e-06, + "loss": 0.8131, + "step": 1813 + }, + { + "epoch": 0.09984038747316859, + "grad_norm": 0.7703443169593811, + "learning_rate": 9.944902269521117e-06, + "loss": 0.8135, + "step": 1814 + }, + { + "epoch": 0.09989542627552424, + "grad_norm": 0.750990092754364, + "learning_rate": 9.944838077876186e-06, + "loss": 0.8137, + "step": 1815 + }, + { + "epoch": 0.0999504650778799, + "grad_norm": 0.8502481579780579, + "learning_rate": 9.944773849067178e-06, + "loss": 0.8973, + "step": 1816 + }, + { + "epoch": 0.10000550388023556, + "grad_norm": 0.8299791812896729, + "learning_rate": 9.94470958309457e-06, + "loss": 0.8341, + "step": 1817 + }, + { + "epoch": 0.10006054268259122, + "grad_norm": 0.8519022464752197, + "learning_rate": 9.94464527995885e-06, + "loss": 0.8529, + "step": 1818 + }, + { + "epoch": 0.10011558148494688, + "grad_norm": 0.9318063259124756, + "learning_rate": 9.944580939660501e-06, + "loss": 0.8978, + "step": 1819 + }, + { + "epoch": 0.10017062028730254, + "grad_norm": 0.847023069858551, + "learning_rate": 9.944516562200004e-06, + "loss": 0.8007, + "step": 1820 + }, + { + "epoch": 0.1002256590896582, + "grad_norm": 0.8817011117935181, + "learning_rate": 9.944452147577844e-06, + "loss": 0.8819, + "step": 1821 + }, + { + "epoch": 0.10028069789201387, + "grad_norm": 0.8560144901275635, + "learning_rate": 9.944387695794505e-06, + "loss": 0.8219, + "step": 1822 + }, + { + "epoch": 0.10033573669436953, + "grad_norm": 0.9358342885971069, + "learning_rate": 9.944323206850472e-06, + "loss": 0.8533, + "step": 1823 + }, + { + "epoch": 0.10039077549672519, + "grad_norm": 0.8327087163925171, + "learning_rate": 9.94425868074623e-06, + "loss": 0.8359, + "step": 1824 + }, + { + "epoch": 0.10044581429908085, + "grad_norm": 1.0590367317199707, + "learning_rate": 9.944194117482263e-06, + "loss": 0.9659, + "step": 1825 + }, + { + "epoch": 0.10050085310143651, + "grad_norm": 0.8739829063415527, + "learning_rate": 9.944129517059055e-06, + "loss": 0.7868, + "step": 1826 + }, + { + "epoch": 0.10055589190379217, + "grad_norm": 0.8465235233306885, + "learning_rate": 9.944064879477093e-06, + "loss": 0.8554, + "step": 1827 + }, + { + "epoch": 0.10061093070614784, + "grad_norm": 0.9068321585655212, + "learning_rate": 9.944000204736864e-06, + "loss": 0.8648, + "step": 1828 + }, + { + "epoch": 0.1006659695085035, + "grad_norm": 0.8308066725730896, + "learning_rate": 9.943935492838853e-06, + "loss": 0.8471, + "step": 1829 + }, + { + "epoch": 0.10072100831085916, + "grad_norm": 0.9973901510238647, + "learning_rate": 9.943870743783545e-06, + "loss": 0.9398, + "step": 1830 + }, + { + "epoch": 0.10077604711321482, + "grad_norm": 0.8532593250274658, + "learning_rate": 9.94380595757143e-06, + "loss": 0.9001, + "step": 1831 + }, + { + "epoch": 0.10083108591557048, + "grad_norm": 0.8571139574050903, + "learning_rate": 9.94374113420299e-06, + "loss": 0.85, + "step": 1832 + }, + { + "epoch": 0.10088612471792614, + "grad_norm": 0.905624508857727, + "learning_rate": 9.943676273678717e-06, + "loss": 0.9587, + "step": 1833 + }, + { + "epoch": 0.1009411635202818, + "grad_norm": 1.0224663019180298, + "learning_rate": 9.943611375999097e-06, + "loss": 0.8236, + "step": 1834 + }, + { + "epoch": 0.10099620232263747, + "grad_norm": 0.8900588154792786, + "learning_rate": 9.943546441164615e-06, + "loss": 0.877, + "step": 1835 + }, + { + "epoch": 0.10105124112499313, + "grad_norm": 0.8852938413619995, + "learning_rate": 9.943481469175765e-06, + "loss": 0.9521, + "step": 1836 + }, + { + "epoch": 0.10110627992734877, + "grad_norm": 0.9249371290206909, + "learning_rate": 9.943416460033027e-06, + "loss": 0.8541, + "step": 1837 + }, + { + "epoch": 0.10116131872970444, + "grad_norm": 0.8533583283424377, + "learning_rate": 9.943351413736897e-06, + "loss": 0.8571, + "step": 1838 + }, + { + "epoch": 0.1012163575320601, + "grad_norm": 0.743800699710846, + "learning_rate": 9.94328633028786e-06, + "loss": 0.749, + "step": 1839 + }, + { + "epoch": 0.10127139633441576, + "grad_norm": 0.7836641669273376, + "learning_rate": 9.943221209686407e-06, + "loss": 0.8237, + "step": 1840 + }, + { + "epoch": 0.10132643513677142, + "grad_norm": 0.800782322883606, + "learning_rate": 9.943156051933024e-06, + "loss": 0.8323, + "step": 1841 + }, + { + "epoch": 0.10138147393912708, + "grad_norm": 0.7531478404998779, + "learning_rate": 9.943090857028206e-06, + "loss": 0.8041, + "step": 1842 + }, + { + "epoch": 0.10143651274148274, + "grad_norm": 0.9837996959686279, + "learning_rate": 9.94302562497244e-06, + "loss": 0.8084, + "step": 1843 + }, + { + "epoch": 0.1014915515438384, + "grad_norm": 0.8038331866264343, + "learning_rate": 9.942960355766216e-06, + "loss": 0.8454, + "step": 1844 + }, + { + "epoch": 0.10154659034619407, + "grad_norm": 0.7822145819664001, + "learning_rate": 9.942895049410024e-06, + "loss": 0.8137, + "step": 1845 + }, + { + "epoch": 0.10160162914854973, + "grad_norm": 0.8222663998603821, + "learning_rate": 9.942829705904358e-06, + "loss": 0.8981, + "step": 1846 + }, + { + "epoch": 0.10165666795090539, + "grad_norm": 1.0095717906951904, + "learning_rate": 9.942764325249707e-06, + "loss": 0.9159, + "step": 1847 + }, + { + "epoch": 0.10171170675326105, + "grad_norm": 0.8264054656028748, + "learning_rate": 9.942698907446561e-06, + "loss": 0.9233, + "step": 1848 + }, + { + "epoch": 0.10176674555561671, + "grad_norm": 0.8244288563728333, + "learning_rate": 9.942633452495414e-06, + "loss": 0.8507, + "step": 1849 + }, + { + "epoch": 0.10182178435797237, + "grad_norm": 0.8457715511322021, + "learning_rate": 9.942567960396755e-06, + "loss": 0.7897, + "step": 1850 + }, + { + "epoch": 0.10187682316032803, + "grad_norm": 0.8356698155403137, + "learning_rate": 9.94250243115108e-06, + "loss": 0.7927, + "step": 1851 + }, + { + "epoch": 0.1019318619626837, + "grad_norm": 0.8251230716705322, + "learning_rate": 9.94243686475888e-06, + "loss": 0.8977, + "step": 1852 + }, + { + "epoch": 0.10198690076503936, + "grad_norm": 0.8370125889778137, + "learning_rate": 9.942371261220647e-06, + "loss": 0.8204, + "step": 1853 + }, + { + "epoch": 0.10204193956739502, + "grad_norm": 1.6722066402435303, + "learning_rate": 9.942305620536876e-06, + "loss": 0.9284, + "step": 1854 + }, + { + "epoch": 0.10209697836975068, + "grad_norm": 0.8424906730651855, + "learning_rate": 9.942239942708057e-06, + "loss": 0.833, + "step": 1855 + }, + { + "epoch": 0.10215201717210634, + "grad_norm": 0.7475115656852722, + "learning_rate": 9.942174227734686e-06, + "loss": 0.6158, + "step": 1856 + }, + { + "epoch": 0.102207055974462, + "grad_norm": 0.8652095198631287, + "learning_rate": 9.942108475617256e-06, + "loss": 0.8781, + "step": 1857 + }, + { + "epoch": 0.10226209477681765, + "grad_norm": 1.0621691942214966, + "learning_rate": 9.942042686356263e-06, + "loss": 1.0276, + "step": 1858 + }, + { + "epoch": 0.10231713357917331, + "grad_norm": 1.113357424736023, + "learning_rate": 9.941976859952199e-06, + "loss": 0.8799, + "step": 1859 + }, + { + "epoch": 0.10237217238152897, + "grad_norm": 0.9153568148612976, + "learning_rate": 9.94191099640556e-06, + "loss": 0.7988, + "step": 1860 + }, + { + "epoch": 0.10242721118388463, + "grad_norm": 0.9217341542243958, + "learning_rate": 9.941845095716842e-06, + "loss": 0.7785, + "step": 1861 + }, + { + "epoch": 0.1024822499862403, + "grad_norm": 0.8702190518379211, + "learning_rate": 9.941779157886538e-06, + "loss": 0.7648, + "step": 1862 + }, + { + "epoch": 0.10253728878859596, + "grad_norm": 0.8609822988510132, + "learning_rate": 9.941713182915144e-06, + "loss": 0.9095, + "step": 1863 + }, + { + "epoch": 0.10259232759095162, + "grad_norm": 0.7766719460487366, + "learning_rate": 9.941647170803157e-06, + "loss": 0.6984, + "step": 1864 + }, + { + "epoch": 0.10264736639330728, + "grad_norm": 0.8497375249862671, + "learning_rate": 9.941581121551074e-06, + "loss": 0.9161, + "step": 1865 + }, + { + "epoch": 0.10270240519566294, + "grad_norm": 0.8007600903511047, + "learning_rate": 9.941515035159388e-06, + "loss": 0.8099, + "step": 1866 + }, + { + "epoch": 0.1027574439980186, + "grad_norm": 0.7932959794998169, + "learning_rate": 9.941448911628599e-06, + "loss": 0.8049, + "step": 1867 + }, + { + "epoch": 0.10281248280037426, + "grad_norm": 1.3169244527816772, + "learning_rate": 9.941382750959203e-06, + "loss": 0.8601, + "step": 1868 + }, + { + "epoch": 0.10286752160272992, + "grad_norm": 0.8011140823364258, + "learning_rate": 9.941316553151696e-06, + "loss": 0.8397, + "step": 1869 + }, + { + "epoch": 0.10292256040508559, + "grad_norm": 0.811210572719574, + "learning_rate": 9.941250318206577e-06, + "loss": 0.7863, + "step": 1870 + }, + { + "epoch": 0.10297759920744125, + "grad_norm": 0.8172751665115356, + "learning_rate": 9.941184046124342e-06, + "loss": 0.8114, + "step": 1871 + }, + { + "epoch": 0.10303263800979691, + "grad_norm": 0.8072887063026428, + "learning_rate": 9.941117736905493e-06, + "loss": 0.8928, + "step": 1872 + }, + { + "epoch": 0.10308767681215257, + "grad_norm": 0.9111380577087402, + "learning_rate": 9.941051390550524e-06, + "loss": 0.866, + "step": 1873 + }, + { + "epoch": 0.10314271561450823, + "grad_norm": 0.8158383369445801, + "learning_rate": 9.940985007059936e-06, + "loss": 0.7805, + "step": 1874 + }, + { + "epoch": 0.1031977544168639, + "grad_norm": 0.8858961462974548, + "learning_rate": 9.940918586434226e-06, + "loss": 0.8424, + "step": 1875 + }, + { + "epoch": 0.10325279321921955, + "grad_norm": 0.8835182189941406, + "learning_rate": 9.940852128673895e-06, + "loss": 0.7816, + "step": 1876 + }, + { + "epoch": 0.10330783202157522, + "grad_norm": 1.044227123260498, + "learning_rate": 9.940785633779444e-06, + "loss": 0.8952, + "step": 1877 + }, + { + "epoch": 0.10336287082393088, + "grad_norm": 0.8255050778388977, + "learning_rate": 9.940719101751367e-06, + "loss": 0.8215, + "step": 1878 + }, + { + "epoch": 0.10341790962628654, + "grad_norm": 0.8561689257621765, + "learning_rate": 9.940652532590172e-06, + "loss": 0.9686, + "step": 1879 + }, + { + "epoch": 0.10347294842864219, + "grad_norm": 0.8798959255218506, + "learning_rate": 9.94058592629635e-06, + "loss": 0.8993, + "step": 1880 + }, + { + "epoch": 0.10352798723099785, + "grad_norm": 0.9292098879814148, + "learning_rate": 9.940519282870411e-06, + "loss": 0.8536, + "step": 1881 + }, + { + "epoch": 0.10358302603335351, + "grad_norm": 0.8865400552749634, + "learning_rate": 9.940452602312851e-06, + "loss": 0.8024, + "step": 1882 + }, + { + "epoch": 0.10363806483570917, + "grad_norm": 0.8985510468482971, + "learning_rate": 9.94038588462417e-06, + "loss": 0.7748, + "step": 1883 + }, + { + "epoch": 0.10369310363806483, + "grad_norm": 0.9973617196083069, + "learning_rate": 9.940319129804872e-06, + "loss": 0.875, + "step": 1884 + }, + { + "epoch": 0.1037481424404205, + "grad_norm": 0.8615350723266602, + "learning_rate": 9.940252337855458e-06, + "loss": 0.904, + "step": 1885 + }, + { + "epoch": 0.10380318124277615, + "grad_norm": 0.8752412796020508, + "learning_rate": 9.940185508776429e-06, + "loss": 0.8735, + "step": 1886 + }, + { + "epoch": 0.10385822004513182, + "grad_norm": 0.8639446496963501, + "learning_rate": 9.94011864256829e-06, + "loss": 0.7952, + "step": 1887 + }, + { + "epoch": 0.10391325884748748, + "grad_norm": 0.7932116389274597, + "learning_rate": 9.94005173923154e-06, + "loss": 0.8721, + "step": 1888 + }, + { + "epoch": 0.10396829764984314, + "grad_norm": 0.8573791980743408, + "learning_rate": 9.939984798766685e-06, + "loss": 0.9271, + "step": 1889 + }, + { + "epoch": 0.1040233364521988, + "grad_norm": 0.9080122113227844, + "learning_rate": 9.939917821174225e-06, + "loss": 0.8991, + "step": 1890 + }, + { + "epoch": 0.10407837525455446, + "grad_norm": 0.7883808612823486, + "learning_rate": 9.939850806454664e-06, + "loss": 0.6895, + "step": 1891 + }, + { + "epoch": 0.10413341405691012, + "grad_norm": 0.8067768216133118, + "learning_rate": 9.93978375460851e-06, + "loss": 0.835, + "step": 1892 + }, + { + "epoch": 0.10418845285926578, + "grad_norm": 0.8756459951400757, + "learning_rate": 9.939716665636262e-06, + "loss": 0.8144, + "step": 1893 + }, + { + "epoch": 0.10424349166162145, + "grad_norm": 0.8056700825691223, + "learning_rate": 9.939649539538425e-06, + "loss": 0.7454, + "step": 1894 + }, + { + "epoch": 0.10429853046397711, + "grad_norm": 1.0756300687789917, + "learning_rate": 9.939582376315505e-06, + "loss": 0.8096, + "step": 1895 + }, + { + "epoch": 0.10435356926633277, + "grad_norm": 0.8938102126121521, + "learning_rate": 9.939515175968006e-06, + "loss": 0.7496, + "step": 1896 + }, + { + "epoch": 0.10440860806868843, + "grad_norm": 0.9371656775474548, + "learning_rate": 9.939447938496434e-06, + "loss": 0.9817, + "step": 1897 + }, + { + "epoch": 0.10446364687104409, + "grad_norm": 1.0216082334518433, + "learning_rate": 9.939380663901292e-06, + "loss": 0.8804, + "step": 1898 + }, + { + "epoch": 0.10451868567339975, + "grad_norm": 0.8791126012802124, + "learning_rate": 9.939313352183088e-06, + "loss": 0.7811, + "step": 1899 + }, + { + "epoch": 0.10457372447575541, + "grad_norm": 0.9925445914268494, + "learning_rate": 9.939246003342326e-06, + "loss": 0.8892, + "step": 1900 + }, + { + "epoch": 0.10462876327811106, + "grad_norm": 1.0459916591644287, + "learning_rate": 9.939178617379514e-06, + "loss": 0.7938, + "step": 1901 + }, + { + "epoch": 0.10468380208046672, + "grad_norm": 0.9103816747665405, + "learning_rate": 9.93911119429516e-06, + "loss": 0.8282, + "step": 1902 + }, + { + "epoch": 0.10473884088282238, + "grad_norm": 0.9602296352386475, + "learning_rate": 9.939043734089764e-06, + "loss": 0.919, + "step": 1903 + }, + { + "epoch": 0.10479387968517805, + "grad_norm": 0.9529246687889099, + "learning_rate": 9.93897623676384e-06, + "loss": 0.9469, + "step": 1904 + }, + { + "epoch": 0.10484891848753371, + "grad_norm": 0.9619705080986023, + "learning_rate": 9.938908702317893e-06, + "loss": 0.9371, + "step": 1905 + }, + { + "epoch": 0.10490395728988937, + "grad_norm": 1.0106935501098633, + "learning_rate": 9.938841130752428e-06, + "loss": 0.7502, + "step": 1906 + }, + { + "epoch": 0.10495899609224503, + "grad_norm": 0.913985013961792, + "learning_rate": 9.938773522067957e-06, + "loss": 0.8172, + "step": 1907 + }, + { + "epoch": 0.10501403489460069, + "grad_norm": 0.9474983215332031, + "learning_rate": 9.938705876264985e-06, + "loss": 0.8999, + "step": 1908 + }, + { + "epoch": 0.10506907369695635, + "grad_norm": 0.9185097813606262, + "learning_rate": 9.938638193344024e-06, + "loss": 0.8976, + "step": 1909 + }, + { + "epoch": 0.10512411249931201, + "grad_norm": 0.7633675932884216, + "learning_rate": 9.938570473305578e-06, + "loss": 0.7777, + "step": 1910 + }, + { + "epoch": 0.10517915130166768, + "grad_norm": 0.9547691345214844, + "learning_rate": 9.938502716150159e-06, + "loss": 0.8154, + "step": 1911 + }, + { + "epoch": 0.10523419010402334, + "grad_norm": 0.8556191921234131, + "learning_rate": 9.938434921878275e-06, + "loss": 0.828, + "step": 1912 + }, + { + "epoch": 0.105289228906379, + "grad_norm": 0.9826140999794006, + "learning_rate": 9.938367090490437e-06, + "loss": 0.8085, + "step": 1913 + }, + { + "epoch": 0.10534426770873466, + "grad_norm": 0.8610432744026184, + "learning_rate": 9.938299221987154e-06, + "loss": 0.9103, + "step": 1914 + }, + { + "epoch": 0.10539930651109032, + "grad_norm": 0.8383543491363525, + "learning_rate": 9.938231316368934e-06, + "loss": 0.8182, + "step": 1915 + }, + { + "epoch": 0.10545434531344598, + "grad_norm": 0.8552964925765991, + "learning_rate": 9.93816337363629e-06, + "loss": 0.8024, + "step": 1916 + }, + { + "epoch": 0.10550938411580164, + "grad_norm": 0.9255730509757996, + "learning_rate": 9.938095393789732e-06, + "loss": 0.8566, + "step": 1917 + }, + { + "epoch": 0.1055644229181573, + "grad_norm": 0.9882987141609192, + "learning_rate": 9.938027376829774e-06, + "loss": 0.7119, + "step": 1918 + }, + { + "epoch": 0.10561946172051297, + "grad_norm": 1.139404535293579, + "learning_rate": 9.93795932275692e-06, + "loss": 0.8839, + "step": 1919 + }, + { + "epoch": 0.10567450052286863, + "grad_norm": 1.004782795906067, + "learning_rate": 9.937891231571686e-06, + "loss": 0.904, + "step": 1920 + }, + { + "epoch": 0.10572953932522429, + "grad_norm": 0.8437260389328003, + "learning_rate": 9.937823103274585e-06, + "loss": 0.7942, + "step": 1921 + }, + { + "epoch": 0.10578457812757995, + "grad_norm": 1.1388722658157349, + "learning_rate": 9.937754937866127e-06, + "loss": 0.9491, + "step": 1922 + }, + { + "epoch": 0.1058396169299356, + "grad_norm": 0.9266740083694458, + "learning_rate": 9.937686735346823e-06, + "loss": 0.9067, + "step": 1923 + }, + { + "epoch": 0.10589465573229126, + "grad_norm": 0.7536123991012573, + "learning_rate": 9.93761849571719e-06, + "loss": 0.6533, + "step": 1924 + }, + { + "epoch": 0.10594969453464692, + "grad_norm": 0.8781737089157104, + "learning_rate": 9.937550218977737e-06, + "loss": 0.8319, + "step": 1925 + }, + { + "epoch": 0.10600473333700258, + "grad_norm": 0.8577924966812134, + "learning_rate": 9.937481905128976e-06, + "loss": 0.8604, + "step": 1926 + }, + { + "epoch": 0.10605977213935824, + "grad_norm": 0.8351713418960571, + "learning_rate": 9.937413554171424e-06, + "loss": 0.946, + "step": 1927 + }, + { + "epoch": 0.1061148109417139, + "grad_norm": 0.971491813659668, + "learning_rate": 9.937345166105594e-06, + "loss": 0.7383, + "step": 1928 + }, + { + "epoch": 0.10616984974406957, + "grad_norm": 0.8020079731941223, + "learning_rate": 9.937276740932001e-06, + "loss": 0.7468, + "step": 1929 + }, + { + "epoch": 0.10622488854642523, + "grad_norm": 0.9057347178459167, + "learning_rate": 9.937208278651153e-06, + "loss": 0.8223, + "step": 1930 + }, + { + "epoch": 0.10627992734878089, + "grad_norm": 0.8384734392166138, + "learning_rate": 9.937139779263574e-06, + "loss": 0.8773, + "step": 1931 + }, + { + "epoch": 0.10633496615113655, + "grad_norm": 0.8732065558433533, + "learning_rate": 9.93707124276977e-06, + "loss": 0.8265, + "step": 1932 + }, + { + "epoch": 0.10639000495349221, + "grad_norm": 0.8744868040084839, + "learning_rate": 9.937002669170264e-06, + "loss": 0.8497, + "step": 1933 + }, + { + "epoch": 0.10644504375584787, + "grad_norm": 0.8589879870414734, + "learning_rate": 9.936934058465564e-06, + "loss": 0.8116, + "step": 1934 + }, + { + "epoch": 0.10650008255820353, + "grad_norm": 0.8614563941955566, + "learning_rate": 9.936865410656192e-06, + "loss": 0.7823, + "step": 1935 + }, + { + "epoch": 0.1065551213605592, + "grad_norm": 0.8381434082984924, + "learning_rate": 9.93679672574266e-06, + "loss": 0.7889, + "step": 1936 + }, + { + "epoch": 0.10661016016291486, + "grad_norm": 0.9834293127059937, + "learning_rate": 9.936728003725484e-06, + "loss": 0.8358, + "step": 1937 + }, + { + "epoch": 0.10666519896527052, + "grad_norm": 0.8461851477622986, + "learning_rate": 9.936659244605184e-06, + "loss": 0.8408, + "step": 1938 + }, + { + "epoch": 0.10672023776762618, + "grad_norm": 1.0186371803283691, + "learning_rate": 9.936590448382273e-06, + "loss": 0.8118, + "step": 1939 + }, + { + "epoch": 0.10677527656998184, + "grad_norm": 0.866321325302124, + "learning_rate": 9.93652161505727e-06, + "loss": 0.8696, + "step": 1940 + }, + { + "epoch": 0.1068303153723375, + "grad_norm": 0.9179622530937195, + "learning_rate": 9.936452744630692e-06, + "loss": 0.8419, + "step": 1941 + }, + { + "epoch": 0.10688535417469316, + "grad_norm": 0.8250496983528137, + "learning_rate": 9.936383837103057e-06, + "loss": 0.8511, + "step": 1942 + }, + { + "epoch": 0.10694039297704883, + "grad_norm": 0.8475700616836548, + "learning_rate": 9.936314892474883e-06, + "loss": 0.8404, + "step": 1943 + }, + { + "epoch": 0.10699543177940447, + "grad_norm": 0.774334192276001, + "learning_rate": 9.936245910746684e-06, + "loss": 0.7461, + "step": 1944 + }, + { + "epoch": 0.10705047058176013, + "grad_norm": 0.9313948154449463, + "learning_rate": 9.936176891918986e-06, + "loss": 0.8486, + "step": 1945 + }, + { + "epoch": 0.1071055093841158, + "grad_norm": 0.8784124255180359, + "learning_rate": 9.936107835992304e-06, + "loss": 0.84, + "step": 1946 + }, + { + "epoch": 0.10716054818647146, + "grad_norm": 0.9087465405464172, + "learning_rate": 9.936038742967154e-06, + "loss": 0.9012, + "step": 1947 + }, + { + "epoch": 0.10721558698882712, + "grad_norm": 0.8462012410163879, + "learning_rate": 9.93596961284406e-06, + "loss": 0.9193, + "step": 1948 + }, + { + "epoch": 0.10727062579118278, + "grad_norm": 0.8984553813934326, + "learning_rate": 9.935900445623538e-06, + "loss": 0.781, + "step": 1949 + }, + { + "epoch": 0.10732566459353844, + "grad_norm": 0.9197295308113098, + "learning_rate": 9.935831241306111e-06, + "loss": 0.8861, + "step": 1950 + }, + { + "epoch": 0.1073807033958941, + "grad_norm": 0.8452801704406738, + "learning_rate": 9.935761999892296e-06, + "loss": 0.8649, + "step": 1951 + }, + { + "epoch": 0.10743574219824976, + "grad_norm": 0.8047192096710205, + "learning_rate": 9.935692721382618e-06, + "loss": 0.8704, + "step": 1952 + }, + { + "epoch": 0.10749078100060543, + "grad_norm": 0.9536359906196594, + "learning_rate": 9.935623405777593e-06, + "loss": 0.7803, + "step": 1953 + }, + { + "epoch": 0.10754581980296109, + "grad_norm": 0.8215291500091553, + "learning_rate": 9.935554053077744e-06, + "loss": 0.8247, + "step": 1954 + }, + { + "epoch": 0.10760085860531675, + "grad_norm": 0.9261930584907532, + "learning_rate": 9.93548466328359e-06, + "loss": 0.8594, + "step": 1955 + }, + { + "epoch": 0.10765589740767241, + "grad_norm": 0.7973492741584778, + "learning_rate": 9.935415236395656e-06, + "loss": 0.7464, + "step": 1956 + }, + { + "epoch": 0.10771093621002807, + "grad_norm": 0.9328988790512085, + "learning_rate": 9.935345772414463e-06, + "loss": 0.8472, + "step": 1957 + }, + { + "epoch": 0.10776597501238373, + "grad_norm": 0.9490759968757629, + "learning_rate": 9.935276271340532e-06, + "loss": 0.806, + "step": 1958 + }, + { + "epoch": 0.1078210138147394, + "grad_norm": 0.9149925112724304, + "learning_rate": 9.935206733174385e-06, + "loss": 0.8741, + "step": 1959 + }, + { + "epoch": 0.10787605261709506, + "grad_norm": 1.0074039697647095, + "learning_rate": 9.935137157916546e-06, + "loss": 0.8493, + "step": 1960 + }, + { + "epoch": 0.10793109141945072, + "grad_norm": 0.8783678412437439, + "learning_rate": 9.935067545567535e-06, + "loss": 0.8132, + "step": 1961 + }, + { + "epoch": 0.10798613022180638, + "grad_norm": 0.8273885250091553, + "learning_rate": 9.934997896127879e-06, + "loss": 0.7448, + "step": 1962 + }, + { + "epoch": 0.10804116902416204, + "grad_norm": 0.761947512626648, + "learning_rate": 9.9349282095981e-06, + "loss": 0.7933, + "step": 1963 + }, + { + "epoch": 0.1080962078265177, + "grad_norm": 0.814809262752533, + "learning_rate": 9.934858485978722e-06, + "loss": 0.7551, + "step": 1964 + }, + { + "epoch": 0.10815124662887336, + "grad_norm": 0.8108895421028137, + "learning_rate": 9.934788725270266e-06, + "loss": 0.6787, + "step": 1965 + }, + { + "epoch": 0.10820628543122901, + "grad_norm": 0.8669139742851257, + "learning_rate": 9.934718927473262e-06, + "loss": 0.8395, + "step": 1966 + }, + { + "epoch": 0.10826132423358467, + "grad_norm": 0.9093756079673767, + "learning_rate": 9.93464909258823e-06, + "loss": 0.8341, + "step": 1967 + }, + { + "epoch": 0.10831636303594033, + "grad_norm": 0.8923841714859009, + "learning_rate": 9.934579220615697e-06, + "loss": 0.9422, + "step": 1968 + }, + { + "epoch": 0.108371401838296, + "grad_norm": 0.850429117679596, + "learning_rate": 9.934509311556186e-06, + "loss": 0.8446, + "step": 1969 + }, + { + "epoch": 0.10842644064065166, + "grad_norm": 0.8762460350990295, + "learning_rate": 9.934439365410224e-06, + "loss": 0.7788, + "step": 1970 + }, + { + "epoch": 0.10848147944300732, + "grad_norm": 0.9700387716293335, + "learning_rate": 9.934369382178338e-06, + "loss": 0.8455, + "step": 1971 + }, + { + "epoch": 0.10853651824536298, + "grad_norm": 0.8003185987472534, + "learning_rate": 9.934299361861053e-06, + "loss": 0.8026, + "step": 1972 + }, + { + "epoch": 0.10859155704771864, + "grad_norm": 0.9626984596252441, + "learning_rate": 9.934229304458893e-06, + "loss": 0.8219, + "step": 1973 + }, + { + "epoch": 0.1086465958500743, + "grad_norm": 0.8722280859947205, + "learning_rate": 9.934159209972386e-06, + "loss": 0.8866, + "step": 1974 + }, + { + "epoch": 0.10870163465242996, + "grad_norm": 0.838736355304718, + "learning_rate": 9.934089078402061e-06, + "loss": 0.7723, + "step": 1975 + }, + { + "epoch": 0.10875667345478562, + "grad_norm": 0.8373032808303833, + "learning_rate": 9.934018909748443e-06, + "loss": 0.9003, + "step": 1976 + }, + { + "epoch": 0.10881171225714129, + "grad_norm": 0.8704653978347778, + "learning_rate": 9.93394870401206e-06, + "loss": 0.8926, + "step": 1977 + }, + { + "epoch": 0.10886675105949695, + "grad_norm": 0.8088163733482361, + "learning_rate": 9.933878461193437e-06, + "loss": 0.8059, + "step": 1978 + }, + { + "epoch": 0.10892178986185261, + "grad_norm": 0.856421947479248, + "learning_rate": 9.933808181293108e-06, + "loss": 0.8447, + "step": 1979 + }, + { + "epoch": 0.10897682866420827, + "grad_norm": 0.9676237106323242, + "learning_rate": 9.933737864311595e-06, + "loss": 0.9009, + "step": 1980 + }, + { + "epoch": 0.10903186746656393, + "grad_norm": 0.7955103516578674, + "learning_rate": 9.933667510249428e-06, + "loss": 0.881, + "step": 1981 + }, + { + "epoch": 0.10908690626891959, + "grad_norm": 0.7935854196548462, + "learning_rate": 9.933597119107136e-06, + "loss": 0.8773, + "step": 1982 + }, + { + "epoch": 0.10914194507127525, + "grad_norm": 0.7726008296012878, + "learning_rate": 9.933526690885251e-06, + "loss": 0.8133, + "step": 1983 + }, + { + "epoch": 0.10919698387363092, + "grad_norm": 0.8577712178230286, + "learning_rate": 9.9334562255843e-06, + "loss": 0.7455, + "step": 1984 + }, + { + "epoch": 0.10925202267598658, + "grad_norm": 0.9996447563171387, + "learning_rate": 9.933385723204812e-06, + "loss": 0.7312, + "step": 1985 + }, + { + "epoch": 0.10930706147834224, + "grad_norm": 0.9600629806518555, + "learning_rate": 9.933315183747318e-06, + "loss": 0.8792, + "step": 1986 + }, + { + "epoch": 0.10936210028069789, + "grad_norm": 0.9126206636428833, + "learning_rate": 9.933244607212347e-06, + "loss": 1.0023, + "step": 1987 + }, + { + "epoch": 0.10941713908305355, + "grad_norm": 0.774153470993042, + "learning_rate": 9.93317399360043e-06, + "loss": 0.7877, + "step": 1988 + }, + { + "epoch": 0.10947217788540921, + "grad_norm": 0.848495364189148, + "learning_rate": 9.933103342912096e-06, + "loss": 0.8825, + "step": 1989 + }, + { + "epoch": 0.10952721668776487, + "grad_norm": 0.806408166885376, + "learning_rate": 9.933032655147881e-06, + "loss": 0.7389, + "step": 1990 + }, + { + "epoch": 0.10958225549012053, + "grad_norm": 0.8579222559928894, + "learning_rate": 9.932961930308312e-06, + "loss": 0.8283, + "step": 1991 + }, + { + "epoch": 0.10963729429247619, + "grad_norm": 0.7548109292984009, + "learning_rate": 9.93289116839392e-06, + "loss": 0.7971, + "step": 1992 + }, + { + "epoch": 0.10969233309483185, + "grad_norm": 0.7954711318016052, + "learning_rate": 9.93282036940524e-06, + "loss": 0.849, + "step": 1993 + }, + { + "epoch": 0.10974737189718752, + "grad_norm": 0.7911425232887268, + "learning_rate": 9.932749533342802e-06, + "loss": 0.86, + "step": 1994 + }, + { + "epoch": 0.10980241069954318, + "grad_norm": 0.8505094051361084, + "learning_rate": 9.932678660207141e-06, + "loss": 0.7871, + "step": 1995 + }, + { + "epoch": 0.10985744950189884, + "grad_norm": 0.809612512588501, + "learning_rate": 9.932607749998784e-06, + "loss": 0.8337, + "step": 1996 + }, + { + "epoch": 0.1099124883042545, + "grad_norm": 0.738523006439209, + "learning_rate": 9.93253680271827e-06, + "loss": 0.7634, + "step": 1997 + }, + { + "epoch": 0.10996752710661016, + "grad_norm": 0.8434372544288635, + "learning_rate": 9.932465818366128e-06, + "loss": 0.7987, + "step": 1998 + }, + { + "epoch": 0.11002256590896582, + "grad_norm": 0.8068081140518188, + "learning_rate": 9.932394796942895e-06, + "loss": 0.9496, + "step": 1999 + }, + { + "epoch": 0.11007760471132148, + "grad_norm": 0.754342794418335, + "learning_rate": 9.932323738449103e-06, + "loss": 0.7355, + "step": 2000 + }, + { + "epoch": 0.11013264351367714, + "grad_norm": 0.8830806612968445, + "learning_rate": 9.932252642885285e-06, + "loss": 0.8458, + "step": 2001 + }, + { + "epoch": 0.1101876823160328, + "grad_norm": 0.9915485978126526, + "learning_rate": 9.932181510251977e-06, + "loss": 0.8116, + "step": 2002 + }, + { + "epoch": 0.11024272111838847, + "grad_norm": 0.858368992805481, + "learning_rate": 9.932110340549712e-06, + "loss": 0.8354, + "step": 2003 + }, + { + "epoch": 0.11029775992074413, + "grad_norm": 0.8591521382331848, + "learning_rate": 9.932039133779028e-06, + "loss": 0.8316, + "step": 2004 + }, + { + "epoch": 0.11035279872309979, + "grad_norm": 0.8714838624000549, + "learning_rate": 9.931967889940455e-06, + "loss": 0.8106, + "step": 2005 + }, + { + "epoch": 0.11040783752545545, + "grad_norm": 0.8082797527313232, + "learning_rate": 9.931896609034534e-06, + "loss": 0.7762, + "step": 2006 + }, + { + "epoch": 0.11046287632781111, + "grad_norm": 0.9226199984550476, + "learning_rate": 9.931825291061797e-06, + "loss": 0.8641, + "step": 2007 + }, + { + "epoch": 0.11051791513016677, + "grad_norm": 0.8883050680160522, + "learning_rate": 9.931753936022783e-06, + "loss": 0.9014, + "step": 2008 + }, + { + "epoch": 0.11057295393252242, + "grad_norm": 0.9024807810783386, + "learning_rate": 9.931682543918024e-06, + "loss": 0.9085, + "step": 2009 + }, + { + "epoch": 0.11062799273487808, + "grad_norm": 0.8381460905075073, + "learning_rate": 9.931611114748062e-06, + "loss": 0.8043, + "step": 2010 + }, + { + "epoch": 0.11068303153723374, + "grad_norm": 1.1222339868545532, + "learning_rate": 9.931539648513429e-06, + "loss": 0.8388, + "step": 2011 + }, + { + "epoch": 0.1107380703395894, + "grad_norm": 0.9710868000984192, + "learning_rate": 9.931468145214665e-06, + "loss": 0.8934, + "step": 2012 + }, + { + "epoch": 0.11079310914194507, + "grad_norm": 0.9821141958236694, + "learning_rate": 9.931396604852304e-06, + "loss": 0.931, + "step": 2013 + }, + { + "epoch": 0.11084814794430073, + "grad_norm": 1.0658717155456543, + "learning_rate": 9.931325027426889e-06, + "loss": 0.9032, + "step": 2014 + }, + { + "epoch": 0.11090318674665639, + "grad_norm": 0.8836946487426758, + "learning_rate": 9.931253412938956e-06, + "loss": 0.9131, + "step": 2015 + }, + { + "epoch": 0.11095822554901205, + "grad_norm": 0.8438361883163452, + "learning_rate": 9.93118176138904e-06, + "loss": 0.8674, + "step": 2016 + }, + { + "epoch": 0.11101326435136771, + "grad_norm": 0.928142786026001, + "learning_rate": 9.93111007277768e-06, + "loss": 0.8882, + "step": 2017 + }, + { + "epoch": 0.11106830315372337, + "grad_norm": 0.9176276922225952, + "learning_rate": 9.93103834710542e-06, + "loss": 0.8904, + "step": 2018 + }, + { + "epoch": 0.11112334195607904, + "grad_norm": 1.0462889671325684, + "learning_rate": 9.930966584372795e-06, + "loss": 0.8029, + "step": 2019 + }, + { + "epoch": 0.1111783807584347, + "grad_norm": 0.7627375721931458, + "learning_rate": 9.930894784580344e-06, + "loss": 0.8474, + "step": 2020 + }, + { + "epoch": 0.11123341956079036, + "grad_norm": 1.0545588731765747, + "learning_rate": 9.93082294772861e-06, + "loss": 0.7985, + "step": 2021 + }, + { + "epoch": 0.11128845836314602, + "grad_norm": 0.9752298593521118, + "learning_rate": 9.93075107381813e-06, + "loss": 0.8725, + "step": 2022 + }, + { + "epoch": 0.11134349716550168, + "grad_norm": 0.8403159379959106, + "learning_rate": 9.930679162849444e-06, + "loss": 0.8854, + "step": 2023 + }, + { + "epoch": 0.11139853596785734, + "grad_norm": 0.8879380226135254, + "learning_rate": 9.930607214823094e-06, + "loss": 0.7269, + "step": 2024 + }, + { + "epoch": 0.111453574770213, + "grad_norm": 0.907256543636322, + "learning_rate": 9.930535229739618e-06, + "loss": 0.8145, + "step": 2025 + }, + { + "epoch": 0.11150861357256867, + "grad_norm": 1.1066968441009521, + "learning_rate": 9.93046320759956e-06, + "loss": 0.9281, + "step": 2026 + }, + { + "epoch": 0.11156365237492433, + "grad_norm": 0.9226258397102356, + "learning_rate": 9.930391148403462e-06, + "loss": 0.9048, + "step": 2027 + }, + { + "epoch": 0.11161869117727999, + "grad_norm": 0.9652156829833984, + "learning_rate": 9.930319052151862e-06, + "loss": 0.9321, + "step": 2028 + }, + { + "epoch": 0.11167372997963565, + "grad_norm": 0.9102638363838196, + "learning_rate": 9.930246918845305e-06, + "loss": 0.8169, + "step": 2029 + }, + { + "epoch": 0.1117287687819913, + "grad_norm": 0.7765716314315796, + "learning_rate": 9.93017474848433e-06, + "loss": 0.7691, + "step": 2030 + }, + { + "epoch": 0.11178380758434696, + "grad_norm": 0.9053775072097778, + "learning_rate": 9.930102541069484e-06, + "loss": 0.782, + "step": 2031 + }, + { + "epoch": 0.11183884638670262, + "grad_norm": 0.8892827033996582, + "learning_rate": 9.930030296601306e-06, + "loss": 0.8575, + "step": 2032 + }, + { + "epoch": 0.11189388518905828, + "grad_norm": 0.8947604894638062, + "learning_rate": 9.929958015080339e-06, + "loss": 0.8607, + "step": 2033 + }, + { + "epoch": 0.11194892399141394, + "grad_norm": 0.8936871290206909, + "learning_rate": 9.929885696507127e-06, + "loss": 0.8111, + "step": 2034 + }, + { + "epoch": 0.1120039627937696, + "grad_norm": 0.9579165577888489, + "learning_rate": 9.929813340882214e-06, + "loss": 0.911, + "step": 2035 + }, + { + "epoch": 0.11205900159612527, + "grad_norm": 0.7885386347770691, + "learning_rate": 9.929740948206146e-06, + "loss": 0.8074, + "step": 2036 + }, + { + "epoch": 0.11211404039848093, + "grad_norm": 0.817939281463623, + "learning_rate": 9.929668518479462e-06, + "loss": 0.8451, + "step": 2037 + }, + { + "epoch": 0.11216907920083659, + "grad_norm": 0.8695761561393738, + "learning_rate": 9.92959605170271e-06, + "loss": 0.7158, + "step": 2038 + }, + { + "epoch": 0.11222411800319225, + "grad_norm": 0.8569639325141907, + "learning_rate": 9.929523547876433e-06, + "loss": 0.8568, + "step": 2039 + }, + { + "epoch": 0.11227915680554791, + "grad_norm": 0.8569897413253784, + "learning_rate": 9.929451007001176e-06, + "loss": 0.8971, + "step": 2040 + }, + { + "epoch": 0.11233419560790357, + "grad_norm": 0.8520069718360901, + "learning_rate": 9.929378429077487e-06, + "loss": 0.9027, + "step": 2041 + }, + { + "epoch": 0.11238923441025923, + "grad_norm": 0.9338961839675903, + "learning_rate": 9.929305814105907e-06, + "loss": 0.8646, + "step": 2042 + }, + { + "epoch": 0.1124442732126149, + "grad_norm": 0.8497192859649658, + "learning_rate": 9.929233162086985e-06, + "loss": 0.9068, + "step": 2043 + }, + { + "epoch": 0.11249931201497056, + "grad_norm": 0.8570863008499146, + "learning_rate": 9.929160473021267e-06, + "loss": 0.962, + "step": 2044 + }, + { + "epoch": 0.11255435081732622, + "grad_norm": 0.9072359800338745, + "learning_rate": 9.929087746909296e-06, + "loss": 0.8454, + "step": 2045 + }, + { + "epoch": 0.11260938961968188, + "grad_norm": 0.7920698523521423, + "learning_rate": 9.929014983751623e-06, + "loss": 0.8031, + "step": 2046 + }, + { + "epoch": 0.11266442842203754, + "grad_norm": 1.0180169343948364, + "learning_rate": 9.928942183548791e-06, + "loss": 0.7759, + "step": 2047 + }, + { + "epoch": 0.1127194672243932, + "grad_norm": 0.8746892809867859, + "learning_rate": 9.928869346301351e-06, + "loss": 0.9038, + "step": 2048 + }, + { + "epoch": 0.11277450602674886, + "grad_norm": 0.8283438086509705, + "learning_rate": 9.928796472009846e-06, + "loss": 0.8883, + "step": 2049 + }, + { + "epoch": 0.11282954482910452, + "grad_norm": 1.321917176246643, + "learning_rate": 9.928723560674828e-06, + "loss": 0.835, + "step": 2050 + }, + { + "epoch": 0.11288458363146017, + "grad_norm": 0.9356202483177185, + "learning_rate": 9.928650612296841e-06, + "loss": 0.8077, + "step": 2051 + }, + { + "epoch": 0.11293962243381583, + "grad_norm": 0.8493767380714417, + "learning_rate": 9.928577626876439e-06, + "loss": 0.8295, + "step": 2052 + }, + { + "epoch": 0.1129946612361715, + "grad_norm": 0.784818708896637, + "learning_rate": 9.928504604414164e-06, + "loss": 0.8322, + "step": 2053 + }, + { + "epoch": 0.11304970003852716, + "grad_norm": 0.9095364809036255, + "learning_rate": 9.928431544910567e-06, + "loss": 0.8757, + "step": 2054 + }, + { + "epoch": 0.11310473884088282, + "grad_norm": 0.8889689445495605, + "learning_rate": 9.9283584483662e-06, + "loss": 0.8583, + "step": 2055 + }, + { + "epoch": 0.11315977764323848, + "grad_norm": 0.8702652454376221, + "learning_rate": 9.928285314781607e-06, + "loss": 0.8414, + "step": 2056 + }, + { + "epoch": 0.11321481644559414, + "grad_norm": 0.8531168699264526, + "learning_rate": 9.928212144157342e-06, + "loss": 0.7844, + "step": 2057 + }, + { + "epoch": 0.1132698552479498, + "grad_norm": 1.0250271558761597, + "learning_rate": 9.928138936493956e-06, + "loss": 0.8766, + "step": 2058 + }, + { + "epoch": 0.11332489405030546, + "grad_norm": 0.7963449358940125, + "learning_rate": 9.928065691791996e-06, + "loss": 0.8166, + "step": 2059 + }, + { + "epoch": 0.11337993285266112, + "grad_norm": 1.1033011674880981, + "learning_rate": 9.927992410052013e-06, + "loss": 0.8748, + "step": 2060 + }, + { + "epoch": 0.11343497165501679, + "grad_norm": 0.8760959506034851, + "learning_rate": 9.927919091274558e-06, + "loss": 0.8623, + "step": 2061 + }, + { + "epoch": 0.11349001045737245, + "grad_norm": 1.1783028841018677, + "learning_rate": 9.927845735460182e-06, + "loss": 0.9144, + "step": 2062 + }, + { + "epoch": 0.11354504925972811, + "grad_norm": 0.8868625164031982, + "learning_rate": 9.927772342609437e-06, + "loss": 0.8614, + "step": 2063 + }, + { + "epoch": 0.11360008806208377, + "grad_norm": 0.8784704804420471, + "learning_rate": 9.927698912722874e-06, + "loss": 0.7802, + "step": 2064 + }, + { + "epoch": 0.11365512686443943, + "grad_norm": 1.0090643167495728, + "learning_rate": 9.927625445801046e-06, + "loss": 0.8876, + "step": 2065 + }, + { + "epoch": 0.1137101656667951, + "grad_norm": 0.7624390721321106, + "learning_rate": 9.927551941844502e-06, + "loss": 0.794, + "step": 2066 + }, + { + "epoch": 0.11376520446915075, + "grad_norm": 0.7814189791679382, + "learning_rate": 9.927478400853798e-06, + "loss": 0.8176, + "step": 2067 + }, + { + "epoch": 0.11382024327150642, + "grad_norm": 0.876338541507721, + "learning_rate": 9.927404822829486e-06, + "loss": 0.8634, + "step": 2068 + }, + { + "epoch": 0.11387528207386208, + "grad_norm": 0.7931430339813232, + "learning_rate": 9.927331207772117e-06, + "loss": 0.8012, + "step": 2069 + }, + { + "epoch": 0.11393032087621774, + "grad_norm": 1.0064504146575928, + "learning_rate": 9.927257555682246e-06, + "loss": 0.8321, + "step": 2070 + }, + { + "epoch": 0.1139853596785734, + "grad_norm": 0.8233053684234619, + "learning_rate": 9.927183866560425e-06, + "loss": 0.8004, + "step": 2071 + }, + { + "epoch": 0.11404039848092906, + "grad_norm": 1.0106632709503174, + "learning_rate": 9.927110140407211e-06, + "loss": 0.8627, + "step": 2072 + }, + { + "epoch": 0.11409543728328471, + "grad_norm": 0.8262843489646912, + "learning_rate": 9.927036377223155e-06, + "loss": 0.737, + "step": 2073 + }, + { + "epoch": 0.11415047608564037, + "grad_norm": 0.9349029660224915, + "learning_rate": 9.926962577008813e-06, + "loss": 0.9049, + "step": 2074 + }, + { + "epoch": 0.11420551488799603, + "grad_norm": 0.8689929842948914, + "learning_rate": 9.926888739764739e-06, + "loss": 0.7858, + "step": 2075 + }, + { + "epoch": 0.1142605536903517, + "grad_norm": 0.8442347645759583, + "learning_rate": 9.926814865491487e-06, + "loss": 0.8145, + "step": 2076 + }, + { + "epoch": 0.11431559249270735, + "grad_norm": 0.9143397212028503, + "learning_rate": 9.926740954189615e-06, + "loss": 0.8025, + "step": 2077 + }, + { + "epoch": 0.11437063129506302, + "grad_norm": 1.293251395225525, + "learning_rate": 9.926667005859676e-06, + "loss": 1.0256, + "step": 2078 + }, + { + "epoch": 0.11442567009741868, + "grad_norm": 0.9661351442337036, + "learning_rate": 9.926593020502226e-06, + "loss": 0.991, + "step": 2079 + }, + { + "epoch": 0.11448070889977434, + "grad_norm": 0.8110861778259277, + "learning_rate": 9.926518998117823e-06, + "loss": 0.7129, + "step": 2080 + }, + { + "epoch": 0.11453574770213, + "grad_norm": 0.8351119160652161, + "learning_rate": 9.92644493870702e-06, + "loss": 0.8894, + "step": 2081 + }, + { + "epoch": 0.11459078650448566, + "grad_norm": 0.8492733240127563, + "learning_rate": 9.926370842270377e-06, + "loss": 0.8039, + "step": 2082 + }, + { + "epoch": 0.11464582530684132, + "grad_norm": 0.895353376865387, + "learning_rate": 9.92629670880845e-06, + "loss": 0.8743, + "step": 2083 + }, + { + "epoch": 0.11470086410919698, + "grad_norm": 0.7871271967887878, + "learning_rate": 9.926222538321795e-06, + "loss": 0.8426, + "step": 2084 + }, + { + "epoch": 0.11475590291155265, + "grad_norm": 0.8904643058776855, + "learning_rate": 9.92614833081097e-06, + "loss": 0.8454, + "step": 2085 + }, + { + "epoch": 0.11481094171390831, + "grad_norm": 0.9166308641433716, + "learning_rate": 9.926074086276532e-06, + "loss": 0.9162, + "step": 2086 + }, + { + "epoch": 0.11486598051626397, + "grad_norm": 0.8730728626251221, + "learning_rate": 9.92599980471904e-06, + "loss": 0.8524, + "step": 2087 + }, + { + "epoch": 0.11492101931861963, + "grad_norm": 0.7932829260826111, + "learning_rate": 9.925925486139052e-06, + "loss": 0.7838, + "step": 2088 + }, + { + "epoch": 0.11497605812097529, + "grad_norm": 1.0033760070800781, + "learning_rate": 9.925851130537127e-06, + "loss": 0.8746, + "step": 2089 + }, + { + "epoch": 0.11503109692333095, + "grad_norm": 0.7783192992210388, + "learning_rate": 9.925776737913823e-06, + "loss": 0.7308, + "step": 2090 + }, + { + "epoch": 0.11508613572568661, + "grad_norm": 0.8441587686538696, + "learning_rate": 9.925702308269702e-06, + "loss": 0.7933, + "step": 2091 + }, + { + "epoch": 0.11514117452804228, + "grad_norm": 0.9433023929595947, + "learning_rate": 9.925627841605319e-06, + "loss": 0.7857, + "step": 2092 + }, + { + "epoch": 0.11519621333039794, + "grad_norm": 0.8958256244659424, + "learning_rate": 9.925553337921235e-06, + "loss": 0.9116, + "step": 2093 + }, + { + "epoch": 0.11525125213275358, + "grad_norm": 0.7610845565795898, + "learning_rate": 9.925478797218011e-06, + "loss": 0.8006, + "step": 2094 + }, + { + "epoch": 0.11530629093510925, + "grad_norm": 0.7977023720741272, + "learning_rate": 9.925404219496207e-06, + "loss": 0.8068, + "step": 2095 + }, + { + "epoch": 0.11536132973746491, + "grad_norm": 0.8087283372879028, + "learning_rate": 9.925329604756383e-06, + "loss": 0.7968, + "step": 2096 + }, + { + "epoch": 0.11541636853982057, + "grad_norm": 1.1066477298736572, + "learning_rate": 9.925254952999102e-06, + "loss": 0.8167, + "step": 2097 + }, + { + "epoch": 0.11547140734217623, + "grad_norm": 0.7806832194328308, + "learning_rate": 9.925180264224921e-06, + "loss": 0.8069, + "step": 2098 + }, + { + "epoch": 0.11552644614453189, + "grad_norm": 0.7745190858840942, + "learning_rate": 9.925105538434406e-06, + "loss": 0.7968, + "step": 2099 + }, + { + "epoch": 0.11558148494688755, + "grad_norm": 0.9045543074607849, + "learning_rate": 9.925030775628113e-06, + "loss": 0.8417, + "step": 2100 + }, + { + "epoch": 0.11563652374924321, + "grad_norm": 1.2962623834609985, + "learning_rate": 9.924955975806608e-06, + "loss": 0.8162, + "step": 2101 + }, + { + "epoch": 0.11569156255159888, + "grad_norm": 0.8571485877037048, + "learning_rate": 9.924881138970453e-06, + "loss": 0.8581, + "step": 2102 + }, + { + "epoch": 0.11574660135395454, + "grad_norm": 0.8326650857925415, + "learning_rate": 9.92480626512021e-06, + "loss": 0.8438, + "step": 2103 + }, + { + "epoch": 0.1158016401563102, + "grad_norm": 0.7973701357841492, + "learning_rate": 9.924731354256441e-06, + "loss": 0.8337, + "step": 2104 + }, + { + "epoch": 0.11585667895866586, + "grad_norm": 0.8614075779914856, + "learning_rate": 9.924656406379708e-06, + "loss": 0.8275, + "step": 2105 + }, + { + "epoch": 0.11591171776102152, + "grad_norm": 0.7911350131034851, + "learning_rate": 9.924581421490577e-06, + "loss": 0.8032, + "step": 2106 + }, + { + "epoch": 0.11596675656337718, + "grad_norm": 0.8763116598129272, + "learning_rate": 9.92450639958961e-06, + "loss": 0.8725, + "step": 2107 + }, + { + "epoch": 0.11602179536573284, + "grad_norm": 0.9754133224487305, + "learning_rate": 9.92443134067737e-06, + "loss": 0.9115, + "step": 2108 + }, + { + "epoch": 0.1160768341680885, + "grad_norm": 0.7783731818199158, + "learning_rate": 9.924356244754425e-06, + "loss": 0.8223, + "step": 2109 + }, + { + "epoch": 0.11613187297044417, + "grad_norm": 0.865301787853241, + "learning_rate": 9.924281111821335e-06, + "loss": 0.8053, + "step": 2110 + }, + { + "epoch": 0.11618691177279983, + "grad_norm": 0.8654297590255737, + "learning_rate": 9.924205941878666e-06, + "loss": 0.716, + "step": 2111 + }, + { + "epoch": 0.11624195057515549, + "grad_norm": 0.7646550536155701, + "learning_rate": 9.924130734926982e-06, + "loss": 0.8027, + "step": 2112 + }, + { + "epoch": 0.11629698937751115, + "grad_norm": 0.810587465763092, + "learning_rate": 9.924055490966851e-06, + "loss": 0.7416, + "step": 2113 + }, + { + "epoch": 0.11635202817986681, + "grad_norm": 0.8610082268714905, + "learning_rate": 9.923980209998838e-06, + "loss": 0.8527, + "step": 2114 + }, + { + "epoch": 0.11640706698222247, + "grad_norm": 0.8409233689308167, + "learning_rate": 9.923904892023506e-06, + "loss": 0.8169, + "step": 2115 + }, + { + "epoch": 0.11646210578457812, + "grad_norm": 0.7786587476730347, + "learning_rate": 9.923829537041425e-06, + "loss": 0.6897, + "step": 2116 + }, + { + "epoch": 0.11651714458693378, + "grad_norm": 0.852908730506897, + "learning_rate": 9.923754145053158e-06, + "loss": 0.7821, + "step": 2117 + }, + { + "epoch": 0.11657218338928944, + "grad_norm": 0.9130391478538513, + "learning_rate": 9.923678716059273e-06, + "loss": 1.0377, + "step": 2118 + }, + { + "epoch": 0.1166272221916451, + "grad_norm": 0.8371701240539551, + "learning_rate": 9.923603250060336e-06, + "loss": 0.8312, + "step": 2119 + }, + { + "epoch": 0.11668226099400077, + "grad_norm": 0.8045756220817566, + "learning_rate": 9.923527747056916e-06, + "loss": 0.7971, + "step": 2120 + }, + { + "epoch": 0.11673729979635643, + "grad_norm": 0.8832160234451294, + "learning_rate": 9.923452207049577e-06, + "loss": 0.7362, + "step": 2121 + }, + { + "epoch": 0.11679233859871209, + "grad_norm": 0.8253088593482971, + "learning_rate": 9.923376630038893e-06, + "loss": 0.8177, + "step": 2122 + }, + { + "epoch": 0.11684737740106775, + "grad_norm": 0.7953168749809265, + "learning_rate": 9.923301016025424e-06, + "loss": 0.7053, + "step": 2123 + }, + { + "epoch": 0.11690241620342341, + "grad_norm": 0.7256457805633545, + "learning_rate": 9.923225365009745e-06, + "loss": 0.7554, + "step": 2124 + }, + { + "epoch": 0.11695745500577907, + "grad_norm": 0.9896693229675293, + "learning_rate": 9.923149676992424e-06, + "loss": 0.8285, + "step": 2125 + }, + { + "epoch": 0.11701249380813473, + "grad_norm": 0.7846312522888184, + "learning_rate": 9.923073951974023e-06, + "loss": 0.7527, + "step": 2126 + }, + { + "epoch": 0.1170675326104904, + "grad_norm": 0.8949825167655945, + "learning_rate": 9.92299818995512e-06, + "loss": 0.8545, + "step": 2127 + }, + { + "epoch": 0.11712257141284606, + "grad_norm": 1.0023548603057861, + "learning_rate": 9.922922390936278e-06, + "loss": 0.7668, + "step": 2128 + }, + { + "epoch": 0.11717761021520172, + "grad_norm": 0.8663881421089172, + "learning_rate": 9.92284655491807e-06, + "loss": 0.8073, + "step": 2129 + }, + { + "epoch": 0.11723264901755738, + "grad_norm": 0.8274385929107666, + "learning_rate": 9.922770681901064e-06, + "loss": 0.9002, + "step": 2130 + }, + { + "epoch": 0.11728768781991304, + "grad_norm": 0.8508959412574768, + "learning_rate": 9.922694771885832e-06, + "loss": 0.9325, + "step": 2131 + }, + { + "epoch": 0.1173427266222687, + "grad_norm": 0.8176792860031128, + "learning_rate": 9.922618824872946e-06, + "loss": 0.8415, + "step": 2132 + }, + { + "epoch": 0.11739776542462436, + "grad_norm": 0.770951509475708, + "learning_rate": 9.922542840862971e-06, + "loss": 0.8051, + "step": 2133 + }, + { + "epoch": 0.11745280422698003, + "grad_norm": 0.8558167219161987, + "learning_rate": 9.922466819856484e-06, + "loss": 0.85, + "step": 2134 + }, + { + "epoch": 0.11750784302933569, + "grad_norm": 0.8288151025772095, + "learning_rate": 9.922390761854053e-06, + "loss": 0.8141, + "step": 2135 + }, + { + "epoch": 0.11756288183169135, + "grad_norm": 0.8220882415771484, + "learning_rate": 9.922314666856252e-06, + "loss": 0.8109, + "step": 2136 + }, + { + "epoch": 0.117617920634047, + "grad_norm": 0.7875000238418579, + "learning_rate": 9.92223853486365e-06, + "loss": 0.9085, + "step": 2137 + }, + { + "epoch": 0.11767295943640266, + "grad_norm": 0.8052374124526978, + "learning_rate": 9.922162365876822e-06, + "loss": 0.8785, + "step": 2138 + }, + { + "epoch": 0.11772799823875832, + "grad_norm": 1.0311180353164673, + "learning_rate": 9.922086159896338e-06, + "loss": 0.9112, + "step": 2139 + }, + { + "epoch": 0.11778303704111398, + "grad_norm": 0.943911075592041, + "learning_rate": 9.922009916922773e-06, + "loss": 0.8332, + "step": 2140 + }, + { + "epoch": 0.11783807584346964, + "grad_norm": 0.8156648278236389, + "learning_rate": 9.921933636956697e-06, + "loss": 0.8837, + "step": 2141 + }, + { + "epoch": 0.1178931146458253, + "grad_norm": 0.860292375087738, + "learning_rate": 9.921857319998688e-06, + "loss": 0.7963, + "step": 2142 + }, + { + "epoch": 0.11794815344818096, + "grad_norm": 0.8861456513404846, + "learning_rate": 9.921780966049315e-06, + "loss": 0.8335, + "step": 2143 + }, + { + "epoch": 0.11800319225053663, + "grad_norm": 0.793533205986023, + "learning_rate": 9.921704575109155e-06, + "loss": 0.7881, + "step": 2144 + }, + { + "epoch": 0.11805823105289229, + "grad_norm": 0.8039320111274719, + "learning_rate": 9.921628147178781e-06, + "loss": 0.8369, + "step": 2145 + }, + { + "epoch": 0.11811326985524795, + "grad_norm": 0.8785450458526611, + "learning_rate": 9.921551682258765e-06, + "loss": 0.7981, + "step": 2146 + }, + { + "epoch": 0.11816830865760361, + "grad_norm": 0.810251772403717, + "learning_rate": 9.921475180349687e-06, + "loss": 0.7926, + "step": 2147 + }, + { + "epoch": 0.11822334745995927, + "grad_norm": 0.8470801115036011, + "learning_rate": 9.921398641452117e-06, + "loss": 0.8061, + "step": 2148 + }, + { + "epoch": 0.11827838626231493, + "grad_norm": 0.8147469162940979, + "learning_rate": 9.921322065566633e-06, + "loss": 0.7906, + "step": 2149 + }, + { + "epoch": 0.1183334250646706, + "grad_norm": 0.8792327046394348, + "learning_rate": 9.92124545269381e-06, + "loss": 0.9025, + "step": 2150 + }, + { + "epoch": 0.11838846386702626, + "grad_norm": 0.794607400894165, + "learning_rate": 9.921168802834223e-06, + "loss": 0.8284, + "step": 2151 + }, + { + "epoch": 0.11844350266938192, + "grad_norm": 0.8601556420326233, + "learning_rate": 9.921092115988447e-06, + "loss": 0.8196, + "step": 2152 + }, + { + "epoch": 0.11849854147173758, + "grad_norm": 0.786967933177948, + "learning_rate": 9.921015392157062e-06, + "loss": 0.8744, + "step": 2153 + }, + { + "epoch": 0.11855358027409324, + "grad_norm": 0.8481432199478149, + "learning_rate": 9.920938631340641e-06, + "loss": 0.7206, + "step": 2154 + }, + { + "epoch": 0.1186086190764489, + "grad_norm": 0.8025142550468445, + "learning_rate": 9.920861833539765e-06, + "loss": 0.8126, + "step": 2155 + }, + { + "epoch": 0.11866365787880456, + "grad_norm": 0.9853057265281677, + "learning_rate": 9.920784998755006e-06, + "loss": 0.8883, + "step": 2156 + }, + { + "epoch": 0.11871869668116022, + "grad_norm": 1.0008476972579956, + "learning_rate": 9.920708126986947e-06, + "loss": 0.9326, + "step": 2157 + }, + { + "epoch": 0.11877373548351589, + "grad_norm": 0.837347686290741, + "learning_rate": 9.920631218236161e-06, + "loss": 0.9002, + "step": 2158 + }, + { + "epoch": 0.11882877428587153, + "grad_norm": 0.7866735458374023, + "learning_rate": 9.920554272503227e-06, + "loss": 0.765, + "step": 2159 + }, + { + "epoch": 0.1188838130882272, + "grad_norm": 0.8714935779571533, + "learning_rate": 9.920477289788726e-06, + "loss": 1.0294, + "step": 2160 + }, + { + "epoch": 0.11893885189058286, + "grad_norm": 1.0671826601028442, + "learning_rate": 9.920400270093234e-06, + "loss": 0.8341, + "step": 2161 + }, + { + "epoch": 0.11899389069293852, + "grad_norm": 0.8594604134559631, + "learning_rate": 9.92032321341733e-06, + "loss": 0.8731, + "step": 2162 + }, + { + "epoch": 0.11904892949529418, + "grad_norm": 0.8387738466262817, + "learning_rate": 9.920246119761597e-06, + "loss": 0.7898, + "step": 2163 + }, + { + "epoch": 0.11910396829764984, + "grad_norm": 0.8957195281982422, + "learning_rate": 9.920168989126608e-06, + "loss": 0.8475, + "step": 2164 + }, + { + "epoch": 0.1191590071000055, + "grad_norm": 0.8224207162857056, + "learning_rate": 9.920091821512948e-06, + "loss": 0.7944, + "step": 2165 + }, + { + "epoch": 0.11921404590236116, + "grad_norm": 1.0309031009674072, + "learning_rate": 9.920014616921192e-06, + "loss": 0.8992, + "step": 2166 + }, + { + "epoch": 0.11926908470471682, + "grad_norm": 0.7300832271575928, + "learning_rate": 9.919937375351925e-06, + "loss": 0.7016, + "step": 2167 + }, + { + "epoch": 0.11932412350707249, + "grad_norm": 0.7565537691116333, + "learning_rate": 9.919860096805724e-06, + "loss": 0.8113, + "step": 2168 + }, + { + "epoch": 0.11937916230942815, + "grad_norm": 1.0101505517959595, + "learning_rate": 9.919782781283174e-06, + "loss": 0.8765, + "step": 2169 + }, + { + "epoch": 0.11943420111178381, + "grad_norm": 0.8369461894035339, + "learning_rate": 9.919705428784852e-06, + "loss": 0.8248, + "step": 2170 + }, + { + "epoch": 0.11948923991413947, + "grad_norm": 0.8106105327606201, + "learning_rate": 9.919628039311342e-06, + "loss": 0.8585, + "step": 2171 + }, + { + "epoch": 0.11954427871649513, + "grad_norm": 0.7863745093345642, + "learning_rate": 9.919550612863224e-06, + "loss": 0.8393, + "step": 2172 + }, + { + "epoch": 0.11959931751885079, + "grad_norm": 0.8664719462394714, + "learning_rate": 9.919473149441081e-06, + "loss": 0.8882, + "step": 2173 + }, + { + "epoch": 0.11965435632120645, + "grad_norm": 0.6977574825286865, + "learning_rate": 9.919395649045494e-06, + "loss": 0.7264, + "step": 2174 + }, + { + "epoch": 0.11970939512356212, + "grad_norm": 0.8000102639198303, + "learning_rate": 9.919318111677045e-06, + "loss": 0.7828, + "step": 2175 + }, + { + "epoch": 0.11976443392591778, + "grad_norm": 0.868228018283844, + "learning_rate": 9.91924053733632e-06, + "loss": 0.7904, + "step": 2176 + }, + { + "epoch": 0.11981947272827344, + "grad_norm": 0.839080274105072, + "learning_rate": 9.9191629260239e-06, + "loss": 0.7663, + "step": 2177 + }, + { + "epoch": 0.1198745115306291, + "grad_norm": 0.8222747445106506, + "learning_rate": 9.919085277740366e-06, + "loss": 0.7208, + "step": 2178 + }, + { + "epoch": 0.11992955033298476, + "grad_norm": 1.4550986289978027, + "learning_rate": 9.919007592486304e-06, + "loss": 0.8154, + "step": 2179 + }, + { + "epoch": 0.11998458913534041, + "grad_norm": 0.9110257625579834, + "learning_rate": 9.9189298702623e-06, + "loss": 0.8134, + "step": 2180 + }, + { + "epoch": 0.12003962793769607, + "grad_norm": 0.84796142578125, + "learning_rate": 9.918852111068935e-06, + "loss": 0.8074, + "step": 2181 + }, + { + "epoch": 0.12009466674005173, + "grad_norm": 0.8134179711341858, + "learning_rate": 9.918774314906793e-06, + "loss": 0.6335, + "step": 2182 + }, + { + "epoch": 0.12014970554240739, + "grad_norm": 0.8481448888778687, + "learning_rate": 9.918696481776461e-06, + "loss": 0.8804, + "step": 2183 + }, + { + "epoch": 0.12020474434476305, + "grad_norm": 0.88057941198349, + "learning_rate": 9.918618611678523e-06, + "loss": 0.9326, + "step": 2184 + }, + { + "epoch": 0.12025978314711872, + "grad_norm": 0.8435977697372437, + "learning_rate": 9.918540704613564e-06, + "loss": 0.8141, + "step": 2185 + }, + { + "epoch": 0.12031482194947438, + "grad_norm": 0.8186982870101929, + "learning_rate": 9.918462760582169e-06, + "loss": 0.837, + "step": 2186 + }, + { + "epoch": 0.12036986075183004, + "grad_norm": 0.887783944606781, + "learning_rate": 9.918384779584924e-06, + "loss": 0.8062, + "step": 2187 + }, + { + "epoch": 0.1204248995541857, + "grad_norm": 0.9368415474891663, + "learning_rate": 9.918306761622417e-06, + "loss": 1.0098, + "step": 2188 + }, + { + "epoch": 0.12047993835654136, + "grad_norm": 0.8443986773490906, + "learning_rate": 9.918228706695232e-06, + "loss": 0.8178, + "step": 2189 + }, + { + "epoch": 0.12053497715889702, + "grad_norm": 0.7897284626960754, + "learning_rate": 9.918150614803956e-06, + "loss": 0.8013, + "step": 2190 + }, + { + "epoch": 0.12059001596125268, + "grad_norm": 0.886012077331543, + "learning_rate": 9.91807248594918e-06, + "loss": 0.8141, + "step": 2191 + }, + { + "epoch": 0.12064505476360834, + "grad_norm": 0.8585757613182068, + "learning_rate": 9.917994320131484e-06, + "loss": 0.8381, + "step": 2192 + }, + { + "epoch": 0.120700093565964, + "grad_norm": 1.6192269325256348, + "learning_rate": 9.917916117351459e-06, + "loss": 0.9082, + "step": 2193 + }, + { + "epoch": 0.12075513236831967, + "grad_norm": 1.160414457321167, + "learning_rate": 9.917837877609695e-06, + "loss": 0.8673, + "step": 2194 + }, + { + "epoch": 0.12081017117067533, + "grad_norm": 0.8363412022590637, + "learning_rate": 9.917759600906775e-06, + "loss": 0.816, + "step": 2195 + }, + { + "epoch": 0.12086520997303099, + "grad_norm": 0.8344097137451172, + "learning_rate": 9.917681287243292e-06, + "loss": 0.8629, + "step": 2196 + }, + { + "epoch": 0.12092024877538665, + "grad_norm": 0.9817582368850708, + "learning_rate": 9.917602936619834e-06, + "loss": 0.8106, + "step": 2197 + }, + { + "epoch": 0.12097528757774231, + "grad_norm": 0.8828088641166687, + "learning_rate": 9.917524549036987e-06, + "loss": 0.8465, + "step": 2198 + }, + { + "epoch": 0.12103032638009797, + "grad_norm": 0.8428277969360352, + "learning_rate": 9.917446124495344e-06, + "loss": 0.7721, + "step": 2199 + }, + { + "epoch": 0.12108536518245364, + "grad_norm": 0.8748664855957031, + "learning_rate": 9.917367662995489e-06, + "loss": 0.8679, + "step": 2200 + }, + { + "epoch": 0.1211404039848093, + "grad_norm": 0.8652347922325134, + "learning_rate": 9.917289164538018e-06, + "loss": 0.8906, + "step": 2201 + }, + { + "epoch": 0.12119544278716494, + "grad_norm": 1.157142162322998, + "learning_rate": 9.917210629123518e-06, + "loss": 0.9046, + "step": 2202 + }, + { + "epoch": 0.1212504815895206, + "grad_norm": 0.8186333179473877, + "learning_rate": 9.917132056752576e-06, + "loss": 0.8494, + "step": 2203 + }, + { + "epoch": 0.12130552039187627, + "grad_norm": 0.7769078612327576, + "learning_rate": 9.917053447425788e-06, + "loss": 0.8018, + "step": 2204 + }, + { + "epoch": 0.12136055919423193, + "grad_norm": 0.9190469980239868, + "learning_rate": 9.916974801143742e-06, + "loss": 0.8206, + "step": 2205 + }, + { + "epoch": 0.12141559799658759, + "grad_norm": 1.2200725078582764, + "learning_rate": 9.91689611790703e-06, + "loss": 0.9109, + "step": 2206 + }, + { + "epoch": 0.12147063679894325, + "grad_norm": 0.7902093529701233, + "learning_rate": 9.916817397716243e-06, + "loss": 0.8314, + "step": 2207 + }, + { + "epoch": 0.12152567560129891, + "grad_norm": 0.8160610198974609, + "learning_rate": 9.91673864057197e-06, + "loss": 0.8605, + "step": 2208 + }, + { + "epoch": 0.12158071440365457, + "grad_norm": 0.833163857460022, + "learning_rate": 9.916659846474807e-06, + "loss": 0.8125, + "step": 2209 + }, + { + "epoch": 0.12163575320601024, + "grad_norm": 0.776314377784729, + "learning_rate": 9.916581015425346e-06, + "loss": 0.8137, + "step": 2210 + }, + { + "epoch": 0.1216907920083659, + "grad_norm": 0.8525915145874023, + "learning_rate": 9.916502147424178e-06, + "loss": 0.8703, + "step": 2211 + }, + { + "epoch": 0.12174583081072156, + "grad_norm": 0.8268684148788452, + "learning_rate": 9.916423242471895e-06, + "loss": 0.7775, + "step": 2212 + }, + { + "epoch": 0.12180086961307722, + "grad_norm": 0.8717706799507141, + "learning_rate": 9.916344300569091e-06, + "loss": 0.8002, + "step": 2213 + }, + { + "epoch": 0.12185590841543288, + "grad_norm": 0.9499961137771606, + "learning_rate": 9.91626532171636e-06, + "loss": 0.8861, + "step": 2214 + }, + { + "epoch": 0.12191094721778854, + "grad_norm": 0.9521885514259338, + "learning_rate": 9.916186305914296e-06, + "loss": 0.7602, + "step": 2215 + }, + { + "epoch": 0.1219659860201442, + "grad_norm": 0.8945447206497192, + "learning_rate": 9.916107253163488e-06, + "loss": 0.8603, + "step": 2216 + }, + { + "epoch": 0.12202102482249987, + "grad_norm": 0.8232392072677612, + "learning_rate": 9.916028163464536e-06, + "loss": 0.8419, + "step": 2217 + }, + { + "epoch": 0.12207606362485553, + "grad_norm": 0.8183467984199524, + "learning_rate": 9.915949036818032e-06, + "loss": 0.9038, + "step": 2218 + }, + { + "epoch": 0.12213110242721119, + "grad_norm": 0.7805467247962952, + "learning_rate": 9.915869873224571e-06, + "loss": 0.7313, + "step": 2219 + }, + { + "epoch": 0.12218614122956685, + "grad_norm": 0.838101327419281, + "learning_rate": 9.915790672684749e-06, + "loss": 0.7973, + "step": 2220 + }, + { + "epoch": 0.12224118003192251, + "grad_norm": 0.7795171141624451, + "learning_rate": 9.915711435199158e-06, + "loss": 0.7796, + "step": 2221 + }, + { + "epoch": 0.12229621883427817, + "grad_norm": 0.7971234917640686, + "learning_rate": 9.915632160768398e-06, + "loss": 0.8309, + "step": 2222 + }, + { + "epoch": 0.12235125763663382, + "grad_norm": 0.8543851375579834, + "learning_rate": 9.915552849393061e-06, + "loss": 0.7826, + "step": 2223 + }, + { + "epoch": 0.12240629643898948, + "grad_norm": 0.9315086007118225, + "learning_rate": 9.915473501073744e-06, + "loss": 0.9294, + "step": 2224 + }, + { + "epoch": 0.12246133524134514, + "grad_norm": 0.8794427514076233, + "learning_rate": 9.915394115811046e-06, + "loss": 0.8968, + "step": 2225 + }, + { + "epoch": 0.1225163740437008, + "grad_norm": 0.9499204754829407, + "learning_rate": 9.91531469360556e-06, + "loss": 0.9841, + "step": 2226 + }, + { + "epoch": 0.12257141284605647, + "grad_norm": 0.9233788251876831, + "learning_rate": 9.915235234457885e-06, + "loss": 0.7794, + "step": 2227 + }, + { + "epoch": 0.12262645164841213, + "grad_norm": 0.8971870541572571, + "learning_rate": 9.915155738368618e-06, + "loss": 0.919, + "step": 2228 + }, + { + "epoch": 0.12268149045076779, + "grad_norm": 0.8122105002403259, + "learning_rate": 9.915076205338356e-06, + "loss": 0.8227, + "step": 2229 + }, + { + "epoch": 0.12273652925312345, + "grad_norm": 0.7878004908561707, + "learning_rate": 9.914996635367696e-06, + "loss": 0.7622, + "step": 2230 + }, + { + "epoch": 0.12279156805547911, + "grad_norm": 0.8229606747627258, + "learning_rate": 9.914917028457238e-06, + "loss": 0.8265, + "step": 2231 + }, + { + "epoch": 0.12284660685783477, + "grad_norm": 0.8972312808036804, + "learning_rate": 9.914837384607578e-06, + "loss": 0.8914, + "step": 2232 + }, + { + "epoch": 0.12290164566019043, + "grad_norm": 0.762922465801239, + "learning_rate": 9.914757703819318e-06, + "loss": 0.6853, + "step": 2233 + }, + { + "epoch": 0.1229566844625461, + "grad_norm": 0.8949442505836487, + "learning_rate": 9.914677986093054e-06, + "loss": 0.8303, + "step": 2234 + }, + { + "epoch": 0.12301172326490176, + "grad_norm": 1.0220820903778076, + "learning_rate": 9.914598231429384e-06, + "loss": 1.0027, + "step": 2235 + }, + { + "epoch": 0.12306676206725742, + "grad_norm": 0.8265436887741089, + "learning_rate": 9.914518439828911e-06, + "loss": 0.8317, + "step": 2236 + }, + { + "epoch": 0.12312180086961308, + "grad_norm": 0.780444324016571, + "learning_rate": 9.914438611292231e-06, + "loss": 0.756, + "step": 2237 + }, + { + "epoch": 0.12317683967196874, + "grad_norm": 0.8569482564926147, + "learning_rate": 9.914358745819948e-06, + "loss": 0.8126, + "step": 2238 + }, + { + "epoch": 0.1232318784743244, + "grad_norm": 0.8167145848274231, + "learning_rate": 9.91427884341266e-06, + "loss": 0.8345, + "step": 2239 + }, + { + "epoch": 0.12328691727668006, + "grad_norm": 0.7915990948677063, + "learning_rate": 9.914198904070967e-06, + "loss": 0.7416, + "step": 2240 + }, + { + "epoch": 0.12334195607903573, + "grad_norm": 0.8568083047866821, + "learning_rate": 9.91411892779547e-06, + "loss": 0.8329, + "step": 2241 + }, + { + "epoch": 0.12339699488139139, + "grad_norm": 1.1727303266525269, + "learning_rate": 9.914038914586772e-06, + "loss": 0.8421, + "step": 2242 + }, + { + "epoch": 0.12345203368374705, + "grad_norm": 0.8706398010253906, + "learning_rate": 9.913958864445472e-06, + "loss": 0.9013, + "step": 2243 + }, + { + "epoch": 0.12350707248610271, + "grad_norm": 0.8376144170761108, + "learning_rate": 9.913878777372173e-06, + "loss": 0.8456, + "step": 2244 + }, + { + "epoch": 0.12356211128845836, + "grad_norm": 0.8388974070549011, + "learning_rate": 9.913798653367478e-06, + "loss": 0.787, + "step": 2245 + }, + { + "epoch": 0.12361715009081402, + "grad_norm": 0.8625446557998657, + "learning_rate": 9.913718492431984e-06, + "loss": 0.7758, + "step": 2246 + }, + { + "epoch": 0.12367218889316968, + "grad_norm": 0.8805570006370544, + "learning_rate": 9.913638294566299e-06, + "loss": 0.8755, + "step": 2247 + }, + { + "epoch": 0.12372722769552534, + "grad_norm": 0.8102611899375916, + "learning_rate": 9.913558059771025e-06, + "loss": 0.8495, + "step": 2248 + }, + { + "epoch": 0.123782266497881, + "grad_norm": 0.8506311774253845, + "learning_rate": 9.913477788046762e-06, + "loss": 0.7413, + "step": 2249 + }, + { + "epoch": 0.12383730530023666, + "grad_norm": 1.0789196491241455, + "learning_rate": 9.913397479394116e-06, + "loss": 0.8993, + "step": 2250 + }, + { + "epoch": 0.12389234410259232, + "grad_norm": 1.5664849281311035, + "learning_rate": 9.91331713381369e-06, + "loss": 0.8322, + "step": 2251 + }, + { + "epoch": 0.12394738290494799, + "grad_norm": 1.1347390413284302, + "learning_rate": 9.913236751306085e-06, + "loss": 0.8756, + "step": 2252 + }, + { + "epoch": 0.12400242170730365, + "grad_norm": 0.8111063241958618, + "learning_rate": 9.913156331871911e-06, + "loss": 0.831, + "step": 2253 + }, + { + "epoch": 0.12405746050965931, + "grad_norm": 0.817812979221344, + "learning_rate": 9.913075875511769e-06, + "loss": 0.8531, + "step": 2254 + }, + { + "epoch": 0.12411249931201497, + "grad_norm": 0.7678318619728088, + "learning_rate": 9.912995382226263e-06, + "loss": 0.8028, + "step": 2255 + }, + { + "epoch": 0.12416753811437063, + "grad_norm": 0.8207805156707764, + "learning_rate": 9.912914852015998e-06, + "loss": 0.8856, + "step": 2256 + }, + { + "epoch": 0.1242225769167263, + "grad_norm": 0.978484570980072, + "learning_rate": 9.912834284881582e-06, + "loss": 0.933, + "step": 2257 + }, + { + "epoch": 0.12427761571908195, + "grad_norm": 0.9215858578681946, + "learning_rate": 9.912753680823617e-06, + "loss": 0.7771, + "step": 2258 + }, + { + "epoch": 0.12433265452143762, + "grad_norm": 0.8542179465293884, + "learning_rate": 9.91267303984271e-06, + "loss": 0.8652, + "step": 2259 + }, + { + "epoch": 0.12438769332379328, + "grad_norm": 0.7985575199127197, + "learning_rate": 9.912592361939469e-06, + "loss": 0.7011, + "step": 2260 + }, + { + "epoch": 0.12444273212614894, + "grad_norm": 0.8868670463562012, + "learning_rate": 9.912511647114498e-06, + "loss": 0.8222, + "step": 2261 + }, + { + "epoch": 0.1244977709285046, + "grad_norm": 0.7966209650039673, + "learning_rate": 9.912430895368405e-06, + "loss": 0.776, + "step": 2262 + }, + { + "epoch": 0.12455280973086026, + "grad_norm": 0.7844830751419067, + "learning_rate": 9.912350106701796e-06, + "loss": 0.7513, + "step": 2263 + }, + { + "epoch": 0.12460784853321592, + "grad_norm": 0.7788559794425964, + "learning_rate": 9.912269281115278e-06, + "loss": 0.8517, + "step": 2264 + }, + { + "epoch": 0.12466288733557158, + "grad_norm": 0.778225839138031, + "learning_rate": 9.912188418609461e-06, + "loss": 0.7504, + "step": 2265 + }, + { + "epoch": 0.12471792613792723, + "grad_norm": 0.7955968976020813, + "learning_rate": 9.912107519184947e-06, + "loss": 0.8152, + "step": 2266 + }, + { + "epoch": 0.1247729649402829, + "grad_norm": 1.1202566623687744, + "learning_rate": 9.912026582842352e-06, + "loss": 0.9325, + "step": 2267 + }, + { + "epoch": 0.12482800374263855, + "grad_norm": 0.9762749671936035, + "learning_rate": 9.911945609582279e-06, + "loss": 0.9027, + "step": 2268 + }, + { + "epoch": 0.12488304254499422, + "grad_norm": 0.8311051726341248, + "learning_rate": 9.911864599405336e-06, + "loss": 0.838, + "step": 2269 + }, + { + "epoch": 0.12493808134734988, + "grad_norm": 1.0136815309524536, + "learning_rate": 9.911783552312134e-06, + "loss": 0.9288, + "step": 2270 + }, + { + "epoch": 0.12499312014970554, + "grad_norm": 0.7960494160652161, + "learning_rate": 9.911702468303282e-06, + "loss": 0.8007, + "step": 2271 + }, + { + "epoch": 0.1250481589520612, + "grad_norm": 0.9980880618095398, + "learning_rate": 9.911621347379388e-06, + "loss": 0.8613, + "step": 2272 + }, + { + "epoch": 0.12510319775441686, + "grad_norm": 0.8916807770729065, + "learning_rate": 9.911540189541065e-06, + "loss": 0.8783, + "step": 2273 + }, + { + "epoch": 0.12515823655677252, + "grad_norm": 0.9455892443656921, + "learning_rate": 9.911458994788919e-06, + "loss": 0.8676, + "step": 2274 + }, + { + "epoch": 0.12521327535912818, + "grad_norm": 0.7649906277656555, + "learning_rate": 9.911377763123561e-06, + "loss": 0.7763, + "step": 2275 + }, + { + "epoch": 0.12526831416148385, + "grad_norm": 0.8971202373504639, + "learning_rate": 9.911296494545604e-06, + "loss": 0.9022, + "step": 2276 + }, + { + "epoch": 0.1253233529638395, + "grad_norm": 0.833678126335144, + "learning_rate": 9.911215189055657e-06, + "loss": 0.8401, + "step": 2277 + }, + { + "epoch": 0.12537839176619517, + "grad_norm": 0.8967958688735962, + "learning_rate": 9.911133846654331e-06, + "loss": 0.8678, + "step": 2278 + }, + { + "epoch": 0.12543343056855083, + "grad_norm": 0.8195546865463257, + "learning_rate": 9.911052467342239e-06, + "loss": 0.842, + "step": 2279 + }, + { + "epoch": 0.1254884693709065, + "grad_norm": 1.095815896987915, + "learning_rate": 9.910971051119988e-06, + "loss": 0.845, + "step": 2280 + }, + { + "epoch": 0.12554350817326215, + "grad_norm": 0.9452629685401917, + "learning_rate": 9.910889597988197e-06, + "loss": 0.8971, + "step": 2281 + }, + { + "epoch": 0.12559854697561781, + "grad_norm": 0.9872332215309143, + "learning_rate": 9.910808107947471e-06, + "loss": 0.7994, + "step": 2282 + }, + { + "epoch": 0.12565358577797348, + "grad_norm": 0.7761966586112976, + "learning_rate": 9.910726580998427e-06, + "loss": 0.7791, + "step": 2283 + }, + { + "epoch": 0.12570862458032914, + "grad_norm": 0.8950315713882446, + "learning_rate": 9.910645017141678e-06, + "loss": 0.8499, + "step": 2284 + }, + { + "epoch": 0.1257636633826848, + "grad_norm": 0.8796371221542358, + "learning_rate": 9.910563416377834e-06, + "loss": 0.8587, + "step": 2285 + }, + { + "epoch": 0.12581870218504046, + "grad_norm": 0.8291982412338257, + "learning_rate": 9.91048177870751e-06, + "loss": 0.9166, + "step": 2286 + }, + { + "epoch": 0.12587374098739612, + "grad_norm": 0.758369505405426, + "learning_rate": 9.91040010413132e-06, + "loss": 0.8305, + "step": 2287 + }, + { + "epoch": 0.12592877978975178, + "grad_norm": 0.8775640726089478, + "learning_rate": 9.910318392649876e-06, + "loss": 0.8513, + "step": 2288 + }, + { + "epoch": 0.12598381859210744, + "grad_norm": 0.8581671118736267, + "learning_rate": 9.910236644263796e-06, + "loss": 0.8134, + "step": 2289 + }, + { + "epoch": 0.1260388573944631, + "grad_norm": 0.8570736050605774, + "learning_rate": 9.910154858973689e-06, + "loss": 0.826, + "step": 2290 + }, + { + "epoch": 0.12609389619681877, + "grad_norm": 0.8712487816810608, + "learning_rate": 9.910073036780173e-06, + "loss": 0.8042, + "step": 2291 + }, + { + "epoch": 0.12614893499917443, + "grad_norm": 0.7584837675094604, + "learning_rate": 9.909991177683862e-06, + "loss": 0.7715, + "step": 2292 + }, + { + "epoch": 0.1262039738015301, + "grad_norm": 0.8618917465209961, + "learning_rate": 9.909909281685373e-06, + "loss": 0.8755, + "step": 2293 + }, + { + "epoch": 0.12625901260388575, + "grad_norm": 0.9530277848243713, + "learning_rate": 9.90982734878532e-06, + "loss": 0.8538, + "step": 2294 + }, + { + "epoch": 0.1263140514062414, + "grad_norm": 0.8394436836242676, + "learning_rate": 9.909745378984319e-06, + "loss": 0.8401, + "step": 2295 + }, + { + "epoch": 0.12636909020859707, + "grad_norm": 0.8224034309387207, + "learning_rate": 9.909663372282984e-06, + "loss": 0.7201, + "step": 2296 + }, + { + "epoch": 0.12642412901095273, + "grad_norm": 0.8215349912643433, + "learning_rate": 9.909581328681934e-06, + "loss": 0.8824, + "step": 2297 + }, + { + "epoch": 0.12647916781330837, + "grad_norm": 0.839389443397522, + "learning_rate": 9.909499248181786e-06, + "loss": 0.8056, + "step": 2298 + }, + { + "epoch": 0.12653420661566403, + "grad_norm": 0.9440048933029175, + "learning_rate": 9.909417130783156e-06, + "loss": 0.908, + "step": 2299 + }, + { + "epoch": 0.1265892454180197, + "grad_norm": 0.8336486220359802, + "learning_rate": 9.90933497648666e-06, + "loss": 0.8382, + "step": 2300 + }, + { + "epoch": 0.12664428422037535, + "grad_norm": 1.1541366577148438, + "learning_rate": 9.909252785292918e-06, + "loss": 0.8782, + "step": 2301 + }, + { + "epoch": 0.12669932302273101, + "grad_norm": 0.8730320334434509, + "learning_rate": 9.909170557202545e-06, + "loss": 0.7687, + "step": 2302 + }, + { + "epoch": 0.12675436182508668, + "grad_norm": 0.9927527904510498, + "learning_rate": 9.90908829221616e-06, + "loss": 0.8134, + "step": 2303 + }, + { + "epoch": 0.12680940062744234, + "grad_norm": 0.9521791338920593, + "learning_rate": 9.909005990334381e-06, + "loss": 0.9187, + "step": 2304 + }, + { + "epoch": 0.126864439429798, + "grad_norm": 0.8012455701828003, + "learning_rate": 9.908923651557828e-06, + "loss": 0.8581, + "step": 2305 + }, + { + "epoch": 0.12691947823215366, + "grad_norm": 0.8882689476013184, + "learning_rate": 9.90884127588712e-06, + "loss": 0.9317, + "step": 2306 + }, + { + "epoch": 0.12697451703450932, + "grad_norm": 0.8408340215682983, + "learning_rate": 9.908758863322872e-06, + "loss": 0.8444, + "step": 2307 + }, + { + "epoch": 0.12702955583686498, + "grad_norm": 0.7856307029724121, + "learning_rate": 9.908676413865709e-06, + "loss": 0.8457, + "step": 2308 + }, + { + "epoch": 0.12708459463922064, + "grad_norm": 0.9459167718887329, + "learning_rate": 9.908593927516247e-06, + "loss": 0.8153, + "step": 2309 + }, + { + "epoch": 0.1271396334415763, + "grad_norm": 0.8629655838012695, + "learning_rate": 9.908511404275107e-06, + "loss": 0.8279, + "step": 2310 + }, + { + "epoch": 0.12719467224393197, + "grad_norm": 1.2012875080108643, + "learning_rate": 9.90842884414291e-06, + "loss": 1.4388, + "step": 2311 + }, + { + "epoch": 0.12724971104628763, + "grad_norm": 1.20725417137146, + "learning_rate": 9.908346247120274e-06, + "loss": 0.8704, + "step": 2312 + }, + { + "epoch": 0.1273047498486433, + "grad_norm": 0.8152929544448853, + "learning_rate": 9.908263613207822e-06, + "loss": 0.8618, + "step": 2313 + }, + { + "epoch": 0.12735978865099895, + "grad_norm": 0.8400965332984924, + "learning_rate": 9.908180942406175e-06, + "loss": 0.7881, + "step": 2314 + }, + { + "epoch": 0.1274148274533546, + "grad_norm": 0.8856974840164185, + "learning_rate": 9.908098234715956e-06, + "loss": 0.9073, + "step": 2315 + }, + { + "epoch": 0.12746986625571027, + "grad_norm": 0.8708439469337463, + "learning_rate": 9.908015490137782e-06, + "loss": 0.8099, + "step": 2316 + }, + { + "epoch": 0.12752490505806593, + "grad_norm": 0.8632444143295288, + "learning_rate": 9.907932708672277e-06, + "loss": 0.8472, + "step": 2317 + }, + { + "epoch": 0.1275799438604216, + "grad_norm": 0.8977149128913879, + "learning_rate": 9.907849890320062e-06, + "loss": 0.8878, + "step": 2318 + }, + { + "epoch": 0.12763498266277726, + "grad_norm": 0.8589425086975098, + "learning_rate": 9.907767035081765e-06, + "loss": 0.7905, + "step": 2319 + }, + { + "epoch": 0.12769002146513292, + "grad_norm": 0.9873501062393188, + "learning_rate": 9.907684142958002e-06, + "loss": 0.9002, + "step": 2320 + }, + { + "epoch": 0.12774506026748858, + "grad_norm": 0.8963840007781982, + "learning_rate": 9.9076012139494e-06, + "loss": 0.92, + "step": 2321 + }, + { + "epoch": 0.12780009906984424, + "grad_norm": 0.7933574318885803, + "learning_rate": 9.90751824805658e-06, + "loss": 0.7664, + "step": 2322 + }, + { + "epoch": 0.1278551378721999, + "grad_norm": 0.9660933017730713, + "learning_rate": 9.907435245280167e-06, + "loss": 0.9162, + "step": 2323 + }, + { + "epoch": 0.12791017667455556, + "grad_norm": 0.8698949217796326, + "learning_rate": 9.907352205620783e-06, + "loss": 0.7988, + "step": 2324 + }, + { + "epoch": 0.12796521547691123, + "grad_norm": 0.9077615141868591, + "learning_rate": 9.907269129079055e-06, + "loss": 0.8581, + "step": 2325 + }, + { + "epoch": 0.1280202542792669, + "grad_norm": 0.9128179550170898, + "learning_rate": 9.907186015655607e-06, + "loss": 0.8552, + "step": 2326 + }, + { + "epoch": 0.12807529308162255, + "grad_norm": 0.9321265816688538, + "learning_rate": 9.907102865351062e-06, + "loss": 0.889, + "step": 2327 + }, + { + "epoch": 0.1281303318839782, + "grad_norm": 0.9687464833259583, + "learning_rate": 9.907019678166044e-06, + "loss": 0.7944, + "step": 2328 + }, + { + "epoch": 0.12818537068633387, + "grad_norm": 0.862223207950592, + "learning_rate": 9.90693645410118e-06, + "loss": 0.7699, + "step": 2329 + }, + { + "epoch": 0.12824040948868953, + "grad_norm": 0.9662127494812012, + "learning_rate": 9.906853193157095e-06, + "loss": 0.7818, + "step": 2330 + }, + { + "epoch": 0.1282954482910452, + "grad_norm": 0.8008295297622681, + "learning_rate": 9.906769895334413e-06, + "loss": 0.8443, + "step": 2331 + }, + { + "epoch": 0.12835048709340086, + "grad_norm": 0.8638464212417603, + "learning_rate": 9.906686560633765e-06, + "loss": 0.8438, + "step": 2332 + }, + { + "epoch": 0.12840552589575652, + "grad_norm": 0.9215866327285767, + "learning_rate": 9.906603189055773e-06, + "loss": 0.7481, + "step": 2333 + }, + { + "epoch": 0.12846056469811218, + "grad_norm": 0.7926739454269409, + "learning_rate": 9.906519780601066e-06, + "loss": 0.7404, + "step": 2334 + }, + { + "epoch": 0.12851560350046784, + "grad_norm": 0.9590242505073547, + "learning_rate": 9.906436335270268e-06, + "loss": 0.8319, + "step": 2335 + }, + { + "epoch": 0.1285706423028235, + "grad_norm": 1.0300076007843018, + "learning_rate": 9.906352853064009e-06, + "loss": 0.8635, + "step": 2336 + }, + { + "epoch": 0.12862568110517916, + "grad_norm": 0.8401443958282471, + "learning_rate": 9.906269333982915e-06, + "loss": 0.9584, + "step": 2337 + }, + { + "epoch": 0.12868071990753482, + "grad_norm": 0.8144069910049438, + "learning_rate": 9.906185778027613e-06, + "loss": 0.7375, + "step": 2338 + }, + { + "epoch": 0.12873575870989049, + "grad_norm": 0.8513948917388916, + "learning_rate": 9.906102185198733e-06, + "loss": 0.8353, + "step": 2339 + }, + { + "epoch": 0.12879079751224615, + "grad_norm": 0.8243077397346497, + "learning_rate": 9.906018555496903e-06, + "loss": 0.8665, + "step": 2340 + }, + { + "epoch": 0.12884583631460178, + "grad_norm": 0.8699066042900085, + "learning_rate": 9.905934888922749e-06, + "loss": 0.8537, + "step": 2341 + }, + { + "epoch": 0.12890087511695744, + "grad_norm": 1.0980210304260254, + "learning_rate": 9.905851185476902e-06, + "loss": 0.8887, + "step": 2342 + }, + { + "epoch": 0.1289559139193131, + "grad_norm": 0.8189190030097961, + "learning_rate": 9.905767445159992e-06, + "loss": 0.8467, + "step": 2343 + }, + { + "epoch": 0.12901095272166876, + "grad_norm": 0.8273541331291199, + "learning_rate": 9.905683667972645e-06, + "loss": 0.8701, + "step": 2344 + }, + { + "epoch": 0.12906599152402443, + "grad_norm": 0.8987969160079956, + "learning_rate": 9.905599853915496e-06, + "loss": 0.909, + "step": 2345 + }, + { + "epoch": 0.1291210303263801, + "grad_norm": 0.818268895149231, + "learning_rate": 9.905516002989168e-06, + "loss": 0.7946, + "step": 2346 + }, + { + "epoch": 0.12917606912873575, + "grad_norm": 0.7401725053787231, + "learning_rate": 9.905432115194296e-06, + "loss": 0.7006, + "step": 2347 + }, + { + "epoch": 0.1292311079310914, + "grad_norm": 0.8263179659843445, + "learning_rate": 9.905348190531511e-06, + "loss": 0.7768, + "step": 2348 + }, + { + "epoch": 0.12928614673344707, + "grad_norm": 0.9241918921470642, + "learning_rate": 9.90526422900144e-06, + "loss": 0.8593, + "step": 2349 + }, + { + "epoch": 0.12934118553580273, + "grad_norm": 0.7804501056671143, + "learning_rate": 9.905180230604718e-06, + "loss": 0.7607, + "step": 2350 + }, + { + "epoch": 0.1293962243381584, + "grad_norm": 0.9408491253852844, + "learning_rate": 9.905096195341973e-06, + "loss": 0.8906, + "step": 2351 + }, + { + "epoch": 0.12945126314051406, + "grad_norm": 1.0356301069259644, + "learning_rate": 9.905012123213838e-06, + "loss": 0.8051, + "step": 2352 + }, + { + "epoch": 0.12950630194286972, + "grad_norm": 0.8546886444091797, + "learning_rate": 9.904928014220945e-06, + "loss": 0.7543, + "step": 2353 + }, + { + "epoch": 0.12956134074522538, + "grad_norm": 0.9229897856712341, + "learning_rate": 9.904843868363927e-06, + "loss": 0.8823, + "step": 2354 + }, + { + "epoch": 0.12961637954758104, + "grad_norm": 0.8364199995994568, + "learning_rate": 9.904759685643414e-06, + "loss": 0.8825, + "step": 2355 + }, + { + "epoch": 0.1296714183499367, + "grad_norm": 0.9092077016830444, + "learning_rate": 9.90467546606004e-06, + "loss": 0.8721, + "step": 2356 + }, + { + "epoch": 0.12972645715229236, + "grad_norm": 1.042973518371582, + "learning_rate": 9.904591209614441e-06, + "loss": 0.7984, + "step": 2357 + }, + { + "epoch": 0.12978149595464802, + "grad_norm": 0.7262618541717529, + "learning_rate": 9.904506916307243e-06, + "loss": 0.6721, + "step": 2358 + }, + { + "epoch": 0.12983653475700369, + "grad_norm": 0.7562826871871948, + "learning_rate": 9.904422586139086e-06, + "loss": 0.7702, + "step": 2359 + }, + { + "epoch": 0.12989157355935935, + "grad_norm": 0.8821595907211304, + "learning_rate": 9.904338219110603e-06, + "loss": 0.8555, + "step": 2360 + }, + { + "epoch": 0.129946612361715, + "grad_norm": 1.0340098142623901, + "learning_rate": 9.904253815222424e-06, + "loss": 0.9004, + "step": 2361 + }, + { + "epoch": 0.13000165116407067, + "grad_norm": 0.8533693552017212, + "learning_rate": 9.904169374475188e-06, + "loss": 0.836, + "step": 2362 + }, + { + "epoch": 0.13005668996642633, + "grad_norm": 0.8564199805259705, + "learning_rate": 9.904084896869528e-06, + "loss": 0.9281, + "step": 2363 + }, + { + "epoch": 0.130111728768782, + "grad_norm": 0.7817538976669312, + "learning_rate": 9.904000382406079e-06, + "loss": 0.7444, + "step": 2364 + }, + { + "epoch": 0.13016676757113765, + "grad_norm": 1.1420893669128418, + "learning_rate": 9.903915831085473e-06, + "loss": 0.9116, + "step": 2365 + }, + { + "epoch": 0.13022180637349332, + "grad_norm": 0.9671920537948608, + "learning_rate": 9.903831242908351e-06, + "loss": 0.899, + "step": 2366 + }, + { + "epoch": 0.13027684517584898, + "grad_norm": 0.8528717756271362, + "learning_rate": 9.903746617875345e-06, + "loss": 0.7231, + "step": 2367 + }, + { + "epoch": 0.13033188397820464, + "grad_norm": 0.786960244178772, + "learning_rate": 9.903661955987091e-06, + "loss": 0.7997, + "step": 2368 + }, + { + "epoch": 0.1303869227805603, + "grad_norm": 0.941683292388916, + "learning_rate": 9.903577257244228e-06, + "loss": 0.9127, + "step": 2369 + }, + { + "epoch": 0.13044196158291596, + "grad_norm": 0.886900007724762, + "learning_rate": 9.903492521647391e-06, + "loss": 0.9086, + "step": 2370 + }, + { + "epoch": 0.13049700038527162, + "grad_norm": 0.9924801588058472, + "learning_rate": 9.903407749197216e-06, + "loss": 0.9055, + "step": 2371 + }, + { + "epoch": 0.13055203918762728, + "grad_norm": 0.6998724341392517, + "learning_rate": 9.903322939894342e-06, + "loss": 0.6972, + "step": 2372 + }, + { + "epoch": 0.13060707798998294, + "grad_norm": 0.8448702096939087, + "learning_rate": 9.903238093739404e-06, + "loss": 0.7862, + "step": 2373 + }, + { + "epoch": 0.1306621167923386, + "grad_norm": 0.8557441830635071, + "learning_rate": 9.90315321073304e-06, + "loss": 0.8364, + "step": 2374 + }, + { + "epoch": 0.13071715559469427, + "grad_norm": 0.7978441119194031, + "learning_rate": 9.903068290875892e-06, + "loss": 0.7671, + "step": 2375 + }, + { + "epoch": 0.13077219439704993, + "grad_norm": 0.781315803527832, + "learning_rate": 9.902983334168594e-06, + "loss": 0.7963, + "step": 2376 + }, + { + "epoch": 0.1308272331994056, + "grad_norm": 0.7326155304908752, + "learning_rate": 9.902898340611785e-06, + "loss": 0.8, + "step": 2377 + }, + { + "epoch": 0.13088227200176125, + "grad_norm": 0.7693139314651489, + "learning_rate": 9.902813310206105e-06, + "loss": 0.8459, + "step": 2378 + }, + { + "epoch": 0.1309373108041169, + "grad_norm": 0.9441308975219727, + "learning_rate": 9.902728242952191e-06, + "loss": 0.8519, + "step": 2379 + }, + { + "epoch": 0.13099234960647257, + "grad_norm": 0.8350616693496704, + "learning_rate": 9.902643138850686e-06, + "loss": 0.876, + "step": 2380 + }, + { + "epoch": 0.13104738840882824, + "grad_norm": 0.8675554394721985, + "learning_rate": 9.902557997902227e-06, + "loss": 0.8172, + "step": 2381 + }, + { + "epoch": 0.1311024272111839, + "grad_norm": 0.9618930220603943, + "learning_rate": 9.902472820107454e-06, + "loss": 0.8852, + "step": 2382 + }, + { + "epoch": 0.13115746601353956, + "grad_norm": 0.862341046333313, + "learning_rate": 9.902387605467007e-06, + "loss": 0.9256, + "step": 2383 + }, + { + "epoch": 0.1312125048158952, + "grad_norm": 0.8749859929084778, + "learning_rate": 9.902302353981527e-06, + "loss": 0.8809, + "step": 2384 + }, + { + "epoch": 0.13126754361825085, + "grad_norm": 0.9061958193778992, + "learning_rate": 9.902217065651657e-06, + "loss": 0.779, + "step": 2385 + }, + { + "epoch": 0.13132258242060652, + "grad_norm": 0.8909298777580261, + "learning_rate": 9.902131740478033e-06, + "loss": 0.8203, + "step": 2386 + }, + { + "epoch": 0.13137762122296218, + "grad_norm": 0.8507269024848938, + "learning_rate": 9.902046378461302e-06, + "loss": 0.776, + "step": 2387 + }, + { + "epoch": 0.13143266002531784, + "grad_norm": 0.9577299356460571, + "learning_rate": 9.901960979602101e-06, + "loss": 0.8104, + "step": 2388 + }, + { + "epoch": 0.1314876988276735, + "grad_norm": 0.9244948625564575, + "learning_rate": 9.901875543901074e-06, + "loss": 0.9035, + "step": 2389 + }, + { + "epoch": 0.13154273763002916, + "grad_norm": 0.7534334063529968, + "learning_rate": 9.901790071358861e-06, + "loss": 0.7262, + "step": 2390 + }, + { + "epoch": 0.13159777643238482, + "grad_norm": 0.8920090198516846, + "learning_rate": 9.901704561976106e-06, + "loss": 0.932, + "step": 2391 + }, + { + "epoch": 0.13165281523474048, + "grad_norm": 0.8524243235588074, + "learning_rate": 9.901619015753455e-06, + "loss": 0.8107, + "step": 2392 + }, + { + "epoch": 0.13170785403709614, + "grad_norm": 0.8170381784439087, + "learning_rate": 9.901533432691543e-06, + "loss": 0.8814, + "step": 2393 + }, + { + "epoch": 0.1317628928394518, + "grad_norm": 0.8281697034835815, + "learning_rate": 9.90144781279102e-06, + "loss": 0.8221, + "step": 2394 + }, + { + "epoch": 0.13181793164180747, + "grad_norm": 0.9283351302146912, + "learning_rate": 9.901362156052528e-06, + "loss": 0.8346, + "step": 2395 + }, + { + "epoch": 0.13187297044416313, + "grad_norm": 0.8331275582313538, + "learning_rate": 9.901276462476708e-06, + "loss": 0.7498, + "step": 2396 + }, + { + "epoch": 0.1319280092465188, + "grad_norm": 0.8427191972732544, + "learning_rate": 9.901190732064207e-06, + "loss": 0.8265, + "step": 2397 + }, + { + "epoch": 0.13198304804887445, + "grad_norm": 0.8510351777076721, + "learning_rate": 9.901104964815669e-06, + "loss": 0.8369, + "step": 2398 + }, + { + "epoch": 0.1320380868512301, + "grad_norm": 0.8468914031982422, + "learning_rate": 9.901019160731738e-06, + "loss": 0.8585, + "step": 2399 + }, + { + "epoch": 0.13209312565358577, + "grad_norm": 0.8302182555198669, + "learning_rate": 9.900933319813058e-06, + "loss": 0.8611, + "step": 2400 + }, + { + "epoch": 0.13214816445594144, + "grad_norm": 0.8527448773384094, + "learning_rate": 9.900847442060277e-06, + "loss": 0.899, + "step": 2401 + }, + { + "epoch": 0.1322032032582971, + "grad_norm": 0.8354688286781311, + "learning_rate": 9.900761527474037e-06, + "loss": 0.8083, + "step": 2402 + }, + { + "epoch": 0.13225824206065276, + "grad_norm": 0.8612173795700073, + "learning_rate": 9.900675576054986e-06, + "loss": 0.8124, + "step": 2403 + }, + { + "epoch": 0.13231328086300842, + "grad_norm": 0.7424876689910889, + "learning_rate": 9.900589587803767e-06, + "loss": 0.6884, + "step": 2404 + }, + { + "epoch": 0.13236831966536408, + "grad_norm": 0.8431115746498108, + "learning_rate": 9.90050356272103e-06, + "loss": 0.9575, + "step": 2405 + }, + { + "epoch": 0.13242335846771974, + "grad_norm": 0.7958092093467712, + "learning_rate": 9.90041750080742e-06, + "loss": 0.7608, + "step": 2406 + }, + { + "epoch": 0.1324783972700754, + "grad_norm": 0.926258385181427, + "learning_rate": 9.900331402063583e-06, + "loss": 0.9072, + "step": 2407 + }, + { + "epoch": 0.13253343607243107, + "grad_norm": 0.7952526807785034, + "learning_rate": 9.900245266490169e-06, + "loss": 0.8001, + "step": 2408 + }, + { + "epoch": 0.13258847487478673, + "grad_norm": 0.8309933543205261, + "learning_rate": 9.900159094087822e-06, + "loss": 0.9154, + "step": 2409 + }, + { + "epoch": 0.1326435136771424, + "grad_norm": 0.858007550239563, + "learning_rate": 9.90007288485719e-06, + "loss": 0.855, + "step": 2410 + }, + { + "epoch": 0.13269855247949805, + "grad_norm": 0.9513822197914124, + "learning_rate": 9.899986638798923e-06, + "loss": 0.8162, + "step": 2411 + }, + { + "epoch": 0.1327535912818537, + "grad_norm": 0.8387427926063538, + "learning_rate": 9.899900355913668e-06, + "loss": 0.8955, + "step": 2412 + }, + { + "epoch": 0.13280863008420937, + "grad_norm": 0.7727940678596497, + "learning_rate": 9.899814036202073e-06, + "loss": 0.6765, + "step": 2413 + }, + { + "epoch": 0.13286366888656503, + "grad_norm": 0.7760928869247437, + "learning_rate": 9.899727679664788e-06, + "loss": 0.7179, + "step": 2414 + }, + { + "epoch": 0.1329187076889207, + "grad_norm": 0.7798073887825012, + "learning_rate": 9.899641286302462e-06, + "loss": 0.8541, + "step": 2415 + }, + { + "epoch": 0.13297374649127636, + "grad_norm": 0.8302769660949707, + "learning_rate": 9.899554856115743e-06, + "loss": 0.8925, + "step": 2416 + }, + { + "epoch": 0.13302878529363202, + "grad_norm": 0.8300751447677612, + "learning_rate": 9.89946838910528e-06, + "loss": 0.7489, + "step": 2417 + }, + { + "epoch": 0.13308382409598768, + "grad_norm": 0.8032094240188599, + "learning_rate": 9.899381885271725e-06, + "loss": 0.811, + "step": 2418 + }, + { + "epoch": 0.13313886289834334, + "grad_norm": 5.237870216369629, + "learning_rate": 9.899295344615727e-06, + "loss": 0.7609, + "step": 2419 + }, + { + "epoch": 0.133193901700699, + "grad_norm": 0.8145740628242493, + "learning_rate": 9.899208767137935e-06, + "loss": 0.8435, + "step": 2420 + }, + { + "epoch": 0.13324894050305466, + "grad_norm": 0.9716018438339233, + "learning_rate": 9.899122152839004e-06, + "loss": 0.7924, + "step": 2421 + }, + { + "epoch": 0.13330397930541033, + "grad_norm": 0.7846183776855469, + "learning_rate": 9.899035501719582e-06, + "loss": 0.8941, + "step": 2422 + }, + { + "epoch": 0.133359018107766, + "grad_norm": 0.7653689980506897, + "learning_rate": 9.89894881378032e-06, + "loss": 0.811, + "step": 2423 + }, + { + "epoch": 0.13341405691012165, + "grad_norm": 0.8221875429153442, + "learning_rate": 9.89886208902187e-06, + "loss": 0.8131, + "step": 2424 + }, + { + "epoch": 0.1334690957124773, + "grad_norm": 0.7422335147857666, + "learning_rate": 9.898775327444885e-06, + "loss": 0.6366, + "step": 2425 + }, + { + "epoch": 0.13352413451483297, + "grad_norm": 0.8072695136070251, + "learning_rate": 9.898688529050014e-06, + "loss": 0.7989, + "step": 2426 + }, + { + "epoch": 0.1335791733171886, + "grad_norm": 0.7717600464820862, + "learning_rate": 9.898601693837911e-06, + "loss": 0.7524, + "step": 2427 + }, + { + "epoch": 0.13363421211954427, + "grad_norm": 0.8070919513702393, + "learning_rate": 9.898514821809231e-06, + "loss": 0.7724, + "step": 2428 + }, + { + "epoch": 0.13368925092189993, + "grad_norm": 0.8184726238250732, + "learning_rate": 9.898427912964624e-06, + "loss": 0.845, + "step": 2429 + }, + { + "epoch": 0.1337442897242556, + "grad_norm": 0.8168759346008301, + "learning_rate": 9.898340967304744e-06, + "loss": 0.8377, + "step": 2430 + }, + { + "epoch": 0.13379932852661125, + "grad_norm": 0.8701872825622559, + "learning_rate": 9.898253984830244e-06, + "loss": 0.908, + "step": 2431 + }, + { + "epoch": 0.1338543673289669, + "grad_norm": 0.8092133402824402, + "learning_rate": 9.898166965541779e-06, + "loss": 0.866, + "step": 2432 + }, + { + "epoch": 0.13390940613132257, + "grad_norm": 0.8337095975875854, + "learning_rate": 9.898079909440002e-06, + "loss": 0.8622, + "step": 2433 + }, + { + "epoch": 0.13396444493367823, + "grad_norm": 1.1016209125518799, + "learning_rate": 9.897992816525567e-06, + "loss": 0.8486, + "step": 2434 + }, + { + "epoch": 0.1340194837360339, + "grad_norm": 0.8136518597602844, + "learning_rate": 9.89790568679913e-06, + "loss": 0.8681, + "step": 2435 + }, + { + "epoch": 0.13407452253838956, + "grad_norm": 0.8202341794967651, + "learning_rate": 9.897818520261344e-06, + "loss": 0.9144, + "step": 2436 + }, + { + "epoch": 0.13412956134074522, + "grad_norm": 0.8836861848831177, + "learning_rate": 9.897731316912866e-06, + "loss": 0.8643, + "step": 2437 + }, + { + "epoch": 0.13418460014310088, + "grad_norm": 0.9040210247039795, + "learning_rate": 9.89764407675435e-06, + "loss": 0.7681, + "step": 2438 + }, + { + "epoch": 0.13423963894545654, + "grad_norm": 0.8762359619140625, + "learning_rate": 9.897556799786452e-06, + "loss": 0.8765, + "step": 2439 + }, + { + "epoch": 0.1342946777478122, + "grad_norm": 0.8859462738037109, + "learning_rate": 9.897469486009827e-06, + "loss": 0.9051, + "step": 2440 + }, + { + "epoch": 0.13434971655016786, + "grad_norm": 0.7727539539337158, + "learning_rate": 9.897382135425134e-06, + "loss": 0.7397, + "step": 2441 + }, + { + "epoch": 0.13440475535252353, + "grad_norm": 0.9018967151641846, + "learning_rate": 9.897294748033028e-06, + "loss": 0.8542, + "step": 2442 + }, + { + "epoch": 0.1344597941548792, + "grad_norm": 0.8228337168693542, + "learning_rate": 9.897207323834165e-06, + "loss": 0.7585, + "step": 2443 + }, + { + "epoch": 0.13451483295723485, + "grad_norm": 0.7509974241256714, + "learning_rate": 9.897119862829203e-06, + "loss": 0.7285, + "step": 2444 + }, + { + "epoch": 0.1345698717595905, + "grad_norm": 0.9225835800170898, + "learning_rate": 9.897032365018797e-06, + "loss": 0.8352, + "step": 2445 + }, + { + "epoch": 0.13462491056194617, + "grad_norm": 0.800981879234314, + "learning_rate": 9.896944830403609e-06, + "loss": 0.7352, + "step": 2446 + }, + { + "epoch": 0.13467994936430183, + "grad_norm": 0.8263673186302185, + "learning_rate": 9.896857258984294e-06, + "loss": 0.8426, + "step": 2447 + }, + { + "epoch": 0.1347349881666575, + "grad_norm": 0.8857110738754272, + "learning_rate": 9.89676965076151e-06, + "loss": 0.8078, + "step": 2448 + }, + { + "epoch": 0.13479002696901315, + "grad_norm": 0.8637158274650574, + "learning_rate": 9.896682005735916e-06, + "loss": 0.8688, + "step": 2449 + }, + { + "epoch": 0.13484506577136882, + "grad_norm": 0.9050095081329346, + "learning_rate": 9.89659432390817e-06, + "loss": 0.831, + "step": 2450 + }, + { + "epoch": 0.13490010457372448, + "grad_norm": 0.829757034778595, + "learning_rate": 9.896506605278933e-06, + "loss": 0.8095, + "step": 2451 + }, + { + "epoch": 0.13495514337608014, + "grad_norm": 0.8910449743270874, + "learning_rate": 9.896418849848864e-06, + "loss": 0.9134, + "step": 2452 + }, + { + "epoch": 0.1350101821784358, + "grad_norm": 0.8856307864189148, + "learning_rate": 9.89633105761862e-06, + "loss": 0.8171, + "step": 2453 + }, + { + "epoch": 0.13506522098079146, + "grad_norm": 0.8159938454627991, + "learning_rate": 9.896243228588864e-06, + "loss": 0.8205, + "step": 2454 + }, + { + "epoch": 0.13512025978314712, + "grad_norm": 0.8200929760932922, + "learning_rate": 9.896155362760254e-06, + "loss": 0.7529, + "step": 2455 + }, + { + "epoch": 0.13517529858550278, + "grad_norm": 0.7591279149055481, + "learning_rate": 9.89606746013345e-06, + "loss": 0.8205, + "step": 2456 + }, + { + "epoch": 0.13523033738785845, + "grad_norm": 0.8598676323890686, + "learning_rate": 9.895979520709114e-06, + "loss": 0.8212, + "step": 2457 + }, + { + "epoch": 0.1352853761902141, + "grad_norm": 0.7290365099906921, + "learning_rate": 9.895891544487905e-06, + "loss": 0.7893, + "step": 2458 + }, + { + "epoch": 0.13534041499256977, + "grad_norm": 0.8040594458580017, + "learning_rate": 9.895803531470487e-06, + "loss": 0.8358, + "step": 2459 + }, + { + "epoch": 0.13539545379492543, + "grad_norm": 0.9286525249481201, + "learning_rate": 9.895715481657522e-06, + "loss": 0.8104, + "step": 2460 + }, + { + "epoch": 0.1354504925972811, + "grad_norm": 0.843054473400116, + "learning_rate": 9.895627395049668e-06, + "loss": 0.7872, + "step": 2461 + }, + { + "epoch": 0.13550553139963675, + "grad_norm": 0.7894387245178223, + "learning_rate": 9.895539271647588e-06, + "loss": 0.8615, + "step": 2462 + }, + { + "epoch": 0.13556057020199241, + "grad_norm": 0.9185294508934021, + "learning_rate": 9.895451111451948e-06, + "loss": 0.8732, + "step": 2463 + }, + { + "epoch": 0.13561560900434808, + "grad_norm": 0.8586474657058716, + "learning_rate": 9.895362914463405e-06, + "loss": 0.9658, + "step": 2464 + }, + { + "epoch": 0.13567064780670374, + "grad_norm": 0.8810474276542664, + "learning_rate": 9.895274680682628e-06, + "loss": 0.8622, + "step": 2465 + }, + { + "epoch": 0.1357256866090594, + "grad_norm": 0.8862990736961365, + "learning_rate": 9.895186410110273e-06, + "loss": 0.916, + "step": 2466 + }, + { + "epoch": 0.13578072541141506, + "grad_norm": 0.7916743159294128, + "learning_rate": 9.89509810274701e-06, + "loss": 0.837, + "step": 2467 + }, + { + "epoch": 0.13583576421377072, + "grad_norm": 0.9063515663146973, + "learning_rate": 9.8950097585935e-06, + "loss": 0.8065, + "step": 2468 + }, + { + "epoch": 0.13589080301612638, + "grad_norm": 0.7656043767929077, + "learning_rate": 9.894921377650405e-06, + "loss": 0.7064, + "step": 2469 + }, + { + "epoch": 0.13594584181848202, + "grad_norm": 1.0630278587341309, + "learning_rate": 9.894832959918392e-06, + "loss": 0.8168, + "step": 2470 + }, + { + "epoch": 0.13600088062083768, + "grad_norm": 0.9118956923484802, + "learning_rate": 9.894744505398126e-06, + "loss": 0.8972, + "step": 2471 + }, + { + "epoch": 0.13605591942319334, + "grad_norm": 0.8989213705062866, + "learning_rate": 9.89465601409027e-06, + "loss": 0.8374, + "step": 2472 + }, + { + "epoch": 0.136110958225549, + "grad_norm": 0.9398229718208313, + "learning_rate": 9.894567485995489e-06, + "loss": 0.8956, + "step": 2473 + }, + { + "epoch": 0.13616599702790466, + "grad_norm": 0.7980280518531799, + "learning_rate": 9.894478921114449e-06, + "loss": 0.8055, + "step": 2474 + }, + { + "epoch": 0.13622103583026032, + "grad_norm": 0.8910034894943237, + "learning_rate": 9.894390319447816e-06, + "loss": 0.8371, + "step": 2475 + }, + { + "epoch": 0.13627607463261598, + "grad_norm": 0.7848070859909058, + "learning_rate": 9.894301680996255e-06, + "loss": 0.8024, + "step": 2476 + }, + { + "epoch": 0.13633111343497165, + "grad_norm": 0.8538175821304321, + "learning_rate": 9.894213005760434e-06, + "loss": 0.8819, + "step": 2477 + }, + { + "epoch": 0.1363861522373273, + "grad_norm": 0.7885367274284363, + "learning_rate": 9.894124293741017e-06, + "loss": 0.7916, + "step": 2478 + }, + { + "epoch": 0.13644119103968297, + "grad_norm": 0.8555673956871033, + "learning_rate": 9.894035544938672e-06, + "loss": 0.8521, + "step": 2479 + }, + { + "epoch": 0.13649622984203863, + "grad_norm": 0.8104771971702576, + "learning_rate": 9.893946759354066e-06, + "loss": 0.8437, + "step": 2480 + }, + { + "epoch": 0.1365512686443943, + "grad_norm": 0.9131864309310913, + "learning_rate": 9.893857936987866e-06, + "loss": 0.8123, + "step": 2481 + }, + { + "epoch": 0.13660630744674995, + "grad_norm": 0.9414293766021729, + "learning_rate": 9.893769077840739e-06, + "loss": 0.7897, + "step": 2482 + }, + { + "epoch": 0.13666134624910561, + "grad_norm": 0.823265016078949, + "learning_rate": 9.893680181913355e-06, + "loss": 0.847, + "step": 2483 + }, + { + "epoch": 0.13671638505146128, + "grad_norm": 0.82098788022995, + "learning_rate": 9.89359124920638e-06, + "loss": 0.7823, + "step": 2484 + }, + { + "epoch": 0.13677142385381694, + "grad_norm": 0.817551851272583, + "learning_rate": 9.893502279720483e-06, + "loss": 0.8084, + "step": 2485 + }, + { + "epoch": 0.1368264626561726, + "grad_norm": 1.0722150802612305, + "learning_rate": 9.893413273456333e-06, + "loss": 0.7394, + "step": 2486 + }, + { + "epoch": 0.13688150145852826, + "grad_norm": 0.8045433759689331, + "learning_rate": 9.893324230414598e-06, + "loss": 0.7528, + "step": 2487 + }, + { + "epoch": 0.13693654026088392, + "grad_norm": 0.8694071173667908, + "learning_rate": 9.893235150595949e-06, + "loss": 0.803, + "step": 2488 + }, + { + "epoch": 0.13699157906323958, + "grad_norm": 0.8238615989685059, + "learning_rate": 9.893146034001054e-06, + "loss": 0.7909, + "step": 2489 + }, + { + "epoch": 0.13704661786559524, + "grad_norm": 0.7782405018806458, + "learning_rate": 9.893056880630583e-06, + "loss": 0.6859, + "step": 2490 + }, + { + "epoch": 0.1371016566679509, + "grad_norm": 0.7865599989891052, + "learning_rate": 9.892967690485207e-06, + "loss": 0.7982, + "step": 2491 + }, + { + "epoch": 0.13715669547030657, + "grad_norm": 0.768120288848877, + "learning_rate": 9.892878463565595e-06, + "loss": 0.8234, + "step": 2492 + }, + { + "epoch": 0.13721173427266223, + "grad_norm": 0.812493085861206, + "learning_rate": 9.89278919987242e-06, + "loss": 0.9152, + "step": 2493 + }, + { + "epoch": 0.1372667730750179, + "grad_norm": 0.7256335616111755, + "learning_rate": 9.892699899406348e-06, + "loss": 0.6703, + "step": 2494 + }, + { + "epoch": 0.13732181187737355, + "grad_norm": 0.8022804260253906, + "learning_rate": 9.892610562168054e-06, + "loss": 0.7918, + "step": 2495 + }, + { + "epoch": 0.1373768506797292, + "grad_norm": 0.8204907774925232, + "learning_rate": 9.89252118815821e-06, + "loss": 0.9094, + "step": 2496 + }, + { + "epoch": 0.13743188948208487, + "grad_norm": 0.9986788630485535, + "learning_rate": 9.892431777377484e-06, + "loss": 0.8921, + "step": 2497 + }, + { + "epoch": 0.13748692828444053, + "grad_norm": 0.7937983870506287, + "learning_rate": 9.892342329826554e-06, + "loss": 0.8048, + "step": 2498 + }, + { + "epoch": 0.1375419670867962, + "grad_norm": 0.9295744895935059, + "learning_rate": 9.892252845506086e-06, + "loss": 0.755, + "step": 2499 + }, + { + "epoch": 0.13759700588915186, + "grad_norm": 0.7920984625816345, + "learning_rate": 9.892163324416757e-06, + "loss": 0.7603, + "step": 2500 + }, + { + "epoch": 0.13765204469150752, + "grad_norm": 0.9229464530944824, + "learning_rate": 9.892073766559236e-06, + "loss": 0.8115, + "step": 2501 + }, + { + "epoch": 0.13770708349386318, + "grad_norm": 0.8205353021621704, + "learning_rate": 9.8919841719342e-06, + "loss": 0.8357, + "step": 2502 + }, + { + "epoch": 0.13776212229621884, + "grad_norm": 0.86461341381073, + "learning_rate": 9.891894540542318e-06, + "loss": 0.748, + "step": 2503 + }, + { + "epoch": 0.1378171610985745, + "grad_norm": 0.767145574092865, + "learning_rate": 9.891804872384267e-06, + "loss": 0.7404, + "step": 2504 + }, + { + "epoch": 0.13787219990093016, + "grad_norm": 0.7492040991783142, + "learning_rate": 9.891715167460721e-06, + "loss": 0.6958, + "step": 2505 + }, + { + "epoch": 0.13792723870328583, + "grad_norm": 0.8643150329589844, + "learning_rate": 9.891625425772353e-06, + "loss": 0.8408, + "step": 2506 + }, + { + "epoch": 0.1379822775056415, + "grad_norm": 0.8026981353759766, + "learning_rate": 9.891535647319838e-06, + "loss": 0.7895, + "step": 2507 + }, + { + "epoch": 0.13803731630799715, + "grad_norm": 1.2780394554138184, + "learning_rate": 9.89144583210385e-06, + "loss": 0.9113, + "step": 2508 + }, + { + "epoch": 0.1380923551103528, + "grad_norm": 0.8476191163063049, + "learning_rate": 9.891355980125064e-06, + "loss": 0.8224, + "step": 2509 + }, + { + "epoch": 0.13814739391270847, + "grad_norm": 1.048682689666748, + "learning_rate": 9.891266091384157e-06, + "loss": 0.8913, + "step": 2510 + }, + { + "epoch": 0.13820243271506413, + "grad_norm": 1.0314993858337402, + "learning_rate": 9.891176165881801e-06, + "loss": 0.8315, + "step": 2511 + }, + { + "epoch": 0.1382574715174198, + "grad_norm": 0.9500058889389038, + "learning_rate": 9.891086203618676e-06, + "loss": 0.9185, + "step": 2512 + }, + { + "epoch": 0.13831251031977543, + "grad_norm": 0.7860653400421143, + "learning_rate": 9.890996204595457e-06, + "loss": 0.804, + "step": 2513 + }, + { + "epoch": 0.1383675491221311, + "grad_norm": 0.8354741930961609, + "learning_rate": 9.89090616881282e-06, + "loss": 0.8214, + "step": 2514 + }, + { + "epoch": 0.13842258792448675, + "grad_norm": 0.9115905165672302, + "learning_rate": 9.890816096271438e-06, + "loss": 0.8801, + "step": 2515 + }, + { + "epoch": 0.1384776267268424, + "grad_norm": 0.8852075338363647, + "learning_rate": 9.890725986971994e-06, + "loss": 0.8821, + "step": 2516 + }, + { + "epoch": 0.13853266552919807, + "grad_norm": 0.804314374923706, + "learning_rate": 9.890635840915164e-06, + "loss": 0.8412, + "step": 2517 + }, + { + "epoch": 0.13858770433155373, + "grad_norm": 0.8242805600166321, + "learning_rate": 9.890545658101623e-06, + "loss": 0.8447, + "step": 2518 + }, + { + "epoch": 0.1386427431339094, + "grad_norm": 0.8385655879974365, + "learning_rate": 9.890455438532048e-06, + "loss": 0.8161, + "step": 2519 + }, + { + "epoch": 0.13869778193626506, + "grad_norm": 0.7950524687767029, + "learning_rate": 9.89036518220712e-06, + "loss": 0.8024, + "step": 2520 + }, + { + "epoch": 0.13875282073862072, + "grad_norm": 1.0031861066818237, + "learning_rate": 9.890274889127518e-06, + "loss": 0.8399, + "step": 2521 + }, + { + "epoch": 0.13880785954097638, + "grad_norm": 0.8403242230415344, + "learning_rate": 9.890184559293917e-06, + "loss": 0.8115, + "step": 2522 + }, + { + "epoch": 0.13886289834333204, + "grad_norm": 0.8389976024627686, + "learning_rate": 9.890094192706998e-06, + "loss": 0.9573, + "step": 2523 + }, + { + "epoch": 0.1389179371456877, + "grad_norm": 0.8408516645431519, + "learning_rate": 9.890003789367442e-06, + "loss": 0.8572, + "step": 2524 + }, + { + "epoch": 0.13897297594804336, + "grad_norm": 0.7607787251472473, + "learning_rate": 9.889913349275925e-06, + "loss": 0.8119, + "step": 2525 + }, + { + "epoch": 0.13902801475039903, + "grad_norm": 0.7696373462677002, + "learning_rate": 9.889822872433127e-06, + "loss": 0.8287, + "step": 2526 + }, + { + "epoch": 0.1390830535527547, + "grad_norm": 0.8518380522727966, + "learning_rate": 9.889732358839732e-06, + "loss": 0.9008, + "step": 2527 + }, + { + "epoch": 0.13913809235511035, + "grad_norm": 0.8851314783096313, + "learning_rate": 9.889641808496416e-06, + "loss": 0.8148, + "step": 2528 + }, + { + "epoch": 0.139193131157466, + "grad_norm": 0.9245797395706177, + "learning_rate": 9.889551221403862e-06, + "loss": 0.846, + "step": 2529 + }, + { + "epoch": 0.13924816995982167, + "grad_norm": 0.8445762991905212, + "learning_rate": 9.889460597562748e-06, + "loss": 0.8306, + "step": 2530 + }, + { + "epoch": 0.13930320876217733, + "grad_norm": 0.9149277806282043, + "learning_rate": 9.88936993697376e-06, + "loss": 0.8033, + "step": 2531 + }, + { + "epoch": 0.139358247564533, + "grad_norm": 0.894666850566864, + "learning_rate": 9.889279239637572e-06, + "loss": 0.8299, + "step": 2532 + }, + { + "epoch": 0.13941328636688866, + "grad_norm": 1.2897371053695679, + "learning_rate": 9.889188505554871e-06, + "loss": 0.7776, + "step": 2533 + }, + { + "epoch": 0.13946832516924432, + "grad_norm": 0.8927022218704224, + "learning_rate": 9.889097734726341e-06, + "loss": 0.8706, + "step": 2534 + }, + { + "epoch": 0.13952336397159998, + "grad_norm": 0.7688571214675903, + "learning_rate": 9.889006927152658e-06, + "loss": 0.8191, + "step": 2535 + }, + { + "epoch": 0.13957840277395564, + "grad_norm": 0.926671028137207, + "learning_rate": 9.88891608283451e-06, + "loss": 0.7489, + "step": 2536 + }, + { + "epoch": 0.1396334415763113, + "grad_norm": 0.8316965699195862, + "learning_rate": 9.888825201772577e-06, + "loss": 0.7783, + "step": 2537 + }, + { + "epoch": 0.13968848037866696, + "grad_norm": 0.8619750738143921, + "learning_rate": 9.88873428396754e-06, + "loss": 0.8269, + "step": 2538 + }, + { + "epoch": 0.13974351918102262, + "grad_norm": 0.8588540554046631, + "learning_rate": 9.888643329420086e-06, + "loss": 0.8133, + "step": 2539 + }, + { + "epoch": 0.13979855798337829, + "grad_norm": 0.7947841882705688, + "learning_rate": 9.8885523381309e-06, + "loss": 0.8041, + "step": 2540 + }, + { + "epoch": 0.13985359678573395, + "grad_norm": 0.8440257906913757, + "learning_rate": 9.888461310100661e-06, + "loss": 0.8324, + "step": 2541 + }, + { + "epoch": 0.1399086355880896, + "grad_norm": 0.7842260003089905, + "learning_rate": 9.888370245330055e-06, + "loss": 0.8031, + "step": 2542 + }, + { + "epoch": 0.13996367439044527, + "grad_norm": 0.8108223080635071, + "learning_rate": 9.888279143819768e-06, + "loss": 0.7998, + "step": 2543 + }, + { + "epoch": 0.14001871319280093, + "grad_norm": 0.9748625159263611, + "learning_rate": 9.888188005570482e-06, + "loss": 0.9553, + "step": 2544 + }, + { + "epoch": 0.1400737519951566, + "grad_norm": 0.8465562462806702, + "learning_rate": 9.888096830582883e-06, + "loss": 0.7884, + "step": 2545 + }, + { + "epoch": 0.14012879079751225, + "grad_norm": 0.9339833855628967, + "learning_rate": 9.88800561885766e-06, + "loss": 0.8135, + "step": 2546 + }, + { + "epoch": 0.14018382959986792, + "grad_norm": 0.7749297022819519, + "learning_rate": 9.887914370395492e-06, + "loss": 0.8411, + "step": 2547 + }, + { + "epoch": 0.14023886840222358, + "grad_norm": 0.862606942653656, + "learning_rate": 9.887823085197068e-06, + "loss": 0.7631, + "step": 2548 + }, + { + "epoch": 0.14029390720457924, + "grad_norm": 1.3383793830871582, + "learning_rate": 9.887731763263076e-06, + "loss": 0.7979, + "step": 2549 + }, + { + "epoch": 0.1403489460069349, + "grad_norm": 0.8092008233070374, + "learning_rate": 9.887640404594199e-06, + "loss": 0.7566, + "step": 2550 + }, + { + "epoch": 0.14040398480929056, + "grad_norm": 0.9233745336532593, + "learning_rate": 9.887549009191126e-06, + "loss": 0.8954, + "step": 2551 + }, + { + "epoch": 0.14045902361164622, + "grad_norm": 0.8533664345741272, + "learning_rate": 9.887457577054542e-06, + "loss": 0.8311, + "step": 2552 + }, + { + "epoch": 0.14051406241400188, + "grad_norm": 0.7679287791252136, + "learning_rate": 9.887366108185135e-06, + "loss": 0.7641, + "step": 2553 + }, + { + "epoch": 0.14056910121635754, + "grad_norm": 0.7998354434967041, + "learning_rate": 9.887274602583594e-06, + "loss": 0.7759, + "step": 2554 + }, + { + "epoch": 0.1406241400187132, + "grad_norm": 0.8877138495445251, + "learning_rate": 9.887183060250605e-06, + "loss": 0.8928, + "step": 2555 + }, + { + "epoch": 0.14067917882106884, + "grad_norm": 0.8022066354751587, + "learning_rate": 9.887091481186855e-06, + "loss": 0.8233, + "step": 2556 + }, + { + "epoch": 0.1407342176234245, + "grad_norm": 0.8419097065925598, + "learning_rate": 9.886999865393035e-06, + "loss": 0.8044, + "step": 2557 + }, + { + "epoch": 0.14078925642578016, + "grad_norm": 0.9581286311149597, + "learning_rate": 9.88690821286983e-06, + "loss": 0.8531, + "step": 2558 + }, + { + "epoch": 0.14084429522813582, + "grad_norm": 0.894851803779602, + "learning_rate": 9.886816523617933e-06, + "loss": 0.8594, + "step": 2559 + }, + { + "epoch": 0.14089933403049149, + "grad_norm": 0.7813432812690735, + "learning_rate": 9.886724797638032e-06, + "loss": 0.7311, + "step": 2560 + }, + { + "epoch": 0.14095437283284715, + "grad_norm": 0.8194118142127991, + "learning_rate": 9.886633034930814e-06, + "loss": 0.8067, + "step": 2561 + }, + { + "epoch": 0.1410094116352028, + "grad_norm": 0.8091121912002563, + "learning_rate": 9.88654123549697e-06, + "loss": 0.7558, + "step": 2562 + }, + { + "epoch": 0.14106445043755847, + "grad_norm": 0.8334764242172241, + "learning_rate": 9.88644939933719e-06, + "loss": 0.8375, + "step": 2563 + }, + { + "epoch": 0.14111948923991413, + "grad_norm": 0.8283817768096924, + "learning_rate": 9.886357526452166e-06, + "loss": 0.7839, + "step": 2564 + }, + { + "epoch": 0.1411745280422698, + "grad_norm": 0.8708772659301758, + "learning_rate": 9.886265616842585e-06, + "loss": 0.8193, + "step": 2565 + }, + { + "epoch": 0.14122956684462545, + "grad_norm": 0.9883641600608826, + "learning_rate": 9.886173670509141e-06, + "loss": 0.9409, + "step": 2566 + }, + { + "epoch": 0.14128460564698112, + "grad_norm": 0.8601766228675842, + "learning_rate": 9.886081687452523e-06, + "loss": 0.9391, + "step": 2567 + }, + { + "epoch": 0.14133964444933678, + "grad_norm": 0.8729620575904846, + "learning_rate": 9.885989667673422e-06, + "loss": 0.8372, + "step": 2568 + }, + { + "epoch": 0.14139468325169244, + "grad_norm": 0.7899564504623413, + "learning_rate": 9.885897611172532e-06, + "loss": 0.7773, + "step": 2569 + }, + { + "epoch": 0.1414497220540481, + "grad_norm": 0.8120512962341309, + "learning_rate": 9.885805517950542e-06, + "loss": 0.887, + "step": 2570 + }, + { + "epoch": 0.14150476085640376, + "grad_norm": 0.8475256562232971, + "learning_rate": 9.885713388008148e-06, + "loss": 0.7935, + "step": 2571 + }, + { + "epoch": 0.14155979965875942, + "grad_norm": 0.7669919729232788, + "learning_rate": 9.885621221346038e-06, + "loss": 0.7728, + "step": 2572 + }, + { + "epoch": 0.14161483846111508, + "grad_norm": 0.8298916220664978, + "learning_rate": 9.885529017964906e-06, + "loss": 0.7723, + "step": 2573 + }, + { + "epoch": 0.14166987726347074, + "grad_norm": 0.8630721569061279, + "learning_rate": 9.885436777865447e-06, + "loss": 0.8395, + "step": 2574 + }, + { + "epoch": 0.1417249160658264, + "grad_norm": 0.7566008567810059, + "learning_rate": 9.885344501048352e-06, + "loss": 0.806, + "step": 2575 + }, + { + "epoch": 0.14177995486818207, + "grad_norm": 0.7870769500732422, + "learning_rate": 9.885252187514316e-06, + "loss": 0.7683, + "step": 2576 + }, + { + "epoch": 0.14183499367053773, + "grad_norm": 0.879648745059967, + "learning_rate": 9.885159837264033e-06, + "loss": 0.8472, + "step": 2577 + }, + { + "epoch": 0.1418900324728934, + "grad_norm": 0.76839280128479, + "learning_rate": 9.885067450298196e-06, + "loss": 0.8534, + "step": 2578 + }, + { + "epoch": 0.14194507127524905, + "grad_norm": 0.8268701434135437, + "learning_rate": 9.884975026617498e-06, + "loss": 0.7799, + "step": 2579 + }, + { + "epoch": 0.1420001100776047, + "grad_norm": 0.8226090669631958, + "learning_rate": 9.884882566222638e-06, + "loss": 0.6756, + "step": 2580 + }, + { + "epoch": 0.14205514887996037, + "grad_norm": 0.8299756050109863, + "learning_rate": 9.884790069114307e-06, + "loss": 0.734, + "step": 2581 + }, + { + "epoch": 0.14211018768231604, + "grad_norm": 0.8241812586784363, + "learning_rate": 9.8846975352932e-06, + "loss": 0.8335, + "step": 2582 + }, + { + "epoch": 0.1421652264846717, + "grad_norm": 0.8458926677703857, + "learning_rate": 9.884604964760016e-06, + "loss": 0.7376, + "step": 2583 + }, + { + "epoch": 0.14222026528702736, + "grad_norm": 0.876966655254364, + "learning_rate": 9.884512357515447e-06, + "loss": 0.9414, + "step": 2584 + }, + { + "epoch": 0.14227530408938302, + "grad_norm": 0.770252525806427, + "learning_rate": 9.88441971356019e-06, + "loss": 0.8312, + "step": 2585 + }, + { + "epoch": 0.14233034289173868, + "grad_norm": 0.7883023023605347, + "learning_rate": 9.884327032894945e-06, + "loss": 0.8568, + "step": 2586 + }, + { + "epoch": 0.14238538169409434, + "grad_norm": 0.9092289209365845, + "learning_rate": 9.884234315520405e-06, + "loss": 0.9078, + "step": 2587 + }, + { + "epoch": 0.14244042049645, + "grad_norm": 0.7946531176567078, + "learning_rate": 9.884141561437266e-06, + "loss": 0.6895, + "step": 2588 + }, + { + "epoch": 0.14249545929880567, + "grad_norm": 0.7791070342063904, + "learning_rate": 9.884048770646227e-06, + "loss": 0.6984, + "step": 2589 + }, + { + "epoch": 0.14255049810116133, + "grad_norm": 0.7775537371635437, + "learning_rate": 9.883955943147982e-06, + "loss": 0.7568, + "step": 2590 + }, + { + "epoch": 0.142605536903517, + "grad_norm": 0.7735158801078796, + "learning_rate": 9.883863078943234e-06, + "loss": 0.8215, + "step": 2591 + }, + { + "epoch": 0.14266057570587265, + "grad_norm": 0.881365180015564, + "learning_rate": 9.88377017803268e-06, + "loss": 0.8817, + "step": 2592 + }, + { + "epoch": 0.1427156145082283, + "grad_norm": 0.8643443584442139, + "learning_rate": 9.883677240417014e-06, + "loss": 0.8024, + "step": 2593 + }, + { + "epoch": 0.14277065331058397, + "grad_norm": 0.885713517665863, + "learning_rate": 9.883584266096938e-06, + "loss": 0.7612, + "step": 2594 + }, + { + "epoch": 0.14282569211293963, + "grad_norm": 0.771340012550354, + "learning_rate": 9.88349125507315e-06, + "loss": 0.8293, + "step": 2595 + }, + { + "epoch": 0.1428807309152953, + "grad_norm": 0.8284093737602234, + "learning_rate": 9.88339820734635e-06, + "loss": 0.8539, + "step": 2596 + }, + { + "epoch": 0.14293576971765096, + "grad_norm": 0.9597725868225098, + "learning_rate": 9.883305122917233e-06, + "loss": 0.9054, + "step": 2597 + }, + { + "epoch": 0.14299080852000662, + "grad_norm": 0.7552937269210815, + "learning_rate": 9.883212001786504e-06, + "loss": 0.8047, + "step": 2598 + }, + { + "epoch": 0.14304584732236225, + "grad_norm": 0.8008492588996887, + "learning_rate": 9.883118843954861e-06, + "loss": 0.802, + "step": 2599 + }, + { + "epoch": 0.1431008861247179, + "grad_norm": 0.8169753551483154, + "learning_rate": 9.883025649423003e-06, + "loss": 0.8837, + "step": 2600 + }, + { + "epoch": 0.14315592492707357, + "grad_norm": 0.8521036505699158, + "learning_rate": 9.882932418191632e-06, + "loss": 0.8266, + "step": 2601 + }, + { + "epoch": 0.14321096372942924, + "grad_norm": 0.8647341728210449, + "learning_rate": 9.882839150261449e-06, + "loss": 0.8949, + "step": 2602 + }, + { + "epoch": 0.1432660025317849, + "grad_norm": 0.9236162304878235, + "learning_rate": 9.882745845633153e-06, + "loss": 0.8474, + "step": 2603 + }, + { + "epoch": 0.14332104133414056, + "grad_norm": 0.8422677516937256, + "learning_rate": 9.882652504307445e-06, + "loss": 0.8396, + "step": 2604 + }, + { + "epoch": 0.14337608013649622, + "grad_norm": 0.902036190032959, + "learning_rate": 9.88255912628503e-06, + "loss": 0.8075, + "step": 2605 + }, + { + "epoch": 0.14343111893885188, + "grad_norm": 0.8972339630126953, + "learning_rate": 9.882465711566605e-06, + "loss": 0.8143, + "step": 2606 + }, + { + "epoch": 0.14348615774120754, + "grad_norm": 0.8025243282318115, + "learning_rate": 9.882372260152877e-06, + "loss": 0.771, + "step": 2607 + }, + { + "epoch": 0.1435411965435632, + "grad_norm": 0.8260911107063293, + "learning_rate": 9.882278772044545e-06, + "loss": 0.7679, + "step": 2608 + }, + { + "epoch": 0.14359623534591887, + "grad_norm": 0.8069774508476257, + "learning_rate": 9.882185247242313e-06, + "loss": 0.8489, + "step": 2609 + }, + { + "epoch": 0.14365127414827453, + "grad_norm": 0.8702567219734192, + "learning_rate": 9.882091685746883e-06, + "loss": 0.9258, + "step": 2610 + }, + { + "epoch": 0.1437063129506302, + "grad_norm": 0.8841683268547058, + "learning_rate": 9.881998087558959e-06, + "loss": 0.7858, + "step": 2611 + }, + { + "epoch": 0.14376135175298585, + "grad_norm": 0.7302986979484558, + "learning_rate": 9.881904452679246e-06, + "loss": 0.7339, + "step": 2612 + }, + { + "epoch": 0.1438163905553415, + "grad_norm": 0.7852466106414795, + "learning_rate": 9.881810781108442e-06, + "loss": 0.8397, + "step": 2613 + }, + { + "epoch": 0.14387142935769717, + "grad_norm": 0.7986249327659607, + "learning_rate": 9.881717072847258e-06, + "loss": 0.7573, + "step": 2614 + }, + { + "epoch": 0.14392646816005283, + "grad_norm": 0.750000536441803, + "learning_rate": 9.881623327896395e-06, + "loss": 0.7128, + "step": 2615 + }, + { + "epoch": 0.1439815069624085, + "grad_norm": 0.8796436786651611, + "learning_rate": 9.881529546256557e-06, + "loss": 0.9364, + "step": 2616 + }, + { + "epoch": 0.14403654576476416, + "grad_norm": 0.8621297478675842, + "learning_rate": 9.881435727928449e-06, + "loss": 0.9323, + "step": 2617 + }, + { + "epoch": 0.14409158456711982, + "grad_norm": 0.8213173151016235, + "learning_rate": 9.881341872912777e-06, + "loss": 0.7746, + "step": 2618 + }, + { + "epoch": 0.14414662336947548, + "grad_norm": 0.7761938571929932, + "learning_rate": 9.881247981210247e-06, + "loss": 0.8065, + "step": 2619 + }, + { + "epoch": 0.14420166217183114, + "grad_norm": 0.8333988785743713, + "learning_rate": 9.881154052821564e-06, + "loss": 0.8727, + "step": 2620 + }, + { + "epoch": 0.1442567009741868, + "grad_norm": 0.7263909578323364, + "learning_rate": 9.881060087747433e-06, + "loss": 0.8194, + "step": 2621 + }, + { + "epoch": 0.14431173977654246, + "grad_norm": 0.7472667098045349, + "learning_rate": 9.880966085988562e-06, + "loss": 0.77, + "step": 2622 + }, + { + "epoch": 0.14436677857889813, + "grad_norm": 0.7999943494796753, + "learning_rate": 9.880872047545656e-06, + "loss": 0.7936, + "step": 2623 + }, + { + "epoch": 0.1444218173812538, + "grad_norm": 0.8359610438346863, + "learning_rate": 9.88077797241942e-06, + "loss": 0.7946, + "step": 2624 + }, + { + "epoch": 0.14447685618360945, + "grad_norm": 0.8666403889656067, + "learning_rate": 9.880683860610566e-06, + "loss": 0.8152, + "step": 2625 + }, + { + "epoch": 0.1445318949859651, + "grad_norm": 0.7883741855621338, + "learning_rate": 9.880589712119797e-06, + "loss": 0.7972, + "step": 2626 + }, + { + "epoch": 0.14458693378832077, + "grad_norm": 0.8048827648162842, + "learning_rate": 9.880495526947824e-06, + "loss": 0.8221, + "step": 2627 + }, + { + "epoch": 0.14464197259067643, + "grad_norm": 0.718292236328125, + "learning_rate": 9.88040130509535e-06, + "loss": 0.7648, + "step": 2628 + }, + { + "epoch": 0.1446970113930321, + "grad_norm": 0.7748421430587769, + "learning_rate": 9.880307046563088e-06, + "loss": 0.8146, + "step": 2629 + }, + { + "epoch": 0.14475205019538775, + "grad_norm": 0.8015987873077393, + "learning_rate": 9.880212751351745e-06, + "loss": 0.7935, + "step": 2630 + }, + { + "epoch": 0.14480708899774342, + "grad_norm": 0.7628459930419922, + "learning_rate": 9.88011841946203e-06, + "loss": 0.7469, + "step": 2631 + }, + { + "epoch": 0.14486212780009908, + "grad_norm": 0.7152888774871826, + "learning_rate": 9.88002405089465e-06, + "loss": 0.7721, + "step": 2632 + }, + { + "epoch": 0.14491716660245474, + "grad_norm": 0.8075545430183411, + "learning_rate": 9.879929645650315e-06, + "loss": 0.8799, + "step": 2633 + }, + { + "epoch": 0.1449722054048104, + "grad_norm": 0.7981964945793152, + "learning_rate": 9.879835203729736e-06, + "loss": 0.8265, + "step": 2634 + }, + { + "epoch": 0.14502724420716606, + "grad_norm": 0.7699866890907288, + "learning_rate": 9.879740725133623e-06, + "loss": 0.8489, + "step": 2635 + }, + { + "epoch": 0.14508228300952172, + "grad_norm": 0.7991634011268616, + "learning_rate": 9.879646209862682e-06, + "loss": 0.8754, + "step": 2636 + }, + { + "epoch": 0.14513732181187738, + "grad_norm": 0.8284991383552551, + "learning_rate": 9.879551657917628e-06, + "loss": 0.811, + "step": 2637 + }, + { + "epoch": 0.14519236061423305, + "grad_norm": 0.9189227819442749, + "learning_rate": 9.87945706929917e-06, + "loss": 0.8486, + "step": 2638 + }, + { + "epoch": 0.1452473994165887, + "grad_norm": 0.8599026799201965, + "learning_rate": 9.879362444008018e-06, + "loss": 0.8383, + "step": 2639 + }, + { + "epoch": 0.14530243821894437, + "grad_norm": 0.8764603137969971, + "learning_rate": 9.879267782044885e-06, + "loss": 0.7918, + "step": 2640 + }, + { + "epoch": 0.14535747702130003, + "grad_norm": 0.8061341047286987, + "learning_rate": 9.87917308341048e-06, + "loss": 0.8292, + "step": 2641 + }, + { + "epoch": 0.14541251582365566, + "grad_norm": 1.031220555305481, + "learning_rate": 9.879078348105518e-06, + "loss": 0.6612, + "step": 2642 + }, + { + "epoch": 0.14546755462601133, + "grad_norm": 1.014491319656372, + "learning_rate": 9.878983576130708e-06, + "loss": 0.8512, + "step": 2643 + }, + { + "epoch": 0.145522593428367, + "grad_norm": 0.8365896940231323, + "learning_rate": 9.878888767486764e-06, + "loss": 0.7995, + "step": 2644 + }, + { + "epoch": 0.14557763223072265, + "grad_norm": 0.8086197972297668, + "learning_rate": 9.878793922174397e-06, + "loss": 0.8069, + "step": 2645 + }, + { + "epoch": 0.1456326710330783, + "grad_norm": 0.8075234889984131, + "learning_rate": 9.878699040194322e-06, + "loss": 0.8415, + "step": 2646 + }, + { + "epoch": 0.14568770983543397, + "grad_norm": 0.9413748979568481, + "learning_rate": 9.87860412154725e-06, + "loss": 0.7811, + "step": 2647 + }, + { + "epoch": 0.14574274863778963, + "grad_norm": 0.7744552493095398, + "learning_rate": 9.878509166233895e-06, + "loss": 0.7983, + "step": 2648 + }, + { + "epoch": 0.1457977874401453, + "grad_norm": 0.8184664845466614, + "learning_rate": 9.878414174254974e-06, + "loss": 0.8052, + "step": 2649 + }, + { + "epoch": 0.14585282624250095, + "grad_norm": 0.928814172744751, + "learning_rate": 9.878319145611195e-06, + "loss": 0.7695, + "step": 2650 + }, + { + "epoch": 0.14590786504485662, + "grad_norm": 0.9623318314552307, + "learning_rate": 9.878224080303276e-06, + "loss": 0.9025, + "step": 2651 + }, + { + "epoch": 0.14596290384721228, + "grad_norm": 0.866538405418396, + "learning_rate": 9.87812897833193e-06, + "loss": 0.7895, + "step": 2652 + }, + { + "epoch": 0.14601794264956794, + "grad_norm": 0.9248599410057068, + "learning_rate": 9.878033839697874e-06, + "loss": 0.8532, + "step": 2653 + }, + { + "epoch": 0.1460729814519236, + "grad_norm": 0.7866301536560059, + "learning_rate": 9.87793866440182e-06, + "loss": 0.8724, + "step": 2654 + }, + { + "epoch": 0.14612802025427926, + "grad_norm": 0.8471634387969971, + "learning_rate": 9.877843452444485e-06, + "loss": 0.9184, + "step": 2655 + }, + { + "epoch": 0.14618305905663492, + "grad_norm": 0.7367103695869446, + "learning_rate": 9.877748203826585e-06, + "loss": 0.7328, + "step": 2656 + }, + { + "epoch": 0.14623809785899058, + "grad_norm": 0.95980304479599, + "learning_rate": 9.877652918548834e-06, + "loss": 0.9274, + "step": 2657 + }, + { + "epoch": 0.14629313666134625, + "grad_norm": 1.0511064529418945, + "learning_rate": 9.87755759661195e-06, + "loss": 0.8223, + "step": 2658 + }, + { + "epoch": 0.1463481754637019, + "grad_norm": 0.7616510391235352, + "learning_rate": 9.877462238016649e-06, + "loss": 0.7473, + "step": 2659 + }, + { + "epoch": 0.14640321426605757, + "grad_norm": 0.7814056873321533, + "learning_rate": 9.877366842763647e-06, + "loss": 0.8898, + "step": 2660 + }, + { + "epoch": 0.14645825306841323, + "grad_norm": 0.8707298636436462, + "learning_rate": 9.877271410853662e-06, + "loss": 0.8792, + "step": 2661 + }, + { + "epoch": 0.1465132918707689, + "grad_norm": 0.8618701696395874, + "learning_rate": 9.877175942287409e-06, + "loss": 0.8761, + "step": 2662 + }, + { + "epoch": 0.14656833067312455, + "grad_norm": 0.9437732100486755, + "learning_rate": 9.877080437065609e-06, + "loss": 0.7922, + "step": 2663 + }, + { + "epoch": 0.14662336947548021, + "grad_norm": 0.9465780258178711, + "learning_rate": 9.876984895188976e-06, + "loss": 0.8449, + "step": 2664 + }, + { + "epoch": 0.14667840827783588, + "grad_norm": 0.7149911522865295, + "learning_rate": 9.876889316658233e-06, + "loss": 0.6408, + "step": 2665 + }, + { + "epoch": 0.14673344708019154, + "grad_norm": 0.9996811151504517, + "learning_rate": 9.876793701474092e-06, + "loss": 0.9324, + "step": 2666 + }, + { + "epoch": 0.1467884858825472, + "grad_norm": 0.7941329479217529, + "learning_rate": 9.876698049637277e-06, + "loss": 0.8115, + "step": 2667 + }, + { + "epoch": 0.14684352468490286, + "grad_norm": 0.754175066947937, + "learning_rate": 9.876602361148504e-06, + "loss": 0.7709, + "step": 2668 + }, + { + "epoch": 0.14689856348725852, + "grad_norm": 0.7867946624755859, + "learning_rate": 9.876506636008494e-06, + "loss": 0.8578, + "step": 2669 + }, + { + "epoch": 0.14695360228961418, + "grad_norm": 0.7441185116767883, + "learning_rate": 9.876410874217965e-06, + "loss": 0.8491, + "step": 2670 + }, + { + "epoch": 0.14700864109196984, + "grad_norm": 0.8414027690887451, + "learning_rate": 9.876315075777638e-06, + "loss": 0.8404, + "step": 2671 + }, + { + "epoch": 0.1470636798943255, + "grad_norm": 0.7911489009857178, + "learning_rate": 9.876219240688231e-06, + "loss": 0.8606, + "step": 2672 + }, + { + "epoch": 0.14711871869668117, + "grad_norm": 0.8601381778717041, + "learning_rate": 9.876123368950465e-06, + "loss": 0.7753, + "step": 2673 + }, + { + "epoch": 0.14717375749903683, + "grad_norm": 0.8672378659248352, + "learning_rate": 9.876027460565062e-06, + "loss": 0.7763, + "step": 2674 + }, + { + "epoch": 0.1472287963013925, + "grad_norm": 0.7192933559417725, + "learning_rate": 9.875931515532742e-06, + "loss": 0.7681, + "step": 2675 + }, + { + "epoch": 0.14728383510374815, + "grad_norm": 0.7483426332473755, + "learning_rate": 9.875835533854226e-06, + "loss": 0.8129, + "step": 2676 + }, + { + "epoch": 0.1473388739061038, + "grad_norm": 0.8883694410324097, + "learning_rate": 9.875739515530235e-06, + "loss": 0.8912, + "step": 2677 + }, + { + "epoch": 0.14739391270845947, + "grad_norm": 0.8440148234367371, + "learning_rate": 9.87564346056149e-06, + "loss": 0.8411, + "step": 2678 + }, + { + "epoch": 0.14744895151081513, + "grad_norm": 0.8916668891906738, + "learning_rate": 9.875547368948715e-06, + "loss": 0.8484, + "step": 2679 + }, + { + "epoch": 0.1475039903131708, + "grad_norm": 0.805258572101593, + "learning_rate": 9.875451240692631e-06, + "loss": 0.8172, + "step": 2680 + }, + { + "epoch": 0.14755902911552646, + "grad_norm": 0.8322305679321289, + "learning_rate": 9.87535507579396e-06, + "loss": 0.809, + "step": 2681 + }, + { + "epoch": 0.14761406791788212, + "grad_norm": 0.7320597767829895, + "learning_rate": 9.875258874253424e-06, + "loss": 0.7346, + "step": 2682 + }, + { + "epoch": 0.14766910672023778, + "grad_norm": 1.018036127090454, + "learning_rate": 9.875162636071749e-06, + "loss": 0.931, + "step": 2683 + }, + { + "epoch": 0.14772414552259344, + "grad_norm": 0.8601503968238831, + "learning_rate": 9.875066361249657e-06, + "loss": 0.7689, + "step": 2684 + }, + { + "epoch": 0.14777918432494908, + "grad_norm": 0.8478472232818604, + "learning_rate": 9.87497004978787e-06, + "loss": 0.9545, + "step": 2685 + }, + { + "epoch": 0.14783422312730474, + "grad_norm": 0.7510890364646912, + "learning_rate": 9.874873701687115e-06, + "loss": 0.7794, + "step": 2686 + }, + { + "epoch": 0.1478892619296604, + "grad_norm": 0.8226999044418335, + "learning_rate": 9.874777316948112e-06, + "loss": 0.8477, + "step": 2687 + }, + { + "epoch": 0.14794430073201606, + "grad_norm": 0.8284991979598999, + "learning_rate": 9.874680895571588e-06, + "loss": 0.8498, + "step": 2688 + }, + { + "epoch": 0.14799933953437172, + "grad_norm": 0.9007356762886047, + "learning_rate": 9.874584437558267e-06, + "loss": 0.8526, + "step": 2689 + }, + { + "epoch": 0.14805437833672738, + "grad_norm": 0.8770126104354858, + "learning_rate": 9.874487942908877e-06, + "loss": 0.844, + "step": 2690 + }, + { + "epoch": 0.14810941713908304, + "grad_norm": 1.1561466455459595, + "learning_rate": 9.874391411624138e-06, + "loss": 0.976, + "step": 2691 + }, + { + "epoch": 0.1481644559414387, + "grad_norm": 0.8162640929222107, + "learning_rate": 9.874294843704777e-06, + "loss": 0.8581, + "step": 2692 + }, + { + "epoch": 0.14821949474379437, + "grad_norm": 0.8308132290840149, + "learning_rate": 9.874198239151522e-06, + "loss": 0.8303, + "step": 2693 + }, + { + "epoch": 0.14827453354615003, + "grad_norm": 0.771024227142334, + "learning_rate": 9.874101597965098e-06, + "loss": 0.8351, + "step": 2694 + }, + { + "epoch": 0.1483295723485057, + "grad_norm": 0.7588162422180176, + "learning_rate": 9.874004920146232e-06, + "loss": 0.7858, + "step": 2695 + }, + { + "epoch": 0.14838461115086135, + "grad_norm": 0.8282446265220642, + "learning_rate": 9.873908205695648e-06, + "loss": 0.8465, + "step": 2696 + }, + { + "epoch": 0.148439649953217, + "grad_norm": 0.8342786431312561, + "learning_rate": 9.873811454614076e-06, + "loss": 0.8688, + "step": 2697 + }, + { + "epoch": 0.14849468875557267, + "grad_norm": 0.7957108020782471, + "learning_rate": 9.87371466690224e-06, + "loss": 0.8381, + "step": 2698 + }, + { + "epoch": 0.14854972755792833, + "grad_norm": 0.8763726353645325, + "learning_rate": 9.87361784256087e-06, + "loss": 0.8922, + "step": 2699 + }, + { + "epoch": 0.148604766360284, + "grad_norm": 0.7760055661201477, + "learning_rate": 9.873520981590693e-06, + "loss": 0.8384, + "step": 2700 + }, + { + "epoch": 0.14865980516263966, + "grad_norm": 0.9691097736358643, + "learning_rate": 9.873424083992436e-06, + "loss": 0.8581, + "step": 2701 + }, + { + "epoch": 0.14871484396499532, + "grad_norm": 0.9072558879852295, + "learning_rate": 9.87332714976683e-06, + "loss": 0.8942, + "step": 2702 + }, + { + "epoch": 0.14876988276735098, + "grad_norm": 0.8961714506149292, + "learning_rate": 9.8732301789146e-06, + "loss": 0.8062, + "step": 2703 + }, + { + "epoch": 0.14882492156970664, + "grad_norm": 1.4835050106048584, + "learning_rate": 9.873133171436477e-06, + "loss": 0.886, + "step": 2704 + }, + { + "epoch": 0.1488799603720623, + "grad_norm": 0.8153702616691589, + "learning_rate": 9.87303612733319e-06, + "loss": 0.8369, + "step": 2705 + }, + { + "epoch": 0.14893499917441796, + "grad_norm": 0.8755800724029541, + "learning_rate": 9.872939046605467e-06, + "loss": 0.7591, + "step": 2706 + }, + { + "epoch": 0.14899003797677363, + "grad_norm": 0.8173243403434753, + "learning_rate": 9.872841929254038e-06, + "loss": 0.8626, + "step": 2707 + }, + { + "epoch": 0.1490450767791293, + "grad_norm": 0.7891639471054077, + "learning_rate": 9.872744775279634e-06, + "loss": 0.737, + "step": 2708 + }, + { + "epoch": 0.14910011558148495, + "grad_norm": 1.0270631313323975, + "learning_rate": 9.872647584682985e-06, + "loss": 0.9202, + "step": 2709 + }, + { + "epoch": 0.1491551543838406, + "grad_norm": 0.7736123204231262, + "learning_rate": 9.872550357464822e-06, + "loss": 0.7835, + "step": 2710 + }, + { + "epoch": 0.14921019318619627, + "grad_norm": 0.7791550159454346, + "learning_rate": 9.872453093625873e-06, + "loss": 0.8375, + "step": 2711 + }, + { + "epoch": 0.14926523198855193, + "grad_norm": 0.8410583734512329, + "learning_rate": 9.872355793166872e-06, + "loss": 0.877, + "step": 2712 + }, + { + "epoch": 0.1493202707909076, + "grad_norm": 0.8277738094329834, + "learning_rate": 9.87225845608855e-06, + "loss": 0.7255, + "step": 2713 + }, + { + "epoch": 0.14937530959326326, + "grad_norm": 0.8617290258407593, + "learning_rate": 9.872161082391635e-06, + "loss": 0.7885, + "step": 2714 + }, + { + "epoch": 0.14943034839561892, + "grad_norm": 0.8866406679153442, + "learning_rate": 9.872063672076864e-06, + "loss": 0.8621, + "step": 2715 + }, + { + "epoch": 0.14948538719797458, + "grad_norm": 0.7581049799919128, + "learning_rate": 9.871966225144964e-06, + "loss": 0.8177, + "step": 2716 + }, + { + "epoch": 0.14954042600033024, + "grad_norm": 0.833696722984314, + "learning_rate": 9.871868741596673e-06, + "loss": 0.8382, + "step": 2717 + }, + { + "epoch": 0.1495954648026859, + "grad_norm": 1.0857365131378174, + "learning_rate": 9.871771221432718e-06, + "loss": 0.9254, + "step": 2718 + }, + { + "epoch": 0.14965050360504156, + "grad_norm": 0.7622446417808533, + "learning_rate": 9.871673664653837e-06, + "loss": 0.832, + "step": 2719 + }, + { + "epoch": 0.14970554240739722, + "grad_norm": 0.7436832785606384, + "learning_rate": 9.871576071260758e-06, + "loss": 0.7642, + "step": 2720 + }, + { + "epoch": 0.14976058120975289, + "grad_norm": 0.8547641634941101, + "learning_rate": 9.87147844125422e-06, + "loss": 0.7584, + "step": 2721 + }, + { + "epoch": 0.14981562001210855, + "grad_norm": 0.7634096145629883, + "learning_rate": 9.871380774634953e-06, + "loss": 0.8332, + "step": 2722 + }, + { + "epoch": 0.1498706588144642, + "grad_norm": 0.7949081063270569, + "learning_rate": 9.871283071403692e-06, + "loss": 0.7812, + "step": 2723 + }, + { + "epoch": 0.14992569761681987, + "grad_norm": 0.8089914321899414, + "learning_rate": 9.871185331561171e-06, + "loss": 0.8503, + "step": 2724 + }, + { + "epoch": 0.14998073641917553, + "grad_norm": 0.8451627492904663, + "learning_rate": 9.871087555108125e-06, + "loss": 0.9101, + "step": 2725 + }, + { + "epoch": 0.1500357752215312, + "grad_norm": 0.8399865627288818, + "learning_rate": 9.87098974204529e-06, + "loss": 0.8222, + "step": 2726 + }, + { + "epoch": 0.15009081402388685, + "grad_norm": 0.7786773443222046, + "learning_rate": 9.870891892373397e-06, + "loss": 0.8069, + "step": 2727 + }, + { + "epoch": 0.1501458528262425, + "grad_norm": 0.8530564308166504, + "learning_rate": 9.870794006093188e-06, + "loss": 0.9229, + "step": 2728 + }, + { + "epoch": 0.15020089162859815, + "grad_norm": 0.7640067934989929, + "learning_rate": 9.870696083205394e-06, + "loss": 0.761, + "step": 2729 + }, + { + "epoch": 0.1502559304309538, + "grad_norm": 0.8953121900558472, + "learning_rate": 9.87059812371075e-06, + "loss": 0.8537, + "step": 2730 + }, + { + "epoch": 0.15031096923330947, + "grad_norm": 0.7779926657676697, + "learning_rate": 9.870500127609996e-06, + "loss": 0.8184, + "step": 2731 + }, + { + "epoch": 0.15036600803566513, + "grad_norm": 0.9181544184684753, + "learning_rate": 9.870402094903865e-06, + "loss": 0.8583, + "step": 2732 + }, + { + "epoch": 0.1504210468380208, + "grad_norm": 0.7629374861717224, + "learning_rate": 9.870304025593097e-06, + "loss": 0.6741, + "step": 2733 + }, + { + "epoch": 0.15047608564037646, + "grad_norm": 1.1455601453781128, + "learning_rate": 9.87020591967843e-06, + "loss": 0.8602, + "step": 2734 + }, + { + "epoch": 0.15053112444273212, + "grad_norm": 0.83924800157547, + "learning_rate": 9.870107777160596e-06, + "loss": 0.8847, + "step": 2735 + }, + { + "epoch": 0.15058616324508778, + "grad_norm": 0.9293402433395386, + "learning_rate": 9.870009598040336e-06, + "loss": 0.9008, + "step": 2736 + }, + { + "epoch": 0.15064120204744344, + "grad_norm": 0.8198057413101196, + "learning_rate": 9.869911382318389e-06, + "loss": 0.8004, + "step": 2737 + }, + { + "epoch": 0.1506962408497991, + "grad_norm": 0.8139753341674805, + "learning_rate": 9.86981312999549e-06, + "loss": 0.7316, + "step": 2738 + }, + { + "epoch": 0.15075127965215476, + "grad_norm": 0.854184091091156, + "learning_rate": 9.86971484107238e-06, + "loss": 0.9424, + "step": 2739 + }, + { + "epoch": 0.15080631845451042, + "grad_norm": 0.8626797199249268, + "learning_rate": 9.869616515549797e-06, + "loss": 0.8882, + "step": 2740 + }, + { + "epoch": 0.15086135725686609, + "grad_norm": 0.8447514176368713, + "learning_rate": 9.869518153428479e-06, + "loss": 0.7762, + "step": 2741 + }, + { + "epoch": 0.15091639605922175, + "grad_norm": 1.1359349489212036, + "learning_rate": 9.869419754709166e-06, + "loss": 0.9233, + "step": 2742 + }, + { + "epoch": 0.1509714348615774, + "grad_norm": 0.8095758557319641, + "learning_rate": 9.869321319392597e-06, + "loss": 0.8833, + "step": 2743 + }, + { + "epoch": 0.15102647366393307, + "grad_norm": 0.8364169001579285, + "learning_rate": 9.869222847479514e-06, + "loss": 0.833, + "step": 2744 + }, + { + "epoch": 0.15108151246628873, + "grad_norm": 0.7664803266525269, + "learning_rate": 9.869124338970653e-06, + "loss": 0.8125, + "step": 2745 + }, + { + "epoch": 0.1511365512686444, + "grad_norm": 0.8129634857177734, + "learning_rate": 9.86902579386676e-06, + "loss": 0.8277, + "step": 2746 + }, + { + "epoch": 0.15119159007100005, + "grad_norm": 0.8195592164993286, + "learning_rate": 9.86892721216857e-06, + "loss": 0.8489, + "step": 2747 + }, + { + "epoch": 0.15124662887335572, + "grad_norm": 0.8116651177406311, + "learning_rate": 9.868828593876827e-06, + "loss": 0.7831, + "step": 2748 + }, + { + "epoch": 0.15130166767571138, + "grad_norm": 0.8200114369392395, + "learning_rate": 9.868729938992272e-06, + "loss": 0.8956, + "step": 2749 + }, + { + "epoch": 0.15135670647806704, + "grad_norm": 0.8521816730499268, + "learning_rate": 9.868631247515645e-06, + "loss": 0.804, + "step": 2750 + }, + { + "epoch": 0.1514117452804227, + "grad_norm": 1.0386497974395752, + "learning_rate": 9.868532519447691e-06, + "loss": 0.8563, + "step": 2751 + }, + { + "epoch": 0.15146678408277836, + "grad_norm": 0.8345486521720886, + "learning_rate": 9.868433754789149e-06, + "loss": 0.9838, + "step": 2752 + }, + { + "epoch": 0.15152182288513402, + "grad_norm": 0.7207526564598083, + "learning_rate": 9.868334953540762e-06, + "loss": 0.6711, + "step": 2753 + }, + { + "epoch": 0.15157686168748968, + "grad_norm": 0.8159164786338806, + "learning_rate": 9.86823611570327e-06, + "loss": 0.7591, + "step": 2754 + }, + { + "epoch": 0.15163190048984534, + "grad_norm": 0.9062225818634033, + "learning_rate": 9.868137241277422e-06, + "loss": 0.8009, + "step": 2755 + }, + { + "epoch": 0.151686939292201, + "grad_norm": 0.8136696219444275, + "learning_rate": 9.868038330263957e-06, + "loss": 0.7014, + "step": 2756 + }, + { + "epoch": 0.15174197809455667, + "grad_norm": 0.7237691283226013, + "learning_rate": 9.867939382663618e-06, + "loss": 0.7766, + "step": 2757 + }, + { + "epoch": 0.15179701689691233, + "grad_norm": 0.8913742303848267, + "learning_rate": 9.86784039847715e-06, + "loss": 0.9362, + "step": 2758 + }, + { + "epoch": 0.151852055699268, + "grad_norm": 0.7763763070106506, + "learning_rate": 9.867741377705296e-06, + "loss": 0.7843, + "step": 2759 + }, + { + "epoch": 0.15190709450162365, + "grad_norm": 0.8973854780197144, + "learning_rate": 9.867642320348803e-06, + "loss": 0.911, + "step": 2760 + }, + { + "epoch": 0.1519621333039793, + "grad_norm": 0.7979685664176941, + "learning_rate": 9.86754322640841e-06, + "loss": 0.81, + "step": 2761 + }, + { + "epoch": 0.15201717210633497, + "grad_norm": 0.7740911841392517, + "learning_rate": 9.867444095884867e-06, + "loss": 0.8197, + "step": 2762 + }, + { + "epoch": 0.15207221090869064, + "grad_norm": 0.8400475978851318, + "learning_rate": 9.867344928778916e-06, + "loss": 0.8809, + "step": 2763 + }, + { + "epoch": 0.1521272497110463, + "grad_norm": 0.8995040655136108, + "learning_rate": 9.867245725091305e-06, + "loss": 0.8382, + "step": 2764 + }, + { + "epoch": 0.15218228851340196, + "grad_norm": 0.8162381052970886, + "learning_rate": 9.867146484822779e-06, + "loss": 0.9238, + "step": 2765 + }, + { + "epoch": 0.15223732731575762, + "grad_norm": 0.7668827176094055, + "learning_rate": 9.867047207974079e-06, + "loss": 0.8345, + "step": 2766 + }, + { + "epoch": 0.15229236611811328, + "grad_norm": 0.8719204664230347, + "learning_rate": 9.866947894545957e-06, + "loss": 0.7899, + "step": 2767 + }, + { + "epoch": 0.15234740492046894, + "grad_norm": 0.9043570756912231, + "learning_rate": 9.866848544539159e-06, + "loss": 0.8783, + "step": 2768 + }, + { + "epoch": 0.1524024437228246, + "grad_norm": 0.8859694004058838, + "learning_rate": 9.866749157954428e-06, + "loss": 0.862, + "step": 2769 + }, + { + "epoch": 0.15245748252518027, + "grad_norm": 1.022719144821167, + "learning_rate": 9.866649734792514e-06, + "loss": 0.8943, + "step": 2770 + }, + { + "epoch": 0.1525125213275359, + "grad_norm": 0.8710635900497437, + "learning_rate": 9.866550275054163e-06, + "loss": 0.7002, + "step": 2771 + }, + { + "epoch": 0.15256756012989156, + "grad_norm": 0.8482942581176758, + "learning_rate": 9.866450778740122e-06, + "loss": 0.7529, + "step": 2772 + }, + { + "epoch": 0.15262259893224722, + "grad_norm": 0.9637784361839294, + "learning_rate": 9.866351245851142e-06, + "loss": 0.8147, + "step": 2773 + }, + { + "epoch": 0.15267763773460288, + "grad_norm": 1.0472246408462524, + "learning_rate": 9.866251676387967e-06, + "loss": 0.8019, + "step": 2774 + }, + { + "epoch": 0.15273267653695854, + "grad_norm": 0.7916847467422485, + "learning_rate": 9.866152070351347e-06, + "loss": 0.7698, + "step": 2775 + }, + { + "epoch": 0.1527877153393142, + "grad_norm": 0.8421853184700012, + "learning_rate": 9.86605242774203e-06, + "loss": 0.8085, + "step": 2776 + }, + { + "epoch": 0.15284275414166987, + "grad_norm": 0.7990233898162842, + "learning_rate": 9.865952748560768e-06, + "loss": 0.8878, + "step": 2777 + }, + { + "epoch": 0.15289779294402553, + "grad_norm": 0.8017451167106628, + "learning_rate": 9.865853032808305e-06, + "loss": 0.8707, + "step": 2778 + }, + { + "epoch": 0.1529528317463812, + "grad_norm": 0.739850640296936, + "learning_rate": 9.865753280485393e-06, + "loss": 0.7884, + "step": 2779 + }, + { + "epoch": 0.15300787054873685, + "grad_norm": 1.0682430267333984, + "learning_rate": 9.865653491592784e-06, + "loss": 0.8548, + "step": 2780 + }, + { + "epoch": 0.1530629093510925, + "grad_norm": 0.7766296863555908, + "learning_rate": 9.865553666131225e-06, + "loss": 0.7786, + "step": 2781 + }, + { + "epoch": 0.15311794815344817, + "grad_norm": 0.8903290629386902, + "learning_rate": 9.865453804101466e-06, + "loss": 0.8978, + "step": 2782 + }, + { + "epoch": 0.15317298695580384, + "grad_norm": 0.8624514937400818, + "learning_rate": 9.86535390550426e-06, + "loss": 0.8472, + "step": 2783 + }, + { + "epoch": 0.1532280257581595, + "grad_norm": 0.7765294909477234, + "learning_rate": 9.865253970340356e-06, + "loss": 0.7702, + "step": 2784 + }, + { + "epoch": 0.15328306456051516, + "grad_norm": 0.9349095225334167, + "learning_rate": 9.865153998610504e-06, + "loss": 0.9154, + "step": 2785 + }, + { + "epoch": 0.15333810336287082, + "grad_norm": 0.8435478210449219, + "learning_rate": 9.865053990315458e-06, + "loss": 0.8986, + "step": 2786 + }, + { + "epoch": 0.15339314216522648, + "grad_norm": 0.8003486394882202, + "learning_rate": 9.864953945455968e-06, + "loss": 0.767, + "step": 2787 + }, + { + "epoch": 0.15344818096758214, + "grad_norm": 0.8060823678970337, + "learning_rate": 9.86485386403279e-06, + "loss": 0.8332, + "step": 2788 + }, + { + "epoch": 0.1535032197699378, + "grad_norm": 0.7914995551109314, + "learning_rate": 9.864753746046668e-06, + "loss": 0.6706, + "step": 2789 + }, + { + "epoch": 0.15355825857229347, + "grad_norm": 0.7792215943336487, + "learning_rate": 9.86465359149836e-06, + "loss": 0.8721, + "step": 2790 + }, + { + "epoch": 0.15361329737464913, + "grad_norm": 0.8572536110877991, + "learning_rate": 9.864553400388619e-06, + "loss": 0.8378, + "step": 2791 + }, + { + "epoch": 0.1536683361770048, + "grad_norm": 0.7645615339279175, + "learning_rate": 9.864453172718195e-06, + "loss": 0.6909, + "step": 2792 + }, + { + "epoch": 0.15372337497936045, + "grad_norm": 0.7627308964729309, + "learning_rate": 9.864352908487846e-06, + "loss": 0.7918, + "step": 2793 + }, + { + "epoch": 0.1537784137817161, + "grad_norm": 1.0830100774765015, + "learning_rate": 9.86425260769832e-06, + "loss": 0.9007, + "step": 2794 + }, + { + "epoch": 0.15383345258407177, + "grad_norm": 0.7667998671531677, + "learning_rate": 9.864152270350374e-06, + "loss": 0.832, + "step": 2795 + }, + { + "epoch": 0.15388849138642743, + "grad_norm": 0.9967591762542725, + "learning_rate": 9.864051896444764e-06, + "loss": 0.8917, + "step": 2796 + }, + { + "epoch": 0.1539435301887831, + "grad_norm": 0.8948462605476379, + "learning_rate": 9.86395148598224e-06, + "loss": 0.983, + "step": 2797 + }, + { + "epoch": 0.15399856899113876, + "grad_norm": 0.7857423424720764, + "learning_rate": 9.863851038963556e-06, + "loss": 0.7826, + "step": 2798 + }, + { + "epoch": 0.15405360779349442, + "grad_norm": 0.8821337223052979, + "learning_rate": 9.863750555389473e-06, + "loss": 0.8918, + "step": 2799 + }, + { + "epoch": 0.15410864659585008, + "grad_norm": 0.7896875143051147, + "learning_rate": 9.863650035260742e-06, + "loss": 0.8199, + "step": 2800 + }, + { + "epoch": 0.15416368539820574, + "grad_norm": 0.8046941161155701, + "learning_rate": 9.86354947857812e-06, + "loss": 0.8572, + "step": 2801 + }, + { + "epoch": 0.1542187242005614, + "grad_norm": 0.7266830205917358, + "learning_rate": 9.863448885342361e-06, + "loss": 0.8315, + "step": 2802 + }, + { + "epoch": 0.15427376300291706, + "grad_norm": 0.9009475708007812, + "learning_rate": 9.863348255554222e-06, + "loss": 0.7928, + "step": 2803 + }, + { + "epoch": 0.15432880180527273, + "grad_norm": 0.963364839553833, + "learning_rate": 9.863247589214459e-06, + "loss": 0.918, + "step": 2804 + }, + { + "epoch": 0.1543838406076284, + "grad_norm": 0.8278035521507263, + "learning_rate": 9.863146886323829e-06, + "loss": 0.8497, + "step": 2805 + }, + { + "epoch": 0.15443887940998405, + "grad_norm": 0.7360561490058899, + "learning_rate": 9.86304614688309e-06, + "loss": 0.676, + "step": 2806 + }, + { + "epoch": 0.1544939182123397, + "grad_norm": 0.7679837346076965, + "learning_rate": 9.862945370892996e-06, + "loss": 0.8114, + "step": 2807 + }, + { + "epoch": 0.15454895701469537, + "grad_norm": 0.8550567030906677, + "learning_rate": 9.862844558354309e-06, + "loss": 0.8222, + "step": 2808 + }, + { + "epoch": 0.15460399581705103, + "grad_norm": 0.7852397561073303, + "learning_rate": 9.86274370926778e-06, + "loss": 0.7449, + "step": 2809 + }, + { + "epoch": 0.1546590346194067, + "grad_norm": 0.9120833277702332, + "learning_rate": 9.862642823634175e-06, + "loss": 0.8702, + "step": 2810 + }, + { + "epoch": 0.15471407342176235, + "grad_norm": 0.8729703426361084, + "learning_rate": 9.862541901454246e-06, + "loss": 0.8064, + "step": 2811 + }, + { + "epoch": 0.15476911222411802, + "grad_norm": 0.7935470342636108, + "learning_rate": 9.862440942728754e-06, + "loss": 0.8502, + "step": 2812 + }, + { + "epoch": 0.15482415102647368, + "grad_norm": 0.8640689849853516, + "learning_rate": 9.86233994745846e-06, + "loss": 0.8159, + "step": 2813 + }, + { + "epoch": 0.1548791898288293, + "grad_norm": 0.9959222078323364, + "learning_rate": 9.862238915644116e-06, + "loss": 0.7767, + "step": 2814 + }, + { + "epoch": 0.15493422863118497, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.862137847286487e-06, + "loss": 0.8293, + "step": 2815 + }, + { + "epoch": 0.15498926743354063, + "grad_norm": 0.8764606714248657, + "learning_rate": 9.862036742386335e-06, + "loss": 0.856, + "step": 2816 + }, + { + "epoch": 0.1550443062358963, + "grad_norm": 0.743727445602417, + "learning_rate": 9.861935600944413e-06, + "loss": 0.7099, + "step": 2817 + }, + { + "epoch": 0.15509934503825196, + "grad_norm": 0.7866224050521851, + "learning_rate": 9.861834422961485e-06, + "loss": 0.8805, + "step": 2818 + }, + { + "epoch": 0.15515438384060762, + "grad_norm": 0.8333723545074463, + "learning_rate": 9.861733208438311e-06, + "loss": 0.8486, + "step": 2819 + }, + { + "epoch": 0.15520942264296328, + "grad_norm": 0.8261659741401672, + "learning_rate": 9.861631957375652e-06, + "loss": 0.8896, + "step": 2820 + }, + { + "epoch": 0.15526446144531894, + "grad_norm": 0.8381538987159729, + "learning_rate": 9.861530669774268e-06, + "loss": 0.8686, + "step": 2821 + }, + { + "epoch": 0.1553195002476746, + "grad_norm": 0.9184440970420837, + "learning_rate": 9.861429345634923e-06, + "loss": 0.9702, + "step": 2822 + }, + { + "epoch": 0.15537453905003026, + "grad_norm": 0.8170294165611267, + "learning_rate": 9.861327984958374e-06, + "loss": 0.8298, + "step": 2823 + }, + { + "epoch": 0.15542957785238593, + "grad_norm": 0.8361968398094177, + "learning_rate": 9.861226587745385e-06, + "loss": 0.8232, + "step": 2824 + }, + { + "epoch": 0.1554846166547416, + "grad_norm": 0.7437820434570312, + "learning_rate": 9.861125153996718e-06, + "loss": 0.8271, + "step": 2825 + }, + { + "epoch": 0.15553965545709725, + "grad_norm": 0.715887188911438, + "learning_rate": 9.861023683713137e-06, + "loss": 0.7726, + "step": 2826 + }, + { + "epoch": 0.1555946942594529, + "grad_norm": 0.8358462452888489, + "learning_rate": 9.860922176895403e-06, + "loss": 0.8247, + "step": 2827 + }, + { + "epoch": 0.15564973306180857, + "grad_norm": 0.8620158433914185, + "learning_rate": 9.860820633544278e-06, + "loss": 0.8804, + "step": 2828 + }, + { + "epoch": 0.15570477186416423, + "grad_norm": 0.9035346508026123, + "learning_rate": 9.860719053660527e-06, + "loss": 0.7973, + "step": 2829 + }, + { + "epoch": 0.1557598106665199, + "grad_norm": 0.8014782071113586, + "learning_rate": 9.860617437244914e-06, + "loss": 0.7914, + "step": 2830 + }, + { + "epoch": 0.15581484946887555, + "grad_norm": 0.7788864970207214, + "learning_rate": 9.8605157842982e-06, + "loss": 0.7377, + "step": 2831 + }, + { + "epoch": 0.15586988827123122, + "grad_norm": 0.7475222945213318, + "learning_rate": 9.860414094821152e-06, + "loss": 0.7173, + "step": 2832 + }, + { + "epoch": 0.15592492707358688, + "grad_norm": 0.8866652846336365, + "learning_rate": 9.86031236881453e-06, + "loss": 0.8231, + "step": 2833 + }, + { + "epoch": 0.15597996587594254, + "grad_norm": 0.8725677728652954, + "learning_rate": 9.860210606279102e-06, + "loss": 0.9025, + "step": 2834 + }, + { + "epoch": 0.1560350046782982, + "grad_norm": 0.7608423233032227, + "learning_rate": 9.860108807215634e-06, + "loss": 0.8385, + "step": 2835 + }, + { + "epoch": 0.15609004348065386, + "grad_norm": 0.8237566351890564, + "learning_rate": 9.860006971624887e-06, + "loss": 0.8635, + "step": 2836 + }, + { + "epoch": 0.15614508228300952, + "grad_norm": 0.8078347444534302, + "learning_rate": 9.859905099507629e-06, + "loss": 0.7916, + "step": 2837 + }, + { + "epoch": 0.15620012108536518, + "grad_norm": 0.8282070755958557, + "learning_rate": 9.859803190864626e-06, + "loss": 0.8141, + "step": 2838 + }, + { + "epoch": 0.15625515988772085, + "grad_norm": 0.7639191150665283, + "learning_rate": 9.859701245696642e-06, + "loss": 0.7457, + "step": 2839 + }, + { + "epoch": 0.1563101986900765, + "grad_norm": 0.8429144620895386, + "learning_rate": 9.859599264004446e-06, + "loss": 0.9176, + "step": 2840 + }, + { + "epoch": 0.15636523749243217, + "grad_norm": 0.7792791724205017, + "learning_rate": 9.859497245788801e-06, + "loss": 0.8738, + "step": 2841 + }, + { + "epoch": 0.15642027629478783, + "grad_norm": 0.9018417596817017, + "learning_rate": 9.859395191050476e-06, + "loss": 0.841, + "step": 2842 + }, + { + "epoch": 0.1564753150971435, + "grad_norm": 0.7113705277442932, + "learning_rate": 9.859293099790239e-06, + "loss": 0.6576, + "step": 2843 + }, + { + "epoch": 0.15653035389949915, + "grad_norm": 0.8376311659812927, + "learning_rate": 9.859190972008853e-06, + "loss": 0.8559, + "step": 2844 + }, + { + "epoch": 0.15658539270185481, + "grad_norm": 0.7689141035079956, + "learning_rate": 9.859088807707092e-06, + "loss": 0.7844, + "step": 2845 + }, + { + "epoch": 0.15664043150421048, + "grad_norm": 0.7559483647346497, + "learning_rate": 9.858986606885717e-06, + "loss": 0.8676, + "step": 2846 + }, + { + "epoch": 0.15669547030656614, + "grad_norm": 0.7743827700614929, + "learning_rate": 9.8588843695455e-06, + "loss": 0.7995, + "step": 2847 + }, + { + "epoch": 0.1567505091089218, + "grad_norm": 0.8631327152252197, + "learning_rate": 9.85878209568721e-06, + "loss": 0.801, + "step": 2848 + }, + { + "epoch": 0.15680554791127746, + "grad_norm": 0.7454009056091309, + "learning_rate": 9.858679785311613e-06, + "loss": 0.8172, + "step": 2849 + }, + { + "epoch": 0.15686058671363312, + "grad_norm": 0.7915313839912415, + "learning_rate": 9.858577438419479e-06, + "loss": 0.833, + "step": 2850 + }, + { + "epoch": 0.15691562551598878, + "grad_norm": 0.8472526669502258, + "learning_rate": 9.858475055011578e-06, + "loss": 0.8249, + "step": 2851 + }, + { + "epoch": 0.15697066431834444, + "grad_norm": 0.7967580556869507, + "learning_rate": 9.85837263508868e-06, + "loss": 0.7533, + "step": 2852 + }, + { + "epoch": 0.1570257031207001, + "grad_norm": 0.7476257085800171, + "learning_rate": 9.858270178651554e-06, + "loss": 0.7918, + "step": 2853 + }, + { + "epoch": 0.15708074192305577, + "grad_norm": 0.8736184239387512, + "learning_rate": 9.858167685700968e-06, + "loss": 0.8254, + "step": 2854 + }, + { + "epoch": 0.15713578072541143, + "grad_norm": 0.8734819889068604, + "learning_rate": 9.858065156237694e-06, + "loss": 0.749, + "step": 2855 + }, + { + "epoch": 0.1571908195277671, + "grad_norm": 1.0344874858856201, + "learning_rate": 9.857962590262506e-06, + "loss": 0.9578, + "step": 2856 + }, + { + "epoch": 0.15724585833012272, + "grad_norm": 0.81183922290802, + "learning_rate": 9.857859987776168e-06, + "loss": 0.8845, + "step": 2857 + }, + { + "epoch": 0.15730089713247838, + "grad_norm": 0.8252540230751038, + "learning_rate": 9.857757348779456e-06, + "loss": 0.7862, + "step": 2858 + }, + { + "epoch": 0.15735593593483405, + "grad_norm": 0.7468119859695435, + "learning_rate": 9.85765467327314e-06, + "loss": 0.7587, + "step": 2859 + }, + { + "epoch": 0.1574109747371897, + "grad_norm": 0.8095998167991638, + "learning_rate": 9.857551961257993e-06, + "loss": 0.7467, + "step": 2860 + }, + { + "epoch": 0.15746601353954537, + "grad_norm": 0.8908564448356628, + "learning_rate": 9.857449212734785e-06, + "loss": 0.8199, + "step": 2861 + }, + { + "epoch": 0.15752105234190103, + "grad_norm": 0.7605593204498291, + "learning_rate": 9.857346427704288e-06, + "loss": 0.7196, + "step": 2862 + }, + { + "epoch": 0.1575760911442567, + "grad_norm": 0.9250784516334534, + "learning_rate": 9.857243606167276e-06, + "loss": 0.7366, + "step": 2863 + }, + { + "epoch": 0.15763112994661235, + "grad_norm": 0.882796585559845, + "learning_rate": 9.85714074812452e-06, + "loss": 0.8422, + "step": 2864 + }, + { + "epoch": 0.15768616874896801, + "grad_norm": 1.0014574527740479, + "learning_rate": 9.857037853576797e-06, + "loss": 0.8762, + "step": 2865 + }, + { + "epoch": 0.15774120755132368, + "grad_norm": 0.86713045835495, + "learning_rate": 9.856934922524877e-06, + "loss": 0.9282, + "step": 2866 + }, + { + "epoch": 0.15779624635367934, + "grad_norm": 1.1457390785217285, + "learning_rate": 9.856831954969532e-06, + "loss": 0.7947, + "step": 2867 + }, + { + "epoch": 0.157851285156035, + "grad_norm": 0.8902556896209717, + "learning_rate": 9.85672895091154e-06, + "loss": 0.928, + "step": 2868 + }, + { + "epoch": 0.15790632395839066, + "grad_norm": 0.7978467345237732, + "learning_rate": 9.856625910351674e-06, + "loss": 0.7382, + "step": 2869 + }, + { + "epoch": 0.15796136276074632, + "grad_norm": 0.741457462310791, + "learning_rate": 9.856522833290705e-06, + "loss": 0.7736, + "step": 2870 + }, + { + "epoch": 0.15801640156310198, + "grad_norm": 0.8330628871917725, + "learning_rate": 9.856419719729413e-06, + "loss": 0.8396, + "step": 2871 + }, + { + "epoch": 0.15807144036545764, + "grad_norm": 0.8771876692771912, + "learning_rate": 9.85631656966857e-06, + "loss": 0.6669, + "step": 2872 + }, + { + "epoch": 0.1581264791678133, + "grad_norm": 0.8073394298553467, + "learning_rate": 9.85621338310895e-06, + "loss": 0.8206, + "step": 2873 + }, + { + "epoch": 0.15818151797016897, + "grad_norm": 1.1058349609375, + "learning_rate": 9.85611016005133e-06, + "loss": 0.9526, + "step": 2874 + }, + { + "epoch": 0.15823655677252463, + "grad_norm": 0.7734992504119873, + "learning_rate": 9.856006900496488e-06, + "loss": 0.7477, + "step": 2875 + }, + { + "epoch": 0.1582915955748803, + "grad_norm": 0.9053219556808472, + "learning_rate": 9.855903604445196e-06, + "loss": 0.8009, + "step": 2876 + }, + { + "epoch": 0.15834663437723595, + "grad_norm": 0.8774041533470154, + "learning_rate": 9.855800271898233e-06, + "loss": 0.854, + "step": 2877 + }, + { + "epoch": 0.1584016731795916, + "grad_norm": 0.8346550464630127, + "learning_rate": 9.855696902856376e-06, + "loss": 0.7976, + "step": 2878 + }, + { + "epoch": 0.15845671198194727, + "grad_norm": 0.7781139016151428, + "learning_rate": 9.855593497320401e-06, + "loss": 0.7693, + "step": 2879 + }, + { + "epoch": 0.15851175078430293, + "grad_norm": 0.8707864880561829, + "learning_rate": 9.855490055291084e-06, + "loss": 0.882, + "step": 2880 + }, + { + "epoch": 0.1585667895866586, + "grad_norm": 0.7982275485992432, + "learning_rate": 9.855386576769203e-06, + "loss": 0.8457, + "step": 2881 + }, + { + "epoch": 0.15862182838901426, + "grad_norm": 0.7577090263366699, + "learning_rate": 9.855283061755536e-06, + "loss": 0.754, + "step": 2882 + }, + { + "epoch": 0.15867686719136992, + "grad_norm": 0.7422069311141968, + "learning_rate": 9.855179510250863e-06, + "loss": 0.673, + "step": 2883 + }, + { + "epoch": 0.15873190599372558, + "grad_norm": 0.7730041742324829, + "learning_rate": 9.85507592225596e-06, + "loss": 0.7888, + "step": 2884 + }, + { + "epoch": 0.15878694479608124, + "grad_norm": 0.7370560169219971, + "learning_rate": 9.854972297771605e-06, + "loss": 0.7762, + "step": 2885 + }, + { + "epoch": 0.1588419835984369, + "grad_norm": 0.725074291229248, + "learning_rate": 9.854868636798577e-06, + "loss": 0.7951, + "step": 2886 + }, + { + "epoch": 0.15889702240079256, + "grad_norm": 0.8088375926017761, + "learning_rate": 9.854764939337657e-06, + "loss": 0.8557, + "step": 2887 + }, + { + "epoch": 0.15895206120314823, + "grad_norm": 0.8268256187438965, + "learning_rate": 9.854661205389624e-06, + "loss": 0.7641, + "step": 2888 + }, + { + "epoch": 0.1590071000055039, + "grad_norm": 0.8079462051391602, + "learning_rate": 9.854557434955257e-06, + "loss": 0.7947, + "step": 2889 + }, + { + "epoch": 0.15906213880785955, + "grad_norm": 0.8257912993431091, + "learning_rate": 9.854453628035335e-06, + "loss": 0.771, + "step": 2890 + }, + { + "epoch": 0.1591171776102152, + "grad_norm": 0.8901774287223816, + "learning_rate": 9.85434978463064e-06, + "loss": 0.9415, + "step": 2891 + }, + { + "epoch": 0.15917221641257087, + "grad_norm": 0.8283013105392456, + "learning_rate": 9.854245904741948e-06, + "loss": 0.7267, + "step": 2892 + }, + { + "epoch": 0.15922725521492653, + "grad_norm": 0.8665382266044617, + "learning_rate": 9.854141988370045e-06, + "loss": 0.8681, + "step": 2893 + }, + { + "epoch": 0.1592822940172822, + "grad_norm": 0.786494255065918, + "learning_rate": 9.854038035515712e-06, + "loss": 0.7614, + "step": 2894 + }, + { + "epoch": 0.15933733281963786, + "grad_norm": 1.0548759698867798, + "learning_rate": 9.853934046179727e-06, + "loss": 0.861, + "step": 2895 + }, + { + "epoch": 0.15939237162199352, + "grad_norm": 0.8565425276756287, + "learning_rate": 9.853830020362873e-06, + "loss": 0.7858, + "step": 2896 + }, + { + "epoch": 0.15944741042434918, + "grad_norm": 0.7982691526412964, + "learning_rate": 9.853725958065933e-06, + "loss": 0.8797, + "step": 2897 + }, + { + "epoch": 0.15950244922670484, + "grad_norm": 0.8613169193267822, + "learning_rate": 9.853621859289686e-06, + "loss": 0.9217, + "step": 2898 + }, + { + "epoch": 0.1595574880290605, + "grad_norm": 0.950639009475708, + "learning_rate": 9.853517724034918e-06, + "loss": 0.8315, + "step": 2899 + }, + { + "epoch": 0.15961252683141613, + "grad_norm": 0.7940176129341125, + "learning_rate": 9.853413552302409e-06, + "loss": 0.7713, + "step": 2900 + }, + { + "epoch": 0.1596675656337718, + "grad_norm": 0.7716153264045715, + "learning_rate": 9.853309344092944e-06, + "loss": 0.7922, + "step": 2901 + }, + { + "epoch": 0.15972260443612746, + "grad_norm": 0.7626190781593323, + "learning_rate": 9.853205099407303e-06, + "loss": 0.7278, + "step": 2902 + }, + { + "epoch": 0.15977764323848312, + "grad_norm": 0.8523116707801819, + "learning_rate": 9.853100818246274e-06, + "loss": 0.8136, + "step": 2903 + }, + { + "epoch": 0.15983268204083878, + "grad_norm": 0.7636643052101135, + "learning_rate": 9.852996500610637e-06, + "loss": 0.6984, + "step": 2904 + }, + { + "epoch": 0.15988772084319444, + "grad_norm": 0.799201250076294, + "learning_rate": 9.852892146501179e-06, + "loss": 0.8319, + "step": 2905 + }, + { + "epoch": 0.1599427596455501, + "grad_norm": 0.7743694186210632, + "learning_rate": 9.85278775591868e-06, + "loss": 0.81, + "step": 2906 + }, + { + "epoch": 0.15999779844790576, + "grad_norm": 0.8964856863021851, + "learning_rate": 9.85268332886393e-06, + "loss": 0.9227, + "step": 2907 + }, + { + "epoch": 0.16005283725026143, + "grad_norm": 0.8809369802474976, + "learning_rate": 9.852578865337708e-06, + "loss": 0.9285, + "step": 2908 + }, + { + "epoch": 0.1601078760526171, + "grad_norm": 0.8960002064704895, + "learning_rate": 9.852474365340806e-06, + "loss": 0.8611, + "step": 2909 + }, + { + "epoch": 0.16016291485497275, + "grad_norm": 0.7539754509925842, + "learning_rate": 9.852369828874002e-06, + "loss": 0.7455, + "step": 2910 + }, + { + "epoch": 0.1602179536573284, + "grad_norm": 0.8189692497253418, + "learning_rate": 9.852265255938088e-06, + "loss": 0.8321, + "step": 2911 + }, + { + "epoch": 0.16027299245968407, + "grad_norm": 0.8708549737930298, + "learning_rate": 9.852160646533844e-06, + "loss": 0.8373, + "step": 2912 + }, + { + "epoch": 0.16032803126203973, + "grad_norm": 0.7701451778411865, + "learning_rate": 9.852056000662063e-06, + "loss": 0.805, + "step": 2913 + }, + { + "epoch": 0.1603830700643954, + "grad_norm": 0.9111948609352112, + "learning_rate": 9.851951318323526e-06, + "loss": 0.8513, + "step": 2914 + }, + { + "epoch": 0.16043810886675106, + "grad_norm": 0.7863909602165222, + "learning_rate": 9.85184659951902e-06, + "loss": 0.7856, + "step": 2915 + }, + { + "epoch": 0.16049314766910672, + "grad_norm": 0.9000817537307739, + "learning_rate": 9.851741844249336e-06, + "loss": 0.9172, + "step": 2916 + }, + { + "epoch": 0.16054818647146238, + "grad_norm": 1.0953118801116943, + "learning_rate": 9.851637052515259e-06, + "loss": 0.8564, + "step": 2917 + }, + { + "epoch": 0.16060322527381804, + "grad_norm": 0.8405389785766602, + "learning_rate": 9.851532224317575e-06, + "loss": 0.8317, + "step": 2918 + }, + { + "epoch": 0.1606582640761737, + "grad_norm": 0.8524565100669861, + "learning_rate": 9.851427359657075e-06, + "loss": 0.8765, + "step": 2919 + }, + { + "epoch": 0.16071330287852936, + "grad_norm": 0.8234089016914368, + "learning_rate": 9.851322458534546e-06, + "loss": 0.7873, + "step": 2920 + }, + { + "epoch": 0.16076834168088502, + "grad_norm": 0.7879638671875, + "learning_rate": 9.851217520950775e-06, + "loss": 0.8394, + "step": 2921 + }, + { + "epoch": 0.16082338048324069, + "grad_norm": 0.8168820738792419, + "learning_rate": 9.851112546906552e-06, + "loss": 0.8223, + "step": 2922 + }, + { + "epoch": 0.16087841928559635, + "grad_norm": 0.9423845410346985, + "learning_rate": 9.851007536402666e-06, + "loss": 0.9256, + "step": 2923 + }, + { + "epoch": 0.160933458087952, + "grad_norm": 0.7875099778175354, + "learning_rate": 9.850902489439906e-06, + "loss": 0.8199, + "step": 2924 + }, + { + "epoch": 0.16098849689030767, + "grad_norm": 0.6934793591499329, + "learning_rate": 9.85079740601906e-06, + "loss": 0.671, + "step": 2925 + }, + { + "epoch": 0.16104353569266333, + "grad_norm": 0.8172206282615662, + "learning_rate": 9.85069228614092e-06, + "loss": 0.7633, + "step": 2926 + }, + { + "epoch": 0.161098574495019, + "grad_norm": 0.72749263048172, + "learning_rate": 9.850587129806274e-06, + "loss": 0.8719, + "step": 2927 + }, + { + "epoch": 0.16115361329737465, + "grad_norm": 0.8416743874549866, + "learning_rate": 9.850481937015917e-06, + "loss": 0.8438, + "step": 2928 + }, + { + "epoch": 0.16120865209973032, + "grad_norm": 0.7415444850921631, + "learning_rate": 9.850376707770633e-06, + "loss": 0.7673, + "step": 2929 + }, + { + "epoch": 0.16126369090208598, + "grad_norm": 0.9364289045333862, + "learning_rate": 9.850271442071217e-06, + "loss": 0.7224, + "step": 2930 + }, + { + "epoch": 0.16131872970444164, + "grad_norm": 0.7314212918281555, + "learning_rate": 9.85016613991846e-06, + "loss": 0.7759, + "step": 2931 + }, + { + "epoch": 0.1613737685067973, + "grad_norm": 0.8940219283103943, + "learning_rate": 9.850060801313151e-06, + "loss": 0.8432, + "step": 2932 + }, + { + "epoch": 0.16142880730915296, + "grad_norm": 0.7499691843986511, + "learning_rate": 9.849955426256084e-06, + "loss": 0.8171, + "step": 2933 + }, + { + "epoch": 0.16148384611150862, + "grad_norm": 0.8123053312301636, + "learning_rate": 9.84985001474805e-06, + "loss": 0.7839, + "step": 2934 + }, + { + "epoch": 0.16153888491386428, + "grad_norm": 0.819618821144104, + "learning_rate": 9.849744566789842e-06, + "loss": 0.9123, + "step": 2935 + }, + { + "epoch": 0.16159392371621994, + "grad_norm": 0.791088342666626, + "learning_rate": 9.849639082382251e-06, + "loss": 0.8347, + "step": 2936 + }, + { + "epoch": 0.1616489625185756, + "grad_norm": 0.8166706562042236, + "learning_rate": 9.849533561526072e-06, + "loss": 0.8309, + "step": 2937 + }, + { + "epoch": 0.16170400132093127, + "grad_norm": 0.7944774031639099, + "learning_rate": 9.849428004222098e-06, + "loss": 0.8387, + "step": 2938 + }, + { + "epoch": 0.16175904012328693, + "grad_norm": 0.7414719462394714, + "learning_rate": 9.849322410471119e-06, + "loss": 0.71, + "step": 2939 + }, + { + "epoch": 0.1618140789256426, + "grad_norm": 0.8983511924743652, + "learning_rate": 9.849216780273931e-06, + "loss": 0.8902, + "step": 2940 + }, + { + "epoch": 0.16186911772799825, + "grad_norm": 0.9058687686920166, + "learning_rate": 9.849111113631329e-06, + "loss": 0.8804, + "step": 2941 + }, + { + "epoch": 0.1619241565303539, + "grad_norm": 0.948871374130249, + "learning_rate": 9.849005410544105e-06, + "loss": 0.9871, + "step": 2942 + }, + { + "epoch": 0.16197919533270955, + "grad_norm": 0.8240115642547607, + "learning_rate": 9.848899671013055e-06, + "loss": 0.8708, + "step": 2943 + }, + { + "epoch": 0.1620342341350652, + "grad_norm": 0.879953145980835, + "learning_rate": 9.848793895038972e-06, + "loss": 0.9279, + "step": 2944 + }, + { + "epoch": 0.16208927293742087, + "grad_norm": 0.8464690446853638, + "learning_rate": 9.848688082622653e-06, + "loss": 0.8418, + "step": 2945 + }, + { + "epoch": 0.16214431173977653, + "grad_norm": 0.8990732431411743, + "learning_rate": 9.848582233764891e-06, + "loss": 0.8622, + "step": 2946 + }, + { + "epoch": 0.1621993505421322, + "grad_norm": 0.8054911494255066, + "learning_rate": 9.848476348466483e-06, + "loss": 0.8295, + "step": 2947 + }, + { + "epoch": 0.16225438934448785, + "grad_norm": 0.7904845476150513, + "learning_rate": 9.848370426728226e-06, + "loss": 0.7777, + "step": 2948 + }, + { + "epoch": 0.16230942814684352, + "grad_norm": 1.0143954753875732, + "learning_rate": 9.848264468550915e-06, + "loss": 0.8556, + "step": 2949 + }, + { + "epoch": 0.16236446694919918, + "grad_norm": 0.7201125621795654, + "learning_rate": 9.848158473935344e-06, + "loss": 0.7981, + "step": 2950 + }, + { + "epoch": 0.16241950575155484, + "grad_norm": 0.8322157263755798, + "learning_rate": 9.848052442882312e-06, + "loss": 0.8323, + "step": 2951 + }, + { + "epoch": 0.1624745445539105, + "grad_norm": 0.7740346193313599, + "learning_rate": 9.847946375392617e-06, + "loss": 0.8355, + "step": 2952 + }, + { + "epoch": 0.16252958335626616, + "grad_norm": 0.8955645561218262, + "learning_rate": 9.847840271467053e-06, + "loss": 0.7161, + "step": 2953 + }, + { + "epoch": 0.16258462215862182, + "grad_norm": 0.800364077091217, + "learning_rate": 9.847734131106421e-06, + "loss": 0.8165, + "step": 2954 + }, + { + "epoch": 0.16263966096097748, + "grad_norm": 0.8305484056472778, + "learning_rate": 9.847627954311516e-06, + "loss": 0.7846, + "step": 2955 + }, + { + "epoch": 0.16269469976333314, + "grad_norm": 0.7354590892791748, + "learning_rate": 9.847521741083136e-06, + "loss": 0.7743, + "step": 2956 + }, + { + "epoch": 0.1627497385656888, + "grad_norm": 0.8173812627792358, + "learning_rate": 9.847415491422083e-06, + "loss": 0.8626, + "step": 2957 + }, + { + "epoch": 0.16280477736804447, + "grad_norm": 0.7959356307983398, + "learning_rate": 9.84730920532915e-06, + "loss": 0.8016, + "step": 2958 + }, + { + "epoch": 0.16285981617040013, + "grad_norm": 0.8256500363349915, + "learning_rate": 9.84720288280514e-06, + "loss": 0.7407, + "step": 2959 + }, + { + "epoch": 0.1629148549727558, + "grad_norm": 0.8522148728370667, + "learning_rate": 9.84709652385085e-06, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.16296989377511145, + "grad_norm": 0.7791039943695068, + "learning_rate": 9.84699012846708e-06, + "loss": 0.7239, + "step": 2961 + }, + { + "epoch": 0.1630249325774671, + "grad_norm": 0.84294193983078, + "learning_rate": 9.84688369665463e-06, + "loss": 0.7498, + "step": 2962 + }, + { + "epoch": 0.16307997137982277, + "grad_norm": 0.7948899865150452, + "learning_rate": 9.846777228414299e-06, + "loss": 0.7713, + "step": 2963 + }, + { + "epoch": 0.16313501018217844, + "grad_norm": 0.6673180460929871, + "learning_rate": 9.846670723746888e-06, + "loss": 0.6759, + "step": 2964 + }, + { + "epoch": 0.1631900489845341, + "grad_norm": 0.8141015768051147, + "learning_rate": 9.846564182653199e-06, + "loss": 0.7928, + "step": 2965 + }, + { + "epoch": 0.16324508778688976, + "grad_norm": 0.967830240726471, + "learning_rate": 9.846457605134028e-06, + "loss": 0.823, + "step": 2966 + }, + { + "epoch": 0.16330012658924542, + "grad_norm": 0.8099361658096313, + "learning_rate": 9.84635099119018e-06, + "loss": 0.8724, + "step": 2967 + }, + { + "epoch": 0.16335516539160108, + "grad_norm": 0.7913978099822998, + "learning_rate": 9.846244340822456e-06, + "loss": 0.7106, + "step": 2968 + }, + { + "epoch": 0.16341020419395674, + "grad_norm": 0.8308563828468323, + "learning_rate": 9.846137654031655e-06, + "loss": 0.7631, + "step": 2969 + }, + { + "epoch": 0.1634652429963124, + "grad_norm": 0.8634191751480103, + "learning_rate": 9.846030930818582e-06, + "loss": 0.7363, + "step": 2970 + }, + { + "epoch": 0.16352028179866807, + "grad_norm": 0.8936432600021362, + "learning_rate": 9.845924171184038e-06, + "loss": 0.8714, + "step": 2971 + }, + { + "epoch": 0.16357532060102373, + "grad_norm": 0.8776300549507141, + "learning_rate": 9.845817375128825e-06, + "loss": 0.914, + "step": 2972 + }, + { + "epoch": 0.1636303594033794, + "grad_norm": 0.8793039321899414, + "learning_rate": 9.845710542653745e-06, + "loss": 0.7999, + "step": 2973 + }, + { + "epoch": 0.16368539820573505, + "grad_norm": 0.8391743302345276, + "learning_rate": 9.845603673759603e-06, + "loss": 0.8124, + "step": 2974 + }, + { + "epoch": 0.1637404370080907, + "grad_norm": 0.8487186431884766, + "learning_rate": 9.845496768447199e-06, + "loss": 0.8551, + "step": 2975 + }, + { + "epoch": 0.16379547581044637, + "grad_norm": 0.7780638933181763, + "learning_rate": 9.845389826717339e-06, + "loss": 0.7281, + "step": 2976 + }, + { + "epoch": 0.16385051461280203, + "grad_norm": 0.7209637761116028, + "learning_rate": 9.845282848570825e-06, + "loss": 0.6737, + "step": 2977 + }, + { + "epoch": 0.1639055534151577, + "grad_norm": 0.8414756059646606, + "learning_rate": 9.845175834008464e-06, + "loss": 0.8003, + "step": 2978 + }, + { + "epoch": 0.16396059221751336, + "grad_norm": 1.2730679512023926, + "learning_rate": 9.845068783031057e-06, + "loss": 0.8243, + "step": 2979 + }, + { + "epoch": 0.16401563101986902, + "grad_norm": 0.8573475480079651, + "learning_rate": 9.844961695639413e-06, + "loss": 0.7844, + "step": 2980 + }, + { + "epoch": 0.16407066982222468, + "grad_norm": 0.8029958605766296, + "learning_rate": 9.84485457183433e-06, + "loss": 0.7722, + "step": 2981 + }, + { + "epoch": 0.16412570862458034, + "grad_norm": 0.7839805483818054, + "learning_rate": 9.844747411616619e-06, + "loss": 0.8146, + "step": 2982 + }, + { + "epoch": 0.164180747426936, + "grad_norm": 0.7563499212265015, + "learning_rate": 9.844640214987082e-06, + "loss": 0.6909, + "step": 2983 + }, + { + "epoch": 0.16423578622929166, + "grad_norm": 0.7199193239212036, + "learning_rate": 9.844532981946527e-06, + "loss": 0.702, + "step": 2984 + }, + { + "epoch": 0.16429082503164733, + "grad_norm": 0.7519383430480957, + "learning_rate": 9.844425712495758e-06, + "loss": 0.6493, + "step": 2985 + }, + { + "epoch": 0.16434586383400296, + "grad_norm": 0.7493193745613098, + "learning_rate": 9.844318406635584e-06, + "loss": 0.8318, + "step": 2986 + }, + { + "epoch": 0.16440090263635862, + "grad_norm": 0.7951106429100037, + "learning_rate": 9.84421106436681e-06, + "loss": 0.923, + "step": 2987 + }, + { + "epoch": 0.16445594143871428, + "grad_norm": 0.8350820541381836, + "learning_rate": 9.844103685690238e-06, + "loss": 0.8091, + "step": 2988 + }, + { + "epoch": 0.16451098024106994, + "grad_norm": 0.773932695388794, + "learning_rate": 9.843996270606683e-06, + "loss": 0.8016, + "step": 2989 + }, + { + "epoch": 0.1645660190434256, + "grad_norm": 0.8208432793617249, + "learning_rate": 9.843888819116947e-06, + "loss": 0.7704, + "step": 2990 + }, + { + "epoch": 0.16462105784578127, + "grad_norm": 0.8552223443984985, + "learning_rate": 9.84378133122184e-06, + "loss": 0.8519, + "step": 2991 + }, + { + "epoch": 0.16467609664813693, + "grad_norm": 0.8015661835670471, + "learning_rate": 9.84367380692217e-06, + "loss": 0.8389, + "step": 2992 + }, + { + "epoch": 0.1647311354504926, + "grad_norm": 0.7828749418258667, + "learning_rate": 9.843566246218743e-06, + "loss": 0.7385, + "step": 2993 + }, + { + "epoch": 0.16478617425284825, + "grad_norm": 0.7761647701263428, + "learning_rate": 9.84345864911237e-06, + "loss": 0.8419, + "step": 2994 + }, + { + "epoch": 0.1648412130552039, + "grad_norm": 0.8839839100837708, + "learning_rate": 9.843351015603857e-06, + "loss": 0.8069, + "step": 2995 + }, + { + "epoch": 0.16489625185755957, + "grad_norm": 0.8611735105514526, + "learning_rate": 9.843243345694014e-06, + "loss": 0.9406, + "step": 2996 + }, + { + "epoch": 0.16495129065991523, + "grad_norm": 0.9042683839797974, + "learning_rate": 9.84313563938365e-06, + "loss": 0.821, + "step": 2997 + }, + { + "epoch": 0.1650063294622709, + "grad_norm": 0.8333690762519836, + "learning_rate": 9.843027896673577e-06, + "loss": 0.781, + "step": 2998 + }, + { + "epoch": 0.16506136826462656, + "grad_norm": 0.819922924041748, + "learning_rate": 9.8429201175646e-06, + "loss": 0.869, + "step": 2999 + }, + { + "epoch": 0.16511640706698222, + "grad_norm": 0.8349948525428772, + "learning_rate": 9.842812302057534e-06, + "loss": 0.9271, + "step": 3000 + }, + { + "epoch": 0.16517144586933788, + "grad_norm": 0.8981684446334839, + "learning_rate": 9.842704450153187e-06, + "loss": 0.7384, + "step": 3001 + }, + { + "epoch": 0.16522648467169354, + "grad_norm": 0.839133083820343, + "learning_rate": 9.842596561852369e-06, + "loss": 0.9016, + "step": 3002 + }, + { + "epoch": 0.1652815234740492, + "grad_norm": 0.8303349614143372, + "learning_rate": 9.842488637155891e-06, + "loss": 0.7488, + "step": 3003 + }, + { + "epoch": 0.16533656227640486, + "grad_norm": 0.8748323917388916, + "learning_rate": 9.842380676064566e-06, + "loss": 0.8163, + "step": 3004 + }, + { + "epoch": 0.16539160107876053, + "grad_norm": 0.782844603061676, + "learning_rate": 9.842272678579203e-06, + "loss": 0.8465, + "step": 3005 + }, + { + "epoch": 0.1654466398811162, + "grad_norm": 0.8068844676017761, + "learning_rate": 9.842164644700615e-06, + "loss": 0.8856, + "step": 3006 + }, + { + "epoch": 0.16550167868347185, + "grad_norm": 0.9174006581306458, + "learning_rate": 9.842056574429615e-06, + "loss": 0.7748, + "step": 3007 + }, + { + "epoch": 0.1655567174858275, + "grad_norm": 0.7453809380531311, + "learning_rate": 9.841948467767012e-06, + "loss": 0.7565, + "step": 3008 + }, + { + "epoch": 0.16561175628818317, + "grad_norm": 0.8408182859420776, + "learning_rate": 9.841840324713622e-06, + "loss": 0.7345, + "step": 3009 + }, + { + "epoch": 0.16566679509053883, + "grad_norm": 0.8599638938903809, + "learning_rate": 9.841732145270254e-06, + "loss": 0.8163, + "step": 3010 + }, + { + "epoch": 0.1657218338928945, + "grad_norm": 0.877616822719574, + "learning_rate": 9.841623929437725e-06, + "loss": 0.8685, + "step": 3011 + }, + { + "epoch": 0.16577687269525015, + "grad_norm": 0.7765643000602722, + "learning_rate": 9.841515677216846e-06, + "loss": 0.7281, + "step": 3012 + }, + { + "epoch": 0.16583191149760582, + "grad_norm": 0.7891712784767151, + "learning_rate": 9.841407388608431e-06, + "loss": 0.8618, + "step": 3013 + }, + { + "epoch": 0.16588695029996148, + "grad_norm": 0.9215571284294128, + "learning_rate": 9.841299063613295e-06, + "loss": 0.8709, + "step": 3014 + }, + { + "epoch": 0.16594198910231714, + "grad_norm": 0.8428288698196411, + "learning_rate": 9.841190702232249e-06, + "loss": 0.8227, + "step": 3015 + }, + { + "epoch": 0.1659970279046728, + "grad_norm": 0.9294042587280273, + "learning_rate": 9.841082304466112e-06, + "loss": 0.8203, + "step": 3016 + }, + { + "epoch": 0.16605206670702846, + "grad_norm": 0.7530880570411682, + "learning_rate": 9.840973870315695e-06, + "loss": 0.7681, + "step": 3017 + }, + { + "epoch": 0.16610710550938412, + "grad_norm": 1.0149626731872559, + "learning_rate": 9.840865399781814e-06, + "loss": 0.9212, + "step": 3018 + }, + { + "epoch": 0.16616214431173978, + "grad_norm": 0.7595353722572327, + "learning_rate": 9.840756892865285e-06, + "loss": 0.795, + "step": 3019 + }, + { + "epoch": 0.16621718311409545, + "grad_norm": 0.7893253564834595, + "learning_rate": 9.840648349566924e-06, + "loss": 0.8147, + "step": 3020 + }, + { + "epoch": 0.1662722219164511, + "grad_norm": 0.8190789222717285, + "learning_rate": 9.840539769887543e-06, + "loss": 0.7233, + "step": 3021 + }, + { + "epoch": 0.16632726071880677, + "grad_norm": 0.7771229147911072, + "learning_rate": 9.840431153827963e-06, + "loss": 0.7172, + "step": 3022 + }, + { + "epoch": 0.16638229952116243, + "grad_norm": 0.7379328012466431, + "learning_rate": 9.840322501388998e-06, + "loss": 0.7603, + "step": 3023 + }, + { + "epoch": 0.1664373383235181, + "grad_norm": 0.9488499760627747, + "learning_rate": 9.840213812571464e-06, + "loss": 0.8025, + "step": 3024 + }, + { + "epoch": 0.16649237712587375, + "grad_norm": 0.7135865092277527, + "learning_rate": 9.84010508737618e-06, + "loss": 0.7412, + "step": 3025 + }, + { + "epoch": 0.16654741592822941, + "grad_norm": 1.6780112981796265, + "learning_rate": 9.83999632580396e-06, + "loss": 0.9231, + "step": 3026 + }, + { + "epoch": 0.16660245473058508, + "grad_norm": 0.8815935850143433, + "learning_rate": 9.839887527855623e-06, + "loss": 0.7903, + "step": 3027 + }, + { + "epoch": 0.16665749353294074, + "grad_norm": 0.8942261338233948, + "learning_rate": 9.83977869353199e-06, + "loss": 0.8328, + "step": 3028 + }, + { + "epoch": 0.16671253233529637, + "grad_norm": 0.7866815328598022, + "learning_rate": 9.839669822833873e-06, + "loss": 0.8483, + "step": 3029 + }, + { + "epoch": 0.16676757113765203, + "grad_norm": 0.8133070468902588, + "learning_rate": 9.839560915762094e-06, + "loss": 0.8665, + "step": 3030 + }, + { + "epoch": 0.1668226099400077, + "grad_norm": 0.7442927360534668, + "learning_rate": 9.839451972317469e-06, + "loss": 0.6296, + "step": 3031 + }, + { + "epoch": 0.16687764874236335, + "grad_norm": 0.7505021691322327, + "learning_rate": 9.83934299250082e-06, + "loss": 0.7976, + "step": 3032 + }, + { + "epoch": 0.16693268754471902, + "grad_norm": 0.8310422897338867, + "learning_rate": 9.839233976312964e-06, + "loss": 0.9022, + "step": 3033 + }, + { + "epoch": 0.16698772634707468, + "grad_norm": 0.8175413012504578, + "learning_rate": 9.839124923754721e-06, + "loss": 0.8653, + "step": 3034 + }, + { + "epoch": 0.16704276514943034, + "grad_norm": 0.7963089346885681, + "learning_rate": 9.839015834826912e-06, + "loss": 0.7888, + "step": 3035 + }, + { + "epoch": 0.167097803951786, + "grad_norm": 0.8923391699790955, + "learning_rate": 9.838906709530353e-06, + "loss": 0.9396, + "step": 3036 + }, + { + "epoch": 0.16715284275414166, + "grad_norm": 0.7851678133010864, + "learning_rate": 9.838797547865869e-06, + "loss": 0.8163, + "step": 3037 + }, + { + "epoch": 0.16720788155649732, + "grad_norm": 0.817877471446991, + "learning_rate": 9.838688349834275e-06, + "loss": 0.8928, + "step": 3038 + }, + { + "epoch": 0.16726292035885298, + "grad_norm": 0.7603926062583923, + "learning_rate": 9.838579115436395e-06, + "loss": 0.7418, + "step": 3039 + }, + { + "epoch": 0.16731795916120865, + "grad_norm": 0.8086446523666382, + "learning_rate": 9.83846984467305e-06, + "loss": 0.8017, + "step": 3040 + }, + { + "epoch": 0.1673729979635643, + "grad_norm": 1.4745439291000366, + "learning_rate": 9.838360537545061e-06, + "loss": 0.7964, + "step": 3041 + }, + { + "epoch": 0.16742803676591997, + "grad_norm": 0.778404176235199, + "learning_rate": 9.83825119405325e-06, + "loss": 0.7395, + "step": 3042 + }, + { + "epoch": 0.16748307556827563, + "grad_norm": 0.8245886564254761, + "learning_rate": 9.838141814198439e-06, + "loss": 0.8697, + "step": 3043 + }, + { + "epoch": 0.1675381143706313, + "grad_norm": 0.8395472764968872, + "learning_rate": 9.838032397981448e-06, + "loss": 0.8545, + "step": 3044 + }, + { + "epoch": 0.16759315317298695, + "grad_norm": 0.8973744511604309, + "learning_rate": 9.8379229454031e-06, + "loss": 0.8999, + "step": 3045 + }, + { + "epoch": 0.16764819197534261, + "grad_norm": 1.2034368515014648, + "learning_rate": 9.837813456464219e-06, + "loss": 0.9039, + "step": 3046 + }, + { + "epoch": 0.16770323077769828, + "grad_norm": 0.862167477607727, + "learning_rate": 9.837703931165625e-06, + "loss": 0.889, + "step": 3047 + }, + { + "epoch": 0.16775826958005394, + "grad_norm": 0.7624714970588684, + "learning_rate": 9.837594369508146e-06, + "loss": 0.7072, + "step": 3048 + }, + { + "epoch": 0.1678133083824096, + "grad_norm": 0.7771621346473694, + "learning_rate": 9.8374847714926e-06, + "loss": 0.8769, + "step": 3049 + }, + { + "epoch": 0.16786834718476526, + "grad_norm": 0.7834492921829224, + "learning_rate": 9.837375137119816e-06, + "loss": 0.841, + "step": 3050 + }, + { + "epoch": 0.16792338598712092, + "grad_norm": 0.8175067901611328, + "learning_rate": 9.837265466390612e-06, + "loss": 0.8149, + "step": 3051 + }, + { + "epoch": 0.16797842478947658, + "grad_norm": 0.7474493384361267, + "learning_rate": 9.83715575930582e-06, + "loss": 0.7716, + "step": 3052 + }, + { + "epoch": 0.16803346359183224, + "grad_norm": 1.1263303756713867, + "learning_rate": 9.837046015866257e-06, + "loss": 0.8026, + "step": 3053 + }, + { + "epoch": 0.1680885023941879, + "grad_norm": 0.8741740584373474, + "learning_rate": 9.836936236072752e-06, + "loss": 0.8795, + "step": 3054 + }, + { + "epoch": 0.16814354119654357, + "grad_norm": 0.8108506798744202, + "learning_rate": 9.83682641992613e-06, + "loss": 0.7682, + "step": 3055 + }, + { + "epoch": 0.16819857999889923, + "grad_norm": 0.9380543231964111, + "learning_rate": 9.836716567427213e-06, + "loss": 0.8739, + "step": 3056 + }, + { + "epoch": 0.1682536188012549, + "grad_norm": 0.7755940556526184, + "learning_rate": 9.83660667857683e-06, + "loss": 0.7287, + "step": 3057 + }, + { + "epoch": 0.16830865760361055, + "grad_norm": 0.808907151222229, + "learning_rate": 9.836496753375807e-06, + "loss": 0.7988, + "step": 3058 + }, + { + "epoch": 0.1683636964059662, + "grad_norm": 1.1496524810791016, + "learning_rate": 9.836386791824967e-06, + "loss": 0.8621, + "step": 3059 + }, + { + "epoch": 0.16841873520832187, + "grad_norm": 0.8550384640693665, + "learning_rate": 9.83627679392514e-06, + "loss": 0.913, + "step": 3060 + }, + { + "epoch": 0.16847377401067753, + "grad_norm": 0.761142909526825, + "learning_rate": 9.83616675967715e-06, + "loss": 0.7271, + "step": 3061 + }, + { + "epoch": 0.1685288128130332, + "grad_norm": 0.8496200442314148, + "learning_rate": 9.836056689081828e-06, + "loss": 0.7885, + "step": 3062 + }, + { + "epoch": 0.16858385161538886, + "grad_norm": 0.8310382962226868, + "learning_rate": 9.835946582139996e-06, + "loss": 0.858, + "step": 3063 + }, + { + "epoch": 0.16863889041774452, + "grad_norm": 0.7870821952819824, + "learning_rate": 9.835836438852485e-06, + "loss": 0.7791, + "step": 3064 + }, + { + "epoch": 0.16869392922010018, + "grad_norm": 0.7170534729957581, + "learning_rate": 9.83572625922012e-06, + "loss": 0.6666, + "step": 3065 + }, + { + "epoch": 0.16874896802245584, + "grad_norm": 0.9764187932014465, + "learning_rate": 9.835616043243732e-06, + "loss": 0.8341, + "step": 3066 + }, + { + "epoch": 0.1688040068248115, + "grad_norm": 0.7453315258026123, + "learning_rate": 9.83550579092415e-06, + "loss": 0.7032, + "step": 3067 + }, + { + "epoch": 0.16885904562716716, + "grad_norm": 0.9205759763717651, + "learning_rate": 9.835395502262196e-06, + "loss": 0.813, + "step": 3068 + }, + { + "epoch": 0.16891408442952283, + "grad_norm": 0.8152205944061279, + "learning_rate": 9.835285177258708e-06, + "loss": 0.8275, + "step": 3069 + }, + { + "epoch": 0.1689691232318785, + "grad_norm": 0.8065707087516785, + "learning_rate": 9.83517481591451e-06, + "loss": 0.8648, + "step": 3070 + }, + { + "epoch": 0.16902416203423415, + "grad_norm": 0.7774410247802734, + "learning_rate": 9.835064418230432e-06, + "loss": 0.7818, + "step": 3071 + }, + { + "epoch": 0.16907920083658978, + "grad_norm": 0.8591069579124451, + "learning_rate": 9.834953984207305e-06, + "loss": 0.8055, + "step": 3072 + }, + { + "epoch": 0.16913423963894544, + "grad_norm": 0.7421612739562988, + "learning_rate": 9.834843513845958e-06, + "loss": 0.7543, + "step": 3073 + }, + { + "epoch": 0.1691892784413011, + "grad_norm": 0.7855183482170105, + "learning_rate": 9.83473300714722e-06, + "loss": 0.7011, + "step": 3074 + }, + { + "epoch": 0.16924431724365677, + "grad_norm": 0.8061636686325073, + "learning_rate": 9.834622464111924e-06, + "loss": 0.8096, + "step": 3075 + }, + { + "epoch": 0.16929935604601243, + "grad_norm": 0.8048406839370728, + "learning_rate": 9.834511884740898e-06, + "loss": 0.8166, + "step": 3076 + }, + { + "epoch": 0.1693543948483681, + "grad_norm": 0.8776549696922302, + "learning_rate": 9.834401269034977e-06, + "loss": 0.8169, + "step": 3077 + }, + { + "epoch": 0.16940943365072375, + "grad_norm": 1.0208356380462646, + "learning_rate": 9.83429061699499e-06, + "loss": 0.6976, + "step": 3078 + }, + { + "epoch": 0.1694644724530794, + "grad_norm": 0.7641016840934753, + "learning_rate": 9.834179928621767e-06, + "loss": 0.7109, + "step": 3079 + }, + { + "epoch": 0.16951951125543507, + "grad_norm": 0.7648905515670776, + "learning_rate": 9.834069203916143e-06, + "loss": 0.7927, + "step": 3080 + }, + { + "epoch": 0.16957455005779073, + "grad_norm": 0.7898744344711304, + "learning_rate": 9.833958442878948e-06, + "loss": 0.7911, + "step": 3081 + }, + { + "epoch": 0.1696295888601464, + "grad_norm": 0.8812462687492371, + "learning_rate": 9.833847645511016e-06, + "loss": 0.8381, + "step": 3082 + }, + { + "epoch": 0.16968462766250206, + "grad_norm": 0.8141197562217712, + "learning_rate": 9.833736811813179e-06, + "loss": 0.7422, + "step": 3083 + }, + { + "epoch": 0.16973966646485772, + "grad_norm": 0.7860949635505676, + "learning_rate": 9.83362594178627e-06, + "loss": 0.7568, + "step": 3084 + }, + { + "epoch": 0.16979470526721338, + "grad_norm": 0.6688396334648132, + "learning_rate": 9.833515035431123e-06, + "loss": 0.7143, + "step": 3085 + }, + { + "epoch": 0.16984974406956904, + "grad_norm": 0.7525103092193604, + "learning_rate": 9.833404092748569e-06, + "loss": 0.8026, + "step": 3086 + }, + { + "epoch": 0.1699047828719247, + "grad_norm": 0.8505181670188904, + "learning_rate": 9.833293113739444e-06, + "loss": 0.8894, + "step": 3087 + }, + { + "epoch": 0.16995982167428036, + "grad_norm": 0.8432300090789795, + "learning_rate": 9.833182098404583e-06, + "loss": 0.7801, + "step": 3088 + }, + { + "epoch": 0.17001486047663603, + "grad_norm": 0.7655903100967407, + "learning_rate": 9.833071046744819e-06, + "loss": 0.7838, + "step": 3089 + }, + { + "epoch": 0.1700698992789917, + "grad_norm": 0.8436369895935059, + "learning_rate": 9.832959958760986e-06, + "loss": 0.8636, + "step": 3090 + }, + { + "epoch": 0.17012493808134735, + "grad_norm": 0.7880234122276306, + "learning_rate": 9.83284883445392e-06, + "loss": 0.7701, + "step": 3091 + }, + { + "epoch": 0.170179976883703, + "grad_norm": 0.7713757753372192, + "learning_rate": 9.832737673824455e-06, + "loss": 0.8652, + "step": 3092 + }, + { + "epoch": 0.17023501568605867, + "grad_norm": 0.7905295491218567, + "learning_rate": 9.832626476873428e-06, + "loss": 0.8666, + "step": 3093 + }, + { + "epoch": 0.17029005448841433, + "grad_norm": 0.7589883804321289, + "learning_rate": 9.832515243601675e-06, + "loss": 0.8051, + "step": 3094 + }, + { + "epoch": 0.17034509329077, + "grad_norm": 0.9068838953971863, + "learning_rate": 9.83240397401003e-06, + "loss": 0.9037, + "step": 3095 + }, + { + "epoch": 0.17040013209312566, + "grad_norm": 0.7465278506278992, + "learning_rate": 9.83229266809933e-06, + "loss": 0.7425, + "step": 3096 + }, + { + "epoch": 0.17045517089548132, + "grad_norm": 0.8111177086830139, + "learning_rate": 9.83218132587041e-06, + "loss": 0.8034, + "step": 3097 + }, + { + "epoch": 0.17051020969783698, + "grad_norm": 1.1007672548294067, + "learning_rate": 9.832069947324112e-06, + "loss": 0.9139, + "step": 3098 + }, + { + "epoch": 0.17056524850019264, + "grad_norm": 0.881179690361023, + "learning_rate": 9.831958532461269e-06, + "loss": 0.9062, + "step": 3099 + }, + { + "epoch": 0.1706202873025483, + "grad_norm": 0.8012413382530212, + "learning_rate": 9.831847081282718e-06, + "loss": 0.7956, + "step": 3100 + }, + { + "epoch": 0.17067532610490396, + "grad_norm": 0.741731584072113, + "learning_rate": 9.831735593789298e-06, + "loss": 0.8754, + "step": 3101 + }, + { + "epoch": 0.17073036490725962, + "grad_norm": 0.8945604562759399, + "learning_rate": 9.831624069981848e-06, + "loss": 0.8293, + "step": 3102 + }, + { + "epoch": 0.17078540370961529, + "grad_norm": 0.7865545749664307, + "learning_rate": 9.831512509861203e-06, + "loss": 0.7812, + "step": 3103 + }, + { + "epoch": 0.17084044251197095, + "grad_norm": 0.832847535610199, + "learning_rate": 9.831400913428205e-06, + "loss": 0.8925, + "step": 3104 + }, + { + "epoch": 0.1708954813143266, + "grad_norm": 0.7374216914176941, + "learning_rate": 9.83128928068369e-06, + "loss": 0.8275, + "step": 3105 + }, + { + "epoch": 0.17095052011668227, + "grad_norm": 0.748725414276123, + "learning_rate": 9.831177611628497e-06, + "loss": 0.8364, + "step": 3106 + }, + { + "epoch": 0.17100555891903793, + "grad_norm": 0.810276448726654, + "learning_rate": 9.831065906263468e-06, + "loss": 0.861, + "step": 3107 + }, + { + "epoch": 0.1710605977213936, + "grad_norm": 0.7607758641242981, + "learning_rate": 9.83095416458944e-06, + "loss": 0.7989, + "step": 3108 + }, + { + "epoch": 0.17111563652374925, + "grad_norm": 0.7206127047538757, + "learning_rate": 9.830842386607253e-06, + "loss": 0.7187, + "step": 3109 + }, + { + "epoch": 0.17117067532610492, + "grad_norm": 0.7775895595550537, + "learning_rate": 9.83073057231775e-06, + "loss": 0.8008, + "step": 3110 + }, + { + "epoch": 0.17122571412846058, + "grad_norm": 0.8351094722747803, + "learning_rate": 9.830618721721768e-06, + "loss": 0.8025, + "step": 3111 + }, + { + "epoch": 0.17128075293081624, + "grad_norm": 0.8090646266937256, + "learning_rate": 9.830506834820148e-06, + "loss": 0.8012, + "step": 3112 + }, + { + "epoch": 0.1713357917331719, + "grad_norm": 0.7762801051139832, + "learning_rate": 9.830394911613733e-06, + "loss": 0.8428, + "step": 3113 + }, + { + "epoch": 0.17139083053552756, + "grad_norm": 0.8117541074752808, + "learning_rate": 9.83028295210336e-06, + "loss": 0.8566, + "step": 3114 + }, + { + "epoch": 0.1714458693378832, + "grad_norm": 0.8786184787750244, + "learning_rate": 9.830170956289876e-06, + "loss": 0.8386, + "step": 3115 + }, + { + "epoch": 0.17150090814023886, + "grad_norm": 1.0181046724319458, + "learning_rate": 9.83005892417412e-06, + "loss": 0.8555, + "step": 3116 + }, + { + "epoch": 0.17155594694259452, + "grad_norm": 0.8236173391342163, + "learning_rate": 9.829946855756934e-06, + "loss": 0.7933, + "step": 3117 + }, + { + "epoch": 0.17161098574495018, + "grad_norm": 0.8058149814605713, + "learning_rate": 9.829834751039157e-06, + "loss": 0.842, + "step": 3118 + }, + { + "epoch": 0.17166602454730584, + "grad_norm": 0.7419908046722412, + "learning_rate": 9.82972261002164e-06, + "loss": 0.8397, + "step": 3119 + }, + { + "epoch": 0.1717210633496615, + "grad_norm": 0.7528164982795715, + "learning_rate": 9.829610432705216e-06, + "loss": 0.7931, + "step": 3120 + }, + { + "epoch": 0.17177610215201716, + "grad_norm": 0.7357296943664551, + "learning_rate": 9.829498219090736e-06, + "loss": 0.8089, + "step": 3121 + }, + { + "epoch": 0.17183114095437282, + "grad_norm": 0.7635773420333862, + "learning_rate": 9.829385969179039e-06, + "loss": 0.7442, + "step": 3122 + }, + { + "epoch": 0.17188617975672849, + "grad_norm": 0.8200171589851379, + "learning_rate": 9.82927368297097e-06, + "loss": 0.757, + "step": 3123 + }, + { + "epoch": 0.17194121855908415, + "grad_norm": 0.8367171287536621, + "learning_rate": 9.829161360467374e-06, + "loss": 0.915, + "step": 3124 + }, + { + "epoch": 0.1719962573614398, + "grad_norm": 0.8460778594017029, + "learning_rate": 9.829049001669091e-06, + "loss": 0.8568, + "step": 3125 + }, + { + "epoch": 0.17205129616379547, + "grad_norm": 0.7301799058914185, + "learning_rate": 9.82893660657697e-06, + "loss": 0.8041, + "step": 3126 + }, + { + "epoch": 0.17210633496615113, + "grad_norm": 0.7858132123947144, + "learning_rate": 9.828824175191854e-06, + "loss": 0.8367, + "step": 3127 + }, + { + "epoch": 0.1721613737685068, + "grad_norm": 0.8118360042572021, + "learning_rate": 9.82871170751459e-06, + "loss": 0.85, + "step": 3128 + }, + { + "epoch": 0.17221641257086245, + "grad_norm": 0.9020261764526367, + "learning_rate": 9.828599203546019e-06, + "loss": 0.789, + "step": 3129 + }, + { + "epoch": 0.17227145137321812, + "grad_norm": 0.8194546699523926, + "learning_rate": 9.828486663286989e-06, + "loss": 0.8644, + "step": 3130 + }, + { + "epoch": 0.17232649017557378, + "grad_norm": 0.7764905095100403, + "learning_rate": 9.828374086738345e-06, + "loss": 0.7961, + "step": 3131 + }, + { + "epoch": 0.17238152897792944, + "grad_norm": 0.7712632417678833, + "learning_rate": 9.828261473900935e-06, + "loss": 0.8082, + "step": 3132 + }, + { + "epoch": 0.1724365677802851, + "grad_norm": 0.7100280523300171, + "learning_rate": 9.828148824775604e-06, + "loss": 0.7514, + "step": 3133 + }, + { + "epoch": 0.17249160658264076, + "grad_norm": 0.7812890410423279, + "learning_rate": 9.8280361393632e-06, + "loss": 0.7125, + "step": 3134 + }, + { + "epoch": 0.17254664538499642, + "grad_norm": 0.8772642612457275, + "learning_rate": 9.827923417664568e-06, + "loss": 0.8355, + "step": 3135 + }, + { + "epoch": 0.17260168418735208, + "grad_norm": 0.9161205291748047, + "learning_rate": 9.827810659680555e-06, + "loss": 0.7511, + "step": 3136 + }, + { + "epoch": 0.17265672298970774, + "grad_norm": 0.7628560662269592, + "learning_rate": 9.82769786541201e-06, + "loss": 0.882, + "step": 3137 + }, + { + "epoch": 0.1727117617920634, + "grad_norm": 0.8203405737876892, + "learning_rate": 9.827585034859781e-06, + "loss": 0.8172, + "step": 3138 + }, + { + "epoch": 0.17276680059441907, + "grad_norm": 0.8318095207214355, + "learning_rate": 9.827472168024715e-06, + "loss": 0.7784, + "step": 3139 + }, + { + "epoch": 0.17282183939677473, + "grad_norm": 0.9137747287750244, + "learning_rate": 9.827359264907658e-06, + "loss": 0.8643, + "step": 3140 + }, + { + "epoch": 0.1728768781991304, + "grad_norm": 0.9441068768501282, + "learning_rate": 9.827246325509463e-06, + "loss": 0.7936, + "step": 3141 + }, + { + "epoch": 0.17293191700148605, + "grad_norm": 0.7402390837669373, + "learning_rate": 9.827133349830977e-06, + "loss": 0.7813, + "step": 3142 + }, + { + "epoch": 0.1729869558038417, + "grad_norm": 0.8328836560249329, + "learning_rate": 9.827020337873048e-06, + "loss": 0.7676, + "step": 3143 + }, + { + "epoch": 0.17304199460619737, + "grad_norm": 0.8106881380081177, + "learning_rate": 9.826907289636526e-06, + "loss": 0.9037, + "step": 3144 + }, + { + "epoch": 0.17309703340855304, + "grad_norm": 0.8457425236701965, + "learning_rate": 9.826794205122263e-06, + "loss": 0.78, + "step": 3145 + }, + { + "epoch": 0.1731520722109087, + "grad_norm": 0.9335517883300781, + "learning_rate": 9.826681084331105e-06, + "loss": 0.9197, + "step": 3146 + }, + { + "epoch": 0.17320711101326436, + "grad_norm": 0.9098715782165527, + "learning_rate": 9.826567927263904e-06, + "loss": 0.932, + "step": 3147 + }, + { + "epoch": 0.17326214981562002, + "grad_norm": 0.767234206199646, + "learning_rate": 9.826454733921512e-06, + "loss": 0.8717, + "step": 3148 + }, + { + "epoch": 0.17331718861797568, + "grad_norm": 0.8114444017410278, + "learning_rate": 9.826341504304775e-06, + "loss": 0.8744, + "step": 3149 + }, + { + "epoch": 0.17337222742033134, + "grad_norm": 0.7948976755142212, + "learning_rate": 9.82622823841455e-06, + "loss": 0.7947, + "step": 3150 + }, + { + "epoch": 0.173427266222687, + "grad_norm": 0.7808204889297485, + "learning_rate": 9.826114936251684e-06, + "loss": 0.8151, + "step": 3151 + }, + { + "epoch": 0.17348230502504267, + "grad_norm": 0.733860969543457, + "learning_rate": 9.82600159781703e-06, + "loss": 0.8018, + "step": 3152 + }, + { + "epoch": 0.17353734382739833, + "grad_norm": 0.7630699276924133, + "learning_rate": 9.825888223111442e-06, + "loss": 0.7937, + "step": 3153 + }, + { + "epoch": 0.173592382629754, + "grad_norm": 0.7892931699752808, + "learning_rate": 9.825774812135766e-06, + "loss": 0.782, + "step": 3154 + }, + { + "epoch": 0.17364742143210965, + "grad_norm": 0.6642436385154724, + "learning_rate": 9.825661364890862e-06, + "loss": 0.6611, + "step": 3155 + }, + { + "epoch": 0.1737024602344653, + "grad_norm": 0.7755968570709229, + "learning_rate": 9.825547881377577e-06, + "loss": 0.7835, + "step": 3156 + }, + { + "epoch": 0.17375749903682097, + "grad_norm": 0.8406579494476318, + "learning_rate": 9.825434361596766e-06, + "loss": 0.9178, + "step": 3157 + }, + { + "epoch": 0.1738125378391766, + "grad_norm": 0.8887308835983276, + "learning_rate": 9.825320805549284e-06, + "loss": 0.7951, + "step": 3158 + }, + { + "epoch": 0.17386757664153227, + "grad_norm": 0.85418701171875, + "learning_rate": 9.825207213235978e-06, + "loss": 0.8671, + "step": 3159 + }, + { + "epoch": 0.17392261544388793, + "grad_norm": 0.8831202983856201, + "learning_rate": 9.82509358465771e-06, + "loss": 0.8708, + "step": 3160 + }, + { + "epoch": 0.1739776542462436, + "grad_norm": 0.9041616320610046, + "learning_rate": 9.82497991981533e-06, + "loss": 0.8981, + "step": 3161 + }, + { + "epoch": 0.17403269304859925, + "grad_norm": 0.8169258832931519, + "learning_rate": 9.824866218709692e-06, + "loss": 0.8857, + "step": 3162 + }, + { + "epoch": 0.1740877318509549, + "grad_norm": 0.8714475631713867, + "learning_rate": 9.824752481341651e-06, + "loss": 0.8552, + "step": 3163 + }, + { + "epoch": 0.17414277065331057, + "grad_norm": 0.8261111378669739, + "learning_rate": 9.824638707712061e-06, + "loss": 0.808, + "step": 3164 + }, + { + "epoch": 0.17419780945566624, + "grad_norm": 0.7542527914047241, + "learning_rate": 9.82452489782178e-06, + "loss": 0.8078, + "step": 3165 + }, + { + "epoch": 0.1742528482580219, + "grad_norm": 1.309218168258667, + "learning_rate": 9.824411051671658e-06, + "loss": 0.9325, + "step": 3166 + }, + { + "epoch": 0.17430788706037756, + "grad_norm": 0.8528563380241394, + "learning_rate": 9.824297169262555e-06, + "loss": 0.8493, + "step": 3167 + }, + { + "epoch": 0.17436292586273322, + "grad_norm": 0.7777062058448792, + "learning_rate": 9.824183250595328e-06, + "loss": 0.7002, + "step": 3168 + }, + { + "epoch": 0.17441796466508888, + "grad_norm": 0.7385506629943848, + "learning_rate": 9.824069295670828e-06, + "loss": 0.8396, + "step": 3169 + }, + { + "epoch": 0.17447300346744454, + "grad_norm": 0.8316949605941772, + "learning_rate": 9.823955304489918e-06, + "loss": 0.8769, + "step": 3170 + }, + { + "epoch": 0.1745280422698002, + "grad_norm": 0.8149139285087585, + "learning_rate": 9.823841277053448e-06, + "loss": 0.8009, + "step": 3171 + }, + { + "epoch": 0.17458308107215587, + "grad_norm": 0.8761584162712097, + "learning_rate": 9.82372721336228e-06, + "loss": 0.7366, + "step": 3172 + }, + { + "epoch": 0.17463811987451153, + "grad_norm": 0.7104084491729736, + "learning_rate": 9.82361311341727e-06, + "loss": 0.6704, + "step": 3173 + }, + { + "epoch": 0.1746931586768672, + "grad_norm": 0.791806697845459, + "learning_rate": 9.823498977219273e-06, + "loss": 0.9054, + "step": 3174 + }, + { + "epoch": 0.17474819747922285, + "grad_norm": 0.7675086855888367, + "learning_rate": 9.82338480476915e-06, + "loss": 0.751, + "step": 3175 + }, + { + "epoch": 0.1748032362815785, + "grad_norm": 0.7380725145339966, + "learning_rate": 9.823270596067759e-06, + "loss": 0.7618, + "step": 3176 + }, + { + "epoch": 0.17485827508393417, + "grad_norm": 0.7311519384384155, + "learning_rate": 9.823156351115954e-06, + "loss": 0.7424, + "step": 3177 + }, + { + "epoch": 0.17491331388628983, + "grad_norm": 0.7888365387916565, + "learning_rate": 9.8230420699146e-06, + "loss": 0.7717, + "step": 3178 + }, + { + "epoch": 0.1749683526886455, + "grad_norm": 0.9329265356063843, + "learning_rate": 9.822927752464552e-06, + "loss": 0.8256, + "step": 3179 + }, + { + "epoch": 0.17502339149100116, + "grad_norm": 0.711794912815094, + "learning_rate": 9.822813398766671e-06, + "loss": 0.7373, + "step": 3180 + }, + { + "epoch": 0.17507843029335682, + "grad_norm": 0.8713497519493103, + "learning_rate": 9.822699008821813e-06, + "loss": 0.8135, + "step": 3181 + }, + { + "epoch": 0.17513346909571248, + "grad_norm": 0.6923471689224243, + "learning_rate": 9.822584582630841e-06, + "loss": 0.7589, + "step": 3182 + }, + { + "epoch": 0.17518850789806814, + "grad_norm": 0.8648017048835754, + "learning_rate": 9.822470120194616e-06, + "loss": 0.7828, + "step": 3183 + }, + { + "epoch": 0.1752435467004238, + "grad_norm": 0.8407077789306641, + "learning_rate": 9.822355621513994e-06, + "loss": 0.8537, + "step": 3184 + }, + { + "epoch": 0.17529858550277946, + "grad_norm": 0.8076738119125366, + "learning_rate": 9.822241086589841e-06, + "loss": 0.7827, + "step": 3185 + }, + { + "epoch": 0.17535362430513513, + "grad_norm": 0.8402661085128784, + "learning_rate": 9.822126515423011e-06, + "loss": 0.8247, + "step": 3186 + }, + { + "epoch": 0.1754086631074908, + "grad_norm": 0.8911813497543335, + "learning_rate": 9.822011908014373e-06, + "loss": 0.8996, + "step": 3187 + }, + { + "epoch": 0.17546370190984645, + "grad_norm": 0.8060111999511719, + "learning_rate": 9.821897264364782e-06, + "loss": 0.796, + "step": 3188 + }, + { + "epoch": 0.1755187407122021, + "grad_norm": 0.8476423621177673, + "learning_rate": 9.8217825844751e-06, + "loss": 0.8657, + "step": 3189 + }, + { + "epoch": 0.17557377951455777, + "grad_norm": 0.7614054083824158, + "learning_rate": 9.821667868346194e-06, + "loss": 0.8583, + "step": 3190 + }, + { + "epoch": 0.17562881831691343, + "grad_norm": 0.8312287330627441, + "learning_rate": 9.821553115978923e-06, + "loss": 0.7718, + "step": 3191 + }, + { + "epoch": 0.1756838571192691, + "grad_norm": 0.8199487328529358, + "learning_rate": 9.82143832737415e-06, + "loss": 0.7617, + "step": 3192 + }, + { + "epoch": 0.17573889592162475, + "grad_norm": 0.7529115080833435, + "learning_rate": 9.821323502532733e-06, + "loss": 0.7587, + "step": 3193 + }, + { + "epoch": 0.17579393472398042, + "grad_norm": 0.9205463528633118, + "learning_rate": 9.821208641455542e-06, + "loss": 0.7871, + "step": 3194 + }, + { + "epoch": 0.17584897352633608, + "grad_norm": 0.8055161833763123, + "learning_rate": 9.821093744143437e-06, + "loss": 0.8133, + "step": 3195 + }, + { + "epoch": 0.17590401232869174, + "grad_norm": 0.7322981953620911, + "learning_rate": 9.82097881059728e-06, + "loss": 0.7442, + "step": 3196 + }, + { + "epoch": 0.1759590511310474, + "grad_norm": 1.0465941429138184, + "learning_rate": 9.82086384081794e-06, + "loss": 1.0073, + "step": 3197 + }, + { + "epoch": 0.17601408993340306, + "grad_norm": 0.7607331275939941, + "learning_rate": 9.820748834806278e-06, + "loss": 0.8128, + "step": 3198 + }, + { + "epoch": 0.17606912873575872, + "grad_norm": 0.7901879549026489, + "learning_rate": 9.820633792563156e-06, + "loss": 0.7928, + "step": 3199 + }, + { + "epoch": 0.17612416753811436, + "grad_norm": 0.8010839223861694, + "learning_rate": 9.820518714089442e-06, + "loss": 0.7025, + "step": 3200 + }, + { + "epoch": 0.17617920634047002, + "grad_norm": 0.8511317372322083, + "learning_rate": 9.820403599385999e-06, + "loss": 0.7947, + "step": 3201 + }, + { + "epoch": 0.17623424514282568, + "grad_norm": 0.7978847026824951, + "learning_rate": 9.820288448453693e-06, + "loss": 0.7395, + "step": 3202 + }, + { + "epoch": 0.17628928394518134, + "grad_norm": 0.6991232633590698, + "learning_rate": 9.820173261293388e-06, + "loss": 0.7113, + "step": 3203 + }, + { + "epoch": 0.176344322747537, + "grad_norm": 0.8966444730758667, + "learning_rate": 9.820058037905954e-06, + "loss": 0.7399, + "step": 3204 + }, + { + "epoch": 0.17639936154989266, + "grad_norm": 0.8042632341384888, + "learning_rate": 9.819942778292253e-06, + "loss": 0.8183, + "step": 3205 + }, + { + "epoch": 0.17645440035224833, + "grad_norm": 0.8047537803649902, + "learning_rate": 9.81982748245315e-06, + "loss": 0.852, + "step": 3206 + }, + { + "epoch": 0.176509439154604, + "grad_norm": 0.8277122378349304, + "learning_rate": 9.819712150389517e-06, + "loss": 0.8828, + "step": 3207 + }, + { + "epoch": 0.17656447795695965, + "grad_norm": 0.8677185773849487, + "learning_rate": 9.819596782102216e-06, + "loss": 0.8416, + "step": 3208 + }, + { + "epoch": 0.1766195167593153, + "grad_norm": 0.8750975728034973, + "learning_rate": 9.819481377592115e-06, + "loss": 0.9289, + "step": 3209 + }, + { + "epoch": 0.17667455556167097, + "grad_norm": 0.7665122151374817, + "learning_rate": 9.819365936860084e-06, + "loss": 0.8653, + "step": 3210 + }, + { + "epoch": 0.17672959436402663, + "grad_norm": 0.9341353178024292, + "learning_rate": 9.819250459906989e-06, + "loss": 0.7225, + "step": 3211 + }, + { + "epoch": 0.1767846331663823, + "grad_norm": 0.7007241249084473, + "learning_rate": 9.819134946733696e-06, + "loss": 0.7429, + "step": 3212 + }, + { + "epoch": 0.17683967196873795, + "grad_norm": 0.8001461029052734, + "learning_rate": 9.819019397341074e-06, + "loss": 0.759, + "step": 3213 + }, + { + "epoch": 0.17689471077109362, + "grad_norm": 0.8936446905136108, + "learning_rate": 9.818903811729993e-06, + "loss": 0.8248, + "step": 3214 + }, + { + "epoch": 0.17694974957344928, + "grad_norm": 0.805570125579834, + "learning_rate": 9.818788189901321e-06, + "loss": 0.9214, + "step": 3215 + }, + { + "epoch": 0.17700478837580494, + "grad_norm": 0.7762455940246582, + "learning_rate": 9.818672531855926e-06, + "loss": 0.7848, + "step": 3216 + }, + { + "epoch": 0.1770598271781606, + "grad_norm": 0.8391497731208801, + "learning_rate": 9.81855683759468e-06, + "loss": 0.7543, + "step": 3217 + }, + { + "epoch": 0.17711486598051626, + "grad_norm": 0.8489046692848206, + "learning_rate": 9.818441107118449e-06, + "loss": 0.7908, + "step": 3218 + }, + { + "epoch": 0.17716990478287192, + "grad_norm": 1.0949461460113525, + "learning_rate": 9.818325340428105e-06, + "loss": 0.8255, + "step": 3219 + }, + { + "epoch": 0.17722494358522758, + "grad_norm": 0.8710842132568359, + "learning_rate": 9.81820953752452e-06, + "loss": 0.859, + "step": 3220 + }, + { + "epoch": 0.17727998238758325, + "grad_norm": 0.7936064600944519, + "learning_rate": 9.818093698408558e-06, + "loss": 0.8475, + "step": 3221 + }, + { + "epoch": 0.1773350211899389, + "grad_norm": 0.790341854095459, + "learning_rate": 9.817977823081095e-06, + "loss": 0.8137, + "step": 3222 + }, + { + "epoch": 0.17739005999229457, + "grad_norm": 0.8154531717300415, + "learning_rate": 9.817861911543002e-06, + "loss": 0.8687, + "step": 3223 + }, + { + "epoch": 0.17744509879465023, + "grad_norm": 0.8346067070960999, + "learning_rate": 9.817745963795144e-06, + "loss": 0.8905, + "step": 3224 + }, + { + "epoch": 0.1775001375970059, + "grad_norm": 0.7137764096260071, + "learning_rate": 9.817629979838401e-06, + "loss": 0.7715, + "step": 3225 + }, + { + "epoch": 0.17755517639936155, + "grad_norm": 0.7237628102302551, + "learning_rate": 9.81751395967364e-06, + "loss": 0.7824, + "step": 3226 + }, + { + "epoch": 0.17761021520171721, + "grad_norm": 0.9481163024902344, + "learning_rate": 9.817397903301733e-06, + "loss": 0.7451, + "step": 3227 + }, + { + "epoch": 0.17766525400407288, + "grad_norm": 0.9472424387931824, + "learning_rate": 9.817281810723552e-06, + "loss": 0.8774, + "step": 3228 + }, + { + "epoch": 0.17772029280642854, + "grad_norm": 0.9295538663864136, + "learning_rate": 9.81716568193997e-06, + "loss": 0.8507, + "step": 3229 + }, + { + "epoch": 0.1777753316087842, + "grad_norm": 0.7668172717094421, + "learning_rate": 9.817049516951863e-06, + "loss": 0.8547, + "step": 3230 + }, + { + "epoch": 0.17783037041113986, + "grad_norm": 0.8640413880348206, + "learning_rate": 9.8169333157601e-06, + "loss": 0.8485, + "step": 3231 + }, + { + "epoch": 0.17788540921349552, + "grad_norm": 0.9901431798934937, + "learning_rate": 9.816817078365554e-06, + "loss": 0.9236, + "step": 3232 + }, + { + "epoch": 0.17794044801585118, + "grad_norm": 1.0242371559143066, + "learning_rate": 9.816700804769104e-06, + "loss": 0.8096, + "step": 3233 + }, + { + "epoch": 0.17799548681820684, + "grad_norm": 0.910498857498169, + "learning_rate": 9.816584494971617e-06, + "loss": 0.829, + "step": 3234 + }, + { + "epoch": 0.1780505256205625, + "grad_norm": 0.8254473805427551, + "learning_rate": 9.816468148973972e-06, + "loss": 0.7828, + "step": 3235 + }, + { + "epoch": 0.17810556442291817, + "grad_norm": 0.7971221804618835, + "learning_rate": 9.816351766777039e-06, + "loss": 0.8057, + "step": 3236 + }, + { + "epoch": 0.17816060322527383, + "grad_norm": 0.8151674270629883, + "learning_rate": 9.816235348381697e-06, + "loss": 0.7801, + "step": 3237 + }, + { + "epoch": 0.1782156420276295, + "grad_norm": 0.7587556838989258, + "learning_rate": 9.81611889378882e-06, + "loss": 0.7814, + "step": 3238 + }, + { + "epoch": 0.17827068082998515, + "grad_norm": 0.8843516111373901, + "learning_rate": 9.816002402999283e-06, + "loss": 0.8873, + "step": 3239 + }, + { + "epoch": 0.1783257196323408, + "grad_norm": 0.917859673500061, + "learning_rate": 9.81588587601396e-06, + "loss": 0.8963, + "step": 3240 + }, + { + "epoch": 0.17838075843469647, + "grad_norm": 0.8256439566612244, + "learning_rate": 9.815769312833727e-06, + "loss": 0.9157, + "step": 3241 + }, + { + "epoch": 0.17843579723705214, + "grad_norm": 0.8364603519439697, + "learning_rate": 9.815652713459462e-06, + "loss": 0.8253, + "step": 3242 + }, + { + "epoch": 0.17849083603940777, + "grad_norm": 0.7717131972312927, + "learning_rate": 9.81553607789204e-06, + "loss": 0.7211, + "step": 3243 + }, + { + "epoch": 0.17854587484176343, + "grad_norm": 0.8069111704826355, + "learning_rate": 9.815419406132338e-06, + "loss": 0.8986, + "step": 3244 + }, + { + "epoch": 0.1786009136441191, + "grad_norm": 0.9176943302154541, + "learning_rate": 9.815302698181233e-06, + "loss": 0.8084, + "step": 3245 + }, + { + "epoch": 0.17865595244647475, + "grad_norm": 0.769183874130249, + "learning_rate": 9.815185954039601e-06, + "loss": 0.8084, + "step": 3246 + }, + { + "epoch": 0.17871099124883041, + "grad_norm": 0.8070697784423828, + "learning_rate": 9.815069173708321e-06, + "loss": 0.8371, + "step": 3247 + }, + { + "epoch": 0.17876603005118608, + "grad_norm": 0.7837347388267517, + "learning_rate": 9.81495235718827e-06, + "loss": 0.8015, + "step": 3248 + }, + { + "epoch": 0.17882106885354174, + "grad_norm": 0.9248430728912354, + "learning_rate": 9.814835504480327e-06, + "loss": 0.8396, + "step": 3249 + }, + { + "epoch": 0.1788761076558974, + "grad_norm": 0.7914367914199829, + "learning_rate": 9.814718615585367e-06, + "loss": 0.8068, + "step": 3250 + }, + { + "epoch": 0.17893114645825306, + "grad_norm": 0.8612570762634277, + "learning_rate": 9.814601690504273e-06, + "loss": 0.8227, + "step": 3251 + }, + { + "epoch": 0.17898618526060872, + "grad_norm": 0.7476248741149902, + "learning_rate": 9.81448472923792e-06, + "loss": 0.8609, + "step": 3252 + }, + { + "epoch": 0.17904122406296438, + "grad_norm": 0.7455218434333801, + "learning_rate": 9.81436773178719e-06, + "loss": 0.7992, + "step": 3253 + }, + { + "epoch": 0.17909626286532004, + "grad_norm": 0.7917896509170532, + "learning_rate": 9.814250698152958e-06, + "loss": 0.8383, + "step": 3254 + }, + { + "epoch": 0.1791513016676757, + "grad_norm": 0.6926130652427673, + "learning_rate": 9.81413362833611e-06, + "loss": 0.709, + "step": 3255 + }, + { + "epoch": 0.17920634047003137, + "grad_norm": 0.8219630718231201, + "learning_rate": 9.814016522337519e-06, + "loss": 0.9387, + "step": 3256 + }, + { + "epoch": 0.17926137927238703, + "grad_norm": 0.8588619828224182, + "learning_rate": 9.81389938015807e-06, + "loss": 0.8354, + "step": 3257 + }, + { + "epoch": 0.1793164180747427, + "grad_norm": 0.7868718504905701, + "learning_rate": 9.81378220179864e-06, + "loss": 0.8464, + "step": 3258 + }, + { + "epoch": 0.17937145687709835, + "grad_norm": 0.789479672908783, + "learning_rate": 9.813664987260114e-06, + "loss": 0.8577, + "step": 3259 + }, + { + "epoch": 0.179426495679454, + "grad_norm": 0.8280717730522156, + "learning_rate": 9.81354773654337e-06, + "loss": 0.765, + "step": 3260 + }, + { + "epoch": 0.17948153448180967, + "grad_norm": 0.7660181522369385, + "learning_rate": 9.813430449649289e-06, + "loss": 0.7116, + "step": 3261 + }, + { + "epoch": 0.17953657328416534, + "grad_norm": 0.8043892979621887, + "learning_rate": 9.813313126578754e-06, + "loss": 0.8398, + "step": 3262 + }, + { + "epoch": 0.179591612086521, + "grad_norm": 0.8708420991897583, + "learning_rate": 9.813195767332647e-06, + "loss": 0.8246, + "step": 3263 + }, + { + "epoch": 0.17964665088887666, + "grad_norm": 1.1456964015960693, + "learning_rate": 9.813078371911846e-06, + "loss": 0.8798, + "step": 3264 + }, + { + "epoch": 0.17970168969123232, + "grad_norm": 0.9668154716491699, + "learning_rate": 9.812960940317238e-06, + "loss": 0.9645, + "step": 3265 + }, + { + "epoch": 0.17975672849358798, + "grad_norm": 0.862050473690033, + "learning_rate": 9.812843472549705e-06, + "loss": 0.8675, + "step": 3266 + }, + { + "epoch": 0.17981176729594364, + "grad_norm": 0.7776491641998291, + "learning_rate": 9.812725968610126e-06, + "loss": 0.7727, + "step": 3267 + }, + { + "epoch": 0.1798668060982993, + "grad_norm": 0.7197048664093018, + "learning_rate": 9.812608428499389e-06, + "loss": 0.6877, + "step": 3268 + }, + { + "epoch": 0.17992184490065496, + "grad_norm": 0.7995713353157043, + "learning_rate": 9.812490852218375e-06, + "loss": 0.8576, + "step": 3269 + }, + { + "epoch": 0.17997688370301063, + "grad_norm": 0.8300820589065552, + "learning_rate": 9.812373239767967e-06, + "loss": 0.8119, + "step": 3270 + }, + { + "epoch": 0.1800319225053663, + "grad_norm": 0.8625856041908264, + "learning_rate": 9.812255591149052e-06, + "loss": 0.7547, + "step": 3271 + }, + { + "epoch": 0.18008696130772195, + "grad_norm": 1.016419768333435, + "learning_rate": 9.812137906362511e-06, + "loss": 0.8457, + "step": 3272 + }, + { + "epoch": 0.1801420001100776, + "grad_norm": 0.7303110361099243, + "learning_rate": 9.812020185409229e-06, + "loss": 0.7954, + "step": 3273 + }, + { + "epoch": 0.18019703891243327, + "grad_norm": 0.8632498383522034, + "learning_rate": 9.811902428290093e-06, + "loss": 0.8952, + "step": 3274 + }, + { + "epoch": 0.18025207771478893, + "grad_norm": 0.7666932940483093, + "learning_rate": 9.811784635005984e-06, + "loss": 0.746, + "step": 3275 + }, + { + "epoch": 0.1803071165171446, + "grad_norm": 0.8962032198905945, + "learning_rate": 9.811666805557791e-06, + "loss": 0.8654, + "step": 3276 + }, + { + "epoch": 0.18036215531950026, + "grad_norm": 0.9399656057357788, + "learning_rate": 9.811548939946397e-06, + "loss": 0.8062, + "step": 3277 + }, + { + "epoch": 0.18041719412185592, + "grad_norm": 0.7469807863235474, + "learning_rate": 9.811431038172692e-06, + "loss": 0.79, + "step": 3278 + }, + { + "epoch": 0.18047223292421158, + "grad_norm": 0.7661105394363403, + "learning_rate": 9.811313100237556e-06, + "loss": 0.7768, + "step": 3279 + }, + { + "epoch": 0.18052727172656724, + "grad_norm": 0.7567458748817444, + "learning_rate": 9.811195126141881e-06, + "loss": 0.7329, + "step": 3280 + }, + { + "epoch": 0.1805823105289229, + "grad_norm": 0.7187278866767883, + "learning_rate": 9.811077115886552e-06, + "loss": 0.6511, + "step": 3281 + }, + { + "epoch": 0.18063734933127856, + "grad_norm": 0.7641230821609497, + "learning_rate": 9.810959069472452e-06, + "loss": 0.7704, + "step": 3282 + }, + { + "epoch": 0.18069238813363422, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.810840986900474e-06, + "loss": 0.8142, + "step": 3283 + }, + { + "epoch": 0.18074742693598989, + "grad_norm": 0.8102816343307495, + "learning_rate": 9.810722868171502e-06, + "loss": 0.765, + "step": 3284 + }, + { + "epoch": 0.18080246573834555, + "grad_norm": 0.7251957058906555, + "learning_rate": 9.810604713286424e-06, + "loss": 0.7836, + "step": 3285 + }, + { + "epoch": 0.18085750454070118, + "grad_norm": 0.845348060131073, + "learning_rate": 9.81048652224613e-06, + "loss": 0.8386, + "step": 3286 + }, + { + "epoch": 0.18091254334305684, + "grad_norm": 0.8397864103317261, + "learning_rate": 9.810368295051507e-06, + "loss": 0.805, + "step": 3287 + }, + { + "epoch": 0.1809675821454125, + "grad_norm": 1.0739909410476685, + "learning_rate": 9.810250031703444e-06, + "loss": 0.8735, + "step": 3288 + }, + { + "epoch": 0.18102262094776816, + "grad_norm": 0.752091646194458, + "learning_rate": 9.810131732202826e-06, + "loss": 0.7814, + "step": 3289 + }, + { + "epoch": 0.18107765975012383, + "grad_norm": 0.7826841473579407, + "learning_rate": 9.810013396550548e-06, + "loss": 0.7761, + "step": 3290 + }, + { + "epoch": 0.1811326985524795, + "grad_norm": 0.6979131102561951, + "learning_rate": 9.809895024747498e-06, + "loss": 0.672, + "step": 3291 + }, + { + "epoch": 0.18118773735483515, + "grad_norm": 0.8571394085884094, + "learning_rate": 9.809776616794562e-06, + "loss": 0.8795, + "step": 3292 + }, + { + "epoch": 0.1812427761571908, + "grad_norm": 0.8287902474403381, + "learning_rate": 9.809658172692634e-06, + "loss": 0.9032, + "step": 3293 + }, + { + "epoch": 0.18129781495954647, + "grad_norm": 0.7884420156478882, + "learning_rate": 9.809539692442602e-06, + "loss": 0.87, + "step": 3294 + }, + { + "epoch": 0.18135285376190213, + "grad_norm": 0.8955305218696594, + "learning_rate": 9.809421176045358e-06, + "loss": 0.7982, + "step": 3295 + }, + { + "epoch": 0.1814078925642578, + "grad_norm": 0.7893335819244385, + "learning_rate": 9.809302623501791e-06, + "loss": 0.7792, + "step": 3296 + }, + { + "epoch": 0.18146293136661346, + "grad_norm": 0.8077870011329651, + "learning_rate": 9.809184034812794e-06, + "loss": 0.829, + "step": 3297 + }, + { + "epoch": 0.18151797016896912, + "grad_norm": 0.8282631635665894, + "learning_rate": 9.809065409979256e-06, + "loss": 0.8502, + "step": 3298 + }, + { + "epoch": 0.18157300897132478, + "grad_norm": 0.7988418936729431, + "learning_rate": 9.808946749002068e-06, + "loss": 0.7853, + "step": 3299 + }, + { + "epoch": 0.18162804777368044, + "grad_norm": 0.7776056528091431, + "learning_rate": 9.808828051882127e-06, + "loss": 0.7843, + "step": 3300 + }, + { + "epoch": 0.1816830865760361, + "grad_norm": 0.8772258758544922, + "learning_rate": 9.80870931862032e-06, + "loss": 0.896, + "step": 3301 + }, + { + "epoch": 0.18173812537839176, + "grad_norm": 0.8080328702926636, + "learning_rate": 9.80859054921754e-06, + "loss": 0.8464, + "step": 3302 + }, + { + "epoch": 0.18179316418074742, + "grad_norm": 0.862707257270813, + "learning_rate": 9.808471743674682e-06, + "loss": 0.8732, + "step": 3303 + }, + { + "epoch": 0.18184820298310309, + "grad_norm": 1.1964820623397827, + "learning_rate": 9.808352901992637e-06, + "loss": 0.9911, + "step": 3304 + }, + { + "epoch": 0.18190324178545875, + "grad_norm": 0.8597685694694519, + "learning_rate": 9.808234024172298e-06, + "loss": 0.8724, + "step": 3305 + }, + { + "epoch": 0.1819582805878144, + "grad_norm": 0.8068556189537048, + "learning_rate": 9.80811511021456e-06, + "loss": 0.8116, + "step": 3306 + }, + { + "epoch": 0.18201331939017007, + "grad_norm": 1.0014268159866333, + "learning_rate": 9.807996160120317e-06, + "loss": 0.8585, + "step": 3307 + }, + { + "epoch": 0.18206835819252573, + "grad_norm": 0.8541132807731628, + "learning_rate": 9.80787717389046e-06, + "loss": 0.8505, + "step": 3308 + }, + { + "epoch": 0.1821233969948814, + "grad_norm": 0.7973629832267761, + "learning_rate": 9.807758151525886e-06, + "loss": 0.8312, + "step": 3309 + }, + { + "epoch": 0.18217843579723705, + "grad_norm": 0.82973712682724, + "learning_rate": 9.807639093027488e-06, + "loss": 0.894, + "step": 3310 + }, + { + "epoch": 0.18223347459959272, + "grad_norm": 0.7729674577713013, + "learning_rate": 9.807519998396162e-06, + "loss": 0.7459, + "step": 3311 + }, + { + "epoch": 0.18228851340194838, + "grad_norm": 0.8106189370155334, + "learning_rate": 9.807400867632804e-06, + "loss": 0.914, + "step": 3312 + }, + { + "epoch": 0.18234355220430404, + "grad_norm": 0.7672377228736877, + "learning_rate": 9.807281700738305e-06, + "loss": 0.8475, + "step": 3313 + }, + { + "epoch": 0.1823985910066597, + "grad_norm": 0.8776688575744629, + "learning_rate": 9.807162497713566e-06, + "loss": 0.7641, + "step": 3314 + }, + { + "epoch": 0.18245362980901536, + "grad_norm": 0.8781917691230774, + "learning_rate": 9.807043258559479e-06, + "loss": 0.86, + "step": 3315 + }, + { + "epoch": 0.18250866861137102, + "grad_norm": 0.819362998008728, + "learning_rate": 9.806923983276942e-06, + "loss": 0.8829, + "step": 3316 + }, + { + "epoch": 0.18256370741372668, + "grad_norm": 0.8065270185470581, + "learning_rate": 9.80680467186685e-06, + "loss": 0.7335, + "step": 3317 + }, + { + "epoch": 0.18261874621608234, + "grad_norm": 0.8692485690116882, + "learning_rate": 9.806685324330102e-06, + "loss": 0.8582, + "step": 3318 + }, + { + "epoch": 0.182673785018438, + "grad_norm": 0.7910160422325134, + "learning_rate": 9.806565940667594e-06, + "loss": 0.8569, + "step": 3319 + }, + { + "epoch": 0.18272882382079367, + "grad_norm": 0.8282253742218018, + "learning_rate": 9.806446520880225e-06, + "loss": 0.7791, + "step": 3320 + }, + { + "epoch": 0.18278386262314933, + "grad_norm": 0.7513861060142517, + "learning_rate": 9.806327064968887e-06, + "loss": 0.7287, + "step": 3321 + }, + { + "epoch": 0.182838901425505, + "grad_norm": 0.8141188621520996, + "learning_rate": 9.806207572934483e-06, + "loss": 0.7772, + "step": 3322 + }, + { + "epoch": 0.18289394022786065, + "grad_norm": 0.7963125705718994, + "learning_rate": 9.806088044777909e-06, + "loss": 0.7993, + "step": 3323 + }, + { + "epoch": 0.1829489790302163, + "grad_norm": 0.8527218103408813, + "learning_rate": 9.805968480500063e-06, + "loss": 0.822, + "step": 3324 + }, + { + "epoch": 0.18300401783257197, + "grad_norm": 0.822467565536499, + "learning_rate": 9.805848880101845e-06, + "loss": 0.8606, + "step": 3325 + }, + { + "epoch": 0.18305905663492764, + "grad_norm": 0.8197154402732849, + "learning_rate": 9.805729243584154e-06, + "loss": 0.9004, + "step": 3326 + }, + { + "epoch": 0.1831140954372833, + "grad_norm": 0.8379594683647156, + "learning_rate": 9.805609570947887e-06, + "loss": 0.8467, + "step": 3327 + }, + { + "epoch": 0.18316913423963896, + "grad_norm": 0.7787355184555054, + "learning_rate": 9.805489862193947e-06, + "loss": 0.8221, + "step": 3328 + }, + { + "epoch": 0.1832241730419946, + "grad_norm": 0.8464100956916809, + "learning_rate": 9.80537011732323e-06, + "loss": 0.7722, + "step": 3329 + }, + { + "epoch": 0.18327921184435025, + "grad_norm": 0.8351306319236755, + "learning_rate": 9.805250336336637e-06, + "loss": 0.7638, + "step": 3330 + }, + { + "epoch": 0.18333425064670592, + "grad_norm": 0.8098864555358887, + "learning_rate": 9.805130519235068e-06, + "loss": 0.8448, + "step": 3331 + }, + { + "epoch": 0.18338928944906158, + "grad_norm": 0.8290563821792603, + "learning_rate": 9.805010666019427e-06, + "loss": 0.6574, + "step": 3332 + }, + { + "epoch": 0.18344432825141724, + "grad_norm": 0.7748262882232666, + "learning_rate": 9.804890776690611e-06, + "loss": 0.8002, + "step": 3333 + }, + { + "epoch": 0.1834993670537729, + "grad_norm": 0.8422787189483643, + "learning_rate": 9.80477085124952e-06, + "loss": 0.8452, + "step": 3334 + }, + { + "epoch": 0.18355440585612856, + "grad_norm": 0.7776510119438171, + "learning_rate": 9.804650889697061e-06, + "loss": 0.8774, + "step": 3335 + }, + { + "epoch": 0.18360944465848422, + "grad_norm": 0.8449370861053467, + "learning_rate": 9.80453089203413e-06, + "loss": 0.8233, + "step": 3336 + }, + { + "epoch": 0.18366448346083988, + "grad_norm": 0.8254217505455017, + "learning_rate": 9.804410858261632e-06, + "loss": 0.8778, + "step": 3337 + }, + { + "epoch": 0.18371952226319554, + "grad_norm": 0.8673515915870667, + "learning_rate": 9.804290788380466e-06, + "loss": 0.8005, + "step": 3338 + }, + { + "epoch": 0.1837745610655512, + "grad_norm": 0.8106067776679993, + "learning_rate": 9.804170682391538e-06, + "loss": 0.86, + "step": 3339 + }, + { + "epoch": 0.18382959986790687, + "grad_norm": 0.8211669325828552, + "learning_rate": 9.804050540295749e-06, + "loss": 0.8013, + "step": 3340 + }, + { + "epoch": 0.18388463867026253, + "grad_norm": 0.7866180539131165, + "learning_rate": 9.803930362094003e-06, + "loss": 0.8108, + "step": 3341 + }, + { + "epoch": 0.1839396774726182, + "grad_norm": 0.8192055225372314, + "learning_rate": 9.8038101477872e-06, + "loss": 0.7586, + "step": 3342 + }, + { + "epoch": 0.18399471627497385, + "grad_norm": 0.940910279750824, + "learning_rate": 9.803689897376248e-06, + "loss": 0.8174, + "step": 3343 + }, + { + "epoch": 0.1840497550773295, + "grad_norm": 0.7979292869567871, + "learning_rate": 9.803569610862048e-06, + "loss": 0.8341, + "step": 3344 + }, + { + "epoch": 0.18410479387968517, + "grad_norm": 0.7577546238899231, + "learning_rate": 9.803449288245504e-06, + "loss": 0.7775, + "step": 3345 + }, + { + "epoch": 0.18415983268204084, + "grad_norm": 0.7255160212516785, + "learning_rate": 9.80332892952752e-06, + "loss": 0.7648, + "step": 3346 + }, + { + "epoch": 0.1842148714843965, + "grad_norm": 0.8269388675689697, + "learning_rate": 9.803208534709004e-06, + "loss": 0.8902, + "step": 3347 + }, + { + "epoch": 0.18426991028675216, + "grad_norm": 0.783867359161377, + "learning_rate": 9.803088103790857e-06, + "loss": 0.8191, + "step": 3348 + }, + { + "epoch": 0.18432494908910782, + "grad_norm": 0.7658863663673401, + "learning_rate": 9.802967636773986e-06, + "loss": 0.7505, + "step": 3349 + }, + { + "epoch": 0.18437998789146348, + "grad_norm": 0.701225757598877, + "learning_rate": 9.802847133659294e-06, + "loss": 0.7159, + "step": 3350 + }, + { + "epoch": 0.18443502669381914, + "grad_norm": 0.9224311709403992, + "learning_rate": 9.802726594447692e-06, + "loss": 0.7766, + "step": 3351 + }, + { + "epoch": 0.1844900654961748, + "grad_norm": 0.8835979700088501, + "learning_rate": 9.80260601914008e-06, + "loss": 0.9304, + "step": 3352 + }, + { + "epoch": 0.18454510429853047, + "grad_norm": 0.7918481826782227, + "learning_rate": 9.802485407737368e-06, + "loss": 0.7691, + "step": 3353 + }, + { + "epoch": 0.18460014310088613, + "grad_norm": 0.8855286240577698, + "learning_rate": 9.80236476024046e-06, + "loss": 0.9213, + "step": 3354 + }, + { + "epoch": 0.1846551819032418, + "grad_norm": 0.7863314747810364, + "learning_rate": 9.802244076650264e-06, + "loss": 0.7675, + "step": 3355 + }, + { + "epoch": 0.18471022070559745, + "grad_norm": 0.8230198621749878, + "learning_rate": 9.802123356967687e-06, + "loss": 0.7243, + "step": 3356 + }, + { + "epoch": 0.1847652595079531, + "grad_norm": 0.8038737773895264, + "learning_rate": 9.80200260119364e-06, + "loss": 0.8094, + "step": 3357 + }, + { + "epoch": 0.18482029831030877, + "grad_norm": 0.7656993269920349, + "learning_rate": 9.801881809329022e-06, + "loss": 0.7736, + "step": 3358 + }, + { + "epoch": 0.18487533711266443, + "grad_norm": 0.8222082853317261, + "learning_rate": 9.801760981374747e-06, + "loss": 0.844, + "step": 3359 + }, + { + "epoch": 0.1849303759150201, + "grad_norm": 0.7632889747619629, + "learning_rate": 9.801640117331723e-06, + "loss": 0.8354, + "step": 3360 + }, + { + "epoch": 0.18498541471737576, + "grad_norm": 0.8308513760566711, + "learning_rate": 9.801519217200857e-06, + "loss": 0.8277, + "step": 3361 + }, + { + "epoch": 0.18504045351973142, + "grad_norm": 0.7865434885025024, + "learning_rate": 9.801398280983057e-06, + "loss": 0.8614, + "step": 3362 + }, + { + "epoch": 0.18509549232208708, + "grad_norm": 0.7249410152435303, + "learning_rate": 9.801277308679232e-06, + "loss": 0.7259, + "step": 3363 + }, + { + "epoch": 0.18515053112444274, + "grad_norm": 0.7604461908340454, + "learning_rate": 9.801156300290293e-06, + "loss": 0.8507, + "step": 3364 + }, + { + "epoch": 0.1852055699267984, + "grad_norm": 0.8725959062576294, + "learning_rate": 9.801035255817149e-06, + "loss": 0.7688, + "step": 3365 + }, + { + "epoch": 0.18526060872915406, + "grad_norm": 0.7798827290534973, + "learning_rate": 9.800914175260708e-06, + "loss": 0.8788, + "step": 3366 + }, + { + "epoch": 0.18531564753150973, + "grad_norm": 0.7060996890068054, + "learning_rate": 9.800793058621882e-06, + "loss": 0.8183, + "step": 3367 + }, + { + "epoch": 0.1853706863338654, + "grad_norm": 0.7558063268661499, + "learning_rate": 9.80067190590158e-06, + "loss": 0.7834, + "step": 3368 + }, + { + "epoch": 0.18542572513622105, + "grad_norm": 0.7411057353019714, + "learning_rate": 9.800550717100714e-06, + "loss": 0.8298, + "step": 3369 + }, + { + "epoch": 0.1854807639385767, + "grad_norm": 0.8466144800186157, + "learning_rate": 9.800429492220193e-06, + "loss": 0.8297, + "step": 3370 + }, + { + "epoch": 0.18553580274093237, + "grad_norm": 0.7302330136299133, + "learning_rate": 9.800308231260928e-06, + "loss": 0.72, + "step": 3371 + }, + { + "epoch": 0.185590841543288, + "grad_norm": 0.8140530586242676, + "learning_rate": 9.800186934223832e-06, + "loss": 0.9287, + "step": 3372 + }, + { + "epoch": 0.18564588034564367, + "grad_norm": 0.8246129751205444, + "learning_rate": 9.800065601109817e-06, + "loss": 0.7891, + "step": 3373 + }, + { + "epoch": 0.18570091914799933, + "grad_norm": 0.8746623396873474, + "learning_rate": 9.799944231919794e-06, + "loss": 0.8549, + "step": 3374 + }, + { + "epoch": 0.185755957950355, + "grad_norm": 0.9977195858955383, + "learning_rate": 9.799822826654672e-06, + "loss": 0.821, + "step": 3375 + }, + { + "epoch": 0.18581099675271065, + "grad_norm": 0.8937395811080933, + "learning_rate": 9.79970138531537e-06, + "loss": 0.8639, + "step": 3376 + }, + { + "epoch": 0.1858660355550663, + "grad_norm": 1.039695143699646, + "learning_rate": 9.799579907902794e-06, + "loss": 1.0425, + "step": 3377 + }, + { + "epoch": 0.18592107435742197, + "grad_norm": 0.7847749590873718, + "learning_rate": 9.799458394417863e-06, + "loss": 0.8505, + "step": 3378 + }, + { + "epoch": 0.18597611315977763, + "grad_norm": 0.760334312915802, + "learning_rate": 9.799336844861486e-06, + "loss": 0.7418, + "step": 3379 + }, + { + "epoch": 0.1860311519621333, + "grad_norm": 0.7599604725837708, + "learning_rate": 9.799215259234578e-06, + "loss": 0.8305, + "step": 3380 + }, + { + "epoch": 0.18608619076448896, + "grad_norm": 0.846767246723175, + "learning_rate": 9.799093637538054e-06, + "loss": 0.7526, + "step": 3381 + }, + { + "epoch": 0.18614122956684462, + "grad_norm": 0.7840956449508667, + "learning_rate": 9.798971979772825e-06, + "loss": 0.8009, + "step": 3382 + }, + { + "epoch": 0.18619626836920028, + "grad_norm": 0.7826499342918396, + "learning_rate": 9.798850285939809e-06, + "loss": 0.821, + "step": 3383 + }, + { + "epoch": 0.18625130717155594, + "grad_norm": 0.7829813361167908, + "learning_rate": 9.798728556039918e-06, + "loss": 0.8053, + "step": 3384 + }, + { + "epoch": 0.1863063459739116, + "grad_norm": 0.7267470359802246, + "learning_rate": 9.798606790074067e-06, + "loss": 0.6797, + "step": 3385 + }, + { + "epoch": 0.18636138477626726, + "grad_norm": 0.8560196757316589, + "learning_rate": 9.798484988043173e-06, + "loss": 0.8476, + "step": 3386 + }, + { + "epoch": 0.18641642357862293, + "grad_norm": 0.7920921444892883, + "learning_rate": 9.798363149948148e-06, + "loss": 0.8832, + "step": 3387 + }, + { + "epoch": 0.1864714623809786, + "grad_norm": 0.8414384126663208, + "learning_rate": 9.798241275789912e-06, + "loss": 0.8607, + "step": 3388 + }, + { + "epoch": 0.18652650118333425, + "grad_norm": 0.7255431413650513, + "learning_rate": 9.798119365569378e-06, + "loss": 0.6426, + "step": 3389 + }, + { + "epoch": 0.1865815399856899, + "grad_norm": 0.8842852711677551, + "learning_rate": 9.797997419287465e-06, + "loss": 0.9058, + "step": 3390 + }, + { + "epoch": 0.18663657878804557, + "grad_norm": 0.7178265452384949, + "learning_rate": 9.797875436945086e-06, + "loss": 0.8134, + "step": 3391 + }, + { + "epoch": 0.18669161759040123, + "grad_norm": 0.7275096774101257, + "learning_rate": 9.797753418543161e-06, + "loss": 0.6858, + "step": 3392 + }, + { + "epoch": 0.1867466563927569, + "grad_norm": 0.7587800025939941, + "learning_rate": 9.797631364082605e-06, + "loss": 0.7437, + "step": 3393 + }, + { + "epoch": 0.18680169519511255, + "grad_norm": 0.9769744873046875, + "learning_rate": 9.797509273564336e-06, + "loss": 0.8024, + "step": 3394 + }, + { + "epoch": 0.18685673399746822, + "grad_norm": 0.7662433385848999, + "learning_rate": 9.79738714698927e-06, + "loss": 0.8122, + "step": 3395 + }, + { + "epoch": 0.18691177279982388, + "grad_norm": 0.8620306849479675, + "learning_rate": 9.797264984358328e-06, + "loss": 0.7952, + "step": 3396 + }, + { + "epoch": 0.18696681160217954, + "grad_norm": 0.7542591094970703, + "learning_rate": 9.797142785672427e-06, + "loss": 0.8315, + "step": 3397 + }, + { + "epoch": 0.1870218504045352, + "grad_norm": 0.7273713946342468, + "learning_rate": 9.797020550932483e-06, + "loss": 0.7316, + "step": 3398 + }, + { + "epoch": 0.18707688920689086, + "grad_norm": 1.031592845916748, + "learning_rate": 9.796898280139417e-06, + "loss": 0.7478, + "step": 3399 + }, + { + "epoch": 0.18713192800924652, + "grad_norm": 0.791407585144043, + "learning_rate": 9.796775973294147e-06, + "loss": 0.7742, + "step": 3400 + }, + { + "epoch": 0.18718696681160218, + "grad_norm": 0.8311418294906616, + "learning_rate": 9.796653630397595e-06, + "loss": 0.8182, + "step": 3401 + }, + { + "epoch": 0.18724200561395785, + "grad_norm": 0.7960993051528931, + "learning_rate": 9.796531251450678e-06, + "loss": 0.7606, + "step": 3402 + }, + { + "epoch": 0.1872970444163135, + "grad_norm": 0.8671618103981018, + "learning_rate": 9.796408836454316e-06, + "loss": 0.7136, + "step": 3403 + }, + { + "epoch": 0.18735208321866917, + "grad_norm": 1.1071348190307617, + "learning_rate": 9.796286385409428e-06, + "loss": 0.7729, + "step": 3404 + }, + { + "epoch": 0.18740712202102483, + "grad_norm": 0.738217294216156, + "learning_rate": 9.796163898316935e-06, + "loss": 0.7425, + "step": 3405 + }, + { + "epoch": 0.1874621608233805, + "grad_norm": 0.7567199468612671, + "learning_rate": 9.796041375177758e-06, + "loss": 0.8442, + "step": 3406 + }, + { + "epoch": 0.18751719962573615, + "grad_norm": 0.7942413091659546, + "learning_rate": 9.79591881599282e-06, + "loss": 0.852, + "step": 3407 + }, + { + "epoch": 0.18757223842809181, + "grad_norm": 0.7529355883598328, + "learning_rate": 9.795796220763038e-06, + "loss": 0.8086, + "step": 3408 + }, + { + "epoch": 0.18762727723044748, + "grad_norm": 0.7645192742347717, + "learning_rate": 9.795673589489337e-06, + "loss": 0.831, + "step": 3409 + }, + { + "epoch": 0.18768231603280314, + "grad_norm": 0.694791853427887, + "learning_rate": 9.795550922172635e-06, + "loss": 0.6919, + "step": 3410 + }, + { + "epoch": 0.1877373548351588, + "grad_norm": 0.7041944265365601, + "learning_rate": 9.795428218813858e-06, + "loss": 0.7284, + "step": 3411 + }, + { + "epoch": 0.18779239363751446, + "grad_norm": 0.8972276449203491, + "learning_rate": 9.795305479413924e-06, + "loss": 0.7156, + "step": 3412 + }, + { + "epoch": 0.18784743243987012, + "grad_norm": 0.9730873107910156, + "learning_rate": 9.795182703973758e-06, + "loss": 0.8739, + "step": 3413 + }, + { + "epoch": 0.18790247124222578, + "grad_norm": 0.8137956261634827, + "learning_rate": 9.795059892494283e-06, + "loss": 0.8189, + "step": 3414 + }, + { + "epoch": 0.18795751004458142, + "grad_norm": 0.8171416521072388, + "learning_rate": 9.794937044976422e-06, + "loss": 0.9449, + "step": 3415 + }, + { + "epoch": 0.18801254884693708, + "grad_norm": 0.7929911017417908, + "learning_rate": 9.794814161421098e-06, + "loss": 0.8034, + "step": 3416 + }, + { + "epoch": 0.18806758764929274, + "grad_norm": 1.1045749187469482, + "learning_rate": 9.794691241829233e-06, + "loss": 0.875, + "step": 3417 + }, + { + "epoch": 0.1881226264516484, + "grad_norm": 0.8141040205955505, + "learning_rate": 9.794568286201752e-06, + "loss": 0.787, + "step": 3418 + }, + { + "epoch": 0.18817766525400406, + "grad_norm": 0.7615541815757751, + "learning_rate": 9.79444529453958e-06, + "loss": 0.8491, + "step": 3419 + }, + { + "epoch": 0.18823270405635972, + "grad_norm": 0.848419189453125, + "learning_rate": 9.79432226684364e-06, + "loss": 0.7445, + "step": 3420 + }, + { + "epoch": 0.18828774285871538, + "grad_norm": 0.8075067400932312, + "learning_rate": 9.794199203114858e-06, + "loss": 0.6581, + "step": 3421 + }, + { + "epoch": 0.18834278166107105, + "grad_norm": 0.8473401069641113, + "learning_rate": 9.794076103354158e-06, + "loss": 0.839, + "step": 3422 + }, + { + "epoch": 0.1883978204634267, + "grad_norm": 0.8211609721183777, + "learning_rate": 9.793952967562463e-06, + "loss": 0.7709, + "step": 3423 + }, + { + "epoch": 0.18845285926578237, + "grad_norm": 0.7527804374694824, + "learning_rate": 9.793829795740703e-06, + "loss": 0.7315, + "step": 3424 + }, + { + "epoch": 0.18850789806813803, + "grad_norm": 0.7971188426017761, + "learning_rate": 9.793706587889802e-06, + "loss": 0.7507, + "step": 3425 + }, + { + "epoch": 0.1885629368704937, + "grad_norm": 1.024066686630249, + "learning_rate": 9.793583344010684e-06, + "loss": 0.9043, + "step": 3426 + }, + { + "epoch": 0.18861797567284935, + "grad_norm": 0.7428625226020813, + "learning_rate": 9.793460064104276e-06, + "loss": 0.7435, + "step": 3427 + }, + { + "epoch": 0.18867301447520501, + "grad_norm": 0.8438264727592468, + "learning_rate": 9.793336748171507e-06, + "loss": 0.8618, + "step": 3428 + }, + { + "epoch": 0.18872805327756068, + "grad_norm": 0.7846877574920654, + "learning_rate": 9.793213396213302e-06, + "loss": 0.8064, + "step": 3429 + }, + { + "epoch": 0.18878309207991634, + "grad_norm": 0.7527204751968384, + "learning_rate": 9.793090008230587e-06, + "loss": 0.7596, + "step": 3430 + }, + { + "epoch": 0.188838130882272, + "grad_norm": 1.1236757040023804, + "learning_rate": 9.792966584224292e-06, + "loss": 0.8292, + "step": 3431 + }, + { + "epoch": 0.18889316968462766, + "grad_norm": 0.8128102421760559, + "learning_rate": 9.792843124195343e-06, + "loss": 0.8073, + "step": 3432 + }, + { + "epoch": 0.18894820848698332, + "grad_norm": 0.7668742537498474, + "learning_rate": 9.792719628144667e-06, + "loss": 0.7848, + "step": 3433 + }, + { + "epoch": 0.18900324728933898, + "grad_norm": 1.8663485050201416, + "learning_rate": 9.792596096073193e-06, + "loss": 0.9388, + "step": 3434 + }, + { + "epoch": 0.18905828609169464, + "grad_norm": 0.8066239356994629, + "learning_rate": 9.792472527981852e-06, + "loss": 0.6647, + "step": 3435 + }, + { + "epoch": 0.1891133248940503, + "grad_norm": 0.8268817067146301, + "learning_rate": 9.792348923871567e-06, + "loss": 0.9676, + "step": 3436 + }, + { + "epoch": 0.18916836369640597, + "grad_norm": 0.7165037393569946, + "learning_rate": 9.792225283743272e-06, + "loss": 0.6937, + "step": 3437 + }, + { + "epoch": 0.18922340249876163, + "grad_norm": 0.7850403785705566, + "learning_rate": 9.792101607597895e-06, + "loss": 0.7782, + "step": 3438 + }, + { + "epoch": 0.1892784413011173, + "grad_norm": 0.8839808702468872, + "learning_rate": 9.791977895436365e-06, + "loss": 0.7639, + "step": 3439 + }, + { + "epoch": 0.18933348010347295, + "grad_norm": 0.8260362148284912, + "learning_rate": 9.791854147259611e-06, + "loss": 0.8201, + "step": 3440 + }, + { + "epoch": 0.1893885189058286, + "grad_norm": 0.8792916536331177, + "learning_rate": 9.791730363068564e-06, + "loss": 0.8251, + "step": 3441 + }, + { + "epoch": 0.18944355770818427, + "grad_norm": 0.8192774653434753, + "learning_rate": 9.791606542864154e-06, + "loss": 0.7944, + "step": 3442 + }, + { + "epoch": 0.18949859651053994, + "grad_norm": 0.751470685005188, + "learning_rate": 9.791482686647313e-06, + "loss": 0.7563, + "step": 3443 + }, + { + "epoch": 0.1895536353128956, + "grad_norm": 0.8902072906494141, + "learning_rate": 9.79135879441897e-06, + "loss": 0.7719, + "step": 3444 + }, + { + "epoch": 0.18960867411525126, + "grad_norm": 0.7166435122489929, + "learning_rate": 9.791234866180058e-06, + "loss": 0.7871, + "step": 3445 + }, + { + "epoch": 0.18966371291760692, + "grad_norm": 0.763416588306427, + "learning_rate": 9.791110901931505e-06, + "loss": 0.8226, + "step": 3446 + }, + { + "epoch": 0.18971875171996258, + "grad_norm": 0.806633472442627, + "learning_rate": 9.790986901674246e-06, + "loss": 0.7828, + "step": 3447 + }, + { + "epoch": 0.18977379052231824, + "grad_norm": 0.8139312863349915, + "learning_rate": 9.790862865409213e-06, + "loss": 0.8441, + "step": 3448 + }, + { + "epoch": 0.1898288293246739, + "grad_norm": 0.8362452387809753, + "learning_rate": 9.790738793137335e-06, + "loss": 0.8765, + "step": 3449 + }, + { + "epoch": 0.18988386812702956, + "grad_norm": 0.7736263871192932, + "learning_rate": 9.790614684859549e-06, + "loss": 0.8373, + "step": 3450 + }, + { + "epoch": 0.18993890692938523, + "grad_norm": 0.8742800354957581, + "learning_rate": 9.790490540576784e-06, + "loss": 0.8976, + "step": 3451 + }, + { + "epoch": 0.1899939457317409, + "grad_norm": 0.701505720615387, + "learning_rate": 9.790366360289974e-06, + "loss": 0.7799, + "step": 3452 + }, + { + "epoch": 0.19004898453409655, + "grad_norm": 0.7771356701850891, + "learning_rate": 9.790242144000055e-06, + "loss": 0.7617, + "step": 3453 + }, + { + "epoch": 0.1901040233364522, + "grad_norm": 0.897576093673706, + "learning_rate": 9.790117891707955e-06, + "loss": 0.7817, + "step": 3454 + }, + { + "epoch": 0.19015906213880787, + "grad_norm": 0.7296561002731323, + "learning_rate": 9.789993603414613e-06, + "loss": 0.8344, + "step": 3455 + }, + { + "epoch": 0.19021410094116353, + "grad_norm": 0.8099396228790283, + "learning_rate": 9.789869279120962e-06, + "loss": 0.7369, + "step": 3456 + }, + { + "epoch": 0.1902691397435192, + "grad_norm": 0.7802554368972778, + "learning_rate": 9.789744918827935e-06, + "loss": 0.8383, + "step": 3457 + }, + { + "epoch": 0.19032417854587483, + "grad_norm": 0.7508029341697693, + "learning_rate": 9.789620522536467e-06, + "loss": 0.825, + "step": 3458 + }, + { + "epoch": 0.1903792173482305, + "grad_norm": 0.7782164216041565, + "learning_rate": 9.789496090247494e-06, + "loss": 0.7737, + "step": 3459 + }, + { + "epoch": 0.19043425615058615, + "grad_norm": 0.7711489796638489, + "learning_rate": 9.78937162196195e-06, + "loss": 0.7694, + "step": 3460 + }, + { + "epoch": 0.1904892949529418, + "grad_norm": 0.821579098701477, + "learning_rate": 9.789247117680769e-06, + "loss": 0.7493, + "step": 3461 + }, + { + "epoch": 0.19054433375529747, + "grad_norm": 0.6700833439826965, + "learning_rate": 9.789122577404892e-06, + "loss": 0.7696, + "step": 3462 + }, + { + "epoch": 0.19059937255765314, + "grad_norm": 0.854340136051178, + "learning_rate": 9.78899800113525e-06, + "loss": 0.9503, + "step": 3463 + }, + { + "epoch": 0.1906544113600088, + "grad_norm": 0.8095537424087524, + "learning_rate": 9.78887338887278e-06, + "loss": 0.8435, + "step": 3464 + }, + { + "epoch": 0.19070945016236446, + "grad_norm": 0.8156480193138123, + "learning_rate": 9.78874874061842e-06, + "loss": 0.8561, + "step": 3465 + }, + { + "epoch": 0.19076448896472012, + "grad_norm": 0.8065482378005981, + "learning_rate": 9.788624056373108e-06, + "loss": 0.7793, + "step": 3466 + }, + { + "epoch": 0.19081952776707578, + "grad_norm": 0.789601743221283, + "learning_rate": 9.788499336137778e-06, + "loss": 0.7523, + "step": 3467 + }, + { + "epoch": 0.19087456656943144, + "grad_norm": 0.8322301506996155, + "learning_rate": 9.788374579913369e-06, + "loss": 0.9034, + "step": 3468 + }, + { + "epoch": 0.1909296053717871, + "grad_norm": 0.8194506764411926, + "learning_rate": 9.788249787700818e-06, + "loss": 0.8601, + "step": 3469 + }, + { + "epoch": 0.19098464417414276, + "grad_norm": 0.8419962525367737, + "learning_rate": 9.788124959501065e-06, + "loss": 0.869, + "step": 3470 + }, + { + "epoch": 0.19103968297649843, + "grad_norm": 0.760637104511261, + "learning_rate": 9.788000095315044e-06, + "loss": 0.7293, + "step": 3471 + }, + { + "epoch": 0.1910947217788541, + "grad_norm": 1.3964574337005615, + "learning_rate": 9.787875195143697e-06, + "loss": 0.8032, + "step": 3472 + }, + { + "epoch": 0.19114976058120975, + "grad_norm": 0.8205012679100037, + "learning_rate": 9.787750258987962e-06, + "loss": 0.8868, + "step": 3473 + }, + { + "epoch": 0.1912047993835654, + "grad_norm": 0.8183104991912842, + "learning_rate": 9.78762528684878e-06, + "loss": 0.7531, + "step": 3474 + }, + { + "epoch": 0.19125983818592107, + "grad_norm": 0.7659775018692017, + "learning_rate": 9.787500278727083e-06, + "loss": 0.8081, + "step": 3475 + }, + { + "epoch": 0.19131487698827673, + "grad_norm": 0.8262091279029846, + "learning_rate": 9.787375234623819e-06, + "loss": 0.82, + "step": 3476 + }, + { + "epoch": 0.1913699157906324, + "grad_norm": 0.857761025428772, + "learning_rate": 9.787250154539923e-06, + "loss": 0.9133, + "step": 3477 + }, + { + "epoch": 0.19142495459298806, + "grad_norm": 0.7551915645599365, + "learning_rate": 9.787125038476334e-06, + "loss": 0.7822, + "step": 3478 + }, + { + "epoch": 0.19147999339534372, + "grad_norm": 0.7777357697486877, + "learning_rate": 9.786999886433998e-06, + "loss": 0.7676, + "step": 3479 + }, + { + "epoch": 0.19153503219769938, + "grad_norm": 0.8389080166816711, + "learning_rate": 9.786874698413852e-06, + "loss": 0.7901, + "step": 3480 + }, + { + "epoch": 0.19159007100005504, + "grad_norm": 0.7894837856292725, + "learning_rate": 9.786749474416836e-06, + "loss": 0.8393, + "step": 3481 + }, + { + "epoch": 0.1916451098024107, + "grad_norm": 1.9752860069274902, + "learning_rate": 9.786624214443893e-06, + "loss": 0.7611, + "step": 3482 + }, + { + "epoch": 0.19170014860476636, + "grad_norm": 0.8023802042007446, + "learning_rate": 9.786498918495963e-06, + "loss": 0.8426, + "step": 3483 + }, + { + "epoch": 0.19175518740712202, + "grad_norm": 0.7232086658477783, + "learning_rate": 9.78637358657399e-06, + "loss": 0.6611, + "step": 3484 + }, + { + "epoch": 0.19181022620947769, + "grad_norm": 0.8198665380477905, + "learning_rate": 9.786248218678912e-06, + "loss": 0.8795, + "step": 3485 + }, + { + "epoch": 0.19186526501183335, + "grad_norm": 0.942404568195343, + "learning_rate": 9.786122814811675e-06, + "loss": 0.9146, + "step": 3486 + }, + { + "epoch": 0.191920303814189, + "grad_norm": 0.7602691054344177, + "learning_rate": 9.78599737497322e-06, + "loss": 0.7514, + "step": 3487 + }, + { + "epoch": 0.19197534261654467, + "grad_norm": 0.7981933951377869, + "learning_rate": 9.785871899164489e-06, + "loss": 0.7722, + "step": 3488 + }, + { + "epoch": 0.19203038141890033, + "grad_norm": 0.8617631793022156, + "learning_rate": 9.785746387386427e-06, + "loss": 0.8989, + "step": 3489 + }, + { + "epoch": 0.192085420221256, + "grad_norm": 0.7691803574562073, + "learning_rate": 9.785620839639976e-06, + "loss": 0.7929, + "step": 3490 + }, + { + "epoch": 0.19214045902361165, + "grad_norm": 1.3053189516067505, + "learning_rate": 9.785495255926078e-06, + "loss": 0.8478, + "step": 3491 + }, + { + "epoch": 0.19219549782596732, + "grad_norm": 0.807064950466156, + "learning_rate": 9.785369636245681e-06, + "loss": 0.7452, + "step": 3492 + }, + { + "epoch": 0.19225053662832298, + "grad_norm": 0.8182778358459473, + "learning_rate": 9.785243980599726e-06, + "loss": 0.8371, + "step": 3493 + }, + { + "epoch": 0.19230557543067864, + "grad_norm": 0.7654449343681335, + "learning_rate": 9.785118288989157e-06, + "loss": 0.8321, + "step": 3494 + }, + { + "epoch": 0.1923606142330343, + "grad_norm": 0.7192448973655701, + "learning_rate": 9.784992561414922e-06, + "loss": 0.7451, + "step": 3495 + }, + { + "epoch": 0.19241565303538996, + "grad_norm": 0.8639407753944397, + "learning_rate": 9.784866797877964e-06, + "loss": 0.9272, + "step": 3496 + }, + { + "epoch": 0.19247069183774562, + "grad_norm": 0.8329927921295166, + "learning_rate": 9.784740998379225e-06, + "loss": 0.8034, + "step": 3497 + }, + { + "epoch": 0.19252573064010128, + "grad_norm": 0.7975476980209351, + "learning_rate": 9.784615162919656e-06, + "loss": 0.6885, + "step": 3498 + }, + { + "epoch": 0.19258076944245694, + "grad_norm": 0.8077559471130371, + "learning_rate": 9.7844892915002e-06, + "loss": 0.8745, + "step": 3499 + }, + { + "epoch": 0.1926358082448126, + "grad_norm": 0.7957825660705566, + "learning_rate": 9.7843633841218e-06, + "loss": 0.7612, + "step": 3500 + }, + { + "epoch": 0.19269084704716824, + "grad_norm": 0.8478250503540039, + "learning_rate": 9.784237440785408e-06, + "loss": 0.8675, + "step": 3501 + }, + { + "epoch": 0.1927458858495239, + "grad_norm": 0.7289726138114929, + "learning_rate": 9.78411146149197e-06, + "loss": 0.7126, + "step": 3502 + }, + { + "epoch": 0.19280092465187956, + "grad_norm": 0.7608509063720703, + "learning_rate": 9.783985446242427e-06, + "loss": 0.7049, + "step": 3503 + }, + { + "epoch": 0.19285596345423522, + "grad_norm": 0.8985201120376587, + "learning_rate": 9.783859395037733e-06, + "loss": 0.8067, + "step": 3504 + }, + { + "epoch": 0.19291100225659089, + "grad_norm": 0.7563273906707764, + "learning_rate": 9.78373330787883e-06, + "loss": 0.7018, + "step": 3505 + }, + { + "epoch": 0.19296604105894655, + "grad_norm": 0.8022900223731995, + "learning_rate": 9.78360718476667e-06, + "loss": 0.8346, + "step": 3506 + }, + { + "epoch": 0.1930210798613022, + "grad_norm": 0.897566020488739, + "learning_rate": 9.783481025702197e-06, + "loss": 0.9465, + "step": 3507 + }, + { + "epoch": 0.19307611866365787, + "grad_norm": 0.9550303220748901, + "learning_rate": 9.783354830686363e-06, + "loss": 0.8904, + "step": 3508 + }, + { + "epoch": 0.19313115746601353, + "grad_norm": 0.8152582049369812, + "learning_rate": 9.783228599720114e-06, + "loss": 0.7776, + "step": 3509 + }, + { + "epoch": 0.1931861962683692, + "grad_norm": 0.7421940565109253, + "learning_rate": 9.783102332804398e-06, + "loss": 0.6847, + "step": 3510 + }, + { + "epoch": 0.19324123507072485, + "grad_norm": 0.7414368391036987, + "learning_rate": 9.782976029940167e-06, + "loss": 0.8435, + "step": 3511 + }, + { + "epoch": 0.19329627387308052, + "grad_norm": 0.7845529317855835, + "learning_rate": 9.782849691128366e-06, + "loss": 0.8255, + "step": 3512 + }, + { + "epoch": 0.19335131267543618, + "grad_norm": 0.7779788970947266, + "learning_rate": 9.78272331636995e-06, + "loss": 0.7801, + "step": 3513 + }, + { + "epoch": 0.19340635147779184, + "grad_norm": 0.7537885904312134, + "learning_rate": 9.782596905665865e-06, + "loss": 0.7501, + "step": 3514 + }, + { + "epoch": 0.1934613902801475, + "grad_norm": 0.7585812211036682, + "learning_rate": 9.782470459017059e-06, + "loss": 0.8425, + "step": 3515 + }, + { + "epoch": 0.19351642908250316, + "grad_norm": 0.7923589944839478, + "learning_rate": 9.78234397642449e-06, + "loss": 0.8412, + "step": 3516 + }, + { + "epoch": 0.19357146788485882, + "grad_norm": 0.8710628151893616, + "learning_rate": 9.7822174578891e-06, + "loss": 0.8014, + "step": 3517 + }, + { + "epoch": 0.19362650668721448, + "grad_norm": 0.7646920084953308, + "learning_rate": 9.782090903411845e-06, + "loss": 0.8256, + "step": 3518 + }, + { + "epoch": 0.19368154548957014, + "grad_norm": 0.7560480833053589, + "learning_rate": 9.781964312993675e-06, + "loss": 0.7816, + "step": 3519 + }, + { + "epoch": 0.1937365842919258, + "grad_norm": 0.7438123226165771, + "learning_rate": 9.78183768663554e-06, + "loss": 0.8319, + "step": 3520 + }, + { + "epoch": 0.19379162309428147, + "grad_norm": 0.7239874601364136, + "learning_rate": 9.781711024338394e-06, + "loss": 0.6968, + "step": 3521 + }, + { + "epoch": 0.19384666189663713, + "grad_norm": 0.881197988986969, + "learning_rate": 9.781584326103188e-06, + "loss": 0.9493, + "step": 3522 + }, + { + "epoch": 0.1939017006989928, + "grad_norm": 0.7903854846954346, + "learning_rate": 9.781457591930874e-06, + "loss": 0.8312, + "step": 3523 + }, + { + "epoch": 0.19395673950134845, + "grad_norm": 0.7375456094741821, + "learning_rate": 9.781330821822405e-06, + "loss": 0.7434, + "step": 3524 + }, + { + "epoch": 0.1940117783037041, + "grad_norm": 0.7101724743843079, + "learning_rate": 9.781204015778733e-06, + "loss": 0.75, + "step": 3525 + }, + { + "epoch": 0.19406681710605977, + "grad_norm": 0.8267471194267273, + "learning_rate": 9.781077173800812e-06, + "loss": 0.8807, + "step": 3526 + }, + { + "epoch": 0.19412185590841544, + "grad_norm": 0.9014178514480591, + "learning_rate": 9.780950295889594e-06, + "loss": 0.7836, + "step": 3527 + }, + { + "epoch": 0.1941768947107711, + "grad_norm": 0.7579739689826965, + "learning_rate": 9.780823382046034e-06, + "loss": 0.8331, + "step": 3528 + }, + { + "epoch": 0.19423193351312676, + "grad_norm": 0.8308925032615662, + "learning_rate": 9.780696432271084e-06, + "loss": 0.794, + "step": 3529 + }, + { + "epoch": 0.19428697231548242, + "grad_norm": 0.7461574673652649, + "learning_rate": 9.780569446565701e-06, + "loss": 0.8155, + "step": 3530 + }, + { + "epoch": 0.19434201111783808, + "grad_norm": 0.8658885359764099, + "learning_rate": 9.780442424930836e-06, + "loss": 0.7907, + "step": 3531 + }, + { + "epoch": 0.19439704992019374, + "grad_norm": 0.7243279218673706, + "learning_rate": 9.780315367367449e-06, + "loss": 0.7985, + "step": 3532 + }, + { + "epoch": 0.1944520887225494, + "grad_norm": 0.8482224345207214, + "learning_rate": 9.780188273876486e-06, + "loss": 0.9095, + "step": 3533 + }, + { + "epoch": 0.19450712752490507, + "grad_norm": 0.8675364255905151, + "learning_rate": 9.78006114445891e-06, + "loss": 0.759, + "step": 3534 + }, + { + "epoch": 0.19456216632726073, + "grad_norm": 0.8388474583625793, + "learning_rate": 9.779933979115675e-06, + "loss": 0.8331, + "step": 3535 + }, + { + "epoch": 0.1946172051296164, + "grad_norm": 0.8050872683525085, + "learning_rate": 9.779806777847735e-06, + "loss": 0.861, + "step": 3536 + }, + { + "epoch": 0.19467224393197205, + "grad_norm": 0.8401390910148621, + "learning_rate": 9.779679540656046e-06, + "loss": 0.755, + "step": 3537 + }, + { + "epoch": 0.1947272827343277, + "grad_norm": 0.865160346031189, + "learning_rate": 9.779552267541566e-06, + "loss": 0.7515, + "step": 3538 + }, + { + "epoch": 0.19478232153668337, + "grad_norm": 0.923086941242218, + "learning_rate": 9.77942495850525e-06, + "loss": 0.8032, + "step": 3539 + }, + { + "epoch": 0.19483736033903903, + "grad_norm": 0.8402467966079712, + "learning_rate": 9.779297613548056e-06, + "loss": 0.9198, + "step": 3540 + }, + { + "epoch": 0.1948923991413947, + "grad_norm": 0.7875306606292725, + "learning_rate": 9.779170232670939e-06, + "loss": 0.712, + "step": 3541 + }, + { + "epoch": 0.19494743794375036, + "grad_norm": 0.7996379137039185, + "learning_rate": 9.779042815874858e-06, + "loss": 0.8126, + "step": 3542 + }, + { + "epoch": 0.19500247674610602, + "grad_norm": 0.7644525766372681, + "learning_rate": 9.778915363160773e-06, + "loss": 0.8602, + "step": 3543 + }, + { + "epoch": 0.19505751554846165, + "grad_norm": 0.8068630695343018, + "learning_rate": 9.778787874529635e-06, + "loss": 0.736, + "step": 3544 + }, + { + "epoch": 0.1951125543508173, + "grad_norm": 0.7889519929885864, + "learning_rate": 9.77866034998241e-06, + "loss": 0.755, + "step": 3545 + }, + { + "epoch": 0.19516759315317297, + "grad_norm": 0.7895978689193726, + "learning_rate": 9.778532789520053e-06, + "loss": 0.8213, + "step": 3546 + }, + { + "epoch": 0.19522263195552864, + "grad_norm": 0.8571796417236328, + "learning_rate": 9.77840519314352e-06, + "loss": 0.8193, + "step": 3547 + }, + { + "epoch": 0.1952776707578843, + "grad_norm": 0.6880007982254028, + "learning_rate": 9.778277560853775e-06, + "loss": 0.6354, + "step": 3548 + }, + { + "epoch": 0.19533270956023996, + "grad_norm": 0.8155353665351868, + "learning_rate": 9.778149892651775e-06, + "loss": 0.8518, + "step": 3549 + }, + { + "epoch": 0.19538774836259562, + "grad_norm": 0.851021945476532, + "learning_rate": 9.778022188538479e-06, + "loss": 0.8506, + "step": 3550 + }, + { + "epoch": 0.19544278716495128, + "grad_norm": 0.8910510540008545, + "learning_rate": 9.777894448514847e-06, + "loss": 0.8825, + "step": 3551 + }, + { + "epoch": 0.19549782596730694, + "grad_norm": 0.8156018853187561, + "learning_rate": 9.777766672581838e-06, + "loss": 0.8262, + "step": 3552 + }, + { + "epoch": 0.1955528647696626, + "grad_norm": 0.756340503692627, + "learning_rate": 9.777638860740415e-06, + "loss": 0.7094, + "step": 3553 + }, + { + "epoch": 0.19560790357201827, + "grad_norm": 0.7604243159294128, + "learning_rate": 9.777511012991538e-06, + "loss": 0.8089, + "step": 3554 + }, + { + "epoch": 0.19566294237437393, + "grad_norm": 0.7609277963638306, + "learning_rate": 9.777383129336167e-06, + "loss": 0.7853, + "step": 3555 + }, + { + "epoch": 0.1957179811767296, + "grad_norm": 1.3562177419662476, + "learning_rate": 9.77725520977526e-06, + "loss": 0.7051, + "step": 3556 + }, + { + "epoch": 0.19577301997908525, + "grad_norm": 0.7428582310676575, + "learning_rate": 9.777127254309784e-06, + "loss": 0.734, + "step": 3557 + }, + { + "epoch": 0.1958280587814409, + "grad_norm": 0.6941032409667969, + "learning_rate": 9.776999262940698e-06, + "loss": 0.7862, + "step": 3558 + }, + { + "epoch": 0.19588309758379657, + "grad_norm": 0.8249906301498413, + "learning_rate": 9.776871235668966e-06, + "loss": 0.8324, + "step": 3559 + }, + { + "epoch": 0.19593813638615223, + "grad_norm": 0.6778795719146729, + "learning_rate": 9.776743172495546e-06, + "loss": 0.743, + "step": 3560 + }, + { + "epoch": 0.1959931751885079, + "grad_norm": 0.8454411625862122, + "learning_rate": 9.776615073421405e-06, + "loss": 0.8625, + "step": 3561 + }, + { + "epoch": 0.19604821399086356, + "grad_norm": 0.8303809762001038, + "learning_rate": 9.776486938447503e-06, + "loss": 0.8806, + "step": 3562 + }, + { + "epoch": 0.19610325279321922, + "grad_norm": 0.8814080357551575, + "learning_rate": 9.776358767574803e-06, + "loss": 0.9096, + "step": 3563 + }, + { + "epoch": 0.19615829159557488, + "grad_norm": 0.7860022187232971, + "learning_rate": 9.77623056080427e-06, + "loss": 0.8101, + "step": 3564 + }, + { + "epoch": 0.19621333039793054, + "grad_norm": 0.7604898810386658, + "learning_rate": 9.776102318136866e-06, + "loss": 0.8121, + "step": 3565 + }, + { + "epoch": 0.1962683692002862, + "grad_norm": 0.810708224773407, + "learning_rate": 9.775974039573555e-06, + "loss": 0.8334, + "step": 3566 + }, + { + "epoch": 0.19632340800264186, + "grad_norm": 1.0174707174301147, + "learning_rate": 9.775845725115301e-06, + "loss": 0.8147, + "step": 3567 + }, + { + "epoch": 0.19637844680499753, + "grad_norm": 0.825137734413147, + "learning_rate": 9.77571737476307e-06, + "loss": 0.816, + "step": 3568 + }, + { + "epoch": 0.1964334856073532, + "grad_norm": 0.9023691415786743, + "learning_rate": 9.775588988517826e-06, + "loss": 0.9157, + "step": 3569 + }, + { + "epoch": 0.19648852440970885, + "grad_norm": 0.7287655472755432, + "learning_rate": 9.775460566380534e-06, + "loss": 0.7414, + "step": 3570 + }, + { + "epoch": 0.1965435632120645, + "grad_norm": 0.8675361275672913, + "learning_rate": 9.775332108352158e-06, + "loss": 0.7212, + "step": 3571 + }, + { + "epoch": 0.19659860201442017, + "grad_norm": 0.8633139729499817, + "learning_rate": 9.775203614433664e-06, + "loss": 0.7254, + "step": 3572 + }, + { + "epoch": 0.19665364081677583, + "grad_norm": 0.8628275394439697, + "learning_rate": 9.775075084626017e-06, + "loss": 0.7403, + "step": 3573 + }, + { + "epoch": 0.1967086796191315, + "grad_norm": 0.86918044090271, + "learning_rate": 9.774946518930184e-06, + "loss": 0.8208, + "step": 3574 + }, + { + "epoch": 0.19676371842148715, + "grad_norm": 1.3616218566894531, + "learning_rate": 9.774817917347132e-06, + "loss": 0.7432, + "step": 3575 + }, + { + "epoch": 0.19681875722384282, + "grad_norm": 0.929084062576294, + "learning_rate": 9.774689279877827e-06, + "loss": 0.9567, + "step": 3576 + }, + { + "epoch": 0.19687379602619848, + "grad_norm": 0.7732542753219604, + "learning_rate": 9.774560606523234e-06, + "loss": 0.8682, + "step": 3577 + }, + { + "epoch": 0.19692883482855414, + "grad_norm": 0.7933471202850342, + "learning_rate": 9.774431897284323e-06, + "loss": 0.7112, + "step": 3578 + }, + { + "epoch": 0.1969838736309098, + "grad_norm": 0.8229583501815796, + "learning_rate": 9.77430315216206e-06, + "loss": 0.762, + "step": 3579 + }, + { + "epoch": 0.19703891243326546, + "grad_norm": 0.7571341395378113, + "learning_rate": 9.774174371157412e-06, + "loss": 0.7627, + "step": 3580 + }, + { + "epoch": 0.19709395123562112, + "grad_norm": 1.1551839113235474, + "learning_rate": 9.774045554271347e-06, + "loss": 0.8621, + "step": 3581 + }, + { + "epoch": 0.19714899003797678, + "grad_norm": 0.8546237349510193, + "learning_rate": 9.773916701504833e-06, + "loss": 0.8183, + "step": 3582 + }, + { + "epoch": 0.19720402884033245, + "grad_norm": 0.7297555804252625, + "learning_rate": 9.773787812858841e-06, + "loss": 0.8098, + "step": 3583 + }, + { + "epoch": 0.1972590676426881, + "grad_norm": 0.7846053838729858, + "learning_rate": 9.773658888334336e-06, + "loss": 0.7874, + "step": 3584 + }, + { + "epoch": 0.19731410644504377, + "grad_norm": 0.8949562907218933, + "learning_rate": 9.773529927932288e-06, + "loss": 0.8651, + "step": 3585 + }, + { + "epoch": 0.19736914524739943, + "grad_norm": 0.8041829466819763, + "learning_rate": 9.773400931653668e-06, + "loss": 0.7519, + "step": 3586 + }, + { + "epoch": 0.19742418404975506, + "grad_norm": 0.8090983033180237, + "learning_rate": 9.773271899499444e-06, + "loss": 0.8606, + "step": 3587 + }, + { + "epoch": 0.19747922285211073, + "grad_norm": 0.7954100966453552, + "learning_rate": 9.773142831470587e-06, + "loss": 0.9028, + "step": 3588 + }, + { + "epoch": 0.1975342616544664, + "grad_norm": 0.6865562796592712, + "learning_rate": 9.773013727568066e-06, + "loss": 0.7323, + "step": 3589 + }, + { + "epoch": 0.19758930045682205, + "grad_norm": 0.9144858717918396, + "learning_rate": 9.772884587792851e-06, + "loss": 0.8178, + "step": 3590 + }, + { + "epoch": 0.1976443392591777, + "grad_norm": 0.8096563220024109, + "learning_rate": 9.772755412145913e-06, + "loss": 0.7749, + "step": 3591 + }, + { + "epoch": 0.19769937806153337, + "grad_norm": 1.4496957063674927, + "learning_rate": 9.772626200628222e-06, + "loss": 0.7981, + "step": 3592 + }, + { + "epoch": 0.19775441686388903, + "grad_norm": 0.7699438333511353, + "learning_rate": 9.77249695324075e-06, + "loss": 0.7683, + "step": 3593 + }, + { + "epoch": 0.1978094556662447, + "grad_norm": 0.7883017063140869, + "learning_rate": 9.77236766998447e-06, + "loss": 0.7668, + "step": 3594 + }, + { + "epoch": 0.19786449446860035, + "grad_norm": 0.7552568912506104, + "learning_rate": 9.772238350860352e-06, + "loss": 0.7914, + "step": 3595 + }, + { + "epoch": 0.19791953327095602, + "grad_norm": 0.8585009574890137, + "learning_rate": 9.772108995869366e-06, + "loss": 0.9888, + "step": 3596 + }, + { + "epoch": 0.19797457207331168, + "grad_norm": 0.9459839463233948, + "learning_rate": 9.77197960501249e-06, + "loss": 0.9923, + "step": 3597 + }, + { + "epoch": 0.19802961087566734, + "grad_norm": 0.844771683216095, + "learning_rate": 9.77185017829069e-06, + "loss": 0.8427, + "step": 3598 + }, + { + "epoch": 0.198084649678023, + "grad_norm": 0.749700665473938, + "learning_rate": 9.77172071570494e-06, + "loss": 0.8111, + "step": 3599 + }, + { + "epoch": 0.19813968848037866, + "grad_norm": 0.7297450304031372, + "learning_rate": 9.771591217256216e-06, + "loss": 0.7783, + "step": 3600 + }, + { + "epoch": 0.19819472728273432, + "grad_norm": 0.7928450703620911, + "learning_rate": 9.77146168294549e-06, + "loss": 0.8755, + "step": 3601 + }, + { + "epoch": 0.19824976608508998, + "grad_norm": 0.7236143946647644, + "learning_rate": 9.771332112773734e-06, + "loss": 0.7159, + "step": 3602 + }, + { + "epoch": 0.19830480488744565, + "grad_norm": 0.8170965313911438, + "learning_rate": 9.771202506741926e-06, + "loss": 0.9093, + "step": 3603 + }, + { + "epoch": 0.1983598436898013, + "grad_norm": 0.8834578990936279, + "learning_rate": 9.771072864851035e-06, + "loss": 0.8961, + "step": 3604 + }, + { + "epoch": 0.19841488249215697, + "grad_norm": 1.3750289678573608, + "learning_rate": 9.770943187102037e-06, + "loss": 0.8175, + "step": 3605 + }, + { + "epoch": 0.19846992129451263, + "grad_norm": 0.7016286253929138, + "learning_rate": 9.770813473495909e-06, + "loss": 0.7171, + "step": 3606 + }, + { + "epoch": 0.1985249600968683, + "grad_norm": 0.7792307734489441, + "learning_rate": 9.770683724033622e-06, + "loss": 0.6892, + "step": 3607 + }, + { + "epoch": 0.19857999889922395, + "grad_norm": 0.789820671081543, + "learning_rate": 9.770553938716153e-06, + "loss": 0.8531, + "step": 3608 + }, + { + "epoch": 0.19863503770157961, + "grad_norm": 0.7585997581481934, + "learning_rate": 9.77042411754448e-06, + "loss": 0.8195, + "step": 3609 + }, + { + "epoch": 0.19869007650393528, + "grad_norm": 0.8989273905754089, + "learning_rate": 9.770294260519573e-06, + "loss": 0.891, + "step": 3610 + }, + { + "epoch": 0.19874511530629094, + "grad_norm": 0.8044012188911438, + "learning_rate": 9.770164367642414e-06, + "loss": 0.8428, + "step": 3611 + }, + { + "epoch": 0.1988001541086466, + "grad_norm": 0.7847021222114563, + "learning_rate": 9.770034438913975e-06, + "loss": 0.8302, + "step": 3612 + }, + { + "epoch": 0.19885519291100226, + "grad_norm": 0.9260531663894653, + "learning_rate": 9.769904474335234e-06, + "loss": 0.8187, + "step": 3613 + }, + { + "epoch": 0.19891023171335792, + "grad_norm": 0.7491805553436279, + "learning_rate": 9.769774473907168e-06, + "loss": 0.8374, + "step": 3614 + }, + { + "epoch": 0.19896527051571358, + "grad_norm": 1.1665992736816406, + "learning_rate": 9.769644437630754e-06, + "loss": 0.8154, + "step": 3615 + }, + { + "epoch": 0.19902030931806924, + "grad_norm": 0.9162279963493347, + "learning_rate": 9.769514365506968e-06, + "loss": 0.8883, + "step": 3616 + }, + { + "epoch": 0.1990753481204249, + "grad_norm": 0.8980437517166138, + "learning_rate": 9.769384257536791e-06, + "loss": 0.8948, + "step": 3617 + }, + { + "epoch": 0.19913038692278057, + "grad_norm": 0.7544137835502625, + "learning_rate": 9.769254113721197e-06, + "loss": 0.7763, + "step": 3618 + }, + { + "epoch": 0.19918542572513623, + "grad_norm": 0.8393334746360779, + "learning_rate": 9.769123934061168e-06, + "loss": 0.8361, + "step": 3619 + }, + { + "epoch": 0.1992404645274919, + "grad_norm": 0.8184031248092651, + "learning_rate": 9.768993718557678e-06, + "loss": 0.8104, + "step": 3620 + }, + { + "epoch": 0.19929550332984755, + "grad_norm": 0.8023706674575806, + "learning_rate": 9.76886346721171e-06, + "loss": 0.7824, + "step": 3621 + }, + { + "epoch": 0.1993505421322032, + "grad_norm": 0.9354264736175537, + "learning_rate": 9.768733180024238e-06, + "loss": 0.7782, + "step": 3622 + }, + { + "epoch": 0.19940558093455887, + "grad_norm": 0.7037177681922913, + "learning_rate": 9.768602856996244e-06, + "loss": 0.8054, + "step": 3623 + }, + { + "epoch": 0.19946061973691454, + "grad_norm": 0.7926928997039795, + "learning_rate": 9.768472498128709e-06, + "loss": 0.8864, + "step": 3624 + }, + { + "epoch": 0.1995156585392702, + "grad_norm": 0.7963769435882568, + "learning_rate": 9.76834210342261e-06, + "loss": 0.8505, + "step": 3625 + }, + { + "epoch": 0.19957069734162586, + "grad_norm": 0.8553926944732666, + "learning_rate": 9.768211672878929e-06, + "loss": 0.8519, + "step": 3626 + }, + { + "epoch": 0.19962573614398152, + "grad_norm": 0.8147156834602356, + "learning_rate": 9.768081206498644e-06, + "loss": 0.8091, + "step": 3627 + }, + { + "epoch": 0.19968077494633718, + "grad_norm": 0.8226443529129028, + "learning_rate": 9.767950704282739e-06, + "loss": 0.8561, + "step": 3628 + }, + { + "epoch": 0.19973581374869284, + "grad_norm": 0.7246909141540527, + "learning_rate": 9.76782016623219e-06, + "loss": 0.7318, + "step": 3629 + }, + { + "epoch": 0.19979085255104848, + "grad_norm": 1.0527293682098389, + "learning_rate": 9.767689592347983e-06, + "loss": 0.7699, + "step": 3630 + }, + { + "epoch": 0.19984589135340414, + "grad_norm": 0.7433847188949585, + "learning_rate": 9.767558982631097e-06, + "loss": 0.8619, + "step": 3631 + }, + { + "epoch": 0.1999009301557598, + "grad_norm": 0.7901468873023987, + "learning_rate": 9.767428337082513e-06, + "loss": 0.8365, + "step": 3632 + }, + { + "epoch": 0.19995596895811546, + "grad_norm": 0.7766845226287842, + "learning_rate": 9.767297655703215e-06, + "loss": 0.7767, + "step": 3633 + }, + { + "epoch": 0.20001100776047112, + "grad_norm": 0.7785109281539917, + "learning_rate": 9.767166938494183e-06, + "loss": 0.7114, + "step": 3634 + }, + { + "epoch": 0.20006604656282678, + "grad_norm": 0.8068187832832336, + "learning_rate": 9.767036185456402e-06, + "loss": 0.8142, + "step": 3635 + }, + { + "epoch": 0.20012108536518244, + "grad_norm": 0.7893292307853699, + "learning_rate": 9.766905396590851e-06, + "loss": 0.8658, + "step": 3636 + }, + { + "epoch": 0.2001761241675381, + "grad_norm": 0.8647506237030029, + "learning_rate": 9.766774571898516e-06, + "loss": 0.84, + "step": 3637 + }, + { + "epoch": 0.20023116296989377, + "grad_norm": 0.8545078635215759, + "learning_rate": 9.766643711380378e-06, + "loss": 0.8455, + "step": 3638 + }, + { + "epoch": 0.20028620177224943, + "grad_norm": 0.924404501914978, + "learning_rate": 9.766512815037424e-06, + "loss": 0.6954, + "step": 3639 + }, + { + "epoch": 0.2003412405746051, + "grad_norm": 0.8077614903450012, + "learning_rate": 9.766381882870635e-06, + "loss": 0.7724, + "step": 3640 + }, + { + "epoch": 0.20039627937696075, + "grad_norm": 0.8886739015579224, + "learning_rate": 9.766250914880994e-06, + "loss": 0.8318, + "step": 3641 + }, + { + "epoch": 0.2004513181793164, + "grad_norm": 0.8086267113685608, + "learning_rate": 9.76611991106949e-06, + "loss": 0.8494, + "step": 3642 + }, + { + "epoch": 0.20050635698167207, + "grad_norm": 0.8606873750686646, + "learning_rate": 9.765988871437101e-06, + "loss": 0.8488, + "step": 3643 + }, + { + "epoch": 0.20056139578402774, + "grad_norm": 0.6966355443000793, + "learning_rate": 9.76585779598482e-06, + "loss": 0.7361, + "step": 3644 + }, + { + "epoch": 0.2006164345863834, + "grad_norm": 0.8474385738372803, + "learning_rate": 9.765726684713623e-06, + "loss": 0.8354, + "step": 3645 + }, + { + "epoch": 0.20067147338873906, + "grad_norm": 0.7609736919403076, + "learning_rate": 9.765595537624502e-06, + "loss": 0.7297, + "step": 3646 + }, + { + "epoch": 0.20072651219109472, + "grad_norm": 1.08648681640625, + "learning_rate": 9.76546435471844e-06, + "loss": 0.7534, + "step": 3647 + }, + { + "epoch": 0.20078155099345038, + "grad_norm": 0.7437332272529602, + "learning_rate": 9.765333135996425e-06, + "loss": 0.8532, + "step": 3648 + }, + { + "epoch": 0.20083658979580604, + "grad_norm": 0.9016552567481995, + "learning_rate": 9.76520188145944e-06, + "loss": 0.7968, + "step": 3649 + }, + { + "epoch": 0.2008916285981617, + "grad_norm": 0.8916428089141846, + "learning_rate": 9.765070591108473e-06, + "loss": 0.9601, + "step": 3650 + }, + { + "epoch": 0.20094666740051736, + "grad_norm": 0.7679058313369751, + "learning_rate": 9.764939264944512e-06, + "loss": 0.816, + "step": 3651 + }, + { + "epoch": 0.20100170620287303, + "grad_norm": 0.7716549634933472, + "learning_rate": 9.764807902968543e-06, + "loss": 0.876, + "step": 3652 + }, + { + "epoch": 0.2010567450052287, + "grad_norm": 0.8288074731826782, + "learning_rate": 9.764676505181554e-06, + "loss": 0.8054, + "step": 3653 + }, + { + "epoch": 0.20111178380758435, + "grad_norm": 0.7906842827796936, + "learning_rate": 9.76454507158453e-06, + "loss": 0.8026, + "step": 3654 + }, + { + "epoch": 0.20116682260994, + "grad_norm": 0.8093311190605164, + "learning_rate": 9.764413602178461e-06, + "loss": 0.8093, + "step": 3655 + }, + { + "epoch": 0.20122186141229567, + "grad_norm": 0.7234730124473572, + "learning_rate": 9.764282096964335e-06, + "loss": 0.7194, + "step": 3656 + }, + { + "epoch": 0.20127690021465133, + "grad_norm": 0.9048555493354797, + "learning_rate": 9.76415055594314e-06, + "loss": 0.8996, + "step": 3657 + }, + { + "epoch": 0.201331939017007, + "grad_norm": 0.7630691528320312, + "learning_rate": 9.764018979115864e-06, + "loss": 0.7876, + "step": 3658 + }, + { + "epoch": 0.20138697781936266, + "grad_norm": 0.9551032781600952, + "learning_rate": 9.763887366483498e-06, + "loss": 0.8249, + "step": 3659 + }, + { + "epoch": 0.20144201662171832, + "grad_norm": 0.6988314986228943, + "learning_rate": 9.76375571804703e-06, + "loss": 0.8011, + "step": 3660 + }, + { + "epoch": 0.20149705542407398, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.763624033807448e-06, + "loss": 0.8287, + "step": 3661 + }, + { + "epoch": 0.20155209422642964, + "grad_norm": 0.7201293706893921, + "learning_rate": 9.763492313765743e-06, + "loss": 0.7854, + "step": 3662 + }, + { + "epoch": 0.2016071330287853, + "grad_norm": 0.8691730499267578, + "learning_rate": 9.763360557922905e-06, + "loss": 0.8348, + "step": 3663 + }, + { + "epoch": 0.20166217183114096, + "grad_norm": 0.7660881876945496, + "learning_rate": 9.763228766279924e-06, + "loss": 0.7686, + "step": 3664 + }, + { + "epoch": 0.20171721063349662, + "grad_norm": 1.083796501159668, + "learning_rate": 9.76309693883779e-06, + "loss": 0.8848, + "step": 3665 + }, + { + "epoch": 0.20177224943585229, + "grad_norm": 0.7892678380012512, + "learning_rate": 9.762965075597496e-06, + "loss": 0.7804, + "step": 3666 + }, + { + "epoch": 0.20182728823820795, + "grad_norm": 0.7166122198104858, + "learning_rate": 9.762833176560031e-06, + "loss": 0.761, + "step": 3667 + }, + { + "epoch": 0.2018823270405636, + "grad_norm": 0.8187084794044495, + "learning_rate": 9.762701241726386e-06, + "loss": 0.8251, + "step": 3668 + }, + { + "epoch": 0.20193736584291927, + "grad_norm": 0.6930577158927917, + "learning_rate": 9.762569271097556e-06, + "loss": 0.6795, + "step": 3669 + }, + { + "epoch": 0.20199240464527493, + "grad_norm": 0.8085465431213379, + "learning_rate": 9.762437264674527e-06, + "loss": 0.8415, + "step": 3670 + }, + { + "epoch": 0.2020474434476306, + "grad_norm": 0.8111084699630737, + "learning_rate": 9.762305222458294e-06, + "loss": 0.792, + "step": 3671 + }, + { + "epoch": 0.20210248224998625, + "grad_norm": 0.8200401067733765, + "learning_rate": 9.762173144449852e-06, + "loss": 0.8224, + "step": 3672 + }, + { + "epoch": 0.2021575210523419, + "grad_norm": 0.8460109233856201, + "learning_rate": 9.762041030650192e-06, + "loss": 0.9025, + "step": 3673 + }, + { + "epoch": 0.20221255985469755, + "grad_norm": 0.8152671456336975, + "learning_rate": 9.761908881060303e-06, + "loss": 0.9002, + "step": 3674 + }, + { + "epoch": 0.2022675986570532, + "grad_norm": 0.8204773664474487, + "learning_rate": 9.761776695681185e-06, + "loss": 0.8324, + "step": 3675 + }, + { + "epoch": 0.20232263745940887, + "grad_norm": 0.8121044039726257, + "learning_rate": 9.761644474513825e-06, + "loss": 0.855, + "step": 3676 + }, + { + "epoch": 0.20237767626176453, + "grad_norm": 0.79920494556427, + "learning_rate": 9.76151221755922e-06, + "loss": 0.7837, + "step": 3677 + }, + { + "epoch": 0.2024327150641202, + "grad_norm": 0.862808346748352, + "learning_rate": 9.761379924818367e-06, + "loss": 0.8714, + "step": 3678 + }, + { + "epoch": 0.20248775386647586, + "grad_norm": 0.7135004997253418, + "learning_rate": 9.761247596292254e-06, + "loss": 0.774, + "step": 3679 + }, + { + "epoch": 0.20254279266883152, + "grad_norm": 0.7967603802680969, + "learning_rate": 9.761115231981878e-06, + "loss": 0.919, + "step": 3680 + }, + { + "epoch": 0.20259783147118718, + "grad_norm": 0.7425099611282349, + "learning_rate": 9.760982831888236e-06, + "loss": 0.819, + "step": 3681 + }, + { + "epoch": 0.20265287027354284, + "grad_norm": 0.7631763815879822, + "learning_rate": 9.760850396012323e-06, + "loss": 0.816, + "step": 3682 + }, + { + "epoch": 0.2027079090758985, + "grad_norm": 0.7931755185127258, + "learning_rate": 9.76071792435513e-06, + "loss": 0.8299, + "step": 3683 + }, + { + "epoch": 0.20276294787825416, + "grad_norm": 0.8409438729286194, + "learning_rate": 9.760585416917657e-06, + "loss": 0.8503, + "step": 3684 + }, + { + "epoch": 0.20281798668060982, + "grad_norm": 0.7632728815078735, + "learning_rate": 9.760452873700898e-06, + "loss": 0.8394, + "step": 3685 + }, + { + "epoch": 0.20287302548296549, + "grad_norm": 0.7765083312988281, + "learning_rate": 9.76032029470585e-06, + "loss": 0.8879, + "step": 3686 + }, + { + "epoch": 0.20292806428532115, + "grad_norm": 0.7736936807632446, + "learning_rate": 9.760187679933507e-06, + "loss": 0.7987, + "step": 3687 + }, + { + "epoch": 0.2029831030876768, + "grad_norm": 0.8270270824432373, + "learning_rate": 9.760055029384869e-06, + "loss": 0.8267, + "step": 3688 + }, + { + "epoch": 0.20303814189003247, + "grad_norm": 0.7742369174957275, + "learning_rate": 9.759922343060932e-06, + "loss": 0.8447, + "step": 3689 + }, + { + "epoch": 0.20309318069238813, + "grad_norm": 0.7543869018554688, + "learning_rate": 9.759789620962692e-06, + "loss": 0.7325, + "step": 3690 + }, + { + "epoch": 0.2031482194947438, + "grad_norm": 0.7913174033164978, + "learning_rate": 9.759656863091147e-06, + "loss": 0.8622, + "step": 3691 + }, + { + "epoch": 0.20320325829709945, + "grad_norm": 0.7445376515388489, + "learning_rate": 9.759524069447296e-06, + "loss": 0.7115, + "step": 3692 + }, + { + "epoch": 0.20325829709945512, + "grad_norm": 0.7744696140289307, + "learning_rate": 9.759391240032136e-06, + "loss": 0.8437, + "step": 3693 + }, + { + "epoch": 0.20331333590181078, + "grad_norm": 0.6984724998474121, + "learning_rate": 9.759258374846665e-06, + "loss": 0.7415, + "step": 3694 + }, + { + "epoch": 0.20336837470416644, + "grad_norm": 0.7453249096870422, + "learning_rate": 9.759125473891882e-06, + "loss": 0.7708, + "step": 3695 + }, + { + "epoch": 0.2034234135065221, + "grad_norm": 0.7459438443183899, + "learning_rate": 9.758992537168787e-06, + "loss": 0.7961, + "step": 3696 + }, + { + "epoch": 0.20347845230887776, + "grad_norm": 0.808944582939148, + "learning_rate": 9.758859564678377e-06, + "loss": 0.8875, + "step": 3697 + }, + { + "epoch": 0.20353349111123342, + "grad_norm": 0.7202889323234558, + "learning_rate": 9.758726556421652e-06, + "loss": 0.8064, + "step": 3698 + }, + { + "epoch": 0.20358852991358908, + "grad_norm": 0.7874952554702759, + "learning_rate": 9.758593512399613e-06, + "loss": 0.7881, + "step": 3699 + }, + { + "epoch": 0.20364356871594474, + "grad_norm": 0.771300733089447, + "learning_rate": 9.758460432613259e-06, + "loss": 0.8938, + "step": 3700 + }, + { + "epoch": 0.2036986075183004, + "grad_norm": 0.7332000136375427, + "learning_rate": 9.758327317063589e-06, + "loss": 0.7369, + "step": 3701 + }, + { + "epoch": 0.20375364632065607, + "grad_norm": 0.8206236958503723, + "learning_rate": 9.758194165751604e-06, + "loss": 0.8727, + "step": 3702 + }, + { + "epoch": 0.20380868512301173, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.758060978678308e-06, + "loss": 0.8013, + "step": 3703 + }, + { + "epoch": 0.2038637239253674, + "grad_norm": 0.7213704586029053, + "learning_rate": 9.757927755844698e-06, + "loss": 0.7413, + "step": 3704 + }, + { + "epoch": 0.20391876272772305, + "grad_norm": 0.8982640504837036, + "learning_rate": 9.757794497251776e-06, + "loss": 0.9077, + "step": 3705 + }, + { + "epoch": 0.2039738015300787, + "grad_norm": 0.8439363241195679, + "learning_rate": 9.757661202900544e-06, + "loss": 0.7887, + "step": 3706 + }, + { + "epoch": 0.20402884033243437, + "grad_norm": 0.7700560688972473, + "learning_rate": 9.757527872792005e-06, + "loss": 0.8677, + "step": 3707 + }, + { + "epoch": 0.20408387913479004, + "grad_norm": 0.7462438941001892, + "learning_rate": 9.75739450692716e-06, + "loss": 0.7937, + "step": 3708 + }, + { + "epoch": 0.2041389179371457, + "grad_norm": 0.9125999808311462, + "learning_rate": 9.75726110530701e-06, + "loss": 0.9374, + "step": 3709 + }, + { + "epoch": 0.20419395673950136, + "grad_norm": 0.8949875831604004, + "learning_rate": 9.75712766793256e-06, + "loss": 0.8586, + "step": 3710 + }, + { + "epoch": 0.20424899554185702, + "grad_norm": 0.9042442440986633, + "learning_rate": 9.756994194804812e-06, + "loss": 0.9411, + "step": 3711 + }, + { + "epoch": 0.20430403434421268, + "grad_norm": 0.7646238207817078, + "learning_rate": 9.756860685924769e-06, + "loss": 0.8353, + "step": 3712 + }, + { + "epoch": 0.20435907314656834, + "grad_norm": 0.7551934123039246, + "learning_rate": 9.756727141293434e-06, + "loss": 0.8109, + "step": 3713 + }, + { + "epoch": 0.204414111948924, + "grad_norm": 0.7526532411575317, + "learning_rate": 9.756593560911811e-06, + "loss": 0.8509, + "step": 3714 + }, + { + "epoch": 0.20446915075127967, + "grad_norm": 0.8423319458961487, + "learning_rate": 9.756459944780903e-06, + "loss": 0.9003, + "step": 3715 + }, + { + "epoch": 0.2045241895536353, + "grad_norm": 0.7966015934944153, + "learning_rate": 9.756326292901716e-06, + "loss": 0.7606, + "step": 3716 + }, + { + "epoch": 0.20457922835599096, + "grad_norm": 0.7642805576324463, + "learning_rate": 9.756192605275256e-06, + "loss": 0.8321, + "step": 3717 + }, + { + "epoch": 0.20463426715834662, + "grad_norm": 0.7285729646682739, + "learning_rate": 9.756058881902524e-06, + "loss": 0.7375, + "step": 3718 + }, + { + "epoch": 0.20468930596070228, + "grad_norm": 0.852020263671875, + "learning_rate": 9.755925122784525e-06, + "loss": 0.8207, + "step": 3719 + }, + { + "epoch": 0.20474434476305794, + "grad_norm": 0.8227072358131409, + "learning_rate": 9.755791327922268e-06, + "loss": 0.872, + "step": 3720 + }, + { + "epoch": 0.2047993835654136, + "grad_norm": 1.0128127336502075, + "learning_rate": 9.755657497316755e-06, + "loss": 0.9186, + "step": 3721 + }, + { + "epoch": 0.20485442236776927, + "grad_norm": 0.8208017349243164, + "learning_rate": 9.755523630968994e-06, + "loss": 0.6968, + "step": 3722 + }, + { + "epoch": 0.20490946117012493, + "grad_norm": 0.7716407179832458, + "learning_rate": 9.75538972887999e-06, + "loss": 0.8068, + "step": 3723 + }, + { + "epoch": 0.2049644999724806, + "grad_norm": 0.779608964920044, + "learning_rate": 9.75525579105075e-06, + "loss": 0.6968, + "step": 3724 + }, + { + "epoch": 0.20501953877483625, + "grad_norm": 0.7463479042053223, + "learning_rate": 9.75512181748228e-06, + "loss": 0.7581, + "step": 3725 + }, + { + "epoch": 0.2050745775771919, + "grad_norm": 0.8104956150054932, + "learning_rate": 9.754987808175587e-06, + "loss": 0.7838, + "step": 3726 + }, + { + "epoch": 0.20512961637954757, + "grad_norm": 0.7911564707756042, + "learning_rate": 9.75485376313168e-06, + "loss": 0.848, + "step": 3727 + }, + { + "epoch": 0.20518465518190324, + "grad_norm": 0.8340871334075928, + "learning_rate": 9.754719682351564e-06, + "loss": 0.7879, + "step": 3728 + }, + { + "epoch": 0.2052396939842589, + "grad_norm": 1.5543067455291748, + "learning_rate": 9.754585565836247e-06, + "loss": 0.8091, + "step": 3729 + }, + { + "epoch": 0.20529473278661456, + "grad_norm": 0.8262580633163452, + "learning_rate": 9.754451413586739e-06, + "loss": 0.9076, + "step": 3730 + }, + { + "epoch": 0.20534977158897022, + "grad_norm": 0.7558280825614929, + "learning_rate": 9.754317225604045e-06, + "loss": 0.7781, + "step": 3731 + }, + { + "epoch": 0.20540481039132588, + "grad_norm": 0.7197710275650024, + "learning_rate": 9.754183001889177e-06, + "loss": 0.765, + "step": 3732 + }, + { + "epoch": 0.20545984919368154, + "grad_norm": 0.8053440451622009, + "learning_rate": 9.754048742443141e-06, + "loss": 0.7986, + "step": 3733 + }, + { + "epoch": 0.2055148879960372, + "grad_norm": 0.9183983206748962, + "learning_rate": 9.753914447266947e-06, + "loss": 0.8522, + "step": 3734 + }, + { + "epoch": 0.20556992679839287, + "grad_norm": 0.8095504641532898, + "learning_rate": 9.753780116361607e-06, + "loss": 0.7243, + "step": 3735 + }, + { + "epoch": 0.20562496560074853, + "grad_norm": 0.816818356513977, + "learning_rate": 9.753645749728127e-06, + "loss": 0.8262, + "step": 3736 + }, + { + "epoch": 0.2056800044031042, + "grad_norm": 0.8425988554954529, + "learning_rate": 9.753511347367516e-06, + "loss": 0.8142, + "step": 3737 + }, + { + "epoch": 0.20573504320545985, + "grad_norm": 0.7719724178314209, + "learning_rate": 9.753376909280789e-06, + "loss": 0.8444, + "step": 3738 + }, + { + "epoch": 0.2057900820078155, + "grad_norm": 0.877646803855896, + "learning_rate": 9.753242435468952e-06, + "loss": 0.8515, + "step": 3739 + }, + { + "epoch": 0.20584512081017117, + "grad_norm": 0.9261211156845093, + "learning_rate": 9.753107925933017e-06, + "loss": 0.7605, + "step": 3740 + }, + { + "epoch": 0.20590015961252683, + "grad_norm": 0.7790889739990234, + "learning_rate": 9.752973380673995e-06, + "loss": 0.7911, + "step": 3741 + }, + { + "epoch": 0.2059551984148825, + "grad_norm": 0.7112367153167725, + "learning_rate": 9.752838799692899e-06, + "loss": 0.8212, + "step": 3742 + }, + { + "epoch": 0.20601023721723816, + "grad_norm": 0.7568365335464478, + "learning_rate": 9.752704182990736e-06, + "loss": 0.8505, + "step": 3743 + }, + { + "epoch": 0.20606527601959382, + "grad_norm": 0.7501981258392334, + "learning_rate": 9.752569530568523e-06, + "loss": 0.8191, + "step": 3744 + }, + { + "epoch": 0.20612031482194948, + "grad_norm": 0.7822220325469971, + "learning_rate": 9.752434842427268e-06, + "loss": 0.8032, + "step": 3745 + }, + { + "epoch": 0.20617535362430514, + "grad_norm": 0.810197114944458, + "learning_rate": 9.752300118567987e-06, + "loss": 0.7789, + "step": 3746 + }, + { + "epoch": 0.2062303924266608, + "grad_norm": 0.7386943101882935, + "learning_rate": 9.752165358991688e-06, + "loss": 0.7733, + "step": 3747 + }, + { + "epoch": 0.20628543122901646, + "grad_norm": 0.7086807489395142, + "learning_rate": 9.75203056369939e-06, + "loss": 0.6328, + "step": 3748 + }, + { + "epoch": 0.20634047003137213, + "grad_norm": 0.9881154894828796, + "learning_rate": 9.751895732692099e-06, + "loss": 0.8515, + "step": 3749 + }, + { + "epoch": 0.2063955088337278, + "grad_norm": 0.813521683216095, + "learning_rate": 9.751760865970831e-06, + "loss": 0.8438, + "step": 3750 + }, + { + "epoch": 0.20645054763608345, + "grad_norm": 0.8357470631599426, + "learning_rate": 9.751625963536602e-06, + "loss": 0.7635, + "step": 3751 + }, + { + "epoch": 0.2065055864384391, + "grad_norm": 0.8629693388938904, + "learning_rate": 9.751491025390423e-06, + "loss": 0.888, + "step": 3752 + }, + { + "epoch": 0.20656062524079477, + "grad_norm": 0.8844664096832275, + "learning_rate": 9.751356051533311e-06, + "loss": 0.7654, + "step": 3753 + }, + { + "epoch": 0.20661566404315043, + "grad_norm": 0.7006319165229797, + "learning_rate": 9.751221041966276e-06, + "loss": 0.7618, + "step": 3754 + }, + { + "epoch": 0.2066707028455061, + "grad_norm": 0.9291046261787415, + "learning_rate": 9.75108599669034e-06, + "loss": 0.8485, + "step": 3755 + }, + { + "epoch": 0.20672574164786175, + "grad_norm": 0.7670828700065613, + "learning_rate": 9.75095091570651e-06, + "loss": 0.7856, + "step": 3756 + }, + { + "epoch": 0.20678078045021742, + "grad_norm": 0.8709883689880371, + "learning_rate": 9.750815799015804e-06, + "loss": 0.7983, + "step": 3757 + }, + { + "epoch": 0.20683581925257308, + "grad_norm": 0.7688055634498596, + "learning_rate": 9.750680646619241e-06, + "loss": 0.8064, + "step": 3758 + }, + { + "epoch": 0.2068908580549287, + "grad_norm": 0.9492738246917725, + "learning_rate": 9.750545458517832e-06, + "loss": 0.8256, + "step": 3759 + }, + { + "epoch": 0.20694589685728437, + "grad_norm": 0.9685352444648743, + "learning_rate": 9.750410234712596e-06, + "loss": 0.839, + "step": 3760 + }, + { + "epoch": 0.20700093565964003, + "grad_norm": 0.788577139377594, + "learning_rate": 9.750274975204547e-06, + "loss": 0.8743, + "step": 3761 + }, + { + "epoch": 0.2070559744619957, + "grad_norm": 0.8496370315551758, + "learning_rate": 9.750139679994703e-06, + "loss": 0.9286, + "step": 3762 + }, + { + "epoch": 0.20711101326435136, + "grad_norm": 0.9539788961410522, + "learning_rate": 9.750004349084083e-06, + "loss": 0.7568, + "step": 3763 + }, + { + "epoch": 0.20716605206670702, + "grad_norm": 0.8825643062591553, + "learning_rate": 9.7498689824737e-06, + "loss": 0.9339, + "step": 3764 + }, + { + "epoch": 0.20722109086906268, + "grad_norm": 0.7771373391151428, + "learning_rate": 9.749733580164573e-06, + "loss": 0.851, + "step": 3765 + }, + { + "epoch": 0.20727612967141834, + "grad_norm": 0.7460281252861023, + "learning_rate": 9.749598142157721e-06, + "loss": 0.8208, + "step": 3766 + }, + { + "epoch": 0.207331168473774, + "grad_norm": 0.8370739817619324, + "learning_rate": 9.74946266845416e-06, + "loss": 0.8634, + "step": 3767 + }, + { + "epoch": 0.20738620727612966, + "grad_norm": 0.7770463228225708, + "learning_rate": 9.749327159054907e-06, + "loss": 0.7955, + "step": 3768 + }, + { + "epoch": 0.20744124607848533, + "grad_norm": 0.8048208355903625, + "learning_rate": 9.749191613960985e-06, + "loss": 0.7736, + "step": 3769 + }, + { + "epoch": 0.207496284880841, + "grad_norm": 0.9187547564506531, + "learning_rate": 9.74905603317341e-06, + "loss": 0.8534, + "step": 3770 + }, + { + "epoch": 0.20755132368319665, + "grad_norm": 0.7304024696350098, + "learning_rate": 9.7489204166932e-06, + "loss": 0.72, + "step": 3771 + }, + { + "epoch": 0.2076063624855523, + "grad_norm": 0.86177659034729, + "learning_rate": 9.748784764521376e-06, + "loss": 0.7838, + "step": 3772 + }, + { + "epoch": 0.20766140128790797, + "grad_norm": 0.7988011837005615, + "learning_rate": 9.748649076658956e-06, + "loss": 0.7776, + "step": 3773 + }, + { + "epoch": 0.20771644009026363, + "grad_norm": 0.706099808216095, + "learning_rate": 9.74851335310696e-06, + "loss": 0.759, + "step": 3774 + }, + { + "epoch": 0.2077714788926193, + "grad_norm": 0.8125914931297302, + "learning_rate": 9.748377593866412e-06, + "loss": 0.8155, + "step": 3775 + }, + { + "epoch": 0.20782651769497495, + "grad_norm": 0.8603429794311523, + "learning_rate": 9.748241798938326e-06, + "loss": 0.8018, + "step": 3776 + }, + { + "epoch": 0.20788155649733062, + "grad_norm": 0.7735254764556885, + "learning_rate": 9.748105968323726e-06, + "loss": 0.7788, + "step": 3777 + }, + { + "epoch": 0.20793659529968628, + "grad_norm": 0.9037501811981201, + "learning_rate": 9.747970102023635e-06, + "loss": 0.8907, + "step": 3778 + }, + { + "epoch": 0.20799163410204194, + "grad_norm": 0.8781846761703491, + "learning_rate": 9.74783420003907e-06, + "loss": 0.867, + "step": 3779 + }, + { + "epoch": 0.2080466729043976, + "grad_norm": 0.8486423492431641, + "learning_rate": 9.747698262371052e-06, + "loss": 0.817, + "step": 3780 + }, + { + "epoch": 0.20810171170675326, + "grad_norm": 0.8242751359939575, + "learning_rate": 9.747562289020607e-06, + "loss": 0.7385, + "step": 3781 + }, + { + "epoch": 0.20815675050910892, + "grad_norm": 0.8776529431343079, + "learning_rate": 9.747426279988754e-06, + "loss": 0.8222, + "step": 3782 + }, + { + "epoch": 0.20821178931146458, + "grad_norm": 0.7428975105285645, + "learning_rate": 9.747290235276517e-06, + "loss": 0.6954, + "step": 3783 + }, + { + "epoch": 0.20826682811382025, + "grad_norm": 0.8631997108459473, + "learning_rate": 9.747154154884917e-06, + "loss": 0.7956, + "step": 3784 + }, + { + "epoch": 0.2083218669161759, + "grad_norm": 0.7819229364395142, + "learning_rate": 9.747018038814976e-06, + "loss": 0.778, + "step": 3785 + }, + { + "epoch": 0.20837690571853157, + "grad_norm": 0.7770963311195374, + "learning_rate": 9.746881887067718e-06, + "loss": 0.8055, + "step": 3786 + }, + { + "epoch": 0.20843194452088723, + "grad_norm": 0.7168729305267334, + "learning_rate": 9.746745699644169e-06, + "loss": 0.7476, + "step": 3787 + }, + { + "epoch": 0.2084869833232429, + "grad_norm": 0.7963632941246033, + "learning_rate": 9.746609476545348e-06, + "loss": 0.8083, + "step": 3788 + }, + { + "epoch": 0.20854202212559855, + "grad_norm": 0.6689679026603699, + "learning_rate": 9.746473217772281e-06, + "loss": 0.6687, + "step": 3789 + }, + { + "epoch": 0.20859706092795421, + "grad_norm": 0.8085560202598572, + "learning_rate": 9.746336923325991e-06, + "loss": 0.8221, + "step": 3790 + }, + { + "epoch": 0.20865209973030988, + "grad_norm": 0.7215744256973267, + "learning_rate": 9.746200593207505e-06, + "loss": 0.7261, + "step": 3791 + }, + { + "epoch": 0.20870713853266554, + "grad_norm": 0.7821729779243469, + "learning_rate": 9.746064227417844e-06, + "loss": 0.7683, + "step": 3792 + }, + { + "epoch": 0.2087621773350212, + "grad_norm": 1.0014925003051758, + "learning_rate": 9.745927825958036e-06, + "loss": 0.7485, + "step": 3793 + }, + { + "epoch": 0.20881721613737686, + "grad_norm": 0.9447367787361145, + "learning_rate": 9.745791388829102e-06, + "loss": 0.835, + "step": 3794 + }, + { + "epoch": 0.20887225493973252, + "grad_norm": 0.7333751916885376, + "learning_rate": 9.745654916032073e-06, + "loss": 0.811, + "step": 3795 + }, + { + "epoch": 0.20892729374208818, + "grad_norm": 0.7516912221908569, + "learning_rate": 9.745518407567973e-06, + "loss": 0.7669, + "step": 3796 + }, + { + "epoch": 0.20898233254444384, + "grad_norm": 0.7826053500175476, + "learning_rate": 9.745381863437824e-06, + "loss": 0.7963, + "step": 3797 + }, + { + "epoch": 0.2090373713467995, + "grad_norm": 0.8258751630783081, + "learning_rate": 9.745245283642658e-06, + "loss": 0.7929, + "step": 3798 + }, + { + "epoch": 0.20909241014915517, + "grad_norm": 0.7990522980690002, + "learning_rate": 9.745108668183497e-06, + "loss": 0.8518, + "step": 3799 + }, + { + "epoch": 0.20914744895151083, + "grad_norm": 1.3855403661727905, + "learning_rate": 9.744972017061369e-06, + "loss": 0.7768, + "step": 3800 + }, + { + "epoch": 0.2092024877538665, + "grad_norm": 0.8456707000732422, + "learning_rate": 9.744835330277302e-06, + "loss": 0.7629, + "step": 3801 + }, + { + "epoch": 0.20925752655622212, + "grad_norm": 0.8992564678192139, + "learning_rate": 9.744698607832323e-06, + "loss": 0.8991, + "step": 3802 + }, + { + "epoch": 0.20931256535857778, + "grad_norm": 0.8533509969711304, + "learning_rate": 9.744561849727459e-06, + "loss": 0.8883, + "step": 3803 + }, + { + "epoch": 0.20936760416093345, + "grad_norm": 0.8363122940063477, + "learning_rate": 9.744425055963739e-06, + "loss": 0.8537, + "step": 3804 + }, + { + "epoch": 0.2094226429632891, + "grad_norm": 0.7462213039398193, + "learning_rate": 9.744288226542189e-06, + "loss": 0.7713, + "step": 3805 + }, + { + "epoch": 0.20947768176564477, + "grad_norm": 0.8148539066314697, + "learning_rate": 9.744151361463841e-06, + "loss": 0.7887, + "step": 3806 + }, + { + "epoch": 0.20953272056800043, + "grad_norm": 0.7504319548606873, + "learning_rate": 9.744014460729718e-06, + "loss": 0.7385, + "step": 3807 + }, + { + "epoch": 0.2095877593703561, + "grad_norm": 0.9291114807128906, + "learning_rate": 9.743877524340854e-06, + "loss": 0.9886, + "step": 3808 + }, + { + "epoch": 0.20964279817271175, + "grad_norm": 0.7747925519943237, + "learning_rate": 9.743740552298276e-06, + "loss": 0.8772, + "step": 3809 + }, + { + "epoch": 0.20969783697506741, + "grad_norm": 0.7283097505569458, + "learning_rate": 9.743603544603016e-06, + "loss": 0.7403, + "step": 3810 + }, + { + "epoch": 0.20975287577742308, + "grad_norm": 0.8403457999229431, + "learning_rate": 9.743466501256098e-06, + "loss": 0.7998, + "step": 3811 + }, + { + "epoch": 0.20980791457977874, + "grad_norm": 0.8218665719032288, + "learning_rate": 9.743329422258557e-06, + "loss": 0.8019, + "step": 3812 + }, + { + "epoch": 0.2098629533821344, + "grad_norm": 0.6991317868232727, + "learning_rate": 9.743192307611423e-06, + "loss": 0.743, + "step": 3813 + }, + { + "epoch": 0.20991799218449006, + "grad_norm": 0.767295241355896, + "learning_rate": 9.743055157315725e-06, + "loss": 0.8003, + "step": 3814 + }, + { + "epoch": 0.20997303098684572, + "grad_norm": 0.9457303285598755, + "learning_rate": 9.742917971372492e-06, + "loss": 0.8448, + "step": 3815 + }, + { + "epoch": 0.21002806978920138, + "grad_norm": 0.7839058637619019, + "learning_rate": 9.742780749782758e-06, + "loss": 0.8828, + "step": 3816 + }, + { + "epoch": 0.21008310859155704, + "grad_norm": 0.7831344604492188, + "learning_rate": 9.742643492547553e-06, + "loss": 0.7714, + "step": 3817 + }, + { + "epoch": 0.2101381473939127, + "grad_norm": 0.7637175917625427, + "learning_rate": 9.74250619966791e-06, + "loss": 0.7508, + "step": 3818 + }, + { + "epoch": 0.21019318619626837, + "grad_norm": 0.8830221891403198, + "learning_rate": 9.74236887114486e-06, + "loss": 0.8508, + "step": 3819 + }, + { + "epoch": 0.21024822499862403, + "grad_norm": 0.7803365588188171, + "learning_rate": 9.742231506979434e-06, + "loss": 0.8094, + "step": 3820 + }, + { + "epoch": 0.2103032638009797, + "grad_norm": 0.7701493501663208, + "learning_rate": 9.742094107172666e-06, + "loss": 0.8851, + "step": 3821 + }, + { + "epoch": 0.21035830260333535, + "grad_norm": 0.6434544324874878, + "learning_rate": 9.741956671725588e-06, + "loss": 0.7015, + "step": 3822 + }, + { + "epoch": 0.210413341405691, + "grad_norm": 0.7294684052467346, + "learning_rate": 9.741819200639233e-06, + "loss": 0.7357, + "step": 3823 + }, + { + "epoch": 0.21046838020804667, + "grad_norm": 0.702367901802063, + "learning_rate": 9.741681693914635e-06, + "loss": 0.7518, + "step": 3824 + }, + { + "epoch": 0.21052341901040234, + "grad_norm": 0.7567246556282043, + "learning_rate": 9.741544151552826e-06, + "loss": 0.8259, + "step": 3825 + }, + { + "epoch": 0.210578457812758, + "grad_norm": 0.7478607892990112, + "learning_rate": 9.741406573554841e-06, + "loss": 0.81, + "step": 3826 + }, + { + "epoch": 0.21063349661511366, + "grad_norm": 0.7270129323005676, + "learning_rate": 9.741268959921712e-06, + "loss": 0.8201, + "step": 3827 + }, + { + "epoch": 0.21068853541746932, + "grad_norm": 0.8108176589012146, + "learning_rate": 9.741131310654475e-06, + "loss": 0.8425, + "step": 3828 + }, + { + "epoch": 0.21074357421982498, + "grad_norm": 0.7773691415786743, + "learning_rate": 9.740993625754165e-06, + "loss": 0.8372, + "step": 3829 + }, + { + "epoch": 0.21079861302218064, + "grad_norm": 0.8988421559333801, + "learning_rate": 9.740855905221816e-06, + "loss": 0.8285, + "step": 3830 + }, + { + "epoch": 0.2108536518245363, + "grad_norm": 0.7339534759521484, + "learning_rate": 9.740718149058462e-06, + "loss": 0.7567, + "step": 3831 + }, + { + "epoch": 0.21090869062689196, + "grad_norm": 0.8465108275413513, + "learning_rate": 9.740580357265141e-06, + "loss": 0.8747, + "step": 3832 + }, + { + "epoch": 0.21096372942924763, + "grad_norm": 0.7956714034080505, + "learning_rate": 9.740442529842885e-06, + "loss": 0.7665, + "step": 3833 + }, + { + "epoch": 0.2110187682316033, + "grad_norm": 0.96270751953125, + "learning_rate": 9.740304666792733e-06, + "loss": 0.8338, + "step": 3834 + }, + { + "epoch": 0.21107380703395895, + "grad_norm": 0.812329113483429, + "learning_rate": 9.74016676811572e-06, + "loss": 0.8407, + "step": 3835 + }, + { + "epoch": 0.2111288458363146, + "grad_norm": 0.7975192070007324, + "learning_rate": 9.740028833812882e-06, + "loss": 0.7836, + "step": 3836 + }, + { + "epoch": 0.21118388463867027, + "grad_norm": 0.826621949672699, + "learning_rate": 9.739890863885258e-06, + "loss": 0.732, + "step": 3837 + }, + { + "epoch": 0.21123892344102593, + "grad_norm": 0.9015662670135498, + "learning_rate": 9.73975285833388e-06, + "loss": 0.8837, + "step": 3838 + }, + { + "epoch": 0.2112939622433816, + "grad_norm": 0.7641518712043762, + "learning_rate": 9.73961481715979e-06, + "loss": 0.7334, + "step": 3839 + }, + { + "epoch": 0.21134900104573726, + "grad_norm": 0.8062206506729126, + "learning_rate": 9.739476740364023e-06, + "loss": 0.8381, + "step": 3840 + }, + { + "epoch": 0.21140403984809292, + "grad_norm": 0.7301875352859497, + "learning_rate": 9.739338627947619e-06, + "loss": 0.7389, + "step": 3841 + }, + { + "epoch": 0.21145907865044858, + "grad_norm": 0.8995181322097778, + "learning_rate": 9.739200479911612e-06, + "loss": 0.8111, + "step": 3842 + }, + { + "epoch": 0.21151411745280424, + "grad_norm": 0.7154433131217957, + "learning_rate": 9.739062296257045e-06, + "loss": 0.7501, + "step": 3843 + }, + { + "epoch": 0.2115691562551599, + "grad_norm": 0.8403087854385376, + "learning_rate": 9.738924076984954e-06, + "loss": 0.8212, + "step": 3844 + }, + { + "epoch": 0.21162419505751554, + "grad_norm": 0.7616639137268066, + "learning_rate": 9.738785822096377e-06, + "loss": 0.82, + "step": 3845 + }, + { + "epoch": 0.2116792338598712, + "grad_norm": 0.7897970080375671, + "learning_rate": 9.738647531592356e-06, + "loss": 0.7972, + "step": 3846 + }, + { + "epoch": 0.21173427266222686, + "grad_norm": 0.7909015417098999, + "learning_rate": 9.738509205473928e-06, + "loss": 0.7939, + "step": 3847 + }, + { + "epoch": 0.21178931146458252, + "grad_norm": 0.9553212523460388, + "learning_rate": 9.73837084374213e-06, + "loss": 0.8672, + "step": 3848 + }, + { + "epoch": 0.21184435026693818, + "grad_norm": 0.9558283686637878, + "learning_rate": 9.73823244639801e-06, + "loss": 0.897, + "step": 3849 + }, + { + "epoch": 0.21189938906929384, + "grad_norm": 0.819530725479126, + "learning_rate": 9.7380940134426e-06, + "loss": 0.86, + "step": 3850 + }, + { + "epoch": 0.2119544278716495, + "grad_norm": 0.7301751971244812, + "learning_rate": 9.737955544876945e-06, + "loss": 0.8265, + "step": 3851 + }, + { + "epoch": 0.21200946667400516, + "grad_norm": 0.8564972281455994, + "learning_rate": 9.737817040702085e-06, + "loss": 0.8253, + "step": 3852 + }, + { + "epoch": 0.21206450547636083, + "grad_norm": 0.7715204358100891, + "learning_rate": 9.737678500919059e-06, + "loss": 0.7779, + "step": 3853 + }, + { + "epoch": 0.2121195442787165, + "grad_norm": 0.7296929955482483, + "learning_rate": 9.73753992552891e-06, + "loss": 0.787, + "step": 3854 + }, + { + "epoch": 0.21217458308107215, + "grad_norm": 0.8574217557907104, + "learning_rate": 9.73740131453268e-06, + "loss": 0.797, + "step": 3855 + }, + { + "epoch": 0.2122296218834278, + "grad_norm": 0.8320643901824951, + "learning_rate": 9.737262667931409e-06, + "loss": 0.876, + "step": 3856 + }, + { + "epoch": 0.21228466068578347, + "grad_norm": 0.7313587069511414, + "learning_rate": 9.73712398572614e-06, + "loss": 0.7151, + "step": 3857 + }, + { + "epoch": 0.21233969948813913, + "grad_norm": 0.7039312720298767, + "learning_rate": 9.736985267917916e-06, + "loss": 0.7353, + "step": 3858 + }, + { + "epoch": 0.2123947382904948, + "grad_norm": 0.7893409132957458, + "learning_rate": 9.736846514507776e-06, + "loss": 0.8383, + "step": 3859 + }, + { + "epoch": 0.21244977709285046, + "grad_norm": 0.8771371245384216, + "learning_rate": 9.736707725496767e-06, + "loss": 0.7543, + "step": 3860 + }, + { + "epoch": 0.21250481589520612, + "grad_norm": 1.0067707300186157, + "learning_rate": 9.736568900885932e-06, + "loss": 0.796, + "step": 3861 + }, + { + "epoch": 0.21255985469756178, + "grad_norm": 0.9171931743621826, + "learning_rate": 9.736430040676312e-06, + "loss": 0.8174, + "step": 3862 + }, + { + "epoch": 0.21261489349991744, + "grad_norm": 0.7616068720817566, + "learning_rate": 9.736291144868952e-06, + "loss": 0.7762, + "step": 3863 + }, + { + "epoch": 0.2126699323022731, + "grad_norm": 0.789010226726532, + "learning_rate": 9.736152213464895e-06, + "loss": 0.7749, + "step": 3864 + }, + { + "epoch": 0.21272497110462876, + "grad_norm": 0.7943348288536072, + "learning_rate": 9.736013246465187e-06, + "loss": 0.6687, + "step": 3865 + }, + { + "epoch": 0.21278000990698442, + "grad_norm": 0.8351758718490601, + "learning_rate": 9.73587424387087e-06, + "loss": 0.9201, + "step": 3866 + }, + { + "epoch": 0.21283504870934009, + "grad_norm": 0.7710975408554077, + "learning_rate": 9.735735205682991e-06, + "loss": 0.8357, + "step": 3867 + }, + { + "epoch": 0.21289008751169575, + "grad_norm": 0.8955768942832947, + "learning_rate": 9.73559613190259e-06, + "loss": 0.8396, + "step": 3868 + }, + { + "epoch": 0.2129451263140514, + "grad_norm": 0.8664666414260864, + "learning_rate": 9.735457022530722e-06, + "loss": 0.8176, + "step": 3869 + }, + { + "epoch": 0.21300016511640707, + "grad_norm": 0.7955949902534485, + "learning_rate": 9.735317877568424e-06, + "loss": 0.8421, + "step": 3870 + }, + { + "epoch": 0.21305520391876273, + "grad_norm": 0.849866509437561, + "learning_rate": 9.735178697016742e-06, + "loss": 0.7677, + "step": 3871 + }, + { + "epoch": 0.2131102427211184, + "grad_norm": 0.7782625555992126, + "learning_rate": 9.735039480876727e-06, + "loss": 0.7838, + "step": 3872 + }, + { + "epoch": 0.21316528152347405, + "grad_norm": 0.7734919190406799, + "learning_rate": 9.734900229149423e-06, + "loss": 0.757, + "step": 3873 + }, + { + "epoch": 0.21322032032582972, + "grad_norm": 0.8462040424346924, + "learning_rate": 9.734760941835876e-06, + "loss": 0.8841, + "step": 3874 + }, + { + "epoch": 0.21327535912818538, + "grad_norm": 0.7219869494438171, + "learning_rate": 9.734621618937133e-06, + "loss": 0.7651, + "step": 3875 + }, + { + "epoch": 0.21333039793054104, + "grad_norm": 0.7550874352455139, + "learning_rate": 9.734482260454241e-06, + "loss": 0.8032, + "step": 3876 + }, + { + "epoch": 0.2133854367328967, + "grad_norm": 0.7504588961601257, + "learning_rate": 9.734342866388247e-06, + "loss": 0.7923, + "step": 3877 + }, + { + "epoch": 0.21344047553525236, + "grad_norm": 0.7407390475273132, + "learning_rate": 9.7342034367402e-06, + "loss": 0.7569, + "step": 3878 + }, + { + "epoch": 0.21349551433760802, + "grad_norm": 0.7911562323570251, + "learning_rate": 9.734063971511147e-06, + "loss": 0.8726, + "step": 3879 + }, + { + "epoch": 0.21355055313996368, + "grad_norm": 0.9132450819015503, + "learning_rate": 9.733924470702139e-06, + "loss": 0.9445, + "step": 3880 + }, + { + "epoch": 0.21360559194231934, + "grad_norm": 0.9639442563056946, + "learning_rate": 9.733784934314218e-06, + "loss": 0.7307, + "step": 3881 + }, + { + "epoch": 0.213660630744675, + "grad_norm": 0.7724352478981018, + "learning_rate": 9.73364536234844e-06, + "loss": 0.8337, + "step": 3882 + }, + { + "epoch": 0.21371566954703067, + "grad_norm": 0.9643296599388123, + "learning_rate": 9.733505754805848e-06, + "loss": 0.8277, + "step": 3883 + }, + { + "epoch": 0.21377070834938633, + "grad_norm": 0.8135218620300293, + "learning_rate": 9.733366111687494e-06, + "loss": 0.7933, + "step": 3884 + }, + { + "epoch": 0.213825747151742, + "grad_norm": 0.7527105212211609, + "learning_rate": 9.733226432994426e-06, + "loss": 0.7302, + "step": 3885 + }, + { + "epoch": 0.21388078595409765, + "grad_norm": 1.090550184249878, + "learning_rate": 9.733086718727698e-06, + "loss": 0.8646, + "step": 3886 + }, + { + "epoch": 0.2139358247564533, + "grad_norm": 0.9227491617202759, + "learning_rate": 9.732946968888358e-06, + "loss": 0.8525, + "step": 3887 + }, + { + "epoch": 0.21399086355880895, + "grad_norm": 0.7781830430030823, + "learning_rate": 9.732807183477454e-06, + "loss": 0.8757, + "step": 3888 + }, + { + "epoch": 0.2140459023611646, + "grad_norm": 0.7740090489387512, + "learning_rate": 9.732667362496036e-06, + "loss": 0.7557, + "step": 3889 + }, + { + "epoch": 0.21410094116352027, + "grad_norm": 0.7341694831848145, + "learning_rate": 9.732527505945159e-06, + "loss": 0.7481, + "step": 3890 + }, + { + "epoch": 0.21415597996587593, + "grad_norm": 0.8691402673721313, + "learning_rate": 9.732387613825872e-06, + "loss": 0.8395, + "step": 3891 + }, + { + "epoch": 0.2142110187682316, + "grad_norm": 0.7845497131347656, + "learning_rate": 9.732247686139227e-06, + "loss": 0.6999, + "step": 3892 + }, + { + "epoch": 0.21426605757058725, + "grad_norm": 0.7944281697273254, + "learning_rate": 9.732107722886275e-06, + "loss": 0.7677, + "step": 3893 + }, + { + "epoch": 0.21432109637294292, + "grad_norm": 0.904195249080658, + "learning_rate": 9.731967724068065e-06, + "loss": 0.8429, + "step": 3894 + }, + { + "epoch": 0.21437613517529858, + "grad_norm": 0.7968988418579102, + "learning_rate": 9.731827689685655e-06, + "loss": 0.8224, + "step": 3895 + }, + { + "epoch": 0.21443117397765424, + "grad_norm": 0.773674726486206, + "learning_rate": 9.731687619740095e-06, + "loss": 0.7743, + "step": 3896 + }, + { + "epoch": 0.2144862127800099, + "grad_norm": 0.7873631715774536, + "learning_rate": 9.731547514232439e-06, + "loss": 0.8581, + "step": 3897 + }, + { + "epoch": 0.21454125158236556, + "grad_norm": 0.7989653944969177, + "learning_rate": 9.731407373163735e-06, + "loss": 0.8447, + "step": 3898 + }, + { + "epoch": 0.21459629038472122, + "grad_norm": 0.74820876121521, + "learning_rate": 9.73126719653504e-06, + "loss": 0.8745, + "step": 3899 + }, + { + "epoch": 0.21465132918707688, + "grad_norm": 0.7191246747970581, + "learning_rate": 9.731126984347408e-06, + "loss": 0.7533, + "step": 3900 + }, + { + "epoch": 0.21470636798943254, + "grad_norm": 0.7718465328216553, + "learning_rate": 9.730986736601893e-06, + "loss": 0.8184, + "step": 3901 + }, + { + "epoch": 0.2147614067917882, + "grad_norm": 0.7055066823959351, + "learning_rate": 9.730846453299547e-06, + "loss": 0.7352, + "step": 3902 + }, + { + "epoch": 0.21481644559414387, + "grad_norm": 0.7500855326652527, + "learning_rate": 9.730706134441425e-06, + "loss": 0.8111, + "step": 3903 + }, + { + "epoch": 0.21487148439649953, + "grad_norm": 0.7568232417106628, + "learning_rate": 9.730565780028583e-06, + "loss": 0.8126, + "step": 3904 + }, + { + "epoch": 0.2149265231988552, + "grad_norm": 0.7418738007545471, + "learning_rate": 9.730425390062075e-06, + "loss": 0.8014, + "step": 3905 + }, + { + "epoch": 0.21498156200121085, + "grad_norm": 0.7967441082000732, + "learning_rate": 9.730284964542955e-06, + "loss": 0.7965, + "step": 3906 + }, + { + "epoch": 0.2150366008035665, + "grad_norm": 0.7444791197776794, + "learning_rate": 9.730144503472281e-06, + "loss": 0.7113, + "step": 3907 + }, + { + "epoch": 0.21509163960592217, + "grad_norm": 0.8372869491577148, + "learning_rate": 9.730004006851107e-06, + "loss": 0.838, + "step": 3908 + }, + { + "epoch": 0.21514667840827784, + "grad_norm": 0.7984300851821899, + "learning_rate": 9.729863474680488e-06, + "loss": 0.856, + "step": 3909 + }, + { + "epoch": 0.2152017172106335, + "grad_norm": 0.7508612871170044, + "learning_rate": 9.72972290696148e-06, + "loss": 0.7947, + "step": 3910 + }, + { + "epoch": 0.21525675601298916, + "grad_norm": 0.7559992074966431, + "learning_rate": 9.729582303695142e-06, + "loss": 0.785, + "step": 3911 + }, + { + "epoch": 0.21531179481534482, + "grad_norm": 0.7764164209365845, + "learning_rate": 9.729441664882531e-06, + "loss": 0.8297, + "step": 3912 + }, + { + "epoch": 0.21536683361770048, + "grad_norm": 0.8112726211547852, + "learning_rate": 9.7293009905247e-06, + "loss": 0.8073, + "step": 3913 + }, + { + "epoch": 0.21542187242005614, + "grad_norm": 0.9748952388763428, + "learning_rate": 9.729160280622709e-06, + "loss": 0.7584, + "step": 3914 + }, + { + "epoch": 0.2154769112224118, + "grad_norm": 0.789191484451294, + "learning_rate": 9.729019535177617e-06, + "loss": 0.7568, + "step": 3915 + }, + { + "epoch": 0.21553195002476747, + "grad_norm": 0.7300963401794434, + "learning_rate": 9.728878754190478e-06, + "loss": 0.8029, + "step": 3916 + }, + { + "epoch": 0.21558698882712313, + "grad_norm": 0.9201067686080933, + "learning_rate": 9.728737937662354e-06, + "loss": 0.8665, + "step": 3917 + }, + { + "epoch": 0.2156420276294788, + "grad_norm": 0.8820425271987915, + "learning_rate": 9.728597085594301e-06, + "loss": 0.8378, + "step": 3918 + }, + { + "epoch": 0.21569706643183445, + "grad_norm": 0.7762684226036072, + "learning_rate": 9.728456197987376e-06, + "loss": 0.8005, + "step": 3919 + }, + { + "epoch": 0.2157521052341901, + "grad_norm": 0.8429732918739319, + "learning_rate": 9.728315274842641e-06, + "loss": 0.8337, + "step": 3920 + }, + { + "epoch": 0.21580714403654577, + "grad_norm": 0.7820748090744019, + "learning_rate": 9.728174316161156e-06, + "loss": 0.8085, + "step": 3921 + }, + { + "epoch": 0.21586218283890143, + "grad_norm": 0.8748064637184143, + "learning_rate": 9.728033321943977e-06, + "loss": 0.7734, + "step": 3922 + }, + { + "epoch": 0.2159172216412571, + "grad_norm": 0.8878050446510315, + "learning_rate": 9.727892292192166e-06, + "loss": 0.9226, + "step": 3923 + }, + { + "epoch": 0.21597226044361276, + "grad_norm": 0.8156047463417053, + "learning_rate": 9.72775122690678e-06, + "loss": 0.8111, + "step": 3924 + }, + { + "epoch": 0.21602729924596842, + "grad_norm": 0.7392945885658264, + "learning_rate": 9.727610126088883e-06, + "loss": 0.758, + "step": 3925 + }, + { + "epoch": 0.21608233804832408, + "grad_norm": 0.7573148608207703, + "learning_rate": 9.727468989739532e-06, + "loss": 0.8142, + "step": 3926 + }, + { + "epoch": 0.21613737685067974, + "grad_norm": 0.831847608089447, + "learning_rate": 9.727327817859792e-06, + "loss": 0.7337, + "step": 3927 + }, + { + "epoch": 0.2161924156530354, + "grad_norm": 0.8012371063232422, + "learning_rate": 9.72718661045072e-06, + "loss": 0.8128, + "step": 3928 + }, + { + "epoch": 0.21624745445539106, + "grad_norm": 0.7985890507698059, + "learning_rate": 9.72704536751338e-06, + "loss": 0.8549, + "step": 3929 + }, + { + "epoch": 0.21630249325774673, + "grad_norm": 0.7194695472717285, + "learning_rate": 9.726904089048832e-06, + "loss": 0.775, + "step": 3930 + }, + { + "epoch": 0.21635753206010236, + "grad_norm": 0.8029330968856812, + "learning_rate": 9.726762775058138e-06, + "loss": 0.9167, + "step": 3931 + }, + { + "epoch": 0.21641257086245802, + "grad_norm": 0.7388954162597656, + "learning_rate": 9.72662142554236e-06, + "loss": 0.7295, + "step": 3932 + }, + { + "epoch": 0.21646760966481368, + "grad_norm": 0.798796534538269, + "learning_rate": 9.726480040502559e-06, + "loss": 0.8686, + "step": 3933 + }, + { + "epoch": 0.21652264846716934, + "grad_norm": 0.9977202415466309, + "learning_rate": 9.726338619939802e-06, + "loss": 0.8387, + "step": 3934 + }, + { + "epoch": 0.216577687269525, + "grad_norm": 0.8173295855522156, + "learning_rate": 9.726197163855148e-06, + "loss": 0.7773, + "step": 3935 + }, + { + "epoch": 0.21663272607188067, + "grad_norm": 0.6519538760185242, + "learning_rate": 9.72605567224966e-06, + "loss": 0.6319, + "step": 3936 + }, + { + "epoch": 0.21668776487423633, + "grad_norm": 0.8004894852638245, + "learning_rate": 9.725914145124404e-06, + "loss": 0.8281, + "step": 3937 + }, + { + "epoch": 0.216742803676592, + "grad_norm": 0.7327558398246765, + "learning_rate": 9.725772582480442e-06, + "loss": 0.7105, + "step": 3938 + }, + { + "epoch": 0.21679784247894765, + "grad_norm": 0.7624199986457825, + "learning_rate": 9.725630984318839e-06, + "loss": 0.7823, + "step": 3939 + }, + { + "epoch": 0.2168528812813033, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.725489350640658e-06, + "loss": 0.8147, + "step": 3940 + }, + { + "epoch": 0.21690792008365897, + "grad_norm": 0.6886566877365112, + "learning_rate": 9.725347681446964e-06, + "loss": 0.7263, + "step": 3941 + }, + { + "epoch": 0.21696295888601463, + "grad_norm": 0.882060170173645, + "learning_rate": 9.725205976738821e-06, + "loss": 0.8931, + "step": 3942 + }, + { + "epoch": 0.2170179976883703, + "grad_norm": 0.7946881055831909, + "learning_rate": 9.725064236517297e-06, + "loss": 0.8036, + "step": 3943 + }, + { + "epoch": 0.21707303649072596, + "grad_norm": 0.7062187194824219, + "learning_rate": 9.724922460783453e-06, + "loss": 0.6915, + "step": 3944 + }, + { + "epoch": 0.21712807529308162, + "grad_norm": 0.7978640794754028, + "learning_rate": 9.724780649538356e-06, + "loss": 0.8873, + "step": 3945 + }, + { + "epoch": 0.21718311409543728, + "grad_norm": 0.8828096389770508, + "learning_rate": 9.724638802783073e-06, + "loss": 0.7114, + "step": 3946 + }, + { + "epoch": 0.21723815289779294, + "grad_norm": 0.7301073670387268, + "learning_rate": 9.724496920518672e-06, + "loss": 0.8107, + "step": 3947 + }, + { + "epoch": 0.2172931917001486, + "grad_norm": 0.7944212555885315, + "learning_rate": 9.724355002746213e-06, + "loss": 0.8135, + "step": 3948 + }, + { + "epoch": 0.21734823050250426, + "grad_norm": 0.7988898754119873, + "learning_rate": 9.724213049466768e-06, + "loss": 0.7173, + "step": 3949 + }, + { + "epoch": 0.21740326930485993, + "grad_norm": 0.7734915018081665, + "learning_rate": 9.724071060681401e-06, + "loss": 0.8131, + "step": 3950 + }, + { + "epoch": 0.2174583081072156, + "grad_norm": 0.6856646537780762, + "learning_rate": 9.723929036391183e-06, + "loss": 0.6873, + "step": 3951 + }, + { + "epoch": 0.21751334690957125, + "grad_norm": 0.8652976751327515, + "learning_rate": 9.723786976597179e-06, + "loss": 0.7908, + "step": 3952 + }, + { + "epoch": 0.2175683857119269, + "grad_norm": 0.7325445413589478, + "learning_rate": 9.723644881300453e-06, + "loss": 0.7389, + "step": 3953 + }, + { + "epoch": 0.21762342451428257, + "grad_norm": 0.8596270084381104, + "learning_rate": 9.723502750502079e-06, + "loss": 0.7785, + "step": 3954 + }, + { + "epoch": 0.21767846331663823, + "grad_norm": 0.739248514175415, + "learning_rate": 9.723360584203123e-06, + "loss": 0.8125, + "step": 3955 + }, + { + "epoch": 0.2177335021189939, + "grad_norm": 0.815617561340332, + "learning_rate": 9.723218382404652e-06, + "loss": 0.8682, + "step": 3956 + }, + { + "epoch": 0.21778854092134955, + "grad_norm": 0.758756160736084, + "learning_rate": 9.723076145107738e-06, + "loss": 0.7717, + "step": 3957 + }, + { + "epoch": 0.21784357972370522, + "grad_norm": 0.9007643461227417, + "learning_rate": 9.722933872313445e-06, + "loss": 0.7901, + "step": 3958 + }, + { + "epoch": 0.21789861852606088, + "grad_norm": 0.781548023223877, + "learning_rate": 9.722791564022846e-06, + "loss": 0.8338, + "step": 3959 + }, + { + "epoch": 0.21795365732841654, + "grad_norm": 0.7730190753936768, + "learning_rate": 9.722649220237011e-06, + "loss": 0.8032, + "step": 3960 + }, + { + "epoch": 0.2180086961307722, + "grad_norm": 0.8737791776657104, + "learning_rate": 9.722506840957009e-06, + "loss": 0.8436, + "step": 3961 + }, + { + "epoch": 0.21806373493312786, + "grad_norm": 0.8151329159736633, + "learning_rate": 9.722364426183908e-06, + "loss": 0.8115, + "step": 3962 + }, + { + "epoch": 0.21811877373548352, + "grad_norm": 0.7852860689163208, + "learning_rate": 9.722221975918782e-06, + "loss": 0.7977, + "step": 3963 + }, + { + "epoch": 0.21817381253783918, + "grad_norm": 0.9064140319824219, + "learning_rate": 9.722079490162698e-06, + "loss": 0.8799, + "step": 3964 + }, + { + "epoch": 0.21822885134019485, + "grad_norm": 0.8579906821250916, + "learning_rate": 9.72193696891673e-06, + "loss": 0.7825, + "step": 3965 + }, + { + "epoch": 0.2182838901425505, + "grad_norm": 0.8005900382995605, + "learning_rate": 9.721794412181946e-06, + "loss": 0.8601, + "step": 3966 + }, + { + "epoch": 0.21833892894490617, + "grad_norm": 0.7661529183387756, + "learning_rate": 9.721651819959421e-06, + "loss": 0.7446, + "step": 3967 + }, + { + "epoch": 0.21839396774726183, + "grad_norm": 0.7558436989784241, + "learning_rate": 9.721509192250224e-06, + "loss": 0.7484, + "step": 3968 + }, + { + "epoch": 0.2184490065496175, + "grad_norm": 0.765446126461029, + "learning_rate": 9.721366529055427e-06, + "loss": 0.7727, + "step": 3969 + }, + { + "epoch": 0.21850404535197315, + "grad_norm": 0.7329973578453064, + "learning_rate": 9.721223830376103e-06, + "loss": 0.797, + "step": 3970 + }, + { + "epoch": 0.21855908415432881, + "grad_norm": 0.8881974220275879, + "learning_rate": 9.721081096213324e-06, + "loss": 0.9199, + "step": 3971 + }, + { + "epoch": 0.21861412295668448, + "grad_norm": 0.8246786594390869, + "learning_rate": 9.720938326568165e-06, + "loss": 0.9108, + "step": 3972 + }, + { + "epoch": 0.21866916175904014, + "grad_norm": 0.7187291979789734, + "learning_rate": 9.720795521441697e-06, + "loss": 0.7756, + "step": 3973 + }, + { + "epoch": 0.21872420056139577, + "grad_norm": 0.7880695462226868, + "learning_rate": 9.720652680834995e-06, + "loss": 0.8548, + "step": 3974 + }, + { + "epoch": 0.21877923936375143, + "grad_norm": 0.8841108679771423, + "learning_rate": 9.720509804749128e-06, + "loss": 0.8477, + "step": 3975 + }, + { + "epoch": 0.2188342781661071, + "grad_norm": 0.9061402678489685, + "learning_rate": 9.720366893185173e-06, + "loss": 0.8235, + "step": 3976 + }, + { + "epoch": 0.21888931696846275, + "grad_norm": 0.8342392444610596, + "learning_rate": 9.720223946144206e-06, + "loss": 0.7777, + "step": 3977 + }, + { + "epoch": 0.21894435577081842, + "grad_norm": 0.7933762073516846, + "learning_rate": 9.720080963627299e-06, + "loss": 0.7943, + "step": 3978 + }, + { + "epoch": 0.21899939457317408, + "grad_norm": 0.8358896374702454, + "learning_rate": 9.719937945635527e-06, + "loss": 0.8932, + "step": 3979 + }, + { + "epoch": 0.21905443337552974, + "grad_norm": 0.7479808926582336, + "learning_rate": 9.719794892169964e-06, + "loss": 0.7446, + "step": 3980 + }, + { + "epoch": 0.2191094721778854, + "grad_norm": 0.7920958399772644, + "learning_rate": 9.719651803231685e-06, + "loss": 0.7489, + "step": 3981 + }, + { + "epoch": 0.21916451098024106, + "grad_norm": 0.7098824977874756, + "learning_rate": 9.719508678821768e-06, + "loss": 0.7763, + "step": 3982 + }, + { + "epoch": 0.21921954978259672, + "grad_norm": 0.8733491897583008, + "learning_rate": 9.719365518941288e-06, + "loss": 0.7325, + "step": 3983 + }, + { + "epoch": 0.21927458858495238, + "grad_norm": 0.8328796029090881, + "learning_rate": 9.719222323591318e-06, + "loss": 0.9097, + "step": 3984 + }, + { + "epoch": 0.21932962738730805, + "grad_norm": 0.7869352698326111, + "learning_rate": 9.719079092772936e-06, + "loss": 0.759, + "step": 3985 + }, + { + "epoch": 0.2193846661896637, + "grad_norm": 0.8278539180755615, + "learning_rate": 9.718935826487221e-06, + "loss": 0.8545, + "step": 3986 + }, + { + "epoch": 0.21943970499201937, + "grad_norm": 0.8122449517250061, + "learning_rate": 9.718792524735246e-06, + "loss": 0.7646, + "step": 3987 + }, + { + "epoch": 0.21949474379437503, + "grad_norm": 1.072253942489624, + "learning_rate": 9.71864918751809e-06, + "loss": 0.915, + "step": 3988 + }, + { + "epoch": 0.2195497825967307, + "grad_norm": 0.7770013213157654, + "learning_rate": 9.718505814836829e-06, + "loss": 0.7561, + "step": 3989 + }, + { + "epoch": 0.21960482139908635, + "grad_norm": 0.9011678695678711, + "learning_rate": 9.718362406692544e-06, + "loss": 0.7532, + "step": 3990 + }, + { + "epoch": 0.21965986020144201, + "grad_norm": 0.8867584466934204, + "learning_rate": 9.718218963086307e-06, + "loss": 0.8732, + "step": 3991 + }, + { + "epoch": 0.21971489900379768, + "grad_norm": 0.8884773850440979, + "learning_rate": 9.718075484019201e-06, + "loss": 0.7403, + "step": 3992 + }, + { + "epoch": 0.21976993780615334, + "grad_norm": 0.8995673060417175, + "learning_rate": 9.7179319694923e-06, + "loss": 0.9283, + "step": 3993 + }, + { + "epoch": 0.219824976608509, + "grad_norm": 0.7875818014144897, + "learning_rate": 9.717788419506688e-06, + "loss": 0.8633, + "step": 3994 + }, + { + "epoch": 0.21988001541086466, + "grad_norm": 0.7693219184875488, + "learning_rate": 9.71764483406344e-06, + "loss": 0.8073, + "step": 3995 + }, + { + "epoch": 0.21993505421322032, + "grad_norm": 0.7932817339897156, + "learning_rate": 9.717501213163636e-06, + "loss": 0.7537, + "step": 3996 + }, + { + "epoch": 0.21999009301557598, + "grad_norm": 0.8274912238121033, + "learning_rate": 9.717357556808358e-06, + "loss": 0.7715, + "step": 3997 + }, + { + "epoch": 0.22004513181793164, + "grad_norm": 0.7533993124961853, + "learning_rate": 9.71721386499868e-06, + "loss": 0.7482, + "step": 3998 + }, + { + "epoch": 0.2201001706202873, + "grad_norm": 1.028228759765625, + "learning_rate": 9.717070137735687e-06, + "loss": 0.9897, + "step": 3999 + }, + { + "epoch": 0.22015520942264297, + "grad_norm": 1.1093978881835938, + "learning_rate": 9.716926375020457e-06, + "loss": 0.8701, + "step": 4000 + }, + { + "epoch": 0.22021024822499863, + "grad_norm": 0.7891124486923218, + "learning_rate": 9.716782576854073e-06, + "loss": 0.8533, + "step": 4001 + }, + { + "epoch": 0.2202652870273543, + "grad_norm": 1.1783788204193115, + "learning_rate": 9.716638743237611e-06, + "loss": 0.8088, + "step": 4002 + }, + { + "epoch": 0.22032032582970995, + "grad_norm": 0.8713383078575134, + "learning_rate": 9.716494874172157e-06, + "loss": 0.8382, + "step": 4003 + }, + { + "epoch": 0.2203753646320656, + "grad_norm": 0.7821565270423889, + "learning_rate": 9.716350969658787e-06, + "loss": 0.8168, + "step": 4004 + }, + { + "epoch": 0.22043040343442127, + "grad_norm": 0.7642589211463928, + "learning_rate": 9.716207029698589e-06, + "loss": 0.7209, + "step": 4005 + }, + { + "epoch": 0.22048544223677694, + "grad_norm": 0.935625433921814, + "learning_rate": 9.716063054292639e-06, + "loss": 0.8436, + "step": 4006 + }, + { + "epoch": 0.2205404810391326, + "grad_norm": 0.7064627408981323, + "learning_rate": 9.715919043442024e-06, + "loss": 0.7651, + "step": 4007 + }, + { + "epoch": 0.22059551984148826, + "grad_norm": 0.6980876326560974, + "learning_rate": 9.715774997147823e-06, + "loss": 0.7842, + "step": 4008 + }, + { + "epoch": 0.22065055864384392, + "grad_norm": 0.7691119313240051, + "learning_rate": 9.715630915411118e-06, + "loss": 0.7345, + "step": 4009 + }, + { + "epoch": 0.22070559744619958, + "grad_norm": 0.8870186805725098, + "learning_rate": 9.715486798232994e-06, + "loss": 0.7531, + "step": 4010 + }, + { + "epoch": 0.22076063624855524, + "grad_norm": 0.7225383520126343, + "learning_rate": 9.715342645614533e-06, + "loss": 0.8543, + "step": 4011 + }, + { + "epoch": 0.2208156750509109, + "grad_norm": 0.7517428994178772, + "learning_rate": 9.71519845755682e-06, + "loss": 0.84, + "step": 4012 + }, + { + "epoch": 0.22087071385326656, + "grad_norm": 0.8115549087524414, + "learning_rate": 9.715054234060937e-06, + "loss": 0.7823, + "step": 4013 + }, + { + "epoch": 0.22092575265562223, + "grad_norm": 1.6656148433685303, + "learning_rate": 9.714909975127968e-06, + "loss": 0.8951, + "step": 4014 + }, + { + "epoch": 0.2209807914579779, + "grad_norm": 0.906508207321167, + "learning_rate": 9.714765680758997e-06, + "loss": 0.8599, + "step": 4015 + }, + { + "epoch": 0.22103583026033355, + "grad_norm": 0.8274093866348267, + "learning_rate": 9.71462135095511e-06, + "loss": 0.9568, + "step": 4016 + }, + { + "epoch": 0.22109086906268918, + "grad_norm": 0.7745386958122253, + "learning_rate": 9.714476985717393e-06, + "loss": 0.8641, + "step": 4017 + }, + { + "epoch": 0.22114590786504484, + "grad_norm": 0.8112689256668091, + "learning_rate": 9.714332585046928e-06, + "loss": 0.834, + "step": 4018 + }, + { + "epoch": 0.2212009466674005, + "grad_norm": 0.916847825050354, + "learning_rate": 9.714188148944799e-06, + "loss": 0.8546, + "step": 4019 + }, + { + "epoch": 0.22125598546975617, + "grad_norm": 0.8595414161682129, + "learning_rate": 9.714043677412096e-06, + "loss": 0.9388, + "step": 4020 + }, + { + "epoch": 0.22131102427211183, + "grad_norm": 0.8672438263893127, + "learning_rate": 9.713899170449901e-06, + "loss": 0.8151, + "step": 4021 + }, + { + "epoch": 0.2213660630744675, + "grad_norm": 0.699749767780304, + "learning_rate": 9.713754628059304e-06, + "loss": 0.7433, + "step": 4022 + }, + { + "epoch": 0.22142110187682315, + "grad_norm": 0.8071898818016052, + "learning_rate": 9.713610050241387e-06, + "loss": 0.7663, + "step": 4023 + }, + { + "epoch": 0.2214761406791788, + "grad_norm": 0.745030403137207, + "learning_rate": 9.713465436997239e-06, + "loss": 0.7733, + "step": 4024 + }, + { + "epoch": 0.22153117948153447, + "grad_norm": 0.8034930229187012, + "learning_rate": 9.713320788327947e-06, + "loss": 0.9015, + "step": 4025 + }, + { + "epoch": 0.22158621828389014, + "grad_norm": 0.8549708724021912, + "learning_rate": 9.713176104234597e-06, + "loss": 0.7127, + "step": 4026 + }, + { + "epoch": 0.2216412570862458, + "grad_norm": 0.8432256579399109, + "learning_rate": 9.713031384718277e-06, + "loss": 0.8163, + "step": 4027 + }, + { + "epoch": 0.22169629588860146, + "grad_norm": 0.7623703479766846, + "learning_rate": 9.712886629780075e-06, + "loss": 0.8272, + "step": 4028 + }, + { + "epoch": 0.22175133469095712, + "grad_norm": 0.8425806760787964, + "learning_rate": 9.712741839421079e-06, + "loss": 0.7907, + "step": 4029 + }, + { + "epoch": 0.22180637349331278, + "grad_norm": 0.7477750778198242, + "learning_rate": 9.712597013642376e-06, + "loss": 0.7662, + "step": 4030 + }, + { + "epoch": 0.22186141229566844, + "grad_norm": 0.7761805057525635, + "learning_rate": 9.712452152445056e-06, + "loss": 0.7999, + "step": 4031 + }, + { + "epoch": 0.2219164510980241, + "grad_norm": 0.8604531288146973, + "learning_rate": 9.712307255830207e-06, + "loss": 0.812, + "step": 4032 + }, + { + "epoch": 0.22197148990037976, + "grad_norm": 0.8113332986831665, + "learning_rate": 9.712162323798918e-06, + "loss": 0.8092, + "step": 4033 + }, + { + "epoch": 0.22202652870273543, + "grad_norm": 0.7980128526687622, + "learning_rate": 9.71201735635228e-06, + "loss": 0.6934, + "step": 4034 + }, + { + "epoch": 0.2220815675050911, + "grad_norm": 0.7819470763206482, + "learning_rate": 9.711872353491377e-06, + "loss": 0.8531, + "step": 4035 + }, + { + "epoch": 0.22213660630744675, + "grad_norm": 0.8283445835113525, + "learning_rate": 9.711727315217305e-06, + "loss": 0.8594, + "step": 4036 + }, + { + "epoch": 0.2221916451098024, + "grad_norm": 0.7282612919807434, + "learning_rate": 9.711582241531153e-06, + "loss": 0.7374, + "step": 4037 + }, + { + "epoch": 0.22224668391215807, + "grad_norm": 0.9564353823661804, + "learning_rate": 9.711437132434007e-06, + "loss": 0.7996, + "step": 4038 + }, + { + "epoch": 0.22230172271451373, + "grad_norm": 0.8559701442718506, + "learning_rate": 9.711291987926963e-06, + "loss": 0.949, + "step": 4039 + }, + { + "epoch": 0.2223567615168694, + "grad_norm": 0.7515334486961365, + "learning_rate": 9.71114680801111e-06, + "loss": 0.7188, + "step": 4040 + }, + { + "epoch": 0.22241180031922506, + "grad_norm": 0.7685608863830566, + "learning_rate": 9.711001592687537e-06, + "loss": 0.7679, + "step": 4041 + }, + { + "epoch": 0.22246683912158072, + "grad_norm": 0.6848913431167603, + "learning_rate": 9.710856341957337e-06, + "loss": 0.7666, + "step": 4042 + }, + { + "epoch": 0.22252187792393638, + "grad_norm": 0.7270542979240417, + "learning_rate": 9.710711055821602e-06, + "loss": 0.7563, + "step": 4043 + }, + { + "epoch": 0.22257691672629204, + "grad_norm": 0.7965164184570312, + "learning_rate": 9.710565734281424e-06, + "loss": 0.7586, + "step": 4044 + }, + { + "epoch": 0.2226319555286477, + "grad_norm": 0.7872949242591858, + "learning_rate": 9.710420377337895e-06, + "loss": 0.8423, + "step": 4045 + }, + { + "epoch": 0.22268699433100336, + "grad_norm": 0.7466526627540588, + "learning_rate": 9.710274984992107e-06, + "loss": 0.7578, + "step": 4046 + }, + { + "epoch": 0.22274203313335902, + "grad_norm": 0.7208731770515442, + "learning_rate": 9.710129557245154e-06, + "loss": 0.7019, + "step": 4047 + }, + { + "epoch": 0.22279707193571469, + "grad_norm": 0.6953400373458862, + "learning_rate": 9.709984094098127e-06, + "loss": 0.7234, + "step": 4048 + }, + { + "epoch": 0.22285211073807035, + "grad_norm": 0.7866283059120178, + "learning_rate": 9.709838595552122e-06, + "loss": 0.785, + "step": 4049 + }, + { + "epoch": 0.222907149540426, + "grad_norm": 0.7404114007949829, + "learning_rate": 9.709693061608227e-06, + "loss": 0.7706, + "step": 4050 + }, + { + "epoch": 0.22296218834278167, + "grad_norm": 0.8788254857063293, + "learning_rate": 9.709547492267544e-06, + "loss": 0.8392, + "step": 4051 + }, + { + "epoch": 0.22301722714513733, + "grad_norm": 0.7493161559104919, + "learning_rate": 9.70940188753116e-06, + "loss": 0.8346, + "step": 4052 + }, + { + "epoch": 0.223072265947493, + "grad_norm": 0.7340379357337952, + "learning_rate": 9.709256247400174e-06, + "loss": 0.7715, + "step": 4053 + }, + { + "epoch": 0.22312730474984865, + "grad_norm": 0.7291178107261658, + "learning_rate": 9.709110571875677e-06, + "loss": 0.866, + "step": 4054 + }, + { + "epoch": 0.22318234355220432, + "grad_norm": 0.8046013712882996, + "learning_rate": 9.708964860958765e-06, + "loss": 0.7885, + "step": 4055 + }, + { + "epoch": 0.22323738235455998, + "grad_norm": 0.832941472530365, + "learning_rate": 9.708819114650535e-06, + "loss": 0.873, + "step": 4056 + }, + { + "epoch": 0.22329242115691564, + "grad_norm": 0.6933377981185913, + "learning_rate": 9.70867333295208e-06, + "loss": 0.7944, + "step": 4057 + }, + { + "epoch": 0.2233474599592713, + "grad_norm": 0.7976044416427612, + "learning_rate": 9.708527515864499e-06, + "loss": 0.72, + "step": 4058 + }, + { + "epoch": 0.22340249876162696, + "grad_norm": 0.7698904871940613, + "learning_rate": 9.708381663388884e-06, + "loss": 0.7603, + "step": 4059 + }, + { + "epoch": 0.2234575375639826, + "grad_norm": 0.7554401159286499, + "learning_rate": 9.708235775526331e-06, + "loss": 0.7488, + "step": 4060 + }, + { + "epoch": 0.22351257636633826, + "grad_norm": 0.7382954359054565, + "learning_rate": 9.70808985227794e-06, + "loss": 0.7418, + "step": 4061 + }, + { + "epoch": 0.22356761516869392, + "grad_norm": 0.7220499515533447, + "learning_rate": 9.707943893644806e-06, + "loss": 0.7691, + "step": 4062 + }, + { + "epoch": 0.22362265397104958, + "grad_norm": 0.727542519569397, + "learning_rate": 9.707797899628027e-06, + "loss": 0.7603, + "step": 4063 + }, + { + "epoch": 0.22367769277340524, + "grad_norm": 0.7857500910758972, + "learning_rate": 9.707651870228697e-06, + "loss": 0.8633, + "step": 4064 + }, + { + "epoch": 0.2237327315757609, + "grad_norm": 0.7975600361824036, + "learning_rate": 9.707505805447917e-06, + "loss": 0.8591, + "step": 4065 + }, + { + "epoch": 0.22378777037811656, + "grad_norm": 1.0063475370407104, + "learning_rate": 9.707359705286784e-06, + "loss": 0.7935, + "step": 4066 + }, + { + "epoch": 0.22384280918047222, + "grad_norm": 0.7307062745094299, + "learning_rate": 9.707213569746393e-06, + "loss": 0.797, + "step": 4067 + }, + { + "epoch": 0.22389784798282789, + "grad_norm": 0.7891914248466492, + "learning_rate": 9.707067398827847e-06, + "loss": 0.853, + "step": 4068 + }, + { + "epoch": 0.22395288678518355, + "grad_norm": 0.7479422092437744, + "learning_rate": 9.706921192532242e-06, + "loss": 0.7359, + "step": 4069 + }, + { + "epoch": 0.2240079255875392, + "grad_norm": 0.8436065912246704, + "learning_rate": 9.706774950860676e-06, + "loss": 0.7916, + "step": 4070 + }, + { + "epoch": 0.22406296438989487, + "grad_norm": 0.7586960196495056, + "learning_rate": 9.706628673814252e-06, + "loss": 0.7871, + "step": 4071 + }, + { + "epoch": 0.22411800319225053, + "grad_norm": 0.8181111812591553, + "learning_rate": 9.706482361394064e-06, + "loss": 0.7782, + "step": 4072 + }, + { + "epoch": 0.2241730419946062, + "grad_norm": 0.7205253839492798, + "learning_rate": 9.706336013601217e-06, + "loss": 0.7912, + "step": 4073 + }, + { + "epoch": 0.22422808079696185, + "grad_norm": 0.9823397397994995, + "learning_rate": 9.706189630436806e-06, + "loss": 0.8393, + "step": 4074 + }, + { + "epoch": 0.22428311959931752, + "grad_norm": 0.7360854148864746, + "learning_rate": 9.706043211901935e-06, + "loss": 0.8239, + "step": 4075 + }, + { + "epoch": 0.22433815840167318, + "grad_norm": 0.7590144872665405, + "learning_rate": 9.705896757997701e-06, + "loss": 0.7177, + "step": 4076 + }, + { + "epoch": 0.22439319720402884, + "grad_norm": 0.7691343426704407, + "learning_rate": 9.70575026872521e-06, + "loss": 0.7731, + "step": 4077 + }, + { + "epoch": 0.2244482360063845, + "grad_norm": 0.7057286500930786, + "learning_rate": 9.705603744085556e-06, + "loss": 0.7746, + "step": 4078 + }, + { + "epoch": 0.22450327480874016, + "grad_norm": 0.7954769134521484, + "learning_rate": 9.705457184079847e-06, + "loss": 0.8215, + "step": 4079 + }, + { + "epoch": 0.22455831361109582, + "grad_norm": 0.7089072465896606, + "learning_rate": 9.70531058870918e-06, + "loss": 0.7263, + "step": 4080 + }, + { + "epoch": 0.22461335241345148, + "grad_norm": 0.9847552180290222, + "learning_rate": 9.705163957974657e-06, + "loss": 0.8948, + "step": 4081 + }, + { + "epoch": 0.22466839121580715, + "grad_norm": 0.7977012395858765, + "learning_rate": 9.705017291877383e-06, + "loss": 0.7518, + "step": 4082 + }, + { + "epoch": 0.2247234300181628, + "grad_norm": 0.8084518909454346, + "learning_rate": 9.704870590418458e-06, + "loss": 0.8711, + "step": 4083 + }, + { + "epoch": 0.22477846882051847, + "grad_norm": 0.9151536822319031, + "learning_rate": 9.704723853598986e-06, + "loss": 0.8217, + "step": 4084 + }, + { + "epoch": 0.22483350762287413, + "grad_norm": 0.908136248588562, + "learning_rate": 9.704577081420065e-06, + "loss": 0.6961, + "step": 4085 + }, + { + "epoch": 0.2248885464252298, + "grad_norm": 0.8569996953010559, + "learning_rate": 9.704430273882806e-06, + "loss": 0.8405, + "step": 4086 + }, + { + "epoch": 0.22494358522758545, + "grad_norm": 0.7687774300575256, + "learning_rate": 9.704283430988307e-06, + "loss": 0.6903, + "step": 4087 + }, + { + "epoch": 0.2249986240299411, + "grad_norm": 0.863203763961792, + "learning_rate": 9.704136552737673e-06, + "loss": 0.8927, + "step": 4088 + }, + { + "epoch": 0.22505366283229677, + "grad_norm": 1.252581238746643, + "learning_rate": 9.703989639132008e-06, + "loss": 0.8792, + "step": 4089 + }, + { + "epoch": 0.22510870163465244, + "grad_norm": 0.7844160795211792, + "learning_rate": 9.703842690172415e-06, + "loss": 0.844, + "step": 4090 + }, + { + "epoch": 0.2251637404370081, + "grad_norm": 0.8669766187667847, + "learning_rate": 9.703695705860002e-06, + "loss": 0.7008, + "step": 4091 + }, + { + "epoch": 0.22521877923936376, + "grad_norm": 0.7180137634277344, + "learning_rate": 9.703548686195869e-06, + "loss": 0.8242, + "step": 4092 + }, + { + "epoch": 0.22527381804171942, + "grad_norm": 0.7225000858306885, + "learning_rate": 9.703401631181124e-06, + "loss": 0.724, + "step": 4093 + }, + { + "epoch": 0.22532885684407508, + "grad_norm": 0.8348065614700317, + "learning_rate": 9.70325454081687e-06, + "loss": 0.7996, + "step": 4094 + }, + { + "epoch": 0.22538389564643074, + "grad_norm": 0.8099488019943237, + "learning_rate": 9.703107415104216e-06, + "loss": 0.7498, + "step": 4095 + }, + { + "epoch": 0.2254389344487864, + "grad_norm": 0.7051188945770264, + "learning_rate": 9.702960254044264e-06, + "loss": 0.7322, + "step": 4096 + }, + { + "epoch": 0.22549397325114207, + "grad_norm": 0.742859423160553, + "learning_rate": 9.702813057638122e-06, + "loss": 0.746, + "step": 4097 + }, + { + "epoch": 0.22554901205349773, + "grad_norm": 0.7981536984443665, + "learning_rate": 9.702665825886897e-06, + "loss": 0.8705, + "step": 4098 + }, + { + "epoch": 0.2256040508558534, + "grad_norm": 1.0317178964614868, + "learning_rate": 9.702518558791693e-06, + "loss": 0.8261, + "step": 4099 + }, + { + "epoch": 0.22565908965820905, + "grad_norm": 0.7811983823776245, + "learning_rate": 9.702371256353618e-06, + "loss": 0.7633, + "step": 4100 + }, + { + "epoch": 0.2257141284605647, + "grad_norm": 0.8288078308105469, + "learning_rate": 9.702223918573782e-06, + "loss": 0.7974, + "step": 4101 + }, + { + "epoch": 0.22576916726292034, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.702076545453286e-06, + "loss": 0.7517, + "step": 4102 + }, + { + "epoch": 0.225824206065276, + "grad_norm": 0.8342248201370239, + "learning_rate": 9.701929136993243e-06, + "loss": 0.8634, + "step": 4103 + }, + { + "epoch": 0.22587924486763167, + "grad_norm": 0.790392279624939, + "learning_rate": 9.701781693194761e-06, + "loss": 0.7705, + "step": 4104 + }, + { + "epoch": 0.22593428366998733, + "grad_norm": 0.824691891670227, + "learning_rate": 9.701634214058944e-06, + "loss": 0.877, + "step": 4105 + }, + { + "epoch": 0.225989322472343, + "grad_norm": 0.9237051010131836, + "learning_rate": 9.701486699586904e-06, + "loss": 0.842, + "step": 4106 + }, + { + "epoch": 0.22604436127469865, + "grad_norm": 0.7453535199165344, + "learning_rate": 9.701339149779747e-06, + "loss": 0.8217, + "step": 4107 + }, + { + "epoch": 0.2260994000770543, + "grad_norm": 0.727872371673584, + "learning_rate": 9.701191564638586e-06, + "loss": 0.849, + "step": 4108 + }, + { + "epoch": 0.22615443887940997, + "grad_norm": 0.966585636138916, + "learning_rate": 9.701043944164526e-06, + "loss": 0.7742, + "step": 4109 + }, + { + "epoch": 0.22620947768176564, + "grad_norm": 0.7556117177009583, + "learning_rate": 9.700896288358678e-06, + "loss": 0.7498, + "step": 4110 + }, + { + "epoch": 0.2262645164841213, + "grad_norm": 0.848143458366394, + "learning_rate": 9.700748597222151e-06, + "loss": 0.7237, + "step": 4111 + }, + { + "epoch": 0.22631955528647696, + "grad_norm": 0.9046787619590759, + "learning_rate": 9.700600870756056e-06, + "loss": 0.8066, + "step": 4112 + }, + { + "epoch": 0.22637459408883262, + "grad_norm": 0.923159658908844, + "learning_rate": 9.700453108961505e-06, + "loss": 0.8404, + "step": 4113 + }, + { + "epoch": 0.22642963289118828, + "grad_norm": 0.8697664737701416, + "learning_rate": 9.700305311839606e-06, + "loss": 0.7269, + "step": 4114 + }, + { + "epoch": 0.22648467169354394, + "grad_norm": 0.8179994821548462, + "learning_rate": 9.70015747939147e-06, + "loss": 0.8083, + "step": 4115 + }, + { + "epoch": 0.2265397104958996, + "grad_norm": 0.7961694002151489, + "learning_rate": 9.700009611618208e-06, + "loss": 0.7327, + "step": 4116 + }, + { + "epoch": 0.22659474929825527, + "grad_norm": 0.7317802309989929, + "learning_rate": 9.699861708520934e-06, + "loss": 0.8273, + "step": 4117 + }, + { + "epoch": 0.22664978810061093, + "grad_norm": 0.9190557599067688, + "learning_rate": 9.699713770100757e-06, + "loss": 0.8027, + "step": 4118 + }, + { + "epoch": 0.2267048269029666, + "grad_norm": 0.7618072628974915, + "learning_rate": 9.699565796358788e-06, + "loss": 0.7669, + "step": 4119 + }, + { + "epoch": 0.22675986570532225, + "grad_norm": 1.0236154794692993, + "learning_rate": 9.699417787296139e-06, + "loss": 0.7511, + "step": 4120 + }, + { + "epoch": 0.2268149045076779, + "grad_norm": 0.8011670708656311, + "learning_rate": 9.699269742913927e-06, + "loss": 0.7644, + "step": 4121 + }, + { + "epoch": 0.22686994331003357, + "grad_norm": 0.7808024287223816, + "learning_rate": 9.69912166321326e-06, + "loss": 0.7894, + "step": 4122 + }, + { + "epoch": 0.22692498211238923, + "grad_norm": 0.8645655512809753, + "learning_rate": 9.698973548195252e-06, + "loss": 0.7989, + "step": 4123 + }, + { + "epoch": 0.2269800209147449, + "grad_norm": 0.7478770613670349, + "learning_rate": 9.698825397861017e-06, + "loss": 0.7758, + "step": 4124 + }, + { + "epoch": 0.22703505971710056, + "grad_norm": 0.8988361954689026, + "learning_rate": 9.698677212211668e-06, + "loss": 0.8312, + "step": 4125 + }, + { + "epoch": 0.22709009851945622, + "grad_norm": 0.773028552532196, + "learning_rate": 9.69852899124832e-06, + "loss": 0.7415, + "step": 4126 + }, + { + "epoch": 0.22714513732181188, + "grad_norm": 0.8173778653144836, + "learning_rate": 9.698380734972085e-06, + "loss": 0.8241, + "step": 4127 + }, + { + "epoch": 0.22720017612416754, + "grad_norm": 0.7868672013282776, + "learning_rate": 9.698232443384078e-06, + "loss": 0.7294, + "step": 4128 + }, + { + "epoch": 0.2272552149265232, + "grad_norm": 0.8662189841270447, + "learning_rate": 9.698084116485413e-06, + "loss": 0.9307, + "step": 4129 + }, + { + "epoch": 0.22731025372887886, + "grad_norm": 0.7571321129798889, + "learning_rate": 9.697935754277207e-06, + "loss": 0.7756, + "step": 4130 + }, + { + "epoch": 0.22736529253123453, + "grad_norm": 0.8222649097442627, + "learning_rate": 9.697787356760574e-06, + "loss": 0.8689, + "step": 4131 + }, + { + "epoch": 0.2274203313335902, + "grad_norm": 0.8302241563796997, + "learning_rate": 9.697638923936626e-06, + "loss": 0.8139, + "step": 4132 + }, + { + "epoch": 0.22747537013594585, + "grad_norm": 0.779951274394989, + "learning_rate": 9.697490455806482e-06, + "loss": 0.7493, + "step": 4133 + }, + { + "epoch": 0.2275304089383015, + "grad_norm": 0.8409813046455383, + "learning_rate": 9.697341952371257e-06, + "loss": 0.777, + "step": 4134 + }, + { + "epoch": 0.22758544774065717, + "grad_norm": 0.8599729537963867, + "learning_rate": 9.697193413632068e-06, + "loss": 0.7678, + "step": 4135 + }, + { + "epoch": 0.22764048654301283, + "grad_norm": 0.7505115270614624, + "learning_rate": 9.69704483959003e-06, + "loss": 0.787, + "step": 4136 + }, + { + "epoch": 0.2276955253453685, + "grad_norm": 0.7326868176460266, + "learning_rate": 9.696896230246262e-06, + "loss": 0.7066, + "step": 4137 + }, + { + "epoch": 0.22775056414772415, + "grad_norm": 0.8269753456115723, + "learning_rate": 9.696747585601878e-06, + "loss": 0.7379, + "step": 4138 + }, + { + "epoch": 0.22780560295007982, + "grad_norm": 0.7841970324516296, + "learning_rate": 9.696598905657997e-06, + "loss": 0.764, + "step": 4139 + }, + { + "epoch": 0.22786064175243548, + "grad_norm": 0.7131417989730835, + "learning_rate": 9.696450190415735e-06, + "loss": 0.7629, + "step": 4140 + }, + { + "epoch": 0.22791568055479114, + "grad_norm": 0.7922703623771667, + "learning_rate": 9.69630143987621e-06, + "loss": 0.8354, + "step": 4141 + }, + { + "epoch": 0.2279707193571468, + "grad_norm": 0.9628629684448242, + "learning_rate": 9.696152654040543e-06, + "loss": 0.8077, + "step": 4142 + }, + { + "epoch": 0.22802575815950246, + "grad_norm": 0.8566663265228271, + "learning_rate": 9.696003832909847e-06, + "loss": 0.685, + "step": 4143 + }, + { + "epoch": 0.22808079696185812, + "grad_norm": 0.7181339859962463, + "learning_rate": 9.695854976485244e-06, + "loss": 0.8135, + "step": 4144 + }, + { + "epoch": 0.22813583576421376, + "grad_norm": 0.9119813442230225, + "learning_rate": 9.695706084767853e-06, + "loss": 0.7276, + "step": 4145 + }, + { + "epoch": 0.22819087456656942, + "grad_norm": 0.8547400832176208, + "learning_rate": 9.69555715775879e-06, + "loss": 0.8656, + "step": 4146 + }, + { + "epoch": 0.22824591336892508, + "grad_norm": 0.77585768699646, + "learning_rate": 9.695408195459179e-06, + "loss": 0.8218, + "step": 4147 + }, + { + "epoch": 0.22830095217128074, + "grad_norm": 0.7832447290420532, + "learning_rate": 9.695259197870135e-06, + "loss": 0.8002, + "step": 4148 + }, + { + "epoch": 0.2283559909736364, + "grad_norm": 0.9184865355491638, + "learning_rate": 9.69511016499278e-06, + "loss": 0.8651, + "step": 4149 + }, + { + "epoch": 0.22841102977599206, + "grad_norm": 0.8663797974586487, + "learning_rate": 9.694961096828235e-06, + "loss": 0.7381, + "step": 4150 + }, + { + "epoch": 0.22846606857834773, + "grad_norm": 0.843265950679779, + "learning_rate": 9.694811993377617e-06, + "loss": 0.8546, + "step": 4151 + }, + { + "epoch": 0.2285211073807034, + "grad_norm": 0.8021818399429321, + "learning_rate": 9.694662854642049e-06, + "loss": 0.9166, + "step": 4152 + }, + { + "epoch": 0.22857614618305905, + "grad_norm": 0.7762879729270935, + "learning_rate": 9.694513680622653e-06, + "loss": 0.7055, + "step": 4153 + }, + { + "epoch": 0.2286311849854147, + "grad_norm": 0.809352457523346, + "learning_rate": 9.694364471320548e-06, + "loss": 0.7988, + "step": 4154 + }, + { + "epoch": 0.22868622378777037, + "grad_norm": 0.7239902019500732, + "learning_rate": 9.694215226736858e-06, + "loss": 0.7783, + "step": 4155 + }, + { + "epoch": 0.22874126259012603, + "grad_norm": 0.7072625160217285, + "learning_rate": 9.694065946872702e-06, + "loss": 0.7607, + "step": 4156 + }, + { + "epoch": 0.2287963013924817, + "grad_norm": 0.7696169018745422, + "learning_rate": 9.693916631729201e-06, + "loss": 0.7519, + "step": 4157 + }, + { + "epoch": 0.22885134019483735, + "grad_norm": 0.9198557734489441, + "learning_rate": 9.69376728130748e-06, + "loss": 0.7754, + "step": 4158 + }, + { + "epoch": 0.22890637899719302, + "grad_norm": 0.7589097619056702, + "learning_rate": 9.693617895608662e-06, + "loss": 0.7258, + "step": 4159 + }, + { + "epoch": 0.22896141779954868, + "grad_norm": 0.8351333141326904, + "learning_rate": 9.693468474633867e-06, + "loss": 0.8633, + "step": 4160 + }, + { + "epoch": 0.22901645660190434, + "grad_norm": 0.8331828713417053, + "learning_rate": 9.69331901838422e-06, + "loss": 0.7361, + "step": 4161 + }, + { + "epoch": 0.22907149540426, + "grad_norm": 0.8810774087905884, + "learning_rate": 9.693169526860843e-06, + "loss": 0.7651, + "step": 4162 + }, + { + "epoch": 0.22912653420661566, + "grad_norm": 0.8151684999465942, + "learning_rate": 9.69302000006486e-06, + "loss": 0.8533, + "step": 4163 + }, + { + "epoch": 0.22918157300897132, + "grad_norm": 0.8683320879936218, + "learning_rate": 9.692870437997394e-06, + "loss": 0.8323, + "step": 4164 + }, + { + "epoch": 0.22923661181132698, + "grad_norm": 0.7488875389099121, + "learning_rate": 9.692720840659572e-06, + "loss": 0.8414, + "step": 4165 + }, + { + "epoch": 0.22929165061368265, + "grad_norm": 0.7916452288627625, + "learning_rate": 9.692571208052515e-06, + "loss": 0.7058, + "step": 4166 + }, + { + "epoch": 0.2293466894160383, + "grad_norm": 0.8228384256362915, + "learning_rate": 9.69242154017735e-06, + "loss": 0.7667, + "step": 4167 + }, + { + "epoch": 0.22940172821839397, + "grad_norm": 0.7395613789558411, + "learning_rate": 9.692271837035202e-06, + "loss": 0.7649, + "step": 4168 + }, + { + "epoch": 0.22945676702074963, + "grad_norm": 0.7187666893005371, + "learning_rate": 9.692122098627192e-06, + "loss": 0.7575, + "step": 4169 + }, + { + "epoch": 0.2295118058231053, + "grad_norm": 0.7060030102729797, + "learning_rate": 9.691972324954449e-06, + "loss": 0.8309, + "step": 4170 + }, + { + "epoch": 0.22956684462546095, + "grad_norm": 0.7807210087776184, + "learning_rate": 9.691822516018099e-06, + "loss": 0.8185, + "step": 4171 + }, + { + "epoch": 0.22962188342781661, + "grad_norm": 0.6918593645095825, + "learning_rate": 9.691672671819265e-06, + "loss": 0.6983, + "step": 4172 + }, + { + "epoch": 0.22967692223017228, + "grad_norm": 0.7947858572006226, + "learning_rate": 9.691522792359077e-06, + "loss": 0.8098, + "step": 4173 + }, + { + "epoch": 0.22973196103252794, + "grad_norm": 0.7907306551933289, + "learning_rate": 9.691372877638658e-06, + "loss": 0.8, + "step": 4174 + }, + { + "epoch": 0.2297869998348836, + "grad_norm": 0.7669435739517212, + "learning_rate": 9.691222927659137e-06, + "loss": 0.8121, + "step": 4175 + }, + { + "epoch": 0.22984203863723926, + "grad_norm": 0.8128299117088318, + "learning_rate": 9.691072942421642e-06, + "loss": 0.7554, + "step": 4176 + }, + { + "epoch": 0.22989707743959492, + "grad_norm": 0.9043960571289062, + "learning_rate": 9.690922921927295e-06, + "loss": 0.8601, + "step": 4177 + }, + { + "epoch": 0.22995211624195058, + "grad_norm": 0.835445761680603, + "learning_rate": 9.690772866177229e-06, + "loss": 0.8185, + "step": 4178 + }, + { + "epoch": 0.23000715504430624, + "grad_norm": 0.734601616859436, + "learning_rate": 9.69062277517257e-06, + "loss": 0.6486, + "step": 4179 + }, + { + "epoch": 0.2300621938466619, + "grad_norm": 0.8252671957015991, + "learning_rate": 9.690472648914445e-06, + "loss": 0.8455, + "step": 4180 + }, + { + "epoch": 0.23011723264901757, + "grad_norm": 0.8266329169273376, + "learning_rate": 9.690322487403984e-06, + "loss": 0.7348, + "step": 4181 + }, + { + "epoch": 0.23017227145137323, + "grad_norm": 0.8280256390571594, + "learning_rate": 9.690172290642314e-06, + "loss": 0.8191, + "step": 4182 + }, + { + "epoch": 0.2302273102537289, + "grad_norm": 0.8854276537895203, + "learning_rate": 9.690022058630564e-06, + "loss": 0.9327, + "step": 4183 + }, + { + "epoch": 0.23028234905608455, + "grad_norm": 0.7308807969093323, + "learning_rate": 9.689871791369865e-06, + "loss": 0.8144, + "step": 4184 + }, + { + "epoch": 0.2303373878584402, + "grad_norm": 0.7171719670295715, + "learning_rate": 9.689721488861344e-06, + "loss": 0.8265, + "step": 4185 + }, + { + "epoch": 0.23039242666079587, + "grad_norm": 0.7955548763275146, + "learning_rate": 9.689571151106131e-06, + "loss": 0.7313, + "step": 4186 + }, + { + "epoch": 0.23044746546315154, + "grad_norm": 0.8218876123428345, + "learning_rate": 9.689420778105359e-06, + "loss": 0.883, + "step": 4187 + }, + { + "epoch": 0.23050250426550717, + "grad_norm": 0.79570072889328, + "learning_rate": 9.689270369860154e-06, + "loss": 0.8898, + "step": 4188 + }, + { + "epoch": 0.23055754306786283, + "grad_norm": 0.8163344264030457, + "learning_rate": 9.689119926371649e-06, + "loss": 0.8638, + "step": 4189 + }, + { + "epoch": 0.2306125818702185, + "grad_norm": 0.7767764329910278, + "learning_rate": 9.688969447640972e-06, + "loss": 0.7822, + "step": 4190 + }, + { + "epoch": 0.23066762067257415, + "grad_norm": 0.9357114434242249, + "learning_rate": 9.688818933669258e-06, + "loss": 0.8031, + "step": 4191 + }, + { + "epoch": 0.23072265947492981, + "grad_norm": 0.8340080380439758, + "learning_rate": 9.688668384457635e-06, + "loss": 0.8947, + "step": 4192 + }, + { + "epoch": 0.23077769827728548, + "grad_norm": 0.8187471628189087, + "learning_rate": 9.688517800007235e-06, + "loss": 0.7989, + "step": 4193 + }, + { + "epoch": 0.23083273707964114, + "grad_norm": 0.8131871819496155, + "learning_rate": 9.688367180319191e-06, + "loss": 0.8377, + "step": 4194 + }, + { + "epoch": 0.2308877758819968, + "grad_norm": 0.7933448553085327, + "learning_rate": 9.688216525394634e-06, + "loss": 0.8723, + "step": 4195 + }, + { + "epoch": 0.23094281468435246, + "grad_norm": 0.7262325286865234, + "learning_rate": 9.688065835234695e-06, + "loss": 0.7802, + "step": 4196 + }, + { + "epoch": 0.23099785348670812, + "grad_norm": 0.8289293050765991, + "learning_rate": 9.68791510984051e-06, + "loss": 0.642, + "step": 4197 + }, + { + "epoch": 0.23105289228906378, + "grad_norm": 0.8835988640785217, + "learning_rate": 9.687764349213211e-06, + "loss": 0.9002, + "step": 4198 + }, + { + "epoch": 0.23110793109141944, + "grad_norm": 0.9478649497032166, + "learning_rate": 9.687613553353927e-06, + "loss": 0.8668, + "step": 4199 + }, + { + "epoch": 0.2311629698937751, + "grad_norm": 0.872936487197876, + "learning_rate": 9.687462722263796e-06, + "loss": 0.8312, + "step": 4200 + }, + { + "epoch": 0.23121800869613077, + "grad_norm": 0.7073879241943359, + "learning_rate": 9.68731185594395e-06, + "loss": 0.776, + "step": 4201 + }, + { + "epoch": 0.23127304749848643, + "grad_norm": 0.8265218734741211, + "learning_rate": 9.687160954395522e-06, + "loss": 0.8152, + "step": 4202 + }, + { + "epoch": 0.2313280863008421, + "grad_norm": 0.8027207255363464, + "learning_rate": 9.687010017619649e-06, + "loss": 0.9514, + "step": 4203 + }, + { + "epoch": 0.23138312510319775, + "grad_norm": 0.7416790127754211, + "learning_rate": 9.68685904561746e-06, + "loss": 0.7708, + "step": 4204 + }, + { + "epoch": 0.2314381639055534, + "grad_norm": 0.7916150689125061, + "learning_rate": 9.686708038390096e-06, + "loss": 0.7753, + "step": 4205 + }, + { + "epoch": 0.23149320270790907, + "grad_norm": 0.7213300466537476, + "learning_rate": 9.686556995938688e-06, + "loss": 0.83, + "step": 4206 + }, + { + "epoch": 0.23154824151026474, + "grad_norm": 0.7595892548561096, + "learning_rate": 9.68640591826437e-06, + "loss": 0.8186, + "step": 4207 + }, + { + "epoch": 0.2316032803126204, + "grad_norm": 0.7042104601860046, + "learning_rate": 9.686254805368282e-06, + "loss": 0.7126, + "step": 4208 + }, + { + "epoch": 0.23165831911497606, + "grad_norm": 0.7416805028915405, + "learning_rate": 9.686103657251558e-06, + "loss": 0.7791, + "step": 4209 + }, + { + "epoch": 0.23171335791733172, + "grad_norm": 0.9868568181991577, + "learning_rate": 9.685952473915333e-06, + "loss": 0.8453, + "step": 4210 + }, + { + "epoch": 0.23176839671968738, + "grad_norm": 0.7133191823959351, + "learning_rate": 9.68580125536074e-06, + "loss": 0.6061, + "step": 4211 + }, + { + "epoch": 0.23182343552204304, + "grad_norm": 0.8307366967201233, + "learning_rate": 9.685650001588921e-06, + "loss": 0.8403, + "step": 4212 + }, + { + "epoch": 0.2318784743243987, + "grad_norm": 0.8395226001739502, + "learning_rate": 9.685498712601014e-06, + "loss": 0.7945, + "step": 4213 + }, + { + "epoch": 0.23193351312675436, + "grad_norm": 0.7557219862937927, + "learning_rate": 9.68534738839815e-06, + "loss": 0.7765, + "step": 4214 + }, + { + "epoch": 0.23198855192911003, + "grad_norm": 0.7003554105758667, + "learning_rate": 9.68519602898147e-06, + "loss": 0.7228, + "step": 4215 + }, + { + "epoch": 0.2320435907314657, + "grad_norm": 0.8422999382019043, + "learning_rate": 9.68504463435211e-06, + "loss": 0.8524, + "step": 4216 + }, + { + "epoch": 0.23209862953382135, + "grad_norm": 0.9369016289710999, + "learning_rate": 9.68489320451121e-06, + "loss": 0.7646, + "step": 4217 + }, + { + "epoch": 0.232153668336177, + "grad_norm": 0.8456607460975647, + "learning_rate": 9.684741739459905e-06, + "loss": 0.7481, + "step": 4218 + }, + { + "epoch": 0.23220870713853267, + "grad_norm": 0.9284812211990356, + "learning_rate": 9.684590239199336e-06, + "loss": 0.8192, + "step": 4219 + }, + { + "epoch": 0.23226374594088833, + "grad_norm": 0.8474242687225342, + "learning_rate": 9.68443870373064e-06, + "loss": 0.7143, + "step": 4220 + }, + { + "epoch": 0.232318784743244, + "grad_norm": 0.8259334564208984, + "learning_rate": 9.684287133054957e-06, + "loss": 0.8667, + "step": 4221 + }, + { + "epoch": 0.23237382354559966, + "grad_norm": 0.8016416430473328, + "learning_rate": 9.684135527173427e-06, + "loss": 0.8694, + "step": 4222 + }, + { + "epoch": 0.23242886234795532, + "grad_norm": 0.7575937509536743, + "learning_rate": 9.683983886087186e-06, + "loss": 0.7591, + "step": 4223 + }, + { + "epoch": 0.23248390115031098, + "grad_norm": 0.7004683613777161, + "learning_rate": 9.683832209797377e-06, + "loss": 0.739, + "step": 4224 + }, + { + "epoch": 0.23253893995266664, + "grad_norm": 0.8265832662582397, + "learning_rate": 9.68368049830514e-06, + "loss": 0.7705, + "step": 4225 + }, + { + "epoch": 0.2325939787550223, + "grad_norm": 0.7705711722373962, + "learning_rate": 9.683528751611612e-06, + "loss": 0.7896, + "step": 4226 + }, + { + "epoch": 0.23264901755737796, + "grad_norm": 0.7426978349685669, + "learning_rate": 9.683376969717937e-06, + "loss": 0.8217, + "step": 4227 + }, + { + "epoch": 0.23270405635973362, + "grad_norm": 0.7425839304924011, + "learning_rate": 9.683225152625255e-06, + "loss": 0.7426, + "step": 4228 + }, + { + "epoch": 0.23275909516208929, + "grad_norm": 1.0415440797805786, + "learning_rate": 9.683073300334705e-06, + "loss": 0.8585, + "step": 4229 + }, + { + "epoch": 0.23281413396444495, + "grad_norm": 0.7706055045127869, + "learning_rate": 9.68292141284743e-06, + "loss": 0.8349, + "step": 4230 + }, + { + "epoch": 0.23286917276680058, + "grad_norm": 0.8407607674598694, + "learning_rate": 9.682769490164572e-06, + "loss": 0.8592, + "step": 4231 + }, + { + "epoch": 0.23292421156915624, + "grad_norm": 0.6830767393112183, + "learning_rate": 9.68261753228727e-06, + "loss": 0.6773, + "step": 4232 + }, + { + "epoch": 0.2329792503715119, + "grad_norm": 1.6661429405212402, + "learning_rate": 9.68246553921667e-06, + "loss": 1.005, + "step": 4233 + }, + { + "epoch": 0.23303428917386756, + "grad_norm": 0.7677092552185059, + "learning_rate": 9.682313510953912e-06, + "loss": 0.7689, + "step": 4234 + }, + { + "epoch": 0.23308932797622323, + "grad_norm": 0.7232248187065125, + "learning_rate": 9.682161447500139e-06, + "loss": 0.7765, + "step": 4235 + }, + { + "epoch": 0.2331443667785789, + "grad_norm": 0.8667388558387756, + "learning_rate": 9.682009348856494e-06, + "loss": 0.8099, + "step": 4236 + }, + { + "epoch": 0.23319940558093455, + "grad_norm": 0.8220446705818176, + "learning_rate": 9.68185721502412e-06, + "loss": 0.8078, + "step": 4237 + }, + { + "epoch": 0.2332544443832902, + "grad_norm": 0.9670323133468628, + "learning_rate": 9.68170504600416e-06, + "loss": 0.8912, + "step": 4238 + }, + { + "epoch": 0.23330948318564587, + "grad_norm": 0.7950771450996399, + "learning_rate": 9.68155284179776e-06, + "loss": 0.8165, + "step": 4239 + }, + { + "epoch": 0.23336452198800153, + "grad_norm": 0.7606233358383179, + "learning_rate": 9.68140060240606e-06, + "loss": 0.7795, + "step": 4240 + }, + { + "epoch": 0.2334195607903572, + "grad_norm": 0.9580656886100769, + "learning_rate": 9.681248327830205e-06, + "loss": 0.7949, + "step": 4241 + }, + { + "epoch": 0.23347459959271286, + "grad_norm": 0.6878347992897034, + "learning_rate": 9.681096018071341e-06, + "loss": 0.7776, + "step": 4242 + }, + { + "epoch": 0.23352963839506852, + "grad_norm": 0.8449816107749939, + "learning_rate": 9.680943673130614e-06, + "loss": 0.8456, + "step": 4243 + }, + { + "epoch": 0.23358467719742418, + "grad_norm": 0.77314692735672, + "learning_rate": 9.680791293009167e-06, + "loss": 0.7915, + "step": 4244 + }, + { + "epoch": 0.23363971599977984, + "grad_norm": 0.8034142255783081, + "learning_rate": 9.680638877708146e-06, + "loss": 0.7377, + "step": 4245 + }, + { + "epoch": 0.2336947548021355, + "grad_norm": 0.8754952549934387, + "learning_rate": 9.680486427228695e-06, + "loss": 0.8072, + "step": 4246 + }, + { + "epoch": 0.23374979360449116, + "grad_norm": 0.8169820308685303, + "learning_rate": 9.680333941571963e-06, + "loss": 0.8253, + "step": 4247 + }, + { + "epoch": 0.23380483240684682, + "grad_norm": 0.7848341464996338, + "learning_rate": 9.680181420739092e-06, + "loss": 0.8243, + "step": 4248 + }, + { + "epoch": 0.23385987120920249, + "grad_norm": 0.7599799036979675, + "learning_rate": 9.68002886473123e-06, + "loss": 0.781, + "step": 4249 + }, + { + "epoch": 0.23391491001155815, + "grad_norm": 0.8920254707336426, + "learning_rate": 9.679876273549524e-06, + "loss": 0.8199, + "step": 4250 + }, + { + "epoch": 0.2339699488139138, + "grad_norm": 0.7813586592674255, + "learning_rate": 9.679723647195121e-06, + "loss": 0.7758, + "step": 4251 + }, + { + "epoch": 0.23402498761626947, + "grad_norm": 0.735282838344574, + "learning_rate": 9.679570985669168e-06, + "loss": 0.7651, + "step": 4252 + }, + { + "epoch": 0.23408002641862513, + "grad_norm": 0.7305853962898254, + "learning_rate": 9.679418288972813e-06, + "loss": 0.8202, + "step": 4253 + }, + { + "epoch": 0.2341350652209808, + "grad_norm": 0.8331005573272705, + "learning_rate": 9.6792655571072e-06, + "loss": 0.8784, + "step": 4254 + }, + { + "epoch": 0.23419010402333645, + "grad_norm": 0.8526305556297302, + "learning_rate": 9.679112790073481e-06, + "loss": 0.8116, + "step": 4255 + }, + { + "epoch": 0.23424514282569212, + "grad_norm": 0.741073489189148, + "learning_rate": 9.678959987872805e-06, + "loss": 0.6928, + "step": 4256 + }, + { + "epoch": 0.23430018162804778, + "grad_norm": 0.727859616279602, + "learning_rate": 9.678807150506315e-06, + "loss": 0.7571, + "step": 4257 + }, + { + "epoch": 0.23435522043040344, + "grad_norm": 0.8890698552131653, + "learning_rate": 9.678654277975165e-06, + "loss": 0.8145, + "step": 4258 + }, + { + "epoch": 0.2344102592327591, + "grad_norm": 0.7372937798500061, + "learning_rate": 9.6785013702805e-06, + "loss": 0.7104, + "step": 4259 + }, + { + "epoch": 0.23446529803511476, + "grad_norm": 0.7205008268356323, + "learning_rate": 9.678348427423472e-06, + "loss": 0.7498, + "step": 4260 + }, + { + "epoch": 0.23452033683747042, + "grad_norm": 0.7766392230987549, + "learning_rate": 9.67819544940523e-06, + "loss": 0.7814, + "step": 4261 + }, + { + "epoch": 0.23457537563982608, + "grad_norm": 0.7441498637199402, + "learning_rate": 9.678042436226922e-06, + "loss": 0.7429, + "step": 4262 + }, + { + "epoch": 0.23463041444218175, + "grad_norm": 0.8838522434234619, + "learning_rate": 9.677889387889701e-06, + "loss": 0.8719, + "step": 4263 + }, + { + "epoch": 0.2346854532445374, + "grad_norm": 1.2349655628204346, + "learning_rate": 9.677736304394716e-06, + "loss": 0.8491, + "step": 4264 + }, + { + "epoch": 0.23474049204689307, + "grad_norm": 0.8050087690353394, + "learning_rate": 9.677583185743116e-06, + "loss": 0.795, + "step": 4265 + }, + { + "epoch": 0.23479553084924873, + "grad_norm": 0.7885709404945374, + "learning_rate": 9.677430031936051e-06, + "loss": 0.8594, + "step": 4266 + }, + { + "epoch": 0.2348505696516044, + "grad_norm": 0.7753557562828064, + "learning_rate": 9.677276842974676e-06, + "loss": 0.8196, + "step": 4267 + }, + { + "epoch": 0.23490560845396005, + "grad_norm": 0.7325392961502075, + "learning_rate": 9.67712361886014e-06, + "loss": 0.7905, + "step": 4268 + }, + { + "epoch": 0.2349606472563157, + "grad_norm": 0.7925617694854736, + "learning_rate": 9.676970359593594e-06, + "loss": 0.7416, + "step": 4269 + }, + { + "epoch": 0.23501568605867137, + "grad_norm": 0.7981371283531189, + "learning_rate": 9.676817065176192e-06, + "loss": 0.81, + "step": 4270 + }, + { + "epoch": 0.23507072486102704, + "grad_norm": 0.7490524053573608, + "learning_rate": 9.676663735609084e-06, + "loss": 0.8347, + "step": 4271 + }, + { + "epoch": 0.2351257636633827, + "grad_norm": 1.000349521636963, + "learning_rate": 9.676510370893424e-06, + "loss": 0.7469, + "step": 4272 + }, + { + "epoch": 0.23518080246573836, + "grad_norm": 0.9310774207115173, + "learning_rate": 9.676356971030364e-06, + "loss": 0.8088, + "step": 4273 + }, + { + "epoch": 0.235235841268094, + "grad_norm": 0.8868544101715088, + "learning_rate": 9.676203536021055e-06, + "loss": 0.7472, + "step": 4274 + }, + { + "epoch": 0.23529088007044965, + "grad_norm": 0.7702255845069885, + "learning_rate": 9.676050065866653e-06, + "loss": 0.8395, + "step": 4275 + }, + { + "epoch": 0.23534591887280532, + "grad_norm": 0.7138833999633789, + "learning_rate": 9.675896560568311e-06, + "loss": 0.8529, + "step": 4276 + }, + { + "epoch": 0.23540095767516098, + "grad_norm": 0.8399729132652283, + "learning_rate": 9.675743020127182e-06, + "loss": 0.7844, + "step": 4277 + }, + { + "epoch": 0.23545599647751664, + "grad_norm": 0.8500726819038391, + "learning_rate": 9.67558944454442e-06, + "loss": 0.8209, + "step": 4278 + }, + { + "epoch": 0.2355110352798723, + "grad_norm": 0.766638994216919, + "learning_rate": 9.675435833821178e-06, + "loss": 0.7834, + "step": 4279 + }, + { + "epoch": 0.23556607408222796, + "grad_norm": 0.9121370315551758, + "learning_rate": 9.675282187958613e-06, + "loss": 0.8697, + "step": 4280 + }, + { + "epoch": 0.23562111288458362, + "grad_norm": 0.7862319946289062, + "learning_rate": 9.675128506957879e-06, + "loss": 0.8262, + "step": 4281 + }, + { + "epoch": 0.23567615168693928, + "grad_norm": 1.072777509689331, + "learning_rate": 9.67497479082013e-06, + "loss": 0.7963, + "step": 4282 + }, + { + "epoch": 0.23573119048929495, + "grad_norm": 0.7574695944786072, + "learning_rate": 9.67482103954652e-06, + "loss": 0.8178, + "step": 4283 + }, + { + "epoch": 0.2357862292916506, + "grad_norm": 0.7996877431869507, + "learning_rate": 9.674667253138209e-06, + "loss": 0.8465, + "step": 4284 + }, + { + "epoch": 0.23584126809400627, + "grad_norm": 0.711513340473175, + "learning_rate": 9.674513431596349e-06, + "loss": 0.7445, + "step": 4285 + }, + { + "epoch": 0.23589630689636193, + "grad_norm": 0.7431296706199646, + "learning_rate": 9.674359574922098e-06, + "loss": 0.8102, + "step": 4286 + }, + { + "epoch": 0.2359513456987176, + "grad_norm": 0.7745676040649414, + "learning_rate": 9.674205683116612e-06, + "loss": 0.8733, + "step": 4287 + }, + { + "epoch": 0.23600638450107325, + "grad_norm": 1.0117937326431274, + "learning_rate": 9.674051756181046e-06, + "loss": 0.9035, + "step": 4288 + }, + { + "epoch": 0.2360614233034289, + "grad_norm": 0.7848078608512878, + "learning_rate": 9.67389779411656e-06, + "loss": 0.8486, + "step": 4289 + }, + { + "epoch": 0.23611646210578457, + "grad_norm": 0.8439378142356873, + "learning_rate": 9.673743796924307e-06, + "loss": 0.8032, + "step": 4290 + }, + { + "epoch": 0.23617150090814024, + "grad_norm": 0.8268104791641235, + "learning_rate": 9.673589764605449e-06, + "loss": 0.8182, + "step": 4291 + }, + { + "epoch": 0.2362265397104959, + "grad_norm": 0.8896234631538391, + "learning_rate": 9.67343569716114e-06, + "loss": 0.8081, + "step": 4292 + }, + { + "epoch": 0.23628157851285156, + "grad_norm": 0.8515019416809082, + "learning_rate": 9.67328159459254e-06, + "loss": 0.8239, + "step": 4293 + }, + { + "epoch": 0.23633661731520722, + "grad_norm": 0.7779792547225952, + "learning_rate": 9.673127456900806e-06, + "loss": 0.8437, + "step": 4294 + }, + { + "epoch": 0.23639165611756288, + "grad_norm": 0.7782402634620667, + "learning_rate": 9.672973284087097e-06, + "loss": 0.8498, + "step": 4295 + }, + { + "epoch": 0.23644669491991854, + "grad_norm": 0.7588973641395569, + "learning_rate": 9.67281907615257e-06, + "loss": 0.7034, + "step": 4296 + }, + { + "epoch": 0.2365017337222742, + "grad_norm": 0.8426640629768372, + "learning_rate": 9.67266483309839e-06, + "loss": 0.803, + "step": 4297 + }, + { + "epoch": 0.23655677252462987, + "grad_norm": 0.8945889472961426, + "learning_rate": 9.672510554925707e-06, + "loss": 0.8971, + "step": 4298 + }, + { + "epoch": 0.23661181132698553, + "grad_norm": 0.8604227304458618, + "learning_rate": 9.672356241635688e-06, + "loss": 0.7548, + "step": 4299 + }, + { + "epoch": 0.2366668501293412, + "grad_norm": 0.7277490496635437, + "learning_rate": 9.672201893229489e-06, + "loss": 0.8083, + "step": 4300 + }, + { + "epoch": 0.23672188893169685, + "grad_norm": 0.9089379906654358, + "learning_rate": 9.672047509708273e-06, + "loss": 0.9717, + "step": 4301 + }, + { + "epoch": 0.2367769277340525, + "grad_norm": 0.7207155823707581, + "learning_rate": 9.671893091073198e-06, + "loss": 0.6794, + "step": 4302 + }, + { + "epoch": 0.23683196653640817, + "grad_norm": 0.7319806814193726, + "learning_rate": 9.671738637325425e-06, + "loss": 0.6821, + "step": 4303 + }, + { + "epoch": 0.23688700533876383, + "grad_norm": 0.7339589595794678, + "learning_rate": 9.671584148466112e-06, + "loss": 0.7895, + "step": 4304 + }, + { + "epoch": 0.2369420441411195, + "grad_norm": 0.7725476622581482, + "learning_rate": 9.671429624496428e-06, + "loss": 0.7414, + "step": 4305 + }, + { + "epoch": 0.23699708294347516, + "grad_norm": 0.7040137648582458, + "learning_rate": 9.671275065417527e-06, + "loss": 0.696, + "step": 4306 + }, + { + "epoch": 0.23705212174583082, + "grad_norm": 0.8804189562797546, + "learning_rate": 9.671120471230572e-06, + "loss": 0.8184, + "step": 4307 + }, + { + "epoch": 0.23710716054818648, + "grad_norm": 0.8062872886657715, + "learning_rate": 9.670965841936728e-06, + "loss": 0.7856, + "step": 4308 + }, + { + "epoch": 0.23716219935054214, + "grad_norm": 0.7537097930908203, + "learning_rate": 9.670811177537154e-06, + "loss": 0.7562, + "step": 4309 + }, + { + "epoch": 0.2372172381528978, + "grad_norm": 0.8168618083000183, + "learning_rate": 9.670656478033013e-06, + "loss": 0.7416, + "step": 4310 + }, + { + "epoch": 0.23727227695525346, + "grad_norm": 0.8367040157318115, + "learning_rate": 9.670501743425469e-06, + "loss": 0.7759, + "step": 4311 + }, + { + "epoch": 0.23732731575760913, + "grad_norm": 0.860418975353241, + "learning_rate": 9.670346973715683e-06, + "loss": 0.9013, + "step": 4312 + }, + { + "epoch": 0.2373823545599648, + "grad_norm": 0.8736678957939148, + "learning_rate": 9.67019216890482e-06, + "loss": 0.8677, + "step": 4313 + }, + { + "epoch": 0.23743739336232045, + "grad_norm": 0.8258964419364929, + "learning_rate": 9.670037328994044e-06, + "loss": 0.8208, + "step": 4314 + }, + { + "epoch": 0.2374924321646761, + "grad_norm": 0.7936292886734009, + "learning_rate": 9.669882453984516e-06, + "loss": 0.8643, + "step": 4315 + }, + { + "epoch": 0.23754747096703177, + "grad_norm": 0.805500864982605, + "learning_rate": 9.669727543877401e-06, + "loss": 0.779, + "step": 4316 + }, + { + "epoch": 0.2376025097693874, + "grad_norm": 0.8072311282157898, + "learning_rate": 9.669572598673866e-06, + "loss": 0.8258, + "step": 4317 + }, + { + "epoch": 0.23765754857174307, + "grad_norm": 0.8917607665061951, + "learning_rate": 9.669417618375072e-06, + "loss": 0.7528, + "step": 4318 + }, + { + "epoch": 0.23771258737409873, + "grad_norm": 0.7054246068000793, + "learning_rate": 9.669262602982186e-06, + "loss": 0.86, + "step": 4319 + }, + { + "epoch": 0.2377676261764544, + "grad_norm": 0.8600299954414368, + "learning_rate": 9.66910755249637e-06, + "loss": 0.8165, + "step": 4320 + }, + { + "epoch": 0.23782266497881005, + "grad_norm": 0.8685561418533325, + "learning_rate": 9.668952466918793e-06, + "loss": 0.8129, + "step": 4321 + }, + { + "epoch": 0.2378777037811657, + "grad_norm": 0.7859770655632019, + "learning_rate": 9.668797346250618e-06, + "loss": 0.8703, + "step": 4322 + }, + { + "epoch": 0.23793274258352137, + "grad_norm": 0.8128730058670044, + "learning_rate": 9.668642190493015e-06, + "loss": 0.7595, + "step": 4323 + }, + { + "epoch": 0.23798778138587703, + "grad_norm": 0.8223204612731934, + "learning_rate": 9.668486999647143e-06, + "loss": 0.825, + "step": 4324 + }, + { + "epoch": 0.2380428201882327, + "grad_norm": 0.859619677066803, + "learning_rate": 9.668331773714175e-06, + "loss": 0.8239, + "step": 4325 + }, + { + "epoch": 0.23809785899058836, + "grad_norm": 0.9861679673194885, + "learning_rate": 9.668176512695273e-06, + "loss": 0.8409, + "step": 4326 + }, + { + "epoch": 0.23815289779294402, + "grad_norm": 0.7178627252578735, + "learning_rate": 9.668021216591607e-06, + "loss": 0.818, + "step": 4327 + }, + { + "epoch": 0.23820793659529968, + "grad_norm": 0.9160923957824707, + "learning_rate": 9.667865885404343e-06, + "loss": 0.8703, + "step": 4328 + }, + { + "epoch": 0.23826297539765534, + "grad_norm": 0.7043942213058472, + "learning_rate": 9.667710519134648e-06, + "loss": 0.6884, + "step": 4329 + }, + { + "epoch": 0.238318014200011, + "grad_norm": 1.213121771812439, + "learning_rate": 9.667555117783691e-06, + "loss": 0.7843, + "step": 4330 + }, + { + "epoch": 0.23837305300236666, + "grad_norm": 0.8008033037185669, + "learning_rate": 9.66739968135264e-06, + "loss": 0.9312, + "step": 4331 + }, + { + "epoch": 0.23842809180472233, + "grad_norm": 0.7862009406089783, + "learning_rate": 9.667244209842662e-06, + "loss": 0.6965, + "step": 4332 + }, + { + "epoch": 0.238483130607078, + "grad_norm": 1.081398844718933, + "learning_rate": 9.667088703254923e-06, + "loss": 0.9793, + "step": 4333 + }, + { + "epoch": 0.23853816940943365, + "grad_norm": 0.7672395706176758, + "learning_rate": 9.666933161590597e-06, + "loss": 0.813, + "step": 4334 + }, + { + "epoch": 0.2385932082117893, + "grad_norm": 0.6955493092536926, + "learning_rate": 9.66677758485085e-06, + "loss": 0.7778, + "step": 4335 + }, + { + "epoch": 0.23864824701414497, + "grad_norm": 0.8609682321548462, + "learning_rate": 9.666621973036854e-06, + "loss": 0.7817, + "step": 4336 + }, + { + "epoch": 0.23870328581650063, + "grad_norm": 0.7312196493148804, + "learning_rate": 9.666466326149774e-06, + "loss": 0.7368, + "step": 4337 + }, + { + "epoch": 0.2387583246188563, + "grad_norm": 0.7964538931846619, + "learning_rate": 9.666310644190782e-06, + "loss": 0.8124, + "step": 4338 + }, + { + "epoch": 0.23881336342121195, + "grad_norm": 1.1138910055160522, + "learning_rate": 9.66615492716105e-06, + "loss": 0.8886, + "step": 4339 + }, + { + "epoch": 0.23886840222356762, + "grad_norm": 0.8789949417114258, + "learning_rate": 9.665999175061747e-06, + "loss": 0.7854, + "step": 4340 + }, + { + "epoch": 0.23892344102592328, + "grad_norm": 0.7761380076408386, + "learning_rate": 9.665843387894041e-06, + "loss": 0.7915, + "step": 4341 + }, + { + "epoch": 0.23897847982827894, + "grad_norm": 0.888482928276062, + "learning_rate": 9.665687565659106e-06, + "loss": 0.8799, + "step": 4342 + }, + { + "epoch": 0.2390335186306346, + "grad_norm": 0.7799200415611267, + "learning_rate": 9.665531708358111e-06, + "loss": 0.8519, + "step": 4343 + }, + { + "epoch": 0.23908855743299026, + "grad_norm": 0.7407697439193726, + "learning_rate": 9.665375815992231e-06, + "loss": 0.7637, + "step": 4344 + }, + { + "epoch": 0.23914359623534592, + "grad_norm": 0.8098278045654297, + "learning_rate": 9.665219888562634e-06, + "loss": 0.7991, + "step": 4345 + }, + { + "epoch": 0.23919863503770158, + "grad_norm": 0.7585136294364929, + "learning_rate": 9.665063926070493e-06, + "loss": 0.8478, + "step": 4346 + }, + { + "epoch": 0.23925367384005725, + "grad_norm": 0.7294817566871643, + "learning_rate": 9.66490792851698e-06, + "loss": 0.8312, + "step": 4347 + }, + { + "epoch": 0.2393087126424129, + "grad_norm": 0.8325762748718262, + "learning_rate": 9.664751895903269e-06, + "loss": 0.9365, + "step": 4348 + }, + { + "epoch": 0.23936375144476857, + "grad_norm": 0.9992470741271973, + "learning_rate": 9.66459582823053e-06, + "loss": 0.8649, + "step": 4349 + }, + { + "epoch": 0.23941879024712423, + "grad_norm": 0.7206875681877136, + "learning_rate": 9.664439725499938e-06, + "loss": 0.7013, + "step": 4350 + }, + { + "epoch": 0.2394738290494799, + "grad_norm": 0.946657657623291, + "learning_rate": 9.664283587712665e-06, + "loss": 0.7953, + "step": 4351 + }, + { + "epoch": 0.23952886785183555, + "grad_norm": 0.7684911489486694, + "learning_rate": 9.664127414869887e-06, + "loss": 0.8403, + "step": 4352 + }, + { + "epoch": 0.23958390665419121, + "grad_norm": 0.7875770926475525, + "learning_rate": 9.663971206972773e-06, + "loss": 0.7961, + "step": 4353 + }, + { + "epoch": 0.23963894545654688, + "grad_norm": 0.7387273907661438, + "learning_rate": 9.663814964022502e-06, + "loss": 0.8265, + "step": 4354 + }, + { + "epoch": 0.23969398425890254, + "grad_norm": 0.7413492202758789, + "learning_rate": 9.663658686020245e-06, + "loss": 0.8458, + "step": 4355 + }, + { + "epoch": 0.2397490230612582, + "grad_norm": 0.7563235759735107, + "learning_rate": 9.663502372967177e-06, + "loss": 0.8498, + "step": 4356 + }, + { + "epoch": 0.23980406186361386, + "grad_norm": 0.7529472708702087, + "learning_rate": 9.663346024864475e-06, + "loss": 0.7597, + "step": 4357 + }, + { + "epoch": 0.23985910066596952, + "grad_norm": 0.7582191824913025, + "learning_rate": 9.663189641713314e-06, + "loss": 0.804, + "step": 4358 + }, + { + "epoch": 0.23991413946832518, + "grad_norm": 0.8394485712051392, + "learning_rate": 9.663033223514865e-06, + "loss": 0.8329, + "step": 4359 + }, + { + "epoch": 0.23996917827068082, + "grad_norm": 0.7088292241096497, + "learning_rate": 9.662876770270308e-06, + "loss": 0.7131, + "step": 4360 + }, + { + "epoch": 0.24002421707303648, + "grad_norm": 0.8548080325126648, + "learning_rate": 9.662720281980817e-06, + "loss": 0.8925, + "step": 4361 + }, + { + "epoch": 0.24007925587539214, + "grad_norm": 0.8027567267417908, + "learning_rate": 9.662563758647568e-06, + "loss": 0.8652, + "step": 4362 + }, + { + "epoch": 0.2401342946777478, + "grad_norm": 0.7471736669540405, + "learning_rate": 9.662407200271738e-06, + "loss": 0.7722, + "step": 4363 + }, + { + "epoch": 0.24018933348010346, + "grad_norm": 0.7358804941177368, + "learning_rate": 9.662250606854504e-06, + "loss": 0.767, + "step": 4364 + }, + { + "epoch": 0.24024437228245912, + "grad_norm": 0.7948476672172546, + "learning_rate": 9.662093978397042e-06, + "loss": 0.961, + "step": 4365 + }, + { + "epoch": 0.24029941108481478, + "grad_norm": 0.7030961513519287, + "learning_rate": 9.66193731490053e-06, + "loss": 0.7826, + "step": 4366 + }, + { + "epoch": 0.24035444988717045, + "grad_norm": 0.8376098871231079, + "learning_rate": 9.661780616366145e-06, + "loss": 0.7697, + "step": 4367 + }, + { + "epoch": 0.2404094886895261, + "grad_norm": 0.7449594140052795, + "learning_rate": 9.661623882795065e-06, + "loss": 0.7944, + "step": 4368 + }, + { + "epoch": 0.24046452749188177, + "grad_norm": 0.7317184805870056, + "learning_rate": 9.661467114188468e-06, + "loss": 0.7059, + "step": 4369 + }, + { + "epoch": 0.24051956629423743, + "grad_norm": 0.843912661075592, + "learning_rate": 9.661310310547531e-06, + "loss": 0.7889, + "step": 4370 + }, + { + "epoch": 0.2405746050965931, + "grad_norm": 0.8673211336135864, + "learning_rate": 9.661153471873435e-06, + "loss": 0.7234, + "step": 4371 + }, + { + "epoch": 0.24062964389894875, + "grad_norm": 0.8179688453674316, + "learning_rate": 9.660996598167354e-06, + "loss": 0.8937, + "step": 4372 + }, + { + "epoch": 0.24068468270130441, + "grad_norm": 0.7800211906433105, + "learning_rate": 9.660839689430473e-06, + "loss": 0.8596, + "step": 4373 + }, + { + "epoch": 0.24073972150366008, + "grad_norm": 0.8781671524047852, + "learning_rate": 9.660682745663967e-06, + "loss": 0.8507, + "step": 4374 + }, + { + "epoch": 0.24079476030601574, + "grad_norm": 0.7701708674430847, + "learning_rate": 9.660525766869019e-06, + "loss": 0.8212, + "step": 4375 + }, + { + "epoch": 0.2408497991083714, + "grad_norm": 0.7721084356307983, + "learning_rate": 9.660368753046806e-06, + "loss": 0.7493, + "step": 4376 + }, + { + "epoch": 0.24090483791072706, + "grad_norm": 0.8126489520072937, + "learning_rate": 9.660211704198508e-06, + "loss": 0.8527, + "step": 4377 + }, + { + "epoch": 0.24095987671308272, + "grad_norm": 0.8172717690467834, + "learning_rate": 9.660054620325307e-06, + "loss": 0.8448, + "step": 4378 + }, + { + "epoch": 0.24101491551543838, + "grad_norm": 0.8293611407279968, + "learning_rate": 9.659897501428384e-06, + "loss": 0.9318, + "step": 4379 + }, + { + "epoch": 0.24106995431779404, + "grad_norm": 0.7445098161697388, + "learning_rate": 9.659740347508917e-06, + "loss": 0.7358, + "step": 4380 + }, + { + "epoch": 0.2411249931201497, + "grad_norm": 0.7778907418251038, + "learning_rate": 9.659583158568088e-06, + "loss": 0.7671, + "step": 4381 + }, + { + "epoch": 0.24118003192250537, + "grad_norm": 0.7828608751296997, + "learning_rate": 9.659425934607082e-06, + "loss": 0.8141, + "step": 4382 + }, + { + "epoch": 0.24123507072486103, + "grad_norm": 0.9433113932609558, + "learning_rate": 9.659268675627075e-06, + "loss": 0.7904, + "step": 4383 + }, + { + "epoch": 0.2412901095272167, + "grad_norm": 0.7097491025924683, + "learning_rate": 9.659111381629255e-06, + "loss": 0.7445, + "step": 4384 + }, + { + "epoch": 0.24134514832957235, + "grad_norm": 0.7450230717658997, + "learning_rate": 9.6589540526148e-06, + "loss": 0.6869, + "step": 4385 + }, + { + "epoch": 0.241400187131928, + "grad_norm": 0.7429760694503784, + "learning_rate": 9.658796688584893e-06, + "loss": 0.7367, + "step": 4386 + }, + { + "epoch": 0.24145522593428367, + "grad_norm": 0.7250030040740967, + "learning_rate": 9.658639289540716e-06, + "loss": 0.7502, + "step": 4387 + }, + { + "epoch": 0.24151026473663934, + "grad_norm": 0.6577159762382507, + "learning_rate": 9.658481855483455e-06, + "loss": 0.5785, + "step": 4388 + }, + { + "epoch": 0.241565303538995, + "grad_norm": 0.7846524119377136, + "learning_rate": 9.65832438641429e-06, + "loss": 0.7435, + "step": 4389 + }, + { + "epoch": 0.24162034234135066, + "grad_norm": 0.8370404839515686, + "learning_rate": 9.658166882334408e-06, + "loss": 0.8536, + "step": 4390 + }, + { + "epoch": 0.24167538114370632, + "grad_norm": 0.7451018691062927, + "learning_rate": 9.658009343244987e-06, + "loss": 0.8443, + "step": 4391 + }, + { + "epoch": 0.24173041994606198, + "grad_norm": 0.7629074454307556, + "learning_rate": 9.657851769147218e-06, + "loss": 0.7394, + "step": 4392 + }, + { + "epoch": 0.24178545874841764, + "grad_norm": 0.7767705321311951, + "learning_rate": 9.657694160042282e-06, + "loss": 0.8497, + "step": 4393 + }, + { + "epoch": 0.2418404975507733, + "grad_norm": 0.8635357022285461, + "learning_rate": 9.65753651593136e-06, + "loss": 0.8495, + "step": 4394 + }, + { + "epoch": 0.24189553635312896, + "grad_norm": 0.7652365565299988, + "learning_rate": 9.657378836815643e-06, + "loss": 0.7967, + "step": 4395 + }, + { + "epoch": 0.24195057515548463, + "grad_norm": 0.7721680402755737, + "learning_rate": 9.657221122696313e-06, + "loss": 0.8227, + "step": 4396 + }, + { + "epoch": 0.2420056139578403, + "grad_norm": 1.016366720199585, + "learning_rate": 9.657063373574555e-06, + "loss": 0.8291, + "step": 4397 + }, + { + "epoch": 0.24206065276019595, + "grad_norm": 0.7770145535469055, + "learning_rate": 9.656905589451555e-06, + "loss": 0.8335, + "step": 4398 + }, + { + "epoch": 0.2421156915625516, + "grad_norm": 0.812882125377655, + "learning_rate": 9.6567477703285e-06, + "loss": 0.8189, + "step": 4399 + }, + { + "epoch": 0.24217073036490727, + "grad_norm": 0.7253247499465942, + "learning_rate": 9.656589916206576e-06, + "loss": 0.8418, + "step": 4400 + }, + { + "epoch": 0.24222576916726293, + "grad_norm": 0.7784958481788635, + "learning_rate": 9.656432027086969e-06, + "loss": 0.8541, + "step": 4401 + }, + { + "epoch": 0.2422808079696186, + "grad_norm": 0.8001978397369385, + "learning_rate": 9.656274102970865e-06, + "loss": 0.8888, + "step": 4402 + }, + { + "epoch": 0.24233584677197423, + "grad_norm": 0.7535765767097473, + "learning_rate": 9.656116143859448e-06, + "loss": 0.7691, + "step": 4403 + }, + { + "epoch": 0.2423908855743299, + "grad_norm": 0.6554346680641174, + "learning_rate": 9.655958149753913e-06, + "loss": 0.7592, + "step": 4404 + }, + { + "epoch": 0.24244592437668555, + "grad_norm": 0.8599995374679565, + "learning_rate": 9.655800120655439e-06, + "loss": 0.8396, + "step": 4405 + }, + { + "epoch": 0.2425009631790412, + "grad_norm": 0.8172232508659363, + "learning_rate": 9.65564205656522e-06, + "loss": 0.6931, + "step": 4406 + }, + { + "epoch": 0.24255600198139687, + "grad_norm": 0.8005852699279785, + "learning_rate": 9.65548395748444e-06, + "loss": 0.8344, + "step": 4407 + }, + { + "epoch": 0.24261104078375254, + "grad_norm": 0.7823762893676758, + "learning_rate": 9.65532582341429e-06, + "loss": 0.7991, + "step": 4408 + }, + { + "epoch": 0.2426660795861082, + "grad_norm": 0.7743250727653503, + "learning_rate": 9.655167654355957e-06, + "loss": 0.9048, + "step": 4409 + }, + { + "epoch": 0.24272111838846386, + "grad_norm": 0.9825221300125122, + "learning_rate": 9.655009450310629e-06, + "loss": 0.7491, + "step": 4410 + }, + { + "epoch": 0.24277615719081952, + "grad_norm": 1.2921068668365479, + "learning_rate": 9.654851211279496e-06, + "loss": 0.8175, + "step": 4411 + }, + { + "epoch": 0.24283119599317518, + "grad_norm": 0.8267684578895569, + "learning_rate": 9.65469293726375e-06, + "loss": 0.8896, + "step": 4412 + }, + { + "epoch": 0.24288623479553084, + "grad_norm": 0.8020186424255371, + "learning_rate": 9.654534628264576e-06, + "loss": 0.7145, + "step": 4413 + }, + { + "epoch": 0.2429412735978865, + "grad_norm": 0.8192574977874756, + "learning_rate": 9.654376284283166e-06, + "loss": 0.7451, + "step": 4414 + }, + { + "epoch": 0.24299631240024216, + "grad_norm": 0.7733662128448486, + "learning_rate": 9.65421790532071e-06, + "loss": 0.768, + "step": 4415 + }, + { + "epoch": 0.24305135120259783, + "grad_norm": 0.8342406153678894, + "learning_rate": 9.654059491378396e-06, + "loss": 0.8137, + "step": 4416 + }, + { + "epoch": 0.2431063900049535, + "grad_norm": 1.014755368232727, + "learning_rate": 9.653901042457418e-06, + "loss": 0.8922, + "step": 4417 + }, + { + "epoch": 0.24316142880730915, + "grad_norm": 0.864608645439148, + "learning_rate": 9.653742558558967e-06, + "loss": 0.9412, + "step": 4418 + }, + { + "epoch": 0.2432164676096648, + "grad_norm": 0.7383908033370972, + "learning_rate": 9.65358403968423e-06, + "loss": 0.8261, + "step": 4419 + }, + { + "epoch": 0.24327150641202047, + "grad_norm": 0.7464672923088074, + "learning_rate": 9.653425485834403e-06, + "loss": 0.7074, + "step": 4420 + }, + { + "epoch": 0.24332654521437613, + "grad_norm": 0.7010141611099243, + "learning_rate": 9.653266897010676e-06, + "loss": 0.6849, + "step": 4421 + }, + { + "epoch": 0.2433815840167318, + "grad_norm": 0.7135268449783325, + "learning_rate": 9.653108273214239e-06, + "loss": 0.8228, + "step": 4422 + }, + { + "epoch": 0.24343662281908746, + "grad_norm": 0.8061006665229797, + "learning_rate": 9.652949614446287e-06, + "loss": 0.8345, + "step": 4423 + }, + { + "epoch": 0.24349166162144312, + "grad_norm": 0.6954759955406189, + "learning_rate": 9.652790920708011e-06, + "loss": 0.7189, + "step": 4424 + }, + { + "epoch": 0.24354670042379878, + "grad_norm": 0.8669333457946777, + "learning_rate": 9.652632192000603e-06, + "loss": 0.8872, + "step": 4425 + }, + { + "epoch": 0.24360173922615444, + "grad_norm": 0.7445051670074463, + "learning_rate": 9.652473428325258e-06, + "loss": 0.826, + "step": 4426 + }, + { + "epoch": 0.2436567780285101, + "grad_norm": 0.7444632649421692, + "learning_rate": 9.652314629683165e-06, + "loss": 0.8568, + "step": 4427 + }, + { + "epoch": 0.24371181683086576, + "grad_norm": 0.7160165309906006, + "learning_rate": 9.652155796075524e-06, + "loss": 0.799, + "step": 4428 + }, + { + "epoch": 0.24376685563322142, + "grad_norm": 0.7098904252052307, + "learning_rate": 9.651996927503526e-06, + "loss": 0.8148, + "step": 4429 + }, + { + "epoch": 0.24382189443557709, + "grad_norm": 0.7911115288734436, + "learning_rate": 9.651838023968363e-06, + "loss": 0.8279, + "step": 4430 + }, + { + "epoch": 0.24387693323793275, + "grad_norm": 0.8887501955032349, + "learning_rate": 9.651679085471229e-06, + "loss": 0.8464, + "step": 4431 + }, + { + "epoch": 0.2439319720402884, + "grad_norm": 0.8343196511268616, + "learning_rate": 9.651520112013321e-06, + "loss": 0.7364, + "step": 4432 + }, + { + "epoch": 0.24398701084264407, + "grad_norm": 0.7279361486434937, + "learning_rate": 9.651361103595835e-06, + "loss": 0.7958, + "step": 4433 + }, + { + "epoch": 0.24404204964499973, + "grad_norm": 0.8221089243888855, + "learning_rate": 9.651202060219962e-06, + "loss": 0.7753, + "step": 4434 + }, + { + "epoch": 0.2440970884473554, + "grad_norm": 0.7205086350440979, + "learning_rate": 9.6510429818869e-06, + "loss": 0.7411, + "step": 4435 + }, + { + "epoch": 0.24415212724971105, + "grad_norm": 0.854967474937439, + "learning_rate": 9.650883868597845e-06, + "loss": 0.8192, + "step": 4436 + }, + { + "epoch": 0.24420716605206672, + "grad_norm": 0.7622473835945129, + "learning_rate": 9.65072472035399e-06, + "loss": 0.7645, + "step": 4437 + }, + { + "epoch": 0.24426220485442238, + "grad_norm": 0.7430302500724792, + "learning_rate": 9.650565537156533e-06, + "loss": 0.7817, + "step": 4438 + }, + { + "epoch": 0.24431724365677804, + "grad_norm": 0.8022677898406982, + "learning_rate": 9.650406319006672e-06, + "loss": 0.8035, + "step": 4439 + }, + { + "epoch": 0.2443722824591337, + "grad_norm": 0.7346476912498474, + "learning_rate": 9.6502470659056e-06, + "loss": 0.826, + "step": 4440 + }, + { + "epoch": 0.24442732126148936, + "grad_norm": 0.8393376469612122, + "learning_rate": 9.650087777854517e-06, + "loss": 0.8073, + "step": 4441 + }, + { + "epoch": 0.24448236006384502, + "grad_norm": 0.7920215129852295, + "learning_rate": 9.649928454854618e-06, + "loss": 0.7774, + "step": 4442 + }, + { + "epoch": 0.24453739886620068, + "grad_norm": 0.8192804455757141, + "learning_rate": 9.649769096907102e-06, + "loss": 0.7817, + "step": 4443 + }, + { + "epoch": 0.24459243766855635, + "grad_norm": 0.7727654576301575, + "learning_rate": 9.649609704013167e-06, + "loss": 0.8201, + "step": 4444 + }, + { + "epoch": 0.244647476470912, + "grad_norm": 0.8005746603012085, + "learning_rate": 9.649450276174008e-06, + "loss": 0.8893, + "step": 4445 + }, + { + "epoch": 0.24470251527326764, + "grad_norm": 0.9029125571250916, + "learning_rate": 9.649290813390828e-06, + "loss": 0.7735, + "step": 4446 + }, + { + "epoch": 0.2447575540756233, + "grad_norm": 0.8336170315742493, + "learning_rate": 9.64913131566482e-06, + "loss": 0.7505, + "step": 4447 + }, + { + "epoch": 0.24481259287797896, + "grad_norm": 1.0272265672683716, + "learning_rate": 9.648971782997188e-06, + "loss": 0.8371, + "step": 4448 + }, + { + "epoch": 0.24486763168033462, + "grad_norm": 0.8095843195915222, + "learning_rate": 9.648812215389128e-06, + "loss": 0.7599, + "step": 4449 + }, + { + "epoch": 0.24492267048269029, + "grad_norm": 0.7690166234970093, + "learning_rate": 9.648652612841837e-06, + "loss": 0.8172, + "step": 4450 + }, + { + "epoch": 0.24497770928504595, + "grad_norm": 0.8282617926597595, + "learning_rate": 9.64849297535652e-06, + "loss": 0.8477, + "step": 4451 + }, + { + "epoch": 0.2450327480874016, + "grad_norm": 0.8307822346687317, + "learning_rate": 9.648333302934373e-06, + "loss": 0.7744, + "step": 4452 + }, + { + "epoch": 0.24508778688975727, + "grad_norm": 0.7619080543518066, + "learning_rate": 9.6481735955766e-06, + "loss": 0.8417, + "step": 4453 + }, + { + "epoch": 0.24514282569211293, + "grad_norm": 0.7879447937011719, + "learning_rate": 9.648013853284396e-06, + "loss": 0.7799, + "step": 4454 + }, + { + "epoch": 0.2451978644944686, + "grad_norm": 0.7352256774902344, + "learning_rate": 9.647854076058965e-06, + "loss": 0.8386, + "step": 4455 + }, + { + "epoch": 0.24525290329682425, + "grad_norm": 0.8318933248519897, + "learning_rate": 9.647694263901507e-06, + "loss": 0.7631, + "step": 4456 + }, + { + "epoch": 0.24530794209917992, + "grad_norm": 0.8609912395477295, + "learning_rate": 9.647534416813221e-06, + "loss": 0.7479, + "step": 4457 + }, + { + "epoch": 0.24536298090153558, + "grad_norm": 0.9590480327606201, + "learning_rate": 9.647374534795311e-06, + "loss": 0.8543, + "step": 4458 + }, + { + "epoch": 0.24541801970389124, + "grad_norm": 0.7902723550796509, + "learning_rate": 9.647214617848979e-06, + "loss": 0.6796, + "step": 4459 + }, + { + "epoch": 0.2454730585062469, + "grad_norm": 0.7725642919540405, + "learning_rate": 9.647054665975427e-06, + "loss": 0.7563, + "step": 4460 + }, + { + "epoch": 0.24552809730860256, + "grad_norm": 0.8387014269828796, + "learning_rate": 9.646894679175853e-06, + "loss": 0.8184, + "step": 4461 + }, + { + "epoch": 0.24558313611095822, + "grad_norm": 0.9200852513313293, + "learning_rate": 9.646734657451464e-06, + "loss": 0.8436, + "step": 4462 + }, + { + "epoch": 0.24563817491331388, + "grad_norm": 0.7565840482711792, + "learning_rate": 9.646574600803462e-06, + "loss": 0.7393, + "step": 4463 + }, + { + "epoch": 0.24569321371566955, + "grad_norm": 0.7685559988021851, + "learning_rate": 9.646414509233048e-06, + "loss": 0.7836, + "step": 4464 + }, + { + "epoch": 0.2457482525180252, + "grad_norm": 0.8172003030776978, + "learning_rate": 9.646254382741428e-06, + "loss": 0.787, + "step": 4465 + }, + { + "epoch": 0.24580329132038087, + "grad_norm": 0.902632474899292, + "learning_rate": 9.646094221329802e-06, + "loss": 0.7139, + "step": 4466 + }, + { + "epoch": 0.24585833012273653, + "grad_norm": 0.7810692191123962, + "learning_rate": 9.645934024999374e-06, + "loss": 0.6904, + "step": 4467 + }, + { + "epoch": 0.2459133689250922, + "grad_norm": 0.7242134213447571, + "learning_rate": 9.645773793751352e-06, + "loss": 0.7035, + "step": 4468 + }, + { + "epoch": 0.24596840772744785, + "grad_norm": 0.7192920446395874, + "learning_rate": 9.645613527586938e-06, + "loss": 0.7081, + "step": 4469 + }, + { + "epoch": 0.2460234465298035, + "grad_norm": 0.7613840103149414, + "learning_rate": 9.645453226507336e-06, + "loss": 0.8066, + "step": 4470 + }, + { + "epoch": 0.24607848533215917, + "grad_norm": 0.8154922127723694, + "learning_rate": 9.64529289051375e-06, + "loss": 0.812, + "step": 4471 + }, + { + "epoch": 0.24613352413451484, + "grad_norm": 0.9521573185920715, + "learning_rate": 9.645132519607387e-06, + "loss": 0.7456, + "step": 4472 + }, + { + "epoch": 0.2461885629368705, + "grad_norm": 0.785943329334259, + "learning_rate": 9.64497211378945e-06, + "loss": 0.832, + "step": 4473 + }, + { + "epoch": 0.24624360173922616, + "grad_norm": 0.7675127983093262, + "learning_rate": 9.644811673061148e-06, + "loss": 0.7984, + "step": 4474 + }, + { + "epoch": 0.24629864054158182, + "grad_norm": 0.7317580580711365, + "learning_rate": 9.644651197423683e-06, + "loss": 0.7634, + "step": 4475 + }, + { + "epoch": 0.24635367934393748, + "grad_norm": 0.744937539100647, + "learning_rate": 9.644490686878265e-06, + "loss": 0.729, + "step": 4476 + }, + { + "epoch": 0.24640871814629314, + "grad_norm": 0.7472458481788635, + "learning_rate": 9.644330141426097e-06, + "loss": 0.7517, + "step": 4477 + }, + { + "epoch": 0.2464637569486488, + "grad_norm": 0.8379414677619934, + "learning_rate": 9.644169561068387e-06, + "loss": 0.8008, + "step": 4478 + }, + { + "epoch": 0.24651879575100447, + "grad_norm": 0.8845154047012329, + "learning_rate": 9.64400894580634e-06, + "loss": 0.8135, + "step": 4479 + }, + { + "epoch": 0.24657383455336013, + "grad_norm": 0.7394443154335022, + "learning_rate": 9.643848295641167e-06, + "loss": 0.7697, + "step": 4480 + }, + { + "epoch": 0.2466288733557158, + "grad_norm": 0.8840840458869934, + "learning_rate": 9.643687610574073e-06, + "loss": 0.825, + "step": 4481 + }, + { + "epoch": 0.24668391215807145, + "grad_norm": 0.7924874424934387, + "learning_rate": 9.643526890606265e-06, + "loss": 0.793, + "step": 4482 + }, + { + "epoch": 0.2467389509604271, + "grad_norm": 0.7966769933700562, + "learning_rate": 9.643366135738951e-06, + "loss": 0.8042, + "step": 4483 + }, + { + "epoch": 0.24679398976278277, + "grad_norm": 0.911756694316864, + "learning_rate": 9.643205345973343e-06, + "loss": 0.7801, + "step": 4484 + }, + { + "epoch": 0.24684902856513843, + "grad_norm": 0.903378963470459, + "learning_rate": 9.643044521310645e-06, + "loss": 0.7863, + "step": 4485 + }, + { + "epoch": 0.2469040673674941, + "grad_norm": 0.9021226167678833, + "learning_rate": 9.642883661752067e-06, + "loss": 0.8005, + "step": 4486 + }, + { + "epoch": 0.24695910616984976, + "grad_norm": 0.8853413462638855, + "learning_rate": 9.64272276729882e-06, + "loss": 0.8371, + "step": 4487 + }, + { + "epoch": 0.24701414497220542, + "grad_norm": 1.0654630661010742, + "learning_rate": 9.642561837952108e-06, + "loss": 0.92, + "step": 4488 + }, + { + "epoch": 0.24706918377456105, + "grad_norm": 0.8663573265075684, + "learning_rate": 9.642400873713146e-06, + "loss": 0.8066, + "step": 4489 + }, + { + "epoch": 0.2471242225769167, + "grad_norm": 0.7483134269714355, + "learning_rate": 9.642239874583143e-06, + "loss": 0.9013, + "step": 4490 + }, + { + "epoch": 0.24717926137927237, + "grad_norm": 0.7582293748855591, + "learning_rate": 9.642078840563306e-06, + "loss": 0.7795, + "step": 4491 + }, + { + "epoch": 0.24723430018162804, + "grad_norm": 0.8276637196540833, + "learning_rate": 9.641917771654848e-06, + "loss": 0.7756, + "step": 4492 + }, + { + "epoch": 0.2472893389839837, + "grad_norm": 0.697088360786438, + "learning_rate": 9.641756667858976e-06, + "loss": 0.7092, + "step": 4493 + }, + { + "epoch": 0.24734437778633936, + "grad_norm": 0.8960816860198975, + "learning_rate": 9.641595529176907e-06, + "loss": 0.8835, + "step": 4494 + }, + { + "epoch": 0.24739941658869502, + "grad_norm": 0.9210898280143738, + "learning_rate": 9.641434355609846e-06, + "loss": 0.7881, + "step": 4495 + }, + { + "epoch": 0.24745445539105068, + "grad_norm": 0.7205467820167542, + "learning_rate": 9.64127314715901e-06, + "loss": 0.7204, + "step": 4496 + }, + { + "epoch": 0.24750949419340634, + "grad_norm": 0.7313701510429382, + "learning_rate": 9.641111903825603e-06, + "loss": 0.8296, + "step": 4497 + }, + { + "epoch": 0.247564532995762, + "grad_norm": 0.771159827709198, + "learning_rate": 9.640950625610845e-06, + "loss": 0.7974, + "step": 4498 + }, + { + "epoch": 0.24761957179811767, + "grad_norm": 0.9227705597877502, + "learning_rate": 9.64078931251594e-06, + "loss": 0.9215, + "step": 4499 + }, + { + "epoch": 0.24767461060047333, + "grad_norm": 0.7569915652275085, + "learning_rate": 9.64062796454211e-06, + "loss": 0.83, + "step": 4500 + }, + { + "epoch": 0.247729649402829, + "grad_norm": 0.7453131675720215, + "learning_rate": 9.64046658169056e-06, + "loss": 0.6747, + "step": 4501 + }, + { + "epoch": 0.24778468820518465, + "grad_norm": 0.7228132486343384, + "learning_rate": 9.640305163962504e-06, + "loss": 0.7535, + "step": 4502 + }, + { + "epoch": 0.2478397270075403, + "grad_norm": 0.8160690069198608, + "learning_rate": 9.640143711359159e-06, + "loss": 0.8655, + "step": 4503 + }, + { + "epoch": 0.24789476580989597, + "grad_norm": 0.7641691565513611, + "learning_rate": 9.639982223881735e-06, + "loss": 0.8353, + "step": 4504 + }, + { + "epoch": 0.24794980461225163, + "grad_norm": 0.8669107556343079, + "learning_rate": 9.639820701531445e-06, + "loss": 0.8614, + "step": 4505 + }, + { + "epoch": 0.2480048434146073, + "grad_norm": 0.7433111667633057, + "learning_rate": 9.639659144309508e-06, + "loss": 0.6891, + "step": 4506 + }, + { + "epoch": 0.24805988221696296, + "grad_norm": 1.4303346872329712, + "learning_rate": 9.639497552217131e-06, + "loss": 0.8016, + "step": 4507 + }, + { + "epoch": 0.24811492101931862, + "grad_norm": 0.8684772253036499, + "learning_rate": 9.639335925255535e-06, + "loss": 0.8324, + "step": 4508 + }, + { + "epoch": 0.24816995982167428, + "grad_norm": 0.9222162365913391, + "learning_rate": 9.639174263425932e-06, + "loss": 0.8715, + "step": 4509 + }, + { + "epoch": 0.24822499862402994, + "grad_norm": 0.9789180755615234, + "learning_rate": 9.639012566729535e-06, + "loss": 0.823, + "step": 4510 + }, + { + "epoch": 0.2482800374263856, + "grad_norm": 0.8475140333175659, + "learning_rate": 9.638850835167564e-06, + "loss": 0.768, + "step": 4511 + }, + { + "epoch": 0.24833507622874126, + "grad_norm": 0.7943722605705261, + "learning_rate": 9.63868906874123e-06, + "loss": 0.788, + "step": 4512 + }, + { + "epoch": 0.24839011503109693, + "grad_norm": 0.8723915815353394, + "learning_rate": 9.63852726745175e-06, + "loss": 0.7865, + "step": 4513 + }, + { + "epoch": 0.2484451538334526, + "grad_norm": 0.837001383304596, + "learning_rate": 9.638365431300342e-06, + "loss": 0.7799, + "step": 4514 + }, + { + "epoch": 0.24850019263580825, + "grad_norm": 0.7992665767669678, + "learning_rate": 9.638203560288222e-06, + "loss": 0.8951, + "step": 4515 + }, + { + "epoch": 0.2485552314381639, + "grad_norm": 0.8712993264198303, + "learning_rate": 9.638041654416603e-06, + "loss": 0.8157, + "step": 4516 + }, + { + "epoch": 0.24861027024051957, + "grad_norm": 0.7176356911659241, + "learning_rate": 9.637879713686706e-06, + "loss": 0.8197, + "step": 4517 + }, + { + "epoch": 0.24866530904287523, + "grad_norm": 0.7624368071556091, + "learning_rate": 9.637717738099747e-06, + "loss": 0.7545, + "step": 4518 + }, + { + "epoch": 0.2487203478452309, + "grad_norm": 0.857222318649292, + "learning_rate": 9.637555727656943e-06, + "loss": 0.8146, + "step": 4519 + }, + { + "epoch": 0.24877538664758655, + "grad_norm": 0.7461313605308533, + "learning_rate": 9.637393682359511e-06, + "loss": 0.8569, + "step": 4520 + }, + { + "epoch": 0.24883042544994222, + "grad_norm": 0.8491896986961365, + "learning_rate": 9.637231602208668e-06, + "loss": 0.863, + "step": 4521 + }, + { + "epoch": 0.24888546425229788, + "grad_norm": 0.8139386177062988, + "learning_rate": 9.637069487205635e-06, + "loss": 0.7105, + "step": 4522 + }, + { + "epoch": 0.24894050305465354, + "grad_norm": 0.7782894968986511, + "learning_rate": 9.636907337351629e-06, + "loss": 0.8044, + "step": 4523 + }, + { + "epoch": 0.2489955418570092, + "grad_norm": 0.8225486874580383, + "learning_rate": 9.636745152647868e-06, + "loss": 0.7877, + "step": 4524 + }, + { + "epoch": 0.24905058065936486, + "grad_norm": 0.9087927341461182, + "learning_rate": 9.636582933095573e-06, + "loss": 0.8017, + "step": 4525 + }, + { + "epoch": 0.24910561946172052, + "grad_norm": 0.7392508387565613, + "learning_rate": 9.636420678695962e-06, + "loss": 0.7953, + "step": 4526 + }, + { + "epoch": 0.24916065826407618, + "grad_norm": 0.7906273007392883, + "learning_rate": 9.636258389450253e-06, + "loss": 0.9491, + "step": 4527 + }, + { + "epoch": 0.24921569706643185, + "grad_norm": 0.840394139289856, + "learning_rate": 9.636096065359666e-06, + "loss": 0.8621, + "step": 4528 + }, + { + "epoch": 0.2492707358687875, + "grad_norm": 0.7923862934112549, + "learning_rate": 9.635933706425424e-06, + "loss": 0.8215, + "step": 4529 + }, + { + "epoch": 0.24932577467114317, + "grad_norm": 0.8372805714607239, + "learning_rate": 9.635771312648744e-06, + "loss": 0.8845, + "step": 4530 + }, + { + "epoch": 0.24938081347349883, + "grad_norm": 0.7569165229797363, + "learning_rate": 9.635608884030848e-06, + "loss": 0.8406, + "step": 4531 + }, + { + "epoch": 0.24943585227585446, + "grad_norm": 0.8260865807533264, + "learning_rate": 9.635446420572956e-06, + "loss": 0.8418, + "step": 4532 + }, + { + "epoch": 0.24949089107821013, + "grad_norm": 0.6841318607330322, + "learning_rate": 9.635283922276291e-06, + "loss": 0.6732, + "step": 4533 + }, + { + "epoch": 0.2495459298805658, + "grad_norm": 0.7055326104164124, + "learning_rate": 9.635121389142072e-06, + "loss": 0.7702, + "step": 4534 + }, + { + "epoch": 0.24960096868292145, + "grad_norm": 0.7293457388877869, + "learning_rate": 9.63495882117152e-06, + "loss": 0.6836, + "step": 4535 + }, + { + "epoch": 0.2496560074852771, + "grad_norm": 0.7411924004554749, + "learning_rate": 9.63479621836586e-06, + "loss": 0.8686, + "step": 4536 + }, + { + "epoch": 0.24971104628763277, + "grad_norm": 0.7864643931388855, + "learning_rate": 9.634633580726313e-06, + "loss": 0.7801, + "step": 4537 + }, + { + "epoch": 0.24976608508998843, + "grad_norm": 0.9730797410011292, + "learning_rate": 9.634470908254099e-06, + "loss": 0.8362, + "step": 4538 + }, + { + "epoch": 0.2498211238923441, + "grad_norm": 0.8390370011329651, + "learning_rate": 9.634308200950442e-06, + "loss": 0.8079, + "step": 4539 + }, + { + "epoch": 0.24987616269469975, + "grad_norm": 0.8951246738433838, + "learning_rate": 9.634145458816566e-06, + "loss": 0.7662, + "step": 4540 + }, + { + "epoch": 0.24993120149705542, + "grad_norm": 0.7654157280921936, + "learning_rate": 9.633982681853693e-06, + "loss": 0.8699, + "step": 4541 + }, + { + "epoch": 0.24998624029941108, + "grad_norm": 0.8152109980583191, + "learning_rate": 9.633819870063046e-06, + "loss": 0.7875, + "step": 4542 + }, + { + "epoch": 0.25004127910176677, + "grad_norm": 0.9407321214675903, + "learning_rate": 9.63365702344585e-06, + "loss": 0.7708, + "step": 4543 + }, + { + "epoch": 0.2500963179041224, + "grad_norm": 0.8169927597045898, + "learning_rate": 9.633494142003327e-06, + "loss": 0.8078, + "step": 4544 + }, + { + "epoch": 0.2501513567064781, + "grad_norm": 0.7380755543708801, + "learning_rate": 9.633331225736704e-06, + "loss": 0.7818, + "step": 4545 + }, + { + "epoch": 0.2502063955088337, + "grad_norm": 0.8124812841415405, + "learning_rate": 9.633168274647203e-06, + "loss": 0.8133, + "step": 4546 + }, + { + "epoch": 0.2502614343111894, + "grad_norm": 0.8511367440223694, + "learning_rate": 9.63300528873605e-06, + "loss": 0.7747, + "step": 4547 + }, + { + "epoch": 0.25031647311354505, + "grad_norm": 0.7305121421813965, + "learning_rate": 9.632842268004469e-06, + "loss": 0.8479, + "step": 4548 + }, + { + "epoch": 0.25037151191590074, + "grad_norm": 0.7127692103385925, + "learning_rate": 9.632679212453686e-06, + "loss": 0.8514, + "step": 4549 + }, + { + "epoch": 0.25042655071825637, + "grad_norm": 0.8251872062683105, + "learning_rate": 9.632516122084926e-06, + "loss": 0.7686, + "step": 4550 + }, + { + "epoch": 0.25048158952061206, + "grad_norm": 0.6756613850593567, + "learning_rate": 9.632352996899413e-06, + "loss": 0.5959, + "step": 4551 + }, + { + "epoch": 0.2505366283229677, + "grad_norm": 0.9266120791435242, + "learning_rate": 9.632189836898377e-06, + "loss": 0.7889, + "step": 4552 + }, + { + "epoch": 0.2505916671253233, + "grad_norm": 0.769890546798706, + "learning_rate": 9.63202664208304e-06, + "loss": 0.7864, + "step": 4553 + }, + { + "epoch": 0.250646705927679, + "grad_norm": 0.7314025163650513, + "learning_rate": 9.631863412454634e-06, + "loss": 0.8088, + "step": 4554 + }, + { + "epoch": 0.25070174473003465, + "grad_norm": 0.818317711353302, + "learning_rate": 9.63170014801438e-06, + "loss": 0.7096, + "step": 4555 + }, + { + "epoch": 0.25075678353239034, + "grad_norm": 0.7538807392120361, + "learning_rate": 9.631536848763508e-06, + "loss": 0.7779, + "step": 4556 + }, + { + "epoch": 0.25081182233474597, + "grad_norm": 0.7658100128173828, + "learning_rate": 9.631373514703247e-06, + "loss": 0.8535, + "step": 4557 + }, + { + "epoch": 0.25086686113710166, + "grad_norm": 0.8019290566444397, + "learning_rate": 9.631210145834819e-06, + "loss": 0.8141, + "step": 4558 + }, + { + "epoch": 0.2509218999394573, + "grad_norm": 0.7257653474807739, + "learning_rate": 9.631046742159456e-06, + "loss": 0.7451, + "step": 4559 + }, + { + "epoch": 0.250976938741813, + "grad_norm": 0.7546024918556213, + "learning_rate": 9.630883303678386e-06, + "loss": 0.7707, + "step": 4560 + }, + { + "epoch": 0.2510319775441686, + "grad_norm": 0.7288938760757446, + "learning_rate": 9.630719830392835e-06, + "loss": 0.7362, + "step": 4561 + }, + { + "epoch": 0.2510870163465243, + "grad_norm": 0.7814223170280457, + "learning_rate": 9.630556322304036e-06, + "loss": 0.8514, + "step": 4562 + }, + { + "epoch": 0.25114205514887994, + "grad_norm": 0.7561381459236145, + "learning_rate": 9.630392779413214e-06, + "loss": 0.7659, + "step": 4563 + }, + { + "epoch": 0.25119709395123563, + "grad_norm": 0.750641942024231, + "learning_rate": 9.6302292017216e-06, + "loss": 0.8496, + "step": 4564 + }, + { + "epoch": 0.25125213275359126, + "grad_norm": 0.832155704498291, + "learning_rate": 9.630065589230422e-06, + "loss": 0.7778, + "step": 4565 + }, + { + "epoch": 0.25130717155594695, + "grad_norm": 0.8202440142631531, + "learning_rate": 9.62990194194091e-06, + "loss": 0.8962, + "step": 4566 + }, + { + "epoch": 0.2513622103583026, + "grad_norm": 0.8777977824211121, + "learning_rate": 9.629738259854295e-06, + "loss": 0.7215, + "step": 4567 + }, + { + "epoch": 0.2514172491606583, + "grad_norm": 1.1868599653244019, + "learning_rate": 9.629574542971806e-06, + "loss": 0.8238, + "step": 4568 + }, + { + "epoch": 0.2514722879630139, + "grad_norm": 0.9128753542900085, + "learning_rate": 9.629410791294675e-06, + "loss": 0.7638, + "step": 4569 + }, + { + "epoch": 0.2515273267653696, + "grad_norm": 0.7350082993507385, + "learning_rate": 9.629247004824132e-06, + "loss": 0.8041, + "step": 4570 + }, + { + "epoch": 0.25158236556772523, + "grad_norm": 0.7279660701751709, + "learning_rate": 9.629083183561407e-06, + "loss": 0.7377, + "step": 4571 + }, + { + "epoch": 0.2516374043700809, + "grad_norm": 0.8570461273193359, + "learning_rate": 9.628919327507732e-06, + "loss": 0.8106, + "step": 4572 + }, + { + "epoch": 0.25169244317243655, + "grad_norm": 0.8998312950134277, + "learning_rate": 9.62875543666434e-06, + "loss": 0.8171, + "step": 4573 + }, + { + "epoch": 0.25174748197479224, + "grad_norm": 0.7631624937057495, + "learning_rate": 9.628591511032456e-06, + "loss": 0.7871, + "step": 4574 + }, + { + "epoch": 0.2518025207771479, + "grad_norm": 0.7752320766448975, + "learning_rate": 9.628427550613322e-06, + "loss": 0.8241, + "step": 4575 + }, + { + "epoch": 0.25185755957950356, + "grad_norm": 0.8741563558578491, + "learning_rate": 9.628263555408163e-06, + "loss": 0.7312, + "step": 4576 + }, + { + "epoch": 0.2519125983818592, + "grad_norm": 0.8615008592605591, + "learning_rate": 9.628099525418216e-06, + "loss": 0.8586, + "step": 4577 + }, + { + "epoch": 0.2519676371842149, + "grad_norm": 0.8273662328720093, + "learning_rate": 9.62793546064471e-06, + "loss": 0.7838, + "step": 4578 + }, + { + "epoch": 0.2520226759865705, + "grad_norm": 0.7454090118408203, + "learning_rate": 9.627771361088882e-06, + "loss": 0.8461, + "step": 4579 + }, + { + "epoch": 0.2520777147889262, + "grad_norm": 0.8225379586219788, + "learning_rate": 9.627607226751962e-06, + "loss": 0.7792, + "step": 4580 + }, + { + "epoch": 0.25213275359128184, + "grad_norm": 0.8655416369438171, + "learning_rate": 9.627443057635184e-06, + "loss": 0.8165, + "step": 4581 + }, + { + "epoch": 0.25218779239363753, + "grad_norm": 0.7735984921455383, + "learning_rate": 9.627278853739783e-06, + "loss": 0.8208, + "step": 4582 + }, + { + "epoch": 0.25224283119599317, + "grad_norm": 0.8293350338935852, + "learning_rate": 9.627114615066994e-06, + "loss": 0.7394, + "step": 4583 + }, + { + "epoch": 0.25229786999834886, + "grad_norm": 0.7840214371681213, + "learning_rate": 9.626950341618048e-06, + "loss": 0.8522, + "step": 4584 + }, + { + "epoch": 0.2523529088007045, + "grad_norm": 0.7724186182022095, + "learning_rate": 9.626786033394185e-06, + "loss": 0.8175, + "step": 4585 + }, + { + "epoch": 0.2524079476030602, + "grad_norm": 1.0751588344573975, + "learning_rate": 9.626621690396634e-06, + "loss": 0.9229, + "step": 4586 + }, + { + "epoch": 0.2524629864054158, + "grad_norm": 0.7016913294792175, + "learning_rate": 9.626457312626634e-06, + "loss": 0.6883, + "step": 4587 + }, + { + "epoch": 0.2525180252077715, + "grad_norm": 0.918377697467804, + "learning_rate": 9.626292900085419e-06, + "loss": 0.7889, + "step": 4588 + }, + { + "epoch": 0.25257306401012714, + "grad_norm": 1.006564736366272, + "learning_rate": 9.626128452774226e-06, + "loss": 0.7888, + "step": 4589 + }, + { + "epoch": 0.2526281028124828, + "grad_norm": 1.0214998722076416, + "learning_rate": 9.625963970694287e-06, + "loss": 0.768, + "step": 4590 + }, + { + "epoch": 0.25268314161483846, + "grad_norm": 0.7980843186378479, + "learning_rate": 9.625799453846844e-06, + "loss": 0.8662, + "step": 4591 + }, + { + "epoch": 0.25273818041719415, + "grad_norm": 0.734582245349884, + "learning_rate": 9.625634902233128e-06, + "loss": 0.759, + "step": 4592 + }, + { + "epoch": 0.2527932192195498, + "grad_norm": 0.7185904383659363, + "learning_rate": 9.62547031585438e-06, + "loss": 0.774, + "step": 4593 + }, + { + "epoch": 0.25284825802190547, + "grad_norm": 0.7356622219085693, + "learning_rate": 9.625305694711835e-06, + "loss": 0.7435, + "step": 4594 + }, + { + "epoch": 0.2529032968242611, + "grad_norm": 0.7589355707168579, + "learning_rate": 9.62514103880673e-06, + "loss": 0.807, + "step": 4595 + }, + { + "epoch": 0.25295833562661674, + "grad_norm": 0.889228880405426, + "learning_rate": 9.624976348140305e-06, + "loss": 0.8609, + "step": 4596 + }, + { + "epoch": 0.2530133744289724, + "grad_norm": 0.7546125650405884, + "learning_rate": 9.624811622713793e-06, + "loss": 0.8379, + "step": 4597 + }, + { + "epoch": 0.25306841323132806, + "grad_norm": 0.8262770175933838, + "learning_rate": 9.624646862528436e-06, + "loss": 0.7611, + "step": 4598 + }, + { + "epoch": 0.25312345203368375, + "grad_norm": 0.8876076936721802, + "learning_rate": 9.624482067585472e-06, + "loss": 0.8106, + "step": 4599 + }, + { + "epoch": 0.2531784908360394, + "grad_norm": 0.7045544981956482, + "learning_rate": 9.624317237886137e-06, + "loss": 0.7121, + "step": 4600 + }, + { + "epoch": 0.25323352963839507, + "grad_norm": 0.7693355083465576, + "learning_rate": 9.624152373431672e-06, + "loss": 0.8052, + "step": 4601 + }, + { + "epoch": 0.2532885684407507, + "grad_norm": 0.8072683811187744, + "learning_rate": 9.623987474223316e-06, + "loss": 0.8543, + "step": 4602 + }, + { + "epoch": 0.2533436072431064, + "grad_norm": 0.8158687949180603, + "learning_rate": 9.62382254026231e-06, + "loss": 0.6922, + "step": 4603 + }, + { + "epoch": 0.25339864604546203, + "grad_norm": 0.7688641548156738, + "learning_rate": 9.623657571549887e-06, + "loss": 0.7198, + "step": 4604 + }, + { + "epoch": 0.2534536848478177, + "grad_norm": 0.7806578278541565, + "learning_rate": 9.623492568087293e-06, + "loss": 0.8539, + "step": 4605 + }, + { + "epoch": 0.25350872365017335, + "grad_norm": 0.9557347893714905, + "learning_rate": 9.623327529875769e-06, + "loss": 0.6996, + "step": 4606 + }, + { + "epoch": 0.25356376245252904, + "grad_norm": 0.9465067386627197, + "learning_rate": 9.62316245691655e-06, + "loss": 0.8756, + "step": 4607 + }, + { + "epoch": 0.2536188012548847, + "grad_norm": 0.8029165863990784, + "learning_rate": 9.62299734921088e-06, + "loss": 0.8573, + "step": 4608 + }, + { + "epoch": 0.25367384005724036, + "grad_norm": 0.7530128955841064, + "learning_rate": 9.62283220676e-06, + "loss": 0.7466, + "step": 4609 + }, + { + "epoch": 0.253728878859596, + "grad_norm": 0.6704453825950623, + "learning_rate": 9.622667029565151e-06, + "loss": 0.6512, + "step": 4610 + }, + { + "epoch": 0.2537839176619517, + "grad_norm": 0.7162728309631348, + "learning_rate": 9.622501817627574e-06, + "loss": 0.7615, + "step": 4611 + }, + { + "epoch": 0.2538389564643073, + "grad_norm": 0.7599188089370728, + "learning_rate": 9.622336570948509e-06, + "loss": 0.8463, + "step": 4612 + }, + { + "epoch": 0.253893995266663, + "grad_norm": 0.7922326922416687, + "learning_rate": 9.6221712895292e-06, + "loss": 0.9221, + "step": 4613 + }, + { + "epoch": 0.25394903406901864, + "grad_norm": 1.4635218381881714, + "learning_rate": 9.622005973370892e-06, + "loss": 0.9159, + "step": 4614 + }, + { + "epoch": 0.25400407287137433, + "grad_norm": 0.8695057034492493, + "learning_rate": 9.62184062247482e-06, + "loss": 0.6792, + "step": 4615 + }, + { + "epoch": 0.25405911167372996, + "grad_norm": 0.8070930242538452, + "learning_rate": 9.621675236842235e-06, + "loss": 0.8257, + "step": 4616 + }, + { + "epoch": 0.25411415047608565, + "grad_norm": 0.8642075061798096, + "learning_rate": 9.621509816474372e-06, + "loss": 0.8223, + "step": 4617 + }, + { + "epoch": 0.2541691892784413, + "grad_norm": 0.7131080031394958, + "learning_rate": 9.621344361372483e-06, + "loss": 0.6831, + "step": 4618 + }, + { + "epoch": 0.254224228080797, + "grad_norm": 0.7582216262817383, + "learning_rate": 9.621178871537804e-06, + "loss": 0.8091, + "step": 4619 + }, + { + "epoch": 0.2542792668831526, + "grad_norm": 0.7705016732215881, + "learning_rate": 9.62101334697158e-06, + "loss": 0.7537, + "step": 4620 + }, + { + "epoch": 0.2543343056855083, + "grad_norm": 0.7638342976570129, + "learning_rate": 9.62084778767506e-06, + "loss": 0.7661, + "step": 4621 + }, + { + "epoch": 0.25438934448786393, + "grad_norm": 0.9296607971191406, + "learning_rate": 9.620682193649482e-06, + "loss": 0.8875, + "step": 4622 + }, + { + "epoch": 0.2544443832902196, + "grad_norm": 0.795394778251648, + "learning_rate": 9.620516564896096e-06, + "loss": 0.6884, + "step": 4623 + }, + { + "epoch": 0.25449942209257526, + "grad_norm": 0.9164957404136658, + "learning_rate": 9.620350901416142e-06, + "loss": 0.8693, + "step": 4624 + }, + { + "epoch": 0.25455446089493095, + "grad_norm": 0.8306281566619873, + "learning_rate": 9.62018520321087e-06, + "loss": 0.8972, + "step": 4625 + }, + { + "epoch": 0.2546094996972866, + "grad_norm": 0.778831422328949, + "learning_rate": 9.620019470281521e-06, + "loss": 0.7574, + "step": 4626 + }, + { + "epoch": 0.25466453849964227, + "grad_norm": 0.9326225519180298, + "learning_rate": 9.619853702629343e-06, + "loss": 0.7712, + "step": 4627 + }, + { + "epoch": 0.2547195773019979, + "grad_norm": 0.8772255182266235, + "learning_rate": 9.619687900255581e-06, + "loss": 0.8241, + "step": 4628 + }, + { + "epoch": 0.2547746161043536, + "grad_norm": 0.8777550458908081, + "learning_rate": 9.619522063161482e-06, + "loss": 0.8724, + "step": 4629 + }, + { + "epoch": 0.2548296549067092, + "grad_norm": 0.8332602381706238, + "learning_rate": 9.61935619134829e-06, + "loss": 0.8716, + "step": 4630 + }, + { + "epoch": 0.2548846937090649, + "grad_norm": 0.8246355056762695, + "learning_rate": 9.619190284817255e-06, + "loss": 0.7789, + "step": 4631 + }, + { + "epoch": 0.25493973251142055, + "grad_norm": 0.7200644612312317, + "learning_rate": 9.61902434356962e-06, + "loss": 0.7956, + "step": 4632 + }, + { + "epoch": 0.25499477131377624, + "grad_norm": 0.827756404876709, + "learning_rate": 9.618858367606638e-06, + "loss": 0.7925, + "step": 4633 + }, + { + "epoch": 0.25504981011613187, + "grad_norm": 0.7749341726303101, + "learning_rate": 9.618692356929551e-06, + "loss": 0.8706, + "step": 4634 + }, + { + "epoch": 0.25510484891848756, + "grad_norm": 0.7233432531356812, + "learning_rate": 9.618526311539608e-06, + "loss": 0.7725, + "step": 4635 + }, + { + "epoch": 0.2551598877208432, + "grad_norm": 0.846340537071228, + "learning_rate": 9.618360231438058e-06, + "loss": 0.8758, + "step": 4636 + }, + { + "epoch": 0.2552149265231989, + "grad_norm": 0.8262908458709717, + "learning_rate": 9.61819411662615e-06, + "loss": 0.7758, + "step": 4637 + }, + { + "epoch": 0.2552699653255545, + "grad_norm": 0.7829110026359558, + "learning_rate": 9.61802796710513e-06, + "loss": 0.8494, + "step": 4638 + }, + { + "epoch": 0.25532500412791015, + "grad_norm": 0.7480815649032593, + "learning_rate": 9.617861782876247e-06, + "loss": 0.7639, + "step": 4639 + }, + { + "epoch": 0.25538004293026584, + "grad_norm": 0.8782994747161865, + "learning_rate": 9.617695563940752e-06, + "loss": 0.9651, + "step": 4640 + }, + { + "epoch": 0.25543508173262147, + "grad_norm": 0.7215868234634399, + "learning_rate": 9.617529310299895e-06, + "loss": 0.7833, + "step": 4641 + }, + { + "epoch": 0.25549012053497716, + "grad_norm": 0.8287535905838013, + "learning_rate": 9.617363021954922e-06, + "loss": 0.901, + "step": 4642 + }, + { + "epoch": 0.2555451593373328, + "grad_norm": 0.7679935097694397, + "learning_rate": 9.617196698907084e-06, + "loss": 0.761, + "step": 4643 + }, + { + "epoch": 0.2556001981396885, + "grad_norm": 0.7765942811965942, + "learning_rate": 9.617030341157632e-06, + "loss": 0.7356, + "step": 4644 + }, + { + "epoch": 0.2556552369420441, + "grad_norm": 0.6964583396911621, + "learning_rate": 9.616863948707816e-06, + "loss": 0.7683, + "step": 4645 + }, + { + "epoch": 0.2557102757443998, + "grad_norm": 0.8031953573226929, + "learning_rate": 9.616697521558886e-06, + "loss": 0.7875, + "step": 4646 + }, + { + "epoch": 0.25576531454675544, + "grad_norm": 0.7155965566635132, + "learning_rate": 9.616531059712094e-06, + "loss": 0.6516, + "step": 4647 + }, + { + "epoch": 0.25582035334911113, + "grad_norm": 0.6870070099830627, + "learning_rate": 9.61636456316869e-06, + "loss": 0.7217, + "step": 4648 + }, + { + "epoch": 0.25587539215146676, + "grad_norm": 0.7686315774917603, + "learning_rate": 9.616198031929926e-06, + "loss": 0.8136, + "step": 4649 + }, + { + "epoch": 0.25593043095382245, + "grad_norm": 0.7532772421836853, + "learning_rate": 9.616031465997054e-06, + "loss": 0.696, + "step": 4650 + }, + { + "epoch": 0.2559854697561781, + "grad_norm": 0.8111574053764343, + "learning_rate": 9.615864865371323e-06, + "loss": 0.8501, + "step": 4651 + }, + { + "epoch": 0.2560405085585338, + "grad_norm": 0.771065890789032, + "learning_rate": 9.615698230053989e-06, + "loss": 0.7417, + "step": 4652 + }, + { + "epoch": 0.2560955473608894, + "grad_norm": 0.7468003034591675, + "learning_rate": 9.6155315600463e-06, + "loss": 0.7303, + "step": 4653 + }, + { + "epoch": 0.2561505861632451, + "grad_norm": 0.8041057586669922, + "learning_rate": 9.615364855349514e-06, + "loss": 0.8689, + "step": 4654 + }, + { + "epoch": 0.25620562496560073, + "grad_norm": 0.8439033627510071, + "learning_rate": 9.61519811596488e-06, + "loss": 0.8654, + "step": 4655 + }, + { + "epoch": 0.2562606637679564, + "grad_norm": 0.7768430113792419, + "learning_rate": 9.615031341893653e-06, + "loss": 0.8789, + "step": 4656 + }, + { + "epoch": 0.25631570257031205, + "grad_norm": 0.712876558303833, + "learning_rate": 9.614864533137086e-06, + "loss": 0.7497, + "step": 4657 + }, + { + "epoch": 0.25637074137266774, + "grad_norm": 0.7586949467658997, + "learning_rate": 9.614697689696431e-06, + "loss": 0.81, + "step": 4658 + }, + { + "epoch": 0.2564257801750234, + "grad_norm": 0.717078447341919, + "learning_rate": 9.614530811572946e-06, + "loss": 0.8023, + "step": 4659 + }, + { + "epoch": 0.25648081897737907, + "grad_norm": 0.7369407415390015, + "learning_rate": 9.61436389876788e-06, + "loss": 0.784, + "step": 4660 + }, + { + "epoch": 0.2565358577797347, + "grad_norm": 0.7536265850067139, + "learning_rate": 9.61419695128249e-06, + "loss": 0.7687, + "step": 4661 + }, + { + "epoch": 0.2565908965820904, + "grad_norm": 0.9718124866485596, + "learning_rate": 9.614029969118033e-06, + "loss": 0.8495, + "step": 4662 + }, + { + "epoch": 0.256645935384446, + "grad_norm": 1.1578630208969116, + "learning_rate": 9.613862952275762e-06, + "loss": 0.9189, + "step": 4663 + }, + { + "epoch": 0.2567009741868017, + "grad_norm": 0.7752498984336853, + "learning_rate": 9.613695900756929e-06, + "loss": 0.7677, + "step": 4664 + }, + { + "epoch": 0.25675601298915735, + "grad_norm": 0.9640393257141113, + "learning_rate": 9.613528814562795e-06, + "loss": 0.719, + "step": 4665 + }, + { + "epoch": 0.25681105179151303, + "grad_norm": 0.7690972089767456, + "learning_rate": 9.613361693694614e-06, + "loss": 0.7977, + "step": 4666 + }, + { + "epoch": 0.25686609059386867, + "grad_norm": 0.8390190601348877, + "learning_rate": 9.61319453815364e-06, + "loss": 0.8032, + "step": 4667 + }, + { + "epoch": 0.25692112939622436, + "grad_norm": 0.8293220400810242, + "learning_rate": 9.613027347941131e-06, + "loss": 0.8645, + "step": 4668 + }, + { + "epoch": 0.25697616819858, + "grad_norm": 0.8020731210708618, + "learning_rate": 9.612860123058344e-06, + "loss": 0.8374, + "step": 4669 + }, + { + "epoch": 0.2570312070009357, + "grad_norm": 0.7756736278533936, + "learning_rate": 9.612692863506534e-06, + "loss": 0.7318, + "step": 4670 + }, + { + "epoch": 0.2570862458032913, + "grad_norm": 0.895416259765625, + "learning_rate": 9.61252556928696e-06, + "loss": 0.9654, + "step": 4671 + }, + { + "epoch": 0.257141284605647, + "grad_norm": 0.8647375106811523, + "learning_rate": 9.61235824040088e-06, + "loss": 0.7411, + "step": 4672 + }, + { + "epoch": 0.25719632340800264, + "grad_norm": 0.6927250623703003, + "learning_rate": 9.612190876849546e-06, + "loss": 0.7558, + "step": 4673 + }, + { + "epoch": 0.2572513622103583, + "grad_norm": 0.7614898085594177, + "learning_rate": 9.612023478634222e-06, + "loss": 0.7696, + "step": 4674 + }, + { + "epoch": 0.25730640101271396, + "grad_norm": 0.7910586595535278, + "learning_rate": 9.611856045756166e-06, + "loss": 0.8207, + "step": 4675 + }, + { + "epoch": 0.25736143981506965, + "grad_norm": 0.7330125570297241, + "learning_rate": 9.611688578216632e-06, + "loss": 0.8615, + "step": 4676 + }, + { + "epoch": 0.2574164786174253, + "grad_norm": 0.7703417539596558, + "learning_rate": 9.611521076016882e-06, + "loss": 0.8321, + "step": 4677 + }, + { + "epoch": 0.25747151741978097, + "grad_norm": 0.7121796607971191, + "learning_rate": 9.611353539158174e-06, + "loss": 0.8228, + "step": 4678 + }, + { + "epoch": 0.2575265562221366, + "grad_norm": 0.8313117027282715, + "learning_rate": 9.611185967641768e-06, + "loss": 0.9012, + "step": 4679 + }, + { + "epoch": 0.2575815950244923, + "grad_norm": 0.806776225566864, + "learning_rate": 9.61101836146892e-06, + "loss": 0.769, + "step": 4680 + }, + { + "epoch": 0.2576366338268479, + "grad_norm": 0.7049515843391418, + "learning_rate": 9.610850720640894e-06, + "loss": 0.7938, + "step": 4681 + }, + { + "epoch": 0.25769167262920356, + "grad_norm": 0.7286638021469116, + "learning_rate": 9.610683045158948e-06, + "loss": 0.8168, + "step": 4682 + }, + { + "epoch": 0.25774671143155925, + "grad_norm": 0.7916898727416992, + "learning_rate": 9.610515335024345e-06, + "loss": 0.7681, + "step": 4683 + }, + { + "epoch": 0.2578017502339149, + "grad_norm": 0.7649673819541931, + "learning_rate": 9.61034759023834e-06, + "loss": 0.7273, + "step": 4684 + }, + { + "epoch": 0.2578567890362706, + "grad_norm": 0.8280686736106873, + "learning_rate": 9.610179810802196e-06, + "loss": 0.7968, + "step": 4685 + }, + { + "epoch": 0.2579118278386262, + "grad_norm": 0.7206569910049438, + "learning_rate": 9.610011996717175e-06, + "loss": 0.7359, + "step": 4686 + }, + { + "epoch": 0.2579668666409819, + "grad_norm": 0.7365424036979675, + "learning_rate": 9.60984414798454e-06, + "loss": 0.7962, + "step": 4687 + }, + { + "epoch": 0.25802190544333753, + "grad_norm": 0.8030344247817993, + "learning_rate": 9.609676264605549e-06, + "loss": 0.7931, + "step": 4688 + }, + { + "epoch": 0.2580769442456932, + "grad_norm": 0.8812693357467651, + "learning_rate": 9.609508346581464e-06, + "loss": 0.8493, + "step": 4689 + }, + { + "epoch": 0.25813198304804885, + "grad_norm": 0.8026734590530396, + "learning_rate": 9.60934039391355e-06, + "loss": 0.8368, + "step": 4690 + }, + { + "epoch": 0.25818702185040454, + "grad_norm": 0.8270768523216248, + "learning_rate": 9.609172406603067e-06, + "loss": 0.9077, + "step": 4691 + }, + { + "epoch": 0.2582420606527602, + "grad_norm": 0.7362856864929199, + "learning_rate": 9.609004384651276e-06, + "loss": 0.7384, + "step": 4692 + }, + { + "epoch": 0.25829709945511586, + "grad_norm": 0.7195929288864136, + "learning_rate": 9.608836328059444e-06, + "loss": 0.8475, + "step": 4693 + }, + { + "epoch": 0.2583521382574715, + "grad_norm": 0.7653167843818665, + "learning_rate": 9.60866823682883e-06, + "loss": 0.7704, + "step": 4694 + }, + { + "epoch": 0.2584071770598272, + "grad_norm": 0.7056792974472046, + "learning_rate": 9.6085001109607e-06, + "loss": 0.7835, + "step": 4695 + }, + { + "epoch": 0.2584622158621828, + "grad_norm": 0.7299804091453552, + "learning_rate": 9.60833195045632e-06, + "loss": 0.7894, + "step": 4696 + }, + { + "epoch": 0.2585172546645385, + "grad_norm": 0.7235645055770874, + "learning_rate": 9.608163755316948e-06, + "loss": 0.8113, + "step": 4697 + }, + { + "epoch": 0.25857229346689414, + "grad_norm": 0.7066782116889954, + "learning_rate": 9.60799552554385e-06, + "loss": 0.739, + "step": 4698 + }, + { + "epoch": 0.25862733226924983, + "grad_norm": 0.769930362701416, + "learning_rate": 9.607827261138291e-06, + "loss": 0.7565, + "step": 4699 + }, + { + "epoch": 0.25868237107160547, + "grad_norm": 0.8875935077667236, + "learning_rate": 9.607658962101538e-06, + "loss": 0.849, + "step": 4700 + }, + { + "epoch": 0.25873740987396115, + "grad_norm": 0.7887380123138428, + "learning_rate": 9.60749062843485e-06, + "loss": 0.8795, + "step": 4701 + }, + { + "epoch": 0.2587924486763168, + "grad_norm": 0.7600420117378235, + "learning_rate": 9.607322260139499e-06, + "loss": 0.7581, + "step": 4702 + }, + { + "epoch": 0.2588474874786725, + "grad_norm": 0.7431491017341614, + "learning_rate": 9.607153857216746e-06, + "loss": 0.7119, + "step": 4703 + }, + { + "epoch": 0.2589025262810281, + "grad_norm": 0.7444193363189697, + "learning_rate": 9.606985419667858e-06, + "loss": 0.7492, + "step": 4704 + }, + { + "epoch": 0.2589575650833838, + "grad_norm": 0.8348917365074158, + "learning_rate": 9.6068169474941e-06, + "loss": 0.7656, + "step": 4705 + }, + { + "epoch": 0.25901260388573943, + "grad_norm": 0.6790240406990051, + "learning_rate": 9.60664844069674e-06, + "loss": 0.6354, + "step": 4706 + }, + { + "epoch": 0.2590676426880951, + "grad_norm": 0.8425769805908203, + "learning_rate": 9.606479899277044e-06, + "loss": 0.7927, + "step": 4707 + }, + { + "epoch": 0.25912268149045076, + "grad_norm": 0.7234740853309631, + "learning_rate": 9.606311323236277e-06, + "loss": 0.8122, + "step": 4708 + }, + { + "epoch": 0.25917772029280645, + "grad_norm": 0.839507520198822, + "learning_rate": 9.606142712575707e-06, + "loss": 0.8807, + "step": 4709 + }, + { + "epoch": 0.2592327590951621, + "grad_norm": 0.7155291438102722, + "learning_rate": 9.605974067296601e-06, + "loss": 0.7852, + "step": 4710 + }, + { + "epoch": 0.25928779789751777, + "grad_norm": 0.7222152352333069, + "learning_rate": 9.605805387400228e-06, + "loss": 0.7362, + "step": 4711 + }, + { + "epoch": 0.2593428366998734, + "grad_norm": 0.8350114226341248, + "learning_rate": 9.605636672887854e-06, + "loss": 0.7201, + "step": 4712 + }, + { + "epoch": 0.2593978755022291, + "grad_norm": 0.6805943250656128, + "learning_rate": 9.605467923760747e-06, + "loss": 0.6936, + "step": 4713 + }, + { + "epoch": 0.2594529143045847, + "grad_norm": 0.7863980531692505, + "learning_rate": 9.605299140020177e-06, + "loss": 0.9079, + "step": 4714 + }, + { + "epoch": 0.2595079531069404, + "grad_norm": 0.838843584060669, + "learning_rate": 9.60513032166741e-06, + "loss": 0.839, + "step": 4715 + }, + { + "epoch": 0.25956299190929605, + "grad_norm": 0.7872797250747681, + "learning_rate": 9.60496146870372e-06, + "loss": 0.9164, + "step": 4716 + }, + { + "epoch": 0.25961803071165174, + "grad_norm": 0.7300794720649719, + "learning_rate": 9.604792581130369e-06, + "loss": 0.8227, + "step": 4717 + }, + { + "epoch": 0.25967306951400737, + "grad_norm": 0.8420879244804382, + "learning_rate": 9.60462365894863e-06, + "loss": 0.7865, + "step": 4718 + }, + { + "epoch": 0.25972810831636306, + "grad_norm": 0.807697057723999, + "learning_rate": 9.604454702159771e-06, + "loss": 0.9081, + "step": 4719 + }, + { + "epoch": 0.2597831471187187, + "grad_norm": 0.9041245579719543, + "learning_rate": 9.604285710765064e-06, + "loss": 0.8102, + "step": 4720 + }, + { + "epoch": 0.2598381859210744, + "grad_norm": 0.7061690092086792, + "learning_rate": 9.604116684765779e-06, + "loss": 0.762, + "step": 4721 + }, + { + "epoch": 0.25989322472343, + "grad_norm": 0.7790346741676331, + "learning_rate": 9.603947624163186e-06, + "loss": 0.8038, + "step": 4722 + }, + { + "epoch": 0.2599482635257857, + "grad_norm": 0.8109704256057739, + "learning_rate": 9.603778528958553e-06, + "loss": 0.9105, + "step": 4723 + }, + { + "epoch": 0.26000330232814134, + "grad_norm": 0.7396997213363647, + "learning_rate": 9.603609399153153e-06, + "loss": 0.8384, + "step": 4724 + }, + { + "epoch": 0.260058341130497, + "grad_norm": 0.8594317436218262, + "learning_rate": 9.603440234748257e-06, + "loss": 0.8301, + "step": 4725 + }, + { + "epoch": 0.26011337993285266, + "grad_norm": 0.7087241411209106, + "learning_rate": 9.603271035745138e-06, + "loss": 0.6652, + "step": 4726 + }, + { + "epoch": 0.2601684187352083, + "grad_norm": 0.7405440211296082, + "learning_rate": 9.603101802145065e-06, + "loss": 0.7804, + "step": 4727 + }, + { + "epoch": 0.260223457537564, + "grad_norm": 0.8637508749961853, + "learning_rate": 9.602932533949312e-06, + "loss": 0.8509, + "step": 4728 + }, + { + "epoch": 0.2602784963399196, + "grad_norm": 0.7040451765060425, + "learning_rate": 9.60276323115915e-06, + "loss": 0.7842, + "step": 4729 + }, + { + "epoch": 0.2603335351422753, + "grad_norm": 0.7743955254554749, + "learning_rate": 9.602593893775852e-06, + "loss": 0.8492, + "step": 4730 + }, + { + "epoch": 0.26038857394463094, + "grad_norm": 0.7110480070114136, + "learning_rate": 9.602424521800688e-06, + "loss": 0.7227, + "step": 4731 + }, + { + "epoch": 0.26044361274698663, + "grad_norm": 1.0066583156585693, + "learning_rate": 9.602255115234936e-06, + "loss": 0.8825, + "step": 4732 + }, + { + "epoch": 0.26049865154934226, + "grad_norm": 0.7746492624282837, + "learning_rate": 9.602085674079864e-06, + "loss": 0.8316, + "step": 4733 + }, + { + "epoch": 0.26055369035169795, + "grad_norm": 0.7394356727600098, + "learning_rate": 9.60191619833675e-06, + "loss": 0.746, + "step": 4734 + }, + { + "epoch": 0.2606087291540536, + "grad_norm": 0.7140582203865051, + "learning_rate": 9.601746688006866e-06, + "loss": 0.7204, + "step": 4735 + }, + { + "epoch": 0.2606637679564093, + "grad_norm": 0.753399133682251, + "learning_rate": 9.601577143091483e-06, + "loss": 0.8157, + "step": 4736 + }, + { + "epoch": 0.2607188067587649, + "grad_norm": 0.674320638179779, + "learning_rate": 9.601407563591881e-06, + "loss": 0.7279, + "step": 4737 + }, + { + "epoch": 0.2607738455611206, + "grad_norm": 0.855944037437439, + "learning_rate": 9.60123794950933e-06, + "loss": 0.804, + "step": 4738 + }, + { + "epoch": 0.26082888436347623, + "grad_norm": 0.6833948493003845, + "learning_rate": 9.601068300845106e-06, + "loss": 0.701, + "step": 4739 + }, + { + "epoch": 0.2608839231658319, + "grad_norm": 0.8085536360740662, + "learning_rate": 9.600898617600485e-06, + "loss": 0.8435, + "step": 4740 + }, + { + "epoch": 0.26093896196818755, + "grad_norm": 0.752849817276001, + "learning_rate": 9.600728899776741e-06, + "loss": 0.7205, + "step": 4741 + }, + { + "epoch": 0.26099400077054324, + "grad_norm": 0.7320554852485657, + "learning_rate": 9.600559147375151e-06, + "loss": 0.7556, + "step": 4742 + }, + { + "epoch": 0.2610490395728989, + "grad_norm": 0.7789202928543091, + "learning_rate": 9.600389360396988e-06, + "loss": 0.8467, + "step": 4743 + }, + { + "epoch": 0.26110407837525457, + "grad_norm": 0.8480898141860962, + "learning_rate": 9.600219538843532e-06, + "loss": 0.7762, + "step": 4744 + }, + { + "epoch": 0.2611591171776102, + "grad_norm": 0.8382542133331299, + "learning_rate": 9.600049682716055e-06, + "loss": 0.9051, + "step": 4745 + }, + { + "epoch": 0.2612141559799659, + "grad_norm": 0.8319274187088013, + "learning_rate": 9.599879792015838e-06, + "loss": 0.8221, + "step": 4746 + }, + { + "epoch": 0.2612691947823215, + "grad_norm": 0.7325875163078308, + "learning_rate": 9.599709866744156e-06, + "loss": 0.7968, + "step": 4747 + }, + { + "epoch": 0.2613242335846772, + "grad_norm": 0.7053360342979431, + "learning_rate": 9.599539906902285e-06, + "loss": 0.7073, + "step": 4748 + }, + { + "epoch": 0.26137927238703285, + "grad_norm": 0.763017475605011, + "learning_rate": 9.599369912491503e-06, + "loss": 0.7031, + "step": 4749 + }, + { + "epoch": 0.26143431118938854, + "grad_norm": 0.6816151738166809, + "learning_rate": 9.599199883513088e-06, + "loss": 0.7295, + "step": 4750 + }, + { + "epoch": 0.26148934999174417, + "grad_norm": 0.8143941164016724, + "learning_rate": 9.599029819968319e-06, + "loss": 0.8449, + "step": 4751 + }, + { + "epoch": 0.26154438879409986, + "grad_norm": 0.8093858361244202, + "learning_rate": 9.598859721858471e-06, + "loss": 0.8397, + "step": 4752 + }, + { + "epoch": 0.2615994275964555, + "grad_norm": 0.7431835532188416, + "learning_rate": 9.598689589184827e-06, + "loss": 0.7299, + "step": 4753 + }, + { + "epoch": 0.2616544663988112, + "grad_norm": 0.9871510863304138, + "learning_rate": 9.59851942194866e-06, + "loss": 0.7992, + "step": 4754 + }, + { + "epoch": 0.2617095052011668, + "grad_norm": 0.9304273724555969, + "learning_rate": 9.598349220151254e-06, + "loss": 0.7519, + "step": 4755 + }, + { + "epoch": 0.2617645440035225, + "grad_norm": 0.9361812472343445, + "learning_rate": 9.598178983793886e-06, + "loss": 0.8131, + "step": 4756 + }, + { + "epoch": 0.26181958280587814, + "grad_norm": 0.7783429622650146, + "learning_rate": 9.598008712877835e-06, + "loss": 0.7351, + "step": 4757 + }, + { + "epoch": 0.2618746216082338, + "grad_norm": 0.8739376068115234, + "learning_rate": 9.597838407404381e-06, + "loss": 0.9458, + "step": 4758 + }, + { + "epoch": 0.26192966041058946, + "grad_norm": 0.7076277732849121, + "learning_rate": 9.597668067374805e-06, + "loss": 0.7632, + "step": 4759 + }, + { + "epoch": 0.26198469921294515, + "grad_norm": 0.7652345299720764, + "learning_rate": 9.597497692790386e-06, + "loss": 0.8018, + "step": 4760 + }, + { + "epoch": 0.2620397380153008, + "grad_norm": 0.7332149147987366, + "learning_rate": 9.597327283652405e-06, + "loss": 0.8223, + "step": 4761 + }, + { + "epoch": 0.26209477681765647, + "grad_norm": 0.8361638784408569, + "learning_rate": 9.597156839962145e-06, + "loss": 0.8784, + "step": 4762 + }, + { + "epoch": 0.2621498156200121, + "grad_norm": 1.183772325515747, + "learning_rate": 9.596986361720882e-06, + "loss": 0.8768, + "step": 4763 + }, + { + "epoch": 0.2622048544223678, + "grad_norm": 0.9895418882369995, + "learning_rate": 9.596815848929902e-06, + "loss": 0.714, + "step": 4764 + }, + { + "epoch": 0.26225989322472343, + "grad_norm": 0.8210558295249939, + "learning_rate": 9.59664530159048e-06, + "loss": 0.7246, + "step": 4765 + }, + { + "epoch": 0.2623149320270791, + "grad_norm": 0.8003455996513367, + "learning_rate": 9.596474719703908e-06, + "loss": 0.8385, + "step": 4766 + }, + { + "epoch": 0.26236997082943475, + "grad_norm": 0.7555826306343079, + "learning_rate": 9.59630410327146e-06, + "loss": 0.7243, + "step": 4767 + }, + { + "epoch": 0.2624250096317904, + "grad_norm": 0.7746273279190063, + "learning_rate": 9.596133452294421e-06, + "loss": 0.8763, + "step": 4768 + }, + { + "epoch": 0.2624800484341461, + "grad_norm": 0.7238507866859436, + "learning_rate": 9.595962766774074e-06, + "loss": 0.8302, + "step": 4769 + }, + { + "epoch": 0.2625350872365017, + "grad_norm": 0.7874132394790649, + "learning_rate": 9.595792046711699e-06, + "loss": 0.7979, + "step": 4770 + }, + { + "epoch": 0.2625901260388574, + "grad_norm": 0.8792033791542053, + "learning_rate": 9.595621292108583e-06, + "loss": 0.8555, + "step": 4771 + }, + { + "epoch": 0.26264516484121303, + "grad_norm": 0.7026945948600769, + "learning_rate": 9.595450502966006e-06, + "loss": 0.718, + "step": 4772 + }, + { + "epoch": 0.2627002036435687, + "grad_norm": 0.7747959494590759, + "learning_rate": 9.595279679285254e-06, + "loss": 0.8329, + "step": 4773 + }, + { + "epoch": 0.26275524244592435, + "grad_norm": 0.697979748249054, + "learning_rate": 9.59510882106761e-06, + "loss": 0.7456, + "step": 4774 + }, + { + "epoch": 0.26281028124828004, + "grad_norm": 0.7600447535514832, + "learning_rate": 9.594937928314359e-06, + "loss": 0.875, + "step": 4775 + }, + { + "epoch": 0.2628653200506357, + "grad_norm": 0.7591384649276733, + "learning_rate": 9.594767001026783e-06, + "loss": 0.7607, + "step": 4776 + }, + { + "epoch": 0.26292035885299136, + "grad_norm": 0.9267380833625793, + "learning_rate": 9.59459603920617e-06, + "loss": 0.8926, + "step": 4777 + }, + { + "epoch": 0.262975397655347, + "grad_norm": 0.7751328349113464, + "learning_rate": 9.594425042853802e-06, + "loss": 0.7449, + "step": 4778 + }, + { + "epoch": 0.2630304364577027, + "grad_norm": 0.7066012620925903, + "learning_rate": 9.594254011970966e-06, + "loss": 0.8374, + "step": 4779 + }, + { + "epoch": 0.2630854752600583, + "grad_norm": 0.7564317584037781, + "learning_rate": 9.594082946558945e-06, + "loss": 0.735, + "step": 4780 + }, + { + "epoch": 0.263140514062414, + "grad_norm": 0.8151416182518005, + "learning_rate": 9.593911846619027e-06, + "loss": 0.8575, + "step": 4781 + }, + { + "epoch": 0.26319555286476964, + "grad_norm": 0.719261646270752, + "learning_rate": 9.593740712152497e-06, + "loss": 0.7981, + "step": 4782 + }, + { + "epoch": 0.26325059166712533, + "grad_norm": 0.8627344369888306, + "learning_rate": 9.593569543160642e-06, + "loss": 0.895, + "step": 4783 + }, + { + "epoch": 0.26330563046948097, + "grad_norm": 1.293272614479065, + "learning_rate": 9.593398339644748e-06, + "loss": 0.7531, + "step": 4784 + }, + { + "epoch": 0.26336066927183666, + "grad_norm": 0.8475207686424255, + "learning_rate": 9.593227101606102e-06, + "loss": 0.9091, + "step": 4785 + }, + { + "epoch": 0.2634157080741923, + "grad_norm": 0.78054279088974, + "learning_rate": 9.593055829045989e-06, + "loss": 0.7692, + "step": 4786 + }, + { + "epoch": 0.263470746876548, + "grad_norm": 0.7677399516105652, + "learning_rate": 9.592884521965699e-06, + "loss": 0.6232, + "step": 4787 + }, + { + "epoch": 0.2635257856789036, + "grad_norm": 0.7232677340507507, + "learning_rate": 9.59271318036652e-06, + "loss": 0.8087, + "step": 4788 + }, + { + "epoch": 0.2635808244812593, + "grad_norm": 0.8728463649749756, + "learning_rate": 9.592541804249735e-06, + "loss": 0.7824, + "step": 4789 + }, + { + "epoch": 0.26363586328361494, + "grad_norm": 0.7569910883903503, + "learning_rate": 9.592370393616637e-06, + "loss": 0.7418, + "step": 4790 + }, + { + "epoch": 0.2636909020859706, + "grad_norm": 0.7631934285163879, + "learning_rate": 9.592198948468511e-06, + "loss": 0.7929, + "step": 4791 + }, + { + "epoch": 0.26374594088832626, + "grad_norm": 0.8021631240844727, + "learning_rate": 9.592027468806649e-06, + "loss": 0.8111, + "step": 4792 + }, + { + "epoch": 0.26380097969068195, + "grad_norm": 0.9454651474952698, + "learning_rate": 9.591855954632336e-06, + "loss": 0.8239, + "step": 4793 + }, + { + "epoch": 0.2638560184930376, + "grad_norm": 0.672924280166626, + "learning_rate": 9.591684405946863e-06, + "loss": 0.6877, + "step": 4794 + }, + { + "epoch": 0.26391105729539327, + "grad_norm": 0.7942802906036377, + "learning_rate": 9.59151282275152e-06, + "loss": 0.9002, + "step": 4795 + }, + { + "epoch": 0.2639660960977489, + "grad_norm": 0.7131155133247375, + "learning_rate": 9.591341205047596e-06, + "loss": 0.7692, + "step": 4796 + }, + { + "epoch": 0.2640211349001046, + "grad_norm": 1.0395869016647339, + "learning_rate": 9.59116955283638e-06, + "loss": 0.8352, + "step": 4797 + }, + { + "epoch": 0.2640761737024602, + "grad_norm": 0.9503256678581238, + "learning_rate": 9.590997866119163e-06, + "loss": 1.0287, + "step": 4798 + }, + { + "epoch": 0.2641312125048159, + "grad_norm": 0.7539612054824829, + "learning_rate": 9.590826144897235e-06, + "loss": 0.872, + "step": 4799 + }, + { + "epoch": 0.26418625130717155, + "grad_norm": 0.7067893743515015, + "learning_rate": 9.590654389171885e-06, + "loss": 0.7636, + "step": 4800 + }, + { + "epoch": 0.26424129010952724, + "grad_norm": 0.7355281710624695, + "learning_rate": 9.590482598944407e-06, + "loss": 0.7715, + "step": 4801 + }, + { + "epoch": 0.26429632891188287, + "grad_norm": 0.7589674592018127, + "learning_rate": 9.590310774216089e-06, + "loss": 0.7451, + "step": 4802 + }, + { + "epoch": 0.26435136771423856, + "grad_norm": 0.701386034488678, + "learning_rate": 9.590138914988226e-06, + "loss": 0.7317, + "step": 4803 + }, + { + "epoch": 0.2644064065165942, + "grad_norm": 0.7663118243217468, + "learning_rate": 9.589967021262105e-06, + "loss": 0.8227, + "step": 4804 + }, + { + "epoch": 0.2644614453189499, + "grad_norm": 0.7059655785560608, + "learning_rate": 9.589795093039023e-06, + "loss": 0.7829, + "step": 4805 + }, + { + "epoch": 0.2645164841213055, + "grad_norm": 0.7377020120620728, + "learning_rate": 9.58962313032027e-06, + "loss": 0.8308, + "step": 4806 + }, + { + "epoch": 0.2645715229236612, + "grad_norm": 0.8635388612747192, + "learning_rate": 9.589451133107134e-06, + "loss": 0.7882, + "step": 4807 + }, + { + "epoch": 0.26462656172601684, + "grad_norm": 0.8282824754714966, + "learning_rate": 9.589279101400915e-06, + "loss": 0.8055, + "step": 4808 + }, + { + "epoch": 0.26468160052837253, + "grad_norm": 0.7026814818382263, + "learning_rate": 9.589107035202903e-06, + "loss": 0.7567, + "step": 4809 + }, + { + "epoch": 0.26473663933072816, + "grad_norm": 0.7575708031654358, + "learning_rate": 9.588934934514392e-06, + "loss": 0.7456, + "step": 4810 + }, + { + "epoch": 0.2647916781330838, + "grad_norm": 0.9732069969177246, + "learning_rate": 9.588762799336671e-06, + "loss": 0.8217, + "step": 4811 + }, + { + "epoch": 0.2648467169354395, + "grad_norm": 0.786803126335144, + "learning_rate": 9.58859062967104e-06, + "loss": 0.729, + "step": 4812 + }, + { + "epoch": 0.2649017557377951, + "grad_norm": 0.8068973422050476, + "learning_rate": 9.588418425518789e-06, + "loss": 0.8204, + "step": 4813 + }, + { + "epoch": 0.2649567945401508, + "grad_norm": 0.8222702145576477, + "learning_rate": 9.588246186881213e-06, + "loss": 0.8349, + "step": 4814 + }, + { + "epoch": 0.26501183334250644, + "grad_norm": 0.7560802698135376, + "learning_rate": 9.588073913759608e-06, + "loss": 0.7601, + "step": 4815 + }, + { + "epoch": 0.26506687214486213, + "grad_norm": 0.9221365451812744, + "learning_rate": 9.587901606155266e-06, + "loss": 0.7725, + "step": 4816 + }, + { + "epoch": 0.26512191094721776, + "grad_norm": 0.8092262744903564, + "learning_rate": 9.587729264069485e-06, + "loss": 0.9074, + "step": 4817 + }, + { + "epoch": 0.26517694974957345, + "grad_norm": 0.8183920979499817, + "learning_rate": 9.587556887503557e-06, + "loss": 0.8321, + "step": 4818 + }, + { + "epoch": 0.2652319885519291, + "grad_norm": 0.7023420929908752, + "learning_rate": 9.587384476458781e-06, + "loss": 0.7842, + "step": 4819 + }, + { + "epoch": 0.2652870273542848, + "grad_norm": 1.2864880561828613, + "learning_rate": 9.58721203093645e-06, + "loss": 0.7519, + "step": 4820 + }, + { + "epoch": 0.2653420661566404, + "grad_norm": 0.8133784532546997, + "learning_rate": 9.587039550937864e-06, + "loss": 0.8208, + "step": 4821 + }, + { + "epoch": 0.2653971049589961, + "grad_norm": 0.739732027053833, + "learning_rate": 9.586867036464314e-06, + "loss": 0.8553, + "step": 4822 + }, + { + "epoch": 0.26545214376135173, + "grad_norm": 0.7539162635803223, + "learning_rate": 9.5866944875171e-06, + "loss": 0.7385, + "step": 4823 + }, + { + "epoch": 0.2655071825637074, + "grad_norm": 0.8012336492538452, + "learning_rate": 9.58652190409752e-06, + "loss": 0.8343, + "step": 4824 + }, + { + "epoch": 0.26556222136606306, + "grad_norm": 0.7972521185874939, + "learning_rate": 9.586349286206865e-06, + "loss": 0.8481, + "step": 4825 + }, + { + "epoch": 0.26561726016841875, + "grad_norm": 0.7772900462150574, + "learning_rate": 9.58617663384644e-06, + "loss": 0.7655, + "step": 4826 + }, + { + "epoch": 0.2656722989707744, + "grad_norm": 0.677916944026947, + "learning_rate": 9.586003947017537e-06, + "loss": 0.696, + "step": 4827 + }, + { + "epoch": 0.26572733777313007, + "grad_norm": 0.8254117369651794, + "learning_rate": 9.585831225721455e-06, + "loss": 0.7841, + "step": 4828 + }, + { + "epoch": 0.2657823765754857, + "grad_norm": 0.7256904244422913, + "learning_rate": 9.585658469959496e-06, + "loss": 0.8057, + "step": 4829 + }, + { + "epoch": 0.2658374153778414, + "grad_norm": 0.7651757001876831, + "learning_rate": 9.585485679732953e-06, + "loss": 0.7918, + "step": 4830 + }, + { + "epoch": 0.265892454180197, + "grad_norm": 0.7581052184104919, + "learning_rate": 9.58531285504313e-06, + "loss": 0.759, + "step": 4831 + }, + { + "epoch": 0.2659474929825527, + "grad_norm": 0.7190486192703247, + "learning_rate": 9.58513999589132e-06, + "loss": 0.7403, + "step": 4832 + }, + { + "epoch": 0.26600253178490835, + "grad_norm": 0.8603141903877258, + "learning_rate": 9.584967102278825e-06, + "loss": 0.8944, + "step": 4833 + }, + { + "epoch": 0.26605757058726404, + "grad_norm": 0.806297779083252, + "learning_rate": 9.584794174206947e-06, + "loss": 0.7039, + "step": 4834 + }, + { + "epoch": 0.26611260938961967, + "grad_norm": 0.7604451775550842, + "learning_rate": 9.584621211676981e-06, + "loss": 0.8076, + "step": 4835 + }, + { + "epoch": 0.26616764819197536, + "grad_norm": 0.7276773452758789, + "learning_rate": 9.584448214690232e-06, + "loss": 0.786, + "step": 4836 + }, + { + "epoch": 0.266222686994331, + "grad_norm": 0.8737080693244934, + "learning_rate": 9.584275183247994e-06, + "loss": 0.8071, + "step": 4837 + }, + { + "epoch": 0.2662777257966867, + "grad_norm": 0.8447219133377075, + "learning_rate": 9.584102117351574e-06, + "loss": 0.7682, + "step": 4838 + }, + { + "epoch": 0.2663327645990423, + "grad_norm": 0.7001703381538391, + "learning_rate": 9.583929017002268e-06, + "loss": 0.7077, + "step": 4839 + }, + { + "epoch": 0.266387803401398, + "grad_norm": 0.7935730218887329, + "learning_rate": 9.583755882201377e-06, + "loss": 0.8122, + "step": 4840 + }, + { + "epoch": 0.26644284220375364, + "grad_norm": 0.8763312697410583, + "learning_rate": 9.583582712950207e-06, + "loss": 0.8241, + "step": 4841 + }, + { + "epoch": 0.2664978810061093, + "grad_norm": 0.7910245656967163, + "learning_rate": 9.583409509250055e-06, + "loss": 0.7717, + "step": 4842 + }, + { + "epoch": 0.26655291980846496, + "grad_norm": 0.7975226640701294, + "learning_rate": 9.583236271102222e-06, + "loss": 0.7165, + "step": 4843 + }, + { + "epoch": 0.26660795861082065, + "grad_norm": 0.8060342073440552, + "learning_rate": 9.583062998508014e-06, + "loss": 0.7659, + "step": 4844 + }, + { + "epoch": 0.2666629974131763, + "grad_norm": 0.8779375553131104, + "learning_rate": 9.582889691468732e-06, + "loss": 0.8207, + "step": 4845 + }, + { + "epoch": 0.266718036215532, + "grad_norm": 0.7409310936927795, + "learning_rate": 9.582716349985677e-06, + "loss": 0.8439, + "step": 4846 + }, + { + "epoch": 0.2667730750178876, + "grad_norm": 0.8871899843215942, + "learning_rate": 9.582542974060152e-06, + "loss": 0.8305, + "step": 4847 + }, + { + "epoch": 0.2668281138202433, + "grad_norm": 0.9003115296363831, + "learning_rate": 9.58236956369346e-06, + "loss": 0.8334, + "step": 4848 + }, + { + "epoch": 0.26688315262259893, + "grad_norm": 1.0149577856063843, + "learning_rate": 9.582196118886909e-06, + "loss": 0.7962, + "step": 4849 + }, + { + "epoch": 0.2669381914249546, + "grad_norm": 0.785214900970459, + "learning_rate": 9.582022639641795e-06, + "loss": 0.7806, + "step": 4850 + }, + { + "epoch": 0.26699323022731025, + "grad_norm": 0.9833952188491821, + "learning_rate": 9.581849125959426e-06, + "loss": 0.7607, + "step": 4851 + }, + { + "epoch": 0.26704826902966594, + "grad_norm": 1.404751181602478, + "learning_rate": 9.581675577841104e-06, + "loss": 0.9046, + "step": 4852 + }, + { + "epoch": 0.2671033078320216, + "grad_norm": 0.791159451007843, + "learning_rate": 9.581501995288137e-06, + "loss": 0.6582, + "step": 4853 + }, + { + "epoch": 0.2671583466343772, + "grad_norm": 0.8507272005081177, + "learning_rate": 9.581328378301827e-06, + "loss": 0.8946, + "step": 4854 + }, + { + "epoch": 0.2672133854367329, + "grad_norm": 0.7372786998748779, + "learning_rate": 9.58115472688348e-06, + "loss": 0.7865, + "step": 4855 + }, + { + "epoch": 0.26726842423908853, + "grad_norm": 0.8293853998184204, + "learning_rate": 9.580981041034398e-06, + "loss": 0.9113, + "step": 4856 + }, + { + "epoch": 0.2673234630414442, + "grad_norm": 0.7212402820587158, + "learning_rate": 9.580807320755889e-06, + "loss": 0.7149, + "step": 4857 + }, + { + "epoch": 0.26737850184379985, + "grad_norm": 0.7885197401046753, + "learning_rate": 9.58063356604926e-06, + "loss": 0.8651, + "step": 4858 + }, + { + "epoch": 0.26743354064615554, + "grad_norm": 0.8444308042526245, + "learning_rate": 9.580459776915814e-06, + "loss": 0.7968, + "step": 4859 + }, + { + "epoch": 0.2674885794485112, + "grad_norm": 0.7974254488945007, + "learning_rate": 9.58028595335686e-06, + "loss": 0.8499, + "step": 4860 + }, + { + "epoch": 0.26754361825086687, + "grad_norm": 0.7491242289543152, + "learning_rate": 9.580112095373702e-06, + "loss": 0.8278, + "step": 4861 + }, + { + "epoch": 0.2675986570532225, + "grad_norm": 0.6856499314308167, + "learning_rate": 9.579938202967646e-06, + "loss": 0.7466, + "step": 4862 + }, + { + "epoch": 0.2676536958555782, + "grad_norm": 0.7347447872161865, + "learning_rate": 9.579764276140002e-06, + "loss": 0.8046, + "step": 4863 + }, + { + "epoch": 0.2677087346579338, + "grad_norm": 0.6797083020210266, + "learning_rate": 9.579590314892077e-06, + "loss": 0.7012, + "step": 4864 + }, + { + "epoch": 0.2677637734602895, + "grad_norm": 0.8219562768936157, + "learning_rate": 9.579416319225175e-06, + "loss": 0.7592, + "step": 4865 + }, + { + "epoch": 0.26781881226264515, + "grad_norm": 0.7388357520103455, + "learning_rate": 9.579242289140607e-06, + "loss": 0.8179, + "step": 4866 + }, + { + "epoch": 0.26787385106500083, + "grad_norm": 0.7394490838050842, + "learning_rate": 9.579068224639679e-06, + "loss": 0.694, + "step": 4867 + }, + { + "epoch": 0.26792888986735647, + "grad_norm": 0.7309017181396484, + "learning_rate": 9.578894125723699e-06, + "loss": 0.7882, + "step": 4868 + }, + { + "epoch": 0.26798392866971216, + "grad_norm": 0.7785035967826843, + "learning_rate": 9.578719992393978e-06, + "loss": 0.8142, + "step": 4869 + }, + { + "epoch": 0.2680389674720678, + "grad_norm": 0.8983079195022583, + "learning_rate": 9.57854582465182e-06, + "loss": 0.7809, + "step": 4870 + }, + { + "epoch": 0.2680940062744235, + "grad_norm": 0.7433765530586243, + "learning_rate": 9.578371622498542e-06, + "loss": 0.8937, + "step": 4871 + }, + { + "epoch": 0.2681490450767791, + "grad_norm": 0.8808667659759521, + "learning_rate": 9.578197385935446e-06, + "loss": 0.7821, + "step": 4872 + }, + { + "epoch": 0.2682040838791348, + "grad_norm": 0.825794517993927, + "learning_rate": 9.578023114963843e-06, + "loss": 0.8228, + "step": 4873 + }, + { + "epoch": 0.26825912268149044, + "grad_norm": 1.0165129899978638, + "learning_rate": 9.577848809585046e-06, + "loss": 0.7964, + "step": 4874 + }, + { + "epoch": 0.2683141614838461, + "grad_norm": 0.742028534412384, + "learning_rate": 9.577674469800362e-06, + "loss": 0.9126, + "step": 4875 + }, + { + "epoch": 0.26836920028620176, + "grad_norm": 0.7571890354156494, + "learning_rate": 9.577500095611101e-06, + "loss": 0.879, + "step": 4876 + }, + { + "epoch": 0.26842423908855745, + "grad_norm": 0.7577160596847534, + "learning_rate": 9.577325687018575e-06, + "loss": 0.8048, + "step": 4877 + }, + { + "epoch": 0.2684792778909131, + "grad_norm": 0.7704411745071411, + "learning_rate": 9.577151244024095e-06, + "loss": 0.7451, + "step": 4878 + }, + { + "epoch": 0.26853431669326877, + "grad_norm": 0.8323166966438293, + "learning_rate": 9.57697676662897e-06, + "loss": 0.7591, + "step": 4879 + }, + { + "epoch": 0.2685893554956244, + "grad_norm": 0.7257028222084045, + "learning_rate": 9.576802254834516e-06, + "loss": 0.7941, + "step": 4880 + }, + { + "epoch": 0.2686443942979801, + "grad_norm": 0.8170442581176758, + "learning_rate": 9.57662770864204e-06, + "loss": 0.8617, + "step": 4881 + }, + { + "epoch": 0.2686994331003357, + "grad_norm": 0.7435339689254761, + "learning_rate": 9.576453128052852e-06, + "loss": 0.7683, + "step": 4882 + }, + { + "epoch": 0.2687544719026914, + "grad_norm": 0.7932955026626587, + "learning_rate": 9.576278513068271e-06, + "loss": 0.7103, + "step": 4883 + }, + { + "epoch": 0.26880951070504705, + "grad_norm": 0.8008469939231873, + "learning_rate": 9.576103863689604e-06, + "loss": 0.8144, + "step": 4884 + }, + { + "epoch": 0.26886454950740274, + "grad_norm": 0.8573774695396423, + "learning_rate": 9.575929179918167e-06, + "loss": 0.8992, + "step": 4885 + }, + { + "epoch": 0.2689195883097584, + "grad_norm": 0.7326993942260742, + "learning_rate": 9.57575446175527e-06, + "loss": 0.699, + "step": 4886 + }, + { + "epoch": 0.26897462711211406, + "grad_norm": 0.8249791264533997, + "learning_rate": 9.575579709202228e-06, + "loss": 0.7445, + "step": 4887 + }, + { + "epoch": 0.2690296659144697, + "grad_norm": 0.7136644124984741, + "learning_rate": 9.575404922260351e-06, + "loss": 0.779, + "step": 4888 + }, + { + "epoch": 0.2690847047168254, + "grad_norm": 1.0130438804626465, + "learning_rate": 9.575230100930958e-06, + "loss": 0.8535, + "step": 4889 + }, + { + "epoch": 0.269139743519181, + "grad_norm": 0.6784926652908325, + "learning_rate": 9.575055245215358e-06, + "loss": 0.6745, + "step": 4890 + }, + { + "epoch": 0.2691947823215367, + "grad_norm": 0.7492508888244629, + "learning_rate": 9.57488035511487e-06, + "loss": 0.6748, + "step": 4891 + }, + { + "epoch": 0.26924982112389234, + "grad_norm": 0.7951217889785767, + "learning_rate": 9.574705430630807e-06, + "loss": 0.8119, + "step": 4892 + }, + { + "epoch": 0.26930485992624803, + "grad_norm": 0.9756677746772766, + "learning_rate": 9.574530471764478e-06, + "loss": 0.855, + "step": 4893 + }, + { + "epoch": 0.26935989872860366, + "grad_norm": 0.7806811928749084, + "learning_rate": 9.574355478517206e-06, + "loss": 0.8432, + "step": 4894 + }, + { + "epoch": 0.26941493753095935, + "grad_norm": 0.7814774513244629, + "learning_rate": 9.574180450890301e-06, + "loss": 0.8226, + "step": 4895 + }, + { + "epoch": 0.269469976333315, + "grad_norm": 0.7745325565338135, + "learning_rate": 9.574005388885081e-06, + "loss": 0.7722, + "step": 4896 + }, + { + "epoch": 0.2695250151356706, + "grad_norm": 0.7805666327476501, + "learning_rate": 9.573830292502862e-06, + "loss": 0.8357, + "step": 4897 + }, + { + "epoch": 0.2695800539380263, + "grad_norm": 0.8428031802177429, + "learning_rate": 9.573655161744958e-06, + "loss": 0.8056, + "step": 4898 + }, + { + "epoch": 0.26963509274038194, + "grad_norm": 0.7896600961685181, + "learning_rate": 9.573479996612684e-06, + "loss": 0.7984, + "step": 4899 + }, + { + "epoch": 0.26969013154273763, + "grad_norm": 0.7718683481216431, + "learning_rate": 9.57330479710736e-06, + "loss": 0.7527, + "step": 4900 + }, + { + "epoch": 0.26974517034509327, + "grad_norm": 0.7868129014968872, + "learning_rate": 9.573129563230302e-06, + "loss": 0.7876, + "step": 4901 + }, + { + "epoch": 0.26980020914744895, + "grad_norm": 0.8493777513504028, + "learning_rate": 9.572954294982826e-06, + "loss": 0.864, + "step": 4902 + }, + { + "epoch": 0.2698552479498046, + "grad_norm": 0.7492502331733704, + "learning_rate": 9.57277899236625e-06, + "loss": 0.8236, + "step": 4903 + }, + { + "epoch": 0.2699102867521603, + "grad_norm": 1.0534250736236572, + "learning_rate": 9.57260365538189e-06, + "loss": 0.8012, + "step": 4904 + }, + { + "epoch": 0.2699653255545159, + "grad_norm": 0.7557470202445984, + "learning_rate": 9.572428284031065e-06, + "loss": 0.9084, + "step": 4905 + }, + { + "epoch": 0.2700203643568716, + "grad_norm": 0.8055123686790466, + "learning_rate": 9.572252878315094e-06, + "loss": 0.7468, + "step": 4906 + }, + { + "epoch": 0.27007540315922723, + "grad_norm": 0.8399039506912231, + "learning_rate": 9.572077438235294e-06, + "loss": 0.9293, + "step": 4907 + }, + { + "epoch": 0.2701304419615829, + "grad_norm": 0.9800041317939758, + "learning_rate": 9.571901963792983e-06, + "loss": 0.8664, + "step": 4908 + }, + { + "epoch": 0.27018548076393856, + "grad_norm": 0.7732129096984863, + "learning_rate": 9.571726454989482e-06, + "loss": 0.7227, + "step": 4909 + }, + { + "epoch": 0.27024051956629425, + "grad_norm": 0.730754017829895, + "learning_rate": 9.571550911826109e-06, + "loss": 0.6467, + "step": 4910 + }, + { + "epoch": 0.2702955583686499, + "grad_norm": 0.8245325684547424, + "learning_rate": 9.57137533430418e-06, + "loss": 0.7847, + "step": 4911 + }, + { + "epoch": 0.27035059717100557, + "grad_norm": 0.8606786131858826, + "learning_rate": 9.57119972242502e-06, + "loss": 0.9556, + "step": 4912 + }, + { + "epoch": 0.2704056359733612, + "grad_norm": 0.7480195164680481, + "learning_rate": 9.571024076189947e-06, + "loss": 0.8504, + "step": 4913 + }, + { + "epoch": 0.2704606747757169, + "grad_norm": 0.718913197517395, + "learning_rate": 9.57084839560028e-06, + "loss": 0.7869, + "step": 4914 + }, + { + "epoch": 0.2705157135780725, + "grad_norm": 0.9778180122375488, + "learning_rate": 9.57067268065734e-06, + "loss": 0.8514, + "step": 4915 + }, + { + "epoch": 0.2705707523804282, + "grad_norm": 0.7394844889640808, + "learning_rate": 9.570496931362448e-06, + "loss": 0.7906, + "step": 4916 + }, + { + "epoch": 0.27062579118278385, + "grad_norm": 0.7648600339889526, + "learning_rate": 9.570321147716923e-06, + "loss": 0.8194, + "step": 4917 + }, + { + "epoch": 0.27068082998513954, + "grad_norm": 0.8002632260322571, + "learning_rate": 9.57014532972209e-06, + "loss": 0.8079, + "step": 4918 + }, + { + "epoch": 0.27073586878749517, + "grad_norm": 0.8668341040611267, + "learning_rate": 9.569969477379267e-06, + "loss": 0.8954, + "step": 4919 + }, + { + "epoch": 0.27079090758985086, + "grad_norm": 0.7403327226638794, + "learning_rate": 9.569793590689775e-06, + "loss": 0.7755, + "step": 4920 + }, + { + "epoch": 0.2708459463922065, + "grad_norm": 0.7399682998657227, + "learning_rate": 9.569617669654938e-06, + "loss": 0.8203, + "step": 4921 + }, + { + "epoch": 0.2709009851945622, + "grad_norm": 0.788600504398346, + "learning_rate": 9.56944171427608e-06, + "loss": 0.7565, + "step": 4922 + }, + { + "epoch": 0.2709560239969178, + "grad_norm": 0.7044861912727356, + "learning_rate": 9.56926572455452e-06, + "loss": 0.7073, + "step": 4923 + }, + { + "epoch": 0.2710110627992735, + "grad_norm": 0.8195114135742188, + "learning_rate": 9.569089700491581e-06, + "loss": 0.8658, + "step": 4924 + }, + { + "epoch": 0.27106610160162914, + "grad_norm": 0.7792258858680725, + "learning_rate": 9.568913642088589e-06, + "loss": 0.8628, + "step": 4925 + }, + { + "epoch": 0.27112114040398483, + "grad_norm": 0.764930248260498, + "learning_rate": 9.568737549346862e-06, + "loss": 0.7761, + "step": 4926 + }, + { + "epoch": 0.27117617920634046, + "grad_norm": 0.7226328253746033, + "learning_rate": 9.56856142226773e-06, + "loss": 0.7208, + "step": 4927 + }, + { + "epoch": 0.27123121800869615, + "grad_norm": 0.8726598620414734, + "learning_rate": 9.568385260852512e-06, + "loss": 0.8599, + "step": 4928 + }, + { + "epoch": 0.2712862568110518, + "grad_norm": 1.0126571655273438, + "learning_rate": 9.568209065102533e-06, + "loss": 0.8145, + "step": 4929 + }, + { + "epoch": 0.2713412956134075, + "grad_norm": 0.7764692306518555, + "learning_rate": 9.568032835019116e-06, + "loss": 0.6758, + "step": 4930 + }, + { + "epoch": 0.2713963344157631, + "grad_norm": 0.6955474019050598, + "learning_rate": 9.567856570603589e-06, + "loss": 0.7461, + "step": 4931 + }, + { + "epoch": 0.2714513732181188, + "grad_norm": 0.7136832475662231, + "learning_rate": 9.567680271857274e-06, + "loss": 0.7692, + "step": 4932 + }, + { + "epoch": 0.27150641202047443, + "grad_norm": 1.2288198471069336, + "learning_rate": 9.567503938781497e-06, + "loss": 0.7815, + "step": 4933 + }, + { + "epoch": 0.2715614508228301, + "grad_norm": 0.9182234406471252, + "learning_rate": 9.567327571377584e-06, + "loss": 0.8822, + "step": 4934 + }, + { + "epoch": 0.27161648962518575, + "grad_norm": 0.7684763669967651, + "learning_rate": 9.567151169646859e-06, + "loss": 0.7618, + "step": 4935 + }, + { + "epoch": 0.27167152842754144, + "grad_norm": 0.872360348701477, + "learning_rate": 9.566974733590647e-06, + "loss": 0.7975, + "step": 4936 + }, + { + "epoch": 0.2717265672298971, + "grad_norm": 0.9010463356971741, + "learning_rate": 9.566798263210277e-06, + "loss": 0.7159, + "step": 4937 + }, + { + "epoch": 0.27178160603225276, + "grad_norm": 0.7254281044006348, + "learning_rate": 9.566621758507072e-06, + "loss": 0.6724, + "step": 4938 + }, + { + "epoch": 0.2718366448346084, + "grad_norm": 0.8478212356567383, + "learning_rate": 9.566445219482363e-06, + "loss": 0.659, + "step": 4939 + }, + { + "epoch": 0.27189168363696403, + "grad_norm": 0.9038714170455933, + "learning_rate": 9.56626864613747e-06, + "loss": 0.8766, + "step": 4940 + }, + { + "epoch": 0.2719467224393197, + "grad_norm": 0.9704582691192627, + "learning_rate": 9.566092038473728e-06, + "loss": 0.8972, + "step": 4941 + }, + { + "epoch": 0.27200176124167535, + "grad_norm": 0.7069430947303772, + "learning_rate": 9.565915396492459e-06, + "loss": 0.8116, + "step": 4942 + }, + { + "epoch": 0.27205680004403104, + "grad_norm": 0.7432642579078674, + "learning_rate": 9.565738720194993e-06, + "loss": 0.847, + "step": 4943 + }, + { + "epoch": 0.2721118388463867, + "grad_norm": 0.6813814043998718, + "learning_rate": 9.565562009582655e-06, + "loss": 0.7146, + "step": 4944 + }, + { + "epoch": 0.27216687764874237, + "grad_norm": 0.7447707056999207, + "learning_rate": 9.565385264656776e-06, + "loss": 0.7696, + "step": 4945 + }, + { + "epoch": 0.272221916451098, + "grad_norm": 0.875073254108429, + "learning_rate": 9.565208485418685e-06, + "loss": 0.8714, + "step": 4946 + }, + { + "epoch": 0.2722769552534537, + "grad_norm": 0.7753880620002747, + "learning_rate": 9.565031671869707e-06, + "loss": 0.739, + "step": 4947 + }, + { + "epoch": 0.2723319940558093, + "grad_norm": 0.749264121055603, + "learning_rate": 9.564854824011172e-06, + "loss": 0.7957, + "step": 4948 + }, + { + "epoch": 0.272387032858165, + "grad_norm": 0.6733991503715515, + "learning_rate": 9.564677941844412e-06, + "loss": 0.7402, + "step": 4949 + }, + { + "epoch": 0.27244207166052065, + "grad_norm": 0.7426447868347168, + "learning_rate": 9.564501025370753e-06, + "loss": 0.7977, + "step": 4950 + }, + { + "epoch": 0.27249711046287634, + "grad_norm": 0.7930514812469482, + "learning_rate": 9.564324074591529e-06, + "loss": 0.8485, + "step": 4951 + }, + { + "epoch": 0.27255214926523197, + "grad_norm": 0.8087072968482971, + "learning_rate": 9.564147089508064e-06, + "loss": 0.9215, + "step": 4952 + }, + { + "epoch": 0.27260718806758766, + "grad_norm": 0.7560327053070068, + "learning_rate": 9.563970070121694e-06, + "loss": 0.7966, + "step": 4953 + }, + { + "epoch": 0.2726622268699433, + "grad_norm": 0.735573947429657, + "learning_rate": 9.563793016433744e-06, + "loss": 0.7737, + "step": 4954 + }, + { + "epoch": 0.272717265672299, + "grad_norm": 0.7603545784950256, + "learning_rate": 9.563615928445548e-06, + "loss": 0.7717, + "step": 4955 + }, + { + "epoch": 0.2727723044746546, + "grad_norm": 0.7185375094413757, + "learning_rate": 9.563438806158437e-06, + "loss": 0.8057, + "step": 4956 + }, + { + "epoch": 0.2728273432770103, + "grad_norm": 0.7619272470474243, + "learning_rate": 9.56326164957374e-06, + "loss": 0.8173, + "step": 4957 + }, + { + "epoch": 0.27288238207936594, + "grad_norm": 0.7868000864982605, + "learning_rate": 9.563084458692793e-06, + "loss": 0.6855, + "step": 4958 + }, + { + "epoch": 0.2729374208817216, + "grad_norm": 0.7949535846710205, + "learning_rate": 9.562907233516923e-06, + "loss": 0.7754, + "step": 4959 + }, + { + "epoch": 0.27299245968407726, + "grad_norm": 0.7037919163703918, + "learning_rate": 9.562729974047462e-06, + "loss": 0.7419, + "step": 4960 + }, + { + "epoch": 0.27304749848643295, + "grad_norm": 0.7236568927764893, + "learning_rate": 9.562552680285746e-06, + "loss": 0.7135, + "step": 4961 + }, + { + "epoch": 0.2731025372887886, + "grad_norm": 0.8410467505455017, + "learning_rate": 9.562375352233105e-06, + "loss": 0.8507, + "step": 4962 + }, + { + "epoch": 0.27315757609114427, + "grad_norm": 0.8043560981750488, + "learning_rate": 9.562197989890871e-06, + "loss": 0.8484, + "step": 4963 + }, + { + "epoch": 0.2732126148934999, + "grad_norm": 0.6926127672195435, + "learning_rate": 9.56202059326038e-06, + "loss": 0.8087, + "step": 4964 + }, + { + "epoch": 0.2732676536958556, + "grad_norm": 0.7149024605751038, + "learning_rate": 9.561843162342961e-06, + "loss": 0.7349, + "step": 4965 + }, + { + "epoch": 0.27332269249821123, + "grad_norm": 0.7165781855583191, + "learning_rate": 9.561665697139952e-06, + "loss": 0.8139, + "step": 4966 + }, + { + "epoch": 0.2733777313005669, + "grad_norm": 0.7481133341789246, + "learning_rate": 9.561488197652684e-06, + "loss": 0.7712, + "step": 4967 + }, + { + "epoch": 0.27343277010292255, + "grad_norm": 0.6928209066390991, + "learning_rate": 9.561310663882491e-06, + "loss": 0.7524, + "step": 4968 + }, + { + "epoch": 0.27348780890527824, + "grad_norm": 0.7397856116294861, + "learning_rate": 9.561133095830708e-06, + "loss": 0.718, + "step": 4969 + }, + { + "epoch": 0.2735428477076339, + "grad_norm": 0.7712383270263672, + "learning_rate": 9.560955493498672e-06, + "loss": 0.8201, + "step": 4970 + }, + { + "epoch": 0.27359788650998956, + "grad_norm": 0.96076899766922, + "learning_rate": 9.560777856887714e-06, + "loss": 0.8555, + "step": 4971 + }, + { + "epoch": 0.2736529253123452, + "grad_norm": 0.7331019639968872, + "learning_rate": 9.56060018599917e-06, + "loss": 0.8315, + "step": 4972 + }, + { + "epoch": 0.2737079641147009, + "grad_norm": 0.7157140970230103, + "learning_rate": 9.560422480834374e-06, + "loss": 0.7177, + "step": 4973 + }, + { + "epoch": 0.2737630029170565, + "grad_norm": 0.807614266872406, + "learning_rate": 9.560244741394666e-06, + "loss": 0.8413, + "step": 4974 + }, + { + "epoch": 0.2738180417194122, + "grad_norm": 0.7618574500083923, + "learning_rate": 9.560066967681378e-06, + "loss": 0.8248, + "step": 4975 + }, + { + "epoch": 0.27387308052176784, + "grad_norm": 0.7886885404586792, + "learning_rate": 9.559889159695848e-06, + "loss": 0.8793, + "step": 4976 + }, + { + "epoch": 0.27392811932412353, + "grad_norm": 1.0090755224227905, + "learning_rate": 9.559711317439411e-06, + "loss": 0.9255, + "step": 4977 + }, + { + "epoch": 0.27398315812647916, + "grad_norm": 0.7855443358421326, + "learning_rate": 9.559533440913405e-06, + "loss": 0.8001, + "step": 4978 + }, + { + "epoch": 0.27403819692883485, + "grad_norm": 0.768741250038147, + "learning_rate": 9.559355530119165e-06, + "loss": 0.8109, + "step": 4979 + }, + { + "epoch": 0.2740932357311905, + "grad_norm": 0.759589672088623, + "learning_rate": 9.55917758505803e-06, + "loss": 0.8001, + "step": 4980 + }, + { + "epoch": 0.2741482745335462, + "grad_norm": 0.7937445640563965, + "learning_rate": 9.558999605731338e-06, + "loss": 0.8924, + "step": 4981 + }, + { + "epoch": 0.2742033133359018, + "grad_norm": 0.9041592478752136, + "learning_rate": 9.558821592140423e-06, + "loss": 0.9167, + "step": 4982 + }, + { + "epoch": 0.27425835213825744, + "grad_norm": 0.6971380710601807, + "learning_rate": 9.558643544286627e-06, + "loss": 0.7589, + "step": 4983 + }, + { + "epoch": 0.27431339094061313, + "grad_norm": 0.9292929172515869, + "learning_rate": 9.558465462171287e-06, + "loss": 0.9566, + "step": 4984 + }, + { + "epoch": 0.27436842974296877, + "grad_norm": 0.8320629000663757, + "learning_rate": 9.558287345795738e-06, + "loss": 0.8854, + "step": 4985 + }, + { + "epoch": 0.27442346854532446, + "grad_norm": 0.797272801399231, + "learning_rate": 9.558109195161325e-06, + "loss": 0.7838, + "step": 4986 + }, + { + "epoch": 0.2744785073476801, + "grad_norm": 0.9702700972557068, + "learning_rate": 9.557931010269382e-06, + "loss": 0.8593, + "step": 4987 + }, + { + "epoch": 0.2745335461500358, + "grad_norm": 0.8309103846549988, + "learning_rate": 9.557752791121248e-06, + "loss": 0.8902, + "step": 4988 + }, + { + "epoch": 0.2745885849523914, + "grad_norm": 0.706667959690094, + "learning_rate": 9.557574537718265e-06, + "loss": 0.7259, + "step": 4989 + }, + { + "epoch": 0.2746436237547471, + "grad_norm": 0.770239531993866, + "learning_rate": 9.557396250061771e-06, + "loss": 0.8644, + "step": 4990 + }, + { + "epoch": 0.27469866255710274, + "grad_norm": 0.8695803880691528, + "learning_rate": 9.557217928153108e-06, + "loss": 0.895, + "step": 4991 + }, + { + "epoch": 0.2747537013594584, + "grad_norm": 0.7525948286056519, + "learning_rate": 9.557039571993614e-06, + "loss": 0.7029, + "step": 4992 + }, + { + "epoch": 0.27480874016181406, + "grad_norm": 0.7616680264472961, + "learning_rate": 9.556861181584631e-06, + "loss": 0.8025, + "step": 4993 + }, + { + "epoch": 0.27486377896416975, + "grad_norm": 0.7216167449951172, + "learning_rate": 9.5566827569275e-06, + "loss": 0.8314, + "step": 4994 + }, + { + "epoch": 0.2749188177665254, + "grad_norm": 0.7412614226341248, + "learning_rate": 9.55650429802356e-06, + "loss": 0.7877, + "step": 4995 + }, + { + "epoch": 0.27497385656888107, + "grad_norm": 0.7176525592803955, + "learning_rate": 9.556325804874154e-06, + "loss": 0.7615, + "step": 4996 + }, + { + "epoch": 0.2750288953712367, + "grad_norm": 0.7544515132904053, + "learning_rate": 9.556147277480623e-06, + "loss": 0.8352, + "step": 4997 + }, + { + "epoch": 0.2750839341735924, + "grad_norm": 0.7318205833435059, + "learning_rate": 9.555968715844309e-06, + "loss": 0.7403, + "step": 4998 + }, + { + "epoch": 0.275138972975948, + "grad_norm": 0.7495027780532837, + "learning_rate": 9.555790119966552e-06, + "loss": 0.7611, + "step": 4999 + }, + { + "epoch": 0.2751940117783037, + "grad_norm": 0.7544401288032532, + "learning_rate": 9.555611489848697e-06, + "loss": 0.8594, + "step": 5000 + }, + { + "epoch": 0.27524905058065935, + "grad_norm": 0.7698250412940979, + "learning_rate": 9.555432825492084e-06, + "loss": 0.8438, + "step": 5001 + }, + { + "epoch": 0.27530408938301504, + "grad_norm": 0.7668892741203308, + "learning_rate": 9.555254126898059e-06, + "loss": 0.8082, + "step": 5002 + }, + { + "epoch": 0.27535912818537067, + "grad_norm": 0.9170669317245483, + "learning_rate": 9.555075394067963e-06, + "loss": 0.7443, + "step": 5003 + }, + { + "epoch": 0.27541416698772636, + "grad_norm": 0.7890255451202393, + "learning_rate": 9.55489662700314e-06, + "loss": 0.8269, + "step": 5004 + }, + { + "epoch": 0.275469205790082, + "grad_norm": 0.6740512847900391, + "learning_rate": 9.554717825704932e-06, + "loss": 0.6906, + "step": 5005 + }, + { + "epoch": 0.2755242445924377, + "grad_norm": 0.8032376170158386, + "learning_rate": 9.554538990174685e-06, + "loss": 0.812, + "step": 5006 + }, + { + "epoch": 0.2755792833947933, + "grad_norm": 0.6932135224342346, + "learning_rate": 9.554360120413741e-06, + "loss": 0.7823, + "step": 5007 + }, + { + "epoch": 0.275634322197149, + "grad_norm": 0.7447643876075745, + "learning_rate": 9.554181216423447e-06, + "loss": 0.8753, + "step": 5008 + }, + { + "epoch": 0.27568936099950464, + "grad_norm": 0.8035081624984741, + "learning_rate": 9.554002278205145e-06, + "loss": 0.7135, + "step": 5009 + }, + { + "epoch": 0.27574439980186033, + "grad_norm": 0.7544171214103699, + "learning_rate": 9.553823305760182e-06, + "loss": 0.7574, + "step": 5010 + }, + { + "epoch": 0.27579943860421596, + "grad_norm": 0.6648419499397278, + "learning_rate": 9.553644299089902e-06, + "loss": 0.7566, + "step": 5011 + }, + { + "epoch": 0.27585447740657165, + "grad_norm": 0.7481752038002014, + "learning_rate": 9.55346525819565e-06, + "loss": 0.7862, + "step": 5012 + }, + { + "epoch": 0.2759095162089273, + "grad_norm": 0.7000668048858643, + "learning_rate": 9.55328618307877e-06, + "loss": 0.7767, + "step": 5013 + }, + { + "epoch": 0.275964555011283, + "grad_norm": 0.7435166239738464, + "learning_rate": 9.553107073740612e-06, + "loss": 0.6888, + "step": 5014 + }, + { + "epoch": 0.2760195938136386, + "grad_norm": 0.7593170404434204, + "learning_rate": 9.552927930182521e-06, + "loss": 0.7272, + "step": 5015 + }, + { + "epoch": 0.2760746326159943, + "grad_norm": 0.870079755783081, + "learning_rate": 9.55274875240584e-06, + "loss": 0.8692, + "step": 5016 + }, + { + "epoch": 0.27612967141834993, + "grad_norm": 0.8550307750701904, + "learning_rate": 9.55256954041192e-06, + "loss": 0.8729, + "step": 5017 + }, + { + "epoch": 0.2761847102207056, + "grad_norm": 0.888830304145813, + "learning_rate": 9.552390294202105e-06, + "loss": 0.8607, + "step": 5018 + }, + { + "epoch": 0.27623974902306125, + "grad_norm": 0.8295729160308838, + "learning_rate": 9.552211013777743e-06, + "loss": 0.8722, + "step": 5019 + }, + { + "epoch": 0.27629478782541694, + "grad_norm": 0.7732356190681458, + "learning_rate": 9.552031699140182e-06, + "loss": 0.8332, + "step": 5020 + }, + { + "epoch": 0.2763498266277726, + "grad_norm": 0.9132987856864929, + "learning_rate": 9.55185235029077e-06, + "loss": 0.769, + "step": 5021 + }, + { + "epoch": 0.27640486543012827, + "grad_norm": 0.7221076488494873, + "learning_rate": 9.551672967230851e-06, + "loss": 0.8505, + "step": 5022 + }, + { + "epoch": 0.2764599042324839, + "grad_norm": 0.8526949882507324, + "learning_rate": 9.551493549961778e-06, + "loss": 0.8002, + "step": 5023 + }, + { + "epoch": 0.2765149430348396, + "grad_norm": 0.9513188004493713, + "learning_rate": 9.551314098484901e-06, + "loss": 0.8558, + "step": 5024 + }, + { + "epoch": 0.2765699818371952, + "grad_norm": 0.7543003559112549, + "learning_rate": 9.551134612801563e-06, + "loss": 0.8292, + "step": 5025 + }, + { + "epoch": 0.27662502063955086, + "grad_norm": 0.7531017065048218, + "learning_rate": 9.550955092913115e-06, + "loss": 0.7837, + "step": 5026 + }, + { + "epoch": 0.27668005944190655, + "grad_norm": 0.8725717663764954, + "learning_rate": 9.550775538820907e-06, + "loss": 0.8362, + "step": 5027 + }, + { + "epoch": 0.2767350982442622, + "grad_norm": 0.8122721910476685, + "learning_rate": 9.550595950526288e-06, + "loss": 0.8539, + "step": 5028 + }, + { + "epoch": 0.27679013704661787, + "grad_norm": 0.7756829261779785, + "learning_rate": 9.550416328030608e-06, + "loss": 0.787, + "step": 5029 + }, + { + "epoch": 0.2768451758489735, + "grad_norm": 0.9086001515388489, + "learning_rate": 9.550236671335218e-06, + "loss": 0.7972, + "step": 5030 + }, + { + "epoch": 0.2769002146513292, + "grad_norm": 0.7857060432434082, + "learning_rate": 9.550056980441466e-06, + "loss": 0.7577, + "step": 5031 + }, + { + "epoch": 0.2769552534536848, + "grad_norm": 0.8190392851829529, + "learning_rate": 9.549877255350703e-06, + "loss": 0.81, + "step": 5032 + }, + { + "epoch": 0.2770102922560405, + "grad_norm": 0.7714588642120361, + "learning_rate": 9.549697496064283e-06, + "loss": 0.7916, + "step": 5033 + }, + { + "epoch": 0.27706533105839615, + "grad_norm": 0.7178533673286438, + "learning_rate": 9.549517702583552e-06, + "loss": 0.8001, + "step": 5034 + }, + { + "epoch": 0.27712036986075184, + "grad_norm": 0.7552955150604248, + "learning_rate": 9.549337874909865e-06, + "loss": 0.8361, + "step": 5035 + }, + { + "epoch": 0.27717540866310747, + "grad_norm": 0.7823992371559143, + "learning_rate": 9.549158013044573e-06, + "loss": 0.7033, + "step": 5036 + }, + { + "epoch": 0.27723044746546316, + "grad_norm": 0.731504499912262, + "learning_rate": 9.548978116989026e-06, + "loss": 0.73, + "step": 5037 + }, + { + "epoch": 0.2772854862678188, + "grad_norm": 0.7455994486808777, + "learning_rate": 9.548798186744578e-06, + "loss": 0.8005, + "step": 5038 + }, + { + "epoch": 0.2773405250701745, + "grad_norm": 0.7020164728164673, + "learning_rate": 9.54861822231258e-06, + "loss": 0.6707, + "step": 5039 + }, + { + "epoch": 0.2773955638725301, + "grad_norm": 0.7526360750198364, + "learning_rate": 9.548438223694385e-06, + "loss": 0.7686, + "step": 5040 + }, + { + "epoch": 0.2774506026748858, + "grad_norm": 0.7268579006195068, + "learning_rate": 9.548258190891344e-06, + "loss": 0.7039, + "step": 5041 + }, + { + "epoch": 0.27750564147724144, + "grad_norm": 0.9361631274223328, + "learning_rate": 9.548078123904815e-06, + "loss": 0.8023, + "step": 5042 + }, + { + "epoch": 0.2775606802795971, + "grad_norm": 0.7786710262298584, + "learning_rate": 9.547898022736147e-06, + "loss": 0.6866, + "step": 5043 + }, + { + "epoch": 0.27761571908195276, + "grad_norm": 0.7175624370574951, + "learning_rate": 9.547717887386695e-06, + "loss": 0.7554, + "step": 5044 + }, + { + "epoch": 0.27767075788430845, + "grad_norm": 0.9157657623291016, + "learning_rate": 9.547537717857813e-06, + "loss": 0.7936, + "step": 5045 + }, + { + "epoch": 0.2777257966866641, + "grad_norm": 0.7881377935409546, + "learning_rate": 9.547357514150854e-06, + "loss": 0.8198, + "step": 5046 + }, + { + "epoch": 0.2777808354890198, + "grad_norm": 1.0444039106369019, + "learning_rate": 9.547177276267173e-06, + "loss": 0.7954, + "step": 5047 + }, + { + "epoch": 0.2778358742913754, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.546997004208124e-06, + "loss": 0.7697, + "step": 5048 + }, + { + "epoch": 0.2778909130937311, + "grad_norm": 0.7304134368896484, + "learning_rate": 9.546816697975066e-06, + "loss": 0.7034, + "step": 5049 + }, + { + "epoch": 0.27794595189608673, + "grad_norm": 0.7783082723617554, + "learning_rate": 9.546636357569347e-06, + "loss": 0.8185, + "step": 5050 + }, + { + "epoch": 0.2780009906984424, + "grad_norm": 0.750712513923645, + "learning_rate": 9.54645598299233e-06, + "loss": 0.7336, + "step": 5051 + }, + { + "epoch": 0.27805602950079805, + "grad_norm": 0.7849590182304382, + "learning_rate": 9.546275574245364e-06, + "loss": 0.8088, + "step": 5052 + }, + { + "epoch": 0.27811106830315374, + "grad_norm": 0.8490208983421326, + "learning_rate": 9.546095131329809e-06, + "loss": 0.8507, + "step": 5053 + }, + { + "epoch": 0.2781661071055094, + "grad_norm": 0.8107250928878784, + "learning_rate": 9.54591465424702e-06, + "loss": 0.7787, + "step": 5054 + }, + { + "epoch": 0.27822114590786506, + "grad_norm": 0.8278594613075256, + "learning_rate": 9.54573414299835e-06, + "loss": 0.7836, + "step": 5055 + }, + { + "epoch": 0.2782761847102207, + "grad_norm": 0.7982015013694763, + "learning_rate": 9.545553597585163e-06, + "loss": 0.7672, + "step": 5056 + }, + { + "epoch": 0.2783312235125764, + "grad_norm": 0.7311522364616394, + "learning_rate": 9.54537301800881e-06, + "loss": 0.7571, + "step": 5057 + }, + { + "epoch": 0.278386262314932, + "grad_norm": 0.8039999604225159, + "learning_rate": 9.545192404270651e-06, + "loss": 0.764, + "step": 5058 + }, + { + "epoch": 0.2784413011172877, + "grad_norm": 0.7810946702957153, + "learning_rate": 9.545011756372042e-06, + "loss": 0.9217, + "step": 5059 + }, + { + "epoch": 0.27849633991964334, + "grad_norm": 0.7092248797416687, + "learning_rate": 9.544831074314343e-06, + "loss": 0.7599, + "step": 5060 + }, + { + "epoch": 0.27855137872199903, + "grad_norm": 0.831550657749176, + "learning_rate": 9.544650358098908e-06, + "loss": 0.7278, + "step": 5061 + }, + { + "epoch": 0.27860641752435467, + "grad_norm": 0.7645474076271057, + "learning_rate": 9.544469607727098e-06, + "loss": 0.7945, + "step": 5062 + }, + { + "epoch": 0.27866145632671036, + "grad_norm": 0.6956788301467896, + "learning_rate": 9.544288823200273e-06, + "loss": 0.749, + "step": 5063 + }, + { + "epoch": 0.278716495129066, + "grad_norm": 0.7262974381446838, + "learning_rate": 9.544108004519786e-06, + "loss": 0.8074, + "step": 5064 + }, + { + "epoch": 0.2787715339314217, + "grad_norm": 0.7439202666282654, + "learning_rate": 9.543927151687001e-06, + "loss": 0.9403, + "step": 5065 + }, + { + "epoch": 0.2788265727337773, + "grad_norm": 0.8468778133392334, + "learning_rate": 9.543746264703277e-06, + "loss": 0.8182, + "step": 5066 + }, + { + "epoch": 0.278881611536133, + "grad_norm": 0.8396204113960266, + "learning_rate": 9.54356534356997e-06, + "loss": 0.8067, + "step": 5067 + }, + { + "epoch": 0.27893665033848863, + "grad_norm": 0.718758225440979, + "learning_rate": 9.543384388288445e-06, + "loss": 0.8172, + "step": 5068 + }, + { + "epoch": 0.27899168914084427, + "grad_norm": 0.7562685012817383, + "learning_rate": 9.543203398860056e-06, + "loss": 0.9053, + "step": 5069 + }, + { + "epoch": 0.27904672794319996, + "grad_norm": 0.9592792987823486, + "learning_rate": 9.543022375286169e-06, + "loss": 0.9375, + "step": 5070 + }, + { + "epoch": 0.2791017667455556, + "grad_norm": 0.7162739634513855, + "learning_rate": 9.54284131756814e-06, + "loss": 0.7297, + "step": 5071 + }, + { + "epoch": 0.2791568055479113, + "grad_norm": 0.7703517079353333, + "learning_rate": 9.542660225707335e-06, + "loss": 0.8863, + "step": 5072 + }, + { + "epoch": 0.2792118443502669, + "grad_norm": 0.7860418558120728, + "learning_rate": 9.542479099705109e-06, + "loss": 0.8335, + "step": 5073 + }, + { + "epoch": 0.2792668831526226, + "grad_norm": 0.8880825042724609, + "learning_rate": 9.542297939562825e-06, + "loss": 0.8344, + "step": 5074 + }, + { + "epoch": 0.27932192195497824, + "grad_norm": 0.7900505661964417, + "learning_rate": 9.542116745281849e-06, + "loss": 0.7613, + "step": 5075 + }, + { + "epoch": 0.2793769607573339, + "grad_norm": 0.7446081042289734, + "learning_rate": 9.541935516863536e-06, + "loss": 0.6615, + "step": 5076 + }, + { + "epoch": 0.27943199955968956, + "grad_norm": 0.7831308245658875, + "learning_rate": 9.541754254309254e-06, + "loss": 0.779, + "step": 5077 + }, + { + "epoch": 0.27948703836204525, + "grad_norm": 0.9007606506347656, + "learning_rate": 9.541572957620361e-06, + "loss": 0.8883, + "step": 5078 + }, + { + "epoch": 0.2795420771644009, + "grad_norm": 0.8033407330513, + "learning_rate": 9.541391626798222e-06, + "loss": 0.7354, + "step": 5079 + }, + { + "epoch": 0.27959711596675657, + "grad_norm": 0.9259470105171204, + "learning_rate": 9.5412102618442e-06, + "loss": 0.7602, + "step": 5080 + }, + { + "epoch": 0.2796521547691122, + "grad_norm": 0.786523163318634, + "learning_rate": 9.541028862759656e-06, + "loss": 0.7402, + "step": 5081 + }, + { + "epoch": 0.2797071935714679, + "grad_norm": 0.8053372502326965, + "learning_rate": 9.540847429545954e-06, + "loss": 0.825, + "step": 5082 + }, + { + "epoch": 0.2797622323738235, + "grad_norm": 0.8578022122383118, + "learning_rate": 9.54066596220446e-06, + "loss": 0.7866, + "step": 5083 + }, + { + "epoch": 0.2798172711761792, + "grad_norm": 0.916161835193634, + "learning_rate": 9.540484460736535e-06, + "loss": 0.5961, + "step": 5084 + }, + { + "epoch": 0.27987230997853485, + "grad_norm": 0.7843562960624695, + "learning_rate": 9.540302925143545e-06, + "loss": 0.764, + "step": 5085 + }, + { + "epoch": 0.27992734878089054, + "grad_norm": 0.7392510771751404, + "learning_rate": 9.540121355426852e-06, + "loss": 0.8038, + "step": 5086 + }, + { + "epoch": 0.2799823875832462, + "grad_norm": 0.7406296133995056, + "learning_rate": 9.539939751587825e-06, + "loss": 0.8202, + "step": 5087 + }, + { + "epoch": 0.28003742638560186, + "grad_norm": 0.7274924516677856, + "learning_rate": 9.539758113627823e-06, + "loss": 0.7691, + "step": 5088 + }, + { + "epoch": 0.2800924651879575, + "grad_norm": 0.8563184142112732, + "learning_rate": 9.539576441548218e-06, + "loss": 0.8341, + "step": 5089 + }, + { + "epoch": 0.2801475039903132, + "grad_norm": 0.7708351016044617, + "learning_rate": 9.539394735350366e-06, + "loss": 0.7126, + "step": 5090 + }, + { + "epoch": 0.2802025427926688, + "grad_norm": 0.7314836382865906, + "learning_rate": 9.539212995035642e-06, + "loss": 0.7465, + "step": 5091 + }, + { + "epoch": 0.2802575815950245, + "grad_norm": 0.7594754695892334, + "learning_rate": 9.539031220605409e-06, + "loss": 0.7563, + "step": 5092 + }, + { + "epoch": 0.28031262039738014, + "grad_norm": 0.699414074420929, + "learning_rate": 9.53884941206103e-06, + "loss": 0.7847, + "step": 5093 + }, + { + "epoch": 0.28036765919973583, + "grad_norm": 0.8013063073158264, + "learning_rate": 9.538667569403877e-06, + "loss": 0.7769, + "step": 5094 + }, + { + "epoch": 0.28042269800209146, + "grad_norm": 0.7778805494308472, + "learning_rate": 9.538485692635312e-06, + "loss": 0.7646, + "step": 5095 + }, + { + "epoch": 0.28047773680444715, + "grad_norm": 0.785649299621582, + "learning_rate": 9.538303781756702e-06, + "loss": 0.8162, + "step": 5096 + }, + { + "epoch": 0.2805327756068028, + "grad_norm": 0.7073212265968323, + "learning_rate": 9.538121836769417e-06, + "loss": 0.7208, + "step": 5097 + }, + { + "epoch": 0.2805878144091585, + "grad_norm": 0.7545642852783203, + "learning_rate": 9.53793985767482e-06, + "loss": 0.8673, + "step": 5098 + }, + { + "epoch": 0.2806428532115141, + "grad_norm": 0.6818416118621826, + "learning_rate": 9.537757844474285e-06, + "loss": 0.7576, + "step": 5099 + }, + { + "epoch": 0.2806978920138698, + "grad_norm": 0.6718038320541382, + "learning_rate": 9.537575797169176e-06, + "loss": 0.6683, + "step": 5100 + }, + { + "epoch": 0.28075293081622543, + "grad_norm": 0.7851004600524902, + "learning_rate": 9.53739371576086e-06, + "loss": 0.8871, + "step": 5101 + }, + { + "epoch": 0.2808079696185811, + "grad_norm": 0.7565650343894958, + "learning_rate": 9.53721160025071e-06, + "loss": 0.8799, + "step": 5102 + }, + { + "epoch": 0.28086300842093676, + "grad_norm": 0.7522932887077332, + "learning_rate": 9.537029450640091e-06, + "loss": 0.838, + "step": 5103 + }, + { + "epoch": 0.28091804722329244, + "grad_norm": 0.929634690284729, + "learning_rate": 9.536847266930375e-06, + "loss": 0.7997, + "step": 5104 + }, + { + "epoch": 0.2809730860256481, + "grad_norm": 0.8050084710121155, + "learning_rate": 9.536665049122928e-06, + "loss": 0.7652, + "step": 5105 + }, + { + "epoch": 0.28102812482800377, + "grad_norm": 0.7401233315467834, + "learning_rate": 9.53648279721912e-06, + "loss": 0.7904, + "step": 5106 + }, + { + "epoch": 0.2810831636303594, + "grad_norm": 0.7125453948974609, + "learning_rate": 9.536300511220322e-06, + "loss": 0.7349, + "step": 5107 + }, + { + "epoch": 0.2811382024327151, + "grad_norm": 0.7165758609771729, + "learning_rate": 9.536118191127905e-06, + "loss": 0.7314, + "step": 5108 + }, + { + "epoch": 0.2811932412350707, + "grad_norm": 0.7507439851760864, + "learning_rate": 9.535935836943237e-06, + "loss": 0.7603, + "step": 5109 + }, + { + "epoch": 0.2812482800374264, + "grad_norm": 0.7832109332084656, + "learning_rate": 9.535753448667688e-06, + "loss": 0.7279, + "step": 5110 + }, + { + "epoch": 0.28130331883978205, + "grad_norm": 0.7346609234809875, + "learning_rate": 9.535571026302633e-06, + "loss": 0.6882, + "step": 5111 + }, + { + "epoch": 0.2813583576421377, + "grad_norm": 0.7569608688354492, + "learning_rate": 9.535388569849437e-06, + "loss": 0.8451, + "step": 5112 + }, + { + "epoch": 0.28141339644449337, + "grad_norm": 0.7319865822792053, + "learning_rate": 9.535206079309478e-06, + "loss": 0.8161, + "step": 5113 + }, + { + "epoch": 0.281468435246849, + "grad_norm": 0.7744631171226501, + "learning_rate": 9.535023554684122e-06, + "loss": 0.8025, + "step": 5114 + }, + { + "epoch": 0.2815234740492047, + "grad_norm": 0.6867525577545166, + "learning_rate": 9.534840995974743e-06, + "loss": 0.7693, + "step": 5115 + }, + { + "epoch": 0.2815785128515603, + "grad_norm": 0.7625848054885864, + "learning_rate": 9.534658403182715e-06, + "loss": 0.8034, + "step": 5116 + }, + { + "epoch": 0.281633551653916, + "grad_norm": 0.7369832992553711, + "learning_rate": 9.534475776309406e-06, + "loss": 0.873, + "step": 5117 + }, + { + "epoch": 0.28168859045627165, + "grad_norm": 0.7267127633094788, + "learning_rate": 9.534293115356191e-06, + "loss": 0.7954, + "step": 5118 + }, + { + "epoch": 0.28174362925862734, + "grad_norm": 0.7244247794151306, + "learning_rate": 9.534110420324443e-06, + "loss": 0.7784, + "step": 5119 + }, + { + "epoch": 0.28179866806098297, + "grad_norm": 0.8207812905311584, + "learning_rate": 9.533927691215534e-06, + "loss": 0.8696, + "step": 5120 + }, + { + "epoch": 0.28185370686333866, + "grad_norm": 0.8669891357421875, + "learning_rate": 9.53374492803084e-06, + "loss": 0.8203, + "step": 5121 + }, + { + "epoch": 0.2819087456656943, + "grad_norm": 0.7650816440582275, + "learning_rate": 9.533562130771732e-06, + "loss": 0.77, + "step": 5122 + }, + { + "epoch": 0.28196378446805, + "grad_norm": 0.7664972543716431, + "learning_rate": 9.533379299439584e-06, + "loss": 0.7187, + "step": 5123 + }, + { + "epoch": 0.2820188232704056, + "grad_norm": 0.7921896576881409, + "learning_rate": 9.533196434035772e-06, + "loss": 0.8669, + "step": 5124 + }, + { + "epoch": 0.2820738620727613, + "grad_norm": 0.7714456915855408, + "learning_rate": 9.533013534561669e-06, + "loss": 0.8783, + "step": 5125 + }, + { + "epoch": 0.28212890087511694, + "grad_norm": 0.7222065329551697, + "learning_rate": 9.532830601018648e-06, + "loss": 0.7449, + "step": 5126 + }, + { + "epoch": 0.28218393967747263, + "grad_norm": 0.718142569065094, + "learning_rate": 9.532647633408085e-06, + "loss": 0.8226, + "step": 5127 + }, + { + "epoch": 0.28223897847982826, + "grad_norm": 0.730592668056488, + "learning_rate": 9.532464631731357e-06, + "loss": 0.7878, + "step": 5128 + }, + { + "epoch": 0.28229401728218395, + "grad_norm": 0.7841802835464478, + "learning_rate": 9.532281595989839e-06, + "loss": 0.8262, + "step": 5129 + }, + { + "epoch": 0.2823490560845396, + "grad_norm": 0.8617212772369385, + "learning_rate": 9.532098526184904e-06, + "loss": 0.8368, + "step": 5130 + }, + { + "epoch": 0.2824040948868953, + "grad_norm": 0.6968556642532349, + "learning_rate": 9.53191542231793e-06, + "loss": 0.6848, + "step": 5131 + }, + { + "epoch": 0.2824591336892509, + "grad_norm": 0.7872157096862793, + "learning_rate": 9.531732284390294e-06, + "loss": 0.7898, + "step": 5132 + }, + { + "epoch": 0.2825141724916066, + "grad_norm": 0.7727276086807251, + "learning_rate": 9.53154911240337e-06, + "loss": 0.8506, + "step": 5133 + }, + { + "epoch": 0.28256921129396223, + "grad_norm": 0.7279896140098572, + "learning_rate": 9.531365906358536e-06, + "loss": 0.7415, + "step": 5134 + }, + { + "epoch": 0.2826242500963179, + "grad_norm": 0.7457457780838013, + "learning_rate": 9.53118266625717e-06, + "loss": 0.7652, + "step": 5135 + }, + { + "epoch": 0.28267928889867355, + "grad_norm": 0.8989270329475403, + "learning_rate": 9.530999392100646e-06, + "loss": 0.9085, + "step": 5136 + }, + { + "epoch": 0.28273432770102924, + "grad_norm": 0.9622626304626465, + "learning_rate": 9.530816083890347e-06, + "loss": 0.8726, + "step": 5137 + }, + { + "epoch": 0.2827893665033849, + "grad_norm": 0.7712846994400024, + "learning_rate": 9.530632741627643e-06, + "loss": 0.765, + "step": 5138 + }, + { + "epoch": 0.28284440530574056, + "grad_norm": 0.8320727348327637, + "learning_rate": 9.530449365313918e-06, + "loss": 0.7828, + "step": 5139 + }, + { + "epoch": 0.2828994441080962, + "grad_norm": 0.9310963153839111, + "learning_rate": 9.530265954950549e-06, + "loss": 0.8482, + "step": 5140 + }, + { + "epoch": 0.2829544829104519, + "grad_norm": 0.9984502792358398, + "learning_rate": 9.530082510538914e-06, + "loss": 0.8673, + "step": 5141 + }, + { + "epoch": 0.2830095217128075, + "grad_norm": 0.8300992250442505, + "learning_rate": 9.52989903208039e-06, + "loss": 0.8232, + "step": 5142 + }, + { + "epoch": 0.2830645605151632, + "grad_norm": 0.930052638053894, + "learning_rate": 9.529715519576356e-06, + "loss": 0.7766, + "step": 5143 + }, + { + "epoch": 0.28311959931751884, + "grad_norm": 0.8038359880447388, + "learning_rate": 9.529531973028194e-06, + "loss": 0.712, + "step": 5144 + }, + { + "epoch": 0.28317463811987453, + "grad_norm": 0.856250524520874, + "learning_rate": 9.529348392437283e-06, + "loss": 0.8578, + "step": 5145 + }, + { + "epoch": 0.28322967692223017, + "grad_norm": 0.7602483630180359, + "learning_rate": 9.529164777805002e-06, + "loss": 0.749, + "step": 5146 + }, + { + "epoch": 0.28328471572458586, + "grad_norm": 0.8946549892425537, + "learning_rate": 9.52898112913273e-06, + "loss": 0.8101, + "step": 5147 + }, + { + "epoch": 0.2833397545269415, + "grad_norm": 0.8015615344047546, + "learning_rate": 9.52879744642185e-06, + "loss": 0.8203, + "step": 5148 + }, + { + "epoch": 0.2833947933292972, + "grad_norm": 0.7767183780670166, + "learning_rate": 9.528613729673738e-06, + "loss": 0.8409, + "step": 5149 + }, + { + "epoch": 0.2834498321316528, + "grad_norm": 0.7604000568389893, + "learning_rate": 9.52842997888978e-06, + "loss": 0.8853, + "step": 5150 + }, + { + "epoch": 0.2835048709340085, + "grad_norm": 0.7079401016235352, + "learning_rate": 9.528246194071353e-06, + "loss": 0.6855, + "step": 5151 + }, + { + "epoch": 0.28355990973636414, + "grad_norm": 0.7616782188415527, + "learning_rate": 9.52806237521984e-06, + "loss": 0.785, + "step": 5152 + }, + { + "epoch": 0.2836149485387198, + "grad_norm": 0.7408583760261536, + "learning_rate": 9.527878522336622e-06, + "loss": 0.7105, + "step": 5153 + }, + { + "epoch": 0.28366998734107546, + "grad_norm": 0.694821834564209, + "learning_rate": 9.52769463542308e-06, + "loss": 0.6552, + "step": 5154 + }, + { + "epoch": 0.2837250261434311, + "grad_norm": 0.796925961971283, + "learning_rate": 9.5275107144806e-06, + "loss": 0.7122, + "step": 5155 + }, + { + "epoch": 0.2837800649457868, + "grad_norm": 0.8001971244812012, + "learning_rate": 9.527326759510558e-06, + "loss": 0.8528, + "step": 5156 + }, + { + "epoch": 0.2838351037481424, + "grad_norm": 0.8605831265449524, + "learning_rate": 9.527142770514341e-06, + "loss": 0.7948, + "step": 5157 + }, + { + "epoch": 0.2838901425504981, + "grad_norm": 0.8380078077316284, + "learning_rate": 9.526958747493334e-06, + "loss": 0.8184, + "step": 5158 + }, + { + "epoch": 0.28394518135285374, + "grad_norm": 0.8758485317230225, + "learning_rate": 9.526774690448913e-06, + "loss": 0.7625, + "step": 5159 + }, + { + "epoch": 0.2840002201552094, + "grad_norm": 0.7078989744186401, + "learning_rate": 9.526590599382466e-06, + "loss": 0.8179, + "step": 5160 + }, + { + "epoch": 0.28405525895756506, + "grad_norm": 0.6668990850448608, + "learning_rate": 9.526406474295376e-06, + "loss": 0.7169, + "step": 5161 + }, + { + "epoch": 0.28411029775992075, + "grad_norm": 0.7666084170341492, + "learning_rate": 9.526222315189026e-06, + "loss": 0.8511, + "step": 5162 + }, + { + "epoch": 0.2841653365622764, + "grad_norm": 0.7390545606613159, + "learning_rate": 9.526038122064802e-06, + "loss": 0.7926, + "step": 5163 + }, + { + "epoch": 0.28422037536463207, + "grad_norm": 0.7972092032432556, + "learning_rate": 9.525853894924086e-06, + "loss": 0.9166, + "step": 5164 + }, + { + "epoch": 0.2842754141669877, + "grad_norm": 0.8988455533981323, + "learning_rate": 9.525669633768265e-06, + "loss": 0.9497, + "step": 5165 + }, + { + "epoch": 0.2843304529693434, + "grad_norm": 0.7092710137367249, + "learning_rate": 9.525485338598722e-06, + "loss": 0.7241, + "step": 5166 + }, + { + "epoch": 0.28438549177169903, + "grad_norm": 0.8630063533782959, + "learning_rate": 9.525301009416843e-06, + "loss": 0.8318, + "step": 5167 + }, + { + "epoch": 0.2844405305740547, + "grad_norm": 0.7336890697479248, + "learning_rate": 9.52511664622401e-06, + "loss": 0.7077, + "step": 5168 + }, + { + "epoch": 0.28449556937641035, + "grad_norm": 0.8156722784042358, + "learning_rate": 9.524932249021615e-06, + "loss": 0.8573, + "step": 5169 + }, + { + "epoch": 0.28455060817876604, + "grad_norm": 0.7061388492584229, + "learning_rate": 9.524747817811038e-06, + "loss": 0.7432, + "step": 5170 + }, + { + "epoch": 0.2846056469811217, + "grad_norm": 0.7948413491249084, + "learning_rate": 9.52456335259367e-06, + "loss": 0.8082, + "step": 5171 + }, + { + "epoch": 0.28466068578347736, + "grad_norm": 0.7208091020584106, + "learning_rate": 9.524378853370893e-06, + "loss": 0.7027, + "step": 5172 + }, + { + "epoch": 0.284715724585833, + "grad_norm": 0.8377540111541748, + "learning_rate": 9.524194320144096e-06, + "loss": 0.7093, + "step": 5173 + }, + { + "epoch": 0.2847707633881887, + "grad_norm": 0.8734563589096069, + "learning_rate": 9.524009752914666e-06, + "loss": 0.8422, + "step": 5174 + }, + { + "epoch": 0.2848258021905443, + "grad_norm": 0.7303940653800964, + "learning_rate": 9.523825151683989e-06, + "loss": 0.811, + "step": 5175 + }, + { + "epoch": 0.2848808409929, + "grad_norm": 0.7653842568397522, + "learning_rate": 9.523640516453455e-06, + "loss": 0.8595, + "step": 5176 + }, + { + "epoch": 0.28493587979525564, + "grad_norm": 0.7366930246353149, + "learning_rate": 9.523455847224448e-06, + "loss": 0.7832, + "step": 5177 + }, + { + "epoch": 0.28499091859761133, + "grad_norm": 0.7908505797386169, + "learning_rate": 9.523271143998357e-06, + "loss": 0.8115, + "step": 5178 + }, + { + "epoch": 0.28504595739996696, + "grad_norm": 0.8176048398017883, + "learning_rate": 9.523086406776572e-06, + "loss": 0.8377, + "step": 5179 + }, + { + "epoch": 0.28510099620232265, + "grad_norm": 0.724086344242096, + "learning_rate": 9.52290163556048e-06, + "loss": 0.7804, + "step": 5180 + }, + { + "epoch": 0.2851560350046783, + "grad_norm": 0.6461299657821655, + "learning_rate": 9.52271683035147e-06, + "loss": 0.5727, + "step": 5181 + }, + { + "epoch": 0.285211073807034, + "grad_norm": 0.7275353074073792, + "learning_rate": 9.522531991150932e-06, + "loss": 0.8345, + "step": 5182 + }, + { + "epoch": 0.2852661126093896, + "grad_norm": 0.7321951985359192, + "learning_rate": 9.522347117960253e-06, + "loss": 0.8832, + "step": 5183 + }, + { + "epoch": 0.2853211514117453, + "grad_norm": 0.7526552677154541, + "learning_rate": 9.522162210780825e-06, + "loss": 0.831, + "step": 5184 + }, + { + "epoch": 0.28537619021410093, + "grad_norm": 0.7592381238937378, + "learning_rate": 9.521977269614036e-06, + "loss": 0.7293, + "step": 5185 + }, + { + "epoch": 0.2854312290164566, + "grad_norm": 0.8060448169708252, + "learning_rate": 9.521792294461274e-06, + "loss": 0.819, + "step": 5186 + }, + { + "epoch": 0.28548626781881226, + "grad_norm": 0.7178553342819214, + "learning_rate": 9.521607285323932e-06, + "loss": 0.7526, + "step": 5187 + }, + { + "epoch": 0.28554130662116795, + "grad_norm": 0.8186969757080078, + "learning_rate": 9.521422242203401e-06, + "loss": 0.8526, + "step": 5188 + }, + { + "epoch": 0.2855963454235236, + "grad_norm": 0.8480883240699768, + "learning_rate": 9.521237165101071e-06, + "loss": 0.8088, + "step": 5189 + }, + { + "epoch": 0.28565138422587927, + "grad_norm": 0.8053719401359558, + "learning_rate": 9.521052054018333e-06, + "loss": 0.928, + "step": 5190 + }, + { + "epoch": 0.2857064230282349, + "grad_norm": 0.6937163472175598, + "learning_rate": 9.52086690895658e-06, + "loss": 0.7418, + "step": 5191 + }, + { + "epoch": 0.2857614618305906, + "grad_norm": 1.0616179704666138, + "learning_rate": 9.520681729917196e-06, + "loss": 0.8726, + "step": 5192 + }, + { + "epoch": 0.2858165006329462, + "grad_norm": 0.7504106163978577, + "learning_rate": 9.520496516901582e-06, + "loss": 0.844, + "step": 5193 + }, + { + "epoch": 0.2858715394353019, + "grad_norm": 0.7634509205818176, + "learning_rate": 9.520311269911127e-06, + "loss": 0.7595, + "step": 5194 + }, + { + "epoch": 0.28592657823765755, + "grad_norm": 0.7069799900054932, + "learning_rate": 9.52012598894722e-06, + "loss": 0.7566, + "step": 5195 + }, + { + "epoch": 0.28598161704001324, + "grad_norm": 0.695737361907959, + "learning_rate": 9.519940674011256e-06, + "loss": 0.7534, + "step": 5196 + }, + { + "epoch": 0.28603665584236887, + "grad_norm": 0.7212124466896057, + "learning_rate": 9.51975532510463e-06, + "loss": 0.8237, + "step": 5197 + }, + { + "epoch": 0.2860916946447245, + "grad_norm": 0.7274062633514404, + "learning_rate": 9.519569942228732e-06, + "loss": 0.756, + "step": 5198 + }, + { + "epoch": 0.2861467334470802, + "grad_norm": 0.7038697600364685, + "learning_rate": 9.519384525384956e-06, + "loss": 0.7308, + "step": 5199 + }, + { + "epoch": 0.2862017722494358, + "grad_norm": 0.6897109150886536, + "learning_rate": 9.519199074574694e-06, + "loss": 0.7858, + "step": 5200 + }, + { + "epoch": 0.2862568110517915, + "grad_norm": 0.8471527099609375, + "learning_rate": 9.519013589799343e-06, + "loss": 0.8198, + "step": 5201 + }, + { + "epoch": 0.28631184985414715, + "grad_norm": 0.6828129291534424, + "learning_rate": 9.518828071060295e-06, + "loss": 0.7734, + "step": 5202 + }, + { + "epoch": 0.28636688865650284, + "grad_norm": 0.7437755465507507, + "learning_rate": 9.518642518358946e-06, + "loss": 0.7669, + "step": 5203 + }, + { + "epoch": 0.28642192745885847, + "grad_norm": 0.8841923475265503, + "learning_rate": 9.518456931696689e-06, + "loss": 0.8201, + "step": 5204 + }, + { + "epoch": 0.28647696626121416, + "grad_norm": 0.9514154195785522, + "learning_rate": 9.518271311074917e-06, + "loss": 0.7864, + "step": 5205 + }, + { + "epoch": 0.2865320050635698, + "grad_norm": 0.830795407295227, + "learning_rate": 9.51808565649503e-06, + "loss": 0.8024, + "step": 5206 + }, + { + "epoch": 0.2865870438659255, + "grad_norm": 0.7274934649467468, + "learning_rate": 9.51789996795842e-06, + "loss": 0.7631, + "step": 5207 + }, + { + "epoch": 0.2866420826682811, + "grad_norm": 0.7004290223121643, + "learning_rate": 9.517714245466482e-06, + "loss": 0.7344, + "step": 5208 + }, + { + "epoch": 0.2866971214706368, + "grad_norm": 0.8559010624885559, + "learning_rate": 9.517528489020614e-06, + "loss": 0.7502, + "step": 5209 + }, + { + "epoch": 0.28675216027299244, + "grad_norm": 0.8913494348526001, + "learning_rate": 9.517342698622212e-06, + "loss": 0.8908, + "step": 5210 + }, + { + "epoch": 0.28680719907534813, + "grad_norm": 0.8375207781791687, + "learning_rate": 9.51715687427267e-06, + "loss": 0.7701, + "step": 5211 + }, + { + "epoch": 0.28686223787770376, + "grad_norm": 1.1804776191711426, + "learning_rate": 9.516971015973386e-06, + "loss": 0.8449, + "step": 5212 + }, + { + "epoch": 0.28691727668005945, + "grad_norm": 0.7260473370552063, + "learning_rate": 9.516785123725758e-06, + "loss": 0.7978, + "step": 5213 + }, + { + "epoch": 0.2869723154824151, + "grad_norm": 0.8159041404724121, + "learning_rate": 9.516599197531182e-06, + "loss": 0.7454, + "step": 5214 + }, + { + "epoch": 0.2870273542847708, + "grad_norm": 0.7850227952003479, + "learning_rate": 9.516413237391056e-06, + "loss": 0.8082, + "step": 5215 + }, + { + "epoch": 0.2870823930871264, + "grad_norm": 0.7596960067749023, + "learning_rate": 9.516227243306774e-06, + "loss": 0.7286, + "step": 5216 + }, + { + "epoch": 0.2871374318894821, + "grad_norm": 0.8763321042060852, + "learning_rate": 9.516041215279741e-06, + "loss": 0.8685, + "step": 5217 + }, + { + "epoch": 0.28719247069183773, + "grad_norm": 1.2130110263824463, + "learning_rate": 9.515855153311349e-06, + "loss": 0.8374, + "step": 5218 + }, + { + "epoch": 0.2872475094941934, + "grad_norm": 0.7578628063201904, + "learning_rate": 9.515669057402999e-06, + "loss": 0.793, + "step": 5219 + }, + { + "epoch": 0.28730254829654905, + "grad_norm": 0.9085225462913513, + "learning_rate": 9.515482927556088e-06, + "loss": 0.8366, + "step": 5220 + }, + { + "epoch": 0.28735758709890474, + "grad_norm": 0.7107900977134705, + "learning_rate": 9.515296763772017e-06, + "loss": 0.6571, + "step": 5221 + }, + { + "epoch": 0.2874126259012604, + "grad_norm": 0.7742018699645996, + "learning_rate": 9.515110566052183e-06, + "loss": 0.8387, + "step": 5222 + }, + { + "epoch": 0.28746766470361607, + "grad_norm": 0.8934319615364075, + "learning_rate": 9.514924334397987e-06, + "loss": 0.8546, + "step": 5223 + }, + { + "epoch": 0.2875227035059717, + "grad_norm": 0.720245897769928, + "learning_rate": 9.51473806881083e-06, + "loss": 0.7459, + "step": 5224 + }, + { + "epoch": 0.2875777423083274, + "grad_norm": 0.7074370384216309, + "learning_rate": 9.514551769292109e-06, + "loss": 0.8598, + "step": 5225 + }, + { + "epoch": 0.287632781110683, + "grad_norm": 0.7608621120452881, + "learning_rate": 9.514365435843226e-06, + "loss": 0.7263, + "step": 5226 + }, + { + "epoch": 0.2876878199130387, + "grad_norm": 0.7581011652946472, + "learning_rate": 9.51417906846558e-06, + "loss": 0.7498, + "step": 5227 + }, + { + "epoch": 0.28774285871539435, + "grad_norm": 0.8184412121772766, + "learning_rate": 9.513992667160572e-06, + "loss": 0.6889, + "step": 5228 + }, + { + "epoch": 0.28779789751775003, + "grad_norm": 0.6835145354270935, + "learning_rate": 9.513806231929605e-06, + "loss": 0.7399, + "step": 5229 + }, + { + "epoch": 0.28785293632010567, + "grad_norm": 0.7601536512374878, + "learning_rate": 9.513619762774077e-06, + "loss": 0.846, + "step": 5230 + }, + { + "epoch": 0.28790797512246136, + "grad_norm": 0.781491219997406, + "learning_rate": 9.513433259695392e-06, + "loss": 0.8326, + "step": 5231 + }, + { + "epoch": 0.287963013924817, + "grad_norm": 0.7978106141090393, + "learning_rate": 9.513246722694951e-06, + "loss": 0.7917, + "step": 5232 + }, + { + "epoch": 0.2880180527271727, + "grad_norm": 0.8071381449699402, + "learning_rate": 9.513060151774156e-06, + "loss": 0.8054, + "step": 5233 + }, + { + "epoch": 0.2880730915295283, + "grad_norm": 0.815567135810852, + "learning_rate": 9.512873546934406e-06, + "loss": 0.8647, + "step": 5234 + }, + { + "epoch": 0.288128130331884, + "grad_norm": 0.8255048990249634, + "learning_rate": 9.512686908177111e-06, + "loss": 0.9011, + "step": 5235 + }, + { + "epoch": 0.28818316913423964, + "grad_norm": 0.8392062187194824, + "learning_rate": 9.512500235503666e-06, + "loss": 0.8778, + "step": 5236 + }, + { + "epoch": 0.2882382079365953, + "grad_norm": 0.7256191372871399, + "learning_rate": 9.512313528915478e-06, + "loss": 0.7231, + "step": 5237 + }, + { + "epoch": 0.28829324673895096, + "grad_norm": 0.9041032195091248, + "learning_rate": 9.51212678841395e-06, + "loss": 0.8469, + "step": 5238 + }, + { + "epoch": 0.28834828554130665, + "grad_norm": 0.7857525944709778, + "learning_rate": 9.511940014000485e-06, + "loss": 0.7447, + "step": 5239 + }, + { + "epoch": 0.2884033243436623, + "grad_norm": 0.6925225257873535, + "learning_rate": 9.511753205676485e-06, + "loss": 0.8302, + "step": 5240 + }, + { + "epoch": 0.2884583631460179, + "grad_norm": 0.7253623008728027, + "learning_rate": 9.511566363443356e-06, + "loss": 0.8373, + "step": 5241 + }, + { + "epoch": 0.2885134019483736, + "grad_norm": 0.7198607921600342, + "learning_rate": 9.511379487302504e-06, + "loss": 0.79, + "step": 5242 + }, + { + "epoch": 0.28856844075072924, + "grad_norm": 0.7966421246528625, + "learning_rate": 9.511192577255328e-06, + "loss": 0.7933, + "step": 5243 + }, + { + "epoch": 0.2886234795530849, + "grad_norm": 0.9159359931945801, + "learning_rate": 9.511005633303239e-06, + "loss": 0.7254, + "step": 5244 + }, + { + "epoch": 0.28867851835544056, + "grad_norm": 0.9514481425285339, + "learning_rate": 9.510818655447638e-06, + "loss": 0.8916, + "step": 5245 + }, + { + "epoch": 0.28873355715779625, + "grad_norm": 0.7505099773406982, + "learning_rate": 9.510631643689932e-06, + "loss": 0.765, + "step": 5246 + }, + { + "epoch": 0.2887885959601519, + "grad_norm": 0.7824658751487732, + "learning_rate": 9.510444598031526e-06, + "loss": 0.6972, + "step": 5247 + }, + { + "epoch": 0.2888436347625076, + "grad_norm": 0.7778681516647339, + "learning_rate": 9.510257518473824e-06, + "loss": 0.8705, + "step": 5248 + }, + { + "epoch": 0.2888986735648632, + "grad_norm": 0.6785199642181396, + "learning_rate": 9.510070405018235e-06, + "loss": 0.6889, + "step": 5249 + }, + { + "epoch": 0.2889537123672189, + "grad_norm": 0.7045316100120544, + "learning_rate": 9.509883257666164e-06, + "loss": 0.7979, + "step": 5250 + }, + { + "epoch": 0.28900875116957453, + "grad_norm": 1.3174562454223633, + "learning_rate": 9.509696076419018e-06, + "loss": 0.8802, + "step": 5251 + }, + { + "epoch": 0.2890637899719302, + "grad_norm": 1.1800767183303833, + "learning_rate": 9.509508861278205e-06, + "loss": 0.9246, + "step": 5252 + }, + { + "epoch": 0.28911882877428585, + "grad_norm": 0.7057580947875977, + "learning_rate": 9.509321612245128e-06, + "loss": 0.7565, + "step": 5253 + }, + { + "epoch": 0.28917386757664154, + "grad_norm": 0.7681905031204224, + "learning_rate": 9.509134329321197e-06, + "loss": 0.8678, + "step": 5254 + }, + { + "epoch": 0.2892289063789972, + "grad_norm": 0.96025550365448, + "learning_rate": 9.50894701250782e-06, + "loss": 0.9108, + "step": 5255 + }, + { + "epoch": 0.28928394518135286, + "grad_norm": 0.7786841988563538, + "learning_rate": 9.508759661806405e-06, + "loss": 0.7747, + "step": 5256 + }, + { + "epoch": 0.2893389839837085, + "grad_norm": 0.7073540091514587, + "learning_rate": 9.508572277218358e-06, + "loss": 0.7573, + "step": 5257 + }, + { + "epoch": 0.2893940227860642, + "grad_norm": 0.6648856401443481, + "learning_rate": 9.50838485874509e-06, + "loss": 0.7294, + "step": 5258 + }, + { + "epoch": 0.2894490615884198, + "grad_norm": 0.6794270873069763, + "learning_rate": 9.508197406388007e-06, + "loss": 0.7001, + "step": 5259 + }, + { + "epoch": 0.2895041003907755, + "grad_norm": 0.6819350123405457, + "learning_rate": 9.50800992014852e-06, + "loss": 0.7114, + "step": 5260 + }, + { + "epoch": 0.28955913919313114, + "grad_norm": 0.6616997122764587, + "learning_rate": 9.507822400028036e-06, + "loss": 0.7108, + "step": 5261 + }, + { + "epoch": 0.28961417799548683, + "grad_norm": 0.7447230219841003, + "learning_rate": 9.507634846027966e-06, + "loss": 0.7865, + "step": 5262 + }, + { + "epoch": 0.28966921679784247, + "grad_norm": 0.7826278209686279, + "learning_rate": 9.50744725814972e-06, + "loss": 0.7922, + "step": 5263 + }, + { + "epoch": 0.28972425560019816, + "grad_norm": 0.8054459095001221, + "learning_rate": 9.507259636394706e-06, + "loss": 0.795, + "step": 5264 + }, + { + "epoch": 0.2897792944025538, + "grad_norm": 0.9539191722869873, + "learning_rate": 9.507071980764335e-06, + "loss": 0.9495, + "step": 5265 + }, + { + "epoch": 0.2898343332049095, + "grad_norm": 0.8877993226051331, + "learning_rate": 9.506884291260017e-06, + "loss": 0.8418, + "step": 5266 + }, + { + "epoch": 0.2898893720072651, + "grad_norm": 0.6620327234268188, + "learning_rate": 9.506696567883164e-06, + "loss": 0.6285, + "step": 5267 + }, + { + "epoch": 0.2899444108096208, + "grad_norm": 0.7604434490203857, + "learning_rate": 9.506508810635187e-06, + "loss": 0.8562, + "step": 5268 + }, + { + "epoch": 0.28999944961197643, + "grad_norm": 0.8181812763214111, + "learning_rate": 9.506321019517494e-06, + "loss": 0.905, + "step": 5269 + }, + { + "epoch": 0.2900544884143321, + "grad_norm": 0.7776391506195068, + "learning_rate": 9.5061331945315e-06, + "loss": 0.8871, + "step": 5270 + }, + { + "epoch": 0.29010952721668776, + "grad_norm": 0.8125039339065552, + "learning_rate": 9.505945335678613e-06, + "loss": 0.7254, + "step": 5271 + }, + { + "epoch": 0.29016456601904345, + "grad_norm": 0.7229846715927124, + "learning_rate": 9.50575744296025e-06, + "loss": 0.8192, + "step": 5272 + }, + { + "epoch": 0.2902196048213991, + "grad_norm": 0.72443026304245, + "learning_rate": 9.505569516377817e-06, + "loss": 0.7813, + "step": 5273 + }, + { + "epoch": 0.29027464362375477, + "grad_norm": 0.6798073053359985, + "learning_rate": 9.505381555932731e-06, + "loss": 0.7655, + "step": 5274 + }, + { + "epoch": 0.2903296824261104, + "grad_norm": 1.0805624723434448, + "learning_rate": 9.505193561626404e-06, + "loss": 0.9035, + "step": 5275 + }, + { + "epoch": 0.2903847212284661, + "grad_norm": 0.7579694986343384, + "learning_rate": 9.505005533460247e-06, + "loss": 0.8612, + "step": 5276 + }, + { + "epoch": 0.2904397600308217, + "grad_norm": 1.2496099472045898, + "learning_rate": 9.504817471435676e-06, + "loss": 0.813, + "step": 5277 + }, + { + "epoch": 0.2904947988331774, + "grad_norm": 0.6915673017501831, + "learning_rate": 9.504629375554102e-06, + "loss": 0.6891, + "step": 5278 + }, + { + "epoch": 0.29054983763553305, + "grad_norm": 0.8581767082214355, + "learning_rate": 9.504441245816937e-06, + "loss": 0.7137, + "step": 5279 + }, + { + "epoch": 0.29060487643788874, + "grad_norm": 0.7469545006752014, + "learning_rate": 9.504253082225601e-06, + "loss": 0.7621, + "step": 5280 + }, + { + "epoch": 0.29065991524024437, + "grad_norm": 0.7725615501403809, + "learning_rate": 9.504064884781503e-06, + "loss": 0.7988, + "step": 5281 + }, + { + "epoch": 0.29071495404260006, + "grad_norm": 1.0187722444534302, + "learning_rate": 9.503876653486058e-06, + "loss": 0.7772, + "step": 5282 + }, + { + "epoch": 0.2907699928449557, + "grad_norm": 0.675574779510498, + "learning_rate": 9.503688388340683e-06, + "loss": 0.7096, + "step": 5283 + }, + { + "epoch": 0.2908250316473113, + "grad_norm": 0.7980207800865173, + "learning_rate": 9.503500089346792e-06, + "loss": 0.8291, + "step": 5284 + }, + { + "epoch": 0.290880070449667, + "grad_norm": 0.6891655325889587, + "learning_rate": 9.503311756505797e-06, + "loss": 0.7186, + "step": 5285 + }, + { + "epoch": 0.29093510925202265, + "grad_norm": 0.7273408770561218, + "learning_rate": 9.50312338981912e-06, + "loss": 0.7483, + "step": 5286 + }, + { + "epoch": 0.29099014805437834, + "grad_norm": 0.7346869111061096, + "learning_rate": 9.50293498928817e-06, + "loss": 0.766, + "step": 5287 + }, + { + "epoch": 0.291045186856734, + "grad_norm": 0.7627394795417786, + "learning_rate": 9.502746554914368e-06, + "loss": 0.867, + "step": 5288 + }, + { + "epoch": 0.29110022565908966, + "grad_norm": 0.8477200865745544, + "learning_rate": 9.502558086699128e-06, + "loss": 0.8317, + "step": 5289 + }, + { + "epoch": 0.2911552644614453, + "grad_norm": 0.7696006894111633, + "learning_rate": 9.502369584643867e-06, + "loss": 0.7814, + "step": 5290 + }, + { + "epoch": 0.291210303263801, + "grad_norm": 0.7614455819129944, + "learning_rate": 9.502181048749999e-06, + "loss": 0.7398, + "step": 5291 + }, + { + "epoch": 0.2912653420661566, + "grad_norm": 0.7877628207206726, + "learning_rate": 9.501992479018946e-06, + "loss": 0.8731, + "step": 5292 + }, + { + "epoch": 0.2913203808685123, + "grad_norm": 0.7455846667289734, + "learning_rate": 9.50180387545212e-06, + "loss": 0.7059, + "step": 5293 + }, + { + "epoch": 0.29137541967086794, + "grad_norm": 1.145520567893982, + "learning_rate": 9.501615238050944e-06, + "loss": 0.6968, + "step": 5294 + }, + { + "epoch": 0.29143045847322363, + "grad_norm": 0.8100234866142273, + "learning_rate": 9.501426566816831e-06, + "loss": 0.8122, + "step": 5295 + }, + { + "epoch": 0.29148549727557926, + "grad_norm": 0.6813066005706787, + "learning_rate": 9.501237861751203e-06, + "loss": 0.6718, + "step": 5296 + }, + { + "epoch": 0.29154053607793495, + "grad_norm": 0.7400195002555847, + "learning_rate": 9.501049122855473e-06, + "loss": 0.802, + "step": 5297 + }, + { + "epoch": 0.2915955748802906, + "grad_norm": 0.7948681712150574, + "learning_rate": 9.500860350131065e-06, + "loss": 0.8237, + "step": 5298 + }, + { + "epoch": 0.2916506136826463, + "grad_norm": 0.772093653678894, + "learning_rate": 9.500671543579394e-06, + "loss": 0.7687, + "step": 5299 + }, + { + "epoch": 0.2917056524850019, + "grad_norm": 0.7468486428260803, + "learning_rate": 9.500482703201881e-06, + "loss": 0.7827, + "step": 5300 + }, + { + "epoch": 0.2917606912873576, + "grad_norm": 0.7284440398216248, + "learning_rate": 9.500293828999945e-06, + "loss": 0.8086, + "step": 5301 + }, + { + "epoch": 0.29181573008971323, + "grad_norm": 0.8014211654663086, + "learning_rate": 9.500104920975005e-06, + "loss": 0.8409, + "step": 5302 + }, + { + "epoch": 0.2918707688920689, + "grad_norm": 0.7588346004486084, + "learning_rate": 9.49991597912848e-06, + "loss": 0.7149, + "step": 5303 + }, + { + "epoch": 0.29192580769442456, + "grad_norm": 0.8098518252372742, + "learning_rate": 9.499727003461794e-06, + "loss": 0.8375, + "step": 5304 + }, + { + "epoch": 0.29198084649678024, + "grad_norm": 0.8502426743507385, + "learning_rate": 9.499537993976363e-06, + "loss": 0.8177, + "step": 5305 + }, + { + "epoch": 0.2920358852991359, + "grad_norm": 0.8010903596878052, + "learning_rate": 9.499348950673607e-06, + "loss": 0.8457, + "step": 5306 + }, + { + "epoch": 0.29209092410149157, + "grad_norm": 0.6628156304359436, + "learning_rate": 9.49915987355495e-06, + "loss": 0.7327, + "step": 5307 + }, + { + "epoch": 0.2921459629038472, + "grad_norm": 0.7414939999580383, + "learning_rate": 9.49897076262181e-06, + "loss": 0.8271, + "step": 5308 + }, + { + "epoch": 0.2922010017062029, + "grad_norm": 0.7490847706794739, + "learning_rate": 9.498781617875613e-06, + "loss": 0.7689, + "step": 5309 + }, + { + "epoch": 0.2922560405085585, + "grad_norm": 0.7913424968719482, + "learning_rate": 9.498592439317777e-06, + "loss": 0.8571, + "step": 5310 + }, + { + "epoch": 0.2923110793109142, + "grad_norm": 0.6903867125511169, + "learning_rate": 9.498403226949724e-06, + "loss": 0.7325, + "step": 5311 + }, + { + "epoch": 0.29236611811326985, + "grad_norm": 0.8087130188941956, + "learning_rate": 9.498213980772875e-06, + "loss": 0.8167, + "step": 5312 + }, + { + "epoch": 0.29242115691562554, + "grad_norm": 1.1316752433776855, + "learning_rate": 9.498024700788655e-06, + "loss": 0.912, + "step": 5313 + }, + { + "epoch": 0.29247619571798117, + "grad_norm": 0.8701719045639038, + "learning_rate": 9.497835386998486e-06, + "loss": 0.8728, + "step": 5314 + }, + { + "epoch": 0.29253123452033686, + "grad_norm": 0.6688953638076782, + "learning_rate": 9.49764603940379e-06, + "loss": 0.6561, + "step": 5315 + }, + { + "epoch": 0.2925862733226925, + "grad_norm": 0.8067505359649658, + "learning_rate": 9.49745665800599e-06, + "loss": 0.8419, + "step": 5316 + }, + { + "epoch": 0.2926413121250482, + "grad_norm": 0.7157390117645264, + "learning_rate": 9.49726724280651e-06, + "loss": 0.7964, + "step": 5317 + }, + { + "epoch": 0.2926963509274038, + "grad_norm": 0.7038627862930298, + "learning_rate": 9.497077793806772e-06, + "loss": 0.7343, + "step": 5318 + }, + { + "epoch": 0.2927513897297595, + "grad_norm": 0.7674478888511658, + "learning_rate": 9.4968883110082e-06, + "loss": 0.7624, + "step": 5319 + }, + { + "epoch": 0.29280642853211514, + "grad_norm": 0.6708847284317017, + "learning_rate": 9.496698794412223e-06, + "loss": 0.6554, + "step": 5320 + }, + { + "epoch": 0.2928614673344708, + "grad_norm": 0.8332329392433167, + "learning_rate": 9.49650924402026e-06, + "loss": 0.9357, + "step": 5321 + }, + { + "epoch": 0.29291650613682646, + "grad_norm": 0.7601341605186462, + "learning_rate": 9.496319659833737e-06, + "loss": 0.8208, + "step": 5322 + }, + { + "epoch": 0.29297154493918215, + "grad_norm": 0.8320396542549133, + "learning_rate": 9.496130041854077e-06, + "loss": 0.8423, + "step": 5323 + }, + { + "epoch": 0.2930265837415378, + "grad_norm": 0.8242839574813843, + "learning_rate": 9.49594039008271e-06, + "loss": 0.9101, + "step": 5324 + }, + { + "epoch": 0.29308162254389347, + "grad_norm": 0.8906320333480835, + "learning_rate": 9.495750704521058e-06, + "loss": 0.7343, + "step": 5325 + }, + { + "epoch": 0.2931366613462491, + "grad_norm": 0.7964318990707397, + "learning_rate": 9.495560985170546e-06, + "loss": 0.7789, + "step": 5326 + }, + { + "epoch": 0.29319170014860474, + "grad_norm": 0.8267771601676941, + "learning_rate": 9.495371232032602e-06, + "loss": 0.7447, + "step": 5327 + }, + { + "epoch": 0.29324673895096043, + "grad_norm": 0.8120046257972717, + "learning_rate": 9.49518144510865e-06, + "loss": 0.7803, + "step": 5328 + }, + { + "epoch": 0.29330177775331606, + "grad_norm": 0.7314801812171936, + "learning_rate": 9.494991624400119e-06, + "loss": 0.6758, + "step": 5329 + }, + { + "epoch": 0.29335681655567175, + "grad_norm": 0.6989930272102356, + "learning_rate": 9.494801769908433e-06, + "loss": 0.7945, + "step": 5330 + }, + { + "epoch": 0.2934118553580274, + "grad_norm": 0.7804785966873169, + "learning_rate": 9.494611881635021e-06, + "loss": 0.7977, + "step": 5331 + }, + { + "epoch": 0.2934668941603831, + "grad_norm": 0.8377045392990112, + "learning_rate": 9.494421959581308e-06, + "loss": 0.8077, + "step": 5332 + }, + { + "epoch": 0.2935219329627387, + "grad_norm": 0.7463418245315552, + "learning_rate": 9.494232003748724e-06, + "loss": 0.783, + "step": 5333 + }, + { + "epoch": 0.2935769717650944, + "grad_norm": 0.7598912715911865, + "learning_rate": 9.494042014138695e-06, + "loss": 0.7869, + "step": 5334 + }, + { + "epoch": 0.29363201056745003, + "grad_norm": 0.7634113430976868, + "learning_rate": 9.493851990752648e-06, + "loss": 0.8108, + "step": 5335 + }, + { + "epoch": 0.2936870493698057, + "grad_norm": 0.8056474328041077, + "learning_rate": 9.493661933592013e-06, + "loss": 0.7921, + "step": 5336 + }, + { + "epoch": 0.29374208817216135, + "grad_norm": 0.8699371218681335, + "learning_rate": 9.493471842658219e-06, + "loss": 0.8833, + "step": 5337 + }, + { + "epoch": 0.29379712697451704, + "grad_norm": 0.8803261518478394, + "learning_rate": 9.493281717952691e-06, + "loss": 0.7848, + "step": 5338 + }, + { + "epoch": 0.2938521657768727, + "grad_norm": 0.7678453922271729, + "learning_rate": 9.493091559476864e-06, + "loss": 0.836, + "step": 5339 + }, + { + "epoch": 0.29390720457922836, + "grad_norm": 0.7653701305389404, + "learning_rate": 9.49290136723216e-06, + "loss": 0.8215, + "step": 5340 + }, + { + "epoch": 0.293962243381584, + "grad_norm": 0.768120527267456, + "learning_rate": 9.492711141220013e-06, + "loss": 0.7498, + "step": 5341 + }, + { + "epoch": 0.2940172821839397, + "grad_norm": 0.7665749788284302, + "learning_rate": 9.492520881441854e-06, + "loss": 0.7883, + "step": 5342 + }, + { + "epoch": 0.2940723209862953, + "grad_norm": 0.7405015230178833, + "learning_rate": 9.492330587899108e-06, + "loss": 0.8112, + "step": 5343 + }, + { + "epoch": 0.294127359788651, + "grad_norm": 0.7183459997177124, + "learning_rate": 9.492140260593208e-06, + "loss": 0.8227, + "step": 5344 + }, + { + "epoch": 0.29418239859100664, + "grad_norm": 0.7453572154045105, + "learning_rate": 9.491949899525585e-06, + "loss": 0.8148, + "step": 5345 + }, + { + "epoch": 0.29423743739336233, + "grad_norm": 0.8963750600814819, + "learning_rate": 9.491759504697669e-06, + "loss": 0.9261, + "step": 5346 + }, + { + "epoch": 0.29429247619571797, + "grad_norm": 0.7631667256355286, + "learning_rate": 9.49156907611089e-06, + "loss": 0.7708, + "step": 5347 + }, + { + "epoch": 0.29434751499807366, + "grad_norm": 0.6324381232261658, + "learning_rate": 9.49137861376668e-06, + "loss": 0.6688, + "step": 5348 + }, + { + "epoch": 0.2944025538004293, + "grad_norm": 0.6969807147979736, + "learning_rate": 9.491188117666472e-06, + "loss": 0.7516, + "step": 5349 + }, + { + "epoch": 0.294457592602785, + "grad_norm": 1.633340835571289, + "learning_rate": 9.490997587811697e-06, + "loss": 0.8111, + "step": 5350 + }, + { + "epoch": 0.2945126314051406, + "grad_norm": 0.7084371447563171, + "learning_rate": 9.490807024203785e-06, + "loss": 0.8375, + "step": 5351 + }, + { + "epoch": 0.2945676702074963, + "grad_norm": 0.7335958480834961, + "learning_rate": 9.490616426844169e-06, + "loss": 0.7884, + "step": 5352 + }, + { + "epoch": 0.29462270900985194, + "grad_norm": 0.7560276985168457, + "learning_rate": 9.490425795734282e-06, + "loss": 0.8918, + "step": 5353 + }, + { + "epoch": 0.2946777478122076, + "grad_norm": 0.9185894727706909, + "learning_rate": 9.490235130875557e-06, + "loss": 0.7976, + "step": 5354 + }, + { + "epoch": 0.29473278661456326, + "grad_norm": 0.7871553897857666, + "learning_rate": 9.490044432269427e-06, + "loss": 0.8564, + "step": 5355 + }, + { + "epoch": 0.29478782541691895, + "grad_norm": 0.8736812472343445, + "learning_rate": 9.489853699917326e-06, + "loss": 0.8114, + "step": 5356 + }, + { + "epoch": 0.2948428642192746, + "grad_norm": 0.8068968653678894, + "learning_rate": 9.489662933820684e-06, + "loss": 0.9198, + "step": 5357 + }, + { + "epoch": 0.29489790302163027, + "grad_norm": 0.7816325426101685, + "learning_rate": 9.489472133980939e-06, + "loss": 0.8012, + "step": 5358 + }, + { + "epoch": 0.2949529418239859, + "grad_norm": 0.7248200178146362, + "learning_rate": 9.489281300399522e-06, + "loss": 0.8099, + "step": 5359 + }, + { + "epoch": 0.2950079806263416, + "grad_norm": 0.7887724041938782, + "learning_rate": 9.48909043307787e-06, + "loss": 0.884, + "step": 5360 + }, + { + "epoch": 0.2950630194286972, + "grad_norm": 0.765163004398346, + "learning_rate": 9.488899532017415e-06, + "loss": 0.8563, + "step": 5361 + }, + { + "epoch": 0.2951180582310529, + "grad_norm": 0.7658557295799255, + "learning_rate": 9.488708597219592e-06, + "loss": 0.8897, + "step": 5362 + }, + { + "epoch": 0.29517309703340855, + "grad_norm": 0.6653227806091309, + "learning_rate": 9.488517628685838e-06, + "loss": 0.7107, + "step": 5363 + }, + { + "epoch": 0.29522813583576424, + "grad_norm": 0.787739098072052, + "learning_rate": 9.488326626417586e-06, + "loss": 0.8181, + "step": 5364 + }, + { + "epoch": 0.29528317463811987, + "grad_norm": 0.7822532057762146, + "learning_rate": 9.488135590416275e-06, + "loss": 0.8238, + "step": 5365 + }, + { + "epoch": 0.29533821344047556, + "grad_norm": 0.7797419428825378, + "learning_rate": 9.487944520683334e-06, + "loss": 0.8484, + "step": 5366 + }, + { + "epoch": 0.2953932522428312, + "grad_norm": 0.7230222225189209, + "learning_rate": 9.487753417220207e-06, + "loss": 0.8193, + "step": 5367 + }, + { + "epoch": 0.2954482910451869, + "grad_norm": 0.8256810307502747, + "learning_rate": 9.487562280028325e-06, + "loss": 0.7691, + "step": 5368 + }, + { + "epoch": 0.2955033298475425, + "grad_norm": 0.7704648375511169, + "learning_rate": 9.487371109109127e-06, + "loss": 0.8235, + "step": 5369 + }, + { + "epoch": 0.29555836864989815, + "grad_norm": 0.7580391764640808, + "learning_rate": 9.487179904464048e-06, + "loss": 0.7911, + "step": 5370 + }, + { + "epoch": 0.29561340745225384, + "grad_norm": 0.7211806774139404, + "learning_rate": 9.486988666094526e-06, + "loss": 0.7188, + "step": 5371 + }, + { + "epoch": 0.2956684462546095, + "grad_norm": 0.8375828862190247, + "learning_rate": 9.486797394001999e-06, + "loss": 0.881, + "step": 5372 + }, + { + "epoch": 0.29572348505696516, + "grad_norm": 0.8500093221664429, + "learning_rate": 9.486606088187903e-06, + "loss": 0.8632, + "step": 5373 + }, + { + "epoch": 0.2957785238593208, + "grad_norm": 0.7754727005958557, + "learning_rate": 9.486414748653677e-06, + "loss": 0.8124, + "step": 5374 + }, + { + "epoch": 0.2958335626616765, + "grad_norm": 0.9395208954811096, + "learning_rate": 9.486223375400759e-06, + "loss": 0.8046, + "step": 5375 + }, + { + "epoch": 0.2958886014640321, + "grad_norm": 0.7587517499923706, + "learning_rate": 9.486031968430587e-06, + "loss": 0.7852, + "step": 5376 + }, + { + "epoch": 0.2959436402663878, + "grad_norm": 0.6921781301498413, + "learning_rate": 9.485840527744599e-06, + "loss": 0.7392, + "step": 5377 + }, + { + "epoch": 0.29599867906874344, + "grad_norm": 0.8768522143363953, + "learning_rate": 9.485649053344233e-06, + "loss": 0.7819, + "step": 5378 + }, + { + "epoch": 0.29605371787109913, + "grad_norm": 0.7565680146217346, + "learning_rate": 9.485457545230932e-06, + "loss": 0.7489, + "step": 5379 + }, + { + "epoch": 0.29610875667345476, + "grad_norm": 0.7760992050170898, + "learning_rate": 9.485266003406132e-06, + "loss": 0.8129, + "step": 5380 + }, + { + "epoch": 0.29616379547581045, + "grad_norm": 0.7726097106933594, + "learning_rate": 9.485074427871272e-06, + "loss": 0.725, + "step": 5381 + }, + { + "epoch": 0.2962188342781661, + "grad_norm": 0.6885473728179932, + "learning_rate": 9.484882818627796e-06, + "loss": 0.685, + "step": 5382 + }, + { + "epoch": 0.2962738730805218, + "grad_norm": 0.776509702205658, + "learning_rate": 9.484691175677138e-06, + "loss": 0.8077, + "step": 5383 + }, + { + "epoch": 0.2963289118828774, + "grad_norm": 0.7436297535896301, + "learning_rate": 9.484499499020744e-06, + "loss": 0.8161, + "step": 5384 + }, + { + "epoch": 0.2963839506852331, + "grad_norm": 0.7604314088821411, + "learning_rate": 9.484307788660052e-06, + "loss": 0.825, + "step": 5385 + }, + { + "epoch": 0.29643898948758873, + "grad_norm": 0.7230789065361023, + "learning_rate": 9.484116044596501e-06, + "loss": 0.8005, + "step": 5386 + }, + { + "epoch": 0.2964940282899444, + "grad_norm": 0.820442259311676, + "learning_rate": 9.483924266831536e-06, + "loss": 0.789, + "step": 5387 + }, + { + "epoch": 0.29654906709230006, + "grad_norm": 0.7514582276344299, + "learning_rate": 9.483732455366596e-06, + "loss": 0.8531, + "step": 5388 + }, + { + "epoch": 0.29660410589465575, + "grad_norm": 0.6671503782272339, + "learning_rate": 9.483540610203124e-06, + "loss": 0.7627, + "step": 5389 + }, + { + "epoch": 0.2966591446970114, + "grad_norm": 0.6955942511558533, + "learning_rate": 9.483348731342559e-06, + "loss": 0.726, + "step": 5390 + }, + { + "epoch": 0.29671418349936707, + "grad_norm": 0.769781768321991, + "learning_rate": 9.483156818786347e-06, + "loss": 0.8064, + "step": 5391 + }, + { + "epoch": 0.2967692223017227, + "grad_norm": 1.0764707326889038, + "learning_rate": 9.482964872535927e-06, + "loss": 0.8249, + "step": 5392 + }, + { + "epoch": 0.2968242611040784, + "grad_norm": 1.0508921146392822, + "learning_rate": 9.482772892592744e-06, + "loss": 0.706, + "step": 5393 + }, + { + "epoch": 0.296879299906434, + "grad_norm": 0.6442564129829407, + "learning_rate": 9.482580878958239e-06, + "loss": 0.6025, + "step": 5394 + }, + { + "epoch": 0.2969343387087897, + "grad_norm": 0.7622735500335693, + "learning_rate": 9.482388831633856e-06, + "loss": 0.7639, + "step": 5395 + }, + { + "epoch": 0.29698937751114535, + "grad_norm": 0.8179057240486145, + "learning_rate": 9.482196750621038e-06, + "loss": 0.7641, + "step": 5396 + }, + { + "epoch": 0.29704441631350104, + "grad_norm": 0.7955192923545837, + "learning_rate": 9.48200463592123e-06, + "loss": 0.8407, + "step": 5397 + }, + { + "epoch": 0.29709945511585667, + "grad_norm": 0.7909773588180542, + "learning_rate": 9.481812487535875e-06, + "loss": 0.7833, + "step": 5398 + }, + { + "epoch": 0.29715449391821236, + "grad_norm": 0.8409042954444885, + "learning_rate": 9.481620305466417e-06, + "loss": 0.7788, + "step": 5399 + }, + { + "epoch": 0.297209532720568, + "grad_norm": 0.7521414160728455, + "learning_rate": 9.4814280897143e-06, + "loss": 0.7192, + "step": 5400 + }, + { + "epoch": 0.2972645715229237, + "grad_norm": 0.7016280889511108, + "learning_rate": 9.481235840280969e-06, + "loss": 0.7181, + "step": 5401 + }, + { + "epoch": 0.2973196103252793, + "grad_norm": 0.7257362604141235, + "learning_rate": 9.48104355716787e-06, + "loss": 0.7845, + "step": 5402 + }, + { + "epoch": 0.297374649127635, + "grad_norm": 0.8048765659332275, + "learning_rate": 9.480851240376445e-06, + "loss": 0.7921, + "step": 5403 + }, + { + "epoch": 0.29742968792999064, + "grad_norm": 0.8715546131134033, + "learning_rate": 9.480658889908143e-06, + "loss": 0.856, + "step": 5404 + }, + { + "epoch": 0.2974847267323463, + "grad_norm": 0.7211160063743591, + "learning_rate": 9.480466505764408e-06, + "loss": 0.7687, + "step": 5405 + }, + { + "epoch": 0.29753976553470196, + "grad_norm": 0.8749645352363586, + "learning_rate": 9.480274087946686e-06, + "loss": 0.8419, + "step": 5406 + }, + { + "epoch": 0.29759480433705765, + "grad_norm": 0.7986398935317993, + "learning_rate": 9.480081636456424e-06, + "loss": 0.8309, + "step": 5407 + }, + { + "epoch": 0.2976498431394133, + "grad_norm": 0.8435508012771606, + "learning_rate": 9.479889151295067e-06, + "loss": 0.7457, + "step": 5408 + }, + { + "epoch": 0.297704881941769, + "grad_norm": 0.8725010752677917, + "learning_rate": 9.479696632464063e-06, + "loss": 0.8069, + "step": 5409 + }, + { + "epoch": 0.2977599207441246, + "grad_norm": 0.7364320158958435, + "learning_rate": 9.479504079964856e-06, + "loss": 0.8316, + "step": 5410 + }, + { + "epoch": 0.2978149595464803, + "grad_norm": 0.7967824935913086, + "learning_rate": 9.479311493798898e-06, + "loss": 0.7689, + "step": 5411 + }, + { + "epoch": 0.29786999834883593, + "grad_norm": 0.8415414094924927, + "learning_rate": 9.479118873967632e-06, + "loss": 0.8288, + "step": 5412 + }, + { + "epoch": 0.29792503715119156, + "grad_norm": 0.9723265767097473, + "learning_rate": 9.478926220472508e-06, + "loss": 0.7422, + "step": 5413 + }, + { + "epoch": 0.29798007595354725, + "grad_norm": 0.7203155159950256, + "learning_rate": 9.478733533314974e-06, + "loss": 0.707, + "step": 5414 + }, + { + "epoch": 0.2980351147559029, + "grad_norm": 0.7643926739692688, + "learning_rate": 9.478540812496478e-06, + "loss": 0.7793, + "step": 5415 + }, + { + "epoch": 0.2980901535582586, + "grad_norm": 0.9177087545394897, + "learning_rate": 9.478348058018467e-06, + "loss": 0.865, + "step": 5416 + }, + { + "epoch": 0.2981451923606142, + "grad_norm": 0.678931713104248, + "learning_rate": 9.478155269882392e-06, + "loss": 0.7716, + "step": 5417 + }, + { + "epoch": 0.2982002311629699, + "grad_norm": 0.8440513610839844, + "learning_rate": 9.4779624480897e-06, + "loss": 0.8904, + "step": 5418 + }, + { + "epoch": 0.29825526996532553, + "grad_norm": 0.8508756756782532, + "learning_rate": 9.47776959264184e-06, + "loss": 0.7994, + "step": 5419 + }, + { + "epoch": 0.2983103087676812, + "grad_norm": 0.8736951947212219, + "learning_rate": 9.477576703540265e-06, + "loss": 0.8374, + "step": 5420 + }, + { + "epoch": 0.29836534757003685, + "grad_norm": 0.8063240051269531, + "learning_rate": 9.47738378078642e-06, + "loss": 0.7217, + "step": 5421 + }, + { + "epoch": 0.29842038637239254, + "grad_norm": 1.1495088338851929, + "learning_rate": 9.477190824381757e-06, + "loss": 0.8902, + "step": 5422 + }, + { + "epoch": 0.2984754251747482, + "grad_norm": 1.0241554975509644, + "learning_rate": 9.476997834327725e-06, + "loss": 0.9354, + "step": 5423 + }, + { + "epoch": 0.29853046397710387, + "grad_norm": 0.939950168132782, + "learning_rate": 9.476804810625779e-06, + "loss": 0.8714, + "step": 5424 + }, + { + "epoch": 0.2985855027794595, + "grad_norm": 0.7592660188674927, + "learning_rate": 9.476611753277364e-06, + "loss": 0.7513, + "step": 5425 + }, + { + "epoch": 0.2986405415818152, + "grad_norm": 0.776153028011322, + "learning_rate": 9.476418662283935e-06, + "loss": 0.7828, + "step": 5426 + }, + { + "epoch": 0.2986955803841708, + "grad_norm": 0.9317814707756042, + "learning_rate": 9.47622553764694e-06, + "loss": 0.865, + "step": 5427 + }, + { + "epoch": 0.2987506191865265, + "grad_norm": 0.7770501971244812, + "learning_rate": 9.476032379367832e-06, + "loss": 0.7281, + "step": 5428 + }, + { + "epoch": 0.29880565798888215, + "grad_norm": 0.7815201282501221, + "learning_rate": 9.475839187448064e-06, + "loss": 0.7565, + "step": 5429 + }, + { + "epoch": 0.29886069679123783, + "grad_norm": 0.7992607951164246, + "learning_rate": 9.475645961889086e-06, + "loss": 0.8109, + "step": 5430 + }, + { + "epoch": 0.29891573559359347, + "grad_norm": 0.7780614495277405, + "learning_rate": 9.475452702692351e-06, + "loss": 0.7814, + "step": 5431 + }, + { + "epoch": 0.29897077439594916, + "grad_norm": 0.7409062385559082, + "learning_rate": 9.475259409859313e-06, + "loss": 0.7712, + "step": 5432 + }, + { + "epoch": 0.2990258131983048, + "grad_norm": 0.7935584187507629, + "learning_rate": 9.47506608339142e-06, + "loss": 0.8301, + "step": 5433 + }, + { + "epoch": 0.2990808520006605, + "grad_norm": 0.6931030750274658, + "learning_rate": 9.474872723290132e-06, + "loss": 0.7471, + "step": 5434 + }, + { + "epoch": 0.2991358908030161, + "grad_norm": 0.7622918486595154, + "learning_rate": 9.474679329556894e-06, + "loss": 0.7727, + "step": 5435 + }, + { + "epoch": 0.2991909296053718, + "grad_norm": 0.7957701086997986, + "learning_rate": 9.474485902193169e-06, + "loss": 0.7663, + "step": 5436 + }, + { + "epoch": 0.29924596840772744, + "grad_norm": 1.0600612163543701, + "learning_rate": 9.474292441200404e-06, + "loss": 0.7861, + "step": 5437 + }, + { + "epoch": 0.2993010072100831, + "grad_norm": 0.7343600392341614, + "learning_rate": 9.474098946580053e-06, + "loss": 0.8609, + "step": 5438 + }, + { + "epoch": 0.29935604601243876, + "grad_norm": 0.7477726340293884, + "learning_rate": 9.473905418333573e-06, + "loss": 0.7683, + "step": 5439 + }, + { + "epoch": 0.29941108481479445, + "grad_norm": 0.7955546379089355, + "learning_rate": 9.473711856462417e-06, + "loss": 0.8406, + "step": 5440 + }, + { + "epoch": 0.2994661236171501, + "grad_norm": 0.8291183114051819, + "learning_rate": 9.47351826096804e-06, + "loss": 0.6919, + "step": 5441 + }, + { + "epoch": 0.29952116241950577, + "grad_norm": 0.8899849057197571, + "learning_rate": 9.473324631851898e-06, + "loss": 0.9403, + "step": 5442 + }, + { + "epoch": 0.2995762012218614, + "grad_norm": 0.837066650390625, + "learning_rate": 9.473130969115445e-06, + "loss": 0.8676, + "step": 5443 + }, + { + "epoch": 0.2996312400242171, + "grad_norm": 0.8385708928108215, + "learning_rate": 9.472937272760138e-06, + "loss": 0.7588, + "step": 5444 + }, + { + "epoch": 0.2996862788265727, + "grad_norm": 0.6990595459938049, + "learning_rate": 9.472743542787431e-06, + "loss": 0.6769, + "step": 5445 + }, + { + "epoch": 0.2997413176289284, + "grad_norm": 0.789165735244751, + "learning_rate": 9.472549779198781e-06, + "loss": 0.8084, + "step": 5446 + }, + { + "epoch": 0.29979635643128405, + "grad_norm": 0.8820298314094543, + "learning_rate": 9.472355981995643e-06, + "loss": 0.8262, + "step": 5447 + }, + { + "epoch": 0.29985139523363974, + "grad_norm": 0.8928382992744446, + "learning_rate": 9.472162151179475e-06, + "loss": 0.8123, + "step": 5448 + }, + { + "epoch": 0.2999064340359954, + "grad_norm": 0.7688086032867432, + "learning_rate": 9.471968286751735e-06, + "loss": 0.6846, + "step": 5449 + }, + { + "epoch": 0.29996147283835106, + "grad_norm": 0.6962918043136597, + "learning_rate": 9.471774388713877e-06, + "loss": 0.7872, + "step": 5450 + }, + { + "epoch": 0.3000165116407067, + "grad_norm": 0.7467569708824158, + "learning_rate": 9.47158045706736e-06, + "loss": 0.8201, + "step": 5451 + }, + { + "epoch": 0.3000715504430624, + "grad_norm": 0.7651814222335815, + "learning_rate": 9.471386491813642e-06, + "loss": 0.7734, + "step": 5452 + }, + { + "epoch": 0.300126589245418, + "grad_norm": 0.8001144528388977, + "learning_rate": 9.47119249295418e-06, + "loss": 0.8266, + "step": 5453 + }, + { + "epoch": 0.3001816280477737, + "grad_norm": 0.7937704920768738, + "learning_rate": 9.47099846049043e-06, + "loss": 0.8025, + "step": 5454 + }, + { + "epoch": 0.30023666685012934, + "grad_norm": 0.7353448867797852, + "learning_rate": 9.470804394423853e-06, + "loss": 0.7926, + "step": 5455 + }, + { + "epoch": 0.300291705652485, + "grad_norm": 0.9116304516792297, + "learning_rate": 9.470610294755908e-06, + "loss": 0.8295, + "step": 5456 + }, + { + "epoch": 0.30034674445484066, + "grad_norm": 0.7169163823127747, + "learning_rate": 9.470416161488053e-06, + "loss": 0.822, + "step": 5457 + }, + { + "epoch": 0.3004017832571963, + "grad_norm": 1.0421968698501587, + "learning_rate": 9.470221994621747e-06, + "loss": 0.9273, + "step": 5458 + }, + { + "epoch": 0.300456822059552, + "grad_norm": 0.9064405560493469, + "learning_rate": 9.470027794158447e-06, + "loss": 0.7087, + "step": 5459 + }, + { + "epoch": 0.3005118608619076, + "grad_norm": 0.6766010522842407, + "learning_rate": 9.469833560099617e-06, + "loss": 0.7063, + "step": 5460 + }, + { + "epoch": 0.3005668996642633, + "grad_norm": 0.7987816333770752, + "learning_rate": 9.469639292446712e-06, + "loss": 0.8216, + "step": 5461 + }, + { + "epoch": 0.30062193846661894, + "grad_norm": 0.776792049407959, + "learning_rate": 9.469444991201197e-06, + "loss": 0.8598, + "step": 5462 + }, + { + "epoch": 0.30067697726897463, + "grad_norm": 0.8048756718635559, + "learning_rate": 9.469250656364529e-06, + "loss": 0.8645, + "step": 5463 + }, + { + "epoch": 0.30073201607133027, + "grad_norm": 1.0650218725204468, + "learning_rate": 9.46905628793817e-06, + "loss": 0.8918, + "step": 5464 + }, + { + "epoch": 0.30078705487368596, + "grad_norm": 0.7378712296485901, + "learning_rate": 9.468861885923577e-06, + "loss": 0.6866, + "step": 5465 + }, + { + "epoch": 0.3008420936760416, + "grad_norm": 0.7382808327674866, + "learning_rate": 9.468667450322218e-06, + "loss": 0.8413, + "step": 5466 + }, + { + "epoch": 0.3008971324783973, + "grad_norm": 0.8390250205993652, + "learning_rate": 9.468472981135548e-06, + "loss": 0.8275, + "step": 5467 + }, + { + "epoch": 0.3009521712807529, + "grad_norm": 0.9169766902923584, + "learning_rate": 9.468278478365034e-06, + "loss": 0.8274, + "step": 5468 + }, + { + "epoch": 0.3010072100831086, + "grad_norm": 0.7487995028495789, + "learning_rate": 9.468083942012134e-06, + "loss": 0.7729, + "step": 5469 + }, + { + "epoch": 0.30106224888546423, + "grad_norm": 0.7457556128501892, + "learning_rate": 9.467889372078309e-06, + "loss": 0.7435, + "step": 5470 + }, + { + "epoch": 0.3011172876878199, + "grad_norm": 0.7085639834403992, + "learning_rate": 9.467694768565026e-06, + "loss": 0.7686, + "step": 5471 + }, + { + "epoch": 0.30117232649017556, + "grad_norm": 0.7396196722984314, + "learning_rate": 9.467500131473744e-06, + "loss": 0.7496, + "step": 5472 + }, + { + "epoch": 0.30122736529253125, + "grad_norm": 0.7906790971755981, + "learning_rate": 9.467305460805927e-06, + "loss": 0.8341, + "step": 5473 + }, + { + "epoch": 0.3012824040948869, + "grad_norm": 0.673541247844696, + "learning_rate": 9.467110756563039e-06, + "loss": 0.8041, + "step": 5474 + }, + { + "epoch": 0.30133744289724257, + "grad_norm": 0.8247049450874329, + "learning_rate": 9.46691601874654e-06, + "loss": 0.8227, + "step": 5475 + }, + { + "epoch": 0.3013924816995982, + "grad_norm": 0.7564057111740112, + "learning_rate": 9.466721247357898e-06, + "loss": 0.8181, + "step": 5476 + }, + { + "epoch": 0.3014475205019539, + "grad_norm": 0.7533192038536072, + "learning_rate": 9.466526442398574e-06, + "loss": 0.782, + "step": 5477 + }, + { + "epoch": 0.3015025593043095, + "grad_norm": 0.6934120059013367, + "learning_rate": 9.466331603870033e-06, + "loss": 0.7153, + "step": 5478 + }, + { + "epoch": 0.3015575981066652, + "grad_norm": 0.7417232990264893, + "learning_rate": 9.466136731773738e-06, + "loss": 0.753, + "step": 5479 + }, + { + "epoch": 0.30161263690902085, + "grad_norm": 0.7421486973762512, + "learning_rate": 9.465941826111156e-06, + "loss": 0.7668, + "step": 5480 + }, + { + "epoch": 0.30166767571137654, + "grad_norm": 1.0851647853851318, + "learning_rate": 9.465746886883751e-06, + "loss": 0.8019, + "step": 5481 + }, + { + "epoch": 0.30172271451373217, + "grad_norm": 0.9209244847297668, + "learning_rate": 9.465551914092987e-06, + "loss": 0.7912, + "step": 5482 + }, + { + "epoch": 0.30177775331608786, + "grad_norm": 0.6915135383605957, + "learning_rate": 9.465356907740331e-06, + "loss": 0.8112, + "step": 5483 + }, + { + "epoch": 0.3018327921184435, + "grad_norm": 0.824593722820282, + "learning_rate": 9.465161867827247e-06, + "loss": 0.7969, + "step": 5484 + }, + { + "epoch": 0.3018878309207992, + "grad_norm": 0.7985100746154785, + "learning_rate": 9.464966794355201e-06, + "loss": 0.8258, + "step": 5485 + }, + { + "epoch": 0.3019428697231548, + "grad_norm": 0.8471764326095581, + "learning_rate": 9.464771687325663e-06, + "loss": 0.8241, + "step": 5486 + }, + { + "epoch": 0.3019979085255105, + "grad_norm": 0.8133455514907837, + "learning_rate": 9.464576546740093e-06, + "loss": 0.7809, + "step": 5487 + }, + { + "epoch": 0.30205294732786614, + "grad_norm": 0.7684013843536377, + "learning_rate": 9.464381372599961e-06, + "loss": 0.9023, + "step": 5488 + }, + { + "epoch": 0.30210798613022183, + "grad_norm": 0.7818747758865356, + "learning_rate": 9.464186164906735e-06, + "loss": 0.7152, + "step": 5489 + }, + { + "epoch": 0.30216302493257746, + "grad_norm": 0.7524297833442688, + "learning_rate": 9.46399092366188e-06, + "loss": 0.782, + "step": 5490 + }, + { + "epoch": 0.30221806373493315, + "grad_norm": 0.6550590991973877, + "learning_rate": 9.463795648866864e-06, + "loss": 0.7696, + "step": 5491 + }, + { + "epoch": 0.3022731025372888, + "grad_norm": 0.8679335117340088, + "learning_rate": 9.463600340523154e-06, + "loss": 0.8115, + "step": 5492 + }, + { + "epoch": 0.3023281413396445, + "grad_norm": 0.692500114440918, + "learning_rate": 9.46340499863222e-06, + "loss": 0.7692, + "step": 5493 + }, + { + "epoch": 0.3023831801420001, + "grad_norm": 0.8604017496109009, + "learning_rate": 9.463209623195528e-06, + "loss": 0.8547, + "step": 5494 + }, + { + "epoch": 0.3024382189443558, + "grad_norm": 0.6715821623802185, + "learning_rate": 9.463014214214548e-06, + "loss": 0.7638, + "step": 5495 + }, + { + "epoch": 0.30249325774671143, + "grad_norm": 0.7803179025650024, + "learning_rate": 9.462818771690747e-06, + "loss": 0.7795, + "step": 5496 + }, + { + "epoch": 0.3025482965490671, + "grad_norm": 0.787323534488678, + "learning_rate": 9.462623295625596e-06, + "loss": 0.735, + "step": 5497 + }, + { + "epoch": 0.30260333535142275, + "grad_norm": 0.9943159222602844, + "learning_rate": 9.462427786020563e-06, + "loss": 0.7451, + "step": 5498 + }, + { + "epoch": 0.3026583741537784, + "grad_norm": 0.772524893283844, + "learning_rate": 9.462232242877116e-06, + "loss": 0.9167, + "step": 5499 + }, + { + "epoch": 0.3027134129561341, + "grad_norm": 0.7204643487930298, + "learning_rate": 9.462036666196726e-06, + "loss": 0.7442, + "step": 5500 + }, + { + "epoch": 0.3027684517584897, + "grad_norm": 0.7450547218322754, + "learning_rate": 9.461841055980863e-06, + "loss": 0.8002, + "step": 5501 + }, + { + "epoch": 0.3028234905608454, + "grad_norm": 0.8096264004707336, + "learning_rate": 9.461645412230997e-06, + "loss": 0.8601, + "step": 5502 + }, + { + "epoch": 0.30287852936320103, + "grad_norm": 0.684968888759613, + "learning_rate": 9.461449734948597e-06, + "loss": 0.7251, + "step": 5503 + }, + { + "epoch": 0.3029335681655567, + "grad_norm": 0.7727203369140625, + "learning_rate": 9.461254024135138e-06, + "loss": 0.7797, + "step": 5504 + }, + { + "epoch": 0.30298860696791236, + "grad_norm": 0.9292891025543213, + "learning_rate": 9.461058279792086e-06, + "loss": 0.7519, + "step": 5505 + }, + { + "epoch": 0.30304364577026804, + "grad_norm": 0.7836466431617737, + "learning_rate": 9.460862501920915e-06, + "loss": 0.8201, + "step": 5506 + }, + { + "epoch": 0.3030986845726237, + "grad_norm": 0.9043576121330261, + "learning_rate": 9.460666690523094e-06, + "loss": 0.79, + "step": 5507 + }, + { + "epoch": 0.30315372337497937, + "grad_norm": 0.8339952230453491, + "learning_rate": 9.460470845600098e-06, + "loss": 0.8392, + "step": 5508 + }, + { + "epoch": 0.303208762177335, + "grad_norm": 0.7603133320808411, + "learning_rate": 9.460274967153395e-06, + "loss": 0.7168, + "step": 5509 + }, + { + "epoch": 0.3032638009796907, + "grad_norm": 0.7287996411323547, + "learning_rate": 9.460079055184461e-06, + "loss": 0.7452, + "step": 5510 + }, + { + "epoch": 0.3033188397820463, + "grad_norm": 0.707953691482544, + "learning_rate": 9.459883109694767e-06, + "loss": 0.8081, + "step": 5511 + }, + { + "epoch": 0.303373878584402, + "grad_norm": 0.7556451559066772, + "learning_rate": 9.459687130685784e-06, + "loss": 0.8145, + "step": 5512 + }, + { + "epoch": 0.30342891738675765, + "grad_norm": 0.8076426386833191, + "learning_rate": 9.459491118158987e-06, + "loss": 0.8006, + "step": 5513 + }, + { + "epoch": 0.30348395618911334, + "grad_norm": 0.7343682646751404, + "learning_rate": 9.459295072115849e-06, + "loss": 0.7574, + "step": 5514 + }, + { + "epoch": 0.30353899499146897, + "grad_norm": 0.68440181016922, + "learning_rate": 9.459098992557843e-06, + "loss": 0.7432, + "step": 5515 + }, + { + "epoch": 0.30359403379382466, + "grad_norm": 0.8278071880340576, + "learning_rate": 9.458902879486441e-06, + "loss": 0.8357, + "step": 5516 + }, + { + "epoch": 0.3036490725961803, + "grad_norm": 0.8377245664596558, + "learning_rate": 9.458706732903121e-06, + "loss": 0.7552, + "step": 5517 + }, + { + "epoch": 0.303704111398536, + "grad_norm": 0.7354543805122375, + "learning_rate": 9.458510552809353e-06, + "loss": 0.7862, + "step": 5518 + }, + { + "epoch": 0.3037591502008916, + "grad_norm": 0.8071799874305725, + "learning_rate": 9.458314339206611e-06, + "loss": 0.8428, + "step": 5519 + }, + { + "epoch": 0.3038141890032473, + "grad_norm": 0.7452389597892761, + "learning_rate": 9.458118092096376e-06, + "loss": 0.8252, + "step": 5520 + }, + { + "epoch": 0.30386922780560294, + "grad_norm": 0.7370620965957642, + "learning_rate": 9.457921811480115e-06, + "loss": 0.8143, + "step": 5521 + }, + { + "epoch": 0.3039242666079586, + "grad_norm": 0.8816156387329102, + "learning_rate": 9.45772549735931e-06, + "loss": 0.7163, + "step": 5522 + }, + { + "epoch": 0.30397930541031426, + "grad_norm": 0.7208901643753052, + "learning_rate": 9.457529149735432e-06, + "loss": 0.7877, + "step": 5523 + }, + { + "epoch": 0.30403434421266995, + "grad_norm": 0.820792019367218, + "learning_rate": 9.457332768609959e-06, + "loss": 0.8275, + "step": 5524 + }, + { + "epoch": 0.3040893830150256, + "grad_norm": 0.8471686244010925, + "learning_rate": 9.457136353984365e-06, + "loss": 0.8127, + "step": 5525 + }, + { + "epoch": 0.30414442181738127, + "grad_norm": 0.9448342323303223, + "learning_rate": 9.456939905860127e-06, + "loss": 0.8157, + "step": 5526 + }, + { + "epoch": 0.3041994606197369, + "grad_norm": 0.7835188508033752, + "learning_rate": 9.456743424238723e-06, + "loss": 0.7116, + "step": 5527 + }, + { + "epoch": 0.3042544994220926, + "grad_norm": 0.8884950876235962, + "learning_rate": 9.456546909121629e-06, + "loss": 0.8514, + "step": 5528 + }, + { + "epoch": 0.30430953822444823, + "grad_norm": 0.7400928735733032, + "learning_rate": 9.45635036051032e-06, + "loss": 0.8207, + "step": 5529 + }, + { + "epoch": 0.3043645770268039, + "grad_norm": 0.8278732299804688, + "learning_rate": 9.456153778406274e-06, + "loss": 0.8269, + "step": 5530 + }, + { + "epoch": 0.30441961582915955, + "grad_norm": 0.7423332929611206, + "learning_rate": 9.45595716281097e-06, + "loss": 0.7937, + "step": 5531 + }, + { + "epoch": 0.30447465463151524, + "grad_norm": 1.5018088817596436, + "learning_rate": 9.455760513725885e-06, + "loss": 0.7935, + "step": 5532 + }, + { + "epoch": 0.3045296934338709, + "grad_norm": 0.8105388283729553, + "learning_rate": 9.455563831152496e-06, + "loss": 0.8225, + "step": 5533 + }, + { + "epoch": 0.30458473223622656, + "grad_norm": 0.6874535083770752, + "learning_rate": 9.455367115092283e-06, + "loss": 0.7301, + "step": 5534 + }, + { + "epoch": 0.3046397710385822, + "grad_norm": 0.8085837960243225, + "learning_rate": 9.455170365546721e-06, + "loss": 0.83, + "step": 5535 + }, + { + "epoch": 0.3046948098409379, + "grad_norm": 0.810773491859436, + "learning_rate": 9.454973582517293e-06, + "loss": 0.7186, + "step": 5536 + }, + { + "epoch": 0.3047498486432935, + "grad_norm": 0.7290367484092712, + "learning_rate": 9.454776766005476e-06, + "loss": 0.8181, + "step": 5537 + }, + { + "epoch": 0.3048048874456492, + "grad_norm": 0.773728609085083, + "learning_rate": 9.45457991601275e-06, + "loss": 0.8454, + "step": 5538 + }, + { + "epoch": 0.30485992624800484, + "grad_norm": 0.792169451713562, + "learning_rate": 9.454383032540592e-06, + "loss": 0.8797, + "step": 5539 + }, + { + "epoch": 0.30491496505036053, + "grad_norm": 0.7478733658790588, + "learning_rate": 9.454186115590485e-06, + "loss": 0.7544, + "step": 5540 + }, + { + "epoch": 0.30497000385271616, + "grad_norm": 0.8527306318283081, + "learning_rate": 9.453989165163906e-06, + "loss": 0.8379, + "step": 5541 + }, + { + "epoch": 0.3050250426550718, + "grad_norm": 0.8829329013824463, + "learning_rate": 9.453792181262337e-06, + "loss": 0.7643, + "step": 5542 + }, + { + "epoch": 0.3050800814574275, + "grad_norm": 0.9477338790893555, + "learning_rate": 9.453595163887258e-06, + "loss": 0.7414, + "step": 5543 + }, + { + "epoch": 0.3051351202597831, + "grad_norm": 0.8311536312103271, + "learning_rate": 9.453398113040151e-06, + "loss": 0.8133, + "step": 5544 + }, + { + "epoch": 0.3051901590621388, + "grad_norm": 0.8035525679588318, + "learning_rate": 9.453201028722497e-06, + "loss": 0.7841, + "step": 5545 + }, + { + "epoch": 0.30524519786449444, + "grad_norm": 0.7779183983802795, + "learning_rate": 9.453003910935775e-06, + "loss": 0.7696, + "step": 5546 + }, + { + "epoch": 0.30530023666685013, + "grad_norm": 0.7843946218490601, + "learning_rate": 9.452806759681465e-06, + "loss": 0.6018, + "step": 5547 + }, + { + "epoch": 0.30535527546920577, + "grad_norm": 0.7215032577514648, + "learning_rate": 9.452609574961053e-06, + "loss": 0.7457, + "step": 5548 + }, + { + "epoch": 0.30541031427156146, + "grad_norm": 0.9628198742866516, + "learning_rate": 9.452412356776021e-06, + "loss": 0.8061, + "step": 5549 + }, + { + "epoch": 0.3054653530739171, + "grad_norm": 0.9468308687210083, + "learning_rate": 9.452215105127848e-06, + "loss": 0.7909, + "step": 5550 + }, + { + "epoch": 0.3055203918762728, + "grad_norm": 0.876402735710144, + "learning_rate": 9.452017820018017e-06, + "loss": 0.69, + "step": 5551 + }, + { + "epoch": 0.3055754306786284, + "grad_norm": 1.03409743309021, + "learning_rate": 9.451820501448014e-06, + "loss": 0.8375, + "step": 5552 + }, + { + "epoch": 0.3056304694809841, + "grad_norm": 0.8057541847229004, + "learning_rate": 9.45162314941932e-06, + "loss": 0.7704, + "step": 5553 + }, + { + "epoch": 0.30568550828333974, + "grad_norm": 0.7256304025650024, + "learning_rate": 9.451425763933417e-06, + "loss": 0.7819, + "step": 5554 + }, + { + "epoch": 0.3057405470856954, + "grad_norm": 0.7982180118560791, + "learning_rate": 9.451228344991788e-06, + "loss": 0.8094, + "step": 5555 + }, + { + "epoch": 0.30579558588805106, + "grad_norm": 1.0314620733261108, + "learning_rate": 9.45103089259592e-06, + "loss": 0.7777, + "step": 5556 + }, + { + "epoch": 0.30585062469040675, + "grad_norm": 0.6948755383491516, + "learning_rate": 9.450833406747294e-06, + "loss": 0.7189, + "step": 5557 + }, + { + "epoch": 0.3059056634927624, + "grad_norm": 0.7412117719650269, + "learning_rate": 9.450635887447396e-06, + "loss": 0.783, + "step": 5558 + }, + { + "epoch": 0.30596070229511807, + "grad_norm": 0.7394647002220154, + "learning_rate": 9.450438334697711e-06, + "loss": 0.7888, + "step": 5559 + }, + { + "epoch": 0.3060157410974737, + "grad_norm": 0.692701518535614, + "learning_rate": 9.450240748499725e-06, + "loss": 0.7427, + "step": 5560 + }, + { + "epoch": 0.3060707798998294, + "grad_norm": 0.6854925751686096, + "learning_rate": 9.450043128854916e-06, + "loss": 0.7877, + "step": 5561 + }, + { + "epoch": 0.306125818702185, + "grad_norm": 0.8073517680168152, + "learning_rate": 9.449845475764776e-06, + "loss": 0.8715, + "step": 5562 + }, + { + "epoch": 0.3061808575045407, + "grad_norm": 0.9672908186912537, + "learning_rate": 9.449647789230789e-06, + "loss": 0.782, + "step": 5563 + }, + { + "epoch": 0.30623589630689635, + "grad_norm": 0.7409735918045044, + "learning_rate": 9.44945006925444e-06, + "loss": 0.7956, + "step": 5564 + }, + { + "epoch": 0.30629093510925204, + "grad_norm": 0.7839213609695435, + "learning_rate": 9.449252315837215e-06, + "loss": 0.7559, + "step": 5565 + }, + { + "epoch": 0.30634597391160767, + "grad_norm": 0.668393075466156, + "learning_rate": 9.449054528980602e-06, + "loss": 0.717, + "step": 5566 + }, + { + "epoch": 0.30640101271396336, + "grad_norm": 0.8818438053131104, + "learning_rate": 9.448856708686084e-06, + "loss": 0.7801, + "step": 5567 + }, + { + "epoch": 0.306456051516319, + "grad_norm": 0.7331361770629883, + "learning_rate": 9.44865885495515e-06, + "loss": 0.6999, + "step": 5568 + }, + { + "epoch": 0.3065110903186747, + "grad_norm": 0.7818138599395752, + "learning_rate": 9.448460967789288e-06, + "loss": 0.7437, + "step": 5569 + }, + { + "epoch": 0.3065661291210303, + "grad_norm": 0.7713417410850525, + "learning_rate": 9.448263047189985e-06, + "loss": 0.8523, + "step": 5570 + }, + { + "epoch": 0.306621167923386, + "grad_norm": 0.7152866125106812, + "learning_rate": 9.448065093158726e-06, + "loss": 0.7706, + "step": 5571 + }, + { + "epoch": 0.30667620672574164, + "grad_norm": 0.7486638426780701, + "learning_rate": 9.447867105697e-06, + "loss": 0.7738, + "step": 5572 + }, + { + "epoch": 0.30673124552809733, + "grad_norm": 0.7014918923377991, + "learning_rate": 9.447669084806297e-06, + "loss": 0.7013, + "step": 5573 + }, + { + "epoch": 0.30678628433045296, + "grad_norm": 0.8328303694725037, + "learning_rate": 9.447471030488102e-06, + "loss": 0.8113, + "step": 5574 + }, + { + "epoch": 0.30684132313280865, + "grad_norm": 0.6800024509429932, + "learning_rate": 9.447272942743906e-06, + "loss": 0.6786, + "step": 5575 + }, + { + "epoch": 0.3068963619351643, + "grad_norm": 0.6827595829963684, + "learning_rate": 9.447074821575198e-06, + "loss": 0.812, + "step": 5576 + }, + { + "epoch": 0.30695140073752, + "grad_norm": 0.8775614500045776, + "learning_rate": 9.446876666983465e-06, + "loss": 0.7683, + "step": 5577 + }, + { + "epoch": 0.3070064395398756, + "grad_norm": 0.7440332174301147, + "learning_rate": 9.446678478970198e-06, + "loss": 0.7152, + "step": 5578 + }, + { + "epoch": 0.3070614783422313, + "grad_norm": 0.7031408548355103, + "learning_rate": 9.446480257536885e-06, + "loss": 0.7603, + "step": 5579 + }, + { + "epoch": 0.30711651714458693, + "grad_norm": 0.8419817090034485, + "learning_rate": 9.446282002685019e-06, + "loss": 0.9939, + "step": 5580 + }, + { + "epoch": 0.3071715559469426, + "grad_norm": 0.7622908353805542, + "learning_rate": 9.446083714416085e-06, + "loss": 0.8682, + "step": 5581 + }, + { + "epoch": 0.30722659474929825, + "grad_norm": 0.7341362833976746, + "learning_rate": 9.445885392731576e-06, + "loss": 0.848, + "step": 5582 + }, + { + "epoch": 0.30728163355165394, + "grad_norm": 0.7248286604881287, + "learning_rate": 9.445687037632984e-06, + "loss": 0.7699, + "step": 5583 + }, + { + "epoch": 0.3073366723540096, + "grad_norm": 0.9409947991371155, + "learning_rate": 9.445488649121797e-06, + "loss": 1.0051, + "step": 5584 + }, + { + "epoch": 0.3073917111563652, + "grad_norm": 0.7279968857765198, + "learning_rate": 9.445290227199509e-06, + "loss": 0.8001, + "step": 5585 + }, + { + "epoch": 0.3074467499587209, + "grad_norm": 0.7904797196388245, + "learning_rate": 9.445091771867607e-06, + "loss": 0.8892, + "step": 5586 + }, + { + "epoch": 0.30750178876107653, + "grad_norm": 0.7090430855751038, + "learning_rate": 9.444893283127587e-06, + "loss": 0.5983, + "step": 5587 + }, + { + "epoch": 0.3075568275634322, + "grad_norm": 0.8363901376724243, + "learning_rate": 9.444694760980939e-06, + "loss": 0.7688, + "step": 5588 + }, + { + "epoch": 0.30761186636578786, + "grad_norm": 0.7487169504165649, + "learning_rate": 9.444496205429152e-06, + "loss": 0.7585, + "step": 5589 + }, + { + "epoch": 0.30766690516814355, + "grad_norm": 0.750801146030426, + "learning_rate": 9.444297616473724e-06, + "loss": 0.6493, + "step": 5590 + }, + { + "epoch": 0.3077219439704992, + "grad_norm": 0.754846453666687, + "learning_rate": 9.444098994116144e-06, + "loss": 0.8528, + "step": 5591 + }, + { + "epoch": 0.30777698277285487, + "grad_norm": 0.7088152766227722, + "learning_rate": 9.443900338357907e-06, + "loss": 0.7927, + "step": 5592 + }, + { + "epoch": 0.3078320215752105, + "grad_norm": 0.7077113389968872, + "learning_rate": 9.443701649200503e-06, + "loss": 0.7996, + "step": 5593 + }, + { + "epoch": 0.3078870603775662, + "grad_norm": 0.732982873916626, + "learning_rate": 9.443502926645427e-06, + "loss": 0.7473, + "step": 5594 + }, + { + "epoch": 0.3079420991799218, + "grad_norm": 0.7068434357643127, + "learning_rate": 9.443304170694174e-06, + "loss": 0.7575, + "step": 5595 + }, + { + "epoch": 0.3079971379822775, + "grad_norm": 0.7703887224197388, + "learning_rate": 9.443105381348234e-06, + "loss": 0.8157, + "step": 5596 + }, + { + "epoch": 0.30805217678463315, + "grad_norm": 0.806924045085907, + "learning_rate": 9.442906558609103e-06, + "loss": 0.7572, + "step": 5597 + }, + { + "epoch": 0.30810721558698884, + "grad_norm": 0.8364617824554443, + "learning_rate": 9.442707702478278e-06, + "loss": 0.7491, + "step": 5598 + }, + { + "epoch": 0.30816225438934447, + "grad_norm": 0.9269624352455139, + "learning_rate": 9.442508812957249e-06, + "loss": 0.8746, + "step": 5599 + }, + { + "epoch": 0.30821729319170016, + "grad_norm": 0.7308455109596252, + "learning_rate": 9.442309890047515e-06, + "loss": 0.8068, + "step": 5600 + }, + { + "epoch": 0.3082723319940558, + "grad_norm": 0.812622606754303, + "learning_rate": 9.442110933750567e-06, + "loss": 0.9137, + "step": 5601 + }, + { + "epoch": 0.3083273707964115, + "grad_norm": 0.7100754976272583, + "learning_rate": 9.441911944067905e-06, + "loss": 0.7471, + "step": 5602 + }, + { + "epoch": 0.3083824095987671, + "grad_norm": 0.760208010673523, + "learning_rate": 9.44171292100102e-06, + "loss": 0.8243, + "step": 5603 + }, + { + "epoch": 0.3084374484011228, + "grad_norm": 0.6931812763214111, + "learning_rate": 9.44151386455141e-06, + "loss": 0.7523, + "step": 5604 + }, + { + "epoch": 0.30849248720347844, + "grad_norm": 0.6584734916687012, + "learning_rate": 9.44131477472057e-06, + "loss": 0.6929, + "step": 5605 + }, + { + "epoch": 0.3085475260058341, + "grad_norm": 0.977661669254303, + "learning_rate": 9.441115651509997e-06, + "loss": 0.8003, + "step": 5606 + }, + { + "epoch": 0.30860256480818976, + "grad_norm": 0.650434672832489, + "learning_rate": 9.440916494921189e-06, + "loss": 0.6629, + "step": 5607 + }, + { + "epoch": 0.30865760361054545, + "grad_norm": 0.6804447770118713, + "learning_rate": 9.44071730495564e-06, + "loss": 0.7216, + "step": 5608 + }, + { + "epoch": 0.3087126424129011, + "grad_norm": 0.7942929267883301, + "learning_rate": 9.44051808161485e-06, + "loss": 0.7593, + "step": 5609 + }, + { + "epoch": 0.3087676812152568, + "grad_norm": 0.7069621086120605, + "learning_rate": 9.440318824900313e-06, + "loss": 0.7453, + "step": 5610 + }, + { + "epoch": 0.3088227200176124, + "grad_norm": 0.7903168797492981, + "learning_rate": 9.440119534813528e-06, + "loss": 0.8084, + "step": 5611 + }, + { + "epoch": 0.3088777588199681, + "grad_norm": 0.7828298807144165, + "learning_rate": 9.439920211355993e-06, + "loss": 0.7556, + "step": 5612 + }, + { + "epoch": 0.30893279762232373, + "grad_norm": 0.8118648529052734, + "learning_rate": 9.43972085452921e-06, + "loss": 0.8548, + "step": 5613 + }, + { + "epoch": 0.3089878364246794, + "grad_norm": 0.9169642329216003, + "learning_rate": 9.439521464334669e-06, + "loss": 0.833, + "step": 5614 + }, + { + "epoch": 0.30904287522703505, + "grad_norm": 0.7844422459602356, + "learning_rate": 9.439322040773875e-06, + "loss": 0.8363, + "step": 5615 + }, + { + "epoch": 0.30909791402939074, + "grad_norm": 1.4801305532455444, + "learning_rate": 9.439122583848324e-06, + "loss": 0.7617, + "step": 5616 + }, + { + "epoch": 0.3091529528317464, + "grad_norm": 0.7737647891044617, + "learning_rate": 9.438923093559517e-06, + "loss": 0.7224, + "step": 5617 + }, + { + "epoch": 0.30920799163410206, + "grad_norm": 0.7279127836227417, + "learning_rate": 9.438723569908952e-06, + "loss": 0.7783, + "step": 5618 + }, + { + "epoch": 0.3092630304364577, + "grad_norm": 0.7635996341705322, + "learning_rate": 9.438524012898127e-06, + "loss": 0.8408, + "step": 5619 + }, + { + "epoch": 0.3093180692388134, + "grad_norm": 0.818445086479187, + "learning_rate": 9.438324422528547e-06, + "loss": 0.8836, + "step": 5620 + }, + { + "epoch": 0.309373108041169, + "grad_norm": 0.8620640635490417, + "learning_rate": 9.438124798801706e-06, + "loss": 0.925, + "step": 5621 + }, + { + "epoch": 0.3094281468435247, + "grad_norm": 0.7294883728027344, + "learning_rate": 9.437925141719108e-06, + "loss": 0.8387, + "step": 5622 + }, + { + "epoch": 0.30948318564588034, + "grad_norm": 0.6696046590805054, + "learning_rate": 9.437725451282252e-06, + "loss": 0.6712, + "step": 5623 + }, + { + "epoch": 0.30953822444823603, + "grad_norm": 0.8200504779815674, + "learning_rate": 9.43752572749264e-06, + "loss": 0.8191, + "step": 5624 + }, + { + "epoch": 0.30959326325059167, + "grad_norm": 0.8440756797790527, + "learning_rate": 9.437325970351773e-06, + "loss": 0.7412, + "step": 5625 + }, + { + "epoch": 0.30964830205294736, + "grad_norm": 0.8550771474838257, + "learning_rate": 9.43712617986115e-06, + "loss": 0.7842, + "step": 5626 + }, + { + "epoch": 0.309703340855303, + "grad_norm": 0.8203451037406921, + "learning_rate": 9.436926356022275e-06, + "loss": 0.8298, + "step": 5627 + }, + { + "epoch": 0.3097583796576586, + "grad_norm": 1.0105336904525757, + "learning_rate": 9.436726498836651e-06, + "loss": 0.8416, + "step": 5628 + }, + { + "epoch": 0.3098134184600143, + "grad_norm": 0.7684324383735657, + "learning_rate": 9.436526608305777e-06, + "loss": 0.7051, + "step": 5629 + }, + { + "epoch": 0.30986845726236995, + "grad_norm": 0.7284610867500305, + "learning_rate": 9.436326684431157e-06, + "loss": 0.755, + "step": 5630 + }, + { + "epoch": 0.30992349606472563, + "grad_norm": 0.7125874161720276, + "learning_rate": 9.436126727214293e-06, + "loss": 0.7336, + "step": 5631 + }, + { + "epoch": 0.30997853486708127, + "grad_norm": 0.7008525729179382, + "learning_rate": 9.435926736656687e-06, + "loss": 0.7185, + "step": 5632 + }, + { + "epoch": 0.31003357366943696, + "grad_norm": 0.7087175250053406, + "learning_rate": 9.435726712759844e-06, + "loss": 0.717, + "step": 5633 + }, + { + "epoch": 0.3100886124717926, + "grad_norm": 0.7892497777938843, + "learning_rate": 9.435526655525267e-06, + "loss": 0.8308, + "step": 5634 + }, + { + "epoch": 0.3101436512741483, + "grad_norm": 0.733906626701355, + "learning_rate": 9.435326564954457e-06, + "loss": 0.7421, + "step": 5635 + }, + { + "epoch": 0.3101986900765039, + "grad_norm": 0.7874915599822998, + "learning_rate": 9.43512644104892e-06, + "loss": 0.8808, + "step": 5636 + }, + { + "epoch": 0.3102537288788596, + "grad_norm": 0.6849297881126404, + "learning_rate": 9.434926283810162e-06, + "loss": 0.7297, + "step": 5637 + }, + { + "epoch": 0.31030876768121524, + "grad_norm": 0.7847834825515747, + "learning_rate": 9.434726093239685e-06, + "loss": 0.7873, + "step": 5638 + }, + { + "epoch": 0.3103638064835709, + "grad_norm": 0.6999106407165527, + "learning_rate": 9.434525869338992e-06, + "loss": 0.7699, + "step": 5639 + }, + { + "epoch": 0.31041884528592656, + "grad_norm": 0.7662788033485413, + "learning_rate": 9.43432561210959e-06, + "loss": 0.7583, + "step": 5640 + }, + { + "epoch": 0.31047388408828225, + "grad_norm": 0.8336607217788696, + "learning_rate": 9.434125321552985e-06, + "loss": 0.7297, + "step": 5641 + }, + { + "epoch": 0.3105289228906379, + "grad_norm": 0.8038349151611328, + "learning_rate": 9.433924997670681e-06, + "loss": 0.798, + "step": 5642 + }, + { + "epoch": 0.31058396169299357, + "grad_norm": 0.6819794178009033, + "learning_rate": 9.433724640464181e-06, + "loss": 0.7951, + "step": 5643 + }, + { + "epoch": 0.3106390004953492, + "grad_norm": 0.916238009929657, + "learning_rate": 9.433524249934995e-06, + "loss": 0.7371, + "step": 5644 + }, + { + "epoch": 0.3106940392977049, + "grad_norm": 0.8390263915061951, + "learning_rate": 9.433323826084628e-06, + "loss": 0.8211, + "step": 5645 + }, + { + "epoch": 0.3107490781000605, + "grad_norm": 0.7957239747047424, + "learning_rate": 9.433123368914586e-06, + "loss": 0.8406, + "step": 5646 + }, + { + "epoch": 0.3108041169024162, + "grad_norm": 0.6771933436393738, + "learning_rate": 9.432922878426374e-06, + "loss": 0.7664, + "step": 5647 + }, + { + "epoch": 0.31085915570477185, + "grad_norm": 0.7874065041542053, + "learning_rate": 9.432722354621503e-06, + "loss": 0.7445, + "step": 5648 + }, + { + "epoch": 0.31091419450712754, + "grad_norm": 0.674749493598938, + "learning_rate": 9.432521797501475e-06, + "loss": 0.745, + "step": 5649 + }, + { + "epoch": 0.3109692333094832, + "grad_norm": 0.7695828676223755, + "learning_rate": 9.432321207067799e-06, + "loss": 0.7555, + "step": 5650 + }, + { + "epoch": 0.31102427211183886, + "grad_norm": 0.8050221800804138, + "learning_rate": 9.432120583321984e-06, + "loss": 0.8464, + "step": 5651 + }, + { + "epoch": 0.3110793109141945, + "grad_norm": 0.7242713570594788, + "learning_rate": 9.431919926265538e-06, + "loss": 0.7439, + "step": 5652 + }, + { + "epoch": 0.3111343497165502, + "grad_norm": 0.7372434735298157, + "learning_rate": 9.431719235899967e-06, + "loss": 0.7973, + "step": 5653 + }, + { + "epoch": 0.3111893885189058, + "grad_norm": 0.7573439478874207, + "learning_rate": 9.431518512226783e-06, + "loss": 0.8259, + "step": 5654 + }, + { + "epoch": 0.3112444273212615, + "grad_norm": 0.7098552584648132, + "learning_rate": 9.43131775524749e-06, + "loss": 0.8159, + "step": 5655 + }, + { + "epoch": 0.31129946612361714, + "grad_norm": 0.7804632186889648, + "learning_rate": 9.431116964963599e-06, + "loss": 0.7795, + "step": 5656 + }, + { + "epoch": 0.31135450492597283, + "grad_norm": 1.0158027410507202, + "learning_rate": 9.43091614137662e-06, + "loss": 0.7935, + "step": 5657 + }, + { + "epoch": 0.31140954372832846, + "grad_norm": 0.708238422870636, + "learning_rate": 9.430715284488059e-06, + "loss": 0.7592, + "step": 5658 + }, + { + "epoch": 0.31146458253068415, + "grad_norm": 0.7086984515190125, + "learning_rate": 9.43051439429943e-06, + "loss": 0.7303, + "step": 5659 + }, + { + "epoch": 0.3115196213330398, + "grad_norm": 0.7620081305503845, + "learning_rate": 9.43031347081224e-06, + "loss": 0.7429, + "step": 5660 + }, + { + "epoch": 0.3115746601353955, + "grad_norm": 0.746126115322113, + "learning_rate": 9.430112514028e-06, + "loss": 0.8836, + "step": 5661 + }, + { + "epoch": 0.3116296989377511, + "grad_norm": 0.9113686680793762, + "learning_rate": 9.429911523948221e-06, + "loss": 0.6343, + "step": 5662 + }, + { + "epoch": 0.3116847377401068, + "grad_norm": 0.700890839099884, + "learning_rate": 9.429710500574413e-06, + "loss": 0.8201, + "step": 5663 + }, + { + "epoch": 0.31173977654246243, + "grad_norm": 0.7428706288337708, + "learning_rate": 9.429509443908085e-06, + "loss": 0.6838, + "step": 5664 + }, + { + "epoch": 0.3117948153448181, + "grad_norm": 0.851725697517395, + "learning_rate": 9.429308353950752e-06, + "loss": 0.7151, + "step": 5665 + }, + { + "epoch": 0.31184985414717376, + "grad_norm": 0.8555309176445007, + "learning_rate": 9.42910723070392e-06, + "loss": 0.7384, + "step": 5666 + }, + { + "epoch": 0.31190489294952944, + "grad_norm": 0.735927939414978, + "learning_rate": 9.428906074169107e-06, + "loss": 0.6911, + "step": 5667 + }, + { + "epoch": 0.3119599317518851, + "grad_norm": 0.8007609844207764, + "learning_rate": 9.42870488434782e-06, + "loss": 0.869, + "step": 5668 + }, + { + "epoch": 0.31201497055424077, + "grad_norm": 0.7604133486747742, + "learning_rate": 9.42850366124157e-06, + "loss": 0.7633, + "step": 5669 + }, + { + "epoch": 0.3120700093565964, + "grad_norm": 0.8181144595146179, + "learning_rate": 9.428302404851875e-06, + "loss": 0.7631, + "step": 5670 + }, + { + "epoch": 0.31212504815895203, + "grad_norm": 0.7115523219108582, + "learning_rate": 9.428101115180243e-06, + "loss": 0.734, + "step": 5671 + }, + { + "epoch": 0.3121800869613077, + "grad_norm": 0.7165855765342712, + "learning_rate": 9.42789979222819e-06, + "loss": 0.8068, + "step": 5672 + }, + { + "epoch": 0.31223512576366336, + "grad_norm": 0.6515665650367737, + "learning_rate": 9.427698435997225e-06, + "loss": 0.6946, + "step": 5673 + }, + { + "epoch": 0.31229016456601905, + "grad_norm": 0.7692676186561584, + "learning_rate": 9.427497046488867e-06, + "loss": 0.7387, + "step": 5674 + }, + { + "epoch": 0.3123452033683747, + "grad_norm": 0.70064777135849, + "learning_rate": 9.427295623704625e-06, + "loss": 0.7976, + "step": 5675 + }, + { + "epoch": 0.31240024217073037, + "grad_norm": 0.7464852333068848, + "learning_rate": 9.427094167646013e-06, + "loss": 0.7574, + "step": 5676 + }, + { + "epoch": 0.312455280973086, + "grad_norm": 0.7721675634384155, + "learning_rate": 9.426892678314548e-06, + "loss": 0.7405, + "step": 5677 + }, + { + "epoch": 0.3125103197754417, + "grad_norm": 0.6581596732139587, + "learning_rate": 9.42669115571174e-06, + "loss": 0.6972, + "step": 5678 + }, + { + "epoch": 0.3125653585777973, + "grad_norm": 0.8722662329673767, + "learning_rate": 9.426489599839108e-06, + "loss": 0.8073, + "step": 5679 + }, + { + "epoch": 0.312620397380153, + "grad_norm": 0.6800306439399719, + "learning_rate": 9.426288010698165e-06, + "loss": 0.7721, + "step": 5680 + }, + { + "epoch": 0.31267543618250865, + "grad_norm": 0.7443979382514954, + "learning_rate": 9.426086388290428e-06, + "loss": 0.7719, + "step": 5681 + }, + { + "epoch": 0.31273047498486434, + "grad_norm": 0.7818729877471924, + "learning_rate": 9.425884732617407e-06, + "loss": 0.7815, + "step": 5682 + }, + { + "epoch": 0.31278551378721997, + "grad_norm": 0.7640877366065979, + "learning_rate": 9.425683043680624e-06, + "loss": 0.8315, + "step": 5683 + }, + { + "epoch": 0.31284055258957566, + "grad_norm": 0.6871064305305481, + "learning_rate": 9.42548132148159e-06, + "loss": 0.8017, + "step": 5684 + }, + { + "epoch": 0.3128955913919313, + "grad_norm": 0.8394801616668701, + "learning_rate": 9.425279566021824e-06, + "loss": 0.763, + "step": 5685 + }, + { + "epoch": 0.312950630194287, + "grad_norm": 0.7104960083961487, + "learning_rate": 9.42507777730284e-06, + "loss": 0.7991, + "step": 5686 + }, + { + "epoch": 0.3130056689966426, + "grad_norm": 0.7820347547531128, + "learning_rate": 9.424875955326159e-06, + "loss": 0.825, + "step": 5687 + }, + { + "epoch": 0.3130607077989983, + "grad_norm": 0.783343493938446, + "learning_rate": 9.424674100093292e-06, + "loss": 0.8189, + "step": 5688 + }, + { + "epoch": 0.31311574660135394, + "grad_norm": 0.7998474836349487, + "learning_rate": 9.42447221160576e-06, + "loss": 0.7382, + "step": 5689 + }, + { + "epoch": 0.31317078540370963, + "grad_norm": 0.7232120633125305, + "learning_rate": 9.424270289865078e-06, + "loss": 0.8556, + "step": 5690 + }, + { + "epoch": 0.31322582420606526, + "grad_norm": 0.7944191694259644, + "learning_rate": 9.424068334872764e-06, + "loss": 0.8272, + "step": 5691 + }, + { + "epoch": 0.31328086300842095, + "grad_norm": 0.7951859831809998, + "learning_rate": 9.42386634663034e-06, + "loss": 0.7612, + "step": 5692 + }, + { + "epoch": 0.3133359018107766, + "grad_norm": 1.394667387008667, + "learning_rate": 9.423664325139318e-06, + "loss": 0.8108, + "step": 5693 + }, + { + "epoch": 0.3133909406131323, + "grad_norm": 0.868886411190033, + "learning_rate": 9.42346227040122e-06, + "loss": 0.8308, + "step": 5694 + }, + { + "epoch": 0.3134459794154879, + "grad_norm": 0.9442586302757263, + "learning_rate": 9.423260182417563e-06, + "loss": 0.9145, + "step": 5695 + }, + { + "epoch": 0.3135010182178436, + "grad_norm": 0.7432793974876404, + "learning_rate": 9.423058061189868e-06, + "loss": 0.7715, + "step": 5696 + }, + { + "epoch": 0.31355605702019923, + "grad_norm": 0.7221946120262146, + "learning_rate": 9.422855906719652e-06, + "loss": 0.7588, + "step": 5697 + }, + { + "epoch": 0.3136110958225549, + "grad_norm": 0.7459834814071655, + "learning_rate": 9.422653719008434e-06, + "loss": 0.7834, + "step": 5698 + }, + { + "epoch": 0.31366613462491055, + "grad_norm": 0.8562330007553101, + "learning_rate": 9.422451498057737e-06, + "loss": 0.6994, + "step": 5699 + }, + { + "epoch": 0.31372117342726624, + "grad_norm": 0.672696053981781, + "learning_rate": 9.422249243869075e-06, + "loss": 0.7201, + "step": 5700 + }, + { + "epoch": 0.3137762122296219, + "grad_norm": 0.7459990382194519, + "learning_rate": 9.422046956443973e-06, + "loss": 0.7663, + "step": 5701 + }, + { + "epoch": 0.31383125103197757, + "grad_norm": 0.9653169512748718, + "learning_rate": 9.42184463578395e-06, + "loss": 0.8899, + "step": 5702 + }, + { + "epoch": 0.3138862898343332, + "grad_norm": 0.7137778997421265, + "learning_rate": 9.421642281890526e-06, + "loss": 0.74, + "step": 5703 + }, + { + "epoch": 0.3139413286366889, + "grad_norm": 0.6961745619773865, + "learning_rate": 9.421439894765222e-06, + "loss": 0.7309, + "step": 5704 + }, + { + "epoch": 0.3139963674390445, + "grad_norm": 0.7843212485313416, + "learning_rate": 9.421237474409559e-06, + "loss": 0.8654, + "step": 5705 + }, + { + "epoch": 0.3140514062414002, + "grad_norm": 0.7560604810714722, + "learning_rate": 9.42103502082506e-06, + "loss": 0.7949, + "step": 5706 + }, + { + "epoch": 0.31410644504375584, + "grad_norm": 0.756200909614563, + "learning_rate": 9.420832534013245e-06, + "loss": 0.7315, + "step": 5707 + }, + { + "epoch": 0.31416148384611153, + "grad_norm": 0.7857967615127563, + "learning_rate": 9.420630013975635e-06, + "loss": 0.7698, + "step": 5708 + }, + { + "epoch": 0.31421652264846717, + "grad_norm": 0.6943809986114502, + "learning_rate": 9.420427460713754e-06, + "loss": 0.7691, + "step": 5709 + }, + { + "epoch": 0.31427156145082286, + "grad_norm": 0.7460532188415527, + "learning_rate": 9.420224874229123e-06, + "loss": 0.7679, + "step": 5710 + }, + { + "epoch": 0.3143266002531785, + "grad_norm": 0.764406144618988, + "learning_rate": 9.420022254523265e-06, + "loss": 0.9545, + "step": 5711 + }, + { + "epoch": 0.3143816390555342, + "grad_norm": 0.7191083431243896, + "learning_rate": 9.419819601597703e-06, + "loss": 0.728, + "step": 5712 + }, + { + "epoch": 0.3144366778578898, + "grad_norm": 0.8799699544906616, + "learning_rate": 9.419616915453959e-06, + "loss": 0.6911, + "step": 5713 + }, + { + "epoch": 0.31449171666024545, + "grad_norm": 0.7505975365638733, + "learning_rate": 9.419414196093558e-06, + "loss": 0.7953, + "step": 5714 + }, + { + "epoch": 0.31454675546260114, + "grad_norm": 0.7575502395629883, + "learning_rate": 9.419211443518023e-06, + "loss": 0.7752, + "step": 5715 + }, + { + "epoch": 0.31460179426495677, + "grad_norm": 0.7220337986946106, + "learning_rate": 9.419008657728879e-06, + "loss": 0.7894, + "step": 5716 + }, + { + "epoch": 0.31465683306731246, + "grad_norm": 0.7797306776046753, + "learning_rate": 9.418805838727648e-06, + "loss": 0.7582, + "step": 5717 + }, + { + "epoch": 0.3147118718696681, + "grad_norm": 0.9011242985725403, + "learning_rate": 9.418602986515855e-06, + "loss": 0.7379, + "step": 5718 + }, + { + "epoch": 0.3147669106720238, + "grad_norm": 0.7568445801734924, + "learning_rate": 9.418400101095025e-06, + "loss": 0.8003, + "step": 5719 + }, + { + "epoch": 0.3148219494743794, + "grad_norm": 0.6810547709465027, + "learning_rate": 9.418197182466681e-06, + "loss": 0.7186, + "step": 5720 + }, + { + "epoch": 0.3148769882767351, + "grad_norm": 0.7390284538269043, + "learning_rate": 9.417994230632352e-06, + "loss": 0.7478, + "step": 5721 + }, + { + "epoch": 0.31493202707909074, + "grad_norm": 0.695286214351654, + "learning_rate": 9.41779124559356e-06, + "loss": 0.7467, + "step": 5722 + }, + { + "epoch": 0.3149870658814464, + "grad_norm": 0.7783445715904236, + "learning_rate": 9.41758822735183e-06, + "loss": 0.824, + "step": 5723 + }, + { + "epoch": 0.31504210468380206, + "grad_norm": 0.7176268696784973, + "learning_rate": 9.41738517590869e-06, + "loss": 0.7596, + "step": 5724 + }, + { + "epoch": 0.31509714348615775, + "grad_norm": 0.7829678058624268, + "learning_rate": 9.417182091265668e-06, + "loss": 0.8184, + "step": 5725 + }, + { + "epoch": 0.3151521822885134, + "grad_norm": 0.7461703419685364, + "learning_rate": 9.416978973424286e-06, + "loss": 0.8732, + "step": 5726 + }, + { + "epoch": 0.31520722109086907, + "grad_norm": 0.7186999320983887, + "learning_rate": 9.416775822386073e-06, + "loss": 0.6878, + "step": 5727 + }, + { + "epoch": 0.3152622598932247, + "grad_norm": 0.6775033473968506, + "learning_rate": 9.416572638152553e-06, + "loss": 0.7211, + "step": 5728 + }, + { + "epoch": 0.3153172986955804, + "grad_norm": 0.6845641732215881, + "learning_rate": 9.416369420725258e-06, + "loss": 0.7282, + "step": 5729 + }, + { + "epoch": 0.31537233749793603, + "grad_norm": 0.8301281929016113, + "learning_rate": 9.416166170105712e-06, + "loss": 0.7999, + "step": 5730 + }, + { + "epoch": 0.3154273763002917, + "grad_norm": 0.8487183451652527, + "learning_rate": 9.415962886295442e-06, + "loss": 0.8202, + "step": 5731 + }, + { + "epoch": 0.31548241510264735, + "grad_norm": 0.74607914686203, + "learning_rate": 9.415759569295979e-06, + "loss": 0.7552, + "step": 5732 + }, + { + "epoch": 0.31553745390500304, + "grad_norm": 0.7774194478988647, + "learning_rate": 9.415556219108846e-06, + "loss": 0.7847, + "step": 5733 + }, + { + "epoch": 0.3155924927073587, + "grad_norm": 0.7782126069068909, + "learning_rate": 9.415352835735576e-06, + "loss": 0.8001, + "step": 5734 + }, + { + "epoch": 0.31564753150971436, + "grad_norm": 0.7577764987945557, + "learning_rate": 9.415149419177698e-06, + "loss": 0.8262, + "step": 5735 + }, + { + "epoch": 0.31570257031207, + "grad_norm": 0.7949855327606201, + "learning_rate": 9.414945969436737e-06, + "loss": 0.8259, + "step": 5736 + }, + { + "epoch": 0.3157576091144257, + "grad_norm": 0.7670153379440308, + "learning_rate": 9.414742486514224e-06, + "loss": 0.7181, + "step": 5737 + }, + { + "epoch": 0.3158126479167813, + "grad_norm": 0.7852359414100647, + "learning_rate": 9.414538970411687e-06, + "loss": 0.8802, + "step": 5738 + }, + { + "epoch": 0.315867686719137, + "grad_norm": 0.8300517201423645, + "learning_rate": 9.414335421130658e-06, + "loss": 0.7665, + "step": 5739 + }, + { + "epoch": 0.31592272552149264, + "grad_norm": 0.7631614804267883, + "learning_rate": 9.414131838672666e-06, + "loss": 0.8864, + "step": 5740 + }, + { + "epoch": 0.31597776432384833, + "grad_norm": 0.7946471571922302, + "learning_rate": 9.41392822303924e-06, + "loss": 0.7587, + "step": 5741 + }, + { + "epoch": 0.31603280312620396, + "grad_norm": 0.7043818235397339, + "learning_rate": 9.413724574231912e-06, + "loss": 0.7793, + "step": 5742 + }, + { + "epoch": 0.31608784192855965, + "grad_norm": 0.7276063561439514, + "learning_rate": 9.41352089225221e-06, + "loss": 0.8064, + "step": 5743 + }, + { + "epoch": 0.3161428807309153, + "grad_norm": 0.7141419053077698, + "learning_rate": 9.413317177101667e-06, + "loss": 0.7251, + "step": 5744 + }, + { + "epoch": 0.316197919533271, + "grad_norm": 0.7961493730545044, + "learning_rate": 9.413113428781815e-06, + "loss": 0.8438, + "step": 5745 + }, + { + "epoch": 0.3162529583356266, + "grad_norm": 0.7046970129013062, + "learning_rate": 9.412909647294181e-06, + "loss": 0.8319, + "step": 5746 + }, + { + "epoch": 0.3163079971379823, + "grad_norm": 0.8231918215751648, + "learning_rate": 9.412705832640302e-06, + "loss": 0.7707, + "step": 5747 + }, + { + "epoch": 0.31636303594033793, + "grad_norm": 0.769840657711029, + "learning_rate": 9.412501984821705e-06, + "loss": 0.6819, + "step": 5748 + }, + { + "epoch": 0.3164180747426936, + "grad_norm": 0.7526834607124329, + "learning_rate": 9.412298103839925e-06, + "loss": 0.8106, + "step": 5749 + }, + { + "epoch": 0.31647311354504926, + "grad_norm": 0.6763152480125427, + "learning_rate": 9.412094189696494e-06, + "loss": 0.7577, + "step": 5750 + }, + { + "epoch": 0.31652815234740495, + "grad_norm": 0.8460820317268372, + "learning_rate": 9.411890242392945e-06, + "loss": 0.752, + "step": 5751 + }, + { + "epoch": 0.3165831911497606, + "grad_norm": 0.7610277533531189, + "learning_rate": 9.411686261930809e-06, + "loss": 0.7284, + "step": 5752 + }, + { + "epoch": 0.31663822995211627, + "grad_norm": 0.7596566081047058, + "learning_rate": 9.411482248311619e-06, + "loss": 0.8518, + "step": 5753 + }, + { + "epoch": 0.3166932687544719, + "grad_norm": 0.7615048885345459, + "learning_rate": 9.41127820153691e-06, + "loss": 0.8232, + "step": 5754 + }, + { + "epoch": 0.3167483075568276, + "grad_norm": 0.7882834672927856, + "learning_rate": 9.411074121608215e-06, + "loss": 0.8682, + "step": 5755 + }, + { + "epoch": 0.3168033463591832, + "grad_norm": 0.748002827167511, + "learning_rate": 9.410870008527067e-06, + "loss": 0.7934, + "step": 5756 + }, + { + "epoch": 0.31685838516153886, + "grad_norm": 0.7677696943283081, + "learning_rate": 9.410665862295003e-06, + "loss": 0.8114, + "step": 5757 + }, + { + "epoch": 0.31691342396389455, + "grad_norm": 0.8966217041015625, + "learning_rate": 9.410461682913552e-06, + "loss": 0.8005, + "step": 5758 + }, + { + "epoch": 0.3169684627662502, + "grad_norm": 0.8769435286521912, + "learning_rate": 9.410257470384253e-06, + "loss": 0.7935, + "step": 5759 + }, + { + "epoch": 0.31702350156860587, + "grad_norm": 0.9828680753707886, + "learning_rate": 9.41005322470864e-06, + "loss": 0.8182, + "step": 5760 + }, + { + "epoch": 0.3170785403709615, + "grad_norm": 0.7340976595878601, + "learning_rate": 9.409848945888245e-06, + "loss": 0.7832, + "step": 5761 + }, + { + "epoch": 0.3171335791733172, + "grad_norm": 0.7516821622848511, + "learning_rate": 9.409644633924609e-06, + "loss": 0.8223, + "step": 5762 + }, + { + "epoch": 0.3171886179756728, + "grad_norm": 0.7556331157684326, + "learning_rate": 9.409440288819263e-06, + "loss": 0.7631, + "step": 5763 + }, + { + "epoch": 0.3172436567780285, + "grad_norm": 0.6182114481925964, + "learning_rate": 9.409235910573743e-06, + "loss": 0.558, + "step": 5764 + }, + { + "epoch": 0.31729869558038415, + "grad_norm": 0.7854578495025635, + "learning_rate": 9.409031499189586e-06, + "loss": 0.8496, + "step": 5765 + }, + { + "epoch": 0.31735373438273984, + "grad_norm": 0.7246551513671875, + "learning_rate": 9.40882705466833e-06, + "loss": 0.8407, + "step": 5766 + }, + { + "epoch": 0.31740877318509547, + "grad_norm": 1.089107632637024, + "learning_rate": 9.40862257701151e-06, + "loss": 0.8363, + "step": 5767 + }, + { + "epoch": 0.31746381198745116, + "grad_norm": 0.9886558055877686, + "learning_rate": 9.408418066220664e-06, + "loss": 0.6888, + "step": 5768 + }, + { + "epoch": 0.3175188507898068, + "grad_norm": 0.8724960088729858, + "learning_rate": 9.408213522297325e-06, + "loss": 0.7717, + "step": 5769 + }, + { + "epoch": 0.3175738895921625, + "grad_norm": 0.7453228831291199, + "learning_rate": 9.408008945243035e-06, + "loss": 0.7081, + "step": 5770 + }, + { + "epoch": 0.3176289283945181, + "grad_norm": 0.7601909637451172, + "learning_rate": 9.40780433505933e-06, + "loss": 0.7974, + "step": 5771 + }, + { + "epoch": 0.3176839671968738, + "grad_norm": 0.7704907655715942, + "learning_rate": 9.407599691747746e-06, + "loss": 0.7521, + "step": 5772 + }, + { + "epoch": 0.31773900599922944, + "grad_norm": 0.7639214396476746, + "learning_rate": 9.407395015309824e-06, + "loss": 0.7888, + "step": 5773 + }, + { + "epoch": 0.31779404480158513, + "grad_norm": 0.711355984210968, + "learning_rate": 9.4071903057471e-06, + "loss": 0.7482, + "step": 5774 + }, + { + "epoch": 0.31784908360394076, + "grad_norm": 0.6097242832183838, + "learning_rate": 9.406985563061114e-06, + "loss": 0.6533, + "step": 5775 + }, + { + "epoch": 0.31790412240629645, + "grad_norm": 0.807133138179779, + "learning_rate": 9.406780787253402e-06, + "loss": 0.7788, + "step": 5776 + }, + { + "epoch": 0.3179591612086521, + "grad_norm": 0.6938545107841492, + "learning_rate": 9.406575978325508e-06, + "loss": 0.8046, + "step": 5777 + }, + { + "epoch": 0.3180142000110078, + "grad_norm": 0.848858118057251, + "learning_rate": 9.406371136278968e-06, + "loss": 0.8481, + "step": 5778 + }, + { + "epoch": 0.3180692388133634, + "grad_norm": 0.8496920466423035, + "learning_rate": 9.40616626111532e-06, + "loss": 0.8172, + "step": 5779 + }, + { + "epoch": 0.3181242776157191, + "grad_norm": 0.8169928193092346, + "learning_rate": 9.405961352836107e-06, + "loss": 0.792, + "step": 5780 + }, + { + "epoch": 0.31817931641807473, + "grad_norm": 0.9380607604980469, + "learning_rate": 9.405756411442868e-06, + "loss": 0.8371, + "step": 5781 + }, + { + "epoch": 0.3182343552204304, + "grad_norm": 0.6938190460205078, + "learning_rate": 9.405551436937144e-06, + "loss": 0.7825, + "step": 5782 + }, + { + "epoch": 0.31828939402278605, + "grad_norm": 0.7726871371269226, + "learning_rate": 9.405346429320473e-06, + "loss": 0.6481, + "step": 5783 + }, + { + "epoch": 0.31834443282514174, + "grad_norm": 0.77762770652771, + "learning_rate": 9.4051413885944e-06, + "loss": 0.6916, + "step": 5784 + }, + { + "epoch": 0.3183994716274974, + "grad_norm": 0.7580817341804504, + "learning_rate": 9.404936314760459e-06, + "loss": 0.8222, + "step": 5785 + }, + { + "epoch": 0.31845451042985307, + "grad_norm": 0.6984102725982666, + "learning_rate": 9.4047312078202e-06, + "loss": 0.707, + "step": 5786 + }, + { + "epoch": 0.3185095492322087, + "grad_norm": 0.6887965202331543, + "learning_rate": 9.404526067775159e-06, + "loss": 0.7289, + "step": 5787 + }, + { + "epoch": 0.3185645880345644, + "grad_norm": 0.7022155523300171, + "learning_rate": 9.404320894626879e-06, + "loss": 0.741, + "step": 5788 + }, + { + "epoch": 0.31861962683692, + "grad_norm": 0.8007381558418274, + "learning_rate": 9.404115688376903e-06, + "loss": 0.8332, + "step": 5789 + }, + { + "epoch": 0.3186746656392757, + "grad_norm": 0.6985924243927002, + "learning_rate": 9.40391044902677e-06, + "loss": 0.7849, + "step": 5790 + }, + { + "epoch": 0.31872970444163135, + "grad_norm": 0.771060585975647, + "learning_rate": 9.403705176578028e-06, + "loss": 0.8728, + "step": 5791 + }, + { + "epoch": 0.31878474324398703, + "grad_norm": 0.6976794600486755, + "learning_rate": 9.403499871032214e-06, + "loss": 0.7621, + "step": 5792 + }, + { + "epoch": 0.31883978204634267, + "grad_norm": 0.7552126049995422, + "learning_rate": 9.403294532390876e-06, + "loss": 0.7641, + "step": 5793 + }, + { + "epoch": 0.31889482084869836, + "grad_norm": 1.0032007694244385, + "learning_rate": 9.403089160655553e-06, + "loss": 0.8497, + "step": 5794 + }, + { + "epoch": 0.318949859651054, + "grad_norm": 0.7193583250045776, + "learning_rate": 9.402883755827792e-06, + "loss": 0.7991, + "step": 5795 + }, + { + "epoch": 0.3190048984534097, + "grad_norm": 0.7665852308273315, + "learning_rate": 9.402678317909135e-06, + "loss": 0.7692, + "step": 5796 + }, + { + "epoch": 0.3190599372557653, + "grad_norm": 0.7514237761497498, + "learning_rate": 9.402472846901125e-06, + "loss": 0.7388, + "step": 5797 + }, + { + "epoch": 0.319114976058121, + "grad_norm": 0.6817325353622437, + "learning_rate": 9.402267342805309e-06, + "loss": 0.7249, + "step": 5798 + }, + { + "epoch": 0.31917001486047664, + "grad_norm": 0.7659624218940735, + "learning_rate": 9.402061805623229e-06, + "loss": 0.755, + "step": 5799 + }, + { + "epoch": 0.31922505366283227, + "grad_norm": 0.7860668301582336, + "learning_rate": 9.401856235356431e-06, + "loss": 0.8175, + "step": 5800 + }, + { + "epoch": 0.31928009246518796, + "grad_norm": 0.714030921459198, + "learning_rate": 9.401650632006461e-06, + "loss": 0.7359, + "step": 5801 + }, + { + "epoch": 0.3193351312675436, + "grad_norm": 0.6052672266960144, + "learning_rate": 9.401444995574862e-06, + "loss": 0.6167, + "step": 5802 + }, + { + "epoch": 0.3193901700698993, + "grad_norm": 0.7960858941078186, + "learning_rate": 9.40123932606318e-06, + "loss": 0.7542, + "step": 5803 + }, + { + "epoch": 0.3194452088722549, + "grad_norm": 0.7926718592643738, + "learning_rate": 9.401033623472962e-06, + "loss": 0.8292, + "step": 5804 + }, + { + "epoch": 0.3195002476746106, + "grad_norm": 0.7950098514556885, + "learning_rate": 9.400827887805754e-06, + "loss": 0.9332, + "step": 5805 + }, + { + "epoch": 0.31955528647696624, + "grad_norm": 0.7564939260482788, + "learning_rate": 9.400622119063101e-06, + "loss": 0.7217, + "step": 5806 + }, + { + "epoch": 0.3196103252793219, + "grad_norm": 0.7582511901855469, + "learning_rate": 9.40041631724655e-06, + "loss": 0.723, + "step": 5807 + }, + { + "epoch": 0.31966536408167756, + "grad_norm": 0.8826366066932678, + "learning_rate": 9.400210482357648e-06, + "loss": 0.6977, + "step": 5808 + }, + { + "epoch": 0.31972040288403325, + "grad_norm": 0.7029523253440857, + "learning_rate": 9.400004614397941e-06, + "loss": 0.6949, + "step": 5809 + }, + { + "epoch": 0.3197754416863889, + "grad_norm": 0.7651532888412476, + "learning_rate": 9.399798713368979e-06, + "loss": 0.7158, + "step": 5810 + }, + { + "epoch": 0.3198304804887446, + "grad_norm": 0.9379491806030273, + "learning_rate": 9.399592779272307e-06, + "loss": 0.7639, + "step": 5811 + }, + { + "epoch": 0.3198855192911002, + "grad_norm": 0.7945839762687683, + "learning_rate": 9.399386812109474e-06, + "loss": 0.8175, + "step": 5812 + }, + { + "epoch": 0.3199405580934559, + "grad_norm": 0.9462345242500305, + "learning_rate": 9.399180811882025e-06, + "loss": 0.6635, + "step": 5813 + }, + { + "epoch": 0.31999559689581153, + "grad_norm": 1.0449726581573486, + "learning_rate": 9.398974778591513e-06, + "loss": 0.789, + "step": 5814 + }, + { + "epoch": 0.3200506356981672, + "grad_norm": 0.8295683860778809, + "learning_rate": 9.398768712239483e-06, + "loss": 0.7937, + "step": 5815 + }, + { + "epoch": 0.32010567450052285, + "grad_norm": 0.7578030228614807, + "learning_rate": 9.398562612827485e-06, + "loss": 0.8291, + "step": 5816 + }, + { + "epoch": 0.32016071330287854, + "grad_norm": 0.804563581943512, + "learning_rate": 9.398356480357068e-06, + "loss": 0.7604, + "step": 5817 + }, + { + "epoch": 0.3202157521052342, + "grad_norm": 0.8073337078094482, + "learning_rate": 9.39815031482978e-06, + "loss": 0.8288, + "step": 5818 + }, + { + "epoch": 0.32027079090758986, + "grad_norm": 0.8054978251457214, + "learning_rate": 9.397944116247173e-06, + "loss": 0.819, + "step": 5819 + }, + { + "epoch": 0.3203258297099455, + "grad_norm": 0.8304697871208191, + "learning_rate": 9.397737884610794e-06, + "loss": 0.7991, + "step": 5820 + }, + { + "epoch": 0.3203808685123012, + "grad_norm": 0.784662663936615, + "learning_rate": 9.397531619922195e-06, + "loss": 0.763, + "step": 5821 + }, + { + "epoch": 0.3204359073146568, + "grad_norm": 0.726046085357666, + "learning_rate": 9.397325322182926e-06, + "loss": 0.7926, + "step": 5822 + }, + { + "epoch": 0.3204909461170125, + "grad_norm": 0.7291107773780823, + "learning_rate": 9.397118991394535e-06, + "loss": 0.6871, + "step": 5823 + }, + { + "epoch": 0.32054598491936814, + "grad_norm": 0.7870203256607056, + "learning_rate": 9.396912627558577e-06, + "loss": 0.7827, + "step": 5824 + }, + { + "epoch": 0.32060102372172383, + "grad_norm": 0.8665844798088074, + "learning_rate": 9.3967062306766e-06, + "loss": 0.8098, + "step": 5825 + }, + { + "epoch": 0.32065606252407947, + "grad_norm": 0.7743843793869019, + "learning_rate": 9.396499800750157e-06, + "loss": 0.835, + "step": 5826 + }, + { + "epoch": 0.32071110132643516, + "grad_norm": 0.7724023461341858, + "learning_rate": 9.396293337780796e-06, + "loss": 0.8928, + "step": 5827 + }, + { + "epoch": 0.3207661401287908, + "grad_norm": 0.7497217655181885, + "learning_rate": 9.39608684177007e-06, + "loss": 0.8035, + "step": 5828 + }, + { + "epoch": 0.3208211789311465, + "grad_norm": 0.8346971869468689, + "learning_rate": 9.395880312719536e-06, + "loss": 0.8879, + "step": 5829 + }, + { + "epoch": 0.3208762177335021, + "grad_norm": 0.836626410484314, + "learning_rate": 9.39567375063074e-06, + "loss": 0.8523, + "step": 5830 + }, + { + "epoch": 0.3209312565358578, + "grad_norm": 0.734428346157074, + "learning_rate": 9.395467155505237e-06, + "loss": 0.7568, + "step": 5831 + }, + { + "epoch": 0.32098629533821343, + "grad_norm": 0.6620383858680725, + "learning_rate": 9.39526052734458e-06, + "loss": 0.7296, + "step": 5832 + }, + { + "epoch": 0.3210413341405691, + "grad_norm": 0.9356484413146973, + "learning_rate": 9.39505386615032e-06, + "loss": 0.8233, + "step": 5833 + }, + { + "epoch": 0.32109637294292476, + "grad_norm": 0.9238032698631287, + "learning_rate": 9.394847171924013e-06, + "loss": 0.7397, + "step": 5834 + }, + { + "epoch": 0.32115141174528045, + "grad_norm": 0.7161185145378113, + "learning_rate": 9.39464044466721e-06, + "loss": 0.7541, + "step": 5835 + }, + { + "epoch": 0.3212064505476361, + "grad_norm": 0.8381507396697998, + "learning_rate": 9.394433684381467e-06, + "loss": 0.7839, + "step": 5836 + }, + { + "epoch": 0.32126148934999177, + "grad_norm": 0.8299819231033325, + "learning_rate": 9.394226891068337e-06, + "loss": 0.871, + "step": 5837 + }, + { + "epoch": 0.3213165281523474, + "grad_norm": 0.7443987131118774, + "learning_rate": 9.394020064729372e-06, + "loss": 0.7661, + "step": 5838 + }, + { + "epoch": 0.3213715669547031, + "grad_norm": 0.7084206938743591, + "learning_rate": 9.393813205366128e-06, + "loss": 0.7609, + "step": 5839 + }, + { + "epoch": 0.3214266057570587, + "grad_norm": 0.7443114519119263, + "learning_rate": 9.393606312980164e-06, + "loss": 0.8189, + "step": 5840 + }, + { + "epoch": 0.3214816445594144, + "grad_norm": 0.7157652974128723, + "learning_rate": 9.393399387573028e-06, + "loss": 0.8369, + "step": 5841 + }, + { + "epoch": 0.32153668336177005, + "grad_norm": 0.709507942199707, + "learning_rate": 9.393192429146278e-06, + "loss": 0.7314, + "step": 5842 + }, + { + "epoch": 0.3215917221641257, + "grad_norm": 0.7704687714576721, + "learning_rate": 9.39298543770147e-06, + "loss": 0.8793, + "step": 5843 + }, + { + "epoch": 0.32164676096648137, + "grad_norm": 0.8123828172683716, + "learning_rate": 9.39277841324016e-06, + "loss": 0.8748, + "step": 5844 + }, + { + "epoch": 0.321701799768837, + "grad_norm": 0.6951777338981628, + "learning_rate": 9.392571355763903e-06, + "loss": 0.7883, + "step": 5845 + }, + { + "epoch": 0.3217568385711927, + "grad_norm": 0.6753274202346802, + "learning_rate": 9.392364265274256e-06, + "loss": 0.7292, + "step": 5846 + }, + { + "epoch": 0.3218118773735483, + "grad_norm": 0.7940227389335632, + "learning_rate": 9.392157141772775e-06, + "loss": 0.7919, + "step": 5847 + }, + { + "epoch": 0.321866916175904, + "grad_norm": 0.6706317067146301, + "learning_rate": 9.391949985261016e-06, + "loss": 0.6791, + "step": 5848 + }, + { + "epoch": 0.32192195497825965, + "grad_norm": 0.7898741960525513, + "learning_rate": 9.391742795740537e-06, + "loss": 0.7539, + "step": 5849 + }, + { + "epoch": 0.32197699378061534, + "grad_norm": 0.7623887658119202, + "learning_rate": 9.391535573212895e-06, + "loss": 0.7891, + "step": 5850 + }, + { + "epoch": 0.322032032582971, + "grad_norm": 0.6852909326553345, + "learning_rate": 9.391328317679647e-06, + "loss": 0.6587, + "step": 5851 + }, + { + "epoch": 0.32208707138532666, + "grad_norm": 0.7944231033325195, + "learning_rate": 9.39112102914235e-06, + "loss": 0.8316, + "step": 5852 + }, + { + "epoch": 0.3221421101876823, + "grad_norm": 0.6720889806747437, + "learning_rate": 9.390913707602563e-06, + "loss": 0.7791, + "step": 5853 + }, + { + "epoch": 0.322197148990038, + "grad_norm": 0.7482234239578247, + "learning_rate": 9.390706353061845e-06, + "loss": 0.826, + "step": 5854 + }, + { + "epoch": 0.3222521877923936, + "grad_norm": 0.6821579933166504, + "learning_rate": 9.390498965521752e-06, + "loss": 0.7183, + "step": 5855 + }, + { + "epoch": 0.3223072265947493, + "grad_norm": 0.755171537399292, + "learning_rate": 9.390291544983845e-06, + "loss": 0.6887, + "step": 5856 + }, + { + "epoch": 0.32236226539710494, + "grad_norm": 0.748824417591095, + "learning_rate": 9.39008409144968e-06, + "loss": 0.7169, + "step": 5857 + }, + { + "epoch": 0.32241730419946063, + "grad_norm": 0.7479343414306641, + "learning_rate": 9.38987660492082e-06, + "loss": 0.8122, + "step": 5858 + }, + { + "epoch": 0.32247234300181626, + "grad_norm": 0.7459376454353333, + "learning_rate": 9.389669085398823e-06, + "loss": 0.7782, + "step": 5859 + }, + { + "epoch": 0.32252738180417195, + "grad_norm": 0.7016253471374512, + "learning_rate": 9.389461532885246e-06, + "loss": 0.7866, + "step": 5860 + }, + { + "epoch": 0.3225824206065276, + "grad_norm": 0.6711822152137756, + "learning_rate": 9.389253947381654e-06, + "loss": 0.7223, + "step": 5861 + }, + { + "epoch": 0.3226374594088833, + "grad_norm": 0.855045735836029, + "learning_rate": 9.389046328889602e-06, + "loss": 0.7327, + "step": 5862 + }, + { + "epoch": 0.3226924982112389, + "grad_norm": 0.7309823632240295, + "learning_rate": 9.388838677410654e-06, + "loss": 0.7737, + "step": 5863 + }, + { + "epoch": 0.3227475370135946, + "grad_norm": 0.7737841010093689, + "learning_rate": 9.388630992946369e-06, + "loss": 0.7061, + "step": 5864 + }, + { + "epoch": 0.32280257581595023, + "grad_norm": 0.9448195099830627, + "learning_rate": 9.388423275498307e-06, + "loss": 0.8382, + "step": 5865 + }, + { + "epoch": 0.3228576146183059, + "grad_norm": 0.7348229885101318, + "learning_rate": 9.388215525068032e-06, + "loss": 0.8317, + "step": 5866 + }, + { + "epoch": 0.32291265342066156, + "grad_norm": 1.2628185749053955, + "learning_rate": 9.388007741657103e-06, + "loss": 0.7959, + "step": 5867 + }, + { + "epoch": 0.32296769222301724, + "grad_norm": 0.7730327844619751, + "learning_rate": 9.387799925267083e-06, + "loss": 0.7455, + "step": 5868 + }, + { + "epoch": 0.3230227310253729, + "grad_norm": 0.8273047804832458, + "learning_rate": 9.387592075899532e-06, + "loss": 0.877, + "step": 5869 + }, + { + "epoch": 0.32307776982772857, + "grad_norm": 0.7413405776023865, + "learning_rate": 9.387384193556014e-06, + "loss": 0.7734, + "step": 5870 + }, + { + "epoch": 0.3231328086300842, + "grad_norm": 1.0173207521438599, + "learning_rate": 9.387176278238092e-06, + "loss": 0.8674, + "step": 5871 + }, + { + "epoch": 0.3231878474324399, + "grad_norm": 0.7741677761077881, + "learning_rate": 9.386968329947327e-06, + "loss": 0.8226, + "step": 5872 + }, + { + "epoch": 0.3232428862347955, + "grad_norm": 0.8912034034729004, + "learning_rate": 9.38676034868528e-06, + "loss": 0.7977, + "step": 5873 + }, + { + "epoch": 0.3232979250371512, + "grad_norm": 0.7343642711639404, + "learning_rate": 9.386552334453519e-06, + "loss": 0.7639, + "step": 5874 + }, + { + "epoch": 0.32335296383950685, + "grad_norm": 0.697225034236908, + "learning_rate": 9.386344287253603e-06, + "loss": 0.6801, + "step": 5875 + }, + { + "epoch": 0.32340800264186254, + "grad_norm": 0.7082511186599731, + "learning_rate": 9.386136207087099e-06, + "loss": 0.746, + "step": 5876 + }, + { + "epoch": 0.32346304144421817, + "grad_norm": 0.671419620513916, + "learning_rate": 9.38592809395557e-06, + "loss": 0.7023, + "step": 5877 + }, + { + "epoch": 0.32351808024657386, + "grad_norm": 0.775834321975708, + "learning_rate": 9.385719947860579e-06, + "loss": 0.7797, + "step": 5878 + }, + { + "epoch": 0.3235731190489295, + "grad_norm": 0.7867023348808289, + "learning_rate": 9.38551176880369e-06, + "loss": 0.8165, + "step": 5879 + }, + { + "epoch": 0.3236281578512852, + "grad_norm": 0.7099916934967041, + "learning_rate": 9.385303556786469e-06, + "loss": 0.7598, + "step": 5880 + }, + { + "epoch": 0.3236831966536408, + "grad_norm": 0.7362176179885864, + "learning_rate": 9.385095311810479e-06, + "loss": 0.8002, + "step": 5881 + }, + { + "epoch": 0.3237382354559965, + "grad_norm": 0.7310882806777954, + "learning_rate": 9.384887033877288e-06, + "loss": 0.7641, + "step": 5882 + }, + { + "epoch": 0.32379327425835214, + "grad_norm": 0.7769907116889954, + "learning_rate": 9.384678722988458e-06, + "loss": 0.7938, + "step": 5883 + }, + { + "epoch": 0.3238483130607078, + "grad_norm": 0.9913623929023743, + "learning_rate": 9.384470379145558e-06, + "loss": 0.8203, + "step": 5884 + }, + { + "epoch": 0.32390335186306346, + "grad_norm": 0.8765702247619629, + "learning_rate": 9.384262002350153e-06, + "loss": 0.9343, + "step": 5885 + }, + { + "epoch": 0.3239583906654191, + "grad_norm": 0.8122400641441345, + "learning_rate": 9.384053592603808e-06, + "loss": 0.8325, + "step": 5886 + }, + { + "epoch": 0.3240134294677748, + "grad_norm": 0.7600317597389221, + "learning_rate": 9.383845149908089e-06, + "loss": 0.8335, + "step": 5887 + }, + { + "epoch": 0.3240684682701304, + "grad_norm": 0.9472025632858276, + "learning_rate": 9.383636674264563e-06, + "loss": 0.7265, + "step": 5888 + }, + { + "epoch": 0.3241235070724861, + "grad_norm": 0.6961854100227356, + "learning_rate": 9.383428165674797e-06, + "loss": 0.6962, + "step": 5889 + }, + { + "epoch": 0.32417854587484174, + "grad_norm": 0.7032504081726074, + "learning_rate": 9.38321962414036e-06, + "loss": 0.7627, + "step": 5890 + }, + { + "epoch": 0.32423358467719743, + "grad_norm": 0.7727648019790649, + "learning_rate": 9.383011049662816e-06, + "loss": 0.757, + "step": 5891 + }, + { + "epoch": 0.32428862347955306, + "grad_norm": 0.7263824343681335, + "learning_rate": 9.382802442243735e-06, + "loss": 0.8057, + "step": 5892 + }, + { + "epoch": 0.32434366228190875, + "grad_norm": 0.7576926350593567, + "learning_rate": 9.382593801884683e-06, + "loss": 0.763, + "step": 5893 + }, + { + "epoch": 0.3243987010842644, + "grad_norm": 0.7468064427375793, + "learning_rate": 9.38238512858723e-06, + "loss": 0.731, + "step": 5894 + }, + { + "epoch": 0.3244537398866201, + "grad_norm": 0.9570005536079407, + "learning_rate": 9.382176422352944e-06, + "loss": 0.7985, + "step": 5895 + }, + { + "epoch": 0.3245087786889757, + "grad_norm": 0.7296027541160583, + "learning_rate": 9.381967683183393e-06, + "loss": 0.8117, + "step": 5896 + }, + { + "epoch": 0.3245638174913314, + "grad_norm": 0.7330880165100098, + "learning_rate": 9.381758911080145e-06, + "loss": 0.7229, + "step": 5897 + }, + { + "epoch": 0.32461885629368703, + "grad_norm": 0.7247695922851562, + "learning_rate": 9.38155010604477e-06, + "loss": 0.7704, + "step": 5898 + }, + { + "epoch": 0.3246738950960427, + "grad_norm": 0.8011599779129028, + "learning_rate": 9.381341268078836e-06, + "loss": 0.6982, + "step": 5899 + }, + { + "epoch": 0.32472893389839835, + "grad_norm": 0.7931570410728455, + "learning_rate": 9.381132397183917e-06, + "loss": 0.8188, + "step": 5900 + }, + { + "epoch": 0.32478397270075404, + "grad_norm": 0.7469003200531006, + "learning_rate": 9.380923493361577e-06, + "loss": 0.7638, + "step": 5901 + }, + { + "epoch": 0.3248390115031097, + "grad_norm": 0.7442750334739685, + "learning_rate": 9.380714556613391e-06, + "loss": 0.8134, + "step": 5902 + }, + { + "epoch": 0.32489405030546537, + "grad_norm": 0.8014402985572815, + "learning_rate": 9.380505586940925e-06, + "loss": 0.838, + "step": 5903 + }, + { + "epoch": 0.324949089107821, + "grad_norm": 0.7287543416023254, + "learning_rate": 9.380296584345751e-06, + "loss": 0.7317, + "step": 5904 + }, + { + "epoch": 0.3250041279101767, + "grad_norm": 0.7754266262054443, + "learning_rate": 9.380087548829441e-06, + "loss": 0.7205, + "step": 5905 + }, + { + "epoch": 0.3250591667125323, + "grad_norm": 0.7439714074134827, + "learning_rate": 9.379878480393567e-06, + "loss": 0.821, + "step": 5906 + }, + { + "epoch": 0.325114205514888, + "grad_norm": 0.7142870426177979, + "learning_rate": 9.379669379039698e-06, + "loss": 0.7462, + "step": 5907 + }, + { + "epoch": 0.32516924431724364, + "grad_norm": 0.6522948145866394, + "learning_rate": 9.379460244769407e-06, + "loss": 0.739, + "step": 5908 + }, + { + "epoch": 0.32522428311959933, + "grad_norm": 0.7879271507263184, + "learning_rate": 9.379251077584263e-06, + "loss": 0.719, + "step": 5909 + }, + { + "epoch": 0.32527932192195497, + "grad_norm": 0.6969109773635864, + "learning_rate": 9.379041877485842e-06, + "loss": 0.7517, + "step": 5910 + }, + { + "epoch": 0.32533436072431066, + "grad_norm": 0.736890971660614, + "learning_rate": 9.378832644475714e-06, + "loss": 0.7797, + "step": 5911 + }, + { + "epoch": 0.3253893995266663, + "grad_norm": 0.7504066824913025, + "learning_rate": 9.378623378555451e-06, + "loss": 0.7502, + "step": 5912 + }, + { + "epoch": 0.325444438329022, + "grad_norm": 0.9339223504066467, + "learning_rate": 9.378414079726629e-06, + "loss": 0.8842, + "step": 5913 + }, + { + "epoch": 0.3254994771313776, + "grad_norm": 1.08317232131958, + "learning_rate": 9.378204747990818e-06, + "loss": 0.7503, + "step": 5914 + }, + { + "epoch": 0.3255545159337333, + "grad_norm": 0.722665011882782, + "learning_rate": 9.37799538334959e-06, + "loss": 0.7825, + "step": 5915 + }, + { + "epoch": 0.32560955473608894, + "grad_norm": 0.7969509959220886, + "learning_rate": 9.377785985804521e-06, + "loss": 0.8678, + "step": 5916 + }, + { + "epoch": 0.3256645935384446, + "grad_norm": 0.7944697141647339, + "learning_rate": 9.377576555357187e-06, + "loss": 0.8067, + "step": 5917 + }, + { + "epoch": 0.32571963234080026, + "grad_norm": 0.905580461025238, + "learning_rate": 9.377367092009158e-06, + "loss": 0.7689, + "step": 5918 + }, + { + "epoch": 0.32577467114315595, + "grad_norm": 0.7428018450737, + "learning_rate": 9.37715759576201e-06, + "loss": 0.7748, + "step": 5919 + }, + { + "epoch": 0.3258297099455116, + "grad_norm": 0.7746098041534424, + "learning_rate": 9.376948066617316e-06, + "loss": 0.7235, + "step": 5920 + }, + { + "epoch": 0.32588474874786727, + "grad_norm": 0.6842886805534363, + "learning_rate": 9.376738504576653e-06, + "loss": 0.7697, + "step": 5921 + }, + { + "epoch": 0.3259397875502229, + "grad_norm": 0.7858961224555969, + "learning_rate": 9.376528909641595e-06, + "loss": 0.7746, + "step": 5922 + }, + { + "epoch": 0.3259948263525786, + "grad_norm": 0.7534621357917786, + "learning_rate": 9.376319281813717e-06, + "loss": 0.8183, + "step": 5923 + }, + { + "epoch": 0.3260498651549342, + "grad_norm": 1.2406045198440552, + "learning_rate": 9.376109621094594e-06, + "loss": 0.8173, + "step": 5924 + }, + { + "epoch": 0.3261049039572899, + "grad_norm": 0.740075945854187, + "learning_rate": 9.375899927485804e-06, + "loss": 0.725, + "step": 5925 + }, + { + "epoch": 0.32615994275964555, + "grad_norm": 0.8432604074478149, + "learning_rate": 9.375690200988921e-06, + "loss": 0.7805, + "step": 5926 + }, + { + "epoch": 0.32621498156200124, + "grad_norm": 0.7652943134307861, + "learning_rate": 9.37548044160552e-06, + "loss": 0.8609, + "step": 5927 + }, + { + "epoch": 0.32627002036435687, + "grad_norm": 0.7629607915878296, + "learning_rate": 9.37527064933718e-06, + "loss": 0.8776, + "step": 5928 + }, + { + "epoch": 0.3263250591667125, + "grad_norm": 0.8648995757102966, + "learning_rate": 9.375060824185479e-06, + "loss": 0.7543, + "step": 5929 + }, + { + "epoch": 0.3263800979690682, + "grad_norm": 0.8069457411766052, + "learning_rate": 9.374850966151989e-06, + "loss": 0.7995, + "step": 5930 + }, + { + "epoch": 0.32643513677142383, + "grad_norm": 0.7948445677757263, + "learning_rate": 9.374641075238293e-06, + "loss": 0.8312, + "step": 5931 + }, + { + "epoch": 0.3264901755737795, + "grad_norm": 0.7739841341972351, + "learning_rate": 9.374431151445963e-06, + "loss": 0.8442, + "step": 5932 + }, + { + "epoch": 0.32654521437613515, + "grad_norm": 0.7382220029830933, + "learning_rate": 9.374221194776583e-06, + "loss": 0.7519, + "step": 5933 + }, + { + "epoch": 0.32660025317849084, + "grad_norm": 0.7876916527748108, + "learning_rate": 9.374011205231725e-06, + "loss": 0.817, + "step": 5934 + }, + { + "epoch": 0.3266552919808465, + "grad_norm": 0.7175565958023071, + "learning_rate": 9.373801182812969e-06, + "loss": 0.7317, + "step": 5935 + }, + { + "epoch": 0.32671033078320216, + "grad_norm": 0.7739143967628479, + "learning_rate": 9.373591127521894e-06, + "loss": 0.8134, + "step": 5936 + }, + { + "epoch": 0.3267653695855578, + "grad_norm": 0.7388991713523865, + "learning_rate": 9.373381039360082e-06, + "loss": 0.8758, + "step": 5937 + }, + { + "epoch": 0.3268204083879135, + "grad_norm": 0.7393535375595093, + "learning_rate": 9.373170918329105e-06, + "loss": 0.7453, + "step": 5938 + }, + { + "epoch": 0.3268754471902691, + "grad_norm": 0.7168294191360474, + "learning_rate": 9.372960764430547e-06, + "loss": 0.6535, + "step": 5939 + }, + { + "epoch": 0.3269304859926248, + "grad_norm": 0.7472337484359741, + "learning_rate": 9.372750577665988e-06, + "loss": 0.8065, + "step": 5940 + }, + { + "epoch": 0.32698552479498044, + "grad_norm": 0.7211272120475769, + "learning_rate": 9.372540358037005e-06, + "loss": 0.7389, + "step": 5941 + }, + { + "epoch": 0.32704056359733613, + "grad_norm": 0.8097178339958191, + "learning_rate": 9.37233010554518e-06, + "loss": 0.8034, + "step": 5942 + }, + { + "epoch": 0.32709560239969176, + "grad_norm": 0.7929103970527649, + "learning_rate": 9.372119820192091e-06, + "loss": 0.796, + "step": 5943 + }, + { + "epoch": 0.32715064120204745, + "grad_norm": 0.701171875, + "learning_rate": 9.37190950197932e-06, + "loss": 0.7092, + "step": 5944 + }, + { + "epoch": 0.3272056800044031, + "grad_norm": 0.679142951965332, + "learning_rate": 9.371699150908448e-06, + "loss": 0.6995, + "step": 5945 + }, + { + "epoch": 0.3272607188067588, + "grad_norm": 0.7757906913757324, + "learning_rate": 9.371488766981057e-06, + "loss": 0.8662, + "step": 5946 + }, + { + "epoch": 0.3273157576091144, + "grad_norm": 0.8086597323417664, + "learning_rate": 9.371278350198724e-06, + "loss": 0.7455, + "step": 5947 + }, + { + "epoch": 0.3273707964114701, + "grad_norm": 0.6443416476249695, + "learning_rate": 9.371067900563033e-06, + "loss": 0.7262, + "step": 5948 + }, + { + "epoch": 0.32742583521382573, + "grad_norm": 0.8132354021072388, + "learning_rate": 9.370857418075567e-06, + "loss": 0.7841, + "step": 5949 + }, + { + "epoch": 0.3274808740161814, + "grad_norm": 0.6811150908470154, + "learning_rate": 9.370646902737907e-06, + "loss": 0.6955, + "step": 5950 + }, + { + "epoch": 0.32753591281853706, + "grad_norm": 0.8956614136695862, + "learning_rate": 9.370436354551633e-06, + "loss": 0.8218, + "step": 5951 + }, + { + "epoch": 0.32759095162089275, + "grad_norm": 0.6807655692100525, + "learning_rate": 9.370225773518332e-06, + "loss": 0.7869, + "step": 5952 + }, + { + "epoch": 0.3276459904232484, + "grad_norm": 0.7506592869758606, + "learning_rate": 9.37001515963958e-06, + "loss": 0.7975, + "step": 5953 + }, + { + "epoch": 0.32770102922560407, + "grad_norm": 0.7488718032836914, + "learning_rate": 9.369804512916966e-06, + "loss": 0.7611, + "step": 5954 + }, + { + "epoch": 0.3277560680279597, + "grad_norm": 0.734569251537323, + "learning_rate": 9.369593833352073e-06, + "loss": 0.8532, + "step": 5955 + }, + { + "epoch": 0.3278111068303154, + "grad_norm": 0.780170738697052, + "learning_rate": 9.36938312094648e-06, + "loss": 0.7766, + "step": 5956 + }, + { + "epoch": 0.327866145632671, + "grad_norm": 0.6329935193061829, + "learning_rate": 9.369172375701774e-06, + "loss": 0.6789, + "step": 5957 + }, + { + "epoch": 0.3279211844350267, + "grad_norm": 1.0177193880081177, + "learning_rate": 9.368961597619537e-06, + "loss": 0.8362, + "step": 5958 + }, + { + "epoch": 0.32797622323738235, + "grad_norm": 0.730696439743042, + "learning_rate": 9.368750786701354e-06, + "loss": 0.7696, + "step": 5959 + }, + { + "epoch": 0.32803126203973804, + "grad_norm": 0.7946468591690063, + "learning_rate": 9.36853994294881e-06, + "loss": 0.8559, + "step": 5960 + }, + { + "epoch": 0.32808630084209367, + "grad_norm": 0.9353142976760864, + "learning_rate": 9.368329066363489e-06, + "loss": 0.9041, + "step": 5961 + }, + { + "epoch": 0.32814133964444936, + "grad_norm": 0.7256187796592712, + "learning_rate": 9.368118156946977e-06, + "loss": 0.787, + "step": 5962 + }, + { + "epoch": 0.328196378446805, + "grad_norm": 0.7454268336296082, + "learning_rate": 9.367907214700858e-06, + "loss": 0.7255, + "step": 5963 + }, + { + "epoch": 0.3282514172491607, + "grad_norm": 0.7087902426719666, + "learning_rate": 9.367696239626716e-06, + "loss": 0.7166, + "step": 5964 + }, + { + "epoch": 0.3283064560515163, + "grad_norm": 0.8217566609382629, + "learning_rate": 9.36748523172614e-06, + "loss": 0.8351, + "step": 5965 + }, + { + "epoch": 0.328361494853872, + "grad_norm": 0.7712824940681458, + "learning_rate": 9.367274191000713e-06, + "loss": 0.7561, + "step": 5966 + }, + { + "epoch": 0.32841653365622764, + "grad_norm": 0.6798166036605835, + "learning_rate": 9.367063117452024e-06, + "loss": 0.7447, + "step": 5967 + }, + { + "epoch": 0.3284715724585833, + "grad_norm": 0.7139115929603577, + "learning_rate": 9.366852011081655e-06, + "loss": 0.7728, + "step": 5968 + }, + { + "epoch": 0.32852661126093896, + "grad_norm": 1.0488213300704956, + "learning_rate": 9.366640871891196e-06, + "loss": 0.8283, + "step": 5969 + }, + { + "epoch": 0.32858165006329465, + "grad_norm": 0.7939574122428894, + "learning_rate": 9.366429699882233e-06, + "loss": 0.849, + "step": 5970 + }, + { + "epoch": 0.3286366888656503, + "grad_norm": 0.7959052324295044, + "learning_rate": 9.366218495056356e-06, + "loss": 0.7469, + "step": 5971 + }, + { + "epoch": 0.3286917276680059, + "grad_norm": 0.7293235063552856, + "learning_rate": 9.366007257415146e-06, + "loss": 0.8537, + "step": 5972 + }, + { + "epoch": 0.3287467664703616, + "grad_norm": 0.7490390539169312, + "learning_rate": 9.365795986960196e-06, + "loss": 0.8166, + "step": 5973 + }, + { + "epoch": 0.32880180527271724, + "grad_norm": 0.6572316884994507, + "learning_rate": 9.365584683693093e-06, + "loss": 0.6919, + "step": 5974 + }, + { + "epoch": 0.32885684407507293, + "grad_norm": 0.7286609411239624, + "learning_rate": 9.365373347615421e-06, + "loss": 0.768, + "step": 5975 + }, + { + "epoch": 0.32891188287742856, + "grad_norm": 0.7798202037811279, + "learning_rate": 9.365161978728772e-06, + "loss": 0.788, + "step": 5976 + }, + { + "epoch": 0.32896692167978425, + "grad_norm": 0.7224245071411133, + "learning_rate": 9.364950577034737e-06, + "loss": 0.7551, + "step": 5977 + }, + { + "epoch": 0.3290219604821399, + "grad_norm": 0.7238701581954956, + "learning_rate": 9.364739142534898e-06, + "loss": 0.6663, + "step": 5978 + }, + { + "epoch": 0.3290769992844956, + "grad_norm": 0.8947147727012634, + "learning_rate": 9.36452767523085e-06, + "loss": 0.8559, + "step": 5979 + }, + { + "epoch": 0.3291320380868512, + "grad_norm": 0.7346563935279846, + "learning_rate": 9.36431617512418e-06, + "loss": 0.7915, + "step": 5980 + }, + { + "epoch": 0.3291870768892069, + "grad_norm": 0.7674046158790588, + "learning_rate": 9.364104642216479e-06, + "loss": 0.7643, + "step": 5981 + }, + { + "epoch": 0.32924211569156253, + "grad_norm": 0.7288179397583008, + "learning_rate": 9.363893076509335e-06, + "loss": 0.7796, + "step": 5982 + }, + { + "epoch": 0.3292971544939182, + "grad_norm": 0.6603766083717346, + "learning_rate": 9.363681478004339e-06, + "loss": 0.7035, + "step": 5983 + }, + { + "epoch": 0.32935219329627385, + "grad_norm": 0.7523066997528076, + "learning_rate": 9.36346984670308e-06, + "loss": 0.8196, + "step": 5984 + }, + { + "epoch": 0.32940723209862954, + "grad_norm": 0.730312168598175, + "learning_rate": 9.36325818260715e-06, + "loss": 0.7967, + "step": 5985 + }, + { + "epoch": 0.3294622709009852, + "grad_norm": 0.7341319918632507, + "learning_rate": 9.363046485718139e-06, + "loss": 0.8361, + "step": 5986 + }, + { + "epoch": 0.32951730970334087, + "grad_norm": 0.839894711971283, + "learning_rate": 9.36283475603764e-06, + "loss": 0.862, + "step": 5987 + }, + { + "epoch": 0.3295723485056965, + "grad_norm": 0.7794893980026245, + "learning_rate": 9.362622993567243e-06, + "loss": 0.8521, + "step": 5988 + }, + { + "epoch": 0.3296273873080522, + "grad_norm": 0.929410457611084, + "learning_rate": 9.362411198308538e-06, + "loss": 0.7644, + "step": 5989 + }, + { + "epoch": 0.3296824261104078, + "grad_norm": 0.7687333226203918, + "learning_rate": 9.362199370263118e-06, + "loss": 0.8047, + "step": 5990 + }, + { + "epoch": 0.3297374649127635, + "grad_norm": 0.8040616512298584, + "learning_rate": 9.361987509432576e-06, + "loss": 0.7574, + "step": 5991 + }, + { + "epoch": 0.32979250371511915, + "grad_norm": 0.7743237614631653, + "learning_rate": 9.361775615818503e-06, + "loss": 0.8491, + "step": 5992 + }, + { + "epoch": 0.32984754251747483, + "grad_norm": 1.2796664237976074, + "learning_rate": 9.361563689422493e-06, + "loss": 0.7975, + "step": 5993 + }, + { + "epoch": 0.32990258131983047, + "grad_norm": 0.9493466019630432, + "learning_rate": 9.361351730246136e-06, + "loss": 1.0258, + "step": 5994 + }, + { + "epoch": 0.32995762012218616, + "grad_norm": 0.7148050665855408, + "learning_rate": 9.36113973829103e-06, + "loss": 0.805, + "step": 5995 + }, + { + "epoch": 0.3300126589245418, + "grad_norm": 0.723426342010498, + "learning_rate": 9.360927713558762e-06, + "loss": 0.6886, + "step": 5996 + }, + { + "epoch": 0.3300676977268975, + "grad_norm": 0.8274679183959961, + "learning_rate": 9.360715656050929e-06, + "loss": 0.8559, + "step": 5997 + }, + { + "epoch": 0.3301227365292531, + "grad_norm": 0.7493795156478882, + "learning_rate": 9.360503565769126e-06, + "loss": 0.8266, + "step": 5998 + }, + { + "epoch": 0.3301777753316088, + "grad_norm": 0.7690125703811646, + "learning_rate": 9.360291442714944e-06, + "loss": 0.783, + "step": 5999 + }, + { + "epoch": 0.33023281413396444, + "grad_norm": 0.8740219473838806, + "learning_rate": 9.360079286889981e-06, + "loss": 0.8409, + "step": 6000 + }, + { + "epoch": 0.3302878529363201, + "grad_norm": 0.6931017637252808, + "learning_rate": 9.359867098295827e-06, + "loss": 0.7985, + "step": 6001 + }, + { + "epoch": 0.33034289173867576, + "grad_norm": 0.915532112121582, + "learning_rate": 9.35965487693408e-06, + "loss": 0.8718, + "step": 6002 + }, + { + "epoch": 0.33039793054103145, + "grad_norm": 0.7898837924003601, + "learning_rate": 9.359442622806332e-06, + "loss": 0.8571, + "step": 6003 + }, + { + "epoch": 0.3304529693433871, + "grad_norm": 0.8661002516746521, + "learning_rate": 9.359230335914182e-06, + "loss": 0.7963, + "step": 6004 + }, + { + "epoch": 0.33050800814574277, + "grad_norm": 0.7188493013381958, + "learning_rate": 9.359018016259223e-06, + "loss": 0.8188, + "step": 6005 + }, + { + "epoch": 0.3305630469480984, + "grad_norm": 0.8648282289505005, + "learning_rate": 9.358805663843051e-06, + "loss": 0.9136, + "step": 6006 + }, + { + "epoch": 0.3306180857504541, + "grad_norm": 0.8010255098342896, + "learning_rate": 9.358593278667265e-06, + "loss": 0.849, + "step": 6007 + }, + { + "epoch": 0.3306731245528097, + "grad_norm": 0.8128451108932495, + "learning_rate": 9.358380860733456e-06, + "loss": 0.8082, + "step": 6008 + }, + { + "epoch": 0.3307281633551654, + "grad_norm": 1.0003761053085327, + "learning_rate": 9.358168410043224e-06, + "loss": 0.9064, + "step": 6009 + }, + { + "epoch": 0.33078320215752105, + "grad_norm": 0.7412391901016235, + "learning_rate": 9.357955926598163e-06, + "loss": 0.8049, + "step": 6010 + }, + { + "epoch": 0.33083824095987674, + "grad_norm": 0.795615553855896, + "learning_rate": 9.357743410399875e-06, + "loss": 0.7923, + "step": 6011 + }, + { + "epoch": 0.3308932797622324, + "grad_norm": 0.8696123957633972, + "learning_rate": 9.357530861449953e-06, + "loss": 0.8543, + "step": 6012 + }, + { + "epoch": 0.33094831856458806, + "grad_norm": 0.8909900784492493, + "learning_rate": 9.357318279749994e-06, + "loss": 0.6157, + "step": 6013 + }, + { + "epoch": 0.3310033573669437, + "grad_norm": 0.7326250672340393, + "learning_rate": 9.357105665301597e-06, + "loss": 0.7647, + "step": 6014 + }, + { + "epoch": 0.33105839616929933, + "grad_norm": 0.8425576090812683, + "learning_rate": 9.356893018106364e-06, + "loss": 0.7832, + "step": 6015 + }, + { + "epoch": 0.331113434971655, + "grad_norm": 0.7404599785804749, + "learning_rate": 9.356680338165885e-06, + "loss": 0.7759, + "step": 6016 + }, + { + "epoch": 0.33116847377401065, + "grad_norm": 0.6935396790504456, + "learning_rate": 9.356467625481765e-06, + "loss": 0.7488, + "step": 6017 + }, + { + "epoch": 0.33122351257636634, + "grad_norm": 0.7799031138420105, + "learning_rate": 9.3562548800556e-06, + "loss": 0.7617, + "step": 6018 + }, + { + "epoch": 0.331278551378722, + "grad_norm": 0.7824636101722717, + "learning_rate": 9.35604210188899e-06, + "loss": 0.7936, + "step": 6019 + }, + { + "epoch": 0.33133359018107766, + "grad_norm": 0.7051861882209778, + "learning_rate": 9.355829290983531e-06, + "loss": 0.7869, + "step": 6020 + }, + { + "epoch": 0.3313886289834333, + "grad_norm": 0.8172006607055664, + "learning_rate": 9.355616447340826e-06, + "loss": 0.8888, + "step": 6021 + }, + { + "epoch": 0.331443667785789, + "grad_norm": 0.7263272404670715, + "learning_rate": 9.355403570962475e-06, + "loss": 0.8393, + "step": 6022 + }, + { + "epoch": 0.3314987065881446, + "grad_norm": 0.7143926620483398, + "learning_rate": 9.355190661850077e-06, + "loss": 0.6693, + "step": 6023 + }, + { + "epoch": 0.3315537453905003, + "grad_norm": 0.7294363975524902, + "learning_rate": 9.354977720005232e-06, + "loss": 0.8035, + "step": 6024 + }, + { + "epoch": 0.33160878419285594, + "grad_norm": 0.7072308659553528, + "learning_rate": 9.354764745429538e-06, + "loss": 0.761, + "step": 6025 + }, + { + "epoch": 0.33166382299521163, + "grad_norm": 0.6945865154266357, + "learning_rate": 9.3545517381246e-06, + "loss": 0.7212, + "step": 6026 + }, + { + "epoch": 0.33171886179756727, + "grad_norm": 0.7645060420036316, + "learning_rate": 9.354338698092016e-06, + "loss": 0.812, + "step": 6027 + }, + { + "epoch": 0.33177390059992296, + "grad_norm": 0.9494503140449524, + "learning_rate": 9.354125625333387e-06, + "loss": 0.9037, + "step": 6028 + }, + { + "epoch": 0.3318289394022786, + "grad_norm": 0.7311872243881226, + "learning_rate": 9.353912519850317e-06, + "loss": 0.7137, + "step": 6029 + }, + { + "epoch": 0.3318839782046343, + "grad_norm": 0.658562958240509, + "learning_rate": 9.353699381644405e-06, + "loss": 0.7048, + "step": 6030 + }, + { + "epoch": 0.3319390170069899, + "grad_norm": 0.8106339573860168, + "learning_rate": 9.353486210717253e-06, + "loss": 0.8905, + "step": 6031 + }, + { + "epoch": 0.3319940558093456, + "grad_norm": 0.8166239261627197, + "learning_rate": 9.353273007070465e-06, + "loss": 0.7011, + "step": 6032 + }, + { + "epoch": 0.33204909461170123, + "grad_norm": 0.730172872543335, + "learning_rate": 9.353059770705643e-06, + "loss": 0.6934, + "step": 6033 + }, + { + "epoch": 0.3321041334140569, + "grad_norm": 0.7633965611457825, + "learning_rate": 9.352846501624387e-06, + "loss": 0.7379, + "step": 6034 + }, + { + "epoch": 0.33215917221641256, + "grad_norm": 0.7786447405815125, + "learning_rate": 9.352633199828304e-06, + "loss": 0.8533, + "step": 6035 + }, + { + "epoch": 0.33221421101876825, + "grad_norm": 0.7211753726005554, + "learning_rate": 9.352419865318993e-06, + "loss": 0.815, + "step": 6036 + }, + { + "epoch": 0.3322692498211239, + "grad_norm": 0.6861024498939514, + "learning_rate": 9.352206498098062e-06, + "loss": 0.7678, + "step": 6037 + }, + { + "epoch": 0.33232428862347957, + "grad_norm": 0.7702088952064514, + "learning_rate": 9.35199309816711e-06, + "loss": 0.8463, + "step": 6038 + }, + { + "epoch": 0.3323793274258352, + "grad_norm": 0.7179547548294067, + "learning_rate": 9.351779665527742e-06, + "loss": 0.8315, + "step": 6039 + }, + { + "epoch": 0.3324343662281909, + "grad_norm": 0.8686990737915039, + "learning_rate": 9.351566200181565e-06, + "loss": 0.8396, + "step": 6040 + }, + { + "epoch": 0.3324894050305465, + "grad_norm": 0.7269062995910645, + "learning_rate": 9.351352702130181e-06, + "loss": 0.7126, + "step": 6041 + }, + { + "epoch": 0.3325444438329022, + "grad_norm": 0.7759222984313965, + "learning_rate": 9.351139171375195e-06, + "loss": 0.8383, + "step": 6042 + }, + { + "epoch": 0.33259948263525785, + "grad_norm": 0.6882128119468689, + "learning_rate": 9.350925607918212e-06, + "loss": 0.6371, + "step": 6043 + }, + { + "epoch": 0.33265452143761354, + "grad_norm": 0.7552365660667419, + "learning_rate": 9.350712011760834e-06, + "loss": 0.8018, + "step": 6044 + }, + { + "epoch": 0.33270956023996917, + "grad_norm": 0.8320692181587219, + "learning_rate": 9.350498382904672e-06, + "loss": 0.8556, + "step": 6045 + }, + { + "epoch": 0.33276459904232486, + "grad_norm": 0.7542223334312439, + "learning_rate": 9.350284721351326e-06, + "loss": 0.8006, + "step": 6046 + }, + { + "epoch": 0.3328196378446805, + "grad_norm": 1.2724859714508057, + "learning_rate": 9.350071027102406e-06, + "loss": 0.9253, + "step": 6047 + }, + { + "epoch": 0.3328746766470362, + "grad_norm": 0.731383204460144, + "learning_rate": 9.349857300159517e-06, + "loss": 0.83, + "step": 6048 + }, + { + "epoch": 0.3329297154493918, + "grad_norm": 0.731419026851654, + "learning_rate": 9.349643540524265e-06, + "loss": 0.779, + "step": 6049 + }, + { + "epoch": 0.3329847542517475, + "grad_norm": 0.8462278842926025, + "learning_rate": 9.349429748198256e-06, + "loss": 0.84, + "step": 6050 + }, + { + "epoch": 0.33303979305410314, + "grad_norm": 0.8199888467788696, + "learning_rate": 9.349215923183098e-06, + "loss": 0.844, + "step": 6051 + }, + { + "epoch": 0.33309483185645883, + "grad_norm": 0.8696722984313965, + "learning_rate": 9.349002065480397e-06, + "loss": 0.709, + "step": 6052 + }, + { + "epoch": 0.33314987065881446, + "grad_norm": 0.8484870195388794, + "learning_rate": 9.34878817509176e-06, + "loss": 0.7434, + "step": 6053 + }, + { + "epoch": 0.33320490946117015, + "grad_norm": 0.8392589688301086, + "learning_rate": 9.348574252018796e-06, + "loss": 0.8972, + "step": 6054 + }, + { + "epoch": 0.3332599482635258, + "grad_norm": 0.673829972743988, + "learning_rate": 9.34836029626311e-06, + "loss": 0.6789, + "step": 6055 + }, + { + "epoch": 0.3333149870658815, + "grad_norm": 0.6693649888038635, + "learning_rate": 9.348146307826315e-06, + "loss": 0.68, + "step": 6056 + }, + { + "epoch": 0.3333700258682371, + "grad_norm": 0.8516272306442261, + "learning_rate": 9.347932286710014e-06, + "loss": 0.8585, + "step": 6057 + }, + { + "epoch": 0.33342506467059274, + "grad_norm": 0.7431588768959045, + "learning_rate": 9.347718232915818e-06, + "loss": 0.8239, + "step": 6058 + }, + { + "epoch": 0.33348010347294843, + "grad_norm": 0.8823427557945251, + "learning_rate": 9.347504146445336e-06, + "loss": 0.845, + "step": 6059 + }, + { + "epoch": 0.33353514227530406, + "grad_norm": 0.7884035110473633, + "learning_rate": 9.347290027300177e-06, + "loss": 0.8503, + "step": 6060 + }, + { + "epoch": 0.33359018107765975, + "grad_norm": 0.841397225856781, + "learning_rate": 9.34707587548195e-06, + "loss": 0.7551, + "step": 6061 + }, + { + "epoch": 0.3336452198800154, + "grad_norm": 0.7592034935951233, + "learning_rate": 9.346861690992263e-06, + "loss": 0.8516, + "step": 6062 + }, + { + "epoch": 0.3337002586823711, + "grad_norm": 0.6925262212753296, + "learning_rate": 9.346647473832728e-06, + "loss": 0.7351, + "step": 6063 + }, + { + "epoch": 0.3337552974847267, + "grad_norm": 0.8152759075164795, + "learning_rate": 9.346433224004955e-06, + "loss": 0.7673, + "step": 6064 + }, + { + "epoch": 0.3338103362870824, + "grad_norm": 0.7383455038070679, + "learning_rate": 9.346218941510551e-06, + "loss": 0.7312, + "step": 6065 + }, + { + "epoch": 0.33386537508943803, + "grad_norm": 0.7905310392379761, + "learning_rate": 9.346004626351131e-06, + "loss": 0.7891, + "step": 6066 + }, + { + "epoch": 0.3339204138917937, + "grad_norm": 0.7032167315483093, + "learning_rate": 9.345790278528305e-06, + "loss": 0.8358, + "step": 6067 + }, + { + "epoch": 0.33397545269414936, + "grad_norm": 0.6415952444076538, + "learning_rate": 9.34557589804368e-06, + "loss": 0.6716, + "step": 6068 + }, + { + "epoch": 0.33403049149650504, + "grad_norm": 0.7558899521827698, + "learning_rate": 9.34536148489887e-06, + "loss": 0.781, + "step": 6069 + }, + { + "epoch": 0.3340855302988607, + "grad_norm": 0.8913301825523376, + "learning_rate": 9.345147039095485e-06, + "loss": 0.8482, + "step": 6070 + }, + { + "epoch": 0.33414056910121637, + "grad_norm": 0.768984854221344, + "learning_rate": 9.34493256063514e-06, + "loss": 0.7578, + "step": 6071 + }, + { + "epoch": 0.334195607903572, + "grad_norm": 0.7428637742996216, + "learning_rate": 9.344718049519445e-06, + "loss": 0.7812, + "step": 6072 + }, + { + "epoch": 0.3342506467059277, + "grad_norm": 0.7290430665016174, + "learning_rate": 9.344503505750012e-06, + "loss": 0.7536, + "step": 6073 + }, + { + "epoch": 0.3343056855082833, + "grad_norm": 0.7637680172920227, + "learning_rate": 9.344288929328453e-06, + "loss": 0.8576, + "step": 6074 + }, + { + "epoch": 0.334360724310639, + "grad_norm": 0.9568214416503906, + "learning_rate": 9.344074320256379e-06, + "loss": 0.897, + "step": 6075 + }, + { + "epoch": 0.33441576311299465, + "grad_norm": 0.7516217827796936, + "learning_rate": 9.34385967853541e-06, + "loss": 0.7853, + "step": 6076 + }, + { + "epoch": 0.33447080191535034, + "grad_norm": 0.833039402961731, + "learning_rate": 9.34364500416715e-06, + "loss": 0.702, + "step": 6077 + }, + { + "epoch": 0.33452584071770597, + "grad_norm": 0.8080580830574036, + "learning_rate": 9.34343029715322e-06, + "loss": 0.7867, + "step": 6078 + }, + { + "epoch": 0.33458087952006166, + "grad_norm": 0.8039596080780029, + "learning_rate": 9.343215557495229e-06, + "loss": 0.8221, + "step": 6079 + }, + { + "epoch": 0.3346359183224173, + "grad_norm": 0.7003986835479736, + "learning_rate": 9.343000785194794e-06, + "loss": 0.746, + "step": 6080 + }, + { + "epoch": 0.334690957124773, + "grad_norm": 0.6623722314834595, + "learning_rate": 9.342785980253526e-06, + "loss": 0.6998, + "step": 6081 + }, + { + "epoch": 0.3347459959271286, + "grad_norm": 0.8425901532173157, + "learning_rate": 9.342571142673042e-06, + "loss": 0.8789, + "step": 6082 + }, + { + "epoch": 0.3348010347294843, + "grad_norm": 0.7263861894607544, + "learning_rate": 9.342356272454954e-06, + "loss": 0.7299, + "step": 6083 + }, + { + "epoch": 0.33485607353183994, + "grad_norm": 0.8420364260673523, + "learning_rate": 9.34214136960088e-06, + "loss": 0.8073, + "step": 6084 + }, + { + "epoch": 0.3349111123341956, + "grad_norm": 0.950019359588623, + "learning_rate": 9.341926434112435e-06, + "loss": 0.9288, + "step": 6085 + }, + { + "epoch": 0.33496615113655126, + "grad_norm": 0.7583657503128052, + "learning_rate": 9.341711465991231e-06, + "loss": 0.8079, + "step": 6086 + }, + { + "epoch": 0.33502118993890695, + "grad_norm": 0.7623111605644226, + "learning_rate": 9.341496465238887e-06, + "loss": 0.879, + "step": 6087 + }, + { + "epoch": 0.3350762287412626, + "grad_norm": 0.8934749960899353, + "learning_rate": 9.341281431857017e-06, + "loss": 0.9348, + "step": 6088 + }, + { + "epoch": 0.33513126754361827, + "grad_norm": 0.7363337874412537, + "learning_rate": 9.341066365847238e-06, + "loss": 0.8284, + "step": 6089 + }, + { + "epoch": 0.3351863063459739, + "grad_norm": 0.6408932209014893, + "learning_rate": 9.340851267211166e-06, + "loss": 0.6019, + "step": 6090 + }, + { + "epoch": 0.3352413451483296, + "grad_norm": 0.8491614460945129, + "learning_rate": 9.34063613595042e-06, + "loss": 0.7287, + "step": 6091 + }, + { + "epoch": 0.33529638395068523, + "grad_norm": 0.6922628879547119, + "learning_rate": 9.340420972066612e-06, + "loss": 0.6649, + "step": 6092 + }, + { + "epoch": 0.3353514227530409, + "grad_norm": 0.7304210662841797, + "learning_rate": 9.340205775561364e-06, + "loss": 0.7373, + "step": 6093 + }, + { + "epoch": 0.33540646155539655, + "grad_norm": 0.8924282193183899, + "learning_rate": 9.339990546436289e-06, + "loss": 0.8337, + "step": 6094 + }, + { + "epoch": 0.33546150035775224, + "grad_norm": 0.7671791315078735, + "learning_rate": 9.339775284693008e-06, + "loss": 0.856, + "step": 6095 + }, + { + "epoch": 0.3355165391601079, + "grad_norm": 0.830427348613739, + "learning_rate": 9.339559990333138e-06, + "loss": 0.7204, + "step": 6096 + }, + { + "epoch": 0.33557157796246356, + "grad_norm": 0.7064357399940491, + "learning_rate": 9.339344663358297e-06, + "loss": 0.8533, + "step": 6097 + }, + { + "epoch": 0.3356266167648192, + "grad_norm": 0.7828566431999207, + "learning_rate": 9.3391293037701e-06, + "loss": 0.7203, + "step": 6098 + }, + { + "epoch": 0.3356816555671749, + "grad_norm": 0.7686871886253357, + "learning_rate": 9.338913911570172e-06, + "loss": 0.7813, + "step": 6099 + }, + { + "epoch": 0.3357366943695305, + "grad_norm": 0.7536553740501404, + "learning_rate": 9.338698486760126e-06, + "loss": 0.7581, + "step": 6100 + }, + { + "epoch": 0.33579173317188615, + "grad_norm": 0.7240094542503357, + "learning_rate": 9.338483029341586e-06, + "loss": 0.7513, + "step": 6101 + }, + { + "epoch": 0.33584677197424184, + "grad_norm": 0.7519696354866028, + "learning_rate": 9.338267539316169e-06, + "loss": 0.8139, + "step": 6102 + }, + { + "epoch": 0.3359018107765975, + "grad_norm": 0.7267377376556396, + "learning_rate": 9.338052016685492e-06, + "loss": 0.7807, + "step": 6103 + }, + { + "epoch": 0.33595684957895317, + "grad_norm": 0.6925491094589233, + "learning_rate": 9.33783646145118e-06, + "loss": 0.8124, + "step": 6104 + }, + { + "epoch": 0.3360118883813088, + "grad_norm": 0.6896460652351379, + "learning_rate": 9.337620873614848e-06, + "loss": 0.7459, + "step": 6105 + }, + { + "epoch": 0.3360669271836645, + "grad_norm": 0.8631082773208618, + "learning_rate": 9.337405253178121e-06, + "loss": 0.7662, + "step": 6106 + }, + { + "epoch": 0.3361219659860201, + "grad_norm": 0.76750248670578, + "learning_rate": 9.337189600142614e-06, + "loss": 0.9016, + "step": 6107 + }, + { + "epoch": 0.3361770047883758, + "grad_norm": 0.9230479001998901, + "learning_rate": 9.336973914509952e-06, + "loss": 0.7631, + "step": 6108 + }, + { + "epoch": 0.33623204359073144, + "grad_norm": 0.746776282787323, + "learning_rate": 9.336758196281756e-06, + "loss": 0.6934, + "step": 6109 + }, + { + "epoch": 0.33628708239308713, + "grad_norm": 0.7631211280822754, + "learning_rate": 9.336542445459646e-06, + "loss": 0.7957, + "step": 6110 + }, + { + "epoch": 0.33634212119544277, + "grad_norm": 0.7460417151451111, + "learning_rate": 9.336326662045243e-06, + "loss": 0.7979, + "step": 6111 + }, + { + "epoch": 0.33639715999779846, + "grad_norm": 0.7072319388389587, + "learning_rate": 9.336110846040171e-06, + "loss": 0.763, + "step": 6112 + }, + { + "epoch": 0.3364521988001541, + "grad_norm": 0.822266697883606, + "learning_rate": 9.33589499744605e-06, + "loss": 0.7719, + "step": 6113 + }, + { + "epoch": 0.3365072376025098, + "grad_norm": 0.778685986995697, + "learning_rate": 9.335679116264502e-06, + "loss": 0.896, + "step": 6114 + }, + { + "epoch": 0.3365622764048654, + "grad_norm": 0.9335552453994751, + "learning_rate": 9.33546320249715e-06, + "loss": 0.7317, + "step": 6115 + }, + { + "epoch": 0.3366173152072211, + "grad_norm": 0.755109965801239, + "learning_rate": 9.33524725614562e-06, + "loss": 0.8184, + "step": 6116 + }, + { + "epoch": 0.33667235400957674, + "grad_norm": 0.7963696122169495, + "learning_rate": 9.33503127721153e-06, + "loss": 0.7835, + "step": 6117 + }, + { + "epoch": 0.3367273928119324, + "grad_norm": 0.8298614621162415, + "learning_rate": 9.334815265696506e-06, + "loss": 0.7946, + "step": 6118 + }, + { + "epoch": 0.33678243161428806, + "grad_norm": 0.728638768196106, + "learning_rate": 9.33459922160217e-06, + "loss": 0.801, + "step": 6119 + }, + { + "epoch": 0.33683747041664375, + "grad_norm": 0.7275198698043823, + "learning_rate": 9.334383144930146e-06, + "loss": 0.7721, + "step": 6120 + }, + { + "epoch": 0.3368925092189994, + "grad_norm": 0.7146986722946167, + "learning_rate": 9.33416703568206e-06, + "loss": 0.7573, + "step": 6121 + }, + { + "epoch": 0.33694754802135507, + "grad_norm": 0.7875215411186218, + "learning_rate": 9.333950893859533e-06, + "loss": 0.8223, + "step": 6122 + }, + { + "epoch": 0.3370025868237107, + "grad_norm": 0.7636967301368713, + "learning_rate": 9.333734719464193e-06, + "loss": 0.7596, + "step": 6123 + }, + { + "epoch": 0.3370576256260664, + "grad_norm": 0.8068925142288208, + "learning_rate": 9.333518512497663e-06, + "loss": 0.834, + "step": 6124 + }, + { + "epoch": 0.337112664428422, + "grad_norm": 0.7153680920600891, + "learning_rate": 9.333302272961566e-06, + "loss": 0.703, + "step": 6125 + }, + { + "epoch": 0.3371677032307777, + "grad_norm": 0.7429617047309875, + "learning_rate": 9.33308600085753e-06, + "loss": 0.7327, + "step": 6126 + }, + { + "epoch": 0.33722274203313335, + "grad_norm": 0.6937283873558044, + "learning_rate": 9.33286969618718e-06, + "loss": 0.6494, + "step": 6127 + }, + { + "epoch": 0.33727778083548904, + "grad_norm": 0.7775923609733582, + "learning_rate": 9.33265335895214e-06, + "loss": 0.8668, + "step": 6128 + }, + { + "epoch": 0.33733281963784467, + "grad_norm": 0.6911064386367798, + "learning_rate": 9.33243698915404e-06, + "loss": 0.6462, + "step": 6129 + }, + { + "epoch": 0.33738785844020036, + "grad_norm": 0.8951280117034912, + "learning_rate": 9.3322205867945e-06, + "loss": 0.825, + "step": 6130 + }, + { + "epoch": 0.337442897242556, + "grad_norm": 0.9521064758300781, + "learning_rate": 9.332004151875151e-06, + "loss": 0.641, + "step": 6131 + }, + { + "epoch": 0.3374979360449117, + "grad_norm": 0.7036865949630737, + "learning_rate": 9.33178768439762e-06, + "loss": 0.804, + "step": 6132 + }, + { + "epoch": 0.3375529748472673, + "grad_norm": 1.0232574939727783, + "learning_rate": 9.331571184363529e-06, + "loss": 0.8577, + "step": 6133 + }, + { + "epoch": 0.337608013649623, + "grad_norm": 0.9680090546607971, + "learning_rate": 9.33135465177451e-06, + "loss": 0.7725, + "step": 6134 + }, + { + "epoch": 0.33766305245197864, + "grad_norm": 0.7664901614189148, + "learning_rate": 9.33113808663219e-06, + "loss": 0.8406, + "step": 6135 + }, + { + "epoch": 0.33771809125433433, + "grad_norm": 0.6703250408172607, + "learning_rate": 9.330921488938193e-06, + "loss": 0.7311, + "step": 6136 + }, + { + "epoch": 0.33777313005668996, + "grad_norm": 0.7364899516105652, + "learning_rate": 9.330704858694151e-06, + "loss": 0.8571, + "step": 6137 + }, + { + "epoch": 0.33782816885904565, + "grad_norm": 0.7167731523513794, + "learning_rate": 9.33048819590169e-06, + "loss": 0.7597, + "step": 6138 + }, + { + "epoch": 0.3378832076614013, + "grad_norm": 0.7761037945747375, + "learning_rate": 9.33027150056244e-06, + "loss": 0.8112, + "step": 6139 + }, + { + "epoch": 0.337938246463757, + "grad_norm": 0.8143900632858276, + "learning_rate": 9.330054772678028e-06, + "loss": 0.8213, + "step": 6140 + }, + { + "epoch": 0.3379932852661126, + "grad_norm": 0.7181026339530945, + "learning_rate": 9.329838012250083e-06, + "loss": 0.8228, + "step": 6141 + }, + { + "epoch": 0.3380483240684683, + "grad_norm": 0.7229815721511841, + "learning_rate": 9.329621219280235e-06, + "loss": 0.8205, + "step": 6142 + }, + { + "epoch": 0.33810336287082393, + "grad_norm": 0.7120887637138367, + "learning_rate": 9.329404393770113e-06, + "loss": 0.8012, + "step": 6143 + }, + { + "epoch": 0.33815840167317956, + "grad_norm": 0.7859634757041931, + "learning_rate": 9.329187535721346e-06, + "loss": 0.7583, + "step": 6144 + }, + { + "epoch": 0.33821344047553525, + "grad_norm": 0.7630401253700256, + "learning_rate": 9.328970645135564e-06, + "loss": 0.9087, + "step": 6145 + }, + { + "epoch": 0.3382684792778909, + "grad_norm": 0.7028466463088989, + "learning_rate": 9.328753722014399e-06, + "loss": 0.7253, + "step": 6146 + }, + { + "epoch": 0.3383235180802466, + "grad_norm": 0.8910240530967712, + "learning_rate": 9.328536766359477e-06, + "loss": 0.9048, + "step": 6147 + }, + { + "epoch": 0.3383785568826022, + "grad_norm": 0.6695914268493652, + "learning_rate": 9.328319778172435e-06, + "loss": 0.6817, + "step": 6148 + }, + { + "epoch": 0.3384335956849579, + "grad_norm": 0.9667700529098511, + "learning_rate": 9.328102757454898e-06, + "loss": 0.7721, + "step": 6149 + }, + { + "epoch": 0.33848863448731353, + "grad_norm": 0.7267603874206543, + "learning_rate": 9.3278857042085e-06, + "loss": 0.7263, + "step": 6150 + }, + { + "epoch": 0.3385436732896692, + "grad_norm": 0.7603437900543213, + "learning_rate": 9.32766861843487e-06, + "loss": 0.7856, + "step": 6151 + }, + { + "epoch": 0.33859871209202486, + "grad_norm": 0.7355918288230896, + "learning_rate": 9.327451500135641e-06, + "loss": 0.7687, + "step": 6152 + }, + { + "epoch": 0.33865375089438055, + "grad_norm": 0.712210476398468, + "learning_rate": 9.327234349312446e-06, + "loss": 0.7689, + "step": 6153 + }, + { + "epoch": 0.3387087896967362, + "grad_norm": 0.9011964797973633, + "learning_rate": 9.327017165966916e-06, + "loss": 0.888, + "step": 6154 + }, + { + "epoch": 0.33876382849909187, + "grad_norm": 0.7334766387939453, + "learning_rate": 9.326799950100683e-06, + "loss": 0.7577, + "step": 6155 + }, + { + "epoch": 0.3388188673014475, + "grad_norm": 0.711370587348938, + "learning_rate": 9.32658270171538e-06, + "loss": 0.7653, + "step": 6156 + }, + { + "epoch": 0.3388739061038032, + "grad_norm": 0.8465714454650879, + "learning_rate": 9.32636542081264e-06, + "loss": 0.7252, + "step": 6157 + }, + { + "epoch": 0.3389289449061588, + "grad_norm": 0.8105099201202393, + "learning_rate": 9.326148107394094e-06, + "loss": 0.7886, + "step": 6158 + }, + { + "epoch": 0.3389839837085145, + "grad_norm": 0.8082063794136047, + "learning_rate": 9.32593076146138e-06, + "loss": 0.8968, + "step": 6159 + }, + { + "epoch": 0.33903902251087015, + "grad_norm": 0.7451661229133606, + "learning_rate": 9.325713383016125e-06, + "loss": 0.762, + "step": 6160 + }, + { + "epoch": 0.33909406131322584, + "grad_norm": 0.8174484372138977, + "learning_rate": 9.325495972059968e-06, + "loss": 0.8285, + "step": 6161 + }, + { + "epoch": 0.33914910011558147, + "grad_norm": 0.7690935134887695, + "learning_rate": 9.32527852859454e-06, + "loss": 0.8908, + "step": 6162 + }, + { + "epoch": 0.33920413891793716, + "grad_norm": 0.7730095386505127, + "learning_rate": 9.325061052621476e-06, + "loss": 0.8571, + "step": 6163 + }, + { + "epoch": 0.3392591777202928, + "grad_norm": 0.7750043869018555, + "learning_rate": 9.324843544142412e-06, + "loss": 0.8314, + "step": 6164 + }, + { + "epoch": 0.3393142165226485, + "grad_norm": 0.8184822797775269, + "learning_rate": 9.32462600315898e-06, + "loss": 0.8783, + "step": 6165 + }, + { + "epoch": 0.3393692553250041, + "grad_norm": 0.8553629517555237, + "learning_rate": 9.32440842967282e-06, + "loss": 0.7116, + "step": 6166 + }, + { + "epoch": 0.3394242941273598, + "grad_norm": 0.8072115778923035, + "learning_rate": 9.324190823685562e-06, + "loss": 0.7498, + "step": 6167 + }, + { + "epoch": 0.33947933292971544, + "grad_norm": 0.7787594795227051, + "learning_rate": 9.323973185198843e-06, + "loss": 0.7567, + "step": 6168 + }, + { + "epoch": 0.3395343717320711, + "grad_norm": 0.7571421265602112, + "learning_rate": 9.323755514214299e-06, + "loss": 0.8349, + "step": 6169 + }, + { + "epoch": 0.33958941053442676, + "grad_norm": 0.6768494248390198, + "learning_rate": 9.323537810733565e-06, + "loss": 0.7382, + "step": 6170 + }, + { + "epoch": 0.33964444933678245, + "grad_norm": 0.7091678380966187, + "learning_rate": 9.32332007475828e-06, + "loss": 0.8107, + "step": 6171 + }, + { + "epoch": 0.3396994881391381, + "grad_norm": 0.6896559596061707, + "learning_rate": 9.323102306290078e-06, + "loss": 0.7973, + "step": 6172 + }, + { + "epoch": 0.3397545269414938, + "grad_norm": 0.7383756637573242, + "learning_rate": 9.322884505330595e-06, + "loss": 0.7998, + "step": 6173 + }, + { + "epoch": 0.3398095657438494, + "grad_norm": 0.7487883567810059, + "learning_rate": 9.32266667188147e-06, + "loss": 0.7928, + "step": 6174 + }, + { + "epoch": 0.3398646045462051, + "grad_norm": 0.7935298681259155, + "learning_rate": 9.32244880594434e-06, + "loss": 0.8457, + "step": 6175 + }, + { + "epoch": 0.33991964334856073, + "grad_norm": 0.6571856737136841, + "learning_rate": 9.322230907520841e-06, + "loss": 0.7177, + "step": 6176 + }, + { + "epoch": 0.3399746821509164, + "grad_norm": 0.7694165706634521, + "learning_rate": 9.322012976612613e-06, + "loss": 0.7124, + "step": 6177 + }, + { + "epoch": 0.34002972095327205, + "grad_norm": 0.8665503263473511, + "learning_rate": 9.32179501322129e-06, + "loss": 0.8054, + "step": 6178 + }, + { + "epoch": 0.34008475975562774, + "grad_norm": 0.6794337034225464, + "learning_rate": 9.321577017348515e-06, + "loss": 0.6468, + "step": 6179 + }, + { + "epoch": 0.3401397985579834, + "grad_norm": 0.7875672578811646, + "learning_rate": 9.32135898899592e-06, + "loss": 0.8384, + "step": 6180 + }, + { + "epoch": 0.34019483736033906, + "grad_norm": 0.8050880432128906, + "learning_rate": 9.321140928165152e-06, + "loss": 0.7261, + "step": 6181 + }, + { + "epoch": 0.3402498761626947, + "grad_norm": 0.7489742040634155, + "learning_rate": 9.320922834857844e-06, + "loss": 0.8252, + "step": 6182 + }, + { + "epoch": 0.3403049149650504, + "grad_norm": 0.7785589098930359, + "learning_rate": 9.320704709075637e-06, + "loss": 0.7123, + "step": 6183 + }, + { + "epoch": 0.340359953767406, + "grad_norm": 0.7698208689689636, + "learning_rate": 9.320486550820169e-06, + "loss": 0.704, + "step": 6184 + }, + { + "epoch": 0.3404149925697617, + "grad_norm": 0.78490149974823, + "learning_rate": 9.320268360093081e-06, + "loss": 0.8446, + "step": 6185 + }, + { + "epoch": 0.34047003137211734, + "grad_norm": 0.6684672236442566, + "learning_rate": 9.320050136896012e-06, + "loss": 0.6728, + "step": 6186 + }, + { + "epoch": 0.340525070174473, + "grad_norm": 0.818122386932373, + "learning_rate": 9.319831881230603e-06, + "loss": 0.7744, + "step": 6187 + }, + { + "epoch": 0.34058010897682867, + "grad_norm": 0.83867347240448, + "learning_rate": 9.319613593098494e-06, + "loss": 0.7423, + "step": 6188 + }, + { + "epoch": 0.3406351477791843, + "grad_norm": 0.7800338268280029, + "learning_rate": 9.319395272501326e-06, + "loss": 0.8189, + "step": 6189 + }, + { + "epoch": 0.34069018658154, + "grad_norm": 0.7530137300491333, + "learning_rate": 9.319176919440737e-06, + "loss": 0.7978, + "step": 6190 + }, + { + "epoch": 0.3407452253838956, + "grad_norm": 0.8916274309158325, + "learning_rate": 9.318958533918374e-06, + "loss": 0.8828, + "step": 6191 + }, + { + "epoch": 0.3408002641862513, + "grad_norm": 0.76950603723526, + "learning_rate": 9.318740115935873e-06, + "loss": 0.7691, + "step": 6192 + }, + { + "epoch": 0.34085530298860695, + "grad_norm": 0.8348222970962524, + "learning_rate": 9.318521665494877e-06, + "loss": 0.8022, + "step": 6193 + }, + { + "epoch": 0.34091034179096263, + "grad_norm": 0.6879388689994812, + "learning_rate": 9.318303182597029e-06, + "loss": 0.747, + "step": 6194 + }, + { + "epoch": 0.34096538059331827, + "grad_norm": 0.8032572269439697, + "learning_rate": 9.31808466724397e-06, + "loss": 0.7621, + "step": 6195 + }, + { + "epoch": 0.34102041939567396, + "grad_norm": 0.6842368841171265, + "learning_rate": 9.317866119437342e-06, + "loss": 0.6867, + "step": 6196 + }, + { + "epoch": 0.3410754581980296, + "grad_norm": 0.7797672152519226, + "learning_rate": 9.317647539178788e-06, + "loss": 0.8329, + "step": 6197 + }, + { + "epoch": 0.3411304970003853, + "grad_norm": 0.6865420341491699, + "learning_rate": 9.317428926469952e-06, + "loss": 0.7544, + "step": 6198 + }, + { + "epoch": 0.3411855358027409, + "grad_norm": 0.818217396736145, + "learning_rate": 9.317210281312475e-06, + "loss": 0.8853, + "step": 6199 + }, + { + "epoch": 0.3412405746050966, + "grad_norm": 0.7531415224075317, + "learning_rate": 9.316991603708001e-06, + "loss": 0.8225, + "step": 6200 + }, + { + "epoch": 0.34129561340745224, + "grad_norm": 0.7347036600112915, + "learning_rate": 9.316772893658173e-06, + "loss": 0.7817, + "step": 6201 + }, + { + "epoch": 0.3413506522098079, + "grad_norm": 0.7162033915519714, + "learning_rate": 9.316554151164636e-06, + "loss": 0.7836, + "step": 6202 + }, + { + "epoch": 0.34140569101216356, + "grad_norm": 0.7421988248825073, + "learning_rate": 9.316335376229035e-06, + "loss": 0.7782, + "step": 6203 + }, + { + "epoch": 0.34146072981451925, + "grad_norm": 0.7672573328018188, + "learning_rate": 9.31611656885301e-06, + "loss": 0.8585, + "step": 6204 + }, + { + "epoch": 0.3415157686168749, + "grad_norm": 0.6898330450057983, + "learning_rate": 9.31589772903821e-06, + "loss": 0.7719, + "step": 6205 + }, + { + "epoch": 0.34157080741923057, + "grad_norm": 0.7700635194778442, + "learning_rate": 9.315678856786279e-06, + "loss": 0.7345, + "step": 6206 + }, + { + "epoch": 0.3416258462215862, + "grad_norm": 0.6982038617134094, + "learning_rate": 9.315459952098858e-06, + "loss": 0.8332, + "step": 6207 + }, + { + "epoch": 0.3416808850239419, + "grad_norm": 0.8882858753204346, + "learning_rate": 9.315241014977598e-06, + "loss": 0.9029, + "step": 6208 + }, + { + "epoch": 0.3417359238262975, + "grad_norm": 0.7313854098320007, + "learning_rate": 9.31502204542414e-06, + "loss": 0.8061, + "step": 6209 + }, + { + "epoch": 0.3417909626286532, + "grad_norm": 0.7324157953262329, + "learning_rate": 9.314803043440131e-06, + "loss": 0.7889, + "step": 6210 + }, + { + "epoch": 0.34184600143100885, + "grad_norm": 0.7498225569725037, + "learning_rate": 9.314584009027218e-06, + "loss": 0.7937, + "step": 6211 + }, + { + "epoch": 0.34190104023336454, + "grad_norm": 0.7093212008476257, + "learning_rate": 9.314364942187048e-06, + "loss": 0.8404, + "step": 6212 + }, + { + "epoch": 0.3419560790357202, + "grad_norm": 0.7008668780326843, + "learning_rate": 9.314145842921264e-06, + "loss": 0.8175, + "step": 6213 + }, + { + "epoch": 0.34201111783807586, + "grad_norm": 0.8049909472465515, + "learning_rate": 9.313926711231516e-06, + "loss": 0.78, + "step": 6214 + }, + { + "epoch": 0.3420661566404315, + "grad_norm": 0.7777613997459412, + "learning_rate": 9.313707547119448e-06, + "loss": 0.9566, + "step": 6215 + }, + { + "epoch": 0.3421211954427872, + "grad_norm": 0.7787579894065857, + "learning_rate": 9.31348835058671e-06, + "loss": 0.7698, + "step": 6216 + }, + { + "epoch": 0.3421762342451428, + "grad_norm": 0.7779031991958618, + "learning_rate": 9.313269121634947e-06, + "loss": 0.8853, + "step": 6217 + }, + { + "epoch": 0.3422312730474985, + "grad_norm": 0.7194382548332214, + "learning_rate": 9.313049860265809e-06, + "loss": 0.8399, + "step": 6218 + }, + { + "epoch": 0.34228631184985414, + "grad_norm": 0.6513093709945679, + "learning_rate": 9.312830566480943e-06, + "loss": 0.7156, + "step": 6219 + }, + { + "epoch": 0.34234135065220983, + "grad_norm": 0.935325026512146, + "learning_rate": 9.312611240281996e-06, + "loss": 0.7525, + "step": 6220 + }, + { + "epoch": 0.34239638945456546, + "grad_norm": 0.7539558410644531, + "learning_rate": 9.312391881670618e-06, + "loss": 0.7716, + "step": 6221 + }, + { + "epoch": 0.34245142825692115, + "grad_norm": 0.7239616513252258, + "learning_rate": 9.312172490648457e-06, + "loss": 0.7272, + "step": 6222 + }, + { + "epoch": 0.3425064670592768, + "grad_norm": 0.7742316126823425, + "learning_rate": 9.311953067217162e-06, + "loss": 0.7657, + "step": 6223 + }, + { + "epoch": 0.3425615058616325, + "grad_norm": 0.782691240310669, + "learning_rate": 9.311733611378379e-06, + "loss": 0.813, + "step": 6224 + }, + { + "epoch": 0.3426165446639881, + "grad_norm": 0.7448118329048157, + "learning_rate": 9.311514123133765e-06, + "loss": 0.8298, + "step": 6225 + }, + { + "epoch": 0.3426715834663438, + "grad_norm": 0.8201695680618286, + "learning_rate": 9.311294602484961e-06, + "loss": 0.7738, + "step": 6226 + }, + { + "epoch": 0.34272662226869943, + "grad_norm": 0.6928383111953735, + "learning_rate": 9.311075049433625e-06, + "loss": 0.6829, + "step": 6227 + }, + { + "epoch": 0.3427816610710551, + "grad_norm": 0.7509302496910095, + "learning_rate": 9.310855463981399e-06, + "loss": 0.6265, + "step": 6228 + }, + { + "epoch": 0.34283669987341076, + "grad_norm": 0.7012569308280945, + "learning_rate": 9.310635846129938e-06, + "loss": 0.7478, + "step": 6229 + }, + { + "epoch": 0.3428917386757664, + "grad_norm": 0.7428532242774963, + "learning_rate": 9.310416195880894e-06, + "loss": 0.7434, + "step": 6230 + }, + { + "epoch": 0.3429467774781221, + "grad_norm": 0.9089111685752869, + "learning_rate": 9.310196513235915e-06, + "loss": 0.6991, + "step": 6231 + }, + { + "epoch": 0.3430018162804777, + "grad_norm": 0.7633285522460938, + "learning_rate": 9.309976798196651e-06, + "loss": 0.7789, + "step": 6232 + }, + { + "epoch": 0.3430568550828334, + "grad_norm": 0.7035595178604126, + "learning_rate": 9.309757050764756e-06, + "loss": 0.6784, + "step": 6233 + }, + { + "epoch": 0.34311189388518903, + "grad_norm": 0.8782615661621094, + "learning_rate": 9.309537270941881e-06, + "loss": 0.8861, + "step": 6234 + }, + { + "epoch": 0.3431669326875447, + "grad_norm": 0.7690381407737732, + "learning_rate": 9.309317458729677e-06, + "loss": 0.7701, + "step": 6235 + }, + { + "epoch": 0.34322197148990036, + "grad_norm": 0.7730939388275146, + "learning_rate": 9.309097614129797e-06, + "loss": 0.8004, + "step": 6236 + }, + { + "epoch": 0.34327701029225605, + "grad_norm": 0.9295101761817932, + "learning_rate": 9.308877737143894e-06, + "loss": 0.6964, + "step": 6237 + }, + { + "epoch": 0.3433320490946117, + "grad_norm": 0.7496231198310852, + "learning_rate": 9.308657827773617e-06, + "loss": 0.8107, + "step": 6238 + }, + { + "epoch": 0.34338708789696737, + "grad_norm": 0.7656146287918091, + "learning_rate": 9.308437886020622e-06, + "loss": 0.8016, + "step": 6239 + }, + { + "epoch": 0.343442126699323, + "grad_norm": 0.8925992846488953, + "learning_rate": 9.308217911886562e-06, + "loss": 0.7136, + "step": 6240 + }, + { + "epoch": 0.3434971655016787, + "grad_norm": 0.7669470906257629, + "learning_rate": 9.307997905373087e-06, + "loss": 0.8284, + "step": 6241 + }, + { + "epoch": 0.3435522043040343, + "grad_norm": 0.6964572072029114, + "learning_rate": 9.307777866481855e-06, + "loss": 0.7926, + "step": 6242 + }, + { + "epoch": 0.34360724310639, + "grad_norm": 0.8405120968818665, + "learning_rate": 9.307557795214517e-06, + "loss": 0.9398, + "step": 6243 + }, + { + "epoch": 0.34366228190874565, + "grad_norm": 0.7517451643943787, + "learning_rate": 9.30733769157273e-06, + "loss": 0.8315, + "step": 6244 + }, + { + "epoch": 0.34371732071110134, + "grad_norm": 0.7740843892097473, + "learning_rate": 9.307117555558144e-06, + "loss": 0.8287, + "step": 6245 + }, + { + "epoch": 0.34377235951345697, + "grad_norm": 0.7214275598526001, + "learning_rate": 9.306897387172413e-06, + "loss": 0.7416, + "step": 6246 + }, + { + "epoch": 0.34382739831581266, + "grad_norm": 0.8217877745628357, + "learning_rate": 9.306677186417197e-06, + "loss": 0.8365, + "step": 6247 + }, + { + "epoch": 0.3438824371181683, + "grad_norm": 0.7397332191467285, + "learning_rate": 9.306456953294148e-06, + "loss": 0.7284, + "step": 6248 + }, + { + "epoch": 0.343937475920524, + "grad_norm": 0.8141350746154785, + "learning_rate": 9.30623668780492e-06, + "loss": 0.8976, + "step": 6249 + }, + { + "epoch": 0.3439925147228796, + "grad_norm": 0.7078670263290405, + "learning_rate": 9.306016389951171e-06, + "loss": 0.8167, + "step": 6250 + }, + { + "epoch": 0.3440475535252353, + "grad_norm": 0.7136256098747253, + "learning_rate": 9.305796059734553e-06, + "loss": 0.7916, + "step": 6251 + }, + { + "epoch": 0.34410259232759094, + "grad_norm": 1.6186310052871704, + "learning_rate": 9.305575697156726e-06, + "loss": 0.8148, + "step": 6252 + }, + { + "epoch": 0.34415763112994663, + "grad_norm": 0.7567281126976013, + "learning_rate": 9.305355302219346e-06, + "loss": 0.8676, + "step": 6253 + }, + { + "epoch": 0.34421266993230226, + "grad_norm": 0.9036027193069458, + "learning_rate": 9.305134874924067e-06, + "loss": 0.8111, + "step": 6254 + }, + { + "epoch": 0.34426770873465795, + "grad_norm": 0.9375718235969543, + "learning_rate": 9.304914415272547e-06, + "loss": 0.6176, + "step": 6255 + }, + { + "epoch": 0.3443227475370136, + "grad_norm": 0.7309718132019043, + "learning_rate": 9.304693923266441e-06, + "loss": 0.7313, + "step": 6256 + }, + { + "epoch": 0.3443777863393693, + "grad_norm": 0.7499229311943054, + "learning_rate": 9.30447339890741e-06, + "loss": 0.6704, + "step": 6257 + }, + { + "epoch": 0.3444328251417249, + "grad_norm": 0.7553356289863586, + "learning_rate": 9.304252842197108e-06, + "loss": 0.8671, + "step": 6258 + }, + { + "epoch": 0.3444878639440806, + "grad_norm": 0.7144323587417603, + "learning_rate": 9.304032253137194e-06, + "loss": 0.7684, + "step": 6259 + }, + { + "epoch": 0.34454290274643623, + "grad_norm": 0.7566905617713928, + "learning_rate": 9.303811631729324e-06, + "loss": 0.8381, + "step": 6260 + }, + { + "epoch": 0.3445979415487919, + "grad_norm": 0.7300242185592651, + "learning_rate": 9.30359097797516e-06, + "loss": 0.7044, + "step": 6261 + }, + { + "epoch": 0.34465298035114755, + "grad_norm": 0.6504725813865662, + "learning_rate": 9.303370291876359e-06, + "loss": 0.6693, + "step": 6262 + }, + { + "epoch": 0.34470801915350324, + "grad_norm": 0.7010672688484192, + "learning_rate": 9.303149573434576e-06, + "loss": 0.6635, + "step": 6263 + }, + { + "epoch": 0.3447630579558589, + "grad_norm": 0.8416483998298645, + "learning_rate": 9.302928822651473e-06, + "loss": 0.8408, + "step": 6264 + }, + { + "epoch": 0.34481809675821457, + "grad_norm": 0.7011786699295044, + "learning_rate": 9.302708039528712e-06, + "loss": 0.7636, + "step": 6265 + }, + { + "epoch": 0.3448731355605702, + "grad_norm": 0.7361586689949036, + "learning_rate": 9.302487224067947e-06, + "loss": 0.824, + "step": 6266 + }, + { + "epoch": 0.3449281743629259, + "grad_norm": 0.7747073173522949, + "learning_rate": 9.302266376270839e-06, + "loss": 0.8012, + "step": 6267 + }, + { + "epoch": 0.3449832131652815, + "grad_norm": 0.9407958388328552, + "learning_rate": 9.302045496139049e-06, + "loss": 0.8664, + "step": 6268 + }, + { + "epoch": 0.3450382519676372, + "grad_norm": 0.8674719929695129, + "learning_rate": 9.301824583674238e-06, + "loss": 0.8842, + "step": 6269 + }, + { + "epoch": 0.34509329076999284, + "grad_norm": 0.7697336673736572, + "learning_rate": 9.301603638878062e-06, + "loss": 0.7148, + "step": 6270 + }, + { + "epoch": 0.34514832957234853, + "grad_norm": 0.7220168709754944, + "learning_rate": 9.301382661752187e-06, + "loss": 0.7199, + "step": 6271 + }, + { + "epoch": 0.34520336837470417, + "grad_norm": 0.6745235919952393, + "learning_rate": 9.301161652298272e-06, + "loss": 0.708, + "step": 6272 + }, + { + "epoch": 0.3452584071770598, + "grad_norm": 0.7062309980392456, + "learning_rate": 9.300940610517974e-06, + "loss": 0.863, + "step": 6273 + }, + { + "epoch": 0.3453134459794155, + "grad_norm": 0.7499971985816956, + "learning_rate": 9.300719536412961e-06, + "loss": 0.7976, + "step": 6274 + }, + { + "epoch": 0.3453684847817711, + "grad_norm": 0.8304464221000671, + "learning_rate": 9.30049842998489e-06, + "loss": 0.8689, + "step": 6275 + }, + { + "epoch": 0.3454235235841268, + "grad_norm": 0.7460494041442871, + "learning_rate": 9.300277291235423e-06, + "loss": 0.7499, + "step": 6276 + }, + { + "epoch": 0.34547856238648245, + "grad_norm": 0.758788526058197, + "learning_rate": 9.300056120166225e-06, + "loss": 0.7501, + "step": 6277 + }, + { + "epoch": 0.34553360118883814, + "grad_norm": 0.7204456925392151, + "learning_rate": 9.299834916778955e-06, + "loss": 0.8234, + "step": 6278 + }, + { + "epoch": 0.34558863999119377, + "grad_norm": 0.7647501826286316, + "learning_rate": 9.299613681075277e-06, + "loss": 0.8653, + "step": 6279 + }, + { + "epoch": 0.34564367879354946, + "grad_norm": 0.7543594837188721, + "learning_rate": 9.299392413056853e-06, + "loss": 0.7915, + "step": 6280 + }, + { + "epoch": 0.3456987175959051, + "grad_norm": 0.7691700458526611, + "learning_rate": 9.299171112725347e-06, + "loss": 0.7429, + "step": 6281 + }, + { + "epoch": 0.3457537563982608, + "grad_norm": 0.7703940272331238, + "learning_rate": 9.29894978008242e-06, + "loss": 0.7424, + "step": 6282 + }, + { + "epoch": 0.3458087952006164, + "grad_norm": 0.8482547402381897, + "learning_rate": 9.29872841512974e-06, + "loss": 0.8971, + "step": 6283 + }, + { + "epoch": 0.3458638340029721, + "grad_norm": 0.755224883556366, + "learning_rate": 9.298507017868966e-06, + "loss": 0.7984, + "step": 6284 + }, + { + "epoch": 0.34591887280532774, + "grad_norm": 1.079891324043274, + "learning_rate": 9.298285588301766e-06, + "loss": 0.8301, + "step": 6285 + }, + { + "epoch": 0.3459739116076834, + "grad_norm": 0.7357321381568909, + "learning_rate": 9.2980641264298e-06, + "loss": 0.9018, + "step": 6286 + }, + { + "epoch": 0.34602895041003906, + "grad_norm": 0.7541963458061218, + "learning_rate": 9.297842632254734e-06, + "loss": 0.8716, + "step": 6287 + }, + { + "epoch": 0.34608398921239475, + "grad_norm": 1.1570138931274414, + "learning_rate": 9.297621105778235e-06, + "loss": 0.9163, + "step": 6288 + }, + { + "epoch": 0.3461390280147504, + "grad_norm": 0.7626895904541016, + "learning_rate": 9.297399547001965e-06, + "loss": 0.8162, + "step": 6289 + }, + { + "epoch": 0.34619406681710607, + "grad_norm": 0.758469820022583, + "learning_rate": 9.297177955927593e-06, + "loss": 0.8966, + "step": 6290 + }, + { + "epoch": 0.3462491056194617, + "grad_norm": 0.8998799324035645, + "learning_rate": 9.296956332556779e-06, + "loss": 0.8127, + "step": 6291 + }, + { + "epoch": 0.3463041444218174, + "grad_norm": 0.7470666170120239, + "learning_rate": 9.29673467689119e-06, + "loss": 0.7738, + "step": 6292 + }, + { + "epoch": 0.34635918322417303, + "grad_norm": 0.8066977858543396, + "learning_rate": 9.296512988932497e-06, + "loss": 0.8958, + "step": 6293 + }, + { + "epoch": 0.3464142220265287, + "grad_norm": 0.8394894003868103, + "learning_rate": 9.29629126868236e-06, + "loss": 0.8023, + "step": 6294 + }, + { + "epoch": 0.34646926082888435, + "grad_norm": 0.9053472876548767, + "learning_rate": 9.29606951614245e-06, + "loss": 0.8244, + "step": 6295 + }, + { + "epoch": 0.34652429963124004, + "grad_norm": 0.6996710896492004, + "learning_rate": 9.295847731314428e-06, + "loss": 0.8203, + "step": 6296 + }, + { + "epoch": 0.3465793384335957, + "grad_norm": 0.7236999273300171, + "learning_rate": 9.295625914199968e-06, + "loss": 0.6982, + "step": 6297 + }, + { + "epoch": 0.34663437723595136, + "grad_norm": 0.7006070017814636, + "learning_rate": 9.295404064800733e-06, + "loss": 0.7881, + "step": 6298 + }, + { + "epoch": 0.346689416038307, + "grad_norm": 0.8188902735710144, + "learning_rate": 9.29518218311839e-06, + "loss": 0.7472, + "step": 6299 + }, + { + "epoch": 0.3467444548406627, + "grad_norm": 0.7708863019943237, + "learning_rate": 9.294960269154608e-06, + "loss": 0.7572, + "step": 6300 + }, + { + "epoch": 0.3467994936430183, + "grad_norm": 0.7819802761077881, + "learning_rate": 9.294738322911052e-06, + "loss": 0.8486, + "step": 6301 + }, + { + "epoch": 0.346854532445374, + "grad_norm": 0.7160501480102539, + "learning_rate": 9.294516344389394e-06, + "loss": 0.8104, + "step": 6302 + }, + { + "epoch": 0.34690957124772964, + "grad_norm": 0.7426022887229919, + "learning_rate": 9.294294333591302e-06, + "loss": 0.7158, + "step": 6303 + }, + { + "epoch": 0.34696461005008533, + "grad_norm": 0.8397019505500793, + "learning_rate": 9.294072290518441e-06, + "loss": 0.8466, + "step": 6304 + }, + { + "epoch": 0.34701964885244097, + "grad_norm": 0.7220905423164368, + "learning_rate": 9.293850215172483e-06, + "loss": 0.7619, + "step": 6305 + }, + { + "epoch": 0.34707468765479665, + "grad_norm": 0.7401862740516663, + "learning_rate": 9.293628107555097e-06, + "loss": 0.7873, + "step": 6306 + }, + { + "epoch": 0.3471297264571523, + "grad_norm": 0.6764525175094604, + "learning_rate": 9.29340596766795e-06, + "loss": 0.7278, + "step": 6307 + }, + { + "epoch": 0.347184765259508, + "grad_norm": 0.8553194403648376, + "learning_rate": 9.293183795512715e-06, + "loss": 0.9074, + "step": 6308 + }, + { + "epoch": 0.3472398040618636, + "grad_norm": 0.6796454191207886, + "learning_rate": 9.292961591091058e-06, + "loss": 0.7179, + "step": 6309 + }, + { + "epoch": 0.3472948428642193, + "grad_norm": 0.6075254082679749, + "learning_rate": 9.292739354404652e-06, + "loss": 0.7228, + "step": 6310 + }, + { + "epoch": 0.34734988166657493, + "grad_norm": 0.7366840243339539, + "learning_rate": 9.292517085455166e-06, + "loss": 0.7934, + "step": 6311 + }, + { + "epoch": 0.3474049204689306, + "grad_norm": 0.6820569038391113, + "learning_rate": 9.29229478424427e-06, + "loss": 0.7315, + "step": 6312 + }, + { + "epoch": 0.34745995927128626, + "grad_norm": 0.8356956243515015, + "learning_rate": 9.292072450773635e-06, + "loss": 0.7787, + "step": 6313 + }, + { + "epoch": 0.34751499807364195, + "grad_norm": 0.70506352186203, + "learning_rate": 9.291850085044933e-06, + "loss": 0.7411, + "step": 6314 + }, + { + "epoch": 0.3475700368759976, + "grad_norm": 0.9074786901473999, + "learning_rate": 9.291627687059835e-06, + "loss": 0.7352, + "step": 6315 + }, + { + "epoch": 0.3476250756783532, + "grad_norm": 0.7858747839927673, + "learning_rate": 9.291405256820013e-06, + "loss": 0.7816, + "step": 6316 + }, + { + "epoch": 0.3476801144807089, + "grad_norm": 0.8576731085777283, + "learning_rate": 9.291182794327134e-06, + "loss": 0.7861, + "step": 6317 + }, + { + "epoch": 0.34773515328306454, + "grad_norm": 0.7500558495521545, + "learning_rate": 9.290960299582877e-06, + "loss": 0.8028, + "step": 6318 + }, + { + "epoch": 0.3477901920854202, + "grad_norm": 0.6577744483947754, + "learning_rate": 9.29073777258891e-06, + "loss": 0.7458, + "step": 6319 + }, + { + "epoch": 0.34784523088777586, + "grad_norm": 0.742855429649353, + "learning_rate": 9.290515213346906e-06, + "loss": 0.755, + "step": 6320 + }, + { + "epoch": 0.34790026969013155, + "grad_norm": 0.7626619338989258, + "learning_rate": 9.290292621858542e-06, + "loss": 0.6671, + "step": 6321 + }, + { + "epoch": 0.3479553084924872, + "grad_norm": 0.7139305472373962, + "learning_rate": 9.290069998125481e-06, + "loss": 0.7981, + "step": 6322 + }, + { + "epoch": 0.34801034729484287, + "grad_norm": 0.9249686002731323, + "learning_rate": 9.289847342149407e-06, + "loss": 0.7243, + "step": 6323 + }, + { + "epoch": 0.3480653860971985, + "grad_norm": 0.8090649843215942, + "learning_rate": 9.289624653931986e-06, + "loss": 0.7892, + "step": 6324 + }, + { + "epoch": 0.3481204248995542, + "grad_norm": 0.6845510005950928, + "learning_rate": 9.289401933474895e-06, + "loss": 0.7427, + "step": 6325 + }, + { + "epoch": 0.3481754637019098, + "grad_norm": 0.7620648741722107, + "learning_rate": 9.289179180779808e-06, + "loss": 0.7715, + "step": 6326 + }, + { + "epoch": 0.3482305025042655, + "grad_norm": 0.7441076040267944, + "learning_rate": 9.288956395848398e-06, + "loss": 0.7814, + "step": 6327 + }, + { + "epoch": 0.34828554130662115, + "grad_norm": 0.6777048707008362, + "learning_rate": 9.28873357868234e-06, + "loss": 0.759, + "step": 6328 + }, + { + "epoch": 0.34834058010897684, + "grad_norm": 0.6534250974655151, + "learning_rate": 9.288510729283307e-06, + "loss": 0.6777, + "step": 6329 + }, + { + "epoch": 0.34839561891133247, + "grad_norm": 0.8205152153968811, + "learning_rate": 9.288287847652977e-06, + "loss": 0.8027, + "step": 6330 + }, + { + "epoch": 0.34845065771368816, + "grad_norm": 0.7152554392814636, + "learning_rate": 9.288064933793024e-06, + "loss": 0.7956, + "step": 6331 + }, + { + "epoch": 0.3485056965160438, + "grad_norm": 0.9816664457321167, + "learning_rate": 9.287841987705121e-06, + "loss": 0.828, + "step": 6332 + }, + { + "epoch": 0.3485607353183995, + "grad_norm": 0.826554000377655, + "learning_rate": 9.287619009390945e-06, + "loss": 0.8544, + "step": 6333 + }, + { + "epoch": 0.3486157741207551, + "grad_norm": 0.7255695462226868, + "learning_rate": 9.287395998852175e-06, + "loss": 0.7749, + "step": 6334 + }, + { + "epoch": 0.3486708129231108, + "grad_norm": 0.7161709070205688, + "learning_rate": 9.287172956090482e-06, + "loss": 0.7114, + "step": 6335 + }, + { + "epoch": 0.34872585172546644, + "grad_norm": 0.7219997644424438, + "learning_rate": 9.286949881107546e-06, + "loss": 0.8309, + "step": 6336 + }, + { + "epoch": 0.34878089052782213, + "grad_norm": 0.7269770503044128, + "learning_rate": 9.286726773905042e-06, + "loss": 0.8039, + "step": 6337 + }, + { + "epoch": 0.34883592933017776, + "grad_norm": 0.8142165541648865, + "learning_rate": 9.286503634484645e-06, + "loss": 0.7673, + "step": 6338 + }, + { + "epoch": 0.34889096813253345, + "grad_norm": 0.7568639516830444, + "learning_rate": 9.286280462848037e-06, + "loss": 0.8471, + "step": 6339 + }, + { + "epoch": 0.3489460069348891, + "grad_norm": 0.7927737236022949, + "learning_rate": 9.28605725899689e-06, + "loss": 0.8828, + "step": 6340 + }, + { + "epoch": 0.3490010457372448, + "grad_norm": 0.9755893349647522, + "learning_rate": 9.285834022932885e-06, + "loss": 0.837, + "step": 6341 + }, + { + "epoch": 0.3490560845396004, + "grad_norm": 0.6831560730934143, + "learning_rate": 9.2856107546577e-06, + "loss": 0.7169, + "step": 6342 + }, + { + "epoch": 0.3491111233419561, + "grad_norm": 0.728239119052887, + "learning_rate": 9.285387454173009e-06, + "loss": 0.7805, + "step": 6343 + }, + { + "epoch": 0.34916616214431173, + "grad_norm": 0.6979145407676697, + "learning_rate": 9.285164121480495e-06, + "loss": 0.7794, + "step": 6344 + }, + { + "epoch": 0.3492212009466674, + "grad_norm": 0.7206674218177795, + "learning_rate": 9.284940756581834e-06, + "loss": 0.7198, + "step": 6345 + }, + { + "epoch": 0.34927623974902305, + "grad_norm": 0.8156035542488098, + "learning_rate": 9.284717359478705e-06, + "loss": 0.884, + "step": 6346 + }, + { + "epoch": 0.34933127855137874, + "grad_norm": 0.6876983046531677, + "learning_rate": 9.284493930172788e-06, + "loss": 0.7426, + "step": 6347 + }, + { + "epoch": 0.3493863173537344, + "grad_norm": 0.6856677532196045, + "learning_rate": 9.284270468665762e-06, + "loss": 0.7085, + "step": 6348 + }, + { + "epoch": 0.34944135615609007, + "grad_norm": 0.8378047943115234, + "learning_rate": 9.284046974959304e-06, + "loss": 0.725, + "step": 6349 + }, + { + "epoch": 0.3494963949584457, + "grad_norm": 0.7410693764686584, + "learning_rate": 9.283823449055097e-06, + "loss": 0.7953, + "step": 6350 + }, + { + "epoch": 0.3495514337608014, + "grad_norm": 0.7558375000953674, + "learning_rate": 9.28359989095482e-06, + "loss": 0.8052, + "step": 6351 + }, + { + "epoch": 0.349606472563157, + "grad_norm": 0.7176862955093384, + "learning_rate": 9.283376300660151e-06, + "loss": 0.7077, + "step": 6352 + }, + { + "epoch": 0.3496615113655127, + "grad_norm": 0.7443307042121887, + "learning_rate": 9.283152678172774e-06, + "loss": 0.7557, + "step": 6353 + }, + { + "epoch": 0.34971655016786835, + "grad_norm": 0.6653748750686646, + "learning_rate": 9.282929023494368e-06, + "loss": 0.7558, + "step": 6354 + }, + { + "epoch": 0.34977158897022403, + "grad_norm": 0.8139400482177734, + "learning_rate": 9.282705336626615e-06, + "loss": 0.847, + "step": 6355 + }, + { + "epoch": 0.34982662777257967, + "grad_norm": 1.012450933456421, + "learning_rate": 9.282481617571193e-06, + "loss": 0.744, + "step": 6356 + }, + { + "epoch": 0.34988166657493536, + "grad_norm": 0.7877402305603027, + "learning_rate": 9.282257866329784e-06, + "loss": 0.7475, + "step": 6357 + }, + { + "epoch": 0.349936705377291, + "grad_norm": 0.7989935874938965, + "learning_rate": 9.282034082904075e-06, + "loss": 0.7379, + "step": 6358 + }, + { + "epoch": 0.3499917441796466, + "grad_norm": 0.6665796637535095, + "learning_rate": 9.281810267295741e-06, + "loss": 0.7253, + "step": 6359 + }, + { + "epoch": 0.3500467829820023, + "grad_norm": 0.8344665765762329, + "learning_rate": 9.28158641950647e-06, + "loss": 0.8095, + "step": 6360 + }, + { + "epoch": 0.35010182178435795, + "grad_norm": 0.8312307596206665, + "learning_rate": 9.281362539537939e-06, + "loss": 0.8452, + "step": 6361 + }, + { + "epoch": 0.35015686058671364, + "grad_norm": 0.7423825263977051, + "learning_rate": 9.281138627391834e-06, + "loss": 0.8291, + "step": 6362 + }, + { + "epoch": 0.35021189938906927, + "grad_norm": 0.7594212293624878, + "learning_rate": 9.280914683069837e-06, + "loss": 0.8314, + "step": 6363 + }, + { + "epoch": 0.35026693819142496, + "grad_norm": 0.8059762716293335, + "learning_rate": 9.280690706573633e-06, + "loss": 0.7695, + "step": 6364 + }, + { + "epoch": 0.3503219769937806, + "grad_norm": 0.8053386807441711, + "learning_rate": 9.280466697904902e-06, + "loss": 0.8941, + "step": 6365 + }, + { + "epoch": 0.3503770157961363, + "grad_norm": 0.6703817248344421, + "learning_rate": 9.280242657065329e-06, + "loss": 0.5978, + "step": 6366 + }, + { + "epoch": 0.3504320545984919, + "grad_norm": 0.9359784722328186, + "learning_rate": 9.280018584056598e-06, + "loss": 0.8479, + "step": 6367 + }, + { + "epoch": 0.3504870934008476, + "grad_norm": 0.7692418098449707, + "learning_rate": 9.279794478880393e-06, + "loss": 0.7254, + "step": 6368 + }, + { + "epoch": 0.35054213220320324, + "grad_norm": 0.7992031574249268, + "learning_rate": 9.279570341538397e-06, + "loss": 0.6749, + "step": 6369 + }, + { + "epoch": 0.3505971710055589, + "grad_norm": 0.7735288739204407, + "learning_rate": 9.279346172032297e-06, + "loss": 0.8545, + "step": 6370 + }, + { + "epoch": 0.35065220980791456, + "grad_norm": 0.7124339938163757, + "learning_rate": 9.279121970363778e-06, + "loss": 0.8066, + "step": 6371 + }, + { + "epoch": 0.35070724861027025, + "grad_norm": 0.8116535544395447, + "learning_rate": 9.278897736534521e-06, + "loss": 0.8197, + "step": 6372 + }, + { + "epoch": 0.3507622874126259, + "grad_norm": 0.9377869963645935, + "learning_rate": 9.278673470546217e-06, + "loss": 0.74, + "step": 6373 + }, + { + "epoch": 0.3508173262149816, + "grad_norm": 0.6726253628730774, + "learning_rate": 9.278449172400548e-06, + "loss": 0.6389, + "step": 6374 + }, + { + "epoch": 0.3508723650173372, + "grad_norm": 0.8470593094825745, + "learning_rate": 9.278224842099198e-06, + "loss": 0.8059, + "step": 6375 + }, + { + "epoch": 0.3509274038196929, + "grad_norm": 0.7041867971420288, + "learning_rate": 9.278000479643857e-06, + "loss": 0.7409, + "step": 6376 + }, + { + "epoch": 0.35098244262204853, + "grad_norm": 0.7467322945594788, + "learning_rate": 9.27777608503621e-06, + "loss": 0.823, + "step": 6377 + }, + { + "epoch": 0.3510374814244042, + "grad_norm": 0.7211065888404846, + "learning_rate": 9.277551658277942e-06, + "loss": 0.7655, + "step": 6378 + }, + { + "epoch": 0.35109252022675985, + "grad_norm": 0.7709450125694275, + "learning_rate": 9.27732719937074e-06, + "loss": 0.8938, + "step": 6379 + }, + { + "epoch": 0.35114755902911554, + "grad_norm": 0.7672929167747498, + "learning_rate": 9.277102708316293e-06, + "loss": 0.6814, + "step": 6380 + }, + { + "epoch": 0.3512025978314712, + "grad_norm": 0.7334907650947571, + "learning_rate": 9.276878185116287e-06, + "loss": 0.6608, + "step": 6381 + }, + { + "epoch": 0.35125763663382686, + "grad_norm": 0.7011460065841675, + "learning_rate": 9.27665362977241e-06, + "loss": 0.8196, + "step": 6382 + }, + { + "epoch": 0.3513126754361825, + "grad_norm": 0.7388820052146912, + "learning_rate": 9.276429042286349e-06, + "loss": 0.8793, + "step": 6383 + }, + { + "epoch": 0.3513677142385382, + "grad_norm": 0.809725821018219, + "learning_rate": 9.27620442265979e-06, + "loss": 0.6976, + "step": 6384 + }, + { + "epoch": 0.3514227530408938, + "grad_norm": 0.6933012008666992, + "learning_rate": 9.275979770894424e-06, + "loss": 0.759, + "step": 6385 + }, + { + "epoch": 0.3514777918432495, + "grad_norm": 0.7928480505943298, + "learning_rate": 9.27575508699194e-06, + "loss": 0.7462, + "step": 6386 + }, + { + "epoch": 0.35153283064560514, + "grad_norm": 0.8461304903030396, + "learning_rate": 9.275530370954024e-06, + "loss": 0.8184, + "step": 6387 + }, + { + "epoch": 0.35158786944796083, + "grad_norm": 0.7624425292015076, + "learning_rate": 9.275305622782366e-06, + "loss": 0.7913, + "step": 6388 + }, + { + "epoch": 0.35164290825031647, + "grad_norm": 0.7103675007820129, + "learning_rate": 9.275080842478657e-06, + "loss": 0.7633, + "step": 6389 + }, + { + "epoch": 0.35169794705267216, + "grad_norm": 0.9002664089202881, + "learning_rate": 9.274856030044583e-06, + "loss": 0.7643, + "step": 6390 + }, + { + "epoch": 0.3517529858550278, + "grad_norm": 0.7658692002296448, + "learning_rate": 9.274631185481836e-06, + "loss": 0.8028, + "step": 6391 + }, + { + "epoch": 0.3518080246573835, + "grad_norm": 0.6747875809669495, + "learning_rate": 9.274406308792106e-06, + "loss": 0.695, + "step": 6392 + }, + { + "epoch": 0.3518630634597391, + "grad_norm": 0.8197165131568909, + "learning_rate": 9.27418139997708e-06, + "loss": 0.7218, + "step": 6393 + }, + { + "epoch": 0.3519181022620948, + "grad_norm": 0.7597750425338745, + "learning_rate": 9.273956459038453e-06, + "loss": 0.7738, + "step": 6394 + }, + { + "epoch": 0.35197314106445043, + "grad_norm": 0.7365928888320923, + "learning_rate": 9.273731485977912e-06, + "loss": 0.7906, + "step": 6395 + }, + { + "epoch": 0.3520281798668061, + "grad_norm": 0.7313928604125977, + "learning_rate": 9.273506480797151e-06, + "loss": 0.834, + "step": 6396 + }, + { + "epoch": 0.35208321866916176, + "grad_norm": 0.758886456489563, + "learning_rate": 9.273281443497858e-06, + "loss": 0.8883, + "step": 6397 + }, + { + "epoch": 0.35213825747151745, + "grad_norm": 0.7318256497383118, + "learning_rate": 9.273056374081726e-06, + "loss": 0.7463, + "step": 6398 + }, + { + "epoch": 0.3521932962738731, + "grad_norm": 0.778448224067688, + "learning_rate": 9.272831272550446e-06, + "loss": 0.6838, + "step": 6399 + }, + { + "epoch": 0.3522483350762287, + "grad_norm": 0.7392274141311646, + "learning_rate": 9.272606138905709e-06, + "loss": 0.7237, + "step": 6400 + }, + { + "epoch": 0.3523033738785844, + "grad_norm": 0.8803032040596008, + "learning_rate": 9.272380973149209e-06, + "loss": 0.7839, + "step": 6401 + }, + { + "epoch": 0.35235841268094004, + "grad_norm": 0.7506754994392395, + "learning_rate": 9.272155775282636e-06, + "loss": 0.7665, + "step": 6402 + }, + { + "epoch": 0.3524134514832957, + "grad_norm": 0.8136595487594604, + "learning_rate": 9.271930545307686e-06, + "loss": 0.9111, + "step": 6403 + }, + { + "epoch": 0.35246849028565136, + "grad_norm": 0.7976880073547363, + "learning_rate": 9.271705283226047e-06, + "loss": 0.735, + "step": 6404 + }, + { + "epoch": 0.35252352908800705, + "grad_norm": 0.89708411693573, + "learning_rate": 9.271479989039415e-06, + "loss": 0.7698, + "step": 6405 + }, + { + "epoch": 0.3525785678903627, + "grad_norm": 0.8618703484535217, + "learning_rate": 9.271254662749484e-06, + "loss": 0.9001, + "step": 6406 + }, + { + "epoch": 0.35263360669271837, + "grad_norm": 0.7143027186393738, + "learning_rate": 9.271029304357946e-06, + "loss": 0.8188, + "step": 6407 + }, + { + "epoch": 0.352688645495074, + "grad_norm": 0.795365571975708, + "learning_rate": 9.270803913866496e-06, + "loss": 0.7389, + "step": 6408 + }, + { + "epoch": 0.3527436842974297, + "grad_norm": 0.6947643756866455, + "learning_rate": 9.270578491276825e-06, + "loss": 0.7278, + "step": 6409 + }, + { + "epoch": 0.3527987230997853, + "grad_norm": 0.7806137204170227, + "learning_rate": 9.27035303659063e-06, + "loss": 0.808, + "step": 6410 + }, + { + "epoch": 0.352853761902141, + "grad_norm": 0.8908704519271851, + "learning_rate": 9.270127549809606e-06, + "loss": 0.8659, + "step": 6411 + }, + { + "epoch": 0.35290880070449665, + "grad_norm": 0.8171417713165283, + "learning_rate": 9.269902030935445e-06, + "loss": 0.7918, + "step": 6412 + }, + { + "epoch": 0.35296383950685234, + "grad_norm": 0.7556712627410889, + "learning_rate": 9.269676479969842e-06, + "loss": 0.7121, + "step": 6413 + }, + { + "epoch": 0.353018878309208, + "grad_norm": 0.8080483675003052, + "learning_rate": 9.269450896914495e-06, + "loss": 0.8185, + "step": 6414 + }, + { + "epoch": 0.35307391711156366, + "grad_norm": 0.8514583706855774, + "learning_rate": 9.2692252817711e-06, + "loss": 0.8055, + "step": 6415 + }, + { + "epoch": 0.3531289559139193, + "grad_norm": 0.7914162278175354, + "learning_rate": 9.268999634541347e-06, + "loss": 0.759, + "step": 6416 + }, + { + "epoch": 0.353183994716275, + "grad_norm": 0.6452118754386902, + "learning_rate": 9.268773955226937e-06, + "loss": 0.6797, + "step": 6417 + }, + { + "epoch": 0.3532390335186306, + "grad_norm": 0.6876220107078552, + "learning_rate": 9.268548243829565e-06, + "loss": 0.7365, + "step": 6418 + }, + { + "epoch": 0.3532940723209863, + "grad_norm": 0.758550226688385, + "learning_rate": 9.268322500350926e-06, + "loss": 0.7069, + "step": 6419 + }, + { + "epoch": 0.35334911112334194, + "grad_norm": 0.7905879020690918, + "learning_rate": 9.268096724792718e-06, + "loss": 0.8024, + "step": 6420 + }, + { + "epoch": 0.35340414992569763, + "grad_norm": 0.755253255367279, + "learning_rate": 9.267870917156638e-06, + "loss": 0.8018, + "step": 6421 + }, + { + "epoch": 0.35345918872805326, + "grad_norm": 0.6879923343658447, + "learning_rate": 9.267645077444382e-06, + "loss": 0.7267, + "step": 6422 + }, + { + "epoch": 0.35351422753040895, + "grad_norm": 0.766214907169342, + "learning_rate": 9.267419205657649e-06, + "loss": 0.7801, + "step": 6423 + }, + { + "epoch": 0.3535692663327646, + "grad_norm": 0.868776798248291, + "learning_rate": 9.267193301798135e-06, + "loss": 0.9234, + "step": 6424 + }, + { + "epoch": 0.3536243051351203, + "grad_norm": 1.2007492780685425, + "learning_rate": 9.266967365867536e-06, + "loss": 0.7743, + "step": 6425 + }, + { + "epoch": 0.3536793439374759, + "grad_norm": 0.7445551156997681, + "learning_rate": 9.266741397867556e-06, + "loss": 0.6755, + "step": 6426 + }, + { + "epoch": 0.3537343827398316, + "grad_norm": 0.7493785619735718, + "learning_rate": 9.266515397799889e-06, + "loss": 0.7891, + "step": 6427 + }, + { + "epoch": 0.35378942154218723, + "grad_norm": 0.6718230843544006, + "learning_rate": 9.266289365666234e-06, + "loss": 0.6908, + "step": 6428 + }, + { + "epoch": 0.3538444603445429, + "grad_norm": 0.7783547639846802, + "learning_rate": 9.266063301468289e-06, + "loss": 0.7115, + "step": 6429 + }, + { + "epoch": 0.35389949914689856, + "grad_norm": 0.745627224445343, + "learning_rate": 9.265837205207755e-06, + "loss": 0.8421, + "step": 6430 + }, + { + "epoch": 0.35395453794925424, + "grad_norm": 0.7314152717590332, + "learning_rate": 9.26561107688633e-06, + "loss": 0.807, + "step": 6431 + }, + { + "epoch": 0.3540095767516099, + "grad_norm": 0.6975863575935364, + "learning_rate": 9.265384916505714e-06, + "loss": 0.7787, + "step": 6432 + }, + { + "epoch": 0.35406461555396557, + "grad_norm": 0.9758319854736328, + "learning_rate": 9.265158724067608e-06, + "loss": 0.8668, + "step": 6433 + }, + { + "epoch": 0.3541196543563212, + "grad_norm": 0.7686764001846313, + "learning_rate": 9.264932499573711e-06, + "loss": 0.7428, + "step": 6434 + }, + { + "epoch": 0.3541746931586769, + "grad_norm": 0.8761935830116272, + "learning_rate": 9.26470624302572e-06, + "loss": 0.8022, + "step": 6435 + }, + { + "epoch": 0.3542297319610325, + "grad_norm": 0.9145118594169617, + "learning_rate": 9.264479954425341e-06, + "loss": 0.7994, + "step": 6436 + }, + { + "epoch": 0.3542847707633882, + "grad_norm": 0.8217951655387878, + "learning_rate": 9.264253633774271e-06, + "loss": 0.7235, + "step": 6437 + }, + { + "epoch": 0.35433980956574385, + "grad_norm": 0.7624716758728027, + "learning_rate": 9.264027281074214e-06, + "loss": 0.8238, + "step": 6438 + }, + { + "epoch": 0.35439484836809954, + "grad_norm": 0.7772085070610046, + "learning_rate": 9.26380089632687e-06, + "loss": 0.7941, + "step": 6439 + }, + { + "epoch": 0.35444988717045517, + "grad_norm": 1.0462371110916138, + "learning_rate": 9.263574479533937e-06, + "loss": 0.8255, + "step": 6440 + }, + { + "epoch": 0.35450492597281086, + "grad_norm": 0.8523101210594177, + "learning_rate": 9.263348030697119e-06, + "loss": 0.8489, + "step": 6441 + }, + { + "epoch": 0.3545599647751665, + "grad_norm": 1.0292255878448486, + "learning_rate": 9.26312154981812e-06, + "loss": 0.7989, + "step": 6442 + }, + { + "epoch": 0.3546150035775221, + "grad_norm": 0.7621143460273743, + "learning_rate": 9.262895036898641e-06, + "loss": 0.8154, + "step": 6443 + }, + { + "epoch": 0.3546700423798778, + "grad_norm": 0.7158074378967285, + "learning_rate": 9.262668491940382e-06, + "loss": 0.7821, + "step": 6444 + }, + { + "epoch": 0.35472508118223345, + "grad_norm": 0.7969478964805603, + "learning_rate": 9.26244191494505e-06, + "loss": 0.8535, + "step": 6445 + }, + { + "epoch": 0.35478011998458914, + "grad_norm": 0.9244762063026428, + "learning_rate": 9.262215305914345e-06, + "loss": 0.7585, + "step": 6446 + }, + { + "epoch": 0.35483515878694477, + "grad_norm": 0.6862454414367676, + "learning_rate": 9.26198866484997e-06, + "loss": 0.7294, + "step": 6447 + }, + { + "epoch": 0.35489019758930046, + "grad_norm": 0.6816834211349487, + "learning_rate": 9.261761991753629e-06, + "loss": 0.7763, + "step": 6448 + }, + { + "epoch": 0.3549452363916561, + "grad_norm": 0.792539119720459, + "learning_rate": 9.261535286627025e-06, + "loss": 0.7829, + "step": 6449 + }, + { + "epoch": 0.3550002751940118, + "grad_norm": 0.8563211560249329, + "learning_rate": 9.261308549471866e-06, + "loss": 0.8945, + "step": 6450 + }, + { + "epoch": 0.3550553139963674, + "grad_norm": 0.7241078019142151, + "learning_rate": 9.26108178028985e-06, + "loss": 0.6936, + "step": 6451 + }, + { + "epoch": 0.3551103527987231, + "grad_norm": 0.7150034308433533, + "learning_rate": 9.260854979082682e-06, + "loss": 0.7689, + "step": 6452 + }, + { + "epoch": 0.35516539160107874, + "grad_norm": 0.8630193471908569, + "learning_rate": 9.260628145852073e-06, + "loss": 0.8506, + "step": 6453 + }, + { + "epoch": 0.35522043040343443, + "grad_norm": 0.7133893370628357, + "learning_rate": 9.26040128059972e-06, + "loss": 0.7976, + "step": 6454 + }, + { + "epoch": 0.35527546920579006, + "grad_norm": 0.6984630823135376, + "learning_rate": 9.260174383327332e-06, + "loss": 0.7442, + "step": 6455 + }, + { + "epoch": 0.35533050800814575, + "grad_norm": 0.7166933417320251, + "learning_rate": 9.259947454036613e-06, + "loss": 0.813, + "step": 6456 + }, + { + "epoch": 0.3553855468105014, + "grad_norm": 0.7353581190109253, + "learning_rate": 9.259720492729272e-06, + "loss": 0.8157, + "step": 6457 + }, + { + "epoch": 0.3554405856128571, + "grad_norm": 0.6810038089752197, + "learning_rate": 9.259493499407011e-06, + "loss": 0.7423, + "step": 6458 + }, + { + "epoch": 0.3554956244152127, + "grad_norm": 1.1599586009979248, + "learning_rate": 9.259266474071535e-06, + "loss": 0.7159, + "step": 6459 + }, + { + "epoch": 0.3555506632175684, + "grad_norm": 0.7857629060745239, + "learning_rate": 9.259039416724554e-06, + "loss": 0.7846, + "step": 6460 + }, + { + "epoch": 0.35560570201992403, + "grad_norm": 0.705333948135376, + "learning_rate": 9.258812327367773e-06, + "loss": 0.751, + "step": 6461 + }, + { + "epoch": 0.3556607408222797, + "grad_norm": 0.6899998188018799, + "learning_rate": 9.258585206002897e-06, + "loss": 0.7303, + "step": 6462 + }, + { + "epoch": 0.35571577962463535, + "grad_norm": 0.8007912039756775, + "learning_rate": 9.258358052631637e-06, + "loss": 0.7363, + "step": 6463 + }, + { + "epoch": 0.35577081842699104, + "grad_norm": 0.9403146505355835, + "learning_rate": 9.258130867255695e-06, + "loss": 0.9096, + "step": 6464 + }, + { + "epoch": 0.3558258572293467, + "grad_norm": 0.7069174647331238, + "learning_rate": 9.257903649876782e-06, + "loss": 0.7362, + "step": 6465 + }, + { + "epoch": 0.35588089603170237, + "grad_norm": 0.770807683467865, + "learning_rate": 9.257676400496607e-06, + "loss": 0.7904, + "step": 6466 + }, + { + "epoch": 0.355935934834058, + "grad_norm": 0.8586871027946472, + "learning_rate": 9.257449119116874e-06, + "loss": 0.7596, + "step": 6467 + }, + { + "epoch": 0.3559909736364137, + "grad_norm": 0.6934101581573486, + "learning_rate": 9.257221805739294e-06, + "loss": 0.6655, + "step": 6468 + }, + { + "epoch": 0.3560460124387693, + "grad_norm": 0.9494497179985046, + "learning_rate": 9.256994460365573e-06, + "loss": 0.7923, + "step": 6469 + }, + { + "epoch": 0.356101051241125, + "grad_norm": 0.7131130695343018, + "learning_rate": 9.256767082997422e-06, + "loss": 0.819, + "step": 6470 + }, + { + "epoch": 0.35615609004348064, + "grad_norm": 0.8641398549079895, + "learning_rate": 9.25653967363655e-06, + "loss": 0.8275, + "step": 6471 + }, + { + "epoch": 0.35621112884583633, + "grad_norm": 0.7350367307662964, + "learning_rate": 9.256312232284665e-06, + "loss": 0.7991, + "step": 6472 + }, + { + "epoch": 0.35626616764819197, + "grad_norm": 0.8174671530723572, + "learning_rate": 9.256084758943476e-06, + "loss": 0.7147, + "step": 6473 + }, + { + "epoch": 0.35632120645054766, + "grad_norm": 0.7560263872146606, + "learning_rate": 9.255857253614693e-06, + "loss": 0.7435, + "step": 6474 + }, + { + "epoch": 0.3563762452529033, + "grad_norm": 0.7465197443962097, + "learning_rate": 9.255629716300025e-06, + "loss": 0.8228, + "step": 6475 + }, + { + "epoch": 0.356431284055259, + "grad_norm": 0.7130733728408813, + "learning_rate": 9.255402147001184e-06, + "loss": 0.8361, + "step": 6476 + }, + { + "epoch": 0.3564863228576146, + "grad_norm": 0.7200759053230286, + "learning_rate": 9.255174545719882e-06, + "loss": 0.7387, + "step": 6477 + }, + { + "epoch": 0.3565413616599703, + "grad_norm": 0.8387622237205505, + "learning_rate": 9.254946912457826e-06, + "loss": 0.8427, + "step": 6478 + }, + { + "epoch": 0.35659640046232594, + "grad_norm": 0.7263510823249817, + "learning_rate": 9.254719247216725e-06, + "loss": 0.712, + "step": 6479 + }, + { + "epoch": 0.3566514392646816, + "grad_norm": 0.7393862009048462, + "learning_rate": 9.254491549998296e-06, + "loss": 0.6916, + "step": 6480 + }, + { + "epoch": 0.35670647806703726, + "grad_norm": 0.7289569973945618, + "learning_rate": 9.254263820804246e-06, + "loss": 0.7561, + "step": 6481 + }, + { + "epoch": 0.35676151686939295, + "grad_norm": 0.7597448825836182, + "learning_rate": 9.254036059636288e-06, + "loss": 0.853, + "step": 6482 + }, + { + "epoch": 0.3568165556717486, + "grad_norm": 0.7652063369750977, + "learning_rate": 9.253808266496136e-06, + "loss": 0.7652, + "step": 6483 + }, + { + "epoch": 0.35687159447410427, + "grad_norm": 1.193938136100769, + "learning_rate": 9.253580441385497e-06, + "loss": 0.8288, + "step": 6484 + }, + { + "epoch": 0.3569266332764599, + "grad_norm": 0.9258719086647034, + "learning_rate": 9.253352584306087e-06, + "loss": 0.807, + "step": 6485 + }, + { + "epoch": 0.35698167207881554, + "grad_norm": 0.78384929895401, + "learning_rate": 9.253124695259617e-06, + "loss": 0.7785, + "step": 6486 + }, + { + "epoch": 0.3570367108811712, + "grad_norm": 0.801403284072876, + "learning_rate": 9.252896774247802e-06, + "loss": 0.8382, + "step": 6487 + }, + { + "epoch": 0.35709174968352686, + "grad_norm": 0.9472376108169556, + "learning_rate": 9.25266882127235e-06, + "loss": 0.8661, + "step": 6488 + }, + { + "epoch": 0.35714678848588255, + "grad_norm": 0.7575686573982239, + "learning_rate": 9.252440836334981e-06, + "loss": 0.8428, + "step": 6489 + }, + { + "epoch": 0.3572018272882382, + "grad_norm": 0.736282467842102, + "learning_rate": 9.252212819437402e-06, + "loss": 0.801, + "step": 6490 + }, + { + "epoch": 0.35725686609059387, + "grad_norm": 0.7420864701271057, + "learning_rate": 9.251984770581332e-06, + "loss": 0.8849, + "step": 6491 + }, + { + "epoch": 0.3573119048929495, + "grad_norm": 0.7129189372062683, + "learning_rate": 9.251756689768482e-06, + "loss": 0.7716, + "step": 6492 + }, + { + "epoch": 0.3573669436953052, + "grad_norm": 0.7777297496795654, + "learning_rate": 9.251528577000566e-06, + "loss": 0.8183, + "step": 6493 + }, + { + "epoch": 0.35742198249766083, + "grad_norm": 0.7644590139389038, + "learning_rate": 9.2513004322793e-06, + "loss": 0.6319, + "step": 6494 + }, + { + "epoch": 0.3574770213000165, + "grad_norm": 0.7112484574317932, + "learning_rate": 9.251072255606399e-06, + "loss": 0.8012, + "step": 6495 + }, + { + "epoch": 0.35753206010237215, + "grad_norm": 0.7772265076637268, + "learning_rate": 9.250844046983576e-06, + "loss": 0.8372, + "step": 6496 + }, + { + "epoch": 0.35758709890472784, + "grad_norm": 0.9530157446861267, + "learning_rate": 9.250615806412546e-06, + "loss": 0.8683, + "step": 6497 + }, + { + "epoch": 0.3576421377070835, + "grad_norm": 0.7249575257301331, + "learning_rate": 9.250387533895026e-06, + "loss": 0.7091, + "step": 6498 + }, + { + "epoch": 0.35769717650943916, + "grad_norm": 0.8549422025680542, + "learning_rate": 9.25015922943273e-06, + "loss": 0.8376, + "step": 6499 + }, + { + "epoch": 0.3577522153117948, + "grad_norm": 0.74477618932724, + "learning_rate": 9.249930893027376e-06, + "loss": 0.7594, + "step": 6500 + }, + { + "epoch": 0.3578072541141505, + "grad_norm": 0.8269739151000977, + "learning_rate": 9.24970252468068e-06, + "loss": 0.6473, + "step": 6501 + }, + { + "epoch": 0.3578622929165061, + "grad_norm": 0.8375437259674072, + "learning_rate": 9.249474124394358e-06, + "loss": 0.7631, + "step": 6502 + }, + { + "epoch": 0.3579173317188618, + "grad_norm": 0.8680340051651001, + "learning_rate": 9.249245692170123e-06, + "loss": 0.7863, + "step": 6503 + }, + { + "epoch": 0.35797237052121744, + "grad_norm": 0.7179692983627319, + "learning_rate": 9.249017228009696e-06, + "loss": 0.8022, + "step": 6504 + }, + { + "epoch": 0.35802740932357313, + "grad_norm": 0.7797464728355408, + "learning_rate": 9.248788731914794e-06, + "loss": 0.8067, + "step": 6505 + }, + { + "epoch": 0.35808244812592877, + "grad_norm": 0.8032993674278259, + "learning_rate": 9.248560203887133e-06, + "loss": 0.7383, + "step": 6506 + }, + { + "epoch": 0.35813748692828445, + "grad_norm": 0.7714722156524658, + "learning_rate": 9.24833164392843e-06, + "loss": 0.7149, + "step": 6507 + }, + { + "epoch": 0.3581925257306401, + "grad_norm": 0.7492430210113525, + "learning_rate": 9.248103052040404e-06, + "loss": 0.7645, + "step": 6508 + }, + { + "epoch": 0.3582475645329958, + "grad_norm": 0.6843901872634888, + "learning_rate": 9.247874428224773e-06, + "loss": 0.7183, + "step": 6509 + }, + { + "epoch": 0.3583026033353514, + "grad_norm": 0.8370186686515808, + "learning_rate": 9.247645772483254e-06, + "loss": 0.7832, + "step": 6510 + }, + { + "epoch": 0.3583576421377071, + "grad_norm": 0.7907791137695312, + "learning_rate": 9.247417084817567e-06, + "loss": 0.8742, + "step": 6511 + }, + { + "epoch": 0.35841268094006273, + "grad_norm": 0.7950869798660278, + "learning_rate": 9.247188365229428e-06, + "loss": 0.8705, + "step": 6512 + }, + { + "epoch": 0.3584677197424184, + "grad_norm": 0.7276936173439026, + "learning_rate": 9.24695961372056e-06, + "loss": 0.7629, + "step": 6513 + }, + { + "epoch": 0.35852275854477406, + "grad_norm": 0.7761141657829285, + "learning_rate": 9.24673083029268e-06, + "loss": 0.8813, + "step": 6514 + }, + { + "epoch": 0.35857779734712975, + "grad_norm": 0.7528283596038818, + "learning_rate": 9.24650201494751e-06, + "loss": 0.7885, + "step": 6515 + }, + { + "epoch": 0.3586328361494854, + "grad_norm": 0.8972534537315369, + "learning_rate": 9.246273167686765e-06, + "loss": 0.9081, + "step": 6516 + }, + { + "epoch": 0.35868787495184107, + "grad_norm": 0.7658557891845703, + "learning_rate": 9.246044288512168e-06, + "loss": 0.8451, + "step": 6517 + }, + { + "epoch": 0.3587429137541967, + "grad_norm": 0.8013193607330322, + "learning_rate": 9.245815377425438e-06, + "loss": 0.7236, + "step": 6518 + }, + { + "epoch": 0.3587979525565524, + "grad_norm": 0.8134163022041321, + "learning_rate": 9.245586434428298e-06, + "loss": 0.908, + "step": 6519 + }, + { + "epoch": 0.358852991358908, + "grad_norm": 0.6479801535606384, + "learning_rate": 9.245357459522466e-06, + "loss": 0.7397, + "step": 6520 + }, + { + "epoch": 0.3589080301612637, + "grad_norm": 0.70014488697052, + "learning_rate": 9.245128452709665e-06, + "loss": 0.6898, + "step": 6521 + }, + { + "epoch": 0.35896306896361935, + "grad_norm": 0.7645437717437744, + "learning_rate": 9.244899413991613e-06, + "loss": 0.8319, + "step": 6522 + }, + { + "epoch": 0.35901810776597504, + "grad_norm": 0.6812799572944641, + "learning_rate": 9.244670343370033e-06, + "loss": 0.7359, + "step": 6523 + }, + { + "epoch": 0.35907314656833067, + "grad_norm": 0.6573774218559265, + "learning_rate": 9.244441240846647e-06, + "loss": 0.742, + "step": 6524 + }, + { + "epoch": 0.35912818537068636, + "grad_norm": 0.7870661020278931, + "learning_rate": 9.244212106423178e-06, + "loss": 0.7307, + "step": 6525 + }, + { + "epoch": 0.359183224173042, + "grad_norm": 0.9163166284561157, + "learning_rate": 9.243982940101347e-06, + "loss": 0.8584, + "step": 6526 + }, + { + "epoch": 0.3592382629753977, + "grad_norm": 0.766888439655304, + "learning_rate": 9.243753741882874e-06, + "loss": 0.8093, + "step": 6527 + }, + { + "epoch": 0.3592933017777533, + "grad_norm": 0.7831236124038696, + "learning_rate": 9.243524511769486e-06, + "loss": 0.8665, + "step": 6528 + }, + { + "epoch": 0.35934834058010895, + "grad_norm": 0.7485133409500122, + "learning_rate": 9.243295249762904e-06, + "loss": 0.7336, + "step": 6529 + }, + { + "epoch": 0.35940337938246464, + "grad_norm": 0.7231502532958984, + "learning_rate": 9.24306595586485e-06, + "loss": 0.8095, + "step": 6530 + }, + { + "epoch": 0.35945841818482027, + "grad_norm": 0.821898877620697, + "learning_rate": 9.242836630077048e-06, + "loss": 0.831, + "step": 6531 + }, + { + "epoch": 0.35951345698717596, + "grad_norm": 0.6792737245559692, + "learning_rate": 9.242607272401223e-06, + "loss": 0.7183, + "step": 6532 + }, + { + "epoch": 0.3595684957895316, + "grad_norm": 0.7200430631637573, + "learning_rate": 9.242377882839095e-06, + "loss": 0.7256, + "step": 6533 + }, + { + "epoch": 0.3596235345918873, + "grad_norm": 0.6713700890541077, + "learning_rate": 9.242148461392393e-06, + "loss": 0.7416, + "step": 6534 + }, + { + "epoch": 0.3596785733942429, + "grad_norm": 0.7054564356803894, + "learning_rate": 9.241919008062836e-06, + "loss": 0.6856, + "step": 6535 + }, + { + "epoch": 0.3597336121965986, + "grad_norm": 0.7516196966171265, + "learning_rate": 9.241689522852152e-06, + "loss": 0.7149, + "step": 6536 + }, + { + "epoch": 0.35978865099895424, + "grad_norm": 0.8547651767730713, + "learning_rate": 9.241460005762067e-06, + "loss": 0.7075, + "step": 6537 + }, + { + "epoch": 0.35984368980130993, + "grad_norm": 0.6791819334030151, + "learning_rate": 9.241230456794302e-06, + "loss": 0.6449, + "step": 6538 + }, + { + "epoch": 0.35989872860366556, + "grad_norm": 0.8365122079849243, + "learning_rate": 9.241000875950583e-06, + "loss": 0.7619, + "step": 6539 + }, + { + "epoch": 0.35995376740602125, + "grad_norm": 0.763829231262207, + "learning_rate": 9.24077126323264e-06, + "loss": 0.71, + "step": 6540 + }, + { + "epoch": 0.3600088062083769, + "grad_norm": 0.7698483467102051, + "learning_rate": 9.240541618642193e-06, + "loss": 0.7949, + "step": 6541 + }, + { + "epoch": 0.3600638450107326, + "grad_norm": 0.7331508994102478, + "learning_rate": 9.24031194218097e-06, + "loss": 0.8292, + "step": 6542 + }, + { + "epoch": 0.3601188838130882, + "grad_norm": 0.7507451772689819, + "learning_rate": 9.2400822338507e-06, + "loss": 0.8651, + "step": 6543 + }, + { + "epoch": 0.3601739226154439, + "grad_norm": 0.8537001609802246, + "learning_rate": 9.239852493653104e-06, + "loss": 0.848, + "step": 6544 + }, + { + "epoch": 0.36022896141779953, + "grad_norm": 0.683311939239502, + "learning_rate": 9.239622721589913e-06, + "loss": 0.803, + "step": 6545 + }, + { + "epoch": 0.3602840002201552, + "grad_norm": 0.6916974186897278, + "learning_rate": 9.239392917662852e-06, + "loss": 0.8037, + "step": 6546 + }, + { + "epoch": 0.36033903902251085, + "grad_norm": 0.798795223236084, + "learning_rate": 9.23916308187365e-06, + "loss": 0.8037, + "step": 6547 + }, + { + "epoch": 0.36039407782486654, + "grad_norm": 0.7284069657325745, + "learning_rate": 9.238933214224032e-06, + "loss": 0.7365, + "step": 6548 + }, + { + "epoch": 0.3604491166272222, + "grad_norm": 0.7789250016212463, + "learning_rate": 9.238703314715727e-06, + "loss": 0.788, + "step": 6549 + }, + { + "epoch": 0.36050415542957787, + "grad_norm": 0.7029675841331482, + "learning_rate": 9.238473383350462e-06, + "loss": 0.7796, + "step": 6550 + }, + { + "epoch": 0.3605591942319335, + "grad_norm": 0.9094457626342773, + "learning_rate": 9.238243420129965e-06, + "loss": 0.7884, + "step": 6551 + }, + { + "epoch": 0.3606142330342892, + "grad_norm": 0.8253848552703857, + "learning_rate": 9.238013425055965e-06, + "loss": 0.7671, + "step": 6552 + }, + { + "epoch": 0.3606692718366448, + "grad_norm": 0.7052987813949585, + "learning_rate": 9.237783398130193e-06, + "loss": 0.7511, + "step": 6553 + }, + { + "epoch": 0.3607243106390005, + "grad_norm": 0.7506607174873352, + "learning_rate": 9.237553339354373e-06, + "loss": 0.6804, + "step": 6554 + }, + { + "epoch": 0.36077934944135615, + "grad_norm": 0.725106418132782, + "learning_rate": 9.237323248730237e-06, + "loss": 0.7658, + "step": 6555 + }, + { + "epoch": 0.36083438824371183, + "grad_norm": 0.8164945244789124, + "learning_rate": 9.237093126259515e-06, + "loss": 0.7857, + "step": 6556 + }, + { + "epoch": 0.36088942704606747, + "grad_norm": 0.6937377452850342, + "learning_rate": 9.236862971943934e-06, + "loss": 0.6985, + "step": 6557 + }, + { + "epoch": 0.36094446584842316, + "grad_norm": 0.7511105537414551, + "learning_rate": 9.236632785785225e-06, + "loss": 0.7891, + "step": 6558 + }, + { + "epoch": 0.3609995046507788, + "grad_norm": 0.7217637896537781, + "learning_rate": 9.236402567785118e-06, + "loss": 0.7942, + "step": 6559 + }, + { + "epoch": 0.3610545434531345, + "grad_norm": 1.1438478231430054, + "learning_rate": 9.236172317945343e-06, + "loss": 0.8311, + "step": 6560 + }, + { + "epoch": 0.3611095822554901, + "grad_norm": 0.7414245009422302, + "learning_rate": 9.23594203626763e-06, + "loss": 0.7726, + "step": 6561 + }, + { + "epoch": 0.3611646210578458, + "grad_norm": 0.7762154340744019, + "learning_rate": 9.235711722753712e-06, + "loss": 0.7891, + "step": 6562 + }, + { + "epoch": 0.36121965986020144, + "grad_norm": 0.7368801832199097, + "learning_rate": 9.23548137740532e-06, + "loss": 0.7656, + "step": 6563 + }, + { + "epoch": 0.3612746986625571, + "grad_norm": 0.7571502923965454, + "learning_rate": 9.235251000224181e-06, + "loss": 0.7845, + "step": 6564 + }, + { + "epoch": 0.36132973746491276, + "grad_norm": 0.8078309297561646, + "learning_rate": 9.235020591212031e-06, + "loss": 0.7969, + "step": 6565 + }, + { + "epoch": 0.36138477626726845, + "grad_norm": 0.6897913813591003, + "learning_rate": 9.234790150370599e-06, + "loss": 0.6922, + "step": 6566 + }, + { + "epoch": 0.3614398150696241, + "grad_norm": 0.8053449988365173, + "learning_rate": 9.234559677701618e-06, + "loss": 0.9126, + "step": 6567 + }, + { + "epoch": 0.36149485387197977, + "grad_norm": 0.8400903940200806, + "learning_rate": 9.23432917320682e-06, + "loss": 0.8144, + "step": 6568 + }, + { + "epoch": 0.3615498926743354, + "grad_norm": 0.7753110527992249, + "learning_rate": 9.234098636887935e-06, + "loss": 0.7025, + "step": 6569 + }, + { + "epoch": 0.3616049314766911, + "grad_norm": 0.7901243567466736, + "learning_rate": 9.233868068746702e-06, + "loss": 0.783, + "step": 6570 + }, + { + "epoch": 0.3616599702790467, + "grad_norm": 1.2297497987747192, + "learning_rate": 9.233637468784849e-06, + "loss": 0.8541, + "step": 6571 + }, + { + "epoch": 0.36171500908140236, + "grad_norm": 0.7590478658676147, + "learning_rate": 9.233406837004108e-06, + "loss": 0.7856, + "step": 6572 + }, + { + "epoch": 0.36177004788375805, + "grad_norm": 0.6651493310928345, + "learning_rate": 9.233176173406216e-06, + "loss": 0.6822, + "step": 6573 + }, + { + "epoch": 0.3618250866861137, + "grad_norm": 0.7760787010192871, + "learning_rate": 9.232945477992905e-06, + "loss": 0.8017, + "step": 6574 + }, + { + "epoch": 0.3618801254884694, + "grad_norm": 0.8788009285926819, + "learning_rate": 9.232714750765908e-06, + "loss": 0.7812, + "step": 6575 + }, + { + "epoch": 0.361935164290825, + "grad_norm": 0.7014517188072205, + "learning_rate": 9.232483991726961e-06, + "loss": 0.7293, + "step": 6576 + }, + { + "epoch": 0.3619902030931807, + "grad_norm": 0.7586061954498291, + "learning_rate": 9.232253200877797e-06, + "loss": 0.7953, + "step": 6577 + }, + { + "epoch": 0.36204524189553633, + "grad_norm": 0.8202564120292664, + "learning_rate": 9.232022378220151e-06, + "loss": 0.8545, + "step": 6578 + }, + { + "epoch": 0.362100280697892, + "grad_norm": 0.7816846966743469, + "learning_rate": 9.231791523755758e-06, + "loss": 0.8573, + "step": 6579 + }, + { + "epoch": 0.36215531950024765, + "grad_norm": 0.883222222328186, + "learning_rate": 9.23156063748635e-06, + "loss": 0.7733, + "step": 6580 + }, + { + "epoch": 0.36221035830260334, + "grad_norm": 0.8472830057144165, + "learning_rate": 9.231329719413668e-06, + "loss": 0.8931, + "step": 6581 + }, + { + "epoch": 0.362265397104959, + "grad_norm": 0.7916087508201599, + "learning_rate": 9.231098769539443e-06, + "loss": 0.8806, + "step": 6582 + }, + { + "epoch": 0.36232043590731466, + "grad_norm": 0.815339982509613, + "learning_rate": 9.230867787865414e-06, + "loss": 0.9081, + "step": 6583 + }, + { + "epoch": 0.3623754747096703, + "grad_norm": 1.2352560758590698, + "learning_rate": 9.230636774393312e-06, + "loss": 0.726, + "step": 6584 + }, + { + "epoch": 0.362430513512026, + "grad_norm": 0.759308397769928, + "learning_rate": 9.230405729124878e-06, + "loss": 0.7648, + "step": 6585 + }, + { + "epoch": 0.3624855523143816, + "grad_norm": 0.8285754323005676, + "learning_rate": 9.230174652061847e-06, + "loss": 0.7972, + "step": 6586 + }, + { + "epoch": 0.3625405911167373, + "grad_norm": 0.7393043041229248, + "learning_rate": 9.229943543205956e-06, + "loss": 0.7859, + "step": 6587 + }, + { + "epoch": 0.36259562991909294, + "grad_norm": 0.7354594469070435, + "learning_rate": 9.229712402558942e-06, + "loss": 0.6683, + "step": 6588 + }, + { + "epoch": 0.36265066872144863, + "grad_norm": 0.8244406580924988, + "learning_rate": 9.229481230122543e-06, + "loss": 0.6977, + "step": 6589 + }, + { + "epoch": 0.36270570752380427, + "grad_norm": 0.810565173625946, + "learning_rate": 9.229250025898493e-06, + "loss": 0.7278, + "step": 6590 + }, + { + "epoch": 0.36276074632615996, + "grad_norm": 0.7443352937698364, + "learning_rate": 9.229018789888532e-06, + "loss": 0.7821, + "step": 6591 + }, + { + "epoch": 0.3628157851285156, + "grad_norm": 0.9211748838424683, + "learning_rate": 9.228787522094398e-06, + "loss": 0.9174, + "step": 6592 + }, + { + "epoch": 0.3628708239308713, + "grad_norm": 0.7099255919456482, + "learning_rate": 9.22855622251783e-06, + "loss": 0.74, + "step": 6593 + }, + { + "epoch": 0.3629258627332269, + "grad_norm": 0.7373029589653015, + "learning_rate": 9.228324891160564e-06, + "loss": 0.7909, + "step": 6594 + }, + { + "epoch": 0.3629809015355826, + "grad_norm": 0.8774755001068115, + "learning_rate": 9.22809352802434e-06, + "loss": 0.8354, + "step": 6595 + }, + { + "epoch": 0.36303594033793823, + "grad_norm": 0.7547696232795715, + "learning_rate": 9.227862133110899e-06, + "loss": 0.6942, + "step": 6596 + }, + { + "epoch": 0.3630909791402939, + "grad_norm": 0.7868191003799438, + "learning_rate": 9.227630706421975e-06, + "loss": 0.7575, + "step": 6597 + }, + { + "epoch": 0.36314601794264956, + "grad_norm": 0.6753721237182617, + "learning_rate": 9.227399247959312e-06, + "loss": 0.7092, + "step": 6598 + }, + { + "epoch": 0.36320105674500525, + "grad_norm": 0.7317304611206055, + "learning_rate": 9.227167757724646e-06, + "loss": 0.8372, + "step": 6599 + }, + { + "epoch": 0.3632560955473609, + "grad_norm": 0.8928040266036987, + "learning_rate": 9.226936235719721e-06, + "loss": 0.8536, + "step": 6600 + }, + { + "epoch": 0.36331113434971657, + "grad_norm": 0.7178280353546143, + "learning_rate": 9.226704681946275e-06, + "loss": 0.7648, + "step": 6601 + }, + { + "epoch": 0.3633661731520722, + "grad_norm": 0.7439851760864258, + "learning_rate": 9.226473096406046e-06, + "loss": 0.8284, + "step": 6602 + }, + { + "epoch": 0.3634212119544279, + "grad_norm": 0.7000887989997864, + "learning_rate": 9.226241479100777e-06, + "loss": 0.7797, + "step": 6603 + }, + { + "epoch": 0.3634762507567835, + "grad_norm": 0.7882626056671143, + "learning_rate": 9.226009830032209e-06, + "loss": 0.72, + "step": 6604 + }, + { + "epoch": 0.3635312895591392, + "grad_norm": 0.6445927619934082, + "learning_rate": 9.225778149202081e-06, + "loss": 0.6785, + "step": 6605 + }, + { + "epoch": 0.36358632836149485, + "grad_norm": 0.7348469495773315, + "learning_rate": 9.225546436612137e-06, + "loss": 0.8117, + "step": 6606 + }, + { + "epoch": 0.36364136716385054, + "grad_norm": 0.7455001473426819, + "learning_rate": 9.225314692264118e-06, + "loss": 0.8196, + "step": 6607 + }, + { + "epoch": 0.36369640596620617, + "grad_norm": 0.7149390578269958, + "learning_rate": 9.225082916159762e-06, + "loss": 0.8841, + "step": 6608 + }, + { + "epoch": 0.36375144476856186, + "grad_norm": 0.7095748782157898, + "learning_rate": 9.224851108300816e-06, + "loss": 0.7336, + "step": 6609 + }, + { + "epoch": 0.3638064835709175, + "grad_norm": 0.7112231850624084, + "learning_rate": 9.224619268689019e-06, + "loss": 0.8606, + "step": 6610 + }, + { + "epoch": 0.3638615223732732, + "grad_norm": 0.8052846789360046, + "learning_rate": 9.224387397326115e-06, + "loss": 0.7838, + "step": 6611 + }, + { + "epoch": 0.3639165611756288, + "grad_norm": 0.7538836002349854, + "learning_rate": 9.224155494213846e-06, + "loss": 0.8252, + "step": 6612 + }, + { + "epoch": 0.3639715999779845, + "grad_norm": 0.6968722343444824, + "learning_rate": 9.223923559353956e-06, + "loss": 0.759, + "step": 6613 + }, + { + "epoch": 0.36402663878034014, + "grad_norm": 0.7797368168830872, + "learning_rate": 9.223691592748185e-06, + "loss": 0.8452, + "step": 6614 + }, + { + "epoch": 0.3640816775826958, + "grad_norm": 0.7738572955131531, + "learning_rate": 9.223459594398278e-06, + "loss": 0.806, + "step": 6615 + }, + { + "epoch": 0.36413671638505146, + "grad_norm": 0.7998547554016113, + "learning_rate": 9.223227564305983e-06, + "loss": 0.748, + "step": 6616 + }, + { + "epoch": 0.3641917551874071, + "grad_norm": 0.838666558265686, + "learning_rate": 9.222995502473037e-06, + "loss": 0.8252, + "step": 6617 + }, + { + "epoch": 0.3642467939897628, + "grad_norm": 1.1672697067260742, + "learning_rate": 9.222763408901189e-06, + "loss": 0.806, + "step": 6618 + }, + { + "epoch": 0.3643018327921184, + "grad_norm": 0.6721193194389343, + "learning_rate": 9.22253128359218e-06, + "loss": 0.6897, + "step": 6619 + }, + { + "epoch": 0.3643568715944741, + "grad_norm": 0.8152795433998108, + "learning_rate": 9.222299126547758e-06, + "loss": 0.8377, + "step": 6620 + }, + { + "epoch": 0.36441191039682974, + "grad_norm": 0.7959492206573486, + "learning_rate": 9.222066937769664e-06, + "loss": 0.8496, + "step": 6621 + }, + { + "epoch": 0.36446694919918543, + "grad_norm": 0.7759784460067749, + "learning_rate": 9.221834717259646e-06, + "loss": 0.7736, + "step": 6622 + }, + { + "epoch": 0.36452198800154106, + "grad_norm": 0.6929076313972473, + "learning_rate": 9.221602465019449e-06, + "loss": 0.7759, + "step": 6623 + }, + { + "epoch": 0.36457702680389675, + "grad_norm": 0.7323315143585205, + "learning_rate": 9.221370181050817e-06, + "loss": 0.7958, + "step": 6624 + }, + { + "epoch": 0.3646320656062524, + "grad_norm": 0.7177294492721558, + "learning_rate": 9.221137865355496e-06, + "loss": 0.8405, + "step": 6625 + }, + { + "epoch": 0.3646871044086081, + "grad_norm": 0.7425093650817871, + "learning_rate": 9.220905517935235e-06, + "loss": 0.7722, + "step": 6626 + }, + { + "epoch": 0.3647421432109637, + "grad_norm": 0.8761040568351746, + "learning_rate": 9.220673138791775e-06, + "loss": 0.8617, + "step": 6627 + }, + { + "epoch": 0.3647971820133194, + "grad_norm": 0.927509069442749, + "learning_rate": 9.220440727926869e-06, + "loss": 0.7839, + "step": 6628 + }, + { + "epoch": 0.36485222081567503, + "grad_norm": 0.874399721622467, + "learning_rate": 9.220208285342258e-06, + "loss": 0.9697, + "step": 6629 + }, + { + "epoch": 0.3649072596180307, + "grad_norm": 0.931384801864624, + "learning_rate": 9.219975811039691e-06, + "loss": 0.8142, + "step": 6630 + }, + { + "epoch": 0.36496229842038636, + "grad_norm": 0.8567885160446167, + "learning_rate": 9.219743305020916e-06, + "loss": 0.7623, + "step": 6631 + }, + { + "epoch": 0.36501733722274204, + "grad_norm": 0.7287514209747314, + "learning_rate": 9.21951076728768e-06, + "loss": 0.8044, + "step": 6632 + }, + { + "epoch": 0.3650723760250977, + "grad_norm": 0.7234703302383423, + "learning_rate": 9.21927819784173e-06, + "loss": 0.7736, + "step": 6633 + }, + { + "epoch": 0.36512741482745337, + "grad_norm": 0.7174978256225586, + "learning_rate": 9.219045596684815e-06, + "loss": 0.7658, + "step": 6634 + }, + { + "epoch": 0.365182453629809, + "grad_norm": 0.751075804233551, + "learning_rate": 9.218812963818682e-06, + "loss": 0.7586, + "step": 6635 + }, + { + "epoch": 0.3652374924321647, + "grad_norm": 0.755283534526825, + "learning_rate": 9.21858029924508e-06, + "loss": 0.8904, + "step": 6636 + }, + { + "epoch": 0.3652925312345203, + "grad_norm": 0.6439716815948486, + "learning_rate": 9.21834760296576e-06, + "loss": 0.7335, + "step": 6637 + }, + { + "epoch": 0.365347570036876, + "grad_norm": 0.735285758972168, + "learning_rate": 9.218114874982467e-06, + "loss": 0.7193, + "step": 6638 + }, + { + "epoch": 0.36540260883923165, + "grad_norm": 0.7724307775497437, + "learning_rate": 9.217882115296952e-06, + "loss": 0.8322, + "step": 6639 + }, + { + "epoch": 0.36545764764158734, + "grad_norm": 0.7771303653717041, + "learning_rate": 9.217649323910964e-06, + "loss": 0.7952, + "step": 6640 + }, + { + "epoch": 0.36551268644394297, + "grad_norm": 0.7753337621688843, + "learning_rate": 9.217416500826251e-06, + "loss": 0.8501, + "step": 6641 + }, + { + "epoch": 0.36556772524629866, + "grad_norm": 0.8104514479637146, + "learning_rate": 9.217183646044567e-06, + "loss": 0.8503, + "step": 6642 + }, + { + "epoch": 0.3656227640486543, + "grad_norm": 0.7191929221153259, + "learning_rate": 9.21695075956766e-06, + "loss": 0.7578, + "step": 6643 + }, + { + "epoch": 0.36567780285101, + "grad_norm": 0.745837926864624, + "learning_rate": 9.216717841397277e-06, + "loss": 0.819, + "step": 6644 + }, + { + "epoch": 0.3657328416533656, + "grad_norm": 0.7019662261009216, + "learning_rate": 9.216484891535174e-06, + "loss": 0.8024, + "step": 6645 + }, + { + "epoch": 0.3657878804557213, + "grad_norm": 0.9709738492965698, + "learning_rate": 9.216251909983095e-06, + "loss": 0.7653, + "step": 6646 + }, + { + "epoch": 0.36584291925807694, + "grad_norm": 0.7973032593727112, + "learning_rate": 9.2160188967428e-06, + "loss": 0.8071, + "step": 6647 + }, + { + "epoch": 0.3658979580604326, + "grad_norm": 0.6945796012878418, + "learning_rate": 9.215785851816034e-06, + "loss": 0.6831, + "step": 6648 + }, + { + "epoch": 0.36595299686278826, + "grad_norm": 0.8685100674629211, + "learning_rate": 9.21555277520455e-06, + "loss": 0.821, + "step": 6649 + }, + { + "epoch": 0.36600803566514395, + "grad_norm": 1.0164310932159424, + "learning_rate": 9.2153196669101e-06, + "loss": 0.7861, + "step": 6650 + }, + { + "epoch": 0.3660630744674996, + "grad_norm": 0.8572850227355957, + "learning_rate": 9.215086526934435e-06, + "loss": 0.7982, + "step": 6651 + }, + { + "epoch": 0.36611811326985527, + "grad_norm": 0.7481987476348877, + "learning_rate": 9.214853355279307e-06, + "loss": 0.8258, + "step": 6652 + }, + { + "epoch": 0.3661731520722109, + "grad_norm": 0.750344455242157, + "learning_rate": 9.214620151946472e-06, + "loss": 0.7842, + "step": 6653 + }, + { + "epoch": 0.3662281908745666, + "grad_norm": 1.0266414880752563, + "learning_rate": 9.214386916937678e-06, + "loss": 0.7313, + "step": 6654 + }, + { + "epoch": 0.36628322967692223, + "grad_norm": 0.7913589477539062, + "learning_rate": 9.214153650254682e-06, + "loss": 0.8251, + "step": 6655 + }, + { + "epoch": 0.3663382684792779, + "grad_norm": 0.7185465693473816, + "learning_rate": 9.213920351899235e-06, + "loss": 0.7145, + "step": 6656 + }, + { + "epoch": 0.36639330728163355, + "grad_norm": 0.7185063362121582, + "learning_rate": 9.213687021873088e-06, + "loss": 0.8321, + "step": 6657 + }, + { + "epoch": 0.3664483460839892, + "grad_norm": 0.8380091190338135, + "learning_rate": 9.213453660178e-06, + "loss": 0.8293, + "step": 6658 + }, + { + "epoch": 0.3665033848863449, + "grad_norm": 0.7569485306739807, + "learning_rate": 9.21322026681572e-06, + "loss": 0.7201, + "step": 6659 + }, + { + "epoch": 0.3665584236887005, + "grad_norm": 0.7212445735931396, + "learning_rate": 9.212986841788005e-06, + "loss": 0.7869, + "step": 6660 + }, + { + "epoch": 0.3666134624910562, + "grad_norm": 0.9435489773750305, + "learning_rate": 9.212753385096612e-06, + "loss": 0.8469, + "step": 6661 + }, + { + "epoch": 0.36666850129341183, + "grad_norm": 0.6609265208244324, + "learning_rate": 9.212519896743289e-06, + "loss": 0.6446, + "step": 6662 + }, + { + "epoch": 0.3667235400957675, + "grad_norm": 0.7232604026794434, + "learning_rate": 9.212286376729794e-06, + "loss": 0.7138, + "step": 6663 + }, + { + "epoch": 0.36677857889812315, + "grad_norm": 0.7276197075843811, + "learning_rate": 9.212052825057882e-06, + "loss": 0.725, + "step": 6664 + }, + { + "epoch": 0.36683361770047884, + "grad_norm": 0.7029727101325989, + "learning_rate": 9.21181924172931e-06, + "loss": 0.6973, + "step": 6665 + }, + { + "epoch": 0.3668886565028345, + "grad_norm": 0.7292968034744263, + "learning_rate": 9.21158562674583e-06, + "loss": 0.6984, + "step": 6666 + }, + { + "epoch": 0.36694369530519017, + "grad_norm": 0.6977009177207947, + "learning_rate": 9.2113519801092e-06, + "loss": 0.7752, + "step": 6667 + }, + { + "epoch": 0.3669987341075458, + "grad_norm": 0.8019471764564514, + "learning_rate": 9.211118301821176e-06, + "loss": 0.7481, + "step": 6668 + }, + { + "epoch": 0.3670537729099015, + "grad_norm": 0.8097867965698242, + "learning_rate": 9.210884591883516e-06, + "loss": 0.8077, + "step": 6669 + }, + { + "epoch": 0.3671088117122571, + "grad_norm": 1.1622828245162964, + "learning_rate": 9.210650850297973e-06, + "loss": 0.8053, + "step": 6670 + }, + { + "epoch": 0.3671638505146128, + "grad_norm": 0.8188957571983337, + "learning_rate": 9.210417077066304e-06, + "loss": 0.7731, + "step": 6671 + }, + { + "epoch": 0.36721888931696844, + "grad_norm": 0.8531584739685059, + "learning_rate": 9.210183272190269e-06, + "loss": 0.8183, + "step": 6672 + }, + { + "epoch": 0.36727392811932413, + "grad_norm": 0.8007203936576843, + "learning_rate": 9.209949435671624e-06, + "loss": 0.7906, + "step": 6673 + }, + { + "epoch": 0.36732896692167977, + "grad_norm": 0.8284860253334045, + "learning_rate": 9.209715567512126e-06, + "loss": 0.7845, + "step": 6674 + }, + { + "epoch": 0.36738400572403546, + "grad_norm": 0.7735304236412048, + "learning_rate": 9.209481667713533e-06, + "loss": 0.7333, + "step": 6675 + }, + { + "epoch": 0.3674390445263911, + "grad_norm": 0.7390912771224976, + "learning_rate": 9.209247736277601e-06, + "loss": 0.7992, + "step": 6676 + }, + { + "epoch": 0.3674940833287468, + "grad_norm": 0.6871926784515381, + "learning_rate": 9.209013773206091e-06, + "loss": 0.7765, + "step": 6677 + }, + { + "epoch": 0.3675491221311024, + "grad_norm": 0.7241746187210083, + "learning_rate": 9.208779778500758e-06, + "loss": 0.7124, + "step": 6678 + }, + { + "epoch": 0.3676041609334581, + "grad_norm": 0.7362630367279053, + "learning_rate": 9.208545752163365e-06, + "loss": 0.7695, + "step": 6679 + }, + { + "epoch": 0.36765919973581374, + "grad_norm": 0.7577944993972778, + "learning_rate": 9.208311694195669e-06, + "loss": 0.8302, + "step": 6680 + }, + { + "epoch": 0.3677142385381694, + "grad_norm": 0.7182355523109436, + "learning_rate": 9.208077604599427e-06, + "loss": 0.8182, + "step": 6681 + }, + { + "epoch": 0.36776927734052506, + "grad_norm": 0.7636679410934448, + "learning_rate": 9.207843483376402e-06, + "loss": 0.7266, + "step": 6682 + }, + { + "epoch": 0.36782431614288075, + "grad_norm": 0.7325936555862427, + "learning_rate": 9.207609330528349e-06, + "loss": 0.735, + "step": 6683 + }, + { + "epoch": 0.3678793549452364, + "grad_norm": 1.1119143962860107, + "learning_rate": 9.207375146057033e-06, + "loss": 1.0124, + "step": 6684 + }, + { + "epoch": 0.36793439374759207, + "grad_norm": 0.7694228291511536, + "learning_rate": 9.207140929964212e-06, + "loss": 0.7803, + "step": 6685 + }, + { + "epoch": 0.3679894325499477, + "grad_norm": 0.7628658413887024, + "learning_rate": 9.206906682251644e-06, + "loss": 0.8057, + "step": 6686 + }, + { + "epoch": 0.3680444713523034, + "grad_norm": 0.766266942024231, + "learning_rate": 9.206672402921092e-06, + "loss": 0.7827, + "step": 6687 + }, + { + "epoch": 0.368099510154659, + "grad_norm": 0.7355746626853943, + "learning_rate": 9.206438091974316e-06, + "loss": 0.8146, + "step": 6688 + }, + { + "epoch": 0.3681545489570147, + "grad_norm": 0.8464547395706177, + "learning_rate": 9.20620374941308e-06, + "loss": 0.8296, + "step": 6689 + }, + { + "epoch": 0.36820958775937035, + "grad_norm": 0.7113955616950989, + "learning_rate": 9.20596937523914e-06, + "loss": 0.7621, + "step": 6690 + }, + { + "epoch": 0.36826462656172604, + "grad_norm": 0.7141324877738953, + "learning_rate": 9.205734969454259e-06, + "loss": 0.738, + "step": 6691 + }, + { + "epoch": 0.36831966536408167, + "grad_norm": 0.7576237320899963, + "learning_rate": 9.2055005320602e-06, + "loss": 0.7727, + "step": 6692 + }, + { + "epoch": 0.36837470416643736, + "grad_norm": 0.7448444962501526, + "learning_rate": 9.205266063058727e-06, + "loss": 0.8238, + "step": 6693 + }, + { + "epoch": 0.368429742968793, + "grad_norm": 0.7441811561584473, + "learning_rate": 9.205031562451599e-06, + "loss": 0.7518, + "step": 6694 + }, + { + "epoch": 0.3684847817711487, + "grad_norm": 0.9284115433692932, + "learning_rate": 9.20479703024058e-06, + "loss": 0.817, + "step": 6695 + }, + { + "epoch": 0.3685398205735043, + "grad_norm": 0.7019243836402893, + "learning_rate": 9.204562466427431e-06, + "loss": 0.7403, + "step": 6696 + }, + { + "epoch": 0.36859485937586, + "grad_norm": 0.6345306634902954, + "learning_rate": 9.204327871013917e-06, + "loss": 0.7058, + "step": 6697 + }, + { + "epoch": 0.36864989817821564, + "grad_norm": 0.7375063300132751, + "learning_rate": 9.2040932440018e-06, + "loss": 0.831, + "step": 6698 + }, + { + "epoch": 0.36870493698057133, + "grad_norm": 0.8213731050491333, + "learning_rate": 9.203858585392842e-06, + "loss": 0.7677, + "step": 6699 + }, + { + "epoch": 0.36875997578292696, + "grad_norm": 0.7114601731300354, + "learning_rate": 9.203623895188809e-06, + "loss": 0.8015, + "step": 6700 + }, + { + "epoch": 0.3688150145852826, + "grad_norm": 0.7707667350769043, + "learning_rate": 9.203389173391463e-06, + "loss": 0.7758, + "step": 6701 + }, + { + "epoch": 0.3688700533876383, + "grad_norm": 0.7374396920204163, + "learning_rate": 9.203154420002572e-06, + "loss": 0.7583, + "step": 6702 + }, + { + "epoch": 0.3689250921899939, + "grad_norm": 0.7156866192817688, + "learning_rate": 9.202919635023895e-06, + "loss": 0.8173, + "step": 6703 + }, + { + "epoch": 0.3689801309923496, + "grad_norm": 0.6811904311180115, + "learning_rate": 9.2026848184572e-06, + "loss": 0.7441, + "step": 6704 + }, + { + "epoch": 0.36903516979470524, + "grad_norm": 0.7515163421630859, + "learning_rate": 9.20244997030425e-06, + "loss": 0.7927, + "step": 6705 + }, + { + "epoch": 0.36909020859706093, + "grad_norm": 0.761116087436676, + "learning_rate": 9.202215090566813e-06, + "loss": 0.7686, + "step": 6706 + }, + { + "epoch": 0.36914524739941657, + "grad_norm": 0.8726711869239807, + "learning_rate": 9.20198017924665e-06, + "loss": 0.7831, + "step": 6707 + }, + { + "epoch": 0.36920028620177225, + "grad_norm": 0.6868153810501099, + "learning_rate": 9.20174523634553e-06, + "loss": 0.7855, + "step": 6708 + }, + { + "epoch": 0.3692553250041279, + "grad_norm": 0.7140498757362366, + "learning_rate": 9.201510261865218e-06, + "loss": 0.8144, + "step": 6709 + }, + { + "epoch": 0.3693103638064836, + "grad_norm": 0.8745181560516357, + "learning_rate": 9.201275255807478e-06, + "loss": 0.9204, + "step": 6710 + }, + { + "epoch": 0.3693654026088392, + "grad_norm": 0.6535945534706116, + "learning_rate": 9.20104021817408e-06, + "loss": 0.7729, + "step": 6711 + }, + { + "epoch": 0.3694204414111949, + "grad_norm": 0.655857503414154, + "learning_rate": 9.200805148966785e-06, + "loss": 0.8373, + "step": 6712 + }, + { + "epoch": 0.36947548021355053, + "grad_norm": 0.8393271565437317, + "learning_rate": 9.200570048187365e-06, + "loss": 0.8532, + "step": 6713 + }, + { + "epoch": 0.3695305190159062, + "grad_norm": 0.7484574913978577, + "learning_rate": 9.200334915837585e-06, + "loss": 0.8411, + "step": 6714 + }, + { + "epoch": 0.36958555781826186, + "grad_norm": 0.9913665652275085, + "learning_rate": 9.200099751919212e-06, + "loss": 0.9011, + "step": 6715 + }, + { + "epoch": 0.36964059662061755, + "grad_norm": 0.7314063310623169, + "learning_rate": 9.199864556434013e-06, + "loss": 0.7184, + "step": 6716 + }, + { + "epoch": 0.3696956354229732, + "grad_norm": 0.7881553173065186, + "learning_rate": 9.199629329383758e-06, + "loss": 0.796, + "step": 6717 + }, + { + "epoch": 0.36975067422532887, + "grad_norm": 0.7440283298492432, + "learning_rate": 9.199394070770212e-06, + "loss": 0.7472, + "step": 6718 + }, + { + "epoch": 0.3698057130276845, + "grad_norm": 0.6916326880455017, + "learning_rate": 9.199158780595144e-06, + "loss": 0.6808, + "step": 6719 + }, + { + "epoch": 0.3698607518300402, + "grad_norm": 0.8482714295387268, + "learning_rate": 9.198923458860323e-06, + "loss": 0.7795, + "step": 6720 + }, + { + "epoch": 0.3699157906323958, + "grad_norm": 0.7541999816894531, + "learning_rate": 9.198688105567516e-06, + "loss": 0.7917, + "step": 6721 + }, + { + "epoch": 0.3699708294347515, + "grad_norm": 0.794335126876831, + "learning_rate": 9.198452720718494e-06, + "loss": 0.8463, + "step": 6722 + }, + { + "epoch": 0.37002586823710715, + "grad_norm": 0.7866827845573425, + "learning_rate": 9.198217304315025e-06, + "loss": 0.7938, + "step": 6723 + }, + { + "epoch": 0.37008090703946284, + "grad_norm": 0.7393556833267212, + "learning_rate": 9.19798185635888e-06, + "loss": 0.7825, + "step": 6724 + }, + { + "epoch": 0.37013594584181847, + "grad_norm": 0.7131090760231018, + "learning_rate": 9.197746376851825e-06, + "loss": 0.7184, + "step": 6725 + }, + { + "epoch": 0.37019098464417416, + "grad_norm": 0.7054039239883423, + "learning_rate": 9.197510865795634e-06, + "loss": 0.7458, + "step": 6726 + }, + { + "epoch": 0.3702460234465298, + "grad_norm": 0.7437009811401367, + "learning_rate": 9.197275323192073e-06, + "loss": 0.7921, + "step": 6727 + }, + { + "epoch": 0.3703010622488855, + "grad_norm": 1.0703076124191284, + "learning_rate": 9.197039749042916e-06, + "loss": 0.771, + "step": 6728 + }, + { + "epoch": 0.3703561010512411, + "grad_norm": 0.8278045654296875, + "learning_rate": 9.196804143349929e-06, + "loss": 0.8984, + "step": 6729 + }, + { + "epoch": 0.3704111398535968, + "grad_norm": 0.7713067531585693, + "learning_rate": 9.196568506114887e-06, + "loss": 0.7702, + "step": 6730 + }, + { + "epoch": 0.37046617865595244, + "grad_norm": 0.9040505290031433, + "learning_rate": 9.19633283733956e-06, + "loss": 0.7113, + "step": 6731 + }, + { + "epoch": 0.3705212174583081, + "grad_norm": 0.8853700757026672, + "learning_rate": 9.196097137025718e-06, + "loss": 0.8445, + "step": 6732 + }, + { + "epoch": 0.37057625626066376, + "grad_norm": 0.6870817542076111, + "learning_rate": 9.195861405175133e-06, + "loss": 0.7613, + "step": 6733 + }, + { + "epoch": 0.37063129506301945, + "grad_norm": 0.7539152503013611, + "learning_rate": 9.195625641789579e-06, + "loss": 0.7478, + "step": 6734 + }, + { + "epoch": 0.3706863338653751, + "grad_norm": 0.7084356546401978, + "learning_rate": 9.195389846870822e-06, + "loss": 0.7803, + "step": 6735 + }, + { + "epoch": 0.3707413726677308, + "grad_norm": 0.7883948087692261, + "learning_rate": 9.19515402042064e-06, + "loss": 0.8606, + "step": 6736 + }, + { + "epoch": 0.3707964114700864, + "grad_norm": 0.714948296546936, + "learning_rate": 9.194918162440804e-06, + "loss": 0.8066, + "step": 6737 + }, + { + "epoch": 0.3708514502724421, + "grad_norm": 0.7110786437988281, + "learning_rate": 9.194682272933085e-06, + "loss": 0.7439, + "step": 6738 + }, + { + "epoch": 0.37090648907479773, + "grad_norm": 0.7281045317649841, + "learning_rate": 9.194446351899257e-06, + "loss": 0.7772, + "step": 6739 + }, + { + "epoch": 0.3709615278771534, + "grad_norm": 0.7351245880126953, + "learning_rate": 9.194210399341093e-06, + "loss": 0.8777, + "step": 6740 + }, + { + "epoch": 0.37101656667950905, + "grad_norm": 0.8028532266616821, + "learning_rate": 9.193974415260367e-06, + "loss": 0.7461, + "step": 6741 + }, + { + "epoch": 0.37107160548186474, + "grad_norm": 0.8015451431274414, + "learning_rate": 9.19373839965885e-06, + "loss": 0.8006, + "step": 6742 + }, + { + "epoch": 0.3711266442842204, + "grad_norm": 0.9567442536354065, + "learning_rate": 9.193502352538321e-06, + "loss": 0.8636, + "step": 6743 + }, + { + "epoch": 0.371181683086576, + "grad_norm": 1.1413114070892334, + "learning_rate": 9.193266273900547e-06, + "loss": 0.8976, + "step": 6744 + }, + { + "epoch": 0.3712367218889317, + "grad_norm": 0.6971789002418518, + "learning_rate": 9.19303016374731e-06, + "loss": 0.7419, + "step": 6745 + }, + { + "epoch": 0.37129176069128733, + "grad_norm": 0.8117435574531555, + "learning_rate": 9.192794022080378e-06, + "loss": 0.8166, + "step": 6746 + }, + { + "epoch": 0.371346799493643, + "grad_norm": 0.7748119831085205, + "learning_rate": 9.19255784890153e-06, + "loss": 0.8073, + "step": 6747 + }, + { + "epoch": 0.37140183829599865, + "grad_norm": 0.6550068259239197, + "learning_rate": 9.192321644212539e-06, + "loss": 0.6976, + "step": 6748 + }, + { + "epoch": 0.37145687709835434, + "grad_norm": 0.7931404709815979, + "learning_rate": 9.19208540801518e-06, + "loss": 0.7153, + "step": 6749 + }, + { + "epoch": 0.37151191590071, + "grad_norm": 0.7107539176940918, + "learning_rate": 9.19184914031123e-06, + "loss": 0.7616, + "step": 6750 + }, + { + "epoch": 0.37156695470306567, + "grad_norm": 0.6983848810195923, + "learning_rate": 9.191612841102463e-06, + "loss": 0.6507, + "step": 6751 + }, + { + "epoch": 0.3716219935054213, + "grad_norm": 0.7653477787971497, + "learning_rate": 9.191376510390657e-06, + "loss": 0.708, + "step": 6752 + }, + { + "epoch": 0.371677032307777, + "grad_norm": 0.8903954029083252, + "learning_rate": 9.191140148177586e-06, + "loss": 0.8131, + "step": 6753 + }, + { + "epoch": 0.3717320711101326, + "grad_norm": 0.7584933042526245, + "learning_rate": 9.190903754465028e-06, + "loss": 0.8178, + "step": 6754 + }, + { + "epoch": 0.3717871099124883, + "grad_norm": 0.7338405847549438, + "learning_rate": 9.19066732925476e-06, + "loss": 0.7717, + "step": 6755 + }, + { + "epoch": 0.37184214871484395, + "grad_norm": 0.764944851398468, + "learning_rate": 9.190430872548557e-06, + "loss": 0.7762, + "step": 6756 + }, + { + "epoch": 0.37189718751719963, + "grad_norm": 0.7362231612205505, + "learning_rate": 9.190194384348199e-06, + "loss": 0.8277, + "step": 6757 + }, + { + "epoch": 0.37195222631955527, + "grad_norm": 0.7462226748466492, + "learning_rate": 9.18995786465546e-06, + "loss": 0.7362, + "step": 6758 + }, + { + "epoch": 0.37200726512191096, + "grad_norm": 0.7769725322723389, + "learning_rate": 9.18972131347212e-06, + "loss": 0.8217, + "step": 6759 + }, + { + "epoch": 0.3720623039242666, + "grad_norm": 0.7263969779014587, + "learning_rate": 9.189484730799956e-06, + "loss": 0.7719, + "step": 6760 + }, + { + "epoch": 0.3721173427266223, + "grad_norm": 0.7612473964691162, + "learning_rate": 9.189248116640746e-06, + "loss": 0.7149, + "step": 6761 + }, + { + "epoch": 0.3721723815289779, + "grad_norm": 0.6813042759895325, + "learning_rate": 9.189011470996268e-06, + "loss": 0.7119, + "step": 6762 + }, + { + "epoch": 0.3722274203313336, + "grad_norm": 0.7376571297645569, + "learning_rate": 9.188774793868302e-06, + "loss": 0.7998, + "step": 6763 + }, + { + "epoch": 0.37228245913368924, + "grad_norm": 0.8592102527618408, + "learning_rate": 9.188538085258626e-06, + "loss": 0.8026, + "step": 6764 + }, + { + "epoch": 0.3723374979360449, + "grad_norm": 0.7666613459587097, + "learning_rate": 9.188301345169017e-06, + "loss": 0.8571, + "step": 6765 + }, + { + "epoch": 0.37239253673840056, + "grad_norm": 0.7118985652923584, + "learning_rate": 9.188064573601258e-06, + "loss": 0.7637, + "step": 6766 + }, + { + "epoch": 0.37244757554075625, + "grad_norm": 0.8247082233428955, + "learning_rate": 9.187827770557127e-06, + "loss": 0.8209, + "step": 6767 + }, + { + "epoch": 0.3725026143431119, + "grad_norm": 0.7259567975997925, + "learning_rate": 9.187590936038403e-06, + "loss": 0.7918, + "step": 6768 + }, + { + "epoch": 0.37255765314546757, + "grad_norm": 0.7409893274307251, + "learning_rate": 9.187354070046867e-06, + "loss": 0.8004, + "step": 6769 + }, + { + "epoch": 0.3726126919478232, + "grad_norm": 0.8163084387779236, + "learning_rate": 9.187117172584298e-06, + "loss": 0.8452, + "step": 6770 + }, + { + "epoch": 0.3726677307501789, + "grad_norm": 0.9241586923599243, + "learning_rate": 9.186880243652477e-06, + "loss": 0.8939, + "step": 6771 + }, + { + "epoch": 0.3727227695525345, + "grad_norm": 0.710434079170227, + "learning_rate": 9.186643283253185e-06, + "loss": 0.7337, + "step": 6772 + }, + { + "epoch": 0.3727778083548902, + "grad_norm": 0.7850505709648132, + "learning_rate": 9.186406291388203e-06, + "loss": 0.7892, + "step": 6773 + }, + { + "epoch": 0.37283284715724585, + "grad_norm": 0.813979983329773, + "learning_rate": 9.186169268059311e-06, + "loss": 0.7993, + "step": 6774 + }, + { + "epoch": 0.37288788595960154, + "grad_norm": 0.7923213243484497, + "learning_rate": 9.185932213268292e-06, + "loss": 0.7501, + "step": 6775 + }, + { + "epoch": 0.3729429247619572, + "grad_norm": 0.7923155426979065, + "learning_rate": 9.185695127016928e-06, + "loss": 0.8435, + "step": 6776 + }, + { + "epoch": 0.37299796356431286, + "grad_norm": 0.69893479347229, + "learning_rate": 9.185458009306999e-06, + "loss": 0.7155, + "step": 6777 + }, + { + "epoch": 0.3730530023666685, + "grad_norm": 0.7848305106163025, + "learning_rate": 9.185220860140289e-06, + "loss": 0.7971, + "step": 6778 + }, + { + "epoch": 0.3731080411690242, + "grad_norm": 0.6707655787467957, + "learning_rate": 9.184983679518578e-06, + "loss": 0.6939, + "step": 6779 + }, + { + "epoch": 0.3731630799713798, + "grad_norm": 0.6612532734870911, + "learning_rate": 9.18474646744365e-06, + "loss": 0.7361, + "step": 6780 + }, + { + "epoch": 0.3732181187737355, + "grad_norm": 0.7753985524177551, + "learning_rate": 9.184509223917288e-06, + "loss": 0.7263, + "step": 6781 + }, + { + "epoch": 0.37327315757609114, + "grad_norm": 0.6856646537780762, + "learning_rate": 9.184271948941275e-06, + "loss": 0.6923, + "step": 6782 + }, + { + "epoch": 0.37332819637844683, + "grad_norm": 0.7223647832870483, + "learning_rate": 9.184034642517393e-06, + "loss": 0.793, + "step": 6783 + }, + { + "epoch": 0.37338323518080246, + "grad_norm": 0.7428838014602661, + "learning_rate": 9.183797304647428e-06, + "loss": 0.7781, + "step": 6784 + }, + { + "epoch": 0.37343827398315815, + "grad_norm": 0.7301773428916931, + "learning_rate": 9.183559935333161e-06, + "loss": 0.7964, + "step": 6785 + }, + { + "epoch": 0.3734933127855138, + "grad_norm": 0.7883384823799133, + "learning_rate": 9.183322534576378e-06, + "loss": 0.8904, + "step": 6786 + }, + { + "epoch": 0.3735483515878694, + "grad_norm": 0.7943564653396606, + "learning_rate": 9.183085102378864e-06, + "loss": 0.7229, + "step": 6787 + }, + { + "epoch": 0.3736033903902251, + "grad_norm": 0.7385129928588867, + "learning_rate": 9.1828476387424e-06, + "loss": 0.7967, + "step": 6788 + }, + { + "epoch": 0.37365842919258074, + "grad_norm": 0.7968102097511292, + "learning_rate": 9.182610143668775e-06, + "loss": 0.8016, + "step": 6789 + }, + { + "epoch": 0.37371346799493643, + "grad_norm": 0.7810283303260803, + "learning_rate": 9.18237261715977e-06, + "loss": 0.8956, + "step": 6790 + }, + { + "epoch": 0.37376850679729207, + "grad_norm": 0.7110065221786499, + "learning_rate": 9.182135059217172e-06, + "loss": 0.7808, + "step": 6791 + }, + { + "epoch": 0.37382354559964776, + "grad_norm": 0.7513633370399475, + "learning_rate": 9.181897469842767e-06, + "loss": 0.8236, + "step": 6792 + }, + { + "epoch": 0.3738785844020034, + "grad_norm": 0.7850426435470581, + "learning_rate": 9.18165984903834e-06, + "loss": 0.8642, + "step": 6793 + }, + { + "epoch": 0.3739336232043591, + "grad_norm": 1.4948225021362305, + "learning_rate": 9.181422196805676e-06, + "loss": 0.8765, + "step": 6794 + }, + { + "epoch": 0.3739886620067147, + "grad_norm": 0.8242343068122864, + "learning_rate": 9.181184513146563e-06, + "loss": 0.7213, + "step": 6795 + }, + { + "epoch": 0.3740437008090704, + "grad_norm": 0.8017476797103882, + "learning_rate": 9.180946798062786e-06, + "loss": 0.655, + "step": 6796 + }, + { + "epoch": 0.37409873961142603, + "grad_norm": 0.9573387503623962, + "learning_rate": 9.180709051556132e-06, + "loss": 0.8674, + "step": 6797 + }, + { + "epoch": 0.3741537784137817, + "grad_norm": 0.7575511932373047, + "learning_rate": 9.180471273628388e-06, + "loss": 0.8672, + "step": 6798 + }, + { + "epoch": 0.37420881721613736, + "grad_norm": 0.7723323702812195, + "learning_rate": 9.180233464281343e-06, + "loss": 0.7698, + "step": 6799 + }, + { + "epoch": 0.37426385601849305, + "grad_norm": 0.8352731466293335, + "learning_rate": 9.17999562351678e-06, + "loss": 0.9248, + "step": 6800 + }, + { + "epoch": 0.3743188948208487, + "grad_norm": 0.7459322214126587, + "learning_rate": 9.179757751336488e-06, + "loss": 0.7561, + "step": 6801 + }, + { + "epoch": 0.37437393362320437, + "grad_norm": 0.8053051829338074, + "learning_rate": 9.179519847742257e-06, + "loss": 0.8743, + "step": 6802 + }, + { + "epoch": 0.37442897242556, + "grad_norm": 0.7781768441200256, + "learning_rate": 9.179281912735873e-06, + "loss": 0.7426, + "step": 6803 + }, + { + "epoch": 0.3744840112279157, + "grad_norm": 0.6812007427215576, + "learning_rate": 9.179043946319126e-06, + "loss": 0.761, + "step": 6804 + }, + { + "epoch": 0.3745390500302713, + "grad_norm": 0.8327108025550842, + "learning_rate": 9.178805948493803e-06, + "loss": 0.7633, + "step": 6805 + }, + { + "epoch": 0.374594088832627, + "grad_norm": 0.7519007921218872, + "learning_rate": 9.178567919261692e-06, + "loss": 0.8268, + "step": 6806 + }, + { + "epoch": 0.37464912763498265, + "grad_norm": 0.7507897019386292, + "learning_rate": 9.178329858624584e-06, + "loss": 0.8734, + "step": 6807 + }, + { + "epoch": 0.37470416643733834, + "grad_norm": 0.6874666213989258, + "learning_rate": 9.178091766584267e-06, + "loss": 0.6669, + "step": 6808 + }, + { + "epoch": 0.37475920523969397, + "grad_norm": 0.6987403631210327, + "learning_rate": 9.17785364314253e-06, + "loss": 0.7627, + "step": 6809 + }, + { + "epoch": 0.37481424404204966, + "grad_norm": 0.7777343392372131, + "learning_rate": 9.177615488301163e-06, + "loss": 0.7637, + "step": 6810 + }, + { + "epoch": 0.3748692828444053, + "grad_norm": 0.71980881690979, + "learning_rate": 9.177377302061958e-06, + "loss": 0.7964, + "step": 6811 + }, + { + "epoch": 0.374924321646761, + "grad_norm": 0.627328634262085, + "learning_rate": 9.177139084426704e-06, + "loss": 0.6862, + "step": 6812 + }, + { + "epoch": 0.3749793604491166, + "grad_norm": 0.7099852561950684, + "learning_rate": 9.176900835397188e-06, + "loss": 0.7592, + "step": 6813 + }, + { + "epoch": 0.3750343992514723, + "grad_norm": 0.7880212664604187, + "learning_rate": 9.176662554975205e-06, + "loss": 0.756, + "step": 6814 + }, + { + "epoch": 0.37508943805382794, + "grad_norm": 0.7347460389137268, + "learning_rate": 9.176424243162546e-06, + "loss": 0.8537, + "step": 6815 + }, + { + "epoch": 0.37514447685618363, + "grad_norm": 0.7020999789237976, + "learning_rate": 9.176185899960996e-06, + "loss": 0.7844, + "step": 6816 + }, + { + "epoch": 0.37519951565853926, + "grad_norm": 0.6857696175575256, + "learning_rate": 9.175947525372355e-06, + "loss": 0.8491, + "step": 6817 + }, + { + "epoch": 0.37525455446089495, + "grad_norm": 0.6882391571998596, + "learning_rate": 9.175709119398409e-06, + "loss": 0.7797, + "step": 6818 + }, + { + "epoch": 0.3753095932632506, + "grad_norm": 0.7788485288619995, + "learning_rate": 9.17547068204095e-06, + "loss": 0.6898, + "step": 6819 + }, + { + "epoch": 0.3753646320656063, + "grad_norm": 0.8529300093650818, + "learning_rate": 9.17523221330177e-06, + "loss": 0.8113, + "step": 6820 + }, + { + "epoch": 0.3754196708679619, + "grad_norm": 0.6297540068626404, + "learning_rate": 9.174993713182663e-06, + "loss": 0.7133, + "step": 6821 + }, + { + "epoch": 0.3754747096703176, + "grad_norm": 0.8225051760673523, + "learning_rate": 9.174755181685422e-06, + "loss": 0.83, + "step": 6822 + }, + { + "epoch": 0.37552974847267323, + "grad_norm": 0.7445290684700012, + "learning_rate": 9.174516618811838e-06, + "loss": 0.8597, + "step": 6823 + }, + { + "epoch": 0.3755847872750289, + "grad_norm": 0.7890744209289551, + "learning_rate": 9.174278024563706e-06, + "loss": 0.8021, + "step": 6824 + }, + { + "epoch": 0.37563982607738455, + "grad_norm": 0.644434928894043, + "learning_rate": 9.174039398942815e-06, + "loss": 0.7154, + "step": 6825 + }, + { + "epoch": 0.37569486487974024, + "grad_norm": 0.7664980292320251, + "learning_rate": 9.173800741950962e-06, + "loss": 0.8496, + "step": 6826 + }, + { + "epoch": 0.3757499036820959, + "grad_norm": 0.8062339425086975, + "learning_rate": 9.173562053589942e-06, + "loss": 0.7736, + "step": 6827 + }, + { + "epoch": 0.37580494248445157, + "grad_norm": 0.6334213018417358, + "learning_rate": 9.173323333861543e-06, + "loss": 0.6513, + "step": 6828 + }, + { + "epoch": 0.3758599812868072, + "grad_norm": 0.6825501322746277, + "learning_rate": 9.173084582767567e-06, + "loss": 0.755, + "step": 6829 + }, + { + "epoch": 0.37591502008916283, + "grad_norm": 0.7353835105895996, + "learning_rate": 9.172845800309801e-06, + "loss": 0.7783, + "step": 6830 + }, + { + "epoch": 0.3759700588915185, + "grad_norm": 0.7830193638801575, + "learning_rate": 9.172606986490046e-06, + "loss": 0.7352, + "step": 6831 + }, + { + "epoch": 0.37602509769387416, + "grad_norm": 0.7464943528175354, + "learning_rate": 9.172368141310091e-06, + "loss": 0.6454, + "step": 6832 + }, + { + "epoch": 0.37608013649622984, + "grad_norm": 0.7171493172645569, + "learning_rate": 9.172129264771736e-06, + "loss": 0.7978, + "step": 6833 + }, + { + "epoch": 0.3761351752985855, + "grad_norm": 0.6929624676704407, + "learning_rate": 9.171890356876774e-06, + "loss": 0.8026, + "step": 6834 + }, + { + "epoch": 0.37619021410094117, + "grad_norm": 0.7240758538246155, + "learning_rate": 9.171651417627e-06, + "loss": 0.8469, + "step": 6835 + }, + { + "epoch": 0.3762452529032968, + "grad_norm": 0.7713736891746521, + "learning_rate": 9.17141244702421e-06, + "loss": 0.8307, + "step": 6836 + }, + { + "epoch": 0.3763002917056525, + "grad_norm": 0.7417639493942261, + "learning_rate": 9.171173445070203e-06, + "loss": 0.8165, + "step": 6837 + }, + { + "epoch": 0.3763553305080081, + "grad_norm": 0.811005711555481, + "learning_rate": 9.17093441176677e-06, + "loss": 0.8418, + "step": 6838 + }, + { + "epoch": 0.3764103693103638, + "grad_norm": 0.9996818900108337, + "learning_rate": 9.170695347115713e-06, + "loss": 0.851, + "step": 6839 + }, + { + "epoch": 0.37646540811271945, + "grad_norm": 0.7703381776809692, + "learning_rate": 9.170456251118824e-06, + "loss": 0.8308, + "step": 6840 + }, + { + "epoch": 0.37652044691507514, + "grad_norm": 0.7194466590881348, + "learning_rate": 9.170217123777904e-06, + "loss": 0.699, + "step": 6841 + }, + { + "epoch": 0.37657548571743077, + "grad_norm": 0.7146462202072144, + "learning_rate": 9.169977965094748e-06, + "loss": 0.8247, + "step": 6842 + }, + { + "epoch": 0.37663052451978646, + "grad_norm": 0.7490555047988892, + "learning_rate": 9.169738775071153e-06, + "loss": 0.8627, + "step": 6843 + }, + { + "epoch": 0.3766855633221421, + "grad_norm": 0.827996015548706, + "learning_rate": 9.169499553708919e-06, + "loss": 0.7454, + "step": 6844 + }, + { + "epoch": 0.3767406021244978, + "grad_norm": 0.7185913324356079, + "learning_rate": 9.16926030100984e-06, + "loss": 0.7018, + "step": 6845 + }, + { + "epoch": 0.3767956409268534, + "grad_norm": 0.7879654169082642, + "learning_rate": 9.169021016975718e-06, + "loss": 0.8144, + "step": 6846 + }, + { + "epoch": 0.3768506797292091, + "grad_norm": 0.7072417736053467, + "learning_rate": 9.168781701608352e-06, + "loss": 0.7572, + "step": 6847 + }, + { + "epoch": 0.37690571853156474, + "grad_norm": 0.7359803915023804, + "learning_rate": 9.168542354909536e-06, + "loss": 0.7712, + "step": 6848 + }, + { + "epoch": 0.3769607573339204, + "grad_norm": 0.7672479748725891, + "learning_rate": 9.168302976881072e-06, + "loss": 0.7696, + "step": 6849 + }, + { + "epoch": 0.37701579613627606, + "grad_norm": 0.7276006937026978, + "learning_rate": 9.168063567524758e-06, + "loss": 0.8235, + "step": 6850 + }, + { + "epoch": 0.37707083493863175, + "grad_norm": 0.673577606678009, + "learning_rate": 9.167824126842396e-06, + "loss": 0.6515, + "step": 6851 + }, + { + "epoch": 0.3771258737409874, + "grad_norm": 0.7257997989654541, + "learning_rate": 9.167584654835782e-06, + "loss": 0.729, + "step": 6852 + }, + { + "epoch": 0.37718091254334307, + "grad_norm": 0.6655071377754211, + "learning_rate": 9.167345151506717e-06, + "loss": 0.7917, + "step": 6853 + }, + { + "epoch": 0.3772359513456987, + "grad_norm": 0.7603726983070374, + "learning_rate": 9.167105616857002e-06, + "loss": 0.8383, + "step": 6854 + }, + { + "epoch": 0.3772909901480544, + "grad_norm": 0.7066939473152161, + "learning_rate": 9.166866050888437e-06, + "loss": 0.7589, + "step": 6855 + }, + { + "epoch": 0.37734602895041003, + "grad_norm": 0.7002355456352234, + "learning_rate": 9.16662645360282e-06, + "loss": 0.8305, + "step": 6856 + }, + { + "epoch": 0.3774010677527657, + "grad_norm": 0.9499780535697937, + "learning_rate": 9.166386825001957e-06, + "loss": 0.78, + "step": 6857 + }, + { + "epoch": 0.37745610655512135, + "grad_norm": 0.7136938571929932, + "learning_rate": 9.166147165087645e-06, + "loss": 0.7449, + "step": 6858 + }, + { + "epoch": 0.37751114535747704, + "grad_norm": 0.740443766117096, + "learning_rate": 9.165907473861687e-06, + "loss": 0.8228, + "step": 6859 + }, + { + "epoch": 0.3775661841598327, + "grad_norm": 0.7649856209754944, + "learning_rate": 9.165667751325879e-06, + "loss": 0.7762, + "step": 6860 + }, + { + "epoch": 0.37762122296218836, + "grad_norm": 0.743251383304596, + "learning_rate": 9.165427997482032e-06, + "loss": 0.7536, + "step": 6861 + }, + { + "epoch": 0.377676261764544, + "grad_norm": 0.7023851871490479, + "learning_rate": 9.165188212331941e-06, + "loss": 0.7327, + "step": 6862 + }, + { + "epoch": 0.3777313005668997, + "grad_norm": 0.7304333448410034, + "learning_rate": 9.164948395877411e-06, + "loss": 0.8816, + "step": 6863 + }, + { + "epoch": 0.3777863393692553, + "grad_norm": 0.6666659116744995, + "learning_rate": 9.164708548120244e-06, + "loss": 0.7821, + "step": 6864 + }, + { + "epoch": 0.377841378171611, + "grad_norm": 0.6542865037918091, + "learning_rate": 9.164468669062242e-06, + "loss": 0.7044, + "step": 6865 + }, + { + "epoch": 0.37789641697396664, + "grad_norm": 0.7436043620109558, + "learning_rate": 9.16422875870521e-06, + "loss": 0.8492, + "step": 6866 + }, + { + "epoch": 0.37795145577632233, + "grad_norm": 0.7660424709320068, + "learning_rate": 9.163988817050947e-06, + "loss": 0.7236, + "step": 6867 + }, + { + "epoch": 0.37800649457867797, + "grad_norm": 0.7288914918899536, + "learning_rate": 9.16374884410126e-06, + "loss": 0.6361, + "step": 6868 + }, + { + "epoch": 0.37806153338103365, + "grad_norm": 0.884832501411438, + "learning_rate": 9.163508839857948e-06, + "loss": 0.8112, + "step": 6869 + }, + { + "epoch": 0.3781165721833893, + "grad_norm": 0.937660813331604, + "learning_rate": 9.163268804322822e-06, + "loss": 0.6405, + "step": 6870 + }, + { + "epoch": 0.378171610985745, + "grad_norm": 0.8295212388038635, + "learning_rate": 9.16302873749768e-06, + "loss": 0.8107, + "step": 6871 + }, + { + "epoch": 0.3782266497881006, + "grad_norm": 1.0573647022247314, + "learning_rate": 9.16278863938433e-06, + "loss": 0.7792, + "step": 6872 + }, + { + "epoch": 0.37828168859045624, + "grad_norm": 0.8450027108192444, + "learning_rate": 9.162548509984574e-06, + "loss": 0.8103, + "step": 6873 + }, + { + "epoch": 0.37833672739281193, + "grad_norm": 0.7372947931289673, + "learning_rate": 9.162308349300218e-06, + "loss": 0.8232, + "step": 6874 + }, + { + "epoch": 0.37839176619516757, + "grad_norm": 0.7573776841163635, + "learning_rate": 9.162068157333066e-06, + "loss": 0.773, + "step": 6875 + }, + { + "epoch": 0.37844680499752326, + "grad_norm": 0.7883201241493225, + "learning_rate": 9.161827934084924e-06, + "loss": 0.7561, + "step": 6876 + }, + { + "epoch": 0.3785018437998789, + "grad_norm": 0.7195025086402893, + "learning_rate": 9.161587679557598e-06, + "loss": 0.798, + "step": 6877 + }, + { + "epoch": 0.3785568826022346, + "grad_norm": 0.7047843337059021, + "learning_rate": 9.161347393752891e-06, + "loss": 0.8122, + "step": 6878 + }, + { + "epoch": 0.3786119214045902, + "grad_norm": 0.7354363203048706, + "learning_rate": 9.161107076672613e-06, + "loss": 0.7296, + "step": 6879 + }, + { + "epoch": 0.3786669602069459, + "grad_norm": 0.7748313546180725, + "learning_rate": 9.160866728318567e-06, + "loss": 0.9576, + "step": 6880 + }, + { + "epoch": 0.37872199900930154, + "grad_norm": 0.7197638750076294, + "learning_rate": 9.16062634869256e-06, + "loss": 0.8054, + "step": 6881 + }, + { + "epoch": 0.3787770378116572, + "grad_norm": 0.7086492776870728, + "learning_rate": 9.1603859377964e-06, + "loss": 0.8938, + "step": 6882 + }, + { + "epoch": 0.37883207661401286, + "grad_norm": 0.7764425873756409, + "learning_rate": 9.160145495631894e-06, + "loss": 0.7562, + "step": 6883 + }, + { + "epoch": 0.37888711541636855, + "grad_norm": 0.7673479914665222, + "learning_rate": 9.159905022200846e-06, + "loss": 0.6783, + "step": 6884 + }, + { + "epoch": 0.3789421542187242, + "grad_norm": 0.7323669195175171, + "learning_rate": 9.159664517505067e-06, + "loss": 0.8274, + "step": 6885 + }, + { + "epoch": 0.37899719302107987, + "grad_norm": 0.8283136487007141, + "learning_rate": 9.159423981546362e-06, + "loss": 0.7184, + "step": 6886 + }, + { + "epoch": 0.3790522318234355, + "grad_norm": 0.6949145793914795, + "learning_rate": 9.15918341432654e-06, + "loss": 0.7843, + "step": 6887 + }, + { + "epoch": 0.3791072706257912, + "grad_norm": 0.8584639430046082, + "learning_rate": 9.158942815847408e-06, + "loss": 0.71, + "step": 6888 + }, + { + "epoch": 0.3791623094281468, + "grad_norm": 0.7125271558761597, + "learning_rate": 9.158702186110777e-06, + "loss": 0.7432, + "step": 6889 + }, + { + "epoch": 0.3792173482305025, + "grad_norm": 0.6657430529594421, + "learning_rate": 9.158461525118452e-06, + "loss": 0.6715, + "step": 6890 + }, + { + "epoch": 0.37927238703285815, + "grad_norm": 0.770226240158081, + "learning_rate": 9.158220832872243e-06, + "loss": 0.7029, + "step": 6891 + }, + { + "epoch": 0.37932742583521384, + "grad_norm": 0.7697272300720215, + "learning_rate": 9.15798010937396e-06, + "loss": 0.686, + "step": 6892 + }, + { + "epoch": 0.37938246463756947, + "grad_norm": 0.7693290710449219, + "learning_rate": 9.157739354625413e-06, + "loss": 0.7669, + "step": 6893 + }, + { + "epoch": 0.37943750343992516, + "grad_norm": 0.8365996479988098, + "learning_rate": 9.157498568628406e-06, + "loss": 0.8254, + "step": 6894 + }, + { + "epoch": 0.3794925422422808, + "grad_norm": 0.8075883388519287, + "learning_rate": 9.157257751384756e-06, + "loss": 0.8311, + "step": 6895 + }, + { + "epoch": 0.3795475810446365, + "grad_norm": 0.8422812819480896, + "learning_rate": 9.15701690289627e-06, + "loss": 0.9173, + "step": 6896 + }, + { + "epoch": 0.3796026198469921, + "grad_norm": 0.7930355072021484, + "learning_rate": 9.156776023164755e-06, + "loss": 0.9376, + "step": 6897 + }, + { + "epoch": 0.3796576586493478, + "grad_norm": 0.7877563238143921, + "learning_rate": 9.156535112192026e-06, + "loss": 0.8358, + "step": 6898 + }, + { + "epoch": 0.37971269745170344, + "grad_norm": 0.7712885141372681, + "learning_rate": 9.156294169979891e-06, + "loss": 0.8781, + "step": 6899 + }, + { + "epoch": 0.37976773625405913, + "grad_norm": 0.6953728199005127, + "learning_rate": 9.156053196530162e-06, + "loss": 0.7861, + "step": 6900 + }, + { + "epoch": 0.37982277505641476, + "grad_norm": 0.9581564664840698, + "learning_rate": 9.155812191844649e-06, + "loss": 0.8294, + "step": 6901 + }, + { + "epoch": 0.37987781385877045, + "grad_norm": 0.738571286201477, + "learning_rate": 9.155571155925166e-06, + "loss": 0.7998, + "step": 6902 + }, + { + "epoch": 0.3799328526611261, + "grad_norm": 0.7059765458106995, + "learning_rate": 9.155330088773519e-06, + "loss": 0.7877, + "step": 6903 + }, + { + "epoch": 0.3799878914634818, + "grad_norm": 0.8572642207145691, + "learning_rate": 9.155088990391527e-06, + "loss": 0.7333, + "step": 6904 + }, + { + "epoch": 0.3800429302658374, + "grad_norm": 0.7442637085914612, + "learning_rate": 9.154847860780996e-06, + "loss": 0.685, + "step": 6905 + }, + { + "epoch": 0.3800979690681931, + "grad_norm": 0.7787682414054871, + "learning_rate": 9.154606699943741e-06, + "loss": 0.7893, + "step": 6906 + }, + { + "epoch": 0.38015300787054873, + "grad_norm": 0.8973822593688965, + "learning_rate": 9.154365507881574e-06, + "loss": 0.8297, + "step": 6907 + }, + { + "epoch": 0.3802080466729044, + "grad_norm": 0.7759919166564941, + "learning_rate": 9.154124284596311e-06, + "loss": 0.8257, + "step": 6908 + }, + { + "epoch": 0.38026308547526005, + "grad_norm": 0.8042850494384766, + "learning_rate": 9.153883030089759e-06, + "loss": 0.8024, + "step": 6909 + }, + { + "epoch": 0.38031812427761574, + "grad_norm": 0.8285790085792542, + "learning_rate": 9.153641744363733e-06, + "loss": 0.7824, + "step": 6910 + }, + { + "epoch": 0.3803731630799714, + "grad_norm": 0.7225445508956909, + "learning_rate": 9.15340042742005e-06, + "loss": 0.8065, + "step": 6911 + }, + { + "epoch": 0.38042820188232707, + "grad_norm": 0.7685298919677734, + "learning_rate": 9.15315907926052e-06, + "loss": 0.8151, + "step": 6912 + }, + { + "epoch": 0.3804832406846827, + "grad_norm": 0.9005589485168457, + "learning_rate": 9.152917699886958e-06, + "loss": 0.8413, + "step": 6913 + }, + { + "epoch": 0.3805382794870384, + "grad_norm": 0.8715279698371887, + "learning_rate": 9.152676289301178e-06, + "loss": 0.7233, + "step": 6914 + }, + { + "epoch": 0.380593318289394, + "grad_norm": 0.8764133453369141, + "learning_rate": 9.152434847504996e-06, + "loss": 0.783, + "step": 6915 + }, + { + "epoch": 0.38064835709174966, + "grad_norm": 0.6847019195556641, + "learning_rate": 9.152193374500225e-06, + "loss": 0.7133, + "step": 6916 + }, + { + "epoch": 0.38070339589410535, + "grad_norm": 0.7562721371650696, + "learning_rate": 9.151951870288678e-06, + "loss": 0.8155, + "step": 6917 + }, + { + "epoch": 0.380758434696461, + "grad_norm": 0.6888439059257507, + "learning_rate": 9.151710334872173e-06, + "loss": 0.6395, + "step": 6918 + }, + { + "epoch": 0.38081347349881667, + "grad_norm": 1.0951511859893799, + "learning_rate": 9.151468768252525e-06, + "loss": 0.8936, + "step": 6919 + }, + { + "epoch": 0.3808685123011723, + "grad_norm": 0.7261115908622742, + "learning_rate": 9.151227170431549e-06, + "loss": 0.7864, + "step": 6920 + }, + { + "epoch": 0.380923551103528, + "grad_norm": 1.2851859331130981, + "learning_rate": 9.150985541411061e-06, + "loss": 0.9419, + "step": 6921 + }, + { + "epoch": 0.3809785899058836, + "grad_norm": 0.7621721625328064, + "learning_rate": 9.150743881192876e-06, + "loss": 0.7773, + "step": 6922 + }, + { + "epoch": 0.3810336287082393, + "grad_norm": 0.7605605721473694, + "learning_rate": 9.150502189778811e-06, + "loss": 0.8752, + "step": 6923 + }, + { + "epoch": 0.38108866751059495, + "grad_norm": 0.8422327041625977, + "learning_rate": 9.150260467170683e-06, + "loss": 0.8555, + "step": 6924 + }, + { + "epoch": 0.38114370631295064, + "grad_norm": 0.7227829098701477, + "learning_rate": 9.15001871337031e-06, + "loss": 0.7637, + "step": 6925 + }, + { + "epoch": 0.38119874511530627, + "grad_norm": 0.6568942666053772, + "learning_rate": 9.149776928379506e-06, + "loss": 0.6944, + "step": 6926 + }, + { + "epoch": 0.38125378391766196, + "grad_norm": 0.9317567944526672, + "learning_rate": 9.149535112200087e-06, + "loss": 0.8098, + "step": 6927 + }, + { + "epoch": 0.3813088227200176, + "grad_norm": 0.6374759674072266, + "learning_rate": 9.149293264833877e-06, + "loss": 0.6654, + "step": 6928 + }, + { + "epoch": 0.3813638615223733, + "grad_norm": 0.7276837825775146, + "learning_rate": 9.149051386282685e-06, + "loss": 0.7728, + "step": 6929 + }, + { + "epoch": 0.3814189003247289, + "grad_norm": 0.7573683261871338, + "learning_rate": 9.148809476548337e-06, + "loss": 0.7681, + "step": 6930 + }, + { + "epoch": 0.3814739391270846, + "grad_norm": 0.7535703778266907, + "learning_rate": 9.148567535632647e-06, + "loss": 0.8498, + "step": 6931 + }, + { + "epoch": 0.38152897792944024, + "grad_norm": 0.7510126233100891, + "learning_rate": 9.148325563537432e-06, + "loss": 0.7874, + "step": 6932 + }, + { + "epoch": 0.3815840167317959, + "grad_norm": 0.7809224724769592, + "learning_rate": 9.148083560264515e-06, + "loss": 0.7223, + "step": 6933 + }, + { + "epoch": 0.38163905553415156, + "grad_norm": 0.7433155179023743, + "learning_rate": 9.14784152581571e-06, + "loss": 0.7914, + "step": 6934 + }, + { + "epoch": 0.38169409433650725, + "grad_norm": 0.7142858505249023, + "learning_rate": 9.14759946019284e-06, + "loss": 0.781, + "step": 6935 + }, + { + "epoch": 0.3817491331388629, + "grad_norm": 0.7910202741622925, + "learning_rate": 9.147357363397721e-06, + "loss": 0.755, + "step": 6936 + }, + { + "epoch": 0.3818041719412186, + "grad_norm": 1.007727026939392, + "learning_rate": 9.147115235432176e-06, + "loss": 0.7809, + "step": 6937 + }, + { + "epoch": 0.3818592107435742, + "grad_norm": 0.7227005362510681, + "learning_rate": 9.146873076298024e-06, + "loss": 0.7276, + "step": 6938 + }, + { + "epoch": 0.3819142495459299, + "grad_norm": 0.6945967674255371, + "learning_rate": 9.146630885997081e-06, + "loss": 0.825, + "step": 6939 + }, + { + "epoch": 0.38196928834828553, + "grad_norm": 0.6719669103622437, + "learning_rate": 9.146388664531172e-06, + "loss": 0.6486, + "step": 6940 + }, + { + "epoch": 0.3820243271506412, + "grad_norm": 0.7528467178344727, + "learning_rate": 9.146146411902115e-06, + "loss": 0.8143, + "step": 6941 + }, + { + "epoch": 0.38207936595299685, + "grad_norm": 0.6835548877716064, + "learning_rate": 9.145904128111732e-06, + "loss": 0.7742, + "step": 6942 + }, + { + "epoch": 0.38213440475535254, + "grad_norm": 0.7829870581626892, + "learning_rate": 9.145661813161844e-06, + "loss": 0.8147, + "step": 6943 + }, + { + "epoch": 0.3821894435577082, + "grad_norm": 0.6833155155181885, + "learning_rate": 9.145419467054271e-06, + "loss": 0.7615, + "step": 6944 + }, + { + "epoch": 0.38224448236006386, + "grad_norm": 0.7577275037765503, + "learning_rate": 9.145177089790833e-06, + "loss": 0.8611, + "step": 6945 + }, + { + "epoch": 0.3822995211624195, + "grad_norm": 0.7102984189987183, + "learning_rate": 9.144934681373356e-06, + "loss": 0.8373, + "step": 6946 + }, + { + "epoch": 0.3823545599647752, + "grad_norm": 0.6906121373176575, + "learning_rate": 9.144692241803658e-06, + "loss": 0.8314, + "step": 6947 + }, + { + "epoch": 0.3824095987671308, + "grad_norm": 0.7790967226028442, + "learning_rate": 9.144449771083563e-06, + "loss": 0.8285, + "step": 6948 + }, + { + "epoch": 0.3824646375694865, + "grad_norm": 0.8420237898826599, + "learning_rate": 9.144207269214893e-06, + "loss": 0.8159, + "step": 6949 + }, + { + "epoch": 0.38251967637184214, + "grad_norm": 0.7944310307502747, + "learning_rate": 9.143964736199471e-06, + "loss": 0.7981, + "step": 6950 + }, + { + "epoch": 0.38257471517419783, + "grad_norm": 0.7610076069831848, + "learning_rate": 9.14372217203912e-06, + "loss": 0.8011, + "step": 6951 + }, + { + "epoch": 0.38262975397655347, + "grad_norm": 0.7183333039283752, + "learning_rate": 9.143479576735661e-06, + "loss": 0.7504, + "step": 6952 + }, + { + "epoch": 0.38268479277890916, + "grad_norm": 0.7363573312759399, + "learning_rate": 9.14323695029092e-06, + "loss": 0.7561, + "step": 6953 + }, + { + "epoch": 0.3827398315812648, + "grad_norm": 0.7330427765846252, + "learning_rate": 9.142994292706716e-06, + "loss": 0.754, + "step": 6954 + }, + { + "epoch": 0.3827948703836205, + "grad_norm": 0.8307509422302246, + "learning_rate": 9.142751603984879e-06, + "loss": 0.8059, + "step": 6955 + }, + { + "epoch": 0.3828499091859761, + "grad_norm": 0.7340347766876221, + "learning_rate": 9.142508884127228e-06, + "loss": 0.8636, + "step": 6956 + }, + { + "epoch": 0.3829049479883318, + "grad_norm": 0.7032678127288818, + "learning_rate": 9.14226613313559e-06, + "loss": 0.8237, + "step": 6957 + }, + { + "epoch": 0.38295998679068743, + "grad_norm": 0.769809365272522, + "learning_rate": 9.142023351011788e-06, + "loss": 0.7523, + "step": 6958 + }, + { + "epoch": 0.38301502559304307, + "grad_norm": 0.7446833252906799, + "learning_rate": 9.141780537757647e-06, + "loss": 0.8382, + "step": 6959 + }, + { + "epoch": 0.38307006439539876, + "grad_norm": 0.6926285028457642, + "learning_rate": 9.141537693374994e-06, + "loss": 0.7997, + "step": 6960 + }, + { + "epoch": 0.3831251031977544, + "grad_norm": 0.7303034067153931, + "learning_rate": 9.141294817865651e-06, + "loss": 0.794, + "step": 6961 + }, + { + "epoch": 0.3831801420001101, + "grad_norm": 0.7453297972679138, + "learning_rate": 9.141051911231445e-06, + "loss": 0.7031, + "step": 6962 + }, + { + "epoch": 0.3832351808024657, + "grad_norm": 0.8503912091255188, + "learning_rate": 9.140808973474201e-06, + "loss": 0.7855, + "step": 6963 + }, + { + "epoch": 0.3832902196048214, + "grad_norm": 0.7304036617279053, + "learning_rate": 9.140566004595746e-06, + "loss": 0.7062, + "step": 6964 + }, + { + "epoch": 0.38334525840717704, + "grad_norm": 0.7534968852996826, + "learning_rate": 9.140323004597904e-06, + "loss": 0.8138, + "step": 6965 + }, + { + "epoch": 0.3834002972095327, + "grad_norm": 0.8122013807296753, + "learning_rate": 9.140079973482503e-06, + "loss": 0.7769, + "step": 6966 + }, + { + "epoch": 0.38345533601188836, + "grad_norm": 0.7345744967460632, + "learning_rate": 9.13983691125137e-06, + "loss": 0.7588, + "step": 6967 + }, + { + "epoch": 0.38351037481424405, + "grad_norm": 0.7251620292663574, + "learning_rate": 9.13959381790633e-06, + "loss": 0.8027, + "step": 6968 + }, + { + "epoch": 0.3835654136165997, + "grad_norm": 0.7157594561576843, + "learning_rate": 9.139350693449212e-06, + "loss": 0.7233, + "step": 6969 + }, + { + "epoch": 0.38362045241895537, + "grad_norm": 0.8076621890068054, + "learning_rate": 9.139107537881842e-06, + "loss": 0.7256, + "step": 6970 + }, + { + "epoch": 0.383675491221311, + "grad_norm": 0.717182993888855, + "learning_rate": 9.138864351206047e-06, + "loss": 0.7003, + "step": 6971 + }, + { + "epoch": 0.3837305300236667, + "grad_norm": 0.7534194588661194, + "learning_rate": 9.138621133423656e-06, + "loss": 0.7315, + "step": 6972 + }, + { + "epoch": 0.3837855688260223, + "grad_norm": 0.6400160193443298, + "learning_rate": 9.138377884536494e-06, + "loss": 0.6814, + "step": 6973 + }, + { + "epoch": 0.383840607628378, + "grad_norm": 0.7319507002830505, + "learning_rate": 9.138134604546394e-06, + "loss": 0.7942, + "step": 6974 + }, + { + "epoch": 0.38389564643073365, + "grad_norm": 0.7109829783439636, + "learning_rate": 9.137891293455181e-06, + "loss": 0.7528, + "step": 6975 + }, + { + "epoch": 0.38395068523308934, + "grad_norm": 1.006724238395691, + "learning_rate": 9.137647951264685e-06, + "loss": 0.7652, + "step": 6976 + }, + { + "epoch": 0.384005724035445, + "grad_norm": 0.7080540060997009, + "learning_rate": 9.137404577976736e-06, + "loss": 0.7706, + "step": 6977 + }, + { + "epoch": 0.38406076283780066, + "grad_norm": 0.7551368474960327, + "learning_rate": 9.137161173593161e-06, + "loss": 0.8202, + "step": 6978 + }, + { + "epoch": 0.3841158016401563, + "grad_norm": 0.6624314785003662, + "learning_rate": 9.13691773811579e-06, + "loss": 0.7258, + "step": 6979 + }, + { + "epoch": 0.384170840442512, + "grad_norm": 0.9603848457336426, + "learning_rate": 9.136674271546451e-06, + "loss": 0.9415, + "step": 6980 + }, + { + "epoch": 0.3842258792448676, + "grad_norm": 0.6964829564094543, + "learning_rate": 9.136430773886977e-06, + "loss": 0.7604, + "step": 6981 + }, + { + "epoch": 0.3842809180472233, + "grad_norm": 0.6503588557243347, + "learning_rate": 9.136187245139197e-06, + "loss": 0.7141, + "step": 6982 + }, + { + "epoch": 0.38433595684957894, + "grad_norm": 0.9179829359054565, + "learning_rate": 9.13594368530494e-06, + "loss": 0.7619, + "step": 6983 + }, + { + "epoch": 0.38439099565193463, + "grad_norm": 0.7993278503417969, + "learning_rate": 9.135700094386038e-06, + "loss": 0.832, + "step": 6984 + }, + { + "epoch": 0.38444603445429026, + "grad_norm": 0.8136988282203674, + "learning_rate": 9.13545647238432e-06, + "loss": 0.8127, + "step": 6985 + }, + { + "epoch": 0.38450107325664595, + "grad_norm": 0.9918104410171509, + "learning_rate": 9.135212819301619e-06, + "loss": 0.836, + "step": 6986 + }, + { + "epoch": 0.3845561120590016, + "grad_norm": 0.7767511010169983, + "learning_rate": 9.134969135139765e-06, + "loss": 0.8391, + "step": 6987 + }, + { + "epoch": 0.3846111508613573, + "grad_norm": 0.6889285445213318, + "learning_rate": 9.134725419900589e-06, + "loss": 0.7639, + "step": 6988 + }, + { + "epoch": 0.3846661896637129, + "grad_norm": 1.803467035293579, + "learning_rate": 9.134481673585924e-06, + "loss": 0.7629, + "step": 6989 + }, + { + "epoch": 0.3847212284660686, + "grad_norm": 0.721581757068634, + "learning_rate": 9.134237896197603e-06, + "loss": 0.8194, + "step": 6990 + }, + { + "epoch": 0.38477626726842423, + "grad_norm": 0.8163189888000488, + "learning_rate": 9.133994087737456e-06, + "loss": 0.7789, + "step": 6991 + }, + { + "epoch": 0.3848313060707799, + "grad_norm": 0.7518420815467834, + "learning_rate": 9.133750248207315e-06, + "loss": 0.7529, + "step": 6992 + }, + { + "epoch": 0.38488634487313556, + "grad_norm": 0.7318000197410583, + "learning_rate": 9.133506377609015e-06, + "loss": 0.7829, + "step": 6993 + }, + { + "epoch": 0.38494138367549124, + "grad_norm": 0.7765058875083923, + "learning_rate": 9.133262475944386e-06, + "loss": 0.7902, + "step": 6994 + }, + { + "epoch": 0.3849964224778469, + "grad_norm": 0.845567524433136, + "learning_rate": 9.133018543215265e-06, + "loss": 0.8117, + "step": 6995 + }, + { + "epoch": 0.38505146128020257, + "grad_norm": 0.7081887125968933, + "learning_rate": 9.13277457942348e-06, + "loss": 0.8131, + "step": 6996 + }, + { + "epoch": 0.3851065000825582, + "grad_norm": 0.7447869777679443, + "learning_rate": 9.132530584570869e-06, + "loss": 0.7765, + "step": 6997 + }, + { + "epoch": 0.3851615388849139, + "grad_norm": 0.8554795384407043, + "learning_rate": 9.132286558659265e-06, + "loss": 0.8966, + "step": 6998 + }, + { + "epoch": 0.3852165776872695, + "grad_norm": 0.7117023468017578, + "learning_rate": 9.1320425016905e-06, + "loss": 0.7461, + "step": 6999 + }, + { + "epoch": 0.3852716164896252, + "grad_norm": 0.6965934038162231, + "learning_rate": 9.131798413666411e-06, + "loss": 0.6827, + "step": 7000 + }, + { + "epoch": 0.38532665529198085, + "grad_norm": 0.7449018359184265, + "learning_rate": 9.13155429458883e-06, + "loss": 0.7562, + "step": 7001 + }, + { + "epoch": 0.3853816940943365, + "grad_norm": 0.7764221429824829, + "learning_rate": 9.131310144459593e-06, + "loss": 0.7842, + "step": 7002 + }, + { + "epoch": 0.38543673289669217, + "grad_norm": 0.9788658618927002, + "learning_rate": 9.131065963280536e-06, + "loss": 0.7857, + "step": 7003 + }, + { + "epoch": 0.3854917716990478, + "grad_norm": 0.7900908589363098, + "learning_rate": 9.13082175105349e-06, + "loss": 0.8733, + "step": 7004 + }, + { + "epoch": 0.3855468105014035, + "grad_norm": 0.814822793006897, + "learning_rate": 9.130577507780298e-06, + "loss": 0.8032, + "step": 7005 + }, + { + "epoch": 0.3856018493037591, + "grad_norm": 1.0648475885391235, + "learning_rate": 9.130333233462789e-06, + "loss": 0.8078, + "step": 7006 + }, + { + "epoch": 0.3856568881061148, + "grad_norm": 0.7359917163848877, + "learning_rate": 9.130088928102799e-06, + "loss": 0.6491, + "step": 7007 + }, + { + "epoch": 0.38571192690847045, + "grad_norm": 0.7321771383285522, + "learning_rate": 9.129844591702169e-06, + "loss": 0.7663, + "step": 7008 + }, + { + "epoch": 0.38576696571082614, + "grad_norm": 0.6937146186828613, + "learning_rate": 9.129600224262732e-06, + "loss": 0.7835, + "step": 7009 + }, + { + "epoch": 0.38582200451318177, + "grad_norm": 0.7330107688903809, + "learning_rate": 9.129355825786323e-06, + "loss": 0.7626, + "step": 7010 + }, + { + "epoch": 0.38587704331553746, + "grad_norm": 0.7021715044975281, + "learning_rate": 9.129111396274783e-06, + "loss": 0.7115, + "step": 7011 + }, + { + "epoch": 0.3859320821178931, + "grad_norm": 0.6599563360214233, + "learning_rate": 9.128866935729947e-06, + "loss": 0.6554, + "step": 7012 + }, + { + "epoch": 0.3859871209202488, + "grad_norm": 0.7323513031005859, + "learning_rate": 9.128622444153652e-06, + "loss": 0.7392, + "step": 7013 + }, + { + "epoch": 0.3860421597226044, + "grad_norm": 0.681888222694397, + "learning_rate": 9.128377921547736e-06, + "loss": 0.7474, + "step": 7014 + }, + { + "epoch": 0.3860971985249601, + "grad_norm": 0.8454889059066772, + "learning_rate": 9.128133367914036e-06, + "loss": 0.8355, + "step": 7015 + }, + { + "epoch": 0.38615223732731574, + "grad_norm": 0.7514123916625977, + "learning_rate": 9.12788878325439e-06, + "loss": 0.7683, + "step": 7016 + }, + { + "epoch": 0.38620727612967143, + "grad_norm": 0.7317092418670654, + "learning_rate": 9.12764416757064e-06, + "loss": 0.7201, + "step": 7017 + }, + { + "epoch": 0.38626231493202706, + "grad_norm": 0.7626729011535645, + "learning_rate": 9.127399520864619e-06, + "loss": 0.7701, + "step": 7018 + }, + { + "epoch": 0.38631735373438275, + "grad_norm": 0.9790363311767578, + "learning_rate": 9.127154843138168e-06, + "loss": 0.8034, + "step": 7019 + }, + { + "epoch": 0.3863723925367384, + "grad_norm": 0.663593590259552, + "learning_rate": 9.126910134393125e-06, + "loss": 0.661, + "step": 7020 + }, + { + "epoch": 0.3864274313390941, + "grad_norm": 0.6599924564361572, + "learning_rate": 9.126665394631332e-06, + "loss": 0.7395, + "step": 7021 + }, + { + "epoch": 0.3864824701414497, + "grad_norm": 0.8493411540985107, + "learning_rate": 9.126420623854625e-06, + "loss": 0.8008, + "step": 7022 + }, + { + "epoch": 0.3865375089438054, + "grad_norm": 0.7587194442749023, + "learning_rate": 9.126175822064846e-06, + "loss": 0.7533, + "step": 7023 + }, + { + "epoch": 0.38659254774616103, + "grad_norm": 0.773764431476593, + "learning_rate": 9.125930989263835e-06, + "loss": 0.75, + "step": 7024 + }, + { + "epoch": 0.3866475865485167, + "grad_norm": 0.7126749753952026, + "learning_rate": 9.12568612545343e-06, + "loss": 0.7794, + "step": 7025 + }, + { + "epoch": 0.38670262535087235, + "grad_norm": 0.7404584884643555, + "learning_rate": 9.125441230635472e-06, + "loss": 0.7264, + "step": 7026 + }, + { + "epoch": 0.38675766415322804, + "grad_norm": 0.8057644367218018, + "learning_rate": 9.125196304811804e-06, + "loss": 0.8058, + "step": 7027 + }, + { + "epoch": 0.3868127029555837, + "grad_norm": 0.9586995840072632, + "learning_rate": 9.124951347984263e-06, + "loss": 0.7659, + "step": 7028 + }, + { + "epoch": 0.38686774175793937, + "grad_norm": 0.7567793726921082, + "learning_rate": 9.124706360154693e-06, + "loss": 0.8961, + "step": 7029 + }, + { + "epoch": 0.386922780560295, + "grad_norm": 0.8523182272911072, + "learning_rate": 9.124461341324934e-06, + "loss": 0.8815, + "step": 7030 + }, + { + "epoch": 0.3869778193626507, + "grad_norm": 0.7466379404067993, + "learning_rate": 9.124216291496826e-06, + "loss": 0.7817, + "step": 7031 + }, + { + "epoch": 0.3870328581650063, + "grad_norm": 0.6721325516700745, + "learning_rate": 9.123971210672214e-06, + "loss": 0.7637, + "step": 7032 + }, + { + "epoch": 0.387087896967362, + "grad_norm": 0.6620928049087524, + "learning_rate": 9.123726098852936e-06, + "loss": 0.6956, + "step": 7033 + }, + { + "epoch": 0.38714293576971764, + "grad_norm": 0.6784290671348572, + "learning_rate": 9.12348095604084e-06, + "loss": 0.7034, + "step": 7034 + }, + { + "epoch": 0.38719797457207333, + "grad_norm": 0.7138848304748535, + "learning_rate": 9.123235782237763e-06, + "loss": 0.6037, + "step": 7035 + }, + { + "epoch": 0.38725301337442897, + "grad_norm": 0.8473613858222961, + "learning_rate": 9.122990577445548e-06, + "loss": 0.8157, + "step": 7036 + }, + { + "epoch": 0.38730805217678466, + "grad_norm": 0.835381031036377, + "learning_rate": 9.122745341666041e-06, + "loss": 0.8736, + "step": 7037 + }, + { + "epoch": 0.3873630909791403, + "grad_norm": 0.8823271989822388, + "learning_rate": 9.122500074901083e-06, + "loss": 0.7448, + "step": 7038 + }, + { + "epoch": 0.387418129781496, + "grad_norm": 0.6494244933128357, + "learning_rate": 9.122254777152519e-06, + "loss": 0.7423, + "step": 7039 + }, + { + "epoch": 0.3874731685838516, + "grad_norm": 0.7232181429862976, + "learning_rate": 9.122009448422191e-06, + "loss": 0.8489, + "step": 7040 + }, + { + "epoch": 0.3875282073862073, + "grad_norm": 0.7357699275016785, + "learning_rate": 9.121764088711945e-06, + "loss": 0.8799, + "step": 7041 + }, + { + "epoch": 0.38758324618856294, + "grad_norm": 0.7638574838638306, + "learning_rate": 9.121518698023621e-06, + "loss": 0.8539, + "step": 7042 + }, + { + "epoch": 0.3876382849909186, + "grad_norm": 0.7407062649726868, + "learning_rate": 9.121273276359068e-06, + "loss": 0.7152, + "step": 7043 + }, + { + "epoch": 0.38769332379327426, + "grad_norm": 0.6945983171463013, + "learning_rate": 9.121027823720126e-06, + "loss": 0.8224, + "step": 7044 + }, + { + "epoch": 0.3877483625956299, + "grad_norm": 0.7163639068603516, + "learning_rate": 9.120782340108643e-06, + "loss": 0.808, + "step": 7045 + }, + { + "epoch": 0.3878034013979856, + "grad_norm": 0.7062035799026489, + "learning_rate": 9.120536825526463e-06, + "loss": 0.783, + "step": 7046 + }, + { + "epoch": 0.3878584402003412, + "grad_norm": 0.7459971308708191, + "learning_rate": 9.120291279975431e-06, + "loss": 0.8219, + "step": 7047 + }, + { + "epoch": 0.3879134790026969, + "grad_norm": 0.9016150236129761, + "learning_rate": 9.120045703457394e-06, + "loss": 0.8605, + "step": 7048 + }, + { + "epoch": 0.38796851780505254, + "grad_norm": 0.78440922498703, + "learning_rate": 9.119800095974193e-06, + "loss": 0.8424, + "step": 7049 + }, + { + "epoch": 0.3880235566074082, + "grad_norm": 0.751504123210907, + "learning_rate": 9.119554457527681e-06, + "loss": 0.701, + "step": 7050 + }, + { + "epoch": 0.38807859540976386, + "grad_norm": 0.7540284991264343, + "learning_rate": 9.119308788119698e-06, + "loss": 0.7912, + "step": 7051 + }, + { + "epoch": 0.38813363421211955, + "grad_norm": 0.7977007627487183, + "learning_rate": 9.119063087752094e-06, + "loss": 0.9297, + "step": 7052 + }, + { + "epoch": 0.3881886730144752, + "grad_norm": 0.6923508644104004, + "learning_rate": 9.118817356426715e-06, + "loss": 0.7458, + "step": 7053 + }, + { + "epoch": 0.38824371181683087, + "grad_norm": 0.7170272469520569, + "learning_rate": 9.118571594145406e-06, + "loss": 0.733, + "step": 7054 + }, + { + "epoch": 0.3882987506191865, + "grad_norm": 0.7547701001167297, + "learning_rate": 9.118325800910015e-06, + "loss": 0.7758, + "step": 7055 + }, + { + "epoch": 0.3883537894215422, + "grad_norm": 0.7921421527862549, + "learning_rate": 9.118079976722391e-06, + "loss": 0.8262, + "step": 7056 + }, + { + "epoch": 0.38840882822389783, + "grad_norm": 0.734470784664154, + "learning_rate": 9.117834121584379e-06, + "loss": 0.817, + "step": 7057 + }, + { + "epoch": 0.3884638670262535, + "grad_norm": 0.8106420040130615, + "learning_rate": 9.117588235497829e-06, + "loss": 0.8203, + "step": 7058 + }, + { + "epoch": 0.38851890582860915, + "grad_norm": 0.7355543375015259, + "learning_rate": 9.117342318464588e-06, + "loss": 0.8076, + "step": 7059 + }, + { + "epoch": 0.38857394463096484, + "grad_norm": 0.7665252685546875, + "learning_rate": 9.117096370486504e-06, + "loss": 0.7611, + "step": 7060 + }, + { + "epoch": 0.3886289834333205, + "grad_norm": 0.7968598008155823, + "learning_rate": 9.116850391565426e-06, + "loss": 0.6461, + "step": 7061 + }, + { + "epoch": 0.38868402223567616, + "grad_norm": 0.7187741994857788, + "learning_rate": 9.116604381703203e-06, + "loss": 0.7982, + "step": 7062 + }, + { + "epoch": 0.3887390610380318, + "grad_norm": 0.8566913604736328, + "learning_rate": 9.11635834090168e-06, + "loss": 0.9072, + "step": 7063 + }, + { + "epoch": 0.3887940998403875, + "grad_norm": 0.7120797038078308, + "learning_rate": 9.116112269162714e-06, + "loss": 0.7353, + "step": 7064 + }, + { + "epoch": 0.3888491386427431, + "grad_norm": 0.7230019569396973, + "learning_rate": 9.115866166488148e-06, + "loss": 0.7717, + "step": 7065 + }, + { + "epoch": 0.3889041774450988, + "grad_norm": 0.6650584936141968, + "learning_rate": 9.115620032879833e-06, + "loss": 0.7384, + "step": 7066 + }, + { + "epoch": 0.38895921624745444, + "grad_norm": 0.970750629901886, + "learning_rate": 9.115373868339621e-06, + "loss": 0.8478, + "step": 7067 + }, + { + "epoch": 0.38901425504981013, + "grad_norm": 0.7066280245780945, + "learning_rate": 9.115127672869359e-06, + "loss": 0.7638, + "step": 7068 + }, + { + "epoch": 0.38906929385216577, + "grad_norm": 0.6952232718467712, + "learning_rate": 9.1148814464709e-06, + "loss": 0.7869, + "step": 7069 + }, + { + "epoch": 0.38912433265452145, + "grad_norm": 0.804489254951477, + "learning_rate": 9.114635189146094e-06, + "loss": 0.7905, + "step": 7070 + }, + { + "epoch": 0.3891793714568771, + "grad_norm": 0.6988457441329956, + "learning_rate": 9.114388900896791e-06, + "loss": 0.7107, + "step": 7071 + }, + { + "epoch": 0.3892344102592328, + "grad_norm": 0.6379980444908142, + "learning_rate": 9.114142581724842e-06, + "loss": 0.733, + "step": 7072 + }, + { + "epoch": 0.3892894490615884, + "grad_norm": 0.7238649129867554, + "learning_rate": 9.113896231632098e-06, + "loss": 0.8252, + "step": 7073 + }, + { + "epoch": 0.3893444878639441, + "grad_norm": 0.7168585062026978, + "learning_rate": 9.113649850620412e-06, + "loss": 0.6459, + "step": 7074 + }, + { + "epoch": 0.38939952666629973, + "grad_norm": 0.7315915822982788, + "learning_rate": 9.113403438691634e-06, + "loss": 0.7557, + "step": 7075 + }, + { + "epoch": 0.3894545654686554, + "grad_norm": 0.7438754439353943, + "learning_rate": 9.11315699584762e-06, + "loss": 0.7938, + "step": 7076 + }, + { + "epoch": 0.38950960427101106, + "grad_norm": 0.7497848272323608, + "learning_rate": 9.112910522090215e-06, + "loss": 0.8232, + "step": 7077 + }, + { + "epoch": 0.38956464307336675, + "grad_norm": 0.8072896003723145, + "learning_rate": 9.112664017421277e-06, + "loss": 0.7974, + "step": 7078 + }, + { + "epoch": 0.3896196818757224, + "grad_norm": 0.7255920767784119, + "learning_rate": 9.112417481842657e-06, + "loss": 0.7658, + "step": 7079 + }, + { + "epoch": 0.38967472067807807, + "grad_norm": 0.6263132095336914, + "learning_rate": 9.112170915356209e-06, + "loss": 0.7188, + "step": 7080 + }, + { + "epoch": 0.3897297594804337, + "grad_norm": 0.6817660927772522, + "learning_rate": 9.111924317963785e-06, + "loss": 0.7406, + "step": 7081 + }, + { + "epoch": 0.3897847982827894, + "grad_norm": 0.7829134464263916, + "learning_rate": 9.111677689667238e-06, + "loss": 0.8406, + "step": 7082 + }, + { + "epoch": 0.389839837085145, + "grad_norm": 0.7122843861579895, + "learning_rate": 9.111431030468421e-06, + "loss": 0.7722, + "step": 7083 + }, + { + "epoch": 0.3898948758875007, + "grad_norm": 0.7041764259338379, + "learning_rate": 9.11118434036919e-06, + "loss": 0.8307, + "step": 7084 + }, + { + "epoch": 0.38994991468985635, + "grad_norm": 0.7582009434700012, + "learning_rate": 9.110937619371398e-06, + "loss": 0.7461, + "step": 7085 + }, + { + "epoch": 0.39000495349221204, + "grad_norm": 0.7156100273132324, + "learning_rate": 9.110690867476899e-06, + "loss": 0.7294, + "step": 7086 + }, + { + "epoch": 0.39005999229456767, + "grad_norm": 0.79449063539505, + "learning_rate": 9.110444084687549e-06, + "loss": 0.8652, + "step": 7087 + }, + { + "epoch": 0.3901150310969233, + "grad_norm": 0.7692831754684448, + "learning_rate": 9.1101972710052e-06, + "loss": 0.7899, + "step": 7088 + }, + { + "epoch": 0.390170069899279, + "grad_norm": 0.7189639806747437, + "learning_rate": 9.109950426431708e-06, + "loss": 0.726, + "step": 7089 + }, + { + "epoch": 0.3902251087016346, + "grad_norm": 0.7491177916526794, + "learning_rate": 9.10970355096893e-06, + "loss": 0.8881, + "step": 7090 + }, + { + "epoch": 0.3902801475039903, + "grad_norm": 0.783027172088623, + "learning_rate": 9.10945664461872e-06, + "loss": 0.7728, + "step": 7091 + }, + { + "epoch": 0.39033518630634595, + "grad_norm": 1.0871556997299194, + "learning_rate": 9.109209707382934e-06, + "loss": 0.8059, + "step": 7092 + }, + { + "epoch": 0.39039022510870164, + "grad_norm": 0.7287113666534424, + "learning_rate": 9.108962739263429e-06, + "loss": 0.7896, + "step": 7093 + }, + { + "epoch": 0.39044526391105727, + "grad_norm": 0.7801700234413147, + "learning_rate": 9.108715740262058e-06, + "loss": 0.8012, + "step": 7094 + }, + { + "epoch": 0.39050030271341296, + "grad_norm": 0.846709132194519, + "learning_rate": 9.10846871038068e-06, + "loss": 0.8392, + "step": 7095 + }, + { + "epoch": 0.3905553415157686, + "grad_norm": 0.7408092617988586, + "learning_rate": 9.10822164962115e-06, + "loss": 0.8657, + "step": 7096 + }, + { + "epoch": 0.3906103803181243, + "grad_norm": 0.6748743057250977, + "learning_rate": 9.107974557985328e-06, + "loss": 0.7659, + "step": 7097 + }, + { + "epoch": 0.3906654191204799, + "grad_norm": 0.7512170672416687, + "learning_rate": 9.107727435475067e-06, + "loss": 0.7704, + "step": 7098 + }, + { + "epoch": 0.3907204579228356, + "grad_norm": 0.9039596319198608, + "learning_rate": 9.107480282092227e-06, + "loss": 0.8412, + "step": 7099 + }, + { + "epoch": 0.39077549672519124, + "grad_norm": 0.829785943031311, + "learning_rate": 9.107233097838663e-06, + "loss": 0.8229, + "step": 7100 + }, + { + "epoch": 0.39083053552754693, + "grad_norm": 0.7597842812538147, + "learning_rate": 9.106985882716238e-06, + "loss": 0.7798, + "step": 7101 + }, + { + "epoch": 0.39088557432990256, + "grad_norm": 0.7619945406913757, + "learning_rate": 9.106738636726802e-06, + "loss": 0.7504, + "step": 7102 + }, + { + "epoch": 0.39094061313225825, + "grad_norm": 0.6791092157363892, + "learning_rate": 9.10649135987222e-06, + "loss": 0.8167, + "step": 7103 + }, + { + "epoch": 0.3909956519346139, + "grad_norm": 0.7977412343025208, + "learning_rate": 9.10624405215435e-06, + "loss": 0.8252, + "step": 7104 + }, + { + "epoch": 0.3910506907369696, + "grad_norm": 0.7329283356666565, + "learning_rate": 9.105996713575047e-06, + "loss": 0.7084, + "step": 7105 + }, + { + "epoch": 0.3911057295393252, + "grad_norm": 0.7125133872032166, + "learning_rate": 9.105749344136172e-06, + "loss": 0.6672, + "step": 7106 + }, + { + "epoch": 0.3911607683416809, + "grad_norm": 0.6974679827690125, + "learning_rate": 9.105501943839583e-06, + "loss": 0.7354, + "step": 7107 + }, + { + "epoch": 0.39121580714403653, + "grad_norm": 0.7191265225410461, + "learning_rate": 9.10525451268714e-06, + "loss": 0.8133, + "step": 7108 + }, + { + "epoch": 0.3912708459463922, + "grad_norm": 0.7188206911087036, + "learning_rate": 9.105007050680704e-06, + "loss": 0.7947, + "step": 7109 + }, + { + "epoch": 0.39132588474874785, + "grad_norm": 0.9017364382743835, + "learning_rate": 9.104759557822135e-06, + "loss": 0.7848, + "step": 7110 + }, + { + "epoch": 0.39138092355110354, + "grad_norm": 0.7551164031028748, + "learning_rate": 9.104512034113292e-06, + "loss": 0.8266, + "step": 7111 + }, + { + "epoch": 0.3914359623534592, + "grad_norm": 0.7810001969337463, + "learning_rate": 9.104264479556033e-06, + "loss": 0.7731, + "step": 7112 + }, + { + "epoch": 0.39149100115581487, + "grad_norm": 0.787723183631897, + "learning_rate": 9.104016894152223e-06, + "loss": 0.8008, + "step": 7113 + }, + { + "epoch": 0.3915460399581705, + "grad_norm": 0.7303524017333984, + "learning_rate": 9.103769277903718e-06, + "loss": 0.826, + "step": 7114 + }, + { + "epoch": 0.3916010787605262, + "grad_norm": 0.707759439945221, + "learning_rate": 9.103521630812384e-06, + "loss": 0.6303, + "step": 7115 + }, + { + "epoch": 0.3916561175628818, + "grad_norm": 0.6929940581321716, + "learning_rate": 9.10327395288008e-06, + "loss": 0.733, + "step": 7116 + }, + { + "epoch": 0.3917111563652375, + "grad_norm": 0.7133205533027649, + "learning_rate": 9.103026244108667e-06, + "loss": 0.8421, + "step": 7117 + }, + { + "epoch": 0.39176619516759315, + "grad_norm": 1.2049434185028076, + "learning_rate": 9.102778504500005e-06, + "loss": 0.8618, + "step": 7118 + }, + { + "epoch": 0.39182123396994883, + "grad_norm": 0.7792720198631287, + "learning_rate": 9.10253073405596e-06, + "loss": 0.717, + "step": 7119 + }, + { + "epoch": 0.39187627277230447, + "grad_norm": 0.7234412431716919, + "learning_rate": 9.10228293277839e-06, + "loss": 0.7547, + "step": 7120 + }, + { + "epoch": 0.39193131157466016, + "grad_norm": 0.6845420002937317, + "learning_rate": 9.102035100669162e-06, + "loss": 0.7255, + "step": 7121 + }, + { + "epoch": 0.3919863503770158, + "grad_norm": 0.7446799874305725, + "learning_rate": 9.101787237730135e-06, + "loss": 0.7947, + "step": 7122 + }, + { + "epoch": 0.3920413891793715, + "grad_norm": 0.812924325466156, + "learning_rate": 9.101539343963176e-06, + "loss": 0.843, + "step": 7123 + }, + { + "epoch": 0.3920964279817271, + "grad_norm": 0.7373847365379333, + "learning_rate": 9.101291419370141e-06, + "loss": 0.7703, + "step": 7124 + }, + { + "epoch": 0.3921514667840828, + "grad_norm": 0.8305120468139648, + "learning_rate": 9.101043463952899e-06, + "loss": 0.8904, + "step": 7125 + }, + { + "epoch": 0.39220650558643844, + "grad_norm": 0.7263030409812927, + "learning_rate": 9.100795477713313e-06, + "loss": 0.8319, + "step": 7126 + }, + { + "epoch": 0.3922615443887941, + "grad_norm": 0.8358581066131592, + "learning_rate": 9.100547460653245e-06, + "loss": 0.8305, + "step": 7127 + }, + { + "epoch": 0.39231658319114976, + "grad_norm": 0.6608800292015076, + "learning_rate": 9.10029941277456e-06, + "loss": 0.7815, + "step": 7128 + }, + { + "epoch": 0.39237162199350545, + "grad_norm": 0.8590257167816162, + "learning_rate": 9.100051334079122e-06, + "loss": 0.8292, + "step": 7129 + }, + { + "epoch": 0.3924266607958611, + "grad_norm": 0.6241755485534668, + "learning_rate": 9.099803224568797e-06, + "loss": 0.6568, + "step": 7130 + }, + { + "epoch": 0.3924816995982167, + "grad_norm": 0.7298059463500977, + "learning_rate": 9.099555084245447e-06, + "loss": 0.727, + "step": 7131 + }, + { + "epoch": 0.3925367384005724, + "grad_norm": 0.7741055488586426, + "learning_rate": 9.099306913110939e-06, + "loss": 0.8481, + "step": 7132 + }, + { + "epoch": 0.39259177720292804, + "grad_norm": 0.9674170613288879, + "learning_rate": 9.099058711167137e-06, + "loss": 0.8507, + "step": 7133 + }, + { + "epoch": 0.3926468160052837, + "grad_norm": 0.7285159826278687, + "learning_rate": 9.098810478415907e-06, + "loss": 0.766, + "step": 7134 + }, + { + "epoch": 0.39270185480763936, + "grad_norm": 0.7215660810470581, + "learning_rate": 9.098562214859115e-06, + "loss": 0.794, + "step": 7135 + }, + { + "epoch": 0.39275689360999505, + "grad_norm": 0.764437735080719, + "learning_rate": 9.098313920498627e-06, + "loss": 0.8228, + "step": 7136 + }, + { + "epoch": 0.3928119324123507, + "grad_norm": 0.7222796082496643, + "learning_rate": 9.098065595336309e-06, + "loss": 0.8064, + "step": 7137 + }, + { + "epoch": 0.3928669712147064, + "grad_norm": 0.7044625878334045, + "learning_rate": 9.097817239374024e-06, + "loss": 0.8017, + "step": 7138 + }, + { + "epoch": 0.392922010017062, + "grad_norm": 0.7929979562759399, + "learning_rate": 9.097568852613646e-06, + "loss": 0.7527, + "step": 7139 + }, + { + "epoch": 0.3929770488194177, + "grad_norm": 0.7833721041679382, + "learning_rate": 9.097320435057033e-06, + "loss": 0.8335, + "step": 7140 + }, + { + "epoch": 0.39303208762177333, + "grad_norm": 0.8365728259086609, + "learning_rate": 9.097071986706058e-06, + "loss": 0.6439, + "step": 7141 + }, + { + "epoch": 0.393087126424129, + "grad_norm": 0.7547842264175415, + "learning_rate": 9.096823507562588e-06, + "loss": 0.8316, + "step": 7142 + }, + { + "epoch": 0.39314216522648465, + "grad_norm": 0.6598891019821167, + "learning_rate": 9.09657499762849e-06, + "loss": 0.6547, + "step": 7143 + }, + { + "epoch": 0.39319720402884034, + "grad_norm": 0.7913638949394226, + "learning_rate": 9.096326456905627e-06, + "loss": 0.7964, + "step": 7144 + }, + { + "epoch": 0.393252242831196, + "grad_norm": 0.6927905082702637, + "learning_rate": 9.096077885395874e-06, + "loss": 0.7836, + "step": 7145 + }, + { + "epoch": 0.39330728163355166, + "grad_norm": 0.7505417466163635, + "learning_rate": 9.095829283101094e-06, + "loss": 0.7707, + "step": 7146 + }, + { + "epoch": 0.3933623204359073, + "grad_norm": 0.8797083497047424, + "learning_rate": 9.095580650023158e-06, + "loss": 0.866, + "step": 7147 + }, + { + "epoch": 0.393417359238263, + "grad_norm": 0.7023645639419556, + "learning_rate": 9.095331986163935e-06, + "loss": 0.7013, + "step": 7148 + }, + { + "epoch": 0.3934723980406186, + "grad_norm": 0.697354793548584, + "learning_rate": 9.095083291525293e-06, + "loss": 0.7691, + "step": 7149 + }, + { + "epoch": 0.3935274368429743, + "grad_norm": 0.7211105227470398, + "learning_rate": 9.094834566109101e-06, + "loss": 0.6816, + "step": 7150 + }, + { + "epoch": 0.39358247564532994, + "grad_norm": 0.8593278527259827, + "learning_rate": 9.094585809917227e-06, + "loss": 0.915, + "step": 7151 + }, + { + "epoch": 0.39363751444768563, + "grad_norm": 0.7406070828437805, + "learning_rate": 9.094337022951545e-06, + "loss": 0.7825, + "step": 7152 + }, + { + "epoch": 0.39369255325004127, + "grad_norm": 0.7644504308700562, + "learning_rate": 9.09408820521392e-06, + "loss": 0.6796, + "step": 7153 + }, + { + "epoch": 0.39374759205239696, + "grad_norm": 0.8239033222198486, + "learning_rate": 9.093839356706224e-06, + "loss": 0.8396, + "step": 7154 + }, + { + "epoch": 0.3938026308547526, + "grad_norm": 0.6433991193771362, + "learning_rate": 9.093590477430327e-06, + "loss": 0.6941, + "step": 7155 + }, + { + "epoch": 0.3938576696571083, + "grad_norm": 0.6979972124099731, + "learning_rate": 9.093341567388102e-06, + "loss": 0.8142, + "step": 7156 + }, + { + "epoch": 0.3939127084594639, + "grad_norm": 0.7062026262283325, + "learning_rate": 9.093092626581414e-06, + "loss": 0.804, + "step": 7157 + }, + { + "epoch": 0.3939677472618196, + "grad_norm": 0.7070814967155457, + "learning_rate": 9.09284365501214e-06, + "loss": 0.765, + "step": 7158 + }, + { + "epoch": 0.39402278606417523, + "grad_norm": 0.8577908873558044, + "learning_rate": 9.092594652682147e-06, + "loss": 0.7074, + "step": 7159 + }, + { + "epoch": 0.3940778248665309, + "grad_norm": 0.7386197447776794, + "learning_rate": 9.092345619593309e-06, + "loss": 0.7629, + "step": 7160 + }, + { + "epoch": 0.39413286366888656, + "grad_norm": 0.8048123121261597, + "learning_rate": 9.092096555747496e-06, + "loss": 0.9225, + "step": 7161 + }, + { + "epoch": 0.39418790247124225, + "grad_norm": 0.7479888200759888, + "learning_rate": 9.091847461146582e-06, + "loss": 0.7284, + "step": 7162 + }, + { + "epoch": 0.3942429412735979, + "grad_norm": 0.7448734045028687, + "learning_rate": 9.091598335792438e-06, + "loss": 0.8694, + "step": 7163 + }, + { + "epoch": 0.39429798007595357, + "grad_norm": 0.7511261701583862, + "learning_rate": 9.091349179686935e-06, + "loss": 0.7822, + "step": 7164 + }, + { + "epoch": 0.3943530188783092, + "grad_norm": 0.7079344391822815, + "learning_rate": 9.091099992831946e-06, + "loss": 0.7238, + "step": 7165 + }, + { + "epoch": 0.3944080576806649, + "grad_norm": 0.7007229328155518, + "learning_rate": 9.090850775229347e-06, + "loss": 0.7269, + "step": 7166 + }, + { + "epoch": 0.3944630964830205, + "grad_norm": 0.769800066947937, + "learning_rate": 9.090601526881007e-06, + "loss": 0.7894, + "step": 7167 + }, + { + "epoch": 0.3945181352853762, + "grad_norm": 0.7211676836013794, + "learning_rate": 9.090352247788801e-06, + "loss": 0.7998, + "step": 7168 + }, + { + "epoch": 0.39457317408773185, + "grad_norm": 0.6784254312515259, + "learning_rate": 9.090102937954602e-06, + "loss": 0.7576, + "step": 7169 + }, + { + "epoch": 0.39462821289008754, + "grad_norm": 0.7696946859359741, + "learning_rate": 9.089853597380285e-06, + "loss": 0.8395, + "step": 7170 + }, + { + "epoch": 0.39468325169244317, + "grad_norm": 0.8720405697822571, + "learning_rate": 9.089604226067723e-06, + "loss": 0.8971, + "step": 7171 + }, + { + "epoch": 0.39473829049479886, + "grad_norm": 0.8457947373390198, + "learning_rate": 9.08935482401879e-06, + "loss": 0.7002, + "step": 7172 + }, + { + "epoch": 0.3947933292971545, + "grad_norm": 0.8181997537612915, + "learning_rate": 9.089105391235361e-06, + "loss": 0.8949, + "step": 7173 + }, + { + "epoch": 0.3948483680995101, + "grad_norm": 0.7717136144638062, + "learning_rate": 9.08885592771931e-06, + "loss": 0.829, + "step": 7174 + }, + { + "epoch": 0.3949034069018658, + "grad_norm": 0.6941567063331604, + "learning_rate": 9.088606433472514e-06, + "loss": 0.7592, + "step": 7175 + }, + { + "epoch": 0.39495844570422145, + "grad_norm": 0.7358599901199341, + "learning_rate": 9.088356908496845e-06, + "loss": 0.8657, + "step": 7176 + }, + { + "epoch": 0.39501348450657714, + "grad_norm": 1.1329307556152344, + "learning_rate": 9.08810735279418e-06, + "loss": 0.8307, + "step": 7177 + }, + { + "epoch": 0.3950685233089328, + "grad_norm": 0.7011532187461853, + "learning_rate": 9.087857766366395e-06, + "loss": 0.7487, + "step": 7178 + }, + { + "epoch": 0.39512356211128846, + "grad_norm": 0.7390572428703308, + "learning_rate": 9.087608149215366e-06, + "loss": 0.8244, + "step": 7179 + }, + { + "epoch": 0.3951786009136441, + "grad_norm": 0.6907634735107422, + "learning_rate": 9.087358501342966e-06, + "loss": 0.751, + "step": 7180 + }, + { + "epoch": 0.3952336397159998, + "grad_norm": 0.7467379570007324, + "learning_rate": 9.087108822751076e-06, + "loss": 0.8549, + "step": 7181 + }, + { + "epoch": 0.3952886785183554, + "grad_norm": 0.7493302226066589, + "learning_rate": 9.086859113441568e-06, + "loss": 0.8332, + "step": 7182 + }, + { + "epoch": 0.3953437173207111, + "grad_norm": 0.8364959955215454, + "learning_rate": 9.086609373416321e-06, + "loss": 0.7873, + "step": 7183 + }, + { + "epoch": 0.39539875612306674, + "grad_norm": 0.7330418825149536, + "learning_rate": 9.086359602677214e-06, + "loss": 0.7861, + "step": 7184 + }, + { + "epoch": 0.39545379492542243, + "grad_norm": 0.7296311855316162, + "learning_rate": 9.086109801226121e-06, + "loss": 0.7946, + "step": 7185 + }, + { + "epoch": 0.39550883372777806, + "grad_norm": 0.7884660363197327, + "learning_rate": 9.085859969064921e-06, + "loss": 0.7851, + "step": 7186 + }, + { + "epoch": 0.39556387253013375, + "grad_norm": 0.7311955690383911, + "learning_rate": 9.08561010619549e-06, + "loss": 0.7645, + "step": 7187 + }, + { + "epoch": 0.3956189113324894, + "grad_norm": 0.7447296977043152, + "learning_rate": 9.085360212619707e-06, + "loss": 0.7446, + "step": 7188 + }, + { + "epoch": 0.3956739501348451, + "grad_norm": 0.755628228187561, + "learning_rate": 9.08511028833945e-06, + "loss": 0.8107, + "step": 7189 + }, + { + "epoch": 0.3957289889372007, + "grad_norm": 0.6800833940505981, + "learning_rate": 9.0848603333566e-06, + "loss": 0.7471, + "step": 7190 + }, + { + "epoch": 0.3957840277395564, + "grad_norm": 0.6396341919898987, + "learning_rate": 9.08461034767303e-06, + "loss": 0.6797, + "step": 7191 + }, + { + "epoch": 0.39583906654191203, + "grad_norm": 0.729680597782135, + "learning_rate": 9.084360331290625e-06, + "loss": 0.7224, + "step": 7192 + }, + { + "epoch": 0.3958941053442677, + "grad_norm": 0.7630584239959717, + "learning_rate": 9.084110284211259e-06, + "loss": 0.8203, + "step": 7193 + }, + { + "epoch": 0.39594914414662336, + "grad_norm": 0.8799235820770264, + "learning_rate": 9.083860206436813e-06, + "loss": 0.8312, + "step": 7194 + }, + { + "epoch": 0.39600418294897904, + "grad_norm": 0.797081708908081, + "learning_rate": 9.083610097969169e-06, + "loss": 0.7561, + "step": 7195 + }, + { + "epoch": 0.3960592217513347, + "grad_norm": 0.7408759593963623, + "learning_rate": 9.083359958810203e-06, + "loss": 0.7854, + "step": 7196 + }, + { + "epoch": 0.39611426055369037, + "grad_norm": 0.7552130222320557, + "learning_rate": 9.083109788961797e-06, + "loss": 0.8145, + "step": 7197 + }, + { + "epoch": 0.396169299356046, + "grad_norm": 0.7147447466850281, + "learning_rate": 9.08285958842583e-06, + "loss": 0.792, + "step": 7198 + }, + { + "epoch": 0.3962243381584017, + "grad_norm": 0.7416259050369263, + "learning_rate": 9.082609357204183e-06, + "loss": 0.7801, + "step": 7199 + }, + { + "epoch": 0.3962793769607573, + "grad_norm": 0.7551109194755554, + "learning_rate": 9.082359095298741e-06, + "loss": 0.8841, + "step": 7200 + }, + { + "epoch": 0.396334415763113, + "grad_norm": 0.761472225189209, + "learning_rate": 9.082108802711377e-06, + "loss": 0.9061, + "step": 7201 + }, + { + "epoch": 0.39638945456546865, + "grad_norm": 0.7234126329421997, + "learning_rate": 9.081858479443977e-06, + "loss": 0.8308, + "step": 7202 + }, + { + "epoch": 0.39644449336782434, + "grad_norm": 0.7204816341400146, + "learning_rate": 9.08160812549842e-06, + "loss": 0.7481, + "step": 7203 + }, + { + "epoch": 0.39649953217017997, + "grad_norm": 0.7207956910133362, + "learning_rate": 9.081357740876591e-06, + "loss": 0.762, + "step": 7204 + }, + { + "epoch": 0.39655457097253566, + "grad_norm": 0.7967123985290527, + "learning_rate": 9.081107325580367e-06, + "loss": 0.7931, + "step": 7205 + }, + { + "epoch": 0.3966096097748913, + "grad_norm": 0.9839354753494263, + "learning_rate": 9.080856879611635e-06, + "loss": 0.8182, + "step": 7206 + }, + { + "epoch": 0.396664648577247, + "grad_norm": 0.8468357920646667, + "learning_rate": 9.080606402972274e-06, + "loss": 0.7056, + "step": 7207 + }, + { + "epoch": 0.3967196873796026, + "grad_norm": 0.6549574136734009, + "learning_rate": 9.080355895664169e-06, + "loss": 0.7604, + "step": 7208 + }, + { + "epoch": 0.3967747261819583, + "grad_norm": 0.7475417256355286, + "learning_rate": 9.080105357689201e-06, + "loss": 0.7107, + "step": 7209 + }, + { + "epoch": 0.39682976498431394, + "grad_norm": 0.7464179992675781, + "learning_rate": 9.079854789049251e-06, + "loss": 0.793, + "step": 7210 + }, + { + "epoch": 0.3968848037866696, + "grad_norm": 0.8332071900367737, + "learning_rate": 9.079604189746207e-06, + "loss": 0.8383, + "step": 7211 + }, + { + "epoch": 0.39693984258902526, + "grad_norm": 0.722055196762085, + "learning_rate": 9.07935355978195e-06, + "loss": 0.8569, + "step": 7212 + }, + { + "epoch": 0.39699488139138095, + "grad_norm": 0.7442018389701843, + "learning_rate": 9.079102899158363e-06, + "loss": 0.8165, + "step": 7213 + }, + { + "epoch": 0.3970499201937366, + "grad_norm": 0.6986141204833984, + "learning_rate": 9.07885220787733e-06, + "loss": 0.7562, + "step": 7214 + }, + { + "epoch": 0.39710495899609227, + "grad_norm": 0.7718464732170105, + "learning_rate": 9.078601485940736e-06, + "loss": 0.8529, + "step": 7215 + }, + { + "epoch": 0.3971599977984479, + "grad_norm": 0.7583653330802917, + "learning_rate": 9.078350733350464e-06, + "loss": 0.7855, + "step": 7216 + }, + { + "epoch": 0.39721503660080354, + "grad_norm": 0.7699223160743713, + "learning_rate": 9.078099950108401e-06, + "loss": 0.8061, + "step": 7217 + }, + { + "epoch": 0.39727007540315923, + "grad_norm": 0.7374141812324524, + "learning_rate": 9.07784913621643e-06, + "loss": 0.789, + "step": 7218 + }, + { + "epoch": 0.39732511420551486, + "grad_norm": 0.7446104884147644, + "learning_rate": 9.077598291676436e-06, + "loss": 0.8381, + "step": 7219 + }, + { + "epoch": 0.39738015300787055, + "grad_norm": 0.7017301917076111, + "learning_rate": 9.077347416490305e-06, + "loss": 0.7153, + "step": 7220 + }, + { + "epoch": 0.3974351918102262, + "grad_norm": 0.7676172852516174, + "learning_rate": 9.077096510659922e-06, + "loss": 0.8029, + "step": 7221 + }, + { + "epoch": 0.3974902306125819, + "grad_norm": 0.9340602159500122, + "learning_rate": 9.076845574187174e-06, + "loss": 0.7865, + "step": 7222 + }, + { + "epoch": 0.3975452694149375, + "grad_norm": 0.8634235262870789, + "learning_rate": 9.076594607073945e-06, + "loss": 0.7606, + "step": 7223 + }, + { + "epoch": 0.3976003082172932, + "grad_norm": 0.8967369198799133, + "learning_rate": 9.076343609322123e-06, + "loss": 0.7011, + "step": 7224 + }, + { + "epoch": 0.39765534701964883, + "grad_norm": 0.7269352078437805, + "learning_rate": 9.076092580933594e-06, + "loss": 0.8043, + "step": 7225 + }, + { + "epoch": 0.3977103858220045, + "grad_norm": 0.7550628781318665, + "learning_rate": 9.075841521910243e-06, + "loss": 0.7344, + "step": 7226 + }, + { + "epoch": 0.39776542462436015, + "grad_norm": 0.6973844766616821, + "learning_rate": 9.075590432253958e-06, + "loss": 0.6995, + "step": 7227 + }, + { + "epoch": 0.39782046342671584, + "grad_norm": 0.648560643196106, + "learning_rate": 9.075339311966627e-06, + "loss": 0.6997, + "step": 7228 + }, + { + "epoch": 0.3978755022290715, + "grad_norm": 0.8457548022270203, + "learning_rate": 9.075088161050134e-06, + "loss": 0.8548, + "step": 7229 + }, + { + "epoch": 0.39793054103142717, + "grad_norm": 0.7644637823104858, + "learning_rate": 9.074836979506373e-06, + "loss": 0.6966, + "step": 7230 + }, + { + "epoch": 0.3979855798337828, + "grad_norm": 0.7146210670471191, + "learning_rate": 9.074585767337227e-06, + "loss": 0.7673, + "step": 7231 + }, + { + "epoch": 0.3980406186361385, + "grad_norm": 0.8570694327354431, + "learning_rate": 9.074334524544585e-06, + "loss": 0.8233, + "step": 7232 + }, + { + "epoch": 0.3980956574384941, + "grad_norm": 0.7257633805274963, + "learning_rate": 9.074083251130334e-06, + "loss": 0.7464, + "step": 7233 + }, + { + "epoch": 0.3981506962408498, + "grad_norm": 0.9377032518386841, + "learning_rate": 9.073831947096365e-06, + "loss": 0.7814, + "step": 7234 + }, + { + "epoch": 0.39820573504320544, + "grad_norm": 0.8105629086494446, + "learning_rate": 9.073580612444566e-06, + "loss": 0.8069, + "step": 7235 + }, + { + "epoch": 0.39826077384556113, + "grad_norm": 0.7874456644058228, + "learning_rate": 9.073329247176824e-06, + "loss": 0.8414, + "step": 7236 + }, + { + "epoch": 0.39831581264791677, + "grad_norm": 0.6829617023468018, + "learning_rate": 9.07307785129503e-06, + "loss": 0.7633, + "step": 7237 + }, + { + "epoch": 0.39837085145027246, + "grad_norm": 0.6838501691818237, + "learning_rate": 9.072826424801075e-06, + "loss": 0.6972, + "step": 7238 + }, + { + "epoch": 0.3984258902526281, + "grad_norm": 0.7054216861724854, + "learning_rate": 9.072574967696845e-06, + "loss": 0.8049, + "step": 7239 + }, + { + "epoch": 0.3984809290549838, + "grad_norm": 0.9462615847587585, + "learning_rate": 9.072323479984232e-06, + "loss": 0.7988, + "step": 7240 + }, + { + "epoch": 0.3985359678573394, + "grad_norm": 0.7334465980529785, + "learning_rate": 9.072071961665128e-06, + "loss": 0.7538, + "step": 7241 + }, + { + "epoch": 0.3985910066596951, + "grad_norm": 0.7506609559059143, + "learning_rate": 9.071820412741418e-06, + "loss": 0.7991, + "step": 7242 + }, + { + "epoch": 0.39864604546205074, + "grad_norm": 0.6858688592910767, + "learning_rate": 9.071568833214998e-06, + "loss": 0.7258, + "step": 7243 + }, + { + "epoch": 0.3987010842644064, + "grad_norm": 0.8117396235466003, + "learning_rate": 9.071317223087754e-06, + "loss": 0.752, + "step": 7244 + }, + { + "epoch": 0.39875612306676206, + "grad_norm": 0.7772389054298401, + "learning_rate": 9.071065582361582e-06, + "loss": 0.7444, + "step": 7245 + }, + { + "epoch": 0.39881116186911775, + "grad_norm": 0.7221882939338684, + "learning_rate": 9.07081391103837e-06, + "loss": 0.8035, + "step": 7246 + }, + { + "epoch": 0.3988662006714734, + "grad_norm": 0.8113289475440979, + "learning_rate": 9.07056220912001e-06, + "loss": 0.7623, + "step": 7247 + }, + { + "epoch": 0.39892123947382907, + "grad_norm": 0.730823278427124, + "learning_rate": 9.070310476608395e-06, + "loss": 0.7872, + "step": 7248 + }, + { + "epoch": 0.3989762782761847, + "grad_norm": 0.7690893411636353, + "learning_rate": 9.070058713505415e-06, + "loss": 0.7402, + "step": 7249 + }, + { + "epoch": 0.3990313170785404, + "grad_norm": 0.6768597364425659, + "learning_rate": 9.069806919812963e-06, + "loss": 0.7283, + "step": 7250 + }, + { + "epoch": 0.399086355880896, + "grad_norm": 0.6938686370849609, + "learning_rate": 9.069555095532932e-06, + "loss": 0.7209, + "step": 7251 + }, + { + "epoch": 0.3991413946832517, + "grad_norm": 0.7162025570869446, + "learning_rate": 9.069303240667215e-06, + "loss": 0.7915, + "step": 7252 + }, + { + "epoch": 0.39919643348560735, + "grad_norm": 0.9170399308204651, + "learning_rate": 9.069051355217704e-06, + "loss": 0.8399, + "step": 7253 + }, + { + "epoch": 0.39925147228796304, + "grad_norm": 0.7080186009407043, + "learning_rate": 9.068799439186291e-06, + "loss": 0.8678, + "step": 7254 + }, + { + "epoch": 0.39930651109031867, + "grad_norm": 1.013613224029541, + "learning_rate": 9.068547492574872e-06, + "loss": 0.817, + "step": 7255 + }, + { + "epoch": 0.39936154989267436, + "grad_norm": 0.6911013722419739, + "learning_rate": 9.068295515385337e-06, + "loss": 0.7048, + "step": 7256 + }, + { + "epoch": 0.39941658869503, + "grad_norm": 0.748219907283783, + "learning_rate": 9.068043507619584e-06, + "loss": 0.8115, + "step": 7257 + }, + { + "epoch": 0.3994716274973857, + "grad_norm": 0.6763347387313843, + "learning_rate": 9.067791469279504e-06, + "loss": 0.763, + "step": 7258 + }, + { + "epoch": 0.3995266662997413, + "grad_norm": 0.7291030287742615, + "learning_rate": 9.067539400366993e-06, + "loss": 0.7319, + "step": 7259 + }, + { + "epoch": 0.39958170510209695, + "grad_norm": 0.6515628695487976, + "learning_rate": 9.067287300883945e-06, + "loss": 0.7903, + "step": 7260 + }, + { + "epoch": 0.39963674390445264, + "grad_norm": 0.7815985679626465, + "learning_rate": 9.067035170832253e-06, + "loss": 0.8241, + "step": 7261 + }, + { + "epoch": 0.3996917827068083, + "grad_norm": 0.6747417449951172, + "learning_rate": 9.066783010213812e-06, + "loss": 0.7544, + "step": 7262 + }, + { + "epoch": 0.39974682150916396, + "grad_norm": 0.6568340063095093, + "learning_rate": 9.066530819030522e-06, + "loss": 0.7754, + "step": 7263 + }, + { + "epoch": 0.3998018603115196, + "grad_norm": 0.6703339219093323, + "learning_rate": 9.066278597284273e-06, + "loss": 0.7581, + "step": 7264 + }, + { + "epoch": 0.3998568991138753, + "grad_norm": 0.7421279549598694, + "learning_rate": 9.066026344976962e-06, + "loss": 0.7974, + "step": 7265 + }, + { + "epoch": 0.3999119379162309, + "grad_norm": 0.7226015329360962, + "learning_rate": 9.065774062110486e-06, + "loss": 0.7777, + "step": 7266 + }, + { + "epoch": 0.3999669767185866, + "grad_norm": 0.7092894911766052, + "learning_rate": 9.06552174868674e-06, + "loss": 0.7885, + "step": 7267 + }, + { + "epoch": 0.40002201552094224, + "grad_norm": 0.837902307510376, + "learning_rate": 9.065269404707622e-06, + "loss": 0.7425, + "step": 7268 + }, + { + "epoch": 0.40007705432329793, + "grad_norm": 0.803811252117157, + "learning_rate": 9.065017030175027e-06, + "loss": 0.8418, + "step": 7269 + }, + { + "epoch": 0.40013209312565357, + "grad_norm": 0.8110278248786926, + "learning_rate": 9.064764625090854e-06, + "loss": 0.7724, + "step": 7270 + }, + { + "epoch": 0.40018713192800925, + "grad_norm": 0.7305173277854919, + "learning_rate": 9.064512189456995e-06, + "loss": 0.7465, + "step": 7271 + }, + { + "epoch": 0.4002421707303649, + "grad_norm": 0.7312467694282532, + "learning_rate": 9.06425972327535e-06, + "loss": 0.8406, + "step": 7272 + }, + { + "epoch": 0.4002972095327206, + "grad_norm": 0.7348741292953491, + "learning_rate": 9.064007226547819e-06, + "loss": 0.8103, + "step": 7273 + }, + { + "epoch": 0.4003522483350762, + "grad_norm": 0.6561787724494934, + "learning_rate": 9.063754699276297e-06, + "loss": 0.6634, + "step": 7274 + }, + { + "epoch": 0.4004072871374319, + "grad_norm": 0.7924866080284119, + "learning_rate": 9.063502141462682e-06, + "loss": 0.6592, + "step": 7275 + }, + { + "epoch": 0.40046232593978753, + "grad_norm": 0.6873973608016968, + "learning_rate": 9.063249553108873e-06, + "loss": 0.7912, + "step": 7276 + }, + { + "epoch": 0.4005173647421432, + "grad_norm": 0.6872708797454834, + "learning_rate": 9.062996934216768e-06, + "loss": 0.732, + "step": 7277 + }, + { + "epoch": 0.40057240354449886, + "grad_norm": 0.7381585836410522, + "learning_rate": 9.062744284788265e-06, + "loss": 0.84, + "step": 7278 + }, + { + "epoch": 0.40062744234685455, + "grad_norm": 0.7885964512825012, + "learning_rate": 9.062491604825266e-06, + "loss": 0.8229, + "step": 7279 + }, + { + "epoch": 0.4006824811492102, + "grad_norm": 0.9066407680511475, + "learning_rate": 9.062238894329664e-06, + "loss": 0.7299, + "step": 7280 + }, + { + "epoch": 0.40073751995156587, + "grad_norm": 0.7694007754325867, + "learning_rate": 9.061986153303364e-06, + "loss": 0.8033, + "step": 7281 + }, + { + "epoch": 0.4007925587539215, + "grad_norm": 1.021766185760498, + "learning_rate": 9.061733381748263e-06, + "loss": 0.79, + "step": 7282 + }, + { + "epoch": 0.4008475975562772, + "grad_norm": 0.7776662111282349, + "learning_rate": 9.06148057966626e-06, + "loss": 0.8484, + "step": 7283 + }, + { + "epoch": 0.4009026363586328, + "grad_norm": 0.8646043539047241, + "learning_rate": 9.061227747059257e-06, + "loss": 0.8223, + "step": 7284 + }, + { + "epoch": 0.4009576751609885, + "grad_norm": 0.7347257733345032, + "learning_rate": 9.060974883929154e-06, + "loss": 0.8062, + "step": 7285 + }, + { + "epoch": 0.40101271396334415, + "grad_norm": 0.8233902454376221, + "learning_rate": 9.06072199027785e-06, + "loss": 0.8922, + "step": 7286 + }, + { + "epoch": 0.40106775276569984, + "grad_norm": 0.7099601030349731, + "learning_rate": 9.060469066107246e-06, + "loss": 0.7125, + "step": 7287 + }, + { + "epoch": 0.40112279156805547, + "grad_norm": 0.7549998164176941, + "learning_rate": 9.060216111419246e-06, + "loss": 0.7851, + "step": 7288 + }, + { + "epoch": 0.40117783037041116, + "grad_norm": 0.753516435623169, + "learning_rate": 9.059963126215748e-06, + "loss": 0.7831, + "step": 7289 + }, + { + "epoch": 0.4012328691727668, + "grad_norm": 0.6718429327011108, + "learning_rate": 9.059710110498651e-06, + "loss": 0.7305, + "step": 7290 + }, + { + "epoch": 0.4012879079751225, + "grad_norm": 0.6796036958694458, + "learning_rate": 9.05945706426986e-06, + "loss": 0.802, + "step": 7291 + }, + { + "epoch": 0.4013429467774781, + "grad_norm": 0.8046827912330627, + "learning_rate": 9.05920398753128e-06, + "loss": 0.7286, + "step": 7292 + }, + { + "epoch": 0.4013979855798338, + "grad_norm": 0.7518643140792847, + "learning_rate": 9.058950880284807e-06, + "loss": 0.7287, + "step": 7293 + }, + { + "epoch": 0.40145302438218944, + "grad_norm": 0.8386855125427246, + "learning_rate": 9.058697742532345e-06, + "loss": 0.8201, + "step": 7294 + }, + { + "epoch": 0.4015080631845451, + "grad_norm": 0.7780192494392395, + "learning_rate": 9.058444574275797e-06, + "loss": 0.7999, + "step": 7295 + }, + { + "epoch": 0.40156310198690076, + "grad_norm": 0.7715566754341125, + "learning_rate": 9.058191375517068e-06, + "loss": 0.732, + "step": 7296 + }, + { + "epoch": 0.40161814078925645, + "grad_norm": 0.9940280914306641, + "learning_rate": 9.057938146258057e-06, + "loss": 0.8247, + "step": 7297 + }, + { + "epoch": 0.4016731795916121, + "grad_norm": 0.7567923069000244, + "learning_rate": 9.05768488650067e-06, + "loss": 0.8254, + "step": 7298 + }, + { + "epoch": 0.4017282183939678, + "grad_norm": 0.7544496655464172, + "learning_rate": 9.05743159624681e-06, + "loss": 0.811, + "step": 7299 + }, + { + "epoch": 0.4017832571963234, + "grad_norm": 0.63368821144104, + "learning_rate": 9.05717827549838e-06, + "loss": 0.6498, + "step": 7300 + }, + { + "epoch": 0.4018382959986791, + "grad_norm": 0.7077621221542358, + "learning_rate": 9.056924924257284e-06, + "loss": 0.7401, + "step": 7301 + }, + { + "epoch": 0.40189333480103473, + "grad_norm": 0.6782366037368774, + "learning_rate": 9.056671542525426e-06, + "loss": 0.8013, + "step": 7302 + }, + { + "epoch": 0.40194837360339036, + "grad_norm": 0.6605678200721741, + "learning_rate": 9.056418130304709e-06, + "loss": 0.8038, + "step": 7303 + }, + { + "epoch": 0.40200341240574605, + "grad_norm": 0.8716840147972107, + "learning_rate": 9.056164687597041e-06, + "loss": 0.7652, + "step": 7304 + }, + { + "epoch": 0.4020584512081017, + "grad_norm": 0.8464542031288147, + "learning_rate": 9.055911214404325e-06, + "loss": 0.8663, + "step": 7305 + }, + { + "epoch": 0.4021134900104574, + "grad_norm": 0.7165409326553345, + "learning_rate": 9.055657710728466e-06, + "loss": 0.8028, + "step": 7306 + }, + { + "epoch": 0.402168528812813, + "grad_norm": 0.7313430309295654, + "learning_rate": 9.055404176571369e-06, + "loss": 0.7538, + "step": 7307 + }, + { + "epoch": 0.4022235676151687, + "grad_norm": 0.7757230401039124, + "learning_rate": 9.05515061193494e-06, + "loss": 0.9096, + "step": 7308 + }, + { + "epoch": 0.40227860641752433, + "grad_norm": 0.7178354859352112, + "learning_rate": 9.054897016821085e-06, + "loss": 0.7186, + "step": 7309 + }, + { + "epoch": 0.40233364521988, + "grad_norm": 0.8331356048583984, + "learning_rate": 9.054643391231708e-06, + "loss": 0.8724, + "step": 7310 + }, + { + "epoch": 0.40238868402223565, + "grad_norm": 0.7709757685661316, + "learning_rate": 9.054389735168717e-06, + "loss": 0.692, + "step": 7311 + }, + { + "epoch": 0.40244372282459134, + "grad_norm": 0.7393380999565125, + "learning_rate": 9.054136048634018e-06, + "loss": 0.7863, + "step": 7312 + }, + { + "epoch": 0.402498761626947, + "grad_norm": 0.7372385859489441, + "learning_rate": 9.053882331629518e-06, + "loss": 0.781, + "step": 7313 + }, + { + "epoch": 0.40255380042930267, + "grad_norm": 0.7076019048690796, + "learning_rate": 9.053628584157123e-06, + "loss": 0.7598, + "step": 7314 + }, + { + "epoch": 0.4026088392316583, + "grad_norm": 0.7465673685073853, + "learning_rate": 9.053374806218742e-06, + "loss": 0.7454, + "step": 7315 + }, + { + "epoch": 0.402663878034014, + "grad_norm": 0.7414120435714722, + "learning_rate": 9.05312099781628e-06, + "loss": 0.7135, + "step": 7316 + }, + { + "epoch": 0.4027189168363696, + "grad_norm": 0.7490748167037964, + "learning_rate": 9.052867158951646e-06, + "loss": 0.6833, + "step": 7317 + }, + { + "epoch": 0.4027739556387253, + "grad_norm": 0.8027878999710083, + "learning_rate": 9.052613289626747e-06, + "loss": 0.7466, + "step": 7318 + }, + { + "epoch": 0.40282899444108095, + "grad_norm": 0.6777862310409546, + "learning_rate": 9.052359389843493e-06, + "loss": 0.7446, + "step": 7319 + }, + { + "epoch": 0.40288403324343663, + "grad_norm": 0.9240381717681885, + "learning_rate": 9.052105459603787e-06, + "loss": 0.7801, + "step": 7320 + }, + { + "epoch": 0.40293907204579227, + "grad_norm": 0.9592602252960205, + "learning_rate": 9.051851498909543e-06, + "loss": 0.9648, + "step": 7321 + }, + { + "epoch": 0.40299411084814796, + "grad_norm": 0.8469638228416443, + "learning_rate": 9.051597507762669e-06, + "loss": 0.8303, + "step": 7322 + }, + { + "epoch": 0.4030491496505036, + "grad_norm": 0.6981443166732788, + "learning_rate": 9.05134348616507e-06, + "loss": 0.7245, + "step": 7323 + }, + { + "epoch": 0.4031041884528593, + "grad_norm": 0.7133469581604004, + "learning_rate": 9.05108943411866e-06, + "loss": 0.7763, + "step": 7324 + }, + { + "epoch": 0.4031592272552149, + "grad_norm": 0.7043703198432922, + "learning_rate": 9.050835351625344e-06, + "loss": 0.8247, + "step": 7325 + }, + { + "epoch": 0.4032142660575706, + "grad_norm": 0.6662501692771912, + "learning_rate": 9.050581238687036e-06, + "loss": 0.7669, + "step": 7326 + }, + { + "epoch": 0.40326930485992624, + "grad_norm": 0.6482356786727905, + "learning_rate": 9.050327095305643e-06, + "loss": 0.6477, + "step": 7327 + }, + { + "epoch": 0.4033243436622819, + "grad_norm": 0.7465450167655945, + "learning_rate": 9.050072921483076e-06, + "loss": 0.8053, + "step": 7328 + }, + { + "epoch": 0.40337938246463756, + "grad_norm": 0.6765472292900085, + "learning_rate": 9.049818717221245e-06, + "loss": 0.765, + "step": 7329 + }, + { + "epoch": 0.40343442126699325, + "grad_norm": 0.7098689675331116, + "learning_rate": 9.04956448252206e-06, + "loss": 0.8059, + "step": 7330 + }, + { + "epoch": 0.4034894600693489, + "grad_norm": 0.6773823499679565, + "learning_rate": 9.049310217387432e-06, + "loss": 0.6848, + "step": 7331 + }, + { + "epoch": 0.40354449887170457, + "grad_norm": 0.6884829998016357, + "learning_rate": 9.049055921819275e-06, + "loss": 0.696, + "step": 7332 + }, + { + "epoch": 0.4035995376740602, + "grad_norm": 0.662545919418335, + "learning_rate": 9.048801595819494e-06, + "loss": 0.8286, + "step": 7333 + }, + { + "epoch": 0.4036545764764159, + "grad_norm": 0.6863077878952026, + "learning_rate": 9.048547239390007e-06, + "loss": 0.7215, + "step": 7334 + }, + { + "epoch": 0.4037096152787715, + "grad_norm": 0.6982632875442505, + "learning_rate": 9.048292852532721e-06, + "loss": 0.7635, + "step": 7335 + }, + { + "epoch": 0.4037646540811272, + "grad_norm": 0.8512400984764099, + "learning_rate": 9.048038435249548e-06, + "loss": 0.6226, + "step": 7336 + }, + { + "epoch": 0.40381969288348285, + "grad_norm": 0.6952843070030212, + "learning_rate": 9.047783987542405e-06, + "loss": 0.8317, + "step": 7337 + }, + { + "epoch": 0.40387473168583854, + "grad_norm": 0.7802778482437134, + "learning_rate": 9.0475295094132e-06, + "loss": 0.8615, + "step": 7338 + }, + { + "epoch": 0.4039297704881942, + "grad_norm": 0.8783930540084839, + "learning_rate": 9.047275000863844e-06, + "loss": 0.743, + "step": 7339 + }, + { + "epoch": 0.40398480929054986, + "grad_norm": 0.7205806970596313, + "learning_rate": 9.047020461896256e-06, + "loss": 0.7953, + "step": 7340 + }, + { + "epoch": 0.4040398480929055, + "grad_norm": 0.8438451290130615, + "learning_rate": 9.046765892512344e-06, + "loss": 0.7613, + "step": 7341 + }, + { + "epoch": 0.4040948868952612, + "grad_norm": 0.7300973534584045, + "learning_rate": 9.046511292714021e-06, + "loss": 0.7856, + "step": 7342 + }, + { + "epoch": 0.4041499256976168, + "grad_norm": 0.8472041487693787, + "learning_rate": 9.046256662503206e-06, + "loss": 0.8526, + "step": 7343 + }, + { + "epoch": 0.4042049644999725, + "grad_norm": 0.789465606212616, + "learning_rate": 9.046002001881807e-06, + "loss": 0.7792, + "step": 7344 + }, + { + "epoch": 0.40426000330232814, + "grad_norm": 0.7720938920974731, + "learning_rate": 9.04574731085174e-06, + "loss": 0.8065, + "step": 7345 + }, + { + "epoch": 0.4043150421046838, + "grad_norm": 0.6968526840209961, + "learning_rate": 9.04549258941492e-06, + "loss": 0.8135, + "step": 7346 + }, + { + "epoch": 0.40437008090703946, + "grad_norm": 0.746865451335907, + "learning_rate": 9.04523783757326e-06, + "loss": 0.8216, + "step": 7347 + }, + { + "epoch": 0.4044251197093951, + "grad_norm": 0.6750560998916626, + "learning_rate": 9.044983055328676e-06, + "loss": 0.7883, + "step": 7348 + }, + { + "epoch": 0.4044801585117508, + "grad_norm": 0.6791195273399353, + "learning_rate": 9.044728242683081e-06, + "loss": 0.7721, + "step": 7349 + }, + { + "epoch": 0.4045351973141064, + "grad_norm": 0.7238358855247498, + "learning_rate": 9.044473399638392e-06, + "loss": 0.739, + "step": 7350 + }, + { + "epoch": 0.4045902361164621, + "grad_norm": 0.6793557405471802, + "learning_rate": 9.044218526196523e-06, + "loss": 0.7853, + "step": 7351 + }, + { + "epoch": 0.40464527491881774, + "grad_norm": 0.767564058303833, + "learning_rate": 9.043963622359392e-06, + "loss": 0.8158, + "step": 7352 + }, + { + "epoch": 0.40470031372117343, + "grad_norm": 0.6800708770751953, + "learning_rate": 9.043708688128909e-06, + "loss": 0.7493, + "step": 7353 + }, + { + "epoch": 0.40475535252352907, + "grad_norm": 0.75978022813797, + "learning_rate": 9.043453723506996e-06, + "loss": 0.7066, + "step": 7354 + }, + { + "epoch": 0.40481039132588476, + "grad_norm": 1.0194984674453735, + "learning_rate": 9.043198728495568e-06, + "loss": 0.6238, + "step": 7355 + }, + { + "epoch": 0.4048654301282404, + "grad_norm": 0.7102386355400085, + "learning_rate": 9.04294370309654e-06, + "loss": 0.75, + "step": 7356 + }, + { + "epoch": 0.4049204689305961, + "grad_norm": 0.8468191623687744, + "learning_rate": 9.04268864731183e-06, + "loss": 0.8095, + "step": 7357 + }, + { + "epoch": 0.4049755077329517, + "grad_norm": 0.7022871971130371, + "learning_rate": 9.042433561143353e-06, + "loss": 0.8394, + "step": 7358 + }, + { + "epoch": 0.4050305465353074, + "grad_norm": 1.1873482465744019, + "learning_rate": 9.042178444593028e-06, + "loss": 0.7863, + "step": 7359 + }, + { + "epoch": 0.40508558533766303, + "grad_norm": 0.7074940204620361, + "learning_rate": 9.041923297662772e-06, + "loss": 0.7067, + "step": 7360 + }, + { + "epoch": 0.4051406241400187, + "grad_norm": 0.7602211833000183, + "learning_rate": 9.041668120354503e-06, + "loss": 0.6594, + "step": 7361 + }, + { + "epoch": 0.40519566294237436, + "grad_norm": 0.7903324365615845, + "learning_rate": 9.041412912670138e-06, + "loss": 0.7978, + "step": 7362 + }, + { + "epoch": 0.40525070174473005, + "grad_norm": 0.7422891855239868, + "learning_rate": 9.041157674611595e-06, + "loss": 0.8162, + "step": 7363 + }, + { + "epoch": 0.4053057405470857, + "grad_norm": 0.7978767156600952, + "learning_rate": 9.040902406180791e-06, + "loss": 0.762, + "step": 7364 + }, + { + "epoch": 0.40536077934944137, + "grad_norm": 0.7719776630401611, + "learning_rate": 9.04064710737965e-06, + "loss": 0.8098, + "step": 7365 + }, + { + "epoch": 0.405415818151797, + "grad_norm": 0.8646591305732727, + "learning_rate": 9.040391778210083e-06, + "loss": 0.9372, + "step": 7366 + }, + { + "epoch": 0.4054708569541527, + "grad_norm": 0.6616937518119812, + "learning_rate": 9.040136418674015e-06, + "loss": 0.7424, + "step": 7367 + }, + { + "epoch": 0.4055258957565083, + "grad_norm": 0.7676553130149841, + "learning_rate": 9.039881028773363e-06, + "loss": 0.6327, + "step": 7368 + }, + { + "epoch": 0.405580934558864, + "grad_norm": 0.6838239431381226, + "learning_rate": 9.039625608510047e-06, + "loss": 0.7548, + "step": 7369 + }, + { + "epoch": 0.40563597336121965, + "grad_norm": 0.7476304769515991, + "learning_rate": 9.039370157885986e-06, + "loss": 0.7262, + "step": 7370 + }, + { + "epoch": 0.40569101216357534, + "grad_norm": 0.8985139727592468, + "learning_rate": 9.0391146769031e-06, + "loss": 0.7729, + "step": 7371 + }, + { + "epoch": 0.40574605096593097, + "grad_norm": 0.7840422987937927, + "learning_rate": 9.038859165563308e-06, + "loss": 0.7855, + "step": 7372 + }, + { + "epoch": 0.40580108976828666, + "grad_norm": 0.6777672171592712, + "learning_rate": 9.038603623868534e-06, + "loss": 0.7379, + "step": 7373 + }, + { + "epoch": 0.4058561285706423, + "grad_norm": 0.7226746678352356, + "learning_rate": 9.038348051820694e-06, + "loss": 0.7686, + "step": 7374 + }, + { + "epoch": 0.405911167372998, + "grad_norm": 0.7647444605827332, + "learning_rate": 9.038092449421713e-06, + "loss": 0.8859, + "step": 7375 + }, + { + "epoch": 0.4059662061753536, + "grad_norm": 0.6524979472160339, + "learning_rate": 9.037836816673508e-06, + "loss": 0.6982, + "step": 7376 + }, + { + "epoch": 0.4060212449777093, + "grad_norm": 0.7842861413955688, + "learning_rate": 9.037581153578004e-06, + "loss": 0.8099, + "step": 7377 + }, + { + "epoch": 0.40607628378006494, + "grad_norm": 0.6424387693405151, + "learning_rate": 9.03732546013712e-06, + "loss": 0.7387, + "step": 7378 + }, + { + "epoch": 0.40613132258242063, + "grad_norm": 0.8444356918334961, + "learning_rate": 9.037069736352779e-06, + "loss": 0.8813, + "step": 7379 + }, + { + "epoch": 0.40618636138477626, + "grad_norm": 0.6487529277801514, + "learning_rate": 9.036813982226904e-06, + "loss": 0.7609, + "step": 7380 + }, + { + "epoch": 0.40624140018713195, + "grad_norm": 0.7891185879707336, + "learning_rate": 9.036558197761413e-06, + "loss": 0.8589, + "step": 7381 + }, + { + "epoch": 0.4062964389894876, + "grad_norm": 0.7183120250701904, + "learning_rate": 9.036302382958233e-06, + "loss": 0.8429, + "step": 7382 + }, + { + "epoch": 0.4063514777918433, + "grad_norm": 0.6386578679084778, + "learning_rate": 9.036046537819283e-06, + "loss": 0.6955, + "step": 7383 + }, + { + "epoch": 0.4064065165941989, + "grad_norm": 0.7572369575500488, + "learning_rate": 9.035790662346488e-06, + "loss": 0.8018, + "step": 7384 + }, + { + "epoch": 0.4064615553965546, + "grad_norm": 0.7105650305747986, + "learning_rate": 9.035534756541771e-06, + "loss": 0.8527, + "step": 7385 + }, + { + "epoch": 0.40651659419891023, + "grad_norm": 0.7031856179237366, + "learning_rate": 9.035278820407056e-06, + "loss": 0.6991, + "step": 7386 + }, + { + "epoch": 0.4065716330012659, + "grad_norm": 0.7407381534576416, + "learning_rate": 9.035022853944266e-06, + "loss": 0.708, + "step": 7387 + }, + { + "epoch": 0.40662667180362155, + "grad_norm": 0.7078498601913452, + "learning_rate": 9.034766857155322e-06, + "loss": 0.7584, + "step": 7388 + }, + { + "epoch": 0.4066817106059772, + "grad_norm": 0.7643301486968994, + "learning_rate": 9.034510830042151e-06, + "loss": 0.7836, + "step": 7389 + }, + { + "epoch": 0.4067367494083329, + "grad_norm": 0.7165302038192749, + "learning_rate": 9.034254772606676e-06, + "loss": 0.7769, + "step": 7390 + }, + { + "epoch": 0.4067917882106885, + "grad_norm": 0.7442395091056824, + "learning_rate": 9.033998684850824e-06, + "loss": 0.7231, + "step": 7391 + }, + { + "epoch": 0.4068468270130442, + "grad_norm": 0.7425046563148499, + "learning_rate": 9.033742566776517e-06, + "loss": 0.7709, + "step": 7392 + }, + { + "epoch": 0.40690186581539983, + "grad_norm": 0.768419086933136, + "learning_rate": 9.03348641838568e-06, + "loss": 0.7768, + "step": 7393 + }, + { + "epoch": 0.4069569046177555, + "grad_norm": 0.6785634160041809, + "learning_rate": 9.03323023968024e-06, + "loss": 0.7468, + "step": 7394 + }, + { + "epoch": 0.40701194342011116, + "grad_norm": 0.7075444459915161, + "learning_rate": 9.03297403066212e-06, + "loss": 0.7757, + "step": 7395 + }, + { + "epoch": 0.40706698222246684, + "grad_norm": 0.7580223679542542, + "learning_rate": 9.032717791333247e-06, + "loss": 0.7311, + "step": 7396 + }, + { + "epoch": 0.4071220210248225, + "grad_norm": 0.8110041618347168, + "learning_rate": 9.032461521695546e-06, + "loss": 0.7923, + "step": 7397 + }, + { + "epoch": 0.40717705982717817, + "grad_norm": 0.7204881310462952, + "learning_rate": 9.032205221750945e-06, + "loss": 0.759, + "step": 7398 + }, + { + "epoch": 0.4072320986295338, + "grad_norm": 0.8392491340637207, + "learning_rate": 9.031948891501368e-06, + "loss": 0.8292, + "step": 7399 + }, + { + "epoch": 0.4072871374318895, + "grad_norm": 0.7134600281715393, + "learning_rate": 9.031692530948742e-06, + "loss": 0.7, + "step": 7400 + }, + { + "epoch": 0.4073421762342451, + "grad_norm": 0.6324336528778076, + "learning_rate": 9.031436140094995e-06, + "loss": 0.6964, + "step": 7401 + }, + { + "epoch": 0.4073972150366008, + "grad_norm": 0.7281947731971741, + "learning_rate": 9.031179718942052e-06, + "loss": 0.7567, + "step": 7402 + }, + { + "epoch": 0.40745225383895645, + "grad_norm": 0.8828619718551636, + "learning_rate": 9.030923267491842e-06, + "loss": 0.8139, + "step": 7403 + }, + { + "epoch": 0.40750729264131214, + "grad_norm": 0.7039986252784729, + "learning_rate": 9.030666785746292e-06, + "loss": 0.7339, + "step": 7404 + }, + { + "epoch": 0.40756233144366777, + "grad_norm": 0.7049984931945801, + "learning_rate": 9.030410273707331e-06, + "loss": 0.6842, + "step": 7405 + }, + { + "epoch": 0.40761737024602346, + "grad_norm": 0.7149737477302551, + "learning_rate": 9.030153731376883e-06, + "loss": 0.6837, + "step": 7406 + }, + { + "epoch": 0.4076724090483791, + "grad_norm": 1.0804089307785034, + "learning_rate": 9.029897158756878e-06, + "loss": 0.7726, + "step": 7407 + }, + { + "epoch": 0.4077274478507348, + "grad_norm": 0.8354909420013428, + "learning_rate": 9.029640555849244e-06, + "loss": 0.8058, + "step": 7408 + }, + { + "epoch": 0.4077824866530904, + "grad_norm": 0.7091527581214905, + "learning_rate": 9.029383922655914e-06, + "loss": 0.7636, + "step": 7409 + }, + { + "epoch": 0.4078375254554461, + "grad_norm": 0.6720988750457764, + "learning_rate": 9.029127259178809e-06, + "loss": 0.7179, + "step": 7410 + }, + { + "epoch": 0.40789256425780174, + "grad_norm": 0.685858964920044, + "learning_rate": 9.028870565419865e-06, + "loss": 0.7637, + "step": 7411 + }, + { + "epoch": 0.4079476030601574, + "grad_norm": 0.7505033016204834, + "learning_rate": 9.028613841381007e-06, + "loss": 0.7463, + "step": 7412 + }, + { + "epoch": 0.40800264186251306, + "grad_norm": 0.8801671862602234, + "learning_rate": 9.028357087064166e-06, + "loss": 0.8399, + "step": 7413 + }, + { + "epoch": 0.40805768066486875, + "grad_norm": 0.7441918849945068, + "learning_rate": 9.02810030247127e-06, + "loss": 0.7689, + "step": 7414 + }, + { + "epoch": 0.4081127194672244, + "grad_norm": 0.7410128712654114, + "learning_rate": 9.027843487604251e-06, + "loss": 0.8013, + "step": 7415 + }, + { + "epoch": 0.40816775826958007, + "grad_norm": 0.8075226545333862, + "learning_rate": 9.02758664246504e-06, + "loss": 0.7717, + "step": 7416 + }, + { + "epoch": 0.4082227970719357, + "grad_norm": 0.7985545992851257, + "learning_rate": 9.027329767055566e-06, + "loss": 0.8459, + "step": 7417 + }, + { + "epoch": 0.4082778358742914, + "grad_norm": 0.7887235283851624, + "learning_rate": 9.027072861377757e-06, + "loss": 0.8201, + "step": 7418 + }, + { + "epoch": 0.40833287467664703, + "grad_norm": 0.7876266241073608, + "learning_rate": 9.02681592543355e-06, + "loss": 0.8205, + "step": 7419 + }, + { + "epoch": 0.4083879134790027, + "grad_norm": 0.758168637752533, + "learning_rate": 9.02655895922487e-06, + "loss": 0.6619, + "step": 7420 + }, + { + "epoch": 0.40844295228135835, + "grad_norm": 0.7279811501502991, + "learning_rate": 9.02630196275365e-06, + "loss": 0.7634, + "step": 7421 + }, + { + "epoch": 0.40849799108371404, + "grad_norm": 0.7540523409843445, + "learning_rate": 9.026044936021822e-06, + "loss": 0.7819, + "step": 7422 + }, + { + "epoch": 0.4085530298860697, + "grad_norm": 0.8091018795967102, + "learning_rate": 9.02578787903132e-06, + "loss": 0.7749, + "step": 7423 + }, + { + "epoch": 0.40860806868842536, + "grad_norm": 0.7625396847724915, + "learning_rate": 9.025530791784074e-06, + "loss": 0.7635, + "step": 7424 + }, + { + "epoch": 0.408663107490781, + "grad_norm": 0.7663947939872742, + "learning_rate": 9.025273674282015e-06, + "loss": 0.8281, + "step": 7425 + }, + { + "epoch": 0.4087181462931367, + "grad_norm": 0.6672662496566772, + "learning_rate": 9.025016526527077e-06, + "loss": 0.641, + "step": 7426 + }, + { + "epoch": 0.4087731850954923, + "grad_norm": 0.7649143934249878, + "learning_rate": 9.024759348521193e-06, + "loss": 0.7462, + "step": 7427 + }, + { + "epoch": 0.408828223897848, + "grad_norm": 0.7540067434310913, + "learning_rate": 9.024502140266293e-06, + "loss": 0.8756, + "step": 7428 + }, + { + "epoch": 0.40888326270020364, + "grad_norm": 0.721615731716156, + "learning_rate": 9.024244901764314e-06, + "loss": 0.8507, + "step": 7429 + }, + { + "epoch": 0.40893830150255933, + "grad_norm": 0.6949496269226074, + "learning_rate": 9.023987633017186e-06, + "loss": 0.7021, + "step": 7430 + }, + { + "epoch": 0.40899334030491497, + "grad_norm": 0.7108990550041199, + "learning_rate": 9.023730334026845e-06, + "loss": 0.807, + "step": 7431 + }, + { + "epoch": 0.4090483791072706, + "grad_norm": 0.7606124877929688, + "learning_rate": 9.023473004795225e-06, + "loss": 0.7769, + "step": 7432 + }, + { + "epoch": 0.4091034179096263, + "grad_norm": 0.7792031764984131, + "learning_rate": 9.023215645324256e-06, + "loss": 0.728, + "step": 7433 + }, + { + "epoch": 0.4091584567119819, + "grad_norm": 0.728884756565094, + "learning_rate": 9.022958255615877e-06, + "loss": 0.7831, + "step": 7434 + }, + { + "epoch": 0.4092134955143376, + "grad_norm": 0.8196625709533691, + "learning_rate": 9.022700835672022e-06, + "loss": 0.8265, + "step": 7435 + }, + { + "epoch": 0.40926853431669324, + "grad_norm": 0.762734055519104, + "learning_rate": 9.022443385494621e-06, + "loss": 0.8028, + "step": 7436 + }, + { + "epoch": 0.40932357311904893, + "grad_norm": 0.7259558439254761, + "learning_rate": 9.022185905085614e-06, + "loss": 0.789, + "step": 7437 + }, + { + "epoch": 0.40937861192140457, + "grad_norm": 0.7402371764183044, + "learning_rate": 9.021928394446936e-06, + "loss": 0.7667, + "step": 7438 + }, + { + "epoch": 0.40943365072376026, + "grad_norm": 0.8399797677993774, + "learning_rate": 9.021670853580519e-06, + "loss": 0.8451, + "step": 7439 + }, + { + "epoch": 0.4094886895261159, + "grad_norm": 0.6439585089683533, + "learning_rate": 9.0214132824883e-06, + "loss": 0.776, + "step": 7440 + }, + { + "epoch": 0.4095437283284716, + "grad_norm": 0.6956612467765808, + "learning_rate": 9.021155681172215e-06, + "loss": 0.6921, + "step": 7441 + }, + { + "epoch": 0.4095987671308272, + "grad_norm": 0.855413556098938, + "learning_rate": 9.020898049634203e-06, + "loss": 0.8552, + "step": 7442 + }, + { + "epoch": 0.4096538059331829, + "grad_norm": 0.6690535545349121, + "learning_rate": 9.020640387876194e-06, + "loss": 0.7552, + "step": 7443 + }, + { + "epoch": 0.40970884473553854, + "grad_norm": 0.6615462899208069, + "learning_rate": 9.020382695900131e-06, + "loss": 0.8216, + "step": 7444 + }, + { + "epoch": 0.4097638835378942, + "grad_norm": 0.6975858211517334, + "learning_rate": 9.020124973707947e-06, + "loss": 0.7453, + "step": 7445 + }, + { + "epoch": 0.40981892234024986, + "grad_norm": 0.6461964249610901, + "learning_rate": 9.019867221301579e-06, + "loss": 0.656, + "step": 7446 + }, + { + "epoch": 0.40987396114260555, + "grad_norm": 0.7221645712852478, + "learning_rate": 9.019609438682967e-06, + "loss": 0.661, + "step": 7447 + }, + { + "epoch": 0.4099289999449612, + "grad_norm": 0.6785755753517151, + "learning_rate": 9.019351625854044e-06, + "loss": 0.7294, + "step": 7448 + }, + { + "epoch": 0.40998403874731687, + "grad_norm": 0.7040538787841797, + "learning_rate": 9.019093782816751e-06, + "loss": 0.8546, + "step": 7449 + }, + { + "epoch": 0.4100390775496725, + "grad_norm": 0.737922191619873, + "learning_rate": 9.018835909573025e-06, + "loss": 0.8144, + "step": 7450 + }, + { + "epoch": 0.4100941163520282, + "grad_norm": 0.6705496311187744, + "learning_rate": 9.018578006124802e-06, + "loss": 0.6937, + "step": 7451 + }, + { + "epoch": 0.4101491551543838, + "grad_norm": 0.7347431182861328, + "learning_rate": 9.018320072474026e-06, + "loss": 0.7716, + "step": 7452 + }, + { + "epoch": 0.4102041939567395, + "grad_norm": 0.7023493647575378, + "learning_rate": 9.018062108622631e-06, + "loss": 0.7295, + "step": 7453 + }, + { + "epoch": 0.41025923275909515, + "grad_norm": 0.8017870187759399, + "learning_rate": 9.017804114572556e-06, + "loss": 0.7471, + "step": 7454 + }, + { + "epoch": 0.41031427156145084, + "grad_norm": 0.9171211123466492, + "learning_rate": 9.01754609032574e-06, + "loss": 0.8262, + "step": 7455 + }, + { + "epoch": 0.41036931036380647, + "grad_norm": 0.6682952046394348, + "learning_rate": 9.017288035884124e-06, + "loss": 0.7165, + "step": 7456 + }, + { + "epoch": 0.41042434916616216, + "grad_norm": 0.9339122772216797, + "learning_rate": 9.017029951249648e-06, + "loss": 0.8618, + "step": 7457 + }, + { + "epoch": 0.4104793879685178, + "grad_norm": 0.7063136696815491, + "learning_rate": 9.016771836424248e-06, + "loss": 0.8068, + "step": 7458 + }, + { + "epoch": 0.4105344267708735, + "grad_norm": 0.6717063784599304, + "learning_rate": 9.016513691409867e-06, + "loss": 0.738, + "step": 7459 + }, + { + "epoch": 0.4105894655732291, + "grad_norm": 0.6807749271392822, + "learning_rate": 9.016255516208443e-06, + "loss": 0.7842, + "step": 7460 + }, + { + "epoch": 0.4106445043755848, + "grad_norm": 0.6990453600883484, + "learning_rate": 9.01599731082192e-06, + "loss": 0.7726, + "step": 7461 + }, + { + "epoch": 0.41069954317794044, + "grad_norm": 0.6704931259155273, + "learning_rate": 9.015739075252234e-06, + "loss": 0.7006, + "step": 7462 + }, + { + "epoch": 0.41075458198029613, + "grad_norm": 0.7162300944328308, + "learning_rate": 9.01548080950133e-06, + "loss": 0.8462, + "step": 7463 + }, + { + "epoch": 0.41080962078265176, + "grad_norm": 0.6845411658287048, + "learning_rate": 9.015222513571144e-06, + "loss": 0.7466, + "step": 7464 + }, + { + "epoch": 0.41086465958500745, + "grad_norm": 0.7146134376525879, + "learning_rate": 9.014964187463623e-06, + "loss": 0.7594, + "step": 7465 + }, + { + "epoch": 0.4109196983873631, + "grad_norm": 0.7664906978607178, + "learning_rate": 9.014705831180706e-06, + "loss": 0.8376, + "step": 7466 + }, + { + "epoch": 0.4109747371897188, + "grad_norm": 0.7319341897964478, + "learning_rate": 9.014447444724332e-06, + "loss": 0.7748, + "step": 7467 + }, + { + "epoch": 0.4110297759920744, + "grad_norm": 0.7269605398178101, + "learning_rate": 9.014189028096448e-06, + "loss": 0.6941, + "step": 7468 + }, + { + "epoch": 0.4110848147944301, + "grad_norm": 0.72607421875, + "learning_rate": 9.013930581298993e-06, + "loss": 0.7174, + "step": 7469 + }, + { + "epoch": 0.41113985359678573, + "grad_norm": 0.7385421991348267, + "learning_rate": 9.01367210433391e-06, + "loss": 0.7761, + "step": 7470 + }, + { + "epoch": 0.4111948923991414, + "grad_norm": 0.8392042517662048, + "learning_rate": 9.013413597203144e-06, + "loss": 0.7417, + "step": 7471 + }, + { + "epoch": 0.41124993120149705, + "grad_norm": 0.7454584836959839, + "learning_rate": 9.013155059908634e-06, + "loss": 0.8976, + "step": 7472 + }, + { + "epoch": 0.41130497000385274, + "grad_norm": 0.7358037829399109, + "learning_rate": 9.012896492452325e-06, + "loss": 0.7706, + "step": 7473 + }, + { + "epoch": 0.4113600088062084, + "grad_norm": 0.7454121708869934, + "learning_rate": 9.01263789483616e-06, + "loss": 0.7425, + "step": 7474 + }, + { + "epoch": 0.411415047608564, + "grad_norm": 0.7842294573783875, + "learning_rate": 9.012379267062081e-06, + "loss": 0.7739, + "step": 7475 + }, + { + "epoch": 0.4114700864109197, + "grad_norm": 0.7181714773178101, + "learning_rate": 9.012120609132036e-06, + "loss": 0.8466, + "step": 7476 + }, + { + "epoch": 0.41152512521327533, + "grad_norm": 0.7239206433296204, + "learning_rate": 9.011861921047966e-06, + "loss": 0.7493, + "step": 7477 + }, + { + "epoch": 0.411580164015631, + "grad_norm": 0.6773414611816406, + "learning_rate": 9.011603202811816e-06, + "loss": 0.7433, + "step": 7478 + }, + { + "epoch": 0.41163520281798666, + "grad_norm": 0.7770900130271912, + "learning_rate": 9.011344454425527e-06, + "loss": 0.7488, + "step": 7479 + }, + { + "epoch": 0.41169024162034235, + "grad_norm": 0.7305957674980164, + "learning_rate": 9.011085675891051e-06, + "loss": 0.7989, + "step": 7480 + }, + { + "epoch": 0.411745280422698, + "grad_norm": 0.734603762626648, + "learning_rate": 9.010826867210327e-06, + "loss": 0.805, + "step": 7481 + }, + { + "epoch": 0.41180031922505367, + "grad_norm": 0.7438979148864746, + "learning_rate": 9.010568028385303e-06, + "loss": 0.8407, + "step": 7482 + }, + { + "epoch": 0.4118553580274093, + "grad_norm": 0.6718543767929077, + "learning_rate": 9.01030915941792e-06, + "loss": 0.7575, + "step": 7483 + }, + { + "epoch": 0.411910396829765, + "grad_norm": 0.8157614469528198, + "learning_rate": 9.01005026031013e-06, + "loss": 0.8231, + "step": 7484 + }, + { + "epoch": 0.4119654356321206, + "grad_norm": 0.8927714824676514, + "learning_rate": 9.009791331063874e-06, + "loss": 0.808, + "step": 7485 + }, + { + "epoch": 0.4120204744344763, + "grad_norm": 0.7604075074195862, + "learning_rate": 9.009532371681101e-06, + "loss": 0.7505, + "step": 7486 + }, + { + "epoch": 0.41207551323683195, + "grad_norm": 0.6861944794654846, + "learning_rate": 9.009273382163754e-06, + "loss": 0.719, + "step": 7487 + }, + { + "epoch": 0.41213055203918764, + "grad_norm": 0.7043709754943848, + "learning_rate": 9.009014362513784e-06, + "loss": 0.8193, + "step": 7488 + }, + { + "epoch": 0.41218559084154327, + "grad_norm": 0.7459648847579956, + "learning_rate": 9.008755312733136e-06, + "loss": 0.8617, + "step": 7489 + }, + { + "epoch": 0.41224062964389896, + "grad_norm": 0.7272594571113586, + "learning_rate": 9.008496232823754e-06, + "loss": 0.7255, + "step": 7490 + }, + { + "epoch": 0.4122956684462546, + "grad_norm": 0.7486668229103088, + "learning_rate": 9.008237122787586e-06, + "loss": 0.6479, + "step": 7491 + }, + { + "epoch": 0.4123507072486103, + "grad_norm": 0.8149027228355408, + "learning_rate": 9.007977982626582e-06, + "loss": 0.8052, + "step": 7492 + }, + { + "epoch": 0.4124057460509659, + "grad_norm": 0.7054859399795532, + "learning_rate": 9.00771881234269e-06, + "loss": 0.8215, + "step": 7493 + }, + { + "epoch": 0.4124607848533216, + "grad_norm": 0.6840499639511108, + "learning_rate": 9.007459611937854e-06, + "loss": 0.776, + "step": 7494 + }, + { + "epoch": 0.41251582365567724, + "grad_norm": 0.7340932488441467, + "learning_rate": 9.007200381414026e-06, + "loss": 0.713, + "step": 7495 + }, + { + "epoch": 0.4125708624580329, + "grad_norm": 0.8282599449157715, + "learning_rate": 9.00694112077315e-06, + "loss": 0.7037, + "step": 7496 + }, + { + "epoch": 0.41262590126038856, + "grad_norm": 0.849588930606842, + "learning_rate": 9.00668183001718e-06, + "loss": 0.7845, + "step": 7497 + }, + { + "epoch": 0.41268094006274425, + "grad_norm": 0.8330783843994141, + "learning_rate": 9.00642250914806e-06, + "loss": 0.9049, + "step": 7498 + }, + { + "epoch": 0.4127359788650999, + "grad_norm": 0.7020101547241211, + "learning_rate": 9.00616315816774e-06, + "loss": 0.8146, + "step": 7499 + }, + { + "epoch": 0.4127910176674556, + "grad_norm": 0.7632037997245789, + "learning_rate": 9.005903777078173e-06, + "loss": 0.6629, + "step": 7500 + }, + { + "epoch": 0.4128460564698112, + "grad_norm": 0.7286840081214905, + "learning_rate": 9.005644365881304e-06, + "loss": 0.7795, + "step": 7501 + }, + { + "epoch": 0.4129010952721669, + "grad_norm": 0.710451066493988, + "learning_rate": 9.005384924579084e-06, + "loss": 0.7615, + "step": 7502 + }, + { + "epoch": 0.41295613407452253, + "grad_norm": 0.7657510042190552, + "learning_rate": 9.005125453173463e-06, + "loss": 0.8938, + "step": 7503 + }, + { + "epoch": 0.4130111728768782, + "grad_norm": 0.6978467702865601, + "learning_rate": 9.004865951666392e-06, + "loss": 0.7464, + "step": 7504 + }, + { + "epoch": 0.41306621167923385, + "grad_norm": 0.7028319835662842, + "learning_rate": 9.00460642005982e-06, + "loss": 0.7899, + "step": 7505 + }, + { + "epoch": 0.41312125048158954, + "grad_norm": 0.923951268196106, + "learning_rate": 9.004346858355698e-06, + "loss": 0.8851, + "step": 7506 + }, + { + "epoch": 0.4131762892839452, + "grad_norm": 0.7293704748153687, + "learning_rate": 9.004087266555978e-06, + "loss": 0.7594, + "step": 7507 + }, + { + "epoch": 0.41323132808630086, + "grad_norm": 0.7458868622779846, + "learning_rate": 9.003827644662608e-06, + "loss": 0.7538, + "step": 7508 + }, + { + "epoch": 0.4132863668886565, + "grad_norm": 0.6764113306999207, + "learning_rate": 9.003567992677543e-06, + "loss": 0.7303, + "step": 7509 + }, + { + "epoch": 0.4133414056910122, + "grad_norm": 0.7827350497245789, + "learning_rate": 9.003308310602732e-06, + "loss": 0.7708, + "step": 7510 + }, + { + "epoch": 0.4133964444933678, + "grad_norm": 0.7683281302452087, + "learning_rate": 9.003048598440127e-06, + "loss": 0.7971, + "step": 7511 + }, + { + "epoch": 0.4134514832957235, + "grad_norm": 0.8793813586235046, + "learning_rate": 9.002788856191679e-06, + "loss": 0.7434, + "step": 7512 + }, + { + "epoch": 0.41350652209807914, + "grad_norm": 0.6598063111305237, + "learning_rate": 9.002529083859343e-06, + "loss": 0.7082, + "step": 7513 + }, + { + "epoch": 0.41356156090043483, + "grad_norm": 0.8239839673042297, + "learning_rate": 9.002269281445071e-06, + "loss": 0.8457, + "step": 7514 + }, + { + "epoch": 0.41361659970279047, + "grad_norm": 0.7433123588562012, + "learning_rate": 9.002009448950812e-06, + "loss": 0.7399, + "step": 7515 + }, + { + "epoch": 0.41367163850514616, + "grad_norm": 0.8310487866401672, + "learning_rate": 9.001749586378524e-06, + "loss": 0.7482, + "step": 7516 + }, + { + "epoch": 0.4137266773075018, + "grad_norm": 0.7170824408531189, + "learning_rate": 9.001489693730155e-06, + "loss": 0.7856, + "step": 7517 + }, + { + "epoch": 0.4137817161098574, + "grad_norm": 0.9063520431518555, + "learning_rate": 9.00122977100766e-06, + "loss": 0.8623, + "step": 7518 + }, + { + "epoch": 0.4138367549122131, + "grad_norm": 0.8753733038902283, + "learning_rate": 9.000969818212996e-06, + "loss": 0.7875, + "step": 7519 + }, + { + "epoch": 0.41389179371456875, + "grad_norm": 0.7013519406318665, + "learning_rate": 9.000709835348112e-06, + "loss": 0.724, + "step": 7520 + }, + { + "epoch": 0.41394683251692443, + "grad_norm": 0.7385973334312439, + "learning_rate": 9.000449822414963e-06, + "loss": 0.7286, + "step": 7521 + }, + { + "epoch": 0.41400187131928007, + "grad_norm": 0.7605431079864502, + "learning_rate": 9.000189779415505e-06, + "loss": 0.728, + "step": 7522 + }, + { + "epoch": 0.41405691012163576, + "grad_norm": 0.7631710767745972, + "learning_rate": 8.99992970635169e-06, + "loss": 0.8276, + "step": 7523 + }, + { + "epoch": 0.4141119489239914, + "grad_norm": 0.8066657185554504, + "learning_rate": 8.999669603225477e-06, + "loss": 0.8319, + "step": 7524 + }, + { + "epoch": 0.4141669877263471, + "grad_norm": 0.689407229423523, + "learning_rate": 8.999409470038815e-06, + "loss": 0.6675, + "step": 7525 + }, + { + "epoch": 0.4142220265287027, + "grad_norm": 0.7391255497932434, + "learning_rate": 8.999149306793664e-06, + "loss": 0.8228, + "step": 7526 + }, + { + "epoch": 0.4142770653310584, + "grad_norm": 0.7208844423294067, + "learning_rate": 8.998889113491977e-06, + "loss": 0.7689, + "step": 7527 + }, + { + "epoch": 0.41433210413341404, + "grad_norm": 0.8278803825378418, + "learning_rate": 8.99862889013571e-06, + "loss": 0.7964, + "step": 7528 + }, + { + "epoch": 0.4143871429357697, + "grad_norm": 0.7287253141403198, + "learning_rate": 8.998368636726817e-06, + "loss": 0.7689, + "step": 7529 + }, + { + "epoch": 0.41444218173812536, + "grad_norm": 0.7159145474433899, + "learning_rate": 8.998108353267257e-06, + "loss": 0.7537, + "step": 7530 + }, + { + "epoch": 0.41449722054048105, + "grad_norm": 0.7605739235877991, + "learning_rate": 8.997848039758985e-06, + "loss": 0.7327, + "step": 7531 + }, + { + "epoch": 0.4145522593428367, + "grad_norm": 0.7290406227111816, + "learning_rate": 8.997587696203958e-06, + "loss": 0.6804, + "step": 7532 + }, + { + "epoch": 0.41460729814519237, + "grad_norm": 0.7613189816474915, + "learning_rate": 8.997327322604131e-06, + "loss": 0.7465, + "step": 7533 + }, + { + "epoch": 0.414662336947548, + "grad_norm": 0.7796703577041626, + "learning_rate": 8.99706691896146e-06, + "loss": 0.7444, + "step": 7534 + }, + { + "epoch": 0.4147173757499037, + "grad_norm": 0.8758549094200134, + "learning_rate": 8.996806485277904e-06, + "loss": 0.8586, + "step": 7535 + }, + { + "epoch": 0.4147724145522593, + "grad_norm": 0.9599420428276062, + "learning_rate": 8.996546021555423e-06, + "loss": 0.7554, + "step": 7536 + }, + { + "epoch": 0.414827453354615, + "grad_norm": 0.8216326236724854, + "learning_rate": 8.996285527795972e-06, + "loss": 0.7995, + "step": 7537 + }, + { + "epoch": 0.41488249215697065, + "grad_norm": 0.6777452230453491, + "learning_rate": 8.996025004001507e-06, + "loss": 0.7809, + "step": 7538 + }, + { + "epoch": 0.41493753095932634, + "grad_norm": 0.7354100942611694, + "learning_rate": 8.995764450173989e-06, + "loss": 0.6548, + "step": 7539 + }, + { + "epoch": 0.414992569761682, + "grad_norm": 0.7548280358314514, + "learning_rate": 8.995503866315373e-06, + "loss": 0.8308, + "step": 7540 + }, + { + "epoch": 0.41504760856403766, + "grad_norm": 0.6891447901725769, + "learning_rate": 8.995243252427622e-06, + "loss": 0.8386, + "step": 7541 + }, + { + "epoch": 0.4151026473663933, + "grad_norm": 0.6848340034484863, + "learning_rate": 8.99498260851269e-06, + "loss": 0.7587, + "step": 7542 + }, + { + "epoch": 0.415157686168749, + "grad_norm": 0.7109090685844421, + "learning_rate": 8.994721934572538e-06, + "loss": 0.6847, + "step": 7543 + }, + { + "epoch": 0.4152127249711046, + "grad_norm": 0.6708144545555115, + "learning_rate": 8.994461230609128e-06, + "loss": 0.7266, + "step": 7544 + }, + { + "epoch": 0.4152677637734603, + "grad_norm": 0.6985414028167725, + "learning_rate": 8.994200496624415e-06, + "loss": 0.7696, + "step": 7545 + }, + { + "epoch": 0.41532280257581594, + "grad_norm": 0.6989198923110962, + "learning_rate": 8.993939732620359e-06, + "loss": 0.7894, + "step": 7546 + }, + { + "epoch": 0.41537784137817163, + "grad_norm": 0.6667589545249939, + "learning_rate": 8.993678938598921e-06, + "loss": 0.7417, + "step": 7547 + }, + { + "epoch": 0.41543288018052726, + "grad_norm": 1.0692487955093384, + "learning_rate": 8.993418114562064e-06, + "loss": 0.7147, + "step": 7548 + }, + { + "epoch": 0.41548791898288295, + "grad_norm": 0.6709207892417908, + "learning_rate": 8.993157260511742e-06, + "loss": 0.7694, + "step": 7549 + }, + { + "epoch": 0.4155429577852386, + "grad_norm": 0.6714604496955872, + "learning_rate": 8.992896376449923e-06, + "loss": 0.6969, + "step": 7550 + }, + { + "epoch": 0.4155979965875943, + "grad_norm": 0.8266897201538086, + "learning_rate": 8.99263546237856e-06, + "loss": 0.8392, + "step": 7551 + }, + { + "epoch": 0.4156530353899499, + "grad_norm": 0.675188422203064, + "learning_rate": 8.992374518299619e-06, + "loss": 0.7525, + "step": 7552 + }, + { + "epoch": 0.4157080741923056, + "grad_norm": 0.7406265139579773, + "learning_rate": 8.992113544215059e-06, + "loss": 0.7895, + "step": 7553 + }, + { + "epoch": 0.41576311299466123, + "grad_norm": 0.837336003780365, + "learning_rate": 8.991852540126844e-06, + "loss": 0.7376, + "step": 7554 + }, + { + "epoch": 0.4158181517970169, + "grad_norm": 0.6774994730949402, + "learning_rate": 8.991591506036931e-06, + "loss": 0.7231, + "step": 7555 + }, + { + "epoch": 0.41587319059937256, + "grad_norm": 0.6941245794296265, + "learning_rate": 8.991330441947287e-06, + "loss": 0.7213, + "step": 7556 + }, + { + "epoch": 0.41592822940172824, + "grad_norm": 0.7588210105895996, + "learning_rate": 8.991069347859871e-06, + "loss": 0.7829, + "step": 7557 + }, + { + "epoch": 0.4159832682040839, + "grad_norm": 0.7580196857452393, + "learning_rate": 8.990808223776647e-06, + "loss": 0.7782, + "step": 7558 + }, + { + "epoch": 0.41603830700643957, + "grad_norm": 0.7597478032112122, + "learning_rate": 8.990547069699576e-06, + "loss": 0.7764, + "step": 7559 + }, + { + "epoch": 0.4160933458087952, + "grad_norm": 0.7950314283370972, + "learning_rate": 8.990285885630622e-06, + "loss": 0.7263, + "step": 7560 + }, + { + "epoch": 0.41614838461115083, + "grad_norm": 0.6962432265281677, + "learning_rate": 8.990024671571747e-06, + "loss": 0.6616, + "step": 7561 + }, + { + "epoch": 0.4162034234135065, + "grad_norm": 0.682816207408905, + "learning_rate": 8.989763427524915e-06, + "loss": 0.7862, + "step": 7562 + }, + { + "epoch": 0.41625846221586216, + "grad_norm": 0.686673104763031, + "learning_rate": 8.989502153492089e-06, + "loss": 0.8199, + "step": 7563 + }, + { + "epoch": 0.41631350101821785, + "grad_norm": 0.7954965233802795, + "learning_rate": 8.989240849475231e-06, + "loss": 0.8021, + "step": 7564 + }, + { + "epoch": 0.4163685398205735, + "grad_norm": 0.7516284584999084, + "learning_rate": 8.988979515476309e-06, + "loss": 0.7803, + "step": 7565 + }, + { + "epoch": 0.41642357862292917, + "grad_norm": 0.7148317694664001, + "learning_rate": 8.988718151497284e-06, + "loss": 0.7407, + "step": 7566 + }, + { + "epoch": 0.4164786174252848, + "grad_norm": 0.7898986339569092, + "learning_rate": 8.98845675754012e-06, + "loss": 0.8382, + "step": 7567 + }, + { + "epoch": 0.4165336562276405, + "grad_norm": 0.7014235854148865, + "learning_rate": 8.988195333606784e-06, + "loss": 0.7205, + "step": 7568 + }, + { + "epoch": 0.4165886950299961, + "grad_norm": 0.6520957350730896, + "learning_rate": 8.987933879699238e-06, + "loss": 0.7452, + "step": 7569 + }, + { + "epoch": 0.4166437338323518, + "grad_norm": 0.7462863922119141, + "learning_rate": 8.987672395819449e-06, + "loss": 0.7787, + "step": 7570 + }, + { + "epoch": 0.41669877263470745, + "grad_norm": 0.7366049885749817, + "learning_rate": 8.987410881969382e-06, + "loss": 0.7662, + "step": 7571 + }, + { + "epoch": 0.41675381143706314, + "grad_norm": 0.7732293009757996, + "learning_rate": 8.987149338151002e-06, + "loss": 0.8258, + "step": 7572 + }, + { + "epoch": 0.41680885023941877, + "grad_norm": 0.9309358596801758, + "learning_rate": 8.986887764366275e-06, + "loss": 0.6538, + "step": 7573 + }, + { + "epoch": 0.41686388904177446, + "grad_norm": 0.6976680755615234, + "learning_rate": 8.986626160617167e-06, + "loss": 0.7175, + "step": 7574 + }, + { + "epoch": 0.4169189278441301, + "grad_norm": 0.7541783452033997, + "learning_rate": 8.986364526905645e-06, + "loss": 0.8153, + "step": 7575 + }, + { + "epoch": 0.4169739666464858, + "grad_norm": 0.8968943357467651, + "learning_rate": 8.986102863233673e-06, + "loss": 0.7859, + "step": 7576 + }, + { + "epoch": 0.4170290054488414, + "grad_norm": 0.6910044550895691, + "learning_rate": 8.985841169603218e-06, + "loss": 0.8381, + "step": 7577 + }, + { + "epoch": 0.4170840442511971, + "grad_norm": 0.8944257497787476, + "learning_rate": 8.985579446016249e-06, + "loss": 0.7062, + "step": 7578 + }, + { + "epoch": 0.41713908305355274, + "grad_norm": 0.6665629744529724, + "learning_rate": 8.98531769247473e-06, + "loss": 0.7928, + "step": 7579 + }, + { + "epoch": 0.41719412185590843, + "grad_norm": 0.7642979621887207, + "learning_rate": 8.985055908980634e-06, + "loss": 0.8442, + "step": 7580 + }, + { + "epoch": 0.41724916065826406, + "grad_norm": 0.7575559020042419, + "learning_rate": 8.98479409553592e-06, + "loss": 0.795, + "step": 7581 + }, + { + "epoch": 0.41730419946061975, + "grad_norm": 0.6567206978797913, + "learning_rate": 8.984532252142563e-06, + "loss": 0.713, + "step": 7582 + }, + { + "epoch": 0.4173592382629754, + "grad_norm": 0.6677179336547852, + "learning_rate": 8.984270378802527e-06, + "loss": 0.8173, + "step": 7583 + }, + { + "epoch": 0.4174142770653311, + "grad_norm": 0.6846007704734802, + "learning_rate": 8.984008475517782e-06, + "loss": 0.7154, + "step": 7584 + }, + { + "epoch": 0.4174693158676867, + "grad_norm": 0.7758762836456299, + "learning_rate": 8.983746542290294e-06, + "loss": 0.8686, + "step": 7585 + }, + { + "epoch": 0.4175243546700424, + "grad_norm": 0.6850305199623108, + "learning_rate": 8.983484579122036e-06, + "loss": 0.7568, + "step": 7586 + }, + { + "epoch": 0.41757939347239803, + "grad_norm": 0.7165307998657227, + "learning_rate": 8.983222586014973e-06, + "loss": 0.7856, + "step": 7587 + }, + { + "epoch": 0.4176344322747537, + "grad_norm": 0.7747449278831482, + "learning_rate": 8.982960562971074e-06, + "loss": 0.8148, + "step": 7588 + }, + { + "epoch": 0.41768947107710935, + "grad_norm": 0.789235532283783, + "learning_rate": 8.982698509992311e-06, + "loss": 0.8021, + "step": 7589 + }, + { + "epoch": 0.41774450987946504, + "grad_norm": 0.664186954498291, + "learning_rate": 8.982436427080652e-06, + "loss": 0.7394, + "step": 7590 + }, + { + "epoch": 0.4177995486818207, + "grad_norm": 0.7045899033546448, + "learning_rate": 8.982174314238069e-06, + "loss": 0.7029, + "step": 7591 + }, + { + "epoch": 0.41785458748417637, + "grad_norm": 0.7569751739501953, + "learning_rate": 8.981912171466525e-06, + "loss": 0.6106, + "step": 7592 + }, + { + "epoch": 0.417909626286532, + "grad_norm": 0.7383938431739807, + "learning_rate": 8.981649998767998e-06, + "loss": 0.8163, + "step": 7593 + }, + { + "epoch": 0.4179646650888877, + "grad_norm": 0.7314342856407166, + "learning_rate": 8.981387796144456e-06, + "loss": 0.6847, + "step": 7594 + }, + { + "epoch": 0.4180197038912433, + "grad_norm": 0.7249840497970581, + "learning_rate": 8.981125563597867e-06, + "loss": 0.8025, + "step": 7595 + }, + { + "epoch": 0.418074742693599, + "grad_norm": 0.7260022759437561, + "learning_rate": 8.980863301130206e-06, + "loss": 0.7807, + "step": 7596 + }, + { + "epoch": 0.41812978149595464, + "grad_norm": 0.6249421834945679, + "learning_rate": 8.980601008743441e-06, + "loss": 0.6744, + "step": 7597 + }, + { + "epoch": 0.41818482029831033, + "grad_norm": 0.8132835626602173, + "learning_rate": 8.980338686439544e-06, + "loss": 0.7992, + "step": 7598 + }, + { + "epoch": 0.41823985910066597, + "grad_norm": 0.7279506921768188, + "learning_rate": 8.980076334220487e-06, + "loss": 0.8402, + "step": 7599 + }, + { + "epoch": 0.41829489790302166, + "grad_norm": 0.7168325781822205, + "learning_rate": 8.979813952088242e-06, + "loss": 0.9107, + "step": 7600 + }, + { + "epoch": 0.4183499367053773, + "grad_norm": 0.633661150932312, + "learning_rate": 8.97955154004478e-06, + "loss": 0.6328, + "step": 7601 + }, + { + "epoch": 0.418404975507733, + "grad_norm": 0.6770638227462769, + "learning_rate": 8.979289098092074e-06, + "loss": 0.7604, + "step": 7602 + }, + { + "epoch": 0.4184600143100886, + "grad_norm": 0.7589067816734314, + "learning_rate": 8.979026626232098e-06, + "loss": 0.7774, + "step": 7603 + }, + { + "epoch": 0.41851505311244425, + "grad_norm": 0.7116312980651855, + "learning_rate": 8.97876412446682e-06, + "loss": 0.8186, + "step": 7604 + }, + { + "epoch": 0.41857009191479994, + "grad_norm": 0.7369259595870972, + "learning_rate": 8.978501592798219e-06, + "loss": 0.6705, + "step": 7605 + }, + { + "epoch": 0.41862513071715557, + "grad_norm": 0.6201806664466858, + "learning_rate": 8.978239031228265e-06, + "loss": 0.7011, + "step": 7606 + }, + { + "epoch": 0.41868016951951126, + "grad_norm": 0.7652842998504639, + "learning_rate": 8.977976439758929e-06, + "loss": 0.8112, + "step": 7607 + }, + { + "epoch": 0.4187352083218669, + "grad_norm": 0.7214640974998474, + "learning_rate": 8.97771381839219e-06, + "loss": 0.767, + "step": 7608 + }, + { + "epoch": 0.4187902471242226, + "grad_norm": 0.8093706369400024, + "learning_rate": 8.977451167130015e-06, + "loss": 0.8112, + "step": 7609 + }, + { + "epoch": 0.4188452859265782, + "grad_norm": 0.7023005485534668, + "learning_rate": 8.977188485974382e-06, + "loss": 0.7678, + "step": 7610 + }, + { + "epoch": 0.4189003247289339, + "grad_norm": 0.8126183748245239, + "learning_rate": 8.976925774927267e-06, + "loss": 0.8207, + "step": 7611 + }, + { + "epoch": 0.41895536353128954, + "grad_norm": 0.9624595642089844, + "learning_rate": 8.976663033990643e-06, + "loss": 0.7853, + "step": 7612 + }, + { + "epoch": 0.4190104023336452, + "grad_norm": 0.7866421937942505, + "learning_rate": 8.976400263166483e-06, + "loss": 0.6319, + "step": 7613 + }, + { + "epoch": 0.41906544113600086, + "grad_norm": 0.7555810213088989, + "learning_rate": 8.976137462456762e-06, + "loss": 0.7781, + "step": 7614 + }, + { + "epoch": 0.41912047993835655, + "grad_norm": 0.7383303046226501, + "learning_rate": 8.975874631863457e-06, + "loss": 0.8152, + "step": 7615 + }, + { + "epoch": 0.4191755187407122, + "grad_norm": 0.7873355746269226, + "learning_rate": 8.975611771388542e-06, + "loss": 0.723, + "step": 7616 + }, + { + "epoch": 0.41923055754306787, + "grad_norm": 0.7265962362289429, + "learning_rate": 8.975348881033993e-06, + "loss": 0.8016, + "step": 7617 + }, + { + "epoch": 0.4192855963454235, + "grad_norm": 0.7074393033981323, + "learning_rate": 8.975085960801788e-06, + "loss": 0.7453, + "step": 7618 + }, + { + "epoch": 0.4193406351477792, + "grad_norm": 0.6975581049919128, + "learning_rate": 8.9748230106939e-06, + "loss": 0.6516, + "step": 7619 + }, + { + "epoch": 0.41939567395013483, + "grad_norm": 0.7730469107627869, + "learning_rate": 8.974560030712304e-06, + "loss": 0.7297, + "step": 7620 + }, + { + "epoch": 0.4194507127524905, + "grad_norm": 0.7289026379585266, + "learning_rate": 8.974297020858982e-06, + "loss": 0.7087, + "step": 7621 + }, + { + "epoch": 0.41950575155484615, + "grad_norm": 0.8029256463050842, + "learning_rate": 8.974033981135906e-06, + "loss": 0.7923, + "step": 7622 + }, + { + "epoch": 0.41956079035720184, + "grad_norm": 0.765312135219574, + "learning_rate": 8.973770911545055e-06, + "loss": 0.7824, + "step": 7623 + }, + { + "epoch": 0.4196158291595575, + "grad_norm": 0.7903861403465271, + "learning_rate": 8.973507812088404e-06, + "loss": 0.8207, + "step": 7624 + }, + { + "epoch": 0.41967086796191316, + "grad_norm": 0.6875497698783875, + "learning_rate": 8.973244682767934e-06, + "loss": 0.7972, + "step": 7625 + }, + { + "epoch": 0.4197259067642688, + "grad_norm": 0.7781878709793091, + "learning_rate": 8.972981523585617e-06, + "loss": 0.754, + "step": 7626 + }, + { + "epoch": 0.4197809455666245, + "grad_norm": 0.6495640873908997, + "learning_rate": 8.972718334543437e-06, + "loss": 0.6851, + "step": 7627 + }, + { + "epoch": 0.4198359843689801, + "grad_norm": 0.7610780596733093, + "learning_rate": 8.97245511564337e-06, + "loss": 0.8161, + "step": 7628 + }, + { + "epoch": 0.4198910231713358, + "grad_norm": 0.7764771580696106, + "learning_rate": 8.972191866887393e-06, + "loss": 0.8341, + "step": 7629 + }, + { + "epoch": 0.41994606197369144, + "grad_norm": 0.7709774374961853, + "learning_rate": 8.971928588277485e-06, + "loss": 0.765, + "step": 7630 + }, + { + "epoch": 0.42000110077604713, + "grad_norm": 0.8213009238243103, + "learning_rate": 8.971665279815625e-06, + "loss": 0.8971, + "step": 7631 + }, + { + "epoch": 0.42005613957840277, + "grad_norm": 0.7232406735420227, + "learning_rate": 8.971401941503792e-06, + "loss": 0.7919, + "step": 7632 + }, + { + "epoch": 0.42011117838075845, + "grad_norm": 0.7322028279304504, + "learning_rate": 8.971138573343964e-06, + "loss": 0.8167, + "step": 7633 + }, + { + "epoch": 0.4201662171831141, + "grad_norm": 0.7204442024230957, + "learning_rate": 8.970875175338123e-06, + "loss": 0.8152, + "step": 7634 + }, + { + "epoch": 0.4202212559854698, + "grad_norm": 0.7385342121124268, + "learning_rate": 8.970611747488246e-06, + "loss": 0.8204, + "step": 7635 + }, + { + "epoch": 0.4202762947878254, + "grad_norm": 0.758941113948822, + "learning_rate": 8.970348289796316e-06, + "loss": 0.8402, + "step": 7636 + }, + { + "epoch": 0.4203313335901811, + "grad_norm": 0.7331902384757996, + "learning_rate": 8.970084802264309e-06, + "loss": 0.7305, + "step": 7637 + }, + { + "epoch": 0.42038637239253673, + "grad_norm": 0.7822885513305664, + "learning_rate": 8.969821284894208e-06, + "loss": 0.8708, + "step": 7638 + }, + { + "epoch": 0.4204414111948924, + "grad_norm": 0.6625984311103821, + "learning_rate": 8.969557737687992e-06, + "loss": 0.7806, + "step": 7639 + }, + { + "epoch": 0.42049644999724806, + "grad_norm": 1.02848482131958, + "learning_rate": 8.969294160647645e-06, + "loss": 0.7176, + "step": 7640 + }, + { + "epoch": 0.42055148879960375, + "grad_norm": 0.7888724207878113, + "learning_rate": 8.969030553775144e-06, + "loss": 0.8326, + "step": 7641 + }, + { + "epoch": 0.4206065276019594, + "grad_norm": 0.7148883938789368, + "learning_rate": 8.968766917072472e-06, + "loss": 0.7405, + "step": 7642 + }, + { + "epoch": 0.42066156640431507, + "grad_norm": 0.6629698872566223, + "learning_rate": 8.96850325054161e-06, + "loss": 0.845, + "step": 7643 + }, + { + "epoch": 0.4207166052066707, + "grad_norm": 0.8414682149887085, + "learning_rate": 8.96823955418454e-06, + "loss": 1.3631, + "step": 7644 + }, + { + "epoch": 0.4207716440090264, + "grad_norm": 0.7105298638343811, + "learning_rate": 8.967975828003244e-06, + "loss": 0.6808, + "step": 7645 + }, + { + "epoch": 0.420826682811382, + "grad_norm": 0.7324852347373962, + "learning_rate": 8.967712071999703e-06, + "loss": 0.8237, + "step": 7646 + }, + { + "epoch": 0.42088172161373766, + "grad_norm": 0.737324595451355, + "learning_rate": 8.9674482861759e-06, + "loss": 0.8486, + "step": 7647 + }, + { + "epoch": 0.42093676041609335, + "grad_norm": 0.6763800382614136, + "learning_rate": 8.967184470533818e-06, + "loss": 0.72, + "step": 7648 + }, + { + "epoch": 0.420991799218449, + "grad_norm": 0.7560757994651794, + "learning_rate": 8.96692062507544e-06, + "loss": 0.7704, + "step": 7649 + }, + { + "epoch": 0.42104683802080467, + "grad_norm": 0.7289260029792786, + "learning_rate": 8.966656749802748e-06, + "loss": 0.7411, + "step": 7650 + }, + { + "epoch": 0.4211018768231603, + "grad_norm": 0.6935442686080933, + "learning_rate": 8.966392844717726e-06, + "loss": 0.7848, + "step": 7651 + }, + { + "epoch": 0.421156915625516, + "grad_norm": 0.7111918330192566, + "learning_rate": 8.966128909822356e-06, + "loss": 0.8377, + "step": 7652 + }, + { + "epoch": 0.4212119544278716, + "grad_norm": 0.8594884872436523, + "learning_rate": 8.965864945118625e-06, + "loss": 0.8227, + "step": 7653 + }, + { + "epoch": 0.4212669932302273, + "grad_norm": 0.6521008014678955, + "learning_rate": 8.965600950608513e-06, + "loss": 0.7034, + "step": 7654 + }, + { + "epoch": 0.42132203203258295, + "grad_norm": 0.6362404823303223, + "learning_rate": 8.965336926294007e-06, + "loss": 0.6712, + "step": 7655 + }, + { + "epoch": 0.42137707083493864, + "grad_norm": 0.6955040097236633, + "learning_rate": 8.965072872177088e-06, + "loss": 0.7789, + "step": 7656 + }, + { + "epoch": 0.42143210963729427, + "grad_norm": 0.7311720252037048, + "learning_rate": 8.964808788259745e-06, + "loss": 0.7522, + "step": 7657 + }, + { + "epoch": 0.42148714843964996, + "grad_norm": 0.781131386756897, + "learning_rate": 8.96454467454396e-06, + "loss": 0.7831, + "step": 7658 + }, + { + "epoch": 0.4215421872420056, + "grad_norm": 0.6740639805793762, + "learning_rate": 8.964280531031718e-06, + "loss": 0.7102, + "step": 7659 + }, + { + "epoch": 0.4215972260443613, + "grad_norm": 0.7843424677848816, + "learning_rate": 8.964016357725003e-06, + "loss": 0.8325, + "step": 7660 + }, + { + "epoch": 0.4216522648467169, + "grad_norm": 0.7833517789840698, + "learning_rate": 8.963752154625804e-06, + "loss": 0.8603, + "step": 7661 + }, + { + "epoch": 0.4217073036490726, + "grad_norm": 0.7270992994308472, + "learning_rate": 8.963487921736104e-06, + "loss": 0.745, + "step": 7662 + }, + { + "epoch": 0.42176234245142824, + "grad_norm": 0.6517582535743713, + "learning_rate": 8.963223659057892e-06, + "loss": 0.6983, + "step": 7663 + }, + { + "epoch": 0.42181738125378393, + "grad_norm": 0.6974934935569763, + "learning_rate": 8.962959366593149e-06, + "loss": 0.733, + "step": 7664 + }, + { + "epoch": 0.42187242005613956, + "grad_norm": 0.712045431137085, + "learning_rate": 8.962695044343865e-06, + "loss": 0.725, + "step": 7665 + }, + { + "epoch": 0.42192745885849525, + "grad_norm": 0.7311459183692932, + "learning_rate": 8.962430692312028e-06, + "loss": 0.8025, + "step": 7666 + }, + { + "epoch": 0.4219824976608509, + "grad_norm": 0.7439966201782227, + "learning_rate": 8.962166310499621e-06, + "loss": 0.7711, + "step": 7667 + }, + { + "epoch": 0.4220375364632066, + "grad_norm": 0.690832257270813, + "learning_rate": 8.961901898908632e-06, + "loss": 0.8414, + "step": 7668 + }, + { + "epoch": 0.4220925752655622, + "grad_norm": 0.8437964916229248, + "learning_rate": 8.961637457541049e-06, + "loss": 0.8253, + "step": 7669 + }, + { + "epoch": 0.4221476140679179, + "grad_norm": 0.7876344323158264, + "learning_rate": 8.96137298639886e-06, + "loss": 0.754, + "step": 7670 + }, + { + "epoch": 0.42220265287027353, + "grad_norm": 0.7551780343055725, + "learning_rate": 8.961108485484052e-06, + "loss": 0.8555, + "step": 7671 + }, + { + "epoch": 0.4222576916726292, + "grad_norm": 0.6867276430130005, + "learning_rate": 8.96084395479861e-06, + "loss": 0.7216, + "step": 7672 + }, + { + "epoch": 0.42231273047498485, + "grad_norm": 0.9052873849868774, + "learning_rate": 8.960579394344528e-06, + "loss": 0.7945, + "step": 7673 + }, + { + "epoch": 0.42236776927734054, + "grad_norm": 0.6731994152069092, + "learning_rate": 8.96031480412379e-06, + "loss": 0.7691, + "step": 7674 + }, + { + "epoch": 0.4224228080796962, + "grad_norm": 0.7074670195579529, + "learning_rate": 8.960050184138389e-06, + "loss": 0.8008, + "step": 7675 + }, + { + "epoch": 0.42247784688205187, + "grad_norm": 0.9482604265213013, + "learning_rate": 8.959785534390309e-06, + "loss": 0.7095, + "step": 7676 + }, + { + "epoch": 0.4225328856844075, + "grad_norm": 0.6915413737297058, + "learning_rate": 8.95952085488154e-06, + "loss": 0.6717, + "step": 7677 + }, + { + "epoch": 0.4225879244867632, + "grad_norm": 0.7565900087356567, + "learning_rate": 8.959256145614073e-06, + "loss": 0.8311, + "step": 7678 + }, + { + "epoch": 0.4226429632891188, + "grad_norm": 0.8307167887687683, + "learning_rate": 8.958991406589896e-06, + "loss": 0.8585, + "step": 7679 + }, + { + "epoch": 0.4226980020914745, + "grad_norm": 0.7955091595649719, + "learning_rate": 8.958726637811e-06, + "loss": 0.8154, + "step": 7680 + }, + { + "epoch": 0.42275304089383015, + "grad_norm": 0.7692292332649231, + "learning_rate": 8.958461839279376e-06, + "loss": 0.7965, + "step": 7681 + }, + { + "epoch": 0.42280807969618583, + "grad_norm": 0.7355942726135254, + "learning_rate": 8.95819701099701e-06, + "loss": 0.7557, + "step": 7682 + }, + { + "epoch": 0.42286311849854147, + "grad_norm": 0.8781518936157227, + "learning_rate": 8.957932152965895e-06, + "loss": 0.8033, + "step": 7683 + }, + { + "epoch": 0.42291815730089716, + "grad_norm": 0.7180802226066589, + "learning_rate": 8.957667265188022e-06, + "loss": 0.7283, + "step": 7684 + }, + { + "epoch": 0.4229731961032528, + "grad_norm": 0.6967236995697021, + "learning_rate": 8.95740234766538e-06, + "loss": 0.769, + "step": 7685 + }, + { + "epoch": 0.4230282349056085, + "grad_norm": 0.7462503910064697, + "learning_rate": 8.957137400399963e-06, + "loss": 0.8179, + "step": 7686 + }, + { + "epoch": 0.4230832737079641, + "grad_norm": 0.67714524269104, + "learning_rate": 8.956872423393761e-06, + "loss": 0.7976, + "step": 7687 + }, + { + "epoch": 0.4231383125103198, + "grad_norm": 0.8239946365356445, + "learning_rate": 8.956607416648763e-06, + "loss": 0.7946, + "step": 7688 + }, + { + "epoch": 0.42319335131267544, + "grad_norm": 0.6724610924720764, + "learning_rate": 8.956342380166963e-06, + "loss": 0.7633, + "step": 7689 + }, + { + "epoch": 0.42324839011503107, + "grad_norm": 0.744987964630127, + "learning_rate": 8.956077313950354e-06, + "loss": 0.9028, + "step": 7690 + }, + { + "epoch": 0.42330342891738676, + "grad_norm": 0.7700596451759338, + "learning_rate": 8.955812218000925e-06, + "loss": 0.8954, + "step": 7691 + }, + { + "epoch": 0.4233584677197424, + "grad_norm": 0.6952996253967285, + "learning_rate": 8.955547092320673e-06, + "loss": 0.8094, + "step": 7692 + }, + { + "epoch": 0.4234135065220981, + "grad_norm": 0.6410536766052246, + "learning_rate": 8.955281936911586e-06, + "loss": 0.6281, + "step": 7693 + }, + { + "epoch": 0.4234685453244537, + "grad_norm": 1.0939754247665405, + "learning_rate": 8.95501675177566e-06, + "loss": 0.8239, + "step": 7694 + }, + { + "epoch": 0.4235235841268094, + "grad_norm": 0.7419464588165283, + "learning_rate": 8.954751536914885e-06, + "loss": 0.8015, + "step": 7695 + }, + { + "epoch": 0.42357862292916504, + "grad_norm": 0.8171356320381165, + "learning_rate": 8.954486292331257e-06, + "loss": 0.8183, + "step": 7696 + }, + { + "epoch": 0.4236336617315207, + "grad_norm": 0.745884358882904, + "learning_rate": 8.95422101802677e-06, + "loss": 0.7457, + "step": 7697 + }, + { + "epoch": 0.42368870053387636, + "grad_norm": 0.7355740070343018, + "learning_rate": 8.953955714003414e-06, + "loss": 0.7517, + "step": 7698 + }, + { + "epoch": 0.42374373933623205, + "grad_norm": 0.7103458642959595, + "learning_rate": 8.953690380263186e-06, + "loss": 0.7306, + "step": 7699 + }, + { + "epoch": 0.4237987781385877, + "grad_norm": 0.7453970909118652, + "learning_rate": 8.95342501680808e-06, + "loss": 0.8396, + "step": 7700 + }, + { + "epoch": 0.4238538169409434, + "grad_norm": 0.7132760286331177, + "learning_rate": 8.953159623640088e-06, + "loss": 0.7861, + "step": 7701 + }, + { + "epoch": 0.423908855743299, + "grad_norm": 0.785827100276947, + "learning_rate": 8.952894200761209e-06, + "loss": 0.8681, + "step": 7702 + }, + { + "epoch": 0.4239638945456547, + "grad_norm": 0.7075281143188477, + "learning_rate": 8.952628748173433e-06, + "loss": 0.7257, + "step": 7703 + }, + { + "epoch": 0.42401893334801033, + "grad_norm": 0.8205186724662781, + "learning_rate": 8.952363265878758e-06, + "loss": 0.7361, + "step": 7704 + }, + { + "epoch": 0.424073972150366, + "grad_norm": 0.6517061591148376, + "learning_rate": 8.952097753879181e-06, + "loss": 0.7127, + "step": 7705 + }, + { + "epoch": 0.42412901095272165, + "grad_norm": 0.7252761125564575, + "learning_rate": 8.951832212176692e-06, + "loss": 0.796, + "step": 7706 + }, + { + "epoch": 0.42418404975507734, + "grad_norm": 0.6688609719276428, + "learning_rate": 8.951566640773292e-06, + "loss": 0.7698, + "step": 7707 + }, + { + "epoch": 0.424239088557433, + "grad_norm": 0.7163566946983337, + "learning_rate": 8.951301039670974e-06, + "loss": 0.8069, + "step": 7708 + }, + { + "epoch": 0.42429412735978866, + "grad_norm": 0.7027623057365417, + "learning_rate": 8.951035408871735e-06, + "loss": 0.7061, + "step": 7709 + }, + { + "epoch": 0.4243491661621443, + "grad_norm": 0.9558683037757874, + "learning_rate": 8.950769748377572e-06, + "loss": 0.926, + "step": 7710 + }, + { + "epoch": 0.4244042049645, + "grad_norm": 0.7173893451690674, + "learning_rate": 8.950504058190482e-06, + "loss": 0.7519, + "step": 7711 + }, + { + "epoch": 0.4244592437668556, + "grad_norm": 0.8481128811836243, + "learning_rate": 8.950238338312459e-06, + "loss": 0.7804, + "step": 7712 + }, + { + "epoch": 0.4245142825692113, + "grad_norm": 0.6957072615623474, + "learning_rate": 8.949972588745502e-06, + "loss": 0.611, + "step": 7713 + }, + { + "epoch": 0.42456932137156694, + "grad_norm": 0.7910122871398926, + "learning_rate": 8.94970680949161e-06, + "loss": 0.8435, + "step": 7714 + }, + { + "epoch": 0.42462436017392263, + "grad_norm": 0.8068616986274719, + "learning_rate": 8.949441000552777e-06, + "loss": 0.8658, + "step": 7715 + }, + { + "epoch": 0.42467939897627827, + "grad_norm": 0.718110978603363, + "learning_rate": 8.949175161931006e-06, + "loss": 0.7908, + "step": 7716 + }, + { + "epoch": 0.42473443777863396, + "grad_norm": 0.7329656481742859, + "learning_rate": 8.948909293628289e-06, + "loss": 0.7477, + "step": 7717 + }, + { + "epoch": 0.4247894765809896, + "grad_norm": 0.7046940326690674, + "learning_rate": 8.948643395646625e-06, + "loss": 0.7985, + "step": 7718 + }, + { + "epoch": 0.4248445153833453, + "grad_norm": 0.6699581742286682, + "learning_rate": 8.948377467988017e-06, + "loss": 0.6575, + "step": 7719 + }, + { + "epoch": 0.4248995541857009, + "grad_norm": 0.8055217266082764, + "learning_rate": 8.94811151065446e-06, + "loss": 0.7008, + "step": 7720 + }, + { + "epoch": 0.4249545929880566, + "grad_norm": 0.8374543190002441, + "learning_rate": 8.947845523647954e-06, + "loss": 0.8918, + "step": 7721 + }, + { + "epoch": 0.42500963179041223, + "grad_norm": 0.6974833607673645, + "learning_rate": 8.947579506970498e-06, + "loss": 0.8594, + "step": 7722 + }, + { + "epoch": 0.4250646705927679, + "grad_norm": 0.7466567754745483, + "learning_rate": 8.947313460624091e-06, + "loss": 0.6935, + "step": 7723 + }, + { + "epoch": 0.42511970939512356, + "grad_norm": 0.8118101358413696, + "learning_rate": 8.947047384610734e-06, + "loss": 0.8432, + "step": 7724 + }, + { + "epoch": 0.42517474819747925, + "grad_norm": 0.6885644197463989, + "learning_rate": 8.946781278932422e-06, + "loss": 0.8059, + "step": 7725 + }, + { + "epoch": 0.4252297869998349, + "grad_norm": 0.7257012128829956, + "learning_rate": 8.94651514359116e-06, + "loss": 0.8239, + "step": 7726 + }, + { + "epoch": 0.42528482580219057, + "grad_norm": 1.311591386795044, + "learning_rate": 8.946248978588947e-06, + "loss": 0.8207, + "step": 7727 + }, + { + "epoch": 0.4253398646045462, + "grad_norm": 0.7694151997566223, + "learning_rate": 8.945982783927784e-06, + "loss": 0.8948, + "step": 7728 + }, + { + "epoch": 0.4253949034069019, + "grad_norm": 0.6922980546951294, + "learning_rate": 8.945716559609669e-06, + "loss": 0.7883, + "step": 7729 + }, + { + "epoch": 0.4254499422092575, + "grad_norm": 0.7803757786750793, + "learning_rate": 8.945450305636605e-06, + "loss": 0.9166, + "step": 7730 + }, + { + "epoch": 0.4255049810116132, + "grad_norm": 0.6775311827659607, + "learning_rate": 8.945184022010593e-06, + "loss": 0.6976, + "step": 7731 + }, + { + "epoch": 0.42556001981396885, + "grad_norm": 0.7108052968978882, + "learning_rate": 8.944917708733634e-06, + "loss": 0.7763, + "step": 7732 + }, + { + "epoch": 0.4256150586163245, + "grad_norm": 0.7215770483016968, + "learning_rate": 8.94465136580773e-06, + "loss": 0.7907, + "step": 7733 + }, + { + "epoch": 0.42567009741868017, + "grad_norm": 0.6690788865089417, + "learning_rate": 8.944384993234881e-06, + "loss": 0.8403, + "step": 7734 + }, + { + "epoch": 0.4257251362210358, + "grad_norm": 0.7372478246688843, + "learning_rate": 8.94411859101709e-06, + "loss": 0.7618, + "step": 7735 + }, + { + "epoch": 0.4257801750233915, + "grad_norm": 0.9398306608200073, + "learning_rate": 8.94385215915636e-06, + "loss": 0.9043, + "step": 7736 + }, + { + "epoch": 0.4258352138257471, + "grad_norm": 0.8790311217308044, + "learning_rate": 8.943585697654693e-06, + "loss": 0.9378, + "step": 7737 + }, + { + "epoch": 0.4258902526281028, + "grad_norm": 0.7579166889190674, + "learning_rate": 8.943319206514091e-06, + "loss": 0.7913, + "step": 7738 + }, + { + "epoch": 0.42594529143045845, + "grad_norm": 0.6426860690116882, + "learning_rate": 8.943052685736559e-06, + "loss": 0.744, + "step": 7739 + }, + { + "epoch": 0.42600033023281414, + "grad_norm": 0.688117265701294, + "learning_rate": 8.942786135324098e-06, + "loss": 0.8386, + "step": 7740 + }, + { + "epoch": 0.4260553690351698, + "grad_norm": 0.7178692817687988, + "learning_rate": 8.94251955527871e-06, + "loss": 0.7937, + "step": 7741 + }, + { + "epoch": 0.42611040783752546, + "grad_norm": 0.7980415225028992, + "learning_rate": 8.942252945602403e-06, + "loss": 0.76, + "step": 7742 + }, + { + "epoch": 0.4261654466398811, + "grad_norm": 0.6858333349227905, + "learning_rate": 8.941986306297175e-06, + "loss": 0.8155, + "step": 7743 + }, + { + "epoch": 0.4262204854422368, + "grad_norm": 0.763297975063324, + "learning_rate": 8.941719637365037e-06, + "loss": 0.8003, + "step": 7744 + }, + { + "epoch": 0.4262755242445924, + "grad_norm": 0.661016047000885, + "learning_rate": 8.941452938807986e-06, + "loss": 0.6788, + "step": 7745 + }, + { + "epoch": 0.4263305630469481, + "grad_norm": 0.7168089151382446, + "learning_rate": 8.94118621062803e-06, + "loss": 0.7791, + "step": 7746 + }, + { + "epoch": 0.42638560184930374, + "grad_norm": 0.6879743337631226, + "learning_rate": 8.940919452827174e-06, + "loss": 0.7978, + "step": 7747 + }, + { + "epoch": 0.42644064065165943, + "grad_norm": 0.672298014163971, + "learning_rate": 8.940652665407424e-06, + "loss": 0.7569, + "step": 7748 + }, + { + "epoch": 0.42649567945401506, + "grad_norm": 0.7237414717674255, + "learning_rate": 8.940385848370782e-06, + "loss": 0.6788, + "step": 7749 + }, + { + "epoch": 0.42655071825637075, + "grad_norm": 0.6793895363807678, + "learning_rate": 8.940119001719255e-06, + "loss": 0.749, + "step": 7750 + }, + { + "epoch": 0.4266057570587264, + "grad_norm": 1.1172789335250854, + "learning_rate": 8.939852125454847e-06, + "loss": 0.9017, + "step": 7751 + }, + { + "epoch": 0.4266607958610821, + "grad_norm": 0.7138717770576477, + "learning_rate": 8.939585219579567e-06, + "loss": 0.8586, + "step": 7752 + }, + { + "epoch": 0.4267158346634377, + "grad_norm": 0.8678629398345947, + "learning_rate": 8.939318284095417e-06, + "loss": 0.7333, + "step": 7753 + }, + { + "epoch": 0.4267708734657934, + "grad_norm": 0.7274941802024841, + "learning_rate": 8.939051319004407e-06, + "loss": 0.8426, + "step": 7754 + }, + { + "epoch": 0.42682591226814903, + "grad_norm": 0.6845358610153198, + "learning_rate": 8.93878432430854e-06, + "loss": 0.7731, + "step": 7755 + }, + { + "epoch": 0.4268809510705047, + "grad_norm": 0.7042781710624695, + "learning_rate": 8.938517300009826e-06, + "loss": 0.6703, + "step": 7756 + }, + { + "epoch": 0.42693598987286036, + "grad_norm": 0.7147190570831299, + "learning_rate": 8.93825024611027e-06, + "loss": 0.7977, + "step": 7757 + }, + { + "epoch": 0.42699102867521604, + "grad_norm": 0.6584187150001526, + "learning_rate": 8.93798316261188e-06, + "loss": 0.716, + "step": 7758 + }, + { + "epoch": 0.4270460674775717, + "grad_norm": 0.8061439990997314, + "learning_rate": 8.93771604951666e-06, + "loss": 0.9075, + "step": 7759 + }, + { + "epoch": 0.42710110627992737, + "grad_norm": 0.6741406917572021, + "learning_rate": 8.937448906826622e-06, + "loss": 0.7828, + "step": 7760 + }, + { + "epoch": 0.427156145082283, + "grad_norm": 0.8791692852973938, + "learning_rate": 8.937181734543773e-06, + "loss": 0.7685, + "step": 7761 + }, + { + "epoch": 0.4272111838846387, + "grad_norm": 0.6804112195968628, + "learning_rate": 8.936914532670119e-06, + "loss": 0.7672, + "step": 7762 + }, + { + "epoch": 0.4272662226869943, + "grad_norm": 0.6983451843261719, + "learning_rate": 8.936647301207668e-06, + "loss": 0.8228, + "step": 7763 + }, + { + "epoch": 0.42732126148935, + "grad_norm": 0.8248929977416992, + "learning_rate": 8.936380040158432e-06, + "loss": 0.7628, + "step": 7764 + }, + { + "epoch": 0.42737630029170565, + "grad_norm": 0.8324941992759705, + "learning_rate": 8.936112749524415e-06, + "loss": 0.8125, + "step": 7765 + }, + { + "epoch": 0.42743133909406134, + "grad_norm": 0.7489150762557983, + "learning_rate": 8.935845429307631e-06, + "loss": 0.8766, + "step": 7766 + }, + { + "epoch": 0.42748637789641697, + "grad_norm": 0.7323104739189148, + "learning_rate": 8.935578079510083e-06, + "loss": 0.8607, + "step": 7767 + }, + { + "epoch": 0.42754141669877266, + "grad_norm": 0.6825152635574341, + "learning_rate": 8.935310700133786e-06, + "loss": 0.7817, + "step": 7768 + }, + { + "epoch": 0.4275964555011283, + "grad_norm": 0.8928677439689636, + "learning_rate": 8.935043291180748e-06, + "loss": 0.7621, + "step": 7769 + }, + { + "epoch": 0.427651494303484, + "grad_norm": 0.7071405649185181, + "learning_rate": 8.934775852652975e-06, + "loss": 0.7798, + "step": 7770 + }, + { + "epoch": 0.4277065331058396, + "grad_norm": 0.8225427269935608, + "learning_rate": 8.934508384552481e-06, + "loss": 0.7212, + "step": 7771 + }, + { + "epoch": 0.4277615719081953, + "grad_norm": 0.6931234002113342, + "learning_rate": 8.934240886881276e-06, + "loss": 0.7301, + "step": 7772 + }, + { + "epoch": 0.42781661071055094, + "grad_norm": 0.6901859641075134, + "learning_rate": 8.933973359641369e-06, + "loss": 0.6974, + "step": 7773 + }, + { + "epoch": 0.4278716495129066, + "grad_norm": 0.7736960649490356, + "learning_rate": 8.93370580283477e-06, + "loss": 0.6562, + "step": 7774 + }, + { + "epoch": 0.42792668831526226, + "grad_norm": 0.7363499999046326, + "learning_rate": 8.933438216463495e-06, + "loss": 0.8274, + "step": 7775 + }, + { + "epoch": 0.4279817271176179, + "grad_norm": 0.6855602860450745, + "learning_rate": 8.933170600529548e-06, + "loss": 0.7576, + "step": 7776 + }, + { + "epoch": 0.4280367659199736, + "grad_norm": 0.7641676664352417, + "learning_rate": 8.932902955034945e-06, + "loss": 0.7837, + "step": 7777 + }, + { + "epoch": 0.4280918047223292, + "grad_norm": 0.74812251329422, + "learning_rate": 8.932635279981695e-06, + "loss": 0.8402, + "step": 7778 + }, + { + "epoch": 0.4281468435246849, + "grad_norm": 0.7445259094238281, + "learning_rate": 8.932367575371813e-06, + "loss": 0.862, + "step": 7779 + }, + { + "epoch": 0.42820188232704054, + "grad_norm": 0.8977177739143372, + "learning_rate": 8.932099841207306e-06, + "loss": 0.7735, + "step": 7780 + }, + { + "epoch": 0.42825692112939623, + "grad_norm": 0.74172043800354, + "learning_rate": 8.93183207749019e-06, + "loss": 0.7053, + "step": 7781 + }, + { + "epoch": 0.42831195993175186, + "grad_norm": 0.6670083999633789, + "learning_rate": 8.931564284222479e-06, + "loss": 0.6348, + "step": 7782 + }, + { + "epoch": 0.42836699873410755, + "grad_norm": 0.7575422525405884, + "learning_rate": 8.93129646140618e-06, + "loss": 0.9354, + "step": 7783 + }, + { + "epoch": 0.4284220375364632, + "grad_norm": 0.7436977624893188, + "learning_rate": 8.931028609043311e-06, + "loss": 0.7461, + "step": 7784 + }, + { + "epoch": 0.4284770763388189, + "grad_norm": 0.7383070588111877, + "learning_rate": 8.930760727135882e-06, + "loss": 0.7629, + "step": 7785 + }, + { + "epoch": 0.4285321151411745, + "grad_norm": 0.6926067471504211, + "learning_rate": 8.93049281568591e-06, + "loss": 0.6788, + "step": 7786 + }, + { + "epoch": 0.4285871539435302, + "grad_norm": 0.7680530548095703, + "learning_rate": 8.930224874695404e-06, + "loss": 0.722, + "step": 7787 + }, + { + "epoch": 0.42864219274588583, + "grad_norm": 0.9880867004394531, + "learning_rate": 8.92995690416638e-06, + "loss": 0.833, + "step": 7788 + }, + { + "epoch": 0.4286972315482415, + "grad_norm": 0.7915430068969727, + "learning_rate": 8.929688904100853e-06, + "loss": 0.7643, + "step": 7789 + }, + { + "epoch": 0.42875227035059715, + "grad_norm": 0.6972275376319885, + "learning_rate": 8.929420874500836e-06, + "loss": 0.7697, + "step": 7790 + }, + { + "epoch": 0.42880730915295284, + "grad_norm": 0.9583331346511841, + "learning_rate": 8.929152815368343e-06, + "loss": 0.7591, + "step": 7791 + }, + { + "epoch": 0.4288623479553085, + "grad_norm": 0.7254299521446228, + "learning_rate": 8.928884726705388e-06, + "loss": 0.7913, + "step": 7792 + }, + { + "epoch": 0.42891738675766417, + "grad_norm": 0.7925865054130554, + "learning_rate": 8.928616608513989e-06, + "loss": 0.8248, + "step": 7793 + }, + { + "epoch": 0.4289724255600198, + "grad_norm": 0.9367457628250122, + "learning_rate": 8.928348460796157e-06, + "loss": 0.7767, + "step": 7794 + }, + { + "epoch": 0.4290274643623755, + "grad_norm": 0.8511868119239807, + "learning_rate": 8.928080283553912e-06, + "loss": 0.841, + "step": 7795 + }, + { + "epoch": 0.4290825031647311, + "grad_norm": 0.8518061637878418, + "learning_rate": 8.927812076789267e-06, + "loss": 0.7907, + "step": 7796 + }, + { + "epoch": 0.4291375419670868, + "grad_norm": 0.7208365797996521, + "learning_rate": 8.927543840504236e-06, + "loss": 0.7344, + "step": 7797 + }, + { + "epoch": 0.42919258076944244, + "grad_norm": 0.7541850209236145, + "learning_rate": 8.927275574700838e-06, + "loss": 0.7724, + "step": 7798 + }, + { + "epoch": 0.42924761957179813, + "grad_norm": 0.7378629446029663, + "learning_rate": 8.927007279381087e-06, + "loss": 0.7614, + "step": 7799 + }, + { + "epoch": 0.42930265837415377, + "grad_norm": 0.7358561158180237, + "learning_rate": 8.926738954547001e-06, + "loss": 0.7288, + "step": 7800 + }, + { + "epoch": 0.42935769717650946, + "grad_norm": 0.7385967969894409, + "learning_rate": 8.926470600200597e-06, + "loss": 0.7562, + "step": 7801 + }, + { + "epoch": 0.4294127359788651, + "grad_norm": 0.6904877424240112, + "learning_rate": 8.92620221634389e-06, + "loss": 0.6507, + "step": 7802 + }, + { + "epoch": 0.4294677747812208, + "grad_norm": 0.7205148935317993, + "learning_rate": 8.925933802978898e-06, + "loss": 0.7683, + "step": 7803 + }, + { + "epoch": 0.4295228135835764, + "grad_norm": 0.6830344200134277, + "learning_rate": 8.925665360107639e-06, + "loss": 0.6886, + "step": 7804 + }, + { + "epoch": 0.4295778523859321, + "grad_norm": 0.7648812532424927, + "learning_rate": 8.92539688773213e-06, + "loss": 0.7559, + "step": 7805 + }, + { + "epoch": 0.42963289118828774, + "grad_norm": 0.7819112539291382, + "learning_rate": 8.925128385854389e-06, + "loss": 0.7443, + "step": 7806 + }, + { + "epoch": 0.4296879299906434, + "grad_norm": 0.6742433309555054, + "learning_rate": 8.924859854476433e-06, + "loss": 0.7191, + "step": 7807 + }, + { + "epoch": 0.42974296879299906, + "grad_norm": 0.7368177771568298, + "learning_rate": 8.924591293600281e-06, + "loss": 0.6946, + "step": 7808 + }, + { + "epoch": 0.42979800759535475, + "grad_norm": 0.663112998008728, + "learning_rate": 8.924322703227953e-06, + "loss": 0.7405, + "step": 7809 + }, + { + "epoch": 0.4298530463977104, + "grad_norm": 0.6735410690307617, + "learning_rate": 8.924054083361465e-06, + "loss": 0.7982, + "step": 7810 + }, + { + "epoch": 0.42990808520006607, + "grad_norm": 0.7770369648933411, + "learning_rate": 8.923785434002834e-06, + "loss": 0.9179, + "step": 7811 + }, + { + "epoch": 0.4299631240024217, + "grad_norm": 0.7464482188224792, + "learning_rate": 8.923516755154085e-06, + "loss": 0.8514, + "step": 7812 + }, + { + "epoch": 0.4300181628047774, + "grad_norm": 0.9249551892280579, + "learning_rate": 8.923248046817235e-06, + "loss": 0.8287, + "step": 7813 + }, + { + "epoch": 0.430073201607133, + "grad_norm": 0.7071338891983032, + "learning_rate": 8.922979308994302e-06, + "loss": 0.7509, + "step": 7814 + }, + { + "epoch": 0.4301282404094887, + "grad_norm": 0.6910794377326965, + "learning_rate": 8.922710541687305e-06, + "loss": 0.7373, + "step": 7815 + }, + { + "epoch": 0.43018327921184435, + "grad_norm": 0.8424028158187866, + "learning_rate": 8.922441744898267e-06, + "loss": 0.741, + "step": 7816 + }, + { + "epoch": 0.43023831801420004, + "grad_norm": 0.8162125945091248, + "learning_rate": 8.922172918629208e-06, + "loss": 0.8044, + "step": 7817 + }, + { + "epoch": 0.43029335681655567, + "grad_norm": 0.7415170669555664, + "learning_rate": 8.921904062882145e-06, + "loss": 0.7427, + "step": 7818 + }, + { + "epoch": 0.4303483956189113, + "grad_norm": 1.1357808113098145, + "learning_rate": 8.921635177659103e-06, + "loss": 0.7802, + "step": 7819 + }, + { + "epoch": 0.430403434421267, + "grad_norm": 0.7039839625358582, + "learning_rate": 8.9213662629621e-06, + "loss": 0.7368, + "step": 7820 + }, + { + "epoch": 0.43045847322362263, + "grad_norm": 0.721077024936676, + "learning_rate": 8.921097318793157e-06, + "loss": 0.6575, + "step": 7821 + }, + { + "epoch": 0.4305135120259783, + "grad_norm": 0.7823510766029358, + "learning_rate": 8.920828345154297e-06, + "loss": 0.7499, + "step": 7822 + }, + { + "epoch": 0.43056855082833395, + "grad_norm": 0.6400569677352905, + "learning_rate": 8.920559342047539e-06, + "loss": 0.7091, + "step": 7823 + }, + { + "epoch": 0.43062358963068964, + "grad_norm": 0.8974951505661011, + "learning_rate": 8.920290309474908e-06, + "loss": 0.7228, + "step": 7824 + }, + { + "epoch": 0.4306786284330453, + "grad_norm": 0.8176010847091675, + "learning_rate": 8.920021247438426e-06, + "loss": 0.8852, + "step": 7825 + }, + { + "epoch": 0.43073366723540096, + "grad_norm": 0.7591422200202942, + "learning_rate": 8.919752155940112e-06, + "loss": 0.8382, + "step": 7826 + }, + { + "epoch": 0.4307887060377566, + "grad_norm": 0.7089776396751404, + "learning_rate": 8.919483034981988e-06, + "loss": 0.7188, + "step": 7827 + }, + { + "epoch": 0.4308437448401123, + "grad_norm": 0.7328840494155884, + "learning_rate": 8.919213884566081e-06, + "loss": 0.7609, + "step": 7828 + }, + { + "epoch": 0.4308987836424679, + "grad_norm": 0.6473509669303894, + "learning_rate": 8.918944704694411e-06, + "loss": 0.7027, + "step": 7829 + }, + { + "epoch": 0.4309538224448236, + "grad_norm": 0.6585624814033508, + "learning_rate": 8.918675495369003e-06, + "loss": 0.7133, + "step": 7830 + }, + { + "epoch": 0.43100886124717924, + "grad_norm": 0.7232397794723511, + "learning_rate": 8.918406256591876e-06, + "loss": 0.7458, + "step": 7831 + }, + { + "epoch": 0.43106390004953493, + "grad_norm": 0.8752645254135132, + "learning_rate": 8.918136988365059e-06, + "loss": 0.671, + "step": 7832 + }, + { + "epoch": 0.43111893885189057, + "grad_norm": 0.7890885472297668, + "learning_rate": 8.917867690690573e-06, + "loss": 0.7674, + "step": 7833 + }, + { + "epoch": 0.43117397765424625, + "grad_norm": 0.6725128293037415, + "learning_rate": 8.917598363570441e-06, + "loss": 0.7373, + "step": 7834 + }, + { + "epoch": 0.4312290164566019, + "grad_norm": 0.808897852897644, + "learning_rate": 8.917329007006688e-06, + "loss": 0.8397, + "step": 7835 + }, + { + "epoch": 0.4312840552589576, + "grad_norm": 0.7268605828285217, + "learning_rate": 8.91705962100134e-06, + "loss": 0.7957, + "step": 7836 + }, + { + "epoch": 0.4313390940613132, + "grad_norm": 0.7336069345474243, + "learning_rate": 8.916790205556421e-06, + "loss": 0.746, + "step": 7837 + }, + { + "epoch": 0.4313941328636689, + "grad_norm": 0.7380902171134949, + "learning_rate": 8.916520760673955e-06, + "loss": 0.674, + "step": 7838 + }, + { + "epoch": 0.43144917166602453, + "grad_norm": 0.8041831851005554, + "learning_rate": 8.916251286355967e-06, + "loss": 0.8392, + "step": 7839 + }, + { + "epoch": 0.4315042104683802, + "grad_norm": 0.6745681166648865, + "learning_rate": 8.915981782604481e-06, + "loss": 0.7676, + "step": 7840 + }, + { + "epoch": 0.43155924927073586, + "grad_norm": 0.6572039127349854, + "learning_rate": 8.915712249421526e-06, + "loss": 0.7471, + "step": 7841 + }, + { + "epoch": 0.43161428807309155, + "grad_norm": 0.7250062227249146, + "learning_rate": 8.915442686809124e-06, + "loss": 0.8566, + "step": 7842 + }, + { + "epoch": 0.4316693268754472, + "grad_norm": 0.7008941769599915, + "learning_rate": 8.915173094769306e-06, + "loss": 0.7876, + "step": 7843 + }, + { + "epoch": 0.43172436567780287, + "grad_norm": 0.7078337073326111, + "learning_rate": 8.914903473304093e-06, + "loss": 0.756, + "step": 7844 + }, + { + "epoch": 0.4317794044801585, + "grad_norm": 0.7822949886322021, + "learning_rate": 8.914633822415513e-06, + "loss": 0.9423, + "step": 7845 + }, + { + "epoch": 0.4318344432825142, + "grad_norm": 0.6707580089569092, + "learning_rate": 8.914364142105593e-06, + "loss": 0.639, + "step": 7846 + }, + { + "epoch": 0.4318894820848698, + "grad_norm": 0.7868423461914062, + "learning_rate": 8.914094432376362e-06, + "loss": 0.7768, + "step": 7847 + }, + { + "epoch": 0.4319445208872255, + "grad_norm": 0.6147592067718506, + "learning_rate": 8.913824693229845e-06, + "loss": 0.6693, + "step": 7848 + }, + { + "epoch": 0.43199955968958115, + "grad_norm": 0.6901249885559082, + "learning_rate": 8.913554924668067e-06, + "loss": 0.7779, + "step": 7849 + }, + { + "epoch": 0.43205459849193684, + "grad_norm": 0.7062137126922607, + "learning_rate": 8.913285126693058e-06, + "loss": 0.7951, + "step": 7850 + }, + { + "epoch": 0.43210963729429247, + "grad_norm": 0.6363390684127808, + "learning_rate": 8.913015299306846e-06, + "loss": 0.6723, + "step": 7851 + }, + { + "epoch": 0.43216467609664816, + "grad_norm": 0.7168677449226379, + "learning_rate": 8.912745442511459e-06, + "loss": 0.7442, + "step": 7852 + }, + { + "epoch": 0.4322197148990038, + "grad_norm": 0.7347995042800903, + "learning_rate": 8.912475556308925e-06, + "loss": 0.8361, + "step": 7853 + }, + { + "epoch": 0.4322747537013595, + "grad_norm": 0.683777391910553, + "learning_rate": 8.91220564070127e-06, + "loss": 0.7583, + "step": 7854 + }, + { + "epoch": 0.4323297925037151, + "grad_norm": 0.7436330914497375, + "learning_rate": 8.911935695690527e-06, + "loss": 0.8414, + "step": 7855 + }, + { + "epoch": 0.4323848313060708, + "grad_norm": 0.7748109102249146, + "learning_rate": 8.911665721278721e-06, + "loss": 0.7812, + "step": 7856 + }, + { + "epoch": 0.43243987010842644, + "grad_norm": 0.7984411120414734, + "learning_rate": 8.911395717467883e-06, + "loss": 0.6845, + "step": 7857 + }, + { + "epoch": 0.4324949089107821, + "grad_norm": 0.680144727230072, + "learning_rate": 8.911125684260042e-06, + "loss": 0.7156, + "step": 7858 + }, + { + "epoch": 0.43254994771313776, + "grad_norm": 0.7738325595855713, + "learning_rate": 8.910855621657228e-06, + "loss": 0.7295, + "step": 7859 + }, + { + "epoch": 0.43260498651549345, + "grad_norm": 0.7276971340179443, + "learning_rate": 8.910585529661469e-06, + "loss": 0.7982, + "step": 7860 + }, + { + "epoch": 0.4326600253178491, + "grad_norm": 0.7655037641525269, + "learning_rate": 8.910315408274796e-06, + "loss": 0.8416, + "step": 7861 + }, + { + "epoch": 0.4327150641202047, + "grad_norm": 0.7220892906188965, + "learning_rate": 8.910045257499238e-06, + "loss": 0.8002, + "step": 7862 + }, + { + "epoch": 0.4327701029225604, + "grad_norm": 0.6255655884742737, + "learning_rate": 8.90977507733683e-06, + "loss": 0.6477, + "step": 7863 + }, + { + "epoch": 0.43282514172491604, + "grad_norm": 0.649472713470459, + "learning_rate": 8.909504867789594e-06, + "loss": 0.6838, + "step": 7864 + }, + { + "epoch": 0.43288018052727173, + "grad_norm": 0.6915234923362732, + "learning_rate": 8.909234628859568e-06, + "loss": 0.7146, + "step": 7865 + }, + { + "epoch": 0.43293521932962736, + "grad_norm": 0.7120145559310913, + "learning_rate": 8.908964360548783e-06, + "loss": 0.7782, + "step": 7866 + }, + { + "epoch": 0.43299025813198305, + "grad_norm": 0.8125410079956055, + "learning_rate": 8.908694062859267e-06, + "loss": 0.7514, + "step": 7867 + }, + { + "epoch": 0.4330452969343387, + "grad_norm": 0.6821436882019043, + "learning_rate": 8.908423735793053e-06, + "loss": 0.8074, + "step": 7868 + }, + { + "epoch": 0.4331003357366944, + "grad_norm": 0.8079590201377869, + "learning_rate": 8.908153379352171e-06, + "loss": 0.7932, + "step": 7869 + }, + { + "epoch": 0.43315537453905, + "grad_norm": 0.676013708114624, + "learning_rate": 8.907882993538655e-06, + "loss": 0.6611, + "step": 7870 + }, + { + "epoch": 0.4332104133414057, + "grad_norm": 0.706624448299408, + "learning_rate": 8.907612578354537e-06, + "loss": 0.8241, + "step": 7871 + }, + { + "epoch": 0.43326545214376133, + "grad_norm": 0.6533300876617432, + "learning_rate": 8.907342133801848e-06, + "loss": 0.6969, + "step": 7872 + }, + { + "epoch": 0.433320490946117, + "grad_norm": 0.6778282523155212, + "learning_rate": 8.907071659882622e-06, + "loss": 0.6877, + "step": 7873 + }, + { + "epoch": 0.43337552974847265, + "grad_norm": 0.7068879008293152, + "learning_rate": 8.906801156598892e-06, + "loss": 0.7912, + "step": 7874 + }, + { + "epoch": 0.43343056855082834, + "grad_norm": 0.6620263457298279, + "learning_rate": 8.90653062395269e-06, + "loss": 0.7317, + "step": 7875 + }, + { + "epoch": 0.433485607353184, + "grad_norm": 0.7084807753562927, + "learning_rate": 8.906260061946049e-06, + "loss": 0.7268, + "step": 7876 + }, + { + "epoch": 0.43354064615553967, + "grad_norm": 0.7899147272109985, + "learning_rate": 8.905989470581003e-06, + "loss": 0.8258, + "step": 7877 + }, + { + "epoch": 0.4335956849578953, + "grad_norm": 0.6657128930091858, + "learning_rate": 8.905718849859585e-06, + "loss": 0.6564, + "step": 7878 + }, + { + "epoch": 0.433650723760251, + "grad_norm": 0.8737723231315613, + "learning_rate": 8.905448199783831e-06, + "loss": 0.8646, + "step": 7879 + }, + { + "epoch": 0.4337057625626066, + "grad_norm": 0.7517673969268799, + "learning_rate": 8.905177520355775e-06, + "loss": 0.7658, + "step": 7880 + }, + { + "epoch": 0.4337608013649623, + "grad_norm": 0.6724270582199097, + "learning_rate": 8.904906811577447e-06, + "loss": 0.7509, + "step": 7881 + }, + { + "epoch": 0.43381584016731795, + "grad_norm": 0.6490511894226074, + "learning_rate": 8.904636073450885e-06, + "loss": 0.7282, + "step": 7882 + }, + { + "epoch": 0.43387087896967363, + "grad_norm": 0.73885178565979, + "learning_rate": 8.904365305978126e-06, + "loss": 0.7575, + "step": 7883 + }, + { + "epoch": 0.43392591777202927, + "grad_norm": 0.6823462843894958, + "learning_rate": 8.9040945091612e-06, + "loss": 0.7566, + "step": 7884 + }, + { + "epoch": 0.43398095657438496, + "grad_norm": 0.6705971956253052, + "learning_rate": 8.903823683002146e-06, + "loss": 0.7726, + "step": 7885 + }, + { + "epoch": 0.4340359953767406, + "grad_norm": 0.6898428201675415, + "learning_rate": 8.903552827502998e-06, + "loss": 0.7545, + "step": 7886 + }, + { + "epoch": 0.4340910341790963, + "grad_norm": 0.810357928276062, + "learning_rate": 8.90328194266579e-06, + "loss": 0.8883, + "step": 7887 + }, + { + "epoch": 0.4341460729814519, + "grad_norm": 0.6505162119865417, + "learning_rate": 8.903011028492563e-06, + "loss": 0.7205, + "step": 7888 + }, + { + "epoch": 0.4342011117838076, + "grad_norm": 0.8401693105697632, + "learning_rate": 8.902740084985348e-06, + "loss": 0.8105, + "step": 7889 + }, + { + "epoch": 0.43425615058616324, + "grad_norm": 0.7151880860328674, + "learning_rate": 8.902469112146183e-06, + "loss": 0.7748, + "step": 7890 + }, + { + "epoch": 0.4343111893885189, + "grad_norm": 0.7257007956504822, + "learning_rate": 8.902198109977107e-06, + "loss": 0.7818, + "step": 7891 + }, + { + "epoch": 0.43436622819087456, + "grad_norm": 0.786691427230835, + "learning_rate": 8.901927078480153e-06, + "loss": 0.8527, + "step": 7892 + }, + { + "epoch": 0.43442126699323025, + "grad_norm": 0.7420910596847534, + "learning_rate": 8.901656017657358e-06, + "loss": 0.7087, + "step": 7893 + }, + { + "epoch": 0.4344763057955859, + "grad_norm": 0.6713958978652954, + "learning_rate": 8.901384927510763e-06, + "loss": 0.7366, + "step": 7894 + }, + { + "epoch": 0.43453134459794157, + "grad_norm": 1.0276658535003662, + "learning_rate": 8.901113808042402e-06, + "loss": 0.7462, + "step": 7895 + }, + { + "epoch": 0.4345863834002972, + "grad_norm": 0.7207444906234741, + "learning_rate": 8.900842659254314e-06, + "loss": 0.6777, + "step": 7896 + }, + { + "epoch": 0.4346414222026529, + "grad_norm": 0.7581979036331177, + "learning_rate": 8.900571481148538e-06, + "loss": 0.8081, + "step": 7897 + }, + { + "epoch": 0.4346964610050085, + "grad_norm": 0.9224075675010681, + "learning_rate": 8.90030027372711e-06, + "loss": 0.892, + "step": 7898 + }, + { + "epoch": 0.4347514998073642, + "grad_norm": 0.6844260096549988, + "learning_rate": 8.900029036992069e-06, + "loss": 0.8063, + "step": 7899 + }, + { + "epoch": 0.43480653860971985, + "grad_norm": 0.7008691430091858, + "learning_rate": 8.899757770945453e-06, + "loss": 0.6998, + "step": 7900 + }, + { + "epoch": 0.43486157741207554, + "grad_norm": 0.7311949729919434, + "learning_rate": 8.899486475589303e-06, + "loss": 0.7724, + "step": 7901 + }, + { + "epoch": 0.4349166162144312, + "grad_norm": 0.7441468238830566, + "learning_rate": 8.899215150925656e-06, + "loss": 0.7728, + "step": 7902 + }, + { + "epoch": 0.43497165501678686, + "grad_norm": 0.7405179142951965, + "learning_rate": 8.89894379695655e-06, + "loss": 0.8267, + "step": 7903 + }, + { + "epoch": 0.4350266938191425, + "grad_norm": 0.6967620253562927, + "learning_rate": 8.898672413684029e-06, + "loss": 0.7284, + "step": 7904 + }, + { + "epoch": 0.43508173262149813, + "grad_norm": 0.8979219794273376, + "learning_rate": 8.898401001110127e-06, + "loss": 0.8267, + "step": 7905 + }, + { + "epoch": 0.4351367714238538, + "grad_norm": 0.7905356884002686, + "learning_rate": 8.898129559236888e-06, + "loss": 0.8011, + "step": 7906 + }, + { + "epoch": 0.43519181022620945, + "grad_norm": 0.6740859150886536, + "learning_rate": 8.897858088066351e-06, + "loss": 0.6597, + "step": 7907 + }, + { + "epoch": 0.43524684902856514, + "grad_norm": 0.7451572418212891, + "learning_rate": 8.897586587600555e-06, + "loss": 0.7466, + "step": 7908 + }, + { + "epoch": 0.4353018878309208, + "grad_norm": 0.7726565003395081, + "learning_rate": 8.897315057841542e-06, + "loss": 0.7873, + "step": 7909 + }, + { + "epoch": 0.43535692663327646, + "grad_norm": 0.8348171710968018, + "learning_rate": 8.897043498791354e-06, + "loss": 0.7583, + "step": 7910 + }, + { + "epoch": 0.4354119654356321, + "grad_norm": 0.6714087724685669, + "learning_rate": 8.896771910452027e-06, + "loss": 0.7909, + "step": 7911 + }, + { + "epoch": 0.4354670042379878, + "grad_norm": 0.7397969365119934, + "learning_rate": 8.896500292825607e-06, + "loss": 0.7734, + "step": 7912 + }, + { + "epoch": 0.4355220430403434, + "grad_norm": 0.6806391477584839, + "learning_rate": 8.896228645914133e-06, + "loss": 0.7898, + "step": 7913 + }, + { + "epoch": 0.4355770818426991, + "grad_norm": 0.7135224342346191, + "learning_rate": 8.89595696971965e-06, + "loss": 0.7453, + "step": 7914 + }, + { + "epoch": 0.43563212064505474, + "grad_norm": 0.8275992274284363, + "learning_rate": 8.895685264244195e-06, + "loss": 0.7326, + "step": 7915 + }, + { + "epoch": 0.43568715944741043, + "grad_norm": 0.7254159450531006, + "learning_rate": 8.895413529489813e-06, + "loss": 0.7523, + "step": 7916 + }, + { + "epoch": 0.43574219824976607, + "grad_norm": 0.8060647249221802, + "learning_rate": 8.895141765458546e-06, + "loss": 0.7878, + "step": 7917 + }, + { + "epoch": 0.43579723705212176, + "grad_norm": 0.7007316946983337, + "learning_rate": 8.894869972152435e-06, + "loss": 0.7837, + "step": 7918 + }, + { + "epoch": 0.4358522758544774, + "grad_norm": 0.6874841451644897, + "learning_rate": 8.894598149573524e-06, + "loss": 0.7773, + "step": 7919 + }, + { + "epoch": 0.4359073146568331, + "grad_norm": 0.7557696104049683, + "learning_rate": 8.894326297723856e-06, + "loss": 0.6905, + "step": 7920 + }, + { + "epoch": 0.4359623534591887, + "grad_norm": 0.7589512467384338, + "learning_rate": 8.894054416605475e-06, + "loss": 0.8292, + "step": 7921 + }, + { + "epoch": 0.4360173922615444, + "grad_norm": 0.9062818884849548, + "learning_rate": 8.893782506220424e-06, + "loss": 0.9149, + "step": 7922 + }, + { + "epoch": 0.43607243106390003, + "grad_norm": 0.7553420662879944, + "learning_rate": 8.893510566570744e-06, + "loss": 0.7256, + "step": 7923 + }, + { + "epoch": 0.4361274698662557, + "grad_norm": 0.7130489349365234, + "learning_rate": 8.89323859765848e-06, + "loss": 0.7375, + "step": 7924 + }, + { + "epoch": 0.43618250866861136, + "grad_norm": 0.6234793066978455, + "learning_rate": 8.89296659948568e-06, + "loss": 0.716, + "step": 7925 + }, + { + "epoch": 0.43623754747096705, + "grad_norm": 0.7527539134025574, + "learning_rate": 8.892694572054383e-06, + "loss": 0.7884, + "step": 7926 + }, + { + "epoch": 0.4362925862733227, + "grad_norm": 0.7677647471427917, + "learning_rate": 8.892422515366636e-06, + "loss": 0.7136, + "step": 7927 + }, + { + "epoch": 0.43634762507567837, + "grad_norm": 0.7212143540382385, + "learning_rate": 8.892150429424484e-06, + "loss": 0.8113, + "step": 7928 + }, + { + "epoch": 0.436402663878034, + "grad_norm": 0.6735568046569824, + "learning_rate": 8.89187831422997e-06, + "loss": 0.6472, + "step": 7929 + }, + { + "epoch": 0.4364577026803897, + "grad_norm": 0.7120702862739563, + "learning_rate": 8.891606169785141e-06, + "loss": 0.8032, + "step": 7930 + }, + { + "epoch": 0.4365127414827453, + "grad_norm": 0.679499089717865, + "learning_rate": 8.891333996092041e-06, + "loss": 0.7366, + "step": 7931 + }, + { + "epoch": 0.436567780285101, + "grad_norm": 0.7774114012718201, + "learning_rate": 8.891061793152718e-06, + "loss": 0.7917, + "step": 7932 + }, + { + "epoch": 0.43662281908745665, + "grad_norm": 0.6951174139976501, + "learning_rate": 8.890789560969216e-06, + "loss": 0.7518, + "step": 7933 + }, + { + "epoch": 0.43667785788981234, + "grad_norm": 0.7645227909088135, + "learning_rate": 8.89051729954358e-06, + "loss": 0.7787, + "step": 7934 + }, + { + "epoch": 0.43673289669216797, + "grad_norm": 0.7127084732055664, + "learning_rate": 8.890245008877857e-06, + "loss": 0.8137, + "step": 7935 + }, + { + "epoch": 0.43678793549452366, + "grad_norm": 0.7541413903236389, + "learning_rate": 8.889972688974095e-06, + "loss": 0.776, + "step": 7936 + }, + { + "epoch": 0.4368429742968793, + "grad_norm": 0.690963625907898, + "learning_rate": 8.889700339834339e-06, + "loss": 0.7691, + "step": 7937 + }, + { + "epoch": 0.436898013099235, + "grad_norm": 0.750221848487854, + "learning_rate": 8.889427961460636e-06, + "loss": 0.7831, + "step": 7938 + }, + { + "epoch": 0.4369530519015906, + "grad_norm": 0.7255545854568481, + "learning_rate": 8.889155553855035e-06, + "loss": 0.7831, + "step": 7939 + }, + { + "epoch": 0.4370080907039463, + "grad_norm": 0.7187026143074036, + "learning_rate": 8.88888311701958e-06, + "loss": 0.792, + "step": 7940 + }, + { + "epoch": 0.43706312950630194, + "grad_norm": 0.8313350081443787, + "learning_rate": 8.888610650956322e-06, + "loss": 0.706, + "step": 7941 + }, + { + "epoch": 0.43711816830865763, + "grad_norm": 0.8083454370498657, + "learning_rate": 8.888338155667307e-06, + "loss": 0.7857, + "step": 7942 + }, + { + "epoch": 0.43717320711101326, + "grad_norm": 0.8200840353965759, + "learning_rate": 8.888065631154583e-06, + "loss": 0.8601, + "step": 7943 + }, + { + "epoch": 0.43722824591336895, + "grad_norm": 0.7503816485404968, + "learning_rate": 8.887793077420198e-06, + "loss": 0.7744, + "step": 7944 + }, + { + "epoch": 0.4372832847157246, + "grad_norm": 0.7466493248939514, + "learning_rate": 8.887520494466202e-06, + "loss": 0.7818, + "step": 7945 + }, + { + "epoch": 0.4373383235180803, + "grad_norm": 0.728118360042572, + "learning_rate": 8.887247882294641e-06, + "loss": 0.7157, + "step": 7946 + }, + { + "epoch": 0.4373933623204359, + "grad_norm": 0.9199670553207397, + "learning_rate": 8.886975240907568e-06, + "loss": 0.8283, + "step": 7947 + }, + { + "epoch": 0.43744840112279154, + "grad_norm": 0.735584557056427, + "learning_rate": 8.886702570307027e-06, + "loss": 0.6588, + "step": 7948 + }, + { + "epoch": 0.43750343992514723, + "grad_norm": 0.8619036674499512, + "learning_rate": 8.886429870495072e-06, + "loss": 0.7269, + "step": 7949 + }, + { + "epoch": 0.43755847872750286, + "grad_norm": 0.7304830551147461, + "learning_rate": 8.886157141473747e-06, + "loss": 0.6725, + "step": 7950 + }, + { + "epoch": 0.43761351752985855, + "grad_norm": 0.7669086456298828, + "learning_rate": 8.885884383245109e-06, + "loss": 0.6957, + "step": 7951 + }, + { + "epoch": 0.4376685563322142, + "grad_norm": 0.7558299899101257, + "learning_rate": 8.885611595811203e-06, + "loss": 0.8159, + "step": 7952 + }, + { + "epoch": 0.4377235951345699, + "grad_norm": 0.7661786079406738, + "learning_rate": 8.88533877917408e-06, + "loss": 0.764, + "step": 7953 + }, + { + "epoch": 0.4377786339369255, + "grad_norm": 0.7461101412773132, + "learning_rate": 8.88506593333579e-06, + "loss": 0.7544, + "step": 7954 + }, + { + "epoch": 0.4378336727392812, + "grad_norm": 0.7989180088043213, + "learning_rate": 8.884793058298387e-06, + "loss": 0.6913, + "step": 7955 + }, + { + "epoch": 0.43788871154163683, + "grad_norm": 0.7964022755622864, + "learning_rate": 8.884520154063917e-06, + "loss": 0.7339, + "step": 7956 + }, + { + "epoch": 0.4379437503439925, + "grad_norm": 0.7278034687042236, + "learning_rate": 8.884247220634433e-06, + "loss": 0.8477, + "step": 7957 + }, + { + "epoch": 0.43799878914634816, + "grad_norm": 0.7294753789901733, + "learning_rate": 8.883974258011988e-06, + "loss": 0.8412, + "step": 7958 + }, + { + "epoch": 0.43805382794870384, + "grad_norm": 0.665734589099884, + "learning_rate": 8.88370126619863e-06, + "loss": 0.7838, + "step": 7959 + }, + { + "epoch": 0.4381088667510595, + "grad_norm": 0.6984216570854187, + "learning_rate": 8.883428245196414e-06, + "loss": 0.7657, + "step": 7960 + }, + { + "epoch": 0.43816390555341517, + "grad_norm": 0.8048402070999146, + "learning_rate": 8.883155195007393e-06, + "loss": 0.7553, + "step": 7961 + }, + { + "epoch": 0.4382189443557708, + "grad_norm": 0.7145794630050659, + "learning_rate": 8.882882115633616e-06, + "loss": 0.6583, + "step": 7962 + }, + { + "epoch": 0.4382739831581265, + "grad_norm": 0.7073546648025513, + "learning_rate": 8.882609007077135e-06, + "loss": 0.7869, + "step": 7963 + }, + { + "epoch": 0.4383290219604821, + "grad_norm": 0.8300859928131104, + "learning_rate": 8.882335869340004e-06, + "loss": 0.773, + "step": 7964 + }, + { + "epoch": 0.4383840607628378, + "grad_norm": 0.8343188762664795, + "learning_rate": 8.882062702424276e-06, + "loss": 0.6743, + "step": 7965 + }, + { + "epoch": 0.43843909956519345, + "grad_norm": 0.7106530666351318, + "learning_rate": 8.881789506332007e-06, + "loss": 0.7414, + "step": 7966 + }, + { + "epoch": 0.43849413836754914, + "grad_norm": 0.7015630602836609, + "learning_rate": 8.881516281065244e-06, + "loss": 0.7434, + "step": 7967 + }, + { + "epoch": 0.43854917716990477, + "grad_norm": 0.8106673955917358, + "learning_rate": 8.881243026626044e-06, + "loss": 0.7741, + "step": 7968 + }, + { + "epoch": 0.43860421597226046, + "grad_norm": 0.8181495070457458, + "learning_rate": 8.88096974301646e-06, + "loss": 0.8046, + "step": 7969 + }, + { + "epoch": 0.4386592547746161, + "grad_norm": 0.7767857313156128, + "learning_rate": 8.880696430238546e-06, + "loss": 0.8586, + "step": 7970 + }, + { + "epoch": 0.4387142935769718, + "grad_norm": 0.7257522940635681, + "learning_rate": 8.880423088294359e-06, + "loss": 0.7799, + "step": 7971 + }, + { + "epoch": 0.4387693323793274, + "grad_norm": 0.6896021366119385, + "learning_rate": 8.880149717185948e-06, + "loss": 0.8178, + "step": 7972 + }, + { + "epoch": 0.4388243711816831, + "grad_norm": 0.7646406292915344, + "learning_rate": 8.879876316915372e-06, + "loss": 0.8754, + "step": 7973 + }, + { + "epoch": 0.43887940998403874, + "grad_norm": 0.8043848872184753, + "learning_rate": 8.879602887484684e-06, + "loss": 0.8562, + "step": 7974 + }, + { + "epoch": 0.4389344487863944, + "grad_norm": 0.6727305054664612, + "learning_rate": 8.879329428895937e-06, + "loss": 0.6168, + "step": 7975 + }, + { + "epoch": 0.43898948758875006, + "grad_norm": 0.7634731531143188, + "learning_rate": 8.87905594115119e-06, + "loss": 0.857, + "step": 7976 + }, + { + "epoch": 0.43904452639110575, + "grad_norm": 0.6544492244720459, + "learning_rate": 8.878782424252497e-06, + "loss": 0.6302, + "step": 7977 + }, + { + "epoch": 0.4390995651934614, + "grad_norm": 0.8126636743545532, + "learning_rate": 8.878508878201915e-06, + "loss": 0.7823, + "step": 7978 + }, + { + "epoch": 0.43915460399581707, + "grad_norm": 0.7235779166221619, + "learning_rate": 8.878235303001497e-06, + "loss": 0.7527, + "step": 7979 + }, + { + "epoch": 0.4392096427981727, + "grad_norm": 0.6961055397987366, + "learning_rate": 8.8779616986533e-06, + "loss": 0.7383, + "step": 7980 + }, + { + "epoch": 0.4392646816005284, + "grad_norm": 0.7684490084648132, + "learning_rate": 8.877688065159382e-06, + "loss": 0.8009, + "step": 7981 + }, + { + "epoch": 0.43931972040288403, + "grad_norm": 0.7897803783416748, + "learning_rate": 8.877414402521797e-06, + "loss": 0.7561, + "step": 7982 + }, + { + "epoch": 0.4393747592052397, + "grad_norm": 0.7877688407897949, + "learning_rate": 8.877140710742606e-06, + "loss": 0.7949, + "step": 7983 + }, + { + "epoch": 0.43942979800759535, + "grad_norm": 0.8341611623764038, + "learning_rate": 8.876866989823862e-06, + "loss": 0.7585, + "step": 7984 + }, + { + "epoch": 0.43948483680995104, + "grad_norm": 0.7663636207580566, + "learning_rate": 8.876593239767622e-06, + "loss": 0.771, + "step": 7985 + }, + { + "epoch": 0.4395398756123067, + "grad_norm": 0.6824129223823547, + "learning_rate": 8.876319460575946e-06, + "loss": 0.7852, + "step": 7986 + }, + { + "epoch": 0.43959491441466236, + "grad_norm": 0.6533854007720947, + "learning_rate": 8.876045652250891e-06, + "loss": 0.723, + "step": 7987 + }, + { + "epoch": 0.439649953217018, + "grad_norm": 0.7174259424209595, + "learning_rate": 8.875771814794515e-06, + "loss": 0.749, + "step": 7988 + }, + { + "epoch": 0.4397049920193737, + "grad_norm": 0.8585928678512573, + "learning_rate": 8.875497948208875e-06, + "loss": 0.6727, + "step": 7989 + }, + { + "epoch": 0.4397600308217293, + "grad_norm": 0.7558062672615051, + "learning_rate": 8.875224052496029e-06, + "loss": 0.7929, + "step": 7990 + }, + { + "epoch": 0.43981506962408495, + "grad_norm": 0.7063853144645691, + "learning_rate": 8.874950127658037e-06, + "loss": 0.7397, + "step": 7991 + }, + { + "epoch": 0.43987010842644064, + "grad_norm": 0.7165526747703552, + "learning_rate": 8.874676173696956e-06, + "loss": 0.7678, + "step": 7992 + }, + { + "epoch": 0.4399251472287963, + "grad_norm": 0.7657830715179443, + "learning_rate": 8.874402190614847e-06, + "loss": 0.8318, + "step": 7993 + }, + { + "epoch": 0.43998018603115197, + "grad_norm": 0.7776834964752197, + "learning_rate": 8.874128178413769e-06, + "loss": 0.8589, + "step": 7994 + }, + { + "epoch": 0.4400352248335076, + "grad_norm": 0.6805633306503296, + "learning_rate": 8.873854137095778e-06, + "loss": 0.7009, + "step": 7995 + }, + { + "epoch": 0.4400902636358633, + "grad_norm": 0.6962490677833557, + "learning_rate": 8.87358006666294e-06, + "loss": 0.7896, + "step": 7996 + }, + { + "epoch": 0.4401453024382189, + "grad_norm": 0.611610472202301, + "learning_rate": 8.873305967117307e-06, + "loss": 0.5993, + "step": 7997 + }, + { + "epoch": 0.4402003412405746, + "grad_norm": 0.7442964911460876, + "learning_rate": 8.873031838460946e-06, + "loss": 0.8277, + "step": 7998 + }, + { + "epoch": 0.44025538004293024, + "grad_norm": 0.6858734488487244, + "learning_rate": 8.872757680695914e-06, + "loss": 0.8064, + "step": 7999 + }, + { + "epoch": 0.44031041884528593, + "grad_norm": 0.6654849052429199, + "learning_rate": 8.872483493824273e-06, + "loss": 0.7408, + "step": 8000 + }, + { + "epoch": 0.44036545764764157, + "grad_norm": 0.8241575956344604, + "learning_rate": 8.87220927784808e-06, + "loss": 0.8819, + "step": 8001 + }, + { + "epoch": 0.44042049644999726, + "grad_norm": 0.7078573107719421, + "learning_rate": 8.8719350327694e-06, + "loss": 0.7709, + "step": 8002 + }, + { + "epoch": 0.4404755352523529, + "grad_norm": 0.7369210720062256, + "learning_rate": 8.871660758590292e-06, + "loss": 0.7867, + "step": 8003 + }, + { + "epoch": 0.4405305740547086, + "grad_norm": 0.7206673622131348, + "learning_rate": 8.87138645531282e-06, + "loss": 0.8697, + "step": 8004 + }, + { + "epoch": 0.4405856128570642, + "grad_norm": 0.8370183706283569, + "learning_rate": 8.871112122939041e-06, + "loss": 0.7201, + "step": 8005 + }, + { + "epoch": 0.4406406516594199, + "grad_norm": 0.8015196323394775, + "learning_rate": 8.870837761471023e-06, + "loss": 0.774, + "step": 8006 + }, + { + "epoch": 0.44069569046177554, + "grad_norm": 0.730185329914093, + "learning_rate": 8.870563370910821e-06, + "loss": 0.7153, + "step": 8007 + }, + { + "epoch": 0.4407507292641312, + "grad_norm": 0.6719930768013, + "learning_rate": 8.870288951260503e-06, + "loss": 0.7949, + "step": 8008 + }, + { + "epoch": 0.44080576806648686, + "grad_norm": 0.7614291906356812, + "learning_rate": 8.870014502522128e-06, + "loss": 0.7143, + "step": 8009 + }, + { + "epoch": 0.44086080686884255, + "grad_norm": 0.7438056468963623, + "learning_rate": 8.86974002469776e-06, + "loss": 0.6859, + "step": 8010 + }, + { + "epoch": 0.4409158456711982, + "grad_norm": 0.759903073310852, + "learning_rate": 8.869465517789463e-06, + "loss": 0.8095, + "step": 8011 + }, + { + "epoch": 0.44097088447355387, + "grad_norm": 0.7622823119163513, + "learning_rate": 8.869190981799298e-06, + "loss": 0.786, + "step": 8012 + }, + { + "epoch": 0.4410259232759095, + "grad_norm": 0.677003800868988, + "learning_rate": 8.86891641672933e-06, + "loss": 0.7074, + "step": 8013 + }, + { + "epoch": 0.4410809620782652, + "grad_norm": 0.9258451461791992, + "learning_rate": 8.86864182258162e-06, + "loss": 0.7218, + "step": 8014 + }, + { + "epoch": 0.4411360008806208, + "grad_norm": 0.7027828693389893, + "learning_rate": 8.868367199358236e-06, + "loss": 0.7654, + "step": 8015 + }, + { + "epoch": 0.4411910396829765, + "grad_norm": 0.8279967308044434, + "learning_rate": 8.868092547061239e-06, + "loss": 0.8969, + "step": 8016 + }, + { + "epoch": 0.44124607848533215, + "grad_norm": 0.7366079688072205, + "learning_rate": 8.867817865692693e-06, + "loss": 0.8421, + "step": 8017 + }, + { + "epoch": 0.44130111728768784, + "grad_norm": 0.7548787593841553, + "learning_rate": 8.867543155254665e-06, + "loss": 0.79, + "step": 8018 + }, + { + "epoch": 0.44135615609004347, + "grad_norm": 0.7558487057685852, + "learning_rate": 8.867268415749215e-06, + "loss": 0.8461, + "step": 8019 + }, + { + "epoch": 0.44141119489239916, + "grad_norm": 0.6413403153419495, + "learning_rate": 8.866993647178413e-06, + "loss": 0.6811, + "step": 8020 + }, + { + "epoch": 0.4414662336947548, + "grad_norm": 0.9251089692115784, + "learning_rate": 8.86671884954432e-06, + "loss": 0.868, + "step": 8021 + }, + { + "epoch": 0.4415212724971105, + "grad_norm": 0.7920099496841431, + "learning_rate": 8.866444022849006e-06, + "loss": 0.8131, + "step": 8022 + }, + { + "epoch": 0.4415763112994661, + "grad_norm": 0.8738380670547485, + "learning_rate": 8.866169167094532e-06, + "loss": 0.857, + "step": 8023 + }, + { + "epoch": 0.4416313501018218, + "grad_norm": 0.7181336283683777, + "learning_rate": 8.865894282282965e-06, + "loss": 0.7869, + "step": 8024 + }, + { + "epoch": 0.44168638890417744, + "grad_norm": 0.8003776669502258, + "learning_rate": 8.865619368416373e-06, + "loss": 0.8874, + "step": 8025 + }, + { + "epoch": 0.44174142770653313, + "grad_norm": 0.7186623215675354, + "learning_rate": 8.86534442549682e-06, + "loss": 0.7931, + "step": 8026 + }, + { + "epoch": 0.44179646650888876, + "grad_norm": 0.7006831765174866, + "learning_rate": 8.865069453526371e-06, + "loss": 0.7046, + "step": 8027 + }, + { + "epoch": 0.44185150531124445, + "grad_norm": 0.7394786477088928, + "learning_rate": 8.864794452507097e-06, + "loss": 0.685, + "step": 8028 + }, + { + "epoch": 0.4419065441136001, + "grad_norm": 0.7512097358703613, + "learning_rate": 8.864519422441062e-06, + "loss": 0.8047, + "step": 8029 + }, + { + "epoch": 0.4419615829159558, + "grad_norm": 0.6866902709007263, + "learning_rate": 8.864244363330333e-06, + "loss": 0.7099, + "step": 8030 + }, + { + "epoch": 0.4420166217183114, + "grad_norm": 0.7316723465919495, + "learning_rate": 8.863969275176978e-06, + "loss": 0.7767, + "step": 8031 + }, + { + "epoch": 0.4420716605206671, + "grad_norm": 0.7103593349456787, + "learning_rate": 8.863694157983064e-06, + "loss": 0.7832, + "step": 8032 + }, + { + "epoch": 0.44212669932302273, + "grad_norm": 0.6922749876976013, + "learning_rate": 8.863419011750659e-06, + "loss": 0.7833, + "step": 8033 + }, + { + "epoch": 0.44218173812537837, + "grad_norm": 0.7989425659179688, + "learning_rate": 8.863143836481831e-06, + "loss": 0.8651, + "step": 8034 + }, + { + "epoch": 0.44223677692773405, + "grad_norm": 0.6765440702438354, + "learning_rate": 8.862868632178648e-06, + "loss": 0.7858, + "step": 8035 + }, + { + "epoch": 0.4422918157300897, + "grad_norm": 0.670767068862915, + "learning_rate": 8.862593398843178e-06, + "loss": 0.6789, + "step": 8036 + }, + { + "epoch": 0.4423468545324454, + "grad_norm": 0.7556853294372559, + "learning_rate": 8.86231813647749e-06, + "loss": 0.8036, + "step": 8037 + }, + { + "epoch": 0.442401893334801, + "grad_norm": 0.788690984249115, + "learning_rate": 8.862042845083654e-06, + "loss": 0.8355, + "step": 8038 + }, + { + "epoch": 0.4424569321371567, + "grad_norm": 0.8439056873321533, + "learning_rate": 8.861767524663736e-06, + "loss": 0.7327, + "step": 8039 + }, + { + "epoch": 0.44251197093951233, + "grad_norm": 0.7101821899414062, + "learning_rate": 8.861492175219808e-06, + "loss": 0.8303, + "step": 8040 + }, + { + "epoch": 0.442567009741868, + "grad_norm": 0.741680383682251, + "learning_rate": 8.861216796753937e-06, + "loss": 0.7377, + "step": 8041 + }, + { + "epoch": 0.44262204854422366, + "grad_norm": 0.7588099837303162, + "learning_rate": 8.860941389268196e-06, + "loss": 0.8217, + "step": 8042 + }, + { + "epoch": 0.44267708734657935, + "grad_norm": 0.7654829025268555, + "learning_rate": 8.860665952764654e-06, + "loss": 0.8416, + "step": 8043 + }, + { + "epoch": 0.442732126148935, + "grad_norm": 0.7025987505912781, + "learning_rate": 8.860390487245378e-06, + "loss": 0.7312, + "step": 8044 + }, + { + "epoch": 0.44278716495129067, + "grad_norm": 0.7206251621246338, + "learning_rate": 8.860114992712441e-06, + "loss": 0.7522, + "step": 8045 + }, + { + "epoch": 0.4428422037536463, + "grad_norm": 0.7041749954223633, + "learning_rate": 8.859839469167912e-06, + "loss": 0.746, + "step": 8046 + }, + { + "epoch": 0.442897242556002, + "grad_norm": 0.6941862106323242, + "learning_rate": 8.859563916613864e-06, + "loss": 0.7692, + "step": 8047 + }, + { + "epoch": 0.4429522813583576, + "grad_norm": 0.6897740364074707, + "learning_rate": 8.859288335052367e-06, + "loss": 0.7963, + "step": 8048 + }, + { + "epoch": 0.4430073201607133, + "grad_norm": 0.6744545698165894, + "learning_rate": 8.859012724485492e-06, + "loss": 0.7647, + "step": 8049 + }, + { + "epoch": 0.44306235896306895, + "grad_norm": 0.7899364829063416, + "learning_rate": 8.858737084915309e-06, + "loss": 0.8373, + "step": 8050 + }, + { + "epoch": 0.44311739776542464, + "grad_norm": 0.806016743183136, + "learning_rate": 8.85846141634389e-06, + "loss": 0.7871, + "step": 8051 + }, + { + "epoch": 0.44317243656778027, + "grad_norm": 0.7444993257522583, + "learning_rate": 8.85818571877331e-06, + "loss": 0.8099, + "step": 8052 + }, + { + "epoch": 0.44322747537013596, + "grad_norm": 0.772735059261322, + "learning_rate": 8.85790999220564e-06, + "loss": 0.7113, + "step": 8053 + }, + { + "epoch": 0.4432825141724916, + "grad_norm": 0.7743984460830688, + "learning_rate": 8.85763423664295e-06, + "loss": 0.8935, + "step": 8054 + }, + { + "epoch": 0.4433375529748473, + "grad_norm": 0.6751214265823364, + "learning_rate": 8.857358452087313e-06, + "loss": 0.6769, + "step": 8055 + }, + { + "epoch": 0.4433925917772029, + "grad_norm": 0.6921005845069885, + "learning_rate": 8.857082638540803e-06, + "loss": 0.7071, + "step": 8056 + }, + { + "epoch": 0.4434476305795586, + "grad_norm": 0.7884092330932617, + "learning_rate": 8.856806796005491e-06, + "loss": 0.7919, + "step": 8057 + }, + { + "epoch": 0.44350266938191424, + "grad_norm": 0.6522679924964905, + "learning_rate": 8.856530924483452e-06, + "loss": 0.7449, + "step": 8058 + }, + { + "epoch": 0.4435577081842699, + "grad_norm": 0.7172590494155884, + "learning_rate": 8.85625502397676e-06, + "loss": 0.7306, + "step": 8059 + }, + { + "epoch": 0.44361274698662556, + "grad_norm": 0.698658287525177, + "learning_rate": 8.855979094487488e-06, + "loss": 0.803, + "step": 8060 + }, + { + "epoch": 0.44366778578898125, + "grad_norm": 0.685589075088501, + "learning_rate": 8.855703136017708e-06, + "loss": 0.763, + "step": 8061 + }, + { + "epoch": 0.4437228245913369, + "grad_norm": 0.8259774446487427, + "learning_rate": 8.855427148569495e-06, + "loss": 0.811, + "step": 8062 + }, + { + "epoch": 0.4437778633936926, + "grad_norm": 0.6976660490036011, + "learning_rate": 8.855151132144926e-06, + "loss": 0.7345, + "step": 8063 + }, + { + "epoch": 0.4438329021960482, + "grad_norm": 0.7696738243103027, + "learning_rate": 8.854875086746071e-06, + "loss": 0.823, + "step": 8064 + }, + { + "epoch": 0.4438879409984039, + "grad_norm": 0.6627930998802185, + "learning_rate": 8.854599012375006e-06, + "loss": 0.7455, + "step": 8065 + }, + { + "epoch": 0.44394297980075953, + "grad_norm": 0.7492700815200806, + "learning_rate": 8.854322909033809e-06, + "loss": 0.8195, + "step": 8066 + }, + { + "epoch": 0.4439980186031152, + "grad_norm": 0.8335888981819153, + "learning_rate": 8.85404677672455e-06, + "loss": 0.7683, + "step": 8067 + }, + { + "epoch": 0.44405305740547085, + "grad_norm": 0.7448242902755737, + "learning_rate": 8.853770615449309e-06, + "loss": 0.8352, + "step": 8068 + }, + { + "epoch": 0.44410809620782654, + "grad_norm": 0.700616180896759, + "learning_rate": 8.853494425210158e-06, + "loss": 0.7892, + "step": 8069 + }, + { + "epoch": 0.4441631350101822, + "grad_norm": 0.6959284543991089, + "learning_rate": 8.853218206009176e-06, + "loss": 0.6944, + "step": 8070 + }, + { + "epoch": 0.44421817381253786, + "grad_norm": 0.7507375478744507, + "learning_rate": 8.852941957848438e-06, + "loss": 0.8921, + "step": 8071 + }, + { + "epoch": 0.4442732126148935, + "grad_norm": 0.7843918204307556, + "learning_rate": 8.852665680730019e-06, + "loss": 0.816, + "step": 8072 + }, + { + "epoch": 0.4443282514172492, + "grad_norm": 0.8702702522277832, + "learning_rate": 8.852389374655995e-06, + "loss": 0.8191, + "step": 8073 + }, + { + "epoch": 0.4443832902196048, + "grad_norm": 0.6784317493438721, + "learning_rate": 8.852113039628445e-06, + "loss": 0.7726, + "step": 8074 + }, + { + "epoch": 0.4444383290219605, + "grad_norm": 0.724530041217804, + "learning_rate": 8.851836675649443e-06, + "loss": 0.8214, + "step": 8075 + }, + { + "epoch": 0.44449336782431614, + "grad_norm": 0.9814287424087524, + "learning_rate": 8.851560282721067e-06, + "loss": 0.8368, + "step": 8076 + }, + { + "epoch": 0.4445484066266718, + "grad_norm": 0.6606815457344055, + "learning_rate": 8.851283860845398e-06, + "loss": 0.7772, + "step": 8077 + }, + { + "epoch": 0.44460344542902747, + "grad_norm": 0.6910951137542725, + "learning_rate": 8.851007410024507e-06, + "loss": 0.7007, + "step": 8078 + }, + { + "epoch": 0.4446584842313831, + "grad_norm": 0.6764300465583801, + "learning_rate": 8.850730930260479e-06, + "loss": 0.7265, + "step": 8079 + }, + { + "epoch": 0.4447135230337388, + "grad_norm": 0.669622004032135, + "learning_rate": 8.850454421555386e-06, + "loss": 0.7551, + "step": 8080 + }, + { + "epoch": 0.4447685618360944, + "grad_norm": 0.7068240642547607, + "learning_rate": 8.850177883911307e-06, + "loss": 0.8358, + "step": 8081 + }, + { + "epoch": 0.4448236006384501, + "grad_norm": 0.7100360989570618, + "learning_rate": 8.849901317330324e-06, + "loss": 0.7074, + "step": 8082 + }, + { + "epoch": 0.44487863944080575, + "grad_norm": 0.7510328888893127, + "learning_rate": 8.849624721814511e-06, + "loss": 0.6654, + "step": 8083 + }, + { + "epoch": 0.44493367824316143, + "grad_norm": 0.8106432557106018, + "learning_rate": 8.849348097365951e-06, + "loss": 0.6944, + "step": 8084 + }, + { + "epoch": 0.44498871704551707, + "grad_norm": 0.6852346062660217, + "learning_rate": 8.84907144398672e-06, + "loss": 0.7203, + "step": 8085 + }, + { + "epoch": 0.44504375584787276, + "grad_norm": 0.8495593667030334, + "learning_rate": 8.848794761678898e-06, + "loss": 0.7918, + "step": 8086 + }, + { + "epoch": 0.4450987946502284, + "grad_norm": 0.7110981941223145, + "learning_rate": 8.848518050444565e-06, + "loss": 0.8176, + "step": 8087 + }, + { + "epoch": 0.4451538334525841, + "grad_norm": 0.7740922570228577, + "learning_rate": 8.8482413102858e-06, + "loss": 0.7573, + "step": 8088 + }, + { + "epoch": 0.4452088722549397, + "grad_norm": 0.9645134806632996, + "learning_rate": 8.847964541204685e-06, + "loss": 0.7842, + "step": 8089 + }, + { + "epoch": 0.4452639110572954, + "grad_norm": 0.767621636390686, + "learning_rate": 8.847687743203299e-06, + "loss": 0.8182, + "step": 8090 + }, + { + "epoch": 0.44531894985965104, + "grad_norm": 0.6842975616455078, + "learning_rate": 8.84741091628372e-06, + "loss": 0.7795, + "step": 8091 + }, + { + "epoch": 0.4453739886620067, + "grad_norm": 0.768644392490387, + "learning_rate": 8.847134060448032e-06, + "loss": 0.7363, + "step": 8092 + }, + { + "epoch": 0.44542902746436236, + "grad_norm": 0.6813824772834778, + "learning_rate": 8.846857175698314e-06, + "loss": 0.7601, + "step": 8093 + }, + { + "epoch": 0.44548406626671805, + "grad_norm": 0.8608306646347046, + "learning_rate": 8.846580262036645e-06, + "loss": 0.8205, + "step": 8094 + }, + { + "epoch": 0.4455391050690737, + "grad_norm": 0.6917694807052612, + "learning_rate": 8.84630331946511e-06, + "loss": 0.7207, + "step": 8095 + }, + { + "epoch": 0.44559414387142937, + "grad_norm": 0.6777203679084778, + "learning_rate": 8.84602634798579e-06, + "loss": 0.6939, + "step": 8096 + }, + { + "epoch": 0.445649182673785, + "grad_norm": 0.7249894142150879, + "learning_rate": 8.845749347600764e-06, + "loss": 0.7918, + "step": 8097 + }, + { + "epoch": 0.4457042214761407, + "grad_norm": 0.7446995973587036, + "learning_rate": 8.845472318312116e-06, + "loss": 0.7379, + "step": 8098 + }, + { + "epoch": 0.4457592602784963, + "grad_norm": 0.8245479464530945, + "learning_rate": 8.845195260121927e-06, + "loss": 0.8532, + "step": 8099 + }, + { + "epoch": 0.445814299080852, + "grad_norm": 0.7160329818725586, + "learning_rate": 8.84491817303228e-06, + "loss": 0.7042, + "step": 8100 + }, + { + "epoch": 0.44586933788320765, + "grad_norm": 0.8056026101112366, + "learning_rate": 8.844641057045257e-06, + "loss": 0.8581, + "step": 8101 + }, + { + "epoch": 0.44592437668556334, + "grad_norm": 0.7257886528968811, + "learning_rate": 8.84436391216294e-06, + "loss": 0.7297, + "step": 8102 + }, + { + "epoch": 0.445979415487919, + "grad_norm": 0.7400404810905457, + "learning_rate": 8.844086738387415e-06, + "loss": 0.7703, + "step": 8103 + }, + { + "epoch": 0.44603445429027466, + "grad_norm": 0.665271520614624, + "learning_rate": 8.843809535720763e-06, + "loss": 0.7769, + "step": 8104 + }, + { + "epoch": 0.4460894930926303, + "grad_norm": 0.7041043639183044, + "learning_rate": 8.843532304165066e-06, + "loss": 0.7995, + "step": 8105 + }, + { + "epoch": 0.446144531894986, + "grad_norm": 0.8517841100692749, + "learning_rate": 8.84325504372241e-06, + "loss": 0.8239, + "step": 8106 + }, + { + "epoch": 0.4461995706973416, + "grad_norm": 0.7045741677284241, + "learning_rate": 8.842977754394877e-06, + "loss": 0.7982, + "step": 8107 + }, + { + "epoch": 0.4462546094996973, + "grad_norm": 0.7056185007095337, + "learning_rate": 8.842700436184552e-06, + "loss": 0.8003, + "step": 8108 + }, + { + "epoch": 0.44630964830205294, + "grad_norm": 0.9042232632637024, + "learning_rate": 8.842423089093519e-06, + "loss": 0.7534, + "step": 8109 + }, + { + "epoch": 0.44636468710440863, + "grad_norm": 0.8584854602813721, + "learning_rate": 8.842145713123863e-06, + "loss": 0.7759, + "step": 8110 + }, + { + "epoch": 0.44641972590676426, + "grad_norm": 0.7333530187606812, + "learning_rate": 8.841868308277668e-06, + "loss": 0.7218, + "step": 8111 + }, + { + "epoch": 0.44647476470911995, + "grad_norm": 0.7866941094398499, + "learning_rate": 8.84159087455702e-06, + "loss": 0.7016, + "step": 8112 + }, + { + "epoch": 0.4465298035114756, + "grad_norm": 0.7785252928733826, + "learning_rate": 8.841313411964001e-06, + "loss": 0.8232, + "step": 8113 + }, + { + "epoch": 0.4465848423138313, + "grad_norm": 0.7060698866844177, + "learning_rate": 8.841035920500702e-06, + "loss": 0.6987, + "step": 8114 + }, + { + "epoch": 0.4466398811161869, + "grad_norm": 0.7211717963218689, + "learning_rate": 8.840758400169203e-06, + "loss": 0.8604, + "step": 8115 + }, + { + "epoch": 0.4466949199185426, + "grad_norm": 0.979678213596344, + "learning_rate": 8.840480850971593e-06, + "loss": 0.9028, + "step": 8116 + }, + { + "epoch": 0.44674995872089823, + "grad_norm": 0.6595104336738586, + "learning_rate": 8.840203272909957e-06, + "loss": 0.6899, + "step": 8117 + }, + { + "epoch": 0.4468049975232539, + "grad_norm": 0.6392405033111572, + "learning_rate": 8.83992566598638e-06, + "loss": 0.7729, + "step": 8118 + }, + { + "epoch": 0.44686003632560956, + "grad_norm": 1.1084040403366089, + "learning_rate": 8.839648030202949e-06, + "loss": 0.822, + "step": 8119 + }, + { + "epoch": 0.4469150751279652, + "grad_norm": 0.7024106383323669, + "learning_rate": 8.839370365561754e-06, + "loss": 0.7615, + "step": 8120 + }, + { + "epoch": 0.4469701139303209, + "grad_norm": 0.7204060554504395, + "learning_rate": 8.839092672064878e-06, + "loss": 0.7527, + "step": 8121 + }, + { + "epoch": 0.4470251527326765, + "grad_norm": 0.7307723760604858, + "learning_rate": 8.838814949714407e-06, + "loss": 0.8139, + "step": 8122 + }, + { + "epoch": 0.4470801915350322, + "grad_norm": 0.824034571647644, + "learning_rate": 8.838537198512434e-06, + "loss": 0.8299, + "step": 8123 + }, + { + "epoch": 0.44713523033738783, + "grad_norm": 0.6603747606277466, + "learning_rate": 8.83825941846104e-06, + "loss": 0.6762, + "step": 8124 + }, + { + "epoch": 0.4471902691397435, + "grad_norm": 0.7403088808059692, + "learning_rate": 8.837981609562316e-06, + "loss": 0.716, + "step": 8125 + }, + { + "epoch": 0.44724530794209916, + "grad_norm": 0.742173969745636, + "learning_rate": 8.837703771818351e-06, + "loss": 0.7672, + "step": 8126 + }, + { + "epoch": 0.44730034674445485, + "grad_norm": 0.7158839106559753, + "learning_rate": 8.837425905231232e-06, + "loss": 0.6941, + "step": 8127 + }, + { + "epoch": 0.4473553855468105, + "grad_norm": 0.7659464478492737, + "learning_rate": 8.837148009803044e-06, + "loss": 0.7293, + "step": 8128 + }, + { + "epoch": 0.44741042434916617, + "grad_norm": 0.8681113719940186, + "learning_rate": 8.836870085535882e-06, + "loss": 0.8647, + "step": 8129 + }, + { + "epoch": 0.4474654631515218, + "grad_norm": 0.7117272615432739, + "learning_rate": 8.83659213243183e-06, + "loss": 0.8035, + "step": 8130 + }, + { + "epoch": 0.4475205019538775, + "grad_norm": 0.8220957517623901, + "learning_rate": 8.836314150492978e-06, + "loss": 0.6978, + "step": 8131 + }, + { + "epoch": 0.4475755407562331, + "grad_norm": 0.7045003175735474, + "learning_rate": 8.836036139721418e-06, + "loss": 0.747, + "step": 8132 + }, + { + "epoch": 0.4476305795585888, + "grad_norm": 0.6833191514015198, + "learning_rate": 8.835758100119235e-06, + "loss": 0.7604, + "step": 8133 + }, + { + "epoch": 0.44768561836094445, + "grad_norm": 0.7305697798728943, + "learning_rate": 8.835480031688521e-06, + "loss": 0.7301, + "step": 8134 + }, + { + "epoch": 0.44774065716330014, + "grad_norm": 0.7266964912414551, + "learning_rate": 8.835201934431366e-06, + "loss": 0.7675, + "step": 8135 + }, + { + "epoch": 0.44779569596565577, + "grad_norm": 0.6822015047073364, + "learning_rate": 8.834923808349861e-06, + "loss": 0.8226, + "step": 8136 + }, + { + "epoch": 0.44785073476801146, + "grad_norm": 0.7443515062332153, + "learning_rate": 8.834645653446095e-06, + "loss": 0.9289, + "step": 8137 + }, + { + "epoch": 0.4479057735703671, + "grad_norm": 0.7337210178375244, + "learning_rate": 8.834367469722158e-06, + "loss": 0.7758, + "step": 8138 + }, + { + "epoch": 0.4479608123727228, + "grad_norm": 0.6794925332069397, + "learning_rate": 8.83408925718014e-06, + "loss": 0.8426, + "step": 8139 + }, + { + "epoch": 0.4480158511750784, + "grad_norm": 0.7808265089988708, + "learning_rate": 8.833811015822135e-06, + "loss": 0.8464, + "step": 8140 + }, + { + "epoch": 0.4480708899774341, + "grad_norm": 0.7837018370628357, + "learning_rate": 8.833532745650234e-06, + "loss": 0.8722, + "step": 8141 + }, + { + "epoch": 0.44812592877978974, + "grad_norm": 0.9218140840530396, + "learning_rate": 8.833254446666526e-06, + "loss": 0.7981, + "step": 8142 + }, + { + "epoch": 0.44818096758214543, + "grad_norm": 0.7980387806892395, + "learning_rate": 8.832976118873103e-06, + "loss": 0.7705, + "step": 8143 + }, + { + "epoch": 0.44823600638450106, + "grad_norm": 0.7354007363319397, + "learning_rate": 8.832697762272057e-06, + "loss": 0.8286, + "step": 8144 + }, + { + "epoch": 0.44829104518685675, + "grad_norm": 0.7006223201751709, + "learning_rate": 8.832419376865482e-06, + "loss": 0.7107, + "step": 8145 + }, + { + "epoch": 0.4483460839892124, + "grad_norm": 0.7838212847709656, + "learning_rate": 8.83214096265547e-06, + "loss": 0.7676, + "step": 8146 + }, + { + "epoch": 0.4484011227915681, + "grad_norm": 0.7768213748931885, + "learning_rate": 8.83186251964411e-06, + "loss": 0.8689, + "step": 8147 + }, + { + "epoch": 0.4484561615939237, + "grad_norm": 0.7451630234718323, + "learning_rate": 8.831584047833497e-06, + "loss": 0.8625, + "step": 8148 + }, + { + "epoch": 0.4485112003962794, + "grad_norm": 0.7573269605636597, + "learning_rate": 8.831305547225725e-06, + "loss": 0.7357, + "step": 8149 + }, + { + "epoch": 0.44856623919863503, + "grad_norm": 0.6884848475456238, + "learning_rate": 8.831027017822886e-06, + "loss": 0.7306, + "step": 8150 + }, + { + "epoch": 0.4486212780009907, + "grad_norm": 0.7715907096862793, + "learning_rate": 8.830748459627073e-06, + "loss": 0.8311, + "step": 8151 + }, + { + "epoch": 0.44867631680334635, + "grad_norm": 0.6919859647750854, + "learning_rate": 8.83046987264038e-06, + "loss": 0.845, + "step": 8152 + }, + { + "epoch": 0.44873135560570204, + "grad_norm": 0.7066411972045898, + "learning_rate": 8.830191256864902e-06, + "loss": 0.7554, + "step": 8153 + }, + { + "epoch": 0.4487863944080577, + "grad_norm": 0.754196047782898, + "learning_rate": 8.829912612302729e-06, + "loss": 0.7396, + "step": 8154 + }, + { + "epoch": 0.44884143321041337, + "grad_norm": 0.7612286806106567, + "learning_rate": 8.82963393895596e-06, + "loss": 0.8154, + "step": 8155 + }, + { + "epoch": 0.448896472012769, + "grad_norm": 0.8576892614364624, + "learning_rate": 8.829355236826688e-06, + "loss": 0.7395, + "step": 8156 + }, + { + "epoch": 0.4489515108151247, + "grad_norm": 0.6813738346099854, + "learning_rate": 8.829076505917005e-06, + "loss": 0.7661, + "step": 8157 + }, + { + "epoch": 0.4490065496174803, + "grad_norm": 0.7453964948654175, + "learning_rate": 8.828797746229009e-06, + "loss": 0.8221, + "step": 8158 + }, + { + "epoch": 0.449061588419836, + "grad_norm": 0.7546728849411011, + "learning_rate": 8.828518957764795e-06, + "loss": 0.7717, + "step": 8159 + }, + { + "epoch": 0.44911662722219164, + "grad_norm": 0.8270652890205383, + "learning_rate": 8.828240140526456e-06, + "loss": 0.7582, + "step": 8160 + }, + { + "epoch": 0.44917166602454733, + "grad_norm": 0.8188696503639221, + "learning_rate": 8.827961294516089e-06, + "loss": 0.8841, + "step": 8161 + }, + { + "epoch": 0.44922670482690297, + "grad_norm": 0.9101365208625793, + "learning_rate": 8.82768241973579e-06, + "loss": 0.7099, + "step": 8162 + }, + { + "epoch": 0.4492817436292586, + "grad_norm": 0.6749762892723083, + "learning_rate": 8.827403516187656e-06, + "loss": 0.7766, + "step": 8163 + }, + { + "epoch": 0.4493367824316143, + "grad_norm": 1.1351534128189087, + "learning_rate": 8.827124583873781e-06, + "loss": 0.7536, + "step": 8164 + }, + { + "epoch": 0.4493918212339699, + "grad_norm": 0.8729487061500549, + "learning_rate": 8.826845622796261e-06, + "loss": 0.8613, + "step": 8165 + }, + { + "epoch": 0.4494468600363256, + "grad_norm": 0.7495871782302856, + "learning_rate": 8.826566632957193e-06, + "loss": 0.8365, + "step": 8166 + }, + { + "epoch": 0.44950189883868125, + "grad_norm": 0.6414516568183899, + "learning_rate": 8.826287614358677e-06, + "loss": 0.6574, + "step": 8167 + }, + { + "epoch": 0.44955693764103694, + "grad_norm": 0.6954017281532288, + "learning_rate": 8.826008567002805e-06, + "loss": 0.7857, + "step": 8168 + }, + { + "epoch": 0.44961197644339257, + "grad_norm": 0.7199459075927734, + "learning_rate": 8.825729490891678e-06, + "loss": 0.8585, + "step": 8169 + }, + { + "epoch": 0.44966701524574826, + "grad_norm": 0.8245406746864319, + "learning_rate": 8.825450386027392e-06, + "loss": 0.7238, + "step": 8170 + }, + { + "epoch": 0.4497220540481039, + "grad_norm": 0.6348667740821838, + "learning_rate": 8.825171252412044e-06, + "loss": 0.6991, + "step": 8171 + }, + { + "epoch": 0.4497770928504596, + "grad_norm": 0.6304741501808167, + "learning_rate": 8.824892090047734e-06, + "loss": 0.7101, + "step": 8172 + }, + { + "epoch": 0.4498321316528152, + "grad_norm": 0.7088820338249207, + "learning_rate": 8.82461289893656e-06, + "loss": 0.8217, + "step": 8173 + }, + { + "epoch": 0.4498871704551709, + "grad_norm": 0.7570851445198059, + "learning_rate": 8.824333679080617e-06, + "loss": 0.8029, + "step": 8174 + }, + { + "epoch": 0.44994220925752654, + "grad_norm": 0.7544378042221069, + "learning_rate": 8.824054430482007e-06, + "loss": 0.777, + "step": 8175 + }, + { + "epoch": 0.4499972480598822, + "grad_norm": 0.8226260542869568, + "learning_rate": 8.823775153142827e-06, + "loss": 0.8391, + "step": 8176 + }, + { + "epoch": 0.45005228686223786, + "grad_norm": 0.6861422061920166, + "learning_rate": 8.823495847065176e-06, + "loss": 0.7491, + "step": 8177 + }, + { + "epoch": 0.45010732566459355, + "grad_norm": 0.6643275618553162, + "learning_rate": 8.823216512251153e-06, + "loss": 0.6773, + "step": 8178 + }, + { + "epoch": 0.4501623644669492, + "grad_norm": 0.8201391100883484, + "learning_rate": 8.82293714870286e-06, + "loss": 0.8065, + "step": 8179 + }, + { + "epoch": 0.45021740326930487, + "grad_norm": 0.7783405780792236, + "learning_rate": 8.822657756422394e-06, + "loss": 0.7884, + "step": 8180 + }, + { + "epoch": 0.4502724420716605, + "grad_norm": 0.720745861530304, + "learning_rate": 8.822378335411856e-06, + "loss": 0.765, + "step": 8181 + }, + { + "epoch": 0.4503274808740162, + "grad_norm": 0.740364670753479, + "learning_rate": 8.822098885673346e-06, + "loss": 0.6354, + "step": 8182 + }, + { + "epoch": 0.45038251967637183, + "grad_norm": 0.8049225807189941, + "learning_rate": 8.821819407208963e-06, + "loss": 0.7023, + "step": 8183 + }, + { + "epoch": 0.4504375584787275, + "grad_norm": 0.7320911288261414, + "learning_rate": 8.821539900020808e-06, + "loss": 0.8429, + "step": 8184 + }, + { + "epoch": 0.45049259728108315, + "grad_norm": 0.7065376043319702, + "learning_rate": 8.821260364110984e-06, + "loss": 0.7283, + "step": 8185 + }, + { + "epoch": 0.45054763608343884, + "grad_norm": 0.7172972559928894, + "learning_rate": 8.820980799481588e-06, + "loss": 0.7673, + "step": 8186 + }, + { + "epoch": 0.4506026748857945, + "grad_norm": 0.712273895740509, + "learning_rate": 8.820701206134724e-06, + "loss": 0.7317, + "step": 8187 + }, + { + "epoch": 0.45065771368815016, + "grad_norm": 0.6954227685928345, + "learning_rate": 8.820421584072492e-06, + "loss": 0.7037, + "step": 8188 + }, + { + "epoch": 0.4507127524905058, + "grad_norm": 0.6790304780006409, + "learning_rate": 8.820141933296994e-06, + "loss": 0.7544, + "step": 8189 + }, + { + "epoch": 0.4507677912928615, + "grad_norm": 0.7483745813369751, + "learning_rate": 8.819862253810332e-06, + "loss": 0.7894, + "step": 8190 + }, + { + "epoch": 0.4508228300952171, + "grad_norm": 0.7926133871078491, + "learning_rate": 8.819582545614608e-06, + "loss": 0.8085, + "step": 8191 + }, + { + "epoch": 0.4508778688975728, + "grad_norm": 0.8442840576171875, + "learning_rate": 8.819302808711924e-06, + "loss": 0.8252, + "step": 8192 + }, + { + "epoch": 0.45093290769992844, + "grad_norm": 0.8359581232070923, + "learning_rate": 8.819023043104383e-06, + "loss": 0.8187, + "step": 8193 + }, + { + "epoch": 0.45098794650228413, + "grad_norm": 0.7793936133384705, + "learning_rate": 8.818743248794085e-06, + "loss": 0.8425, + "step": 8194 + }, + { + "epoch": 0.45104298530463977, + "grad_norm": 0.735509991645813, + "learning_rate": 8.818463425783136e-06, + "loss": 0.7781, + "step": 8195 + }, + { + "epoch": 0.45109802410699545, + "grad_norm": 0.6735361814498901, + "learning_rate": 8.818183574073639e-06, + "loss": 0.6987, + "step": 8196 + }, + { + "epoch": 0.4511530629093511, + "grad_norm": 0.7780157923698425, + "learning_rate": 8.817903693667695e-06, + "loss": 0.8474, + "step": 8197 + }, + { + "epoch": 0.4512081017117068, + "grad_norm": 0.6714445948600769, + "learning_rate": 8.817623784567411e-06, + "loss": 0.7216, + "step": 8198 + }, + { + "epoch": 0.4512631405140624, + "grad_norm": 0.6311395168304443, + "learning_rate": 8.817343846774886e-06, + "loss": 0.5724, + "step": 8199 + }, + { + "epoch": 0.4513181793164181, + "grad_norm": 0.7446333169937134, + "learning_rate": 8.817063880292227e-06, + "loss": 0.7867, + "step": 8200 + }, + { + "epoch": 0.45137321811877373, + "grad_norm": 0.7684246301651001, + "learning_rate": 8.816783885121539e-06, + "loss": 0.8141, + "step": 8201 + }, + { + "epoch": 0.4514282569211294, + "grad_norm": 0.754781186580658, + "learning_rate": 8.816503861264925e-06, + "loss": 0.8438, + "step": 8202 + }, + { + "epoch": 0.45148329572348506, + "grad_norm": 0.7705762982368469, + "learning_rate": 8.816223808724488e-06, + "loss": 0.8948, + "step": 8203 + }, + { + "epoch": 0.4515383345258407, + "grad_norm": 0.7731552720069885, + "learning_rate": 8.815943727502333e-06, + "loss": 0.7462, + "step": 8204 + }, + { + "epoch": 0.4515933733281964, + "grad_norm": 0.6615393757820129, + "learning_rate": 8.81566361760057e-06, + "loss": 0.7499, + "step": 8205 + }, + { + "epoch": 0.451648412130552, + "grad_norm": 0.724453866481781, + "learning_rate": 8.8153834790213e-06, + "loss": 0.7382, + "step": 8206 + }, + { + "epoch": 0.4517034509329077, + "grad_norm": 0.6369735598564148, + "learning_rate": 8.815103311766629e-06, + "loss": 0.7452, + "step": 8207 + }, + { + "epoch": 0.45175848973526334, + "grad_norm": 0.686000406742096, + "learning_rate": 8.814823115838659e-06, + "loss": 0.6971, + "step": 8208 + }, + { + "epoch": 0.451813528537619, + "grad_norm": 0.7372714281082153, + "learning_rate": 8.814542891239505e-06, + "loss": 0.8553, + "step": 8209 + }, + { + "epoch": 0.45186856733997466, + "grad_norm": 0.8348672986030579, + "learning_rate": 8.814262637971264e-06, + "loss": 0.7135, + "step": 8210 + }, + { + "epoch": 0.45192360614233035, + "grad_norm": 0.7829258441925049, + "learning_rate": 8.813982356036049e-06, + "loss": 0.7974, + "step": 8211 + }, + { + "epoch": 0.451978644944686, + "grad_norm": 0.7013983726501465, + "learning_rate": 8.81370204543596e-06, + "loss": 0.7531, + "step": 8212 + }, + { + "epoch": 0.45203368374704167, + "grad_norm": 0.8424196243286133, + "learning_rate": 8.81342170617311e-06, + "loss": 0.8217, + "step": 8213 + }, + { + "epoch": 0.4520887225493973, + "grad_norm": 0.7113365530967712, + "learning_rate": 8.813141338249603e-06, + "loss": 0.7728, + "step": 8214 + }, + { + "epoch": 0.452143761351753, + "grad_norm": 0.958642303943634, + "learning_rate": 8.812860941667545e-06, + "loss": 0.7234, + "step": 8215 + }, + { + "epoch": 0.4521988001541086, + "grad_norm": 0.6712706685066223, + "learning_rate": 8.812580516429045e-06, + "loss": 0.6998, + "step": 8216 + }, + { + "epoch": 0.4522538389564643, + "grad_norm": 0.7258469462394714, + "learning_rate": 8.812300062536212e-06, + "loss": 0.6758, + "step": 8217 + }, + { + "epoch": 0.45230887775881995, + "grad_norm": 0.735047459602356, + "learning_rate": 8.812019579991152e-06, + "loss": 0.7045, + "step": 8218 + }, + { + "epoch": 0.45236391656117564, + "grad_norm": 0.8339886665344238, + "learning_rate": 8.811739068795971e-06, + "loss": 0.8069, + "step": 8219 + }, + { + "epoch": 0.45241895536353127, + "grad_norm": 0.7170082926750183, + "learning_rate": 8.81145852895278e-06, + "loss": 0.6345, + "step": 8220 + }, + { + "epoch": 0.45247399416588696, + "grad_norm": 0.6892569661140442, + "learning_rate": 8.81117796046369e-06, + "loss": 0.712, + "step": 8221 + }, + { + "epoch": 0.4525290329682426, + "grad_norm": 0.6837140321731567, + "learning_rate": 8.810897363330804e-06, + "loss": 0.7184, + "step": 8222 + }, + { + "epoch": 0.4525840717705983, + "grad_norm": 0.7410069108009338, + "learning_rate": 8.810616737556235e-06, + "loss": 0.8265, + "step": 8223 + }, + { + "epoch": 0.4526391105729539, + "grad_norm": 0.6945875883102417, + "learning_rate": 8.810336083142089e-06, + "loss": 0.7163, + "step": 8224 + }, + { + "epoch": 0.4526941493753096, + "grad_norm": 0.6978884339332581, + "learning_rate": 8.810055400090477e-06, + "loss": 0.795, + "step": 8225 + }, + { + "epoch": 0.45274918817766524, + "grad_norm": 0.7209095358848572, + "learning_rate": 8.809774688403509e-06, + "loss": 0.7317, + "step": 8226 + }, + { + "epoch": 0.45280422698002093, + "grad_norm": 0.7279626727104187, + "learning_rate": 8.809493948083294e-06, + "loss": 0.7699, + "step": 8227 + }, + { + "epoch": 0.45285926578237656, + "grad_norm": 0.7642556428909302, + "learning_rate": 8.809213179131943e-06, + "loss": 0.8518, + "step": 8228 + }, + { + "epoch": 0.45291430458473225, + "grad_norm": 0.6868709325790405, + "learning_rate": 8.808932381551565e-06, + "loss": 0.737, + "step": 8229 + }, + { + "epoch": 0.4529693433870879, + "grad_norm": 0.7012789845466614, + "learning_rate": 8.80865155534427e-06, + "loss": 0.8146, + "step": 8230 + }, + { + "epoch": 0.4530243821894436, + "grad_norm": 0.678683340549469, + "learning_rate": 8.808370700512171e-06, + "loss": 0.7531, + "step": 8231 + }, + { + "epoch": 0.4530794209917992, + "grad_norm": 0.690559983253479, + "learning_rate": 8.808089817057377e-06, + "loss": 0.6779, + "step": 8232 + }, + { + "epoch": 0.4531344597941549, + "grad_norm": 0.7179763317108154, + "learning_rate": 8.807808904981997e-06, + "loss": 0.8815, + "step": 8233 + }, + { + "epoch": 0.45318949859651053, + "grad_norm": 0.7708277702331543, + "learning_rate": 8.807527964288147e-06, + "loss": 0.8084, + "step": 8234 + }, + { + "epoch": 0.4532445373988662, + "grad_norm": 0.6828494071960449, + "learning_rate": 8.807246994977936e-06, + "loss": 0.7587, + "step": 8235 + }, + { + "epoch": 0.45329957620122185, + "grad_norm": 0.7085250616073608, + "learning_rate": 8.806965997053475e-06, + "loss": 0.7894, + "step": 8236 + }, + { + "epoch": 0.45335461500357754, + "grad_norm": 0.7723467946052551, + "learning_rate": 8.806684970516876e-06, + "loss": 0.7408, + "step": 8237 + }, + { + "epoch": 0.4534096538059332, + "grad_norm": 0.8887566328048706, + "learning_rate": 8.806403915370253e-06, + "loss": 0.9022, + "step": 8238 + }, + { + "epoch": 0.45346469260828887, + "grad_norm": 0.7379833459854126, + "learning_rate": 8.806122831615718e-06, + "loss": 0.8264, + "step": 8239 + }, + { + "epoch": 0.4535197314106445, + "grad_norm": 0.903279721736908, + "learning_rate": 8.80584171925538e-06, + "loss": 0.7432, + "step": 8240 + }, + { + "epoch": 0.4535747702130002, + "grad_norm": 0.7671363353729248, + "learning_rate": 8.805560578291356e-06, + "loss": 0.8109, + "step": 8241 + }, + { + "epoch": 0.4536298090153558, + "grad_norm": 0.6047827005386353, + "learning_rate": 8.805279408725755e-06, + "loss": 0.6628, + "step": 8242 + }, + { + "epoch": 0.4536848478177115, + "grad_norm": 1.0570796728134155, + "learning_rate": 8.804998210560696e-06, + "loss": 0.7981, + "step": 8243 + }, + { + "epoch": 0.45373988662006715, + "grad_norm": 0.7116600871086121, + "learning_rate": 8.804716983798288e-06, + "loss": 0.7601, + "step": 8244 + }, + { + "epoch": 0.45379492542242283, + "grad_norm": 0.7162767648696899, + "learning_rate": 8.804435728440644e-06, + "loss": 0.8389, + "step": 8245 + }, + { + "epoch": 0.45384996422477847, + "grad_norm": 0.6715626120567322, + "learning_rate": 8.80415444448988e-06, + "loss": 0.6377, + "step": 8246 + }, + { + "epoch": 0.4539050030271341, + "grad_norm": 0.7168908715248108, + "learning_rate": 8.80387313194811e-06, + "loss": 0.7946, + "step": 8247 + }, + { + "epoch": 0.4539600418294898, + "grad_norm": 0.7497992515563965, + "learning_rate": 8.803591790817448e-06, + "loss": 0.8026, + "step": 8248 + }, + { + "epoch": 0.4540150806318454, + "grad_norm": 0.6665049195289612, + "learning_rate": 8.803310421100009e-06, + "loss": 0.779, + "step": 8249 + }, + { + "epoch": 0.4540701194342011, + "grad_norm": 0.766674280166626, + "learning_rate": 8.803029022797905e-06, + "loss": 0.7467, + "step": 8250 + }, + { + "epoch": 0.45412515823655675, + "grad_norm": 0.7306104302406311, + "learning_rate": 8.802747595913255e-06, + "loss": 0.8323, + "step": 8251 + }, + { + "epoch": 0.45418019703891244, + "grad_norm": 0.6425766944885254, + "learning_rate": 8.802466140448169e-06, + "loss": 0.7226, + "step": 8252 + }, + { + "epoch": 0.45423523584126807, + "grad_norm": 0.7992560267448425, + "learning_rate": 8.802184656404769e-06, + "loss": 0.7285, + "step": 8253 + }, + { + "epoch": 0.45429027464362376, + "grad_norm": 0.6935924887657166, + "learning_rate": 8.801903143785164e-06, + "loss": 0.5757, + "step": 8254 + }, + { + "epoch": 0.4543453134459794, + "grad_norm": 0.7091512084007263, + "learning_rate": 8.801621602591473e-06, + "loss": 0.7719, + "step": 8255 + }, + { + "epoch": 0.4544003522483351, + "grad_norm": 0.851231038570404, + "learning_rate": 8.801340032825814e-06, + "loss": 0.7804, + "step": 8256 + }, + { + "epoch": 0.4544553910506907, + "grad_norm": 0.7443445920944214, + "learning_rate": 8.801058434490298e-06, + "loss": 0.7172, + "step": 8257 + }, + { + "epoch": 0.4545104298530464, + "grad_norm": 0.7156546115875244, + "learning_rate": 8.800776807587046e-06, + "loss": 0.7756, + "step": 8258 + }, + { + "epoch": 0.45456546865540204, + "grad_norm": 0.8027580380439758, + "learning_rate": 8.800495152118172e-06, + "loss": 0.8035, + "step": 8259 + }, + { + "epoch": 0.4546205074577577, + "grad_norm": 0.6868240833282471, + "learning_rate": 8.800213468085794e-06, + "loss": 0.7159, + "step": 8260 + }, + { + "epoch": 0.45467554626011336, + "grad_norm": 0.9127504229545593, + "learning_rate": 8.79993175549203e-06, + "loss": 0.7705, + "step": 8261 + }, + { + "epoch": 0.45473058506246905, + "grad_norm": 0.7074575424194336, + "learning_rate": 8.799650014338994e-06, + "loss": 0.7841, + "step": 8262 + }, + { + "epoch": 0.4547856238648247, + "grad_norm": 0.7462378740310669, + "learning_rate": 8.799368244628807e-06, + "loss": 0.8125, + "step": 8263 + }, + { + "epoch": 0.4548406626671804, + "grad_norm": 0.7510300874710083, + "learning_rate": 8.799086446363585e-06, + "loss": 0.8354, + "step": 8264 + }, + { + "epoch": 0.454895701469536, + "grad_norm": 0.7134591937065125, + "learning_rate": 8.798804619545446e-06, + "loss": 0.7968, + "step": 8265 + }, + { + "epoch": 0.4549507402718917, + "grad_norm": 1.0424071550369263, + "learning_rate": 8.798522764176509e-06, + "loss": 0.8638, + "step": 8266 + }, + { + "epoch": 0.45500577907424733, + "grad_norm": 0.6805267930030823, + "learning_rate": 8.79824088025889e-06, + "loss": 0.757, + "step": 8267 + }, + { + "epoch": 0.455060817876603, + "grad_norm": 0.8145313262939453, + "learning_rate": 8.79795896779471e-06, + "loss": 0.7589, + "step": 8268 + }, + { + "epoch": 0.45511585667895865, + "grad_norm": 0.7611781358718872, + "learning_rate": 8.79767702678609e-06, + "loss": 0.8426, + "step": 8269 + }, + { + "epoch": 0.45517089548131434, + "grad_norm": 0.7639568448066711, + "learning_rate": 8.797395057235142e-06, + "loss": 0.6609, + "step": 8270 + }, + { + "epoch": 0.45522593428367, + "grad_norm": 0.8577544093132019, + "learning_rate": 8.79711305914399e-06, + "loss": 0.8085, + "step": 8271 + }, + { + "epoch": 0.45528097308602566, + "grad_norm": 0.7740383148193359, + "learning_rate": 8.796831032514754e-06, + "loss": 0.8689, + "step": 8272 + }, + { + "epoch": 0.4553360118883813, + "grad_norm": 0.7300885915756226, + "learning_rate": 8.796548977349553e-06, + "loss": 0.8303, + "step": 8273 + }, + { + "epoch": 0.455391050690737, + "grad_norm": 0.6677057147026062, + "learning_rate": 8.796266893650504e-06, + "loss": 0.7449, + "step": 8274 + }, + { + "epoch": 0.4554460894930926, + "grad_norm": 0.7269144058227539, + "learning_rate": 8.79598478141973e-06, + "loss": 0.8744, + "step": 8275 + }, + { + "epoch": 0.4555011282954483, + "grad_norm": 0.7458559274673462, + "learning_rate": 8.795702640659351e-06, + "loss": 0.8036, + "step": 8276 + }, + { + "epoch": 0.45555616709780394, + "grad_norm": 0.7693114280700684, + "learning_rate": 8.795420471371487e-06, + "loss": 0.7617, + "step": 8277 + }, + { + "epoch": 0.45561120590015963, + "grad_norm": 0.7594510316848755, + "learning_rate": 8.79513827355826e-06, + "loss": 0.7049, + "step": 8278 + }, + { + "epoch": 0.45566624470251527, + "grad_norm": 0.7481217980384827, + "learning_rate": 8.794856047221786e-06, + "loss": 0.804, + "step": 8279 + }, + { + "epoch": 0.45572128350487096, + "grad_norm": 0.726859986782074, + "learning_rate": 8.794573792364192e-06, + "loss": 0.7322, + "step": 8280 + }, + { + "epoch": 0.4557763223072266, + "grad_norm": 0.6800149083137512, + "learning_rate": 8.794291508987597e-06, + "loss": 0.8467, + "step": 8281 + }, + { + "epoch": 0.4558313611095823, + "grad_norm": 0.6264217495918274, + "learning_rate": 8.794009197094122e-06, + "loss": 0.6203, + "step": 8282 + }, + { + "epoch": 0.4558863999119379, + "grad_norm": 0.6973850131034851, + "learning_rate": 8.79372685668589e-06, + "loss": 0.8211, + "step": 8283 + }, + { + "epoch": 0.4559414387142936, + "grad_norm": 0.6992879509925842, + "learning_rate": 8.793444487765022e-06, + "loss": 0.7831, + "step": 8284 + }, + { + "epoch": 0.45599647751664923, + "grad_norm": 0.7641519904136658, + "learning_rate": 8.793162090333643e-06, + "loss": 0.7519, + "step": 8285 + }, + { + "epoch": 0.4560515163190049, + "grad_norm": 0.7296152710914612, + "learning_rate": 8.79287966439387e-06, + "loss": 0.8738, + "step": 8286 + }, + { + "epoch": 0.45610655512136056, + "grad_norm": 0.7549383044242859, + "learning_rate": 8.79259720994783e-06, + "loss": 0.7868, + "step": 8287 + }, + { + "epoch": 0.45616159392371625, + "grad_norm": 0.7932083606719971, + "learning_rate": 8.792314726997644e-06, + "loss": 0.8443, + "step": 8288 + }, + { + "epoch": 0.4562166327260719, + "grad_norm": 0.7999894022941589, + "learning_rate": 8.792032215545437e-06, + "loss": 0.852, + "step": 8289 + }, + { + "epoch": 0.4562716715284275, + "grad_norm": 0.8092383742332458, + "learning_rate": 8.79174967559333e-06, + "loss": 0.7922, + "step": 8290 + }, + { + "epoch": 0.4563267103307832, + "grad_norm": 0.7481340169906616, + "learning_rate": 8.791467107143447e-06, + "loss": 0.7086, + "step": 8291 + }, + { + "epoch": 0.45638174913313884, + "grad_norm": 0.8096129298210144, + "learning_rate": 8.791184510197912e-06, + "loss": 0.6645, + "step": 8292 + }, + { + "epoch": 0.4564367879354945, + "grad_norm": 0.7276492118835449, + "learning_rate": 8.79090188475885e-06, + "loss": 0.7174, + "step": 8293 + }, + { + "epoch": 0.45649182673785016, + "grad_norm": 0.815535843372345, + "learning_rate": 8.790619230828385e-06, + "loss": 0.8622, + "step": 8294 + }, + { + "epoch": 0.45654686554020585, + "grad_norm": 0.8191169500350952, + "learning_rate": 8.790336548408637e-06, + "loss": 0.8666, + "step": 8295 + }, + { + "epoch": 0.4566019043425615, + "grad_norm": 0.7449167966842651, + "learning_rate": 8.790053837501737e-06, + "loss": 0.7728, + "step": 8296 + }, + { + "epoch": 0.45665694314491717, + "grad_norm": 0.7311065196990967, + "learning_rate": 8.789771098109808e-06, + "loss": 0.8059, + "step": 8297 + }, + { + "epoch": 0.4567119819472728, + "grad_norm": 0.7381907105445862, + "learning_rate": 8.789488330234971e-06, + "loss": 0.7722, + "step": 8298 + }, + { + "epoch": 0.4567670207496285, + "grad_norm": 0.8180661201477051, + "learning_rate": 8.789205533879355e-06, + "loss": 0.9032, + "step": 8299 + }, + { + "epoch": 0.4568220595519841, + "grad_norm": 0.7993118762969971, + "learning_rate": 8.788922709045087e-06, + "loss": 0.8065, + "step": 8300 + }, + { + "epoch": 0.4568770983543398, + "grad_norm": 0.8449206948280334, + "learning_rate": 8.788639855734287e-06, + "loss": 0.7895, + "step": 8301 + }, + { + "epoch": 0.45693213715669545, + "grad_norm": 0.9224583506584167, + "learning_rate": 8.788356973949084e-06, + "loss": 0.78, + "step": 8302 + }, + { + "epoch": 0.45698717595905114, + "grad_norm": 0.7109915614128113, + "learning_rate": 8.788074063691604e-06, + "loss": 0.8029, + "step": 8303 + }, + { + "epoch": 0.4570422147614068, + "grad_norm": 0.7372310757637024, + "learning_rate": 8.787791124963976e-06, + "loss": 0.8118, + "step": 8304 + }, + { + "epoch": 0.45709725356376246, + "grad_norm": 0.8127168416976929, + "learning_rate": 8.787508157768323e-06, + "loss": 0.8665, + "step": 8305 + }, + { + "epoch": 0.4571522923661181, + "grad_norm": 0.7193050980567932, + "learning_rate": 8.787225162106771e-06, + "loss": 0.749, + "step": 8306 + }, + { + "epoch": 0.4572073311684738, + "grad_norm": 0.8825041651725769, + "learning_rate": 8.786942137981449e-06, + "loss": 0.9651, + "step": 8307 + }, + { + "epoch": 0.4572623699708294, + "grad_norm": 0.6854885816574097, + "learning_rate": 8.786659085394485e-06, + "loss": 0.8259, + "step": 8308 + }, + { + "epoch": 0.4573174087731851, + "grad_norm": 0.6698010563850403, + "learning_rate": 8.786376004348004e-06, + "loss": 0.7212, + "step": 8309 + }, + { + "epoch": 0.45737244757554074, + "grad_norm": 0.7706398963928223, + "learning_rate": 8.786092894844132e-06, + "loss": 0.719, + "step": 8310 + }, + { + "epoch": 0.45742748637789643, + "grad_norm": 0.8905620574951172, + "learning_rate": 8.785809756885002e-06, + "loss": 0.7518, + "step": 8311 + }, + { + "epoch": 0.45748252518025206, + "grad_norm": 0.7537117004394531, + "learning_rate": 8.78552659047274e-06, + "loss": 0.8267, + "step": 8312 + }, + { + "epoch": 0.45753756398260775, + "grad_norm": 0.7840754985809326, + "learning_rate": 8.78524339560947e-06, + "loss": 0.8417, + "step": 8313 + }, + { + "epoch": 0.4575926027849634, + "grad_norm": 0.7373713254928589, + "learning_rate": 8.784960172297327e-06, + "loss": 0.784, + "step": 8314 + }, + { + "epoch": 0.4576476415873191, + "grad_norm": 0.6648432016372681, + "learning_rate": 8.784676920538436e-06, + "loss": 0.7252, + "step": 8315 + }, + { + "epoch": 0.4577026803896747, + "grad_norm": 0.7904912829399109, + "learning_rate": 8.784393640334925e-06, + "loss": 0.7777, + "step": 8316 + }, + { + "epoch": 0.4577577191920304, + "grad_norm": 0.7691501379013062, + "learning_rate": 8.784110331688927e-06, + "loss": 0.733, + "step": 8317 + }, + { + "epoch": 0.45781275799438603, + "grad_norm": 0.6054617762565613, + "learning_rate": 8.783826994602566e-06, + "loss": 0.6367, + "step": 8318 + }, + { + "epoch": 0.4578677967967417, + "grad_norm": 0.7495457530021667, + "learning_rate": 8.783543629077976e-06, + "loss": 0.8672, + "step": 8319 + }, + { + "epoch": 0.45792283559909736, + "grad_norm": 0.6979867815971375, + "learning_rate": 8.783260235117283e-06, + "loss": 0.7338, + "step": 8320 + }, + { + "epoch": 0.45797787440145304, + "grad_norm": 0.6927759647369385, + "learning_rate": 8.78297681272262e-06, + "loss": 0.6925, + "step": 8321 + }, + { + "epoch": 0.4580329132038087, + "grad_norm": 0.9076687097549438, + "learning_rate": 8.782693361896115e-06, + "loss": 0.8225, + "step": 8322 + }, + { + "epoch": 0.45808795200616437, + "grad_norm": 0.7990893721580505, + "learning_rate": 8.782409882639902e-06, + "loss": 0.8144, + "step": 8323 + }, + { + "epoch": 0.45814299080852, + "grad_norm": 0.7958230376243591, + "learning_rate": 8.782126374956107e-06, + "loss": 0.7717, + "step": 8324 + }, + { + "epoch": 0.4581980296108757, + "grad_norm": 0.7694645524024963, + "learning_rate": 8.781842838846861e-06, + "loss": 0.8314, + "step": 8325 + }, + { + "epoch": 0.4582530684132313, + "grad_norm": 0.8653621077537537, + "learning_rate": 8.781559274314297e-06, + "loss": 0.7567, + "step": 8326 + }, + { + "epoch": 0.458308107215587, + "grad_norm": 0.7834668755531311, + "learning_rate": 8.781275681360548e-06, + "loss": 0.7431, + "step": 8327 + }, + { + "epoch": 0.45836314601794265, + "grad_norm": 0.6800104975700378, + "learning_rate": 8.780992059987742e-06, + "loss": 0.8266, + "step": 8328 + }, + { + "epoch": 0.45841818482029834, + "grad_norm": 0.7274910807609558, + "learning_rate": 8.780708410198011e-06, + "loss": 0.7358, + "step": 8329 + }, + { + "epoch": 0.45847322362265397, + "grad_norm": 0.8102344870567322, + "learning_rate": 8.780424731993488e-06, + "loss": 0.7397, + "step": 8330 + }, + { + "epoch": 0.45852826242500966, + "grad_norm": 0.7536956071853638, + "learning_rate": 8.780141025376305e-06, + "loss": 0.7053, + "step": 8331 + }, + { + "epoch": 0.4585833012273653, + "grad_norm": 0.678535521030426, + "learning_rate": 8.779857290348594e-06, + "loss": 0.792, + "step": 8332 + }, + { + "epoch": 0.4586383400297209, + "grad_norm": 0.8847216963768005, + "learning_rate": 8.779573526912487e-06, + "loss": 0.8117, + "step": 8333 + }, + { + "epoch": 0.4586933788320766, + "grad_norm": 0.6997288465499878, + "learning_rate": 8.779289735070117e-06, + "loss": 0.7797, + "step": 8334 + }, + { + "epoch": 0.45874841763443225, + "grad_norm": 0.7445441484451294, + "learning_rate": 8.779005914823617e-06, + "loss": 0.7505, + "step": 8335 + }, + { + "epoch": 0.45880345643678794, + "grad_norm": 0.618844211101532, + "learning_rate": 8.778722066175121e-06, + "loss": 0.661, + "step": 8336 + }, + { + "epoch": 0.45885849523914357, + "grad_norm": 0.6810492873191833, + "learning_rate": 8.778438189126761e-06, + "loss": 0.6819, + "step": 8337 + }, + { + "epoch": 0.45891353404149926, + "grad_norm": 0.6785591244697571, + "learning_rate": 8.778154283680671e-06, + "loss": 0.7808, + "step": 8338 + }, + { + "epoch": 0.4589685728438549, + "grad_norm": 0.7461212873458862, + "learning_rate": 8.777870349838984e-06, + "loss": 0.8566, + "step": 8339 + }, + { + "epoch": 0.4590236116462106, + "grad_norm": 0.6731496453285217, + "learning_rate": 8.777586387603836e-06, + "loss": 0.823, + "step": 8340 + }, + { + "epoch": 0.4590786504485662, + "grad_norm": 0.7295553684234619, + "learning_rate": 8.77730239697736e-06, + "loss": 0.9229, + "step": 8341 + }, + { + "epoch": 0.4591336892509219, + "grad_norm": 0.783275842666626, + "learning_rate": 8.77701837796169e-06, + "loss": 0.782, + "step": 8342 + }, + { + "epoch": 0.45918872805327754, + "grad_norm": 0.6952852606773376, + "learning_rate": 8.77673433055896e-06, + "loss": 0.7977, + "step": 8343 + }, + { + "epoch": 0.45924376685563323, + "grad_norm": 0.7381969094276428, + "learning_rate": 8.776450254771305e-06, + "loss": 0.768, + "step": 8344 + }, + { + "epoch": 0.45929880565798886, + "grad_norm": 0.7911093831062317, + "learning_rate": 8.776166150600862e-06, + "loss": 0.8284, + "step": 8345 + }, + { + "epoch": 0.45935384446034455, + "grad_norm": 0.7319246530532837, + "learning_rate": 8.775882018049765e-06, + "loss": 0.8135, + "step": 8346 + }, + { + "epoch": 0.4594088832627002, + "grad_norm": 0.7888429760932922, + "learning_rate": 8.77559785712015e-06, + "loss": 0.9001, + "step": 8347 + }, + { + "epoch": 0.4594639220650559, + "grad_norm": 0.6983326077461243, + "learning_rate": 8.775313667814151e-06, + "loss": 0.7537, + "step": 8348 + }, + { + "epoch": 0.4595189608674115, + "grad_norm": 0.7532416582107544, + "learning_rate": 8.775029450133905e-06, + "loss": 0.8307, + "step": 8349 + }, + { + "epoch": 0.4595739996697672, + "grad_norm": 0.7159993052482605, + "learning_rate": 8.774745204081549e-06, + "loss": 0.7874, + "step": 8350 + }, + { + "epoch": 0.45962903847212283, + "grad_norm": 0.6898767352104187, + "learning_rate": 8.774460929659218e-06, + "loss": 0.7453, + "step": 8351 + }, + { + "epoch": 0.4596840772744785, + "grad_norm": 0.6833236813545227, + "learning_rate": 8.774176626869051e-06, + "loss": 0.7281, + "step": 8352 + }, + { + "epoch": 0.45973911607683415, + "grad_norm": 0.7840244770050049, + "learning_rate": 8.77389229571318e-06, + "loss": 0.7194, + "step": 8353 + }, + { + "epoch": 0.45979415487918984, + "grad_norm": 0.7920441627502441, + "learning_rate": 8.773607936193747e-06, + "loss": 0.7135, + "step": 8354 + }, + { + "epoch": 0.4598491936815455, + "grad_norm": 0.7395668625831604, + "learning_rate": 8.773323548312884e-06, + "loss": 0.8162, + "step": 8355 + }, + { + "epoch": 0.45990423248390117, + "grad_norm": 0.7854128479957581, + "learning_rate": 8.773039132072734e-06, + "loss": 0.8252, + "step": 8356 + }, + { + "epoch": 0.4599592712862568, + "grad_norm": 0.694997251033783, + "learning_rate": 8.772754687475431e-06, + "loss": 0.6627, + "step": 8357 + }, + { + "epoch": 0.4600143100886125, + "grad_norm": 0.7698866724967957, + "learning_rate": 8.772470214523112e-06, + "loss": 0.8814, + "step": 8358 + }, + { + "epoch": 0.4600693488909681, + "grad_norm": 0.7323407530784607, + "learning_rate": 8.77218571321792e-06, + "loss": 0.7769, + "step": 8359 + }, + { + "epoch": 0.4601243876933238, + "grad_norm": 0.6637027263641357, + "learning_rate": 8.771901183561986e-06, + "loss": 0.6741, + "step": 8360 + }, + { + "epoch": 0.46017942649567944, + "grad_norm": 0.7423702478408813, + "learning_rate": 8.771616625557455e-06, + "loss": 0.7303, + "step": 8361 + }, + { + "epoch": 0.46023446529803513, + "grad_norm": 0.7599568367004395, + "learning_rate": 8.771332039206463e-06, + "loss": 0.8161, + "step": 8362 + }, + { + "epoch": 0.46028950410039077, + "grad_norm": 0.9063183069229126, + "learning_rate": 8.771047424511148e-06, + "loss": 0.8098, + "step": 8363 + }, + { + "epoch": 0.46034454290274646, + "grad_norm": 0.658210813999176, + "learning_rate": 8.770762781473651e-06, + "loss": 0.7097, + "step": 8364 + }, + { + "epoch": 0.4603995817051021, + "grad_norm": 0.8396975994110107, + "learning_rate": 8.770478110096111e-06, + "loss": 0.8731, + "step": 8365 + }, + { + "epoch": 0.4604546205074578, + "grad_norm": 0.7334815263748169, + "learning_rate": 8.770193410380663e-06, + "loss": 0.7689, + "step": 8366 + }, + { + "epoch": 0.4605096593098134, + "grad_norm": 0.8220386505126953, + "learning_rate": 8.769908682329453e-06, + "loss": 0.8139, + "step": 8367 + }, + { + "epoch": 0.4605646981121691, + "grad_norm": 0.8077995181083679, + "learning_rate": 8.76962392594462e-06, + "loss": 0.7379, + "step": 8368 + }, + { + "epoch": 0.46061973691452474, + "grad_norm": 0.8007730841636658, + "learning_rate": 8.7693391412283e-06, + "loss": 0.7835, + "step": 8369 + }, + { + "epoch": 0.4606747757168804, + "grad_norm": 0.7108187079429626, + "learning_rate": 8.769054328182637e-06, + "loss": 0.6787, + "step": 8370 + }, + { + "epoch": 0.46072981451923606, + "grad_norm": 0.7623056173324585, + "learning_rate": 8.768769486809772e-06, + "loss": 0.8056, + "step": 8371 + }, + { + "epoch": 0.46078485332159175, + "grad_norm": 0.6991614103317261, + "learning_rate": 8.768484617111843e-06, + "loss": 0.7404, + "step": 8372 + }, + { + "epoch": 0.4608398921239474, + "grad_norm": 0.7531471848487854, + "learning_rate": 8.768199719090991e-06, + "loss": 0.8104, + "step": 8373 + }, + { + "epoch": 0.46089493092630307, + "grad_norm": 1.0271111726760864, + "learning_rate": 8.76791479274936e-06, + "loss": 0.9028, + "step": 8374 + }, + { + "epoch": 0.4609499697286587, + "grad_norm": 0.7346897125244141, + "learning_rate": 8.76762983808909e-06, + "loss": 0.8179, + "step": 8375 + }, + { + "epoch": 0.46100500853101434, + "grad_norm": 0.6413559913635254, + "learning_rate": 8.767344855112324e-06, + "loss": 0.7995, + "step": 8376 + }, + { + "epoch": 0.46106004733337, + "grad_norm": 0.7187537550926208, + "learning_rate": 8.767059843821199e-06, + "loss": 0.7973, + "step": 8377 + }, + { + "epoch": 0.46111508613572566, + "grad_norm": 0.6819092035293579, + "learning_rate": 8.766774804217864e-06, + "loss": 0.8255, + "step": 8378 + }, + { + "epoch": 0.46117012493808135, + "grad_norm": 0.683318018913269, + "learning_rate": 8.766489736304457e-06, + "loss": 0.6794, + "step": 8379 + }, + { + "epoch": 0.461225163740437, + "grad_norm": 0.7345470786094666, + "learning_rate": 8.76620464008312e-06, + "loss": 0.8741, + "step": 8380 + }, + { + "epoch": 0.46128020254279267, + "grad_norm": 0.7369397282600403, + "learning_rate": 8.765919515556e-06, + "loss": 0.8301, + "step": 8381 + }, + { + "epoch": 0.4613352413451483, + "grad_norm": 0.7304979562759399, + "learning_rate": 8.765634362725233e-06, + "loss": 0.7507, + "step": 8382 + }, + { + "epoch": 0.461390280147504, + "grad_norm": 0.7968454957008362, + "learning_rate": 8.765349181592969e-06, + "loss": 0.7396, + "step": 8383 + }, + { + "epoch": 0.46144531894985963, + "grad_norm": 0.691439151763916, + "learning_rate": 8.765063972161347e-06, + "loss": 0.7199, + "step": 8384 + }, + { + "epoch": 0.4615003577522153, + "grad_norm": 0.8355879187583923, + "learning_rate": 8.764778734432513e-06, + "loss": 0.7369, + "step": 8385 + }, + { + "epoch": 0.46155539655457095, + "grad_norm": 0.908017098903656, + "learning_rate": 8.76449346840861e-06, + "loss": 0.8271, + "step": 8386 + }, + { + "epoch": 0.46161043535692664, + "grad_norm": 0.6426172852516174, + "learning_rate": 8.764208174091781e-06, + "loss": 0.6646, + "step": 8387 + }, + { + "epoch": 0.4616654741592823, + "grad_norm": 0.7003652453422546, + "learning_rate": 8.763922851484171e-06, + "loss": 0.7272, + "step": 8388 + }, + { + "epoch": 0.46172051296163796, + "grad_norm": 0.7470494508743286, + "learning_rate": 8.763637500587925e-06, + "loss": 0.8333, + "step": 8389 + }, + { + "epoch": 0.4617755517639936, + "grad_norm": 0.6974903345108032, + "learning_rate": 8.763352121405187e-06, + "loss": 0.834, + "step": 8390 + }, + { + "epoch": 0.4618305905663493, + "grad_norm": 0.8146659135818481, + "learning_rate": 8.7630667139381e-06, + "loss": 0.724, + "step": 8391 + }, + { + "epoch": 0.4618856293687049, + "grad_norm": 0.6614096164703369, + "learning_rate": 8.762781278188813e-06, + "loss": 0.6822, + "step": 8392 + }, + { + "epoch": 0.4619406681710606, + "grad_norm": 0.712944746017456, + "learning_rate": 8.762495814159469e-06, + "loss": 0.7864, + "step": 8393 + }, + { + "epoch": 0.46199570697341624, + "grad_norm": 0.7531552910804749, + "learning_rate": 8.762210321852213e-06, + "loss": 0.7494, + "step": 8394 + }, + { + "epoch": 0.46205074577577193, + "grad_norm": 0.8150199055671692, + "learning_rate": 8.761924801269191e-06, + "loss": 0.7869, + "step": 8395 + }, + { + "epoch": 0.46210578457812757, + "grad_norm": 0.8586462736129761, + "learning_rate": 8.76163925241255e-06, + "loss": 0.7647, + "step": 8396 + }, + { + "epoch": 0.46216082338048325, + "grad_norm": 0.7258061766624451, + "learning_rate": 8.761353675284434e-06, + "loss": 0.7672, + "step": 8397 + }, + { + "epoch": 0.4622158621828389, + "grad_norm": 0.6592851281166077, + "learning_rate": 8.761068069886992e-06, + "loss": 0.7488, + "step": 8398 + }, + { + "epoch": 0.4622709009851946, + "grad_norm": 0.7410836219787598, + "learning_rate": 8.760782436222368e-06, + "loss": 0.6669, + "step": 8399 + }, + { + "epoch": 0.4623259397875502, + "grad_norm": 0.7121642231941223, + "learning_rate": 8.76049677429271e-06, + "loss": 0.7005, + "step": 8400 + }, + { + "epoch": 0.4623809785899059, + "grad_norm": 0.7170663475990295, + "learning_rate": 8.760211084100166e-06, + "loss": 0.8154, + "step": 8401 + }, + { + "epoch": 0.46243601739226153, + "grad_norm": 0.6851769685745239, + "learning_rate": 8.759925365646882e-06, + "loss": 0.7948, + "step": 8402 + }, + { + "epoch": 0.4624910561946172, + "grad_norm": 0.7728533744812012, + "learning_rate": 8.759639618935006e-06, + "loss": 0.8263, + "step": 8403 + }, + { + "epoch": 0.46254609499697286, + "grad_norm": 0.7276784777641296, + "learning_rate": 8.759353843966682e-06, + "loss": 0.6992, + "step": 8404 + }, + { + "epoch": 0.46260113379932855, + "grad_norm": 0.7533649802207947, + "learning_rate": 8.759068040744063e-06, + "loss": 0.7744, + "step": 8405 + }, + { + "epoch": 0.4626561726016842, + "grad_norm": 0.6911979913711548, + "learning_rate": 8.758782209269294e-06, + "loss": 0.6977, + "step": 8406 + }, + { + "epoch": 0.46271121140403987, + "grad_norm": 0.6723766922950745, + "learning_rate": 8.758496349544526e-06, + "loss": 0.7286, + "step": 8407 + }, + { + "epoch": 0.4627662502063955, + "grad_norm": 0.7327921390533447, + "learning_rate": 8.758210461571903e-06, + "loss": 0.7708, + "step": 8408 + }, + { + "epoch": 0.4628212890087512, + "grad_norm": 0.7498626708984375, + "learning_rate": 8.757924545353578e-06, + "loss": 0.7476, + "step": 8409 + }, + { + "epoch": 0.4628763278111068, + "grad_norm": 0.8944914937019348, + "learning_rate": 8.757638600891696e-06, + "loss": 0.7814, + "step": 8410 + }, + { + "epoch": 0.4629313666134625, + "grad_norm": 0.7242841124534607, + "learning_rate": 8.757352628188411e-06, + "loss": 0.7564, + "step": 8411 + }, + { + "epoch": 0.46298640541581815, + "grad_norm": 0.6706324815750122, + "learning_rate": 8.757066627245866e-06, + "loss": 0.7792, + "step": 8412 + }, + { + "epoch": 0.46304144421817384, + "grad_norm": 0.8044155836105347, + "learning_rate": 8.756780598066218e-06, + "loss": 0.7873, + "step": 8413 + }, + { + "epoch": 0.46309648302052947, + "grad_norm": 0.9265295267105103, + "learning_rate": 8.75649454065161e-06, + "loss": 0.878, + "step": 8414 + }, + { + "epoch": 0.46315152182288516, + "grad_norm": 0.8162378668785095, + "learning_rate": 8.756208455004194e-06, + "loss": 0.8758, + "step": 8415 + }, + { + "epoch": 0.4632065606252408, + "grad_norm": 0.7081401348114014, + "learning_rate": 8.755922341126121e-06, + "loss": 0.8053, + "step": 8416 + }, + { + "epoch": 0.4632615994275965, + "grad_norm": 0.663885235786438, + "learning_rate": 8.755636199019544e-06, + "loss": 0.7456, + "step": 8417 + }, + { + "epoch": 0.4633166382299521, + "grad_norm": 0.6934974193572998, + "learning_rate": 8.755350028686608e-06, + "loss": 0.7316, + "step": 8418 + }, + { + "epoch": 0.46337167703230775, + "grad_norm": 0.7162168025970459, + "learning_rate": 8.755063830129467e-06, + "loss": 0.8566, + "step": 8419 + }, + { + "epoch": 0.46342671583466344, + "grad_norm": 0.7507640719413757, + "learning_rate": 8.75477760335027e-06, + "loss": 0.8141, + "step": 8420 + }, + { + "epoch": 0.46348175463701907, + "grad_norm": 0.6853382587432861, + "learning_rate": 8.754491348351172e-06, + "loss": 0.6995, + "step": 8421 + }, + { + "epoch": 0.46353679343937476, + "grad_norm": 0.6421381831169128, + "learning_rate": 8.75420506513432e-06, + "loss": 0.6344, + "step": 8422 + }, + { + "epoch": 0.4635918322417304, + "grad_norm": 0.8042624592781067, + "learning_rate": 8.753918753701868e-06, + "loss": 0.7506, + "step": 8423 + }, + { + "epoch": 0.4636468710440861, + "grad_norm": 0.7184088230133057, + "learning_rate": 8.753632414055969e-06, + "loss": 0.7997, + "step": 8424 + }, + { + "epoch": 0.4637019098464417, + "grad_norm": 0.749919593334198, + "learning_rate": 8.753346046198773e-06, + "loss": 0.8168, + "step": 8425 + }, + { + "epoch": 0.4637569486487974, + "grad_norm": 0.6583670973777771, + "learning_rate": 8.753059650132433e-06, + "loss": 0.6615, + "step": 8426 + }, + { + "epoch": 0.46381198745115304, + "grad_norm": 0.7560496926307678, + "learning_rate": 8.7527732258591e-06, + "loss": 0.7221, + "step": 8427 + }, + { + "epoch": 0.46386702625350873, + "grad_norm": 0.7031972408294678, + "learning_rate": 8.752486773380928e-06, + "loss": 0.8124, + "step": 8428 + }, + { + "epoch": 0.46392206505586436, + "grad_norm": 0.684124767780304, + "learning_rate": 8.752200292700072e-06, + "loss": 0.6862, + "step": 8429 + }, + { + "epoch": 0.46397710385822005, + "grad_norm": 0.8015589118003845, + "learning_rate": 8.751913783818682e-06, + "loss": 0.7863, + "step": 8430 + }, + { + "epoch": 0.4640321426605757, + "grad_norm": 0.6815705299377441, + "learning_rate": 8.751627246738912e-06, + "loss": 0.8116, + "step": 8431 + }, + { + "epoch": 0.4640871814629314, + "grad_norm": 0.7402058839797974, + "learning_rate": 8.751340681462914e-06, + "loss": 0.7341, + "step": 8432 + }, + { + "epoch": 0.464142220265287, + "grad_norm": 0.7484470009803772, + "learning_rate": 8.751054087992848e-06, + "loss": 0.8103, + "step": 8433 + }, + { + "epoch": 0.4641972590676427, + "grad_norm": 0.8148707151412964, + "learning_rate": 8.75076746633086e-06, + "loss": 0.8995, + "step": 8434 + }, + { + "epoch": 0.46425229786999833, + "grad_norm": 0.6403086185455322, + "learning_rate": 8.750480816479107e-06, + "loss": 0.6705, + "step": 8435 + }, + { + "epoch": 0.464307336672354, + "grad_norm": 0.7787690758705139, + "learning_rate": 8.750194138439748e-06, + "loss": 0.854, + "step": 8436 + }, + { + "epoch": 0.46436237547470965, + "grad_norm": 0.6975393891334534, + "learning_rate": 8.749907432214931e-06, + "loss": 0.7588, + "step": 8437 + }, + { + "epoch": 0.46441741427706534, + "grad_norm": 0.8002430200576782, + "learning_rate": 8.749620697806812e-06, + "loss": 0.8244, + "step": 8438 + }, + { + "epoch": 0.464472453079421, + "grad_norm": 0.8049100637435913, + "learning_rate": 8.74933393521755e-06, + "loss": 0.7686, + "step": 8439 + }, + { + "epoch": 0.46452749188177667, + "grad_norm": 0.6716971397399902, + "learning_rate": 8.749047144449298e-06, + "loss": 0.7823, + "step": 8440 + }, + { + "epoch": 0.4645825306841323, + "grad_norm": 0.7292011380195618, + "learning_rate": 8.748760325504212e-06, + "loss": 0.7643, + "step": 8441 + }, + { + "epoch": 0.464637569486488, + "grad_norm": 0.6823335886001587, + "learning_rate": 8.748473478384444e-06, + "loss": 0.7539, + "step": 8442 + }, + { + "epoch": 0.4646926082888436, + "grad_norm": 0.761730968952179, + "learning_rate": 8.748186603092155e-06, + "loss": 0.7279, + "step": 8443 + }, + { + "epoch": 0.4647476470911993, + "grad_norm": 0.694007933139801, + "learning_rate": 8.747899699629498e-06, + "loss": 0.7907, + "step": 8444 + }, + { + "epoch": 0.46480268589355495, + "grad_norm": 0.7638683319091797, + "learning_rate": 8.74761276799863e-06, + "loss": 0.7278, + "step": 8445 + }, + { + "epoch": 0.46485772469591063, + "grad_norm": 0.6281229853630066, + "learning_rate": 8.747325808201708e-06, + "loss": 0.6609, + "step": 8446 + }, + { + "epoch": 0.46491276349826627, + "grad_norm": 0.7273259162902832, + "learning_rate": 8.747038820240887e-06, + "loss": 0.7553, + "step": 8447 + }, + { + "epoch": 0.46496780230062196, + "grad_norm": 0.807482898235321, + "learning_rate": 8.746751804118326e-06, + "loss": 0.7783, + "step": 8448 + }, + { + "epoch": 0.4650228411029776, + "grad_norm": 0.7088230848312378, + "learning_rate": 8.746464759836182e-06, + "loss": 0.762, + "step": 8449 + }, + { + "epoch": 0.4650778799053333, + "grad_norm": 0.7039850354194641, + "learning_rate": 8.746177687396612e-06, + "loss": 0.7811, + "step": 8450 + }, + { + "epoch": 0.4651329187076889, + "grad_norm": 0.7154161334037781, + "learning_rate": 8.745890586801773e-06, + "loss": 0.76, + "step": 8451 + }, + { + "epoch": 0.4651879575100446, + "grad_norm": 0.6738846302032471, + "learning_rate": 8.745603458053822e-06, + "loss": 0.7119, + "step": 8452 + }, + { + "epoch": 0.46524299631240024, + "grad_norm": 0.6615753173828125, + "learning_rate": 8.745316301154919e-06, + "loss": 0.8061, + "step": 8453 + }, + { + "epoch": 0.4652980351147559, + "grad_norm": 0.7285076379776001, + "learning_rate": 8.74502911610722e-06, + "loss": 0.7522, + "step": 8454 + }, + { + "epoch": 0.46535307391711156, + "grad_norm": 0.7100732922554016, + "learning_rate": 8.744741902912886e-06, + "loss": 0.7665, + "step": 8455 + }, + { + "epoch": 0.46540811271946725, + "grad_norm": 0.6564487814903259, + "learning_rate": 8.744454661574074e-06, + "loss": 0.7352, + "step": 8456 + }, + { + "epoch": 0.4654631515218229, + "grad_norm": 0.689549446105957, + "learning_rate": 8.744167392092944e-06, + "loss": 0.7011, + "step": 8457 + }, + { + "epoch": 0.46551819032417857, + "grad_norm": 0.6660958528518677, + "learning_rate": 8.743880094471651e-06, + "loss": 0.7074, + "step": 8458 + }, + { + "epoch": 0.4655732291265342, + "grad_norm": 0.7470804452896118, + "learning_rate": 8.743592768712361e-06, + "loss": 0.6684, + "step": 8459 + }, + { + "epoch": 0.4656282679288899, + "grad_norm": 0.8058002591133118, + "learning_rate": 8.743305414817227e-06, + "loss": 0.7945, + "step": 8460 + }, + { + "epoch": 0.4656833067312455, + "grad_norm": 0.7756261825561523, + "learning_rate": 8.743018032788413e-06, + "loss": 0.8442, + "step": 8461 + }, + { + "epoch": 0.46573834553360116, + "grad_norm": 0.9267478585243225, + "learning_rate": 8.742730622628077e-06, + "loss": 0.8721, + "step": 8462 + }, + { + "epoch": 0.46579338433595685, + "grad_norm": 0.8684219121932983, + "learning_rate": 8.74244318433838e-06, + "loss": 0.7833, + "step": 8463 + }, + { + "epoch": 0.4658484231383125, + "grad_norm": 0.7060475945472717, + "learning_rate": 8.742155717921481e-06, + "loss": 0.7724, + "step": 8464 + }, + { + "epoch": 0.4659034619406682, + "grad_norm": 0.7316318154335022, + "learning_rate": 8.741868223379543e-06, + "loss": 0.7489, + "step": 8465 + }, + { + "epoch": 0.4659585007430238, + "grad_norm": 0.8131282925605774, + "learning_rate": 8.741580700714724e-06, + "loss": 0.7453, + "step": 8466 + }, + { + "epoch": 0.4660135395453795, + "grad_norm": 0.6985850930213928, + "learning_rate": 8.741293149929187e-06, + "loss": 0.7083, + "step": 8467 + }, + { + "epoch": 0.46606857834773513, + "grad_norm": 0.7512301206588745, + "learning_rate": 8.74100557102509e-06, + "loss": 0.7343, + "step": 8468 + }, + { + "epoch": 0.4661236171500908, + "grad_norm": 0.7547290921211243, + "learning_rate": 8.740717964004596e-06, + "loss": 0.8358, + "step": 8469 + }, + { + "epoch": 0.46617865595244645, + "grad_norm": 0.9091271758079529, + "learning_rate": 8.740430328869868e-06, + "loss": 0.762, + "step": 8470 + }, + { + "epoch": 0.46623369475480214, + "grad_norm": 0.6960130333900452, + "learning_rate": 8.740142665623069e-06, + "loss": 0.7317, + "step": 8471 + }, + { + "epoch": 0.4662887335571578, + "grad_norm": 0.684309184551239, + "learning_rate": 8.739854974266357e-06, + "loss": 0.7653, + "step": 8472 + }, + { + "epoch": 0.46634377235951346, + "grad_norm": 0.7669411301612854, + "learning_rate": 8.739567254801898e-06, + "loss": 0.7152, + "step": 8473 + }, + { + "epoch": 0.4663988111618691, + "grad_norm": 0.7072784900665283, + "learning_rate": 8.73927950723185e-06, + "loss": 0.7508, + "step": 8474 + }, + { + "epoch": 0.4664538499642248, + "grad_norm": 0.7249277234077454, + "learning_rate": 8.73899173155838e-06, + "loss": 0.7469, + "step": 8475 + }, + { + "epoch": 0.4665088887665804, + "grad_norm": 0.7664750218391418, + "learning_rate": 8.738703927783647e-06, + "loss": 0.8692, + "step": 8476 + }, + { + "epoch": 0.4665639275689361, + "grad_norm": 0.7579765319824219, + "learning_rate": 8.738416095909818e-06, + "loss": 0.8283, + "step": 8477 + }, + { + "epoch": 0.46661896637129174, + "grad_norm": 0.7066456079483032, + "learning_rate": 8.738128235939054e-06, + "loss": 0.7125, + "step": 8478 + }, + { + "epoch": 0.46667400517364743, + "grad_norm": 0.766106367111206, + "learning_rate": 8.737840347873518e-06, + "loss": 0.7683, + "step": 8479 + }, + { + "epoch": 0.46672904397600307, + "grad_norm": 0.7599226236343384, + "learning_rate": 8.737552431715374e-06, + "loss": 0.8375, + "step": 8480 + }, + { + "epoch": 0.46678408277835876, + "grad_norm": 0.6955341100692749, + "learning_rate": 8.737264487466789e-06, + "loss": 0.7012, + "step": 8481 + }, + { + "epoch": 0.4668391215807144, + "grad_norm": 0.6096246242523193, + "learning_rate": 8.736976515129923e-06, + "loss": 0.6126, + "step": 8482 + }, + { + "epoch": 0.4668941603830701, + "grad_norm": 0.7469536066055298, + "learning_rate": 8.73668851470694e-06, + "loss": 0.7675, + "step": 8483 + }, + { + "epoch": 0.4669491991854257, + "grad_norm": 0.8018775582313538, + "learning_rate": 8.73640048620001e-06, + "loss": 0.7372, + "step": 8484 + }, + { + "epoch": 0.4670042379877814, + "grad_norm": 0.7446827292442322, + "learning_rate": 8.736112429611293e-06, + "loss": 0.7277, + "step": 8485 + }, + { + "epoch": 0.46705927679013703, + "grad_norm": 0.6292026042938232, + "learning_rate": 8.735824344942954e-06, + "loss": 0.6172, + "step": 8486 + }, + { + "epoch": 0.4671143155924927, + "grad_norm": 0.7207980751991272, + "learning_rate": 8.735536232197159e-06, + "loss": 0.8363, + "step": 8487 + }, + { + "epoch": 0.46716935439484836, + "grad_norm": 0.8585891127586365, + "learning_rate": 8.735248091376073e-06, + "loss": 0.8006, + "step": 8488 + }, + { + "epoch": 0.46722439319720405, + "grad_norm": 0.8149702548980713, + "learning_rate": 8.734959922481863e-06, + "loss": 0.7869, + "step": 8489 + }, + { + "epoch": 0.4672794319995597, + "grad_norm": 0.7113268971443176, + "learning_rate": 8.734671725516695e-06, + "loss": 0.7774, + "step": 8490 + }, + { + "epoch": 0.46733447080191537, + "grad_norm": 0.6940683722496033, + "learning_rate": 8.734383500482733e-06, + "loss": 0.7157, + "step": 8491 + }, + { + "epoch": 0.467389509604271, + "grad_norm": 0.7823536396026611, + "learning_rate": 8.734095247382145e-06, + "loss": 0.8161, + "step": 8492 + }, + { + "epoch": 0.4674445484066267, + "grad_norm": 0.7094922065734863, + "learning_rate": 8.733806966217096e-06, + "loss": 0.7593, + "step": 8493 + }, + { + "epoch": 0.4674995872089823, + "grad_norm": 0.656432569026947, + "learning_rate": 8.733518656989753e-06, + "loss": 0.7853, + "step": 8494 + }, + { + "epoch": 0.467554626011338, + "grad_norm": 0.6715715527534485, + "learning_rate": 8.733230319702284e-06, + "loss": 0.839, + "step": 8495 + }, + { + "epoch": 0.46760966481369365, + "grad_norm": 0.7496705055236816, + "learning_rate": 8.732941954356854e-06, + "loss": 0.8231, + "step": 8496 + }, + { + "epoch": 0.46766470361604934, + "grad_norm": 0.7728047370910645, + "learning_rate": 8.732653560955635e-06, + "loss": 0.7852, + "step": 8497 + }, + { + "epoch": 0.46771974241840497, + "grad_norm": 1.5637458562850952, + "learning_rate": 8.732365139500787e-06, + "loss": 0.7749, + "step": 8498 + }, + { + "epoch": 0.46777478122076066, + "grad_norm": 0.6603190898895264, + "learning_rate": 8.732076689994484e-06, + "loss": 0.6628, + "step": 8499 + }, + { + "epoch": 0.4678298200231163, + "grad_norm": 0.7170974612236023, + "learning_rate": 8.73178821243889e-06, + "loss": 0.7855, + "step": 8500 + }, + { + "epoch": 0.467884858825472, + "grad_norm": 0.7220103740692139, + "learning_rate": 8.731499706836175e-06, + "loss": 0.7035, + "step": 8501 + }, + { + "epoch": 0.4679398976278276, + "grad_norm": 0.6940942406654358, + "learning_rate": 8.731211173188507e-06, + "loss": 0.7857, + "step": 8502 + }, + { + "epoch": 0.4679949364301833, + "grad_norm": 2.441596508026123, + "learning_rate": 8.730922611498057e-06, + "loss": 0.695, + "step": 8503 + }, + { + "epoch": 0.46804997523253894, + "grad_norm": 0.7654910087585449, + "learning_rate": 8.730634021766989e-06, + "loss": 0.788, + "step": 8504 + }, + { + "epoch": 0.4681050140348946, + "grad_norm": 0.791824996471405, + "learning_rate": 8.730345403997475e-06, + "loss": 0.7899, + "step": 8505 + }, + { + "epoch": 0.46816005283725026, + "grad_norm": 0.6863934993743896, + "learning_rate": 8.730056758191682e-06, + "loss": 0.7402, + "step": 8506 + }, + { + "epoch": 0.4682150916396059, + "grad_norm": 0.7920359373092651, + "learning_rate": 8.729768084351783e-06, + "loss": 0.7835, + "step": 8507 + }, + { + "epoch": 0.4682701304419616, + "grad_norm": 0.7077129483222961, + "learning_rate": 8.729479382479944e-06, + "loss": 0.7761, + "step": 8508 + }, + { + "epoch": 0.4683251692443172, + "grad_norm": 0.6870049238204956, + "learning_rate": 8.729190652578337e-06, + "loss": 0.8169, + "step": 8509 + }, + { + "epoch": 0.4683802080466729, + "grad_norm": 0.6802713871002197, + "learning_rate": 8.728901894649131e-06, + "loss": 0.7914, + "step": 8510 + }, + { + "epoch": 0.46843524684902854, + "grad_norm": 0.6645112633705139, + "learning_rate": 8.728613108694497e-06, + "loss": 0.7543, + "step": 8511 + }, + { + "epoch": 0.46849028565138423, + "grad_norm": 0.708292543888092, + "learning_rate": 8.728324294716604e-06, + "loss": 0.7015, + "step": 8512 + }, + { + "epoch": 0.46854532445373986, + "grad_norm": 0.7444465160369873, + "learning_rate": 8.728035452717625e-06, + "loss": 0.7999, + "step": 8513 + }, + { + "epoch": 0.46860036325609555, + "grad_norm": 0.7028616666793823, + "learning_rate": 8.727746582699728e-06, + "loss": 0.8094, + "step": 8514 + }, + { + "epoch": 0.4686554020584512, + "grad_norm": 0.7063208222389221, + "learning_rate": 8.727457684665088e-06, + "loss": 0.8028, + "step": 8515 + }, + { + "epoch": 0.4687104408608069, + "grad_norm": 0.8455138802528381, + "learning_rate": 8.727168758615871e-06, + "loss": 0.7691, + "step": 8516 + }, + { + "epoch": 0.4687654796631625, + "grad_norm": 1.0325778722763062, + "learning_rate": 8.726879804554252e-06, + "loss": 0.7042, + "step": 8517 + }, + { + "epoch": 0.4688205184655182, + "grad_norm": 0.7352754473686218, + "learning_rate": 8.726590822482402e-06, + "loss": 0.8467, + "step": 8518 + }, + { + "epoch": 0.46887555726787383, + "grad_norm": 0.7247193455696106, + "learning_rate": 8.726301812402494e-06, + "loss": 0.8034, + "step": 8519 + }, + { + "epoch": 0.4689305960702295, + "grad_norm": 0.6876820921897888, + "learning_rate": 8.726012774316699e-06, + "loss": 0.7308, + "step": 8520 + }, + { + "epoch": 0.46898563487258516, + "grad_norm": 0.6987231969833374, + "learning_rate": 8.725723708227188e-06, + "loss": 0.7655, + "step": 8521 + }, + { + "epoch": 0.46904067367494084, + "grad_norm": 0.7471843361854553, + "learning_rate": 8.725434614136135e-06, + "loss": 0.7271, + "step": 8522 + }, + { + "epoch": 0.4690957124772965, + "grad_norm": 0.7564642429351807, + "learning_rate": 8.725145492045715e-06, + "loss": 0.7335, + "step": 8523 + }, + { + "epoch": 0.46915075127965217, + "grad_norm": 0.7488992214202881, + "learning_rate": 8.724856341958095e-06, + "loss": 0.8815, + "step": 8524 + }, + { + "epoch": 0.4692057900820078, + "grad_norm": 0.6776759028434753, + "learning_rate": 8.724567163875455e-06, + "loss": 0.7452, + "step": 8525 + }, + { + "epoch": 0.4692608288843635, + "grad_norm": 0.6905981302261353, + "learning_rate": 8.724277957799963e-06, + "loss": 0.6815, + "step": 8526 + }, + { + "epoch": 0.4693158676867191, + "grad_norm": 0.7392297983169556, + "learning_rate": 8.723988723733795e-06, + "loss": 0.7546, + "step": 8527 + }, + { + "epoch": 0.4693709064890748, + "grad_norm": 0.7479110360145569, + "learning_rate": 8.723699461679128e-06, + "loss": 0.7455, + "step": 8528 + }, + { + "epoch": 0.46942594529143045, + "grad_norm": 0.7231360673904419, + "learning_rate": 8.723410171638129e-06, + "loss": 0.7611, + "step": 8529 + }, + { + "epoch": 0.46948098409378614, + "grad_norm": 0.7493714690208435, + "learning_rate": 8.723120853612976e-06, + "loss": 0.6997, + "step": 8530 + }, + { + "epoch": 0.46953602289614177, + "grad_norm": 0.8056793808937073, + "learning_rate": 8.722831507605844e-06, + "loss": 0.7431, + "step": 8531 + }, + { + "epoch": 0.46959106169849746, + "grad_norm": 0.7528547048568726, + "learning_rate": 8.722542133618907e-06, + "loss": 0.8798, + "step": 8532 + }, + { + "epoch": 0.4696461005008531, + "grad_norm": 0.6964863538742065, + "learning_rate": 8.72225273165434e-06, + "loss": 0.8462, + "step": 8533 + }, + { + "epoch": 0.4697011393032088, + "grad_norm": 0.7354302406311035, + "learning_rate": 8.721963301714318e-06, + "loss": 0.7882, + "step": 8534 + }, + { + "epoch": 0.4697561781055644, + "grad_norm": 0.7365205883979797, + "learning_rate": 8.721673843801014e-06, + "loss": 0.7483, + "step": 8535 + }, + { + "epoch": 0.4698112169079201, + "grad_norm": 0.7485378384590149, + "learning_rate": 8.72138435791661e-06, + "loss": 0.8539, + "step": 8536 + }, + { + "epoch": 0.46986625571027574, + "grad_norm": 0.7674353718757629, + "learning_rate": 8.721094844063274e-06, + "loss": 0.834, + "step": 8537 + }, + { + "epoch": 0.4699212945126314, + "grad_norm": 0.7054184079170227, + "learning_rate": 8.720805302243185e-06, + "loss": 0.7938, + "step": 8538 + }, + { + "epoch": 0.46997633331498706, + "grad_norm": 0.7414574027061462, + "learning_rate": 8.72051573245852e-06, + "loss": 0.7932, + "step": 8539 + }, + { + "epoch": 0.47003137211734275, + "grad_norm": 0.6734428405761719, + "learning_rate": 8.720226134711455e-06, + "loss": 0.8775, + "step": 8540 + }, + { + "epoch": 0.4700864109196984, + "grad_norm": 0.6588559150695801, + "learning_rate": 8.719936509004166e-06, + "loss": 0.6985, + "step": 8541 + }, + { + "epoch": 0.4701414497220541, + "grad_norm": 0.6557223200798035, + "learning_rate": 8.71964685533883e-06, + "loss": 0.7243, + "step": 8542 + }, + { + "epoch": 0.4701964885244097, + "grad_norm": 0.7876269221305847, + "learning_rate": 8.719357173717624e-06, + "loss": 0.8075, + "step": 8543 + }, + { + "epoch": 0.4702515273267654, + "grad_norm": 0.8346554040908813, + "learning_rate": 8.719067464142726e-06, + "loss": 0.8427, + "step": 8544 + }, + { + "epoch": 0.47030656612912103, + "grad_norm": 0.7190483808517456, + "learning_rate": 8.718777726616311e-06, + "loss": 0.7689, + "step": 8545 + }, + { + "epoch": 0.4703616049314767, + "grad_norm": 1.303118109703064, + "learning_rate": 8.718487961140558e-06, + "loss": 0.7537, + "step": 8546 + }, + { + "epoch": 0.47041664373383235, + "grad_norm": 0.7733024954795837, + "learning_rate": 8.718198167717647e-06, + "loss": 0.747, + "step": 8547 + }, + { + "epoch": 0.470471682536188, + "grad_norm": 0.6692484617233276, + "learning_rate": 8.717908346349751e-06, + "loss": 0.725, + "step": 8548 + }, + { + "epoch": 0.4705267213385437, + "grad_norm": 0.9639461636543274, + "learning_rate": 8.717618497039054e-06, + "loss": 0.8642, + "step": 8549 + }, + { + "epoch": 0.4705817601408993, + "grad_norm": 0.7584646344184875, + "learning_rate": 8.717328619787728e-06, + "loss": 0.8174, + "step": 8550 + }, + { + "epoch": 0.470636798943255, + "grad_norm": 0.7051709890365601, + "learning_rate": 8.717038714597957e-06, + "loss": 0.7962, + "step": 8551 + }, + { + "epoch": 0.47069183774561063, + "grad_norm": 0.738913893699646, + "learning_rate": 8.716748781471918e-06, + "loss": 0.7367, + "step": 8552 + }, + { + "epoch": 0.4707468765479663, + "grad_norm": 0.7027214169502258, + "learning_rate": 8.716458820411791e-06, + "loss": 0.7613, + "step": 8553 + }, + { + "epoch": 0.47080191535032195, + "grad_norm": 0.6701993346214294, + "learning_rate": 8.716168831419754e-06, + "loss": 0.638, + "step": 8554 + }, + { + "epoch": 0.47085695415267764, + "grad_norm": 0.7422072887420654, + "learning_rate": 8.715878814497984e-06, + "loss": 0.8338, + "step": 8555 + }, + { + "epoch": 0.4709119929550333, + "grad_norm": 0.985992968082428, + "learning_rate": 8.715588769648667e-06, + "loss": 0.7765, + "step": 8556 + }, + { + "epoch": 0.47096703175738897, + "grad_norm": 0.6937553882598877, + "learning_rate": 8.715298696873978e-06, + "loss": 0.7306, + "step": 8557 + }, + { + "epoch": 0.4710220705597446, + "grad_norm": 1.1683214902877808, + "learning_rate": 8.715008596176099e-06, + "loss": 0.7782, + "step": 8558 + }, + { + "epoch": 0.4710771093621003, + "grad_norm": 0.7493681907653809, + "learning_rate": 8.714718467557209e-06, + "loss": 0.9166, + "step": 8559 + }, + { + "epoch": 0.4711321481644559, + "grad_norm": 0.7562084794044495, + "learning_rate": 8.71442831101949e-06, + "loss": 0.7999, + "step": 8560 + }, + { + "epoch": 0.4711871869668116, + "grad_norm": 0.7950266003608704, + "learning_rate": 8.71413812656512e-06, + "loss": 0.8094, + "step": 8561 + }, + { + "epoch": 0.47124222576916724, + "grad_norm": 1.1411044597625732, + "learning_rate": 8.713847914196287e-06, + "loss": 0.7631, + "step": 8562 + }, + { + "epoch": 0.47129726457152293, + "grad_norm": 0.7270122170448303, + "learning_rate": 8.713557673915162e-06, + "loss": 0.7529, + "step": 8563 + }, + { + "epoch": 0.47135230337387857, + "grad_norm": 0.8138573169708252, + "learning_rate": 8.713267405723935e-06, + "loss": 0.8215, + "step": 8564 + }, + { + "epoch": 0.47140734217623426, + "grad_norm": 0.732982873916626, + "learning_rate": 8.712977109624783e-06, + "loss": 0.7099, + "step": 8565 + }, + { + "epoch": 0.4714623809785899, + "grad_norm": 0.7307591438293457, + "learning_rate": 8.712686785619888e-06, + "loss": 0.7035, + "step": 8566 + }, + { + "epoch": 0.4715174197809456, + "grad_norm": 0.8684857487678528, + "learning_rate": 8.712396433711434e-06, + "loss": 0.8605, + "step": 8567 + }, + { + "epoch": 0.4715724585833012, + "grad_norm": 0.7490718364715576, + "learning_rate": 8.712106053901603e-06, + "loss": 0.7439, + "step": 8568 + }, + { + "epoch": 0.4716274973856569, + "grad_norm": 0.8572973012924194, + "learning_rate": 8.711815646192575e-06, + "loss": 0.8187, + "step": 8569 + }, + { + "epoch": 0.47168253618801254, + "grad_norm": 0.785270094871521, + "learning_rate": 8.711525210586536e-06, + "loss": 0.7812, + "step": 8570 + }, + { + "epoch": 0.4717375749903682, + "grad_norm": 0.683651864528656, + "learning_rate": 8.711234747085663e-06, + "loss": 0.7682, + "step": 8571 + }, + { + "epoch": 0.47179261379272386, + "grad_norm": 0.7990714907646179, + "learning_rate": 8.710944255692147e-06, + "loss": 0.8114, + "step": 8572 + }, + { + "epoch": 0.47184765259507955, + "grad_norm": 0.9354856610298157, + "learning_rate": 8.710653736408165e-06, + "loss": 0.7353, + "step": 8573 + }, + { + "epoch": 0.4719026913974352, + "grad_norm": 0.8309356570243835, + "learning_rate": 8.710363189235904e-06, + "loss": 0.8635, + "step": 8574 + }, + { + "epoch": 0.47195773019979087, + "grad_norm": 0.7018463015556335, + "learning_rate": 8.710072614177547e-06, + "loss": 0.6372, + "step": 8575 + }, + { + "epoch": 0.4720127690021465, + "grad_norm": 0.7626469135284424, + "learning_rate": 8.709782011235277e-06, + "loss": 0.7684, + "step": 8576 + }, + { + "epoch": 0.4720678078045022, + "grad_norm": 0.6995826959609985, + "learning_rate": 8.70949138041128e-06, + "loss": 0.7301, + "step": 8577 + }, + { + "epoch": 0.4721228466068578, + "grad_norm": 0.719307541847229, + "learning_rate": 8.709200721707736e-06, + "loss": 0.7437, + "step": 8578 + }, + { + "epoch": 0.4721778854092135, + "grad_norm": 0.7355539202690125, + "learning_rate": 8.708910035126832e-06, + "loss": 0.7926, + "step": 8579 + }, + { + "epoch": 0.47223292421156915, + "grad_norm": 0.7262680530548096, + "learning_rate": 8.708619320670755e-06, + "loss": 0.7641, + "step": 8580 + }, + { + "epoch": 0.47228796301392484, + "grad_norm": 0.844745934009552, + "learning_rate": 8.708328578341687e-06, + "loss": 0.7228, + "step": 8581 + }, + { + "epoch": 0.47234300181628047, + "grad_norm": 0.8169287443161011, + "learning_rate": 8.708037808141814e-06, + "loss": 0.7076, + "step": 8582 + }, + { + "epoch": 0.47239804061863616, + "grad_norm": 0.7342209219932556, + "learning_rate": 8.707747010073322e-06, + "loss": 0.7997, + "step": 8583 + }, + { + "epoch": 0.4724530794209918, + "grad_norm": 0.7138200402259827, + "learning_rate": 8.707456184138394e-06, + "loss": 0.7796, + "step": 8584 + }, + { + "epoch": 0.4725081182233475, + "grad_norm": 0.7168061137199402, + "learning_rate": 8.70716533033922e-06, + "loss": 0.6876, + "step": 8585 + }, + { + "epoch": 0.4725631570257031, + "grad_norm": 0.7256397604942322, + "learning_rate": 8.706874448677982e-06, + "loss": 0.8296, + "step": 8586 + }, + { + "epoch": 0.4726181958280588, + "grad_norm": 0.8232730627059937, + "learning_rate": 8.70658353915687e-06, + "loss": 0.8001, + "step": 8587 + }, + { + "epoch": 0.47267323463041444, + "grad_norm": 0.7110162973403931, + "learning_rate": 8.706292601778067e-06, + "loss": 0.7061, + "step": 8588 + }, + { + "epoch": 0.47272827343277013, + "grad_norm": 0.9466721415519714, + "learning_rate": 8.706001636543761e-06, + "loss": 0.8713, + "step": 8589 + }, + { + "epoch": 0.47278331223512576, + "grad_norm": 0.7017776370048523, + "learning_rate": 8.705710643456138e-06, + "loss": 0.759, + "step": 8590 + }, + { + "epoch": 0.4728383510374814, + "grad_norm": 0.7140772938728333, + "learning_rate": 8.705419622517386e-06, + "loss": 0.6962, + "step": 8591 + }, + { + "epoch": 0.4728933898398371, + "grad_norm": 1.1076452732086182, + "learning_rate": 8.705128573729694e-06, + "loss": 0.8264, + "step": 8592 + }, + { + "epoch": 0.4729484286421927, + "grad_norm": 0.7308200597763062, + "learning_rate": 8.704837497095247e-06, + "loss": 0.6243, + "step": 8593 + }, + { + "epoch": 0.4730034674445484, + "grad_norm": 0.9445781111717224, + "learning_rate": 8.704546392616231e-06, + "loss": 0.6676, + "step": 8594 + }, + { + "epoch": 0.47305850624690404, + "grad_norm": 0.6527873277664185, + "learning_rate": 8.704255260294837e-06, + "loss": 0.6979, + "step": 8595 + }, + { + "epoch": 0.47311354504925973, + "grad_norm": 0.6732963919639587, + "learning_rate": 8.703964100133252e-06, + "loss": 0.7724, + "step": 8596 + }, + { + "epoch": 0.47316858385161537, + "grad_norm": 0.7661726474761963, + "learning_rate": 8.703672912133665e-06, + "loss": 0.7988, + "step": 8597 + }, + { + "epoch": 0.47322362265397105, + "grad_norm": 0.7006877660751343, + "learning_rate": 8.703381696298262e-06, + "loss": 0.6765, + "step": 8598 + }, + { + "epoch": 0.4732786614563267, + "grad_norm": 0.7195086479187012, + "learning_rate": 8.703090452629236e-06, + "loss": 0.6676, + "step": 8599 + }, + { + "epoch": 0.4733337002586824, + "grad_norm": 0.6692042350769043, + "learning_rate": 8.702799181128771e-06, + "loss": 0.7882, + "step": 8600 + }, + { + "epoch": 0.473388739061038, + "grad_norm": 0.7736524343490601, + "learning_rate": 8.70250788179906e-06, + "loss": 0.7977, + "step": 8601 + }, + { + "epoch": 0.4734437778633937, + "grad_norm": 0.8821607828140259, + "learning_rate": 8.70221655464229e-06, + "loss": 0.7465, + "step": 8602 + }, + { + "epoch": 0.47349881666574933, + "grad_norm": 0.7565156817436218, + "learning_rate": 8.701925199660652e-06, + "loss": 0.831, + "step": 8603 + }, + { + "epoch": 0.473553855468105, + "grad_norm": 0.8542304039001465, + "learning_rate": 8.701633816856335e-06, + "loss": 0.7538, + "step": 8604 + }, + { + "epoch": 0.47360889427046066, + "grad_norm": 0.6891050338745117, + "learning_rate": 8.701342406231529e-06, + "loss": 0.7687, + "step": 8605 + }, + { + "epoch": 0.47366393307281635, + "grad_norm": 0.8570719361305237, + "learning_rate": 8.701050967788424e-06, + "loss": 0.7236, + "step": 8606 + }, + { + "epoch": 0.473718971875172, + "grad_norm": 0.7921456098556519, + "learning_rate": 8.700759501529212e-06, + "loss": 0.8214, + "step": 8607 + }, + { + "epoch": 0.47377401067752767, + "grad_norm": 0.7584527730941772, + "learning_rate": 8.70046800745608e-06, + "loss": 0.8204, + "step": 8608 + }, + { + "epoch": 0.4738290494798833, + "grad_norm": 0.8033978343009949, + "learning_rate": 8.700176485571222e-06, + "loss": 0.8278, + "step": 8609 + }, + { + "epoch": 0.473884088282239, + "grad_norm": 0.9950750470161438, + "learning_rate": 8.699884935876828e-06, + "loss": 0.8181, + "step": 8610 + }, + { + "epoch": 0.4739391270845946, + "grad_norm": 0.7213684916496277, + "learning_rate": 8.69959335837509e-06, + "loss": 0.7099, + "step": 8611 + }, + { + "epoch": 0.4739941658869503, + "grad_norm": 0.7847200632095337, + "learning_rate": 8.699301753068199e-06, + "loss": 0.8272, + "step": 8612 + }, + { + "epoch": 0.47404920468930595, + "grad_norm": 0.7075058221817017, + "learning_rate": 8.699010119958344e-06, + "loss": 0.7127, + "step": 8613 + }, + { + "epoch": 0.47410424349166164, + "grad_norm": 0.682741641998291, + "learning_rate": 8.69871845904772e-06, + "loss": 0.8446, + "step": 8614 + }, + { + "epoch": 0.47415928229401727, + "grad_norm": 0.7120605111122131, + "learning_rate": 8.69842677033852e-06, + "loss": 0.7776, + "step": 8615 + }, + { + "epoch": 0.47421432109637296, + "grad_norm": 0.822405219078064, + "learning_rate": 8.698135053832933e-06, + "loss": 0.8018, + "step": 8616 + }, + { + "epoch": 0.4742693598987286, + "grad_norm": 0.6815186738967896, + "learning_rate": 8.697843309533152e-06, + "loss": 0.7413, + "step": 8617 + }, + { + "epoch": 0.4743243987010843, + "grad_norm": 0.7587849497795105, + "learning_rate": 8.69755153744137e-06, + "loss": 0.7809, + "step": 8618 + }, + { + "epoch": 0.4743794375034399, + "grad_norm": 0.7092488408088684, + "learning_rate": 8.697259737559782e-06, + "loss": 0.7921, + "step": 8619 + }, + { + "epoch": 0.4744344763057956, + "grad_norm": 0.7396836280822754, + "learning_rate": 8.69696790989058e-06, + "loss": 0.7946, + "step": 8620 + }, + { + "epoch": 0.47448951510815124, + "grad_norm": 0.6760729551315308, + "learning_rate": 8.696676054435955e-06, + "loss": 0.7389, + "step": 8621 + }, + { + "epoch": 0.4745445539105069, + "grad_norm": 1.1640692949295044, + "learning_rate": 8.696384171198105e-06, + "loss": 0.8291, + "step": 8622 + }, + { + "epoch": 0.47459959271286256, + "grad_norm": 0.7415158152580261, + "learning_rate": 8.696092260179219e-06, + "loss": 0.7534, + "step": 8623 + }, + { + "epoch": 0.47465463151521825, + "grad_norm": 0.7730052471160889, + "learning_rate": 8.695800321381492e-06, + "loss": 0.8447, + "step": 8624 + }, + { + "epoch": 0.4747096703175739, + "grad_norm": 0.811522364616394, + "learning_rate": 8.695508354807121e-06, + "loss": 0.7466, + "step": 8625 + }, + { + "epoch": 0.4747647091199296, + "grad_norm": 0.7908332347869873, + "learning_rate": 8.695216360458298e-06, + "loss": 0.7769, + "step": 8626 + }, + { + "epoch": 0.4748197479222852, + "grad_norm": 0.744971752166748, + "learning_rate": 8.694924338337217e-06, + "loss": 0.7651, + "step": 8627 + }, + { + "epoch": 0.4748747867246409, + "grad_norm": 0.705565869808197, + "learning_rate": 8.694632288446075e-06, + "loss": 0.8258, + "step": 8628 + }, + { + "epoch": 0.47492982552699653, + "grad_norm": 0.8199328780174255, + "learning_rate": 8.694340210787065e-06, + "loss": 0.733, + "step": 8629 + }, + { + "epoch": 0.4749848643293522, + "grad_norm": 0.6965511441230774, + "learning_rate": 8.694048105362382e-06, + "loss": 0.7548, + "step": 8630 + }, + { + "epoch": 0.47503990313170785, + "grad_norm": 0.7943055629730225, + "learning_rate": 8.693755972174225e-06, + "loss": 0.7518, + "step": 8631 + }, + { + "epoch": 0.47509494193406354, + "grad_norm": 0.6277437806129456, + "learning_rate": 8.693463811224785e-06, + "loss": 0.6941, + "step": 8632 + }, + { + "epoch": 0.4751499807364192, + "grad_norm": 1.0745574235916138, + "learning_rate": 8.693171622516259e-06, + "loss": 0.8056, + "step": 8633 + }, + { + "epoch": 0.4752050195387748, + "grad_norm": 0.7005153894424438, + "learning_rate": 8.692879406050844e-06, + "loss": 0.757, + "step": 8634 + }, + { + "epoch": 0.4752600583411305, + "grad_norm": 0.6971127986907959, + "learning_rate": 8.692587161830737e-06, + "loss": 0.7509, + "step": 8635 + }, + { + "epoch": 0.47531509714348613, + "grad_norm": 0.7583497762680054, + "learning_rate": 8.692294889858133e-06, + "loss": 0.7895, + "step": 8636 + }, + { + "epoch": 0.4753701359458418, + "grad_norm": 0.719932496547699, + "learning_rate": 8.692002590135228e-06, + "loss": 0.762, + "step": 8637 + }, + { + "epoch": 0.47542517474819745, + "grad_norm": 0.7041804790496826, + "learning_rate": 8.691710262664222e-06, + "loss": 0.7101, + "step": 8638 + }, + { + "epoch": 0.47548021355055314, + "grad_norm": 0.7395016551017761, + "learning_rate": 8.691417907447309e-06, + "loss": 0.723, + "step": 8639 + }, + { + "epoch": 0.4755352523529088, + "grad_norm": 0.6605637073516846, + "learning_rate": 8.691125524486686e-06, + "loss": 0.644, + "step": 8640 + }, + { + "epoch": 0.47559029115526447, + "grad_norm": 0.694732129573822, + "learning_rate": 8.690833113784552e-06, + "loss": 0.7162, + "step": 8641 + }, + { + "epoch": 0.4756453299576201, + "grad_norm": 0.7622451186180115, + "learning_rate": 8.690540675343105e-06, + "loss": 0.6995, + "step": 8642 + }, + { + "epoch": 0.4757003687599758, + "grad_norm": 0.6961628794670105, + "learning_rate": 8.69024820916454e-06, + "loss": 0.7955, + "step": 8643 + }, + { + "epoch": 0.4757554075623314, + "grad_norm": 0.706266462802887, + "learning_rate": 8.68995571525106e-06, + "loss": 0.7237, + "step": 8644 + }, + { + "epoch": 0.4758104463646871, + "grad_norm": 0.7727495431900024, + "learning_rate": 8.689663193604858e-06, + "loss": 0.7215, + "step": 8645 + }, + { + "epoch": 0.47586548516704275, + "grad_norm": 0.7320648431777954, + "learning_rate": 8.689370644228136e-06, + "loss": 0.7592, + "step": 8646 + }, + { + "epoch": 0.47592052396939843, + "grad_norm": 0.8149487376213074, + "learning_rate": 8.689078067123093e-06, + "loss": 0.7666, + "step": 8647 + }, + { + "epoch": 0.47597556277175407, + "grad_norm": 0.6584552526473999, + "learning_rate": 8.688785462291927e-06, + "loss": 0.7497, + "step": 8648 + }, + { + "epoch": 0.47603060157410976, + "grad_norm": 0.7197825312614441, + "learning_rate": 8.688492829736836e-06, + "loss": 0.7559, + "step": 8649 + }, + { + "epoch": 0.4760856403764654, + "grad_norm": 0.8116913437843323, + "learning_rate": 8.68820016946002e-06, + "loss": 0.7029, + "step": 8650 + }, + { + "epoch": 0.4761406791788211, + "grad_norm": 0.6733378171920776, + "learning_rate": 8.68790748146368e-06, + "loss": 0.7242, + "step": 8651 + }, + { + "epoch": 0.4761957179811767, + "grad_norm": 0.690464437007904, + "learning_rate": 8.687614765750012e-06, + "loss": 0.6668, + "step": 8652 + }, + { + "epoch": 0.4762507567835324, + "grad_norm": 0.7901185154914856, + "learning_rate": 8.687322022321221e-06, + "loss": 0.7436, + "step": 8653 + }, + { + "epoch": 0.47630579558588804, + "grad_norm": 0.7608267068862915, + "learning_rate": 8.687029251179504e-06, + "loss": 0.8292, + "step": 8654 + }, + { + "epoch": 0.4763608343882437, + "grad_norm": 0.6851119995117188, + "learning_rate": 8.686736452327062e-06, + "loss": 0.7974, + "step": 8655 + }, + { + "epoch": 0.47641587319059936, + "grad_norm": 0.6946395635604858, + "learning_rate": 8.686443625766094e-06, + "loss": 0.6745, + "step": 8656 + }, + { + "epoch": 0.47647091199295505, + "grad_norm": 0.7403521537780762, + "learning_rate": 8.686150771498804e-06, + "loss": 0.7759, + "step": 8657 + }, + { + "epoch": 0.4765259507953107, + "grad_norm": 0.8415689468383789, + "learning_rate": 8.685857889527393e-06, + "loss": 0.7911, + "step": 8658 + }, + { + "epoch": 0.47658098959766637, + "grad_norm": 0.6947778463363647, + "learning_rate": 8.68556497985406e-06, + "loss": 0.8026, + "step": 8659 + }, + { + "epoch": 0.476636028400022, + "grad_norm": 0.6807059645652771, + "learning_rate": 8.685272042481006e-06, + "loss": 0.7194, + "step": 8660 + }, + { + "epoch": 0.4766910672023777, + "grad_norm": 0.8948639631271362, + "learning_rate": 8.684979077410434e-06, + "loss": 0.8017, + "step": 8661 + }, + { + "epoch": 0.4767461060047333, + "grad_norm": 0.6697849035263062, + "learning_rate": 8.684686084644546e-06, + "loss": 0.7653, + "step": 8662 + }, + { + "epoch": 0.476801144807089, + "grad_norm": 0.7303311228752136, + "learning_rate": 8.684393064185543e-06, + "loss": 0.8287, + "step": 8663 + }, + { + "epoch": 0.47685618360944465, + "grad_norm": 0.6545100808143616, + "learning_rate": 8.68410001603563e-06, + "loss": 0.7438, + "step": 8664 + }, + { + "epoch": 0.47691122241180034, + "grad_norm": 0.8757766485214233, + "learning_rate": 8.683806940197006e-06, + "loss": 0.8343, + "step": 8665 + }, + { + "epoch": 0.476966261214156, + "grad_norm": 0.6414330005645752, + "learning_rate": 8.683513836671876e-06, + "loss": 0.7201, + "step": 8666 + }, + { + "epoch": 0.47702130001651166, + "grad_norm": 0.6736441850662231, + "learning_rate": 8.68322070546244e-06, + "loss": 0.7365, + "step": 8667 + }, + { + "epoch": 0.4770763388188673, + "grad_norm": 0.780491054058075, + "learning_rate": 8.682927546570905e-06, + "loss": 0.924, + "step": 8668 + }, + { + "epoch": 0.477131377621223, + "grad_norm": 0.6913807988166809, + "learning_rate": 8.68263435999947e-06, + "loss": 0.8269, + "step": 8669 + }, + { + "epoch": 0.4771864164235786, + "grad_norm": 0.7264360189437866, + "learning_rate": 8.682341145750344e-06, + "loss": 0.788, + "step": 8670 + }, + { + "epoch": 0.4772414552259343, + "grad_norm": 0.7777243852615356, + "learning_rate": 8.682047903825725e-06, + "loss": 0.8691, + "step": 8671 + }, + { + "epoch": 0.47729649402828994, + "grad_norm": 0.7590457797050476, + "learning_rate": 8.681754634227821e-06, + "loss": 0.8249, + "step": 8672 + }, + { + "epoch": 0.47735153283064563, + "grad_norm": 0.7672324776649475, + "learning_rate": 8.681461336958836e-06, + "loss": 0.8334, + "step": 8673 + }, + { + "epoch": 0.47740657163300126, + "grad_norm": 0.7181395888328552, + "learning_rate": 8.681168012020971e-06, + "loss": 0.8089, + "step": 8674 + }, + { + "epoch": 0.47746161043535695, + "grad_norm": 0.7671428918838501, + "learning_rate": 8.680874659416433e-06, + "loss": 0.7634, + "step": 8675 + }, + { + "epoch": 0.4775166492377126, + "grad_norm": 0.73219895362854, + "learning_rate": 8.680581279147427e-06, + "loss": 0.7013, + "step": 8676 + }, + { + "epoch": 0.4775716880400682, + "grad_norm": 0.8050867319107056, + "learning_rate": 8.680287871216158e-06, + "loss": 0.7524, + "step": 8677 + }, + { + "epoch": 0.4776267268424239, + "grad_norm": 0.7154340744018555, + "learning_rate": 8.679994435624828e-06, + "loss": 0.802, + "step": 8678 + }, + { + "epoch": 0.47768176564477954, + "grad_norm": 0.7005884051322937, + "learning_rate": 8.679700972375647e-06, + "loss": 0.7633, + "step": 8679 + }, + { + "epoch": 0.47773680444713523, + "grad_norm": 0.8203871846199036, + "learning_rate": 8.679407481470818e-06, + "loss": 0.7782, + "step": 8680 + }, + { + "epoch": 0.47779184324949087, + "grad_norm": 0.6582844853401184, + "learning_rate": 8.679113962912547e-06, + "loss": 0.6799, + "step": 8681 + }, + { + "epoch": 0.47784688205184656, + "grad_norm": 0.7052889466285706, + "learning_rate": 8.67882041670304e-06, + "loss": 0.7814, + "step": 8682 + }, + { + "epoch": 0.4779019208542022, + "grad_norm": 0.7533165812492371, + "learning_rate": 8.678526842844504e-06, + "loss": 0.7983, + "step": 8683 + }, + { + "epoch": 0.4779569596565579, + "grad_norm": 0.7335212230682373, + "learning_rate": 8.678233241339144e-06, + "loss": 0.8023, + "step": 8684 + }, + { + "epoch": 0.4780119984589135, + "grad_norm": 0.7824274897575378, + "learning_rate": 8.67793961218917e-06, + "loss": 0.8219, + "step": 8685 + }, + { + "epoch": 0.4780670372612692, + "grad_norm": 0.6547996401786804, + "learning_rate": 8.677645955396784e-06, + "loss": 0.715, + "step": 8686 + }, + { + "epoch": 0.47812207606362483, + "grad_norm": 0.7507368326187134, + "learning_rate": 8.677352270964196e-06, + "loss": 0.9379, + "step": 8687 + }, + { + "epoch": 0.4781771148659805, + "grad_norm": 0.6403020620346069, + "learning_rate": 8.677058558893613e-06, + "loss": 0.659, + "step": 8688 + }, + { + "epoch": 0.47823215366833616, + "grad_norm": 0.7075803279876709, + "learning_rate": 8.676764819187242e-06, + "loss": 0.7515, + "step": 8689 + }, + { + "epoch": 0.47828719247069185, + "grad_norm": 0.6899601817131042, + "learning_rate": 8.676471051847291e-06, + "loss": 0.8398, + "step": 8690 + }, + { + "epoch": 0.4783422312730475, + "grad_norm": 0.7145645618438721, + "learning_rate": 8.676177256875969e-06, + "loss": 0.7711, + "step": 8691 + }, + { + "epoch": 0.47839727007540317, + "grad_norm": 0.7139655351638794, + "learning_rate": 8.675883434275479e-06, + "loss": 0.8664, + "step": 8692 + }, + { + "epoch": 0.4784523088777588, + "grad_norm": 0.7100433111190796, + "learning_rate": 8.675589584048037e-06, + "loss": 0.7812, + "step": 8693 + }, + { + "epoch": 0.4785073476801145, + "grad_norm": 0.6103882789611816, + "learning_rate": 8.675295706195845e-06, + "loss": 0.6565, + "step": 8694 + }, + { + "epoch": 0.4785623864824701, + "grad_norm": 0.7236714959144592, + "learning_rate": 8.675001800721114e-06, + "loss": 0.6849, + "step": 8695 + }, + { + "epoch": 0.4786174252848258, + "grad_norm": 0.7567160129547119, + "learning_rate": 8.674707867626056e-06, + "loss": 0.8289, + "step": 8696 + }, + { + "epoch": 0.47867246408718145, + "grad_norm": 0.7004136443138123, + "learning_rate": 8.674413906912876e-06, + "loss": 0.7466, + "step": 8697 + }, + { + "epoch": 0.47872750288953714, + "grad_norm": 0.713835597038269, + "learning_rate": 8.674119918583783e-06, + "loss": 0.7875, + "step": 8698 + }, + { + "epoch": 0.47878254169189277, + "grad_norm": 0.8476874232292175, + "learning_rate": 8.67382590264099e-06, + "loss": 0.8028, + "step": 8699 + }, + { + "epoch": 0.47883758049424846, + "grad_norm": 0.720273494720459, + "learning_rate": 8.673531859086706e-06, + "loss": 0.7829, + "step": 8700 + }, + { + "epoch": 0.4788926192966041, + "grad_norm": 0.8042417168617249, + "learning_rate": 8.673237787923137e-06, + "loss": 0.7914, + "step": 8701 + }, + { + "epoch": 0.4789476580989598, + "grad_norm": 0.7779260277748108, + "learning_rate": 8.672943689152498e-06, + "loss": 0.6921, + "step": 8702 + }, + { + "epoch": 0.4790026969013154, + "grad_norm": 0.7957637906074524, + "learning_rate": 8.672649562776997e-06, + "loss": 0.8761, + "step": 8703 + }, + { + "epoch": 0.4790577357036711, + "grad_norm": 0.7467649579048157, + "learning_rate": 8.672355408798845e-06, + "loss": 0.7984, + "step": 8704 + }, + { + "epoch": 0.47911277450602674, + "grad_norm": 0.6746538877487183, + "learning_rate": 8.672061227220252e-06, + "loss": 0.7392, + "step": 8705 + }, + { + "epoch": 0.47916781330838243, + "grad_norm": 0.7331795692443848, + "learning_rate": 8.671767018043432e-06, + "loss": 0.7171, + "step": 8706 + }, + { + "epoch": 0.47922285211073806, + "grad_norm": 0.7879608273506165, + "learning_rate": 8.671472781270592e-06, + "loss": 0.8497, + "step": 8707 + }, + { + "epoch": 0.47927789091309375, + "grad_norm": 0.8659428358078003, + "learning_rate": 8.671178516903946e-06, + "loss": 0.8102, + "step": 8708 + }, + { + "epoch": 0.4793329297154494, + "grad_norm": 0.6489408612251282, + "learning_rate": 8.670884224945704e-06, + "loss": 0.6752, + "step": 8709 + }, + { + "epoch": 0.4793879685178051, + "grad_norm": 0.8182825446128845, + "learning_rate": 8.670589905398079e-06, + "loss": 0.7972, + "step": 8710 + }, + { + "epoch": 0.4794430073201607, + "grad_norm": 0.7759343981742859, + "learning_rate": 8.670295558263285e-06, + "loss": 0.7856, + "step": 8711 + }, + { + "epoch": 0.4794980461225164, + "grad_norm": 0.7421835064888, + "learning_rate": 8.670001183543528e-06, + "loss": 0.8165, + "step": 8712 + }, + { + "epoch": 0.47955308492487203, + "grad_norm": 0.6498512625694275, + "learning_rate": 8.669706781241028e-06, + "loss": 0.7212, + "step": 8713 + }, + { + "epoch": 0.4796081237272277, + "grad_norm": 0.8493219614028931, + "learning_rate": 8.669412351357993e-06, + "loss": 0.8036, + "step": 8714 + }, + { + "epoch": 0.47966316252958335, + "grad_norm": 0.6834331750869751, + "learning_rate": 8.669117893896637e-06, + "loss": 0.8127, + "step": 8715 + }, + { + "epoch": 0.47971820133193904, + "grad_norm": 0.7793670296669006, + "learning_rate": 8.668823408859172e-06, + "loss": 0.7276, + "step": 8716 + }, + { + "epoch": 0.4797732401342947, + "grad_norm": 0.7108075022697449, + "learning_rate": 8.668528896247815e-06, + "loss": 0.8328, + "step": 8717 + }, + { + "epoch": 0.47982827893665037, + "grad_norm": 0.6662433743476868, + "learning_rate": 8.668234356064774e-06, + "loss": 0.6751, + "step": 8718 + }, + { + "epoch": 0.479883317739006, + "grad_norm": 0.6595591902732849, + "learning_rate": 8.667939788312267e-06, + "loss": 0.707, + "step": 8719 + }, + { + "epoch": 0.47993835654136163, + "grad_norm": 0.7435836791992188, + "learning_rate": 8.667645192992506e-06, + "loss": 0.7885, + "step": 8720 + }, + { + "epoch": 0.4799933953437173, + "grad_norm": 0.6999356746673584, + "learning_rate": 8.667350570107706e-06, + "loss": 0.7538, + "step": 8721 + }, + { + "epoch": 0.48004843414607296, + "grad_norm": 0.7111191749572754, + "learning_rate": 8.66705591966008e-06, + "loss": 0.6814, + "step": 8722 + }, + { + "epoch": 0.48010347294842864, + "grad_norm": 0.6752734780311584, + "learning_rate": 8.666761241651844e-06, + "loss": 0.7221, + "step": 8723 + }, + { + "epoch": 0.4801585117507843, + "grad_norm": 0.7432951331138611, + "learning_rate": 8.666466536085212e-06, + "loss": 0.7689, + "step": 8724 + }, + { + "epoch": 0.48021355055313997, + "grad_norm": 0.7384392023086548, + "learning_rate": 8.666171802962398e-06, + "loss": 0.7862, + "step": 8725 + }, + { + "epoch": 0.4802685893554956, + "grad_norm": 0.6878762245178223, + "learning_rate": 8.66587704228562e-06, + "loss": 0.7246, + "step": 8726 + }, + { + "epoch": 0.4803236281578513, + "grad_norm": 0.6640586853027344, + "learning_rate": 8.66558225405709e-06, + "loss": 0.7181, + "step": 8727 + }, + { + "epoch": 0.4803786669602069, + "grad_norm": 0.6808595061302185, + "learning_rate": 8.665287438279024e-06, + "loss": 0.7866, + "step": 8728 + }, + { + "epoch": 0.4804337057625626, + "grad_norm": 0.5966268181800842, + "learning_rate": 8.66499259495364e-06, + "loss": 0.6755, + "step": 8729 + }, + { + "epoch": 0.48048874456491825, + "grad_norm": 0.742016077041626, + "learning_rate": 8.664697724083152e-06, + "loss": 0.8682, + "step": 8730 + }, + { + "epoch": 0.48054378336727394, + "grad_norm": 0.6621154546737671, + "learning_rate": 8.66440282566978e-06, + "loss": 0.7525, + "step": 8731 + }, + { + "epoch": 0.48059882216962957, + "grad_norm": 0.7347434759140015, + "learning_rate": 8.664107899715733e-06, + "loss": 0.7919, + "step": 8732 + }, + { + "epoch": 0.48065386097198526, + "grad_norm": 0.7564681172370911, + "learning_rate": 8.663812946223234e-06, + "loss": 0.9172, + "step": 8733 + }, + { + "epoch": 0.4807088997743409, + "grad_norm": 0.7193084359169006, + "learning_rate": 8.663517965194497e-06, + "loss": 0.7931, + "step": 8734 + }, + { + "epoch": 0.4807639385766966, + "grad_norm": 0.6882064938545227, + "learning_rate": 8.66322295663174e-06, + "loss": 0.7678, + "step": 8735 + }, + { + "epoch": 0.4808189773790522, + "grad_norm": 0.7954713106155396, + "learning_rate": 8.662927920537179e-06, + "loss": 0.6357, + "step": 8736 + }, + { + "epoch": 0.4808740161814079, + "grad_norm": 0.7123041749000549, + "learning_rate": 8.662632856913034e-06, + "loss": 0.7234, + "step": 8737 + }, + { + "epoch": 0.48092905498376354, + "grad_norm": 0.745145320892334, + "learning_rate": 8.66233776576152e-06, + "loss": 0.7516, + "step": 8738 + }, + { + "epoch": 0.4809840937861192, + "grad_norm": 0.6904219388961792, + "learning_rate": 8.662042647084856e-06, + "loss": 0.7995, + "step": 8739 + }, + { + "epoch": 0.48103913258847486, + "grad_norm": 0.71831214427948, + "learning_rate": 8.661747500885258e-06, + "loss": 0.7965, + "step": 8740 + }, + { + "epoch": 0.48109417139083055, + "grad_norm": 0.8514378666877747, + "learning_rate": 8.661452327164948e-06, + "loss": 0.8023, + "step": 8741 + }, + { + "epoch": 0.4811492101931862, + "grad_norm": 0.7411143779754639, + "learning_rate": 8.66115712592614e-06, + "loss": 0.797, + "step": 8742 + }, + { + "epoch": 0.4812042489955419, + "grad_norm": 0.737178385257721, + "learning_rate": 8.660861897171057e-06, + "loss": 0.7286, + "step": 8743 + }, + { + "epoch": 0.4812592877978975, + "grad_norm": 0.6823513507843018, + "learning_rate": 8.660566640901918e-06, + "loss": 0.7482, + "step": 8744 + }, + { + "epoch": 0.4813143266002532, + "grad_norm": 0.7205879092216492, + "learning_rate": 8.660271357120937e-06, + "loss": 0.8294, + "step": 8745 + }, + { + "epoch": 0.48136936540260883, + "grad_norm": 0.6887338757514954, + "learning_rate": 8.659976045830337e-06, + "loss": 0.7711, + "step": 8746 + }, + { + "epoch": 0.4814244042049645, + "grad_norm": 0.7498533129692078, + "learning_rate": 8.659680707032336e-06, + "loss": 0.7296, + "step": 8747 + }, + { + "epoch": 0.48147944300732015, + "grad_norm": 0.8041636943817139, + "learning_rate": 8.659385340729155e-06, + "loss": 0.9213, + "step": 8748 + }, + { + "epoch": 0.48153448180967584, + "grad_norm": 0.8623721599578857, + "learning_rate": 8.659089946923014e-06, + "loss": 0.8024, + "step": 8749 + }, + { + "epoch": 0.4815895206120315, + "grad_norm": 0.7212050557136536, + "learning_rate": 8.658794525616132e-06, + "loss": 0.732, + "step": 8750 + }, + { + "epoch": 0.48164455941438716, + "grad_norm": 0.7141492366790771, + "learning_rate": 8.658499076810729e-06, + "loss": 0.8062, + "step": 8751 + }, + { + "epoch": 0.4816995982167428, + "grad_norm": 0.7191516160964966, + "learning_rate": 8.658203600509027e-06, + "loss": 0.805, + "step": 8752 + }, + { + "epoch": 0.4817546370190985, + "grad_norm": 0.71059650182724, + "learning_rate": 8.657908096713245e-06, + "loss": 0.6755, + "step": 8753 + }, + { + "epoch": 0.4818096758214541, + "grad_norm": 0.6715459823608398, + "learning_rate": 8.657612565425607e-06, + "loss": 0.8093, + "step": 8754 + }, + { + "epoch": 0.4818647146238098, + "grad_norm": 0.7438814640045166, + "learning_rate": 8.65731700664833e-06, + "loss": 0.8059, + "step": 8755 + }, + { + "epoch": 0.48191975342616544, + "grad_norm": 0.7295387387275696, + "learning_rate": 8.657021420383637e-06, + "loss": 0.8437, + "step": 8756 + }, + { + "epoch": 0.48197479222852113, + "grad_norm": 0.7053797245025635, + "learning_rate": 8.656725806633753e-06, + "loss": 0.8424, + "step": 8757 + }, + { + "epoch": 0.48202983103087677, + "grad_norm": 0.6902007460594177, + "learning_rate": 8.656430165400894e-06, + "loss": 0.6967, + "step": 8758 + }, + { + "epoch": 0.48208486983323245, + "grad_norm": 0.66749507188797, + "learning_rate": 8.656134496687286e-06, + "loss": 0.7858, + "step": 8759 + }, + { + "epoch": 0.4821399086355881, + "grad_norm": 0.6755428314208984, + "learning_rate": 8.65583880049515e-06, + "loss": 0.6669, + "step": 8760 + }, + { + "epoch": 0.4821949474379438, + "grad_norm": 0.921096920967102, + "learning_rate": 8.655543076826706e-06, + "loss": 0.8545, + "step": 8761 + }, + { + "epoch": 0.4822499862402994, + "grad_norm": 0.7931553721427917, + "learning_rate": 8.65524732568418e-06, + "loss": 0.8708, + "step": 8762 + }, + { + "epoch": 0.48230502504265504, + "grad_norm": 0.7891780734062195, + "learning_rate": 8.654951547069794e-06, + "loss": 0.687, + "step": 8763 + }, + { + "epoch": 0.48236006384501073, + "grad_norm": 0.747662365436554, + "learning_rate": 8.65465574098577e-06, + "loss": 0.8153, + "step": 8764 + }, + { + "epoch": 0.48241510264736637, + "grad_norm": 0.7758497595787048, + "learning_rate": 8.65435990743433e-06, + "loss": 0.8018, + "step": 8765 + }, + { + "epoch": 0.48247014144972206, + "grad_norm": 0.6997805237770081, + "learning_rate": 8.654064046417703e-06, + "loss": 0.7845, + "step": 8766 + }, + { + "epoch": 0.4825251802520777, + "grad_norm": 0.7188366651535034, + "learning_rate": 8.653768157938106e-06, + "loss": 0.7528, + "step": 8767 + }, + { + "epoch": 0.4825802190544334, + "grad_norm": 0.6848055124282837, + "learning_rate": 8.653472241997767e-06, + "loss": 0.7658, + "step": 8768 + }, + { + "epoch": 0.482635257856789, + "grad_norm": 1.0603824853897095, + "learning_rate": 8.653176298598907e-06, + "loss": 0.7692, + "step": 8769 + }, + { + "epoch": 0.4826902966591447, + "grad_norm": 0.8191514611244202, + "learning_rate": 8.652880327743753e-06, + "loss": 0.7706, + "step": 8770 + }, + { + "epoch": 0.48274533546150034, + "grad_norm": 0.6318503618240356, + "learning_rate": 8.652584329434527e-06, + "loss": 0.6635, + "step": 8771 + }, + { + "epoch": 0.482800374263856, + "grad_norm": 0.6860769391059875, + "learning_rate": 8.652288303673457e-06, + "loss": 0.739, + "step": 8772 + }, + { + "epoch": 0.48285541306621166, + "grad_norm": 0.7414761185646057, + "learning_rate": 8.651992250462765e-06, + "loss": 0.7949, + "step": 8773 + }, + { + "epoch": 0.48291045186856735, + "grad_norm": 0.7255183458328247, + "learning_rate": 8.651696169804676e-06, + "loss": 0.8569, + "step": 8774 + }, + { + "epoch": 0.482965490670923, + "grad_norm": 0.7034135460853577, + "learning_rate": 8.651400061701417e-06, + "loss": 0.7562, + "step": 8775 + }, + { + "epoch": 0.48302052947327867, + "grad_norm": 0.7041038274765015, + "learning_rate": 8.651103926155212e-06, + "loss": 0.7194, + "step": 8776 + }, + { + "epoch": 0.4830755682756343, + "grad_norm": 1.0965619087219238, + "learning_rate": 8.650807763168287e-06, + "loss": 0.9033, + "step": 8777 + }, + { + "epoch": 0.48313060707799, + "grad_norm": 0.7400044798851013, + "learning_rate": 8.650511572742869e-06, + "loss": 0.7626, + "step": 8778 + }, + { + "epoch": 0.4831856458803456, + "grad_norm": 0.6957885026931763, + "learning_rate": 8.650215354881182e-06, + "loss": 0.7283, + "step": 8779 + }, + { + "epoch": 0.4832406846827013, + "grad_norm": 0.7992473840713501, + "learning_rate": 8.649919109585454e-06, + "loss": 0.8376, + "step": 8780 + }, + { + "epoch": 0.48329572348505695, + "grad_norm": 0.8556981086730957, + "learning_rate": 8.649622836857911e-06, + "loss": 0.7737, + "step": 8781 + }, + { + "epoch": 0.48335076228741264, + "grad_norm": 0.8476192355155945, + "learning_rate": 8.64932653670078e-06, + "loss": 0.8926, + "step": 8782 + }, + { + "epoch": 0.48340580108976827, + "grad_norm": 0.6461093425750732, + "learning_rate": 8.649030209116289e-06, + "loss": 0.7452, + "step": 8783 + }, + { + "epoch": 0.48346083989212396, + "grad_norm": 0.6997528076171875, + "learning_rate": 8.648733854106661e-06, + "loss": 0.7962, + "step": 8784 + }, + { + "epoch": 0.4835158786944796, + "grad_norm": 0.7606356739997864, + "learning_rate": 8.648437471674128e-06, + "loss": 0.6517, + "step": 8785 + }, + { + "epoch": 0.4835709174968353, + "grad_norm": 0.8118630051612854, + "learning_rate": 8.648141061820913e-06, + "loss": 0.7539, + "step": 8786 + }, + { + "epoch": 0.4836259562991909, + "grad_norm": 0.8778805136680603, + "learning_rate": 8.64784462454925e-06, + "loss": 0.763, + "step": 8787 + }, + { + "epoch": 0.4836809951015466, + "grad_norm": 0.7741022706031799, + "learning_rate": 8.647548159861361e-06, + "loss": 0.7749, + "step": 8788 + }, + { + "epoch": 0.48373603390390224, + "grad_norm": 0.76578688621521, + "learning_rate": 8.647251667759478e-06, + "loss": 0.6968, + "step": 8789 + }, + { + "epoch": 0.48379107270625793, + "grad_norm": 0.8477250933647156, + "learning_rate": 8.646955148245827e-06, + "loss": 0.8364, + "step": 8790 + }, + { + "epoch": 0.48384611150861356, + "grad_norm": 0.9105041027069092, + "learning_rate": 8.646658601322635e-06, + "loss": 0.823, + "step": 8791 + }, + { + "epoch": 0.48390115031096925, + "grad_norm": 0.7642726898193359, + "learning_rate": 8.646362026992135e-06, + "loss": 0.721, + "step": 8792 + }, + { + "epoch": 0.4839561891133249, + "grad_norm": 0.7567259669303894, + "learning_rate": 8.646065425256555e-06, + "loss": 0.7876, + "step": 8793 + }, + { + "epoch": 0.4840112279156806, + "grad_norm": 0.7691231966018677, + "learning_rate": 8.64576879611812e-06, + "loss": 0.8308, + "step": 8794 + }, + { + "epoch": 0.4840662667180362, + "grad_norm": 1.0769426822662354, + "learning_rate": 8.645472139579067e-06, + "loss": 0.892, + "step": 8795 + }, + { + "epoch": 0.4841213055203919, + "grad_norm": 0.6987955570220947, + "learning_rate": 8.64517545564162e-06, + "loss": 0.8254, + "step": 8796 + }, + { + "epoch": 0.48417634432274753, + "grad_norm": 0.7736005783081055, + "learning_rate": 8.644878744308007e-06, + "loss": 0.7666, + "step": 8797 + }, + { + "epoch": 0.4842313831251032, + "grad_norm": 0.6233380436897278, + "learning_rate": 8.644582005580464e-06, + "loss": 0.6443, + "step": 8798 + }, + { + "epoch": 0.48428642192745885, + "grad_norm": 0.7343530654907227, + "learning_rate": 8.644285239461217e-06, + "loss": 0.724, + "step": 8799 + }, + { + "epoch": 0.48434146072981454, + "grad_norm": 0.725321352481842, + "learning_rate": 8.643988445952499e-06, + "loss": 0.7249, + "step": 8800 + }, + { + "epoch": 0.4843964995321702, + "grad_norm": 0.7256256341934204, + "learning_rate": 8.643691625056539e-06, + "loss": 0.8656, + "step": 8801 + }, + { + "epoch": 0.48445153833452587, + "grad_norm": 0.8559528589248657, + "learning_rate": 8.643394776775567e-06, + "loss": 0.9186, + "step": 8802 + }, + { + "epoch": 0.4845065771368815, + "grad_norm": 0.6735692024230957, + "learning_rate": 8.643097901111815e-06, + "loss": 0.7007, + "step": 8803 + }, + { + "epoch": 0.4845616159392372, + "grad_norm": 0.8373280167579651, + "learning_rate": 8.642800998067515e-06, + "loss": 0.8774, + "step": 8804 + }, + { + "epoch": 0.4846166547415928, + "grad_norm": 0.731311023235321, + "learning_rate": 8.642504067644898e-06, + "loss": 0.7102, + "step": 8805 + }, + { + "epoch": 0.48467169354394846, + "grad_norm": 0.7259742617607117, + "learning_rate": 8.642207109846195e-06, + "loss": 0.7174, + "step": 8806 + }, + { + "epoch": 0.48472673234630415, + "grad_norm": 0.6454386115074158, + "learning_rate": 8.641910124673638e-06, + "loss": 0.7656, + "step": 8807 + }, + { + "epoch": 0.4847817711486598, + "grad_norm": 0.7701624631881714, + "learning_rate": 8.641613112129462e-06, + "loss": 0.7926, + "step": 8808 + }, + { + "epoch": 0.48483680995101547, + "grad_norm": 0.6812854409217834, + "learning_rate": 8.641316072215893e-06, + "loss": 0.7072, + "step": 8809 + }, + { + "epoch": 0.4848918487533711, + "grad_norm": 0.8180119395256042, + "learning_rate": 8.641019004935169e-06, + "loss": 0.8621, + "step": 8810 + }, + { + "epoch": 0.4849468875557268, + "grad_norm": 0.6346331834793091, + "learning_rate": 8.64072191028952e-06, + "loss": 0.6907, + "step": 8811 + }, + { + "epoch": 0.4850019263580824, + "grad_norm": 0.6819741129875183, + "learning_rate": 8.64042478828118e-06, + "loss": 0.77, + "step": 8812 + }, + { + "epoch": 0.4850569651604381, + "grad_norm": 0.9074214100837708, + "learning_rate": 8.640127638912383e-06, + "loss": 0.7799, + "step": 8813 + }, + { + "epoch": 0.48511200396279375, + "grad_norm": 0.8065158724784851, + "learning_rate": 8.63983046218536e-06, + "loss": 0.8033, + "step": 8814 + }, + { + "epoch": 0.48516704276514944, + "grad_norm": 0.6241241097450256, + "learning_rate": 8.639533258102345e-06, + "loss": 0.6936, + "step": 8815 + }, + { + "epoch": 0.48522208156750507, + "grad_norm": 0.6928265690803528, + "learning_rate": 8.639236026665573e-06, + "loss": 0.7526, + "step": 8816 + }, + { + "epoch": 0.48527712036986076, + "grad_norm": 0.8171425461769104, + "learning_rate": 8.638938767877276e-06, + "loss": 0.8227, + "step": 8817 + }, + { + "epoch": 0.4853321591722164, + "grad_norm": 0.7007083296775818, + "learning_rate": 8.638641481739692e-06, + "loss": 0.7439, + "step": 8818 + }, + { + "epoch": 0.4853871979745721, + "grad_norm": 0.8905115127563477, + "learning_rate": 8.63834416825505e-06, + "loss": 0.6873, + "step": 8819 + }, + { + "epoch": 0.4854422367769277, + "grad_norm": 0.702198326587677, + "learning_rate": 8.638046827425588e-06, + "loss": 0.7999, + "step": 8820 + }, + { + "epoch": 0.4854972755792834, + "grad_norm": 0.7280104160308838, + "learning_rate": 8.63774945925354e-06, + "loss": 0.8562, + "step": 8821 + }, + { + "epoch": 0.48555231438163904, + "grad_norm": 0.9803630113601685, + "learning_rate": 8.63745206374114e-06, + "loss": 0.8347, + "step": 8822 + }, + { + "epoch": 0.4856073531839947, + "grad_norm": 0.6781168580055237, + "learning_rate": 8.637154640890625e-06, + "loss": 0.8124, + "step": 8823 + }, + { + "epoch": 0.48566239198635036, + "grad_norm": 0.7219669222831726, + "learning_rate": 8.63685719070423e-06, + "loss": 0.8053, + "step": 8824 + }, + { + "epoch": 0.48571743078870605, + "grad_norm": 0.7077241539955139, + "learning_rate": 8.636559713184187e-06, + "loss": 0.7534, + "step": 8825 + }, + { + "epoch": 0.4857724695910617, + "grad_norm": 0.70063316822052, + "learning_rate": 8.636262208332737e-06, + "loss": 0.7509, + "step": 8826 + }, + { + "epoch": 0.4858275083934174, + "grad_norm": 0.7292184233665466, + "learning_rate": 8.635964676152114e-06, + "loss": 0.7485, + "step": 8827 + }, + { + "epoch": 0.485882547195773, + "grad_norm": 0.7970258593559265, + "learning_rate": 8.635667116644552e-06, + "loss": 0.8874, + "step": 8828 + }, + { + "epoch": 0.4859375859981287, + "grad_norm": 0.7090024352073669, + "learning_rate": 8.63536952981229e-06, + "loss": 0.7665, + "step": 8829 + }, + { + "epoch": 0.48599262480048433, + "grad_norm": 0.761409342288971, + "learning_rate": 8.635071915657565e-06, + "loss": 0.7977, + "step": 8830 + }, + { + "epoch": 0.48604766360284, + "grad_norm": 0.724896252155304, + "learning_rate": 8.634774274182611e-06, + "loss": 0.8591, + "step": 8831 + }, + { + "epoch": 0.48610270240519565, + "grad_norm": 0.737424910068512, + "learning_rate": 8.634476605389666e-06, + "loss": 0.8256, + "step": 8832 + }, + { + "epoch": 0.48615774120755134, + "grad_norm": 0.8261227607727051, + "learning_rate": 8.63417890928097e-06, + "loss": 0.8089, + "step": 8833 + }, + { + "epoch": 0.486212780009907, + "grad_norm": 0.6744595766067505, + "learning_rate": 8.633881185858756e-06, + "loss": 0.7821, + "step": 8834 + }, + { + "epoch": 0.48626781881226266, + "grad_norm": 0.6717672944068909, + "learning_rate": 8.633583435125263e-06, + "loss": 0.7823, + "step": 8835 + }, + { + "epoch": 0.4863228576146183, + "grad_norm": 0.753616213798523, + "learning_rate": 8.633285657082732e-06, + "loss": 0.8044, + "step": 8836 + }, + { + "epoch": 0.486377896416974, + "grad_norm": 0.6910914182662964, + "learning_rate": 8.632987851733397e-06, + "loss": 0.8244, + "step": 8837 + }, + { + "epoch": 0.4864329352193296, + "grad_norm": 0.9127064347267151, + "learning_rate": 8.632690019079499e-06, + "loss": 0.7918, + "step": 8838 + }, + { + "epoch": 0.4864879740216853, + "grad_norm": 0.715918779373169, + "learning_rate": 8.632392159123274e-06, + "loss": 0.744, + "step": 8839 + }, + { + "epoch": 0.48654301282404094, + "grad_norm": 0.8206684589385986, + "learning_rate": 8.632094271866963e-06, + "loss": 0.7852, + "step": 8840 + }, + { + "epoch": 0.48659805162639663, + "grad_norm": 0.6502171158790588, + "learning_rate": 8.631796357312802e-06, + "loss": 0.7653, + "step": 8841 + }, + { + "epoch": 0.48665309042875227, + "grad_norm": 0.6987786889076233, + "learning_rate": 8.631498415463033e-06, + "loss": 0.7669, + "step": 8842 + }, + { + "epoch": 0.48670812923110796, + "grad_norm": 0.7902390360832214, + "learning_rate": 8.631200446319894e-06, + "loss": 0.8438, + "step": 8843 + }, + { + "epoch": 0.4867631680334636, + "grad_norm": 0.7464659810066223, + "learning_rate": 8.630902449885625e-06, + "loss": 0.8276, + "step": 8844 + }, + { + "epoch": 0.4868182068358193, + "grad_norm": 0.7375630736351013, + "learning_rate": 8.630604426162465e-06, + "loss": 0.7921, + "step": 8845 + }, + { + "epoch": 0.4868732456381749, + "grad_norm": 0.7206295728683472, + "learning_rate": 8.630306375152653e-06, + "loss": 0.8424, + "step": 8846 + }, + { + "epoch": 0.4869282844405306, + "grad_norm": 0.7384368181228638, + "learning_rate": 8.63000829685843e-06, + "loss": 0.8702, + "step": 8847 + }, + { + "epoch": 0.48698332324288623, + "grad_norm": 0.7839015126228333, + "learning_rate": 8.629710191282037e-06, + "loss": 0.7064, + "step": 8848 + }, + { + "epoch": 0.48703836204524187, + "grad_norm": 0.6909724473953247, + "learning_rate": 8.629412058425712e-06, + "loss": 0.6924, + "step": 8849 + }, + { + "epoch": 0.48709340084759756, + "grad_norm": 0.6553036570549011, + "learning_rate": 8.6291138982917e-06, + "loss": 0.6526, + "step": 8850 + }, + { + "epoch": 0.4871484396499532, + "grad_norm": 0.7202072143554688, + "learning_rate": 8.628815710882239e-06, + "loss": 0.7272, + "step": 8851 + }, + { + "epoch": 0.4872034784523089, + "grad_norm": 0.6898619532585144, + "learning_rate": 8.62851749619957e-06, + "loss": 0.7687, + "step": 8852 + }, + { + "epoch": 0.4872585172546645, + "grad_norm": 0.7888908386230469, + "learning_rate": 8.628219254245935e-06, + "loss": 0.7654, + "step": 8853 + }, + { + "epoch": 0.4873135560570202, + "grad_norm": 0.7312424778938293, + "learning_rate": 8.627920985023575e-06, + "loss": 0.8053, + "step": 8854 + }, + { + "epoch": 0.48736859485937584, + "grad_norm": 0.6588439345359802, + "learning_rate": 8.627622688534731e-06, + "loss": 0.7229, + "step": 8855 + }, + { + "epoch": 0.4874236336617315, + "grad_norm": 0.8292293548583984, + "learning_rate": 8.627324364781647e-06, + "loss": 0.8482, + "step": 8856 + }, + { + "epoch": 0.48747867246408716, + "grad_norm": 0.7573973536491394, + "learning_rate": 8.627026013766564e-06, + "loss": 0.7282, + "step": 8857 + }, + { + "epoch": 0.48753371126644285, + "grad_norm": 1.2215768098831177, + "learning_rate": 8.626727635491726e-06, + "loss": 0.7771, + "step": 8858 + }, + { + "epoch": 0.4875887500687985, + "grad_norm": 0.7324759364128113, + "learning_rate": 8.626429229959369e-06, + "loss": 0.781, + "step": 8859 + }, + { + "epoch": 0.48764378887115417, + "grad_norm": 0.6995676159858704, + "learning_rate": 8.626130797171745e-06, + "loss": 0.6907, + "step": 8860 + }, + { + "epoch": 0.4876988276735098, + "grad_norm": 0.7400509119033813, + "learning_rate": 8.625832337131092e-06, + "loss": 0.6572, + "step": 8861 + }, + { + "epoch": 0.4877538664758655, + "grad_norm": 0.6634842753410339, + "learning_rate": 8.625533849839653e-06, + "loss": 0.7229, + "step": 8862 + }, + { + "epoch": 0.4878089052782211, + "grad_norm": 0.7357299327850342, + "learning_rate": 8.625235335299673e-06, + "loss": 0.6418, + "step": 8863 + }, + { + "epoch": 0.4878639440805768, + "grad_norm": 0.6473466157913208, + "learning_rate": 8.624936793513394e-06, + "loss": 0.6796, + "step": 8864 + }, + { + "epoch": 0.48791898288293245, + "grad_norm": 0.9110734462738037, + "learning_rate": 8.62463822448306e-06, + "loss": 0.8143, + "step": 8865 + }, + { + "epoch": 0.48797402168528814, + "grad_norm": 0.7932308316230774, + "learning_rate": 8.624339628210916e-06, + "loss": 0.9103, + "step": 8866 + }, + { + "epoch": 0.4880290604876438, + "grad_norm": 0.6677752137184143, + "learning_rate": 8.624041004699205e-06, + "loss": 0.8073, + "step": 8867 + }, + { + "epoch": 0.48808409928999946, + "grad_norm": 0.7379121780395508, + "learning_rate": 8.623742353950171e-06, + "loss": 0.8643, + "step": 8868 + }, + { + "epoch": 0.4881391380923551, + "grad_norm": 0.7479479312896729, + "learning_rate": 8.623443675966062e-06, + "loss": 0.6117, + "step": 8869 + }, + { + "epoch": 0.4881941768947108, + "grad_norm": 0.7822794914245605, + "learning_rate": 8.623144970749118e-06, + "loss": 0.8629, + "step": 8870 + }, + { + "epoch": 0.4882492156970664, + "grad_norm": 0.7040950655937195, + "learning_rate": 8.622846238301587e-06, + "loss": 0.7519, + "step": 8871 + }, + { + "epoch": 0.4883042544994221, + "grad_norm": 0.747368574142456, + "learning_rate": 8.622547478625714e-06, + "loss": 0.7459, + "step": 8872 + }, + { + "epoch": 0.48835929330177774, + "grad_norm": 0.6755948066711426, + "learning_rate": 8.622248691723742e-06, + "loss": 0.7515, + "step": 8873 + }, + { + "epoch": 0.48841433210413343, + "grad_norm": 0.7265586256980896, + "learning_rate": 8.62194987759792e-06, + "loss": 0.7691, + "step": 8874 + }, + { + "epoch": 0.48846937090648906, + "grad_norm": 0.6696380972862244, + "learning_rate": 8.621651036250493e-06, + "loss": 0.778, + "step": 8875 + }, + { + "epoch": 0.48852440970884475, + "grad_norm": 0.7666454911231995, + "learning_rate": 8.621352167683705e-06, + "loss": 0.7396, + "step": 8876 + }, + { + "epoch": 0.4885794485112004, + "grad_norm": 0.7079235315322876, + "learning_rate": 8.621053271899803e-06, + "loss": 0.7917, + "step": 8877 + }, + { + "epoch": 0.4886344873135561, + "grad_norm": 0.6888919472694397, + "learning_rate": 8.620754348901034e-06, + "loss": 0.605, + "step": 8878 + }, + { + "epoch": 0.4886895261159117, + "grad_norm": 0.7177572250366211, + "learning_rate": 8.620455398689645e-06, + "loss": 0.7534, + "step": 8879 + }, + { + "epoch": 0.4887445649182674, + "grad_norm": 0.7268772721290588, + "learning_rate": 8.620156421267883e-06, + "loss": 0.7748, + "step": 8880 + }, + { + "epoch": 0.48879960372062303, + "grad_norm": 0.8015080690383911, + "learning_rate": 8.619857416637993e-06, + "loss": 0.6716, + "step": 8881 + }, + { + "epoch": 0.4888546425229787, + "grad_norm": 0.7464118599891663, + "learning_rate": 8.619558384802226e-06, + "loss": 0.796, + "step": 8882 + }, + { + "epoch": 0.48890968132533436, + "grad_norm": 0.6829718351364136, + "learning_rate": 8.619259325762826e-06, + "loss": 0.788, + "step": 8883 + }, + { + "epoch": 0.48896472012769004, + "grad_norm": 0.6553084850311279, + "learning_rate": 8.618960239522041e-06, + "loss": 0.7215, + "step": 8884 + }, + { + "epoch": 0.4890197589300457, + "grad_norm": 0.8056252598762512, + "learning_rate": 8.618661126082119e-06, + "loss": 0.8588, + "step": 8885 + }, + { + "epoch": 0.48907479773240137, + "grad_norm": 0.8145674467086792, + "learning_rate": 8.618361985445309e-06, + "loss": 0.8095, + "step": 8886 + }, + { + "epoch": 0.489129836534757, + "grad_norm": 0.740031898021698, + "learning_rate": 8.61806281761386e-06, + "loss": 0.7029, + "step": 8887 + }, + { + "epoch": 0.4891848753371127, + "grad_norm": 0.7442640662193298, + "learning_rate": 8.617763622590019e-06, + "loss": 0.782, + "step": 8888 + }, + { + "epoch": 0.4892399141394683, + "grad_norm": 0.6992725133895874, + "learning_rate": 8.617464400376035e-06, + "loss": 0.7877, + "step": 8889 + }, + { + "epoch": 0.489294952941824, + "grad_norm": 1.19756281375885, + "learning_rate": 8.617165150974157e-06, + "loss": 0.6985, + "step": 8890 + }, + { + "epoch": 0.48934999174417965, + "grad_norm": 0.6418262720108032, + "learning_rate": 8.616865874386633e-06, + "loss": 0.7385, + "step": 8891 + }, + { + "epoch": 0.4894050305465353, + "grad_norm": 0.787406325340271, + "learning_rate": 8.616566570615714e-06, + "loss": 0.8686, + "step": 8892 + }, + { + "epoch": 0.48946006934889097, + "grad_norm": 0.6990430951118469, + "learning_rate": 8.616267239663648e-06, + "loss": 0.7683, + "step": 8893 + }, + { + "epoch": 0.4895151081512466, + "grad_norm": 0.7180235981941223, + "learning_rate": 8.615967881532687e-06, + "loss": 0.8337, + "step": 8894 + }, + { + "epoch": 0.4895701469536023, + "grad_norm": 0.7647475600242615, + "learning_rate": 8.615668496225077e-06, + "loss": 0.8668, + "step": 8895 + }, + { + "epoch": 0.4896251857559579, + "grad_norm": 0.843063473701477, + "learning_rate": 8.615369083743072e-06, + "loss": 0.7968, + "step": 8896 + }, + { + "epoch": 0.4896802245583136, + "grad_norm": 0.9526075124740601, + "learning_rate": 8.61506964408892e-06, + "loss": 0.8766, + "step": 8897 + }, + { + "epoch": 0.48973526336066925, + "grad_norm": 0.7850056290626526, + "learning_rate": 8.614770177264874e-06, + "loss": 0.8033, + "step": 8898 + }, + { + "epoch": 0.48979030216302494, + "grad_norm": 0.8658629655838013, + "learning_rate": 8.614470683273182e-06, + "loss": 0.8206, + "step": 8899 + }, + { + "epoch": 0.48984534096538057, + "grad_norm": 0.8060176968574524, + "learning_rate": 8.614171162116096e-06, + "loss": 0.7602, + "step": 8900 + }, + { + "epoch": 0.48990037976773626, + "grad_norm": 0.7398280501365662, + "learning_rate": 8.613871613795865e-06, + "loss": 0.8067, + "step": 8901 + }, + { + "epoch": 0.4899554185700919, + "grad_norm": 0.7341256141662598, + "learning_rate": 8.613572038314744e-06, + "loss": 0.7305, + "step": 8902 + }, + { + "epoch": 0.4900104573724476, + "grad_norm": 0.7832887172698975, + "learning_rate": 8.613272435674984e-06, + "loss": 0.7012, + "step": 8903 + }, + { + "epoch": 0.4900654961748032, + "grad_norm": 0.6536995768547058, + "learning_rate": 8.612972805878834e-06, + "loss": 0.745, + "step": 8904 + }, + { + "epoch": 0.4901205349771589, + "grad_norm": 0.7511856555938721, + "learning_rate": 8.612673148928547e-06, + "loss": 0.7741, + "step": 8905 + }, + { + "epoch": 0.49017557377951454, + "grad_norm": 0.6117261648178101, + "learning_rate": 8.612373464826377e-06, + "loss": 0.5813, + "step": 8906 + }, + { + "epoch": 0.49023061258187023, + "grad_norm": 0.7832254767417908, + "learning_rate": 8.612073753574574e-06, + "loss": 0.7426, + "step": 8907 + }, + { + "epoch": 0.49028565138422586, + "grad_norm": 0.7516622543334961, + "learning_rate": 8.611774015175393e-06, + "loss": 0.8205, + "step": 8908 + }, + { + "epoch": 0.49034069018658155, + "grad_norm": 0.7776936888694763, + "learning_rate": 8.611474249631085e-06, + "loss": 0.8457, + "step": 8909 + }, + { + "epoch": 0.4903957289889372, + "grad_norm": 0.9364853501319885, + "learning_rate": 8.6111744569439e-06, + "loss": 0.9114, + "step": 8910 + }, + { + "epoch": 0.4904507677912929, + "grad_norm": 0.7584181427955627, + "learning_rate": 8.610874637116099e-06, + "loss": 0.6852, + "step": 8911 + }, + { + "epoch": 0.4905058065936485, + "grad_norm": 0.7326254844665527, + "learning_rate": 8.610574790149929e-06, + "loss": 0.7843, + "step": 8912 + }, + { + "epoch": 0.4905608453960042, + "grad_norm": 0.918258547782898, + "learning_rate": 8.610274916047645e-06, + "loss": 0.766, + "step": 8913 + }, + { + "epoch": 0.49061588419835983, + "grad_norm": 1.0083420276641846, + "learning_rate": 8.609975014811502e-06, + "loss": 0.7436, + "step": 8914 + }, + { + "epoch": 0.4906709230007155, + "grad_norm": 0.712664783000946, + "learning_rate": 8.609675086443752e-06, + "loss": 0.7891, + "step": 8915 + }, + { + "epoch": 0.49072596180307115, + "grad_norm": 0.7635206580162048, + "learning_rate": 8.609375130946651e-06, + "loss": 0.7842, + "step": 8916 + }, + { + "epoch": 0.49078100060542684, + "grad_norm": 0.7567723989486694, + "learning_rate": 8.609075148322452e-06, + "loss": 0.8435, + "step": 8917 + }, + { + "epoch": 0.4908360394077825, + "grad_norm": 0.8918718099594116, + "learning_rate": 8.60877513857341e-06, + "loss": 0.8015, + "step": 8918 + }, + { + "epoch": 0.49089107821013817, + "grad_norm": 0.8701914548873901, + "learning_rate": 8.608475101701781e-06, + "loss": 0.7806, + "step": 8919 + }, + { + "epoch": 0.4909461170124938, + "grad_norm": 0.7528215646743774, + "learning_rate": 8.608175037709819e-06, + "loss": 0.7958, + "step": 8920 + }, + { + "epoch": 0.4910011558148495, + "grad_norm": 0.7277387380599976, + "learning_rate": 8.60787494659978e-06, + "loss": 0.7878, + "step": 8921 + }, + { + "epoch": 0.4910561946172051, + "grad_norm": 0.6739892959594727, + "learning_rate": 8.607574828373917e-06, + "loss": 0.7212, + "step": 8922 + }, + { + "epoch": 0.4911112334195608, + "grad_norm": 0.712480366230011, + "learning_rate": 8.607274683034487e-06, + "loss": 0.7966, + "step": 8923 + }, + { + "epoch": 0.49116627222191644, + "grad_norm": 0.7192126512527466, + "learning_rate": 8.606974510583747e-06, + "loss": 0.7032, + "step": 8924 + }, + { + "epoch": 0.49122131102427213, + "grad_norm": 0.7502614855766296, + "learning_rate": 8.606674311023953e-06, + "loss": 0.7465, + "step": 8925 + }, + { + "epoch": 0.49127634982662777, + "grad_norm": 0.8475236892700195, + "learning_rate": 8.606374084357361e-06, + "loss": 0.8083, + "step": 8926 + }, + { + "epoch": 0.49133138862898346, + "grad_norm": 0.6972761750221252, + "learning_rate": 8.606073830586224e-06, + "loss": 0.7206, + "step": 8927 + }, + { + "epoch": 0.4913864274313391, + "grad_norm": 0.6209561824798584, + "learning_rate": 8.605773549712803e-06, + "loss": 0.6664, + "step": 8928 + }, + { + "epoch": 0.4914414662336948, + "grad_norm": 0.7905771732330322, + "learning_rate": 8.605473241739353e-06, + "loss": 0.7243, + "step": 8929 + }, + { + "epoch": 0.4914965050360504, + "grad_norm": 0.762959897518158, + "learning_rate": 8.605172906668131e-06, + "loss": 0.7747, + "step": 8930 + }, + { + "epoch": 0.4915515438384061, + "grad_norm": 0.7297530174255371, + "learning_rate": 8.604872544501394e-06, + "loss": 0.7441, + "step": 8931 + }, + { + "epoch": 0.49160658264076174, + "grad_norm": 0.6732318997383118, + "learning_rate": 8.6045721552414e-06, + "loss": 0.7621, + "step": 8932 + }, + { + "epoch": 0.4916616214431174, + "grad_norm": 0.7010045647621155, + "learning_rate": 8.604271738890407e-06, + "loss": 0.7971, + "step": 8933 + }, + { + "epoch": 0.49171666024547306, + "grad_norm": 0.6996648907661438, + "learning_rate": 8.603971295450672e-06, + "loss": 0.8119, + "step": 8934 + }, + { + "epoch": 0.4917716990478287, + "grad_norm": 0.7679941058158875, + "learning_rate": 8.603670824924456e-06, + "loss": 0.8035, + "step": 8935 + }, + { + "epoch": 0.4918267378501844, + "grad_norm": 0.8009630441665649, + "learning_rate": 8.603370327314011e-06, + "loss": 0.7817, + "step": 8936 + }, + { + "epoch": 0.49188177665254, + "grad_norm": 0.7167709469795227, + "learning_rate": 8.603069802621601e-06, + "loss": 0.7621, + "step": 8937 + }, + { + "epoch": 0.4919368154548957, + "grad_norm": 0.7447960376739502, + "learning_rate": 8.602769250849483e-06, + "loss": 0.7664, + "step": 8938 + }, + { + "epoch": 0.49199185425725134, + "grad_norm": 0.653131365776062, + "learning_rate": 8.602468671999915e-06, + "loss": 0.6927, + "step": 8939 + }, + { + "epoch": 0.492046893059607, + "grad_norm": 0.6758691072463989, + "learning_rate": 8.602168066075158e-06, + "loss": 0.7519, + "step": 8940 + }, + { + "epoch": 0.49210193186196266, + "grad_norm": 0.9186220765113831, + "learning_rate": 8.60186743307747e-06, + "loss": 0.7265, + "step": 8941 + }, + { + "epoch": 0.49215697066431835, + "grad_norm": 0.6781855225563049, + "learning_rate": 8.60156677300911e-06, + "loss": 0.6719, + "step": 8942 + }, + { + "epoch": 0.492212009466674, + "grad_norm": 0.7262865304946899, + "learning_rate": 8.601266085872336e-06, + "loss": 0.6449, + "step": 8943 + }, + { + "epoch": 0.4922670482690297, + "grad_norm": 0.6877585053443909, + "learning_rate": 8.600965371669411e-06, + "loss": 0.6999, + "step": 8944 + }, + { + "epoch": 0.4923220870713853, + "grad_norm": 1.1133443117141724, + "learning_rate": 8.600664630402596e-06, + "loss": 0.7842, + "step": 8945 + }, + { + "epoch": 0.492377125873741, + "grad_norm": 0.643478274345398, + "learning_rate": 8.600363862074149e-06, + "loss": 0.7009, + "step": 8946 + }, + { + "epoch": 0.49243216467609663, + "grad_norm": 0.7692574262619019, + "learning_rate": 8.600063066686331e-06, + "loss": 0.7777, + "step": 8947 + }, + { + "epoch": 0.4924872034784523, + "grad_norm": 0.884963870048523, + "learning_rate": 8.599762244241403e-06, + "loss": 0.7789, + "step": 8948 + }, + { + "epoch": 0.49254224228080795, + "grad_norm": 0.6918813586235046, + "learning_rate": 8.599461394741624e-06, + "loss": 0.7769, + "step": 8949 + }, + { + "epoch": 0.49259728108316364, + "grad_norm": 0.7432044148445129, + "learning_rate": 8.599160518189258e-06, + "loss": 0.7972, + "step": 8950 + }, + { + "epoch": 0.4926523198855193, + "grad_norm": 0.7530491948127747, + "learning_rate": 8.598859614586564e-06, + "loss": 0.8812, + "step": 8951 + }, + { + "epoch": 0.49270735868787496, + "grad_norm": 0.8738592267036438, + "learning_rate": 8.598558683935806e-06, + "loss": 0.6967, + "step": 8952 + }, + { + "epoch": 0.4927623974902306, + "grad_norm": 1.032084584236145, + "learning_rate": 8.598257726239242e-06, + "loss": 0.8513, + "step": 8953 + }, + { + "epoch": 0.4928174362925863, + "grad_norm": 0.8717961311340332, + "learning_rate": 8.597956741499136e-06, + "loss": 0.7703, + "step": 8954 + }, + { + "epoch": 0.4928724750949419, + "grad_norm": 0.6788356900215149, + "learning_rate": 8.597655729717753e-06, + "loss": 0.7649, + "step": 8955 + }, + { + "epoch": 0.4929275138972976, + "grad_norm": 1.0595613718032837, + "learning_rate": 8.59735469089735e-06, + "loss": 0.6967, + "step": 8956 + }, + { + "epoch": 0.49298255269965324, + "grad_norm": 0.7583820819854736, + "learning_rate": 8.597053625040193e-06, + "loss": 0.8384, + "step": 8957 + }, + { + "epoch": 0.49303759150200893, + "grad_norm": 0.7232168912887573, + "learning_rate": 8.596752532148545e-06, + "loss": 0.7643, + "step": 8958 + }, + { + "epoch": 0.49309263030436457, + "grad_norm": 0.727190375328064, + "learning_rate": 8.596451412224666e-06, + "loss": 0.845, + "step": 8959 + }, + { + "epoch": 0.49314766910672025, + "grad_norm": 0.6844252347946167, + "learning_rate": 8.596150265270821e-06, + "loss": 0.7099, + "step": 8960 + }, + { + "epoch": 0.4932027079090759, + "grad_norm": 0.7379910945892334, + "learning_rate": 8.595849091289275e-06, + "loss": 0.8168, + "step": 8961 + }, + { + "epoch": 0.4932577467114316, + "grad_norm": 0.77718186378479, + "learning_rate": 8.595547890282288e-06, + "loss": 0.8457, + "step": 8962 + }, + { + "epoch": 0.4933127855137872, + "grad_norm": 0.686126172542572, + "learning_rate": 8.595246662252127e-06, + "loss": 0.7918, + "step": 8963 + }, + { + "epoch": 0.4933678243161429, + "grad_norm": 0.7406145930290222, + "learning_rate": 8.594945407201051e-06, + "loss": 0.6866, + "step": 8964 + }, + { + "epoch": 0.49342286311849853, + "grad_norm": 0.9543277025222778, + "learning_rate": 8.594644125131331e-06, + "loss": 0.8444, + "step": 8965 + }, + { + "epoch": 0.4934779019208542, + "grad_norm": 0.8659517765045166, + "learning_rate": 8.594342816045228e-06, + "loss": 0.7661, + "step": 8966 + }, + { + "epoch": 0.49353294072320986, + "grad_norm": 0.7289552092552185, + "learning_rate": 8.594041479945005e-06, + "loss": 0.7734, + "step": 8967 + }, + { + "epoch": 0.49358797952556555, + "grad_norm": 0.7232840657234192, + "learning_rate": 8.59374011683293e-06, + "loss": 0.8557, + "step": 8968 + }, + { + "epoch": 0.4936430183279212, + "grad_norm": 0.738684356212616, + "learning_rate": 8.593438726711265e-06, + "loss": 0.7779, + "step": 8969 + }, + { + "epoch": 0.49369805713027687, + "grad_norm": 0.7486668229103088, + "learning_rate": 8.593137309582276e-06, + "loss": 0.7326, + "step": 8970 + }, + { + "epoch": 0.4937530959326325, + "grad_norm": 0.6564297080039978, + "learning_rate": 8.59283586544823e-06, + "loss": 0.6927, + "step": 8971 + }, + { + "epoch": 0.4938081347349882, + "grad_norm": 0.722540557384491, + "learning_rate": 8.592534394311392e-06, + "loss": 0.7254, + "step": 8972 + }, + { + "epoch": 0.4938631735373438, + "grad_norm": 0.7466141581535339, + "learning_rate": 8.592232896174026e-06, + "loss": 0.8551, + "step": 8973 + }, + { + "epoch": 0.4939182123396995, + "grad_norm": 0.7819109559059143, + "learning_rate": 8.591931371038398e-06, + "loss": 0.7271, + "step": 8974 + }, + { + "epoch": 0.49397325114205515, + "grad_norm": 0.7847672700881958, + "learning_rate": 8.591629818906776e-06, + "loss": 0.8404, + "step": 8975 + }, + { + "epoch": 0.49402828994441084, + "grad_norm": 0.8167426586151123, + "learning_rate": 8.591328239781428e-06, + "loss": 0.7375, + "step": 8976 + }, + { + "epoch": 0.49408332874676647, + "grad_norm": 0.7894755005836487, + "learning_rate": 8.591026633664615e-06, + "loss": 0.7872, + "step": 8977 + }, + { + "epoch": 0.4941383675491221, + "grad_norm": 0.726204514503479, + "learning_rate": 8.590725000558609e-06, + "loss": 0.7289, + "step": 8978 + }, + { + "epoch": 0.4941934063514778, + "grad_norm": 0.7116577625274658, + "learning_rate": 8.590423340465675e-06, + "loss": 0.7379, + "step": 8979 + }, + { + "epoch": 0.4942484451538334, + "grad_norm": 0.7302193641662598, + "learning_rate": 8.59012165338808e-06, + "loss": 0.7951, + "step": 8980 + }, + { + "epoch": 0.4943034839561891, + "grad_norm": 0.680555522441864, + "learning_rate": 8.58981993932809e-06, + "loss": 0.7609, + "step": 8981 + }, + { + "epoch": 0.49435852275854475, + "grad_norm": 0.874546229839325, + "learning_rate": 8.589518198287976e-06, + "loss": 0.8025, + "step": 8982 + }, + { + "epoch": 0.49441356156090044, + "grad_norm": 0.7164583206176758, + "learning_rate": 8.589216430270004e-06, + "loss": 0.7466, + "step": 8983 + }, + { + "epoch": 0.49446860036325607, + "grad_norm": 0.9155141115188599, + "learning_rate": 8.588914635276442e-06, + "loss": 0.7896, + "step": 8984 + }, + { + "epoch": 0.49452363916561176, + "grad_norm": 0.6777059435844421, + "learning_rate": 8.588612813309558e-06, + "loss": 0.7468, + "step": 8985 + }, + { + "epoch": 0.4945786779679674, + "grad_norm": 0.7100371718406677, + "learning_rate": 8.58831096437162e-06, + "loss": 0.7216, + "step": 8986 + }, + { + "epoch": 0.4946337167703231, + "grad_norm": 0.6842584609985352, + "learning_rate": 8.5880090884649e-06, + "loss": 0.7103, + "step": 8987 + }, + { + "epoch": 0.4946887555726787, + "grad_norm": 0.6347573399543762, + "learning_rate": 8.587707185591661e-06, + "loss": 0.7103, + "step": 8988 + }, + { + "epoch": 0.4947437943750344, + "grad_norm": 0.7175829410552979, + "learning_rate": 8.587405255754177e-06, + "loss": 0.8375, + "step": 8989 + }, + { + "epoch": 0.49479883317739004, + "grad_norm": 0.8402735590934753, + "learning_rate": 8.587103298954715e-06, + "loss": 0.6841, + "step": 8990 + }, + { + "epoch": 0.49485387197974573, + "grad_norm": 0.6988743543624878, + "learning_rate": 8.586801315195545e-06, + "loss": 0.7637, + "step": 8991 + }, + { + "epoch": 0.49490891078210136, + "grad_norm": 0.6672561168670654, + "learning_rate": 8.586499304478934e-06, + "loss": 0.7103, + "step": 8992 + }, + { + "epoch": 0.49496394958445705, + "grad_norm": 0.6821330189704895, + "learning_rate": 8.586197266807158e-06, + "loss": 0.6881, + "step": 8993 + }, + { + "epoch": 0.4950189883868127, + "grad_norm": 0.7886170744895935, + "learning_rate": 8.585895202182482e-06, + "loss": 0.7892, + "step": 8994 + }, + { + "epoch": 0.4950740271891684, + "grad_norm": 0.7348074913024902, + "learning_rate": 8.585593110607177e-06, + "loss": 0.7835, + "step": 8995 + }, + { + "epoch": 0.495129065991524, + "grad_norm": 0.9375506639480591, + "learning_rate": 8.585290992083514e-06, + "loss": 0.8017, + "step": 8996 + }, + { + "epoch": 0.4951841047938797, + "grad_norm": 0.7442331910133362, + "learning_rate": 8.584988846613765e-06, + "loss": 0.72, + "step": 8997 + }, + { + "epoch": 0.49523914359623533, + "grad_norm": 0.7347918748855591, + "learning_rate": 8.584686674200197e-06, + "loss": 0.8229, + "step": 8998 + }, + { + "epoch": 0.495294182398591, + "grad_norm": 0.7168740630149841, + "learning_rate": 8.584384474845084e-06, + "loss": 0.7288, + "step": 8999 + }, + { + "epoch": 0.49534922120094665, + "grad_norm": 0.7834853529930115, + "learning_rate": 8.584082248550697e-06, + "loss": 0.8521, + "step": 9000 + }, + { + "epoch": 0.49540426000330234, + "grad_norm": 0.6499035358428955, + "learning_rate": 8.58377999531931e-06, + "loss": 0.6887, + "step": 9001 + }, + { + "epoch": 0.495459298805658, + "grad_norm": 0.8000181913375854, + "learning_rate": 8.583477715153189e-06, + "loss": 0.8688, + "step": 9002 + }, + { + "epoch": 0.49551433760801367, + "grad_norm": 0.7539342045783997, + "learning_rate": 8.58317540805461e-06, + "loss": 0.6151, + "step": 9003 + }, + { + "epoch": 0.4955693764103693, + "grad_norm": 0.7677812576293945, + "learning_rate": 8.582873074025841e-06, + "loss": 0.8168, + "step": 9004 + }, + { + "epoch": 0.495624415212725, + "grad_norm": 0.7679157853126526, + "learning_rate": 8.58257071306916e-06, + "loss": 0.7719, + "step": 9005 + }, + { + "epoch": 0.4956794540150806, + "grad_norm": 0.9745703935623169, + "learning_rate": 8.582268325186836e-06, + "loss": 0.8272, + "step": 9006 + }, + { + "epoch": 0.4957344928174363, + "grad_norm": 0.66932612657547, + "learning_rate": 8.581965910381143e-06, + "loss": 0.7256, + "step": 9007 + }, + { + "epoch": 0.49578953161979195, + "grad_norm": 0.7630981206893921, + "learning_rate": 8.581663468654351e-06, + "loss": 0.7594, + "step": 9008 + }, + { + "epoch": 0.49584457042214763, + "grad_norm": 0.7420778870582581, + "learning_rate": 8.581361000008737e-06, + "loss": 0.7834, + "step": 9009 + }, + { + "epoch": 0.49589960922450327, + "grad_norm": 0.6775205731391907, + "learning_rate": 8.58105850444657e-06, + "loss": 0.7609, + "step": 9010 + }, + { + "epoch": 0.49595464802685896, + "grad_norm": 0.6588264107704163, + "learning_rate": 8.580755981970128e-06, + "loss": 0.805, + "step": 9011 + }, + { + "epoch": 0.4960096868292146, + "grad_norm": 0.7325689196586609, + "learning_rate": 8.580453432581681e-06, + "loss": 0.8817, + "step": 9012 + }, + { + "epoch": 0.4960647256315703, + "grad_norm": 0.7319273948669434, + "learning_rate": 8.580150856283505e-06, + "loss": 0.8001, + "step": 9013 + }, + { + "epoch": 0.4961197644339259, + "grad_norm": 0.7841789126396179, + "learning_rate": 8.579848253077875e-06, + "loss": 0.8415, + "step": 9014 + }, + { + "epoch": 0.4961748032362816, + "grad_norm": 0.7593979239463806, + "learning_rate": 8.579545622967062e-06, + "loss": 0.8238, + "step": 9015 + }, + { + "epoch": 0.49622984203863724, + "grad_norm": 0.6938808560371399, + "learning_rate": 8.579242965953343e-06, + "loss": 0.7325, + "step": 9016 + }, + { + "epoch": 0.4962848808409929, + "grad_norm": 0.7907594442367554, + "learning_rate": 8.578940282038993e-06, + "loss": 0.6947, + "step": 9017 + }, + { + "epoch": 0.49633991964334856, + "grad_norm": 0.708703875541687, + "learning_rate": 8.578637571226283e-06, + "loss": 0.6712, + "step": 9018 + }, + { + "epoch": 0.49639495844570425, + "grad_norm": 0.6820377707481384, + "learning_rate": 8.578334833517492e-06, + "loss": 0.7269, + "step": 9019 + }, + { + "epoch": 0.4964499972480599, + "grad_norm": 0.6858653426170349, + "learning_rate": 8.578032068914896e-06, + "loss": 0.7325, + "step": 9020 + }, + { + "epoch": 0.4965050360504155, + "grad_norm": 0.8758736848831177, + "learning_rate": 8.577729277420768e-06, + "loss": 0.6652, + "step": 9021 + }, + { + "epoch": 0.4965600748527712, + "grad_norm": 0.731316328048706, + "learning_rate": 8.577426459037383e-06, + "loss": 0.7835, + "step": 9022 + }, + { + "epoch": 0.49661511365512684, + "grad_norm": 0.813778817653656, + "learning_rate": 8.57712361376702e-06, + "loss": 0.8025, + "step": 9023 + }, + { + "epoch": 0.4966701524574825, + "grad_norm": 0.7167351841926575, + "learning_rate": 8.576820741611952e-06, + "loss": 0.7483, + "step": 9024 + }, + { + "epoch": 0.49672519125983816, + "grad_norm": 0.7243192791938782, + "learning_rate": 8.576517842574457e-06, + "loss": 0.8411, + "step": 9025 + }, + { + "epoch": 0.49678023006219385, + "grad_norm": 0.5869036316871643, + "learning_rate": 8.576214916656814e-06, + "loss": 0.6661, + "step": 9026 + }, + { + "epoch": 0.4968352688645495, + "grad_norm": 0.7502203583717346, + "learning_rate": 8.575911963861293e-06, + "loss": 0.8838, + "step": 9027 + }, + { + "epoch": 0.4968903076669052, + "grad_norm": 0.687562108039856, + "learning_rate": 8.575608984190177e-06, + "loss": 0.7446, + "step": 9028 + }, + { + "epoch": 0.4969453464692608, + "grad_norm": 0.7735342383384705, + "learning_rate": 8.57530597764574e-06, + "loss": 0.8464, + "step": 9029 + }, + { + "epoch": 0.4970003852716165, + "grad_norm": 0.7828487753868103, + "learning_rate": 8.575002944230261e-06, + "loss": 0.7504, + "step": 9030 + }, + { + "epoch": 0.49705542407397213, + "grad_norm": 0.6359286904335022, + "learning_rate": 8.574699883946018e-06, + "loss": 0.6805, + "step": 9031 + }, + { + "epoch": 0.4971104628763278, + "grad_norm": 0.7462830543518066, + "learning_rate": 8.574396796795285e-06, + "loss": 0.8317, + "step": 9032 + }, + { + "epoch": 0.49716550167868345, + "grad_norm": 0.705115795135498, + "learning_rate": 8.574093682780344e-06, + "loss": 0.7401, + "step": 9033 + }, + { + "epoch": 0.49722054048103914, + "grad_norm": 0.6466538310050964, + "learning_rate": 8.573790541903472e-06, + "loss": 0.7761, + "step": 9034 + }, + { + "epoch": 0.4972755792833948, + "grad_norm": 0.7479867339134216, + "learning_rate": 8.573487374166946e-06, + "loss": 0.8394, + "step": 9035 + }, + { + "epoch": 0.49733061808575046, + "grad_norm": 0.7378019094467163, + "learning_rate": 8.573184179573046e-06, + "loss": 0.8215, + "step": 9036 + }, + { + "epoch": 0.4973856568881061, + "grad_norm": 0.6526094675064087, + "learning_rate": 8.57288095812405e-06, + "loss": 0.8055, + "step": 9037 + }, + { + "epoch": 0.4974406956904618, + "grad_norm": 0.679595947265625, + "learning_rate": 8.572577709822238e-06, + "loss": 0.8241, + "step": 9038 + }, + { + "epoch": 0.4974957344928174, + "grad_norm": 0.753466010093689, + "learning_rate": 8.572274434669886e-06, + "loss": 0.896, + "step": 9039 + }, + { + "epoch": 0.4975507732951731, + "grad_norm": 0.7068368792533875, + "learning_rate": 8.571971132669277e-06, + "loss": 0.778, + "step": 9040 + }, + { + "epoch": 0.49760581209752874, + "grad_norm": 0.7397973537445068, + "learning_rate": 8.571667803822689e-06, + "loss": 0.782, + "step": 9041 + }, + { + "epoch": 0.49766085089988443, + "grad_norm": 0.7837033271789551, + "learning_rate": 8.571364448132402e-06, + "loss": 0.7509, + "step": 9042 + }, + { + "epoch": 0.49771588970224007, + "grad_norm": 0.6808765530586243, + "learning_rate": 8.571061065600696e-06, + "loss": 0.672, + "step": 9043 + }, + { + "epoch": 0.49777092850459576, + "grad_norm": 0.6574100255966187, + "learning_rate": 8.570757656229852e-06, + "loss": 0.751, + "step": 9044 + }, + { + "epoch": 0.4978259673069514, + "grad_norm": 0.7357671856880188, + "learning_rate": 8.570454220022146e-06, + "loss": 0.7977, + "step": 9045 + }, + { + "epoch": 0.4978810061093071, + "grad_norm": 0.7937216758728027, + "learning_rate": 8.570150756979865e-06, + "loss": 0.8151, + "step": 9046 + }, + { + "epoch": 0.4979360449116627, + "grad_norm": 0.7050907611846924, + "learning_rate": 8.569847267105285e-06, + "loss": 0.7667, + "step": 9047 + }, + { + "epoch": 0.4979910837140184, + "grad_norm": 0.7105300426483154, + "learning_rate": 8.569543750400688e-06, + "loss": 0.7031, + "step": 9048 + }, + { + "epoch": 0.49804612251637403, + "grad_norm": 0.7174646854400635, + "learning_rate": 8.569240206868358e-06, + "loss": 0.7692, + "step": 9049 + }, + { + "epoch": 0.4981011613187297, + "grad_norm": 0.7525906562805176, + "learning_rate": 8.568936636510573e-06, + "loss": 0.7584, + "step": 9050 + }, + { + "epoch": 0.49815620012108536, + "grad_norm": 1.5518100261688232, + "learning_rate": 8.568633039329615e-06, + "loss": 0.7932, + "step": 9051 + }, + { + "epoch": 0.49821123892344105, + "grad_norm": 0.7037720084190369, + "learning_rate": 8.568329415327766e-06, + "loss": 0.8345, + "step": 9052 + }, + { + "epoch": 0.4982662777257967, + "grad_norm": 0.6422694325447083, + "learning_rate": 8.568025764507308e-06, + "loss": 0.7396, + "step": 9053 + }, + { + "epoch": 0.49832131652815237, + "grad_norm": 0.777306854724884, + "learning_rate": 8.567722086870525e-06, + "loss": 0.8605, + "step": 9054 + }, + { + "epoch": 0.498376355330508, + "grad_norm": 0.6619865298271179, + "learning_rate": 8.567418382419697e-06, + "loss": 0.7395, + "step": 9055 + }, + { + "epoch": 0.4984313941328637, + "grad_norm": 0.7214456796646118, + "learning_rate": 8.567114651157106e-06, + "loss": 0.7932, + "step": 9056 + }, + { + "epoch": 0.4984864329352193, + "grad_norm": 0.75806725025177, + "learning_rate": 8.566810893085037e-06, + "loss": 0.7998, + "step": 9057 + }, + { + "epoch": 0.498541471737575, + "grad_norm": 0.8089895844459534, + "learning_rate": 8.566507108205773e-06, + "loss": 0.7849, + "step": 9058 + }, + { + "epoch": 0.49859651053993065, + "grad_norm": 0.817814290523529, + "learning_rate": 8.566203296521597e-06, + "loss": 0.7261, + "step": 9059 + }, + { + "epoch": 0.49865154934228634, + "grad_norm": 0.7417539954185486, + "learning_rate": 8.56589945803479e-06, + "loss": 0.7087, + "step": 9060 + }, + { + "epoch": 0.49870658814464197, + "grad_norm": 0.7518000602722168, + "learning_rate": 8.565595592747639e-06, + "loss": 0.7245, + "step": 9061 + }, + { + "epoch": 0.49876162694699766, + "grad_norm": 0.9537304043769836, + "learning_rate": 8.565291700662423e-06, + "loss": 0.901, + "step": 9062 + }, + { + "epoch": 0.4988166657493533, + "grad_norm": 0.784545361995697, + "learning_rate": 8.56498778178143e-06, + "loss": 0.7813, + "step": 9063 + }, + { + "epoch": 0.4988717045517089, + "grad_norm": 0.9218429923057556, + "learning_rate": 8.564683836106945e-06, + "loss": 0.8452, + "step": 9064 + }, + { + "epoch": 0.4989267433540646, + "grad_norm": 0.6902065277099609, + "learning_rate": 8.56437986364125e-06, + "loss": 0.7527, + "step": 9065 + }, + { + "epoch": 0.49898178215642025, + "grad_norm": 0.7388677000999451, + "learning_rate": 8.56407586438663e-06, + "loss": 0.82, + "step": 9066 + }, + { + "epoch": 0.49903682095877594, + "grad_norm": 0.6959313154220581, + "learning_rate": 8.563771838345369e-06, + "loss": 0.7274, + "step": 9067 + }, + { + "epoch": 0.4990918597611316, + "grad_norm": 0.6582610607147217, + "learning_rate": 8.563467785519753e-06, + "loss": 0.6518, + "step": 9068 + }, + { + "epoch": 0.49914689856348726, + "grad_norm": 0.6525924801826477, + "learning_rate": 8.563163705912066e-06, + "loss": 0.7006, + "step": 9069 + }, + { + "epoch": 0.4992019373658429, + "grad_norm": 0.8092843890190125, + "learning_rate": 8.562859599524596e-06, + "loss": 0.6915, + "step": 9070 + }, + { + "epoch": 0.4992569761681986, + "grad_norm": 0.6540575623512268, + "learning_rate": 8.562555466359626e-06, + "loss": 0.6729, + "step": 9071 + }, + { + "epoch": 0.4993120149705542, + "grad_norm": 0.8220445513725281, + "learning_rate": 8.562251306419443e-06, + "loss": 0.8172, + "step": 9072 + }, + { + "epoch": 0.4993670537729099, + "grad_norm": 0.7461502552032471, + "learning_rate": 8.561947119706334e-06, + "loss": 0.6902, + "step": 9073 + }, + { + "epoch": 0.49942209257526554, + "grad_norm": 0.8166316151618958, + "learning_rate": 8.56164290622258e-06, + "loss": 0.8238, + "step": 9074 + }, + { + "epoch": 0.49947713137762123, + "grad_norm": 0.8453896641731262, + "learning_rate": 8.561338665970476e-06, + "loss": 0.7697, + "step": 9075 + }, + { + "epoch": 0.49953217017997686, + "grad_norm": 0.7606340050697327, + "learning_rate": 8.5610343989523e-06, + "loss": 0.6951, + "step": 9076 + }, + { + "epoch": 0.49958720898233255, + "grad_norm": 0.7408013343811035, + "learning_rate": 8.560730105170345e-06, + "loss": 0.8298, + "step": 9077 + }, + { + "epoch": 0.4996422477846882, + "grad_norm": 0.7625541090965271, + "learning_rate": 8.560425784626896e-06, + "loss": 0.6738, + "step": 9078 + }, + { + "epoch": 0.4996972865870439, + "grad_norm": 0.6940996646881104, + "learning_rate": 8.560121437324238e-06, + "loss": 0.78, + "step": 9079 + }, + { + "epoch": 0.4997523253893995, + "grad_norm": 0.8087461590766907, + "learning_rate": 8.559817063264661e-06, + "loss": 0.7831, + "step": 9080 + }, + { + "epoch": 0.4998073641917552, + "grad_norm": 0.7418510317802429, + "learning_rate": 8.559512662450452e-06, + "loss": 0.801, + "step": 9081 + }, + { + "epoch": 0.49986240299411083, + "grad_norm": 0.6793946027755737, + "learning_rate": 8.5592082348839e-06, + "loss": 0.7329, + "step": 9082 + }, + { + "epoch": 0.4999174417964665, + "grad_norm": 0.8197429180145264, + "learning_rate": 8.55890378056729e-06, + "loss": 0.804, + "step": 9083 + }, + { + "epoch": 0.49997248059882216, + "grad_norm": 0.7526460886001587, + "learning_rate": 8.558599299502912e-06, + "loss": 0.8378, + "step": 9084 + }, + { + "epoch": 0.5000275194011778, + "grad_norm": 0.8169133067131042, + "learning_rate": 8.558294791693055e-06, + "loss": 0.828, + "step": 9085 + }, + { + "epoch": 0.5000825582035335, + "grad_norm": 0.8386932015419006, + "learning_rate": 8.557990257140007e-06, + "loss": 0.7961, + "step": 9086 + }, + { + "epoch": 0.5001375970058891, + "grad_norm": 0.7183443903923035, + "learning_rate": 8.557685695846057e-06, + "loss": 0.6964, + "step": 9087 + }, + { + "epoch": 0.5001926358082448, + "grad_norm": 0.77079176902771, + "learning_rate": 8.557381107813491e-06, + "loss": 0.8222, + "step": 9088 + }, + { + "epoch": 0.5002476746106005, + "grad_norm": 0.6519342660903931, + "learning_rate": 8.557076493044603e-06, + "loss": 0.772, + "step": 9089 + }, + { + "epoch": 0.5003027134129562, + "grad_norm": 0.7039975523948669, + "learning_rate": 8.556771851541678e-06, + "loss": 0.7491, + "step": 9090 + }, + { + "epoch": 0.5003577522153118, + "grad_norm": 0.6459039449691772, + "learning_rate": 8.556467183307012e-06, + "loss": 0.7104, + "step": 9091 + }, + { + "epoch": 0.5004127910176674, + "grad_norm": 0.7359183430671692, + "learning_rate": 8.556162488342887e-06, + "loss": 0.829, + "step": 9092 + }, + { + "epoch": 0.5004678298200231, + "grad_norm": 0.7029602527618408, + "learning_rate": 8.555857766651599e-06, + "loss": 0.8163, + "step": 9093 + }, + { + "epoch": 0.5005228686223788, + "grad_norm": 0.6687049865722656, + "learning_rate": 8.555553018235435e-06, + "loss": 0.7589, + "step": 9094 + }, + { + "epoch": 0.5005779074247344, + "grad_norm": 0.7277147173881531, + "learning_rate": 8.555248243096686e-06, + "loss": 0.8334, + "step": 9095 + }, + { + "epoch": 0.5006329462270901, + "grad_norm": 0.6512065529823303, + "learning_rate": 8.554943441237642e-06, + "loss": 0.7174, + "step": 9096 + }, + { + "epoch": 0.5006879850294458, + "grad_norm": 0.725351095199585, + "learning_rate": 8.554638612660594e-06, + "loss": 0.6514, + "step": 9097 + }, + { + "epoch": 0.5007430238318015, + "grad_norm": 0.7983208894729614, + "learning_rate": 8.554333757367836e-06, + "loss": 0.8385, + "step": 9098 + }, + { + "epoch": 0.500798062634157, + "grad_norm": 0.6631388068199158, + "learning_rate": 8.554028875361657e-06, + "loss": 0.7103, + "step": 9099 + }, + { + "epoch": 0.5008531014365127, + "grad_norm": 0.730421245098114, + "learning_rate": 8.553723966644347e-06, + "loss": 0.8005, + "step": 9100 + }, + { + "epoch": 0.5009081402388684, + "grad_norm": 0.7385838627815247, + "learning_rate": 8.5534190312182e-06, + "loss": 0.7586, + "step": 9101 + }, + { + "epoch": 0.5009631790412241, + "grad_norm": 0.712458610534668, + "learning_rate": 8.553114069085506e-06, + "loss": 0.7587, + "step": 9102 + }, + { + "epoch": 0.5010182178435797, + "grad_norm": 0.7393542528152466, + "learning_rate": 8.552809080248559e-06, + "loss": 0.746, + "step": 9103 + }, + { + "epoch": 0.5010732566459354, + "grad_norm": 0.6596370935440063, + "learning_rate": 8.552504064709649e-06, + "loss": 0.6968, + "step": 9104 + }, + { + "epoch": 0.5011282954482911, + "grad_norm": 0.7340545654296875, + "learning_rate": 8.552199022471069e-06, + "loss": 0.8326, + "step": 9105 + }, + { + "epoch": 0.5011833342506467, + "grad_norm": 0.6586140990257263, + "learning_rate": 8.55189395353511e-06, + "loss": 0.7144, + "step": 9106 + }, + { + "epoch": 0.5012383730530023, + "grad_norm": 0.6875959038734436, + "learning_rate": 8.551588857904071e-06, + "loss": 0.721, + "step": 9107 + }, + { + "epoch": 0.501293411855358, + "grad_norm": 0.6754499077796936, + "learning_rate": 8.551283735580238e-06, + "loss": 0.6771, + "step": 9108 + }, + { + "epoch": 0.5013484506577137, + "grad_norm": 0.8027325868606567, + "learning_rate": 8.55097858656591e-06, + "loss": 0.8196, + "step": 9109 + }, + { + "epoch": 0.5014034894600693, + "grad_norm": 0.6992260217666626, + "learning_rate": 8.550673410863376e-06, + "loss": 0.7923, + "step": 9110 + }, + { + "epoch": 0.501458528262425, + "grad_norm": 0.741205632686615, + "learning_rate": 8.550368208474928e-06, + "loss": 0.7036, + "step": 9111 + }, + { + "epoch": 0.5015135670647807, + "grad_norm": 0.6485981345176697, + "learning_rate": 8.550062979402866e-06, + "loss": 0.6351, + "step": 9112 + }, + { + "epoch": 0.5015686058671364, + "grad_norm": 0.6984226703643799, + "learning_rate": 8.549757723649481e-06, + "loss": 0.7714, + "step": 9113 + }, + { + "epoch": 0.5016236446694919, + "grad_norm": 0.7773998975753784, + "learning_rate": 8.549452441217067e-06, + "loss": 0.8901, + "step": 9114 + }, + { + "epoch": 0.5016786834718476, + "grad_norm": 0.6912227272987366, + "learning_rate": 8.549147132107918e-06, + "loss": 0.7702, + "step": 9115 + }, + { + "epoch": 0.5017337222742033, + "grad_norm": 0.6742583513259888, + "learning_rate": 8.54884179632433e-06, + "loss": 0.7789, + "step": 9116 + }, + { + "epoch": 0.501788761076559, + "grad_norm": 0.7896195650100708, + "learning_rate": 8.548536433868595e-06, + "loss": 0.7358, + "step": 9117 + }, + { + "epoch": 0.5018437998789146, + "grad_norm": 0.7112523913383484, + "learning_rate": 8.548231044743011e-06, + "loss": 0.7286, + "step": 9118 + }, + { + "epoch": 0.5018988386812703, + "grad_norm": 0.9162774085998535, + "learning_rate": 8.547925628949873e-06, + "loss": 0.935, + "step": 9119 + }, + { + "epoch": 0.501953877483626, + "grad_norm": 0.6319599747657776, + "learning_rate": 8.547620186491477e-06, + "loss": 0.625, + "step": 9120 + }, + { + "epoch": 0.5020089162859817, + "grad_norm": 0.7074719667434692, + "learning_rate": 8.547314717370115e-06, + "loss": 0.6614, + "step": 9121 + }, + { + "epoch": 0.5020639550883372, + "grad_norm": 0.7417262196540833, + "learning_rate": 8.547009221588086e-06, + "loss": 0.8476, + "step": 9122 + }, + { + "epoch": 0.5021189938906929, + "grad_norm": 0.7057339549064636, + "learning_rate": 8.546703699147685e-06, + "loss": 0.805, + "step": 9123 + }, + { + "epoch": 0.5021740326930486, + "grad_norm": 0.7420887351036072, + "learning_rate": 8.546398150051207e-06, + "loss": 0.7331, + "step": 9124 + }, + { + "epoch": 0.5022290714954043, + "grad_norm": 0.9526195526123047, + "learning_rate": 8.546092574300953e-06, + "loss": 0.7803, + "step": 9125 + }, + { + "epoch": 0.5022841102977599, + "grad_norm": 0.748130202293396, + "learning_rate": 8.545786971899214e-06, + "loss": 0.7998, + "step": 9126 + }, + { + "epoch": 0.5023391491001156, + "grad_norm": 0.7266026139259338, + "learning_rate": 8.545481342848289e-06, + "loss": 0.8377, + "step": 9127 + }, + { + "epoch": 0.5023941879024713, + "grad_norm": 0.6762456893920898, + "learning_rate": 8.545175687150478e-06, + "loss": 0.7312, + "step": 9128 + }, + { + "epoch": 0.502449226704827, + "grad_norm": 0.7011429667472839, + "learning_rate": 8.544870004808072e-06, + "loss": 0.7666, + "step": 9129 + }, + { + "epoch": 0.5025042655071825, + "grad_norm": 0.6652229428291321, + "learning_rate": 8.544564295823375e-06, + "loss": 0.6904, + "step": 9130 + }, + { + "epoch": 0.5025593043095382, + "grad_norm": 0.8333765268325806, + "learning_rate": 8.54425856019868e-06, + "loss": 0.7318, + "step": 9131 + }, + { + "epoch": 0.5026143431118939, + "grad_norm": 0.6827245950698853, + "learning_rate": 8.543952797936285e-06, + "loss": 0.7692, + "step": 9132 + }, + { + "epoch": 0.5026693819142496, + "grad_norm": 0.8744323253631592, + "learning_rate": 8.543647009038491e-06, + "loss": 0.7316, + "step": 9133 + }, + { + "epoch": 0.5027244207166052, + "grad_norm": 0.7024276852607727, + "learning_rate": 8.543341193507594e-06, + "loss": 0.7008, + "step": 9134 + }, + { + "epoch": 0.5027794595189609, + "grad_norm": 0.8786055445671082, + "learning_rate": 8.543035351345895e-06, + "loss": 0.7054, + "step": 9135 + }, + { + "epoch": 0.5028344983213165, + "grad_norm": 0.727924108505249, + "learning_rate": 8.54272948255569e-06, + "loss": 0.8049, + "step": 9136 + }, + { + "epoch": 0.5028895371236722, + "grad_norm": 0.8366256356239319, + "learning_rate": 8.542423587139277e-06, + "loss": 0.7926, + "step": 9137 + }, + { + "epoch": 0.5029445759260278, + "grad_norm": 0.7657913565635681, + "learning_rate": 8.542117665098958e-06, + "loss": 0.8152, + "step": 9138 + }, + { + "epoch": 0.5029996147283835, + "grad_norm": 0.7543498277664185, + "learning_rate": 8.54181171643703e-06, + "loss": 0.7566, + "step": 9139 + }, + { + "epoch": 0.5030546535307392, + "grad_norm": 0.7771349549293518, + "learning_rate": 8.541505741155794e-06, + "loss": 0.7907, + "step": 9140 + }, + { + "epoch": 0.5031096923330949, + "grad_norm": 0.6661877632141113, + "learning_rate": 8.541199739257548e-06, + "loss": 0.7481, + "step": 9141 + }, + { + "epoch": 0.5031647311354505, + "grad_norm": 0.7700417637825012, + "learning_rate": 8.540893710744593e-06, + "loss": 0.7544, + "step": 9142 + }, + { + "epoch": 0.5032197699378061, + "grad_norm": 0.6476640105247498, + "learning_rate": 8.54058765561923e-06, + "loss": 0.7221, + "step": 9143 + }, + { + "epoch": 0.5032748087401618, + "grad_norm": 0.7098944187164307, + "learning_rate": 8.540281573883755e-06, + "loss": 0.8083, + "step": 9144 + }, + { + "epoch": 0.5033298475425175, + "grad_norm": 0.9733545184135437, + "learning_rate": 8.539975465540473e-06, + "loss": 0.7381, + "step": 9145 + }, + { + "epoch": 0.5033848863448731, + "grad_norm": 0.641211986541748, + "learning_rate": 8.539669330591685e-06, + "loss": 0.7511, + "step": 9146 + }, + { + "epoch": 0.5034399251472288, + "grad_norm": 0.626027524471283, + "learning_rate": 8.539363169039687e-06, + "loss": 0.7321, + "step": 9147 + }, + { + "epoch": 0.5034949639495845, + "grad_norm": 0.7627241611480713, + "learning_rate": 8.539056980886785e-06, + "loss": 0.7269, + "step": 9148 + }, + { + "epoch": 0.5035500027519401, + "grad_norm": 0.6711145639419556, + "learning_rate": 8.538750766135275e-06, + "loss": 0.8179, + "step": 9149 + }, + { + "epoch": 0.5036050415542958, + "grad_norm": 0.6981950998306274, + "learning_rate": 8.538444524787463e-06, + "loss": 0.8095, + "step": 9150 + }, + { + "epoch": 0.5036600803566514, + "grad_norm": 0.8869871497154236, + "learning_rate": 8.53813825684565e-06, + "loss": 0.8549, + "step": 9151 + }, + { + "epoch": 0.5037151191590071, + "grad_norm": 0.6461544036865234, + "learning_rate": 8.537831962312137e-06, + "loss": 0.7388, + "step": 9152 + }, + { + "epoch": 0.5037701579613627, + "grad_norm": 0.8279222249984741, + "learning_rate": 8.537525641189224e-06, + "loss": 0.8609, + "step": 9153 + }, + { + "epoch": 0.5038251967637184, + "grad_norm": 0.7117578387260437, + "learning_rate": 8.537219293479217e-06, + "loss": 0.802, + "step": 9154 + }, + { + "epoch": 0.5038802355660741, + "grad_norm": 0.6831860542297363, + "learning_rate": 8.536912919184416e-06, + "loss": 0.7821, + "step": 9155 + }, + { + "epoch": 0.5039352743684298, + "grad_norm": 1.1528539657592773, + "learning_rate": 8.536606518307125e-06, + "loss": 0.8578, + "step": 9156 + }, + { + "epoch": 0.5039903131707854, + "grad_norm": 0.6545060873031616, + "learning_rate": 8.536300090849645e-06, + "loss": 0.7744, + "step": 9157 + }, + { + "epoch": 0.504045351973141, + "grad_norm": 0.7176601886749268, + "learning_rate": 8.535993636814281e-06, + "loss": 0.8104, + "step": 9158 + }, + { + "epoch": 0.5041003907754967, + "grad_norm": 0.8458410501480103, + "learning_rate": 8.535687156203334e-06, + "loss": 0.8653, + "step": 9159 + }, + { + "epoch": 0.5041554295778524, + "grad_norm": 0.7500274777412415, + "learning_rate": 8.53538064901911e-06, + "loss": 0.8043, + "step": 9160 + }, + { + "epoch": 0.504210468380208, + "grad_norm": 0.6982965469360352, + "learning_rate": 8.535074115263911e-06, + "loss": 0.7564, + "step": 9161 + }, + { + "epoch": 0.5042655071825637, + "grad_norm": 0.8344218134880066, + "learning_rate": 8.534767554940042e-06, + "loss": 0.7575, + "step": 9162 + }, + { + "epoch": 0.5043205459849194, + "grad_norm": 0.7527137398719788, + "learning_rate": 8.534460968049806e-06, + "loss": 0.7757, + "step": 9163 + }, + { + "epoch": 0.5043755847872751, + "grad_norm": 0.7136969566345215, + "learning_rate": 8.534154354595508e-06, + "loss": 0.826, + "step": 9164 + }, + { + "epoch": 0.5044306235896306, + "grad_norm": 0.8102819919586182, + "learning_rate": 8.533847714579449e-06, + "loss": 0.7247, + "step": 9165 + }, + { + "epoch": 0.5044856623919863, + "grad_norm": 0.7568309903144836, + "learning_rate": 8.53354104800394e-06, + "loss": 0.8509, + "step": 9166 + }, + { + "epoch": 0.504540701194342, + "grad_norm": 0.7719592452049255, + "learning_rate": 8.53323435487128e-06, + "loss": 0.8039, + "step": 9167 + }, + { + "epoch": 0.5045957399966977, + "grad_norm": 0.7514411807060242, + "learning_rate": 8.532927635183778e-06, + "loss": 0.8759, + "step": 9168 + }, + { + "epoch": 0.5046507787990533, + "grad_norm": 0.9781903028488159, + "learning_rate": 8.532620888943736e-06, + "loss": 0.8022, + "step": 9169 + }, + { + "epoch": 0.504705817601409, + "grad_norm": 0.7713304758071899, + "learning_rate": 8.532314116153462e-06, + "loss": 0.8372, + "step": 9170 + }, + { + "epoch": 0.5047608564037647, + "grad_norm": 0.7519709467887878, + "learning_rate": 8.53200731681526e-06, + "loss": 0.7374, + "step": 9171 + }, + { + "epoch": 0.5048158952061204, + "grad_norm": 0.6923980712890625, + "learning_rate": 8.531700490931438e-06, + "loss": 0.7511, + "step": 9172 + }, + { + "epoch": 0.5048709340084759, + "grad_norm": 0.682357907295227, + "learning_rate": 8.5313936385043e-06, + "loss": 0.7647, + "step": 9173 + }, + { + "epoch": 0.5049259728108316, + "grad_norm": 0.8255659341812134, + "learning_rate": 8.531086759536152e-06, + "loss": 0.7533, + "step": 9174 + }, + { + "epoch": 0.5049810116131873, + "grad_norm": 0.6774975061416626, + "learning_rate": 8.530779854029301e-06, + "loss": 0.7019, + "step": 9175 + }, + { + "epoch": 0.505036050415543, + "grad_norm": 0.7973241209983826, + "learning_rate": 8.530472921986053e-06, + "loss": 0.7824, + "step": 9176 + }, + { + "epoch": 0.5050910892178986, + "grad_norm": 0.8216109275817871, + "learning_rate": 8.530165963408716e-06, + "loss": 0.8063, + "step": 9177 + }, + { + "epoch": 0.5051461280202543, + "grad_norm": 0.7277935743331909, + "learning_rate": 8.5298589782996e-06, + "loss": 0.7631, + "step": 9178 + }, + { + "epoch": 0.50520116682261, + "grad_norm": 0.6647855043411255, + "learning_rate": 8.529551966661004e-06, + "loss": 0.7462, + "step": 9179 + }, + { + "epoch": 0.5052562056249656, + "grad_norm": 0.766272783279419, + "learning_rate": 8.529244928495241e-06, + "loss": 0.8075, + "step": 9180 + }, + { + "epoch": 0.5053112444273212, + "grad_norm": 0.7276293635368347, + "learning_rate": 8.52893786380462e-06, + "loss": 0.7908, + "step": 9181 + }, + { + "epoch": 0.5053662832296769, + "grad_norm": 0.7864169478416443, + "learning_rate": 8.528630772591447e-06, + "loss": 0.8082, + "step": 9182 + }, + { + "epoch": 0.5054213220320326, + "grad_norm": 0.9106804132461548, + "learning_rate": 8.528323654858028e-06, + "loss": 0.8989, + "step": 9183 + }, + { + "epoch": 0.5054763608343883, + "grad_norm": 0.7288523316383362, + "learning_rate": 8.52801651060667e-06, + "loss": 0.7972, + "step": 9184 + }, + { + "epoch": 0.5055313996367439, + "grad_norm": 0.7149643301963806, + "learning_rate": 8.527709339839689e-06, + "loss": 0.8191, + "step": 9185 + }, + { + "epoch": 0.5055864384390996, + "grad_norm": 0.6661714911460876, + "learning_rate": 8.527402142559388e-06, + "loss": 0.6596, + "step": 9186 + }, + { + "epoch": 0.5056414772414553, + "grad_norm": 0.7071447372436523, + "learning_rate": 8.527094918768076e-06, + "loss": 0.7633, + "step": 9187 + }, + { + "epoch": 0.5056965160438109, + "grad_norm": 0.7314093112945557, + "learning_rate": 8.526787668468064e-06, + "loss": 0.7815, + "step": 9188 + }, + { + "epoch": 0.5057515548461665, + "grad_norm": 0.8200539946556091, + "learning_rate": 8.526480391661657e-06, + "loss": 0.8376, + "step": 9189 + }, + { + "epoch": 0.5058065936485222, + "grad_norm": 0.7422435283660889, + "learning_rate": 8.52617308835117e-06, + "loss": 0.8783, + "step": 9190 + }, + { + "epoch": 0.5058616324508779, + "grad_norm": 0.7845084071159363, + "learning_rate": 8.525865758538909e-06, + "loss": 0.8005, + "step": 9191 + }, + { + "epoch": 0.5059166712532335, + "grad_norm": 0.6854296922683716, + "learning_rate": 8.525558402227185e-06, + "loss": 0.8118, + "step": 9192 + }, + { + "epoch": 0.5059717100555892, + "grad_norm": 0.6805297136306763, + "learning_rate": 8.525251019418309e-06, + "loss": 0.6765, + "step": 9193 + }, + { + "epoch": 0.5060267488579449, + "grad_norm": 0.7194867134094238, + "learning_rate": 8.524943610114587e-06, + "loss": 0.6752, + "step": 9194 + }, + { + "epoch": 0.5060817876603005, + "grad_norm": 0.6935137510299683, + "learning_rate": 8.524636174318335e-06, + "loss": 0.7122, + "step": 9195 + }, + { + "epoch": 0.5061368264626561, + "grad_norm": 0.8652825951576233, + "learning_rate": 8.52432871203186e-06, + "loss": 0.7725, + "step": 9196 + }, + { + "epoch": 0.5061918652650118, + "grad_norm": 0.9104461669921875, + "learning_rate": 8.524021223257472e-06, + "loss": 0.8589, + "step": 9197 + }, + { + "epoch": 0.5062469040673675, + "grad_norm": 0.7680580019950867, + "learning_rate": 8.523713707997486e-06, + "loss": 0.842, + "step": 9198 + }, + { + "epoch": 0.5063019428697232, + "grad_norm": 0.7324872612953186, + "learning_rate": 8.52340616625421e-06, + "loss": 0.802, + "step": 9199 + }, + { + "epoch": 0.5063569816720788, + "grad_norm": 0.8812359571456909, + "learning_rate": 8.523098598029958e-06, + "loss": 0.8286, + "step": 9200 + }, + { + "epoch": 0.5064120204744345, + "grad_norm": 0.6992496848106384, + "learning_rate": 8.522791003327038e-06, + "loss": 0.811, + "step": 9201 + }, + { + "epoch": 0.5064670592767901, + "grad_norm": 0.8191942572593689, + "learning_rate": 8.522483382147766e-06, + "loss": 0.7192, + "step": 9202 + }, + { + "epoch": 0.5065220980791458, + "grad_norm": 0.9354501366615295, + "learning_rate": 8.522175734494452e-06, + "loss": 0.7424, + "step": 9203 + }, + { + "epoch": 0.5065771368815014, + "grad_norm": 0.6481999754905701, + "learning_rate": 8.521868060369405e-06, + "loss": 0.6385, + "step": 9204 + }, + { + "epoch": 0.5066321756838571, + "grad_norm": 0.7158499360084534, + "learning_rate": 8.521560359774943e-06, + "loss": 0.6116, + "step": 9205 + }, + { + "epoch": 0.5066872144862128, + "grad_norm": 0.8738408088684082, + "learning_rate": 8.521252632713376e-06, + "loss": 0.894, + "step": 9206 + }, + { + "epoch": 0.5067422532885685, + "grad_norm": 0.7037062644958496, + "learning_rate": 8.520944879187015e-06, + "loss": 0.6958, + "step": 9207 + }, + { + "epoch": 0.5067972920909241, + "grad_norm": 0.7205594778060913, + "learning_rate": 8.520637099198175e-06, + "loss": 0.7188, + "step": 9208 + }, + { + "epoch": 0.5068523308932797, + "grad_norm": 0.6761966347694397, + "learning_rate": 8.520329292749169e-06, + "loss": 0.7669, + "step": 9209 + }, + { + "epoch": 0.5069073696956354, + "grad_norm": 0.682556688785553, + "learning_rate": 8.520021459842312e-06, + "loss": 0.7745, + "step": 9210 + }, + { + "epoch": 0.5069624084979911, + "grad_norm": 0.6687794923782349, + "learning_rate": 8.519713600479913e-06, + "loss": 0.7814, + "step": 9211 + }, + { + "epoch": 0.5070174473003467, + "grad_norm": 0.6391967535018921, + "learning_rate": 8.51940571466429e-06, + "loss": 0.7331, + "step": 9212 + }, + { + "epoch": 0.5070724861027024, + "grad_norm": 0.8420151472091675, + "learning_rate": 8.519097802397758e-06, + "loss": 0.8257, + "step": 9213 + }, + { + "epoch": 0.5071275249050581, + "grad_norm": 0.692787230014801, + "learning_rate": 8.518789863682625e-06, + "loss": 0.7179, + "step": 9214 + }, + { + "epoch": 0.5071825637074138, + "grad_norm": 0.6874318718910217, + "learning_rate": 8.518481898521213e-06, + "loss": 0.6847, + "step": 9215 + }, + { + "epoch": 0.5072376025097693, + "grad_norm": 0.8107750415802002, + "learning_rate": 8.518173906915832e-06, + "loss": 0.8459, + "step": 9216 + }, + { + "epoch": 0.507292641312125, + "grad_norm": 0.7952812910079956, + "learning_rate": 8.517865888868797e-06, + "loss": 0.8503, + "step": 9217 + }, + { + "epoch": 0.5073476801144807, + "grad_norm": 0.6926921606063843, + "learning_rate": 8.517557844382424e-06, + "loss": 0.6713, + "step": 9218 + }, + { + "epoch": 0.5074027189168364, + "grad_norm": 0.8203585147857666, + "learning_rate": 8.517249773459026e-06, + "loss": 0.8483, + "step": 9219 + }, + { + "epoch": 0.507457757719192, + "grad_norm": 0.6788125038146973, + "learning_rate": 8.516941676100923e-06, + "loss": 0.7521, + "step": 9220 + }, + { + "epoch": 0.5075127965215477, + "grad_norm": 0.6439838409423828, + "learning_rate": 8.516633552310426e-06, + "loss": 0.7359, + "step": 9221 + }, + { + "epoch": 0.5075678353239034, + "grad_norm": 0.6872217655181885, + "learning_rate": 8.516325402089854e-06, + "loss": 0.73, + "step": 9222 + }, + { + "epoch": 0.5076228741262591, + "grad_norm": 0.6695985794067383, + "learning_rate": 8.51601722544152e-06, + "loss": 0.7519, + "step": 9223 + }, + { + "epoch": 0.5076779129286146, + "grad_norm": 0.7779402136802673, + "learning_rate": 8.515709022367741e-06, + "loss": 0.7325, + "step": 9224 + }, + { + "epoch": 0.5077329517309703, + "grad_norm": 0.9289746284484863, + "learning_rate": 8.515400792870836e-06, + "loss": 0.7839, + "step": 9225 + }, + { + "epoch": 0.507787990533326, + "grad_norm": 0.6949248313903809, + "learning_rate": 8.51509253695312e-06, + "loss": 0.7363, + "step": 9226 + }, + { + "epoch": 0.5078430293356817, + "grad_norm": 0.6463130116462708, + "learning_rate": 8.514784254616908e-06, + "loss": 0.7607, + "step": 9227 + }, + { + "epoch": 0.5078980681380373, + "grad_norm": 0.7332046031951904, + "learning_rate": 8.514475945864519e-06, + "loss": 0.6833, + "step": 9228 + }, + { + "epoch": 0.507953106940393, + "grad_norm": 0.8674100637435913, + "learning_rate": 8.51416761069827e-06, + "loss": 0.669, + "step": 9229 + }, + { + "epoch": 0.5080081457427487, + "grad_norm": 0.8073185682296753, + "learning_rate": 8.513859249120477e-06, + "loss": 0.7215, + "step": 9230 + }, + { + "epoch": 0.5080631845451044, + "grad_norm": 0.674117386341095, + "learning_rate": 8.51355086113346e-06, + "loss": 0.7813, + "step": 9231 + }, + { + "epoch": 0.5081182233474599, + "grad_norm": 0.8564596176147461, + "learning_rate": 8.513242446739534e-06, + "loss": 0.7393, + "step": 9232 + }, + { + "epoch": 0.5081732621498156, + "grad_norm": 0.684637188911438, + "learning_rate": 8.512934005941015e-06, + "loss": 0.781, + "step": 9233 + }, + { + "epoch": 0.5082283009521713, + "grad_norm": 0.816123902797699, + "learning_rate": 8.51262553874023e-06, + "loss": 0.8597, + "step": 9234 + }, + { + "epoch": 0.5082833397545269, + "grad_norm": 0.6582320332527161, + "learning_rate": 8.512317045139488e-06, + "loss": 0.6654, + "step": 9235 + }, + { + "epoch": 0.5083383785568826, + "grad_norm": 1.0153518915176392, + "learning_rate": 8.512008525141113e-06, + "loss": 0.7946, + "step": 9236 + }, + { + "epoch": 0.5083934173592383, + "grad_norm": 0.7455416917800903, + "learning_rate": 8.511699978747422e-06, + "loss": 0.8365, + "step": 9237 + }, + { + "epoch": 0.508448456161594, + "grad_norm": 0.6498221755027771, + "learning_rate": 8.511391405960733e-06, + "loss": 0.7252, + "step": 9238 + }, + { + "epoch": 0.5085034949639495, + "grad_norm": 0.6856792569160461, + "learning_rate": 8.511082806783368e-06, + "loss": 0.7282, + "step": 9239 + }, + { + "epoch": 0.5085585337663052, + "grad_norm": 0.6930065751075745, + "learning_rate": 8.510774181217643e-06, + "loss": 0.7404, + "step": 9240 + }, + { + "epoch": 0.5086135725686609, + "grad_norm": 0.6953150033950806, + "learning_rate": 8.51046552926588e-06, + "loss": 0.7684, + "step": 9241 + }, + { + "epoch": 0.5086686113710166, + "grad_norm": 0.7307711839675903, + "learning_rate": 8.510156850930395e-06, + "loss": 0.7557, + "step": 9242 + }, + { + "epoch": 0.5087236501733722, + "grad_norm": 0.7296478152275085, + "learning_rate": 8.509848146213513e-06, + "loss": 0.7469, + "step": 9243 + }, + { + "epoch": 0.5087786889757279, + "grad_norm": 0.7035672664642334, + "learning_rate": 8.509539415117553e-06, + "loss": 0.7151, + "step": 9244 + }, + { + "epoch": 0.5088337277780836, + "grad_norm": 0.7818698883056641, + "learning_rate": 8.509230657644832e-06, + "loss": 0.7134, + "step": 9245 + }, + { + "epoch": 0.5088887665804392, + "grad_norm": 0.7503119111061096, + "learning_rate": 8.508921873797674e-06, + "loss": 0.7028, + "step": 9246 + }, + { + "epoch": 0.5089438053827948, + "grad_norm": 0.7733498215675354, + "learning_rate": 8.508613063578397e-06, + "loss": 0.8159, + "step": 9247 + }, + { + "epoch": 0.5089988441851505, + "grad_norm": 0.9236353635787964, + "learning_rate": 8.508304226989326e-06, + "loss": 0.8013, + "step": 9248 + }, + { + "epoch": 0.5090538829875062, + "grad_norm": 0.6567198634147644, + "learning_rate": 8.507995364032777e-06, + "loss": 0.8285, + "step": 9249 + }, + { + "epoch": 0.5091089217898619, + "grad_norm": 0.6555445790290833, + "learning_rate": 8.507686474711074e-06, + "loss": 0.6917, + "step": 9250 + }, + { + "epoch": 0.5091639605922175, + "grad_norm": 0.8505375385284424, + "learning_rate": 8.507377559026539e-06, + "loss": 0.824, + "step": 9251 + }, + { + "epoch": 0.5092189993945732, + "grad_norm": 0.703413188457489, + "learning_rate": 8.507068616981493e-06, + "loss": 0.7162, + "step": 9252 + }, + { + "epoch": 0.5092740381969288, + "grad_norm": 0.7257823944091797, + "learning_rate": 8.50675964857826e-06, + "loss": 0.8031, + "step": 9253 + }, + { + "epoch": 0.5093290769992845, + "grad_norm": 0.6861198544502258, + "learning_rate": 8.506450653819159e-06, + "loss": 0.7724, + "step": 9254 + }, + { + "epoch": 0.5093841158016401, + "grad_norm": 0.7733107209205627, + "learning_rate": 8.506141632706512e-06, + "loss": 0.7834, + "step": 9255 + }, + { + "epoch": 0.5094391546039958, + "grad_norm": 0.7472217082977295, + "learning_rate": 8.505832585242644e-06, + "loss": 0.7594, + "step": 9256 + }, + { + "epoch": 0.5094941934063515, + "grad_norm": 0.6273325085639954, + "learning_rate": 8.505523511429876e-06, + "loss": 0.6798, + "step": 9257 + }, + { + "epoch": 0.5095492322087072, + "grad_norm": 0.7366517186164856, + "learning_rate": 8.505214411270533e-06, + "loss": 0.7916, + "step": 9258 + }, + { + "epoch": 0.5096042710110628, + "grad_norm": 0.6654453873634338, + "learning_rate": 8.504905284766936e-06, + "loss": 0.7228, + "step": 9259 + }, + { + "epoch": 0.5096593098134184, + "grad_norm": 0.7926275134086609, + "learning_rate": 8.50459613192141e-06, + "loss": 0.8303, + "step": 9260 + }, + { + "epoch": 0.5097143486157741, + "grad_norm": 0.7256377935409546, + "learning_rate": 8.504286952736277e-06, + "loss": 0.7977, + "step": 9261 + }, + { + "epoch": 0.5097693874181298, + "grad_norm": 0.7333946824073792, + "learning_rate": 8.50397774721386e-06, + "loss": 0.7978, + "step": 9262 + }, + { + "epoch": 0.5098244262204854, + "grad_norm": 0.6102882623672485, + "learning_rate": 8.503668515356485e-06, + "loss": 0.6386, + "step": 9263 + }, + { + "epoch": 0.5098794650228411, + "grad_norm": 0.7939823865890503, + "learning_rate": 8.503359257166477e-06, + "loss": 0.7328, + "step": 9264 + }, + { + "epoch": 0.5099345038251968, + "grad_norm": 0.7245013117790222, + "learning_rate": 8.503049972646157e-06, + "loss": 0.795, + "step": 9265 + }, + { + "epoch": 0.5099895426275525, + "grad_norm": 0.6722108125686646, + "learning_rate": 8.502740661797852e-06, + "loss": 0.7062, + "step": 9266 + }, + { + "epoch": 0.510044581429908, + "grad_norm": 0.6759012341499329, + "learning_rate": 8.502431324623884e-06, + "loss": 0.7427, + "step": 9267 + }, + { + "epoch": 0.5100996202322637, + "grad_norm": 0.6448835730552673, + "learning_rate": 8.502121961126581e-06, + "loss": 0.7381, + "step": 9268 + }, + { + "epoch": 0.5101546590346194, + "grad_norm": 0.6437426209449768, + "learning_rate": 8.501812571308266e-06, + "loss": 0.6733, + "step": 9269 + }, + { + "epoch": 0.5102096978369751, + "grad_norm": 0.6879013776779175, + "learning_rate": 8.501503155171267e-06, + "loss": 0.7227, + "step": 9270 + }, + { + "epoch": 0.5102647366393307, + "grad_norm": 0.6628512740135193, + "learning_rate": 8.501193712717906e-06, + "loss": 0.7151, + "step": 9271 + }, + { + "epoch": 0.5103197754416864, + "grad_norm": 0.7653747797012329, + "learning_rate": 8.500884243950511e-06, + "loss": 0.8189, + "step": 9272 + }, + { + "epoch": 0.5103748142440421, + "grad_norm": 0.7180060148239136, + "learning_rate": 8.500574748871407e-06, + "loss": 0.7633, + "step": 9273 + }, + { + "epoch": 0.5104298530463978, + "grad_norm": 0.7045086622238159, + "learning_rate": 8.50026522748292e-06, + "loss": 0.746, + "step": 9274 + }, + { + "epoch": 0.5104848918487533, + "grad_norm": 0.6224614381790161, + "learning_rate": 8.499955679787376e-06, + "loss": 0.7436, + "step": 9275 + }, + { + "epoch": 0.510539930651109, + "grad_norm": 0.6716495156288147, + "learning_rate": 8.499646105787103e-06, + "loss": 0.8006, + "step": 9276 + }, + { + "epoch": 0.5105949694534647, + "grad_norm": 0.83705735206604, + "learning_rate": 8.499336505484426e-06, + "loss": 0.886, + "step": 9277 + }, + { + "epoch": 0.5106500082558203, + "grad_norm": 0.7942199110984802, + "learning_rate": 8.499026878881673e-06, + "loss": 0.7709, + "step": 9278 + }, + { + "epoch": 0.510705047058176, + "grad_norm": 0.7500330209732056, + "learning_rate": 8.49871722598117e-06, + "loss": 0.7737, + "step": 9279 + }, + { + "epoch": 0.5107600858605317, + "grad_norm": 0.7283433675765991, + "learning_rate": 8.498407546785245e-06, + "loss": 0.8345, + "step": 9280 + }, + { + "epoch": 0.5108151246628874, + "grad_norm": 0.6970989108085632, + "learning_rate": 8.498097841296224e-06, + "loss": 0.7451, + "step": 9281 + }, + { + "epoch": 0.5108701634652429, + "grad_norm": 0.8338573575019836, + "learning_rate": 8.497788109516438e-06, + "loss": 0.8198, + "step": 9282 + }, + { + "epoch": 0.5109252022675986, + "grad_norm": 0.6544861197471619, + "learning_rate": 8.497478351448213e-06, + "loss": 0.7549, + "step": 9283 + }, + { + "epoch": 0.5109802410699543, + "grad_norm": 0.6627360582351685, + "learning_rate": 8.497168567093876e-06, + "loss": 0.7136, + "step": 9284 + }, + { + "epoch": 0.51103527987231, + "grad_norm": 0.7176669239997864, + "learning_rate": 8.496858756455755e-06, + "loss": 0.766, + "step": 9285 + }, + { + "epoch": 0.5110903186746656, + "grad_norm": 0.8260897397994995, + "learning_rate": 8.496548919536183e-06, + "loss": 0.8167, + "step": 9286 + }, + { + "epoch": 0.5111453574770213, + "grad_norm": 0.7077773809432983, + "learning_rate": 8.496239056337483e-06, + "loss": 0.776, + "step": 9287 + }, + { + "epoch": 0.511200396279377, + "grad_norm": 0.7609447836875916, + "learning_rate": 8.495929166861988e-06, + "loss": 0.7339, + "step": 9288 + }, + { + "epoch": 0.5112554350817327, + "grad_norm": 0.6896487474441528, + "learning_rate": 8.495619251112022e-06, + "loss": 0.7639, + "step": 9289 + }, + { + "epoch": 0.5113104738840882, + "grad_norm": 0.6946871280670166, + "learning_rate": 8.495309309089918e-06, + "loss": 0.8242, + "step": 9290 + }, + { + "epoch": 0.5113655126864439, + "grad_norm": 0.79847252368927, + "learning_rate": 8.494999340798007e-06, + "loss": 0.8226, + "step": 9291 + }, + { + "epoch": 0.5114205514887996, + "grad_norm": 0.7845447063446045, + "learning_rate": 8.494689346238615e-06, + "loss": 0.8593, + "step": 9292 + }, + { + "epoch": 0.5114755902911553, + "grad_norm": 1.1577119827270508, + "learning_rate": 8.494379325414074e-06, + "loss": 0.746, + "step": 9293 + }, + { + "epoch": 0.5115306290935109, + "grad_norm": 0.6720938682556152, + "learning_rate": 8.494069278326713e-06, + "loss": 0.6768, + "step": 9294 + }, + { + "epoch": 0.5115856678958666, + "grad_norm": 0.7389395833015442, + "learning_rate": 8.493759204978862e-06, + "loss": 0.8126, + "step": 9295 + }, + { + "epoch": 0.5116407066982223, + "grad_norm": 0.7629536986351013, + "learning_rate": 8.493449105372853e-06, + "loss": 0.7107, + "step": 9296 + }, + { + "epoch": 0.511695745500578, + "grad_norm": 0.7339474558830261, + "learning_rate": 8.493138979511015e-06, + "loss": 0.8144, + "step": 9297 + }, + { + "epoch": 0.5117507843029335, + "grad_norm": 0.7222825288772583, + "learning_rate": 8.49282882739568e-06, + "loss": 0.7512, + "step": 9298 + }, + { + "epoch": 0.5118058231052892, + "grad_norm": 0.676659107208252, + "learning_rate": 8.49251864902918e-06, + "loss": 0.6515, + "step": 9299 + }, + { + "epoch": 0.5118608619076449, + "grad_norm": 0.6336323618888855, + "learning_rate": 8.492208444413844e-06, + "loss": 0.719, + "step": 9300 + }, + { + "epoch": 0.5119159007100006, + "grad_norm": 0.701543927192688, + "learning_rate": 8.491898213552e-06, + "loss": 0.728, + "step": 9301 + }, + { + "epoch": 0.5119709395123562, + "grad_norm": 0.6809069514274597, + "learning_rate": 8.491587956445988e-06, + "loss": 0.8844, + "step": 9302 + }, + { + "epoch": 0.5120259783147119, + "grad_norm": 0.8046489357948303, + "learning_rate": 8.491277673098135e-06, + "loss": 0.817, + "step": 9303 + }, + { + "epoch": 0.5120810171170675, + "grad_norm": 0.8630616068840027, + "learning_rate": 8.490967363510774e-06, + "loss": 0.7745, + "step": 9304 + }, + { + "epoch": 0.5121360559194232, + "grad_norm": 0.7457678914070129, + "learning_rate": 8.490657027686235e-06, + "loss": 0.7956, + "step": 9305 + }, + { + "epoch": 0.5121910947217788, + "grad_norm": 0.6383466124534607, + "learning_rate": 8.490346665626854e-06, + "loss": 0.8046, + "step": 9306 + }, + { + "epoch": 0.5122461335241345, + "grad_norm": 0.7658202052116394, + "learning_rate": 8.49003627733496e-06, + "loss": 0.7905, + "step": 9307 + }, + { + "epoch": 0.5123011723264902, + "grad_norm": 0.6793283224105835, + "learning_rate": 8.48972586281289e-06, + "loss": 0.6646, + "step": 9308 + }, + { + "epoch": 0.5123562111288459, + "grad_norm": 0.7345246076583862, + "learning_rate": 8.489415422062972e-06, + "loss": 0.788, + "step": 9309 + }, + { + "epoch": 0.5124112499312015, + "grad_norm": 0.6665463447570801, + "learning_rate": 8.489104955087542e-06, + "loss": 0.706, + "step": 9310 + }, + { + "epoch": 0.5124662887335572, + "grad_norm": 0.7895458936691284, + "learning_rate": 8.488794461888934e-06, + "loss": 0.7464, + "step": 9311 + }, + { + "epoch": 0.5125213275359128, + "grad_norm": 0.7375221252441406, + "learning_rate": 8.488483942469481e-06, + "loss": 0.8029, + "step": 9312 + }, + { + "epoch": 0.5125763663382685, + "grad_norm": 0.792348325252533, + "learning_rate": 8.488173396831514e-06, + "loss": 0.7324, + "step": 9313 + }, + { + "epoch": 0.5126314051406241, + "grad_norm": 0.6500192880630493, + "learning_rate": 8.487862824977373e-06, + "loss": 0.7331, + "step": 9314 + }, + { + "epoch": 0.5126864439429798, + "grad_norm": 0.6607314348220825, + "learning_rate": 8.487552226909386e-06, + "loss": 0.7782, + "step": 9315 + }, + { + "epoch": 0.5127414827453355, + "grad_norm": 0.8261791467666626, + "learning_rate": 8.487241602629892e-06, + "loss": 0.8036, + "step": 9316 + }, + { + "epoch": 0.5127965215476912, + "grad_norm": 0.8301663994789124, + "learning_rate": 8.486930952141222e-06, + "loss": 0.7928, + "step": 9317 + }, + { + "epoch": 0.5128515603500468, + "grad_norm": 0.6957940459251404, + "learning_rate": 8.486620275445713e-06, + "loss": 0.7359, + "step": 9318 + }, + { + "epoch": 0.5129065991524024, + "grad_norm": 0.7562606334686279, + "learning_rate": 8.4863095725457e-06, + "loss": 0.7546, + "step": 9319 + }, + { + "epoch": 0.5129616379547581, + "grad_norm": 0.795886218547821, + "learning_rate": 8.485998843443517e-06, + "loss": 0.7558, + "step": 9320 + }, + { + "epoch": 0.5130166767571137, + "grad_norm": 0.6558147072792053, + "learning_rate": 8.4856880881415e-06, + "loss": 0.6832, + "step": 9321 + }, + { + "epoch": 0.5130717155594694, + "grad_norm": 0.7300151586532593, + "learning_rate": 8.485377306641984e-06, + "loss": 0.8018, + "step": 9322 + }, + { + "epoch": 0.5131267543618251, + "grad_norm": 0.7114105224609375, + "learning_rate": 8.485066498947305e-06, + "loss": 0.7374, + "step": 9323 + }, + { + "epoch": 0.5131817931641808, + "grad_norm": 0.7061085104942322, + "learning_rate": 8.484755665059798e-06, + "loss": 0.7905, + "step": 9324 + }, + { + "epoch": 0.5132368319665364, + "grad_norm": 0.8481647968292236, + "learning_rate": 8.484444804981802e-06, + "loss": 0.8518, + "step": 9325 + }, + { + "epoch": 0.513291870768892, + "grad_norm": 0.7583557367324829, + "learning_rate": 8.48413391871565e-06, + "loss": 0.8328, + "step": 9326 + }, + { + "epoch": 0.5133469095712477, + "grad_norm": 0.7381925582885742, + "learning_rate": 8.483823006263683e-06, + "loss": 0.76, + "step": 9327 + }, + { + "epoch": 0.5134019483736034, + "grad_norm": 0.8037852644920349, + "learning_rate": 8.483512067628232e-06, + "loss": 0.711, + "step": 9328 + }, + { + "epoch": 0.513456987175959, + "grad_norm": 0.6682618260383606, + "learning_rate": 8.483201102811637e-06, + "loss": 0.7479, + "step": 9329 + }, + { + "epoch": 0.5135120259783147, + "grad_norm": 0.662234365940094, + "learning_rate": 8.482890111816237e-06, + "loss": 0.7701, + "step": 9330 + }, + { + "epoch": 0.5135670647806704, + "grad_norm": 0.7081482410430908, + "learning_rate": 8.482579094644365e-06, + "loss": 0.8255, + "step": 9331 + }, + { + "epoch": 0.5136221035830261, + "grad_norm": 0.9659954905509949, + "learning_rate": 8.482268051298364e-06, + "loss": 0.8742, + "step": 9332 + }, + { + "epoch": 0.5136771423853816, + "grad_norm": 0.7837772369384766, + "learning_rate": 8.481956981780564e-06, + "loss": 0.7692, + "step": 9333 + }, + { + "epoch": 0.5137321811877373, + "grad_norm": 0.681918203830719, + "learning_rate": 8.481645886093311e-06, + "loss": 0.6952, + "step": 9334 + }, + { + "epoch": 0.513787219990093, + "grad_norm": 0.7253187894821167, + "learning_rate": 8.481334764238937e-06, + "loss": 0.7074, + "step": 9335 + }, + { + "epoch": 0.5138422587924487, + "grad_norm": 0.8845877051353455, + "learning_rate": 8.481023616219783e-06, + "loss": 0.675, + "step": 9336 + }, + { + "epoch": 0.5138972975948043, + "grad_norm": 0.6569344401359558, + "learning_rate": 8.480712442038188e-06, + "loss": 0.7181, + "step": 9337 + }, + { + "epoch": 0.51395233639716, + "grad_norm": 0.7372813820838928, + "learning_rate": 8.480401241696491e-06, + "loss": 0.8137, + "step": 9338 + }, + { + "epoch": 0.5140073751995157, + "grad_norm": 0.843099057674408, + "learning_rate": 8.48009001519703e-06, + "loss": 0.7648, + "step": 9339 + }, + { + "epoch": 0.5140624140018714, + "grad_norm": 0.7762032747268677, + "learning_rate": 8.479778762542142e-06, + "loss": 0.7805, + "step": 9340 + }, + { + "epoch": 0.5141174528042269, + "grad_norm": 0.739086925983429, + "learning_rate": 8.479467483734169e-06, + "loss": 0.7125, + "step": 9341 + }, + { + "epoch": 0.5141724916065826, + "grad_norm": 0.7351683974266052, + "learning_rate": 8.479156178775451e-06, + "loss": 0.7855, + "step": 9342 + }, + { + "epoch": 0.5142275304089383, + "grad_norm": 0.7601314187049866, + "learning_rate": 8.478844847668325e-06, + "loss": 0.8349, + "step": 9343 + }, + { + "epoch": 0.514282569211294, + "grad_norm": 0.6841638684272766, + "learning_rate": 8.478533490415133e-06, + "loss": 0.7986, + "step": 9344 + }, + { + "epoch": 0.5143376080136496, + "grad_norm": 0.6734872460365295, + "learning_rate": 8.478222107018213e-06, + "loss": 0.6941, + "step": 9345 + }, + { + "epoch": 0.5143926468160053, + "grad_norm": 0.801930844783783, + "learning_rate": 8.47791069747991e-06, + "loss": 0.8537, + "step": 9346 + }, + { + "epoch": 0.514447685618361, + "grad_norm": 0.6960629224777222, + "learning_rate": 8.477599261802558e-06, + "loss": 0.6629, + "step": 9347 + }, + { + "epoch": 0.5145027244207167, + "grad_norm": 0.7791358232498169, + "learning_rate": 8.477287799988502e-06, + "loss": 0.8777, + "step": 9348 + }, + { + "epoch": 0.5145577632230722, + "grad_norm": 0.7022722959518433, + "learning_rate": 8.476976312040082e-06, + "loss": 0.7116, + "step": 9349 + }, + { + "epoch": 0.5146128020254279, + "grad_norm": 0.7791306376457214, + "learning_rate": 8.476664797959639e-06, + "loss": 0.7262, + "step": 9350 + }, + { + "epoch": 0.5146678408277836, + "grad_norm": 0.7391177415847778, + "learning_rate": 8.476353257749514e-06, + "loss": 0.7308, + "step": 9351 + }, + { + "epoch": 0.5147228796301393, + "grad_norm": 0.6989552974700928, + "learning_rate": 8.476041691412046e-06, + "loss": 0.7754, + "step": 9352 + }, + { + "epoch": 0.5147779184324949, + "grad_norm": 0.7639930844306946, + "learning_rate": 8.475730098949582e-06, + "loss": 0.8385, + "step": 9353 + }, + { + "epoch": 0.5148329572348506, + "grad_norm": 0.7687931060791016, + "learning_rate": 8.47541848036446e-06, + "loss": 0.8118, + "step": 9354 + }, + { + "epoch": 0.5148879960372063, + "grad_norm": 0.8831589221954346, + "learning_rate": 8.475106835659024e-06, + "loss": 0.7705, + "step": 9355 + }, + { + "epoch": 0.5149430348395619, + "grad_norm": 0.7585502862930298, + "learning_rate": 8.474795164835614e-06, + "loss": 0.8167, + "step": 9356 + }, + { + "epoch": 0.5149980736419175, + "grad_norm": 0.7078690528869629, + "learning_rate": 8.474483467896572e-06, + "loss": 0.7412, + "step": 9357 + }, + { + "epoch": 0.5150531124442732, + "grad_norm": 0.8950889706611633, + "learning_rate": 8.474171744844246e-06, + "loss": 0.8132, + "step": 9358 + }, + { + "epoch": 0.5151081512466289, + "grad_norm": 0.7196077704429626, + "learning_rate": 8.473859995680973e-06, + "loss": 0.8041, + "step": 9359 + }, + { + "epoch": 0.5151631900489846, + "grad_norm": 0.7705141305923462, + "learning_rate": 8.473548220409099e-06, + "loss": 0.8437, + "step": 9360 + }, + { + "epoch": 0.5152182288513402, + "grad_norm": 0.6507467031478882, + "learning_rate": 8.473236419030966e-06, + "loss": 0.7713, + "step": 9361 + }, + { + "epoch": 0.5152732676536959, + "grad_norm": 0.7120817303657532, + "learning_rate": 8.472924591548917e-06, + "loss": 0.7688, + "step": 9362 + }, + { + "epoch": 0.5153283064560515, + "grad_norm": 0.7830487489700317, + "learning_rate": 8.472612737965297e-06, + "loss": 0.8875, + "step": 9363 + }, + { + "epoch": 0.5153833452584071, + "grad_norm": 0.8790529370307922, + "learning_rate": 8.47230085828245e-06, + "loss": 0.7648, + "step": 9364 + }, + { + "epoch": 0.5154383840607628, + "grad_norm": 0.8956806659698486, + "learning_rate": 8.471988952502718e-06, + "loss": 0.7891, + "step": 9365 + }, + { + "epoch": 0.5154934228631185, + "grad_norm": 0.7370011210441589, + "learning_rate": 8.471677020628448e-06, + "loss": 0.7609, + "step": 9366 + }, + { + "epoch": 0.5155484616654742, + "grad_norm": 0.6794238090515137, + "learning_rate": 8.471365062661982e-06, + "loss": 0.6679, + "step": 9367 + }, + { + "epoch": 0.5156035004678298, + "grad_norm": 0.7330273985862732, + "learning_rate": 8.471053078605664e-06, + "loss": 0.7276, + "step": 9368 + }, + { + "epoch": 0.5156585392701855, + "grad_norm": 0.7796601057052612, + "learning_rate": 8.470741068461843e-06, + "loss": 0.7897, + "step": 9369 + }, + { + "epoch": 0.5157135780725411, + "grad_norm": 0.6834099888801575, + "learning_rate": 8.470429032232858e-06, + "loss": 0.7924, + "step": 9370 + }, + { + "epoch": 0.5157686168748968, + "grad_norm": 0.6991616487503052, + "learning_rate": 8.47011696992106e-06, + "loss": 0.7901, + "step": 9371 + }, + { + "epoch": 0.5158236556772524, + "grad_norm": 0.7321401834487915, + "learning_rate": 8.469804881528792e-06, + "loss": 0.6718, + "step": 9372 + }, + { + "epoch": 0.5158786944796081, + "grad_norm": 0.7091043591499329, + "learning_rate": 8.469492767058398e-06, + "loss": 0.8204, + "step": 9373 + }, + { + "epoch": 0.5159337332819638, + "grad_norm": 0.8777012825012207, + "learning_rate": 8.469180626512223e-06, + "loss": 0.8045, + "step": 9374 + }, + { + "epoch": 0.5159887720843195, + "grad_norm": 0.6652738451957703, + "learning_rate": 8.468868459892619e-06, + "loss": 0.7248, + "step": 9375 + }, + { + "epoch": 0.5160438108866751, + "grad_norm": 0.7209659218788147, + "learning_rate": 8.468556267201925e-06, + "loss": 0.7508, + "step": 9376 + }, + { + "epoch": 0.5160988496890307, + "grad_norm": 0.7685441970825195, + "learning_rate": 8.468244048442494e-06, + "loss": 0.7501, + "step": 9377 + }, + { + "epoch": 0.5161538884913864, + "grad_norm": 0.6773725152015686, + "learning_rate": 8.467931803616665e-06, + "loss": 0.8036, + "step": 9378 + }, + { + "epoch": 0.5162089272937421, + "grad_norm": 0.7167890071868896, + "learning_rate": 8.467619532726792e-06, + "loss": 0.7229, + "step": 9379 + }, + { + "epoch": 0.5162639660960977, + "grad_norm": 0.7066929340362549, + "learning_rate": 8.467307235775218e-06, + "loss": 0.7433, + "step": 9380 + }, + { + "epoch": 0.5163190048984534, + "grad_norm": 0.7261828780174255, + "learning_rate": 8.46699491276429e-06, + "loss": 0.7873, + "step": 9381 + }, + { + "epoch": 0.5163740437008091, + "grad_norm": 0.7442463636398315, + "learning_rate": 8.466682563696356e-06, + "loss": 0.7953, + "step": 9382 + }, + { + "epoch": 0.5164290825031648, + "grad_norm": 0.5668768286705017, + "learning_rate": 8.466370188573765e-06, + "loss": 0.5602, + "step": 9383 + }, + { + "epoch": 0.5164841213055203, + "grad_norm": 0.7364997267723083, + "learning_rate": 8.466057787398864e-06, + "loss": 0.8274, + "step": 9384 + }, + { + "epoch": 0.516539160107876, + "grad_norm": 0.7793132066726685, + "learning_rate": 8.465745360174e-06, + "loss": 0.7832, + "step": 9385 + }, + { + "epoch": 0.5165941989102317, + "grad_norm": 0.6818128824234009, + "learning_rate": 8.46543290690152e-06, + "loss": 0.8314, + "step": 9386 + }, + { + "epoch": 0.5166492377125874, + "grad_norm": 0.7392195463180542, + "learning_rate": 8.465120427583778e-06, + "loss": 0.8124, + "step": 9387 + }, + { + "epoch": 0.516704276514943, + "grad_norm": 0.8582521677017212, + "learning_rate": 8.464807922223115e-06, + "loss": 0.7417, + "step": 9388 + }, + { + "epoch": 0.5167593153172987, + "grad_norm": 0.7322097420692444, + "learning_rate": 8.464495390821882e-06, + "loss": 0.7408, + "step": 9389 + }, + { + "epoch": 0.5168143541196544, + "grad_norm": 0.8177433013916016, + "learning_rate": 8.464182833382432e-06, + "loss": 0.87, + "step": 9390 + }, + { + "epoch": 0.5168693929220101, + "grad_norm": 0.7088115215301514, + "learning_rate": 8.46387024990711e-06, + "loss": 0.7748, + "step": 9391 + }, + { + "epoch": 0.5169244317243656, + "grad_norm": 0.6648650169372559, + "learning_rate": 8.463557640398268e-06, + "loss": 0.6302, + "step": 9392 + }, + { + "epoch": 0.5169794705267213, + "grad_norm": 0.6688859462738037, + "learning_rate": 8.463245004858251e-06, + "loss": 0.7252, + "step": 9393 + }, + { + "epoch": 0.517034509329077, + "grad_norm": 0.7231030464172363, + "learning_rate": 8.462932343289412e-06, + "loss": 0.8497, + "step": 9394 + }, + { + "epoch": 0.5170895481314327, + "grad_norm": 0.7142065763473511, + "learning_rate": 8.462619655694103e-06, + "loss": 0.7041, + "step": 9395 + }, + { + "epoch": 0.5171445869337883, + "grad_norm": 0.7197136878967285, + "learning_rate": 8.462306942074669e-06, + "loss": 0.7022, + "step": 9396 + }, + { + "epoch": 0.517199625736144, + "grad_norm": 0.7620192766189575, + "learning_rate": 8.461994202433463e-06, + "loss": 0.8243, + "step": 9397 + }, + { + "epoch": 0.5172546645384997, + "grad_norm": 0.7697533965110779, + "learning_rate": 8.461681436772836e-06, + "loss": 0.7861, + "step": 9398 + }, + { + "epoch": 0.5173097033408554, + "grad_norm": 0.7224711179733276, + "learning_rate": 8.461368645095138e-06, + "loss": 0.7588, + "step": 9399 + }, + { + "epoch": 0.5173647421432109, + "grad_norm": 0.9285979270935059, + "learning_rate": 8.46105582740272e-06, + "loss": 0.8113, + "step": 9400 + }, + { + "epoch": 0.5174197809455666, + "grad_norm": 0.7297842502593994, + "learning_rate": 8.460742983697934e-06, + "loss": 0.7115, + "step": 9401 + }, + { + "epoch": 0.5174748197479223, + "grad_norm": 0.6712872982025146, + "learning_rate": 8.460430113983126e-06, + "loss": 0.751, + "step": 9402 + }, + { + "epoch": 0.517529858550278, + "grad_norm": 0.7807186245918274, + "learning_rate": 8.460117218260657e-06, + "loss": 0.8375, + "step": 9403 + }, + { + "epoch": 0.5175848973526336, + "grad_norm": 0.621530294418335, + "learning_rate": 8.45980429653287e-06, + "loss": 0.638, + "step": 9404 + }, + { + "epoch": 0.5176399361549893, + "grad_norm": 0.7086256146430969, + "learning_rate": 8.45949134880212e-06, + "loss": 0.8304, + "step": 9405 + }, + { + "epoch": 0.517694974957345, + "grad_norm": 0.62705397605896, + "learning_rate": 8.45917837507076e-06, + "loss": 0.7008, + "step": 9406 + }, + { + "epoch": 0.5177500137597005, + "grad_norm": 0.9109121561050415, + "learning_rate": 8.458865375341142e-06, + "loss": 0.7529, + "step": 9407 + }, + { + "epoch": 0.5178050525620562, + "grad_norm": 0.6909900903701782, + "learning_rate": 8.458552349615615e-06, + "loss": 0.8453, + "step": 9408 + }, + { + "epoch": 0.5178600913644119, + "grad_norm": 0.7548434138298035, + "learning_rate": 8.458239297896536e-06, + "loss": 0.7516, + "step": 9409 + }, + { + "epoch": 0.5179151301667676, + "grad_norm": 0.7595730423927307, + "learning_rate": 8.457926220186257e-06, + "loss": 0.7599, + "step": 9410 + }, + { + "epoch": 0.5179701689691232, + "grad_norm": 0.7449337840080261, + "learning_rate": 8.45761311648713e-06, + "loss": 0.8236, + "step": 9411 + }, + { + "epoch": 0.5180252077714789, + "grad_norm": 0.7529160976409912, + "learning_rate": 8.457299986801507e-06, + "loss": 0.8655, + "step": 9412 + }, + { + "epoch": 0.5180802465738346, + "grad_norm": 0.6777701377868652, + "learning_rate": 8.456986831131742e-06, + "loss": 0.7737, + "step": 9413 + }, + { + "epoch": 0.5181352853761902, + "grad_norm": 0.9363510012626648, + "learning_rate": 8.456673649480191e-06, + "loss": 0.8227, + "step": 9414 + }, + { + "epoch": 0.5181903241785458, + "grad_norm": 0.798001229763031, + "learning_rate": 8.456360441849206e-06, + "loss": 0.8881, + "step": 9415 + }, + { + "epoch": 0.5182453629809015, + "grad_norm": 0.7212072610855103, + "learning_rate": 8.456047208241141e-06, + "loss": 0.8165, + "step": 9416 + }, + { + "epoch": 0.5183004017832572, + "grad_norm": 0.6918027997016907, + "learning_rate": 8.45573394865835e-06, + "loss": 0.8048, + "step": 9417 + }, + { + "epoch": 0.5183554405856129, + "grad_norm": 0.6474916338920593, + "learning_rate": 8.455420663103187e-06, + "loss": 0.6502, + "step": 9418 + }, + { + "epoch": 0.5184104793879685, + "grad_norm": 0.6592364311218262, + "learning_rate": 8.455107351578008e-06, + "loss": 0.7509, + "step": 9419 + }, + { + "epoch": 0.5184655181903242, + "grad_norm": 0.7658745646476746, + "learning_rate": 8.454794014085168e-06, + "loss": 0.8444, + "step": 9420 + }, + { + "epoch": 0.5185205569926798, + "grad_norm": 0.6814215183258057, + "learning_rate": 8.45448065062702e-06, + "loss": 0.7367, + "step": 9421 + }, + { + "epoch": 0.5185755957950355, + "grad_norm": 0.644740104675293, + "learning_rate": 8.45416726120592e-06, + "loss": 0.7456, + "step": 9422 + }, + { + "epoch": 0.5186306345973911, + "grad_norm": 0.8578751087188721, + "learning_rate": 8.453853845824225e-06, + "loss": 0.8481, + "step": 9423 + }, + { + "epoch": 0.5186856733997468, + "grad_norm": 0.6630389094352722, + "learning_rate": 8.453540404484288e-06, + "loss": 0.7487, + "step": 9424 + }, + { + "epoch": 0.5187407122021025, + "grad_norm": 0.7756431698799133, + "learning_rate": 8.453226937188466e-06, + "loss": 0.798, + "step": 9425 + }, + { + "epoch": 0.5187957510044582, + "grad_norm": 0.7856318354606628, + "learning_rate": 8.452913443939113e-06, + "loss": 0.785, + "step": 9426 + }, + { + "epoch": 0.5188507898068138, + "grad_norm": 0.7563977837562561, + "learning_rate": 8.45259992473859e-06, + "loss": 0.8182, + "step": 9427 + }, + { + "epoch": 0.5189058286091695, + "grad_norm": 0.6945043802261353, + "learning_rate": 8.452286379589247e-06, + "loss": 0.7262, + "step": 9428 + }, + { + "epoch": 0.5189608674115251, + "grad_norm": 0.6607717275619507, + "learning_rate": 8.451972808493444e-06, + "loss": 0.7257, + "step": 9429 + }, + { + "epoch": 0.5190159062138808, + "grad_norm": 0.6682843565940857, + "learning_rate": 8.451659211453539e-06, + "loss": 0.6775, + "step": 9430 + }, + { + "epoch": 0.5190709450162364, + "grad_norm": 0.7175559401512146, + "learning_rate": 8.451345588471886e-06, + "loss": 0.7154, + "step": 9431 + }, + { + "epoch": 0.5191259838185921, + "grad_norm": 0.7499119639396667, + "learning_rate": 8.451031939550845e-06, + "loss": 0.7537, + "step": 9432 + }, + { + "epoch": 0.5191810226209478, + "grad_norm": 0.65048748254776, + "learning_rate": 8.450718264692771e-06, + "loss": 0.7253, + "step": 9433 + }, + { + "epoch": 0.5192360614233035, + "grad_norm": 0.7067640423774719, + "learning_rate": 8.450404563900022e-06, + "loss": 0.7245, + "step": 9434 + }, + { + "epoch": 0.519291100225659, + "grad_norm": 0.7079932689666748, + "learning_rate": 8.450090837174956e-06, + "loss": 0.7776, + "step": 9435 + }, + { + "epoch": 0.5193461390280147, + "grad_norm": 0.8260107636451721, + "learning_rate": 8.44977708451993e-06, + "loss": 0.8529, + "step": 9436 + }, + { + "epoch": 0.5194011778303704, + "grad_norm": 0.6412167549133301, + "learning_rate": 8.449463305937304e-06, + "loss": 0.7371, + "step": 9437 + }, + { + "epoch": 0.5194562166327261, + "grad_norm": 0.7067576050758362, + "learning_rate": 8.449149501429435e-06, + "loss": 0.7161, + "step": 9438 + }, + { + "epoch": 0.5195112554350817, + "grad_norm": 0.6966904997825623, + "learning_rate": 8.448835670998681e-06, + "loss": 0.7285, + "step": 9439 + }, + { + "epoch": 0.5195662942374374, + "grad_norm": 0.8066132664680481, + "learning_rate": 8.448521814647401e-06, + "loss": 0.8265, + "step": 9440 + }, + { + "epoch": 0.5196213330397931, + "grad_norm": 0.7597149610519409, + "learning_rate": 8.448207932377957e-06, + "loss": 0.7721, + "step": 9441 + }, + { + "epoch": 0.5196763718421488, + "grad_norm": 0.6965302228927612, + "learning_rate": 8.447894024192702e-06, + "loss": 0.749, + "step": 9442 + }, + { + "epoch": 0.5197314106445043, + "grad_norm": 0.7032600045204163, + "learning_rate": 8.447580090094e-06, + "loss": 0.7923, + "step": 9443 + }, + { + "epoch": 0.51978644944686, + "grad_norm": 0.7255309820175171, + "learning_rate": 8.447266130084208e-06, + "loss": 0.6739, + "step": 9444 + }, + { + "epoch": 0.5198414882492157, + "grad_norm": 0.6602993011474609, + "learning_rate": 8.446952144165686e-06, + "loss": 0.7886, + "step": 9445 + }, + { + "epoch": 0.5198965270515714, + "grad_norm": 0.7017884850502014, + "learning_rate": 8.446638132340796e-06, + "loss": 0.7554, + "step": 9446 + }, + { + "epoch": 0.519951565853927, + "grad_norm": 0.7234843969345093, + "learning_rate": 8.446324094611894e-06, + "loss": 0.8294, + "step": 9447 + }, + { + "epoch": 0.5200066046562827, + "grad_norm": 0.6859332919120789, + "learning_rate": 8.446010030981347e-06, + "loss": 0.7563, + "step": 9448 + }, + { + "epoch": 0.5200616434586384, + "grad_norm": 0.7759458422660828, + "learning_rate": 8.445695941451507e-06, + "loss": 0.7577, + "step": 9449 + }, + { + "epoch": 0.520116682260994, + "grad_norm": 0.7852263450622559, + "learning_rate": 8.44538182602474e-06, + "loss": 0.7446, + "step": 9450 + }, + { + "epoch": 0.5201717210633496, + "grad_norm": 0.8143053650856018, + "learning_rate": 8.445067684703406e-06, + "loss": 0.7995, + "step": 9451 + }, + { + "epoch": 0.5202267598657053, + "grad_norm": 0.692738950252533, + "learning_rate": 8.444753517489865e-06, + "loss": 0.7185, + "step": 9452 + }, + { + "epoch": 0.520281798668061, + "grad_norm": 0.6615390181541443, + "learning_rate": 8.444439324386478e-06, + "loss": 0.7128, + "step": 9453 + }, + { + "epoch": 0.5203368374704166, + "grad_norm": 0.7360419034957886, + "learning_rate": 8.444125105395608e-06, + "loss": 0.6565, + "step": 9454 + }, + { + "epoch": 0.5203918762727723, + "grad_norm": 0.7280182838439941, + "learning_rate": 8.443810860519615e-06, + "loss": 0.7295, + "step": 9455 + }, + { + "epoch": 0.520446915075128, + "grad_norm": 0.787367582321167, + "learning_rate": 8.44349658976086e-06, + "loss": 0.7342, + "step": 9456 + }, + { + "epoch": 0.5205019538774837, + "grad_norm": 0.7496024966239929, + "learning_rate": 8.44318229312171e-06, + "loss": 0.7499, + "step": 9457 + }, + { + "epoch": 0.5205569926798392, + "grad_norm": 0.9167383909225464, + "learning_rate": 8.44286797060452e-06, + "loss": 0.7797, + "step": 9458 + }, + { + "epoch": 0.5206120314821949, + "grad_norm": 0.7032341957092285, + "learning_rate": 8.442553622211659e-06, + "loss": 0.7627, + "step": 9459 + }, + { + "epoch": 0.5206670702845506, + "grad_norm": 1.2905993461608887, + "learning_rate": 8.442239247945485e-06, + "loss": 0.7841, + "step": 9460 + }, + { + "epoch": 0.5207221090869063, + "grad_norm": 0.6909230351448059, + "learning_rate": 8.441924847808362e-06, + "loss": 0.7234, + "step": 9461 + }, + { + "epoch": 0.5207771478892619, + "grad_norm": 0.6632175445556641, + "learning_rate": 8.441610421802653e-06, + "loss": 0.6733, + "step": 9462 + }, + { + "epoch": 0.5208321866916176, + "grad_norm": 0.7838154435157776, + "learning_rate": 8.441295969930722e-06, + "loss": 0.7583, + "step": 9463 + }, + { + "epoch": 0.5208872254939733, + "grad_norm": 0.6380481123924255, + "learning_rate": 8.440981492194932e-06, + "loss": 0.7109, + "step": 9464 + }, + { + "epoch": 0.520942264296329, + "grad_norm": 0.6859052181243896, + "learning_rate": 8.440666988597646e-06, + "loss": 0.7387, + "step": 9465 + }, + { + "epoch": 0.5209973030986845, + "grad_norm": 0.7411379814147949, + "learning_rate": 8.440352459141226e-06, + "loss": 0.7852, + "step": 9466 + }, + { + "epoch": 0.5210523419010402, + "grad_norm": 0.6925216913223267, + "learning_rate": 8.44003790382804e-06, + "loss": 0.8228, + "step": 9467 + }, + { + "epoch": 0.5211073807033959, + "grad_norm": 0.7136396169662476, + "learning_rate": 8.43972332266045e-06, + "loss": 0.8168, + "step": 9468 + }, + { + "epoch": 0.5211624195057516, + "grad_norm": 0.719639003276825, + "learning_rate": 8.43940871564082e-06, + "loss": 0.6728, + "step": 9469 + }, + { + "epoch": 0.5212174583081072, + "grad_norm": 0.647861897945404, + "learning_rate": 8.439094082771513e-06, + "loss": 0.6986, + "step": 9470 + }, + { + "epoch": 0.5212724971104629, + "grad_norm": 0.6644579172134399, + "learning_rate": 8.438779424054897e-06, + "loss": 0.6263, + "step": 9471 + }, + { + "epoch": 0.5213275359128186, + "grad_norm": 0.7157352566719055, + "learning_rate": 8.438464739493335e-06, + "loss": 0.827, + "step": 9472 + }, + { + "epoch": 0.5213825747151742, + "grad_norm": 0.793765127658844, + "learning_rate": 8.438150029089193e-06, + "loss": 0.741, + "step": 9473 + }, + { + "epoch": 0.5214376135175298, + "grad_norm": 0.7078518867492676, + "learning_rate": 8.437835292844836e-06, + "loss": 0.7618, + "step": 9474 + }, + { + "epoch": 0.5214926523198855, + "grad_norm": 0.7492140531539917, + "learning_rate": 8.437520530762628e-06, + "loss": 0.7894, + "step": 9475 + }, + { + "epoch": 0.5215476911222412, + "grad_norm": 0.6534473299980164, + "learning_rate": 8.437205742844937e-06, + "loss": 0.7567, + "step": 9476 + }, + { + "epoch": 0.5216027299245969, + "grad_norm": 0.8745388984680176, + "learning_rate": 8.436890929094126e-06, + "loss": 0.8758, + "step": 9477 + }, + { + "epoch": 0.5216577687269525, + "grad_norm": 0.6804752349853516, + "learning_rate": 8.436576089512564e-06, + "loss": 0.7841, + "step": 9478 + }, + { + "epoch": 0.5217128075293082, + "grad_norm": 0.712065577507019, + "learning_rate": 8.436261224102615e-06, + "loss": 0.8079, + "step": 9479 + }, + { + "epoch": 0.5217678463316638, + "grad_norm": 0.8733783960342407, + "learning_rate": 8.435946332866648e-06, + "loss": 0.8295, + "step": 9480 + }, + { + "epoch": 0.5218228851340195, + "grad_norm": 0.6871289610862732, + "learning_rate": 8.435631415807028e-06, + "loss": 0.7087, + "step": 9481 + }, + { + "epoch": 0.5218779239363751, + "grad_norm": 0.8363185524940491, + "learning_rate": 8.43531647292612e-06, + "loss": 0.7329, + "step": 9482 + }, + { + "epoch": 0.5219329627387308, + "grad_norm": 0.6845195293426514, + "learning_rate": 8.435001504226295e-06, + "loss": 0.7651, + "step": 9483 + }, + { + "epoch": 0.5219880015410865, + "grad_norm": 0.7527645826339722, + "learning_rate": 8.434686509709917e-06, + "loss": 0.6856, + "step": 9484 + }, + { + "epoch": 0.5220430403434422, + "grad_norm": 0.6945710778236389, + "learning_rate": 8.434371489379356e-06, + "loss": 0.6875, + "step": 9485 + }, + { + "epoch": 0.5220980791457978, + "grad_norm": 0.7668873071670532, + "learning_rate": 8.434056443236977e-06, + "loss": 0.7662, + "step": 9486 + }, + { + "epoch": 0.5221531179481534, + "grad_norm": 0.9873473048210144, + "learning_rate": 8.433741371285148e-06, + "loss": 0.7662, + "step": 9487 + }, + { + "epoch": 0.5222081567505091, + "grad_norm": 0.8635447025299072, + "learning_rate": 8.43342627352624e-06, + "loss": 0.645, + "step": 9488 + }, + { + "epoch": 0.5222631955528648, + "grad_norm": 0.7836978435516357, + "learning_rate": 8.43311114996262e-06, + "loss": 0.7647, + "step": 9489 + }, + { + "epoch": 0.5223182343552204, + "grad_norm": 0.8370835185050964, + "learning_rate": 8.432796000596652e-06, + "loss": 0.8402, + "step": 9490 + }, + { + "epoch": 0.5223732731575761, + "grad_norm": 0.9627843499183655, + "learning_rate": 8.432480825430712e-06, + "loss": 0.6985, + "step": 9491 + }, + { + "epoch": 0.5224283119599318, + "grad_norm": 0.6774263978004456, + "learning_rate": 8.432165624467163e-06, + "loss": 0.7051, + "step": 9492 + }, + { + "epoch": 0.5224833507622874, + "grad_norm": 0.6590597033500671, + "learning_rate": 8.431850397708375e-06, + "loss": 0.7147, + "step": 9493 + }, + { + "epoch": 0.522538389564643, + "grad_norm": 0.8153522610664368, + "learning_rate": 8.43153514515672e-06, + "loss": 0.6759, + "step": 9494 + }, + { + "epoch": 0.5225934283669987, + "grad_norm": 0.7457708716392517, + "learning_rate": 8.431219866814563e-06, + "loss": 0.7168, + "step": 9495 + }, + { + "epoch": 0.5226484671693544, + "grad_norm": 0.6994161009788513, + "learning_rate": 8.430904562684278e-06, + "loss": 0.8393, + "step": 9496 + }, + { + "epoch": 0.52270350597171, + "grad_norm": 0.780337393283844, + "learning_rate": 8.430589232768232e-06, + "loss": 0.6528, + "step": 9497 + }, + { + "epoch": 0.5227585447740657, + "grad_norm": 0.6833232641220093, + "learning_rate": 8.430273877068796e-06, + "loss": 0.7545, + "step": 9498 + }, + { + "epoch": 0.5228135835764214, + "grad_norm": 0.7330057621002197, + "learning_rate": 8.42995849558834e-06, + "loss": 0.7932, + "step": 9499 + }, + { + "epoch": 0.5228686223787771, + "grad_norm": 0.8131541609764099, + "learning_rate": 8.429643088329233e-06, + "loss": 0.7546, + "step": 9500 + }, + { + "epoch": 0.5229236611811326, + "grad_norm": 0.7353833317756653, + "learning_rate": 8.42932765529385e-06, + "loss": 0.7508, + "step": 9501 + }, + { + "epoch": 0.5229786999834883, + "grad_norm": 0.7166246771812439, + "learning_rate": 8.429012196484554e-06, + "loss": 0.728, + "step": 9502 + }, + { + "epoch": 0.523033738785844, + "grad_norm": 0.732064962387085, + "learning_rate": 8.428696711903721e-06, + "loss": 0.8306, + "step": 9503 + }, + { + "epoch": 0.5230887775881997, + "grad_norm": 0.6858934164047241, + "learning_rate": 8.428381201553721e-06, + "loss": 0.7801, + "step": 9504 + }, + { + "epoch": 0.5231438163905553, + "grad_norm": 0.7046478986740112, + "learning_rate": 8.428065665436928e-06, + "loss": 0.7365, + "step": 9505 + }, + { + "epoch": 0.523198855192911, + "grad_norm": 0.6669325828552246, + "learning_rate": 8.42775010355571e-06, + "loss": 0.7764, + "step": 9506 + }, + { + "epoch": 0.5232538939952667, + "grad_norm": 0.655619740486145, + "learning_rate": 8.427434515912438e-06, + "loss": 0.7919, + "step": 9507 + }, + { + "epoch": 0.5233089327976224, + "grad_norm": 0.6236690878868103, + "learning_rate": 8.427118902509487e-06, + "loss": 0.6653, + "step": 9508 + }, + { + "epoch": 0.5233639715999779, + "grad_norm": 0.8233165740966797, + "learning_rate": 8.426803263349228e-06, + "loss": 0.8012, + "step": 9509 + }, + { + "epoch": 0.5234190104023336, + "grad_norm": 0.6626759171485901, + "learning_rate": 8.426487598434035e-06, + "loss": 0.7728, + "step": 9510 + }, + { + "epoch": 0.5234740492046893, + "grad_norm": 0.9209974408149719, + "learning_rate": 8.426171907766275e-06, + "loss": 0.769, + "step": 9511 + }, + { + "epoch": 0.523529088007045, + "grad_norm": 0.6297587156295776, + "learning_rate": 8.425856191348325e-06, + "loss": 0.7333, + "step": 9512 + }, + { + "epoch": 0.5235841268094006, + "grad_norm": 0.6995256543159485, + "learning_rate": 8.425540449182558e-06, + "loss": 0.7486, + "step": 9513 + }, + { + "epoch": 0.5236391656117563, + "grad_norm": 0.8076607584953308, + "learning_rate": 8.425224681271345e-06, + "loss": 0.8533, + "step": 9514 + }, + { + "epoch": 0.523694204414112, + "grad_norm": 1.2198601961135864, + "learning_rate": 8.42490888761706e-06, + "loss": 0.7291, + "step": 9515 + }, + { + "epoch": 0.5237492432164677, + "grad_norm": 0.7047159671783447, + "learning_rate": 8.424593068222076e-06, + "loss": 0.713, + "step": 9516 + }, + { + "epoch": 0.5238042820188232, + "grad_norm": 0.7652333378791809, + "learning_rate": 8.424277223088768e-06, + "loss": 0.8149, + "step": 9517 + }, + { + "epoch": 0.5238593208211789, + "grad_norm": 1.1311010122299194, + "learning_rate": 8.42396135221951e-06, + "loss": 0.8195, + "step": 9518 + }, + { + "epoch": 0.5239143596235346, + "grad_norm": 0.7855533957481384, + "learning_rate": 8.423645455616674e-06, + "loss": 0.7901, + "step": 9519 + }, + { + "epoch": 0.5239693984258903, + "grad_norm": 0.7028971314430237, + "learning_rate": 8.423329533282635e-06, + "loss": 0.8006, + "step": 9520 + }, + { + "epoch": 0.5240244372282459, + "grad_norm": 0.703809916973114, + "learning_rate": 8.423013585219769e-06, + "loss": 0.7581, + "step": 9521 + }, + { + "epoch": 0.5240794760306016, + "grad_norm": 0.94233238697052, + "learning_rate": 8.422697611430448e-06, + "loss": 0.7689, + "step": 9522 + }, + { + "epoch": 0.5241345148329573, + "grad_norm": 0.8164071440696716, + "learning_rate": 8.422381611917047e-06, + "loss": 0.8761, + "step": 9523 + }, + { + "epoch": 0.5241895536353129, + "grad_norm": 0.6242091059684753, + "learning_rate": 8.422065586681944e-06, + "loss": 0.6975, + "step": 9524 + }, + { + "epoch": 0.5242445924376685, + "grad_norm": 0.6607261300086975, + "learning_rate": 8.42174953572751e-06, + "loss": 0.6847, + "step": 9525 + }, + { + "epoch": 0.5242996312400242, + "grad_norm": 0.7174261212348938, + "learning_rate": 8.421433459056123e-06, + "loss": 0.7905, + "step": 9526 + }, + { + "epoch": 0.5243546700423799, + "grad_norm": 0.7414089441299438, + "learning_rate": 8.42111735667016e-06, + "loss": 0.7788, + "step": 9527 + }, + { + "epoch": 0.5244097088447356, + "grad_norm": 0.7347442507743835, + "learning_rate": 8.420801228571992e-06, + "loss": 0.7691, + "step": 9528 + }, + { + "epoch": 0.5244647476470912, + "grad_norm": 0.6947832107543945, + "learning_rate": 8.420485074763999e-06, + "loss": 0.6702, + "step": 9529 + }, + { + "epoch": 0.5245197864494469, + "grad_norm": 0.6865423321723938, + "learning_rate": 8.420168895248557e-06, + "loss": 0.7577, + "step": 9530 + }, + { + "epoch": 0.5245748252518025, + "grad_norm": 0.7023190855979919, + "learning_rate": 8.419852690028039e-06, + "loss": 0.7711, + "step": 9531 + }, + { + "epoch": 0.5246298640541582, + "grad_norm": 0.8312145471572876, + "learning_rate": 8.419536459104824e-06, + "loss": 0.7999, + "step": 9532 + }, + { + "epoch": 0.5246849028565138, + "grad_norm": 0.6700688600540161, + "learning_rate": 8.419220202481288e-06, + "loss": 0.7163, + "step": 9533 + }, + { + "epoch": 0.5247399416588695, + "grad_norm": 0.767062246799469, + "learning_rate": 8.418903920159809e-06, + "loss": 0.7451, + "step": 9534 + }, + { + "epoch": 0.5247949804612252, + "grad_norm": 0.6814010143280029, + "learning_rate": 8.418587612142763e-06, + "loss": 0.771, + "step": 9535 + }, + { + "epoch": 0.5248500192635808, + "grad_norm": 0.6728426218032837, + "learning_rate": 8.418271278432528e-06, + "loss": 0.8336, + "step": 9536 + }, + { + "epoch": 0.5249050580659365, + "grad_norm": 0.7112382650375366, + "learning_rate": 8.417954919031482e-06, + "loss": 0.7392, + "step": 9537 + }, + { + "epoch": 0.5249600968682921, + "grad_norm": 0.7371365427970886, + "learning_rate": 8.417638533942e-06, + "loss": 0.8233, + "step": 9538 + }, + { + "epoch": 0.5250151356706478, + "grad_norm": 0.6593502163887024, + "learning_rate": 8.41732212316646e-06, + "loss": 0.7455, + "step": 9539 + }, + { + "epoch": 0.5250701744730034, + "grad_norm": 0.685553252696991, + "learning_rate": 8.417005686707245e-06, + "loss": 0.7783, + "step": 9540 + }, + { + "epoch": 0.5251252132753591, + "grad_norm": 0.7003353238105774, + "learning_rate": 8.41668922456673e-06, + "loss": 0.7733, + "step": 9541 + }, + { + "epoch": 0.5251802520777148, + "grad_norm": 0.7602891325950623, + "learning_rate": 8.416372736747292e-06, + "loss": 0.7236, + "step": 9542 + }, + { + "epoch": 0.5252352908800705, + "grad_norm": 0.647531270980835, + "learning_rate": 8.41605622325131e-06, + "loss": 0.7388, + "step": 9543 + }, + { + "epoch": 0.5252903296824261, + "grad_norm": 0.7309756875038147, + "learning_rate": 8.415739684081165e-06, + "loss": 0.7178, + "step": 9544 + }, + { + "epoch": 0.5253453684847817, + "grad_norm": 0.6991532444953918, + "learning_rate": 8.415423119239236e-06, + "loss": 0.8078, + "step": 9545 + }, + { + "epoch": 0.5254004072871374, + "grad_norm": 0.7392330765724182, + "learning_rate": 8.4151065287279e-06, + "loss": 0.8452, + "step": 9546 + }, + { + "epoch": 0.5254554460894931, + "grad_norm": 0.7617329955101013, + "learning_rate": 8.414789912549537e-06, + "loss": 0.7885, + "step": 9547 + }, + { + "epoch": 0.5255104848918487, + "grad_norm": 1.160125732421875, + "learning_rate": 8.414473270706527e-06, + "loss": 0.9628, + "step": 9548 + }, + { + "epoch": 0.5255655236942044, + "grad_norm": 0.7578685879707336, + "learning_rate": 8.414156603201252e-06, + "loss": 0.7745, + "step": 9549 + }, + { + "epoch": 0.5256205624965601, + "grad_norm": 0.6963017582893372, + "learning_rate": 8.413839910036089e-06, + "loss": 0.7693, + "step": 9550 + }, + { + "epoch": 0.5256756012989158, + "grad_norm": 0.6631398797035217, + "learning_rate": 8.413523191213415e-06, + "loss": 0.6606, + "step": 9551 + }, + { + "epoch": 0.5257306401012714, + "grad_norm": 0.707343339920044, + "learning_rate": 8.41320644673562e-06, + "loss": 0.7161, + "step": 9552 + }, + { + "epoch": 0.525785678903627, + "grad_norm": 0.833448588848114, + "learning_rate": 8.412889676605075e-06, + "loss": 0.7509, + "step": 9553 + }, + { + "epoch": 0.5258407177059827, + "grad_norm": 0.6214264631271362, + "learning_rate": 8.412572880824168e-06, + "loss": 0.7436, + "step": 9554 + }, + { + "epoch": 0.5258957565083384, + "grad_norm": 0.6479233503341675, + "learning_rate": 8.412256059395274e-06, + "loss": 0.7359, + "step": 9555 + }, + { + "epoch": 0.525950795310694, + "grad_norm": 0.7596501111984253, + "learning_rate": 8.411939212320778e-06, + "loss": 0.7422, + "step": 9556 + }, + { + "epoch": 0.5260058341130497, + "grad_norm": 0.8040934205055237, + "learning_rate": 8.41162233960306e-06, + "loss": 0.7721, + "step": 9557 + }, + { + "epoch": 0.5260608729154054, + "grad_norm": 0.7190027832984924, + "learning_rate": 8.411305441244505e-06, + "loss": 0.8794, + "step": 9558 + }, + { + "epoch": 0.5261159117177611, + "grad_norm": 0.8002649545669556, + "learning_rate": 8.410988517247486e-06, + "loss": 0.7958, + "step": 9559 + }, + { + "epoch": 0.5261709505201166, + "grad_norm": 0.7151750326156616, + "learning_rate": 8.410671567614394e-06, + "loss": 0.7597, + "step": 9560 + }, + { + "epoch": 0.5262259893224723, + "grad_norm": 0.9718102812767029, + "learning_rate": 8.410354592347607e-06, + "loss": 0.8272, + "step": 9561 + }, + { + "epoch": 0.526281028124828, + "grad_norm": 0.701932966709137, + "learning_rate": 8.410037591449506e-06, + "loss": 0.808, + "step": 9562 + }, + { + "epoch": 0.5263360669271837, + "grad_norm": 0.8247585296630859, + "learning_rate": 8.409720564922476e-06, + "loss": 0.7598, + "step": 9563 + }, + { + "epoch": 0.5263911057295393, + "grad_norm": 0.7305104732513428, + "learning_rate": 8.409403512768899e-06, + "loss": 0.8161, + "step": 9564 + }, + { + "epoch": 0.526446144531895, + "grad_norm": 0.8726410865783691, + "learning_rate": 8.409086434991158e-06, + "loss": 0.8598, + "step": 9565 + }, + { + "epoch": 0.5265011833342507, + "grad_norm": 0.7329155802726746, + "learning_rate": 8.408769331591637e-06, + "loss": 0.7355, + "step": 9566 + }, + { + "epoch": 0.5265562221366064, + "grad_norm": 0.8227902054786682, + "learning_rate": 8.408452202572716e-06, + "loss": 0.7888, + "step": 9567 + }, + { + "epoch": 0.5266112609389619, + "grad_norm": 0.7190666794776917, + "learning_rate": 8.408135047936783e-06, + "loss": 0.669, + "step": 9568 + }, + { + "epoch": 0.5266662997413176, + "grad_norm": 0.6529938578605652, + "learning_rate": 8.407817867686217e-06, + "loss": 0.7345, + "step": 9569 + }, + { + "epoch": 0.5267213385436733, + "grad_norm": 0.6985379457473755, + "learning_rate": 8.407500661823407e-06, + "loss": 0.852, + "step": 9570 + }, + { + "epoch": 0.526776377346029, + "grad_norm": 0.7480047345161438, + "learning_rate": 8.407183430350732e-06, + "loss": 0.7422, + "step": 9571 + }, + { + "epoch": 0.5268314161483846, + "grad_norm": 0.7599420547485352, + "learning_rate": 8.406866173270579e-06, + "loss": 0.7499, + "step": 9572 + }, + { + "epoch": 0.5268864549507403, + "grad_norm": 0.813448965549469, + "learning_rate": 8.406548890585331e-06, + "loss": 0.7979, + "step": 9573 + }, + { + "epoch": 0.526941493753096, + "grad_norm": 0.6029278039932251, + "learning_rate": 8.406231582297374e-06, + "loss": 0.7289, + "step": 9574 + }, + { + "epoch": 0.5269965325554516, + "grad_norm": 0.656829297542572, + "learning_rate": 8.40591424840909e-06, + "loss": 0.6778, + "step": 9575 + }, + { + "epoch": 0.5270515713578072, + "grad_norm": 0.7147198915481567, + "learning_rate": 8.405596888922869e-06, + "loss": 0.7212, + "step": 9576 + }, + { + "epoch": 0.5271066101601629, + "grad_norm": 0.7722035050392151, + "learning_rate": 8.405279503841094e-06, + "loss": 0.8008, + "step": 9577 + }, + { + "epoch": 0.5271616489625186, + "grad_norm": 0.6828493475914001, + "learning_rate": 8.40496209316615e-06, + "loss": 0.787, + "step": 9578 + }, + { + "epoch": 0.5272166877648742, + "grad_norm": 0.6965187788009644, + "learning_rate": 8.40464465690042e-06, + "loss": 0.6803, + "step": 9579 + }, + { + "epoch": 0.5272717265672299, + "grad_norm": 0.7300547957420349, + "learning_rate": 8.404327195046293e-06, + "loss": 0.8165, + "step": 9580 + }, + { + "epoch": 0.5273267653695856, + "grad_norm": 0.7367526292800903, + "learning_rate": 8.404009707606153e-06, + "loss": 0.7709, + "step": 9581 + }, + { + "epoch": 0.5273818041719412, + "grad_norm": 0.6694689989089966, + "learning_rate": 8.40369219458239e-06, + "loss": 0.7971, + "step": 9582 + }, + { + "epoch": 0.5274368429742968, + "grad_norm": 0.6723141074180603, + "learning_rate": 8.403374655977384e-06, + "loss": 0.695, + "step": 9583 + }, + { + "epoch": 0.5274918817766525, + "grad_norm": 0.7737089395523071, + "learning_rate": 8.403057091793528e-06, + "loss": 0.7765, + "step": 9584 + }, + { + "epoch": 0.5275469205790082, + "grad_norm": 0.8378487825393677, + "learning_rate": 8.402739502033204e-06, + "loss": 0.7984, + "step": 9585 + }, + { + "epoch": 0.5276019593813639, + "grad_norm": 0.7496509552001953, + "learning_rate": 8.402421886698802e-06, + "loss": 0.7846, + "step": 9586 + }, + { + "epoch": 0.5276569981837195, + "grad_norm": 0.7020435929298401, + "learning_rate": 8.402104245792706e-06, + "loss": 0.8102, + "step": 9587 + }, + { + "epoch": 0.5277120369860752, + "grad_norm": 0.8877277374267578, + "learning_rate": 8.401786579317308e-06, + "loss": 0.6995, + "step": 9588 + }, + { + "epoch": 0.5277670757884309, + "grad_norm": 0.6975196599960327, + "learning_rate": 8.401468887274991e-06, + "loss": 0.7475, + "step": 9589 + }, + { + "epoch": 0.5278221145907865, + "grad_norm": 0.8267357349395752, + "learning_rate": 8.401151169668144e-06, + "loss": 0.7091, + "step": 9590 + }, + { + "epoch": 0.5278771533931421, + "grad_norm": 0.6778179407119751, + "learning_rate": 8.400833426499156e-06, + "loss": 0.8198, + "step": 9591 + }, + { + "epoch": 0.5279321921954978, + "grad_norm": 0.7343330979347229, + "learning_rate": 8.400515657770414e-06, + "loss": 0.7565, + "step": 9592 + }, + { + "epoch": 0.5279872309978535, + "grad_norm": 0.7745271325111389, + "learning_rate": 8.400197863484307e-06, + "loss": 0.7991, + "step": 9593 + }, + { + "epoch": 0.5280422698002092, + "grad_norm": 0.7652345895767212, + "learning_rate": 8.399880043643224e-06, + "loss": 0.7752, + "step": 9594 + }, + { + "epoch": 0.5280973086025648, + "grad_norm": 0.9764432311058044, + "learning_rate": 8.399562198249551e-06, + "loss": 0.784, + "step": 9595 + }, + { + "epoch": 0.5281523474049205, + "grad_norm": 0.6763052940368652, + "learning_rate": 8.399244327305678e-06, + "loss": 0.7695, + "step": 9596 + }, + { + "epoch": 0.5282073862072761, + "grad_norm": 0.7788934111595154, + "learning_rate": 8.398926430813996e-06, + "loss": 0.8152, + "step": 9597 + }, + { + "epoch": 0.5282624250096318, + "grad_norm": 0.8088317513465881, + "learning_rate": 8.398608508776894e-06, + "loss": 0.7751, + "step": 9598 + }, + { + "epoch": 0.5283174638119874, + "grad_norm": 0.6735319495201111, + "learning_rate": 8.398290561196756e-06, + "loss": 0.7305, + "step": 9599 + }, + { + "epoch": 0.5283725026143431, + "grad_norm": 0.7279297113418579, + "learning_rate": 8.39797258807598e-06, + "loss": 0.7381, + "step": 9600 + }, + { + "epoch": 0.5284275414166988, + "grad_norm": 0.74604332447052, + "learning_rate": 8.39765458941695e-06, + "loss": 0.8138, + "step": 9601 + }, + { + "epoch": 0.5284825802190545, + "grad_norm": 0.7735850214958191, + "learning_rate": 8.397336565222057e-06, + "loss": 0.7364, + "step": 9602 + }, + { + "epoch": 0.52853761902141, + "grad_norm": 0.7890003323554993, + "learning_rate": 8.397018515493693e-06, + "loss": 0.8301, + "step": 9603 + }, + { + "epoch": 0.5285926578237657, + "grad_norm": 0.739054262638092, + "learning_rate": 8.396700440234245e-06, + "loss": 0.7503, + "step": 9604 + }, + { + "epoch": 0.5286476966261214, + "grad_norm": 0.7611023783683777, + "learning_rate": 8.396382339446108e-06, + "loss": 0.7225, + "step": 9605 + }, + { + "epoch": 0.5287027354284771, + "grad_norm": 0.770602285861969, + "learning_rate": 8.39606421313167e-06, + "loss": 0.71, + "step": 9606 + }, + { + "epoch": 0.5287577742308327, + "grad_norm": 0.7495261430740356, + "learning_rate": 8.395746061293322e-06, + "loss": 0.7729, + "step": 9607 + }, + { + "epoch": 0.5288128130331884, + "grad_norm": 0.7159668207168579, + "learning_rate": 8.395427883933456e-06, + "loss": 0.8457, + "step": 9608 + }, + { + "epoch": 0.5288678518355441, + "grad_norm": 0.7663426399230957, + "learning_rate": 8.395109681054463e-06, + "loss": 0.784, + "step": 9609 + }, + { + "epoch": 0.5289228906378998, + "grad_norm": 0.7271933555603027, + "learning_rate": 8.394791452658732e-06, + "loss": 0.7981, + "step": 9610 + }, + { + "epoch": 0.5289779294402553, + "grad_norm": 0.7782096266746521, + "learning_rate": 8.394473198748661e-06, + "loss": 0.7953, + "step": 9611 + }, + { + "epoch": 0.529032968242611, + "grad_norm": 0.8318955302238464, + "learning_rate": 8.394154919326636e-06, + "loss": 0.6875, + "step": 9612 + }, + { + "epoch": 0.5290880070449667, + "grad_norm": 0.7402167916297913, + "learning_rate": 8.393836614395051e-06, + "loss": 0.7805, + "step": 9613 + }, + { + "epoch": 0.5291430458473224, + "grad_norm": 0.6314370632171631, + "learning_rate": 8.393518283956299e-06, + "loss": 0.6841, + "step": 9614 + }, + { + "epoch": 0.529198084649678, + "grad_norm": 0.8387365937232971, + "learning_rate": 8.393199928012772e-06, + "loss": 0.8503, + "step": 9615 + }, + { + "epoch": 0.5292531234520337, + "grad_norm": 0.7066243886947632, + "learning_rate": 8.392881546566863e-06, + "loss": 0.8494, + "step": 9616 + }, + { + "epoch": 0.5293081622543894, + "grad_norm": 0.7034226059913635, + "learning_rate": 8.392563139620964e-06, + "loss": 0.7335, + "step": 9617 + }, + { + "epoch": 0.5293632010567451, + "grad_norm": 0.6969622373580933, + "learning_rate": 8.392244707177468e-06, + "loss": 0.7203, + "step": 9618 + }, + { + "epoch": 0.5294182398591006, + "grad_norm": 0.7694050073623657, + "learning_rate": 8.391926249238768e-06, + "loss": 0.7864, + "step": 9619 + }, + { + "epoch": 0.5294732786614563, + "grad_norm": 0.7284281253814697, + "learning_rate": 8.391607765807262e-06, + "loss": 0.6704, + "step": 9620 + }, + { + "epoch": 0.529528317463812, + "grad_norm": 1.0466688871383667, + "learning_rate": 8.391289256885337e-06, + "loss": 0.7807, + "step": 9621 + }, + { + "epoch": 0.5295833562661676, + "grad_norm": 0.7118388414382935, + "learning_rate": 8.39097072247539e-06, + "loss": 0.738, + "step": 9622 + }, + { + "epoch": 0.5296383950685233, + "grad_norm": 0.794377863407135, + "learning_rate": 8.390652162579815e-06, + "loss": 0.6831, + "step": 9623 + }, + { + "epoch": 0.529693433870879, + "grad_norm": 0.6042492389678955, + "learning_rate": 8.390333577201007e-06, + "loss": 0.6773, + "step": 9624 + }, + { + "epoch": 0.5297484726732347, + "grad_norm": 0.6452521681785583, + "learning_rate": 8.390014966341357e-06, + "loss": 0.7168, + "step": 9625 + }, + { + "epoch": 0.5298035114755902, + "grad_norm": 0.7113651633262634, + "learning_rate": 8.389696330003265e-06, + "loss": 0.709, + "step": 9626 + }, + { + "epoch": 0.5298585502779459, + "grad_norm": 0.6469250917434692, + "learning_rate": 8.38937766818912e-06, + "loss": 0.6804, + "step": 9627 + }, + { + "epoch": 0.5299135890803016, + "grad_norm": 0.7529417872428894, + "learning_rate": 8.389058980901322e-06, + "loss": 0.8537, + "step": 9628 + }, + { + "epoch": 0.5299686278826573, + "grad_norm": 0.7681186199188232, + "learning_rate": 8.388740268142262e-06, + "loss": 0.7383, + "step": 9629 + }, + { + "epoch": 0.5300236666850129, + "grad_norm": 0.6585648655891418, + "learning_rate": 8.388421529914337e-06, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 0.5300787054873686, + "grad_norm": 0.7432085871696472, + "learning_rate": 8.388102766219943e-06, + "loss": 0.7391, + "step": 9631 + }, + { + "epoch": 0.5301337442897243, + "grad_norm": 0.6672815084457397, + "learning_rate": 8.387783977061476e-06, + "loss": 0.8056, + "step": 9632 + }, + { + "epoch": 0.53018878309208, + "grad_norm": 0.7566675543785095, + "learning_rate": 8.387465162441332e-06, + "loss": 0.7858, + "step": 9633 + }, + { + "epoch": 0.5302438218944355, + "grad_norm": 0.6522077322006226, + "learning_rate": 8.387146322361907e-06, + "loss": 0.759, + "step": 9634 + }, + { + "epoch": 0.5302988606967912, + "grad_norm": 0.7246397137641907, + "learning_rate": 8.386827456825597e-06, + "loss": 0.8158, + "step": 9635 + }, + { + "epoch": 0.5303538994991469, + "grad_norm": 0.7577807307243347, + "learning_rate": 8.386508565834797e-06, + "loss": 0.7495, + "step": 9636 + }, + { + "epoch": 0.5304089383015026, + "grad_norm": 0.7080703973770142, + "learning_rate": 8.386189649391906e-06, + "loss": 0.8086, + "step": 9637 + }, + { + "epoch": 0.5304639771038582, + "grad_norm": 0.7505277395248413, + "learning_rate": 8.385870707499321e-06, + "loss": 0.7206, + "step": 9638 + }, + { + "epoch": 0.5305190159062139, + "grad_norm": 0.7044165134429932, + "learning_rate": 8.385551740159437e-06, + "loss": 0.7838, + "step": 9639 + }, + { + "epoch": 0.5305740547085696, + "grad_norm": 0.7921645641326904, + "learning_rate": 8.385232747374652e-06, + "loss": 0.7604, + "step": 9640 + }, + { + "epoch": 0.5306290935109252, + "grad_norm": 0.9930111169815063, + "learning_rate": 8.384913729147364e-06, + "loss": 0.7839, + "step": 9641 + }, + { + "epoch": 0.5306841323132808, + "grad_norm": 0.7333244681358337, + "learning_rate": 8.38459468547997e-06, + "loss": 0.7941, + "step": 9642 + }, + { + "epoch": 0.5307391711156365, + "grad_norm": 0.7857590913772583, + "learning_rate": 8.384275616374868e-06, + "loss": 0.8535, + "step": 9643 + }, + { + "epoch": 0.5307942099179922, + "grad_norm": 0.8568746447563171, + "learning_rate": 8.383956521834459e-06, + "loss": 0.6586, + "step": 9644 + }, + { + "epoch": 0.5308492487203479, + "grad_norm": 0.7061276435852051, + "learning_rate": 8.383637401861136e-06, + "loss": 0.7288, + "step": 9645 + }, + { + "epoch": 0.5309042875227035, + "grad_norm": 0.7348940968513489, + "learning_rate": 8.383318256457303e-06, + "loss": 0.8099, + "step": 9646 + }, + { + "epoch": 0.5309593263250592, + "grad_norm": 0.6526725888252258, + "learning_rate": 8.382999085625353e-06, + "loss": 0.6702, + "step": 9647 + }, + { + "epoch": 0.5310143651274148, + "grad_norm": 0.8122747540473938, + "learning_rate": 8.382679889367687e-06, + "loss": 0.67, + "step": 9648 + }, + { + "epoch": 0.5310694039297705, + "grad_norm": 0.9145376682281494, + "learning_rate": 8.382360667686706e-06, + "loss": 0.7719, + "step": 9649 + }, + { + "epoch": 0.5311244427321261, + "grad_norm": 0.6659818887710571, + "learning_rate": 8.382041420584807e-06, + "loss": 0.806, + "step": 9650 + }, + { + "epoch": 0.5311794815344818, + "grad_norm": 0.7088539004325867, + "learning_rate": 8.381722148064391e-06, + "loss": 0.7046, + "step": 9651 + }, + { + "epoch": 0.5312345203368375, + "grad_norm": 0.8610590696334839, + "learning_rate": 8.381402850127854e-06, + "loss": 0.6998, + "step": 9652 + }, + { + "epoch": 0.5312895591391932, + "grad_norm": 0.775830864906311, + "learning_rate": 8.3810835267776e-06, + "loss": 0.8874, + "step": 9653 + }, + { + "epoch": 0.5313445979415488, + "grad_norm": 0.6871606707572937, + "learning_rate": 8.380764178016028e-06, + "loss": 0.7903, + "step": 9654 + }, + { + "epoch": 0.5313996367439044, + "grad_norm": 0.7005272507667542, + "learning_rate": 8.380444803845537e-06, + "loss": 0.6685, + "step": 9655 + }, + { + "epoch": 0.5314546755462601, + "grad_norm": 0.8922042846679688, + "learning_rate": 8.380125404268527e-06, + "loss": 0.7797, + "step": 9656 + }, + { + "epoch": 0.5315097143486158, + "grad_norm": 0.7242267727851868, + "learning_rate": 8.3798059792874e-06, + "loss": 0.863, + "step": 9657 + }, + { + "epoch": 0.5315647531509714, + "grad_norm": 0.6625328660011292, + "learning_rate": 8.379486528904555e-06, + "loss": 0.7, + "step": 9658 + }, + { + "epoch": 0.5316197919533271, + "grad_norm": 0.9882226586341858, + "learning_rate": 8.379167053122394e-06, + "loss": 0.7534, + "step": 9659 + }, + { + "epoch": 0.5316748307556828, + "grad_norm": 0.6894702911376953, + "learning_rate": 8.378847551943318e-06, + "loss": 0.7503, + "step": 9660 + }, + { + "epoch": 0.5317298695580385, + "grad_norm": 0.6820259690284729, + "learning_rate": 8.37852802536973e-06, + "loss": 0.7713, + "step": 9661 + }, + { + "epoch": 0.531784908360394, + "grad_norm": 0.667918860912323, + "learning_rate": 8.378208473404028e-06, + "loss": 0.7524, + "step": 9662 + }, + { + "epoch": 0.5318399471627497, + "grad_norm": 0.7789241075515747, + "learning_rate": 8.377888896048617e-06, + "loss": 0.6906, + "step": 9663 + }, + { + "epoch": 0.5318949859651054, + "grad_norm": 0.7264542579650879, + "learning_rate": 8.377569293305894e-06, + "loss": 0.7836, + "step": 9664 + }, + { + "epoch": 0.531950024767461, + "grad_norm": 0.6979835629463196, + "learning_rate": 8.377249665178267e-06, + "loss": 0.7739, + "step": 9665 + }, + { + "epoch": 0.5320050635698167, + "grad_norm": 0.8008072376251221, + "learning_rate": 8.376930011668136e-06, + "loss": 0.7853, + "step": 9666 + }, + { + "epoch": 0.5320601023721724, + "grad_norm": 0.7185621857643127, + "learning_rate": 8.376610332777901e-06, + "loss": 0.7311, + "step": 9667 + }, + { + "epoch": 0.5321151411745281, + "grad_norm": 0.7644047141075134, + "learning_rate": 8.376290628509969e-06, + "loss": 0.6919, + "step": 9668 + }, + { + "epoch": 0.5321701799768837, + "grad_norm": 0.7387600541114807, + "learning_rate": 8.37597089886674e-06, + "loss": 0.7285, + "step": 9669 + }, + { + "epoch": 0.5322252187792393, + "grad_norm": 0.7344895005226135, + "learning_rate": 8.375651143850614e-06, + "loss": 0.7514, + "step": 9670 + }, + { + "epoch": 0.532280257581595, + "grad_norm": 0.6930707097053528, + "learning_rate": 8.375331363464002e-06, + "loss": 0.8318, + "step": 9671 + }, + { + "epoch": 0.5323352963839507, + "grad_norm": 0.678162693977356, + "learning_rate": 8.3750115577093e-06, + "loss": 0.7123, + "step": 9672 + }, + { + "epoch": 0.5323903351863063, + "grad_norm": 0.7780481576919556, + "learning_rate": 8.374691726588914e-06, + "loss": 0.7672, + "step": 9673 + }, + { + "epoch": 0.532445373988662, + "grad_norm": 0.6664674282073975, + "learning_rate": 8.374371870105252e-06, + "loss": 0.6994, + "step": 9674 + }, + { + "epoch": 0.5325004127910177, + "grad_norm": 0.6952562928199768, + "learning_rate": 8.374051988260712e-06, + "loss": 0.8638, + "step": 9675 + }, + { + "epoch": 0.5325554515933734, + "grad_norm": 0.764005184173584, + "learning_rate": 8.373732081057699e-06, + "loss": 0.756, + "step": 9676 + }, + { + "epoch": 0.5326104903957289, + "grad_norm": 0.9434393048286438, + "learning_rate": 8.373412148498621e-06, + "loss": 0.8668, + "step": 9677 + }, + { + "epoch": 0.5326655291980846, + "grad_norm": 0.752609133720398, + "learning_rate": 8.373092190585878e-06, + "loss": 0.8078, + "step": 9678 + }, + { + "epoch": 0.5327205680004403, + "grad_norm": 0.671940803527832, + "learning_rate": 8.37277220732188e-06, + "loss": 0.7726, + "step": 9679 + }, + { + "epoch": 0.532775606802796, + "grad_norm": 0.7824863791465759, + "learning_rate": 8.372452198709027e-06, + "loss": 0.8246, + "step": 9680 + }, + { + "epoch": 0.5328306456051516, + "grad_norm": 0.7300587892532349, + "learning_rate": 8.372132164749726e-06, + "loss": 0.7953, + "step": 9681 + }, + { + "epoch": 0.5328856844075073, + "grad_norm": 0.7146018743515015, + "learning_rate": 8.371812105446384e-06, + "loss": 0.7409, + "step": 9682 + }, + { + "epoch": 0.532940723209863, + "grad_norm": 0.73857581615448, + "learning_rate": 8.371492020801404e-06, + "loss": 0.8067, + "step": 9683 + }, + { + "epoch": 0.5329957620122187, + "grad_norm": 0.6760877966880798, + "learning_rate": 8.37117191081719e-06, + "loss": 0.7363, + "step": 9684 + }, + { + "epoch": 0.5330508008145742, + "grad_norm": 0.766482412815094, + "learning_rate": 8.370851775496154e-06, + "loss": 0.7358, + "step": 9685 + }, + { + "epoch": 0.5331058396169299, + "grad_norm": 0.7230576276779175, + "learning_rate": 8.370531614840697e-06, + "loss": 0.8154, + "step": 9686 + }, + { + "epoch": 0.5331608784192856, + "grad_norm": 0.7357933521270752, + "learning_rate": 8.370211428853225e-06, + "loss": 0.7187, + "step": 9687 + }, + { + "epoch": 0.5332159172216413, + "grad_norm": 0.8208534121513367, + "learning_rate": 8.369891217536148e-06, + "loss": 0.8037, + "step": 9688 + }, + { + "epoch": 0.5332709560239969, + "grad_norm": 0.6771863698959351, + "learning_rate": 8.36957098089187e-06, + "loss": 0.733, + "step": 9689 + }, + { + "epoch": 0.5333259948263526, + "grad_norm": 0.6382480263710022, + "learning_rate": 8.369250718922798e-06, + "loss": 0.7391, + "step": 9690 + }, + { + "epoch": 0.5333810336287083, + "grad_norm": 0.6638994812965393, + "learning_rate": 8.368930431631342e-06, + "loss": 0.7176, + "step": 9691 + }, + { + "epoch": 0.533436072431064, + "grad_norm": 0.7599604725837708, + "learning_rate": 8.368610119019903e-06, + "loss": 0.8814, + "step": 9692 + }, + { + "epoch": 0.5334911112334195, + "grad_norm": 0.6896547079086304, + "learning_rate": 8.368289781090894e-06, + "loss": 0.7618, + "step": 9693 + }, + { + "epoch": 0.5335461500357752, + "grad_norm": 0.7081224918365479, + "learning_rate": 8.36796941784672e-06, + "loss": 0.656, + "step": 9694 + }, + { + "epoch": 0.5336011888381309, + "grad_norm": 0.8819646835327148, + "learning_rate": 8.367649029289791e-06, + "loss": 0.8946, + "step": 9695 + }, + { + "epoch": 0.5336562276404866, + "grad_norm": 0.6597925424575806, + "learning_rate": 8.367328615422512e-06, + "loss": 0.6891, + "step": 9696 + }, + { + "epoch": 0.5337112664428422, + "grad_norm": 0.6855770945549011, + "learning_rate": 8.367008176247294e-06, + "loss": 0.7158, + "step": 9697 + }, + { + "epoch": 0.5337663052451979, + "grad_norm": 0.6874905228614807, + "learning_rate": 8.366687711766541e-06, + "loss": 0.7445, + "step": 9698 + }, + { + "epoch": 0.5338213440475535, + "grad_norm": 0.6990895867347717, + "learning_rate": 8.366367221982666e-06, + "loss": 0.6189, + "step": 9699 + }, + { + "epoch": 0.5338763828499092, + "grad_norm": 0.7235365509986877, + "learning_rate": 8.366046706898075e-06, + "loss": 0.6406, + "step": 9700 + }, + { + "epoch": 0.5339314216522648, + "grad_norm": 0.7563154697418213, + "learning_rate": 8.36572616651518e-06, + "loss": 0.7798, + "step": 9701 + }, + { + "epoch": 0.5339864604546205, + "grad_norm": 0.6845980286598206, + "learning_rate": 8.365405600836387e-06, + "loss": 0.7665, + "step": 9702 + }, + { + "epoch": 0.5340414992569762, + "grad_norm": 0.6374378204345703, + "learning_rate": 8.365085009864106e-06, + "loss": 0.6935, + "step": 9703 + }, + { + "epoch": 0.5340965380593319, + "grad_norm": 0.726672887802124, + "learning_rate": 8.364764393600747e-06, + "loss": 0.7821, + "step": 9704 + }, + { + "epoch": 0.5341515768616875, + "grad_norm": 0.6784456372261047, + "learning_rate": 8.364443752048719e-06, + "loss": 0.7722, + "step": 9705 + }, + { + "epoch": 0.5342066156640431, + "grad_norm": 0.6344080567359924, + "learning_rate": 8.364123085210433e-06, + "loss": 0.7256, + "step": 9706 + }, + { + "epoch": 0.5342616544663988, + "grad_norm": 0.7913152575492859, + "learning_rate": 8.363802393088299e-06, + "loss": 0.7892, + "step": 9707 + }, + { + "epoch": 0.5343166932687544, + "grad_norm": 0.6792107820510864, + "learning_rate": 8.363481675684726e-06, + "loss": 0.7374, + "step": 9708 + }, + { + "epoch": 0.5343717320711101, + "grad_norm": 1.0153685808181763, + "learning_rate": 8.363160933002126e-06, + "loss": 0.7396, + "step": 9709 + }, + { + "epoch": 0.5344267708734658, + "grad_norm": 0.7655258774757385, + "learning_rate": 8.362840165042906e-06, + "loss": 0.7746, + "step": 9710 + }, + { + "epoch": 0.5344818096758215, + "grad_norm": 0.7830179929733276, + "learning_rate": 8.362519371809483e-06, + "loss": 0.7082, + "step": 9711 + }, + { + "epoch": 0.5345368484781771, + "grad_norm": 0.7410556674003601, + "learning_rate": 8.362198553304261e-06, + "loss": 0.7055, + "step": 9712 + }, + { + "epoch": 0.5345918872805328, + "grad_norm": 0.6542297005653381, + "learning_rate": 8.361877709529658e-06, + "loss": 0.7153, + "step": 9713 + }, + { + "epoch": 0.5346469260828884, + "grad_norm": 0.6752653121948242, + "learning_rate": 8.36155684048808e-06, + "loss": 0.6901, + "step": 9714 + }, + { + "epoch": 0.5347019648852441, + "grad_norm": 0.7158684134483337, + "learning_rate": 8.361235946181943e-06, + "loss": 0.7775, + "step": 9715 + }, + { + "epoch": 0.5347570036875997, + "grad_norm": 0.6174392700195312, + "learning_rate": 8.360915026613652e-06, + "loss": 0.6501, + "step": 9716 + }, + { + "epoch": 0.5348120424899554, + "grad_norm": 0.7110500931739807, + "learning_rate": 8.360594081785627e-06, + "loss": 0.742, + "step": 9717 + }, + { + "epoch": 0.5348670812923111, + "grad_norm": 0.8456488251686096, + "learning_rate": 8.360273111700276e-06, + "loss": 0.8237, + "step": 9718 + }, + { + "epoch": 0.5349221200946668, + "grad_norm": 0.6660711169242859, + "learning_rate": 8.359952116360011e-06, + "loss": 0.7856, + "step": 9719 + }, + { + "epoch": 0.5349771588970224, + "grad_norm": 0.7661204934120178, + "learning_rate": 8.359631095767244e-06, + "loss": 0.8336, + "step": 9720 + }, + { + "epoch": 0.535032197699378, + "grad_norm": 0.7747855186462402, + "learning_rate": 8.359310049924392e-06, + "loss": 0.7302, + "step": 9721 + }, + { + "epoch": 0.5350872365017337, + "grad_norm": 0.8156001567840576, + "learning_rate": 8.358988978833864e-06, + "loss": 0.7878, + "step": 9722 + }, + { + "epoch": 0.5351422753040894, + "grad_norm": 0.7371010780334473, + "learning_rate": 8.358667882498073e-06, + "loss": 0.803, + "step": 9723 + }, + { + "epoch": 0.535197314106445, + "grad_norm": 0.7141744494438171, + "learning_rate": 8.358346760919431e-06, + "loss": 0.687, + "step": 9724 + }, + { + "epoch": 0.5352523529088007, + "grad_norm": 0.6395956873893738, + "learning_rate": 8.358025614100358e-06, + "loss": 0.7052, + "step": 9725 + }, + { + "epoch": 0.5353073917111564, + "grad_norm": 0.7135289311408997, + "learning_rate": 8.35770444204326e-06, + "loss": 0.7882, + "step": 9726 + }, + { + "epoch": 0.5353624305135121, + "grad_norm": 0.702408492565155, + "learning_rate": 8.357383244750557e-06, + "loss": 0.6965, + "step": 9727 + }, + { + "epoch": 0.5354174693158676, + "grad_norm": 0.731193482875824, + "learning_rate": 8.357062022224658e-06, + "loss": 0.7525, + "step": 9728 + }, + { + "epoch": 0.5354725081182233, + "grad_norm": 0.8115057945251465, + "learning_rate": 8.356740774467982e-06, + "loss": 0.7466, + "step": 9729 + }, + { + "epoch": 0.535527546920579, + "grad_norm": 0.8644380569458008, + "learning_rate": 8.356419501482938e-06, + "loss": 0.7989, + "step": 9730 + }, + { + "epoch": 0.5355825857229347, + "grad_norm": 1.414620041847229, + "learning_rate": 8.356098203271945e-06, + "loss": 0.7782, + "step": 9731 + }, + { + "epoch": 0.5356376245252903, + "grad_norm": 0.7355421185493469, + "learning_rate": 8.355776879837417e-06, + "loss": 0.7163, + "step": 9732 + }, + { + "epoch": 0.535692663327646, + "grad_norm": 0.6556879281997681, + "learning_rate": 8.355455531181766e-06, + "loss": 0.7543, + "step": 9733 + }, + { + "epoch": 0.5357477021300017, + "grad_norm": 0.6632516980171204, + "learning_rate": 8.355134157307412e-06, + "loss": 0.7382, + "step": 9734 + }, + { + "epoch": 0.5358027409323574, + "grad_norm": 0.7096145153045654, + "learning_rate": 8.354812758216767e-06, + "loss": 0.7797, + "step": 9735 + }, + { + "epoch": 0.5358577797347129, + "grad_norm": 0.6404649019241333, + "learning_rate": 8.354491333912244e-06, + "loss": 0.6637, + "step": 9736 + }, + { + "epoch": 0.5359128185370686, + "grad_norm": 0.6987022757530212, + "learning_rate": 8.354169884396266e-06, + "loss": 0.7682, + "step": 9737 + }, + { + "epoch": 0.5359678573394243, + "grad_norm": 0.6593581438064575, + "learning_rate": 8.353848409671245e-06, + "loss": 0.6747, + "step": 9738 + }, + { + "epoch": 0.53602289614178, + "grad_norm": 0.6999880075454712, + "learning_rate": 8.353526909739596e-06, + "loss": 0.6659, + "step": 9739 + }, + { + "epoch": 0.5360779349441356, + "grad_norm": 0.6448989510536194, + "learning_rate": 8.353205384603735e-06, + "loss": 0.7297, + "step": 9740 + }, + { + "epoch": 0.5361329737464913, + "grad_norm": 0.6666765213012695, + "learning_rate": 8.352883834266082e-06, + "loss": 0.6459, + "step": 9741 + }, + { + "epoch": 0.536188012548847, + "grad_norm": 0.8020225763320923, + "learning_rate": 8.352562258729051e-06, + "loss": 0.8122, + "step": 9742 + }, + { + "epoch": 0.5362430513512026, + "grad_norm": 0.6883382201194763, + "learning_rate": 8.35224065799506e-06, + "loss": 0.7084, + "step": 9743 + }, + { + "epoch": 0.5362980901535582, + "grad_norm": 0.7366660237312317, + "learning_rate": 8.351919032066525e-06, + "loss": 0.848, + "step": 9744 + }, + { + "epoch": 0.5363531289559139, + "grad_norm": 0.7408311367034912, + "learning_rate": 8.351597380945863e-06, + "loss": 0.798, + "step": 9745 + }, + { + "epoch": 0.5364081677582696, + "grad_norm": 0.6841676235198975, + "learning_rate": 8.351275704635495e-06, + "loss": 0.7372, + "step": 9746 + }, + { + "epoch": 0.5364632065606253, + "grad_norm": 0.6903505325317383, + "learning_rate": 8.350954003137833e-06, + "loss": 0.7371, + "step": 9747 + }, + { + "epoch": 0.5365182453629809, + "grad_norm": 0.6444700956344604, + "learning_rate": 8.350632276455298e-06, + "loss": 0.6685, + "step": 9748 + }, + { + "epoch": 0.5365732841653366, + "grad_norm": 0.6821029186248779, + "learning_rate": 8.350310524590307e-06, + "loss": 0.8796, + "step": 9749 + }, + { + "epoch": 0.5366283229676923, + "grad_norm": 0.6733999848365784, + "learning_rate": 8.349988747545282e-06, + "loss": 0.6833, + "step": 9750 + }, + { + "epoch": 0.5366833617700478, + "grad_norm": 0.8097321391105652, + "learning_rate": 8.349666945322636e-06, + "loss": 0.834, + "step": 9751 + }, + { + "epoch": 0.5367384005724035, + "grad_norm": 0.7692395448684692, + "learning_rate": 8.34934511792479e-06, + "loss": 0.7866, + "step": 9752 + }, + { + "epoch": 0.5367934393747592, + "grad_norm": 0.7551112174987793, + "learning_rate": 8.349023265354164e-06, + "loss": 0.8378, + "step": 9753 + }, + { + "epoch": 0.5368484781771149, + "grad_norm": 0.5796393156051636, + "learning_rate": 8.348701387613176e-06, + "loss": 0.5995, + "step": 9754 + }, + { + "epoch": 0.5369035169794705, + "grad_norm": 0.6839799284934998, + "learning_rate": 8.348379484704244e-06, + "loss": 0.8262, + "step": 9755 + }, + { + "epoch": 0.5369585557818262, + "grad_norm": 0.7710869908332825, + "learning_rate": 8.348057556629786e-06, + "loss": 0.7796, + "step": 9756 + }, + { + "epoch": 0.5370135945841819, + "grad_norm": 0.733096718788147, + "learning_rate": 8.347735603392225e-06, + "loss": 0.8233, + "step": 9757 + }, + { + "epoch": 0.5370686333865375, + "grad_norm": 0.6438466906547546, + "learning_rate": 8.347413624993982e-06, + "loss": 0.7582, + "step": 9758 + }, + { + "epoch": 0.5371236721888931, + "grad_norm": 0.6877560615539551, + "learning_rate": 8.34709162143747e-06, + "loss": 0.7428, + "step": 9759 + }, + { + "epoch": 0.5371787109912488, + "grad_norm": 1.060831069946289, + "learning_rate": 8.346769592725115e-06, + "loss": 0.8636, + "step": 9760 + }, + { + "epoch": 0.5372337497936045, + "grad_norm": 0.6828434467315674, + "learning_rate": 8.346447538859334e-06, + "loss": 0.7801, + "step": 9761 + }, + { + "epoch": 0.5372887885959602, + "grad_norm": 0.6784753203392029, + "learning_rate": 8.346125459842552e-06, + "loss": 0.7356, + "step": 9762 + }, + { + "epoch": 0.5373438273983158, + "grad_norm": 0.6493560075759888, + "learning_rate": 8.345803355677185e-06, + "loss": 0.749, + "step": 9763 + }, + { + "epoch": 0.5373988662006715, + "grad_norm": 0.7109258770942688, + "learning_rate": 8.345481226365657e-06, + "loss": 0.7599, + "step": 9764 + }, + { + "epoch": 0.5374539050030271, + "grad_norm": 0.8526985049247742, + "learning_rate": 8.345159071910387e-06, + "loss": 0.6605, + "step": 9765 + }, + { + "epoch": 0.5375089438053828, + "grad_norm": 0.9194039702415466, + "learning_rate": 8.344836892313797e-06, + "loss": 0.794, + "step": 9766 + }, + { + "epoch": 0.5375639826077384, + "grad_norm": 0.7258954048156738, + "learning_rate": 8.344514687578307e-06, + "loss": 0.871, + "step": 9767 + }, + { + "epoch": 0.5376190214100941, + "grad_norm": 0.7099377512931824, + "learning_rate": 8.34419245770634e-06, + "loss": 0.8098, + "step": 9768 + }, + { + "epoch": 0.5376740602124498, + "grad_norm": 0.7883020639419556, + "learning_rate": 8.34387020270032e-06, + "loss": 0.8383, + "step": 9769 + }, + { + "epoch": 0.5377290990148055, + "grad_norm": 0.7009730339050293, + "learning_rate": 8.343547922562664e-06, + "loss": 0.7794, + "step": 9770 + }, + { + "epoch": 0.5377841378171611, + "grad_norm": 0.6569581031799316, + "learning_rate": 8.343225617295798e-06, + "loss": 0.7574, + "step": 9771 + }, + { + "epoch": 0.5378391766195167, + "grad_norm": 0.6159278154373169, + "learning_rate": 8.342903286902142e-06, + "loss": 0.7136, + "step": 9772 + }, + { + "epoch": 0.5378942154218724, + "grad_norm": 0.6594879627227783, + "learning_rate": 8.342580931384121e-06, + "loss": 0.6906, + "step": 9773 + }, + { + "epoch": 0.5379492542242281, + "grad_norm": 0.7002933025360107, + "learning_rate": 8.342258550744156e-06, + "loss": 0.7272, + "step": 9774 + }, + { + "epoch": 0.5380042930265837, + "grad_norm": 0.8243216276168823, + "learning_rate": 8.341936144984672e-06, + "loss": 0.8105, + "step": 9775 + }, + { + "epoch": 0.5380593318289394, + "grad_norm": 0.8358921408653259, + "learning_rate": 8.34161371410809e-06, + "loss": 0.7118, + "step": 9776 + }, + { + "epoch": 0.5381143706312951, + "grad_norm": 0.6339066028594971, + "learning_rate": 8.34129125811683e-06, + "loss": 0.7035, + "step": 9777 + }, + { + "epoch": 0.5381694094336508, + "grad_norm": 0.7407625317573547, + "learning_rate": 8.340968777013324e-06, + "loss": 0.7447, + "step": 9778 + }, + { + "epoch": 0.5382244482360063, + "grad_norm": 0.6876600384712219, + "learning_rate": 8.340646270799991e-06, + "loss": 0.7298, + "step": 9779 + }, + { + "epoch": 0.538279487038362, + "grad_norm": 0.7021264433860779, + "learning_rate": 8.340323739479251e-06, + "loss": 0.7869, + "step": 9780 + }, + { + "epoch": 0.5383345258407177, + "grad_norm": 0.7341023087501526, + "learning_rate": 8.340001183053535e-06, + "loss": 0.7447, + "step": 9781 + }, + { + "epoch": 0.5383895646430734, + "grad_norm": 0.6829406023025513, + "learning_rate": 8.339678601525263e-06, + "loss": 0.7438, + "step": 9782 + }, + { + "epoch": 0.538444603445429, + "grad_norm": 0.7671583294868469, + "learning_rate": 8.33935599489686e-06, + "loss": 0.8678, + "step": 9783 + }, + { + "epoch": 0.5384996422477847, + "grad_norm": 0.701797366142273, + "learning_rate": 8.339033363170753e-06, + "loss": 0.8431, + "step": 9784 + }, + { + "epoch": 0.5385546810501404, + "grad_norm": 0.748235285282135, + "learning_rate": 8.338710706349363e-06, + "loss": 0.7905, + "step": 9785 + }, + { + "epoch": 0.5386097198524961, + "grad_norm": 0.8202430605888367, + "learning_rate": 8.338388024435119e-06, + "loss": 0.7734, + "step": 9786 + }, + { + "epoch": 0.5386647586548516, + "grad_norm": 0.8218014240264893, + "learning_rate": 8.338065317430442e-06, + "loss": 0.846, + "step": 9787 + }, + { + "epoch": 0.5387197974572073, + "grad_norm": 0.6773214936256409, + "learning_rate": 8.337742585337762e-06, + "loss": 0.7692, + "step": 9788 + }, + { + "epoch": 0.538774836259563, + "grad_norm": 0.7011464834213257, + "learning_rate": 8.337419828159501e-06, + "loss": 0.7534, + "step": 9789 + }, + { + "epoch": 0.5388298750619187, + "grad_norm": 0.8299004435539246, + "learning_rate": 8.337097045898087e-06, + "loss": 0.7997, + "step": 9790 + }, + { + "epoch": 0.5388849138642743, + "grad_norm": 0.8600753545761108, + "learning_rate": 8.336774238555942e-06, + "loss": 0.8307, + "step": 9791 + }, + { + "epoch": 0.53893995266663, + "grad_norm": 0.676490843296051, + "learning_rate": 8.336451406135498e-06, + "loss": 0.7748, + "step": 9792 + }, + { + "epoch": 0.5389949914689857, + "grad_norm": 0.7094627618789673, + "learning_rate": 8.336128548639177e-06, + "loss": 0.7524, + "step": 9793 + }, + { + "epoch": 0.5390500302713412, + "grad_norm": 0.6804066896438599, + "learning_rate": 8.335805666069407e-06, + "loss": 0.8299, + "step": 9794 + }, + { + "epoch": 0.5391050690736969, + "grad_norm": 0.6992025971412659, + "learning_rate": 8.335482758428614e-06, + "loss": 0.7548, + "step": 9795 + }, + { + "epoch": 0.5391601078760526, + "grad_norm": 0.6649640798568726, + "learning_rate": 8.335159825719227e-06, + "loss": 0.6595, + "step": 9796 + }, + { + "epoch": 0.5392151466784083, + "grad_norm": 0.7292002439498901, + "learning_rate": 8.33483686794367e-06, + "loss": 0.7944, + "step": 9797 + }, + { + "epoch": 0.5392701854807639, + "grad_norm": 0.9124587178230286, + "learning_rate": 8.334513885104375e-06, + "loss": 0.8586, + "step": 9798 + }, + { + "epoch": 0.5393252242831196, + "grad_norm": 0.7091020941734314, + "learning_rate": 8.334190877203761e-06, + "loss": 0.7019, + "step": 9799 + }, + { + "epoch": 0.5393802630854753, + "grad_norm": 0.7470952272415161, + "learning_rate": 8.333867844244265e-06, + "loss": 0.7866, + "step": 9800 + }, + { + "epoch": 0.539435301887831, + "grad_norm": 0.7368966341018677, + "learning_rate": 8.333544786228309e-06, + "loss": 0.8135, + "step": 9801 + }, + { + "epoch": 0.5394903406901865, + "grad_norm": 0.668305516242981, + "learning_rate": 8.333221703158322e-06, + "loss": 0.7549, + "step": 9802 + }, + { + "epoch": 0.5395453794925422, + "grad_norm": 0.6788874268531799, + "learning_rate": 8.332898595036735e-06, + "loss": 0.8077, + "step": 9803 + }, + { + "epoch": 0.5396004182948979, + "grad_norm": 0.654863715171814, + "learning_rate": 8.332575461865972e-06, + "loss": 0.7695, + "step": 9804 + }, + { + "epoch": 0.5396554570972536, + "grad_norm": 0.7460314631462097, + "learning_rate": 8.332252303648464e-06, + "loss": 0.7711, + "step": 9805 + }, + { + "epoch": 0.5397104958996092, + "grad_norm": 0.7923582792282104, + "learning_rate": 8.331929120386643e-06, + "loss": 0.7348, + "step": 9806 + }, + { + "epoch": 0.5397655347019649, + "grad_norm": 0.6570843458175659, + "learning_rate": 8.331605912082932e-06, + "loss": 0.7029, + "step": 9807 + }, + { + "epoch": 0.5398205735043206, + "grad_norm": 0.7728865742683411, + "learning_rate": 8.331282678739762e-06, + "loss": 0.8249, + "step": 9808 + }, + { + "epoch": 0.5398756123066762, + "grad_norm": 0.7121468186378479, + "learning_rate": 8.330959420359565e-06, + "loss": 0.8698, + "step": 9809 + }, + { + "epoch": 0.5399306511090318, + "grad_norm": 0.7779444456100464, + "learning_rate": 8.330636136944768e-06, + "loss": 0.7448, + "step": 9810 + }, + { + "epoch": 0.5399856899113875, + "grad_norm": 0.7770833373069763, + "learning_rate": 8.330312828497801e-06, + "loss": 0.8489, + "step": 9811 + }, + { + "epoch": 0.5400407287137432, + "grad_norm": 0.6705769896507263, + "learning_rate": 8.329989495021096e-06, + "loss": 0.7349, + "step": 9812 + }, + { + "epoch": 0.5400957675160989, + "grad_norm": 0.6775381565093994, + "learning_rate": 8.329666136517079e-06, + "loss": 0.8093, + "step": 9813 + }, + { + "epoch": 0.5401508063184545, + "grad_norm": 0.6621832251548767, + "learning_rate": 8.329342752988183e-06, + "loss": 0.7877, + "step": 9814 + }, + { + "epoch": 0.5402058451208102, + "grad_norm": 0.704339861869812, + "learning_rate": 8.329019344436839e-06, + "loss": 0.7708, + "step": 9815 + }, + { + "epoch": 0.5402608839231658, + "grad_norm": 0.789944052696228, + "learning_rate": 8.328695910865476e-06, + "loss": 0.7563, + "step": 9816 + }, + { + "epoch": 0.5403159227255215, + "grad_norm": 0.6997420191764832, + "learning_rate": 8.328372452276525e-06, + "loss": 0.7023, + "step": 9817 + }, + { + "epoch": 0.5403709615278771, + "grad_norm": 0.6453180313110352, + "learning_rate": 8.328048968672418e-06, + "loss": 0.7193, + "step": 9818 + }, + { + "epoch": 0.5404260003302328, + "grad_norm": 0.7059640884399414, + "learning_rate": 8.327725460055586e-06, + "loss": 0.7875, + "step": 9819 + }, + { + "epoch": 0.5404810391325885, + "grad_norm": 0.7725005745887756, + "learning_rate": 8.327401926428461e-06, + "loss": 0.7503, + "step": 9820 + }, + { + "epoch": 0.5405360779349442, + "grad_norm": 0.7710940837860107, + "learning_rate": 8.327078367793473e-06, + "loss": 0.8314, + "step": 9821 + }, + { + "epoch": 0.5405911167372998, + "grad_norm": 0.9090666770935059, + "learning_rate": 8.326754784153055e-06, + "loss": 0.8021, + "step": 9822 + }, + { + "epoch": 0.5406461555396554, + "grad_norm": 0.7135322690010071, + "learning_rate": 8.326431175509638e-06, + "loss": 0.8084, + "step": 9823 + }, + { + "epoch": 0.5407011943420111, + "grad_norm": 0.9126102328300476, + "learning_rate": 8.326107541865656e-06, + "loss": 0.75, + "step": 9824 + }, + { + "epoch": 0.5407562331443668, + "grad_norm": 0.7263361215591431, + "learning_rate": 8.325783883223539e-06, + "loss": 0.6808, + "step": 9825 + }, + { + "epoch": 0.5408112719467224, + "grad_norm": 0.7234700918197632, + "learning_rate": 8.32546019958572e-06, + "loss": 0.7582, + "step": 9826 + }, + { + "epoch": 0.5408663107490781, + "grad_norm": 0.7043294310569763, + "learning_rate": 8.325136490954633e-06, + "loss": 0.8421, + "step": 9827 + }, + { + "epoch": 0.5409213495514338, + "grad_norm": 0.7947664856910706, + "learning_rate": 8.32481275733271e-06, + "loss": 0.8672, + "step": 9828 + }, + { + "epoch": 0.5409763883537895, + "grad_norm": 0.704590916633606, + "learning_rate": 8.324488998722384e-06, + "loss": 0.7356, + "step": 9829 + }, + { + "epoch": 0.541031427156145, + "grad_norm": 0.7630662322044373, + "learning_rate": 8.32416521512609e-06, + "loss": 0.7082, + "step": 9830 + }, + { + "epoch": 0.5410864659585007, + "grad_norm": 0.728721022605896, + "learning_rate": 8.323841406546259e-06, + "loss": 0.7987, + "step": 9831 + }, + { + "epoch": 0.5411415047608564, + "grad_norm": 0.7164294719696045, + "learning_rate": 8.323517572985326e-06, + "loss": 0.721, + "step": 9832 + }, + { + "epoch": 0.5411965435632121, + "grad_norm": 0.7555723190307617, + "learning_rate": 8.323193714445722e-06, + "loss": 0.814, + "step": 9833 + }, + { + "epoch": 0.5412515823655677, + "grad_norm": 0.827485978603363, + "learning_rate": 8.322869830929887e-06, + "loss": 0.8817, + "step": 9834 + }, + { + "epoch": 0.5413066211679234, + "grad_norm": 0.718950092792511, + "learning_rate": 8.322545922440252e-06, + "loss": 0.8648, + "step": 9835 + }, + { + "epoch": 0.5413616599702791, + "grad_norm": 0.7361611723899841, + "learning_rate": 8.32222198897925e-06, + "loss": 0.7392, + "step": 9836 + }, + { + "epoch": 0.5414166987726347, + "grad_norm": 0.6712168455123901, + "learning_rate": 8.321898030549316e-06, + "loss": 0.7505, + "step": 9837 + }, + { + "epoch": 0.5414717375749903, + "grad_norm": 0.7475710511207581, + "learning_rate": 8.321574047152887e-06, + "loss": 0.7969, + "step": 9838 + }, + { + "epoch": 0.541526776377346, + "grad_norm": 0.9751361608505249, + "learning_rate": 8.321250038792397e-06, + "loss": 0.8534, + "step": 9839 + }, + { + "epoch": 0.5415818151797017, + "grad_norm": 0.6858723163604736, + "learning_rate": 8.32092600547028e-06, + "loss": 0.8277, + "step": 9840 + }, + { + "epoch": 0.5416368539820573, + "grad_norm": 0.8899725675582886, + "learning_rate": 8.320601947188971e-06, + "loss": 0.8599, + "step": 9841 + }, + { + "epoch": 0.541691892784413, + "grad_norm": 0.7140665650367737, + "learning_rate": 8.320277863950907e-06, + "loss": 0.7429, + "step": 9842 + }, + { + "epoch": 0.5417469315867687, + "grad_norm": 0.7467615604400635, + "learning_rate": 8.319953755758525e-06, + "loss": 0.7826, + "step": 9843 + }, + { + "epoch": 0.5418019703891244, + "grad_norm": 0.6578202843666077, + "learning_rate": 8.319629622614258e-06, + "loss": 0.6833, + "step": 9844 + }, + { + "epoch": 0.5418570091914799, + "grad_norm": 0.9430698156356812, + "learning_rate": 8.319305464520543e-06, + "loss": 0.8243, + "step": 9845 + }, + { + "epoch": 0.5419120479938356, + "grad_norm": 0.8632097840309143, + "learning_rate": 8.318981281479817e-06, + "loss": 0.7975, + "step": 9846 + }, + { + "epoch": 0.5419670867961913, + "grad_norm": 0.7241839170455933, + "learning_rate": 8.318657073494517e-06, + "loss": 0.7226, + "step": 9847 + }, + { + "epoch": 0.542022125598547, + "grad_norm": 0.6927164196968079, + "learning_rate": 8.318332840567078e-06, + "loss": 0.7125, + "step": 9848 + }, + { + "epoch": 0.5420771644009026, + "grad_norm": 0.6414939761161804, + "learning_rate": 8.318008582699937e-06, + "loss": 0.7366, + "step": 9849 + }, + { + "epoch": 0.5421322032032583, + "grad_norm": 0.7584436535835266, + "learning_rate": 8.317684299895533e-06, + "loss": 0.8601, + "step": 9850 + }, + { + "epoch": 0.542187242005614, + "grad_norm": 0.6045856475830078, + "learning_rate": 8.317359992156302e-06, + "loss": 0.6697, + "step": 9851 + }, + { + "epoch": 0.5422422808079697, + "grad_norm": 0.715048611164093, + "learning_rate": 8.31703565948468e-06, + "loss": 0.7535, + "step": 9852 + }, + { + "epoch": 0.5422973196103252, + "grad_norm": 0.6925113201141357, + "learning_rate": 8.316711301883106e-06, + "loss": 0.8122, + "step": 9853 + }, + { + "epoch": 0.5423523584126809, + "grad_norm": 0.6787780523300171, + "learning_rate": 8.316386919354018e-06, + "loss": 0.7428, + "step": 9854 + }, + { + "epoch": 0.5424073972150366, + "grad_norm": 0.6831366419792175, + "learning_rate": 8.316062511899855e-06, + "loss": 0.767, + "step": 9855 + }, + { + "epoch": 0.5424624360173923, + "grad_norm": 0.6865691542625427, + "learning_rate": 8.315738079523053e-06, + "loss": 0.6549, + "step": 9856 + }, + { + "epoch": 0.5425174748197479, + "grad_norm": 0.7149406671524048, + "learning_rate": 8.31541362222605e-06, + "loss": 0.8127, + "step": 9857 + }, + { + "epoch": 0.5425725136221036, + "grad_norm": 0.6826779842376709, + "learning_rate": 8.315089140011286e-06, + "loss": 0.706, + "step": 9858 + }, + { + "epoch": 0.5426275524244593, + "grad_norm": 0.688204288482666, + "learning_rate": 8.3147646328812e-06, + "loss": 0.8675, + "step": 9859 + }, + { + "epoch": 0.542682591226815, + "grad_norm": 0.6659492254257202, + "learning_rate": 8.31444010083823e-06, + "loss": 0.7851, + "step": 9860 + }, + { + "epoch": 0.5427376300291705, + "grad_norm": 0.8049291372299194, + "learning_rate": 8.314115543884816e-06, + "loss": 0.7442, + "step": 9861 + }, + { + "epoch": 0.5427926688315262, + "grad_norm": 0.7505989670753479, + "learning_rate": 8.313790962023397e-06, + "loss": 0.8391, + "step": 9862 + }, + { + "epoch": 0.5428477076338819, + "grad_norm": 0.6810199618339539, + "learning_rate": 8.31346635525641e-06, + "loss": 0.8131, + "step": 9863 + }, + { + "epoch": 0.5429027464362376, + "grad_norm": 0.6724215745925903, + "learning_rate": 8.313141723586298e-06, + "loss": 0.75, + "step": 9864 + }, + { + "epoch": 0.5429577852385932, + "grad_norm": 0.7804376482963562, + "learning_rate": 8.3128170670155e-06, + "loss": 0.704, + "step": 9865 + }, + { + "epoch": 0.5430128240409489, + "grad_norm": 0.9494230151176453, + "learning_rate": 8.312492385546455e-06, + "loss": 0.8578, + "step": 9866 + }, + { + "epoch": 0.5430678628433045, + "grad_norm": 0.6780333518981934, + "learning_rate": 8.312167679181606e-06, + "loss": 0.701, + "step": 9867 + }, + { + "epoch": 0.5431229016456602, + "grad_norm": 0.7407701015472412, + "learning_rate": 8.31184294792339e-06, + "loss": 0.8505, + "step": 9868 + }, + { + "epoch": 0.5431779404480158, + "grad_norm": 0.680903434753418, + "learning_rate": 8.311518191774249e-06, + "loss": 0.7645, + "step": 9869 + }, + { + "epoch": 0.5432329792503715, + "grad_norm": 0.6695752143859863, + "learning_rate": 8.311193410736622e-06, + "loss": 0.816, + "step": 9870 + }, + { + "epoch": 0.5432880180527272, + "grad_norm": 0.6725142598152161, + "learning_rate": 8.310868604812954e-06, + "loss": 0.7044, + "step": 9871 + }, + { + "epoch": 0.5433430568550829, + "grad_norm": 0.922627866268158, + "learning_rate": 8.310543774005684e-06, + "loss": 0.7589, + "step": 9872 + }, + { + "epoch": 0.5433980956574385, + "grad_norm": 1.0136839151382446, + "learning_rate": 8.310218918317251e-06, + "loss": 0.7573, + "step": 9873 + }, + { + "epoch": 0.5434531344597942, + "grad_norm": 0.9053532481193542, + "learning_rate": 8.309894037750099e-06, + "loss": 0.8269, + "step": 9874 + }, + { + "epoch": 0.5435081732621498, + "grad_norm": 0.6800149083137512, + "learning_rate": 8.309569132306671e-06, + "loss": 0.716, + "step": 9875 + }, + { + "epoch": 0.5435632120645055, + "grad_norm": 0.7157679796218872, + "learning_rate": 8.309244201989408e-06, + "loss": 0.7433, + "step": 9876 + }, + { + "epoch": 0.5436182508668611, + "grad_norm": 0.9316089749336243, + "learning_rate": 8.308919246800748e-06, + "loss": 0.7499, + "step": 9877 + }, + { + "epoch": 0.5436732896692168, + "grad_norm": 0.6682490110397339, + "learning_rate": 8.308594266743139e-06, + "loss": 0.7286, + "step": 9878 + }, + { + "epoch": 0.5437283284715725, + "grad_norm": 0.7241143584251404, + "learning_rate": 8.308269261819022e-06, + "loss": 0.7934, + "step": 9879 + }, + { + "epoch": 0.5437833672739281, + "grad_norm": 0.7402396202087402, + "learning_rate": 8.307944232030838e-06, + "loss": 0.7361, + "step": 9880 + }, + { + "epoch": 0.5438384060762838, + "grad_norm": 0.6839993596076965, + "learning_rate": 8.307619177381029e-06, + "loss": 0.749, + "step": 9881 + }, + { + "epoch": 0.5438934448786394, + "grad_norm": 0.6536363363265991, + "learning_rate": 8.307294097872041e-06, + "loss": 0.706, + "step": 9882 + }, + { + "epoch": 0.5439484836809951, + "grad_norm": 0.602644681930542, + "learning_rate": 8.306968993506317e-06, + "loss": 0.6857, + "step": 9883 + }, + { + "epoch": 0.5440035224833507, + "grad_norm": 0.6567881107330322, + "learning_rate": 8.306643864286297e-06, + "loss": 0.6989, + "step": 9884 + }, + { + "epoch": 0.5440585612857064, + "grad_norm": 1.0013506412506104, + "learning_rate": 8.306318710214427e-06, + "loss": 0.7251, + "step": 9885 + }, + { + "epoch": 0.5441136000880621, + "grad_norm": 0.7016813158988953, + "learning_rate": 8.305993531293153e-06, + "loss": 0.7535, + "step": 9886 + }, + { + "epoch": 0.5441686388904178, + "grad_norm": 0.7345741391181946, + "learning_rate": 8.305668327524915e-06, + "loss": 0.887, + "step": 9887 + }, + { + "epoch": 0.5442236776927734, + "grad_norm": 1.0925754308700562, + "learning_rate": 8.305343098912158e-06, + "loss": 0.7779, + "step": 9888 + }, + { + "epoch": 0.544278716495129, + "grad_norm": 0.79815274477005, + "learning_rate": 8.305017845457328e-06, + "loss": 0.7736, + "step": 9889 + }, + { + "epoch": 0.5443337552974847, + "grad_norm": 0.6324154138565063, + "learning_rate": 8.304692567162868e-06, + "loss": 0.6823, + "step": 9890 + }, + { + "epoch": 0.5443887940998404, + "grad_norm": 0.6990262866020203, + "learning_rate": 8.304367264031223e-06, + "loss": 0.7804, + "step": 9891 + }, + { + "epoch": 0.544443832902196, + "grad_norm": 1.4203195571899414, + "learning_rate": 8.304041936064839e-06, + "loss": 0.8702, + "step": 9892 + }, + { + "epoch": 0.5444988717045517, + "grad_norm": 0.6986544132232666, + "learning_rate": 8.303716583266161e-06, + "loss": 0.7666, + "step": 9893 + }, + { + "epoch": 0.5445539105069074, + "grad_norm": 0.7037138938903809, + "learning_rate": 8.303391205637632e-06, + "loss": 0.7995, + "step": 9894 + }, + { + "epoch": 0.5446089493092631, + "grad_norm": 0.7101728320121765, + "learning_rate": 8.3030658031817e-06, + "loss": 0.8185, + "step": 9895 + }, + { + "epoch": 0.5446639881116186, + "grad_norm": 0.6571425795555115, + "learning_rate": 8.302740375900808e-06, + "loss": 0.6152, + "step": 9896 + }, + { + "epoch": 0.5447190269139743, + "grad_norm": 0.7560263276100159, + "learning_rate": 8.302414923797406e-06, + "loss": 0.9037, + "step": 9897 + }, + { + "epoch": 0.54477406571633, + "grad_norm": 0.8692007064819336, + "learning_rate": 8.302089446873935e-06, + "loss": 0.7689, + "step": 9898 + }, + { + "epoch": 0.5448291045186857, + "grad_norm": 0.7533506751060486, + "learning_rate": 8.301763945132845e-06, + "loss": 0.7671, + "step": 9899 + }, + { + "epoch": 0.5448841433210413, + "grad_norm": 0.6992233991622925, + "learning_rate": 8.301438418576581e-06, + "loss": 0.723, + "step": 9900 + }, + { + "epoch": 0.544939182123397, + "grad_norm": 0.7966120839118958, + "learning_rate": 8.301112867207589e-06, + "loss": 0.7968, + "step": 9901 + }, + { + "epoch": 0.5449942209257527, + "grad_norm": 0.800558865070343, + "learning_rate": 8.300787291028316e-06, + "loss": 0.8583, + "step": 9902 + }, + { + "epoch": 0.5450492597281084, + "grad_norm": 0.7019909024238586, + "learning_rate": 8.30046169004121e-06, + "loss": 0.7045, + "step": 9903 + }, + { + "epoch": 0.5451042985304639, + "grad_norm": 0.7778449654579163, + "learning_rate": 8.300136064248717e-06, + "loss": 0.7964, + "step": 9904 + }, + { + "epoch": 0.5451593373328196, + "grad_norm": 0.6894309520721436, + "learning_rate": 8.299810413653284e-06, + "loss": 0.7382, + "step": 9905 + }, + { + "epoch": 0.5452143761351753, + "grad_norm": 0.6942182183265686, + "learning_rate": 8.299484738257361e-06, + "loss": 0.73, + "step": 9906 + }, + { + "epoch": 0.545269414937531, + "grad_norm": 0.6607787609100342, + "learning_rate": 8.299159038063394e-06, + "loss": 0.6987, + "step": 9907 + }, + { + "epoch": 0.5453244537398866, + "grad_norm": 0.7447709441184998, + "learning_rate": 8.29883331307383e-06, + "loss": 0.7787, + "step": 9908 + }, + { + "epoch": 0.5453794925422423, + "grad_norm": 0.6315301656723022, + "learning_rate": 8.298507563291116e-06, + "loss": 0.7047, + "step": 9909 + }, + { + "epoch": 0.545434531344598, + "grad_norm": 0.8095656633377075, + "learning_rate": 8.298181788717705e-06, + "loss": 0.691, + "step": 9910 + }, + { + "epoch": 0.5454895701469537, + "grad_norm": 0.6419453024864197, + "learning_rate": 8.29785598935604e-06, + "loss": 0.7333, + "step": 9911 + }, + { + "epoch": 0.5455446089493092, + "grad_norm": 0.7209222316741943, + "learning_rate": 8.297530165208574e-06, + "loss": 0.8174, + "step": 9912 + }, + { + "epoch": 0.5455996477516649, + "grad_norm": 0.6778598427772522, + "learning_rate": 8.297204316277754e-06, + "loss": 0.7696, + "step": 9913 + }, + { + "epoch": 0.5456546865540206, + "grad_norm": 0.6573307514190674, + "learning_rate": 8.296878442566028e-06, + "loss": 0.7843, + "step": 9914 + }, + { + "epoch": 0.5457097253563763, + "grad_norm": 0.6987473964691162, + "learning_rate": 8.296552544075847e-06, + "loss": 0.809, + "step": 9915 + }, + { + "epoch": 0.5457647641587319, + "grad_norm": 0.7149204015731812, + "learning_rate": 8.29622662080966e-06, + "loss": 0.848, + "step": 9916 + }, + { + "epoch": 0.5458198029610876, + "grad_norm": 0.6252632141113281, + "learning_rate": 8.295900672769913e-06, + "loss": 0.7029, + "step": 9917 + }, + { + "epoch": 0.5458748417634433, + "grad_norm": 0.713376522064209, + "learning_rate": 8.295574699959062e-06, + "loss": 0.726, + "step": 9918 + }, + { + "epoch": 0.5459298805657989, + "grad_norm": 0.6864717602729797, + "learning_rate": 8.295248702379552e-06, + "loss": 0.7428, + "step": 9919 + }, + { + "epoch": 0.5459849193681545, + "grad_norm": 0.8085678219795227, + "learning_rate": 8.294922680033837e-06, + "loss": 0.8697, + "step": 9920 + }, + { + "epoch": 0.5460399581705102, + "grad_norm": 0.7366700768470764, + "learning_rate": 8.294596632924363e-06, + "loss": 0.7714, + "step": 9921 + }, + { + "epoch": 0.5460949969728659, + "grad_norm": 0.670632541179657, + "learning_rate": 8.294270561053583e-06, + "loss": 0.7032, + "step": 9922 + }, + { + "epoch": 0.5461500357752215, + "grad_norm": 0.7867220640182495, + "learning_rate": 8.293944464423946e-06, + "loss": 0.8903, + "step": 9923 + }, + { + "epoch": 0.5462050745775772, + "grad_norm": 0.8441565632820129, + "learning_rate": 8.293618343037907e-06, + "loss": 0.8694, + "step": 9924 + }, + { + "epoch": 0.5462601133799329, + "grad_norm": 0.7048027515411377, + "learning_rate": 8.293292196897913e-06, + "loss": 0.8226, + "step": 9925 + }, + { + "epoch": 0.5463151521822885, + "grad_norm": 0.6344078779220581, + "learning_rate": 8.292966026006416e-06, + "loss": 0.7615, + "step": 9926 + }, + { + "epoch": 0.5463701909846441, + "grad_norm": 0.6744484901428223, + "learning_rate": 8.292639830365867e-06, + "loss": 0.6944, + "step": 9927 + }, + { + "epoch": 0.5464252297869998, + "grad_norm": 0.8113303780555725, + "learning_rate": 8.292313609978721e-06, + "loss": 0.7558, + "step": 9928 + }, + { + "epoch": 0.5464802685893555, + "grad_norm": 0.640190839767456, + "learning_rate": 8.291987364847425e-06, + "loss": 0.7167, + "step": 9929 + }, + { + "epoch": 0.5465353073917112, + "grad_norm": 0.7714816331863403, + "learning_rate": 8.291661094974434e-06, + "loss": 0.8662, + "step": 9930 + }, + { + "epoch": 0.5465903461940668, + "grad_norm": 0.6785402894020081, + "learning_rate": 8.291334800362199e-06, + "loss": 0.6835, + "step": 9931 + }, + { + "epoch": 0.5466453849964225, + "grad_norm": 0.704868495464325, + "learning_rate": 8.291008481013173e-06, + "loss": 0.7343, + "step": 9932 + }, + { + "epoch": 0.5467004237987781, + "grad_norm": 0.7587466239929199, + "learning_rate": 8.290682136929809e-06, + "loss": 0.7856, + "step": 9933 + }, + { + "epoch": 0.5467554626011338, + "grad_norm": 0.7460505962371826, + "learning_rate": 8.290355768114557e-06, + "loss": 0.7463, + "step": 9934 + }, + { + "epoch": 0.5468105014034894, + "grad_norm": 0.7185021042823792, + "learning_rate": 8.290029374569873e-06, + "loss": 0.8106, + "step": 9935 + }, + { + "epoch": 0.5468655402058451, + "grad_norm": 0.7023874521255493, + "learning_rate": 8.289702956298209e-06, + "loss": 0.6863, + "step": 9936 + }, + { + "epoch": 0.5469205790082008, + "grad_norm": 0.8688495755195618, + "learning_rate": 8.289376513302017e-06, + "loss": 0.8898, + "step": 9937 + }, + { + "epoch": 0.5469756178105565, + "grad_norm": 0.6405122876167297, + "learning_rate": 8.289050045583752e-06, + "loss": 0.6804, + "step": 9938 + }, + { + "epoch": 0.5470306566129121, + "grad_norm": 0.8364881277084351, + "learning_rate": 8.288723553145868e-06, + "loss": 0.8356, + "step": 9939 + }, + { + "epoch": 0.5470856954152677, + "grad_norm": 0.6621617078781128, + "learning_rate": 8.288397035990818e-06, + "loss": 0.7508, + "step": 9940 + }, + { + "epoch": 0.5471407342176234, + "grad_norm": 0.6822347640991211, + "learning_rate": 8.288070494121056e-06, + "loss": 0.7722, + "step": 9941 + }, + { + "epoch": 0.5471957730199791, + "grad_norm": 0.6727223992347717, + "learning_rate": 8.287743927539036e-06, + "loss": 0.743, + "step": 9942 + }, + { + "epoch": 0.5472508118223347, + "grad_norm": 0.7852441668510437, + "learning_rate": 8.287417336247214e-06, + "loss": 0.8321, + "step": 9943 + }, + { + "epoch": 0.5473058506246904, + "grad_norm": 0.6982126235961914, + "learning_rate": 8.287090720248041e-06, + "loss": 0.6669, + "step": 9944 + }, + { + "epoch": 0.5473608894270461, + "grad_norm": 0.7820166945457458, + "learning_rate": 8.286764079543976e-06, + "loss": 0.7592, + "step": 9945 + }, + { + "epoch": 0.5474159282294018, + "grad_norm": 0.6868422627449036, + "learning_rate": 8.28643741413747e-06, + "loss": 0.8308, + "step": 9946 + }, + { + "epoch": 0.5474709670317573, + "grad_norm": 0.8227942585945129, + "learning_rate": 8.286110724030982e-06, + "loss": 0.7982, + "step": 9947 + }, + { + "epoch": 0.547526005834113, + "grad_norm": 0.6838171482086182, + "learning_rate": 8.285784009226964e-06, + "loss": 0.7907, + "step": 9948 + }, + { + "epoch": 0.5475810446364687, + "grad_norm": 0.7200812697410583, + "learning_rate": 8.285457269727875e-06, + "loss": 0.88, + "step": 9949 + }, + { + "epoch": 0.5476360834388244, + "grad_norm": 0.7469412684440613, + "learning_rate": 8.285130505536168e-06, + "loss": 0.8167, + "step": 9950 + }, + { + "epoch": 0.54769112224118, + "grad_norm": 0.6660227179527283, + "learning_rate": 8.284803716654298e-06, + "loss": 0.7685, + "step": 9951 + }, + { + "epoch": 0.5477461610435357, + "grad_norm": 0.7116572260856628, + "learning_rate": 8.284476903084723e-06, + "loss": 0.7415, + "step": 9952 + }, + { + "epoch": 0.5478011998458914, + "grad_norm": 0.6540791988372803, + "learning_rate": 8.284150064829899e-06, + "loss": 0.6571, + "step": 9953 + }, + { + "epoch": 0.5478562386482471, + "grad_norm": 0.7527759075164795, + "learning_rate": 8.283823201892283e-06, + "loss": 0.8678, + "step": 9954 + }, + { + "epoch": 0.5479112774506026, + "grad_norm": 0.7795953750610352, + "learning_rate": 8.283496314274331e-06, + "loss": 0.8086, + "step": 9955 + }, + { + "epoch": 0.5479663162529583, + "grad_norm": 0.862503170967102, + "learning_rate": 8.283169401978498e-06, + "loss": 0.7442, + "step": 9956 + }, + { + "epoch": 0.548021355055314, + "grad_norm": 0.6552054286003113, + "learning_rate": 8.282842465007244e-06, + "loss": 0.6664, + "step": 9957 + }, + { + "epoch": 0.5480763938576697, + "grad_norm": 0.7242427468299866, + "learning_rate": 8.282515503363024e-06, + "loss": 0.8199, + "step": 9958 + }, + { + "epoch": 0.5481314326600253, + "grad_norm": 0.7529763579368591, + "learning_rate": 8.282188517048295e-06, + "loss": 0.761, + "step": 9959 + }, + { + "epoch": 0.548186471462381, + "grad_norm": 0.7909425497055054, + "learning_rate": 8.281861506065519e-06, + "loss": 0.7389, + "step": 9960 + }, + { + "epoch": 0.5482415102647367, + "grad_norm": 0.6594850420951843, + "learning_rate": 8.281534470417147e-06, + "loss": 0.7473, + "step": 9961 + }, + { + "epoch": 0.5482965490670924, + "grad_norm": 0.6900844573974609, + "learning_rate": 8.281207410105642e-06, + "loss": 0.7551, + "step": 9962 + }, + { + "epoch": 0.5483515878694479, + "grad_norm": 0.6922640204429626, + "learning_rate": 8.28088032513346e-06, + "loss": 0.7654, + "step": 9963 + }, + { + "epoch": 0.5484066266718036, + "grad_norm": 0.7758432626724243, + "learning_rate": 8.28055321550306e-06, + "loss": 0.8033, + "step": 9964 + }, + { + "epoch": 0.5484616654741593, + "grad_norm": 0.7074280977249146, + "learning_rate": 8.2802260812169e-06, + "loss": 0.7302, + "step": 9965 + }, + { + "epoch": 0.5485167042765149, + "grad_norm": 0.7724928259849548, + "learning_rate": 8.27989892227744e-06, + "loss": 0.7621, + "step": 9966 + }, + { + "epoch": 0.5485717430788706, + "grad_norm": 0.7364168167114258, + "learning_rate": 8.279571738687137e-06, + "loss": 0.7587, + "step": 9967 + }, + { + "epoch": 0.5486267818812263, + "grad_norm": 0.7298350930213928, + "learning_rate": 8.27924453044845e-06, + "loss": 0.7371, + "step": 9968 + }, + { + "epoch": 0.548681820683582, + "grad_norm": 0.8056737780570984, + "learning_rate": 8.27891729756384e-06, + "loss": 0.9871, + "step": 9969 + }, + { + "epoch": 0.5487368594859375, + "grad_norm": 0.7499688267707825, + "learning_rate": 8.278590040035763e-06, + "loss": 0.8574, + "step": 9970 + }, + { + "epoch": 0.5487918982882932, + "grad_norm": 0.7398175001144409, + "learning_rate": 8.278262757866683e-06, + "loss": 0.744, + "step": 9971 + }, + { + "epoch": 0.5488469370906489, + "grad_norm": 0.7099171876907349, + "learning_rate": 8.277935451059058e-06, + "loss": 0.7108, + "step": 9972 + }, + { + "epoch": 0.5489019758930046, + "grad_norm": 0.6720188856124878, + "learning_rate": 8.277608119615345e-06, + "loss": 0.8565, + "step": 9973 + }, + { + "epoch": 0.5489570146953602, + "grad_norm": 0.7870737910270691, + "learning_rate": 8.27728076353801e-06, + "loss": 0.7429, + "step": 9974 + }, + { + "epoch": 0.5490120534977159, + "grad_norm": 0.7358133792877197, + "learning_rate": 8.276953382829507e-06, + "loss": 0.7549, + "step": 9975 + }, + { + "epoch": 0.5490670923000716, + "grad_norm": 0.8968467116355896, + "learning_rate": 8.276625977492303e-06, + "loss": 0.6983, + "step": 9976 + }, + { + "epoch": 0.5491221311024272, + "grad_norm": 0.7346875071525574, + "learning_rate": 8.276298547528852e-06, + "loss": 0.8541, + "step": 9977 + }, + { + "epoch": 0.5491771699047828, + "grad_norm": 0.7297229170799255, + "learning_rate": 8.27597109294162e-06, + "loss": 0.8378, + "step": 9978 + }, + { + "epoch": 0.5492322087071385, + "grad_norm": 0.6907635927200317, + "learning_rate": 8.275643613733064e-06, + "loss": 0.7058, + "step": 9979 + }, + { + "epoch": 0.5492872475094942, + "grad_norm": 0.7612239718437195, + "learning_rate": 8.27531610990565e-06, + "loss": 0.6827, + "step": 9980 + }, + { + "epoch": 0.5493422863118499, + "grad_norm": 1.3160386085510254, + "learning_rate": 8.274988581461837e-06, + "loss": 0.7357, + "step": 9981 + }, + { + "epoch": 0.5493973251142055, + "grad_norm": 0.6370541453361511, + "learning_rate": 8.274661028404083e-06, + "loss": 0.7323, + "step": 9982 + }, + { + "epoch": 0.5494523639165612, + "grad_norm": 0.7051724195480347, + "learning_rate": 8.274333450734856e-06, + "loss": 0.7714, + "step": 9983 + }, + { + "epoch": 0.5495074027189168, + "grad_norm": 0.7452969551086426, + "learning_rate": 8.274005848456614e-06, + "loss": 0.7516, + "step": 9984 + }, + { + "epoch": 0.5495624415212725, + "grad_norm": 0.7132626175880432, + "learning_rate": 8.273678221571823e-06, + "loss": 0.6417, + "step": 9985 + }, + { + "epoch": 0.5496174803236281, + "grad_norm": 0.7873446345329285, + "learning_rate": 8.273350570082941e-06, + "loss": 0.8457, + "step": 9986 + }, + { + "epoch": 0.5496725191259838, + "grad_norm": 0.691470205783844, + "learning_rate": 8.273022893992432e-06, + "loss": 0.7871, + "step": 9987 + }, + { + "epoch": 0.5497275579283395, + "grad_norm": 0.6671431064605713, + "learning_rate": 8.27269519330276e-06, + "loss": 0.6919, + "step": 9988 + }, + { + "epoch": 0.5497825967306952, + "grad_norm": 0.8026914596557617, + "learning_rate": 8.272367468016387e-06, + "loss": 0.6885, + "step": 9989 + }, + { + "epoch": 0.5498376355330508, + "grad_norm": 0.9003152251243591, + "learning_rate": 8.272039718135774e-06, + "loss": 0.7671, + "step": 9990 + }, + { + "epoch": 0.5498926743354065, + "grad_norm": 0.6515254378318787, + "learning_rate": 8.271711943663388e-06, + "loss": 0.7589, + "step": 9991 + }, + { + "epoch": 0.5499477131377621, + "grad_norm": 0.6495782136917114, + "learning_rate": 8.27138414460169e-06, + "loss": 0.7277, + "step": 9992 + }, + { + "epoch": 0.5500027519401178, + "grad_norm": 0.7564565539360046, + "learning_rate": 8.271056320953146e-06, + "loss": 0.6977, + "step": 9993 + }, + { + "epoch": 0.5500577907424734, + "grad_norm": 0.8551548719406128, + "learning_rate": 8.270728472720218e-06, + "loss": 0.684, + "step": 9994 + }, + { + "epoch": 0.5501128295448291, + "grad_norm": 0.6614843010902405, + "learning_rate": 8.270400599905369e-06, + "loss": 0.6559, + "step": 9995 + }, + { + "epoch": 0.5501678683471848, + "grad_norm": 0.6920068264007568, + "learning_rate": 8.270072702511065e-06, + "loss": 0.7497, + "step": 9996 + }, + { + "epoch": 0.5502229071495405, + "grad_norm": 0.7426198124885559, + "learning_rate": 8.26974478053977e-06, + "loss": 0.7434, + "step": 9997 + }, + { + "epoch": 0.550277945951896, + "grad_norm": 1.2630934715270996, + "learning_rate": 8.269416833993949e-06, + "loss": 0.7306, + "step": 9998 + }, + { + "epoch": 0.5503329847542517, + "grad_norm": 0.7069457769393921, + "learning_rate": 8.269088862876066e-06, + "loss": 0.6735, + "step": 9999 + }, + { + "epoch": 0.5503880235566074, + "grad_norm": 0.8945016264915466, + "learning_rate": 8.268760867188586e-06, + "loss": 0.7575, + "step": 10000 + }, + { + "epoch": 0.5504430623589631, + "grad_norm": 0.7708195447921753, + "learning_rate": 8.268432846933974e-06, + "loss": 0.6988, + "step": 10001 + }, + { + "epoch": 0.5504981011613187, + "grad_norm": 0.7884799838066101, + "learning_rate": 8.268104802114696e-06, + "loss": 0.8085, + "step": 10002 + }, + { + "epoch": 0.5505531399636744, + "grad_norm": 0.7801569104194641, + "learning_rate": 8.267776732733217e-06, + "loss": 0.886, + "step": 10003 + }, + { + "epoch": 0.5506081787660301, + "grad_norm": 0.714645504951477, + "learning_rate": 8.267448638792004e-06, + "loss": 0.7151, + "step": 10004 + }, + { + "epoch": 0.5506632175683858, + "grad_norm": 0.653136134147644, + "learning_rate": 8.267120520293519e-06, + "loss": 0.6347, + "step": 10005 + }, + { + "epoch": 0.5507182563707413, + "grad_norm": 0.8821585774421692, + "learning_rate": 8.266792377240233e-06, + "loss": 0.6457, + "step": 10006 + }, + { + "epoch": 0.550773295173097, + "grad_norm": 0.7056930661201477, + "learning_rate": 8.266464209634608e-06, + "loss": 0.8709, + "step": 10007 + }, + { + "epoch": 0.5508283339754527, + "grad_norm": 0.6505821347236633, + "learning_rate": 8.266136017479113e-06, + "loss": 0.7674, + "step": 10008 + }, + { + "epoch": 0.5508833727778083, + "grad_norm": 0.7947389483451843, + "learning_rate": 8.265807800776216e-06, + "loss": 0.7882, + "step": 10009 + }, + { + "epoch": 0.550938411580164, + "grad_norm": 0.7466071844100952, + "learning_rate": 8.265479559528379e-06, + "loss": 0.7673, + "step": 10010 + }, + { + "epoch": 0.5509934503825197, + "grad_norm": 0.706430971622467, + "learning_rate": 8.265151293738074e-06, + "loss": 0.7796, + "step": 10011 + }, + { + "epoch": 0.5510484891848754, + "grad_norm": 0.7701015472412109, + "learning_rate": 8.264823003407765e-06, + "loss": 0.7631, + "step": 10012 + }, + { + "epoch": 0.551103527987231, + "grad_norm": 0.6923625469207764, + "learning_rate": 8.264494688539922e-06, + "loss": 0.7659, + "step": 10013 + }, + { + "epoch": 0.5511585667895866, + "grad_norm": 0.6585322618484497, + "learning_rate": 8.264166349137008e-06, + "loss": 0.7248, + "step": 10014 + }, + { + "epoch": 0.5512136055919423, + "grad_norm": 0.698451578617096, + "learning_rate": 8.263837985201493e-06, + "loss": 0.7768, + "step": 10015 + }, + { + "epoch": 0.551268644394298, + "grad_norm": 0.7585058808326721, + "learning_rate": 8.263509596735847e-06, + "loss": 0.8535, + "step": 10016 + }, + { + "epoch": 0.5513236831966536, + "grad_norm": 0.6973930597305298, + "learning_rate": 8.263181183742536e-06, + "loss": 0.8253, + "step": 10017 + }, + { + "epoch": 0.5513787219990093, + "grad_norm": 0.6752467751502991, + "learning_rate": 8.26285274622403e-06, + "loss": 0.7402, + "step": 10018 + }, + { + "epoch": 0.551433760801365, + "grad_norm": 0.717555820941925, + "learning_rate": 8.262524284182794e-06, + "loss": 0.8057, + "step": 10019 + }, + { + "epoch": 0.5514887996037207, + "grad_norm": 0.6975438594818115, + "learning_rate": 8.2621957976213e-06, + "loss": 0.803, + "step": 10020 + }, + { + "epoch": 0.5515438384060762, + "grad_norm": 0.667797327041626, + "learning_rate": 8.261867286542016e-06, + "loss": 0.7387, + "step": 10021 + }, + { + "epoch": 0.5515988772084319, + "grad_norm": 0.7330532670021057, + "learning_rate": 8.261538750947411e-06, + "loss": 0.8143, + "step": 10022 + }, + { + "epoch": 0.5516539160107876, + "grad_norm": 0.7034017443656921, + "learning_rate": 8.261210190839952e-06, + "loss": 0.739, + "step": 10023 + }, + { + "epoch": 0.5517089548131433, + "grad_norm": 0.709284245967865, + "learning_rate": 8.260881606222113e-06, + "loss": 0.8021, + "step": 10024 + }, + { + "epoch": 0.5517639936154989, + "grad_norm": 0.7587909698486328, + "learning_rate": 8.260552997096359e-06, + "loss": 0.8346, + "step": 10025 + }, + { + "epoch": 0.5518190324178546, + "grad_norm": 0.7413986325263977, + "learning_rate": 8.26022436346516e-06, + "loss": 0.6777, + "step": 10026 + }, + { + "epoch": 0.5518740712202103, + "grad_norm": 0.7112768292427063, + "learning_rate": 8.25989570533099e-06, + "loss": 0.7017, + "step": 10027 + }, + { + "epoch": 0.551929110022566, + "grad_norm": 0.7097088098526001, + "learning_rate": 8.259567022696315e-06, + "loss": 0.7315, + "step": 10028 + }, + { + "epoch": 0.5519841488249215, + "grad_norm": 0.6544226408004761, + "learning_rate": 8.259238315563606e-06, + "loss": 0.7729, + "step": 10029 + }, + { + "epoch": 0.5520391876272772, + "grad_norm": 0.6892885565757751, + "learning_rate": 8.258909583935335e-06, + "loss": 0.7919, + "step": 10030 + }, + { + "epoch": 0.5520942264296329, + "grad_norm": 0.697424054145813, + "learning_rate": 8.258580827813972e-06, + "loss": 0.7514, + "step": 10031 + }, + { + "epoch": 0.5521492652319886, + "grad_norm": 0.7021437883377075, + "learning_rate": 8.258252047201989e-06, + "loss": 0.747, + "step": 10032 + }, + { + "epoch": 0.5522043040343442, + "grad_norm": 0.6974816918373108, + "learning_rate": 8.257923242101854e-06, + "loss": 0.7245, + "step": 10033 + }, + { + "epoch": 0.5522593428366999, + "grad_norm": 0.6645311117172241, + "learning_rate": 8.25759441251604e-06, + "loss": 0.649, + "step": 10034 + }, + { + "epoch": 0.5523143816390556, + "grad_norm": 0.7223736643791199, + "learning_rate": 8.25726555844702e-06, + "loss": 0.7792, + "step": 10035 + }, + { + "epoch": 0.5523694204414112, + "grad_norm": 0.7253531813621521, + "learning_rate": 8.256936679897262e-06, + "loss": 0.7636, + "step": 10036 + }, + { + "epoch": 0.5524244592437668, + "grad_norm": 0.6979514956474304, + "learning_rate": 8.256607776869241e-06, + "loss": 0.7929, + "step": 10037 + }, + { + "epoch": 0.5524794980461225, + "grad_norm": 0.7442019581794739, + "learning_rate": 8.25627884936543e-06, + "loss": 0.6984, + "step": 10038 + }, + { + "epoch": 0.5525345368484782, + "grad_norm": 0.7519513964653015, + "learning_rate": 8.255949897388294e-06, + "loss": 0.7228, + "step": 10039 + }, + { + "epoch": 0.5525895756508339, + "grad_norm": 0.7302790880203247, + "learning_rate": 8.255620920940313e-06, + "loss": 0.7555, + "step": 10040 + }, + { + "epoch": 0.5526446144531895, + "grad_norm": 0.6521434187889099, + "learning_rate": 8.255291920023956e-06, + "loss": 0.7825, + "step": 10041 + }, + { + "epoch": 0.5526996532555452, + "grad_norm": 0.8270126581192017, + "learning_rate": 8.254962894641695e-06, + "loss": 0.7939, + "step": 10042 + }, + { + "epoch": 0.5527546920579008, + "grad_norm": 0.7209310531616211, + "learning_rate": 8.254633844796007e-06, + "loss": 0.8286, + "step": 10043 + }, + { + "epoch": 0.5528097308602565, + "grad_norm": 0.6506814360618591, + "learning_rate": 8.25430477048936e-06, + "loss": 0.7209, + "step": 10044 + }, + { + "epoch": 0.5528647696626121, + "grad_norm": 0.6914637684822083, + "learning_rate": 8.25397567172423e-06, + "loss": 0.705, + "step": 10045 + }, + { + "epoch": 0.5529198084649678, + "grad_norm": 0.8369725942611694, + "learning_rate": 8.253646548503091e-06, + "loss": 0.8254, + "step": 10046 + }, + { + "epoch": 0.5529748472673235, + "grad_norm": 0.7809324860572815, + "learning_rate": 8.253317400828414e-06, + "loss": 0.8117, + "step": 10047 + }, + { + "epoch": 0.5530298860696792, + "grad_norm": 0.7184550762176514, + "learning_rate": 8.252988228702676e-06, + "loss": 0.738, + "step": 10048 + }, + { + "epoch": 0.5530849248720348, + "grad_norm": 0.7111478447914124, + "learning_rate": 8.252659032128347e-06, + "loss": 0.7143, + "step": 10049 + }, + { + "epoch": 0.5531399636743904, + "grad_norm": 0.7506794333457947, + "learning_rate": 8.252329811107905e-06, + "loss": 0.7721, + "step": 10050 + }, + { + "epoch": 0.5531950024767461, + "grad_norm": 0.7700625658035278, + "learning_rate": 8.252000565643823e-06, + "loss": 0.7993, + "step": 10051 + }, + { + "epoch": 0.5532500412791017, + "grad_norm": 0.6985816955566406, + "learning_rate": 8.251671295738575e-06, + "loss": 0.7461, + "step": 10052 + }, + { + "epoch": 0.5533050800814574, + "grad_norm": 0.6932175755500793, + "learning_rate": 8.251342001394635e-06, + "loss": 0.6804, + "step": 10053 + }, + { + "epoch": 0.5533601188838131, + "grad_norm": 0.8060765266418457, + "learning_rate": 8.25101268261448e-06, + "loss": 0.7137, + "step": 10054 + }, + { + "epoch": 0.5534151576861688, + "grad_norm": 0.6853482127189636, + "learning_rate": 8.250683339400582e-06, + "loss": 0.7229, + "step": 10055 + }, + { + "epoch": 0.5534701964885244, + "grad_norm": 0.7581862211227417, + "learning_rate": 8.25035397175542e-06, + "loss": 0.8091, + "step": 10056 + }, + { + "epoch": 0.55352523529088, + "grad_norm": 0.7375245094299316, + "learning_rate": 8.250024579681466e-06, + "loss": 0.7234, + "step": 10057 + }, + { + "epoch": 0.5535802740932357, + "grad_norm": 0.7904585599899292, + "learning_rate": 8.249695163181198e-06, + "loss": 0.7295, + "step": 10058 + }, + { + "epoch": 0.5536353128955914, + "grad_norm": 0.6593602895736694, + "learning_rate": 8.249365722257092e-06, + "loss": 0.7492, + "step": 10059 + }, + { + "epoch": 0.553690351697947, + "grad_norm": 0.7226922512054443, + "learning_rate": 8.249036256911622e-06, + "loss": 0.8177, + "step": 10060 + }, + { + "epoch": 0.5537453905003027, + "grad_norm": 0.7268722653388977, + "learning_rate": 8.248706767147265e-06, + "loss": 0.8059, + "step": 10061 + }, + { + "epoch": 0.5538004293026584, + "grad_norm": 0.7797269225120544, + "learning_rate": 8.248377252966499e-06, + "loss": 0.8122, + "step": 10062 + }, + { + "epoch": 0.5538554681050141, + "grad_norm": 0.7199145555496216, + "learning_rate": 8.248047714371797e-06, + "loss": 0.7312, + "step": 10063 + }, + { + "epoch": 0.5539105069073696, + "grad_norm": 0.6950703263282776, + "learning_rate": 8.24771815136564e-06, + "loss": 0.757, + "step": 10064 + }, + { + "epoch": 0.5539655457097253, + "grad_norm": 0.6413441896438599, + "learning_rate": 8.247388563950502e-06, + "loss": 0.6955, + "step": 10065 + }, + { + "epoch": 0.554020584512081, + "grad_norm": 0.7650758624076843, + "learning_rate": 8.24705895212886e-06, + "loss": 0.8355, + "step": 10066 + }, + { + "epoch": 0.5540756233144367, + "grad_norm": 0.7067090272903442, + "learning_rate": 8.246729315903192e-06, + "loss": 0.7409, + "step": 10067 + }, + { + "epoch": 0.5541306621167923, + "grad_norm": 0.7763532996177673, + "learning_rate": 8.246399655275976e-06, + "loss": 0.8097, + "step": 10068 + }, + { + "epoch": 0.554185700919148, + "grad_norm": 0.6865057945251465, + "learning_rate": 8.246069970249689e-06, + "loss": 0.7597, + "step": 10069 + }, + { + "epoch": 0.5542407397215037, + "grad_norm": 0.7643107771873474, + "learning_rate": 8.24574026082681e-06, + "loss": 0.7403, + "step": 10070 + }, + { + "epoch": 0.5542957785238594, + "grad_norm": 0.7354087829589844, + "learning_rate": 8.245410527009815e-06, + "loss": 0.8896, + "step": 10071 + }, + { + "epoch": 0.5543508173262149, + "grad_norm": 0.7971135973930359, + "learning_rate": 8.245080768801183e-06, + "loss": 0.7738, + "step": 10072 + }, + { + "epoch": 0.5544058561285706, + "grad_norm": 1.0506731271743774, + "learning_rate": 8.244750986203394e-06, + "loss": 0.7888, + "step": 10073 + }, + { + "epoch": 0.5544608949309263, + "grad_norm": 0.8305885195732117, + "learning_rate": 8.244421179218925e-06, + "loss": 0.8186, + "step": 10074 + }, + { + "epoch": 0.554515933733282, + "grad_norm": 0.9507874250411987, + "learning_rate": 8.244091347850253e-06, + "loss": 0.7975, + "step": 10075 + }, + { + "epoch": 0.5545709725356376, + "grad_norm": 0.7146797776222229, + "learning_rate": 8.243761492099861e-06, + "loss": 0.6895, + "step": 10076 + }, + { + "epoch": 0.5546260113379933, + "grad_norm": 0.734990656375885, + "learning_rate": 8.243431611970225e-06, + "loss": 0.8087, + "step": 10077 + }, + { + "epoch": 0.554681050140349, + "grad_norm": 0.6807795166969299, + "learning_rate": 8.243101707463825e-06, + "loss": 0.7861, + "step": 10078 + }, + { + "epoch": 0.5547360889427047, + "grad_norm": 0.7412874698638916, + "learning_rate": 8.242771778583142e-06, + "loss": 0.7864, + "step": 10079 + }, + { + "epoch": 0.5547911277450602, + "grad_norm": 0.6655074954032898, + "learning_rate": 8.242441825330652e-06, + "loss": 0.6554, + "step": 10080 + }, + { + "epoch": 0.5548461665474159, + "grad_norm": 0.7549700140953064, + "learning_rate": 8.242111847708838e-06, + "loss": 0.8031, + "step": 10081 + }, + { + "epoch": 0.5549012053497716, + "grad_norm": 0.8907766342163086, + "learning_rate": 8.241781845720181e-06, + "loss": 0.8068, + "step": 10082 + }, + { + "epoch": 0.5549562441521273, + "grad_norm": 0.7347774505615234, + "learning_rate": 8.241451819367157e-06, + "loss": 0.7453, + "step": 10083 + }, + { + "epoch": 0.5550112829544829, + "grad_norm": 0.6856632828712463, + "learning_rate": 8.24112176865225e-06, + "loss": 0.6235, + "step": 10084 + }, + { + "epoch": 0.5550663217568386, + "grad_norm": 0.7134507298469543, + "learning_rate": 8.24079169357794e-06, + "loss": 0.7991, + "step": 10085 + }, + { + "epoch": 0.5551213605591943, + "grad_norm": 0.7814854383468628, + "learning_rate": 8.240461594146704e-06, + "loss": 0.7681, + "step": 10086 + }, + { + "epoch": 0.5551763993615499, + "grad_norm": 0.6893261671066284, + "learning_rate": 8.240131470361028e-06, + "loss": 0.7746, + "step": 10087 + }, + { + "epoch": 0.5552314381639055, + "grad_norm": 0.925003170967102, + "learning_rate": 8.239801322223393e-06, + "loss": 0.7621, + "step": 10088 + }, + { + "epoch": 0.5552864769662612, + "grad_norm": 0.6261017918586731, + "learning_rate": 8.239471149736277e-06, + "loss": 0.7673, + "step": 10089 + }, + { + "epoch": 0.5553415157686169, + "grad_norm": 0.7268226146697998, + "learning_rate": 8.239140952902162e-06, + "loss": 0.7375, + "step": 10090 + }, + { + "epoch": 0.5553965545709726, + "grad_norm": 0.8062194585800171, + "learning_rate": 8.238810731723532e-06, + "loss": 0.8002, + "step": 10091 + }, + { + "epoch": 0.5554515933733282, + "grad_norm": 0.892842173576355, + "learning_rate": 8.238480486202867e-06, + "loss": 0.7959, + "step": 10092 + }, + { + "epoch": 0.5555066321756839, + "grad_norm": 0.7530377507209778, + "learning_rate": 8.23815021634265e-06, + "loss": 0.8137, + "step": 10093 + }, + { + "epoch": 0.5555616709780395, + "grad_norm": 0.6994850635528564, + "learning_rate": 8.237819922145364e-06, + "loss": 0.7966, + "step": 10094 + }, + { + "epoch": 0.5556167097803951, + "grad_norm": 0.8502941727638245, + "learning_rate": 8.237489603613488e-06, + "loss": 0.7668, + "step": 10095 + }, + { + "epoch": 0.5556717485827508, + "grad_norm": 0.6583576798439026, + "learning_rate": 8.237159260749507e-06, + "loss": 0.7379, + "step": 10096 + }, + { + "epoch": 0.5557267873851065, + "grad_norm": 0.9539539217948914, + "learning_rate": 8.236828893555904e-06, + "loss": 0.7563, + "step": 10097 + }, + { + "epoch": 0.5557818261874622, + "grad_norm": 0.7446413040161133, + "learning_rate": 8.236498502035162e-06, + "loss": 0.7329, + "step": 10098 + }, + { + "epoch": 0.5558368649898178, + "grad_norm": 0.8950835466384888, + "learning_rate": 8.236168086189761e-06, + "loss": 0.8144, + "step": 10099 + }, + { + "epoch": 0.5558919037921735, + "grad_norm": 0.7255009412765503, + "learning_rate": 8.235837646022191e-06, + "loss": 0.6946, + "step": 10100 + }, + { + "epoch": 0.5559469425945291, + "grad_norm": 0.6983402967453003, + "learning_rate": 8.235507181534929e-06, + "loss": 0.7371, + "step": 10101 + }, + { + "epoch": 0.5560019813968848, + "grad_norm": 1.043593168258667, + "learning_rate": 8.235176692730463e-06, + "loss": 0.6763, + "step": 10102 + }, + { + "epoch": 0.5560570201992404, + "grad_norm": 0.7452800869941711, + "learning_rate": 8.234846179611272e-06, + "loss": 0.8945, + "step": 10103 + }, + { + "epoch": 0.5561120590015961, + "grad_norm": 0.6367164254188538, + "learning_rate": 8.234515642179845e-06, + "loss": 0.6542, + "step": 10104 + }, + { + "epoch": 0.5561670978039518, + "grad_norm": 0.8377598524093628, + "learning_rate": 8.234185080438664e-06, + "loss": 0.787, + "step": 10105 + }, + { + "epoch": 0.5562221366063075, + "grad_norm": 0.7353680729866028, + "learning_rate": 8.233854494390214e-06, + "loss": 0.6391, + "step": 10106 + }, + { + "epoch": 0.5562771754086631, + "grad_norm": 0.7431599497795105, + "learning_rate": 8.233523884036977e-06, + "loss": 0.8221, + "step": 10107 + }, + { + "epoch": 0.5563322142110187, + "grad_norm": 0.7292743921279907, + "learning_rate": 8.233193249381442e-06, + "loss": 0.7791, + "step": 10108 + }, + { + "epoch": 0.5563872530133744, + "grad_norm": 0.7251895666122437, + "learning_rate": 8.232862590426091e-06, + "loss": 0.7993, + "step": 10109 + }, + { + "epoch": 0.5564422918157301, + "grad_norm": 0.7373167276382446, + "learning_rate": 8.23253190717341e-06, + "loss": 0.861, + "step": 10110 + }, + { + "epoch": 0.5564973306180857, + "grad_norm": 0.6689401268959045, + "learning_rate": 8.232201199625887e-06, + "loss": 0.7002, + "step": 10111 + }, + { + "epoch": 0.5565523694204414, + "grad_norm": 0.7405139207839966, + "learning_rate": 8.231870467786003e-06, + "loss": 0.8041, + "step": 10112 + }, + { + "epoch": 0.5566074082227971, + "grad_norm": 0.7561736702919006, + "learning_rate": 8.231539711656246e-06, + "loss": 0.7687, + "step": 10113 + }, + { + "epoch": 0.5566624470251528, + "grad_norm": 0.6857489943504333, + "learning_rate": 8.231208931239103e-06, + "loss": 0.7175, + "step": 10114 + }, + { + "epoch": 0.5567174858275084, + "grad_norm": 0.7410408854484558, + "learning_rate": 8.230878126537057e-06, + "loss": 0.7337, + "step": 10115 + }, + { + "epoch": 0.556772524629864, + "grad_norm": 0.7533249258995056, + "learning_rate": 8.230547297552595e-06, + "loss": 0.7226, + "step": 10116 + }, + { + "epoch": 0.5568275634322197, + "grad_norm": 0.6227561235427856, + "learning_rate": 8.230216444288207e-06, + "loss": 0.711, + "step": 10117 + }, + { + "epoch": 0.5568826022345754, + "grad_norm": 0.6790871024131775, + "learning_rate": 8.229885566746373e-06, + "loss": 0.728, + "step": 10118 + }, + { + "epoch": 0.556937641036931, + "grad_norm": 1.0007857084274292, + "learning_rate": 8.229554664929587e-06, + "loss": 0.9193, + "step": 10119 + }, + { + "epoch": 0.5569926798392867, + "grad_norm": 0.7167220711708069, + "learning_rate": 8.229223738840331e-06, + "loss": 0.8288, + "step": 10120 + }, + { + "epoch": 0.5570477186416424, + "grad_norm": 0.8037107586860657, + "learning_rate": 8.228892788481095e-06, + "loss": 0.8462, + "step": 10121 + }, + { + "epoch": 0.5571027574439981, + "grad_norm": 0.7355597615242004, + "learning_rate": 8.228561813854363e-06, + "loss": 0.7998, + "step": 10122 + }, + { + "epoch": 0.5571577962463536, + "grad_norm": 0.7384124994277954, + "learning_rate": 8.228230814962625e-06, + "loss": 0.7861, + "step": 10123 + }, + { + "epoch": 0.5572128350487093, + "grad_norm": 0.8170364499092102, + "learning_rate": 8.227899791808371e-06, + "loss": 0.8005, + "step": 10124 + }, + { + "epoch": 0.557267873851065, + "grad_norm": 0.678702175617218, + "learning_rate": 8.227568744394084e-06, + "loss": 0.7408, + "step": 10125 + }, + { + "epoch": 0.5573229126534207, + "grad_norm": 0.7212443947792053, + "learning_rate": 8.227237672722255e-06, + "loss": 0.7127, + "step": 10126 + }, + { + "epoch": 0.5573779514557763, + "grad_norm": 0.7035290002822876, + "learning_rate": 8.22690657679537e-06, + "loss": 0.8263, + "step": 10127 + }, + { + "epoch": 0.557432990258132, + "grad_norm": 0.6535285115242004, + "learning_rate": 8.226575456615921e-06, + "loss": 0.6979, + "step": 10128 + }, + { + "epoch": 0.5574880290604877, + "grad_norm": 0.7353794574737549, + "learning_rate": 8.226244312186396e-06, + "loss": 0.6838, + "step": 10129 + }, + { + "epoch": 0.5575430678628434, + "grad_norm": 0.5839618444442749, + "learning_rate": 8.225913143509278e-06, + "loss": 0.5925, + "step": 10130 + }, + { + "epoch": 0.5575981066651989, + "grad_norm": 0.6922228336334229, + "learning_rate": 8.225581950587063e-06, + "loss": 0.6808, + "step": 10131 + }, + { + "epoch": 0.5576531454675546, + "grad_norm": 0.753989040851593, + "learning_rate": 8.225250733422236e-06, + "loss": 0.6567, + "step": 10132 + }, + { + "epoch": 0.5577081842699103, + "grad_norm": 0.7327600717544556, + "learning_rate": 8.22491949201729e-06, + "loss": 0.8311, + "step": 10133 + }, + { + "epoch": 0.557763223072266, + "grad_norm": 0.6435133218765259, + "learning_rate": 8.224588226374712e-06, + "loss": 0.6684, + "step": 10134 + }, + { + "epoch": 0.5578182618746216, + "grad_norm": 0.6402057409286499, + "learning_rate": 8.22425693649699e-06, + "loss": 0.7569, + "step": 10135 + }, + { + "epoch": 0.5578733006769773, + "grad_norm": 0.7454472780227661, + "learning_rate": 8.223925622386617e-06, + "loss": 0.7908, + "step": 10136 + }, + { + "epoch": 0.557928339479333, + "grad_norm": 0.7373154759407043, + "learning_rate": 8.223594284046084e-06, + "loss": 0.8232, + "step": 10137 + }, + { + "epoch": 0.5579833782816885, + "grad_norm": 0.6478374004364014, + "learning_rate": 8.223262921477878e-06, + "loss": 0.7353, + "step": 10138 + }, + { + "epoch": 0.5580384170840442, + "grad_norm": 0.715212881565094, + "learning_rate": 8.222931534684488e-06, + "loss": 0.729, + "step": 10139 + }, + { + "epoch": 0.5580934558863999, + "grad_norm": 0.9226915240287781, + "learning_rate": 8.22260012366841e-06, + "loss": 0.7846, + "step": 10140 + }, + { + "epoch": 0.5581484946887556, + "grad_norm": 0.6481993198394775, + "learning_rate": 8.222268688432132e-06, + "loss": 0.6955, + "step": 10141 + }, + { + "epoch": 0.5582035334911112, + "grad_norm": 0.7240349054336548, + "learning_rate": 8.221937228978145e-06, + "loss": 0.7956, + "step": 10142 + }, + { + "epoch": 0.5582585722934669, + "grad_norm": 0.7089122533798218, + "learning_rate": 8.221605745308939e-06, + "loss": 0.7481, + "step": 10143 + }, + { + "epoch": 0.5583136110958226, + "grad_norm": 0.7292537093162537, + "learning_rate": 8.221274237427009e-06, + "loss": 0.7797, + "step": 10144 + }, + { + "epoch": 0.5583686498981782, + "grad_norm": 0.7104652523994446, + "learning_rate": 8.220942705334841e-06, + "loss": 0.7966, + "step": 10145 + }, + { + "epoch": 0.5584236887005338, + "grad_norm": 0.7656546831130981, + "learning_rate": 8.220611149034931e-06, + "loss": 0.7541, + "step": 10146 + }, + { + "epoch": 0.5584787275028895, + "grad_norm": 0.7618892788887024, + "learning_rate": 8.22027956852977e-06, + "loss": 0.6994, + "step": 10147 + }, + { + "epoch": 0.5585337663052452, + "grad_norm": 0.6445756554603577, + "learning_rate": 8.219947963821851e-06, + "loss": 0.7303, + "step": 10148 + }, + { + "epoch": 0.5585888051076009, + "grad_norm": 0.6529820561408997, + "learning_rate": 8.219616334913663e-06, + "loss": 0.7008, + "step": 10149 + }, + { + "epoch": 0.5586438439099565, + "grad_norm": 0.6890642046928406, + "learning_rate": 8.219284681807703e-06, + "loss": 0.8124, + "step": 10150 + }, + { + "epoch": 0.5586988827123122, + "grad_norm": 0.7273370027542114, + "learning_rate": 8.218953004506458e-06, + "loss": 0.7507, + "step": 10151 + }, + { + "epoch": 0.5587539215146679, + "grad_norm": 0.7239277362823486, + "learning_rate": 8.218621303012425e-06, + "loss": 0.7929, + "step": 10152 + }, + { + "epoch": 0.5588089603170235, + "grad_norm": 0.660275399684906, + "learning_rate": 8.218289577328096e-06, + "loss": 0.7418, + "step": 10153 + }, + { + "epoch": 0.5588639991193791, + "grad_norm": 0.7406648993492126, + "learning_rate": 8.217957827455965e-06, + "loss": 0.8072, + "step": 10154 + }, + { + "epoch": 0.5589190379217348, + "grad_norm": 0.7051703333854675, + "learning_rate": 8.217626053398522e-06, + "loss": 0.6562, + "step": 10155 + }, + { + "epoch": 0.5589740767240905, + "grad_norm": 0.93423992395401, + "learning_rate": 8.217294255158266e-06, + "loss": 0.738, + "step": 10156 + }, + { + "epoch": 0.5590291155264462, + "grad_norm": 0.8362720608711243, + "learning_rate": 8.216962432737685e-06, + "loss": 0.8585, + "step": 10157 + }, + { + "epoch": 0.5590841543288018, + "grad_norm": 0.9195587038993835, + "learning_rate": 8.216630586139277e-06, + "loss": 0.8778, + "step": 10158 + }, + { + "epoch": 0.5591391931311575, + "grad_norm": 0.7181550860404968, + "learning_rate": 8.216298715365534e-06, + "loss": 0.702, + "step": 10159 + }, + { + "epoch": 0.5591942319335131, + "grad_norm": 0.6900259852409363, + "learning_rate": 8.21596682041895e-06, + "loss": 0.7652, + "step": 10160 + }, + { + "epoch": 0.5592492707358688, + "grad_norm": 0.7523833513259888, + "learning_rate": 8.215634901302022e-06, + "loss": 0.7881, + "step": 10161 + }, + { + "epoch": 0.5593043095382244, + "grad_norm": 0.6659645438194275, + "learning_rate": 8.215302958017241e-06, + "loss": 0.694, + "step": 10162 + }, + { + "epoch": 0.5593593483405801, + "grad_norm": 0.8898606300354004, + "learning_rate": 8.214970990567105e-06, + "loss": 0.8534, + "step": 10163 + }, + { + "epoch": 0.5594143871429358, + "grad_norm": 0.6759241819381714, + "learning_rate": 8.214638998954108e-06, + "loss": 0.8241, + "step": 10164 + }, + { + "epoch": 0.5594694259452915, + "grad_norm": 0.7136911749839783, + "learning_rate": 8.214306983180744e-06, + "loss": 0.7846, + "step": 10165 + }, + { + "epoch": 0.559524464747647, + "grad_norm": 0.6781616806983948, + "learning_rate": 8.213974943249509e-06, + "loss": 0.7116, + "step": 10166 + }, + { + "epoch": 0.5595795035500027, + "grad_norm": 0.7134156227111816, + "learning_rate": 8.213642879162898e-06, + "loss": 0.7537, + "step": 10167 + }, + { + "epoch": 0.5596345423523584, + "grad_norm": 1.306710124015808, + "learning_rate": 8.213310790923408e-06, + "loss": 0.8506, + "step": 10168 + }, + { + "epoch": 0.5596895811547141, + "grad_norm": 0.725304901599884, + "learning_rate": 8.212978678533534e-06, + "loss": 0.8115, + "step": 10169 + }, + { + "epoch": 0.5597446199570697, + "grad_norm": 0.7833520174026489, + "learning_rate": 8.212646541995772e-06, + "loss": 0.919, + "step": 10170 + }, + { + "epoch": 0.5597996587594254, + "grad_norm": 0.6938104033470154, + "learning_rate": 8.212314381312621e-06, + "loss": 0.7303, + "step": 10171 + }, + { + "epoch": 0.5598546975617811, + "grad_norm": 0.6860232949256897, + "learning_rate": 8.211982196486573e-06, + "loss": 0.7709, + "step": 10172 + }, + { + "epoch": 0.5599097363641368, + "grad_norm": 0.6611567139625549, + "learning_rate": 8.211649987520126e-06, + "loss": 0.7711, + "step": 10173 + }, + { + "epoch": 0.5599647751664923, + "grad_norm": 0.8603463172912598, + "learning_rate": 8.211317754415778e-06, + "loss": 0.8527, + "step": 10174 + }, + { + "epoch": 0.560019813968848, + "grad_norm": 0.7350558638572693, + "learning_rate": 8.210985497176025e-06, + "loss": 0.8148, + "step": 10175 + }, + { + "epoch": 0.5600748527712037, + "grad_norm": 0.6881470084190369, + "learning_rate": 8.210653215803365e-06, + "loss": 0.7526, + "step": 10176 + }, + { + "epoch": 0.5601298915735594, + "grad_norm": 0.6879626512527466, + "learning_rate": 8.210320910300296e-06, + "loss": 0.7649, + "step": 10177 + }, + { + "epoch": 0.560184930375915, + "grad_norm": 0.6843587160110474, + "learning_rate": 8.209988580669312e-06, + "loss": 0.8131, + "step": 10178 + }, + { + "epoch": 0.5602399691782707, + "grad_norm": 0.6684302687644958, + "learning_rate": 8.209656226912915e-06, + "loss": 0.7256, + "step": 10179 + }, + { + "epoch": 0.5602950079806264, + "grad_norm": 0.7973861694335938, + "learning_rate": 8.209323849033601e-06, + "loss": 0.7924, + "step": 10180 + }, + { + "epoch": 0.560350046782982, + "grad_norm": 0.6850616931915283, + "learning_rate": 8.208991447033867e-06, + "loss": 0.7423, + "step": 10181 + }, + { + "epoch": 0.5604050855853376, + "grad_norm": 0.8284440636634827, + "learning_rate": 8.208659020916213e-06, + "loss": 0.7637, + "step": 10182 + }, + { + "epoch": 0.5604601243876933, + "grad_norm": 0.7671821713447571, + "learning_rate": 8.208326570683136e-06, + "loss": 0.7688, + "step": 10183 + }, + { + "epoch": 0.560515163190049, + "grad_norm": 0.8359144330024719, + "learning_rate": 8.207994096337135e-06, + "loss": 0.8179, + "step": 10184 + }, + { + "epoch": 0.5605702019924046, + "grad_norm": 0.6389699578285217, + "learning_rate": 8.207661597880709e-06, + "loss": 0.6987, + "step": 10185 + }, + { + "epoch": 0.5606252407947603, + "grad_norm": 0.6472755074501038, + "learning_rate": 8.20732907531636e-06, + "loss": 0.6984, + "step": 10186 + }, + { + "epoch": 0.560680279597116, + "grad_norm": 0.8231903314590454, + "learning_rate": 8.20699652864658e-06, + "loss": 0.8212, + "step": 10187 + }, + { + "epoch": 0.5607353183994717, + "grad_norm": 0.7550386190414429, + "learning_rate": 8.206663957873876e-06, + "loss": 0.7446, + "step": 10188 + }, + { + "epoch": 0.5607903572018272, + "grad_norm": 0.6704659461975098, + "learning_rate": 8.206331363000743e-06, + "loss": 0.7035, + "step": 10189 + }, + { + "epoch": 0.5608453960041829, + "grad_norm": 0.7258654236793518, + "learning_rate": 8.20599874402968e-06, + "loss": 0.7032, + "step": 10190 + }, + { + "epoch": 0.5609004348065386, + "grad_norm": 0.674609363079071, + "learning_rate": 8.20566610096319e-06, + "loss": 0.7545, + "step": 10191 + }, + { + "epoch": 0.5609554736088943, + "grad_norm": 0.6978347301483154, + "learning_rate": 8.205333433803773e-06, + "loss": 0.8198, + "step": 10192 + }, + { + "epoch": 0.5610105124112499, + "grad_norm": 0.6252121329307556, + "learning_rate": 8.205000742553925e-06, + "loss": 0.6639, + "step": 10193 + }, + { + "epoch": 0.5610655512136056, + "grad_norm": 0.7288224101066589, + "learning_rate": 8.204668027216152e-06, + "loss": 0.8035, + "step": 10194 + }, + { + "epoch": 0.5611205900159613, + "grad_norm": 0.6591556072235107, + "learning_rate": 8.20433528779295e-06, + "loss": 0.7552, + "step": 10195 + }, + { + "epoch": 0.561175628818317, + "grad_norm": 0.769827127456665, + "learning_rate": 8.204002524286823e-06, + "loss": 0.7279, + "step": 10196 + }, + { + "epoch": 0.5612306676206725, + "grad_norm": 0.74398273229599, + "learning_rate": 8.203669736700271e-06, + "loss": 0.7638, + "step": 10197 + }, + { + "epoch": 0.5612857064230282, + "grad_norm": 0.9343454241752625, + "learning_rate": 8.203336925035795e-06, + "loss": 0.7513, + "step": 10198 + }, + { + "epoch": 0.5613407452253839, + "grad_norm": 0.6667190194129944, + "learning_rate": 8.203004089295894e-06, + "loss": 0.77, + "step": 10199 + }, + { + "epoch": 0.5613957840277396, + "grad_norm": 0.7684557437896729, + "learning_rate": 8.202671229483073e-06, + "loss": 0.803, + "step": 10200 + }, + { + "epoch": 0.5614508228300952, + "grad_norm": 0.6551374793052673, + "learning_rate": 8.202338345599832e-06, + "loss": 0.6914, + "step": 10201 + }, + { + "epoch": 0.5615058616324509, + "grad_norm": 0.717464029788971, + "learning_rate": 8.202005437648674e-06, + "loss": 0.6797, + "step": 10202 + }, + { + "epoch": 0.5615609004348066, + "grad_norm": 0.7053301334381104, + "learning_rate": 8.2016725056321e-06, + "loss": 0.7857, + "step": 10203 + }, + { + "epoch": 0.5616159392371622, + "grad_norm": 0.8392077684402466, + "learning_rate": 8.20133954955261e-06, + "loss": 0.8321, + "step": 10204 + }, + { + "epoch": 0.5616709780395178, + "grad_norm": 0.6630520820617676, + "learning_rate": 8.201006569412711e-06, + "loss": 0.7093, + "step": 10205 + }, + { + "epoch": 0.5617260168418735, + "grad_norm": 0.6835867762565613, + "learning_rate": 8.200673565214905e-06, + "loss": 0.6623, + "step": 10206 + }, + { + "epoch": 0.5617810556442292, + "grad_norm": 0.7635336518287659, + "learning_rate": 8.200340536961691e-06, + "loss": 0.8378, + "step": 10207 + }, + { + "epoch": 0.5618360944465849, + "grad_norm": 0.6500052213668823, + "learning_rate": 8.200007484655575e-06, + "loss": 0.6836, + "step": 10208 + }, + { + "epoch": 0.5618911332489405, + "grad_norm": 0.6549860835075378, + "learning_rate": 8.199674408299058e-06, + "loss": 0.6868, + "step": 10209 + }, + { + "epoch": 0.5619461720512962, + "grad_norm": 0.7995957732200623, + "learning_rate": 8.199341307894647e-06, + "loss": 0.7719, + "step": 10210 + }, + { + "epoch": 0.5620012108536518, + "grad_norm": 0.6869412064552307, + "learning_rate": 8.199008183444843e-06, + "loss": 0.7921, + "step": 10211 + }, + { + "epoch": 0.5620562496560075, + "grad_norm": 0.9125131964683533, + "learning_rate": 8.198675034952149e-06, + "loss": 0.9015, + "step": 10212 + }, + { + "epoch": 0.5621112884583631, + "grad_norm": 0.6851146221160889, + "learning_rate": 8.198341862419068e-06, + "loss": 0.7773, + "step": 10213 + }, + { + "epoch": 0.5621663272607188, + "grad_norm": 0.6808778047561646, + "learning_rate": 8.198008665848108e-06, + "loss": 0.7375, + "step": 10214 + }, + { + "epoch": 0.5622213660630745, + "grad_norm": 0.6419697999954224, + "learning_rate": 8.19767544524177e-06, + "loss": 0.7496, + "step": 10215 + }, + { + "epoch": 0.5622764048654302, + "grad_norm": 0.7325716614723206, + "learning_rate": 8.197342200602559e-06, + "loss": 0.7424, + "step": 10216 + }, + { + "epoch": 0.5623314436677858, + "grad_norm": 0.6165832281112671, + "learning_rate": 8.19700893193298e-06, + "loss": 0.6364, + "step": 10217 + }, + { + "epoch": 0.5623864824701414, + "grad_norm": 0.7632125020027161, + "learning_rate": 8.196675639235539e-06, + "loss": 0.7175, + "step": 10218 + }, + { + "epoch": 0.5624415212724971, + "grad_norm": 0.6789713501930237, + "learning_rate": 8.196342322512738e-06, + "loss": 0.7122, + "step": 10219 + }, + { + "epoch": 0.5624965600748528, + "grad_norm": 0.7341050505638123, + "learning_rate": 8.196008981767084e-06, + "loss": 0.7598, + "step": 10220 + }, + { + "epoch": 0.5625515988772084, + "grad_norm": 0.7318429350852966, + "learning_rate": 8.195675617001083e-06, + "loss": 0.7723, + "step": 10221 + }, + { + "epoch": 0.5626066376795641, + "grad_norm": 0.6940313577651978, + "learning_rate": 8.195342228217238e-06, + "loss": 0.7885, + "step": 10222 + }, + { + "epoch": 0.5626616764819198, + "grad_norm": 0.8792300820350647, + "learning_rate": 8.195008815418058e-06, + "loss": 0.7657, + "step": 10223 + }, + { + "epoch": 0.5627167152842754, + "grad_norm": 0.7234559655189514, + "learning_rate": 8.194675378606044e-06, + "loss": 0.7988, + "step": 10224 + }, + { + "epoch": 0.562771754086631, + "grad_norm": 0.6698254942893982, + "learning_rate": 8.194341917783708e-06, + "loss": 0.6378, + "step": 10225 + }, + { + "epoch": 0.5628267928889867, + "grad_norm": 0.6546483635902405, + "learning_rate": 8.194008432953552e-06, + "loss": 0.7113, + "step": 10226 + }, + { + "epoch": 0.5628818316913424, + "grad_norm": 0.6532583832740784, + "learning_rate": 8.193674924118085e-06, + "loss": 0.6782, + "step": 10227 + }, + { + "epoch": 0.562936870493698, + "grad_norm": 0.770578920841217, + "learning_rate": 8.19334139127981e-06, + "loss": 0.8519, + "step": 10228 + }, + { + "epoch": 0.5629919092960537, + "grad_norm": 0.7255409359931946, + "learning_rate": 8.193007834441235e-06, + "loss": 0.6555, + "step": 10229 + }, + { + "epoch": 0.5630469480984094, + "grad_norm": 0.6659883856773376, + "learning_rate": 8.19267425360487e-06, + "loss": 0.7836, + "step": 10230 + }, + { + "epoch": 0.5631019869007651, + "grad_norm": 0.6596028208732605, + "learning_rate": 8.192340648773221e-06, + "loss": 0.6199, + "step": 10231 + }, + { + "epoch": 0.5631570257031207, + "grad_norm": 0.8226001858711243, + "learning_rate": 8.192007019948793e-06, + "loss": 0.8101, + "step": 10232 + }, + { + "epoch": 0.5632120645054763, + "grad_norm": 0.7465038895606995, + "learning_rate": 8.191673367134094e-06, + "loss": 0.8437, + "step": 10233 + }, + { + "epoch": 0.563267103307832, + "grad_norm": 1.0008004903793335, + "learning_rate": 8.191339690331632e-06, + "loss": 0.8626, + "step": 10234 + }, + { + "epoch": 0.5633221421101877, + "grad_norm": 0.7538222670555115, + "learning_rate": 8.191005989543917e-06, + "loss": 0.7222, + "step": 10235 + }, + { + "epoch": 0.5633771809125433, + "grad_norm": 0.6252872943878174, + "learning_rate": 8.190672264773454e-06, + "loss": 0.8038, + "step": 10236 + }, + { + "epoch": 0.563432219714899, + "grad_norm": 0.7083514928817749, + "learning_rate": 8.190338516022752e-06, + "loss": 0.7863, + "step": 10237 + }, + { + "epoch": 0.5634872585172547, + "grad_norm": 0.6887454390525818, + "learning_rate": 8.19000474329432e-06, + "loss": 0.7034, + "step": 10238 + }, + { + "epoch": 0.5635422973196104, + "grad_norm": 0.7487072348594666, + "learning_rate": 8.189670946590666e-06, + "loss": 0.8618, + "step": 10239 + }, + { + "epoch": 0.5635973361219659, + "grad_norm": 0.6999371647834778, + "learning_rate": 8.189337125914298e-06, + "loss": 0.7613, + "step": 10240 + }, + { + "epoch": 0.5636523749243216, + "grad_norm": 0.8265380263328552, + "learning_rate": 8.18900328126773e-06, + "loss": 0.7576, + "step": 10241 + }, + { + "epoch": 0.5637074137266773, + "grad_norm": 0.6688962578773499, + "learning_rate": 8.188669412653463e-06, + "loss": 0.712, + "step": 10242 + }, + { + "epoch": 0.563762452529033, + "grad_norm": 0.6343923211097717, + "learning_rate": 8.188335520074011e-06, + "loss": 0.7239, + "step": 10243 + }, + { + "epoch": 0.5638174913313886, + "grad_norm": 0.7122388482093811, + "learning_rate": 8.188001603531883e-06, + "loss": 0.7892, + "step": 10244 + }, + { + "epoch": 0.5638725301337443, + "grad_norm": 0.6646286845207214, + "learning_rate": 8.187667663029587e-06, + "loss": 0.7805, + "step": 10245 + }, + { + "epoch": 0.5639275689361, + "grad_norm": 0.742938220500946, + "learning_rate": 8.187333698569638e-06, + "loss": 0.8444, + "step": 10246 + }, + { + "epoch": 0.5639826077384557, + "grad_norm": 0.7260885238647461, + "learning_rate": 8.18699971015454e-06, + "loss": 0.8621, + "step": 10247 + }, + { + "epoch": 0.5640376465408112, + "grad_norm": 0.7920067310333252, + "learning_rate": 8.186665697786804e-06, + "loss": 0.7391, + "step": 10248 + }, + { + "epoch": 0.5640926853431669, + "grad_norm": 0.7472825646400452, + "learning_rate": 8.186331661468943e-06, + "loss": 0.7249, + "step": 10249 + }, + { + "epoch": 0.5641477241455226, + "grad_norm": 0.692643940448761, + "learning_rate": 8.185997601203465e-06, + "loss": 0.7884, + "step": 10250 + }, + { + "epoch": 0.5642027629478783, + "grad_norm": 0.715455174446106, + "learning_rate": 8.185663516992884e-06, + "loss": 0.7369, + "step": 10251 + }, + { + "epoch": 0.5642578017502339, + "grad_norm": 0.7566105723381042, + "learning_rate": 8.185329408839705e-06, + "loss": 0.7378, + "step": 10252 + }, + { + "epoch": 0.5643128405525896, + "grad_norm": 0.8163520693778992, + "learning_rate": 8.184995276746445e-06, + "loss": 0.7326, + "step": 10253 + }, + { + "epoch": 0.5643678793549453, + "grad_norm": 0.6280468106269836, + "learning_rate": 8.184661120715615e-06, + "loss": 0.6858, + "step": 10254 + }, + { + "epoch": 0.564422918157301, + "grad_norm": 0.7246795892715454, + "learning_rate": 8.184326940749723e-06, + "loss": 0.8111, + "step": 10255 + }, + { + "epoch": 0.5644779569596565, + "grad_norm": 0.7429527640342712, + "learning_rate": 8.18399273685128e-06, + "loss": 0.7642, + "step": 10256 + }, + { + "epoch": 0.5645329957620122, + "grad_norm": 0.7308861017227173, + "learning_rate": 8.183658509022802e-06, + "loss": 0.7844, + "step": 10257 + }, + { + "epoch": 0.5645880345643679, + "grad_norm": 0.7549033164978027, + "learning_rate": 8.1833242572668e-06, + "loss": 0.8585, + "step": 10258 + }, + { + "epoch": 0.5646430733667236, + "grad_norm": 0.6779888868331909, + "learning_rate": 8.182989981585782e-06, + "loss": 0.6808, + "step": 10259 + }, + { + "epoch": 0.5646981121690792, + "grad_norm": 0.887113630771637, + "learning_rate": 8.182655681982266e-06, + "loss": 0.8229, + "step": 10260 + }, + { + "epoch": 0.5647531509714349, + "grad_norm": 0.6405711770057678, + "learning_rate": 8.18232135845876e-06, + "loss": 0.6901, + "step": 10261 + }, + { + "epoch": 0.5648081897737905, + "grad_norm": 0.7302486300468445, + "learning_rate": 8.18198701101778e-06, + "loss": 0.6853, + "step": 10262 + }, + { + "epoch": 0.5648632285761462, + "grad_norm": 0.6374662518501282, + "learning_rate": 8.181652639661837e-06, + "loss": 0.7177, + "step": 10263 + }, + { + "epoch": 0.5649182673785018, + "grad_norm": 0.9267570972442627, + "learning_rate": 8.181318244393444e-06, + "loss": 0.7926, + "step": 10264 + }, + { + "epoch": 0.5649733061808575, + "grad_norm": 0.8196623921394348, + "learning_rate": 8.180983825215114e-06, + "loss": 0.7127, + "step": 10265 + }, + { + "epoch": 0.5650283449832132, + "grad_norm": 0.7004575133323669, + "learning_rate": 8.180649382129361e-06, + "loss": 0.7858, + "step": 10266 + }, + { + "epoch": 0.5650833837855688, + "grad_norm": 0.7667824625968933, + "learning_rate": 8.180314915138701e-06, + "loss": 0.7742, + "step": 10267 + }, + { + "epoch": 0.5651384225879245, + "grad_norm": 0.7372623682022095, + "learning_rate": 8.179980424245644e-06, + "loss": 0.7949, + "step": 10268 + }, + { + "epoch": 0.5651934613902801, + "grad_norm": 0.6417940258979797, + "learning_rate": 8.179645909452704e-06, + "loss": 0.6683, + "step": 10269 + }, + { + "epoch": 0.5652485001926358, + "grad_norm": 0.6736140251159668, + "learning_rate": 8.179311370762398e-06, + "loss": 0.6564, + "step": 10270 + }, + { + "epoch": 0.5653035389949914, + "grad_norm": 0.6727200746536255, + "learning_rate": 8.178976808177239e-06, + "loss": 0.8065, + "step": 10271 + }, + { + "epoch": 0.5653585777973471, + "grad_norm": 0.7565415501594543, + "learning_rate": 8.17864222169974e-06, + "loss": 0.9055, + "step": 10272 + }, + { + "epoch": 0.5654136165997028, + "grad_norm": 0.8938627243041992, + "learning_rate": 8.178307611332418e-06, + "loss": 0.8009, + "step": 10273 + }, + { + "epoch": 0.5654686554020585, + "grad_norm": 0.7439131140708923, + "learning_rate": 8.177972977077786e-06, + "loss": 0.7807, + "step": 10274 + }, + { + "epoch": 0.5655236942044141, + "grad_norm": 0.7603998184204102, + "learning_rate": 8.17763831893836e-06, + "loss": 0.818, + "step": 10275 + }, + { + "epoch": 0.5655787330067698, + "grad_norm": 0.7088946104049683, + "learning_rate": 8.177303636916655e-06, + "loss": 0.7741, + "step": 10276 + }, + { + "epoch": 0.5656337718091254, + "grad_norm": 0.6801518201828003, + "learning_rate": 8.176968931015187e-06, + "loss": 0.7633, + "step": 10277 + }, + { + "epoch": 0.5656888106114811, + "grad_norm": 0.6739299297332764, + "learning_rate": 8.17663420123647e-06, + "loss": 0.7772, + "step": 10278 + }, + { + "epoch": 0.5657438494138367, + "grad_norm": 0.7432494759559631, + "learning_rate": 8.176299447583021e-06, + "loss": 0.7368, + "step": 10279 + }, + { + "epoch": 0.5657988882161924, + "grad_norm": 0.7847158908843994, + "learning_rate": 8.175964670057357e-06, + "loss": 0.7824, + "step": 10280 + }, + { + "epoch": 0.5658539270185481, + "grad_norm": 0.8732449412345886, + "learning_rate": 8.17562986866199e-06, + "loss": 0.8035, + "step": 10281 + }, + { + "epoch": 0.5659089658209038, + "grad_norm": 0.7988447546958923, + "learning_rate": 8.17529504339944e-06, + "loss": 0.828, + "step": 10282 + }, + { + "epoch": 0.5659640046232594, + "grad_norm": 0.7063263058662415, + "learning_rate": 8.174960194272224e-06, + "loss": 0.7723, + "step": 10283 + }, + { + "epoch": 0.566019043425615, + "grad_norm": 0.7635022401809692, + "learning_rate": 8.174625321282856e-06, + "loss": 0.7156, + "step": 10284 + }, + { + "epoch": 0.5660740822279707, + "grad_norm": 0.6505927443504333, + "learning_rate": 8.174290424433853e-06, + "loss": 0.7409, + "step": 10285 + }, + { + "epoch": 0.5661291210303264, + "grad_norm": 0.6919816136360168, + "learning_rate": 8.173955503727734e-06, + "loss": 0.7829, + "step": 10286 + }, + { + "epoch": 0.566184159832682, + "grad_norm": 0.7024216651916504, + "learning_rate": 8.173620559167015e-06, + "loss": 0.7378, + "step": 10287 + }, + { + "epoch": 0.5662391986350377, + "grad_norm": 0.7134365439414978, + "learning_rate": 8.173285590754212e-06, + "loss": 0.7737, + "step": 10288 + }, + { + "epoch": 0.5662942374373934, + "grad_norm": 0.6867973804473877, + "learning_rate": 8.172950598491845e-06, + "loss": 0.7169, + "step": 10289 + }, + { + "epoch": 0.5663492762397491, + "grad_norm": 0.6900742650032043, + "learning_rate": 8.172615582382432e-06, + "loss": 0.7888, + "step": 10290 + }, + { + "epoch": 0.5664043150421046, + "grad_norm": 0.7026718854904175, + "learning_rate": 8.172280542428488e-06, + "loss": 0.8179, + "step": 10291 + }, + { + "epoch": 0.5664593538444603, + "grad_norm": 0.6940855979919434, + "learning_rate": 8.171945478632533e-06, + "loss": 0.7686, + "step": 10292 + }, + { + "epoch": 0.566514392646816, + "grad_norm": 0.6717686653137207, + "learning_rate": 8.171610390997085e-06, + "loss": 0.7865, + "step": 10293 + }, + { + "epoch": 0.5665694314491717, + "grad_norm": 0.6947711110115051, + "learning_rate": 8.171275279524661e-06, + "loss": 0.7811, + "step": 10294 + }, + { + "epoch": 0.5666244702515273, + "grad_norm": 0.6907814741134644, + "learning_rate": 8.170940144217782e-06, + "loss": 0.7095, + "step": 10295 + }, + { + "epoch": 0.566679509053883, + "grad_norm": 0.723952054977417, + "learning_rate": 8.170604985078965e-06, + "loss": 0.7814, + "step": 10296 + }, + { + "epoch": 0.5667345478562387, + "grad_norm": 0.7775490880012512, + "learning_rate": 8.17026980211073e-06, + "loss": 0.797, + "step": 10297 + }, + { + "epoch": 0.5667895866585944, + "grad_norm": 0.7557885646820068, + "learning_rate": 8.169934595315597e-06, + "loss": 0.8423, + "step": 10298 + }, + { + "epoch": 0.5668446254609499, + "grad_norm": 0.7838338017463684, + "learning_rate": 8.169599364696083e-06, + "loss": 0.7114, + "step": 10299 + }, + { + "epoch": 0.5668996642633056, + "grad_norm": 0.6632605791091919, + "learning_rate": 8.169264110254707e-06, + "loss": 0.6723, + "step": 10300 + }, + { + "epoch": 0.5669547030656613, + "grad_norm": 0.735756516456604, + "learning_rate": 8.168928831993991e-06, + "loss": 0.7533, + "step": 10301 + }, + { + "epoch": 0.567009741868017, + "grad_norm": 0.6981016993522644, + "learning_rate": 8.168593529916457e-06, + "loss": 0.7882, + "step": 10302 + }, + { + "epoch": 0.5670647806703726, + "grad_norm": 0.6413942575454712, + "learning_rate": 8.168258204024619e-06, + "loss": 0.6593, + "step": 10303 + }, + { + "epoch": 0.5671198194727283, + "grad_norm": 0.7040891051292419, + "learning_rate": 8.167922854321002e-06, + "loss": 0.7295, + "step": 10304 + }, + { + "epoch": 0.567174858275084, + "grad_norm": 0.7132521867752075, + "learning_rate": 8.167587480808126e-06, + "loss": 0.7128, + "step": 10305 + }, + { + "epoch": 0.5672298970774396, + "grad_norm": 0.756529688835144, + "learning_rate": 8.167252083488508e-06, + "loss": 0.7044, + "step": 10306 + }, + { + "epoch": 0.5672849358797952, + "grad_norm": 0.8456888198852539, + "learning_rate": 8.166916662364672e-06, + "loss": 0.8304, + "step": 10307 + }, + { + "epoch": 0.5673399746821509, + "grad_norm": 0.7758522629737854, + "learning_rate": 8.166581217439138e-06, + "loss": 0.7192, + "step": 10308 + }, + { + "epoch": 0.5673950134845066, + "grad_norm": 0.8110343217849731, + "learning_rate": 8.166245748714428e-06, + "loss": 0.8794, + "step": 10309 + }, + { + "epoch": 0.5674500522868622, + "grad_norm": 0.6803586483001709, + "learning_rate": 8.165910256193062e-06, + "loss": 0.7402, + "step": 10310 + }, + { + "epoch": 0.5675050910892179, + "grad_norm": 0.7294176816940308, + "learning_rate": 8.165574739877563e-06, + "loss": 0.7325, + "step": 10311 + }, + { + "epoch": 0.5675601298915736, + "grad_norm": 0.835488498210907, + "learning_rate": 8.165239199770448e-06, + "loss": 0.8317, + "step": 10312 + }, + { + "epoch": 0.5676151686939293, + "grad_norm": 0.6497608423233032, + "learning_rate": 8.164903635874246e-06, + "loss": 0.6902, + "step": 10313 + }, + { + "epoch": 0.5676702074962848, + "grad_norm": 0.6782082915306091, + "learning_rate": 8.164568048191474e-06, + "loss": 0.7941, + "step": 10314 + }, + { + "epoch": 0.5677252462986405, + "grad_norm": 0.6974388957023621, + "learning_rate": 8.164232436724656e-06, + "loss": 0.7899, + "step": 10315 + }, + { + "epoch": 0.5677802851009962, + "grad_norm": 0.7222558259963989, + "learning_rate": 8.163896801476314e-06, + "loss": 0.8034, + "step": 10316 + }, + { + "epoch": 0.5678353239033519, + "grad_norm": 0.6562586426734924, + "learning_rate": 8.16356114244897e-06, + "loss": 0.7864, + "step": 10317 + }, + { + "epoch": 0.5678903627057075, + "grad_norm": 0.6888270378112793, + "learning_rate": 8.16322545964515e-06, + "loss": 0.8455, + "step": 10318 + }, + { + "epoch": 0.5679454015080632, + "grad_norm": 0.642084002494812, + "learning_rate": 8.162889753067372e-06, + "loss": 0.7478, + "step": 10319 + }, + { + "epoch": 0.5680004403104189, + "grad_norm": 0.7077270746231079, + "learning_rate": 8.16255402271816e-06, + "loss": 0.7281, + "step": 10320 + }, + { + "epoch": 0.5680554791127745, + "grad_norm": 0.7202198505401611, + "learning_rate": 8.16221826860004e-06, + "loss": 0.7893, + "step": 10321 + }, + { + "epoch": 0.5681105179151301, + "grad_norm": 0.8950369954109192, + "learning_rate": 8.161882490715534e-06, + "loss": 0.772, + "step": 10322 + }, + { + "epoch": 0.5681655567174858, + "grad_norm": 0.6986666917800903, + "learning_rate": 8.161546689067166e-06, + "loss": 0.7712, + "step": 10323 + }, + { + "epoch": 0.5682205955198415, + "grad_norm": 0.7095959782600403, + "learning_rate": 8.161210863657458e-06, + "loss": 0.8373, + "step": 10324 + }, + { + "epoch": 0.5682756343221972, + "grad_norm": 0.7510485649108887, + "learning_rate": 8.160875014488936e-06, + "loss": 0.9106, + "step": 10325 + }, + { + "epoch": 0.5683306731245528, + "grad_norm": 0.7558283805847168, + "learning_rate": 8.160539141564123e-06, + "loss": 0.8192, + "step": 10326 + }, + { + "epoch": 0.5683857119269085, + "grad_norm": 0.7523400187492371, + "learning_rate": 8.160203244885545e-06, + "loss": 0.8276, + "step": 10327 + }, + { + "epoch": 0.5684407507292641, + "grad_norm": 0.6911195516586304, + "learning_rate": 8.159867324455724e-06, + "loss": 0.6286, + "step": 10328 + }, + { + "epoch": 0.5684957895316198, + "grad_norm": 0.6456325054168701, + "learning_rate": 8.159531380277188e-06, + "loss": 0.7419, + "step": 10329 + }, + { + "epoch": 0.5685508283339754, + "grad_norm": 0.9318492412567139, + "learning_rate": 8.159195412352458e-06, + "loss": 0.8131, + "step": 10330 + }, + { + "epoch": 0.5686058671363311, + "grad_norm": 0.7012938857078552, + "learning_rate": 8.158859420684062e-06, + "loss": 0.7074, + "step": 10331 + }, + { + "epoch": 0.5686609059386868, + "grad_norm": 0.7152053117752075, + "learning_rate": 8.158523405274523e-06, + "loss": 0.7186, + "step": 10332 + }, + { + "epoch": 0.5687159447410425, + "grad_norm": 0.7074982523918152, + "learning_rate": 8.158187366126368e-06, + "loss": 0.8021, + "step": 10333 + }, + { + "epoch": 0.5687709835433981, + "grad_norm": 0.689536452293396, + "learning_rate": 8.157851303242123e-06, + "loss": 0.7493, + "step": 10334 + }, + { + "epoch": 0.5688260223457537, + "grad_norm": 0.7411753535270691, + "learning_rate": 8.157515216624313e-06, + "loss": 0.8012, + "step": 10335 + }, + { + "epoch": 0.5688810611481094, + "grad_norm": 0.6831420063972473, + "learning_rate": 8.157179106275463e-06, + "loss": 0.7114, + "step": 10336 + }, + { + "epoch": 0.5689360999504651, + "grad_norm": 0.6786901950836182, + "learning_rate": 8.1568429721981e-06, + "loss": 0.7638, + "step": 10337 + }, + { + "epoch": 0.5689911387528207, + "grad_norm": 0.7546970844268799, + "learning_rate": 8.15650681439475e-06, + "loss": 0.7711, + "step": 10338 + }, + { + "epoch": 0.5690461775551764, + "grad_norm": 0.8071785569190979, + "learning_rate": 8.156170632867942e-06, + "loss": 0.8105, + "step": 10339 + }, + { + "epoch": 0.5691012163575321, + "grad_norm": 0.7872087359428406, + "learning_rate": 8.155834427620198e-06, + "loss": 0.7657, + "step": 10340 + }, + { + "epoch": 0.5691562551598878, + "grad_norm": 0.724328875541687, + "learning_rate": 8.155498198654047e-06, + "loss": 0.7978, + "step": 10341 + }, + { + "epoch": 0.5692112939622433, + "grad_norm": 0.8559905886650085, + "learning_rate": 8.155161945972016e-06, + "loss": 0.7766, + "step": 10342 + }, + { + "epoch": 0.569266332764599, + "grad_norm": 0.607418417930603, + "learning_rate": 8.154825669576635e-06, + "loss": 0.642, + "step": 10343 + }, + { + "epoch": 0.5693213715669547, + "grad_norm": 0.7403624653816223, + "learning_rate": 8.154489369470426e-06, + "loss": 0.7301, + "step": 10344 + }, + { + "epoch": 0.5693764103693104, + "grad_norm": 0.7388540506362915, + "learning_rate": 8.154153045655922e-06, + "loss": 0.7895, + "step": 10345 + }, + { + "epoch": 0.569431449171666, + "grad_norm": 0.8327579498291016, + "learning_rate": 8.153816698135646e-06, + "loss": 0.7589, + "step": 10346 + }, + { + "epoch": 0.5694864879740217, + "grad_norm": 0.7738710641860962, + "learning_rate": 8.153480326912128e-06, + "loss": 0.7828, + "step": 10347 + }, + { + "epoch": 0.5695415267763774, + "grad_norm": 0.8280724287033081, + "learning_rate": 8.153143931987896e-06, + "loss": 0.8194, + "step": 10348 + }, + { + "epoch": 0.5695965655787331, + "grad_norm": 0.8290724754333496, + "learning_rate": 8.152807513365478e-06, + "loss": 0.5941, + "step": 10349 + }, + { + "epoch": 0.5696516043810886, + "grad_norm": 0.7514322400093079, + "learning_rate": 8.152471071047403e-06, + "loss": 0.676, + "step": 10350 + }, + { + "epoch": 0.5697066431834443, + "grad_norm": 0.6990258693695068, + "learning_rate": 8.1521346050362e-06, + "loss": 0.804, + "step": 10351 + }, + { + "epoch": 0.5697616819858, + "grad_norm": 0.6781288981437683, + "learning_rate": 8.151798115334396e-06, + "loss": 0.7372, + "step": 10352 + }, + { + "epoch": 0.5698167207881556, + "grad_norm": 0.764301061630249, + "learning_rate": 8.151461601944523e-06, + "loss": 0.8242, + "step": 10353 + }, + { + "epoch": 0.5698717595905113, + "grad_norm": 0.7577376961708069, + "learning_rate": 8.151125064869106e-06, + "loss": 0.7354, + "step": 10354 + }, + { + "epoch": 0.569926798392867, + "grad_norm": 0.767764687538147, + "learning_rate": 8.150788504110678e-06, + "loss": 0.7262, + "step": 10355 + }, + { + "epoch": 0.5699818371952227, + "grad_norm": 0.6634765267372131, + "learning_rate": 8.150451919671767e-06, + "loss": 0.7527, + "step": 10356 + }, + { + "epoch": 0.5700368759975782, + "grad_norm": 0.8803308010101318, + "learning_rate": 8.150115311554901e-06, + "loss": 0.8172, + "step": 10357 + }, + { + "epoch": 0.5700919147999339, + "grad_norm": 0.695791482925415, + "learning_rate": 8.149778679762611e-06, + "loss": 0.7538, + "step": 10358 + }, + { + "epoch": 0.5701469536022896, + "grad_norm": 0.7047555446624756, + "learning_rate": 8.149442024297432e-06, + "loss": 0.7533, + "step": 10359 + }, + { + "epoch": 0.5702019924046453, + "grad_norm": 0.7148274183273315, + "learning_rate": 8.149105345161886e-06, + "loss": 0.6736, + "step": 10360 + }, + { + "epoch": 0.5702570312070009, + "grad_norm": 0.673204243183136, + "learning_rate": 8.148768642358508e-06, + "loss": 0.7713, + "step": 10361 + }, + { + "epoch": 0.5703120700093566, + "grad_norm": 0.6258989572525024, + "learning_rate": 8.148431915889827e-06, + "loss": 0.6578, + "step": 10362 + }, + { + "epoch": 0.5703671088117123, + "grad_norm": 0.8411956429481506, + "learning_rate": 8.148095165758377e-06, + "loss": 0.8387, + "step": 10363 + }, + { + "epoch": 0.570422147614068, + "grad_norm": 0.7802130579948425, + "learning_rate": 8.147758391966685e-06, + "loss": 0.8564, + "step": 10364 + }, + { + "epoch": 0.5704771864164235, + "grad_norm": 0.6665176153182983, + "learning_rate": 8.147421594517282e-06, + "loss": 0.688, + "step": 10365 + }, + { + "epoch": 0.5705322252187792, + "grad_norm": 0.7166683673858643, + "learning_rate": 8.147084773412702e-06, + "loss": 0.6704, + "step": 10366 + }, + { + "epoch": 0.5705872640211349, + "grad_norm": 0.6948957443237305, + "learning_rate": 8.146747928655476e-06, + "loss": 0.7116, + "step": 10367 + }, + { + "epoch": 0.5706423028234906, + "grad_norm": 0.588965892791748, + "learning_rate": 8.146411060248134e-06, + "loss": 0.5644, + "step": 10368 + }, + { + "epoch": 0.5706973416258462, + "grad_norm": 0.8020890355110168, + "learning_rate": 8.14607416819321e-06, + "loss": 0.6978, + "step": 10369 + }, + { + "epoch": 0.5707523804282019, + "grad_norm": 0.9900732040405273, + "learning_rate": 8.145737252493234e-06, + "loss": 0.7295, + "step": 10370 + }, + { + "epoch": 0.5708074192305576, + "grad_norm": 0.7236563563346863, + "learning_rate": 8.145400313150737e-06, + "loss": 0.7555, + "step": 10371 + }, + { + "epoch": 0.5708624580329132, + "grad_norm": 0.6784152984619141, + "learning_rate": 8.145063350168257e-06, + "loss": 0.7283, + "step": 10372 + }, + { + "epoch": 0.5709174968352688, + "grad_norm": 0.6255244612693787, + "learning_rate": 8.14472636354832e-06, + "loss": 0.6722, + "step": 10373 + }, + { + "epoch": 0.5709725356376245, + "grad_norm": 0.8250948786735535, + "learning_rate": 8.14438935329346e-06, + "loss": 0.8406, + "step": 10374 + }, + { + "epoch": 0.5710275744399802, + "grad_norm": 0.7308233380317688, + "learning_rate": 8.144052319406215e-06, + "loss": 0.8084, + "step": 10375 + }, + { + "epoch": 0.5710826132423359, + "grad_norm": 0.7850058674812317, + "learning_rate": 8.143715261889112e-06, + "loss": 0.7892, + "step": 10376 + }, + { + "epoch": 0.5711376520446915, + "grad_norm": 0.81241774559021, + "learning_rate": 8.143378180744687e-06, + "loss": 0.7819, + "step": 10377 + }, + { + "epoch": 0.5711926908470472, + "grad_norm": 0.7174570560455322, + "learning_rate": 8.143041075975473e-06, + "loss": 0.7104, + "step": 10378 + }, + { + "epoch": 0.5712477296494028, + "grad_norm": 0.6954129934310913, + "learning_rate": 8.142703947584004e-06, + "loss": 0.7821, + "step": 10379 + }, + { + "epoch": 0.5713027684517585, + "grad_norm": 0.6895242929458618, + "learning_rate": 8.142366795572813e-06, + "loss": 0.7687, + "step": 10380 + }, + { + "epoch": 0.5713578072541141, + "grad_norm": 0.6543757319450378, + "learning_rate": 8.142029619944434e-06, + "loss": 0.7042, + "step": 10381 + }, + { + "epoch": 0.5714128460564698, + "grad_norm": 0.6712427139282227, + "learning_rate": 8.141692420701404e-06, + "loss": 0.6861, + "step": 10382 + }, + { + "epoch": 0.5714678848588255, + "grad_norm": 1.6716055870056152, + "learning_rate": 8.141355197846253e-06, + "loss": 0.8209, + "step": 10383 + }, + { + "epoch": 0.5715229236611812, + "grad_norm": 0.7509854435920715, + "learning_rate": 8.141017951381516e-06, + "loss": 0.8246, + "step": 10384 + }, + { + "epoch": 0.5715779624635368, + "grad_norm": 0.7161786556243896, + "learning_rate": 8.14068068130973e-06, + "loss": 0.835, + "step": 10385 + }, + { + "epoch": 0.5716330012658924, + "grad_norm": 0.7423714995384216, + "learning_rate": 8.140343387633427e-06, + "loss": 0.8004, + "step": 10386 + }, + { + "epoch": 0.5716880400682481, + "grad_norm": 0.6955768465995789, + "learning_rate": 8.140006070355146e-06, + "loss": 0.7299, + "step": 10387 + }, + { + "epoch": 0.5717430788706038, + "grad_norm": 0.6742254495620728, + "learning_rate": 8.13966872947742e-06, + "loss": 0.6549, + "step": 10388 + }, + { + "epoch": 0.5717981176729594, + "grad_norm": 0.7332299947738647, + "learning_rate": 8.139331365002782e-06, + "loss": 0.7945, + "step": 10389 + }, + { + "epoch": 0.5718531564753151, + "grad_norm": 0.6552133560180664, + "learning_rate": 8.138993976933771e-06, + "loss": 0.7193, + "step": 10390 + }, + { + "epoch": 0.5719081952776708, + "grad_norm": 0.6708530187606812, + "learning_rate": 8.138656565272923e-06, + "loss": 0.8053, + "step": 10391 + }, + { + "epoch": 0.5719632340800265, + "grad_norm": 0.7837093472480774, + "learning_rate": 8.138319130022771e-06, + "loss": 0.7752, + "step": 10392 + }, + { + "epoch": 0.572018272882382, + "grad_norm": 0.6910337805747986, + "learning_rate": 8.137981671185853e-06, + "loss": 0.7573, + "step": 10393 + }, + { + "epoch": 0.5720733116847377, + "grad_norm": 0.6758334636688232, + "learning_rate": 8.137644188764704e-06, + "loss": 0.8251, + "step": 10394 + }, + { + "epoch": 0.5721283504870934, + "grad_norm": 0.7513287663459778, + "learning_rate": 8.137306682761862e-06, + "loss": 0.6491, + "step": 10395 + }, + { + "epoch": 0.572183389289449, + "grad_norm": 0.678210973739624, + "learning_rate": 8.136969153179863e-06, + "loss": 0.7761, + "step": 10396 + }, + { + "epoch": 0.5722384280918047, + "grad_norm": 0.8256083726882935, + "learning_rate": 8.13663160002124e-06, + "loss": 0.7813, + "step": 10397 + }, + { + "epoch": 0.5722934668941604, + "grad_norm": 0.8383314609527588, + "learning_rate": 8.136294023288538e-06, + "loss": 0.7669, + "step": 10398 + }, + { + "epoch": 0.5723485056965161, + "grad_norm": 0.7150036692619324, + "learning_rate": 8.135956422984287e-06, + "loss": 0.8322, + "step": 10399 + }, + { + "epoch": 0.5724035444988717, + "grad_norm": 1.3011385202407837, + "learning_rate": 8.13561879911103e-06, + "loss": 0.8044, + "step": 10400 + }, + { + "epoch": 0.5724585833012273, + "grad_norm": 0.6749194860458374, + "learning_rate": 8.135281151671298e-06, + "loss": 0.6426, + "step": 10401 + }, + { + "epoch": 0.572513622103583, + "grad_norm": 0.7370286583900452, + "learning_rate": 8.134943480667635e-06, + "loss": 0.8051, + "step": 10402 + }, + { + "epoch": 0.5725686609059387, + "grad_norm": 0.6827631592750549, + "learning_rate": 8.134605786102574e-06, + "loss": 0.6961, + "step": 10403 + }, + { + "epoch": 0.5726236997082943, + "grad_norm": 0.7593247294425964, + "learning_rate": 8.134268067978655e-06, + "loss": 0.7514, + "step": 10404 + }, + { + "epoch": 0.57267873851065, + "grad_norm": 0.7229800224304199, + "learning_rate": 8.133930326298417e-06, + "loss": 0.8105, + "step": 10405 + }, + { + "epoch": 0.5727337773130057, + "grad_norm": 0.720973551273346, + "learning_rate": 8.133592561064396e-06, + "loss": 0.6866, + "step": 10406 + }, + { + "epoch": 0.5727888161153614, + "grad_norm": 0.7530742883682251, + "learning_rate": 8.133254772279135e-06, + "loss": 0.773, + "step": 10407 + }, + { + "epoch": 0.5728438549177169, + "grad_norm": 0.6897457838058472, + "learning_rate": 8.132916959945167e-06, + "loss": 0.8107, + "step": 10408 + }, + { + "epoch": 0.5728988937200726, + "grad_norm": 0.6659066081047058, + "learning_rate": 8.132579124065034e-06, + "loss": 0.8036, + "step": 10409 + }, + { + "epoch": 0.5729539325224283, + "grad_norm": 0.6925005316734314, + "learning_rate": 8.132241264641276e-06, + "loss": 0.7869, + "step": 10410 + }, + { + "epoch": 0.573008971324784, + "grad_norm": 0.8681634068489075, + "learning_rate": 8.131903381676433e-06, + "loss": 0.7411, + "step": 10411 + }, + { + "epoch": 0.5730640101271396, + "grad_norm": 0.669561505317688, + "learning_rate": 8.13156547517304e-06, + "loss": 0.7398, + "step": 10412 + }, + { + "epoch": 0.5731190489294953, + "grad_norm": 0.6737409234046936, + "learning_rate": 8.131227545133639e-06, + "loss": 0.7319, + "step": 10413 + }, + { + "epoch": 0.573174087731851, + "grad_norm": 0.7111513614654541, + "learning_rate": 8.130889591560772e-06, + "loss": 0.7192, + "step": 10414 + }, + { + "epoch": 0.5732291265342067, + "grad_norm": 0.6618744134902954, + "learning_rate": 8.130551614456974e-06, + "loss": 0.6636, + "step": 10415 + }, + { + "epoch": 0.5732841653365622, + "grad_norm": 0.8150144815444946, + "learning_rate": 8.13021361382479e-06, + "loss": 0.7168, + "step": 10416 + }, + { + "epoch": 0.5733392041389179, + "grad_norm": 0.744898796081543, + "learning_rate": 8.129875589666758e-06, + "loss": 0.8562, + "step": 10417 + }, + { + "epoch": 0.5733942429412736, + "grad_norm": 0.7831705212593079, + "learning_rate": 8.129537541985419e-06, + "loss": 0.8491, + "step": 10418 + }, + { + "epoch": 0.5734492817436293, + "grad_norm": 0.8097667098045349, + "learning_rate": 8.129199470783313e-06, + "loss": 0.7623, + "step": 10419 + }, + { + "epoch": 0.5735043205459849, + "grad_norm": 0.7951840758323669, + "learning_rate": 8.128861376062982e-06, + "loss": 0.8195, + "step": 10420 + }, + { + "epoch": 0.5735593593483406, + "grad_norm": 0.5902833938598633, + "learning_rate": 8.128523257826966e-06, + "loss": 0.6244, + "step": 10421 + }, + { + "epoch": 0.5736143981506963, + "grad_norm": 1.113287329673767, + "learning_rate": 8.128185116077805e-06, + "loss": 0.8382, + "step": 10422 + }, + { + "epoch": 0.573669436953052, + "grad_norm": 0.6899390816688538, + "learning_rate": 8.127846950818046e-06, + "loss": 0.7632, + "step": 10423 + }, + { + "epoch": 0.5737244757554075, + "grad_norm": 0.6905965805053711, + "learning_rate": 8.127508762050225e-06, + "loss": 0.7429, + "step": 10424 + }, + { + "epoch": 0.5737795145577632, + "grad_norm": 0.7036122679710388, + "learning_rate": 8.127170549776882e-06, + "loss": 0.7699, + "step": 10425 + }, + { + "epoch": 0.5738345533601189, + "grad_norm": 0.6599798202514648, + "learning_rate": 8.126832314000566e-06, + "loss": 0.7169, + "step": 10426 + }, + { + "epoch": 0.5738895921624746, + "grad_norm": 0.8682155609130859, + "learning_rate": 8.126494054723815e-06, + "loss": 0.851, + "step": 10427 + }, + { + "epoch": 0.5739446309648302, + "grad_norm": 0.6661516427993774, + "learning_rate": 8.12615577194917e-06, + "loss": 0.7287, + "step": 10428 + }, + { + "epoch": 0.5739996697671859, + "grad_norm": 0.6805256009101868, + "learning_rate": 8.125817465679176e-06, + "loss": 0.7033, + "step": 10429 + }, + { + "epoch": 0.5740547085695415, + "grad_norm": 0.7088646292686462, + "learning_rate": 8.125479135916375e-06, + "loss": 0.7295, + "step": 10430 + }, + { + "epoch": 0.5741097473718972, + "grad_norm": 0.6854971647262573, + "learning_rate": 8.12514078266331e-06, + "loss": 0.8102, + "step": 10431 + }, + { + "epoch": 0.5741647861742528, + "grad_norm": 0.7481474876403809, + "learning_rate": 8.124802405922521e-06, + "loss": 0.7463, + "step": 10432 + }, + { + "epoch": 0.5742198249766085, + "grad_norm": 0.8280898928642273, + "learning_rate": 8.124464005696556e-06, + "loss": 0.8067, + "step": 10433 + }, + { + "epoch": 0.5742748637789642, + "grad_norm": 0.696812629699707, + "learning_rate": 8.124125581987953e-06, + "loss": 0.7041, + "step": 10434 + }, + { + "epoch": 0.5743299025813199, + "grad_norm": 0.791084349155426, + "learning_rate": 8.123787134799262e-06, + "loss": 0.8244, + "step": 10435 + }, + { + "epoch": 0.5743849413836755, + "grad_norm": 0.7422665953636169, + "learning_rate": 8.123448664133022e-06, + "loss": 0.7792, + "step": 10436 + }, + { + "epoch": 0.5744399801860312, + "grad_norm": 0.7302834987640381, + "learning_rate": 8.123110169991777e-06, + "loss": 0.7617, + "step": 10437 + }, + { + "epoch": 0.5744950189883868, + "grad_norm": 0.6640440821647644, + "learning_rate": 8.122771652378071e-06, + "loss": 0.7965, + "step": 10438 + }, + { + "epoch": 0.5745500577907424, + "grad_norm": 0.7704516649246216, + "learning_rate": 8.12243311129445e-06, + "loss": 0.7814, + "step": 10439 + }, + { + "epoch": 0.5746050965930981, + "grad_norm": 0.673254668712616, + "learning_rate": 8.122094546743459e-06, + "loss": 0.7364, + "step": 10440 + }, + { + "epoch": 0.5746601353954538, + "grad_norm": 0.7648451924324036, + "learning_rate": 8.121755958727639e-06, + "loss": 0.8585, + "step": 10441 + }, + { + "epoch": 0.5747151741978095, + "grad_norm": 0.6660173535346985, + "learning_rate": 8.121417347249539e-06, + "loss": 0.6989, + "step": 10442 + }, + { + "epoch": 0.5747702130001651, + "grad_norm": 0.7128653526306152, + "learning_rate": 8.1210787123117e-06, + "loss": 0.8317, + "step": 10443 + }, + { + "epoch": 0.5748252518025208, + "grad_norm": 0.6404966115951538, + "learning_rate": 8.12074005391667e-06, + "loss": 0.6957, + "step": 10444 + }, + { + "epoch": 0.5748802906048764, + "grad_norm": 0.9597657918930054, + "learning_rate": 8.120401372066993e-06, + "loss": 0.9266, + "step": 10445 + }, + { + "epoch": 0.5749353294072321, + "grad_norm": 0.7735045552253723, + "learning_rate": 8.120062666765213e-06, + "loss": 0.8159, + "step": 10446 + }, + { + "epoch": 0.5749903682095877, + "grad_norm": 0.8031814098358154, + "learning_rate": 8.11972393801388e-06, + "loss": 0.7741, + "step": 10447 + }, + { + "epoch": 0.5750454070119434, + "grad_norm": 0.7008558511734009, + "learning_rate": 8.119385185815535e-06, + "loss": 0.6558, + "step": 10448 + }, + { + "epoch": 0.5751004458142991, + "grad_norm": 0.8162875175476074, + "learning_rate": 8.119046410172725e-06, + "loss": 0.7196, + "step": 10449 + }, + { + "epoch": 0.5751554846166548, + "grad_norm": 0.8142701983451843, + "learning_rate": 8.118707611088e-06, + "loss": 0.7709, + "step": 10450 + }, + { + "epoch": 0.5752105234190104, + "grad_norm": 0.7671986818313599, + "learning_rate": 8.118368788563902e-06, + "loss": 0.8725, + "step": 10451 + }, + { + "epoch": 0.575265562221366, + "grad_norm": 0.6604374051094055, + "learning_rate": 8.118029942602979e-06, + "loss": 0.7119, + "step": 10452 + }, + { + "epoch": 0.5753206010237217, + "grad_norm": 0.7119179368019104, + "learning_rate": 8.117691073207776e-06, + "loss": 0.7445, + "step": 10453 + }, + { + "epoch": 0.5753756398260774, + "grad_norm": 0.7572842240333557, + "learning_rate": 8.117352180380843e-06, + "loss": 0.7672, + "step": 10454 + }, + { + "epoch": 0.575430678628433, + "grad_norm": 0.688667356967926, + "learning_rate": 8.117013264124725e-06, + "loss": 0.7733, + "step": 10455 + }, + { + "epoch": 0.5754857174307887, + "grad_norm": 0.6683163046836853, + "learning_rate": 8.116674324441971e-06, + "loss": 0.6381, + "step": 10456 + }, + { + "epoch": 0.5755407562331444, + "grad_norm": 0.7792099714279175, + "learning_rate": 8.116335361335126e-06, + "loss": 0.7781, + "step": 10457 + }, + { + "epoch": 0.5755957950355001, + "grad_norm": 0.702132523059845, + "learning_rate": 8.115996374806738e-06, + "loss": 0.7442, + "step": 10458 + }, + { + "epoch": 0.5756508338378556, + "grad_norm": 0.7021365761756897, + "learning_rate": 8.115657364859356e-06, + "loss": 0.7215, + "step": 10459 + }, + { + "epoch": 0.5757058726402113, + "grad_norm": 0.7032247185707092, + "learning_rate": 8.115318331495527e-06, + "loss": 0.7069, + "step": 10460 + }, + { + "epoch": 0.575760911442567, + "grad_norm": 0.8301237225532532, + "learning_rate": 8.1149792747178e-06, + "loss": 0.789, + "step": 10461 + }, + { + "epoch": 0.5758159502449227, + "grad_norm": 0.7051018476486206, + "learning_rate": 8.11464019452872e-06, + "loss": 0.7511, + "step": 10462 + }, + { + "epoch": 0.5758709890472783, + "grad_norm": 0.8422626256942749, + "learning_rate": 8.114301090930843e-06, + "loss": 0.6507, + "step": 10463 + }, + { + "epoch": 0.575926027849634, + "grad_norm": 0.7751632332801819, + "learning_rate": 8.113961963926708e-06, + "loss": 0.7357, + "step": 10464 + }, + { + "epoch": 0.5759810666519897, + "grad_norm": 0.7158333659172058, + "learning_rate": 8.11362281351887e-06, + "loss": 0.8382, + "step": 10465 + }, + { + "epoch": 0.5760361054543454, + "grad_norm": 0.6926481127738953, + "learning_rate": 8.113283639709878e-06, + "loss": 0.7078, + "step": 10466 + }, + { + "epoch": 0.5760911442567009, + "grad_norm": 0.7091588973999023, + "learning_rate": 8.112944442502277e-06, + "loss": 0.7932, + "step": 10467 + }, + { + "epoch": 0.5761461830590566, + "grad_norm": 0.6979780197143555, + "learning_rate": 8.11260522189862e-06, + "loss": 0.6812, + "step": 10468 + }, + { + "epoch": 0.5762012218614123, + "grad_norm": 0.6735736131668091, + "learning_rate": 8.112265977901455e-06, + "loss": 0.7499, + "step": 10469 + }, + { + "epoch": 0.576256260663768, + "grad_norm": 0.6995692849159241, + "learning_rate": 8.111926710513334e-06, + "loss": 0.7123, + "step": 10470 + }, + { + "epoch": 0.5763112994661236, + "grad_norm": 0.7162681818008423, + "learning_rate": 8.111587419736802e-06, + "loss": 0.7586, + "step": 10471 + }, + { + "epoch": 0.5763663382684793, + "grad_norm": 0.945935070514679, + "learning_rate": 8.111248105574414e-06, + "loss": 0.8474, + "step": 10472 + }, + { + "epoch": 0.576421377070835, + "grad_norm": 0.608730673789978, + "learning_rate": 8.110908768028716e-06, + "loss": 0.6433, + "step": 10473 + }, + { + "epoch": 0.5764764158731907, + "grad_norm": 0.6777853965759277, + "learning_rate": 8.110569407102263e-06, + "loss": 0.7913, + "step": 10474 + }, + { + "epoch": 0.5765314546755462, + "grad_norm": 0.6310930848121643, + "learning_rate": 8.1102300227976e-06, + "loss": 0.719, + "step": 10475 + }, + { + "epoch": 0.5765864934779019, + "grad_norm": 0.7048485279083252, + "learning_rate": 8.109890615117282e-06, + "loss": 0.7341, + "step": 10476 + }, + { + "epoch": 0.5766415322802576, + "grad_norm": 0.672987163066864, + "learning_rate": 8.10955118406386e-06, + "loss": 0.7637, + "step": 10477 + }, + { + "epoch": 0.5766965710826133, + "grad_norm": 0.7018216252326965, + "learning_rate": 8.109211729639882e-06, + "loss": 0.6924, + "step": 10478 + }, + { + "epoch": 0.5767516098849689, + "grad_norm": 0.7183761596679688, + "learning_rate": 8.108872251847901e-06, + "loss": 0.7945, + "step": 10479 + }, + { + "epoch": 0.5768066486873246, + "grad_norm": 0.7332683801651001, + "learning_rate": 8.108532750690469e-06, + "loss": 0.7686, + "step": 10480 + }, + { + "epoch": 0.5768616874896803, + "grad_norm": 0.7118290066719055, + "learning_rate": 8.108193226170139e-06, + "loss": 0.6917, + "step": 10481 + }, + { + "epoch": 0.5769167262920358, + "grad_norm": 0.8242507576942444, + "learning_rate": 8.107853678289456e-06, + "loss": 0.9119, + "step": 10482 + }, + { + "epoch": 0.5769717650943915, + "grad_norm": 0.7138590216636658, + "learning_rate": 8.10751410705098e-06, + "loss": 0.7095, + "step": 10483 + }, + { + "epoch": 0.5770268038967472, + "grad_norm": 0.7541199326515198, + "learning_rate": 8.107174512457259e-06, + "loss": 0.8042, + "step": 10484 + }, + { + "epoch": 0.5770818426991029, + "grad_norm": 0.7776939868927002, + "learning_rate": 8.106834894510846e-06, + "loss": 0.8075, + "step": 10485 + }, + { + "epoch": 0.5771368815014585, + "grad_norm": 0.6466917395591736, + "learning_rate": 8.106495253214293e-06, + "loss": 0.707, + "step": 10486 + }, + { + "epoch": 0.5771919203038142, + "grad_norm": 0.687101423740387, + "learning_rate": 8.106155588570153e-06, + "loss": 0.6945, + "step": 10487 + }, + { + "epoch": 0.5772469591061699, + "grad_norm": 0.8338418006896973, + "learning_rate": 8.10581590058098e-06, + "loss": 0.8044, + "step": 10488 + }, + { + "epoch": 0.5773019979085255, + "grad_norm": 0.7052263617515564, + "learning_rate": 8.105476189249325e-06, + "loss": 0.8216, + "step": 10489 + }, + { + "epoch": 0.5773570367108811, + "grad_norm": 0.7205906510353088, + "learning_rate": 8.105136454577744e-06, + "loss": 0.8853, + "step": 10490 + }, + { + "epoch": 0.5774120755132368, + "grad_norm": 0.7875076532363892, + "learning_rate": 8.10479669656879e-06, + "loss": 0.822, + "step": 10491 + }, + { + "epoch": 0.5774671143155925, + "grad_norm": 0.6858797669410706, + "learning_rate": 8.104456915225012e-06, + "loss": 0.7924, + "step": 10492 + }, + { + "epoch": 0.5775221531179482, + "grad_norm": 0.6991322636604309, + "learning_rate": 8.104117110548968e-06, + "loss": 0.8144, + "step": 10493 + }, + { + "epoch": 0.5775771919203038, + "grad_norm": 0.7768846750259399, + "learning_rate": 8.103777282543209e-06, + "loss": 0.7793, + "step": 10494 + }, + { + "epoch": 0.5776322307226595, + "grad_norm": 0.7055716514587402, + "learning_rate": 8.103437431210293e-06, + "loss": 0.7653, + "step": 10495 + }, + { + "epoch": 0.5776872695250151, + "grad_norm": 1.009839653968811, + "learning_rate": 8.10309755655277e-06, + "loss": 0.7646, + "step": 10496 + }, + { + "epoch": 0.5777423083273708, + "grad_norm": 0.699435293674469, + "learning_rate": 8.102757658573197e-06, + "loss": 0.7806, + "step": 10497 + }, + { + "epoch": 0.5777973471297264, + "grad_norm": 0.8566381931304932, + "learning_rate": 8.102417737274129e-06, + "loss": 0.8302, + "step": 10498 + }, + { + "epoch": 0.5778523859320821, + "grad_norm": 0.745801568031311, + "learning_rate": 8.10207779265812e-06, + "loss": 0.91, + "step": 10499 + }, + { + "epoch": 0.5779074247344378, + "grad_norm": 0.6867349743843079, + "learning_rate": 8.101737824727724e-06, + "loss": 0.771, + "step": 10500 + }, + { + "epoch": 0.5779624635367935, + "grad_norm": 0.6693048477172852, + "learning_rate": 8.101397833485496e-06, + "loss": 0.7967, + "step": 10501 + }, + { + "epoch": 0.5780175023391491, + "grad_norm": 0.7485450506210327, + "learning_rate": 8.101057818933993e-06, + "loss": 0.7132, + "step": 10502 + }, + { + "epoch": 0.5780725411415047, + "grad_norm": 0.7619839906692505, + "learning_rate": 8.100717781075769e-06, + "loss": 0.7379, + "step": 10503 + }, + { + "epoch": 0.5781275799438604, + "grad_norm": 0.7651955485343933, + "learning_rate": 8.100377719913382e-06, + "loss": 0.8437, + "step": 10504 + }, + { + "epoch": 0.5781826187462161, + "grad_norm": 0.692385196685791, + "learning_rate": 8.100037635449384e-06, + "loss": 0.7666, + "step": 10505 + }, + { + "epoch": 0.5782376575485717, + "grad_norm": 0.7332374453544617, + "learning_rate": 8.099697527686334e-06, + "loss": 0.7476, + "step": 10506 + }, + { + "epoch": 0.5782926963509274, + "grad_norm": 0.6934877634048462, + "learning_rate": 8.099357396626786e-06, + "loss": 0.8054, + "step": 10507 + }, + { + "epoch": 0.5783477351532831, + "grad_norm": 0.8393011689186096, + "learning_rate": 8.099017242273298e-06, + "loss": 0.8655, + "step": 10508 + }, + { + "epoch": 0.5784027739556388, + "grad_norm": 0.6850646734237671, + "learning_rate": 8.098677064628425e-06, + "loss": 0.7424, + "step": 10509 + }, + { + "epoch": 0.5784578127579943, + "grad_norm": 0.7302095293998718, + "learning_rate": 8.098336863694728e-06, + "loss": 0.903, + "step": 10510 + }, + { + "epoch": 0.57851285156035, + "grad_norm": 0.7474033236503601, + "learning_rate": 8.097996639474757e-06, + "loss": 0.7509, + "step": 10511 + }, + { + "epoch": 0.5785678903627057, + "grad_norm": 0.6525655388832092, + "learning_rate": 8.097656391971074e-06, + "loss": 0.7097, + "step": 10512 + }, + { + "epoch": 0.5786229291650614, + "grad_norm": 0.8197451829910278, + "learning_rate": 8.097316121186234e-06, + "loss": 0.7401, + "step": 10513 + }, + { + "epoch": 0.578677967967417, + "grad_norm": 0.7048231959342957, + "learning_rate": 8.096975827122795e-06, + "loss": 0.7964, + "step": 10514 + }, + { + "epoch": 0.5787330067697727, + "grad_norm": 0.8417022228240967, + "learning_rate": 8.096635509783315e-06, + "loss": 0.7703, + "step": 10515 + }, + { + "epoch": 0.5787880455721284, + "grad_norm": 0.7313926815986633, + "learning_rate": 8.096295169170352e-06, + "loss": 0.7565, + "step": 10516 + }, + { + "epoch": 0.5788430843744841, + "grad_norm": 0.7156692147254944, + "learning_rate": 8.095954805286464e-06, + "loss": 0.7456, + "step": 10517 + }, + { + "epoch": 0.5788981231768396, + "grad_norm": 0.7366768717765808, + "learning_rate": 8.095614418134205e-06, + "loss": 0.72, + "step": 10518 + }, + { + "epoch": 0.5789531619791953, + "grad_norm": 0.7011533379554749, + "learning_rate": 8.09527400771614e-06, + "loss": 0.7683, + "step": 10519 + }, + { + "epoch": 0.579008200781551, + "grad_norm": 0.6849086284637451, + "learning_rate": 8.094933574034823e-06, + "loss": 0.6938, + "step": 10520 + }, + { + "epoch": 0.5790632395839067, + "grad_norm": 0.7351469397544861, + "learning_rate": 8.094593117092814e-06, + "loss": 0.7364, + "step": 10521 + }, + { + "epoch": 0.5791182783862623, + "grad_norm": 0.7133724689483643, + "learning_rate": 8.09425263689267e-06, + "loss": 0.7328, + "step": 10522 + }, + { + "epoch": 0.579173317188618, + "grad_norm": 0.6713461875915527, + "learning_rate": 8.093912133436954e-06, + "loss": 0.7296, + "step": 10523 + }, + { + "epoch": 0.5792283559909737, + "grad_norm": 0.7057825922966003, + "learning_rate": 8.093571606728222e-06, + "loss": 0.7732, + "step": 10524 + }, + { + "epoch": 0.5792833947933292, + "grad_norm": 0.7378783226013184, + "learning_rate": 8.093231056769033e-06, + "loss": 0.7907, + "step": 10525 + }, + { + "epoch": 0.5793384335956849, + "grad_norm": 0.8796947598457336, + "learning_rate": 8.092890483561947e-06, + "loss": 0.7325, + "step": 10526 + }, + { + "epoch": 0.5793934723980406, + "grad_norm": 0.7326352000236511, + "learning_rate": 8.092549887109525e-06, + "loss": 0.7948, + "step": 10527 + }, + { + "epoch": 0.5794485112003963, + "grad_norm": 0.7131063342094421, + "learning_rate": 8.092209267414325e-06, + "loss": 0.7595, + "step": 10528 + }, + { + "epoch": 0.5795035500027519, + "grad_norm": 0.6993252635002136, + "learning_rate": 8.091868624478908e-06, + "loss": 0.782, + "step": 10529 + }, + { + "epoch": 0.5795585888051076, + "grad_norm": 0.6945857405662537, + "learning_rate": 8.091527958305835e-06, + "loss": 0.7283, + "step": 10530 + }, + { + "epoch": 0.5796136276074633, + "grad_norm": 0.8203904032707214, + "learning_rate": 8.091187268897667e-06, + "loss": 0.7787, + "step": 10531 + }, + { + "epoch": 0.579668666409819, + "grad_norm": 0.6450221538543701, + "learning_rate": 8.09084655625696e-06, + "loss": 0.7092, + "step": 10532 + }, + { + "epoch": 0.5797237052121745, + "grad_norm": 0.6852096915245056, + "learning_rate": 8.090505820386279e-06, + "loss": 0.7916, + "step": 10533 + }, + { + "epoch": 0.5797787440145302, + "grad_norm": 1.0816445350646973, + "learning_rate": 8.090165061288182e-06, + "loss": 0.7545, + "step": 10534 + }, + { + "epoch": 0.5798337828168859, + "grad_norm": 0.7312847375869751, + "learning_rate": 8.089824278965233e-06, + "loss": 0.7395, + "step": 10535 + }, + { + "epoch": 0.5798888216192416, + "grad_norm": 0.7281426191329956, + "learning_rate": 8.089483473419992e-06, + "loss": 0.7677, + "step": 10536 + }, + { + "epoch": 0.5799438604215972, + "grad_norm": 0.7392409443855286, + "learning_rate": 8.08914264465502e-06, + "loss": 0.7674, + "step": 10537 + }, + { + "epoch": 0.5799988992239529, + "grad_norm": 0.7041863799095154, + "learning_rate": 8.088801792672877e-06, + "loss": 0.6156, + "step": 10538 + }, + { + "epoch": 0.5800539380263086, + "grad_norm": 0.7113755345344543, + "learning_rate": 8.088460917476128e-06, + "loss": 0.7677, + "step": 10539 + }, + { + "epoch": 0.5801089768286642, + "grad_norm": 0.673966646194458, + "learning_rate": 8.088120019067334e-06, + "loss": 0.7557, + "step": 10540 + }, + { + "epoch": 0.5801640156310198, + "grad_norm": 0.8165854215621948, + "learning_rate": 8.087779097449055e-06, + "loss": 0.8102, + "step": 10541 + }, + { + "epoch": 0.5802190544333755, + "grad_norm": 0.7010880708694458, + "learning_rate": 8.087438152623857e-06, + "loss": 0.7816, + "step": 10542 + }, + { + "epoch": 0.5802740932357312, + "grad_norm": 0.726177990436554, + "learning_rate": 8.0870971845943e-06, + "loss": 0.7671, + "step": 10543 + }, + { + "epoch": 0.5803291320380869, + "grad_norm": 0.7403919696807861, + "learning_rate": 8.086756193362946e-06, + "loss": 0.8449, + "step": 10544 + }, + { + "epoch": 0.5803841708404425, + "grad_norm": 0.6897104382514954, + "learning_rate": 8.086415178932358e-06, + "loss": 0.7563, + "step": 10545 + }, + { + "epoch": 0.5804392096427982, + "grad_norm": 0.7682604193687439, + "learning_rate": 8.0860741413051e-06, + "loss": 0.8019, + "step": 10546 + }, + { + "epoch": 0.5804942484451538, + "grad_norm": 0.7317522168159485, + "learning_rate": 8.085733080483736e-06, + "loss": 0.7446, + "step": 10547 + }, + { + "epoch": 0.5805492872475095, + "grad_norm": 0.8503430485725403, + "learning_rate": 8.085391996470826e-06, + "loss": 0.7343, + "step": 10548 + }, + { + "epoch": 0.5806043260498651, + "grad_norm": 0.8550657629966736, + "learning_rate": 8.085050889268937e-06, + "loss": 0.9267, + "step": 10549 + }, + { + "epoch": 0.5806593648522208, + "grad_norm": 0.7751224637031555, + "learning_rate": 8.084709758880633e-06, + "loss": 0.7404, + "step": 10550 + }, + { + "epoch": 0.5807144036545765, + "grad_norm": 0.6346186399459839, + "learning_rate": 8.084368605308475e-06, + "loss": 0.66, + "step": 10551 + }, + { + "epoch": 0.5807694424569322, + "grad_norm": 0.7295717597007751, + "learning_rate": 8.084027428555027e-06, + "loss": 0.8313, + "step": 10552 + }, + { + "epoch": 0.5808244812592878, + "grad_norm": 0.6962289810180664, + "learning_rate": 8.083686228622856e-06, + "loss": 0.7871, + "step": 10553 + }, + { + "epoch": 0.5808795200616435, + "grad_norm": 0.6968896389007568, + "learning_rate": 8.083345005514522e-06, + "loss": 0.7261, + "step": 10554 + }, + { + "epoch": 0.5809345588639991, + "grad_norm": 0.8374869227409363, + "learning_rate": 8.083003759232595e-06, + "loss": 0.797, + "step": 10555 + }, + { + "epoch": 0.5809895976663548, + "grad_norm": 0.6511034369468689, + "learning_rate": 8.082662489779637e-06, + "loss": 0.7237, + "step": 10556 + }, + { + "epoch": 0.5810446364687104, + "grad_norm": 0.6644287705421448, + "learning_rate": 8.082321197158212e-06, + "loss": 0.6969, + "step": 10557 + }, + { + "epoch": 0.5810996752710661, + "grad_norm": 0.7681102752685547, + "learning_rate": 8.081979881370884e-06, + "loss": 0.7193, + "step": 10558 + }, + { + "epoch": 0.5811547140734218, + "grad_norm": 0.7930792570114136, + "learning_rate": 8.081638542420224e-06, + "loss": 0.7198, + "step": 10559 + }, + { + "epoch": 0.5812097528757775, + "grad_norm": 0.7227992415428162, + "learning_rate": 8.081297180308791e-06, + "loss": 0.7533, + "step": 10560 + }, + { + "epoch": 0.581264791678133, + "grad_norm": 0.7293071150779724, + "learning_rate": 8.080955795039156e-06, + "loss": 0.6228, + "step": 10561 + }, + { + "epoch": 0.5813198304804887, + "grad_norm": 0.7356483936309814, + "learning_rate": 8.080614386613879e-06, + "loss": 0.7299, + "step": 10562 + }, + { + "epoch": 0.5813748692828444, + "grad_norm": 0.8181473016738892, + "learning_rate": 8.080272955035531e-06, + "loss": 0.6576, + "step": 10563 + }, + { + "epoch": 0.5814299080852001, + "grad_norm": 0.7066958546638489, + "learning_rate": 8.079931500306675e-06, + "loss": 0.7372, + "step": 10564 + }, + { + "epoch": 0.5814849468875557, + "grad_norm": 0.6821097135543823, + "learning_rate": 8.079590022429877e-06, + "loss": 0.7516, + "step": 10565 + }, + { + "epoch": 0.5815399856899114, + "grad_norm": 0.6879069209098816, + "learning_rate": 8.079248521407707e-06, + "loss": 0.7525, + "step": 10566 + }, + { + "epoch": 0.5815950244922671, + "grad_norm": 0.956345796585083, + "learning_rate": 8.078906997242729e-06, + "loss": 0.8175, + "step": 10567 + }, + { + "epoch": 0.5816500632946227, + "grad_norm": 0.6942328214645386, + "learning_rate": 8.078565449937508e-06, + "loss": 0.6264, + "step": 10568 + }, + { + "epoch": 0.5817051020969783, + "grad_norm": 0.7073766589164734, + "learning_rate": 8.078223879494615e-06, + "loss": 0.766, + "step": 10569 + }, + { + "epoch": 0.581760140899334, + "grad_norm": 0.7649571895599365, + "learning_rate": 8.077882285916614e-06, + "loss": 0.8767, + "step": 10570 + }, + { + "epoch": 0.5818151797016897, + "grad_norm": 0.6384355425834656, + "learning_rate": 8.077540669206076e-06, + "loss": 0.7444, + "step": 10571 + }, + { + "epoch": 0.5818702185040453, + "grad_norm": 0.7173928022384644, + "learning_rate": 8.077199029365565e-06, + "loss": 0.8277, + "step": 10572 + }, + { + "epoch": 0.581925257306401, + "grad_norm": 0.7310757637023926, + "learning_rate": 8.076857366397648e-06, + "loss": 0.8425, + "step": 10573 + }, + { + "epoch": 0.5819802961087567, + "grad_norm": 0.6888872385025024, + "learning_rate": 8.076515680304897e-06, + "loss": 0.6961, + "step": 10574 + }, + { + "epoch": 0.5820353349111124, + "grad_norm": 0.7290124297142029, + "learning_rate": 8.076173971089877e-06, + "loss": 0.7865, + "step": 10575 + }, + { + "epoch": 0.582090373713468, + "grad_norm": 0.7402634024620056, + "learning_rate": 8.075832238755156e-06, + "loss": 0.7196, + "step": 10576 + }, + { + "epoch": 0.5821454125158236, + "grad_norm": 0.74916672706604, + "learning_rate": 8.075490483303305e-06, + "loss": 0.8361, + "step": 10577 + }, + { + "epoch": 0.5822004513181793, + "grad_norm": 0.8146494626998901, + "learning_rate": 8.07514870473689e-06, + "loss": 0.7398, + "step": 10578 + }, + { + "epoch": 0.582255490120535, + "grad_norm": 0.6632487177848816, + "learning_rate": 8.07480690305848e-06, + "loss": 0.7239, + "step": 10579 + }, + { + "epoch": 0.5823105289228906, + "grad_norm": 0.6912766695022583, + "learning_rate": 8.074465078270645e-06, + "loss": 0.7488, + "step": 10580 + }, + { + "epoch": 0.5823655677252463, + "grad_norm": 0.7410522699356079, + "learning_rate": 8.074123230375952e-06, + "loss": 0.7413, + "step": 10581 + }, + { + "epoch": 0.582420606527602, + "grad_norm": 0.7932689189910889, + "learning_rate": 8.073781359376972e-06, + "loss": 0.7894, + "step": 10582 + }, + { + "epoch": 0.5824756453299577, + "grad_norm": 0.6710309982299805, + "learning_rate": 8.073439465276277e-06, + "loss": 0.6727, + "step": 10583 + }, + { + "epoch": 0.5825306841323132, + "grad_norm": 0.7457143068313599, + "learning_rate": 8.07309754807643e-06, + "loss": 0.6719, + "step": 10584 + }, + { + "epoch": 0.5825857229346689, + "grad_norm": 0.7340453863143921, + "learning_rate": 8.072755607780008e-06, + "loss": 0.7397, + "step": 10585 + }, + { + "epoch": 0.5826407617370246, + "grad_norm": 0.7532176971435547, + "learning_rate": 8.072413644389574e-06, + "loss": 0.7368, + "step": 10586 + }, + { + "epoch": 0.5826958005393803, + "grad_norm": 0.9317812919616699, + "learning_rate": 8.072071657907703e-06, + "loss": 0.9113, + "step": 10587 + }, + { + "epoch": 0.5827508393417359, + "grad_norm": 0.8535491228103638, + "learning_rate": 8.071729648336963e-06, + "loss": 0.7708, + "step": 10588 + }, + { + "epoch": 0.5828058781440916, + "grad_norm": 0.6720348000526428, + "learning_rate": 8.071387615679926e-06, + "loss": 0.7521, + "step": 10589 + }, + { + "epoch": 0.5828609169464473, + "grad_norm": 0.7113864421844482, + "learning_rate": 8.071045559939162e-06, + "loss": 0.8713, + "step": 10590 + }, + { + "epoch": 0.582915955748803, + "grad_norm": 0.7760024070739746, + "learning_rate": 8.070703481117242e-06, + "loss": 0.7567, + "step": 10591 + }, + { + "epoch": 0.5829709945511585, + "grad_norm": 0.9548617005348206, + "learning_rate": 8.070361379216735e-06, + "loss": 0.7937, + "step": 10592 + }, + { + "epoch": 0.5830260333535142, + "grad_norm": 0.7796840667724609, + "learning_rate": 8.070019254240216e-06, + "loss": 0.7485, + "step": 10593 + }, + { + "epoch": 0.5830810721558699, + "grad_norm": 0.7006514668464661, + "learning_rate": 8.069677106190253e-06, + "loss": 0.7813, + "step": 10594 + }, + { + "epoch": 0.5831361109582256, + "grad_norm": 0.646396279335022, + "learning_rate": 8.069334935069417e-06, + "loss": 0.7437, + "step": 10595 + }, + { + "epoch": 0.5831911497605812, + "grad_norm": 0.8257368206977844, + "learning_rate": 8.068992740880283e-06, + "loss": 0.7351, + "step": 10596 + }, + { + "epoch": 0.5832461885629369, + "grad_norm": 0.6646208763122559, + "learning_rate": 8.068650523625422e-06, + "loss": 0.6554, + "step": 10597 + }, + { + "epoch": 0.5833012273652926, + "grad_norm": 0.8495579957962036, + "learning_rate": 8.068308283307402e-06, + "loss": 0.791, + "step": 10598 + }, + { + "epoch": 0.5833562661676482, + "grad_norm": 0.7283076047897339, + "learning_rate": 8.0679660199288e-06, + "loss": 0.7327, + "step": 10599 + }, + { + "epoch": 0.5834113049700038, + "grad_norm": 0.704572856426239, + "learning_rate": 8.067623733492187e-06, + "loss": 0.6094, + "step": 10600 + }, + { + "epoch": 0.5834663437723595, + "grad_norm": 0.6435144543647766, + "learning_rate": 8.067281424000136e-06, + "loss": 0.6974, + "step": 10601 + }, + { + "epoch": 0.5835213825747152, + "grad_norm": 0.9628346562385559, + "learning_rate": 8.066939091455215e-06, + "loss": 0.8933, + "step": 10602 + }, + { + "epoch": 0.5835764213770709, + "grad_norm": 0.6856930255889893, + "learning_rate": 8.066596735860004e-06, + "loss": 0.7414, + "step": 10603 + }, + { + "epoch": 0.5836314601794265, + "grad_norm": 0.7341175675392151, + "learning_rate": 8.066254357217072e-06, + "loss": 0.7553, + "step": 10604 + }, + { + "epoch": 0.5836864989817822, + "grad_norm": 0.7124871611595154, + "learning_rate": 8.065911955528995e-06, + "loss": 0.663, + "step": 10605 + }, + { + "epoch": 0.5837415377841378, + "grad_norm": 0.816028892993927, + "learning_rate": 8.065569530798341e-06, + "loss": 0.8778, + "step": 10606 + }, + { + "epoch": 0.5837965765864935, + "grad_norm": 0.8735721111297607, + "learning_rate": 8.06522708302769e-06, + "loss": 0.7866, + "step": 10607 + }, + { + "epoch": 0.5838516153888491, + "grad_norm": 0.6780036687850952, + "learning_rate": 8.06488461221961e-06, + "loss": 0.7329, + "step": 10608 + }, + { + "epoch": 0.5839066541912048, + "grad_norm": 0.7624822854995728, + "learning_rate": 8.06454211837668e-06, + "loss": 0.8095, + "step": 10609 + }, + { + "epoch": 0.5839616929935605, + "grad_norm": 0.8269234895706177, + "learning_rate": 8.06419960150147e-06, + "loss": 0.7194, + "step": 10610 + }, + { + "epoch": 0.5840167317959161, + "grad_norm": 0.6748649477958679, + "learning_rate": 8.063857061596558e-06, + "loss": 0.702, + "step": 10611 + }, + { + "epoch": 0.5840717705982718, + "grad_norm": 0.9700273275375366, + "learning_rate": 8.063514498664515e-06, + "loss": 0.7917, + "step": 10612 + }, + { + "epoch": 0.5841268094006274, + "grad_norm": 0.7798827290534973, + "learning_rate": 8.063171912707916e-06, + "loss": 0.798, + "step": 10613 + }, + { + "epoch": 0.5841818482029831, + "grad_norm": 0.6613249778747559, + "learning_rate": 8.06282930372934e-06, + "loss": 0.7216, + "step": 10614 + }, + { + "epoch": 0.5842368870053387, + "grad_norm": 0.727116048336029, + "learning_rate": 8.062486671731357e-06, + "loss": 0.8054, + "step": 10615 + }, + { + "epoch": 0.5842919258076944, + "grad_norm": 0.6704444289207458, + "learning_rate": 8.062144016716543e-06, + "loss": 0.7503, + "step": 10616 + }, + { + "epoch": 0.5843469646100501, + "grad_norm": 0.6867938041687012, + "learning_rate": 8.061801338687477e-06, + "loss": 0.8005, + "step": 10617 + }, + { + "epoch": 0.5844020034124058, + "grad_norm": 0.7097555994987488, + "learning_rate": 8.061458637646729e-06, + "loss": 0.8515, + "step": 10618 + }, + { + "epoch": 0.5844570422147614, + "grad_norm": 0.6624881625175476, + "learning_rate": 8.061115913596878e-06, + "loss": 0.7735, + "step": 10619 + }, + { + "epoch": 0.584512081017117, + "grad_norm": 0.6649004220962524, + "learning_rate": 8.060773166540498e-06, + "loss": 0.7837, + "step": 10620 + }, + { + "epoch": 0.5845671198194727, + "grad_norm": 0.6732968091964722, + "learning_rate": 8.06043039648017e-06, + "loss": 0.7846, + "step": 10621 + }, + { + "epoch": 0.5846221586218284, + "grad_norm": 0.7551947236061096, + "learning_rate": 8.060087603418464e-06, + "loss": 0.6868, + "step": 10622 + }, + { + "epoch": 0.584677197424184, + "grad_norm": 0.7781728506088257, + "learning_rate": 8.059744787357959e-06, + "loss": 0.8088, + "step": 10623 + }, + { + "epoch": 0.5847322362265397, + "grad_norm": 0.6362790465354919, + "learning_rate": 8.05940194830123e-06, + "loss": 0.664, + "step": 10624 + }, + { + "epoch": 0.5847872750288954, + "grad_norm": 0.670386791229248, + "learning_rate": 8.059059086250856e-06, + "loss": 0.6839, + "step": 10625 + }, + { + "epoch": 0.5848423138312511, + "grad_norm": 0.7030045986175537, + "learning_rate": 8.058716201209414e-06, + "loss": 0.7243, + "step": 10626 + }, + { + "epoch": 0.5848973526336066, + "grad_norm": 0.7881805896759033, + "learning_rate": 8.058373293179477e-06, + "loss": 0.7994, + "step": 10627 + }, + { + "epoch": 0.5849523914359623, + "grad_norm": 0.7077344059944153, + "learning_rate": 8.058030362163628e-06, + "loss": 0.822, + "step": 10628 + }, + { + "epoch": 0.585007430238318, + "grad_norm": 0.6787039637565613, + "learning_rate": 8.057687408164439e-06, + "loss": 0.7619, + "step": 10629 + }, + { + "epoch": 0.5850624690406737, + "grad_norm": 1.1377217769622803, + "learning_rate": 8.05734443118449e-06, + "loss": 0.8632, + "step": 10630 + }, + { + "epoch": 0.5851175078430293, + "grad_norm": 0.7002600431442261, + "learning_rate": 8.05700143122636e-06, + "loss": 0.8184, + "step": 10631 + }, + { + "epoch": 0.585172546645385, + "grad_norm": 0.7016324400901794, + "learning_rate": 8.056658408292626e-06, + "loss": 0.658, + "step": 10632 + }, + { + "epoch": 0.5852275854477407, + "grad_norm": 0.6674843430519104, + "learning_rate": 8.056315362385864e-06, + "loss": 0.7281, + "step": 10633 + }, + { + "epoch": 0.5852826242500964, + "grad_norm": 0.6789288520812988, + "learning_rate": 8.055972293508653e-06, + "loss": 0.8192, + "step": 10634 + }, + { + "epoch": 0.5853376630524519, + "grad_norm": 0.6740062236785889, + "learning_rate": 8.055629201663575e-06, + "loss": 0.7343, + "step": 10635 + }, + { + "epoch": 0.5853927018548076, + "grad_norm": 0.7417730689048767, + "learning_rate": 8.055286086853204e-06, + "loss": 0.8161, + "step": 10636 + }, + { + "epoch": 0.5854477406571633, + "grad_norm": 0.6680465340614319, + "learning_rate": 8.054942949080122e-06, + "loss": 0.7589, + "step": 10637 + }, + { + "epoch": 0.585502779459519, + "grad_norm": 0.7205108404159546, + "learning_rate": 8.054599788346904e-06, + "loss": 0.6837, + "step": 10638 + }, + { + "epoch": 0.5855578182618746, + "grad_norm": 0.8694404363632202, + "learning_rate": 8.054256604656134e-06, + "loss": 0.8033, + "step": 10639 + }, + { + "epoch": 0.5856128570642303, + "grad_norm": 0.685471773147583, + "learning_rate": 8.053913398010389e-06, + "loss": 0.7654, + "step": 10640 + }, + { + "epoch": 0.585667895866586, + "grad_norm": 1.3463424444198608, + "learning_rate": 8.053570168412249e-06, + "loss": 0.7743, + "step": 10641 + }, + { + "epoch": 0.5857229346689417, + "grad_norm": 0.9380106329917908, + "learning_rate": 8.05322691586429e-06, + "loss": 0.8984, + "step": 10642 + }, + { + "epoch": 0.5857779734712972, + "grad_norm": 0.7408519387245178, + "learning_rate": 8.052883640369096e-06, + "loss": 0.7716, + "step": 10643 + }, + { + "epoch": 0.5858330122736529, + "grad_norm": 0.7712904214859009, + "learning_rate": 8.052540341929248e-06, + "loss": 0.7767, + "step": 10644 + }, + { + "epoch": 0.5858880510760086, + "grad_norm": 0.8464158177375793, + "learning_rate": 8.052197020547321e-06, + "loss": 0.8333, + "step": 10645 + }, + { + "epoch": 0.5859430898783643, + "grad_norm": 0.6970158219337463, + "learning_rate": 8.0518536762259e-06, + "loss": 0.7354, + "step": 10646 + }, + { + "epoch": 0.5859981286807199, + "grad_norm": 0.7048965096473694, + "learning_rate": 8.051510308967563e-06, + "loss": 0.8333, + "step": 10647 + }, + { + "epoch": 0.5860531674830756, + "grad_norm": 0.6443868279457092, + "learning_rate": 8.05116691877489e-06, + "loss": 0.7386, + "step": 10648 + }, + { + "epoch": 0.5861082062854313, + "grad_norm": 0.6653542518615723, + "learning_rate": 8.050823505650465e-06, + "loss": 0.8116, + "step": 10649 + }, + { + "epoch": 0.5861632450877869, + "grad_norm": 0.7293158769607544, + "learning_rate": 8.050480069596868e-06, + "loss": 0.8231, + "step": 10650 + }, + { + "epoch": 0.5862182838901425, + "grad_norm": 0.6876117587089539, + "learning_rate": 8.050136610616676e-06, + "loss": 0.7856, + "step": 10651 + }, + { + "epoch": 0.5862733226924982, + "grad_norm": 0.6811665296554565, + "learning_rate": 8.049793128712477e-06, + "loss": 0.7667, + "step": 10652 + }, + { + "epoch": 0.5863283614948539, + "grad_norm": 0.701034426689148, + "learning_rate": 8.049449623886849e-06, + "loss": 0.7812, + "step": 10653 + }, + { + "epoch": 0.5863834002972095, + "grad_norm": 0.6872833967208862, + "learning_rate": 8.049106096142372e-06, + "loss": 0.755, + "step": 10654 + }, + { + "epoch": 0.5864384390995652, + "grad_norm": 0.6643580198287964, + "learning_rate": 8.04876254548163e-06, + "loss": 0.7692, + "step": 10655 + }, + { + "epoch": 0.5864934779019209, + "grad_norm": 0.6672106981277466, + "learning_rate": 8.048418971907206e-06, + "loss": 0.7424, + "step": 10656 + }, + { + "epoch": 0.5865485167042765, + "grad_norm": 0.8030515313148499, + "learning_rate": 8.04807537542168e-06, + "loss": 0.8074, + "step": 10657 + }, + { + "epoch": 0.5866035555066321, + "grad_norm": 0.713417112827301, + "learning_rate": 8.047731756027637e-06, + "loss": 0.6974, + "step": 10658 + }, + { + "epoch": 0.5866585943089878, + "grad_norm": 0.7715572118759155, + "learning_rate": 8.047388113727657e-06, + "loss": 0.7353, + "step": 10659 + }, + { + "epoch": 0.5867136331113435, + "grad_norm": 0.7009812593460083, + "learning_rate": 8.047044448524323e-06, + "loss": 0.7992, + "step": 10660 + }, + { + "epoch": 0.5867686719136992, + "grad_norm": 0.6425079107284546, + "learning_rate": 8.046700760420219e-06, + "loss": 0.7394, + "step": 10661 + }, + { + "epoch": 0.5868237107160548, + "grad_norm": 0.7713460922241211, + "learning_rate": 8.046357049417927e-06, + "loss": 0.7759, + "step": 10662 + }, + { + "epoch": 0.5868787495184105, + "grad_norm": 0.7310347557067871, + "learning_rate": 8.046013315520033e-06, + "loss": 0.7278, + "step": 10663 + }, + { + "epoch": 0.5869337883207661, + "grad_norm": 0.7493315935134888, + "learning_rate": 8.045669558729117e-06, + "loss": 0.7808, + "step": 10664 + }, + { + "epoch": 0.5869888271231218, + "grad_norm": 0.7547439336776733, + "learning_rate": 8.045325779047763e-06, + "loss": 0.8245, + "step": 10665 + }, + { + "epoch": 0.5870438659254774, + "grad_norm": 0.7556985020637512, + "learning_rate": 8.044981976478557e-06, + "loss": 0.8, + "step": 10666 + }, + { + "epoch": 0.5870989047278331, + "grad_norm": 0.8330736756324768, + "learning_rate": 8.04463815102408e-06, + "loss": 0.8177, + "step": 10667 + }, + { + "epoch": 0.5871539435301888, + "grad_norm": 0.7823941111564636, + "learning_rate": 8.04429430268692e-06, + "loss": 0.8306, + "step": 10668 + }, + { + "epoch": 0.5872089823325445, + "grad_norm": 0.9141719937324524, + "learning_rate": 8.043950431469657e-06, + "loss": 0.9137, + "step": 10669 + }, + { + "epoch": 0.5872640211349001, + "grad_norm": 0.6967095732688904, + "learning_rate": 8.043606537374878e-06, + "loss": 0.7262, + "step": 10670 + }, + { + "epoch": 0.5873190599372557, + "grad_norm": 0.7909649014472961, + "learning_rate": 8.043262620405166e-06, + "loss": 0.8332, + "step": 10671 + }, + { + "epoch": 0.5873740987396114, + "grad_norm": 0.7967168092727661, + "learning_rate": 8.042918680563107e-06, + "loss": 0.7966, + "step": 10672 + }, + { + "epoch": 0.5874291375419671, + "grad_norm": 0.7637625336647034, + "learning_rate": 8.042574717851287e-06, + "loss": 0.8322, + "step": 10673 + }, + { + "epoch": 0.5874841763443227, + "grad_norm": 0.6968004107475281, + "learning_rate": 8.04223073227229e-06, + "loss": 0.8061, + "step": 10674 + }, + { + "epoch": 0.5875392151466784, + "grad_norm": 0.7325586080551147, + "learning_rate": 8.0418867238287e-06, + "loss": 0.7922, + "step": 10675 + }, + { + "epoch": 0.5875942539490341, + "grad_norm": 0.6784406304359436, + "learning_rate": 8.041542692523103e-06, + "loss": 0.7327, + "step": 10676 + }, + { + "epoch": 0.5876492927513898, + "grad_norm": 0.8297861218452454, + "learning_rate": 8.041198638358088e-06, + "loss": 0.9347, + "step": 10677 + }, + { + "epoch": 0.5877043315537454, + "grad_norm": 0.6227413415908813, + "learning_rate": 8.040854561336236e-06, + "loss": 0.655, + "step": 10678 + }, + { + "epoch": 0.587759370356101, + "grad_norm": 0.752098023891449, + "learning_rate": 8.040510461460134e-06, + "loss": 0.7608, + "step": 10679 + }, + { + "epoch": 0.5878144091584567, + "grad_norm": 0.7008342146873474, + "learning_rate": 8.040166338732372e-06, + "loss": 0.7385, + "step": 10680 + }, + { + "epoch": 0.5878694479608124, + "grad_norm": 0.6768027544021606, + "learning_rate": 8.039822193155532e-06, + "loss": 0.6812, + "step": 10681 + }, + { + "epoch": 0.587924486763168, + "grad_norm": 0.7728545069694519, + "learning_rate": 8.039478024732203e-06, + "loss": 0.7696, + "step": 10682 + }, + { + "epoch": 0.5879795255655237, + "grad_norm": 0.7257505655288696, + "learning_rate": 8.03913383346497e-06, + "loss": 0.6686, + "step": 10683 + }, + { + "epoch": 0.5880345643678794, + "grad_norm": 0.7755837440490723, + "learning_rate": 8.03878961935642e-06, + "loss": 0.8469, + "step": 10684 + }, + { + "epoch": 0.5880896031702351, + "grad_norm": 0.7187668085098267, + "learning_rate": 8.038445382409142e-06, + "loss": 0.8249, + "step": 10685 + }, + { + "epoch": 0.5881446419725906, + "grad_norm": 0.638053834438324, + "learning_rate": 8.038101122625722e-06, + "loss": 0.6876, + "step": 10686 + }, + { + "epoch": 0.5881996807749463, + "grad_norm": 0.7323756217956543, + "learning_rate": 8.037756840008746e-06, + "loss": 0.7489, + "step": 10687 + }, + { + "epoch": 0.588254719577302, + "grad_norm": 0.6795439720153809, + "learning_rate": 8.037412534560804e-06, + "loss": 0.7246, + "step": 10688 + }, + { + "epoch": 0.5883097583796577, + "grad_norm": 0.8136376142501831, + "learning_rate": 8.037068206284482e-06, + "loss": 0.8518, + "step": 10689 + }, + { + "epoch": 0.5883647971820133, + "grad_norm": 0.6484195590019226, + "learning_rate": 8.036723855182367e-06, + "loss": 0.7018, + "step": 10690 + }, + { + "epoch": 0.588419835984369, + "grad_norm": 0.7465028166770935, + "learning_rate": 8.036379481257048e-06, + "loss": 0.8276, + "step": 10691 + }, + { + "epoch": 0.5884748747867247, + "grad_norm": 0.7761173844337463, + "learning_rate": 8.036035084511116e-06, + "loss": 0.6371, + "step": 10692 + }, + { + "epoch": 0.5885299135890804, + "grad_norm": 0.830008864402771, + "learning_rate": 8.035690664947156e-06, + "loss": 0.8199, + "step": 10693 + }, + { + "epoch": 0.5885849523914359, + "grad_norm": 0.6614254117012024, + "learning_rate": 8.03534622256776e-06, + "loss": 0.656, + "step": 10694 + }, + { + "epoch": 0.5886399911937916, + "grad_norm": 0.7229047417640686, + "learning_rate": 8.035001757375509e-06, + "loss": 0.7622, + "step": 10695 + }, + { + "epoch": 0.5886950299961473, + "grad_norm": 0.7044325470924377, + "learning_rate": 8.034657269373001e-06, + "loss": 0.7678, + "step": 10696 + }, + { + "epoch": 0.5887500687985029, + "grad_norm": 0.7109018564224243, + "learning_rate": 8.03431275856282e-06, + "loss": 0.7976, + "step": 10697 + }, + { + "epoch": 0.5888051076008586, + "grad_norm": 0.7812879085540771, + "learning_rate": 8.033968224947557e-06, + "loss": 0.7163, + "step": 10698 + }, + { + "epoch": 0.5888601464032143, + "grad_norm": 0.7408469915390015, + "learning_rate": 8.033623668529802e-06, + "loss": 0.6895, + "step": 10699 + }, + { + "epoch": 0.58891518520557, + "grad_norm": 0.7654302716255188, + "learning_rate": 8.033279089312142e-06, + "loss": 0.8126, + "step": 10700 + }, + { + "epoch": 0.5889702240079255, + "grad_norm": 0.7307846546173096, + "learning_rate": 8.032934487297169e-06, + "loss": 0.7958, + "step": 10701 + }, + { + "epoch": 0.5890252628102812, + "grad_norm": 0.6658591032028198, + "learning_rate": 8.032589862487472e-06, + "loss": 0.717, + "step": 10702 + }, + { + "epoch": 0.5890803016126369, + "grad_norm": 1.4167139530181885, + "learning_rate": 8.03224521488564e-06, + "loss": 0.8599, + "step": 10703 + }, + { + "epoch": 0.5891353404149926, + "grad_norm": 0.6723609566688538, + "learning_rate": 8.031900544494266e-06, + "loss": 0.8167, + "step": 10704 + }, + { + "epoch": 0.5891903792173482, + "grad_norm": 0.6420501470565796, + "learning_rate": 8.03155585131594e-06, + "loss": 0.692, + "step": 10705 + }, + { + "epoch": 0.5892454180197039, + "grad_norm": 0.6973454356193542, + "learning_rate": 8.031211135353251e-06, + "loss": 0.7709, + "step": 10706 + }, + { + "epoch": 0.5893004568220596, + "grad_norm": 0.7752252221107483, + "learning_rate": 8.03086639660879e-06, + "loss": 0.7795, + "step": 10707 + }, + { + "epoch": 0.5893554956244152, + "grad_norm": 0.8193135857582092, + "learning_rate": 8.030521635085149e-06, + "loss": 0.812, + "step": 10708 + }, + { + "epoch": 0.5894105344267708, + "grad_norm": 0.7976878881454468, + "learning_rate": 8.03017685078492e-06, + "loss": 0.8039, + "step": 10709 + }, + { + "epoch": 0.5894655732291265, + "grad_norm": 0.7545839548110962, + "learning_rate": 8.02983204371069e-06, + "loss": 0.8238, + "step": 10710 + }, + { + "epoch": 0.5895206120314822, + "grad_norm": 0.6544732451438904, + "learning_rate": 8.029487213865054e-06, + "loss": 0.7471, + "step": 10711 + }, + { + "epoch": 0.5895756508338379, + "grad_norm": 0.7054508924484253, + "learning_rate": 8.029142361250603e-06, + "loss": 0.8283, + "step": 10712 + }, + { + "epoch": 0.5896306896361935, + "grad_norm": 0.7425236105918884, + "learning_rate": 8.02879748586993e-06, + "loss": 0.8031, + "step": 10713 + }, + { + "epoch": 0.5896857284385492, + "grad_norm": 0.8390052318572998, + "learning_rate": 8.028452587725626e-06, + "loss": 0.7218, + "step": 10714 + }, + { + "epoch": 0.5897407672409049, + "grad_norm": 0.8116903901100159, + "learning_rate": 8.028107666820282e-06, + "loss": 0.8057, + "step": 10715 + }, + { + "epoch": 0.5897958060432605, + "grad_norm": 0.602308452129364, + "learning_rate": 8.027762723156492e-06, + "loss": 0.6428, + "step": 10716 + }, + { + "epoch": 0.5898508448456161, + "grad_norm": 0.7480159401893616, + "learning_rate": 8.027417756736848e-06, + "loss": 0.7566, + "step": 10717 + }, + { + "epoch": 0.5899058836479718, + "grad_norm": 0.6823177933692932, + "learning_rate": 8.027072767563943e-06, + "loss": 0.8337, + "step": 10718 + }, + { + "epoch": 0.5899609224503275, + "grad_norm": 0.6841796040534973, + "learning_rate": 8.026727755640367e-06, + "loss": 0.751, + "step": 10719 + }, + { + "epoch": 0.5900159612526832, + "grad_norm": 0.7257139086723328, + "learning_rate": 8.026382720968718e-06, + "loss": 0.7373, + "step": 10720 + }, + { + "epoch": 0.5900710000550388, + "grad_norm": 0.6318400502204895, + "learning_rate": 8.026037663551584e-06, + "loss": 0.7205, + "step": 10721 + }, + { + "epoch": 0.5901260388573945, + "grad_norm": 0.6612908840179443, + "learning_rate": 8.025692583391564e-06, + "loss": 0.7613, + "step": 10722 + }, + { + "epoch": 0.5901810776597501, + "grad_norm": 0.7555351853370667, + "learning_rate": 8.025347480491246e-06, + "loss": 0.718, + "step": 10723 + }, + { + "epoch": 0.5902361164621058, + "grad_norm": 0.6944366097450256, + "learning_rate": 8.025002354853227e-06, + "loss": 0.7775, + "step": 10724 + }, + { + "epoch": 0.5902911552644614, + "grad_norm": 0.6968230605125427, + "learning_rate": 8.0246572064801e-06, + "loss": 0.7316, + "step": 10725 + }, + { + "epoch": 0.5903461940668171, + "grad_norm": 0.7083567380905151, + "learning_rate": 8.024312035374459e-06, + "loss": 0.7844, + "step": 10726 + }, + { + "epoch": 0.5904012328691728, + "grad_norm": 0.7183080315589905, + "learning_rate": 8.0239668415389e-06, + "loss": 0.8308, + "step": 10727 + }, + { + "epoch": 0.5904562716715285, + "grad_norm": 0.8350495100021362, + "learning_rate": 8.023621624976014e-06, + "loss": 0.9077, + "step": 10728 + }, + { + "epoch": 0.590511310473884, + "grad_norm": 0.6876987218856812, + "learning_rate": 8.023276385688396e-06, + "loss": 0.7483, + "step": 10729 + }, + { + "epoch": 0.5905663492762397, + "grad_norm": 0.8617128133773804, + "learning_rate": 8.022931123678646e-06, + "loss": 0.7058, + "step": 10730 + }, + { + "epoch": 0.5906213880785954, + "grad_norm": 0.6921959519386292, + "learning_rate": 8.02258583894935e-06, + "loss": 0.7542, + "step": 10731 + }, + { + "epoch": 0.5906764268809511, + "grad_norm": 0.7394077181816101, + "learning_rate": 8.02224053150311e-06, + "loss": 0.7761, + "step": 10732 + }, + { + "epoch": 0.5907314656833067, + "grad_norm": 0.6672187447547913, + "learning_rate": 8.02189520134252e-06, + "loss": 0.6904, + "step": 10733 + }, + { + "epoch": 0.5907865044856624, + "grad_norm": 0.7498076558113098, + "learning_rate": 8.021549848470174e-06, + "loss": 0.7994, + "step": 10734 + }, + { + "epoch": 0.5908415432880181, + "grad_norm": 0.699832558631897, + "learning_rate": 8.021204472888669e-06, + "loss": 0.7413, + "step": 10735 + }, + { + "epoch": 0.5908965820903738, + "grad_norm": 0.7628722190856934, + "learning_rate": 8.020859074600598e-06, + "loss": 0.8202, + "step": 10736 + }, + { + "epoch": 0.5909516208927293, + "grad_norm": 0.8023744225502014, + "learning_rate": 8.020513653608558e-06, + "loss": 0.8225, + "step": 10737 + }, + { + "epoch": 0.591006659695085, + "grad_norm": 0.7283689379692078, + "learning_rate": 8.02016820991515e-06, + "loss": 0.6706, + "step": 10738 + }, + { + "epoch": 0.5910616984974407, + "grad_norm": 0.7199996113777161, + "learning_rate": 8.019822743522962e-06, + "loss": 0.8258, + "step": 10739 + }, + { + "epoch": 0.5911167372997963, + "grad_norm": 0.623249888420105, + "learning_rate": 8.019477254434598e-06, + "loss": 0.6188, + "step": 10740 + }, + { + "epoch": 0.591171776102152, + "grad_norm": 0.7331949472427368, + "learning_rate": 8.01913174265265e-06, + "loss": 0.8013, + "step": 10741 + }, + { + "epoch": 0.5912268149045077, + "grad_norm": 0.7003010511398315, + "learning_rate": 8.018786208179716e-06, + "loss": 0.8305, + "step": 10742 + }, + { + "epoch": 0.5912818537068634, + "grad_norm": 0.6879638433456421, + "learning_rate": 8.01844065101839e-06, + "loss": 0.7622, + "step": 10743 + }, + { + "epoch": 0.591336892509219, + "grad_norm": 0.6597324013710022, + "learning_rate": 8.018095071171276e-06, + "loss": 0.7362, + "step": 10744 + }, + { + "epoch": 0.5913919313115746, + "grad_norm": 0.664905846118927, + "learning_rate": 8.017749468640967e-06, + "loss": 0.7629, + "step": 10745 + }, + { + "epoch": 0.5914469701139303, + "grad_norm": 0.7358053922653198, + "learning_rate": 8.017403843430059e-06, + "loss": 0.7798, + "step": 10746 + }, + { + "epoch": 0.591502008916286, + "grad_norm": 0.699603259563446, + "learning_rate": 8.017058195541152e-06, + "loss": 0.6249, + "step": 10747 + }, + { + "epoch": 0.5915570477186416, + "grad_norm": 0.6736140847206116, + "learning_rate": 8.016712524976843e-06, + "loss": 0.6904, + "step": 10748 + }, + { + "epoch": 0.5916120865209973, + "grad_norm": 0.6803401112556458, + "learning_rate": 8.016366831739732e-06, + "loss": 0.6868, + "step": 10749 + }, + { + "epoch": 0.591667125323353, + "grad_norm": 0.7152959704399109, + "learning_rate": 8.016021115832413e-06, + "loss": 0.7747, + "step": 10750 + }, + { + "epoch": 0.5917221641257087, + "grad_norm": 0.6469255685806274, + "learning_rate": 8.015675377257489e-06, + "loss": 0.7309, + "step": 10751 + }, + { + "epoch": 0.5917772029280642, + "grad_norm": 0.7902734875679016, + "learning_rate": 8.015329616017554e-06, + "loss": 0.7575, + "step": 10752 + }, + { + "epoch": 0.5918322417304199, + "grad_norm": 0.7447189688682556, + "learning_rate": 8.014983832115208e-06, + "loss": 0.7759, + "step": 10753 + }, + { + "epoch": 0.5918872805327756, + "grad_norm": 0.6135374903678894, + "learning_rate": 8.014638025553053e-06, + "loss": 0.6681, + "step": 10754 + }, + { + "epoch": 0.5919423193351313, + "grad_norm": 0.8614835739135742, + "learning_rate": 8.014292196333684e-06, + "loss": 0.7203, + "step": 10755 + }, + { + "epoch": 0.5919973581374869, + "grad_norm": 0.7649008631706238, + "learning_rate": 8.013946344459703e-06, + "loss": 0.7966, + "step": 10756 + }, + { + "epoch": 0.5920523969398426, + "grad_norm": 1.0862764120101929, + "learning_rate": 8.013600469933707e-06, + "loss": 0.866, + "step": 10757 + }, + { + "epoch": 0.5921074357421983, + "grad_norm": 0.7304185628890991, + "learning_rate": 8.013254572758296e-06, + "loss": 0.7599, + "step": 10758 + }, + { + "epoch": 0.592162474544554, + "grad_norm": 0.6329634785652161, + "learning_rate": 8.012908652936072e-06, + "loss": 0.6855, + "step": 10759 + }, + { + "epoch": 0.5922175133469095, + "grad_norm": 0.6692202687263489, + "learning_rate": 8.012562710469631e-06, + "loss": 0.817, + "step": 10760 + }, + { + "epoch": 0.5922725521492652, + "grad_norm": 0.6577631235122681, + "learning_rate": 8.012216745361577e-06, + "loss": 0.7813, + "step": 10761 + }, + { + "epoch": 0.5923275909516209, + "grad_norm": 0.6877861022949219, + "learning_rate": 8.011870757614506e-06, + "loss": 0.7142, + "step": 10762 + }, + { + "epoch": 0.5923826297539766, + "grad_norm": 0.7132022380828857, + "learning_rate": 8.011524747231023e-06, + "loss": 0.747, + "step": 10763 + }, + { + "epoch": 0.5924376685563322, + "grad_norm": 0.7841360569000244, + "learning_rate": 8.011178714213726e-06, + "loss": 0.7511, + "step": 10764 + }, + { + "epoch": 0.5924927073586879, + "grad_norm": 0.8572794198989868, + "learning_rate": 8.010832658565215e-06, + "loss": 0.8704, + "step": 10765 + }, + { + "epoch": 0.5925477461610436, + "grad_norm": 0.6825506687164307, + "learning_rate": 8.010486580288092e-06, + "loss": 0.7472, + "step": 10766 + }, + { + "epoch": 0.5926027849633992, + "grad_norm": 0.7484591603279114, + "learning_rate": 8.010140479384957e-06, + "loss": 0.7679, + "step": 10767 + }, + { + "epoch": 0.5926578237657548, + "grad_norm": 0.712602436542511, + "learning_rate": 8.009794355858412e-06, + "loss": 0.7706, + "step": 10768 + }, + { + "epoch": 0.5927128625681105, + "grad_norm": 0.8911493420600891, + "learning_rate": 8.00944820971106e-06, + "loss": 0.8396, + "step": 10769 + }, + { + "epoch": 0.5927679013704662, + "grad_norm": 0.7300251126289368, + "learning_rate": 8.009102040945498e-06, + "loss": 0.7611, + "step": 10770 + }, + { + "epoch": 0.5928229401728219, + "grad_norm": 0.727343738079071, + "learning_rate": 8.008755849564333e-06, + "loss": 0.6785, + "step": 10771 + }, + { + "epoch": 0.5928779789751775, + "grad_norm": 0.8323808908462524, + "learning_rate": 8.008409635570163e-06, + "loss": 0.7429, + "step": 10772 + }, + { + "epoch": 0.5929330177775332, + "grad_norm": 0.6651942133903503, + "learning_rate": 8.00806339896559e-06, + "loss": 0.7683, + "step": 10773 + }, + { + "epoch": 0.5929880565798888, + "grad_norm": 0.7164554595947266, + "learning_rate": 8.007717139753222e-06, + "loss": 0.7742, + "step": 10774 + }, + { + "epoch": 0.5930430953822445, + "grad_norm": 0.6906408667564392, + "learning_rate": 8.007370857935654e-06, + "loss": 0.7322, + "step": 10775 + }, + { + "epoch": 0.5930981341846001, + "grad_norm": 0.6384999752044678, + "learning_rate": 8.007024553515493e-06, + "loss": 0.7011, + "step": 10776 + }, + { + "epoch": 0.5931531729869558, + "grad_norm": 0.6997355222702026, + "learning_rate": 8.006678226495338e-06, + "loss": 0.7303, + "step": 10777 + }, + { + "epoch": 0.5932082117893115, + "grad_norm": 0.6730707287788391, + "learning_rate": 8.006331876877797e-06, + "loss": 0.7461, + "step": 10778 + }, + { + "epoch": 0.5932632505916672, + "grad_norm": 0.7529115080833435, + "learning_rate": 8.00598550466547e-06, + "loss": 0.7487, + "step": 10779 + }, + { + "epoch": 0.5933182893940228, + "grad_norm": 0.7186329960823059, + "learning_rate": 8.00563910986096e-06, + "loss": 0.8025, + "step": 10780 + }, + { + "epoch": 0.5933733281963784, + "grad_norm": 0.7523752450942993, + "learning_rate": 8.005292692466869e-06, + "loss": 0.8291, + "step": 10781 + }, + { + "epoch": 0.5934283669987341, + "grad_norm": 1.182645559310913, + "learning_rate": 8.004946252485806e-06, + "loss": 0.8037, + "step": 10782 + }, + { + "epoch": 0.5934834058010897, + "grad_norm": 0.736570417881012, + "learning_rate": 8.004599789920369e-06, + "loss": 0.8259, + "step": 10783 + }, + { + "epoch": 0.5935384446034454, + "grad_norm": 0.757665753364563, + "learning_rate": 8.004253304773165e-06, + "loss": 0.7773, + "step": 10784 + }, + { + "epoch": 0.5935934834058011, + "grad_norm": 0.6988566517829895, + "learning_rate": 8.003906797046798e-06, + "loss": 0.7895, + "step": 10785 + }, + { + "epoch": 0.5936485222081568, + "grad_norm": 0.6921454071998596, + "learning_rate": 8.00356026674387e-06, + "loss": 0.8068, + "step": 10786 + }, + { + "epoch": 0.5937035610105124, + "grad_norm": 0.7053877115249634, + "learning_rate": 8.003213713866988e-06, + "loss": 0.7632, + "step": 10787 + }, + { + "epoch": 0.593758599812868, + "grad_norm": 0.8193650245666504, + "learning_rate": 8.002867138418757e-06, + "loss": 0.759, + "step": 10788 + }, + { + "epoch": 0.5938136386152237, + "grad_norm": 0.6089804768562317, + "learning_rate": 8.002520540401779e-06, + "loss": 0.7117, + "step": 10789 + }, + { + "epoch": 0.5938686774175794, + "grad_norm": 0.6869456768035889, + "learning_rate": 8.002173919818662e-06, + "loss": 0.7724, + "step": 10790 + }, + { + "epoch": 0.593923716219935, + "grad_norm": 0.7279118895530701, + "learning_rate": 8.001827276672007e-06, + "loss": 0.7578, + "step": 10791 + }, + { + "epoch": 0.5939787550222907, + "grad_norm": 0.6960133910179138, + "learning_rate": 8.00148061096442e-06, + "loss": 0.7887, + "step": 10792 + }, + { + "epoch": 0.5940337938246464, + "grad_norm": 0.6774740815162659, + "learning_rate": 8.001133922698511e-06, + "loss": 0.7146, + "step": 10793 + }, + { + "epoch": 0.5940888326270021, + "grad_norm": 0.6696349382400513, + "learning_rate": 8.000787211876883e-06, + "loss": 0.7829, + "step": 10794 + }, + { + "epoch": 0.5941438714293577, + "grad_norm": 1.5037024021148682, + "learning_rate": 8.000440478502142e-06, + "loss": 0.8198, + "step": 10795 + }, + { + "epoch": 0.5941989102317133, + "grad_norm": 0.7373353838920593, + "learning_rate": 8.000093722576893e-06, + "loss": 0.7864, + "step": 10796 + }, + { + "epoch": 0.594253949034069, + "grad_norm": 0.8120700120925903, + "learning_rate": 7.999746944103743e-06, + "loss": 0.7918, + "step": 10797 + }, + { + "epoch": 0.5943089878364247, + "grad_norm": 0.7669811844825745, + "learning_rate": 7.999400143085296e-06, + "loss": 0.751, + "step": 10798 + }, + { + "epoch": 0.5943640266387803, + "grad_norm": 0.8090860843658447, + "learning_rate": 7.999053319524163e-06, + "loss": 0.8387, + "step": 10799 + }, + { + "epoch": 0.594419065441136, + "grad_norm": 0.6994315385818481, + "learning_rate": 7.998706473422945e-06, + "loss": 0.7084, + "step": 10800 + }, + { + "epoch": 0.5944741042434917, + "grad_norm": 0.7913107872009277, + "learning_rate": 7.998359604784254e-06, + "loss": 0.7454, + "step": 10801 + }, + { + "epoch": 0.5945291430458474, + "grad_norm": 0.6831398010253906, + "learning_rate": 7.998012713610696e-06, + "loss": 0.7422, + "step": 10802 + }, + { + "epoch": 0.5945841818482029, + "grad_norm": 0.7324068546295166, + "learning_rate": 7.997665799904875e-06, + "loss": 0.7622, + "step": 10803 + }, + { + "epoch": 0.5946392206505586, + "grad_norm": 0.8192811012268066, + "learning_rate": 7.997318863669399e-06, + "loss": 0.7783, + "step": 10804 + }, + { + "epoch": 0.5946942594529143, + "grad_norm": 0.8008341789245605, + "learning_rate": 7.996971904906879e-06, + "loss": 0.7673, + "step": 10805 + }, + { + "epoch": 0.59474929825527, + "grad_norm": 0.6899568438529968, + "learning_rate": 7.99662492361992e-06, + "loss": 0.7477, + "step": 10806 + }, + { + "epoch": 0.5948043370576256, + "grad_norm": 0.7322555780410767, + "learning_rate": 7.996277919811132e-06, + "loss": 0.7673, + "step": 10807 + }, + { + "epoch": 0.5948593758599813, + "grad_norm": 1.008300542831421, + "learning_rate": 7.995930893483117e-06, + "loss": 0.7556, + "step": 10808 + }, + { + "epoch": 0.594914414662337, + "grad_norm": 0.7211925387382507, + "learning_rate": 7.99558384463849e-06, + "loss": 0.761, + "step": 10809 + }, + { + "epoch": 0.5949694534646927, + "grad_norm": 0.7143383622169495, + "learning_rate": 7.995236773279855e-06, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.5950244922670482, + "grad_norm": 0.7682802677154541, + "learning_rate": 7.994889679409825e-06, + "loss": 0.8538, + "step": 10811 + }, + { + "epoch": 0.5950795310694039, + "grad_norm": 0.6304698586463928, + "learning_rate": 7.994542563031004e-06, + "loss": 0.7343, + "step": 10812 + }, + { + "epoch": 0.5951345698717596, + "grad_norm": 0.6704440116882324, + "learning_rate": 7.994195424146002e-06, + "loss": 0.6921, + "step": 10813 + }, + { + "epoch": 0.5951896086741153, + "grad_norm": 0.8626209497451782, + "learning_rate": 7.99384826275743e-06, + "loss": 0.7049, + "step": 10814 + }, + { + "epoch": 0.5952446474764709, + "grad_norm": 0.810922384262085, + "learning_rate": 7.993501078867895e-06, + "loss": 0.793, + "step": 10815 + }, + { + "epoch": 0.5952996862788266, + "grad_norm": 0.8495855927467346, + "learning_rate": 7.993153872480009e-06, + "loss": 0.8078, + "step": 10816 + }, + { + "epoch": 0.5953547250811823, + "grad_norm": 0.7430331707000732, + "learning_rate": 7.992806643596378e-06, + "loss": 0.7957, + "step": 10817 + }, + { + "epoch": 0.595409763883538, + "grad_norm": 0.7188051342964172, + "learning_rate": 7.992459392219614e-06, + "loss": 0.725, + "step": 10818 + }, + { + "epoch": 0.5954648026858935, + "grad_norm": 0.7046926021575928, + "learning_rate": 7.992112118352326e-06, + "loss": 0.7438, + "step": 10819 + }, + { + "epoch": 0.5955198414882492, + "grad_norm": 0.7982804775238037, + "learning_rate": 7.991764821997123e-06, + "loss": 0.7046, + "step": 10820 + }, + { + "epoch": 0.5955748802906049, + "grad_norm": 0.6392245292663574, + "learning_rate": 7.991417503156618e-06, + "loss": 0.7413, + "step": 10821 + }, + { + "epoch": 0.5956299190929606, + "grad_norm": 0.7518960237503052, + "learning_rate": 7.99107016183342e-06, + "loss": 0.7661, + "step": 10822 + }, + { + "epoch": 0.5956849578953162, + "grad_norm": 0.7413721680641174, + "learning_rate": 7.99072279803014e-06, + "loss": 0.6538, + "step": 10823 + }, + { + "epoch": 0.5957399966976719, + "grad_norm": 0.7729454636573792, + "learning_rate": 7.990375411749384e-06, + "loss": 0.8056, + "step": 10824 + }, + { + "epoch": 0.5957950355000275, + "grad_norm": 0.8059296607971191, + "learning_rate": 7.99002800299377e-06, + "loss": 0.8699, + "step": 10825 + }, + { + "epoch": 0.5958500743023831, + "grad_norm": 0.5947105288505554, + "learning_rate": 7.989680571765907e-06, + "loss": 0.6481, + "step": 10826 + }, + { + "epoch": 0.5959051131047388, + "grad_norm": 0.7303743362426758, + "learning_rate": 7.989333118068404e-06, + "loss": 0.7401, + "step": 10827 + }, + { + "epoch": 0.5959601519070945, + "grad_norm": 0.7121400237083435, + "learning_rate": 7.988985641903873e-06, + "loss": 0.78, + "step": 10828 + }, + { + "epoch": 0.5960151907094502, + "grad_norm": 0.6921802163124084, + "learning_rate": 7.988638143274926e-06, + "loss": 0.7234, + "step": 10829 + }, + { + "epoch": 0.5960702295118058, + "grad_norm": 0.6715331673622131, + "learning_rate": 7.988290622184174e-06, + "loss": 0.7606, + "step": 10830 + }, + { + "epoch": 0.5961252683141615, + "grad_norm": 0.6315215229988098, + "learning_rate": 7.98794307863423e-06, + "loss": 0.6902, + "step": 10831 + }, + { + "epoch": 0.5961803071165171, + "grad_norm": 0.6884782314300537, + "learning_rate": 7.987595512627707e-06, + "loss": 0.7808, + "step": 10832 + }, + { + "epoch": 0.5962353459188728, + "grad_norm": 0.7050700783729553, + "learning_rate": 7.987247924167215e-06, + "loss": 0.7248, + "step": 10833 + }, + { + "epoch": 0.5962903847212284, + "grad_norm": 0.7232446074485779, + "learning_rate": 7.986900313255367e-06, + "loss": 0.8686, + "step": 10834 + }, + { + "epoch": 0.5963454235235841, + "grad_norm": 0.693631649017334, + "learning_rate": 7.986552679894778e-06, + "loss": 0.7567, + "step": 10835 + }, + { + "epoch": 0.5964004623259398, + "grad_norm": 0.6462356448173523, + "learning_rate": 7.986205024088054e-06, + "loss": 0.7091, + "step": 10836 + }, + { + "epoch": 0.5964555011282955, + "grad_norm": 0.7465559840202332, + "learning_rate": 7.985857345837814e-06, + "loss": 0.8965, + "step": 10837 + }, + { + "epoch": 0.5965105399306511, + "grad_norm": 0.6803271770477295, + "learning_rate": 7.985509645146672e-06, + "loss": 0.7602, + "step": 10838 + }, + { + "epoch": 0.5965655787330068, + "grad_norm": 1.1414798498153687, + "learning_rate": 7.985161922017238e-06, + "loss": 0.7806, + "step": 10839 + }, + { + "epoch": 0.5966206175353624, + "grad_norm": 0.6583230495452881, + "learning_rate": 7.984814176452123e-06, + "loss": 0.6727, + "step": 10840 + }, + { + "epoch": 0.5966756563377181, + "grad_norm": 0.6582550406455994, + "learning_rate": 7.984466408453946e-06, + "loss": 0.6794, + "step": 10841 + }, + { + "epoch": 0.5967306951400737, + "grad_norm": 0.8680793642997742, + "learning_rate": 7.984118618025318e-06, + "loss": 0.7999, + "step": 10842 + }, + { + "epoch": 0.5967857339424294, + "grad_norm": 0.772777795791626, + "learning_rate": 7.983770805168853e-06, + "loss": 0.6278, + "step": 10843 + }, + { + "epoch": 0.5968407727447851, + "grad_norm": 0.8099700808525085, + "learning_rate": 7.983422969887167e-06, + "loss": 0.7631, + "step": 10844 + }, + { + "epoch": 0.5968958115471408, + "grad_norm": 0.660271406173706, + "learning_rate": 7.983075112182871e-06, + "loss": 0.7557, + "step": 10845 + }, + { + "epoch": 0.5969508503494964, + "grad_norm": 0.7205530405044556, + "learning_rate": 7.982727232058582e-06, + "loss": 0.8258, + "step": 10846 + }, + { + "epoch": 0.597005889151852, + "grad_norm": 0.7925810813903809, + "learning_rate": 7.982379329516912e-06, + "loss": 0.7534, + "step": 10847 + }, + { + "epoch": 0.5970609279542077, + "grad_norm": 0.7255545854568481, + "learning_rate": 7.982031404560477e-06, + "loss": 0.8394, + "step": 10848 + }, + { + "epoch": 0.5971159667565634, + "grad_norm": 0.835394561290741, + "learning_rate": 7.981683457191893e-06, + "loss": 0.8384, + "step": 10849 + }, + { + "epoch": 0.597171005558919, + "grad_norm": 0.6781747937202454, + "learning_rate": 7.981335487413775e-06, + "loss": 0.8173, + "step": 10850 + }, + { + "epoch": 0.5972260443612747, + "grad_norm": 0.8602943420410156, + "learning_rate": 7.980987495228737e-06, + "loss": 0.8257, + "step": 10851 + }, + { + "epoch": 0.5972810831636304, + "grad_norm": 0.7157264947891235, + "learning_rate": 7.980639480639394e-06, + "loss": 0.7267, + "step": 10852 + }, + { + "epoch": 0.5973361219659861, + "grad_norm": 0.7695063352584839, + "learning_rate": 7.980291443648364e-06, + "loss": 0.7794, + "step": 10853 + }, + { + "epoch": 0.5973911607683416, + "grad_norm": 0.723971426486969, + "learning_rate": 7.979943384258262e-06, + "loss": 0.7761, + "step": 10854 + }, + { + "epoch": 0.5974461995706973, + "grad_norm": 0.691722571849823, + "learning_rate": 7.979595302471702e-06, + "loss": 0.7276, + "step": 10855 + }, + { + "epoch": 0.597501238373053, + "grad_norm": 0.7019701600074768, + "learning_rate": 7.9792471982913e-06, + "loss": 0.7965, + "step": 10856 + }, + { + "epoch": 0.5975562771754087, + "grad_norm": 0.6626996994018555, + "learning_rate": 7.978899071719675e-06, + "loss": 0.7124, + "step": 10857 + }, + { + "epoch": 0.5976113159777643, + "grad_norm": 0.6871625781059265, + "learning_rate": 7.978550922759443e-06, + "loss": 0.7742, + "step": 10858 + }, + { + "epoch": 0.59766635478012, + "grad_norm": 0.7153579592704773, + "learning_rate": 7.978202751413217e-06, + "loss": 0.7852, + "step": 10859 + }, + { + "epoch": 0.5977213935824757, + "grad_norm": 0.6891841292381287, + "learning_rate": 7.977854557683619e-06, + "loss": 0.7873, + "step": 10860 + }, + { + "epoch": 0.5977764323848314, + "grad_norm": 0.6864004731178284, + "learning_rate": 7.977506341573262e-06, + "loss": 0.7223, + "step": 10861 + }, + { + "epoch": 0.5978314711871869, + "grad_norm": 0.7163059115409851, + "learning_rate": 7.977158103084764e-06, + "loss": 0.679, + "step": 10862 + }, + { + "epoch": 0.5978865099895426, + "grad_norm": 0.6727336049079895, + "learning_rate": 7.976809842220742e-06, + "loss": 0.7148, + "step": 10863 + }, + { + "epoch": 0.5979415487918983, + "grad_norm": 0.672960638999939, + "learning_rate": 7.976461558983814e-06, + "loss": 0.7263, + "step": 10864 + }, + { + "epoch": 0.597996587594254, + "grad_norm": 0.9124444127082825, + "learning_rate": 7.976113253376601e-06, + "loss": 0.6876, + "step": 10865 + }, + { + "epoch": 0.5980516263966096, + "grad_norm": 0.6415041089057922, + "learning_rate": 7.975764925401715e-06, + "loss": 0.6655, + "step": 10866 + }, + { + "epoch": 0.5981066651989653, + "grad_norm": 0.7342595458030701, + "learning_rate": 7.975416575061776e-06, + "loss": 0.7753, + "step": 10867 + }, + { + "epoch": 0.598161704001321, + "grad_norm": 0.7161775231361389, + "learning_rate": 7.975068202359402e-06, + "loss": 0.7525, + "step": 10868 + }, + { + "epoch": 0.5982167428036765, + "grad_norm": 0.7087578773498535, + "learning_rate": 7.974719807297212e-06, + "loss": 0.7196, + "step": 10869 + }, + { + "epoch": 0.5982717816060322, + "grad_norm": 0.6472536325454712, + "learning_rate": 7.974371389877826e-06, + "loss": 0.6837, + "step": 10870 + }, + { + "epoch": 0.5983268204083879, + "grad_norm": 0.6625581383705139, + "learning_rate": 7.97402295010386e-06, + "loss": 0.6379, + "step": 10871 + }, + { + "epoch": 0.5983818592107436, + "grad_norm": 0.7621071934700012, + "learning_rate": 7.973674487977934e-06, + "loss": 0.8291, + "step": 10872 + }, + { + "epoch": 0.5984368980130992, + "grad_norm": 0.693394660949707, + "learning_rate": 7.973326003502666e-06, + "loss": 0.7677, + "step": 10873 + }, + { + "epoch": 0.5984919368154549, + "grad_norm": 0.6393985152244568, + "learning_rate": 7.972977496680674e-06, + "loss": 0.7058, + "step": 10874 + }, + { + "epoch": 0.5985469756178106, + "grad_norm": 0.7101462483406067, + "learning_rate": 7.972628967514582e-06, + "loss": 0.7396, + "step": 10875 + }, + { + "epoch": 0.5986020144201663, + "grad_norm": 0.8131522536277771, + "learning_rate": 7.972280416007003e-06, + "loss": 0.8461, + "step": 10876 + }, + { + "epoch": 0.5986570532225218, + "grad_norm": 0.7186655402183533, + "learning_rate": 7.971931842160564e-06, + "loss": 0.7721, + "step": 10877 + }, + { + "epoch": 0.5987120920248775, + "grad_norm": 0.7520855069160461, + "learning_rate": 7.971583245977877e-06, + "loss": 0.7733, + "step": 10878 + }, + { + "epoch": 0.5987671308272332, + "grad_norm": 0.6548848748207092, + "learning_rate": 7.971234627461569e-06, + "loss": 0.6555, + "step": 10879 + }, + { + "epoch": 0.5988221696295889, + "grad_norm": 0.7341775894165039, + "learning_rate": 7.970885986614254e-06, + "loss": 0.8292, + "step": 10880 + }, + { + "epoch": 0.5988772084319445, + "grad_norm": 0.7126352190971375, + "learning_rate": 7.970537323438556e-06, + "loss": 0.7704, + "step": 10881 + }, + { + "epoch": 0.5989322472343002, + "grad_norm": 0.7291527390480042, + "learning_rate": 7.970188637937097e-06, + "loss": 0.8175, + "step": 10882 + }, + { + "epoch": 0.5989872860366559, + "grad_norm": 0.682767927646637, + "learning_rate": 7.969839930112493e-06, + "loss": 0.8187, + "step": 10883 + }, + { + "epoch": 0.5990423248390115, + "grad_norm": 0.7820014953613281, + "learning_rate": 7.969491199967368e-06, + "loss": 0.7949, + "step": 10884 + }, + { + "epoch": 0.5990973636413671, + "grad_norm": 0.7257336974143982, + "learning_rate": 7.969142447504341e-06, + "loss": 0.8461, + "step": 10885 + }, + { + "epoch": 0.5991524024437228, + "grad_norm": 0.6813532114028931, + "learning_rate": 7.968793672726033e-06, + "loss": 0.7889, + "step": 10886 + }, + { + "epoch": 0.5992074412460785, + "grad_norm": 0.6868439316749573, + "learning_rate": 7.96844487563507e-06, + "loss": 0.7268, + "step": 10887 + }, + { + "epoch": 0.5992624800484342, + "grad_norm": 0.6547278761863708, + "learning_rate": 7.968096056234067e-06, + "loss": 0.7026, + "step": 10888 + }, + { + "epoch": 0.5993175188507898, + "grad_norm": 0.6704558730125427, + "learning_rate": 7.96774721452565e-06, + "loss": 0.6994, + "step": 10889 + }, + { + "epoch": 0.5993725576531455, + "grad_norm": 0.7134065628051758, + "learning_rate": 7.967398350512439e-06, + "loss": 0.7728, + "step": 10890 + }, + { + "epoch": 0.5994275964555011, + "grad_norm": 0.751265823841095, + "learning_rate": 7.967049464197056e-06, + "loss": 0.8421, + "step": 10891 + }, + { + "epoch": 0.5994826352578568, + "grad_norm": 0.8558571934700012, + "learning_rate": 7.966700555582125e-06, + "loss": 0.9144, + "step": 10892 + }, + { + "epoch": 0.5995376740602124, + "grad_norm": 0.8338084816932678, + "learning_rate": 7.966351624670263e-06, + "loss": 0.7502, + "step": 10893 + }, + { + "epoch": 0.5995927128625681, + "grad_norm": 0.7017131447792053, + "learning_rate": 7.9660026714641e-06, + "loss": 0.7778, + "step": 10894 + }, + { + "epoch": 0.5996477516649238, + "grad_norm": 0.7176111340522766, + "learning_rate": 7.965653695966253e-06, + "loss": 0.8478, + "step": 10895 + }, + { + "epoch": 0.5997027904672795, + "grad_norm": 0.7026060819625854, + "learning_rate": 7.965304698179349e-06, + "loss": 0.7111, + "step": 10896 + }, + { + "epoch": 0.5997578292696351, + "grad_norm": 0.6383810639381409, + "learning_rate": 7.964955678106005e-06, + "loss": 0.6429, + "step": 10897 + }, + { + "epoch": 0.5998128680719907, + "grad_norm": 0.8024059534072876, + "learning_rate": 7.96460663574885e-06, + "loss": 0.7308, + "step": 10898 + }, + { + "epoch": 0.5998679068743464, + "grad_norm": 0.7378466725349426, + "learning_rate": 7.964257571110504e-06, + "loss": 0.7593, + "step": 10899 + }, + { + "epoch": 0.5999229456767021, + "grad_norm": 0.7089043855667114, + "learning_rate": 7.963908484193593e-06, + "loss": 0.6862, + "step": 10900 + }, + { + "epoch": 0.5999779844790577, + "grad_norm": 0.765295684337616, + "learning_rate": 7.963559375000738e-06, + "loss": 0.6759, + "step": 10901 + }, + { + "epoch": 0.6000330232814134, + "grad_norm": 0.7040783166885376, + "learning_rate": 7.963210243534565e-06, + "loss": 0.7754, + "step": 10902 + }, + { + "epoch": 0.6000880620837691, + "grad_norm": 0.8593736886978149, + "learning_rate": 7.962861089797698e-06, + "loss": 0.8765, + "step": 10903 + }, + { + "epoch": 0.6001431008861248, + "grad_norm": 0.6613926291465759, + "learning_rate": 7.962511913792758e-06, + "loss": 0.6697, + "step": 10904 + }, + { + "epoch": 0.6001981396884803, + "grad_norm": 0.6369597911834717, + "learning_rate": 7.962162715522372e-06, + "loss": 0.7145, + "step": 10905 + }, + { + "epoch": 0.600253178490836, + "grad_norm": 1.1790162324905396, + "learning_rate": 7.961813494989164e-06, + "loss": 0.8067, + "step": 10906 + }, + { + "epoch": 0.6003082172931917, + "grad_norm": 0.7548268437385559, + "learning_rate": 7.961464252195759e-06, + "loss": 0.7936, + "step": 10907 + }, + { + "epoch": 0.6003632560955474, + "grad_norm": 0.6204384565353394, + "learning_rate": 7.961114987144781e-06, + "loss": 0.6374, + "step": 10908 + }, + { + "epoch": 0.600418294897903, + "grad_norm": 0.7149941921234131, + "learning_rate": 7.960765699838854e-06, + "loss": 0.8422, + "step": 10909 + }, + { + "epoch": 0.6004733337002587, + "grad_norm": 0.7040171027183533, + "learning_rate": 7.960416390280608e-06, + "loss": 0.8261, + "step": 10910 + }, + { + "epoch": 0.6005283725026144, + "grad_norm": 0.713591456413269, + "learning_rate": 7.960067058472663e-06, + "loss": 0.7908, + "step": 10911 + }, + { + "epoch": 0.60058341130497, + "grad_norm": 0.654086172580719, + "learning_rate": 7.959717704417645e-06, + "loss": 0.6971, + "step": 10912 + }, + { + "epoch": 0.6006384501073256, + "grad_norm": 0.7293223738670349, + "learning_rate": 7.959368328118183e-06, + "loss": 0.7032, + "step": 10913 + }, + { + "epoch": 0.6006934889096813, + "grad_norm": 0.705434262752533, + "learning_rate": 7.959018929576898e-06, + "loss": 0.7193, + "step": 10914 + }, + { + "epoch": 0.600748527712037, + "grad_norm": 0.7406907677650452, + "learning_rate": 7.958669508796422e-06, + "loss": 0.8464, + "step": 10915 + }, + { + "epoch": 0.6008035665143926, + "grad_norm": 0.6683858036994934, + "learning_rate": 7.958320065779377e-06, + "loss": 0.699, + "step": 10916 + }, + { + "epoch": 0.6008586053167483, + "grad_norm": 0.7380560636520386, + "learning_rate": 7.95797060052839e-06, + "loss": 0.7409, + "step": 10917 + }, + { + "epoch": 0.600913644119104, + "grad_norm": 0.7729377746582031, + "learning_rate": 7.957621113046088e-06, + "loss": 0.8838, + "step": 10918 + }, + { + "epoch": 0.6009686829214597, + "grad_norm": 0.6842743158340454, + "learning_rate": 7.957271603335097e-06, + "loss": 0.781, + "step": 10919 + }, + { + "epoch": 0.6010237217238152, + "grad_norm": 0.6864648461341858, + "learning_rate": 7.956922071398045e-06, + "loss": 0.6717, + "step": 10920 + }, + { + "epoch": 0.6010787605261709, + "grad_norm": 0.7718262672424316, + "learning_rate": 7.956572517237557e-06, + "loss": 0.8023, + "step": 10921 + }, + { + "epoch": 0.6011337993285266, + "grad_norm": 0.686338484287262, + "learning_rate": 7.956222940856261e-06, + "loss": 0.7139, + "step": 10922 + }, + { + "epoch": 0.6011888381308823, + "grad_norm": 0.7064465284347534, + "learning_rate": 7.955873342256789e-06, + "loss": 0.845, + "step": 10923 + }, + { + "epoch": 0.6012438769332379, + "grad_norm": 0.6847875714302063, + "learning_rate": 7.955523721441761e-06, + "loss": 0.7078, + "step": 10924 + }, + { + "epoch": 0.6012989157355936, + "grad_norm": 0.6879494786262512, + "learning_rate": 7.955174078413806e-06, + "loss": 0.7532, + "step": 10925 + }, + { + "epoch": 0.6013539545379493, + "grad_norm": 0.6569855213165283, + "learning_rate": 7.954824413175554e-06, + "loss": 0.7529, + "step": 10926 + }, + { + "epoch": 0.601408993340305, + "grad_norm": 0.6225974559783936, + "learning_rate": 7.954474725729635e-06, + "loss": 0.6595, + "step": 10927 + }, + { + "epoch": 0.6014640321426605, + "grad_norm": 0.7067761421203613, + "learning_rate": 7.954125016078675e-06, + "loss": 0.7851, + "step": 10928 + }, + { + "epoch": 0.6015190709450162, + "grad_norm": 0.683030903339386, + "learning_rate": 7.9537752842253e-06, + "loss": 0.7461, + "step": 10929 + }, + { + "epoch": 0.6015741097473719, + "grad_norm": 0.6411080956459045, + "learning_rate": 7.953425530172143e-06, + "loss": 0.6945, + "step": 10930 + }, + { + "epoch": 0.6016291485497276, + "grad_norm": 0.6254550814628601, + "learning_rate": 7.953075753921829e-06, + "loss": 0.7143, + "step": 10931 + }, + { + "epoch": 0.6016841873520832, + "grad_norm": 0.684100866317749, + "learning_rate": 7.952725955476987e-06, + "loss": 0.8137, + "step": 10932 + }, + { + "epoch": 0.6017392261544389, + "grad_norm": 0.6341036558151245, + "learning_rate": 7.95237613484025e-06, + "loss": 0.6692, + "step": 10933 + }, + { + "epoch": 0.6017942649567946, + "grad_norm": 0.7311153411865234, + "learning_rate": 7.952026292014242e-06, + "loss": 0.7091, + "step": 10934 + }, + { + "epoch": 0.6018493037591502, + "grad_norm": 0.7265943884849548, + "learning_rate": 7.951676427001596e-06, + "loss": 0.765, + "step": 10935 + }, + { + "epoch": 0.6019043425615058, + "grad_norm": 0.8777397274971008, + "learning_rate": 7.951326539804938e-06, + "loss": 0.7824, + "step": 10936 + }, + { + "epoch": 0.6019593813638615, + "grad_norm": 0.7241179347038269, + "learning_rate": 7.9509766304269e-06, + "loss": 0.7913, + "step": 10937 + }, + { + "epoch": 0.6020144201662172, + "grad_norm": 0.8090667128562927, + "learning_rate": 7.950626698870113e-06, + "loss": 0.8208, + "step": 10938 + }, + { + "epoch": 0.6020694589685729, + "grad_norm": 0.7376043796539307, + "learning_rate": 7.950276745137206e-06, + "loss": 0.7176, + "step": 10939 + }, + { + "epoch": 0.6021244977709285, + "grad_norm": 0.7149157524108887, + "learning_rate": 7.949926769230809e-06, + "loss": 0.7949, + "step": 10940 + }, + { + "epoch": 0.6021795365732842, + "grad_norm": 0.8721579909324646, + "learning_rate": 7.949576771153549e-06, + "loss": 0.8433, + "step": 10941 + }, + { + "epoch": 0.6022345753756398, + "grad_norm": 0.7946182489395142, + "learning_rate": 7.949226750908062e-06, + "loss": 0.7412, + "step": 10942 + }, + { + "epoch": 0.6022896141779955, + "grad_norm": 0.6661237478256226, + "learning_rate": 7.948876708496975e-06, + "loss": 0.725, + "step": 10943 + }, + { + "epoch": 0.6023446529803511, + "grad_norm": 0.8346213698387146, + "learning_rate": 7.948526643922922e-06, + "loss": 0.6817, + "step": 10944 + }, + { + "epoch": 0.6023996917827068, + "grad_norm": 0.7911655306816101, + "learning_rate": 7.94817655718853e-06, + "loss": 0.7398, + "step": 10945 + }, + { + "epoch": 0.6024547305850625, + "grad_norm": 0.6480078101158142, + "learning_rate": 7.947826448296432e-06, + "loss": 0.6822, + "step": 10946 + }, + { + "epoch": 0.6025097693874182, + "grad_norm": 0.6950085759162903, + "learning_rate": 7.94747631724926e-06, + "loss": 0.8073, + "step": 10947 + }, + { + "epoch": 0.6025648081897738, + "grad_norm": 0.7142168879508972, + "learning_rate": 7.947126164049645e-06, + "loss": 0.6159, + "step": 10948 + }, + { + "epoch": 0.6026198469921294, + "grad_norm": 0.7459015846252441, + "learning_rate": 7.946775988700219e-06, + "loss": 0.8377, + "step": 10949 + }, + { + "epoch": 0.6026748857944851, + "grad_norm": 1.050179362297058, + "learning_rate": 7.946425791203614e-06, + "loss": 0.8098, + "step": 10950 + }, + { + "epoch": 0.6027299245968408, + "grad_norm": 0.7473265528678894, + "learning_rate": 7.94607557156246e-06, + "loss": 0.6846, + "step": 10951 + }, + { + "epoch": 0.6027849633991964, + "grad_norm": 0.7990789413452148, + "learning_rate": 7.945725329779392e-06, + "loss": 0.8216, + "step": 10952 + }, + { + "epoch": 0.6028400022015521, + "grad_norm": 0.6461700201034546, + "learning_rate": 7.94537506585704e-06, + "loss": 0.7864, + "step": 10953 + }, + { + "epoch": 0.6028950410039078, + "grad_norm": 0.661123514175415, + "learning_rate": 7.945024779798038e-06, + "loss": 0.7466, + "step": 10954 + }, + { + "epoch": 0.6029500798062634, + "grad_norm": 0.6998088359832764, + "learning_rate": 7.944674471605018e-06, + "loss": 0.7846, + "step": 10955 + }, + { + "epoch": 0.603005118608619, + "grad_norm": 0.6917386651039124, + "learning_rate": 7.944324141280613e-06, + "loss": 0.7699, + "step": 10956 + }, + { + "epoch": 0.6030601574109747, + "grad_norm": 0.7304503321647644, + "learning_rate": 7.943973788827455e-06, + "loss": 0.8015, + "step": 10957 + }, + { + "epoch": 0.6031151962133304, + "grad_norm": 0.7996858358383179, + "learning_rate": 7.94362341424818e-06, + "loss": 0.7093, + "step": 10958 + }, + { + "epoch": 0.603170235015686, + "grad_norm": 0.7445322871208191, + "learning_rate": 7.943273017545419e-06, + "loss": 0.7388, + "step": 10959 + }, + { + "epoch": 0.6032252738180417, + "grad_norm": 0.6672174334526062, + "learning_rate": 7.942922598721805e-06, + "loss": 0.7703, + "step": 10960 + }, + { + "epoch": 0.6032803126203974, + "grad_norm": 0.7313557267189026, + "learning_rate": 7.94257215777997e-06, + "loss": 0.6637, + "step": 10961 + }, + { + "epoch": 0.6033353514227531, + "grad_norm": 0.7248823642730713, + "learning_rate": 7.942221694722553e-06, + "loss": 0.836, + "step": 10962 + }, + { + "epoch": 0.6033903902251087, + "grad_norm": 0.6583372354507446, + "learning_rate": 7.941871209552187e-06, + "loss": 0.7582, + "step": 10963 + }, + { + "epoch": 0.6034454290274643, + "grad_norm": 0.7502591013908386, + "learning_rate": 7.941520702271503e-06, + "loss": 0.7455, + "step": 10964 + }, + { + "epoch": 0.60350046782982, + "grad_norm": 0.6899349689483643, + "learning_rate": 7.941170172883135e-06, + "loss": 0.7677, + "step": 10965 + }, + { + "epoch": 0.6035555066321757, + "grad_norm": 0.693321943283081, + "learning_rate": 7.940819621389722e-06, + "loss": 0.7754, + "step": 10966 + }, + { + "epoch": 0.6036105454345313, + "grad_norm": 0.7376342415809631, + "learning_rate": 7.940469047793893e-06, + "loss": 0.7761, + "step": 10967 + }, + { + "epoch": 0.603665584236887, + "grad_norm": 0.6377952694892883, + "learning_rate": 7.940118452098289e-06, + "loss": 0.6612, + "step": 10968 + }, + { + "epoch": 0.6037206230392427, + "grad_norm": 0.8041388988494873, + "learning_rate": 7.939767834305538e-06, + "loss": 0.8358, + "step": 10969 + }, + { + "epoch": 0.6037756618415984, + "grad_norm": 1.5993521213531494, + "learning_rate": 7.939417194418282e-06, + "loss": 0.8536, + "step": 10970 + }, + { + "epoch": 0.6038307006439539, + "grad_norm": 0.6718295216560364, + "learning_rate": 7.939066532439153e-06, + "loss": 0.717, + "step": 10971 + }, + { + "epoch": 0.6038857394463096, + "grad_norm": 0.7951062917709351, + "learning_rate": 7.938715848370787e-06, + "loss": 0.6919, + "step": 10972 + }, + { + "epoch": 0.6039407782486653, + "grad_norm": 0.707804262638092, + "learning_rate": 7.938365142215816e-06, + "loss": 0.7346, + "step": 10973 + }, + { + "epoch": 0.603995817051021, + "grad_norm": 0.7244500517845154, + "learning_rate": 7.938014413976883e-06, + "loss": 0.708, + "step": 10974 + }, + { + "epoch": 0.6040508558533766, + "grad_norm": 0.7533566951751709, + "learning_rate": 7.937663663656617e-06, + "loss": 0.6761, + "step": 10975 + }, + { + "epoch": 0.6041058946557323, + "grad_norm": 0.8844665288925171, + "learning_rate": 7.93731289125766e-06, + "loss": 0.7833, + "step": 10976 + }, + { + "epoch": 0.604160933458088, + "grad_norm": 0.6413047313690186, + "learning_rate": 7.936962096782643e-06, + "loss": 0.7175, + "step": 10977 + }, + { + "epoch": 0.6042159722604437, + "grad_norm": 0.765943706035614, + "learning_rate": 7.936611280234206e-06, + "loss": 0.7654, + "step": 10978 + }, + { + "epoch": 0.6042710110627992, + "grad_norm": 0.6833398938179016, + "learning_rate": 7.936260441614985e-06, + "loss": 0.7459, + "step": 10979 + }, + { + "epoch": 0.6043260498651549, + "grad_norm": 0.6363481283187866, + "learning_rate": 7.935909580927617e-06, + "loss": 0.7173, + "step": 10980 + }, + { + "epoch": 0.6043810886675106, + "grad_norm": 0.7731046080589294, + "learning_rate": 7.935558698174738e-06, + "loss": 0.8428, + "step": 10981 + }, + { + "epoch": 0.6044361274698663, + "grad_norm": 0.7346602082252502, + "learning_rate": 7.935207793358986e-06, + "loss": 0.832, + "step": 10982 + }, + { + "epoch": 0.6044911662722219, + "grad_norm": 0.6711193919181824, + "learning_rate": 7.934856866482998e-06, + "loss": 0.742, + "step": 10983 + }, + { + "epoch": 0.6045462050745776, + "grad_norm": 0.6931266784667969, + "learning_rate": 7.934505917549411e-06, + "loss": 0.7779, + "step": 10984 + }, + { + "epoch": 0.6046012438769333, + "grad_norm": 0.7624725699424744, + "learning_rate": 7.934154946560862e-06, + "loss": 0.7229, + "step": 10985 + }, + { + "epoch": 0.604656282679289, + "grad_norm": 0.6594272255897522, + "learning_rate": 7.933803953519991e-06, + "loss": 0.7776, + "step": 10986 + }, + { + "epoch": 0.6047113214816445, + "grad_norm": 0.674521803855896, + "learning_rate": 7.933452938429435e-06, + "loss": 0.6904, + "step": 10987 + }, + { + "epoch": 0.6047663602840002, + "grad_norm": 0.7352569699287415, + "learning_rate": 7.933101901291831e-06, + "loss": 0.7655, + "step": 10988 + }, + { + "epoch": 0.6048213990863559, + "grad_norm": 0.8560347557067871, + "learning_rate": 7.932750842109817e-06, + "loss": 0.7894, + "step": 10989 + }, + { + "epoch": 0.6048764378887116, + "grad_norm": 0.769496500492096, + "learning_rate": 7.932399760886037e-06, + "loss": 0.8255, + "step": 10990 + }, + { + "epoch": 0.6049314766910672, + "grad_norm": 0.9399588108062744, + "learning_rate": 7.932048657623122e-06, + "loss": 0.8554, + "step": 10991 + }, + { + "epoch": 0.6049865154934229, + "grad_norm": 0.6662001609802246, + "learning_rate": 7.931697532323716e-06, + "loss": 0.7788, + "step": 10992 + }, + { + "epoch": 0.6050415542957785, + "grad_norm": 0.758263111114502, + "learning_rate": 7.931346384990455e-06, + "loss": 0.7907, + "step": 10993 + }, + { + "epoch": 0.6050965930981342, + "grad_norm": 0.7283937335014343, + "learning_rate": 7.930995215625978e-06, + "loss": 0.8415, + "step": 10994 + }, + { + "epoch": 0.6051516319004898, + "grad_norm": 0.6611599922180176, + "learning_rate": 7.930644024232927e-06, + "loss": 0.7145, + "step": 10995 + }, + { + "epoch": 0.6052066707028455, + "grad_norm": 0.8450857400894165, + "learning_rate": 7.93029281081394e-06, + "loss": 0.7208, + "step": 10996 + }, + { + "epoch": 0.6052617095052012, + "grad_norm": 0.649010181427002, + "learning_rate": 7.929941575371655e-06, + "loss": 0.6928, + "step": 10997 + }, + { + "epoch": 0.6053167483075568, + "grad_norm": 0.7022100687026978, + "learning_rate": 7.929590317908718e-06, + "loss": 0.7329, + "step": 10998 + }, + { + "epoch": 0.6053717871099125, + "grad_norm": 0.768598198890686, + "learning_rate": 7.92923903842776e-06, + "loss": 0.7799, + "step": 10999 + }, + { + "epoch": 0.6054268259122682, + "grad_norm": 0.6648436784744263, + "learning_rate": 7.928887736931428e-06, + "loss": 0.7728, + "step": 11000 + }, + { + "epoch": 0.6054818647146238, + "grad_norm": 0.6946157813072205, + "learning_rate": 7.928536413422357e-06, + "loss": 0.7609, + "step": 11001 + }, + { + "epoch": 0.6055369035169794, + "grad_norm": 0.7779337167739868, + "learning_rate": 7.928185067903191e-06, + "loss": 0.7679, + "step": 11002 + }, + { + "epoch": 0.6055919423193351, + "grad_norm": 0.6520814895629883, + "learning_rate": 7.927833700376573e-06, + "loss": 0.6734, + "step": 11003 + }, + { + "epoch": 0.6056469811216908, + "grad_norm": 0.7724258899688721, + "learning_rate": 7.927482310845138e-06, + "loss": 0.7564, + "step": 11004 + }, + { + "epoch": 0.6057020199240465, + "grad_norm": 0.6649174690246582, + "learning_rate": 7.927130899311529e-06, + "loss": 0.7217, + "step": 11005 + }, + { + "epoch": 0.6057570587264021, + "grad_norm": 0.6807287931442261, + "learning_rate": 7.926779465778389e-06, + "loss": 0.6966, + "step": 11006 + }, + { + "epoch": 0.6058120975287578, + "grad_norm": 0.6644826531410217, + "learning_rate": 7.926428010248357e-06, + "loss": 0.7238, + "step": 11007 + }, + { + "epoch": 0.6058671363311134, + "grad_norm": 0.7533535957336426, + "learning_rate": 7.926076532724077e-06, + "loss": 0.855, + "step": 11008 + }, + { + "epoch": 0.6059221751334691, + "grad_norm": 0.6457169055938721, + "learning_rate": 7.925725033208187e-06, + "loss": 0.6717, + "step": 11009 + }, + { + "epoch": 0.6059772139358247, + "grad_norm": 0.724719762802124, + "learning_rate": 7.925373511703332e-06, + "loss": 0.8701, + "step": 11010 + }, + { + "epoch": 0.6060322527381804, + "grad_norm": 0.746755063533783, + "learning_rate": 7.925021968212153e-06, + "loss": 0.8509, + "step": 11011 + }, + { + "epoch": 0.6060872915405361, + "grad_norm": 0.7377174496650696, + "learning_rate": 7.924670402737292e-06, + "loss": 0.8053, + "step": 11012 + }, + { + "epoch": 0.6061423303428918, + "grad_norm": 0.9791839718818665, + "learning_rate": 7.92431881528139e-06, + "loss": 0.7893, + "step": 11013 + }, + { + "epoch": 0.6061973691452474, + "grad_norm": 0.7472195029258728, + "learning_rate": 7.923967205847089e-06, + "loss": 0.7195, + "step": 11014 + }, + { + "epoch": 0.606252407947603, + "grad_norm": 0.672851026058197, + "learning_rate": 7.923615574437037e-06, + "loss": 0.8234, + "step": 11015 + }, + { + "epoch": 0.6063074467499587, + "grad_norm": 0.739942729473114, + "learning_rate": 7.923263921053872e-06, + "loss": 0.8582, + "step": 11016 + }, + { + "epoch": 0.6063624855523144, + "grad_norm": 0.7337772846221924, + "learning_rate": 7.922912245700236e-06, + "loss": 0.8008, + "step": 11017 + }, + { + "epoch": 0.60641752435467, + "grad_norm": 0.6707174777984619, + "learning_rate": 7.922560548378774e-06, + "loss": 0.8531, + "step": 11018 + }, + { + "epoch": 0.6064725631570257, + "grad_norm": 0.6783839464187622, + "learning_rate": 7.922208829092133e-06, + "loss": 0.7963, + "step": 11019 + }, + { + "epoch": 0.6065276019593814, + "grad_norm": 0.6133253574371338, + "learning_rate": 7.92185708784295e-06, + "loss": 0.7375, + "step": 11020 + }, + { + "epoch": 0.6065826407617371, + "grad_norm": 0.8300097584724426, + "learning_rate": 7.921505324633868e-06, + "loss": 0.7976, + "step": 11021 + }, + { + "epoch": 0.6066376795640926, + "grad_norm": 0.6800658702850342, + "learning_rate": 7.921153539467538e-06, + "loss": 0.7321, + "step": 11022 + }, + { + "epoch": 0.6066927183664483, + "grad_norm": 0.6849787831306458, + "learning_rate": 7.920801732346602e-06, + "loss": 0.7134, + "step": 11023 + }, + { + "epoch": 0.606747757168804, + "grad_norm": 0.7675080895423889, + "learning_rate": 7.920449903273697e-06, + "loss": 0.7402, + "step": 11024 + }, + { + "epoch": 0.6068027959711597, + "grad_norm": 0.7431055903434753, + "learning_rate": 7.920098052251476e-06, + "loss": 0.7872, + "step": 11025 + }, + { + "epoch": 0.6068578347735153, + "grad_norm": 0.6264036297798157, + "learning_rate": 7.919746179282577e-06, + "loss": 0.7496, + "step": 11026 + }, + { + "epoch": 0.606912873575871, + "grad_norm": 0.7800843715667725, + "learning_rate": 7.919394284369648e-06, + "loss": 0.7917, + "step": 11027 + }, + { + "epoch": 0.6069679123782267, + "grad_norm": 0.7665574550628662, + "learning_rate": 7.919042367515336e-06, + "loss": 0.7905, + "step": 11028 + }, + { + "epoch": 0.6070229511805824, + "grad_norm": 0.7473214864730835, + "learning_rate": 7.918690428722279e-06, + "loss": 0.7732, + "step": 11029 + }, + { + "epoch": 0.6070779899829379, + "grad_norm": 0.6717211008071899, + "learning_rate": 7.918338467993127e-06, + "loss": 0.8221, + "step": 11030 + }, + { + "epoch": 0.6071330287852936, + "grad_norm": 0.6745431423187256, + "learning_rate": 7.917986485330525e-06, + "loss": 0.6899, + "step": 11031 + }, + { + "epoch": 0.6071880675876493, + "grad_norm": 0.6838263273239136, + "learning_rate": 7.917634480737117e-06, + "loss": 0.7133, + "step": 11032 + }, + { + "epoch": 0.607243106390005, + "grad_norm": 0.7975682020187378, + "learning_rate": 7.91728245421555e-06, + "loss": 0.8283, + "step": 11033 + }, + { + "epoch": 0.6072981451923606, + "grad_norm": 0.7112031579017639, + "learning_rate": 7.916930405768468e-06, + "loss": 0.7423, + "step": 11034 + }, + { + "epoch": 0.6073531839947163, + "grad_norm": 0.7006776928901672, + "learning_rate": 7.91657833539852e-06, + "loss": 0.716, + "step": 11035 + }, + { + "epoch": 0.607408222797072, + "grad_norm": 0.7523549795150757, + "learning_rate": 7.916226243108348e-06, + "loss": 0.8591, + "step": 11036 + }, + { + "epoch": 0.6074632615994277, + "grad_norm": 0.7257835268974304, + "learning_rate": 7.9158741289006e-06, + "loss": 0.7471, + "step": 11037 + }, + { + "epoch": 0.6075183004017832, + "grad_norm": 0.8100149631500244, + "learning_rate": 7.915521992777922e-06, + "loss": 0.8373, + "step": 11038 + }, + { + "epoch": 0.6075733392041389, + "grad_norm": 0.7781035304069519, + "learning_rate": 7.915169834742964e-06, + "loss": 0.8471, + "step": 11039 + }, + { + "epoch": 0.6076283780064946, + "grad_norm": 0.7426049709320068, + "learning_rate": 7.914817654798368e-06, + "loss": 0.753, + "step": 11040 + }, + { + "epoch": 0.6076834168088502, + "grad_norm": 0.6990010738372803, + "learning_rate": 7.914465452946782e-06, + "loss": 0.7556, + "step": 11041 + }, + { + "epoch": 0.6077384556112059, + "grad_norm": 0.8038754463195801, + "learning_rate": 7.914113229190856e-06, + "loss": 0.7787, + "step": 11042 + }, + { + "epoch": 0.6077934944135616, + "grad_norm": 0.6434115767478943, + "learning_rate": 7.913760983533233e-06, + "loss": 0.7831, + "step": 11043 + }, + { + "epoch": 0.6078485332159173, + "grad_norm": 0.8119033575057983, + "learning_rate": 7.913408715976562e-06, + "loss": 0.7691, + "step": 11044 + }, + { + "epoch": 0.6079035720182728, + "grad_norm": 0.6710149049758911, + "learning_rate": 7.913056426523493e-06, + "loss": 0.7542, + "step": 11045 + }, + { + "epoch": 0.6079586108206285, + "grad_norm": 0.7458183765411377, + "learning_rate": 7.912704115176671e-06, + "loss": 0.7673, + "step": 11046 + }, + { + "epoch": 0.6080136496229842, + "grad_norm": 0.8061705827713013, + "learning_rate": 7.912351781938745e-06, + "loss": 0.9255, + "step": 11047 + }, + { + "epoch": 0.6080686884253399, + "grad_norm": 0.7193130850791931, + "learning_rate": 7.91199942681236e-06, + "loss": 0.8154, + "step": 11048 + }, + { + "epoch": 0.6081237272276955, + "grad_norm": 0.7785167098045349, + "learning_rate": 7.911647049800171e-06, + "loss": 0.7747, + "step": 11049 + }, + { + "epoch": 0.6081787660300512, + "grad_norm": 0.665765106678009, + "learning_rate": 7.911294650904818e-06, + "loss": 0.7573, + "step": 11050 + }, + { + "epoch": 0.6082338048324069, + "grad_norm": 0.7940623760223389, + "learning_rate": 7.910942230128956e-06, + "loss": 0.6628, + "step": 11051 + }, + { + "epoch": 0.6082888436347625, + "grad_norm": 0.8364549875259399, + "learning_rate": 7.910589787475232e-06, + "loss": 0.8103, + "step": 11052 + }, + { + "epoch": 0.6083438824371181, + "grad_norm": 0.6153101325035095, + "learning_rate": 7.910237322946292e-06, + "loss": 0.76, + "step": 11053 + }, + { + "epoch": 0.6083989212394738, + "grad_norm": 0.8381257653236389, + "learning_rate": 7.909884836544789e-06, + "loss": 0.8366, + "step": 11054 + }, + { + "epoch": 0.6084539600418295, + "grad_norm": 0.6602391600608826, + "learning_rate": 7.90953232827337e-06, + "loss": 0.7389, + "step": 11055 + }, + { + "epoch": 0.6085089988441852, + "grad_norm": 0.7329971194267273, + "learning_rate": 7.909179798134685e-06, + "loss": 0.8217, + "step": 11056 + }, + { + "epoch": 0.6085640376465408, + "grad_norm": 0.7319926023483276, + "learning_rate": 7.908827246131383e-06, + "loss": 0.78, + "step": 11057 + }, + { + "epoch": 0.6086190764488965, + "grad_norm": 0.6491387486457825, + "learning_rate": 7.908474672266114e-06, + "loss": 0.7496, + "step": 11058 + }, + { + "epoch": 0.6086741152512521, + "grad_norm": 0.656434953212738, + "learning_rate": 7.908122076541529e-06, + "loss": 0.7462, + "step": 11059 + }, + { + "epoch": 0.6087291540536078, + "grad_norm": 0.6908577680587769, + "learning_rate": 7.907769458960275e-06, + "loss": 0.7505, + "step": 11060 + }, + { + "epoch": 0.6087841928559634, + "grad_norm": 0.774424135684967, + "learning_rate": 7.907416819525007e-06, + "loss": 0.8275, + "step": 11061 + }, + { + "epoch": 0.6088392316583191, + "grad_norm": 0.6796718835830688, + "learning_rate": 7.90706415823837e-06, + "loss": 0.7606, + "step": 11062 + }, + { + "epoch": 0.6088942704606748, + "grad_norm": 0.9576514959335327, + "learning_rate": 7.906711475103016e-06, + "loss": 0.807, + "step": 11063 + }, + { + "epoch": 0.6089493092630305, + "grad_norm": 0.9848490953445435, + "learning_rate": 7.9063587701216e-06, + "loss": 0.7856, + "step": 11064 + }, + { + "epoch": 0.6090043480653861, + "grad_norm": 0.9490165710449219, + "learning_rate": 7.906006043296768e-06, + "loss": 0.8519, + "step": 11065 + }, + { + "epoch": 0.6090593868677417, + "grad_norm": 0.631382942199707, + "learning_rate": 7.905653294631172e-06, + "loss": 0.7041, + "step": 11066 + }, + { + "epoch": 0.6091144256700974, + "grad_norm": 0.6969574093818665, + "learning_rate": 7.905300524127464e-06, + "loss": 0.7556, + "step": 11067 + }, + { + "epoch": 0.6091694644724531, + "grad_norm": 0.6990532279014587, + "learning_rate": 7.904947731788295e-06, + "loss": 0.799, + "step": 11068 + }, + { + "epoch": 0.6092245032748087, + "grad_norm": 0.7216916084289551, + "learning_rate": 7.904594917616315e-06, + "loss": 0.7617, + "step": 11069 + }, + { + "epoch": 0.6092795420771644, + "grad_norm": 0.6874147653579712, + "learning_rate": 7.904242081614179e-06, + "loss": 0.7616, + "step": 11070 + }, + { + "epoch": 0.6093345808795201, + "grad_norm": 0.6909550428390503, + "learning_rate": 7.903889223784535e-06, + "loss": 0.7649, + "step": 11071 + }, + { + "epoch": 0.6093896196818758, + "grad_norm": 0.7796370387077332, + "learning_rate": 7.90353634413004e-06, + "loss": 0.7557, + "step": 11072 + }, + { + "epoch": 0.6094446584842313, + "grad_norm": 0.807448148727417, + "learning_rate": 7.903183442653341e-06, + "loss": 0.7519, + "step": 11073 + }, + { + "epoch": 0.609499697286587, + "grad_norm": 0.846371054649353, + "learning_rate": 7.902830519357092e-06, + "loss": 0.9342, + "step": 11074 + }, + { + "epoch": 0.6095547360889427, + "grad_norm": 1.0386929512023926, + "learning_rate": 7.902477574243947e-06, + "loss": 0.6802, + "step": 11075 + }, + { + "epoch": 0.6096097748912984, + "grad_norm": 0.8011854887008667, + "learning_rate": 7.902124607316558e-06, + "loss": 0.7756, + "step": 11076 + }, + { + "epoch": 0.609664813693654, + "grad_norm": 0.6560170650482178, + "learning_rate": 7.901771618577574e-06, + "loss": 0.7831, + "step": 11077 + }, + { + "epoch": 0.6097198524960097, + "grad_norm": 0.656891942024231, + "learning_rate": 7.901418608029655e-06, + "loss": 0.7239, + "step": 11078 + }, + { + "epoch": 0.6097748912983654, + "grad_norm": 0.7451794743537903, + "learning_rate": 7.901065575675448e-06, + "loss": 0.7426, + "step": 11079 + }, + { + "epoch": 0.6098299301007211, + "grad_norm": 0.6805453300476074, + "learning_rate": 7.90071252151761e-06, + "loss": 0.7257, + "step": 11080 + }, + { + "epoch": 0.6098849689030766, + "grad_norm": 0.7747140526771545, + "learning_rate": 7.900359445558791e-06, + "loss": 0.8554, + "step": 11081 + }, + { + "epoch": 0.6099400077054323, + "grad_norm": 0.7276260256767273, + "learning_rate": 7.900006347801649e-06, + "loss": 0.7608, + "step": 11082 + }, + { + "epoch": 0.609995046507788, + "grad_norm": 0.7496321201324463, + "learning_rate": 7.899653228248836e-06, + "loss": 0.7707, + "step": 11083 + }, + { + "epoch": 0.6100500853101436, + "grad_norm": 0.6810722947120667, + "learning_rate": 7.899300086903006e-06, + "loss": 0.7425, + "step": 11084 + }, + { + "epoch": 0.6101051241124993, + "grad_norm": 0.7245593070983887, + "learning_rate": 7.89894692376681e-06, + "loss": 0.8404, + "step": 11085 + }, + { + "epoch": 0.610160162914855, + "grad_norm": 0.7139402627944946, + "learning_rate": 7.898593738842906e-06, + "loss": 0.7219, + "step": 11086 + }, + { + "epoch": 0.6102152017172107, + "grad_norm": 0.6483772397041321, + "learning_rate": 7.898240532133947e-06, + "loss": 0.7571, + "step": 11087 + }, + { + "epoch": 0.6102702405195662, + "grad_norm": 0.7347467541694641, + "learning_rate": 7.89788730364259e-06, + "loss": 0.7666, + "step": 11088 + }, + { + "epoch": 0.6103252793219219, + "grad_norm": 0.8899261355400085, + "learning_rate": 7.897534053371485e-06, + "loss": 0.6886, + "step": 11089 + }, + { + "epoch": 0.6103803181242776, + "grad_norm": 0.7005650401115417, + "learning_rate": 7.89718078132329e-06, + "loss": 0.6771, + "step": 11090 + }, + { + "epoch": 0.6104353569266333, + "grad_norm": 0.776589035987854, + "learning_rate": 7.896827487500662e-06, + "loss": 0.7731, + "step": 11091 + }, + { + "epoch": 0.6104903957289889, + "grad_norm": 0.7039395570755005, + "learning_rate": 7.896474171906252e-06, + "loss": 0.7415, + "step": 11092 + }, + { + "epoch": 0.6105454345313446, + "grad_norm": 0.7453792095184326, + "learning_rate": 7.896120834542718e-06, + "loss": 0.8507, + "step": 11093 + }, + { + "epoch": 0.6106004733337003, + "grad_norm": 0.7516497373580933, + "learning_rate": 7.895767475412717e-06, + "loss": 0.8271, + "step": 11094 + }, + { + "epoch": 0.610655512136056, + "grad_norm": 0.6751283407211304, + "learning_rate": 7.895414094518901e-06, + "loss": 0.7788, + "step": 11095 + }, + { + "epoch": 0.6107105509384115, + "grad_norm": 0.7240836024284363, + "learning_rate": 7.895060691863927e-06, + "loss": 0.7507, + "step": 11096 + }, + { + "epoch": 0.6107655897407672, + "grad_norm": 0.8286149501800537, + "learning_rate": 7.894707267450451e-06, + "loss": 0.7033, + "step": 11097 + }, + { + "epoch": 0.6108206285431229, + "grad_norm": 0.8814655542373657, + "learning_rate": 7.894353821281131e-06, + "loss": 0.73, + "step": 11098 + }, + { + "epoch": 0.6108756673454786, + "grad_norm": 0.6792872548103333, + "learning_rate": 7.894000353358624e-06, + "loss": 0.7445, + "step": 11099 + }, + { + "epoch": 0.6109307061478342, + "grad_norm": 0.6442595720291138, + "learning_rate": 7.893646863685584e-06, + "loss": 0.7228, + "step": 11100 + }, + { + "epoch": 0.6109857449501899, + "grad_norm": 0.6775944828987122, + "learning_rate": 7.89329335226467e-06, + "loss": 0.7937, + "step": 11101 + }, + { + "epoch": 0.6110407837525456, + "grad_norm": 0.6315211653709412, + "learning_rate": 7.892939819098534e-06, + "loss": 0.7328, + "step": 11102 + }, + { + "epoch": 0.6110958225549012, + "grad_norm": 0.7419382929801941, + "learning_rate": 7.89258626418984e-06, + "loss": 0.8088, + "step": 11103 + }, + { + "epoch": 0.6111508613572568, + "grad_norm": 0.6645117402076721, + "learning_rate": 7.89223268754124e-06, + "loss": 0.7844, + "step": 11104 + }, + { + "epoch": 0.6112059001596125, + "grad_norm": 0.6389926075935364, + "learning_rate": 7.891879089155397e-06, + "loss": 0.6353, + "step": 11105 + }, + { + "epoch": 0.6112609389619682, + "grad_norm": 0.8223785758018494, + "learning_rate": 7.891525469034963e-06, + "loss": 0.7377, + "step": 11106 + }, + { + "epoch": 0.6113159777643239, + "grad_norm": 0.7627747058868408, + "learning_rate": 7.891171827182595e-06, + "loss": 0.8317, + "step": 11107 + }, + { + "epoch": 0.6113710165666795, + "grad_norm": 0.8015971183776855, + "learning_rate": 7.890818163600956e-06, + "loss": 0.8324, + "step": 11108 + }, + { + "epoch": 0.6114260553690352, + "grad_norm": 0.7180280089378357, + "learning_rate": 7.8904644782927e-06, + "loss": 0.8211, + "step": 11109 + }, + { + "epoch": 0.6114810941713908, + "grad_norm": 0.7855646014213562, + "learning_rate": 7.890110771260487e-06, + "loss": 0.8629, + "step": 11110 + }, + { + "epoch": 0.6115361329737465, + "grad_norm": 0.7389342784881592, + "learning_rate": 7.889757042506976e-06, + "loss": 0.6917, + "step": 11111 + }, + { + "epoch": 0.6115911717761021, + "grad_norm": 0.7996030449867249, + "learning_rate": 7.889403292034825e-06, + "loss": 0.7361, + "step": 11112 + }, + { + "epoch": 0.6116462105784578, + "grad_norm": 0.6658353805541992, + "learning_rate": 7.88904951984669e-06, + "loss": 0.7048, + "step": 11113 + }, + { + "epoch": 0.6117012493808135, + "grad_norm": 0.8128555417060852, + "learning_rate": 7.888695725945235e-06, + "loss": 0.7772, + "step": 11114 + }, + { + "epoch": 0.6117562881831692, + "grad_norm": 0.7597428560256958, + "learning_rate": 7.888341910333114e-06, + "loss": 0.7447, + "step": 11115 + }, + { + "epoch": 0.6118113269855248, + "grad_norm": 0.7330088019371033, + "learning_rate": 7.88798807301299e-06, + "loss": 0.849, + "step": 11116 + }, + { + "epoch": 0.6118663657878805, + "grad_norm": 0.8374074101448059, + "learning_rate": 7.88763421398752e-06, + "loss": 0.6149, + "step": 11117 + }, + { + "epoch": 0.6119214045902361, + "grad_norm": 0.7507160305976868, + "learning_rate": 7.887280333259364e-06, + "loss": 0.7737, + "step": 11118 + }, + { + "epoch": 0.6119764433925918, + "grad_norm": 0.7218281626701355, + "learning_rate": 7.886926430831181e-06, + "loss": 0.8151, + "step": 11119 + }, + { + "epoch": 0.6120314821949474, + "grad_norm": 0.6761744618415833, + "learning_rate": 7.886572506705634e-06, + "loss": 0.7429, + "step": 11120 + }, + { + "epoch": 0.6120865209973031, + "grad_norm": 0.8243520259857178, + "learning_rate": 7.886218560885379e-06, + "loss": 0.819, + "step": 11121 + }, + { + "epoch": 0.6121415597996588, + "grad_norm": 0.9675465822219849, + "learning_rate": 7.885864593373078e-06, + "loss": 0.7834, + "step": 11122 + }, + { + "epoch": 0.6121965986020145, + "grad_norm": 0.7220338582992554, + "learning_rate": 7.885510604171391e-06, + "loss": 0.8266, + "step": 11123 + }, + { + "epoch": 0.61225163740437, + "grad_norm": 0.7185316681861877, + "learning_rate": 7.88515659328298e-06, + "loss": 0.7949, + "step": 11124 + }, + { + "epoch": 0.6123066762067257, + "grad_norm": 0.67637038230896, + "learning_rate": 7.884802560710503e-06, + "loss": 0.7456, + "step": 11125 + }, + { + "epoch": 0.6123617150090814, + "grad_norm": 0.7886855602264404, + "learning_rate": 7.884448506456622e-06, + "loss": 0.7181, + "step": 11126 + }, + { + "epoch": 0.612416753811437, + "grad_norm": 0.7250227928161621, + "learning_rate": 7.884094430523999e-06, + "loss": 0.7537, + "step": 11127 + }, + { + "epoch": 0.6124717926137927, + "grad_norm": 0.6771906614303589, + "learning_rate": 7.883740332915295e-06, + "loss": 0.7642, + "step": 11128 + }, + { + "epoch": 0.6125268314161484, + "grad_norm": 0.8375886082649231, + "learning_rate": 7.88338621363317e-06, + "loss": 0.7231, + "step": 11129 + }, + { + "epoch": 0.6125818702185041, + "grad_norm": 0.6782773733139038, + "learning_rate": 7.883032072680285e-06, + "loss": 0.8391, + "step": 11130 + }, + { + "epoch": 0.6126369090208597, + "grad_norm": 0.7103945016860962, + "learning_rate": 7.882677910059304e-06, + "loss": 0.7838, + "step": 11131 + }, + { + "epoch": 0.6126919478232153, + "grad_norm": 0.7037224769592285, + "learning_rate": 7.882323725772887e-06, + "loss": 0.7906, + "step": 11132 + }, + { + "epoch": 0.612746986625571, + "grad_norm": 0.6872009634971619, + "learning_rate": 7.881969519823695e-06, + "loss": 0.7764, + "step": 11133 + }, + { + "epoch": 0.6128020254279267, + "grad_norm": 0.7377448678016663, + "learning_rate": 7.881615292214393e-06, + "loss": 0.8231, + "step": 11134 + }, + { + "epoch": 0.6128570642302823, + "grad_norm": 0.62479168176651, + "learning_rate": 7.881261042947642e-06, + "loss": 0.6522, + "step": 11135 + }, + { + "epoch": 0.612912103032638, + "grad_norm": 0.7989023923873901, + "learning_rate": 7.880906772026105e-06, + "loss": 0.7326, + "step": 11136 + }, + { + "epoch": 0.6129671418349937, + "grad_norm": 0.6322734951972961, + "learning_rate": 7.880552479452441e-06, + "loss": 0.6775, + "step": 11137 + }, + { + "epoch": 0.6130221806373494, + "grad_norm": 0.8628767132759094, + "learning_rate": 7.880198165229318e-06, + "loss": 0.7705, + "step": 11138 + }, + { + "epoch": 0.613077219439705, + "grad_norm": 0.7386173605918884, + "learning_rate": 7.879843829359396e-06, + "loss": 0.7297, + "step": 11139 + }, + { + "epoch": 0.6131322582420606, + "grad_norm": 0.6882045269012451, + "learning_rate": 7.879489471845339e-06, + "loss": 0.6875, + "step": 11140 + }, + { + "epoch": 0.6131872970444163, + "grad_norm": 0.5986032485961914, + "learning_rate": 7.879135092689809e-06, + "loss": 0.6329, + "step": 11141 + }, + { + "epoch": 0.613242335846772, + "grad_norm": 0.7973099946975708, + "learning_rate": 7.878780691895472e-06, + "loss": 0.809, + "step": 11142 + }, + { + "epoch": 0.6132973746491276, + "grad_norm": 0.6828579902648926, + "learning_rate": 7.878426269464989e-06, + "loss": 0.7777, + "step": 11143 + }, + { + "epoch": 0.6133524134514833, + "grad_norm": 0.8179183006286621, + "learning_rate": 7.878071825401024e-06, + "loss": 0.7275, + "step": 11144 + }, + { + "epoch": 0.613407452253839, + "grad_norm": 0.7290762066841125, + "learning_rate": 7.877717359706242e-06, + "loss": 0.7424, + "step": 11145 + }, + { + "epoch": 0.6134624910561947, + "grad_norm": 0.732510507106781, + "learning_rate": 7.877362872383305e-06, + "loss": 0.6157, + "step": 11146 + }, + { + "epoch": 0.6135175298585502, + "grad_norm": 0.9205982685089111, + "learning_rate": 7.877008363434881e-06, + "loss": 0.7723, + "step": 11147 + }, + { + "epoch": 0.6135725686609059, + "grad_norm": 0.7138587832450867, + "learning_rate": 7.876653832863633e-06, + "loss": 0.7773, + "step": 11148 + }, + { + "epoch": 0.6136276074632616, + "grad_norm": 0.7323171496391296, + "learning_rate": 7.876299280672224e-06, + "loss": 0.8265, + "step": 11149 + }, + { + "epoch": 0.6136826462656173, + "grad_norm": 0.6717494130134583, + "learning_rate": 7.875944706863318e-06, + "loss": 0.788, + "step": 11150 + }, + { + "epoch": 0.6137376850679729, + "grad_norm": 0.7779331207275391, + "learning_rate": 7.875590111439582e-06, + "loss": 0.7864, + "step": 11151 + }, + { + "epoch": 0.6137927238703286, + "grad_norm": 0.6706684827804565, + "learning_rate": 7.875235494403683e-06, + "loss": 0.6673, + "step": 11152 + }, + { + "epoch": 0.6138477626726843, + "grad_norm": 0.7142137885093689, + "learning_rate": 7.874880855758281e-06, + "loss": 0.8031, + "step": 11153 + }, + { + "epoch": 0.61390280147504, + "grad_norm": 0.6962595582008362, + "learning_rate": 7.874526195506045e-06, + "loss": 0.692, + "step": 11154 + }, + { + "epoch": 0.6139578402773955, + "grad_norm": 0.7237100601196289, + "learning_rate": 7.874171513649638e-06, + "loss": 0.7504, + "step": 11155 + }, + { + "epoch": 0.6140128790797512, + "grad_norm": 0.8235127925872803, + "learning_rate": 7.87381681019173e-06, + "loss": 0.8132, + "step": 11156 + }, + { + "epoch": 0.6140679178821069, + "grad_norm": 0.7483351826667786, + "learning_rate": 7.873462085134981e-06, + "loss": 0.7589, + "step": 11157 + }, + { + "epoch": 0.6141229566844626, + "grad_norm": 0.7309976816177368, + "learning_rate": 7.873107338482062e-06, + "loss": 0.7722, + "step": 11158 + }, + { + "epoch": 0.6141779954868182, + "grad_norm": 0.8871245384216309, + "learning_rate": 7.872752570235639e-06, + "loss": 0.882, + "step": 11159 + }, + { + "epoch": 0.6142330342891739, + "grad_norm": 0.5987886190414429, + "learning_rate": 7.872397780398374e-06, + "loss": 0.6312, + "step": 11160 + }, + { + "epoch": 0.6142880730915296, + "grad_norm": 0.7320038080215454, + "learning_rate": 7.872042968972937e-06, + "loss": 0.7444, + "step": 11161 + }, + { + "epoch": 0.6143431118938852, + "grad_norm": 0.8111129999160767, + "learning_rate": 7.871688135961995e-06, + "loss": 0.7413, + "step": 11162 + }, + { + "epoch": 0.6143981506962408, + "grad_norm": 0.7497085332870483, + "learning_rate": 7.871333281368211e-06, + "loss": 0.8413, + "step": 11163 + }, + { + "epoch": 0.6144531894985965, + "grad_norm": 0.8341198563575745, + "learning_rate": 7.870978405194256e-06, + "loss": 0.7959, + "step": 11164 + }, + { + "epoch": 0.6145082283009522, + "grad_norm": 0.6293482780456543, + "learning_rate": 7.870623507442797e-06, + "loss": 0.6429, + "step": 11165 + }, + { + "epoch": 0.6145632671033079, + "grad_norm": 1.2423945665359497, + "learning_rate": 7.870268588116499e-06, + "loss": 0.6309, + "step": 11166 + }, + { + "epoch": 0.6146183059056635, + "grad_norm": 0.7811731100082397, + "learning_rate": 7.86991364721803e-06, + "loss": 0.738, + "step": 11167 + }, + { + "epoch": 0.6146733447080192, + "grad_norm": 0.6904361248016357, + "learning_rate": 7.869558684750061e-06, + "loss": 0.7995, + "step": 11168 + }, + { + "epoch": 0.6147283835103748, + "grad_norm": 0.7267210483551025, + "learning_rate": 7.869203700715254e-06, + "loss": 0.6989, + "step": 11169 + }, + { + "epoch": 0.6147834223127304, + "grad_norm": 0.7183068990707397, + "learning_rate": 7.868848695116282e-06, + "loss": 0.7872, + "step": 11170 + }, + { + "epoch": 0.6148384611150861, + "grad_norm": 0.6774286031723022, + "learning_rate": 7.868493667955808e-06, + "loss": 0.7502, + "step": 11171 + }, + { + "epoch": 0.6148934999174418, + "grad_norm": 0.7587934732437134, + "learning_rate": 7.868138619236507e-06, + "loss": 0.8037, + "step": 11172 + }, + { + "epoch": 0.6149485387197975, + "grad_norm": 0.6825854182243347, + "learning_rate": 7.867783548961043e-06, + "loss": 0.7924, + "step": 11173 + }, + { + "epoch": 0.6150035775221531, + "grad_norm": 0.6243380904197693, + "learning_rate": 7.867428457132084e-06, + "loss": 0.5953, + "step": 11174 + }, + { + "epoch": 0.6150586163245088, + "grad_norm": 0.6630006432533264, + "learning_rate": 7.8670733437523e-06, + "loss": 0.7102, + "step": 11175 + }, + { + "epoch": 0.6151136551268644, + "grad_norm": 0.7059652805328369, + "learning_rate": 7.866718208824362e-06, + "loss": 0.6847, + "step": 11176 + }, + { + "epoch": 0.6151686939292201, + "grad_norm": 0.6768305897712708, + "learning_rate": 7.866363052350938e-06, + "loss": 0.7152, + "step": 11177 + }, + { + "epoch": 0.6152237327315757, + "grad_norm": 0.6850628852844238, + "learning_rate": 7.866007874334696e-06, + "loss": 0.767, + "step": 11178 + }, + { + "epoch": 0.6152787715339314, + "grad_norm": 0.6767143607139587, + "learning_rate": 7.865652674778305e-06, + "loss": 0.6826, + "step": 11179 + }, + { + "epoch": 0.6153338103362871, + "grad_norm": 0.8240014314651489, + "learning_rate": 7.865297453684436e-06, + "loss": 0.8493, + "step": 11180 + }, + { + "epoch": 0.6153888491386428, + "grad_norm": 0.7725485563278198, + "learning_rate": 7.864942211055758e-06, + "loss": 0.8704, + "step": 11181 + }, + { + "epoch": 0.6154438879409984, + "grad_norm": 0.9260931015014648, + "learning_rate": 7.864586946894941e-06, + "loss": 0.7926, + "step": 11182 + }, + { + "epoch": 0.615498926743354, + "grad_norm": 0.7558152079582214, + "learning_rate": 7.864231661204655e-06, + "loss": 0.8436, + "step": 11183 + }, + { + "epoch": 0.6155539655457097, + "grad_norm": 0.7899817824363708, + "learning_rate": 7.863876353987571e-06, + "loss": 0.7579, + "step": 11184 + }, + { + "epoch": 0.6156090043480654, + "grad_norm": 0.7757478952407837, + "learning_rate": 7.863521025246362e-06, + "loss": 0.7534, + "step": 11185 + }, + { + "epoch": 0.615664043150421, + "grad_norm": 0.6563131809234619, + "learning_rate": 7.863165674983693e-06, + "loss": 0.728, + "step": 11186 + }, + { + "epoch": 0.6157190819527767, + "grad_norm": 0.6516488790512085, + "learning_rate": 7.862810303202234e-06, + "loss": 0.736, + "step": 11187 + }, + { + "epoch": 0.6157741207551324, + "grad_norm": 0.6867820620536804, + "learning_rate": 7.862454909904665e-06, + "loss": 0.8032, + "step": 11188 + }, + { + "epoch": 0.6158291595574881, + "grad_norm": 0.7399753928184509, + "learning_rate": 7.862099495093647e-06, + "loss": 0.8681, + "step": 11189 + }, + { + "epoch": 0.6158841983598436, + "grad_norm": 0.7249311804771423, + "learning_rate": 7.861744058771857e-06, + "loss": 0.7868, + "step": 11190 + }, + { + "epoch": 0.6159392371621993, + "grad_norm": 0.8579045534133911, + "learning_rate": 7.861388600941964e-06, + "loss": 0.7915, + "step": 11191 + }, + { + "epoch": 0.615994275964555, + "grad_norm": 0.6855454444885254, + "learning_rate": 7.86103312160664e-06, + "loss": 0.8442, + "step": 11192 + }, + { + "epoch": 0.6160493147669107, + "grad_norm": 0.7412910461425781, + "learning_rate": 7.860677620768558e-06, + "loss": 0.7684, + "step": 11193 + }, + { + "epoch": 0.6161043535692663, + "grad_norm": 0.8567430377006531, + "learning_rate": 7.860322098430389e-06, + "loss": 0.8801, + "step": 11194 + }, + { + "epoch": 0.616159392371622, + "grad_norm": 0.7504804134368896, + "learning_rate": 7.859966554594802e-06, + "loss": 0.7359, + "step": 11195 + }, + { + "epoch": 0.6162144311739777, + "grad_norm": 0.7086803317070007, + "learning_rate": 7.859610989264474e-06, + "loss": 0.8498, + "step": 11196 + }, + { + "epoch": 0.6162694699763334, + "grad_norm": 0.7201757431030273, + "learning_rate": 7.859255402442075e-06, + "loss": 0.608, + "step": 11197 + }, + { + "epoch": 0.6163245087786889, + "grad_norm": 0.8968291282653809, + "learning_rate": 7.858899794130279e-06, + "loss": 0.8067, + "step": 11198 + }, + { + "epoch": 0.6163795475810446, + "grad_norm": 0.7474254965782166, + "learning_rate": 7.858544164331756e-06, + "loss": 0.8355, + "step": 11199 + }, + { + "epoch": 0.6164345863834003, + "grad_norm": 0.6907560229301453, + "learning_rate": 7.85818851304918e-06, + "loss": 0.788, + "step": 11200 + }, + { + "epoch": 0.616489625185756, + "grad_norm": 0.725330650806427, + "learning_rate": 7.857832840285224e-06, + "loss": 0.8157, + "step": 11201 + }, + { + "epoch": 0.6165446639881116, + "grad_norm": 0.682722270488739, + "learning_rate": 7.857477146042562e-06, + "loss": 0.7939, + "step": 11202 + }, + { + "epoch": 0.6165997027904673, + "grad_norm": 0.661533534526825, + "learning_rate": 7.857121430323866e-06, + "loss": 0.7173, + "step": 11203 + }, + { + "epoch": 0.616654741592823, + "grad_norm": 0.6922706961631775, + "learning_rate": 7.856765693131811e-06, + "loss": 0.7719, + "step": 11204 + }, + { + "epoch": 0.6167097803951787, + "grad_norm": 0.72809898853302, + "learning_rate": 7.856409934469071e-06, + "loss": 0.7362, + "step": 11205 + }, + { + "epoch": 0.6167648191975342, + "grad_norm": 0.7540956735610962, + "learning_rate": 7.856054154338317e-06, + "loss": 0.7883, + "step": 11206 + }, + { + "epoch": 0.6168198579998899, + "grad_norm": 0.6777094006538391, + "learning_rate": 7.855698352742224e-06, + "loss": 0.6938, + "step": 11207 + }, + { + "epoch": 0.6168748968022456, + "grad_norm": 0.6771852970123291, + "learning_rate": 7.855342529683467e-06, + "loss": 0.697, + "step": 11208 + }, + { + "epoch": 0.6169299356046013, + "grad_norm": 0.7810118198394775, + "learning_rate": 7.854986685164721e-06, + "loss": 0.6875, + "step": 11209 + }, + { + "epoch": 0.6169849744069569, + "grad_norm": 0.6992766261100769, + "learning_rate": 7.854630819188658e-06, + "loss": 0.6553, + "step": 11210 + }, + { + "epoch": 0.6170400132093126, + "grad_norm": 0.7409703135490417, + "learning_rate": 7.854274931757954e-06, + "loss": 0.7685, + "step": 11211 + }, + { + "epoch": 0.6170950520116683, + "grad_norm": 0.7263410687446594, + "learning_rate": 7.853919022875285e-06, + "loss": 0.7939, + "step": 11212 + }, + { + "epoch": 0.6171500908140238, + "grad_norm": 0.8451918959617615, + "learning_rate": 7.853563092543323e-06, + "loss": 0.7522, + "step": 11213 + }, + { + "epoch": 0.6172051296163795, + "grad_norm": 0.672926664352417, + "learning_rate": 7.853207140764745e-06, + "loss": 0.732, + "step": 11214 + }, + { + "epoch": 0.6172601684187352, + "grad_norm": 0.6607885956764221, + "learning_rate": 7.852851167542226e-06, + "loss": 0.7441, + "step": 11215 + }, + { + "epoch": 0.6173152072210909, + "grad_norm": 0.730385422706604, + "learning_rate": 7.85249517287844e-06, + "loss": 0.7925, + "step": 11216 + }, + { + "epoch": 0.6173702460234465, + "grad_norm": 0.7338821887969971, + "learning_rate": 7.852139156776067e-06, + "loss": 0.8106, + "step": 11217 + }, + { + "epoch": 0.6174252848258022, + "grad_norm": 0.7662163376808167, + "learning_rate": 7.851783119237777e-06, + "loss": 0.8166, + "step": 11218 + }, + { + "epoch": 0.6174803236281579, + "grad_norm": 0.7738409042358398, + "learning_rate": 7.85142706026625e-06, + "loss": 0.7898, + "step": 11219 + }, + { + "epoch": 0.6175353624305135, + "grad_norm": 0.8129978775978088, + "learning_rate": 7.851070979864159e-06, + "loss": 0.7618, + "step": 11220 + }, + { + "epoch": 0.6175904012328691, + "grad_norm": 0.7923482060432434, + "learning_rate": 7.850714878034183e-06, + "loss": 0.7341, + "step": 11221 + }, + { + "epoch": 0.6176454400352248, + "grad_norm": 0.7189306020736694, + "learning_rate": 7.850358754778996e-06, + "loss": 0.7775, + "step": 11222 + }, + { + "epoch": 0.6177004788375805, + "grad_norm": 0.9873724579811096, + "learning_rate": 7.850002610101276e-06, + "loss": 0.8521, + "step": 11223 + }, + { + "epoch": 0.6177555176399362, + "grad_norm": 0.6350038051605225, + "learning_rate": 7.8496464440037e-06, + "loss": 0.6356, + "step": 11224 + }, + { + "epoch": 0.6178105564422918, + "grad_norm": 0.8059771060943604, + "learning_rate": 7.849290256488941e-06, + "loss": 0.821, + "step": 11225 + }, + { + "epoch": 0.6178655952446475, + "grad_norm": 0.7469610571861267, + "learning_rate": 7.848934047559684e-06, + "loss": 0.7782, + "step": 11226 + }, + { + "epoch": 0.6179206340470031, + "grad_norm": 0.6423176527023315, + "learning_rate": 7.848577817218597e-06, + "loss": 0.6693, + "step": 11227 + }, + { + "epoch": 0.6179756728493588, + "grad_norm": 0.7298387885093689, + "learning_rate": 7.848221565468363e-06, + "loss": 0.775, + "step": 11228 + }, + { + "epoch": 0.6180307116517144, + "grad_norm": 0.7125145196914673, + "learning_rate": 7.84786529231166e-06, + "loss": 0.7507, + "step": 11229 + }, + { + "epoch": 0.6180857504540701, + "grad_norm": 0.6658627390861511, + "learning_rate": 7.847508997751163e-06, + "loss": 0.7506, + "step": 11230 + }, + { + "epoch": 0.6181407892564258, + "grad_norm": 0.6425275206565857, + "learning_rate": 7.847152681789549e-06, + "loss": 0.657, + "step": 11231 + }, + { + "epoch": 0.6181958280587815, + "grad_norm": 0.8075960278511047, + "learning_rate": 7.846796344429498e-06, + "loss": 0.5434, + "step": 11232 + }, + { + "epoch": 0.6182508668611371, + "grad_norm": 0.8481889367103577, + "learning_rate": 7.846439985673689e-06, + "loss": 0.8303, + "step": 11233 + }, + { + "epoch": 0.6183059056634927, + "grad_norm": 0.7216358184814453, + "learning_rate": 7.846083605524799e-06, + "loss": 0.7589, + "step": 11234 + }, + { + "epoch": 0.6183609444658484, + "grad_norm": 0.8399745225906372, + "learning_rate": 7.845727203985504e-06, + "loss": 0.8096, + "step": 11235 + }, + { + "epoch": 0.6184159832682041, + "grad_norm": 0.6708692908287048, + "learning_rate": 7.845370781058489e-06, + "loss": 0.6858, + "step": 11236 + }, + { + "epoch": 0.6184710220705597, + "grad_norm": 0.6309100389480591, + "learning_rate": 7.845014336746426e-06, + "loss": 0.6093, + "step": 11237 + }, + { + "epoch": 0.6185260608729154, + "grad_norm": 0.8138728141784668, + "learning_rate": 7.844657871051997e-06, + "loss": 0.8259, + "step": 11238 + }, + { + "epoch": 0.6185810996752711, + "grad_norm": 0.6763564348220825, + "learning_rate": 7.844301383977882e-06, + "loss": 0.7056, + "step": 11239 + }, + { + "epoch": 0.6186361384776268, + "grad_norm": 0.792085587978363, + "learning_rate": 7.843944875526758e-06, + "loss": 0.7364, + "step": 11240 + }, + { + "epoch": 0.6186911772799824, + "grad_norm": 0.8738027811050415, + "learning_rate": 7.843588345701306e-06, + "loss": 0.7092, + "step": 11241 + }, + { + "epoch": 0.618746216082338, + "grad_norm": 0.7694413065910339, + "learning_rate": 7.843231794504205e-06, + "loss": 0.852, + "step": 11242 + }, + { + "epoch": 0.6188012548846937, + "grad_norm": 0.8211640119552612, + "learning_rate": 7.842875221938135e-06, + "loss": 0.8218, + "step": 11243 + }, + { + "epoch": 0.6188562936870494, + "grad_norm": 0.620566189289093, + "learning_rate": 7.842518628005776e-06, + "loss": 0.7176, + "step": 11244 + }, + { + "epoch": 0.618911332489405, + "grad_norm": 0.7044099569320679, + "learning_rate": 7.84216201270981e-06, + "loss": 0.8068, + "step": 11245 + }, + { + "epoch": 0.6189663712917607, + "grad_norm": 0.765209436416626, + "learning_rate": 7.841805376052912e-06, + "loss": 0.8002, + "step": 11246 + }, + { + "epoch": 0.6190214100941164, + "grad_norm": 0.7565444707870483, + "learning_rate": 7.841448718037765e-06, + "loss": 0.7997, + "step": 11247 + }, + { + "epoch": 0.6190764488964721, + "grad_norm": 0.9544101357460022, + "learning_rate": 7.841092038667052e-06, + "loss": 0.647, + "step": 11248 + }, + { + "epoch": 0.6191314876988276, + "grad_norm": 0.7319634556770325, + "learning_rate": 7.840735337943452e-06, + "loss": 0.7982, + "step": 11249 + }, + { + "epoch": 0.6191865265011833, + "grad_norm": 0.6017479300498962, + "learning_rate": 7.840378615869645e-06, + "loss": 0.6817, + "step": 11250 + }, + { + "epoch": 0.619241565303539, + "grad_norm": 0.6936477422714233, + "learning_rate": 7.840021872448312e-06, + "loss": 0.7227, + "step": 11251 + }, + { + "epoch": 0.6192966041058947, + "grad_norm": 0.6962631940841675, + "learning_rate": 7.839665107682135e-06, + "loss": 0.779, + "step": 11252 + }, + { + "epoch": 0.6193516429082503, + "grad_norm": 0.9580947160720825, + "learning_rate": 7.839308321573797e-06, + "loss": 0.8821, + "step": 11253 + }, + { + "epoch": 0.619406681710606, + "grad_norm": 0.7721261978149414, + "learning_rate": 7.838951514125977e-06, + "loss": 0.7146, + "step": 11254 + }, + { + "epoch": 0.6194617205129617, + "grad_norm": 0.7349434494972229, + "learning_rate": 7.838594685341354e-06, + "loss": 0.7601, + "step": 11255 + }, + { + "epoch": 0.6195167593153172, + "grad_norm": 0.6787356734275818, + "learning_rate": 7.838237835222618e-06, + "loss": 0.706, + "step": 11256 + }, + { + "epoch": 0.6195717981176729, + "grad_norm": 0.7658288478851318, + "learning_rate": 7.837880963772445e-06, + "loss": 0.7102, + "step": 11257 + }, + { + "epoch": 0.6196268369200286, + "grad_norm": 0.8083927035331726, + "learning_rate": 7.837524070993516e-06, + "loss": 0.8501, + "step": 11258 + }, + { + "epoch": 0.6196818757223843, + "grad_norm": 0.7656283974647522, + "learning_rate": 7.837167156888516e-06, + "loss": 0.7558, + "step": 11259 + }, + { + "epoch": 0.6197369145247399, + "grad_norm": 0.7897886037826538, + "learning_rate": 7.836810221460128e-06, + "loss": 0.7567, + "step": 11260 + }, + { + "epoch": 0.6197919533270956, + "grad_norm": 0.6858190298080444, + "learning_rate": 7.836453264711035e-06, + "loss": 0.717, + "step": 11261 + }, + { + "epoch": 0.6198469921294513, + "grad_norm": 0.7423431873321533, + "learning_rate": 7.836096286643917e-06, + "loss": 0.7047, + "step": 11262 + }, + { + "epoch": 0.619902030931807, + "grad_norm": 0.8277921676635742, + "learning_rate": 7.835739287261458e-06, + "loss": 0.7418, + "step": 11263 + }, + { + "epoch": 0.6199570697341625, + "grad_norm": 0.7102510929107666, + "learning_rate": 7.835382266566343e-06, + "loss": 0.8202, + "step": 11264 + }, + { + "epoch": 0.6200121085365182, + "grad_norm": 0.6705429553985596, + "learning_rate": 7.835025224561252e-06, + "loss": 0.7332, + "step": 11265 + }, + { + "epoch": 0.6200671473388739, + "grad_norm": 0.6529950499534607, + "learning_rate": 7.834668161248873e-06, + "loss": 0.7579, + "step": 11266 + }, + { + "epoch": 0.6201221861412296, + "grad_norm": 0.7189938426017761, + "learning_rate": 7.834311076631885e-06, + "loss": 0.7323, + "step": 11267 + }, + { + "epoch": 0.6201772249435852, + "grad_norm": 0.6559470891952515, + "learning_rate": 7.833953970712973e-06, + "loss": 0.5973, + "step": 11268 + }, + { + "epoch": 0.6202322637459409, + "grad_norm": 0.7971723675727844, + "learning_rate": 7.833596843494824e-06, + "loss": 0.804, + "step": 11269 + }, + { + "epoch": 0.6202873025482966, + "grad_norm": 0.7800958752632141, + "learning_rate": 7.833239694980118e-06, + "loss": 0.772, + "step": 11270 + }, + { + "epoch": 0.6203423413506522, + "grad_norm": 0.6831466555595398, + "learning_rate": 7.83288252517154e-06, + "loss": 0.7341, + "step": 11271 + }, + { + "epoch": 0.6203973801530078, + "grad_norm": 0.6504807472229004, + "learning_rate": 7.832525334071776e-06, + "loss": 0.6462, + "step": 11272 + }, + { + "epoch": 0.6204524189553635, + "grad_norm": 0.6973552703857422, + "learning_rate": 7.832168121683512e-06, + "loss": 0.7504, + "step": 11273 + }, + { + "epoch": 0.6205074577577192, + "grad_norm": 0.6772480607032776, + "learning_rate": 7.831810888009427e-06, + "loss": 0.7273, + "step": 11274 + }, + { + "epoch": 0.6205624965600749, + "grad_norm": 0.7077416777610779, + "learning_rate": 7.831453633052212e-06, + "loss": 0.7365, + "step": 11275 + }, + { + "epoch": 0.6206175353624305, + "grad_norm": 0.7338337898254395, + "learning_rate": 7.831096356814548e-06, + "loss": 0.7959, + "step": 11276 + }, + { + "epoch": 0.6206725741647862, + "grad_norm": 0.6313255429267883, + "learning_rate": 7.830739059299123e-06, + "loss": 0.7027, + "step": 11277 + }, + { + "epoch": 0.6207276129671419, + "grad_norm": 0.7377570867538452, + "learning_rate": 7.830381740508619e-06, + "loss": 0.6903, + "step": 11278 + }, + { + "epoch": 0.6207826517694975, + "grad_norm": 0.6868650317192078, + "learning_rate": 7.830024400445724e-06, + "loss": 0.6882, + "step": 11279 + }, + { + "epoch": 0.6208376905718531, + "grad_norm": 0.7632661461830139, + "learning_rate": 7.829667039113124e-06, + "loss": 0.8437, + "step": 11280 + }, + { + "epoch": 0.6208927293742088, + "grad_norm": 0.9241608381271362, + "learning_rate": 7.829309656513504e-06, + "loss": 0.779, + "step": 11281 + }, + { + "epoch": 0.6209477681765645, + "grad_norm": 0.6857842206954956, + "learning_rate": 7.828952252649551e-06, + "loss": 0.7882, + "step": 11282 + }, + { + "epoch": 0.6210028069789202, + "grad_norm": 0.695659875869751, + "learning_rate": 7.828594827523947e-06, + "loss": 0.7471, + "step": 11283 + }, + { + "epoch": 0.6210578457812758, + "grad_norm": 0.6398521661758423, + "learning_rate": 7.828237381139383e-06, + "loss": 0.7328, + "step": 11284 + }, + { + "epoch": 0.6211128845836315, + "grad_norm": 0.7386063933372498, + "learning_rate": 7.827879913498544e-06, + "loss": 0.748, + "step": 11285 + }, + { + "epoch": 0.6211679233859871, + "grad_norm": 0.6740923523902893, + "learning_rate": 7.827522424604117e-06, + "loss": 0.6866, + "step": 11286 + }, + { + "epoch": 0.6212229621883428, + "grad_norm": 0.6794413924217224, + "learning_rate": 7.82716491445879e-06, + "loss": 0.7299, + "step": 11287 + }, + { + "epoch": 0.6212780009906984, + "grad_norm": 0.6471715569496155, + "learning_rate": 7.826807383065245e-06, + "loss": 0.7071, + "step": 11288 + }, + { + "epoch": 0.6213330397930541, + "grad_norm": 0.9716162085533142, + "learning_rate": 7.826449830426174e-06, + "loss": 0.7417, + "step": 11289 + }, + { + "epoch": 0.6213880785954098, + "grad_norm": 0.6928716897964478, + "learning_rate": 7.826092256544263e-06, + "loss": 0.7757, + "step": 11290 + }, + { + "epoch": 0.6214431173977655, + "grad_norm": 0.6739227175712585, + "learning_rate": 7.825734661422197e-06, + "loss": 0.7576, + "step": 11291 + }, + { + "epoch": 0.621498156200121, + "grad_norm": 1.2619935274124146, + "learning_rate": 7.825377045062668e-06, + "loss": 0.7454, + "step": 11292 + }, + { + "epoch": 0.6215531950024767, + "grad_norm": 0.6713572144508362, + "learning_rate": 7.825019407468361e-06, + "loss": 0.7916, + "step": 11293 + }, + { + "epoch": 0.6216082338048324, + "grad_norm": 0.6143541932106018, + "learning_rate": 7.824661748641964e-06, + "loss": 0.6765, + "step": 11294 + }, + { + "epoch": 0.6216632726071881, + "grad_norm": 0.7141658067703247, + "learning_rate": 7.824304068586163e-06, + "loss": 0.7773, + "step": 11295 + }, + { + "epoch": 0.6217183114095437, + "grad_norm": 0.7320290803909302, + "learning_rate": 7.823946367303653e-06, + "loss": 0.8062, + "step": 11296 + }, + { + "epoch": 0.6217733502118994, + "grad_norm": 0.7523403167724609, + "learning_rate": 7.823588644797115e-06, + "loss": 0.7126, + "step": 11297 + }, + { + "epoch": 0.6218283890142551, + "grad_norm": 0.6512221097946167, + "learning_rate": 7.823230901069242e-06, + "loss": 0.7563, + "step": 11298 + }, + { + "epoch": 0.6218834278166107, + "grad_norm": 0.6512733697891235, + "learning_rate": 7.82287313612272e-06, + "loss": 0.7603, + "step": 11299 + }, + { + "epoch": 0.6219384666189663, + "grad_norm": 1.0590927600860596, + "learning_rate": 7.82251534996024e-06, + "loss": 0.8325, + "step": 11300 + }, + { + "epoch": 0.621993505421322, + "grad_norm": 0.6763397455215454, + "learning_rate": 7.82215754258449e-06, + "loss": 0.7915, + "step": 11301 + }, + { + "epoch": 0.6220485442236777, + "grad_norm": 0.6640639901161194, + "learning_rate": 7.82179971399816e-06, + "loss": 0.6953, + "step": 11302 + }, + { + "epoch": 0.6221035830260333, + "grad_norm": 0.6611515283584595, + "learning_rate": 7.821441864203938e-06, + "loss": 0.8331, + "step": 11303 + }, + { + "epoch": 0.622158621828389, + "grad_norm": 0.8226057887077332, + "learning_rate": 7.821083993204514e-06, + "loss": 0.7448, + "step": 11304 + }, + { + "epoch": 0.6222136606307447, + "grad_norm": 0.6798059940338135, + "learning_rate": 7.820726101002578e-06, + "loss": 0.717, + "step": 11305 + }, + { + "epoch": 0.6222686994331004, + "grad_norm": 0.7623499631881714, + "learning_rate": 7.820368187600821e-06, + "loss": 0.7343, + "step": 11306 + }, + { + "epoch": 0.622323738235456, + "grad_norm": 0.703886866569519, + "learning_rate": 7.82001025300193e-06, + "loss": 0.8008, + "step": 11307 + }, + { + "epoch": 0.6223787770378116, + "grad_norm": 0.6817659735679626, + "learning_rate": 7.819652297208597e-06, + "loss": 0.7534, + "step": 11308 + }, + { + "epoch": 0.6224338158401673, + "grad_norm": 0.8991402983665466, + "learning_rate": 7.819294320223513e-06, + "loss": 0.6236, + "step": 11309 + }, + { + "epoch": 0.622488854642523, + "grad_norm": 0.791199803352356, + "learning_rate": 7.818936322049366e-06, + "loss": 0.772, + "step": 11310 + }, + { + "epoch": 0.6225438934448786, + "grad_norm": 0.6401470303535461, + "learning_rate": 7.81857830268885e-06, + "loss": 0.7749, + "step": 11311 + }, + { + "epoch": 0.6225989322472343, + "grad_norm": 0.6731516122817993, + "learning_rate": 7.818220262144653e-06, + "loss": 0.7506, + "step": 11312 + }, + { + "epoch": 0.62265397104959, + "grad_norm": 0.7391661405563354, + "learning_rate": 7.817862200419467e-06, + "loss": 0.7288, + "step": 11313 + }, + { + "epoch": 0.6227090098519457, + "grad_norm": 0.7363784909248352, + "learning_rate": 7.817504117515984e-06, + "loss": 0.7087, + "step": 11314 + }, + { + "epoch": 0.6227640486543012, + "grad_norm": 0.7609296441078186, + "learning_rate": 7.817146013436893e-06, + "loss": 0.7553, + "step": 11315 + }, + { + "epoch": 0.6228190874566569, + "grad_norm": 0.6818829774856567, + "learning_rate": 7.816787888184886e-06, + "loss": 0.7534, + "step": 11316 + }, + { + "epoch": 0.6228741262590126, + "grad_norm": 0.7434844374656677, + "learning_rate": 7.816429741762657e-06, + "loss": 0.8008, + "step": 11317 + }, + { + "epoch": 0.6229291650613683, + "grad_norm": 0.6881742477416992, + "learning_rate": 7.816071574172895e-06, + "loss": 0.7324, + "step": 11318 + }, + { + "epoch": 0.6229842038637239, + "grad_norm": 0.7109540104866028, + "learning_rate": 7.815713385418293e-06, + "loss": 0.7954, + "step": 11319 + }, + { + "epoch": 0.6230392426660796, + "grad_norm": 0.6868860721588135, + "learning_rate": 7.815355175501542e-06, + "loss": 0.6703, + "step": 11320 + }, + { + "epoch": 0.6230942814684353, + "grad_norm": 0.7851449847221375, + "learning_rate": 7.814996944425337e-06, + "loss": 0.8321, + "step": 11321 + }, + { + "epoch": 0.623149320270791, + "grad_norm": 0.7966809272766113, + "learning_rate": 7.814638692192367e-06, + "loss": 0.7603, + "step": 11322 + }, + { + "epoch": 0.6232043590731465, + "grad_norm": 0.6612964272499084, + "learning_rate": 7.814280418805327e-06, + "loss": 0.8096, + "step": 11323 + }, + { + "epoch": 0.6232593978755022, + "grad_norm": 0.6398881077766418, + "learning_rate": 7.813922124266908e-06, + "loss": 0.7559, + "step": 11324 + }, + { + "epoch": 0.6233144366778579, + "grad_norm": 0.8062521815299988, + "learning_rate": 7.813563808579804e-06, + "loss": 0.7863, + "step": 11325 + }, + { + "epoch": 0.6233694754802136, + "grad_norm": 0.7083317041397095, + "learning_rate": 7.813205471746708e-06, + "loss": 0.7358, + "step": 11326 + }, + { + "epoch": 0.6234245142825692, + "grad_norm": 0.6190419793128967, + "learning_rate": 7.812847113770312e-06, + "loss": 0.637, + "step": 11327 + }, + { + "epoch": 0.6234795530849249, + "grad_norm": 0.7036548256874084, + "learning_rate": 7.812488734653309e-06, + "loss": 0.8049, + "step": 11328 + }, + { + "epoch": 0.6235345918872806, + "grad_norm": 0.7952288389205933, + "learning_rate": 7.812130334398395e-06, + "loss": 0.781, + "step": 11329 + }, + { + "epoch": 0.6235896306896362, + "grad_norm": 0.7925593852996826, + "learning_rate": 7.811771913008262e-06, + "loss": 0.7913, + "step": 11330 + }, + { + "epoch": 0.6236446694919918, + "grad_norm": 0.7190900444984436, + "learning_rate": 7.811413470485604e-06, + "loss": 0.7464, + "step": 11331 + }, + { + "epoch": 0.6236997082943475, + "grad_norm": 0.6476338505744934, + "learning_rate": 7.811055006833114e-06, + "loss": 0.699, + "step": 11332 + }, + { + "epoch": 0.6237547470967032, + "grad_norm": 0.7412729263305664, + "learning_rate": 7.810696522053487e-06, + "loss": 0.7958, + "step": 11333 + }, + { + "epoch": 0.6238097858990589, + "grad_norm": 0.6646767854690552, + "learning_rate": 7.81033801614942e-06, + "loss": 0.6276, + "step": 11334 + }, + { + "epoch": 0.6238648247014145, + "grad_norm": 0.6912583112716675, + "learning_rate": 7.809979489123601e-06, + "loss": 0.7611, + "step": 11335 + }, + { + "epoch": 0.6239198635037702, + "grad_norm": 0.7324331998825073, + "learning_rate": 7.80962094097873e-06, + "loss": 0.7436, + "step": 11336 + }, + { + "epoch": 0.6239749023061258, + "grad_norm": 0.7046643495559692, + "learning_rate": 7.809262371717501e-06, + "loss": 0.7287, + "step": 11337 + }, + { + "epoch": 0.6240299411084815, + "grad_norm": 0.6013771891593933, + "learning_rate": 7.808903781342607e-06, + "loss": 0.6822, + "step": 11338 + }, + { + "epoch": 0.6240849799108371, + "grad_norm": 0.633074164390564, + "learning_rate": 7.808545169856745e-06, + "loss": 0.7758, + "step": 11339 + }, + { + "epoch": 0.6241400187131928, + "grad_norm": 0.6603411436080933, + "learning_rate": 7.808186537262608e-06, + "loss": 0.6797, + "step": 11340 + }, + { + "epoch": 0.6241950575155485, + "grad_norm": 0.8316327929496765, + "learning_rate": 7.807827883562894e-06, + "loss": 0.777, + "step": 11341 + }, + { + "epoch": 0.6242500963179041, + "grad_norm": 0.7954252362251282, + "learning_rate": 7.807469208760295e-06, + "loss": 0.6581, + "step": 11342 + }, + { + "epoch": 0.6243051351202598, + "grad_norm": 0.6108134984970093, + "learning_rate": 7.80711051285751e-06, + "loss": 0.7126, + "step": 11343 + }, + { + "epoch": 0.6243601739226154, + "grad_norm": 0.7224909067153931, + "learning_rate": 7.806751795857235e-06, + "loss": 0.8677, + "step": 11344 + }, + { + "epoch": 0.6244152127249711, + "grad_norm": 0.720923125743866, + "learning_rate": 7.806393057762165e-06, + "loss": 0.7174, + "step": 11345 + }, + { + "epoch": 0.6244702515273267, + "grad_norm": 0.6837444305419922, + "learning_rate": 7.806034298574993e-06, + "loss": 0.7431, + "step": 11346 + }, + { + "epoch": 0.6245252903296824, + "grad_norm": 0.8486534953117371, + "learning_rate": 7.80567551829842e-06, + "loss": 0.7955, + "step": 11347 + }, + { + "epoch": 0.6245803291320381, + "grad_norm": 0.6459395885467529, + "learning_rate": 7.805316716935143e-06, + "loss": 0.7681, + "step": 11348 + }, + { + "epoch": 0.6246353679343938, + "grad_norm": 0.8414636850357056, + "learning_rate": 7.804957894487854e-06, + "loss": 0.8985, + "step": 11349 + }, + { + "epoch": 0.6246904067367494, + "grad_norm": 0.7930828928947449, + "learning_rate": 7.804599050959254e-06, + "loss": 0.7389, + "step": 11350 + }, + { + "epoch": 0.624745445539105, + "grad_norm": 0.7102516889572144, + "learning_rate": 7.804240186352038e-06, + "loss": 0.8072, + "step": 11351 + }, + { + "epoch": 0.6248004843414607, + "grad_norm": 0.773341178894043, + "learning_rate": 7.803881300668901e-06, + "loss": 0.7531, + "step": 11352 + }, + { + "epoch": 0.6248555231438164, + "grad_norm": 0.6354981064796448, + "learning_rate": 7.803522393912544e-06, + "loss": 0.6761, + "step": 11353 + }, + { + "epoch": 0.624910561946172, + "grad_norm": 0.7833859324455261, + "learning_rate": 7.803163466085663e-06, + "loss": 0.7768, + "step": 11354 + }, + { + "epoch": 0.6249656007485277, + "grad_norm": 0.6982376575469971, + "learning_rate": 7.802804517190957e-06, + "loss": 0.7472, + "step": 11355 + }, + { + "epoch": 0.6250206395508834, + "grad_norm": 0.7214694023132324, + "learning_rate": 7.80244554723112e-06, + "loss": 0.7919, + "step": 11356 + }, + { + "epoch": 0.6250756783532391, + "grad_norm": 0.8002933859825134, + "learning_rate": 7.802086556208855e-06, + "loss": 0.8278, + "step": 11357 + }, + { + "epoch": 0.6251307171555947, + "grad_norm": 0.7619680762290955, + "learning_rate": 7.801727544126858e-06, + "loss": 0.7775, + "step": 11358 + }, + { + "epoch": 0.6251857559579503, + "grad_norm": 0.6340392827987671, + "learning_rate": 7.801368510987825e-06, + "loss": 0.7324, + "step": 11359 + }, + { + "epoch": 0.625240794760306, + "grad_norm": 0.6754844784736633, + "learning_rate": 7.801009456794457e-06, + "loss": 0.7296, + "step": 11360 + }, + { + "epoch": 0.6252958335626617, + "grad_norm": 0.6871771216392517, + "learning_rate": 7.80065038154945e-06, + "loss": 0.7398, + "step": 11361 + }, + { + "epoch": 0.6253508723650173, + "grad_norm": 0.6610772013664246, + "learning_rate": 7.800291285255505e-06, + "loss": 0.738, + "step": 11362 + }, + { + "epoch": 0.625405911167373, + "grad_norm": 0.6858081221580505, + "learning_rate": 7.799932167915322e-06, + "loss": 0.7353, + "step": 11363 + }, + { + "epoch": 0.6254609499697287, + "grad_norm": 0.6698840856552124, + "learning_rate": 7.799573029531597e-06, + "loss": 0.7505, + "step": 11364 + }, + { + "epoch": 0.6255159887720844, + "grad_norm": 0.7374000549316406, + "learning_rate": 7.799213870107031e-06, + "loss": 0.7974, + "step": 11365 + }, + { + "epoch": 0.6255710275744399, + "grad_norm": 0.6962621808052063, + "learning_rate": 7.798854689644324e-06, + "loss": 0.8183, + "step": 11366 + }, + { + "epoch": 0.6256260663767956, + "grad_norm": 0.8477681279182434, + "learning_rate": 7.798495488146173e-06, + "loss": 0.7533, + "step": 11367 + }, + { + "epoch": 0.6256811051791513, + "grad_norm": 0.6963459253311157, + "learning_rate": 7.798136265615278e-06, + "loss": 0.6362, + "step": 11368 + }, + { + "epoch": 0.625736143981507, + "grad_norm": 0.7125601172447205, + "learning_rate": 7.79777702205434e-06, + "loss": 0.7296, + "step": 11369 + }, + { + "epoch": 0.6257911827838626, + "grad_norm": 0.6650554537773132, + "learning_rate": 7.79741775746606e-06, + "loss": 0.8231, + "step": 11370 + }, + { + "epoch": 0.6258462215862183, + "grad_norm": 0.6556620597839355, + "learning_rate": 7.797058471853138e-06, + "loss": 0.6952, + "step": 11371 + }, + { + "epoch": 0.625901260388574, + "grad_norm": 0.6350956559181213, + "learning_rate": 7.79669916521827e-06, + "loss": 0.686, + "step": 11372 + }, + { + "epoch": 0.6259562991909297, + "grad_norm": 0.6346702575683594, + "learning_rate": 7.796339837564163e-06, + "loss": 0.7234, + "step": 11373 + }, + { + "epoch": 0.6260113379932852, + "grad_norm": 0.741437554359436, + "learning_rate": 7.795980488893514e-06, + "loss": 0.8096, + "step": 11374 + }, + { + "epoch": 0.6260663767956409, + "grad_norm": 0.7057582139968872, + "learning_rate": 7.795621119209021e-06, + "loss": 0.8022, + "step": 11375 + }, + { + "epoch": 0.6261214155979966, + "grad_norm": 0.658107578754425, + "learning_rate": 7.79526172851339e-06, + "loss": 0.7564, + "step": 11376 + }, + { + "epoch": 0.6261764544003523, + "grad_norm": 0.7974086403846741, + "learning_rate": 7.79490231680932e-06, + "loss": 0.7721, + "step": 11377 + }, + { + "epoch": 0.6262314932027079, + "grad_norm": 0.6669130921363831, + "learning_rate": 7.794542884099513e-06, + "loss": 0.7652, + "step": 11378 + }, + { + "epoch": 0.6262865320050636, + "grad_norm": 0.7364919185638428, + "learning_rate": 7.794183430386669e-06, + "loss": 0.8679, + "step": 11379 + }, + { + "epoch": 0.6263415708074193, + "grad_norm": 0.7383667230606079, + "learning_rate": 7.793823955673489e-06, + "loss": 0.7715, + "step": 11380 + }, + { + "epoch": 0.626396609609775, + "grad_norm": 0.6688774228096008, + "learning_rate": 7.793464459962679e-06, + "loss": 0.7503, + "step": 11381 + }, + { + "epoch": 0.6264516484121305, + "grad_norm": 0.6771709322929382, + "learning_rate": 7.793104943256935e-06, + "loss": 0.7479, + "step": 11382 + }, + { + "epoch": 0.6265066872144862, + "grad_norm": 0.7121349573135376, + "learning_rate": 7.792745405558964e-06, + "loss": 0.7655, + "step": 11383 + }, + { + "epoch": 0.6265617260168419, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.792385846871465e-06, + "loss": 0.7418, + "step": 11384 + }, + { + "epoch": 0.6266167648191975, + "grad_norm": 0.6701569557189941, + "learning_rate": 7.792026267197142e-06, + "loss": 0.7669, + "step": 11385 + }, + { + "epoch": 0.6266718036215532, + "grad_norm": 0.6890652179718018, + "learning_rate": 7.791666666538697e-06, + "loss": 0.7659, + "step": 11386 + }, + { + "epoch": 0.6267268424239089, + "grad_norm": 0.7636297345161438, + "learning_rate": 7.791307044898833e-06, + "loss": 0.7272, + "step": 11387 + }, + { + "epoch": 0.6267818812262645, + "grad_norm": 0.6563602089881897, + "learning_rate": 7.790947402280252e-06, + "loss": 0.7603, + "step": 11388 + }, + { + "epoch": 0.6268369200286201, + "grad_norm": 0.7252678275108337, + "learning_rate": 7.790587738685655e-06, + "loss": 0.7789, + "step": 11389 + }, + { + "epoch": 0.6268919588309758, + "grad_norm": 0.6703618764877319, + "learning_rate": 7.79022805411775e-06, + "loss": 0.6883, + "step": 11390 + }, + { + "epoch": 0.6269469976333315, + "grad_norm": 0.7165848612785339, + "learning_rate": 7.789868348579239e-06, + "loss": 0.7944, + "step": 11391 + }, + { + "epoch": 0.6270020364356872, + "grad_norm": 0.9325329065322876, + "learning_rate": 7.789508622072822e-06, + "loss": 0.9059, + "step": 11392 + }, + { + "epoch": 0.6270570752380428, + "grad_norm": 0.6875555515289307, + "learning_rate": 7.789148874601204e-06, + "loss": 0.7115, + "step": 11393 + }, + { + "epoch": 0.6271121140403985, + "grad_norm": 0.6470181941986084, + "learning_rate": 7.788789106167093e-06, + "loss": 0.7603, + "step": 11394 + }, + { + "epoch": 0.6271671528427541, + "grad_norm": 0.688685417175293, + "learning_rate": 7.788429316773188e-06, + "loss": 0.8397, + "step": 11395 + }, + { + "epoch": 0.6272221916451098, + "grad_norm": 0.6299887895584106, + "learning_rate": 7.788069506422193e-06, + "loss": 0.7026, + "step": 11396 + }, + { + "epoch": 0.6272772304474654, + "grad_norm": 0.8046191930770874, + "learning_rate": 7.787709675116817e-06, + "loss": 0.8573, + "step": 11397 + }, + { + "epoch": 0.6273322692498211, + "grad_norm": 0.6700685620307922, + "learning_rate": 7.78734982285976e-06, + "loss": 0.7225, + "step": 11398 + }, + { + "epoch": 0.6273873080521768, + "grad_norm": 0.6968538761138916, + "learning_rate": 7.786989949653726e-06, + "loss": 0.6571, + "step": 11399 + }, + { + "epoch": 0.6274423468545325, + "grad_norm": 0.6857314705848694, + "learning_rate": 7.786630055501425e-06, + "loss": 0.8131, + "step": 11400 + }, + { + "epoch": 0.6274973856568881, + "grad_norm": 0.702316403388977, + "learning_rate": 7.786270140405557e-06, + "loss": 0.7222, + "step": 11401 + }, + { + "epoch": 0.6275524244592438, + "grad_norm": 0.6987283825874329, + "learning_rate": 7.785910204368827e-06, + "loss": 0.7171, + "step": 11402 + }, + { + "epoch": 0.6276074632615994, + "grad_norm": 0.6835529208183289, + "learning_rate": 7.785550247393943e-06, + "loss": 0.8077, + "step": 11403 + }, + { + "epoch": 0.6276625020639551, + "grad_norm": 0.6423392295837402, + "learning_rate": 7.785190269483609e-06, + "loss": 0.6689, + "step": 11404 + }, + { + "epoch": 0.6277175408663107, + "grad_norm": 0.6995517611503601, + "learning_rate": 7.78483027064053e-06, + "loss": 0.7417, + "step": 11405 + }, + { + "epoch": 0.6277725796686664, + "grad_norm": 0.6639729142189026, + "learning_rate": 7.784470250867413e-06, + "loss": 0.6521, + "step": 11406 + }, + { + "epoch": 0.6278276184710221, + "grad_norm": 0.7280262112617493, + "learning_rate": 7.784110210166961e-06, + "loss": 0.7686, + "step": 11407 + }, + { + "epoch": 0.6278826572733778, + "grad_norm": 0.6741863489151001, + "learning_rate": 7.783750148541884e-06, + "loss": 0.7794, + "step": 11408 + }, + { + "epoch": 0.6279376960757334, + "grad_norm": 0.8160151243209839, + "learning_rate": 7.783390065994885e-06, + "loss": 0.7065, + "step": 11409 + }, + { + "epoch": 0.627992734878089, + "grad_norm": 0.7288973927497864, + "learning_rate": 7.783029962528672e-06, + "loss": 0.8337, + "step": 11410 + }, + { + "epoch": 0.6280477736804447, + "grad_norm": 0.7764643430709839, + "learning_rate": 7.782669838145952e-06, + "loss": 0.8812, + "step": 11411 + }, + { + "epoch": 0.6281028124828004, + "grad_norm": 0.8145303130149841, + "learning_rate": 7.782309692849425e-06, + "loss": 0.9206, + "step": 11412 + }, + { + "epoch": 0.628157851285156, + "grad_norm": 0.6883288621902466, + "learning_rate": 7.781949526641808e-06, + "loss": 0.7779, + "step": 11413 + }, + { + "epoch": 0.6282128900875117, + "grad_norm": 0.7281043529510498, + "learning_rate": 7.781589339525803e-06, + "loss": 0.7933, + "step": 11414 + }, + { + "epoch": 0.6282679288898674, + "grad_norm": 0.7998347878456116, + "learning_rate": 7.781229131504115e-06, + "loss": 0.8772, + "step": 11415 + }, + { + "epoch": 0.6283229676922231, + "grad_norm": 0.7591177225112915, + "learning_rate": 7.780868902579455e-06, + "loss": 0.9054, + "step": 11416 + }, + { + "epoch": 0.6283780064945786, + "grad_norm": 0.7209650278091431, + "learning_rate": 7.780508652754528e-06, + "loss": 0.7781, + "step": 11417 + }, + { + "epoch": 0.6284330452969343, + "grad_norm": 1.2373511791229248, + "learning_rate": 7.780148382032042e-06, + "loss": 0.7501, + "step": 11418 + }, + { + "epoch": 0.62848808409929, + "grad_norm": 0.6281551122665405, + "learning_rate": 7.779788090414704e-06, + "loss": 0.8122, + "step": 11419 + }, + { + "epoch": 0.6285431229016457, + "grad_norm": 0.6954115629196167, + "learning_rate": 7.779427777905224e-06, + "loss": 0.7815, + "step": 11420 + }, + { + "epoch": 0.6285981617040013, + "grad_norm": 0.727043628692627, + "learning_rate": 7.77906744450631e-06, + "loss": 0.7116, + "step": 11421 + }, + { + "epoch": 0.628653200506357, + "grad_norm": 0.6979809403419495, + "learning_rate": 7.778707090220667e-06, + "loss": 0.7707, + "step": 11422 + }, + { + "epoch": 0.6287082393087127, + "grad_norm": 0.6851169466972351, + "learning_rate": 7.778346715051006e-06, + "loss": 0.811, + "step": 11423 + }, + { + "epoch": 0.6287632781110684, + "grad_norm": 0.70259028673172, + "learning_rate": 7.777986319000036e-06, + "loss": 0.7766, + "step": 11424 + }, + { + "epoch": 0.6288183169134239, + "grad_norm": 0.7436364889144897, + "learning_rate": 7.777625902070463e-06, + "loss": 0.8449, + "step": 11425 + }, + { + "epoch": 0.6288733557157796, + "grad_norm": 0.6452080607414246, + "learning_rate": 7.777265464264998e-06, + "loss": 0.7138, + "step": 11426 + }, + { + "epoch": 0.6289283945181353, + "grad_norm": 0.6329460144042969, + "learning_rate": 7.776905005586349e-06, + "loss": 0.6482, + "step": 11427 + }, + { + "epoch": 0.6289834333204909, + "grad_norm": 0.7521186470985413, + "learning_rate": 7.776544526037225e-06, + "loss": 0.751, + "step": 11428 + }, + { + "epoch": 0.6290384721228466, + "grad_norm": 0.7105319499969482, + "learning_rate": 7.776184025620334e-06, + "loss": 0.843, + "step": 11429 + }, + { + "epoch": 0.6290935109252023, + "grad_norm": 0.7329964637756348, + "learning_rate": 7.77582350433839e-06, + "loss": 0.6992, + "step": 11430 + }, + { + "epoch": 0.629148549727558, + "grad_norm": 0.7492092847824097, + "learning_rate": 7.775462962194098e-06, + "loss": 0.7579, + "step": 11431 + }, + { + "epoch": 0.6292035885299135, + "grad_norm": 0.7332866191864014, + "learning_rate": 7.77510239919017e-06, + "loss": 0.7758, + "step": 11432 + }, + { + "epoch": 0.6292586273322692, + "grad_norm": 0.7532867193222046, + "learning_rate": 7.774741815329315e-06, + "loss": 0.8157, + "step": 11433 + }, + { + "epoch": 0.6293136661346249, + "grad_norm": 0.7498316168785095, + "learning_rate": 7.774381210614244e-06, + "loss": 0.7671, + "step": 11434 + }, + { + "epoch": 0.6293687049369806, + "grad_norm": 0.8017444610595703, + "learning_rate": 7.774020585047666e-06, + "loss": 0.6989, + "step": 11435 + }, + { + "epoch": 0.6294237437393362, + "grad_norm": 0.7827737927436829, + "learning_rate": 7.77365993863229e-06, + "loss": 0.852, + "step": 11436 + }, + { + "epoch": 0.6294787825416919, + "grad_norm": 1.1411668062210083, + "learning_rate": 7.77329927137083e-06, + "loss": 0.9303, + "step": 11437 + }, + { + "epoch": 0.6295338213440476, + "grad_norm": 1.2931067943572998, + "learning_rate": 7.772938583265995e-06, + "loss": 0.8913, + "step": 11438 + }, + { + "epoch": 0.6295888601464033, + "grad_norm": 0.7407616376876831, + "learning_rate": 7.772577874320494e-06, + "loss": 0.9247, + "step": 11439 + }, + { + "epoch": 0.6296438989487588, + "grad_norm": 0.6544716954231262, + "learning_rate": 7.772217144537043e-06, + "loss": 0.7879, + "step": 11440 + }, + { + "epoch": 0.6296989377511145, + "grad_norm": 0.7467932105064392, + "learning_rate": 7.77185639391835e-06, + "loss": 0.7624, + "step": 11441 + }, + { + "epoch": 0.6297539765534702, + "grad_norm": 0.6845136880874634, + "learning_rate": 7.771495622467123e-06, + "loss": 0.691, + "step": 11442 + }, + { + "epoch": 0.6298090153558259, + "grad_norm": 0.7881575226783752, + "learning_rate": 7.771134830186079e-06, + "loss": 0.7567, + "step": 11443 + }, + { + "epoch": 0.6298640541581815, + "grad_norm": 0.6910528540611267, + "learning_rate": 7.770774017077928e-06, + "loss": 0.7527, + "step": 11444 + }, + { + "epoch": 0.6299190929605372, + "grad_norm": 0.7395550608634949, + "learning_rate": 7.770413183145379e-06, + "loss": 0.8288, + "step": 11445 + }, + { + "epoch": 0.6299741317628929, + "grad_norm": 0.6876364350318909, + "learning_rate": 7.770052328391147e-06, + "loss": 0.7759, + "step": 11446 + }, + { + "epoch": 0.6300291705652485, + "grad_norm": 0.7936999201774597, + "learning_rate": 7.769691452817945e-06, + "loss": 0.6885, + "step": 11447 + }, + { + "epoch": 0.6300842093676041, + "grad_norm": 0.721479058265686, + "learning_rate": 7.769330556428482e-06, + "loss": 0.7215, + "step": 11448 + }, + { + "epoch": 0.6301392481699598, + "grad_norm": 0.6549312472343445, + "learning_rate": 7.76896963922547e-06, + "loss": 0.7523, + "step": 11449 + }, + { + "epoch": 0.6301942869723155, + "grad_norm": 0.6684648394584656, + "learning_rate": 7.768608701211627e-06, + "loss": 0.768, + "step": 11450 + }, + { + "epoch": 0.6302493257746712, + "grad_norm": 0.7014286518096924, + "learning_rate": 7.76824774238966e-06, + "loss": 0.7534, + "step": 11451 + }, + { + "epoch": 0.6303043645770268, + "grad_norm": 0.9186445474624634, + "learning_rate": 7.767886762762284e-06, + "loss": 0.8398, + "step": 11452 + }, + { + "epoch": 0.6303594033793825, + "grad_norm": 0.787187933921814, + "learning_rate": 7.76752576233221e-06, + "loss": 0.8035, + "step": 11453 + }, + { + "epoch": 0.6304144421817381, + "grad_norm": 0.7471121549606323, + "learning_rate": 7.767164741102157e-06, + "loss": 0.7983, + "step": 11454 + }, + { + "epoch": 0.6304694809840938, + "grad_norm": 0.6810591816902161, + "learning_rate": 7.766803699074834e-06, + "loss": 0.7132, + "step": 11455 + }, + { + "epoch": 0.6305245197864494, + "grad_norm": 0.7154163122177124, + "learning_rate": 7.766442636252953e-06, + "loss": 0.7942, + "step": 11456 + }, + { + "epoch": 0.6305795585888051, + "grad_norm": 0.6990880966186523, + "learning_rate": 7.766081552639231e-06, + "loss": 0.7296, + "step": 11457 + }, + { + "epoch": 0.6306345973911608, + "grad_norm": 0.8848066926002502, + "learning_rate": 7.76572044823638e-06, + "loss": 0.621, + "step": 11458 + }, + { + "epoch": 0.6306896361935165, + "grad_norm": 0.6929910182952881, + "learning_rate": 7.765359323047116e-06, + "loss": 0.5917, + "step": 11459 + }, + { + "epoch": 0.6307446749958721, + "grad_norm": 0.6874505281448364, + "learning_rate": 7.764998177074149e-06, + "loss": 0.7244, + "step": 11460 + }, + { + "epoch": 0.6307997137982277, + "grad_norm": 0.6823066473007202, + "learning_rate": 7.764637010320197e-06, + "loss": 0.7299, + "step": 11461 + }, + { + "epoch": 0.6308547526005834, + "grad_norm": 0.7315061688423157, + "learning_rate": 7.764275822787972e-06, + "loss": 0.7759, + "step": 11462 + }, + { + "epoch": 0.6309097914029391, + "grad_norm": 0.6186662316322327, + "learning_rate": 7.763914614480192e-06, + "loss": 0.6746, + "step": 11463 + }, + { + "epoch": 0.6309648302052947, + "grad_norm": 0.6751530170440674, + "learning_rate": 7.763553385399569e-06, + "loss": 0.8371, + "step": 11464 + }, + { + "epoch": 0.6310198690076504, + "grad_norm": 1.0283396244049072, + "learning_rate": 7.763192135548818e-06, + "loss": 0.7743, + "step": 11465 + }, + { + "epoch": 0.6310749078100061, + "grad_norm": 0.7695029973983765, + "learning_rate": 7.762830864930655e-06, + "loss": 0.7387, + "step": 11466 + }, + { + "epoch": 0.6311299466123618, + "grad_norm": 0.8087024688720703, + "learning_rate": 7.762469573547795e-06, + "loss": 0.8357, + "step": 11467 + }, + { + "epoch": 0.6311849854147173, + "grad_norm": 0.9203382134437561, + "learning_rate": 7.762108261402951e-06, + "loss": 0.8191, + "step": 11468 + }, + { + "epoch": 0.631240024217073, + "grad_norm": 0.6569168567657471, + "learning_rate": 7.761746928498843e-06, + "loss": 0.7035, + "step": 11469 + }, + { + "epoch": 0.6312950630194287, + "grad_norm": 0.7903677225112915, + "learning_rate": 7.761385574838183e-06, + "loss": 0.8295, + "step": 11470 + }, + { + "epoch": 0.6313501018217843, + "grad_norm": 0.6780279278755188, + "learning_rate": 7.76102420042369e-06, + "loss": 0.6497, + "step": 11471 + }, + { + "epoch": 0.63140514062414, + "grad_norm": 0.7150516510009766, + "learning_rate": 7.760662805258076e-06, + "loss": 0.7979, + "step": 11472 + }, + { + "epoch": 0.6314601794264957, + "grad_norm": 0.7278215885162354, + "learning_rate": 7.760301389344061e-06, + "loss": 0.8503, + "step": 11473 + }, + { + "epoch": 0.6315152182288514, + "grad_norm": 0.8695063591003418, + "learning_rate": 7.75993995268436e-06, + "loss": 0.7796, + "step": 11474 + }, + { + "epoch": 0.631570257031207, + "grad_norm": 0.7154332399368286, + "learning_rate": 7.759578495281688e-06, + "loss": 0.725, + "step": 11475 + }, + { + "epoch": 0.6316252958335626, + "grad_norm": 0.7151778936386108, + "learning_rate": 7.759217017138763e-06, + "loss": 0.6932, + "step": 11476 + }, + { + "epoch": 0.6316803346359183, + "grad_norm": 0.6328319311141968, + "learning_rate": 7.758855518258301e-06, + "loss": 0.7382, + "step": 11477 + }, + { + "epoch": 0.631735373438274, + "grad_norm": 0.8377438187599182, + "learning_rate": 7.75849399864302e-06, + "loss": 0.7782, + "step": 11478 + }, + { + "epoch": 0.6317904122406296, + "grad_norm": 0.6654751896858215, + "learning_rate": 7.758132458295637e-06, + "loss": 0.8076, + "step": 11479 + }, + { + "epoch": 0.6318454510429853, + "grad_norm": 0.6841873526573181, + "learning_rate": 7.757770897218869e-06, + "loss": 0.7195, + "step": 11480 + }, + { + "epoch": 0.631900489845341, + "grad_norm": 0.7791223526000977, + "learning_rate": 7.757409315415431e-06, + "loss": 0.7858, + "step": 11481 + }, + { + "epoch": 0.6319555286476967, + "grad_norm": 0.6412019729614258, + "learning_rate": 7.757047712888044e-06, + "loss": 0.6853, + "step": 11482 + }, + { + "epoch": 0.6320105674500522, + "grad_norm": 0.7058777213096619, + "learning_rate": 7.756686089639425e-06, + "loss": 0.8955, + "step": 11483 + }, + { + "epoch": 0.6320656062524079, + "grad_norm": 0.6950271725654602, + "learning_rate": 7.75632444567229e-06, + "loss": 0.7213, + "step": 11484 + }, + { + "epoch": 0.6321206450547636, + "grad_norm": 0.6938642859458923, + "learning_rate": 7.755962780989359e-06, + "loss": 0.749, + "step": 11485 + }, + { + "epoch": 0.6321756838571193, + "grad_norm": 4.447030544281006, + "learning_rate": 7.755601095593348e-06, + "loss": 0.7603, + "step": 11486 + }, + { + "epoch": 0.6322307226594749, + "grad_norm": 0.6693708896636963, + "learning_rate": 7.755239389486979e-06, + "loss": 0.769, + "step": 11487 + }, + { + "epoch": 0.6322857614618306, + "grad_norm": 0.830352246761322, + "learning_rate": 7.754877662672968e-06, + "loss": 0.8069, + "step": 11488 + }, + { + "epoch": 0.6323408002641863, + "grad_norm": 0.7211840748786926, + "learning_rate": 7.754515915154033e-06, + "loss": 0.7972, + "step": 11489 + }, + { + "epoch": 0.632395839066542, + "grad_norm": 0.723101019859314, + "learning_rate": 7.754154146932893e-06, + "loss": 0.7385, + "step": 11490 + }, + { + "epoch": 0.6324508778688975, + "grad_norm": 0.6515377759933472, + "learning_rate": 7.75379235801227e-06, + "loss": 0.7527, + "step": 11491 + }, + { + "epoch": 0.6325059166712532, + "grad_norm": 0.6296554803848267, + "learning_rate": 7.75343054839488e-06, + "loss": 0.7135, + "step": 11492 + }, + { + "epoch": 0.6325609554736089, + "grad_norm": 0.8153911232948303, + "learning_rate": 7.753068718083441e-06, + "loss": 0.7298, + "step": 11493 + }, + { + "epoch": 0.6326159942759646, + "grad_norm": 0.6735014915466309, + "learning_rate": 7.752706867080676e-06, + "loss": 0.6851, + "step": 11494 + }, + { + "epoch": 0.6326710330783202, + "grad_norm": 0.7077293992042542, + "learning_rate": 7.752344995389303e-06, + "loss": 0.7806, + "step": 11495 + }, + { + "epoch": 0.6327260718806759, + "grad_norm": 0.6928272843360901, + "learning_rate": 7.751983103012042e-06, + "loss": 0.7538, + "step": 11496 + }, + { + "epoch": 0.6327811106830316, + "grad_norm": 0.7058837413787842, + "learning_rate": 7.751621189951612e-06, + "loss": 0.7065, + "step": 11497 + }, + { + "epoch": 0.6328361494853872, + "grad_norm": 0.7272600531578064, + "learning_rate": 7.751259256210735e-06, + "loss": 0.7468, + "step": 11498 + }, + { + "epoch": 0.6328911882877428, + "grad_norm": 0.6175968050956726, + "learning_rate": 7.75089730179213e-06, + "loss": 0.7195, + "step": 11499 + }, + { + "epoch": 0.6329462270900985, + "grad_norm": 0.6567386984825134, + "learning_rate": 7.750535326698514e-06, + "loss": 0.8147, + "step": 11500 + }, + { + "epoch": 0.6330012658924542, + "grad_norm": 0.6325315237045288, + "learning_rate": 7.750173330932613e-06, + "loss": 0.7087, + "step": 11501 + }, + { + "epoch": 0.6330563046948099, + "grad_norm": 0.8607509732246399, + "learning_rate": 7.749811314497147e-06, + "loss": 0.8009, + "step": 11502 + }, + { + "epoch": 0.6331113434971655, + "grad_norm": 0.7452824711799622, + "learning_rate": 7.749449277394833e-06, + "loss": 0.7497, + "step": 11503 + }, + { + "epoch": 0.6331663822995212, + "grad_norm": 0.7371357679367065, + "learning_rate": 7.749087219628395e-06, + "loss": 0.8936, + "step": 11504 + }, + { + "epoch": 0.6332214211018768, + "grad_norm": 0.7177306413650513, + "learning_rate": 7.748725141200552e-06, + "loss": 0.8327, + "step": 11505 + }, + { + "epoch": 0.6332764599042325, + "grad_norm": 0.5938527584075928, + "learning_rate": 7.748363042114028e-06, + "loss": 0.6471, + "step": 11506 + }, + { + "epoch": 0.6333314987065881, + "grad_norm": 0.8827341198921204, + "learning_rate": 7.748000922371543e-06, + "loss": 0.7247, + "step": 11507 + }, + { + "epoch": 0.6333865375089438, + "grad_norm": 0.7008641958236694, + "learning_rate": 7.747638781975818e-06, + "loss": 0.684, + "step": 11508 + }, + { + "epoch": 0.6334415763112995, + "grad_norm": 0.7752355337142944, + "learning_rate": 7.747276620929576e-06, + "loss": 0.7993, + "step": 11509 + }, + { + "epoch": 0.6334966151136552, + "grad_norm": 0.6928088068962097, + "learning_rate": 7.74691443923554e-06, + "loss": 0.7213, + "step": 11510 + }, + { + "epoch": 0.6335516539160108, + "grad_norm": 0.8197296261787415, + "learning_rate": 7.746552236896428e-06, + "loss": 0.847, + "step": 11511 + }, + { + "epoch": 0.6336066927183664, + "grad_norm": 0.7912493348121643, + "learning_rate": 7.746190013914966e-06, + "loss": 0.8217, + "step": 11512 + }, + { + "epoch": 0.6336617315207221, + "grad_norm": 0.7726556062698364, + "learning_rate": 7.745827770293871e-06, + "loss": 0.7626, + "step": 11513 + }, + { + "epoch": 0.6337167703230777, + "grad_norm": 0.668569028377533, + "learning_rate": 7.745465506035873e-06, + "loss": 0.7141, + "step": 11514 + }, + { + "epoch": 0.6337718091254334, + "grad_norm": 0.7226139903068542, + "learning_rate": 7.745103221143694e-06, + "loss": 0.7262, + "step": 11515 + }, + { + "epoch": 0.6338268479277891, + "grad_norm": 0.7315354943275452, + "learning_rate": 7.744740915620051e-06, + "loss": 0.7955, + "step": 11516 + }, + { + "epoch": 0.6338818867301448, + "grad_norm": 0.6815279126167297, + "learning_rate": 7.744378589467668e-06, + "loss": 0.7347, + "step": 11517 + }, + { + "epoch": 0.6339369255325004, + "grad_norm": 0.6931445598602295, + "learning_rate": 7.744016242689272e-06, + "loss": 0.7959, + "step": 11518 + }, + { + "epoch": 0.633991964334856, + "grad_norm": 0.7156991362571716, + "learning_rate": 7.743653875287584e-06, + "loss": 0.7793, + "step": 11519 + }, + { + "epoch": 0.6340470031372117, + "grad_norm": 0.8503926396369934, + "learning_rate": 7.74329148726533e-06, + "loss": 0.823, + "step": 11520 + }, + { + "epoch": 0.6341020419395674, + "grad_norm": 0.6280057430267334, + "learning_rate": 7.742929078625228e-06, + "loss": 0.6729, + "step": 11521 + }, + { + "epoch": 0.634157080741923, + "grad_norm": 0.7004517316818237, + "learning_rate": 7.742566649370008e-06, + "loss": 0.7578, + "step": 11522 + }, + { + "epoch": 0.6342121195442787, + "grad_norm": 0.7147908210754395, + "learning_rate": 7.74220419950239e-06, + "loss": 0.7705, + "step": 11523 + }, + { + "epoch": 0.6342671583466344, + "grad_norm": 0.7191137671470642, + "learning_rate": 7.7418417290251e-06, + "loss": 0.789, + "step": 11524 + }, + { + "epoch": 0.6343221971489901, + "grad_norm": 0.7288943529129028, + "learning_rate": 7.741479237940862e-06, + "loss": 0.8204, + "step": 11525 + }, + { + "epoch": 0.6343772359513457, + "grad_norm": 0.714821994304657, + "learning_rate": 7.741116726252398e-06, + "loss": 0.8252, + "step": 11526 + }, + { + "epoch": 0.6344322747537013, + "grad_norm": 0.6869103312492371, + "learning_rate": 7.740754193962435e-06, + "loss": 0.8136, + "step": 11527 + }, + { + "epoch": 0.634487313556057, + "grad_norm": 0.6629248857498169, + "learning_rate": 7.740391641073698e-06, + "loss": 0.7049, + "step": 11528 + }, + { + "epoch": 0.6345423523584127, + "grad_norm": 0.7078685164451599, + "learning_rate": 7.74002906758891e-06, + "loss": 0.7345, + "step": 11529 + }, + { + "epoch": 0.6345973911607683, + "grad_norm": 0.7748367190361023, + "learning_rate": 7.739666473510798e-06, + "loss": 0.7085, + "step": 11530 + }, + { + "epoch": 0.634652429963124, + "grad_norm": 0.6661930084228516, + "learning_rate": 7.739303858842086e-06, + "loss": 0.7795, + "step": 11531 + }, + { + "epoch": 0.6347074687654797, + "grad_norm": 0.6847965121269226, + "learning_rate": 7.738941223585499e-06, + "loss": 0.797, + "step": 11532 + }, + { + "epoch": 0.6347625075678354, + "grad_norm": 0.695184051990509, + "learning_rate": 7.738578567743762e-06, + "loss": 0.8184, + "step": 11533 + }, + { + "epoch": 0.6348175463701909, + "grad_norm": 0.6620088815689087, + "learning_rate": 7.738215891319603e-06, + "loss": 0.721, + "step": 11534 + }, + { + "epoch": 0.6348725851725466, + "grad_norm": 0.6802023649215698, + "learning_rate": 7.737853194315745e-06, + "loss": 0.9207, + "step": 11535 + }, + { + "epoch": 0.6349276239749023, + "grad_norm": 1.0193618535995483, + "learning_rate": 7.737490476734916e-06, + "loss": 0.8495, + "step": 11536 + }, + { + "epoch": 0.634982662777258, + "grad_norm": 0.6578189730644226, + "learning_rate": 7.737127738579841e-06, + "loss": 0.7455, + "step": 11537 + }, + { + "epoch": 0.6350377015796136, + "grad_norm": 0.70018470287323, + "learning_rate": 7.736764979853248e-06, + "loss": 0.7414, + "step": 11538 + }, + { + "epoch": 0.6350927403819693, + "grad_norm": 0.8136304616928101, + "learning_rate": 7.736402200557862e-06, + "loss": 0.7327, + "step": 11539 + }, + { + "epoch": 0.635147779184325, + "grad_norm": 0.7805309295654297, + "learning_rate": 7.736039400696408e-06, + "loss": 0.7659, + "step": 11540 + }, + { + "epoch": 0.6352028179866807, + "grad_norm": 0.675215482711792, + "learning_rate": 7.735676580271615e-06, + "loss": 0.7532, + "step": 11541 + }, + { + "epoch": 0.6352578567890362, + "grad_norm": 0.6873239874839783, + "learning_rate": 7.735313739286208e-06, + "loss": 0.8123, + "step": 11542 + }, + { + "epoch": 0.6353128955913919, + "grad_norm": 0.6624773144721985, + "learning_rate": 7.734950877742917e-06, + "loss": 0.7642, + "step": 11543 + }, + { + "epoch": 0.6353679343937476, + "grad_norm": 0.8047438859939575, + "learning_rate": 7.734587995644468e-06, + "loss": 0.7452, + "step": 11544 + }, + { + "epoch": 0.6354229731961033, + "grad_norm": 0.7449815273284912, + "learning_rate": 7.734225092993585e-06, + "loss": 0.7756, + "step": 11545 + }, + { + "epoch": 0.6354780119984589, + "grad_norm": 0.693081259727478, + "learning_rate": 7.733862169792999e-06, + "loss": 0.7029, + "step": 11546 + }, + { + "epoch": 0.6355330508008146, + "grad_norm": 0.6593700051307678, + "learning_rate": 7.733499226045437e-06, + "loss": 0.6009, + "step": 11547 + }, + { + "epoch": 0.6355880896031703, + "grad_norm": 0.7402041554450989, + "learning_rate": 7.733136261753627e-06, + "loss": 0.6921, + "step": 11548 + }, + { + "epoch": 0.635643128405526, + "grad_norm": 0.7686228156089783, + "learning_rate": 7.732773276920294e-06, + "loss": 0.855, + "step": 11549 + }, + { + "epoch": 0.6356981672078815, + "grad_norm": 0.6776669025421143, + "learning_rate": 7.732410271548171e-06, + "loss": 0.7146, + "step": 11550 + }, + { + "epoch": 0.6357532060102372, + "grad_norm": 0.6055952906608582, + "learning_rate": 7.732047245639983e-06, + "loss": 0.6926, + "step": 11551 + }, + { + "epoch": 0.6358082448125929, + "grad_norm": 0.7452635765075684, + "learning_rate": 7.731684199198461e-06, + "loss": 0.7766, + "step": 11552 + }, + { + "epoch": 0.6358632836149486, + "grad_norm": 0.7482720017433167, + "learning_rate": 7.73132113222633e-06, + "loss": 0.7725, + "step": 11553 + }, + { + "epoch": 0.6359183224173042, + "grad_norm": 0.6534025073051453, + "learning_rate": 7.73095804472632e-06, + "loss": 0.7902, + "step": 11554 + }, + { + "epoch": 0.6359733612196599, + "grad_norm": 0.7364560961723328, + "learning_rate": 7.730594936701162e-06, + "loss": 0.7998, + "step": 11555 + }, + { + "epoch": 0.6360284000220155, + "grad_norm": 0.6881458163261414, + "learning_rate": 7.730231808153582e-06, + "loss": 0.7586, + "step": 11556 + }, + { + "epoch": 0.6360834388243711, + "grad_norm": 0.6574262976646423, + "learning_rate": 7.72986865908631e-06, + "loss": 0.6999, + "step": 11557 + }, + { + "epoch": 0.6361384776267268, + "grad_norm": 0.6976385712623596, + "learning_rate": 7.729505489502078e-06, + "loss": 0.7387, + "step": 11558 + }, + { + "epoch": 0.6361935164290825, + "grad_norm": 0.6482532620429993, + "learning_rate": 7.729142299403613e-06, + "loss": 0.7715, + "step": 11559 + }, + { + "epoch": 0.6362485552314382, + "grad_norm": 0.7140287160873413, + "learning_rate": 7.728779088793643e-06, + "loss": 0.8562, + "step": 11560 + }, + { + "epoch": 0.6363035940337938, + "grad_norm": 0.6579470634460449, + "learning_rate": 7.728415857674901e-06, + "loss": 0.727, + "step": 11561 + }, + { + "epoch": 0.6363586328361495, + "grad_norm": 0.8670933246612549, + "learning_rate": 7.728052606050116e-06, + "loss": 0.7459, + "step": 11562 + }, + { + "epoch": 0.6364136716385052, + "grad_norm": 0.7995489835739136, + "learning_rate": 7.72768933392202e-06, + "loss": 0.8228, + "step": 11563 + }, + { + "epoch": 0.6364687104408608, + "grad_norm": 0.6467362642288208, + "learning_rate": 7.727326041293336e-06, + "loss": 0.7545, + "step": 11564 + }, + { + "epoch": 0.6365237492432164, + "grad_norm": 0.6646577715873718, + "learning_rate": 7.726962728166803e-06, + "loss": 0.7824, + "step": 11565 + }, + { + "epoch": 0.6365787880455721, + "grad_norm": 0.6576912999153137, + "learning_rate": 7.726599394545149e-06, + "loss": 0.7324, + "step": 11566 + }, + { + "epoch": 0.6366338268479278, + "grad_norm": 0.7514963150024414, + "learning_rate": 7.726236040431101e-06, + "loss": 0.7712, + "step": 11567 + }, + { + "epoch": 0.6366888656502835, + "grad_norm": 0.7313328981399536, + "learning_rate": 7.725872665827394e-06, + "loss": 0.7361, + "step": 11568 + }, + { + "epoch": 0.6367439044526391, + "grad_norm": 0.7109994292259216, + "learning_rate": 7.725509270736759e-06, + "loss": 0.812, + "step": 11569 + }, + { + "epoch": 0.6367989432549948, + "grad_norm": 1.128675103187561, + "learning_rate": 7.725145855161924e-06, + "loss": 0.726, + "step": 11570 + }, + { + "epoch": 0.6368539820573504, + "grad_norm": 0.7357437014579773, + "learning_rate": 7.724782419105622e-06, + "loss": 0.7958, + "step": 11571 + }, + { + "epoch": 0.6369090208597061, + "grad_norm": 0.6874725222587585, + "learning_rate": 7.724418962570587e-06, + "loss": 0.751, + "step": 11572 + }, + { + "epoch": 0.6369640596620617, + "grad_norm": 0.7175989747047424, + "learning_rate": 7.724055485559545e-06, + "loss": 0.7191, + "step": 11573 + }, + { + "epoch": 0.6370190984644174, + "grad_norm": 0.6424688100814819, + "learning_rate": 7.723691988075235e-06, + "loss": 0.608, + "step": 11574 + }, + { + "epoch": 0.6370741372667731, + "grad_norm": 0.6845381855964661, + "learning_rate": 7.723328470120383e-06, + "loss": 0.7465, + "step": 11575 + }, + { + "epoch": 0.6371291760691288, + "grad_norm": 0.7955030202865601, + "learning_rate": 7.722964931697723e-06, + "loss": 0.745, + "step": 11576 + }, + { + "epoch": 0.6371842148714844, + "grad_norm": 0.6855689883232117, + "learning_rate": 7.722601372809989e-06, + "loss": 0.7764, + "step": 11577 + }, + { + "epoch": 0.63723925367384, + "grad_norm": 0.7505692839622498, + "learning_rate": 7.722237793459909e-06, + "loss": 0.8324, + "step": 11578 + }, + { + "epoch": 0.6372942924761957, + "grad_norm": 0.6852842569351196, + "learning_rate": 7.721874193650221e-06, + "loss": 0.7599, + "step": 11579 + }, + { + "epoch": 0.6373493312785514, + "grad_norm": 0.698210597038269, + "learning_rate": 7.721510573383654e-06, + "loss": 0.843, + "step": 11580 + }, + { + "epoch": 0.637404370080907, + "grad_norm": 0.8344444632530212, + "learning_rate": 7.721146932662942e-06, + "loss": 0.8602, + "step": 11581 + }, + { + "epoch": 0.6374594088832627, + "grad_norm": 0.6385721564292908, + "learning_rate": 7.72078327149082e-06, + "loss": 0.7449, + "step": 11582 + }, + { + "epoch": 0.6375144476856184, + "grad_norm": 0.6474401354789734, + "learning_rate": 7.720419589870016e-06, + "loss": 0.6328, + "step": 11583 + }, + { + "epoch": 0.6375694864879741, + "grad_norm": 0.6554263234138489, + "learning_rate": 7.720055887803268e-06, + "loss": 0.6672, + "step": 11584 + }, + { + "epoch": 0.6376245252903296, + "grad_norm": 0.6551910638809204, + "learning_rate": 7.719692165293309e-06, + "loss": 0.8024, + "step": 11585 + }, + { + "epoch": 0.6376795640926853, + "grad_norm": 0.693418025970459, + "learning_rate": 7.719328422342871e-06, + "loss": 0.726, + "step": 11586 + }, + { + "epoch": 0.637734602895041, + "grad_norm": 0.8642090559005737, + "learning_rate": 7.718964658954689e-06, + "loss": 0.8274, + "step": 11587 + }, + { + "epoch": 0.6377896416973967, + "grad_norm": 0.8255778551101685, + "learning_rate": 7.718600875131494e-06, + "loss": 0.7259, + "step": 11588 + }, + { + "epoch": 0.6378446804997523, + "grad_norm": 0.7492913007736206, + "learning_rate": 7.718237070876025e-06, + "loss": 0.7093, + "step": 11589 + }, + { + "epoch": 0.637899719302108, + "grad_norm": 0.7154868245124817, + "learning_rate": 7.717873246191013e-06, + "loss": 0.7909, + "step": 11590 + }, + { + "epoch": 0.6379547581044637, + "grad_norm": 0.7751424312591553, + "learning_rate": 7.717509401079194e-06, + "loss": 0.8528, + "step": 11591 + }, + { + "epoch": 0.6380097969068194, + "grad_norm": 0.68199223279953, + "learning_rate": 7.7171455355433e-06, + "loss": 0.7077, + "step": 11592 + }, + { + "epoch": 0.6380648357091749, + "grad_norm": 0.7340414524078369, + "learning_rate": 7.716781649586069e-06, + "loss": 0.693, + "step": 11593 + }, + { + "epoch": 0.6381198745115306, + "grad_norm": 0.6278988122940063, + "learning_rate": 7.716417743210234e-06, + "loss": 0.7049, + "step": 11594 + }, + { + "epoch": 0.6381749133138863, + "grad_norm": 0.9113193154335022, + "learning_rate": 7.716053816418532e-06, + "loss": 0.7757, + "step": 11595 + }, + { + "epoch": 0.638229952116242, + "grad_norm": 0.7059371471405029, + "learning_rate": 7.715689869213694e-06, + "loss": 0.7805, + "step": 11596 + }, + { + "epoch": 0.6382849909185976, + "grad_norm": 0.7508488297462463, + "learning_rate": 7.71532590159846e-06, + "loss": 0.7394, + "step": 11597 + }, + { + "epoch": 0.6383400297209533, + "grad_norm": 0.8222774863243103, + "learning_rate": 7.71496191357556e-06, + "loss": 0.7675, + "step": 11598 + }, + { + "epoch": 0.638395068523309, + "grad_norm": 0.7295246124267578, + "learning_rate": 7.714597905147736e-06, + "loss": 0.7766, + "step": 11599 + }, + { + "epoch": 0.6384501073256645, + "grad_norm": 0.7482065558433533, + "learning_rate": 7.71423387631772e-06, + "loss": 0.7334, + "step": 11600 + }, + { + "epoch": 0.6385051461280202, + "grad_norm": 0.7654659748077393, + "learning_rate": 7.71386982708825e-06, + "loss": 0.8097, + "step": 11601 + }, + { + "epoch": 0.6385601849303759, + "grad_norm": 0.9125531911849976, + "learning_rate": 7.71350575746206e-06, + "loss": 0.7776, + "step": 11602 + }, + { + "epoch": 0.6386152237327316, + "grad_norm": 0.8063878417015076, + "learning_rate": 7.713141667441886e-06, + "loss": 0.7899, + "step": 11603 + }, + { + "epoch": 0.6386702625350872, + "grad_norm": 0.7315171360969543, + "learning_rate": 7.712777557030466e-06, + "loss": 0.7884, + "step": 11604 + }, + { + "epoch": 0.6387253013374429, + "grad_norm": 0.7306345105171204, + "learning_rate": 7.712413426230536e-06, + "loss": 0.8646, + "step": 11605 + }, + { + "epoch": 0.6387803401397986, + "grad_norm": 0.8300313353538513, + "learning_rate": 7.712049275044833e-06, + "loss": 0.8131, + "step": 11606 + }, + { + "epoch": 0.6388353789421543, + "grad_norm": 0.7513623237609863, + "learning_rate": 7.711685103476093e-06, + "loss": 0.8115, + "step": 11607 + }, + { + "epoch": 0.6388904177445098, + "grad_norm": 0.7126060128211975, + "learning_rate": 7.711320911527054e-06, + "loss": 0.8198, + "step": 11608 + }, + { + "epoch": 0.6389454565468655, + "grad_norm": 0.7017398476600647, + "learning_rate": 7.710956699200454e-06, + "loss": 0.8088, + "step": 11609 + }, + { + "epoch": 0.6390004953492212, + "grad_norm": 0.7345026135444641, + "learning_rate": 7.710592466499027e-06, + "loss": 0.8228, + "step": 11610 + }, + { + "epoch": 0.6390555341515769, + "grad_norm": 0.6903058886528015, + "learning_rate": 7.710228213425514e-06, + "loss": 0.7058, + "step": 11611 + }, + { + "epoch": 0.6391105729539325, + "grad_norm": 0.6838604211807251, + "learning_rate": 7.70986393998265e-06, + "loss": 0.7091, + "step": 11612 + }, + { + "epoch": 0.6391656117562882, + "grad_norm": 0.7067943811416626, + "learning_rate": 7.709499646173177e-06, + "loss": 0.7631, + "step": 11613 + }, + { + "epoch": 0.6392206505586439, + "grad_norm": 0.7577057480812073, + "learning_rate": 7.709135331999827e-06, + "loss": 0.7545, + "step": 11614 + }, + { + "epoch": 0.6392756893609995, + "grad_norm": 0.6425572633743286, + "learning_rate": 7.70877099746534e-06, + "loss": 0.7188, + "step": 11615 + }, + { + "epoch": 0.6393307281633551, + "grad_norm": 0.7257497310638428, + "learning_rate": 7.708406642572459e-06, + "loss": 0.7514, + "step": 11616 + }, + { + "epoch": 0.6393857669657108, + "grad_norm": 0.8214251399040222, + "learning_rate": 7.708042267323916e-06, + "loss": 0.7824, + "step": 11617 + }, + { + "epoch": 0.6394408057680665, + "grad_norm": 0.7879108786582947, + "learning_rate": 7.707677871722453e-06, + "loss": 0.6122, + "step": 11618 + }, + { + "epoch": 0.6394958445704222, + "grad_norm": 0.6656795740127563, + "learning_rate": 7.707313455770808e-06, + "loss": 0.754, + "step": 11619 + }, + { + "epoch": 0.6395508833727778, + "grad_norm": 0.7196451425552368, + "learning_rate": 7.70694901947172e-06, + "loss": 0.7662, + "step": 11620 + }, + { + "epoch": 0.6396059221751335, + "grad_norm": 0.8213779926300049, + "learning_rate": 7.706584562827928e-06, + "loss": 0.8732, + "step": 11621 + }, + { + "epoch": 0.6396609609774891, + "grad_norm": 0.7114893794059753, + "learning_rate": 7.70622008584217e-06, + "loss": 0.8493, + "step": 11622 + }, + { + "epoch": 0.6397159997798448, + "grad_norm": 0.7009783983230591, + "learning_rate": 7.705855588517188e-06, + "loss": 0.738, + "step": 11623 + }, + { + "epoch": 0.6397710385822004, + "grad_norm": 0.7576995491981506, + "learning_rate": 7.705491070855717e-06, + "loss": 0.8839, + "step": 11624 + }, + { + "epoch": 0.6398260773845561, + "grad_norm": 0.705784022808075, + "learning_rate": 7.7051265328605e-06, + "loss": 0.7246, + "step": 11625 + }, + { + "epoch": 0.6398811161869118, + "grad_norm": 0.6696903109550476, + "learning_rate": 7.704761974534277e-06, + "loss": 0.7418, + "step": 11626 + }, + { + "epoch": 0.6399361549892675, + "grad_norm": 0.8617024421691895, + "learning_rate": 7.704397395879786e-06, + "loss": 0.8109, + "step": 11627 + }, + { + "epoch": 0.6399911937916231, + "grad_norm": 0.6819054484367371, + "learning_rate": 7.70403279689977e-06, + "loss": 0.6438, + "step": 11628 + }, + { + "epoch": 0.6400462325939787, + "grad_norm": 0.6145044565200806, + "learning_rate": 7.703668177596966e-06, + "loss": 0.6712, + "step": 11629 + }, + { + "epoch": 0.6401012713963344, + "grad_norm": 0.6946390271186829, + "learning_rate": 7.703303537974116e-06, + "loss": 0.8099, + "step": 11630 + }, + { + "epoch": 0.6401563101986901, + "grad_norm": 0.6791605949401855, + "learning_rate": 7.702938878033961e-06, + "loss": 0.7494, + "step": 11631 + }, + { + "epoch": 0.6402113490010457, + "grad_norm": 0.6718626618385315, + "learning_rate": 7.70257419777924e-06, + "loss": 0.7471, + "step": 11632 + }, + { + "epoch": 0.6402663878034014, + "grad_norm": 0.8051798343658447, + "learning_rate": 7.702209497212694e-06, + "loss": 0.8569, + "step": 11633 + }, + { + "epoch": 0.6403214266057571, + "grad_norm": 0.6602774858474731, + "learning_rate": 7.701844776337067e-06, + "loss": 0.7396, + "step": 11634 + }, + { + "epoch": 0.6403764654081128, + "grad_norm": 0.672363817691803, + "learning_rate": 7.701480035155096e-06, + "loss": 0.7584, + "step": 11635 + }, + { + "epoch": 0.6404315042104683, + "grad_norm": 0.7363641262054443, + "learning_rate": 7.701115273669524e-06, + "loss": 0.8149, + "step": 11636 + }, + { + "epoch": 0.640486543012824, + "grad_norm": 0.7238422632217407, + "learning_rate": 7.700750491883094e-06, + "loss": 0.7598, + "step": 11637 + }, + { + "epoch": 0.6405415818151797, + "grad_norm": 1.3627614974975586, + "learning_rate": 7.700385689798544e-06, + "loss": 0.8303, + "step": 11638 + }, + { + "epoch": 0.6405966206175354, + "grad_norm": 0.6339633464813232, + "learning_rate": 7.70002086741862e-06, + "loss": 0.7308, + "step": 11639 + }, + { + "epoch": 0.640651659419891, + "grad_norm": 0.6821589469909668, + "learning_rate": 7.699656024746062e-06, + "loss": 0.6728, + "step": 11640 + }, + { + "epoch": 0.6407066982222467, + "grad_norm": 0.8514766097068787, + "learning_rate": 7.699291161783611e-06, + "loss": 0.8693, + "step": 11641 + }, + { + "epoch": 0.6407617370246024, + "grad_norm": 0.649075984954834, + "learning_rate": 7.698926278534011e-06, + "loss": 0.7482, + "step": 11642 + }, + { + "epoch": 0.640816775826958, + "grad_norm": 0.6507017016410828, + "learning_rate": 7.698561375000001e-06, + "loss": 0.7841, + "step": 11643 + }, + { + "epoch": 0.6408718146293136, + "grad_norm": 0.6736069321632385, + "learning_rate": 7.69819645118433e-06, + "loss": 0.74, + "step": 11644 + }, + { + "epoch": 0.6409268534316693, + "grad_norm": 0.6727941632270813, + "learning_rate": 7.697831507089734e-06, + "loss": 0.806, + "step": 11645 + }, + { + "epoch": 0.640981892234025, + "grad_norm": 0.7089083194732666, + "learning_rate": 7.697466542718959e-06, + "loss": 0.8091, + "step": 11646 + }, + { + "epoch": 0.6410369310363806, + "grad_norm": 0.6355387568473816, + "learning_rate": 7.69710155807475e-06, + "loss": 0.7033, + "step": 11647 + }, + { + "epoch": 0.6410919698387363, + "grad_norm": 0.6327098608016968, + "learning_rate": 7.696736553159846e-06, + "loss": 0.7664, + "step": 11648 + }, + { + "epoch": 0.641147008641092, + "grad_norm": 0.6971945762634277, + "learning_rate": 7.69637152797699e-06, + "loss": 0.7441, + "step": 11649 + }, + { + "epoch": 0.6412020474434477, + "grad_norm": 0.7420539855957031, + "learning_rate": 7.696006482528929e-06, + "loss": 0.7909, + "step": 11650 + }, + { + "epoch": 0.6412570862458032, + "grad_norm": 0.6877853274345398, + "learning_rate": 7.695641416818405e-06, + "loss": 0.7624, + "step": 11651 + }, + { + "epoch": 0.6413121250481589, + "grad_norm": 0.7337075471878052, + "learning_rate": 7.695276330848162e-06, + "loss": 0.7829, + "step": 11652 + }, + { + "epoch": 0.6413671638505146, + "grad_norm": 0.6423582434654236, + "learning_rate": 7.694911224620944e-06, + "loss": 0.6686, + "step": 11653 + }, + { + "epoch": 0.6414222026528703, + "grad_norm": 0.7826602458953857, + "learning_rate": 7.694546098139492e-06, + "loss": 0.774, + "step": 11654 + }, + { + "epoch": 0.6414772414552259, + "grad_norm": 0.7678147554397583, + "learning_rate": 7.694180951406556e-06, + "loss": 0.8067, + "step": 11655 + }, + { + "epoch": 0.6415322802575816, + "grad_norm": 0.6400566101074219, + "learning_rate": 7.693815784424875e-06, + "loss": 0.7796, + "step": 11656 + }, + { + "epoch": 0.6415873190599373, + "grad_norm": 0.6606197357177734, + "learning_rate": 7.693450597197196e-06, + "loss": 0.7381, + "step": 11657 + }, + { + "epoch": 0.641642357862293, + "grad_norm": 0.7953683137893677, + "learning_rate": 7.693085389726262e-06, + "loss": 0.8867, + "step": 11658 + }, + { + "epoch": 0.6416973966646485, + "grad_norm": 0.6763843894004822, + "learning_rate": 7.692720162014822e-06, + "loss": 0.7579, + "step": 11659 + }, + { + "epoch": 0.6417524354670042, + "grad_norm": 0.6456292867660522, + "learning_rate": 7.692354914065617e-06, + "loss": 0.7814, + "step": 11660 + }, + { + "epoch": 0.6418074742693599, + "grad_norm": 0.702803373336792, + "learning_rate": 7.691989645881393e-06, + "loss": 0.7393, + "step": 11661 + }, + { + "epoch": 0.6418625130717156, + "grad_norm": 0.8328298926353455, + "learning_rate": 7.691624357464895e-06, + "loss": 0.6587, + "step": 11662 + }, + { + "epoch": 0.6419175518740712, + "grad_norm": 0.8409613966941833, + "learning_rate": 7.691259048818871e-06, + "loss": 0.8075, + "step": 11663 + }, + { + "epoch": 0.6419725906764269, + "grad_norm": 0.6969256401062012, + "learning_rate": 7.690893719946062e-06, + "loss": 0.8061, + "step": 11664 + }, + { + "epoch": 0.6420276294787826, + "grad_norm": 0.7689732313156128, + "learning_rate": 7.690528370849217e-06, + "loss": 0.7709, + "step": 11665 + }, + { + "epoch": 0.6420826682811382, + "grad_norm": 0.8239523768424988, + "learning_rate": 7.69016300153108e-06, + "loss": 0.7421, + "step": 11666 + }, + { + "epoch": 0.6421377070834938, + "grad_norm": 0.7199227809906006, + "learning_rate": 7.689797611994398e-06, + "loss": 0.7877, + "step": 11667 + }, + { + "epoch": 0.6421927458858495, + "grad_norm": 0.8315985798835754, + "learning_rate": 7.689432202241919e-06, + "loss": 0.8458, + "step": 11668 + }, + { + "epoch": 0.6422477846882052, + "grad_norm": 0.7213512063026428, + "learning_rate": 7.689066772276385e-06, + "loss": 0.7199, + "step": 11669 + }, + { + "epoch": 0.6423028234905609, + "grad_norm": 0.6023604273796082, + "learning_rate": 7.688701322100547e-06, + "loss": 0.6485, + "step": 11670 + }, + { + "epoch": 0.6423578622929165, + "grad_norm": 0.8171319365501404, + "learning_rate": 7.688335851717148e-06, + "loss": 0.7561, + "step": 11671 + }, + { + "epoch": 0.6424129010952722, + "grad_norm": 0.6545816659927368, + "learning_rate": 7.687970361128937e-06, + "loss": 0.6796, + "step": 11672 + }, + { + "epoch": 0.6424679398976278, + "grad_norm": 0.8093686103820801, + "learning_rate": 7.687604850338661e-06, + "loss": 0.8538, + "step": 11673 + }, + { + "epoch": 0.6425229786999835, + "grad_norm": 0.6438135504722595, + "learning_rate": 7.687239319349066e-06, + "loss": 0.7046, + "step": 11674 + }, + { + "epoch": 0.6425780175023391, + "grad_norm": 0.685100257396698, + "learning_rate": 7.6868737681629e-06, + "loss": 0.7568, + "step": 11675 + }, + { + "epoch": 0.6426330563046948, + "grad_norm": 0.6850112676620483, + "learning_rate": 7.68650819678291e-06, + "loss": 0.7082, + "step": 11676 + }, + { + "epoch": 0.6426880951070505, + "grad_norm": 0.7524490356445312, + "learning_rate": 7.686142605211843e-06, + "loss": 0.7285, + "step": 11677 + }, + { + "epoch": 0.6427431339094062, + "grad_norm": 0.7706617116928101, + "learning_rate": 7.685776993452446e-06, + "loss": 0.7934, + "step": 11678 + }, + { + "epoch": 0.6427981727117618, + "grad_norm": 0.6612235307693481, + "learning_rate": 7.68541136150747e-06, + "loss": 0.6538, + "step": 11679 + }, + { + "epoch": 0.6428532115141175, + "grad_norm": 0.6380587816238403, + "learning_rate": 7.68504570937966e-06, + "loss": 0.7, + "step": 11680 + }, + { + "epoch": 0.6429082503164731, + "grad_norm": 0.6563882231712341, + "learning_rate": 7.684680037071765e-06, + "loss": 0.6912, + "step": 11681 + }, + { + "epoch": 0.6429632891188288, + "grad_norm": 0.6579793095588684, + "learning_rate": 7.684314344586534e-06, + "loss": 0.7263, + "step": 11682 + }, + { + "epoch": 0.6430183279211844, + "grad_norm": 0.7029374837875366, + "learning_rate": 7.683948631926713e-06, + "loss": 0.7151, + "step": 11683 + }, + { + "epoch": 0.6430733667235401, + "grad_norm": 0.6683217883110046, + "learning_rate": 7.683582899095056e-06, + "loss": 0.7643, + "step": 11684 + }, + { + "epoch": 0.6431284055258958, + "grad_norm": 1.0482646226882935, + "learning_rate": 7.683217146094308e-06, + "loss": 0.8889, + "step": 11685 + }, + { + "epoch": 0.6431834443282514, + "grad_norm": 0.7101102471351624, + "learning_rate": 7.682851372927216e-06, + "loss": 0.7762, + "step": 11686 + }, + { + "epoch": 0.643238483130607, + "grad_norm": 0.674961268901825, + "learning_rate": 7.682485579596533e-06, + "loss": 0.736, + "step": 11687 + }, + { + "epoch": 0.6432935219329627, + "grad_norm": 0.7071837782859802, + "learning_rate": 7.682119766105005e-06, + "loss": 0.7231, + "step": 11688 + }, + { + "epoch": 0.6433485607353184, + "grad_norm": 0.6982744932174683, + "learning_rate": 7.681753932455383e-06, + "loss": 0.7498, + "step": 11689 + }, + { + "epoch": 0.643403599537674, + "grad_norm": 0.6927201747894287, + "learning_rate": 7.681388078650415e-06, + "loss": 0.803, + "step": 11690 + }, + { + "epoch": 0.6434586383400297, + "grad_norm": 0.7299236059188843, + "learning_rate": 7.681022204692854e-06, + "loss": 0.7386, + "step": 11691 + }, + { + "epoch": 0.6435136771423854, + "grad_norm": 0.8809047937393188, + "learning_rate": 7.680656310585449e-06, + "loss": 0.741, + "step": 11692 + }, + { + "epoch": 0.6435687159447411, + "grad_norm": 0.862843930721283, + "learning_rate": 7.680290396330947e-06, + "loss": 0.8357, + "step": 11693 + }, + { + "epoch": 0.6436237547470967, + "grad_norm": 0.7436664700508118, + "learning_rate": 7.679924461932098e-06, + "loss": 0.8352, + "step": 11694 + }, + { + "epoch": 0.6436787935494523, + "grad_norm": 0.6582232713699341, + "learning_rate": 7.679558507391657e-06, + "loss": 0.7107, + "step": 11695 + }, + { + "epoch": 0.643733832351808, + "grad_norm": 0.6798850297927856, + "learning_rate": 7.67919253271237e-06, + "loss": 0.6968, + "step": 11696 + }, + { + "epoch": 0.6437888711541637, + "grad_norm": 0.7747187614440918, + "learning_rate": 7.67882653789699e-06, + "loss": 0.7611, + "step": 11697 + }, + { + "epoch": 0.6438439099565193, + "grad_norm": 0.7097567915916443, + "learning_rate": 7.678460522948267e-06, + "loss": 0.7275, + "step": 11698 + }, + { + "epoch": 0.643898948758875, + "grad_norm": 0.6958394050598145, + "learning_rate": 7.678094487868952e-06, + "loss": 0.7441, + "step": 11699 + }, + { + "epoch": 0.6439539875612307, + "grad_norm": 0.9129040837287903, + "learning_rate": 7.677728432661794e-06, + "loss": 0.7693, + "step": 11700 + }, + { + "epoch": 0.6440090263635864, + "grad_norm": 1.1396137475967407, + "learning_rate": 7.677362357329548e-06, + "loss": 0.7479, + "step": 11701 + }, + { + "epoch": 0.644064065165942, + "grad_norm": 0.8163042664527893, + "learning_rate": 7.67699626187496e-06, + "loss": 0.835, + "step": 11702 + }, + { + "epoch": 0.6441191039682976, + "grad_norm": 0.9869117736816406, + "learning_rate": 7.676630146300787e-06, + "loss": 0.769, + "step": 11703 + }, + { + "epoch": 0.6441741427706533, + "grad_norm": 0.7439526915550232, + "learning_rate": 7.676264010609777e-06, + "loss": 0.8239, + "step": 11704 + }, + { + "epoch": 0.644229181573009, + "grad_norm": 0.6943735480308533, + "learning_rate": 7.675897854804685e-06, + "loss": 0.7702, + "step": 11705 + }, + { + "epoch": 0.6442842203753646, + "grad_norm": 0.7384238243103027, + "learning_rate": 7.67553167888826e-06, + "loss": 0.6911, + "step": 11706 + }, + { + "epoch": 0.6443392591777203, + "grad_norm": 0.660022497177124, + "learning_rate": 7.675165482863254e-06, + "loss": 0.7359, + "step": 11707 + }, + { + "epoch": 0.644394297980076, + "grad_norm": 0.6956108808517456, + "learning_rate": 7.674799266732422e-06, + "loss": 0.7845, + "step": 11708 + }, + { + "epoch": 0.6444493367824317, + "grad_norm": 0.7361618280410767, + "learning_rate": 7.674433030498513e-06, + "loss": 0.7391, + "step": 11709 + }, + { + "epoch": 0.6445043755847872, + "grad_norm": 0.7655043005943298, + "learning_rate": 7.674066774164284e-06, + "loss": 0.8305, + "step": 11710 + }, + { + "epoch": 0.6445594143871429, + "grad_norm": 0.7160911560058594, + "learning_rate": 7.673700497732483e-06, + "loss": 0.7654, + "step": 11711 + }, + { + "epoch": 0.6446144531894986, + "grad_norm": 0.7812016010284424, + "learning_rate": 7.673334201205866e-06, + "loss": 0.8212, + "step": 11712 + }, + { + "epoch": 0.6446694919918543, + "grad_norm": 0.7457767128944397, + "learning_rate": 7.672967884587184e-06, + "loss": 0.8084, + "step": 11713 + }, + { + "epoch": 0.6447245307942099, + "grad_norm": 0.7524051070213318, + "learning_rate": 7.672601547879189e-06, + "loss": 0.7525, + "step": 11714 + }, + { + "epoch": 0.6447795695965656, + "grad_norm": 0.7271043062210083, + "learning_rate": 7.672235191084638e-06, + "loss": 0.7627, + "step": 11715 + }, + { + "epoch": 0.6448346083989213, + "grad_norm": 0.6893014907836914, + "learning_rate": 7.671868814206283e-06, + "loss": 0.7969, + "step": 11716 + }, + { + "epoch": 0.644889647201277, + "grad_norm": 0.7057414054870605, + "learning_rate": 7.671502417246876e-06, + "loss": 0.7448, + "step": 11717 + }, + { + "epoch": 0.6449446860036325, + "grad_norm": 0.7490910887718201, + "learning_rate": 7.671136000209172e-06, + "loss": 0.8046, + "step": 11718 + }, + { + "epoch": 0.6449997248059882, + "grad_norm": 0.7338950634002686, + "learning_rate": 7.670769563095926e-06, + "loss": 0.8521, + "step": 11719 + }, + { + "epoch": 0.6450547636083439, + "grad_norm": 0.8669398427009583, + "learning_rate": 7.670403105909891e-06, + "loss": 0.7803, + "step": 11720 + }, + { + "epoch": 0.6451098024106996, + "grad_norm": 0.7012562155723572, + "learning_rate": 7.67003662865382e-06, + "loss": 0.8047, + "step": 11721 + }, + { + "epoch": 0.6451648412130552, + "grad_norm": 0.9933050274848938, + "learning_rate": 7.66967013133047e-06, + "loss": 0.7081, + "step": 11722 + }, + { + "epoch": 0.6452198800154109, + "grad_norm": 1.12044358253479, + "learning_rate": 7.669303613942592e-06, + "loss": 0.7315, + "step": 11723 + }, + { + "epoch": 0.6452749188177666, + "grad_norm": 0.8654733300209045, + "learning_rate": 7.668937076492943e-06, + "loss": 0.6849, + "step": 11724 + }, + { + "epoch": 0.6453299576201222, + "grad_norm": 0.7081291675567627, + "learning_rate": 7.668570518984277e-06, + "loss": 0.7584, + "step": 11725 + }, + { + "epoch": 0.6453849964224778, + "grad_norm": 0.7473898530006409, + "learning_rate": 7.66820394141935e-06, + "loss": 0.8364, + "step": 11726 + }, + { + "epoch": 0.6454400352248335, + "grad_norm": 0.7863657474517822, + "learning_rate": 7.667837343800916e-06, + "loss": 0.7235, + "step": 11727 + }, + { + "epoch": 0.6454950740271892, + "grad_norm": 0.6664546728134155, + "learning_rate": 7.667470726131732e-06, + "loss": 0.7203, + "step": 11728 + }, + { + "epoch": 0.6455501128295448, + "grad_norm": 0.7182374596595764, + "learning_rate": 7.667104088414552e-06, + "loss": 0.7376, + "step": 11729 + }, + { + "epoch": 0.6456051516319005, + "grad_norm": 0.6518070697784424, + "learning_rate": 7.666737430652128e-06, + "loss": 0.6804, + "step": 11730 + }, + { + "epoch": 0.6456601904342562, + "grad_norm": 0.7354047894477844, + "learning_rate": 7.666370752847223e-06, + "loss": 0.7648, + "step": 11731 + }, + { + "epoch": 0.6457152292366118, + "grad_norm": 0.7440805435180664, + "learning_rate": 7.666004055002588e-06, + "loss": 0.7674, + "step": 11732 + }, + { + "epoch": 0.6457702680389674, + "grad_norm": 1.6423569917678833, + "learning_rate": 7.665637337120981e-06, + "loss": 0.8957, + "step": 11733 + }, + { + "epoch": 0.6458253068413231, + "grad_norm": 0.6960558295249939, + "learning_rate": 7.665270599205156e-06, + "loss": 0.7278, + "step": 11734 + }, + { + "epoch": 0.6458803456436788, + "grad_norm": 0.6983850002288818, + "learning_rate": 7.664903841257871e-06, + "loss": 0.7351, + "step": 11735 + }, + { + "epoch": 0.6459353844460345, + "grad_norm": 0.6905686855316162, + "learning_rate": 7.664537063281883e-06, + "loss": 0.7558, + "step": 11736 + }, + { + "epoch": 0.6459904232483901, + "grad_norm": 0.7483980655670166, + "learning_rate": 7.664170265279946e-06, + "loss": 0.813, + "step": 11737 + }, + { + "epoch": 0.6460454620507458, + "grad_norm": 0.767756998538971, + "learning_rate": 7.66380344725482e-06, + "loss": 0.8397, + "step": 11738 + }, + { + "epoch": 0.6461005008531014, + "grad_norm": 0.7813250422477722, + "learning_rate": 7.66343660920926e-06, + "loss": 0.8034, + "step": 11739 + }, + { + "epoch": 0.6461555396554571, + "grad_norm": 0.7357046604156494, + "learning_rate": 7.663069751146022e-06, + "loss": 0.7604, + "step": 11740 + }, + { + "epoch": 0.6462105784578127, + "grad_norm": 0.620285153388977, + "learning_rate": 7.662702873067866e-06, + "loss": 0.6191, + "step": 11741 + }, + { + "epoch": 0.6462656172601684, + "grad_norm": 0.6711301803588867, + "learning_rate": 7.662335974977549e-06, + "loss": 0.7674, + "step": 11742 + }, + { + "epoch": 0.6463206560625241, + "grad_norm": 0.756258487701416, + "learning_rate": 7.661969056877824e-06, + "loss": 0.7074, + "step": 11743 + }, + { + "epoch": 0.6463756948648798, + "grad_norm": 0.8121050596237183, + "learning_rate": 7.661602118771456e-06, + "loss": 0.8028, + "step": 11744 + }, + { + "epoch": 0.6464307336672354, + "grad_norm": 0.735906720161438, + "learning_rate": 7.661235160661197e-06, + "loss": 0.7197, + "step": 11745 + }, + { + "epoch": 0.646485772469591, + "grad_norm": 0.644490122795105, + "learning_rate": 7.660868182549807e-06, + "loss": 0.6172, + "step": 11746 + }, + { + "epoch": 0.6465408112719467, + "grad_norm": 0.7228739261627197, + "learning_rate": 7.660501184440045e-06, + "loss": 0.8302, + "step": 11747 + }, + { + "epoch": 0.6465958500743024, + "grad_norm": 0.8292868137359619, + "learning_rate": 7.660134166334668e-06, + "loss": 0.7506, + "step": 11748 + }, + { + "epoch": 0.646650888876658, + "grad_norm": 0.7224695086479187, + "learning_rate": 7.659767128236433e-06, + "loss": 0.8043, + "step": 11749 + }, + { + "epoch": 0.6467059276790137, + "grad_norm": 0.7092188000679016, + "learning_rate": 7.659400070148102e-06, + "loss": 0.7838, + "step": 11750 + }, + { + "epoch": 0.6467609664813694, + "grad_norm": 0.6975178122520447, + "learning_rate": 7.65903299207243e-06, + "loss": 0.7576, + "step": 11751 + }, + { + "epoch": 0.6468160052837251, + "grad_norm": 0.6524471044540405, + "learning_rate": 7.658665894012179e-06, + "loss": 0.7822, + "step": 11752 + }, + { + "epoch": 0.6468710440860806, + "grad_norm": 0.8134269118309021, + "learning_rate": 7.658298775970107e-06, + "loss": 0.8116, + "step": 11753 + }, + { + "epoch": 0.6469260828884363, + "grad_norm": 0.7166362404823303, + "learning_rate": 7.657931637948974e-06, + "loss": 0.768, + "step": 11754 + }, + { + "epoch": 0.646981121690792, + "grad_norm": 0.6418643593788147, + "learning_rate": 7.657564479951535e-06, + "loss": 0.7488, + "step": 11755 + }, + { + "epoch": 0.6470361604931477, + "grad_norm": 0.7104085087776184, + "learning_rate": 7.657197301980556e-06, + "loss": 0.7518, + "step": 11756 + }, + { + "epoch": 0.6470911992955033, + "grad_norm": 0.7297894358634949, + "learning_rate": 7.656830104038793e-06, + "loss": 0.7877, + "step": 11757 + }, + { + "epoch": 0.647146238097859, + "grad_norm": 0.8037092089653015, + "learning_rate": 7.656462886129006e-06, + "loss": 0.7375, + "step": 11758 + }, + { + "epoch": 0.6472012769002147, + "grad_norm": 0.7498913407325745, + "learning_rate": 7.656095648253955e-06, + "loss": 0.7899, + "step": 11759 + }, + { + "epoch": 0.6472563157025704, + "grad_norm": 0.7383849620819092, + "learning_rate": 7.655728390416398e-06, + "loss": 0.8276, + "step": 11760 + }, + { + "epoch": 0.6473113545049259, + "grad_norm": 0.750481367111206, + "learning_rate": 7.6553611126191e-06, + "loss": 0.7649, + "step": 11761 + }, + { + "epoch": 0.6473663933072816, + "grad_norm": 0.8483286499977112, + "learning_rate": 7.654993814864817e-06, + "loss": 0.877, + "step": 11762 + }, + { + "epoch": 0.6474214321096373, + "grad_norm": 0.7938307523727417, + "learning_rate": 7.654626497156311e-06, + "loss": 0.8159, + "step": 11763 + }, + { + "epoch": 0.647476470911993, + "grad_norm": 0.6576653122901917, + "learning_rate": 7.654259159496343e-06, + "loss": 0.797, + "step": 11764 + }, + { + "epoch": 0.6475315097143486, + "grad_norm": 0.6495664715766907, + "learning_rate": 7.653891801887675e-06, + "loss": 0.6641, + "step": 11765 + }, + { + "epoch": 0.6475865485167043, + "grad_norm": 0.7447353601455688, + "learning_rate": 7.653524424333065e-06, + "loss": 0.667, + "step": 11766 + }, + { + "epoch": 0.64764158731906, + "grad_norm": 0.6565769910812378, + "learning_rate": 7.653157026835277e-06, + "loss": 0.7123, + "step": 11767 + }, + { + "epoch": 0.6476966261214157, + "grad_norm": 0.8406145572662354, + "learning_rate": 7.652789609397072e-06, + "loss": 0.7582, + "step": 11768 + }, + { + "epoch": 0.6477516649237712, + "grad_norm": 0.8478217720985413, + "learning_rate": 7.652422172021207e-06, + "loss": 0.6758, + "step": 11769 + }, + { + "epoch": 0.6478067037261269, + "grad_norm": 0.7230110168457031, + "learning_rate": 7.652054714710448e-06, + "loss": 0.8216, + "step": 11770 + }, + { + "epoch": 0.6478617425284826, + "grad_norm": 0.6718668341636658, + "learning_rate": 7.651687237467558e-06, + "loss": 0.7204, + "step": 11771 + }, + { + "epoch": 0.6479167813308382, + "grad_norm": 1.062383770942688, + "learning_rate": 7.651319740295296e-06, + "loss": 0.6853, + "step": 11772 + }, + { + "epoch": 0.6479718201331939, + "grad_norm": 0.7157385945320129, + "learning_rate": 7.650952223196423e-06, + "loss": 0.6826, + "step": 11773 + }, + { + "epoch": 0.6480268589355496, + "grad_norm": 0.6762190461158752, + "learning_rate": 7.650584686173703e-06, + "loss": 0.7673, + "step": 11774 + }, + { + "epoch": 0.6480818977379053, + "grad_norm": 0.7540121674537659, + "learning_rate": 7.650217129229897e-06, + "loss": 0.7361, + "step": 11775 + }, + { + "epoch": 0.6481369365402608, + "grad_norm": 1.0383096933364868, + "learning_rate": 7.649849552367771e-06, + "loss": 0.7936, + "step": 11776 + }, + { + "epoch": 0.6481919753426165, + "grad_norm": 0.6430917382240295, + "learning_rate": 7.649481955590084e-06, + "loss": 0.7738, + "step": 11777 + }, + { + "epoch": 0.6482470141449722, + "grad_norm": 0.7846735715866089, + "learning_rate": 7.6491143388996e-06, + "loss": 0.6892, + "step": 11778 + }, + { + "epoch": 0.6483020529473279, + "grad_norm": 0.7154437899589539, + "learning_rate": 7.64874670229908e-06, + "loss": 0.6889, + "step": 11779 + }, + { + "epoch": 0.6483570917496835, + "grad_norm": 0.731270432472229, + "learning_rate": 7.648379045791291e-06, + "loss": 0.6405, + "step": 11780 + }, + { + "epoch": 0.6484121305520392, + "grad_norm": 0.6782581210136414, + "learning_rate": 7.648011369378993e-06, + "loss": 0.7822, + "step": 11781 + }, + { + "epoch": 0.6484671693543949, + "grad_norm": 0.7025747299194336, + "learning_rate": 7.64764367306495e-06, + "loss": 0.6929, + "step": 11782 + }, + { + "epoch": 0.6485222081567505, + "grad_norm": 0.6791071891784668, + "learning_rate": 7.647275956851928e-06, + "loss": 0.7507, + "step": 11783 + }, + { + "epoch": 0.6485772469591061, + "grad_norm": 0.7598931193351746, + "learning_rate": 7.646908220742686e-06, + "loss": 0.776, + "step": 11784 + }, + { + "epoch": 0.6486322857614618, + "grad_norm": 0.6930273771286011, + "learning_rate": 7.646540464739993e-06, + "loss": 0.7653, + "step": 11785 + }, + { + "epoch": 0.6486873245638175, + "grad_norm": 0.7276393175125122, + "learning_rate": 7.646172688846608e-06, + "loss": 0.8102, + "step": 11786 + }, + { + "epoch": 0.6487423633661732, + "grad_norm": 0.6826562285423279, + "learning_rate": 7.645804893065298e-06, + "loss": 0.6182, + "step": 11787 + }, + { + "epoch": 0.6487974021685288, + "grad_norm": 0.7837507128715515, + "learning_rate": 7.645437077398827e-06, + "loss": 0.8124, + "step": 11788 + }, + { + "epoch": 0.6488524409708845, + "grad_norm": 0.6937540769577026, + "learning_rate": 7.645069241849959e-06, + "loss": 0.7831, + "step": 11789 + }, + { + "epoch": 0.6489074797732401, + "grad_norm": 0.6531546115875244, + "learning_rate": 7.644701386421458e-06, + "loss": 0.755, + "step": 11790 + }, + { + "epoch": 0.6489625185755958, + "grad_norm": 0.8563246726989746, + "learning_rate": 7.644333511116088e-06, + "loss": 0.7715, + "step": 11791 + }, + { + "epoch": 0.6490175573779514, + "grad_norm": 0.8330580592155457, + "learning_rate": 7.643965615936619e-06, + "loss": 0.6651, + "step": 11792 + }, + { + "epoch": 0.6490725961803071, + "grad_norm": 0.6478384137153625, + "learning_rate": 7.643597700885809e-06, + "loss": 0.7063, + "step": 11793 + }, + { + "epoch": 0.6491276349826628, + "grad_norm": 0.7169124484062195, + "learning_rate": 7.643229765966428e-06, + "loss": 0.7578, + "step": 11794 + }, + { + "epoch": 0.6491826737850185, + "grad_norm": 0.726198136806488, + "learning_rate": 7.642861811181239e-06, + "loss": 0.783, + "step": 11795 + }, + { + "epoch": 0.6492377125873741, + "grad_norm": 0.7167587280273438, + "learning_rate": 7.642493836533008e-06, + "loss": 0.81, + "step": 11796 + }, + { + "epoch": 0.6492927513897297, + "grad_norm": 0.7215337157249451, + "learning_rate": 7.642125842024502e-06, + "loss": 0.8176, + "step": 11797 + }, + { + "epoch": 0.6493477901920854, + "grad_norm": 0.7041502594947815, + "learning_rate": 7.641757827658484e-06, + "loss": 0.8117, + "step": 11798 + }, + { + "epoch": 0.6494028289944411, + "grad_norm": 1.0303698778152466, + "learning_rate": 7.64138979343772e-06, + "loss": 0.781, + "step": 11799 + }, + { + "epoch": 0.6494578677967967, + "grad_norm": 0.626518189907074, + "learning_rate": 7.64102173936498e-06, + "loss": 0.6668, + "step": 11800 + }, + { + "epoch": 0.6495129065991524, + "grad_norm": 0.8889065980911255, + "learning_rate": 7.640653665443025e-06, + "loss": 0.8076, + "step": 11801 + }, + { + "epoch": 0.6495679454015081, + "grad_norm": 0.8333556652069092, + "learning_rate": 7.640285571674626e-06, + "loss": 0.8111, + "step": 11802 + }, + { + "epoch": 0.6496229842038638, + "grad_norm": 0.7248615622520447, + "learning_rate": 7.639917458062547e-06, + "loss": 0.7876, + "step": 11803 + }, + { + "epoch": 0.6496780230062194, + "grad_norm": 0.8870820999145508, + "learning_rate": 7.639549324609554e-06, + "loss": 0.8586, + "step": 11804 + }, + { + "epoch": 0.649733061808575, + "grad_norm": 0.7777245044708252, + "learning_rate": 7.639181171318417e-06, + "loss": 0.7793, + "step": 11805 + }, + { + "epoch": 0.6497881006109307, + "grad_norm": 0.7858467102050781, + "learning_rate": 7.638812998191897e-06, + "loss": 0.7842, + "step": 11806 + }, + { + "epoch": 0.6498431394132864, + "grad_norm": 0.6278610825538635, + "learning_rate": 7.638444805232769e-06, + "loss": 0.6659, + "step": 11807 + }, + { + "epoch": 0.649898178215642, + "grad_norm": 0.6758826971054077, + "learning_rate": 7.638076592443795e-06, + "loss": 0.7047, + "step": 11808 + }, + { + "epoch": 0.6499532170179977, + "grad_norm": 0.745007336139679, + "learning_rate": 7.637708359827743e-06, + "loss": 0.8557, + "step": 11809 + }, + { + "epoch": 0.6500082558203534, + "grad_norm": 0.8092321157455444, + "learning_rate": 7.63734010738738e-06, + "loss": 0.7895, + "step": 11810 + }, + { + "epoch": 0.6500632946227091, + "grad_norm": 0.7055220603942871, + "learning_rate": 7.636971835125476e-06, + "loss": 0.7678, + "step": 11811 + }, + { + "epoch": 0.6501183334250646, + "grad_norm": 0.7130264043807983, + "learning_rate": 7.636603543044797e-06, + "loss": 0.7648, + "step": 11812 + }, + { + "epoch": 0.6501733722274203, + "grad_norm": 0.7494268417358398, + "learning_rate": 7.636235231148112e-06, + "loss": 0.7883, + "step": 11813 + }, + { + "epoch": 0.650228411029776, + "grad_norm": 0.7998068332672119, + "learning_rate": 7.635866899438189e-06, + "loss": 0.7849, + "step": 11814 + }, + { + "epoch": 0.6502834498321316, + "grad_norm": 0.6749094128608704, + "learning_rate": 7.635498547917795e-06, + "loss": 0.8488, + "step": 11815 + }, + { + "epoch": 0.6503384886344873, + "grad_norm": 0.743679940700531, + "learning_rate": 7.635130176589698e-06, + "loss": 0.7562, + "step": 11816 + }, + { + "epoch": 0.650393527436843, + "grad_norm": 0.8368289470672607, + "learning_rate": 7.634761785456671e-06, + "loss": 0.7012, + "step": 11817 + }, + { + "epoch": 0.6504485662391987, + "grad_norm": 0.7214943170547485, + "learning_rate": 7.634393374521478e-06, + "loss": 0.7386, + "step": 11818 + }, + { + "epoch": 0.6505036050415542, + "grad_norm": 0.7026216387748718, + "learning_rate": 7.63402494378689e-06, + "loss": 0.7444, + "step": 11819 + }, + { + "epoch": 0.6505586438439099, + "grad_norm": 0.6271201372146606, + "learning_rate": 7.633656493255677e-06, + "loss": 0.6567, + "step": 11820 + }, + { + "epoch": 0.6506136826462656, + "grad_norm": 0.8359349370002747, + "learning_rate": 7.633288022930606e-06, + "loss": 0.7081, + "step": 11821 + }, + { + "epoch": 0.6506687214486213, + "grad_norm": 0.7009666562080383, + "learning_rate": 7.632919532814444e-06, + "loss": 0.6892, + "step": 11822 + }, + { + "epoch": 0.6507237602509769, + "grad_norm": 0.7445069551467896, + "learning_rate": 7.632551022909966e-06, + "loss": 0.7854, + "step": 11823 + }, + { + "epoch": 0.6507787990533326, + "grad_norm": 0.7204466462135315, + "learning_rate": 7.63218249321994e-06, + "loss": 0.8065, + "step": 11824 + }, + { + "epoch": 0.6508338378556883, + "grad_norm": 0.7058166265487671, + "learning_rate": 7.631813943747135e-06, + "loss": 0.6668, + "step": 11825 + }, + { + "epoch": 0.650888876658044, + "grad_norm": 0.739919126033783, + "learning_rate": 7.631445374494319e-06, + "loss": 0.8657, + "step": 11826 + }, + { + "epoch": 0.6509439154603995, + "grad_norm": 1.0444670915603638, + "learning_rate": 7.631076785464263e-06, + "loss": 0.7226, + "step": 11827 + }, + { + "epoch": 0.6509989542627552, + "grad_norm": 0.7146627306938171, + "learning_rate": 7.630708176659743e-06, + "loss": 0.7567, + "step": 11828 + }, + { + "epoch": 0.6510539930651109, + "grad_norm": 0.6981074810028076, + "learning_rate": 7.630339548083521e-06, + "loss": 0.7158, + "step": 11829 + }, + { + "epoch": 0.6511090318674666, + "grad_norm": 0.7620309591293335, + "learning_rate": 7.629970899738372e-06, + "loss": 0.811, + "step": 11830 + }, + { + "epoch": 0.6511640706698222, + "grad_norm": 0.7017341256141663, + "learning_rate": 7.629602231627066e-06, + "loss": 0.7092, + "step": 11831 + }, + { + "epoch": 0.6512191094721779, + "grad_norm": 0.733524739742279, + "learning_rate": 7.629233543752373e-06, + "loss": 0.859, + "step": 11832 + }, + { + "epoch": 0.6512741482745336, + "grad_norm": 0.7246975898742676, + "learning_rate": 7.628864836117065e-06, + "loss": 0.7732, + "step": 11833 + }, + { + "epoch": 0.6513291870768892, + "grad_norm": 0.5763251185417175, + "learning_rate": 7.628496108723911e-06, + "loss": 0.6632, + "step": 11834 + }, + { + "epoch": 0.6513842258792448, + "grad_norm": 0.6120070815086365, + "learning_rate": 7.628127361575685e-06, + "loss": 0.6809, + "step": 11835 + }, + { + "epoch": 0.6514392646816005, + "grad_norm": 0.8650742769241333, + "learning_rate": 7.627758594675157e-06, + "loss": 0.6388, + "step": 11836 + }, + { + "epoch": 0.6514943034839562, + "grad_norm": 0.8650027513504028, + "learning_rate": 7.627389808025099e-06, + "loss": 0.7622, + "step": 11837 + }, + { + "epoch": 0.6515493422863119, + "grad_norm": 0.6683071851730347, + "learning_rate": 7.627021001628283e-06, + "loss": 0.7424, + "step": 11838 + }, + { + "epoch": 0.6516043810886675, + "grad_norm": 0.6821237206459045, + "learning_rate": 7.626652175487479e-06, + "loss": 0.7844, + "step": 11839 + }, + { + "epoch": 0.6516594198910232, + "grad_norm": 0.7142770886421204, + "learning_rate": 7.626283329605462e-06, + "loss": 0.7706, + "step": 11840 + }, + { + "epoch": 0.6517144586933789, + "grad_norm": 0.7870625257492065, + "learning_rate": 7.625914463985002e-06, + "loss": 0.7673, + "step": 11841 + }, + { + "epoch": 0.6517694974957345, + "grad_norm": 0.7386491894721985, + "learning_rate": 7.62554557862887e-06, + "loss": 0.7562, + "step": 11842 + }, + { + "epoch": 0.6518245362980901, + "grad_norm": 0.6529993414878845, + "learning_rate": 7.625176673539843e-06, + "loss": 0.8258, + "step": 11843 + }, + { + "epoch": 0.6518795751004458, + "grad_norm": 0.7010294795036316, + "learning_rate": 7.6248077487206895e-06, + "loss": 0.7773, + "step": 11844 + }, + { + "epoch": 0.6519346139028015, + "grad_norm": 0.6699075698852539, + "learning_rate": 7.624438804174184e-06, + "loss": 0.7163, + "step": 11845 + }, + { + "epoch": 0.6519896527051572, + "grad_norm": 0.6600161790847778, + "learning_rate": 7.624069839903099e-06, + "loss": 0.7355, + "step": 11846 + }, + { + "epoch": 0.6520446915075128, + "grad_norm": 0.6556873321533203, + "learning_rate": 7.623700855910205e-06, + "loss": 0.627, + "step": 11847 + }, + { + "epoch": 0.6520997303098685, + "grad_norm": 0.6867008805274963, + "learning_rate": 7.623331852198281e-06, + "loss": 0.8228, + "step": 11848 + }, + { + "epoch": 0.6521547691122241, + "grad_norm": 0.6885474324226379, + "learning_rate": 7.622962828770095e-06, + "loss": 0.6804, + "step": 11849 + }, + { + "epoch": 0.6522098079145798, + "grad_norm": 0.6903913021087646, + "learning_rate": 7.622593785628425e-06, + "loss": 0.6553, + "step": 11850 + }, + { + "epoch": 0.6522648467169354, + "grad_norm": 0.6581684947013855, + "learning_rate": 7.622224722776039e-06, + "loss": 0.7102, + "step": 11851 + }, + { + "epoch": 0.6523198855192911, + "grad_norm": 0.8261715769767761, + "learning_rate": 7.621855640215716e-06, + "loss": 0.676, + "step": 11852 + }, + { + "epoch": 0.6523749243216468, + "grad_norm": 0.6238247752189636, + "learning_rate": 7.6214865379502265e-06, + "loss": 0.7065, + "step": 11853 + }, + { + "epoch": 0.6524299631240025, + "grad_norm": 0.7350416779518127, + "learning_rate": 7.621117415982346e-06, + "loss": 0.7512, + "step": 11854 + }, + { + "epoch": 0.652485001926358, + "grad_norm": 0.7337208390235901, + "learning_rate": 7.620748274314851e-06, + "loss": 0.7593, + "step": 11855 + }, + { + "epoch": 0.6525400407287137, + "grad_norm": 0.6568214297294617, + "learning_rate": 7.620379112950511e-06, + "loss": 0.7363, + "step": 11856 + }, + { + "epoch": 0.6525950795310694, + "grad_norm": 0.7099055647850037, + "learning_rate": 7.620009931892105e-06, + "loss": 0.6631, + "step": 11857 + }, + { + "epoch": 0.652650118333425, + "grad_norm": 0.6563010215759277, + "learning_rate": 7.6196407311424035e-06, + "loss": 0.6617, + "step": 11858 + }, + { + "epoch": 0.6527051571357807, + "grad_norm": 0.6664251685142517, + "learning_rate": 7.6192715107041845e-06, + "loss": 0.7898, + "step": 11859 + }, + { + "epoch": 0.6527601959381364, + "grad_norm": 0.6524507403373718, + "learning_rate": 7.618902270580222e-06, + "loss": 0.767, + "step": 11860 + }, + { + "epoch": 0.6528152347404921, + "grad_norm": 0.7391313910484314, + "learning_rate": 7.61853301077329e-06, + "loss": 0.6015, + "step": 11861 + }, + { + "epoch": 0.6528702735428477, + "grad_norm": 0.7691878080368042, + "learning_rate": 7.618163731286167e-06, + "loss": 0.718, + "step": 11862 + }, + { + "epoch": 0.6529253123452033, + "grad_norm": 0.6524633765220642, + "learning_rate": 7.617794432121625e-06, + "loss": 0.6841, + "step": 11863 + }, + { + "epoch": 0.652980351147559, + "grad_norm": 0.7125405073165894, + "learning_rate": 7.61742511328244e-06, + "loss": 0.7654, + "step": 11864 + }, + { + "epoch": 0.6530353899499147, + "grad_norm": 0.7123568058013916, + "learning_rate": 7.617055774771389e-06, + "loss": 0.7189, + "step": 11865 + }, + { + "epoch": 0.6530904287522703, + "grad_norm": 0.6968240141868591, + "learning_rate": 7.616686416591248e-06, + "loss": 0.7201, + "step": 11866 + }, + { + "epoch": 0.653145467554626, + "grad_norm": 0.7208551168441772, + "learning_rate": 7.616317038744792e-06, + "loss": 0.6644, + "step": 11867 + }, + { + "epoch": 0.6532005063569817, + "grad_norm": 0.7320911884307861, + "learning_rate": 7.615947641234798e-06, + "loss": 0.7118, + "step": 11868 + }, + { + "epoch": 0.6532555451593374, + "grad_norm": 0.7762041687965393, + "learning_rate": 7.615578224064041e-06, + "loss": 0.7501, + "step": 11869 + }, + { + "epoch": 0.653310583961693, + "grad_norm": 0.7455989718437195, + "learning_rate": 7.6152087872352975e-06, + "loss": 0.8058, + "step": 11870 + }, + { + "epoch": 0.6533656227640486, + "grad_norm": 0.736044704914093, + "learning_rate": 7.614839330751347e-06, + "loss": 0.727, + "step": 11871 + }, + { + "epoch": 0.6534206615664043, + "grad_norm": 0.680171012878418, + "learning_rate": 7.614469854614961e-06, + "loss": 0.6722, + "step": 11872 + }, + { + "epoch": 0.65347570036876, + "grad_norm": 0.7598134279251099, + "learning_rate": 7.614100358828922e-06, + "loss": 0.7472, + "step": 11873 + }, + { + "epoch": 0.6535307391711156, + "grad_norm": 0.8288099765777588, + "learning_rate": 7.613730843396003e-06, + "loss": 0.7493, + "step": 11874 + }, + { + "epoch": 0.6535857779734713, + "grad_norm": 0.6436724066734314, + "learning_rate": 7.613361308318984e-06, + "loss": 0.7103, + "step": 11875 + }, + { + "epoch": 0.653640816775827, + "grad_norm": 0.671334981918335, + "learning_rate": 7.612991753600639e-06, + "loss": 0.6949, + "step": 11876 + }, + { + "epoch": 0.6536958555781827, + "grad_norm": 0.6019170880317688, + "learning_rate": 7.61262217924375e-06, + "loss": 0.6116, + "step": 11877 + }, + { + "epoch": 0.6537508943805382, + "grad_norm": 1.4682546854019165, + "learning_rate": 7.61225258525109e-06, + "loss": 0.9343, + "step": 11878 + }, + { + "epoch": 0.6538059331828939, + "grad_norm": 0.656822681427002, + "learning_rate": 7.611882971625439e-06, + "loss": 0.7357, + "step": 11879 + }, + { + "epoch": 0.6538609719852496, + "grad_norm": 0.635734498500824, + "learning_rate": 7.611513338369576e-06, + "loss": 0.6263, + "step": 11880 + }, + { + "epoch": 0.6539160107876053, + "grad_norm": 0.7123430967330933, + "learning_rate": 7.611143685486277e-06, + "loss": 0.8446, + "step": 11881 + }, + { + "epoch": 0.6539710495899609, + "grad_norm": 0.7597065567970276, + "learning_rate": 7.610774012978322e-06, + "loss": 0.7449, + "step": 11882 + }, + { + "epoch": 0.6540260883923166, + "grad_norm": 0.7555896043777466, + "learning_rate": 7.610404320848486e-06, + "loss": 0.7575, + "step": 11883 + }, + { + "epoch": 0.6540811271946723, + "grad_norm": 0.7572906613349915, + "learning_rate": 7.6100346090995506e-06, + "loss": 0.7547, + "step": 11884 + }, + { + "epoch": 0.654136165997028, + "grad_norm": 0.6663275957107544, + "learning_rate": 7.609664877734295e-06, + "loss": 0.7038, + "step": 11885 + }, + { + "epoch": 0.6541912047993835, + "grad_norm": 0.7346611618995667, + "learning_rate": 7.609295126755496e-06, + "loss": 0.7902, + "step": 11886 + }, + { + "epoch": 0.6542462436017392, + "grad_norm": 0.6846545338630676, + "learning_rate": 7.608925356165934e-06, + "loss": 0.7334, + "step": 11887 + }, + { + "epoch": 0.6543012824040949, + "grad_norm": 0.6714815497398376, + "learning_rate": 7.608555565968385e-06, + "loss": 0.7204, + "step": 11888 + }, + { + "epoch": 0.6543563212064506, + "grad_norm": 0.805095374584198, + "learning_rate": 7.608185756165634e-06, + "loss": 0.8521, + "step": 11889 + }, + { + "epoch": 0.6544113600088062, + "grad_norm": 0.8415316343307495, + "learning_rate": 7.607815926760456e-06, + "loss": 0.7076, + "step": 11890 + }, + { + "epoch": 0.6544663988111619, + "grad_norm": 0.7665743231773376, + "learning_rate": 7.607446077755632e-06, + "loss": 0.8072, + "step": 11891 + }, + { + "epoch": 0.6545214376135176, + "grad_norm": 0.6705248355865479, + "learning_rate": 7.607076209153939e-06, + "loss": 0.6607, + "step": 11892 + }, + { + "epoch": 0.6545764764158732, + "grad_norm": 0.6791796684265137, + "learning_rate": 7.606706320958159e-06, + "loss": 0.773, + "step": 11893 + }, + { + "epoch": 0.6546315152182288, + "grad_norm": 0.8177357316017151, + "learning_rate": 7.606336413171075e-06, + "loss": 0.8114, + "step": 11894 + }, + { + "epoch": 0.6546865540205845, + "grad_norm": 0.9491637945175171, + "learning_rate": 7.605966485795462e-06, + "loss": 0.7424, + "step": 11895 + }, + { + "epoch": 0.6547415928229402, + "grad_norm": 0.7326256036758423, + "learning_rate": 7.605596538834103e-06, + "loss": 0.8176, + "step": 11896 + }, + { + "epoch": 0.6547966316252959, + "grad_norm": 0.6081808805465698, + "learning_rate": 7.6052265722897775e-06, + "loss": 0.6827, + "step": 11897 + }, + { + "epoch": 0.6548516704276515, + "grad_norm": 0.7165681719779968, + "learning_rate": 7.604856586165268e-06, + "loss": 0.7854, + "step": 11898 + }, + { + "epoch": 0.6549067092300072, + "grad_norm": 0.8777725100517273, + "learning_rate": 7.604486580463353e-06, + "loss": 0.8084, + "step": 11899 + }, + { + "epoch": 0.6549617480323628, + "grad_norm": 0.6814439296722412, + "learning_rate": 7.604116555186811e-06, + "loss": 0.6869, + "step": 11900 + }, + { + "epoch": 0.6550167868347184, + "grad_norm": 0.7060914635658264, + "learning_rate": 7.60374651033843e-06, + "loss": 0.7066, + "step": 11901 + }, + { + "epoch": 0.6550718256370741, + "grad_norm": 0.6823089718818665, + "learning_rate": 7.603376445920987e-06, + "loss": 0.6095, + "step": 11902 + }, + { + "epoch": 0.6551268644394298, + "grad_norm": 0.7099863290786743, + "learning_rate": 7.603006361937262e-06, + "loss": 0.8037, + "step": 11903 + }, + { + "epoch": 0.6551819032417855, + "grad_norm": 0.6479066610336304, + "learning_rate": 7.602636258390037e-06, + "loss": 0.6844, + "step": 11904 + }, + { + "epoch": 0.6552369420441411, + "grad_norm": 0.6663268804550171, + "learning_rate": 7.602266135282097e-06, + "loss": 0.735, + "step": 11905 + }, + { + "epoch": 0.6552919808464968, + "grad_norm": 0.8670598268508911, + "learning_rate": 7.60189599261622e-06, + "loss": 0.779, + "step": 11906 + }, + { + "epoch": 0.6553470196488524, + "grad_norm": 0.607631504535675, + "learning_rate": 7.601525830395189e-06, + "loss": 0.6288, + "step": 11907 + }, + { + "epoch": 0.6554020584512081, + "grad_norm": 0.9054927229881287, + "learning_rate": 7.601155648621786e-06, + "loss": 0.8562, + "step": 11908 + }, + { + "epoch": 0.6554570972535637, + "grad_norm": 0.8069004416465759, + "learning_rate": 7.6007854472987955e-06, + "loss": 0.88, + "step": 11909 + }, + { + "epoch": 0.6555121360559194, + "grad_norm": 0.6393092274665833, + "learning_rate": 7.600415226428995e-06, + "loss": 0.6908, + "step": 11910 + }, + { + "epoch": 0.6555671748582751, + "grad_norm": 0.7533125281333923, + "learning_rate": 7.600044986015172e-06, + "loss": 0.8061, + "step": 11911 + }, + { + "epoch": 0.6556222136606308, + "grad_norm": 0.6859326958656311, + "learning_rate": 7.599674726060105e-06, + "loss": 0.7603, + "step": 11912 + }, + { + "epoch": 0.6556772524629864, + "grad_norm": 0.7284619808197021, + "learning_rate": 7.59930444656658e-06, + "loss": 0.7698, + "step": 11913 + }, + { + "epoch": 0.655732291265342, + "grad_norm": 1.074234127998352, + "learning_rate": 7.598934147537378e-06, + "loss": 0.8252, + "step": 11914 + }, + { + "epoch": 0.6557873300676977, + "grad_norm": 0.6899133920669556, + "learning_rate": 7.598563828975283e-06, + "loss": 0.6023, + "step": 11915 + }, + { + "epoch": 0.6558423688700534, + "grad_norm": 0.6736464500427246, + "learning_rate": 7.598193490883077e-06, + "loss": 0.788, + "step": 11916 + }, + { + "epoch": 0.655897407672409, + "grad_norm": 0.7646307349205017, + "learning_rate": 7.597823133263545e-06, + "loss": 0.7607, + "step": 11917 + }, + { + "epoch": 0.6559524464747647, + "grad_norm": 0.6413717865943909, + "learning_rate": 7.59745275611947e-06, + "loss": 0.6415, + "step": 11918 + }, + { + "epoch": 0.6560074852771204, + "grad_norm": 0.6605532169342041, + "learning_rate": 7.597082359453636e-06, + "loss": 0.6655, + "step": 11919 + }, + { + "epoch": 0.6560625240794761, + "grad_norm": 0.6573199033737183, + "learning_rate": 7.596711943268824e-06, + "loss": 0.624, + "step": 11920 + }, + { + "epoch": 0.6561175628818317, + "grad_norm": 0.8312102556228638, + "learning_rate": 7.596341507567822e-06, + "loss": 0.6803, + "step": 11921 + }, + { + "epoch": 0.6561726016841873, + "grad_norm": 0.6915873289108276, + "learning_rate": 7.59597105235341e-06, + "loss": 0.6897, + "step": 11922 + }, + { + "epoch": 0.656227640486543, + "grad_norm": 0.6916965842247009, + "learning_rate": 7.595600577628377e-06, + "loss": 0.7154, + "step": 11923 + }, + { + "epoch": 0.6562826792888987, + "grad_norm": 0.6712722182273865, + "learning_rate": 7.595230083395501e-06, + "loss": 0.7236, + "step": 11924 + }, + { + "epoch": 0.6563377180912543, + "grad_norm": 0.6514019966125488, + "learning_rate": 7.594859569657575e-06, + "loss": 0.6895, + "step": 11925 + }, + { + "epoch": 0.65639275689361, + "grad_norm": 0.7300555109977722, + "learning_rate": 7.594489036417378e-06, + "loss": 0.7563, + "step": 11926 + }, + { + "epoch": 0.6564477956959657, + "grad_norm": 0.8076907396316528, + "learning_rate": 7.594118483677695e-06, + "loss": 0.8883, + "step": 11927 + }, + { + "epoch": 0.6565028344983214, + "grad_norm": 0.666466236114502, + "learning_rate": 7.5937479114413114e-06, + "loss": 0.7641, + "step": 11928 + }, + { + "epoch": 0.6565578733006769, + "grad_norm": 0.6621832251548767, + "learning_rate": 7.593377319711013e-06, + "loss": 0.6687, + "step": 11929 + }, + { + "epoch": 0.6566129121030326, + "grad_norm": 0.8757139444351196, + "learning_rate": 7.593006708489585e-06, + "loss": 0.7746, + "step": 11930 + }, + { + "epoch": 0.6566679509053883, + "grad_norm": 0.646801769733429, + "learning_rate": 7.5926360777798135e-06, + "loss": 0.6884, + "step": 11931 + }, + { + "epoch": 0.656722989707744, + "grad_norm": 0.6703395843505859, + "learning_rate": 7.592265427584482e-06, + "loss": 0.6822, + "step": 11932 + }, + { + "epoch": 0.6567780285100996, + "grad_norm": 0.7653201222419739, + "learning_rate": 7.591894757906378e-06, + "loss": 0.7999, + "step": 11933 + }, + { + "epoch": 0.6568330673124553, + "grad_norm": 0.6921548247337341, + "learning_rate": 7.591524068748288e-06, + "loss": 0.7177, + "step": 11934 + }, + { + "epoch": 0.656888106114811, + "grad_norm": 0.7085320353507996, + "learning_rate": 7.591153360112995e-06, + "loss": 0.8395, + "step": 11935 + }, + { + "epoch": 0.6569431449171667, + "grad_norm": 0.6565294861793518, + "learning_rate": 7.590782632003287e-06, + "loss": 0.6969, + "step": 11936 + }, + { + "epoch": 0.6569981837195222, + "grad_norm": 0.7023206353187561, + "learning_rate": 7.590411884421952e-06, + "loss": 0.7321, + "step": 11937 + }, + { + "epoch": 0.6570532225218779, + "grad_norm": 0.7848044633865356, + "learning_rate": 7.590041117371774e-06, + "loss": 0.8857, + "step": 11938 + }, + { + "epoch": 0.6571082613242336, + "grad_norm": 1.004591703414917, + "learning_rate": 7.589670330855541e-06, + "loss": 0.8267, + "step": 11939 + }, + { + "epoch": 0.6571633001265893, + "grad_norm": 0.7525139451026917, + "learning_rate": 7.589299524876036e-06, + "loss": 0.6857, + "step": 11940 + }, + { + "epoch": 0.6572183389289449, + "grad_norm": 0.746224582195282, + "learning_rate": 7.588928699436051e-06, + "loss": 0.805, + "step": 11941 + }, + { + "epoch": 0.6572733777313006, + "grad_norm": 0.6304495930671692, + "learning_rate": 7.588557854538371e-06, + "loss": 0.652, + "step": 11942 + }, + { + "epoch": 0.6573284165336563, + "grad_norm": 0.761688768863678, + "learning_rate": 7.588186990185783e-06, + "loss": 0.7954, + "step": 11943 + }, + { + "epoch": 0.6573834553360118, + "grad_norm": 0.7735103368759155, + "learning_rate": 7.587816106381073e-06, + "loss": 0.7584, + "step": 11944 + }, + { + "epoch": 0.6574384941383675, + "grad_norm": 0.7351566553115845, + "learning_rate": 7.5874452031270305e-06, + "loss": 0.7984, + "step": 11945 + }, + { + "epoch": 0.6574935329407232, + "grad_norm": 0.7054993510246277, + "learning_rate": 7.587074280426443e-06, + "loss": 0.7057, + "step": 11946 + }, + { + "epoch": 0.6575485717430789, + "grad_norm": 0.7444368004798889, + "learning_rate": 7.586703338282099e-06, + "loss": 0.7476, + "step": 11947 + }, + { + "epoch": 0.6576036105454345, + "grad_norm": 0.6944568157196045, + "learning_rate": 7.586332376696782e-06, + "loss": 0.6874, + "step": 11948 + }, + { + "epoch": 0.6576586493477902, + "grad_norm": 0.6595578193664551, + "learning_rate": 7.585961395673287e-06, + "loss": 0.7541, + "step": 11949 + }, + { + "epoch": 0.6577136881501459, + "grad_norm": 0.6669502258300781, + "learning_rate": 7.585590395214396e-06, + "loss": 0.7515, + "step": 11950 + }, + { + "epoch": 0.6577687269525015, + "grad_norm": 0.7254583835601807, + "learning_rate": 7.585219375322901e-06, + "loss": 0.8089, + "step": 11951 + }, + { + "epoch": 0.6578237657548571, + "grad_norm": 1.0479141473770142, + "learning_rate": 7.584848336001587e-06, + "loss": 0.8108, + "step": 11952 + }, + { + "epoch": 0.6578788045572128, + "grad_norm": 0.6928718686103821, + "learning_rate": 7.584477277253246e-06, + "loss": 0.6325, + "step": 11953 + }, + { + "epoch": 0.6579338433595685, + "grad_norm": 0.8926869630813599, + "learning_rate": 7.584106199080666e-06, + "loss": 0.7294, + "step": 11954 + }, + { + "epoch": 0.6579888821619242, + "grad_norm": 0.7209964394569397, + "learning_rate": 7.583735101486635e-06, + "loss": 0.7646, + "step": 11955 + }, + { + "epoch": 0.6580439209642798, + "grad_norm": 0.7619316577911377, + "learning_rate": 7.583363984473941e-06, + "loss": 0.7756, + "step": 11956 + }, + { + "epoch": 0.6580989597666355, + "grad_norm": 0.6974903345108032, + "learning_rate": 7.582992848045378e-06, + "loss": 0.6497, + "step": 11957 + }, + { + "epoch": 0.6581539985689911, + "grad_norm": 0.8338617086410522, + "learning_rate": 7.582621692203731e-06, + "loss": 0.6619, + "step": 11958 + }, + { + "epoch": 0.6582090373713468, + "grad_norm": 0.9330396056175232, + "learning_rate": 7.5822505169517905e-06, + "loss": 0.8219, + "step": 11959 + }, + { + "epoch": 0.6582640761737024, + "grad_norm": 0.7725355625152588, + "learning_rate": 7.5818793222923445e-06, + "loss": 0.7262, + "step": 11960 + }, + { + "epoch": 0.6583191149760581, + "grad_norm": 0.7049654722213745, + "learning_rate": 7.5815081082281885e-06, + "loss": 0.7917, + "step": 11961 + }, + { + "epoch": 0.6583741537784138, + "grad_norm": 0.6801711916923523, + "learning_rate": 7.581136874762105e-06, + "loss": 0.6984, + "step": 11962 + }, + { + "epoch": 0.6584291925807695, + "grad_norm": 0.7774253487586975, + "learning_rate": 7.58076562189689e-06, + "loss": 0.7615, + "step": 11963 + }, + { + "epoch": 0.6584842313831251, + "grad_norm": 0.7436443567276001, + "learning_rate": 7.58039434963533e-06, + "loss": 0.7419, + "step": 11964 + }, + { + "epoch": 0.6585392701854808, + "grad_norm": 0.6857719421386719, + "learning_rate": 7.580023057980217e-06, + "loss": 0.8009, + "step": 11965 + }, + { + "epoch": 0.6585943089878364, + "grad_norm": 0.7194758653640747, + "learning_rate": 7.579651746934342e-06, + "loss": 0.7338, + "step": 11966 + }, + { + "epoch": 0.6586493477901921, + "grad_norm": 0.7248701453208923, + "learning_rate": 7.579280416500495e-06, + "loss": 0.6972, + "step": 11967 + }, + { + "epoch": 0.6587043865925477, + "grad_norm": 0.6719415783882141, + "learning_rate": 7.578909066681466e-06, + "loss": 0.7552, + "step": 11968 + }, + { + "epoch": 0.6587594253949034, + "grad_norm": 0.728338897228241, + "learning_rate": 7.578537697480046e-06, + "loss": 0.8386, + "step": 11969 + }, + { + "epoch": 0.6588144641972591, + "grad_norm": 0.7151786684989929, + "learning_rate": 7.578166308899029e-06, + "loss": 0.7186, + "step": 11970 + }, + { + "epoch": 0.6588695029996148, + "grad_norm": 0.664412260055542, + "learning_rate": 7.577794900941205e-06, + "loss": 0.6672, + "step": 11971 + }, + { + "epoch": 0.6589245418019704, + "grad_norm": 0.6915827989578247, + "learning_rate": 7.577423473609361e-06, + "loss": 0.7427, + "step": 11972 + }, + { + "epoch": 0.658979580604326, + "grad_norm": 0.705243706703186, + "learning_rate": 7.577052026906295e-06, + "loss": 0.7526, + "step": 11973 + }, + { + "epoch": 0.6590346194066817, + "grad_norm": 0.6559640169143677, + "learning_rate": 7.576680560834795e-06, + "loss": 0.8187, + "step": 11974 + }, + { + "epoch": 0.6590896582090374, + "grad_norm": 0.7359572649002075, + "learning_rate": 7.576309075397653e-06, + "loss": 0.8127, + "step": 11975 + }, + { + "epoch": 0.659144697011393, + "grad_norm": 0.6581039428710938, + "learning_rate": 7.575937570597661e-06, + "loss": 0.7066, + "step": 11976 + }, + { + "epoch": 0.6591997358137487, + "grad_norm": 0.8360844254493713, + "learning_rate": 7.5755660464376134e-06, + "loss": 0.7998, + "step": 11977 + }, + { + "epoch": 0.6592547746161044, + "grad_norm": 0.7201453447341919, + "learning_rate": 7.5751945029203e-06, + "loss": 0.7884, + "step": 11978 + }, + { + "epoch": 0.6593098134184601, + "grad_norm": 0.6985270977020264, + "learning_rate": 7.574822940048514e-06, + "loss": 0.7268, + "step": 11979 + }, + { + "epoch": 0.6593648522208156, + "grad_norm": 0.6405925154685974, + "learning_rate": 7.574451357825048e-06, + "loss": 0.6848, + "step": 11980 + }, + { + "epoch": 0.6594198910231713, + "grad_norm": 0.6656618714332581, + "learning_rate": 7.574079756252694e-06, + "loss": 0.7755, + "step": 11981 + }, + { + "epoch": 0.659474929825527, + "grad_norm": 0.8461045622825623, + "learning_rate": 7.573708135334248e-06, + "loss": 0.7171, + "step": 11982 + }, + { + "epoch": 0.6595299686278827, + "grad_norm": 0.5527384877204895, + "learning_rate": 7.573336495072498e-06, + "loss": 0.6668, + "step": 11983 + }, + { + "epoch": 0.6595850074302383, + "grad_norm": 0.6703749299049377, + "learning_rate": 7.572964835470241e-06, + "loss": 0.7128, + "step": 11984 + }, + { + "epoch": 0.659640046232594, + "grad_norm": 0.6824783682823181, + "learning_rate": 7.57259315653027e-06, + "loss": 0.8007, + "step": 11985 + }, + { + "epoch": 0.6596950850349497, + "grad_norm": 0.7369599938392639, + "learning_rate": 7.572221458255377e-06, + "loss": 0.7507, + "step": 11986 + }, + { + "epoch": 0.6597501238373052, + "grad_norm": 0.6976807713508606, + "learning_rate": 7.571849740648356e-06, + "loss": 0.7787, + "step": 11987 + }, + { + "epoch": 0.6598051626396609, + "grad_norm": 0.6735848784446716, + "learning_rate": 7.571478003711998e-06, + "loss": 0.6791, + "step": 11988 + }, + { + "epoch": 0.6598602014420166, + "grad_norm": 0.7245956659317017, + "learning_rate": 7.5711062474491025e-06, + "loss": 0.7999, + "step": 11989 + }, + { + "epoch": 0.6599152402443723, + "grad_norm": 0.760748565196991, + "learning_rate": 7.5707344718624595e-06, + "loss": 0.7904, + "step": 11990 + }, + { + "epoch": 0.6599702790467279, + "grad_norm": 0.6745715141296387, + "learning_rate": 7.5703626769548654e-06, + "loss": 0.6938, + "step": 11991 + }, + { + "epoch": 0.6600253178490836, + "grad_norm": 0.7301452159881592, + "learning_rate": 7.569990862729113e-06, + "loss": 0.7546, + "step": 11992 + }, + { + "epoch": 0.6600803566514393, + "grad_norm": 0.68801349401474, + "learning_rate": 7.569619029187998e-06, + "loss": 0.7592, + "step": 11993 + }, + { + "epoch": 0.660135395453795, + "grad_norm": 0.6839548349380493, + "learning_rate": 7.569247176334313e-06, + "loss": 0.7139, + "step": 11994 + }, + { + "epoch": 0.6601904342561505, + "grad_norm": 0.7490861415863037, + "learning_rate": 7.568875304170854e-06, + "loss": 0.7939, + "step": 11995 + }, + { + "epoch": 0.6602454730585062, + "grad_norm": 0.7098836302757263, + "learning_rate": 7.568503412700416e-06, + "loss": 0.7824, + "step": 11996 + }, + { + "epoch": 0.6603005118608619, + "grad_norm": 0.7427988052368164, + "learning_rate": 7.568131501925795e-06, + "loss": 0.7492, + "step": 11997 + }, + { + "epoch": 0.6603555506632176, + "grad_norm": 0.6715356111526489, + "learning_rate": 7.567759571849784e-06, + "loss": 0.6444, + "step": 11998 + }, + { + "epoch": 0.6604105894655732, + "grad_norm": 0.6697829961776733, + "learning_rate": 7.5673876224751795e-06, + "loss": 0.7064, + "step": 11999 + }, + { + "epoch": 0.6604656282679289, + "grad_norm": 0.6778494119644165, + "learning_rate": 7.567015653804777e-06, + "loss": 0.7517, + "step": 12000 + }, + { + "epoch": 0.6605206670702846, + "grad_norm": 0.6423540711402893, + "learning_rate": 7.566643665841371e-06, + "loss": 0.6321, + "step": 12001 + }, + { + "epoch": 0.6605757058726403, + "grad_norm": 0.6874244213104248, + "learning_rate": 7.566271658587761e-06, + "loss": 0.762, + "step": 12002 + }, + { + "epoch": 0.6606307446749958, + "grad_norm": 0.6805301308631897, + "learning_rate": 7.565899632046737e-06, + "loss": 0.765, + "step": 12003 + }, + { + "epoch": 0.6606857834773515, + "grad_norm": 0.7039558291435242, + "learning_rate": 7.5655275862211e-06, + "loss": 0.728, + "step": 12004 + }, + { + "epoch": 0.6607408222797072, + "grad_norm": 0.6513119339942932, + "learning_rate": 7.565155521113643e-06, + "loss": 0.7711, + "step": 12005 + }, + { + "epoch": 0.6607958610820629, + "grad_norm": 0.6483618021011353, + "learning_rate": 7.5647834367271655e-06, + "loss": 0.7015, + "step": 12006 + }, + { + "epoch": 0.6608508998844185, + "grad_norm": 0.7180553674697876, + "learning_rate": 7.564411333064461e-06, + "loss": 0.812, + "step": 12007 + }, + { + "epoch": 0.6609059386867742, + "grad_norm": 0.9036096334457397, + "learning_rate": 7.5640392101283285e-06, + "loss": 0.7858, + "step": 12008 + }, + { + "epoch": 0.6609609774891299, + "grad_norm": 0.7380802035331726, + "learning_rate": 7.563667067921563e-06, + "loss": 0.6615, + "step": 12009 + }, + { + "epoch": 0.6610160162914855, + "grad_norm": 0.6830628514289856, + "learning_rate": 7.5632949064469615e-06, + "loss": 0.7465, + "step": 12010 + }, + { + "epoch": 0.6610710550938411, + "grad_norm": 0.7562816143035889, + "learning_rate": 7.562922725707323e-06, + "loss": 0.8559, + "step": 12011 + }, + { + "epoch": 0.6611260938961968, + "grad_norm": 0.7376649379730225, + "learning_rate": 7.562550525705442e-06, + "loss": 0.7769, + "step": 12012 + }, + { + "epoch": 0.6611811326985525, + "grad_norm": 0.715466320514679, + "learning_rate": 7.562178306444116e-06, + "loss": 0.8233, + "step": 12013 + }, + { + "epoch": 0.6612361715009082, + "grad_norm": 0.6714800596237183, + "learning_rate": 7.561806067926147e-06, + "loss": 0.6025, + "step": 12014 + }, + { + "epoch": 0.6612912103032638, + "grad_norm": 0.7083391547203064, + "learning_rate": 7.561433810154328e-06, + "loss": 0.7063, + "step": 12015 + }, + { + "epoch": 0.6613462491056195, + "grad_norm": 0.8062768578529358, + "learning_rate": 7.561061533131457e-06, + "loss": 0.7992, + "step": 12016 + }, + { + "epoch": 0.6614012879079751, + "grad_norm": 0.741889476776123, + "learning_rate": 7.560689236860334e-06, + "loss": 0.8149, + "step": 12017 + }, + { + "epoch": 0.6614563267103308, + "grad_norm": 0.6834374666213989, + "learning_rate": 7.560316921343756e-06, + "loss": 0.782, + "step": 12018 + }, + { + "epoch": 0.6615113655126864, + "grad_norm": 0.7469872236251831, + "learning_rate": 7.559944586584522e-06, + "loss": 0.759, + "step": 12019 + }, + { + "epoch": 0.6615664043150421, + "grad_norm": 0.8300836086273193, + "learning_rate": 7.559572232585428e-06, + "loss": 0.8637, + "step": 12020 + }, + { + "epoch": 0.6616214431173978, + "grad_norm": 0.6241582632064819, + "learning_rate": 7.559199859349276e-06, + "loss": 0.7134, + "step": 12021 + }, + { + "epoch": 0.6616764819197535, + "grad_norm": 0.6696488261222839, + "learning_rate": 7.5588274668788634e-06, + "loss": 0.7457, + "step": 12022 + }, + { + "epoch": 0.6617315207221091, + "grad_norm": 0.7090815305709839, + "learning_rate": 7.558455055176987e-06, + "loss": 0.7449, + "step": 12023 + }, + { + "epoch": 0.6617865595244647, + "grad_norm": 0.6925215125083923, + "learning_rate": 7.558082624246448e-06, + "loss": 0.758, + "step": 12024 + }, + { + "epoch": 0.6618415983268204, + "grad_norm": 0.6658454537391663, + "learning_rate": 7.5577101740900425e-06, + "loss": 0.6918, + "step": 12025 + }, + { + "epoch": 0.6618966371291761, + "grad_norm": 0.6646405458450317, + "learning_rate": 7.557337704710574e-06, + "loss": 0.7293, + "step": 12026 + }, + { + "epoch": 0.6619516759315317, + "grad_norm": 0.6630399227142334, + "learning_rate": 7.556965216110841e-06, + "loss": 0.7572, + "step": 12027 + }, + { + "epoch": 0.6620067147338874, + "grad_norm": 0.7333918809890747, + "learning_rate": 7.556592708293641e-06, + "loss": 0.8012, + "step": 12028 + }, + { + "epoch": 0.6620617535362431, + "grad_norm": 0.7399254441261292, + "learning_rate": 7.556220181261773e-06, + "loss": 0.8406, + "step": 12029 + }, + { + "epoch": 0.6621167923385987, + "grad_norm": 0.6244909167289734, + "learning_rate": 7.55584763501804e-06, + "loss": 0.7427, + "step": 12030 + }, + { + "epoch": 0.6621718311409543, + "grad_norm": 0.6991485953330994, + "learning_rate": 7.55547506956524e-06, + "loss": 0.7583, + "step": 12031 + }, + { + "epoch": 0.66222686994331, + "grad_norm": 0.7115411162376404, + "learning_rate": 7.555102484906174e-06, + "loss": 0.7951, + "step": 12032 + }, + { + "epoch": 0.6622819087456657, + "grad_norm": 0.7684284448623657, + "learning_rate": 7.554729881043641e-06, + "loss": 0.717, + "step": 12033 + }, + { + "epoch": 0.6623369475480213, + "grad_norm": 0.7705931067466736, + "learning_rate": 7.554357257980443e-06, + "loss": 0.6903, + "step": 12034 + }, + { + "epoch": 0.662391986350377, + "grad_norm": 0.9283333420753479, + "learning_rate": 7.553984615719379e-06, + "loss": 0.7845, + "step": 12035 + }, + { + "epoch": 0.6624470251527327, + "grad_norm": 0.6867572665214539, + "learning_rate": 7.553611954263249e-06, + "loss": 0.8796, + "step": 12036 + }, + { + "epoch": 0.6625020639550884, + "grad_norm": 0.6129451990127563, + "learning_rate": 7.553239273614855e-06, + "loss": 0.6308, + "step": 12037 + }, + { + "epoch": 0.662557102757444, + "grad_norm": 0.749679446220398, + "learning_rate": 7.552866573777e-06, + "loss": 0.8308, + "step": 12038 + }, + { + "epoch": 0.6626121415597996, + "grad_norm": 0.7651422619819641, + "learning_rate": 7.552493854752483e-06, + "loss": 0.7266, + "step": 12039 + }, + { + "epoch": 0.6626671803621553, + "grad_norm": 0.9293195009231567, + "learning_rate": 7.552121116544104e-06, + "loss": 0.7795, + "step": 12040 + }, + { + "epoch": 0.662722219164511, + "grad_norm": 0.7321802973747253, + "learning_rate": 7.5517483591546655e-06, + "loss": 0.7294, + "step": 12041 + }, + { + "epoch": 0.6627772579668666, + "grad_norm": 0.702414333820343, + "learning_rate": 7.551375582586971e-06, + "loss": 0.7954, + "step": 12042 + }, + { + "epoch": 0.6628322967692223, + "grad_norm": 0.7497946619987488, + "learning_rate": 7.551002786843819e-06, + "loss": 0.7654, + "step": 12043 + }, + { + "epoch": 0.662887335571578, + "grad_norm": 0.6125331521034241, + "learning_rate": 7.550629971928017e-06, + "loss": 0.7299, + "step": 12044 + }, + { + "epoch": 0.6629423743739337, + "grad_norm": 0.7252177596092224, + "learning_rate": 7.550257137842358e-06, + "loss": 0.7553, + "step": 12045 + }, + { + "epoch": 0.6629974131762892, + "grad_norm": 0.6463978886604309, + "learning_rate": 7.5498842845896515e-06, + "loss": 0.7114, + "step": 12046 + }, + { + "epoch": 0.6630524519786449, + "grad_norm": 0.7392497062683105, + "learning_rate": 7.549511412172696e-06, + "loss": 0.6801, + "step": 12047 + }, + { + "epoch": 0.6631074907810006, + "grad_norm": 0.8068972229957581, + "learning_rate": 7.549138520594297e-06, + "loss": 0.8207, + "step": 12048 + }, + { + "epoch": 0.6631625295833563, + "grad_norm": 0.7632858753204346, + "learning_rate": 7.548765609857254e-06, + "loss": 0.7095, + "step": 12049 + }, + { + "epoch": 0.6632175683857119, + "grad_norm": 0.7252069115638733, + "learning_rate": 7.5483926799643705e-06, + "loss": 0.7796, + "step": 12050 + }, + { + "epoch": 0.6632726071880676, + "grad_norm": 1.048311471939087, + "learning_rate": 7.54801973091845e-06, + "loss": 0.7306, + "step": 12051 + }, + { + "epoch": 0.6633276459904233, + "grad_norm": 0.7432072758674622, + "learning_rate": 7.547646762722296e-06, + "loss": 0.8209, + "step": 12052 + }, + { + "epoch": 0.663382684792779, + "grad_norm": 0.7191399335861206, + "learning_rate": 7.547273775378709e-06, + "loss": 0.7011, + "step": 12053 + }, + { + "epoch": 0.6634377235951345, + "grad_norm": 0.5776329636573792, + "learning_rate": 7.5469007688904975e-06, + "loss": 0.6055, + "step": 12054 + }, + { + "epoch": 0.6634927623974902, + "grad_norm": 0.9296837449073792, + "learning_rate": 7.546527743260459e-06, + "loss": 0.7413, + "step": 12055 + }, + { + "epoch": 0.6635478011998459, + "grad_norm": 0.7279512286186218, + "learning_rate": 7.5461546984914e-06, + "loss": 0.7734, + "step": 12056 + }, + { + "epoch": 0.6636028400022016, + "grad_norm": 0.7297198176383972, + "learning_rate": 7.545781634586125e-06, + "loss": 0.7535, + "step": 12057 + }, + { + "epoch": 0.6636578788045572, + "grad_norm": 0.7094287872314453, + "learning_rate": 7.545408551547435e-06, + "loss": 0.7587, + "step": 12058 + }, + { + "epoch": 0.6637129176069129, + "grad_norm": 0.7559607028961182, + "learning_rate": 7.5450354493781374e-06, + "loss": 0.7358, + "step": 12059 + }, + { + "epoch": 0.6637679564092686, + "grad_norm": 0.8472892045974731, + "learning_rate": 7.544662328081034e-06, + "loss": 0.7537, + "step": 12060 + }, + { + "epoch": 0.6638229952116242, + "grad_norm": 0.6346176862716675, + "learning_rate": 7.544289187658929e-06, + "loss": 0.7658, + "step": 12061 + }, + { + "epoch": 0.6638780340139798, + "grad_norm": 0.7949367165565491, + "learning_rate": 7.543916028114628e-06, + "loss": 0.6837, + "step": 12062 + }, + { + "epoch": 0.6639330728163355, + "grad_norm": 0.7177689671516418, + "learning_rate": 7.5435428494509355e-06, + "loss": 0.7218, + "step": 12063 + }, + { + "epoch": 0.6639881116186912, + "grad_norm": 0.90680330991745, + "learning_rate": 7.5431696516706555e-06, + "loss": 0.8274, + "step": 12064 + }, + { + "epoch": 0.6640431504210469, + "grad_norm": 0.7799603939056396, + "learning_rate": 7.5427964347765916e-06, + "loss": 0.7528, + "step": 12065 + }, + { + "epoch": 0.6640981892234025, + "grad_norm": 0.7668048739433289, + "learning_rate": 7.542423198771553e-06, + "loss": 0.746, + "step": 12066 + }, + { + "epoch": 0.6641532280257582, + "grad_norm": 1.0042381286621094, + "learning_rate": 7.542049943658341e-06, + "loss": 0.7836, + "step": 12067 + }, + { + "epoch": 0.6642082668281138, + "grad_norm": 0.6915723085403442, + "learning_rate": 7.541676669439761e-06, + "loss": 0.8042, + "step": 12068 + }, + { + "epoch": 0.6642633056304695, + "grad_norm": 0.7268955707550049, + "learning_rate": 7.5413033761186215e-06, + "loss": 0.689, + "step": 12069 + }, + { + "epoch": 0.6643183444328251, + "grad_norm": 0.6418740749359131, + "learning_rate": 7.540930063697726e-06, + "loss": 0.6302, + "step": 12070 + }, + { + "epoch": 0.6643733832351808, + "grad_norm": 0.696384847164154, + "learning_rate": 7.540556732179879e-06, + "loss": 0.7978, + "step": 12071 + }, + { + "epoch": 0.6644284220375365, + "grad_norm": 0.7400668859481812, + "learning_rate": 7.540183381567889e-06, + "loss": 0.8768, + "step": 12072 + }, + { + "epoch": 0.6644834608398921, + "grad_norm": 0.6653871536254883, + "learning_rate": 7.539810011864559e-06, + "loss": 0.8107, + "step": 12073 + }, + { + "epoch": 0.6645384996422478, + "grad_norm": 0.7635810971260071, + "learning_rate": 7.539436623072698e-06, + "loss": 0.8476, + "step": 12074 + }, + { + "epoch": 0.6645935384446034, + "grad_norm": 0.6583054661750793, + "learning_rate": 7.53906321519511e-06, + "loss": 0.7093, + "step": 12075 + }, + { + "epoch": 0.6646485772469591, + "grad_norm": 0.8294859528541565, + "learning_rate": 7.538689788234604e-06, + "loss": 0.8107, + "step": 12076 + }, + { + "epoch": 0.6647036160493147, + "grad_norm": 0.6711081862449646, + "learning_rate": 7.538316342193983e-06, + "loss": 0.7491, + "step": 12077 + }, + { + "epoch": 0.6647586548516704, + "grad_norm": 0.7375408411026001, + "learning_rate": 7.5379428770760575e-06, + "loss": 0.7853, + "step": 12078 + }, + { + "epoch": 0.6648136936540261, + "grad_norm": 0.7322511672973633, + "learning_rate": 7.537569392883633e-06, + "loss": 0.7568, + "step": 12079 + }, + { + "epoch": 0.6648687324563818, + "grad_norm": 0.6390300393104553, + "learning_rate": 7.537195889619515e-06, + "loss": 0.7191, + "step": 12080 + }, + { + "epoch": 0.6649237712587374, + "grad_norm": 0.8155800104141235, + "learning_rate": 7.536822367286514e-06, + "loss": 0.7499, + "step": 12081 + }, + { + "epoch": 0.664978810061093, + "grad_norm": 0.7942230701446533, + "learning_rate": 7.536448825887432e-06, + "loss": 0.7797, + "step": 12082 + }, + { + "epoch": 0.6650338488634487, + "grad_norm": 0.7103378176689148, + "learning_rate": 7.536075265425083e-06, + "loss": 0.6814, + "step": 12083 + }, + { + "epoch": 0.6650888876658044, + "grad_norm": 0.8164991736412048, + "learning_rate": 7.535701685902268e-06, + "loss": 0.7917, + "step": 12084 + }, + { + "epoch": 0.66514392646816, + "grad_norm": 0.6970370411872864, + "learning_rate": 7.535328087321799e-06, + "loss": 0.7266, + "step": 12085 + }, + { + "epoch": 0.6651989652705157, + "grad_norm": 0.6468706130981445, + "learning_rate": 7.534954469686484e-06, + "loss": 0.7229, + "step": 12086 + }, + { + "epoch": 0.6652540040728714, + "grad_norm": 0.6551242470741272, + "learning_rate": 7.534580832999128e-06, + "loss": 0.6759, + "step": 12087 + }, + { + "epoch": 0.6653090428752271, + "grad_norm": 0.670215368270874, + "learning_rate": 7.534207177262543e-06, + "loss": 0.761, + "step": 12088 + }, + { + "epoch": 0.6653640816775827, + "grad_norm": 0.7365970015525818, + "learning_rate": 7.533833502479533e-06, + "loss": 0.7628, + "step": 12089 + }, + { + "epoch": 0.6654191204799383, + "grad_norm": 0.7419471740722656, + "learning_rate": 7.53345980865291e-06, + "loss": 0.8093, + "step": 12090 + }, + { + "epoch": 0.665474159282294, + "grad_norm": 0.6573269963264465, + "learning_rate": 7.53308609578548e-06, + "loss": 0.6806, + "step": 12091 + }, + { + "epoch": 0.6655291980846497, + "grad_norm": 0.9270638227462769, + "learning_rate": 7.5327123638800545e-06, + "loss": 0.8612, + "step": 12092 + }, + { + "epoch": 0.6655842368870053, + "grad_norm": 0.85124671459198, + "learning_rate": 7.532338612939441e-06, + "loss": 0.6776, + "step": 12093 + }, + { + "epoch": 0.665639275689361, + "grad_norm": 0.7791070342063904, + "learning_rate": 7.531964842966446e-06, + "loss": 0.7571, + "step": 12094 + }, + { + "epoch": 0.6656943144917167, + "grad_norm": 0.6604436635971069, + "learning_rate": 7.5315910539638825e-06, + "loss": 0.781, + "step": 12095 + }, + { + "epoch": 0.6657493532940724, + "grad_norm": 0.7567091584205627, + "learning_rate": 7.531217245934559e-06, + "loss": 0.8005, + "step": 12096 + }, + { + "epoch": 0.6658043920964279, + "grad_norm": 0.660637378692627, + "learning_rate": 7.530843418881282e-06, + "loss": 0.7351, + "step": 12097 + }, + { + "epoch": 0.6658594308987836, + "grad_norm": 0.6305738687515259, + "learning_rate": 7.530469572806865e-06, + "loss": 0.7452, + "step": 12098 + }, + { + "epoch": 0.6659144697011393, + "grad_norm": 0.8291265368461609, + "learning_rate": 7.5300957077141164e-06, + "loss": 0.7799, + "step": 12099 + }, + { + "epoch": 0.665969508503495, + "grad_norm": 0.7459661364555359, + "learning_rate": 7.5297218236058456e-06, + "loss": 0.8273, + "step": 12100 + }, + { + "epoch": 0.6660245473058506, + "grad_norm": 0.7570028901100159, + "learning_rate": 7.529347920484862e-06, + "loss": 0.7622, + "step": 12101 + }, + { + "epoch": 0.6660795861082063, + "grad_norm": 0.733403205871582, + "learning_rate": 7.528973998353977e-06, + "loss": 0.8357, + "step": 12102 + }, + { + "epoch": 0.666134624910562, + "grad_norm": 0.8814442753791809, + "learning_rate": 7.528600057216e-06, + "loss": 0.727, + "step": 12103 + }, + { + "epoch": 0.6661896637129177, + "grad_norm": 0.629338800907135, + "learning_rate": 7.528226097073742e-06, + "loss": 0.6758, + "step": 12104 + }, + { + "epoch": 0.6662447025152732, + "grad_norm": 0.7786098122596741, + "learning_rate": 7.527852117930014e-06, + "loss": 0.7476, + "step": 12105 + }, + { + "epoch": 0.6662997413176289, + "grad_norm": 0.6604528427124023, + "learning_rate": 7.527478119787626e-06, + "loss": 0.7275, + "step": 12106 + }, + { + "epoch": 0.6663547801199846, + "grad_norm": 0.6937400698661804, + "learning_rate": 7.527104102649387e-06, + "loss": 0.7187, + "step": 12107 + }, + { + "epoch": 0.6664098189223403, + "grad_norm": 0.6863219738006592, + "learning_rate": 7.526730066518113e-06, + "loss": 0.7512, + "step": 12108 + }, + { + "epoch": 0.6664648577246959, + "grad_norm": 0.7771461606025696, + "learning_rate": 7.526356011396609e-06, + "loss": 0.8439, + "step": 12109 + }, + { + "epoch": 0.6665198965270516, + "grad_norm": 0.7223722338676453, + "learning_rate": 7.525981937287692e-06, + "loss": 0.6488, + "step": 12110 + }, + { + "epoch": 0.6665749353294073, + "grad_norm": 0.8091556429862976, + "learning_rate": 7.52560784419417e-06, + "loss": 0.6618, + "step": 12111 + }, + { + "epoch": 0.666629974131763, + "grad_norm": 0.6435044407844543, + "learning_rate": 7.525233732118856e-06, + "loss": 0.6994, + "step": 12112 + }, + { + "epoch": 0.6666850129341185, + "grad_norm": 0.6933714151382446, + "learning_rate": 7.52485960106456e-06, + "loss": 0.6917, + "step": 12113 + }, + { + "epoch": 0.6667400517364742, + "grad_norm": 0.693192720413208, + "learning_rate": 7.524485451034097e-06, + "loss": 0.7941, + "step": 12114 + }, + { + "epoch": 0.6667950905388299, + "grad_norm": 1.1374844312667847, + "learning_rate": 7.524111282030275e-06, + "loss": 0.9112, + "step": 12115 + }, + { + "epoch": 0.6668501293411855, + "grad_norm": 0.6917465329170227, + "learning_rate": 7.523737094055911e-06, + "loss": 0.681, + "step": 12116 + }, + { + "epoch": 0.6669051681435412, + "grad_norm": 0.8057913184165955, + "learning_rate": 7.523362887113812e-06, + "loss": 0.8186, + "step": 12117 + }, + { + "epoch": 0.6669602069458969, + "grad_norm": 0.7194918394088745, + "learning_rate": 7.522988661206795e-06, + "loss": 0.7875, + "step": 12118 + }, + { + "epoch": 0.6670152457482525, + "grad_norm": 0.6829916834831238, + "learning_rate": 7.52261441633767e-06, + "loss": 0.6506, + "step": 12119 + }, + { + "epoch": 0.6670702845506081, + "grad_norm": 0.7869738936424255, + "learning_rate": 7.5222401525092495e-06, + "loss": 0.7091, + "step": 12120 + }, + { + "epoch": 0.6671253233529638, + "grad_norm": 0.6835895776748657, + "learning_rate": 7.5218658697243475e-06, + "loss": 0.7839, + "step": 12121 + }, + { + "epoch": 0.6671803621553195, + "grad_norm": 0.7462154030799866, + "learning_rate": 7.521491567985776e-06, + "loss": 0.7073, + "step": 12122 + }, + { + "epoch": 0.6672354009576752, + "grad_norm": 0.6413764953613281, + "learning_rate": 7.52111724729635e-06, + "loss": 0.6472, + "step": 12123 + }, + { + "epoch": 0.6672904397600308, + "grad_norm": 0.7085923552513123, + "learning_rate": 7.520742907658881e-06, + "loss": 0.8167, + "step": 12124 + }, + { + "epoch": 0.6673454785623865, + "grad_norm": 0.6490428447723389, + "learning_rate": 7.520368549076182e-06, + "loss": 0.7693, + "step": 12125 + }, + { + "epoch": 0.6674005173647422, + "grad_norm": 0.7082974910736084, + "learning_rate": 7.51999417155107e-06, + "loss": 0.6707, + "step": 12126 + }, + { + "epoch": 0.6674555561670978, + "grad_norm": 0.704335629940033, + "learning_rate": 7.519619775086355e-06, + "loss": 0.825, + "step": 12127 + }, + { + "epoch": 0.6675105949694534, + "grad_norm": 0.6815123558044434, + "learning_rate": 7.519245359684852e-06, + "loss": 0.762, + "step": 12128 + }, + { + "epoch": 0.6675656337718091, + "grad_norm": 0.6497910618782043, + "learning_rate": 7.518870925349376e-06, + "loss": 0.6934, + "step": 12129 + }, + { + "epoch": 0.6676206725741648, + "grad_norm": 0.6699943542480469, + "learning_rate": 7.51849647208274e-06, + "loss": 0.7816, + "step": 12130 + }, + { + "epoch": 0.6676757113765205, + "grad_norm": 0.7139337062835693, + "learning_rate": 7.51812199988776e-06, + "loss": 0.679, + "step": 12131 + }, + { + "epoch": 0.6677307501788761, + "grad_norm": 0.6762346029281616, + "learning_rate": 7.517747508767248e-06, + "loss": 0.7477, + "step": 12132 + }, + { + "epoch": 0.6677857889812318, + "grad_norm": 0.7429338693618774, + "learning_rate": 7.517372998724017e-06, + "loss": 0.7549, + "step": 12133 + }, + { + "epoch": 0.6678408277835874, + "grad_norm": 0.7392850518226624, + "learning_rate": 7.516998469760888e-06, + "loss": 0.8167, + "step": 12134 + }, + { + "epoch": 0.6678958665859431, + "grad_norm": 0.7511306405067444, + "learning_rate": 7.516623921880671e-06, + "loss": 0.7264, + "step": 12135 + }, + { + "epoch": 0.6679509053882987, + "grad_norm": 0.6757550835609436, + "learning_rate": 7.516249355086183e-06, + "loss": 0.7405, + "step": 12136 + }, + { + "epoch": 0.6680059441906544, + "grad_norm": 0.7433735132217407, + "learning_rate": 7.515874769380238e-06, + "loss": 0.7954, + "step": 12137 + }, + { + "epoch": 0.6680609829930101, + "grad_norm": 0.7390886545181274, + "learning_rate": 7.51550016476565e-06, + "loss": 0.7487, + "step": 12138 + }, + { + "epoch": 0.6681160217953658, + "grad_norm": 0.7405929565429688, + "learning_rate": 7.5151255412452385e-06, + "loss": 0.8127, + "step": 12139 + }, + { + "epoch": 0.6681710605977214, + "grad_norm": 0.6628968715667725, + "learning_rate": 7.514750898821817e-06, + "loss": 0.7009, + "step": 12140 + }, + { + "epoch": 0.668226099400077, + "grad_norm": 0.6777421832084656, + "learning_rate": 7.514376237498199e-06, + "loss": 0.6689, + "step": 12141 + }, + { + "epoch": 0.6682811382024327, + "grad_norm": 0.617261528968811, + "learning_rate": 7.514001557277202e-06, + "loss": 0.7597, + "step": 12142 + }, + { + "epoch": 0.6683361770047884, + "grad_norm": 0.6666202545166016, + "learning_rate": 7.5136268581616446e-06, + "loss": 0.6623, + "step": 12143 + }, + { + "epoch": 0.668391215807144, + "grad_norm": 0.7170178890228271, + "learning_rate": 7.513252140154339e-06, + "loss": 0.8224, + "step": 12144 + }, + { + "epoch": 0.6684462546094997, + "grad_norm": 0.6173199415206909, + "learning_rate": 7.512877403258103e-06, + "loss": 0.6784, + "step": 12145 + }, + { + "epoch": 0.6685012934118554, + "grad_norm": 0.6906641125679016, + "learning_rate": 7.512502647475753e-06, + "loss": 0.6649, + "step": 12146 + }, + { + "epoch": 0.6685563322142111, + "grad_norm": 0.6435873508453369, + "learning_rate": 7.5121278728101065e-06, + "loss": 0.751, + "step": 12147 + }, + { + "epoch": 0.6686113710165666, + "grad_norm": 0.8345947861671448, + "learning_rate": 7.511753079263978e-06, + "loss": 0.7841, + "step": 12148 + }, + { + "epoch": 0.6686664098189223, + "grad_norm": 0.6952378153800964, + "learning_rate": 7.511378266840187e-06, + "loss": 0.8187, + "step": 12149 + }, + { + "epoch": 0.668721448621278, + "grad_norm": 0.6878920793533325, + "learning_rate": 7.5110034355415484e-06, + "loss": 0.6726, + "step": 12150 + }, + { + "epoch": 0.6687764874236337, + "grad_norm": 0.7119094729423523, + "learning_rate": 7.5106285853708805e-06, + "loss": 0.7824, + "step": 12151 + }, + { + "epoch": 0.6688315262259893, + "grad_norm": 0.7261053323745728, + "learning_rate": 7.5102537163309994e-06, + "loss": 0.7122, + "step": 12152 + }, + { + "epoch": 0.668886565028345, + "grad_norm": 0.717268168926239, + "learning_rate": 7.509878828424725e-06, + "loss": 0.7144, + "step": 12153 + }, + { + "epoch": 0.6689416038307007, + "grad_norm": 0.8373270630836487, + "learning_rate": 7.5095039216548725e-06, + "loss": 0.7941, + "step": 12154 + }, + { + "epoch": 0.6689966426330564, + "grad_norm": 0.7113829851150513, + "learning_rate": 7.509128996024259e-06, + "loss": 0.705, + "step": 12155 + }, + { + "epoch": 0.6690516814354119, + "grad_norm": 0.7894094586372375, + "learning_rate": 7.508754051535705e-06, + "loss": 0.8284, + "step": 12156 + }, + { + "epoch": 0.6691067202377676, + "grad_norm": 0.6739659905433655, + "learning_rate": 7.508379088192028e-06, + "loss": 0.7264, + "step": 12157 + }, + { + "epoch": 0.6691617590401233, + "grad_norm": 0.735211193561554, + "learning_rate": 7.508004105996043e-06, + "loss": 0.8187, + "step": 12158 + }, + { + "epoch": 0.6692167978424789, + "grad_norm": 0.7438055872917175, + "learning_rate": 7.507629104950571e-06, + "loss": 0.8949, + "step": 12159 + }, + { + "epoch": 0.6692718366448346, + "grad_norm": 1.0734246969223022, + "learning_rate": 7.507254085058431e-06, + "loss": 0.7687, + "step": 12160 + }, + { + "epoch": 0.6693268754471903, + "grad_norm": 0.6719897985458374, + "learning_rate": 7.50687904632244e-06, + "loss": 0.7522, + "step": 12161 + }, + { + "epoch": 0.669381914249546, + "grad_norm": 0.7063966989517212, + "learning_rate": 7.506503988745416e-06, + "loss": 0.7794, + "step": 12162 + }, + { + "epoch": 0.6694369530519015, + "grad_norm": 0.6582265496253967, + "learning_rate": 7.506128912330179e-06, + "loss": 0.7012, + "step": 12163 + }, + { + "epoch": 0.6694919918542572, + "grad_norm": 0.7764506340026855, + "learning_rate": 7.50575381707955e-06, + "loss": 0.7816, + "step": 12164 + }, + { + "epoch": 0.6695470306566129, + "grad_norm": 0.7659780383110046, + "learning_rate": 7.505378702996344e-06, + "loss": 0.753, + "step": 12165 + }, + { + "epoch": 0.6696020694589686, + "grad_norm": 0.9013122916221619, + "learning_rate": 7.505003570083385e-06, + "loss": 0.8255, + "step": 12166 + }, + { + "epoch": 0.6696571082613242, + "grad_norm": 0.6417272686958313, + "learning_rate": 7.504628418343487e-06, + "loss": 0.6236, + "step": 12167 + }, + { + "epoch": 0.6697121470636799, + "grad_norm": 0.7511595487594604, + "learning_rate": 7.504253247779474e-06, + "loss": 0.7961, + "step": 12168 + }, + { + "epoch": 0.6697671858660356, + "grad_norm": 0.7987878918647766, + "learning_rate": 7.503878058394163e-06, + "loss": 0.7249, + "step": 12169 + }, + { + "epoch": 0.6698222246683913, + "grad_norm": 0.6860646605491638, + "learning_rate": 7.503502850190374e-06, + "loss": 0.7973, + "step": 12170 + }, + { + "epoch": 0.6698772634707468, + "grad_norm": 0.7334334850311279, + "learning_rate": 7.50312762317093e-06, + "loss": 0.8756, + "step": 12171 + }, + { + "epoch": 0.6699323022731025, + "grad_norm": 0.7792186737060547, + "learning_rate": 7.502752377338647e-06, + "loss": 0.8393, + "step": 12172 + }, + { + "epoch": 0.6699873410754582, + "grad_norm": 0.6532536149024963, + "learning_rate": 7.502377112696346e-06, + "loss": 0.6509, + "step": 12173 + }, + { + "epoch": 0.6700423798778139, + "grad_norm": 0.6595458984375, + "learning_rate": 7.50200182924685e-06, + "loss": 0.781, + "step": 12174 + }, + { + "epoch": 0.6700974186801695, + "grad_norm": 0.6668636202812195, + "learning_rate": 7.501626526992978e-06, + "loss": 0.7702, + "step": 12175 + }, + { + "epoch": 0.6701524574825252, + "grad_norm": 0.686851441860199, + "learning_rate": 7.501251205937551e-06, + "loss": 0.8648, + "step": 12176 + }, + { + "epoch": 0.6702074962848809, + "grad_norm": 0.7363078594207764, + "learning_rate": 7.500875866083388e-06, + "loss": 0.7309, + "step": 12177 + }, + { + "epoch": 0.6702625350872365, + "grad_norm": 0.6927379369735718, + "learning_rate": 7.500500507433312e-06, + "loss": 0.7258, + "step": 12178 + }, + { + "epoch": 0.6703175738895921, + "grad_norm": 0.6589936017990112, + "learning_rate": 7.5001251299901455e-06, + "loss": 0.6776, + "step": 12179 + }, + { + "epoch": 0.6703726126919478, + "grad_norm": 0.6402539610862732, + "learning_rate": 7.499749733756707e-06, + "loss": 0.7467, + "step": 12180 + }, + { + "epoch": 0.6704276514943035, + "grad_norm": 0.776469886302948, + "learning_rate": 7.499374318735817e-06, + "loss": 0.7856, + "step": 12181 + }, + { + "epoch": 0.6704826902966592, + "grad_norm": 0.7062460780143738, + "learning_rate": 7.4989988849303e-06, + "loss": 0.8286, + "step": 12182 + }, + { + "epoch": 0.6705377290990148, + "grad_norm": 0.6725799441337585, + "learning_rate": 7.4986234323429755e-06, + "loss": 0.7517, + "step": 12183 + }, + { + "epoch": 0.6705927679013705, + "grad_norm": 0.6444042921066284, + "learning_rate": 7.498247960976667e-06, + "loss": 0.5984, + "step": 12184 + }, + { + "epoch": 0.6706478067037261, + "grad_norm": 0.6968628764152527, + "learning_rate": 7.497872470834195e-06, + "loss": 0.6996, + "step": 12185 + }, + { + "epoch": 0.6707028455060818, + "grad_norm": 0.643500030040741, + "learning_rate": 7.497496961918381e-06, + "loss": 0.6252, + "step": 12186 + }, + { + "epoch": 0.6707578843084374, + "grad_norm": 0.7026870846748352, + "learning_rate": 7.49712143423205e-06, + "loss": 0.7883, + "step": 12187 + }, + { + "epoch": 0.6708129231107931, + "grad_norm": 0.8169240951538086, + "learning_rate": 7.496745887778022e-06, + "loss": 0.6717, + "step": 12188 + }, + { + "epoch": 0.6708679619131488, + "grad_norm": 0.6611927151679993, + "learning_rate": 7.496370322559121e-06, + "loss": 0.6674, + "step": 12189 + }, + { + "epoch": 0.6709230007155045, + "grad_norm": 0.7330195307731628, + "learning_rate": 7.495994738578169e-06, + "loss": 0.7809, + "step": 12190 + }, + { + "epoch": 0.6709780395178601, + "grad_norm": 0.6469636559486389, + "learning_rate": 7.495619135837988e-06, + "loss": 0.6511, + "step": 12191 + }, + { + "epoch": 0.6710330783202157, + "grad_norm": 0.6558564901351929, + "learning_rate": 7.495243514341402e-06, + "loss": 0.7284, + "step": 12192 + }, + { + "epoch": 0.6710881171225714, + "grad_norm": 0.6736281514167786, + "learning_rate": 7.494867874091233e-06, + "loss": 0.7007, + "step": 12193 + }, + { + "epoch": 0.6711431559249271, + "grad_norm": 0.7302053570747375, + "learning_rate": 7.494492215090304e-06, + "loss": 0.77, + "step": 12194 + }, + { + "epoch": 0.6711981947272827, + "grad_norm": 0.7368764877319336, + "learning_rate": 7.494116537341442e-06, + "loss": 0.8478, + "step": 12195 + }, + { + "epoch": 0.6712532335296384, + "grad_norm": 0.782767653465271, + "learning_rate": 7.493740840847466e-06, + "loss": 0.813, + "step": 12196 + }, + { + "epoch": 0.6713082723319941, + "grad_norm": 0.6787601113319397, + "learning_rate": 7.493365125611202e-06, + "loss": 0.7507, + "step": 12197 + }, + { + "epoch": 0.6713633111343498, + "grad_norm": 0.6912569999694824, + "learning_rate": 7.4929893916354715e-06, + "loss": 0.8003, + "step": 12198 + }, + { + "epoch": 0.6714183499367053, + "grad_norm": 0.7625328898429871, + "learning_rate": 7.4926136389231005e-06, + "loss": 0.8021, + "step": 12199 + }, + { + "epoch": 0.671473388739061, + "grad_norm": 0.6720984578132629, + "learning_rate": 7.4922378674769146e-06, + "loss": 0.7757, + "step": 12200 + }, + { + "epoch": 0.6715284275414167, + "grad_norm": 0.7816714644432068, + "learning_rate": 7.491862077299734e-06, + "loss": 0.7086, + "step": 12201 + }, + { + "epoch": 0.6715834663437723, + "grad_norm": 0.7546358108520508, + "learning_rate": 7.491486268394387e-06, + "loss": 0.8365, + "step": 12202 + }, + { + "epoch": 0.671638505146128, + "grad_norm": 0.7201979756355286, + "learning_rate": 7.491110440763695e-06, + "loss": 0.835, + "step": 12203 + }, + { + "epoch": 0.6716935439484837, + "grad_norm": 0.8177551031112671, + "learning_rate": 7.490734594410484e-06, + "loss": 0.8636, + "step": 12204 + }, + { + "epoch": 0.6717485827508394, + "grad_norm": 0.7433933019638062, + "learning_rate": 7.490358729337578e-06, + "loss": 0.745, + "step": 12205 + }, + { + "epoch": 0.671803621553195, + "grad_norm": 0.8013591170310974, + "learning_rate": 7.489982845547802e-06, + "loss": 0.7638, + "step": 12206 + }, + { + "epoch": 0.6718586603555506, + "grad_norm": 0.6561495065689087, + "learning_rate": 7.489606943043982e-06, + "loss": 0.7997, + "step": 12207 + }, + { + "epoch": 0.6719136991579063, + "grad_norm": 0.7291023135185242, + "learning_rate": 7.489231021828943e-06, + "loss": 0.7452, + "step": 12208 + }, + { + "epoch": 0.671968737960262, + "grad_norm": 0.6978216171264648, + "learning_rate": 7.488855081905511e-06, + "loss": 0.7984, + "step": 12209 + }, + { + "epoch": 0.6720237767626176, + "grad_norm": 0.701006293296814, + "learning_rate": 7.488479123276507e-06, + "loss": 0.7218, + "step": 12210 + }, + { + "epoch": 0.6720788155649733, + "grad_norm": 0.7275286912918091, + "learning_rate": 7.488103145944763e-06, + "loss": 0.6872, + "step": 12211 + }, + { + "epoch": 0.672133854367329, + "grad_norm": 0.7319645881652832, + "learning_rate": 7.487727149913101e-06, + "loss": 0.7862, + "step": 12212 + }, + { + "epoch": 0.6721888931696847, + "grad_norm": 0.7143612504005432, + "learning_rate": 7.487351135184348e-06, + "loss": 0.838, + "step": 12213 + }, + { + "epoch": 0.6722439319720402, + "grad_norm": 0.7135382294654846, + "learning_rate": 7.486975101761329e-06, + "loss": 0.7263, + "step": 12214 + }, + { + "epoch": 0.6722989707743959, + "grad_norm": 0.6283460259437561, + "learning_rate": 7.486599049646872e-06, + "loss": 0.7262, + "step": 12215 + }, + { + "epoch": 0.6723540095767516, + "grad_norm": 0.7196768522262573, + "learning_rate": 7.486222978843801e-06, + "loss": 0.6752, + "step": 12216 + }, + { + "epoch": 0.6724090483791073, + "grad_norm": 0.5856572389602661, + "learning_rate": 7.485846889354944e-06, + "loss": 0.6779, + "step": 12217 + }, + { + "epoch": 0.6724640871814629, + "grad_norm": 0.7671294808387756, + "learning_rate": 7.485470781183126e-06, + "loss": 0.766, + "step": 12218 + }, + { + "epoch": 0.6725191259838186, + "grad_norm": 0.6780520677566528, + "learning_rate": 7.485094654331177e-06, + "loss": 0.7474, + "step": 12219 + }, + { + "epoch": 0.6725741647861743, + "grad_norm": 0.7537981867790222, + "learning_rate": 7.484718508801921e-06, + "loss": 0.8347, + "step": 12220 + }, + { + "epoch": 0.67262920358853, + "grad_norm": 0.7451551556587219, + "learning_rate": 7.484342344598186e-06, + "loss": 0.8217, + "step": 12221 + }, + { + "epoch": 0.6726842423908855, + "grad_norm": 0.6656951904296875, + "learning_rate": 7.483966161722798e-06, + "loss": 0.7437, + "step": 12222 + }, + { + "epoch": 0.6727392811932412, + "grad_norm": 0.7306267619132996, + "learning_rate": 7.483589960178586e-06, + "loss": 0.8495, + "step": 12223 + }, + { + "epoch": 0.6727943199955969, + "grad_norm": 0.6619658470153809, + "learning_rate": 7.483213739968376e-06, + "loss": 0.6379, + "step": 12224 + }, + { + "epoch": 0.6728493587979526, + "grad_norm": 0.7066444754600525, + "learning_rate": 7.4828375010949974e-06, + "loss": 0.7307, + "step": 12225 + }, + { + "epoch": 0.6729043976003082, + "grad_norm": 0.7356079816818237, + "learning_rate": 7.482461243561276e-06, + "loss": 0.7781, + "step": 12226 + }, + { + "epoch": 0.6729594364026639, + "grad_norm": 0.6759988069534302, + "learning_rate": 7.48208496737004e-06, + "loss": 0.7808, + "step": 12227 + }, + { + "epoch": 0.6730144752050196, + "grad_norm": 0.7519234418869019, + "learning_rate": 7.481708672524119e-06, + "loss": 0.7948, + "step": 12228 + }, + { + "epoch": 0.6730695140073752, + "grad_norm": 0.6387592554092407, + "learning_rate": 7.48133235902634e-06, + "loss": 0.7423, + "step": 12229 + }, + { + "epoch": 0.6731245528097308, + "grad_norm": 1.0615060329437256, + "learning_rate": 7.480956026879529e-06, + "loss": 0.8668, + "step": 12230 + }, + { + "epoch": 0.6731795916120865, + "grad_norm": 0.7578469514846802, + "learning_rate": 7.480579676086519e-06, + "loss": 0.812, + "step": 12231 + }, + { + "epoch": 0.6732346304144422, + "grad_norm": 0.6669226884841919, + "learning_rate": 7.480203306650134e-06, + "loss": 0.7002, + "step": 12232 + }, + { + "epoch": 0.6732896692167979, + "grad_norm": 0.7110459208488464, + "learning_rate": 7.479826918573208e-06, + "loss": 0.8542, + "step": 12233 + }, + { + "epoch": 0.6733447080191535, + "grad_norm": 0.6632254123687744, + "learning_rate": 7.479450511858563e-06, + "loss": 0.6784, + "step": 12234 + }, + { + "epoch": 0.6733997468215092, + "grad_norm": 0.7368438839912415, + "learning_rate": 7.479074086509032e-06, + "loss": 0.7683, + "step": 12235 + }, + { + "epoch": 0.6734547856238648, + "grad_norm": 0.764905571937561, + "learning_rate": 7.478697642527447e-06, + "loss": 0.7585, + "step": 12236 + }, + { + "epoch": 0.6735098244262205, + "grad_norm": 0.7141197323799133, + "learning_rate": 7.478321179916632e-06, + "loss": 0.7409, + "step": 12237 + }, + { + "epoch": 0.6735648632285761, + "grad_norm": 0.6514197587966919, + "learning_rate": 7.477944698679419e-06, + "loss": 0.7623, + "step": 12238 + }, + { + "epoch": 0.6736199020309318, + "grad_norm": 0.7712671160697937, + "learning_rate": 7.477568198818636e-06, + "loss": 0.777, + "step": 12239 + }, + { + "epoch": 0.6736749408332875, + "grad_norm": 0.6690881252288818, + "learning_rate": 7.4771916803371145e-06, + "loss": 0.7275, + "step": 12240 + }, + { + "epoch": 0.6737299796356432, + "grad_norm": 0.7206465601921082, + "learning_rate": 7.476815143237683e-06, + "loss": 0.853, + "step": 12241 + }, + { + "epoch": 0.6737850184379988, + "grad_norm": 0.7052504420280457, + "learning_rate": 7.476438587523171e-06, + "loss": 0.774, + "step": 12242 + }, + { + "epoch": 0.6738400572403545, + "grad_norm": 1.6168169975280762, + "learning_rate": 7.476062013196411e-06, + "loss": 0.7423, + "step": 12243 + }, + { + "epoch": 0.6738950960427101, + "grad_norm": 0.715300977230072, + "learning_rate": 7.475685420260232e-06, + "loss": 0.78, + "step": 12244 + }, + { + "epoch": 0.6739501348450657, + "grad_norm": 0.7774379253387451, + "learning_rate": 7.475308808717463e-06, + "loss": 0.885, + "step": 12245 + }, + { + "epoch": 0.6740051736474214, + "grad_norm": 0.6998060941696167, + "learning_rate": 7.474932178570935e-06, + "loss": 0.807, + "step": 12246 + }, + { + "epoch": 0.6740602124497771, + "grad_norm": 0.6710013747215271, + "learning_rate": 7.47455552982348e-06, + "loss": 0.7639, + "step": 12247 + }, + { + "epoch": 0.6741152512521328, + "grad_norm": 0.707435667514801, + "learning_rate": 7.474178862477929e-06, + "loss": 0.7914, + "step": 12248 + }, + { + "epoch": 0.6741702900544884, + "grad_norm": 0.7344105243682861, + "learning_rate": 7.47380217653711e-06, + "loss": 0.7464, + "step": 12249 + }, + { + "epoch": 0.674225328856844, + "grad_norm": 0.7157585620880127, + "learning_rate": 7.473425472003858e-06, + "loss": 0.7747, + "step": 12250 + }, + { + "epoch": 0.6742803676591997, + "grad_norm": 0.6978434920310974, + "learning_rate": 7.473048748881001e-06, + "loss": 0.6903, + "step": 12251 + }, + { + "epoch": 0.6743354064615554, + "grad_norm": 0.6454086899757385, + "learning_rate": 7.472672007171372e-06, + "loss": 0.725, + "step": 12252 + }, + { + "epoch": 0.674390445263911, + "grad_norm": 0.6729341745376587, + "learning_rate": 7.4722952468778035e-06, + "loss": 0.7704, + "step": 12253 + }, + { + "epoch": 0.6744454840662667, + "grad_norm": 0.7995265126228333, + "learning_rate": 7.471918468003122e-06, + "loss": 0.7567, + "step": 12254 + }, + { + "epoch": 0.6745005228686224, + "grad_norm": 0.729629397392273, + "learning_rate": 7.471541670550165e-06, + "loss": 0.796, + "step": 12255 + }, + { + "epoch": 0.6745555616709781, + "grad_norm": 0.6923666000366211, + "learning_rate": 7.471164854521764e-06, + "loss": 0.6894, + "step": 12256 + }, + { + "epoch": 0.6746106004733337, + "grad_norm": 0.6485042572021484, + "learning_rate": 7.470788019920747e-06, + "loss": 0.6912, + "step": 12257 + }, + { + "epoch": 0.6746656392756893, + "grad_norm": 0.7569034099578857, + "learning_rate": 7.470411166749949e-06, + "loss": 0.8167, + "step": 12258 + }, + { + "epoch": 0.674720678078045, + "grad_norm": 0.6202835440635681, + "learning_rate": 7.470034295012203e-06, + "loss": 0.6409, + "step": 12259 + }, + { + "epoch": 0.6747757168804007, + "grad_norm": 0.6414007544517517, + "learning_rate": 7.4696574047103395e-06, + "loss": 0.7163, + "step": 12260 + }, + { + "epoch": 0.6748307556827563, + "grad_norm": 0.7012181878089905, + "learning_rate": 7.469280495847193e-06, + "loss": 0.7682, + "step": 12261 + }, + { + "epoch": 0.674885794485112, + "grad_norm": 0.7027888298034668, + "learning_rate": 7.468903568425596e-06, + "loss": 0.7561, + "step": 12262 + }, + { + "epoch": 0.6749408332874677, + "grad_norm": 0.7282221913337708, + "learning_rate": 7.4685266224483785e-06, + "loss": 0.7552, + "step": 12263 + }, + { + "epoch": 0.6749958720898234, + "grad_norm": 0.7349117398262024, + "learning_rate": 7.468149657918377e-06, + "loss": 0.8323, + "step": 12264 + }, + { + "epoch": 0.675050910892179, + "grad_norm": 0.8992187976837158, + "learning_rate": 7.467772674838424e-06, + "loss": 0.7589, + "step": 12265 + }, + { + "epoch": 0.6751059496945346, + "grad_norm": 0.6773034930229187, + "learning_rate": 7.4673956732113505e-06, + "loss": 0.7229, + "step": 12266 + }, + { + "epoch": 0.6751609884968903, + "grad_norm": 0.6563699841499329, + "learning_rate": 7.467018653039992e-06, + "loss": 0.7526, + "step": 12267 + }, + { + "epoch": 0.675216027299246, + "grad_norm": 0.7559765577316284, + "learning_rate": 7.466641614327181e-06, + "loss": 0.708, + "step": 12268 + }, + { + "epoch": 0.6752710661016016, + "grad_norm": 0.7077820897102356, + "learning_rate": 7.4662645570757545e-06, + "loss": 0.6568, + "step": 12269 + }, + { + "epoch": 0.6753261049039573, + "grad_norm": 0.8082162141799927, + "learning_rate": 7.465887481288541e-06, + "loss": 0.8751, + "step": 12270 + }, + { + "epoch": 0.675381143706313, + "grad_norm": 0.6940243244171143, + "learning_rate": 7.465510386968377e-06, + "loss": 0.7826, + "step": 12271 + }, + { + "epoch": 0.6754361825086687, + "grad_norm": 0.6634145379066467, + "learning_rate": 7.465133274118099e-06, + "loss": 0.6816, + "step": 12272 + }, + { + "epoch": 0.6754912213110242, + "grad_norm": 0.6797559857368469, + "learning_rate": 7.464756142740539e-06, + "loss": 0.7101, + "step": 12273 + }, + { + "epoch": 0.6755462601133799, + "grad_norm": 0.7696588635444641, + "learning_rate": 7.464378992838531e-06, + "loss": 0.8114, + "step": 12274 + }, + { + "epoch": 0.6756012989157356, + "grad_norm": 0.6733334064483643, + "learning_rate": 7.4640018244149105e-06, + "loss": 0.7585, + "step": 12275 + }, + { + "epoch": 0.6756563377180913, + "grad_norm": 0.7087474465370178, + "learning_rate": 7.463624637472512e-06, + "loss": 0.6911, + "step": 12276 + }, + { + "epoch": 0.6757113765204469, + "grad_norm": 0.6944451928138733, + "learning_rate": 7.46324743201417e-06, + "loss": 0.7726, + "step": 12277 + }, + { + "epoch": 0.6757664153228026, + "grad_norm": 0.7214855551719666, + "learning_rate": 7.46287020804272e-06, + "loss": 0.7844, + "step": 12278 + }, + { + "epoch": 0.6758214541251583, + "grad_norm": 0.7106257677078247, + "learning_rate": 7.462492965560995e-06, + "loss": 0.7724, + "step": 12279 + }, + { + "epoch": 0.675876492927514, + "grad_norm": 0.7403497695922852, + "learning_rate": 7.462115704571833e-06, + "loss": 0.7558, + "step": 12280 + }, + { + "epoch": 0.6759315317298695, + "grad_norm": 0.7157884836196899, + "learning_rate": 7.4617384250780685e-06, + "loss": 0.6681, + "step": 12281 + }, + { + "epoch": 0.6759865705322252, + "grad_norm": 0.6937661170959473, + "learning_rate": 7.461361127082538e-06, + "loss": 0.7852, + "step": 12282 + }, + { + "epoch": 0.6760416093345809, + "grad_norm": 0.7106412053108215, + "learning_rate": 7.4609838105880735e-06, + "loss": 0.7689, + "step": 12283 + }, + { + "epoch": 0.6760966481369366, + "grad_norm": 0.6860619187355042, + "learning_rate": 7.460606475597516e-06, + "loss": 0.6528, + "step": 12284 + }, + { + "epoch": 0.6761516869392922, + "grad_norm": 0.7085865139961243, + "learning_rate": 7.460229122113698e-06, + "loss": 0.7303, + "step": 12285 + }, + { + "epoch": 0.6762067257416479, + "grad_norm": 0.6648178100585938, + "learning_rate": 7.459851750139457e-06, + "loss": 0.6751, + "step": 12286 + }, + { + "epoch": 0.6762617645440036, + "grad_norm": 0.74468594789505, + "learning_rate": 7.459474359677629e-06, + "loss": 0.756, + "step": 12287 + }, + { + "epoch": 0.6763168033463591, + "grad_norm": 0.6408486366271973, + "learning_rate": 7.459096950731048e-06, + "loss": 0.7737, + "step": 12288 + }, + { + "epoch": 0.6763718421487148, + "grad_norm": 0.7204515933990479, + "learning_rate": 7.458719523302556e-06, + "loss": 0.7845, + "step": 12289 + }, + { + "epoch": 0.6764268809510705, + "grad_norm": 0.7373428344726562, + "learning_rate": 7.458342077394984e-06, + "loss": 0.7245, + "step": 12290 + }, + { + "epoch": 0.6764819197534262, + "grad_norm": 0.701654851436615, + "learning_rate": 7.45796461301117e-06, + "loss": 0.7711, + "step": 12291 + }, + { + "epoch": 0.6765369585557818, + "grad_norm": 0.7002573013305664, + "learning_rate": 7.4575871301539526e-06, + "loss": 0.8138, + "step": 12292 + }, + { + "epoch": 0.6765919973581375, + "grad_norm": 0.7460681200027466, + "learning_rate": 7.45720962882617e-06, + "loss": 0.8012, + "step": 12293 + }, + { + "epoch": 0.6766470361604932, + "grad_norm": 0.6478421092033386, + "learning_rate": 7.456832109030655e-06, + "loss": 0.7161, + "step": 12294 + }, + { + "epoch": 0.6767020749628488, + "grad_norm": 0.7101582288742065, + "learning_rate": 7.456454570770248e-06, + "loss": 0.7348, + "step": 12295 + }, + { + "epoch": 0.6767571137652044, + "grad_norm": 0.7735113501548767, + "learning_rate": 7.4560770140477865e-06, + "loss": 0.7584, + "step": 12296 + }, + { + "epoch": 0.6768121525675601, + "grad_norm": 0.6811535358428955, + "learning_rate": 7.4556994388661085e-06, + "loss": 0.7653, + "step": 12297 + }, + { + "epoch": 0.6768671913699158, + "grad_norm": 0.7445605397224426, + "learning_rate": 7.455321845228051e-06, + "loss": 0.7661, + "step": 12298 + }, + { + "epoch": 0.6769222301722715, + "grad_norm": 0.6862059831619263, + "learning_rate": 7.4549442331364505e-06, + "loss": 0.776, + "step": 12299 + }, + { + "epoch": 0.6769772689746271, + "grad_norm": 0.7030314207077026, + "learning_rate": 7.4545666025941465e-06, + "loss": 0.7393, + "step": 12300 + }, + { + "epoch": 0.6770323077769828, + "grad_norm": 0.6718610525131226, + "learning_rate": 7.454188953603978e-06, + "loss": 0.7375, + "step": 12301 + }, + { + "epoch": 0.6770873465793384, + "grad_norm": 0.6716088652610779, + "learning_rate": 7.453811286168782e-06, + "loss": 0.8021, + "step": 12302 + }, + { + "epoch": 0.6771423853816941, + "grad_norm": 0.8916372656822205, + "learning_rate": 7.453433600291395e-06, + "loss": 0.8274, + "step": 12303 + }, + { + "epoch": 0.6771974241840497, + "grad_norm": 0.7396363615989685, + "learning_rate": 7.45305589597466e-06, + "loss": 0.7892, + "step": 12304 + }, + { + "epoch": 0.6772524629864054, + "grad_norm": 0.8074424862861633, + "learning_rate": 7.452678173221413e-06, + "loss": 0.7586, + "step": 12305 + }, + { + "epoch": 0.6773075017887611, + "grad_norm": 0.6928194165229797, + "learning_rate": 7.452300432034494e-06, + "loss": 0.7914, + "step": 12306 + }, + { + "epoch": 0.6773625405911168, + "grad_norm": 0.7064313292503357, + "learning_rate": 7.451922672416739e-06, + "loss": 0.7948, + "step": 12307 + }, + { + "epoch": 0.6774175793934724, + "grad_norm": 0.6828622221946716, + "learning_rate": 7.451544894370992e-06, + "loss": 0.6723, + "step": 12308 + }, + { + "epoch": 0.677472618195828, + "grad_norm": 0.6794914603233337, + "learning_rate": 7.45116709790009e-06, + "loss": 0.7344, + "step": 12309 + }, + { + "epoch": 0.6775276569981837, + "grad_norm": 0.7643330097198486, + "learning_rate": 7.45078928300687e-06, + "loss": 0.7836, + "step": 12310 + }, + { + "epoch": 0.6775826958005394, + "grad_norm": 0.692569375038147, + "learning_rate": 7.450411449694176e-06, + "loss": 0.7608, + "step": 12311 + }, + { + "epoch": 0.677637734602895, + "grad_norm": 0.7718693614006042, + "learning_rate": 7.4500335979648455e-06, + "loss": 0.7131, + "step": 12312 + }, + { + "epoch": 0.6776927734052507, + "grad_norm": 0.6267405152320862, + "learning_rate": 7.449655727821716e-06, + "loss": 0.7543, + "step": 12313 + }, + { + "epoch": 0.6777478122076064, + "grad_norm": 0.8252732157707214, + "learning_rate": 7.4492778392676325e-06, + "loss": 0.8799, + "step": 12314 + }, + { + "epoch": 0.6778028510099621, + "grad_norm": 0.6310145854949951, + "learning_rate": 7.448899932305429e-06, + "loss": 0.7389, + "step": 12315 + }, + { + "epoch": 0.6778578898123176, + "grad_norm": 0.6115848422050476, + "learning_rate": 7.448522006937951e-06, + "loss": 0.6069, + "step": 12316 + }, + { + "epoch": 0.6779129286146733, + "grad_norm": 0.6809090971946716, + "learning_rate": 7.448144063168038e-06, + "loss": 0.7092, + "step": 12317 + }, + { + "epoch": 0.677967967417029, + "grad_norm": 0.7285470366477966, + "learning_rate": 7.447766100998529e-06, + "loss": 0.714, + "step": 12318 + }, + { + "epoch": 0.6780230062193847, + "grad_norm": 0.6637021899223328, + "learning_rate": 7.447388120432264e-06, + "loss": 0.7247, + "step": 12319 + }, + { + "epoch": 0.6780780450217403, + "grad_norm": 0.7735750675201416, + "learning_rate": 7.447010121472087e-06, + "loss": 0.7616, + "step": 12320 + }, + { + "epoch": 0.678133083824096, + "grad_norm": 0.7643262147903442, + "learning_rate": 7.446632104120836e-06, + "loss": 0.5863, + "step": 12321 + }, + { + "epoch": 0.6781881226264517, + "grad_norm": 0.6957301497459412, + "learning_rate": 7.446254068381352e-06, + "loss": 0.7125, + "step": 12322 + }, + { + "epoch": 0.6782431614288074, + "grad_norm": 0.6573877930641174, + "learning_rate": 7.445876014256479e-06, + "loss": 0.7115, + "step": 12323 + }, + { + "epoch": 0.6782982002311629, + "grad_norm": 0.6507790684700012, + "learning_rate": 7.445497941749056e-06, + "loss": 0.7266, + "step": 12324 + }, + { + "epoch": 0.6783532390335186, + "grad_norm": 0.8314819931983948, + "learning_rate": 7.4451198508619245e-06, + "loss": 0.6902, + "step": 12325 + }, + { + "epoch": 0.6784082778358743, + "grad_norm": 0.6907274127006531, + "learning_rate": 7.444741741597927e-06, + "loss": 0.8253, + "step": 12326 + }, + { + "epoch": 0.67846331663823, + "grad_norm": 0.7311725616455078, + "learning_rate": 7.444363613959904e-06, + "loss": 0.8641, + "step": 12327 + }, + { + "epoch": 0.6785183554405856, + "grad_norm": 0.6690121293067932, + "learning_rate": 7.443985467950701e-06, + "loss": 0.6966, + "step": 12328 + }, + { + "epoch": 0.6785733942429413, + "grad_norm": 0.6444346308708191, + "learning_rate": 7.443607303573155e-06, + "loss": 0.7848, + "step": 12329 + }, + { + "epoch": 0.678628433045297, + "grad_norm": 0.7553900480270386, + "learning_rate": 7.4432291208301125e-06, + "loss": 0.8196, + "step": 12330 + }, + { + "epoch": 0.6786834718476525, + "grad_norm": 0.6393183469772339, + "learning_rate": 7.442850919724411e-06, + "loss": 0.7622, + "step": 12331 + }, + { + "epoch": 0.6787385106500082, + "grad_norm": 0.7045423984527588, + "learning_rate": 7.442472700258898e-06, + "loss": 0.7483, + "step": 12332 + }, + { + "epoch": 0.6787935494523639, + "grad_norm": 0.7536678314208984, + "learning_rate": 7.442094462436414e-06, + "loss": 0.815, + "step": 12333 + }, + { + "epoch": 0.6788485882547196, + "grad_norm": 0.645391047000885, + "learning_rate": 7.441716206259801e-06, + "loss": 0.7394, + "step": 12334 + }, + { + "epoch": 0.6789036270570752, + "grad_norm": 0.8870118260383606, + "learning_rate": 7.441337931731905e-06, + "loss": 0.8076, + "step": 12335 + }, + { + "epoch": 0.6789586658594309, + "grad_norm": 0.6672457456588745, + "learning_rate": 7.440959638855564e-06, + "loss": 0.7573, + "step": 12336 + }, + { + "epoch": 0.6790137046617866, + "grad_norm": 0.7104566693305969, + "learning_rate": 7.440581327633625e-06, + "loss": 0.6855, + "step": 12337 + }, + { + "epoch": 0.6790687434641423, + "grad_norm": 0.7201581001281738, + "learning_rate": 7.4402029980689294e-06, + "loss": 0.7977, + "step": 12338 + }, + { + "epoch": 0.6791237822664978, + "grad_norm": 0.6685218811035156, + "learning_rate": 7.43982465016432e-06, + "loss": 0.8114, + "step": 12339 + }, + { + "epoch": 0.6791788210688535, + "grad_norm": 0.6913738250732422, + "learning_rate": 7.439446283922645e-06, + "loss": 0.7584, + "step": 12340 + }, + { + "epoch": 0.6792338598712092, + "grad_norm": 0.7332273721694946, + "learning_rate": 7.439067899346742e-06, + "loss": 0.7658, + "step": 12341 + }, + { + "epoch": 0.6792888986735649, + "grad_norm": 0.777909517288208, + "learning_rate": 7.438689496439458e-06, + "loss": 0.8064, + "step": 12342 + }, + { + "epoch": 0.6793439374759205, + "grad_norm": 0.7444930076599121, + "learning_rate": 7.438311075203636e-06, + "loss": 0.7896, + "step": 12343 + }, + { + "epoch": 0.6793989762782762, + "grad_norm": 0.7678806781768799, + "learning_rate": 7.4379326356421224e-06, + "loss": 0.8533, + "step": 12344 + }, + { + "epoch": 0.6794540150806319, + "grad_norm": 0.6653377413749695, + "learning_rate": 7.437554177757759e-06, + "loss": 0.7287, + "step": 12345 + }, + { + "epoch": 0.6795090538829875, + "grad_norm": 0.6270567178726196, + "learning_rate": 7.43717570155339e-06, + "loss": 0.6802, + "step": 12346 + }, + { + "epoch": 0.6795640926853431, + "grad_norm": 0.7091223001480103, + "learning_rate": 7.436797207031861e-06, + "loss": 0.7693, + "step": 12347 + }, + { + "epoch": 0.6796191314876988, + "grad_norm": 0.6583104133605957, + "learning_rate": 7.436418694196018e-06, + "loss": 0.7171, + "step": 12348 + }, + { + "epoch": 0.6796741702900545, + "grad_norm": 0.6897410750389099, + "learning_rate": 7.436040163048703e-06, + "loss": 0.7831, + "step": 12349 + }, + { + "epoch": 0.6797292090924102, + "grad_norm": 0.6506269574165344, + "learning_rate": 7.435661613592763e-06, + "loss": 0.8037, + "step": 12350 + }, + { + "epoch": 0.6797842478947658, + "grad_norm": 0.6772280931472778, + "learning_rate": 7.435283045831041e-06, + "loss": 0.8102, + "step": 12351 + }, + { + "epoch": 0.6798392866971215, + "grad_norm": 0.8470273017883301, + "learning_rate": 7.434904459766384e-06, + "loss": 0.7816, + "step": 12352 + }, + { + "epoch": 0.6798943254994771, + "grad_norm": 0.6969698071479797, + "learning_rate": 7.434525855401638e-06, + "loss": 0.6911, + "step": 12353 + }, + { + "epoch": 0.6799493643018328, + "grad_norm": 0.9969611763954163, + "learning_rate": 7.434147232739646e-06, + "loss": 0.7041, + "step": 12354 + }, + { + "epoch": 0.6800044031041884, + "grad_norm": 0.6697688698768616, + "learning_rate": 7.433768591783255e-06, + "loss": 0.6602, + "step": 12355 + }, + { + "epoch": 0.6800594419065441, + "grad_norm": 0.9857928156852722, + "learning_rate": 7.433389932535311e-06, + "loss": 0.6505, + "step": 12356 + }, + { + "epoch": 0.6801144807088998, + "grad_norm": 0.8787727355957031, + "learning_rate": 7.43301125499866e-06, + "loss": 0.7558, + "step": 12357 + }, + { + "epoch": 0.6801695195112555, + "grad_norm": 0.6035268306732178, + "learning_rate": 7.432632559176147e-06, + "loss": 0.6337, + "step": 12358 + }, + { + "epoch": 0.6802245583136111, + "grad_norm": 0.7977258563041687, + "learning_rate": 7.432253845070621e-06, + "loss": 0.7324, + "step": 12359 + }, + { + "epoch": 0.6802795971159667, + "grad_norm": 0.5842836499214172, + "learning_rate": 7.431875112684923e-06, + "loss": 0.677, + "step": 12360 + }, + { + "epoch": 0.6803346359183224, + "grad_norm": 0.7134125828742981, + "learning_rate": 7.431496362021905e-06, + "loss": 0.7034, + "step": 12361 + }, + { + "epoch": 0.6803896747206781, + "grad_norm": 0.7101823091506958, + "learning_rate": 7.431117593084411e-06, + "loss": 0.7526, + "step": 12362 + }, + { + "epoch": 0.6804447135230337, + "grad_norm": 0.6543304920196533, + "learning_rate": 7.4307388058752865e-06, + "loss": 0.7548, + "step": 12363 + }, + { + "epoch": 0.6804997523253894, + "grad_norm": 0.6522945761680603, + "learning_rate": 7.430360000397381e-06, + "loss": 0.7044, + "step": 12364 + }, + { + "epoch": 0.6805547911277451, + "grad_norm": 0.7405091524124146, + "learning_rate": 7.429981176653539e-06, + "loss": 0.8064, + "step": 12365 + }, + { + "epoch": 0.6806098299301008, + "grad_norm": 0.6454355716705322, + "learning_rate": 7.429602334646611e-06, + "loss": 0.7179, + "step": 12366 + }, + { + "epoch": 0.6806648687324564, + "grad_norm": 0.8131621479988098, + "learning_rate": 7.429223474379439e-06, + "loss": 0.7144, + "step": 12367 + }, + { + "epoch": 0.680719907534812, + "grad_norm": 0.7203080058097839, + "learning_rate": 7.428844595854876e-06, + "loss": 0.8189, + "step": 12368 + }, + { + "epoch": 0.6807749463371677, + "grad_norm": 0.650414228439331, + "learning_rate": 7.428465699075767e-06, + "loss": 0.7815, + "step": 12369 + }, + { + "epoch": 0.6808299851395234, + "grad_norm": 0.8152775168418884, + "learning_rate": 7.42808678404496e-06, + "loss": 0.7365, + "step": 12370 + }, + { + "epoch": 0.680885023941879, + "grad_norm": 0.5871601700782776, + "learning_rate": 7.427707850765302e-06, + "loss": 0.6804, + "step": 12371 + }, + { + "epoch": 0.6809400627442347, + "grad_norm": 0.7115684747695923, + "learning_rate": 7.427328899239643e-06, + "loss": 0.728, + "step": 12372 + }, + { + "epoch": 0.6809951015465904, + "grad_norm": 0.6575615406036377, + "learning_rate": 7.426949929470828e-06, + "loss": 0.725, + "step": 12373 + }, + { + "epoch": 0.681050140348946, + "grad_norm": 0.7744095325469971, + "learning_rate": 7.426570941461708e-06, + "loss": 0.7647, + "step": 12374 + }, + { + "epoch": 0.6811051791513016, + "grad_norm": 0.6856220364570618, + "learning_rate": 7.4261919352151305e-06, + "loss": 0.8121, + "step": 12375 + }, + { + "epoch": 0.6811602179536573, + "grad_norm": 0.8197830319404602, + "learning_rate": 7.425812910733943e-06, + "loss": 0.8685, + "step": 12376 + }, + { + "epoch": 0.681215256756013, + "grad_norm": 1.240628719329834, + "learning_rate": 7.425433868020996e-06, + "loss": 0.8063, + "step": 12377 + }, + { + "epoch": 0.6812702955583686, + "grad_norm": 0.8716747760772705, + "learning_rate": 7.425054807079136e-06, + "loss": 0.7384, + "step": 12378 + }, + { + "epoch": 0.6813253343607243, + "grad_norm": 0.7512598037719727, + "learning_rate": 7.4246757279112135e-06, + "loss": 0.7428, + "step": 12379 + }, + { + "epoch": 0.68138037316308, + "grad_norm": 0.7002312541007996, + "learning_rate": 7.424296630520078e-06, + "loss": 0.6066, + "step": 12380 + }, + { + "epoch": 0.6814354119654357, + "grad_norm": 0.6422720551490784, + "learning_rate": 7.423917514908578e-06, + "loss": 0.6645, + "step": 12381 + }, + { + "epoch": 0.6814904507677912, + "grad_norm": 0.8667505383491516, + "learning_rate": 7.423538381079562e-06, + "loss": 0.8663, + "step": 12382 + }, + { + "epoch": 0.6815454895701469, + "grad_norm": 0.7045377492904663, + "learning_rate": 7.423159229035881e-06, + "loss": 0.7684, + "step": 12383 + }, + { + "epoch": 0.6816005283725026, + "grad_norm": 0.7663894295692444, + "learning_rate": 7.422780058780385e-06, + "loss": 0.8051, + "step": 12384 + }, + { + "epoch": 0.6816555671748583, + "grad_norm": 0.7612582445144653, + "learning_rate": 7.42240087031592e-06, + "loss": 0.7771, + "step": 12385 + }, + { + "epoch": 0.6817106059772139, + "grad_norm": 0.8682271838188171, + "learning_rate": 7.42202166364534e-06, + "loss": 0.7761, + "step": 12386 + }, + { + "epoch": 0.6817656447795696, + "grad_norm": 0.712204098701477, + "learning_rate": 7.421642438771492e-06, + "loss": 0.7832, + "step": 12387 + }, + { + "epoch": 0.6818206835819253, + "grad_norm": 0.6726338863372803, + "learning_rate": 7.42126319569723e-06, + "loss": 0.7541, + "step": 12388 + }, + { + "epoch": 0.681875722384281, + "grad_norm": 0.647570788860321, + "learning_rate": 7.420883934425401e-06, + "loss": 0.7281, + "step": 12389 + }, + { + "epoch": 0.6819307611866365, + "grad_norm": 0.7058577537536621, + "learning_rate": 7.420504654958857e-06, + "loss": 0.8315, + "step": 12390 + }, + { + "epoch": 0.6819857999889922, + "grad_norm": 0.6683655977249146, + "learning_rate": 7.420125357300446e-06, + "loss": 0.772, + "step": 12391 + }, + { + "epoch": 0.6820408387913479, + "grad_norm": 0.6768681406974792, + "learning_rate": 7.419746041453022e-06, + "loss": 0.7023, + "step": 12392 + }, + { + "epoch": 0.6820958775937036, + "grad_norm": 0.8037514686584473, + "learning_rate": 7.419366707419434e-06, + "loss": 0.6894, + "step": 12393 + }, + { + "epoch": 0.6821509163960592, + "grad_norm": 0.6510934829711914, + "learning_rate": 7.418987355202534e-06, + "loss": 0.6411, + "step": 12394 + }, + { + "epoch": 0.6822059551984149, + "grad_norm": 0.7628617882728577, + "learning_rate": 7.418607984805173e-06, + "loss": 0.7681, + "step": 12395 + }, + { + "epoch": 0.6822609940007706, + "grad_norm": 0.7146260738372803, + "learning_rate": 7.418228596230201e-06, + "loss": 0.7003, + "step": 12396 + }, + { + "epoch": 0.6823160328031262, + "grad_norm": 0.6208338737487793, + "learning_rate": 7.41784918948047e-06, + "loss": 0.7138, + "step": 12397 + }, + { + "epoch": 0.6823710716054818, + "grad_norm": 0.7859066724777222, + "learning_rate": 7.417469764558832e-06, + "loss": 0.7984, + "step": 12398 + }, + { + "epoch": 0.6824261104078375, + "grad_norm": 0.7636224031448364, + "learning_rate": 7.417090321468138e-06, + "loss": 0.7445, + "step": 12399 + }, + { + "epoch": 0.6824811492101932, + "grad_norm": 0.9071671366691589, + "learning_rate": 7.41671086021124e-06, + "loss": 0.8058, + "step": 12400 + }, + { + "epoch": 0.6825361880125489, + "grad_norm": 0.5986278057098389, + "learning_rate": 7.416331380790991e-06, + "loss": 0.7001, + "step": 12401 + }, + { + "epoch": 0.6825912268149045, + "grad_norm": 0.6812893152236938, + "learning_rate": 7.415951883210242e-06, + "loss": 0.7745, + "step": 12402 + }, + { + "epoch": 0.6826462656172602, + "grad_norm": 0.666362464427948, + "learning_rate": 7.415572367471844e-06, + "loss": 0.7861, + "step": 12403 + }, + { + "epoch": 0.6827013044196159, + "grad_norm": 0.6963029503822327, + "learning_rate": 7.415192833578653e-06, + "loss": 0.7657, + "step": 12404 + }, + { + "epoch": 0.6827563432219715, + "grad_norm": 0.669876217842102, + "learning_rate": 7.414813281533517e-06, + "loss": 0.6441, + "step": 12405 + }, + { + "epoch": 0.6828113820243271, + "grad_norm": 0.6608602404594421, + "learning_rate": 7.414433711339293e-06, + "loss": 0.7203, + "step": 12406 + }, + { + "epoch": 0.6828664208266828, + "grad_norm": 0.7262642979621887, + "learning_rate": 7.41405412299883e-06, + "loss": 0.7842, + "step": 12407 + }, + { + "epoch": 0.6829214596290385, + "grad_norm": 0.7728527188301086, + "learning_rate": 7.413674516514983e-06, + "loss": 0.7551, + "step": 12408 + }, + { + "epoch": 0.6829764984313942, + "grad_norm": 0.7970840930938721, + "learning_rate": 7.4132948918906035e-06, + "loss": 0.8181, + "step": 12409 + }, + { + "epoch": 0.6830315372337498, + "grad_norm": 0.6672868728637695, + "learning_rate": 7.412915249128546e-06, + "loss": 0.7201, + "step": 12410 + }, + { + "epoch": 0.6830865760361055, + "grad_norm": 0.8261075019836426, + "learning_rate": 7.412535588231664e-06, + "loss": 0.6006, + "step": 12411 + }, + { + "epoch": 0.6831416148384611, + "grad_norm": 0.6768019795417786, + "learning_rate": 7.412155909202809e-06, + "loss": 0.7326, + "step": 12412 + }, + { + "epoch": 0.6831966536408168, + "grad_norm": 0.7482851147651672, + "learning_rate": 7.4117762120448364e-06, + "loss": 0.7913, + "step": 12413 + }, + { + "epoch": 0.6832516924431724, + "grad_norm": 0.7315956354141235, + "learning_rate": 7.411396496760601e-06, + "loss": 0.7949, + "step": 12414 + }, + { + "epoch": 0.6833067312455281, + "grad_norm": 0.7460561394691467, + "learning_rate": 7.411016763352954e-06, + "loss": 0.8445, + "step": 12415 + }, + { + "epoch": 0.6833617700478838, + "grad_norm": 0.7025588154792786, + "learning_rate": 7.410637011824749e-06, + "loss": 0.7658, + "step": 12416 + }, + { + "epoch": 0.6834168088502394, + "grad_norm": 0.7507885694503784, + "learning_rate": 7.410257242178842e-06, + "loss": 0.711, + "step": 12417 + }, + { + "epoch": 0.683471847652595, + "grad_norm": 0.6935780048370361, + "learning_rate": 7.409877454418088e-06, + "loss": 0.8376, + "step": 12418 + }, + { + "epoch": 0.6835268864549507, + "grad_norm": 0.7747789025306702, + "learning_rate": 7.409497648545341e-06, + "loss": 0.8173, + "step": 12419 + }, + { + "epoch": 0.6835819252573064, + "grad_norm": 0.6559001803398132, + "learning_rate": 7.4091178245634525e-06, + "loss": 0.7146, + "step": 12420 + }, + { + "epoch": 0.683636964059662, + "grad_norm": 0.7123926877975464, + "learning_rate": 7.408737982475279e-06, + "loss": 0.7544, + "step": 12421 + }, + { + "epoch": 0.6836920028620177, + "grad_norm": 0.8163334131240845, + "learning_rate": 7.408358122283678e-06, + "loss": 0.8008, + "step": 12422 + }, + { + "epoch": 0.6837470416643734, + "grad_norm": 0.6837686896324158, + "learning_rate": 7.4079782439915e-06, + "loss": 0.6595, + "step": 12423 + }, + { + "epoch": 0.6838020804667291, + "grad_norm": 0.9385979175567627, + "learning_rate": 7.407598347601601e-06, + "loss": 0.8135, + "step": 12424 + }, + { + "epoch": 0.6838571192690847, + "grad_norm": 0.7197830677032471, + "learning_rate": 7.407218433116839e-06, + "loss": 0.8401, + "step": 12425 + }, + { + "epoch": 0.6839121580714403, + "grad_norm": 0.7165716290473938, + "learning_rate": 7.406838500540069e-06, + "loss": 0.7864, + "step": 12426 + }, + { + "epoch": 0.683967196873796, + "grad_norm": 0.6844950318336487, + "learning_rate": 7.4064585498741435e-06, + "loss": 0.7409, + "step": 12427 + }, + { + "epoch": 0.6840222356761517, + "grad_norm": 0.6237946152687073, + "learning_rate": 7.40607858112192e-06, + "loss": 0.6915, + "step": 12428 + }, + { + "epoch": 0.6840772744785073, + "grad_norm": 0.7437137365341187, + "learning_rate": 7.405698594286252e-06, + "loss": 0.8191, + "step": 12429 + }, + { + "epoch": 0.684132313280863, + "grad_norm": 0.6956225633621216, + "learning_rate": 7.4053185893700006e-06, + "loss": 0.7662, + "step": 12430 + }, + { + "epoch": 0.6841873520832187, + "grad_norm": 0.6508380174636841, + "learning_rate": 7.404938566376018e-06, + "loss": 0.7758, + "step": 12431 + }, + { + "epoch": 0.6842423908855744, + "grad_norm": 0.6759025454521179, + "learning_rate": 7.404558525307159e-06, + "loss": 0.7713, + "step": 12432 + }, + { + "epoch": 0.68429742968793, + "grad_norm": 0.7280172109603882, + "learning_rate": 7.404178466166283e-06, + "loss": 0.7753, + "step": 12433 + }, + { + "epoch": 0.6843524684902856, + "grad_norm": 0.7599073052406311, + "learning_rate": 7.403798388956245e-06, + "loss": 0.6993, + "step": 12434 + }, + { + "epoch": 0.6844075072926413, + "grad_norm": 0.7962353229522705, + "learning_rate": 7.403418293679903e-06, + "loss": 0.771, + "step": 12435 + }, + { + "epoch": 0.684462546094997, + "grad_norm": 0.6714458465576172, + "learning_rate": 7.40303818034011e-06, + "loss": 0.7077, + "step": 12436 + }, + { + "epoch": 0.6845175848973526, + "grad_norm": 0.6770713925361633, + "learning_rate": 7.402658048939726e-06, + "loss": 0.7695, + "step": 12437 + }, + { + "epoch": 0.6845726236997083, + "grad_norm": 0.7337867617607117, + "learning_rate": 7.402277899481608e-06, + "loss": 0.9453, + "step": 12438 + }, + { + "epoch": 0.684627662502064, + "grad_norm": 0.7457698583602905, + "learning_rate": 7.401897731968612e-06, + "loss": 0.7569, + "step": 12439 + }, + { + "epoch": 0.6846827013044197, + "grad_norm": 0.6683285236358643, + "learning_rate": 7.401517546403595e-06, + "loss": 0.7215, + "step": 12440 + }, + { + "epoch": 0.6847377401067752, + "grad_norm": 0.6516628861427307, + "learning_rate": 7.401137342789415e-06, + "loss": 0.7433, + "step": 12441 + }, + { + "epoch": 0.6847927789091309, + "grad_norm": 0.7572295665740967, + "learning_rate": 7.400757121128932e-06, + "loss": 0.7204, + "step": 12442 + }, + { + "epoch": 0.6848478177114866, + "grad_norm": 0.6884106993675232, + "learning_rate": 7.400376881425e-06, + "loss": 0.6766, + "step": 12443 + }, + { + "epoch": 0.6849028565138423, + "grad_norm": 0.798926591873169, + "learning_rate": 7.399996623680475e-06, + "loss": 0.7673, + "step": 12444 + }, + { + "epoch": 0.6849578953161979, + "grad_norm": 0.7200846672058105, + "learning_rate": 7.399616347898221e-06, + "loss": 0.8032, + "step": 12445 + }, + { + "epoch": 0.6850129341185536, + "grad_norm": 0.7085461020469666, + "learning_rate": 7.3992360540810915e-06, + "loss": 0.7075, + "step": 12446 + }, + { + "epoch": 0.6850679729209093, + "grad_norm": 0.6885339021682739, + "learning_rate": 7.398855742231947e-06, + "loss": 0.7278, + "step": 12447 + }, + { + "epoch": 0.685123011723265, + "grad_norm": 0.6693943738937378, + "learning_rate": 7.398475412353643e-06, + "loss": 0.7134, + "step": 12448 + }, + { + "epoch": 0.6851780505256205, + "grad_norm": 0.6908173561096191, + "learning_rate": 7.398095064449041e-06, + "loss": 0.8054, + "step": 12449 + }, + { + "epoch": 0.6852330893279762, + "grad_norm": 0.6207892894744873, + "learning_rate": 7.397714698520999e-06, + "loss": 0.5789, + "step": 12450 + }, + { + "epoch": 0.6852881281303319, + "grad_norm": 0.8367832899093628, + "learning_rate": 7.397334314572374e-06, + "loss": 0.8186, + "step": 12451 + }, + { + "epoch": 0.6853431669326876, + "grad_norm": 0.7005738615989685, + "learning_rate": 7.396953912606026e-06, + "loss": 0.8177, + "step": 12452 + }, + { + "epoch": 0.6853982057350432, + "grad_norm": 0.7189906239509583, + "learning_rate": 7.396573492624814e-06, + "loss": 0.8387, + "step": 12453 + }, + { + "epoch": 0.6854532445373989, + "grad_norm": 1.040576457977295, + "learning_rate": 7.3961930546315995e-06, + "loss": 0.7165, + "step": 12454 + }, + { + "epoch": 0.6855082833397546, + "grad_norm": 0.6417170166969299, + "learning_rate": 7.3958125986292385e-06, + "loss": 0.6671, + "step": 12455 + }, + { + "epoch": 0.6855633221421102, + "grad_norm": 0.6443242430686951, + "learning_rate": 7.395432124620589e-06, + "loss": 0.6995, + "step": 12456 + }, + { + "epoch": 0.6856183609444658, + "grad_norm": 0.5764951705932617, + "learning_rate": 7.395051632608516e-06, + "loss": 0.6088, + "step": 12457 + }, + { + "epoch": 0.6856733997468215, + "grad_norm": 0.6193686127662659, + "learning_rate": 7.394671122595873e-06, + "loss": 0.7283, + "step": 12458 + }, + { + "epoch": 0.6857284385491772, + "grad_norm": 0.6773817539215088, + "learning_rate": 7.394290594585525e-06, + "loss": 0.8204, + "step": 12459 + }, + { + "epoch": 0.6857834773515328, + "grad_norm": 0.7906570434570312, + "learning_rate": 7.393910048580328e-06, + "loss": 0.7057, + "step": 12460 + }, + { + "epoch": 0.6858385161538885, + "grad_norm": 0.7544124126434326, + "learning_rate": 7.393529484583145e-06, + "loss": 0.8053, + "step": 12461 + }, + { + "epoch": 0.6858935549562442, + "grad_norm": 0.6878008842468262, + "learning_rate": 7.3931489025968365e-06, + "loss": 0.6972, + "step": 12462 + }, + { + "epoch": 0.6859485937585998, + "grad_norm": 0.6734861731529236, + "learning_rate": 7.392768302624259e-06, + "loss": 0.7921, + "step": 12463 + }, + { + "epoch": 0.6860036325609554, + "grad_norm": 0.6845618486404419, + "learning_rate": 7.392387684668276e-06, + "loss": 0.7461, + "step": 12464 + }, + { + "epoch": 0.6860586713633111, + "grad_norm": 0.6362663507461548, + "learning_rate": 7.392007048731748e-06, + "loss": 0.7108, + "step": 12465 + }, + { + "epoch": 0.6861137101656668, + "grad_norm": 0.7441046237945557, + "learning_rate": 7.391626394817537e-06, + "loss": 0.6944, + "step": 12466 + }, + { + "epoch": 0.6861687489680225, + "grad_norm": 1.0933935642242432, + "learning_rate": 7.391245722928501e-06, + "loss": 0.7744, + "step": 12467 + }, + { + "epoch": 0.6862237877703781, + "grad_norm": 0.6531348824501038, + "learning_rate": 7.3908650330675e-06, + "loss": 0.6772, + "step": 12468 + }, + { + "epoch": 0.6862788265727338, + "grad_norm": 0.7533715963363647, + "learning_rate": 7.390484325237399e-06, + "loss": 0.7385, + "step": 12469 + }, + { + "epoch": 0.6863338653750894, + "grad_norm": 0.618679940700531, + "learning_rate": 7.390103599441058e-06, + "loss": 0.6053, + "step": 12470 + }, + { + "epoch": 0.6863889041774451, + "grad_norm": 0.7102347612380981, + "learning_rate": 7.389722855681338e-06, + "loss": 0.7246, + "step": 12471 + }, + { + "epoch": 0.6864439429798007, + "grad_norm": 0.8545061945915222, + "learning_rate": 7.3893420939611e-06, + "loss": 0.7386, + "step": 12472 + }, + { + "epoch": 0.6864989817821564, + "grad_norm": 0.6298168897628784, + "learning_rate": 7.388961314283207e-06, + "loss": 0.6573, + "step": 12473 + }, + { + "epoch": 0.6865540205845121, + "grad_norm": 0.6909272074699402, + "learning_rate": 7.388580516650521e-06, + "loss": 0.7973, + "step": 12474 + }, + { + "epoch": 0.6866090593868678, + "grad_norm": 0.6782366037368774, + "learning_rate": 7.388199701065904e-06, + "loss": 0.7437, + "step": 12475 + }, + { + "epoch": 0.6866640981892234, + "grad_norm": 0.6826187372207642, + "learning_rate": 7.387818867532213e-06, + "loss": 0.6254, + "step": 12476 + }, + { + "epoch": 0.686719136991579, + "grad_norm": 0.7471422553062439, + "learning_rate": 7.387438016052318e-06, + "loss": 0.8668, + "step": 12477 + }, + { + "epoch": 0.6867741757939347, + "grad_norm": 0.7987646460533142, + "learning_rate": 7.38705714662908e-06, + "loss": 0.6759, + "step": 12478 + }, + { + "epoch": 0.6868292145962904, + "grad_norm": 0.7318877577781677, + "learning_rate": 7.386676259265356e-06, + "loss": 0.7167, + "step": 12479 + }, + { + "epoch": 0.686884253398646, + "grad_norm": 0.6655439138412476, + "learning_rate": 7.386295353964013e-06, + "loss": 0.7184, + "step": 12480 + }, + { + "epoch": 0.6869392922010017, + "grad_norm": 0.7323878407478333, + "learning_rate": 7.385914430727912e-06, + "loss": 0.7562, + "step": 12481 + }, + { + "epoch": 0.6869943310033574, + "grad_norm": 0.7813006639480591, + "learning_rate": 7.385533489559918e-06, + "loss": 0.7665, + "step": 12482 + }, + { + "epoch": 0.6870493698057131, + "grad_norm": 0.6889718770980835, + "learning_rate": 7.385152530462894e-06, + "loss": 0.6587, + "step": 12483 + }, + { + "epoch": 0.6871044086080687, + "grad_norm": 0.6930332183837891, + "learning_rate": 7.384771553439698e-06, + "loss": 0.8244, + "step": 12484 + }, + { + "epoch": 0.6871594474104243, + "grad_norm": 0.8294679522514343, + "learning_rate": 7.384390558493201e-06, + "loss": 0.6977, + "step": 12485 + }, + { + "epoch": 0.68721448621278, + "grad_norm": 0.7235204577445984, + "learning_rate": 7.384009545626262e-06, + "loss": 0.7946, + "step": 12486 + }, + { + "epoch": 0.6872695250151357, + "grad_norm": 0.6346727609634399, + "learning_rate": 7.3836285148417456e-06, + "loss": 0.6109, + "step": 12487 + }, + { + "epoch": 0.6873245638174913, + "grad_norm": 0.7168872356414795, + "learning_rate": 7.383247466142513e-06, + "loss": 0.7485, + "step": 12488 + }, + { + "epoch": 0.687379602619847, + "grad_norm": 0.6511938571929932, + "learning_rate": 7.382866399531434e-06, + "loss": 0.8048, + "step": 12489 + }, + { + "epoch": 0.6874346414222027, + "grad_norm": 0.7569704651832581, + "learning_rate": 7.3824853150113674e-06, + "loss": 0.8017, + "step": 12490 + }, + { + "epoch": 0.6874896802245584, + "grad_norm": 0.7708210945129395, + "learning_rate": 7.382104212585178e-06, + "loss": 0.7258, + "step": 12491 + }, + { + "epoch": 0.6875447190269139, + "grad_norm": 0.709702730178833, + "learning_rate": 7.381723092255731e-06, + "loss": 0.7707, + "step": 12492 + }, + { + "epoch": 0.6875997578292696, + "grad_norm": 0.6683183908462524, + "learning_rate": 7.381341954025892e-06, + "loss": 0.702, + "step": 12493 + }, + { + "epoch": 0.6876547966316253, + "grad_norm": 0.7639274597167969, + "learning_rate": 7.380960797898524e-06, + "loss": 0.7027, + "step": 12494 + }, + { + "epoch": 0.687709835433981, + "grad_norm": 0.6735698580741882, + "learning_rate": 7.380579623876492e-06, + "loss": 0.7124, + "step": 12495 + }, + { + "epoch": 0.6877648742363366, + "grad_norm": 0.6635340452194214, + "learning_rate": 7.38019843196266e-06, + "loss": 0.6968, + "step": 12496 + }, + { + "epoch": 0.6878199130386923, + "grad_norm": 0.7459729313850403, + "learning_rate": 7.379817222159895e-06, + "loss": 0.7629, + "step": 12497 + }, + { + "epoch": 0.687874951841048, + "grad_norm": 0.7408778667449951, + "learning_rate": 7.37943599447106e-06, + "loss": 0.8327, + "step": 12498 + }, + { + "epoch": 0.6879299906434037, + "grad_norm": 0.659736156463623, + "learning_rate": 7.379054748899021e-06, + "loss": 0.6746, + "step": 12499 + }, + { + "epoch": 0.6879850294457592, + "grad_norm": 0.7429264783859253, + "learning_rate": 7.3786734854466435e-06, + "loss": 0.8555, + "step": 12500 + }, + { + "epoch": 0.6880400682481149, + "grad_norm": 0.7492697834968567, + "learning_rate": 7.378292204116793e-06, + "loss": 0.7825, + "step": 12501 + }, + { + "epoch": 0.6880951070504706, + "grad_norm": 0.6664871573448181, + "learning_rate": 7.377910904912336e-06, + "loss": 0.7343, + "step": 12502 + }, + { + "epoch": 0.6881501458528262, + "grad_norm": 0.8010555505752563, + "learning_rate": 7.377529587836135e-06, + "loss": 0.6789, + "step": 12503 + }, + { + "epoch": 0.6882051846551819, + "grad_norm": 0.6339166164398193, + "learning_rate": 7.3771482528910585e-06, + "loss": 0.7471, + "step": 12504 + }, + { + "epoch": 0.6882602234575376, + "grad_norm": 0.6750906109809875, + "learning_rate": 7.376766900079973e-06, + "loss": 0.665, + "step": 12505 + }, + { + "epoch": 0.6883152622598933, + "grad_norm": 0.6440090537071228, + "learning_rate": 7.376385529405743e-06, + "loss": 0.6804, + "step": 12506 + }, + { + "epoch": 0.6883703010622488, + "grad_norm": 0.7159061431884766, + "learning_rate": 7.376004140871236e-06, + "loss": 0.7524, + "step": 12507 + }, + { + "epoch": 0.6884253398646045, + "grad_norm": 0.7551491260528564, + "learning_rate": 7.375622734479316e-06, + "loss": 0.891, + "step": 12508 + }, + { + "epoch": 0.6884803786669602, + "grad_norm": 0.6584289073944092, + "learning_rate": 7.375241310232854e-06, + "loss": 0.7313, + "step": 12509 + }, + { + "epoch": 0.6885354174693159, + "grad_norm": 0.7616147398948669, + "learning_rate": 7.374859868134713e-06, + "loss": 0.8351, + "step": 12510 + }, + { + "epoch": 0.6885904562716715, + "grad_norm": 0.669541597366333, + "learning_rate": 7.374478408187761e-06, + "loss": 0.6836, + "step": 12511 + }, + { + "epoch": 0.6886454950740272, + "grad_norm": 0.6483158469200134, + "learning_rate": 7.374096930394864e-06, + "loss": 0.6909, + "step": 12512 + }, + { + "epoch": 0.6887005338763829, + "grad_norm": 0.7079604864120483, + "learning_rate": 7.3737154347588925e-06, + "loss": 0.7151, + "step": 12513 + }, + { + "epoch": 0.6887555726787385, + "grad_norm": 0.6805073618888855, + "learning_rate": 7.373333921282709e-06, + "loss": 0.7761, + "step": 12514 + }, + { + "epoch": 0.6888106114810941, + "grad_norm": 0.757008969783783, + "learning_rate": 7.372952389969183e-06, + "loss": 0.7249, + "step": 12515 + }, + { + "epoch": 0.6888656502834498, + "grad_norm": 0.6990587711334229, + "learning_rate": 7.372570840821183e-06, + "loss": 0.7463, + "step": 12516 + }, + { + "epoch": 0.6889206890858055, + "grad_norm": 0.7405683398246765, + "learning_rate": 7.3721892738415745e-06, + "loss": 0.8039, + "step": 12517 + }, + { + "epoch": 0.6889757278881612, + "grad_norm": 0.6736571192741394, + "learning_rate": 7.371807689033228e-06, + "loss": 0.7084, + "step": 12518 + }, + { + "epoch": 0.6890307666905168, + "grad_norm": 0.752955436706543, + "learning_rate": 7.3714260863990095e-06, + "loss": 0.7951, + "step": 12519 + }, + { + "epoch": 0.6890858054928725, + "grad_norm": 0.6810917258262634, + "learning_rate": 7.3710444659417855e-06, + "loss": 0.7884, + "step": 12520 + }, + { + "epoch": 0.6891408442952281, + "grad_norm": 0.727500855922699, + "learning_rate": 7.370662827664427e-06, + "loss": 0.7617, + "step": 12521 + }, + { + "epoch": 0.6891958830975838, + "grad_norm": 0.6739845871925354, + "learning_rate": 7.3702811715698016e-06, + "loss": 0.6831, + "step": 12522 + }, + { + "epoch": 0.6892509218999394, + "grad_norm": 0.850913941860199, + "learning_rate": 7.369899497660779e-06, + "loss": 0.7658, + "step": 12523 + }, + { + "epoch": 0.6893059607022951, + "grad_norm": 0.7352884411811829, + "learning_rate": 7.369517805940223e-06, + "loss": 0.7748, + "step": 12524 + }, + { + "epoch": 0.6893609995046508, + "grad_norm": 0.6702300310134888, + "learning_rate": 7.369136096411008e-06, + "loss": 0.7557, + "step": 12525 + }, + { + "epoch": 0.6894160383070065, + "grad_norm": 0.7117186784744263, + "learning_rate": 7.368754369075999e-06, + "loss": 0.8147, + "step": 12526 + }, + { + "epoch": 0.6894710771093621, + "grad_norm": 0.6896687746047974, + "learning_rate": 7.368372623938067e-06, + "loss": 0.7753, + "step": 12527 + }, + { + "epoch": 0.6895261159117178, + "grad_norm": 0.669207751750946, + "learning_rate": 7.367990861000078e-06, + "loss": 0.739, + "step": 12528 + }, + { + "epoch": 0.6895811547140734, + "grad_norm": 0.7014279961585999, + "learning_rate": 7.367609080264906e-06, + "loss": 0.7712, + "step": 12529 + }, + { + "epoch": 0.6896361935164291, + "grad_norm": 1.0029237270355225, + "learning_rate": 7.367227281735418e-06, + "loss": 0.7641, + "step": 12530 + }, + { + "epoch": 0.6896912323187847, + "grad_norm": 0.6342340707778931, + "learning_rate": 7.3668454654144824e-06, + "loss": 0.7572, + "step": 12531 + }, + { + "epoch": 0.6897462711211404, + "grad_norm": 0.7475802302360535, + "learning_rate": 7.3664636313049696e-06, + "loss": 0.7969, + "step": 12532 + }, + { + "epoch": 0.6898013099234961, + "grad_norm": 0.7478888630867004, + "learning_rate": 7.36608177940975e-06, + "loss": 0.8299, + "step": 12533 + }, + { + "epoch": 0.6898563487258518, + "grad_norm": 0.7017174363136292, + "learning_rate": 7.365699909731694e-06, + "loss": 0.6608, + "step": 12534 + }, + { + "epoch": 0.6899113875282074, + "grad_norm": 0.7259606122970581, + "learning_rate": 7.3653180222736695e-06, + "loss": 0.7088, + "step": 12535 + }, + { + "epoch": 0.689966426330563, + "grad_norm": 0.7049521207809448, + "learning_rate": 7.364936117038548e-06, + "loss": 0.8177, + "step": 12536 + }, + { + "epoch": 0.6900214651329187, + "grad_norm": 0.6557304263114929, + "learning_rate": 7.364554194029201e-06, + "loss": 0.73, + "step": 12537 + }, + { + "epoch": 0.6900765039352744, + "grad_norm": 0.704140305519104, + "learning_rate": 7.364172253248497e-06, + "loss": 0.7671, + "step": 12538 + }, + { + "epoch": 0.69013154273763, + "grad_norm": 0.6879541873931885, + "learning_rate": 7.3637902946993064e-06, + "loss": 0.6707, + "step": 12539 + }, + { + "epoch": 0.6901865815399857, + "grad_norm": 0.7715931534767151, + "learning_rate": 7.363408318384501e-06, + "loss": 0.7494, + "step": 12540 + }, + { + "epoch": 0.6902416203423414, + "grad_norm": 0.7890990972518921, + "learning_rate": 7.363026324306952e-06, + "loss": 0.7499, + "step": 12541 + }, + { + "epoch": 0.6902966591446971, + "grad_norm": 0.7177792191505432, + "learning_rate": 7.362644312469529e-06, + "loss": 0.8053, + "step": 12542 + }, + { + "epoch": 0.6903516979470526, + "grad_norm": 0.7434332370758057, + "learning_rate": 7.3622622828751044e-06, + "loss": 0.7371, + "step": 12543 + }, + { + "epoch": 0.6904067367494083, + "grad_norm": 0.5836912989616394, + "learning_rate": 7.361880235526547e-06, + "loss": 0.6681, + "step": 12544 + }, + { + "epoch": 0.690461775551764, + "grad_norm": 0.6814625263214111, + "learning_rate": 7.3614981704267315e-06, + "loss": 0.7408, + "step": 12545 + }, + { + "epoch": 0.6905168143541196, + "grad_norm": 0.6524162292480469, + "learning_rate": 7.361116087578528e-06, + "loss": 0.6788, + "step": 12546 + }, + { + "epoch": 0.6905718531564753, + "grad_norm": 0.6614788174629211, + "learning_rate": 7.360733986984808e-06, + "loss": 0.75, + "step": 12547 + }, + { + "epoch": 0.690626891958831, + "grad_norm": 1.035152792930603, + "learning_rate": 7.360351868648442e-06, + "loss": 0.7181, + "step": 12548 + }, + { + "epoch": 0.6906819307611867, + "grad_norm": 0.7525657415390015, + "learning_rate": 7.359969732572305e-06, + "loss": 0.8149, + "step": 12549 + }, + { + "epoch": 0.6907369695635422, + "grad_norm": 0.8323431015014648, + "learning_rate": 7.359587578759267e-06, + "loss": 0.6908, + "step": 12550 + }, + { + "epoch": 0.6907920083658979, + "grad_norm": 0.7551344633102417, + "learning_rate": 7.3592054072122e-06, + "loss": 0.794, + "step": 12551 + }, + { + "epoch": 0.6908470471682536, + "grad_norm": 0.5937384366989136, + "learning_rate": 7.358823217933977e-06, + "loss": 0.6532, + "step": 12552 + }, + { + "epoch": 0.6909020859706093, + "grad_norm": 1.5515329837799072, + "learning_rate": 7.358441010927468e-06, + "loss": 0.7003, + "step": 12553 + }, + { + "epoch": 0.6909571247729649, + "grad_norm": 0.6838175654411316, + "learning_rate": 7.3580587861955495e-06, + "loss": 0.7184, + "step": 12554 + }, + { + "epoch": 0.6910121635753206, + "grad_norm": 0.7055354714393616, + "learning_rate": 7.357676543741092e-06, + "loss": 0.8372, + "step": 12555 + }, + { + "epoch": 0.6910672023776763, + "grad_norm": 0.8683249950408936, + "learning_rate": 7.3572942835669695e-06, + "loss": 0.7594, + "step": 12556 + }, + { + "epoch": 0.691122241180032, + "grad_norm": 0.8586179614067078, + "learning_rate": 7.3569120056760535e-06, + "loss": 0.8422, + "step": 12557 + }, + { + "epoch": 0.6911772799823875, + "grad_norm": 0.692132830619812, + "learning_rate": 7.356529710071217e-06, + "loss": 0.7872, + "step": 12558 + }, + { + "epoch": 0.6912323187847432, + "grad_norm": 0.7342404723167419, + "learning_rate": 7.356147396755335e-06, + "loss": 0.6908, + "step": 12559 + }, + { + "epoch": 0.6912873575870989, + "grad_norm": 0.6941357254981995, + "learning_rate": 7.35576506573128e-06, + "loss": 0.608, + "step": 12560 + }, + { + "epoch": 0.6913423963894546, + "grad_norm": 0.648225724697113, + "learning_rate": 7.355382717001925e-06, + "loss": 0.6923, + "step": 12561 + }, + { + "epoch": 0.6913974351918102, + "grad_norm": 0.6735422015190125, + "learning_rate": 7.355000350570144e-06, + "loss": 0.7502, + "step": 12562 + }, + { + "epoch": 0.6914524739941659, + "grad_norm": 0.8507662415504456, + "learning_rate": 7.3546179664388105e-06, + "loss": 0.7883, + "step": 12563 + }, + { + "epoch": 0.6915075127965216, + "grad_norm": 0.7287268042564392, + "learning_rate": 7.3542355646108e-06, + "loss": 0.8687, + "step": 12564 + }, + { + "epoch": 0.6915625515988773, + "grad_norm": 0.6085666418075562, + "learning_rate": 7.353853145088983e-06, + "loss": 0.6675, + "step": 12565 + }, + { + "epoch": 0.6916175904012328, + "grad_norm": 0.727668046951294, + "learning_rate": 7.353470707876237e-06, + "loss": 0.8591, + "step": 12566 + }, + { + "epoch": 0.6916726292035885, + "grad_norm": 0.724846601486206, + "learning_rate": 7.353088252975436e-06, + "loss": 0.8501, + "step": 12567 + }, + { + "epoch": 0.6917276680059442, + "grad_norm": 0.6801046133041382, + "learning_rate": 7.352705780389452e-06, + "loss": 0.7637, + "step": 12568 + }, + { + "epoch": 0.6917827068082999, + "grad_norm": 0.680496335029602, + "learning_rate": 7.352323290121161e-06, + "loss": 0.7308, + "step": 12569 + }, + { + "epoch": 0.6918377456106555, + "grad_norm": 0.7143607139587402, + "learning_rate": 7.351940782173439e-06, + "loss": 0.7494, + "step": 12570 + }, + { + "epoch": 0.6918927844130112, + "grad_norm": 0.679755687713623, + "learning_rate": 7.351558256549158e-06, + "loss": 0.7731, + "step": 12571 + }, + { + "epoch": 0.6919478232153669, + "grad_norm": 0.6626351475715637, + "learning_rate": 7.351175713251197e-06, + "loss": 0.8593, + "step": 12572 + }, + { + "epoch": 0.6920028620177225, + "grad_norm": 0.6830954551696777, + "learning_rate": 7.350793152282427e-06, + "loss": 0.6327, + "step": 12573 + }, + { + "epoch": 0.6920579008200781, + "grad_norm": 0.653810977935791, + "learning_rate": 7.350410573645726e-06, + "loss": 0.7341, + "step": 12574 + }, + { + "epoch": 0.6921129396224338, + "grad_norm": 0.6939566731452942, + "learning_rate": 7.3500279773439675e-06, + "loss": 0.7823, + "step": 12575 + }, + { + "epoch": 0.6921679784247895, + "grad_norm": 0.8212422728538513, + "learning_rate": 7.349645363380029e-06, + "loss": 0.6388, + "step": 12576 + }, + { + "epoch": 0.6922230172271452, + "grad_norm": 0.7703338265419006, + "learning_rate": 7.349262731756783e-06, + "loss": 0.7476, + "step": 12577 + }, + { + "epoch": 0.6922780560295008, + "grad_norm": 0.6710889935493469, + "learning_rate": 7.348880082477108e-06, + "loss": 0.7869, + "step": 12578 + }, + { + "epoch": 0.6923330948318565, + "grad_norm": 0.7384413480758667, + "learning_rate": 7.3484974155438795e-06, + "loss": 0.6628, + "step": 12579 + }, + { + "epoch": 0.6923881336342121, + "grad_norm": 0.7628176212310791, + "learning_rate": 7.348114730959973e-06, + "loss": 0.7599, + "step": 12580 + }, + { + "epoch": 0.6924431724365678, + "grad_norm": 0.683885931968689, + "learning_rate": 7.347732028728264e-06, + "loss": 0.7134, + "step": 12581 + }, + { + "epoch": 0.6924982112389234, + "grad_norm": 0.6710503697395325, + "learning_rate": 7.34734930885163e-06, + "loss": 0.7147, + "step": 12582 + }, + { + "epoch": 0.6925532500412791, + "grad_norm": 0.6984537243843079, + "learning_rate": 7.346966571332947e-06, + "loss": 0.7517, + "step": 12583 + }, + { + "epoch": 0.6926082888436348, + "grad_norm": 0.7563193440437317, + "learning_rate": 7.346583816175092e-06, + "loss": 0.7971, + "step": 12584 + }, + { + "epoch": 0.6926633276459905, + "grad_norm": 0.8407838940620422, + "learning_rate": 7.346201043380941e-06, + "loss": 0.8227, + "step": 12585 + }, + { + "epoch": 0.6927183664483461, + "grad_norm": 0.673098623752594, + "learning_rate": 7.345818252953369e-06, + "loss": 0.7514, + "step": 12586 + }, + { + "epoch": 0.6927734052507017, + "grad_norm": 0.6452111005783081, + "learning_rate": 7.345435444895257e-06, + "loss": 0.7201, + "step": 12587 + }, + { + "epoch": 0.6928284440530574, + "grad_norm": 0.8728383779525757, + "learning_rate": 7.345052619209481e-06, + "loss": 0.7452, + "step": 12588 + }, + { + "epoch": 0.692883482855413, + "grad_norm": 0.7032049298286438, + "learning_rate": 7.344669775898914e-06, + "loss": 0.8885, + "step": 12589 + }, + { + "epoch": 0.6929385216577687, + "grad_norm": 0.7744605541229248, + "learning_rate": 7.344286914966438e-06, + "loss": 0.8048, + "step": 12590 + }, + { + "epoch": 0.6929935604601244, + "grad_norm": 0.7334163784980774, + "learning_rate": 7.343904036414931e-06, + "loss": 0.8502, + "step": 12591 + }, + { + "epoch": 0.6930485992624801, + "grad_norm": 0.6684108376502991, + "learning_rate": 7.343521140247266e-06, + "loss": 0.8264, + "step": 12592 + }, + { + "epoch": 0.6931036380648357, + "grad_norm": 0.6192718744277954, + "learning_rate": 7.343138226466324e-06, + "loss": 0.6625, + "step": 12593 + }, + { + "epoch": 0.6931586768671913, + "grad_norm": 0.6410724520683289, + "learning_rate": 7.342755295074984e-06, + "loss": 0.717, + "step": 12594 + }, + { + "epoch": 0.693213715669547, + "grad_norm": 0.6854361891746521, + "learning_rate": 7.342372346076121e-06, + "loss": 0.7246, + "step": 12595 + }, + { + "epoch": 0.6932687544719027, + "grad_norm": 0.6920250058174133, + "learning_rate": 7.341989379472614e-06, + "loss": 0.7414, + "step": 12596 + }, + { + "epoch": 0.6933237932742583, + "grad_norm": 0.6545842885971069, + "learning_rate": 7.341606395267342e-06, + "loss": 0.7731, + "step": 12597 + }, + { + "epoch": 0.693378832076614, + "grad_norm": 0.6879072785377502, + "learning_rate": 7.341223393463184e-06, + "loss": 0.7272, + "step": 12598 + }, + { + "epoch": 0.6934338708789697, + "grad_norm": 0.7460979223251343, + "learning_rate": 7.340840374063018e-06, + "loss": 0.771, + "step": 12599 + }, + { + "epoch": 0.6934889096813254, + "grad_norm": 0.7836858630180359, + "learning_rate": 7.340457337069722e-06, + "loss": 0.846, + "step": 12600 + }, + { + "epoch": 0.693543948483681, + "grad_norm": 0.958403468132019, + "learning_rate": 7.340074282486174e-06, + "loss": 0.8913, + "step": 12601 + }, + { + "epoch": 0.6935989872860366, + "grad_norm": 0.6614813208580017, + "learning_rate": 7.339691210315254e-06, + "loss": 0.7129, + "step": 12602 + }, + { + "epoch": 0.6936540260883923, + "grad_norm": 0.7303252816200256, + "learning_rate": 7.339308120559843e-06, + "loss": 0.8395, + "step": 12603 + }, + { + "epoch": 0.693709064890748, + "grad_norm": 0.7341620922088623, + "learning_rate": 7.338925013222817e-06, + "loss": 0.8341, + "step": 12604 + }, + { + "epoch": 0.6937641036931036, + "grad_norm": 0.7077179551124573, + "learning_rate": 7.338541888307056e-06, + "loss": 0.7813, + "step": 12605 + }, + { + "epoch": 0.6938191424954593, + "grad_norm": 0.6654969453811646, + "learning_rate": 7.338158745815441e-06, + "loss": 0.7337, + "step": 12606 + }, + { + "epoch": 0.693874181297815, + "grad_norm": 0.6637474894523621, + "learning_rate": 7.337775585750852e-06, + "loss": 0.8197, + "step": 12607 + }, + { + "epoch": 0.6939292201001707, + "grad_norm": 0.654712975025177, + "learning_rate": 7.337392408116166e-06, + "loss": 0.6991, + "step": 12608 + }, + { + "epoch": 0.6939842589025262, + "grad_norm": 0.6698346138000488, + "learning_rate": 7.337009212914265e-06, + "loss": 0.7991, + "step": 12609 + }, + { + "epoch": 0.6940392977048819, + "grad_norm": 0.9616294503211975, + "learning_rate": 7.336626000148028e-06, + "loss": 0.7326, + "step": 12610 + }, + { + "epoch": 0.6940943365072376, + "grad_norm": 0.7749543786048889, + "learning_rate": 7.336242769820335e-06, + "loss": 0.8015, + "step": 12611 + }, + { + "epoch": 0.6941493753095933, + "grad_norm": 0.7263140678405762, + "learning_rate": 7.335859521934068e-06, + "loss": 0.7538, + "step": 12612 + }, + { + "epoch": 0.6942044141119489, + "grad_norm": 0.6383689641952515, + "learning_rate": 7.335476256492105e-06, + "loss": 0.7611, + "step": 12613 + }, + { + "epoch": 0.6942594529143046, + "grad_norm": 0.7464908957481384, + "learning_rate": 7.335092973497326e-06, + "loss": 0.7904, + "step": 12614 + }, + { + "epoch": 0.6943144917166603, + "grad_norm": 1.114864468574524, + "learning_rate": 7.334709672952615e-06, + "loss": 0.8518, + "step": 12615 + }, + { + "epoch": 0.694369530519016, + "grad_norm": 0.6712734699249268, + "learning_rate": 7.334326354860852e-06, + "loss": 0.7431, + "step": 12616 + }, + { + "epoch": 0.6944245693213715, + "grad_norm": 0.7559850811958313, + "learning_rate": 7.3339430192249166e-06, + "loss": 0.7556, + "step": 12617 + }, + { + "epoch": 0.6944796081237272, + "grad_norm": 0.7262033224105835, + "learning_rate": 7.333559666047689e-06, + "loss": 0.7624, + "step": 12618 + }, + { + "epoch": 0.6945346469260829, + "grad_norm": 0.6428695917129517, + "learning_rate": 7.333176295332053e-06, + "loss": 0.6894, + "step": 12619 + }, + { + "epoch": 0.6945896857284386, + "grad_norm": 0.7353672385215759, + "learning_rate": 7.3327929070808875e-06, + "loss": 0.7611, + "step": 12620 + }, + { + "epoch": 0.6946447245307942, + "grad_norm": 0.7063810229301453, + "learning_rate": 7.332409501297076e-06, + "loss": 0.7428, + "step": 12621 + }, + { + "epoch": 0.6946997633331499, + "grad_norm": 0.6552421450614929, + "learning_rate": 7.332026077983498e-06, + "loss": 0.7046, + "step": 12622 + }, + { + "epoch": 0.6947548021355056, + "grad_norm": 0.8843327760696411, + "learning_rate": 7.331642637143037e-06, + "loss": 0.6952, + "step": 12623 + }, + { + "epoch": 0.6948098409378612, + "grad_norm": 0.7279102802276611, + "learning_rate": 7.331259178778574e-06, + "loss": 0.7911, + "step": 12624 + }, + { + "epoch": 0.6948648797402168, + "grad_norm": 0.6585525870323181, + "learning_rate": 7.33087570289299e-06, + "loss": 0.7684, + "step": 12625 + }, + { + "epoch": 0.6949199185425725, + "grad_norm": 0.663185715675354, + "learning_rate": 7.3304922094891695e-06, + "loss": 0.6753, + "step": 12626 + }, + { + "epoch": 0.6949749573449282, + "grad_norm": 0.652765691280365, + "learning_rate": 7.330108698569993e-06, + "loss": 0.7333, + "step": 12627 + }, + { + "epoch": 0.6950299961472839, + "grad_norm": 0.7781688570976257, + "learning_rate": 7.329725170138343e-06, + "loss": 0.7312, + "step": 12628 + }, + { + "epoch": 0.6950850349496395, + "grad_norm": 0.6798241138458252, + "learning_rate": 7.329341624197102e-06, + "loss": 0.7747, + "step": 12629 + }, + { + "epoch": 0.6951400737519952, + "grad_norm": 0.7588373422622681, + "learning_rate": 7.328958060749153e-06, + "loss": 0.8535, + "step": 12630 + }, + { + "epoch": 0.6951951125543508, + "grad_norm": 0.8833348155021667, + "learning_rate": 7.328574479797379e-06, + "loss": 0.8345, + "step": 12631 + }, + { + "epoch": 0.6952501513567064, + "grad_norm": 0.799454927444458, + "learning_rate": 7.328190881344663e-06, + "loss": 0.7571, + "step": 12632 + }, + { + "epoch": 0.6953051901590621, + "grad_norm": 0.8030340671539307, + "learning_rate": 7.327807265393887e-06, + "loss": 0.7426, + "step": 12633 + }, + { + "epoch": 0.6953602289614178, + "grad_norm": 0.6246228218078613, + "learning_rate": 7.327423631947934e-06, + "loss": 0.6712, + "step": 12634 + }, + { + "epoch": 0.6954152677637735, + "grad_norm": 0.7203500866889954, + "learning_rate": 7.32703998100969e-06, + "loss": 0.8315, + "step": 12635 + }, + { + "epoch": 0.6954703065661291, + "grad_norm": 0.6128239035606384, + "learning_rate": 7.326656312582035e-06, + "loss": 0.6788, + "step": 12636 + }, + { + "epoch": 0.6955253453684848, + "grad_norm": 0.8052619695663452, + "learning_rate": 7.326272626667852e-06, + "loss": 0.8076, + "step": 12637 + }, + { + "epoch": 0.6955803841708404, + "grad_norm": 0.9128470420837402, + "learning_rate": 7.325888923270029e-06, + "loss": 0.7135, + "step": 12638 + }, + { + "epoch": 0.6956354229731961, + "grad_norm": 0.6815299391746521, + "learning_rate": 7.325505202391447e-06, + "loss": 0.7756, + "step": 12639 + }, + { + "epoch": 0.6956904617755517, + "grad_norm": 0.6278733611106873, + "learning_rate": 7.325121464034991e-06, + "loss": 0.6583, + "step": 12640 + }, + { + "epoch": 0.6957455005779074, + "grad_norm": 0.7161649465560913, + "learning_rate": 7.324737708203543e-06, + "loss": 0.7106, + "step": 12641 + }, + { + "epoch": 0.6958005393802631, + "grad_norm": 0.6827715635299683, + "learning_rate": 7.324353934899989e-06, + "loss": 0.7988, + "step": 12642 + }, + { + "epoch": 0.6958555781826188, + "grad_norm": 0.9999695420265198, + "learning_rate": 7.323970144127215e-06, + "loss": 0.8222, + "step": 12643 + }, + { + "epoch": 0.6959106169849744, + "grad_norm": 0.8048173785209656, + "learning_rate": 7.323586335888102e-06, + "loss": 0.7157, + "step": 12644 + }, + { + "epoch": 0.69596565578733, + "grad_norm": 0.7403637170791626, + "learning_rate": 7.323202510185536e-06, + "loss": 0.7516, + "step": 12645 + }, + { + "epoch": 0.6960206945896857, + "grad_norm": 0.6660793423652649, + "learning_rate": 7.322818667022402e-06, + "loss": 0.7081, + "step": 12646 + }, + { + "epoch": 0.6960757333920414, + "grad_norm": 0.713985800743103, + "learning_rate": 7.322434806401585e-06, + "loss": 0.7682, + "step": 12647 + }, + { + "epoch": 0.696130772194397, + "grad_norm": 0.739253044128418, + "learning_rate": 7.322050928325969e-06, + "loss": 0.838, + "step": 12648 + }, + { + "epoch": 0.6961858109967527, + "grad_norm": 0.8350489735603333, + "learning_rate": 7.32166703279844e-06, + "loss": 0.7627, + "step": 12649 + }, + { + "epoch": 0.6962408497991084, + "grad_norm": 0.580456018447876, + "learning_rate": 7.321283119821883e-06, + "loss": 0.6248, + "step": 12650 + }, + { + "epoch": 0.6962958886014641, + "grad_norm": 0.8619480729103088, + "learning_rate": 7.320899189399183e-06, + "loss": 0.848, + "step": 12651 + }, + { + "epoch": 0.6963509274038197, + "grad_norm": 0.6201381087303162, + "learning_rate": 7.320515241533227e-06, + "loss": 0.6506, + "step": 12652 + }, + { + "epoch": 0.6964059662061753, + "grad_norm": 0.6956773400306702, + "learning_rate": 7.320131276226898e-06, + "loss": 0.7561, + "step": 12653 + }, + { + "epoch": 0.696461005008531, + "grad_norm": 0.6382080912590027, + "learning_rate": 7.319747293483085e-06, + "loss": 0.6462, + "step": 12654 + }, + { + "epoch": 0.6965160438108867, + "grad_norm": 0.7288708686828613, + "learning_rate": 7.319363293304672e-06, + "loss": 0.7907, + "step": 12655 + }, + { + "epoch": 0.6965710826132423, + "grad_norm": 0.6280390024185181, + "learning_rate": 7.318979275694546e-06, + "loss": 0.6882, + "step": 12656 + }, + { + "epoch": 0.696626121415598, + "grad_norm": 0.7260308861732483, + "learning_rate": 7.31859524065559e-06, + "loss": 0.756, + "step": 12657 + }, + { + "epoch": 0.6966811602179537, + "grad_norm": 0.6715009212493896, + "learning_rate": 7.318211188190696e-06, + "loss": 0.7194, + "step": 12658 + }, + { + "epoch": 0.6967361990203094, + "grad_norm": 0.6770408749580383, + "learning_rate": 7.3178271183027465e-06, + "loss": 0.808, + "step": 12659 + }, + { + "epoch": 0.6967912378226649, + "grad_norm": 0.7209904789924622, + "learning_rate": 7.317443030994628e-06, + "loss": 0.7242, + "step": 12660 + }, + { + "epoch": 0.6968462766250206, + "grad_norm": 0.6943202018737793, + "learning_rate": 7.317058926269227e-06, + "loss": 0.758, + "step": 12661 + }, + { + "epoch": 0.6969013154273763, + "grad_norm": 0.6073412299156189, + "learning_rate": 7.316674804129432e-06, + "loss": 0.6571, + "step": 12662 + }, + { + "epoch": 0.696956354229732, + "grad_norm": 0.7065439224243164, + "learning_rate": 7.316290664578129e-06, + "loss": 0.7333, + "step": 12663 + }, + { + "epoch": 0.6970113930320876, + "grad_norm": 0.6275133490562439, + "learning_rate": 7.315906507618207e-06, + "loss": 0.6785, + "step": 12664 + }, + { + "epoch": 0.6970664318344433, + "grad_norm": 0.6484677791595459, + "learning_rate": 7.315522333252551e-06, + "loss": 0.7461, + "step": 12665 + }, + { + "epoch": 0.697121470636799, + "grad_norm": 0.6815413236618042, + "learning_rate": 7.315138141484049e-06, + "loss": 0.673, + "step": 12666 + }, + { + "epoch": 0.6971765094391547, + "grad_norm": 0.7227872610092163, + "learning_rate": 7.314753932315587e-06, + "loss": 0.7212, + "step": 12667 + }, + { + "epoch": 0.6972315482415102, + "grad_norm": 0.661568284034729, + "learning_rate": 7.314369705750055e-06, + "loss": 0.7633, + "step": 12668 + }, + { + "epoch": 0.6972865870438659, + "grad_norm": 0.5873990654945374, + "learning_rate": 7.3139854617903405e-06, + "loss": 0.6142, + "step": 12669 + }, + { + "epoch": 0.6973416258462216, + "grad_norm": 0.7015652656555176, + "learning_rate": 7.313601200439331e-06, + "loss": 0.6762, + "step": 12670 + }, + { + "epoch": 0.6973966646485773, + "grad_norm": 0.7060853242874146, + "learning_rate": 7.313216921699913e-06, + "loss": 0.8111, + "step": 12671 + }, + { + "epoch": 0.6974517034509329, + "grad_norm": 0.6198092699050903, + "learning_rate": 7.312832625574977e-06, + "loss": 0.7058, + "step": 12672 + }, + { + "epoch": 0.6975067422532886, + "grad_norm": 0.6785464286804199, + "learning_rate": 7.312448312067408e-06, + "loss": 0.7509, + "step": 12673 + }, + { + "epoch": 0.6975617810556443, + "grad_norm": 0.74974524974823, + "learning_rate": 7.312063981180097e-06, + "loss": 0.7679, + "step": 12674 + }, + { + "epoch": 0.6976168198579998, + "grad_norm": 0.6188651919364929, + "learning_rate": 7.311679632915934e-06, + "loss": 0.663, + "step": 12675 + }, + { + "epoch": 0.6976718586603555, + "grad_norm": 0.7458493113517761, + "learning_rate": 7.3112952672778044e-06, + "loss": 0.7316, + "step": 12676 + }, + { + "epoch": 0.6977268974627112, + "grad_norm": 0.7480403780937195, + "learning_rate": 7.310910884268597e-06, + "loss": 0.8476, + "step": 12677 + }, + { + "epoch": 0.6977819362650669, + "grad_norm": 0.6921943426132202, + "learning_rate": 7.310526483891204e-06, + "loss": 0.7931, + "step": 12678 + }, + { + "epoch": 0.6978369750674225, + "grad_norm": 0.7384023666381836, + "learning_rate": 7.3101420661485124e-06, + "loss": 0.7698, + "step": 12679 + }, + { + "epoch": 0.6978920138697782, + "grad_norm": 0.6693310141563416, + "learning_rate": 7.3097576310434105e-06, + "loss": 0.6838, + "step": 12680 + }, + { + "epoch": 0.6979470526721339, + "grad_norm": 0.6888617873191833, + "learning_rate": 7.309373178578789e-06, + "loss": 0.7196, + "step": 12681 + }, + { + "epoch": 0.6980020914744895, + "grad_norm": 0.7608165144920349, + "learning_rate": 7.308988708757536e-06, + "loss": 0.7483, + "step": 12682 + }, + { + "epoch": 0.6980571302768451, + "grad_norm": 0.6969812512397766, + "learning_rate": 7.308604221582543e-06, + "loss": 0.7415, + "step": 12683 + }, + { + "epoch": 0.6981121690792008, + "grad_norm": 0.7440872192382812, + "learning_rate": 7.3082197170566996e-06, + "loss": 0.7776, + "step": 12684 + }, + { + "epoch": 0.6981672078815565, + "grad_norm": 0.7920299768447876, + "learning_rate": 7.307835195182892e-06, + "loss": 0.746, + "step": 12685 + }, + { + "epoch": 0.6982222466839122, + "grad_norm": 0.7002919912338257, + "learning_rate": 7.3074506559640134e-06, + "loss": 0.7948, + "step": 12686 + }, + { + "epoch": 0.6982772854862678, + "grad_norm": 0.7199681997299194, + "learning_rate": 7.3070660994029554e-06, + "loss": 0.7568, + "step": 12687 + }, + { + "epoch": 0.6983323242886235, + "grad_norm": 0.6287575960159302, + "learning_rate": 7.306681525502604e-06, + "loss": 0.6564, + "step": 12688 + }, + { + "epoch": 0.6983873630909792, + "grad_norm": 0.6910778880119324, + "learning_rate": 7.306296934265853e-06, + "loss": 0.7892, + "step": 12689 + }, + { + "epoch": 0.6984424018933348, + "grad_norm": 0.6454603672027588, + "learning_rate": 7.30591232569559e-06, + "loss": 0.7848, + "step": 12690 + }, + { + "epoch": 0.6984974406956904, + "grad_norm": 0.7337101101875305, + "learning_rate": 7.305527699794709e-06, + "loss": 0.8012, + "step": 12691 + }, + { + "epoch": 0.6985524794980461, + "grad_norm": 0.6694337129592896, + "learning_rate": 7.305143056566098e-06, + "loss": 0.7767, + "step": 12692 + }, + { + "epoch": 0.6986075183004018, + "grad_norm": 0.6485214233398438, + "learning_rate": 7.30475839601265e-06, + "loss": 0.7142, + "step": 12693 + }, + { + "epoch": 0.6986625571027575, + "grad_norm": 0.6401854753494263, + "learning_rate": 7.304373718137253e-06, + "loss": 0.6562, + "step": 12694 + }, + { + "epoch": 0.6987175959051131, + "grad_norm": 0.7190635800361633, + "learning_rate": 7.303989022942801e-06, + "loss": 0.7513, + "step": 12695 + }, + { + "epoch": 0.6987726347074688, + "grad_norm": 0.7100299596786499, + "learning_rate": 7.3036043104321854e-06, + "loss": 0.759, + "step": 12696 + }, + { + "epoch": 0.6988276735098244, + "grad_norm": 0.8507145047187805, + "learning_rate": 7.303219580608295e-06, + "loss": 0.7567, + "step": 12697 + }, + { + "epoch": 0.6988827123121801, + "grad_norm": 0.6758378744125366, + "learning_rate": 7.302834833474022e-06, + "loss": 0.6751, + "step": 12698 + }, + { + "epoch": 0.6989377511145357, + "grad_norm": 0.7602974772453308, + "learning_rate": 7.30245006903226e-06, + "loss": 0.7304, + "step": 12699 + }, + { + "epoch": 0.6989927899168914, + "grad_norm": 0.7519045472145081, + "learning_rate": 7.3020652872859e-06, + "loss": 0.7573, + "step": 12700 + }, + { + "epoch": 0.6990478287192471, + "grad_norm": 0.6076456904411316, + "learning_rate": 7.301680488237832e-06, + "loss": 0.6335, + "step": 12701 + }, + { + "epoch": 0.6991028675216028, + "grad_norm": 0.6900685429573059, + "learning_rate": 7.30129567189095e-06, + "loss": 0.7787, + "step": 12702 + }, + { + "epoch": 0.6991579063239584, + "grad_norm": 0.7366316318511963, + "learning_rate": 7.300910838248146e-06, + "loss": 0.8176, + "step": 12703 + }, + { + "epoch": 0.699212945126314, + "grad_norm": 0.6658521890640259, + "learning_rate": 7.300525987312312e-06, + "loss": 0.6436, + "step": 12704 + }, + { + "epoch": 0.6992679839286697, + "grad_norm": 0.7635871171951294, + "learning_rate": 7.300141119086341e-06, + "loss": 0.8421, + "step": 12705 + }, + { + "epoch": 0.6993230227310254, + "grad_norm": 0.7257800698280334, + "learning_rate": 7.299756233573125e-06, + "loss": 0.6468, + "step": 12706 + }, + { + "epoch": 0.699378061533381, + "grad_norm": 0.7536096572875977, + "learning_rate": 7.299371330775558e-06, + "loss": 0.7782, + "step": 12707 + }, + { + "epoch": 0.6994331003357367, + "grad_norm": 0.7504379153251648, + "learning_rate": 7.298986410696529e-06, + "loss": 0.7097, + "step": 12708 + }, + { + "epoch": 0.6994881391380924, + "grad_norm": 0.7340306043624878, + "learning_rate": 7.298601473338936e-06, + "loss": 0.8165, + "step": 12709 + }, + { + "epoch": 0.6995431779404481, + "grad_norm": 0.6928045749664307, + "learning_rate": 7.298216518705667e-06, + "loss": 0.777, + "step": 12710 + }, + { + "epoch": 0.6995982167428036, + "grad_norm": 0.6942496299743652, + "learning_rate": 7.29783154679962e-06, + "loss": 0.6607, + "step": 12711 + }, + { + "epoch": 0.6996532555451593, + "grad_norm": 0.6646896600723267, + "learning_rate": 7.297446557623684e-06, + "loss": 0.712, + "step": 12712 + }, + { + "epoch": 0.699708294347515, + "grad_norm": 0.6828078627586365, + "learning_rate": 7.297061551180758e-06, + "loss": 0.7251, + "step": 12713 + }, + { + "epoch": 0.6997633331498707, + "grad_norm": 0.7554219365119934, + "learning_rate": 7.296676527473729e-06, + "loss": 0.8279, + "step": 12714 + }, + { + "epoch": 0.6998183719522263, + "grad_norm": 0.8122106194496155, + "learning_rate": 7.296291486505495e-06, + "loss": 0.8039, + "step": 12715 + }, + { + "epoch": 0.699873410754582, + "grad_norm": 0.6602222323417664, + "learning_rate": 7.295906428278949e-06, + "loss": 0.7149, + "step": 12716 + }, + { + "epoch": 0.6999284495569377, + "grad_norm": 0.8341954350471497, + "learning_rate": 7.2955213527969845e-06, + "loss": 0.7868, + "step": 12717 + }, + { + "epoch": 0.6999834883592932, + "grad_norm": 0.7157256603240967, + "learning_rate": 7.295136260062496e-06, + "loss": 0.745, + "step": 12718 + }, + { + "epoch": 0.7000385271616489, + "grad_norm": 0.5845672488212585, + "learning_rate": 7.294751150078379e-06, + "loss": 0.657, + "step": 12719 + }, + { + "epoch": 0.7000935659640046, + "grad_norm": 0.7370786070823669, + "learning_rate": 7.2943660228475265e-06, + "loss": 0.7883, + "step": 12720 + }, + { + "epoch": 0.7001486047663603, + "grad_norm": 0.6687451004981995, + "learning_rate": 7.293980878372833e-06, + "loss": 0.7945, + "step": 12721 + }, + { + "epoch": 0.7002036435687159, + "grad_norm": 0.6352105736732483, + "learning_rate": 7.293595716657192e-06, + "loss": 0.6581, + "step": 12722 + }, + { + "epoch": 0.7002586823710716, + "grad_norm": 0.7371370196342468, + "learning_rate": 7.293210537703499e-06, + "loss": 0.7859, + "step": 12723 + }, + { + "epoch": 0.7003137211734273, + "grad_norm": 0.6885504722595215, + "learning_rate": 7.292825341514651e-06, + "loss": 0.7355, + "step": 12724 + }, + { + "epoch": 0.700368759975783, + "grad_norm": 0.6930849552154541, + "learning_rate": 7.292440128093542e-06, + "loss": 0.8145, + "step": 12725 + }, + { + "epoch": 0.7004237987781385, + "grad_norm": 0.6767199635505676, + "learning_rate": 7.292054897443065e-06, + "loss": 0.7136, + "step": 12726 + }, + { + "epoch": 0.7004788375804942, + "grad_norm": 0.6672216653823853, + "learning_rate": 7.291669649566117e-06, + "loss": 0.6131, + "step": 12727 + }, + { + "epoch": 0.7005338763828499, + "grad_norm": 0.6618815064430237, + "learning_rate": 7.291284384465595e-06, + "loss": 0.7633, + "step": 12728 + }, + { + "epoch": 0.7005889151852056, + "grad_norm": 0.6573876142501831, + "learning_rate": 7.290899102144392e-06, + "loss": 0.7621, + "step": 12729 + }, + { + "epoch": 0.7006439539875612, + "grad_norm": 0.7449564337730408, + "learning_rate": 7.290513802605405e-06, + "loss": 0.6488, + "step": 12730 + }, + { + "epoch": 0.7006989927899169, + "grad_norm": 0.7307295203208923, + "learning_rate": 7.290128485851529e-06, + "loss": 0.7095, + "step": 12731 + }, + { + "epoch": 0.7007540315922726, + "grad_norm": 0.698699951171875, + "learning_rate": 7.2897431518856596e-06, + "loss": 0.7428, + "step": 12732 + }, + { + "epoch": 0.7008090703946283, + "grad_norm": 0.6334750056266785, + "learning_rate": 7.289357800710695e-06, + "loss": 0.6977, + "step": 12733 + }, + { + "epoch": 0.7008641091969838, + "grad_norm": 0.6526468396186829, + "learning_rate": 7.288972432329529e-06, + "loss": 0.6375, + "step": 12734 + }, + { + "epoch": 0.7009191479993395, + "grad_norm": 0.7282149791717529, + "learning_rate": 7.288587046745059e-06, + "loss": 0.7494, + "step": 12735 + }, + { + "epoch": 0.7009741868016952, + "grad_norm": 0.8511056900024414, + "learning_rate": 7.288201643960182e-06, + "loss": 0.7494, + "step": 12736 + }, + { + "epoch": 0.7010292256040509, + "grad_norm": 0.6908526420593262, + "learning_rate": 7.287816223977793e-06, + "loss": 0.6861, + "step": 12737 + }, + { + "epoch": 0.7010842644064065, + "grad_norm": 0.7582982182502747, + "learning_rate": 7.2874307868007896e-06, + "loss": 0.7758, + "step": 12738 + }, + { + "epoch": 0.7011393032087622, + "grad_norm": 0.9717779159545898, + "learning_rate": 7.2870453324320685e-06, + "loss": 0.7221, + "step": 12739 + }, + { + "epoch": 0.7011943420111179, + "grad_norm": 0.6532751321792603, + "learning_rate": 7.286659860874529e-06, + "loss": 0.8009, + "step": 12740 + }, + { + "epoch": 0.7012493808134735, + "grad_norm": 0.6708540320396423, + "learning_rate": 7.286274372131065e-06, + "loss": 0.7177, + "step": 12741 + }, + { + "epoch": 0.7013044196158291, + "grad_norm": 0.7624804973602295, + "learning_rate": 7.285888866204575e-06, + "loss": 0.7878, + "step": 12742 + }, + { + "epoch": 0.7013594584181848, + "grad_norm": 0.7167851328849792, + "learning_rate": 7.285503343097955e-06, + "loss": 0.7276, + "step": 12743 + }, + { + "epoch": 0.7014144972205405, + "grad_norm": 0.6592209935188293, + "learning_rate": 7.2851178028141045e-06, + "loss": 0.7665, + "step": 12744 + }, + { + "epoch": 0.7014695360228962, + "grad_norm": 0.684847354888916, + "learning_rate": 7.284732245355921e-06, + "loss": 0.7358, + "step": 12745 + }, + { + "epoch": 0.7015245748252518, + "grad_norm": 0.6852415800094604, + "learning_rate": 7.2843466707262985e-06, + "loss": 0.7805, + "step": 12746 + }, + { + "epoch": 0.7015796136276075, + "grad_norm": 0.6422114968299866, + "learning_rate": 7.283961078928141e-06, + "loss": 0.7386, + "step": 12747 + }, + { + "epoch": 0.7016346524299631, + "grad_norm": 0.7538495659828186, + "learning_rate": 7.283575469964343e-06, + "loss": 0.798, + "step": 12748 + }, + { + "epoch": 0.7016896912323188, + "grad_norm": 0.6646687984466553, + "learning_rate": 7.2831898438378025e-06, + "loss": 0.7048, + "step": 12749 + }, + { + "epoch": 0.7017447300346744, + "grad_norm": 0.8338429927825928, + "learning_rate": 7.2828042005514176e-06, + "loss": 0.8585, + "step": 12750 + }, + { + "epoch": 0.7017997688370301, + "grad_norm": 0.7086663842201233, + "learning_rate": 7.282418540108088e-06, + "loss": 0.8011, + "step": 12751 + }, + { + "epoch": 0.7018548076393858, + "grad_norm": 0.6040074229240417, + "learning_rate": 7.282032862510712e-06, + "loss": 0.6327, + "step": 12752 + }, + { + "epoch": 0.7019098464417415, + "grad_norm": 0.7030978798866272, + "learning_rate": 7.281647167762187e-06, + "loss": 0.6373, + "step": 12753 + }, + { + "epoch": 0.7019648852440971, + "grad_norm": 0.662308394908905, + "learning_rate": 7.281261455865414e-06, + "loss": 0.7283, + "step": 12754 + }, + { + "epoch": 0.7020199240464527, + "grad_norm": 0.7369368672370911, + "learning_rate": 7.28087572682329e-06, + "loss": 0.7632, + "step": 12755 + }, + { + "epoch": 0.7020749628488084, + "grad_norm": 0.6887282729148865, + "learning_rate": 7.280489980638714e-06, + "loss": 0.7629, + "step": 12756 + }, + { + "epoch": 0.702130001651164, + "grad_norm": 0.656512975692749, + "learning_rate": 7.280104217314587e-06, + "loss": 0.8028, + "step": 12757 + }, + { + "epoch": 0.7021850404535197, + "grad_norm": 0.7006264328956604, + "learning_rate": 7.279718436853805e-06, + "loss": 0.7025, + "step": 12758 + }, + { + "epoch": 0.7022400792558754, + "grad_norm": 0.675585925579071, + "learning_rate": 7.279332639259271e-06, + "loss": 0.8001, + "step": 12759 + }, + { + "epoch": 0.7022951180582311, + "grad_norm": 0.7105827331542969, + "learning_rate": 7.278946824533883e-06, + "loss": 0.7767, + "step": 12760 + }, + { + "epoch": 0.7023501568605867, + "grad_norm": 0.8310064673423767, + "learning_rate": 7.27856099268054e-06, + "loss": 0.7828, + "step": 12761 + }, + { + "epoch": 0.7024051956629423, + "grad_norm": 0.6885055899620056, + "learning_rate": 7.278175143702142e-06, + "loss": 0.7018, + "step": 12762 + }, + { + "epoch": 0.702460234465298, + "grad_norm": 0.6542866826057434, + "learning_rate": 7.27778927760159e-06, + "loss": 0.7118, + "step": 12763 + }, + { + "epoch": 0.7025152732676537, + "grad_norm": 0.9102655053138733, + "learning_rate": 7.277403394381784e-06, + "loss": 0.8381, + "step": 12764 + }, + { + "epoch": 0.7025703120700093, + "grad_norm": 0.6538355946540833, + "learning_rate": 7.277017494045624e-06, + "loss": 0.7766, + "step": 12765 + }, + { + "epoch": 0.702625350872365, + "grad_norm": 0.6691237092018127, + "learning_rate": 7.27663157659601e-06, + "loss": 0.8077, + "step": 12766 + }, + { + "epoch": 0.7026803896747207, + "grad_norm": 0.7159995436668396, + "learning_rate": 7.2762456420358414e-06, + "loss": 0.8333, + "step": 12767 + }, + { + "epoch": 0.7027354284770764, + "grad_norm": 0.6518422365188599, + "learning_rate": 7.275859690368022e-06, + "loss": 0.7634, + "step": 12768 + }, + { + "epoch": 0.702790467279432, + "grad_norm": 0.6969057321548462, + "learning_rate": 7.275473721595449e-06, + "loss": 0.7481, + "step": 12769 + }, + { + "epoch": 0.7028455060817876, + "grad_norm": 0.6788915395736694, + "learning_rate": 7.2750877357210225e-06, + "loss": 0.7402, + "step": 12770 + }, + { + "epoch": 0.7029005448841433, + "grad_norm": 0.7323998212814331, + "learning_rate": 7.274701732747649e-06, + "loss": 0.7122, + "step": 12771 + }, + { + "epoch": 0.702955583686499, + "grad_norm": 0.7224077582359314, + "learning_rate": 7.274315712678224e-06, + "loss": 0.7333, + "step": 12772 + }, + { + "epoch": 0.7030106224888546, + "grad_norm": 0.9009444117546082, + "learning_rate": 7.273929675515652e-06, + "loss": 0.6912, + "step": 12773 + }, + { + "epoch": 0.7030656612912103, + "grad_norm": 0.7076312899589539, + "learning_rate": 7.273543621262832e-06, + "loss": 0.7651, + "step": 12774 + }, + { + "epoch": 0.703120700093566, + "grad_norm": 0.78575599193573, + "learning_rate": 7.273157549922668e-06, + "loss": 0.7443, + "step": 12775 + }, + { + "epoch": 0.7031757388959217, + "grad_norm": 0.6957094669342041, + "learning_rate": 7.27277146149806e-06, + "loss": 0.7684, + "step": 12776 + }, + { + "epoch": 0.7032307776982772, + "grad_norm": 1.177878975868225, + "learning_rate": 7.27238535599191e-06, + "loss": 0.9033, + "step": 12777 + }, + { + "epoch": 0.7032858165006329, + "grad_norm": 0.6929007768630981, + "learning_rate": 7.27199923340712e-06, + "loss": 0.7411, + "step": 12778 + }, + { + "epoch": 0.7033408553029886, + "grad_norm": 0.7725315093994141, + "learning_rate": 7.2716130937465926e-06, + "loss": 0.7833, + "step": 12779 + }, + { + "epoch": 0.7033958941053443, + "grad_norm": 0.6512928605079651, + "learning_rate": 7.271226937013228e-06, + "loss": 0.7918, + "step": 12780 + }, + { + "epoch": 0.7034509329076999, + "grad_norm": 0.7033893465995789, + "learning_rate": 7.270840763209931e-06, + "loss": 0.843, + "step": 12781 + }, + { + "epoch": 0.7035059717100556, + "grad_norm": 0.7596432566642761, + "learning_rate": 7.2704545723396e-06, + "loss": 0.7916, + "step": 12782 + }, + { + "epoch": 0.7035610105124113, + "grad_norm": 0.6256046891212463, + "learning_rate": 7.270068364405143e-06, + "loss": 0.6531, + "step": 12783 + }, + { + "epoch": 0.703616049314767, + "grad_norm": 0.8107615113258362, + "learning_rate": 7.26968213940946e-06, + "loss": 0.7755, + "step": 12784 + }, + { + "epoch": 0.7036710881171225, + "grad_norm": 0.6742845177650452, + "learning_rate": 7.269295897355451e-06, + "loss": 0.834, + "step": 12785 + }, + { + "epoch": 0.7037261269194782, + "grad_norm": 0.6665072441101074, + "learning_rate": 7.268909638246024e-06, + "loss": 0.6864, + "step": 12786 + }, + { + "epoch": 0.7037811657218339, + "grad_norm": 0.68357914686203, + "learning_rate": 7.268523362084078e-06, + "loss": 0.7789, + "step": 12787 + }, + { + "epoch": 0.7038362045241896, + "grad_norm": 0.6878114938735962, + "learning_rate": 7.268137068872519e-06, + "loss": 0.7277, + "step": 12788 + }, + { + "epoch": 0.7038912433265452, + "grad_norm": 0.7173313498497009, + "learning_rate": 7.267750758614247e-06, + "loss": 0.8156, + "step": 12789 + }, + { + "epoch": 0.7039462821289009, + "grad_norm": 0.6523084044456482, + "learning_rate": 7.267364431312169e-06, + "loss": 0.7143, + "step": 12790 + }, + { + "epoch": 0.7040013209312566, + "grad_norm": 0.7403815388679504, + "learning_rate": 7.2669780869691865e-06, + "loss": 0.8196, + "step": 12791 + }, + { + "epoch": 0.7040563597336122, + "grad_norm": 0.6411255598068237, + "learning_rate": 7.266591725588204e-06, + "loss": 0.6645, + "step": 12792 + }, + { + "epoch": 0.7041113985359678, + "grad_norm": 0.9094020128250122, + "learning_rate": 7.266205347172124e-06, + "loss": 0.8023, + "step": 12793 + }, + { + "epoch": 0.7041664373383235, + "grad_norm": 1.1041208505630493, + "learning_rate": 7.265818951723851e-06, + "loss": 0.7011, + "step": 12794 + }, + { + "epoch": 0.7042214761406792, + "grad_norm": 0.7339954376220703, + "learning_rate": 7.265432539246289e-06, + "loss": 0.7467, + "step": 12795 + }, + { + "epoch": 0.7042765149430349, + "grad_norm": 0.7055865526199341, + "learning_rate": 7.265046109742344e-06, + "loss": 0.7364, + "step": 12796 + }, + { + "epoch": 0.7043315537453905, + "grad_norm": 0.7052320241928101, + "learning_rate": 7.264659663214917e-06, + "loss": 0.7611, + "step": 12797 + }, + { + "epoch": 0.7043865925477462, + "grad_norm": 0.7374194860458374, + "learning_rate": 7.264273199666915e-06, + "loss": 0.7612, + "step": 12798 + }, + { + "epoch": 0.7044416313501018, + "grad_norm": 0.634986162185669, + "learning_rate": 7.263886719101242e-06, + "loss": 0.8001, + "step": 12799 + }, + { + "epoch": 0.7044966701524574, + "grad_norm": 0.8178644180297852, + "learning_rate": 7.2635002215208014e-06, + "loss": 0.8404, + "step": 12800 + }, + { + "epoch": 0.7045517089548131, + "grad_norm": 0.7743822336196899, + "learning_rate": 7.263113706928501e-06, + "loss": 0.7297, + "step": 12801 + }, + { + "epoch": 0.7046067477571688, + "grad_norm": 0.6558601260185242, + "learning_rate": 7.262727175327242e-06, + "loss": 0.6933, + "step": 12802 + }, + { + "epoch": 0.7046617865595245, + "grad_norm": 1.0608787536621094, + "learning_rate": 7.262340626719933e-06, + "loss": 0.8792, + "step": 12803 + }, + { + "epoch": 0.7047168253618801, + "grad_norm": 0.7488270401954651, + "learning_rate": 7.261954061109475e-06, + "loss": 0.7755, + "step": 12804 + }, + { + "epoch": 0.7047718641642358, + "grad_norm": 0.8960574865341187, + "learning_rate": 7.261567478498778e-06, + "loss": 0.7274, + "step": 12805 + }, + { + "epoch": 0.7048269029665915, + "grad_norm": 0.6289944648742676, + "learning_rate": 7.2611808788907436e-06, + "loss": 0.6469, + "step": 12806 + }, + { + "epoch": 0.7048819417689471, + "grad_norm": 0.6488339900970459, + "learning_rate": 7.26079426228828e-06, + "loss": 0.7581, + "step": 12807 + }, + { + "epoch": 0.7049369805713027, + "grad_norm": 0.7354650497436523, + "learning_rate": 7.260407628694292e-06, + "loss": 0.7596, + "step": 12808 + }, + { + "epoch": 0.7049920193736584, + "grad_norm": 0.8163169026374817, + "learning_rate": 7.2600209781116834e-06, + "loss": 0.8291, + "step": 12809 + }, + { + "epoch": 0.7050470581760141, + "grad_norm": 0.8223916292190552, + "learning_rate": 7.259634310543364e-06, + "loss": 0.7089, + "step": 12810 + }, + { + "epoch": 0.7051020969783698, + "grad_norm": 0.7815924286842346, + "learning_rate": 7.2592476259922374e-06, + "loss": 0.8098, + "step": 12811 + }, + { + "epoch": 0.7051571357807254, + "grad_norm": 0.7027734518051147, + "learning_rate": 7.2588609244612105e-06, + "loss": 0.7276, + "step": 12812 + }, + { + "epoch": 0.705212174583081, + "grad_norm": 0.7345930337905884, + "learning_rate": 7.2584742059531894e-06, + "loss": 0.803, + "step": 12813 + }, + { + "epoch": 0.7052672133854367, + "grad_norm": 0.6998127102851868, + "learning_rate": 7.258087470471081e-06, + "loss": 0.7938, + "step": 12814 + }, + { + "epoch": 0.7053222521877924, + "grad_norm": 0.6418118476867676, + "learning_rate": 7.257700718017793e-06, + "loss": 0.66, + "step": 12815 + }, + { + "epoch": 0.705377290990148, + "grad_norm": 0.6774695515632629, + "learning_rate": 7.257313948596228e-06, + "loss": 0.7143, + "step": 12816 + }, + { + "epoch": 0.7054323297925037, + "grad_norm": 0.7107009291648865, + "learning_rate": 7.256927162209298e-06, + "loss": 0.8378, + "step": 12817 + }, + { + "epoch": 0.7054873685948594, + "grad_norm": 0.7287374138832092, + "learning_rate": 7.256540358859906e-06, + "loss": 0.88, + "step": 12818 + }, + { + "epoch": 0.7055424073972151, + "grad_norm": 0.651221752166748, + "learning_rate": 7.256153538550961e-06, + "loss": 0.7092, + "step": 12819 + }, + { + "epoch": 0.7055974461995707, + "grad_norm": 0.6549085974693298, + "learning_rate": 7.255766701285371e-06, + "loss": 0.6697, + "step": 12820 + }, + { + "epoch": 0.7056524850019263, + "grad_norm": 0.6617292165756226, + "learning_rate": 7.255379847066041e-06, + "loss": 0.7779, + "step": 12821 + }, + { + "epoch": 0.705707523804282, + "grad_norm": 0.6677221655845642, + "learning_rate": 7.254992975895879e-06, + "loss": 0.7821, + "step": 12822 + }, + { + "epoch": 0.7057625626066377, + "grad_norm": 0.8183515667915344, + "learning_rate": 7.2546060877777945e-06, + "loss": 0.7727, + "step": 12823 + }, + { + "epoch": 0.7058176014089933, + "grad_norm": 0.6574132442474365, + "learning_rate": 7.2542191827146945e-06, + "loss": 0.7118, + "step": 12824 + }, + { + "epoch": 0.705872640211349, + "grad_norm": 0.6874130964279175, + "learning_rate": 7.253832260709487e-06, + "loss": 0.7677, + "step": 12825 + }, + { + "epoch": 0.7059276790137047, + "grad_norm": 0.6460297107696533, + "learning_rate": 7.253445321765079e-06, + "loss": 0.725, + "step": 12826 + }, + { + "epoch": 0.7059827178160604, + "grad_norm": 0.6618219614028931, + "learning_rate": 7.253058365884379e-06, + "loss": 0.7504, + "step": 12827 + }, + { + "epoch": 0.706037756618416, + "grad_norm": 0.6519019603729248, + "learning_rate": 7.252671393070295e-06, + "loss": 0.7382, + "step": 12828 + }, + { + "epoch": 0.7060927954207716, + "grad_norm": 0.7114588022232056, + "learning_rate": 7.252284403325737e-06, + "loss": 0.8364, + "step": 12829 + }, + { + "epoch": 0.7061478342231273, + "grad_norm": 0.6304726600646973, + "learning_rate": 7.251897396653611e-06, + "loss": 0.6972, + "step": 12830 + }, + { + "epoch": 0.706202873025483, + "grad_norm": 0.6728807687759399, + "learning_rate": 7.251510373056827e-06, + "loss": 0.671, + "step": 12831 + }, + { + "epoch": 0.7062579118278386, + "grad_norm": 0.690641462802887, + "learning_rate": 7.251123332538295e-06, + "loss": 0.7381, + "step": 12832 + }, + { + "epoch": 0.7063129506301943, + "grad_norm": 0.7018027305603027, + "learning_rate": 7.2507362751009226e-06, + "loss": 0.7546, + "step": 12833 + }, + { + "epoch": 0.70636798943255, + "grad_norm": 0.7203684449195862, + "learning_rate": 7.250349200747617e-06, + "loss": 0.7534, + "step": 12834 + }, + { + "epoch": 0.7064230282349057, + "grad_norm": 0.6936585903167725, + "learning_rate": 7.24996210948129e-06, + "loss": 0.7716, + "step": 12835 + }, + { + "epoch": 0.7064780670372612, + "grad_norm": 0.7421281337738037, + "learning_rate": 7.249575001304851e-06, + "loss": 0.7517, + "step": 12836 + }, + { + "epoch": 0.7065331058396169, + "grad_norm": 0.6622288227081299, + "learning_rate": 7.249187876221207e-06, + "loss": 0.6799, + "step": 12837 + }, + { + "epoch": 0.7065881446419726, + "grad_norm": 0.7267055511474609, + "learning_rate": 7.24880073423327e-06, + "loss": 0.7871, + "step": 12838 + }, + { + "epoch": 0.7066431834443283, + "grad_norm": 0.6978085041046143, + "learning_rate": 7.2484135753439485e-06, + "loss": 0.7812, + "step": 12839 + }, + { + "epoch": 0.7066982222466839, + "grad_norm": 0.8353652358055115, + "learning_rate": 7.248026399556153e-06, + "loss": 0.7481, + "step": 12840 + }, + { + "epoch": 0.7067532610490396, + "grad_norm": 0.8402471542358398, + "learning_rate": 7.247639206872792e-06, + "loss": 0.783, + "step": 12841 + }, + { + "epoch": 0.7068082998513953, + "grad_norm": 0.8279419541358948, + "learning_rate": 7.247251997296777e-06, + "loss": 0.8177, + "step": 12842 + }, + { + "epoch": 0.7068633386537508, + "grad_norm": 0.6850735545158386, + "learning_rate": 7.246864770831017e-06, + "loss": 0.7586, + "step": 12843 + }, + { + "epoch": 0.7069183774561065, + "grad_norm": 0.7327665090560913, + "learning_rate": 7.246477527478422e-06, + "loss": 0.9327, + "step": 12844 + }, + { + "epoch": 0.7069734162584622, + "grad_norm": 0.6343075037002563, + "learning_rate": 7.246090267241905e-06, + "loss": 0.6957, + "step": 12845 + }, + { + "epoch": 0.7070284550608179, + "grad_norm": 0.7028965353965759, + "learning_rate": 7.245702990124373e-06, + "loss": 0.7524, + "step": 12846 + }, + { + "epoch": 0.7070834938631735, + "grad_norm": 0.7578299045562744, + "learning_rate": 7.24531569612874e-06, + "loss": 0.7302, + "step": 12847 + }, + { + "epoch": 0.7071385326655292, + "grad_norm": 0.8113438487052917, + "learning_rate": 7.2449283852579146e-06, + "loss": 0.7658, + "step": 12848 + }, + { + "epoch": 0.7071935714678849, + "grad_norm": 0.6442512273788452, + "learning_rate": 7.244541057514809e-06, + "loss": 0.6742, + "step": 12849 + }, + { + "epoch": 0.7072486102702406, + "grad_norm": 0.8595272898674011, + "learning_rate": 7.244153712902333e-06, + "loss": 0.7944, + "step": 12850 + }, + { + "epoch": 0.7073036490725961, + "grad_norm": 0.6565983891487122, + "learning_rate": 7.243766351423398e-06, + "loss": 0.7411, + "step": 12851 + }, + { + "epoch": 0.7073586878749518, + "grad_norm": 0.7935337424278259, + "learning_rate": 7.243378973080917e-06, + "loss": 0.8109, + "step": 12852 + }, + { + "epoch": 0.7074137266773075, + "grad_norm": 0.7083927392959595, + "learning_rate": 7.242991577877799e-06, + "loss": 0.8405, + "step": 12853 + }, + { + "epoch": 0.7074687654796632, + "grad_norm": 0.7452830672264099, + "learning_rate": 7.242604165816958e-06, + "loss": 0.7972, + "step": 12854 + }, + { + "epoch": 0.7075238042820188, + "grad_norm": 0.6775808334350586, + "learning_rate": 7.242216736901302e-06, + "loss": 0.7114, + "step": 12855 + }, + { + "epoch": 0.7075788430843745, + "grad_norm": 0.8069992661476135, + "learning_rate": 7.241829291133748e-06, + "loss": 0.6606, + "step": 12856 + }, + { + "epoch": 0.7076338818867302, + "grad_norm": 0.6690802574157715, + "learning_rate": 7.241441828517203e-06, + "loss": 0.742, + "step": 12857 + }, + { + "epoch": 0.7076889206890858, + "grad_norm": 0.8077805638313293, + "learning_rate": 7.2410543490545814e-06, + "loss": 0.7786, + "step": 12858 + }, + { + "epoch": 0.7077439594914414, + "grad_norm": 0.6906875967979431, + "learning_rate": 7.240666852748795e-06, + "loss": 0.7445, + "step": 12859 + }, + { + "epoch": 0.7077989982937971, + "grad_norm": 0.6830704808235168, + "learning_rate": 7.2402793396027585e-06, + "loss": 0.7664, + "step": 12860 + }, + { + "epoch": 0.7078540370961528, + "grad_norm": 0.8118640780448914, + "learning_rate": 7.23989180961938e-06, + "loss": 0.7654, + "step": 12861 + }, + { + "epoch": 0.7079090758985085, + "grad_norm": 0.6819882392883301, + "learning_rate": 7.2395042628015755e-06, + "loss": 0.649, + "step": 12862 + }, + { + "epoch": 0.7079641147008641, + "grad_norm": 0.6543441414833069, + "learning_rate": 7.239116699152256e-06, + "loss": 0.8054, + "step": 12863 + }, + { + "epoch": 0.7080191535032198, + "grad_norm": 0.8613989353179932, + "learning_rate": 7.238729118674335e-06, + "loss": 0.7283, + "step": 12864 + }, + { + "epoch": 0.7080741923055754, + "grad_norm": 0.6993124485015869, + "learning_rate": 7.238341521370725e-06, + "loss": 0.8145, + "step": 12865 + }, + { + "epoch": 0.7081292311079311, + "grad_norm": 0.7047560811042786, + "learning_rate": 7.237953907244339e-06, + "loss": 0.6729, + "step": 12866 + }, + { + "epoch": 0.7081842699102867, + "grad_norm": 0.7923689484596252, + "learning_rate": 7.237566276298091e-06, + "loss": 0.7615, + "step": 12867 + }, + { + "epoch": 0.7082393087126424, + "grad_norm": 0.6873850226402283, + "learning_rate": 7.237178628534894e-06, + "loss": 0.7638, + "step": 12868 + }, + { + "epoch": 0.7082943475149981, + "grad_norm": 0.6483134031295776, + "learning_rate": 7.236790963957661e-06, + "loss": 0.6366, + "step": 12869 + }, + { + "epoch": 0.7083493863173538, + "grad_norm": 0.6623784899711609, + "learning_rate": 7.236403282569305e-06, + "loss": 0.7032, + "step": 12870 + }, + { + "epoch": 0.7084044251197094, + "grad_norm": 0.7004366517066956, + "learning_rate": 7.236015584372741e-06, + "loss": 0.6436, + "step": 12871 + }, + { + "epoch": 0.708459463922065, + "grad_norm": 0.5676529407501221, + "learning_rate": 7.235627869370883e-06, + "loss": 0.6395, + "step": 12872 + }, + { + "epoch": 0.7085145027244207, + "grad_norm": 0.6909729838371277, + "learning_rate": 7.235240137566644e-06, + "loss": 0.7063, + "step": 12873 + }, + { + "epoch": 0.7085695415267764, + "grad_norm": 0.7635348439216614, + "learning_rate": 7.234852388962939e-06, + "loss": 0.7518, + "step": 12874 + }, + { + "epoch": 0.708624580329132, + "grad_norm": 0.7217742204666138, + "learning_rate": 7.2344646235626815e-06, + "loss": 0.7782, + "step": 12875 + }, + { + "epoch": 0.7086796191314877, + "grad_norm": 0.6506509184837341, + "learning_rate": 7.2340768413687855e-06, + "loss": 0.7456, + "step": 12876 + }, + { + "epoch": 0.7087346579338434, + "grad_norm": 0.6537386775016785, + "learning_rate": 7.2336890423841664e-06, + "loss": 0.7395, + "step": 12877 + }, + { + "epoch": 0.7087896967361991, + "grad_norm": 0.7759900689125061, + "learning_rate": 7.233301226611737e-06, + "loss": 0.8098, + "step": 12878 + }, + { + "epoch": 0.7088447355385546, + "grad_norm": 0.8476354479789734, + "learning_rate": 7.232913394054415e-06, + "loss": 0.8241, + "step": 12879 + }, + { + "epoch": 0.7088997743409103, + "grad_norm": 0.6770507097244263, + "learning_rate": 7.232525544715114e-06, + "loss": 0.6966, + "step": 12880 + }, + { + "epoch": 0.708954813143266, + "grad_norm": 0.7750027775764465, + "learning_rate": 7.232137678596747e-06, + "loss": 0.8038, + "step": 12881 + }, + { + "epoch": 0.7090098519456217, + "grad_norm": 0.6507213711738586, + "learning_rate": 7.231749795702232e-06, + "loss": 0.6446, + "step": 12882 + }, + { + "epoch": 0.7090648907479773, + "grad_norm": 0.7554625272750854, + "learning_rate": 7.231361896034481e-06, + "loss": 0.7769, + "step": 12883 + }, + { + "epoch": 0.709119929550333, + "grad_norm": 0.8175020813941956, + "learning_rate": 7.230973979596414e-06, + "loss": 0.8283, + "step": 12884 + }, + { + "epoch": 0.7091749683526887, + "grad_norm": 0.7528663873672485, + "learning_rate": 7.2305860463909416e-06, + "loss": 0.7737, + "step": 12885 + }, + { + "epoch": 0.7092300071550443, + "grad_norm": 0.9242768883705139, + "learning_rate": 7.230198096420983e-06, + "loss": 0.647, + "step": 12886 + }, + { + "epoch": 0.7092850459573999, + "grad_norm": 0.899874746799469, + "learning_rate": 7.229810129689452e-06, + "loss": 0.8952, + "step": 12887 + }, + { + "epoch": 0.7093400847597556, + "grad_norm": 0.8221275806427002, + "learning_rate": 7.229422146199266e-06, + "loss": 0.6845, + "step": 12888 + }, + { + "epoch": 0.7093951235621113, + "grad_norm": 0.6964027285575867, + "learning_rate": 7.229034145953338e-06, + "loss": 0.7153, + "step": 12889 + }, + { + "epoch": 0.7094501623644669, + "grad_norm": 0.8018684387207031, + "learning_rate": 7.228646128954588e-06, + "loss": 0.6421, + "step": 12890 + }, + { + "epoch": 0.7095052011668226, + "grad_norm": 0.6874614953994751, + "learning_rate": 7.228258095205928e-06, + "loss": 0.8024, + "step": 12891 + }, + { + "epoch": 0.7095602399691783, + "grad_norm": 0.7141417860984802, + "learning_rate": 7.227870044710277e-06, + "loss": 0.7746, + "step": 12892 + }, + { + "epoch": 0.709615278771534, + "grad_norm": 0.7109399437904358, + "learning_rate": 7.227481977470552e-06, + "loss": 0.7826, + "step": 12893 + }, + { + "epoch": 0.7096703175738895, + "grad_norm": 0.7021867036819458, + "learning_rate": 7.227093893489669e-06, + "loss": 0.7196, + "step": 12894 + }, + { + "epoch": 0.7097253563762452, + "grad_norm": 0.6896560788154602, + "learning_rate": 7.226705792770543e-06, + "loss": 0.6925, + "step": 12895 + }, + { + "epoch": 0.7097803951786009, + "grad_norm": 0.7138262987136841, + "learning_rate": 7.226317675316094e-06, + "loss": 0.7417, + "step": 12896 + }, + { + "epoch": 0.7098354339809566, + "grad_norm": 0.6789212226867676, + "learning_rate": 7.225929541129236e-06, + "loss": 0.7095, + "step": 12897 + }, + { + "epoch": 0.7098904727833122, + "grad_norm": 0.8102045059204102, + "learning_rate": 7.225541390212889e-06, + "loss": 0.9252, + "step": 12898 + }, + { + "epoch": 0.7099455115856679, + "grad_norm": 0.6220358610153198, + "learning_rate": 7.2251532225699674e-06, + "loss": 0.7205, + "step": 12899 + }, + { + "epoch": 0.7100005503880236, + "grad_norm": 0.6375265121459961, + "learning_rate": 7.224765038203391e-06, + "loss": 0.7974, + "step": 12900 + }, + { + "epoch": 0.7100555891903793, + "grad_norm": 0.7457360029220581, + "learning_rate": 7.224376837116075e-06, + "loss": 0.7083, + "step": 12901 + }, + { + "epoch": 0.7101106279927348, + "grad_norm": 0.7012878060340881, + "learning_rate": 7.2239886193109374e-06, + "loss": 0.7334, + "step": 12902 + }, + { + "epoch": 0.7101656667950905, + "grad_norm": 0.7437683343887329, + "learning_rate": 7.223600384790898e-06, + "loss": 0.82, + "step": 12903 + }, + { + "epoch": 0.7102207055974462, + "grad_norm": 0.6727370619773865, + "learning_rate": 7.223212133558872e-06, + "loss": 0.7339, + "step": 12904 + }, + { + "epoch": 0.7102757443998019, + "grad_norm": 0.9253849983215332, + "learning_rate": 7.222823865617781e-06, + "loss": 0.7398, + "step": 12905 + }, + { + "epoch": 0.7103307832021575, + "grad_norm": 0.6664100885391235, + "learning_rate": 7.222435580970539e-06, + "loss": 0.7519, + "step": 12906 + }, + { + "epoch": 0.7103858220045132, + "grad_norm": 0.7452943325042725, + "learning_rate": 7.222047279620066e-06, + "loss": 0.7382, + "step": 12907 + }, + { + "epoch": 0.7104408608068689, + "grad_norm": 0.7235015630722046, + "learning_rate": 7.22165896156928e-06, + "loss": 0.7726, + "step": 12908 + }, + { + "epoch": 0.7104958996092245, + "grad_norm": 0.6324653029441833, + "learning_rate": 7.221270626821102e-06, + "loss": 0.7451, + "step": 12909 + }, + { + "epoch": 0.7105509384115801, + "grad_norm": 0.789829432964325, + "learning_rate": 7.220882275378447e-06, + "loss": 0.7375, + "step": 12910 + }, + { + "epoch": 0.7106059772139358, + "grad_norm": 0.9090244174003601, + "learning_rate": 7.220493907244236e-06, + "loss": 0.8935, + "step": 12911 + }, + { + "epoch": 0.7106610160162915, + "grad_norm": 0.6570677757263184, + "learning_rate": 7.220105522421388e-06, + "loss": 0.7259, + "step": 12912 + }, + { + "epoch": 0.7107160548186472, + "grad_norm": 0.7142132520675659, + "learning_rate": 7.219717120912819e-06, + "loss": 0.7862, + "step": 12913 + }, + { + "epoch": 0.7107710936210028, + "grad_norm": 0.7359404563903809, + "learning_rate": 7.219328702721452e-06, + "loss": 0.7074, + "step": 12914 + }, + { + "epoch": 0.7108261324233585, + "grad_norm": 0.7118046283721924, + "learning_rate": 7.218940267850203e-06, + "loss": 0.8151, + "step": 12915 + }, + { + "epoch": 0.7108811712257141, + "grad_norm": 0.8301580548286438, + "learning_rate": 7.218551816301994e-06, + "loss": 0.7031, + "step": 12916 + }, + { + "epoch": 0.7109362100280698, + "grad_norm": 0.6647501587867737, + "learning_rate": 7.218163348079743e-06, + "loss": 0.8309, + "step": 12917 + }, + { + "epoch": 0.7109912488304254, + "grad_norm": 0.6546997427940369, + "learning_rate": 7.217774863186371e-06, + "loss": 0.717, + "step": 12918 + }, + { + "epoch": 0.7110462876327811, + "grad_norm": 0.6639735102653503, + "learning_rate": 7.217386361624795e-06, + "loss": 0.7308, + "step": 12919 + }, + { + "epoch": 0.7111013264351368, + "grad_norm": 0.724433183670044, + "learning_rate": 7.216997843397938e-06, + "loss": 0.7576, + "step": 12920 + }, + { + "epoch": 0.7111563652374925, + "grad_norm": 0.750253438949585, + "learning_rate": 7.216609308508719e-06, + "loss": 0.7014, + "step": 12921 + }, + { + "epoch": 0.7112114040398481, + "grad_norm": 0.7010897397994995, + "learning_rate": 7.216220756960058e-06, + "loss": 0.6951, + "step": 12922 + }, + { + "epoch": 0.7112664428422037, + "grad_norm": 0.7739251852035522, + "learning_rate": 7.215832188754873e-06, + "loss": 0.7392, + "step": 12923 + }, + { + "epoch": 0.7113214816445594, + "grad_norm": 0.6893059015274048, + "learning_rate": 7.215443603896088e-06, + "loss": 0.7029, + "step": 12924 + }, + { + "epoch": 0.7113765204469151, + "grad_norm": 0.8061872124671936, + "learning_rate": 7.215055002386622e-06, + "loss": 0.7557, + "step": 12925 + }, + { + "epoch": 0.7114315592492707, + "grad_norm": 1.089525580406189, + "learning_rate": 7.214666384229395e-06, + "loss": 0.6701, + "step": 12926 + }, + { + "epoch": 0.7114865980516264, + "grad_norm": 0.7601733207702637, + "learning_rate": 7.2142777494273275e-06, + "loss": 0.8113, + "step": 12927 + }, + { + "epoch": 0.7115416368539821, + "grad_norm": 0.7863540649414062, + "learning_rate": 7.213889097983342e-06, + "loss": 0.7945, + "step": 12928 + }, + { + "epoch": 0.7115966756563377, + "grad_norm": 0.7722556591033936, + "learning_rate": 7.21350042990036e-06, + "loss": 0.9492, + "step": 12929 + }, + { + "epoch": 0.7116517144586934, + "grad_norm": 0.6834682822227478, + "learning_rate": 7.213111745181299e-06, + "loss": 0.7138, + "step": 12930 + }, + { + "epoch": 0.711706753261049, + "grad_norm": 0.6974432468414307, + "learning_rate": 7.212723043829083e-06, + "loss": 0.7654, + "step": 12931 + }, + { + "epoch": 0.7117617920634047, + "grad_norm": 0.9797543883323669, + "learning_rate": 7.2123343258466334e-06, + "loss": 0.7786, + "step": 12932 + }, + { + "epoch": 0.7118168308657603, + "grad_norm": 0.6337804794311523, + "learning_rate": 7.211945591236872e-06, + "loss": 0.7147, + "step": 12933 + }, + { + "epoch": 0.711871869668116, + "grad_norm": 0.7450474500656128, + "learning_rate": 7.211556840002718e-06, + "loss": 0.8516, + "step": 12934 + }, + { + "epoch": 0.7119269084704717, + "grad_norm": 0.7786532640457153, + "learning_rate": 7.2111680721470965e-06, + "loss": 0.837, + "step": 12935 + }, + { + "epoch": 0.7119819472728274, + "grad_norm": 0.666020393371582, + "learning_rate": 7.210779287672927e-06, + "loss": 0.7646, + "step": 12936 + }, + { + "epoch": 0.712036986075183, + "grad_norm": 0.622648298740387, + "learning_rate": 7.210390486583132e-06, + "loss": 0.7102, + "step": 12937 + }, + { + "epoch": 0.7120920248775386, + "grad_norm": 0.7175952792167664, + "learning_rate": 7.210001668880634e-06, + "loss": 0.7043, + "step": 12938 + }, + { + "epoch": 0.7121470636798943, + "grad_norm": 0.8019681572914124, + "learning_rate": 7.209612834568353e-06, + "loss": 0.8166, + "step": 12939 + }, + { + "epoch": 0.71220210248225, + "grad_norm": 0.804457426071167, + "learning_rate": 7.209223983649216e-06, + "loss": 0.7182, + "step": 12940 + }, + { + "epoch": 0.7122571412846056, + "grad_norm": 0.7261730432510376, + "learning_rate": 7.208835116126143e-06, + "loss": 0.6634, + "step": 12941 + }, + { + "epoch": 0.7123121800869613, + "grad_norm": 0.7461307644844055, + "learning_rate": 7.208446232002055e-06, + "loss": 0.709, + "step": 12942 + }, + { + "epoch": 0.712367218889317, + "grad_norm": 0.6730383634567261, + "learning_rate": 7.208057331279877e-06, + "loss": 0.7111, + "step": 12943 + }, + { + "epoch": 0.7124222576916727, + "grad_norm": 0.829530656337738, + "learning_rate": 7.207668413962531e-06, + "loss": 0.729, + "step": 12944 + }, + { + "epoch": 0.7124772964940282, + "grad_norm": 0.5997991561889648, + "learning_rate": 7.20727948005294e-06, + "loss": 0.6385, + "step": 12945 + }, + { + "epoch": 0.7125323352963839, + "grad_norm": 0.9590086936950684, + "learning_rate": 7.206890529554027e-06, + "loss": 0.7217, + "step": 12946 + }, + { + "epoch": 0.7125873740987396, + "grad_norm": 0.7818330526351929, + "learning_rate": 7.206501562468717e-06, + "loss": 0.7276, + "step": 12947 + }, + { + "epoch": 0.7126424129010953, + "grad_norm": 0.6033679842948914, + "learning_rate": 7.206112578799931e-06, + "loss": 0.5935, + "step": 12948 + }, + { + "epoch": 0.7126974517034509, + "grad_norm": 0.7431650757789612, + "learning_rate": 7.205723578550593e-06, + "loss": 0.8649, + "step": 12949 + }, + { + "epoch": 0.7127524905058066, + "grad_norm": 0.7026848793029785, + "learning_rate": 7.205334561723627e-06, + "loss": 0.7484, + "step": 12950 + }, + { + "epoch": 0.7128075293081623, + "grad_norm": 0.6328058242797852, + "learning_rate": 7.204945528321956e-06, + "loss": 0.6994, + "step": 12951 + }, + { + "epoch": 0.712862568110518, + "grad_norm": 0.6806536912918091, + "learning_rate": 7.204556478348507e-06, + "loss": 0.7461, + "step": 12952 + }, + { + "epoch": 0.7129176069128735, + "grad_norm": 0.6822162866592407, + "learning_rate": 7.2041674118062e-06, + "loss": 0.7947, + "step": 12953 + }, + { + "epoch": 0.7129726457152292, + "grad_norm": 0.7283263802528381, + "learning_rate": 7.203778328697962e-06, + "loss": 0.7559, + "step": 12954 + }, + { + "epoch": 0.7130276845175849, + "grad_norm": 0.663564920425415, + "learning_rate": 7.203389229026714e-06, + "loss": 0.6898, + "step": 12955 + }, + { + "epoch": 0.7130827233199406, + "grad_norm": 0.7218708395957947, + "learning_rate": 7.203000112795383e-06, + "loss": 0.8095, + "step": 12956 + }, + { + "epoch": 0.7131377621222962, + "grad_norm": 0.6931518912315369, + "learning_rate": 7.202610980006893e-06, + "loss": 0.7591, + "step": 12957 + }, + { + "epoch": 0.7131928009246519, + "grad_norm": 0.6982918381690979, + "learning_rate": 7.2022218306641704e-06, + "loss": 0.7651, + "step": 12958 + }, + { + "epoch": 0.7132478397270076, + "grad_norm": 0.8033974170684814, + "learning_rate": 7.201832664770135e-06, + "loss": 0.8857, + "step": 12959 + }, + { + "epoch": 0.7133028785293632, + "grad_norm": 0.6625493764877319, + "learning_rate": 7.201443482327717e-06, + "loss": 0.752, + "step": 12960 + }, + { + "epoch": 0.7133579173317188, + "grad_norm": 0.8149683475494385, + "learning_rate": 7.201054283339838e-06, + "loss": 0.8528, + "step": 12961 + }, + { + "epoch": 0.7134129561340745, + "grad_norm": 0.7894958257675171, + "learning_rate": 7.200665067809425e-06, + "loss": 0.8554, + "step": 12962 + }, + { + "epoch": 0.7134679949364302, + "grad_norm": 0.7613523602485657, + "learning_rate": 7.200275835739401e-06, + "loss": 0.7435, + "step": 12963 + }, + { + "epoch": 0.7135230337387859, + "grad_norm": 0.665985643863678, + "learning_rate": 7.199886587132693e-06, + "loss": 0.7072, + "step": 12964 + }, + { + "epoch": 0.7135780725411415, + "grad_norm": 0.7523592710494995, + "learning_rate": 7.199497321992227e-06, + "loss": 0.7945, + "step": 12965 + }, + { + "epoch": 0.7136331113434972, + "grad_norm": 0.8894450664520264, + "learning_rate": 7.199108040320928e-06, + "loss": 0.7885, + "step": 12966 + }, + { + "epoch": 0.7136881501458529, + "grad_norm": 0.639108419418335, + "learning_rate": 7.198718742121722e-06, + "loss": 0.6975, + "step": 12967 + }, + { + "epoch": 0.7137431889482085, + "grad_norm": 0.670013964176178, + "learning_rate": 7.198329427397532e-06, + "loss": 0.7441, + "step": 12968 + }, + { + "epoch": 0.7137982277505641, + "grad_norm": 0.7695425748825073, + "learning_rate": 7.197940096151289e-06, + "loss": 0.7616, + "step": 12969 + }, + { + "epoch": 0.7138532665529198, + "grad_norm": 0.9098057150840759, + "learning_rate": 7.197550748385917e-06, + "loss": 0.9028, + "step": 12970 + }, + { + "epoch": 0.7139083053552755, + "grad_norm": 0.7677769660949707, + "learning_rate": 7.197161384104341e-06, + "loss": 0.7926, + "step": 12971 + }, + { + "epoch": 0.7139633441576311, + "grad_norm": 0.7020674347877502, + "learning_rate": 7.196772003309487e-06, + "loss": 0.7248, + "step": 12972 + }, + { + "epoch": 0.7140183829599868, + "grad_norm": 0.6616366505622864, + "learning_rate": 7.196382606004283e-06, + "loss": 0.7137, + "step": 12973 + }, + { + "epoch": 0.7140734217623425, + "grad_norm": 0.7174738645553589, + "learning_rate": 7.195993192191656e-06, + "loss": 0.8167, + "step": 12974 + }, + { + "epoch": 0.7141284605646981, + "grad_norm": 0.6672176122665405, + "learning_rate": 7.1956037618745325e-06, + "loss": 0.6516, + "step": 12975 + }, + { + "epoch": 0.7141834993670537, + "grad_norm": 0.714790403842926, + "learning_rate": 7.195214315055837e-06, + "loss": 0.865, + "step": 12976 + }, + { + "epoch": 0.7142385381694094, + "grad_norm": 0.6637690663337708, + "learning_rate": 7.1948248517385e-06, + "loss": 0.7328, + "step": 12977 + }, + { + "epoch": 0.7142935769717651, + "grad_norm": 0.8998367786407471, + "learning_rate": 7.194435371925446e-06, + "loss": 0.7097, + "step": 12978 + }, + { + "epoch": 0.7143486157741208, + "grad_norm": 0.7472445964813232, + "learning_rate": 7.194045875619604e-06, + "loss": 0.7556, + "step": 12979 + }, + { + "epoch": 0.7144036545764764, + "grad_norm": 0.7897135019302368, + "learning_rate": 7.1936563628239e-06, + "loss": 0.8728, + "step": 12980 + }, + { + "epoch": 0.714458693378832, + "grad_norm": 0.6520817279815674, + "learning_rate": 7.193266833541261e-06, + "loss": 0.6824, + "step": 12981 + }, + { + "epoch": 0.7145137321811877, + "grad_norm": 0.833849310874939, + "learning_rate": 7.192877287774618e-06, + "loss": 0.8877, + "step": 12982 + }, + { + "epoch": 0.7145687709835434, + "grad_norm": 0.7105151414871216, + "learning_rate": 7.192487725526896e-06, + "loss": 0.7799, + "step": 12983 + }, + { + "epoch": 0.714623809785899, + "grad_norm": 0.7515869140625, + "learning_rate": 7.192098146801021e-06, + "loss": 0.7012, + "step": 12984 + }, + { + "epoch": 0.7146788485882547, + "grad_norm": 0.7447199821472168, + "learning_rate": 7.191708551599923e-06, + "loss": 0.7545, + "step": 12985 + }, + { + "epoch": 0.7147338873906104, + "grad_norm": 0.8502823114395142, + "learning_rate": 7.191318939926532e-06, + "loss": 0.7232, + "step": 12986 + }, + { + "epoch": 0.7147889261929661, + "grad_norm": 0.7193031907081604, + "learning_rate": 7.190929311783774e-06, + "loss": 0.762, + "step": 12987 + }, + { + "epoch": 0.7148439649953217, + "grad_norm": 0.8479939699172974, + "learning_rate": 7.190539667174576e-06, + "loss": 0.7238, + "step": 12988 + }, + { + "epoch": 0.7148990037976773, + "grad_norm": 0.8313719630241394, + "learning_rate": 7.1901500061018704e-06, + "loss": 0.8145, + "step": 12989 + }, + { + "epoch": 0.714954042600033, + "grad_norm": 0.7019978165626526, + "learning_rate": 7.189760328568584e-06, + "loss": 0.6461, + "step": 12990 + }, + { + "epoch": 0.7150090814023887, + "grad_norm": 0.897280216217041, + "learning_rate": 7.1893706345776436e-06, + "loss": 0.818, + "step": 12991 + }, + { + "epoch": 0.7150641202047443, + "grad_norm": 0.7495617866516113, + "learning_rate": 7.1889809241319795e-06, + "loss": 0.7533, + "step": 12992 + }, + { + "epoch": 0.7151191590071, + "grad_norm": 0.733496904373169, + "learning_rate": 7.188591197234522e-06, + "loss": 0.7405, + "step": 12993 + }, + { + "epoch": 0.7151741978094557, + "grad_norm": 0.8873284459114075, + "learning_rate": 7.1882014538882e-06, + "loss": 0.7525, + "step": 12994 + }, + { + "epoch": 0.7152292366118114, + "grad_norm": 0.6693230271339417, + "learning_rate": 7.187811694095939e-06, + "loss": 0.7509, + "step": 12995 + }, + { + "epoch": 0.715284275414167, + "grad_norm": 0.8513357043266296, + "learning_rate": 7.187421917860671e-06, + "loss": 0.8111, + "step": 12996 + }, + { + "epoch": 0.7153393142165226, + "grad_norm": 0.6986566185951233, + "learning_rate": 7.187032125185326e-06, + "loss": 0.8013, + "step": 12997 + }, + { + "epoch": 0.7153943530188783, + "grad_norm": 0.7062557339668274, + "learning_rate": 7.1866423160728335e-06, + "loss": 0.7266, + "step": 12998 + }, + { + "epoch": 0.715449391821234, + "grad_norm": 0.6329573392868042, + "learning_rate": 7.186252490526122e-06, + "loss": 0.6753, + "step": 12999 + }, + { + "epoch": 0.7155044306235896, + "grad_norm": 0.6740719079971313, + "learning_rate": 7.185862648548122e-06, + "loss": 0.7197, + "step": 13000 + }, + { + "epoch": 0.7155594694259453, + "grad_norm": 0.7911732196807861, + "learning_rate": 7.185472790141764e-06, + "loss": 0.6939, + "step": 13001 + }, + { + "epoch": 0.715614508228301, + "grad_norm": 0.7368680238723755, + "learning_rate": 7.185082915309978e-06, + "loss": 0.6919, + "step": 13002 + }, + { + "epoch": 0.7156695470306567, + "grad_norm": 0.6374472975730896, + "learning_rate": 7.1846930240556925e-06, + "loss": 0.6645, + "step": 13003 + }, + { + "epoch": 0.7157245858330122, + "grad_norm": 0.6727073192596436, + "learning_rate": 7.184303116381839e-06, + "loss": 0.5995, + "step": 13004 + }, + { + "epoch": 0.7157796246353679, + "grad_norm": 0.6122208833694458, + "learning_rate": 7.183913192291348e-06, + "loss": 0.6755, + "step": 13005 + }, + { + "epoch": 0.7158346634377236, + "grad_norm": 0.7095892429351807, + "learning_rate": 7.1835232517871525e-06, + "loss": 0.8009, + "step": 13006 + }, + { + "epoch": 0.7158897022400793, + "grad_norm": 0.6828192472457886, + "learning_rate": 7.1831332948721786e-06, + "loss": 0.7755, + "step": 13007 + }, + { + "epoch": 0.7159447410424349, + "grad_norm": 0.7997334003448486, + "learning_rate": 7.182743321549359e-06, + "loss": 0.7259, + "step": 13008 + }, + { + "epoch": 0.7159997798447906, + "grad_norm": 0.7431252002716064, + "learning_rate": 7.182353331821626e-06, + "loss": 0.7765, + "step": 13009 + }, + { + "epoch": 0.7160548186471463, + "grad_norm": 0.7202625870704651, + "learning_rate": 7.181963325691907e-06, + "loss": 0.7638, + "step": 13010 + }, + { + "epoch": 0.716109857449502, + "grad_norm": 0.7617568373680115, + "learning_rate": 7.181573303163139e-06, + "loss": 0.825, + "step": 13011 + }, + { + "epoch": 0.7161648962518575, + "grad_norm": 0.7382665276527405, + "learning_rate": 7.181183264238247e-06, + "loss": 0.8005, + "step": 13012 + }, + { + "epoch": 0.7162199350542132, + "grad_norm": 0.7782611846923828, + "learning_rate": 7.180793208920167e-06, + "loss": 0.7044, + "step": 13013 + }, + { + "epoch": 0.7162749738565689, + "grad_norm": 0.7020898461341858, + "learning_rate": 7.18040313721183e-06, + "loss": 0.8059, + "step": 13014 + }, + { + "epoch": 0.7163300126589245, + "grad_norm": 1.2005099058151245, + "learning_rate": 7.1800130491161656e-06, + "loss": 0.6663, + "step": 13015 + }, + { + "epoch": 0.7163850514612802, + "grad_norm": 0.6663569211959839, + "learning_rate": 7.1796229446361066e-06, + "loss": 0.7046, + "step": 13016 + }, + { + "epoch": 0.7164400902636359, + "grad_norm": 0.7010110020637512, + "learning_rate": 7.1792328237745845e-06, + "loss": 0.6433, + "step": 13017 + }, + { + "epoch": 0.7164951290659916, + "grad_norm": 0.6447514891624451, + "learning_rate": 7.178842686534534e-06, + "loss": 0.7794, + "step": 13018 + }, + { + "epoch": 0.7165501678683471, + "grad_norm": 0.6813021302223206, + "learning_rate": 7.1784525329188835e-06, + "loss": 0.7413, + "step": 13019 + }, + { + "epoch": 0.7166052066707028, + "grad_norm": 0.6894733905792236, + "learning_rate": 7.178062362930567e-06, + "loss": 0.7896, + "step": 13020 + }, + { + "epoch": 0.7166602454730585, + "grad_norm": 0.6717034578323364, + "learning_rate": 7.177672176572517e-06, + "loss": 0.7599, + "step": 13021 + }, + { + "epoch": 0.7167152842754142, + "grad_norm": 0.7861666083335876, + "learning_rate": 7.177281973847665e-06, + "loss": 0.9068, + "step": 13022 + }, + { + "epoch": 0.7167703230777698, + "grad_norm": 0.6784214973449707, + "learning_rate": 7.176891754758946e-06, + "loss": 0.8319, + "step": 13023 + }, + { + "epoch": 0.7168253618801255, + "grad_norm": 0.7053580284118652, + "learning_rate": 7.176501519309289e-06, + "loss": 0.8085, + "step": 13024 + }, + { + "epoch": 0.7168804006824812, + "grad_norm": 0.9643208980560303, + "learning_rate": 7.176111267501631e-06, + "loss": 0.7799, + "step": 13025 + }, + { + "epoch": 0.7169354394848368, + "grad_norm": 0.8921111822128296, + "learning_rate": 7.175720999338902e-06, + "loss": 0.6465, + "step": 13026 + }, + { + "epoch": 0.7169904782871924, + "grad_norm": 0.7356166839599609, + "learning_rate": 7.1753307148240385e-06, + "loss": 0.7862, + "step": 13027 + }, + { + "epoch": 0.7170455170895481, + "grad_norm": 0.6906836628913879, + "learning_rate": 7.174940413959968e-06, + "loss": 0.7341, + "step": 13028 + }, + { + "epoch": 0.7171005558919038, + "grad_norm": 0.6229632496833801, + "learning_rate": 7.174550096749632e-06, + "loss": 0.721, + "step": 13029 + }, + { + "epoch": 0.7171555946942595, + "grad_norm": 0.6832499504089355, + "learning_rate": 7.174159763195958e-06, + "loss": 0.6733, + "step": 13030 + }, + { + "epoch": 0.7172106334966151, + "grad_norm": 0.8304060697555542, + "learning_rate": 7.1737694133018806e-06, + "loss": 0.7732, + "step": 13031 + }, + { + "epoch": 0.7172656722989708, + "grad_norm": 0.6813186407089233, + "learning_rate": 7.173379047070333e-06, + "loss": 0.7742, + "step": 13032 + }, + { + "epoch": 0.7173207111013264, + "grad_norm": 0.6671963930130005, + "learning_rate": 7.172988664504252e-06, + "loss": 0.6516, + "step": 13033 + }, + { + "epoch": 0.7173757499036821, + "grad_norm": 0.661108136177063, + "learning_rate": 7.172598265606569e-06, + "loss": 0.7361, + "step": 13034 + }, + { + "epoch": 0.7174307887060377, + "grad_norm": 0.7097620368003845, + "learning_rate": 7.1722078503802196e-06, + "loss": 0.8142, + "step": 13035 + }, + { + "epoch": 0.7174858275083934, + "grad_norm": 0.7663383483886719, + "learning_rate": 7.1718174188281365e-06, + "loss": 0.8149, + "step": 13036 + }, + { + "epoch": 0.7175408663107491, + "grad_norm": 0.7142401337623596, + "learning_rate": 7.171426970953256e-06, + "loss": 0.7539, + "step": 13037 + }, + { + "epoch": 0.7175959051131048, + "grad_norm": 0.667346715927124, + "learning_rate": 7.171036506758512e-06, + "loss": 0.7517, + "step": 13038 + }, + { + "epoch": 0.7176509439154604, + "grad_norm": 0.5933231711387634, + "learning_rate": 7.170646026246838e-06, + "loss": 0.6852, + "step": 13039 + }, + { + "epoch": 0.717705982717816, + "grad_norm": 0.730015218257904, + "learning_rate": 7.170255529421168e-06, + "loss": 0.7316, + "step": 13040 + }, + { + "epoch": 0.7177610215201717, + "grad_norm": 0.6146146059036255, + "learning_rate": 7.169865016284442e-06, + "loss": 0.6715, + "step": 13041 + }, + { + "epoch": 0.7178160603225274, + "grad_norm": 0.694131076335907, + "learning_rate": 7.16947448683959e-06, + "loss": 0.7944, + "step": 13042 + }, + { + "epoch": 0.717871099124883, + "grad_norm": 0.6736807823181152, + "learning_rate": 7.169083941089547e-06, + "loss": 0.7922, + "step": 13043 + }, + { + "epoch": 0.7179261379272387, + "grad_norm": 0.6748425364494324, + "learning_rate": 7.16869337903725e-06, + "loss": 0.6738, + "step": 13044 + }, + { + "epoch": 0.7179811767295944, + "grad_norm": 0.6807510852813721, + "learning_rate": 7.168302800685635e-06, + "loss": 0.7291, + "step": 13045 + }, + { + "epoch": 0.7180362155319501, + "grad_norm": 0.6613160371780396, + "learning_rate": 7.167912206037637e-06, + "loss": 0.6839, + "step": 13046 + }, + { + "epoch": 0.7180912543343057, + "grad_norm": 0.7184692621231079, + "learning_rate": 7.16752159509619e-06, + "loss": 0.6748, + "step": 13047 + }, + { + "epoch": 0.7181462931366613, + "grad_norm": 0.6938989758491516, + "learning_rate": 7.167130967864231e-06, + "loss": 0.7926, + "step": 13048 + }, + { + "epoch": 0.718201331939017, + "grad_norm": 0.6871020793914795, + "learning_rate": 7.166740324344696e-06, + "loss": 0.8229, + "step": 13049 + }, + { + "epoch": 0.7182563707413727, + "grad_norm": 0.8003624081611633, + "learning_rate": 7.166349664540521e-06, + "loss": 0.8488, + "step": 13050 + }, + { + "epoch": 0.7183114095437283, + "grad_norm": 0.7309357523918152, + "learning_rate": 7.165958988454642e-06, + "loss": 0.7442, + "step": 13051 + }, + { + "epoch": 0.718366448346084, + "grad_norm": 0.7462141513824463, + "learning_rate": 7.165568296089993e-06, + "loss": 0.8014, + "step": 13052 + }, + { + "epoch": 0.7184214871484397, + "grad_norm": 0.8335661292076111, + "learning_rate": 7.165177587449516e-06, + "loss": 0.6773, + "step": 13053 + }, + { + "epoch": 0.7184765259507954, + "grad_norm": 0.6996884346008301, + "learning_rate": 7.164786862536142e-06, + "loss": 0.7491, + "step": 13054 + }, + { + "epoch": 0.7185315647531509, + "grad_norm": 0.7203043103218079, + "learning_rate": 7.164396121352809e-06, + "loss": 0.7196, + "step": 13055 + }, + { + "epoch": 0.7185866035555066, + "grad_norm": 0.7109461426734924, + "learning_rate": 7.164005363902453e-06, + "loss": 0.7336, + "step": 13056 + }, + { + "epoch": 0.7186416423578623, + "grad_norm": 0.7057282328605652, + "learning_rate": 7.1636145901880135e-06, + "loss": 0.734, + "step": 13057 + }, + { + "epoch": 0.7186966811602179, + "grad_norm": 0.7288782000541687, + "learning_rate": 7.163223800212427e-06, + "loss": 0.8141, + "step": 13058 + }, + { + "epoch": 0.7187517199625736, + "grad_norm": 0.6812320947647095, + "learning_rate": 7.162832993978628e-06, + "loss": 0.7525, + "step": 13059 + }, + { + "epoch": 0.7188067587649293, + "grad_norm": 0.6782627105712891, + "learning_rate": 7.1624421714895546e-06, + "loss": 0.7647, + "step": 13060 + }, + { + "epoch": 0.718861797567285, + "grad_norm": 0.7361965775489807, + "learning_rate": 7.162051332748146e-06, + "loss": 0.7774, + "step": 13061 + }, + { + "epoch": 0.7189168363696405, + "grad_norm": 0.68894362449646, + "learning_rate": 7.161660477757337e-06, + "loss": 0.767, + "step": 13062 + }, + { + "epoch": 0.7189718751719962, + "grad_norm": 0.6440854668617249, + "learning_rate": 7.161269606520067e-06, + "loss": 0.7062, + "step": 13063 + }, + { + "epoch": 0.7190269139743519, + "grad_norm": 0.8411546945571899, + "learning_rate": 7.160878719039273e-06, + "loss": 0.728, + "step": 13064 + }, + { + "epoch": 0.7190819527767076, + "grad_norm": 0.6895145177841187, + "learning_rate": 7.160487815317895e-06, + "loss": 0.6667, + "step": 13065 + }, + { + "epoch": 0.7191369915790632, + "grad_norm": 0.6943626403808594, + "learning_rate": 7.160096895358866e-06, + "loss": 0.7579, + "step": 13066 + }, + { + "epoch": 0.7191920303814189, + "grad_norm": 0.7940205335617065, + "learning_rate": 7.1597059591651294e-06, + "loss": 0.7286, + "step": 13067 + }, + { + "epoch": 0.7192470691837746, + "grad_norm": 0.7350896000862122, + "learning_rate": 7.159315006739619e-06, + "loss": 0.7174, + "step": 13068 + }, + { + "epoch": 0.7193021079861303, + "grad_norm": 0.7663372159004211, + "learning_rate": 7.158924038085275e-06, + "loss": 0.7871, + "step": 13069 + }, + { + "epoch": 0.7193571467884858, + "grad_norm": 0.7368965744972229, + "learning_rate": 7.1585330532050375e-06, + "loss": 0.7356, + "step": 13070 + }, + { + "epoch": 0.7194121855908415, + "grad_norm": 0.7345212697982788, + "learning_rate": 7.158142052101843e-06, + "loss": 0.7784, + "step": 13071 + }, + { + "epoch": 0.7194672243931972, + "grad_norm": 0.7847188711166382, + "learning_rate": 7.157751034778629e-06, + "loss": 0.7899, + "step": 13072 + }, + { + "epoch": 0.7195222631955529, + "grad_norm": 0.757514476776123, + "learning_rate": 7.157360001238337e-06, + "loss": 0.8899, + "step": 13073 + }, + { + "epoch": 0.7195773019979085, + "grad_norm": 0.73405522108078, + "learning_rate": 7.156968951483905e-06, + "loss": 0.7283, + "step": 13074 + }, + { + "epoch": 0.7196323408002642, + "grad_norm": 0.7950206398963928, + "learning_rate": 7.156577885518271e-06, + "loss": 0.7338, + "step": 13075 + }, + { + "epoch": 0.7196873796026199, + "grad_norm": 0.8082411289215088, + "learning_rate": 7.156186803344374e-06, + "loss": 0.711, + "step": 13076 + }, + { + "epoch": 0.7197424184049755, + "grad_norm": 0.6868693828582764, + "learning_rate": 7.1557957049651574e-06, + "loss": 0.7583, + "step": 13077 + }, + { + "epoch": 0.7197974572073311, + "grad_norm": 0.7226251363754272, + "learning_rate": 7.155404590383554e-06, + "loss": 0.746, + "step": 13078 + }, + { + "epoch": 0.7198524960096868, + "grad_norm": 0.7437220811843872, + "learning_rate": 7.155013459602509e-06, + "loss": 0.6884, + "step": 13079 + }, + { + "epoch": 0.7199075348120425, + "grad_norm": 0.7486164569854736, + "learning_rate": 7.154622312624958e-06, + "loss": 0.6968, + "step": 13080 + }, + { + "epoch": 0.7199625736143982, + "grad_norm": 0.7709106802940369, + "learning_rate": 7.154231149453843e-06, + "loss": 0.838, + "step": 13081 + }, + { + "epoch": 0.7200176124167538, + "grad_norm": 0.6962981224060059, + "learning_rate": 7.153839970092104e-06, + "loss": 0.7186, + "step": 13082 + }, + { + "epoch": 0.7200726512191095, + "grad_norm": 0.8195380568504333, + "learning_rate": 7.15344877454268e-06, + "loss": 0.7949, + "step": 13083 + }, + { + "epoch": 0.7201276900214651, + "grad_norm": 0.735285758972168, + "learning_rate": 7.15305756280851e-06, + "loss": 0.7477, + "step": 13084 + }, + { + "epoch": 0.7201827288238208, + "grad_norm": 0.6121101379394531, + "learning_rate": 7.1526663348925375e-06, + "loss": 0.6686, + "step": 13085 + }, + { + "epoch": 0.7202377676261764, + "grad_norm": 0.7204885482788086, + "learning_rate": 7.1522750907977e-06, + "loss": 0.8013, + "step": 13086 + }, + { + "epoch": 0.7202928064285321, + "grad_norm": 0.6808584332466125, + "learning_rate": 7.15188383052694e-06, + "loss": 0.7847, + "step": 13087 + }, + { + "epoch": 0.7203478452308878, + "grad_norm": 0.7049086093902588, + "learning_rate": 7.151492554083195e-06, + "loss": 0.7563, + "step": 13088 + }, + { + "epoch": 0.7204028840332435, + "grad_norm": 0.765708327293396, + "learning_rate": 7.151101261469411e-06, + "loss": 0.7648, + "step": 13089 + }, + { + "epoch": 0.7204579228355991, + "grad_norm": 0.6810007095336914, + "learning_rate": 7.150709952688525e-06, + "loss": 0.731, + "step": 13090 + }, + { + "epoch": 0.7205129616379548, + "grad_norm": 0.7242745757102966, + "learning_rate": 7.150318627743478e-06, + "loss": 0.8027, + "step": 13091 + }, + { + "epoch": 0.7205680004403104, + "grad_norm": 0.7452220916748047, + "learning_rate": 7.14992728663721e-06, + "loss": 0.7848, + "step": 13092 + }, + { + "epoch": 0.7206230392426661, + "grad_norm": 0.6333943605422974, + "learning_rate": 7.149535929372667e-06, + "loss": 0.7105, + "step": 13093 + }, + { + "epoch": 0.7206780780450217, + "grad_norm": 0.7565333247184753, + "learning_rate": 7.149144555952785e-06, + "loss": 0.8006, + "step": 13094 + }, + { + "epoch": 0.7207331168473774, + "grad_norm": 0.7703632712364197, + "learning_rate": 7.14875316638051e-06, + "loss": 0.7323, + "step": 13095 + }, + { + "epoch": 0.7207881556497331, + "grad_norm": 0.6275011301040649, + "learning_rate": 7.148361760658779e-06, + "loss": 0.6817, + "step": 13096 + }, + { + "epoch": 0.7208431944520888, + "grad_norm": 0.7363598942756653, + "learning_rate": 7.147970338790537e-06, + "loss": 0.7641, + "step": 13097 + }, + { + "epoch": 0.7208982332544444, + "grad_norm": 0.6284294724464417, + "learning_rate": 7.147578900778727e-06, + "loss": 0.7117, + "step": 13098 + }, + { + "epoch": 0.7209532720568, + "grad_norm": 0.7878503203392029, + "learning_rate": 7.147187446626287e-06, + "loss": 0.8184, + "step": 13099 + }, + { + "epoch": 0.7210083108591557, + "grad_norm": 0.6973691582679749, + "learning_rate": 7.146795976336159e-06, + "loss": 0.7815, + "step": 13100 + }, + { + "epoch": 0.7210633496615113, + "grad_norm": 0.7018479704856873, + "learning_rate": 7.146404489911291e-06, + "loss": 0.7305, + "step": 13101 + }, + { + "epoch": 0.721118388463867, + "grad_norm": 0.6903830766677856, + "learning_rate": 7.14601298735462e-06, + "loss": 0.7074, + "step": 13102 + }, + { + "epoch": 0.7211734272662227, + "grad_norm": 0.7612621188163757, + "learning_rate": 7.145621468669089e-06, + "loss": 0.8189, + "step": 13103 + }, + { + "epoch": 0.7212284660685784, + "grad_norm": 0.7256856560707092, + "learning_rate": 7.145229933857643e-06, + "loss": 0.5959, + "step": 13104 + }, + { + "epoch": 0.721283504870934, + "grad_norm": 0.6632323265075684, + "learning_rate": 7.1448383829232205e-06, + "loss": 0.7519, + "step": 13105 + }, + { + "epoch": 0.7213385436732896, + "grad_norm": 0.6320651769638062, + "learning_rate": 7.144446815868768e-06, + "loss": 0.7259, + "step": 13106 + }, + { + "epoch": 0.7213935824756453, + "grad_norm": 0.6883212924003601, + "learning_rate": 7.144055232697227e-06, + "loss": 0.7776, + "step": 13107 + }, + { + "epoch": 0.721448621278001, + "grad_norm": 0.7159759402275085, + "learning_rate": 7.1436636334115415e-06, + "loss": 0.6915, + "step": 13108 + }, + { + "epoch": 0.7215036600803566, + "grad_norm": 0.7108080983161926, + "learning_rate": 7.1432720180146535e-06, + "loss": 0.731, + "step": 13109 + }, + { + "epoch": 0.7215586988827123, + "grad_norm": 0.7765033841133118, + "learning_rate": 7.142880386509506e-06, + "loss": 0.6965, + "step": 13110 + }, + { + "epoch": 0.721613737685068, + "grad_norm": 0.7205119132995605, + "learning_rate": 7.142488738899045e-06, + "loss": 0.7262, + "step": 13111 + }, + { + "epoch": 0.7216687764874237, + "grad_norm": 0.6786921620368958, + "learning_rate": 7.142097075186212e-06, + "loss": 0.805, + "step": 13112 + }, + { + "epoch": 0.7217238152897792, + "grad_norm": 0.7947409152984619, + "learning_rate": 7.141705395373949e-06, + "loss": 0.7701, + "step": 13113 + }, + { + "epoch": 0.7217788540921349, + "grad_norm": 0.6672971844673157, + "learning_rate": 7.141313699465204e-06, + "loss": 0.7325, + "step": 13114 + }, + { + "epoch": 0.7218338928944906, + "grad_norm": 0.641765296459198, + "learning_rate": 7.140921987462916e-06, + "loss": 0.7902, + "step": 13115 + }, + { + "epoch": 0.7218889316968463, + "grad_norm": 0.6675699353218079, + "learning_rate": 7.140530259370032e-06, + "loss": 0.7422, + "step": 13116 + }, + { + "epoch": 0.7219439704992019, + "grad_norm": 0.6940729022026062, + "learning_rate": 7.140138515189495e-06, + "loss": 0.6978, + "step": 13117 + }, + { + "epoch": 0.7219990093015576, + "grad_norm": 0.6805779337882996, + "learning_rate": 7.1397467549242514e-06, + "loss": 0.7498, + "step": 13118 + }, + { + "epoch": 0.7220540481039133, + "grad_norm": 0.6231662631034851, + "learning_rate": 7.139354978577243e-06, + "loss": 0.7344, + "step": 13119 + }, + { + "epoch": 0.722109086906269, + "grad_norm": 0.6883575916290283, + "learning_rate": 7.138963186151416e-06, + "loss": 0.835, + "step": 13120 + }, + { + "epoch": 0.7221641257086245, + "grad_norm": 0.6902666687965393, + "learning_rate": 7.138571377649712e-06, + "loss": 0.7427, + "step": 13121 + }, + { + "epoch": 0.7222191645109802, + "grad_norm": 0.7156440019607544, + "learning_rate": 7.1381795530750805e-06, + "loss": 0.7661, + "step": 13122 + }, + { + "epoch": 0.7222742033133359, + "grad_norm": 0.6727150678634644, + "learning_rate": 7.137787712430464e-06, + "loss": 0.7872, + "step": 13123 + }, + { + "epoch": 0.7223292421156916, + "grad_norm": 0.6200405359268188, + "learning_rate": 7.137395855718806e-06, + "loss": 0.6108, + "step": 13124 + }, + { + "epoch": 0.7223842809180472, + "grad_norm": 0.6384756565093994, + "learning_rate": 7.137003982943054e-06, + "loss": 0.698, + "step": 13125 + }, + { + "epoch": 0.7224393197204029, + "grad_norm": 0.7212089896202087, + "learning_rate": 7.1366120941061515e-06, + "loss": 0.7679, + "step": 13126 + }, + { + "epoch": 0.7224943585227586, + "grad_norm": 0.737352192401886, + "learning_rate": 7.136220189211044e-06, + "loss": 0.8173, + "step": 13127 + }, + { + "epoch": 0.7225493973251143, + "grad_norm": 0.6244099736213684, + "learning_rate": 7.135828268260679e-06, + "loss": 0.7224, + "step": 13128 + }, + { + "epoch": 0.7226044361274698, + "grad_norm": 0.8191885948181152, + "learning_rate": 7.135436331257997e-06, + "loss": 0.8122, + "step": 13129 + }, + { + "epoch": 0.7226594749298255, + "grad_norm": 0.7069095373153687, + "learning_rate": 7.135044378205949e-06, + "loss": 0.7844, + "step": 13130 + }, + { + "epoch": 0.7227145137321812, + "grad_norm": 0.6094380021095276, + "learning_rate": 7.13465240910748e-06, + "loss": 0.7093, + "step": 13131 + }, + { + "epoch": 0.7227695525345369, + "grad_norm": 0.7075843811035156, + "learning_rate": 7.134260423965534e-06, + "loss": 0.8109, + "step": 13132 + }, + { + "epoch": 0.7228245913368925, + "grad_norm": 0.6684398651123047, + "learning_rate": 7.133868422783057e-06, + "loss": 0.7224, + "step": 13133 + }, + { + "epoch": 0.7228796301392482, + "grad_norm": 0.6574007272720337, + "learning_rate": 7.133476405562998e-06, + "loss": 0.6763, + "step": 13134 + }, + { + "epoch": 0.7229346689416039, + "grad_norm": 0.7124022841453552, + "learning_rate": 7.133084372308301e-06, + "loss": 0.8047, + "step": 13135 + }, + { + "epoch": 0.7229897077439595, + "grad_norm": 0.7035976648330688, + "learning_rate": 7.1326923230219124e-06, + "loss": 0.7544, + "step": 13136 + }, + { + "epoch": 0.7230447465463151, + "grad_norm": 0.7007604241371155, + "learning_rate": 7.132300257706779e-06, + "loss": 0.7584, + "step": 13137 + }, + { + "epoch": 0.7230997853486708, + "grad_norm": 0.6917324066162109, + "learning_rate": 7.131908176365848e-06, + "loss": 0.6846, + "step": 13138 + }, + { + "epoch": 0.7231548241510265, + "grad_norm": 0.6857448816299438, + "learning_rate": 7.1315160790020666e-06, + "loss": 0.8142, + "step": 13139 + }, + { + "epoch": 0.7232098629533822, + "grad_norm": 0.8381820321083069, + "learning_rate": 7.13112396561838e-06, + "loss": 0.8132, + "step": 13140 + }, + { + "epoch": 0.7232649017557378, + "grad_norm": 0.7024879455566406, + "learning_rate": 7.130731836217735e-06, + "loss": 0.7157, + "step": 13141 + }, + { + "epoch": 0.7233199405580935, + "grad_norm": 0.7313332557678223, + "learning_rate": 7.130339690803081e-06, + "loss": 0.7623, + "step": 13142 + }, + { + "epoch": 0.7233749793604491, + "grad_norm": 0.697536051273346, + "learning_rate": 7.129947529377364e-06, + "loss": 0.7202, + "step": 13143 + }, + { + "epoch": 0.7234300181628047, + "grad_norm": 0.6946722865104675, + "learning_rate": 7.129555351943533e-06, + "loss": 0.7862, + "step": 13144 + }, + { + "epoch": 0.7234850569651604, + "grad_norm": 0.6643924117088318, + "learning_rate": 7.129163158504532e-06, + "loss": 0.7055, + "step": 13145 + }, + { + "epoch": 0.7235400957675161, + "grad_norm": 0.7285693287849426, + "learning_rate": 7.1287709490633104e-06, + "loss": 0.6815, + "step": 13146 + }, + { + "epoch": 0.7235951345698718, + "grad_norm": 1.2701799869537354, + "learning_rate": 7.128378723622818e-06, + "loss": 0.8596, + "step": 13147 + }, + { + "epoch": 0.7236501733722274, + "grad_norm": 0.7067306041717529, + "learning_rate": 7.127986482186e-06, + "loss": 0.7077, + "step": 13148 + }, + { + "epoch": 0.7237052121745831, + "grad_norm": 0.8863486051559448, + "learning_rate": 7.127594224755805e-06, + "loss": 0.8961, + "step": 13149 + }, + { + "epoch": 0.7237602509769387, + "grad_norm": 0.7286190986633301, + "learning_rate": 7.127201951335182e-06, + "loss": 0.7941, + "step": 13150 + }, + { + "epoch": 0.7238152897792944, + "grad_norm": 0.8756779432296753, + "learning_rate": 7.126809661927079e-06, + "loss": 0.7862, + "step": 13151 + }, + { + "epoch": 0.72387032858165, + "grad_norm": 0.7780876755714417, + "learning_rate": 7.126417356534443e-06, + "loss": 0.7095, + "step": 13152 + }, + { + "epoch": 0.7239253673840057, + "grad_norm": 0.6332812905311584, + "learning_rate": 7.1260250351602225e-06, + "loss": 0.7057, + "step": 13153 + }, + { + "epoch": 0.7239804061863614, + "grad_norm": 0.8350435495376587, + "learning_rate": 7.125632697807368e-06, + "loss": 0.7695, + "step": 13154 + }, + { + "epoch": 0.7240354449887171, + "grad_norm": 0.8306411504745483, + "learning_rate": 7.125240344478827e-06, + "loss": 0.6605, + "step": 13155 + }, + { + "epoch": 0.7240904837910727, + "grad_norm": 0.7495117783546448, + "learning_rate": 7.124847975177548e-06, + "loss": 0.8078, + "step": 13156 + }, + { + "epoch": 0.7241455225934283, + "grad_norm": 0.6481010317802429, + "learning_rate": 7.12445558990648e-06, + "loss": 0.8094, + "step": 13157 + }, + { + "epoch": 0.724200561395784, + "grad_norm": 0.7742613554000854, + "learning_rate": 7.124063188668573e-06, + "loss": 0.78, + "step": 13158 + }, + { + "epoch": 0.7242556001981397, + "grad_norm": 0.8394206762313843, + "learning_rate": 7.123670771466776e-06, + "loss": 0.8983, + "step": 13159 + }, + { + "epoch": 0.7243106390004953, + "grad_norm": 0.7196840047836304, + "learning_rate": 7.123278338304038e-06, + "loss": 0.7203, + "step": 13160 + }, + { + "epoch": 0.724365677802851, + "grad_norm": 0.5964440107345581, + "learning_rate": 7.122885889183309e-06, + "loss": 0.6251, + "step": 13161 + }, + { + "epoch": 0.7244207166052067, + "grad_norm": 0.7394048571586609, + "learning_rate": 7.1224934241075375e-06, + "loss": 0.7755, + "step": 13162 + }, + { + "epoch": 0.7244757554075624, + "grad_norm": 0.6427145004272461, + "learning_rate": 7.1221009430796724e-06, + "loss": 0.74, + "step": 13163 + }, + { + "epoch": 0.724530794209918, + "grad_norm": 0.7084387540817261, + "learning_rate": 7.121708446102667e-06, + "loss": 0.7464, + "step": 13164 + }, + { + "epoch": 0.7245858330122736, + "grad_norm": 0.6623230576515198, + "learning_rate": 7.121315933179466e-06, + "loss": 0.7237, + "step": 13165 + }, + { + "epoch": 0.7246408718146293, + "grad_norm": 0.9234243631362915, + "learning_rate": 7.120923404313024e-06, + "loss": 0.8238, + "step": 13166 + }, + { + "epoch": 0.724695910616985, + "grad_norm": 0.6458896994590759, + "learning_rate": 7.120530859506289e-06, + "loss": 0.8105, + "step": 13167 + }, + { + "epoch": 0.7247509494193406, + "grad_norm": 0.7160854935646057, + "learning_rate": 7.1201382987622115e-06, + "loss": 0.7954, + "step": 13168 + }, + { + "epoch": 0.7248059882216963, + "grad_norm": 0.6896069645881653, + "learning_rate": 7.119745722083742e-06, + "loss": 0.7281, + "step": 13169 + }, + { + "epoch": 0.724861027024052, + "grad_norm": 0.6609574556350708, + "learning_rate": 7.119353129473831e-06, + "loss": 0.7682, + "step": 13170 + }, + { + "epoch": 0.7249160658264077, + "grad_norm": 0.6477035880088806, + "learning_rate": 7.118960520935429e-06, + "loss": 0.8183, + "step": 13171 + }, + { + "epoch": 0.7249711046287632, + "grad_norm": 1.4488556385040283, + "learning_rate": 7.1185678964714885e-06, + "loss": 0.8321, + "step": 13172 + }, + { + "epoch": 0.7250261434311189, + "grad_norm": 0.8502382040023804, + "learning_rate": 7.118175256084958e-06, + "loss": 0.7881, + "step": 13173 + }, + { + "epoch": 0.7250811822334746, + "grad_norm": 0.6969912648200989, + "learning_rate": 7.117782599778788e-06, + "loss": 0.7598, + "step": 13174 + }, + { + "epoch": 0.7251362210358303, + "grad_norm": 0.7254889011383057, + "learning_rate": 7.117389927555933e-06, + "loss": 0.8473, + "step": 13175 + }, + { + "epoch": 0.7251912598381859, + "grad_norm": 0.9958444237709045, + "learning_rate": 7.116997239419341e-06, + "loss": 0.7558, + "step": 13176 + }, + { + "epoch": 0.7252462986405416, + "grad_norm": 0.6694881916046143, + "learning_rate": 7.116604535371963e-06, + "loss": 0.7072, + "step": 13177 + }, + { + "epoch": 0.7253013374428973, + "grad_norm": 1.0730634927749634, + "learning_rate": 7.116211815416754e-06, + "loss": 0.7607, + "step": 13178 + }, + { + "epoch": 0.725356376245253, + "grad_norm": 0.6770226359367371, + "learning_rate": 7.115819079556663e-06, + "loss": 0.7213, + "step": 13179 + }, + { + "epoch": 0.7254114150476085, + "grad_norm": 0.866215705871582, + "learning_rate": 7.115426327794642e-06, + "loss": 0.7273, + "step": 13180 + }, + { + "epoch": 0.7254664538499642, + "grad_norm": 0.7303730845451355, + "learning_rate": 7.115033560133642e-06, + "loss": 0.764, + "step": 13181 + }, + { + "epoch": 0.7255214926523199, + "grad_norm": 0.6900389194488525, + "learning_rate": 7.114640776576617e-06, + "loss": 0.6958, + "step": 13182 + }, + { + "epoch": 0.7255765314546756, + "grad_norm": 0.7255710959434509, + "learning_rate": 7.114247977126518e-06, + "loss": 0.6507, + "step": 13183 + }, + { + "epoch": 0.7256315702570312, + "grad_norm": 0.6848479509353638, + "learning_rate": 7.113855161786297e-06, + "loss": 0.6848, + "step": 13184 + }, + { + "epoch": 0.7256866090593869, + "grad_norm": 0.6800528764724731, + "learning_rate": 7.113462330558907e-06, + "loss": 0.7354, + "step": 13185 + }, + { + "epoch": 0.7257416478617426, + "grad_norm": 0.7271339297294617, + "learning_rate": 7.113069483447299e-06, + "loss": 0.7695, + "step": 13186 + }, + { + "epoch": 0.7257966866640981, + "grad_norm": 0.8212381601333618, + "learning_rate": 7.112676620454427e-06, + "loss": 0.7348, + "step": 13187 + }, + { + "epoch": 0.7258517254664538, + "grad_norm": 0.6714771389961243, + "learning_rate": 7.112283741583242e-06, + "loss": 0.75, + "step": 13188 + }, + { + "epoch": 0.7259067642688095, + "grad_norm": 0.7834941148757935, + "learning_rate": 7.111890846836699e-06, + "loss": 0.6914, + "step": 13189 + }, + { + "epoch": 0.7259618030711652, + "grad_norm": 0.8107824325561523, + "learning_rate": 7.111497936217748e-06, + "loss": 0.803, + "step": 13190 + }, + { + "epoch": 0.7260168418735208, + "grad_norm": 0.6306549906730652, + "learning_rate": 7.1111050097293464e-06, + "loss": 0.7915, + "step": 13191 + }, + { + "epoch": 0.7260718806758765, + "grad_norm": 0.7030252814292908, + "learning_rate": 7.110712067374444e-06, + "loss": 0.7091, + "step": 13192 + }, + { + "epoch": 0.7261269194782322, + "grad_norm": 0.7625641226768494, + "learning_rate": 7.110319109155992e-06, + "loss": 0.774, + "step": 13193 + }, + { + "epoch": 0.7261819582805878, + "grad_norm": 0.6382628083229065, + "learning_rate": 7.109926135076949e-06, + "loss": 0.6774, + "step": 13194 + }, + { + "epoch": 0.7262369970829434, + "grad_norm": 0.6594563722610474, + "learning_rate": 7.109533145140265e-06, + "loss": 0.7977, + "step": 13195 + }, + { + "epoch": 0.7262920358852991, + "grad_norm": 0.7177248001098633, + "learning_rate": 7.109140139348895e-06, + "loss": 0.6771, + "step": 13196 + }, + { + "epoch": 0.7263470746876548, + "grad_norm": 0.6631305813789368, + "learning_rate": 7.108747117705792e-06, + "loss": 0.6877, + "step": 13197 + }, + { + "epoch": 0.7264021134900105, + "grad_norm": 0.6783736944198608, + "learning_rate": 7.10835408021391e-06, + "loss": 0.8048, + "step": 13198 + }, + { + "epoch": 0.7264571522923661, + "grad_norm": 0.7368303537368774, + "learning_rate": 7.107961026876204e-06, + "loss": 0.7962, + "step": 13199 + }, + { + "epoch": 0.7265121910947218, + "grad_norm": 0.7697044014930725, + "learning_rate": 7.107567957695627e-06, + "loss": 0.769, + "step": 13200 + }, + { + "epoch": 0.7265672298970774, + "grad_norm": 0.639934241771698, + "learning_rate": 7.1071748726751325e-06, + "loss": 0.722, + "step": 13201 + }, + { + "epoch": 0.7266222686994331, + "grad_norm": 0.8410669565200806, + "learning_rate": 7.106781771817676e-06, + "loss": 0.8861, + "step": 13202 + }, + { + "epoch": 0.7266773075017887, + "grad_norm": 0.654924213886261, + "learning_rate": 7.106388655126212e-06, + "loss": 0.7463, + "step": 13203 + }, + { + "epoch": 0.7267323463041444, + "grad_norm": 0.719714879989624, + "learning_rate": 7.105995522603695e-06, + "loss": 0.759, + "step": 13204 + }, + { + "epoch": 0.7267873851065001, + "grad_norm": 0.7019139528274536, + "learning_rate": 7.105602374253078e-06, + "loss": 0.7965, + "step": 13205 + }, + { + "epoch": 0.7268424239088558, + "grad_norm": 0.7289487719535828, + "learning_rate": 7.105209210077318e-06, + "loss": 0.8591, + "step": 13206 + }, + { + "epoch": 0.7268974627112114, + "grad_norm": 0.670274019241333, + "learning_rate": 7.104816030079369e-06, + "loss": 0.7707, + "step": 13207 + }, + { + "epoch": 0.726952501513567, + "grad_norm": 0.7156813740730286, + "learning_rate": 7.104422834262187e-06, + "loss": 0.7724, + "step": 13208 + }, + { + "epoch": 0.7270075403159227, + "grad_norm": 0.6776198148727417, + "learning_rate": 7.104029622628726e-06, + "loss": 0.7331, + "step": 13209 + }, + { + "epoch": 0.7270625791182784, + "grad_norm": 0.8008358478546143, + "learning_rate": 7.103636395181941e-06, + "loss": 0.8279, + "step": 13210 + }, + { + "epoch": 0.727117617920634, + "grad_norm": 0.6622886061668396, + "learning_rate": 7.1032431519247876e-06, + "loss": 0.6646, + "step": 13211 + }, + { + "epoch": 0.7271726567229897, + "grad_norm": 0.6834877729415894, + "learning_rate": 7.102849892860223e-06, + "loss": 0.75, + "step": 13212 + }, + { + "epoch": 0.7272276955253454, + "grad_norm": 0.7659596800804138, + "learning_rate": 7.1024566179912e-06, + "loss": 0.6999, + "step": 13213 + }, + { + "epoch": 0.7272827343277011, + "grad_norm": 0.7368002533912659, + "learning_rate": 7.102063327320677e-06, + "loss": 0.7376, + "step": 13214 + }, + { + "epoch": 0.7273377731300567, + "grad_norm": 0.7286058664321899, + "learning_rate": 7.101670020851609e-06, + "loss": 0.8139, + "step": 13215 + }, + { + "epoch": 0.7273928119324123, + "grad_norm": 1.0521546602249146, + "learning_rate": 7.101276698586951e-06, + "loss": 0.8545, + "step": 13216 + }, + { + "epoch": 0.727447850734768, + "grad_norm": 0.6940305233001709, + "learning_rate": 7.100883360529659e-06, + "loss": 0.7534, + "step": 13217 + }, + { + "epoch": 0.7275028895371237, + "grad_norm": 0.8279024362564087, + "learning_rate": 7.100490006682691e-06, + "loss": 0.852, + "step": 13218 + }, + { + "epoch": 0.7275579283394793, + "grad_norm": 0.63093501329422, + "learning_rate": 7.100096637049002e-06, + "loss": 0.6728, + "step": 13219 + }, + { + "epoch": 0.727612967141835, + "grad_norm": 0.7576018571853638, + "learning_rate": 7.099703251631549e-06, + "loss": 0.6343, + "step": 13220 + }, + { + "epoch": 0.7276680059441907, + "grad_norm": 0.9493140578269958, + "learning_rate": 7.0993098504332894e-06, + "loss": 0.82, + "step": 13221 + }, + { + "epoch": 0.7277230447465464, + "grad_norm": 0.7279804944992065, + "learning_rate": 7.098916433457177e-06, + "loss": 0.8149, + "step": 13222 + }, + { + "epoch": 0.7277780835489019, + "grad_norm": 0.7660531401634216, + "learning_rate": 7.0985230007061725e-06, + "loss": 0.8278, + "step": 13223 + }, + { + "epoch": 0.7278331223512576, + "grad_norm": 0.6468318104743958, + "learning_rate": 7.09812955218323e-06, + "loss": 0.7193, + "step": 13224 + }, + { + "epoch": 0.7278881611536133, + "grad_norm": 0.6389151811599731, + "learning_rate": 7.097736087891306e-06, + "loss": 0.6744, + "step": 13225 + }, + { + "epoch": 0.727943199955969, + "grad_norm": 0.6565649509429932, + "learning_rate": 7.097342607833361e-06, + "loss": 0.7586, + "step": 13226 + }, + { + "epoch": 0.7279982387583246, + "grad_norm": 0.6867381930351257, + "learning_rate": 7.09694911201235e-06, + "loss": 0.684, + "step": 13227 + }, + { + "epoch": 0.7280532775606803, + "grad_norm": 0.7509286403656006, + "learning_rate": 7.096555600431229e-06, + "loss": 0.8242, + "step": 13228 + }, + { + "epoch": 0.728108316363036, + "grad_norm": 0.6997731328010559, + "learning_rate": 7.096162073092959e-06, + "loss": 0.8182, + "step": 13229 + }, + { + "epoch": 0.7281633551653915, + "grad_norm": 0.6698907017707825, + "learning_rate": 7.095768530000496e-06, + "loss": 0.7752, + "step": 13230 + }, + { + "epoch": 0.7282183939677472, + "grad_norm": 0.7219094634056091, + "learning_rate": 7.095374971156799e-06, + "loss": 0.792, + "step": 13231 + }, + { + "epoch": 0.7282734327701029, + "grad_norm": 0.6479744911193848, + "learning_rate": 7.094981396564822e-06, + "loss": 0.7556, + "step": 13232 + }, + { + "epoch": 0.7283284715724586, + "grad_norm": 0.6795497536659241, + "learning_rate": 7.094587806227527e-06, + "loss": 0.7611, + "step": 13233 + }, + { + "epoch": 0.7283835103748142, + "grad_norm": 0.7145074605941772, + "learning_rate": 7.094194200147871e-06, + "loss": 0.8064, + "step": 13234 + }, + { + "epoch": 0.7284385491771699, + "grad_norm": 0.6750605702400208, + "learning_rate": 7.093800578328811e-06, + "loss": 0.7054, + "step": 13235 + }, + { + "epoch": 0.7284935879795256, + "grad_norm": 0.7574751377105713, + "learning_rate": 7.093406940773307e-06, + "loss": 0.7878, + "step": 13236 + }, + { + "epoch": 0.7285486267818813, + "grad_norm": 0.7836418747901917, + "learning_rate": 7.093013287484316e-06, + "loss": 0.7445, + "step": 13237 + }, + { + "epoch": 0.7286036655842368, + "grad_norm": 0.7658870220184326, + "learning_rate": 7.092619618464799e-06, + "loss": 0.7513, + "step": 13238 + }, + { + "epoch": 0.7286587043865925, + "grad_norm": 1.1127573251724243, + "learning_rate": 7.092225933717711e-06, + "loss": 0.7601, + "step": 13239 + }, + { + "epoch": 0.7287137431889482, + "grad_norm": 0.7003853917121887, + "learning_rate": 7.091832233246015e-06, + "loss": 0.8533, + "step": 13240 + }, + { + "epoch": 0.7287687819913039, + "grad_norm": 0.6513979434967041, + "learning_rate": 7.091438517052667e-06, + "loss": 0.7285, + "step": 13241 + }, + { + "epoch": 0.7288238207936595, + "grad_norm": 0.7072234153747559, + "learning_rate": 7.091044785140626e-06, + "loss": 0.7741, + "step": 13242 + }, + { + "epoch": 0.7288788595960152, + "grad_norm": 0.8117190599441528, + "learning_rate": 7.090651037512854e-06, + "loss": 0.6851, + "step": 13243 + }, + { + "epoch": 0.7289338983983709, + "grad_norm": 0.6876427531242371, + "learning_rate": 7.090257274172306e-06, + "loss": 0.7162, + "step": 13244 + }, + { + "epoch": 0.7289889372007266, + "grad_norm": 0.7128324508666992, + "learning_rate": 7.0898634951219455e-06, + "loss": 0.7302, + "step": 13245 + }, + { + "epoch": 0.7290439760030821, + "grad_norm": 0.6918201446533203, + "learning_rate": 7.089469700364731e-06, + "loss": 0.8582, + "step": 13246 + }, + { + "epoch": 0.7290990148054378, + "grad_norm": 0.6172242164611816, + "learning_rate": 7.08907588990362e-06, + "loss": 0.6846, + "step": 13247 + }, + { + "epoch": 0.7291540536077935, + "grad_norm": 0.6799596548080444, + "learning_rate": 7.088682063741575e-06, + "loss": 0.7174, + "step": 13248 + }, + { + "epoch": 0.7292090924101492, + "grad_norm": 0.6663293838500977, + "learning_rate": 7.088288221881554e-06, + "loss": 0.7237, + "step": 13249 + }, + { + "epoch": 0.7292641312125048, + "grad_norm": 0.6758549213409424, + "learning_rate": 7.0878943643265175e-06, + "loss": 0.7912, + "step": 13250 + }, + { + "epoch": 0.7293191700148605, + "grad_norm": 0.6937153339385986, + "learning_rate": 7.087500491079427e-06, + "loss": 0.742, + "step": 13251 + }, + { + "epoch": 0.7293742088172162, + "grad_norm": 0.6441238522529602, + "learning_rate": 7.087106602143241e-06, + "loss": 0.7676, + "step": 13252 + }, + { + "epoch": 0.7294292476195718, + "grad_norm": 0.6615588068962097, + "learning_rate": 7.08671269752092e-06, + "loss": 0.7069, + "step": 13253 + }, + { + "epoch": 0.7294842864219274, + "grad_norm": 0.8052160739898682, + "learning_rate": 7.086318777215424e-06, + "loss": 0.811, + "step": 13254 + }, + { + "epoch": 0.7295393252242831, + "grad_norm": 0.7293280363082886, + "learning_rate": 7.085924841229716e-06, + "loss": 0.7127, + "step": 13255 + }, + { + "epoch": 0.7295943640266388, + "grad_norm": 0.7104617953300476, + "learning_rate": 7.085530889566756e-06, + "loss": 0.716, + "step": 13256 + }, + { + "epoch": 0.7296494028289945, + "grad_norm": 0.72947758436203, + "learning_rate": 7.085136922229503e-06, + "loss": 0.8144, + "step": 13257 + }, + { + "epoch": 0.7297044416313501, + "grad_norm": 0.7993913292884827, + "learning_rate": 7.08474293922092e-06, + "loss": 0.7609, + "step": 13258 + }, + { + "epoch": 0.7297594804337058, + "grad_norm": 0.7810680270195007, + "learning_rate": 7.0843489405439656e-06, + "loss": 0.8107, + "step": 13259 + }, + { + "epoch": 0.7298145192360614, + "grad_norm": 0.6383776664733887, + "learning_rate": 7.083954926201604e-06, + "loss": 0.7842, + "step": 13260 + }, + { + "epoch": 0.7298695580384171, + "grad_norm": 0.7653967142105103, + "learning_rate": 7.083560896196795e-06, + "loss": 0.729, + "step": 13261 + }, + { + "epoch": 0.7299245968407727, + "grad_norm": 0.6693821549415588, + "learning_rate": 7.083166850532498e-06, + "loss": 0.6901, + "step": 13262 + }, + { + "epoch": 0.7299796356431284, + "grad_norm": 0.7408621907234192, + "learning_rate": 7.082772789211678e-06, + "loss": 0.7415, + "step": 13263 + }, + { + "epoch": 0.7300346744454841, + "grad_norm": 0.6693123579025269, + "learning_rate": 7.082378712237295e-06, + "loss": 0.8102, + "step": 13264 + }, + { + "epoch": 0.7300897132478398, + "grad_norm": 0.6572727560997009, + "learning_rate": 7.081984619612311e-06, + "loss": 0.6595, + "step": 13265 + }, + { + "epoch": 0.7301447520501954, + "grad_norm": 0.7934693694114685, + "learning_rate": 7.081590511339687e-06, + "loss": 0.8024, + "step": 13266 + }, + { + "epoch": 0.730199790852551, + "grad_norm": 1.0663061141967773, + "learning_rate": 7.081196387422388e-06, + "loss": 0.7844, + "step": 13267 + }, + { + "epoch": 0.7302548296549067, + "grad_norm": 0.8005035519599915, + "learning_rate": 7.080802247863372e-06, + "loss": 0.751, + "step": 13268 + }, + { + "epoch": 0.7303098684572624, + "grad_norm": 0.6480177044868469, + "learning_rate": 7.0804080926656046e-06, + "loss": 0.7745, + "step": 13269 + }, + { + "epoch": 0.730364907259618, + "grad_norm": 0.7026820182800293, + "learning_rate": 7.080013921832047e-06, + "loss": 0.7545, + "step": 13270 + }, + { + "epoch": 0.7304199460619737, + "grad_norm": 0.673954427242279, + "learning_rate": 7.079619735365662e-06, + "loss": 0.7142, + "step": 13271 + }, + { + "epoch": 0.7304749848643294, + "grad_norm": 0.7296637296676636, + "learning_rate": 7.079225533269411e-06, + "loss": 0.8493, + "step": 13272 + }, + { + "epoch": 0.730530023666685, + "grad_norm": 0.7147308588027954, + "learning_rate": 7.0788313155462576e-06, + "loss": 0.7638, + "step": 13273 + }, + { + "epoch": 0.7305850624690406, + "grad_norm": 0.7531922459602356, + "learning_rate": 7.078437082199163e-06, + "loss": 0.8644, + "step": 13274 + }, + { + "epoch": 0.7306401012713963, + "grad_norm": 0.6581404805183411, + "learning_rate": 7.078042833231092e-06, + "loss": 0.7555, + "step": 13275 + }, + { + "epoch": 0.730695140073752, + "grad_norm": 0.6781187057495117, + "learning_rate": 7.0776485686450095e-06, + "loss": 0.7536, + "step": 13276 + }, + { + "epoch": 0.7307501788761076, + "grad_norm": 0.7164949774742126, + "learning_rate": 7.077254288443874e-06, + "loss": 0.7275, + "step": 13277 + }, + { + "epoch": 0.7308052176784633, + "grad_norm": 0.8158305287361145, + "learning_rate": 7.076859992630652e-06, + "loss": 0.6821, + "step": 13278 + }, + { + "epoch": 0.730860256480819, + "grad_norm": 0.7101448178291321, + "learning_rate": 7.076465681208307e-06, + "loss": 0.69, + "step": 13279 + }, + { + "epoch": 0.7309152952831747, + "grad_norm": 0.6844518780708313, + "learning_rate": 7.076071354179802e-06, + "loss": 0.7577, + "step": 13280 + }, + { + "epoch": 0.7309703340855302, + "grad_norm": 0.6564158797264099, + "learning_rate": 7.0756770115481e-06, + "loss": 0.6752, + "step": 13281 + }, + { + "epoch": 0.7310253728878859, + "grad_norm": 0.7444283962249756, + "learning_rate": 7.0752826533161655e-06, + "loss": 0.8118, + "step": 13282 + }, + { + "epoch": 0.7310804116902416, + "grad_norm": 0.7657533884048462, + "learning_rate": 7.074888279486962e-06, + "loss": 0.8819, + "step": 13283 + }, + { + "epoch": 0.7311354504925973, + "grad_norm": 0.6924453973770142, + "learning_rate": 7.074493890063453e-06, + "loss": 0.7674, + "step": 13284 + }, + { + "epoch": 0.7311904892949529, + "grad_norm": 0.676188588142395, + "learning_rate": 7.074099485048603e-06, + "loss": 0.7266, + "step": 13285 + }, + { + "epoch": 0.7312455280973086, + "grad_norm": 0.6325914263725281, + "learning_rate": 7.073705064445378e-06, + "loss": 0.6856, + "step": 13286 + }, + { + "epoch": 0.7313005668996643, + "grad_norm": 0.662558913230896, + "learning_rate": 7.073310628256739e-06, + "loss": 0.751, + "step": 13287 + }, + { + "epoch": 0.73135560570202, + "grad_norm": 0.8313137292861938, + "learning_rate": 7.072916176485654e-06, + "loss": 0.7187, + "step": 13288 + }, + { + "epoch": 0.7314106445043755, + "grad_norm": 0.7033550143241882, + "learning_rate": 7.072521709135084e-06, + "loss": 0.8132, + "step": 13289 + }, + { + "epoch": 0.7314656833067312, + "grad_norm": 0.715242862701416, + "learning_rate": 7.0721272262079965e-06, + "loss": 0.8551, + "step": 13290 + }, + { + "epoch": 0.7315207221090869, + "grad_norm": 0.7545164227485657, + "learning_rate": 7.071732727707356e-06, + "loss": 0.7772, + "step": 13291 + }, + { + "epoch": 0.7315757609114426, + "grad_norm": 0.7181825637817383, + "learning_rate": 7.071338213636126e-06, + "loss": 0.7378, + "step": 13292 + }, + { + "epoch": 0.7316307997137982, + "grad_norm": 0.7793779969215393, + "learning_rate": 7.070943683997273e-06, + "loss": 0.7801, + "step": 13293 + }, + { + "epoch": 0.7316858385161539, + "grad_norm": 0.7456476092338562, + "learning_rate": 7.070549138793762e-06, + "loss": 0.8038, + "step": 13294 + }, + { + "epoch": 0.7317408773185096, + "grad_norm": 0.652519702911377, + "learning_rate": 7.0701545780285576e-06, + "loss": 0.746, + "step": 13295 + }, + { + "epoch": 0.7317959161208653, + "grad_norm": 0.784450888633728, + "learning_rate": 7.069760001704625e-06, + "loss": 0.8065, + "step": 13296 + }, + { + "epoch": 0.7318509549232208, + "grad_norm": 0.8052587509155273, + "learning_rate": 7.069365409824931e-06, + "loss": 0.8098, + "step": 13297 + }, + { + "epoch": 0.7319059937255765, + "grad_norm": 0.6890794038772583, + "learning_rate": 7.06897080239244e-06, + "loss": 0.783, + "step": 13298 + }, + { + "epoch": 0.7319610325279322, + "grad_norm": 0.7470653057098389, + "learning_rate": 7.068576179410119e-06, + "loss": 0.7658, + "step": 13299 + }, + { + "epoch": 0.7320160713302879, + "grad_norm": 0.6831437945365906, + "learning_rate": 7.068181540880932e-06, + "loss": 0.7864, + "step": 13300 + }, + { + "epoch": 0.7320711101326435, + "grad_norm": 0.7058265209197998, + "learning_rate": 7.067786886807847e-06, + "loss": 0.8254, + "step": 13301 + }, + { + "epoch": 0.7321261489349992, + "grad_norm": 0.7938248515129089, + "learning_rate": 7.067392217193828e-06, + "loss": 0.7291, + "step": 13302 + }, + { + "epoch": 0.7321811877373549, + "grad_norm": 0.7261865735054016, + "learning_rate": 7.066997532041844e-06, + "loss": 0.8115, + "step": 13303 + }, + { + "epoch": 0.7322362265397105, + "grad_norm": 0.6971743702888489, + "learning_rate": 7.0666028313548586e-06, + "loss": 0.7504, + "step": 13304 + }, + { + "epoch": 0.7322912653420661, + "grad_norm": 0.844879150390625, + "learning_rate": 7.0662081151358405e-06, + "loss": 0.7903, + "step": 13305 + }, + { + "epoch": 0.7323463041444218, + "grad_norm": 0.6670572757720947, + "learning_rate": 7.065813383387755e-06, + "loss": 0.7597, + "step": 13306 + }, + { + "epoch": 0.7324013429467775, + "grad_norm": 0.669711172580719, + "learning_rate": 7.06541863611357e-06, + "loss": 0.7179, + "step": 13307 + }, + { + "epoch": 0.7324563817491332, + "grad_norm": 0.7176600098609924, + "learning_rate": 7.0650238733162506e-06, + "loss": 0.8157, + "step": 13308 + }, + { + "epoch": 0.7325114205514888, + "grad_norm": 0.7230100631713867, + "learning_rate": 7.064629094998765e-06, + "loss": 0.7902, + "step": 13309 + }, + { + "epoch": 0.7325664593538445, + "grad_norm": 0.8811234831809998, + "learning_rate": 7.064234301164078e-06, + "loss": 0.7746, + "step": 13310 + }, + { + "epoch": 0.7326214981562001, + "grad_norm": 0.6777653098106384, + "learning_rate": 7.06383949181516e-06, + "loss": 0.7708, + "step": 13311 + }, + { + "epoch": 0.7326765369585558, + "grad_norm": 0.6692547798156738, + "learning_rate": 7.063444666954977e-06, + "loss": 0.7103, + "step": 13312 + }, + { + "epoch": 0.7327315757609114, + "grad_norm": 1.2304950952529907, + "learning_rate": 7.063049826586496e-06, + "loss": 0.7878, + "step": 13313 + }, + { + "epoch": 0.7327866145632671, + "grad_norm": 0.7073930501937866, + "learning_rate": 7.0626549707126834e-06, + "loss": 0.7546, + "step": 13314 + }, + { + "epoch": 0.7328416533656228, + "grad_norm": 0.7184866070747375, + "learning_rate": 7.06226009933651e-06, + "loss": 0.7207, + "step": 13315 + }, + { + "epoch": 0.7328966921679784, + "grad_norm": 0.7098046541213989, + "learning_rate": 7.061865212460941e-06, + "loss": 0.6415, + "step": 13316 + }, + { + "epoch": 0.7329517309703341, + "grad_norm": 0.714379608631134, + "learning_rate": 7.0614703100889445e-06, + "loss": 0.7305, + "step": 13317 + }, + { + "epoch": 0.7330067697726897, + "grad_norm": 0.655060887336731, + "learning_rate": 7.061075392223491e-06, + "loss": 0.6125, + "step": 13318 + }, + { + "epoch": 0.7330618085750454, + "grad_norm": 0.6481055617332458, + "learning_rate": 7.060680458867545e-06, + "loss": 0.7059, + "step": 13319 + }, + { + "epoch": 0.733116847377401, + "grad_norm": 0.7123916745185852, + "learning_rate": 7.060285510024076e-06, + "loss": 0.8007, + "step": 13320 + }, + { + "epoch": 0.7331718861797567, + "grad_norm": 0.7231262922286987, + "learning_rate": 7.059890545696053e-06, + "loss": 0.7781, + "step": 13321 + }, + { + "epoch": 0.7332269249821124, + "grad_norm": 0.8415369391441345, + "learning_rate": 7.0594955658864435e-06, + "loss": 0.6649, + "step": 13322 + }, + { + "epoch": 0.7332819637844681, + "grad_norm": 0.7243070006370544, + "learning_rate": 7.059100570598217e-06, + "loss": 0.6588, + "step": 13323 + }, + { + "epoch": 0.7333370025868237, + "grad_norm": 0.6581026315689087, + "learning_rate": 7.058705559834342e-06, + "loss": 0.7938, + "step": 13324 + }, + { + "epoch": 0.7333920413891793, + "grad_norm": 0.6213739514350891, + "learning_rate": 7.058310533597787e-06, + "loss": 0.7092, + "step": 13325 + }, + { + "epoch": 0.733447080191535, + "grad_norm": 0.6857954859733582, + "learning_rate": 7.057915491891522e-06, + "loss": 0.698, + "step": 13326 + }, + { + "epoch": 0.7335021189938907, + "grad_norm": 0.7528544068336487, + "learning_rate": 7.0575204347185135e-06, + "loss": 0.7234, + "step": 13327 + }, + { + "epoch": 0.7335571577962463, + "grad_norm": 0.6449099779129028, + "learning_rate": 7.057125362081733e-06, + "loss": 0.7391, + "step": 13328 + }, + { + "epoch": 0.733612196598602, + "grad_norm": 0.640689492225647, + "learning_rate": 7.0567302739841495e-06, + "loss": 0.5316, + "step": 13329 + }, + { + "epoch": 0.7336672354009577, + "grad_norm": 0.6686868071556091, + "learning_rate": 7.056335170428731e-06, + "loss": 0.7713, + "step": 13330 + }, + { + "epoch": 0.7337222742033134, + "grad_norm": 0.7627772688865662, + "learning_rate": 7.055940051418447e-06, + "loss": 0.7706, + "step": 13331 + }, + { + "epoch": 0.733777313005669, + "grad_norm": 0.7421852350234985, + "learning_rate": 7.055544916956269e-06, + "loss": 0.6418, + "step": 13332 + }, + { + "epoch": 0.7338323518080246, + "grad_norm": 0.7414699196815491, + "learning_rate": 7.0551497670451666e-06, + "loss": 0.811, + "step": 13333 + }, + { + "epoch": 0.7338873906103803, + "grad_norm": 0.7054136991500854, + "learning_rate": 7.0547546016881064e-06, + "loss": 0.8005, + "step": 13334 + }, + { + "epoch": 0.733942429412736, + "grad_norm": 0.670174241065979, + "learning_rate": 7.054359420888062e-06, + "loss": 0.6136, + "step": 13335 + }, + { + "epoch": 0.7339974682150916, + "grad_norm": 0.728255033493042, + "learning_rate": 7.053964224648001e-06, + "loss": 0.848, + "step": 13336 + }, + { + "epoch": 0.7340525070174473, + "grad_norm": 0.729815661907196, + "learning_rate": 7.053569012970896e-06, + "loss": 0.6985, + "step": 13337 + }, + { + "epoch": 0.734107545819803, + "grad_norm": 0.7564244866371155, + "learning_rate": 7.053173785859715e-06, + "loss": 0.7995, + "step": 13338 + }, + { + "epoch": 0.7341625846221587, + "grad_norm": 0.7746061682701111, + "learning_rate": 7.05277854331743e-06, + "loss": 0.7663, + "step": 13339 + }, + { + "epoch": 0.7342176234245142, + "grad_norm": 0.6878651976585388, + "learning_rate": 7.052383285347011e-06, + "loss": 0.8624, + "step": 13340 + }, + { + "epoch": 0.7342726622268699, + "grad_norm": 0.6989734768867493, + "learning_rate": 7.051988011951428e-06, + "loss": 0.7221, + "step": 13341 + }, + { + "epoch": 0.7343277010292256, + "grad_norm": 0.6854223012924194, + "learning_rate": 7.051592723133654e-06, + "loss": 0.7878, + "step": 13342 + }, + { + "epoch": 0.7343827398315813, + "grad_norm": 0.746696949005127, + "learning_rate": 7.051197418896657e-06, + "loss": 0.7074, + "step": 13343 + }, + { + "epoch": 0.7344377786339369, + "grad_norm": 0.6933150887489319, + "learning_rate": 7.050802099243409e-06, + "loss": 0.7587, + "step": 13344 + }, + { + "epoch": 0.7344928174362926, + "grad_norm": 0.7285788655281067, + "learning_rate": 7.050406764176882e-06, + "loss": 0.6589, + "step": 13345 + }, + { + "epoch": 0.7345478562386483, + "grad_norm": 0.6834994554519653, + "learning_rate": 7.050011413700046e-06, + "loss": 0.7196, + "step": 13346 + }, + { + "epoch": 0.734602895041004, + "grad_norm": 0.6504353880882263, + "learning_rate": 7.049616047815873e-06, + "loss": 0.7675, + "step": 13347 + }, + { + "epoch": 0.7346579338433595, + "grad_norm": 0.7009296417236328, + "learning_rate": 7.049220666527335e-06, + "loss": 0.7638, + "step": 13348 + }, + { + "epoch": 0.7347129726457152, + "grad_norm": 0.6210034489631653, + "learning_rate": 7.0488252698374024e-06, + "loss": 0.6872, + "step": 13349 + }, + { + "epoch": 0.7347680114480709, + "grad_norm": 0.6280165910720825, + "learning_rate": 7.0484298577490485e-06, + "loss": 0.7084, + "step": 13350 + }, + { + "epoch": 0.7348230502504266, + "grad_norm": 0.8055418133735657, + "learning_rate": 7.048034430265242e-06, + "loss": 0.8202, + "step": 13351 + }, + { + "epoch": 0.7348780890527822, + "grad_norm": 0.6674166917800903, + "learning_rate": 7.047638987388959e-06, + "loss": 0.6368, + "step": 13352 + }, + { + "epoch": 0.7349331278551379, + "grad_norm": 0.9182783961296082, + "learning_rate": 7.04724352912317e-06, + "loss": 0.6734, + "step": 13353 + }, + { + "epoch": 0.7349881666574936, + "grad_norm": 0.6371243596076965, + "learning_rate": 7.046848055470845e-06, + "loss": 0.7308, + "step": 13354 + }, + { + "epoch": 0.7350432054598492, + "grad_norm": 0.6454519033432007, + "learning_rate": 7.046452566434959e-06, + "loss": 0.6882, + "step": 13355 + }, + { + "epoch": 0.7350982442622048, + "grad_norm": 0.648970365524292, + "learning_rate": 7.046057062018483e-06, + "loss": 0.7247, + "step": 13356 + }, + { + "epoch": 0.7351532830645605, + "grad_norm": 0.668886661529541, + "learning_rate": 7.04566154222439e-06, + "loss": 0.7379, + "step": 13357 + }, + { + "epoch": 0.7352083218669162, + "grad_norm": 0.6593654751777649, + "learning_rate": 7.045266007055651e-06, + "loss": 0.7473, + "step": 13358 + }, + { + "epoch": 0.7352633606692718, + "grad_norm": 0.8418927192687988, + "learning_rate": 7.044870456515241e-06, + "loss": 0.7949, + "step": 13359 + }, + { + "epoch": 0.7353183994716275, + "grad_norm": 0.7350470423698425, + "learning_rate": 7.044474890606132e-06, + "loss": 0.7545, + "step": 13360 + }, + { + "epoch": 0.7353734382739832, + "grad_norm": 0.7786250114440918, + "learning_rate": 7.044079309331298e-06, + "loss": 0.8587, + "step": 13361 + }, + { + "epoch": 0.7354284770763388, + "grad_norm": 0.6345693469047546, + "learning_rate": 7.04368371269371e-06, + "loss": 0.77, + "step": 13362 + }, + { + "epoch": 0.7354835158786944, + "grad_norm": 0.7030417919158936, + "learning_rate": 7.043288100696343e-06, + "loss": 0.7624, + "step": 13363 + }, + { + "epoch": 0.7355385546810501, + "grad_norm": 0.7526041865348816, + "learning_rate": 7.042892473342169e-06, + "loss": 0.8018, + "step": 13364 + }, + { + "epoch": 0.7355935934834058, + "grad_norm": 0.6419941782951355, + "learning_rate": 7.042496830634162e-06, + "loss": 0.6788, + "step": 13365 + }, + { + "epoch": 0.7356486322857615, + "grad_norm": 0.6952203512191772, + "learning_rate": 7.042101172575297e-06, + "loss": 0.7747, + "step": 13366 + }, + { + "epoch": 0.7357036710881171, + "grad_norm": 0.8046327829360962, + "learning_rate": 7.041705499168544e-06, + "loss": 0.8216, + "step": 13367 + }, + { + "epoch": 0.7357587098904728, + "grad_norm": 0.6641537547111511, + "learning_rate": 7.041309810416881e-06, + "loss": 0.7313, + "step": 13368 + }, + { + "epoch": 0.7358137486928285, + "grad_norm": 0.6824444532394409, + "learning_rate": 7.040914106323278e-06, + "loss": 0.7179, + "step": 13369 + }, + { + "epoch": 0.7358687874951841, + "grad_norm": 0.6469557285308838, + "learning_rate": 7.040518386890711e-06, + "loss": 0.7671, + "step": 13370 + }, + { + "epoch": 0.7359238262975397, + "grad_norm": 0.6826488971710205, + "learning_rate": 7.040122652122156e-06, + "loss": 0.7, + "step": 13371 + }, + { + "epoch": 0.7359788650998954, + "grad_norm": 0.6931618452072144, + "learning_rate": 7.039726902020583e-06, + "loss": 0.7641, + "step": 13372 + }, + { + "epoch": 0.7360339039022511, + "grad_norm": 0.7445465922355652, + "learning_rate": 7.039331136588971e-06, + "loss": 0.7458, + "step": 13373 + }, + { + "epoch": 0.7360889427046068, + "grad_norm": 0.6358756422996521, + "learning_rate": 7.038935355830289e-06, + "loss": 0.6125, + "step": 13374 + }, + { + "epoch": 0.7361439815069624, + "grad_norm": 0.6966063380241394, + "learning_rate": 7.038539559747517e-06, + "loss": 0.6812, + "step": 13375 + }, + { + "epoch": 0.736199020309318, + "grad_norm": 0.9898090362548828, + "learning_rate": 7.038143748343626e-06, + "loss": 0.707, + "step": 13376 + }, + { + "epoch": 0.7362540591116737, + "grad_norm": 0.685951828956604, + "learning_rate": 7.0377479216215935e-06, + "loss": 0.7932, + "step": 13377 + }, + { + "epoch": 0.7363090979140294, + "grad_norm": 0.7056856751441956, + "learning_rate": 7.037352079584392e-06, + "loss": 0.7432, + "step": 13378 + }, + { + "epoch": 0.736364136716385, + "grad_norm": 0.7802489995956421, + "learning_rate": 7.036956222234999e-06, + "loss": 0.8275, + "step": 13379 + }, + { + "epoch": 0.7364191755187407, + "grad_norm": 0.7990192770957947, + "learning_rate": 7.036560349576387e-06, + "loss": 0.893, + "step": 13380 + }, + { + "epoch": 0.7364742143210964, + "grad_norm": 0.6454586386680603, + "learning_rate": 7.0361644616115334e-06, + "loss": 0.751, + "step": 13381 + }, + { + "epoch": 0.7365292531234521, + "grad_norm": 0.7071009278297424, + "learning_rate": 7.035768558343412e-06, + "loss": 0.7771, + "step": 13382 + }, + { + "epoch": 0.7365842919258077, + "grad_norm": 0.6530466079711914, + "learning_rate": 7.035372639774999e-06, + "loss": 0.7529, + "step": 13383 + }, + { + "epoch": 0.7366393307281633, + "grad_norm": 0.728689968585968, + "learning_rate": 7.03497670590927e-06, + "loss": 0.7862, + "step": 13384 + }, + { + "epoch": 0.736694369530519, + "grad_norm": 0.6640015244483948, + "learning_rate": 7.034580756749202e-06, + "loss": 0.6876, + "step": 13385 + }, + { + "epoch": 0.7367494083328747, + "grad_norm": 0.7388426661491394, + "learning_rate": 7.034184792297769e-06, + "loss": 0.8168, + "step": 13386 + }, + { + "epoch": 0.7368044471352303, + "grad_norm": 0.6543731093406677, + "learning_rate": 7.0337888125579465e-06, + "loss": 0.7555, + "step": 13387 + }, + { + "epoch": 0.736859485937586, + "grad_norm": 0.7783555388450623, + "learning_rate": 7.0333928175327125e-06, + "loss": 0.755, + "step": 13388 + }, + { + "epoch": 0.7369145247399417, + "grad_norm": 0.6275887489318848, + "learning_rate": 7.032996807225043e-06, + "loss": 0.7187, + "step": 13389 + }, + { + "epoch": 0.7369695635422974, + "grad_norm": 0.7007517218589783, + "learning_rate": 7.032600781637913e-06, + "loss": 0.6993, + "step": 13390 + }, + { + "epoch": 0.737024602344653, + "grad_norm": 0.6322247385978699, + "learning_rate": 7.0322047407743e-06, + "loss": 0.7178, + "step": 13391 + }, + { + "epoch": 0.7370796411470086, + "grad_norm": 0.7160976529121399, + "learning_rate": 7.0318086846371804e-06, + "loss": 0.6884, + "step": 13392 + }, + { + "epoch": 0.7371346799493643, + "grad_norm": 0.6056101322174072, + "learning_rate": 7.03141261322953e-06, + "loss": 0.6672, + "step": 13393 + }, + { + "epoch": 0.73718971875172, + "grad_norm": 0.8779410123825073, + "learning_rate": 7.0310165265543264e-06, + "loss": 0.7564, + "step": 13394 + }, + { + "epoch": 0.7372447575540756, + "grad_norm": 0.6868176460266113, + "learning_rate": 7.030620424614546e-06, + "loss": 0.7658, + "step": 13395 + }, + { + "epoch": 0.7372997963564313, + "grad_norm": 0.7611618041992188, + "learning_rate": 7.030224307413166e-06, + "loss": 0.6445, + "step": 13396 + }, + { + "epoch": 0.737354835158787, + "grad_norm": 0.7688242793083191, + "learning_rate": 7.0298281749531636e-06, + "loss": 0.8061, + "step": 13397 + }, + { + "epoch": 0.7374098739611427, + "grad_norm": 0.6781700849533081, + "learning_rate": 7.029432027237518e-06, + "loss": 0.6374, + "step": 13398 + }, + { + "epoch": 0.7374649127634982, + "grad_norm": 0.6719028353691101, + "learning_rate": 7.0290358642692e-06, + "loss": 0.7585, + "step": 13399 + }, + { + "epoch": 0.7375199515658539, + "grad_norm": 0.704429030418396, + "learning_rate": 7.028639686051195e-06, + "loss": 0.7052, + "step": 13400 + }, + { + "epoch": 0.7375749903682096, + "grad_norm": 0.714914083480835, + "learning_rate": 7.028243492586478e-06, + "loss": 0.7785, + "step": 13401 + }, + { + "epoch": 0.7376300291705652, + "grad_norm": 0.7732700705528259, + "learning_rate": 7.027847283878023e-06, + "loss": 0.7812, + "step": 13402 + }, + { + "epoch": 0.7376850679729209, + "grad_norm": 0.6849464178085327, + "learning_rate": 7.027451059928813e-06, + "loss": 0.7657, + "step": 13403 + }, + { + "epoch": 0.7377401067752766, + "grad_norm": 0.6924402117729187, + "learning_rate": 7.027054820741822e-06, + "loss": 0.677, + "step": 13404 + }, + { + "epoch": 0.7377951455776323, + "grad_norm": 0.7142716646194458, + "learning_rate": 7.02665856632003e-06, + "loss": 0.7071, + "step": 13405 + }, + { + "epoch": 0.7378501843799878, + "grad_norm": 0.7227265238761902, + "learning_rate": 7.0262622966664154e-06, + "loss": 0.6986, + "step": 13406 + }, + { + "epoch": 0.7379052231823435, + "grad_norm": 0.6387726664543152, + "learning_rate": 7.025866011783954e-06, + "loss": 0.6563, + "step": 13407 + }, + { + "epoch": 0.7379602619846992, + "grad_norm": 0.6411992311477661, + "learning_rate": 7.025469711675628e-06, + "loss": 0.5842, + "step": 13408 + }, + { + "epoch": 0.7380153007870549, + "grad_norm": 0.6811027526855469, + "learning_rate": 7.025073396344413e-06, + "loss": 0.6746, + "step": 13409 + }, + { + "epoch": 0.7380703395894105, + "grad_norm": 1.0705479383468628, + "learning_rate": 7.024677065793289e-06, + "loss": 0.7457, + "step": 13410 + }, + { + "epoch": 0.7381253783917662, + "grad_norm": 0.6920849084854126, + "learning_rate": 7.024280720025232e-06, + "loss": 0.6838, + "step": 13411 + }, + { + "epoch": 0.7381804171941219, + "grad_norm": 0.8089182376861572, + "learning_rate": 7.0238843590432236e-06, + "loss": 0.6682, + "step": 13412 + }, + { + "epoch": 0.7382354559964776, + "grad_norm": 0.6140334010124207, + "learning_rate": 7.023487982850244e-06, + "loss": 0.6992, + "step": 13413 + }, + { + "epoch": 0.7382904947988331, + "grad_norm": 0.8564643263816833, + "learning_rate": 7.023091591449269e-06, + "loss": 0.8512, + "step": 13414 + }, + { + "epoch": 0.7383455336011888, + "grad_norm": 0.655516505241394, + "learning_rate": 7.02269518484328e-06, + "loss": 0.7291, + "step": 13415 + }, + { + "epoch": 0.7384005724035445, + "grad_norm": 0.6373177766799927, + "learning_rate": 7.022298763035255e-06, + "loss": 0.7553, + "step": 13416 + }, + { + "epoch": 0.7384556112059002, + "grad_norm": 0.7023805379867554, + "learning_rate": 7.021902326028174e-06, + "loss": 0.7562, + "step": 13417 + }, + { + "epoch": 0.7385106500082558, + "grad_norm": 0.654181182384491, + "learning_rate": 7.021505873825016e-06, + "loss": 0.7153, + "step": 13418 + }, + { + "epoch": 0.7385656888106115, + "grad_norm": 0.6633459329605103, + "learning_rate": 7.02110940642876e-06, + "loss": 0.6779, + "step": 13419 + }, + { + "epoch": 0.7386207276129672, + "grad_norm": 0.7050659656524658, + "learning_rate": 7.020712923842388e-06, + "loss": 0.741, + "step": 13420 + }, + { + "epoch": 0.7386757664153228, + "grad_norm": 0.7241182327270508, + "learning_rate": 7.020316426068879e-06, + "loss": 0.7479, + "step": 13421 + }, + { + "epoch": 0.7387308052176784, + "grad_norm": 1.0262155532836914, + "learning_rate": 7.019919913111212e-06, + "loss": 0.8418, + "step": 13422 + }, + { + "epoch": 0.7387858440200341, + "grad_norm": 0.6765457391738892, + "learning_rate": 7.019523384972366e-06, + "loss": 0.727, + "step": 13423 + }, + { + "epoch": 0.7388408828223898, + "grad_norm": 0.6871724724769592, + "learning_rate": 7.0191268416553245e-06, + "loss": 0.8273, + "step": 13424 + }, + { + "epoch": 0.7388959216247455, + "grad_norm": 0.8085252046585083, + "learning_rate": 7.018730283163067e-06, + "loss": 0.7306, + "step": 13425 + }, + { + "epoch": 0.7389509604271011, + "grad_norm": 0.6822873950004578, + "learning_rate": 7.018333709498572e-06, + "loss": 0.7454, + "step": 13426 + }, + { + "epoch": 0.7390059992294568, + "grad_norm": 0.7210521697998047, + "learning_rate": 7.01793712066482e-06, + "loss": 0.8306, + "step": 13427 + }, + { + "epoch": 0.7390610380318124, + "grad_norm": 0.6404997110366821, + "learning_rate": 7.017540516664795e-06, + "loss": 0.7151, + "step": 13428 + }, + { + "epoch": 0.7391160768341681, + "grad_norm": 0.6662821769714355, + "learning_rate": 7.017143897501475e-06, + "loss": 0.7446, + "step": 13429 + }, + { + "epoch": 0.7391711156365237, + "grad_norm": 0.8048129081726074, + "learning_rate": 7.0167472631778415e-06, + "loss": 0.7953, + "step": 13430 + }, + { + "epoch": 0.7392261544388794, + "grad_norm": 0.7215000987052917, + "learning_rate": 7.016350613696873e-06, + "loss": 0.8373, + "step": 13431 + }, + { + "epoch": 0.7392811932412351, + "grad_norm": 0.7309150099754333, + "learning_rate": 7.015953949061555e-06, + "loss": 0.7654, + "step": 13432 + }, + { + "epoch": 0.7393362320435908, + "grad_norm": 0.6487464904785156, + "learning_rate": 7.0155572692748665e-06, + "loss": 0.6473, + "step": 13433 + }, + { + "epoch": 0.7393912708459464, + "grad_norm": 0.6172077059745789, + "learning_rate": 7.01516057433979e-06, + "loss": 0.6672, + "step": 13434 + }, + { + "epoch": 0.739446309648302, + "grad_norm": 0.7569651007652283, + "learning_rate": 7.014763864259304e-06, + "loss": 0.8501, + "step": 13435 + }, + { + "epoch": 0.7395013484506577, + "grad_norm": 0.824669599533081, + "learning_rate": 7.014367139036393e-06, + "loss": 0.8596, + "step": 13436 + }, + { + "epoch": 0.7395563872530134, + "grad_norm": 0.6904401183128357, + "learning_rate": 7.013970398674038e-06, + "loss": 0.7403, + "step": 13437 + }, + { + "epoch": 0.739611426055369, + "grad_norm": 0.7999581098556519, + "learning_rate": 7.013573643175221e-06, + "loss": 0.8879, + "step": 13438 + }, + { + "epoch": 0.7396664648577247, + "grad_norm": 0.6600533723831177, + "learning_rate": 7.0131768725429236e-06, + "loss": 0.7324, + "step": 13439 + }, + { + "epoch": 0.7397215036600804, + "grad_norm": 0.7174191474914551, + "learning_rate": 7.0127800867801275e-06, + "loss": 0.7474, + "step": 13440 + }, + { + "epoch": 0.7397765424624361, + "grad_norm": 0.7023884654045105, + "learning_rate": 7.012383285889814e-06, + "loss": 0.7826, + "step": 13441 + }, + { + "epoch": 0.7398315812647916, + "grad_norm": 0.6486913561820984, + "learning_rate": 7.011986469874969e-06, + "loss": 0.6553, + "step": 13442 + }, + { + "epoch": 0.7398866200671473, + "grad_norm": 0.7238486409187317, + "learning_rate": 7.011589638738569e-06, + "loss": 0.6759, + "step": 13443 + }, + { + "epoch": 0.739941658869503, + "grad_norm": 0.7879656553268433, + "learning_rate": 7.011192792483601e-06, + "loss": 0.886, + "step": 13444 + }, + { + "epoch": 0.7399966976718586, + "grad_norm": 0.6592407822608948, + "learning_rate": 7.010795931113047e-06, + "loss": 0.7746, + "step": 13445 + }, + { + "epoch": 0.7400517364742143, + "grad_norm": 0.8274507522583008, + "learning_rate": 7.010399054629889e-06, + "loss": 0.7615, + "step": 13446 + }, + { + "epoch": 0.74010677527657, + "grad_norm": 0.6233614087104797, + "learning_rate": 7.010002163037109e-06, + "loss": 0.695, + "step": 13447 + }, + { + "epoch": 0.7401618140789257, + "grad_norm": 0.7082701921463013, + "learning_rate": 7.00960525633769e-06, + "loss": 0.6677, + "step": 13448 + }, + { + "epoch": 0.7402168528812813, + "grad_norm": 1.0694652795791626, + "learning_rate": 7.009208334534618e-06, + "loss": 0.7792, + "step": 13449 + }, + { + "epoch": 0.7402718916836369, + "grad_norm": 0.7189109325408936, + "learning_rate": 7.008811397630874e-06, + "loss": 0.8606, + "step": 13450 + }, + { + "epoch": 0.7403269304859926, + "grad_norm": 0.7136901617050171, + "learning_rate": 7.00841444562944e-06, + "loss": 0.7142, + "step": 13451 + }, + { + "epoch": 0.7403819692883483, + "grad_norm": 0.6508508920669556, + "learning_rate": 7.008017478533301e-06, + "loss": 0.6748, + "step": 13452 + }, + { + "epoch": 0.7404370080907039, + "grad_norm": 0.6560903191566467, + "learning_rate": 7.007620496345441e-06, + "loss": 0.7929, + "step": 13453 + }, + { + "epoch": 0.7404920468930596, + "grad_norm": 0.6909067034721375, + "learning_rate": 7.007223499068841e-06, + "loss": 0.6118, + "step": 13454 + }, + { + "epoch": 0.7405470856954153, + "grad_norm": 0.6554582715034485, + "learning_rate": 7.0068264867064874e-06, + "loss": 0.7687, + "step": 13455 + }, + { + "epoch": 0.740602124497771, + "grad_norm": 0.7788346409797668, + "learning_rate": 7.006429459261363e-06, + "loss": 0.7535, + "step": 13456 + }, + { + "epoch": 0.7406571633001265, + "grad_norm": 0.7702943682670593, + "learning_rate": 7.006032416736452e-06, + "loss": 0.833, + "step": 13457 + }, + { + "epoch": 0.7407122021024822, + "grad_norm": 0.6860190033912659, + "learning_rate": 7.005635359134738e-06, + "loss": 0.6643, + "step": 13458 + }, + { + "epoch": 0.7407672409048379, + "grad_norm": 0.7470136880874634, + "learning_rate": 7.005238286459205e-06, + "loss": 0.7811, + "step": 13459 + }, + { + "epoch": 0.7408222797071936, + "grad_norm": 0.6769132614135742, + "learning_rate": 7.004841198712839e-06, + "loss": 0.7322, + "step": 13460 + }, + { + "epoch": 0.7408773185095492, + "grad_norm": 0.7865259647369385, + "learning_rate": 7.004444095898623e-06, + "loss": 0.817, + "step": 13461 + }, + { + "epoch": 0.7409323573119049, + "grad_norm": 0.7352784276008606, + "learning_rate": 7.004046978019542e-06, + "loss": 0.7373, + "step": 13462 + }, + { + "epoch": 0.7409873961142606, + "grad_norm": 0.7647448182106018, + "learning_rate": 7.00364984507858e-06, + "loss": 0.7129, + "step": 13463 + }, + { + "epoch": 0.7410424349166163, + "grad_norm": 0.6979989409446716, + "learning_rate": 7.003252697078722e-06, + "loss": 0.7833, + "step": 13464 + }, + { + "epoch": 0.7410974737189718, + "grad_norm": 0.6117465496063232, + "learning_rate": 7.002855534022953e-06, + "loss": 0.6732, + "step": 13465 + }, + { + "epoch": 0.7411525125213275, + "grad_norm": 0.6754159331321716, + "learning_rate": 7.002458355914258e-06, + "loss": 0.6939, + "step": 13466 + }, + { + "epoch": 0.7412075513236832, + "grad_norm": 0.6713566184043884, + "learning_rate": 7.002061162755621e-06, + "loss": 0.7459, + "step": 13467 + }, + { + "epoch": 0.7412625901260389, + "grad_norm": 0.6475394368171692, + "learning_rate": 7.001663954550029e-06, + "loss": 0.7912, + "step": 13468 + }, + { + "epoch": 0.7413176289283945, + "grad_norm": 0.6577908992767334, + "learning_rate": 7.001266731300467e-06, + "loss": 0.6903, + "step": 13469 + }, + { + "epoch": 0.7413726677307502, + "grad_norm": 0.8129748106002808, + "learning_rate": 7.00086949300992e-06, + "loss": 0.8277, + "step": 13470 + }, + { + "epoch": 0.7414277065331059, + "grad_norm": 0.6730444431304932, + "learning_rate": 7.000472239681372e-06, + "loss": 0.7357, + "step": 13471 + }, + { + "epoch": 0.7414827453354615, + "grad_norm": 0.7166460156440735, + "learning_rate": 7.000074971317812e-06, + "loss": 0.7544, + "step": 13472 + }, + { + "epoch": 0.7415377841378171, + "grad_norm": 0.6668731570243835, + "learning_rate": 6.9996776879222225e-06, + "loss": 0.7073, + "step": 13473 + }, + { + "epoch": 0.7415928229401728, + "grad_norm": 0.7031315565109253, + "learning_rate": 6.999280389497591e-06, + "loss": 0.7262, + "step": 13474 + }, + { + "epoch": 0.7416478617425285, + "grad_norm": 0.7426775693893433, + "learning_rate": 6.998883076046904e-06, + "loss": 0.7394, + "step": 13475 + }, + { + "epoch": 0.7417029005448842, + "grad_norm": 0.665226399898529, + "learning_rate": 6.9984857475731475e-06, + "loss": 0.7365, + "step": 13476 + }, + { + "epoch": 0.7417579393472398, + "grad_norm": 0.7762128114700317, + "learning_rate": 6.998088404079306e-06, + "loss": 0.8551, + "step": 13477 + }, + { + "epoch": 0.7418129781495955, + "grad_norm": 0.7129524350166321, + "learning_rate": 6.997691045568366e-06, + "loss": 0.7646, + "step": 13478 + }, + { + "epoch": 0.7418680169519511, + "grad_norm": 0.7199442386627197, + "learning_rate": 6.997293672043316e-06, + "loss": 0.6879, + "step": 13479 + }, + { + "epoch": 0.7419230557543068, + "grad_norm": 0.6559237241744995, + "learning_rate": 6.9968962835071415e-06, + "loss": 0.6965, + "step": 13480 + }, + { + "epoch": 0.7419780945566624, + "grad_norm": 0.7428768277168274, + "learning_rate": 6.996498879962829e-06, + "loss": 0.7748, + "step": 13481 + }, + { + "epoch": 0.7420331333590181, + "grad_norm": 0.7344076633453369, + "learning_rate": 6.996101461413365e-06, + "loss": 0.6554, + "step": 13482 + }, + { + "epoch": 0.7420881721613738, + "grad_norm": 0.7080272436141968, + "learning_rate": 6.995704027861736e-06, + "loss": 0.7335, + "step": 13483 + }, + { + "epoch": 0.7421432109637295, + "grad_norm": 0.6296887397766113, + "learning_rate": 6.9953065793109306e-06, + "loss": 0.6411, + "step": 13484 + }, + { + "epoch": 0.7421982497660851, + "grad_norm": 0.7597532868385315, + "learning_rate": 6.994909115763935e-06, + "loss": 0.8281, + "step": 13485 + }, + { + "epoch": 0.7422532885684407, + "grad_norm": 0.7059680819511414, + "learning_rate": 6.994511637223737e-06, + "loss": 0.8075, + "step": 13486 + }, + { + "epoch": 0.7423083273707964, + "grad_norm": 0.8097653388977051, + "learning_rate": 6.994114143693323e-06, + "loss": 0.772, + "step": 13487 + }, + { + "epoch": 0.742363366173152, + "grad_norm": 0.7609913945198059, + "learning_rate": 6.993716635175681e-06, + "loss": 0.8265, + "step": 13488 + }, + { + "epoch": 0.7424184049755077, + "grad_norm": 0.6209948062896729, + "learning_rate": 6.993319111673799e-06, + "loss": 0.6266, + "step": 13489 + }, + { + "epoch": 0.7424734437778634, + "grad_norm": 0.6655107140541077, + "learning_rate": 6.992921573190663e-06, + "loss": 0.7519, + "step": 13490 + }, + { + "epoch": 0.7425284825802191, + "grad_norm": 1.1243617534637451, + "learning_rate": 6.992524019729262e-06, + "loss": 0.7707, + "step": 13491 + }, + { + "epoch": 0.7425835213825747, + "grad_norm": 0.6680326461791992, + "learning_rate": 6.9921264512925845e-06, + "loss": 0.7344, + "step": 13492 + }, + { + "epoch": 0.7426385601849304, + "grad_norm": 0.7689213156700134, + "learning_rate": 6.991728867883618e-06, + "loss": 0.7591, + "step": 13493 + }, + { + "epoch": 0.742693598987286, + "grad_norm": 0.8587394952774048, + "learning_rate": 6.99133126950535e-06, + "loss": 0.6991, + "step": 13494 + }, + { + "epoch": 0.7427486377896417, + "grad_norm": 0.6736756563186646, + "learning_rate": 6.990933656160768e-06, + "loss": 0.7604, + "step": 13495 + }, + { + "epoch": 0.7428036765919973, + "grad_norm": 0.6538887023925781, + "learning_rate": 6.990536027852864e-06, + "loss": 0.7332, + "step": 13496 + }, + { + "epoch": 0.742858715394353, + "grad_norm": 0.6578357815742493, + "learning_rate": 6.990138384584623e-06, + "loss": 0.7238, + "step": 13497 + }, + { + "epoch": 0.7429137541967087, + "grad_norm": 0.6865534782409668, + "learning_rate": 6.989740726359035e-06, + "loss": 0.7012, + "step": 13498 + }, + { + "epoch": 0.7429687929990644, + "grad_norm": 0.6198129057884216, + "learning_rate": 6.989343053179088e-06, + "loss": 0.7391, + "step": 13499 + }, + { + "epoch": 0.74302383180142, + "grad_norm": 0.6929547786712646, + "learning_rate": 6.98894536504777e-06, + "loss": 0.8498, + "step": 13500 + }, + { + "epoch": 0.7430788706037756, + "grad_norm": 0.6863006353378296, + "learning_rate": 6.988547661968072e-06, + "loss": 0.6589, + "step": 13501 + }, + { + "epoch": 0.7431339094061313, + "grad_norm": 0.7490457892417908, + "learning_rate": 6.988149943942982e-06, + "loss": 0.8145, + "step": 13502 + }, + { + "epoch": 0.743188948208487, + "grad_norm": 0.6597211360931396, + "learning_rate": 6.987752210975489e-06, + "loss": 0.7786, + "step": 13503 + }, + { + "epoch": 0.7432439870108426, + "grad_norm": 0.7211003303527832, + "learning_rate": 6.987354463068583e-06, + "loss": 0.7668, + "step": 13504 + }, + { + "epoch": 0.7432990258131983, + "grad_norm": 0.6257827877998352, + "learning_rate": 6.9869567002252526e-06, + "loss": 0.7378, + "step": 13505 + }, + { + "epoch": 0.743354064615554, + "grad_norm": 0.656944751739502, + "learning_rate": 6.986558922448488e-06, + "loss": 0.6408, + "step": 13506 + }, + { + "epoch": 0.7434091034179097, + "grad_norm": 0.6862110495567322, + "learning_rate": 6.986161129741276e-06, + "loss": 0.7648, + "step": 13507 + }, + { + "epoch": 0.7434641422202652, + "grad_norm": 0.6216374039649963, + "learning_rate": 6.985763322106612e-06, + "loss": 0.6826, + "step": 13508 + }, + { + "epoch": 0.7435191810226209, + "grad_norm": 0.7959128618240356, + "learning_rate": 6.985365499547479e-06, + "loss": 0.7554, + "step": 13509 + }, + { + "epoch": 0.7435742198249766, + "grad_norm": 0.5882300734519958, + "learning_rate": 6.984967662066875e-06, + "loss": 0.6523, + "step": 13510 + }, + { + "epoch": 0.7436292586273323, + "grad_norm": 0.8529833555221558, + "learning_rate": 6.9845698096677805e-06, + "loss": 0.7871, + "step": 13511 + }, + { + "epoch": 0.7436842974296879, + "grad_norm": 1.2988953590393066, + "learning_rate": 6.9841719423531925e-06, + "loss": 0.708, + "step": 13512 + }, + { + "epoch": 0.7437393362320436, + "grad_norm": 0.6735696792602539, + "learning_rate": 6.983774060126101e-06, + "loss": 0.7962, + "step": 13513 + }, + { + "epoch": 0.7437943750343993, + "grad_norm": 0.8145982623100281, + "learning_rate": 6.9833761629894925e-06, + "loss": 0.9067, + "step": 13514 + }, + { + "epoch": 0.743849413836755, + "grad_norm": 0.7107387781143188, + "learning_rate": 6.98297825094636e-06, + "loss": 0.7986, + "step": 13515 + }, + { + "epoch": 0.7439044526391105, + "grad_norm": 0.7350436449050903, + "learning_rate": 6.9825803239996934e-06, + "loss": 0.7724, + "step": 13516 + }, + { + "epoch": 0.7439594914414662, + "grad_norm": 0.7300962805747986, + "learning_rate": 6.982182382152485e-06, + "loss": 0.734, + "step": 13517 + }, + { + "epoch": 0.7440145302438219, + "grad_norm": 0.7088475823402405, + "learning_rate": 6.981784425407724e-06, + "loss": 0.818, + "step": 13518 + }, + { + "epoch": 0.7440695690461776, + "grad_norm": 0.6911785006523132, + "learning_rate": 6.981386453768402e-06, + "loss": 0.6857, + "step": 13519 + }, + { + "epoch": 0.7441246078485332, + "grad_norm": 0.794143795967102, + "learning_rate": 6.980988467237508e-06, + "loss": 0.7496, + "step": 13520 + }, + { + "epoch": 0.7441796466508889, + "grad_norm": 0.7116371989250183, + "learning_rate": 6.980590465818037e-06, + "loss": 0.7082, + "step": 13521 + }, + { + "epoch": 0.7442346854532446, + "grad_norm": 0.6306180953979492, + "learning_rate": 6.980192449512978e-06, + "loss": 0.7227, + "step": 13522 + }, + { + "epoch": 0.7442897242556002, + "grad_norm": 0.6662481427192688, + "learning_rate": 6.979794418325323e-06, + "loss": 0.7323, + "step": 13523 + }, + { + "epoch": 0.7443447630579558, + "grad_norm": 0.6824387907981873, + "learning_rate": 6.97939637225806e-06, + "loss": 0.7188, + "step": 13524 + }, + { + "epoch": 0.7443998018603115, + "grad_norm": 0.7429190278053284, + "learning_rate": 6.9789983113141865e-06, + "loss": 0.7818, + "step": 13525 + }, + { + "epoch": 0.7444548406626672, + "grad_norm": 0.7148364782333374, + "learning_rate": 6.978600235496692e-06, + "loss": 0.7665, + "step": 13526 + }, + { + "epoch": 0.7445098794650229, + "grad_norm": 0.711482584476471, + "learning_rate": 6.978202144808567e-06, + "loss": 0.7865, + "step": 13527 + }, + { + "epoch": 0.7445649182673785, + "grad_norm": 0.6913465857505798, + "learning_rate": 6.977804039252802e-06, + "loss": 0.8206, + "step": 13528 + }, + { + "epoch": 0.7446199570697342, + "grad_norm": 0.9090713858604431, + "learning_rate": 6.977405918832394e-06, + "loss": 0.7243, + "step": 13529 + }, + { + "epoch": 0.7446749958720899, + "grad_norm": 0.7680408954620361, + "learning_rate": 6.977007783550331e-06, + "loss": 0.847, + "step": 13530 + }, + { + "epoch": 0.7447300346744454, + "grad_norm": 0.6486232876777649, + "learning_rate": 6.976609633409608e-06, + "loss": 0.7258, + "step": 13531 + }, + { + "epoch": 0.7447850734768011, + "grad_norm": 0.7612336277961731, + "learning_rate": 6.976211468413214e-06, + "loss": 0.7452, + "step": 13532 + }, + { + "epoch": 0.7448401122791568, + "grad_norm": 0.7539309859275818, + "learning_rate": 6.975813288564146e-06, + "loss": 0.8292, + "step": 13533 + }, + { + "epoch": 0.7448951510815125, + "grad_norm": 0.64984530210495, + "learning_rate": 6.975415093865394e-06, + "loss": 0.6818, + "step": 13534 + }, + { + "epoch": 0.7449501898838681, + "grad_norm": 0.6415309309959412, + "learning_rate": 6.9750168843199506e-06, + "loss": 0.7369, + "step": 13535 + }, + { + "epoch": 0.7450052286862238, + "grad_norm": 0.7107319235801697, + "learning_rate": 6.974618659930807e-06, + "loss": 0.7364, + "step": 13536 + }, + { + "epoch": 0.7450602674885795, + "grad_norm": 0.7358448505401611, + "learning_rate": 6.9742204207009605e-06, + "loss": 0.7784, + "step": 13537 + }, + { + "epoch": 0.7451153062909351, + "grad_norm": 0.6950068473815918, + "learning_rate": 6.9738221666334e-06, + "loss": 0.792, + "step": 13538 + }, + { + "epoch": 0.7451703450932907, + "grad_norm": 0.7355311512947083, + "learning_rate": 6.973423897731122e-06, + "loss": 0.7631, + "step": 13539 + }, + { + "epoch": 0.7452253838956464, + "grad_norm": 0.6813983917236328, + "learning_rate": 6.9730256139971175e-06, + "loss": 0.7397, + "step": 13540 + }, + { + "epoch": 0.7452804226980021, + "grad_norm": 0.7698497772216797, + "learning_rate": 6.9726273154343806e-06, + "loss": 0.7769, + "step": 13541 + }, + { + "epoch": 0.7453354615003578, + "grad_norm": 0.7406428456306458, + "learning_rate": 6.972229002045905e-06, + "loss": 0.6502, + "step": 13542 + }, + { + "epoch": 0.7453905003027134, + "grad_norm": 0.6976667046546936, + "learning_rate": 6.9718306738346846e-06, + "loss": 0.773, + "step": 13543 + }, + { + "epoch": 0.745445539105069, + "grad_norm": 0.6932592391967773, + "learning_rate": 6.9714323308037115e-06, + "loss": 0.7315, + "step": 13544 + }, + { + "epoch": 0.7455005779074247, + "grad_norm": 0.7329851984977722, + "learning_rate": 6.971033972955981e-06, + "loss": 0.7432, + "step": 13545 + }, + { + "epoch": 0.7455556167097804, + "grad_norm": 0.6262860298156738, + "learning_rate": 6.970635600294489e-06, + "loss": 0.6368, + "step": 13546 + }, + { + "epoch": 0.745610655512136, + "grad_norm": 0.7157273292541504, + "learning_rate": 6.970237212822225e-06, + "loss": 0.7209, + "step": 13547 + }, + { + "epoch": 0.7456656943144917, + "grad_norm": 0.7256374955177307, + "learning_rate": 6.9698388105421855e-06, + "loss": 0.794, + "step": 13548 + }, + { + "epoch": 0.7457207331168474, + "grad_norm": 0.7763124704360962, + "learning_rate": 6.969440393457365e-06, + "loss": 0.7211, + "step": 13549 + }, + { + "epoch": 0.7457757719192031, + "grad_norm": 0.7139148712158203, + "learning_rate": 6.9690419615707585e-06, + "loss": 0.6612, + "step": 13550 + }, + { + "epoch": 0.7458308107215587, + "grad_norm": 0.7532974481582642, + "learning_rate": 6.968643514885359e-06, + "loss": 0.6952, + "step": 13551 + }, + { + "epoch": 0.7458858495239143, + "grad_norm": 0.6845714449882507, + "learning_rate": 6.968245053404161e-06, + "loss": 0.6972, + "step": 13552 + }, + { + "epoch": 0.74594088832627, + "grad_norm": 0.7445462346076965, + "learning_rate": 6.967846577130162e-06, + "loss": 0.7826, + "step": 13553 + }, + { + "epoch": 0.7459959271286257, + "grad_norm": 0.7269366383552551, + "learning_rate": 6.967448086066353e-06, + "loss": 0.7353, + "step": 13554 + }, + { + "epoch": 0.7460509659309813, + "grad_norm": 0.7366362810134888, + "learning_rate": 6.967049580215732e-06, + "loss": 0.7955, + "step": 13555 + }, + { + "epoch": 0.746106004733337, + "grad_norm": 0.6456870436668396, + "learning_rate": 6.966651059581292e-06, + "loss": 0.7467, + "step": 13556 + }, + { + "epoch": 0.7461610435356927, + "grad_norm": 0.7196624279022217, + "learning_rate": 6.966252524166031e-06, + "loss": 0.6621, + "step": 13557 + }, + { + "epoch": 0.7462160823380484, + "grad_norm": 0.6776413917541504, + "learning_rate": 6.965853973972941e-06, + "loss": 0.7647, + "step": 13558 + }, + { + "epoch": 0.746271121140404, + "grad_norm": 0.7319629192352295, + "learning_rate": 6.9654554090050195e-06, + "loss": 0.8172, + "step": 13559 + }, + { + "epoch": 0.7463261599427596, + "grad_norm": 0.6995210647583008, + "learning_rate": 6.96505682926526e-06, + "loss": 0.7252, + "step": 13560 + }, + { + "epoch": 0.7463811987451153, + "grad_norm": 0.6520518064498901, + "learning_rate": 6.964658234756659e-06, + "loss": 0.6856, + "step": 13561 + }, + { + "epoch": 0.746436237547471, + "grad_norm": 0.7562724947929382, + "learning_rate": 6.964259625482215e-06, + "loss": 0.7088, + "step": 13562 + }, + { + "epoch": 0.7464912763498266, + "grad_norm": 0.788045346736908, + "learning_rate": 6.963861001444919e-06, + "loss": 0.7183, + "step": 13563 + }, + { + "epoch": 0.7465463151521823, + "grad_norm": 0.7461729049682617, + "learning_rate": 6.96346236264777e-06, + "loss": 0.6725, + "step": 13564 + }, + { + "epoch": 0.746601353954538, + "grad_norm": 0.7283952832221985, + "learning_rate": 6.963063709093764e-06, + "loss": 0.7765, + "step": 13565 + }, + { + "epoch": 0.7466563927568937, + "grad_norm": 0.7947741150856018, + "learning_rate": 6.962665040785896e-06, + "loss": 0.8423, + "step": 13566 + }, + { + "epoch": 0.7467114315592492, + "grad_norm": 0.7964398264884949, + "learning_rate": 6.962266357727164e-06, + "loss": 0.7589, + "step": 13567 + }, + { + "epoch": 0.7467664703616049, + "grad_norm": 0.7807595133781433, + "learning_rate": 6.961867659920563e-06, + "loss": 0.7843, + "step": 13568 + }, + { + "epoch": 0.7468215091639606, + "grad_norm": 0.678011417388916, + "learning_rate": 6.961468947369089e-06, + "loss": 0.6664, + "step": 13569 + }, + { + "epoch": 0.7468765479663163, + "grad_norm": 0.6768447756767273, + "learning_rate": 6.961070220075741e-06, + "loss": 0.7531, + "step": 13570 + }, + { + "epoch": 0.7469315867686719, + "grad_norm": 0.7405245304107666, + "learning_rate": 6.960671478043514e-06, + "loss": 0.8278, + "step": 13571 + }, + { + "epoch": 0.7469866255710276, + "grad_norm": 0.605675458908081, + "learning_rate": 6.960272721275403e-06, + "loss": 0.7167, + "step": 13572 + }, + { + "epoch": 0.7470416643733833, + "grad_norm": 0.7406657338142395, + "learning_rate": 6.959873949774409e-06, + "loss": 0.8191, + "step": 13573 + }, + { + "epoch": 0.7470967031757388, + "grad_norm": 0.6163522601127625, + "learning_rate": 6.959475163543526e-06, + "loss": 0.6711, + "step": 13574 + }, + { + "epoch": 0.7471517419780945, + "grad_norm": 0.6036590337753296, + "learning_rate": 6.9590763625857525e-06, + "loss": 0.7029, + "step": 13575 + }, + { + "epoch": 0.7472067807804502, + "grad_norm": 0.8638957738876343, + "learning_rate": 6.9586775469040845e-06, + "loss": 0.6288, + "step": 13576 + }, + { + "epoch": 0.7472618195828059, + "grad_norm": 0.7490845322608948, + "learning_rate": 6.958278716501521e-06, + "loss": 0.7375, + "step": 13577 + }, + { + "epoch": 0.7473168583851615, + "grad_norm": 0.7788114547729492, + "learning_rate": 6.957879871381059e-06, + "loss": 0.814, + "step": 13578 + }, + { + "epoch": 0.7473718971875172, + "grad_norm": 0.7247292995452881, + "learning_rate": 6.957481011545697e-06, + "loss": 0.6187, + "step": 13579 + }, + { + "epoch": 0.7474269359898729, + "grad_norm": 0.9642785787582397, + "learning_rate": 6.95708213699843e-06, + "loss": 0.8745, + "step": 13580 + }, + { + "epoch": 0.7474819747922286, + "grad_norm": 0.701675295829773, + "learning_rate": 6.956683247742259e-06, + "loss": 0.8474, + "step": 13581 + }, + { + "epoch": 0.7475370135945841, + "grad_norm": 0.6338050961494446, + "learning_rate": 6.9562843437801795e-06, + "loss": 0.7346, + "step": 13582 + }, + { + "epoch": 0.7475920523969398, + "grad_norm": 0.6954126358032227, + "learning_rate": 6.955885425115191e-06, + "loss": 0.8083, + "step": 13583 + }, + { + "epoch": 0.7476470911992955, + "grad_norm": 0.7316300272941589, + "learning_rate": 6.95548649175029e-06, + "loss": 0.8009, + "step": 13584 + }, + { + "epoch": 0.7477021300016512, + "grad_norm": 0.6314196586608887, + "learning_rate": 6.955087543688477e-06, + "loss": 0.6375, + "step": 13585 + }, + { + "epoch": 0.7477571688040068, + "grad_norm": 0.6604906320571899, + "learning_rate": 6.9546885809327495e-06, + "loss": 0.7081, + "step": 13586 + }, + { + "epoch": 0.7478122076063625, + "grad_norm": 0.8251973986625671, + "learning_rate": 6.9542896034861064e-06, + "loss": 0.7483, + "step": 13587 + }, + { + "epoch": 0.7478672464087182, + "grad_norm": 0.6946399211883545, + "learning_rate": 6.953890611351544e-06, + "loss": 0.8849, + "step": 13588 + }, + { + "epoch": 0.7479222852110738, + "grad_norm": 0.7713609933853149, + "learning_rate": 6.953491604532063e-06, + "loss": 0.7913, + "step": 13589 + }, + { + "epoch": 0.7479773240134294, + "grad_norm": 0.734355092048645, + "learning_rate": 6.953092583030664e-06, + "loss": 0.7216, + "step": 13590 + }, + { + "epoch": 0.7480323628157851, + "grad_norm": 0.6147064566612244, + "learning_rate": 6.952693546850342e-06, + "loss": 0.6894, + "step": 13591 + }, + { + "epoch": 0.7480874016181408, + "grad_norm": 0.7472255229949951, + "learning_rate": 6.9522944959940986e-06, + "loss": 0.7941, + "step": 13592 + }, + { + "epoch": 0.7481424404204965, + "grad_norm": 0.6478431224822998, + "learning_rate": 6.951895430464935e-06, + "loss": 0.6995, + "step": 13593 + }, + { + "epoch": 0.7481974792228521, + "grad_norm": 0.6956225633621216, + "learning_rate": 6.951496350265844e-06, + "loss": 0.7637, + "step": 13594 + }, + { + "epoch": 0.7482525180252078, + "grad_norm": 1.0637938976287842, + "learning_rate": 6.95109725539983e-06, + "loss": 0.7448, + "step": 13595 + }, + { + "epoch": 0.7483075568275634, + "grad_norm": 0.6948299407958984, + "learning_rate": 6.9506981458698916e-06, + "loss": 0.7343, + "step": 13596 + }, + { + "epoch": 0.7483625956299191, + "grad_norm": 0.9034255743026733, + "learning_rate": 6.950299021679028e-06, + "loss": 0.6481, + "step": 13597 + }, + { + "epoch": 0.7484176344322747, + "grad_norm": 0.7901731729507446, + "learning_rate": 6.949899882830239e-06, + "loss": 0.8368, + "step": 13598 + }, + { + "epoch": 0.7484726732346304, + "grad_norm": 0.7791730761528015, + "learning_rate": 6.949500729326525e-06, + "loss": 0.7912, + "step": 13599 + }, + { + "epoch": 0.7485277120369861, + "grad_norm": 0.7678626179695129, + "learning_rate": 6.949101561170883e-06, + "loss": 0.7514, + "step": 13600 + }, + { + "epoch": 0.7485827508393418, + "grad_norm": 0.709762454032898, + "learning_rate": 6.948702378366318e-06, + "loss": 0.6809, + "step": 13601 + }, + { + "epoch": 0.7486377896416974, + "grad_norm": 0.706031084060669, + "learning_rate": 6.948303180915827e-06, + "loss": 0.7454, + "step": 13602 + }, + { + "epoch": 0.748692828444053, + "grad_norm": 0.658869743347168, + "learning_rate": 6.9479039688224105e-06, + "loss": 0.6498, + "step": 13603 + }, + { + "epoch": 0.7487478672464087, + "grad_norm": 0.7253865599632263, + "learning_rate": 6.9475047420890685e-06, + "loss": 0.8063, + "step": 13604 + }, + { + "epoch": 0.7488029060487644, + "grad_norm": 0.752839207649231, + "learning_rate": 6.947105500718804e-06, + "loss": 0.7708, + "step": 13605 + }, + { + "epoch": 0.74885794485112, + "grad_norm": 0.6694571375846863, + "learning_rate": 6.946706244714615e-06, + "loss": 0.7121, + "step": 13606 + }, + { + "epoch": 0.7489129836534757, + "grad_norm": 0.751380443572998, + "learning_rate": 6.946306974079503e-06, + "loss": 0.8797, + "step": 13607 + }, + { + "epoch": 0.7489680224558314, + "grad_norm": 0.8001984357833862, + "learning_rate": 6.9459076888164676e-06, + "loss": 0.8963, + "step": 13608 + }, + { + "epoch": 0.7490230612581871, + "grad_norm": 0.7149432301521301, + "learning_rate": 6.945508388928511e-06, + "loss": 0.8311, + "step": 13609 + }, + { + "epoch": 0.7490781000605427, + "grad_norm": 0.8295183777809143, + "learning_rate": 6.945109074418635e-06, + "loss": 0.7466, + "step": 13610 + }, + { + "epoch": 0.7491331388628983, + "grad_norm": 0.7480556964874268, + "learning_rate": 6.94470974528984e-06, + "loss": 0.8277, + "step": 13611 + }, + { + "epoch": 0.749188177665254, + "grad_norm": 0.7962234616279602, + "learning_rate": 6.944310401545127e-06, + "loss": 0.7143, + "step": 13612 + }, + { + "epoch": 0.7492432164676097, + "grad_norm": 0.7722699642181396, + "learning_rate": 6.943911043187497e-06, + "loss": 0.6619, + "step": 13613 + }, + { + "epoch": 0.7492982552699653, + "grad_norm": 0.8495624661445618, + "learning_rate": 6.943511670219952e-06, + "loss": 0.8475, + "step": 13614 + }, + { + "epoch": 0.749353294072321, + "grad_norm": 0.7702826261520386, + "learning_rate": 6.943112282645494e-06, + "loss": 0.826, + "step": 13615 + }, + { + "epoch": 0.7494083328746767, + "grad_norm": 0.7435297966003418, + "learning_rate": 6.942712880467124e-06, + "loss": 0.8121, + "step": 13616 + }, + { + "epoch": 0.7494633716770323, + "grad_norm": 0.8108325600624084, + "learning_rate": 6.942313463687844e-06, + "loss": 0.7282, + "step": 13617 + }, + { + "epoch": 0.7495184104793879, + "grad_norm": 0.6840381622314453, + "learning_rate": 6.9419140323106574e-06, + "loss": 0.7446, + "step": 13618 + }, + { + "epoch": 0.7495734492817436, + "grad_norm": 0.7155357599258423, + "learning_rate": 6.941514586338562e-06, + "loss": 0.7598, + "step": 13619 + }, + { + "epoch": 0.7496284880840993, + "grad_norm": 0.7693290114402771, + "learning_rate": 6.941115125774564e-06, + "loss": 0.7666, + "step": 13620 + }, + { + "epoch": 0.7496835268864549, + "grad_norm": 0.6918750405311584, + "learning_rate": 6.940715650621665e-06, + "loss": 0.6831, + "step": 13621 + }, + { + "epoch": 0.7497385656888106, + "grad_norm": 0.8241471648216248, + "learning_rate": 6.9403161608828654e-06, + "loss": 0.6753, + "step": 13622 + }, + { + "epoch": 0.7497936044911663, + "grad_norm": 0.6659193634986877, + "learning_rate": 6.93991665656117e-06, + "loss": 0.6988, + "step": 13623 + }, + { + "epoch": 0.749848643293522, + "grad_norm": 0.8012998700141907, + "learning_rate": 6.9395171376595795e-06, + "loss": 0.7922, + "step": 13624 + }, + { + "epoch": 0.7499036820958775, + "grad_norm": 0.783018946647644, + "learning_rate": 6.9391176041810974e-06, + "loss": 0.7062, + "step": 13625 + }, + { + "epoch": 0.7499587208982332, + "grad_norm": 0.8228014707565308, + "learning_rate": 6.938718056128726e-06, + "loss": 0.7762, + "step": 13626 + }, + { + "epoch": 0.7500137597005889, + "grad_norm": 0.783525288105011, + "learning_rate": 6.9383184935054705e-06, + "loss": 0.7517, + "step": 13627 + }, + { + "epoch": 0.7500687985029446, + "grad_norm": 0.6686612963676453, + "learning_rate": 6.93791891631433e-06, + "loss": 0.7372, + "step": 13628 + }, + { + "epoch": 0.7501238373053002, + "grad_norm": 0.7089647054672241, + "learning_rate": 6.937519324558312e-06, + "loss": 0.7847, + "step": 13629 + }, + { + "epoch": 0.7501788761076559, + "grad_norm": 0.7674399018287659, + "learning_rate": 6.937119718240415e-06, + "loss": 0.7414, + "step": 13630 + }, + { + "epoch": 0.7502339149100116, + "grad_norm": 0.6331565380096436, + "learning_rate": 6.936720097363646e-06, + "loss": 0.7603, + "step": 13631 + }, + { + "epoch": 0.7502889537123673, + "grad_norm": 0.7084798812866211, + "learning_rate": 6.9363204619310065e-06, + "loss": 0.6844, + "step": 13632 + }, + { + "epoch": 0.7503439925147228, + "grad_norm": 0.8624362945556641, + "learning_rate": 6.9359208119455015e-06, + "loss": 0.7098, + "step": 13633 + }, + { + "epoch": 0.7503990313170785, + "grad_norm": 0.7681849598884583, + "learning_rate": 6.935521147410134e-06, + "loss": 0.7896, + "step": 13634 + }, + { + "epoch": 0.7504540701194342, + "grad_norm": 0.7494263052940369, + "learning_rate": 6.935121468327907e-06, + "loss": 0.7858, + "step": 13635 + }, + { + "epoch": 0.7505091089217899, + "grad_norm": 0.7102827429771423, + "learning_rate": 6.934721774701824e-06, + "loss": 0.7485, + "step": 13636 + }, + { + "epoch": 0.7505641477241455, + "grad_norm": 0.7031061053276062, + "learning_rate": 6.934322066534891e-06, + "loss": 0.7154, + "step": 13637 + }, + { + "epoch": 0.7506191865265012, + "grad_norm": 0.6468148231506348, + "learning_rate": 6.933922343830112e-06, + "loss": 0.729, + "step": 13638 + }, + { + "epoch": 0.7506742253288569, + "grad_norm": 0.8570408225059509, + "learning_rate": 6.933522606590489e-06, + "loss": 0.6922, + "step": 13639 + }, + { + "epoch": 0.7507292641312125, + "grad_norm": 0.6836286783218384, + "learning_rate": 6.933122854819027e-06, + "loss": 0.7982, + "step": 13640 + }, + { + "epoch": 0.7507843029335681, + "grad_norm": 1.052017092704773, + "learning_rate": 6.9327230885187344e-06, + "loss": 0.7522, + "step": 13641 + }, + { + "epoch": 0.7508393417359238, + "grad_norm": 0.6352099180221558, + "learning_rate": 6.932323307692611e-06, + "loss": 0.6724, + "step": 13642 + }, + { + "epoch": 0.7508943805382795, + "grad_norm": 0.7046655416488647, + "learning_rate": 6.931923512343663e-06, + "loss": 0.7732, + "step": 13643 + }, + { + "epoch": 0.7509494193406352, + "grad_norm": 0.7600587010383606, + "learning_rate": 6.931523702474893e-06, + "loss": 0.7013, + "step": 13644 + }, + { + "epoch": 0.7510044581429908, + "grad_norm": 0.674828052520752, + "learning_rate": 6.9311238780893095e-06, + "loss": 0.7022, + "step": 13645 + }, + { + "epoch": 0.7510594969453465, + "grad_norm": 0.7517798542976379, + "learning_rate": 6.930724039189916e-06, + "loss": 0.7248, + "step": 13646 + }, + { + "epoch": 0.7511145357477022, + "grad_norm": 0.7851112484931946, + "learning_rate": 6.930324185779716e-06, + "loss": 0.8025, + "step": 13647 + }, + { + "epoch": 0.7511695745500578, + "grad_norm": 0.6545413732528687, + "learning_rate": 6.929924317861717e-06, + "loss": 0.781, + "step": 13648 + }, + { + "epoch": 0.7512246133524134, + "grad_norm": 0.7079984545707703, + "learning_rate": 6.929524435438923e-06, + "loss": 0.8033, + "step": 13649 + }, + { + "epoch": 0.7512796521547691, + "grad_norm": 0.6501914262771606, + "learning_rate": 6.929124538514341e-06, + "loss": 0.7525, + "step": 13650 + }, + { + "epoch": 0.7513346909571248, + "grad_norm": 0.7697597742080688, + "learning_rate": 6.928724627090975e-06, + "loss": 0.7358, + "step": 13651 + }, + { + "epoch": 0.7513897297594805, + "grad_norm": 0.8155171275138855, + "learning_rate": 6.928324701171832e-06, + "loss": 0.7389, + "step": 13652 + }, + { + "epoch": 0.7514447685618361, + "grad_norm": 0.6969262361526489, + "learning_rate": 6.927924760759914e-06, + "loss": 0.8349, + "step": 13653 + }, + { + "epoch": 0.7514998073641918, + "grad_norm": 0.6736776828765869, + "learning_rate": 6.927524805858233e-06, + "loss": 0.7379, + "step": 13654 + }, + { + "epoch": 0.7515548461665474, + "grad_norm": 0.6362389922142029, + "learning_rate": 6.927124836469788e-06, + "loss": 0.7479, + "step": 13655 + }, + { + "epoch": 0.7516098849689031, + "grad_norm": 0.688922643661499, + "learning_rate": 6.92672485259759e-06, + "loss": 0.7828, + "step": 13656 + }, + { + "epoch": 0.7516649237712587, + "grad_norm": 0.7098214030265808, + "learning_rate": 6.926324854244644e-06, + "loss": 0.6084, + "step": 13657 + }, + { + "epoch": 0.7517199625736144, + "grad_norm": 0.6436209678649902, + "learning_rate": 6.925924841413956e-06, + "loss": 0.687, + "step": 13658 + }, + { + "epoch": 0.7517750013759701, + "grad_norm": 0.6051730513572693, + "learning_rate": 6.925524814108533e-06, + "loss": 0.6884, + "step": 13659 + }, + { + "epoch": 0.7518300401783257, + "grad_norm": 0.6347759962081909, + "learning_rate": 6.92512477233138e-06, + "loss": 0.7057, + "step": 13660 + }, + { + "epoch": 0.7518850789806814, + "grad_norm": 0.6917054653167725, + "learning_rate": 6.924724716085505e-06, + "loss": 0.8374, + "step": 13661 + }, + { + "epoch": 0.751940117783037, + "grad_norm": 0.7676698565483093, + "learning_rate": 6.924324645373914e-06, + "loss": 0.7435, + "step": 13662 + }, + { + "epoch": 0.7519951565853927, + "grad_norm": 0.6601388454437256, + "learning_rate": 6.923924560199613e-06, + "loss": 0.7168, + "step": 13663 + }, + { + "epoch": 0.7520501953877483, + "grad_norm": 0.6342683434486389, + "learning_rate": 6.923524460565611e-06, + "loss": 0.7382, + "step": 13664 + }, + { + "epoch": 0.752105234190104, + "grad_norm": 0.6703974604606628, + "learning_rate": 6.923124346474915e-06, + "loss": 0.7687, + "step": 13665 + }, + { + "epoch": 0.7521602729924597, + "grad_norm": 0.6937074661254883, + "learning_rate": 6.922724217930531e-06, + "loss": 0.7687, + "step": 13666 + }, + { + "epoch": 0.7522153117948154, + "grad_norm": 0.7919568419456482, + "learning_rate": 6.922324074935466e-06, + "loss": 0.7328, + "step": 13667 + }, + { + "epoch": 0.752270350597171, + "grad_norm": 0.668331503868103, + "learning_rate": 6.9219239174927275e-06, + "loss": 0.7654, + "step": 13668 + }, + { + "epoch": 0.7523253893995266, + "grad_norm": 0.6298941969871521, + "learning_rate": 6.921523745605323e-06, + "loss": 0.719, + "step": 13669 + }, + { + "epoch": 0.7523804282018823, + "grad_norm": 0.6539381146430969, + "learning_rate": 6.921123559276262e-06, + "loss": 0.6681, + "step": 13670 + }, + { + "epoch": 0.752435467004238, + "grad_norm": 1.0692330598831177, + "learning_rate": 6.920723358508548e-06, + "loss": 0.7914, + "step": 13671 + }, + { + "epoch": 0.7524905058065936, + "grad_norm": 0.7410482168197632, + "learning_rate": 6.920323143305193e-06, + "loss": 0.8331, + "step": 13672 + }, + { + "epoch": 0.7525455446089493, + "grad_norm": 0.6976327300071716, + "learning_rate": 6.919922913669203e-06, + "loss": 0.8131, + "step": 13673 + }, + { + "epoch": 0.752600583411305, + "grad_norm": 0.646442174911499, + "learning_rate": 6.919522669603587e-06, + "loss": 0.7658, + "step": 13674 + }, + { + "epoch": 0.7526556222136607, + "grad_norm": 0.6257727146148682, + "learning_rate": 6.919122411111352e-06, + "loss": 0.666, + "step": 13675 + }, + { + "epoch": 0.7527106610160162, + "grad_norm": 0.6913230419158936, + "learning_rate": 6.918722138195506e-06, + "loss": 0.6935, + "step": 13676 + }, + { + "epoch": 0.7527656998183719, + "grad_norm": 0.6282557249069214, + "learning_rate": 6.918321850859059e-06, + "loss": 0.7042, + "step": 13677 + }, + { + "epoch": 0.7528207386207276, + "grad_norm": 0.6980175971984863, + "learning_rate": 6.917921549105018e-06, + "loss": 0.6757, + "step": 13678 + }, + { + "epoch": 0.7528757774230833, + "grad_norm": 0.6954337954521179, + "learning_rate": 6.917521232936393e-06, + "loss": 0.729, + "step": 13679 + }, + { + "epoch": 0.7529308162254389, + "grad_norm": 0.6813758015632629, + "learning_rate": 6.91712090235619e-06, + "loss": 0.6964, + "step": 13680 + }, + { + "epoch": 0.7529858550277946, + "grad_norm": 1.0940780639648438, + "learning_rate": 6.916720557367419e-06, + "loss": 0.7853, + "step": 13681 + }, + { + "epoch": 0.7530408938301503, + "grad_norm": 0.6899382472038269, + "learning_rate": 6.9163201979730906e-06, + "loss": 0.7639, + "step": 13682 + }, + { + "epoch": 0.753095932632506, + "grad_norm": 0.660252034664154, + "learning_rate": 6.915919824176213e-06, + "loss": 0.7068, + "step": 13683 + }, + { + "epoch": 0.7531509714348615, + "grad_norm": 0.6454583406448364, + "learning_rate": 6.915519435979795e-06, + "loss": 0.7268, + "step": 13684 + }, + { + "epoch": 0.7532060102372172, + "grad_norm": 0.7292754650115967, + "learning_rate": 6.915119033386843e-06, + "loss": 0.8131, + "step": 13685 + }, + { + "epoch": 0.7532610490395729, + "grad_norm": 0.6312932372093201, + "learning_rate": 6.914718616400372e-06, + "loss": 0.6977, + "step": 13686 + }, + { + "epoch": 0.7533160878419286, + "grad_norm": 0.8528029322624207, + "learning_rate": 6.914318185023388e-06, + "loss": 0.8403, + "step": 13687 + }, + { + "epoch": 0.7533711266442842, + "grad_norm": 0.758721649646759, + "learning_rate": 6.9139177392589e-06, + "loss": 0.7, + "step": 13688 + }, + { + "epoch": 0.7534261654466399, + "grad_norm": 0.6678142547607422, + "learning_rate": 6.913517279109919e-06, + "loss": 0.6251, + "step": 13689 + }, + { + "epoch": 0.7534812042489956, + "grad_norm": 0.6136146783828735, + "learning_rate": 6.913116804579455e-06, + "loss": 0.653, + "step": 13690 + }, + { + "epoch": 0.7535362430513513, + "grad_norm": 0.7546648383140564, + "learning_rate": 6.912716315670517e-06, + "loss": 0.8202, + "step": 13691 + }, + { + "epoch": 0.7535912818537068, + "grad_norm": 0.7232012152671814, + "learning_rate": 6.912315812386114e-06, + "loss": 0.7993, + "step": 13692 + }, + { + "epoch": 0.7536463206560625, + "grad_norm": 0.7288710474967957, + "learning_rate": 6.911915294729258e-06, + "loss": 0.7702, + "step": 13693 + }, + { + "epoch": 0.7537013594584182, + "grad_norm": 0.6847403049468994, + "learning_rate": 6.9115147627029575e-06, + "loss": 0.8141, + "step": 13694 + }, + { + "epoch": 0.7537563982607739, + "grad_norm": 0.62345951795578, + "learning_rate": 6.9111142163102255e-06, + "loss": 0.6832, + "step": 13695 + }, + { + "epoch": 0.7538114370631295, + "grad_norm": 0.7275232672691345, + "learning_rate": 6.9107136555540695e-06, + "loss": 0.7548, + "step": 13696 + }, + { + "epoch": 0.7538664758654852, + "grad_norm": 0.6724695563316345, + "learning_rate": 6.910313080437501e-06, + "loss": 0.7755, + "step": 13697 + }, + { + "epoch": 0.7539215146678409, + "grad_norm": 0.8446974754333496, + "learning_rate": 6.90991249096353e-06, + "loss": 0.827, + "step": 13698 + }, + { + "epoch": 0.7539765534701965, + "grad_norm": 0.7124913930892944, + "learning_rate": 6.9095118871351705e-06, + "loss": 0.7463, + "step": 13699 + }, + { + "epoch": 0.7540315922725521, + "grad_norm": 0.6916043162345886, + "learning_rate": 6.90911126895543e-06, + "loss": 0.714, + "step": 13700 + }, + { + "epoch": 0.7540866310749078, + "grad_norm": 0.7585330009460449, + "learning_rate": 6.908710636427319e-06, + "loss": 0.6731, + "step": 13701 + }, + { + "epoch": 0.7541416698772635, + "grad_norm": 0.6905520558357239, + "learning_rate": 6.90830998955385e-06, + "loss": 0.726, + "step": 13702 + }, + { + "epoch": 0.7541967086796191, + "grad_norm": 0.7482494115829468, + "learning_rate": 6.907909328338035e-06, + "loss": 0.7269, + "step": 13703 + }, + { + "epoch": 0.7542517474819748, + "grad_norm": 0.7565957307815552, + "learning_rate": 6.907508652782884e-06, + "loss": 0.6959, + "step": 13704 + }, + { + "epoch": 0.7543067862843305, + "grad_norm": 0.7458370923995972, + "learning_rate": 6.9071079628914075e-06, + "loss": 0.7448, + "step": 13705 + }, + { + "epoch": 0.7543618250866861, + "grad_norm": 1.3538293838500977, + "learning_rate": 6.9067072586666185e-06, + "loss": 0.8164, + "step": 13706 + }, + { + "epoch": 0.7544168638890417, + "grad_norm": 0.6217493414878845, + "learning_rate": 6.906306540111528e-06, + "loss": 0.7001, + "step": 13707 + }, + { + "epoch": 0.7544719026913974, + "grad_norm": 0.6862730383872986, + "learning_rate": 6.9059058072291485e-06, + "loss": 0.7921, + "step": 13708 + }, + { + "epoch": 0.7545269414937531, + "grad_norm": 0.6684688925743103, + "learning_rate": 6.905505060022491e-06, + "loss": 0.6736, + "step": 13709 + }, + { + "epoch": 0.7545819802961088, + "grad_norm": 0.6581160426139832, + "learning_rate": 6.905104298494567e-06, + "loss": 0.7581, + "step": 13710 + }, + { + "epoch": 0.7546370190984644, + "grad_norm": 0.7772610783576965, + "learning_rate": 6.9047035226483885e-06, + "loss": 0.7984, + "step": 13711 + }, + { + "epoch": 0.7546920579008201, + "grad_norm": 0.6856822371482849, + "learning_rate": 6.90430273248697e-06, + "loss": 0.8232, + "step": 13712 + }, + { + "epoch": 0.7547470967031757, + "grad_norm": 0.7250725626945496, + "learning_rate": 6.903901928013322e-06, + "loss": 0.7844, + "step": 13713 + }, + { + "epoch": 0.7548021355055314, + "grad_norm": 0.7034164667129517, + "learning_rate": 6.9035011092304545e-06, + "loss": 0.8293, + "step": 13714 + }, + { + "epoch": 0.754857174307887, + "grad_norm": 0.6783095002174377, + "learning_rate": 6.903100276141383e-06, + "loss": 0.6841, + "step": 13715 + }, + { + "epoch": 0.7549122131102427, + "grad_norm": 0.6180121302604675, + "learning_rate": 6.90269942874912e-06, + "loss": 0.7111, + "step": 13716 + }, + { + "epoch": 0.7549672519125984, + "grad_norm": 0.70428466796875, + "learning_rate": 6.902298567056677e-06, + "loss": 0.8758, + "step": 13717 + }, + { + "epoch": 0.7550222907149541, + "grad_norm": 0.8130238652229309, + "learning_rate": 6.9018976910670665e-06, + "loss": 0.6443, + "step": 13718 + }, + { + "epoch": 0.7550773295173097, + "grad_norm": 0.6910800933837891, + "learning_rate": 6.901496800783302e-06, + "loss": 0.7231, + "step": 13719 + }, + { + "epoch": 0.7551323683196653, + "grad_norm": 0.700933575630188, + "learning_rate": 6.901095896208398e-06, + "loss": 0.6785, + "step": 13720 + }, + { + "epoch": 0.755187407122021, + "grad_norm": 0.7407829761505127, + "learning_rate": 6.9006949773453656e-06, + "loss": 0.694, + "step": 13721 + }, + { + "epoch": 0.7552424459243767, + "grad_norm": 0.7907935380935669, + "learning_rate": 6.900294044197218e-06, + "loss": 0.7674, + "step": 13722 + }, + { + "epoch": 0.7552974847267323, + "grad_norm": 0.6585111021995544, + "learning_rate": 6.89989309676697e-06, + "loss": 0.6785, + "step": 13723 + }, + { + "epoch": 0.755352523529088, + "grad_norm": 0.7611724138259888, + "learning_rate": 6.899492135057633e-06, + "loss": 0.8028, + "step": 13724 + }, + { + "epoch": 0.7554075623314437, + "grad_norm": 0.6412070989608765, + "learning_rate": 6.899091159072222e-06, + "loss": 0.7634, + "step": 13725 + }, + { + "epoch": 0.7554626011337994, + "grad_norm": 0.7712366580963135, + "learning_rate": 6.898690168813751e-06, + "loss": 0.8275, + "step": 13726 + }, + { + "epoch": 0.755517639936155, + "grad_norm": 0.6826579570770264, + "learning_rate": 6.898289164285232e-06, + "loss": 0.7949, + "step": 13727 + }, + { + "epoch": 0.7555726787385106, + "grad_norm": 0.7501955628395081, + "learning_rate": 6.897888145489681e-06, + "loss": 0.7846, + "step": 13728 + }, + { + "epoch": 0.7556277175408663, + "grad_norm": 0.6493077874183655, + "learning_rate": 6.8974871124301075e-06, + "loss": 0.7294, + "step": 13729 + }, + { + "epoch": 0.755682756343222, + "grad_norm": 0.6854347586631775, + "learning_rate": 6.897086065109532e-06, + "loss": 0.7121, + "step": 13730 + }, + { + "epoch": 0.7557377951455776, + "grad_norm": 0.7376317977905273, + "learning_rate": 6.896685003530964e-06, + "loss": 0.7719, + "step": 13731 + }, + { + "epoch": 0.7557928339479333, + "grad_norm": 0.8477175235748291, + "learning_rate": 6.89628392769742e-06, + "loss": 0.7981, + "step": 13732 + }, + { + "epoch": 0.755847872750289, + "grad_norm": 0.6611722111701965, + "learning_rate": 6.8958828376119125e-06, + "loss": 0.7628, + "step": 13733 + }, + { + "epoch": 0.7559029115526447, + "grad_norm": 0.6898290514945984, + "learning_rate": 6.895481733277458e-06, + "loss": 0.7578, + "step": 13734 + }, + { + "epoch": 0.7559579503550002, + "grad_norm": 0.6566810607910156, + "learning_rate": 6.89508061469707e-06, + "loss": 0.6919, + "step": 13735 + }, + { + "epoch": 0.7560129891573559, + "grad_norm": 0.6395933032035828, + "learning_rate": 6.894679481873763e-06, + "loss": 0.7334, + "step": 13736 + }, + { + "epoch": 0.7560680279597116, + "grad_norm": 0.7060876488685608, + "learning_rate": 6.8942783348105535e-06, + "loss": 0.7405, + "step": 13737 + }, + { + "epoch": 0.7561230667620673, + "grad_norm": 0.7303228974342346, + "learning_rate": 6.893877173510454e-06, + "loss": 0.8563, + "step": 13738 + }, + { + "epoch": 0.7561781055644229, + "grad_norm": 0.663474977016449, + "learning_rate": 6.893475997976481e-06, + "loss": 0.703, + "step": 13739 + }, + { + "epoch": 0.7562331443667786, + "grad_norm": 0.8005428910255432, + "learning_rate": 6.893074808211649e-06, + "loss": 0.7219, + "step": 13740 + }, + { + "epoch": 0.7562881831691343, + "grad_norm": 1.3285688161849976, + "learning_rate": 6.892673604218972e-06, + "loss": 0.672, + "step": 13741 + }, + { + "epoch": 0.75634322197149, + "grad_norm": 0.6958948373794556, + "learning_rate": 6.892272386001469e-06, + "loss": 0.7728, + "step": 13742 + }, + { + "epoch": 0.7563982607738455, + "grad_norm": 0.6840598583221436, + "learning_rate": 6.891871153562153e-06, + "loss": 0.7881, + "step": 13743 + }, + { + "epoch": 0.7564532995762012, + "grad_norm": 0.7184257507324219, + "learning_rate": 6.891469906904039e-06, + "loss": 0.736, + "step": 13744 + }, + { + "epoch": 0.7565083383785569, + "grad_norm": 0.6611571311950684, + "learning_rate": 6.891068646030143e-06, + "loss": 0.7171, + "step": 13745 + }, + { + "epoch": 0.7565633771809125, + "grad_norm": 0.8237559795379639, + "learning_rate": 6.890667370943482e-06, + "loss": 0.8669, + "step": 13746 + }, + { + "epoch": 0.7566184159832682, + "grad_norm": 0.6898388266563416, + "learning_rate": 6.890266081647072e-06, + "loss": 0.6654, + "step": 13747 + }, + { + "epoch": 0.7566734547856239, + "grad_norm": 0.6541711688041687, + "learning_rate": 6.889864778143928e-06, + "loss": 0.7455, + "step": 13748 + }, + { + "epoch": 0.7567284935879796, + "grad_norm": 0.6518157124519348, + "learning_rate": 6.8894634604370655e-06, + "loss": 0.7174, + "step": 13749 + }, + { + "epoch": 0.7567835323903351, + "grad_norm": 0.7992080450057983, + "learning_rate": 6.889062128529502e-06, + "loss": 0.7349, + "step": 13750 + }, + { + "epoch": 0.7568385711926908, + "grad_norm": 0.5748338103294373, + "learning_rate": 6.888660782424253e-06, + "loss": 0.5398, + "step": 13751 + }, + { + "epoch": 0.7568936099950465, + "grad_norm": 0.6507781744003296, + "learning_rate": 6.8882594221243344e-06, + "loss": 0.6762, + "step": 13752 + }, + { + "epoch": 0.7569486487974022, + "grad_norm": 0.6908432841300964, + "learning_rate": 6.887858047632764e-06, + "loss": 0.8034, + "step": 13753 + }, + { + "epoch": 0.7570036875997578, + "grad_norm": 0.6497751474380493, + "learning_rate": 6.887456658952557e-06, + "loss": 0.6351, + "step": 13754 + }, + { + "epoch": 0.7570587264021135, + "grad_norm": 0.7233273386955261, + "learning_rate": 6.887055256086732e-06, + "loss": 0.7096, + "step": 13755 + }, + { + "epoch": 0.7571137652044692, + "grad_norm": 0.6587454676628113, + "learning_rate": 6.886653839038305e-06, + "loss": 0.7354, + "step": 13756 + }, + { + "epoch": 0.7571688040068248, + "grad_norm": 0.6654310822486877, + "learning_rate": 6.886252407810292e-06, + "loss": 0.7776, + "step": 13757 + }, + { + "epoch": 0.7572238428091804, + "grad_norm": 0.796604573726654, + "learning_rate": 6.885850962405711e-06, + "loss": 0.7925, + "step": 13758 + }, + { + "epoch": 0.7572788816115361, + "grad_norm": 0.7053457498550415, + "learning_rate": 6.8854495028275795e-06, + "loss": 0.7893, + "step": 13759 + }, + { + "epoch": 0.7573339204138918, + "grad_norm": 0.7201200127601624, + "learning_rate": 6.885048029078914e-06, + "loss": 0.8346, + "step": 13760 + }, + { + "epoch": 0.7573889592162475, + "grad_norm": 0.8437653183937073, + "learning_rate": 6.884646541162731e-06, + "loss": 0.7468, + "step": 13761 + }, + { + "epoch": 0.7574439980186031, + "grad_norm": 0.6910028457641602, + "learning_rate": 6.884245039082052e-06, + "loss": 0.7362, + "step": 13762 + }, + { + "epoch": 0.7574990368209588, + "grad_norm": 0.6896274089813232, + "learning_rate": 6.883843522839889e-06, + "loss": 0.6515, + "step": 13763 + }, + { + "epoch": 0.7575540756233144, + "grad_norm": 0.9833560585975647, + "learning_rate": 6.8834419924392636e-06, + "loss": 0.8764, + "step": 13764 + }, + { + "epoch": 0.7576091144256701, + "grad_norm": 0.7130032181739807, + "learning_rate": 6.88304044788319e-06, + "loss": 0.7631, + "step": 13765 + }, + { + "epoch": 0.7576641532280257, + "grad_norm": 0.7059195041656494, + "learning_rate": 6.882638889174691e-06, + "loss": 0.8147, + "step": 13766 + }, + { + "epoch": 0.7577191920303814, + "grad_norm": 0.6451989412307739, + "learning_rate": 6.882237316316781e-06, + "loss": 0.6638, + "step": 13767 + }, + { + "epoch": 0.7577742308327371, + "grad_norm": 0.7541074752807617, + "learning_rate": 6.881835729312481e-06, + "loss": 0.6918, + "step": 13768 + }, + { + "epoch": 0.7578292696350928, + "grad_norm": 0.7227535843849182, + "learning_rate": 6.881434128164805e-06, + "loss": 0.7759, + "step": 13769 + }, + { + "epoch": 0.7578843084374484, + "grad_norm": 0.673112154006958, + "learning_rate": 6.881032512876774e-06, + "loss": 0.7328, + "step": 13770 + }, + { + "epoch": 0.757939347239804, + "grad_norm": 0.6536681056022644, + "learning_rate": 6.880630883451407e-06, + "loss": 0.7677, + "step": 13771 + }, + { + "epoch": 0.7579943860421597, + "grad_norm": 0.8517894148826599, + "learning_rate": 6.880229239891721e-06, + "loss": 0.8566, + "step": 13772 + }, + { + "epoch": 0.7580494248445154, + "grad_norm": 0.8260573148727417, + "learning_rate": 6.879827582200737e-06, + "loss": 0.8228, + "step": 13773 + }, + { + "epoch": 0.758104463646871, + "grad_norm": 0.7460072040557861, + "learning_rate": 6.87942591038147e-06, + "loss": 0.8047, + "step": 13774 + }, + { + "epoch": 0.7581595024492267, + "grad_norm": 0.7648436427116394, + "learning_rate": 6.879024224436942e-06, + "loss": 0.852, + "step": 13775 + }, + { + "epoch": 0.7582145412515824, + "grad_norm": 0.7161253094673157, + "learning_rate": 6.878622524370171e-06, + "loss": 0.7638, + "step": 13776 + }, + { + "epoch": 0.7582695800539381, + "grad_norm": 0.6559579372406006, + "learning_rate": 6.878220810184175e-06, + "loss": 0.6932, + "step": 13777 + }, + { + "epoch": 0.7583246188562937, + "grad_norm": 0.6846898198127747, + "learning_rate": 6.877819081881975e-06, + "loss": 0.7098, + "step": 13778 + }, + { + "epoch": 0.7583796576586493, + "grad_norm": 0.7569675445556641, + "learning_rate": 6.87741733946659e-06, + "loss": 0.687, + "step": 13779 + }, + { + "epoch": 0.758434696461005, + "grad_norm": 0.7513766288757324, + "learning_rate": 6.877015582941038e-06, + "loss": 0.8673, + "step": 13780 + }, + { + "epoch": 0.7584897352633607, + "grad_norm": 0.7158082127571106, + "learning_rate": 6.876613812308338e-06, + "loss": 0.7563, + "step": 13781 + }, + { + "epoch": 0.7585447740657163, + "grad_norm": 0.6307277083396912, + "learning_rate": 6.876212027571513e-06, + "loss": 0.6725, + "step": 13782 + }, + { + "epoch": 0.758599812868072, + "grad_norm": 0.735090434551239, + "learning_rate": 6.87581022873358e-06, + "loss": 0.763, + "step": 13783 + }, + { + "epoch": 0.7586548516704277, + "grad_norm": 0.6412403583526611, + "learning_rate": 6.8754084157975594e-06, + "loss": 0.5992, + "step": 13784 + }, + { + "epoch": 0.7587098904727834, + "grad_norm": 0.639854907989502, + "learning_rate": 6.875006588766472e-06, + "loss": 0.7372, + "step": 13785 + }, + { + "epoch": 0.7587649292751389, + "grad_norm": 0.6855082511901855, + "learning_rate": 6.8746047476433365e-06, + "loss": 0.7709, + "step": 13786 + }, + { + "epoch": 0.7588199680774946, + "grad_norm": 0.6838769912719727, + "learning_rate": 6.874202892431173e-06, + "loss": 0.7545, + "step": 13787 + }, + { + "epoch": 0.7588750068798503, + "grad_norm": 1.1560181379318237, + "learning_rate": 6.873801023133002e-06, + "loss": 0.7291, + "step": 13788 + }, + { + "epoch": 0.7589300456822059, + "grad_norm": 0.7140469551086426, + "learning_rate": 6.873399139751844e-06, + "loss": 0.7214, + "step": 13789 + }, + { + "epoch": 0.7589850844845616, + "grad_norm": 0.6856355667114258, + "learning_rate": 6.8729972422907195e-06, + "loss": 0.7417, + "step": 13790 + }, + { + "epoch": 0.7590401232869173, + "grad_norm": 0.7856155633926392, + "learning_rate": 6.8725953307526505e-06, + "loss": 0.7484, + "step": 13791 + }, + { + "epoch": 0.759095162089273, + "grad_norm": 0.8107255697250366, + "learning_rate": 6.8721934051406555e-06, + "loss": 0.7568, + "step": 13792 + }, + { + "epoch": 0.7591502008916285, + "grad_norm": 0.6590837240219116, + "learning_rate": 6.871791465457757e-06, + "loss": 0.7495, + "step": 13793 + }, + { + "epoch": 0.7592052396939842, + "grad_norm": 0.7531588077545166, + "learning_rate": 6.8713895117069715e-06, + "loss": 0.7434, + "step": 13794 + }, + { + "epoch": 0.7592602784963399, + "grad_norm": 0.6818329095840454, + "learning_rate": 6.870987543891326e-06, + "loss": 0.7128, + "step": 13795 + }, + { + "epoch": 0.7593153172986956, + "grad_norm": 0.6082884669303894, + "learning_rate": 6.8705855620138395e-06, + "loss": 0.7437, + "step": 13796 + }, + { + "epoch": 0.7593703561010512, + "grad_norm": 0.9583787322044373, + "learning_rate": 6.870183566077532e-06, + "loss": 0.7779, + "step": 13797 + }, + { + "epoch": 0.7594253949034069, + "grad_norm": 0.6684621572494507, + "learning_rate": 6.869781556085425e-06, + "loss": 0.5856, + "step": 13798 + }, + { + "epoch": 0.7594804337057626, + "grad_norm": 0.6225603222846985, + "learning_rate": 6.869379532040541e-06, + "loss": 0.7407, + "step": 13799 + }, + { + "epoch": 0.7595354725081183, + "grad_norm": 0.6973103284835815, + "learning_rate": 6.8689774939459005e-06, + "loss": 0.7789, + "step": 13800 + }, + { + "epoch": 0.7595905113104738, + "grad_norm": 0.6655399203300476, + "learning_rate": 6.868575441804526e-06, + "loss": 0.7489, + "step": 13801 + }, + { + "epoch": 0.7596455501128295, + "grad_norm": 0.7066664695739746, + "learning_rate": 6.868173375619437e-06, + "loss": 0.7035, + "step": 13802 + }, + { + "epoch": 0.7597005889151852, + "grad_norm": 1.0646852254867554, + "learning_rate": 6.867771295393658e-06, + "loss": 0.8488, + "step": 13803 + }, + { + "epoch": 0.7597556277175409, + "grad_norm": 0.6551353335380554, + "learning_rate": 6.867369201130209e-06, + "loss": 0.7147, + "step": 13804 + }, + { + "epoch": 0.7598106665198965, + "grad_norm": 0.6749850511550903, + "learning_rate": 6.866967092832115e-06, + "loss": 0.7963, + "step": 13805 + }, + { + "epoch": 0.7598657053222522, + "grad_norm": 0.6704042553901672, + "learning_rate": 6.866564970502394e-06, + "loss": 0.7992, + "step": 13806 + }, + { + "epoch": 0.7599207441246079, + "grad_norm": 0.7027791142463684, + "learning_rate": 6.866162834144071e-06, + "loss": 0.7931, + "step": 13807 + }, + { + "epoch": 0.7599757829269636, + "grad_norm": 0.7925322651863098, + "learning_rate": 6.865760683760169e-06, + "loss": 0.7826, + "step": 13808 + }, + { + "epoch": 0.7600308217293191, + "grad_norm": 0.7152161002159119, + "learning_rate": 6.865358519353708e-06, + "loss": 0.7481, + "step": 13809 + }, + { + "epoch": 0.7600858605316748, + "grad_norm": 0.6572757959365845, + "learning_rate": 6.864956340927711e-06, + "loss": 0.785, + "step": 13810 + }, + { + "epoch": 0.7601408993340305, + "grad_norm": 0.6848406791687012, + "learning_rate": 6.864554148485203e-06, + "loss": 0.6423, + "step": 13811 + }, + { + "epoch": 0.7601959381363862, + "grad_norm": 0.747597873210907, + "learning_rate": 6.864151942029205e-06, + "loss": 0.7901, + "step": 13812 + }, + { + "epoch": 0.7602509769387418, + "grad_norm": 0.7106720805168152, + "learning_rate": 6.863749721562738e-06, + "loss": 0.7488, + "step": 13813 + }, + { + "epoch": 0.7603060157410975, + "grad_norm": 0.6864057779312134, + "learning_rate": 6.8633474870888275e-06, + "loss": 0.7066, + "step": 13814 + }, + { + "epoch": 0.7603610545434532, + "grad_norm": 0.7022056579589844, + "learning_rate": 6.862945238610496e-06, + "loss": 0.6851, + "step": 13815 + }, + { + "epoch": 0.7604160933458088, + "grad_norm": 0.7361913919448853, + "learning_rate": 6.862542976130769e-06, + "loss": 0.7425, + "step": 13816 + }, + { + "epoch": 0.7604711321481644, + "grad_norm": 0.6723676323890686, + "learning_rate": 6.862140699652666e-06, + "loss": 0.7937, + "step": 13817 + }, + { + "epoch": 0.7605261709505201, + "grad_norm": 0.7491924166679382, + "learning_rate": 6.861738409179212e-06, + "loss": 0.7585, + "step": 13818 + }, + { + "epoch": 0.7605812097528758, + "grad_norm": 0.6772211790084839, + "learning_rate": 6.86133610471343e-06, + "loss": 0.7617, + "step": 13819 + }, + { + "epoch": 0.7606362485552315, + "grad_norm": 0.7819864153862, + "learning_rate": 6.860933786258344e-06, + "loss": 0.7924, + "step": 13820 + }, + { + "epoch": 0.7606912873575871, + "grad_norm": 0.6992526650428772, + "learning_rate": 6.86053145381698e-06, + "loss": 0.7054, + "step": 13821 + }, + { + "epoch": 0.7607463261599428, + "grad_norm": 0.7189231514930725, + "learning_rate": 6.860129107392357e-06, + "loss": 0.7603, + "step": 13822 + }, + { + "epoch": 0.7608013649622984, + "grad_norm": 0.7165294885635376, + "learning_rate": 6.859726746987503e-06, + "loss": 0.8118, + "step": 13823 + }, + { + "epoch": 0.7608564037646541, + "grad_norm": 0.6510334014892578, + "learning_rate": 6.85932437260544e-06, + "loss": 0.7584, + "step": 13824 + }, + { + "epoch": 0.7609114425670097, + "grad_norm": 0.7113379836082458, + "learning_rate": 6.8589219842491935e-06, + "loss": 0.7799, + "step": 13825 + }, + { + "epoch": 0.7609664813693654, + "grad_norm": 0.7441100478172302, + "learning_rate": 6.8585195819217856e-06, + "loss": 0.6468, + "step": 13826 + }, + { + "epoch": 0.7610215201717211, + "grad_norm": 1.0703508853912354, + "learning_rate": 6.858117165626244e-06, + "loss": 0.7922, + "step": 13827 + }, + { + "epoch": 0.7610765589740768, + "grad_norm": 0.7097275853157043, + "learning_rate": 6.857714735365589e-06, + "loss": 0.7594, + "step": 13828 + }, + { + "epoch": 0.7611315977764324, + "grad_norm": 0.7001124620437622, + "learning_rate": 6.857312291142848e-06, + "loss": 0.7679, + "step": 13829 + }, + { + "epoch": 0.761186636578788, + "grad_norm": 0.6898123621940613, + "learning_rate": 6.856909832961045e-06, + "loss": 0.7684, + "step": 13830 + }, + { + "epoch": 0.7612416753811437, + "grad_norm": 0.6535243391990662, + "learning_rate": 6.856507360823206e-06, + "loss": 0.6143, + "step": 13831 + }, + { + "epoch": 0.7612967141834993, + "grad_norm": 0.6726056933403015, + "learning_rate": 6.856104874732353e-06, + "loss": 0.7566, + "step": 13832 + }, + { + "epoch": 0.761351752985855, + "grad_norm": 0.8741437196731567, + "learning_rate": 6.855702374691513e-06, + "loss": 0.723, + "step": 13833 + }, + { + "epoch": 0.7614067917882107, + "grad_norm": 0.7025718092918396, + "learning_rate": 6.855299860703712e-06, + "loss": 0.8035, + "step": 13834 + }, + { + "epoch": 0.7614618305905664, + "grad_norm": 1.08286452293396, + "learning_rate": 6.8548973327719726e-06, + "loss": 0.7347, + "step": 13835 + }, + { + "epoch": 0.761516869392922, + "grad_norm": 0.6483243107795715, + "learning_rate": 6.854494790899322e-06, + "loss": 0.7326, + "step": 13836 + }, + { + "epoch": 0.7615719081952776, + "grad_norm": 0.6611089110374451, + "learning_rate": 6.854092235088784e-06, + "loss": 0.7619, + "step": 13837 + }, + { + "epoch": 0.7616269469976333, + "grad_norm": 0.8394322991371155, + "learning_rate": 6.853689665343385e-06, + "loss": 0.7017, + "step": 13838 + }, + { + "epoch": 0.761681985799989, + "grad_norm": 0.7131583094596863, + "learning_rate": 6.853287081666151e-06, + "loss": 0.7367, + "step": 13839 + }, + { + "epoch": 0.7617370246023446, + "grad_norm": 0.7316367626190186, + "learning_rate": 6.852884484060108e-06, + "loss": 0.7323, + "step": 13840 + }, + { + "epoch": 0.7617920634047003, + "grad_norm": 0.7639010548591614, + "learning_rate": 6.852481872528281e-06, + "loss": 0.819, + "step": 13841 + }, + { + "epoch": 0.761847102207056, + "grad_norm": 0.7118390202522278, + "learning_rate": 6.852079247073695e-06, + "loss": 0.7645, + "step": 13842 + }, + { + "epoch": 0.7619021410094117, + "grad_norm": 0.6885393857955933, + "learning_rate": 6.851676607699379e-06, + "loss": 0.8052, + "step": 13843 + }, + { + "epoch": 0.7619571798117672, + "grad_norm": 0.7034374475479126, + "learning_rate": 6.851273954408356e-06, + "loss": 0.8464, + "step": 13844 + }, + { + "epoch": 0.7620122186141229, + "grad_norm": 0.6531803607940674, + "learning_rate": 6.850871287203654e-06, + "loss": 0.7871, + "step": 13845 + }, + { + "epoch": 0.7620672574164786, + "grad_norm": 0.6637283563613892, + "learning_rate": 6.8504686060882995e-06, + "loss": 0.7326, + "step": 13846 + }, + { + "epoch": 0.7621222962188343, + "grad_norm": 0.6467694640159607, + "learning_rate": 6.850065911065318e-06, + "loss": 0.7936, + "step": 13847 + }, + { + "epoch": 0.7621773350211899, + "grad_norm": 0.6829109191894531, + "learning_rate": 6.849663202137735e-06, + "loss": 0.7003, + "step": 13848 + }, + { + "epoch": 0.7622323738235456, + "grad_norm": 0.7321386933326721, + "learning_rate": 6.84926047930858e-06, + "loss": 0.6921, + "step": 13849 + }, + { + "epoch": 0.7622874126259013, + "grad_norm": 0.6900202631950378, + "learning_rate": 6.8488577425808766e-06, + "loss": 0.7496, + "step": 13850 + }, + { + "epoch": 0.762342451428257, + "grad_norm": 0.6304247975349426, + "learning_rate": 6.848454991957655e-06, + "loss": 0.7135, + "step": 13851 + }, + { + "epoch": 0.7623974902306125, + "grad_norm": 0.7087798118591309, + "learning_rate": 6.8480522274419404e-06, + "loss": 0.7032, + "step": 13852 + }, + { + "epoch": 0.7624525290329682, + "grad_norm": 0.7777289152145386, + "learning_rate": 6.84764944903676e-06, + "loss": 0.7345, + "step": 13853 + }, + { + "epoch": 0.7625075678353239, + "grad_norm": 0.7282242774963379, + "learning_rate": 6.847246656745139e-06, + "loss": 0.6408, + "step": 13854 + }, + { + "epoch": 0.7625626066376796, + "grad_norm": 0.7798221707344055, + "learning_rate": 6.846843850570107e-06, + "loss": 0.9058, + "step": 13855 + }, + { + "epoch": 0.7626176454400352, + "grad_norm": 0.6145210266113281, + "learning_rate": 6.846441030514692e-06, + "loss": 0.6331, + "step": 13856 + }, + { + "epoch": 0.7626726842423909, + "grad_norm": 0.7079364061355591, + "learning_rate": 6.846038196581921e-06, + "loss": 0.7511, + "step": 13857 + }, + { + "epoch": 0.7627277230447466, + "grad_norm": 0.733635425567627, + "learning_rate": 6.845635348774821e-06, + "loss": 0.6957, + "step": 13858 + }, + { + "epoch": 0.7627827618471023, + "grad_norm": 0.8099489808082581, + "learning_rate": 6.845232487096419e-06, + "loss": 0.8068, + "step": 13859 + }, + { + "epoch": 0.7628378006494578, + "grad_norm": 0.6241937875747681, + "learning_rate": 6.844829611549744e-06, + "loss": 0.7102, + "step": 13860 + }, + { + "epoch": 0.7628928394518135, + "grad_norm": 0.8009611368179321, + "learning_rate": 6.8444267221378235e-06, + "loss": 0.8369, + "step": 13861 + }, + { + "epoch": 0.7629478782541692, + "grad_norm": 0.6700903177261353, + "learning_rate": 6.844023818863685e-06, + "loss": 0.8075, + "step": 13862 + }, + { + "epoch": 0.7630029170565249, + "grad_norm": 0.9378371834754944, + "learning_rate": 6.843620901730357e-06, + "loss": 0.7539, + "step": 13863 + }, + { + "epoch": 0.7630579558588805, + "grad_norm": 0.6704423427581787, + "learning_rate": 6.843217970740867e-06, + "loss": 0.7285, + "step": 13864 + }, + { + "epoch": 0.7631129946612362, + "grad_norm": 0.7236818075180054, + "learning_rate": 6.842815025898246e-06, + "loss": 0.7223, + "step": 13865 + }, + { + "epoch": 0.7631680334635919, + "grad_norm": 0.676184356212616, + "learning_rate": 6.84241206720552e-06, + "loss": 0.7286, + "step": 13866 + }, + { + "epoch": 0.7632230722659475, + "grad_norm": 0.6443304419517517, + "learning_rate": 6.842009094665717e-06, + "loss": 0.6806, + "step": 13867 + }, + { + "epoch": 0.7632781110683031, + "grad_norm": 0.7931790947914124, + "learning_rate": 6.841606108281868e-06, + "loss": 0.7801, + "step": 13868 + }, + { + "epoch": 0.7633331498706588, + "grad_norm": 0.7440798878669739, + "learning_rate": 6.841203108057e-06, + "loss": 0.8044, + "step": 13869 + }, + { + "epoch": 0.7633881886730145, + "grad_norm": 0.7226675748825073, + "learning_rate": 6.840800093994142e-06, + "loss": 0.718, + "step": 13870 + }, + { + "epoch": 0.7634432274753702, + "grad_norm": 0.7351265549659729, + "learning_rate": 6.8403970660963245e-06, + "loss": 0.8389, + "step": 13871 + }, + { + "epoch": 0.7634982662777258, + "grad_norm": 0.8326215744018555, + "learning_rate": 6.839994024366574e-06, + "loss": 0.8583, + "step": 13872 + }, + { + "epoch": 0.7635533050800815, + "grad_norm": 0.6841259002685547, + "learning_rate": 6.839590968807922e-06, + "loss": 0.7553, + "step": 13873 + }, + { + "epoch": 0.7636083438824371, + "grad_norm": 0.7305078506469727, + "learning_rate": 6.839187899423395e-06, + "loss": 0.7825, + "step": 13874 + }, + { + "epoch": 0.7636633826847927, + "grad_norm": 0.7235193252563477, + "learning_rate": 6.838784816216025e-06, + "loss": 0.7653, + "step": 13875 + }, + { + "epoch": 0.7637184214871484, + "grad_norm": 0.6468761563301086, + "learning_rate": 6.838381719188842e-06, + "loss": 0.6901, + "step": 13876 + }, + { + "epoch": 0.7637734602895041, + "grad_norm": 0.6806310415267944, + "learning_rate": 6.837978608344872e-06, + "loss": 0.6876, + "step": 13877 + }, + { + "epoch": 0.7638284990918598, + "grad_norm": 0.692081093788147, + "learning_rate": 6.837575483687147e-06, + "loss": 0.7506, + "step": 13878 + }, + { + "epoch": 0.7638835378942154, + "grad_norm": 0.6447135806083679, + "learning_rate": 6.837172345218697e-06, + "loss": 0.6841, + "step": 13879 + }, + { + "epoch": 0.7639385766965711, + "grad_norm": 0.7352014183998108, + "learning_rate": 6.8367691929425516e-06, + "loss": 0.8066, + "step": 13880 + }, + { + "epoch": 0.7639936154989267, + "grad_norm": 0.7305072546005249, + "learning_rate": 6.8363660268617405e-06, + "loss": 0.717, + "step": 13881 + }, + { + "epoch": 0.7640486543012824, + "grad_norm": 0.6580411195755005, + "learning_rate": 6.835962846979294e-06, + "loss": 0.7585, + "step": 13882 + }, + { + "epoch": 0.764103693103638, + "grad_norm": 0.7568425536155701, + "learning_rate": 6.835559653298242e-06, + "loss": 0.8273, + "step": 13883 + }, + { + "epoch": 0.7641587319059937, + "grad_norm": 0.8121107816696167, + "learning_rate": 6.835156445821616e-06, + "loss": 0.9064, + "step": 13884 + }, + { + "epoch": 0.7642137707083494, + "grad_norm": 0.6522091031074524, + "learning_rate": 6.834753224552444e-06, + "loss": 0.767, + "step": 13885 + }, + { + "epoch": 0.7642688095107051, + "grad_norm": 1.0779389142990112, + "learning_rate": 6.8343499894937574e-06, + "loss": 0.7702, + "step": 13886 + }, + { + "epoch": 0.7643238483130607, + "grad_norm": 0.6902838349342346, + "learning_rate": 6.833946740648588e-06, + "loss": 0.6529, + "step": 13887 + }, + { + "epoch": 0.7643788871154164, + "grad_norm": 0.692480742931366, + "learning_rate": 6.833543478019966e-06, + "loss": 0.7404, + "step": 13888 + }, + { + "epoch": 0.764433925917772, + "grad_norm": 0.633627712726593, + "learning_rate": 6.833140201610923e-06, + "loss": 0.711, + "step": 13889 + }, + { + "epoch": 0.7644889647201277, + "grad_norm": 0.8653294444084167, + "learning_rate": 6.832736911424487e-06, + "loss": 0.8102, + "step": 13890 + }, + { + "epoch": 0.7645440035224833, + "grad_norm": 0.7864197492599487, + "learning_rate": 6.832333607463692e-06, + "loss": 0.7064, + "step": 13891 + }, + { + "epoch": 0.764599042324839, + "grad_norm": 0.6703711748123169, + "learning_rate": 6.831930289731569e-06, + "loss": 0.7653, + "step": 13892 + }, + { + "epoch": 0.7646540811271947, + "grad_norm": 0.7420178651809692, + "learning_rate": 6.831526958231147e-06, + "loss": 0.8137, + "step": 13893 + }, + { + "epoch": 0.7647091199295504, + "grad_norm": 0.7372543215751648, + "learning_rate": 6.831123612965459e-06, + "loss": 0.6871, + "step": 13894 + }, + { + "epoch": 0.764764158731906, + "grad_norm": 0.77486652135849, + "learning_rate": 6.830720253937536e-06, + "loss": 0.727, + "step": 13895 + }, + { + "epoch": 0.7648191975342616, + "grad_norm": 0.7087406516075134, + "learning_rate": 6.83031688115041e-06, + "loss": 0.7743, + "step": 13896 + }, + { + "epoch": 0.7648742363366173, + "grad_norm": 0.8415336608886719, + "learning_rate": 6.829913494607112e-06, + "loss": 0.774, + "step": 13897 + }, + { + "epoch": 0.764929275138973, + "grad_norm": 0.7736749053001404, + "learning_rate": 6.829510094310674e-06, + "loss": 0.7541, + "step": 13898 + }, + { + "epoch": 0.7649843139413286, + "grad_norm": 0.6749987602233887, + "learning_rate": 6.829106680264128e-06, + "loss": 0.7139, + "step": 13899 + }, + { + "epoch": 0.7650393527436843, + "grad_norm": 0.7079635262489319, + "learning_rate": 6.8287032524705055e-06, + "loss": 0.75, + "step": 13900 + }, + { + "epoch": 0.76509439154604, + "grad_norm": 0.6906388401985168, + "learning_rate": 6.828299810932839e-06, + "loss": 0.6895, + "step": 13901 + }, + { + "epoch": 0.7651494303483957, + "grad_norm": 0.7045881152153015, + "learning_rate": 6.82789635565416e-06, + "loss": 0.8728, + "step": 13902 + }, + { + "epoch": 0.7652044691507512, + "grad_norm": 0.6836426258087158, + "learning_rate": 6.827492886637501e-06, + "loss": 0.7315, + "step": 13903 + }, + { + "epoch": 0.7652595079531069, + "grad_norm": 0.6467520594596863, + "learning_rate": 6.827089403885896e-06, + "loss": 0.7556, + "step": 13904 + }, + { + "epoch": 0.7653145467554626, + "grad_norm": 0.7118285894393921, + "learning_rate": 6.826685907402376e-06, + "loss": 0.8686, + "step": 13905 + }, + { + "epoch": 0.7653695855578183, + "grad_norm": 0.6093236207962036, + "learning_rate": 6.826282397189974e-06, + "loss": 0.7066, + "step": 13906 + }, + { + "epoch": 0.7654246243601739, + "grad_norm": 0.6839649677276611, + "learning_rate": 6.825878873251721e-06, + "loss": 0.7025, + "step": 13907 + }, + { + "epoch": 0.7654796631625296, + "grad_norm": 0.7582715153694153, + "learning_rate": 6.825475335590652e-06, + "loss": 0.7301, + "step": 13908 + }, + { + "epoch": 0.7655347019648853, + "grad_norm": 0.6580978631973267, + "learning_rate": 6.8250717842098e-06, + "loss": 0.6771, + "step": 13909 + }, + { + "epoch": 0.765589740767241, + "grad_norm": 0.6754937171936035, + "learning_rate": 6.824668219112195e-06, + "loss": 0.7446, + "step": 13910 + }, + { + "epoch": 0.7656447795695965, + "grad_norm": 0.7541018724441528, + "learning_rate": 6.8242646403008725e-06, + "loss": 0.802, + "step": 13911 + }, + { + "epoch": 0.7656998183719522, + "grad_norm": 0.6714808344841003, + "learning_rate": 6.823861047778866e-06, + "loss": 0.7334, + "step": 13912 + }, + { + "epoch": 0.7657548571743079, + "grad_norm": 0.6972425580024719, + "learning_rate": 6.823457441549209e-06, + "loss": 0.7859, + "step": 13913 + }, + { + "epoch": 0.7658098959766636, + "grad_norm": 0.6660878658294678, + "learning_rate": 6.823053821614931e-06, + "loss": 0.6594, + "step": 13914 + }, + { + "epoch": 0.7658649347790192, + "grad_norm": 0.7392181158065796, + "learning_rate": 6.82265018797907e-06, + "loss": 0.6667, + "step": 13915 + }, + { + "epoch": 0.7659199735813749, + "grad_norm": 0.7601449489593506, + "learning_rate": 6.822246540644659e-06, + "loss": 0.7349, + "step": 13916 + }, + { + "epoch": 0.7659750123837306, + "grad_norm": 0.6648421287536621, + "learning_rate": 6.821842879614731e-06, + "loss": 0.7597, + "step": 13917 + }, + { + "epoch": 0.7660300511860861, + "grad_norm": 0.6369950175285339, + "learning_rate": 6.821439204892317e-06, + "loss": 0.7452, + "step": 13918 + }, + { + "epoch": 0.7660850899884418, + "grad_norm": 0.747653603553772, + "learning_rate": 6.821035516480457e-06, + "loss": 0.693, + "step": 13919 + }, + { + "epoch": 0.7661401287907975, + "grad_norm": 0.6450137495994568, + "learning_rate": 6.8206318143821795e-06, + "loss": 0.6492, + "step": 13920 + }, + { + "epoch": 0.7661951675931532, + "grad_norm": 0.707801878452301, + "learning_rate": 6.8202280986005205e-06, + "loss": 0.7284, + "step": 13921 + }, + { + "epoch": 0.7662502063955088, + "grad_norm": 0.7191962003707886, + "learning_rate": 6.8198243691385146e-06, + "loss": 0.7714, + "step": 13922 + }, + { + "epoch": 0.7663052451978645, + "grad_norm": 0.7477172613143921, + "learning_rate": 6.819420625999196e-06, + "loss": 0.7076, + "step": 13923 + }, + { + "epoch": 0.7663602840002202, + "grad_norm": 0.6221175193786621, + "learning_rate": 6.819016869185599e-06, + "loss": 0.6848, + "step": 13924 + }, + { + "epoch": 0.7664153228025758, + "grad_norm": 0.7840436697006226, + "learning_rate": 6.818613098700758e-06, + "loss": 0.7028, + "step": 13925 + }, + { + "epoch": 0.7664703616049314, + "grad_norm": 0.7147907018661499, + "learning_rate": 6.818209314547707e-06, + "loss": 0.7242, + "step": 13926 + }, + { + "epoch": 0.7665254004072871, + "grad_norm": 0.6627985835075378, + "learning_rate": 6.817805516729482e-06, + "loss": 0.7177, + "step": 13927 + }, + { + "epoch": 0.7665804392096428, + "grad_norm": 0.8019070625305176, + "learning_rate": 6.817401705249118e-06, + "loss": 0.6594, + "step": 13928 + }, + { + "epoch": 0.7666354780119985, + "grad_norm": 0.7127207517623901, + "learning_rate": 6.816997880109649e-06, + "loss": 0.8282, + "step": 13929 + }, + { + "epoch": 0.7666905168143541, + "grad_norm": 0.7335825562477112, + "learning_rate": 6.816594041314111e-06, + "loss": 0.7593, + "step": 13930 + }, + { + "epoch": 0.7667455556167098, + "grad_norm": 0.6878668069839478, + "learning_rate": 6.816190188865538e-06, + "loss": 0.7898, + "step": 13931 + }, + { + "epoch": 0.7668005944190655, + "grad_norm": 0.6441968679428101, + "learning_rate": 6.815786322766965e-06, + "loss": 0.6795, + "step": 13932 + }, + { + "epoch": 0.7668556332214211, + "grad_norm": 0.6503410339355469, + "learning_rate": 6.815382443021429e-06, + "loss": 0.753, + "step": 13933 + }, + { + "epoch": 0.7669106720237767, + "grad_norm": 0.6734908223152161, + "learning_rate": 6.8149785496319645e-06, + "loss": 0.7145, + "step": 13934 + }, + { + "epoch": 0.7669657108261324, + "grad_norm": 0.8363823890686035, + "learning_rate": 6.814574642601606e-06, + "loss": 0.8499, + "step": 13935 + }, + { + "epoch": 0.7670207496284881, + "grad_norm": 0.6986021995544434, + "learning_rate": 6.81417072193339e-06, + "loss": 0.7101, + "step": 13936 + }, + { + "epoch": 0.7670757884308438, + "grad_norm": 0.9656592011451721, + "learning_rate": 6.813766787630354e-06, + "loss": 0.7841, + "step": 13937 + }, + { + "epoch": 0.7671308272331994, + "grad_norm": 0.6830777525901794, + "learning_rate": 6.813362839695532e-06, + "loss": 0.7443, + "step": 13938 + }, + { + "epoch": 0.767185866035555, + "grad_norm": 0.6358513236045837, + "learning_rate": 6.812958878131959e-06, + "loss": 0.7017, + "step": 13939 + }, + { + "epoch": 0.7672409048379107, + "grad_norm": 0.9075862169265747, + "learning_rate": 6.812554902942673e-06, + "loss": 0.6991, + "step": 13940 + }, + { + "epoch": 0.7672959436402664, + "grad_norm": 0.7004347443580627, + "learning_rate": 6.812150914130709e-06, + "loss": 0.6519, + "step": 13941 + }, + { + "epoch": 0.767350982442622, + "grad_norm": 0.6648300886154175, + "learning_rate": 6.811746911699105e-06, + "loss": 0.7044, + "step": 13942 + }, + { + "epoch": 0.7674060212449777, + "grad_norm": 0.7050208449363708, + "learning_rate": 6.811342895650896e-06, + "loss": 0.78, + "step": 13943 + }, + { + "epoch": 0.7674610600473334, + "grad_norm": 0.6387132406234741, + "learning_rate": 6.810938865989119e-06, + "loss": 0.6062, + "step": 13944 + }, + { + "epoch": 0.7675160988496891, + "grad_norm": 0.6441114544868469, + "learning_rate": 6.81053482271681e-06, + "loss": 0.7252, + "step": 13945 + }, + { + "epoch": 0.7675711376520447, + "grad_norm": 0.7309751510620117, + "learning_rate": 6.810130765837006e-06, + "loss": 0.6407, + "step": 13946 + }, + { + "epoch": 0.7676261764544003, + "grad_norm": 0.7132161259651184, + "learning_rate": 6.809726695352742e-06, + "loss": 0.8341, + "step": 13947 + }, + { + "epoch": 0.767681215256756, + "grad_norm": 0.7214738726615906, + "learning_rate": 6.809322611267058e-06, + "loss": 0.8357, + "step": 13948 + }, + { + "epoch": 0.7677362540591117, + "grad_norm": 0.6410175561904907, + "learning_rate": 6.80891851358299e-06, + "loss": 0.6718, + "step": 13949 + }, + { + "epoch": 0.7677912928614673, + "grad_norm": 0.8888845443725586, + "learning_rate": 6.8085144023035745e-06, + "loss": 0.7823, + "step": 13950 + }, + { + "epoch": 0.767846331663823, + "grad_norm": 0.7327878475189209, + "learning_rate": 6.808110277431848e-06, + "loss": 0.7083, + "step": 13951 + }, + { + "epoch": 0.7679013704661787, + "grad_norm": 0.6871985793113708, + "learning_rate": 6.807706138970849e-06, + "loss": 0.7808, + "step": 13952 + }, + { + "epoch": 0.7679564092685344, + "grad_norm": 0.6939501762390137, + "learning_rate": 6.8073019869236134e-06, + "loss": 0.693, + "step": 13953 + }, + { + "epoch": 0.76801144807089, + "grad_norm": 0.7377064824104309, + "learning_rate": 6.8068978212931814e-06, + "loss": 0.9322, + "step": 13954 + }, + { + "epoch": 0.7680664868732456, + "grad_norm": 0.8165044188499451, + "learning_rate": 6.80649364208259e-06, + "loss": 0.6846, + "step": 13955 + }, + { + "epoch": 0.7681215256756013, + "grad_norm": 0.6774152517318726, + "learning_rate": 6.806089449294875e-06, + "loss": 0.8503, + "step": 13956 + }, + { + "epoch": 0.768176564477957, + "grad_norm": 0.7773441076278687, + "learning_rate": 6.805685242933074e-06, + "loss": 0.8775, + "step": 13957 + }, + { + "epoch": 0.7682316032803126, + "grad_norm": 0.6710473895072937, + "learning_rate": 6.805281023000227e-06, + "loss": 0.7831, + "step": 13958 + }, + { + "epoch": 0.7682866420826683, + "grad_norm": 0.6163424849510193, + "learning_rate": 6.80487678949937e-06, + "loss": 0.7309, + "step": 13959 + }, + { + "epoch": 0.768341680885024, + "grad_norm": 0.6851963400840759, + "learning_rate": 6.804472542433543e-06, + "loss": 0.6556, + "step": 13960 + }, + { + "epoch": 0.7683967196873795, + "grad_norm": 0.6881004571914673, + "learning_rate": 6.804068281805784e-06, + "loss": 0.7115, + "step": 13961 + }, + { + "epoch": 0.7684517584897352, + "grad_norm": 0.7372351884841919, + "learning_rate": 6.8036640076191304e-06, + "loss": 0.7869, + "step": 13962 + }, + { + "epoch": 0.7685067972920909, + "grad_norm": 0.7900989055633545, + "learning_rate": 6.8032597198766205e-06, + "loss": 0.7419, + "step": 13963 + }, + { + "epoch": 0.7685618360944466, + "grad_norm": 0.7245132327079773, + "learning_rate": 6.802855418581294e-06, + "loss": 0.8175, + "step": 13964 + }, + { + "epoch": 0.7686168748968022, + "grad_norm": 0.6681550741195679, + "learning_rate": 6.802451103736188e-06, + "loss": 0.773, + "step": 13965 + }, + { + "epoch": 0.7686719136991579, + "grad_norm": 0.6316970586776733, + "learning_rate": 6.802046775344343e-06, + "loss": 0.6597, + "step": 13966 + }, + { + "epoch": 0.7687269525015136, + "grad_norm": 0.7201604843139648, + "learning_rate": 6.801642433408796e-06, + "loss": 0.7205, + "step": 13967 + }, + { + "epoch": 0.7687819913038693, + "grad_norm": 0.6226171851158142, + "learning_rate": 6.801238077932587e-06, + "loss": 0.7271, + "step": 13968 + }, + { + "epoch": 0.7688370301062248, + "grad_norm": 0.833369255065918, + "learning_rate": 6.800833708918755e-06, + "loss": 0.7731, + "step": 13969 + }, + { + "epoch": 0.7688920689085805, + "grad_norm": 0.7280329465866089, + "learning_rate": 6.800429326370339e-06, + "loss": 0.7833, + "step": 13970 + }, + { + "epoch": 0.7689471077109362, + "grad_norm": 0.7581672072410583, + "learning_rate": 6.800024930290376e-06, + "loss": 0.8008, + "step": 13971 + }, + { + "epoch": 0.7690021465132919, + "grad_norm": 0.7931516170501709, + "learning_rate": 6.79962052068191e-06, + "loss": 0.8884, + "step": 13972 + }, + { + "epoch": 0.7690571853156475, + "grad_norm": 0.8455879092216492, + "learning_rate": 6.799216097547977e-06, + "loss": 0.8109, + "step": 13973 + }, + { + "epoch": 0.7691122241180032, + "grad_norm": 0.687336266040802, + "learning_rate": 6.798811660891618e-06, + "loss": 0.783, + "step": 13974 + }, + { + "epoch": 0.7691672629203589, + "grad_norm": 0.7661089897155762, + "learning_rate": 6.7984072107158696e-06, + "loss": 0.8448, + "step": 13975 + }, + { + "epoch": 0.7692223017227146, + "grad_norm": 0.6965043544769287, + "learning_rate": 6.798002747023776e-06, + "loss": 0.7421, + "step": 13976 + }, + { + "epoch": 0.7692773405250701, + "grad_norm": 0.7373656630516052, + "learning_rate": 6.797598269818375e-06, + "loss": 0.7093, + "step": 13977 + }, + { + "epoch": 0.7693323793274258, + "grad_norm": 0.6387331485748291, + "learning_rate": 6.7971937791027064e-06, + "loss": 0.7811, + "step": 13978 + }, + { + "epoch": 0.7693874181297815, + "grad_norm": 0.7566075325012207, + "learning_rate": 6.796789274879811e-06, + "loss": 0.8245, + "step": 13979 + }, + { + "epoch": 0.7694424569321372, + "grad_norm": 0.7035738229751587, + "learning_rate": 6.796384757152729e-06, + "loss": 0.7674, + "step": 13980 + }, + { + "epoch": 0.7694974957344928, + "grad_norm": 0.8265605568885803, + "learning_rate": 6.795980225924499e-06, + "loss": 0.7755, + "step": 13981 + }, + { + "epoch": 0.7695525345368485, + "grad_norm": 0.709454357624054, + "learning_rate": 6.7955756811981625e-06, + "loss": 0.8651, + "step": 13982 + }, + { + "epoch": 0.7696075733392042, + "grad_norm": 0.7075764536857605, + "learning_rate": 6.795171122976758e-06, + "loss": 0.7371, + "step": 13983 + }, + { + "epoch": 0.7696626121415598, + "grad_norm": 0.7027561664581299, + "learning_rate": 6.79476655126333e-06, + "loss": 0.7763, + "step": 13984 + }, + { + "epoch": 0.7697176509439154, + "grad_norm": 0.7922375202178955, + "learning_rate": 6.794361966060916e-06, + "loss": 0.7677, + "step": 13985 + }, + { + "epoch": 0.7697726897462711, + "grad_norm": 0.7185537219047546, + "learning_rate": 6.793957367372559e-06, + "loss": 0.7229, + "step": 13986 + }, + { + "epoch": 0.7698277285486268, + "grad_norm": 0.7173545956611633, + "learning_rate": 6.793552755201297e-06, + "loss": 0.7508, + "step": 13987 + }, + { + "epoch": 0.7698827673509825, + "grad_norm": 0.7743139863014221, + "learning_rate": 6.793148129550175e-06, + "loss": 0.7305, + "step": 13988 + }, + { + "epoch": 0.7699378061533381, + "grad_norm": 0.7992164492607117, + "learning_rate": 6.792743490422229e-06, + "loss": 0.7212, + "step": 13989 + }, + { + "epoch": 0.7699928449556938, + "grad_norm": 0.7437503337860107, + "learning_rate": 6.792338837820504e-06, + "loss": 0.6396, + "step": 13990 + }, + { + "epoch": 0.7700478837580494, + "grad_norm": 0.6908634305000305, + "learning_rate": 6.79193417174804e-06, + "loss": 0.7279, + "step": 13991 + }, + { + "epoch": 0.7701029225604051, + "grad_norm": 0.6894391775131226, + "learning_rate": 6.7915294922078805e-06, + "loss": 0.7615, + "step": 13992 + }, + { + "epoch": 0.7701579613627607, + "grad_norm": 0.7162172794342041, + "learning_rate": 6.791124799203062e-06, + "loss": 0.7404, + "step": 13993 + }, + { + "epoch": 0.7702130001651164, + "grad_norm": 0.6469258069992065, + "learning_rate": 6.79072009273663e-06, + "loss": 0.7035, + "step": 13994 + }, + { + "epoch": 0.7702680389674721, + "grad_norm": 0.6456457376480103, + "learning_rate": 6.790315372811625e-06, + "loss": 0.708, + "step": 13995 + }, + { + "epoch": 0.7703230777698278, + "grad_norm": 0.7880644798278809, + "learning_rate": 6.789910639431089e-06, + "loss": 0.7723, + "step": 13996 + }, + { + "epoch": 0.7703781165721834, + "grad_norm": 0.7847834229469299, + "learning_rate": 6.789505892598063e-06, + "loss": 0.8585, + "step": 13997 + }, + { + "epoch": 0.770433155374539, + "grad_norm": 0.6909215450286865, + "learning_rate": 6.789101132315591e-06, + "loss": 0.7107, + "step": 13998 + }, + { + "epoch": 0.7704881941768947, + "grad_norm": 0.7883939146995544, + "learning_rate": 6.788696358586713e-06, + "loss": 0.7575, + "step": 13999 + }, + { + "epoch": 0.7705432329792504, + "grad_norm": 0.6629998087882996, + "learning_rate": 6.788291571414472e-06, + "loss": 0.7273, + "step": 14000 + }, + { + "epoch": 0.770598271781606, + "grad_norm": 0.7548647522926331, + "learning_rate": 6.7878867708019106e-06, + "loss": 0.8214, + "step": 14001 + }, + { + "epoch": 0.7706533105839617, + "grad_norm": 0.6721330881118774, + "learning_rate": 6.78748195675207e-06, + "loss": 0.7153, + "step": 14002 + }, + { + "epoch": 0.7707083493863174, + "grad_norm": 0.6921262145042419, + "learning_rate": 6.787077129267994e-06, + "loss": 0.7099, + "step": 14003 + }, + { + "epoch": 0.770763388188673, + "grad_norm": 0.956937849521637, + "learning_rate": 6.786672288352725e-06, + "loss": 0.6765, + "step": 14004 + }, + { + "epoch": 0.7708184269910286, + "grad_norm": 0.7265778183937073, + "learning_rate": 6.786267434009306e-06, + "loss": 0.7653, + "step": 14005 + }, + { + "epoch": 0.7708734657933843, + "grad_norm": 0.7429845929145813, + "learning_rate": 6.785862566240778e-06, + "loss": 0.8064, + "step": 14006 + }, + { + "epoch": 0.77092850459574, + "grad_norm": 0.7437632083892822, + "learning_rate": 6.785457685050184e-06, + "loss": 0.7138, + "step": 14007 + }, + { + "epoch": 0.7709835433980956, + "grad_norm": 0.7218232750892639, + "learning_rate": 6.7850527904405695e-06, + "loss": 0.7785, + "step": 14008 + }, + { + "epoch": 0.7710385822004513, + "grad_norm": 0.7131973505020142, + "learning_rate": 6.784647882414977e-06, + "loss": 0.7651, + "step": 14009 + }, + { + "epoch": 0.771093621002807, + "grad_norm": 0.739919126033783, + "learning_rate": 6.784242960976447e-06, + "loss": 0.7993, + "step": 14010 + }, + { + "epoch": 0.7711486598051627, + "grad_norm": 0.6655608415603638, + "learning_rate": 6.783838026128025e-06, + "loss": 0.7394, + "step": 14011 + }, + { + "epoch": 0.7712036986075183, + "grad_norm": 0.9327310919761658, + "learning_rate": 6.783433077872753e-06, + "loss": 0.8737, + "step": 14012 + }, + { + "epoch": 0.7712587374098739, + "grad_norm": 0.5928294062614441, + "learning_rate": 6.783028116213677e-06, + "loss": 0.5819, + "step": 14013 + }, + { + "epoch": 0.7713137762122296, + "grad_norm": 0.6752136945724487, + "learning_rate": 6.782623141153838e-06, + "loss": 0.8021, + "step": 14014 + }, + { + "epoch": 0.7713688150145853, + "grad_norm": 0.6452222466468811, + "learning_rate": 6.78221815269628e-06, + "loss": 0.7806, + "step": 14015 + }, + { + "epoch": 0.7714238538169409, + "grad_norm": 0.7725237607955933, + "learning_rate": 6.78181315084405e-06, + "loss": 0.7679, + "step": 14016 + }, + { + "epoch": 0.7714788926192966, + "grad_norm": 0.6594743728637695, + "learning_rate": 6.781408135600187e-06, + "loss": 0.7254, + "step": 14017 + }, + { + "epoch": 0.7715339314216523, + "grad_norm": 0.7008917927742004, + "learning_rate": 6.7810031069677385e-06, + "loss": 0.705, + "step": 14018 + }, + { + "epoch": 0.771588970224008, + "grad_norm": 0.9435684084892273, + "learning_rate": 6.780598064949746e-06, + "loss": 0.7787, + "step": 14019 + }, + { + "epoch": 0.7716440090263635, + "grad_norm": 0.6615981459617615, + "learning_rate": 6.780193009549256e-06, + "loss": 0.7592, + "step": 14020 + }, + { + "epoch": 0.7716990478287192, + "grad_norm": 0.7042600512504578, + "learning_rate": 6.7797879407693115e-06, + "loss": 0.719, + "step": 14021 + }, + { + "epoch": 0.7717540866310749, + "grad_norm": 0.7135425209999084, + "learning_rate": 6.779382858612957e-06, + "loss": 0.739, + "step": 14022 + }, + { + "epoch": 0.7718091254334306, + "grad_norm": 0.6546016931533813, + "learning_rate": 6.778977763083238e-06, + "loss": 0.7039, + "step": 14023 + }, + { + "epoch": 0.7718641642357862, + "grad_norm": 0.8549250960350037, + "learning_rate": 6.778572654183198e-06, + "loss": 0.8384, + "step": 14024 + }, + { + "epoch": 0.7719192030381419, + "grad_norm": 0.7008731365203857, + "learning_rate": 6.778167531915882e-06, + "loss": 0.776, + "step": 14025 + }, + { + "epoch": 0.7719742418404976, + "grad_norm": 0.7047393321990967, + "learning_rate": 6.7777623962843355e-06, + "loss": 0.819, + "step": 14026 + }, + { + "epoch": 0.7720292806428533, + "grad_norm": 0.7015580534934998, + "learning_rate": 6.777357247291601e-06, + "loss": 0.8339, + "step": 14027 + }, + { + "epoch": 0.7720843194452088, + "grad_norm": 0.7008551955223083, + "learning_rate": 6.776952084940727e-06, + "loss": 0.783, + "step": 14028 + }, + { + "epoch": 0.7721393582475645, + "grad_norm": 1.0310637950897217, + "learning_rate": 6.776546909234757e-06, + "loss": 0.7447, + "step": 14029 + }, + { + "epoch": 0.7721943970499202, + "grad_norm": 0.6264338493347168, + "learning_rate": 6.776141720176734e-06, + "loss": 0.5542, + "step": 14030 + }, + { + "epoch": 0.7722494358522759, + "grad_norm": 0.6249508261680603, + "learning_rate": 6.775736517769707e-06, + "loss": 0.6514, + "step": 14031 + }, + { + "epoch": 0.7723044746546315, + "grad_norm": 0.6741732954978943, + "learning_rate": 6.775331302016719e-06, + "loss": 0.6967, + "step": 14032 + }, + { + "epoch": 0.7723595134569872, + "grad_norm": 0.7342913746833801, + "learning_rate": 6.774926072920815e-06, + "loss": 0.8279, + "step": 14033 + }, + { + "epoch": 0.7724145522593429, + "grad_norm": 0.7702916264533997, + "learning_rate": 6.774520830485044e-06, + "loss": 0.8539, + "step": 14034 + }, + { + "epoch": 0.7724695910616985, + "grad_norm": 0.7873550057411194, + "learning_rate": 6.774115574712448e-06, + "loss": 0.6999, + "step": 14035 + }, + { + "epoch": 0.7725246298640541, + "grad_norm": 0.6832353472709656, + "learning_rate": 6.773710305606074e-06, + "loss": 0.7246, + "step": 14036 + }, + { + "epoch": 0.7725796686664098, + "grad_norm": 0.7547367215156555, + "learning_rate": 6.773305023168969e-06, + "loss": 0.7357, + "step": 14037 + }, + { + "epoch": 0.7726347074687655, + "grad_norm": 0.7146826386451721, + "learning_rate": 6.772899727404178e-06, + "loss": 0.6742, + "step": 14038 + }, + { + "epoch": 0.7726897462711212, + "grad_norm": 0.7623558640480042, + "learning_rate": 6.772494418314748e-06, + "loss": 0.7729, + "step": 14039 + }, + { + "epoch": 0.7727447850734768, + "grad_norm": 0.637706458568573, + "learning_rate": 6.772089095903723e-06, + "loss": 0.6662, + "step": 14040 + }, + { + "epoch": 0.7727998238758325, + "grad_norm": 0.7293589115142822, + "learning_rate": 6.771683760174151e-06, + "loss": 0.7899, + "step": 14041 + }, + { + "epoch": 0.7728548626781881, + "grad_norm": 0.7191390991210938, + "learning_rate": 6.771278411129079e-06, + "loss": 0.6912, + "step": 14042 + }, + { + "epoch": 0.7729099014805438, + "grad_norm": 0.8264575004577637, + "learning_rate": 6.770873048771552e-06, + "loss": 0.7027, + "step": 14043 + }, + { + "epoch": 0.7729649402828994, + "grad_norm": 0.7490931749343872, + "learning_rate": 6.770467673104617e-06, + "loss": 0.6917, + "step": 14044 + }, + { + "epoch": 0.7730199790852551, + "grad_norm": 0.6901552081108093, + "learning_rate": 6.77006228413132e-06, + "loss": 0.8097, + "step": 14045 + }, + { + "epoch": 0.7730750178876108, + "grad_norm": 0.6340280175209045, + "learning_rate": 6.76965688185471e-06, + "loss": 0.6309, + "step": 14046 + }, + { + "epoch": 0.7731300566899664, + "grad_norm": 0.6807279586791992, + "learning_rate": 6.7692514662778315e-06, + "loss": 0.7744, + "step": 14047 + }, + { + "epoch": 0.7731850954923221, + "grad_norm": 1.2796865701675415, + "learning_rate": 6.7688460374037335e-06, + "loss": 0.7499, + "step": 14048 + }, + { + "epoch": 0.7732401342946778, + "grad_norm": 0.7059674263000488, + "learning_rate": 6.768440595235463e-06, + "loss": 0.8705, + "step": 14049 + }, + { + "epoch": 0.7732951730970334, + "grad_norm": 0.7626641392707825, + "learning_rate": 6.768035139776066e-06, + "loss": 0.8448, + "step": 14050 + }, + { + "epoch": 0.773350211899389, + "grad_norm": 0.6590229868888855, + "learning_rate": 6.767629671028588e-06, + "loss": 0.6796, + "step": 14051 + }, + { + "epoch": 0.7734052507017447, + "grad_norm": 0.6702030301094055, + "learning_rate": 6.767224188996081e-06, + "loss": 0.7087, + "step": 14052 + }, + { + "epoch": 0.7734602895041004, + "grad_norm": 0.670612096786499, + "learning_rate": 6.76681869368159e-06, + "loss": 0.7203, + "step": 14053 + }, + { + "epoch": 0.7735153283064561, + "grad_norm": 0.6892215013504028, + "learning_rate": 6.766413185088161e-06, + "loss": 0.6891, + "step": 14054 + }, + { + "epoch": 0.7735703671088117, + "grad_norm": 0.8354474902153015, + "learning_rate": 6.766007663218843e-06, + "loss": 0.7378, + "step": 14055 + }, + { + "epoch": 0.7736254059111674, + "grad_norm": 0.7633876204490662, + "learning_rate": 6.765602128076686e-06, + "loss": 0.6916, + "step": 14056 + }, + { + "epoch": 0.773680444713523, + "grad_norm": 0.7249060869216919, + "learning_rate": 6.765196579664736e-06, + "loss": 0.791, + "step": 14057 + }, + { + "epoch": 0.7737354835158787, + "grad_norm": 0.7033042311668396, + "learning_rate": 6.7647910179860395e-06, + "loss": 0.6799, + "step": 14058 + }, + { + "epoch": 0.7737905223182343, + "grad_norm": 0.7087684273719788, + "learning_rate": 6.7643854430436466e-06, + "loss": 0.6389, + "step": 14059 + }, + { + "epoch": 0.77384556112059, + "grad_norm": 0.6433978080749512, + "learning_rate": 6.763979854840606e-06, + "loss": 0.7214, + "step": 14060 + }, + { + "epoch": 0.7739005999229457, + "grad_norm": 0.7777101993560791, + "learning_rate": 6.763574253379964e-06, + "loss": 0.7458, + "step": 14061 + }, + { + "epoch": 0.7739556387253014, + "grad_norm": 0.7065346240997314, + "learning_rate": 6.763168638664771e-06, + "loss": 0.7663, + "step": 14062 + }, + { + "epoch": 0.774010677527657, + "grad_norm": 0.7136278748512268, + "learning_rate": 6.762763010698074e-06, + "loss": 0.667, + "step": 14063 + }, + { + "epoch": 0.7740657163300126, + "grad_norm": 0.6670508980751038, + "learning_rate": 6.762357369482921e-06, + "loss": 0.7462, + "step": 14064 + }, + { + "epoch": 0.7741207551323683, + "grad_norm": 0.6366799473762512, + "learning_rate": 6.7619517150223635e-06, + "loss": 0.7147, + "step": 14065 + }, + { + "epoch": 0.774175793934724, + "grad_norm": 0.5999431610107422, + "learning_rate": 6.761546047319447e-06, + "loss": 0.667, + "step": 14066 + }, + { + "epoch": 0.7742308327370796, + "grad_norm": 0.6751196980476379, + "learning_rate": 6.761140366377222e-06, + "loss": 0.7255, + "step": 14067 + }, + { + "epoch": 0.7742858715394353, + "grad_norm": 0.6786272525787354, + "learning_rate": 6.760734672198738e-06, + "loss": 0.7694, + "step": 14068 + }, + { + "epoch": 0.774340910341791, + "grad_norm": 0.6915947794914246, + "learning_rate": 6.760328964787044e-06, + "loss": 0.7955, + "step": 14069 + }, + { + "epoch": 0.7743959491441467, + "grad_norm": 0.7041972279548645, + "learning_rate": 6.759923244145188e-06, + "loss": 0.6542, + "step": 14070 + }, + { + "epoch": 0.7744509879465022, + "grad_norm": 0.6384761333465576, + "learning_rate": 6.759517510276221e-06, + "loss": 0.7384, + "step": 14071 + }, + { + "epoch": 0.7745060267488579, + "grad_norm": 0.7430800199508667, + "learning_rate": 6.759111763183189e-06, + "loss": 0.7587, + "step": 14072 + }, + { + "epoch": 0.7745610655512136, + "grad_norm": 0.6568213701248169, + "learning_rate": 6.758706002869146e-06, + "loss": 0.7118, + "step": 14073 + }, + { + "epoch": 0.7746161043535693, + "grad_norm": 0.8791618943214417, + "learning_rate": 6.75830022933714e-06, + "loss": 0.8049, + "step": 14074 + }, + { + "epoch": 0.7746711431559249, + "grad_norm": 0.6377304792404175, + "learning_rate": 6.75789444259022e-06, + "loss": 0.737, + "step": 14075 + }, + { + "epoch": 0.7747261819582806, + "grad_norm": 0.7253721356391907, + "learning_rate": 6.757488642631434e-06, + "loss": 0.8432, + "step": 14076 + }, + { + "epoch": 0.7747812207606363, + "grad_norm": 0.684626042842865, + "learning_rate": 6.757082829463835e-06, + "loss": 0.7845, + "step": 14077 + }, + { + "epoch": 0.774836259562992, + "grad_norm": 0.7737520337104797, + "learning_rate": 6.756677003090471e-06, + "loss": 0.8055, + "step": 14078 + }, + { + "epoch": 0.7748912983653475, + "grad_norm": 0.7294824719429016, + "learning_rate": 6.756271163514394e-06, + "loss": 0.7666, + "step": 14079 + }, + { + "epoch": 0.7749463371677032, + "grad_norm": 0.7728607654571533, + "learning_rate": 6.755865310738651e-06, + "loss": 0.7748, + "step": 14080 + }, + { + "epoch": 0.7750013759700589, + "grad_norm": 0.6738442778587341, + "learning_rate": 6.755459444766297e-06, + "loss": 0.6711, + "step": 14081 + }, + { + "epoch": 0.7750564147724146, + "grad_norm": 0.7041414976119995, + "learning_rate": 6.7550535656003794e-06, + "loss": 0.7126, + "step": 14082 + }, + { + "epoch": 0.7751114535747702, + "grad_norm": 1.0205422639846802, + "learning_rate": 6.754647673243948e-06, + "loss": 0.7394, + "step": 14083 + }, + { + "epoch": 0.7751664923771259, + "grad_norm": 0.6594380736351013, + "learning_rate": 6.754241767700054e-06, + "loss": 0.7599, + "step": 14084 + }, + { + "epoch": 0.7752215311794816, + "grad_norm": 0.6800520420074463, + "learning_rate": 6.753835848971749e-06, + "loss": 0.7579, + "step": 14085 + }, + { + "epoch": 0.7752765699818372, + "grad_norm": 0.7658087611198425, + "learning_rate": 6.7534299170620846e-06, + "loss": 0.7705, + "step": 14086 + }, + { + "epoch": 0.7753316087841928, + "grad_norm": 0.7242750525474548, + "learning_rate": 6.7530239719741084e-06, + "loss": 0.7683, + "step": 14087 + }, + { + "epoch": 0.7753866475865485, + "grad_norm": 0.6997398138046265, + "learning_rate": 6.752618013710874e-06, + "loss": 0.8023, + "step": 14088 + }, + { + "epoch": 0.7754416863889042, + "grad_norm": 0.7041590809822083, + "learning_rate": 6.752212042275431e-06, + "loss": 0.7013, + "step": 14089 + }, + { + "epoch": 0.7754967251912598, + "grad_norm": 0.7027721405029297, + "learning_rate": 6.751806057670832e-06, + "loss": 0.7678, + "step": 14090 + }, + { + "epoch": 0.7755517639936155, + "grad_norm": 0.714290201663971, + "learning_rate": 6.751400059900128e-06, + "loss": 0.6769, + "step": 14091 + }, + { + "epoch": 0.7756068027959712, + "grad_norm": 0.7385110855102539, + "learning_rate": 6.750994048966369e-06, + "loss": 0.6576, + "step": 14092 + }, + { + "epoch": 0.7756618415983269, + "grad_norm": 0.7665147185325623, + "learning_rate": 6.750588024872607e-06, + "loss": 0.8127, + "step": 14093 + }, + { + "epoch": 0.7757168804006824, + "grad_norm": 0.6774508953094482, + "learning_rate": 6.750181987621895e-06, + "loss": 0.8112, + "step": 14094 + }, + { + "epoch": 0.7757719192030381, + "grad_norm": 0.666394054889679, + "learning_rate": 6.749775937217285e-06, + "loss": 0.6444, + "step": 14095 + }, + { + "epoch": 0.7758269580053938, + "grad_norm": 0.6557022929191589, + "learning_rate": 6.749369873661825e-06, + "loss": 0.7613, + "step": 14096 + }, + { + "epoch": 0.7758819968077495, + "grad_norm": 0.7090621590614319, + "learning_rate": 6.74896379695857e-06, + "loss": 0.7229, + "step": 14097 + }, + { + "epoch": 0.7759370356101051, + "grad_norm": 0.8117626309394836, + "learning_rate": 6.7485577071105734e-06, + "loss": 0.8002, + "step": 14098 + }, + { + "epoch": 0.7759920744124608, + "grad_norm": 0.6743370294570923, + "learning_rate": 6.748151604120883e-06, + "loss": 0.7457, + "step": 14099 + }, + { + "epoch": 0.7760471132148165, + "grad_norm": 0.7637452483177185, + "learning_rate": 6.747745487992553e-06, + "loss": 0.7471, + "step": 14100 + }, + { + "epoch": 0.7761021520171721, + "grad_norm": 0.6732922196388245, + "learning_rate": 6.747339358728636e-06, + "loss": 0.7471, + "step": 14101 + }, + { + "epoch": 0.7761571908195277, + "grad_norm": 0.7510336637496948, + "learning_rate": 6.746933216332184e-06, + "loss": 0.7252, + "step": 14102 + }, + { + "epoch": 0.7762122296218834, + "grad_norm": 0.731719434261322, + "learning_rate": 6.746527060806251e-06, + "loss": 0.8706, + "step": 14103 + }, + { + "epoch": 0.7762672684242391, + "grad_norm": 0.7625692486763, + "learning_rate": 6.746120892153886e-06, + "loss": 0.7518, + "step": 14104 + }, + { + "epoch": 0.7763223072265948, + "grad_norm": 0.6809547543525696, + "learning_rate": 6.745714710378145e-06, + "loss": 0.7172, + "step": 14105 + }, + { + "epoch": 0.7763773460289504, + "grad_norm": 0.709996223449707, + "learning_rate": 6.745308515482079e-06, + "loss": 0.7925, + "step": 14106 + }, + { + "epoch": 0.776432384831306, + "grad_norm": 0.6675372123718262, + "learning_rate": 6.744902307468742e-06, + "loss": 0.8175, + "step": 14107 + }, + { + "epoch": 0.7764874236336617, + "grad_norm": 0.6978115439414978, + "learning_rate": 6.744496086341186e-06, + "loss": 0.7895, + "step": 14108 + }, + { + "epoch": 0.7765424624360174, + "grad_norm": 0.6593814492225647, + "learning_rate": 6.7440898521024634e-06, + "loss": 0.7791, + "step": 14109 + }, + { + "epoch": 0.776597501238373, + "grad_norm": 0.7169299721717834, + "learning_rate": 6.743683604755631e-06, + "loss": 0.7944, + "step": 14110 + }, + { + "epoch": 0.7766525400407287, + "grad_norm": 0.6805511713027954, + "learning_rate": 6.743277344303738e-06, + "loss": 0.7671, + "step": 14111 + }, + { + "epoch": 0.7767075788430844, + "grad_norm": 0.7300780415534973, + "learning_rate": 6.742871070749838e-06, + "loss": 0.7789, + "step": 14112 + }, + { + "epoch": 0.7767626176454401, + "grad_norm": 0.6475857496261597, + "learning_rate": 6.742464784096987e-06, + "loss": 0.6652, + "step": 14113 + }, + { + "epoch": 0.7768176564477957, + "grad_norm": 0.6941269040107727, + "learning_rate": 6.742058484348236e-06, + "loss": 0.8138, + "step": 14114 + }, + { + "epoch": 0.7768726952501513, + "grad_norm": 0.6175981760025024, + "learning_rate": 6.7416521715066405e-06, + "loss": 0.7667, + "step": 14115 + }, + { + "epoch": 0.776927734052507, + "grad_norm": 0.6499401330947876, + "learning_rate": 6.741245845575252e-06, + "loss": 0.7415, + "step": 14116 + }, + { + "epoch": 0.7769827728548627, + "grad_norm": 0.6601547598838806, + "learning_rate": 6.740839506557127e-06, + "loss": 0.732, + "step": 14117 + }, + { + "epoch": 0.7770378116572183, + "grad_norm": 0.7939042448997498, + "learning_rate": 6.740433154455319e-06, + "loss": 0.7043, + "step": 14118 + }, + { + "epoch": 0.777092850459574, + "grad_norm": 0.7381628751754761, + "learning_rate": 6.740026789272881e-06, + "loss": 0.8256, + "step": 14119 + }, + { + "epoch": 0.7771478892619297, + "grad_norm": 0.6131769418716431, + "learning_rate": 6.739620411012866e-06, + "loss": 0.726, + "step": 14120 + }, + { + "epoch": 0.7772029280642854, + "grad_norm": 1.201745867729187, + "learning_rate": 6.739214019678332e-06, + "loss": 0.7097, + "step": 14121 + }, + { + "epoch": 0.777257966866641, + "grad_norm": 0.6618456244468689, + "learning_rate": 6.7388076152723295e-06, + "loss": 0.6396, + "step": 14122 + }, + { + "epoch": 0.7773130056689966, + "grad_norm": 0.7490836977958679, + "learning_rate": 6.738401197797915e-06, + "loss": 0.6475, + "step": 14123 + }, + { + "epoch": 0.7773680444713523, + "grad_norm": 0.8125407099723816, + "learning_rate": 6.737994767258142e-06, + "loss": 0.7693, + "step": 14124 + }, + { + "epoch": 0.777423083273708, + "grad_norm": 0.7501794099807739, + "learning_rate": 6.737588323656065e-06, + "loss": 0.7333, + "step": 14125 + }, + { + "epoch": 0.7774781220760636, + "grad_norm": 1.3062889575958252, + "learning_rate": 6.73718186699474e-06, + "loss": 0.6909, + "step": 14126 + }, + { + "epoch": 0.7775331608784193, + "grad_norm": 0.6784525513648987, + "learning_rate": 6.736775397277221e-06, + "loss": 0.7256, + "step": 14127 + }, + { + "epoch": 0.777588199680775, + "grad_norm": 0.7018646597862244, + "learning_rate": 6.736368914506562e-06, + "loss": 0.7632, + "step": 14128 + }, + { + "epoch": 0.7776432384831307, + "grad_norm": 0.7596307992935181, + "learning_rate": 6.735962418685821e-06, + "loss": 0.7117, + "step": 14129 + }, + { + "epoch": 0.7776982772854862, + "grad_norm": 0.7582107186317444, + "learning_rate": 6.7355559098180504e-06, + "loss": 0.7808, + "step": 14130 + }, + { + "epoch": 0.7777533160878419, + "grad_norm": 0.6460647583007812, + "learning_rate": 6.7351493879063056e-06, + "loss": 0.675, + "step": 14131 + }, + { + "epoch": 0.7778083548901976, + "grad_norm": 0.6801304221153259, + "learning_rate": 6.7347428529536415e-06, + "loss": 0.6504, + "step": 14132 + }, + { + "epoch": 0.7778633936925532, + "grad_norm": 0.8122933506965637, + "learning_rate": 6.7343363049631176e-06, + "loss": 0.7949, + "step": 14133 + }, + { + "epoch": 0.7779184324949089, + "grad_norm": 0.6750267744064331, + "learning_rate": 6.733929743937784e-06, + "loss": 0.7689, + "step": 14134 + }, + { + "epoch": 0.7779734712972646, + "grad_norm": 0.7141891121864319, + "learning_rate": 6.7335231698807005e-06, + "loss": 0.7099, + "step": 14135 + }, + { + "epoch": 0.7780285100996203, + "grad_norm": 0.7904065251350403, + "learning_rate": 6.733116582794918e-06, + "loss": 0.8458, + "step": 14136 + }, + { + "epoch": 0.7780835489019758, + "grad_norm": 0.6905248165130615, + "learning_rate": 6.732709982683496e-06, + "loss": 0.7848, + "step": 14137 + }, + { + "epoch": 0.7781385877043315, + "grad_norm": 0.6707245707511902, + "learning_rate": 6.732303369549491e-06, + "loss": 0.8319, + "step": 14138 + }, + { + "epoch": 0.7781936265066872, + "grad_norm": 0.6611519455909729, + "learning_rate": 6.731896743395957e-06, + "loss": 0.7025, + "step": 14139 + }, + { + "epoch": 0.7782486653090429, + "grad_norm": 0.7113156914710999, + "learning_rate": 6.73149010422595e-06, + "loss": 0.8297, + "step": 14140 + }, + { + "epoch": 0.7783037041113985, + "grad_norm": 0.7279486060142517, + "learning_rate": 6.7310834520425265e-06, + "loss": 0.8134, + "step": 14141 + }, + { + "epoch": 0.7783587429137542, + "grad_norm": 0.7561796307563782, + "learning_rate": 6.730676786848744e-06, + "loss": 0.806, + "step": 14142 + }, + { + "epoch": 0.7784137817161099, + "grad_norm": 0.6724728345870972, + "learning_rate": 6.7302701086476585e-06, + "loss": 0.7782, + "step": 14143 + }, + { + "epoch": 0.7784688205184656, + "grad_norm": 0.6363211274147034, + "learning_rate": 6.729863417442325e-06, + "loss": 0.6298, + "step": 14144 + }, + { + "epoch": 0.7785238593208211, + "grad_norm": 0.6920950412750244, + "learning_rate": 6.729456713235803e-06, + "loss": 0.5804, + "step": 14145 + }, + { + "epoch": 0.7785788981231768, + "grad_norm": 0.7388806343078613, + "learning_rate": 6.729049996031145e-06, + "loss": 0.6594, + "step": 14146 + }, + { + "epoch": 0.7786339369255325, + "grad_norm": 0.7736972570419312, + "learning_rate": 6.728643265831412e-06, + "loss": 0.8244, + "step": 14147 + }, + { + "epoch": 0.7786889757278882, + "grad_norm": 0.6928302049636841, + "learning_rate": 6.728236522639658e-06, + "loss": 0.6713, + "step": 14148 + }, + { + "epoch": 0.7787440145302438, + "grad_norm": 0.8058464527130127, + "learning_rate": 6.72782976645894e-06, + "loss": 0.7647, + "step": 14149 + }, + { + "epoch": 0.7787990533325995, + "grad_norm": 0.7111127376556396, + "learning_rate": 6.727422997292317e-06, + "loss": 0.7629, + "step": 14150 + }, + { + "epoch": 0.7788540921349552, + "grad_norm": 0.9375373721122742, + "learning_rate": 6.7270162151428455e-06, + "loss": 0.8306, + "step": 14151 + }, + { + "epoch": 0.7789091309373108, + "grad_norm": 0.6894392371177673, + "learning_rate": 6.726609420013581e-06, + "loss": 0.6995, + "step": 14152 + }, + { + "epoch": 0.7789641697396664, + "grad_norm": 0.7058690786361694, + "learning_rate": 6.726202611907583e-06, + "loss": 0.844, + "step": 14153 + }, + { + "epoch": 0.7790192085420221, + "grad_norm": 0.7672932744026184, + "learning_rate": 6.725795790827909e-06, + "loss": 0.6613, + "step": 14154 + }, + { + "epoch": 0.7790742473443778, + "grad_norm": 0.8575173020362854, + "learning_rate": 6.7253889567776146e-06, + "loss": 0.6946, + "step": 14155 + }, + { + "epoch": 0.7791292861467335, + "grad_norm": 0.6832261085510254, + "learning_rate": 6.724982109759759e-06, + "loss": 0.7121, + "step": 14156 + }, + { + "epoch": 0.7791843249490891, + "grad_norm": 0.8188209533691406, + "learning_rate": 6.724575249777401e-06, + "loss": 0.6479, + "step": 14157 + }, + { + "epoch": 0.7792393637514448, + "grad_norm": 0.6514336466789246, + "learning_rate": 6.724168376833595e-06, + "loss": 0.6117, + "step": 14158 + }, + { + "epoch": 0.7792944025538004, + "grad_norm": 0.7283767461776733, + "learning_rate": 6.723761490931403e-06, + "loss": 0.6882, + "step": 14159 + }, + { + "epoch": 0.7793494413561561, + "grad_norm": 0.7681146860122681, + "learning_rate": 6.7233545920738785e-06, + "loss": 0.8028, + "step": 14160 + }, + { + "epoch": 0.7794044801585117, + "grad_norm": 0.6202995181083679, + "learning_rate": 6.722947680264084e-06, + "loss": 0.713, + "step": 14161 + }, + { + "epoch": 0.7794595189608674, + "grad_norm": 0.7137139439582825, + "learning_rate": 6.722540755505076e-06, + "loss": 0.7842, + "step": 14162 + }, + { + "epoch": 0.7795145577632231, + "grad_norm": 0.6852554678916931, + "learning_rate": 6.722133817799913e-06, + "loss": 0.7329, + "step": 14163 + }, + { + "epoch": 0.7795695965655788, + "grad_norm": 0.7520774602890015, + "learning_rate": 6.7217268671516525e-06, + "loss": 0.7498, + "step": 14164 + }, + { + "epoch": 0.7796246353679344, + "grad_norm": 0.708577573299408, + "learning_rate": 6.7213199035633525e-06, + "loss": 0.675, + "step": 14165 + }, + { + "epoch": 0.77967967417029, + "grad_norm": 0.8061410188674927, + "learning_rate": 6.7209129270380744e-06, + "loss": 0.7176, + "step": 14166 + }, + { + "epoch": 0.7797347129726457, + "grad_norm": 0.8070787787437439, + "learning_rate": 6.720505937578876e-06, + "loss": 0.8138, + "step": 14167 + }, + { + "epoch": 0.7797897517750014, + "grad_norm": 0.7127004265785217, + "learning_rate": 6.720098935188815e-06, + "loss": 0.7004, + "step": 14168 + }, + { + "epoch": 0.779844790577357, + "grad_norm": 0.7188708782196045, + "learning_rate": 6.719691919870951e-06, + "loss": 0.6996, + "step": 14169 + }, + { + "epoch": 0.7798998293797127, + "grad_norm": 0.6346360445022583, + "learning_rate": 6.719284891628342e-06, + "loss": 0.7349, + "step": 14170 + }, + { + "epoch": 0.7799548681820684, + "grad_norm": 0.6262187361717224, + "learning_rate": 6.71887785046405e-06, + "loss": 0.7279, + "step": 14171 + }, + { + "epoch": 0.7800099069844241, + "grad_norm": 0.7538053393363953, + "learning_rate": 6.718470796381129e-06, + "loss": 0.754, + "step": 14172 + }, + { + "epoch": 0.7800649457867797, + "grad_norm": 0.6569569706916809, + "learning_rate": 6.718063729382643e-06, + "loss": 0.6787, + "step": 14173 + }, + { + "epoch": 0.7801199845891353, + "grad_norm": 0.6446678042411804, + "learning_rate": 6.71765664947165e-06, + "loss": 0.6338, + "step": 14174 + }, + { + "epoch": 0.780175023391491, + "grad_norm": 0.7559269666671753, + "learning_rate": 6.7172495566512095e-06, + "loss": 0.7472, + "step": 14175 + }, + { + "epoch": 0.7802300621938466, + "grad_norm": 0.6920101642608643, + "learning_rate": 6.71684245092438e-06, + "loss": 0.7189, + "step": 14176 + }, + { + "epoch": 0.7802851009962023, + "grad_norm": 0.6513105034828186, + "learning_rate": 6.716435332294223e-06, + "loss": 0.6104, + "step": 14177 + }, + { + "epoch": 0.780340139798558, + "grad_norm": 0.7076418399810791, + "learning_rate": 6.716028200763798e-06, + "loss": 0.7974, + "step": 14178 + }, + { + "epoch": 0.7803951786009137, + "grad_norm": 0.7291662693023682, + "learning_rate": 6.715621056336164e-06, + "loss": 0.7661, + "step": 14179 + }, + { + "epoch": 0.7804502174032693, + "grad_norm": 0.682321310043335, + "learning_rate": 6.715213899014381e-06, + "loss": 0.7345, + "step": 14180 + }, + { + "epoch": 0.7805052562056249, + "grad_norm": 0.7170400619506836, + "learning_rate": 6.71480672880151e-06, + "loss": 0.6968, + "step": 14181 + }, + { + "epoch": 0.7805602950079806, + "grad_norm": 0.7504192590713501, + "learning_rate": 6.714399545700611e-06, + "loss": 0.7868, + "step": 14182 + }, + { + "epoch": 0.7806153338103363, + "grad_norm": 0.7334801554679871, + "learning_rate": 6.713992349714744e-06, + "loss": 0.8806, + "step": 14183 + }, + { + "epoch": 0.7806703726126919, + "grad_norm": 0.6495537161827087, + "learning_rate": 6.713585140846969e-06, + "loss": 0.7272, + "step": 14184 + }, + { + "epoch": 0.7807254114150476, + "grad_norm": 0.7101101279258728, + "learning_rate": 6.713177919100347e-06, + "loss": 0.8038, + "step": 14185 + }, + { + "epoch": 0.7807804502174033, + "grad_norm": 0.7013083100318909, + "learning_rate": 6.712770684477937e-06, + "loss": 0.7576, + "step": 14186 + }, + { + "epoch": 0.780835489019759, + "grad_norm": 0.7535369992256165, + "learning_rate": 6.712363436982802e-06, + "loss": 0.6537, + "step": 14187 + }, + { + "epoch": 0.7808905278221145, + "grad_norm": 0.7432667016983032, + "learning_rate": 6.711956176618001e-06, + "loss": 0.7734, + "step": 14188 + }, + { + "epoch": 0.7809455666244702, + "grad_norm": 0.718006432056427, + "learning_rate": 6.711548903386597e-06, + "loss": 0.7291, + "step": 14189 + }, + { + "epoch": 0.7810006054268259, + "grad_norm": 0.7983072400093079, + "learning_rate": 6.711141617291649e-06, + "loss": 0.8403, + "step": 14190 + }, + { + "epoch": 0.7810556442291816, + "grad_norm": 0.7017259001731873, + "learning_rate": 6.710734318336218e-06, + "loss": 0.7293, + "step": 14191 + }, + { + "epoch": 0.7811106830315372, + "grad_norm": 0.6061737537384033, + "learning_rate": 6.710327006523366e-06, + "loss": 0.6624, + "step": 14192 + }, + { + "epoch": 0.7811657218338929, + "grad_norm": 0.6876726746559143, + "learning_rate": 6.709919681856155e-06, + "loss": 0.723, + "step": 14193 + }, + { + "epoch": 0.7812207606362486, + "grad_norm": 0.6926757097244263, + "learning_rate": 6.709512344337646e-06, + "loss": 0.7392, + "step": 14194 + }, + { + "epoch": 0.7812757994386043, + "grad_norm": 0.6464381217956543, + "learning_rate": 6.7091049939708985e-06, + "loss": 0.7301, + "step": 14195 + }, + { + "epoch": 0.7813308382409598, + "grad_norm": 0.7292629480361938, + "learning_rate": 6.708697630758974e-06, + "loss": 0.7511, + "step": 14196 + }, + { + "epoch": 0.7813858770433155, + "grad_norm": 0.7483099102973938, + "learning_rate": 6.708290254704937e-06, + "loss": 0.7981, + "step": 14197 + }, + { + "epoch": 0.7814409158456712, + "grad_norm": 0.6766877770423889, + "learning_rate": 6.707882865811848e-06, + "loss": 0.7987, + "step": 14198 + }, + { + "epoch": 0.7814959546480269, + "grad_norm": 0.7340181469917297, + "learning_rate": 6.707475464082769e-06, + "loss": 0.799, + "step": 14199 + }, + { + "epoch": 0.7815509934503825, + "grad_norm": 0.6247759461402893, + "learning_rate": 6.707068049520759e-06, + "loss": 0.7299, + "step": 14200 + }, + { + "epoch": 0.7816060322527382, + "grad_norm": 0.6783067584037781, + "learning_rate": 6.706660622128885e-06, + "loss": 0.6987, + "step": 14201 + }, + { + "epoch": 0.7816610710550939, + "grad_norm": 0.7613719701766968, + "learning_rate": 6.706253181910205e-06, + "loss": 0.7894, + "step": 14202 + }, + { + "epoch": 0.7817161098574495, + "grad_norm": 0.6673761606216431, + "learning_rate": 6.705845728867784e-06, + "loss": 0.8015, + "step": 14203 + }, + { + "epoch": 0.7817711486598051, + "grad_norm": 0.6551307439804077, + "learning_rate": 6.705438263004683e-06, + "loss": 0.7057, + "step": 14204 + }, + { + "epoch": 0.7818261874621608, + "grad_norm": 0.6815405488014221, + "learning_rate": 6.705030784323965e-06, + "loss": 0.7466, + "step": 14205 + }, + { + "epoch": 0.7818812262645165, + "grad_norm": 0.6838087439537048, + "learning_rate": 6.704623292828692e-06, + "loss": 0.8226, + "step": 14206 + }, + { + "epoch": 0.7819362650668722, + "grad_norm": 0.6704637408256531, + "learning_rate": 6.704215788521925e-06, + "loss": 0.8101, + "step": 14207 + }, + { + "epoch": 0.7819913038692278, + "grad_norm": 0.6606172919273376, + "learning_rate": 6.70380827140673e-06, + "loss": 0.7824, + "step": 14208 + }, + { + "epoch": 0.7820463426715835, + "grad_norm": 0.6641090512275696, + "learning_rate": 6.703400741486166e-06, + "loss": 0.7507, + "step": 14209 + }, + { + "epoch": 0.7821013814739392, + "grad_norm": 1.6413429975509644, + "learning_rate": 6.702993198763299e-06, + "loss": 0.7793, + "step": 14210 + }, + { + "epoch": 0.7821564202762948, + "grad_norm": 0.6664854884147644, + "learning_rate": 6.7025856432411915e-06, + "loss": 0.7304, + "step": 14211 + }, + { + "epoch": 0.7822114590786504, + "grad_norm": 0.6968172192573547, + "learning_rate": 6.7021780749229075e-06, + "loss": 0.7506, + "step": 14212 + }, + { + "epoch": 0.7822664978810061, + "grad_norm": 0.6443943381309509, + "learning_rate": 6.701770493811506e-06, + "loss": 0.7511, + "step": 14213 + }, + { + "epoch": 0.7823215366833618, + "grad_norm": 0.67723548412323, + "learning_rate": 6.701362899910053e-06, + "loss": 0.6839, + "step": 14214 + }, + { + "epoch": 0.7823765754857175, + "grad_norm": 0.7601221203804016, + "learning_rate": 6.700955293221614e-06, + "loss": 0.7397, + "step": 14215 + }, + { + "epoch": 0.7824316142880731, + "grad_norm": 0.6056920289993286, + "learning_rate": 6.700547673749249e-06, + "loss": 0.7706, + "step": 14216 + }, + { + "epoch": 0.7824866530904288, + "grad_norm": 0.6421142816543579, + "learning_rate": 6.700140041496024e-06, + "loss": 0.7209, + "step": 14217 + }, + { + "epoch": 0.7825416918927844, + "grad_norm": 0.6653133034706116, + "learning_rate": 6.6997323964650005e-06, + "loss": 0.708, + "step": 14218 + }, + { + "epoch": 0.78259673069514, + "grad_norm": 0.8854939937591553, + "learning_rate": 6.699324738659243e-06, + "loss": 0.7658, + "step": 14219 + }, + { + "epoch": 0.7826517694974957, + "grad_norm": 0.7130745649337769, + "learning_rate": 6.6989170680818175e-06, + "loss": 0.7827, + "step": 14220 + }, + { + "epoch": 0.7827068082998514, + "grad_norm": 0.953117847442627, + "learning_rate": 6.698509384735783e-06, + "loss": 0.7852, + "step": 14221 + }, + { + "epoch": 0.7827618471022071, + "grad_norm": 0.655768871307373, + "learning_rate": 6.698101688624209e-06, + "loss": 0.8461, + "step": 14222 + }, + { + "epoch": 0.7828168859045627, + "grad_norm": 0.656775951385498, + "learning_rate": 6.6976939797501575e-06, + "loss": 0.7254, + "step": 14223 + }, + { + "epoch": 0.7828719247069184, + "grad_norm": 0.6901991963386536, + "learning_rate": 6.697286258116691e-06, + "loss": 0.7242, + "step": 14224 + }, + { + "epoch": 0.782926963509274, + "grad_norm": 0.8289571404457092, + "learning_rate": 6.696878523726875e-06, + "loss": 0.8578, + "step": 14225 + }, + { + "epoch": 0.7829820023116297, + "grad_norm": 0.6268846392631531, + "learning_rate": 6.696470776583775e-06, + "loss": 0.737, + "step": 14226 + }, + { + "epoch": 0.7830370411139853, + "grad_norm": 0.7026770114898682, + "learning_rate": 6.696063016690455e-06, + "loss": 0.6771, + "step": 14227 + }, + { + "epoch": 0.783092079916341, + "grad_norm": 0.7377839088439941, + "learning_rate": 6.69565524404998e-06, + "loss": 0.7174, + "step": 14228 + }, + { + "epoch": 0.7831471187186967, + "grad_norm": 0.6778523921966553, + "learning_rate": 6.695247458665414e-06, + "loss": 0.8255, + "step": 14229 + }, + { + "epoch": 0.7832021575210524, + "grad_norm": 0.7624330520629883, + "learning_rate": 6.69483966053982e-06, + "loss": 0.7495, + "step": 14230 + }, + { + "epoch": 0.783257196323408, + "grad_norm": 0.8944052457809448, + "learning_rate": 6.694431849676267e-06, + "loss": 0.868, + "step": 14231 + }, + { + "epoch": 0.7833122351257636, + "grad_norm": 0.7391701936721802, + "learning_rate": 6.694024026077816e-06, + "loss": 0.7032, + "step": 14232 + }, + { + "epoch": 0.7833672739281193, + "grad_norm": 0.7548620104789734, + "learning_rate": 6.693616189747535e-06, + "loss": 0.8272, + "step": 14233 + }, + { + "epoch": 0.783422312730475, + "grad_norm": 0.6795994639396667, + "learning_rate": 6.693208340688489e-06, + "loss": 0.703, + "step": 14234 + }, + { + "epoch": 0.7834773515328306, + "grad_norm": 0.6580816507339478, + "learning_rate": 6.69280047890374e-06, + "loss": 0.7454, + "step": 14235 + }, + { + "epoch": 0.7835323903351863, + "grad_norm": 0.7124443650245667, + "learning_rate": 6.6923926043963576e-06, + "loss": 0.6655, + "step": 14236 + }, + { + "epoch": 0.783587429137542, + "grad_norm": 0.6730241179466248, + "learning_rate": 6.691984717169404e-06, + "loss": 0.7522, + "step": 14237 + }, + { + "epoch": 0.7836424679398977, + "grad_norm": 0.8156033158302307, + "learning_rate": 6.6915768172259466e-06, + "loss": 0.8955, + "step": 14238 + }, + { + "epoch": 0.7836975067422532, + "grad_norm": 0.8041443228721619, + "learning_rate": 6.6911689045690506e-06, + "loss": 0.8019, + "step": 14239 + }, + { + "epoch": 0.7837525455446089, + "grad_norm": 0.7252053618431091, + "learning_rate": 6.690760979201782e-06, + "loss": 0.7014, + "step": 14240 + }, + { + "epoch": 0.7838075843469646, + "grad_norm": 0.6969071626663208, + "learning_rate": 6.690353041127208e-06, + "loss": 0.7304, + "step": 14241 + }, + { + "epoch": 0.7838626231493203, + "grad_norm": 0.8254885673522949, + "learning_rate": 6.6899450903483906e-06, + "loss": 0.7193, + "step": 14242 + }, + { + "epoch": 0.7839176619516759, + "grad_norm": 0.7426590323448181, + "learning_rate": 6.6895371268684e-06, + "loss": 0.697, + "step": 14243 + }, + { + "epoch": 0.7839727007540316, + "grad_norm": 0.6744338274002075, + "learning_rate": 6.6891291506903e-06, + "loss": 0.8363, + "step": 14244 + }, + { + "epoch": 0.7840277395563873, + "grad_norm": 0.6609839797019958, + "learning_rate": 6.688721161817156e-06, + "loss": 0.7756, + "step": 14245 + }, + { + "epoch": 0.784082778358743, + "grad_norm": 0.8377131223678589, + "learning_rate": 6.688313160252038e-06, + "loss": 0.8355, + "step": 14246 + }, + { + "epoch": 0.7841378171610985, + "grad_norm": 0.6922308802604675, + "learning_rate": 6.687905145998009e-06, + "loss": 0.756, + "step": 14247 + }, + { + "epoch": 0.7841928559634542, + "grad_norm": 0.7217739820480347, + "learning_rate": 6.687497119058137e-06, + "loss": 0.7309, + "step": 14248 + }, + { + "epoch": 0.7842478947658099, + "grad_norm": 0.6906038522720337, + "learning_rate": 6.687089079435488e-06, + "loss": 0.6645, + "step": 14249 + }, + { + "epoch": 0.7843029335681656, + "grad_norm": 0.6800183057785034, + "learning_rate": 6.6866810271331305e-06, + "loss": 0.6791, + "step": 14250 + }, + { + "epoch": 0.7843579723705212, + "grad_norm": 0.6835503578186035, + "learning_rate": 6.686272962154129e-06, + "loss": 0.699, + "step": 14251 + }, + { + "epoch": 0.7844130111728769, + "grad_norm": 0.6643723845481873, + "learning_rate": 6.685864884501552e-06, + "loss": 0.7808, + "step": 14252 + }, + { + "epoch": 0.7844680499752326, + "grad_norm": 0.6742954850196838, + "learning_rate": 6.685456794178464e-06, + "loss": 0.7704, + "step": 14253 + }, + { + "epoch": 0.7845230887775883, + "grad_norm": 0.6374711990356445, + "learning_rate": 6.6850486911879355e-06, + "loss": 0.7557, + "step": 14254 + }, + { + "epoch": 0.7845781275799438, + "grad_norm": 0.7354347109794617, + "learning_rate": 6.684640575533031e-06, + "loss": 0.7928, + "step": 14255 + }, + { + "epoch": 0.7846331663822995, + "grad_norm": 0.6694937348365784, + "learning_rate": 6.684232447216821e-06, + "loss": 0.7247, + "step": 14256 + }, + { + "epoch": 0.7846882051846552, + "grad_norm": 0.716623842716217, + "learning_rate": 6.683824306242368e-06, + "loss": 0.8638, + "step": 14257 + }, + { + "epoch": 0.7847432439870109, + "grad_norm": 0.667164146900177, + "learning_rate": 6.683416152612743e-06, + "loss": 0.7455, + "step": 14258 + }, + { + "epoch": 0.7847982827893665, + "grad_norm": 0.7302100658416748, + "learning_rate": 6.683007986331014e-06, + "loss": 0.707, + "step": 14259 + }, + { + "epoch": 0.7848533215917222, + "grad_norm": 0.7605045437812805, + "learning_rate": 6.682599807400246e-06, + "loss": 0.7727, + "step": 14260 + }, + { + "epoch": 0.7849083603940779, + "grad_norm": 0.6819437146186829, + "learning_rate": 6.682191615823508e-06, + "loss": 0.7538, + "step": 14261 + }, + { + "epoch": 0.7849633991964334, + "grad_norm": 0.7399439811706543, + "learning_rate": 6.6817834116038695e-06, + "loss": 0.7499, + "step": 14262 + }, + { + "epoch": 0.7850184379987891, + "grad_norm": 0.7864901423454285, + "learning_rate": 6.681375194744397e-06, + "loss": 0.7128, + "step": 14263 + }, + { + "epoch": 0.7850734768011448, + "grad_norm": 0.7308626174926758, + "learning_rate": 6.680966965248159e-06, + "loss": 0.7239, + "step": 14264 + }, + { + "epoch": 0.7851285156035005, + "grad_norm": 0.6553478837013245, + "learning_rate": 6.680558723118222e-06, + "loss": 0.6984, + "step": 14265 + }, + { + "epoch": 0.7851835544058561, + "grad_norm": 0.621415376663208, + "learning_rate": 6.680150468357656e-06, + "loss": 0.6428, + "step": 14266 + }, + { + "epoch": 0.7852385932082118, + "grad_norm": 1.0505764484405518, + "learning_rate": 6.679742200969529e-06, + "loss": 0.8073, + "step": 14267 + }, + { + "epoch": 0.7852936320105675, + "grad_norm": 0.7393355369567871, + "learning_rate": 6.67933392095691e-06, + "loss": 0.7396, + "step": 14268 + }, + { + "epoch": 0.7853486708129231, + "grad_norm": 0.7346563935279846, + "learning_rate": 6.678925628322864e-06, + "loss": 0.7398, + "step": 14269 + }, + { + "epoch": 0.7854037096152787, + "grad_norm": 0.6694674491882324, + "learning_rate": 6.678517323070465e-06, + "loss": 0.7346, + "step": 14270 + }, + { + "epoch": 0.7854587484176344, + "grad_norm": 0.6907033920288086, + "learning_rate": 6.678109005202779e-06, + "loss": 0.7617, + "step": 14271 + }, + { + "epoch": 0.7855137872199901, + "grad_norm": 0.6588131189346313, + "learning_rate": 6.677700674722873e-06, + "loss": 0.7514, + "step": 14272 + }, + { + "epoch": 0.7855688260223458, + "grad_norm": 0.6535136699676514, + "learning_rate": 6.677292331633819e-06, + "loss": 0.7154, + "step": 14273 + }, + { + "epoch": 0.7856238648247014, + "grad_norm": 0.7013682723045349, + "learning_rate": 6.676883975938685e-06, + "loss": 0.8506, + "step": 14274 + }, + { + "epoch": 0.7856789036270571, + "grad_norm": 0.7128416895866394, + "learning_rate": 6.67647560764054e-06, + "loss": 0.7669, + "step": 14275 + }, + { + "epoch": 0.7857339424294127, + "grad_norm": 0.7021318674087524, + "learning_rate": 6.676067226742453e-06, + "loss": 0.8236, + "step": 14276 + }, + { + "epoch": 0.7857889812317684, + "grad_norm": 0.7067561745643616, + "learning_rate": 6.675658833247493e-06, + "loss": 0.6848, + "step": 14277 + }, + { + "epoch": 0.785844020034124, + "grad_norm": 0.6488254070281982, + "learning_rate": 6.675250427158731e-06, + "loss": 0.7877, + "step": 14278 + }, + { + "epoch": 0.7858990588364797, + "grad_norm": 0.7153946757316589, + "learning_rate": 6.674842008479234e-06, + "loss": 0.7994, + "step": 14279 + }, + { + "epoch": 0.7859540976388354, + "grad_norm": 0.7290914058685303, + "learning_rate": 6.6744335772120735e-06, + "loss": 0.8074, + "step": 14280 + }, + { + "epoch": 0.7860091364411911, + "grad_norm": 0.726309061050415, + "learning_rate": 6.674025133360316e-06, + "loss": 0.7789, + "step": 14281 + }, + { + "epoch": 0.7860641752435467, + "grad_norm": 0.6294347047805786, + "learning_rate": 6.673616676927037e-06, + "loss": 0.6405, + "step": 14282 + }, + { + "epoch": 0.7861192140459023, + "grad_norm": 0.654400646686554, + "learning_rate": 6.673208207915302e-06, + "loss": 0.7876, + "step": 14283 + }, + { + "epoch": 0.786174252848258, + "grad_norm": 0.6729328632354736, + "learning_rate": 6.672799726328182e-06, + "loss": 0.7773, + "step": 14284 + }, + { + "epoch": 0.7862292916506137, + "grad_norm": 0.7607905268669128, + "learning_rate": 6.672391232168745e-06, + "loss": 0.8262, + "step": 14285 + }, + { + "epoch": 0.7862843304529693, + "grad_norm": 0.6475018858909607, + "learning_rate": 6.671982725440065e-06, + "loss": 0.7383, + "step": 14286 + }, + { + "epoch": 0.786339369255325, + "grad_norm": 0.8290789723396301, + "learning_rate": 6.671574206145211e-06, + "loss": 0.7968, + "step": 14287 + }, + { + "epoch": 0.7863944080576807, + "grad_norm": 0.7462177872657776, + "learning_rate": 6.671165674287252e-06, + "loss": 0.7465, + "step": 14288 + }, + { + "epoch": 0.7864494468600364, + "grad_norm": 0.7029373049736023, + "learning_rate": 6.6707571298692595e-06, + "loss": 0.7342, + "step": 14289 + }, + { + "epoch": 0.786504485662392, + "grad_norm": 0.8253761529922485, + "learning_rate": 6.670348572894303e-06, + "loss": 0.8196, + "step": 14290 + }, + { + "epoch": 0.7865595244647476, + "grad_norm": 0.7234970331192017, + "learning_rate": 6.669940003365455e-06, + "loss": 0.7966, + "step": 14291 + }, + { + "epoch": 0.7866145632671033, + "grad_norm": 0.8699348568916321, + "learning_rate": 6.6695314212857845e-06, + "loss": 0.8761, + "step": 14292 + }, + { + "epoch": 0.786669602069459, + "grad_norm": 0.6620158553123474, + "learning_rate": 6.66912282665836e-06, + "loss": 0.7534, + "step": 14293 + }, + { + "epoch": 0.7867246408718146, + "grad_norm": 0.6469776630401611, + "learning_rate": 6.668714219486259e-06, + "loss": 0.7812, + "step": 14294 + }, + { + "epoch": 0.7867796796741703, + "grad_norm": 0.6477407813072205, + "learning_rate": 6.668305599772546e-06, + "loss": 0.7144, + "step": 14295 + }, + { + "epoch": 0.786834718476526, + "grad_norm": 0.6626473665237427, + "learning_rate": 6.667896967520297e-06, + "loss": 0.7283, + "step": 14296 + }, + { + "epoch": 0.7868897572788817, + "grad_norm": 0.6214945316314697, + "learning_rate": 6.667488322732578e-06, + "loss": 0.6835, + "step": 14297 + }, + { + "epoch": 0.7869447960812372, + "grad_norm": 0.6199555397033691, + "learning_rate": 6.667079665412465e-06, + "loss": 0.706, + "step": 14298 + }, + { + "epoch": 0.7869998348835929, + "grad_norm": 0.8127612471580505, + "learning_rate": 6.666670995563027e-06, + "loss": 0.7099, + "step": 14299 + }, + { + "epoch": 0.7870548736859486, + "grad_norm": 0.6241362690925598, + "learning_rate": 6.6662623131873374e-06, + "loss": 0.7076, + "step": 14300 + }, + { + "epoch": 0.7871099124883043, + "grad_norm": 0.7260692715644836, + "learning_rate": 6.665853618288465e-06, + "loss": 0.7842, + "step": 14301 + }, + { + "epoch": 0.7871649512906599, + "grad_norm": 0.6644107103347778, + "learning_rate": 6.665444910869482e-06, + "loss": 0.6515, + "step": 14302 + }, + { + "epoch": 0.7872199900930156, + "grad_norm": 0.6629641056060791, + "learning_rate": 6.6650361909334616e-06, + "loss": 0.7062, + "step": 14303 + }, + { + "epoch": 0.7872750288953713, + "grad_norm": 0.6616516709327698, + "learning_rate": 6.6646274584834745e-06, + "loss": 0.8195, + "step": 14304 + }, + { + "epoch": 0.7873300676977268, + "grad_norm": 0.7184805870056152, + "learning_rate": 6.664218713522593e-06, + "loss": 0.8699, + "step": 14305 + }, + { + "epoch": 0.7873851065000825, + "grad_norm": 0.6567219495773315, + "learning_rate": 6.6638099560538905e-06, + "loss": 0.7679, + "step": 14306 + }, + { + "epoch": 0.7874401453024382, + "grad_norm": 0.6952399611473083, + "learning_rate": 6.663401186080436e-06, + "loss": 0.603, + "step": 14307 + }, + { + "epoch": 0.7874951841047939, + "grad_norm": 0.7298767566680908, + "learning_rate": 6.662992403605304e-06, + "loss": 0.7655, + "step": 14308 + }, + { + "epoch": 0.7875502229071495, + "grad_norm": 0.7162219882011414, + "learning_rate": 6.662583608631567e-06, + "loss": 0.7797, + "step": 14309 + }, + { + "epoch": 0.7876052617095052, + "grad_norm": 0.6489827036857605, + "learning_rate": 6.662174801162296e-06, + "loss": 0.8165, + "step": 14310 + }, + { + "epoch": 0.7876603005118609, + "grad_norm": 0.7893611192703247, + "learning_rate": 6.6617659812005635e-06, + "loss": 0.8082, + "step": 14311 + }, + { + "epoch": 0.7877153393142166, + "grad_norm": 0.6709675192832947, + "learning_rate": 6.661357148749443e-06, + "loss": 0.7549, + "step": 14312 + }, + { + "epoch": 0.7877703781165721, + "grad_norm": 0.6166689991950989, + "learning_rate": 6.660948303812009e-06, + "loss": 0.7116, + "step": 14313 + }, + { + "epoch": 0.7878254169189278, + "grad_norm": 0.7941738367080688, + "learning_rate": 6.660539446391329e-06, + "loss": 0.7981, + "step": 14314 + }, + { + "epoch": 0.7878804557212835, + "grad_norm": 0.6339346170425415, + "learning_rate": 6.660130576490481e-06, + "loss": 0.7306, + "step": 14315 + }, + { + "epoch": 0.7879354945236392, + "grad_norm": 0.7044192552566528, + "learning_rate": 6.659721694112535e-06, + "loss": 0.7811, + "step": 14316 + }, + { + "epoch": 0.7879905333259948, + "grad_norm": 0.7853406071662903, + "learning_rate": 6.659312799260565e-06, + "loss": 0.7652, + "step": 14317 + }, + { + "epoch": 0.7880455721283505, + "grad_norm": 0.7076637148857117, + "learning_rate": 6.658903891937645e-06, + "loss": 0.7672, + "step": 14318 + }, + { + "epoch": 0.7881006109307062, + "grad_norm": 0.7043278813362122, + "learning_rate": 6.658494972146847e-06, + "loss": 0.726, + "step": 14319 + }, + { + "epoch": 0.7881556497330618, + "grad_norm": 0.8903809785842896, + "learning_rate": 6.658086039891245e-06, + "loss": 0.8, + "step": 14320 + }, + { + "epoch": 0.7882106885354174, + "grad_norm": 0.8239984512329102, + "learning_rate": 6.657677095173911e-06, + "loss": 0.7283, + "step": 14321 + }, + { + "epoch": 0.7882657273377731, + "grad_norm": 0.7221176028251648, + "learning_rate": 6.6572681379979206e-06, + "loss": 0.8058, + "step": 14322 + }, + { + "epoch": 0.7883207661401288, + "grad_norm": 0.8297285437583923, + "learning_rate": 6.6568591683663475e-06, + "loss": 0.8064, + "step": 14323 + }, + { + "epoch": 0.7883758049424845, + "grad_norm": 0.680659294128418, + "learning_rate": 6.656450186282264e-06, + "loss": 0.7259, + "step": 14324 + }, + { + "epoch": 0.7884308437448401, + "grad_norm": 0.7067807912826538, + "learning_rate": 6.656041191748744e-06, + "loss": 0.8414, + "step": 14325 + }, + { + "epoch": 0.7884858825471958, + "grad_norm": 0.6053900718688965, + "learning_rate": 6.655632184768861e-06, + "loss": 0.6762, + "step": 14326 + }, + { + "epoch": 0.7885409213495514, + "grad_norm": 0.7123621106147766, + "learning_rate": 6.65522316534569e-06, + "loss": 0.6968, + "step": 14327 + }, + { + "epoch": 0.7885959601519071, + "grad_norm": 0.7308228015899658, + "learning_rate": 6.6548141334823045e-06, + "loss": 0.6715, + "step": 14328 + }, + { + "epoch": 0.7886509989542627, + "grad_norm": 0.7508199214935303, + "learning_rate": 6.654405089181779e-06, + "loss": 0.7884, + "step": 14329 + }, + { + "epoch": 0.7887060377566184, + "grad_norm": 0.7317141890525818, + "learning_rate": 6.653996032447188e-06, + "loss": 0.7319, + "step": 14330 + }, + { + "epoch": 0.7887610765589741, + "grad_norm": 0.6797091364860535, + "learning_rate": 6.653586963281607e-06, + "loss": 0.7898, + "step": 14331 + }, + { + "epoch": 0.7888161153613298, + "grad_norm": 0.6293582320213318, + "learning_rate": 6.6531778816881065e-06, + "loss": 0.6784, + "step": 14332 + }, + { + "epoch": 0.7888711541636854, + "grad_norm": 0.7604238986968994, + "learning_rate": 6.652768787669763e-06, + "loss": 0.7226, + "step": 14333 + }, + { + "epoch": 0.788926192966041, + "grad_norm": 0.6921128034591675, + "learning_rate": 6.652359681229654e-06, + "loss": 0.7375, + "step": 14334 + }, + { + "epoch": 0.7889812317683967, + "grad_norm": 0.6532993316650391, + "learning_rate": 6.651950562370851e-06, + "loss": 0.703, + "step": 14335 + }, + { + "epoch": 0.7890362705707524, + "grad_norm": 0.6739360094070435, + "learning_rate": 6.651541431096431e-06, + "loss": 0.7488, + "step": 14336 + }, + { + "epoch": 0.789091309373108, + "grad_norm": 0.7503200173377991, + "learning_rate": 6.651132287409466e-06, + "loss": 0.7492, + "step": 14337 + }, + { + "epoch": 0.7891463481754637, + "grad_norm": 0.6537551879882812, + "learning_rate": 6.650723131313035e-06, + "loss": 0.723, + "step": 14338 + }, + { + "epoch": 0.7892013869778194, + "grad_norm": 0.6378511786460876, + "learning_rate": 6.650313962810208e-06, + "loss": 0.7764, + "step": 14339 + }, + { + "epoch": 0.7892564257801751, + "grad_norm": 0.7948685884475708, + "learning_rate": 6.649904781904065e-06, + "loss": 0.7996, + "step": 14340 + }, + { + "epoch": 0.7893114645825307, + "grad_norm": 0.7558071613311768, + "learning_rate": 6.649495588597678e-06, + "loss": 0.8249, + "step": 14341 + }, + { + "epoch": 0.7893665033848863, + "grad_norm": 0.7158063054084778, + "learning_rate": 6.649086382894124e-06, + "loss": 0.815, + "step": 14342 + }, + { + "epoch": 0.789421542187242, + "grad_norm": 0.7551599144935608, + "learning_rate": 6.648677164796479e-06, + "loss": 0.7151, + "step": 14343 + }, + { + "epoch": 0.7894765809895977, + "grad_norm": 0.6966339349746704, + "learning_rate": 6.648267934307817e-06, + "loss": 0.8057, + "step": 14344 + }, + { + "epoch": 0.7895316197919533, + "grad_norm": 0.6863396167755127, + "learning_rate": 6.647858691431214e-06, + "loss": 0.7819, + "step": 14345 + }, + { + "epoch": 0.789586658594309, + "grad_norm": 0.7352383136749268, + "learning_rate": 6.647449436169747e-06, + "loss": 0.8101, + "step": 14346 + }, + { + "epoch": 0.7896416973966647, + "grad_norm": 0.7630855441093445, + "learning_rate": 6.64704016852649e-06, + "loss": 0.7155, + "step": 14347 + }, + { + "epoch": 0.7896967361990203, + "grad_norm": 0.6740198135375977, + "learning_rate": 6.646630888504522e-06, + "loss": 0.7255, + "step": 14348 + }, + { + "epoch": 0.7897517750013759, + "grad_norm": 0.7095367908477783, + "learning_rate": 6.646221596106917e-06, + "loss": 0.7527, + "step": 14349 + }, + { + "epoch": 0.7898068138037316, + "grad_norm": 0.6096131801605225, + "learning_rate": 6.645812291336749e-06, + "loss": 0.7116, + "step": 14350 + }, + { + "epoch": 0.7898618526060873, + "grad_norm": 0.7212585210800171, + "learning_rate": 6.645402974197097e-06, + "loss": 0.7647, + "step": 14351 + }, + { + "epoch": 0.7899168914084429, + "grad_norm": 0.7145454287528992, + "learning_rate": 6.6449936446910376e-06, + "loss": 0.7988, + "step": 14352 + }, + { + "epoch": 0.7899719302107986, + "grad_norm": 0.668269693851471, + "learning_rate": 6.644584302821646e-06, + "loss": 0.8453, + "step": 14353 + }, + { + "epoch": 0.7900269690131543, + "grad_norm": 0.7431649565696716, + "learning_rate": 6.644174948591998e-06, + "loss": 0.6981, + "step": 14354 + }, + { + "epoch": 0.79008200781551, + "grad_norm": 0.6727485060691833, + "learning_rate": 6.643765582005172e-06, + "loss": 0.792, + "step": 14355 + }, + { + "epoch": 0.7901370466178655, + "grad_norm": 0.7102059721946716, + "learning_rate": 6.643356203064244e-06, + "loss": 0.7469, + "step": 14356 + }, + { + "epoch": 0.7901920854202212, + "grad_norm": 0.6719706654548645, + "learning_rate": 6.642946811772291e-06, + "loss": 0.7542, + "step": 14357 + }, + { + "epoch": 0.7902471242225769, + "grad_norm": 0.7044880986213684, + "learning_rate": 6.6425374081323875e-06, + "loss": 0.7884, + "step": 14358 + }, + { + "epoch": 0.7903021630249326, + "grad_norm": 0.656411349773407, + "learning_rate": 6.642127992147614e-06, + "loss": 0.7596, + "step": 14359 + }, + { + "epoch": 0.7903572018272882, + "grad_norm": 0.6256445050239563, + "learning_rate": 6.641718563821047e-06, + "loss": 0.6257, + "step": 14360 + }, + { + "epoch": 0.7904122406296439, + "grad_norm": 0.6761715412139893, + "learning_rate": 6.641309123155761e-06, + "loss": 0.7024, + "step": 14361 + }, + { + "epoch": 0.7904672794319996, + "grad_norm": 0.7567794322967529, + "learning_rate": 6.640899670154837e-06, + "loss": 0.7948, + "step": 14362 + }, + { + "epoch": 0.7905223182343553, + "grad_norm": 0.6192977428436279, + "learning_rate": 6.640490204821349e-06, + "loss": 0.7307, + "step": 14363 + }, + { + "epoch": 0.7905773570367108, + "grad_norm": 0.8120929002761841, + "learning_rate": 6.640080727158376e-06, + "loss": 0.7173, + "step": 14364 + }, + { + "epoch": 0.7906323958390665, + "grad_norm": 0.7303271293640137, + "learning_rate": 6.639671237168996e-06, + "loss": 0.8118, + "step": 14365 + }, + { + "epoch": 0.7906874346414222, + "grad_norm": 0.6731529831886292, + "learning_rate": 6.639261734856284e-06, + "loss": 0.76, + "step": 14366 + }, + { + "epoch": 0.7907424734437779, + "grad_norm": 0.6909935474395752, + "learning_rate": 6.638852220223321e-06, + "loss": 0.7732, + "step": 14367 + }, + { + "epoch": 0.7907975122461335, + "grad_norm": 0.6543979048728943, + "learning_rate": 6.638442693273183e-06, + "loss": 0.7408, + "step": 14368 + }, + { + "epoch": 0.7908525510484892, + "grad_norm": 0.6411511301994324, + "learning_rate": 6.6380331540089485e-06, + "loss": 0.6963, + "step": 14369 + }, + { + "epoch": 0.7909075898508449, + "grad_norm": 0.6657214164733887, + "learning_rate": 6.637623602433694e-06, + "loss": 0.7417, + "step": 14370 + }, + { + "epoch": 0.7909626286532006, + "grad_norm": 0.6852405071258545, + "learning_rate": 6.6372140385505e-06, + "loss": 0.7176, + "step": 14371 + }, + { + "epoch": 0.7910176674555561, + "grad_norm": 0.6453777551651001, + "learning_rate": 6.636804462362444e-06, + "loss": 0.7791, + "step": 14372 + }, + { + "epoch": 0.7910727062579118, + "grad_norm": 0.6806328296661377, + "learning_rate": 6.636394873872603e-06, + "loss": 0.7856, + "step": 14373 + }, + { + "epoch": 0.7911277450602675, + "grad_norm": 0.6819495558738708, + "learning_rate": 6.635985273084058e-06, + "loss": 0.7865, + "step": 14374 + }, + { + "epoch": 0.7911827838626232, + "grad_norm": 0.7372999787330627, + "learning_rate": 6.635575659999883e-06, + "loss": 0.8549, + "step": 14375 + }, + { + "epoch": 0.7912378226649788, + "grad_norm": 0.8146817684173584, + "learning_rate": 6.635166034623162e-06, + "loss": 0.7253, + "step": 14376 + }, + { + "epoch": 0.7912928614673345, + "grad_norm": 0.8205630779266357, + "learning_rate": 6.634756396956969e-06, + "loss": 0.6915, + "step": 14377 + }, + { + "epoch": 0.7913479002696902, + "grad_norm": 0.7168713808059692, + "learning_rate": 6.634346747004383e-06, + "loss": 0.7495, + "step": 14378 + }, + { + "epoch": 0.7914029390720458, + "grad_norm": 0.7210709452629089, + "learning_rate": 6.6339370847684854e-06, + "loss": 0.7323, + "step": 14379 + }, + { + "epoch": 0.7914579778744014, + "grad_norm": 0.9042065143585205, + "learning_rate": 6.633527410252355e-06, + "loss": 0.847, + "step": 14380 + }, + { + "epoch": 0.7915130166767571, + "grad_norm": 0.6700118184089661, + "learning_rate": 6.633117723459071e-06, + "loss": 0.7975, + "step": 14381 + }, + { + "epoch": 0.7915680554791128, + "grad_norm": 0.6355725526809692, + "learning_rate": 6.632708024391707e-06, + "loss": 0.7398, + "step": 14382 + }, + { + "epoch": 0.7916230942814685, + "grad_norm": 0.8274535536766052, + "learning_rate": 6.6322983130533505e-06, + "loss": 0.8641, + "step": 14383 + }, + { + "epoch": 0.7916781330838241, + "grad_norm": 0.5835573077201843, + "learning_rate": 6.631888589447075e-06, + "loss": 0.636, + "step": 14384 + }, + { + "epoch": 0.7917331718861798, + "grad_norm": 0.6933130621910095, + "learning_rate": 6.631478853575963e-06, + "loss": 0.7874, + "step": 14385 + }, + { + "epoch": 0.7917882106885354, + "grad_norm": 0.8125241994857788, + "learning_rate": 6.631069105443092e-06, + "loss": 0.7961, + "step": 14386 + }, + { + "epoch": 0.7918432494908911, + "grad_norm": 0.6661116480827332, + "learning_rate": 6.630659345051542e-06, + "loss": 0.6498, + "step": 14387 + }, + { + "epoch": 0.7918982882932467, + "grad_norm": 0.6807548403739929, + "learning_rate": 6.630249572404393e-06, + "loss": 0.6952, + "step": 14388 + }, + { + "epoch": 0.7919533270956024, + "grad_norm": 0.6886214017868042, + "learning_rate": 6.629839787504726e-06, + "loss": 0.7416, + "step": 14389 + }, + { + "epoch": 0.7920083658979581, + "grad_norm": 0.7633732557296753, + "learning_rate": 6.629429990355617e-06, + "loss": 0.8008, + "step": 14390 + }, + { + "epoch": 0.7920634047003137, + "grad_norm": 0.8401023745536804, + "learning_rate": 6.6290201809601494e-06, + "loss": 0.8312, + "step": 14391 + }, + { + "epoch": 0.7921184435026694, + "grad_norm": 0.6608526706695557, + "learning_rate": 6.628610359321403e-06, + "loss": 0.563, + "step": 14392 + }, + { + "epoch": 0.792173482305025, + "grad_norm": 0.687045156955719, + "learning_rate": 6.6282005254424566e-06, + "loss": 0.7451, + "step": 14393 + }, + { + "epoch": 0.7922285211073807, + "grad_norm": 0.7129287123680115, + "learning_rate": 6.627790679326389e-06, + "loss": 0.8495, + "step": 14394 + }, + { + "epoch": 0.7922835599097363, + "grad_norm": 0.6951952576637268, + "learning_rate": 6.627380820976283e-06, + "loss": 0.7895, + "step": 14395 + }, + { + "epoch": 0.792338598712092, + "grad_norm": 0.8020780086517334, + "learning_rate": 6.626970950395221e-06, + "loss": 0.7136, + "step": 14396 + }, + { + "epoch": 0.7923936375144477, + "grad_norm": 0.6654007434844971, + "learning_rate": 6.626561067586279e-06, + "loss": 0.7865, + "step": 14397 + }, + { + "epoch": 0.7924486763168034, + "grad_norm": 0.844744861125946, + "learning_rate": 6.62615117255254e-06, + "loss": 0.7856, + "step": 14398 + }, + { + "epoch": 0.792503715119159, + "grad_norm": 0.6890879273414612, + "learning_rate": 6.625741265297083e-06, + "loss": 0.7574, + "step": 14399 + }, + { + "epoch": 0.7925587539215146, + "grad_norm": 0.7559735774993896, + "learning_rate": 6.625331345822992e-06, + "loss": 0.634, + "step": 14400 + }, + { + "epoch": 0.7926137927238703, + "grad_norm": 0.6918107867240906, + "learning_rate": 6.624921414133344e-06, + "loss": 0.6935, + "step": 14401 + }, + { + "epoch": 0.792668831526226, + "grad_norm": 0.7468792200088501, + "learning_rate": 6.624511470231221e-06, + "loss": 0.7301, + "step": 14402 + }, + { + "epoch": 0.7927238703285816, + "grad_norm": 0.6749486327171326, + "learning_rate": 6.624101514119705e-06, + "loss": 0.7143, + "step": 14403 + }, + { + "epoch": 0.7927789091309373, + "grad_norm": 0.7765836119651794, + "learning_rate": 6.623691545801878e-06, + "loss": 0.7201, + "step": 14404 + }, + { + "epoch": 0.792833947933293, + "grad_norm": 0.6263312697410583, + "learning_rate": 6.623281565280819e-06, + "loss": 0.5866, + "step": 14405 + }, + { + "epoch": 0.7928889867356487, + "grad_norm": 0.6325232982635498, + "learning_rate": 6.62287157255961e-06, + "loss": 0.7389, + "step": 14406 + }, + { + "epoch": 0.7929440255380042, + "grad_norm": 0.7165958881378174, + "learning_rate": 6.622461567641333e-06, + "loss": 0.7378, + "step": 14407 + }, + { + "epoch": 0.7929990643403599, + "grad_norm": 0.7611519694328308, + "learning_rate": 6.62205155052907e-06, + "loss": 0.7146, + "step": 14408 + }, + { + "epoch": 0.7930541031427156, + "grad_norm": 0.6764969825744629, + "learning_rate": 6.6216415212259e-06, + "loss": 0.7802, + "step": 14409 + }, + { + "epoch": 0.7931091419450713, + "grad_norm": 0.7266956567764282, + "learning_rate": 6.621231479734908e-06, + "loss": 0.7065, + "step": 14410 + }, + { + "epoch": 0.7931641807474269, + "grad_norm": 0.7540454268455505, + "learning_rate": 6.620821426059174e-06, + "loss": 0.7327, + "step": 14411 + }, + { + "epoch": 0.7932192195497826, + "grad_norm": 0.7931423783302307, + "learning_rate": 6.620411360201779e-06, + "loss": 0.8032, + "step": 14412 + }, + { + "epoch": 0.7932742583521383, + "grad_norm": 1.2976648807525635, + "learning_rate": 6.620001282165808e-06, + "loss": 0.7422, + "step": 14413 + }, + { + "epoch": 0.793329297154494, + "grad_norm": 0.6525906920433044, + "learning_rate": 6.619591191954338e-06, + "loss": 0.6857, + "step": 14414 + }, + { + "epoch": 0.7933843359568495, + "grad_norm": 0.6153263449668884, + "learning_rate": 6.619181089570456e-06, + "loss": 0.6117, + "step": 14415 + }, + { + "epoch": 0.7934393747592052, + "grad_norm": 0.7076815962791443, + "learning_rate": 6.6187709750172425e-06, + "loss": 0.8053, + "step": 14416 + }, + { + "epoch": 0.7934944135615609, + "grad_norm": 0.6999046802520752, + "learning_rate": 6.618360848297779e-06, + "loss": 0.6275, + "step": 14417 + }, + { + "epoch": 0.7935494523639166, + "grad_norm": 0.7043859958648682, + "learning_rate": 6.6179507094151484e-06, + "loss": 0.8273, + "step": 14418 + }, + { + "epoch": 0.7936044911662722, + "grad_norm": 0.6295393705368042, + "learning_rate": 6.617540558372434e-06, + "loss": 0.6394, + "step": 14419 + }, + { + "epoch": 0.7936595299686279, + "grad_norm": 0.8165664076805115, + "learning_rate": 6.617130395172718e-06, + "loss": 0.8473, + "step": 14420 + }, + { + "epoch": 0.7937145687709836, + "grad_norm": 0.7598135471343994, + "learning_rate": 6.616720219819082e-06, + "loss": 0.729, + "step": 14421 + }, + { + "epoch": 0.7937696075733393, + "grad_norm": 0.7222034335136414, + "learning_rate": 6.6163100323146105e-06, + "loss": 0.7526, + "step": 14422 + }, + { + "epoch": 0.7938246463756948, + "grad_norm": 0.7994693517684937, + "learning_rate": 6.615899832662385e-06, + "loss": 0.8346, + "step": 14423 + }, + { + "epoch": 0.7938796851780505, + "grad_norm": 0.6603162884712219, + "learning_rate": 6.615489620865489e-06, + "loss": 0.7546, + "step": 14424 + }, + { + "epoch": 0.7939347239804062, + "grad_norm": 0.6525929570198059, + "learning_rate": 6.615079396927005e-06, + "loss": 0.7344, + "step": 14425 + }, + { + "epoch": 0.7939897627827619, + "grad_norm": 0.6144835948944092, + "learning_rate": 6.614669160850016e-06, + "loss": 0.6776, + "step": 14426 + }, + { + "epoch": 0.7940448015851175, + "grad_norm": 0.7205507159233093, + "learning_rate": 6.614258912637607e-06, + "loss": 0.809, + "step": 14427 + }, + { + "epoch": 0.7940998403874732, + "grad_norm": 0.6757732629776001, + "learning_rate": 6.61384865229286e-06, + "loss": 0.7403, + "step": 14428 + }, + { + "epoch": 0.7941548791898289, + "grad_norm": 0.6392103433609009, + "learning_rate": 6.6134383798188586e-06, + "loss": 0.7689, + "step": 14429 + }, + { + "epoch": 0.7942099179921845, + "grad_norm": 0.6647289395332336, + "learning_rate": 6.613028095218685e-06, + "loss": 0.6611, + "step": 14430 + }, + { + "epoch": 0.7942649567945401, + "grad_norm": 0.6961668133735657, + "learning_rate": 6.612617798495426e-06, + "loss": 0.7784, + "step": 14431 + }, + { + "epoch": 0.7943199955968958, + "grad_norm": 1.1188037395477295, + "learning_rate": 6.6122074896521615e-06, + "loss": 0.6518, + "step": 14432 + }, + { + "epoch": 0.7943750343992515, + "grad_norm": 0.6382507085800171, + "learning_rate": 6.611797168691978e-06, + "loss": 0.6954, + "step": 14433 + }, + { + "epoch": 0.7944300732016071, + "grad_norm": 0.6720117330551147, + "learning_rate": 6.6113868356179585e-06, + "loss": 0.7267, + "step": 14434 + }, + { + "epoch": 0.7944851120039628, + "grad_norm": 0.6667274832725525, + "learning_rate": 6.610976490433186e-06, + "loss": 0.6867, + "step": 14435 + }, + { + "epoch": 0.7945401508063185, + "grad_norm": 0.658217191696167, + "learning_rate": 6.610566133140747e-06, + "loss": 0.66, + "step": 14436 + }, + { + "epoch": 0.7945951896086741, + "grad_norm": 0.6820386648178101, + "learning_rate": 6.610155763743723e-06, + "loss": 0.7352, + "step": 14437 + }, + { + "epoch": 0.7946502284110297, + "grad_norm": 0.788696825504303, + "learning_rate": 6.609745382245198e-06, + "loss": 0.6822, + "step": 14438 + }, + { + "epoch": 0.7947052672133854, + "grad_norm": 0.6485540270805359, + "learning_rate": 6.6093349886482596e-06, + "loss": 0.718, + "step": 14439 + }, + { + "epoch": 0.7947603060157411, + "grad_norm": 0.717659056186676, + "learning_rate": 6.60892458295599e-06, + "loss": 0.7898, + "step": 14440 + }, + { + "epoch": 0.7948153448180968, + "grad_norm": 0.6576352119445801, + "learning_rate": 6.608514165171473e-06, + "loss": 0.8041, + "step": 14441 + }, + { + "epoch": 0.7948703836204524, + "grad_norm": 0.7034726738929749, + "learning_rate": 6.608103735297795e-06, + "loss": 0.7901, + "step": 14442 + }, + { + "epoch": 0.7949254224228081, + "grad_norm": 0.7001451253890991, + "learning_rate": 6.6076932933380386e-06, + "loss": 0.6814, + "step": 14443 + }, + { + "epoch": 0.7949804612251637, + "grad_norm": 0.789359450340271, + "learning_rate": 6.607282839295291e-06, + "loss": 0.744, + "step": 14444 + }, + { + "epoch": 0.7950355000275194, + "grad_norm": 0.7830412983894348, + "learning_rate": 6.606872373172636e-06, + "loss": 0.8161, + "step": 14445 + }, + { + "epoch": 0.795090538829875, + "grad_norm": 0.6462455987930298, + "learning_rate": 6.606461894973157e-06, + "loss": 0.7723, + "step": 14446 + }, + { + "epoch": 0.7951455776322307, + "grad_norm": 0.6232526898384094, + "learning_rate": 6.606051404699943e-06, + "loss": 0.6723, + "step": 14447 + }, + { + "epoch": 0.7952006164345864, + "grad_norm": 0.7790026068687439, + "learning_rate": 6.605640902356074e-06, + "loss": 0.7687, + "step": 14448 + }, + { + "epoch": 0.7952556552369421, + "grad_norm": 0.7281851768493652, + "learning_rate": 6.605230387944639e-06, + "loss": 0.827, + "step": 14449 + }, + { + "epoch": 0.7953106940392977, + "grad_norm": 0.6519556045532227, + "learning_rate": 6.604819861468721e-06, + "loss": 0.7039, + "step": 14450 + }, + { + "epoch": 0.7953657328416534, + "grad_norm": 0.6768763661384583, + "learning_rate": 6.604409322931406e-06, + "loss": 0.7288, + "step": 14451 + }, + { + "epoch": 0.795420771644009, + "grad_norm": 0.7457320094108582, + "learning_rate": 6.6039987723357825e-06, + "loss": 0.8386, + "step": 14452 + }, + { + "epoch": 0.7954758104463647, + "grad_norm": 0.9579072594642639, + "learning_rate": 6.6035882096849325e-06, + "loss": 0.7552, + "step": 14453 + }, + { + "epoch": 0.7955308492487203, + "grad_norm": 0.6709916591644287, + "learning_rate": 6.603177634981941e-06, + "loss": 0.724, + "step": 14454 + }, + { + "epoch": 0.795585888051076, + "grad_norm": 0.6097317934036255, + "learning_rate": 6.602767048229897e-06, + "loss": 0.6866, + "step": 14455 + }, + { + "epoch": 0.7956409268534317, + "grad_norm": 0.7303394675254822, + "learning_rate": 6.602356449431885e-06, + "loss": 0.682, + "step": 14456 + }, + { + "epoch": 0.7956959656557874, + "grad_norm": 0.775979220867157, + "learning_rate": 6.601945838590991e-06, + "loss": 0.7784, + "step": 14457 + }, + { + "epoch": 0.795751004458143, + "grad_norm": 0.7016483545303345, + "learning_rate": 6.6015352157103e-06, + "loss": 0.7557, + "step": 14458 + }, + { + "epoch": 0.7958060432604986, + "grad_norm": 0.688946545124054, + "learning_rate": 6.6011245807929e-06, + "loss": 0.707, + "step": 14459 + }, + { + "epoch": 0.7958610820628543, + "grad_norm": 0.7286174297332764, + "learning_rate": 6.600713933841877e-06, + "loss": 0.784, + "step": 14460 + }, + { + "epoch": 0.79591612086521, + "grad_norm": 0.7604749798774719, + "learning_rate": 6.600303274860316e-06, + "loss": 0.7099, + "step": 14461 + }, + { + "epoch": 0.7959711596675656, + "grad_norm": 0.6626706123352051, + "learning_rate": 6.599892603851301e-06, + "loss": 0.7137, + "step": 14462 + }, + { + "epoch": 0.7960261984699213, + "grad_norm": 0.7692080736160278, + "learning_rate": 6.599481920817925e-06, + "loss": 0.847, + "step": 14463 + }, + { + "epoch": 0.796081237272277, + "grad_norm": 0.6811042428016663, + "learning_rate": 6.599071225763269e-06, + "loss": 0.7888, + "step": 14464 + }, + { + "epoch": 0.7961362760746327, + "grad_norm": 0.654481053352356, + "learning_rate": 6.598660518690424e-06, + "loss": 0.6973, + "step": 14465 + }, + { + "epoch": 0.7961913148769882, + "grad_norm": 0.7332738637924194, + "learning_rate": 6.598249799602472e-06, + "loss": 0.8311, + "step": 14466 + }, + { + "epoch": 0.7962463536793439, + "grad_norm": 0.7098381519317627, + "learning_rate": 6.597839068502503e-06, + "loss": 0.8265, + "step": 14467 + }, + { + "epoch": 0.7963013924816996, + "grad_norm": 0.6338212490081787, + "learning_rate": 6.597428325393604e-06, + "loss": 0.6889, + "step": 14468 + }, + { + "epoch": 0.7963564312840553, + "grad_norm": 0.7001339197158813, + "learning_rate": 6.597017570278861e-06, + "loss": 0.7613, + "step": 14469 + }, + { + "epoch": 0.7964114700864109, + "grad_norm": 0.6565783619880676, + "learning_rate": 6.596606803161361e-06, + "loss": 0.6284, + "step": 14470 + }, + { + "epoch": 0.7964665088887666, + "grad_norm": 0.6638015508651733, + "learning_rate": 6.5961960240441935e-06, + "loss": 0.6635, + "step": 14471 + }, + { + "epoch": 0.7965215476911223, + "grad_norm": 0.6389575600624084, + "learning_rate": 6.595785232930443e-06, + "loss": 0.6588, + "step": 14472 + }, + { + "epoch": 0.796576586493478, + "grad_norm": 0.9486858248710632, + "learning_rate": 6.595374429823197e-06, + "loss": 0.8314, + "step": 14473 + }, + { + "epoch": 0.7966316252958335, + "grad_norm": 0.7555649280548096, + "learning_rate": 6.594963614725544e-06, + "loss": 0.8173, + "step": 14474 + }, + { + "epoch": 0.7966866640981892, + "grad_norm": 0.63021320104599, + "learning_rate": 6.5945527876405715e-06, + "loss": 0.7038, + "step": 14475 + }, + { + "epoch": 0.7967417029005449, + "grad_norm": 0.802980899810791, + "learning_rate": 6.594141948571366e-06, + "loss": 0.8031, + "step": 14476 + }, + { + "epoch": 0.7967967417029005, + "grad_norm": 0.7204614281654358, + "learning_rate": 6.593731097521019e-06, + "loss": 0.827, + "step": 14477 + }, + { + "epoch": 0.7968517805052562, + "grad_norm": 0.6805211305618286, + "learning_rate": 6.593320234492613e-06, + "loss": 0.7405, + "step": 14478 + }, + { + "epoch": 0.7969068193076119, + "grad_norm": 0.7011345028877258, + "learning_rate": 6.59290935948924e-06, + "loss": 0.7241, + "step": 14479 + }, + { + "epoch": 0.7969618581099676, + "grad_norm": 0.8995540738105774, + "learning_rate": 6.592498472513986e-06, + "loss": 0.6864, + "step": 14480 + }, + { + "epoch": 0.7970168969123231, + "grad_norm": 0.7518284320831299, + "learning_rate": 6.592087573569941e-06, + "loss": 0.7561, + "step": 14481 + }, + { + "epoch": 0.7970719357146788, + "grad_norm": 0.6359231472015381, + "learning_rate": 6.591676662660191e-06, + "loss": 0.6402, + "step": 14482 + }, + { + "epoch": 0.7971269745170345, + "grad_norm": 0.6610120534896851, + "learning_rate": 6.5912657397878264e-06, + "loss": 0.6419, + "step": 14483 + }, + { + "epoch": 0.7971820133193902, + "grad_norm": 0.7054341435432434, + "learning_rate": 6.590854804955934e-06, + "loss": 0.7252, + "step": 14484 + }, + { + "epoch": 0.7972370521217458, + "grad_norm": 0.6929903626441956, + "learning_rate": 6.5904438581676025e-06, + "loss": 0.6566, + "step": 14485 + }, + { + "epoch": 0.7972920909241015, + "grad_norm": 0.7354124188423157, + "learning_rate": 6.59003289942592e-06, + "loss": 0.763, + "step": 14486 + }, + { + "epoch": 0.7973471297264572, + "grad_norm": 0.6366610527038574, + "learning_rate": 6.5896219287339755e-06, + "loss": 0.6601, + "step": 14487 + }, + { + "epoch": 0.7974021685288128, + "grad_norm": 0.6916924715042114, + "learning_rate": 6.589210946094859e-06, + "loss": 0.7683, + "step": 14488 + }, + { + "epoch": 0.7974572073311684, + "grad_norm": 0.6567399501800537, + "learning_rate": 6.5887999515116586e-06, + "loss": 0.7487, + "step": 14489 + }, + { + "epoch": 0.7975122461335241, + "grad_norm": 0.8082888722419739, + "learning_rate": 6.5883889449874626e-06, + "loss": 0.7579, + "step": 14490 + }, + { + "epoch": 0.7975672849358798, + "grad_norm": 0.7138401865959167, + "learning_rate": 6.58797792652536e-06, + "loss": 0.7256, + "step": 14491 + }, + { + "epoch": 0.7976223237382355, + "grad_norm": 0.6514482498168945, + "learning_rate": 6.587566896128441e-06, + "loss": 0.6612, + "step": 14492 + }, + { + "epoch": 0.7976773625405911, + "grad_norm": 0.6770455837249756, + "learning_rate": 6.587155853799795e-06, + "loss": 0.677, + "step": 14493 + }, + { + "epoch": 0.7977324013429468, + "grad_norm": 0.6956327557563782, + "learning_rate": 6.586744799542511e-06, + "loss": 0.7824, + "step": 14494 + }, + { + "epoch": 0.7977874401453025, + "grad_norm": 0.6565653085708618, + "learning_rate": 6.586333733359676e-06, + "loss": 0.7496, + "step": 14495 + }, + { + "epoch": 0.7978424789476581, + "grad_norm": 0.6353399157524109, + "learning_rate": 6.585922655254382e-06, + "loss": 0.7264, + "step": 14496 + }, + { + "epoch": 0.7978975177500137, + "grad_norm": 1.037051796913147, + "learning_rate": 6.585511565229717e-06, + "loss": 0.7562, + "step": 14497 + }, + { + "epoch": 0.7979525565523694, + "grad_norm": 0.6447896957397461, + "learning_rate": 6.5851004632887725e-06, + "loss": 0.7509, + "step": 14498 + }, + { + "epoch": 0.7980075953547251, + "grad_norm": 0.7022401690483093, + "learning_rate": 6.584689349434636e-06, + "loss": 0.7752, + "step": 14499 + }, + { + "epoch": 0.7980626341570808, + "grad_norm": 0.7033591270446777, + "learning_rate": 6.5842782236703996e-06, + "loss": 0.7693, + "step": 14500 + }, + { + "epoch": 0.7981176729594364, + "grad_norm": 0.7061769962310791, + "learning_rate": 6.583867085999151e-06, + "loss": 0.6833, + "step": 14501 + }, + { + "epoch": 0.798172711761792, + "grad_norm": 0.7934882640838623, + "learning_rate": 6.583455936423984e-06, + "loss": 0.799, + "step": 14502 + }, + { + "epoch": 0.7982277505641477, + "grad_norm": 0.6968011260032654, + "learning_rate": 6.5830447749479835e-06, + "loss": 0.7132, + "step": 14503 + }, + { + "epoch": 0.7982827893665034, + "grad_norm": 1.7348299026489258, + "learning_rate": 6.582633601574243e-06, + "loss": 0.8996, + "step": 14504 + }, + { + "epoch": 0.798337828168859, + "grad_norm": 0.6822964549064636, + "learning_rate": 6.582222416305852e-06, + "loss": 0.7381, + "step": 14505 + }, + { + "epoch": 0.7983928669712147, + "grad_norm": 0.6600543856620789, + "learning_rate": 6.581811219145902e-06, + "loss": 0.711, + "step": 14506 + }, + { + "epoch": 0.7984479057735704, + "grad_norm": 0.8719834089279175, + "learning_rate": 6.581400010097481e-06, + "loss": 0.7567, + "step": 14507 + }, + { + "epoch": 0.7985029445759261, + "grad_norm": 0.7221046090126038, + "learning_rate": 6.580988789163681e-06, + "loss": 0.7417, + "step": 14508 + }, + { + "epoch": 0.7985579833782817, + "grad_norm": 0.6720401048660278, + "learning_rate": 6.580577556347592e-06, + "loss": 0.7467, + "step": 14509 + }, + { + "epoch": 0.7986130221806373, + "grad_norm": 0.7007263898849487, + "learning_rate": 6.580166311652306e-06, + "loss": 0.7356, + "step": 14510 + }, + { + "epoch": 0.798668060982993, + "grad_norm": 0.7384739518165588, + "learning_rate": 6.579755055080912e-06, + "loss": 0.7807, + "step": 14511 + }, + { + "epoch": 0.7987230997853487, + "grad_norm": 0.8054519295692444, + "learning_rate": 6.579343786636503e-06, + "loss": 0.7737, + "step": 14512 + }, + { + "epoch": 0.7987781385877043, + "grad_norm": 1.042319655418396, + "learning_rate": 6.578932506322169e-06, + "loss": 0.8708, + "step": 14513 + }, + { + "epoch": 0.79883317739006, + "grad_norm": 0.7122198343276978, + "learning_rate": 6.578521214141e-06, + "loss": 0.7818, + "step": 14514 + }, + { + "epoch": 0.7988882161924157, + "grad_norm": 0.9158271551132202, + "learning_rate": 6.578109910096088e-06, + "loss": 0.7439, + "step": 14515 + }, + { + "epoch": 0.7989432549947714, + "grad_norm": 0.7280082106590271, + "learning_rate": 6.577698594190524e-06, + "loss": 0.7888, + "step": 14516 + }, + { + "epoch": 0.798998293797127, + "grad_norm": 0.8203748464584351, + "learning_rate": 6.577287266427401e-06, + "loss": 0.7669, + "step": 14517 + }, + { + "epoch": 0.7990533325994826, + "grad_norm": 0.6998257637023926, + "learning_rate": 6.576875926809809e-06, + "loss": 0.7819, + "step": 14518 + }, + { + "epoch": 0.7991083714018383, + "grad_norm": 0.672575831413269, + "learning_rate": 6.57646457534084e-06, + "loss": 0.7359, + "step": 14519 + }, + { + "epoch": 0.7991634102041939, + "grad_norm": 0.931996762752533, + "learning_rate": 6.5760532120235845e-06, + "loss": 0.8816, + "step": 14520 + }, + { + "epoch": 0.7992184490065496, + "grad_norm": 0.7250553369522095, + "learning_rate": 6.575641836861134e-06, + "loss": 0.7924, + "step": 14521 + }, + { + "epoch": 0.7992734878089053, + "grad_norm": 0.6658768057823181, + "learning_rate": 6.575230449856582e-06, + "loss": 0.7064, + "step": 14522 + }, + { + "epoch": 0.799328526611261, + "grad_norm": 0.6901206374168396, + "learning_rate": 6.57481905101302e-06, + "loss": 0.7826, + "step": 14523 + }, + { + "epoch": 0.7993835654136165, + "grad_norm": 0.6772152781486511, + "learning_rate": 6.5744076403335386e-06, + "loss": 0.8143, + "step": 14524 + }, + { + "epoch": 0.7994386042159722, + "grad_norm": 0.6718147397041321, + "learning_rate": 6.5739962178212325e-06, + "loss": 0.765, + "step": 14525 + }, + { + "epoch": 0.7994936430183279, + "grad_norm": 0.7435488700866699, + "learning_rate": 6.573584783479191e-06, + "loss": 0.8685, + "step": 14526 + }, + { + "epoch": 0.7995486818206836, + "grad_norm": 0.7146314382553101, + "learning_rate": 6.573173337310506e-06, + "loss": 0.7605, + "step": 14527 + }, + { + "epoch": 0.7996037206230392, + "grad_norm": 0.6808409690856934, + "learning_rate": 6.572761879318274e-06, + "loss": 0.6996, + "step": 14528 + }, + { + "epoch": 0.7996587594253949, + "grad_norm": 1.1303905248641968, + "learning_rate": 6.572350409505584e-06, + "loss": 0.6107, + "step": 14529 + }, + { + "epoch": 0.7997137982277506, + "grad_norm": 0.7584583163261414, + "learning_rate": 6.571938927875529e-06, + "loss": 0.771, + "step": 14530 + }, + { + "epoch": 0.7997688370301063, + "grad_norm": 0.808233916759491, + "learning_rate": 6.5715274344312015e-06, + "loss": 0.7179, + "step": 14531 + }, + { + "epoch": 0.7998238758324618, + "grad_norm": 0.7067314386367798, + "learning_rate": 6.571115929175695e-06, + "loss": 0.7519, + "step": 14532 + }, + { + "epoch": 0.7998789146348175, + "grad_norm": 0.7611628174781799, + "learning_rate": 6.570704412112101e-06, + "loss": 0.8727, + "step": 14533 + }, + { + "epoch": 0.7999339534371732, + "grad_norm": 0.6485727429389954, + "learning_rate": 6.5702928832435145e-06, + "loss": 0.8455, + "step": 14534 + }, + { + "epoch": 0.7999889922395289, + "grad_norm": 1.5309134721755981, + "learning_rate": 6.569881342573024e-06, + "loss": 0.8362, + "step": 14535 + }, + { + "epoch": 0.8000440310418845, + "grad_norm": 0.7068225145339966, + "learning_rate": 6.569469790103729e-06, + "loss": 0.7924, + "step": 14536 + }, + { + "epoch": 0.8000990698442402, + "grad_norm": 0.7326669692993164, + "learning_rate": 6.569058225838717e-06, + "loss": 0.7594, + "step": 14537 + }, + { + "epoch": 0.8001541086465959, + "grad_norm": 0.6705706119537354, + "learning_rate": 6.568646649781085e-06, + "loss": 0.7331, + "step": 14538 + }, + { + "epoch": 0.8002091474489516, + "grad_norm": 0.7303051948547363, + "learning_rate": 6.568235061933923e-06, + "loss": 0.7274, + "step": 14539 + }, + { + "epoch": 0.8002641862513071, + "grad_norm": 0.6334550380706787, + "learning_rate": 6.567823462300326e-06, + "loss": 0.7105, + "step": 14540 + }, + { + "epoch": 0.8003192250536628, + "grad_norm": 0.7183839678764343, + "learning_rate": 6.56741185088339e-06, + "loss": 0.657, + "step": 14541 + }, + { + "epoch": 0.8003742638560185, + "grad_norm": 0.6896400451660156, + "learning_rate": 6.567000227686204e-06, + "loss": 0.7752, + "step": 14542 + }, + { + "epoch": 0.8004293026583742, + "grad_norm": 0.7214651703834534, + "learning_rate": 6.566588592711864e-06, + "loss": 0.753, + "step": 14543 + }, + { + "epoch": 0.8004843414607298, + "grad_norm": 0.7064470648765564, + "learning_rate": 6.566176945963464e-06, + "loss": 0.744, + "step": 14544 + }, + { + "epoch": 0.8005393802630855, + "grad_norm": 0.696674644947052, + "learning_rate": 6.565765287444097e-06, + "loss": 0.6822, + "step": 14545 + }, + { + "epoch": 0.8005944190654412, + "grad_norm": 0.711722195148468, + "learning_rate": 6.5653536171568574e-06, + "loss": 0.7724, + "step": 14546 + }, + { + "epoch": 0.8006494578677968, + "grad_norm": 0.791977047920227, + "learning_rate": 6.564941935104838e-06, + "loss": 0.7913, + "step": 14547 + }, + { + "epoch": 0.8007044966701524, + "grad_norm": 0.6904259920120239, + "learning_rate": 6.564530241291135e-06, + "loss": 0.7732, + "step": 14548 + }, + { + "epoch": 0.8007595354725081, + "grad_norm": 0.6089264750480652, + "learning_rate": 6.564118535718842e-06, + "loss": 0.6506, + "step": 14549 + }, + { + "epoch": 0.8008145742748638, + "grad_norm": 0.6502360105514526, + "learning_rate": 6.563706818391051e-06, + "loss": 0.6638, + "step": 14550 + }, + { + "epoch": 0.8008696130772195, + "grad_norm": 0.6249814033508301, + "learning_rate": 6.563295089310859e-06, + "loss": 0.7066, + "step": 14551 + }, + { + "epoch": 0.8009246518795751, + "grad_norm": 0.8013060688972473, + "learning_rate": 6.56288334848136e-06, + "loss": 0.7968, + "step": 14552 + }, + { + "epoch": 0.8009796906819308, + "grad_norm": 0.7289897799491882, + "learning_rate": 6.562471595905648e-06, + "loss": 0.752, + "step": 14553 + }, + { + "epoch": 0.8010347294842864, + "grad_norm": 0.6774812340736389, + "learning_rate": 6.5620598315868176e-06, + "loss": 0.8263, + "step": 14554 + }, + { + "epoch": 0.8010897682866421, + "grad_norm": 0.6756269931793213, + "learning_rate": 6.561648055527965e-06, + "loss": 0.8096, + "step": 14555 + }, + { + "epoch": 0.8011448070889977, + "grad_norm": 0.7138845324516296, + "learning_rate": 6.5612362677321815e-06, + "loss": 0.7513, + "step": 14556 + }, + { + "epoch": 0.8011998458913534, + "grad_norm": 0.6763927340507507, + "learning_rate": 6.5608244682025656e-06, + "loss": 0.7975, + "step": 14557 + }, + { + "epoch": 0.8012548846937091, + "grad_norm": 0.8147655129432678, + "learning_rate": 6.56041265694221e-06, + "loss": 0.8192, + "step": 14558 + }, + { + "epoch": 0.8013099234960648, + "grad_norm": 0.7272641658782959, + "learning_rate": 6.5600008339542095e-06, + "loss": 0.7829, + "step": 14559 + }, + { + "epoch": 0.8013649622984204, + "grad_norm": 0.7464525103569031, + "learning_rate": 6.559588999241661e-06, + "loss": 0.7596, + "step": 14560 + }, + { + "epoch": 0.801420001100776, + "grad_norm": 0.7236443758010864, + "learning_rate": 6.559177152807661e-06, + "loss": 0.8151, + "step": 14561 + }, + { + "epoch": 0.8014750399031317, + "grad_norm": 0.6752793192863464, + "learning_rate": 6.558765294655301e-06, + "loss": 0.7578, + "step": 14562 + }, + { + "epoch": 0.8015300787054873, + "grad_norm": 0.709994375705719, + "learning_rate": 6.558353424787678e-06, + "loss": 0.6847, + "step": 14563 + }, + { + "epoch": 0.801585117507843, + "grad_norm": 0.7082880139350891, + "learning_rate": 6.557941543207889e-06, + "loss": 0.7968, + "step": 14564 + }, + { + "epoch": 0.8016401563101987, + "grad_norm": 0.692663848400116, + "learning_rate": 6.557529649919028e-06, + "loss": 0.6625, + "step": 14565 + }, + { + "epoch": 0.8016951951125544, + "grad_norm": 0.8464102149009705, + "learning_rate": 6.557117744924191e-06, + "loss": 0.7383, + "step": 14566 + }, + { + "epoch": 0.80175023391491, + "grad_norm": 0.6129899024963379, + "learning_rate": 6.5567058282264735e-06, + "loss": 0.7007, + "step": 14567 + }, + { + "epoch": 0.8018052727172656, + "grad_norm": 0.6458886861801147, + "learning_rate": 6.556293899828973e-06, + "loss": 0.7019, + "step": 14568 + }, + { + "epoch": 0.8018603115196213, + "grad_norm": 0.6543694138526917, + "learning_rate": 6.555881959734783e-06, + "loss": 0.7254, + "step": 14569 + }, + { + "epoch": 0.801915350321977, + "grad_norm": 0.7678859829902649, + "learning_rate": 6.555470007947001e-06, + "loss": 0.7952, + "step": 14570 + }, + { + "epoch": 0.8019703891243326, + "grad_norm": 0.7121342420578003, + "learning_rate": 6.555058044468722e-06, + "loss": 0.7951, + "step": 14571 + }, + { + "epoch": 0.8020254279266883, + "grad_norm": 0.6496285200119019, + "learning_rate": 6.554646069303043e-06, + "loss": 0.696, + "step": 14572 + }, + { + "epoch": 0.802080466729044, + "grad_norm": 0.7206087112426758, + "learning_rate": 6.5542340824530614e-06, + "loss": 0.7599, + "step": 14573 + }, + { + "epoch": 0.8021355055313997, + "grad_norm": 0.7285301685333252, + "learning_rate": 6.553822083921872e-06, + "loss": 0.7805, + "step": 14574 + }, + { + "epoch": 0.8021905443337553, + "grad_norm": 0.7524350881576538, + "learning_rate": 6.553410073712572e-06, + "loss": 0.7388, + "step": 14575 + }, + { + "epoch": 0.8022455831361109, + "grad_norm": 0.7634537220001221, + "learning_rate": 6.552998051828256e-06, + "loss": 0.6969, + "step": 14576 + }, + { + "epoch": 0.8023006219384666, + "grad_norm": 0.6950779557228088, + "learning_rate": 6.552586018272024e-06, + "loss": 0.8533, + "step": 14577 + }, + { + "epoch": 0.8023556607408223, + "grad_norm": 0.694496214389801, + "learning_rate": 6.552173973046972e-06, + "loss": 0.766, + "step": 14578 + }, + { + "epoch": 0.8024106995431779, + "grad_norm": 0.8068329691886902, + "learning_rate": 6.5517619161561954e-06, + "loss": 0.7642, + "step": 14579 + }, + { + "epoch": 0.8024657383455336, + "grad_norm": 0.6933363080024719, + "learning_rate": 6.5513498476027905e-06, + "loss": 0.8721, + "step": 14580 + }, + { + "epoch": 0.8025207771478893, + "grad_norm": 0.7041658163070679, + "learning_rate": 6.550937767389857e-06, + "loss": 0.6654, + "step": 14581 + }, + { + "epoch": 0.802575815950245, + "grad_norm": 0.7080103754997253, + "learning_rate": 6.550525675520489e-06, + "loss": 0.6917, + "step": 14582 + }, + { + "epoch": 0.8026308547526005, + "grad_norm": 0.6644875407218933, + "learning_rate": 6.550113571997785e-06, + "loss": 0.7674, + "step": 14583 + }, + { + "epoch": 0.8026858935549562, + "grad_norm": 0.7660395503044128, + "learning_rate": 6.549701456824843e-06, + "loss": 0.792, + "step": 14584 + }, + { + "epoch": 0.8027409323573119, + "grad_norm": 0.6853451132774353, + "learning_rate": 6.549289330004759e-06, + "loss": 0.8038, + "step": 14585 + }, + { + "epoch": 0.8027959711596676, + "grad_norm": 0.7349985837936401, + "learning_rate": 6.548877191540632e-06, + "loss": 0.7658, + "step": 14586 + }, + { + "epoch": 0.8028510099620232, + "grad_norm": 0.7605637311935425, + "learning_rate": 6.548465041435557e-06, + "loss": 0.7691, + "step": 14587 + }, + { + "epoch": 0.8029060487643789, + "grad_norm": 0.7635177969932556, + "learning_rate": 6.548052879692635e-06, + "loss": 0.8337, + "step": 14588 + }, + { + "epoch": 0.8029610875667346, + "grad_norm": 0.6873355507850647, + "learning_rate": 6.5476407063149614e-06, + "loss": 0.64, + "step": 14589 + }, + { + "epoch": 0.8030161263690903, + "grad_norm": 0.7642813920974731, + "learning_rate": 6.547228521305635e-06, + "loss": 0.6961, + "step": 14590 + }, + { + "epoch": 0.8030711651714458, + "grad_norm": 0.6329793334007263, + "learning_rate": 6.546816324667752e-06, + "loss": 0.73, + "step": 14591 + }, + { + "epoch": 0.8031262039738015, + "grad_norm": 0.6932308673858643, + "learning_rate": 6.546404116404412e-06, + "loss": 0.7582, + "step": 14592 + }, + { + "epoch": 0.8031812427761572, + "grad_norm": 0.699260413646698, + "learning_rate": 6.545991896518713e-06, + "loss": 0.7219, + "step": 14593 + }, + { + "epoch": 0.8032362815785129, + "grad_norm": 0.6217201948165894, + "learning_rate": 6.545579665013754e-06, + "loss": 0.6237, + "step": 14594 + }, + { + "epoch": 0.8032913203808685, + "grad_norm": 0.7078647017478943, + "learning_rate": 6.545167421892629e-06, + "loss": 0.666, + "step": 14595 + }, + { + "epoch": 0.8033463591832242, + "grad_norm": 0.6955916881561279, + "learning_rate": 6.544755167158441e-06, + "loss": 0.737, + "step": 14596 + }, + { + "epoch": 0.8034013979855799, + "grad_norm": 0.8195130825042725, + "learning_rate": 6.544342900814287e-06, + "loss": 0.787, + "step": 14597 + }, + { + "epoch": 0.8034564367879355, + "grad_norm": 0.6160768270492554, + "learning_rate": 6.543930622863263e-06, + "loss": 0.6141, + "step": 14598 + }, + { + "epoch": 0.8035114755902911, + "grad_norm": 0.8483116030693054, + "learning_rate": 6.543518333308472e-06, + "loss": 0.7639, + "step": 14599 + }, + { + "epoch": 0.8035665143926468, + "grad_norm": 0.6937680244445801, + "learning_rate": 6.5431060321530105e-06, + "loss": 0.7484, + "step": 14600 + }, + { + "epoch": 0.8036215531950025, + "grad_norm": 0.6298720836639404, + "learning_rate": 6.542693719399975e-06, + "loss": 0.6357, + "step": 14601 + }, + { + "epoch": 0.8036765919973582, + "grad_norm": 0.6431903839111328, + "learning_rate": 6.54228139505247e-06, + "loss": 0.6749, + "step": 14602 + }, + { + "epoch": 0.8037316307997138, + "grad_norm": 0.8972636461257935, + "learning_rate": 6.541869059113588e-06, + "loss": 0.8907, + "step": 14603 + }, + { + "epoch": 0.8037866696020695, + "grad_norm": 0.7302204966545105, + "learning_rate": 6.5414567115864316e-06, + "loss": 0.7494, + "step": 14604 + }, + { + "epoch": 0.8038417084044251, + "grad_norm": 0.7784821391105652, + "learning_rate": 6.541044352474099e-06, + "loss": 0.6582, + "step": 14605 + }, + { + "epoch": 0.8038967472067807, + "grad_norm": 0.7257398366928101, + "learning_rate": 6.54063198177969e-06, + "loss": 0.7362, + "step": 14606 + }, + { + "epoch": 0.8039517860091364, + "grad_norm": 0.6745980381965637, + "learning_rate": 6.540219599506302e-06, + "loss": 0.6756, + "step": 14607 + }, + { + "epoch": 0.8040068248114921, + "grad_norm": 0.8664490580558777, + "learning_rate": 6.539807205657037e-06, + "loss": 0.6728, + "step": 14608 + }, + { + "epoch": 0.8040618636138478, + "grad_norm": 0.704233705997467, + "learning_rate": 6.5393948002349926e-06, + "loss": 0.7713, + "step": 14609 + }, + { + "epoch": 0.8041169024162034, + "grad_norm": 0.7709019780158997, + "learning_rate": 6.538982383243271e-06, + "loss": 0.8148, + "step": 14610 + }, + { + "epoch": 0.8041719412185591, + "grad_norm": 0.7056839466094971, + "learning_rate": 6.538569954684967e-06, + "loss": 0.7143, + "step": 14611 + }, + { + "epoch": 0.8042269800209148, + "grad_norm": 0.715506374835968, + "learning_rate": 6.538157514563184e-06, + "loss": 0.7932, + "step": 14612 + }, + { + "epoch": 0.8042820188232704, + "grad_norm": 0.8245391845703125, + "learning_rate": 6.537745062881021e-06, + "loss": 0.7569, + "step": 14613 + }, + { + "epoch": 0.804337057625626, + "grad_norm": 0.6912628412246704, + "learning_rate": 6.5373325996415794e-06, + "loss": 0.7174, + "step": 14614 + }, + { + "epoch": 0.8043920964279817, + "grad_norm": 0.6994870901107788, + "learning_rate": 6.536920124847955e-06, + "loss": 0.6174, + "step": 14615 + }, + { + "epoch": 0.8044471352303374, + "grad_norm": 0.6660363674163818, + "learning_rate": 6.536507638503251e-06, + "loss": 0.8065, + "step": 14616 + }, + { + "epoch": 0.8045021740326931, + "grad_norm": 0.6742863059043884, + "learning_rate": 6.536095140610567e-06, + "loss": 0.7984, + "step": 14617 + }, + { + "epoch": 0.8045572128350487, + "grad_norm": 0.6868259906768799, + "learning_rate": 6.535682631173005e-06, + "loss": 0.7907, + "step": 14618 + }, + { + "epoch": 0.8046122516374044, + "grad_norm": 0.7442048788070679, + "learning_rate": 6.5352701101936615e-06, + "loss": 0.7893, + "step": 14619 + }, + { + "epoch": 0.80466729043976, + "grad_norm": 0.7389286756515503, + "learning_rate": 6.534857577675639e-06, + "loss": 0.827, + "step": 14620 + }, + { + "epoch": 0.8047223292421157, + "grad_norm": 0.6679701209068298, + "learning_rate": 6.534445033622036e-06, + "loss": 0.6721, + "step": 14621 + }, + { + "epoch": 0.8047773680444713, + "grad_norm": 0.6372442841529846, + "learning_rate": 6.534032478035957e-06, + "loss": 0.7381, + "step": 14622 + }, + { + "epoch": 0.804832406846827, + "grad_norm": 0.7682638764381409, + "learning_rate": 6.533619910920501e-06, + "loss": 0.7003, + "step": 14623 + }, + { + "epoch": 0.8048874456491827, + "grad_norm": 0.6821291446685791, + "learning_rate": 6.533207332278767e-06, + "loss": 0.8164, + "step": 14624 + }, + { + "epoch": 0.8049424844515384, + "grad_norm": 0.6591019034385681, + "learning_rate": 6.532794742113858e-06, + "loss": 0.6772, + "step": 14625 + }, + { + "epoch": 0.804997523253894, + "grad_norm": 0.7331292033195496, + "learning_rate": 6.532382140428874e-06, + "loss": 0.7606, + "step": 14626 + }, + { + "epoch": 0.8050525620562496, + "grad_norm": 0.9654768705368042, + "learning_rate": 6.531969527226917e-06, + "loss": 0.9196, + "step": 14627 + }, + { + "epoch": 0.8051076008586053, + "grad_norm": 0.6320267915725708, + "learning_rate": 6.5315569025110844e-06, + "loss": 0.6982, + "step": 14628 + }, + { + "epoch": 0.805162639660961, + "grad_norm": 0.6921746134757996, + "learning_rate": 6.531144266284481e-06, + "loss": 0.7176, + "step": 14629 + }, + { + "epoch": 0.8052176784633166, + "grad_norm": 0.7233335375785828, + "learning_rate": 6.530731618550208e-06, + "loss": 0.8388, + "step": 14630 + }, + { + "epoch": 0.8052727172656723, + "grad_norm": 0.6576363444328308, + "learning_rate": 6.530318959311366e-06, + "loss": 0.7511, + "step": 14631 + }, + { + "epoch": 0.805327756068028, + "grad_norm": 0.6921162009239197, + "learning_rate": 6.529906288571055e-06, + "loss": 0.8161, + "step": 14632 + }, + { + "epoch": 0.8053827948703837, + "grad_norm": 0.7314246296882629, + "learning_rate": 6.529493606332379e-06, + "loss": 0.7824, + "step": 14633 + }, + { + "epoch": 0.8054378336727392, + "grad_norm": 0.6419001221656799, + "learning_rate": 6.529080912598438e-06, + "loss": 0.7593, + "step": 14634 + }, + { + "epoch": 0.8054928724750949, + "grad_norm": 0.9500213861465454, + "learning_rate": 6.528668207372335e-06, + "loss": 0.7429, + "step": 14635 + }, + { + "epoch": 0.8055479112774506, + "grad_norm": 0.7299035787582397, + "learning_rate": 6.52825549065717e-06, + "loss": 0.8064, + "step": 14636 + }, + { + "epoch": 0.8056029500798063, + "grad_norm": 0.6231887936592102, + "learning_rate": 6.527842762456046e-06, + "loss": 0.6177, + "step": 14637 + }, + { + "epoch": 0.8056579888821619, + "grad_norm": 0.6219315528869629, + "learning_rate": 6.527430022772066e-06, + "loss": 0.6781, + "step": 14638 + }, + { + "epoch": 0.8057130276845176, + "grad_norm": 0.696861982345581, + "learning_rate": 6.527017271608329e-06, + "loss": 0.7508, + "step": 14639 + }, + { + "epoch": 0.8057680664868733, + "grad_norm": 0.7849573493003845, + "learning_rate": 6.5266045089679394e-06, + "loss": 0.7347, + "step": 14640 + }, + { + "epoch": 0.805823105289229, + "grad_norm": 0.6350993514060974, + "learning_rate": 6.526191734853999e-06, + "loss": 0.6863, + "step": 14641 + }, + { + "epoch": 0.8058781440915845, + "grad_norm": 0.6293141841888428, + "learning_rate": 6.5257789492696115e-06, + "loss": 0.7288, + "step": 14642 + }, + { + "epoch": 0.8059331828939402, + "grad_norm": 0.7801508903503418, + "learning_rate": 6.525366152217876e-06, + "loss": 0.7592, + "step": 14643 + }, + { + "epoch": 0.8059882216962959, + "grad_norm": 0.7031479477882385, + "learning_rate": 6.5249533437018964e-06, + "loss": 0.8677, + "step": 14644 + }, + { + "epoch": 0.8060432604986516, + "grad_norm": 0.7052507996559143, + "learning_rate": 6.524540523724777e-06, + "loss": 0.7957, + "step": 14645 + }, + { + "epoch": 0.8060982993010072, + "grad_norm": 0.669743537902832, + "learning_rate": 6.524127692289619e-06, + "loss": 0.7163, + "step": 14646 + }, + { + "epoch": 0.8061533381033629, + "grad_norm": 0.7180876731872559, + "learning_rate": 6.523714849399525e-06, + "loss": 0.8814, + "step": 14647 + }, + { + "epoch": 0.8062083769057186, + "grad_norm": 0.6617746353149414, + "learning_rate": 6.523301995057597e-06, + "loss": 0.721, + "step": 14648 + }, + { + "epoch": 0.8062634157080741, + "grad_norm": 0.6464657783508301, + "learning_rate": 6.5228891292669404e-06, + "loss": 0.7334, + "step": 14649 + }, + { + "epoch": 0.8063184545104298, + "grad_norm": 0.7648638486862183, + "learning_rate": 6.522476252030658e-06, + "loss": 0.7701, + "step": 14650 + }, + { + "epoch": 0.8063734933127855, + "grad_norm": 0.7313019037246704, + "learning_rate": 6.522063363351851e-06, + "loss": 0.7912, + "step": 14651 + }, + { + "epoch": 0.8064285321151412, + "grad_norm": 0.6175631284713745, + "learning_rate": 6.5216504632336195e-06, + "loss": 0.7568, + "step": 14652 + }, + { + "epoch": 0.8064835709174968, + "grad_norm": 0.6935408711433411, + "learning_rate": 6.521237551679074e-06, + "loss": 0.7622, + "step": 14653 + }, + { + "epoch": 0.8065386097198525, + "grad_norm": 0.7232398390769958, + "learning_rate": 6.520824628691314e-06, + "loss": 0.7908, + "step": 14654 + }, + { + "epoch": 0.8065936485222082, + "grad_norm": 0.6642309427261353, + "learning_rate": 6.520411694273443e-06, + "loss": 0.7355, + "step": 14655 + }, + { + "epoch": 0.8066486873245639, + "grad_norm": 0.6679350137710571, + "learning_rate": 6.5199987484285635e-06, + "loss": 0.735, + "step": 14656 + }, + { + "epoch": 0.8067037261269194, + "grad_norm": 0.6861871480941772, + "learning_rate": 6.519585791159782e-06, + "loss": 0.6744, + "step": 14657 + }, + { + "epoch": 0.8067587649292751, + "grad_norm": 0.7689095735549927, + "learning_rate": 6.519172822470199e-06, + "loss": 0.6888, + "step": 14658 + }, + { + "epoch": 0.8068138037316308, + "grad_norm": 0.6604742407798767, + "learning_rate": 6.5187598423629206e-06, + "loss": 0.6943, + "step": 14659 + }, + { + "epoch": 0.8068688425339865, + "grad_norm": 0.6478890776634216, + "learning_rate": 6.518346850841049e-06, + "loss": 0.7161, + "step": 14660 + }, + { + "epoch": 0.8069238813363421, + "grad_norm": 0.6213741302490234, + "learning_rate": 6.517933847907689e-06, + "loss": 0.68, + "step": 14661 + }, + { + "epoch": 0.8069789201386978, + "grad_norm": 0.7663899660110474, + "learning_rate": 6.517520833565945e-06, + "loss": 0.7498, + "step": 14662 + }, + { + "epoch": 0.8070339589410535, + "grad_norm": 0.653498649597168, + "learning_rate": 6.517107807818921e-06, + "loss": 0.7433, + "step": 14663 + }, + { + "epoch": 0.8070889977434091, + "grad_norm": 0.7618738412857056, + "learning_rate": 6.51669477066972e-06, + "loss": 0.7499, + "step": 14664 + }, + { + "epoch": 0.8071440365457647, + "grad_norm": 0.5960344672203064, + "learning_rate": 6.516281722121447e-06, + "loss": 0.6005, + "step": 14665 + }, + { + "epoch": 0.8071990753481204, + "grad_norm": 0.6768549084663391, + "learning_rate": 6.5158686621772075e-06, + "loss": 0.6859, + "step": 14666 + }, + { + "epoch": 0.8072541141504761, + "grad_norm": 0.6475711464881897, + "learning_rate": 6.515455590840104e-06, + "loss": 0.7582, + "step": 14667 + }, + { + "epoch": 0.8073091529528318, + "grad_norm": 0.7188607454299927, + "learning_rate": 6.5150425081132414e-06, + "loss": 0.7241, + "step": 14668 + }, + { + "epoch": 0.8073641917551874, + "grad_norm": 0.6507582068443298, + "learning_rate": 6.514629413999727e-06, + "loss": 0.7659, + "step": 14669 + }, + { + "epoch": 0.807419230557543, + "grad_norm": 0.6676538586616516, + "learning_rate": 6.514216308502661e-06, + "loss": 0.7336, + "step": 14670 + }, + { + "epoch": 0.8074742693598987, + "grad_norm": 0.7141211628913879, + "learning_rate": 6.513803191625152e-06, + "loss": 0.8121, + "step": 14671 + }, + { + "epoch": 0.8075293081622544, + "grad_norm": 0.7497949600219727, + "learning_rate": 6.513390063370302e-06, + "loss": 0.7238, + "step": 14672 + }, + { + "epoch": 0.80758434696461, + "grad_norm": 0.671271562576294, + "learning_rate": 6.51297692374122e-06, + "loss": 0.7876, + "step": 14673 + }, + { + "epoch": 0.8076393857669657, + "grad_norm": 0.7081878781318665, + "learning_rate": 6.512563772741008e-06, + "loss": 0.6774, + "step": 14674 + }, + { + "epoch": 0.8076944245693214, + "grad_norm": 0.640925943851471, + "learning_rate": 6.512150610372769e-06, + "loss": 0.7094, + "step": 14675 + }, + { + "epoch": 0.8077494633716771, + "grad_norm": 0.6333619952201843, + "learning_rate": 6.511737436639611e-06, + "loss": 0.6439, + "step": 14676 + }, + { + "epoch": 0.8078045021740327, + "grad_norm": 0.7294490337371826, + "learning_rate": 6.511324251544642e-06, + "loss": 0.7786, + "step": 14677 + }, + { + "epoch": 0.8078595409763883, + "grad_norm": 0.6488819718360901, + "learning_rate": 6.510911055090963e-06, + "loss": 0.7495, + "step": 14678 + }, + { + "epoch": 0.807914579778744, + "grad_norm": 0.6535395383834839, + "learning_rate": 6.51049784728168e-06, + "loss": 0.6713, + "step": 14679 + }, + { + "epoch": 0.8079696185810997, + "grad_norm": 0.6795744895935059, + "learning_rate": 6.5100846281198995e-06, + "loss": 0.71, + "step": 14680 + }, + { + "epoch": 0.8080246573834553, + "grad_norm": 0.661171019077301, + "learning_rate": 6.509671397608728e-06, + "loss": 0.7009, + "step": 14681 + }, + { + "epoch": 0.808079696185811, + "grad_norm": 0.6474859118461609, + "learning_rate": 6.50925815575127e-06, + "loss": 0.7268, + "step": 14682 + }, + { + "epoch": 0.8081347349881667, + "grad_norm": 0.676891565322876, + "learning_rate": 6.508844902550633e-06, + "loss": 0.8748, + "step": 14683 + }, + { + "epoch": 0.8081897737905224, + "grad_norm": 0.9747083783149719, + "learning_rate": 6.50843163800992e-06, + "loss": 0.6817, + "step": 14684 + }, + { + "epoch": 0.808244812592878, + "grad_norm": 0.655274510383606, + "learning_rate": 6.50801836213224e-06, + "loss": 0.7675, + "step": 14685 + }, + { + "epoch": 0.8082998513952336, + "grad_norm": 0.6916972398757935, + "learning_rate": 6.507605074920697e-06, + "loss": 0.7862, + "step": 14686 + }, + { + "epoch": 0.8083548901975893, + "grad_norm": 0.7079103589057922, + "learning_rate": 6.5071917763783975e-06, + "loss": 0.671, + "step": 14687 + }, + { + "epoch": 0.808409928999945, + "grad_norm": 0.7460986375808716, + "learning_rate": 6.506778466508447e-06, + "loss": 0.7136, + "step": 14688 + }, + { + "epoch": 0.8084649678023006, + "grad_norm": 0.6531261801719666, + "learning_rate": 6.5063651453139555e-06, + "loss": 0.811, + "step": 14689 + }, + { + "epoch": 0.8085200066046563, + "grad_norm": 0.7160762548446655, + "learning_rate": 6.505951812798025e-06, + "loss": 0.8368, + "step": 14690 + }, + { + "epoch": 0.808575045407012, + "grad_norm": 0.7230852842330933, + "learning_rate": 6.505538468963763e-06, + "loss": 0.6908, + "step": 14691 + }, + { + "epoch": 0.8086300842093676, + "grad_norm": 0.6912978887557983, + "learning_rate": 6.505125113814278e-06, + "loss": 0.6716, + "step": 14692 + }, + { + "epoch": 0.8086851230117232, + "grad_norm": 0.6745109558105469, + "learning_rate": 6.504711747352677e-06, + "loss": 0.7119, + "step": 14693 + }, + { + "epoch": 0.8087401618140789, + "grad_norm": 0.678657054901123, + "learning_rate": 6.5042983695820624e-06, + "loss": 0.7548, + "step": 14694 + }, + { + "epoch": 0.8087952006164346, + "grad_norm": 0.7501665949821472, + "learning_rate": 6.503884980505546e-06, + "loss": 0.7493, + "step": 14695 + }, + { + "epoch": 0.8088502394187902, + "grad_norm": 0.6181747317314148, + "learning_rate": 6.503471580126232e-06, + "loss": 0.7217, + "step": 14696 + }, + { + "epoch": 0.8089052782211459, + "grad_norm": 0.6548559069633484, + "learning_rate": 6.5030581684472295e-06, + "loss": 0.7448, + "step": 14697 + }, + { + "epoch": 0.8089603170235016, + "grad_norm": 0.7716642022132874, + "learning_rate": 6.5026447454716426e-06, + "loss": 0.8794, + "step": 14698 + }, + { + "epoch": 0.8090153558258573, + "grad_norm": 0.861995279788971, + "learning_rate": 6.502231311202581e-06, + "loss": 0.7839, + "step": 14699 + }, + { + "epoch": 0.8090703946282128, + "grad_norm": 0.796821117401123, + "learning_rate": 6.501817865643149e-06, + "loss": 0.8541, + "step": 14700 + }, + { + "epoch": 0.8091254334305685, + "grad_norm": 0.6995296478271484, + "learning_rate": 6.501404408796457e-06, + "loss": 0.677, + "step": 14701 + }, + { + "epoch": 0.8091804722329242, + "grad_norm": 0.6681582927703857, + "learning_rate": 6.500990940665611e-06, + "loss": 0.7754, + "step": 14702 + }, + { + "epoch": 0.8092355110352799, + "grad_norm": 0.5945298671722412, + "learning_rate": 6.50057746125372e-06, + "loss": 0.6762, + "step": 14703 + }, + { + "epoch": 0.8092905498376355, + "grad_norm": 0.672554612159729, + "learning_rate": 6.500163970563889e-06, + "loss": 0.6967, + "step": 14704 + }, + { + "epoch": 0.8093455886399912, + "grad_norm": 0.6375272870063782, + "learning_rate": 6.499750468599227e-06, + "loss": 0.7291, + "step": 14705 + }, + { + "epoch": 0.8094006274423469, + "grad_norm": 0.6369407773017883, + "learning_rate": 6.499336955362844e-06, + "loss": 0.6939, + "step": 14706 + }, + { + "epoch": 0.8094556662447026, + "grad_norm": 0.6497664451599121, + "learning_rate": 6.498923430857844e-06, + "loss": 0.7207, + "step": 14707 + }, + { + "epoch": 0.8095107050470581, + "grad_norm": 0.7345920205116272, + "learning_rate": 6.498509895087337e-06, + "loss": 0.8373, + "step": 14708 + }, + { + "epoch": 0.8095657438494138, + "grad_norm": 0.6824862957000732, + "learning_rate": 6.4980963480544324e-06, + "loss": 0.7531, + "step": 14709 + }, + { + "epoch": 0.8096207826517695, + "grad_norm": 0.7067939639091492, + "learning_rate": 6.497682789762236e-06, + "loss": 0.6951, + "step": 14710 + }, + { + "epoch": 0.8096758214541252, + "grad_norm": 0.6856693625450134, + "learning_rate": 6.497269220213856e-06, + "loss": 0.7264, + "step": 14711 + }, + { + "epoch": 0.8097308602564808, + "grad_norm": 0.6881466507911682, + "learning_rate": 6.4968556394124e-06, + "loss": 0.7837, + "step": 14712 + }, + { + "epoch": 0.8097858990588365, + "grad_norm": 0.6211455464363098, + "learning_rate": 6.49644204736098e-06, + "loss": 0.7278, + "step": 14713 + }, + { + "epoch": 0.8098409378611922, + "grad_norm": 0.688604474067688, + "learning_rate": 6.496028444062701e-06, + "loss": 0.7786, + "step": 14714 + }, + { + "epoch": 0.8098959766635478, + "grad_norm": 0.6615015268325806, + "learning_rate": 6.495614829520673e-06, + "loss": 0.7014, + "step": 14715 + }, + { + "epoch": 0.8099510154659034, + "grad_norm": 0.712661623954773, + "learning_rate": 6.495201203738004e-06, + "loss": 0.6792, + "step": 14716 + }, + { + "epoch": 0.8100060542682591, + "grad_norm": 0.6737191677093506, + "learning_rate": 6.494787566717803e-06, + "loss": 0.7937, + "step": 14717 + }, + { + "epoch": 0.8100610930706148, + "grad_norm": 0.8007351160049438, + "learning_rate": 6.494373918463179e-06, + "loss": 0.8367, + "step": 14718 + }, + { + "epoch": 0.8101161318729705, + "grad_norm": 0.7500883936882019, + "learning_rate": 6.493960258977241e-06, + "loss": 0.8102, + "step": 14719 + }, + { + "epoch": 0.8101711706753261, + "grad_norm": 0.7605966925621033, + "learning_rate": 6.493546588263097e-06, + "loss": 0.8316, + "step": 14720 + }, + { + "epoch": 0.8102262094776818, + "grad_norm": 0.746762216091156, + "learning_rate": 6.493132906323858e-06, + "loss": 0.7765, + "step": 14721 + }, + { + "epoch": 0.8102812482800374, + "grad_norm": 0.6034676432609558, + "learning_rate": 6.49271921316263e-06, + "loss": 0.7109, + "step": 14722 + }, + { + "epoch": 0.8103362870823931, + "grad_norm": 0.6965274810791016, + "learning_rate": 6.492305508782525e-06, + "loss": 0.8156, + "step": 14723 + }, + { + "epoch": 0.8103913258847487, + "grad_norm": 0.6813820004463196, + "learning_rate": 6.4918917931866495e-06, + "loss": 0.7016, + "step": 14724 + }, + { + "epoch": 0.8104463646871044, + "grad_norm": 0.8055655360221863, + "learning_rate": 6.491478066378117e-06, + "loss": 0.7837, + "step": 14725 + }, + { + "epoch": 0.8105014034894601, + "grad_norm": 0.6131647229194641, + "learning_rate": 6.491064328360033e-06, + "loss": 0.6716, + "step": 14726 + }, + { + "epoch": 0.8105564422918158, + "grad_norm": 0.6845986247062683, + "learning_rate": 6.49065057913551e-06, + "loss": 0.8112, + "step": 14727 + }, + { + "epoch": 0.8106114810941714, + "grad_norm": 0.6867175698280334, + "learning_rate": 6.490236818707653e-06, + "loss": 0.7953, + "step": 14728 + }, + { + "epoch": 0.810666519896527, + "grad_norm": 0.7170011401176453, + "learning_rate": 6.489823047079578e-06, + "loss": 0.8108, + "step": 14729 + }, + { + "epoch": 0.8107215586988827, + "grad_norm": 0.6280927658081055, + "learning_rate": 6.489409264254393e-06, + "loss": 0.6807, + "step": 14730 + }, + { + "epoch": 0.8107765975012384, + "grad_norm": 0.8344630002975464, + "learning_rate": 6.488995470235204e-06, + "loss": 0.7555, + "step": 14731 + }, + { + "epoch": 0.810831636303594, + "grad_norm": 0.6674200296401978, + "learning_rate": 6.488581665025125e-06, + "loss": 0.5732, + "step": 14732 + }, + { + "epoch": 0.8108866751059497, + "grad_norm": 0.7843313217163086, + "learning_rate": 6.4881678486272646e-06, + "loss": 0.6689, + "step": 14733 + }, + { + "epoch": 0.8109417139083054, + "grad_norm": 0.6951878666877747, + "learning_rate": 6.487754021044732e-06, + "loss": 0.8005, + "step": 14734 + }, + { + "epoch": 0.810996752710661, + "grad_norm": 0.7773714065551758, + "learning_rate": 6.487340182280639e-06, + "loss": 0.8151, + "step": 14735 + }, + { + "epoch": 0.8110517915130167, + "grad_norm": 0.824998140335083, + "learning_rate": 6.486926332338095e-06, + "loss": 0.7947, + "step": 14736 + }, + { + "epoch": 0.8111068303153723, + "grad_norm": 0.6411730647087097, + "learning_rate": 6.486512471220212e-06, + "loss": 0.7272, + "step": 14737 + }, + { + "epoch": 0.811161869117728, + "grad_norm": 0.6758518815040588, + "learning_rate": 6.486098598930097e-06, + "loss": 0.6676, + "step": 14738 + }, + { + "epoch": 0.8112169079200836, + "grad_norm": 0.7147762179374695, + "learning_rate": 6.485684715470866e-06, + "loss": 0.7796, + "step": 14739 + }, + { + "epoch": 0.8112719467224393, + "grad_norm": 0.7641217112541199, + "learning_rate": 6.485270820845623e-06, + "loss": 0.7943, + "step": 14740 + }, + { + "epoch": 0.811326985524795, + "grad_norm": 0.6947311162948608, + "learning_rate": 6.484856915057482e-06, + "loss": 0.7791, + "step": 14741 + }, + { + "epoch": 0.8113820243271507, + "grad_norm": 0.6781480312347412, + "learning_rate": 6.4844429981095565e-06, + "loss": 0.7399, + "step": 14742 + }, + { + "epoch": 0.8114370631295063, + "grad_norm": 0.6716181039810181, + "learning_rate": 6.484029070004953e-06, + "loss": 0.8111, + "step": 14743 + }, + { + "epoch": 0.8114921019318619, + "grad_norm": 0.8642836213111877, + "learning_rate": 6.4836151307467854e-06, + "loss": 0.756, + "step": 14744 + }, + { + "epoch": 0.8115471407342176, + "grad_norm": 0.5997880101203918, + "learning_rate": 6.483201180338163e-06, + "loss": 0.6043, + "step": 14745 + }, + { + "epoch": 0.8116021795365733, + "grad_norm": 0.7397846579551697, + "learning_rate": 6.4827872187821985e-06, + "loss": 0.848, + "step": 14746 + }, + { + "epoch": 0.8116572183389289, + "grad_norm": 0.7586305141448975, + "learning_rate": 6.482373246082001e-06, + "loss": 0.802, + "step": 14747 + }, + { + "epoch": 0.8117122571412846, + "grad_norm": 0.705182671546936, + "learning_rate": 6.4819592622406825e-06, + "loss": 0.7484, + "step": 14748 + }, + { + "epoch": 0.8117672959436403, + "grad_norm": 0.7092768549919128, + "learning_rate": 6.481545267261357e-06, + "loss": 0.7031, + "step": 14749 + }, + { + "epoch": 0.811822334745996, + "grad_norm": 0.6800800561904907, + "learning_rate": 6.4811312611471325e-06, + "loss": 0.7253, + "step": 14750 + }, + { + "epoch": 0.8118773735483515, + "grad_norm": 0.6862359642982483, + "learning_rate": 6.4807172439011215e-06, + "loss": 0.818, + "step": 14751 + }, + { + "epoch": 0.8119324123507072, + "grad_norm": 0.6928552389144897, + "learning_rate": 6.480303215526436e-06, + "loss": 0.7459, + "step": 14752 + }, + { + "epoch": 0.8119874511530629, + "grad_norm": 0.6869228482246399, + "learning_rate": 6.479889176026189e-06, + "loss": 0.7024, + "step": 14753 + }, + { + "epoch": 0.8120424899554186, + "grad_norm": 0.7036190032958984, + "learning_rate": 6.479475125403489e-06, + "loss": 0.766, + "step": 14754 + }, + { + "epoch": 0.8120975287577742, + "grad_norm": 0.6574180722236633, + "learning_rate": 6.479061063661452e-06, + "loss": 0.7355, + "step": 14755 + }, + { + "epoch": 0.8121525675601299, + "grad_norm": 0.6424534916877747, + "learning_rate": 6.478646990803188e-06, + "loss": 0.6837, + "step": 14756 + }, + { + "epoch": 0.8122076063624856, + "grad_norm": 0.6922320127487183, + "learning_rate": 6.478232906831808e-06, + "loss": 0.7535, + "step": 14757 + }, + { + "epoch": 0.8122626451648413, + "grad_norm": 0.6424705386161804, + "learning_rate": 6.477818811750426e-06, + "loss": 0.691, + "step": 14758 + }, + { + "epoch": 0.8123176839671968, + "grad_norm": 0.6180749535560608, + "learning_rate": 6.4774047055621525e-06, + "loss": 0.6944, + "step": 14759 + }, + { + "epoch": 0.8123727227695525, + "grad_norm": 0.8718746900558472, + "learning_rate": 6.4769905882701e-06, + "loss": 0.89, + "step": 14760 + }, + { + "epoch": 0.8124277615719082, + "grad_norm": 0.6664311289787292, + "learning_rate": 6.476576459877384e-06, + "loss": 0.7144, + "step": 14761 + }, + { + "epoch": 0.8124828003742639, + "grad_norm": 0.6547374129295349, + "learning_rate": 6.476162320387112e-06, + "loss": 0.7292, + "step": 14762 + }, + { + "epoch": 0.8125378391766195, + "grad_norm": 0.7387503385543823, + "learning_rate": 6.475748169802401e-06, + "loss": 0.7388, + "step": 14763 + }, + { + "epoch": 0.8125928779789752, + "grad_norm": 0.6013749241828918, + "learning_rate": 6.475334008126361e-06, + "loss": 0.6853, + "step": 14764 + }, + { + "epoch": 0.8126479167813309, + "grad_norm": 0.6720583438873291, + "learning_rate": 6.474919835362105e-06, + "loss": 0.7392, + "step": 14765 + }, + { + "epoch": 0.8127029555836865, + "grad_norm": 0.6651661992073059, + "learning_rate": 6.474505651512748e-06, + "loss": 0.7586, + "step": 14766 + }, + { + "epoch": 0.8127579943860421, + "grad_norm": 0.7653207182884216, + "learning_rate": 6.474091456581401e-06, + "loss": 0.9182, + "step": 14767 + }, + { + "epoch": 0.8128130331883978, + "grad_norm": 0.6322795152664185, + "learning_rate": 6.473677250571176e-06, + "loss": 0.6954, + "step": 14768 + }, + { + "epoch": 0.8128680719907535, + "grad_norm": 0.7423616647720337, + "learning_rate": 6.4732630334851885e-06, + "loss": 0.748, + "step": 14769 + }, + { + "epoch": 0.8129231107931092, + "grad_norm": 0.5989160537719727, + "learning_rate": 6.472848805326549e-06, + "loss": 0.6571, + "step": 14770 + }, + { + "epoch": 0.8129781495954648, + "grad_norm": 0.695566713809967, + "learning_rate": 6.472434566098373e-06, + "loss": 0.6936, + "step": 14771 + }, + { + "epoch": 0.8130331883978205, + "grad_norm": 0.6993961930274963, + "learning_rate": 6.4720203158037734e-06, + "loss": 0.8283, + "step": 14772 + }, + { + "epoch": 0.8130882272001762, + "grad_norm": 0.6430020928382874, + "learning_rate": 6.471606054445861e-06, + "loss": 0.6882, + "step": 14773 + }, + { + "epoch": 0.8131432660025318, + "grad_norm": 0.6834734678268433, + "learning_rate": 6.471191782027754e-06, + "loss": 0.7519, + "step": 14774 + }, + { + "epoch": 0.8131983048048874, + "grad_norm": 0.679432213306427, + "learning_rate": 6.470777498552561e-06, + "loss": 0.7707, + "step": 14775 + }, + { + "epoch": 0.8132533436072431, + "grad_norm": 0.6929466128349304, + "learning_rate": 6.4703632040234e-06, + "loss": 0.7166, + "step": 14776 + }, + { + "epoch": 0.8133083824095988, + "grad_norm": 0.7033447623252869, + "learning_rate": 6.469948898443381e-06, + "loss": 0.7558, + "step": 14777 + }, + { + "epoch": 0.8133634212119544, + "grad_norm": 0.89338618516922, + "learning_rate": 6.469534581815621e-06, + "loss": 0.7829, + "step": 14778 + }, + { + "epoch": 0.8134184600143101, + "grad_norm": 0.7361789345741272, + "learning_rate": 6.469120254143233e-06, + "loss": 0.7885, + "step": 14779 + }, + { + "epoch": 0.8134734988166658, + "grad_norm": 0.7532172203063965, + "learning_rate": 6.468705915429329e-06, + "loss": 0.7791, + "step": 14780 + }, + { + "epoch": 0.8135285376190214, + "grad_norm": 0.7082527279853821, + "learning_rate": 6.468291565677025e-06, + "loss": 0.7809, + "step": 14781 + }, + { + "epoch": 0.813583576421377, + "grad_norm": 0.7854330539703369, + "learning_rate": 6.467877204889435e-06, + "loss": 0.8467, + "step": 14782 + }, + { + "epoch": 0.8136386152237327, + "grad_norm": 0.7649636268615723, + "learning_rate": 6.467462833069672e-06, + "loss": 0.7766, + "step": 14783 + }, + { + "epoch": 0.8136936540260884, + "grad_norm": 0.6293399930000305, + "learning_rate": 6.467048450220852e-06, + "loss": 0.7307, + "step": 14784 + }, + { + "epoch": 0.8137486928284441, + "grad_norm": 0.7131813764572144, + "learning_rate": 6.4666340563460874e-06, + "loss": 0.7614, + "step": 14785 + }, + { + "epoch": 0.8138037316307997, + "grad_norm": 0.6650925874710083, + "learning_rate": 6.466219651448496e-06, + "loss": 0.7576, + "step": 14786 + }, + { + "epoch": 0.8138587704331554, + "grad_norm": 0.8009011745452881, + "learning_rate": 6.4658052355311875e-06, + "loss": 0.7127, + "step": 14787 + }, + { + "epoch": 0.813913809235511, + "grad_norm": 1.009027123451233, + "learning_rate": 6.465390808597281e-06, + "loss": 0.7647, + "step": 14788 + }, + { + "epoch": 0.8139688480378667, + "grad_norm": 0.7495583891868591, + "learning_rate": 6.464976370649888e-06, + "loss": 0.7276, + "step": 14789 + }, + { + "epoch": 0.8140238868402223, + "grad_norm": 0.7181064486503601, + "learning_rate": 6.464561921692125e-06, + "loss": 0.687, + "step": 14790 + }, + { + "epoch": 0.814078925642578, + "grad_norm": 0.7480552196502686, + "learning_rate": 6.464147461727108e-06, + "loss": 0.7813, + "step": 14791 + }, + { + "epoch": 0.8141339644449337, + "grad_norm": 0.6699607968330383, + "learning_rate": 6.4637329907579506e-06, + "loss": 0.7364, + "step": 14792 + }, + { + "epoch": 0.8141890032472894, + "grad_norm": 0.7321322560310364, + "learning_rate": 6.463318508787767e-06, + "loss": 0.6799, + "step": 14793 + }, + { + "epoch": 0.814244042049645, + "grad_norm": 0.8992179036140442, + "learning_rate": 6.462904015819673e-06, + "loss": 0.7602, + "step": 14794 + }, + { + "epoch": 0.8142990808520006, + "grad_norm": 0.6949485540390015, + "learning_rate": 6.462489511856784e-06, + "loss": 0.6701, + "step": 14795 + }, + { + "epoch": 0.8143541196543563, + "grad_norm": 0.6367032527923584, + "learning_rate": 6.462074996902217e-06, + "loss": 0.7132, + "step": 14796 + }, + { + "epoch": 0.814409158456712, + "grad_norm": 0.6424476504325867, + "learning_rate": 6.461660470959084e-06, + "loss": 0.7111, + "step": 14797 + }, + { + "epoch": 0.8144641972590676, + "grad_norm": 0.6649259924888611, + "learning_rate": 6.4612459340305025e-06, + "loss": 0.6583, + "step": 14798 + }, + { + "epoch": 0.8145192360614233, + "grad_norm": 0.7781171798706055, + "learning_rate": 6.460831386119587e-06, + "loss": 0.8145, + "step": 14799 + }, + { + "epoch": 0.814574274863779, + "grad_norm": 0.7409094572067261, + "learning_rate": 6.460416827229455e-06, + "loss": 0.7559, + "step": 14800 + }, + { + "epoch": 0.8146293136661347, + "grad_norm": 1.2152613401412964, + "learning_rate": 6.46000225736322e-06, + "loss": 0.8263, + "step": 14801 + }, + { + "epoch": 0.8146843524684902, + "grad_norm": 0.7133356332778931, + "learning_rate": 6.459587676524e-06, + "loss": 0.7687, + "step": 14802 + }, + { + "epoch": 0.8147393912708459, + "grad_norm": 0.8576061129570007, + "learning_rate": 6.459173084714908e-06, + "loss": 0.8364, + "step": 14803 + }, + { + "epoch": 0.8147944300732016, + "grad_norm": 0.7701650857925415, + "learning_rate": 6.4587584819390634e-06, + "loss": 0.7768, + "step": 14804 + }, + { + "epoch": 0.8148494688755573, + "grad_norm": 0.6629199981689453, + "learning_rate": 6.45834386819958e-06, + "loss": 0.7338, + "step": 14805 + }, + { + "epoch": 0.8149045076779129, + "grad_norm": 0.6498340964317322, + "learning_rate": 6.457929243499574e-06, + "loss": 0.7241, + "step": 14806 + }, + { + "epoch": 0.8149595464802686, + "grad_norm": 0.7107635140419006, + "learning_rate": 6.457514607842164e-06, + "loss": 0.7999, + "step": 14807 + }, + { + "epoch": 0.8150145852826243, + "grad_norm": 0.8689384460449219, + "learning_rate": 6.457099961230462e-06, + "loss": 0.7882, + "step": 14808 + }, + { + "epoch": 0.81506962408498, + "grad_norm": 0.7050377726554871, + "learning_rate": 6.456685303667587e-06, + "loss": 0.8039, + "step": 14809 + }, + { + "epoch": 0.8151246628873355, + "grad_norm": 0.6171709895133972, + "learning_rate": 6.456270635156656e-06, + "loss": 0.6569, + "step": 14810 + }, + { + "epoch": 0.8151797016896912, + "grad_norm": 0.837285041809082, + "learning_rate": 6.455855955700785e-06, + "loss": 0.6529, + "step": 14811 + }, + { + "epoch": 0.8152347404920469, + "grad_norm": 0.7335891723632812, + "learning_rate": 6.45544126530309e-06, + "loss": 0.814, + "step": 14812 + }, + { + "epoch": 0.8152897792944026, + "grad_norm": 0.7217129468917847, + "learning_rate": 6.4550265639666864e-06, + "loss": 0.795, + "step": 14813 + }, + { + "epoch": 0.8153448180967582, + "grad_norm": 0.7292104959487915, + "learning_rate": 6.454611851694694e-06, + "loss": 0.7169, + "step": 14814 + }, + { + "epoch": 0.8153998568991139, + "grad_norm": 0.7190173864364624, + "learning_rate": 6.454197128490229e-06, + "loss": 0.8413, + "step": 14815 + }, + { + "epoch": 0.8154548957014696, + "grad_norm": 0.6679649949073792, + "learning_rate": 6.453782394356407e-06, + "loss": 0.6626, + "step": 14816 + }, + { + "epoch": 0.8155099345038253, + "grad_norm": 0.6829885244369507, + "learning_rate": 6.453367649296347e-06, + "loss": 0.6512, + "step": 14817 + }, + { + "epoch": 0.8155649733061808, + "grad_norm": 0.659461498260498, + "learning_rate": 6.452952893313163e-06, + "loss": 0.7271, + "step": 14818 + }, + { + "epoch": 0.8156200121085365, + "grad_norm": 0.6737749576568604, + "learning_rate": 6.452538126409975e-06, + "loss": 0.6882, + "step": 14819 + }, + { + "epoch": 0.8156750509108922, + "grad_norm": 0.7798036336898804, + "learning_rate": 6.452123348589899e-06, + "loss": 0.7214, + "step": 14820 + }, + { + "epoch": 0.8157300897132478, + "grad_norm": 0.6594774127006531, + "learning_rate": 6.451708559856051e-06, + "loss": 0.7611, + "step": 14821 + }, + { + "epoch": 0.8157851285156035, + "grad_norm": 0.6795164942741394, + "learning_rate": 6.451293760211552e-06, + "loss": 0.6825, + "step": 14822 + }, + { + "epoch": 0.8158401673179592, + "grad_norm": 0.8376501798629761, + "learning_rate": 6.450878949659517e-06, + "loss": 0.7898, + "step": 14823 + }, + { + "epoch": 0.8158952061203149, + "grad_norm": 0.6746712923049927, + "learning_rate": 6.450464128203064e-06, + "loss": 0.6771, + "step": 14824 + }, + { + "epoch": 0.8159502449226704, + "grad_norm": 0.7984384894371033, + "learning_rate": 6.450049295845311e-06, + "loss": 0.7326, + "step": 14825 + }, + { + "epoch": 0.8160052837250261, + "grad_norm": 0.8210996389389038, + "learning_rate": 6.449634452589376e-06, + "loss": 0.8194, + "step": 14826 + }, + { + "epoch": 0.8160603225273818, + "grad_norm": 0.7045891284942627, + "learning_rate": 6.449219598438376e-06, + "loss": 0.7683, + "step": 14827 + }, + { + "epoch": 0.8161153613297375, + "grad_norm": 0.7199337482452393, + "learning_rate": 6.448804733395431e-06, + "loss": 0.7125, + "step": 14828 + }, + { + "epoch": 0.8161704001320931, + "grad_norm": 0.8576976656913757, + "learning_rate": 6.448389857463655e-06, + "loss": 0.6744, + "step": 14829 + }, + { + "epoch": 0.8162254389344488, + "grad_norm": 0.6944701075553894, + "learning_rate": 6.4479749706461705e-06, + "loss": 0.7663, + "step": 14830 + }, + { + "epoch": 0.8162804777368045, + "grad_norm": 0.7436455488204956, + "learning_rate": 6.447560072946093e-06, + "loss": 0.7612, + "step": 14831 + }, + { + "epoch": 0.8163355165391601, + "grad_norm": 0.6023590564727783, + "learning_rate": 6.447145164366542e-06, + "loss": 0.7029, + "step": 14832 + }, + { + "epoch": 0.8163905553415157, + "grad_norm": 0.6720685362815857, + "learning_rate": 6.446730244910633e-06, + "loss": 0.7821, + "step": 14833 + }, + { + "epoch": 0.8164455941438714, + "grad_norm": 0.6359856128692627, + "learning_rate": 6.446315314581488e-06, + "loss": 0.7119, + "step": 14834 + }, + { + "epoch": 0.8165006329462271, + "grad_norm": 0.6796891689300537, + "learning_rate": 6.445900373382225e-06, + "loss": 0.7414, + "step": 14835 + }, + { + "epoch": 0.8165556717485828, + "grad_norm": 0.6865763068199158, + "learning_rate": 6.445485421315963e-06, + "loss": 0.7239, + "step": 14836 + }, + { + "epoch": 0.8166107105509384, + "grad_norm": 0.6696601510047913, + "learning_rate": 6.445070458385816e-06, + "loss": 0.6322, + "step": 14837 + }, + { + "epoch": 0.8166657493532941, + "grad_norm": 0.6800506711006165, + "learning_rate": 6.444655484594909e-06, + "loss": 0.7827, + "step": 14838 + }, + { + "epoch": 0.8167207881556497, + "grad_norm": 0.7590689063072205, + "learning_rate": 6.444240499946357e-06, + "loss": 0.7177, + "step": 14839 + }, + { + "epoch": 0.8167758269580054, + "grad_norm": 0.6692266464233398, + "learning_rate": 6.4438255044432805e-06, + "loss": 0.6631, + "step": 14840 + }, + { + "epoch": 0.816830865760361, + "grad_norm": 0.695164144039154, + "learning_rate": 6.443410498088798e-06, + "loss": 0.6953, + "step": 14841 + }, + { + "epoch": 0.8168859045627167, + "grad_norm": 0.6503697037696838, + "learning_rate": 6.442995480886028e-06, + "loss": 0.7868, + "step": 14842 + }, + { + "epoch": 0.8169409433650724, + "grad_norm": 0.6943323016166687, + "learning_rate": 6.442580452838091e-06, + "loss": 0.7464, + "step": 14843 + }, + { + "epoch": 0.8169959821674281, + "grad_norm": 0.7510622143745422, + "learning_rate": 6.442165413948105e-06, + "loss": 0.7984, + "step": 14844 + }, + { + "epoch": 0.8170510209697837, + "grad_norm": 0.6322263479232788, + "learning_rate": 6.441750364219189e-06, + "loss": 0.7693, + "step": 14845 + }, + { + "epoch": 0.8171060597721393, + "grad_norm": 0.681967556476593, + "learning_rate": 6.4413353036544646e-06, + "loss": 0.6781, + "step": 14846 + }, + { + "epoch": 0.817161098574495, + "grad_norm": 0.6799043416976929, + "learning_rate": 6.440920232257049e-06, + "loss": 0.7791, + "step": 14847 + }, + { + "epoch": 0.8172161373768507, + "grad_norm": 0.673652172088623, + "learning_rate": 6.440505150030064e-06, + "loss": 0.7099, + "step": 14848 + }, + { + "epoch": 0.8172711761792063, + "grad_norm": 0.755377471446991, + "learning_rate": 6.4400900569766255e-06, + "loss": 0.7292, + "step": 14849 + }, + { + "epoch": 0.817326214981562, + "grad_norm": 0.6099830269813538, + "learning_rate": 6.439674953099857e-06, + "loss": 0.7154, + "step": 14850 + }, + { + "epoch": 0.8173812537839177, + "grad_norm": 0.6330500841140747, + "learning_rate": 6.439259838402878e-06, + "loss": 0.6858, + "step": 14851 + }, + { + "epoch": 0.8174362925862734, + "grad_norm": 0.6727203726768494, + "learning_rate": 6.438844712888806e-06, + "loss": 0.7089, + "step": 14852 + }, + { + "epoch": 0.817491331388629, + "grad_norm": 0.7482651472091675, + "learning_rate": 6.438429576560763e-06, + "loss": 0.7065, + "step": 14853 + }, + { + "epoch": 0.8175463701909846, + "grad_norm": 0.6786343455314636, + "learning_rate": 6.438014429421868e-06, + "loss": 0.7049, + "step": 14854 + }, + { + "epoch": 0.8176014089933403, + "grad_norm": 0.6155980825424194, + "learning_rate": 6.437599271475241e-06, + "loss": 0.607, + "step": 14855 + }, + { + "epoch": 0.817656447795696, + "grad_norm": 0.6551154851913452, + "learning_rate": 6.437184102724003e-06, + "loss": 0.7022, + "step": 14856 + }, + { + "epoch": 0.8177114865980516, + "grad_norm": 0.6127358078956604, + "learning_rate": 6.436768923171273e-06, + "loss": 0.6827, + "step": 14857 + }, + { + "epoch": 0.8177665254004073, + "grad_norm": 0.6470245718955994, + "learning_rate": 6.436353732820175e-06, + "loss": 0.6877, + "step": 14858 + }, + { + "epoch": 0.817821564202763, + "grad_norm": 0.704667866230011, + "learning_rate": 6.435938531673825e-06, + "loss": 0.7223, + "step": 14859 + }, + { + "epoch": 0.8178766030051187, + "grad_norm": 0.6328873634338379, + "learning_rate": 6.435523319735345e-06, + "loss": 0.7181, + "step": 14860 + }, + { + "epoch": 0.8179316418074742, + "grad_norm": 0.6489065885543823, + "learning_rate": 6.435108097007856e-06, + "loss": 0.7597, + "step": 14861 + }, + { + "epoch": 0.8179866806098299, + "grad_norm": 0.6398639678955078, + "learning_rate": 6.43469286349448e-06, + "loss": 0.667, + "step": 14862 + }, + { + "epoch": 0.8180417194121856, + "grad_norm": 0.7615578770637512, + "learning_rate": 6.434277619198335e-06, + "loss": 0.8474, + "step": 14863 + }, + { + "epoch": 0.8180967582145412, + "grad_norm": 0.8604047894477844, + "learning_rate": 6.433862364122545e-06, + "loss": 0.7977, + "step": 14864 + }, + { + "epoch": 0.8181517970168969, + "grad_norm": 0.6157855987548828, + "learning_rate": 6.433447098270228e-06, + "loss": 0.6513, + "step": 14865 + }, + { + "epoch": 0.8182068358192526, + "grad_norm": 0.7052211761474609, + "learning_rate": 6.433031821644507e-06, + "loss": 0.7043, + "step": 14866 + }, + { + "epoch": 0.8182618746216083, + "grad_norm": 0.785987138748169, + "learning_rate": 6.432616534248503e-06, + "loss": 0.8722, + "step": 14867 + }, + { + "epoch": 0.8183169134239638, + "grad_norm": 0.7711461782455444, + "learning_rate": 6.432201236085336e-06, + "loss": 0.68, + "step": 14868 + }, + { + "epoch": 0.8183719522263195, + "grad_norm": 0.6299784183502197, + "learning_rate": 6.431785927158126e-06, + "loss": 0.7397, + "step": 14869 + }, + { + "epoch": 0.8184269910286752, + "grad_norm": 0.6292238235473633, + "learning_rate": 6.431370607469998e-06, + "loss": 0.7392, + "step": 14870 + }, + { + "epoch": 0.8184820298310309, + "grad_norm": 0.8696228861808777, + "learning_rate": 6.430955277024071e-06, + "loss": 0.884, + "step": 14871 + }, + { + "epoch": 0.8185370686333865, + "grad_norm": 0.6754364967346191, + "learning_rate": 6.430539935823469e-06, + "loss": 0.7122, + "step": 14872 + }, + { + "epoch": 0.8185921074357422, + "grad_norm": 0.6936547160148621, + "learning_rate": 6.4301245838713085e-06, + "loss": 0.7353, + "step": 14873 + }, + { + "epoch": 0.8186471462380979, + "grad_norm": 0.8840705156326294, + "learning_rate": 6.429709221170717e-06, + "loss": 0.7043, + "step": 14874 + }, + { + "epoch": 0.8187021850404536, + "grad_norm": 0.7349988222122192, + "learning_rate": 6.4292938477248135e-06, + "loss": 0.7861, + "step": 14875 + }, + { + "epoch": 0.8187572238428091, + "grad_norm": 0.697790801525116, + "learning_rate": 6.428878463536721e-06, + "loss": 0.8021, + "step": 14876 + }, + { + "epoch": 0.8188122626451648, + "grad_norm": 0.7873979806900024, + "learning_rate": 6.428463068609559e-06, + "loss": 0.7313, + "step": 14877 + }, + { + "epoch": 0.8188673014475205, + "grad_norm": 0.6542018055915833, + "learning_rate": 6.4280476629464505e-06, + "loss": 0.7811, + "step": 14878 + }, + { + "epoch": 0.8189223402498762, + "grad_norm": 0.7477063536643982, + "learning_rate": 6.427632246550519e-06, + "loss": 0.764, + "step": 14879 + }, + { + "epoch": 0.8189773790522318, + "grad_norm": 0.6456438302993774, + "learning_rate": 6.4272168194248855e-06, + "loss": 0.7517, + "step": 14880 + }, + { + "epoch": 0.8190324178545875, + "grad_norm": 0.699684202671051, + "learning_rate": 6.426801381572671e-06, + "loss": 0.7963, + "step": 14881 + }, + { + "epoch": 0.8190874566569432, + "grad_norm": 0.9158867001533508, + "learning_rate": 6.426385932997001e-06, + "loss": 0.8782, + "step": 14882 + }, + { + "epoch": 0.8191424954592988, + "grad_norm": 0.5998190641403198, + "learning_rate": 6.425970473700995e-06, + "loss": 0.6598, + "step": 14883 + }, + { + "epoch": 0.8191975342616544, + "grad_norm": 0.6674730777740479, + "learning_rate": 6.4255550036877775e-06, + "loss": 0.7232, + "step": 14884 + }, + { + "epoch": 0.8192525730640101, + "grad_norm": 0.6303582191467285, + "learning_rate": 6.42513952296047e-06, + "loss": 0.7614, + "step": 14885 + }, + { + "epoch": 0.8193076118663658, + "grad_norm": 0.6255910992622375, + "learning_rate": 6.424724031522195e-06, + "loss": 0.7052, + "step": 14886 + }, + { + "epoch": 0.8193626506687215, + "grad_norm": 0.6610854268074036, + "learning_rate": 6.424308529376075e-06, + "loss": 0.7403, + "step": 14887 + }, + { + "epoch": 0.8194176894710771, + "grad_norm": 0.6758664846420288, + "learning_rate": 6.4238930165252355e-06, + "loss": 0.7603, + "step": 14888 + }, + { + "epoch": 0.8194727282734328, + "grad_norm": 0.6897797584533691, + "learning_rate": 6.423477492972796e-06, + "loss": 0.7194, + "step": 14889 + }, + { + "epoch": 0.8195277670757884, + "grad_norm": 0.7007622718811035, + "learning_rate": 6.42306195872188e-06, + "loss": 0.7905, + "step": 14890 + }, + { + "epoch": 0.8195828058781441, + "grad_norm": 0.7482092976570129, + "learning_rate": 6.422646413775613e-06, + "loss": 0.7809, + "step": 14891 + }, + { + "epoch": 0.8196378446804997, + "grad_norm": 0.9551613926887512, + "learning_rate": 6.422230858137115e-06, + "loss": 0.8559, + "step": 14892 + }, + { + "epoch": 0.8196928834828554, + "grad_norm": 0.6831939220428467, + "learning_rate": 6.42181529180951e-06, + "loss": 0.7867, + "step": 14893 + }, + { + "epoch": 0.8197479222852111, + "grad_norm": 1.446377158164978, + "learning_rate": 6.421399714795923e-06, + "loss": 0.8745, + "step": 14894 + }, + { + "epoch": 0.8198029610875668, + "grad_norm": 0.6738638877868652, + "learning_rate": 6.420984127099475e-06, + "loss": 0.727, + "step": 14895 + }, + { + "epoch": 0.8198579998899224, + "grad_norm": 0.7388872504234314, + "learning_rate": 6.420568528723292e-06, + "loss": 0.7041, + "step": 14896 + }, + { + "epoch": 0.819913038692278, + "grad_norm": 0.6977630853652954, + "learning_rate": 6.420152919670495e-06, + "loss": 0.7944, + "step": 14897 + }, + { + "epoch": 0.8199680774946337, + "grad_norm": 0.6300190091133118, + "learning_rate": 6.41973729994421e-06, + "loss": 0.6879, + "step": 14898 + }, + { + "epoch": 0.8200231162969894, + "grad_norm": 0.6350599527359009, + "learning_rate": 6.419321669547559e-06, + "loss": 0.6725, + "step": 14899 + }, + { + "epoch": 0.820078155099345, + "grad_norm": 0.8604453206062317, + "learning_rate": 6.418906028483667e-06, + "loss": 0.7706, + "step": 14900 + }, + { + "epoch": 0.8201331939017007, + "grad_norm": 0.6574103236198425, + "learning_rate": 6.418490376755656e-06, + "loss": 0.7008, + "step": 14901 + }, + { + "epoch": 0.8201882327040564, + "grad_norm": 0.706132173538208, + "learning_rate": 6.418074714366651e-06, + "loss": 0.7608, + "step": 14902 + }, + { + "epoch": 0.8202432715064121, + "grad_norm": 1.155480146408081, + "learning_rate": 6.417659041319777e-06, + "loss": 0.6893, + "step": 14903 + }, + { + "epoch": 0.8202983103087677, + "grad_norm": 0.8497835397720337, + "learning_rate": 6.417243357618157e-06, + "loss": 0.6889, + "step": 14904 + }, + { + "epoch": 0.8203533491111233, + "grad_norm": 0.9319966435432434, + "learning_rate": 6.416827663264915e-06, + "loss": 0.8098, + "step": 14905 + }, + { + "epoch": 0.820408387913479, + "grad_norm": 0.744888186454773, + "learning_rate": 6.4164119582631745e-06, + "loss": 0.7871, + "step": 14906 + }, + { + "epoch": 0.8204634267158346, + "grad_norm": 0.6928347945213318, + "learning_rate": 6.415996242616063e-06, + "loss": 0.7693, + "step": 14907 + }, + { + "epoch": 0.8205184655181903, + "grad_norm": 0.7455456852912903, + "learning_rate": 6.415580516326701e-06, + "loss": 0.6475, + "step": 14908 + }, + { + "epoch": 0.820573504320546, + "grad_norm": 0.6823583245277405, + "learning_rate": 6.415164779398215e-06, + "loss": 0.7223, + "step": 14909 + }, + { + "epoch": 0.8206285431229017, + "grad_norm": 0.6989970207214355, + "learning_rate": 6.414749031833729e-06, + "loss": 0.8203, + "step": 14910 + }, + { + "epoch": 0.8206835819252573, + "grad_norm": 0.6026825308799744, + "learning_rate": 6.414333273636369e-06, + "loss": 0.6307, + "step": 14911 + }, + { + "epoch": 0.8207386207276129, + "grad_norm": 0.6102367639541626, + "learning_rate": 6.413917504809258e-06, + "loss": 0.7049, + "step": 14912 + }, + { + "epoch": 0.8207936595299686, + "grad_norm": 0.6658119559288025, + "learning_rate": 6.4135017253555225e-06, + "loss": 0.7541, + "step": 14913 + }, + { + "epoch": 0.8208486983323243, + "grad_norm": 0.7272284626960754, + "learning_rate": 6.413085935278286e-06, + "loss": 0.7581, + "step": 14914 + }, + { + "epoch": 0.8209037371346799, + "grad_norm": 0.7826990485191345, + "learning_rate": 6.412670134580674e-06, + "loss": 0.8121, + "step": 14915 + }, + { + "epoch": 0.8209587759370356, + "grad_norm": 0.5845723748207092, + "learning_rate": 6.412254323265811e-06, + "loss": 0.5921, + "step": 14916 + }, + { + "epoch": 0.8210138147393913, + "grad_norm": 0.655577540397644, + "learning_rate": 6.411838501336823e-06, + "loss": 0.7694, + "step": 14917 + }, + { + "epoch": 0.821068853541747, + "grad_norm": 0.6722497940063477, + "learning_rate": 6.4114226687968325e-06, + "loss": 0.6377, + "step": 14918 + }, + { + "epoch": 0.8211238923441025, + "grad_norm": 0.713169276714325, + "learning_rate": 6.41100682564897e-06, + "loss": 0.7328, + "step": 14919 + }, + { + "epoch": 0.8211789311464582, + "grad_norm": 0.6004113554954529, + "learning_rate": 6.410590971896357e-06, + "loss": 0.6564, + "step": 14920 + }, + { + "epoch": 0.8212339699488139, + "grad_norm": 0.6541520953178406, + "learning_rate": 6.410175107542119e-06, + "loss": 0.7063, + "step": 14921 + }, + { + "epoch": 0.8212890087511696, + "grad_norm": 0.7937784194946289, + "learning_rate": 6.409759232589383e-06, + "loss": 0.7516, + "step": 14922 + }, + { + "epoch": 0.8213440475535252, + "grad_norm": 0.7017408013343811, + "learning_rate": 6.409343347041274e-06, + "loss": 0.6846, + "step": 14923 + }, + { + "epoch": 0.8213990863558809, + "grad_norm": 0.6233413815498352, + "learning_rate": 6.408927450900917e-06, + "loss": 0.6655, + "step": 14924 + }, + { + "epoch": 0.8214541251582366, + "grad_norm": 0.93160480260849, + "learning_rate": 6.4085115441714396e-06, + "loss": 0.7461, + "step": 14925 + }, + { + "epoch": 0.8215091639605923, + "grad_norm": 0.6075658202171326, + "learning_rate": 6.4080956268559655e-06, + "loss": 0.705, + "step": 14926 + }, + { + "epoch": 0.8215642027629478, + "grad_norm": 0.6212051510810852, + "learning_rate": 6.407679698957623e-06, + "loss": 0.6943, + "step": 14927 + }, + { + "epoch": 0.8216192415653035, + "grad_norm": 0.8143971562385559, + "learning_rate": 6.407263760479536e-06, + "loss": 0.6918, + "step": 14928 + }, + { + "epoch": 0.8216742803676592, + "grad_norm": 0.6851963996887207, + "learning_rate": 6.406847811424831e-06, + "loss": 0.7849, + "step": 14929 + }, + { + "epoch": 0.8217293191700149, + "grad_norm": 0.7047909498214722, + "learning_rate": 6.406431851796633e-06, + "loss": 0.7364, + "step": 14930 + }, + { + "epoch": 0.8217843579723705, + "grad_norm": 0.7377674579620361, + "learning_rate": 6.406015881598071e-06, + "loss": 0.7413, + "step": 14931 + }, + { + "epoch": 0.8218393967747262, + "grad_norm": 0.7188243269920349, + "learning_rate": 6.405599900832271e-06, + "loss": 0.8051, + "step": 14932 + }, + { + "epoch": 0.8218944355770819, + "grad_norm": 0.7588842511177063, + "learning_rate": 6.4051839095023575e-06, + "loss": 0.7687, + "step": 14933 + }, + { + "epoch": 0.8219494743794376, + "grad_norm": 0.6396436095237732, + "learning_rate": 6.404767907611457e-06, + "loss": 0.7516, + "step": 14934 + }, + { + "epoch": 0.8220045131817931, + "grad_norm": 0.6896073818206787, + "learning_rate": 6.404351895162698e-06, + "loss": 0.7904, + "step": 14935 + }, + { + "epoch": 0.8220595519841488, + "grad_norm": 0.7475640773773193, + "learning_rate": 6.403935872159206e-06, + "loss": 0.8325, + "step": 14936 + }, + { + "epoch": 0.8221145907865045, + "grad_norm": 0.6456442475318909, + "learning_rate": 6.403519838604107e-06, + "loss": 0.7685, + "step": 14937 + }, + { + "epoch": 0.8221696295888602, + "grad_norm": 0.6446966528892517, + "learning_rate": 6.40310379450053e-06, + "loss": 0.731, + "step": 14938 + }, + { + "epoch": 0.8222246683912158, + "grad_norm": 0.7744176983833313, + "learning_rate": 6.4026877398515995e-06, + "loss": 0.7975, + "step": 14939 + }, + { + "epoch": 0.8222797071935715, + "grad_norm": 0.6441214680671692, + "learning_rate": 6.402271674660444e-06, + "loss": 0.7386, + "step": 14940 + }, + { + "epoch": 0.8223347459959272, + "grad_norm": 0.6788361072540283, + "learning_rate": 6.40185559893019e-06, + "loss": 0.7664, + "step": 14941 + }, + { + "epoch": 0.8223897847982828, + "grad_norm": 0.6565073132514954, + "learning_rate": 6.4014395126639624e-06, + "loss": 0.6716, + "step": 14942 + }, + { + "epoch": 0.8224448236006384, + "grad_norm": 0.6475300788879395, + "learning_rate": 6.401023415864893e-06, + "loss": 0.6887, + "step": 14943 + }, + { + "epoch": 0.8224998624029941, + "grad_norm": 0.7058338522911072, + "learning_rate": 6.400607308536107e-06, + "loss": 0.7248, + "step": 14944 + }, + { + "epoch": 0.8225549012053498, + "grad_norm": 0.7184485197067261, + "learning_rate": 6.4001911906807305e-06, + "loss": 0.693, + "step": 14945 + }, + { + "epoch": 0.8226099400077055, + "grad_norm": 0.6280504465103149, + "learning_rate": 6.399775062301891e-06, + "loss": 0.6776, + "step": 14946 + }, + { + "epoch": 0.8226649788100611, + "grad_norm": 0.6995168328285217, + "learning_rate": 6.399358923402716e-06, + "loss": 0.7536, + "step": 14947 + }, + { + "epoch": 0.8227200176124168, + "grad_norm": 0.7770118713378906, + "learning_rate": 6.398942773986337e-06, + "loss": 0.6966, + "step": 14948 + }, + { + "epoch": 0.8227750564147724, + "grad_norm": 0.6947488188743591, + "learning_rate": 6.398526614055876e-06, + "loss": 0.7317, + "step": 14949 + }, + { + "epoch": 0.822830095217128, + "grad_norm": 0.7234527468681335, + "learning_rate": 6.3981104436144645e-06, + "loss": 0.6495, + "step": 14950 + }, + { + "epoch": 0.8228851340194837, + "grad_norm": 0.6872434020042419, + "learning_rate": 6.3976942626652295e-06, + "loss": 0.651, + "step": 14951 + }, + { + "epoch": 0.8229401728218394, + "grad_norm": 0.6762012243270874, + "learning_rate": 6.397278071211298e-06, + "loss": 0.7115, + "step": 14952 + }, + { + "epoch": 0.8229952116241951, + "grad_norm": 0.7007278800010681, + "learning_rate": 6.396861869255799e-06, + "loss": 0.717, + "step": 14953 + }, + { + "epoch": 0.8230502504265507, + "grad_norm": 0.7403082251548767, + "learning_rate": 6.396445656801859e-06, + "loss": 0.846, + "step": 14954 + }, + { + "epoch": 0.8231052892289064, + "grad_norm": 0.688758373260498, + "learning_rate": 6.396029433852609e-06, + "loss": 0.7871, + "step": 14955 + }, + { + "epoch": 0.823160328031262, + "grad_norm": 0.7264360189437866, + "learning_rate": 6.395613200411173e-06, + "loss": 0.7803, + "step": 14956 + }, + { + "epoch": 0.8232153668336177, + "grad_norm": 0.6858585476875305, + "learning_rate": 6.395196956480683e-06, + "loss": 0.6595, + "step": 14957 + }, + { + "epoch": 0.8232704056359733, + "grad_norm": 0.7834211587905884, + "learning_rate": 6.394780702064266e-06, + "loss": 0.7689, + "step": 14958 + }, + { + "epoch": 0.823325444438329, + "grad_norm": 0.6933274865150452, + "learning_rate": 6.394364437165052e-06, + "loss": 0.758, + "step": 14959 + }, + { + "epoch": 0.8233804832406847, + "grad_norm": 0.7490070462226868, + "learning_rate": 6.3939481617861664e-06, + "loss": 0.8106, + "step": 14960 + }, + { + "epoch": 0.8234355220430404, + "grad_norm": 0.5586501955986023, + "learning_rate": 6.3935318759307405e-06, + "loss": 0.6207, + "step": 14961 + }, + { + "epoch": 0.823490560845396, + "grad_norm": 0.6999693512916565, + "learning_rate": 6.393115579601902e-06, + "loss": 0.7787, + "step": 14962 + }, + { + "epoch": 0.8235455996477516, + "grad_norm": 1.0214177370071411, + "learning_rate": 6.392699272802779e-06, + "loss": 0.6444, + "step": 14963 + }, + { + "epoch": 0.8236006384501073, + "grad_norm": 0.7808836698532104, + "learning_rate": 6.392282955536502e-06, + "loss": 0.7537, + "step": 14964 + }, + { + "epoch": 0.823655677252463, + "grad_norm": 0.6825253963470459, + "learning_rate": 6.391866627806198e-06, + "loss": 0.7346, + "step": 14965 + }, + { + "epoch": 0.8237107160548186, + "grad_norm": 0.6105558276176453, + "learning_rate": 6.391450289614998e-06, + "loss": 0.6631, + "step": 14966 + }, + { + "epoch": 0.8237657548571743, + "grad_norm": 0.721986711025238, + "learning_rate": 6.391033940966029e-06, + "loss": 0.8638, + "step": 14967 + }, + { + "epoch": 0.82382079365953, + "grad_norm": 0.6226428747177124, + "learning_rate": 6.390617581862421e-06, + "loss": 0.7291, + "step": 14968 + }, + { + "epoch": 0.8238758324618857, + "grad_norm": 0.7403777241706848, + "learning_rate": 6.390201212307305e-06, + "loss": 0.7417, + "step": 14969 + }, + { + "epoch": 0.8239308712642412, + "grad_norm": 0.7188371419906616, + "learning_rate": 6.389784832303808e-06, + "loss": 0.757, + "step": 14970 + }, + { + "epoch": 0.8239859100665969, + "grad_norm": 0.8741163611412048, + "learning_rate": 6.389368441855061e-06, + "loss": 0.7264, + "step": 14971 + }, + { + "epoch": 0.8240409488689526, + "grad_norm": 0.7092788219451904, + "learning_rate": 6.388952040964192e-06, + "loss": 0.731, + "step": 14972 + }, + { + "epoch": 0.8240959876713083, + "grad_norm": 0.9291765689849854, + "learning_rate": 6.388535629634331e-06, + "loss": 0.7964, + "step": 14973 + }, + { + "epoch": 0.8241510264736639, + "grad_norm": 0.6140535473823547, + "learning_rate": 6.388119207868608e-06, + "loss": 0.7099, + "step": 14974 + }, + { + "epoch": 0.8242060652760196, + "grad_norm": 0.654778778553009, + "learning_rate": 6.387702775670154e-06, + "loss": 0.6667, + "step": 14975 + }, + { + "epoch": 0.8242611040783753, + "grad_norm": 0.7221185564994812, + "learning_rate": 6.387286333042095e-06, + "loss": 0.7533, + "step": 14976 + }, + { + "epoch": 0.824316142880731, + "grad_norm": 0.6680133938789368, + "learning_rate": 6.386869879987565e-06, + "loss": 0.6404, + "step": 14977 + }, + { + "epoch": 0.8243711816830865, + "grad_norm": 0.7067292928695679, + "learning_rate": 6.386453416509691e-06, + "loss": 0.8493, + "step": 14978 + }, + { + "epoch": 0.8244262204854422, + "grad_norm": 0.6279785633087158, + "learning_rate": 6.386036942611605e-06, + "loss": 0.7465, + "step": 14979 + }, + { + "epoch": 0.8244812592877979, + "grad_norm": 0.7184332013130188, + "learning_rate": 6.385620458296438e-06, + "loss": 0.738, + "step": 14980 + }, + { + "epoch": 0.8245362980901536, + "grad_norm": 0.7318315505981445, + "learning_rate": 6.385203963567316e-06, + "loss": 0.7409, + "step": 14981 + }, + { + "epoch": 0.8245913368925092, + "grad_norm": 0.6848355531692505, + "learning_rate": 6.384787458427372e-06, + "loss": 0.7343, + "step": 14982 + }, + { + "epoch": 0.8246463756948649, + "grad_norm": 0.7097738981246948, + "learning_rate": 6.384370942879736e-06, + "loss": 0.817, + "step": 14983 + }, + { + "epoch": 0.8247014144972206, + "grad_norm": 0.6933857798576355, + "learning_rate": 6.38395441692754e-06, + "loss": 0.7356, + "step": 14984 + }, + { + "epoch": 0.8247564532995763, + "grad_norm": 0.6631865501403809, + "learning_rate": 6.383537880573913e-06, + "loss": 0.752, + "step": 14985 + }, + { + "epoch": 0.8248114921019318, + "grad_norm": 0.6564633846282959, + "learning_rate": 6.3831213338219855e-06, + "loss": 0.7755, + "step": 14986 + }, + { + "epoch": 0.8248665309042875, + "grad_norm": 0.6518037915229797, + "learning_rate": 6.382704776674887e-06, + "loss": 0.7185, + "step": 14987 + }, + { + "epoch": 0.8249215697066432, + "grad_norm": 0.7074370384216309, + "learning_rate": 6.382288209135752e-06, + "loss": 0.7632, + "step": 14988 + }, + { + "epoch": 0.8249766085089989, + "grad_norm": 0.7034205198287964, + "learning_rate": 6.381871631207707e-06, + "loss": 0.8234, + "step": 14989 + }, + { + "epoch": 0.8250316473113545, + "grad_norm": 0.7635502815246582, + "learning_rate": 6.381455042893884e-06, + "loss": 0.7847, + "step": 14990 + }, + { + "epoch": 0.8250866861137102, + "grad_norm": 0.7682950496673584, + "learning_rate": 6.381038444197416e-06, + "loss": 0.6815, + "step": 14991 + }, + { + "epoch": 0.8251417249160659, + "grad_norm": 0.7713856101036072, + "learning_rate": 6.380621835121432e-06, + "loss": 0.7437, + "step": 14992 + }, + { + "epoch": 0.8251967637184214, + "grad_norm": 0.7955800294876099, + "learning_rate": 6.380205215669064e-06, + "loss": 0.876, + "step": 14993 + }, + { + "epoch": 0.8252518025207771, + "grad_norm": 0.6979825496673584, + "learning_rate": 6.379788585843443e-06, + "loss": 0.7018, + "step": 14994 + }, + { + "epoch": 0.8253068413231328, + "grad_norm": 0.6413466930389404, + "learning_rate": 6.379371945647701e-06, + "loss": 0.7345, + "step": 14995 + }, + { + "epoch": 0.8253618801254885, + "grad_norm": 0.6284430027008057, + "learning_rate": 6.378955295084968e-06, + "loss": 0.6758, + "step": 14996 + }, + { + "epoch": 0.8254169189278441, + "grad_norm": 0.5943842530250549, + "learning_rate": 6.378538634158377e-06, + "loss": 0.6572, + "step": 14997 + }, + { + "epoch": 0.8254719577301998, + "grad_norm": 0.7123218774795532, + "learning_rate": 6.378121962871058e-06, + "loss": 0.6993, + "step": 14998 + }, + { + "epoch": 0.8255269965325555, + "grad_norm": 0.6608574390411377, + "learning_rate": 6.377705281226143e-06, + "loss": 0.7802, + "step": 14999 + }, + { + "epoch": 0.8255820353349111, + "grad_norm": 0.6387534141540527, + "learning_rate": 6.377288589226764e-06, + "loss": 0.6572, + "step": 15000 + }, + { + "epoch": 0.8256370741372667, + "grad_norm": 0.6593596935272217, + "learning_rate": 6.376871886876054e-06, + "loss": 0.665, + "step": 15001 + }, + { + "epoch": 0.8256921129396224, + "grad_norm": 0.7146610617637634, + "learning_rate": 6.376455174177141e-06, + "loss": 0.7278, + "step": 15002 + }, + { + "epoch": 0.8257471517419781, + "grad_norm": 0.6776326298713684, + "learning_rate": 6.376038451133161e-06, + "loss": 0.7679, + "step": 15003 + }, + { + "epoch": 0.8258021905443338, + "grad_norm": 0.7008724808692932, + "learning_rate": 6.375621717747244e-06, + "loss": 0.8749, + "step": 15004 + }, + { + "epoch": 0.8258572293466894, + "grad_norm": 0.6809947490692139, + "learning_rate": 6.375204974022522e-06, + "loss": 0.7248, + "step": 15005 + }, + { + "epoch": 0.8259122681490451, + "grad_norm": 0.6921886205673218, + "learning_rate": 6.374788219962127e-06, + "loss": 0.6685, + "step": 15006 + }, + { + "epoch": 0.8259673069514007, + "grad_norm": 0.6471500396728516, + "learning_rate": 6.374371455569192e-06, + "loss": 0.6856, + "step": 15007 + }, + { + "epoch": 0.8260223457537564, + "grad_norm": 0.673425555229187, + "learning_rate": 6.373954680846851e-06, + "loss": 0.7006, + "step": 15008 + }, + { + "epoch": 0.826077384556112, + "grad_norm": 0.710217297077179, + "learning_rate": 6.373537895798233e-06, + "loss": 0.7315, + "step": 15009 + }, + { + "epoch": 0.8261324233584677, + "grad_norm": 0.692030668258667, + "learning_rate": 6.3731211004264725e-06, + "loss": 0.6534, + "step": 15010 + }, + { + "epoch": 0.8261874621608234, + "grad_norm": 0.6370778679847717, + "learning_rate": 6.372704294734701e-06, + "loss": 0.7278, + "step": 15011 + }, + { + "epoch": 0.8262425009631791, + "grad_norm": 0.6571012139320374, + "learning_rate": 6.372287478726052e-06, + "loss": 0.6889, + "step": 15012 + }, + { + "epoch": 0.8262975397655347, + "grad_norm": 0.721810519695282, + "learning_rate": 6.371870652403657e-06, + "loss": 0.8572, + "step": 15013 + }, + { + "epoch": 0.8263525785678904, + "grad_norm": 0.6751163601875305, + "learning_rate": 6.371453815770647e-06, + "loss": 0.7646, + "step": 15014 + }, + { + "epoch": 0.826407617370246, + "grad_norm": 0.724319338798523, + "learning_rate": 6.371036968830161e-06, + "loss": 0.8433, + "step": 15015 + }, + { + "epoch": 0.8264626561726017, + "grad_norm": 0.6961913108825684, + "learning_rate": 6.370620111585326e-06, + "loss": 0.7069, + "step": 15016 + }, + { + "epoch": 0.8265176949749573, + "grad_norm": 0.649428129196167, + "learning_rate": 6.370203244039279e-06, + "loss": 0.7286, + "step": 15017 + }, + { + "epoch": 0.826572733777313, + "grad_norm": 0.6468552947044373, + "learning_rate": 6.369786366195149e-06, + "loss": 0.7006, + "step": 15018 + }, + { + "epoch": 0.8266277725796687, + "grad_norm": 0.6564732789993286, + "learning_rate": 6.369369478056072e-06, + "loss": 0.727, + "step": 15019 + }, + { + "epoch": 0.8266828113820244, + "grad_norm": 0.6573188900947571, + "learning_rate": 6.36895257962518e-06, + "loss": 0.6603, + "step": 15020 + }, + { + "epoch": 0.82673785018438, + "grad_norm": 0.747164785861969, + "learning_rate": 6.368535670905609e-06, + "loss": 0.7426, + "step": 15021 + }, + { + "epoch": 0.8267928889867356, + "grad_norm": 0.6366723775863647, + "learning_rate": 6.368118751900489e-06, + "loss": 0.6487, + "step": 15022 + }, + { + "epoch": 0.8268479277890913, + "grad_norm": 0.6517844200134277, + "learning_rate": 6.367701822612955e-06, + "loss": 0.7131, + "step": 15023 + }, + { + "epoch": 0.826902966591447, + "grad_norm": 0.774309515953064, + "learning_rate": 6.367284883046141e-06, + "loss": 0.7978, + "step": 15024 + }, + { + "epoch": 0.8269580053938026, + "grad_norm": 0.6302667856216431, + "learning_rate": 6.366867933203178e-06, + "loss": 0.7403, + "step": 15025 + }, + { + "epoch": 0.8270130441961583, + "grad_norm": 0.6881224513053894, + "learning_rate": 6.366450973087202e-06, + "loss": 0.7884, + "step": 15026 + }, + { + "epoch": 0.827068082998514, + "grad_norm": 0.6901270747184753, + "learning_rate": 6.366034002701346e-06, + "loss": 0.6596, + "step": 15027 + }, + { + "epoch": 0.8271231218008697, + "grad_norm": 0.7436091303825378, + "learning_rate": 6.365617022048745e-06, + "loss": 0.8141, + "step": 15028 + }, + { + "epoch": 0.8271781606032252, + "grad_norm": 0.6745834350585938, + "learning_rate": 6.365200031132531e-06, + "loss": 0.7738, + "step": 15029 + }, + { + "epoch": 0.8272331994055809, + "grad_norm": 0.6963297724723816, + "learning_rate": 6.364783029955839e-06, + "loss": 0.8649, + "step": 15030 + }, + { + "epoch": 0.8272882382079366, + "grad_norm": 0.6468135714530945, + "learning_rate": 6.364366018521803e-06, + "loss": 0.7403, + "step": 15031 + }, + { + "epoch": 0.8273432770102923, + "grad_norm": 0.6481515169143677, + "learning_rate": 6.363948996833559e-06, + "loss": 0.6268, + "step": 15032 + }, + { + "epoch": 0.8273983158126479, + "grad_norm": 0.6881366968154907, + "learning_rate": 6.3635319648942386e-06, + "loss": 0.6339, + "step": 15033 + }, + { + "epoch": 0.8274533546150036, + "grad_norm": 0.6858122944831848, + "learning_rate": 6.363114922706977e-06, + "loss": 0.7685, + "step": 15034 + }, + { + "epoch": 0.8275083934173593, + "grad_norm": 0.6630339026451111, + "learning_rate": 6.362697870274907e-06, + "loss": 0.7281, + "step": 15035 + }, + { + "epoch": 0.8275634322197148, + "grad_norm": 0.7198584079742432, + "learning_rate": 6.362280807601167e-06, + "loss": 0.7726, + "step": 15036 + }, + { + "epoch": 0.8276184710220705, + "grad_norm": 0.721622884273529, + "learning_rate": 6.361863734688888e-06, + "loss": 0.6471, + "step": 15037 + }, + { + "epoch": 0.8276735098244262, + "grad_norm": 0.6032352447509766, + "learning_rate": 6.3614466515412055e-06, + "loss": 0.6684, + "step": 15038 + }, + { + "epoch": 0.8277285486267819, + "grad_norm": 0.7568576335906982, + "learning_rate": 6.3610295581612535e-06, + "loss": 0.7089, + "step": 15039 + }, + { + "epoch": 0.8277835874291375, + "grad_norm": 0.7461723685264587, + "learning_rate": 6.360612454552168e-06, + "loss": 0.806, + "step": 15040 + }, + { + "epoch": 0.8278386262314932, + "grad_norm": 0.6606107354164124, + "learning_rate": 6.3601953407170855e-06, + "loss": 0.7276, + "step": 15041 + }, + { + "epoch": 0.8278936650338489, + "grad_norm": 0.7203792333602905, + "learning_rate": 6.3597782166591384e-06, + "loss": 0.844, + "step": 15042 + }, + { + "epoch": 0.8279487038362046, + "grad_norm": 0.7327194213867188, + "learning_rate": 6.35936108238146e-06, + "loss": 0.8289, + "step": 15043 + }, + { + "epoch": 0.8280037426385601, + "grad_norm": 0.6741734147071838, + "learning_rate": 6.358943937887189e-06, + "loss": 0.7022, + "step": 15044 + }, + { + "epoch": 0.8280587814409158, + "grad_norm": 0.795724630355835, + "learning_rate": 6.35852678317946e-06, + "loss": 0.7703, + "step": 15045 + }, + { + "epoch": 0.8281138202432715, + "grad_norm": 0.6476230621337891, + "learning_rate": 6.3581096182614055e-06, + "loss": 0.7471, + "step": 15046 + }, + { + "epoch": 0.8281688590456272, + "grad_norm": 0.658829391002655, + "learning_rate": 6.357692443136164e-06, + "loss": 0.7796, + "step": 15047 + }, + { + "epoch": 0.8282238978479828, + "grad_norm": 0.6755202412605286, + "learning_rate": 6.35727525780687e-06, + "loss": 0.8239, + "step": 15048 + }, + { + "epoch": 0.8282789366503385, + "grad_norm": 0.6518263220787048, + "learning_rate": 6.356858062276658e-06, + "loss": 0.7222, + "step": 15049 + }, + { + "epoch": 0.8283339754526942, + "grad_norm": 0.7006294131278992, + "learning_rate": 6.356440856548662e-06, + "loss": 0.7779, + "step": 15050 + }, + { + "epoch": 0.8283890142550498, + "grad_norm": 0.6771633625030518, + "learning_rate": 6.356023640626021e-06, + "loss": 0.7529, + "step": 15051 + }, + { + "epoch": 0.8284440530574054, + "grad_norm": 0.6893792152404785, + "learning_rate": 6.35560641451187e-06, + "loss": 0.834, + "step": 15052 + }, + { + "epoch": 0.8284990918597611, + "grad_norm": 0.7450309991836548, + "learning_rate": 6.355189178209343e-06, + "loss": 0.7017, + "step": 15053 + }, + { + "epoch": 0.8285541306621168, + "grad_norm": 0.7094436883926392, + "learning_rate": 6.3547719317215785e-06, + "loss": 0.7883, + "step": 15054 + }, + { + "epoch": 0.8286091694644725, + "grad_norm": 0.6926944255828857, + "learning_rate": 6.3543546750517085e-06, + "loss": 0.7309, + "step": 15055 + }, + { + "epoch": 0.8286642082668281, + "grad_norm": 0.7394436597824097, + "learning_rate": 6.3539374082028725e-06, + "loss": 0.8819, + "step": 15056 + }, + { + "epoch": 0.8287192470691838, + "grad_norm": 0.7663393616676331, + "learning_rate": 6.353520131178206e-06, + "loss": 0.7269, + "step": 15057 + }, + { + "epoch": 0.8287742858715395, + "grad_norm": 0.702627956867218, + "learning_rate": 6.353102843980844e-06, + "loss": 0.8205, + "step": 15058 + }, + { + "epoch": 0.8288293246738951, + "grad_norm": 0.6575393676757812, + "learning_rate": 6.352685546613924e-06, + "loss": 0.782, + "step": 15059 + }, + { + "epoch": 0.8288843634762507, + "grad_norm": 0.6844787001609802, + "learning_rate": 6.35226823908058e-06, + "loss": 0.7485, + "step": 15060 + }, + { + "epoch": 0.8289394022786064, + "grad_norm": 0.6018843054771423, + "learning_rate": 6.351850921383951e-06, + "loss": 0.6788, + "step": 15061 + }, + { + "epoch": 0.8289944410809621, + "grad_norm": 0.7418997883796692, + "learning_rate": 6.351433593527172e-06, + "loss": 0.6789, + "step": 15062 + }, + { + "epoch": 0.8290494798833178, + "grad_norm": 0.625535786151886, + "learning_rate": 6.351016255513379e-06, + "loss": 0.7405, + "step": 15063 + }, + { + "epoch": 0.8291045186856734, + "grad_norm": 0.678569495677948, + "learning_rate": 6.350598907345711e-06, + "loss": 0.7386, + "step": 15064 + }, + { + "epoch": 0.829159557488029, + "grad_norm": 0.8012919425964355, + "learning_rate": 6.350181549027302e-06, + "loss": 0.7703, + "step": 15065 + }, + { + "epoch": 0.8292145962903847, + "grad_norm": 0.6115431189537048, + "learning_rate": 6.3497641805612905e-06, + "loss": 0.7131, + "step": 15066 + }, + { + "epoch": 0.8292696350927404, + "grad_norm": 0.7392085194587708, + "learning_rate": 6.349346801950812e-06, + "loss": 0.7648, + "step": 15067 + }, + { + "epoch": 0.829324673895096, + "grad_norm": 0.597613513469696, + "learning_rate": 6.348929413199005e-06, + "loss": 0.6023, + "step": 15068 + }, + { + "epoch": 0.8293797126974517, + "grad_norm": 0.6418130397796631, + "learning_rate": 6.348512014309005e-06, + "loss": 0.7507, + "step": 15069 + }, + { + "epoch": 0.8294347514998074, + "grad_norm": 0.6351965665817261, + "learning_rate": 6.34809460528395e-06, + "loss": 0.722, + "step": 15070 + }, + { + "epoch": 0.8294897903021631, + "grad_norm": 0.6593570709228516, + "learning_rate": 6.347677186126977e-06, + "loss": 0.7032, + "step": 15071 + }, + { + "epoch": 0.8295448291045187, + "grad_norm": 0.8040562868118286, + "learning_rate": 6.3472597568412235e-06, + "loss": 0.6519, + "step": 15072 + }, + { + "epoch": 0.8295998679068743, + "grad_norm": 0.7043612599372864, + "learning_rate": 6.346842317429825e-06, + "loss": 0.7765, + "step": 15073 + }, + { + "epoch": 0.82965490670923, + "grad_norm": 0.6304612159729004, + "learning_rate": 6.346424867895922e-06, + "loss": 0.6763, + "step": 15074 + }, + { + "epoch": 0.8297099455115857, + "grad_norm": 0.6402591466903687, + "learning_rate": 6.346007408242647e-06, + "loss": 0.828, + "step": 15075 + }, + { + "epoch": 0.8297649843139413, + "grad_norm": 0.6908280849456787, + "learning_rate": 6.345589938473142e-06, + "loss": 0.855, + "step": 15076 + }, + { + "epoch": 0.829820023116297, + "grad_norm": 0.5829552412033081, + "learning_rate": 6.345172458590545e-06, + "loss": 0.6323, + "step": 15077 + }, + { + "epoch": 0.8298750619186527, + "grad_norm": 0.8221700191497803, + "learning_rate": 6.34475496859799e-06, + "loss": 0.7069, + "step": 15078 + }, + { + "epoch": 0.8299301007210083, + "grad_norm": 0.7065801024436951, + "learning_rate": 6.344337468498616e-06, + "loss": 0.692, + "step": 15079 + }, + { + "epoch": 0.829985139523364, + "grad_norm": 0.6199344396591187, + "learning_rate": 6.343919958295564e-06, + "loss": 0.682, + "step": 15080 + }, + { + "epoch": 0.8300401783257196, + "grad_norm": 0.8999378681182861, + "learning_rate": 6.343502437991968e-06, + "loss": 0.7924, + "step": 15081 + }, + { + "epoch": 0.8300952171280753, + "grad_norm": 0.639163076877594, + "learning_rate": 6.343084907590966e-06, + "loss": 0.6976, + "step": 15082 + }, + { + "epoch": 0.8301502559304309, + "grad_norm": 0.8266178965568542, + "learning_rate": 6.3426673670957e-06, + "loss": 0.6831, + "step": 15083 + }, + { + "epoch": 0.8302052947327866, + "grad_norm": 0.6245449781417847, + "learning_rate": 6.3422498165093034e-06, + "loss": 0.6917, + "step": 15084 + }, + { + "epoch": 0.8302603335351423, + "grad_norm": 0.7809823751449585, + "learning_rate": 6.341832255834918e-06, + "loss": 0.8424, + "step": 15085 + }, + { + "epoch": 0.830315372337498, + "grad_norm": 0.6803410053253174, + "learning_rate": 6.34141468507568e-06, + "loss": 0.8345, + "step": 15086 + }, + { + "epoch": 0.8303704111398535, + "grad_norm": 0.7445305585861206, + "learning_rate": 6.340997104234728e-06, + "loss": 0.8823, + "step": 15087 + }, + { + "epoch": 0.8304254499422092, + "grad_norm": 0.6992506384849548, + "learning_rate": 6.340579513315199e-06, + "loss": 0.7857, + "step": 15088 + }, + { + "epoch": 0.8304804887445649, + "grad_norm": 0.7050431966781616, + "learning_rate": 6.340161912320237e-06, + "loss": 0.7988, + "step": 15089 + }, + { + "epoch": 0.8305355275469206, + "grad_norm": 0.8718838095664978, + "learning_rate": 6.339744301252973e-06, + "loss": 0.9983, + "step": 15090 + }, + { + "epoch": 0.8305905663492762, + "grad_norm": 0.7317140698432922, + "learning_rate": 6.339326680116551e-06, + "loss": 0.6852, + "step": 15091 + }, + { + "epoch": 0.8306456051516319, + "grad_norm": 0.6975864768028259, + "learning_rate": 6.338909048914108e-06, + "loss": 0.7334, + "step": 15092 + }, + { + "epoch": 0.8307006439539876, + "grad_norm": 0.6615436673164368, + "learning_rate": 6.3384914076487834e-06, + "loss": 0.776, + "step": 15093 + }, + { + "epoch": 0.8307556827563433, + "grad_norm": 0.773273766040802, + "learning_rate": 6.338073756323717e-06, + "loss": 0.7868, + "step": 15094 + }, + { + "epoch": 0.8308107215586988, + "grad_norm": 0.6686182022094727, + "learning_rate": 6.337656094942045e-06, + "loss": 0.7487, + "step": 15095 + }, + { + "epoch": 0.8308657603610545, + "grad_norm": 0.8202255368232727, + "learning_rate": 6.337238423506909e-06, + "loss": 0.7748, + "step": 15096 + }, + { + "epoch": 0.8309207991634102, + "grad_norm": 0.6356936693191528, + "learning_rate": 6.336820742021445e-06, + "loss": 0.6539, + "step": 15097 + }, + { + "epoch": 0.8309758379657659, + "grad_norm": 0.6543401479721069, + "learning_rate": 6.3364030504887955e-06, + "loss": 0.7185, + "step": 15098 + }, + { + "epoch": 0.8310308767681215, + "grad_norm": 0.6499043107032776, + "learning_rate": 6.335985348912097e-06, + "loss": 0.7254, + "step": 15099 + }, + { + "epoch": 0.8310859155704772, + "grad_norm": 0.6983271241188049, + "learning_rate": 6.335567637294491e-06, + "loss": 0.784, + "step": 15100 + }, + { + "epoch": 0.8311409543728329, + "grad_norm": 0.7932507395744324, + "learning_rate": 6.335149915639117e-06, + "loss": 0.6708, + "step": 15101 + }, + { + "epoch": 0.8311959931751886, + "grad_norm": 0.6792518496513367, + "learning_rate": 6.334732183949112e-06, + "loss": 0.7365, + "step": 15102 + }, + { + "epoch": 0.8312510319775441, + "grad_norm": 0.6852229237556458, + "learning_rate": 6.334314442227618e-06, + "loss": 0.7283, + "step": 15103 + }, + { + "epoch": 0.8313060707798998, + "grad_norm": 0.6528468728065491, + "learning_rate": 6.333896690477774e-06, + "loss": 0.763, + "step": 15104 + }, + { + "epoch": 0.8313611095822555, + "grad_norm": 0.7215067148208618, + "learning_rate": 6.33347892870272e-06, + "loss": 0.7769, + "step": 15105 + }, + { + "epoch": 0.8314161483846112, + "grad_norm": 0.7171593308448792, + "learning_rate": 6.333061156905596e-06, + "loss": 0.6807, + "step": 15106 + }, + { + "epoch": 0.8314711871869668, + "grad_norm": 0.6781407594680786, + "learning_rate": 6.332643375089539e-06, + "loss": 0.6801, + "step": 15107 + }, + { + "epoch": 0.8315262259893225, + "grad_norm": 0.803057849407196, + "learning_rate": 6.332225583257693e-06, + "loss": 0.682, + "step": 15108 + }, + { + "epoch": 0.8315812647916782, + "grad_norm": 0.6467291712760925, + "learning_rate": 6.331807781413195e-06, + "loss": 0.6675, + "step": 15109 + }, + { + "epoch": 0.8316363035940338, + "grad_norm": 0.7285529971122742, + "learning_rate": 6.331389969559186e-06, + "loss": 0.7333, + "step": 15110 + }, + { + "epoch": 0.8316913423963894, + "grad_norm": 0.6569895148277283, + "learning_rate": 6.330972147698806e-06, + "loss": 0.7202, + "step": 15111 + }, + { + "epoch": 0.8317463811987451, + "grad_norm": 0.7848708033561707, + "learning_rate": 6.330554315835198e-06, + "loss": 0.7936, + "step": 15112 + }, + { + "epoch": 0.8318014200011008, + "grad_norm": 0.6699723601341248, + "learning_rate": 6.330136473971498e-06, + "loss": 0.7107, + "step": 15113 + }, + { + "epoch": 0.8318564588034565, + "grad_norm": 0.7443183660507202, + "learning_rate": 6.329718622110848e-06, + "loss": 0.8102, + "step": 15114 + }, + { + "epoch": 0.8319114976058121, + "grad_norm": 0.6073893904685974, + "learning_rate": 6.329300760256389e-06, + "loss": 0.7061, + "step": 15115 + }, + { + "epoch": 0.8319665364081678, + "grad_norm": 0.6192148923873901, + "learning_rate": 6.328882888411262e-06, + "loss": 0.6929, + "step": 15116 + }, + { + "epoch": 0.8320215752105234, + "grad_norm": 0.7347237467765808, + "learning_rate": 6.3284650065786065e-06, + "loss": 0.6705, + "step": 15117 + }, + { + "epoch": 0.8320766140128791, + "grad_norm": 0.6286477446556091, + "learning_rate": 6.328047114761564e-06, + "loss": 0.6494, + "step": 15118 + }, + { + "epoch": 0.8321316528152347, + "grad_norm": 0.6492440104484558, + "learning_rate": 6.327629212963275e-06, + "loss": 0.6618, + "step": 15119 + }, + { + "epoch": 0.8321866916175904, + "grad_norm": 0.6295114755630493, + "learning_rate": 6.3272113011868804e-06, + "loss": 0.786, + "step": 15120 + }, + { + "epoch": 0.8322417304199461, + "grad_norm": 0.6737865805625916, + "learning_rate": 6.3267933794355206e-06, + "loss": 0.7544, + "step": 15121 + }, + { + "epoch": 0.8322967692223017, + "grad_norm": 0.8025132417678833, + "learning_rate": 6.3263754477123374e-06, + "loss": 0.7736, + "step": 15122 + }, + { + "epoch": 0.8323518080246574, + "grad_norm": 0.6820534467697144, + "learning_rate": 6.32595750602047e-06, + "loss": 0.6616, + "step": 15123 + }, + { + "epoch": 0.832406846827013, + "grad_norm": 0.7022573351860046, + "learning_rate": 6.325539554363061e-06, + "loss": 0.8175, + "step": 15124 + }, + { + "epoch": 0.8324618856293687, + "grad_norm": 0.7034926414489746, + "learning_rate": 6.325121592743253e-06, + "loss": 0.7047, + "step": 15125 + }, + { + "epoch": 0.8325169244317243, + "grad_norm": 0.654296875, + "learning_rate": 6.3247036211641856e-06, + "loss": 0.6468, + "step": 15126 + }, + { + "epoch": 0.83257196323408, + "grad_norm": 0.647859513759613, + "learning_rate": 6.324285639628999e-06, + "loss": 0.694, + "step": 15127 + }, + { + "epoch": 0.8326270020364357, + "grad_norm": 1.0824226140975952, + "learning_rate": 6.323867648140837e-06, + "loss": 0.7226, + "step": 15128 + }, + { + "epoch": 0.8326820408387914, + "grad_norm": 0.8568648099899292, + "learning_rate": 6.323449646702839e-06, + "loss": 0.7524, + "step": 15129 + }, + { + "epoch": 0.832737079641147, + "grad_norm": 0.6550299525260925, + "learning_rate": 6.32303163531815e-06, + "loss": 0.7294, + "step": 15130 + }, + { + "epoch": 0.8327921184435026, + "grad_norm": 0.7722175121307373, + "learning_rate": 6.3226136139899075e-06, + "loss": 0.7864, + "step": 15131 + }, + { + "epoch": 0.8328471572458583, + "grad_norm": 0.6542928218841553, + "learning_rate": 6.322195582721256e-06, + "loss": 0.6614, + "step": 15132 + }, + { + "epoch": 0.832902196048214, + "grad_norm": 0.6617493629455566, + "learning_rate": 6.321777541515337e-06, + "loss": 0.7147, + "step": 15133 + }, + { + "epoch": 0.8329572348505696, + "grad_norm": 0.698868989944458, + "learning_rate": 6.321359490375291e-06, + "loss": 0.6894, + "step": 15134 + }, + { + "epoch": 0.8330122736529253, + "grad_norm": 0.8005796074867249, + "learning_rate": 6.3209414293042595e-06, + "loss": 0.7513, + "step": 15135 + }, + { + "epoch": 0.833067312455281, + "grad_norm": 0.7656713128089905, + "learning_rate": 6.320523358305387e-06, + "loss": 0.7387, + "step": 15136 + }, + { + "epoch": 0.8331223512576367, + "grad_norm": 0.7299987077713013, + "learning_rate": 6.320105277381815e-06, + "loss": 0.7868, + "step": 15137 + }, + { + "epoch": 0.8331773900599923, + "grad_norm": 0.782574474811554, + "learning_rate": 6.319687186536685e-06, + "loss": 0.8307, + "step": 15138 + }, + { + "epoch": 0.8332324288623479, + "grad_norm": 0.6786854863166809, + "learning_rate": 6.319269085773138e-06, + "loss": 0.7819, + "step": 15139 + }, + { + "epoch": 0.8332874676647036, + "grad_norm": 1.173049807548523, + "learning_rate": 6.318850975094318e-06, + "loss": 0.7623, + "step": 15140 + }, + { + "epoch": 0.8333425064670593, + "grad_norm": 0.8410226106643677, + "learning_rate": 6.318432854503368e-06, + "loss": 0.812, + "step": 15141 + }, + { + "epoch": 0.8333975452694149, + "grad_norm": 0.8525705337524414, + "learning_rate": 6.3180147240034304e-06, + "loss": 0.7585, + "step": 15142 + }, + { + "epoch": 0.8334525840717706, + "grad_norm": 0.6345195770263672, + "learning_rate": 6.317596583597645e-06, + "loss": 0.7446, + "step": 15143 + }, + { + "epoch": 0.8335076228741263, + "grad_norm": 0.7238603234291077, + "learning_rate": 6.317178433289157e-06, + "loss": 0.7461, + "step": 15144 + }, + { + "epoch": 0.833562661676482, + "grad_norm": 0.6187044382095337, + "learning_rate": 6.31676027308111e-06, + "loss": 0.7195, + "step": 15145 + }, + { + "epoch": 0.8336177004788375, + "grad_norm": 0.6813417077064514, + "learning_rate": 6.316342102976644e-06, + "loss": 0.772, + "step": 15146 + }, + { + "epoch": 0.8336727392811932, + "grad_norm": 0.665515124797821, + "learning_rate": 6.315923922978902e-06, + "loss": 0.7127, + "step": 15147 + }, + { + "epoch": 0.8337277780835489, + "grad_norm": 0.8104628920555115, + "learning_rate": 6.315505733091028e-06, + "loss": 0.7332, + "step": 15148 + }, + { + "epoch": 0.8337828168859046, + "grad_norm": 0.8447679281234741, + "learning_rate": 6.315087533316166e-06, + "loss": 0.6803, + "step": 15149 + }, + { + "epoch": 0.8338378556882602, + "grad_norm": 0.7588180303573608, + "learning_rate": 6.31466932365746e-06, + "loss": 0.8301, + "step": 15150 + }, + { + "epoch": 0.8338928944906159, + "grad_norm": 0.7697302103042603, + "learning_rate": 6.314251104118048e-06, + "loss": 0.7777, + "step": 15151 + }, + { + "epoch": 0.8339479332929716, + "grad_norm": 0.8361233472824097, + "learning_rate": 6.313832874701078e-06, + "loss": 0.7585, + "step": 15152 + }, + { + "epoch": 0.8340029720953273, + "grad_norm": 0.6954757571220398, + "learning_rate": 6.313414635409692e-06, + "loss": 0.759, + "step": 15153 + }, + { + "epoch": 0.8340580108976828, + "grad_norm": 0.72389155626297, + "learning_rate": 6.312996386247034e-06, + "loss": 0.6679, + "step": 15154 + }, + { + "epoch": 0.8341130497000385, + "grad_norm": 0.781382143497467, + "learning_rate": 6.312578127216245e-06, + "loss": 0.769, + "step": 15155 + }, + { + "epoch": 0.8341680885023942, + "grad_norm": 0.7186244130134583, + "learning_rate": 6.312159858320472e-06, + "loss": 0.7476, + "step": 15156 + }, + { + "epoch": 0.8342231273047499, + "grad_norm": 0.6909130215644836, + "learning_rate": 6.311741579562855e-06, + "loss": 0.749, + "step": 15157 + }, + { + "epoch": 0.8342781661071055, + "grad_norm": 0.7692446708679199, + "learning_rate": 6.31132329094654e-06, + "loss": 0.7141, + "step": 15158 + }, + { + "epoch": 0.8343332049094612, + "grad_norm": 0.6753776669502258, + "learning_rate": 6.310904992474669e-06, + "loss": 0.7259, + "step": 15159 + }, + { + "epoch": 0.8343882437118169, + "grad_norm": 0.7118550539016724, + "learning_rate": 6.3104866841503885e-06, + "loss": 0.8282, + "step": 15160 + }, + { + "epoch": 0.8344432825141725, + "grad_norm": 0.6651625037193298, + "learning_rate": 6.31006836597684e-06, + "loss": 0.7639, + "step": 15161 + }, + { + "epoch": 0.8344983213165281, + "grad_norm": 0.6745681762695312, + "learning_rate": 6.30965003795717e-06, + "loss": 0.5922, + "step": 15162 + }, + { + "epoch": 0.8345533601188838, + "grad_norm": 0.7344138622283936, + "learning_rate": 6.309231700094518e-06, + "loss": 0.7134, + "step": 15163 + }, + { + "epoch": 0.8346083989212395, + "grad_norm": 0.7628228664398193, + "learning_rate": 6.308813352392034e-06, + "loss": 0.7341, + "step": 15164 + }, + { + "epoch": 0.8346634377235951, + "grad_norm": 0.6599448919296265, + "learning_rate": 6.308394994852858e-06, + "loss": 0.6821, + "step": 15165 + }, + { + "epoch": 0.8347184765259508, + "grad_norm": 0.9132193922996521, + "learning_rate": 6.307976627480136e-06, + "loss": 0.7862, + "step": 15166 + }, + { + "epoch": 0.8347735153283065, + "grad_norm": 0.752200722694397, + "learning_rate": 6.307558250277011e-06, + "loss": 0.7942, + "step": 15167 + }, + { + "epoch": 0.8348285541306621, + "grad_norm": 0.6848111748695374, + "learning_rate": 6.307139863246628e-06, + "loss": 0.8161, + "step": 15168 + }, + { + "epoch": 0.8348835929330177, + "grad_norm": 0.7229306697845459, + "learning_rate": 6.306721466392132e-06, + "loss": 0.684, + "step": 15169 + }, + { + "epoch": 0.8349386317353734, + "grad_norm": 0.7294610142707825, + "learning_rate": 6.306303059716667e-06, + "loss": 0.7046, + "step": 15170 + }, + { + "epoch": 0.8349936705377291, + "grad_norm": 0.7153074741363525, + "learning_rate": 6.305884643223378e-06, + "loss": 0.7613, + "step": 15171 + }, + { + "epoch": 0.8350487093400848, + "grad_norm": 0.6200907826423645, + "learning_rate": 6.30546621691541e-06, + "loss": 0.642, + "step": 15172 + }, + { + "epoch": 0.8351037481424404, + "grad_norm": 0.6640743017196655, + "learning_rate": 6.305047780795907e-06, + "loss": 0.7201, + "step": 15173 + }, + { + "epoch": 0.8351587869447961, + "grad_norm": 0.6427313089370728, + "learning_rate": 6.3046293348680144e-06, + "loss": 0.764, + "step": 15174 + }, + { + "epoch": 0.8352138257471518, + "grad_norm": 0.6475403308868408, + "learning_rate": 6.3042108791348755e-06, + "loss": 0.6678, + "step": 15175 + }, + { + "epoch": 0.8352688645495074, + "grad_norm": 0.6376405358314514, + "learning_rate": 6.303792413599638e-06, + "loss": 0.6972, + "step": 15176 + }, + { + "epoch": 0.835323903351863, + "grad_norm": 0.6648433804512024, + "learning_rate": 6.303373938265447e-06, + "loss": 0.6531, + "step": 15177 + }, + { + "epoch": 0.8353789421542187, + "grad_norm": 0.6582038402557373, + "learning_rate": 6.302955453135446e-06, + "loss": 0.7703, + "step": 15178 + }, + { + "epoch": 0.8354339809565744, + "grad_norm": 0.6386045217514038, + "learning_rate": 6.30253695821278e-06, + "loss": 0.6821, + "step": 15179 + }, + { + "epoch": 0.8354890197589301, + "grad_norm": 0.7268567681312561, + "learning_rate": 6.302118453500594e-06, + "loss": 0.7434, + "step": 15180 + }, + { + "epoch": 0.8355440585612857, + "grad_norm": 0.8008975982666016, + "learning_rate": 6.301699939002035e-06, + "loss": 0.8537, + "step": 15181 + }, + { + "epoch": 0.8355990973636414, + "grad_norm": 0.6803351044654846, + "learning_rate": 6.301281414720247e-06, + "loss": 0.6741, + "step": 15182 + }, + { + "epoch": 0.835654136165997, + "grad_norm": 0.6567045450210571, + "learning_rate": 6.3008628806583785e-06, + "loss": 0.7033, + "step": 15183 + }, + { + "epoch": 0.8357091749683527, + "grad_norm": 0.7088850140571594, + "learning_rate": 6.3004443368195685e-06, + "loss": 0.699, + "step": 15184 + }, + { + "epoch": 0.8357642137707083, + "grad_norm": 0.664929986000061, + "learning_rate": 6.3000257832069715e-06, + "loss": 0.6875, + "step": 15185 + }, + { + "epoch": 0.835819252573064, + "grad_norm": 0.7132309079170227, + "learning_rate": 6.299607219823727e-06, + "loss": 0.8172, + "step": 15186 + }, + { + "epoch": 0.8358742913754197, + "grad_norm": 0.7312454581260681, + "learning_rate": 6.2991886466729815e-06, + "loss": 0.7277, + "step": 15187 + }, + { + "epoch": 0.8359293301777754, + "grad_norm": 0.6576625108718872, + "learning_rate": 6.298770063757882e-06, + "loss": 0.7134, + "step": 15188 + }, + { + "epoch": 0.835984368980131, + "grad_norm": 0.6840282678604126, + "learning_rate": 6.2983514710815756e-06, + "loss": 0.777, + "step": 15189 + }, + { + "epoch": 0.8360394077824866, + "grad_norm": 0.7194011211395264, + "learning_rate": 6.297932868647207e-06, + "loss": 0.783, + "step": 15190 + }, + { + "epoch": 0.8360944465848423, + "grad_norm": 0.6619371175765991, + "learning_rate": 6.297514256457922e-06, + "loss": 0.7809, + "step": 15191 + }, + { + "epoch": 0.836149485387198, + "grad_norm": 0.8256712555885315, + "learning_rate": 6.2970956345168666e-06, + "loss": 0.9086, + "step": 15192 + }, + { + "epoch": 0.8362045241895536, + "grad_norm": 0.6951783299446106, + "learning_rate": 6.296677002827188e-06, + "loss": 0.7489, + "step": 15193 + }, + { + "epoch": 0.8362595629919093, + "grad_norm": 0.8535193204879761, + "learning_rate": 6.296258361392033e-06, + "loss": 0.7744, + "step": 15194 + }, + { + "epoch": 0.836314601794265, + "grad_norm": 0.7569966912269592, + "learning_rate": 6.295839710214546e-06, + "loss": 0.7091, + "step": 15195 + }, + { + "epoch": 0.8363696405966207, + "grad_norm": 0.6435930728912354, + "learning_rate": 6.295421049297875e-06, + "loss": 0.6601, + "step": 15196 + }, + { + "epoch": 0.8364246793989762, + "grad_norm": 0.811500608921051, + "learning_rate": 6.295002378645166e-06, + "loss": 0.7304, + "step": 15197 + }, + { + "epoch": 0.8364797182013319, + "grad_norm": 0.7306826114654541, + "learning_rate": 6.294583698259566e-06, + "loss": 0.8471, + "step": 15198 + }, + { + "epoch": 0.8365347570036876, + "grad_norm": 0.6411521434783936, + "learning_rate": 6.294165008144222e-06, + "loss": 0.6572, + "step": 15199 + }, + { + "epoch": 0.8365897958060433, + "grad_norm": 0.6460714340209961, + "learning_rate": 6.293746308302278e-06, + "loss": 0.7514, + "step": 15200 + }, + { + "epoch": 0.8366448346083989, + "grad_norm": 0.9355582594871521, + "learning_rate": 6.2933275987368855e-06, + "loss": 0.8171, + "step": 15201 + }, + { + "epoch": 0.8366998734107546, + "grad_norm": 0.6221946477890015, + "learning_rate": 6.292908879451189e-06, + "loss": 0.7323, + "step": 15202 + }, + { + "epoch": 0.8367549122131103, + "grad_norm": 0.6820993423461914, + "learning_rate": 6.292490150448335e-06, + "loss": 0.8168, + "step": 15203 + }, + { + "epoch": 0.836809951015466, + "grad_norm": 0.6494680643081665, + "learning_rate": 6.29207141173147e-06, + "loss": 0.7926, + "step": 15204 + }, + { + "epoch": 0.8368649898178215, + "grad_norm": 0.7658956050872803, + "learning_rate": 6.291652663303744e-06, + "loss": 0.7304, + "step": 15205 + }, + { + "epoch": 0.8369200286201772, + "grad_norm": 0.6653497219085693, + "learning_rate": 6.2912339051683e-06, + "loss": 0.7284, + "step": 15206 + }, + { + "epoch": 0.8369750674225329, + "grad_norm": 0.6136276721954346, + "learning_rate": 6.290815137328289e-06, + "loss": 0.7313, + "step": 15207 + }, + { + "epoch": 0.8370301062248885, + "grad_norm": 0.7542527914047241, + "learning_rate": 6.2903963597868555e-06, + "loss": 0.7806, + "step": 15208 + }, + { + "epoch": 0.8370851450272442, + "grad_norm": 0.6994839906692505, + "learning_rate": 6.2899775725471505e-06, + "loss": 0.8132, + "step": 15209 + }, + { + "epoch": 0.8371401838295999, + "grad_norm": 0.6558997631072998, + "learning_rate": 6.289558775612319e-06, + "loss": 0.7188, + "step": 15210 + }, + { + "epoch": 0.8371952226319556, + "grad_norm": 0.7155564427375793, + "learning_rate": 6.289139968985507e-06, + "loss": 0.6584, + "step": 15211 + }, + { + "epoch": 0.8372502614343111, + "grad_norm": 0.7645565867424011, + "learning_rate": 6.288721152669865e-06, + "loss": 0.761, + "step": 15212 + }, + { + "epoch": 0.8373053002366668, + "grad_norm": 0.6507940292358398, + "learning_rate": 6.288302326668542e-06, + "loss": 0.7139, + "step": 15213 + }, + { + "epoch": 0.8373603390390225, + "grad_norm": 0.7598558664321899, + "learning_rate": 6.287883490984682e-06, + "loss": 0.7627, + "step": 15214 + }, + { + "epoch": 0.8374153778413782, + "grad_norm": 0.6542350649833679, + "learning_rate": 6.287464645621434e-06, + "loss": 0.7508, + "step": 15215 + }, + { + "epoch": 0.8374704166437338, + "grad_norm": 0.7530503869056702, + "learning_rate": 6.287045790581946e-06, + "loss": 0.8234, + "step": 15216 + }, + { + "epoch": 0.8375254554460895, + "grad_norm": 0.9945759773254395, + "learning_rate": 6.286626925869367e-06, + "loss": 0.7637, + "step": 15217 + }, + { + "epoch": 0.8375804942484452, + "grad_norm": 0.6644982695579529, + "learning_rate": 6.286208051486844e-06, + "loss": 0.7671, + "step": 15218 + }, + { + "epoch": 0.8376355330508009, + "grad_norm": 0.8195061683654785, + "learning_rate": 6.285789167437526e-06, + "loss": 0.662, + "step": 15219 + }, + { + "epoch": 0.8376905718531564, + "grad_norm": 0.6578626036643982, + "learning_rate": 6.2853702737245605e-06, + "loss": 0.7681, + "step": 15220 + }, + { + "epoch": 0.8377456106555121, + "grad_norm": 0.6632179021835327, + "learning_rate": 6.2849513703510955e-06, + "loss": 0.759, + "step": 15221 + }, + { + "epoch": 0.8378006494578678, + "grad_norm": 0.6822313070297241, + "learning_rate": 6.284532457320282e-06, + "loss": 0.7859, + "step": 15222 + }, + { + "epoch": 0.8378556882602235, + "grad_norm": 0.6448203921318054, + "learning_rate": 6.284113534635265e-06, + "loss": 0.7224, + "step": 15223 + }, + { + "epoch": 0.8379107270625791, + "grad_norm": 0.6147580146789551, + "learning_rate": 6.2836946022991926e-06, + "loss": 0.7389, + "step": 15224 + }, + { + "epoch": 0.8379657658649348, + "grad_norm": 0.7476562857627869, + "learning_rate": 6.283275660315219e-06, + "loss": 0.7535, + "step": 15225 + }, + { + "epoch": 0.8380208046672905, + "grad_norm": 0.7396713495254517, + "learning_rate": 6.282856708686488e-06, + "loss": 0.7621, + "step": 15226 + }, + { + "epoch": 0.8380758434696461, + "grad_norm": 0.7220024466514587, + "learning_rate": 6.282437747416148e-06, + "loss": 0.672, + "step": 15227 + }, + { + "epoch": 0.8381308822720017, + "grad_norm": 0.9414284229278564, + "learning_rate": 6.2820187765073495e-06, + "loss": 0.8791, + "step": 15228 + }, + { + "epoch": 0.8381859210743574, + "grad_norm": 0.6074691414833069, + "learning_rate": 6.281599795963241e-06, + "loss": 0.6771, + "step": 15229 + }, + { + "epoch": 0.8382409598767131, + "grad_norm": 0.7367346286773682, + "learning_rate": 6.281180805786973e-06, + "loss": 0.7869, + "step": 15230 + }, + { + "epoch": 0.8382959986790688, + "grad_norm": 0.711016833782196, + "learning_rate": 6.280761805981691e-06, + "loss": 0.7166, + "step": 15231 + }, + { + "epoch": 0.8383510374814244, + "grad_norm": 0.6464707255363464, + "learning_rate": 6.280342796550546e-06, + "loss": 0.6965, + "step": 15232 + }, + { + "epoch": 0.83840607628378, + "grad_norm": 0.7385185956954956, + "learning_rate": 6.279923777496688e-06, + "loss": 0.7031, + "step": 15233 + }, + { + "epoch": 0.8384611150861357, + "grad_norm": 0.6799347996711731, + "learning_rate": 6.2795047488232665e-06, + "loss": 0.6777, + "step": 15234 + }, + { + "epoch": 0.8385161538884914, + "grad_norm": 0.690740168094635, + "learning_rate": 6.279085710533429e-06, + "loss": 0.7675, + "step": 15235 + }, + { + "epoch": 0.838571192690847, + "grad_norm": 0.9359111189842224, + "learning_rate": 6.278666662630325e-06, + "loss": 0.7063, + "step": 15236 + }, + { + "epoch": 0.8386262314932027, + "grad_norm": 0.751430094242096, + "learning_rate": 6.2782476051171075e-06, + "loss": 0.7851, + "step": 15237 + }, + { + "epoch": 0.8386812702955584, + "grad_norm": 0.6865997314453125, + "learning_rate": 6.27782853799692e-06, + "loss": 0.7347, + "step": 15238 + }, + { + "epoch": 0.8387363090979141, + "grad_norm": 0.6713284850120544, + "learning_rate": 6.277409461272916e-06, + "loss": 0.7651, + "step": 15239 + }, + { + "epoch": 0.8387913479002697, + "grad_norm": 0.7481899857521057, + "learning_rate": 6.276990374948244e-06, + "loss": 0.7681, + "step": 15240 + }, + { + "epoch": 0.8388463867026253, + "grad_norm": 0.7126002311706543, + "learning_rate": 6.2765712790260554e-06, + "loss": 0.7772, + "step": 15241 + }, + { + "epoch": 0.838901425504981, + "grad_norm": 0.6616978645324707, + "learning_rate": 6.276152173509497e-06, + "loss": 0.7028, + "step": 15242 + }, + { + "epoch": 0.8389564643073367, + "grad_norm": 0.9032973051071167, + "learning_rate": 6.2757330584017225e-06, + "loss": 0.7646, + "step": 15243 + }, + { + "epoch": 0.8390115031096923, + "grad_norm": 0.6345590353012085, + "learning_rate": 6.275313933705879e-06, + "loss": 0.6692, + "step": 15244 + }, + { + "epoch": 0.839066541912048, + "grad_norm": 0.6989019513130188, + "learning_rate": 6.2748947994251175e-06, + "loss": 0.6916, + "step": 15245 + }, + { + "epoch": 0.8391215807144037, + "grad_norm": 0.7115045189857483, + "learning_rate": 6.2744756555625875e-06, + "loss": 0.6923, + "step": 15246 + }, + { + "epoch": 0.8391766195167594, + "grad_norm": 0.6989235281944275, + "learning_rate": 6.2740565021214406e-06, + "loss": 0.7057, + "step": 15247 + }, + { + "epoch": 0.839231658319115, + "grad_norm": 0.684779942035675, + "learning_rate": 6.273637339104824e-06, + "loss": 0.7777, + "step": 15248 + }, + { + "epoch": 0.8392866971214706, + "grad_norm": 0.6341322064399719, + "learning_rate": 6.2732181665158934e-06, + "loss": 0.7335, + "step": 15249 + }, + { + "epoch": 0.8393417359238263, + "grad_norm": 0.7232723832130432, + "learning_rate": 6.272798984357793e-06, + "loss": 0.8055, + "step": 15250 + }, + { + "epoch": 0.8393967747261819, + "grad_norm": 0.9725174307823181, + "learning_rate": 6.272379792633678e-06, + "loss": 0.6221, + "step": 15251 + }, + { + "epoch": 0.8394518135285376, + "grad_norm": 0.6602086424827576, + "learning_rate": 6.271960591346695e-06, + "loss": 0.8023, + "step": 15252 + }, + { + "epoch": 0.8395068523308933, + "grad_norm": 0.7092040777206421, + "learning_rate": 6.271541380499998e-06, + "loss": 0.8135, + "step": 15253 + }, + { + "epoch": 0.839561891133249, + "grad_norm": 0.5656731724739075, + "learning_rate": 6.271122160096736e-06, + "loss": 0.647, + "step": 15254 + }, + { + "epoch": 0.8396169299356046, + "grad_norm": 1.1831625699996948, + "learning_rate": 6.270702930140061e-06, + "loss": 0.8513, + "step": 15255 + }, + { + "epoch": 0.8396719687379602, + "grad_norm": 0.6398816704750061, + "learning_rate": 6.270283690633121e-06, + "loss": 0.6988, + "step": 15256 + }, + { + "epoch": 0.8397270075403159, + "grad_norm": 0.6856167316436768, + "learning_rate": 6.26986444157907e-06, + "loss": 0.7789, + "step": 15257 + }, + { + "epoch": 0.8397820463426716, + "grad_norm": 0.7355605363845825, + "learning_rate": 6.269445182981058e-06, + "loss": 0.6652, + "step": 15258 + }, + { + "epoch": 0.8398370851450272, + "grad_norm": 0.6691173315048218, + "learning_rate": 6.2690259148422364e-06, + "loss": 0.6807, + "step": 15259 + }, + { + "epoch": 0.8398921239473829, + "grad_norm": 0.6596276164054871, + "learning_rate": 6.268606637165754e-06, + "loss": 0.6947, + "step": 15260 + }, + { + "epoch": 0.8399471627497386, + "grad_norm": 0.7198327779769897, + "learning_rate": 6.268187349954766e-06, + "loss": 0.7981, + "step": 15261 + }, + { + "epoch": 0.8400022015520943, + "grad_norm": 0.7006517648696899, + "learning_rate": 6.267768053212419e-06, + "loss": 0.7756, + "step": 15262 + }, + { + "epoch": 0.8400572403544498, + "grad_norm": 0.769062340259552, + "learning_rate": 6.267348746941869e-06, + "loss": 0.8433, + "step": 15263 + }, + { + "epoch": 0.8401122791568055, + "grad_norm": 0.6317951679229736, + "learning_rate": 6.266929431146263e-06, + "loss": 0.6575, + "step": 15264 + }, + { + "epoch": 0.8401673179591612, + "grad_norm": 0.7127153873443604, + "learning_rate": 6.2665101058287554e-06, + "loss": 0.7745, + "step": 15265 + }, + { + "epoch": 0.8402223567615169, + "grad_norm": 0.6909182667732239, + "learning_rate": 6.266090770992497e-06, + "loss": 0.7567, + "step": 15266 + }, + { + "epoch": 0.8402773955638725, + "grad_norm": 0.7875083684921265, + "learning_rate": 6.2656714266406384e-06, + "loss": 0.7392, + "step": 15267 + }, + { + "epoch": 0.8403324343662282, + "grad_norm": 0.7068803906440735, + "learning_rate": 6.2652520727763326e-06, + "loss": 0.6723, + "step": 15268 + }, + { + "epoch": 0.8403874731685839, + "grad_norm": 0.6994038820266724, + "learning_rate": 6.264832709402731e-06, + "loss": 0.6989, + "step": 15269 + }, + { + "epoch": 0.8404425119709396, + "grad_norm": 0.714044988155365, + "learning_rate": 6.264413336522985e-06, + "loss": 0.7464, + "step": 15270 + }, + { + "epoch": 0.8404975507732951, + "grad_norm": 0.8202210068702698, + "learning_rate": 6.263993954140249e-06, + "loss": 0.7174, + "step": 15271 + }, + { + "epoch": 0.8405525895756508, + "grad_norm": 0.6762316823005676, + "learning_rate": 6.2635745622576694e-06, + "loss": 0.7416, + "step": 15272 + }, + { + "epoch": 0.8406076283780065, + "grad_norm": 0.7461959719657898, + "learning_rate": 6.263155160878405e-06, + "loss": 0.7835, + "step": 15273 + }, + { + "epoch": 0.8406626671803622, + "grad_norm": 0.6263054609298706, + "learning_rate": 6.262735750005602e-06, + "loss": 0.7034, + "step": 15274 + }, + { + "epoch": 0.8407177059827178, + "grad_norm": 0.7489733695983887, + "learning_rate": 6.2623163296424165e-06, + "loss": 0.7387, + "step": 15275 + }, + { + "epoch": 0.8407727447850735, + "grad_norm": 0.7841430306434631, + "learning_rate": 6.261896899791997e-06, + "loss": 0.8487, + "step": 15276 + }, + { + "epoch": 0.8408277835874292, + "grad_norm": 0.8390078544616699, + "learning_rate": 6.2614774604575e-06, + "loss": 0.8335, + "step": 15277 + }, + { + "epoch": 0.8408828223897848, + "grad_norm": 0.9100946187973022, + "learning_rate": 6.261058011642076e-06, + "loss": 0.6196, + "step": 15278 + }, + { + "epoch": 0.8409378611921404, + "grad_norm": 0.7001772522926331, + "learning_rate": 6.260638553348879e-06, + "loss": 0.6935, + "step": 15279 + }, + { + "epoch": 0.8409928999944961, + "grad_norm": 0.7877102494239807, + "learning_rate": 6.260219085581057e-06, + "loss": 0.7378, + "step": 15280 + }, + { + "epoch": 0.8410479387968518, + "grad_norm": 0.687240719795227, + "learning_rate": 6.259799608341768e-06, + "loss": 0.7224, + "step": 15281 + }, + { + "epoch": 0.8411029775992075, + "grad_norm": 0.7766143083572388, + "learning_rate": 6.2593801216341625e-06, + "loss": 0.7157, + "step": 15282 + }, + { + "epoch": 0.8411580164015631, + "grad_norm": 1.1593633890151978, + "learning_rate": 6.258960625461391e-06, + "loss": 0.8555, + "step": 15283 + }, + { + "epoch": 0.8412130552039188, + "grad_norm": 0.6179451942443848, + "learning_rate": 6.2585411198266085e-06, + "loss": 0.6715, + "step": 15284 + }, + { + "epoch": 0.8412680940062744, + "grad_norm": 0.6755460500717163, + "learning_rate": 6.258121604732971e-06, + "loss": 0.7475, + "step": 15285 + }, + { + "epoch": 0.8413231328086301, + "grad_norm": 0.6775393486022949, + "learning_rate": 6.257702080183627e-06, + "loss": 0.6594, + "step": 15286 + }, + { + "epoch": 0.8413781716109857, + "grad_norm": 0.6972197890281677, + "learning_rate": 6.25728254618173e-06, + "loss": 0.7865, + "step": 15287 + }, + { + "epoch": 0.8414332104133414, + "grad_norm": 0.6446948051452637, + "learning_rate": 6.256863002730433e-06, + "loss": 0.6874, + "step": 15288 + }, + { + "epoch": 0.8414882492156971, + "grad_norm": 0.7012035846710205, + "learning_rate": 6.256443449832892e-06, + "loss": 0.7465, + "step": 15289 + }, + { + "epoch": 0.8415432880180528, + "grad_norm": 0.698693573474884, + "learning_rate": 6.256023887492257e-06, + "loss": 0.8206, + "step": 15290 + }, + { + "epoch": 0.8415983268204084, + "grad_norm": 0.7083185315132141, + "learning_rate": 6.255604315711684e-06, + "loss": 0.8306, + "step": 15291 + }, + { + "epoch": 0.841653365622764, + "grad_norm": 0.6605321764945984, + "learning_rate": 6.255184734494324e-06, + "loss": 0.6742, + "step": 15292 + }, + { + "epoch": 0.8417084044251197, + "grad_norm": 0.681881844997406, + "learning_rate": 6.254765143843331e-06, + "loss": 0.7009, + "step": 15293 + }, + { + "epoch": 0.8417634432274753, + "grad_norm": 0.6995699405670166, + "learning_rate": 6.2543455437618605e-06, + "loss": 0.8069, + "step": 15294 + }, + { + "epoch": 0.841818482029831, + "grad_norm": 0.7004442811012268, + "learning_rate": 6.2539259342530644e-06, + "loss": 0.71, + "step": 15295 + }, + { + "epoch": 0.8418735208321867, + "grad_norm": 0.7816279530525208, + "learning_rate": 6.253506315320097e-06, + "loss": 0.7833, + "step": 15296 + }, + { + "epoch": 0.8419285596345424, + "grad_norm": 0.6875490546226501, + "learning_rate": 6.25308668696611e-06, + "loss": 0.7223, + "step": 15297 + }, + { + "epoch": 0.841983598436898, + "grad_norm": 0.7126815915107727, + "learning_rate": 6.252667049194261e-06, + "loss": 0.7934, + "step": 15298 + }, + { + "epoch": 0.8420386372392537, + "grad_norm": 0.8048780560493469, + "learning_rate": 6.252247402007701e-06, + "loss": 0.7775, + "step": 15299 + }, + { + "epoch": 0.8420936760416093, + "grad_norm": 0.6681318879127502, + "learning_rate": 6.251827745409583e-06, + "loss": 0.6516, + "step": 15300 + }, + { + "epoch": 0.842148714843965, + "grad_norm": 0.6467457413673401, + "learning_rate": 6.251408079403064e-06, + "loss": 0.7417, + "step": 15301 + }, + { + "epoch": 0.8422037536463206, + "grad_norm": 0.6815666556358337, + "learning_rate": 6.250988403991297e-06, + "loss": 0.7498, + "step": 15302 + }, + { + "epoch": 0.8422587924486763, + "grad_norm": 0.6596205234527588, + "learning_rate": 6.250568719177437e-06, + "loss": 0.762, + "step": 15303 + }, + { + "epoch": 0.842313831251032, + "grad_norm": 0.7564731240272522, + "learning_rate": 6.250149024964635e-06, + "loss": 0.7592, + "step": 15304 + }, + { + "epoch": 0.8423688700533877, + "grad_norm": 0.6755058169364929, + "learning_rate": 6.249729321356048e-06, + "loss": 0.6953, + "step": 15305 + }, + { + "epoch": 0.8424239088557433, + "grad_norm": 0.7423762083053589, + "learning_rate": 6.249309608354832e-06, + "loss": 0.7018, + "step": 15306 + }, + { + "epoch": 0.8424789476580989, + "grad_norm": 0.727678120136261, + "learning_rate": 6.248889885964138e-06, + "loss": 0.8159, + "step": 15307 + }, + { + "epoch": 0.8425339864604546, + "grad_norm": 1.0823713541030884, + "learning_rate": 6.248470154187123e-06, + "loss": 0.872, + "step": 15308 + }, + { + "epoch": 0.8425890252628103, + "grad_norm": 0.6428259015083313, + "learning_rate": 6.248050413026939e-06, + "loss": 0.683, + "step": 15309 + }, + { + "epoch": 0.8426440640651659, + "grad_norm": 0.6622119545936584, + "learning_rate": 6.247630662486743e-06, + "loss": 0.7891, + "step": 15310 + }, + { + "epoch": 0.8426991028675216, + "grad_norm": 1.2377631664276123, + "learning_rate": 6.247210902569689e-06, + "loss": 0.7675, + "step": 15311 + }, + { + "epoch": 0.8427541416698773, + "grad_norm": 0.7909934520721436, + "learning_rate": 6.246791133278931e-06, + "loss": 0.8688, + "step": 15312 + }, + { + "epoch": 0.842809180472233, + "grad_norm": 0.6541300415992737, + "learning_rate": 6.246371354617625e-06, + "loss": 0.6754, + "step": 15313 + }, + { + "epoch": 0.8428642192745885, + "grad_norm": 0.6664960384368896, + "learning_rate": 6.245951566588926e-06, + "loss": 0.6666, + "step": 15314 + }, + { + "epoch": 0.8429192580769442, + "grad_norm": 0.7288552522659302, + "learning_rate": 6.245531769195988e-06, + "loss": 0.8179, + "step": 15315 + }, + { + "epoch": 0.8429742968792999, + "grad_norm": 0.7044054865837097, + "learning_rate": 6.245111962441966e-06, + "loss": 0.7306, + "step": 15316 + }, + { + "epoch": 0.8430293356816556, + "grad_norm": 0.6108603477478027, + "learning_rate": 6.244692146330016e-06, + "loss": 0.6213, + "step": 15317 + }, + { + "epoch": 0.8430843744840112, + "grad_norm": 0.6381129622459412, + "learning_rate": 6.2442723208632935e-06, + "loss": 0.7709, + "step": 15318 + }, + { + "epoch": 0.8431394132863669, + "grad_norm": 0.7355496883392334, + "learning_rate": 6.243852486044955e-06, + "loss": 0.665, + "step": 15319 + }, + { + "epoch": 0.8431944520887226, + "grad_norm": 0.7450826168060303, + "learning_rate": 6.2434326418781525e-06, + "loss": 0.7551, + "step": 15320 + }, + { + "epoch": 0.8432494908910783, + "grad_norm": 0.6463751792907715, + "learning_rate": 6.243012788366043e-06, + "loss": 0.7956, + "step": 15321 + }, + { + "epoch": 0.8433045296934338, + "grad_norm": 0.6673271059989929, + "learning_rate": 6.242592925511782e-06, + "loss": 0.7148, + "step": 15322 + }, + { + "epoch": 0.8433595684957895, + "grad_norm": 0.7663269639015198, + "learning_rate": 6.242173053318526e-06, + "loss": 0.8594, + "step": 15323 + }, + { + "epoch": 0.8434146072981452, + "grad_norm": 0.8503594994544983, + "learning_rate": 6.2417531717894285e-06, + "loss": 0.7594, + "step": 15324 + }, + { + "epoch": 0.8434696461005009, + "grad_norm": 0.6903344988822937, + "learning_rate": 6.241333280927647e-06, + "loss": 0.7252, + "step": 15325 + }, + { + "epoch": 0.8435246849028565, + "grad_norm": 0.6472830772399902, + "learning_rate": 6.240913380736337e-06, + "loss": 0.7379, + "step": 15326 + }, + { + "epoch": 0.8435797237052122, + "grad_norm": 0.6442959308624268, + "learning_rate": 6.240493471218655e-06, + "loss": 0.7447, + "step": 15327 + }, + { + "epoch": 0.8436347625075679, + "grad_norm": 0.6387843489646912, + "learning_rate": 6.240073552377756e-06, + "loss": 0.7659, + "step": 15328 + }, + { + "epoch": 0.8436898013099235, + "grad_norm": 0.7017341256141663, + "learning_rate": 6.239653624216794e-06, + "loss": 0.6934, + "step": 15329 + }, + { + "epoch": 0.8437448401122791, + "grad_norm": 0.6204355359077454, + "learning_rate": 6.2392336867389294e-06, + "loss": 0.6553, + "step": 15330 + }, + { + "epoch": 0.8437998789146348, + "grad_norm": 0.6765483021736145, + "learning_rate": 6.238813739947315e-06, + "loss": 0.7492, + "step": 15331 + }, + { + "epoch": 0.8438549177169905, + "grad_norm": 0.7261079549789429, + "learning_rate": 6.238393783845109e-06, + "loss": 0.7373, + "step": 15332 + }, + { + "epoch": 0.8439099565193462, + "grad_norm": 0.7019803524017334, + "learning_rate": 6.237973818435466e-06, + "loss": 0.7742, + "step": 15333 + }, + { + "epoch": 0.8439649953217018, + "grad_norm": 0.7521516680717468, + "learning_rate": 6.237553843721545e-06, + "loss": 0.8808, + "step": 15334 + }, + { + "epoch": 0.8440200341240575, + "grad_norm": 0.6796375513076782, + "learning_rate": 6.237133859706499e-06, + "loss": 0.7759, + "step": 15335 + }, + { + "epoch": 0.8440750729264132, + "grad_norm": 0.6199387311935425, + "learning_rate": 6.236713866393487e-06, + "loss": 0.6203, + "step": 15336 + }, + { + "epoch": 0.8441301117287687, + "grad_norm": 0.6968052983283997, + "learning_rate": 6.236293863785663e-06, + "loss": 0.7645, + "step": 15337 + }, + { + "epoch": 0.8441851505311244, + "grad_norm": 0.757556676864624, + "learning_rate": 6.235873851886186e-06, + "loss": 0.8005, + "step": 15338 + }, + { + "epoch": 0.8442401893334801, + "grad_norm": 0.6558085680007935, + "learning_rate": 6.235453830698211e-06, + "loss": 0.796, + "step": 15339 + }, + { + "epoch": 0.8442952281358358, + "grad_norm": 0.6963368654251099, + "learning_rate": 6.235033800224898e-06, + "loss": 0.7077, + "step": 15340 + }, + { + "epoch": 0.8443502669381914, + "grad_norm": 0.6057709455490112, + "learning_rate": 6.234613760469399e-06, + "loss": 0.5443, + "step": 15341 + }, + { + "epoch": 0.8444053057405471, + "grad_norm": 0.7616491317749023, + "learning_rate": 6.234193711434875e-06, + "loss": 0.6764, + "step": 15342 + }, + { + "epoch": 0.8444603445429028, + "grad_norm": 0.7143368721008301, + "learning_rate": 6.233773653124482e-06, + "loss": 0.6647, + "step": 15343 + }, + { + "epoch": 0.8445153833452584, + "grad_norm": 0.8766696453094482, + "learning_rate": 6.233353585541375e-06, + "loss": 0.7112, + "step": 15344 + }, + { + "epoch": 0.844570422147614, + "grad_norm": 0.6184048652648926, + "learning_rate": 6.232933508688714e-06, + "loss": 0.6645, + "step": 15345 + }, + { + "epoch": 0.8446254609499697, + "grad_norm": 0.8119208812713623, + "learning_rate": 6.232513422569655e-06, + "loss": 0.6729, + "step": 15346 + }, + { + "epoch": 0.8446804997523254, + "grad_norm": 0.5964543223381042, + "learning_rate": 6.2320933271873544e-06, + "loss": 0.6931, + "step": 15347 + }, + { + "epoch": 0.8447355385546811, + "grad_norm": 0.696611225605011, + "learning_rate": 6.23167322254497e-06, + "loss": 0.8292, + "step": 15348 + }, + { + "epoch": 0.8447905773570367, + "grad_norm": 0.6196489930152893, + "learning_rate": 6.231253108645658e-06, + "loss": 0.6651, + "step": 15349 + }, + { + "epoch": 0.8448456161593924, + "grad_norm": 0.6222663521766663, + "learning_rate": 6.230832985492579e-06, + "loss": 0.6513, + "step": 15350 + }, + { + "epoch": 0.844900654961748, + "grad_norm": 0.6424199342727661, + "learning_rate": 6.230412853088889e-06, + "loss": 0.7005, + "step": 15351 + }, + { + "epoch": 0.8449556937641037, + "grad_norm": 0.6484132409095764, + "learning_rate": 6.229992711437745e-06, + "loss": 0.6931, + "step": 15352 + }, + { + "epoch": 0.8450107325664593, + "grad_norm": 0.7568885684013367, + "learning_rate": 6.229572560542303e-06, + "loss": 0.7036, + "step": 15353 + }, + { + "epoch": 0.845065771368815, + "grad_norm": 0.665937602519989, + "learning_rate": 6.229152400405724e-06, + "loss": 0.5498, + "step": 15354 + }, + { + "epoch": 0.8451208101711707, + "grad_norm": 0.6861961483955383, + "learning_rate": 6.228732231031165e-06, + "loss": 0.7622, + "step": 15355 + }, + { + "epoch": 0.8451758489735264, + "grad_norm": 0.6793088316917419, + "learning_rate": 6.2283120524217845e-06, + "loss": 0.758, + "step": 15356 + }, + { + "epoch": 0.845230887775882, + "grad_norm": 0.7460890412330627, + "learning_rate": 6.227891864580739e-06, + "loss": 0.6618, + "step": 15357 + }, + { + "epoch": 0.8452859265782376, + "grad_norm": 0.6434195041656494, + "learning_rate": 6.227471667511186e-06, + "loss": 0.7226, + "step": 15358 + }, + { + "epoch": 0.8453409653805933, + "grad_norm": 0.7655256986618042, + "learning_rate": 6.227051461216285e-06, + "loss": 0.8461, + "step": 15359 + }, + { + "epoch": 0.845396004182949, + "grad_norm": 0.6727028489112854, + "learning_rate": 6.226631245699193e-06, + "loss": 0.6765, + "step": 15360 + }, + { + "epoch": 0.8454510429853046, + "grad_norm": 0.6030625700950623, + "learning_rate": 6.226211020963069e-06, + "loss": 0.6548, + "step": 15361 + }, + { + "epoch": 0.8455060817876603, + "grad_norm": 0.6430317163467407, + "learning_rate": 6.225790787011071e-06, + "loss": 0.7564, + "step": 15362 + }, + { + "epoch": 0.845561120590016, + "grad_norm": 0.633975088596344, + "learning_rate": 6.225370543846359e-06, + "loss": 0.716, + "step": 15363 + }, + { + "epoch": 0.8456161593923717, + "grad_norm": 0.6722174286842346, + "learning_rate": 6.2249502914720895e-06, + "loss": 0.7266, + "step": 15364 + }, + { + "epoch": 0.8456711981947272, + "grad_norm": 0.724166214466095, + "learning_rate": 6.22453002989142e-06, + "loss": 0.788, + "step": 15365 + }, + { + "epoch": 0.8457262369970829, + "grad_norm": 0.6406343579292297, + "learning_rate": 6.224109759107512e-06, + "loss": 0.8086, + "step": 15366 + }, + { + "epoch": 0.8457812757994386, + "grad_norm": 0.7344949245452881, + "learning_rate": 6.223689479123523e-06, + "loss": 0.7838, + "step": 15367 + }, + { + "epoch": 0.8458363146017943, + "grad_norm": 0.8572549819946289, + "learning_rate": 6.22326918994261e-06, + "loss": 0.7427, + "step": 15368 + }, + { + "epoch": 0.8458913534041499, + "grad_norm": 0.662644624710083, + "learning_rate": 6.222848891567934e-06, + "loss": 0.7165, + "step": 15369 + }, + { + "epoch": 0.8459463922065056, + "grad_norm": 0.7139797210693359, + "learning_rate": 6.222428584002654e-06, + "loss": 0.8218, + "step": 15370 + }, + { + "epoch": 0.8460014310088613, + "grad_norm": 0.6846550107002258, + "learning_rate": 6.222008267249927e-06, + "loss": 0.6686, + "step": 15371 + }, + { + "epoch": 0.846056469811217, + "grad_norm": 0.6675787568092346, + "learning_rate": 6.221587941312914e-06, + "loss": 0.7151, + "step": 15372 + }, + { + "epoch": 0.8461115086135725, + "grad_norm": 0.626371443271637, + "learning_rate": 6.221167606194771e-06, + "loss": 0.7637, + "step": 15373 + }, + { + "epoch": 0.8461665474159282, + "grad_norm": 0.6768763065338135, + "learning_rate": 6.220747261898661e-06, + "loss": 0.7363, + "step": 15374 + }, + { + "epoch": 0.8462215862182839, + "grad_norm": 0.7771314978599548, + "learning_rate": 6.220326908427741e-06, + "loss": 0.7032, + "step": 15375 + }, + { + "epoch": 0.8462766250206396, + "grad_norm": 0.8215247392654419, + "learning_rate": 6.219906545785171e-06, + "loss": 0.8917, + "step": 15376 + }, + { + "epoch": 0.8463316638229952, + "grad_norm": 0.7277588248252869, + "learning_rate": 6.219486173974107e-06, + "loss": 0.7531, + "step": 15377 + }, + { + "epoch": 0.8463867026253509, + "grad_norm": 0.6487376093864441, + "learning_rate": 6.219065792997714e-06, + "loss": 0.7182, + "step": 15378 + }, + { + "epoch": 0.8464417414277066, + "grad_norm": 0.6960493326187134, + "learning_rate": 6.218645402859148e-06, + "loss": 0.8125, + "step": 15379 + }, + { + "epoch": 0.8464967802300621, + "grad_norm": 0.7183159589767456, + "learning_rate": 6.218225003561571e-06, + "loss": 0.6536, + "step": 15380 + }, + { + "epoch": 0.8465518190324178, + "grad_norm": 0.7001940011978149, + "learning_rate": 6.217804595108139e-06, + "loss": 0.8203, + "step": 15381 + }, + { + "epoch": 0.8466068578347735, + "grad_norm": 0.5986705422401428, + "learning_rate": 6.217384177502015e-06, + "loss": 0.6672, + "step": 15382 + }, + { + "epoch": 0.8466618966371292, + "grad_norm": 0.6191138029098511, + "learning_rate": 6.216963750746356e-06, + "loss": 0.6565, + "step": 15383 + }, + { + "epoch": 0.8467169354394848, + "grad_norm": 1.2927004098892212, + "learning_rate": 6.216543314844326e-06, + "loss": 0.7511, + "step": 15384 + }, + { + "epoch": 0.8467719742418405, + "grad_norm": 0.6715198159217834, + "learning_rate": 6.2161228697990785e-06, + "loss": 0.7712, + "step": 15385 + }, + { + "epoch": 0.8468270130441962, + "grad_norm": 0.7516033053398132, + "learning_rate": 6.215702415613778e-06, + "loss": 0.6595, + "step": 15386 + }, + { + "epoch": 0.8468820518465519, + "grad_norm": 0.6913008689880371, + "learning_rate": 6.215281952291585e-06, + "loss": 0.7262, + "step": 15387 + }, + { + "epoch": 0.8469370906489074, + "grad_norm": 0.7288102507591248, + "learning_rate": 6.214861479835657e-06, + "loss": 0.6628, + "step": 15388 + }, + { + "epoch": 0.8469921294512631, + "grad_norm": 0.7889914512634277, + "learning_rate": 6.214440998249155e-06, + "loss": 0.7744, + "step": 15389 + }, + { + "epoch": 0.8470471682536188, + "grad_norm": 0.7622396945953369, + "learning_rate": 6.21402050753524e-06, + "loss": 0.7818, + "step": 15390 + }, + { + "epoch": 0.8471022070559745, + "grad_norm": 0.6172721982002258, + "learning_rate": 6.213600007697072e-06, + "loss": 0.626, + "step": 15391 + }, + { + "epoch": 0.8471572458583301, + "grad_norm": 0.710991621017456, + "learning_rate": 6.213179498737812e-06, + "loss": 0.7313, + "step": 15392 + }, + { + "epoch": 0.8472122846606858, + "grad_norm": 0.660139262676239, + "learning_rate": 6.2127589806606195e-06, + "loss": 0.6479, + "step": 15393 + }, + { + "epoch": 0.8472673234630415, + "grad_norm": 0.6611735224723816, + "learning_rate": 6.2123384534686534e-06, + "loss": 0.7091, + "step": 15394 + }, + { + "epoch": 0.8473223622653971, + "grad_norm": 0.8392653465270996, + "learning_rate": 6.211917917165078e-06, + "loss": 0.8514, + "step": 15395 + }, + { + "epoch": 0.8473774010677527, + "grad_norm": 0.6202608942985535, + "learning_rate": 6.211497371753052e-06, + "loss": 0.7068, + "step": 15396 + }, + { + "epoch": 0.8474324398701084, + "grad_norm": 0.6785926818847656, + "learning_rate": 6.211076817235734e-06, + "loss": 0.7216, + "step": 15397 + }, + { + "epoch": 0.8474874786724641, + "grad_norm": 0.7234075665473938, + "learning_rate": 6.210656253616288e-06, + "loss": 0.7379, + "step": 15398 + }, + { + "epoch": 0.8475425174748198, + "grad_norm": 0.6223714351654053, + "learning_rate": 6.210235680897874e-06, + "loss": 0.758, + "step": 15399 + }, + { + "epoch": 0.8475975562771754, + "grad_norm": 0.7993804812431335, + "learning_rate": 6.209815099083651e-06, + "loss": 0.8174, + "step": 15400 + }, + { + "epoch": 0.8476525950795311, + "grad_norm": 0.7897897362709045, + "learning_rate": 6.209394508176783e-06, + "loss": 0.6833, + "step": 15401 + }, + { + "epoch": 0.8477076338818867, + "grad_norm": 0.6803291440010071, + "learning_rate": 6.208973908180429e-06, + "loss": 0.7977, + "step": 15402 + }, + { + "epoch": 0.8477626726842424, + "grad_norm": 0.6937161087989807, + "learning_rate": 6.208553299097751e-06, + "loss": 0.7118, + "step": 15403 + }, + { + "epoch": 0.847817711486598, + "grad_norm": 0.7939958572387695, + "learning_rate": 6.208132680931911e-06, + "loss": 0.794, + "step": 15404 + }, + { + "epoch": 0.8478727502889537, + "grad_norm": 0.7009061574935913, + "learning_rate": 6.207712053686068e-06, + "loss": 0.7534, + "step": 15405 + }, + { + "epoch": 0.8479277890913094, + "grad_norm": 0.6890555620193481, + "learning_rate": 6.207291417363384e-06, + "loss": 0.7638, + "step": 15406 + }, + { + "epoch": 0.8479828278936651, + "grad_norm": 0.677119255065918, + "learning_rate": 6.206870771967022e-06, + "loss": 0.6814, + "step": 15407 + }, + { + "epoch": 0.8480378666960207, + "grad_norm": 0.706792950630188, + "learning_rate": 6.2064501175001425e-06, + "loss": 0.7722, + "step": 15408 + }, + { + "epoch": 0.8480929054983763, + "grad_norm": 0.6590496897697449, + "learning_rate": 6.206029453965905e-06, + "loss": 0.772, + "step": 15409 + }, + { + "epoch": 0.848147944300732, + "grad_norm": 0.6821194887161255, + "learning_rate": 6.205608781367475e-06, + "loss": 0.7687, + "step": 15410 + }, + { + "epoch": 0.8482029831030877, + "grad_norm": 0.6030088663101196, + "learning_rate": 6.205188099708011e-06, + "loss": 0.6673, + "step": 15411 + }, + { + "epoch": 0.8482580219054433, + "grad_norm": 0.6877727508544922, + "learning_rate": 6.204767408990676e-06, + "loss": 0.756, + "step": 15412 + }, + { + "epoch": 0.848313060707799, + "grad_norm": 0.7107367515563965, + "learning_rate": 6.204346709218632e-06, + "loss": 0.7481, + "step": 15413 + }, + { + "epoch": 0.8483680995101547, + "grad_norm": 0.7213658094406128, + "learning_rate": 6.2039260003950395e-06, + "loss": 0.7135, + "step": 15414 + }, + { + "epoch": 0.8484231383125104, + "grad_norm": 0.7002324461936951, + "learning_rate": 6.203505282523063e-06, + "loss": 0.6768, + "step": 15415 + }, + { + "epoch": 0.848478177114866, + "grad_norm": 0.7483230829238892, + "learning_rate": 6.2030845556058614e-06, + "loss": 0.633, + "step": 15416 + }, + { + "epoch": 0.8485332159172216, + "grad_norm": 0.6701670289039612, + "learning_rate": 6.2026638196466e-06, + "loss": 0.7936, + "step": 15417 + }, + { + "epoch": 0.8485882547195773, + "grad_norm": 0.6940304636955261, + "learning_rate": 6.202243074648438e-06, + "loss": 0.7787, + "step": 15418 + }, + { + "epoch": 0.848643293521933, + "grad_norm": 0.5912098288536072, + "learning_rate": 6.20182232061454e-06, + "loss": 0.6458, + "step": 15419 + }, + { + "epoch": 0.8486983323242886, + "grad_norm": 0.6538116931915283, + "learning_rate": 6.201401557548066e-06, + "loss": 0.6986, + "step": 15420 + }, + { + "epoch": 0.8487533711266443, + "grad_norm": 1.0245170593261719, + "learning_rate": 6.20098078545218e-06, + "loss": 0.7111, + "step": 15421 + }, + { + "epoch": 0.848808409929, + "grad_norm": 0.6896708011627197, + "learning_rate": 6.200560004330043e-06, + "loss": 0.7921, + "step": 15422 + }, + { + "epoch": 0.8488634487313556, + "grad_norm": 0.6219936013221741, + "learning_rate": 6.2001392141848195e-06, + "loss": 0.7345, + "step": 15423 + }, + { + "epoch": 0.8489184875337112, + "grad_norm": 0.7418678998947144, + "learning_rate": 6.199718415019671e-06, + "loss": 0.8517, + "step": 15424 + }, + { + "epoch": 0.8489735263360669, + "grad_norm": 0.7002347111701965, + "learning_rate": 6.199297606837759e-06, + "loss": 0.7345, + "step": 15425 + }, + { + "epoch": 0.8490285651384226, + "grad_norm": 0.7004539966583252, + "learning_rate": 6.198876789642247e-06, + "loss": 0.7639, + "step": 15426 + }, + { + "epoch": 0.8490836039407782, + "grad_norm": 0.64945387840271, + "learning_rate": 6.1984559634362995e-06, + "loss": 0.7556, + "step": 15427 + }, + { + "epoch": 0.8491386427431339, + "grad_norm": 0.6660465598106384, + "learning_rate": 6.1980351282230764e-06, + "loss": 0.7342, + "step": 15428 + }, + { + "epoch": 0.8491936815454896, + "grad_norm": 0.6177669763565063, + "learning_rate": 6.197614284005743e-06, + "loss": 0.7092, + "step": 15429 + }, + { + "epoch": 0.8492487203478453, + "grad_norm": 0.7604618072509766, + "learning_rate": 6.197193430787462e-06, + "loss": 0.8271, + "step": 15430 + }, + { + "epoch": 0.8493037591502008, + "grad_norm": 0.6788204312324524, + "learning_rate": 6.196772568571394e-06, + "loss": 0.7817, + "step": 15431 + }, + { + "epoch": 0.8493587979525565, + "grad_norm": 0.6073753833770752, + "learning_rate": 6.196351697360704e-06, + "loss": 0.6479, + "step": 15432 + }, + { + "epoch": 0.8494138367549122, + "grad_norm": 0.6842348575592041, + "learning_rate": 6.195930817158555e-06, + "loss": 0.7956, + "step": 15433 + }, + { + "epoch": 0.8494688755572679, + "grad_norm": 0.7863163352012634, + "learning_rate": 6.19550992796811e-06, + "loss": 0.7441, + "step": 15434 + }, + { + "epoch": 0.8495239143596235, + "grad_norm": 0.7495602965354919, + "learning_rate": 6.195089029792532e-06, + "loss": 0.7854, + "step": 15435 + }, + { + "epoch": 0.8495789531619792, + "grad_norm": 0.6595779061317444, + "learning_rate": 6.194668122634986e-06, + "loss": 0.6705, + "step": 15436 + }, + { + "epoch": 0.8496339919643349, + "grad_norm": 0.7727940082550049, + "learning_rate": 6.194247206498633e-06, + "loss": 0.7269, + "step": 15437 + }, + { + "epoch": 0.8496890307666906, + "grad_norm": 0.7433161735534668, + "learning_rate": 6.193826281386639e-06, + "loss": 0.7747, + "step": 15438 + }, + { + "epoch": 0.8497440695690461, + "grad_norm": 0.7075695991516113, + "learning_rate": 6.193405347302165e-06, + "loss": 0.8423, + "step": 15439 + }, + { + "epoch": 0.8497991083714018, + "grad_norm": 0.8821007013320923, + "learning_rate": 6.192984404248377e-06, + "loss": 0.705, + "step": 15440 + }, + { + "epoch": 0.8498541471737575, + "grad_norm": 0.7283695936203003, + "learning_rate": 6.192563452228437e-06, + "loss": 0.7013, + "step": 15441 + }, + { + "epoch": 0.8499091859761132, + "grad_norm": 0.7810649275779724, + "learning_rate": 6.192142491245509e-06, + "loss": 0.8303, + "step": 15442 + }, + { + "epoch": 0.8499642247784688, + "grad_norm": 0.5930086374282837, + "learning_rate": 6.191721521302758e-06, + "loss": 0.7117, + "step": 15443 + }, + { + "epoch": 0.8500192635808245, + "grad_norm": 0.6570530533790588, + "learning_rate": 6.191300542403347e-06, + "loss": 0.7525, + "step": 15444 + }, + { + "epoch": 0.8500743023831802, + "grad_norm": 0.8024932146072388, + "learning_rate": 6.190879554550437e-06, + "loss": 0.8011, + "step": 15445 + }, + { + "epoch": 0.8501293411855358, + "grad_norm": 0.851327121257782, + "learning_rate": 6.190458557747199e-06, + "loss": 0.8117, + "step": 15446 + }, + { + "epoch": 0.8501843799878914, + "grad_norm": 0.816034197807312, + "learning_rate": 6.190037551996791e-06, + "loss": 0.6659, + "step": 15447 + }, + { + "epoch": 0.8502394187902471, + "grad_norm": 0.7001582980155945, + "learning_rate": 6.18961653730238e-06, + "loss": 0.7406, + "step": 15448 + }, + { + "epoch": 0.8502944575926028, + "grad_norm": 0.6798322200775146, + "learning_rate": 6.189195513667129e-06, + "loss": 0.7504, + "step": 15449 + }, + { + "epoch": 0.8503494963949585, + "grad_norm": 0.6565585136413574, + "learning_rate": 6.188774481094203e-06, + "loss": 0.6445, + "step": 15450 + }, + { + "epoch": 0.8504045351973141, + "grad_norm": 0.674721360206604, + "learning_rate": 6.188353439586767e-06, + "loss": 0.6718, + "step": 15451 + }, + { + "epoch": 0.8504595739996698, + "grad_norm": 0.7626152634620667, + "learning_rate": 6.187932389147984e-06, + "loss": 0.7273, + "step": 15452 + }, + { + "epoch": 0.8505146128020254, + "grad_norm": 0.6497740149497986, + "learning_rate": 6.18751132978102e-06, + "loss": 0.7619, + "step": 15453 + }, + { + "epoch": 0.8505696516043811, + "grad_norm": 0.7532587647438049, + "learning_rate": 6.1870902614890384e-06, + "loss": 0.6929, + "step": 15454 + }, + { + "epoch": 0.8506246904067367, + "grad_norm": 0.5978666543960571, + "learning_rate": 6.186669184275204e-06, + "loss": 0.6246, + "step": 15455 + }, + { + "epoch": 0.8506797292090924, + "grad_norm": 0.8646023869514465, + "learning_rate": 6.186248098142681e-06, + "loss": 0.8215, + "step": 15456 + }, + { + "epoch": 0.8507347680114481, + "grad_norm": 0.635597825050354, + "learning_rate": 6.1858270030946355e-06, + "loss": 0.7683, + "step": 15457 + }, + { + "epoch": 0.8507898068138038, + "grad_norm": 0.7014510631561279, + "learning_rate": 6.185405899134231e-06, + "loss": 0.708, + "step": 15458 + }, + { + "epoch": 0.8508448456161594, + "grad_norm": 1.6040544509887695, + "learning_rate": 6.184984786264633e-06, + "loss": 0.893, + "step": 15459 + }, + { + "epoch": 0.850899884418515, + "grad_norm": 0.8098211288452148, + "learning_rate": 6.184563664489007e-06, + "loss": 0.7704, + "step": 15460 + }, + { + "epoch": 0.8509549232208707, + "grad_norm": 0.8390217423439026, + "learning_rate": 6.184142533810518e-06, + "loss": 0.7161, + "step": 15461 + }, + { + "epoch": 0.8510099620232264, + "grad_norm": 0.792433500289917, + "learning_rate": 6.183721394232329e-06, + "loss": 0.7247, + "step": 15462 + }, + { + "epoch": 0.851065000825582, + "grad_norm": 0.6644124984741211, + "learning_rate": 6.183300245757609e-06, + "loss": 0.7996, + "step": 15463 + }, + { + "epoch": 0.8511200396279377, + "grad_norm": 0.7543407082557678, + "learning_rate": 6.182879088389521e-06, + "loss": 0.7912, + "step": 15464 + }, + { + "epoch": 0.8511750784302934, + "grad_norm": 0.7752966284751892, + "learning_rate": 6.18245792213123e-06, + "loss": 0.7222, + "step": 15465 + }, + { + "epoch": 0.851230117232649, + "grad_norm": 0.6863895654678345, + "learning_rate": 6.182036746985901e-06, + "loss": 0.7375, + "step": 15466 + }, + { + "epoch": 0.8512851560350047, + "grad_norm": 0.6404759287834167, + "learning_rate": 6.1816155629567006e-06, + "loss": 0.7666, + "step": 15467 + }, + { + "epoch": 0.8513401948373603, + "grad_norm": 0.6879389882087708, + "learning_rate": 6.181194370046795e-06, + "loss": 0.697, + "step": 15468 + }, + { + "epoch": 0.851395233639716, + "grad_norm": 0.6798561215400696, + "learning_rate": 6.180773168259347e-06, + "loss": 0.7401, + "step": 15469 + }, + { + "epoch": 0.8514502724420716, + "grad_norm": 0.6846516728401184, + "learning_rate": 6.180351957597524e-06, + "loss": 0.7512, + "step": 15470 + }, + { + "epoch": 0.8515053112444273, + "grad_norm": 0.6618537902832031, + "learning_rate": 6.1799307380644925e-06, + "loss": 0.7577, + "step": 15471 + }, + { + "epoch": 0.851560350046783, + "grad_norm": 0.7780229449272156, + "learning_rate": 6.179509509663417e-06, + "loss": 0.7145, + "step": 15472 + }, + { + "epoch": 0.8516153888491387, + "grad_norm": 0.8041914701461792, + "learning_rate": 6.179088272397464e-06, + "loss": 0.7662, + "step": 15473 + }, + { + "epoch": 0.8516704276514943, + "grad_norm": 0.6719852685928345, + "learning_rate": 6.178667026269799e-06, + "loss": 0.7269, + "step": 15474 + }, + { + "epoch": 0.8517254664538499, + "grad_norm": 0.6677427291870117, + "learning_rate": 6.178245771283589e-06, + "loss": 0.6949, + "step": 15475 + }, + { + "epoch": 0.8517805052562056, + "grad_norm": 0.7873098850250244, + "learning_rate": 6.177824507441998e-06, + "loss": 0.7435, + "step": 15476 + }, + { + "epoch": 0.8518355440585613, + "grad_norm": 0.726902186870575, + "learning_rate": 6.1774032347481935e-06, + "loss": 0.8712, + "step": 15477 + }, + { + "epoch": 0.8518905828609169, + "grad_norm": 0.6984882950782776, + "learning_rate": 6.176981953205342e-06, + "loss": 0.8705, + "step": 15478 + }, + { + "epoch": 0.8519456216632726, + "grad_norm": 0.9757348895072937, + "learning_rate": 6.176560662816609e-06, + "loss": 0.8172, + "step": 15479 + }, + { + "epoch": 0.8520006604656283, + "grad_norm": 0.8358891606330872, + "learning_rate": 6.1761393635851615e-06, + "loss": 0.7576, + "step": 15480 + }, + { + "epoch": 0.852055699267984, + "grad_norm": 0.7324516177177429, + "learning_rate": 6.175718055514165e-06, + "loss": 0.6785, + "step": 15481 + }, + { + "epoch": 0.8521107380703395, + "grad_norm": 0.7352122664451599, + "learning_rate": 6.175296738606785e-06, + "loss": 0.6998, + "step": 15482 + }, + { + "epoch": 0.8521657768726952, + "grad_norm": 0.7331148982048035, + "learning_rate": 6.17487541286619e-06, + "loss": 0.7318, + "step": 15483 + }, + { + "epoch": 0.8522208156750509, + "grad_norm": 0.7264272570610046, + "learning_rate": 6.174454078295547e-06, + "loss": 0.8328, + "step": 15484 + }, + { + "epoch": 0.8522758544774066, + "grad_norm": 0.6304524540901184, + "learning_rate": 6.174032734898021e-06, + "loss": 0.6982, + "step": 15485 + }, + { + "epoch": 0.8523308932797622, + "grad_norm": 0.709864616394043, + "learning_rate": 6.173611382676778e-06, + "loss": 0.784, + "step": 15486 + }, + { + "epoch": 0.8523859320821179, + "grad_norm": 0.689153254032135, + "learning_rate": 6.173190021634987e-06, + "loss": 0.596, + "step": 15487 + }, + { + "epoch": 0.8524409708844736, + "grad_norm": 0.6987396478652954, + "learning_rate": 6.172768651775815e-06, + "loss": 0.776, + "step": 15488 + }, + { + "epoch": 0.8524960096868293, + "grad_norm": 0.6473208665847778, + "learning_rate": 6.172347273102427e-06, + "loss": 0.7084, + "step": 15489 + }, + { + "epoch": 0.8525510484891848, + "grad_norm": 0.7659436464309692, + "learning_rate": 6.17192588561799e-06, + "loss": 0.6916, + "step": 15490 + }, + { + "epoch": 0.8526060872915405, + "grad_norm": 0.6404485702514648, + "learning_rate": 6.171504489325673e-06, + "loss": 0.6761, + "step": 15491 + }, + { + "epoch": 0.8526611260938962, + "grad_norm": 0.7309281826019287, + "learning_rate": 6.171083084228641e-06, + "loss": 0.7993, + "step": 15492 + }, + { + "epoch": 0.8527161648962519, + "grad_norm": 0.6718485355377197, + "learning_rate": 6.170661670330062e-06, + "loss": 0.719, + "step": 15493 + }, + { + "epoch": 0.8527712036986075, + "grad_norm": 0.7705026865005493, + "learning_rate": 6.170240247633101e-06, + "loss": 0.7148, + "step": 15494 + }, + { + "epoch": 0.8528262425009632, + "grad_norm": 0.9131157398223877, + "learning_rate": 6.169818816140931e-06, + "loss": 0.7439, + "step": 15495 + }, + { + "epoch": 0.8528812813033189, + "grad_norm": 0.7396848201751709, + "learning_rate": 6.169397375856715e-06, + "loss": 0.7745, + "step": 15496 + }, + { + "epoch": 0.8529363201056746, + "grad_norm": 0.613767147064209, + "learning_rate": 6.168975926783621e-06, + "loss": 0.6724, + "step": 15497 + }, + { + "epoch": 0.8529913589080301, + "grad_norm": 0.8665019869804382, + "learning_rate": 6.168554468924815e-06, + "loss": 0.6896, + "step": 15498 + }, + { + "epoch": 0.8530463977103858, + "grad_norm": 0.7870811223983765, + "learning_rate": 6.168133002283469e-06, + "loss": 0.7177, + "step": 15499 + }, + { + "epoch": 0.8531014365127415, + "grad_norm": 0.5830097794532776, + "learning_rate": 6.167711526862747e-06, + "loss": 0.6465, + "step": 15500 + }, + { + "epoch": 0.8531564753150972, + "grad_norm": 0.6497567892074585, + "learning_rate": 6.167290042665819e-06, + "loss": 0.6842, + "step": 15501 + }, + { + "epoch": 0.8532115141174528, + "grad_norm": 0.6574105620384216, + "learning_rate": 6.1668685496958515e-06, + "loss": 0.7197, + "step": 15502 + }, + { + "epoch": 0.8532665529198085, + "grad_norm": 0.7069656252861023, + "learning_rate": 6.166447047956011e-06, + "loss": 0.8031, + "step": 15503 + }, + { + "epoch": 0.8533215917221642, + "grad_norm": 0.700334370136261, + "learning_rate": 6.166025537449467e-06, + "loss": 0.7731, + "step": 15504 + }, + { + "epoch": 0.8533766305245198, + "grad_norm": 0.7227431535720825, + "learning_rate": 6.165604018179388e-06, + "loss": 0.7741, + "step": 15505 + }, + { + "epoch": 0.8534316693268754, + "grad_norm": 0.6752750277519226, + "learning_rate": 6.16518249014894e-06, + "loss": 0.7176, + "step": 15506 + }, + { + "epoch": 0.8534867081292311, + "grad_norm": 0.6595750451087952, + "learning_rate": 6.1647609533612925e-06, + "loss": 0.7758, + "step": 15507 + }, + { + "epoch": 0.8535417469315868, + "grad_norm": 0.7232886552810669, + "learning_rate": 6.1643394078196136e-06, + "loss": 0.667, + "step": 15508 + }, + { + "epoch": 0.8535967857339424, + "grad_norm": 0.7297642827033997, + "learning_rate": 6.163917853527072e-06, + "loss": 0.6951, + "step": 15509 + }, + { + "epoch": 0.8536518245362981, + "grad_norm": 0.6766324043273926, + "learning_rate": 6.163496290486834e-06, + "loss": 0.7429, + "step": 15510 + }, + { + "epoch": 0.8537068633386538, + "grad_norm": 0.6861003041267395, + "learning_rate": 6.16307471870207e-06, + "loss": 0.8159, + "step": 15511 + }, + { + "epoch": 0.8537619021410094, + "grad_norm": 0.687471330165863, + "learning_rate": 6.1626531381759494e-06, + "loss": 0.7001, + "step": 15512 + }, + { + "epoch": 0.853816940943365, + "grad_norm": 0.8295855522155762, + "learning_rate": 6.162231548911637e-06, + "loss": 0.824, + "step": 15513 + }, + { + "epoch": 0.8538719797457207, + "grad_norm": 0.7513163685798645, + "learning_rate": 6.161809950912304e-06, + "loss": 0.7563, + "step": 15514 + }, + { + "epoch": 0.8539270185480764, + "grad_norm": 0.824586033821106, + "learning_rate": 6.161388344181119e-06, + "loss": 0.9607, + "step": 15515 + }, + { + "epoch": 0.8539820573504321, + "grad_norm": 0.9423270225524902, + "learning_rate": 6.160966728721249e-06, + "loss": 0.7527, + "step": 15516 + }, + { + "epoch": 0.8540370961527877, + "grad_norm": 0.6734938621520996, + "learning_rate": 6.160545104535866e-06, + "loss": 0.7741, + "step": 15517 + }, + { + "epoch": 0.8540921349551434, + "grad_norm": 0.6953195929527283, + "learning_rate": 6.160123471628133e-06, + "loss": 0.7844, + "step": 15518 + }, + { + "epoch": 0.854147173757499, + "grad_norm": 0.8023058772087097, + "learning_rate": 6.1597018300012245e-06, + "loss": 0.815, + "step": 15519 + }, + { + "epoch": 0.8542022125598547, + "grad_norm": 1.1232868432998657, + "learning_rate": 6.159280179658308e-06, + "loss": 0.7702, + "step": 15520 + }, + { + "epoch": 0.8542572513622103, + "grad_norm": 0.6074864268302917, + "learning_rate": 6.158858520602552e-06, + "loss": 0.6025, + "step": 15521 + }, + { + "epoch": 0.854312290164566, + "grad_norm": 0.6659427285194397, + "learning_rate": 6.158436852837124e-06, + "loss": 0.7019, + "step": 15522 + }, + { + "epoch": 0.8543673289669217, + "grad_norm": 0.6226561665534973, + "learning_rate": 6.158015176365197e-06, + "loss": 0.6748, + "step": 15523 + }, + { + "epoch": 0.8544223677692774, + "grad_norm": 0.6821898818016052, + "learning_rate": 6.157593491189936e-06, + "loss": 0.7617, + "step": 15524 + }, + { + "epoch": 0.854477406571633, + "grad_norm": 0.6755489110946655, + "learning_rate": 6.157171797314513e-06, + "loss": 0.6664, + "step": 15525 + }, + { + "epoch": 0.8545324453739886, + "grad_norm": 0.5896545052528381, + "learning_rate": 6.156750094742096e-06, + "loss": 0.7147, + "step": 15526 + }, + { + "epoch": 0.8545874841763443, + "grad_norm": 0.7024844288825989, + "learning_rate": 6.1563283834758555e-06, + "loss": 0.7884, + "step": 15527 + }, + { + "epoch": 0.8546425229787, + "grad_norm": 0.8020251393318176, + "learning_rate": 6.15590666351896e-06, + "loss": 0.8014, + "step": 15528 + }, + { + "epoch": 0.8546975617810556, + "grad_norm": 0.7213570475578308, + "learning_rate": 6.1554849348745805e-06, + "loss": 0.7289, + "step": 15529 + }, + { + "epoch": 0.8547526005834113, + "grad_norm": 0.618235170841217, + "learning_rate": 6.155063197545884e-06, + "loss": 0.7241, + "step": 15530 + }, + { + "epoch": 0.854807639385767, + "grad_norm": 0.7157233953475952, + "learning_rate": 6.154641451536042e-06, + "loss": 0.7591, + "step": 15531 + }, + { + "epoch": 0.8548626781881227, + "grad_norm": 0.7156147360801697, + "learning_rate": 6.1542196968482245e-06, + "loss": 0.7337, + "step": 15532 + }, + { + "epoch": 0.8549177169904782, + "grad_norm": 0.7722104787826538, + "learning_rate": 6.153797933485601e-06, + "loss": 0.7287, + "step": 15533 + }, + { + "epoch": 0.8549727557928339, + "grad_norm": 0.8379881381988525, + "learning_rate": 6.1533761614513394e-06, + "loss": 0.7898, + "step": 15534 + }, + { + "epoch": 0.8550277945951896, + "grad_norm": 0.6861830949783325, + "learning_rate": 6.152954380748614e-06, + "loss": 0.7838, + "step": 15535 + }, + { + "epoch": 0.8550828333975453, + "grad_norm": 0.6730731725692749, + "learning_rate": 6.15253259138059e-06, + "loss": 0.7606, + "step": 15536 + }, + { + "epoch": 0.8551378721999009, + "grad_norm": 0.6933832168579102, + "learning_rate": 6.152110793350441e-06, + "loss": 0.7637, + "step": 15537 + }, + { + "epoch": 0.8551929110022566, + "grad_norm": 0.9518343806266785, + "learning_rate": 6.151688986661335e-06, + "loss": 0.7715, + "step": 15538 + }, + { + "epoch": 0.8552479498046123, + "grad_norm": 0.7800498008728027, + "learning_rate": 6.151267171316442e-06, + "loss": 0.7933, + "step": 15539 + }, + { + "epoch": 0.855302988606968, + "grad_norm": 0.8873908519744873, + "learning_rate": 6.150845347318934e-06, + "loss": 0.8349, + "step": 15540 + }, + { + "epoch": 0.8553580274093235, + "grad_norm": 0.6778621673583984, + "learning_rate": 6.15042351467198e-06, + "loss": 0.7855, + "step": 15541 + }, + { + "epoch": 0.8554130662116792, + "grad_norm": 0.6535203456878662, + "learning_rate": 6.150001673378751e-06, + "loss": 0.7288, + "step": 15542 + }, + { + "epoch": 0.8554681050140349, + "grad_norm": 0.7087036967277527, + "learning_rate": 6.149579823442418e-06, + "loss": 0.8542, + "step": 15543 + }, + { + "epoch": 0.8555231438163906, + "grad_norm": 0.8136983513832092, + "learning_rate": 6.1491579648661495e-06, + "loss": 0.8618, + "step": 15544 + }, + { + "epoch": 0.8555781826187462, + "grad_norm": 0.7439128756523132, + "learning_rate": 6.148736097653118e-06, + "loss": 0.7257, + "step": 15545 + }, + { + "epoch": 0.8556332214211019, + "grad_norm": 0.853769838809967, + "learning_rate": 6.148314221806493e-06, + "loss": 0.8759, + "step": 15546 + }, + { + "epoch": 0.8556882602234576, + "grad_norm": 0.6681458950042725, + "learning_rate": 6.147892337329446e-06, + "loss": 0.6993, + "step": 15547 + }, + { + "epoch": 0.8557432990258133, + "grad_norm": 0.6452274918556213, + "learning_rate": 6.147470444225147e-06, + "loss": 0.6838, + "step": 15548 + }, + { + "epoch": 0.8557983378281688, + "grad_norm": 0.7074391841888428, + "learning_rate": 6.147048542496769e-06, + "loss": 0.7978, + "step": 15549 + }, + { + "epoch": 0.8558533766305245, + "grad_norm": 0.634824275970459, + "learning_rate": 6.14662663214748e-06, + "loss": 0.6602, + "step": 15550 + }, + { + "epoch": 0.8559084154328802, + "grad_norm": 0.7253528237342834, + "learning_rate": 6.146204713180453e-06, + "loss": 0.8662, + "step": 15551 + }, + { + "epoch": 0.8559634542352358, + "grad_norm": 0.6583418846130371, + "learning_rate": 6.145782785598858e-06, + "loss": 0.7502, + "step": 15552 + }, + { + "epoch": 0.8560184930375915, + "grad_norm": 0.7276360392570496, + "learning_rate": 6.1453608494058645e-06, + "loss": 0.7066, + "step": 15553 + }, + { + "epoch": 0.8560735318399472, + "grad_norm": 0.6554223299026489, + "learning_rate": 6.144938904604646e-06, + "loss": 0.7468, + "step": 15554 + }, + { + "epoch": 0.8561285706423029, + "grad_norm": 0.6767130494117737, + "learning_rate": 6.144516951198374e-06, + "loss": 0.724, + "step": 15555 + }, + { + "epoch": 0.8561836094446584, + "grad_norm": 0.7025824785232544, + "learning_rate": 6.144094989190219e-06, + "loss": 0.7658, + "step": 15556 + }, + { + "epoch": 0.8562386482470141, + "grad_norm": 0.6780791282653809, + "learning_rate": 6.143673018583353e-06, + "loss": 0.7273, + "step": 15557 + }, + { + "epoch": 0.8562936870493698, + "grad_norm": 0.6552621722221375, + "learning_rate": 6.143251039380944e-06, + "loss": 0.7374, + "step": 15558 + }, + { + "epoch": 0.8563487258517255, + "grad_norm": 0.7451765537261963, + "learning_rate": 6.142829051586169e-06, + "loss": 0.7918, + "step": 15559 + }, + { + "epoch": 0.8564037646540811, + "grad_norm": 0.7365521788597107, + "learning_rate": 6.142407055202195e-06, + "loss": 0.7142, + "step": 15560 + }, + { + "epoch": 0.8564588034564368, + "grad_norm": 0.6708245277404785, + "learning_rate": 6.1419850502321976e-06, + "loss": 0.773, + "step": 15561 + }, + { + "epoch": 0.8565138422587925, + "grad_norm": 0.8878017067909241, + "learning_rate": 6.141563036679344e-06, + "loss": 0.7704, + "step": 15562 + }, + { + "epoch": 0.8565688810611481, + "grad_norm": 0.8903444409370422, + "learning_rate": 6.14114101454681e-06, + "loss": 0.8422, + "step": 15563 + }, + { + "epoch": 0.8566239198635037, + "grad_norm": 0.7124255895614624, + "learning_rate": 6.140718983837764e-06, + "loss": 0.74, + "step": 15564 + }, + { + "epoch": 0.8566789586658594, + "grad_norm": 0.7107509970664978, + "learning_rate": 6.14029694455538e-06, + "loss": 0.7052, + "step": 15565 + }, + { + "epoch": 0.8567339974682151, + "grad_norm": 0.6864815950393677, + "learning_rate": 6.13987489670283e-06, + "loss": 0.7541, + "step": 15566 + }, + { + "epoch": 0.8567890362705708, + "grad_norm": 0.696169912815094, + "learning_rate": 6.1394528402832845e-06, + "loss": 0.7829, + "step": 15567 + }, + { + "epoch": 0.8568440750729264, + "grad_norm": 0.5650855302810669, + "learning_rate": 6.139030775299917e-06, + "loss": 0.6268, + "step": 15568 + }, + { + "epoch": 0.8568991138752821, + "grad_norm": 0.8753485083580017, + "learning_rate": 6.138608701755899e-06, + "loss": 0.7658, + "step": 15569 + }, + { + "epoch": 0.8569541526776377, + "grad_norm": 0.6950936317443848, + "learning_rate": 6.138186619654401e-06, + "loss": 0.8013, + "step": 15570 + }, + { + "epoch": 0.8570091914799934, + "grad_norm": 0.6608526110649109, + "learning_rate": 6.1377645289986e-06, + "loss": 0.8095, + "step": 15571 + }, + { + "epoch": 0.857064230282349, + "grad_norm": 0.6566348075866699, + "learning_rate": 6.137342429791664e-06, + "loss": 0.787, + "step": 15572 + }, + { + "epoch": 0.8571192690847047, + "grad_norm": 0.6536870002746582, + "learning_rate": 6.136920322036768e-06, + "loss": 0.8754, + "step": 15573 + }, + { + "epoch": 0.8571743078870604, + "grad_norm": 0.6461239457130432, + "learning_rate": 6.136498205737081e-06, + "loss": 0.6948, + "step": 15574 + }, + { + "epoch": 0.8572293466894161, + "grad_norm": 0.6441778540611267, + "learning_rate": 6.13607608089578e-06, + "loss": 0.7097, + "step": 15575 + }, + { + "epoch": 0.8572843854917717, + "grad_norm": 0.6770008206367493, + "learning_rate": 6.135653947516034e-06, + "loss": 0.7497, + "step": 15576 + }, + { + "epoch": 0.8573394242941274, + "grad_norm": 0.6479504704475403, + "learning_rate": 6.1352318056010175e-06, + "loss": 0.6975, + "step": 15577 + }, + { + "epoch": 0.857394463096483, + "grad_norm": 0.6586747765541077, + "learning_rate": 6.134809655153901e-06, + "loss": 0.7977, + "step": 15578 + }, + { + "epoch": 0.8574495018988387, + "grad_norm": 0.6888973116874695, + "learning_rate": 6.1343874961778604e-06, + "loss": 0.7399, + "step": 15579 + }, + { + "epoch": 0.8575045407011943, + "grad_norm": 0.6897402405738831, + "learning_rate": 6.133965328676066e-06, + "loss": 0.7507, + "step": 15580 + }, + { + "epoch": 0.85755957950355, + "grad_norm": 0.6857936382293701, + "learning_rate": 6.133543152651693e-06, + "loss": 0.76, + "step": 15581 + }, + { + "epoch": 0.8576146183059057, + "grad_norm": 0.8104296922683716, + "learning_rate": 6.133120968107912e-06, + "loss": 0.711, + "step": 15582 + }, + { + "epoch": 0.8576696571082614, + "grad_norm": 0.786551296710968, + "learning_rate": 6.132698775047897e-06, + "loss": 0.7603, + "step": 15583 + }, + { + "epoch": 0.857724695910617, + "grad_norm": 0.6685918569564819, + "learning_rate": 6.132276573474822e-06, + "loss": 0.6986, + "step": 15584 + }, + { + "epoch": 0.8577797347129726, + "grad_norm": 0.8557218909263611, + "learning_rate": 6.131854363391859e-06, + "loss": 0.795, + "step": 15585 + }, + { + "epoch": 0.8578347735153283, + "grad_norm": 0.6823254823684692, + "learning_rate": 6.1314321448021825e-06, + "loss": 0.7349, + "step": 15586 + }, + { + "epoch": 0.857889812317684, + "grad_norm": 0.772792637348175, + "learning_rate": 6.131009917708965e-06, + "loss": 0.7547, + "step": 15587 + }, + { + "epoch": 0.8579448511200396, + "grad_norm": 0.7231488227844238, + "learning_rate": 6.130587682115379e-06, + "loss": 0.7997, + "step": 15588 + }, + { + "epoch": 0.8579998899223953, + "grad_norm": 0.6683667898178101, + "learning_rate": 6.130165438024598e-06, + "loss": 0.7863, + "step": 15589 + }, + { + "epoch": 0.858054928724751, + "grad_norm": 0.6588496565818787, + "learning_rate": 6.129743185439796e-06, + "loss": 0.7859, + "step": 15590 + }, + { + "epoch": 0.8581099675271067, + "grad_norm": 0.6130164861679077, + "learning_rate": 6.129320924364147e-06, + "loss": 0.7042, + "step": 15591 + }, + { + "epoch": 0.8581650063294622, + "grad_norm": 0.610054612159729, + "learning_rate": 6.128898654800824e-06, + "loss": 0.6645, + "step": 15592 + }, + { + "epoch": 0.8582200451318179, + "grad_norm": 0.6974982023239136, + "learning_rate": 6.128476376753002e-06, + "loss": 0.8146, + "step": 15593 + }, + { + "epoch": 0.8582750839341736, + "grad_norm": 0.7313922047615051, + "learning_rate": 6.128054090223853e-06, + "loss": 0.7055, + "step": 15594 + }, + { + "epoch": 0.8583301227365292, + "grad_norm": 0.7004476189613342, + "learning_rate": 6.12763179521655e-06, + "loss": 0.7848, + "step": 15595 + }, + { + "epoch": 0.8583851615388849, + "grad_norm": 0.6916295289993286, + "learning_rate": 6.127209491734269e-06, + "loss": 0.7711, + "step": 15596 + }, + { + "epoch": 0.8584402003412406, + "grad_norm": 0.648551881313324, + "learning_rate": 6.126787179780185e-06, + "loss": 0.7098, + "step": 15597 + }, + { + "epoch": 0.8584952391435963, + "grad_norm": 0.6482384204864502, + "learning_rate": 6.126364859357469e-06, + "loss": 0.7596, + "step": 15598 + }, + { + "epoch": 0.8585502779459518, + "grad_norm": 0.7109531164169312, + "learning_rate": 6.125942530469297e-06, + "loss": 0.7539, + "step": 15599 + }, + { + "epoch": 0.8586053167483075, + "grad_norm": 0.6109207272529602, + "learning_rate": 6.125520193118841e-06, + "loss": 0.6764, + "step": 15600 + }, + { + "epoch": 0.8586603555506632, + "grad_norm": 0.7050053477287292, + "learning_rate": 6.125097847309277e-06, + "loss": 0.7304, + "step": 15601 + }, + { + "epoch": 0.8587153943530189, + "grad_norm": 0.653078019618988, + "learning_rate": 6.124675493043779e-06, + "loss": 0.6985, + "step": 15602 + }, + { + "epoch": 0.8587704331553745, + "grad_norm": 0.8391665816307068, + "learning_rate": 6.124253130325521e-06, + "loss": 0.8011, + "step": 15603 + }, + { + "epoch": 0.8588254719577302, + "grad_norm": 0.6978835463523865, + "learning_rate": 6.123830759157676e-06, + "loss": 0.6783, + "step": 15604 + }, + { + "epoch": 0.8588805107600859, + "grad_norm": 0.7796862125396729, + "learning_rate": 6.123408379543422e-06, + "loss": 0.7237, + "step": 15605 + }, + { + "epoch": 0.8589355495624416, + "grad_norm": 0.7162224054336548, + "learning_rate": 6.12298599148593e-06, + "loss": 0.8095, + "step": 15606 + }, + { + "epoch": 0.8589905883647971, + "grad_norm": 0.7654495239257812, + "learning_rate": 6.122563594988375e-06, + "loss": 0.7149, + "step": 15607 + }, + { + "epoch": 0.8590456271671528, + "grad_norm": 0.6186618804931641, + "learning_rate": 6.122141190053935e-06, + "loss": 0.6687, + "step": 15608 + }, + { + "epoch": 0.8591006659695085, + "grad_norm": 0.6669701337814331, + "learning_rate": 6.121718776685781e-06, + "loss": 0.7281, + "step": 15609 + }, + { + "epoch": 0.8591557047718642, + "grad_norm": 0.6581971645355225, + "learning_rate": 6.121296354887089e-06, + "loss": 0.7158, + "step": 15610 + }, + { + "epoch": 0.8592107435742198, + "grad_norm": 0.698243260383606, + "learning_rate": 6.120873924661034e-06, + "loss": 0.7894, + "step": 15611 + }, + { + "epoch": 0.8592657823765755, + "grad_norm": 0.6746723651885986, + "learning_rate": 6.120451486010791e-06, + "loss": 0.7993, + "step": 15612 + }, + { + "epoch": 0.8593208211789312, + "grad_norm": 0.727219820022583, + "learning_rate": 6.1200290389395335e-06, + "loss": 0.8446, + "step": 15613 + }, + { + "epoch": 0.8593758599812868, + "grad_norm": 0.7818809151649475, + "learning_rate": 6.119606583450438e-06, + "loss": 0.7167, + "step": 15614 + }, + { + "epoch": 0.8594308987836424, + "grad_norm": 0.692720890045166, + "learning_rate": 6.119184119546679e-06, + "loss": 0.705, + "step": 15615 + }, + { + "epoch": 0.8594859375859981, + "grad_norm": 0.6671997308731079, + "learning_rate": 6.1187616472314315e-06, + "loss": 0.8383, + "step": 15616 + }, + { + "epoch": 0.8595409763883538, + "grad_norm": 0.8043667674064636, + "learning_rate": 6.118339166507872e-06, + "loss": 0.6775, + "step": 15617 + }, + { + "epoch": 0.8595960151907095, + "grad_norm": 0.6313692927360535, + "learning_rate": 6.117916677379173e-06, + "loss": 0.6327, + "step": 15618 + }, + { + "epoch": 0.8596510539930651, + "grad_norm": 0.6770568490028381, + "learning_rate": 6.117494179848512e-06, + "loss": 0.741, + "step": 15619 + }, + { + "epoch": 0.8597060927954208, + "grad_norm": 0.6715630292892456, + "learning_rate": 6.117071673919064e-06, + "loss": 0.7105, + "step": 15620 + }, + { + "epoch": 0.8597611315977765, + "grad_norm": 0.618145763874054, + "learning_rate": 6.116649159594006e-06, + "loss": 0.7316, + "step": 15621 + }, + { + "epoch": 0.8598161704001321, + "grad_norm": 0.7127259969711304, + "learning_rate": 6.11622663687651e-06, + "loss": 0.7736, + "step": 15622 + }, + { + "epoch": 0.8598712092024877, + "grad_norm": 0.6675243377685547, + "learning_rate": 6.115804105769754e-06, + "loss": 0.6747, + "step": 15623 + }, + { + "epoch": 0.8599262480048434, + "grad_norm": 0.7965354323387146, + "learning_rate": 6.115381566276912e-06, + "loss": 0.7524, + "step": 15624 + }, + { + "epoch": 0.8599812868071991, + "grad_norm": 0.5921181440353394, + "learning_rate": 6.114959018401163e-06, + "loss": 0.679, + "step": 15625 + }, + { + "epoch": 0.8600363256095548, + "grad_norm": 0.635802149772644, + "learning_rate": 6.1145364621456795e-06, + "loss": 0.699, + "step": 15626 + }, + { + "epoch": 0.8600913644119104, + "grad_norm": 0.7159842252731323, + "learning_rate": 6.114113897513636e-06, + "loss": 0.7112, + "step": 15627 + }, + { + "epoch": 0.860146403214266, + "grad_norm": 0.7100176215171814, + "learning_rate": 6.113691324508213e-06, + "loss": 0.7459, + "step": 15628 + }, + { + "epoch": 0.8602014420166217, + "grad_norm": 0.6484093070030212, + "learning_rate": 6.113268743132583e-06, + "loss": 0.6779, + "step": 15629 + }, + { + "epoch": 0.8602564808189774, + "grad_norm": 0.6825945377349854, + "learning_rate": 6.112846153389924e-06, + "loss": 0.7607, + "step": 15630 + }, + { + "epoch": 0.860311519621333, + "grad_norm": 0.7553657293319702, + "learning_rate": 6.112423555283411e-06, + "loss": 0.6945, + "step": 15631 + }, + { + "epoch": 0.8603665584236887, + "grad_norm": 0.7892605662345886, + "learning_rate": 6.11200094881622e-06, + "loss": 0.767, + "step": 15632 + }, + { + "epoch": 0.8604215972260444, + "grad_norm": 0.6485433578491211, + "learning_rate": 6.111578333991528e-06, + "loss": 0.7302, + "step": 15633 + }, + { + "epoch": 0.8604766360284001, + "grad_norm": 0.6713895201683044, + "learning_rate": 6.111155710812511e-06, + "loss": 0.774, + "step": 15634 + }, + { + "epoch": 0.8605316748307557, + "grad_norm": 0.9890132546424866, + "learning_rate": 6.110733079282345e-06, + "loss": 0.7549, + "step": 15635 + }, + { + "epoch": 0.8605867136331113, + "grad_norm": 0.6421818137168884, + "learning_rate": 6.110310439404206e-06, + "loss": 0.7004, + "step": 15636 + }, + { + "epoch": 0.860641752435467, + "grad_norm": 0.6384093165397644, + "learning_rate": 6.109887791181272e-06, + "loss": 0.7465, + "step": 15637 + }, + { + "epoch": 0.8606967912378226, + "grad_norm": 0.7991462349891663, + "learning_rate": 6.109465134616717e-06, + "loss": 0.8041, + "step": 15638 + }, + { + "epoch": 0.8607518300401783, + "grad_norm": 0.661189615726471, + "learning_rate": 6.1090424697137185e-06, + "loss": 0.7717, + "step": 15639 + }, + { + "epoch": 0.860806868842534, + "grad_norm": 0.6952805519104004, + "learning_rate": 6.108619796475455e-06, + "loss": 0.7149, + "step": 15640 + }, + { + "epoch": 0.8608619076448897, + "grad_norm": 0.7330671548843384, + "learning_rate": 6.108197114905102e-06, + "loss": 0.7229, + "step": 15641 + }, + { + "epoch": 0.8609169464472453, + "grad_norm": 0.6831181049346924, + "learning_rate": 6.107774425005836e-06, + "loss": 0.6937, + "step": 15642 + }, + { + "epoch": 0.860971985249601, + "grad_norm": 0.7261425852775574, + "learning_rate": 6.107351726780833e-06, + "loss": 0.7963, + "step": 15643 + }, + { + "epoch": 0.8610270240519566, + "grad_norm": 0.6796271800994873, + "learning_rate": 6.106929020233272e-06, + "loss": 0.7785, + "step": 15644 + }, + { + "epoch": 0.8610820628543123, + "grad_norm": 0.6772015690803528, + "learning_rate": 6.106506305366328e-06, + "loss": 0.7732, + "step": 15645 + }, + { + "epoch": 0.8611371016566679, + "grad_norm": 0.6153992414474487, + "learning_rate": 6.10608358218318e-06, + "loss": 0.6361, + "step": 15646 + }, + { + "epoch": 0.8611921404590236, + "grad_norm": 0.9580141305923462, + "learning_rate": 6.105660850687003e-06, + "loss": 0.7178, + "step": 15647 + }, + { + "epoch": 0.8612471792613793, + "grad_norm": 0.8536281585693359, + "learning_rate": 6.105238110880975e-06, + "loss": 0.6689, + "step": 15648 + }, + { + "epoch": 0.861302218063735, + "grad_norm": 0.6578275561332703, + "learning_rate": 6.104815362768274e-06, + "loss": 0.7474, + "step": 15649 + }, + { + "epoch": 0.8613572568660905, + "grad_norm": 0.7298864126205444, + "learning_rate": 6.104392606352075e-06, + "loss": 0.7134, + "step": 15650 + }, + { + "epoch": 0.8614122956684462, + "grad_norm": 1.0637938976287842, + "learning_rate": 6.103969841635557e-06, + "loss": 0.8614, + "step": 15651 + }, + { + "epoch": 0.8614673344708019, + "grad_norm": 0.6678902506828308, + "learning_rate": 6.103547068621898e-06, + "loss": 0.6736, + "step": 15652 + }, + { + "epoch": 0.8615223732731576, + "grad_norm": 0.9442873001098633, + "learning_rate": 6.103124287314275e-06, + "loss": 0.8002, + "step": 15653 + }, + { + "epoch": 0.8615774120755132, + "grad_norm": 0.7156786322593689, + "learning_rate": 6.102701497715864e-06, + "loss": 0.8764, + "step": 15654 + }, + { + "epoch": 0.8616324508778689, + "grad_norm": 0.7954290509223938, + "learning_rate": 6.102278699829843e-06, + "loss": 0.7308, + "step": 15655 + }, + { + "epoch": 0.8616874896802246, + "grad_norm": 0.7544524073600769, + "learning_rate": 6.101855893659392e-06, + "loss": 0.7064, + "step": 15656 + }, + { + "epoch": 0.8617425284825803, + "grad_norm": 0.652656078338623, + "learning_rate": 6.101433079207687e-06, + "loss": 0.7264, + "step": 15657 + }, + { + "epoch": 0.8617975672849358, + "grad_norm": 0.6478716135025024, + "learning_rate": 6.101010256477906e-06, + "loss": 0.6265, + "step": 15658 + }, + { + "epoch": 0.8618526060872915, + "grad_norm": 0.5916007161140442, + "learning_rate": 6.1005874254732256e-06, + "loss": 0.6485, + "step": 15659 + }, + { + "epoch": 0.8619076448896472, + "grad_norm": 0.7353591322898865, + "learning_rate": 6.1001645861968264e-06, + "loss": 0.759, + "step": 15660 + }, + { + "epoch": 0.8619626836920029, + "grad_norm": 0.7352280020713806, + "learning_rate": 6.099741738651883e-06, + "loss": 0.7392, + "step": 15661 + }, + { + "epoch": 0.8620177224943585, + "grad_norm": 0.6027048230171204, + "learning_rate": 6.099318882841576e-06, + "loss": 0.7039, + "step": 15662 + }, + { + "epoch": 0.8620727612967142, + "grad_norm": 0.6907329559326172, + "learning_rate": 6.09889601876908e-06, + "loss": 0.8164, + "step": 15663 + }, + { + "epoch": 0.8621278000990699, + "grad_norm": 0.7133687138557434, + "learning_rate": 6.098473146437579e-06, + "loss": 0.791, + "step": 15664 + }, + { + "epoch": 0.8621828389014256, + "grad_norm": 0.7229570746421814, + "learning_rate": 6.098050265850246e-06, + "loss": 0.7839, + "step": 15665 + }, + { + "epoch": 0.8622378777037811, + "grad_norm": 0.7066009640693665, + "learning_rate": 6.097627377010262e-06, + "loss": 0.6816, + "step": 15666 + }, + { + "epoch": 0.8622929165061368, + "grad_norm": 0.7801152467727661, + "learning_rate": 6.097204479920804e-06, + "loss": 0.7402, + "step": 15667 + }, + { + "epoch": 0.8623479553084925, + "grad_norm": 0.6149227023124695, + "learning_rate": 6.096781574585051e-06, + "loss": 0.6964, + "step": 15668 + }, + { + "epoch": 0.8624029941108482, + "grad_norm": 0.6978023648262024, + "learning_rate": 6.096358661006181e-06, + "loss": 0.6982, + "step": 15669 + }, + { + "epoch": 0.8624580329132038, + "grad_norm": 0.6561335325241089, + "learning_rate": 6.095935739187373e-06, + "loss": 0.74, + "step": 15670 + }, + { + "epoch": 0.8625130717155595, + "grad_norm": 0.7627743482589722, + "learning_rate": 6.0955128091318065e-06, + "loss": 0.6886, + "step": 15671 + }, + { + "epoch": 0.8625681105179152, + "grad_norm": 0.919551432132721, + "learning_rate": 6.095089870842657e-06, + "loss": 0.7453, + "step": 15672 + }, + { + "epoch": 0.8626231493202708, + "grad_norm": 0.732641875743866, + "learning_rate": 6.094666924323107e-06, + "loss": 0.7502, + "step": 15673 + }, + { + "epoch": 0.8626781881226264, + "grad_norm": 0.7035035490989685, + "learning_rate": 6.094243969576332e-06, + "loss": 0.7825, + "step": 15674 + }, + { + "epoch": 0.8627332269249821, + "grad_norm": 0.648766279220581, + "learning_rate": 6.093821006605513e-06, + "loss": 0.6771, + "step": 15675 + }, + { + "epoch": 0.8627882657273378, + "grad_norm": 0.6031193137168884, + "learning_rate": 6.093398035413828e-06, + "loss": 0.6246, + "step": 15676 + }, + { + "epoch": 0.8628433045296935, + "grad_norm": 0.76170414686203, + "learning_rate": 6.0929750560044555e-06, + "loss": 0.713, + "step": 15677 + }, + { + "epoch": 0.8628983433320491, + "grad_norm": 1.023805022239685, + "learning_rate": 6.092552068380575e-06, + "loss": 0.6824, + "step": 15678 + }, + { + "epoch": 0.8629533821344048, + "grad_norm": 0.7333651185035706, + "learning_rate": 6.092129072545366e-06, + "loss": 0.7213, + "step": 15679 + }, + { + "epoch": 0.8630084209367604, + "grad_norm": 0.6620833873748779, + "learning_rate": 6.091706068502007e-06, + "loss": 0.7436, + "step": 15680 + }, + { + "epoch": 0.863063459739116, + "grad_norm": 0.5971367359161377, + "learning_rate": 6.091283056253679e-06, + "loss": 0.6774, + "step": 15681 + }, + { + "epoch": 0.8631184985414717, + "grad_norm": 0.6435208320617676, + "learning_rate": 6.090860035803558e-06, + "loss": 0.7556, + "step": 15682 + }, + { + "epoch": 0.8631735373438274, + "grad_norm": 0.6666582822799683, + "learning_rate": 6.090437007154824e-06, + "loss": 0.7533, + "step": 15683 + }, + { + "epoch": 0.8632285761461831, + "grad_norm": 0.665928840637207, + "learning_rate": 6.09001397031066e-06, + "loss": 0.7325, + "step": 15684 + }, + { + "epoch": 0.8632836149485387, + "grad_norm": 0.6638591885566711, + "learning_rate": 6.0895909252742414e-06, + "loss": 0.8256, + "step": 15685 + }, + { + "epoch": 0.8633386537508944, + "grad_norm": 0.6556721925735474, + "learning_rate": 6.089167872048749e-06, + "loss": 0.728, + "step": 15686 + }, + { + "epoch": 0.86339369255325, + "grad_norm": 0.6327305436134338, + "learning_rate": 6.088744810637361e-06, + "loss": 0.7584, + "step": 15687 + }, + { + "epoch": 0.8634487313556057, + "grad_norm": 0.676216185092926, + "learning_rate": 6.088321741043262e-06, + "loss": 0.7868, + "step": 15688 + }, + { + "epoch": 0.8635037701579613, + "grad_norm": 0.646700918674469, + "learning_rate": 6.0878986632696255e-06, + "loss": 0.7248, + "step": 15689 + }, + { + "epoch": 0.863558808960317, + "grad_norm": 0.6748735308647156, + "learning_rate": 6.087475577319635e-06, + "loss": 0.7657, + "step": 15690 + }, + { + "epoch": 0.8636138477626727, + "grad_norm": 0.6363335251808167, + "learning_rate": 6.087052483196467e-06, + "loss": 0.7273, + "step": 15691 + }, + { + "epoch": 0.8636688865650284, + "grad_norm": 0.6166467666625977, + "learning_rate": 6.086629380903305e-06, + "loss": 0.6642, + "step": 15692 + }, + { + "epoch": 0.863723925367384, + "grad_norm": 1.3258485794067383, + "learning_rate": 6.086206270443328e-06, + "loss": 0.7227, + "step": 15693 + }, + { + "epoch": 0.8637789641697396, + "grad_norm": 0.8923795223236084, + "learning_rate": 6.085783151819716e-06, + "loss": 0.7513, + "step": 15694 + }, + { + "epoch": 0.8638340029720953, + "grad_norm": 0.7227154970169067, + "learning_rate": 6.085360025035647e-06, + "loss": 0.7078, + "step": 15695 + }, + { + "epoch": 0.863889041774451, + "grad_norm": 0.6465400457382202, + "learning_rate": 6.084936890094303e-06, + "loss": 0.7541, + "step": 15696 + }, + { + "epoch": 0.8639440805768066, + "grad_norm": 0.6628104448318481, + "learning_rate": 6.084513746998865e-06, + "loss": 0.7121, + "step": 15697 + }, + { + "epoch": 0.8639991193791623, + "grad_norm": 0.6723392605781555, + "learning_rate": 6.08409059575251e-06, + "loss": 0.7086, + "step": 15698 + }, + { + "epoch": 0.864054158181518, + "grad_norm": 0.7443264126777649, + "learning_rate": 6.08366743635842e-06, + "loss": 0.7098, + "step": 15699 + }, + { + "epoch": 0.8641091969838737, + "grad_norm": 0.7792028188705444, + "learning_rate": 6.083244268819777e-06, + "loss": 0.8472, + "step": 15700 + }, + { + "epoch": 0.8641642357862293, + "grad_norm": 0.7211549878120422, + "learning_rate": 6.08282109313976e-06, + "loss": 0.7917, + "step": 15701 + }, + { + "epoch": 0.8642192745885849, + "grad_norm": 0.6670592427253723, + "learning_rate": 6.082397909321549e-06, + "loss": 0.7758, + "step": 15702 + }, + { + "epoch": 0.8642743133909406, + "grad_norm": 0.8279144167900085, + "learning_rate": 6.0819747173683255e-06, + "loss": 0.7355, + "step": 15703 + }, + { + "epoch": 0.8643293521932963, + "grad_norm": 0.7409362196922302, + "learning_rate": 6.081551517283269e-06, + "loss": 0.7283, + "step": 15704 + }, + { + "epoch": 0.8643843909956519, + "grad_norm": 0.6700742840766907, + "learning_rate": 6.081128309069562e-06, + "loss": 0.6555, + "step": 15705 + }, + { + "epoch": 0.8644394297980076, + "grad_norm": 0.7364388108253479, + "learning_rate": 6.080705092730383e-06, + "loss": 0.6652, + "step": 15706 + }, + { + "epoch": 0.8644944686003633, + "grad_norm": 0.778404176235199, + "learning_rate": 6.080281868268913e-06, + "loss": 0.7774, + "step": 15707 + }, + { + "epoch": 0.864549507402719, + "grad_norm": 0.6663825511932373, + "learning_rate": 6.079858635688336e-06, + "loss": 0.7665, + "step": 15708 + }, + { + "epoch": 0.8646045462050745, + "grad_norm": 0.7061408758163452, + "learning_rate": 6.079435394991829e-06, + "loss": 0.7824, + "step": 15709 + }, + { + "epoch": 0.8646595850074302, + "grad_norm": 0.6537507176399231, + "learning_rate": 6.079012146182576e-06, + "loss": 0.7313, + "step": 15710 + }, + { + "epoch": 0.8647146238097859, + "grad_norm": 0.6154575943946838, + "learning_rate": 6.078588889263754e-06, + "loss": 0.7066, + "step": 15711 + }, + { + "epoch": 0.8647696626121416, + "grad_norm": 0.659093976020813, + "learning_rate": 6.078165624238548e-06, + "loss": 0.7495, + "step": 15712 + }, + { + "epoch": 0.8648247014144972, + "grad_norm": 0.677669107913971, + "learning_rate": 6.077742351110138e-06, + "loss": 0.7072, + "step": 15713 + }, + { + "epoch": 0.8648797402168529, + "grad_norm": 0.7204097509384155, + "learning_rate": 6.077319069881705e-06, + "loss": 0.7181, + "step": 15714 + }, + { + "epoch": 0.8649347790192086, + "grad_norm": 0.6903330683708191, + "learning_rate": 6.076895780556429e-06, + "loss": 0.8565, + "step": 15715 + }, + { + "epoch": 0.8649898178215643, + "grad_norm": 0.8147342205047607, + "learning_rate": 6.076472483137493e-06, + "loss": 0.6916, + "step": 15716 + }, + { + "epoch": 0.8650448566239198, + "grad_norm": 0.7021569013595581, + "learning_rate": 6.076049177628079e-06, + "loss": 0.6893, + "step": 15717 + }, + { + "epoch": 0.8650998954262755, + "grad_norm": 0.6534682512283325, + "learning_rate": 6.075625864031368e-06, + "loss": 0.6313, + "step": 15718 + }, + { + "epoch": 0.8651549342286312, + "grad_norm": 0.7883698344230652, + "learning_rate": 6.07520254235054e-06, + "loss": 0.7204, + "step": 15719 + }, + { + "epoch": 0.8652099730309869, + "grad_norm": 0.6255857944488525, + "learning_rate": 6.074779212588777e-06, + "loss": 0.7137, + "step": 15720 + }, + { + "epoch": 0.8652650118333425, + "grad_norm": 0.7278919816017151, + "learning_rate": 6.074355874749261e-06, + "loss": 0.8003, + "step": 15721 + }, + { + "epoch": 0.8653200506356982, + "grad_norm": 0.7809221744537354, + "learning_rate": 6.073932528835176e-06, + "loss": 0.8652, + "step": 15722 + }, + { + "epoch": 0.8653750894380539, + "grad_norm": 0.6781452894210815, + "learning_rate": 6.0735091748496985e-06, + "loss": 0.6111, + "step": 15723 + }, + { + "epoch": 0.8654301282404094, + "grad_norm": 0.6400741934776306, + "learning_rate": 6.073085812796015e-06, + "loss": 0.7126, + "step": 15724 + }, + { + "epoch": 0.8654851670427651, + "grad_norm": 0.6753132343292236, + "learning_rate": 6.072662442677305e-06, + "loss": 0.7389, + "step": 15725 + }, + { + "epoch": 0.8655402058451208, + "grad_norm": 0.6688135266304016, + "learning_rate": 6.072239064496752e-06, + "loss": 0.6356, + "step": 15726 + }, + { + "epoch": 0.8655952446474765, + "grad_norm": 0.664271354675293, + "learning_rate": 6.0718156782575365e-06, + "loss": 0.6949, + "step": 15727 + }, + { + "epoch": 0.8656502834498321, + "grad_norm": 0.6760862469673157, + "learning_rate": 6.071392283962843e-06, + "loss": 0.7279, + "step": 15728 + }, + { + "epoch": 0.8657053222521878, + "grad_norm": 0.6911706924438477, + "learning_rate": 6.07096888161585e-06, + "loss": 0.8132, + "step": 15729 + }, + { + "epoch": 0.8657603610545435, + "grad_norm": 0.7274359464645386, + "learning_rate": 6.070545471219743e-06, + "loss": 0.7894, + "step": 15730 + }, + { + "epoch": 0.8658153998568991, + "grad_norm": 0.7742472290992737, + "learning_rate": 6.070122052777703e-06, + "loss": 0.8057, + "step": 15731 + }, + { + "epoch": 0.8658704386592547, + "grad_norm": 0.8446773290634155, + "learning_rate": 6.06969862629291e-06, + "loss": 0.7816, + "step": 15732 + }, + { + "epoch": 0.8659254774616104, + "grad_norm": 0.669518232345581, + "learning_rate": 6.06927519176855e-06, + "loss": 0.696, + "step": 15733 + }, + { + "epoch": 0.8659805162639661, + "grad_norm": 0.6845564842224121, + "learning_rate": 6.068851749207803e-06, + "loss": 0.7486, + "step": 15734 + }, + { + "epoch": 0.8660355550663218, + "grad_norm": 0.6650436520576477, + "learning_rate": 6.068428298613853e-06, + "loss": 0.7215, + "step": 15735 + }, + { + "epoch": 0.8660905938686774, + "grad_norm": 0.67397540807724, + "learning_rate": 6.068004839989881e-06, + "loss": 0.7458, + "step": 15736 + }, + { + "epoch": 0.8661456326710331, + "grad_norm": 0.7140672206878662, + "learning_rate": 6.067581373339072e-06, + "loss": 0.8213, + "step": 15737 + }, + { + "epoch": 0.8662006714733888, + "grad_norm": 0.8632931113243103, + "learning_rate": 6.067157898664606e-06, + "loss": 0.8109, + "step": 15738 + }, + { + "epoch": 0.8662557102757444, + "grad_norm": 0.6106804013252258, + "learning_rate": 6.066734415969669e-06, + "loss": 0.7183, + "step": 15739 + }, + { + "epoch": 0.8663107490781, + "grad_norm": 0.8055095672607422, + "learning_rate": 6.066310925257438e-06, + "loss": 0.7871, + "step": 15740 + }, + { + "epoch": 0.8663657878804557, + "grad_norm": 0.6310189366340637, + "learning_rate": 6.065887426531102e-06, + "loss": 0.5873, + "step": 15741 + }, + { + "epoch": 0.8664208266828114, + "grad_norm": 0.6704412698745728, + "learning_rate": 6.065463919793842e-06, + "loss": 0.6838, + "step": 15742 + }, + { + "epoch": 0.8664758654851671, + "grad_norm": 0.6292148232460022, + "learning_rate": 6.06504040504884e-06, + "loss": 0.6886, + "step": 15743 + }, + { + "epoch": 0.8665309042875227, + "grad_norm": 0.8556584715843201, + "learning_rate": 6.064616882299277e-06, + "loss": 0.8967, + "step": 15744 + }, + { + "epoch": 0.8665859430898784, + "grad_norm": 0.6956119537353516, + "learning_rate": 6.064193351548341e-06, + "loss": 0.7444, + "step": 15745 + }, + { + "epoch": 0.866640981892234, + "grad_norm": 1.01414954662323, + "learning_rate": 6.063769812799212e-06, + "loss": 0.9216, + "step": 15746 + }, + { + "epoch": 0.8666960206945897, + "grad_norm": 0.6685424447059631, + "learning_rate": 6.063346266055073e-06, + "loss": 0.6795, + "step": 15747 + }, + { + "epoch": 0.8667510594969453, + "grad_norm": 0.6735886335372925, + "learning_rate": 6.062922711319108e-06, + "loss": 0.6805, + "step": 15748 + }, + { + "epoch": 0.866806098299301, + "grad_norm": 0.6536576747894287, + "learning_rate": 6.062499148594502e-06, + "loss": 0.6575, + "step": 15749 + }, + { + "epoch": 0.8668611371016567, + "grad_norm": 0.6739212870597839, + "learning_rate": 6.062075577884437e-06, + "loss": 0.6704, + "step": 15750 + }, + { + "epoch": 0.8669161759040124, + "grad_norm": 0.73397296667099, + "learning_rate": 6.061651999192094e-06, + "loss": 0.7892, + "step": 15751 + }, + { + "epoch": 0.866971214706368, + "grad_norm": 0.7974724769592285, + "learning_rate": 6.06122841252066e-06, + "loss": 0.7133, + "step": 15752 + }, + { + "epoch": 0.8670262535087236, + "grad_norm": 0.6199150681495667, + "learning_rate": 6.060804817873317e-06, + "loss": 0.765, + "step": 15753 + }, + { + "epoch": 0.8670812923110793, + "grad_norm": 0.709783673286438, + "learning_rate": 6.060381215253251e-06, + "loss": 0.7332, + "step": 15754 + }, + { + "epoch": 0.867136331113435, + "grad_norm": 0.6947084069252014, + "learning_rate": 6.059957604663642e-06, + "loss": 0.8224, + "step": 15755 + }, + { + "epoch": 0.8671913699157906, + "grad_norm": 0.9439684152603149, + "learning_rate": 6.059533986107674e-06, + "loss": 0.8347, + "step": 15756 + }, + { + "epoch": 0.8672464087181463, + "grad_norm": 0.806992769241333, + "learning_rate": 6.059110359588534e-06, + "loss": 0.8055, + "step": 15757 + }, + { + "epoch": 0.867301447520502, + "grad_norm": 0.659092128276825, + "learning_rate": 6.058686725109404e-06, + "loss": 0.6972, + "step": 15758 + }, + { + "epoch": 0.8673564863228577, + "grad_norm": 0.7345813512802124, + "learning_rate": 6.058263082673468e-06, + "loss": 0.8044, + "step": 15759 + }, + { + "epoch": 0.8674115251252132, + "grad_norm": 0.7216777205467224, + "learning_rate": 6.057839432283908e-06, + "loss": 0.7816, + "step": 15760 + }, + { + "epoch": 0.8674665639275689, + "grad_norm": 0.6828186511993408, + "learning_rate": 6.0574157739439125e-06, + "loss": 0.7534, + "step": 15761 + }, + { + "epoch": 0.8675216027299246, + "grad_norm": 0.7324418425559998, + "learning_rate": 6.0569921076566615e-06, + "loss": 0.7476, + "step": 15762 + }, + { + "epoch": 0.8675766415322803, + "grad_norm": 0.5894229412078857, + "learning_rate": 6.056568433425342e-06, + "loss": 0.6667, + "step": 15763 + }, + { + "epoch": 0.8676316803346359, + "grad_norm": 0.6743035912513733, + "learning_rate": 6.056144751253135e-06, + "loss": 0.6765, + "step": 15764 + }, + { + "epoch": 0.8676867191369916, + "grad_norm": 0.6885803937911987, + "learning_rate": 6.055721061143229e-06, + "loss": 0.6954, + "step": 15765 + }, + { + "epoch": 0.8677417579393473, + "grad_norm": 0.6543543338775635, + "learning_rate": 6.055297363098806e-06, + "loss": 0.6277, + "step": 15766 + }, + { + "epoch": 0.8677967967417028, + "grad_norm": 0.7671917080879211, + "learning_rate": 6.054873657123049e-06, + "loss": 0.7575, + "step": 15767 + }, + { + "epoch": 0.8678518355440585, + "grad_norm": 0.7491669654846191, + "learning_rate": 6.054449943219144e-06, + "loss": 0.727, + "step": 15768 + }, + { + "epoch": 0.8679068743464142, + "grad_norm": 0.7161419987678528, + "learning_rate": 6.0540262213902765e-06, + "loss": 0.7381, + "step": 15769 + }, + { + "epoch": 0.8679619131487699, + "grad_norm": 0.7061475515365601, + "learning_rate": 6.05360249163963e-06, + "loss": 0.7831, + "step": 15770 + }, + { + "epoch": 0.8680169519511255, + "grad_norm": 0.7481213212013245, + "learning_rate": 6.053178753970389e-06, + "loss": 0.7235, + "step": 15771 + }, + { + "epoch": 0.8680719907534812, + "grad_norm": 0.6475214958190918, + "learning_rate": 6.052755008385736e-06, + "loss": 0.6864, + "step": 15772 + }, + { + "epoch": 0.8681270295558369, + "grad_norm": 0.7365770936012268, + "learning_rate": 6.052331254888862e-06, + "loss": 0.7746, + "step": 15773 + }, + { + "epoch": 0.8681820683581926, + "grad_norm": 0.6339132785797119, + "learning_rate": 6.0519074934829456e-06, + "loss": 0.7102, + "step": 15774 + }, + { + "epoch": 0.8682371071605481, + "grad_norm": 0.691531240940094, + "learning_rate": 6.0514837241711754e-06, + "loss": 0.7896, + "step": 15775 + }, + { + "epoch": 0.8682921459629038, + "grad_norm": 0.6793948411941528, + "learning_rate": 6.051059946956734e-06, + "loss": 0.6514, + "step": 15776 + }, + { + "epoch": 0.8683471847652595, + "grad_norm": 0.6301077008247375, + "learning_rate": 6.050636161842809e-06, + "loss": 0.6831, + "step": 15777 + }, + { + "epoch": 0.8684022235676152, + "grad_norm": 0.7680420875549316, + "learning_rate": 6.0502123688325835e-06, + "loss": 0.7504, + "step": 15778 + }, + { + "epoch": 0.8684572623699708, + "grad_norm": 0.6260972619056702, + "learning_rate": 6.0497885679292415e-06, + "loss": 0.7066, + "step": 15779 + }, + { + "epoch": 0.8685123011723265, + "grad_norm": 0.663060188293457, + "learning_rate": 6.04936475913597e-06, + "loss": 0.6634, + "step": 15780 + }, + { + "epoch": 0.8685673399746822, + "grad_norm": 0.6798335313796997, + "learning_rate": 6.048940942455954e-06, + "loss": 0.7055, + "step": 15781 + }, + { + "epoch": 0.8686223787770379, + "grad_norm": 0.7080284953117371, + "learning_rate": 6.048517117892379e-06, + "loss": 0.7606, + "step": 15782 + }, + { + "epoch": 0.8686774175793934, + "grad_norm": 0.67658931016922, + "learning_rate": 6.04809328544843e-06, + "loss": 0.7656, + "step": 15783 + }, + { + "epoch": 0.8687324563817491, + "grad_norm": 0.6667472720146179, + "learning_rate": 6.047669445127291e-06, + "loss": 0.7275, + "step": 15784 + }, + { + "epoch": 0.8687874951841048, + "grad_norm": 0.782096266746521, + "learning_rate": 6.04724559693215e-06, + "loss": 0.7524, + "step": 15785 + }, + { + "epoch": 0.8688425339864605, + "grad_norm": 0.7733443379402161, + "learning_rate": 6.046821740866192e-06, + "loss": 0.7022, + "step": 15786 + }, + { + "epoch": 0.8688975727888161, + "grad_norm": 0.6487871408462524, + "learning_rate": 6.046397876932602e-06, + "loss": 0.7077, + "step": 15787 + }, + { + "epoch": 0.8689526115911718, + "grad_norm": 0.6294482350349426, + "learning_rate": 6.045974005134564e-06, + "loss": 0.6974, + "step": 15788 + }, + { + "epoch": 0.8690076503935275, + "grad_norm": 0.6573933362960815, + "learning_rate": 6.045550125475268e-06, + "loss": 0.735, + "step": 15789 + }, + { + "epoch": 0.8690626891958831, + "grad_norm": 0.6794875264167786, + "learning_rate": 6.045126237957895e-06, + "loss": 0.7511, + "step": 15790 + }, + { + "epoch": 0.8691177279982387, + "grad_norm": 0.687599778175354, + "learning_rate": 6.0447023425856345e-06, + "loss": 0.7164, + "step": 15791 + }, + { + "epoch": 0.8691727668005944, + "grad_norm": 0.6593008637428284, + "learning_rate": 6.04427843936167e-06, + "loss": 0.688, + "step": 15792 + }, + { + "epoch": 0.8692278056029501, + "grad_norm": 0.7226807475090027, + "learning_rate": 6.043854528289188e-06, + "loss": 0.7364, + "step": 15793 + }, + { + "epoch": 0.8692828444053058, + "grad_norm": 0.603318452835083, + "learning_rate": 6.043430609371375e-06, + "loss": 0.6933, + "step": 15794 + }, + { + "epoch": 0.8693378832076614, + "grad_norm": 0.8227141499519348, + "learning_rate": 6.043006682611416e-06, + "loss": 0.7039, + "step": 15795 + }, + { + "epoch": 0.869392922010017, + "grad_norm": 0.729284405708313, + "learning_rate": 6.042582748012499e-06, + "loss": 0.7288, + "step": 15796 + }, + { + "epoch": 0.8694479608123727, + "grad_norm": 0.8269371390342712, + "learning_rate": 6.042158805577809e-06, + "loss": 0.7419, + "step": 15797 + }, + { + "epoch": 0.8695029996147284, + "grad_norm": 0.6699450016021729, + "learning_rate": 6.0417348553105325e-06, + "loss": 0.7893, + "step": 15798 + }, + { + "epoch": 0.869558038417084, + "grad_norm": 0.7747042775154114, + "learning_rate": 6.041310897213856e-06, + "loss": 0.791, + "step": 15799 + }, + { + "epoch": 0.8696130772194397, + "grad_norm": 0.7503781318664551, + "learning_rate": 6.0408869312909645e-06, + "loss": 0.7204, + "step": 15800 + }, + { + "epoch": 0.8696681160217954, + "grad_norm": 0.6733731627464294, + "learning_rate": 6.0404629575450464e-06, + "loss": 0.815, + "step": 15801 + }, + { + "epoch": 0.8697231548241511, + "grad_norm": 0.6925041079521179, + "learning_rate": 6.040038975979288e-06, + "loss": 0.8096, + "step": 15802 + }, + { + "epoch": 0.8697781936265067, + "grad_norm": 0.7510724067687988, + "learning_rate": 6.039614986596873e-06, + "loss": 0.7957, + "step": 15803 + }, + { + "epoch": 0.8698332324288623, + "grad_norm": 0.9631650447845459, + "learning_rate": 6.039190989400991e-06, + "loss": 0.7574, + "step": 15804 + }, + { + "epoch": 0.869888271231218, + "grad_norm": 0.7080852389335632, + "learning_rate": 6.0387669843948285e-06, + "loss": 0.7037, + "step": 15805 + }, + { + "epoch": 0.8699433100335737, + "grad_norm": 0.723419725894928, + "learning_rate": 6.03834297158157e-06, + "loss": 0.7424, + "step": 15806 + }, + { + "epoch": 0.8699983488359293, + "grad_norm": 0.6093000173568726, + "learning_rate": 6.037918950964404e-06, + "loss": 0.6754, + "step": 15807 + }, + { + "epoch": 0.870053387638285, + "grad_norm": 0.7614741921424866, + "learning_rate": 6.037494922546518e-06, + "loss": 0.6856, + "step": 15808 + }, + { + "epoch": 0.8701084264406407, + "grad_norm": 0.6535844802856445, + "learning_rate": 6.0370708863310965e-06, + "loss": 0.8201, + "step": 15809 + }, + { + "epoch": 0.8701634652429963, + "grad_norm": 0.724897027015686, + "learning_rate": 6.036646842321329e-06, + "loss": 0.7399, + "step": 15810 + }, + { + "epoch": 0.870218504045352, + "grad_norm": 0.7602331638336182, + "learning_rate": 6.036222790520401e-06, + "loss": 0.8233, + "step": 15811 + }, + { + "epoch": 0.8702735428477076, + "grad_norm": 0.7890536189079285, + "learning_rate": 6.035798730931498e-06, + "loss": 0.8473, + "step": 15812 + }, + { + "epoch": 0.8703285816500633, + "grad_norm": 0.7241165637969971, + "learning_rate": 6.035374663557813e-06, + "loss": 0.7298, + "step": 15813 + }, + { + "epoch": 0.8703836204524189, + "grad_norm": 0.6661847829818726, + "learning_rate": 6.034950588402526e-06, + "loss": 0.7461, + "step": 15814 + }, + { + "epoch": 0.8704386592547746, + "grad_norm": 0.6431320309638977, + "learning_rate": 6.034526505468829e-06, + "loss": 0.7436, + "step": 15815 + }, + { + "epoch": 0.8704936980571303, + "grad_norm": 1.122704267501831, + "learning_rate": 6.0341024147599055e-06, + "loss": 0.7378, + "step": 15816 + }, + { + "epoch": 0.870548736859486, + "grad_norm": 0.6391544938087463, + "learning_rate": 6.033678316278947e-06, + "loss": 0.7517, + "step": 15817 + }, + { + "epoch": 0.8706037756618416, + "grad_norm": 0.6522098183631897, + "learning_rate": 6.033254210029139e-06, + "loss": 0.7188, + "step": 15818 + }, + { + "epoch": 0.8706588144641972, + "grad_norm": 0.7638733386993408, + "learning_rate": 6.0328300960136686e-06, + "loss": 0.8032, + "step": 15819 + }, + { + "epoch": 0.8707138532665529, + "grad_norm": 0.6374132633209229, + "learning_rate": 6.032405974235722e-06, + "loss": 0.7292, + "step": 15820 + }, + { + "epoch": 0.8707688920689086, + "grad_norm": 0.7061800360679626, + "learning_rate": 6.03198184469849e-06, + "loss": 0.7304, + "step": 15821 + }, + { + "epoch": 0.8708239308712642, + "grad_norm": 0.646089494228363, + "learning_rate": 6.031557707405159e-06, + "loss": 0.6762, + "step": 15822 + }, + { + "epoch": 0.8708789696736199, + "grad_norm": 0.8142202496528625, + "learning_rate": 6.031133562358916e-06, + "loss": 0.7789, + "step": 15823 + }, + { + "epoch": 0.8709340084759756, + "grad_norm": 0.6444084644317627, + "learning_rate": 6.030709409562949e-06, + "loss": 0.7383, + "step": 15824 + }, + { + "epoch": 0.8709890472783313, + "grad_norm": 0.8917344808578491, + "learning_rate": 6.030285249020448e-06, + "loss": 0.7527, + "step": 15825 + }, + { + "epoch": 0.8710440860806868, + "grad_norm": 0.6395692825317383, + "learning_rate": 6.029861080734597e-06, + "loss": 0.6923, + "step": 15826 + }, + { + "epoch": 0.8710991248830425, + "grad_norm": 0.6475933790206909, + "learning_rate": 6.029436904708586e-06, + "loss": 0.7495, + "step": 15827 + }, + { + "epoch": 0.8711541636853982, + "grad_norm": 0.7310789823532104, + "learning_rate": 6.029012720945602e-06, + "loss": 0.7541, + "step": 15828 + }, + { + "epoch": 0.8712092024877539, + "grad_norm": 0.8475071787834167, + "learning_rate": 6.028588529448835e-06, + "loss": 0.7397, + "step": 15829 + }, + { + "epoch": 0.8712642412901095, + "grad_norm": 0.6214048266410828, + "learning_rate": 6.028164330221471e-06, + "loss": 0.7365, + "step": 15830 + }, + { + "epoch": 0.8713192800924652, + "grad_norm": 0.6558026671409607, + "learning_rate": 6.0277401232667e-06, + "loss": 0.79, + "step": 15831 + }, + { + "epoch": 0.8713743188948209, + "grad_norm": 0.6652923226356506, + "learning_rate": 6.0273159085877074e-06, + "loss": 0.7539, + "step": 15832 + }, + { + "epoch": 0.8714293576971766, + "grad_norm": 0.7908313870429993, + "learning_rate": 6.026891686187686e-06, + "loss": 0.6776, + "step": 15833 + }, + { + "epoch": 0.8714843964995321, + "grad_norm": 0.6947218775749207, + "learning_rate": 6.02646745606982e-06, + "loss": 0.7375, + "step": 15834 + }, + { + "epoch": 0.8715394353018878, + "grad_norm": 0.7137001156806946, + "learning_rate": 6.0260432182373e-06, + "loss": 0.7213, + "step": 15835 + }, + { + "epoch": 0.8715944741042435, + "grad_norm": 0.6175974011421204, + "learning_rate": 6.025618972693314e-06, + "loss": 0.6468, + "step": 15836 + }, + { + "epoch": 0.8716495129065992, + "grad_norm": 0.6631742119789124, + "learning_rate": 6.0251947194410496e-06, + "loss": 0.7116, + "step": 15837 + }, + { + "epoch": 0.8717045517089548, + "grad_norm": 0.7667781710624695, + "learning_rate": 6.024770458483698e-06, + "loss": 0.8836, + "step": 15838 + }, + { + "epoch": 0.8717595905113105, + "grad_norm": 0.664364218711853, + "learning_rate": 6.024346189824444e-06, + "loss": 0.7719, + "step": 15839 + }, + { + "epoch": 0.8718146293136662, + "grad_norm": 0.7073011994361877, + "learning_rate": 6.023921913466477e-06, + "loss": 0.7117, + "step": 15840 + }, + { + "epoch": 0.8718696681160218, + "grad_norm": 0.7126373052597046, + "learning_rate": 6.02349762941299e-06, + "loss": 0.7693, + "step": 15841 + }, + { + "epoch": 0.8719247069183774, + "grad_norm": 0.7864155173301697, + "learning_rate": 6.0230733376671665e-06, + "loss": 0.8195, + "step": 15842 + }, + { + "epoch": 0.8719797457207331, + "grad_norm": 0.7260663509368896, + "learning_rate": 6.0226490382322e-06, + "loss": 0.7739, + "step": 15843 + }, + { + "epoch": 0.8720347845230888, + "grad_norm": 0.7656667232513428, + "learning_rate": 6.0222247311112745e-06, + "loss": 0.6552, + "step": 15844 + }, + { + "epoch": 0.8720898233254445, + "grad_norm": 0.7063844799995422, + "learning_rate": 6.0218004163075826e-06, + "loss": 0.7506, + "step": 15845 + }, + { + "epoch": 0.8721448621278001, + "grad_norm": 0.6452813744544983, + "learning_rate": 6.021376093824313e-06, + "loss": 0.6854, + "step": 15846 + }, + { + "epoch": 0.8721999009301558, + "grad_norm": 0.6507169008255005, + "learning_rate": 6.020951763664653e-06, + "loss": 0.7289, + "step": 15847 + }, + { + "epoch": 0.8722549397325114, + "grad_norm": 0.6529967784881592, + "learning_rate": 6.020527425831793e-06, + "loss": 0.7196, + "step": 15848 + }, + { + "epoch": 0.8723099785348671, + "grad_norm": 0.8070194125175476, + "learning_rate": 6.020103080328924e-06, + "loss": 0.7848, + "step": 15849 + }, + { + "epoch": 0.8723650173372227, + "grad_norm": 0.7091495394706726, + "learning_rate": 6.019678727159232e-06, + "loss": 0.7948, + "step": 15850 + }, + { + "epoch": 0.8724200561395784, + "grad_norm": 0.8268260955810547, + "learning_rate": 6.019254366325907e-06, + "loss": 0.7446, + "step": 15851 + }, + { + "epoch": 0.8724750949419341, + "grad_norm": 0.7777679562568665, + "learning_rate": 6.018829997832139e-06, + "loss": 0.8307, + "step": 15852 + }, + { + "epoch": 0.8725301337442897, + "grad_norm": 0.6404305696487427, + "learning_rate": 6.018405621681117e-06, + "loss": 0.6952, + "step": 15853 + }, + { + "epoch": 0.8725851725466454, + "grad_norm": 0.8895840644836426, + "learning_rate": 6.017981237876033e-06, + "loss": 0.7554, + "step": 15854 + }, + { + "epoch": 0.872640211349001, + "grad_norm": 0.6717105507850647, + "learning_rate": 6.017556846420073e-06, + "loss": 0.674, + "step": 15855 + }, + { + "epoch": 0.8726952501513567, + "grad_norm": 0.6096089482307434, + "learning_rate": 6.017132447316427e-06, + "loss": 0.7508, + "step": 15856 + }, + { + "epoch": 0.8727502889537123, + "grad_norm": 0.7513056397438049, + "learning_rate": 6.016708040568288e-06, + "loss": 0.717, + "step": 15857 + }, + { + "epoch": 0.872805327756068, + "grad_norm": 0.6977408528327942, + "learning_rate": 6.0162836261788425e-06, + "loss": 0.7002, + "step": 15858 + }, + { + "epoch": 0.8728603665584237, + "grad_norm": 0.6753636598587036, + "learning_rate": 6.015859204151282e-06, + "loss": 0.7414, + "step": 15859 + }, + { + "epoch": 0.8729154053607794, + "grad_norm": 0.7120729684829712, + "learning_rate": 6.015434774488795e-06, + "loss": 0.6774, + "step": 15860 + }, + { + "epoch": 0.872970444163135, + "grad_norm": 0.7560111880302429, + "learning_rate": 6.015010337194573e-06, + "loss": 0.6887, + "step": 15861 + }, + { + "epoch": 0.8730254829654907, + "grad_norm": 0.652497410774231, + "learning_rate": 6.0145858922718044e-06, + "loss": 0.76, + "step": 15862 + }, + { + "epoch": 0.8730805217678463, + "grad_norm": 0.7120025753974915, + "learning_rate": 6.01416143972368e-06, + "loss": 0.7008, + "step": 15863 + }, + { + "epoch": 0.873135560570202, + "grad_norm": 0.7517643570899963, + "learning_rate": 6.013736979553389e-06, + "loss": 0.8944, + "step": 15864 + }, + { + "epoch": 0.8731905993725576, + "grad_norm": 0.6225923299789429, + "learning_rate": 6.013312511764122e-06, + "loss": 0.6217, + "step": 15865 + }, + { + "epoch": 0.8732456381749133, + "grad_norm": 0.8815253376960754, + "learning_rate": 6.012888036359071e-06, + "loss": 0.8121, + "step": 15866 + }, + { + "epoch": 0.873300676977269, + "grad_norm": 0.676211953163147, + "learning_rate": 6.012463553341424e-06, + "loss": 0.7233, + "step": 15867 + }, + { + "epoch": 0.8733557157796247, + "grad_norm": 0.6566252708435059, + "learning_rate": 6.012039062714371e-06, + "loss": 0.8099, + "step": 15868 + }, + { + "epoch": 0.8734107545819803, + "grad_norm": 0.7964142560958862, + "learning_rate": 6.011614564481103e-06, + "loss": 0.758, + "step": 15869 + }, + { + "epoch": 0.8734657933843359, + "grad_norm": 0.6923096776008606, + "learning_rate": 6.011190058644811e-06, + "loss": 0.6997, + "step": 15870 + }, + { + "epoch": 0.8735208321866916, + "grad_norm": 0.6507520079612732, + "learning_rate": 6.010765545208687e-06, + "loss": 0.7046, + "step": 15871 + }, + { + "epoch": 0.8735758709890473, + "grad_norm": 0.8206372857093811, + "learning_rate": 6.010341024175918e-06, + "loss": 0.8568, + "step": 15872 + }, + { + "epoch": 0.8736309097914029, + "grad_norm": 0.6379685997962952, + "learning_rate": 6.0099164955496965e-06, + "loss": 0.7537, + "step": 15873 + }, + { + "epoch": 0.8736859485937586, + "grad_norm": 0.7258248925209045, + "learning_rate": 6.009491959333214e-06, + "loss": 0.6946, + "step": 15874 + }, + { + "epoch": 0.8737409873961143, + "grad_norm": 0.6882272362709045, + "learning_rate": 6.0090674155296606e-06, + "loss": 0.6508, + "step": 15875 + }, + { + "epoch": 0.87379602619847, + "grad_norm": 0.646864652633667, + "learning_rate": 6.0086428641422245e-06, + "loss": 0.7061, + "step": 15876 + }, + { + "epoch": 0.8738510650008255, + "grad_norm": 0.772055983543396, + "learning_rate": 6.008218305174099e-06, + "loss": 0.7435, + "step": 15877 + }, + { + "epoch": 0.8739061038031812, + "grad_norm": 0.660976767539978, + "learning_rate": 6.007793738628476e-06, + "loss": 0.6834, + "step": 15878 + }, + { + "epoch": 0.8739611426055369, + "grad_norm": 0.6279324293136597, + "learning_rate": 6.007369164508544e-06, + "loss": 0.6903, + "step": 15879 + }, + { + "epoch": 0.8740161814078926, + "grad_norm": 0.7111205458641052, + "learning_rate": 6.006944582817495e-06, + "loss": 0.7338, + "step": 15880 + }, + { + "epoch": 0.8740712202102482, + "grad_norm": 0.6149270534515381, + "learning_rate": 6.006519993558519e-06, + "loss": 0.6639, + "step": 15881 + }, + { + "epoch": 0.8741262590126039, + "grad_norm": 0.7477333545684814, + "learning_rate": 6.00609539673481e-06, + "loss": 0.8032, + "step": 15882 + }, + { + "epoch": 0.8741812978149596, + "grad_norm": 0.8613518476486206, + "learning_rate": 6.005670792349557e-06, + "loss": 0.7508, + "step": 15883 + }, + { + "epoch": 0.8742363366173153, + "grad_norm": 0.6627817153930664, + "learning_rate": 6.0052461804059515e-06, + "loss": 0.7898, + "step": 15884 + }, + { + "epoch": 0.8742913754196708, + "grad_norm": 0.6863798499107361, + "learning_rate": 6.004821560907185e-06, + "loss": 0.7674, + "step": 15885 + }, + { + "epoch": 0.8743464142220265, + "grad_norm": 0.6809577941894531, + "learning_rate": 6.004396933856449e-06, + "loss": 0.8094, + "step": 15886 + }, + { + "epoch": 0.8744014530243822, + "grad_norm": 0.6340956687927246, + "learning_rate": 6.003972299256934e-06, + "loss": 0.6508, + "step": 15887 + }, + { + "epoch": 0.8744564918267379, + "grad_norm": 0.6261658072471619, + "learning_rate": 6.003547657111831e-06, + "loss": 0.7375, + "step": 15888 + }, + { + "epoch": 0.8745115306290935, + "grad_norm": 0.7042009830474854, + "learning_rate": 6.003123007424332e-06, + "loss": 0.6817, + "step": 15889 + }, + { + "epoch": 0.8745665694314492, + "grad_norm": 0.719497561454773, + "learning_rate": 6.002698350197631e-06, + "loss": 0.7689, + "step": 15890 + }, + { + "epoch": 0.8746216082338049, + "grad_norm": 0.7034541964530945, + "learning_rate": 6.002273685434916e-06, + "loss": 0.7956, + "step": 15891 + }, + { + "epoch": 0.8746766470361605, + "grad_norm": 0.6404724717140198, + "learning_rate": 6.0018490131393815e-06, + "loss": 0.672, + "step": 15892 + }, + { + "epoch": 0.8747316858385161, + "grad_norm": 0.6812208294868469, + "learning_rate": 6.001424333314216e-06, + "loss": 0.6911, + "step": 15893 + }, + { + "epoch": 0.8747867246408718, + "grad_norm": 0.5907782912254333, + "learning_rate": 6.000999645962615e-06, + "loss": 0.6476, + "step": 15894 + }, + { + "epoch": 0.8748417634432275, + "grad_norm": 1.2116328477859497, + "learning_rate": 6.000574951087769e-06, + "loss": 0.733, + "step": 15895 + }, + { + "epoch": 0.8748968022455831, + "grad_norm": 0.6581991314888, + "learning_rate": 6.000150248692868e-06, + "loss": 0.7441, + "step": 15896 + }, + { + "epoch": 0.8749518410479388, + "grad_norm": 0.7342226505279541, + "learning_rate": 5.999725538781107e-06, + "loss": 0.6592, + "step": 15897 + }, + { + "epoch": 0.8750068798502945, + "grad_norm": 0.6864113211631775, + "learning_rate": 5.9993008213556766e-06, + "loss": 0.7652, + "step": 15898 + }, + { + "epoch": 0.8750619186526502, + "grad_norm": 0.6845645904541016, + "learning_rate": 5.9988760964197675e-06, + "loss": 0.7471, + "step": 15899 + }, + { + "epoch": 0.8751169574550057, + "grad_norm": 0.663165271282196, + "learning_rate": 5.998451363976574e-06, + "loss": 0.7667, + "step": 15900 + }, + { + "epoch": 0.8751719962573614, + "grad_norm": 0.6032472252845764, + "learning_rate": 5.998026624029286e-06, + "loss": 0.6309, + "step": 15901 + }, + { + "epoch": 0.8752270350597171, + "grad_norm": 0.6466236710548401, + "learning_rate": 5.997601876581098e-06, + "loss": 0.7418, + "step": 15902 + }, + { + "epoch": 0.8752820738620728, + "grad_norm": 0.6456779837608337, + "learning_rate": 5.997177121635201e-06, + "loss": 0.6598, + "step": 15903 + }, + { + "epoch": 0.8753371126644284, + "grad_norm": 0.7854783535003662, + "learning_rate": 5.996752359194788e-06, + "loss": 0.7545, + "step": 15904 + }, + { + "epoch": 0.8753921514667841, + "grad_norm": 0.7146682143211365, + "learning_rate": 5.99632758926305e-06, + "loss": 0.6856, + "step": 15905 + }, + { + "epoch": 0.8754471902691398, + "grad_norm": 0.7379660606384277, + "learning_rate": 5.995902811843181e-06, + "loss": 0.8601, + "step": 15906 + }, + { + "epoch": 0.8755022290714954, + "grad_norm": 0.6879674196243286, + "learning_rate": 5.995478026938375e-06, + "loss": 0.8147, + "step": 15907 + }, + { + "epoch": 0.875557267873851, + "grad_norm": 0.6188415884971619, + "learning_rate": 5.995053234551821e-06, + "loss": 0.6646, + "step": 15908 + }, + { + "epoch": 0.8756123066762067, + "grad_norm": 1.0765966176986694, + "learning_rate": 5.994628434686713e-06, + "loss": 0.6624, + "step": 15909 + }, + { + "epoch": 0.8756673454785624, + "grad_norm": 0.6391757726669312, + "learning_rate": 5.994203627346245e-06, + "loss": 0.7087, + "step": 15910 + }, + { + "epoch": 0.8757223842809181, + "grad_norm": 0.7664490938186646, + "learning_rate": 5.993778812533609e-06, + "loss": 0.6969, + "step": 15911 + }, + { + "epoch": 0.8757774230832737, + "grad_norm": 0.6901882290840149, + "learning_rate": 5.993353990251995e-06, + "loss": 0.7879, + "step": 15912 + }, + { + "epoch": 0.8758324618856294, + "grad_norm": 0.6871299147605896, + "learning_rate": 5.992929160504599e-06, + "loss": 0.6981, + "step": 15913 + }, + { + "epoch": 0.875887500687985, + "grad_norm": 0.754436731338501, + "learning_rate": 5.9925043232946145e-06, + "loss": 0.7549, + "step": 15914 + }, + { + "epoch": 0.8759425394903407, + "grad_norm": 0.7250627875328064, + "learning_rate": 5.992079478625232e-06, + "loss": 0.8134, + "step": 15915 + }, + { + "epoch": 0.8759975782926963, + "grad_norm": 0.7294771671295166, + "learning_rate": 5.991654626499647e-06, + "loss": 0.8551, + "step": 15916 + }, + { + "epoch": 0.876052617095052, + "grad_norm": 0.6388968229293823, + "learning_rate": 5.991229766921049e-06, + "loss": 0.7562, + "step": 15917 + }, + { + "epoch": 0.8761076558974077, + "grad_norm": 0.7206701636314392, + "learning_rate": 5.990804899892636e-06, + "loss": 0.6818, + "step": 15918 + }, + { + "epoch": 0.8761626946997634, + "grad_norm": 0.6607910394668579, + "learning_rate": 5.990380025417597e-06, + "loss": 0.6956, + "step": 15919 + }, + { + "epoch": 0.876217733502119, + "grad_norm": 0.6806843280792236, + "learning_rate": 5.9899551434991276e-06, + "loss": 0.7261, + "step": 15920 + }, + { + "epoch": 0.8762727723044746, + "grad_norm": 0.6994869709014893, + "learning_rate": 5.989530254140421e-06, + "loss": 0.7709, + "step": 15921 + }, + { + "epoch": 0.8763278111068303, + "grad_norm": 0.6707572937011719, + "learning_rate": 5.9891053573446685e-06, + "loss": 0.6962, + "step": 15922 + }, + { + "epoch": 0.876382849909186, + "grad_norm": 0.8244118690490723, + "learning_rate": 5.988680453115065e-06, + "loss": 0.7173, + "step": 15923 + }, + { + "epoch": 0.8764378887115416, + "grad_norm": 0.7745859026908875, + "learning_rate": 5.988255541454806e-06, + "loss": 0.7176, + "step": 15924 + }, + { + "epoch": 0.8764929275138973, + "grad_norm": 0.6572975516319275, + "learning_rate": 5.98783062236708e-06, + "loss": 0.6994, + "step": 15925 + }, + { + "epoch": 0.876547966316253, + "grad_norm": 0.7363128066062927, + "learning_rate": 5.9874056958550845e-06, + "loss": 0.7392, + "step": 15926 + }, + { + "epoch": 0.8766030051186087, + "grad_norm": 0.6638974547386169, + "learning_rate": 5.986980761922012e-06, + "loss": 0.693, + "step": 15927 + }, + { + "epoch": 0.8766580439209642, + "grad_norm": 0.6946521997451782, + "learning_rate": 5.9865558205710576e-06, + "loss": 0.7727, + "step": 15928 + }, + { + "epoch": 0.8767130827233199, + "grad_norm": 0.7787197232246399, + "learning_rate": 5.9861308718054115e-06, + "loss": 0.7203, + "step": 15929 + }, + { + "epoch": 0.8767681215256756, + "grad_norm": 0.6275830864906311, + "learning_rate": 5.985705915628271e-06, + "loss": 0.7613, + "step": 15930 + }, + { + "epoch": 0.8768231603280313, + "grad_norm": 0.5722871422767639, + "learning_rate": 5.985280952042829e-06, + "loss": 0.6635, + "step": 15931 + }, + { + "epoch": 0.8768781991303869, + "grad_norm": 0.6991726160049438, + "learning_rate": 5.984855981052278e-06, + "loss": 0.7965, + "step": 15932 + }, + { + "epoch": 0.8769332379327426, + "grad_norm": 0.6550602912902832, + "learning_rate": 5.984431002659815e-06, + "loss": 0.7591, + "step": 15933 + }, + { + "epoch": 0.8769882767350983, + "grad_norm": 0.703481137752533, + "learning_rate": 5.984006016868631e-06, + "loss": 0.7623, + "step": 15934 + }, + { + "epoch": 0.877043315537454, + "grad_norm": 0.5606309175491333, + "learning_rate": 5.983581023681922e-06, + "loss": 0.5957, + "step": 15935 + }, + { + "epoch": 0.8770983543398095, + "grad_norm": 0.6857683062553406, + "learning_rate": 5.98315602310288e-06, + "loss": 0.7629, + "step": 15936 + }, + { + "epoch": 0.8771533931421652, + "grad_norm": 0.6976568698883057, + "learning_rate": 5.982731015134699e-06, + "loss": 0.7259, + "step": 15937 + }, + { + "epoch": 0.8772084319445209, + "grad_norm": 0.7161057591438293, + "learning_rate": 5.982305999780578e-06, + "loss": 0.8572, + "step": 15938 + }, + { + "epoch": 0.8772634707468765, + "grad_norm": 0.7051637768745422, + "learning_rate": 5.981880977043706e-06, + "loss": 0.8251, + "step": 15939 + }, + { + "epoch": 0.8773185095492322, + "grad_norm": 0.6403392553329468, + "learning_rate": 5.98145594692728e-06, + "loss": 0.7021, + "step": 15940 + }, + { + "epoch": 0.8773735483515879, + "grad_norm": 0.8370338678359985, + "learning_rate": 5.981030909434493e-06, + "loss": 0.6376, + "step": 15941 + }, + { + "epoch": 0.8774285871539436, + "grad_norm": 0.7969813346862793, + "learning_rate": 5.980605864568541e-06, + "loss": 0.7935, + "step": 15942 + }, + { + "epoch": 0.8774836259562991, + "grad_norm": 0.7235258221626282, + "learning_rate": 5.980180812332619e-06, + "loss": 0.8024, + "step": 15943 + }, + { + "epoch": 0.8775386647586548, + "grad_norm": 0.6627998352050781, + "learning_rate": 5.97975575272992e-06, + "loss": 0.8184, + "step": 15944 + }, + { + "epoch": 0.8775937035610105, + "grad_norm": 1.123727798461914, + "learning_rate": 5.979330685763638e-06, + "loss": 0.9015, + "step": 15945 + }, + { + "epoch": 0.8776487423633662, + "grad_norm": 0.8116182088851929, + "learning_rate": 5.97890561143697e-06, + "loss": 0.7327, + "step": 15946 + }, + { + "epoch": 0.8777037811657218, + "grad_norm": 0.6537826657295227, + "learning_rate": 5.978480529753108e-06, + "loss": 0.7215, + "step": 15947 + }, + { + "epoch": 0.8777588199680775, + "grad_norm": 0.746971845626831, + "learning_rate": 5.978055440715249e-06, + "loss": 0.8549, + "step": 15948 + }, + { + "epoch": 0.8778138587704332, + "grad_norm": 0.7417864799499512, + "learning_rate": 5.9776303443265855e-06, + "loss": 0.7806, + "step": 15949 + }, + { + "epoch": 0.8778688975727889, + "grad_norm": 0.6726604700088501, + "learning_rate": 5.977205240590317e-06, + "loss": 0.7745, + "step": 15950 + }, + { + "epoch": 0.8779239363751444, + "grad_norm": 0.9483097791671753, + "learning_rate": 5.976780129509634e-06, + "loss": 0.81, + "step": 15951 + }, + { + "epoch": 0.8779789751775001, + "grad_norm": 0.7212807536125183, + "learning_rate": 5.976355011087734e-06, + "loss": 0.6534, + "step": 15952 + }, + { + "epoch": 0.8780340139798558, + "grad_norm": 0.725513756275177, + "learning_rate": 5.975929885327808e-06, + "loss": 0.6902, + "step": 15953 + }, + { + "epoch": 0.8780890527822115, + "grad_norm": 0.6132134795188904, + "learning_rate": 5.975504752233057e-06, + "loss": 0.7397, + "step": 15954 + }, + { + "epoch": 0.8781440915845671, + "grad_norm": 0.7595381736755371, + "learning_rate": 5.975079611806672e-06, + "loss": 0.5893, + "step": 15955 + }, + { + "epoch": 0.8781991303869228, + "grad_norm": 0.7069967985153198, + "learning_rate": 5.974654464051851e-06, + "loss": 0.7702, + "step": 15956 + }, + { + "epoch": 0.8782541691892785, + "grad_norm": 0.650039553642273, + "learning_rate": 5.974229308971788e-06, + "loss": 0.7708, + "step": 15957 + }, + { + "epoch": 0.8783092079916341, + "grad_norm": 0.6799747347831726, + "learning_rate": 5.973804146569677e-06, + "loss": 0.7094, + "step": 15958 + }, + { + "epoch": 0.8783642467939897, + "grad_norm": 0.653275728225708, + "learning_rate": 5.973378976848716e-06, + "loss": 0.623, + "step": 15959 + }, + { + "epoch": 0.8784192855963454, + "grad_norm": 0.6734615564346313, + "learning_rate": 5.972953799812098e-06, + "loss": 0.7071, + "step": 15960 + }, + { + "epoch": 0.8784743243987011, + "grad_norm": 0.6319865584373474, + "learning_rate": 5.9725286154630205e-06, + "loss": 0.7239, + "step": 15961 + }, + { + "epoch": 0.8785293632010568, + "grad_norm": 0.6933672428131104, + "learning_rate": 5.972103423804677e-06, + "loss": 0.7138, + "step": 15962 + }, + { + "epoch": 0.8785844020034124, + "grad_norm": 0.7323144674301147, + "learning_rate": 5.971678224840266e-06, + "loss": 0.7534, + "step": 15963 + }, + { + "epoch": 0.8786394408057681, + "grad_norm": 0.6736310720443726, + "learning_rate": 5.971253018572981e-06, + "loss": 0.7644, + "step": 15964 + }, + { + "epoch": 0.8786944796081237, + "grad_norm": 0.6524562835693359, + "learning_rate": 5.970827805006016e-06, + "loss": 0.7727, + "step": 15965 + }, + { + "epoch": 0.8787495184104794, + "grad_norm": 0.7736978530883789, + "learning_rate": 5.970402584142573e-06, + "loss": 0.7641, + "step": 15966 + }, + { + "epoch": 0.878804557212835, + "grad_norm": 0.7099476456642151, + "learning_rate": 5.969977355985842e-06, + "loss": 0.7064, + "step": 15967 + }, + { + "epoch": 0.8788595960151907, + "grad_norm": 0.6340349912643433, + "learning_rate": 5.969552120539021e-06, + "loss": 0.7135, + "step": 15968 + }, + { + "epoch": 0.8789146348175464, + "grad_norm": 0.649853527545929, + "learning_rate": 5.969126877805306e-06, + "loss": 0.7068, + "step": 15969 + }, + { + "epoch": 0.8789696736199021, + "grad_norm": 0.589920699596405, + "learning_rate": 5.9687016277878925e-06, + "loss": 0.6891, + "step": 15970 + }, + { + "epoch": 0.8790247124222577, + "grad_norm": 0.7485616207122803, + "learning_rate": 5.968276370489977e-06, + "loss": 0.7305, + "step": 15971 + }, + { + "epoch": 0.8790797512246133, + "grad_norm": 0.6765890121459961, + "learning_rate": 5.967851105914756e-06, + "loss": 0.6862, + "step": 15972 + }, + { + "epoch": 0.879134790026969, + "grad_norm": 0.7127717733383179, + "learning_rate": 5.967425834065423e-06, + "loss": 0.6255, + "step": 15973 + }, + { + "epoch": 0.8791898288293247, + "grad_norm": 1.1564292907714844, + "learning_rate": 5.967000554945179e-06, + "loss": 0.7351, + "step": 15974 + }, + { + "epoch": 0.8792448676316803, + "grad_norm": 0.7343658804893494, + "learning_rate": 5.966575268557217e-06, + "loss": 0.7605, + "step": 15975 + }, + { + "epoch": 0.879299906434036, + "grad_norm": 0.695280134677887, + "learning_rate": 5.966149974904733e-06, + "loss": 0.7425, + "step": 15976 + }, + { + "epoch": 0.8793549452363917, + "grad_norm": 0.7075724005699158, + "learning_rate": 5.965724673990925e-06, + "loss": 0.6501, + "step": 15977 + }, + { + "epoch": 0.8794099840387474, + "grad_norm": 0.6764316558837891, + "learning_rate": 5.96529936581899e-06, + "loss": 0.6871, + "step": 15978 + }, + { + "epoch": 0.879465022841103, + "grad_norm": 0.6782918572425842, + "learning_rate": 5.964874050392122e-06, + "loss": 0.7298, + "step": 15979 + }, + { + "epoch": 0.8795200616434586, + "grad_norm": 0.7106739282608032, + "learning_rate": 5.964448727713519e-06, + "loss": 0.755, + "step": 15980 + }, + { + "epoch": 0.8795751004458143, + "grad_norm": 0.7591151595115662, + "learning_rate": 5.964023397786378e-06, + "loss": 0.7603, + "step": 15981 + }, + { + "epoch": 0.8796301392481699, + "grad_norm": 0.6050585508346558, + "learning_rate": 5.963598060613896e-06, + "loss": 0.6895, + "step": 15982 + }, + { + "epoch": 0.8796851780505256, + "grad_norm": 0.7037108540534973, + "learning_rate": 5.963172716199267e-06, + "loss": 0.7475, + "step": 15983 + }, + { + "epoch": 0.8797402168528813, + "grad_norm": 0.6476989984512329, + "learning_rate": 5.962747364545692e-06, + "loss": 0.7925, + "step": 15984 + }, + { + "epoch": 0.879795255655237, + "grad_norm": 0.6409072875976562, + "learning_rate": 5.962322005656362e-06, + "loss": 0.8134, + "step": 15985 + }, + { + "epoch": 0.8798502944575926, + "grad_norm": 0.8730958700180054, + "learning_rate": 5.96189663953448e-06, + "loss": 0.8283, + "step": 15986 + }, + { + "epoch": 0.8799053332599482, + "grad_norm": 0.73405522108078, + "learning_rate": 5.96147126618324e-06, + "loss": 0.7351, + "step": 15987 + }, + { + "epoch": 0.8799603720623039, + "grad_norm": 0.6587926745414734, + "learning_rate": 5.961045885605839e-06, + "loss": 0.7713, + "step": 15988 + }, + { + "epoch": 0.8800154108646596, + "grad_norm": 0.8479684591293335, + "learning_rate": 5.9606204978054736e-06, + "loss": 0.7537, + "step": 15989 + }, + { + "epoch": 0.8800704496670152, + "grad_norm": 0.731315553188324, + "learning_rate": 5.960195102785343e-06, + "loss": 0.7363, + "step": 15990 + }, + { + "epoch": 0.8801254884693709, + "grad_norm": 0.7163324952125549, + "learning_rate": 5.9597697005486434e-06, + "loss": 0.718, + "step": 15991 + }, + { + "epoch": 0.8801805272717266, + "grad_norm": 0.6090518832206726, + "learning_rate": 5.9593442910985714e-06, + "loss": 0.6002, + "step": 15992 + }, + { + "epoch": 0.8802355660740823, + "grad_norm": 0.633756160736084, + "learning_rate": 5.958918874438324e-06, + "loss": 0.5981, + "step": 15993 + }, + { + "epoch": 0.8802906048764378, + "grad_norm": 0.728961169719696, + "learning_rate": 5.958493450571099e-06, + "loss": 0.7964, + "step": 15994 + }, + { + "epoch": 0.8803456436787935, + "grad_norm": 0.8194692134857178, + "learning_rate": 5.958068019500094e-06, + "loss": 0.748, + "step": 15995 + }, + { + "epoch": 0.8804006824811492, + "grad_norm": 0.9324885010719299, + "learning_rate": 5.957642581228506e-06, + "loss": 0.8263, + "step": 15996 + }, + { + "epoch": 0.8804557212835049, + "grad_norm": 0.7585923075675964, + "learning_rate": 5.957217135759532e-06, + "loss": 0.8208, + "step": 15997 + }, + { + "epoch": 0.8805107600858605, + "grad_norm": 0.726158618927002, + "learning_rate": 5.956791683096371e-06, + "loss": 0.7664, + "step": 15998 + }, + { + "epoch": 0.8805657988882162, + "grad_norm": 0.7648130059242249, + "learning_rate": 5.95636622324222e-06, + "loss": 0.8102, + "step": 15999 + }, + { + "epoch": 0.8806208376905719, + "grad_norm": 0.7053543925285339, + "learning_rate": 5.955940756200277e-06, + "loss": 0.8446, + "step": 16000 + }, + { + "epoch": 0.8806758764929276, + "grad_norm": 1.0602153539657593, + "learning_rate": 5.955515281973737e-06, + "loss": 0.8199, + "step": 16001 + }, + { + "epoch": 0.8807309152952831, + "grad_norm": 0.7527370452880859, + "learning_rate": 5.955089800565802e-06, + "loss": 0.7895, + "step": 16002 + }, + { + "epoch": 0.8807859540976388, + "grad_norm": 0.7191178202629089, + "learning_rate": 5.954664311979666e-06, + "loss": 0.8836, + "step": 16003 + }, + { + "epoch": 0.8808409928999945, + "grad_norm": 0.6993798017501831, + "learning_rate": 5.95423881621853e-06, + "loss": 0.7765, + "step": 16004 + }, + { + "epoch": 0.8808960317023502, + "grad_norm": 0.6863248944282532, + "learning_rate": 5.9538133132855915e-06, + "loss": 0.6294, + "step": 16005 + }, + { + "epoch": 0.8809510705047058, + "grad_norm": 0.7736430168151855, + "learning_rate": 5.953387803184046e-06, + "loss": 0.9003, + "step": 16006 + }, + { + "epoch": 0.8810061093070615, + "grad_norm": 0.6101526618003845, + "learning_rate": 5.9529622859170935e-06, + "loss": 0.7125, + "step": 16007 + }, + { + "epoch": 0.8810611481094172, + "grad_norm": 0.7688401341438293, + "learning_rate": 5.952536761487932e-06, + "loss": 0.7667, + "step": 16008 + }, + { + "epoch": 0.8811161869117728, + "grad_norm": 0.6438688039779663, + "learning_rate": 5.9521112298997575e-06, + "loss": 0.7327, + "step": 16009 + }, + { + "epoch": 0.8811712257141284, + "grad_norm": 0.7732130885124207, + "learning_rate": 5.951685691155769e-06, + "loss": 0.8466, + "step": 16010 + }, + { + "epoch": 0.8812262645164841, + "grad_norm": 0.755892813205719, + "learning_rate": 5.951260145259168e-06, + "loss": 0.8042, + "step": 16011 + }, + { + "epoch": 0.8812813033188398, + "grad_norm": 0.7132954001426697, + "learning_rate": 5.950834592213151e-06, + "loss": 0.7801, + "step": 16012 + }, + { + "epoch": 0.8813363421211955, + "grad_norm": 0.702319324016571, + "learning_rate": 5.950409032020914e-06, + "loss": 0.7487, + "step": 16013 + }, + { + "epoch": 0.8813913809235511, + "grad_norm": 0.6477691531181335, + "learning_rate": 5.949983464685656e-06, + "loss": 0.6942, + "step": 16014 + }, + { + "epoch": 0.8814464197259068, + "grad_norm": 0.6817807555198669, + "learning_rate": 5.949557890210578e-06, + "loss": 0.676, + "step": 16015 + }, + { + "epoch": 0.8815014585282624, + "grad_norm": 0.6980645060539246, + "learning_rate": 5.949132308598877e-06, + "loss": 0.7837, + "step": 16016 + }, + { + "epoch": 0.8815564973306181, + "grad_norm": 0.9056459665298462, + "learning_rate": 5.948706719853753e-06, + "loss": 0.7005, + "step": 16017 + }, + { + "epoch": 0.8816115361329737, + "grad_norm": 0.8172656297683716, + "learning_rate": 5.948281123978402e-06, + "loss": 0.7784, + "step": 16018 + }, + { + "epoch": 0.8816665749353294, + "grad_norm": 0.6387753486633301, + "learning_rate": 5.947855520976025e-06, + "loss": 0.8171, + "step": 16019 + }, + { + "epoch": 0.8817216137376851, + "grad_norm": 0.6150957345962524, + "learning_rate": 5.947429910849818e-06, + "loss": 0.6448, + "step": 16020 + }, + { + "epoch": 0.8817766525400408, + "grad_norm": 0.7051831483840942, + "learning_rate": 5.947004293602982e-06, + "loss": 0.7585, + "step": 16021 + }, + { + "epoch": 0.8818316913423964, + "grad_norm": 0.7967584729194641, + "learning_rate": 5.946578669238714e-06, + "loss": 0.8122, + "step": 16022 + }, + { + "epoch": 0.881886730144752, + "grad_norm": 0.6126663088798523, + "learning_rate": 5.946153037760216e-06, + "loss": 0.6149, + "step": 16023 + }, + { + "epoch": 0.8819417689471077, + "grad_norm": 0.6940233111381531, + "learning_rate": 5.945727399170684e-06, + "loss": 0.7373, + "step": 16024 + }, + { + "epoch": 0.8819968077494633, + "grad_norm": 0.652776837348938, + "learning_rate": 5.945301753473318e-06, + "loss": 0.7029, + "step": 16025 + }, + { + "epoch": 0.882051846551819, + "grad_norm": 0.7182415723800659, + "learning_rate": 5.944876100671317e-06, + "loss": 0.7343, + "step": 16026 + }, + { + "epoch": 0.8821068853541747, + "grad_norm": 0.6714525818824768, + "learning_rate": 5.944450440767881e-06, + "loss": 0.7402, + "step": 16027 + }, + { + "epoch": 0.8821619241565304, + "grad_norm": 0.7144107818603516, + "learning_rate": 5.944024773766208e-06, + "loss": 0.6775, + "step": 16028 + }, + { + "epoch": 0.882216962958886, + "grad_norm": 0.6483643054962158, + "learning_rate": 5.943599099669497e-06, + "loss": 0.698, + "step": 16029 + }, + { + "epoch": 0.8822720017612417, + "grad_norm": 0.6388065218925476, + "learning_rate": 5.943173418480949e-06, + "loss": 0.7814, + "step": 16030 + }, + { + "epoch": 0.8823270405635973, + "grad_norm": 0.6891177892684937, + "learning_rate": 5.942747730203761e-06, + "loss": 0.7423, + "step": 16031 + }, + { + "epoch": 0.882382079365953, + "grad_norm": 0.6425840258598328, + "learning_rate": 5.942322034841133e-06, + "loss": 0.7178, + "step": 16032 + }, + { + "epoch": 0.8824371181683086, + "grad_norm": 0.6308293342590332, + "learning_rate": 5.941896332396266e-06, + "loss": 0.6624, + "step": 16033 + }, + { + "epoch": 0.8824921569706643, + "grad_norm": 0.6310557126998901, + "learning_rate": 5.941470622872358e-06, + "loss": 0.7365, + "step": 16034 + }, + { + "epoch": 0.88254719577302, + "grad_norm": 0.7731123566627502, + "learning_rate": 5.941044906272609e-06, + "loss": 0.6385, + "step": 16035 + }, + { + "epoch": 0.8826022345753757, + "grad_norm": 0.7063789963722229, + "learning_rate": 5.940619182600217e-06, + "loss": 0.7606, + "step": 16036 + }, + { + "epoch": 0.8826572733777313, + "grad_norm": 0.6370593309402466, + "learning_rate": 5.940193451858384e-06, + "loss": 0.7332, + "step": 16037 + }, + { + "epoch": 0.8827123121800869, + "grad_norm": 0.7685242891311646, + "learning_rate": 5.9397677140503085e-06, + "loss": 0.7809, + "step": 16038 + }, + { + "epoch": 0.8827673509824426, + "grad_norm": 0.8439769148826599, + "learning_rate": 5.93934196917919e-06, + "loss": 0.7419, + "step": 16039 + }, + { + "epoch": 0.8828223897847983, + "grad_norm": 0.873909056186676, + "learning_rate": 5.93891621724823e-06, + "loss": 0.6559, + "step": 16040 + }, + { + "epoch": 0.8828774285871539, + "grad_norm": 0.711383581161499, + "learning_rate": 5.938490458260626e-06, + "loss": 0.718, + "step": 16041 + }, + { + "epoch": 0.8829324673895096, + "grad_norm": 0.6775739192962646, + "learning_rate": 5.938064692219579e-06, + "loss": 0.7062, + "step": 16042 + }, + { + "epoch": 0.8829875061918653, + "grad_norm": 0.687095582485199, + "learning_rate": 5.93763891912829e-06, + "loss": 0.7773, + "step": 16043 + }, + { + "epoch": 0.883042544994221, + "grad_norm": 0.6648910641670227, + "learning_rate": 5.937213138989957e-06, + "loss": 0.7267, + "step": 16044 + }, + { + "epoch": 0.8830975837965765, + "grad_norm": 0.6296299695968628, + "learning_rate": 5.9367873518077815e-06, + "loss": 0.7561, + "step": 16045 + }, + { + "epoch": 0.8831526225989322, + "grad_norm": 1.1233999729156494, + "learning_rate": 5.936361557584961e-06, + "loss": 0.7401, + "step": 16046 + }, + { + "epoch": 0.8832076614012879, + "grad_norm": 0.7307866811752319, + "learning_rate": 5.935935756324699e-06, + "loss": 0.7945, + "step": 16047 + }, + { + "epoch": 0.8832627002036436, + "grad_norm": 0.627402663230896, + "learning_rate": 5.9355099480301944e-06, + "loss": 0.7005, + "step": 16048 + }, + { + "epoch": 0.8833177390059992, + "grad_norm": 0.6698537468910217, + "learning_rate": 5.935084132704648e-06, + "loss": 0.7349, + "step": 16049 + }, + { + "epoch": 0.8833727778083549, + "grad_norm": 0.7348290681838989, + "learning_rate": 5.934658310351258e-06, + "loss": 0.8033, + "step": 16050 + }, + { + "epoch": 0.8834278166107106, + "grad_norm": 0.6543971300125122, + "learning_rate": 5.934232480973228e-06, + "loss": 0.706, + "step": 16051 + }, + { + "epoch": 0.8834828554130663, + "grad_norm": 0.7279872894287109, + "learning_rate": 5.933806644573756e-06, + "loss": 0.8142, + "step": 16052 + }, + { + "epoch": 0.8835378942154218, + "grad_norm": 0.6433993577957153, + "learning_rate": 5.933380801156044e-06, + "loss": 0.7092, + "step": 16053 + }, + { + "epoch": 0.8835929330177775, + "grad_norm": 0.7375844717025757, + "learning_rate": 5.932954950723291e-06, + "loss": 0.7632, + "step": 16054 + }, + { + "epoch": 0.8836479718201332, + "grad_norm": 0.6837477087974548, + "learning_rate": 5.932529093278698e-06, + "loss": 0.735, + "step": 16055 + }, + { + "epoch": 0.8837030106224889, + "grad_norm": 0.5978528261184692, + "learning_rate": 5.932103228825467e-06, + "loss": 0.6745, + "step": 16056 + }, + { + "epoch": 0.8837580494248445, + "grad_norm": 0.6475256085395813, + "learning_rate": 5.931677357366798e-06, + "loss": 0.7469, + "step": 16057 + }, + { + "epoch": 0.8838130882272002, + "grad_norm": 0.685108482837677, + "learning_rate": 5.931251478905888e-06, + "loss": 0.7528, + "step": 16058 + }, + { + "epoch": 0.8838681270295559, + "grad_norm": 0.7046063542366028, + "learning_rate": 5.930825593445945e-06, + "loss": 0.8157, + "step": 16059 + }, + { + "epoch": 0.8839231658319116, + "grad_norm": 0.6626511216163635, + "learning_rate": 5.930399700990165e-06, + "loss": 0.791, + "step": 16060 + }, + { + "epoch": 0.8839782046342671, + "grad_norm": 0.7249611020088196, + "learning_rate": 5.929973801541749e-06, + "loss": 0.8428, + "step": 16061 + }, + { + "epoch": 0.8840332434366228, + "grad_norm": 0.5953019261360168, + "learning_rate": 5.929547895103899e-06, + "loss": 0.6589, + "step": 16062 + }, + { + "epoch": 0.8840882822389785, + "grad_norm": 0.6847557425498962, + "learning_rate": 5.9291219816798165e-06, + "loss": 0.7976, + "step": 16063 + }, + { + "epoch": 0.8841433210413342, + "grad_norm": 0.7806814312934875, + "learning_rate": 5.928696061272701e-06, + "loss": 0.7879, + "step": 16064 + }, + { + "epoch": 0.8841983598436898, + "grad_norm": 1.028922200202942, + "learning_rate": 5.9282701338857555e-06, + "loss": 0.732, + "step": 16065 + }, + { + "epoch": 0.8842533986460455, + "grad_norm": 0.6143102645874023, + "learning_rate": 5.927844199522179e-06, + "loss": 0.7216, + "step": 16066 + }, + { + "epoch": 0.8843084374484012, + "grad_norm": 0.6739519834518433, + "learning_rate": 5.927418258185176e-06, + "loss": 0.7134, + "step": 16067 + }, + { + "epoch": 0.8843634762507567, + "grad_norm": 0.7888758778572083, + "learning_rate": 5.926992309877944e-06, + "loss": 0.7396, + "step": 16068 + }, + { + "epoch": 0.8844185150531124, + "grad_norm": 0.6926425695419312, + "learning_rate": 5.926566354603687e-06, + "loss": 0.7629, + "step": 16069 + }, + { + "epoch": 0.8844735538554681, + "grad_norm": 0.7800819277763367, + "learning_rate": 5.926140392365602e-06, + "loss": 0.834, + "step": 16070 + }, + { + "epoch": 0.8845285926578238, + "grad_norm": 0.711067259311676, + "learning_rate": 5.925714423166897e-06, + "loss": 0.7401, + "step": 16071 + }, + { + "epoch": 0.8845836314601794, + "grad_norm": 0.645727276802063, + "learning_rate": 5.92528844701077e-06, + "loss": 0.719, + "step": 16072 + }, + { + "epoch": 0.8846386702625351, + "grad_norm": 0.7098503112792969, + "learning_rate": 5.924862463900421e-06, + "loss": 0.7838, + "step": 16073 + }, + { + "epoch": 0.8846937090648908, + "grad_norm": 1.0021764039993286, + "learning_rate": 5.924436473839055e-06, + "loss": 0.9824, + "step": 16074 + }, + { + "epoch": 0.8847487478672464, + "grad_norm": 0.657049298286438, + "learning_rate": 5.924010476829871e-06, + "loss": 0.6797, + "step": 16075 + }, + { + "epoch": 0.884803786669602, + "grad_norm": 0.6779371500015259, + "learning_rate": 5.923584472876072e-06, + "loss": 0.6697, + "step": 16076 + }, + { + "epoch": 0.8848588254719577, + "grad_norm": 0.6699591279029846, + "learning_rate": 5.923158461980859e-06, + "loss": 0.6779, + "step": 16077 + }, + { + "epoch": 0.8849138642743134, + "grad_norm": 0.6137605905532837, + "learning_rate": 5.922732444147434e-06, + "loss": 0.7195, + "step": 16078 + }, + { + "epoch": 0.8849689030766691, + "grad_norm": 0.6648411750793457, + "learning_rate": 5.922306419379e-06, + "loss": 0.7027, + "step": 16079 + }, + { + "epoch": 0.8850239418790247, + "grad_norm": 0.6827279329299927, + "learning_rate": 5.921880387678758e-06, + "loss": 0.7582, + "step": 16080 + }, + { + "epoch": 0.8850789806813804, + "grad_norm": 0.6747342944145203, + "learning_rate": 5.921454349049909e-06, + "loss": 0.7174, + "step": 16081 + }, + { + "epoch": 0.885134019483736, + "grad_norm": 0.7580771446228027, + "learning_rate": 5.921028303495654e-06, + "loss": 0.7457, + "step": 16082 + }, + { + "epoch": 0.8851890582860917, + "grad_norm": 0.8015843033790588, + "learning_rate": 5.920602251019198e-06, + "loss": 0.7084, + "step": 16083 + }, + { + "epoch": 0.8852440970884473, + "grad_norm": 0.7056819796562195, + "learning_rate": 5.9201761916237434e-06, + "loss": 0.7363, + "step": 16084 + }, + { + "epoch": 0.885299135890803, + "grad_norm": 0.6790309548377991, + "learning_rate": 5.91975012531249e-06, + "loss": 0.7966, + "step": 16085 + }, + { + "epoch": 0.8853541746931587, + "grad_norm": 0.6311555504798889, + "learning_rate": 5.91932405208864e-06, + "loss": 0.7099, + "step": 16086 + }, + { + "epoch": 0.8854092134955144, + "grad_norm": 0.8329381942749023, + "learning_rate": 5.918897971955397e-06, + "loss": 0.8127, + "step": 16087 + }, + { + "epoch": 0.88546425229787, + "grad_norm": 0.6810656785964966, + "learning_rate": 5.918471884915964e-06, + "loss": 0.8052, + "step": 16088 + }, + { + "epoch": 0.8855192911002256, + "grad_norm": 0.7434407472610474, + "learning_rate": 5.918045790973541e-06, + "loss": 0.7936, + "step": 16089 + }, + { + "epoch": 0.8855743299025813, + "grad_norm": 0.6702361106872559, + "learning_rate": 5.917619690131332e-06, + "loss": 0.744, + "step": 16090 + }, + { + "epoch": 0.885629368704937, + "grad_norm": 0.6321039199829102, + "learning_rate": 5.9171935823925384e-06, + "loss": 0.6949, + "step": 16091 + }, + { + "epoch": 0.8856844075072926, + "grad_norm": 0.7133139371871948, + "learning_rate": 5.916767467760365e-06, + "loss": 0.7113, + "step": 16092 + }, + { + "epoch": 0.8857394463096483, + "grad_norm": 0.7414994239807129, + "learning_rate": 5.916341346238011e-06, + "loss": 0.8203, + "step": 16093 + }, + { + "epoch": 0.885794485112004, + "grad_norm": 0.6744404435157776, + "learning_rate": 5.91591521782868e-06, + "loss": 0.7869, + "step": 16094 + }, + { + "epoch": 0.8858495239143597, + "grad_norm": 0.7183727025985718, + "learning_rate": 5.915489082535577e-06, + "loss": 0.7375, + "step": 16095 + }, + { + "epoch": 0.8859045627167152, + "grad_norm": 0.740496814250946, + "learning_rate": 5.9150629403619035e-06, + "loss": 0.8134, + "step": 16096 + }, + { + "epoch": 0.8859596015190709, + "grad_norm": 0.696391224861145, + "learning_rate": 5.9146367913108605e-06, + "loss": 0.7137, + "step": 16097 + }, + { + "epoch": 0.8860146403214266, + "grad_norm": 0.6438629031181335, + "learning_rate": 5.914210635385652e-06, + "loss": 0.698, + "step": 16098 + }, + { + "epoch": 0.8860696791237823, + "grad_norm": 0.7644562125205994, + "learning_rate": 5.913784472589482e-06, + "loss": 0.771, + "step": 16099 + }, + { + "epoch": 0.8861247179261379, + "grad_norm": 0.7281080484390259, + "learning_rate": 5.913358302925553e-06, + "loss": 0.7281, + "step": 16100 + }, + { + "epoch": 0.8861797567284936, + "grad_norm": 0.7768884301185608, + "learning_rate": 5.912932126397067e-06, + "loss": 0.7859, + "step": 16101 + }, + { + "epoch": 0.8862347955308493, + "grad_norm": 0.6960753202438354, + "learning_rate": 5.9125059430072275e-06, + "loss": 0.702, + "step": 16102 + }, + { + "epoch": 0.886289834333205, + "grad_norm": 0.6299503445625305, + "learning_rate": 5.912079752759238e-06, + "loss": 0.6558, + "step": 16103 + }, + { + "epoch": 0.8863448731355605, + "grad_norm": 0.7048517465591431, + "learning_rate": 5.9116535556563005e-06, + "loss": 0.6915, + "step": 16104 + }, + { + "epoch": 0.8863999119379162, + "grad_norm": 1.0701110363006592, + "learning_rate": 5.9112273517016195e-06, + "loss": 0.7721, + "step": 16105 + }, + { + "epoch": 0.8864549507402719, + "grad_norm": 0.6834803223609924, + "learning_rate": 5.910801140898396e-06, + "loss": 0.7474, + "step": 16106 + }, + { + "epoch": 0.8865099895426276, + "grad_norm": 0.6799558401107788, + "learning_rate": 5.9103749232498366e-06, + "loss": 0.655, + "step": 16107 + }, + { + "epoch": 0.8865650283449832, + "grad_norm": 0.9704173803329468, + "learning_rate": 5.9099486987591425e-06, + "loss": 0.911, + "step": 16108 + }, + { + "epoch": 0.8866200671473389, + "grad_norm": 0.7304208278656006, + "learning_rate": 5.909522467429518e-06, + "loss": 0.7315, + "step": 16109 + }, + { + "epoch": 0.8866751059496946, + "grad_norm": 0.6966742277145386, + "learning_rate": 5.909096229264164e-06, + "loss": 0.8658, + "step": 16110 + }, + { + "epoch": 0.8867301447520501, + "grad_norm": 0.667934238910675, + "learning_rate": 5.908669984266289e-06, + "loss": 0.6654, + "step": 16111 + }, + { + "epoch": 0.8867851835544058, + "grad_norm": 0.6689571142196655, + "learning_rate": 5.908243732439092e-06, + "loss": 0.8669, + "step": 16112 + }, + { + "epoch": 0.8868402223567615, + "grad_norm": 0.6054841876029968, + "learning_rate": 5.9078174737857795e-06, + "loss": 0.6063, + "step": 16113 + }, + { + "epoch": 0.8868952611591172, + "grad_norm": 0.6113643050193787, + "learning_rate": 5.907391208309553e-06, + "loss": 0.5609, + "step": 16114 + }, + { + "epoch": 0.8869502999614728, + "grad_norm": 0.6858495473861694, + "learning_rate": 5.906964936013617e-06, + "loss": 0.7651, + "step": 16115 + }, + { + "epoch": 0.8870053387638285, + "grad_norm": 0.6587123870849609, + "learning_rate": 5.906538656901175e-06, + "loss": 0.6834, + "step": 16116 + }, + { + "epoch": 0.8870603775661842, + "grad_norm": 0.7558240294456482, + "learning_rate": 5.906112370975432e-06, + "loss": 0.7911, + "step": 16117 + }, + { + "epoch": 0.8871154163685399, + "grad_norm": 0.7324747443199158, + "learning_rate": 5.90568607823959e-06, + "loss": 0.7132, + "step": 16118 + }, + { + "epoch": 0.8871704551708954, + "grad_norm": 0.6696536540985107, + "learning_rate": 5.9052597786968545e-06, + "loss": 0.6761, + "step": 16119 + }, + { + "epoch": 0.8872254939732511, + "grad_norm": 0.6724241375923157, + "learning_rate": 5.904833472350429e-06, + "loss": 0.7367, + "step": 16120 + }, + { + "epoch": 0.8872805327756068, + "grad_norm": 0.8458320498466492, + "learning_rate": 5.904407159203517e-06, + "loss": 0.8542, + "step": 16121 + }, + { + "epoch": 0.8873355715779625, + "grad_norm": 0.6740517020225525, + "learning_rate": 5.903980839259323e-06, + "loss": 0.711, + "step": 16122 + }, + { + "epoch": 0.8873906103803181, + "grad_norm": 0.7465891242027283, + "learning_rate": 5.9035545125210505e-06, + "loss": 0.8501, + "step": 16123 + }, + { + "epoch": 0.8874456491826738, + "grad_norm": 0.9160736203193665, + "learning_rate": 5.903128178991905e-06, + "loss": 0.8055, + "step": 16124 + }, + { + "epoch": 0.8875006879850295, + "grad_norm": 0.8358868956565857, + "learning_rate": 5.902701838675089e-06, + "loss": 0.7946, + "step": 16125 + }, + { + "epoch": 0.8875557267873851, + "grad_norm": 0.760776162147522, + "learning_rate": 5.902275491573808e-06, + "loss": 0.7944, + "step": 16126 + }, + { + "epoch": 0.8876107655897407, + "grad_norm": 0.605964720249176, + "learning_rate": 5.901849137691267e-06, + "loss": 0.6512, + "step": 16127 + }, + { + "epoch": 0.8876658043920964, + "grad_norm": 1.4000526666641235, + "learning_rate": 5.9014227770306676e-06, + "loss": 0.8047, + "step": 16128 + }, + { + "epoch": 0.8877208431944521, + "grad_norm": 0.7314043045043945, + "learning_rate": 5.900996409595217e-06, + "loss": 0.7674, + "step": 16129 + }, + { + "epoch": 0.8877758819968078, + "grad_norm": 1.3130903244018555, + "learning_rate": 5.900570035388117e-06, + "loss": 0.6922, + "step": 16130 + }, + { + "epoch": 0.8878309207991634, + "grad_norm": 0.6799461841583252, + "learning_rate": 5.900143654412576e-06, + "loss": 0.6921, + "step": 16131 + }, + { + "epoch": 0.8878859596015191, + "grad_norm": 0.6657615900039673, + "learning_rate": 5.899717266671794e-06, + "loss": 0.642, + "step": 16132 + }, + { + "epoch": 0.8879409984038747, + "grad_norm": 0.6838696599006653, + "learning_rate": 5.899290872168979e-06, + "loss": 0.7077, + "step": 16133 + }, + { + "epoch": 0.8879960372062304, + "grad_norm": 0.7769932150840759, + "learning_rate": 5.898864470907334e-06, + "loss": 0.8155, + "step": 16134 + }, + { + "epoch": 0.888051076008586, + "grad_norm": 0.6874750852584839, + "learning_rate": 5.898438062890065e-06, + "loss": 0.7597, + "step": 16135 + }, + { + "epoch": 0.8881061148109417, + "grad_norm": 0.7016799449920654, + "learning_rate": 5.898011648120375e-06, + "loss": 0.7973, + "step": 16136 + }, + { + "epoch": 0.8881611536132974, + "grad_norm": 0.743046760559082, + "learning_rate": 5.897585226601471e-06, + "loss": 0.7217, + "step": 16137 + }, + { + "epoch": 0.8882161924156531, + "grad_norm": 0.6889417767524719, + "learning_rate": 5.8971587983365566e-06, + "loss": 0.7067, + "step": 16138 + }, + { + "epoch": 0.8882712312180087, + "grad_norm": 0.631155788898468, + "learning_rate": 5.896732363328836e-06, + "loss": 0.8113, + "step": 16139 + }, + { + "epoch": 0.8883262700203644, + "grad_norm": 0.64445960521698, + "learning_rate": 5.8963059215815165e-06, + "loss": 0.7801, + "step": 16140 + }, + { + "epoch": 0.88838130882272, + "grad_norm": 1.6496944427490234, + "learning_rate": 5.895879473097801e-06, + "loss": 0.7997, + "step": 16141 + }, + { + "epoch": 0.8884363476250757, + "grad_norm": 0.8304264545440674, + "learning_rate": 5.895453017880893e-06, + "loss": 0.7333, + "step": 16142 + }, + { + "epoch": 0.8884913864274313, + "grad_norm": 0.659909725189209, + "learning_rate": 5.895026555934002e-06, + "loss": 0.7924, + "step": 16143 + }, + { + "epoch": 0.888546425229787, + "grad_norm": 0.7013087272644043, + "learning_rate": 5.894600087260332e-06, + "loss": 0.7704, + "step": 16144 + }, + { + "epoch": 0.8886014640321427, + "grad_norm": 0.6914981603622437, + "learning_rate": 5.894173611863085e-06, + "loss": 0.7377, + "step": 16145 + }, + { + "epoch": 0.8886565028344984, + "grad_norm": 0.8310953378677368, + "learning_rate": 5.89374712974547e-06, + "loss": 0.7893, + "step": 16146 + }, + { + "epoch": 0.888711541636854, + "grad_norm": 0.6522740721702576, + "learning_rate": 5.8933206409106895e-06, + "loss": 0.6915, + "step": 16147 + }, + { + "epoch": 0.8887665804392096, + "grad_norm": 0.6072065234184265, + "learning_rate": 5.89289414536195e-06, + "loss": 0.5744, + "step": 16148 + }, + { + "epoch": 0.8888216192415653, + "grad_norm": 0.5975275635719299, + "learning_rate": 5.892467643102458e-06, + "loss": 0.6281, + "step": 16149 + }, + { + "epoch": 0.888876658043921, + "grad_norm": 0.9194470643997192, + "learning_rate": 5.892041134135418e-06, + "loss": 0.6909, + "step": 16150 + }, + { + "epoch": 0.8889316968462766, + "grad_norm": 0.5815016031265259, + "learning_rate": 5.891614618464037e-06, + "loss": 0.6342, + "step": 16151 + }, + { + "epoch": 0.8889867356486323, + "grad_norm": 0.666912853717804, + "learning_rate": 5.891188096091517e-06, + "loss": 0.8043, + "step": 16152 + }, + { + "epoch": 0.889041774450988, + "grad_norm": 0.7708194851875305, + "learning_rate": 5.890761567021067e-06, + "loss": 0.811, + "step": 16153 + }, + { + "epoch": 0.8890968132533436, + "grad_norm": 0.7158086895942688, + "learning_rate": 5.890335031255892e-06, + "loss": 0.7615, + "step": 16154 + }, + { + "epoch": 0.8891518520556992, + "grad_norm": 0.7432296872138977, + "learning_rate": 5.889908488799194e-06, + "loss": 0.8236, + "step": 16155 + }, + { + "epoch": 0.8892068908580549, + "grad_norm": 0.7223588228225708, + "learning_rate": 5.889481939654185e-06, + "loss": 0.7022, + "step": 16156 + }, + { + "epoch": 0.8892619296604106, + "grad_norm": 0.7680726647377014, + "learning_rate": 5.889055383824067e-06, + "loss": 0.7329, + "step": 16157 + }, + { + "epoch": 0.8893169684627662, + "grad_norm": 0.679315984249115, + "learning_rate": 5.888628821312048e-06, + "loss": 0.7213, + "step": 16158 + }, + { + "epoch": 0.8893720072651219, + "grad_norm": 0.9369942545890808, + "learning_rate": 5.88820225212133e-06, + "loss": 0.7661, + "step": 16159 + }, + { + "epoch": 0.8894270460674776, + "grad_norm": 0.710561990737915, + "learning_rate": 5.887775676255123e-06, + "loss": 0.7869, + "step": 16160 + }, + { + "epoch": 0.8894820848698333, + "grad_norm": 0.6641749143600464, + "learning_rate": 5.887349093716632e-06, + "loss": 0.748, + "step": 16161 + }, + { + "epoch": 0.8895371236721888, + "grad_norm": 0.6491042971611023, + "learning_rate": 5.886922504509062e-06, + "loss": 0.7208, + "step": 16162 + }, + { + "epoch": 0.8895921624745445, + "grad_norm": 0.706950843334198, + "learning_rate": 5.886495908635622e-06, + "loss": 0.7579, + "step": 16163 + }, + { + "epoch": 0.8896472012769002, + "grad_norm": 0.7884653806686401, + "learning_rate": 5.886069306099514e-06, + "loss": 0.7289, + "step": 16164 + }, + { + "epoch": 0.8897022400792559, + "grad_norm": 0.7089719176292419, + "learning_rate": 5.885642696903948e-06, + "loss": 0.7796, + "step": 16165 + }, + { + "epoch": 0.8897572788816115, + "grad_norm": 0.7245141267776489, + "learning_rate": 5.8852160810521275e-06, + "loss": 0.7357, + "step": 16166 + }, + { + "epoch": 0.8898123176839672, + "grad_norm": 0.74881511926651, + "learning_rate": 5.884789458547258e-06, + "loss": 0.707, + "step": 16167 + }, + { + "epoch": 0.8898673564863229, + "grad_norm": 0.623418390750885, + "learning_rate": 5.88436282939255e-06, + "loss": 0.5891, + "step": 16168 + }, + { + "epoch": 0.8899223952886786, + "grad_norm": 0.8884579539299011, + "learning_rate": 5.883936193591208e-06, + "loss": 0.9109, + "step": 16169 + }, + { + "epoch": 0.8899774340910341, + "grad_norm": 0.7089982628822327, + "learning_rate": 5.883509551146437e-06, + "loss": 0.8435, + "step": 16170 + }, + { + "epoch": 0.8900324728933898, + "grad_norm": 0.6861062049865723, + "learning_rate": 5.883082902061444e-06, + "loss": 0.5662, + "step": 16171 + }, + { + "epoch": 0.8900875116957455, + "grad_norm": 0.7688663005828857, + "learning_rate": 5.882656246339438e-06, + "loss": 0.7483, + "step": 16172 + }, + { + "epoch": 0.8901425504981012, + "grad_norm": 0.6451166868209839, + "learning_rate": 5.882229583983623e-06, + "loss": 0.8061, + "step": 16173 + }, + { + "epoch": 0.8901975893004568, + "grad_norm": 0.668999195098877, + "learning_rate": 5.881802914997208e-06, + "loss": 0.698, + "step": 16174 + }, + { + "epoch": 0.8902526281028125, + "grad_norm": 0.5772761702537537, + "learning_rate": 5.881376239383398e-06, + "loss": 0.6718, + "step": 16175 + }, + { + "epoch": 0.8903076669051682, + "grad_norm": 0.6677992343902588, + "learning_rate": 5.880949557145399e-06, + "loss": 0.7835, + "step": 16176 + }, + { + "epoch": 0.8903627057075238, + "grad_norm": 0.7227941751480103, + "learning_rate": 5.880522868286419e-06, + "loss": 0.7166, + "step": 16177 + }, + { + "epoch": 0.8904177445098794, + "grad_norm": 0.7365387082099915, + "learning_rate": 5.880096172809665e-06, + "loss": 0.7743, + "step": 16178 + }, + { + "epoch": 0.8904727833122351, + "grad_norm": 0.7826401591300964, + "learning_rate": 5.8796694707183435e-06, + "loss": 0.7963, + "step": 16179 + }, + { + "epoch": 0.8905278221145908, + "grad_norm": 0.6749493479728699, + "learning_rate": 5.879242762015662e-06, + "loss": 0.7023, + "step": 16180 + }, + { + "epoch": 0.8905828609169465, + "grad_norm": 0.7109015583992004, + "learning_rate": 5.8788160467048275e-06, + "loss": 0.8432, + "step": 16181 + }, + { + "epoch": 0.8906378997193021, + "grad_norm": 0.737983226776123, + "learning_rate": 5.878389324789047e-06, + "loss": 0.807, + "step": 16182 + }, + { + "epoch": 0.8906929385216578, + "grad_norm": 0.676296055316925, + "learning_rate": 5.877962596271526e-06, + "loss": 0.7894, + "step": 16183 + }, + { + "epoch": 0.8907479773240135, + "grad_norm": 0.6367083191871643, + "learning_rate": 5.877535861155474e-06, + "loss": 0.6995, + "step": 16184 + }, + { + "epoch": 0.8908030161263691, + "grad_norm": 0.7221261262893677, + "learning_rate": 5.877109119444099e-06, + "loss": 0.8032, + "step": 16185 + }, + { + "epoch": 0.8908580549287247, + "grad_norm": 0.9108307957649231, + "learning_rate": 5.8766823711406055e-06, + "loss": 0.6949, + "step": 16186 + }, + { + "epoch": 0.8909130937310804, + "grad_norm": 0.5985114574432373, + "learning_rate": 5.876255616248201e-06, + "loss": 0.6981, + "step": 16187 + }, + { + "epoch": 0.8909681325334361, + "grad_norm": 0.6146743297576904, + "learning_rate": 5.875828854770096e-06, + "loss": 0.6869, + "step": 16188 + }, + { + "epoch": 0.8910231713357918, + "grad_norm": 1.2942423820495605, + "learning_rate": 5.875402086709494e-06, + "loss": 0.8142, + "step": 16189 + }, + { + "epoch": 0.8910782101381474, + "grad_norm": 0.6676996350288391, + "learning_rate": 5.874975312069605e-06, + "loss": 0.7476, + "step": 16190 + }, + { + "epoch": 0.891133248940503, + "grad_norm": 0.6210917234420776, + "learning_rate": 5.874548530853635e-06, + "loss": 0.7248, + "step": 16191 + }, + { + "epoch": 0.8911882877428587, + "grad_norm": 0.7242050766944885, + "learning_rate": 5.874121743064792e-06, + "loss": 0.8378, + "step": 16192 + }, + { + "epoch": 0.8912433265452144, + "grad_norm": 0.7029538750648499, + "learning_rate": 5.873694948706286e-06, + "loss": 0.7487, + "step": 16193 + }, + { + "epoch": 0.89129836534757, + "grad_norm": 0.7620413899421692, + "learning_rate": 5.87326814778132e-06, + "loss": 0.7102, + "step": 16194 + }, + { + "epoch": 0.8913534041499257, + "grad_norm": 0.7075870633125305, + "learning_rate": 5.872841340293105e-06, + "loss": 0.7771, + "step": 16195 + }, + { + "epoch": 0.8914084429522814, + "grad_norm": 0.706533670425415, + "learning_rate": 5.8724145262448495e-06, + "loss": 0.8173, + "step": 16196 + }, + { + "epoch": 0.891463481754637, + "grad_norm": 0.6712881326675415, + "learning_rate": 5.871987705639759e-06, + "loss": 0.7933, + "step": 16197 + }, + { + "epoch": 0.8915185205569927, + "grad_norm": 0.6531795859336853, + "learning_rate": 5.871560878481043e-06, + "loss": 0.8013, + "step": 16198 + }, + { + "epoch": 0.8915735593593483, + "grad_norm": 0.7291449308395386, + "learning_rate": 5.8711340447719086e-06, + "loss": 0.7379, + "step": 16199 + }, + { + "epoch": 0.891628598161704, + "grad_norm": 0.7187185883522034, + "learning_rate": 5.870707204515564e-06, + "loss": 0.7627, + "step": 16200 + }, + { + "epoch": 0.8916836369640596, + "grad_norm": 0.6900884509086609, + "learning_rate": 5.870280357715217e-06, + "loss": 0.7779, + "step": 16201 + }, + { + "epoch": 0.8917386757664153, + "grad_norm": 0.647745668888092, + "learning_rate": 5.869853504374075e-06, + "loss": 0.7616, + "step": 16202 + }, + { + "epoch": 0.891793714568771, + "grad_norm": 0.6717308759689331, + "learning_rate": 5.869426644495347e-06, + "loss": 0.6673, + "step": 16203 + }, + { + "epoch": 0.8918487533711267, + "grad_norm": 0.7069498300552368, + "learning_rate": 5.868999778082242e-06, + "loss": 0.7349, + "step": 16204 + }, + { + "epoch": 0.8919037921734823, + "grad_norm": 0.72287917137146, + "learning_rate": 5.868572905137967e-06, + "loss": 0.7272, + "step": 16205 + }, + { + "epoch": 0.891958830975838, + "grad_norm": 0.6938319802284241, + "learning_rate": 5.868146025665731e-06, + "loss": 0.8285, + "step": 16206 + }, + { + "epoch": 0.8920138697781936, + "grad_norm": 1.5806810855865479, + "learning_rate": 5.867719139668739e-06, + "loss": 0.6881, + "step": 16207 + }, + { + "epoch": 0.8920689085805493, + "grad_norm": 0.7156746983528137, + "learning_rate": 5.867292247150206e-06, + "loss": 0.8212, + "step": 16208 + }, + { + "epoch": 0.8921239473829049, + "grad_norm": 0.6833271980285645, + "learning_rate": 5.866865348113335e-06, + "loss": 0.7741, + "step": 16209 + }, + { + "epoch": 0.8921789861852606, + "grad_norm": 0.6972640156745911, + "learning_rate": 5.866438442561336e-06, + "loss": 0.9058, + "step": 16210 + }, + { + "epoch": 0.8922340249876163, + "grad_norm": 0.697632372379303, + "learning_rate": 5.866011530497419e-06, + "loss": 0.8319, + "step": 16211 + }, + { + "epoch": 0.892289063789972, + "grad_norm": 0.7249447703361511, + "learning_rate": 5.865584611924789e-06, + "loss": 0.7491, + "step": 16212 + }, + { + "epoch": 0.8923441025923275, + "grad_norm": 0.7094838619232178, + "learning_rate": 5.865157686846659e-06, + "loss": 0.7371, + "step": 16213 + }, + { + "epoch": 0.8923991413946832, + "grad_norm": 0.7066075205802917, + "learning_rate": 5.864730755266233e-06, + "loss": 0.8273, + "step": 16214 + }, + { + "epoch": 0.8924541801970389, + "grad_norm": 0.7090823650360107, + "learning_rate": 5.864303817186723e-06, + "loss": 0.7642, + "step": 16215 + }, + { + "epoch": 0.8925092189993946, + "grad_norm": 0.7501302361488342, + "learning_rate": 5.863876872611337e-06, + "loss": 0.716, + "step": 16216 + }, + { + "epoch": 0.8925642578017502, + "grad_norm": 0.7354205250740051, + "learning_rate": 5.863449921543284e-06, + "loss": 0.7011, + "step": 16217 + }, + { + "epoch": 0.8926192966041059, + "grad_norm": 0.9364498853683472, + "learning_rate": 5.863022963985773e-06, + "loss": 0.7843, + "step": 16218 + }, + { + "epoch": 0.8926743354064616, + "grad_norm": 0.6501762270927429, + "learning_rate": 5.86259599994201e-06, + "loss": 0.7215, + "step": 16219 + }, + { + "epoch": 0.8927293742088173, + "grad_norm": 0.6883421540260315, + "learning_rate": 5.862169029415208e-06, + "loss": 0.8631, + "step": 16220 + }, + { + "epoch": 0.8927844130111728, + "grad_norm": 0.7614920735359192, + "learning_rate": 5.861742052408575e-06, + "loss": 0.8031, + "step": 16221 + }, + { + "epoch": 0.8928394518135285, + "grad_norm": 0.7668151259422302, + "learning_rate": 5.861315068925319e-06, + "loss": 0.8024, + "step": 16222 + }, + { + "epoch": 0.8928944906158842, + "grad_norm": 0.6772485971450806, + "learning_rate": 5.860888078968649e-06, + "loss": 0.72, + "step": 16223 + }, + { + "epoch": 0.8929495294182399, + "grad_norm": 0.742821216583252, + "learning_rate": 5.860461082541775e-06, + "loss": 0.7432, + "step": 16224 + }, + { + "epoch": 0.8930045682205955, + "grad_norm": 0.7056832909584045, + "learning_rate": 5.860034079647907e-06, + "loss": 0.7089, + "step": 16225 + }, + { + "epoch": 0.8930596070229512, + "grad_norm": 0.6898871660232544, + "learning_rate": 5.859607070290252e-06, + "loss": 0.7505, + "step": 16226 + }, + { + "epoch": 0.8931146458253069, + "grad_norm": 0.6888724565505981, + "learning_rate": 5.859180054472019e-06, + "loss": 0.7638, + "step": 16227 + }, + { + "epoch": 0.8931696846276626, + "grad_norm": 0.9010199308395386, + "learning_rate": 5.858753032196421e-06, + "loss": 0.775, + "step": 16228 + }, + { + "epoch": 0.8932247234300181, + "grad_norm": 0.6443523168563843, + "learning_rate": 5.858326003466663e-06, + "loss": 0.6702, + "step": 16229 + }, + { + "epoch": 0.8932797622323738, + "grad_norm": 0.6245587468147278, + "learning_rate": 5.857898968285957e-06, + "loss": 0.6907, + "step": 16230 + }, + { + "epoch": 0.8933348010347295, + "grad_norm": 0.6724962592124939, + "learning_rate": 5.857471926657512e-06, + "loss": 0.7381, + "step": 16231 + }, + { + "epoch": 0.8933898398370852, + "grad_norm": 1.0391525030136108, + "learning_rate": 5.857044878584539e-06, + "loss": 0.7919, + "step": 16232 + }, + { + "epoch": 0.8934448786394408, + "grad_norm": 0.8852080702781677, + "learning_rate": 5.8566178240702455e-06, + "loss": 0.8572, + "step": 16233 + }, + { + "epoch": 0.8934999174417965, + "grad_norm": 0.7087608575820923, + "learning_rate": 5.856190763117843e-06, + "loss": 0.7739, + "step": 16234 + }, + { + "epoch": 0.8935549562441522, + "grad_norm": 0.6688494086265564, + "learning_rate": 5.855763695730536e-06, + "loss": 0.6978, + "step": 16235 + }, + { + "epoch": 0.8936099950465078, + "grad_norm": 0.6174076795578003, + "learning_rate": 5.8553366219115415e-06, + "loss": 0.7838, + "step": 16236 + }, + { + "epoch": 0.8936650338488634, + "grad_norm": 0.6558929681777954, + "learning_rate": 5.854909541664065e-06, + "loss": 0.6071, + "step": 16237 + }, + { + "epoch": 0.8937200726512191, + "grad_norm": 0.678820013999939, + "learning_rate": 5.854482454991317e-06, + "loss": 0.792, + "step": 16238 + }, + { + "epoch": 0.8937751114535748, + "grad_norm": 0.6893227696418762, + "learning_rate": 5.854055361896507e-06, + "loss": 0.7182, + "step": 16239 + }, + { + "epoch": 0.8938301502559304, + "grad_norm": 0.6799605488777161, + "learning_rate": 5.853628262382847e-06, + "loss": 0.7011, + "step": 16240 + }, + { + "epoch": 0.8938851890582861, + "grad_norm": 0.7625865340232849, + "learning_rate": 5.853201156453544e-06, + "loss": 0.9217, + "step": 16241 + }, + { + "epoch": 0.8939402278606418, + "grad_norm": 0.6532776355743408, + "learning_rate": 5.8527740441118104e-06, + "loss": 0.7527, + "step": 16242 + }, + { + "epoch": 0.8939952666629974, + "grad_norm": 0.7904673218727112, + "learning_rate": 5.852346925360854e-06, + "loss": 0.8359, + "step": 16243 + }, + { + "epoch": 0.894050305465353, + "grad_norm": 0.7274239659309387, + "learning_rate": 5.851919800203888e-06, + "loss": 0.7335, + "step": 16244 + }, + { + "epoch": 0.8941053442677087, + "grad_norm": 0.843180239200592, + "learning_rate": 5.85149266864412e-06, + "loss": 0.7913, + "step": 16245 + }, + { + "epoch": 0.8941603830700644, + "grad_norm": 0.7756116390228271, + "learning_rate": 5.851065530684763e-06, + "loss": 0.7508, + "step": 16246 + }, + { + "epoch": 0.8942154218724201, + "grad_norm": 0.7086586952209473, + "learning_rate": 5.850638386329022e-06, + "loss": 0.7754, + "step": 16247 + }, + { + "epoch": 0.8942704606747757, + "grad_norm": 0.9373844265937805, + "learning_rate": 5.850211235580112e-06, + "loss": 0.7391, + "step": 16248 + }, + { + "epoch": 0.8943254994771314, + "grad_norm": 0.6847782135009766, + "learning_rate": 5.849784078441243e-06, + "loss": 0.6655, + "step": 16249 + }, + { + "epoch": 0.894380538279487, + "grad_norm": 0.6071921586990356, + "learning_rate": 5.849356914915624e-06, + "loss": 0.6933, + "step": 16250 + }, + { + "epoch": 0.8944355770818427, + "grad_norm": 0.712497889995575, + "learning_rate": 5.848929745006464e-06, + "loss": 0.8025, + "step": 16251 + }, + { + "epoch": 0.8944906158841983, + "grad_norm": 0.5942297577857971, + "learning_rate": 5.848502568716976e-06, + "loss": 0.621, + "step": 16252 + }, + { + "epoch": 0.894545654686554, + "grad_norm": 0.6706910729408264, + "learning_rate": 5.84807538605037e-06, + "loss": 0.7664, + "step": 16253 + }, + { + "epoch": 0.8946006934889097, + "grad_norm": 0.7494041919708252, + "learning_rate": 5.847648197009858e-06, + "loss": 0.7418, + "step": 16254 + }, + { + "epoch": 0.8946557322912654, + "grad_norm": 0.7373181581497192, + "learning_rate": 5.847221001598646e-06, + "loss": 0.7133, + "step": 16255 + }, + { + "epoch": 0.894710771093621, + "grad_norm": 0.8178310394287109, + "learning_rate": 5.84679379981995e-06, + "loss": 0.8562, + "step": 16256 + }, + { + "epoch": 0.8947658098959766, + "grad_norm": 0.6232174634933472, + "learning_rate": 5.8463665916769785e-06, + "loss": 0.7723, + "step": 16257 + }, + { + "epoch": 0.8948208486983323, + "grad_norm": 0.6817423701286316, + "learning_rate": 5.845939377172942e-06, + "loss": 0.6706, + "step": 16258 + }, + { + "epoch": 0.894875887500688, + "grad_norm": 0.8091211318969727, + "learning_rate": 5.845512156311051e-06, + "loss": 0.7453, + "step": 16259 + }, + { + "epoch": 0.8949309263030436, + "grad_norm": 0.7013124227523804, + "learning_rate": 5.845084929094518e-06, + "loss": 0.7104, + "step": 16260 + }, + { + "epoch": 0.8949859651053993, + "grad_norm": 0.8373565077781677, + "learning_rate": 5.844657695526552e-06, + "loss": 0.8181, + "step": 16261 + }, + { + "epoch": 0.895041003907755, + "grad_norm": 0.7339033484458923, + "learning_rate": 5.844230455610364e-06, + "loss": 0.7465, + "step": 16262 + }, + { + "epoch": 0.8950960427101107, + "grad_norm": 0.7680185437202454, + "learning_rate": 5.843803209349167e-06, + "loss": 0.7962, + "step": 16263 + }, + { + "epoch": 0.8951510815124663, + "grad_norm": 0.7889038920402527, + "learning_rate": 5.843375956746171e-06, + "loss": 0.7846, + "step": 16264 + }, + { + "epoch": 0.8952061203148219, + "grad_norm": 0.7361034154891968, + "learning_rate": 5.842948697804587e-06, + "loss": 0.7077, + "step": 16265 + }, + { + "epoch": 0.8952611591171776, + "grad_norm": 0.7543736100196838, + "learning_rate": 5.8425214325276255e-06, + "loss": 0.748, + "step": 16266 + }, + { + "epoch": 0.8953161979195333, + "grad_norm": 0.6194653511047363, + "learning_rate": 5.842094160918499e-06, + "loss": 0.8032, + "step": 16267 + }, + { + "epoch": 0.8953712367218889, + "grad_norm": 0.789439857006073, + "learning_rate": 5.841666882980418e-06, + "loss": 0.745, + "step": 16268 + }, + { + "epoch": 0.8954262755242446, + "grad_norm": 0.6813651919364929, + "learning_rate": 5.841239598716595e-06, + "loss": 0.7169, + "step": 16269 + }, + { + "epoch": 0.8954813143266003, + "grad_norm": 0.6128388047218323, + "learning_rate": 5.84081230813024e-06, + "loss": 0.7109, + "step": 16270 + }, + { + "epoch": 0.895536353128956, + "grad_norm": 0.7170562148094177, + "learning_rate": 5.8403850112245645e-06, + "loss": 0.7461, + "step": 16271 + }, + { + "epoch": 0.8955913919313115, + "grad_norm": 0.7415544986724854, + "learning_rate": 5.83995770800278e-06, + "loss": 0.7786, + "step": 16272 + }, + { + "epoch": 0.8956464307336672, + "grad_norm": 0.5996596813201904, + "learning_rate": 5.8395303984680985e-06, + "loss": 0.6926, + "step": 16273 + }, + { + "epoch": 0.8957014695360229, + "grad_norm": 0.7399057149887085, + "learning_rate": 5.839103082623732e-06, + "loss": 0.6953, + "step": 16274 + }, + { + "epoch": 0.8957565083383786, + "grad_norm": 0.7090675830841064, + "learning_rate": 5.838675760472888e-06, + "loss": 0.7638, + "step": 16275 + }, + { + "epoch": 0.8958115471407342, + "grad_norm": 0.6865240931510925, + "learning_rate": 5.838248432018785e-06, + "loss": 0.7023, + "step": 16276 + }, + { + "epoch": 0.8958665859430899, + "grad_norm": 0.604603111743927, + "learning_rate": 5.83782109726463e-06, + "loss": 0.6845, + "step": 16277 + }, + { + "epoch": 0.8959216247454456, + "grad_norm": 0.6722466349601746, + "learning_rate": 5.837393756213636e-06, + "loss": 0.7794, + "step": 16278 + }, + { + "epoch": 0.8959766635478013, + "grad_norm": 0.683106541633606, + "learning_rate": 5.836966408869014e-06, + "loss": 0.7988, + "step": 16279 + }, + { + "epoch": 0.8960317023501568, + "grad_norm": 0.7195246815681458, + "learning_rate": 5.8365390552339774e-06, + "loss": 0.7532, + "step": 16280 + }, + { + "epoch": 0.8960867411525125, + "grad_norm": 0.6945170760154724, + "learning_rate": 5.836111695311737e-06, + "loss": 0.7515, + "step": 16281 + }, + { + "epoch": 0.8961417799548682, + "grad_norm": 0.7424500584602356, + "learning_rate": 5.8356843291055065e-06, + "loss": 0.6813, + "step": 16282 + }, + { + "epoch": 0.8961968187572238, + "grad_norm": 0.673574686050415, + "learning_rate": 5.835256956618495e-06, + "loss": 0.7234, + "step": 16283 + }, + { + "epoch": 0.8962518575595795, + "grad_norm": 0.6816020011901855, + "learning_rate": 5.834829577853913e-06, + "loss": 0.7935, + "step": 16284 + }, + { + "epoch": 0.8963068963619352, + "grad_norm": 0.7598507404327393, + "learning_rate": 5.834402192814979e-06, + "loss": 0.8141, + "step": 16285 + }, + { + "epoch": 0.8963619351642909, + "grad_norm": 0.7720094323158264, + "learning_rate": 5.8339748015049e-06, + "loss": 0.8241, + "step": 16286 + }, + { + "epoch": 0.8964169739666464, + "grad_norm": 0.7409939169883728, + "learning_rate": 5.833547403926891e-06, + "loss": 0.7196, + "step": 16287 + }, + { + "epoch": 0.8964720127690021, + "grad_norm": 0.670557975769043, + "learning_rate": 5.83312000008416e-06, + "loss": 0.7931, + "step": 16288 + }, + { + "epoch": 0.8965270515713578, + "grad_norm": 0.6361322999000549, + "learning_rate": 5.832692589979925e-06, + "loss": 0.7292, + "step": 16289 + }, + { + "epoch": 0.8965820903737135, + "grad_norm": 0.6359429359436035, + "learning_rate": 5.832265173617393e-06, + "loss": 0.7705, + "step": 16290 + }, + { + "epoch": 0.8966371291760691, + "grad_norm": 0.7249873876571655, + "learning_rate": 5.831837750999781e-06, + "loss": 0.7741, + "step": 16291 + }, + { + "epoch": 0.8966921679784248, + "grad_norm": 0.6784750819206238, + "learning_rate": 5.831410322130296e-06, + "loss": 0.8434, + "step": 16292 + }, + { + "epoch": 0.8967472067807805, + "grad_norm": 0.7696726322174072, + "learning_rate": 5.830982887012157e-06, + "loss": 0.8218, + "step": 16293 + }, + { + "epoch": 0.8968022455831361, + "grad_norm": 0.5974952578544617, + "learning_rate": 5.830555445648572e-06, + "loss": 0.5864, + "step": 16294 + }, + { + "epoch": 0.8968572843854917, + "grad_norm": 1.4088029861450195, + "learning_rate": 5.830127998042755e-06, + "loss": 0.705, + "step": 16295 + }, + { + "epoch": 0.8969123231878474, + "grad_norm": 0.621288001537323, + "learning_rate": 5.8297005441979174e-06, + "loss": 0.6799, + "step": 16296 + }, + { + "epoch": 0.8969673619902031, + "grad_norm": 0.7229657173156738, + "learning_rate": 5.829273084117272e-06, + "loss": 0.7222, + "step": 16297 + }, + { + "epoch": 0.8970224007925588, + "grad_norm": 0.7076373100280762, + "learning_rate": 5.828845617804033e-06, + "loss": 0.7813, + "step": 16298 + }, + { + "epoch": 0.8970774395949144, + "grad_norm": 0.6931923627853394, + "learning_rate": 5.828418145261412e-06, + "loss": 0.6936, + "step": 16299 + }, + { + "epoch": 0.8971324783972701, + "grad_norm": 0.6719018220901489, + "learning_rate": 5.827990666492621e-06, + "loss": 0.747, + "step": 16300 + }, + { + "epoch": 0.8971875171996258, + "grad_norm": 0.6288262605667114, + "learning_rate": 5.827563181500875e-06, + "loss": 0.7731, + "step": 16301 + }, + { + "epoch": 0.8972425560019814, + "grad_norm": 0.6359015703201294, + "learning_rate": 5.8271356902893864e-06, + "loss": 0.6408, + "step": 16302 + }, + { + "epoch": 0.897297594804337, + "grad_norm": 0.6485893726348877, + "learning_rate": 5.826708192861365e-06, + "loss": 0.7739, + "step": 16303 + }, + { + "epoch": 0.8973526336066927, + "grad_norm": 0.7622523903846741, + "learning_rate": 5.826280689220027e-06, + "loss": 0.6087, + "step": 16304 + }, + { + "epoch": 0.8974076724090484, + "grad_norm": 0.650451123714447, + "learning_rate": 5.825853179368586e-06, + "loss": 0.6953, + "step": 16305 + }, + { + "epoch": 0.8974627112114041, + "grad_norm": 0.7266152501106262, + "learning_rate": 5.8254256633102535e-06, + "loss": 0.7762, + "step": 16306 + }, + { + "epoch": 0.8975177500137597, + "grad_norm": 0.6428011059761047, + "learning_rate": 5.824998141048241e-06, + "loss": 0.67, + "step": 16307 + }, + { + "epoch": 0.8975727888161154, + "grad_norm": 0.6991005539894104, + "learning_rate": 5.824570612585764e-06, + "loss": 0.79, + "step": 16308 + }, + { + "epoch": 0.897627827618471, + "grad_norm": 0.6385177969932556, + "learning_rate": 5.824143077926034e-06, + "loss": 0.6993, + "step": 16309 + }, + { + "epoch": 0.8976828664208267, + "grad_norm": 0.6891354322433472, + "learning_rate": 5.823715537072268e-06, + "loss": 0.7155, + "step": 16310 + }, + { + "epoch": 0.8977379052231823, + "grad_norm": 0.7448866367340088, + "learning_rate": 5.823287990027674e-06, + "loss": 0.674, + "step": 16311 + }, + { + "epoch": 0.897792944025538, + "grad_norm": 0.6892699599266052, + "learning_rate": 5.822860436795468e-06, + "loss": 0.7719, + "step": 16312 + }, + { + "epoch": 0.8978479828278937, + "grad_norm": 1.2621982097625732, + "learning_rate": 5.822432877378864e-06, + "loss": 0.6985, + "step": 16313 + }, + { + "epoch": 0.8979030216302494, + "grad_norm": 0.635137677192688, + "learning_rate": 5.822005311781075e-06, + "loss": 0.748, + "step": 16314 + }, + { + "epoch": 0.897958060432605, + "grad_norm": 0.8765038847923279, + "learning_rate": 5.821577740005313e-06, + "loss": 0.798, + "step": 16315 + }, + { + "epoch": 0.8980130992349606, + "grad_norm": 0.734259843826294, + "learning_rate": 5.8211501620547926e-06, + "loss": 0.7462, + "step": 16316 + }, + { + "epoch": 0.8980681380373163, + "grad_norm": 0.7057023048400879, + "learning_rate": 5.820722577932729e-06, + "loss": 0.8057, + "step": 16317 + }, + { + "epoch": 0.898123176839672, + "grad_norm": 0.7444988489151001, + "learning_rate": 5.8202949876423344e-06, + "loss": 0.7023, + "step": 16318 + }, + { + "epoch": 0.8981782156420276, + "grad_norm": 0.7205658555030823, + "learning_rate": 5.819867391186821e-06, + "loss": 0.6949, + "step": 16319 + }, + { + "epoch": 0.8982332544443833, + "grad_norm": 0.7285442352294922, + "learning_rate": 5.819439788569403e-06, + "loss": 0.6966, + "step": 16320 + }, + { + "epoch": 0.898288293246739, + "grad_norm": 0.7485014796257019, + "learning_rate": 5.819012179793295e-06, + "loss": 0.7676, + "step": 16321 + }, + { + "epoch": 0.8983433320490947, + "grad_norm": 0.7867493629455566, + "learning_rate": 5.818584564861712e-06, + "loss": 0.7601, + "step": 16322 + }, + { + "epoch": 0.8983983708514502, + "grad_norm": 0.6732510328292847, + "learning_rate": 5.818156943777867e-06, + "loss": 0.7068, + "step": 16323 + }, + { + "epoch": 0.8984534096538059, + "grad_norm": 0.648333728313446, + "learning_rate": 5.817729316544971e-06, + "loss": 0.762, + "step": 16324 + }, + { + "epoch": 0.8985084484561616, + "grad_norm": 0.9556308388710022, + "learning_rate": 5.817301683166241e-06, + "loss": 0.7177, + "step": 16325 + }, + { + "epoch": 0.8985634872585172, + "grad_norm": 0.7043321132659912, + "learning_rate": 5.816874043644891e-06, + "loss": 0.7206, + "step": 16326 + }, + { + "epoch": 0.8986185260608729, + "grad_norm": 0.6318387985229492, + "learning_rate": 5.816446397984136e-06, + "loss": 0.7306, + "step": 16327 + }, + { + "epoch": 0.8986735648632286, + "grad_norm": 0.7083125114440918, + "learning_rate": 5.816018746187186e-06, + "loss": 0.7204, + "step": 16328 + }, + { + "epoch": 0.8987286036655843, + "grad_norm": 0.6810079216957092, + "learning_rate": 5.815591088257259e-06, + "loss": 0.8575, + "step": 16329 + }, + { + "epoch": 0.8987836424679398, + "grad_norm": 0.7081509232521057, + "learning_rate": 5.815163424197567e-06, + "loss": 0.7198, + "step": 16330 + }, + { + "epoch": 0.8988386812702955, + "grad_norm": 1.1525241136550903, + "learning_rate": 5.814735754011325e-06, + "loss": 0.6222, + "step": 16331 + }, + { + "epoch": 0.8988937200726512, + "grad_norm": 0.724651575088501, + "learning_rate": 5.8143080777017456e-06, + "loss": 0.6937, + "step": 16332 + }, + { + "epoch": 0.8989487588750069, + "grad_norm": 0.7607846260070801, + "learning_rate": 5.813880395272047e-06, + "loss": 0.7552, + "step": 16333 + }, + { + "epoch": 0.8990037976773625, + "grad_norm": 0.6370975375175476, + "learning_rate": 5.813452706725441e-06, + "loss": 0.753, + "step": 16334 + }, + { + "epoch": 0.8990588364797182, + "grad_norm": 0.7018759250640869, + "learning_rate": 5.813025012065141e-06, + "loss": 0.8026, + "step": 16335 + }, + { + "epoch": 0.8991138752820739, + "grad_norm": 0.7106475234031677, + "learning_rate": 5.812597311294363e-06, + "loss": 0.6558, + "step": 16336 + }, + { + "epoch": 0.8991689140844296, + "grad_norm": 0.715859591960907, + "learning_rate": 5.812169604416321e-06, + "loss": 0.6581, + "step": 16337 + }, + { + "epoch": 0.8992239528867851, + "grad_norm": 1.1907461881637573, + "learning_rate": 5.811741891434231e-06, + "loss": 0.7618, + "step": 16338 + }, + { + "epoch": 0.8992789916891408, + "grad_norm": 0.6529675722122192, + "learning_rate": 5.811314172351304e-06, + "loss": 0.7317, + "step": 16339 + }, + { + "epoch": 0.8993340304914965, + "grad_norm": 0.6876475811004639, + "learning_rate": 5.810886447170758e-06, + "loss": 0.724, + "step": 16340 + }, + { + "epoch": 0.8993890692938522, + "grad_norm": 0.6798568964004517, + "learning_rate": 5.8104587158958084e-06, + "loss": 0.7173, + "step": 16341 + }, + { + "epoch": 0.8994441080962078, + "grad_norm": 1.0269527435302734, + "learning_rate": 5.8100309785296664e-06, + "loss": 0.737, + "step": 16342 + }, + { + "epoch": 0.8994991468985635, + "grad_norm": 0.7578931450843811, + "learning_rate": 5.809603235075547e-06, + "loss": 0.8119, + "step": 16343 + }, + { + "epoch": 0.8995541857009192, + "grad_norm": 0.7005903124809265, + "learning_rate": 5.8091754855366675e-06, + "loss": 0.7548, + "step": 16344 + }, + { + "epoch": 0.8996092245032749, + "grad_norm": 0.6554011702537537, + "learning_rate": 5.808747729916242e-06, + "loss": 0.6705, + "step": 16345 + }, + { + "epoch": 0.8996642633056304, + "grad_norm": 0.6242305040359497, + "learning_rate": 5.808319968217485e-06, + "loss": 0.7278, + "step": 16346 + }, + { + "epoch": 0.8997193021079861, + "grad_norm": 0.9426488876342773, + "learning_rate": 5.807892200443611e-06, + "loss": 0.7129, + "step": 16347 + }, + { + "epoch": 0.8997743409103418, + "grad_norm": 0.671768307685852, + "learning_rate": 5.807464426597835e-06, + "loss": 0.7688, + "step": 16348 + }, + { + "epoch": 0.8998293797126975, + "grad_norm": 0.672828197479248, + "learning_rate": 5.807036646683374e-06, + "loss": 0.8089, + "step": 16349 + }, + { + "epoch": 0.8998844185150531, + "grad_norm": 0.6689574718475342, + "learning_rate": 5.806608860703441e-06, + "loss": 0.7523, + "step": 16350 + }, + { + "epoch": 0.8999394573174088, + "grad_norm": 0.7440112233161926, + "learning_rate": 5.8061810686612514e-06, + "loss": 0.7802, + "step": 16351 + }, + { + "epoch": 0.8999944961197645, + "grad_norm": 0.6520549654960632, + "learning_rate": 5.8057532705600206e-06, + "loss": 0.6582, + "step": 16352 + }, + { + "epoch": 0.9000495349221201, + "grad_norm": 0.73201984167099, + "learning_rate": 5.805325466402965e-06, + "loss": 0.685, + "step": 16353 + }, + { + "epoch": 0.9001045737244757, + "grad_norm": 0.6377021074295044, + "learning_rate": 5.804897656193298e-06, + "loss": 0.802, + "step": 16354 + }, + { + "epoch": 0.9001596125268314, + "grad_norm": 0.6761305928230286, + "learning_rate": 5.8044698399342355e-06, + "loss": 0.7005, + "step": 16355 + }, + { + "epoch": 0.9002146513291871, + "grad_norm": 0.6993832588195801, + "learning_rate": 5.804042017628992e-06, + "loss": 0.848, + "step": 16356 + }, + { + "epoch": 0.9002696901315428, + "grad_norm": 0.6453230381011963, + "learning_rate": 5.803614189280786e-06, + "loss": 0.7525, + "step": 16357 + }, + { + "epoch": 0.9003247289338984, + "grad_norm": 0.620090663433075, + "learning_rate": 5.80318635489283e-06, + "loss": 0.668, + "step": 16358 + }, + { + "epoch": 0.900379767736254, + "grad_norm": 0.7285178899765015, + "learning_rate": 5.80275851446834e-06, + "loss": 0.7936, + "step": 16359 + }, + { + "epoch": 0.9004348065386097, + "grad_norm": 0.7555778622627258, + "learning_rate": 5.802330668010532e-06, + "loss": 0.7493, + "step": 16360 + }, + { + "epoch": 0.9004898453409654, + "grad_norm": 0.8527930974960327, + "learning_rate": 5.801902815522622e-06, + "loss": 0.7609, + "step": 16361 + }, + { + "epoch": 0.900544884143321, + "grad_norm": 0.9000833034515381, + "learning_rate": 5.801474957007824e-06, + "loss": 0.7719, + "step": 16362 + } + ], + "logging_steps": 1, + "max_steps": 36338, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 909, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.82852648541959e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-16362/training_args.bin b/checkpoint-16362/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4fcf8689837015e25934915ab36e9943776ca6cd --- /dev/null +++ b/checkpoint-16362/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c62f9cafd9057de88f53b2d6143eaf1e38cf3558d65c4e5642eaa284f31d316 +size 7928 diff --git a/checkpoint-16362/zero_to_fp32.py b/checkpoint-16362/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-16362/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters)