diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..37c355ef923d57b2961a68e4d576f2c323e9dfb1 --- /dev/null +++ b/README.md @@ -0,0 +1,22 @@ +--- +language: +- fr +- en +license: apache-2.0 +tags: +- medical +- domain-adaptation +- continual-pretraining +- causal-lm +- question-answering +- evaluation +datasets: +- Dr-BERT/NACHOS +base_model: MedLLaMA-13B +model_type: causal-lm +--- + +# MedLLaMA-13B-CPT (CPT) + +## Model description +This checkpoint is a **continual-pretrained (CPT)** version of **MedLLaMA-13B**, adapted on unlabeled French medical text to strengthen domain-specific representations for medical question answering. CPT is performed via **full-parameter training** on medical corpora (Dr-BERT/NACHOS). diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..9c16aa4be022f03ad001b006fba14dfb73a1929c --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,3 @@ +{ + "": 32000 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f901c335800e87aca9cf4884870902843b249b9b --- /dev/null +++ b/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 0, + "eos_token_id": 1, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 13824, + "max_position_embeddings": 2048, + "max_sequence_length": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 40, + "pad_token_id": -1, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.53.2", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/generation_config.json b/generation_config.json 
new file mode 100644 index 0000000000000000000000000000000000000000..d503a8a258c4b8105a3f851dd28b2b5f5cee6368 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.2" +} diff --git a/model-00001-of-00011.safetensors b/model-00001-of-00011.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..10ce8c56b5e6d2495c0a58ffd73ab2728d2fad9d --- /dev/null +++ b/model-00001-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b5f96c2ed523ec9c64691e88578c127aad5125ef80143c859c69a911ba241a9 +size 4881247856 diff --git a/model-00002-of-00011.safetensors b/model-00002-of-00011.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7b3eb97a973de5039b2527f99529d831db799d39 --- /dev/null +++ b/model-00002-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0e52328b786aa8f611b77df0591ab6a4072f7d1d89e77e851bfb0a37a107843 +size 4970418112 diff --git a/model-00003-of-00011.safetensors b/model-00003-of-00011.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ff14eb0564e591dafa987599400da6fed35bc204 --- /dev/null +++ b/model-00003-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f5dac5d6966e0e3d08db763302569b3beb5eaba11e8496743217495c4985c29 +size 4970418120 diff --git a/model-00004-of-00011.safetensors b/model-00004-of-00011.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a6b725514b48eaf5d06139b960796980d99d0dab --- /dev/null +++ b/model-00004-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd01afb776c0430f6cb9f40bf66c516f465635fcd2c59bd18d009e3370a18363 +size 4970418144 diff --git a/model-00005-of-00011.safetensors b/model-00005-of-00011.safetensors new file mode 100644 
index 0000000000000000000000000000000000000000..a39e5e829e50107b1d644b0b427bae6bac09265a --- /dev/null +++ b/model-00005-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff1e33e5fab23e072d15082b37d4a9a7f9175d8271ce69b65af2387e1dab2e6b +size 4970418144 diff --git a/model-00006-of-00011.safetensors b/model-00006-of-00011.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1585aecc87832463e8315ecb08825cabc8debdd3 --- /dev/null +++ b/model-00006-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ff99290e41ce788eb1eefc2c07e2dc1d8753e6388becaac33c9195ce4e44378 +size 4792119040 diff --git a/model-00007-of-00011.safetensors b/model-00007-of-00011.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7cfa68ee5fc5367d023d48fda8c0951e08a70b16 --- /dev/null +++ b/model-00007-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:011014bf310b0f982cc82c20595bcb5ad1ec2d3e3f1e2ddf8296674556d78ac9 +size 4792160232 diff --git a/model-00008-of-00011.safetensors b/model-00008-of-00011.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82265e3680dc26b3f0d5d248c3973bedeae96faf --- /dev/null +++ b/model-00008-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a7e4f9fc5e851f0f5a2bbd1e0bedcb4493160146a182ac8554f049697b1e70 +size 4792160224 diff --git a/model-00009-of-00011.safetensors b/model-00009-of-00011.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b00a8b02e998e1d306b61669a3c67575ffd16f79 --- /dev/null +++ b/model-00009-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd027e02aa26533363dadec2624ef98bbe718b379dad3735a8f67b3ad44f6c7 +size 4970418144 diff --git a/model-00010-of-00011.safetensors b/model-00010-of-00011.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..249f2df4c9bf6d25bd1cbaaf3e9c15a04f328fb4 --- /dev/null +++ b/model-00010-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c9d2c0d7a5afb3f60f740f1d85de6e0ceae5335376d29cbdba3c82613a74ea4 +size 4970418144 diff --git a/model-00011-of-00011.safetensors b/model-00011-of-00011.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ab8af26ee81909cd3d2ac35d1767408bfffe6293 --- /dev/null +++ b/model-00011-of-00011.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c91c7b2db05d4ac57d33c3aa8251a4010d1d261205191ca79a8149479276a31 +size 2983303184 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..6e40097b1cf44152ff3778fdfa77b0f55adbf14f --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,371 @@ +{ + "metadata": { + "total_parameters": 406745760, + "total_size": 52063457280 + }, + "weight_map": { + "lm_head.weight": "model-00011-of-00011.safetensors", + "model.embed_tokens.weight": "model-00001-of-00011.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00011.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00011.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00011.safetensors", + "model.layers.1.mlp.down_proj.weight": 
"model-00001-of-00011.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00011.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00011.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00011.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.11.input_layernorm.weight": "model-00004-of-00011.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00011.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00011.safetensors", + 
"model.layers.11.self_attn.v_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.12.input_layernorm.weight": "model-00004-of-00011.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00004-of-00011.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.13.input_layernorm.weight": "model-00004-of-00011.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00011.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00011.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00011.safetensors", + "model.layers.14.self_attn.k_proj.weight": 
"model-00004-of-00011.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.15.input_layernorm.weight": "model-00005-of-00011.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00005-of-00011.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00011.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.16.input_layernorm.weight": "model-00005-of-00011.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00005-of-00011.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.17.input_layernorm.weight": "model-00005-of-00011.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00005-of-00011.safetensors", + 
"model.layers.17.mlp.up_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00005-of-00011.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.18.input_layernorm.weight": "model-00005-of-00011.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00011.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00005-of-00011.safetensors", + "model.layers.19.input_layernorm.weight": "model-00006-of-00011.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00006-of-00011.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.2.input_layernorm.weight": 
"model-00001-of-00011.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00011.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.20.input_layernorm.weight": "model-00006-of-00011.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00006-of-00011.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.21.input_layernorm.weight": "model-00006-of-00011.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00006-of-00011.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00006-of-00011.safetensors", + 
"model.layers.21.self_attn.q_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.22.input_layernorm.weight": "model-00007-of-00011.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00007-of-00011.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00006-of-00011.safetensors", + "model.layers.23.input_layernorm.weight": "model-00007-of-00011.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00007-of-00011.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.24.input_layernorm.weight": "model-00007-of-00011.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.24.post_attention_layernorm.weight": 
"model-00007-of-00011.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.25.input_layernorm.weight": "model-00007-of-00011.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00007-of-00011.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.26.input_layernorm.weight": "model-00008-of-00011.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00008-of-00011.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00007-of-00011.safetensors", + "model.layers.27.input_layernorm.weight": "model-00008-of-00011.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00008-of-00011.safetensors", + 
"model.layers.27.mlp.gate_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00008-of-00011.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.28.input_layernorm.weight": "model-00008-of-00011.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00008-of-00011.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.29.input_layernorm.weight": "model-00008-of-00011.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00008-of-00011.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.29.self_attn.v_proj.weight": 
"model-00008-of-00011.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00011.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00011.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00011.safetensors", + "model.layers.30.input_layernorm.weight": "model-00009-of-00011.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00009-of-00011.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00008-of-00011.safetensors", + "model.layers.31.input_layernorm.weight": "model-00009-of-00011.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00009-of-00011.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00009-of-00011.safetensors", + 
"model.layers.31.self_attn.o_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.32.input_layernorm.weight": "model-00009-of-00011.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00009-of-00011.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.33.input_layernorm.weight": "model-00009-of-00011.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00009-of-00011.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.34.input_layernorm.weight": "model-00010-of-00011.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.34.mlp.up_proj.weight": 
"model-00010-of-00011.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00010-of-00011.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00009-of-00011.safetensors", + "model.layers.35.input_layernorm.weight": "model-00010-of-00011.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00010-of-00011.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.36.input_layernorm.weight": "model-00010-of-00011.safetensors", + "model.layers.36.mlp.down_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.36.mlp.gate_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.36.mlp.up_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.36.post_attention_layernorm.weight": "model-00010-of-00011.safetensors", + "model.layers.36.self_attn.k_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.36.self_attn.o_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.36.self_attn.q_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.36.self_attn.v_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.37.input_layernorm.weight": "model-00010-of-00011.safetensors", + 
"model.layers.37.mlp.down_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.37.mlp.gate_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.37.mlp.up_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.37.post_attention_layernorm.weight": "model-00010-of-00011.safetensors", + "model.layers.37.self_attn.k_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.37.self_attn.o_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.37.self_attn.q_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.37.self_attn.v_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.38.input_layernorm.weight": "model-00011-of-00011.safetensors", + "model.layers.38.mlp.down_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.38.mlp.gate_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.38.mlp.up_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.38.post_attention_layernorm.weight": "model-00011-of-00011.safetensors", + "model.layers.38.self_attn.k_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.38.self_attn.o_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.38.self_attn.q_proj.weight": "model-00010-of-00011.safetensors", + "model.layers.38.self_attn.v_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.39.input_layernorm.weight": "model-00011-of-00011.safetensors", + "model.layers.39.mlp.down_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.39.mlp.gate_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.39.mlp.up_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.39.post_attention_layernorm.weight": "model-00011-of-00011.safetensors", + "model.layers.39.self_attn.k_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.39.self_attn.o_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.39.self_attn.q_proj.weight": 
"model-00011-of-00011.safetensors", + "model.layers.39.self_attn.v_proj.weight": "model-00011-of-00011.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00011.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00011.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00011.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00011.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00011.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00011.safetensors", + "model.layers.6.self_attn.k_proj.weight": 
"model-00002-of-00011.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.7.input_layernorm.weight": "model-00003-of-00011.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00011.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00011.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00011.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00011.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00011.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.9.mlp.up_proj.weight": 
"model-00003-of-00011.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00011.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00011.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00011.safetensors", + "model.norm.weight": "model-00011-of-00011.safetensors" + } +} diff --git a/rng_state_0.pth b/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..25c968b44f64cbb72be6f6649ee277d3fa7097d0 --- /dev/null +++ b/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03269e9b7ea16cd98a78d6ea7a50b8454cbd266a053c02f734d27c4d0056b89d +size 15024 diff --git a/rng_state_1.pth b/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..955a80e651b1f8cb989b49c8b23425013715296e --- /dev/null +++ b/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4509495bd2a081212bbe15f7d1784af338aff3aadb1c5fe444c3f44626992d0 +size 15024 diff --git a/rng_state_10.pth b/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..ebd168f693bcb951f6f9b1bc29c7758dfb6e882e --- /dev/null +++ b/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:576d9c3fa72c59b7084a408dc9dde57204e5fa3c9a8a50d47c8f5f51324cb626 +size 15033 diff --git a/rng_state_11.pth b/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..70060415a2e2da75a8c7b0c80106fddd8caf5ca6 --- /dev/null +++ b/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b518affc164693d762db0b412de62486db847b0d29b49663bcef4c696d69454 +size 15033 diff --git a/rng_state_12.pth b/rng_state_12.pth new file mode 100644 index 
0000000000000000000000000000000000000000..dcc5c088f7ab8ff1931fcbf99940ee1354028c0b --- /dev/null +++ b/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef92841ef949d985351fdd17590d20e8c825b1780cc77e9b5755fafea06343e9 +size 15033 diff --git a/rng_state_13.pth b/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..0830c1f73f5944f96e526aac3f49edba0a0e39fe --- /dev/null +++ b/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef777f37a836090a8970337f9bdc1f501b606c0e286cdb186d8588d2e6d803db +size 15033 diff --git a/rng_state_14.pth b/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..8b1b1e50194ce61b7e72d1ff7be5943e827adcba --- /dev/null +++ b/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a72e1cf749f626c942d66eb4aa6b1b9f8ca8fd0ff2deda4365f47afc45f35e58 +size 15033 diff --git a/rng_state_15.pth b/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..62945a425c9acea6f65a5ef1adfc0ff5e97a6e83 --- /dev/null +++ b/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:871772b721a469522f47e4b4c1390d83121eea670b95dc8e75bda4cf3253ab24 +size 15033 diff --git a/rng_state_16.pth b/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..bb498e424cef951895e9a1b64cf6723952eb1176 --- /dev/null +++ b/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8815297c3fcebc5be0c9560ca5a0e8d2fc8b343661a4d8d6c570155f9603da81 +size 15033 diff --git a/rng_state_17.pth b/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..30f493e91d06fabd9e270d1364eea57a705a2452 --- /dev/null +++ b/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a368436014b241b5f613085702d4174f89cd7ec56b4e189b0e9a7f82cda767d2 +size 15033 diff 
--git a/rng_state_18.pth b/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..b5be3e639f4cd4ab358736de65113fd504c5f9af --- /dev/null +++ b/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366da76d65199a2b4dde530196fe1907d588129a9417719d8bc54c2e64ee91d5 +size 15033 diff --git a/rng_state_19.pth b/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1d72bcff86fa826ea595fe9674a809a3b344695 --- /dev/null +++ b/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a67a307d5a29470677449feb8d39a259df73ee1e4f3375b67ec7af94fa27f306 +size 15033 diff --git a/rng_state_2.pth b/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..fa005c74cc3eb2519657be3c494f323e7c8d1d79 --- /dev/null +++ b/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:104638f895e79f68373b82e8ca745e8070a37851d21a073a0fa5ae1db47649d6 +size 15024 diff --git a/rng_state_20.pth b/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..b5e89e613e66de2794797865dd623634058c520d --- /dev/null +++ b/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea17befb9020fbdce639ba3fdb5c4ecaaaa244ae0a63ba932c3fc2dcfcdddbac +size 15033 diff --git a/rng_state_21.pth b/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..2e1245be089d0ec28031ed2af73a1f71195b3da7 --- /dev/null +++ b/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3f8cc729a703c8a71d5a769c376e7131b296691c5b9fd8e8b0809927deddaf2 +size 15033 diff --git a/rng_state_22.pth b/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..24523d45f074e9568f52560716e31b151dcf9810 --- /dev/null +++ b/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2a0876016abcb0aa45acaba3ef553b1db536f8f92e370998bd9054bb273eb288 +size 15033 diff --git a/rng_state_23.pth b/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..7adb54668e1388d372018f8281035ba8ba4b5903 --- /dev/null +++ b/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ae20c0ac66d59e883d359286ae5ad3eaf0897604c46c106879a8e03154e0934 +size 15033 diff --git a/rng_state_24.pth b/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..a7203e5dcb7e44055ad44d4a16c0eeed233b0f47 --- /dev/null +++ b/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1024255eb3f502ffa47a33146b5cb017f523d16728e5476291fef2f3b9ddce12 +size 15033 diff --git a/rng_state_25.pth b/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..267ddba723b54ebc9eeb163441814f9de23a21fe --- /dev/null +++ b/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c483f4f00936cfc0ad68b5ea470fadf5e87494dc897aa22640a207bfca87bcb +size 15033 diff --git a/rng_state_26.pth b/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..86d297d212b1665432224c15899bc3e135e3d121 --- /dev/null +++ b/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36e813529ffd8f6a3a8dca1f9610ecbda58e84541747eb1a7f27dc000304b331 +size 15033 diff --git a/rng_state_27.pth b/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..f319c53aeadcf6beb4a477506faf6671033fcb72 --- /dev/null +++ b/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e6dfa37d4146c36c1844ccb7d862acd5a4fb8be7865d2c14431e0ae3463f209 +size 15033 diff --git a/rng_state_28.pth b/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..cdeedbf954f8bed0247dc676fcfb54fa8921c9f6 --- /dev/null +++ 
b/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847b5776b744c97bbfd967b883aa9636b574e4c6235627811813820795c018da +size 15033 diff --git a/rng_state_29.pth b/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..2c3aca18919a5909f409348c2cf4719108b9a317 --- /dev/null +++ b/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7ec4281a7481dc77bfa5dd39339da8b2563c45e94b5a17b4928d573ec63fdad +size 15033 diff --git a/rng_state_3.pth b/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..104e9ccfd2036951dd3d763f5838f18c13eb7f46 --- /dev/null +++ b/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:765f109d2f595ccad091f8175cdf531d159f08de0ff5196e08c6bb1d0ceda0d5 +size 15024 diff --git a/rng_state_30.pth b/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b04fb20c9bb3aa7d55eb25b025f14d5f5fd2797 --- /dev/null +++ b/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd83ffb944df86c316c5dd310a6c3d49f874e9769724e9b56072b6843f71c6d1 +size 15033 diff --git a/rng_state_31.pth b/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..353fed1d7e58711b3951dc3793336d3a57141d9e --- /dev/null +++ b/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:212418db1d3e2746446f3e38b2c002f2fc24f15f22dfbbd9a49188b6ce9a2a61 +size 15033 diff --git a/rng_state_4.pth b/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b0d0948fc60bcec7a3af58f77ee0a5e1b764db5 --- /dev/null +++ b/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d620abf3603a214a65bc4bbdffe37dd699f44b168af1b18f2e29715df9c801e8 +size 15024 diff --git a/rng_state_5.pth b/rng_state_5.pth new file mode 100644 index 
0000000000000000000000000000000000000000..ae610401ff49fd98fe73faeee3a44a0307a3cb78 --- /dev/null +++ b/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de4622f10c8af2b5c088a5b1129644980de4affbf1447941180408ef33a81f4a +size 15024 diff --git a/rng_state_6.pth b/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..45f7f4e9d7e3c5642d4cd780a19b5b06c1af5561 --- /dev/null +++ b/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49de98dfba5dc191e6a791d0bcd230547aa914b0c4a23c2bec1e0fc12cfdd084 +size 15024 diff --git a/rng_state_7.pth b/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..f17bb97627c7b417e12986a7ac01d62d148f3aa6 --- /dev/null +++ b/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33b75b0c340a919002c88fdd74edae30d1169bf50d0211c5a90248bacdf0bae4 +size 15024 diff --git a/rng_state_8.pth b/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..b7892ce87ee4212f9d41cb4a97b8edb727e8df30 --- /dev/null +++ b/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d724773f54f0305ed0266dd19f05b11fb4d0c2bee186febf335a1352f136d394 +size 15024 diff --git a/rng_state_9.pth b/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..a2aae0b13440735ff468e3b45a7d6bc84addfcac --- /dev/null +++ b/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0910de2bcf037052227ed18ef956e5c5788efd7dc3cc8a677732b0ce6244e948 +size 15024 diff --git a/scheduler.pt b/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..386d062d336feef5f643dc6ac230d071f125fbe3 --- /dev/null +++ b/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:635c3b12183bd84351919b6bc3a696f5d4e9722d5b06c9cf6b60289a803e8ce6 +size 1064 diff --git 
a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..a782b2f1cdab4d0bacb2dc0f85d02c4b1e31f0bd --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e12b0c0e6eab622a43b54373c25d0720993ff5c2 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,52 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": 
false, + "eos_token": "", + "extra_special_tokens": {}, + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c6c32caed4970798a5336af39c4a4bf024fe0b5e --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,9946 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.6668237289732835, + "eval_steps": 500, + "global_step": 14150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00018847476794044198, + "grad_norm": 1.248213768005371, + "learning_rate": 2e-05, + "loss": 1.598, + "step": 1 + }, + { + "epoch": 0.0018847476794044196, + "grad_norm": 0.5134799480438232, + "learning_rate": 1.9999984224699035e-05, + "loss": 1.5347, + "step": 10 + }, + { + "epoch": 0.0037694953588088393, + "grad_norm": 0.30949828028678894, + "learning_rate": 1.9999929692858345e-05, + "loss": 1.4408, + "step": 20 + }, + { + "epoch": 0.00565424303821326, + "grad_norm": 0.28539228439331055, + "learning_rate": 1.9999836209933504e-05, + "loss": 1.3976, + "step": 30 + }, + { + "epoch": 0.007538990717617679, + "grad_norm": 0.27772006392478943, + "learning_rate": 1.999970377628863e-05, + "loss": 1.386, + "step": 40 + }, + { + "epoch": 0.009423738397022098, + "grad_norm": 0.25717052817344666, + "learning_rate": 1.9999532392439568e-05, + "loss": 1.3805, + "step": 50 + }, + { + "epoch": 0.01130848607642652, + "grad_norm": 0.2524262070655823, + "learning_rate": 1.9999322059053887e-05, + "loss": 1.3552, + "step": 60 + }, + { + "epoch": 0.013193233755830938, + "grad_norm": 0.2641507685184479, + "learning_rate": 1.9999072776950864e-05, + "loss": 1.3535, + 
"step": 70 + }, + { + "epoch": 0.015077981435235357, + "grad_norm": 0.261675626039505, + "learning_rate": 1.9998784547101483e-05, + "loss": 1.3425, + "step": 80 + }, + { + "epoch": 0.016962729114639776, + "grad_norm": 0.2634064853191376, + "learning_rate": 1.9998457370628442e-05, + "loss": 1.3284, + "step": 90 + }, + { + "epoch": 0.018847476794044197, + "grad_norm": 0.259843111038208, + "learning_rate": 1.9998091248806136e-05, + "loss": 1.3238, + "step": 100 + }, + { + "epoch": 0.020732224473448618, + "grad_norm": 0.2529247999191284, + "learning_rate": 1.999768618306066e-05, + "loss": 1.3089, + "step": 110 + }, + { + "epoch": 0.02261697215285304, + "grad_norm": 0.25355592370033264, + "learning_rate": 1.99972421749698e-05, + "loss": 1.3075, + "step": 120 + }, + { + "epoch": 0.024501719832257456, + "grad_norm": 0.2631305456161499, + "learning_rate": 1.9996759226263028e-05, + "loss": 1.2942, + "step": 130 + }, + { + "epoch": 0.026386467511661876, + "grad_norm": 0.2569214999675751, + "learning_rate": 1.9996237338821495e-05, + "loss": 1.3046, + "step": 140 + }, + { + "epoch": 0.028271215191066297, + "grad_norm": 0.253907710313797, + "learning_rate": 1.999567651467802e-05, + "loss": 1.2788, + "step": 150 + }, + { + "epoch": 0.030155962870470714, + "grad_norm": 0.3623337745666504, + "learning_rate": 1.9995076756017094e-05, + "loss": 1.284, + "step": 160 + }, + { + "epoch": 0.03204071054987514, + "grad_norm": 0.2715981900691986, + "learning_rate": 1.9994438065174855e-05, + "loss": 1.2766, + "step": 170 + }, + { + "epoch": 0.03392545822927955, + "grad_norm": 0.24893872439861298, + "learning_rate": 1.9993760444639096e-05, + "loss": 1.2833, + "step": 180 + }, + { + "epoch": 0.03581020590868397, + "grad_norm": 0.25260043144226074, + "learning_rate": 1.999304389704923e-05, + "loss": 1.2742, + "step": 190 + }, + { + "epoch": 0.037694953588088394, + "grad_norm": 0.2702144384384155, + "learning_rate": 1.9992288425196322e-05, + "loss": 1.2708, + "step": 200 + }, + { + "epoch": 
0.039579701267492814, + "grad_norm": 0.23077067732810974, + "learning_rate": 1.9991494032023026e-05, + "loss": 1.2488, + "step": 210 + }, + { + "epoch": 0.041464448946897235, + "grad_norm": 0.2482827752828598, + "learning_rate": 1.9990660720623612e-05, + "loss": 1.2706, + "step": 220 + }, + { + "epoch": 0.043349196626301656, + "grad_norm": 0.26306962966918945, + "learning_rate": 1.998978849424395e-05, + "loss": 1.2659, + "step": 230 + }, + { + "epoch": 0.04523394430570608, + "grad_norm": 0.2499198466539383, + "learning_rate": 1.9988877356281466e-05, + "loss": 1.2586, + "step": 240 + }, + { + "epoch": 0.04711869198511049, + "grad_norm": 0.2686818838119507, + "learning_rate": 1.9987927310285176e-05, + "loss": 1.262, + "step": 250 + }, + { + "epoch": 0.04900343966451491, + "grad_norm": 0.2811981737613678, + "learning_rate": 1.9986938359955644e-05, + "loss": 1.2495, + "step": 260 + }, + { + "epoch": 0.05088818734391933, + "grad_norm": 0.2394895851612091, + "learning_rate": 1.9985910509144954e-05, + "loss": 1.2576, + "step": 270 + }, + { + "epoch": 0.05277293502332375, + "grad_norm": 0.2487143725156784, + "learning_rate": 1.9984843761856723e-05, + "loss": 1.2335, + "step": 280 + }, + { + "epoch": 0.05465768270272817, + "grad_norm": 0.25012126564979553, + "learning_rate": 1.998373812224609e-05, + "loss": 1.2321, + "step": 290 + }, + { + "epoch": 0.056542430382132594, + "grad_norm": 0.2570323050022125, + "learning_rate": 1.998259359461966e-05, + "loss": 1.2423, + "step": 300 + }, + { + "epoch": 0.058427178061537015, + "grad_norm": 0.2380693256855011, + "learning_rate": 1.998141018343553e-05, + "loss": 1.2327, + "step": 310 + }, + { + "epoch": 0.06031192574094143, + "grad_norm": 0.2572581470012665, + "learning_rate": 1.9980187893303243e-05, + "loss": 1.2297, + "step": 320 + }, + { + "epoch": 0.06219667342034585, + "grad_norm": 0.24813765287399292, + "learning_rate": 1.997892672898379e-05, + "loss": 1.2361, + "step": 330 + }, + { + "epoch": 0.06408142109975028, + 
"grad_norm": 0.25302258133888245, + "learning_rate": 1.9977626695389568e-05, + "loss": 1.2271, + "step": 340 + }, + { + "epoch": 0.06596616877915469, + "grad_norm": 0.2392202764749527, + "learning_rate": 1.9976287797584392e-05, + "loss": 1.2384, + "step": 350 + }, + { + "epoch": 0.0678509164585591, + "grad_norm": 0.25518399477005005, + "learning_rate": 1.997491004078345e-05, + "loss": 1.2415, + "step": 360 + }, + { + "epoch": 0.06973566413796353, + "grad_norm": 0.27044379711151123, + "learning_rate": 1.997349343035329e-05, + "loss": 1.2264, + "step": 370 + }, + { + "epoch": 0.07162041181736795, + "grad_norm": 0.25347742438316345, + "learning_rate": 1.9972037971811802e-05, + "loss": 1.2071, + "step": 380 + }, + { + "epoch": 0.07350515949677237, + "grad_norm": 0.26112523674964905, + "learning_rate": 1.99705436708282e-05, + "loss": 1.1952, + "step": 390 + }, + { + "epoch": 0.07538990717617679, + "grad_norm": 0.23909437656402588, + "learning_rate": 1.996901053322298e-05, + "loss": 1.2166, + "step": 400 + }, + { + "epoch": 0.07727465485558122, + "grad_norm": 0.2504359185695648, + "learning_rate": 1.996743856496793e-05, + "loss": 1.2104, + "step": 410 + }, + { + "epoch": 0.07915940253498563, + "grad_norm": 0.2588813304901123, + "learning_rate": 1.9965827772186072e-05, + "loss": 1.1922, + "step": 420 + }, + { + "epoch": 0.08104415021439004, + "grad_norm": 0.2423546463251114, + "learning_rate": 1.9964178161151668e-05, + "loss": 1.2184, + "step": 430 + }, + { + "epoch": 0.08292889789379447, + "grad_norm": 0.26253944635391235, + "learning_rate": 1.9962489738290177e-05, + "loss": 1.1928, + "step": 440 + }, + { + "epoch": 0.08481364557319888, + "grad_norm": 0.24807363748550415, + "learning_rate": 1.996076251017823e-05, + "loss": 1.1901, + "step": 450 + }, + { + "epoch": 0.08669839325260331, + "grad_norm": 0.2503909170627594, + "learning_rate": 1.9958996483543624e-05, + "loss": 1.1904, + "step": 460 + }, + { + "epoch": 0.08858314093200773, + "grad_norm": 0.24793989956378937, + 
"learning_rate": 1.9957191665265262e-05, + "loss": 1.2119, + "step": 470 + }, + { + "epoch": 0.09046788861141215, + "grad_norm": 0.2453005164861679, + "learning_rate": 1.9955348062373168e-05, + "loss": 1.1725, + "step": 480 + }, + { + "epoch": 0.09235263629081657, + "grad_norm": 0.2712901532649994, + "learning_rate": 1.9953465682048418e-05, + "loss": 1.1877, + "step": 490 + }, + { + "epoch": 0.09423738397022098, + "grad_norm": 0.24500596523284912, + "learning_rate": 1.9951544531623134e-05, + "loss": 1.1836, + "step": 500 + }, + { + "epoch": 0.09612213164962541, + "grad_norm": 0.2433580756187439, + "learning_rate": 1.994958461858047e-05, + "loss": 1.1879, + "step": 510 + }, + { + "epoch": 0.09800687932902982, + "grad_norm": 0.23772740364074707, + "learning_rate": 1.9947585950554543e-05, + "loss": 1.1941, + "step": 520 + }, + { + "epoch": 0.09989162700843425, + "grad_norm": 0.24457943439483643, + "learning_rate": 1.994554853533045e-05, + "loss": 1.1974, + "step": 530 + }, + { + "epoch": 0.10177637468783866, + "grad_norm": 0.2423318773508072, + "learning_rate": 1.994347238084418e-05, + "loss": 1.18, + "step": 540 + }, + { + "epoch": 0.10366112236724309, + "grad_norm": 0.24526557326316833, + "learning_rate": 1.9941357495182655e-05, + "loss": 1.1951, + "step": 550 + }, + { + "epoch": 0.1055458700466475, + "grad_norm": 0.23744036257266998, + "learning_rate": 1.993920388658363e-05, + "loss": 1.1813, + "step": 560 + }, + { + "epoch": 0.10743061772605192, + "grad_norm": 0.2410334050655365, + "learning_rate": 1.993701156343571e-05, + "loss": 1.1759, + "step": 570 + }, + { + "epoch": 0.10931536540545635, + "grad_norm": 0.23521552979946136, + "learning_rate": 1.993478053427829e-05, + "loss": 1.1969, + "step": 580 + }, + { + "epoch": 0.11120011308486076, + "grad_norm": 0.23922745883464813, + "learning_rate": 1.993251080780153e-05, + "loss": 1.1859, + "step": 590 + }, + { + "epoch": 0.11308486076426519, + "grad_norm": 0.22873084247112274, + "learning_rate": 
1.9930202392846322e-05, + "loss": 1.1585, + "step": 600 + }, + { + "epoch": 0.1149696084436696, + "grad_norm": 0.24506738781929016, + "learning_rate": 1.9927855298404255e-05, + "loss": 1.1768, + "step": 610 + }, + { + "epoch": 0.11685435612307403, + "grad_norm": 0.24229852855205536, + "learning_rate": 1.9925469533617587e-05, + "loss": 1.1816, + "step": 620 + }, + { + "epoch": 0.11873910380247844, + "grad_norm": 0.23421397805213928, + "learning_rate": 1.9923045107779188e-05, + "loss": 1.1821, + "step": 630 + }, + { + "epoch": 0.12062385148188286, + "grad_norm": 0.24380525946617126, + "learning_rate": 1.992058203033253e-05, + "loss": 1.1604, + "step": 640 + }, + { + "epoch": 0.12250859916128728, + "grad_norm": 0.23961135745048523, + "learning_rate": 1.9918080310871636e-05, + "loss": 1.1802, + "step": 650 + }, + { + "epoch": 0.1243933468406917, + "grad_norm": 0.24533973634243011, + "learning_rate": 1.9915539959141033e-05, + "loss": 1.1709, + "step": 660 + }, + { + "epoch": 0.1262780945200961, + "grad_norm": 0.28405964374542236, + "learning_rate": 1.991296098503575e-05, + "loss": 1.1399, + "step": 670 + }, + { + "epoch": 0.12816284219950055, + "grad_norm": 0.30202099680900574, + "learning_rate": 1.9910343398601236e-05, + "loss": 1.1815, + "step": 680 + }, + { + "epoch": 0.13004758987890497, + "grad_norm": 0.23650214076042175, + "learning_rate": 1.9907687210033343e-05, + "loss": 1.1512, + "step": 690 + }, + { + "epoch": 0.13193233755830938, + "grad_norm": 0.2527135908603668, + "learning_rate": 1.9904992429678284e-05, + "loss": 1.1627, + "step": 700 + }, + { + "epoch": 0.1338170852377138, + "grad_norm": 0.2362278252840042, + "learning_rate": 1.99022590680326e-05, + "loss": 1.1536, + "step": 710 + }, + { + "epoch": 0.1357018329171182, + "grad_norm": 0.2330750674009323, + "learning_rate": 1.9899487135743104e-05, + "loss": 1.1733, + "step": 720 + }, + { + "epoch": 0.13758658059652265, + "grad_norm": 0.2556784152984619, + "learning_rate": 1.989667664360685e-05, + "loss": 
1.1719, + "step": 730 + }, + { + "epoch": 0.13947132827592706, + "grad_norm": 0.23475973308086395, + "learning_rate": 1.9893827602571087e-05, + "loss": 1.156, + "step": 740 + }, + { + "epoch": 0.14135607595533148, + "grad_norm": 0.24918922781944275, + "learning_rate": 1.9890940023733208e-05, + "loss": 1.1561, + "step": 750 + }, + { + "epoch": 0.1432408236347359, + "grad_norm": 0.22945019602775574, + "learning_rate": 1.9888013918340737e-05, + "loss": 1.1733, + "step": 760 + }, + { + "epoch": 0.1451255713141403, + "grad_norm": 0.25335609912872314, + "learning_rate": 1.9885049297791245e-05, + "loss": 1.1533, + "step": 770 + }, + { + "epoch": 0.14701031899354475, + "grad_norm": 0.2379768341779709, + "learning_rate": 1.9882046173632335e-05, + "loss": 1.1656, + "step": 780 + }, + { + "epoch": 0.14889506667294916, + "grad_norm": 0.2410801351070404, + "learning_rate": 1.9879004557561577e-05, + "loss": 1.1503, + "step": 790 + }, + { + "epoch": 0.15077981435235357, + "grad_norm": 0.2520928978919983, + "learning_rate": 1.9875924461426486e-05, + "loss": 1.1469, + "step": 800 + }, + { + "epoch": 0.152664562031758, + "grad_norm": 0.22509939968585968, + "learning_rate": 1.9872805897224455e-05, + "loss": 1.1669, + "step": 810 + }, + { + "epoch": 0.15454930971116243, + "grad_norm": 0.23607099056243896, + "learning_rate": 1.9869648877102707e-05, + "loss": 1.1541, + "step": 820 + }, + { + "epoch": 0.15643405739056684, + "grad_norm": 0.22916655242443085, + "learning_rate": 1.986645341335827e-05, + "loss": 1.148, + "step": 830 + }, + { + "epoch": 0.15831880506997126, + "grad_norm": 0.23453421890735626, + "learning_rate": 1.986321951843791e-05, + "loss": 1.1617, + "step": 840 + }, + { + "epoch": 0.16020355274937567, + "grad_norm": 0.23339857161045074, + "learning_rate": 1.9859947204938085e-05, + "loss": 1.1541, + "step": 850 + }, + { + "epoch": 0.16208830042878009, + "grad_norm": 0.23789982497692108, + "learning_rate": 1.98566364856049e-05, + "loss": 1.1535, + "step": 860 + }, + { + 
"epoch": 0.16397304810818453, + "grad_norm": 0.2343784123659134, + "learning_rate": 1.9853287373334057e-05, + "loss": 1.1563, + "step": 870 + }, + { + "epoch": 0.16585779578758894, + "grad_norm": 0.22868357598781586, + "learning_rate": 1.9849899881170803e-05, + "loss": 1.1315, + "step": 880 + }, + { + "epoch": 0.16774254346699335, + "grad_norm": 0.23975709080696106, + "learning_rate": 1.984647402230988e-05, + "loss": 1.1487, + "step": 890 + }, + { + "epoch": 0.16962729114639777, + "grad_norm": 0.2430054396390915, + "learning_rate": 1.984300981009547e-05, + "loss": 1.1299, + "step": 900 + }, + { + "epoch": 0.17151203882580218, + "grad_norm": 0.24087710678577423, + "learning_rate": 1.983950725802116e-05, + "loss": 1.1514, + "step": 910 + }, + { + "epoch": 0.17339678650520662, + "grad_norm": 0.2398061901330948, + "learning_rate": 1.983596637972986e-05, + "loss": 1.1664, + "step": 920 + }, + { + "epoch": 0.17528153418461104, + "grad_norm": 0.2304048091173172, + "learning_rate": 1.983238718901377e-05, + "loss": 1.1534, + "step": 930 + }, + { + "epoch": 0.17716628186401545, + "grad_norm": 0.2336307168006897, + "learning_rate": 1.9828769699814322e-05, + "loss": 1.1296, + "step": 940 + }, + { + "epoch": 0.17905102954341987, + "grad_norm": 0.23780956864356995, + "learning_rate": 1.9825113926222132e-05, + "loss": 1.128, + "step": 950 + }, + { + "epoch": 0.1809357772228243, + "grad_norm": 0.2330734133720398, + "learning_rate": 1.9821419882476934e-05, + "loss": 1.1461, + "step": 960 + }, + { + "epoch": 0.18282052490222872, + "grad_norm": 0.24197718501091003, + "learning_rate": 1.9817687582967532e-05, + "loss": 1.1421, + "step": 970 + }, + { + "epoch": 0.18470527258163313, + "grad_norm": 0.23220518231391907, + "learning_rate": 1.9813917042231735e-05, + "loss": 1.1285, + "step": 980 + }, + { + "epoch": 0.18659002026103755, + "grad_norm": 0.22721554338932037, + "learning_rate": 1.9810108274956314e-05, + "loss": 1.1372, + "step": 990 + }, + { + "epoch": 0.18847476794044196, + 
"grad_norm": 0.23689280450344086, + "learning_rate": 1.9806261295976938e-05, + "loss": 1.135, + "step": 1000 + }, + { + "epoch": 0.1903595156198464, + "grad_norm": 0.23823745548725128, + "learning_rate": 1.9802376120278102e-05, + "loss": 1.1393, + "step": 1010 + }, + { + "epoch": 0.19224426329925082, + "grad_norm": 0.23724611103534698, + "learning_rate": 1.9798452762993102e-05, + "loss": 1.1425, + "step": 1020 + }, + { + "epoch": 0.19412901097865523, + "grad_norm": 0.2323419600725174, + "learning_rate": 1.9794491239403946e-05, + "loss": 1.1385, + "step": 1030 + }, + { + "epoch": 0.19601375865805964, + "grad_norm": 0.23249243199825287, + "learning_rate": 1.9790491564941304e-05, + "loss": 1.1289, + "step": 1040 + }, + { + "epoch": 0.19789850633746406, + "grad_norm": 0.23415586352348328, + "learning_rate": 1.9786453755184454e-05, + "loss": 1.1372, + "step": 1050 + }, + { + "epoch": 0.1997832540168685, + "grad_norm": 0.22952094674110413, + "learning_rate": 1.9782377825861212e-05, + "loss": 1.1346, + "step": 1060 + }, + { + "epoch": 0.2016680016962729, + "grad_norm": 0.23553574085235596, + "learning_rate": 1.9778263792847876e-05, + "loss": 1.134, + "step": 1070 + }, + { + "epoch": 0.20355274937567733, + "grad_norm": 0.2506747245788574, + "learning_rate": 1.9774111672169166e-05, + "loss": 1.127, + "step": 1080 + }, + { + "epoch": 0.20543749705508174, + "grad_norm": 0.2367413192987442, + "learning_rate": 1.9769921479998148e-05, + "loss": 1.1461, + "step": 1090 + }, + { + "epoch": 0.20732224473448618, + "grad_norm": 0.22659966349601746, + "learning_rate": 1.9765693232656197e-05, + "loss": 1.1067, + "step": 1100 + }, + { + "epoch": 0.2092069924138906, + "grad_norm": 0.2289683073759079, + "learning_rate": 1.976142694661291e-05, + "loss": 1.1322, + "step": 1110 + }, + { + "epoch": 0.211091740093295, + "grad_norm": 0.22230739891529083, + "learning_rate": 1.9757122638486047e-05, + "loss": 1.1172, + "step": 1120 + }, + { + "epoch": 0.21297648777269942, + "grad_norm": 
0.23025445640087128, + "learning_rate": 1.9752780325041477e-05, + "loss": 1.1238, + "step": 1130 + }, + { + "epoch": 0.21486123545210384, + "grad_norm": 0.22977577149868011, + "learning_rate": 1.9748400023193096e-05, + "loss": 1.1268, + "step": 1140 + }, + { + "epoch": 0.21674598313150828, + "grad_norm": 0.22136132419109344, + "learning_rate": 1.974398175000278e-05, + "loss": 1.1132, + "step": 1150 + }, + { + "epoch": 0.2186307308109127, + "grad_norm": 0.22870974242687225, + "learning_rate": 1.9739525522680296e-05, + "loss": 1.1266, + "step": 1160 + }, + { + "epoch": 0.2205154784903171, + "grad_norm": 0.2345503866672516, + "learning_rate": 1.973503135858326e-05, + "loss": 1.1321, + "step": 1170 + }, + { + "epoch": 0.22240022616972152, + "grad_norm": 0.23089513182640076, + "learning_rate": 1.9730499275217048e-05, + "loss": 1.1507, + "step": 1180 + }, + { + "epoch": 0.22428497384912593, + "grad_norm": 0.22011809051036835, + "learning_rate": 1.9725929290234748e-05, + "loss": 1.1148, + "step": 1190 + }, + { + "epoch": 0.22616972152853038, + "grad_norm": 0.24076172709465027, + "learning_rate": 1.972132142143706e-05, + "loss": 1.1255, + "step": 1200 + }, + { + "epoch": 0.2280544692079348, + "grad_norm": 0.23350296914577484, + "learning_rate": 1.971667568677227e-05, + "loss": 1.109, + "step": 1210 + }, + { + "epoch": 0.2299392168873392, + "grad_norm": 0.22932660579681396, + "learning_rate": 1.9711992104336138e-05, + "loss": 1.1187, + "step": 1220 + }, + { + "epoch": 0.23182396456674362, + "grad_norm": 0.2350001186132431, + "learning_rate": 1.9707270692371854e-05, + "loss": 1.1075, + "step": 1230 + }, + { + "epoch": 0.23370871224614806, + "grad_norm": 0.23797191679477692, + "learning_rate": 1.9702511469269964e-05, + "loss": 1.1319, + "step": 1240 + }, + { + "epoch": 0.23559345992555247, + "grad_norm": 0.22966952621936798, + "learning_rate": 1.9697714453568286e-05, + "loss": 1.1225, + "step": 1250 + }, + { + "epoch": 0.2374782076049569, + "grad_norm": 0.2432156205177307, + 
"learning_rate": 1.9692879663951846e-05, + "loss": 1.125, + "step": 1260 + }, + { + "epoch": 0.2393629552843613, + "grad_norm": 0.22376728057861328, + "learning_rate": 1.9688007119252804e-05, + "loss": 1.1166, + "step": 1270 + }, + { + "epoch": 0.24124770296376571, + "grad_norm": 0.25832831859588623, + "learning_rate": 1.9683096838450386e-05, + "loss": 1.137, + "step": 1280 + }, + { + "epoch": 0.24313245064317016, + "grad_norm": 0.23004275560379028, + "learning_rate": 1.9678148840670803e-05, + "loss": 1.1128, + "step": 1290 + }, + { + "epoch": 0.24501719832257457, + "grad_norm": 0.2327570915222168, + "learning_rate": 1.9673163145187182e-05, + "loss": 1.1091, + "step": 1300 + }, + { + "epoch": 0.24690194600197898, + "grad_norm": 0.22204618155956268, + "learning_rate": 1.9668139771419477e-05, + "loss": 1.1185, + "step": 1310 + }, + { + "epoch": 0.2487866936813834, + "grad_norm": 0.2332202047109604, + "learning_rate": 1.9663078738934414e-05, + "loss": 1.1046, + "step": 1320 + }, + { + "epoch": 0.25067144136078784, + "grad_norm": 0.2484271228313446, + "learning_rate": 1.96579800674454e-05, + "loss": 1.0891, + "step": 1330 + }, + { + "epoch": 0.2525561890401922, + "grad_norm": 0.2349151372909546, + "learning_rate": 1.965284377681246e-05, + "loss": 1.136, + "step": 1340 + }, + { + "epoch": 0.25444093671959667, + "grad_norm": 0.23666912317276, + "learning_rate": 1.9647669887042132e-05, + "loss": 1.1197, + "step": 1350 + }, + { + "epoch": 0.2563256843990011, + "grad_norm": 0.23333217203617096, + "learning_rate": 1.964245841828743e-05, + "loss": 1.1133, + "step": 1360 + }, + { + "epoch": 0.2582104320784055, + "grad_norm": 0.2301831692457199, + "learning_rate": 1.9637209390847724e-05, + "loss": 1.1062, + "step": 1370 + }, + { + "epoch": 0.26009517975780994, + "grad_norm": 0.2266763299703598, + "learning_rate": 1.9631922825168694e-05, + "loss": 1.1247, + "step": 1380 + }, + { + "epoch": 0.2619799274372143, + "grad_norm": 0.23397673666477203, + "learning_rate": 
1.9626598741842233e-05, + "loss": 1.1225, + "step": 1390 + }, + { + "epoch": 0.26386467511661876, + "grad_norm": 0.2325909584760666, + "learning_rate": 1.9621237161606364e-05, + "loss": 1.1146, + "step": 1400 + }, + { + "epoch": 0.2657494227960232, + "grad_norm": 0.23758016526699066, + "learning_rate": 1.961583810534517e-05, + "loss": 1.0954, + "step": 1410 + }, + { + "epoch": 0.2676341704754276, + "grad_norm": 0.23477062582969666, + "learning_rate": 1.961040159408871e-05, + "loss": 1.1044, + "step": 1420 + }, + { + "epoch": 0.26951891815483203, + "grad_norm": 0.22722028195858002, + "learning_rate": 1.9604927649012934e-05, + "loss": 1.1064, + "step": 1430 + }, + { + "epoch": 0.2714036658342364, + "grad_norm": 0.2328077107667923, + "learning_rate": 1.95994162914396e-05, + "loss": 1.0913, + "step": 1440 + }, + { + "epoch": 0.27328841351364086, + "grad_norm": 0.23263655602931976, + "learning_rate": 1.9593867542836198e-05, + "loss": 1.1127, + "step": 1450 + }, + { + "epoch": 0.2751731611930453, + "grad_norm": 0.2270570695400238, + "learning_rate": 1.9588281424815854e-05, + "loss": 1.1079, + "step": 1460 + }, + { + "epoch": 0.2770579088724497, + "grad_norm": 0.5680676102638245, + "learning_rate": 1.9582657959137257e-05, + "loss": 1.0872, + "step": 1470 + }, + { + "epoch": 0.27894265655185413, + "grad_norm": 0.22764664888381958, + "learning_rate": 1.957699716770457e-05, + "loss": 1.0963, + "step": 1480 + }, + { + "epoch": 0.2808274042312585, + "grad_norm": 0.22060087323188782, + "learning_rate": 1.957129907256734e-05, + "loss": 1.105, + "step": 1490 + }, + { + "epoch": 0.28271215191066296, + "grad_norm": 0.22591201961040497, + "learning_rate": 1.9565563695920426e-05, + "loss": 1.1104, + "step": 1500 + }, + { + "epoch": 0.2845968995900674, + "grad_norm": 0.22907570004463196, + "learning_rate": 1.95597910601039e-05, + "loss": 1.1049, + "step": 1510 + }, + { + "epoch": 0.2864816472694718, + "grad_norm": 0.2240200787782669, + "learning_rate": 1.955398118760296e-05, + "loss": 
1.1237, + "step": 1520 + }, + { + "epoch": 0.2883663949488762, + "grad_norm": 0.21757692098617554, + "learning_rate": 1.9548134101047846e-05, + "loss": 1.0758, + "step": 1530 + }, + { + "epoch": 0.2902511426282806, + "grad_norm": 0.22303441166877747, + "learning_rate": 1.9542249823213762e-05, + "loss": 1.1, + "step": 1540 + }, + { + "epoch": 0.29213589030768505, + "grad_norm": 0.22459468245506287, + "learning_rate": 1.9536328377020763e-05, + "loss": 1.1088, + "step": 1550 + }, + { + "epoch": 0.2940206379870895, + "grad_norm": 0.22137030959129333, + "learning_rate": 1.9530369785533683e-05, + "loss": 1.0945, + "step": 1560 + }, + { + "epoch": 0.2959053856664939, + "grad_norm": 0.23165886104106903, + "learning_rate": 1.9524374071962054e-05, + "loss": 1.1056, + "step": 1570 + }, + { + "epoch": 0.2977901333458983, + "grad_norm": 0.2607102394104004, + "learning_rate": 1.9518341259659988e-05, + "loss": 1.0897, + "step": 1580 + }, + { + "epoch": 0.29967488102530276, + "grad_norm": 0.23659060895442963, + "learning_rate": 1.951227137212611e-05, + "loss": 1.0864, + "step": 1590 + }, + { + "epoch": 0.30155962870470715, + "grad_norm": 0.22756868600845337, + "learning_rate": 1.9506164433003457e-05, + "loss": 1.1051, + "step": 1600 + }, + { + "epoch": 0.3034443763841116, + "grad_norm": 0.22655488550662994, + "learning_rate": 1.9500020466079386e-05, + "loss": 1.1003, + "step": 1610 + }, + { + "epoch": 0.305329124063516, + "grad_norm": 0.21322037279605865, + "learning_rate": 1.949383949528548e-05, + "loss": 1.1019, + "step": 1620 + }, + { + "epoch": 0.3072138717429204, + "grad_norm": 0.2231883704662323, + "learning_rate": 1.9487621544697463e-05, + "loss": 1.0787, + "step": 1630 + }, + { + "epoch": 0.30909861942232486, + "grad_norm": 0.23086197674274445, + "learning_rate": 1.9481366638535096e-05, + "loss": 1.0937, + "step": 1640 + }, + { + "epoch": 0.31098336710172925, + "grad_norm": 0.21777348220348358, + "learning_rate": 1.947507480116209e-05, + "loss": 1.0947, + "step": 1650 + }, 
+ { + "epoch": 0.3128681147811337, + "grad_norm": 0.28279581665992737, + "learning_rate": 1.9468746057086e-05, + "loss": 1.1108, + "step": 1660 + }, + { + "epoch": 0.3147528624605381, + "grad_norm": 0.24145016074180603, + "learning_rate": 1.9462380430958146e-05, + "loss": 1.1078, + "step": 1670 + }, + { + "epoch": 0.3166376101399425, + "grad_norm": 0.22494813799858093, + "learning_rate": 1.9455977947573514e-05, + "loss": 1.0773, + "step": 1680 + }, + { + "epoch": 0.31852235781934696, + "grad_norm": 0.21982720494270325, + "learning_rate": 1.9449538631870632e-05, + "loss": 1.0994, + "step": 1690 + }, + { + "epoch": 0.32040710549875134, + "grad_norm": 0.2225201278924942, + "learning_rate": 1.9443062508931523e-05, + "loss": 1.1067, + "step": 1700 + }, + { + "epoch": 0.3222918531781558, + "grad_norm": 0.2318694144487381, + "learning_rate": 1.9436549603981556e-05, + "loss": 1.0971, + "step": 1710 + }, + { + "epoch": 0.32417660085756017, + "grad_norm": 0.22039298713207245, + "learning_rate": 1.9429999942389385e-05, + "loss": 1.0974, + "step": 1720 + }, + { + "epoch": 0.3260613485369646, + "grad_norm": 0.22072811424732208, + "learning_rate": 1.942341354966683e-05, + "loss": 1.101, + "step": 1730 + }, + { + "epoch": 0.32794609621636905, + "grad_norm": 0.22200477123260498, + "learning_rate": 1.941679045146878e-05, + "loss": 1.0955, + "step": 1740 + }, + { + "epoch": 0.32983084389577344, + "grad_norm": 0.22069425880908966, + "learning_rate": 1.9410130673593113e-05, + "loss": 1.1007, + "step": 1750 + }, + { + "epoch": 0.3317155915751779, + "grad_norm": 0.24102486670017242, + "learning_rate": 1.940343424198056e-05, + "loss": 1.0882, + "step": 1760 + }, + { + "epoch": 0.33360033925458227, + "grad_norm": 0.23189294338226318, + "learning_rate": 1.9396701182714632e-05, + "loss": 1.0773, + "step": 1770 + }, + { + "epoch": 0.3354850869339867, + "grad_norm": 0.2358550727367401, + "learning_rate": 1.9389931522021513e-05, + "loss": 1.1002, + "step": 1780 + }, + { + "epoch": 
0.33736983461339115, + "grad_norm": 0.23121659457683563, + "learning_rate": 1.9383125286269946e-05, + "loss": 1.0882, + "step": 1790 + }, + { + "epoch": 0.33925458229279554, + "grad_norm": 0.21421286463737488, + "learning_rate": 1.937628250197115e-05, + "loss": 1.1083, + "step": 1800 + }, + { + "epoch": 0.3411393299722, + "grad_norm": 0.2365601509809494, + "learning_rate": 1.9369403195778692e-05, + "loss": 1.0921, + "step": 1810 + }, + { + "epoch": 0.34302407765160436, + "grad_norm": 0.22509247064590454, + "learning_rate": 1.9362487394488405e-05, + "loss": 1.0677, + "step": 1820 + }, + { + "epoch": 0.3449088253310088, + "grad_norm": 0.2377094179391861, + "learning_rate": 1.935553512503828e-05, + "loss": 1.0796, + "step": 1830 + }, + { + "epoch": 0.34679357301041325, + "grad_norm": 0.23316587507724762, + "learning_rate": 1.9348546414508345e-05, + "loss": 1.0901, + "step": 1840 + }, + { + "epoch": 0.34867832068981763, + "grad_norm": 0.21752586960792542, + "learning_rate": 1.934152129012058e-05, + "loss": 1.0949, + "step": 1850 + }, + { + "epoch": 0.3505630683692221, + "grad_norm": 0.2217262238264084, + "learning_rate": 1.9334459779238796e-05, + "loss": 1.1102, + "step": 1860 + }, + { + "epoch": 0.3524478160486265, + "grad_norm": 0.2220594435930252, + "learning_rate": 1.9327361909368535e-05, + "loss": 1.0801, + "step": 1870 + }, + { + "epoch": 0.3543325637280309, + "grad_norm": 0.21926771104335785, + "learning_rate": 1.932022770815697e-05, + "loss": 1.1064, + "step": 1880 + }, + { + "epoch": 0.35621731140743534, + "grad_norm": 0.23664908111095428, + "learning_rate": 1.931305720339278e-05, + "loss": 1.0916, + "step": 1890 + }, + { + "epoch": 0.35810205908683973, + "grad_norm": 0.21648700535297394, + "learning_rate": 1.9305850423006056e-05, + "loss": 1.0906, + "step": 1900 + }, + { + "epoch": 0.35998680676624417, + "grad_norm": 0.24570511281490326, + "learning_rate": 1.9298607395068182e-05, + "loss": 1.0867, + "step": 1910 + }, + { + "epoch": 0.3618715544456486, + 
"grad_norm": 0.21339622139930725, + "learning_rate": 1.929132814779174e-05, + "loss": 1.0963, + "step": 1920 + }, + { + "epoch": 0.363756302125053, + "grad_norm": 0.22392895817756653, + "learning_rate": 1.928401270953039e-05, + "loss": 1.0881, + "step": 1930 + }, + { + "epoch": 0.36564104980445744, + "grad_norm": 0.2746802568435669, + "learning_rate": 1.927666110877875e-05, + "loss": 1.0872, + "step": 1940 + }, + { + "epoch": 0.3675257974838618, + "grad_norm": 0.22988495230674744, + "learning_rate": 1.9269273374172303e-05, + "loss": 1.0841, + "step": 1950 + }, + { + "epoch": 0.36941054516326627, + "grad_norm": 0.21574869751930237, + "learning_rate": 1.9261849534487286e-05, + "loss": 1.0758, + "step": 1960 + }, + { + "epoch": 0.3712952928426707, + "grad_norm": 0.22537797689437866, + "learning_rate": 1.925438961864056e-05, + "loss": 1.0773, + "step": 1970 + }, + { + "epoch": 0.3731800405220751, + "grad_norm": 0.22775666415691376, + "learning_rate": 1.9246893655689507e-05, + "loss": 1.0873, + "step": 1980 + }, + { + "epoch": 0.37506478820147954, + "grad_norm": 0.22897839546203613, + "learning_rate": 1.9239361674831922e-05, + "loss": 1.075, + "step": 1990 + }, + { + "epoch": 0.3769495358808839, + "grad_norm": 0.23395264148712158, + "learning_rate": 1.9231793705405893e-05, + "loss": 1.0849, + "step": 2000 + }, + { + "epoch": 0.37883428356028837, + "grad_norm": 0.23048214614391327, + "learning_rate": 1.922418977688969e-05, + "loss": 1.0718, + "step": 2010 + }, + { + "epoch": 0.3807190312396928, + "grad_norm": 0.2195880264043808, + "learning_rate": 1.921654991890165e-05, + "loss": 1.0406, + "step": 2020 + }, + { + "epoch": 0.3826037789190972, + "grad_norm": 0.2269481122493744, + "learning_rate": 1.9208874161200047e-05, + "loss": 1.0727, + "step": 2030 + }, + { + "epoch": 0.38448852659850163, + "grad_norm": 0.25090083479881287, + "learning_rate": 1.9201162533683013e-05, + "loss": 1.089, + "step": 2040 + }, + { + "epoch": 0.386373274277906, + "grad_norm": 0.251005619764328, 
+ "learning_rate": 1.9193415066388378e-05, + "loss": 1.0814, + "step": 2050 + }, + { + "epoch": 0.38825802195731046, + "grad_norm": 0.2105259895324707, + "learning_rate": 1.918563178949358e-05, + "loss": 1.0863, + "step": 2060 + }, + { + "epoch": 0.3901427696367149, + "grad_norm": 0.22367620468139648, + "learning_rate": 1.9177812733315532e-05, + "loss": 1.0546, + "step": 2070 + }, + { + "epoch": 0.3920275173161193, + "grad_norm": 0.22837083041667938, + "learning_rate": 1.9169957928310533e-05, + "loss": 1.0815, + "step": 2080 + }, + { + "epoch": 0.39391226499552373, + "grad_norm": 0.2286073863506317, + "learning_rate": 1.9162067405074107e-05, + "loss": 1.0767, + "step": 2090 + }, + { + "epoch": 0.3957970126749281, + "grad_norm": 0.2199411541223526, + "learning_rate": 1.9154141194340914e-05, + "loss": 1.0774, + "step": 2100 + }, + { + "epoch": 0.39768176035433256, + "grad_norm": 0.22925490140914917, + "learning_rate": 1.9146179326984624e-05, + "loss": 1.0633, + "step": 2110 + }, + { + "epoch": 0.399566508033737, + "grad_norm": 0.23518528044223785, + "learning_rate": 1.9138181834017792e-05, + "loss": 1.0778, + "step": 2120 + }, + { + "epoch": 0.4014512557131414, + "grad_norm": 0.2305455207824707, + "learning_rate": 1.9130148746591733e-05, + "loss": 1.0603, + "step": 2130 + }, + { + "epoch": 0.4033360033925458, + "grad_norm": 0.22571596503257751, + "learning_rate": 1.912208009599642e-05, + "loss": 1.0829, + "step": 2140 + }, + { + "epoch": 0.40522075107195027, + "grad_norm": 0.21482621133327484, + "learning_rate": 1.911397591366034e-05, + "loss": 1.0644, + "step": 2150 + }, + { + "epoch": 0.40710549875135466, + "grad_norm": 0.23023715615272522, + "learning_rate": 1.9105836231150383e-05, + "loss": 1.0817, + "step": 2160 + }, + { + "epoch": 0.4089902464307591, + "grad_norm": 0.225046306848526, + "learning_rate": 1.909766108017172e-05, + "loss": 1.0743, + "step": 2170 + }, + { + "epoch": 0.4108749941101635, + "grad_norm": 0.22448822855949402, + "learning_rate": 
1.9089450492567678e-05, + "loss": 1.0818, + "step": 2180 + }, + { + "epoch": 0.4127597417895679, + "grad_norm": 0.22508658468723297, + "learning_rate": 1.90812045003196e-05, + "loss": 1.0779, + "step": 2190 + }, + { + "epoch": 0.41464448946897237, + "grad_norm": 0.2228625863790512, + "learning_rate": 1.9072923135546754e-05, + "loss": 1.066, + "step": 2200 + }, + { + "epoch": 0.41652923714837675, + "grad_norm": 0.2219139039516449, + "learning_rate": 1.9064606430506176e-05, + "loss": 1.0605, + "step": 2210 + }, + { + "epoch": 0.4184139848277812, + "grad_norm": 0.2209967076778412, + "learning_rate": 1.9056254417592566e-05, + "loss": 1.0679, + "step": 2220 + }, + { + "epoch": 0.4202987325071856, + "grad_norm": 0.2256278693675995, + "learning_rate": 1.9047867129338144e-05, + "loss": 1.0775, + "step": 2230 + }, + { + "epoch": 0.42218348018659, + "grad_norm": 0.2144649624824524, + "learning_rate": 1.903944459841254e-05, + "loss": 1.0695, + "step": 2240 + }, + { + "epoch": 0.42406822786599446, + "grad_norm": 0.2666056454181671, + "learning_rate": 1.9030986857622654e-05, + "loss": 1.0625, + "step": 2250 + }, + { + "epoch": 0.42595297554539885, + "grad_norm": 0.22506961226463318, + "learning_rate": 1.9022493939912533e-05, + "loss": 1.0619, + "step": 2260 + }, + { + "epoch": 0.4278377232248033, + "grad_norm": 0.2494383007287979, + "learning_rate": 1.901396587836325e-05, + "loss": 1.0663, + "step": 2270 + }, + { + "epoch": 0.4297224709042077, + "grad_norm": 0.23654042184352875, + "learning_rate": 1.9005402706192762e-05, + "loss": 1.0741, + "step": 2280 + }, + { + "epoch": 0.4316072185836121, + "grad_norm": 0.21608759462833405, + "learning_rate": 1.8996804456755783e-05, + "loss": 1.0742, + "step": 2290 + }, + { + "epoch": 0.43349196626301656, + "grad_norm": 0.22069399058818817, + "learning_rate": 1.898817116354367e-05, + "loss": 1.0782, + "step": 2300 + }, + { + "epoch": 0.43537671394242095, + "grad_norm": 0.21386189758777618, + "learning_rate": 1.897950286018426e-05, + "loss": 
1.0809, + "step": 2310 + }, + { + "epoch": 0.4372614616218254, + "grad_norm": 0.2284873127937317, + "learning_rate": 1.8970799580441784e-05, + "loss": 1.0768, + "step": 2320 + }, + { + "epoch": 0.4391462093012298, + "grad_norm": 0.23891320824623108, + "learning_rate": 1.8962061358216694e-05, + "loss": 1.0764, + "step": 2330 + }, + { + "epoch": 0.4410309569806342, + "grad_norm": 0.4403688609600067, + "learning_rate": 1.895328822754555e-05, + "loss": 1.0922, + "step": 2340 + }, + { + "epoch": 0.44291570466003866, + "grad_norm": 0.23310713469982147, + "learning_rate": 1.894448022260089e-05, + "loss": 1.0591, + "step": 2350 + }, + { + "epoch": 0.44480045233944304, + "grad_norm": 0.21827839314937592, + "learning_rate": 1.8935637377691087e-05, + "loss": 1.0692, + "step": 2360 + }, + { + "epoch": 0.4466852000188475, + "grad_norm": 0.22945663332939148, + "learning_rate": 1.8926759727260226e-05, + "loss": 1.0908, + "step": 2370 + }, + { + "epoch": 0.44856994769825187, + "grad_norm": 0.22784274816513062, + "learning_rate": 1.891784730588796e-05, + "loss": 1.0686, + "step": 2380 + }, + { + "epoch": 0.4504546953776563, + "grad_norm": 0.22548478841781616, + "learning_rate": 1.8908900148289378e-05, + "loss": 1.0602, + "step": 2390 + }, + { + "epoch": 0.45233944305706075, + "grad_norm": 0.22366739809513092, + "learning_rate": 1.889991828931488e-05, + "loss": 1.0794, + "step": 2400 + }, + { + "epoch": 0.45422419073646514, + "grad_norm": 0.22350668907165527, + "learning_rate": 1.8890901763950018e-05, + "loss": 1.0809, + "step": 2410 + }, + { + "epoch": 0.4561089384158696, + "grad_norm": 0.21918301284313202, + "learning_rate": 1.888185060731539e-05, + "loss": 1.051, + "step": 2420 + }, + { + "epoch": 0.457993686095274, + "grad_norm": 0.21336427330970764, + "learning_rate": 1.8872764854666474e-05, + "loss": 1.072, + "step": 2430 + }, + { + "epoch": 0.4598784337746784, + "grad_norm": 0.21722489595413208, + "learning_rate": 1.8863644541393516e-05, + "loss": 1.0459, + "step": 2440 + }, 
+ { + "epoch": 0.46176318145408285, + "grad_norm": 0.22130219638347626, + "learning_rate": 1.8854489703021375e-05, + "loss": 1.061, + "step": 2450 + }, + { + "epoch": 0.46364792913348724, + "grad_norm": 0.248040109872818, + "learning_rate": 1.8845300375209382e-05, + "loss": 1.0583, + "step": 2460 + }, + { + "epoch": 0.4655326768128917, + "grad_norm": 0.22171063721179962, + "learning_rate": 1.8836076593751226e-05, + "loss": 1.0592, + "step": 2470 + }, + { + "epoch": 0.4674174244922961, + "grad_norm": 0.21777012944221497, + "learning_rate": 1.8826818394574788e-05, + "loss": 1.0595, + "step": 2480 + }, + { + "epoch": 0.4693021721717005, + "grad_norm": 0.22155867516994476, + "learning_rate": 1.8817525813742e-05, + "loss": 1.0422, + "step": 2490 + }, + { + "epoch": 0.47118691985110495, + "grad_norm": 0.24434182047843933, + "learning_rate": 1.8808198887448737e-05, + "loss": 1.0647, + "step": 2500 + }, + { + "epoch": 0.47307166753050933, + "grad_norm": 0.22356875240802765, + "learning_rate": 1.8798837652024643e-05, + "loss": 1.0651, + "step": 2510 + }, + { + "epoch": 0.4749564152099138, + "grad_norm": 0.21497099101543427, + "learning_rate": 1.8789442143932993e-05, + "loss": 1.0759, + "step": 2520 + }, + { + "epoch": 0.4768411628893182, + "grad_norm": 0.21545691788196564, + "learning_rate": 1.8780012399770575e-05, + "loss": 1.07, + "step": 2530 + }, + { + "epoch": 0.4787259105687226, + "grad_norm": 0.22562828660011292, + "learning_rate": 1.877054845626752e-05, + "loss": 1.0617, + "step": 2540 + }, + { + "epoch": 0.48061065824812704, + "grad_norm": 0.2323274314403534, + "learning_rate": 1.8761050350287178e-05, + "loss": 1.0642, + "step": 2550 + }, + { + "epoch": 0.48249540592753143, + "grad_norm": 0.21447454392910004, + "learning_rate": 1.8751518118825954e-05, + "loss": 1.0518, + "step": 2560 + }, + { + "epoch": 0.48438015360693587, + "grad_norm": 0.2292879819869995, + "learning_rate": 1.8741951799013186e-05, + "loss": 1.0567, + "step": 2570 + }, + { + "epoch": 
0.4862649012863403, + "grad_norm": 0.22435525059700012, + "learning_rate": 1.8732351428110993e-05, + "loss": 1.0551, + "step": 2580 + }, + { + "epoch": 0.4881496489657447, + "grad_norm": 0.22137649357318878, + "learning_rate": 1.8722717043514118e-05, + "loss": 1.066, + "step": 2590 + }, + { + "epoch": 0.49003439664514914, + "grad_norm": 0.23160088062286377, + "learning_rate": 1.8713048682749802e-05, + "loss": 1.0504, + "step": 2600 + }, + { + "epoch": 0.4919191443245535, + "grad_norm": 0.24425852298736572, + "learning_rate": 1.870334638347762e-05, + "loss": 1.0594, + "step": 2610 + }, + { + "epoch": 0.49380389200395797, + "grad_norm": 0.2133430391550064, + "learning_rate": 1.8693610183489346e-05, + "loss": 1.0546, + "step": 2620 + }, + { + "epoch": 0.4956886396833624, + "grad_norm": 0.22564521431922913, + "learning_rate": 1.8683840120708804e-05, + "loss": 1.0409, + "step": 2630 + }, + { + "epoch": 0.4975733873627668, + "grad_norm": 0.22771432995796204, + "learning_rate": 1.867403623319171e-05, + "loss": 1.0731, + "step": 2640 + }, + { + "epoch": 0.49945813504217124, + "grad_norm": 0.21632668375968933, + "learning_rate": 1.866419855912554e-05, + "loss": 1.0638, + "step": 2650 + }, + { + "epoch": 0.5013428827215757, + "grad_norm": 0.22975364327430725, + "learning_rate": 1.8654327136829376e-05, + "loss": 1.0469, + "step": 2660 + }, + { + "epoch": 0.5032276304009801, + "grad_norm": 0.21563822031021118, + "learning_rate": 1.8644422004753736e-05, + "loss": 1.0754, + "step": 2670 + }, + { + "epoch": 0.5051123780803844, + "grad_norm": 0.2241036593914032, + "learning_rate": 1.8634483201480467e-05, + "loss": 1.0644, + "step": 2680 + }, + { + "epoch": 0.506997125759789, + "grad_norm": 0.23674558103084564, + "learning_rate": 1.862451076572255e-05, + "loss": 1.0649, + "step": 2690 + }, + { + "epoch": 0.5088818734391933, + "grad_norm": 0.21504618227481842, + "learning_rate": 1.8614504736323976e-05, + "loss": 1.0497, + "step": 2700 + }, + { + "epoch": 0.5107666211185977, + 
"grad_norm": 0.21865041553974152, + "learning_rate": 1.8604465152259595e-05, + "loss": 1.0483, + "step": 2710 + }, + { + "epoch": 0.5126513687980022, + "grad_norm": 0.22045278549194336, + "learning_rate": 1.8594392052634943e-05, + "loss": 1.0557, + "step": 2720 + }, + { + "epoch": 0.5145361164774066, + "grad_norm": 0.22401635348796844, + "learning_rate": 1.8584285476686117e-05, + "loss": 1.0651, + "step": 2730 + }, + { + "epoch": 0.516420864156811, + "grad_norm": 0.22248347103595734, + "learning_rate": 1.8574145463779597e-05, + "loss": 1.0654, + "step": 2740 + }, + { + "epoch": 0.5183056118362154, + "grad_norm": 0.2244492769241333, + "learning_rate": 1.8563972053412116e-05, + "loss": 1.0301, + "step": 2750 + }, + { + "epoch": 0.5201903595156199, + "grad_norm": 0.21923644840717316, + "learning_rate": 1.8553765285210487e-05, + "loss": 1.079, + "step": 2760 + }, + { + "epoch": 0.5220751071950243, + "grad_norm": 0.3092012107372284, + "learning_rate": 1.8543525198931456e-05, + "loss": 1.0392, + "step": 2770 + }, + { + "epoch": 0.5239598548744286, + "grad_norm": 0.21812060475349426, + "learning_rate": 1.853325183446155e-05, + "loss": 1.0583, + "step": 2780 + }, + { + "epoch": 0.5258446025538331, + "grad_norm": 0.2504223883152008, + "learning_rate": 1.852294523181692e-05, + "loss": 1.0562, + "step": 2790 + }, + { + "epoch": 0.5277293502332375, + "grad_norm": 0.22639353573322296, + "learning_rate": 1.8512605431143185e-05, + "loss": 1.0658, + "step": 2800 + }, + { + "epoch": 0.5296140979126419, + "grad_norm": 0.21415212750434875, + "learning_rate": 1.850223247271527e-05, + "loss": 1.0632, + "step": 2810 + }, + { + "epoch": 0.5314988455920464, + "grad_norm": 0.21469293534755707, + "learning_rate": 1.8491826396937257e-05, + "loss": 1.04, + "step": 2820 + }, + { + "epoch": 0.5333835932714508, + "grad_norm": 0.2169499695301056, + "learning_rate": 1.8481387244342222e-05, + "loss": 1.0672, + "step": 2830 + }, + { + "epoch": 0.5352683409508552, + "grad_norm": 0.22114944458007812, 
+ "learning_rate": 1.847091505559209e-05, + "loss": 1.0447, + "step": 2840 + }, + { + "epoch": 0.5371530886302597, + "grad_norm": 0.21896474063396454, + "learning_rate": 1.846040987147745e-05, + "loss": 1.0442, + "step": 2850 + }, + { + "epoch": 0.5390378363096641, + "grad_norm": 0.21406564116477966, + "learning_rate": 1.8449871732917424e-05, + "loss": 1.062, + "step": 2860 + }, + { + "epoch": 0.5409225839890685, + "grad_norm": 0.22089830040931702, + "learning_rate": 1.843930068095949e-05, + "loss": 1.0594, + "step": 2870 + }, + { + "epoch": 0.5428073316684728, + "grad_norm": 0.2243814915418625, + "learning_rate": 1.842869675677934e-05, + "loss": 1.0484, + "step": 2880 + }, + { + "epoch": 0.5446920793478773, + "grad_norm": 0.2168908268213272, + "learning_rate": 1.841806000168069e-05, + "loss": 1.0362, + "step": 2890 + }, + { + "epoch": 0.5465768270272817, + "grad_norm": 0.2150019407272339, + "learning_rate": 1.8407390457095145e-05, + "loss": 1.0328, + "step": 2900 + }, + { + "epoch": 0.5484615747066861, + "grad_norm": 0.20865511894226074, + "learning_rate": 1.8396688164582036e-05, + "loss": 1.0508, + "step": 2910 + }, + { + "epoch": 0.5503463223860906, + "grad_norm": 0.9704879522323608, + "learning_rate": 1.8385953165828242e-05, + "loss": 1.0274, + "step": 2920 + }, + { + "epoch": 0.552231070065495, + "grad_norm": 0.21843120455741882, + "learning_rate": 1.837518550264804e-05, + "loss": 1.0486, + "step": 2930 + }, + { + "epoch": 0.5541158177448994, + "grad_norm": 0.21802903711795807, + "learning_rate": 1.836438521698294e-05, + "loss": 1.0636, + "step": 2940 + }, + { + "epoch": 0.5560005654243039, + "grad_norm": 0.22014521062374115, + "learning_rate": 1.835355235090153e-05, + "loss": 1.0533, + "step": 2950 + }, + { + "epoch": 0.5578853131037083, + "grad_norm": 0.22617284953594208, + "learning_rate": 1.8342686946599283e-05, + "loss": 1.05, + "step": 2960 + }, + { + "epoch": 0.5597700607831126, + "grad_norm": 0.22925904393196106, + "learning_rate": 
1.8331789046398428e-05, + "loss": 1.0602, + "step": 2970 + }, + { + "epoch": 0.561654808462517, + "grad_norm": 0.22886398434638977, + "learning_rate": 1.832085869274777e-05, + "loss": 1.0498, + "step": 2980 + }, + { + "epoch": 0.5635395561419215, + "grad_norm": 0.2270791083574295, + "learning_rate": 1.830989592822252e-05, + "loss": 1.0376, + "step": 2990 + }, + { + "epoch": 0.5654243038213259, + "grad_norm": 0.21975870430469513, + "learning_rate": 1.829890079552413e-05, + "loss": 1.0504, + "step": 3000 + }, + { + "epoch": 0.5673090515007303, + "grad_norm": 0.2233470380306244, + "learning_rate": 1.828787333748014e-05, + "loss": 1.0462, + "step": 3010 + }, + { + "epoch": 0.5691937991801348, + "grad_norm": 0.20786228775978088, + "learning_rate": 1.8276813597043993e-05, + "loss": 1.054, + "step": 3020 + }, + { + "epoch": 0.5710785468595392, + "grad_norm": 0.2267577052116394, + "learning_rate": 1.8265721617294882e-05, + "loss": 1.0458, + "step": 3030 + }, + { + "epoch": 0.5729632945389436, + "grad_norm": 0.221834197640419, + "learning_rate": 1.8254597441437575e-05, + "loss": 1.0486, + "step": 3040 + }, + { + "epoch": 0.5748480422183481, + "grad_norm": 0.2116076648235321, + "learning_rate": 1.824344111280224e-05, + "loss": 1.0387, + "step": 3050 + }, + { + "epoch": 0.5767327898977525, + "grad_norm": 0.2096250206232071, + "learning_rate": 1.8232252674844305e-05, + "loss": 1.0554, + "step": 3060 + }, + { + "epoch": 0.5786175375771568, + "grad_norm": 0.22298020124435425, + "learning_rate": 1.822103217114425e-05, + "loss": 1.0656, + "step": 3070 + }, + { + "epoch": 0.5805022852565612, + "grad_norm": 0.22350934147834778, + "learning_rate": 1.8209779645407456e-05, + "loss": 1.0529, + "step": 3080 + }, + { + "epoch": 0.5823870329359657, + "grad_norm": 0.24499738216400146, + "learning_rate": 1.8198495141464045e-05, + "loss": 1.0449, + "step": 3090 + }, + { + "epoch": 0.5842717806153701, + "grad_norm": 0.21206438541412354, + "learning_rate": 1.8187178703268685e-05, + "loss": 
1.0394, + "step": 3100 + }, + { + "epoch": 0.5861565282947745, + "grad_norm": 0.22638669610023499, + "learning_rate": 1.8175830374900443e-05, + "loss": 1.0485, + "step": 3110 + }, + { + "epoch": 0.588041275974179, + "grad_norm": 0.21643531322479248, + "learning_rate": 1.8164450200562608e-05, + "loss": 1.0504, + "step": 3120 + }, + { + "epoch": 0.5899260236535834, + "grad_norm": 0.21274417638778687, + "learning_rate": 1.8153038224582493e-05, + "loss": 1.0285, + "step": 3130 + }, + { + "epoch": 0.5918107713329878, + "grad_norm": 0.21556533873081207, + "learning_rate": 1.8141594491411297e-05, + "loss": 1.019, + "step": 3140 + }, + { + "epoch": 0.5936955190123923, + "grad_norm": 0.2211538553237915, + "learning_rate": 1.813011904562392e-05, + "loss": 1.0485, + "step": 3150 + }, + { + "epoch": 0.5955802666917966, + "grad_norm": 0.2195790857076645, + "learning_rate": 1.8118611931918774e-05, + "loss": 1.0159, + "step": 3160 + }, + { + "epoch": 0.597465014371201, + "grad_norm": 0.21492570638656616, + "learning_rate": 1.8107073195117632e-05, + "loss": 1.0514, + "step": 3170 + }, + { + "epoch": 0.5993497620506055, + "grad_norm": 0.21749347448349, + "learning_rate": 1.8095502880165443e-05, + "loss": 1.0416, + "step": 3180 + }, + { + "epoch": 0.6012345097300099, + "grad_norm": 0.2222120463848114, + "learning_rate": 1.8083901032130153e-05, + "loss": 1.0254, + "step": 3190 + }, + { + "epoch": 0.6031192574094143, + "grad_norm": 0.21424059569835663, + "learning_rate": 1.807226769620254e-05, + "loss": 1.0552, + "step": 3200 + }, + { + "epoch": 0.6050040050888187, + "grad_norm": 0.21407698094844818, + "learning_rate": 1.806060291769602e-05, + "loss": 1.0486, + "step": 3210 + }, + { + "epoch": 0.6068887527682232, + "grad_norm": 0.2170197069644928, + "learning_rate": 1.8048906742046496e-05, + "loss": 1.0518, + "step": 3220 + }, + { + "epoch": 0.6087735004476276, + "grad_norm": 0.22323155403137207, + "learning_rate": 1.8037179214812156e-05, + "loss": 1.0494, + "step": 3230 + }, + { + 
"epoch": 0.610658248127032, + "grad_norm": 0.2206757664680481, + "learning_rate": 1.8025420381673314e-05, + "loss": 1.0594, + "step": 3240 + }, + { + "epoch": 0.6125429958064365, + "grad_norm": 0.21856994926929474, + "learning_rate": 1.801363028843222e-05, + "loss": 1.0495, + "step": 3250 + }, + { + "epoch": 0.6144277434858408, + "grad_norm": 0.22009429335594177, + "learning_rate": 1.8001808981012888e-05, + "loss": 1.0545, + "step": 3260 + }, + { + "epoch": 0.6163124911652452, + "grad_norm": 0.22052286565303802, + "learning_rate": 1.798995650546092e-05, + "loss": 1.0455, + "step": 3270 + }, + { + "epoch": 0.6181972388446497, + "grad_norm": 0.21965639293193817, + "learning_rate": 1.797807290794331e-05, + "loss": 1.0499, + "step": 3280 + }, + { + "epoch": 0.6200819865240541, + "grad_norm": 0.22027024626731873, + "learning_rate": 1.7966158234748296e-05, + "loss": 1.0363, + "step": 3290 + }, + { + "epoch": 0.6219667342034585, + "grad_norm": 0.21696113049983978, + "learning_rate": 1.7954212532285134e-05, + "loss": 1.0424, + "step": 3300 + }, + { + "epoch": 0.6238514818828629, + "grad_norm": 0.22434957325458527, + "learning_rate": 1.7942235847083967e-05, + "loss": 1.0113, + "step": 3310 + }, + { + "epoch": 0.6257362295622674, + "grad_norm": 0.21257033944129944, + "learning_rate": 1.793022822579561e-05, + "loss": 1.0285, + "step": 3320 + }, + { + "epoch": 0.6276209772416718, + "grad_norm": 0.22205692529678345, + "learning_rate": 1.7918189715191375e-05, + "loss": 1.0408, + "step": 3330 + }, + { + "epoch": 0.6295057249210761, + "grad_norm": 0.21635259687900543, + "learning_rate": 1.7906120362162895e-05, + "loss": 1.0407, + "step": 3340 + }, + { + "epoch": 0.6313904726004806, + "grad_norm": 0.21145284175872803, + "learning_rate": 1.789402021372194e-05, + "loss": 1.0312, + "step": 3350 + }, + { + "epoch": 0.633275220279885, + "grad_norm": 0.2200097292661667, + "learning_rate": 1.7881889317000236e-05, + "loss": 1.0237, + "step": 3360 + }, + { + "epoch": 0.6351599679592894, + 
"grad_norm": 0.222577303647995, + "learning_rate": 1.786972771924927e-05, + "loss": 1.0499, + "step": 3370 + }, + { + "epoch": 0.6370447156386939, + "grad_norm": 0.21570006012916565, + "learning_rate": 1.7857535467840114e-05, + "loss": 1.0217, + "step": 3380 + }, + { + "epoch": 0.6389294633180983, + "grad_norm": 0.2192840576171875, + "learning_rate": 1.7845312610263243e-05, + "loss": 1.0139, + "step": 3390 + }, + { + "epoch": 0.6408142109975027, + "grad_norm": 0.2202545702457428, + "learning_rate": 1.7833059194128352e-05, + "loss": 1.0402, + "step": 3400 + }, + { + "epoch": 0.6426989586769072, + "grad_norm": 0.22077195346355438, + "learning_rate": 1.7820775267164158e-05, + "loss": 1.0507, + "step": 3410 + }, + { + "epoch": 0.6445837063563116, + "grad_norm": 0.21460367739200592, + "learning_rate": 1.7808460877218228e-05, + "loss": 1.0382, + "step": 3420 + }, + { + "epoch": 0.646468454035716, + "grad_norm": 0.2385546714067459, + "learning_rate": 1.7796116072256773e-05, + "loss": 1.0314, + "step": 3430 + }, + { + "epoch": 0.6483532017151203, + "grad_norm": 0.2198946475982666, + "learning_rate": 1.77837409003645e-05, + "loss": 1.0392, + "step": 3440 + }, + { + "epoch": 0.6502379493945248, + "grad_norm": 0.21367612481117249, + "learning_rate": 1.777133540974437e-05, + "loss": 1.0574, + "step": 3450 + }, + { + "epoch": 0.6521226970739292, + "grad_norm": 0.2183675616979599, + "learning_rate": 1.7758899648717463e-05, + "loss": 1.0273, + "step": 3460 + }, + { + "epoch": 0.6540074447533336, + "grad_norm": 0.23218151926994324, + "learning_rate": 1.774643366572275e-05, + "loss": 1.049, + "step": 3470 + }, + { + "epoch": 0.6558921924327381, + "grad_norm": 0.22189924120903015, + "learning_rate": 1.7733937509316934e-05, + "loss": 1.0188, + "step": 3480 + }, + { + "epoch": 0.6577769401121425, + "grad_norm": 0.2215665578842163, + "learning_rate": 1.772141122817424e-05, + "loss": 1.023, + "step": 3490 + }, + { + "epoch": 0.6596616877915469, + "grad_norm": 0.21426072716712952, + 
"learning_rate": 1.7708854871086233e-05, + "loss": 1.0221, + "step": 3500 + }, + { + "epoch": 0.6615464354709514, + "grad_norm": 0.21811255812644958, + "learning_rate": 1.7696268486961624e-05, + "loss": 1.0378, + "step": 3510 + }, + { + "epoch": 0.6634311831503558, + "grad_norm": 0.21636582911014557, + "learning_rate": 1.7683652124826097e-05, + "loss": 1.0346, + "step": 3520 + }, + { + "epoch": 0.6653159308297601, + "grad_norm": 0.21975700557231903, + "learning_rate": 1.767100583382209e-05, + "loss": 1.0415, + "step": 3530 + }, + { + "epoch": 0.6672006785091645, + "grad_norm": 0.2215617597103119, + "learning_rate": 1.7658329663208626e-05, + "loss": 1.0229, + "step": 3540 + }, + { + "epoch": 0.669085426188569, + "grad_norm": 0.21243301033973694, + "learning_rate": 1.7645623662361112e-05, + "loss": 1.0281, + "step": 3550 + }, + { + "epoch": 0.6709701738679734, + "grad_norm": 0.20586301386356354, + "learning_rate": 1.7632887880771147e-05, + "loss": 1.0451, + "step": 3560 + }, + { + "epoch": 0.6728549215473778, + "grad_norm": 0.22428378462791443, + "learning_rate": 1.7620122368046323e-05, + "loss": 1.041, + "step": 3570 + }, + { + "epoch": 0.6747396692267823, + "grad_norm": 0.23330065608024597, + "learning_rate": 1.7607327173910054e-05, + "loss": 1.0398, + "step": 3580 + }, + { + "epoch": 0.6766244169061867, + "grad_norm": 0.21890075504779816, + "learning_rate": 1.7594502348201358e-05, + "loss": 1.0226, + "step": 3590 + }, + { + "epoch": 0.6785091645855911, + "grad_norm": 0.23604132235050201, + "learning_rate": 1.758164794087467e-05, + "loss": 1.0166, + "step": 3600 + }, + { + "epoch": 0.6803939122649956, + "grad_norm": 0.21405720710754395, + "learning_rate": 1.756876400199965e-05, + "loss": 1.0425, + "step": 3610 + }, + { + "epoch": 0.6822786599444, + "grad_norm": 0.22015635669231415, + "learning_rate": 1.755585058176099e-05, + "loss": 1.0249, + "step": 3620 + }, + { + "epoch": 0.6841634076238043, + "grad_norm": 0.22019772231578827, + "learning_rate": 
1.7542907730458222e-05, + "loss": 1.0181, + "step": 3630 + }, + { + "epoch": 0.6860481553032087, + "grad_norm": 0.21981114149093628, + "learning_rate": 1.7529935498505505e-05, + "loss": 1.0365, + "step": 3640 + }, + { + "epoch": 0.6879329029826132, + "grad_norm": 0.22304756939411163, + "learning_rate": 1.7516933936431446e-05, + "loss": 1.0186, + "step": 3650 + }, + { + "epoch": 0.6898176506620176, + "grad_norm": 0.21750465035438538, + "learning_rate": 1.750390309487889e-05, + "loss": 1.027, + "step": 3660 + }, + { + "epoch": 0.691702398341422, + "grad_norm": 0.2187330275774002, + "learning_rate": 1.749084302460474e-05, + "loss": 1.0214, + "step": 3670 + }, + { + "epoch": 0.6935871460208265, + "grad_norm": 0.21320421993732452, + "learning_rate": 1.7477753776479743e-05, + "loss": 1.0229, + "step": 3680 + }, + { + "epoch": 0.6954718937002309, + "grad_norm": 0.21425755321979523, + "learning_rate": 1.7464635401488292e-05, + "loss": 1.0251, + "step": 3690 + }, + { + "epoch": 0.6973566413796353, + "grad_norm": 0.2125249207019806, + "learning_rate": 1.7451487950728247e-05, + "loss": 1.0378, + "step": 3700 + }, + { + "epoch": 0.6992413890590398, + "grad_norm": 0.2168605923652649, + "learning_rate": 1.74383114754107e-05, + "loss": 1.0157, + "step": 3710 + }, + { + "epoch": 0.7011261367384442, + "grad_norm": 0.23292389512062073, + "learning_rate": 1.742510602685983e-05, + "loss": 1.0232, + "step": 3720 + }, + { + "epoch": 0.7030108844178485, + "grad_norm": 0.2234078347682953, + "learning_rate": 1.7411871656512648e-05, + "loss": 1.0342, + "step": 3730 + }, + { + "epoch": 0.704895632097253, + "grad_norm": 0.21237532794475555, + "learning_rate": 1.7398608415918817e-05, + "loss": 1.0347, + "step": 3740 + }, + { + "epoch": 0.7067803797766574, + "grad_norm": 0.21920648217201233, + "learning_rate": 1.7385316356740463e-05, + "loss": 1.0225, + "step": 3750 + }, + { + "epoch": 0.7086651274560618, + "grad_norm": 0.21557724475860596, + "learning_rate": 1.737199553075196e-05, + "loss": 
1.0476, + "step": 3760 + }, + { + "epoch": 0.7105498751354662, + "grad_norm": 0.2130526751279831, + "learning_rate": 1.735864598983974e-05, + "loss": 1.0342, + "step": 3770 + }, + { + "epoch": 0.7124346228148707, + "grad_norm": 0.21980445086956024, + "learning_rate": 1.7345267786002073e-05, + "loss": 1.0475, + "step": 3780 + }, + { + "epoch": 0.7143193704942751, + "grad_norm": 0.21147915720939636, + "learning_rate": 1.733186097134888e-05, + "loss": 1.0387, + "step": 3790 + }, + { + "epoch": 0.7162041181736795, + "grad_norm": 0.22808130085468292, + "learning_rate": 1.7318425598101526e-05, + "loss": 1.0402, + "step": 3800 + }, + { + "epoch": 0.718088865853084, + "grad_norm": 0.20858605206012726, + "learning_rate": 1.7304961718592613e-05, + "loss": 1.0177, + "step": 3810 + }, + { + "epoch": 0.7199736135324883, + "grad_norm": 0.21343304216861725, + "learning_rate": 1.729146938526578e-05, + "loss": 1.0124, + "step": 3820 + }, + { + "epoch": 0.7218583612118927, + "grad_norm": 0.22087880969047546, + "learning_rate": 1.72779486506755e-05, + "loss": 1.0081, + "step": 3830 + }, + { + "epoch": 0.7237431088912972, + "grad_norm": 0.2203933447599411, + "learning_rate": 1.726439956748687e-05, + "loss": 1.0249, + "step": 3840 + }, + { + "epoch": 0.7256278565707016, + "grad_norm": 0.20899949967861176, + "learning_rate": 1.7250822188475406e-05, + "loss": 1.0267, + "step": 3850 + }, + { + "epoch": 0.727512604250106, + "grad_norm": 0.21876177191734314, + "learning_rate": 1.7237216566526845e-05, + "loss": 1.0308, + "step": 3860 + }, + { + "epoch": 0.7293973519295104, + "grad_norm": 0.2183823585510254, + "learning_rate": 1.7223582754636936e-05, + "loss": 1.0446, + "step": 3870 + }, + { + "epoch": 0.7312820996089149, + "grad_norm": 0.22745539247989655, + "learning_rate": 1.7209920805911223e-05, + "loss": 1.0057, + "step": 3880 + }, + { + "epoch": 0.7331668472883193, + "grad_norm": 0.21709993481636047, + "learning_rate": 1.719623077356485e-05, + "loss": 1.0234, + "step": 3890 + }, + { + 
"epoch": 0.7350515949677237, + "grad_norm": 0.20493055880069733, + "learning_rate": 1.7182512710922352e-05, + "loss": 1.0528, + "step": 3900 + }, + { + "epoch": 0.7369363426471282, + "grad_norm": 0.21471171081066132, + "learning_rate": 1.716876667141745e-05, + "loss": 1.0187, + "step": 3910 + }, + { + "epoch": 0.7388210903265325, + "grad_norm": 0.20834101736545563, + "learning_rate": 1.7154992708592838e-05, + "loss": 1.0108, + "step": 3920 + }, + { + "epoch": 0.7407058380059369, + "grad_norm": 0.215990349650383, + "learning_rate": 1.7141190876099964e-05, + "loss": 0.9974, + "step": 3930 + }, + { + "epoch": 0.7425905856853414, + "grad_norm": 0.21805961430072784, + "learning_rate": 1.7127361227698843e-05, + "loss": 1.0496, + "step": 3940 + }, + { + "epoch": 0.7444753333647458, + "grad_norm": 0.20897091925144196, + "learning_rate": 1.711350381725783e-05, + "loss": 1.0383, + "step": 3950 + }, + { + "epoch": 0.7463600810441502, + "grad_norm": 0.2209184616804123, + "learning_rate": 1.7099618698753427e-05, + "loss": 1.0365, + "step": 3960 + }, + { + "epoch": 0.7482448287235547, + "grad_norm": 0.2159198820590973, + "learning_rate": 1.708570592627006e-05, + "loss": 1.0085, + "step": 3970 + }, + { + "epoch": 0.7501295764029591, + "grad_norm": 0.2072182446718216, + "learning_rate": 1.707176555399985e-05, + "loss": 1.0379, + "step": 3980 + }, + { + "epoch": 0.7520143240823635, + "grad_norm": 0.20915807783603668, + "learning_rate": 1.7057797636242452e-05, + "loss": 1.0222, + "step": 3990 + }, + { + "epoch": 0.7538990717617678, + "grad_norm": 0.21903884410858154, + "learning_rate": 1.7043802227404805e-05, + "loss": 1.0261, + "step": 4000 + }, + { + "epoch": 0.7557838194411723, + "grad_norm": 0.21355722844600677, + "learning_rate": 1.702977938200092e-05, + "loss": 1.0188, + "step": 4010 + }, + { + "epoch": 0.7576685671205767, + "grad_norm": 0.21749593317508698, + "learning_rate": 1.701572915465169e-05, + "loss": 1.0264, + "step": 4020 + }, + { + "epoch": 0.7595533147999811, + 
"grad_norm": 0.2184886634349823, + "learning_rate": 1.700165160008465e-05, + "loss": 1.0543, + "step": 4030 + }, + { + "epoch": 0.7614380624793856, + "grad_norm": 0.2109834849834442, + "learning_rate": 1.6987546773133797e-05, + "loss": 1.0206, + "step": 4040 + }, + { + "epoch": 0.76332281015879, + "grad_norm": 0.21594549715518951, + "learning_rate": 1.6973414728739342e-05, + "loss": 1.0176, + "step": 4050 + }, + { + "epoch": 0.7652075578381944, + "grad_norm": 0.2345925122499466, + "learning_rate": 1.695925552194752e-05, + "loss": 1.0189, + "step": 4060 + }, + { + "epoch": 0.7670923055175989, + "grad_norm": 0.22583965957164764, + "learning_rate": 1.694506920791036e-05, + "loss": 1.0204, + "step": 4070 + }, + { + "epoch": 0.7689770531970033, + "grad_norm": 0.22112563252449036, + "learning_rate": 1.693085584188548e-05, + "loss": 1.0016, + "step": 4080 + }, + { + "epoch": 0.7708618008764077, + "grad_norm": 0.22002318501472473, + "learning_rate": 1.6916615479235884e-05, + "loss": 1.0294, + "step": 4090 + }, + { + "epoch": 0.772746548555812, + "grad_norm": 0.21532638370990753, + "learning_rate": 1.6902348175429706e-05, + "loss": 1.0297, + "step": 4100 + }, + { + "epoch": 0.7746312962352165, + "grad_norm": 0.2174837738275528, + "learning_rate": 1.6888053986040032e-05, + "loss": 1.0233, + "step": 4110 + }, + { + "epoch": 0.7765160439146209, + "grad_norm": 0.2098788470029831, + "learning_rate": 1.6873732966744678e-05, + "loss": 1.0373, + "step": 4120 + }, + { + "epoch": 0.7784007915940253, + "grad_norm": 0.20841015875339508, + "learning_rate": 1.6859385173325952e-05, + "loss": 1.0174, + "step": 4130 + }, + { + "epoch": 0.7802855392734298, + "grad_norm": 0.2148674577474594, + "learning_rate": 1.6845010661670462e-05, + "loss": 0.9996, + "step": 4140 + }, + { + "epoch": 0.7821702869528342, + "grad_norm": 0.21161173284053802, + "learning_rate": 1.6830609487768882e-05, + "loss": 0.9921, + "step": 4150 + }, + { + "epoch": 0.7840550346322386, + "grad_norm": 0.2190217673778534, + 
"learning_rate": 1.6816181707715738e-05, + "loss": 1.0201, + "step": 4160 + }, + { + "epoch": 0.7859397823116431, + "grad_norm": 0.2249136120080948, + "learning_rate": 1.6801727377709195e-05, + "loss": 1.0327, + "step": 4170 + }, + { + "epoch": 0.7878245299910475, + "grad_norm": 0.2299618124961853, + "learning_rate": 1.678724655405083e-05, + "loss": 1.0249, + "step": 4180 + }, + { + "epoch": 0.7897092776704518, + "grad_norm": 0.21398840844631195, + "learning_rate": 1.677273929314542e-05, + "loss": 1.0276, + "step": 4190 + }, + { + "epoch": 0.7915940253498562, + "grad_norm": 0.219278484582901, + "learning_rate": 1.6758205651500716e-05, + "loss": 1.0196, + "step": 4200 + }, + { + "epoch": 0.7934787730292607, + "grad_norm": 0.21358928084373474, + "learning_rate": 1.674364568572722e-05, + "loss": 1.0075, + "step": 4210 + }, + { + "epoch": 0.7953635207086651, + "grad_norm": 0.22888432443141937, + "learning_rate": 1.6729059452537984e-05, + "loss": 1.0204, + "step": 4220 + }, + { + "epoch": 0.7972482683880695, + "grad_norm": 0.21871310472488403, + "learning_rate": 1.6714447008748365e-05, + "loss": 1.005, + "step": 4230 + }, + { + "epoch": 0.799133016067474, + "grad_norm": 0.22267279028892517, + "learning_rate": 1.669980841127581e-05, + "loss": 1.0257, + "step": 4240 + }, + { + "epoch": 0.8010177637468784, + "grad_norm": 0.22739073634147644, + "learning_rate": 1.6685143717139654e-05, + "loss": 1.0141, + "step": 4250 + }, + { + "epoch": 0.8029025114262828, + "grad_norm": 0.21348080039024353, + "learning_rate": 1.6670452983460866e-05, + "loss": 1.0071, + "step": 4260 + }, + { + "epoch": 0.8047872591056873, + "grad_norm": 0.22079524397850037, + "learning_rate": 1.6655736267461846e-05, + "loss": 1.0168, + "step": 4270 + }, + { + "epoch": 0.8066720067850917, + "grad_norm": 0.22130829095840454, + "learning_rate": 1.6640993626466208e-05, + "loss": 1.0261, + "step": 4280 + }, + { + "epoch": 0.808556754464496, + "grad_norm": 0.22089999914169312, + "learning_rate": 
1.6626225117898535e-05, + "loss": 1.0385, + "step": 4290 + }, + { + "epoch": 0.8104415021439005, + "grad_norm": 0.2258572280406952, + "learning_rate": 1.661143079928418e-05, + "loss": 1.0211, + "step": 4300 + }, + { + "epoch": 0.8123262498233049, + "grad_norm": 0.20995450019836426, + "learning_rate": 1.6596610728249015e-05, + "loss": 1.0015, + "step": 4310 + }, + { + "epoch": 0.8142109975027093, + "grad_norm": 0.21556559205055237, + "learning_rate": 1.6581764962519235e-05, + "loss": 1.029, + "step": 4320 + }, + { + "epoch": 0.8160957451821137, + "grad_norm": 0.23006987571716309, + "learning_rate": 1.656689355992111e-05, + "loss": 1.0299, + "step": 4330 + }, + { + "epoch": 0.8179804928615182, + "grad_norm": 0.22498996555805206, + "learning_rate": 1.655199657838078e-05, + "loss": 1.0191, + "step": 4340 + }, + { + "epoch": 0.8198652405409226, + "grad_norm": 0.22021719813346863, + "learning_rate": 1.6537074075924e-05, + "loss": 1.0053, + "step": 4350 + }, + { + "epoch": 0.821749988220327, + "grad_norm": 0.22088730335235596, + "learning_rate": 1.6522126110675954e-05, + "loss": 1.0215, + "step": 4360 + }, + { + "epoch": 0.8236347358997315, + "grad_norm": 0.21371595561504364, + "learning_rate": 1.650715274086099e-05, + "loss": 1.0037, + "step": 4370 + }, + { + "epoch": 0.8255194835791358, + "grad_norm": 0.220917746424675, + "learning_rate": 1.649215402480242e-05, + "loss": 1.006, + "step": 4380 + }, + { + "epoch": 0.8274042312585402, + "grad_norm": 0.23192252218723297, + "learning_rate": 1.6477130020922277e-05, + "loss": 1.001, + "step": 4390 + }, + { + "epoch": 0.8292889789379447, + "grad_norm": 0.2321854829788208, + "learning_rate": 1.6462080787741103e-05, + "loss": 1.0007, + "step": 4400 + }, + { + "epoch": 0.8311737266173491, + "grad_norm": 0.2218000292778015, + "learning_rate": 1.6447006383877697e-05, + "loss": 1.0253, + "step": 4410 + }, + { + "epoch": 0.8330584742967535, + "grad_norm": 0.21392178535461426, + "learning_rate": 1.643190686804891e-05, + "loss": 0.9973, 
+ "step": 4420 + }, + { + "epoch": 0.8349432219761579, + "grad_norm": 0.21729114651679993, + "learning_rate": 1.6416782299069413e-05, + "loss": 1.0189, + "step": 4430 + }, + { + "epoch": 0.8368279696555624, + "grad_norm": 0.21560348570346832, + "learning_rate": 1.640163273585145e-05, + "loss": 1.0091, + "step": 4440 + }, + { + "epoch": 0.8387127173349668, + "grad_norm": 0.21741706132888794, + "learning_rate": 1.638645823740463e-05, + "loss": 1.0083, + "step": 4450 + }, + { + "epoch": 0.8405974650143712, + "grad_norm": 0.21971356868743896, + "learning_rate": 1.637125886283568e-05, + "loss": 0.9963, + "step": 4460 + }, + { + "epoch": 0.8424822126937757, + "grad_norm": 0.21592329442501068, + "learning_rate": 1.635603467134824e-05, + "loss": 0.9938, + "step": 4470 + }, + { + "epoch": 0.84436696037318, + "grad_norm": 0.21293628215789795, + "learning_rate": 1.6340785722242592e-05, + "loss": 1.005, + "step": 4480 + }, + { + "epoch": 0.8462517080525844, + "grad_norm": 0.20907250046730042, + "learning_rate": 1.6325512074915467e-05, + "loss": 1.0308, + "step": 4490 + }, + { + "epoch": 0.8481364557319889, + "grad_norm": 0.23167361319065094, + "learning_rate": 1.6310213788859796e-05, + "loss": 1.0016, + "step": 4500 + }, + { + "epoch": 0.8500212034113933, + "grad_norm": 0.21692821383476257, + "learning_rate": 1.629489092366448e-05, + "loss": 1.0129, + "step": 4510 + }, + { + "epoch": 0.8519059510907977, + "grad_norm": 0.21018582582473755, + "learning_rate": 1.6279543539014163e-05, + "loss": 1.0142, + "step": 4520 + }, + { + "epoch": 0.8537906987702022, + "grad_norm": 0.21704675257205963, + "learning_rate": 1.6264171694688986e-05, + "loss": 1.0071, + "step": 4530 + }, + { + "epoch": 0.8556754464496066, + "grad_norm": 0.22476141154766083, + "learning_rate": 1.6248775450564375e-05, + "loss": 1.0217, + "step": 4540 + }, + { + "epoch": 0.857560194129011, + "grad_norm": 0.22506673634052277, + "learning_rate": 1.623335486661079e-05, + "loss": 1.0178, + "step": 4550 + }, + { + 
"epoch": 0.8594449418084154, + "grad_norm": 0.2084660530090332, + "learning_rate": 1.6217910002893505e-05, + "loss": 1.0084, + "step": 4560 + }, + { + "epoch": 0.8613296894878198, + "grad_norm": 0.22871838510036469, + "learning_rate": 1.6202440919572353e-05, + "loss": 0.997, + "step": 4570 + }, + { + "epoch": 0.8632144371672242, + "grad_norm": 0.22012539207935333, + "learning_rate": 1.618694767690152e-05, + "loss": 1.0251, + "step": 4580 + }, + { + "epoch": 0.8650991848466286, + "grad_norm": 0.22487348318099976, + "learning_rate": 1.6171430335229285e-05, + "loss": 1.0048, + "step": 4590 + }, + { + "epoch": 0.8669839325260331, + "grad_norm": 0.22973212599754333, + "learning_rate": 1.615588895499781e-05, + "loss": 0.9988, + "step": 4600 + }, + { + "epoch": 0.8688686802054375, + "grad_norm": 0.2214285135269165, + "learning_rate": 1.614032359674287e-05, + "loss": 1.0082, + "step": 4610 + }, + { + "epoch": 0.8707534278848419, + "grad_norm": 0.218957781791687, + "learning_rate": 1.6124734321093658e-05, + "loss": 1.0068, + "step": 4620 + }, + { + "epoch": 0.8726381755642464, + "grad_norm": 0.2046903520822525, + "learning_rate": 1.610912118877252e-05, + "loss": 1.0229, + "step": 4630 + }, + { + "epoch": 0.8745229232436508, + "grad_norm": 0.22204430401325226, + "learning_rate": 1.6093484260594718e-05, + "loss": 1.0143, + "step": 4640 + }, + { + "epoch": 0.8764076709230552, + "grad_norm": 0.21503140032291412, + "learning_rate": 1.607782359746822e-05, + "loss": 1.0162, + "step": 4650 + }, + { + "epoch": 0.8782924186024595, + "grad_norm": 0.2154746800661087, + "learning_rate": 1.606213926039343e-05, + "loss": 1.0051, + "step": 4660 + }, + { + "epoch": 0.880177166281864, + "grad_norm": 0.21909531950950623, + "learning_rate": 1.6046431310462977e-05, + "loss": 1.0071, + "step": 4670 + }, + { + "epoch": 0.8820619139612684, + "grad_norm": 0.2458638846874237, + "learning_rate": 1.603069980886145e-05, + "loss": 1.002, + "step": 4680 + }, + { + "epoch": 0.8839466616406728, + 
"grad_norm": 0.2160467654466629, + "learning_rate": 1.601494481686519e-05, + "loss": 0.9921, + "step": 4690 + }, + { + "epoch": 0.8858314093200773, + "grad_norm": 0.2116820067167282, + "learning_rate": 1.5999166395842028e-05, + "loss": 1.0078, + "step": 4700 + }, + { + "epoch": 0.8877161569994817, + "grad_norm": 0.22162412106990814, + "learning_rate": 1.598336460725106e-05, + "loss": 1.0049, + "step": 4710 + }, + { + "epoch": 0.8896009046788861, + "grad_norm": 0.21271055936813354, + "learning_rate": 1.59675395126424e-05, + "loss": 0.9987, + "step": 4720 + }, + { + "epoch": 0.8914856523582906, + "grad_norm": 0.20891942083835602, + "learning_rate": 1.5951691173656933e-05, + "loss": 1.0026, + "step": 4730 + }, + { + "epoch": 0.893370400037695, + "grad_norm": 0.21908487379550934, + "learning_rate": 1.5935819652026098e-05, + "loss": 1.0133, + "step": 4740 + }, + { + "epoch": 0.8952551477170994, + "grad_norm": 0.20726846158504486, + "learning_rate": 1.5919925009571623e-05, + "loss": 1.0021, + "step": 4750 + }, + { + "epoch": 0.8971398953965037, + "grad_norm": 0.2279621958732605, + "learning_rate": 1.5904007308205303e-05, + "loss": 1.0051, + "step": 4760 + }, + { + "epoch": 0.8990246430759082, + "grad_norm": 0.21259722113609314, + "learning_rate": 1.5888066609928742e-05, + "loss": 0.9895, + "step": 4770 + }, + { + "epoch": 0.9009093907553126, + "grad_norm": 0.21769499778747559, + "learning_rate": 1.5872102976833125e-05, + "loss": 0.9876, + "step": 4780 + }, + { + "epoch": 0.902794138434717, + "grad_norm": 0.21847137808799744, + "learning_rate": 1.5856116471098976e-05, + "loss": 0.9992, + "step": 4790 + }, + { + "epoch": 0.9046788861141215, + "grad_norm": 0.2173769176006317, + "learning_rate": 1.58401071549959e-05, + "loss": 0.9922, + "step": 4800 + }, + { + "epoch": 0.9065636337935259, + "grad_norm": 0.209466814994812, + "learning_rate": 1.5824075090882364e-05, + "loss": 1.0007, + "step": 4810 + }, + { + "epoch": 0.9084483814729303, + "grad_norm": 0.20582905411720276, + 
"learning_rate": 1.5808020341205427e-05, + "loss": 1.0115, + "step": 4820 + }, + { + "epoch": 0.9103331291523348, + "grad_norm": 0.2141278088092804, + "learning_rate": 1.5791942968500523e-05, + "loss": 1.0065, + "step": 4830 + }, + { + "epoch": 0.9122178768317392, + "grad_norm": 0.2126617431640625, + "learning_rate": 1.57758430353912e-05, + "loss": 0.9886, + "step": 4840 + }, + { + "epoch": 0.9141026245111435, + "grad_norm": 0.21502944827079773, + "learning_rate": 1.575972060458889e-05, + "loss": 1.0149, + "step": 4850 + }, + { + "epoch": 0.915987372190548, + "grad_norm": 0.21828798949718475, + "learning_rate": 1.5743575738892645e-05, + "loss": 0.9983, + "step": 4860 + }, + { + "epoch": 0.9178721198699524, + "grad_norm": 0.20862551033496857, + "learning_rate": 1.5727408501188907e-05, + "loss": 1.0046, + "step": 4870 + }, + { + "epoch": 0.9197568675493568, + "grad_norm": 0.21126747131347656, + "learning_rate": 1.571121895445127e-05, + "loss": 0.9912, + "step": 4880 + }, + { + "epoch": 0.9216416152287612, + "grad_norm": 0.2120884507894516, + "learning_rate": 1.5695007161740213e-05, + "loss": 0.9992, + "step": 4890 + }, + { + "epoch": 0.9235263629081657, + "grad_norm": 0.21593983471393585, + "learning_rate": 1.567877318620287e-05, + "loss": 0.9789, + "step": 4900 + }, + { + "epoch": 0.9254111105875701, + "grad_norm": 0.2110901027917862, + "learning_rate": 1.5662517091072777e-05, + "loss": 0.9973, + "step": 4910 + }, + { + "epoch": 0.9272958582669745, + "grad_norm": 0.21888457238674164, + "learning_rate": 1.5646238939669637e-05, + "loss": 1.0032, + "step": 4920 + }, + { + "epoch": 0.929180605946379, + "grad_norm": 0.21466167271137238, + "learning_rate": 1.562993879539906e-05, + "loss": 0.9984, + "step": 4930 + }, + { + "epoch": 0.9310653536257834, + "grad_norm": 0.22090941667556763, + "learning_rate": 1.5613616721752322e-05, + "loss": 0.9717, + "step": 4940 + }, + { + "epoch": 0.9329501013051877, + "grad_norm": 0.2137564718723297, + "learning_rate": 
1.5597272782306112e-05, + "loss": 1.0011, + "step": 4950 + }, + { + "epoch": 0.9348348489845922, + "grad_norm": 0.2229919135570526, + "learning_rate": 1.55809070407223e-05, + "loss": 0.9924, + "step": 4960 + }, + { + "epoch": 0.9367195966639966, + "grad_norm": 0.2118062973022461, + "learning_rate": 1.5564519560747665e-05, + "loss": 1.0027, + "step": 4970 + }, + { + "epoch": 0.938604344343401, + "grad_norm": 0.22967861592769623, + "learning_rate": 1.5548110406213673e-05, + "loss": 1.0114, + "step": 4980 + }, + { + "epoch": 0.9404890920228054, + "grad_norm": 0.46491095423698425, + "learning_rate": 1.5531679641036206e-05, + "loss": 0.9999, + "step": 4990 + }, + { + "epoch": 0.9423738397022099, + "grad_norm": 0.21877862513065338, + "learning_rate": 1.5515227329215324e-05, + "loss": 1.0046, + "step": 5000 + }, + { + "epoch": 0.9442585873816143, + "grad_norm": 0.2100636214017868, + "learning_rate": 1.5498753534835024e-05, + "loss": 1.0016, + "step": 5010 + }, + { + "epoch": 0.9461433350610187, + "grad_norm": 0.20900030434131622, + "learning_rate": 1.548225832206296e-05, + "loss": 0.974, + "step": 5020 + }, + { + "epoch": 0.9480280827404232, + "grad_norm": 0.2143971025943756, + "learning_rate": 1.5465741755150237e-05, + "loss": 1.0087, + "step": 5030 + }, + { + "epoch": 0.9499128304198275, + "grad_norm": 0.2131759375333786, + "learning_rate": 1.544920389843112e-05, + "loss": 1.0109, + "step": 5040 + }, + { + "epoch": 0.9517975780992319, + "grad_norm": 0.20999424159526825, + "learning_rate": 1.5432644816322808e-05, + "loss": 0.9945, + "step": 5050 + }, + { + "epoch": 0.9536823257786364, + "grad_norm": 0.20096446573734283, + "learning_rate": 1.5416064573325175e-05, + "loss": 1.0074, + "step": 5060 + }, + { + "epoch": 0.9555670734580408, + "grad_norm": 0.21887625753879547, + "learning_rate": 1.539946323402052e-05, + "loss": 0.9822, + "step": 5070 + }, + { + "epoch": 0.9574518211374452, + "grad_norm": 0.2174791693687439, + "learning_rate": 1.5382840863073312e-05, + "loss": 
1.0008, + "step": 5080 + }, + { + "epoch": 0.9593365688168497, + "grad_norm": 0.21888576447963715, + "learning_rate": 1.5366197525229942e-05, + "loss": 1.0008, + "step": 5090 + }, + { + "epoch": 0.9612213164962541, + "grad_norm": 0.21766896545886993, + "learning_rate": 1.5349533285318477e-05, + "loss": 1.0103, + "step": 5100 + }, + { + "epoch": 0.9631060641756585, + "grad_norm": 0.2162325084209442, + "learning_rate": 1.533284820824839e-05, + "loss": 0.999, + "step": 5110 + }, + { + "epoch": 0.9649908118550629, + "grad_norm": 0.21531203389167786, + "learning_rate": 1.5316142359010323e-05, + "loss": 0.9913, + "step": 5120 + }, + { + "epoch": 0.9668755595344674, + "grad_norm": 0.2119629681110382, + "learning_rate": 1.5299415802675826e-05, + "loss": 0.999, + "step": 5130 + }, + { + "epoch": 0.9687603072138717, + "grad_norm": 0.21696802973747253, + "learning_rate": 1.5282668604397108e-05, + "loss": 1.0132, + "step": 5140 + }, + { + "epoch": 0.9706450548932761, + "grad_norm": 0.2109297215938568, + "learning_rate": 1.5265900829406787e-05, + "loss": 1.0087, + "step": 5150 + }, + { + "epoch": 0.9725298025726806, + "grad_norm": 0.22545278072357178, + "learning_rate": 1.524911254301761e-05, + "loss": 0.9817, + "step": 5160 + }, + { + "epoch": 0.974414550252085, + "grad_norm": 0.22250774502754211, + "learning_rate": 1.5232303810622245e-05, + "loss": 0.9953, + "step": 5170 + }, + { + "epoch": 0.9762992979314894, + "grad_norm": 0.2173687219619751, + "learning_rate": 1.5215474697692977e-05, + "loss": 0.9917, + "step": 5180 + }, + { + "epoch": 0.9781840456108939, + "grad_norm": 0.253191739320755, + "learning_rate": 1.51986252697815e-05, + "loss": 0.9905, + "step": 5190 + }, + { + "epoch": 0.9800687932902983, + "grad_norm": 0.2323060780763626, + "learning_rate": 1.5181755592518613e-05, + "loss": 0.9949, + "step": 5200 + }, + { + "epoch": 0.9819535409697027, + "grad_norm": 0.22499704360961914, + "learning_rate": 1.5164865731614002e-05, + "loss": 0.9993, + "step": 5210 + }, + { + 
"epoch": 0.983838288649107, + "grad_norm": 0.21726462244987488, + "learning_rate": 1.5147955752855967e-05, + "loss": 0.991, + "step": 5220 + }, + { + "epoch": 0.9857230363285115, + "grad_norm": 0.21456758677959442, + "learning_rate": 1.5131025722111176e-05, + "loss": 0.9911, + "step": 5230 + }, + { + "epoch": 0.9876077840079159, + "grad_norm": 0.21249902248382568, + "learning_rate": 1.5114075705324398e-05, + "loss": 0.9895, + "step": 5240 + }, + { + "epoch": 0.9894925316873203, + "grad_norm": 0.2184160351753235, + "learning_rate": 1.5097105768518248e-05, + "loss": 1.014, + "step": 5250 + }, + { + "epoch": 0.9913772793667248, + "grad_norm": 0.22215837240219116, + "learning_rate": 1.5080115977792933e-05, + "loss": 1.0215, + "step": 5260 + }, + { + "epoch": 0.9932620270461292, + "grad_norm": 0.2187243402004242, + "learning_rate": 1.5063106399325997e-05, + "loss": 0.9899, + "step": 5270 + }, + { + "epoch": 0.9951467747255336, + "grad_norm": 0.21137861907482147, + "learning_rate": 1.504607709937206e-05, + "loss": 0.9881, + "step": 5280 + }, + { + "epoch": 0.9970315224049381, + "grad_norm": 0.2110944241285324, + "learning_rate": 1.5029028144262555e-05, + "loss": 0.9817, + "step": 5290 + }, + { + "epoch": 0.9989162700843425, + "grad_norm": 0.21050478518009186, + "learning_rate": 1.5011959600405477e-05, + "loss": 1.0142, + "step": 5300 + }, + { + "epoch": 1.0007538990717617, + "grad_norm": 0.29858848452568054, + "learning_rate": 1.4994871534285125e-05, + "loss": 0.9929, + "step": 5310 + }, + { + "epoch": 1.002638646751166, + "grad_norm": 0.23209863901138306, + "learning_rate": 1.4977764012461836e-05, + "loss": 0.9236, + "step": 5320 + }, + { + "epoch": 1.0045233944305707, + "grad_norm": 0.23006853461265564, + "learning_rate": 1.4960637101571727e-05, + "loss": 0.9375, + "step": 5330 + }, + { + "epoch": 1.006408142109975, + "grad_norm": 0.2194632738828659, + "learning_rate": 1.4943490868326449e-05, + "loss": 0.9394, + "step": 5340 + }, + { + "epoch": 1.0082928897893795, + 
"grad_norm": 0.22014226019382477, + "learning_rate": 1.4926325379512903e-05, + "loss": 0.9158, + "step": 5350 + }, + { + "epoch": 1.0101776374687839, + "grad_norm": 0.22187545895576477, + "learning_rate": 1.4909140701993e-05, + "loss": 0.9137, + "step": 5360 + }, + { + "epoch": 1.0120623851481882, + "grad_norm": 0.21812096238136292, + "learning_rate": 1.4891936902703399e-05, + "loss": 0.9172, + "step": 5370 + }, + { + "epoch": 1.0139471328275926, + "grad_norm": 0.22761112451553345, + "learning_rate": 1.4874714048655226e-05, + "loss": 0.9203, + "step": 5380 + }, + { + "epoch": 1.015831880506997, + "grad_norm": 0.22453482449054718, + "learning_rate": 1.4857472206933838e-05, + "loss": 0.9313, + "step": 5390 + }, + { + "epoch": 1.0177166281864016, + "grad_norm": 0.22108279168605804, + "learning_rate": 1.4840211444698556e-05, + "loss": 0.9395, + "step": 5400 + }, + { + "epoch": 1.019601375865806, + "grad_norm": 0.21845021843910217, + "learning_rate": 1.4822931829182383e-05, + "loss": 0.9023, + "step": 5410 + }, + { + "epoch": 1.0214861235452104, + "grad_norm": 0.6463075876235962, + "learning_rate": 1.480563342769178e-05, + "loss": 0.9435, + "step": 5420 + }, + { + "epoch": 1.0233708712246148, + "grad_norm": 0.23657025396823883, + "learning_rate": 1.4788316307606358e-05, + "loss": 0.9082, + "step": 5430 + }, + { + "epoch": 1.0252556189040192, + "grad_norm": 0.2216455638408661, + "learning_rate": 1.477098053637866e-05, + "loss": 0.9357, + "step": 5440 + }, + { + "epoch": 1.0271403665834236, + "grad_norm": 0.2289433777332306, + "learning_rate": 1.4753626181533867e-05, + "loss": 0.9341, + "step": 5450 + }, + { + "epoch": 1.0290251142628282, + "grad_norm": 0.22586382925510406, + "learning_rate": 1.4736253310669547e-05, + "loss": 0.9314, + "step": 5460 + }, + { + "epoch": 1.0309098619422326, + "grad_norm": 0.21936683356761932, + "learning_rate": 1.4718861991455389e-05, + "loss": 0.9217, + "step": 5470 + }, + { + "epoch": 1.032794609621637, + "grad_norm": 0.2179298847913742, + 
"learning_rate": 1.4701452291632948e-05, + "loss": 0.9296, + "step": 5480 + }, + { + "epoch": 1.0346793573010413, + "grad_norm": 0.22207029163837433, + "learning_rate": 1.4684024279015366e-05, + "loss": 0.9381, + "step": 5490 + }, + { + "epoch": 1.0365641049804457, + "grad_norm": 0.23602813482284546, + "learning_rate": 1.4666578021487116e-05, + "loss": 0.9522, + "step": 5500 + }, + { + "epoch": 1.03844885265985, + "grad_norm": 0.23088358342647552, + "learning_rate": 1.464911358700375e-05, + "loss": 0.9164, + "step": 5510 + }, + { + "epoch": 1.0403336003392545, + "grad_norm": 0.22507105767726898, + "learning_rate": 1.46316310435916e-05, + "loss": 0.929, + "step": 5520 + }, + { + "epoch": 1.042218348018659, + "grad_norm": 0.23624849319458008, + "learning_rate": 1.4614130459347558e-05, + "loss": 0.9147, + "step": 5530 + }, + { + "epoch": 1.0441030956980635, + "grad_norm": 0.23513133823871613, + "learning_rate": 1.4596611902438765e-05, + "loss": 0.9191, + "step": 5540 + }, + { + "epoch": 1.0459878433774679, + "grad_norm": 0.21015501022338867, + "learning_rate": 1.4579075441102391e-05, + "loss": 0.9342, + "step": 5550 + }, + { + "epoch": 1.0478725910568722, + "grad_norm": 0.21727100014686584, + "learning_rate": 1.4561521143645323e-05, + "loss": 0.919, + "step": 5560 + }, + { + "epoch": 1.0497573387362766, + "grad_norm": 0.2306618094444275, + "learning_rate": 1.4543949078443942e-05, + "loss": 0.9345, + "step": 5570 + }, + { + "epoch": 1.051642086415681, + "grad_norm": 0.22116506099700928, + "learning_rate": 1.4526359313943822e-05, + "loss": 0.9072, + "step": 5580 + }, + { + "epoch": 1.0535268340950856, + "grad_norm": 0.22553704679012299, + "learning_rate": 1.4508751918659492e-05, + "loss": 0.9432, + "step": 5590 + }, + { + "epoch": 1.05541158177449, + "grad_norm": 0.2522180676460266, + "learning_rate": 1.449112696117414e-05, + "loss": 0.9281, + "step": 5600 + }, + { + "epoch": 1.0572963294538944, + "grad_norm": 0.22489477694034576, + "learning_rate": 
1.4473484510139373e-05, + "loss": 0.9405, + "step": 5610 + }, + { + "epoch": 1.0591810771332988, + "grad_norm": 0.22234433889389038, + "learning_rate": 1.4455824634274936e-05, + "loss": 0.9246, + "step": 5620 + }, + { + "epoch": 1.0610658248127032, + "grad_norm": 0.21798133850097656, + "learning_rate": 1.4438147402368436e-05, + "loss": 0.9061, + "step": 5630 + }, + { + "epoch": 1.0629505724921076, + "grad_norm": 0.22142323851585388, + "learning_rate": 1.44204528832751e-05, + "loss": 0.9092, + "step": 5640 + }, + { + "epoch": 1.064835320171512, + "grad_norm": 0.22391140460968018, + "learning_rate": 1.4402741145917475e-05, + "loss": 0.9223, + "step": 5650 + }, + { + "epoch": 1.0667200678509166, + "grad_norm": 0.2199719101190567, + "learning_rate": 1.4385012259285194e-05, + "loss": 0.9306, + "step": 5660 + }, + { + "epoch": 1.068604815530321, + "grad_norm": 0.21679045259952545, + "learning_rate": 1.4367266292434665e-05, + "loss": 0.9385, + "step": 5670 + }, + { + "epoch": 1.0704895632097253, + "grad_norm": 0.2160375416278839, + "learning_rate": 1.4349503314488848e-05, + "loss": 0.9251, + "step": 5680 + }, + { + "epoch": 1.0723743108891297, + "grad_norm": 0.2267838716506958, + "learning_rate": 1.4331723394636951e-05, + "loss": 0.9213, + "step": 5690 + }, + { + "epoch": 1.074259058568534, + "grad_norm": 0.21326640248298645, + "learning_rate": 1.431392660213418e-05, + "loss": 0.9257, + "step": 5700 + }, + { + "epoch": 1.0761438062479385, + "grad_norm": 0.22939977049827576, + "learning_rate": 1.4296113006301453e-05, + "loss": 0.9281, + "step": 5710 + }, + { + "epoch": 1.0780285539273429, + "grad_norm": 0.22874711453914642, + "learning_rate": 1.4278282676525148e-05, + "loss": 0.9255, + "step": 5720 + }, + { + "epoch": 1.0799133016067475, + "grad_norm": 0.22632792592048645, + "learning_rate": 1.4260435682256822e-05, + "loss": 0.9232, + "step": 5730 + }, + { + "epoch": 1.0817980492861519, + "grad_norm": 0.22694247961044312, + "learning_rate": 1.4242572093012939e-05, + 
"loss": 0.9102, + "step": 5740 + }, + { + "epoch": 1.0836827969655562, + "grad_norm": 0.21883288025856018, + "learning_rate": 1.4224691978374608e-05, + "loss": 0.9304, + "step": 5750 + }, + { + "epoch": 1.0855675446449606, + "grad_norm": 0.21993470191955566, + "learning_rate": 1.4206795407987297e-05, + "loss": 0.9281, + "step": 5760 + }, + { + "epoch": 1.087452292324365, + "grad_norm": 0.2177504450082779, + "learning_rate": 1.4188882451560586e-05, + "loss": 0.9157, + "step": 5770 + }, + { + "epoch": 1.0893370400037694, + "grad_norm": 0.22279870510101318, + "learning_rate": 1.4170953178867868e-05, + "loss": 0.9303, + "step": 5780 + }, + { + "epoch": 1.091221787683174, + "grad_norm": 0.22243599593639374, + "learning_rate": 1.4153007659746096e-05, + "loss": 0.9103, + "step": 5790 + }, + { + "epoch": 1.0931065353625784, + "grad_norm": 0.2217782586812973, + "learning_rate": 1.4135045964095507e-05, + "loss": 0.9102, + "step": 5800 + }, + { + "epoch": 1.0949912830419828, + "grad_norm": 0.22789952158927917, + "learning_rate": 1.411706816187934e-05, + "loss": 0.9387, + "step": 5810 + }, + { + "epoch": 1.0968760307213872, + "grad_norm": 0.22178812325000763, + "learning_rate": 1.4099074323123577e-05, + "loss": 0.9064, + "step": 5820 + }, + { + "epoch": 1.0987607784007916, + "grad_norm": 0.22514748573303223, + "learning_rate": 1.4081064517916667e-05, + "loss": 0.9391, + "step": 5830 + }, + { + "epoch": 1.100645526080196, + "grad_norm": 0.2329104095697403, + "learning_rate": 1.4063038816409245e-05, + "loss": 0.9199, + "step": 5840 + }, + { + "epoch": 1.1025302737596003, + "grad_norm": 0.23610034584999084, + "learning_rate": 1.4044997288813865e-05, + "loss": 0.915, + "step": 5850 + }, + { + "epoch": 1.104415021439005, + "grad_norm": 0.2151385247707367, + "learning_rate": 1.4026940005404726e-05, + "loss": 0.9285, + "step": 5860 + }, + { + "epoch": 1.1062997691184093, + "grad_norm": 0.22116857767105103, + "learning_rate": 1.4008867036517403e-05, + "loss": 0.9379, + "step": 5870 + 
}, + { + "epoch": 1.1081845167978137, + "grad_norm": 0.4489651024341583, + "learning_rate": 1.3990778452548563e-05, + "loss": 0.9053, + "step": 5880 + }, + { + "epoch": 1.110069264477218, + "grad_norm": 0.22348885238170624, + "learning_rate": 1.3972674323955699e-05, + "loss": 0.9226, + "step": 5890 + }, + { + "epoch": 1.1119540121566225, + "grad_norm": 0.22957371175289154, + "learning_rate": 1.3954554721256843e-05, + "loss": 0.91, + "step": 5900 + }, + { + "epoch": 1.1138387598360269, + "grad_norm": 0.22682736814022064, + "learning_rate": 1.3936419715030318e-05, + "loss": 0.9199, + "step": 5910 + }, + { + "epoch": 1.1157235075154315, + "grad_norm": 0.22072015702724457, + "learning_rate": 1.3918269375914429e-05, + "loss": 0.9171, + "step": 5920 + }, + { + "epoch": 1.1176082551948359, + "grad_norm": 0.2118896245956421, + "learning_rate": 1.3900103774607211e-05, + "loss": 0.9303, + "step": 5930 + }, + { + "epoch": 1.1194930028742403, + "grad_norm": 0.22123779356479645, + "learning_rate": 1.3881922981866154e-05, + "loss": 0.9241, + "step": 5940 + }, + { + "epoch": 1.1213777505536446, + "grad_norm": 0.2305184155702591, + "learning_rate": 1.386372706850791e-05, + "loss": 0.926, + "step": 5950 + }, + { + "epoch": 1.123262498233049, + "grad_norm": 0.22554059326648712, + "learning_rate": 1.3845516105408035e-05, + "loss": 0.9248, + "step": 5960 + }, + { + "epoch": 1.1251472459124534, + "grad_norm": 0.23516002297401428, + "learning_rate": 1.38272901635007e-05, + "loss": 0.9211, + "step": 5970 + }, + { + "epoch": 1.1270319935918578, + "grad_norm": 0.2195538878440857, + "learning_rate": 1.380904931377843e-05, + "loss": 0.9268, + "step": 5980 + }, + { + "epoch": 1.1289167412712624, + "grad_norm": 0.23041526973247528, + "learning_rate": 1.3790793627291805e-05, + "loss": 0.9114, + "step": 5990 + }, + { + "epoch": 1.1308014889506668, + "grad_norm": 0.2175453007221222, + "learning_rate": 1.3772523175149209e-05, + "loss": 0.9164, + "step": 6000 + }, + { + "epoch": 1.1326862366300712, 
+ "grad_norm": 0.22529591619968414, + "learning_rate": 1.3754238028516526e-05, + "loss": 0.9394, + "step": 6010 + }, + { + "epoch": 1.1345709843094756, + "grad_norm": 0.21660064160823822, + "learning_rate": 1.3735938258616898e-05, + "loss": 0.9254, + "step": 6020 + }, + { + "epoch": 1.13645573198888, + "grad_norm": 0.2256104201078415, + "learning_rate": 1.37176239367304e-05, + "loss": 0.9336, + "step": 6030 + }, + { + "epoch": 1.1383404796682843, + "grad_norm": 0.22415399551391602, + "learning_rate": 1.3699295134193813e-05, + "loss": 0.9302, + "step": 6040 + }, + { + "epoch": 1.1402252273476887, + "grad_norm": 0.23482459783554077, + "learning_rate": 1.3680951922400305e-05, + "loss": 0.9367, + "step": 6050 + }, + { + "epoch": 1.1421099750270933, + "grad_norm": 0.22941826283931732, + "learning_rate": 1.3662594372799182e-05, + "loss": 0.936, + "step": 6060 + }, + { + "epoch": 1.1439947227064977, + "grad_norm": 0.2294449806213379, + "learning_rate": 1.3644222556895591e-05, + "loss": 0.918, + "step": 6070 + }, + { + "epoch": 1.145879470385902, + "grad_norm": 0.21437901258468628, + "learning_rate": 1.3625836546250253e-05, + "loss": 0.9263, + "step": 6080 + }, + { + "epoch": 1.1477642180653065, + "grad_norm": 0.23280884325504303, + "learning_rate": 1.360743641247917e-05, + "loss": 0.9227, + "step": 6090 + }, + { + "epoch": 1.1496489657447109, + "grad_norm": 0.22374652326107025, + "learning_rate": 1.3589022227253366e-05, + "loss": 0.8951, + "step": 6100 + }, + { + "epoch": 1.1515337134241153, + "grad_norm": 0.21991413831710815, + "learning_rate": 1.3570594062298593e-05, + "loss": 0.9142, + "step": 6110 + }, + { + "epoch": 1.1534184611035196, + "grad_norm": 0.22591035068035126, + "learning_rate": 1.355215198939505e-05, + "loss": 0.9318, + "step": 6120 + }, + { + "epoch": 1.1553032087829243, + "grad_norm": 0.2286655604839325, + "learning_rate": 1.3533696080377126e-05, + "loss": 0.9147, + "step": 6130 + }, + { + "epoch": 1.1571879564623286, + "grad_norm": 0.2287946492433548, 
+ "learning_rate": 1.3515226407133085e-05, + "loss": 0.9193, + "step": 6140 + }, + { + "epoch": 1.159072704141733, + "grad_norm": 0.26710590720176697, + "learning_rate": 1.3496743041604817e-05, + "loss": 0.9143, + "step": 6150 + }, + { + "epoch": 1.1609574518211374, + "grad_norm": 0.23235651850700378, + "learning_rate": 1.3478246055787536e-05, + "loss": 0.9163, + "step": 6160 + }, + { + "epoch": 1.1628421995005418, + "grad_norm": 0.2512282729148865, + "learning_rate": 1.3459735521729519e-05, + "loss": 0.9288, + "step": 6170 + }, + { + "epoch": 1.1647269471799464, + "grad_norm": 0.22250600159168243, + "learning_rate": 1.3441211511531804e-05, + "loss": 0.9177, + "step": 6180 + }, + { + "epoch": 1.1666116948593508, + "grad_norm": 0.22275856137275696, + "learning_rate": 1.3422674097347935e-05, + "loss": 0.9419, + "step": 6190 + }, + { + "epoch": 1.1684964425387552, + "grad_norm": 0.21800775825977325, + "learning_rate": 1.3404123351383646e-05, + "loss": 0.9189, + "step": 6200 + }, + { + "epoch": 1.1703811902181596, + "grad_norm": 0.2309367060661316, + "learning_rate": 1.3385559345896621e-05, + "loss": 0.9163, + "step": 6210 + }, + { + "epoch": 1.172265937897564, + "grad_norm": 0.2247256487607956, + "learning_rate": 1.3366982153196181e-05, + "loss": 0.9182, + "step": 6220 + }, + { + "epoch": 1.1741506855769683, + "grad_norm": 0.22170545160770416, + "learning_rate": 1.3348391845643013e-05, + "loss": 0.9207, + "step": 6230 + }, + { + "epoch": 1.1760354332563727, + "grad_norm": 0.22576652467250824, + "learning_rate": 1.332978849564889e-05, + "loss": 0.9064, + "step": 6240 + }, + { + "epoch": 1.1779201809357773, + "grad_norm": 0.23562553524971008, + "learning_rate": 1.3311172175676384e-05, + "loss": 0.9274, + "step": 6250 + }, + { + "epoch": 1.1798049286151817, + "grad_norm": 0.2235068380832672, + "learning_rate": 1.329254295823859e-05, + "loss": 0.9166, + "step": 6260 + }, + { + "epoch": 1.181689676294586, + "grad_norm": 0.22437547147274017, + "learning_rate": 
1.3273900915898845e-05, + "loss": 0.9088, + "step": 6270 + }, + { + "epoch": 1.1835744239739905, + "grad_norm": 0.22752645611763, + "learning_rate": 1.325524612127043e-05, + "loss": 0.9132, + "step": 6280 + }, + { + "epoch": 1.1854591716533949, + "grad_norm": 0.2202550172805786, + "learning_rate": 1.3236578647016303e-05, + "loss": 0.9275, + "step": 6290 + }, + { + "epoch": 1.1873439193327993, + "grad_norm": 0.22893770039081573, + "learning_rate": 1.3217898565848818e-05, + "loss": 0.943, + "step": 6300 + }, + { + "epoch": 1.1892286670122036, + "grad_norm": 0.22508858144283295, + "learning_rate": 1.3199205950529419e-05, + "loss": 0.9186, + "step": 6310 + }, + { + "epoch": 1.1911134146916083, + "grad_norm": 0.225150004029274, + "learning_rate": 1.3180500873868388e-05, + "loss": 0.9259, + "step": 6320 + }, + { + "epoch": 1.1929981623710126, + "grad_norm": 0.23732155561447144, + "learning_rate": 1.3161783408724534e-05, + "loss": 0.9275, + "step": 6330 + }, + { + "epoch": 1.194882910050417, + "grad_norm": 0.21989399194717407, + "learning_rate": 1.3143053628004931e-05, + "loss": 0.9309, + "step": 6340 + }, + { + "epoch": 1.1967676577298214, + "grad_norm": 0.22706113755702972, + "learning_rate": 1.3124311604664613e-05, + "loss": 0.913, + "step": 6350 + }, + { + "epoch": 1.1986524054092258, + "grad_norm": 0.2207358479499817, + "learning_rate": 1.3105557411706311e-05, + "loss": 0.8964, + "step": 6360 + }, + { + "epoch": 1.2005371530886302, + "grad_norm": 0.21873906254768372, + "learning_rate": 1.3086791122180147e-05, + "loss": 0.932, + "step": 6370 + }, + { + "epoch": 1.2024219007680346, + "grad_norm": 0.22024978697299957, + "learning_rate": 1.3068012809183377e-05, + "loss": 0.931, + "step": 6380 + }, + { + "epoch": 1.2043066484474392, + "grad_norm": 0.22729642689228058, + "learning_rate": 1.304922254586007e-05, + "loss": 0.9045, + "step": 6390 + }, + { + "epoch": 1.2061913961268436, + "grad_norm": 0.220153346657753, + "learning_rate": 1.303042040540086e-05, + "loss": 
0.9121, + "step": 6400 + }, + { + "epoch": 1.208076143806248, + "grad_norm": 0.22822965681552887, + "learning_rate": 1.3011606461042633e-05, + "loss": 0.9206, + "step": 6410 + }, + { + "epoch": 1.2099608914856523, + "grad_norm": 0.22767098248004913, + "learning_rate": 1.2992780786068258e-05, + "loss": 0.9235, + "step": 6420 + }, + { + "epoch": 1.2118456391650567, + "grad_norm": 0.22266320884227753, + "learning_rate": 1.2973943453806301e-05, + "loss": 0.9049, + "step": 6430 + }, + { + "epoch": 1.213730386844461, + "grad_norm": 0.22684356570243835, + "learning_rate": 1.2955094537630722e-05, + "loss": 0.9337, + "step": 6440 + }, + { + "epoch": 1.2156151345238655, + "grad_norm": 0.22273842990398407, + "learning_rate": 1.2936234110960614e-05, + "loss": 0.9126, + "step": 6450 + }, + { + "epoch": 1.21749988220327, + "grad_norm": 0.22580714523792267, + "learning_rate": 1.2917362247259897e-05, + "loss": 0.9085, + "step": 6460 + }, + { + "epoch": 1.2193846298826745, + "grad_norm": 0.23013198375701904, + "learning_rate": 1.2898479020037048e-05, + "loss": 0.9122, + "step": 6470 + }, + { + "epoch": 1.2212693775620789, + "grad_norm": 0.22466449439525604, + "learning_rate": 1.2879584502844794e-05, + "loss": 0.9215, + "step": 6480 + }, + { + "epoch": 1.2231541252414833, + "grad_norm": 0.2188250869512558, + "learning_rate": 1.2860678769279855e-05, + "loss": 0.9292, + "step": 6490 + }, + { + "epoch": 1.2250388729208876, + "grad_norm": 0.23460616171360016, + "learning_rate": 1.284176189298262e-05, + "loss": 0.9091, + "step": 6500 + }, + { + "epoch": 1.2269236206002923, + "grad_norm": 0.2196764051914215, + "learning_rate": 1.2822833947636897e-05, + "loss": 0.9245, + "step": 6510 + }, + { + "epoch": 1.2288083682796966, + "grad_norm": 0.22811390459537506, + "learning_rate": 1.2803895006969605e-05, + "loss": 0.9053, + "step": 6520 + }, + { + "epoch": 1.230693115959101, + "grad_norm": 0.22004549205303192, + "learning_rate": 1.2784945144750485e-05, + "loss": 0.9131, + "step": 6530 + }, + { 
+ "epoch": 1.2325778636385054, + "grad_norm": 0.229686439037323, + "learning_rate": 1.2765984434791828e-05, + "loss": 0.9241, + "step": 6540 + }, + { + "epoch": 1.2344626113179098, + "grad_norm": 0.22825191915035248, + "learning_rate": 1.274701295094817e-05, + "loss": 0.9072, + "step": 6550 + }, + { + "epoch": 1.2363473589973142, + "grad_norm": 0.23291198909282684, + "learning_rate": 1.2728030767116022e-05, + "loss": 0.9263, + "step": 6560 + }, + { + "epoch": 1.2382321066767186, + "grad_norm": 0.23086600005626678, + "learning_rate": 1.2709037957233566e-05, + "loss": 0.9057, + "step": 6570 + }, + { + "epoch": 1.2401168543561232, + "grad_norm": 0.2277623564004898, + "learning_rate": 1.2690034595280376e-05, + "loss": 0.9264, + "step": 6580 + }, + { + "epoch": 1.2420016020355276, + "grad_norm": 0.2232382893562317, + "learning_rate": 1.2671020755277122e-05, + "loss": 0.9246, + "step": 6590 + }, + { + "epoch": 1.243886349714932, + "grad_norm": 0.22753356397151947, + "learning_rate": 1.2651996511285298e-05, + "loss": 0.9162, + "step": 6600 + }, + { + "epoch": 1.2457710973943363, + "grad_norm": 0.23055724799633026, + "learning_rate": 1.263296193740692e-05, + "loss": 0.903, + "step": 6610 + }, + { + "epoch": 1.2476558450737407, + "grad_norm": 0.22811348736286163, + "learning_rate": 1.2613917107784236e-05, + "loss": 0.9221, + "step": 6620 + }, + { + "epoch": 1.249540592753145, + "grad_norm": 0.23350735008716583, + "learning_rate": 1.259486209659944e-05, + "loss": 0.9237, + "step": 6630 + }, + { + "epoch": 1.2514253404325495, + "grad_norm": 0.23428195714950562, + "learning_rate": 1.2575796978074392e-05, + "loss": 0.8933, + "step": 6640 + }, + { + "epoch": 1.253310088111954, + "grad_norm": 0.23756927251815796, + "learning_rate": 1.255672182647032e-05, + "loss": 0.9175, + "step": 6650 + }, + { + "epoch": 1.2551948357913585, + "grad_norm": 0.22465968132019043, + "learning_rate": 1.2537636716087524e-05, + "loss": 0.927, + "step": 6660 + }, + { + "epoch": 1.2570795834707629, + 
"grad_norm": 0.2241566926240921, + "learning_rate": 1.2518541721265105e-05, + "loss": 0.913, + "step": 6670 + }, + { + "epoch": 1.2589643311501673, + "grad_norm": 0.22283205389976501, + "learning_rate": 1.249943691638066e-05, + "loss": 0.928, + "step": 6680 + }, + { + "epoch": 1.2608490788295716, + "grad_norm": 0.23866012692451477, + "learning_rate": 1.2480322375850001e-05, + "loss": 0.8997, + "step": 6690 + }, + { + "epoch": 1.262733826508976, + "grad_norm": 0.24363973736763, + "learning_rate": 1.2461198174126851e-05, + "loss": 0.9477, + "step": 6700 + }, + { + "epoch": 1.2646185741883804, + "grad_norm": 0.22232966125011444, + "learning_rate": 1.244206438570258e-05, + "loss": 0.9149, + "step": 6710 + }, + { + "epoch": 1.266503321867785, + "grad_norm": 0.22593726217746735, + "learning_rate": 1.2422921085105895e-05, + "loss": 0.9094, + "step": 6720 + }, + { + "epoch": 1.2683880695471894, + "grad_norm": 0.23672741651535034, + "learning_rate": 1.2403768346902542e-05, + "loss": 0.9215, + "step": 6730 + }, + { + "epoch": 1.2702728172265938, + "grad_norm": 0.22895105183124542, + "learning_rate": 1.2384606245695044e-05, + "loss": 0.9327, + "step": 6740 + }, + { + "epoch": 1.2721575649059982, + "grad_norm": 0.22399824857711792, + "learning_rate": 1.2365434856122385e-05, + "loss": 0.9247, + "step": 6750 + }, + { + "epoch": 1.2740423125854026, + "grad_norm": 0.22685328125953674, + "learning_rate": 1.2346254252859733e-05, + "loss": 0.9237, + "step": 6760 + }, + { + "epoch": 1.2759270602648072, + "grad_norm": 0.21988481283187866, + "learning_rate": 1.2327064510618141e-05, + "loss": 0.9133, + "step": 6770 + }, + { + "epoch": 1.2778118079442113, + "grad_norm": 0.22276684641838074, + "learning_rate": 1.2307865704144258e-05, + "loss": 0.9188, + "step": 6780 + }, + { + "epoch": 1.279696555623616, + "grad_norm": 0.22081723809242249, + "learning_rate": 1.2288657908220051e-05, + "loss": 0.902, + "step": 6790 + }, + { + "epoch": 1.2815813033030203, + "grad_norm": 0.22139418125152588, + 
"learning_rate": 1.2269441197662484e-05, + "loss": 0.9111, + "step": 6800 + }, + { + "epoch": 1.2834660509824247, + "grad_norm": 0.401782751083374, + "learning_rate": 1.2250215647323262e-05, + "loss": 0.913, + "step": 6810 + }, + { + "epoch": 1.285350798661829, + "grad_norm": 0.22312961518764496, + "learning_rate": 1.2230981332088505e-05, + "loss": 0.9265, + "step": 6820 + }, + { + "epoch": 1.2872355463412335, + "grad_norm": 0.22870683670043945, + "learning_rate": 1.2211738326878497e-05, + "loss": 0.913, + "step": 6830 + }, + { + "epoch": 1.289120294020638, + "grad_norm": 0.2312675565481186, + "learning_rate": 1.2192486706647342e-05, + "loss": 0.9295, + "step": 6840 + }, + { + "epoch": 1.2910050417000425, + "grad_norm": 0.22798092663288116, + "learning_rate": 1.2173226546382724e-05, + "loss": 0.93, + "step": 6850 + }, + { + "epoch": 1.2928897893794469, + "grad_norm": 0.2288781851530075, + "learning_rate": 1.2153957921105582e-05, + "loss": 0.912, + "step": 6860 + }, + { + "epoch": 1.2947745370588513, + "grad_norm": 0.2272433340549469, + "learning_rate": 1.213468090586982e-05, + "loss": 0.9083, + "step": 6870 + }, + { + "epoch": 1.2966592847382556, + "grad_norm": 0.2492210865020752, + "learning_rate": 1.2115395575762044e-05, + "loss": 0.9171, + "step": 6880 + }, + { + "epoch": 1.29854403241766, + "grad_norm": 0.2252228558063507, + "learning_rate": 1.209610200590122e-05, + "loss": 0.936, + "step": 6890 + }, + { + "epoch": 1.3004287800970644, + "grad_norm": 0.23242458701133728, + "learning_rate": 1.2076800271438436e-05, + "loss": 0.9348, + "step": 6900 + }, + { + "epoch": 1.302313527776469, + "grad_norm": 0.2343248426914215, + "learning_rate": 1.2057490447556556e-05, + "loss": 0.9118, + "step": 6910 + }, + { + "epoch": 1.3041982754558734, + "grad_norm": 0.22584888339042664, + "learning_rate": 1.2038172609469978e-05, + "loss": 0.9148, + "step": 6920 + }, + { + "epoch": 1.3060830231352778, + "grad_norm": 0.2299051135778427, + "learning_rate": 1.2018846832424294e-05, + 
"loss": 0.9047, + "step": 6930 + }, + { + "epoch": 1.3079677708146822, + "grad_norm": 0.228471040725708, + "learning_rate": 1.199951319169604e-05, + "loss": 0.9273, + "step": 6940 + }, + { + "epoch": 1.3098525184940866, + "grad_norm": 0.22028133273124695, + "learning_rate": 1.1980171762592361e-05, + "loss": 0.9186, + "step": 6950 + }, + { + "epoch": 1.311737266173491, + "grad_norm": 0.2290957272052765, + "learning_rate": 1.196082262045076e-05, + "loss": 0.9193, + "step": 6960 + }, + { + "epoch": 1.3136220138528953, + "grad_norm": 0.23160114884376526, + "learning_rate": 1.1941465840638768e-05, + "loss": 0.9206, + "step": 6970 + }, + { + "epoch": 1.3155067615323, + "grad_norm": 0.23056699335575104, + "learning_rate": 1.1922101498553676e-05, + "loss": 0.9114, + "step": 6980 + }, + { + "epoch": 1.3173915092117043, + "grad_norm": 0.2241402268409729, + "learning_rate": 1.190272966962222e-05, + "loss": 0.9179, + "step": 6990 + }, + { + "epoch": 1.3192762568911087, + "grad_norm": 0.22617432475090027, + "learning_rate": 1.1883350429300308e-05, + "loss": 0.9237, + "step": 7000 + }, + { + "epoch": 1.321161004570513, + "grad_norm": 0.23305636644363403, + "learning_rate": 1.1863963853072713e-05, + "loss": 0.9059, + "step": 7010 + }, + { + "epoch": 1.3230457522499175, + "grad_norm": 0.22191418707370758, + "learning_rate": 1.184457001645278e-05, + "loss": 0.9191, + "step": 7020 + }, + { + "epoch": 1.3249304999293219, + "grad_norm": 0.23184920847415924, + "learning_rate": 1.1825168994982136e-05, + "loss": 0.9246, + "step": 7030 + }, + { + "epoch": 1.3268152476087263, + "grad_norm": 0.23088310658931732, + "learning_rate": 1.1805760864230398e-05, + "loss": 0.9274, + "step": 7040 + }, + { + "epoch": 1.3286999952881309, + "grad_norm": 0.2264508605003357, + "learning_rate": 1.1786345699794868e-05, + "loss": 0.9147, + "step": 7050 + }, + { + "epoch": 1.3305847429675353, + "grad_norm": 0.2320396602153778, + "learning_rate": 1.1766923577300249e-05, + "loss": 0.9126, + "step": 7060 + }, + 
{ + "epoch": 1.3324694906469396, + "grad_norm": 0.23665688931941986, + "learning_rate": 1.1747494572398346e-05, + "loss": 0.9102, + "step": 7070 + }, + { + "epoch": 1.334354238326344, + "grad_norm": 0.22712397575378418, + "learning_rate": 1.1728058760767768e-05, + "loss": 0.9214, + "step": 7080 + }, + { + "epoch": 1.3362389860057484, + "grad_norm": 0.23514389991760254, + "learning_rate": 1.1708616218113649e-05, + "loss": 0.8782, + "step": 7090 + }, + { + "epoch": 1.338123733685153, + "grad_norm": 0.23583270609378815, + "learning_rate": 1.1689167020167325e-05, + "loss": 0.8973, + "step": 7100 + }, + { + "epoch": 1.3400084813645572, + "grad_norm": 0.23568639159202576, + "learning_rate": 1.1669711242686064e-05, + "loss": 0.8947, + "step": 7110 + }, + { + "epoch": 1.3418932290439618, + "grad_norm": 0.22919389605522156, + "learning_rate": 1.1650248961452765e-05, + "loss": 0.9171, + "step": 7120 + }, + { + "epoch": 1.3437779767233662, + "grad_norm": 0.22865018248558044, + "learning_rate": 1.1630780252275655e-05, + "loss": 0.9235, + "step": 7130 + }, + { + "epoch": 1.3456627244027706, + "grad_norm": 0.225424662232399, + "learning_rate": 1.1611305190987998e-05, + "loss": 0.9036, + "step": 7140 + }, + { + "epoch": 1.347547472082175, + "grad_norm": 0.22072014212608337, + "learning_rate": 1.1591823853447808e-05, + "loss": 0.9172, + "step": 7150 + }, + { + "epoch": 1.3494322197615793, + "grad_norm": 0.2378140240907669, + "learning_rate": 1.1572336315537536e-05, + "loss": 0.9065, + "step": 7160 + }, + { + "epoch": 1.351316967440984, + "grad_norm": 0.3522769510746002, + "learning_rate": 1.1552842653163788e-05, + "loss": 0.8969, + "step": 7170 + }, + { + "epoch": 1.3532017151203883, + "grad_norm": 0.2272019386291504, + "learning_rate": 1.1533342942257027e-05, + "loss": 0.9122, + "step": 7180 + }, + { + "epoch": 1.3550864627997927, + "grad_norm": 0.22933454811573029, + "learning_rate": 1.1513837258771277e-05, + "loss": 0.9016, + "step": 7190 + }, + { + "epoch": 1.356971210479197, 
+ "grad_norm": 0.23473717272281647, + "learning_rate": 1.149432567868382e-05, + "loss": 0.9276, + "step": 7200 + }, + { + "epoch": 1.3588559581586015, + "grad_norm": 0.23833122849464417, + "learning_rate": 1.1474808277994915e-05, + "loss": 0.9086, + "step": 7210 + }, + { + "epoch": 1.3607407058380059, + "grad_norm": 0.23015838861465454, + "learning_rate": 1.1455285132727485e-05, + "loss": 0.9202, + "step": 7220 + }, + { + "epoch": 1.3626254535174103, + "grad_norm": 0.22292736172676086, + "learning_rate": 1.1435756318926832e-05, + "loss": 0.9023, + "step": 7230 + }, + { + "epoch": 1.3645102011968149, + "grad_norm": 0.2303238958120346, + "learning_rate": 1.1416221912660343e-05, + "loss": 0.9006, + "step": 7240 + }, + { + "epoch": 1.3663949488762193, + "grad_norm": 0.23296818137168884, + "learning_rate": 1.1396681990017173e-05, + "loss": 0.9043, + "step": 7250 + }, + { + "epoch": 1.3682796965556236, + "grad_norm": 0.2210160344839096, + "learning_rate": 1.1377136627107988e-05, + "loss": 0.8945, + "step": 7260 + }, + { + "epoch": 1.370164444235028, + "grad_norm": 0.2252742499113083, + "learning_rate": 1.1357585900064616e-05, + "loss": 0.9083, + "step": 7270 + }, + { + "epoch": 1.3720491919144324, + "grad_norm": 0.21804966032505035, + "learning_rate": 1.1338029885039805e-05, + "loss": 0.9195, + "step": 7280 + }, + { + "epoch": 1.3739339395938368, + "grad_norm": 0.2233145534992218, + "learning_rate": 1.1318468658206883e-05, + "loss": 0.9156, + "step": 7290 + }, + { + "epoch": 1.3758186872732412, + "grad_norm": 0.23149368166923523, + "learning_rate": 1.129890229575949e-05, + "loss": 0.9194, + "step": 7300 + }, + { + "epoch": 1.3777034349526458, + "grad_norm": 0.21982479095458984, + "learning_rate": 1.1279330873911259e-05, + "loss": 0.9087, + "step": 7310 + }, + { + "epoch": 1.3795881826320502, + "grad_norm": 0.22799623012542725, + "learning_rate": 1.1259754468895543e-05, + "loss": 0.9095, + "step": 7320 + }, + { + "epoch": 1.3814729303114546, + "grad_norm": 
0.2423265278339386, + "learning_rate": 1.1240173156965089e-05, + "loss": 0.9109, + "step": 7330 + }, + { + "epoch": 1.383357677990859, + "grad_norm": 0.2324947565793991, + "learning_rate": 1.1220587014391773e-05, + "loss": 0.9136, + "step": 7340 + }, + { + "epoch": 1.3852424256702633, + "grad_norm": 0.22605924308300018, + "learning_rate": 1.120099611746628e-05, + "loss": 0.8814, + "step": 7350 + }, + { + "epoch": 1.3871271733496677, + "grad_norm": 0.2252420037984848, + "learning_rate": 1.1181400542497809e-05, + "loss": 0.9222, + "step": 7360 + }, + { + "epoch": 1.3890119210290721, + "grad_norm": 0.22567017376422882, + "learning_rate": 1.1161800365813793e-05, + "loss": 0.9179, + "step": 7370 + }, + { + "epoch": 1.3908966687084767, + "grad_norm": 0.22838516533374786, + "learning_rate": 1.1142195663759574e-05, + "loss": 0.9005, + "step": 7380 + }, + { + "epoch": 1.392781416387881, + "grad_norm": 0.22734062373638153, + "learning_rate": 1.1122586512698137e-05, + "loss": 0.9059, + "step": 7390 + }, + { + "epoch": 1.3946661640672855, + "grad_norm": 0.2306530922651291, + "learning_rate": 1.1102972989009781e-05, + "loss": 0.9153, + "step": 7400 + }, + { + "epoch": 1.3965509117466899, + "grad_norm": 0.2245134860277176, + "learning_rate": 1.1083355169091849e-05, + "loss": 0.8913, + "step": 7410 + }, + { + "epoch": 1.3984356594260943, + "grad_norm": 0.23024848103523254, + "learning_rate": 1.106373312935841e-05, + "loss": 0.9205, + "step": 7420 + }, + { + "epoch": 1.4003204071054989, + "grad_norm": 0.2231600284576416, + "learning_rate": 1.1044106946239977e-05, + "loss": 0.9257, + "step": 7430 + }, + { + "epoch": 1.402205154784903, + "grad_norm": 0.23288121819496155, + "learning_rate": 1.1024476696183197e-05, + "loss": 0.9145, + "step": 7440 + }, + { + "epoch": 1.4040899024643076, + "grad_norm": 0.21642638742923737, + "learning_rate": 1.1004842455650559e-05, + "loss": 0.9171, + "step": 7450 + }, + { + "epoch": 1.405974650143712, + "grad_norm": 0.23001976311206818, + 
"learning_rate": 1.0985204301120098e-05, + "loss": 0.9119, + "step": 7460 + }, + { + "epoch": 1.4078593978231164, + "grad_norm": 0.2238442301750183, + "learning_rate": 1.0965562309085092e-05, + "loss": 0.9084, + "step": 7470 + }, + { + "epoch": 1.4097441455025208, + "grad_norm": 0.23155613243579865, + "learning_rate": 1.0945916556053765e-05, + "loss": 0.9102, + "step": 7480 + }, + { + "epoch": 1.4116288931819252, + "grad_norm": 0.23288863897323608, + "learning_rate": 1.0926267118548999e-05, + "loss": 0.8961, + "step": 7490 + }, + { + "epoch": 1.4135136408613298, + "grad_norm": 0.24038955569267273, + "learning_rate": 1.0906614073108015e-05, + "loss": 0.9148, + "step": 7500 + }, + { + "epoch": 1.4153983885407342, + "grad_norm": 0.2227010577917099, + "learning_rate": 1.0886957496282098e-05, + "loss": 0.9078, + "step": 7510 + }, + { + "epoch": 1.4172831362201386, + "grad_norm": 0.24735458195209503, + "learning_rate": 1.0867297464636281e-05, + "loss": 0.9019, + "step": 7520 + }, + { + "epoch": 1.419167883899543, + "grad_norm": 0.22768346965312958, + "learning_rate": 1.0847634054749061e-05, + "loss": 0.8951, + "step": 7530 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.23067669570446014, + "learning_rate": 1.0827967343212087e-05, + "loss": 0.9108, + "step": 7540 + }, + { + "epoch": 1.4229373792583517, + "grad_norm": 0.22677800059318542, + "learning_rate": 1.0808297406629874e-05, + "loss": 0.91, + "step": 7550 + }, + { + "epoch": 1.4248221269377561, + "grad_norm": 0.23281393945217133, + "learning_rate": 1.078862432161949e-05, + "loss": 0.9133, + "step": 7560 + }, + { + "epoch": 1.4267068746171607, + "grad_norm": 0.23670922219753265, + "learning_rate": 1.076894816481028e-05, + "loss": 0.9183, + "step": 7570 + }, + { + "epoch": 1.4285916222965651, + "grad_norm": 0.23590032756328583, + "learning_rate": 1.0749269012843541e-05, + "loss": 0.9271, + "step": 7580 + }, + { + "epoch": 1.4304763699759695, + "grad_norm": 0.23442265391349792, + "learning_rate": 
1.0729586942372248e-05, + "loss": 0.9145, + "step": 7590 + }, + { + "epoch": 1.4323611176553739, + "grad_norm": 0.21706892549991608, + "learning_rate": 1.0709902030060734e-05, + "loss": 0.9103, + "step": 7600 + }, + { + "epoch": 1.4342458653347783, + "grad_norm": 0.23177258670330048, + "learning_rate": 1.0690214352584405e-05, + "loss": 0.926, + "step": 7610 + }, + { + "epoch": 1.4361306130141827, + "grad_norm": 0.25657588243484497, + "learning_rate": 1.0670523986629444e-05, + "loss": 0.9293, + "step": 7620 + }, + { + "epoch": 1.438015360693587, + "grad_norm": 0.22809098660945892, + "learning_rate": 1.0650831008892497e-05, + "loss": 0.8935, + "step": 7630 + }, + { + "epoch": 1.4399001083729916, + "grad_norm": 0.22526070475578308, + "learning_rate": 1.0631135496080385e-05, + "loss": 0.9182, + "step": 7640 + }, + { + "epoch": 1.441784856052396, + "grad_norm": 0.2336445450782776, + "learning_rate": 1.0611437524909806e-05, + "loss": 0.9241, + "step": 7650 + }, + { + "epoch": 1.4436696037318004, + "grad_norm": 0.23072561621665955, + "learning_rate": 1.0591737172107036e-05, + "loss": 0.9016, + "step": 7660 + }, + { + "epoch": 1.4455543514112048, + "grad_norm": 0.23024435341358185, + "learning_rate": 1.0572034514407618e-05, + "loss": 0.9173, + "step": 7670 + }, + { + "epoch": 1.4474390990906092, + "grad_norm": 0.23413725197315216, + "learning_rate": 1.0552329628556086e-05, + "loss": 0.907, + "step": 7680 + }, + { + "epoch": 1.4493238467700138, + "grad_norm": 0.22813543677330017, + "learning_rate": 1.0532622591305642e-05, + "loss": 0.9114, + "step": 7690 + }, + { + "epoch": 1.451208594449418, + "grad_norm": 0.23336417973041534, + "learning_rate": 1.0512913479417873e-05, + "loss": 0.9105, + "step": 7700 + }, + { + "epoch": 1.4530933421288226, + "grad_norm": 0.23557589948177338, + "learning_rate": 1.049320236966245e-05, + "loss": 0.917, + "step": 7710 + }, + { + "epoch": 1.454978089808227, + "grad_norm": 0.2237783968448639, + "learning_rate": 1.0473489338816817e-05, + "loss": 
0.8965, + "step": 7720 + }, + { + "epoch": 1.4568628374876313, + "grad_norm": 0.22563421726226807, + "learning_rate": 1.0453774463665912e-05, + "loss": 0.9247, + "step": 7730 + }, + { + "epoch": 1.4587475851670357, + "grad_norm": 0.22884467244148254, + "learning_rate": 1.0434057821001842e-05, + "loss": 0.8961, + "step": 7740 + }, + { + "epoch": 1.4606323328464401, + "grad_norm": 0.23478251695632935, + "learning_rate": 1.041433948762362e-05, + "loss": 0.9071, + "step": 7750 + }, + { + "epoch": 1.4625170805258447, + "grad_norm": 0.22617754340171814, + "learning_rate": 1.039461954033682e-05, + "loss": 0.9087, + "step": 7760 + }, + { + "epoch": 1.464401828205249, + "grad_norm": 0.2358904629945755, + "learning_rate": 1.0374898055953327e-05, + "loss": 0.9073, + "step": 7770 + }, + { + "epoch": 1.4662865758846535, + "grad_norm": 0.2331264764070511, + "learning_rate": 1.0355175111290987e-05, + "loss": 0.8954, + "step": 7780 + }, + { + "epoch": 1.4681713235640579, + "grad_norm": 0.2822405695915222, + "learning_rate": 1.0335450783173359e-05, + "loss": 0.8926, + "step": 7790 + }, + { + "epoch": 1.4700560712434623, + "grad_norm": 0.23156620562076569, + "learning_rate": 1.0315725148429377e-05, + "loss": 0.9013, + "step": 7800 + }, + { + "epoch": 1.4719408189228667, + "grad_norm": 0.23397091031074524, + "learning_rate": 1.0295998283893066e-05, + "loss": 0.9245, + "step": 7810 + }, + { + "epoch": 1.473825566602271, + "grad_norm": 0.22240161895751953, + "learning_rate": 1.027627026640324e-05, + "loss": 0.9279, + "step": 7820 + }, + { + "epoch": 1.4757103142816757, + "grad_norm": 0.22860097885131836, + "learning_rate": 1.0256541172803206e-05, + "loss": 0.9125, + "step": 7830 + }, + { + "epoch": 1.47759506196108, + "grad_norm": 0.23673903942108154, + "learning_rate": 1.0236811079940473e-05, + "loss": 0.903, + "step": 7840 + }, + { + "epoch": 1.4794798096404844, + "grad_norm": 0.22764548659324646, + "learning_rate": 1.0217080064666418e-05, + "loss": 0.9226, + "step": 7850 + }, + { + 
"epoch": 1.4813645573198888, + "grad_norm": 0.22970451414585114, + "learning_rate": 1.0197348203836035e-05, + "loss": 0.8972, + "step": 7860 + }, + { + "epoch": 1.4832493049992932, + "grad_norm": 0.2309989333152771, + "learning_rate": 1.0177615574307593e-05, + "loss": 0.91, + "step": 7870 + }, + { + "epoch": 1.4851340526786976, + "grad_norm": 0.22437341511249542, + "learning_rate": 1.0157882252942373e-05, + "loss": 0.9029, + "step": 7880 + }, + { + "epoch": 1.487018800358102, + "grad_norm": 0.22497232258319855, + "learning_rate": 1.0138148316604334e-05, + "loss": 0.9172, + "step": 7890 + }, + { + "epoch": 1.4889035480375066, + "grad_norm": 0.22326721251010895, + "learning_rate": 1.0118413842159845e-05, + "loss": 0.9054, + "step": 7900 + }, + { + "epoch": 1.490788295716911, + "grad_norm": 0.22457511723041534, + "learning_rate": 1.0098678906477357e-05, + "loss": 0.909, + "step": 7910 + }, + { + "epoch": 1.4926730433963153, + "grad_norm": 0.21988549828529358, + "learning_rate": 1.0078943586427127e-05, + "loss": 0.9021, + "step": 7920 + }, + { + "epoch": 1.4945577910757197, + "grad_norm": 0.22954754531383514, + "learning_rate": 1.005920795888091e-05, + "loss": 0.9007, + "step": 7930 + }, + { + "epoch": 1.4964425387551241, + "grad_norm": 0.22897584736347198, + "learning_rate": 1.0039472100711651e-05, + "loss": 0.9052, + "step": 7940 + }, + { + "epoch": 1.4983272864345285, + "grad_norm": 0.23869483172893524, + "learning_rate": 1.00197360887932e-05, + "loss": 0.9198, + "step": 7950 + }, + { + "epoch": 1.500212034113933, + "grad_norm": 0.23050592839717865, + "learning_rate": 1e-05, + "loss": 0.9313, + "step": 7960 + }, + { + "epoch": 1.5020967817933375, + "grad_norm": 0.22967670857906342, + "learning_rate": 9.980263911206803e-06, + "loss": 0.9167, + "step": 7970 + }, + { + "epoch": 1.5039815294727419, + "grad_norm": 0.2269078493118286, + "learning_rate": 9.96052789928835e-06, + "loss": 0.9044, + "step": 7980 + }, + { + "epoch": 1.5058662771521463, + "grad_norm": 
0.22307592630386353, + "learning_rate": 9.940792041119094e-06, + "loss": 0.9192, + "step": 7990 + }, + { + "epoch": 1.5077510248315507, + "grad_norm": 0.23156896233558655, + "learning_rate": 9.921056413572875e-06, + "loss": 0.8965, + "step": 8000 + }, + { + "epoch": 1.509635772510955, + "grad_norm": 0.22353577613830566, + "learning_rate": 9.901321093522646e-06, + "loss": 0.9053, + "step": 8010 + }, + { + "epoch": 1.5115205201903597, + "grad_norm": 0.22150367498397827, + "learning_rate": 9.881586157840157e-06, + "loss": 0.8964, + "step": 8020 + }, + { + "epoch": 1.5134052678697638, + "grad_norm": 0.22103632986545563, + "learning_rate": 9.86185168339567e-06, + "loss": 0.9011, + "step": 8030 + }, + { + "epoch": 1.5152900155491684, + "grad_norm": 0.2321416288614273, + "learning_rate": 9.842117747057632e-06, + "loss": 0.9086, + "step": 8040 + }, + { + "epoch": 1.5171747632285728, + "grad_norm": 0.23300579190254211, + "learning_rate": 9.822384425692409e-06, + "loss": 0.9046, + "step": 8050 + }, + { + "epoch": 1.5190595109079772, + "grad_norm": 0.23319683969020844, + "learning_rate": 9.802651796163968e-06, + "loss": 0.9128, + "step": 8060 + }, + { + "epoch": 1.5209442585873816, + "grad_norm": 0.2380746603012085, + "learning_rate": 9.782919935333584e-06, + "loss": 0.9107, + "step": 8070 + }, + { + "epoch": 1.522829006266786, + "grad_norm": 0.2230675369501114, + "learning_rate": 9.763188920059532e-06, + "loss": 0.8961, + "step": 8080 + }, + { + "epoch": 1.5247137539461906, + "grad_norm": 0.23184828460216522, + "learning_rate": 9.743458827196795e-06, + "loss": 0.896, + "step": 8090 + }, + { + "epoch": 1.5265985016255947, + "grad_norm": 0.22784964740276337, + "learning_rate": 9.723729733596763e-06, + "loss": 0.9083, + "step": 8100 + }, + { + "epoch": 1.5284832493049993, + "grad_norm": 0.2225867509841919, + "learning_rate": 9.704001716106936e-06, + "loss": 0.9077, + "step": 8110 + }, + { + "epoch": 1.5303679969844037, + "grad_norm": 0.22300243377685547, + "learning_rate": 
9.684274851570627e-06, + "loss": 0.9151, + "step": 8120 + }, + { + "epoch": 1.5322527446638081, + "grad_norm": 0.2374887466430664, + "learning_rate": 9.664549216826643e-06, + "loss": 0.9008, + "step": 8130 + }, + { + "epoch": 1.5341374923432125, + "grad_norm": 0.22568826377391815, + "learning_rate": 9.644824888709015e-06, + "loss": 0.8955, + "step": 8140 + }, + { + "epoch": 1.536022240022617, + "grad_norm": 0.230172261595726, + "learning_rate": 9.625101944046676e-06, + "loss": 0.9016, + "step": 8150 + }, + { + "epoch": 1.5379069877020215, + "grad_norm": 0.2397712767124176, + "learning_rate": 9.60538045966318e-06, + "loss": 0.9032, + "step": 8160 + }, + { + "epoch": 1.5397917353814257, + "grad_norm": 0.22941763699054718, + "learning_rate": 9.585660512376385e-06, + "loss": 0.9156, + "step": 8170 + }, + { + "epoch": 1.5416764830608303, + "grad_norm": 0.2352951020002365, + "learning_rate": 9.56594217899816e-06, + "loss": 0.9092, + "step": 8180 + }, + { + "epoch": 1.5435612307402347, + "grad_norm": 0.2349829077720642, + "learning_rate": 9.546225536334091e-06, + "loss": 0.8959, + "step": 8190 + }, + { + "epoch": 1.545445978419639, + "grad_norm": 0.2359655797481537, + "learning_rate": 9.526510661183185e-06, + "loss": 0.907, + "step": 8200 + }, + { + "epoch": 1.5473307260990437, + "grad_norm": 0.2293015420436859, + "learning_rate": 9.506797630337556e-06, + "loss": 0.9065, + "step": 8210 + }, + { + "epoch": 1.5492154737784478, + "grad_norm": 0.2354525774717331, + "learning_rate": 9.48708652058213e-06, + "loss": 0.9036, + "step": 8220 + }, + { + "epoch": 1.5511002214578524, + "grad_norm": 0.22575883567333221, + "learning_rate": 9.46737740869436e-06, + "loss": 0.8913, + "step": 8230 + }, + { + "epoch": 1.5529849691372566, + "grad_norm": 0.21927089989185333, + "learning_rate": 9.447670371443918e-06, + "loss": 0.9096, + "step": 8240 + }, + { + "epoch": 1.5548697168166612, + "grad_norm": 0.23728735744953156, + "learning_rate": 9.427965485592384e-06, + "loss": 0.9189, + "step": 
8250 + }, + { + "epoch": 1.5567544644960656, + "grad_norm": 0.2445736676454544, + "learning_rate": 9.408262827892971e-06, + "loss": 0.9176, + "step": 8260 + }, + { + "epoch": 1.55863921217547, + "grad_norm": 0.22549158334732056, + "learning_rate": 9.388562475090197e-06, + "loss": 0.9099, + "step": 8270 + }, + { + "epoch": 1.5605239598548746, + "grad_norm": 0.22102531790733337, + "learning_rate": 9.368864503919618e-06, + "loss": 0.9109, + "step": 8280 + }, + { + "epoch": 1.5624087075342787, + "grad_norm": 0.2316359579563141, + "learning_rate": 9.349168991107506e-06, + "loss": 0.9045, + "step": 8290 + }, + { + "epoch": 1.5642934552136833, + "grad_norm": 0.22457793354988098, + "learning_rate": 9.32947601337056e-06, + "loss": 0.8918, + "step": 8300 + }, + { + "epoch": 1.5661782028930877, + "grad_norm": 0.22386574745178223, + "learning_rate": 9.309785647415597e-06, + "loss": 0.9001, + "step": 8310 + }, + { + "epoch": 1.5680629505724921, + "grad_norm": 0.21889880299568176, + "learning_rate": 9.290097969939269e-06, + "loss": 0.9085, + "step": 8320 + }, + { + "epoch": 1.5699476982518965, + "grad_norm": 0.226237490773201, + "learning_rate": 9.270413057627755e-06, + "loss": 0.9069, + "step": 8330 + }, + { + "epoch": 1.571832445931301, + "grad_norm": 0.23926052451133728, + "learning_rate": 9.250730987156459e-06, + "loss": 0.8947, + "step": 8340 + }, + { + "epoch": 1.5737171936107055, + "grad_norm": 0.23155365884304047, + "learning_rate": 9.231051835189726e-06, + "loss": 0.92, + "step": 8350 + }, + { + "epoch": 1.5756019412901097, + "grad_norm": 0.2211756557226181, + "learning_rate": 9.211375678380511e-06, + "loss": 0.8963, + "step": 8360 + }, + { + "epoch": 1.5774866889695143, + "grad_norm": 0.21844765543937683, + "learning_rate": 9.19170259337013e-06, + "loss": 0.8849, + "step": 8370 + }, + { + "epoch": 1.5793714366489187, + "grad_norm": 0.22600440680980682, + "learning_rate": 9.172032656787913e-06, + "loss": 0.8968, + "step": 8380 + }, + { + "epoch": 1.581256184328323, + 
"grad_norm": 0.2303336262702942, + "learning_rate": 9.152365945250942e-06, + "loss": 0.9124, + "step": 8390 + }, + { + "epoch": 1.5831409320077274, + "grad_norm": 0.22613443434238434, + "learning_rate": 9.13270253536372e-06, + "loss": 0.9065, + "step": 8400 + }, + { + "epoch": 1.5850256796871318, + "grad_norm": 0.22926831245422363, + "learning_rate": 9.113042503717906e-06, + "loss": 0.9117, + "step": 8410 + }, + { + "epoch": 1.5869104273665364, + "grad_norm": 0.23342342674732208, + "learning_rate": 9.093385926891987e-06, + "loss": 0.9094, + "step": 8420 + }, + { + "epoch": 1.5887951750459406, + "grad_norm": 0.23629553616046906, + "learning_rate": 9.073732881451001e-06, + "loss": 0.8899, + "step": 8430 + }, + { + "epoch": 1.5906799227253452, + "grad_norm": 0.22612647712230682, + "learning_rate": 9.054083443946236e-06, + "loss": 0.8953, + "step": 8440 + }, + { + "epoch": 1.5925646704047496, + "grad_norm": 0.22955381870269775, + "learning_rate": 9.034437690914911e-06, + "loss": 0.8921, + "step": 8450 + }, + { + "epoch": 1.594449418084154, + "grad_norm": 0.23840993642807007, + "learning_rate": 9.014795698879905e-06, + "loss": 0.8873, + "step": 8460 + }, + { + "epoch": 1.5963341657635584, + "grad_norm": 0.22730538249015808, + "learning_rate": 8.995157544349441e-06, + "loss": 0.8831, + "step": 8470 + }, + { + "epoch": 1.5982189134429627, + "grad_norm": 0.2300329953432083, + "learning_rate": 8.975523303816807e-06, + "loss": 0.885, + "step": 8480 + }, + { + "epoch": 1.6001036611223673, + "grad_norm": 0.22916771471500397, + "learning_rate": 8.955893053760027e-06, + "loss": 0.914, + "step": 8490 + }, + { + "epoch": 1.6019884088017715, + "grad_norm": 0.22204940021038055, + "learning_rate": 8.936266870641592e-06, + "loss": 0.8886, + "step": 8500 + }, + { + "epoch": 1.6038731564811761, + "grad_norm": 0.23086737096309662, + "learning_rate": 8.916644830908153e-06, + "loss": 0.9063, + "step": 8510 + }, + { + "epoch": 1.6057579041605805, + "grad_norm": 0.23701445758342743, + 
"learning_rate": 8.89702701099022e-06, + "loss": 0.9104, + "step": 8520 + }, + { + "epoch": 1.607642651839985, + "grad_norm": 0.23537544906139374, + "learning_rate": 8.877413487301868e-06, + "loss": 0.9098, + "step": 8530 + }, + { + "epoch": 1.6095273995193895, + "grad_norm": 0.23011383414268494, + "learning_rate": 8.857804336240428e-06, + "loss": 0.9094, + "step": 8540 + }, + { + "epoch": 1.6114121471987937, + "grad_norm": 0.24564208090305328, + "learning_rate": 8.838199634186209e-06, + "loss": 0.8852, + "step": 8550 + }, + { + "epoch": 1.6132968948781983, + "grad_norm": 0.23240883648395538, + "learning_rate": 8.818599457502191e-06, + "loss": 0.9028, + "step": 8560 + }, + { + "epoch": 1.6151816425576024, + "grad_norm": 0.24352017045021057, + "learning_rate": 8.799003882533724e-06, + "loss": 0.9015, + "step": 8570 + }, + { + "epoch": 1.617066390237007, + "grad_norm": 0.22767941653728485, + "learning_rate": 8.77941298560823e-06, + "loss": 0.8917, + "step": 8580 + }, + { + "epoch": 1.6189511379164114, + "grad_norm": 0.2323886752128601, + "learning_rate": 8.759826843034916e-06, + "loss": 0.9127, + "step": 8590 + }, + { + "epoch": 1.6208358855958158, + "grad_norm": 0.23042109608650208, + "learning_rate": 8.74024553110446e-06, + "loss": 0.9039, + "step": 8600 + }, + { + "epoch": 1.6227206332752204, + "grad_norm": 0.231172114610672, + "learning_rate": 8.720669126088741e-06, + "loss": 0.9046, + "step": 8610 + }, + { + "epoch": 1.6246053809546246, + "grad_norm": 0.23506036400794983, + "learning_rate": 8.701097704240515e-06, + "loss": 0.9084, + "step": 8620 + }, + { + "epoch": 1.6264901286340292, + "grad_norm": 0.2275066077709198, + "learning_rate": 8.68153134179312e-06, + "loss": 0.904, + "step": 8630 + }, + { + "epoch": 1.6283748763134336, + "grad_norm": 0.22837400436401367, + "learning_rate": 8.661970114960198e-06, + "loss": 0.9019, + "step": 8640 + }, + { + "epoch": 1.630259623992838, + "grad_norm": 0.21917550265789032, + "learning_rate": 8.642414099935385e-06, + 
"loss": 0.9169, + "step": 8650 + }, + { + "epoch": 1.6321443716722424, + "grad_norm": 0.22337158024311066, + "learning_rate": 8.622863372892019e-06, + "loss": 0.892, + "step": 8660 + }, + { + "epoch": 1.6340291193516467, + "grad_norm": 0.22163262963294983, + "learning_rate": 8.603318009982829e-06, + "loss": 0.8939, + "step": 8670 + }, + { + "epoch": 1.6359138670310513, + "grad_norm": 0.22648808360099792, + "learning_rate": 8.58377808733966e-06, + "loss": 0.908, + "step": 8680 + }, + { + "epoch": 1.6377986147104555, + "grad_norm": 0.2321372777223587, + "learning_rate": 8.564243681073168e-06, + "loss": 0.8909, + "step": 8690 + }, + { + "epoch": 1.6396833623898601, + "grad_norm": 0.23342463374137878, + "learning_rate": 8.544714867272516e-06, + "loss": 0.9153, + "step": 8700 + }, + { + "epoch": 1.6415681100692645, + "grad_norm": 0.23778577148914337, + "learning_rate": 8.52519172200509e-06, + "loss": 0.9056, + "step": 8710 + }, + { + "epoch": 1.643452857748669, + "grad_norm": 0.24257998168468475, + "learning_rate": 8.505674321316181e-06, + "loss": 0.9102, + "step": 8720 + }, + { + "epoch": 1.6453376054280733, + "grad_norm": 0.2255079597234726, + "learning_rate": 8.486162741228728e-06, + "loss": 0.9168, + "step": 8730 + }, + { + "epoch": 1.6472223531074777, + "grad_norm": 0.22320619225502014, + "learning_rate": 8.466657057742977e-06, + "loss": 0.9024, + "step": 8740 + }, + { + "epoch": 1.6491071007868823, + "grad_norm": 0.2286251038312912, + "learning_rate": 8.447157346836217e-06, + "loss": 0.8784, + "step": 8750 + }, + { + "epoch": 1.6509918484662864, + "grad_norm": 0.22908121347427368, + "learning_rate": 8.427663684462469e-06, + "loss": 0.9047, + "step": 8760 + }, + { + "epoch": 1.652876596145691, + "grad_norm": 0.22324106097221375, + "learning_rate": 8.408176146552193e-06, + "loss": 0.8891, + "step": 8770 + }, + { + "epoch": 1.6547613438250954, + "grad_norm": 0.2259310781955719, + "learning_rate": 8.388694809012002e-06, + "loss": 0.9164, + "step": 8780 + }, + { + 
"epoch": 1.6566460915044998, + "grad_norm": 0.22758160531520844, + "learning_rate": 8.369219747724344e-06, + "loss": 0.8968, + "step": 8790 + }, + { + "epoch": 1.6585308391839042, + "grad_norm": 0.2241467386484146, + "learning_rate": 8.34975103854724e-06, + "loss": 0.8998, + "step": 8800 + }, + { + "epoch": 1.6604155868633086, + "grad_norm": 0.21953533589839935, + "learning_rate": 8.330288757313938e-06, + "loss": 0.9094, + "step": 8810 + }, + { + "epoch": 1.6623003345427132, + "grad_norm": 0.22547945380210876, + "learning_rate": 8.31083297983268e-06, + "loss": 0.9033, + "step": 8820 + }, + { + "epoch": 1.6641850822221174, + "grad_norm": 0.23003444075584412, + "learning_rate": 8.291383781886352e-06, + "loss": 0.8814, + "step": 8830 + }, + { + "epoch": 1.666069829901522, + "grad_norm": 0.22156579792499542, + "learning_rate": 8.271941239232235e-06, + "loss": 0.8927, + "step": 8840 + }, + { + "epoch": 1.6679545775809264, + "grad_norm": 0.22913557291030884, + "learning_rate": 8.252505427601657e-06, + "loss": 0.8794, + "step": 8850 + }, + { + "epoch": 1.6698393252603307, + "grad_norm": 0.22364173829555511, + "learning_rate": 8.233076422699754e-06, + "loss": 0.8994, + "step": 8860 + }, + { + "epoch": 1.6717240729397354, + "grad_norm": 0.25098878145217896, + "learning_rate": 8.213654300205134e-06, + "loss": 0.9013, + "step": 8870 + }, + { + "epoch": 1.6736088206191395, + "grad_norm": 0.2504746913909912, + "learning_rate": 8.194239135769602e-06, + "loss": 0.908, + "step": 8880 + }, + { + "epoch": 1.6754935682985441, + "grad_norm": 0.23507602512836456, + "learning_rate": 8.174831005017865e-06, + "loss": 0.9046, + "step": 8890 + }, + { + "epoch": 1.6773783159779483, + "grad_norm": 0.23796898126602173, + "learning_rate": 8.155429983547222e-06, + "loss": 0.9094, + "step": 8900 + }, + { + "epoch": 1.679263063657353, + "grad_norm": 0.2341054528951645, + "learning_rate": 8.13603614692729e-06, + "loss": 0.907, + "step": 8910 + }, + { + "epoch": 1.6811478113367573, + "grad_norm": 
0.23235785961151123, + "learning_rate": 8.116649570699694e-06, + "loss": 0.896, + "step": 8920 + }, + { + "epoch": 1.6830325590161617, + "grad_norm": 0.23428434133529663, + "learning_rate": 8.097270330377782e-06, + "loss": 0.9068, + "step": 8930 + }, + { + "epoch": 1.6849173066955663, + "grad_norm": 0.2364516705274582, + "learning_rate": 8.077898501446328e-06, + "loss": 0.9037, + "step": 8940 + }, + { + "epoch": 1.6868020543749704, + "grad_norm": 0.2230335921049118, + "learning_rate": 8.058534159361233e-06, + "loss": 0.8951, + "step": 8950 + }, + { + "epoch": 1.688686802054375, + "grad_norm": 0.234270840883255, + "learning_rate": 8.039177379549241e-06, + "loss": 0.9075, + "step": 8960 + }, + { + "epoch": 1.6905715497337794, + "grad_norm": 0.2414071410894394, + "learning_rate": 8.01982823740764e-06, + "loss": 0.9102, + "step": 8970 + }, + { + "epoch": 1.6924562974131838, + "grad_norm": 0.2239461988210678, + "learning_rate": 8.000486808303967e-06, + "loss": 0.8805, + "step": 8980 + }, + { + "epoch": 1.6943410450925882, + "grad_norm": 0.2901683449745178, + "learning_rate": 7.98115316757571e-06, + "loss": 0.8998, + "step": 8990 + }, + { + "epoch": 1.6962257927719926, + "grad_norm": 0.22979173064231873, + "learning_rate": 7.961827390530025e-06, + "loss": 0.9084, + "step": 9000 + }, + { + "epoch": 1.6981105404513972, + "grad_norm": 0.2258685678243637, + "learning_rate": 7.942509552443445e-06, + "loss": 0.8781, + "step": 9010 + }, + { + "epoch": 1.6999952881308014, + "grad_norm": 0.2213747799396515, + "learning_rate": 7.923199728561569e-06, + "loss": 0.9175, + "step": 9020 + }, + { + "epoch": 1.701880035810206, + "grad_norm": 0.23156055808067322, + "learning_rate": 7.903897994098781e-06, + "loss": 0.9039, + "step": 9030 + }, + { + "epoch": 1.7037647834896104, + "grad_norm": 0.23495662212371826, + "learning_rate": 7.88460442423796e-06, + "loss": 0.8799, + "step": 9040 + }, + { + "epoch": 1.7056495311690147, + "grad_norm": 0.23376572132110596, + "learning_rate": 
7.86531909413018e-06, + "loss": 0.8905, + "step": 9050 + }, + { + "epoch": 1.7075342788484191, + "grad_norm": 0.22342433035373688, + "learning_rate": 7.846042078894423e-06, + "loss": 0.882, + "step": 9060 + }, + { + "epoch": 1.7094190265278235, + "grad_norm": 0.22667643427848816, + "learning_rate": 7.82677345361728e-06, + "loss": 0.9074, + "step": 9070 + }, + { + "epoch": 1.7113037742072281, + "grad_norm": 0.22309918701648712, + "learning_rate": 7.807513293352661e-06, + "loss": 0.8838, + "step": 9080 + }, + { + "epoch": 1.7131885218866323, + "grad_norm": 0.2220918983221054, + "learning_rate": 7.788261673121506e-06, + "loss": 0.8884, + "step": 9090 + }, + { + "epoch": 1.715073269566037, + "grad_norm": 0.24215960502624512, + "learning_rate": 7.769018667911494e-06, + "loss": 0.9062, + "step": 9100 + }, + { + "epoch": 1.7169580172454413, + "grad_norm": 0.22776749730110168, + "learning_rate": 7.749784352676743e-06, + "loss": 0.8946, + "step": 9110 + }, + { + "epoch": 1.7188427649248457, + "grad_norm": 0.2295990139245987, + "learning_rate": 7.73055880233752e-06, + "loss": 0.9209, + "step": 9120 + }, + { + "epoch": 1.72072751260425, + "grad_norm": 0.23977740108966827, + "learning_rate": 7.711342091779952e-06, + "loss": 0.8929, + "step": 9130 + }, + { + "epoch": 1.7226122602836544, + "grad_norm": 0.23079144954681396, + "learning_rate": 7.692134295855742e-06, + "loss": 0.9101, + "step": 9140 + }, + { + "epoch": 1.724497007963059, + "grad_norm": 0.2258554846048355, + "learning_rate": 7.672935489381862e-06, + "loss": 0.8939, + "step": 9150 + }, + { + "epoch": 1.7263817556424632, + "grad_norm": 0.2362135499715805, + "learning_rate": 7.653745747140272e-06, + "loss": 0.909, + "step": 9160 + }, + { + "epoch": 1.7282665033218678, + "grad_norm": 0.22513015568256378, + "learning_rate": 7.634565143877617e-06, + "loss": 0.8934, + "step": 9170 + }, + { + "epoch": 1.7301512510012722, + "grad_norm": 0.2308982014656067, + "learning_rate": 7.615393754304958e-06, + "loss": 0.8818, + "step": 
9180 + }, + { + "epoch": 1.7320359986806766, + "grad_norm": 0.21769167482852936, + "learning_rate": 7.596231653097461e-06, + "loss": 0.9024, + "step": 9190 + }, + { + "epoch": 1.7339207463600812, + "grad_norm": 0.2219037413597107, + "learning_rate": 7.577078914894111e-06, + "loss": 0.9047, + "step": 9200 + }, + { + "epoch": 1.7358054940394854, + "grad_norm": 0.22065497934818268, + "learning_rate": 7.557935614297421e-06, + "loss": 0.9147, + "step": 9210 + }, + { + "epoch": 1.73769024171889, + "grad_norm": 0.22735194861888885, + "learning_rate": 7.538801825873151e-06, + "loss": 0.8817, + "step": 9220 + }, + { + "epoch": 1.7395749893982944, + "grad_norm": 0.22561149299144745, + "learning_rate": 7.519677624150004e-06, + "loss": 0.89, + "step": 9230 + }, + { + "epoch": 1.7414597370776987, + "grad_norm": 0.23324687778949738, + "learning_rate": 7.50056308361934e-06, + "loss": 0.8787, + "step": 9240 + }, + { + "epoch": 1.7433444847571031, + "grad_norm": 0.22331084311008453, + "learning_rate": 7.481458278734897e-06, + "loss": 0.8894, + "step": 9250 + }, + { + "epoch": 1.7452292324365075, + "grad_norm": 0.223952516913414, + "learning_rate": 7.462363283912478e-06, + "loss": 0.8898, + "step": 9260 + }, + { + "epoch": 1.7471139801159121, + "grad_norm": 0.23146231472492218, + "learning_rate": 7.443278173529684e-06, + "loss": 0.9102, + "step": 9270 + }, + { + "epoch": 1.7489987277953163, + "grad_norm": 0.22037121653556824, + "learning_rate": 7.424203021925608e-06, + "loss": 0.905, + "step": 9280 + }, + { + "epoch": 1.750883475474721, + "grad_norm": 0.22742916643619537, + "learning_rate": 7.405137903400566e-06, + "loss": 0.8763, + "step": 9290 + }, + { + "epoch": 1.7527682231541253, + "grad_norm": 0.24321119487285614, + "learning_rate": 7.386082892215769e-06, + "loss": 0.8906, + "step": 9300 + }, + { + "epoch": 1.7546529708335297, + "grad_norm": 0.23914779722690582, + "learning_rate": 7.367038062593084e-06, + "loss": 0.8944, + "step": 9310 + }, + { + "epoch": 1.756537718512934, + 
"grad_norm": 0.23057810962200165, + "learning_rate": 7.3480034887147016e-06, + "loss": 0.8879, + "step": 9320 + }, + { + "epoch": 1.7584224661923384, + "grad_norm": 0.22740276157855988, + "learning_rate": 7.3289792447228835e-06, + "loss": 0.9105, + "step": 9330 + }, + { + "epoch": 1.760307213871743, + "grad_norm": 0.23685742914676666, + "learning_rate": 7.30996540471963e-06, + "loss": 0.8896, + "step": 9340 + }, + { + "epoch": 1.7621919615511472, + "grad_norm": 0.23985283076763153, + "learning_rate": 7.290962042766437e-06, + "loss": 0.8792, + "step": 9350 + }, + { + "epoch": 1.7640767092305518, + "grad_norm": 0.23605075478553772, + "learning_rate": 7.27196923288398e-06, + "loss": 0.8742, + "step": 9360 + }, + { + "epoch": 1.7659614569099562, + "grad_norm": 0.23639057576656342, + "learning_rate": 7.25298704905183e-06, + "loss": 0.8998, + "step": 9370 + }, + { + "epoch": 1.7678462045893606, + "grad_norm": 0.2320484220981598, + "learning_rate": 7.2340155652081754e-06, + "loss": 0.888, + "step": 9380 + }, + { + "epoch": 1.769730952268765, + "grad_norm": 0.23318111896514893, + "learning_rate": 7.215054855249518e-06, + "loss": 0.9107, + "step": 9390 + }, + { + "epoch": 1.7716156999481694, + "grad_norm": 0.2385403960943222, + "learning_rate": 7.196104993030399e-06, + "loss": 0.8979, + "step": 9400 + }, + { + "epoch": 1.773500447627574, + "grad_norm": 0.22439107298851013, + "learning_rate": 7.177166052363103e-06, + "loss": 0.8961, + "step": 9410 + }, + { + "epoch": 1.7753851953069781, + "grad_norm": 0.22573572397232056, + "learning_rate": 7.1582381070173834e-06, + "loss": 0.9083, + "step": 9420 + }, + { + "epoch": 1.7772699429863827, + "grad_norm": 0.22317597270011902, + "learning_rate": 7.139321230720151e-06, + "loss": 0.8886, + "step": 9430 + }, + { + "epoch": 1.7791546906657871, + "grad_norm": 0.2243865430355072, + "learning_rate": 7.120415497155209e-06, + "loss": 0.8955, + "step": 9440 + }, + { + "epoch": 1.7810394383451915, + "grad_norm": 0.23646317422389984, + 
"learning_rate": 7.101520979962955e-06, + "loss": 0.8983, + "step": 9450 + }, + { + "epoch": 1.782924186024596, + "grad_norm": 0.23641552031040192, + "learning_rate": 7.082637752740104e-06, + "loss": 0.8995, + "step": 9460 + }, + { + "epoch": 1.7848089337040003, + "grad_norm": 0.23694801330566406, + "learning_rate": 7.06376588903939e-06, + "loss": 0.9094, + "step": 9470 + }, + { + "epoch": 1.786693681383405, + "grad_norm": 0.22671470046043396, + "learning_rate": 7.0449054623692824e-06, + "loss": 0.9077, + "step": 9480 + }, + { + "epoch": 1.788578429062809, + "grad_norm": 0.22473391890525818, + "learning_rate": 7.026056546193702e-06, + "loss": 0.8809, + "step": 9490 + }, + { + "epoch": 1.7904631767422137, + "grad_norm": 0.22928671538829803, + "learning_rate": 7.0072192139317444e-06, + "loss": 0.8885, + "step": 9500 + }, + { + "epoch": 1.792347924421618, + "grad_norm": 0.2302265465259552, + "learning_rate": 6.988393538957373e-06, + "loss": 0.9054, + "step": 9510 + }, + { + "epoch": 1.7942326721010224, + "grad_norm": 0.23828274011611938, + "learning_rate": 6.9695795945991475e-06, + "loss": 0.8968, + "step": 9520 + }, + { + "epoch": 1.796117419780427, + "grad_norm": 0.2447022795677185, + "learning_rate": 6.950777454139933e-06, + "loss": 0.8906, + "step": 9530 + }, + { + "epoch": 1.7980021674598312, + "grad_norm": 0.2287413626909256, + "learning_rate": 6.931987190816627e-06, + "loss": 0.9019, + "step": 9540 + }, + { + "epoch": 1.7998869151392358, + "grad_norm": 0.22899089753627777, + "learning_rate": 6.913208877819853e-06, + "loss": 0.9083, + "step": 9550 + }, + { + "epoch": 1.8017716628186402, + "grad_norm": 0.23157139122486115, + "learning_rate": 6.894442588293695e-06, + "loss": 0.8995, + "step": 9560 + }, + { + "epoch": 1.8036564104980446, + "grad_norm": 0.2276429980993271, + "learning_rate": 6.875688395335392e-06, + "loss": 0.8804, + "step": 9570 + }, + { + "epoch": 1.805541158177449, + "grad_norm": 0.2620387375354767, + "learning_rate": 6.856946371995072e-06, + 
"loss": 0.8875, + "step": 9580 + }, + { + "epoch": 1.8074259058568534, + "grad_norm": 0.23903511464595795, + "learning_rate": 6.8382165912754676e-06, + "loss": 0.896, + "step": 9590 + }, + { + "epoch": 1.809310653536258, + "grad_norm": 0.2399318963289261, + "learning_rate": 6.819499126131617e-06, + "loss": 0.8745, + "step": 9600 + }, + { + "epoch": 1.8111954012156621, + "grad_norm": 0.2356083244085312, + "learning_rate": 6.800794049470586e-06, + "loss": 0.9009, + "step": 9610 + }, + { + "epoch": 1.8130801488950667, + "grad_norm": 0.23368997871875763, + "learning_rate": 6.782101434151187e-06, + "loss": 0.8906, + "step": 9620 + }, + { + "epoch": 1.8149648965744711, + "grad_norm": 0.2524058520793915, + "learning_rate": 6.763421352983699e-06, + "loss": 0.8679, + "step": 9630 + }, + { + "epoch": 1.8168496442538755, + "grad_norm": 0.2235083132982254, + "learning_rate": 6.744753878729574e-06, + "loss": 0.8975, + "step": 9640 + }, + { + "epoch": 1.81873439193328, + "grad_norm": 0.22758863866329193, + "learning_rate": 6.726099084101161e-06, + "loss": 0.8981, + "step": 9650 + }, + { + "epoch": 1.8206191396126843, + "grad_norm": 0.22522778809070587, + "learning_rate": 6.7074570417614115e-06, + "loss": 0.8945, + "step": 9660 + }, + { + "epoch": 1.822503887292089, + "grad_norm": 0.2318229079246521, + "learning_rate": 6.68882782432362e-06, + "loss": 0.8925, + "step": 9670 + }, + { + "epoch": 1.824388634971493, + "grad_norm": 0.2325669825077057, + "learning_rate": 6.670211504351114e-06, + "loss": 0.9052, + "step": 9680 + }, + { + "epoch": 1.8262733826508977, + "grad_norm": 0.22296932339668274, + "learning_rate": 6.651608154356992e-06, + "loss": 0.905, + "step": 9690 + }, + { + "epoch": 1.828158130330302, + "grad_norm": 0.22154469788074493, + "learning_rate": 6.633017846803821e-06, + "loss": 0.8878, + "step": 9700 + }, + { + "epoch": 1.8300428780097064, + "grad_norm": 0.23253493010997772, + "learning_rate": 6.61444065410338e-06, + "loss": 0.8985, + "step": 9710 + }, + { + "epoch": 
1.8319276256891108, + "grad_norm": 0.23508669435977936, + "learning_rate": 6.595876648616355e-06, + "loss": 0.8997, + "step": 9720 + }, + { + "epoch": 1.8338123733685152, + "grad_norm": 0.22651048004627228, + "learning_rate": 6.577325902652068e-06, + "loss": 0.9037, + "step": 9730 + }, + { + "epoch": 1.8356971210479198, + "grad_norm": 0.23645028471946716, + "learning_rate": 6.5587884884681995e-06, + "loss": 0.9, + "step": 9740 + }, + { + "epoch": 1.837581868727324, + "grad_norm": 0.22717803716659546, + "learning_rate": 6.540264478270486e-06, + "loss": 0.9036, + "step": 9750 + }, + { + "epoch": 1.8394666164067286, + "grad_norm": 0.22862401604652405, + "learning_rate": 6.521753944212468e-06, + "loss": 0.8974, + "step": 9760 + }, + { + "epoch": 1.841351364086133, + "grad_norm": 0.22483615577220917, + "learning_rate": 6.503256958395186e-06, + "loss": 0.8828, + "step": 9770 + }, + { + "epoch": 1.8432361117655374, + "grad_norm": 0.2501292824745178, + "learning_rate": 6.48477359286692e-06, + "loss": 0.8917, + "step": 9780 + }, + { + "epoch": 1.845120859444942, + "grad_norm": 0.24290792644023895, + "learning_rate": 6.466303919622878e-06, + "loss": 0.9037, + "step": 9790 + }, + { + "epoch": 1.8470056071243461, + "grad_norm": 0.2380213737487793, + "learning_rate": 6.4478480106049516e-06, + "loss": 0.8786, + "step": 9800 + }, + { + "epoch": 1.8488903548037507, + "grad_norm": 0.22485703229904175, + "learning_rate": 6.429405937701411e-06, + "loss": 0.89, + "step": 9810 + }, + { + "epoch": 1.850775102483155, + "grad_norm": 0.23403207957744598, + "learning_rate": 6.410977772746636e-06, + "loss": 0.8865, + "step": 9820 + }, + { + "epoch": 1.8526598501625595, + "grad_norm": 0.22541895508766174, + "learning_rate": 6.392563587520833e-06, + "loss": 0.9015, + "step": 9830 + }, + { + "epoch": 1.854544597841964, + "grad_norm": 0.22902552783489227, + "learning_rate": 6.374163453749751e-06, + "loss": 0.867, + "step": 9840 + }, + { + "epoch": 1.8564293455213683, + "grad_norm": 
0.230988010764122, + "learning_rate": 6.355777443104409e-06, + "loss": 0.9038, + "step": 9850 + }, + { + "epoch": 1.858314093200773, + "grad_norm": 0.23819199204444885, + "learning_rate": 6.337405627200817e-06, + "loss": 0.88, + "step": 9860 + }, + { + "epoch": 1.860198840880177, + "grad_norm": 0.23556335270404816, + "learning_rate": 6.3190480775996966e-06, + "loss": 0.8956, + "step": 9870 + }, + { + "epoch": 1.8620835885595817, + "grad_norm": 0.23634588718414307, + "learning_rate": 6.3007048658061906e-06, + "loss": 0.8912, + "step": 9880 + }, + { + "epoch": 1.863968336238986, + "grad_norm": 0.2301328480243683, + "learning_rate": 6.282376063269603e-06, + "loss": 0.882, + "step": 9890 + }, + { + "epoch": 1.8658530839183904, + "grad_norm": 0.235159769654274, + "learning_rate": 6.264061741383105e-06, + "loss": 0.8865, + "step": 9900 + }, + { + "epoch": 1.8677378315977948, + "grad_norm": 0.24038714170455933, + "learning_rate": 6.245761971483473e-06, + "loss": 0.8856, + "step": 9910 + }, + { + "epoch": 1.8696225792771992, + "grad_norm": 0.2281685173511505, + "learning_rate": 6.227476824850795e-06, + "loss": 0.8886, + "step": 9920 + }, + { + "epoch": 1.8715073269566038, + "grad_norm": 0.23089322447776794, + "learning_rate": 6.209206372708199e-06, + "loss": 0.8838, + "step": 9930 + }, + { + "epoch": 1.873392074636008, + "grad_norm": 0.23512980341911316, + "learning_rate": 6.1909506862215725e-06, + "loss": 0.875, + "step": 9940 + }, + { + "epoch": 1.8752768223154126, + "grad_norm": 0.23389123380184174, + "learning_rate": 6.1727098364992996e-06, + "loss": 0.8861, + "step": 9950 + }, + { + "epoch": 1.877161569994817, + "grad_norm": 0.24204064905643463, + "learning_rate": 6.15448389459197e-06, + "loss": 0.8893, + "step": 9960 + }, + { + "epoch": 1.8790463176742214, + "grad_norm": 0.2255880981683731, + "learning_rate": 6.136272931492093e-06, + "loss": 0.889, + "step": 9970 + }, + { + "epoch": 1.8809310653536258, + "grad_norm": 0.2251080423593521, + "learning_rate": 
6.1180770181338475e-06, + "loss": 0.8858, + "step": 9980 + }, + { + "epoch": 1.8828158130330301, + "grad_norm": 0.23081152141094208, + "learning_rate": 6.0998962253927895e-06, + "loss": 0.872, + "step": 9990 + }, + { + "epoch": 1.8847005607124347, + "grad_norm": 0.2369980812072754, + "learning_rate": 6.081730624085575e-06, + "loss": 0.9013, + "step": 10000 + }, + { + "epoch": 1.886585308391839, + "grad_norm": 0.2321942299604416, + "learning_rate": 6.0635802849696875e-06, + "loss": 0.8936, + "step": 10010 + }, + { + "epoch": 1.8884700560712435, + "grad_norm": 0.22632603347301483, + "learning_rate": 6.0454452787431595e-06, + "loss": 0.8918, + "step": 10020 + }, + { + "epoch": 1.890354803750648, + "grad_norm": 0.23758600652217865, + "learning_rate": 6.027325676044304e-06, + "loss": 0.9134, + "step": 10030 + }, + { + "epoch": 1.8922395514300523, + "grad_norm": 0.23487801849842072, + "learning_rate": 6.009221547451438e-06, + "loss": 0.8792, + "step": 10040 + }, + { + "epoch": 1.8941242991094567, + "grad_norm": 0.2403988242149353, + "learning_rate": 5.9911329634826e-06, + "loss": 0.8637, + "step": 10050 + }, + { + "epoch": 1.896009046788861, + "grad_norm": 0.26421618461608887, + "learning_rate": 5.973059994595277e-06, + "loss": 0.9, + "step": 10060 + }, + { + "epoch": 1.8978937944682657, + "grad_norm": 0.22247080504894257, + "learning_rate": 5.955002711186139e-06, + "loss": 0.9014, + "step": 10070 + }, + { + "epoch": 1.8997785421476698, + "grad_norm": 0.23290959000587463, + "learning_rate": 5.936961183590759e-06, + "loss": 0.8997, + "step": 10080 + }, + { + "epoch": 1.9016632898270744, + "grad_norm": 0.2305413782596588, + "learning_rate": 5.9189354820833344e-06, + "loss": 0.8995, + "step": 10090 + }, + { + "epoch": 1.9035480375064788, + "grad_norm": 0.23337997496128082, + "learning_rate": 5.900925676876428e-06, + "loss": 0.8854, + "step": 10100 + }, + { + "epoch": 1.9054327851858832, + "grad_norm": 0.23526322841644287, + "learning_rate": 5.882931838120665e-06, + "loss": 
0.9085, + "step": 10110 + }, + { + "epoch": 1.9073175328652878, + "grad_norm": 0.231749027967453, + "learning_rate": 5.864954035904497e-06, + "loss": 0.8953, + "step": 10120 + }, + { + "epoch": 1.909202280544692, + "grad_norm": 0.2299305647611618, + "learning_rate": 5.8469923402539064e-06, + "loss": 0.9023, + "step": 10130 + }, + { + "epoch": 1.9110870282240966, + "grad_norm": 0.21948467195034027, + "learning_rate": 5.829046821132137e-06, + "loss": 0.8717, + "step": 10140 + }, + { + "epoch": 1.9129717759035008, + "grad_norm": 0.2384217083454132, + "learning_rate": 5.81111754843942e-06, + "loss": 0.8896, + "step": 10150 + }, + { + "epoch": 1.9148565235829054, + "grad_norm": 0.2315974086523056, + "learning_rate": 5.793204592012703e-06, + "loss": 0.9099, + "step": 10160 + }, + { + "epoch": 1.9167412712623098, + "grad_norm": 0.23426386713981628, + "learning_rate": 5.775308021625395e-06, + "loss": 0.8752, + "step": 10170 + }, + { + "epoch": 1.9186260189417141, + "grad_norm": 0.22818118333816528, + "learning_rate": 5.7574279069870606e-06, + "loss": 0.9126, + "step": 10180 + }, + { + "epoch": 1.9205107666211187, + "grad_norm": 0.24530449509620667, + "learning_rate": 5.739564317743183e-06, + "loss": 0.8799, + "step": 10190 + }, + { + "epoch": 1.922395514300523, + "grad_norm": 0.23236067593097687, + "learning_rate": 5.721717323474856e-06, + "loss": 0.8793, + "step": 10200 + }, + { + "epoch": 1.9242802619799275, + "grad_norm": 0.2423572838306427, + "learning_rate": 5.703886993698547e-06, + "loss": 0.8897, + "step": 10210 + }, + { + "epoch": 1.926165009659332, + "grad_norm": 0.23328860104084015, + "learning_rate": 5.6860733978658214e-06, + "loss": 0.8932, + "step": 10220 + }, + { + "epoch": 1.9280497573387363, + "grad_norm": 0.2349046766757965, + "learning_rate": 5.668276605363052e-06, + "loss": 0.8908, + "step": 10230 + }, + { + "epoch": 1.9299345050181407, + "grad_norm": 0.22404028475284576, + "learning_rate": 5.6504966855111565e-06, + "loss": 0.881, + "step": 10240 + }, + 
{ + "epoch": 1.931819252697545, + "grad_norm": 0.22888754308223724, + "learning_rate": 5.632733707565335e-06, + "loss": 0.8959, + "step": 10250 + }, + { + "epoch": 1.9337040003769497, + "grad_norm": 0.2299530953168869, + "learning_rate": 5.61498774071481e-06, + "loss": 0.8831, + "step": 10260 + }, + { + "epoch": 1.9355887480563538, + "grad_norm": 0.23142626881599426, + "learning_rate": 5.5972588540825245e-06, + "loss": 0.8846, + "step": 10270 + }, + { + "epoch": 1.9374734957357584, + "grad_norm": 0.2290276288986206, + "learning_rate": 5.579547116724906e-06, + "loss": 0.8845, + "step": 10280 + }, + { + "epoch": 1.9393582434151628, + "grad_norm": 0.22451536357402802, + "learning_rate": 5.5618525976315655e-06, + "loss": 0.8867, + "step": 10290 + }, + { + "epoch": 1.9412429910945672, + "grad_norm": 0.22780054807662964, + "learning_rate": 5.544175365725067e-06, + "loss": 0.885, + "step": 10300 + }, + { + "epoch": 1.9431277387739716, + "grad_norm": 0.23396742343902588, + "learning_rate": 5.526515489860626e-06, + "loss": 0.8915, + "step": 10310 + }, + { + "epoch": 1.945012486453376, + "grad_norm": 0.22758087515830994, + "learning_rate": 5.508873038825864e-06, + "loss": 0.869, + "step": 10320 + }, + { + "epoch": 1.9468972341327806, + "grad_norm": 0.2890429198741913, + "learning_rate": 5.49124808134051e-06, + "loss": 0.8792, + "step": 10330 + }, + { + "epoch": 1.9487819818121848, + "grad_norm": 0.23406127095222473, + "learning_rate": 5.473640686056178e-06, + "loss": 0.8996, + "step": 10340 + }, + { + "epoch": 1.9506667294915894, + "grad_norm": 0.23863603174686432, + "learning_rate": 5.45605092155606e-06, + "loss": 0.8802, + "step": 10350 + }, + { + "epoch": 1.9525514771709938, + "grad_norm": 0.2363266795873642, + "learning_rate": 5.438478856354679e-06, + "loss": 0.8759, + "step": 10360 + }, + { + "epoch": 1.9544362248503981, + "grad_norm": 0.2253665179014206, + "learning_rate": 5.420924558897616e-06, + "loss": 0.9028, + "step": 10370 + }, + { + "epoch": 1.9563209725298025, 
+ "grad_norm": 0.228993758559227, + "learning_rate": 5.403388097561235e-06, + "loss": 0.8983, + "step": 10380 + }, + { + "epoch": 1.958205720209207, + "grad_norm": 0.22133120894432068, + "learning_rate": 5.385869540652445e-06, + "loss": 0.912, + "step": 10390 + }, + { + "epoch": 1.9600904678886115, + "grad_norm": 0.22942990064620972, + "learning_rate": 5.368368956408401e-06, + "loss": 0.8816, + "step": 10400 + }, + { + "epoch": 1.9619752155680157, + "grad_norm": 0.2309000939130783, + "learning_rate": 5.3508864129962566e-06, + "loss": 0.8825, + "step": 10410 + }, + { + "epoch": 1.9638599632474203, + "grad_norm": 0.22815611958503723, + "learning_rate": 5.333421978512884e-06, + "loss": 0.8682, + "step": 10420 + }, + { + "epoch": 1.9657447109268247, + "grad_norm": 0.2259228229522705, + "learning_rate": 5.3159757209846364e-06, + "loss": 0.8722, + "step": 10430 + }, + { + "epoch": 1.967629458606229, + "grad_norm": 0.22418862581253052, + "learning_rate": 5.298547708367054e-06, + "loss": 0.903, + "step": 10440 + }, + { + "epoch": 1.9695142062856337, + "grad_norm": 0.22846896946430206, + "learning_rate": 5.2811380085446125e-06, + "loss": 0.8753, + "step": 10450 + }, + { + "epoch": 1.9713989539650378, + "grad_norm": 0.2290189564228058, + "learning_rate": 5.263746689330456e-06, + "loss": 0.9136, + "step": 10460 + }, + { + "epoch": 1.9732837016444424, + "grad_norm": 0.2355813980102539, + "learning_rate": 5.246373818466135e-06, + "loss": 0.8864, + "step": 10470 + }, + { + "epoch": 1.9751684493238466, + "grad_norm": 0.24332086741924286, + "learning_rate": 5.229019463621341e-06, + "loss": 0.8809, + "step": 10480 + }, + { + "epoch": 1.9770531970032512, + "grad_norm": 0.22658877074718475, + "learning_rate": 5.2116836923936435e-06, + "loss": 0.8902, + "step": 10490 + }, + { + "epoch": 1.9789379446826556, + "grad_norm": 0.2260638028383255, + "learning_rate": 5.1943665723082235e-06, + "loss": 0.9168, + "step": 10500 + }, + { + "epoch": 1.98082269236206, + "grad_norm": 
0.22660474479198456, + "learning_rate": 5.177068170817618e-06, + "loss": 0.8594, + "step": 10510 + }, + { + "epoch": 1.9827074400414646, + "grad_norm": 0.2363136261701584, + "learning_rate": 5.159788555301448e-06, + "loss": 0.882, + "step": 10520 + }, + { + "epoch": 1.9845921877208688, + "grad_norm": 0.23157107830047607, + "learning_rate": 5.142527793066163e-06, + "loss": 0.9007, + "step": 10530 + }, + { + "epoch": 1.9864769354002734, + "grad_norm": 0.22773833572864532, + "learning_rate": 5.125285951344778e-06, + "loss": 0.8757, + "step": 10540 + }, + { + "epoch": 1.9883616830796778, + "grad_norm": 0.23509202897548676, + "learning_rate": 5.108063097296605e-06, + "loss": 0.9031, + "step": 10550 + }, + { + "epoch": 1.9902464307590821, + "grad_norm": 0.22999800741672516, + "learning_rate": 5.090859298007e-06, + "loss": 0.8754, + "step": 10560 + }, + { + "epoch": 1.9921311784384865, + "grad_norm": 0.22917740046977997, + "learning_rate": 5.073674620487099e-06, + "loss": 0.8855, + "step": 10570 + }, + { + "epoch": 1.994015926117891, + "grad_norm": 0.24235695600509644, + "learning_rate": 5.056509131673555e-06, + "loss": 0.8701, + "step": 10580 + }, + { + "epoch": 1.9959006737972955, + "grad_norm": 0.22780245542526245, + "learning_rate": 5.039362898428275e-06, + "loss": 0.8988, + "step": 10590 + }, + { + "epoch": 1.9977854214766997, + "grad_norm": 0.23184923827648163, + "learning_rate": 5.0222359875381685e-06, + "loss": 0.8873, + "step": 10600 + }, + { + "epoch": 1.9996701691561043, + "grad_norm": 0.2361418753862381, + "learning_rate": 5.005128465714879e-06, + "loss": 0.8718, + "step": 10610 + }, + { + "epoch": 2.001696272911464, + "grad_norm": 0.2560180127620697, + "learning_rate": 4.988040399594526e-06, + "loss": 0.9234, + "step": 10620 + }, + { + "epoch": 2.003581020590868, + "grad_norm": 0.24925002455711365, + "learning_rate": 4.9709718557374485e-06, + "loss": 0.8332, + "step": 10630 + }, + { + "epoch": 2.005465768270273, + "grad_norm": 0.26883363723754883, + 
"learning_rate": 4.953922900627943e-06, + "loss": 0.8279, + "step": 10640 + }, + { + "epoch": 2.0073505159496774, + "grad_norm": 0.24383634328842163, + "learning_rate": 4.936893600674005e-06, + "loss": 0.7992, + "step": 10650 + }, + { + "epoch": 2.0092352636290816, + "grad_norm": 0.2473938763141632, + "learning_rate": 4.91988402220707e-06, + "loss": 0.8283, + "step": 10660 + }, + { + "epoch": 2.011120011308486, + "grad_norm": 0.23907481133937836, + "learning_rate": 4.902894231481756e-06, + "loss": 0.8066, + "step": 10670 + }, + { + "epoch": 2.0130047589878903, + "grad_norm": 0.24316947162151337, + "learning_rate": 4.885924294675605e-06, + "loss": 0.8383, + "step": 10680 + }, + { + "epoch": 2.014889506667295, + "grad_norm": 0.23081253468990326, + "learning_rate": 4.868974277888826e-06, + "loss": 0.8139, + "step": 10690 + }, + { + "epoch": 2.0167742543466995, + "grad_norm": 0.24183881282806396, + "learning_rate": 4.852044247144036e-06, + "loss": 0.8119, + "step": 10700 + }, + { + "epoch": 2.0186590020261037, + "grad_norm": 0.24454614520072937, + "learning_rate": 4.835134268386004e-06, + "loss": 0.8086, + "step": 10710 + }, + { + "epoch": 2.0205437497055083, + "grad_norm": 0.24963019788265228, + "learning_rate": 4.818244407481388e-06, + "loss": 0.8189, + "step": 10720 + }, + { + "epoch": 2.0224284973849125, + "grad_norm": 0.23485392332077026, + "learning_rate": 4.801374730218504e-06, + "loss": 0.8065, + "step": 10730 + }, + { + "epoch": 2.024313245064317, + "grad_norm": 0.24228812754154205, + "learning_rate": 4.784525302307023e-06, + "loss": 0.8128, + "step": 10740 + }, + { + "epoch": 2.0261979927437213, + "grad_norm": 0.24162879586219788, + "learning_rate": 4.7676961893777595e-06, + "loss": 0.8023, + "step": 10750 + }, + { + "epoch": 2.028082740423126, + "grad_norm": 0.24873000383377075, + "learning_rate": 4.750887456982394e-06, + "loss": 0.812, + "step": 10760 + }, + { + "epoch": 2.0299674881025305, + "grad_norm": 0.24241720139980316, + "learning_rate": 
4.734099170593221e-06, + "loss": 0.815, + "step": 10770 + }, + { + "epoch": 2.0318522357819346, + "grad_norm": 0.24945859611034393, + "learning_rate": 4.7173313956028945e-06, + "loss": 0.8164, + "step": 10780 + }, + { + "epoch": 2.0337369834613392, + "grad_norm": 0.23338696360588074, + "learning_rate": 4.700584197324179e-06, + "loss": 0.8212, + "step": 10790 + }, + { + "epoch": 2.0356217311407434, + "grad_norm": 0.2504580020904541, + "learning_rate": 4.683857640989682e-06, + "loss": 0.8247, + "step": 10800 + }, + { + "epoch": 2.037506478820148, + "grad_norm": 0.23590603470802307, + "learning_rate": 4.66715179175161e-06, + "loss": 0.824, + "step": 10810 + }, + { + "epoch": 2.039391226499552, + "grad_norm": 0.24513493478298187, + "learning_rate": 4.650466714681526e-06, + "loss": 0.8056, + "step": 10820 + }, + { + "epoch": 2.041275974178957, + "grad_norm": 0.23629970848560333, + "learning_rate": 4.6338024747700595e-06, + "loss": 0.8197, + "step": 10830 + }, + { + "epoch": 2.0431607218583614, + "grad_norm": 0.24195465445518494, + "learning_rate": 4.617159136926692e-06, + "loss": 0.8083, + "step": 10840 + }, + { + "epoch": 2.0450454695377656, + "grad_norm": 0.2452387660741806, + "learning_rate": 4.600536765979481e-06, + "loss": 0.8316, + "step": 10850 + }, + { + "epoch": 2.04693021721717, + "grad_norm": 0.24240179359912872, + "learning_rate": 4.5839354266748284e-06, + "loss": 0.8155, + "step": 10860 + }, + { + "epoch": 2.0488149648965743, + "grad_norm": 0.2518026828765869, + "learning_rate": 4.5673551836771955e-06, + "loss": 0.8264, + "step": 10870 + }, + { + "epoch": 2.050699712575979, + "grad_norm": 0.2484510838985443, + "learning_rate": 4.550796101568884e-06, + "loss": 0.822, + "step": 10880 + }, + { + "epoch": 2.052584460255383, + "grad_norm": 0.24544069170951843, + "learning_rate": 4.534258244849763e-06, + "loss": 0.8238, + "step": 10890 + }, + { + "epoch": 2.0544692079347877, + "grad_norm": 0.24874962866306305, + "learning_rate": 4.517741677937039e-06, + "loss": 
0.818, + "step": 10900 + }, + { + "epoch": 2.0563539556141923, + "grad_norm": 0.24343523383140564, + "learning_rate": 4.501246465164981e-06, + "loss": 0.8226, + "step": 10910 + }, + { + "epoch": 2.0582387032935965, + "grad_norm": 0.23904551565647125, + "learning_rate": 4.484772670784679e-06, + "loss": 0.8071, + "step": 10920 + }, + { + "epoch": 2.060123450973001, + "grad_norm": 0.25297197699546814, + "learning_rate": 4.468320358963799e-06, + "loss": 0.8282, + "step": 10930 + }, + { + "epoch": 2.0620081986524053, + "grad_norm": 0.2462884485721588, + "learning_rate": 4.451889593786328e-06, + "loss": 0.8202, + "step": 10940 + }, + { + "epoch": 2.06389294633181, + "grad_norm": 0.24125376343727112, + "learning_rate": 4.435480439252339e-06, + "loss": 0.7995, + "step": 10950 + }, + { + "epoch": 2.0657776940112145, + "grad_norm": 0.24337950348854065, + "learning_rate": 4.419092959277706e-06, + "loss": 0.8318, + "step": 10960 + }, + { + "epoch": 2.0676624416906186, + "grad_norm": 0.24656328558921814, + "learning_rate": 4.402727217693892e-06, + "loss": 0.8273, + "step": 10970 + }, + { + "epoch": 2.0695471893700232, + "grad_norm": 0.2367190271615982, + "learning_rate": 4.386383278247679e-06, + "loss": 0.8231, + "step": 10980 + }, + { + "epoch": 2.0714319370494274, + "grad_norm": 0.24478529393672943, + "learning_rate": 4.3700612046009396e-06, + "loss": 0.8319, + "step": 10990 + }, + { + "epoch": 2.073316684728832, + "grad_norm": 0.2592780292034149, + "learning_rate": 4.353761060330366e-06, + "loss": 0.8155, + "step": 11000 + }, + { + "epoch": 2.075201432408236, + "grad_norm": 0.2566168010234833, + "learning_rate": 4.337482908927227e-06, + "loss": 0.8205, + "step": 11010 + }, + { + "epoch": 2.077086180087641, + "grad_norm": 0.2513549327850342, + "learning_rate": 4.3212268137971325e-06, + "loss": 0.8107, + "step": 11020 + }, + { + "epoch": 2.0789709277670454, + "grad_norm": 0.2495751529932022, + "learning_rate": 4.304992838259788e-06, + "loss": 0.8264, + "step": 11030 + }, + { + 
"epoch": 2.0808556754464496, + "grad_norm": 0.24470172822475433, + "learning_rate": 4.288781045548734e-06, + "loss": 0.813, + "step": 11040 + }, + { + "epoch": 2.082740423125854, + "grad_norm": 0.24850420653820038, + "learning_rate": 4.2725914988110966e-06, + "loss": 0.8071, + "step": 11050 + }, + { + "epoch": 2.0846251708052583, + "grad_norm": 0.24322272837162018, + "learning_rate": 4.256424261107357e-06, + "loss": 0.8125, + "step": 11060 + }, + { + "epoch": 2.086509918484663, + "grad_norm": 0.2384253740310669, + "learning_rate": 4.240279395411111e-06, + "loss": 0.8167, + "step": 11070 + }, + { + "epoch": 2.088394666164067, + "grad_norm": 0.24936096370220184, + "learning_rate": 4.2241569646087985e-06, + "loss": 0.8142, + "step": 11080 + }, + { + "epoch": 2.0902794138434717, + "grad_norm": 0.25035253167152405, + "learning_rate": 4.208057031499482e-06, + "loss": 0.8238, + "step": 11090 + }, + { + "epoch": 2.0921641615228763, + "grad_norm": 0.24180178344249725, + "learning_rate": 4.191979658794575e-06, + "loss": 0.8225, + "step": 11100 + }, + { + "epoch": 2.0940489092022805, + "grad_norm": 0.2440442442893982, + "learning_rate": 4.175924909117638e-06, + "loss": 0.8227, + "step": 11110 + }, + { + "epoch": 2.095933656881685, + "grad_norm": 0.24729479849338531, + "learning_rate": 4.159892845004099e-06, + "loss": 0.8069, + "step": 11120 + }, + { + "epoch": 2.0978184045610893, + "grad_norm": 0.25781846046447754, + "learning_rate": 4.1438835289010286e-06, + "loss": 0.8125, + "step": 11130 + }, + { + "epoch": 2.099703152240494, + "grad_norm": 0.24645312130451202, + "learning_rate": 4.127897023166878e-06, + "loss": 0.8235, + "step": 11140 + }, + { + "epoch": 2.101587899919898, + "grad_norm": 0.24224001169204712, + "learning_rate": 4.11193339007126e-06, + "loss": 0.8119, + "step": 11150 + }, + { + "epoch": 2.1034726475993026, + "grad_norm": 0.244343101978302, + "learning_rate": 4.095992691794699e-06, + "loss": 0.8187, + "step": 11160 + }, + { + "epoch": 2.1053573952787072, + 
"grad_norm": 0.24623258411884308, + "learning_rate": 4.080074990428378e-06, + "loss": 0.8251, + "step": 11170 + }, + { + "epoch": 2.1072421429581114, + "grad_norm": 0.2696931064128876, + "learning_rate": 4.064180347973907e-06, + "loss": 0.835, + "step": 11180 + }, + { + "epoch": 2.109126890637516, + "grad_norm": 0.25944069027900696, + "learning_rate": 4.048308826343068e-06, + "loss": 0.8082, + "step": 11190 + }, + { + "epoch": 2.11101163831692, + "grad_norm": 0.2443627417087555, + "learning_rate": 4.032460487357602e-06, + "loss": 0.8179, + "step": 11200 + }, + { + "epoch": 2.112896385996325, + "grad_norm": 0.2455567717552185, + "learning_rate": 4.016635392748939e-06, + "loss": 0.8064, + "step": 11210 + }, + { + "epoch": 2.114781133675729, + "grad_norm": 0.25627031922340393, + "learning_rate": 4.000833604157975e-06, + "loss": 0.8386, + "step": 11220 + }, + { + "epoch": 2.1166658813551336, + "grad_norm": 0.2484632134437561, + "learning_rate": 3.985055183134812e-06, + "loss": 0.8029, + "step": 11230 + }, + { + "epoch": 2.118550629034538, + "grad_norm": 0.2524816393852234, + "learning_rate": 3.969300191138553e-06, + "loss": 0.8281, + "step": 11240 + }, + { + "epoch": 2.1204353767139423, + "grad_norm": 0.2410929799079895, + "learning_rate": 3.953568689537027e-06, + "loss": 0.8182, + "step": 11250 + }, + { + "epoch": 2.1221316496254063, + "grad_norm": 0.2454637587070465, + "learning_rate": 3.937860739606571e-06, + "loss": 0.8133, + "step": 11260 + }, + { + "epoch": 2.124016397304811, + "grad_norm": 0.24103568494319916, + "learning_rate": 3.922176402531782e-06, + "loss": 0.8068, + "step": 11270 + }, + { + "epoch": 2.125901144984215, + "grad_norm": 0.2459927648305893, + "learning_rate": 3.9065157394052835e-06, + "loss": 0.8197, + "step": 11280 + }, + { + "epoch": 2.1277858926636197, + "grad_norm": 0.25149574875831604, + "learning_rate": 3.890878811227484e-06, + "loss": 0.8251, + "step": 11290 + }, + { + "epoch": 2.129670640343024, + "grad_norm": 0.25246286392211914, + 
"learning_rate": 3.875265678906342e-06, + "loss": 0.8062, + "step": 11300 + }, + { + "epoch": 2.1315553880224285, + "grad_norm": 0.3085257112979889, + "learning_rate": 3.859676403257131e-06, + "loss": 0.8155, + "step": 11310 + }, + { + "epoch": 2.133440135701833, + "grad_norm": 0.25187939405441284, + "learning_rate": 3.844111045002193e-06, + "loss": 0.8133, + "step": 11320 + }, + { + "epoch": 2.1353248833812373, + "grad_norm": 0.240624338388443, + "learning_rate": 3.8285696647707146e-06, + "loss": 0.8075, + "step": 11330 + }, + { + "epoch": 2.137209631060642, + "grad_norm": 0.2506885230541229, + "learning_rate": 3.8130523230984827e-06, + "loss": 0.8242, + "step": 11340 + }, + { + "epoch": 2.139094378740046, + "grad_norm": 0.2454839050769806, + "learning_rate": 3.7975590804276484e-06, + "loss": 0.8222, + "step": 11350 + }, + { + "epoch": 2.1409791264194507, + "grad_norm": 0.2395995557308197, + "learning_rate": 3.7820899971064972e-06, + "loss": 0.8139, + "step": 11360 + }, + { + "epoch": 2.142863874098855, + "grad_norm": 0.2515275180339813, + "learning_rate": 3.7666451333892097e-06, + "loss": 0.8148, + "step": 11370 + }, + { + "epoch": 2.1447486217782594, + "grad_norm": 0.2368747740983963, + "learning_rate": 3.751224549435626e-06, + "loss": 0.8203, + "step": 11380 + }, + { + "epoch": 2.146633369457664, + "grad_norm": 0.24806943535804749, + "learning_rate": 3.7358283053110156e-06, + "loss": 0.8281, + "step": 11390 + }, + { + "epoch": 2.148518117137068, + "grad_norm": 0.24763457477092743, + "learning_rate": 3.7204564609858407e-06, + "loss": 0.8196, + "step": 11400 + }, + { + "epoch": 2.150402864816473, + "grad_norm": 0.23972785472869873, + "learning_rate": 3.7051090763355222e-06, + "loss": 0.8061, + "step": 11410 + }, + { + "epoch": 2.152287612495877, + "grad_norm": 0.24285432696342468, + "learning_rate": 3.689786211140207e-06, + "loss": 0.8184, + "step": 11420 + }, + { + "epoch": 2.1541723601752816, + "grad_norm": 0.24448655545711517, + "learning_rate": 
3.6744879250845354e-06, + "loss": 0.8213, + "step": 11430 + }, + { + "epoch": 2.1560571078546857, + "grad_norm": 0.2486555129289627, + "learning_rate": 3.6592142777574114e-06, + "loss": 0.8176, + "step": 11440 + }, + { + "epoch": 2.1579418555340903, + "grad_norm": 0.2540287375450134, + "learning_rate": 3.643965328651763e-06, + "loss": 0.8106, + "step": 11450 + }, + { + "epoch": 2.159826603213495, + "grad_norm": 0.2507999837398529, + "learning_rate": 3.62874113716432e-06, + "loss": 0.8112, + "step": 11460 + }, + { + "epoch": 2.161711350892899, + "grad_norm": 0.2464211881160736, + "learning_rate": 3.613541762595374e-06, + "loss": 0.8133, + "step": 11470 + }, + { + "epoch": 2.1635960985723037, + "grad_norm": 0.237705796957016, + "learning_rate": 3.5983672641485535e-06, + "loss": 0.82, + "step": 11480 + }, + { + "epoch": 2.165480846251708, + "grad_norm": 0.23233497142791748, + "learning_rate": 3.5832177009305903e-06, + "loss": 0.8155, + "step": 11490 + }, + { + "epoch": 2.1673655939311125, + "grad_norm": 0.2472357302904129, + "learning_rate": 3.5680931319510913e-06, + "loss": 0.8063, + "step": 11500 + }, + { + "epoch": 2.1692503416105167, + "grad_norm": 0.2413078248500824, + "learning_rate": 3.5529936161223067e-06, + "loss": 0.8012, + "step": 11510 + }, + { + "epoch": 2.1711350892899213, + "grad_norm": 0.2434530407190323, + "learning_rate": 3.5379192122589e-06, + "loss": 0.8161, + "step": 11520 + }, + { + "epoch": 2.173019836969326, + "grad_norm": 0.2521507740020752, + "learning_rate": 3.522869979077723e-06, + "loss": 0.8367, + "step": 11530 + }, + { + "epoch": 2.17490458464873, + "grad_norm": 0.24296317994594574, + "learning_rate": 3.507845975197581e-06, + "loss": 0.8124, + "step": 11540 + }, + { + "epoch": 2.1767893323281347, + "grad_norm": 0.25801870226860046, + "learning_rate": 3.4928472591390107e-06, + "loss": 0.8249, + "step": 11550 + }, + { + "epoch": 2.178674080007539, + "grad_norm": 0.25144097208976746, + "learning_rate": 3.4778738893240484e-06, + "loss": 
0.8208, + "step": 11560 + }, + { + "epoch": 2.1805588276869434, + "grad_norm": 0.24377991259098053, + "learning_rate": 3.4629259240760004e-06, + "loss": 0.8246, + "step": 11570 + }, + { + "epoch": 2.182443575366348, + "grad_norm": 0.24247846007347107, + "learning_rate": 3.4480034216192237e-06, + "loss": 0.8094, + "step": 11580 + }, + { + "epoch": 2.184328323045752, + "grad_norm": 0.24987009167671204, + "learning_rate": 3.4331064400788904e-06, + "loss": 0.82, + "step": 11590 + }, + { + "epoch": 2.186213070725157, + "grad_norm": 0.24102284014225006, + "learning_rate": 3.4182350374807684e-06, + "loss": 0.8309, + "step": 11600 + }, + { + "epoch": 2.188097818404561, + "grad_norm": 0.25077807903289795, + "learning_rate": 3.4033892717509888e-06, + "loss": 0.821, + "step": 11610 + }, + { + "epoch": 2.1899825660839656, + "grad_norm": 0.2519144117832184, + "learning_rate": 3.388569200715822e-06, + "loss": 0.8143, + "step": 11620 + }, + { + "epoch": 2.1918673137633697, + "grad_norm": 0.2795230746269226, + "learning_rate": 3.373774882101468e-06, + "loss": 0.8171, + "step": 11630 + }, + { + "epoch": 2.1937520614427743, + "grad_norm": 0.2398180067539215, + "learning_rate": 3.3590063735337964e-06, + "loss": 0.8214, + "step": 11640 + }, + { + "epoch": 2.195636809122179, + "grad_norm": 0.2451600283384323, + "learning_rate": 3.3442637325381578e-06, + "loss": 0.8231, + "step": 11650 + }, + { + "epoch": 2.197521556801583, + "grad_norm": 0.24921981990337372, + "learning_rate": 3.329547016539135e-06, + "loss": 0.8147, + "step": 11660 + }, + { + "epoch": 2.1994063044809877, + "grad_norm": 0.24091488122940063, + "learning_rate": 3.3148562828603494e-06, + "loss": 0.798, + "step": 11670 + }, + { + "epoch": 2.201291052160392, + "grad_norm": 0.24540333449840546, + "learning_rate": 3.300191588724191e-06, + "loss": 0.8088, + "step": 11680 + }, + { + "epoch": 2.2031757998397965, + "grad_norm": 0.2531675696372986, + "learning_rate": 3.285552991251638e-06, + "loss": 0.8313, + "step": 11690 + }, + 
{ + "epoch": 2.2050605475192007, + "grad_norm": 0.2434483766555786, + "learning_rate": 3.270940547462018e-06, + "loss": 0.8225, + "step": 11700 + }, + { + "epoch": 2.2069452951986053, + "grad_norm": 0.2557360529899597, + "learning_rate": 3.2563543142727784e-06, + "loss": 0.8321, + "step": 11710 + }, + { + "epoch": 2.20883004287801, + "grad_norm": 0.24668008089065552, + "learning_rate": 3.2417943484992877e-06, + "loss": 0.8154, + "step": 11720 + }, + { + "epoch": 2.210714790557414, + "grad_norm": 0.2438749521970749, + "learning_rate": 3.2272607068545825e-06, + "loss": 0.8202, + "step": 11730 + }, + { + "epoch": 2.2125995382368187, + "grad_norm": 0.24026921391487122, + "learning_rate": 3.2127534459491715e-06, + "loss": 0.8318, + "step": 11740 + }, + { + "epoch": 2.214484285916223, + "grad_norm": 0.250897079706192, + "learning_rate": 3.1982726222908046e-06, + "loss": 0.8074, + "step": 11750 + }, + { + "epoch": 2.2163690335956274, + "grad_norm": 0.2464846670627594, + "learning_rate": 3.1838182922842653e-06, + "loss": 0.8083, + "step": 11760 + }, + { + "epoch": 2.218253781275032, + "grad_norm": 0.24076153337955475, + "learning_rate": 3.169390512231123e-06, + "loss": 0.8092, + "step": 11770 + }, + { + "epoch": 2.220138528954436, + "grad_norm": 0.2606302499771118, + "learning_rate": 3.1549893383295415e-06, + "loss": 0.8203, + "step": 11780 + }, + { + "epoch": 2.222023276633841, + "grad_norm": 0.24699008464813232, + "learning_rate": 3.1406148266740487e-06, + "loss": 0.8121, + "step": 11790 + }, + { + "epoch": 2.223908024313245, + "grad_norm": 0.23926283419132233, + "learning_rate": 3.1262670332553235e-06, + "loss": 0.8149, + "step": 11800 + }, + { + "epoch": 2.2257927719926496, + "grad_norm": 0.26139411330223083, + "learning_rate": 3.111946013959971e-06, + "loss": 0.823, + "step": 11810 + }, + { + "epoch": 2.2276775196720537, + "grad_norm": 0.24025562405586243, + "learning_rate": 3.097651824570298e-06, + "loss": 0.8098, + "step": 11820 + }, + { + "epoch": 
2.2295622673514583, + "grad_norm": 0.24612407386302948, + "learning_rate": 3.0833845207641166e-06, + "loss": 0.8173, + "step": 11830 + }, + { + "epoch": 2.231447015030863, + "grad_norm": 0.2446829229593277, + "learning_rate": 3.0691441581145153e-06, + "loss": 0.8181, + "step": 11840 + }, + { + "epoch": 2.233331762710267, + "grad_norm": 0.2485884577035904, + "learning_rate": 3.0549307920896433e-06, + "loss": 0.8141, + "step": 11850 + }, + { + "epoch": 2.2352165103896717, + "grad_norm": 0.25292423367500305, + "learning_rate": 3.040744478052484e-06, + "loss": 0.8303, + "step": 11860 + }, + { + "epoch": 2.237101258069076, + "grad_norm": 0.2497747540473938, + "learning_rate": 3.026585271260657e-06, + "loss": 0.8134, + "step": 11870 + }, + { + "epoch": 2.2389860057484805, + "grad_norm": 0.25006502866744995, + "learning_rate": 3.0124532268662033e-06, + "loss": 0.8061, + "step": 11880 + }, + { + "epoch": 2.2408707534278847, + "grad_norm": 0.2527255117893219, + "learning_rate": 2.9983483999153496e-06, + "loss": 0.8186, + "step": 11890 + }, + { + "epoch": 2.2427555011072893, + "grad_norm": 0.24054576456546783, + "learning_rate": 2.984270845348316e-06, + "loss": 0.7875, + "step": 11900 + }, + { + "epoch": 2.244640248786694, + "grad_norm": 0.2478708177804947, + "learning_rate": 2.9702206179990844e-06, + "loss": 0.8191, + "step": 11910 + }, + { + "epoch": 2.246524996466098, + "grad_norm": 0.24709494411945343, + "learning_rate": 2.9561977725951975e-06, + "loss": 0.809, + "step": 11920 + }, + { + "epoch": 2.2484097441455027, + "grad_norm": 0.24183432757854462, + "learning_rate": 2.942202363757548e-06, + "loss": 0.815, + "step": 11930 + }, + { + "epoch": 2.250294491824907, + "grad_norm": 0.25928908586502075, + "learning_rate": 2.9282344460001554e-06, + "loss": 0.8246, + "step": 11940 + }, + { + "epoch": 2.2521792395043114, + "grad_norm": 0.26835206151008606, + "learning_rate": 2.9142940737299486e-06, + "loss": 0.8124, + "step": 11950 + }, + { + "epoch": 2.2540639871837156, + 
"grad_norm": 0.2382918894290924, + "learning_rate": 2.900381301246572e-06, + "loss": 0.8011, + "step": 11960 + }, + { + "epoch": 2.25594873486312, + "grad_norm": 0.24477191269397736, + "learning_rate": 2.8864961827421687e-06, + "loss": 0.8007, + "step": 11970 + }, + { + "epoch": 2.257833482542525, + "grad_norm": 0.24288354814052582, + "learning_rate": 2.8726387723011616e-06, + "loss": 0.8123, + "step": 11980 + }, + { + "epoch": 2.259718230221929, + "grad_norm": 0.24399004876613617, + "learning_rate": 2.8588091239000405e-06, + "loss": 0.8172, + "step": 11990 + }, + { + "epoch": 2.2616029779013336, + "grad_norm": 0.24827581644058228, + "learning_rate": 2.8450072914071634e-06, + "loss": 0.8179, + "step": 12000 + }, + { + "epoch": 2.2634877255807377, + "grad_norm": 0.25022023916244507, + "learning_rate": 2.831233328582549e-06, + "loss": 0.8278, + "step": 12010 + }, + { + "epoch": 2.2653724732601423, + "grad_norm": 0.24463871121406555, + "learning_rate": 2.8174872890776484e-06, + "loss": 0.8211, + "step": 12020 + }, + { + "epoch": 2.2672572209395465, + "grad_norm": 0.23996546864509583, + "learning_rate": 2.803769226435157e-06, + "loss": 0.8047, + "step": 12030 + }, + { + "epoch": 2.269141968618951, + "grad_norm": 0.24569767713546753, + "learning_rate": 2.7900791940887816e-06, + "loss": 0.814, + "step": 12040 + }, + { + "epoch": 2.2710267162983557, + "grad_norm": 0.2563968896865845, + "learning_rate": 2.7764172453630667e-06, + "loss": 0.8125, + "step": 12050 + }, + { + "epoch": 2.27291146397776, + "grad_norm": 0.24970680475234985, + "learning_rate": 2.762783433473154e-06, + "loss": 0.8121, + "step": 12060 + }, + { + "epoch": 2.2747962116571645, + "grad_norm": 0.24802082777023315, + "learning_rate": 2.749177811524597e-06, + "loss": 0.7995, + "step": 12070 + }, + { + "epoch": 2.2766809593365687, + "grad_norm": 0.2517079710960388, + "learning_rate": 2.7356004325131334e-06, + "loss": 0.825, + "step": 12080 + }, + { + "epoch": 2.2785657070159733, + "grad_norm": 
0.2397715449333191, + "learning_rate": 2.7220513493245006e-06, + "loss": 0.8078, + "step": 12090 + }, + { + "epoch": 2.2804504546953774, + "grad_norm": 0.2788592576980591, + "learning_rate": 2.7085306147342207e-06, + "loss": 0.8165, + "step": 12100 + }, + { + "epoch": 2.282335202374782, + "grad_norm": 0.2643386423587799, + "learning_rate": 2.6950382814073884e-06, + "loss": 0.8146, + "step": 12110 + }, + { + "epoch": 2.2842199500541867, + "grad_norm": 0.24478402733802795, + "learning_rate": 2.6815744018984793e-06, + "loss": 0.7987, + "step": 12120 + }, + { + "epoch": 2.286104697733591, + "grad_norm": 0.24901443719863892, + "learning_rate": 2.6681390286511223e-06, + "loss": 0.8184, + "step": 12130 + }, + { + "epoch": 2.2879894454129954, + "grad_norm": 0.2519311010837555, + "learning_rate": 2.6547322139979294e-06, + "loss": 0.797, + "step": 12140 + }, + { + "epoch": 2.2898741930923996, + "grad_norm": 0.2518060803413391, + "learning_rate": 2.641354010160262e-06, + "loss": 0.8016, + "step": 12150 + }, + { + "epoch": 2.291758940771804, + "grad_norm": 0.24641728401184082, + "learning_rate": 2.628004469248043e-06, + "loss": 0.805, + "step": 12160 + }, + { + "epoch": 2.2936436884512084, + "grad_norm": 0.26472392678260803, + "learning_rate": 2.614683643259539e-06, + "loss": 0.8318, + "step": 12170 + }, + { + "epoch": 2.295528436130613, + "grad_norm": 0.2475789189338684, + "learning_rate": 2.6013915840811853e-06, + "loss": 0.8284, + "step": 12180 + }, + { + "epoch": 2.2974131838100176, + "grad_norm": 0.24545416235923767, + "learning_rate": 2.588128343487354e-06, + "loss": 0.8227, + "step": 12190 + }, + { + "epoch": 2.2992979314894217, + "grad_norm": 0.24939602613449097, + "learning_rate": 2.574893973140168e-06, + "loss": 0.8137, + "step": 12200 + }, + { + "epoch": 2.3011826791688264, + "grad_norm": 0.2527926564216614, + "learning_rate": 2.5616885245892976e-06, + "loss": 0.8065, + "step": 12210 + }, + { + "epoch": 2.3030674268482305, + "grad_norm": 0.24875377118587494, + 
"learning_rate": 2.5485120492717585e-06, + "loss": 0.8221, + "step": 12220 + }, + { + "epoch": 2.304952174527635, + "grad_norm": 0.25809380412101746, + "learning_rate": 2.5353645985117113e-06, + "loss": 0.8031, + "step": 12230 + }, + { + "epoch": 2.3068369222070393, + "grad_norm": 0.24700433015823364, + "learning_rate": 2.5222462235202614e-06, + "loss": 0.8335, + "step": 12240 + }, + { + "epoch": 2.308721669886444, + "grad_norm": 0.2484574019908905, + "learning_rate": 2.5091569753952615e-06, + "loss": 0.8429, + "step": 12250 + }, + { + "epoch": 2.3106064175658485, + "grad_norm": 0.24806469678878784, + "learning_rate": 2.4960969051211115e-06, + "loss": 0.8151, + "step": 12260 + }, + { + "epoch": 2.3124911652452527, + "grad_norm": 0.2459646761417389, + "learning_rate": 2.4830660635685567e-06, + "loss": 0.8016, + "step": 12270 + }, + { + "epoch": 2.3143759129246573, + "grad_norm": 0.24594217538833618, + "learning_rate": 2.470064501494496e-06, + "loss": 0.8149, + "step": 12280 + }, + { + "epoch": 2.3162606606040614, + "grad_norm": 0.2522444725036621, + "learning_rate": 2.4570922695417788e-06, + "loss": 0.8127, + "step": 12290 + }, + { + "epoch": 2.318145408283466, + "grad_norm": 0.24006417393684387, + "learning_rate": 2.44414941823901e-06, + "loss": 0.8085, + "step": 12300 + }, + { + "epoch": 2.3200301559628707, + "grad_norm": 0.23911994695663452, + "learning_rate": 2.431235998000353e-06, + "loss": 0.8215, + "step": 12310 + }, + { + "epoch": 2.321914903642275, + "grad_norm": 0.2545563280582428, + "learning_rate": 2.4183520591253353e-06, + "loss": 0.8112, + "step": 12320 + }, + { + "epoch": 2.3237996513216794, + "grad_norm": 0.24978111684322357, + "learning_rate": 2.405497651798645e-06, + "loss": 0.811, + "step": 12330 + }, + { + "epoch": 2.3256843990010836, + "grad_norm": 0.24093689024448395, + "learning_rate": 2.392672826089948e-06, + "loss": 0.8034, + "step": 12340 + }, + { + "epoch": 2.327569146680488, + "grad_norm": 0.2571706473827362, + "learning_rate": 
2.3798776319536788e-06, + "loss": 0.8162, + "step": 12350 + }, + { + "epoch": 2.329453894359893, + "grad_norm": 0.24618114531040192, + "learning_rate": 2.3671121192288583e-06, + "loss": 0.8226, + "step": 12360 + }, + { + "epoch": 2.331338642039297, + "grad_norm": 0.24746663868427277, + "learning_rate": 2.3543763376388906e-06, + "loss": 0.8256, + "step": 12370 + }, + { + "epoch": 2.3332233897187016, + "grad_norm": 0.2561565339565277, + "learning_rate": 2.341670336791376e-06, + "loss": 0.8014, + "step": 12380 + }, + { + "epoch": 2.3351081373981057, + "grad_norm": 0.25759387016296387, + "learning_rate": 2.3289941661779124e-06, + "loss": 0.8393, + "step": 12390 + }, + { + "epoch": 2.3369928850775104, + "grad_norm": 0.25672033429145813, + "learning_rate": 2.3163478751739067e-06, + "loss": 0.7805, + "step": 12400 + }, + { + "epoch": 2.3388776327569145, + "grad_norm": 0.2542650103569031, + "learning_rate": 2.3037315130383773e-06, + "loss": 0.8116, + "step": 12410 + }, + { + "epoch": 2.340762380436319, + "grad_norm": 0.23899777233600616, + "learning_rate": 2.291145128913771e-06, + "loss": 0.8062, + "step": 12420 + }, + { + "epoch": 2.3426471281157237, + "grad_norm": 0.2506767511367798, + "learning_rate": 2.278588771825763e-06, + "loss": 0.8216, + "step": 12430 + }, + { + "epoch": 2.344531875795128, + "grad_norm": 0.25370845198631287, + "learning_rate": 2.2660624906830663e-06, + "loss": 0.8192, + "step": 12440 + }, + { + "epoch": 2.3464166234745325, + "grad_norm": 0.2587071359157562, + "learning_rate": 2.253566334277251e-06, + "loss": 0.8094, + "step": 12450 + }, + { + "epoch": 2.3483013711539367, + "grad_norm": 0.2527341842651367, + "learning_rate": 2.2411003512825403e-06, + "loss": 0.8166, + "step": 12460 + }, + { + "epoch": 2.3501861188333413, + "grad_norm": 0.25732749700546265, + "learning_rate": 2.2286645902556325e-06, + "loss": 0.8212, + "step": 12470 + }, + { + "epoch": 2.3520708665127454, + "grad_norm": 0.2446395456790924, + "learning_rate": 2.216259099635505e-06, + 
"loss": 0.8103, + "step": 12480 + }, + { + "epoch": 2.35395561419215, + "grad_norm": 0.2494322657585144, + "learning_rate": 2.203883927743228e-06, + "loss": 0.799, + "step": 12490 + }, + { + "epoch": 2.3558403618715547, + "grad_norm": 0.2605748772621155, + "learning_rate": 2.1915391227817775e-06, + "loss": 0.8355, + "step": 12500 + }, + { + "epoch": 2.357725109550959, + "grad_norm": 0.24805057048797607, + "learning_rate": 2.179224732835844e-06, + "loss": 0.8191, + "step": 12510 + }, + { + "epoch": 2.3596098572303634, + "grad_norm": 0.24817612767219543, + "learning_rate": 2.1669408058716503e-06, + "loss": 0.8114, + "step": 12520 + }, + { + "epoch": 2.3614946049097676, + "grad_norm": 0.24583624303340912, + "learning_rate": 2.154687389736759e-06, + "loss": 0.8137, + "step": 12530 + }, + { + "epoch": 2.363379352589172, + "grad_norm": 0.24508365988731384, + "learning_rate": 2.1424645321598913e-06, + "loss": 0.8185, + "step": 12540 + }, + { + "epoch": 2.3652641002685764, + "grad_norm": 0.24812927842140198, + "learning_rate": 2.1302722807507346e-06, + "loss": 0.8176, + "step": 12550 + }, + { + "epoch": 2.367148847947981, + "grad_norm": 0.24235019087791443, + "learning_rate": 2.1181106829997645e-06, + "loss": 0.8199, + "step": 12560 + }, + { + "epoch": 2.3690335956273856, + "grad_norm": 0.24542522430419922, + "learning_rate": 2.105979786278062e-06, + "loss": 0.8143, + "step": 12570 + }, + { + "epoch": 2.3709183433067897, + "grad_norm": 0.258809894323349, + "learning_rate": 2.0938796378371084e-06, + "loss": 0.818, + "step": 12580 + }, + { + "epoch": 2.3728030909861944, + "grad_norm": 0.27078887820243835, + "learning_rate": 2.0818102848086295e-06, + "loss": 0.8274, + "step": 12590 + }, + { + "epoch": 2.3746878386655985, + "grad_norm": 0.25553572177886963, + "learning_rate": 2.069771774204391e-06, + "loss": 0.8157, + "step": 12600 + }, + { + "epoch": 2.376572586345003, + "grad_norm": 0.24842026829719543, + "learning_rate": 2.0577641529160352e-06, + "loss": 0.8211, + "step": 
12610 + }, + { + "epoch": 2.3784573340244073, + "grad_norm": 0.2450547069311142, + "learning_rate": 2.045787467714868e-06, + "loss": 0.8181, + "step": 12620 + }, + { + "epoch": 2.380342081703812, + "grad_norm": 0.25636473298072815, + "learning_rate": 2.0338417652517095e-06, + "loss": 0.8192, + "step": 12630 + }, + { + "epoch": 2.3822268293832165, + "grad_norm": 0.24148419499397278, + "learning_rate": 2.021927092056689e-06, + "loss": 0.8009, + "step": 12640 + }, + { + "epoch": 2.3841115770626207, + "grad_norm": 0.24721112847328186, + "learning_rate": 2.0100434945390825e-06, + "loss": 0.8122, + "step": 12650 + }, + { + "epoch": 2.3859963247420253, + "grad_norm": 0.24923764169216156, + "learning_rate": 1.9981910189871144e-06, + "loss": 0.8145, + "step": 12660 + }, + { + "epoch": 2.3878810724214294, + "grad_norm": 0.2457035332918167, + "learning_rate": 1.9863697115677836e-06, + "loss": 0.8214, + "step": 12670 + }, + { + "epoch": 2.389765820100834, + "grad_norm": 0.2467338591814041, + "learning_rate": 1.974579618326691e-06, + "loss": 0.8082, + "step": 12680 + }, + { + "epoch": 2.391650567780238, + "grad_norm": 0.24966008961200714, + "learning_rate": 1.9628207851878456e-06, + "loss": 0.8192, + "step": 12690 + }, + { + "epoch": 2.393535315459643, + "grad_norm": 0.25840523838996887, + "learning_rate": 1.951093257953508e-06, + "loss": 0.8106, + "step": 12700 + }, + { + "epoch": 2.3954200631390474, + "grad_norm": 0.24037225544452667, + "learning_rate": 1.939397082303983e-06, + "loss": 0.8128, + "step": 12710 + }, + { + "epoch": 2.3973048108184516, + "grad_norm": 0.24867680668830872, + "learning_rate": 1.9277323037974647e-06, + "loss": 0.8162, + "step": 12720 + }, + { + "epoch": 2.399189558497856, + "grad_norm": 0.24992601573467255, + "learning_rate": 1.9160989678698473e-06, + "loss": 0.8172, + "step": 12730 + }, + { + "epoch": 2.4010743061772604, + "grad_norm": 0.24319303035736084, + "learning_rate": 1.9044971198345584e-06, + "loss": 0.8257, + "step": 12740 + }, + { + 
"epoch": 2.402959053856665, + "grad_norm": 0.2538902163505554, + "learning_rate": 1.8929268048823724e-06, + "loss": 0.817, + "step": 12750 + }, + { + "epoch": 2.404843801536069, + "grad_norm": 0.24555231630802155, + "learning_rate": 1.8813880680812313e-06, + "loss": 0.821, + "step": 12760 + }, + { + "epoch": 2.4067285492154737, + "grad_norm": 0.2501438856124878, + "learning_rate": 1.8698809543760833e-06, + "loss": 0.8162, + "step": 12770 + }, + { + "epoch": 2.4086132968948784, + "grad_norm": 0.2529199719429016, + "learning_rate": 1.858405508588703e-06, + "loss": 0.8105, + "step": 12780 + }, + { + "epoch": 2.4104980445742825, + "grad_norm": 0.2513485848903656, + "learning_rate": 1.8469617754175106e-06, + "loss": 0.8177, + "step": 12790 + }, + { + "epoch": 2.412382792253687, + "grad_norm": 0.280857115983963, + "learning_rate": 1.8355497994373961e-06, + "loss": 0.8053, + "step": 12800 + }, + { + "epoch": 2.4142675399330913, + "grad_norm": 0.24794934689998627, + "learning_rate": 1.8241696250995544e-06, + "loss": 0.8147, + "step": 12810 + }, + { + "epoch": 2.416152287612496, + "grad_norm": 0.2518700659275055, + "learning_rate": 1.8128212967313164e-06, + "loss": 0.811, + "step": 12820 + }, + { + "epoch": 2.4180370352919, + "grad_norm": 0.2552350163459778, + "learning_rate": 1.8015048585359595e-06, + "loss": 0.8081, + "step": 12830 + }, + { + "epoch": 2.4199217829713047, + "grad_norm": 0.2540658414363861, + "learning_rate": 1.7902203545925488e-06, + "loss": 0.8078, + "step": 12840 + }, + { + "epoch": 2.4218065306507093, + "grad_norm": 0.2703763246536255, + "learning_rate": 1.7789678288557555e-06, + "loss": 0.8035, + "step": 12850 + }, + { + "epoch": 2.4236912783301134, + "grad_norm": 0.25825202465057373, + "learning_rate": 1.7677473251556953e-06, + "loss": 0.8036, + "step": 12860 + }, + { + "epoch": 2.425576026009518, + "grad_norm": 0.25024035573005676, + "learning_rate": 1.756558887197759e-06, + "loss": 0.8039, + "step": 12870 + }, + { + "epoch": 2.427460773688922, + 
"grad_norm": 0.25463417172431946, + "learning_rate": 1.7454025585624312e-06, + "loss": 0.8276, + "step": 12880 + }, + { + "epoch": 2.429345521368327, + "grad_norm": 0.24543416500091553, + "learning_rate": 1.7342783827051223e-06, + "loss": 0.8292, + "step": 12890 + }, + { + "epoch": 2.431230269047731, + "grad_norm": 0.25603410601615906, + "learning_rate": 1.7231864029560085e-06, + "loss": 0.8328, + "step": 12900 + }, + { + "epoch": 2.4331150167271356, + "grad_norm": 0.23531144857406616, + "learning_rate": 1.7121266625198618e-06, + "loss": 0.8061, + "step": 12910 + }, + { + "epoch": 2.43499976440654, + "grad_norm": 0.2586028277873993, + "learning_rate": 1.70109920447587e-06, + "loss": 0.8067, + "step": 12920 + }, + { + "epoch": 2.4368845120859444, + "grad_norm": 0.2526091933250427, + "learning_rate": 1.690104071777483e-06, + "loss": 0.8146, + "step": 12930 + }, + { + "epoch": 2.438769259765349, + "grad_norm": 0.2502641975879669, + "learning_rate": 1.6791413072522288e-06, + "loss": 0.8257, + "step": 12940 + }, + { + "epoch": 2.4406540074447536, + "grad_norm": 0.2606465518474579, + "learning_rate": 1.668210953601571e-06, + "loss": 0.8198, + "step": 12950 + }, + { + "epoch": 2.4425387551241577, + "grad_norm": 0.2550368010997772, + "learning_rate": 1.6573130534007186e-06, + "loss": 0.818, + "step": 12960 + }, + { + "epoch": 2.4444235028035624, + "grad_norm": 0.24474389851093292, + "learning_rate": 1.646447649098475e-06, + "loss": 0.8203, + "step": 12970 + }, + { + "epoch": 2.4463082504829665, + "grad_norm": 0.24402247369289398, + "learning_rate": 1.6356147830170588e-06, + "loss": 0.8172, + "step": 12980 + }, + { + "epoch": 2.448192998162371, + "grad_norm": 0.24879758059978485, + "learning_rate": 1.624814497351962e-06, + "loss": 0.816, + "step": 12990 + }, + { + "epoch": 2.4500777458417753, + "grad_norm": 0.2416171431541443, + "learning_rate": 1.6140468341717607e-06, + "loss": 0.8242, + "step": 13000 + }, + { + "epoch": 2.45196249352118, + "grad_norm": 
0.24606920778751373, + "learning_rate": 1.6033118354179667e-06, + "loss": 0.8036, + "step": 13010 + }, + { + "epoch": 2.4538472412005845, + "grad_norm": 0.24599704146385193, + "learning_rate": 1.592609542904856e-06, + "loss": 0.8063, + "step": 13020 + }, + { + "epoch": 2.4557319888799887, + "grad_norm": 0.2544080913066864, + "learning_rate": 1.5819399983193129e-06, + "loss": 0.8239, + "step": 13030 + }, + { + "epoch": 2.4576167365593933, + "grad_norm": 0.24427728354930878, + "learning_rate": 1.5713032432206609e-06, + "loss": 0.7987, + "step": 13040 + }, + { + "epoch": 2.4595014842387974, + "grad_norm": 0.2550491690635681, + "learning_rate": 1.5606993190405084e-06, + "loss": 0.8244, + "step": 13050 + }, + { + "epoch": 2.461386231918202, + "grad_norm": 0.2574317157268524, + "learning_rate": 1.550128267082579e-06, + "loss": 0.8014, + "step": 13060 + }, + { + "epoch": 2.463270979597606, + "grad_norm": 0.2546432912349701, + "learning_rate": 1.5395901285225512e-06, + "loss": 0.8232, + "step": 13070 + }, + { + "epoch": 2.465155727277011, + "grad_norm": 0.24264824390411377, + "learning_rate": 1.5290849444079126e-06, + "loss": 0.7939, + "step": 13080 + }, + { + "epoch": 2.4670404749564154, + "grad_norm": 0.250150591135025, + "learning_rate": 1.5186127556577778e-06, + "loss": 0.817, + "step": 13090 + }, + { + "epoch": 2.4689252226358196, + "grad_norm": 0.24032209813594818, + "learning_rate": 1.508173603062746e-06, + "loss": 0.8216, + "step": 13100 + }, + { + "epoch": 2.470809970315224, + "grad_norm": 0.24979449808597565, + "learning_rate": 1.4977675272847326e-06, + "loss": 0.8059, + "step": 13110 + }, + { + "epoch": 2.4726947179946284, + "grad_norm": 0.25147831439971924, + "learning_rate": 1.487394568856817e-06, + "loss": 0.8048, + "step": 13120 + }, + { + "epoch": 2.474579465674033, + "grad_norm": 0.2481861710548401, + "learning_rate": 1.4770547681830805e-06, + "loss": 0.8186, + "step": 13130 + }, + { + "epoch": 2.476464213353437, + "grad_norm": 0.25081998109817505, + 
"learning_rate": 1.4667481655384518e-06, + "loss": 0.8122, + "step": 13140 + }, + { + "epoch": 2.4783489610328417, + "grad_norm": 0.25492528080940247, + "learning_rate": 1.456474801068547e-06, + "loss": 0.8308, + "step": 13150 + }, + { + "epoch": 2.4802337087122464, + "grad_norm": 0.24808630347251892, + "learning_rate": 1.4462347147895162e-06, + "loss": 0.7875, + "step": 13160 + }, + { + "epoch": 2.4821184563916505, + "grad_norm": 0.2479819655418396, + "learning_rate": 1.4360279465878856e-06, + "loss": 0.8186, + "step": 13170 + }, + { + "epoch": 2.484003204071055, + "grad_norm": 0.24826273322105408, + "learning_rate": 1.4258545362204035e-06, + "loss": 0.8026, + "step": 13180 + }, + { + "epoch": 2.4858879517504593, + "grad_norm": 0.2459457814693451, + "learning_rate": 1.4157145233138858e-06, + "loss": 0.8268, + "step": 13190 + }, + { + "epoch": 2.487772699429864, + "grad_norm": 0.2527751624584198, + "learning_rate": 1.4056079473650586e-06, + "loss": 0.8089, + "step": 13200 + }, + { + "epoch": 2.489657447109268, + "grad_norm": 0.24943867325782776, + "learning_rate": 1.3955348477404073e-06, + "loss": 0.8017, + "step": 13210 + }, + { + "epoch": 2.4915421947886727, + "grad_norm": 0.24275365471839905, + "learning_rate": 1.3854952636760254e-06, + "loss": 0.8293, + "step": 13220 + }, + { + "epoch": 2.4934269424680773, + "grad_norm": 0.25114986300468445, + "learning_rate": 1.3754892342774528e-06, + "loss": 0.7984, + "step": 13230 + }, + { + "epoch": 2.4953116901474814, + "grad_norm": 0.24418845772743225, + "learning_rate": 1.3655167985195362e-06, + "loss": 0.8136, + "step": 13240 + }, + { + "epoch": 2.497196437826886, + "grad_norm": 0.25467386841773987, + "learning_rate": 1.3555779952462643e-06, + "loss": 0.8264, + "step": 13250 + }, + { + "epoch": 2.49908118550629, + "grad_norm": 0.2516363561153412, + "learning_rate": 1.3456728631706284e-06, + "loss": 0.8212, + "step": 13260 + }, + { + "epoch": 2.500965933185695, + "grad_norm": 0.2539440989494324, + "learning_rate": 
1.3358014408744602e-06, + "loss": 0.7933, + "step": 13270 + }, + { + "epoch": 2.502850680865099, + "grad_norm": 0.2559252083301544, + "learning_rate": 1.3259637668082915e-06, + "loss": 0.8109, + "step": 13280 + }, + { + "epoch": 2.5047354285445036, + "grad_norm": 0.2545989751815796, + "learning_rate": 1.316159879291199e-06, + "loss": 0.8112, + "step": 13290 + }, + { + "epoch": 2.506620176223908, + "grad_norm": 0.25465795397758484, + "learning_rate": 1.3063898165106558e-06, + "loss": 0.814, + "step": 13300 + }, + { + "epoch": 2.5085049239033124, + "grad_norm": 0.24290414154529572, + "learning_rate": 1.2966536165223831e-06, + "loss": 0.8082, + "step": 13310 + }, + { + "epoch": 2.510389671582717, + "grad_norm": 0.2509731352329254, + "learning_rate": 1.2869513172502013e-06, + "loss": 0.8274, + "step": 13320 + }, + { + "epoch": 2.512274419262121, + "grad_norm": 0.25207099318504333, + "learning_rate": 1.2772829564858846e-06, + "loss": 0.8094, + "step": 13330 + }, + { + "epoch": 2.5141591669415257, + "grad_norm": 0.2512916624546051, + "learning_rate": 1.267648571889011e-06, + "loss": 0.8138, + "step": 13340 + }, + { + "epoch": 2.51604391462093, + "grad_norm": 0.24304769933223724, + "learning_rate": 1.2580482009868167e-06, + "loss": 0.8187, + "step": 13350 + }, + { + "epoch": 2.5179286623003345, + "grad_norm": 0.24602949619293213, + "learning_rate": 1.2484818811740507e-06, + "loss": 0.8184, + "step": 13360 + }, + { + "epoch": 2.519813409979739, + "grad_norm": 0.2600632905960083, + "learning_rate": 1.2389496497128251e-06, + "loss": 0.8213, + "step": 13370 + }, + { + "epoch": 2.5216981576591433, + "grad_norm": 0.2592945098876953, + "learning_rate": 1.2294515437324816e-06, + "loss": 0.8037, + "step": 13380 + }, + { + "epoch": 2.523582905338548, + "grad_norm": 0.2563936412334442, + "learning_rate": 1.2199876002294274e-06, + "loss": 0.8009, + "step": 13390 + }, + { + "epoch": 2.525467653017952, + "grad_norm": 0.24926042556762695, + "learning_rate": 1.2105578560670095e-06, + 
"loss": 0.8039, + "step": 13400 + }, + { + "epoch": 2.5273524006973567, + "grad_norm": 0.24498221278190613, + "learning_rate": 1.201162347975362e-06, + "loss": 0.7982, + "step": 13410 + }, + { + "epoch": 2.529237148376761, + "grad_norm": 0.25144505500793457, + "learning_rate": 1.1918011125512651e-06, + "loss": 0.8066, + "step": 13420 + }, + { + "epoch": 2.5311218960561654, + "grad_norm": 0.24688313901424408, + "learning_rate": 1.1824741862580025e-06, + "loss": 0.8077, + "step": 13430 + }, + { + "epoch": 2.53300664373557, + "grad_norm": 0.25755774974823, + "learning_rate": 1.1731816054252177e-06, + "loss": 0.8011, + "step": 13440 + }, + { + "epoch": 2.534891391414974, + "grad_norm": 0.24877141416072845, + "learning_rate": 1.1639234062487748e-06, + "loss": 0.8258, + "step": 13450 + }, + { + "epoch": 2.536776139094379, + "grad_norm": 0.23837682604789734, + "learning_rate": 1.1546996247906162e-06, + "loss": 0.8085, + "step": 13460 + }, + { + "epoch": 2.5386608867737834, + "grad_norm": 0.2557922601699829, + "learning_rate": 1.145510296978628e-06, + "loss": 0.817, + "step": 13470 + }, + { + "epoch": 2.5405456344531876, + "grad_norm": 0.2543253004550934, + "learning_rate": 1.1363554586064852e-06, + "loss": 0.811, + "step": 13480 + }, + { + "epoch": 2.5424303821325918, + "grad_norm": 0.24622730910778046, + "learning_rate": 1.1272351453335274e-06, + "loss": 0.8158, + "step": 13490 + }, + { + "epoch": 2.5443151298119964, + "grad_norm": 0.24601022899150848, + "learning_rate": 1.118149392684612e-06, + "loss": 0.802, + "step": 13500 + }, + { + "epoch": 2.546199877491401, + "grad_norm": 0.26041731238365173, + "learning_rate": 1.1090982360499847e-06, + "loss": 0.832, + "step": 13510 + }, + { + "epoch": 2.548084625170805, + "grad_norm": 0.24699217081069946, + "learning_rate": 1.1000817106851247e-06, + "loss": 0.8247, + "step": 13520 + }, + { + "epoch": 2.5499693728502097, + "grad_norm": 0.2564820349216461, + "learning_rate": 1.0910998517106242e-06, + "loss": 0.8242, + "step": 
13530 + }, + { + "epoch": 2.5518541205296144, + "grad_norm": 0.2520027756690979, + "learning_rate": 1.0821526941120408e-06, + "loss": 0.8374, + "step": 13540 + }, + { + "epoch": 2.5537388682090185, + "grad_norm": 0.4110133945941925, + "learning_rate": 1.073240272739775e-06, + "loss": 0.8073, + "step": 13550 + }, + { + "epoch": 2.5556236158884227, + "grad_norm": 0.24911150336265564, + "learning_rate": 1.0643626223089153e-06, + "loss": 0.8345, + "step": 13560 + }, + { + "epoch": 2.5575083635678273, + "grad_norm": 0.25879111886024475, + "learning_rate": 1.0555197773991132e-06, + "loss": 0.8153, + "step": 13570 + }, + { + "epoch": 2.559393111247232, + "grad_norm": 0.24806149303913116, + "learning_rate": 1.0467117724544518e-06, + "loss": 0.799, + "step": 13580 + }, + { + "epoch": 2.561277858926636, + "grad_norm": 0.2523815929889679, + "learning_rate": 1.0379386417833083e-06, + "loss": 0.8192, + "step": 13590 + }, + { + "epoch": 2.5631626066060407, + "grad_norm": 0.2525022625923157, + "learning_rate": 1.0292004195582184e-06, + "loss": 0.7957, + "step": 13600 + }, + { + "epoch": 2.5650473542854453, + "grad_norm": 0.2494993656873703, + "learning_rate": 1.0204971398157405e-06, + "loss": 0.8114, + "step": 13610 + }, + { + "epoch": 2.5669321019648494, + "grad_norm": 0.24915970861911774, + "learning_rate": 1.0118288364563345e-06, + "loss": 0.8016, + "step": 13620 + }, + { + "epoch": 2.5688168496442536, + "grad_norm": 0.24997591972351074, + "learning_rate": 1.0031955432442153e-06, + "loss": 0.8065, + "step": 13630 + }, + { + "epoch": 2.570701597323658, + "grad_norm": 0.24461428821086884, + "learning_rate": 9.945972938072379e-07, + "loss": 0.8161, + "step": 13640 + }, + { + "epoch": 2.572586345003063, + "grad_norm": 0.23783884942531586, + "learning_rate": 9.860341216367498e-07, + "loss": 0.8121, + "step": 13650 + }, + { + "epoch": 2.574471092682467, + "grad_norm": 0.2474040538072586, + "learning_rate": 9.775060600874675e-07, + "loss": 0.818, + "step": 13660 + }, + { + "epoch": 
2.5763558403618716, + "grad_norm": 0.24973392486572266, + "learning_rate": 9.690131423773474e-07, + "loss": 0.8136, + "step": 13670 + }, + { + "epoch": 2.578240588041276, + "grad_norm": 0.2554716169834137, + "learning_rate": 9.605554015874619e-07, + "loss": 0.8092, + "step": 13680 + }, + { + "epoch": 2.5801253357206804, + "grad_norm": 0.24705396592617035, + "learning_rate": 9.521328706618593e-07, + "loss": 0.815, + "step": 13690 + }, + { + "epoch": 2.582010083400085, + "grad_norm": 0.2521660625934601, + "learning_rate": 9.437455824074382e-07, + "loss": 0.8263, + "step": 13700 + }, + { + "epoch": 2.583894831079489, + "grad_norm": 0.25129029154777527, + "learning_rate": 9.353935694938254e-07, + "loss": 0.8234, + "step": 13710 + }, + { + "epoch": 2.5857795787588937, + "grad_norm": 0.25427740812301636, + "learning_rate": 9.27076864453249e-07, + "loss": 0.8202, + "step": 13720 + }, + { + "epoch": 2.587664326438298, + "grad_norm": 0.25162559747695923, + "learning_rate": 9.187954996804016e-07, + "loss": 0.8085, + "step": 13730 + }, + { + "epoch": 2.5895490741177025, + "grad_norm": 0.25208336114883423, + "learning_rate": 9.105495074323267e-07, + "loss": 0.8274, + "step": 13740 + }, + { + "epoch": 2.591433821797107, + "grad_norm": 0.2440558224916458, + "learning_rate": 9.023389198282795e-07, + "loss": 0.8134, + "step": 13750 + }, + { + "epoch": 2.5933185694765113, + "grad_norm": 0.250679612159729, + "learning_rate": 8.941637688496163e-07, + "loss": 0.8008, + "step": 13760 + }, + { + "epoch": 2.595203317155916, + "grad_norm": 0.25643491744995117, + "learning_rate": 8.860240863396607e-07, + "loss": 0.8111, + "step": 13770 + }, + { + "epoch": 2.59708806483532, + "grad_norm": 0.25435560941696167, + "learning_rate": 8.779199040035824e-07, + "loss": 0.8082, + "step": 13780 + }, + { + "epoch": 2.5989728125147247, + "grad_norm": 0.254193514585495, + "learning_rate": 8.698512534082681e-07, + "loss": 0.8242, + "step": 13790 + }, + { + "epoch": 2.600857560194129, + "grad_norm": 
0.2494610846042633, + "learning_rate": 8.618181659822111e-07, + "loss": 0.805, + "step": 13800 + }, + { + "epoch": 2.6027423078735334, + "grad_norm": 0.24583107233047485, + "learning_rate": 8.538206730153764e-07, + "loss": 0.8028, + "step": 13810 + }, + { + "epoch": 2.604627055552938, + "grad_norm": 0.2529905140399933, + "learning_rate": 8.458588056590866e-07, + "loss": 0.8075, + "step": 13820 + }, + { + "epoch": 2.606511803232342, + "grad_norm": 0.24674545228481293, + "learning_rate": 8.379325949258965e-07, + "loss": 0.8211, + "step": 13830 + }, + { + "epoch": 2.608396550911747, + "grad_norm": 0.2550770044326782, + "learning_rate": 8.300420716894685e-07, + "loss": 0.8149, + "step": 13840 + }, + { + "epoch": 2.610281298591151, + "grad_norm": 0.24981015920639038, + "learning_rate": 8.221872666844676e-07, + "loss": 0.8125, + "step": 13850 + }, + { + "epoch": 2.6121660462705556, + "grad_norm": 0.2512831687927246, + "learning_rate": 8.143682105064243e-07, + "loss": 0.809, + "step": 13860 + }, + { + "epoch": 2.6140507939499598, + "grad_norm": 0.25697073340415955, + "learning_rate": 8.065849336116261e-07, + "loss": 0.823, + "step": 13870 + }, + { + "epoch": 2.6159355416293644, + "grad_norm": 0.2515193223953247, + "learning_rate": 7.988374663169873e-07, + "loss": 0.8128, + "step": 13880 + }, + { + "epoch": 2.617820289308769, + "grad_norm": 0.25017204880714417, + "learning_rate": 7.911258387999521e-07, + "loss": 0.8237, + "step": 13890 + }, + { + "epoch": 2.619705036988173, + "grad_norm": 0.24427729845046997, + "learning_rate": 7.834500810983536e-07, + "loss": 0.8129, + "step": 13900 + }, + { + "epoch": 2.6215897846675777, + "grad_norm": 0.25279298424720764, + "learning_rate": 7.758102231103115e-07, + "loss": 0.825, + "step": 13910 + }, + { + "epoch": 2.623474532346982, + "grad_norm": 0.24618178606033325, + "learning_rate": 7.682062945941093e-07, + "loss": 0.8062, + "step": 13920 + }, + { + "epoch": 2.6253592800263865, + "grad_norm": 0.24731379747390747, + "learning_rate": 
7.606383251680804e-07, + "loss": 0.7896, + "step": 13930 + }, + { + "epoch": 2.6272440277057907, + "grad_norm": 0.2562975287437439, + "learning_rate": 7.531063443104958e-07, + "loss": 0.8258, + "step": 13940 + }, + { + "epoch": 2.6291287753851953, + "grad_norm": 0.24520623683929443, + "learning_rate": 7.456103813594428e-07, + "loss": 0.8143, + "step": 13950 + }, + { + "epoch": 2.6310135230646, + "grad_norm": 0.24603167176246643, + "learning_rate": 7.381504655127147e-07, + "loss": 0.8313, + "step": 13960 + }, + { + "epoch": 2.632898270744004, + "grad_norm": 0.24289558827877045, + "learning_rate": 7.307266258276968e-07, + "loss": 0.8142, + "step": 13970 + }, + { + "epoch": 2.6347830184234087, + "grad_norm": 0.2416558563709259, + "learning_rate": 7.233388912212525e-07, + "loss": 0.8033, + "step": 13980 + }, + { + "epoch": 2.636667766102813, + "grad_norm": 0.24304676055908203, + "learning_rate": 7.15987290469613e-07, + "loss": 0.7932, + "step": 13990 + }, + { + "epoch": 2.6385525137822174, + "grad_norm": 0.24614085257053375, + "learning_rate": 7.086718522082592e-07, + "loss": 0.8026, + "step": 14000 + }, + { + "epoch": 2.6404372614616216, + "grad_norm": 0.255624920129776, + "learning_rate": 7.013926049318188e-07, + "loss": 0.8154, + "step": 14010 + }, + { + "epoch": 2.642322009141026, + "grad_norm": 0.2519976794719696, + "learning_rate": 6.941495769939466e-07, + "loss": 0.8243, + "step": 14020 + }, + { + "epoch": 2.644206756820431, + "grad_norm": 0.24328982830047607, + "learning_rate": 6.869427966072218e-07, + "loss": 0.8276, + "step": 14030 + }, + { + "epoch": 2.646091504499835, + "grad_norm": 0.24781540036201477, + "learning_rate": 6.797722918430316e-07, + "loss": 0.8032, + "step": 14040 + }, + { + "epoch": 2.6479762521792396, + "grad_norm": 0.2471706122159958, + "learning_rate": 6.726380906314655e-07, + "loss": 0.8076, + "step": 14050 + }, + { + "epoch": 2.6498609998586438, + "grad_norm": 0.2463628351688385, + "learning_rate": 6.655402207612071e-07, + "loss": 
0.8125, + "step": 14060 + }, + { + "epoch": 2.6517457475380484, + "grad_norm": 0.2548023760318756, + "learning_rate": 6.584787098794231e-07, + "loss": 0.8282, + "step": 14070 + }, + { + "epoch": 2.6536304952174525, + "grad_norm": 0.2524365186691284, + "learning_rate": 6.51453585491657e-07, + "loss": 0.8224, + "step": 14080 + }, + { + "epoch": 2.655515242896857, + "grad_norm": 0.2509068548679352, + "learning_rate": 6.444648749617221e-07, + "loss": 0.8035, + "step": 14090 + }, + { + "epoch": 2.6573999905762618, + "grad_norm": 0.24761579930782318, + "learning_rate": 6.375126055115954e-07, + "loss": 0.821, + "step": 14100 + }, + { + "epoch": 2.659284738255666, + "grad_norm": 0.24618789553642273, + "learning_rate": 6.305968042213117e-07, + "loss": 0.8038, + "step": 14110 + }, + { + "epoch": 2.6611694859350705, + "grad_norm": 0.24829542636871338, + "learning_rate": 6.237174980288541e-07, + "loss": 0.8082, + "step": 14120 + }, + { + "epoch": 2.663054233614475, + "grad_norm": 0.24370425939559937, + "learning_rate": 6.168747137300557e-07, + "loss": 0.8155, + "step": 14130 + }, + { + "epoch": 2.6649389812938793, + "grad_norm": 0.24746324121952057, + "learning_rate": 6.100684779784894e-07, + "loss": 0.7996, + "step": 14140 + }, + { + "epoch": 2.6668237289732835, + "grad_norm": 0.255350798368454, + "learning_rate": 6.032988172853693e-07, + "loss": 0.8161, + "step": 14150 + } + ], + "logging_steps": 10, + "max_steps": 15918, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7877818568796537e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..fd35643dc66bba3beef713985613e4d4c39dd2c0 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2950b0c7f49758c92e7289f24838addac62540022ef61fee9dd60bb1f362697e +size 5432